summaryrefslogtreecommitdiff
path: root/include/linux/workqueue.h
diff options
context:
space:
mode:
Diffstat (limited to 'include/linux/workqueue.h')
0 files changed, 0 insertions, 0 deletions
.git/log/kernel?h=devel-stable'>logtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Kconfig.locks2
-rw-r--r--kernel/Kconfig.preempt23
-rw-r--r--kernel/Makefile18
-rw-r--r--kernel/acct.c42
-rw-r--r--kernel/async.c3
-rw-r--r--kernel/audit.c121
-rw-r--r--kernel/audit.h22
-rw-r--r--kernel/audit_fsnotify.c9
-rw-r--r--kernel/audit_tree.c61
-rw-r--r--kernel/audit_watch.c10
-rw-r--r--kernel/auditfilter.c22
-rw-r--r--kernel/auditsc.c679
-rw-r--r--kernel/bounds.c7
-rw-r--r--kernel/bpf/Kconfig15
-rw-r--r--kernel/bpf/Makefile15
-rw-r--r--kernel/bpf/arraymap.c184
-rw-r--r--kernel/bpf/bloom_filter.c209
-rw-r--r--kernel/bpf/bpf_cgrp_storage.c240
-rw-r--r--kernel/bpf/bpf_inode_storage.c114
-rw-r--r--kernel/bpf/bpf_iter.c204
-rw-r--r--kernel/bpf/bpf_local_storage.c596
-rw-r--r--kernel/bpf/bpf_lru_list.h1
-rw-r--r--kernel/bpf/bpf_lsm.c152
-rw-r--r--kernel/bpf/bpf_struct_ops.c381
-rw-r--r--kernel/bpf/bpf_struct_ops_types.h3
-rw-r--r--kernel/bpf/bpf_task_storage.c202
-rw-r--r--kernel/bpf/btf.c3021
-rw-r--r--kernel/bpf/cgroup.c1082
-rw-r--r--kernel/bpf/cgroup_iter.c296
-rw-r--r--kernel/bpf/core.c564
-rw-r--r--kernel/bpf/cpumap.c195
-rw-r--r--kernel/bpf/cpumask.c456
-rw-r--r--kernel/bpf/devmap.c229
-rw-r--r--kernel/bpf/disasm.c18
-rw-r--r--kernel/bpf/disasm.h2
-rw-r--r--kernel/bpf/dispatcher.c52
-rw-r--r--kernel/bpf/hashtab.c562
-rw-r--r--kernel/bpf/helpers.c1527
-rw-r--r--kernel/bpf/inode.c61
-rw-r--r--kernel/bpf/link_iter.c107
-rw-r--r--kernel/bpf/local_storage.c58
-rw-r--r--kernel/bpf/log.c330
-rw-r--r--kernel/bpf/lpm_trie.c31
-rw-r--r--kernel/bpf/map_in_map.c49
-rw-r--r--kernel/bpf/map_iter.c4
-rw-r--r--kernel/bpf/memalloc.c719
-rw-r--r--kernel/bpf/mmap_unlock_work.h65
-rw-r--r--kernel/bpf/net_namespace.c1
-rw-r--r--kernel/bpf/offload.c434
-rw-r--r--kernel/bpf/percpu_freelist.c83
-rw-r--r--kernel/bpf/preload/.gitignore4
-rw-r--r--kernel/bpf/preload/Kconfig7
-rw-r--r--kernel/bpf/preload/Makefile27
-rw-r--r--kernel/bpf/preload/bpf_preload.h8
-rw-r--r--kernel/bpf/preload/bpf_preload_kern.c130
-rw-r--r--kernel/bpf/preload/bpf_preload_umd_blob.S7
-rw-r--r--kernel/bpf/preload/iterators/Makefile50
-rw-r--r--kernel/bpf/preload/iterators/README5
-rw-r--r--kernel/bpf/preload/iterators/bpf_preload_common.h13
-rw-r--r--kernel/bpf/preload/iterators/iterators.c94
-rw-r--r--kernel/bpf/preload/iterators/iterators.lskel-big-endian.h419
-rw-r--r--kernel/bpf/preload/iterators/iterators.lskel-little-endian.h425
-rw-r--r--kernel/bpf/preload/iterators/iterators.skel.h412
-rw-r--r--kernel/bpf/queue_stack_maps.c44
-rw-r--r--kernel/bpf/reuseport_array.c33
-rw-r--r--kernel/bpf/ringbuf.c375
-rw-r--r--kernel/bpf/stackmap.c189
-rw-r--r--kernel/bpf/syscall.c1383
-rw-r--r--kernel/bpf/task_iter.c351
-rw-r--r--kernel/bpf/trampoline.c714
-rw-r--r--kernel/bpf/verifier.c8588
-rw-r--r--kernel/capability.c115
-rw-r--r--kernel/cfi.c344
-rw-r--r--kernel/cgroup/cgroup-internal.h25
-rw-r--r--kernel/cgroup/cgroup-v1.c115
-rw-r--r--kernel/cgroup/cgroup.c1000
-rw-r--r--kernel/cgroup/cpuset.c1382
-rw-r--r--kernel/cgroup/freezer.c2
-rw-r--r--kernel/cgroup/legacy_freezer.c26
-rw-r--r--kernel/cgroup/misc.c31
-rw-r--r--kernel/cgroup/namespace.c2
-rw-r--r--kernel/cgroup/pids.c37
-rw-r--r--kernel/cgroup/rstat.c194
-rw-r--r--kernel/compat.c23
-rw-r--r--kernel/configs/android-base.config160
-rw-r--r--kernel/configs/android-recommended.config127
-rw-r--r--kernel/configs/debug.config106
-rw-r--r--kernel/configs/rust.config1
-rw-r--r--kernel/configs/tiny.config4
-rw-r--r--kernel/configs/x86_debug.config17
-rw-r--r--kernel/configs/xen.config1
-rw-r--r--kernel/context_tracking.c617
-rw-r--r--kernel/cpu.c257
-rw-r--r--kernel/cpu_pm.c53
-rw-r--r--kernel/crash_core.c55
-rw-r--r--kernel/cred.c48
-rw-r--r--kernel/debug/debug_core.c41
-rw-r--r--kernel/debug/gdbstub.c5
-rw-r--r--kernel/debug/kdb/kdb_bp.c72
-rw-r--r--kernel/debug/kdb/kdb_bt.c16
-rw-r--r--kernel/debug/kdb/kdb_debugger.c1
-rw-r--r--kernel/debug/kdb/kdb_io.c19
-rw-r--r--kernel/debug/kdb/kdb_keyboard.c1
-rw-r--r--kernel/debug/kdb/kdb_main.c768
-rw-r--r--kernel/debug/kdb/kdb_private.h26
-rw-r--r--kernel/debug/kdb/kdb_support.c445
-rw-r--r--kernel/delayacct.c114
-rw-r--r--kernel/dma/Kconfig27
-rw-r--r--kernel/dma/Makefile2
-rw-r--r--kernel/dma/coherent.c176
-rw-r--r--kernel/dma/contiguous.c4
-rw-r--r--kernel/dma/debug.c205
-rw-r--r--kernel/dma/debug.h24
-rw-r--r--kernel/dma/direct.c348
-rw-r--r--kernel/dma/direct.h19
-rw-r--r--kernel/dma/dummy.c2
-rw-r--r--kernel/dma/map_benchmark.c26
-rw-r--r--kernel/dma/mapping.c157
-rw-r--r--kernel/dma/ops_helpers.c12
-rw-r--r--kernel/dma/pool.c10
-rw-r--r--kernel/dma/swiotlb.c891
-rw-r--r--kernel/entry/common.c82
-rw-r--r--kernel/entry/kvm.c11
-rw-r--r--kernel/entry/syscall_user_dispatch.c86
-rw-r--r--kernel/events/Makefile6
-rw-r--r--kernel/events/callchain.c4
-rw-r--r--kernel/events/core.c3278
-rw-r--r--kernel/events/hw_breakpoint.c652
-rw-r--r--kernel/events/hw_breakpoint_test.c332
-rw-r--r--kernel/events/internal.h12
-rw-r--r--kernel/events/ring_buffer.c14
-rw-r--r--kernel/events/uprobes.c61
-rw-r--r--kernel/exit.c299
-rw-r--r--kernel/extable.c63
-rw-r--r--kernel/fail_function.c31
-rw-r--r--kernel/fork.c1025
-rw-r--r--kernel/freezer.c133
-rw-r--r--kernel/futex.c4038
-rw-r--r--kernel/futex/Makefile3
-rw-r--r--kernel/futex/core.c1149
-rw-r--r--kernel/futex/futex.h294
-rw-r--r--kernel/futex/pi.c1233
-rw-r--r--kernel/futex/requeue.c897
-rw-r--r--kernel/futex/syscalls.c379
-rw-r--r--kernel/futex/waitwake.c708
-rw-r--r--kernel/gcov/Kconfig1
-rw-r--r--kernel/gcov/clang.c2
-rw-r--r--kernel/gcov/gcc_4_7.c23
-rwxr-xr-xkernel/gen_kheaders.sh12
-rw-r--r--kernel/groups.c13
-rw-r--r--kernel/hung_task.c109
-rw-r--r--kernel/irq/Kconfig20
-rw-r--r--kernel/irq/Makefile1
-rw-r--r--kernel/irq/affinity.c408
-rw-r--r--kernel/irq/chip.c63
-rw-r--r--kernel/irq/cpuhotplug.c6
-rw-r--r--kernel/irq/debugfs.c11
-rw-r--r--kernel/irq/generic-chip.c24
-rw-r--r--kernel/irq/handle.c29
-rw-r--r--kernel/irq/internals.h6
-rw-r--r--kernel/irq/ipi-mux.c206
-rw-r--r--kernel/irq/ipi.c54
-rw-r--r--kernel/irq/irq_sim.c2
-rw-r--r--kernel/irq/irqdesc.c138
-rw-r--r--kernel/irq/irqdomain.c484
-rw-r--r--kernel/irq/manage.c131
-rw-r--r--kernel/irq/matrix.c5
-rw-r--r--kernel/irq/msi.c1454
-rw-r--r--kernel/irq/pm.c4
-rw-r--r--kernel/irq/proc.c10
-rw-r--r--kernel/irq/spurious.c8
-rw-r--r--kernel/irq/timings.c7
-rw-r--r--kernel/irq_work.c142
-rw-r--r--kernel/jump_label.c99
-rw-r--r--kernel/kallsyms.c298
-rw-r--r--kernel/kallsyms_internal.h31
-rw-r--r--kernel/kallsyms_selftest.c476
-rw-r--r--kernel/kallsyms_selftest.h13
-rw-r--r--kernel/kcov.c156
-rw-r--r--kernel/kcsan/.kunitconfig24
-rw-r--r--kernel/kcsan/Makefile6
-rw-r--r--kernel/kcsan/atomic.h23
-rw-r--r--kernel/kcsan/core.c552
-rw-r--r--kernel/kcsan/debugfs.c2
-rw-r--r--kernel/kcsan/kcsan.h8
-rw-r--r--kernel/kcsan/kcsan_test.c565
-rw-r--r--kernel/kcsan/permissive.h94
-rw-r--r--kernel/kcsan/report.c115
-rw-r--r--kernel/kcsan/selftest.c227
-rw-r--r--kernel/kexec.c104
-rw-r--r--kernel/kexec_core.c199
-rw-r--r--kernel/kexec_file.c166
-rw-r--r--kernel/kexec_internal.h15
-rw-r--r--kernel/kheaders.c10
-rw-r--r--kernel/kprobes.c893
-rw-r--r--kernel/ksysfs.c57
-rw-r--r--kernel/kthread.c203
-rw-r--r--kernel/latencytop.c45
-rw-r--r--kernel/livepatch/core.c154
-rw-r--r--kernel/livepatch/patch.c33
-rw-r--r--kernel/livepatch/shadow.c6
-rw-r--r--kernel/livepatch/transition.c283
-rw-r--r--kernel/locking/Makefile5
-rw-r--r--kernel/locking/lockdep.c215
-rw-r--r--kernel/locking/lockdep_internals.h6
-rw-r--r--kernel/locking/lockdep_proc.c51
-rw-r--r--kernel/locking/locktorture.c330
-rw-r--r--kernel/locking/mutex-debug.c5
-rw-r--r--kernel/locking/mutex-debug.h29
-rw-r--r--kernel/locking/mutex.c623
-rw-r--r--kernel/locking/mutex.h48
-rw-r--r--kernel/locking/percpu-rwsem.c16
-rw-r--r--kernel/locking/qrwlock.c21
-rw-r--r--kernel/locking/qspinlock.c11
-rw-r--r--kernel/locking/qspinlock_paravirt.h4
-rw-r--r--kernel/locking/rtmutex.c1242
-rw-r--r--kernel/locking/rtmutex_api.c612
-rw-r--r--kernel/locking/rtmutex_common.h135
-rw-r--r--kernel/locking/rwbase_rt.c289
-rw-r--r--kernel/locking/rwsem.c583
-rw-r--r--kernel/locking/semaphore.c31
-rw-r--r--kernel/locking/spinlock.c76
-rw-r--r--kernel/locking/spinlock_debug.c5
-rw-r--r--kernel/locking/spinlock_rt.c280
-rw-r--r--kernel/locking/test-ww_mutex.c93
-rw-r--r--kernel/locking/ww_mutex.h569
-rw-r--r--kernel/locking/ww_rt_mutex.c101
-rw-r--r--kernel/module-internal.h31
-rw-r--r--kernel/module/Kconfig392
-rw-r--r--kernel/module/Makefile25
-rw-r--r--kernel/module/debug_kmemleak.c30
-rw-r--r--kernel/module/decompress.c368
-rw-r--r--kernel/module/dups.c248
-rw-r--r--kernel/module/internal.h394
-rw-r--r--kernel/module/kallsyms.c549
-rw-r--r--kernel/module/kdb.c63
-rw-r--r--kernel/module/kmod.c (renamed from kernel/kmod.c)49
-rw-r--r--kernel/module/livepatch.c74
-rw-r--r--kernel/module/main.c (renamed from kernel/module.c)3081
-rw-r--r--kernel/module/procfs.c152
-rw-r--r--kernel/module/signing.c125
-rw-r--r--kernel/module/stats.c430
-rw-r--r--kernel/module/strict_rwx.c80
-rw-r--r--kernel/module/sysfs.c436
-rw-r--r--kernel/module/tracking.c129
-rw-r--r--kernel/module/tree_lookup.c112
-rw-r--r--kernel/module/version.c101
-rw-r--r--kernel/module_signing.c45
-rw-r--r--kernel/notifier.c146
-rw-r--r--kernel/nsproxy.c42
-rw-r--r--kernel/padata.c68
-rw-r--r--kernel/panic.c189
-rw-r--r--kernel/params.c50
-rw-r--r--kernel/pid.c59
-rw-r--r--kernel/pid_namespace.c30
-rw-r--r--kernel/pid_sysctl.h59
-rw-r--r--kernel/power/Kconfig21
-rw-r--r--kernel/power/Makefile6
-rw-r--r--kernel/power/energy_model.c140
-rw-r--r--kernel/power/hibernate.c77
-rw-r--r--kernel/power/main.c118
-rw-r--r--kernel/power/power.h16
-rw-r--r--kernel/power/process.c42
-rw-r--r--kernel/power/qos.c4
-rw-r--r--kernel/power/snapshot.c45
-rw-r--r--kernel/power/suspend.c38
-rw-r--r--kernel/power/suspend_test.c10
-rw-r--r--kernel/power/swap.c96
-rw-r--r--kernel/power/user.c39
-rw-r--r--kernel/power/wakelock.c11
-rw-r--r--kernel/printk/Makefile6
-rw-r--r--kernel/printk/index.c194
-rw-r--r--kernel/printk/internal.h82
-rw-r--r--kernel/printk/printk.c1807
-rw-r--r--kernel/printk/printk_ringbuffer.c54
-rw-r--r--kernel/printk/printk_ringbuffer.h2
-rw-r--r--kernel/printk/printk_safe.c364
-rw-r--r--kernel/printk/sysctl.c85
-rw-r--r--kernel/profile.c129
-rw-r--r--kernel/ptrace.c153
-rw-r--r--kernel/rcu/Kconfig147
-rw-r--r--kernel/rcu/Kconfig.debug40
-rw-r--r--kernel/rcu/rcu.h101
-rw-r--r--kernel/rcu/rcu_segcblist.c20
-rw-r--r--kernel/rcu/rcu_segcblist.h18
-rw-r--r--kernel/rcu/rcuscale.c129
-rw-r--r--kernel/rcu/rcutorture.c1283
-rw-r--r--kernel/rcu/refscale.c384
-rw-r--r--kernel/rcu/srcutiny.c30
-rw-r--r--kernel/rcu/srcutree.c1183
-rw-r--r--kernel/rcu/sync.c4
-rw-r--r--kernel/rcu/tasks.h1242
-rw-r--r--kernel/rcu/tiny.c51
-rw-r--r--kernel/rcu/tree.c2227
-rw-r--r--kernel/rcu/tree.h110
-rw-r--r--kernel/rcu/tree_exp.h392
-rw-r--r--kernel/rcu/tree_nocb.h1762
-rw-r--r--kernel/rcu/tree_plugin.h1917
-rw-r--r--kernel/rcu/tree_stall.h250
-rw-r--r--kernel/rcu/update.c103
-rw-r--r--kernel/reboot.c438
-rw-r--r--kernel/relay.c20
-rw-r--r--kernel/resource.c326
-rw-r--r--kernel/rseq.c125
-rw-r--r--kernel/scftorture.c134
-rw-r--r--kernel/sched/Makefile39
-rw-r--r--kernel/sched/autogroup.c31
-rw-r--r--kernel/sched/autogroup.h6
-rw-r--r--kernel/sched/build_policy.c54
-rw-r--r--kernel/sched/build_utility.c110
-rw-r--r--kernel/sched/clock.c66
-rw-r--r--kernel/sched/completion.c14
-rw-r--r--kernel/sched/core.c3515
-rw-r--r--kernel/sched/core_sched.c99
-rw-r--r--kernel/sched/cpuacct.c117
-rw-r--r--kernel/sched/cpudeadline.c5
-rw-r--r--kernel/sched/cpufreq.c3
-rw-r--r--kernel/sched/cpufreq_schedutil.c82
-rw-r--r--kernel/sched/cpupri.c3
-rw-r--r--kernel/sched/cputime.c36
-rw-r--r--kernel/sched/deadline.c501
-rw-r--r--kernel/sched/debug.c189
-rw-r--r--kernel/sched/fair.c3172
-rw-r--r--kernel/sched/features.h8
-rw-r--r--kernel/sched/idle.c59
-rw-r--r--kernel/sched/isolation.c163
-rw-r--r--kernel/sched/loadavg.c1
-rw-r--r--kernel/sched/membarrier.c47
-rw-r--r--kernel/sched/pelt.c4
-rw-r--r--kernel/sched/pelt.h48
-rw-r--r--kernel/sched/psi.c1034
-rw-r--r--kernel/sched/rt.c347
-rw-r--r--kernel/sched/sched.h1009
-rw-r--r--kernel/sched/smp.h8
-rw-r--r--kernel/sched/stats.c105
-rw-r--r--kernel/sched/stats.h84
-rw-r--r--kernel/sched/stop_task.c16
-rw-r--r--kernel/sched/swait.c1
-rw-r--r--kernel/sched/topology.c399
-rw-r--r--kernel/sched/wait.c26
-rw-r--r--kernel/sched/wait_bit.c4
-rw-r--r--kernel/scs.c25
-rw-r--r--kernel/seccomp.c116
-rw-r--r--kernel/signal.c588
-rw-r--r--kernel/smp.c393
-rw-r--r--kernel/smpboot.c32
-rw-r--r--kernel/softirq.c41
-rw-r--r--kernel/stackleak.c143
-rw-r--r--kernel/stacktrace.c33
-rw-r--r--kernel/static_call.c542
-rw-r--r--kernel/static_call_inline.c556
-rw-r--r--kernel/stop_machine.c23
-rw-r--r--kernel/sys.c419
-rw-r--r--kernel/sys_ni.c12
-rw-r--r--kernel/sysctl-test.c43
-rw-r--r--kernel/sysctl.c1431
-rw-r--r--kernel/task_work.c45
-rw-r--r--kernel/taskstats.c30
-rw-r--r--kernel/test_kprobes.c313
-rw-r--r--kernel/time/Kconfig50
-rw-r--r--kernel/time/alarmtimer.c36
-rw-r--r--kernel/time/clockevents.c11
-rw-r--r--kernel/time/clocksource-wdtest.c5
-rw-r--r--kernel/time/clocksource.c130
-rw-r--r--kernel/time/hrtimer.c363
-rw-r--r--kernel/time/jiffies.c21
-rw-r--r--kernel/time/namespace.c22
-rw-r--r--kernel/time/posix-cpu-timers.c235
-rw-r--r--kernel/time/posix-stubs.c5
-rw-r--r--kernel/time/posix-timers.c31
-rw-r--r--kernel/time/sched_clock.c13
-rw-r--r--kernel/time/test_udelay.c2
-rw-r--r--kernel/time/tick-broadcast-hrtimer.c29
-rw-r--r--kernel/time/tick-broadcast.c126
-rw-r--r--kernel/time/tick-common.c19
-rw-r--r--kernel/time/tick-internal.h32
-rw-r--r--kernel/time/tick-oneshot.c4
-rw-r--r--kernel/time/tick-sched.c256
-rw-r--r--kernel/time/tick-sched.h63
-rw-r--r--kernel/time/time.c12
-rw-r--r--kernel/time/timekeeping.c116
-rw-r--r--kernel/time/timer.c558
-rw-r--r--kernel/torture.c20
-rw-r--r--kernel/trace/Kconfig133
-rw-r--r--kernel/trace/Makefile9
-rw-r--r--kernel/trace/blktrace.c226
-rw-r--r--kernel/trace/bpf_trace.c1112
-rw-r--r--kernel/trace/fgraph.c39
-rw-r--r--kernel/trace/fprobe.c341
-rw-r--r--kernel/trace/ftrace.c1704
-rw-r--r--kernel/trace/kprobe_event_gen_test.c119
-rw-r--r--kernel/trace/pid_list.c495
-rw-r--r--kernel/trace/pid_list.h88
-rw-r--r--kernel/trace/rethook.c328
-rw-r--r--kernel/trace/ring_buffer.c489
-rw-r--r--kernel/trace/ring_buffer_benchmark.c2
-rw-r--r--kernel/trace/rv/Kconfig78
-rw-r--r--kernel/trace/rv/Makefile8
-rw-r--r--kernel/trace/rv/monitors/wip/wip.c88
-rw-r--r--kernel/trace/rv/monitors/wip/wip.h46
-rw-r--r--kernel/trace/rv/monitors/wwnr/wwnr.c87
-rw-r--r--kernel/trace/rv/monitors/wwnr/wwnr.h46
-rw-r--r--kernel/trace/rv/reactor_panic.c42
-rw-r--r--kernel/trace/rv/reactor_printk.c41
-rw-r--r--kernel/trace/rv/rv.c797
-rw-r--r--kernel/trace/rv/rv.h68
-rw-r--r--kernel/trace/rv/rv_reactors.c510
-rw-r--r--kernel/trace/synth_event_gen_test.c18
-rw-r--r--kernel/trace/trace.c914
-rw-r--r--kernel/trace/trace.h211
-rw-r--r--kernel/trace/trace_benchmark.c2
-rw-r--r--kernel/trace/trace_benchmark.h8
-rw-r--r--kernel/trace/trace_boot.c43
-rw-r--r--kernel/trace/trace_dynevent.c13
-rw-r--r--kernel/trace/trace_eprobe.c463
-rw-r--r--kernel/trace/trace_event_perf.c32
-rw-r--r--kernel/trace/trace_events.c345
-rw-r--r--kernel/trace/trace_events_filter.c469
-rw-r--r--kernel/trace/trace_events_hist.c1379
-rw-r--r--kernel/trace/trace_events_inject.c11
-rw-r--r--kernel/trace/trace_events_synth.c177
-rw-r--r--kernel/trace/trace_events_trigger.c753
-rw-r--r--kernel/trace/trace_events_user.c2571
-rw-r--r--kernel/trace/trace_export.c3
-rw-r--r--kernel/trace/trace_functions.c5
-rw-r--r--kernel/trace/trace_functions_graph.c4
-rw-r--r--kernel/trace/trace_hwlat.c25
-rw-r--r--kernel/trace/trace_kdb.c12
-rw-r--r--kernel/trace/trace_kprobe.c220
-rw-r--r--kernel/trace/trace_osnoise.c1033
-rw-r--r--kernel/trace/trace_output.c277
-rw-r--r--kernel/trace/trace_output.h2
-rw-r--r--kernel/trace/trace_preemptirq.c63
-rw-r--r--kernel/trace/trace_printk.c2
-rw-r--r--kernel/trace/trace_probe.c144
-rw-r--r--kernel/trace/trace_probe.h29
-rw-r--r--kernel/trace/trace_probe_kernel.h133
-rw-r--r--kernel/trace/trace_probe_tmpl.h95
-rw-r--r--kernel/trace/trace_recursion_record.c7
-rw-r--r--kernel/trace/trace_sched_switch.c5
-rw-r--r--kernel/trace/trace_sched_wakeup.c3
-rw-r--r--kernel/trace/trace_selftest.c95
-rw-r--r--kernel/trace/trace_seq.c23
-rw-r--r--kernel/trace/trace_stack.c6
-rw-r--r--kernel/trace/trace_stat.c6
-rw-r--r--kernel/trace/trace_synth.h1
-rw-r--r--kernel/trace/trace_syscalls.c39
-rw-r--r--kernel/trace/trace_uprobe.c79
-rw-r--r--kernel/trace/tracing_map.c51
-rw-r--r--kernel/tracepoint.c19
-rw-r--r--kernel/tsacct.c19
-rw-r--r--kernel/ucount.c116
-rw-r--r--kernel/umh.c67
-rw-r--r--kernel/user.c26
-rw-r--r--kernel/user_namespace.c31
-rw-r--r--kernel/usermode_driver.c4
-rw-r--r--kernel/utsname_sysctl.c21
-rw-r--r--kernel/vhost_task.c117
-rw-r--r--kernel/watch_queue.c133
-rw-r--r--kernel/watchdog.c128
-rw-r--r--kernel/workqueue.c950
-rw-r--r--kernel/workqueue_internal.h3
462 files changed, 86023 insertions, 37271 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index 3de8fd11873b..4198f0273ecd 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -251,7 +251,7 @@ config ARCH_USE_QUEUED_RWLOCKS
config QUEUED_RWLOCKS
def_bool y if ARCH_USE_QUEUED_RWLOCKS
- depends on SMP
+ depends on SMP && !PREEMPT_RT
config ARCH_HAS_MMIOWB
bool
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index 5876e30c5740..c2f1fd95a821 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -1,11 +1,23 @@
# SPDX-License-Identifier: GPL-2.0-only
+config PREEMPT_NONE_BUILD
+ bool
+
+config PREEMPT_VOLUNTARY_BUILD
+ bool
+
+config PREEMPT_BUILD
+ bool
+ select PREEMPTION
+ select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK
+
choice
prompt "Preemption Model"
default PREEMPT_NONE
config PREEMPT_NONE
bool "No Forced Preemption (Server)"
+ select PREEMPT_NONE_BUILD if !PREEMPT_DYNAMIC
help
This is the traditional Linux preemption model, geared towards
throughput. It will still provide good latencies most of the
@@ -20,6 +32,7 @@ config PREEMPT_NONE
config PREEMPT_VOLUNTARY
bool "Voluntary Kernel Preemption (Desktop)"
depends on !ARCH_NO_PREEMPT
+ select PREEMPT_VOLUNTARY_BUILD if !PREEMPT_DYNAMIC
help
This option reduces the latency of the kernel by adding more
"explicit preemption points" to the kernel code. These new
@@ -38,9 +51,7 @@ config PREEMPT_VOLUNTARY
config PREEMPT
bool "Preemptible Kernel (Low-Latency Desktop)"
depends on !ARCH_NO_PREEMPT
- select PREEMPTION
- select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK
- select PREEMPT_DYNAMIC if HAVE_PREEMPT_DYNAMIC
+ select PREEMPT_BUILD
help
This option reduces the latency of the kernel by making
all kernel code (that is not executing in a critical section)
@@ -83,7 +94,11 @@ config PREEMPTION
select PREEMPT_COUNT
config PREEMPT_DYNAMIC
- bool
+ bool "Preemption behaviour defined on boot"
+ depends on HAVE_PREEMPT_DYNAMIC && !PREEMPT_RT
+ select JUMP_LABEL if HAVE_PREEMPT_DYNAMIC_KEY
+ select PREEMPT_BUILD
+ default y if HAVE_PREEMPT_DYNAMIC_CALL
help
This option allows to define the preemption model on the kernel
command line parameter and thus override the default preemption
diff --git a/kernel/Makefile b/kernel/Makefile
index 4df609be42d0..b69c95315480 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -13,8 +13,8 @@ obj-y = fork.o exec_domain.o panic.o \
async.o range.o smpboot.o ucount.o regset.o
obj-$(CONFIG_USERMODE_DRIVER) += usermode_driver.o
-obj-$(CONFIG_MODULES) += kmod.o
obj-$(CONFIG_MULTIUSER) += groups.o
+obj-$(CONFIG_VHOST_TASK) += vhost_task.o
ifdef CONFIG_FUNCTION_TRACER
# Do not trace internal ftrace files
@@ -29,7 +29,6 @@ KCOV_INSTRUMENT_softirq.o := n
KCSAN_SANITIZE_softirq.o = n
# These are called from save_stack_trace() on slub debug path,
# and produce insane amounts of uninteresting coverage.
-KCOV_INSTRUMENT_module.o := n
KCOV_INSTRUMENT_extable.o := n
KCOV_INSTRUMENT_stacktrace.o := n
# Don't self-instrument.
@@ -39,11 +38,9 @@ KCOV_INSTRUMENT_kcov.o := n
KASAN_SANITIZE_kcov.o := n
KCSAN_SANITIZE_kcov.o := n
UBSAN_SANITIZE_kcov.o := n
+KMSAN_SANITIZE_kcov.o := n
CFLAGS_kcov.o := $(call cc-option, -fno-conserve-stack) -fno-stack-protector
-# Don't instrument error handlers
-CFLAGS_REMOVE_cfi.o := $(CC_FLAGS_CFI)
-
obj-y += sched/
obj-y += locking/
obj-y += power/
@@ -53,23 +50,23 @@ obj-y += rcu/
obj-y += livepatch/
obj-y += dma/
obj-y += entry/
+obj-$(CONFIG_MODULES) += module/
obj-$(CONFIG_KCMP) += kcmp.o
obj-$(CONFIG_FREEZER) += freezer.o
obj-$(CONFIG_PROFILING) += profile.o
obj-$(CONFIG_STACKTRACE) += stacktrace.o
obj-y += time/
-obj-$(CONFIG_FUTEX) += futex.o
+obj-$(CONFIG_FUTEX) += futex/
obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
obj-$(CONFIG_SMP) += smp.o
ifneq ($(CONFIG_SMP),y)
obj-y += up.o
endif
obj-$(CONFIG_UID16) += uid16.o
-obj-$(CONFIG_MODULES) += module.o
-obj-$(CONFIG_MODULE_SIG) += module_signing.o
obj-$(CONFIG_MODULE_SIG_FORMAT) += module_signature.o
obj-$(CONFIG_KALLSYMS) += kallsyms.o
+obj-$(CONFIG_KALLSYMS_SELFTEST) += kallsyms_selftest.o
obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
obj-$(CONFIG_CRASH_CORE) += crash_core.o
obj-$(CONFIG_KEXEC_CORE) += kexec_core.o
@@ -85,7 +82,6 @@ obj-$(CONFIG_PID_NS) += pid_namespace.o
obj-$(CONFIG_IKCONFIG) += configs.o
obj-$(CONFIG_IKHEADERS) += kheaders.o
obj-$(CONFIG_SMP) += stop_machine.o
-obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
obj-$(CONFIG_AUDITSYSCALL) += auditsc.o audit_watch.o audit_fsnotify.o audit_tree.o
obj-$(CONFIG_GCOV_KERNEL) += gcov/
@@ -108,12 +104,14 @@ obj-$(CONFIG_TRACING) += trace/
obj-$(CONFIG_TRACE_CLOCK) += trace/
obj-$(CONFIG_RING_BUFFER) += trace/
obj-$(CONFIG_TRACEPOINTS) += trace/
+obj-$(CONFIG_RETHOOK) += trace/
obj-$(CONFIG_IRQ_WORK) += irq_work.o
obj-$(CONFIG_CPU_PM) += cpu_pm.o
obj-$(CONFIG_BPF) += bpf/
obj-$(CONFIG_KCSAN) += kcsan/
obj-$(CONFIG_SHADOW_CALL_STACK) += scs.o
-obj-$(CONFIG_HAVE_STATIC_CALL_INLINE) += static_call.o
+obj-$(CONFIG_HAVE_STATIC_CALL) += static_call.o
+obj-$(CONFIG_HAVE_STATIC_CALL_INLINE) += static_call_inline.o
obj-$(CONFIG_CFI_CLANG) += cfi.o
obj-$(CONFIG_PERF_EVENTS) += events/
diff --git a/kernel/acct.c b/kernel/acct.c
index a64102be2bb0..010667ce6080 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -60,7 +60,6 @@
#include <linux/sched/cputime.h>
#include <asm/div64.h>
-#include <linux/blkdev.h> /* sector_div */
#include <linux/pid_namespace.h>
#include <linux/fs_pin.h>
@@ -71,11 +70,31 @@
* Turned into sysctl-controllable parameters. AV, 12/11/98
*/
-int acct_parm[3] = {4, 2, 30};
+static int acct_parm[3] = {4, 2, 30};
#define RESUME (acct_parm[0]) /* >foo% free space - resume */
#define SUSPEND (acct_parm[1]) /* <foo% free space - suspend */
#define ACCT_TIMEOUT (acct_parm[2]) /* foo second timeout between checks */
+#ifdef CONFIG_SYSCTL
+static struct ctl_table kern_acct_table[] = {
+ {
+ .procname = "acct",
+ .data = &acct_parm,
+ .maxlen = 3*sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ { }
+};
+
+static __init int kernel_acct_sysctls_init(void)
+{
+ register_sysctl_init("kernel", kern_acct_table);
+ return 0;
+}
+late_initcall(kernel_acct_sysctls_init);
+#endif /* CONFIG_SYSCTL */
+
/*
* External references and all of the globals.
*/
@@ -301,7 +320,7 @@ void acct_exit_ns(struct pid_namespace *ns)
}
/*
- * encode an unsigned long into a comp_t
+ * encode an u64 into a comp_t
*
* This routine has been adopted from the encode_comp_t() function in
* the kern_acct.c file of the FreeBSD operating system. The encoding
@@ -312,7 +331,7 @@ void acct_exit_ns(struct pid_namespace *ns)
#define EXPSIZE 3 /* Base 8 (3 bit) exponent. */
#define MAXFRACT ((1 << MANTSIZE) - 1) /* Maximum fractional value. */
-static comp_t encode_comp_t(unsigned long value)
+static comp_t encode_comp_t(u64 value)
{
int exp, rnd;
@@ -331,6 +350,8 @@ static comp_t encode_comp_t(unsigned long value)
exp++;
}
+ if (exp > (((comp_t) ~0U) >> MANTSIZE))
+ return (comp_t) ~0U;
/*
* Clean it up and polish it off.
*/
@@ -478,7 +499,7 @@ static void do_acct_process(struct bsd_acct_struct *acct)
/*
* Accounting records are not subject to resource limits.
*/
- flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
+ flim = rlimit(RLIMIT_FSIZE);
current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
/* Perform file operations on behalf of whoever enabled accounting */
orig_cred = override_creds(file->f_cred);
@@ -536,15 +557,14 @@ void acct_collect(long exitcode, int group_dead)
unsigned long vsize = 0;
if (group_dead && current->mm) {
+ struct mm_struct *mm = current->mm;
+ VMA_ITERATOR(vmi, mm, 0);
struct vm_area_struct *vma;
- mmap_read_lock(current->mm);
- vma = current->mm->mmap;
- while (vma) {
+ mmap_read_lock(mm);
+ for_each_vma(vmi, vma)
vsize += vma->vm_end - vma->vm_start;
- vma = vma->vm_next;
- }
- mmap_read_unlock(current->mm);
+ mmap_read_unlock(mm);
}
spin_lock_irq(&current->sighand->siglock);
diff --git a/kernel/async.c b/kernel/async.c
index b8d7a663497f..b2c4ba5686ee 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -205,9 +205,6 @@ async_cookie_t async_schedule_node_domain(async_func_t func, void *data,
atomic_inc(&entry_count);
spin_unlock_irqrestore(&async_lock, flags);
- /* mark that this task has queued an async job, used by module init */
- current->flags |= PF_USED_ASYNC;
-
/* schedule for execution */
queue_work_node(node, system_unbound_wq, &entry->work);
diff --git a/kernel/audit.c b/kernel/audit.c
index 121d37e700a6..9bc0b0301198 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -321,7 +321,6 @@ static inline int audit_rate_check(void)
static DEFINE_SPINLOCK(lock);
unsigned long flags;
unsigned long now;
- unsigned long elapsed;
int retval = 0;
if (!audit_rate_limit) return 1;
@@ -330,9 +329,8 @@ static inline int audit_rate_check(void)
if (++messages < audit_rate_limit) {
retval = 1;
} else {
- now = jiffies;
- elapsed = now - last_check;
- if (elapsed > HZ) {
+ now = jiffies;
+ if (time_after(now, last_check + HZ)) {
last_check = now;
messages = 0;
retval = 1;
@@ -366,7 +364,7 @@ void audit_log_lost(const char *message)
if (!print) {
spin_lock_irqsave(&lock, flags);
now = jiffies;
- if (now - last_msg > HZ) {
+ if (time_after(now, last_msg + HZ)) {
print = 1;
last_msg = now;
}
@@ -541,20 +539,22 @@ static void kauditd_printk_skb(struct sk_buff *skb)
/**
* kauditd_rehold_skb - Handle a audit record send failure in the hold queue
* @skb: audit record
+ * @error: error code (unused)
*
* Description:
* This should only be used by the kauditd_thread when it fails to flush the
* hold queue.
*/
-static void kauditd_rehold_skb(struct sk_buff *skb)
+static void kauditd_rehold_skb(struct sk_buff *skb, __always_unused int error)
{
- /* put the record back in the queue at the same place */
- skb_queue_head(&audit_hold_queue, skb);
+ /* put the record back in the queue */
+ skb_queue_tail(&audit_hold_queue, skb);
}
/**
* kauditd_hold_skb - Queue an audit record, waiting for auditd
* @skb: audit record
+ * @error: error code
*
* Description:
* Queue the audit record, waiting for an instance of auditd. When this
@@ -564,19 +564,31 @@ static void kauditd_rehold_skb(struct sk_buff *skb)
* and queue it, if we have room. If we want to hold on to the record, but we
* don't have room, record a record lost message.
*/
-static void kauditd_hold_skb(struct sk_buff *skb)
+static void kauditd_hold_skb(struct sk_buff *skb, int error)
{
/* at this point it is uncertain if we will ever send this to auditd so
* try to send the message via printk before we go any further */
kauditd_printk_skb(skb);
/* can we just silently drop the message? */
- if (!audit_default) {
- kfree_skb(skb);
- return;
+ if (!audit_default)
+ goto drop;
+
+ /* the hold queue is only for when the daemon goes away completely,
+ * not -EAGAIN failures; if we are in a -EAGAIN state requeue the
+ * record on the retry queue unless it's full, in which case drop it
+ */
+ if (error == -EAGAIN) {
+ if (!audit_backlog_limit ||
+ skb_queue_len(&audit_retry_queue) < audit_backlog_limit) {
+ skb_queue_tail(&audit_retry_queue, skb);
+ return;
+ }
+ audit_log_lost("kauditd retry queue overflow");
+ goto drop;
}
- /* if we have room, queue the message */
+ /* if we have room in the hold queue, queue the message */
if (!audit_backlog_limit ||
skb_queue_len(&audit_hold_queue) < audit_backlog_limit) {
skb_queue_tail(&audit_hold_queue, skb);
@@ -585,24 +597,32 @@ static void kauditd_hold_skb(struct sk_buff *skb)
/* we have no other options - drop the message */
audit_log_lost("kauditd hold queue overflow");
+drop:
kfree_skb(skb);
}
/**
* kauditd_retry_skb - Queue an audit record, attempt to send again to auditd
* @skb: audit record
+ * @error: error code (unused)
*
* Description:
* Not as serious as kauditd_hold_skb() as we still have a connected auditd,
* but for some reason we are having problems sending it audit records so
* queue the given record and attempt to resend.
*/
-static void kauditd_retry_skb(struct sk_buff *skb)
+static void kauditd_retry_skb(struct sk_buff *skb, __always_unused int error)
{
- /* NOTE: because records should only live in the retry queue for a
- * short period of time, before either being sent or moved to the hold
- * queue, we don't currently enforce a limit on this queue */
- skb_queue_tail(&audit_retry_queue, skb);
+ if (!audit_backlog_limit ||
+ skb_queue_len(&audit_retry_queue) < audit_backlog_limit) {
+ skb_queue_tail(&audit_retry_queue, skb);
+ return;
+ }
+
+ /* we have to drop the record, send it via printk as a last effort */
+ kauditd_printk_skb(skb);
+ audit_log_lost("kauditd retry queue overflow");
+ kfree_skb(skb);
}
/**
@@ -640,7 +660,7 @@ static void auditd_reset(const struct auditd_connection *ac)
/* flush the retry queue to the hold queue, but don't touch the main
* queue since we need to process that normally for multicast */
while ((skb = skb_dequeue(&audit_retry_queue)))
- kauditd_hold_skb(skb);
+ kauditd_hold_skb(skb, -ECONNREFUSED);
}
/**
@@ -714,16 +734,18 @@ static int kauditd_send_queue(struct sock *sk, u32 portid,
struct sk_buff_head *queue,
unsigned int retry_limit,
void (*skb_hook)(struct sk_buff *skb),
- void (*err_hook)(struct sk_buff *skb))
+ void (*err_hook)(struct sk_buff *skb, int error))
{
int rc = 0;
- struct sk_buff *skb;
- static unsigned int failed = 0;
+ struct sk_buff *skb = NULL;
+ struct sk_buff *skb_tail;
+ unsigned int failed = 0;
/* NOTE: kauditd_thread takes care of all our locking, we just use
* the netlink info passed to us (e.g. sk and portid) */
- while ((skb = skb_dequeue(queue))) {
+ skb_tail = skb_peek_tail(queue);
+ while ((skb != skb_tail) && (skb = skb_dequeue(queue))) {
/* call the skb_hook for each skb we touch */
if (skb_hook)
(*skb_hook)(skb);
@@ -731,36 +753,34 @@ static int kauditd_send_queue(struct sock *sk, u32 portid,
/* can we send to anyone via unicast? */
if (!sk) {
if (err_hook)
- (*err_hook)(skb);
+ (*err_hook)(skb, -ECONNREFUSED);
continue;
}
+retry:
/* grab an extra skb reference in case of error */
skb_get(skb);
rc = netlink_unicast(sk, skb, portid, 0);
if (rc < 0) {
- /* fatal failure for our queue flush attempt? */
+ /* send failed - try a few times unless fatal error */
if (++failed >= retry_limit ||
rc == -ECONNREFUSED || rc == -EPERM) {
- /* yes - error processing for the queue */
sk = NULL;
if (err_hook)
- (*err_hook)(skb);
- if (!skb_hook)
- goto out;
- /* keep processing with the skb_hook */
+ (*err_hook)(skb, rc);
+ if (rc == -EAGAIN)
+ rc = 0;
+ /* continue to drain the queue */
continue;
} else
- /* no - requeue to preserve ordering */
- skb_queue_head(queue, skb);
+ goto retry;
} else {
- /* it worked - drop the extra reference and continue */
+ /* skb sent - drop the extra reference and continue */
consume_skb(skb);
failed = 0;
}
}
-out:
return (rc >= 0 ? 0 : rc);
}
@@ -1078,7 +1098,7 @@ static inline void audit_log_user_recv_msg(struct audit_buffer **ab,
audit_log_common_recv_msg(NULL, ab, msg_type);
}
-int is_audit_feature_set(int i)
+static int is_audit_feature_set(int i)
{
return af.features & AUDIT_FEATURE_TO_MASK(i);
}
@@ -1368,7 +1388,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
str);
} else {
audit_log_format(ab, " data=");
- if (data_len > 0 && str[data_len - 1] == '\0')
+ if (str[data_len - 1] == '\0')
data_len--;
audit_log_n_untrustedstring(ab, str, data_len);
}
@@ -1446,7 +1466,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
if (err)
return err;
}
- sig_data = kmalloc(sizeof(*sig_data) + len, GFP_KERNEL);
+ sig_data = kmalloc(struct_size(sig_data, ctx, len), GFP_KERNEL);
if (!sig_data) {
if (audit_sig_sid)
security_release_secctx(ctx, len);
@@ -1459,7 +1479,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
security_release_secctx(ctx, len);
}
audit_send_reply(skb, seq, AUDIT_SIGNAL_INFO, 0, 0,
- sig_data, sizeof(*sig_data) + len);
+ sig_data, struct_size(sig_data, ctx, len));
kfree(sig_data);
break;
case AUDIT_TTY_GET: {
@@ -1542,6 +1562,20 @@ static void audit_receive(struct sk_buff *skb)
nlh = nlmsg_next(nlh, &len);
}
audit_ctl_unlock();
+
+ /* can't block with the ctrl lock, so penalize the sender now */
+ if (audit_backlog_limit &&
+ (skb_queue_len(&audit_queue) > audit_backlog_limit)) {
+ DECLARE_WAITQUEUE(wait, current);
+
+ /* wake kauditd to try and flush the queue */
+ wake_up_interruptible(&kauditd_wait);
+
+ add_wait_queue_exclusive(&audit_backlog_wait, &wait);
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule_timeout(audit_backlog_wait_time);
+ remove_wait_queue(&audit_backlog_wait, &wait);
+ }
}
/* Log information about who is connecting to the audit multicast socket */
@@ -1609,7 +1643,8 @@ static int __net_init audit_net_init(struct net *net)
audit_panic("cannot initialize netlink socket in namespace");
return -ENOMEM;
}
- aunet->sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
+ /* limit the timeout in case auditd is blocked/stopped */
+ aunet->sk->sk_sndtimeo = HZ / 10;
return 0;
}
@@ -1825,7 +1860,9 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
* task_tgid_vnr() since auditd_pid is set in audit_receive_msg()
* using a PID anchored in the caller's namespace
* 2. generator holding the audit_cmd_mutex - we don't want to block
- * while holding the mutex */
+ * while holding the mutex, although we do penalize the sender
+ * later in audit_receive() when it is safe to block
+ */
if (!(auditd_test_task(current) || audit_ctl_owner_current())) {
long stime = audit_backlog_wait_time;
@@ -2132,7 +2169,7 @@ int audit_log_task_context(struct audit_buffer *ab)
int error;
u32 sid;
- security_task_getsecid_subj(current, &sid);
+ security_current_getsecid_subj(&sid);
if (!sid)
return 0;
@@ -2353,7 +2390,7 @@ int audit_signal_info(int sig, struct task_struct *t)
audit_sig_uid = auid;
else
audit_sig_uid = uid;
- security_task_getsecid_subj(current, &audit_sig_sid);
+ security_current_getsecid_subj(&audit_sig_sid);
}
return audit_signal_info_syscall(t);
diff --git a/kernel/audit.h b/kernel/audit.h
index b565ea16c0a5..c57b008b9914 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -6,11 +6,15 @@
* Copyright 2005 IBM Corporation
*/
+#ifndef _KERNEL_AUDIT_H_
+#define _KERNEL_AUDIT_H_
+
#include <linux/fs.h>
#include <linux/audit.h>
#include <linux/skbuff.h>
#include <uapi/linux/mqueue.h>
#include <linux/tty.h>
+#include <uapi/linux/openat2.h> // struct open_how
/* AUDIT_NAMES is the number of slots we reserve in the audit_context
* for saving names from getname(). If we get more names we will allocate
@@ -97,10 +101,15 @@ struct audit_proctitle {
/* The per-task audit context. */
struct audit_context {
int dummy; /* must be the first element */
- int in_syscall; /* 1 if task is in a syscall */
+ enum {
+ AUDIT_CTX_UNUSED, /* audit_context is currently unused */
+ AUDIT_CTX_SYSCALL, /* in use by syscall */
+ AUDIT_CTX_URING, /* in use by io_uring */
+ } context;
enum audit_state state, current_state;
unsigned int serial; /* serial number for record */
int major; /* syscall number */
+ int uring_op; /* uring operation */
struct timespec64 ctime; /* time of syscall entry */
unsigned long argv[4]; /* syscall arguments */
long return_code;/* syscall return code */
@@ -124,7 +133,7 @@ struct audit_context {
struct sockaddr_storage *sockaddr;
size_t sockaddr_len;
/* Save things to print about task_struct */
- pid_t pid, ppid;
+ pid_t ppid;
kuid_t uid, euid, suid, fsuid;
kgid_t gid, egid, sgid, fsgid;
unsigned long personality;
@@ -185,12 +194,17 @@ struct audit_context {
int fd;
int flags;
} mmap;
+ struct open_how openat2;
struct {
int argc;
} execve;
struct {
char *name;
} module;
+ struct {
+ struct audit_ntp_data ntp_data;
+ struct timespec64 tk_injoffset;
+ } time;
};
int fds[2];
struct audit_proctitle proctitle;
@@ -231,8 +245,6 @@ struct audit_netlink_list {
int audit_send_list_thread(void *_dest);
-extern int selinux_audit_rule_update(void);
-
extern struct mutex audit_filter_mutex;
extern int audit_del_rule(struct audit_entry *entry);
extern void audit_free_rule_rcu(struct rcu_head *head);
@@ -331,3 +343,5 @@ extern int audit_filter(int msgtype, unsigned int listtype);
extern void audit_ctl_lock(void);
extern void audit_ctl_unlock(void);
+
+#endif
diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c
index 60739d5e3373..c565fbf66ac8 100644
--- a/kernel/audit_fsnotify.c
+++ b/kernel/audit_fsnotify.c
@@ -100,8 +100,9 @@ struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pa
audit_update_mark(audit_mark, dentry->d_inode);
audit_mark->rule = krule;
- ret = fsnotify_add_inode_mark(&audit_mark->mark, inode, true);
+ ret = fsnotify_add_inode_mark(&audit_mark->mark, inode, 0);
if (ret < 0) {
+ audit_mark->path = NULL;
fsnotify_put_mark(&audit_mark->mark);
audit_mark = ERR_PTR(ret);
}
@@ -160,8 +161,7 @@ static int audit_mark_handle_event(struct fsnotify_mark *inode_mark, u32 mask,
audit_mark = container_of(inode_mark, struct audit_fsnotify_mark, mark);
- if (WARN_ON_ONCE(inode_mark->group != audit_fsnotify_group) ||
- WARN_ON_ONCE(!inode))
+ if (WARN_ON_ONCE(inode_mark->group != audit_fsnotify_group))
return 0;
if (mask & (FS_CREATE|FS_MOVED_TO|FS_DELETE|FS_MOVED_FROM)) {
@@ -182,7 +182,8 @@ static const struct fsnotify_ops audit_mark_fsnotify_ops = {
static int __init audit_fsnotify_init(void)
{
- audit_fsnotify_group = fsnotify_alloc_group(&audit_mark_fsnotify_ops);
+ audit_fsnotify_group = fsnotify_alloc_group(&audit_mark_fsnotify_ops,
+ FSNOTIFY_GROUP_DUPS);
if (IS_ERR(audit_fsnotify_group)) {
audit_fsnotify_group = NULL;
audit_panic("cannot create audit fsnotify group");
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index b2be4e978ba3..e867c17d3f84 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -30,7 +30,7 @@ struct audit_chunk {
int count;
atomic_long_t refs;
struct rcu_head head;
- struct node {
+ struct audit_node {
struct list_head list;
struct audit_tree *owner;
unsigned index; /* index; upper bit indicates 'will prune' */
@@ -94,7 +94,7 @@ static struct audit_tree *alloc_tree(const char *s)
{
struct audit_tree *tree;
- tree = kmalloc(sizeof(struct audit_tree) + strlen(s) + 1, GFP_KERNEL);
+ tree = kmalloc(struct_size(tree, pathname, strlen(s) + 1), GFP_KERNEL);
if (tree) {
refcount_set(&tree->count, 1);
tree->goner = 0;
@@ -269,7 +269,7 @@ bool audit_tree_match(struct audit_chunk *chunk, struct audit_tree *tree)
/* tagging and untagging inodes with trees */
-static struct audit_chunk *find_chunk(struct node *p)
+static struct audit_chunk *find_chunk(struct audit_node *p)
{
int index = p->index & ~(1U<<31);
p -= index;
@@ -322,7 +322,7 @@ static void replace_chunk(struct audit_chunk *new, struct audit_chunk *old)
list_replace_rcu(&old->hash, &new->hash);
}
-static void remove_chunk_node(struct audit_chunk *chunk, struct node *p)
+static void remove_chunk_node(struct audit_chunk *chunk, struct audit_node *p)
{
struct audit_tree *owner = p->owner;
@@ -351,7 +351,7 @@ static void untag_chunk(struct audit_chunk *chunk, struct fsnotify_mark *mark)
struct audit_chunk *new;
int size;
- mutex_lock(&audit_tree_group->mark_mutex);
+ fsnotify_group_lock(audit_tree_group);
/*
* mark_mutex stabilizes chunk attached to the mark so we can check
* whether it didn't change while we've dropped hash_lock.
@@ -368,7 +368,7 @@ static void untag_chunk(struct audit_chunk *chunk, struct fsnotify_mark *mark)
replace_mark_chunk(mark, NULL);
spin_unlock(&hash_lock);
fsnotify_detach_mark(mark);
- mutex_unlock(&audit_tree_group->mark_mutex);
+ fsnotify_group_unlock(audit_tree_group);
audit_mark_put_chunk(chunk);
fsnotify_free_mark(mark);
return;
@@ -385,12 +385,12 @@ static void untag_chunk(struct audit_chunk *chunk, struct fsnotify_mark *mark)
*/
replace_chunk(new, chunk);
spin_unlock(&hash_lock);
- mutex_unlock(&audit_tree_group->mark_mutex);
+ fsnotify_group_unlock(audit_tree_group);
audit_mark_put_chunk(chunk);
return;
out_mutex:
- mutex_unlock(&audit_tree_group->mark_mutex);
+ fsnotify_group_unlock(audit_tree_group);
}
/* Call with group->mark_mutex held, releases it */
@@ -400,19 +400,19 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree)
struct audit_chunk *chunk = alloc_chunk(1);
if (!chunk) {
- mutex_unlock(&audit_tree_group->mark_mutex);
+ fsnotify_group_unlock(audit_tree_group);
return -ENOMEM;
}
mark = alloc_mark();
if (!mark) {
- mutex_unlock(&audit_tree_group->mark_mutex);
+ fsnotify_group_unlock(audit_tree_group);
kfree(chunk);
return -ENOMEM;
}
if (fsnotify_add_inode_mark_locked(mark, inode, 0)) {
- mutex_unlock(&audit_tree_group->mark_mutex);
+ fsnotify_group_unlock(audit_tree_group);
fsnotify_put_mark(mark);
kfree(chunk);
return -ENOSPC;
@@ -422,7 +422,7 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree)
if (tree->goner) {
spin_unlock(&hash_lock);
fsnotify_detach_mark(mark);
- mutex_unlock(&audit_tree_group->mark_mutex);
+ fsnotify_group_unlock(audit_tree_group);
fsnotify_free_mark(mark);
fsnotify_put_mark(mark);
kfree(chunk);
@@ -444,7 +444,7 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree)
*/
insert_hash(chunk);
spin_unlock(&hash_lock);
- mutex_unlock(&audit_tree_group->mark_mutex);
+ fsnotify_group_unlock(audit_tree_group);
/*
* Drop our initial reference. When mark we point to is getting freed,
* we get notification through ->freeing_mark callback and cleanup
@@ -459,10 +459,10 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
{
struct fsnotify_mark *mark;
struct audit_chunk *chunk, *old;
- struct node *p;
+ struct audit_node *p;
int n;
- mutex_lock(&audit_tree_group->mark_mutex);
+ fsnotify_group_lock(audit_tree_group);
mark = fsnotify_find_mark(&inode->i_fsnotify_marks, audit_tree_group);
if (!mark)
return create_chunk(inode, tree);
@@ -478,7 +478,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
for (n = 0; n < old->count; n++) {
if (old->owners[n].owner == tree) {
spin_unlock(&hash_lock);
- mutex_unlock(&audit_tree_group->mark_mutex);
+ fsnotify_group_unlock(audit_tree_group);
fsnotify_put_mark(mark);
return 0;
}
@@ -487,7 +487,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
chunk = alloc_chunk(old->count + 1);
if (!chunk) {
- mutex_unlock(&audit_tree_group->mark_mutex);
+ fsnotify_group_unlock(audit_tree_group);
fsnotify_put_mark(mark);
return -ENOMEM;
}
@@ -495,7 +495,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
spin_lock(&hash_lock);
if (tree->goner) {
spin_unlock(&hash_lock);
- mutex_unlock(&audit_tree_group->mark_mutex);
+ fsnotify_group_unlock(audit_tree_group);
fsnotify_put_mark(mark);
kfree(chunk);
return 0;
@@ -515,7 +515,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
*/
replace_chunk(chunk, old);
spin_unlock(&hash_lock);
- mutex_unlock(&audit_tree_group->mark_mutex);
+ fsnotify_group_unlock(audit_tree_group);
fsnotify_put_mark(mark); /* pair to fsnotify_find_mark */
audit_mark_put_chunk(old);
@@ -570,11 +570,11 @@ static void prune_tree_chunks(struct audit_tree *victim, bool tagged)
{
spin_lock(&hash_lock);
while (!list_empty(&victim->chunks)) {
- struct node *p;
+ struct audit_node *p;
struct audit_chunk *chunk;
struct fsnotify_mark *mark;
- p = list_first_entry(&victim->chunks, struct node, list);
+ p = list_first_entry(&victim->chunks, struct audit_node, list);
/* have we run out of marked? */
if (tagged && !(p->index & (1U<<31)))
break;
@@ -593,7 +593,6 @@ static void prune_tree_chunks(struct audit_tree *victim, bool tagged)
spin_lock(&hash_lock);
}
spin_unlock(&hash_lock);
- put_tree(victim);
}
/*
@@ -602,6 +601,7 @@ static void prune_tree_chunks(struct audit_tree *victim, bool tagged)
static void prune_one(struct audit_tree *victim)
{
prune_tree_chunks(victim, false);
+ put_tree(victim);
}
/* trim the uncommitted chunks from tree */
@@ -616,7 +616,7 @@ static void trim_marked(struct audit_tree *tree)
}
/* reorder */
for (p = tree->chunks.next; p != &tree->chunks; p = q) {
- struct node *node = list_entry(p, struct node, list);
+ struct audit_node *node = list_entry(p, struct audit_node, list);
q = p->next;
if (node->index & (1U<<31)) {
list_del_init(p);
@@ -684,7 +684,7 @@ void audit_trim_trees(void)
struct audit_tree *tree;
struct path path;
struct vfsmount *root_mnt;
- struct node *node;
+ struct audit_node *node;
int err;
tree = container_of(cursor.next, struct audit_tree, list);
@@ -726,7 +726,8 @@ int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op)
{
if (pathname[0] != '/' ||
- rule->listnr != AUDIT_FILTER_EXIT ||
+ (rule->listnr != AUDIT_FILTER_EXIT &&
+ rule->listnr != AUDIT_FILTER_URING_EXIT) ||
op != Audit_equal ||
rule->inode_f || rule->watch || rule->tree)
return -EINVAL;
@@ -839,7 +840,7 @@ int audit_add_tree_rule(struct audit_krule *rule)
drop_collected_mounts(mnt);
if (!err) {
- struct node *node;
+ struct audit_node *node;
spin_lock(&hash_lock);
list_for_each_entry(node, &tree->chunks, list)
node->index &= ~(1U<<31);
@@ -938,7 +939,7 @@ int audit_tag_tree(char *old, char *new)
mutex_unlock(&audit_filter_mutex);
if (!failed) {
- struct node *node;
+ struct audit_node *node;
spin_lock(&hash_lock);
list_for_each_entry(node, &tree->chunks, list)
node->index &= ~(1U<<31);
@@ -1043,12 +1044,12 @@ static void audit_tree_freeing_mark(struct fsnotify_mark *mark,
{
struct audit_chunk *chunk;
- mutex_lock(&mark->group->mark_mutex);
+ fsnotify_group_lock(mark->group);
spin_lock(&hash_lock);
chunk = mark_chunk(mark);
replace_mark_chunk(mark, NULL);
spin_unlock(&hash_lock);
- mutex_unlock(&mark->group->mark_mutex);
+ fsnotify_group_unlock(mark->group);
if (chunk) {
evict_chunk(chunk);
audit_mark_put_chunk(chunk);
@@ -1073,7 +1074,7 @@ static int __init audit_tree_init(void)
audit_tree_mark_cachep = KMEM_CACHE(audit_tree_mark, SLAB_PANIC);
- audit_tree_group = fsnotify_alloc_group(&audit_tree_ops);
+ audit_tree_group = fsnotify_alloc_group(&audit_tree_ops, 0);
if (IS_ERR(audit_tree_group))
audit_panic("cannot initialize fsnotify group for rectree watches");
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 2acf7ca49154..65075f1e4ac8 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -133,7 +133,7 @@ int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev)
}
/* Initialize a parent watch entry. */
-static struct audit_parent *audit_init_parent(struct path *path)
+static struct audit_parent *audit_init_parent(const struct path *path)
{
struct inode *inode = d_backing_inode(path->dentry);
struct audit_parent *parent;
@@ -183,7 +183,8 @@ int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op)
return -EOPNOTSUPP;
if (path[0] != '/' || path[len-1] == '/' ||
- krule->listnr != AUDIT_FILTER_EXIT ||
+ (krule->listnr != AUDIT_FILTER_EXIT &&
+ krule->listnr != AUDIT_FILTER_URING_EXIT) ||
op != Audit_equal ||
krule->inode_f || krule->watch || krule->tree)
return -EINVAL;
@@ -472,8 +473,7 @@ static int audit_watch_handle_event(struct fsnotify_mark *inode_mark, u32 mask,
parent = container_of(inode_mark, struct audit_parent, mark);
- if (WARN_ON_ONCE(inode_mark->group != audit_watch_group) ||
- WARN_ON_ONCE(!inode))
+ if (WARN_ON_ONCE(inode_mark->group != audit_watch_group))
return 0;
if (mask & (FS_CREATE|FS_MOVED_TO) && inode)
@@ -493,7 +493,7 @@ static const struct fsnotify_ops audit_watch_fsnotify_ops = {
static int __init audit_watch_init(void)
{
- audit_watch_group = fsnotify_alloc_group(&audit_watch_fsnotify_ops);
+ audit_watch_group = fsnotify_alloc_group(&audit_watch_fsnotify_ops, 0);
if (IS_ERR(audit_watch_group)) {
audit_watch_group = NULL;
audit_panic("cannot create audit fsnotify group");
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index db2c6b59dfc3..42d99896e7a6 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -44,7 +44,8 @@ struct list_head audit_filter_list[AUDIT_NR_FILTERS] = {
LIST_HEAD_INIT(audit_filter_list[4]),
LIST_HEAD_INIT(audit_filter_list[5]),
LIST_HEAD_INIT(audit_filter_list[6]),
-#if AUDIT_NR_FILTERS != 7
+ LIST_HEAD_INIT(audit_filter_list[7]),
+#if AUDIT_NR_FILTERS != 8
#error Fix audit_filter_list initialiser
#endif
};
@@ -56,6 +57,7 @@ static struct list_head audit_rules_list[AUDIT_NR_FILTERS] = {
LIST_HEAD_INIT(audit_rules_list[4]),
LIST_HEAD_INIT(audit_rules_list[5]),
LIST_HEAD_INIT(audit_rules_list[6]),
+ LIST_HEAD_INIT(audit_rules_list[7]),
};
DEFINE_MUTEX(audit_filter_mutex);
@@ -151,7 +153,8 @@ char *audit_unpack_string(void **bufp, size_t *remain, size_t len)
static inline int audit_to_inode(struct audit_krule *krule,
struct audit_field *f)
{
- if (krule->listnr != AUDIT_FILTER_EXIT ||
+ if ((krule->listnr != AUDIT_FILTER_EXIT &&
+ krule->listnr != AUDIT_FILTER_URING_EXIT) ||
krule->inode_f || krule->watch || krule->tree ||
(f->op != Audit_equal && f->op != Audit_not_equal))
return -EINVAL;
@@ -248,6 +251,7 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule_data *
pr_err("AUDIT_FILTER_ENTRY is deprecated\n");
goto exit_err;
case AUDIT_FILTER_EXIT:
+ case AUDIT_FILTER_URING_EXIT:
case AUDIT_FILTER_TASK:
#endif
case AUDIT_FILTER_USER:
@@ -332,6 +336,10 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f)
if (entry->rule.listnr != AUDIT_FILTER_FS)
return -EINVAL;
break;
+ case AUDIT_PERM:
+ if (entry->rule.listnr == AUDIT_FILTER_URING_EXIT)
+ return -EINVAL;
+ break;
}
switch (entry->rule.listnr) {
@@ -629,7 +637,7 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
void *bufp;
int i;
- data = kmalloc(sizeof(*data) + krule->buflen, GFP_KERNEL);
+ data = kmalloc(struct_size(data, buf, krule->buflen), GFP_KERNEL);
if (unlikely(!data))
return NULL;
memset(data, 0, sizeof(*data));
@@ -980,7 +988,8 @@ static inline int audit_add_rule(struct audit_entry *entry)
}
entry->rule.prio = ~0ULL;
- if (entry->rule.listnr == AUDIT_FILTER_EXIT) {
+ if (entry->rule.listnr == AUDIT_FILTER_EXIT ||
+ entry->rule.listnr == AUDIT_FILTER_URING_EXIT) {
if (entry->rule.flags & AUDIT_FILTER_PREPEND)
entry->rule.prio = ++prio_high;
else
@@ -1083,7 +1092,7 @@ static void audit_list_rules(int seq, struct sk_buff_head *q)
break;
skb = audit_make_reply(seq, AUDIT_LIST_RULES, 0, 1,
data,
- sizeof(*data) + data->buflen);
+ struct_size(data, buf, data->buflen));
if (skb)
skb_queue_tail(q, skb);
kfree(data);
@@ -1359,8 +1368,7 @@ int audit_filter(int msgtype, unsigned int listtype)
case AUDIT_SUBJ_SEN:
case AUDIT_SUBJ_CLR:
if (f->lsm_rule) {
- security_task_getsecid_subj(current,
- &sid);
+ security_current_getsecid_subj(&sid);
result = security_audit_rule_match(sid,
f->type, f->op, f->lsm_rule);
}
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 8dd73a64f921..addeed3df15d 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/* auditsc.c -- System-call auditing support
* Handles all system-call specific auditing features.
*
@@ -6,20 +7,6 @@
* Copyright (C) 2005, 2006 IBM Corporation
* All Rights Reserved.
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
* Written by Rickard E. (Rik) Faith <faith@redhat.com>
*
* Many of the ideas implemented here are from Stephen C. Tweedie,
@@ -76,6 +63,8 @@
#include <linux/fsnotify_backend.h>
#include <uapi/linux/limits.h>
#include <uapi/linux/netfilter/nf_tables.h>
+#include <uapi/linux/openat2.h> // struct open_how
+#include <uapi/linux/fanotify.h>
#include "audit.h"
@@ -166,7 +155,7 @@ static int audit_match_perm(struct audit_context *ctx, int mask)
n = ctx->major;
switch (audit_classify_syscall(ctx->arch, n)) {
- case 0: /* native */
+ case AUDITSC_NATIVE:
if ((mask & AUDIT_PERM_WRITE) &&
audit_match_class(AUDIT_CLASS_WRITE, n))
return 1;
@@ -177,7 +166,7 @@ static int audit_match_perm(struct audit_context *ctx, int mask)
audit_match_class(AUDIT_CLASS_CHATTR, n))
return 1;
return 0;
- case 1: /* 32bit on biarch */
+ case AUDITSC_COMPAT: /* 32bit on biarch */
if ((mask & AUDIT_PERM_WRITE) &&
audit_match_class(AUDIT_CLASS_WRITE_32, n))
return 1;
@@ -188,14 +177,16 @@ static int audit_match_perm(struct audit_context *ctx, int mask)
audit_match_class(AUDIT_CLASS_CHATTR_32, n))
return 1;
return 0;
- case 2: /* open */
+ case AUDITSC_OPEN:
return mask & ACC_MODE(ctx->argv[1]);
- case 3: /* openat */
+ case AUDITSC_OPENAT:
return mask & ACC_MODE(ctx->argv[2]);
- case 4: /* socketcall */
+ case AUDITSC_SOCKETCALL:
return ((mask & AUDIT_PERM_WRITE) && ctx->argv[0] == SYS_BIND);
- case 5: /* execve */
+ case AUDITSC_EXECVE:
return mask & AUDIT_PERM_EXEC;
+ case AUDITSC_OPENAT2:
+ return mask & ACC_MODE((u32)ctx->openat2.flags);
default:
return 0;
}
@@ -480,6 +471,9 @@ static int audit_filter_rules(struct task_struct *tsk,
u32 sid;
unsigned int sessionid;
+ if (ctx && rule->prio <= ctx->prio)
+ return 0;
+
cred = rcu_dereference_check(tsk->cred, tsk == current || task_creation);
for (i = 0; i < rule->field_count; i++) {
@@ -657,7 +651,7 @@ static int audit_filter_rules(struct task_struct *tsk,
result = audit_comparator(audit_loginuid_set(tsk), f->op, f->val);
break;
case AUDIT_SADDR_FAM:
- if (ctx->sockaddr)
+ if (ctx && ctx->sockaddr)
result = audit_comparator(ctx->sockaddr->ss_family,
f->op, f->val);
break;
@@ -673,7 +667,16 @@ static int audit_filter_rules(struct task_struct *tsk,
logged upon error */
if (f->lsm_rule) {
if (need_sid) {
- security_task_getsecid_subj(tsk, &sid);
+ /* @tsk should always be equal to
+ * @current with the exception of
+ * fork()/copy_process() in which case
+ * the new @tsk creds are still a dup
+ * of @current's creds so we can still
+ * use security_current_getsecid_subj()
+ * here even though it always refs
+ * @current's creds
+ */
+ security_current_getsecid_subj(&sid);
need_sid = 0;
}
result = security_audit_rule_match(sid, f->type,
@@ -747,8 +750,6 @@ static int audit_filter_rules(struct task_struct *tsk,
}
if (ctx) {
- if (rule->prio <= ctx->prio)
- return 0;
if (rule->filterkey) {
kfree(ctx->filterkey);
ctx->filterkey = kstrdup(rule->filterkey, GFP_ATOMIC);
@@ -805,6 +806,57 @@ static int audit_in_mask(const struct audit_krule *rule, unsigned long val)
return rule->mask[word] & bit;
}
+/**
+ * __audit_filter_op - common filter helper for operations (syscall/uring/etc)
+ * @tsk: associated task
+ * @ctx: audit context
+ * @list: audit filter list
+ * @name: audit_name (can be NULL)
+ * @op: current syscall/uring_op
+ *
+ * Run the udit filters specified in @list against @tsk using @ctx,
+ * @name, and @op, as necessary; the caller is responsible for ensuring
+ * that the call is made while the RCU read lock is held. The @name
+ * parameter can be NULL, but all others must be specified.
+ * Returns 1/true if the filter finds a match, 0/false if none are found.
+ */
+static int __audit_filter_op(struct task_struct *tsk,
+ struct audit_context *ctx,
+ struct list_head *list,
+ struct audit_names *name,
+ unsigned long op)
+{
+ struct audit_entry *e;
+ enum audit_state state;
+
+ list_for_each_entry_rcu(e, list, list) {
+ if (audit_in_mask(&e->rule, op) &&
+ audit_filter_rules(tsk, &e->rule, ctx, name,
+ &state, false)) {
+ ctx->current_state = state;
+ return 1;
+ }
+ }
+ return 0;
+}
+
+/**
+ * audit_filter_uring - apply filters to an io_uring operation
+ * @tsk: associated task
+ * @ctx: audit context
+ */
+static void audit_filter_uring(struct task_struct *tsk,
+ struct audit_context *ctx)
+{
+ if (auditd_test_task(tsk))
+ return;
+
+ rcu_read_lock();
+ __audit_filter_op(tsk, ctx, &audit_filter_list[AUDIT_FILTER_URING_EXIT],
+ NULL, ctx->uring_op);
+ rcu_read_unlock();
+}
+
/* At syscall exit time, this filter is called if the audit_state is
* not low enough that auditing cannot take place, but is also not
* high enough that we already know we have to write an audit record
@@ -813,24 +865,13 @@ static int audit_in_mask(const struct audit_krule *rule, unsigned long val)
static void audit_filter_syscall(struct task_struct *tsk,
struct audit_context *ctx)
{
- struct audit_entry *e;
- enum audit_state state;
-
if (auditd_test_task(tsk))
return;
rcu_read_lock();
- list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_EXIT], list) {
- if (audit_in_mask(&e->rule, ctx->major) &&
- audit_filter_rules(tsk, &e->rule, ctx, NULL,
- &state, false)) {
- rcu_read_unlock();
- ctx->current_state = state;
- return;
- }
- }
+ __audit_filter_op(tsk, ctx, &audit_filter_list[AUDIT_FILTER_EXIT],
+ NULL, ctx->major);
rcu_read_unlock();
- return;
}
/*
@@ -842,17 +883,8 @@ static int audit_filter_inode_name(struct task_struct *tsk,
struct audit_context *ctx) {
int h = audit_hash_ino((u32)n->ino);
struct list_head *list = &audit_inode_hash[h];
- struct audit_entry *e;
- enum audit_state state;
- list_for_each_entry_rcu(e, list, list) {
- if (audit_in_mask(&e->rule, ctx->major) &&
- audit_filter_rules(tsk, &e->rule, ctx, n, &state, false)) {
- ctx->current_state = state;
- return 1;
- }
- }
- return 0;
+ return __audit_filter_op(tsk, ctx, list, n, ctx->major);
}
/* At syscall exit time, this filter is called if any audit_names have been
@@ -915,10 +947,80 @@ static inline void audit_free_aux(struct audit_context *context)
context->aux = aux->next;
kfree(aux);
}
+ context->aux = NULL;
while ((aux = context->aux_pids)) {
context->aux_pids = aux->next;
kfree(aux);
}
+ context->aux_pids = NULL;
+}
+
+/**
+ * audit_reset_context - reset a audit_context structure
+ * @ctx: the audit_context to reset
+ *
+ * All fields in the audit_context will be reset to an initial state, all
+ * references held by fields will be dropped, and private memory will be
+ * released. When this function returns the audit_context will be suitable
+ * for reuse, so long as the passed context is not NULL or a dummy context.
+ */
+static void audit_reset_context(struct audit_context *ctx)
+{
+ if (!ctx)
+ return;
+
+ /* if ctx is non-null, reset the "ctx->context" regardless */
+ ctx->context = AUDIT_CTX_UNUSED;
+ if (ctx->dummy)
+ return;
+
+ /*
+ * NOTE: It shouldn't matter in what order we release the fields, so
+ * release them in the order in which they appear in the struct;
+ * this gives us some hope of quickly making sure we are
+ * resetting the audit_context properly.
+ *
+ * Other things worth mentioning:
+ * - we don't reset "dummy"
+ * - we don't reset "state", we do reset "current_state"
+ * - we preserve "filterkey" if "state" is AUDIT_STATE_RECORD
+ * - much of this is likely overkill, but play it safe for now
+ * - we really need to work on improving the audit_context struct
+ */
+
+ ctx->current_state = ctx->state;
+ ctx->serial = 0;
+ ctx->major = 0;
+ ctx->uring_op = 0;
+ ctx->ctime = (struct timespec64){ .tv_sec = 0, .tv_nsec = 0 };
+ memset(ctx->argv, 0, sizeof(ctx->argv));
+ ctx->return_code = 0;
+ ctx->prio = (ctx->state == AUDIT_STATE_RECORD ? ~0ULL : 0);
+ ctx->return_valid = AUDITSC_INVALID;
+ audit_free_names(ctx);
+ if (ctx->state != AUDIT_STATE_RECORD) {
+ kfree(ctx->filterkey);
+ ctx->filterkey = NULL;
+ }
+ audit_free_aux(ctx);
+ kfree(ctx->sockaddr);
+ ctx->sockaddr = NULL;
+ ctx->sockaddr_len = 0;
+ ctx->ppid = 0;
+ ctx->uid = ctx->euid = ctx->suid = ctx->fsuid = KUIDT_INIT(0);
+ ctx->gid = ctx->egid = ctx->sgid = ctx->fsgid = KGIDT_INIT(0);
+ ctx->personality = 0;
+ ctx->arch = 0;
+ ctx->target_pid = 0;
+ ctx->target_auid = ctx->target_uid = KUIDT_INIT(0);
+ ctx->target_sessionid = 0;
+ ctx->target_sid = 0;
+ ctx->target_comm[0] = '\0';
+ unroll_tree_refs(ctx, NULL, 0);
+ WARN_ON(!list_empty(&ctx->killed_trees));
+ audit_free_module(ctx);
+ ctx->fds[0] = -1;
+ ctx->type = 0; /* reset last for audit_free_*() */
}
static inline struct audit_context *audit_alloc_context(enum audit_state state)
@@ -928,6 +1030,7 @@ static inline struct audit_context *audit_alloc_context(enum audit_state state)
context = kzalloc(sizeof(*context), GFP_KERNEL);
if (!context)
return NULL;
+ context->context = AUDIT_CTX_UNUSED;
context->state = state;
context->prio = state == AUDIT_STATE_RECORD ? ~0ULL : 0;
INIT_LIST_HEAD(&context->killed_trees);
@@ -953,7 +1056,7 @@ int audit_alloc(struct task_struct *tsk)
char *key = NULL;
if (likely(!audit_ever_enabled))
- return 0; /* Return if not auditing. */
+ return 0;
state = audit_filter_task(tsk, &key);
if (state == AUDIT_STATE_DISABLED) {
@@ -975,14 +1078,11 @@ int audit_alloc(struct task_struct *tsk)
static inline void audit_free_context(struct audit_context *context)
{
- audit_free_module(context);
- audit_free_names(context);
- unroll_tree_refs(context, NULL, 0);
+ /* resetting is extra work, but it is likely just noise */
+ audit_reset_context(context);
+ audit_proctitle_free(context);
free_tree_refs(context);
- audit_free_aux(context);
kfree(context->filterkey);
- kfree(context->sockaddr);
- audit_proctitle_free(context);
kfree(context);
}
@@ -1195,15 +1295,11 @@ out:
static void audit_log_cap(struct audit_buffer *ab, char *prefix,
kernel_cap_t *cap)
{
- int i;
-
if (cap_isclear(*cap)) {
audit_log_format(ab, " %s=0", prefix);
return;
}
- audit_log_format(ab, " %s=", prefix);
- CAP_FOR_EACH_U32(i)
- audit_log_format(ab, "%08x", cap->cap[CAP_LAST_U32 - i]);
+ audit_log_format(ab, " %s=%016llx", prefix, cap->val);
}
static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name)
@@ -1219,6 +1315,53 @@ static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name)
from_kuid(&init_user_ns, name->fcap.rootid));
}
+static void audit_log_time(struct audit_context *context, struct audit_buffer **ab)
+{
+ const struct audit_ntp_data *ntp = &context->time.ntp_data;
+ const struct timespec64 *tk = &context->time.tk_injoffset;
+ static const char * const ntp_name[] = {
+ "offset",
+ "freq",
+ "status",
+ "tai",
+ "tick",
+ "adjust",
+ };
+ int type;
+
+ if (context->type == AUDIT_TIME_ADJNTPVAL) {
+ for (type = 0; type < AUDIT_NTP_NVALS; type++) {
+ if (ntp->vals[type].newval != ntp->vals[type].oldval) {
+ if (!*ab) {
+ *ab = audit_log_start(context,
+ GFP_KERNEL,
+ AUDIT_TIME_ADJNTPVAL);
+ if (!*ab)
+ return;
+ }
+ audit_log_format(*ab, "op=%s old=%lli new=%lli",
+ ntp_name[type],
+ ntp->vals[type].oldval,
+ ntp->vals[type].newval);
+ audit_log_end(*ab);
+ *ab = NULL;
+ }
+ }
+ }
+ if (tk->tv_sec != 0 || tk->tv_nsec != 0) {
+ if (!*ab) {
+ *ab = audit_log_start(context, GFP_KERNEL,
+ AUDIT_TIME_INJOFFSET);
+ if (!*ab)
+ return;
+ }
+ audit_log_format(*ab, "sec=%lli nsec=%li",
+ (long long)tk->tv_sec, tk->tv_nsec);
+ audit_log_end(*ab);
+ *ab = NULL;
+ }
+}
+
static void show_special(struct audit_context *context, int *call_panic)
{
struct audit_buffer *ab;
@@ -1316,6 +1459,12 @@ static void show_special(struct audit_context *context, int *call_panic)
audit_log_format(ab, "fd=%d flags=0x%x", context->mmap.fd,
context->mmap.flags);
break;
+ case AUDIT_OPENAT2:
+ audit_log_format(ab, "oflag=0%llo mode=0%llo resolve=0x%llx",
+ context->openat2.flags,
+ context->openat2.mode,
+ context->openat2.resolve);
+ break;
case AUDIT_EXECVE:
audit_log_execve_info(context, &ab);
break;
@@ -1327,6 +1476,11 @@ static void show_special(struct audit_context *context, int *call_panic)
audit_log_format(ab, "(null)");
break;
+ case AUDIT_TIME_ADJNTPVAL:
+ case AUDIT_TIME_INJOFFSET:
+ /* this call deviates from the rest, eating the buffer */
+ audit_log_time(context, &ab);
+ break;
}
audit_log_end(ab);
}
@@ -1479,6 +1633,44 @@ out:
audit_log_end(ab);
}
+/**
+ * audit_log_uring - generate a AUDIT_URINGOP record
+ * @ctx: the audit context
+ */
+static void audit_log_uring(struct audit_context *ctx)
+{
+ struct audit_buffer *ab;
+ const struct cred *cred;
+
+ ab = audit_log_start(ctx, GFP_ATOMIC, AUDIT_URINGOP);
+ if (!ab)
+ return;
+ cred = current_cred();
+ audit_log_format(ab, "uring_op=%d", ctx->uring_op);
+ if (ctx->return_valid != AUDITSC_INVALID)
+ audit_log_format(ab, " success=%s exit=%ld",
+ (ctx->return_valid == AUDITSC_SUCCESS ?
+ "yes" : "no"),
+ ctx->return_code);
+ audit_log_format(ab,
+ " items=%d"
+ " ppid=%d pid=%d uid=%u gid=%u euid=%u suid=%u"
+ " fsuid=%u egid=%u sgid=%u fsgid=%u",
+ ctx->name_count,
+ task_ppid_nr(current), task_tgid_nr(current),
+ from_kuid(&init_user_ns, cred->uid),
+ from_kgid(&init_user_ns, cred->gid),
+ from_kuid(&init_user_ns, cred->euid),
+ from_kuid(&init_user_ns, cred->suid),
+ from_kuid(&init_user_ns, cred->fsuid),
+ from_kgid(&init_user_ns, cred->egid),
+ from_kgid(&init_user_ns, cred->sgid),
+ from_kgid(&init_user_ns, cred->fsgid));
+ audit_log_task_context(ab);
+ audit_log_key(ab, ctx->filterkey);
+ audit_log_end(ab);
+}
+
static void audit_log_exit(void)
{
int i, call_panic = 0;
@@ -1489,29 +1681,38 @@ static void audit_log_exit(void)
context->personality = current->personality;
- ab = audit_log_start(context, GFP_KERNEL, AUDIT_SYSCALL);
- if (!ab)
- return; /* audit_panic has been called */
- audit_log_format(ab, "arch=%x syscall=%d",
- context->arch, context->major);
- if (context->personality != PER_LINUX)
- audit_log_format(ab, " per=%lx", context->personality);
- if (context->return_valid != AUDITSC_INVALID)
- audit_log_format(ab, " success=%s exit=%ld",
- (context->return_valid==AUDITSC_SUCCESS)?"yes":"no",
- context->return_code);
-
- audit_log_format(ab,
- " a0=%lx a1=%lx a2=%lx a3=%lx items=%d",
- context->argv[0],
- context->argv[1],
- context->argv[2],
- context->argv[3],
- context->name_count);
-
- audit_log_task_info(ab);
- audit_log_key(ab, context->filterkey);
- audit_log_end(ab);
+ switch (context->context) {
+ case AUDIT_CTX_SYSCALL:
+ ab = audit_log_start(context, GFP_KERNEL, AUDIT_SYSCALL);
+ if (!ab)
+ return;
+ audit_log_format(ab, "arch=%x syscall=%d",
+ context->arch, context->major);
+ if (context->personality != PER_LINUX)
+ audit_log_format(ab, " per=%lx", context->personality);
+ if (context->return_valid != AUDITSC_INVALID)
+ audit_log_format(ab, " success=%s exit=%ld",
+ (context->return_valid == AUDITSC_SUCCESS ?
+ "yes" : "no"),
+ context->return_code);
+ audit_log_format(ab,
+ " a0=%lx a1=%lx a2=%lx a3=%lx items=%d",
+ context->argv[0],
+ context->argv[1],
+ context->argv[2],
+ context->argv[3],
+ context->name_count);
+ audit_log_task_info(ab);
+ audit_log_key(ab, context->filterkey);
+ audit_log_end(ab);
+ break;
+ case AUDIT_CTX_URING:
+ audit_log_uring(context);
+ break;
+ default:
+ BUG();
+ break;
+ }
for (aux = context->aux; aux; aux = aux->next) {
@@ -1602,21 +1803,22 @@ static void audit_log_exit(void)
audit_log_name(context, n, NULL, i++, &call_panic);
}
- audit_log_proctitle();
+ if (context->context == AUDIT_CTX_SYSCALL)
+ audit_log_proctitle();
/* Send end of event record to help user space know we are finished */
ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE);
if (ab)
audit_log_end(ab);
if (call_panic)
- audit_panic("error converting sid to string");
+ audit_panic("error in audit_log_exit()");
}
/**
* __audit_free - free a per-task audit context
* @tsk: task whose audit context block to free
*
- * Called from copy_process and do_exit
+ * Called from copy_process, do_exit, and the io_uring code
*/
void __audit_free(struct task_struct *tsk)
{
@@ -1625,22 +1827,30 @@ void __audit_free(struct task_struct *tsk)
if (!context)
return;
+ /* this may generate CONFIG_CHANGE records */
if (!list_empty(&context->killed_trees))
audit_kill_trees(context);
/* We are called either by do_exit() or the fork() error handling code;
* in the former case tsk == current and in the latter tsk is a
- * random task_struct that doesn't doesn't have any meaningful data we
+ * random task_struct that doesn't have any meaningful data we
* need to log via audit_log_exit().
*/
- if (tsk == current && !context->dummy && context->in_syscall) {
+ if (tsk == current && !context->dummy) {
context->return_valid = AUDITSC_INVALID;
context->return_code = 0;
-
- audit_filter_syscall(tsk, context);
- audit_filter_inodes(tsk, context);
- if (context->current_state == AUDIT_STATE_RECORD)
- audit_log_exit();
+ if (context->context == AUDIT_CTX_SYSCALL) {
+ audit_filter_syscall(tsk, context);
+ audit_filter_inodes(tsk, context);
+ if (context->current_state == AUDIT_STATE_RECORD)
+ audit_log_exit();
+ } else if (context->context == AUDIT_CTX_URING) {
+ /* TODO: verify this case is real and valid */
+ audit_filter_uring(tsk, context);
+ audit_filter_inodes(tsk, context);
+ if (context->current_state == AUDIT_STATE_RECORD)
+ audit_log_uring(context);
+ }
}
audit_set_context(tsk, NULL);
@@ -1648,6 +1858,137 @@ void __audit_free(struct task_struct *tsk)
}
/**
+ * audit_return_fixup - fixup the return codes in the audit_context
+ * @ctx: the audit_context
+ * @success: true/false value to indicate if the operation succeeded or not
+ * @code: operation return code
+ *
+ * We need to fixup the return code in the audit logs if the actual return
+ * codes are later going to be fixed by the arch specific signal handlers.
+ */
+static void audit_return_fixup(struct audit_context *ctx,
+ int success, long code)
+{
+ /*
+ * This is actually a test for:
+ * (rc == ERESTARTSYS ) || (rc == ERESTARTNOINTR) ||
+ * (rc == ERESTARTNOHAND) || (rc == ERESTART_RESTARTBLOCK)
+ *
+ * but is faster than a bunch of ||
+ */
+ if (unlikely(code <= -ERESTARTSYS) &&
+ (code >= -ERESTART_RESTARTBLOCK) &&
+ (code != -ENOIOCTLCMD))
+ ctx->return_code = -EINTR;
+ else
+ ctx->return_code = code;
+ ctx->return_valid = (success ? AUDITSC_SUCCESS : AUDITSC_FAILURE);
+}
+
+/**
+ * __audit_uring_entry - prepare the kernel task's audit context for io_uring
+ * @op: the io_uring opcode
+ *
+ * This is similar to audit_syscall_entry() but is intended for use by io_uring
+ * operations. This function should only ever be called from
+ * audit_uring_entry() as we rely on the audit context checking present in that
+ * function.
+ */
+void __audit_uring_entry(u8 op)
+{
+ struct audit_context *ctx = audit_context();
+
+ if (ctx->state == AUDIT_STATE_DISABLED)
+ return;
+
+ /*
+ * NOTE: It's possible that we can be called from the process' context
+ * before it returns to userspace, and before audit_syscall_exit()
+ * is called. In this case there is not much to do, just record
+ * the io_uring details and return.
+ */
+ ctx->uring_op = op;
+ if (ctx->context == AUDIT_CTX_SYSCALL)
+ return;
+
+ ctx->dummy = !audit_n_rules;
+ if (!ctx->dummy && ctx->state == AUDIT_STATE_BUILD)
+ ctx->prio = 0;
+
+ ctx->context = AUDIT_CTX_URING;
+ ctx->current_state = ctx->state;
+ ktime_get_coarse_real_ts64(&ctx->ctime);
+}
+
+/**
+ * __audit_uring_exit - wrap up the kernel task's audit context after io_uring
+ * @success: true/false value to indicate if the operation succeeded or not
+ * @code: operation return code
+ *
+ * This is similar to audit_syscall_exit() but is intended for use by io_uring
+ * operations. This function should only ever be called from
+ * audit_uring_exit() as we rely on the audit context checking present in that
+ * function.
+ */
+void __audit_uring_exit(int success, long code)
+{
+ struct audit_context *ctx = audit_context();
+
+ if (ctx->dummy) {
+ if (ctx->context != AUDIT_CTX_URING)
+ return;
+ goto out;
+ }
+
+ audit_return_fixup(ctx, success, code);
+ if (ctx->context == AUDIT_CTX_SYSCALL) {
+ /*
+ * NOTE: See the note in __audit_uring_entry() about the case
+ * where we may be called from process context before we
+ * return to userspace via audit_syscall_exit(). In this
+ * case we simply emit a URINGOP record and bail, the
+ * normal syscall exit handling will take care of
+ * everything else.
+ * It is also worth mentioning that when we are called,
+ * the current process creds may differ from the creds
+ * used during the normal syscall processing; keep that
+ * in mind if/when we move the record generation code.
+ */
+
+ /*
+ * We need to filter on the syscall info here to decide if we
+ * should emit a URINGOP record. I know it seems odd but this
+ * solves the problem where users have a filter to block *all*
+ * syscall records in the "exit" filter; we want to preserve
+ * the behavior here.
+ */
+ audit_filter_syscall(current, ctx);
+ if (ctx->current_state != AUDIT_STATE_RECORD)
+ audit_filter_uring(current, ctx);
+ audit_filter_inodes(current, ctx);
+ if (ctx->current_state != AUDIT_STATE_RECORD)
+ return;
+
+ audit_log_uring(ctx);
+ return;
+ }
+
+ /* this may generate CONFIG_CHANGE records */
+ if (!list_empty(&ctx->killed_trees))
+ audit_kill_trees(ctx);
+
+ /* run through both filters to ensure we set the filterkey properly */
+ audit_filter_uring(current, ctx);
+ audit_filter_inodes(current, ctx);
+ if (ctx->current_state != AUDIT_STATE_RECORD)
+ goto out;
+ audit_log_exit();
+
+out:
+ audit_reset_context(ctx);
+}
+
+/**
* __audit_syscall_entry - fill in an audit record at syscall entry
* @major: major syscall type (function)
* @a1: additional syscall register 1
@@ -1672,7 +2013,12 @@ void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2,
if (!audit_enabled || !context)
return;
- BUG_ON(context->in_syscall || context->name_count);
+ WARN_ON(context->context != AUDIT_CTX_UNUSED);
+ WARN_ON(context->name_count);
+ if (context->context != AUDIT_CTX_UNUSED || context->name_count) {
+ audit_panic("unrecoverable error in audit_syscall_entry()");
+ return;
+ }
state = context->state;
if (state == AUDIT_STATE_DISABLED)
@@ -1691,10 +2037,8 @@ void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2,
context->argv[1] = a2;
context->argv[2] = a3;
context->argv[3] = a4;
- context->serial = 0;
- context->in_syscall = 1;
+ context->context = AUDIT_CTX_SYSCALL;
context->current_state = state;
- context->ppid = 0;
ktime_get_coarse_real_ts64(&context->ctime);
}
@@ -1711,63 +2055,27 @@ void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2,
*/
void __audit_syscall_exit(int success, long return_code)
{
- struct audit_context *context;
+ struct audit_context *context = audit_context();
- context = audit_context();
- if (!context)
- return;
+ if (!context || context->dummy ||
+ context->context != AUDIT_CTX_SYSCALL)
+ goto out;
+ /* this may generate CONFIG_CHANGE records */
if (!list_empty(&context->killed_trees))
audit_kill_trees(context);
- if (!context->dummy && context->in_syscall) {
- if (success)
- context->return_valid = AUDITSC_SUCCESS;
- else
- context->return_valid = AUDITSC_FAILURE;
+ audit_return_fixup(context, success, return_code);
+ /* run through both filters to ensure we set the filterkey properly */
+ audit_filter_syscall(current, context);
+ audit_filter_inodes(current, context);
+ if (context->current_state != AUDIT_STATE_RECORD)
+ goto out;
- /*
- * we need to fix up the return code in the audit logs if the
- * actual return codes are later going to be fixed up by the
- * arch specific signal handlers
- *
- * This is actually a test for:
- * (rc == ERESTARTSYS ) || (rc == ERESTARTNOINTR) ||
- * (rc == ERESTARTNOHAND) || (rc == ERESTART_RESTARTBLOCK)
- *
- * but is faster than a bunch of ||
- */
- if (unlikely(return_code <= -ERESTARTSYS) &&
- (return_code >= -ERESTART_RESTARTBLOCK) &&
- (return_code != -ENOIOCTLCMD))
- context->return_code = -EINTR;
- else
- context->return_code = return_code;
-
- audit_filter_syscall(current, context);
- audit_filter_inodes(current, context);
- if (context->current_state == AUDIT_STATE_RECORD)
- audit_log_exit();
- }
-
- context->in_syscall = 0;
- context->prio = context->state == AUDIT_STATE_RECORD ? ~0ULL : 0;
-
- audit_free_module(context);
- audit_free_names(context);
- unroll_tree_refs(context, NULL, 0);
- audit_free_aux(context);
- context->aux = NULL;
- context->aux_pids = NULL;
- context->target_pid = 0;
- context->target_sid = 0;
- context->sockaddr_len = 0;
- context->type = 0;
- context->fds[0] = -1;
- if (context->state != AUDIT_STATE_RECORD) {
- kfree(context->filterkey);
- context->filterkey = NULL;
- }
+ audit_log_exit();
+
+out:
+ audit_reset_context(context);
}
static inline void handle_one(const struct inode *inode)
@@ -1919,7 +2227,7 @@ void __audit_getname(struct filename *name)
struct audit_context *context = audit_context();
struct audit_names *n;
- if (!context->in_syscall)
+ if (context->context == AUDIT_CTX_UNUSED)
return;
n = audit_alloc_name(context, AUDIT_TYPE_UNKNOWN);
@@ -1941,7 +2249,7 @@ static inline int audit_copy_fcaps(struct audit_names *name,
if (!dentry)
return 0;
- rc = get_vfs_caps_from_disk(&init_user_ns, dentry, &caps);
+ rc = get_vfs_caps_from_disk(&nop_mnt_idmap, dentry, &caps);
if (rc)
return rc;
@@ -1991,7 +2299,7 @@ void __audit_inode(struct filename *name, const struct dentry *dentry,
struct list_head *list = &audit_filter_list[AUDIT_FILTER_FS];
int i;
- if (!context->in_syscall)
+ if (context->context == AUDIT_CTX_UNUSED)
return;
rcu_read_lock();
@@ -2109,7 +2417,7 @@ void __audit_inode_child(struct inode *parent,
struct list_head *list = &audit_filter_list[AUDIT_FILTER_FS];
int i;
- if (!context->in_syscall)
+ if (context->context == AUDIT_CTX_UNUSED)
return;
rcu_read_lock();
@@ -2208,7 +2516,7 @@ EXPORT_SYMBOL_GPL(__audit_inode_child);
int auditsc_get_stamp(struct audit_context *ctx,
struct timespec64 *t, unsigned int *serial)
{
- if (!ctx->in_syscall)
+ if (ctx->context == AUDIT_CTX_UNUSED)
return 0;
if (!ctx->serial)
ctx->serial = audit_serial();
@@ -2496,7 +2804,7 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
ax->d.next = context->aux;
context->aux = (void *)ax;
- get_vfs_caps_from_disk(&init_user_ns,
+ get_vfs_caps_from_disk(&nop_mnt_idmap,
bprm->file->f_path.dentry, &vcaps);
ax->fcap.permitted = vcaps.permitted;
@@ -2546,6 +2854,16 @@ void __audit_mmap_fd(int fd, int flags)
context->type = AUDIT_MMAP;
}
+void __audit_openat2_how(struct open_how *how)
+{
+ struct audit_context *context = audit_context();
+
+ context->openat2.flags = how->flags;
+ context->openat2.mode = how->mode;
+ context->openat2.resolve = how->resolve;
+ context->type = AUDIT_OPENAT2;
+}
+
void __audit_log_kern_module(char *name)
{
struct audit_context *context = audit_context();
@@ -2556,39 +2874,45 @@ void __audit_log_kern_module(char *name)
context->type = AUDIT_KERN_MODULE;
}
-void __audit_fanotify(unsigned int response)
+void __audit_fanotify(u32 response, struct fanotify_response_info_audit_rule *friar)
{
- audit_log(audit_context(), GFP_KERNEL,
- AUDIT_FANOTIFY, "resp=%u", response);
+ /* {subj,obj}_trust values are {0,1,2}: no,yes,unknown */
+ switch (friar->hdr.type) {
+ case FAN_RESPONSE_INFO_NONE:
+ audit_log(audit_context(), GFP_KERNEL, AUDIT_FANOTIFY,
+ "resp=%u fan_type=%u fan_info=0 subj_trust=2 obj_trust=2",
+ response, FAN_RESPONSE_INFO_NONE);
+ break;
+ case FAN_RESPONSE_INFO_AUDIT_RULE:
+ audit_log(audit_context(), GFP_KERNEL, AUDIT_FANOTIFY,
+ "resp=%u fan_type=%u fan_info=%X subj_trust=%u obj_trust=%u",
+ response, friar->hdr.type, friar->rule_number,
+ friar->subj_trust, friar->obj_trust);
+ }
}
void __audit_tk_injoffset(struct timespec64 offset)
{
- audit_log(audit_context(), GFP_KERNEL, AUDIT_TIME_INJOFFSET,
- "sec=%lli nsec=%li",
- (long long)offset.tv_sec, offset.tv_nsec);
-}
-
-static void audit_log_ntp_val(const struct audit_ntp_data *ad,
- const char *op, enum audit_ntp_type type)
-{
- const struct audit_ntp_val *val = &ad->vals[type];
-
- if (val->newval == val->oldval)
- return;
+ struct audit_context *context = audit_context();
- audit_log(audit_context(), GFP_KERNEL, AUDIT_TIME_ADJNTPVAL,
- "op=%s old=%lli new=%lli", op, val->oldval, val->newval);
+ /* only set type if not already set by NTP */
+ if (!context->type)
+ context->type = AUDIT_TIME_INJOFFSET;
+ memcpy(&context->time.tk_injoffset, &offset, sizeof(offset));
}
void __audit_ntp_log(const struct audit_ntp_data *ad)
{
- audit_log_ntp_val(ad, "offset", AUDIT_NTP_OFFSET);
- audit_log_ntp_val(ad, "freq", AUDIT_NTP_FREQ);
- audit_log_ntp_val(ad, "status", AUDIT_NTP_STATUS);
- audit_log_ntp_val(ad, "tai", AUDIT_NTP_TAI);
- audit_log_ntp_val(ad, "tick", AUDIT_NTP_TICK);
- audit_log_ntp_val(ad, "adjust", AUDIT_NTP_ADJUST);
+ struct audit_context *context = audit_context();
+ int type;
+
+ for (type = 0; type < AUDIT_NTP_NVALS; type++)
+ if (ad->vals[type].newval != ad->vals[type].oldval) {
+ /* unconditionally set type, overwriting TK */
+ context->type = AUDIT_TIME_ADJNTPVAL;
+ memcpy(&context->time.ntp_data, ad, sizeof(*ad));
+ break;
+ }
}
void __audit_log_nfcfg(const char *name, u8 af, unsigned int nentries,
@@ -2706,8 +3030,7 @@ void audit_seccomp_actions_logged(const char *names, const char *old_names,
struct list_head *audit_killed_trees(void)
{
struct audit_context *ctx = audit_context();
-
- if (likely(!ctx || !ctx->in_syscall))
+ if (likely(!ctx || ctx->context == AUDIT_CTX_UNUSED))
return NULL;
return &ctx->killed_trees;
}
diff --git a/kernel/bounds.c b/kernel/bounds.c
index 9795d75b09b2..b529182e8b04 100644
--- a/kernel/bounds.c
+++ b/kernel/bounds.c
@@ -22,6 +22,13 @@ int main(void)
DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS));
#endif
DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t));
+#ifdef CONFIG_LRU_GEN
+ DEFINE(LRU_GEN_WIDTH, order_base_2(MAX_NR_GENS + 1));
+ DEFINE(__LRU_REFS_WIDTH, MAX_NR_TIERS - 2);
+#else
+ DEFINE(LRU_GEN_WIDTH, 0);
+ DEFINE(__LRU_REFS_WIDTH, 0);
+#endif
/* End of constants */
return 0;
diff --git a/kernel/bpf/Kconfig b/kernel/bpf/Kconfig
index bd04f4a44c01..2dfe1079f772 100644
--- a/kernel/bpf/Kconfig
+++ b/kernel/bpf/Kconfig
@@ -27,9 +27,11 @@ config BPF_SYSCALL
bool "Enable bpf() system call"
select BPF
select IRQ_WORK
+ select TASKS_RCU if PREEMPTION
select TASKS_TRACE_RCU
select BINARY_PRINTF
- select NET_SOCK_MSG if INET
+ select NET_SOCK_MSG if NET
+ select PAGE_POOL if NET
default n
help
Enable the bpf() system call that allows to manipulate BPF programs
@@ -58,12 +60,17 @@ config BPF_JIT_ALWAYS_ON
Enables BPF JIT and removes BPF interpreter to avoid speculative
execution of BPF instructions by the interpreter.
+ When CONFIG_BPF_JIT_ALWAYS_ON is enabled, /proc/sys/net/core/bpf_jit_enable
+ is permanently set to 1 and setting any other value than that will
+ return failure.
+
config BPF_JIT_DEFAULT_ON
def_bool ARCH_WANT_DEFAULT_BPF_JIT || BPF_JIT_ALWAYS_ON
depends on HAVE_EBPF_JIT && BPF_JIT
config BPF_UNPRIV_DEFAULT_OFF
bool "Disable unprivileged BPF by default"
+ default y
depends on BPF_SYSCALL
help
Disables unprivileged BPF by default by setting the corresponding
@@ -72,6 +79,12 @@ config BPF_UNPRIV_DEFAULT_OFF
disable it by setting it to 1 (from which no other transition to
0 is possible anymore).
+ Unprivileged BPF could be used to exploit certain potential
+ speculative execution side-channel vulnerabilities on unmitigated
+ affected hardware.
+
+ If you are unsure how to answer this question, answer Y.
+
source "kernel/bpf/preload/Kconfig"
config BPF_LSM
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 7f33098ca63f..1d3892168d32 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -6,14 +6,15 @@ cflags-nogcse-$(CONFIG_X86)$(CONFIG_CC_IS_GCC) := -fno-gcse
endif
CFLAGS_core.o += $(call cc-disable-warning, override-init) $(cflags-nogcse-yy)
-obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o prog_iter.o
-obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
+obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o
+obj-$(CONFIG_BPF_SYSCALL) += bpf_iter.o map_iter.o task_iter.o prog_iter.o link_iter.o
+obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o bloom_filter.o
obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o
obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o
obj-${CONFIG_BPF_LSM} += bpf_inode_storage.o
obj-$(CONFIG_BPF_SYSCALL) += disasm.o
obj-$(CONFIG_BPF_JIT) += trampoline.o
-obj-$(CONFIG_BPF_SYSCALL) += btf.o
+obj-$(CONFIG_BPF_SYSCALL) += btf.o memalloc.o
obj-$(CONFIG_BPF_JIT) += dispatcher.o
ifeq ($(CONFIG_NET),y)
obj-$(CONFIG_BPF_SYSCALL) += devmap.o
@@ -24,6 +25,9 @@ endif
ifeq ($(CONFIG_PERF_EVENTS),y)
obj-$(CONFIG_BPF_SYSCALL) += stackmap.o
endif
+ifeq ($(CONFIG_CGROUPS),y)
+obj-$(CONFIG_BPF_SYSCALL) += cgroup_iter.o bpf_cgrp_storage.o
+endif
obj-$(CONFIG_CGROUP_BPF) += cgroup.o
ifeq ($(CONFIG_INET),y)
obj-$(CONFIG_BPF_SYSCALL) += reuseport_array.o
@@ -33,6 +37,11 @@ obj-$(CONFIG_DEBUG_INFO_BTF) += sysfs_btf.o
endif
ifeq ($(CONFIG_BPF_JIT),y)
obj-$(CONFIG_BPF_SYSCALL) += bpf_struct_ops.o
+obj-$(CONFIG_BPF_SYSCALL) += cpumask.o
obj-${CONFIG_BPF_LSM} += bpf_lsm.o
endif
obj-$(CONFIG_BPF_PRELOAD) += preload/
+
+obj-$(CONFIG_BPF_SYSCALL) += relo_core.o
+$(obj)/relo_core.o: $(srctree)/tools/lib/bpf/relo_core.c FORCE
+ $(call if_changed_rule,cc_o_c)
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 3c4105603f9d..2058e89b5ddd 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -11,6 +11,7 @@
#include <linux/perf_event.h>
#include <uapi/linux/btf.h>
#include <linux/rcupdate_trace.h>
+#include <linux/btf_ids.h>
#include "map_in_map.h"
@@ -69,10 +70,8 @@ int array_map_alloc_check(union bpf_attr *attr)
attr->map_flags & BPF_F_PRESERVE_ELEMS)
return -EINVAL;
- if (attr->value_size > KMALLOC_MAX_SIZE)
- /* if value_size is bigger, the user space won't be able to
- * access the elements.
- */
+ /* avoid overflow on round_up(map->value_size) */
+ if (attr->value_size > INT_MAX)
return -E2BIG;
return 0;
@@ -155,6 +154,11 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
return &array->map;
}
+static void *array_map_elem_ptr(struct bpf_array* array, u32 index)
+{
+ return array->value + (u64)array->elem_size * index;
+}
+
/* Called from syscall or from eBPF program */
static void *array_map_lookup_elem(struct bpf_map *map, void *key)
{
@@ -164,7 +168,7 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key)
if (unlikely(index >= array->map.max_entries))
return NULL;
- return array->value + array->elem_size * (index & array->index_mask);
+ return array->value + (u64)array->elem_size * (index & array->index_mask);
}
static int array_map_direct_value_addr(const struct bpf_map *map, u64 *imm,
@@ -202,7 +206,7 @@ static int array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
struct bpf_insn *insn = insn_buf;
- u32 elem_size = round_up(map->value_size, 8);
+ u32 elem_size = array->elem_size;
const int ret = BPF_REG_0;
const int map_ptr = BPF_REG_1;
const int index = BPF_REG_2;
@@ -242,6 +246,20 @@ static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key)
return this_cpu_ptr(array->pptrs[index & array->index_mask]);
}
+static void *percpu_array_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ u32 index = *(u32 *)key;
+
+ if (cpu >= nr_cpu_ids)
+ return NULL;
+
+ if (unlikely(index >= array->map.max_entries))
+ return NULL;
+
+ return per_cpu_ptr(array->pptrs[index & array->index_mask], cpu);
+}
+
int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
@@ -257,11 +275,12 @@ int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value)
* access 'value_size' of them, so copying rounded areas
* will not leak any kernel data
*/
- size = round_up(map->value_size, 8);
+ size = array->elem_size;
rcu_read_lock();
pptr = array->pptrs[index & array->index_mask];
for_each_possible_cpu(cpu) {
- bpf_long_memcpy(value + off, per_cpu_ptr(pptr, cpu), size);
+ copy_map_value_long(map, value + off, per_cpu_ptr(pptr, cpu));
+ check_and_init_map_value(map, value + off);
off += size;
}
rcu_read_unlock();
@@ -288,8 +307,8 @@ static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key
}
/* Called from syscall or from eBPF program */
-static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
- u64 map_flags)
+static long array_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
u32 index = *(u32 *)key;
@@ -308,19 +327,21 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
return -EEXIST;
if (unlikely((map_flags & BPF_F_LOCK) &&
- !map_value_has_spin_lock(map)))
+ !btf_record_has_field(map->record, BPF_SPIN_LOCK)))
return -EINVAL;
if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
- memcpy(this_cpu_ptr(array->pptrs[index & array->index_mask]),
- value, map->value_size);
+ val = this_cpu_ptr(array->pptrs[index & array->index_mask]);
+ copy_map_value(map, val, value);
+ bpf_obj_free_fields(array->map.record, val);
} else {
val = array->value +
- array->elem_size * (index & array->index_mask);
+ (u64)array->elem_size * (index & array->index_mask);
if (map_flags & BPF_F_LOCK)
copy_map_value_locked(map, val, value, false);
else
copy_map_value(map, val, value);
+ bpf_obj_free_fields(array->map.record, val);
}
return 0;
}
@@ -352,11 +373,12 @@ int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value,
* returned or zeros which were zero-filled by percpu_alloc,
* so no kernel data leaks possible
*/
- size = round_up(map->value_size, 8);
+ size = array->elem_size;
rcu_read_lock();
pptr = array->pptrs[index & array->index_mask];
for_each_possible_cpu(cpu) {
- bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value + off, size);
+ copy_map_value_long(map, per_cpu_ptr(pptr, cpu), value + off);
+ bpf_obj_free_fields(array->map.record, per_cpu_ptr(pptr, cpu));
off += size;
}
rcu_read_unlock();
@@ -364,7 +386,7 @@ int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value,
}
/* Called from syscall or from eBPF program */
-static int array_map_delete_elem(struct bpf_map *map, void *key)
+static long array_map_delete_elem(struct bpf_map *map, void *key)
{
return -EINVAL;
}
@@ -374,10 +396,41 @@ static void *array_map_vmalloc_addr(struct bpf_array *array)
return (void *)round_down((unsigned long)array, PAGE_SIZE);
}
+static void array_map_free_timers(struct bpf_map *map)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ int i;
+
+ /* We don't reset or free fields other than timer on uref dropping to zero. */
+ if (!btf_record_has_field(map->record, BPF_TIMER))
+ return;
+
+ for (i = 0; i < array->map.max_entries; i++)
+ bpf_obj_free_timer(map->record, array_map_elem_ptr(array, i));
+}
+
/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
static void array_map_free(struct bpf_map *map)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
+ int i;
+
+ if (!IS_ERR_OR_NULL(map->record)) {
+ if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
+ for (i = 0; i < array->map.max_entries; i++) {
+ void __percpu *pptr = array->pptrs[i & array->index_mask];
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ bpf_obj_free_fields(map->record, per_cpu_ptr(pptr, cpu));
+ cond_resched();
+ }
+ }
+ } else {
+ for (i = 0; i < array->map.max_entries; i++)
+ bpf_obj_free_fields(map->record, array_map_elem_ptr(array, i));
+ }
+ }
if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
bpf_array_free_percpu(array);
@@ -511,7 +564,7 @@ static void *bpf_array_map_seq_start(struct seq_file *seq, loff_t *pos)
index = info->index & array->index_mask;
if (info->percpu_value_buf)
return array->pptrs[index];
- return array->value + array->elem_size * index;
+ return array_map_elem_ptr(array, index);
}
static void *bpf_array_map_seq_next(struct seq_file *seq, void *v, loff_t *pos)
@@ -530,7 +583,7 @@ static void *bpf_array_map_seq_next(struct seq_file *seq, void *v, loff_t *pos)
index = info->index & array->index_mask;
if (info->percpu_value_buf)
return array->pptrs[index];
- return array->value + array->elem_size * index;
+ return array_map_elem_ptr(array, index);
}
static int __bpf_array_map_seq_show(struct seq_file *seq, void *v)
@@ -538,6 +591,7 @@ static int __bpf_array_map_seq_show(struct seq_file *seq, void *v)
struct bpf_iter_seq_array_map_info *info = seq->private;
struct bpf_iter__bpf_map_elem ctx = {};
struct bpf_map *map = info->map;
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
struct bpf_iter_meta meta;
struct bpf_prog *prog;
int off = 0, cpu = 0;
@@ -558,11 +612,11 @@ static int __bpf_array_map_seq_show(struct seq_file *seq, void *v)
ctx.value = v;
} else {
pptr = v;
- size = round_up(map->value_size, 8);
+ size = array->elem_size;
for_each_possible_cpu(cpu) {
- bpf_long_memcpy(info->percpu_value_buf + off,
- per_cpu_ptr(pptr, cpu),
- size);
+ copy_map_value_long(map, info->percpu_value_buf + off,
+ per_cpu_ptr(pptr, cpu));
+ check_and_init_map_value(map, info->percpu_value_buf + off);
off += size;
}
ctx.value = info->percpu_value_buf;
@@ -588,11 +642,12 @@ static int bpf_iter_init_array_map(void *priv_data,
{
struct bpf_iter_seq_array_map_info *seq_info = priv_data;
struct bpf_map *map = aux->map;
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
void *value_buf;
u32 buf_size;
if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
- buf_size = round_up(map->value_size, 8) * num_possible_cpus();
+ buf_size = array->elem_size * num_possible_cpus();
value_buf = kmalloc(buf_size, GFP_USER | __GFP_NOWARN);
if (!value_buf)
return -ENOMEM;
@@ -600,6 +655,11 @@ static int bpf_iter_init_array_map(void *priv_data,
seq_info->percpu_value_buf = value_buf;
}
+ /* bpf_iter_attach_map() acquires a map uref, and the uref may be
+ * released before or in the middle of iterating map elements, so
+ * acquire an extra map uref for iterator.
+ */
+ bpf_map_inc_with_uref(map);
seq_info->map = map;
return 0;
}
@@ -608,6 +668,7 @@ static void bpf_iter_fini_array_map(void *priv_data)
{
struct bpf_iter_seq_array_map_info *seq_info = priv_data;
+ bpf_map_put_with_uref(seq_info->map);
kfree(seq_info->percpu_value_buf);
}
@@ -625,8 +686,8 @@ static const struct bpf_iter_seq_info iter_seq_info = {
.seq_priv_size = sizeof(struct bpf_iter_seq_array_map_info),
};
-static int bpf_for_each_array_elem(struct bpf_map *map, void *callback_fn,
- void *callback_ctx, u64 flags)
+static long bpf_for_each_array_elem(struct bpf_map *map, bpf_callback_t callback_fn,
+ void *callback_ctx, u64 flags)
{
u32 i, key, num_elems = 0;
struct bpf_array *array;
@@ -645,12 +706,11 @@ static int bpf_for_each_array_elem(struct bpf_map *map, void *callback_fn,
if (is_percpu)
val = this_cpu_ptr(array->pptrs[i]);
else
- val = array->value + array->elem_size * i;
+ val = array_map_elem_ptr(array, i);
num_elems++;
key = i;
- ret = BPF_CAST_CALL(callback_fn)((u64)(long)map,
- (u64)(long)&key, (u64)(long)val,
- (u64)(long)callback_ctx, 0);
+ ret = callback_fn((u64)(long)map, (u64)(long)&key,
+ (u64)(long)val, (u64)(long)callback_ctx, 0);
/* return value: 0 - continue, 1 - stop and return */
if (ret)
break;
@@ -661,13 +721,36 @@ static int bpf_for_each_array_elem(struct bpf_map *map, void *callback_fn,
return num_elems;
}
-static int array_map_btf_id;
+static u64 array_map_mem_usage(const struct bpf_map *map)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ bool percpu = map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
+ u32 elem_size = array->elem_size;
+ u64 entries = map->max_entries;
+ u64 usage = sizeof(*array);
+
+ if (percpu) {
+ usage += entries * sizeof(void *);
+ usage += entries * elem_size * num_possible_cpus();
+ } else {
+ if (map->map_flags & BPF_F_MMAPABLE) {
+ usage = PAGE_ALIGN(usage);
+ usage += PAGE_ALIGN(entries * elem_size);
+ } else {
+ usage += entries * elem_size;
+ }
+ }
+ return usage;
+}
+
+BTF_ID_LIST_SINGLE(array_map_btf_ids, struct, bpf_array)
const struct bpf_map_ops array_map_ops = {
.map_meta_equal = array_map_meta_equal,
.map_alloc_check = array_map_alloc_check,
.map_alloc = array_map_alloc,
.map_free = array_map_free,
.map_get_next_key = array_map_get_next_key,
+ .map_release_uref = array_map_free_timers,
.map_lookup_elem = array_map_lookup_elem,
.map_update_elem = array_map_update_elem,
.map_delete_elem = array_map_delete_elem,
@@ -681,12 +764,11 @@ const struct bpf_map_ops array_map_ops = {
.map_update_batch = generic_map_update_batch,
.map_set_for_each_callback_args = map_set_for_each_callback_args,
.map_for_each_callback = bpf_for_each_array_elem,
- .map_btf_name = "bpf_array",
- .map_btf_id = &array_map_btf_id,
+ .map_mem_usage = array_map_mem_usage,
+ .map_btf_id = &array_map_btf_ids[0],
.iter_seq_info = &iter_seq_info,
};
-static int percpu_array_map_btf_id;
const struct bpf_map_ops percpu_array_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
.map_alloc_check = array_map_alloc_check,
@@ -696,14 +778,15 @@ const struct bpf_map_ops percpu_array_map_ops = {
.map_lookup_elem = percpu_array_map_lookup_elem,
.map_update_elem = array_map_update_elem,
.map_delete_elem = array_map_delete_elem,
+ .map_lookup_percpu_elem = percpu_array_map_lookup_percpu_elem,
.map_seq_show_elem = percpu_array_map_seq_show_elem,
.map_check_btf = array_map_check_btf,
.map_lookup_batch = generic_map_lookup_batch,
.map_update_batch = generic_map_update_batch,
.map_set_for_each_callback_args = map_set_for_each_callback_args,
.map_for_each_callback = bpf_for_each_array_elem,
- .map_btf_name = "bpf_array",
- .map_btf_id = &percpu_array_map_btf_id,
+ .map_mem_usage = array_map_mem_usage,
+ .map_btf_id = &array_map_btf_ids[0],
.iter_seq_info = &iter_seq_info,
};
@@ -788,7 +871,7 @@ int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file,
return 0;
}
-static int fd_array_map_delete_elem(struct bpf_map *map, void *key)
+static long fd_array_map_delete_elem(struct bpf_map *map, void *key)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
void *old_ptr;
@@ -817,13 +900,12 @@ static int fd_array_map_delete_elem(struct bpf_map *map, void *key)
static void *prog_fd_array_get_ptr(struct bpf_map *map,
struct file *map_file, int fd)
{
- struct bpf_array *array = container_of(map, struct bpf_array, map);
struct bpf_prog *prog = bpf_prog_get(fd);
if (IS_ERR(prog))
return prog;
- if (!bpf_prog_array_compatible(array, prog)) {
+ if (!bpf_prog_map_compatible(map, prog)) {
bpf_prog_put(prog);
return ERR_PTR(-EINVAL);
}
@@ -1083,7 +1165,6 @@ static void prog_array_map_free(struct bpf_map *map)
* Thus, prog_array_map cannot be used as an inner_map
* and map_meta_equal is not implemented.
*/
-static int prog_array_map_btf_id;
const struct bpf_map_ops prog_array_map_ops = {
.map_alloc_check = fd_array_map_alloc_check,
.map_alloc = prog_array_map_alloc,
@@ -1099,8 +1180,8 @@ const struct bpf_map_ops prog_array_map_ops = {
.map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem,
.map_release_uref = prog_array_map_clear,
.map_seq_show_elem = prog_array_map_seq_show_elem,
- .map_btf_name = "bpf_array",
- .map_btf_id = &prog_array_map_btf_id,
+ .map_mem_usage = array_map_mem_usage,
+ .map_btf_id = &array_map_btf_ids[0],
};
static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file,
@@ -1189,7 +1270,6 @@ static void perf_event_fd_array_map_free(struct bpf_map *map)
fd_array_map_free(map);
}
-static int perf_event_array_map_btf_id;
const struct bpf_map_ops perf_event_array_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
.map_alloc_check = fd_array_map_alloc_check,
@@ -1202,8 +1282,8 @@ const struct bpf_map_ops perf_event_array_map_ops = {
.map_fd_put_ptr = perf_event_fd_array_put_ptr,
.map_release = perf_event_fd_array_release,
.map_check_btf = map_check_no_btf,
- .map_btf_name = "bpf_array",
- .map_btf_id = &perf_event_array_map_btf_id,
+ .map_mem_usage = array_map_mem_usage,
+ .map_btf_id = &array_map_btf_ids[0],
};
#ifdef CONFIG_CGROUPS
@@ -1226,7 +1306,6 @@ static void cgroup_fd_array_free(struct bpf_map *map)
fd_array_map_free(map);
}
-static int cgroup_array_map_btf_id;
const struct bpf_map_ops cgroup_array_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
.map_alloc_check = fd_array_map_alloc_check,
@@ -1238,8 +1317,8 @@ const struct bpf_map_ops cgroup_array_map_ops = {
.map_fd_get_ptr = cgroup_fd_array_get_ptr,
.map_fd_put_ptr = cgroup_fd_array_put_ptr,
.map_check_btf = map_check_no_btf,
- .map_btf_name = "bpf_array",
- .map_btf_id = &cgroup_array_map_btf_id,
+ .map_mem_usage = array_map_mem_usage,
+ .map_btf_id = &array_map_btf_ids[0],
};
#endif
@@ -1286,7 +1365,7 @@ static int array_of_map_gen_lookup(struct bpf_map *map,
struct bpf_insn *insn_buf)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
- u32 elem_size = round_up(map->value_size, 8);
+ u32 elem_size = array->elem_size;
struct bpf_insn *insn = insn_buf;
const int ret = BPF_REG_0;
const int map_ptr = BPF_REG_1;
@@ -1313,7 +1392,6 @@ static int array_of_map_gen_lookup(struct bpf_map *map,
return insn - insn_buf;
}
-static int array_of_maps_map_btf_id;
const struct bpf_map_ops array_of_maps_map_ops = {
.map_alloc_check = fd_array_map_alloc_check,
.map_alloc = array_of_map_alloc,
@@ -1325,7 +1403,9 @@ const struct bpf_map_ops array_of_maps_map_ops = {
.map_fd_put_ptr = bpf_map_fd_put_ptr,
.map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem,
.map_gen_lookup = array_of_map_gen_lookup,
+ .map_lookup_batch = generic_map_lookup_batch,
+ .map_update_batch = generic_map_update_batch,
.map_check_btf = map_check_no_btf,
- .map_btf_name = "bpf_array",
- .map_btf_id = &array_of_maps_map_btf_id,
+ .map_mem_usage = array_map_mem_usage,
+ .map_btf_id = &array_map_btf_ids[0],
};
diff --git a/kernel/bpf/bloom_filter.c b/kernel/bpf/bloom_filter.c
new file mode 100644
index 000000000000..540331b610a9
--- /dev/null
+++ b/kernel/bpf/bloom_filter.c
@@ -0,0 +1,209 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2021 Facebook */
+
+#include <linux/bitmap.h>
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/err.h>
+#include <linux/jhash.h>
+#include <linux/random.h>
+#include <linux/btf_ids.h>
+
+#define BLOOM_CREATE_FLAG_MASK \
+ (BPF_F_NUMA_NODE | BPF_F_ZERO_SEED | BPF_F_ACCESS_MASK)
+
+struct bpf_bloom_filter {
+ struct bpf_map map;
+ u32 bitset_mask;
+ u32 hash_seed;
+ u32 nr_hash_funcs;
+ unsigned long bitset[];
+};
+
+static u32 hash(struct bpf_bloom_filter *bloom, void *value,
+ u32 value_size, u32 index)
+{
+ u32 h;
+
+ if (likely(value_size % 4 == 0))
+ h = jhash2(value, value_size / 4, bloom->hash_seed + index);
+ else
+ h = jhash(value, value_size, bloom->hash_seed + index);
+
+ return h & bloom->bitset_mask;
+}
+
+static long bloom_map_peek_elem(struct bpf_map *map, void *value)
+{
+ struct bpf_bloom_filter *bloom =
+ container_of(map, struct bpf_bloom_filter, map);
+ u32 i, h;
+
+ for (i = 0; i < bloom->nr_hash_funcs; i++) {
+ h = hash(bloom, value, map->value_size, i);
+ if (!test_bit(h, bloom->bitset))
+ return -ENOENT;
+ }
+
+ return 0;
+}
+
+static long bloom_map_push_elem(struct bpf_map *map, void *value, u64 flags)
+{
+ struct bpf_bloom_filter *bloom =
+ container_of(map, struct bpf_bloom_filter, map);
+ u32 i, h;
+
+ if (flags != BPF_ANY)
+ return -EINVAL;
+
+ for (i = 0; i < bloom->nr_hash_funcs; i++) {
+ h = hash(bloom, value, map->value_size, i);
+ set_bit(h, bloom->bitset);
+ }
+
+ return 0;
+}
+
+static long bloom_map_pop_elem(struct bpf_map *map, void *value)
+{
+ return -EOPNOTSUPP;
+}
+
+static long bloom_map_delete_elem(struct bpf_map *map, void *value)
+{
+ return -EOPNOTSUPP;
+}
+
+static int bloom_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+ return -EOPNOTSUPP;
+}
+
+static struct bpf_map *bloom_map_alloc(union bpf_attr *attr)
+{
+ u32 bitset_bytes, bitset_mask, nr_hash_funcs, nr_bits;
+ int numa_node = bpf_map_attr_numa_node(attr);
+ struct bpf_bloom_filter *bloom;
+
+ if (!bpf_capable())
+ return ERR_PTR(-EPERM);
+
+ if (attr->key_size != 0 || attr->value_size == 0 ||
+ attr->max_entries == 0 ||
+ attr->map_flags & ~BLOOM_CREATE_FLAG_MASK ||
+ !bpf_map_flags_access_ok(attr->map_flags) ||
+ /* The lower 4 bits of map_extra (0xF) specify the number
+ * of hash functions
+ */
+ (attr->map_extra & ~0xF))
+ return ERR_PTR(-EINVAL);
+
+ nr_hash_funcs = attr->map_extra;
+ if (nr_hash_funcs == 0)
+ /* Default to using 5 hash functions if unspecified */
+ nr_hash_funcs = 5;
+
+ /* For the bloom filter, the optimal bit array size that minimizes the
+ * false positive probability is n * k / ln(2) where n is the number of
+ * expected entries in the bloom filter and k is the number of hash
+ * functions. We use 7 / 5 to approximate 1 / ln(2).
+ *
+ * We round this up to the nearest power of two to enable more efficient
+ * hashing using bitmasks. The bitmask will be the bit array size - 1.
+ *
+ * If this overflows a u32, the bit array size will have 2^32 (4
+ * GB) bits.
+ */
+ if (check_mul_overflow(attr->max_entries, nr_hash_funcs, &nr_bits) ||
+ check_mul_overflow(nr_bits / 5, (u32)7, &nr_bits) ||
+ nr_bits > (1UL << 31)) {
+ /* The bit array size is 2^32 bits but to avoid overflowing the
+ * u32, we use U32_MAX, which will round up to the equivalent
+ * number of bytes
+ */
+ bitset_bytes = BITS_TO_BYTES(U32_MAX);
+ bitset_mask = U32_MAX;
+ } else {
+ if (nr_bits <= BITS_PER_LONG)
+ nr_bits = BITS_PER_LONG;
+ else
+ nr_bits = roundup_pow_of_two(nr_bits);
+ bitset_bytes = BITS_TO_BYTES(nr_bits);
+ bitset_mask = nr_bits - 1;
+ }
+
+ bitset_bytes = roundup(bitset_bytes, sizeof(unsigned long));
+ bloom = bpf_map_area_alloc(sizeof(*bloom) + bitset_bytes, numa_node);
+
+ if (!bloom)
+ return ERR_PTR(-ENOMEM);
+
+ bpf_map_init_from_attr(&bloom->map, attr);
+
+ bloom->nr_hash_funcs = nr_hash_funcs;
+ bloom->bitset_mask = bitset_mask;
+
+ if (!(attr->map_flags & BPF_F_ZERO_SEED))
+ bloom->hash_seed = get_random_u32();
+
+ return &bloom->map;
+}
+
+static void bloom_map_free(struct bpf_map *map)
+{
+ struct bpf_bloom_filter *bloom =
+ container_of(map, struct bpf_bloom_filter, map);
+
+ bpf_map_area_free(bloom);
+}
+
+static void *bloom_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ /* The eBPF program should use map_peek_elem instead */
+ return ERR_PTR(-EINVAL);
+}
+
+static long bloom_map_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 flags)
+{
+ /* The eBPF program should use map_push_elem instead */
+ return -EINVAL;
+}
+
+static int bloom_map_check_btf(const struct bpf_map *map,
+ const struct btf *btf,
+ const struct btf_type *key_type,
+ const struct btf_type *value_type)
+{
+ /* Bloom filter maps are keyless */
+ return btf_type_is_void(key_type) ? 0 : -EINVAL;
+}
+
+static u64 bloom_map_mem_usage(const struct bpf_map *map)
+{
+ struct bpf_bloom_filter *bloom;
+ u64 bitset_bytes;
+
+ bloom = container_of(map, struct bpf_bloom_filter, map);
+ bitset_bytes = BITS_TO_BYTES((u64)bloom->bitset_mask + 1);
+ bitset_bytes = roundup(bitset_bytes, sizeof(unsigned long));
+ return sizeof(*bloom) + bitset_bytes;
+}
+
+BTF_ID_LIST_SINGLE(bpf_bloom_map_btf_ids, struct, bpf_bloom_filter)
+const struct bpf_map_ops bloom_filter_map_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
+ .map_alloc = bloom_map_alloc,
+ .map_free = bloom_map_free,
+ .map_get_next_key = bloom_map_get_next_key,
+ .map_push_elem = bloom_map_push_elem,
+ .map_peek_elem = bloom_map_peek_elem,
+ .map_pop_elem = bloom_map_pop_elem,
+ .map_lookup_elem = bloom_map_lookup_elem,
+ .map_update_elem = bloom_map_update_elem,
+ .map_delete_elem = bloom_map_delete_elem,
+ .map_check_btf = bloom_map_check_btf,
+ .map_mem_usage = bloom_map_mem_usage,
+ .map_btf_id = &bpf_bloom_map_btf_ids[0],
+};
diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c
new file mode 100644
index 000000000000..d44fe8dd9732
--- /dev/null
+++ b/kernel/bpf/bpf_cgrp_storage.c
@@ -0,0 +1,240 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
+ */
+
+#include <linux/types.h>
+#include <linux/bpf.h>
+#include <linux/bpf_local_storage.h>
+#include <uapi/linux/btf.h>
+#include <linux/btf_ids.h>
+
+DEFINE_BPF_STORAGE_CACHE(cgroup_cache);
+
+static DEFINE_PER_CPU(int, bpf_cgrp_storage_busy);
+
+static void bpf_cgrp_storage_lock(void)
+{
+ migrate_disable();
+ this_cpu_inc(bpf_cgrp_storage_busy);
+}
+
+static void bpf_cgrp_storage_unlock(void)
+{
+ this_cpu_dec(bpf_cgrp_storage_busy);
+ migrate_enable();
+}
+
+static bool bpf_cgrp_storage_trylock(void)
+{
+ migrate_disable();
+ if (unlikely(this_cpu_inc_return(bpf_cgrp_storage_busy) != 1)) {
+ this_cpu_dec(bpf_cgrp_storage_busy);
+ migrate_enable();
+ return false;
+ }
+ return true;
+}
+
+static struct bpf_local_storage __rcu **cgroup_storage_ptr(void *owner)
+{
+ struct cgroup *cg = owner;
+
+ return &cg->bpf_cgrp_storage;
+}
+
+void bpf_cgrp_storage_free(struct cgroup *cgroup)
+{
+ struct bpf_local_storage *local_storage;
+
+ rcu_read_lock();
+ local_storage = rcu_dereference(cgroup->bpf_cgrp_storage);
+ if (!local_storage) {
+ rcu_read_unlock();
+ return;
+ }
+
+ bpf_cgrp_storage_lock();
+ bpf_local_storage_destroy(local_storage);
+ bpf_cgrp_storage_unlock();
+ rcu_read_unlock();
+}
+
+static struct bpf_local_storage_data *
+cgroup_storage_lookup(struct cgroup *cgroup, struct bpf_map *map, bool cacheit_lockit)
+{
+ struct bpf_local_storage *cgroup_storage;
+ struct bpf_local_storage_map *smap;
+
+ cgroup_storage = rcu_dereference_check(cgroup->bpf_cgrp_storage,
+ bpf_rcu_lock_held());
+ if (!cgroup_storage)
+ return NULL;
+
+ smap = (struct bpf_local_storage_map *)map;
+ return bpf_local_storage_lookup(cgroup_storage, smap, cacheit_lockit);
+}
+
+static void *bpf_cgrp_storage_lookup_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_local_storage_data *sdata;
+ struct cgroup *cgroup;
+ int fd;
+
+ fd = *(int *)key;
+ cgroup = cgroup_get_from_fd(fd);
+ if (IS_ERR(cgroup))
+ return ERR_CAST(cgroup);
+
+ bpf_cgrp_storage_lock();
+ sdata = cgroup_storage_lookup(cgroup, map, true);
+ bpf_cgrp_storage_unlock();
+ cgroup_put(cgroup);
+ return sdata ? sdata->data : NULL;
+}
+
+static long bpf_cgrp_storage_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 map_flags)
+{
+ struct bpf_local_storage_data *sdata;
+ struct cgroup *cgroup;
+ int fd;
+
+ fd = *(int *)key;
+ cgroup = cgroup_get_from_fd(fd);
+ if (IS_ERR(cgroup))
+ return PTR_ERR(cgroup);
+
+ bpf_cgrp_storage_lock();
+ sdata = bpf_local_storage_update(cgroup, (struct bpf_local_storage_map *)map,
+ value, map_flags, GFP_ATOMIC);
+ bpf_cgrp_storage_unlock();
+ cgroup_put(cgroup);
+ return PTR_ERR_OR_ZERO(sdata);
+}
+
+static int cgroup_storage_delete(struct cgroup *cgroup, struct bpf_map *map)
+{
+ struct bpf_local_storage_data *sdata;
+
+ sdata = cgroup_storage_lookup(cgroup, map, false);
+ if (!sdata)
+ return -ENOENT;
+
+ bpf_selem_unlink(SELEM(sdata), false);
+ return 0;
+}
+
+static long bpf_cgrp_storage_delete_elem(struct bpf_map *map, void *key)
+{
+ struct cgroup *cgroup;
+ int err, fd;
+
+ fd = *(int *)key;
+ cgroup = cgroup_get_from_fd(fd);
+ if (IS_ERR(cgroup))
+ return PTR_ERR(cgroup);
+
+ bpf_cgrp_storage_lock();
+ err = cgroup_storage_delete(cgroup, map);
+ bpf_cgrp_storage_unlock();
+ cgroup_put(cgroup);
+ return err;
+}
+
+static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+ return -ENOTSUPP;
+}
+
+static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr)
+{
+ return bpf_local_storage_map_alloc(attr, &cgroup_cache, true);
+}
+
+static void cgroup_storage_map_free(struct bpf_map *map)
+{
+ bpf_local_storage_map_free(map, &cgroup_cache, NULL);
+}
+
+/* *gfp_flags* is a hidden argument provided by the verifier */
+BPF_CALL_5(bpf_cgrp_storage_get, struct bpf_map *, map, struct cgroup *, cgroup,
+ void *, value, u64, flags, gfp_t, gfp_flags)
+{
+ struct bpf_local_storage_data *sdata;
+
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
+ if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE))
+ return (unsigned long)NULL;
+
+ if (!cgroup)
+ return (unsigned long)NULL;
+
+ if (!bpf_cgrp_storage_trylock())
+ return (unsigned long)NULL;
+
+ sdata = cgroup_storage_lookup(cgroup, map, true);
+ if (sdata)
+ goto unlock;
+
+ /* only allocate new storage, when the cgroup is refcounted */
+ if (!percpu_ref_is_dying(&cgroup->self.refcnt) &&
+ (flags & BPF_LOCAL_STORAGE_GET_F_CREATE))
+ sdata = bpf_local_storage_update(cgroup, (struct bpf_local_storage_map *)map,
+ value, BPF_NOEXIST, gfp_flags);
+
+unlock:
+ bpf_cgrp_storage_unlock();
+ return IS_ERR_OR_NULL(sdata) ? (unsigned long)NULL : (unsigned long)sdata->data;
+}
+
+BPF_CALL_2(bpf_cgrp_storage_delete, struct bpf_map *, map, struct cgroup *, cgroup)
+{
+ int ret;
+
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
+ if (!cgroup)
+ return -EINVAL;
+
+ if (!bpf_cgrp_storage_trylock())
+ return -EBUSY;
+
+ ret = cgroup_storage_delete(cgroup, map);
+ bpf_cgrp_storage_unlock();
+ return ret;
+}
+
+const struct bpf_map_ops cgrp_storage_map_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
+ .map_alloc_check = bpf_local_storage_map_alloc_check,
+ .map_alloc = cgroup_storage_map_alloc,
+ .map_free = cgroup_storage_map_free,
+ .map_get_next_key = notsupp_get_next_key,
+ .map_lookup_elem = bpf_cgrp_storage_lookup_elem,
+ .map_update_elem = bpf_cgrp_storage_update_elem,
+ .map_delete_elem = bpf_cgrp_storage_delete_elem,
+ .map_check_btf = bpf_local_storage_map_check_btf,
+ .map_mem_usage = bpf_local_storage_map_mem_usage,
+ .map_btf_id = &bpf_local_storage_map_btf_id[0],
+ .map_owner_storage_ptr = cgroup_storage_ptr,
+};
+
+const struct bpf_func_proto bpf_cgrp_storage_get_proto = {
+ .func = bpf_cgrp_storage_get,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL,
+ .arg2_btf_id = &bpf_cgroup_btf_id[0],
+ .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL,
+ .arg4_type = ARG_ANYTHING,
+};
+
+const struct bpf_func_proto bpf_cgrp_storage_delete_proto = {
+ .func = bpf_cgrp_storage_delete,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL,
+ .arg2_btf_id = &bpf_cgroup_btf_id[0],
+};
diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c
index 96ceed0e0fb5..b0ef45db207c 100644
--- a/kernel/bpf/bpf_inode_storage.c
+++ b/kernel/bpf/bpf_inode_storage.c
@@ -17,6 +17,7 @@
#include <linux/bpf_lsm.h>
#include <linux/btf_ids.h>
#include <linux/fdtable.h>
+#include <linux/rcupdate_trace.h>
DEFINE_BPF_STORAGE_CACHE(inode_cache);
@@ -44,7 +45,8 @@ static struct bpf_local_storage_data *inode_storage_lookup(struct inode *inode,
if (!bsb)
return NULL;
- inode_storage = rcu_dereference(bsb->storage);
+ inode_storage =
+ rcu_dereference_check(bsb->storage, bpf_rcu_lock_held());
if (!inode_storage)
return NULL;
@@ -54,11 +56,8 @@ static struct bpf_local_storage_data *inode_storage_lookup(struct inode *inode,
void bpf_inode_storage_free(struct inode *inode)
{
- struct bpf_local_storage_elem *selem;
struct bpf_local_storage *local_storage;
- bool free_inode_storage = false;
struct bpf_storage_blob *bsb;
- struct hlist_node *n;
bsb = bpf_inode(inode);
if (!bsb)
@@ -72,70 +71,40 @@ void bpf_inode_storage_free(struct inode *inode)
return;
}
- /* Neither the bpf_prog nor the bpf-map's syscall
- * could be modifying the local_storage->list now.
- * Thus, no elem can be added-to or deleted-from the
- * local_storage->list by the bpf_prog or by the bpf-map's syscall.
- *
- * It is racing with bpf_local_storage_map_free() alone
- * when unlinking elem from the local_storage->list and
- * the map's bucket->list.
- */
- raw_spin_lock_bh(&local_storage->lock);
- hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) {
- /* Always unlink from map before unlinking from
- * local_storage.
- */
- bpf_selem_unlink_map(selem);
- free_inode_storage = bpf_selem_unlink_storage_nolock(
- local_storage, selem, false);
- }
- raw_spin_unlock_bh(&local_storage->lock);
+ bpf_local_storage_destroy(local_storage);
rcu_read_unlock();
-
- /* free_inoode_storage should always be true as long as
- * local_storage->list was non-empty.
- */
- if (free_inode_storage)
- kfree_rcu(local_storage, rcu);
}
static void *bpf_fd_inode_storage_lookup_elem(struct bpf_map *map, void *key)
{
struct bpf_local_storage_data *sdata;
- struct file *f;
- int fd;
+ struct fd f = fdget_raw(*(int *)key);
- fd = *(int *)key;
- f = fget_raw(fd);
- if (!f)
+ if (!f.file)
return ERR_PTR(-EBADF);
- sdata = inode_storage_lookup(f->f_inode, map, true);
- fput(f);
+ sdata = inode_storage_lookup(file_inode(f.file), map, true);
+ fdput(f);
return sdata ? sdata->data : NULL;
}
-static int bpf_fd_inode_storage_update_elem(struct bpf_map *map, void *key,
- void *value, u64 map_flags)
+static long bpf_fd_inode_storage_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 map_flags)
{
struct bpf_local_storage_data *sdata;
- struct file *f;
- int fd;
+ struct fd f = fdget_raw(*(int *)key);
- fd = *(int *)key;
- f = fget_raw(fd);
- if (!f)
+ if (!f.file)
return -EBADF;
- if (!inode_storage_ptr(f->f_inode)) {
- fput(f);
+ if (!inode_storage_ptr(file_inode(f.file))) {
+ fdput(f);
return -EBADF;
}
- sdata = bpf_local_storage_update(f->f_inode,
+ sdata = bpf_local_storage_update(file_inode(f.file),
(struct bpf_local_storage_map *)map,
- value, map_flags);
- fput(f);
+ value, map_flags, GFP_ATOMIC);
+ fdput(f);
return PTR_ERR_OR_ZERO(sdata);
}
@@ -147,31 +116,31 @@ static int inode_storage_delete(struct inode *inode, struct bpf_map *map)
if (!sdata)
return -ENOENT;
- bpf_selem_unlink(SELEM(sdata));
+ bpf_selem_unlink(SELEM(sdata), false);
return 0;
}
-static int bpf_fd_inode_storage_delete_elem(struct bpf_map *map, void *key)
+static long bpf_fd_inode_storage_delete_elem(struct bpf_map *map, void *key)
{
- struct file *f;
- int fd, err;
+ struct fd f = fdget_raw(*(int *)key);
+ int err;
- fd = *(int *)key;
- f = fget_raw(fd);
- if (!f)
+ if (!f.file)
return -EBADF;
- err = inode_storage_delete(f->f_inode, map);
- fput(f);
+ err = inode_storage_delete(file_inode(f.file), map);
+ fdput(f);
return err;
}
-BPF_CALL_4(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode,
- void *, value, u64, flags)
+/* *gfp_flags* is a hidden argument provided by the verifier */
+BPF_CALL_5(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode,
+ void *, value, u64, flags, gfp_t, gfp_flags)
{
struct bpf_local_storage_data *sdata;
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE))
return (unsigned long)NULL;
@@ -193,7 +162,7 @@ BPF_CALL_4(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode,
if (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) {
sdata = bpf_local_storage_update(
inode, (struct bpf_local_storage_map *)map, value,
- BPF_NOEXIST);
+ BPF_NOEXIST, gfp_flags);
return IS_ERR(sdata) ? (unsigned long)NULL :
(unsigned long)sdata->data;
}
@@ -204,6 +173,7 @@ BPF_CALL_4(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode,
BPF_CALL_2(bpf_inode_storage_delete,
struct bpf_map *, map, struct inode *, inode)
{
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
if (!inode)
return -EINVAL;
@@ -221,26 +191,14 @@ static int notsupp_get_next_key(struct bpf_map *map, void *key,
static struct bpf_map *inode_storage_map_alloc(union bpf_attr *attr)
{
- struct bpf_local_storage_map *smap;
-
- smap = bpf_local_storage_map_alloc(attr);
- if (IS_ERR(smap))
- return ERR_CAST(smap);
-
- smap->cache_idx = bpf_local_storage_cache_idx_get(&inode_cache);
- return &smap->map;
+ return bpf_local_storage_map_alloc(attr, &inode_cache, false);
}
static void inode_storage_map_free(struct bpf_map *map)
{
- struct bpf_local_storage_map *smap;
-
- smap = (struct bpf_local_storage_map *)map;
- bpf_local_storage_cache_idx_free(&inode_cache, smap->cache_idx);
- bpf_local_storage_map_free(smap, NULL);
+ bpf_local_storage_map_free(map, &inode_cache, NULL);
}
-static int inode_storage_map_btf_id;
const struct bpf_map_ops inode_storage_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
.map_alloc_check = bpf_local_storage_map_alloc_check,
@@ -251,8 +209,8 @@ const struct bpf_map_ops inode_storage_map_ops = {
.map_update_elem = bpf_fd_inode_storage_update_elem,
.map_delete_elem = bpf_fd_inode_storage_delete_elem,
.map_check_btf = bpf_local_storage_map_check_btf,
- .map_btf_name = "bpf_local_storage_map",
- .map_btf_id = &inode_storage_map_btf_id,
+ .map_mem_usage = bpf_local_storage_map_mem_usage,
+ .map_btf_id = &bpf_local_storage_map_btf_id[0],
.map_owner_storage_ptr = inode_storage_ptr,
};
@@ -263,7 +221,7 @@ const struct bpf_func_proto bpf_inode_storage_get_proto = {
.gpl_only = false,
.ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
.arg1_type = ARG_CONST_MAP_PTR,
- .arg2_type = ARG_PTR_TO_BTF_ID,
+ .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL,
.arg2_btf_id = &bpf_inode_storage_btf_ids[0],
.arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL,
.arg4_type = ARG_ANYTHING,
@@ -274,6 +232,6 @@ const struct bpf_func_proto bpf_inode_storage_delete_proto = {
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_CONST_MAP_PTR,
- .arg2_type = ARG_PTR_TO_BTF_ID,
+ .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL,
.arg2_btf_id = &bpf_inode_storage_btf_ids[0],
};
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index 2d4fbdbb194e..96856f130cbf 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -5,6 +5,7 @@
#include <linux/anon_inodes.h>
#include <linux/filter.h>
#include <linux/bpf.h>
+#include <linux/rcupdate_trace.h>
struct bpf_iter_target_info {
struct list_head list;
@@ -67,23 +68,27 @@ static void bpf_iter_done_stop(struct seq_file *seq)
iter_priv->done_stop = true;
}
+static inline bool bpf_iter_target_support_resched(const struct bpf_iter_target_info *tinfo)
+{
+ return tinfo->reg_info->feature & BPF_ITER_RESCHED;
+}
+
static bool bpf_iter_support_resched(struct seq_file *seq)
{
struct bpf_iter_priv_data *iter_priv;
iter_priv = container_of(seq->private, struct bpf_iter_priv_data,
target_private);
- return iter_priv->tinfo->reg_info->feature & BPF_ITER_RESCHED;
+ return bpf_iter_target_support_resched(iter_priv->tinfo);
}
/* maximum visited objects before bailing out */
#define MAX_ITER_OBJECTS 1000000
/* bpf_seq_read, a customized and simpler version for bpf iterator.
- * no_llseek is assumed for this file.
* The following are differences from seq_read():
* . fixed buffer size (PAGE_SIZE)
- * . assuming no_llseek
+ * . assuming NULL ->llseek()
* . stop() may call bpf program, handling potential overflow there
*/
static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size,
@@ -197,6 +202,11 @@ static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size,
}
stop:
offs = seq->count;
+ if (IS_ERR(p)) {
+ seq->op->stop(seq, NULL);
+ err = PTR_ERR(p);
+ goto done;
+ }
/* bpf program called if !p */
seq->op->stop(seq, p);
if (!p) {
@@ -329,35 +339,56 @@ static void cache_btf_id(struct bpf_iter_target_info *tinfo,
bool bpf_iter_prog_supported(struct bpf_prog *prog)
{
const char *attach_fname = prog->aux->attach_func_name;
+ struct bpf_iter_target_info *tinfo = NULL, *iter;
u32 prog_btf_id = prog->aux->attach_btf_id;
const char *prefix = BPF_ITER_FUNC_PREFIX;
- struct bpf_iter_target_info *tinfo;
int prefix_len = strlen(prefix);
- bool supported = false;
if (strncmp(attach_fname, prefix, prefix_len))
return false;
mutex_lock(&targets_mutex);
- list_for_each_entry(tinfo, &targets, list) {
- if (tinfo->btf_id && tinfo->btf_id == prog_btf_id) {
- supported = true;
+ list_for_each_entry(iter, &targets, list) {
+ if (iter->btf_id && iter->btf_id == prog_btf_id) {
+ tinfo = iter;
break;
}
- if (!strcmp(attach_fname + prefix_len, tinfo->reg_info->target)) {
- cache_btf_id(tinfo, prog);
- supported = true;
+ if (!strcmp(attach_fname + prefix_len, iter->reg_info->target)) {
+ cache_btf_id(iter, prog);
+ tinfo = iter;
break;
}
}
mutex_unlock(&targets_mutex);
- if (supported) {
+ if (tinfo) {
prog->aux->ctx_arg_info_size = tinfo->reg_info->ctx_arg_info_size;
prog->aux->ctx_arg_info = tinfo->reg_info->ctx_arg_info;
}
- return supported;
+ return tinfo != NULL;
+}
+
+const struct bpf_func_proto *
+bpf_iter_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ const struct bpf_iter_target_info *tinfo;
+ const struct bpf_func_proto *fn = NULL;
+
+ mutex_lock(&targets_mutex);
+ list_for_each_entry(tinfo, &targets, list) {
+ if (tinfo->btf_id == prog->aux->attach_btf_id) {
+ const struct bpf_iter_reg *reg_info;
+
+ reg_info = tinfo->reg_info;
+ if (reg_info->get_func_proto)
+ fn = reg_info->get_func_proto(func_id, prog);
+ break;
+ }
+ }
+ mutex_unlock(&targets_mutex);
+
+ return fn;
}
static void bpf_iter_link_release(struct bpf_link *link)
@@ -476,12 +507,11 @@ bool bpf_link_is_iter(struct bpf_link *link)
int bpf_iter_link_attach(const union bpf_attr *attr, bpfptr_t uattr,
struct bpf_prog *prog)
{
+ struct bpf_iter_target_info *tinfo = NULL, *iter;
struct bpf_link_primer link_primer;
- struct bpf_iter_target_info *tinfo;
union bpf_iter_link_info linfo;
struct bpf_iter_link *link;
u32 prog_btf_id, linfo_len;
- bool existed = false;
bpfptr_t ulinfo;
int err;
@@ -507,16 +537,20 @@ int bpf_iter_link_attach(const union bpf_attr *attr, bpfptr_t uattr,
prog_btf_id = prog->aux->attach_btf_id;
mutex_lock(&targets_mutex);
- list_for_each_entry(tinfo, &targets, list) {
- if (tinfo->btf_id == prog_btf_id) {
- existed = true;
+ list_for_each_entry(iter, &targets, list) {
+ if (iter->btf_id == prog_btf_id) {
+ tinfo = iter;
break;
}
}
mutex_unlock(&targets_mutex);
- if (!existed)
+ if (!tinfo)
return -ENOENT;
+ /* Only allow sleepable program for resched-able iterator */
+ if (prog->aux->sleepable && !bpf_iter_target_support_resched(tinfo))
+ return -EINVAL;
+
link = kzalloc(sizeof(*link), GFP_USER | __GFP_NOWARN);
if (!link)
return -ENOMEM;
@@ -524,7 +558,7 @@ int bpf_iter_link_attach(const union bpf_attr *attr, bpfptr_t uattr,
bpf_link_init(&link->link, BPF_LINK_TYPE_ITER, &bpf_iter_link_lops, prog);
link->tinfo = tinfo;
- err = bpf_link_prime(&link->link, &link_primer);
+ err = bpf_link_prime(&link->link, &link_primer);
if (err) {
kfree(link);
return err;
@@ -660,13 +694,27 @@ struct bpf_prog *bpf_iter_get_info(struct bpf_iter_meta *meta, bool in_stop)
int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx)
{
+ struct bpf_run_ctx run_ctx, *old_run_ctx;
int ret;
- rcu_read_lock();
- migrate_disable();
- ret = BPF_PROG_RUN(prog, ctx);
- migrate_enable();
- rcu_read_unlock();
+ if (prog->aux->sleepable) {
+ rcu_read_lock_trace();
+ migrate_disable();
+ might_fault();
+ old_run_ctx = bpf_set_run_ctx(&run_ctx);
+ ret = bpf_prog_run(prog, ctx);
+ bpf_reset_run_ctx(old_run_ctx);
+ migrate_enable();
+ rcu_read_unlock_trace();
+ } else {
+ rcu_read_lock();
+ migrate_disable();
+ old_run_ctx = bpf_set_run_ctx(&run_ctx);
+ ret = bpf_prog_run(prog, ctx);
+ bpf_reset_run_ctx(old_run_ctx);
+ migrate_enable();
+ rcu_read_unlock();
+ }
/* bpf program can only return 0 or 1:
* 0 : okay
@@ -692,3 +740,109 @@ const struct bpf_func_proto bpf_for_each_map_elem_proto = {
.arg3_type = ARG_PTR_TO_STACK_OR_NULL,
.arg4_type = ARG_ANYTHING,
};
+
+BPF_CALL_4(bpf_loop, u32, nr_loops, void *, callback_fn, void *, callback_ctx,
+ u64, flags)
+{
+ bpf_callback_t callback = (bpf_callback_t)callback_fn;
+ u64 ret;
+ u32 i;
+
+ /* Note: these safety checks are also verified when bpf_loop
+ * is inlined, be careful to modify this code in sync. See
+ * function verifier.c:inline_bpf_loop.
+ */
+ if (flags)
+ return -EINVAL;
+ if (nr_loops > BPF_MAX_LOOPS)
+ return -E2BIG;
+
+ for (i = 0; i < nr_loops; i++) {
+ ret = callback((u64)i, (u64)(long)callback_ctx, 0, 0, 0);
+ /* return value: 0 - continue, 1 - stop and return */
+ if (ret)
+ return i + 1;
+ }
+
+ return i;
+}
+
+const struct bpf_func_proto bpf_loop_proto = {
+ .func = bpf_loop,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_ANYTHING,
+ .arg2_type = ARG_PTR_TO_FUNC,
+ .arg3_type = ARG_PTR_TO_STACK_OR_NULL,
+ .arg4_type = ARG_ANYTHING,
+};
+
+struct bpf_iter_num_kern {
+ int cur; /* current value, inclusive */
+ int end; /* final value, exclusive */
+} __aligned(8);
+
+__diag_push();
+__diag_ignore_all("-Wmissing-prototypes",
+ "Global functions as their definitions will be in vmlinux BTF");
+
+__bpf_kfunc int bpf_iter_num_new(struct bpf_iter_num *it, int start, int end)
+{
+ struct bpf_iter_num_kern *s = (void *)it;
+
+ BUILD_BUG_ON(sizeof(struct bpf_iter_num_kern) != sizeof(struct bpf_iter_num));
+ BUILD_BUG_ON(__alignof__(struct bpf_iter_num_kern) != __alignof__(struct bpf_iter_num));
+
+ BTF_TYPE_EMIT(struct btf_iter_num);
+
+ /* start == end is legit, it's an empty range and we'll just get NULL
+ * on first (and any subsequent) bpf_iter_num_next() call
+ */
+ if (start > end) {
+ s->cur = s->end = 0;
+ return -EINVAL;
+ }
+
+ /* avoid overflows, e.g., if start == INT_MIN and end == INT_MAX */
+ if ((s64)end - (s64)start > BPF_MAX_LOOPS) {
+ s->cur = s->end = 0;
+ return -E2BIG;
+ }
+
+ /* user will call bpf_iter_num_next() first,
+ * which will set s->cur to exactly start value;
+ * underflow shouldn't matter
+ */
+ s->cur = start - 1;
+ s->end = end;
+
+ return 0;
+}
+
+__bpf_kfunc int *bpf_iter_num_next(struct bpf_iter_num* it)
+{
+ struct bpf_iter_num_kern *s = (void *)it;
+
+ /* check failed initialization or if we are done (same behavior);
+ * need to be careful about overflow, so convert to s64 for checks,
+ * e.g., if s->cur == s->end == INT_MAX, we can't just do
+ * s->cur + 1 >= s->end
+ */
+ if ((s64)(s->cur + 1) >= s->end) {
+ s->cur = s->end = 0;
+ return NULL;
+ }
+
+ s->cur++;
+
+ return &s->cur;
+}
+
+__bpf_kfunc void bpf_iter_num_destroy(struct bpf_iter_num *it)
+{
+ struct bpf_iter_num_kern *s = (void *)it;
+
+ s->cur = s->end = 0;
+}
+
+__diag_pop();
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index b305270b7a4b..47d9948d768f 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -11,6 +11,9 @@
#include <net/sock.h>
#include <uapi/linux/sock_diag.h>
#include <uapi/linux/btf.h>
+#include <linux/rcupdate.h>
+#include <linux/rcupdate_trace.h>
+#include <linux/rcupdate_wait.h>
#define BPF_LOCAL_STORAGE_CREATE_FLAG_MASK (BPF_F_NO_PREALLOC | BPF_F_CLONE)
@@ -48,11 +51,21 @@ owner_storage(struct bpf_local_storage_map *smap, void *owner)
return map->ops->map_owner_storage_ptr(owner);
}
+static bool selem_linked_to_storage_lockless(const struct bpf_local_storage_elem *selem)
+{
+ return !hlist_unhashed_lockless(&selem->snode);
+}
+
static bool selem_linked_to_storage(const struct bpf_local_storage_elem *selem)
{
return !hlist_unhashed(&selem->snode);
}
+static bool selem_linked_to_map_lockless(const struct bpf_local_storage_elem *selem)
+{
+ return !hlist_unhashed_lockless(&selem->map_node);
+}
+
static bool selem_linked_to_map(const struct bpf_local_storage_elem *selem)
{
return !hlist_unhashed(&selem->map_node);
@@ -60,18 +73,35 @@ static bool selem_linked_to_map(const struct bpf_local_storage_elem *selem)
struct bpf_local_storage_elem *
bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
- void *value, bool charge_mem)
+ void *value, bool charge_mem, gfp_t gfp_flags)
{
struct bpf_local_storage_elem *selem;
if (charge_mem && mem_charge(smap, owner, smap->elem_size))
return NULL;
- selem = bpf_map_kzalloc(&smap->map, smap->elem_size,
- GFP_ATOMIC | __GFP_NOWARN);
+ if (smap->bpf_ma) {
+ migrate_disable();
+ selem = bpf_mem_cache_alloc_flags(&smap->selem_ma, gfp_flags);
+ migrate_enable();
+ if (selem)
+ /* Keep the original bpf_map_kzalloc behavior
+ * before started using the bpf_mem_cache_alloc.
+ *
+ * No need to use zero_map_value. The bpf_selem_free()
+ * only does bpf_mem_cache_free when there is
+ * no other bpf prog is using the selem.
+ */
+ memset(SDATA(selem)->data, 0, smap->map.value_size);
+ } else {
+ selem = bpf_map_kzalloc(&smap->map, smap->elem_size,
+ gfp_flags | __GFP_NOWARN);
+ }
+
if (selem) {
if (value)
- memcpy(SDATA(selem)->data, value, smap->map.value_size);
+ copy_map_value(&smap->map, SDATA(selem)->data, value);
+ /* No need to call check_and_init_map_value as memory is zero init */
return selem;
}
@@ -81,19 +111,154 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
return NULL;
}
+/* rcu tasks trace callback for bpf_ma == false */
+static void __bpf_local_storage_free_trace_rcu(struct rcu_head *rcu)
+{
+ struct bpf_local_storage *local_storage;
+
+ /* If RCU Tasks Trace grace period implies RCU grace period, do
+ * kfree(), else do kfree_rcu().
+ */
+ local_storage = container_of(rcu, struct bpf_local_storage, rcu);
+ if (rcu_trace_implies_rcu_gp())
+ kfree(local_storage);
+ else
+ kfree_rcu(local_storage, rcu);
+}
+
+static void bpf_local_storage_free_rcu(struct rcu_head *rcu)
+{
+ struct bpf_local_storage *local_storage;
+
+ local_storage = container_of(rcu, struct bpf_local_storage, rcu);
+ bpf_mem_cache_raw_free(local_storage);
+}
+
+static void bpf_local_storage_free_trace_rcu(struct rcu_head *rcu)
+{
+ if (rcu_trace_implies_rcu_gp())
+ bpf_local_storage_free_rcu(rcu);
+ else
+ call_rcu(rcu, bpf_local_storage_free_rcu);
+}
+
+/* Handle bpf_ma == false */
+static void __bpf_local_storage_free(struct bpf_local_storage *local_storage,
+ bool vanilla_rcu)
+{
+ if (vanilla_rcu)
+ kfree_rcu(local_storage, rcu);
+ else
+ call_rcu_tasks_trace(&local_storage->rcu,
+ __bpf_local_storage_free_trace_rcu);
+}
+
+static void bpf_local_storage_free(struct bpf_local_storage *local_storage,
+ struct bpf_local_storage_map *smap,
+ bool bpf_ma, bool reuse_now)
+{
+ if (!local_storage)
+ return;
+
+ if (!bpf_ma) {
+ __bpf_local_storage_free(local_storage, reuse_now);
+ return;
+ }
+
+ if (!reuse_now) {
+ call_rcu_tasks_trace(&local_storage->rcu,
+ bpf_local_storage_free_trace_rcu);
+ return;
+ }
+
+ if (smap) {
+ migrate_disable();
+ bpf_mem_cache_free(&smap->storage_ma, local_storage);
+ migrate_enable();
+ } else {
+ /* smap could be NULL if the selem that triggered
+ * this 'local_storage' creation had been long gone.
+ * In this case, directly do call_rcu().
+ */
+ call_rcu(&local_storage->rcu, bpf_local_storage_free_rcu);
+ }
+}
+
+/* rcu tasks trace callback for bpf_ma == false */
+static void __bpf_selem_free_trace_rcu(struct rcu_head *rcu)
+{
+ struct bpf_local_storage_elem *selem;
+
+ selem = container_of(rcu, struct bpf_local_storage_elem, rcu);
+ if (rcu_trace_implies_rcu_gp())
+ kfree(selem);
+ else
+ kfree_rcu(selem, rcu);
+}
+
+/* Handle bpf_ma == false */
+static void __bpf_selem_free(struct bpf_local_storage_elem *selem,
+ bool vanilla_rcu)
+{
+ if (vanilla_rcu)
+ kfree_rcu(selem, rcu);
+ else
+ call_rcu_tasks_trace(&selem->rcu, __bpf_selem_free_trace_rcu);
+}
+
+static void bpf_selem_free_rcu(struct rcu_head *rcu)
+{
+ struct bpf_local_storage_elem *selem;
+
+ selem = container_of(rcu, struct bpf_local_storage_elem, rcu);
+ bpf_mem_cache_raw_free(selem);
+}
+
+static void bpf_selem_free_trace_rcu(struct rcu_head *rcu)
+{
+ if (rcu_trace_implies_rcu_gp())
+ bpf_selem_free_rcu(rcu);
+ else
+ call_rcu(rcu, bpf_selem_free_rcu);
+}
+
+void bpf_selem_free(struct bpf_local_storage_elem *selem,
+ struct bpf_local_storage_map *smap,
+ bool reuse_now)
+{
+ bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
+
+ if (!smap->bpf_ma) {
+ __bpf_selem_free(selem, reuse_now);
+ return;
+ }
+
+ if (!reuse_now) {
+ call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_trace_rcu);
+ } else {
+ /* Instead of using the vanilla call_rcu(),
+ * bpf_mem_cache_free will be able to reuse selem
+ * immediately.
+ */
+ migrate_disable();
+ bpf_mem_cache_free(&smap->selem_ma, selem);
+ migrate_enable();
+ }
+}
+
/* local_storage->lock must be held and selem->local_storage == local_storage.
* The caller must ensure selem->smap is still valid to be
* dereferenced for its smap->elem_size and smap->cache_idx.
*/
-bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
- struct bpf_local_storage_elem *selem,
- bool uncharge_mem)
+static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
+ struct bpf_local_storage_elem *selem,
+ bool uncharge_mem, bool reuse_now)
{
struct bpf_local_storage_map *smap;
bool free_local_storage;
void *owner;
- smap = rcu_dereference(SDATA(selem)->smap);
+ smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
owner = local_storage->owner;
/* All uncharging on the owner must be done first.
@@ -117,13 +282,13 @@ bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
* will be done by the caller.
*
* Although the unlock will be done under
- * rcu_read_lock(), it is more intutivie to
- * read if kfree_rcu(local_storage, rcu) is done
+ * rcu_read_lock(), it is more intuitive to
+ * read if the freeing of the storage is done
* after the raw_spin_unlock_bh(&local_storage->lock).
*
* Hence, a "bool free_local_storage" is returned
- * to the caller which then calls the kfree_rcu()
- * after unlock.
+ * to the caller which then calls then frees the storage after
+ * all the RCU grace periods have expired.
*/
}
hlist_del_init_rcu(&selem->snode);
@@ -131,30 +296,75 @@ bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
SDATA(selem))
RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL);
- kfree_rcu(selem, rcu);
+ bpf_selem_free(selem, smap, reuse_now);
+
+ if (rcu_access_pointer(local_storage->smap) == smap)
+ RCU_INIT_POINTER(local_storage->smap, NULL);
return free_local_storage;
}
-static void __bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem)
+static bool check_storage_bpf_ma(struct bpf_local_storage *local_storage,
+ struct bpf_local_storage_map *storage_smap,
+ struct bpf_local_storage_elem *selem)
{
+
+ struct bpf_local_storage_map *selem_smap;
+
+ /* local_storage->smap may be NULL. If it is, get the bpf_ma
+ * from any selem in the local_storage->list. The bpf_ma of all
+ * local_storage and selem should have the same value
+ * for the same map type.
+ *
+ * If the local_storage->list is already empty, the caller will not
+ * care about the bpf_ma value also because the caller is not
+ * responsibile to free the local_storage.
+ */
+
+ if (storage_smap)
+ return storage_smap->bpf_ma;
+
+ if (!selem) {
+ struct hlist_node *n;
+
+ n = rcu_dereference_check(hlist_first_rcu(&local_storage->list),
+ bpf_rcu_lock_held());
+ if (!n)
+ return false;
+
+ selem = hlist_entry(n, struct bpf_local_storage_elem, snode);
+ }
+ selem_smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
+
+ return selem_smap->bpf_ma;
+}
+
+static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem,
+ bool reuse_now)
+{
+ struct bpf_local_storage_map *storage_smap;
struct bpf_local_storage *local_storage;
- bool free_local_storage = false;
+ bool bpf_ma, free_local_storage = false;
unsigned long flags;
- if (unlikely(!selem_linked_to_storage(selem)))
+ if (unlikely(!selem_linked_to_storage_lockless(selem)))
/* selem has already been unlinked from sk */
return;
- local_storage = rcu_dereference(selem->local_storage);
+ local_storage = rcu_dereference_check(selem->local_storage,
+ bpf_rcu_lock_held());
+ storage_smap = rcu_dereference_check(local_storage->smap,
+ bpf_rcu_lock_held());
+ bpf_ma = check_storage_bpf_ma(local_storage, storage_smap, selem);
+
raw_spin_lock_irqsave(&local_storage->lock, flags);
if (likely(selem_linked_to_storage(selem)))
free_local_storage = bpf_selem_unlink_storage_nolock(
- local_storage, selem, true);
+ local_storage, selem, true, reuse_now);
raw_spin_unlock_irqrestore(&local_storage->lock, flags);
if (free_local_storage)
- kfree_rcu(local_storage, rcu);
+ bpf_local_storage_free(local_storage, storage_smap, bpf_ma, reuse_now);
}
void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage,
@@ -164,17 +374,17 @@ void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage,
hlist_add_head_rcu(&selem->snode, &local_storage->list);
}
-void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem)
+static void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem)
{
struct bpf_local_storage_map *smap;
struct bpf_local_storage_map_bucket *b;
unsigned long flags;
- if (unlikely(!selem_linked_to_map(selem)))
+ if (unlikely(!selem_linked_to_map_lockless(selem)))
/* selem has already be unlinked from smap */
return;
- smap = rcu_dereference(SDATA(selem)->smap);
+ smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
b = select_bucket(smap, selem);
raw_spin_lock_irqsave(&b->lock, flags);
if (likely(selem_linked_to_map(selem)))
@@ -194,16 +404,17 @@ void bpf_selem_link_map(struct bpf_local_storage_map *smap,
raw_spin_unlock_irqrestore(&b->lock, flags);
}
-void bpf_selem_unlink(struct bpf_local_storage_elem *selem)
+void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now)
{
/* Always unlink from map before unlinking from local_storage
* because selem will be freed after successfully unlinked from
* the local_storage.
*/
bpf_selem_unlink_map(selem);
- __bpf_selem_unlink_storage(selem);
+ bpf_selem_unlink_storage(selem, reuse_now);
}
+/* If cacheit_lockit is false, this lookup function is lockless */
struct bpf_local_storage_data *
bpf_local_storage_lookup(struct bpf_local_storage *local_storage,
struct bpf_local_storage_map *smap,
@@ -213,12 +424,14 @@ bpf_local_storage_lookup(struct bpf_local_storage *local_storage,
struct bpf_local_storage_elem *selem;
/* Fast path (cache hit) */
- sdata = rcu_dereference(local_storage->cache[smap->cache_idx]);
+ sdata = rcu_dereference_check(local_storage->cache[smap->cache_idx],
+ bpf_rcu_lock_held());
if (sdata && rcu_access_pointer(sdata->smap) == smap)
return sdata;
/* Slow path (cache miss) */
- hlist_for_each_entry_rcu(selem, &local_storage->list, snode)
+ hlist_for_each_entry_rcu(selem, &local_storage->list, snode,
+ rcu_read_lock_trace_held())
if (rcu_access_pointer(SDATA(selem)->smap) == smap)
break;
@@ -260,7 +473,8 @@ static int check_flags(const struct bpf_local_storage_data *old_sdata,
int bpf_local_storage_alloc(void *owner,
struct bpf_local_storage_map *smap,
- struct bpf_local_storage_elem *first_selem)
+ struct bpf_local_storage_elem *first_selem,
+ gfp_t gfp_flags)
{
struct bpf_local_storage *prev_storage, *storage;
struct bpf_local_storage **owner_storage_ptr;
@@ -270,13 +484,21 @@ int bpf_local_storage_alloc(void *owner,
if (err)
return err;
- storage = bpf_map_kzalloc(&smap->map, sizeof(*storage),
- GFP_ATOMIC | __GFP_NOWARN);
+ if (smap->bpf_ma) {
+ migrate_disable();
+ storage = bpf_mem_cache_alloc_flags(&smap->storage_ma, gfp_flags);
+ migrate_enable();
+ } else {
+ storage = bpf_map_kzalloc(&smap->map, sizeof(*storage),
+ gfp_flags | __GFP_NOWARN);
+ }
+
if (!storage) {
err = -ENOMEM;
goto uncharge;
}
+ RCU_INIT_POINTER(storage->smap, smap);
INIT_HLIST_HEAD(&storage->list);
raw_spin_lock_init(&storage->lock);
storage->owner = owner;
@@ -306,7 +528,8 @@ int bpf_local_storage_alloc(void *owner,
* bucket->list, first_selem can be freed immediately
* (instead of kfree_rcu) because
* bpf_local_storage_map_free() does a
- * synchronize_rcu() before walking the bucket->list.
+ * synchronize_rcu_mult (waiting for both sleepable and
+ * normal programs) before walking the bucket->list.
* Hence, no one is accessing selem from the
* bucket->list under rcu_read_lock().
*/
@@ -315,7 +538,7 @@ int bpf_local_storage_alloc(void *owner,
return 0;
uncharge:
- kfree(storage);
+ bpf_local_storage_free(storage, smap, smap->bpf_ma, true);
mem_uncharge(smap, owner, sizeof(*storage));
return err;
}
@@ -327,10 +550,10 @@ uncharge:
*/
struct bpf_local_storage_data *
bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
- void *value, u64 map_flags)
+ void *value, u64 map_flags, gfp_t gfp_flags)
{
struct bpf_local_storage_data *old_sdata = NULL;
- struct bpf_local_storage_elem *selem;
+ struct bpf_local_storage_elem *selem = NULL;
struct bpf_local_storage *local_storage;
unsigned long flags;
int err;
@@ -339,23 +562,27 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST) ||
/* BPF_F_LOCK can only be used in a value with spin_lock */
unlikely((map_flags & BPF_F_LOCK) &&
- !map_value_has_spin_lock(&smap->map)))
+ !btf_record_has_field(smap->map.record, BPF_SPIN_LOCK)))
+ return ERR_PTR(-EINVAL);
+
+ if (gfp_flags == GFP_KERNEL && (map_flags & ~BPF_F_LOCK) != BPF_NOEXIST)
return ERR_PTR(-EINVAL);
- local_storage = rcu_dereference(*owner_storage(smap, owner));
+ local_storage = rcu_dereference_check(*owner_storage(smap, owner),
+ bpf_rcu_lock_held());
if (!local_storage || hlist_empty(&local_storage->list)) {
/* Very first elem for the owner */
err = check_flags(NULL, map_flags);
if (err)
return ERR_PTR(err);
- selem = bpf_selem_alloc(smap, owner, value, true);
+ selem = bpf_selem_alloc(smap, owner, value, true, gfp_flags);
if (!selem)
return ERR_PTR(-ENOMEM);
- err = bpf_local_storage_alloc(owner, smap, selem);
+ err = bpf_local_storage_alloc(owner, smap, selem, gfp_flags);
if (err) {
- kfree(selem);
+ bpf_selem_free(selem, smap, true);
mem_uncharge(smap, owner, smap->elem_size);
return ERR_PTR(err);
}
@@ -373,13 +600,19 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
err = check_flags(old_sdata, map_flags);
if (err)
return ERR_PTR(err);
- if (old_sdata && selem_linked_to_storage(SELEM(old_sdata))) {
+ if (old_sdata && selem_linked_to_storage_lockless(SELEM(old_sdata))) {
copy_map_value_locked(&smap->map, old_sdata->data,
value, false);
return old_sdata;
}
}
+ if (gfp_flags == GFP_KERNEL) {
+ selem = bpf_selem_alloc(smap, owner, value, true, gfp_flags);
+ if (!selem)
+ return ERR_PTR(-ENOMEM);
+ }
+
raw_spin_lock_irqsave(&local_storage->lock, flags);
/* Recheck local_storage->list under local_storage->lock */
@@ -405,19 +638,21 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
goto unlock;
}
- /* local_storage->lock is held. Hence, we are sure
- * we can unlink and uncharge the old_sdata successfully
- * later. Hence, instead of charging the new selem now
- * and then uncharge the old selem later (which may cause
- * a potential but unnecessary charge failure), avoid taking
- * a charge at all here (the "!old_sdata" check) and the
- * old_sdata will not be uncharged later during
- * bpf_selem_unlink_storage_nolock().
- */
- selem = bpf_selem_alloc(smap, owner, value, !old_sdata);
- if (!selem) {
- err = -ENOMEM;
- goto unlock_err;
+ if (gfp_flags != GFP_KERNEL) {
+ /* local_storage->lock is held. Hence, we are sure
+ * we can unlink and uncharge the old_sdata successfully
+ * later. Hence, instead of charging the new selem now
+ * and then uncharge the old selem later (which may cause
+ * a potential but unnecessary charge failure), avoid taking
+ * a charge at all here (the "!old_sdata" check) and the
+ * old_sdata will not be uncharged later during
+ * bpf_selem_unlink_storage_nolock().
+ */
+ selem = bpf_selem_alloc(smap, owner, value, !old_sdata, gfp_flags);
+ if (!selem) {
+ err = -ENOMEM;
+ goto unlock_err;
+ }
}
/* First, link the new selem to the map */
@@ -430,7 +665,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
if (old_sdata) {
bpf_selem_unlink_map(SELEM(old_sdata));
bpf_selem_unlink_storage_nolock(local_storage, SELEM(old_sdata),
- false);
+ false, false);
}
unlock:
@@ -439,10 +674,14 @@ unlock:
unlock_err:
raw_spin_unlock_irqrestore(&local_storage->lock, flags);
+ if (selem) {
+ mem_uncharge(smap, owner, smap->elem_size);
+ bpf_selem_free(selem, smap, true);
+ }
return ERR_PTR(err);
}
-u16 bpf_local_storage_cache_idx_get(struct bpf_local_storage_cache *cache)
+static u16 bpf_local_storage_cache_idx_get(struct bpf_local_storage_cache *cache)
{
u64 min_usage = U64_MAX;
u16 i, res = 0;
@@ -466,21 +705,180 @@ u16 bpf_local_storage_cache_idx_get(struct bpf_local_storage_cache *cache)
return res;
}
-void bpf_local_storage_cache_idx_free(struct bpf_local_storage_cache *cache,
- u16 idx)
+static void bpf_local_storage_cache_idx_free(struct bpf_local_storage_cache *cache,
+ u16 idx)
{
spin_lock(&cache->idx_lock);
cache->idx_usage_counts[idx]--;
spin_unlock(&cache->idx_lock);
}
-void bpf_local_storage_map_free(struct bpf_local_storage_map *smap,
- int __percpu *busy_counter)
+int bpf_local_storage_map_alloc_check(union bpf_attr *attr)
+{
+ if (attr->map_flags & ~BPF_LOCAL_STORAGE_CREATE_FLAG_MASK ||
+ !(attr->map_flags & BPF_F_NO_PREALLOC) ||
+ attr->max_entries ||
+ attr->key_size != sizeof(int) || !attr->value_size ||
+ /* Enforce BTF for userspace sk dumping */
+ !attr->btf_key_type_id || !attr->btf_value_type_id)
+ return -EINVAL;
+
+ if (!bpf_capable())
+ return -EPERM;
+
+ if (attr->value_size > BPF_LOCAL_STORAGE_MAX_VALUE_SIZE)
+ return -E2BIG;
+
+ return 0;
+}
+
+int bpf_local_storage_map_check_btf(const struct bpf_map *map,
+ const struct btf *btf,
+ const struct btf_type *key_type,
+ const struct btf_type *value_type)
+{
+ u32 int_data;
+
+ if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT)
+ return -EINVAL;
+
+ int_data = *(u32 *)(key_type + 1);
+ if (BTF_INT_BITS(int_data) != 32 || BTF_INT_OFFSET(int_data))
+ return -EINVAL;
+
+ return 0;
+}
+
+void bpf_local_storage_destroy(struct bpf_local_storage *local_storage)
{
+ struct bpf_local_storage_map *storage_smap;
struct bpf_local_storage_elem *selem;
+ bool bpf_ma, free_storage = false;
+ struct hlist_node *n;
+ unsigned long flags;
+
+ storage_smap = rcu_dereference_check(local_storage->smap, bpf_rcu_lock_held());
+ bpf_ma = check_storage_bpf_ma(local_storage, storage_smap, NULL);
+
+ /* Neither the bpf_prog nor the bpf_map's syscall
+ * could be modifying the local_storage->list now.
+ * Thus, no elem can be added to or deleted from the
+ * local_storage->list by the bpf_prog or by the bpf_map's syscall.
+ *
+ * It is racing with bpf_local_storage_map_free() alone
+ * when unlinking elem from the local_storage->list and
+ * the map's bucket->list.
+ */
+ raw_spin_lock_irqsave(&local_storage->lock, flags);
+ hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) {
+ /* Always unlink from map before unlinking from
+ * local_storage.
+ */
+ bpf_selem_unlink_map(selem);
+ /* If local_storage list has only one element, the
+ * bpf_selem_unlink_storage_nolock() will return true.
+ * Otherwise, it will return false. The current loop iteration
+ * intends to remove all local storage. So the last iteration
+ * of the loop will set the free_cgroup_storage to true.
+ */
+ free_storage = bpf_selem_unlink_storage_nolock(
+ local_storage, selem, false, true);
+ }
+ raw_spin_unlock_irqrestore(&local_storage->lock, flags);
+
+ if (free_storage)
+ bpf_local_storage_free(local_storage, storage_smap, bpf_ma, true);
+}
+
+u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map)
+{
+ struct bpf_local_storage_map *smap = (struct bpf_local_storage_map *)map;
+ u64 usage = sizeof(*smap);
+
+ /* The dynamically callocated selems are not counted currently. */
+ usage += sizeof(*smap->buckets) * (1ULL << smap->bucket_log);
+ return usage;
+}
+
+/* When bpf_ma == true, the bpf_mem_alloc is used to allocate and free memory.
+ * A deadlock free allocator is useful for storage that the bpf prog can easily
+ * get a hold of the owner PTR_TO_BTF_ID in any context. eg. bpf_get_current_task_btf.
+ * The task and cgroup storage fall into this case. The bpf_mem_alloc reuses
+ * memory immediately. To be reuse-immediate safe, the owner destruction
+ * code path needs to go through a rcu grace period before calling
+ * bpf_local_storage_destroy().
+ *
+ * When bpf_ma == false, the kmalloc and kfree are used.
+ */
+struct bpf_map *
+bpf_local_storage_map_alloc(union bpf_attr *attr,
+ struct bpf_local_storage_cache *cache,
+ bool bpf_ma)
+{
+ struct bpf_local_storage_map *smap;
+ unsigned int i;
+ u32 nbuckets;
+ int err;
+
+ smap = bpf_map_area_alloc(sizeof(*smap), NUMA_NO_NODE);
+ if (!smap)
+ return ERR_PTR(-ENOMEM);
+ bpf_map_init_from_attr(&smap->map, attr);
+
+ nbuckets = roundup_pow_of_two(num_possible_cpus());
+ /* Use at least 2 buckets, select_bucket() is undefined behavior with 1 bucket */
+ nbuckets = max_t(u32, 2, nbuckets);
+ smap->bucket_log = ilog2(nbuckets);
+
+ smap->buckets = bpf_map_kvcalloc(&smap->map, sizeof(*smap->buckets),
+ nbuckets, GFP_USER | __GFP_NOWARN);
+ if (!smap->buckets) {
+ err = -ENOMEM;
+ goto free_smap;
+ }
+
+ for (i = 0; i < nbuckets; i++) {
+ INIT_HLIST_HEAD(&smap->buckets[i].list);
+ raw_spin_lock_init(&smap->buckets[i].lock);
+ }
+
+ smap->elem_size = offsetof(struct bpf_local_storage_elem,
+ sdata.data[attr->value_size]);
+
+ smap->bpf_ma = bpf_ma;
+ if (bpf_ma) {
+ err = bpf_mem_alloc_init(&smap->selem_ma, smap->elem_size, false);
+ if (err)
+ goto free_smap;
+
+ err = bpf_mem_alloc_init(&smap->storage_ma, sizeof(struct bpf_local_storage), false);
+ if (err) {
+ bpf_mem_alloc_destroy(&smap->selem_ma);
+ goto free_smap;
+ }
+ }
+
+ smap->cache_idx = bpf_local_storage_cache_idx_get(cache);
+ return &smap->map;
+
+free_smap:
+ kvfree(smap->buckets);
+ bpf_map_area_free(smap);
+ return ERR_PTR(err);
+}
+
+void bpf_local_storage_map_free(struct bpf_map *map,
+ struct bpf_local_storage_cache *cache,
+ int __percpu *busy_counter)
+{
struct bpf_local_storage_map_bucket *b;
+ struct bpf_local_storage_elem *selem;
+ struct bpf_local_storage_map *smap;
unsigned int i;
+ smap = (struct bpf_local_storage_map *)map;
+ bpf_local_storage_cache_idx_free(cache, smap->cache_idx);
+
/* Note that this map might be concurrently cloned from
* bpf_sk_storage_clone. Wait for any existing bpf_sk_storage_clone
* RCU read section to finish before proceeding. New RCU
@@ -506,11 +904,11 @@ void bpf_local_storage_map_free(struct bpf_local_storage_map *smap,
struct bpf_local_storage_elem, map_node))) {
if (busy_counter) {
migrate_disable();
- __this_cpu_inc(*busy_counter);
+ this_cpu_inc(*busy_counter);
}
- bpf_selem_unlink(selem);
+ bpf_selem_unlink(selem, true);
if (busy_counter) {
- __this_cpu_dec(*busy_counter);
+ this_cpu_dec(*busy_counter);
migrate_enable();
}
cond_resched_rcu();
@@ -532,76 +930,10 @@ void bpf_local_storage_map_free(struct bpf_local_storage_map *smap,
*/
synchronize_rcu();
- kvfree(smap->buckets);
- kfree(smap);
-}
-
-int bpf_local_storage_map_alloc_check(union bpf_attr *attr)
-{
- if (attr->map_flags & ~BPF_LOCAL_STORAGE_CREATE_FLAG_MASK ||
- !(attr->map_flags & BPF_F_NO_PREALLOC) ||
- attr->max_entries ||
- attr->key_size != sizeof(int) || !attr->value_size ||
- /* Enforce BTF for userspace sk dumping */
- !attr->btf_key_type_id || !attr->btf_value_type_id)
- return -EINVAL;
-
- if (!bpf_capable())
- return -EPERM;
-
- if (attr->value_size > BPF_LOCAL_STORAGE_MAX_VALUE_SIZE)
- return -E2BIG;
-
- return 0;
-}
-
-struct bpf_local_storage_map *bpf_local_storage_map_alloc(union bpf_attr *attr)
-{
- struct bpf_local_storage_map *smap;
- unsigned int i;
- u32 nbuckets;
-
- smap = kzalloc(sizeof(*smap), GFP_USER | __GFP_NOWARN | __GFP_ACCOUNT);
- if (!smap)
- return ERR_PTR(-ENOMEM);
- bpf_map_init_from_attr(&smap->map, attr);
-
- nbuckets = roundup_pow_of_two(num_possible_cpus());
- /* Use at least 2 buckets, select_bucket() is undefined behavior with 1 bucket */
- nbuckets = max_t(u32, 2, nbuckets);
- smap->bucket_log = ilog2(nbuckets);
-
- smap->buckets = kvcalloc(sizeof(*smap->buckets), nbuckets,
- GFP_USER | __GFP_NOWARN | __GFP_ACCOUNT);
- if (!smap->buckets) {
- kfree(smap);
- return ERR_PTR(-ENOMEM);
- }
-
- for (i = 0; i < nbuckets; i++) {
- INIT_HLIST_HEAD(&smap->buckets[i].list);
- raw_spin_lock_init(&smap->buckets[i].lock);
+ if (smap->bpf_ma) {
+ bpf_mem_alloc_destroy(&smap->selem_ma);
+ bpf_mem_alloc_destroy(&smap->storage_ma);
}
-
- smap->elem_size =
- sizeof(struct bpf_local_storage_elem) + attr->value_size;
-
- return smap;
-}
-
-int bpf_local_storage_map_check_btf(const struct bpf_map *map,
- const struct btf *btf,
- const struct btf_type *key_type,
- const struct btf_type *value_type)
-{
- u32 int_data;
-
- if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT)
- return -EINVAL;
-
- int_data = *(u32 *)(key_type + 1);
- if (BTF_INT_BITS(int_data) != 32 || BTF_INT_OFFSET(int_data))
- return -EINVAL;
-
- return 0;
+ kvfree(smap->buckets);
+ bpf_map_area_free(smap);
}
diff --git a/kernel/bpf/bpf_lru_list.h b/kernel/bpf/bpf_lru_list.h
index 6b12f06ee18c..4ea227c9c1ad 100644
--- a/kernel/bpf/bpf_lru_list.h
+++ b/kernel/bpf/bpf_lru_list.h
@@ -4,6 +4,7 @@
#ifndef __BPF_LRU_LIST_H_
#define __BPF_LRU_LIST_H_
+#include <linux/cache.h>
#include <linux/list.h>
#include <linux/spinlock_types.h>
diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
index 06062370c3b8..e14c822f8911 100644
--- a/kernel/bpf/bpf_lsm.c
+++ b/kernel/bpf/bpf_lsm.c
@@ -16,6 +16,7 @@
#include <linux/bpf_local_storage.h>
#include <linux/btf_ids.h>
#include <linux/ima.h>
+#include <linux/bpf-cgroup.h>
/* For every LSM hook that allows attachment of BPF programs, declare a nop
* function where a BPF program can be attached.
@@ -35,6 +36,64 @@ BTF_SET_START(bpf_lsm_hooks)
#undef LSM_HOOK
BTF_SET_END(bpf_lsm_hooks)
+/* List of LSM hooks that should operate on 'current' cgroup regardless
+ * of function signature.
+ */
+BTF_SET_START(bpf_lsm_current_hooks)
+/* operate on freshly allocated sk without any cgroup association */
+#ifdef CONFIG_SECURITY_NETWORK
+BTF_ID(func, bpf_lsm_sk_alloc_security)
+BTF_ID(func, bpf_lsm_sk_free_security)
+#endif
+BTF_SET_END(bpf_lsm_current_hooks)
+
+/* List of LSM hooks that trigger while the socket is properly locked.
+ */
+BTF_SET_START(bpf_lsm_locked_sockopt_hooks)
+#ifdef CONFIG_SECURITY_NETWORK
+BTF_ID(func, bpf_lsm_sock_graft)
+BTF_ID(func, bpf_lsm_inet_csk_clone)
+BTF_ID(func, bpf_lsm_inet_conn_established)
+#endif
+BTF_SET_END(bpf_lsm_locked_sockopt_hooks)
+
+/* List of LSM hooks that trigger while the socket is _not_ locked,
+ * but it's ok to call bpf_{g,s}etsockopt because the socket is still
+ * in the early init phase.
+ */
+BTF_SET_START(bpf_lsm_unlocked_sockopt_hooks)
+#ifdef CONFIG_SECURITY_NETWORK
+BTF_ID(func, bpf_lsm_socket_post_create)
+BTF_ID(func, bpf_lsm_socket_socketpair)
+#endif
+BTF_SET_END(bpf_lsm_unlocked_sockopt_hooks)
+
+#ifdef CONFIG_CGROUP_BPF
+void bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog,
+ bpf_func_t *bpf_func)
+{
+ const struct btf_param *args __maybe_unused;
+
+ if (btf_type_vlen(prog->aux->attach_func_proto) < 1 ||
+ btf_id_set_contains(&bpf_lsm_current_hooks,
+ prog->aux->attach_btf_id)) {
+ *bpf_func = __cgroup_bpf_run_lsm_current;
+ return;
+ }
+
+#ifdef CONFIG_NET
+ args = btf_params(prog->aux->attach_func_proto);
+
+ if (args[0].type == btf_sock_ids[BTF_SOCK_TYPE_SOCKET])
+ *bpf_func = __cgroup_bpf_run_lsm_socket;
+ else if (args[0].type == btf_sock_ids[BTF_SOCK_TYPE_SOCK])
+ *bpf_func = __cgroup_bpf_run_lsm_sock;
+ else
+#endif
+ *bpf_func = __cgroup_bpf_run_lsm_current;
+}
+#endif
+
int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog,
const struct bpf_prog *prog)
{
@@ -91,6 +150,7 @@ BTF_ID_LIST_SINGLE(bpf_ima_inode_hash_btf_ids, struct, inode)
static const struct bpf_func_proto bpf_ima_inode_hash_proto = {
.func = bpf_ima_inode_hash,
.gpl_only = false,
+ .might_sleep = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_BTF_ID,
.arg1_btf_id = &bpf_ima_inode_hash_btf_ids[0],
@@ -99,9 +159,51 @@ static const struct bpf_func_proto bpf_ima_inode_hash_proto = {
.allowed = bpf_ima_inode_hash_allowed,
};
+BPF_CALL_3(bpf_ima_file_hash, struct file *, file, void *, dst, u32, size)
+{
+ return ima_file_hash(file, dst, size);
+}
+
+BTF_ID_LIST_SINGLE(bpf_ima_file_hash_btf_ids, struct, file)
+
+static const struct bpf_func_proto bpf_ima_file_hash_proto = {
+ .func = bpf_ima_file_hash,
+ .gpl_only = false,
+ .might_sleep = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_BTF_ID,
+ .arg1_btf_id = &bpf_ima_file_hash_btf_ids[0],
+ .arg2_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg3_type = ARG_CONST_SIZE,
+ .allowed = bpf_ima_inode_hash_allowed,
+};
+
+BPF_CALL_1(bpf_get_attach_cookie, void *, ctx)
+{
+ struct bpf_trace_run_ctx *run_ctx;
+
+ run_ctx = container_of(current->bpf_ctx, struct bpf_trace_run_ctx, run_ctx);
+ return run_ctx->bpf_cookie;
+}
+
+static const struct bpf_func_proto bpf_get_attach_cookie_proto = {
+ .func = bpf_get_attach_cookie,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+
static const struct bpf_func_proto *
bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
+ const struct bpf_func_proto *func_proto;
+
+ if (prog->expected_attach_type == BPF_LSM_CGROUP) {
+ func_proto = cgroup_common_func_proto(func_id, prog);
+ if (func_proto)
+ return func_proto;
+ }
+
switch (func_id) {
case BPF_FUNC_inode_storage_get:
return &bpf_inode_storage_get_proto;
@@ -120,7 +222,33 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_bprm_opts_set:
return &bpf_bprm_opts_set_proto;
case BPF_FUNC_ima_inode_hash:
- return prog->aux->sleepable ? &bpf_ima_inode_hash_proto : NULL;
+ return &bpf_ima_inode_hash_proto;
+ case BPF_FUNC_ima_file_hash:
+ return &bpf_ima_file_hash_proto;
+ case BPF_FUNC_get_attach_cookie:
+ return bpf_prog_has_trampoline(prog) ? &bpf_get_attach_cookie_proto : NULL;
+#ifdef CONFIG_NET
+ case BPF_FUNC_setsockopt:
+ if (prog->expected_attach_type != BPF_LSM_CGROUP)
+ return NULL;
+ if (btf_id_set_contains(&bpf_lsm_locked_sockopt_hooks,
+ prog->aux->attach_btf_id))
+ return &bpf_sk_setsockopt_proto;
+ if (btf_id_set_contains(&bpf_lsm_unlocked_sockopt_hooks,
+ prog->aux->attach_btf_id))
+ return &bpf_unlocked_sk_setsockopt_proto;
+ return NULL;
+ case BPF_FUNC_getsockopt:
+ if (prog->expected_attach_type != BPF_LSM_CGROUP)
+ return NULL;
+ if (btf_id_set_contains(&bpf_lsm_locked_sockopt_hooks,
+ prog->aux->attach_btf_id))
+ return &bpf_sk_getsockopt_proto;
+ if (btf_id_set_contains(&bpf_lsm_unlocked_sockopt_hooks,
+ prog->aux->attach_btf_id))
+ return &bpf_unlocked_sk_getsockopt_proto;
+ return NULL;
+#endif
default:
return tracing_prog_func_proto(func_id, prog);
}
@@ -167,6 +295,7 @@ BTF_ID(func, bpf_lsm_inode_setxattr)
BTF_ID(func, bpf_lsm_inode_symlink)
BTF_ID(func, bpf_lsm_inode_unlink)
BTF_ID(func, bpf_lsm_kernel_module_request)
+BTF_ID(func, bpf_lsm_kernel_read_file)
BTF_ID(func, bpf_lsm_kernfs_init_security)
#ifdef CONFIG_KEYS
@@ -207,18 +336,37 @@ BTF_ID(func, bpf_lsm_socket_socketpair)
BTF_ID(func, bpf_lsm_syslog)
BTF_ID(func, bpf_lsm_task_alloc)
-BTF_ID(func, bpf_lsm_task_getsecid_subj)
+BTF_ID(func, bpf_lsm_current_getsecid_subj)
BTF_ID(func, bpf_lsm_task_getsecid_obj)
BTF_ID(func, bpf_lsm_task_prctl)
BTF_ID(func, bpf_lsm_task_setscheduler)
BTF_ID(func, bpf_lsm_task_to_inode)
+BTF_ID(func, bpf_lsm_userns_create)
BTF_SET_END(sleepable_lsm_hooks)
+BTF_SET_START(untrusted_lsm_hooks)
+BTF_ID(func, bpf_lsm_bpf_map_free_security)
+BTF_ID(func, bpf_lsm_bpf_prog_alloc_security)
+BTF_ID(func, bpf_lsm_bpf_prog_free_security)
+BTF_ID(func, bpf_lsm_file_alloc_security)
+BTF_ID(func, bpf_lsm_file_free_security)
+#ifdef CONFIG_SECURITY_NETWORK
+BTF_ID(func, bpf_lsm_sk_alloc_security)
+BTF_ID(func, bpf_lsm_sk_free_security)
+#endif /* CONFIG_SECURITY_NETWORK */
+BTF_ID(func, bpf_lsm_task_free)
+BTF_SET_END(untrusted_lsm_hooks)
+
bool bpf_lsm_is_sleepable_hook(u32 btf_id)
{
return btf_id_set_contains(&sleepable_lsm_hooks, btf_id);
}
+bool bpf_lsm_is_trusted(const struct bpf_prog *prog)
+{
+ return !btf_id_set_contains(&untrusted_lsm_hooks, prog->aux->attach_btf_id);
+}
+
const struct bpf_prog_ops lsm_prog_ops = {
};
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index 70f6fd4fa305..d3f0a4825fa6 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -10,11 +10,14 @@
#include <linux/seq_file.h>
#include <linux/refcount.h>
#include <linux/mutex.h>
+#include <linux/btf_ids.h>
+#include <linux/rcupdate_wait.h>
enum bpf_struct_ops_state {
BPF_STRUCT_OPS_STATE_INIT,
BPF_STRUCT_OPS_STATE_INUSE,
BPF_STRUCT_OPS_STATE_TOBEFREE,
+ BPF_STRUCT_OPS_STATE_READY,
};
#define BPF_STRUCT_OPS_COMMON_VALUE \
@@ -28,18 +31,19 @@ struct bpf_struct_ops_value {
struct bpf_struct_ops_map {
struct bpf_map map;
+ struct rcu_head rcu;
const struct bpf_struct_ops *st_ops;
/* protect map_update */
struct mutex lock;
- /* progs has all the bpf_prog that is populated
+ /* link has all the bpf_links that is populated
* to the func ptr of the kernel's struct
* (in kvalue.data).
*/
- struct bpf_prog **progs;
+ struct bpf_link **links;
/* image is a page that has all the trampolines
* that stores the func args before calling the bpf_prog.
* A PAGE_SIZE "image" is enough to store all trampoline for
- * "progs[]".
+ * "links[]".
*/
void *image;
/* uvalue->data stores the kernel struct
@@ -56,6 +60,13 @@ struct bpf_struct_ops_map {
struct bpf_struct_ops_value kvalue;
};
+struct bpf_struct_ops_link {
+ struct bpf_link link;
+ struct bpf_map __rcu *map;
+};
+
+static DEFINE_MUTEX(update_mutex);
+
#define VALUE_PREFIX "bpf_struct_ops_"
#define VALUE_PREFIX_LEN (sizeof(VALUE_PREFIX) - 1)
@@ -92,6 +103,9 @@ const struct bpf_verifier_ops bpf_struct_ops_verifier_ops = {
};
const struct bpf_prog_ops bpf_struct_ops_prog_ops = {
+#ifdef CONFIG_NET
+ .test_run = bpf_struct_ops_test_run,
+#endif
};
static const struct btf_type *module_type;
@@ -161,7 +175,7 @@ void bpf_struct_ops_init(struct btf *btf, struct bpf_verifier_log *log)
break;
}
- if (btf_member_bitfield_size(t, member)) {
+ if (__btf_member_bitfield_size(t, member)) {
pr_warn("bit field member %s in struct %s is not supported\n",
mname, st_ops->name);
break;
@@ -244,6 +258,7 @@ int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key,
struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
struct bpf_struct_ops_value *uvalue, *kvalue;
enum bpf_struct_ops_state state;
+ s64 refcnt;
if (unlikely(*(u32 *)key != 0))
return -ENOENT;
@@ -259,10 +274,17 @@ int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key,
/* No lock is needed. state and refcnt do not need
* to be updated together under atomic context.
*/
- uvalue = (struct bpf_struct_ops_value *)value;
+ uvalue = value;
memcpy(uvalue, st_map->uvalue, map->value_size);
uvalue->state = state;
- refcount_set(&uvalue->refcnt, refcount_read(&kvalue->refcnt));
+
+ /* This value offers the user space a general estimate of how
+ * many sockets are still utilizing this struct_ops for TCP
+ * congestion control. The number might not be exact, but it
+ * should sufficiently meet our present goals.
+ */
+ refcnt = atomic64_read(&map->refcnt) - atomic64_read(&map->usercnt);
+ refcount_set(&uvalue->refcnt, max_t(s64, refcnt, 0));
return 0;
}
@@ -278,9 +300,9 @@ static void bpf_struct_ops_map_put_progs(struct bpf_struct_ops_map *st_map)
u32 i;
for (i = 0; i < btf_type_vlen(t); i++) {
- if (st_map->progs[i]) {
- bpf_prog_put(st_map->progs[i]);
- st_map->progs[i] = NULL;
+ if (st_map->links[i]) {
+ bpf_link_put(st_map->links[i]);
+ st_map->links[i] = NULL;
}
}
}
@@ -292,7 +314,7 @@ static int check_zero_holes(const struct btf_type *t, void *data)
const struct btf_type *mtype;
for_each_member(i, t, member) {
- moff = btf_member_bit_offset(t, member) / 8;
+ moff = __btf_member_bit_offset(t, member) / 8;
if (moff > prev_mend &&
memchr_inv(data + prev_mend, 0, moff - prev_mend))
return -EINVAL;
@@ -311,18 +333,51 @@ static int check_zero_holes(const struct btf_type *t, void *data)
return 0;
}
-static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
- void *value, u64 flags)
+static void bpf_struct_ops_link_release(struct bpf_link *link)
+{
+}
+
+static void bpf_struct_ops_link_dealloc(struct bpf_link *link)
+{
+ struct bpf_tramp_link *tlink = container_of(link, struct bpf_tramp_link, link);
+
+ kfree(tlink);
+}
+
+const struct bpf_link_ops bpf_struct_ops_link_lops = {
+ .release = bpf_struct_ops_link_release,
+ .dealloc = bpf_struct_ops_link_dealloc,
+};
+
+int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks,
+ struct bpf_tramp_link *link,
+ const struct btf_func_model *model,
+ void *image, void *image_end)
+{
+ u32 flags;
+
+ tlinks[BPF_TRAMP_FENTRY].links[0] = link;
+ tlinks[BPF_TRAMP_FENTRY].nr_links = 1;
+ /* BPF_TRAMP_F_RET_FENTRY_RET is only used by bpf_struct_ops,
+ * and it must be used alone.
+ */
+ flags = model->ret_size > 0 ? BPF_TRAMP_F_RET_FENTRY_RET : 0;
+ return arch_prepare_bpf_trampoline(NULL, image, image_end,
+ model, flags, tlinks, NULL);
+}
+
+static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 flags)
{
struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
const struct bpf_struct_ops *st_ops = st_map->st_ops;
struct bpf_struct_ops_value *uvalue, *kvalue;
const struct btf_member *member;
const struct btf_type *t = st_ops->type;
- struct bpf_tramp_progs *tprogs = NULL;
+ struct bpf_tramp_links *tlinks = NULL;
void *udata, *kdata;
int prog_fd, err = 0;
- void *image;
+ void *image, *image_end;
u32 i;
if (flags)
@@ -335,7 +390,7 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
if (err)
return err;
- uvalue = (struct bpf_struct_ops_value *)value;
+ uvalue = value;
err = check_zero_holes(t, uvalue->data);
if (err)
return err;
@@ -343,8 +398,8 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
if (uvalue->state || refcount_read(&uvalue->refcnt))
return -EINVAL;
- tprogs = kcalloc(BPF_TRAMP_MAX, sizeof(*tprogs), GFP_KERNEL);
- if (!tprogs)
+ tlinks = kcalloc(BPF_TRAMP_MAX, sizeof(*tlinks), GFP_KERNEL);
+ if (!tlinks)
return -ENOMEM;
uvalue = (struct bpf_struct_ops_value *)st_map->uvalue;
@@ -362,13 +417,15 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
udata = &uvalue->data;
kdata = &kvalue->data;
image = st_map->image;
+ image_end = st_map->image + PAGE_SIZE;
for_each_member(i, t, member) {
const struct btf_type *mtype, *ptype;
struct bpf_prog *prog;
+ struct bpf_tramp_link *link;
u32 moff;
- moff = btf_member_bit_offset(t, member) / 8;
+ moff = __btf_member_bit_offset(t, member) / 8;
ptype = btf_type_resolve_ptr(btf_vmlinux, member->type, NULL);
if (ptype == module_type) {
if (*(void **)(udata + moff))
@@ -419,21 +476,28 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
err = PTR_ERR(prog);
goto reset_unlock;
}
- st_map->progs[i] = prog;
if (prog->type != BPF_PROG_TYPE_STRUCT_OPS ||
prog->aux->attach_btf_id != st_ops->type_id ||
prog->expected_attach_type != i) {
+ bpf_prog_put(prog);
err = -EINVAL;
goto reset_unlock;
}
- tprogs[BPF_TRAMP_FENTRY].progs[0] = prog;
- tprogs[BPF_TRAMP_FENTRY].nr_progs = 1;
- err = arch_prepare_bpf_trampoline(NULL, image,
- st_map->image + PAGE_SIZE,
- &st_ops->func_models[i], 0,
- tprogs, NULL);
+ link = kzalloc(sizeof(*link), GFP_USER);
+ if (!link) {
+ bpf_prog_put(prog);
+ err = -ENOMEM;
+ goto reset_unlock;
+ }
+ bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS,
+ &bpf_struct_ops_link_lops, prog);
+ st_map->links[i] = &link->link;
+
+ err = bpf_struct_ops_prepare_trampoline(tlinks, link,
+ &st_ops->func_models[i],
+ image, image_end);
if (err < 0)
goto reset_unlock;
@@ -444,13 +508,29 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
*(unsigned long *)(udata + moff) = prog->aux->id;
}
- refcount_set(&kvalue->refcnt, 1);
- bpf_map_inc(map);
+ if (st_map->map.map_flags & BPF_F_LINK) {
+ err = st_ops->validate(kdata);
+ if (err)
+ goto reset_unlock;
+ set_memory_rox((long)st_map->image, 1);
+ /* Let bpf_link handle registration & unregistration.
+ *
+ * Pair with smp_load_acquire() during lookup_elem().
+ */
+ smp_store_release(&kvalue->state, BPF_STRUCT_OPS_STATE_READY);
+ goto unlock;
+ }
- set_memory_ro((long)st_map->image, 1);
- set_memory_x((long)st_map->image, 1);
+ set_memory_rox((long)st_map->image, 1);
err = st_ops->reg(kdata);
if (likely(!err)) {
+ /* This refcnt increment on the map here after
+ * 'st_ops->reg()' is secure since the state of the
+ * map must be set to INIT at this moment, and thus
+ * bpf_struct_ops_map_delete_elem() can't unregister
+ * or transition it to TOBEFREE concurrently.
+ */
+ bpf_map_inc(map);
/* Pair with smp_load_acquire() during lookup_elem().
* It ensures the above udata updates (e.g. prog->aux->id)
* can be seen once BPF_STRUCT_OPS_STATE_INUSE is set.
@@ -459,40 +539,40 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
goto unlock;
}
- /* Error during st_ops->reg(). It is very unlikely since
- * the above init_member() should have caught it earlier
- * before reg(). The only possibility is if there was a race
- * in registering the struct_ops (under the same name) to
+ /* Error during st_ops->reg(). Can happen if this struct_ops needs to be
+ * verified as a whole, after all init_member() calls. Can also happen if
+ * there was a race in registering the struct_ops (under the same name) to
* a sub-system through different struct_ops's maps.
*/
set_memory_nx((long)st_map->image, 1);
set_memory_rw((long)st_map->image, 1);
- bpf_map_put(map);
reset_unlock:
bpf_struct_ops_map_put_progs(st_map);
memset(uvalue, 0, map->value_size);
memset(kvalue, 0, map->value_size);
unlock:
- kfree(tprogs);
+ kfree(tlinks);
mutex_unlock(&st_map->lock);
return err;
}
-static int bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key)
+static long bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key)
{
enum bpf_struct_ops_state prev_state;
struct bpf_struct_ops_map *st_map;
st_map = (struct bpf_struct_ops_map *)map;
+ if (st_map->map.map_flags & BPF_F_LINK)
+ return -EOPNOTSUPP;
+
prev_state = cmpxchg(&st_map->kvalue.state,
BPF_STRUCT_OPS_STATE_INUSE,
BPF_STRUCT_OPS_STATE_TOBEFREE);
switch (prev_state) {
case BPF_STRUCT_OPS_STATE_INUSE:
st_map->st_ops->unreg(&st_map->kvalue.data);
- if (refcount_dec_and_test(&st_map->kvalue.refcnt))
- bpf_map_put(map);
+ bpf_map_put(map);
return 0;
case BPF_STRUCT_OPS_STATE_TOBEFREE:
return -EINPROGRESS;
@@ -525,22 +605,44 @@ static void bpf_struct_ops_map_seq_show_elem(struct bpf_map *map, void *key,
kfree(value);
}
-static void bpf_struct_ops_map_free(struct bpf_map *map)
+static void __bpf_struct_ops_map_free(struct bpf_map *map)
{
struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
- if (st_map->progs)
+ if (st_map->links)
bpf_struct_ops_map_put_progs(st_map);
- bpf_map_area_free(st_map->progs);
+ bpf_map_area_free(st_map->links);
bpf_jit_free_exec(st_map->image);
bpf_map_area_free(st_map->uvalue);
bpf_map_area_free(st_map);
}
+static void bpf_struct_ops_map_free(struct bpf_map *map)
+{
+ /* The struct_ops's function may switch to another struct_ops.
+ *
+ * For example, bpf_tcp_cc_x->init() may switch to
+ * another tcp_cc_y by calling
+ * setsockopt(TCP_CONGESTION, "tcp_cc_y").
+ * During the switch, bpf_struct_ops_put(tcp_cc_x) is called
+ * and its refcount may reach 0 which then free its
+ * trampoline image while tcp_cc_x is still running.
+ *
+ * A vanilla rcu gp is to wait for all bpf-tcp-cc prog
+ * to finish. bpf-tcp-cc prog is non sleepable.
+ * A rcu_tasks gp is to wait for the last few insn
+ * in the tramopline image to finish before releasing
+ * the trampoline image.
+ */
+ synchronize_rcu_mult(call_rcu, call_rcu_tasks);
+
+ __bpf_struct_ops_map_free(map);
+}
+
static int bpf_struct_ops_map_alloc_check(union bpf_attr *attr)
{
if (attr->key_size != sizeof(unsigned int) || attr->max_entries != 1 ||
- attr->map_flags || !attr->btf_vmlinux_value_type_id)
+ (attr->map_flags & ~BPF_F_LINK) || !attr->btf_vmlinux_value_type_id)
return -EINVAL;
return 0;
}
@@ -564,6 +666,9 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
if (attr->value_size != vt->size)
return ERR_PTR(-EINVAL);
+ if (attr->map_flags & BPF_F_LINK && (!st_ops->validate || !st_ops->update))
+ return ERR_PTR(-EOPNOTSUPP);
+
t = st_ops->type;
st_map_size = sizeof(*st_map) +
@@ -580,12 +685,12 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
map = &st_map->map;
st_map->uvalue = bpf_map_area_alloc(vt->size, NUMA_NO_NODE);
- st_map->progs =
- bpf_map_area_alloc(btf_type_vlen(t) * sizeof(struct bpf_prog *),
+ st_map->links =
+ bpf_map_area_alloc(btf_type_vlen(t) * sizeof(struct bpf_links *),
NUMA_NO_NODE);
st_map->image = bpf_jit_alloc_exec(PAGE_SIZE);
- if (!st_map->uvalue || !st_map->progs || !st_map->image) {
- bpf_struct_ops_map_free(map);
+ if (!st_map->uvalue || !st_map->links || !st_map->image) {
+ __bpf_struct_ops_map_free(map);
return ERR_PTR(-ENOMEM);
}
@@ -596,7 +701,22 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
return map;
}
-static int bpf_struct_ops_map_btf_id;
+static u64 bpf_struct_ops_map_mem_usage(const struct bpf_map *map)
+{
+ struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
+ const struct bpf_struct_ops *st_ops = st_map->st_ops;
+ const struct btf_type *vt = st_ops->value_type;
+ u64 usage;
+
+ usage = sizeof(*st_map) +
+ vt->size - sizeof(struct bpf_struct_ops_value);
+ usage += vt->size;
+ usage += btf_type_vlen(vt) * sizeof(struct bpf_links *);
+ usage += PAGE_SIZE;
+ return usage;
+}
+
+BTF_ID_LIST_SINGLE(bpf_struct_ops_map_btf_ids, struct, bpf_struct_ops_map)
const struct bpf_map_ops bpf_struct_ops_map_ops = {
.map_alloc_check = bpf_struct_ops_map_alloc_check,
.map_alloc = bpf_struct_ops_map_alloc,
@@ -606,8 +726,8 @@ const struct bpf_map_ops bpf_struct_ops_map_ops = {
.map_delete_elem = bpf_struct_ops_map_delete_elem,
.map_update_elem = bpf_struct_ops_map_update_elem,
.map_seq_show_elem = bpf_struct_ops_map_seq_show_elem,
- .map_btf_name = "bpf_struct_ops_map",
- .map_btf_id = &bpf_struct_ops_map_btf_id,
+ .map_mem_usage = bpf_struct_ops_map_mem_usage,
+ .map_btf_id = &bpf_struct_ops_map_btf_ids[0],
};
/* "const void *" because some subsystem is
@@ -616,22 +736,175 @@ const struct bpf_map_ops bpf_struct_ops_map_ops = {
bool bpf_struct_ops_get(const void *kdata)
{
struct bpf_struct_ops_value *kvalue;
+ struct bpf_struct_ops_map *st_map;
+ struct bpf_map *map;
kvalue = container_of(kdata, struct bpf_struct_ops_value, data);
+ st_map = container_of(kvalue, struct bpf_struct_ops_map, kvalue);
- return refcount_inc_not_zero(&kvalue->refcnt);
+ map = __bpf_map_inc_not_zero(&st_map->map, false);
+ return !IS_ERR(map);
}
void bpf_struct_ops_put(const void *kdata)
{
struct bpf_struct_ops_value *kvalue;
+ struct bpf_struct_ops_map *st_map;
kvalue = container_of(kdata, struct bpf_struct_ops_value, data);
- if (refcount_dec_and_test(&kvalue->refcnt)) {
- struct bpf_struct_ops_map *st_map;
+ st_map = container_of(kvalue, struct bpf_struct_ops_map, kvalue);
+
+ bpf_map_put(&st_map->map);
+}
+
+static bool bpf_struct_ops_valid_to_reg(struct bpf_map *map)
+{
+ struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
+
+ return map->map_type == BPF_MAP_TYPE_STRUCT_OPS &&
+ map->map_flags & BPF_F_LINK &&
+ /* Pair with smp_store_release() during map_update */
+ smp_load_acquire(&st_map->kvalue.state) == BPF_STRUCT_OPS_STATE_READY;
+}
+
+static void bpf_struct_ops_map_link_dealloc(struct bpf_link *link)
+{
+ struct bpf_struct_ops_link *st_link;
+ struct bpf_struct_ops_map *st_map;
- st_map = container_of(kvalue, struct bpf_struct_ops_map,
- kvalue);
+ st_link = container_of(link, struct bpf_struct_ops_link, link);
+ st_map = (struct bpf_struct_ops_map *)
+ rcu_dereference_protected(st_link->map, true);
+ if (st_map) {
+ /* st_link->map can be NULL if
+ * bpf_struct_ops_link_create() fails to register.
+ */
+ st_map->st_ops->unreg(&st_map->kvalue.data);
bpf_map_put(&st_map->map);
}
+ kfree(st_link);
+}
+
+static void bpf_struct_ops_map_link_show_fdinfo(const struct bpf_link *link,
+ struct seq_file *seq)
+{
+ struct bpf_struct_ops_link *st_link;
+ struct bpf_map *map;
+
+ st_link = container_of(link, struct bpf_struct_ops_link, link);
+ rcu_read_lock();
+ map = rcu_dereference(st_link->map);
+ seq_printf(seq, "map_id:\t%d\n", map->id);
+ rcu_read_unlock();
+}
+
+static int bpf_struct_ops_map_link_fill_link_info(const struct bpf_link *link,
+ struct bpf_link_info *info)
+{
+ struct bpf_struct_ops_link *st_link;
+ struct bpf_map *map;
+
+ st_link = container_of(link, struct bpf_struct_ops_link, link);
+ rcu_read_lock();
+ map = rcu_dereference(st_link->map);
+ info->struct_ops.map_id = map->id;
+ rcu_read_unlock();
+ return 0;
}
+
+static int bpf_struct_ops_map_link_update(struct bpf_link *link, struct bpf_map *new_map,
+ struct bpf_map *expected_old_map)
+{
+ struct bpf_struct_ops_map *st_map, *old_st_map;
+ struct bpf_map *old_map;
+ struct bpf_struct_ops_link *st_link;
+ int err = 0;
+
+ st_link = container_of(link, struct bpf_struct_ops_link, link);
+ st_map = container_of(new_map, struct bpf_struct_ops_map, map);
+
+ if (!bpf_struct_ops_valid_to_reg(new_map))
+ return -EINVAL;
+
+ mutex_lock(&update_mutex);
+
+ old_map = rcu_dereference_protected(st_link->map, lockdep_is_held(&update_mutex));
+ if (expected_old_map && old_map != expected_old_map) {
+ err = -EPERM;
+ goto err_out;
+ }
+
+ old_st_map = container_of(old_map, struct bpf_struct_ops_map, map);
+ /* The new and old struct_ops must be the same type. */
+ if (st_map->st_ops != old_st_map->st_ops) {
+ err = -EINVAL;
+ goto err_out;
+ }
+
+ err = st_map->st_ops->update(st_map->kvalue.data, old_st_map->kvalue.data);
+ if (err)
+ goto err_out;
+
+ bpf_map_inc(new_map);
+ rcu_assign_pointer(st_link->map, new_map);
+ bpf_map_put(old_map);
+
+err_out:
+ mutex_unlock(&update_mutex);
+
+ return err;
+}
+
+static const struct bpf_link_ops bpf_struct_ops_map_lops = {
+ .dealloc = bpf_struct_ops_map_link_dealloc,
+ .show_fdinfo = bpf_struct_ops_map_link_show_fdinfo,
+ .fill_link_info = bpf_struct_ops_map_link_fill_link_info,
+ .update_map = bpf_struct_ops_map_link_update,
+};
+
+int bpf_struct_ops_link_create(union bpf_attr *attr)
+{
+ struct bpf_struct_ops_link *link = NULL;
+ struct bpf_link_primer link_primer;
+ struct bpf_struct_ops_map *st_map;
+ struct bpf_map *map;
+ int err;
+
+ map = bpf_map_get(attr->link_create.map_fd);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+
+ st_map = (struct bpf_struct_ops_map *)map;
+
+ if (!bpf_struct_ops_valid_to_reg(map)) {
+ err = -EINVAL;
+ goto err_out;
+ }
+
+ link = kzalloc(sizeof(*link), GFP_USER);
+ if (!link) {
+ err = -ENOMEM;
+ goto err_out;
+ }
+ bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_map_lops, NULL);
+
+ err = bpf_link_prime(&link->link, &link_primer);
+ if (err)
+ goto err_out;
+
+ err = st_map->st_ops->reg(st_map->kvalue.data);
+ if (err) {
+ bpf_link_cleanup(&link_primer);
+ link = NULL;
+ goto err_out;
+ }
+ RCU_INIT_POINTER(link->map, map);
+
+ return bpf_link_settle(&link_primer);
+
+err_out:
+ bpf_map_put(map);
+ kfree(link);
+ return err;
+}
+
diff --git a/kernel/bpf/bpf_struct_ops_types.h b/kernel/bpf/bpf_struct_ops_types.h
index 066d83ea1c99..5678a9ddf817 100644
--- a/kernel/bpf/bpf_struct_ops_types.h
+++ b/kernel/bpf/bpf_struct_ops_types.h
@@ -2,6 +2,9 @@
/* internal file - do not include directly */
#ifdef CONFIG_BPF_JIT
+#ifdef CONFIG_NET
+BPF_STRUCT_OPS_TYPE(bpf_dummy_ops)
+#endif
#ifdef CONFIG_INET
#include <net/tcp.h>
BPF_STRUCT_OPS_TYPE(tcp_congestion_ops)
diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c
index 3ce75758d394..adf6dfe0ba68 100644
--- a/kernel/bpf/bpf_task_storage.c
+++ b/kernel/bpf/bpf_task_storage.c
@@ -17,6 +17,7 @@
#include <uapi/linux/btf.h>
#include <linux/btf_ids.h>
#include <linux/fdtable.h>
+#include <linux/rcupdate_trace.h>
DEFINE_BPF_STORAGE_CACHE(task_cache);
@@ -25,20 +26,20 @@ static DEFINE_PER_CPU(int, bpf_task_storage_busy);
static void bpf_task_storage_lock(void)
{
migrate_disable();
- __this_cpu_inc(bpf_task_storage_busy);
+ this_cpu_inc(bpf_task_storage_busy);
}
static void bpf_task_storage_unlock(void)
{
- __this_cpu_dec(bpf_task_storage_busy);
+ this_cpu_dec(bpf_task_storage_busy);
migrate_enable();
}
static bool bpf_task_storage_trylock(void)
{
migrate_disable();
- if (unlikely(__this_cpu_inc_return(bpf_task_storage_busy) != 1)) {
- __this_cpu_dec(bpf_task_storage_busy);
+ if (unlikely(this_cpu_inc_return(bpf_task_storage_busy) != 1)) {
+ this_cpu_dec(bpf_task_storage_busy);
migrate_enable();
return false;
}
@@ -59,7 +60,8 @@ task_storage_lookup(struct task_struct *task, struct bpf_map *map,
struct bpf_local_storage *task_storage;
struct bpf_local_storage_map *smap;
- task_storage = rcu_dereference(task->bpf_storage);
+ task_storage =
+ rcu_dereference_check(task->bpf_storage, bpf_rcu_lock_held());
if (!task_storage)
return NULL;
@@ -69,11 +71,7 @@ task_storage_lookup(struct task_struct *task, struct bpf_map *map,
void bpf_task_storage_free(struct task_struct *task)
{
- struct bpf_local_storage_elem *selem;
struct bpf_local_storage *local_storage;
- bool free_task_storage = false;
- struct hlist_node *n;
- unsigned long flags;
rcu_read_lock();
@@ -83,34 +81,10 @@ void bpf_task_storage_free(struct task_struct *task)
return;
}
- /* Neither the bpf_prog nor the bpf-map's syscall
- * could be modifying the local_storage->list now.
- * Thus, no elem can be added-to or deleted-from the
- * local_storage->list by the bpf_prog or by the bpf-map's syscall.
- *
- * It is racing with bpf_local_storage_map_free() alone
- * when unlinking elem from the local_storage->list and
- * the map's bucket->list.
- */
bpf_task_storage_lock();
- raw_spin_lock_irqsave(&local_storage->lock, flags);
- hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) {
- /* Always unlink from map before unlinking from
- * local_storage.
- */
- bpf_selem_unlink_map(selem);
- free_task_storage = bpf_selem_unlink_storage_nolock(
- local_storage, selem, false);
- }
- raw_spin_unlock_irqrestore(&local_storage->lock, flags);
+ bpf_local_storage_destroy(local_storage);
bpf_task_storage_unlock();
rcu_read_unlock();
-
- /* free_task_storage should always be true as long as
- * local_storage->list was non-empty.
- */
- if (free_task_storage)
- kfree_rcu(local_storage, rcu);
}
static void *bpf_pid_task_storage_lookup_elem(struct bpf_map *map, void *key)
@@ -146,8 +120,8 @@ out:
return ERR_PTR(err);
}
-static int bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key,
- void *value, u64 map_flags)
+static long bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 map_flags)
{
struct bpf_local_storage_data *sdata;
struct task_struct *task;
@@ -172,7 +146,8 @@ static int bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key,
bpf_task_storage_lock();
sdata = bpf_local_storage_update(
- task, (struct bpf_local_storage_map *)map, value, map_flags);
+ task, (struct bpf_local_storage_map *)map, value, map_flags,
+ GFP_ATOMIC);
bpf_task_storage_unlock();
err = PTR_ERR_OR_ZERO(sdata);
@@ -181,7 +156,8 @@ out:
return err;
}
-static int task_storage_delete(struct task_struct *task, struct bpf_map *map)
+static int task_storage_delete(struct task_struct *task, struct bpf_map *map,
+ bool nobusy)
{
struct bpf_local_storage_data *sdata;
@@ -189,12 +165,15 @@ static int task_storage_delete(struct task_struct *task, struct bpf_map *map)
if (!sdata)
return -ENOENT;
- bpf_selem_unlink(SELEM(sdata));
+ if (!nobusy)
+ return -EBUSY;
+
+ bpf_selem_unlink(SELEM(sdata), false);
return 0;
}
-static int bpf_pid_task_storage_delete_elem(struct bpf_map *map, void *key)
+static long bpf_pid_task_storage_delete_elem(struct bpf_map *map, void *key)
{
struct task_struct *task;
unsigned int f_flags;
@@ -217,60 +196,108 @@ static int bpf_pid_task_storage_delete_elem(struct bpf_map *map, void *key)
}
bpf_task_storage_lock();
- err = task_storage_delete(task, map);
+ err = task_storage_delete(task, map, true);
bpf_task_storage_unlock();
out:
put_pid(pid);
return err;
}
-BPF_CALL_4(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *,
- task, void *, value, u64, flags)
+/* Called by bpf_task_storage_get*() helpers */
+static void *__bpf_task_storage_get(struct bpf_map *map,
+ struct task_struct *task, void *value,
+ u64 flags, gfp_t gfp_flags, bool nobusy)
{
struct bpf_local_storage_data *sdata;
- if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE))
- return (unsigned long)NULL;
-
- if (!task)
- return (unsigned long)NULL;
-
- if (!bpf_task_storage_trylock())
- return (unsigned long)NULL;
-
- sdata = task_storage_lookup(task, map, true);
+ sdata = task_storage_lookup(task, map, nobusy);
if (sdata)
- goto unlock;
+ return sdata->data;
/* only allocate new storage, when the task is refcounted */
if (refcount_read(&task->usage) &&
- (flags & BPF_LOCAL_STORAGE_GET_F_CREATE))
+ (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) && nobusy) {
sdata = bpf_local_storage_update(
task, (struct bpf_local_storage_map *)map, value,
- BPF_NOEXIST);
+ BPF_NOEXIST, gfp_flags);
+ return IS_ERR(sdata) ? NULL : sdata->data;
+ }
-unlock:
+ return NULL;
+}
+
+/* *gfp_flags* is a hidden argument provided by the verifier */
+BPF_CALL_5(bpf_task_storage_get_recur, struct bpf_map *, map, struct task_struct *,
+ task, void *, value, u64, flags, gfp_t, gfp_flags)
+{
+ bool nobusy;
+ void *data;
+
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
+ if (flags & ~BPF_LOCAL_STORAGE_GET_F_CREATE || !task)
+ return (unsigned long)NULL;
+
+ nobusy = bpf_task_storage_trylock();
+ data = __bpf_task_storage_get(map, task, value, flags,
+ gfp_flags, nobusy);
+ if (nobusy)
+ bpf_task_storage_unlock();
+ return (unsigned long)data;
+}
+
+/* *gfp_flags* is a hidden argument provided by the verifier */
+BPF_CALL_5(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *,
+ task, void *, value, u64, flags, gfp_t, gfp_flags)
+{
+ void *data;
+
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
+ if (flags & ~BPF_LOCAL_STORAGE_GET_F_CREATE || !task)
+ return (unsigned long)NULL;
+
+ bpf_task_storage_lock();
+ data = __bpf_task_storage_get(map, task, value, flags,
+ gfp_flags, true);
bpf_task_storage_unlock();
- return IS_ERR_OR_NULL(sdata) ? (unsigned long)NULL :
- (unsigned long)sdata->data;
+ return (unsigned long)data;
}
-BPF_CALL_2(bpf_task_storage_delete, struct bpf_map *, map, struct task_struct *,
+BPF_CALL_2(bpf_task_storage_delete_recur, struct bpf_map *, map, struct task_struct *,
task)
{
+ bool nobusy;
int ret;
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
if (!task)
return -EINVAL;
- if (!bpf_task_storage_trylock())
- return -EBUSY;
+ nobusy = bpf_task_storage_trylock();
+ /* This helper must only be called from places where the lifetime of the task
+ * is guaranteed. Either by being refcounted or by being protected
+ * by an RCU read-side critical section.
+ */
+ ret = task_storage_delete(task, map, nobusy);
+ if (nobusy)
+ bpf_task_storage_unlock();
+ return ret;
+}
+
+BPF_CALL_2(bpf_task_storage_delete, struct bpf_map *, map, struct task_struct *,
+ task)
+{
+ int ret;
+
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
+ if (!task)
+ return -EINVAL;
+ bpf_task_storage_lock();
/* This helper must only be called from places where the lifetime of the task
* is guaranteed. Either by being refcounted or by being protected
* by an RCU read-side critical section.
*/
- ret = task_storage_delete(task, map);
+ ret = task_storage_delete(task, map, true);
bpf_task_storage_unlock();
return ret;
}
@@ -282,26 +309,15 @@ static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key)
static struct bpf_map *task_storage_map_alloc(union bpf_attr *attr)
{
- struct bpf_local_storage_map *smap;
-
- smap = bpf_local_storage_map_alloc(attr);
- if (IS_ERR(smap))
- return ERR_CAST(smap);
-
- smap->cache_idx = bpf_local_storage_cache_idx_get(&task_cache);
- return &smap->map;
+ return bpf_local_storage_map_alloc(attr, &task_cache, true);
}
static void task_storage_map_free(struct bpf_map *map)
{
- struct bpf_local_storage_map *smap;
-
- smap = (struct bpf_local_storage_map *)map;
- bpf_local_storage_cache_idx_free(&task_cache, smap->cache_idx);
- bpf_local_storage_map_free(smap, &bpf_task_storage_busy);
+ bpf_local_storage_map_free(map, &task_cache, &bpf_task_storage_busy);
}
-static int task_storage_map_btf_id;
+BTF_ID_LIST_GLOBAL_SINGLE(bpf_local_storage_map_btf_id, struct, bpf_local_storage_map)
const struct bpf_map_ops task_storage_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
.map_alloc_check = bpf_local_storage_map_alloc_check,
@@ -312,29 +328,47 @@ const struct bpf_map_ops task_storage_map_ops = {
.map_update_elem = bpf_pid_task_storage_update_elem,
.map_delete_elem = bpf_pid_task_storage_delete_elem,
.map_check_btf = bpf_local_storage_map_check_btf,
- .map_btf_name = "bpf_local_storage_map",
- .map_btf_id = &task_storage_map_btf_id,
+ .map_mem_usage = bpf_local_storage_map_mem_usage,
+ .map_btf_id = &bpf_local_storage_map_btf_id[0],
.map_owner_storage_ptr = task_storage_ptr,
};
-BTF_ID_LIST_SINGLE(bpf_task_storage_btf_ids, struct, task_struct)
+const struct bpf_func_proto bpf_task_storage_get_recur_proto = {
+ .func = bpf_task_storage_get_recur,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL,
+ .arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
+ .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL,
+ .arg4_type = ARG_ANYTHING,
+};
const struct bpf_func_proto bpf_task_storage_get_proto = {
.func = bpf_task_storage_get,
.gpl_only = false,
.ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
.arg1_type = ARG_CONST_MAP_PTR,
- .arg2_type = ARG_PTR_TO_BTF_ID,
- .arg2_btf_id = &bpf_task_storage_btf_ids[0],
+ .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL,
+ .arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
.arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL,
.arg4_type = ARG_ANYTHING,
};
+const struct bpf_func_proto bpf_task_storage_delete_recur_proto = {
+ .func = bpf_task_storage_delete_recur,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL,
+ .arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
+};
+
const struct bpf_func_proto bpf_task_storage_delete_proto = {
.func = bpf_task_storage_delete,
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_CONST_MAP_PTR,
- .arg2_type = ARG_PTR_TO_BTF_ID,
- .arg2_btf_id = &bpf_task_storage_btf_ids[0],
+ .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL,
+ .arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
};
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index cb4b72997d9b..6b682b8e4b50 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -1,4 +1,4 @@
-/* SPDX-License-Identifier: GPL-2.0 */
+// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2018 Facebook */
#include <uapi/linux/btf.h>
@@ -19,12 +19,17 @@
#include <linux/bpf_verifier.h>
#include <linux/btf.h>
#include <linux/btf_ids.h>
+#include <linux/bpf_lsm.h>
#include <linux/skmsg.h>
#include <linux/perf_event.h>
#include <linux/bsearch.h>
#include <linux/kobject.h>
#include <linux/sysfs.h>
+
+#include <net/netfilter/nf_bpf_link.h>
+
#include <net/sock.h>
+#include "../tools/lib/bpf/relo_core.h"
/* BTF (BPF Type Format) is the meta data format which describes
* the data types of BPF program/map. Hence, it basically focus
@@ -197,6 +202,37 @@
DEFINE_IDR(btf_idr);
DEFINE_SPINLOCK(btf_idr_lock);
+enum btf_kfunc_hook {
+ BTF_KFUNC_HOOK_COMMON,
+ BTF_KFUNC_HOOK_XDP,
+ BTF_KFUNC_HOOK_TC,
+ BTF_KFUNC_HOOK_STRUCT_OPS,
+ BTF_KFUNC_HOOK_TRACING,
+ BTF_KFUNC_HOOK_SYSCALL,
+ BTF_KFUNC_HOOK_FMODRET,
+ BTF_KFUNC_HOOK_CGROUP_SKB,
+ BTF_KFUNC_HOOK_SCHED_ACT,
+ BTF_KFUNC_HOOK_SK_SKB,
+ BTF_KFUNC_HOOK_SOCKET_FILTER,
+ BTF_KFUNC_HOOK_LWT,
+ BTF_KFUNC_HOOK_NETFILTER,
+ BTF_KFUNC_HOOK_MAX,
+};
+
+enum {
+ BTF_KFUNC_SET_MAX_CNT = 256,
+ BTF_DTOR_KFUNC_MAX_CNT = 256,
+};
+
+struct btf_kfunc_set_tab {
+ struct btf_id_set8 *sets[BTF_KFUNC_HOOK_MAX];
+};
+
+struct btf_id_dtor_kfunc_tab {
+ u32 cnt;
+ struct btf_id_dtor_kfunc dtors[];
+};
+
struct btf {
void *data;
struct btf_type **types;
@@ -211,6 +247,9 @@ struct btf {
refcount_t refcnt;
u32 id;
struct rcu_head rcu;
+ struct btf_kfunc_set_tab *kfunc_set_tab;
+ struct btf_id_dtor_kfunc_tab *dtor_kfunc_tab;
+ struct btf_struct_metas *struct_meta_tab;
/* split BTF support */
struct btf *base_btf;
@@ -281,6 +320,9 @@ static const char * const btf_kind_str[NR_BTF_KINDS] = {
[BTF_KIND_VAR] = "VAR",
[BTF_KIND_DATASEC] = "DATASEC",
[BTF_KIND_FLOAT] = "FLOAT",
+ [BTF_KIND_DECL_TAG] = "DECL_TAG",
+ [BTF_KIND_TYPE_TAG] = "TYPE_TAG",
+ [BTF_KIND_ENUM64] = "ENUM64",
};
const char *btf_type_str(const struct btf_type *t)
@@ -304,6 +346,12 @@ const char *btf_type_str(const struct btf_type *t)
#define BTF_SHOW_NAME_SIZE 80
/*
+ * The suffix of a type that indicates it cannot alias another type when
+ * comparing BTF IDs for kfunc invocations.
+ */
+#define NOCAST_ALIAS_SUFFIX "___init"
+
+/*
* Common data to all BTF show operations. Private show functions can add
* their own data to a structure containing a struct btf_show and consult it
* in the show callback. See btf_type_show() below.
@@ -400,6 +448,9 @@ static struct btf_type btf_void;
static int btf_resolve(struct btf_verifier_env *env,
const struct btf_type *t, u32 type_id);
+static int btf_func_check(struct btf_verifier_env *env,
+ const struct btf_type *t);
+
static bool btf_type_is_modifier(const struct btf_type *t)
{
/* Some of them is not strictly a C modifier
@@ -417,6 +468,7 @@ static bool btf_type_is_modifier(const struct btf_type *t)
case BTF_KIND_VOLATILE:
case BTF_KIND_CONST:
case BTF_KIND_RESTRICT:
+ case BTF_KIND_TYPE_TAG:
return true;
}
@@ -444,19 +496,20 @@ static bool btf_type_nosize_or_null(const struct btf_type *t)
return !t || btf_type_nosize(t);
}
-static bool __btf_type_is_struct(const struct btf_type *t)
+static bool btf_type_is_datasec(const struct btf_type *t)
{
- return BTF_INFO_KIND(t->info) == BTF_KIND_STRUCT;
+ return BTF_INFO_KIND(t->info) == BTF_KIND_DATASEC;
}
-static bool btf_type_is_array(const struct btf_type *t)
+static bool btf_type_is_decl_tag(const struct btf_type *t)
{
- return BTF_INFO_KIND(t->info) == BTF_KIND_ARRAY;
+ return BTF_INFO_KIND(t->info) == BTF_KIND_DECL_TAG;
}
-static bool btf_type_is_datasec(const struct btf_type *t)
+static bool btf_type_is_decl_tag_target(const struct btf_type *t)
{
- return BTF_INFO_KIND(t->info) == BTF_KIND_DATASEC;
+ return btf_type_is_func(t) || btf_type_is_struct(t) ||
+ btf_type_is_var(t) || btf_type_is_typedef(t);
}
u32 btf_nr_types(const struct btf *btf)
@@ -491,6 +544,50 @@ s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind)
return -ENOENT;
}
+static s32 bpf_find_btf_id(const char *name, u32 kind, struct btf **btf_p)
+{
+ struct btf *btf;
+ s32 ret;
+ int id;
+
+ btf = bpf_get_btf_vmlinux();
+ if (IS_ERR(btf))
+ return PTR_ERR(btf);
+ if (!btf)
+ return -EINVAL;
+
+ ret = btf_find_by_name_kind(btf, name, kind);
+ /* ret is never zero, since btf_find_by_name_kind returns
+ * positive btf_id or negative error.
+ */
+ if (ret > 0) {
+ btf_get(btf);
+ *btf_p = btf;
+ return ret;
+ }
+
+ /* If name is not found in vmlinux's BTF then search in module's BTFs */
+ spin_lock_bh(&btf_idr_lock);
+ idr_for_each_entry(&btf_idr, btf, id) {
+ if (!btf_is_module(btf))
+ continue;
+ /* linear search could be slow hence unlock/lock
+ * the IDR to avoiding holding it for too long
+ */
+ btf_get(btf);
+ spin_unlock_bh(&btf_idr_lock);
+ ret = btf_find_by_name_kind(btf, name, kind);
+ if (ret > 0) {
+ *btf_p = btf;
+ return ret;
+ }
+ btf_put(btf);
+ spin_lock_bh(&btf_idr_lock);
+ }
+ spin_unlock_bh(&btf_idr_lock);
+ return ret;
+}
+
const struct btf_type *btf_type_skip_modifiers(const struct btf *btf,
u32 id, u32 *res_id)
{
@@ -537,6 +634,7 @@ const struct btf_type *btf_type_resolve_func_ptr(const struct btf *btf,
static bool btf_type_is_resolve_source_only(const struct btf_type *t)
{
return btf_type_is_var(t) ||
+ btf_type_is_decl_tag(t) ||
btf_type_is_datasec(t);
}
@@ -563,6 +661,8 @@ static bool btf_type_needs_resolve(const struct btf_type *t)
btf_type_is_struct(t) ||
btf_type_is_array(t) ||
btf_type_is_var(t) ||
+ btf_type_is_func(t) ||
+ btf_type_is_decl_tag(t) ||
btf_type_is_datasec(t);
}
@@ -576,6 +676,7 @@ static bool btf_type_has_size(const struct btf_type *t)
case BTF_KIND_ENUM:
case BTF_KIND_DATASEC:
case BTF_KIND_FLOAT:
+ case BTF_KIND_ENUM64:
return true;
}
@@ -616,6 +717,16 @@ static const struct btf_var *btf_type_var(const struct btf_type *t)
return (const struct btf_var *)(t + 1);
}
+static const struct btf_decl_tag *btf_type_decl_tag(const struct btf_type *t)
+{
+ return (const struct btf_decl_tag *)(t + 1);
+}
+
+static const struct btf_enum64 *btf_type_enum64(const struct btf_type *t)
+{
+ return (const struct btf_enum64 *)(t + 1);
+}
+
static const struct btf_kind_operations *btf_type_ops(const struct btf_type *t)
{
return kind_ops[BTF_INFO_KIND(t->info)];
@@ -716,6 +827,7 @@ const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id)
return NULL;
return btf->types[type_id];
}
+EXPORT_SYMBOL_GPL(btf_type_by_id);
/*
* Regular int is not a bit field and it must be either
@@ -815,7 +927,7 @@ static const char *btf_show_name(struct btf_show *show)
const char *ptr_suffix = &ptr_suffixes[strlen(ptr_suffixes)];
const char *name = NULL, *prefix = "", *parens = "";
const struct btf_member *m = show->state.member;
- const struct btf_type *t = show->state.type;
+ const struct btf_type *t;
const struct btf_array *array;
u32 id = show->state.type_id;
const char *member = NULL;
@@ -924,6 +1036,7 @@ static const char *btf_show_name(struct btf_show *show)
parens = "{";
break;
case BTF_KIND_ENUM:
+ case BTF_KIND_ENUM64:
prefix = "enum";
break;
default:
@@ -1013,7 +1126,8 @@ __printf(2, 3) static void btf_show(struct btf_show *show, const char *fmt, ...)
*/
#define btf_show_type_value(show, fmt, value) \
do { \
- if ((value) != 0 || (show->flags & BTF_SHOW_ZERO) || \
+ if ((value) != (__typeof__(value))0 || \
+ (show->flags & BTF_SHOW_ZERO) || \
show->state.depth == 0) { \
btf_show(show, "%s%s" fmt "%s%s", \
btf_show_indent(show), \
@@ -1292,23 +1406,28 @@ __printf(4, 5) static void __btf_verifier_log_type(struct btf_verifier_env *env,
const char *fmt, ...)
{
struct bpf_verifier_log *log = &env->log;
- u8 kind = BTF_INFO_KIND(t->info);
struct btf *btf = env->btf;
va_list args;
if (!bpf_verifier_log_needed(log))
return;
- /* btf verifier prints all types it is processing via
- * btf_verifier_log_type(..., fmt = NULL).
- * Skip those prints for in-kernel BTF verification.
- */
- if (log->level == BPF_LOG_KERNEL && !fmt)
- return;
+ if (log->level == BPF_LOG_KERNEL) {
+ /* btf verifier prints all types it is processing via
+ * btf_verifier_log_type(..., fmt = NULL).
+ * Skip those prints for in-kernel BTF verification.
+ */
+ if (!fmt)
+ return;
+
+ /* Skip logging when loading module BTF with mismatches permitted */
+ if (env->btf->base_btf && IS_ENABLED(CONFIG_MODULE_ALLOW_BTF_MISMATCH))
+ return;
+ }
__btf_verifier_log(log, "[%u] %s %s%s",
env->log_type_id,
- btf_kind_str[kind],
+ btf_type_str(t),
__btf_name_by_offset(btf, t->name_off),
log_details ? " " : "");
@@ -1343,8 +1462,15 @@ static void btf_verifier_log_member(struct btf_verifier_env *env,
if (!bpf_verifier_log_needed(log))
return;
- if (log->level == BPF_LOG_KERNEL && !fmt)
- return;
+ if (log->level == BPF_LOG_KERNEL) {
+ if (!fmt)
+ return;
+
+ /* Skip logging when loading module BTF with mismatches permitted */
+ if (env->btf->base_btf && IS_ENABLED(CONFIG_MODULE_ALLOW_BTF_MISMATCH))
+ return;
+ }
+
/* The CHECK_META phase already did a btf dump.
*
* If member is logged again, it must hit an error in
@@ -1509,8 +1635,59 @@ static void btf_free_id(struct btf *btf)
spin_unlock_irqrestore(&btf_idr_lock, flags);
}
+static void btf_free_kfunc_set_tab(struct btf *btf)
+{
+ struct btf_kfunc_set_tab *tab = btf->kfunc_set_tab;
+ int hook;
+
+ if (!tab)
+ return;
+ /* For module BTF, we directly assign the sets being registered, so
+ * there is nothing to free except kfunc_set_tab.
+ */
+ if (btf_is_module(btf))
+ goto free_tab;
+ for (hook = 0; hook < ARRAY_SIZE(tab->sets); hook++)
+ kfree(tab->sets[hook]);
+free_tab:
+ kfree(tab);
+ btf->kfunc_set_tab = NULL;
+}
+
+static void btf_free_dtor_kfunc_tab(struct btf *btf)
+{
+ struct btf_id_dtor_kfunc_tab *tab = btf->dtor_kfunc_tab;
+
+ if (!tab)
+ return;
+ kfree(tab);
+ btf->dtor_kfunc_tab = NULL;
+}
+
+static void btf_struct_metas_free(struct btf_struct_metas *tab)
+{
+ int i;
+
+ if (!tab)
+ return;
+ for (i = 0; i < tab->cnt; i++)
+ btf_record_free(tab->types[i].record);
+ kfree(tab);
+}
+
+static void btf_free_struct_meta_tab(struct btf *btf)
+{
+ struct btf_struct_metas *tab = btf->struct_meta_tab;
+
+ btf_struct_metas_free(tab);
+ btf->struct_meta_tab = NULL;
+}
+
static void btf_free(struct btf *btf)
{
+ btf_free_struct_meta_tab(btf);
+ btf_free_dtor_kfunc_tab(btf);
+ btf_free_kfunc_set_tab(btf);
kvfree(btf->types);
kvfree(btf->resolved_sizes);
kvfree(btf->resolved_ids);
@@ -1706,6 +1883,7 @@ __btf_resolve_size(const struct btf *btf, const struct btf_type *type,
case BTF_KIND_UNION:
case BTF_KIND_ENUM:
case BTF_KIND_FLOAT:
+ case BTF_KIND_ENUM64:
size = type->size;
goto resolved;
@@ -1718,6 +1896,7 @@ __btf_resolve_size(const struct btf *btf, const struct btf_type *type,
case BTF_KIND_VOLATILE:
case BTF_KIND_CONST:
case BTF_KIND_RESTRICT:
+ case BTF_KIND_TYPE_TAG:
id = type->type;
type = btf_type_by_id(btf, type->type);
break;
@@ -2326,6 +2505,8 @@ static int btf_ref_type_check_meta(struct btf_verifier_env *env,
const struct btf_type *t,
u32 meta_left)
{
+ const char *value;
+
if (btf_type_vlen(t)) {
btf_verifier_log_type(env, t, "vlen != 0");
return -EINVAL;
@@ -2341,7 +2522,7 @@ static int btf_ref_type_check_meta(struct btf_verifier_env *env,
return -EINVAL;
}
- /* typedef type must have a valid name, and other ref types,
+ /* typedef/type_tag type must have a valid name, and other ref types,
* volatile, const, restrict, should have a null name.
*/
if (BTF_INFO_KIND(t->info) == BTF_KIND_TYPEDEF) {
@@ -2350,6 +2531,12 @@ static int btf_ref_type_check_meta(struct btf_verifier_env *env,
btf_verifier_log_type(env, t, "Invalid name");
return -EINVAL;
}
+ } else if (BTF_INFO_KIND(t->info) == BTF_KIND_TYPE_TAG) {
+ value = btf_name_by_offset(env->btf, t->name_off);
+ if (!value || !value[0]) {
+ btf_verifier_log_type(env, t, "Invalid name");
+ return -EINVAL;
+ }
} else {
if (t->name_off) {
btf_verifier_log_type(env, t, "Invalid name");
@@ -2474,7 +2661,7 @@ static int btf_ptr_resolve(struct btf_verifier_env *env,
*
* We now need to continue from the last-resolved-ptr to
* ensure the last-resolved-ptr will not referring back to
- * the currenct ptr (t).
+ * the current ptr (t).
*/
if (btf_type_is_modifier(next_type)) {
const struct btf_type *resolved_type;
@@ -2939,7 +3126,7 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env,
return -EINVAL;
}
- offset = btf_member_bit_offset(t, member);
+ offset = __btf_member_bit_offset(t, member);
if (is_union && offset) {
btf_verifier_log_member(env, t, member,
"Invalid member bits_offset");
@@ -2983,7 +3170,7 @@ static int btf_struct_resolve(struct btf_verifier_env *env,
if (v->next_member) {
const struct btf_type *last_member_type;
const struct btf_member *last_member;
- u16 last_member_type_id;
+ u32 last_member_type_id;
last_member = btf_type_member(v->t) + v->next_member - 1;
last_member_type_id = last_member->type;
@@ -3046,41 +3233,671 @@ static void btf_struct_log(struct btf_verifier_env *env,
btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t));
}
-/* find 'struct bpf_spin_lock' in map value.
- * return >= 0 offset if found
- * and < 0 in case of error
- */
-int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t)
+enum {
+ BTF_FIELD_IGNORE = 0,
+ BTF_FIELD_FOUND = 1,
+};
+
+struct btf_field_info {
+ enum btf_field_type type;
+ u32 off;
+ union {
+ struct {
+ u32 type_id;
+ } kptr;
+ struct {
+ const char *node_name;
+ u32 value_btf_id;
+ } graph_root;
+ };
+};
+
+static int btf_find_struct(const struct btf *btf, const struct btf_type *t,
+ u32 off, int sz, enum btf_field_type field_type,
+ struct btf_field_info *info)
{
- const struct btf_member *member;
- u32 i, off = -ENOENT;
+ if (!__btf_type_is_struct(t))
+ return BTF_FIELD_IGNORE;
+ if (t->size != sz)
+ return BTF_FIELD_IGNORE;
+ info->type = field_type;
+ info->off = off;
+ return BTF_FIELD_FOUND;
+}
+
+static int btf_find_kptr(const struct btf *btf, const struct btf_type *t,
+ u32 off, int sz, struct btf_field_info *info)
+{
+ enum btf_field_type type;
+ u32 res_id;
+
+ /* Permit modifiers on the pointer itself */
+ if (btf_type_is_volatile(t))
+ t = btf_type_by_id(btf, t->type);
+ /* For PTR, sz is always == 8 */
+ if (!btf_type_is_ptr(t))
+ return BTF_FIELD_IGNORE;
+ t = btf_type_by_id(btf, t->type);
+
+ if (!btf_type_is_type_tag(t))
+ return BTF_FIELD_IGNORE;
+ /* Reject extra tags */
+ if (btf_type_is_type_tag(btf_type_by_id(btf, t->type)))
+ return -EINVAL;
+ if (!strcmp("kptr_untrusted", __btf_name_by_offset(btf, t->name_off)))
+ type = BPF_KPTR_UNREF;
+ else if (!strcmp("kptr", __btf_name_by_offset(btf, t->name_off)))
+ type = BPF_KPTR_REF;
+ else
+ return -EINVAL;
+
+ /* Get the base type */
+ t = btf_type_skip_modifiers(btf, t->type, &res_id);
+ /* Only pointer to struct is allowed */
+ if (!__btf_type_is_struct(t))
+ return -EINVAL;
+
+ info->type = type;
+ info->off = off;
+ info->kptr.type_id = res_id;
+ return BTF_FIELD_FOUND;
+}
+
+static const char *btf_find_decl_tag_value(const struct btf *btf,
+ const struct btf_type *pt,
+ int comp_idx, const char *tag_key)
+{
+ int i;
+
+ for (i = 1; i < btf_nr_types(btf); i++) {
+ const struct btf_type *t = btf_type_by_id(btf, i);
+ int len = strlen(tag_key);
+
+ if (!btf_type_is_decl_tag(t))
+ continue;
+ if (pt != btf_type_by_id(btf, t->type) ||
+ btf_type_decl_tag(t)->component_idx != comp_idx)
+ continue;
+ if (strncmp(__btf_name_by_offset(btf, t->name_off), tag_key, len))
+ continue;
+ return __btf_name_by_offset(btf, t->name_off) + len;
+ }
+ return NULL;
+}
+
+static int
+btf_find_graph_root(const struct btf *btf, const struct btf_type *pt,
+ const struct btf_type *t, int comp_idx, u32 off,
+ int sz, struct btf_field_info *info,
+ enum btf_field_type head_type)
+{
+ const char *node_field_name;
+ const char *value_type;
+ s32 id;
if (!__btf_type_is_struct(t))
+ return BTF_FIELD_IGNORE;
+ if (t->size != sz)
+ return BTF_FIELD_IGNORE;
+ value_type = btf_find_decl_tag_value(btf, pt, comp_idx, "contains:");
+ if (!value_type)
+ return -EINVAL;
+ node_field_name = strstr(value_type, ":");
+ if (!node_field_name)
return -EINVAL;
+ value_type = kstrndup(value_type, node_field_name - value_type, GFP_KERNEL | __GFP_NOWARN);
+ if (!value_type)
+ return -ENOMEM;
+ id = btf_find_by_name_kind(btf, value_type, BTF_KIND_STRUCT);
+ kfree(value_type);
+ if (id < 0)
+ return id;
+ node_field_name++;
+ if (str_is_empty(node_field_name))
+ return -EINVAL;
+ info->type = head_type;
+ info->off = off;
+ info->graph_root.value_btf_id = id;
+ info->graph_root.node_name = node_field_name;
+ return BTF_FIELD_FOUND;
+}
+
+#define field_mask_test_name(field_type, field_type_str) \
+ if (field_mask & field_type && !strcmp(name, field_type_str)) { \
+ type = field_type; \
+ goto end; \
+ }
+
+static int btf_get_field_type(const char *name, u32 field_mask, u32 *seen_mask,
+ int *align, int *sz)
+{
+ int type = 0;
+
+ if (field_mask & BPF_SPIN_LOCK) {
+ if (!strcmp(name, "bpf_spin_lock")) {
+ if (*seen_mask & BPF_SPIN_LOCK)
+ return -E2BIG;
+ *seen_mask |= BPF_SPIN_LOCK;
+ type = BPF_SPIN_LOCK;
+ goto end;
+ }
+ }
+ if (field_mask & BPF_TIMER) {
+ if (!strcmp(name, "bpf_timer")) {
+ if (*seen_mask & BPF_TIMER)
+ return -E2BIG;
+ *seen_mask |= BPF_TIMER;
+ type = BPF_TIMER;
+ goto end;
+ }
+ }
+ field_mask_test_name(BPF_LIST_HEAD, "bpf_list_head");
+ field_mask_test_name(BPF_LIST_NODE, "bpf_list_node");
+ field_mask_test_name(BPF_RB_ROOT, "bpf_rb_root");
+ field_mask_test_name(BPF_RB_NODE, "bpf_rb_node");
+ field_mask_test_name(BPF_REFCOUNT, "bpf_refcount");
+
+ /* Only return BPF_KPTR when all other types with matchable names fail */
+ if (field_mask & BPF_KPTR) {
+ type = BPF_KPTR_REF;
+ goto end;
+ }
+ return 0;
+end:
+ *sz = btf_field_type_size(type);
+ *align = btf_field_type_align(type);
+ return type;
+}
+
+#undef field_mask_test_name
+
+static int btf_find_struct_field(const struct btf *btf,
+ const struct btf_type *t, u32 field_mask,
+ struct btf_field_info *info, int info_cnt)
+{
+ int ret, idx = 0, align, sz, field_type;
+ const struct btf_member *member;
+ struct btf_field_info tmp;
+ u32 i, off, seen_mask = 0;
for_each_member(i, t, member) {
const struct btf_type *member_type = btf_type_by_id(btf,
member->type);
- if (!__btf_type_is_struct(member_type))
- continue;
- if (member_type->size != sizeof(struct bpf_spin_lock))
- continue;
- if (strcmp(__btf_name_by_offset(btf, member_type->name_off),
- "bpf_spin_lock"))
+
+ field_type = btf_get_field_type(__btf_name_by_offset(btf, member_type->name_off),
+ field_mask, &seen_mask, &align, &sz);
+ if (field_type == 0)
continue;
- if (off != -ENOENT)
- /* only one 'struct bpf_spin_lock' is allowed */
- return -E2BIG;
- off = btf_member_bit_offset(t, member);
+ if (field_type < 0)
+ return field_type;
+
+ off = __btf_member_bit_offset(t, member);
if (off % 8)
/* valid C code cannot generate such BTF */
return -EINVAL;
off /= 8;
- if (off % __alignof__(struct bpf_spin_lock))
- /* valid struct bpf_spin_lock will be 4 byte aligned */
+ if (off % align)
+ continue;
+
+ switch (field_type) {
+ case BPF_SPIN_LOCK:
+ case BPF_TIMER:
+ case BPF_LIST_NODE:
+ case BPF_RB_NODE:
+ case BPF_REFCOUNT:
+ ret = btf_find_struct(btf, member_type, off, sz, field_type,
+ idx < info_cnt ? &info[idx] : &tmp);
+ if (ret < 0)
+ return ret;
+ break;
+ case BPF_KPTR_UNREF:
+ case BPF_KPTR_REF:
+ ret = btf_find_kptr(btf, member_type, off, sz,
+ idx < info_cnt ? &info[idx] : &tmp);
+ if (ret < 0)
+ return ret;
+ break;
+ case BPF_LIST_HEAD:
+ case BPF_RB_ROOT:
+ ret = btf_find_graph_root(btf, t, member_type,
+ i, off, sz,
+ idx < info_cnt ? &info[idx] : &tmp,
+ field_type);
+ if (ret < 0)
+ return ret;
+ break;
+ default:
+ return -EFAULT;
+ }
+
+ if (ret == BTF_FIELD_IGNORE)
+ continue;
+ if (idx >= info_cnt)
+ return -E2BIG;
+ ++idx;
+ }
+ return idx;
+}
+
+static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t,
+ u32 field_mask, struct btf_field_info *info,
+ int info_cnt)
+{
+ int ret, idx = 0, align, sz, field_type;
+ const struct btf_var_secinfo *vsi;
+ struct btf_field_info tmp;
+ u32 i, off, seen_mask = 0;
+
+ for_each_vsi(i, t, vsi) {
+ const struct btf_type *var = btf_type_by_id(btf, vsi->type);
+ const struct btf_type *var_type = btf_type_by_id(btf, var->type);
+
+ field_type = btf_get_field_type(__btf_name_by_offset(btf, var_type->name_off),
+ field_mask, &seen_mask, &align, &sz);
+ if (field_type == 0)
+ continue;
+ if (field_type < 0)
+ return field_type;
+
+ off = vsi->offset;
+ if (vsi->size != sz)
+ continue;
+ if (off % align)
+ continue;
+
+ switch (field_type) {
+ case BPF_SPIN_LOCK:
+ case BPF_TIMER:
+ case BPF_LIST_NODE:
+ case BPF_RB_NODE:
+ case BPF_REFCOUNT:
+ ret = btf_find_struct(btf, var_type, off, sz, field_type,
+ idx < info_cnt ? &info[idx] : &tmp);
+ if (ret < 0)
+ return ret;
+ break;
+ case BPF_KPTR_UNREF:
+ case BPF_KPTR_REF:
+ ret = btf_find_kptr(btf, var_type, off, sz,
+ idx < info_cnt ? &info[idx] : &tmp);
+ if (ret < 0)
+ return ret;
+ break;
+ case BPF_LIST_HEAD:
+ case BPF_RB_ROOT:
+ ret = btf_find_graph_root(btf, var, var_type,
+ -1, off, sz,
+ idx < info_cnt ? &info[idx] : &tmp,
+ field_type);
+ if (ret < 0)
+ return ret;
+ break;
+ default:
+ return -EFAULT;
+ }
+
+ if (ret == BTF_FIELD_IGNORE)
+ continue;
+ if (idx >= info_cnt)
+ return -E2BIG;
+ ++idx;
+ }
+ return idx;
+}
+
+static int btf_find_field(const struct btf *btf, const struct btf_type *t,
+ u32 field_mask, struct btf_field_info *info,
+ int info_cnt)
+{
+ if (__btf_type_is_struct(t))
+ return btf_find_struct_field(btf, t, field_mask, info, info_cnt);
+ else if (btf_type_is_datasec(t))
+ return btf_find_datasec_var(btf, t, field_mask, info, info_cnt);
+ return -EINVAL;
+}
+
+static int btf_parse_kptr(const struct btf *btf, struct btf_field *field,
+ struct btf_field_info *info)
+{
+ struct module *mod = NULL;
+ const struct btf_type *t;
+ /* If a matching btf type is found in kernel or module BTFs, kptr_ref
+ * is that BTF, otherwise it's program BTF
+ */
+ struct btf *kptr_btf;
+ int ret;
+ s32 id;
+
+ /* Find type in map BTF, and use it to look up the matching type
+ * in vmlinux or module BTFs, by name and kind.
+ */
+ t = btf_type_by_id(btf, info->kptr.type_id);
+ id = bpf_find_btf_id(__btf_name_by_offset(btf, t->name_off), BTF_INFO_KIND(t->info),
+ &kptr_btf);
+ if (id == -ENOENT) {
+ /* btf_parse_kptr should only be called w/ btf = program BTF */
+ WARN_ON_ONCE(btf_is_kernel(btf));
+
+ /* Type exists only in program BTF. Assume that it's a MEM_ALLOC
+ * kptr allocated via bpf_obj_new
+ */
+ field->kptr.dtor = NULL;
+ id = info->kptr.type_id;
+ kptr_btf = (struct btf *)btf;
+ btf_get(kptr_btf);
+ goto found_dtor;
+ }
+ if (id < 0)
+ return id;
+
+ /* Find and stash the function pointer for the destruction function that
+ * needs to be eventually invoked from the map free path.
+ */
+ if (info->type == BPF_KPTR_REF) {
+ const struct btf_type *dtor_func;
+ const char *dtor_func_name;
+ unsigned long addr;
+ s32 dtor_btf_id;
+
+ /* This call also serves as a whitelist of allowed objects that
+ * can be used as a referenced pointer and be stored in a map at
+ * the same time.
+ */
+ dtor_btf_id = btf_find_dtor_kfunc(kptr_btf, id);
+ if (dtor_btf_id < 0) {
+ ret = dtor_btf_id;
+ goto end_btf;
+ }
+
+ dtor_func = btf_type_by_id(kptr_btf, dtor_btf_id);
+ if (!dtor_func) {
+ ret = -ENOENT;
+ goto end_btf;
+ }
+
+ if (btf_is_module(kptr_btf)) {
+ mod = btf_try_get_module(kptr_btf);
+ if (!mod) {
+ ret = -ENXIO;
+ goto end_btf;
+ }
+ }
+
+ /* We already verified dtor_func to be btf_type_is_func
+ * in register_btf_id_dtor_kfuncs.
+ */
+ dtor_func_name = __btf_name_by_offset(kptr_btf, dtor_func->name_off);
+ addr = kallsyms_lookup_name(dtor_func_name);
+ if (!addr) {
+ ret = -EINVAL;
+ goto end_mod;
+ }
+ field->kptr.dtor = (void *)addr;
+ }
+
+found_dtor:
+ field->kptr.btf_id = id;
+ field->kptr.btf = kptr_btf;
+ field->kptr.module = mod;
+ return 0;
+end_mod:
+ module_put(mod);
+end_btf:
+ btf_put(kptr_btf);
+ return ret;
+}
+
+static int btf_parse_graph_root(const struct btf *btf,
+ struct btf_field *field,
+ struct btf_field_info *info,
+ const char *node_type_name,
+ size_t node_type_align)
+{
+ const struct btf_type *t, *n = NULL;
+ const struct btf_member *member;
+ u32 offset;
+ int i;
+
+ t = btf_type_by_id(btf, info->graph_root.value_btf_id);
+ /* We've already checked that value_btf_id is a struct type. We
+ * just need to figure out the offset of the list_node, and
+ * verify its type.
+ */
+ for_each_member(i, t, member) {
+ if (strcmp(info->graph_root.node_name,
+ __btf_name_by_offset(btf, member->name_off)))
+ continue;
+ /* Invalid BTF, two members with same name */
+ if (n)
+ return -EINVAL;
+ n = btf_type_by_id(btf, member->type);
+ if (!__btf_type_is_struct(n))
+ return -EINVAL;
+ if (strcmp(node_type_name, __btf_name_by_offset(btf, n->name_off)))
+ return -EINVAL;
+ offset = __btf_member_bit_offset(n, member);
+ if (offset % 8)
+ return -EINVAL;
+ offset /= 8;
+ if (offset % node_type_align)
return -EINVAL;
+
+ field->graph_root.btf = (struct btf *)btf;
+ field->graph_root.value_btf_id = info->graph_root.value_btf_id;
+ field->graph_root.node_offset = offset;
+ }
+ if (!n)
+ return -ENOENT;
+ return 0;
+}
+
+static int btf_parse_list_head(const struct btf *btf, struct btf_field *field,
+ struct btf_field_info *info)
+{
+ return btf_parse_graph_root(btf, field, info, "bpf_list_node",
+ __alignof__(struct bpf_list_node));
+}
+
+static int btf_parse_rb_root(const struct btf *btf, struct btf_field *field,
+ struct btf_field_info *info)
+{
+ return btf_parse_graph_root(btf, field, info, "bpf_rb_node",
+ __alignof__(struct bpf_rb_node));
+}
+
+static int btf_field_cmp(const void *_a, const void *_b, const void *priv)
+{
+ const struct btf_field *a = (const struct btf_field *)_a;
+ const struct btf_field *b = (const struct btf_field *)_b;
+
+ if (a->offset < b->offset)
+ return -1;
+ else if (a->offset > b->offset)
+ return 1;
+ return 0;
+}
+
+struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type *t,
+ u32 field_mask, u32 value_size)
+{
+ struct btf_field_info info_arr[BTF_FIELDS_MAX];
+ u32 next_off = 0, field_type_size;
+ struct btf_record *rec;
+ int ret, i, cnt;
+
+ ret = btf_find_field(btf, t, field_mask, info_arr, ARRAY_SIZE(info_arr));
+ if (ret < 0)
+ return ERR_PTR(ret);
+ if (!ret)
+ return NULL;
+
+ cnt = ret;
+ /* This needs to be kzalloc to zero out padding and unused fields, see
+ * comment in btf_record_equal.
+ */
+ rec = kzalloc(offsetof(struct btf_record, fields[cnt]), GFP_KERNEL | __GFP_NOWARN);
+ if (!rec)
+ return ERR_PTR(-ENOMEM);
+
+ rec->spin_lock_off = -EINVAL;
+ rec->timer_off = -EINVAL;
+ rec->refcount_off = -EINVAL;
+ for (i = 0; i < cnt; i++) {
+ field_type_size = btf_field_type_size(info_arr[i].type);
+ if (info_arr[i].off + field_type_size > value_size) {
+ WARN_ONCE(1, "verifier bug off %d size %d", info_arr[i].off, value_size);
+ ret = -EFAULT;
+ goto end;
+ }
+ if (info_arr[i].off < next_off) {
+ ret = -EEXIST;
+ goto end;
+ }
+ next_off = info_arr[i].off + field_type_size;
+
+ rec->field_mask |= info_arr[i].type;
+ rec->fields[i].offset = info_arr[i].off;
+ rec->fields[i].type = info_arr[i].type;
+ rec->fields[i].size = field_type_size;
+
+ switch (info_arr[i].type) {
+ case BPF_SPIN_LOCK:
+ WARN_ON_ONCE(rec->spin_lock_off >= 0);
+ /* Cache offset for faster lookup at runtime */
+ rec->spin_lock_off = rec->fields[i].offset;
+ break;
+ case BPF_TIMER:
+ WARN_ON_ONCE(rec->timer_off >= 0);
+ /* Cache offset for faster lookup at runtime */
+ rec->timer_off = rec->fields[i].offset;
+ break;
+ case BPF_REFCOUNT:
+ WARN_ON_ONCE(rec->refcount_off >= 0);
+ /* Cache offset for faster lookup at runtime */
+ rec->refcount_off = rec->fields[i].offset;
+ break;
+ case BPF_KPTR_UNREF:
+ case BPF_KPTR_REF:
+ ret = btf_parse_kptr(btf, &rec->fields[i], &info_arr[i]);
+ if (ret < 0)
+ goto end;
+ break;
+ case BPF_LIST_HEAD:
+ ret = btf_parse_list_head(btf, &rec->fields[i], &info_arr[i]);
+ if (ret < 0)
+ goto end;
+ break;
+ case BPF_RB_ROOT:
+ ret = btf_parse_rb_root(btf, &rec->fields[i], &info_arr[i]);
+ if (ret < 0)
+ goto end;
+ break;
+ case BPF_LIST_NODE:
+ case BPF_RB_NODE:
+ break;
+ default:
+ ret = -EFAULT;
+ goto end;
+ }
+ rec->cnt++;
+ }
+
+ /* bpf_{list_head, rb_node} require bpf_spin_lock */
+ if ((btf_record_has_field(rec, BPF_LIST_HEAD) ||
+ btf_record_has_field(rec, BPF_RB_ROOT)) && rec->spin_lock_off < 0) {
+ ret = -EINVAL;
+ goto end;
+ }
+
+ if (rec->refcount_off < 0 &&
+ btf_record_has_field(rec, BPF_LIST_NODE) &&
+ btf_record_has_field(rec, BPF_RB_NODE)) {
+ ret = -EINVAL;
+ goto end;
+ }
+
+ sort_r(rec->fields, rec->cnt, sizeof(struct btf_field), btf_field_cmp,
+ NULL, rec);
+
+ return rec;
+end:
+ btf_record_free(rec);
+ return ERR_PTR(ret);
+}
+
+#define GRAPH_ROOT_MASK (BPF_LIST_HEAD | BPF_RB_ROOT)
+#define GRAPH_NODE_MASK (BPF_LIST_NODE | BPF_RB_NODE)
+
+int btf_check_and_fixup_fields(const struct btf *btf, struct btf_record *rec)
+{
+ int i;
+
+ /* There are three types that signify ownership of some other type:
+ * kptr_ref, bpf_list_head, bpf_rb_root.
+ * kptr_ref only supports storing kernel types, which can't store
+ * references to program allocated local types.
+ *
+ * Hence we only need to ensure that bpf_{list_head,rb_root} ownership
+ * does not form cycles.
+ */
+ if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & GRAPH_ROOT_MASK))
+ return 0;
+ for (i = 0; i < rec->cnt; i++) {
+ struct btf_struct_meta *meta;
+ u32 btf_id;
+
+ if (!(rec->fields[i].type & GRAPH_ROOT_MASK))
+ continue;
+ btf_id = rec->fields[i].graph_root.value_btf_id;
+ meta = btf_find_struct_meta(btf, btf_id);
+ if (!meta)
+ return -EFAULT;
+ rec->fields[i].graph_root.value_rec = meta->record;
+
+ /* We need to set value_rec for all root types, but no need
+ * to check ownership cycle for a type unless it's also a
+ * node type.
+ */
+ if (!(rec->field_mask & GRAPH_NODE_MASK))
+ continue;
+
+ /* We need to ensure ownership acyclicity among all types. The
+ * proper way to do it would be to topologically sort all BTF
+ * IDs based on the ownership edges, since there can be multiple
+ * bpf_{list_head,rb_node} in a type. Instead, we use the
+ * following resaoning:
+ *
+ * - A type can only be owned by another type in user BTF if it
+ * has a bpf_{list,rb}_node. Let's call these node types.
+ * - A type can only _own_ another type in user BTF if it has a
+ * bpf_{list_head,rb_root}. Let's call these root types.
+ *
+ * We ensure that if a type is both a root and node, its
+ * element types cannot be root types.
+ *
+ * To ensure acyclicity:
+ *
+ * When A is an root type but not a node, its ownership
+ * chain can be:
+ * A -> B -> C
+ * Where:
+ * - A is an root, e.g. has bpf_rb_root.
+ * - B is both a root and node, e.g. has bpf_rb_node and
+ * bpf_list_head.
+ * - C is only an root, e.g. has bpf_list_node
+ *
+ * When A is both a root and node, some other type already
+ * owns it in the BTF domain, hence it can not own
+ * another root type through any of the ownership edges.
+ * A -> B
+ * Where:
+ * - A is both an root and node.
+ * - B is only an node.
+ */
+ if (meta->record->field_mask & GRAPH_ROOT_MASK)
+ return -ELOOP;
}
- return off;
+ return 0;
}
static void __btf_struct_show(const struct btf *btf, const struct btf_type *t,
@@ -3105,8 +3922,8 @@ static void __btf_struct_show(const struct btf *btf, const struct btf_type *t,
btf_show_start_member(show, member);
- member_offset = btf_member_bit_offset(t, member);
- bitfield_size = btf_member_bitfield_size(t, member);
+ member_offset = __btf_member_bit_offset(t, member);
+ bitfield_size = __btf_member_bitfield_size(t, member);
bytes_offset = BITS_ROUNDDOWN_BYTES(member_offset);
bits8_offset = BITS_PER_BYTE_MASKED(member_offset);
if (bitfield_size) {
@@ -3239,6 +4056,7 @@ static s32 btf_enum_check_meta(struct btf_verifier_env *env,
{
const struct btf_enum *enums = btf_type_enum(t);
struct btf *btf = env->btf;
+ const char *fmt_str;
u16 i, nr_enums;
u32 meta_needed;
@@ -3252,11 +4070,6 @@ static s32 btf_enum_check_meta(struct btf_verifier_env *env,
return -EINVAL;
}
- if (btf_type_kflag(t)) {
- btf_verifier_log_type(env, t, "Invalid btf_info kind_flag");
- return -EINVAL;
- }
-
if (t->size > 8 || !is_power_of_2(t->size)) {
btf_verifier_log_type(env, t, "Unexpected size");
return -EINVAL;
@@ -3287,7 +4100,8 @@ static s32 btf_enum_check_meta(struct btf_verifier_env *env,
if (env->log.level == BPF_LOG_KERNEL)
continue;
- btf_verifier_log(env, "\t%s val=%d\n",
+ fmt_str = btf_type_kflag(t) ? "\t%s val=%d\n" : "\t%s val=%u\n";
+ btf_verifier_log(env, fmt_str,
__btf_name_by_offset(btf, enums[i].name_off),
enums[i].val);
}
@@ -3328,7 +4142,10 @@ static void btf_enum_show(const struct btf *btf, const struct btf_type *t,
return;
}
- btf_show_type_value(show, "%d", v);
+ if (btf_type_kflag(t))
+ btf_show_type_value(show, "%d", v);
+ else
+ btf_show_type_value(show, "%u", v);
btf_show_end_type(show);
}
@@ -3341,6 +4158,109 @@ static struct btf_kind_operations enum_ops = {
.show = btf_enum_show,
};
+static s32 btf_enum64_check_meta(struct btf_verifier_env *env,
+ const struct btf_type *t,
+ u32 meta_left)
+{
+ const struct btf_enum64 *enums = btf_type_enum64(t);
+ struct btf *btf = env->btf;
+ const char *fmt_str;
+ u16 i, nr_enums;
+ u32 meta_needed;
+
+ nr_enums = btf_type_vlen(t);
+ meta_needed = nr_enums * sizeof(*enums);
+
+ if (meta_left < meta_needed) {
+ btf_verifier_log_basic(env, t,
+ "meta_left:%u meta_needed:%u",
+ meta_left, meta_needed);
+ return -EINVAL;
+ }
+
+ if (t->size > 8 || !is_power_of_2(t->size)) {
+ btf_verifier_log_type(env, t, "Unexpected size");
+ return -EINVAL;
+ }
+
+ /* enum type either no name or a valid one */
+ if (t->name_off &&
+ !btf_name_valid_identifier(env->btf, t->name_off)) {
+ btf_verifier_log_type(env, t, "Invalid name");
+ return -EINVAL;
+ }
+
+ btf_verifier_log_type(env, t, NULL);
+
+ for (i = 0; i < nr_enums; i++) {
+ if (!btf_name_offset_valid(btf, enums[i].name_off)) {
+ btf_verifier_log(env, "\tInvalid name_offset:%u",
+ enums[i].name_off);
+ return -EINVAL;
+ }
+
+ /* enum member must have a valid name */
+ if (!enums[i].name_off ||
+ !btf_name_valid_identifier(btf, enums[i].name_off)) {
+ btf_verifier_log_type(env, t, "Invalid name");
+ return -EINVAL;
+ }
+
+ if (env->log.level == BPF_LOG_KERNEL)
+ continue;
+
+ fmt_str = btf_type_kflag(t) ? "\t%s val=%lld\n" : "\t%s val=%llu\n";
+ btf_verifier_log(env, fmt_str,
+ __btf_name_by_offset(btf, enums[i].name_off),
+ btf_enum64_value(enums + i));
+ }
+
+ return meta_needed;
+}
+
+static void btf_enum64_show(const struct btf *btf, const struct btf_type *t,
+ u32 type_id, void *data, u8 bits_offset,
+ struct btf_show *show)
+{
+ const struct btf_enum64 *enums = btf_type_enum64(t);
+ u32 i, nr_enums = btf_type_vlen(t);
+ void *safe_data;
+ s64 v;
+
+ safe_data = btf_show_start_type(show, t, type_id, data);
+ if (!safe_data)
+ return;
+
+ v = *(u64 *)safe_data;
+
+ for (i = 0; i < nr_enums; i++) {
+ if (v != btf_enum64_value(enums + i))
+ continue;
+
+ btf_show_type_value(show, "%s",
+ __btf_name_by_offset(btf,
+ enums[i].name_off));
+
+ btf_show_end_type(show);
+ return;
+ }
+
+ if (btf_type_kflag(t))
+ btf_show_type_value(show, "%lld", v);
+ else
+ btf_show_type_value(show, "%llu", v);
+ btf_show_end_type(show);
+}
+
+static struct btf_kind_operations enum64_ops = {
+ .check_meta = btf_enum64_check_meta,
+ .resolve = btf_df_resolve,
+ .check_member = btf_enum_check_member,
+ .check_kflag_member = btf_enum_check_kflag_member,
+ .log_details = btf_enum_log,
+ .show = btf_enum64_show,
+};
+
static s32 btf_func_proto_check_meta(struct btf_verifier_env *env,
const struct btf_type *t,
u32 meta_left)
@@ -3453,9 +4373,24 @@ static s32 btf_func_check_meta(struct btf_verifier_env *env,
return 0;
}
+static int btf_func_resolve(struct btf_verifier_env *env,
+ const struct resolve_vertex *v)
+{
+ const struct btf_type *t = v->t;
+ u32 next_type_id = t->type;
+ int err;
+
+ err = btf_func_check(env, t);
+ if (err)
+ return err;
+
+ env_stack_pop_resolved(env, next_type_id, 0);
+ return 0;
+}
+
static struct btf_kind_operations func_ops = {
.check_meta = btf_func_check_meta,
- .resolve = btf_df_resolve,
+ .resolve = btf_func_resolve,
.check_member = btf_df_check_member,
.check_kflag_member = btf_df_check_kflag_member,
.log_details = btf_ref_type_log,
@@ -3606,6 +4541,7 @@ static int btf_datasec_resolve(struct btf_verifier_env *env,
struct btf *btf = env->btf;
u16 i;
+ env->resolve_mode = RESOLVE_TBD;
for_each_vsi_from(i, v->next_member, v->t, vsi) {
u32 var_type_id = vsi->type, type_id, type_size = 0;
const struct btf_type *var_type = btf_type_by_id(env->btf,
@@ -3752,6 +4688,110 @@ static const struct btf_kind_operations float_ops = {
.show = btf_df_show,
};
+static s32 btf_decl_tag_check_meta(struct btf_verifier_env *env,
+ const struct btf_type *t,
+ u32 meta_left)
+{
+ const struct btf_decl_tag *tag;
+ u32 meta_needed = sizeof(*tag);
+ s32 component_idx;
+ const char *value;
+
+ if (meta_left < meta_needed) {
+ btf_verifier_log_basic(env, t,
+ "meta_left:%u meta_needed:%u",
+ meta_left, meta_needed);
+ return -EINVAL;
+ }
+
+ value = btf_name_by_offset(env->btf, t->name_off);
+ if (!value || !value[0]) {
+ btf_verifier_log_type(env, t, "Invalid value");
+ return -EINVAL;
+ }
+
+ if (btf_type_vlen(t)) {
+ btf_verifier_log_type(env, t, "vlen != 0");
+ return -EINVAL;
+ }
+
+ if (btf_type_kflag(t)) {
+ btf_verifier_log_type(env, t, "Invalid btf_info kind_flag");
+ return -EINVAL;
+ }
+
+ component_idx = btf_type_decl_tag(t)->component_idx;
+ if (component_idx < -1) {
+ btf_verifier_log_type(env, t, "Invalid component_idx");
+ return -EINVAL;
+ }
+
+ btf_verifier_log_type(env, t, NULL);
+
+ return meta_needed;
+}
+
+static int btf_decl_tag_resolve(struct btf_verifier_env *env,
+ const struct resolve_vertex *v)
+{
+ const struct btf_type *next_type;
+ const struct btf_type *t = v->t;
+ u32 next_type_id = t->type;
+ struct btf *btf = env->btf;
+ s32 component_idx;
+ u32 vlen;
+
+ next_type = btf_type_by_id(btf, next_type_id);
+ if (!next_type || !btf_type_is_decl_tag_target(next_type)) {
+ btf_verifier_log_type(env, v->t, "Invalid type_id");
+ return -EINVAL;
+ }
+
+ if (!env_type_is_resolve_sink(env, next_type) &&
+ !env_type_is_resolved(env, next_type_id))
+ return env_stack_push(env, next_type, next_type_id);
+
+ component_idx = btf_type_decl_tag(t)->component_idx;
+ if (component_idx != -1) {
+ if (btf_type_is_var(next_type) || btf_type_is_typedef(next_type)) {
+ btf_verifier_log_type(env, v->t, "Invalid component_idx");
+ return -EINVAL;
+ }
+
+ if (btf_type_is_struct(next_type)) {
+ vlen = btf_type_vlen(next_type);
+ } else {
+ /* next_type should be a function */
+ next_type = btf_type_by_id(btf, next_type->type);
+ vlen = btf_type_vlen(next_type);
+ }
+
+ if ((u32)component_idx >= vlen) {
+ btf_verifier_log_type(env, v->t, "Invalid component_idx");
+ return -EINVAL;
+ }
+ }
+
+ env_stack_pop_resolved(env, next_type_id, 0);
+
+ return 0;
+}
+
+static void btf_decl_tag_log(struct btf_verifier_env *env, const struct btf_type *t)
+{
+ btf_verifier_log(env, "type=%u component_idx=%d", t->type,
+ btf_type_decl_tag(t)->component_idx);
+}
+
+static const struct btf_kind_operations decl_tag_ops = {
+ .check_meta = btf_decl_tag_check_meta,
+ .resolve = btf_decl_tag_resolve,
+ .check_member = btf_df_check_member,
+ .check_kflag_member = btf_df_check_kflag_member,
+ .log_details = btf_decl_tag_log,
+ .show = btf_df_show,
+};
+
static int btf_func_proto_check(struct btf_verifier_env *env,
const struct btf_type *t)
{
@@ -3775,6 +4815,11 @@ static int btf_func_proto_check(struct btf_verifier_env *env,
return -EINVAL;
}
+ if (btf_type_is_resolve_source_only(ret_type)) {
+ btf_verifier_log_type(env, t, "Invalid return type");
+ return -EINVAL;
+ }
+
if (btf_type_needs_resolve(ret_type) &&
!env_type_is_resolved(env, ret_type_id)) {
err = btf_resolve(env, ret_type, ret_type_id);
@@ -3802,7 +4847,6 @@ static int btf_func_proto_check(struct btf_verifier_env *env,
nr_args--;
}
- err = 0;
for (i = 0; i < nr_args; i++) {
const struct btf_type *arg_type;
u32 arg_type_id;
@@ -3811,8 +4855,12 @@ static int btf_func_proto_check(struct btf_verifier_env *env,
arg_type = btf_type_by_id(btf, arg_type_id);
if (!arg_type) {
btf_verifier_log_type(env, t, "Invalid arg#%u", i + 1);
- err = -EINVAL;
- break;
+ return -EINVAL;
+ }
+
+ if (btf_type_is_resolve_source_only(arg_type)) {
+ btf_verifier_log_type(env, t, "Invalid arg#%u", i + 1);
+ return -EINVAL;
}
if (args[i].name_off &&
@@ -3820,25 +4868,23 @@ static int btf_func_proto_check(struct btf_verifier_env *env,
!btf_name_valid_identifier(btf, args[i].name_off))) {
btf_verifier_log_type(env, t,
"Invalid arg#%u", i + 1);
- err = -EINVAL;
- break;
+ return -EINVAL;
}
if (btf_type_needs_resolve(arg_type) &&
!env_type_is_resolved(env, arg_type_id)) {
err = btf_resolve(env, arg_type, arg_type_id);
if (err)
- break;
+ return err;
}
if (!btf_type_id_size(btf, &arg_type_id, NULL)) {
btf_verifier_log_type(env, t, "Invalid arg#%u", i + 1);
- err = -EINVAL;
- break;
+ return -EINVAL;
}
}
- return err;
+ return 0;
}
static int btf_func_check(struct btf_verifier_env *env,
@@ -3886,6 +4932,9 @@ static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS] = {
[BTF_KIND_VAR] = &var_ops,
[BTF_KIND_DATASEC] = &datasec_ops,
[BTF_KIND_FLOAT] = &float_ops,
+ [BTF_KIND_DECL_TAG] = &decl_tag_ops,
+ [BTF_KIND_TYPE_TAG] = &modifier_ops,
+ [BTF_KIND_ENUM64] = &enum64_ops,
};
static s32 btf_check_meta(struct btf_verifier_env *env,
@@ -3970,6 +5019,10 @@ static bool btf_resolve_valid(struct btf_verifier_env *env,
return !btf_resolved_type_id(btf, type_id) &&
!btf_resolved_type_size(btf, type_id);
+ if (btf_type_is_decl_tag(t) || btf_type_is_func(t))
+ return btf_resolved_type_id(btf, type_id) &&
+ !btf_resolved_type_size(btf, type_id);
+
if (btf_type_is_modifier(t) || btf_type_is_ptr(t) ||
btf_type_is_var(t)) {
t = btf_type_id_resolve(btf, &type_id);
@@ -4056,12 +5109,6 @@ static int btf_check_all_types(struct btf_verifier_env *env)
if (err)
return err;
}
-
- if (btf_type_is_func(t)) {
- err = btf_func_check(env, t);
- if (err)
- return err;
- }
}
return 0;
@@ -4192,13 +5239,11 @@ static int btf_parse_hdr(struct btf_verifier_env *env)
u32 hdr_len, hdr_copy, btf_data_size;
const struct btf_header *hdr;
struct btf *btf;
- int err;
btf = env->btf;
btf_data_size = btf->data_size;
- if (btf_data_size <
- offsetof(struct btf_header, hdr_len) + sizeof(hdr->hdr_len)) {
+ if (btf_data_size < offsetofend(struct btf_header, hdr_len)) {
btf_verifier_log(env, "hdr_len not found");
return -EINVAL;
}
@@ -4250,45 +5295,201 @@ static int btf_parse_hdr(struct btf_verifier_env *env)
return -EINVAL;
}
- err = btf_check_sec_info(env, btf_data_size);
- if (err)
- return err;
+ return btf_check_sec_info(env, btf_data_size);
+}
+
+static const char *alloc_obj_fields[] = {
+ "bpf_spin_lock",
+ "bpf_list_head",
+ "bpf_list_node",
+ "bpf_rb_root",
+ "bpf_rb_node",
+ "bpf_refcount",
+};
+
+static struct btf_struct_metas *
+btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf)
+{
+ union {
+ struct btf_id_set set;
+ struct {
+ u32 _cnt;
+ u32 _ids[ARRAY_SIZE(alloc_obj_fields)];
+ } _arr;
+ } aof;
+ struct btf_struct_metas *tab = NULL;
+ int i, n, id, ret;
+
+ BUILD_BUG_ON(offsetof(struct btf_id_set, cnt) != 0);
+ BUILD_BUG_ON(sizeof(struct btf_id_set) != sizeof(u32));
+
+ memset(&aof, 0, sizeof(aof));
+ for (i = 0; i < ARRAY_SIZE(alloc_obj_fields); i++) {
+ /* Try to find whether this special type exists in user BTF, and
+ * if so remember its ID so we can easily find it among members
+ * of structs that we iterate in the next loop.
+ */
+ id = btf_find_by_name_kind(btf, alloc_obj_fields[i], BTF_KIND_STRUCT);
+ if (id < 0)
+ continue;
+ aof.set.ids[aof.set.cnt++] = id;
+ }
+
+ if (!aof.set.cnt)
+ return NULL;
+ sort(&aof.set.ids, aof.set.cnt, sizeof(aof.set.ids[0]), btf_id_cmp_func, NULL);
+
+ n = btf_nr_types(btf);
+ for (i = 1; i < n; i++) {
+ struct btf_struct_metas *new_tab;
+ const struct btf_member *member;
+ struct btf_struct_meta *type;
+ struct btf_record *record;
+ const struct btf_type *t;
+ int j, tab_cnt;
+
+ t = btf_type_by_id(btf, i);
+ if (!t) {
+ ret = -EINVAL;
+ goto free;
+ }
+ if (!__btf_type_is_struct(t))
+ continue;
+
+ cond_resched();
+
+ for_each_member(j, t, member) {
+ if (btf_id_set_contains(&aof.set, member->type))
+ goto parse;
+ }
+ continue;
+ parse:
+ tab_cnt = tab ? tab->cnt : 0;
+ new_tab = krealloc(tab, offsetof(struct btf_struct_metas, types[tab_cnt + 1]),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!new_tab) {
+ ret = -ENOMEM;
+ goto free;
+ }
+ if (!tab)
+ new_tab->cnt = 0;
+ tab = new_tab;
+
+ type = &tab->types[tab->cnt];
+ type->btf_id = i;
+ record = btf_parse_fields(btf, t, BPF_SPIN_LOCK | BPF_LIST_HEAD | BPF_LIST_NODE |
+ BPF_RB_ROOT | BPF_RB_NODE | BPF_REFCOUNT, t->size);
+ /* The record cannot be unset, treat it as an error if so */
+ if (IS_ERR_OR_NULL(record)) {
+ ret = PTR_ERR_OR_ZERO(record) ?: -EFAULT;
+ goto free;
+ }
+ type->record = record;
+ tab->cnt++;
+ }
+ return tab;
+free:
+ btf_struct_metas_free(tab);
+ return ERR_PTR(ret);
+}
+
+struct btf_struct_meta *btf_find_struct_meta(const struct btf *btf, u32 btf_id)
+{
+ struct btf_struct_metas *tab;
+
+ BUILD_BUG_ON(offsetof(struct btf_struct_meta, btf_id) != 0);
+ tab = btf->struct_meta_tab;
+ if (!tab)
+ return NULL;
+ return bsearch(&btf_id, tab->types, tab->cnt, sizeof(tab->types[0]), btf_id_cmp_func);
+}
+
+static int btf_check_type_tags(struct btf_verifier_env *env,
+ struct btf *btf, int start_id)
+{
+ int i, n, good_id = start_id - 1;
+ bool in_tags;
+
+ n = btf_nr_types(btf);
+ for (i = start_id; i < n; i++) {
+ const struct btf_type *t;
+ int chain_limit = 32;
+ u32 cur_id = i;
+
+ t = btf_type_by_id(btf, i);
+ if (!t)
+ return -EINVAL;
+ if (!btf_type_is_modifier(t))
+ continue;
+ cond_resched();
+
+ in_tags = btf_type_is_type_tag(t);
+ while (btf_type_is_modifier(t)) {
+ if (!chain_limit--) {
+ btf_verifier_log(env, "Max chain length or cycle detected");
+ return -ELOOP;
+ }
+ if (btf_type_is_type_tag(t)) {
+ if (!in_tags) {
+ btf_verifier_log(env, "Type tags don't precede modifiers");
+ return -EINVAL;
+ }
+ } else if (in_tags) {
+ in_tags = false;
+ }
+ if (cur_id <= good_id)
+ break;
+ /* Move to next type */
+ cur_id = t->type;
+ t = btf_type_by_id(btf, cur_id);
+ if (!t)
+ return -EINVAL;
+ }
+ good_id = i;
+ }
return 0;
}
-static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size,
- u32 log_level, char __user *log_ubuf, u32 log_size)
+static int finalize_log(struct bpf_verifier_log *log, bpfptr_t uattr, u32 uattr_size)
+{
+ u32 log_true_size;
+ int err;
+
+ err = bpf_vlog_finalize(log, &log_true_size);
+
+ if (uattr_size >= offsetofend(union bpf_attr, btf_log_true_size) &&
+ copy_to_bpfptr_offset(uattr, offsetof(union bpf_attr, btf_log_true_size),
+ &log_true_size, sizeof(log_true_size)))
+ err = -EFAULT;
+
+ return err;
+}
+
+static struct btf *btf_parse(const union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
{
+ bpfptr_t btf_data = make_bpfptr(attr->btf, uattr.is_kernel);
+ char __user *log_ubuf = u64_to_user_ptr(attr->btf_log_buf);
+ struct btf_struct_metas *struct_meta_tab;
struct btf_verifier_env *env = NULL;
- struct bpf_verifier_log *log;
struct btf *btf = NULL;
u8 *data;
- int err;
+ int err, ret;
- if (btf_data_size > BTF_MAX_SIZE)
+ if (attr->btf_size > BTF_MAX_SIZE)
return ERR_PTR(-E2BIG);
env = kzalloc(sizeof(*env), GFP_KERNEL | __GFP_NOWARN);
if (!env)
return ERR_PTR(-ENOMEM);
- log = &env->log;
- if (log_level || log_ubuf || log_size) {
- /* user requested verbose verifier output
- * and supplied buffer to store the verification trace
- */
- log->level = log_level;
- log->ubuf = log_ubuf;
- log->len_total = log_size;
-
- /* log attributes have to be sane */
- if (log->len_total < 128 || log->len_total > UINT_MAX >> 8 ||
- !log->level || !log->ubuf) {
- err = -EINVAL;
- goto errout;
- }
- }
+ /* user could have requested verbose verifier output
+ * and supplied buffer to store the verification trace
+ */
+ err = bpf_vlog_init(&env->log, attr->btf_log_level,
+ log_ubuf, attr->btf_log_size);
+ if (err)
+ goto errout_free;
btf = kzalloc(sizeof(*btf), GFP_KERNEL | __GFP_NOWARN);
if (!btf) {
@@ -4297,16 +5498,16 @@ static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size,
}
env->btf = btf;
- data = kvmalloc(btf_data_size, GFP_KERNEL | __GFP_NOWARN);
+ data = kvmalloc(attr->btf_size, GFP_KERNEL | __GFP_NOWARN);
if (!data) {
err = -ENOMEM;
goto errout;
}
btf->data = data;
- btf->data_size = btf_data_size;
+ btf->data_size = attr->btf_size;
- if (copy_from_bpfptr(data, btf_data, btf_data_size)) {
+ if (copy_from_bpfptr(data, btf_data, attr->btf_size)) {
err = -EFAULT;
goto errout;
}
@@ -4325,16 +5526,43 @@ static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size,
if (err)
goto errout;
- if (log->level && bpf_verifier_log_full(log)) {
- err = -ENOSPC;
+ err = btf_check_type_tags(env, btf, 1);
+ if (err)
goto errout;
+
+ struct_meta_tab = btf_parse_struct_metas(&env->log, btf);
+ if (IS_ERR(struct_meta_tab)) {
+ err = PTR_ERR(struct_meta_tab);
+ goto errout;
+ }
+ btf->struct_meta_tab = struct_meta_tab;
+
+ if (struct_meta_tab) {
+ int i;
+
+ for (i = 0; i < struct_meta_tab->cnt; i++) {
+ err = btf_check_and_fixup_fields(btf, struct_meta_tab->types[i].record);
+ if (err < 0)
+ goto errout_meta;
+ }
}
+ err = finalize_log(&env->log, uattr, uattr_size);
+ if (err)
+ goto errout_free;
+
btf_verifier_env_free(env);
refcount_set(&btf->refcnt, 1);
return btf;
+errout_meta:
+ btf_free_struct_meta_tab(btf);
errout:
+ /* overwrite err with -ENOSPC or -EFAULT */
+ ret = finalize_log(&env->log, uattr, uattr_size);
+ if (ret)
+ err = ret;
+errout_free:
btf_verifier_env_free(env);
if (btf)
btf_free(btf);
@@ -4375,7 +5603,7 @@ static u8 bpf_ctx_convert_map[] = {
#undef BPF_MAP_TYPE
#undef BPF_LINK_TYPE
-static const struct btf_member *
+const struct btf_member *
btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf,
const struct btf_type *t, enum bpf_prog_type prog_type,
int arg)
@@ -4415,6 +5643,7 @@ btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf,
if (!ctx_struct)
/* should not happen */
return NULL;
+again:
ctx_tname = btf_name_by_offset(btf_vmlinux, ctx_struct->name_off);
if (!ctx_tname) {
/* should not happen */
@@ -4428,44 +5657,21 @@ btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf,
* int socket_filter_bpf_prog(struct __sk_buff *skb)
* { // no fields of skb are ever used }
*/
- if (strcmp(ctx_tname, tname))
- return NULL;
- return ctx_type;
-}
-
-static const struct bpf_map_ops * const btf_vmlinux_map_ops[] = {
-#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type)
-#define BPF_LINK_TYPE(_id, _name)
-#define BPF_MAP_TYPE(_id, _ops) \
- [_id] = &_ops,
-#include <linux/bpf_types.h>
-#undef BPF_PROG_TYPE
-#undef BPF_LINK_TYPE
-#undef BPF_MAP_TYPE
-};
-
-static int btf_vmlinux_map_ids_init(const struct btf *btf,
- struct bpf_verifier_log *log)
-{
- const struct bpf_map_ops *ops;
- int i, btf_id;
-
- for (i = 0; i < ARRAY_SIZE(btf_vmlinux_map_ops); ++i) {
- ops = btf_vmlinux_map_ops[i];
- if (!ops || (!ops->map_btf_name && !ops->map_btf_id))
- continue;
- if (!ops->map_btf_name || !ops->map_btf_id) {
- bpf_log(log, "map type %d is misconfigured\n", i);
- return -EINVAL;
- }
- btf_id = btf_find_by_name_kind(btf, ops->map_btf_name,
- BTF_KIND_STRUCT);
- if (btf_id < 0)
- return btf_id;
- *ops->map_btf_id = btf_id;
+ if (strcmp(ctx_tname, "__sk_buff") == 0 && strcmp(tname, "sk_buff") == 0)
+ return ctx_type;
+ if (strcmp(ctx_tname, "xdp_md") == 0 && strcmp(tname, "xdp_buff") == 0)
+ return ctx_type;
+ if (strcmp(ctx_tname, tname)) {
+ /* bpf_user_pt_regs_t is a typedef, so resolve it to
+ * underlying struct and check name again
+ */
+ if (!btf_type_is_modifier(ctx_struct))
+ return NULL;
+ while (btf_type_is_modifier(ctx_struct))
+ ctx_struct = btf_type_by_id(btf_vmlinux, ctx_struct->type);
+ goto again;
}
-
- return 0;
+ return ctx_type;
}
static int btf_translate_to_vmlinux(struct bpf_verifier_log *log,
@@ -4483,6 +5689,26 @@ static int btf_translate_to_vmlinux(struct bpf_verifier_log *log,
return kern_ctx_type->type;
}
+int get_kern_ctx_btf_id(struct bpf_verifier_log *log, enum bpf_prog_type prog_type)
+{
+ const struct btf_member *kctx_member;
+ const struct btf_type *conv_struct;
+ const struct btf_type *kctx_type;
+ u32 kctx_type_id;
+
+ conv_struct = bpf_ctx_convert.t;
+ /* get member for kernel ctx type */
+ kctx_member = btf_type_member(conv_struct) + bpf_ctx_convert_map[prog_type] * 2 + 1;
+ kctx_type_id = kctx_member->type;
+ kctx_type = btf_type_by_id(btf_vmlinux, kctx_type_id);
+ if (!btf_type_is_struct(kctx_type)) {
+ bpf_log(log, "kern ctx type id %u is not a struct\n", kctx_type_id);
+ return -EINVAL;
+ }
+
+ return kctx_type_id;
+}
+
BTF_ID_LIST(bpf_ctx_convert_btf_id)
BTF_ID(struct, bpf_ctx_convert)
@@ -4526,14 +5752,13 @@ struct btf *btf_parse_vmlinux(void)
if (err)
goto errout;
+ err = btf_check_type_tags(env, btf, 1);
+ if (err)
+ goto errout;
+
/* btf_parse_vmlinux() runs under bpf_verifier_lock */
bpf_ctx_convert.t = btf_type_by_id(btf, bpf_ctx_convert_btf_id[0]);
- /* find bpf map structs for map_ptr access checking */
- err = btf_vmlinux_map_ids_init(btf, log);
- if (err < 0)
- goto errout;
-
bpf_struct_ops_init(btf, log);
refcount_set(&btf->refcnt, 1);
@@ -4611,6 +5836,10 @@ static struct btf *btf_parse_module(const char *module_name, const void *data, u
if (err)
goto errout;
+ err = btf_check_type_tags(env, btf, btf_nr_types(base_btf));
+ if (err)
+ goto errout;
+
btf_verifier_env_free(env);
refcount_set(&btf->refcnt, 1);
return btf;
@@ -4637,17 +5866,56 @@ struct btf *bpf_prog_get_target_btf(const struct bpf_prog *prog)
return prog->aux->attach_btf;
}
-static bool is_string_ptr(struct btf *btf, const struct btf_type *t)
+static bool is_int_ptr(struct btf *btf, const struct btf_type *t)
{
- /* t comes in already as a pointer */
- t = btf_type_by_id(btf, t->type);
+ /* skip modifiers */
+ t = btf_type_skip_modifiers(btf, t->type, NULL);
- /* allow const */
- if (BTF_INFO_KIND(t->info) == BTF_KIND_CONST)
- t = btf_type_by_id(btf, t->type);
+ return btf_type_is_int(t);
+}
+
+static u32 get_ctx_arg_idx(struct btf *btf, const struct btf_type *func_proto,
+ int off)
+{
+ const struct btf_param *args;
+ const struct btf_type *t;
+ u32 offset = 0, nr_args;
+ int i;
+
+ if (!func_proto)
+ return off / 8;
+
+ nr_args = btf_type_vlen(func_proto);
+ args = (const struct btf_param *)(func_proto + 1);
+ for (i = 0; i < nr_args; i++) {
+ t = btf_type_skip_modifiers(btf, args[i].type, NULL);
+ offset += btf_type_is_ptr(t) ? 8 : roundup(t->size, 8);
+ if (off < offset)
+ return i;
+ }
+
+ t = btf_type_skip_modifiers(btf, func_proto->type, NULL);
+ offset += btf_type_is_ptr(t) ? 8 : roundup(t->size, 8);
+ if (off < offset)
+ return nr_args;
- /* char, signed char, unsigned char */
- return btf_type_is_int(t) && t->size == 1;
+ return nr_args + 1;
+}
+
+static bool prog_args_trusted(const struct bpf_prog *prog)
+{
+ enum bpf_attach_type atype = prog->expected_attach_type;
+
+ switch (prog->type) {
+ case BPF_PROG_TYPE_TRACING:
+ return atype == BPF_TRACE_RAW_TP || atype == BPF_TRACE_ITER;
+ case BPF_PROG_TYPE_LSM:
+ return bpf_lsm_is_trusted(prog);
+ case BPF_PROG_TYPE_STRUCT_OPS:
+ return true;
+ default:
+ return false;
+ }
}
bool btf_ctx_access(int off, int size, enum bpf_access_type type,
@@ -4660,6 +5928,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
const char *tname = prog->aux->attach_func_name;
struct bpf_verifier_log *log = info->log;
const struct btf_param *args;
+ const char *tag_value;
u32 nr_args, arg;
int i, ret;
@@ -4668,7 +5937,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
tname, off);
return false;
}
- arg = off / 8;
+ arg = get_ctx_arg_idx(btf, t, off);
args = (const struct btf_param *)(t + 1);
/* if (t == NULL) Fall back to default BPF prog with
* MAX_BPF_FUNC_REG_ARGS u64 arguments.
@@ -4688,6 +5957,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
if (arg == nr_args) {
switch (prog->expected_attach_type) {
+ case BPF_LSM_CGROUP:
case BPF_LSM_MAC:
case BPF_TRACE_FEXIT:
/* When LSM programs are attached to void LSM hooks
@@ -4718,7 +5988,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
if (!btf_type_is_small_int(t)) {
bpf_log(log,
"ret type %s not allowed for fmod_ret\n",
- btf_kind_str[BTF_INFO_KIND(t->info)]);
+ btf_type_str(t));
return false;
}
break;
@@ -4737,7 +6007,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
/* skip modifiers */
while (btf_type_is_modifier(t))
t = btf_type_by_id(btf, t->type);
- if (btf_type_is_small_int(t) || btf_type_is_enum(t))
+ if (btf_type_is_small_int(t) || btf_is_any_enum(t) || __btf_type_is_struct(t))
/* accessing a scalar */
return true;
if (!btf_type_is_ptr(t)) {
@@ -4745,17 +6015,19 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
"func '%s' arg%d '%s' has type %s. Only pointer access is allowed\n",
tname, arg,
__btf_name_by_offset(btf, t->name_off),
- btf_kind_str[BTF_INFO_KIND(t->info)]);
+ btf_type_str(t));
return false;
}
/* check for PTR_TO_RDONLY_BUF_OR_NULL or PTR_TO_RDWR_BUF_OR_NULL */
for (i = 0; i < prog->aux->ctx_arg_info_size; i++) {
const struct bpf_ctx_arg_aux *ctx_arg_info = &prog->aux->ctx_arg_info[i];
+ u32 type, flag;
- if (ctx_arg_info->offset == off &&
- (ctx_arg_info->reg_type == PTR_TO_RDONLY_BUF_OR_NULL ||
- ctx_arg_info->reg_type == PTR_TO_RDWR_BUF_OR_NULL)) {
+ type = base_type(ctx_arg_info->reg_type);
+ flag = type_flag(ctx_arg_info->reg_type);
+ if (ctx_arg_info->offset == off && type == PTR_TO_BUF &&
+ (flag & PTR_MAYBE_NULL)) {
info->reg_type = ctx_arg_info->reg_type;
return true;
}
@@ -4768,7 +6040,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
*/
return true;
- if (is_string_ptr(btf, t))
+ if (is_int_ptr(btf, t))
return true;
/* this is a pointer to another type */
@@ -4776,6 +6048,11 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
const struct bpf_ctx_arg_aux *ctx_arg_info = &prog->aux->ctx_arg_info[i];
if (ctx_arg_info->offset == off) {
+ if (!ctx_arg_info->btf_id) {
+ bpf_log(log,"invalid btf_id for context argument offset %u\n", off);
+ return false;
+ }
+
info->reg_type = ctx_arg_info->reg_type;
info->btf = btf_vmlinux;
info->btf_id = ctx_arg_info->btf_id;
@@ -4784,6 +6061,9 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
}
info->reg_type = PTR_TO_BTF_ID;
+ if (prog_args_trusted(prog))
+ info->reg_type |= PTR_TRUSTED;
+
if (tgt_prog) {
enum bpf_prog_type tgt_type;
@@ -4805,6 +6085,15 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
info->btf = btf;
info->btf_id = t->type;
t = btf_type_by_id(btf, t->type);
+
+ if (btf_type_is_type_tag(t)) {
+ tag_value = __btf_name_by_offset(btf, t->name_off);
+ if (strcmp(tag_value, "user") == 0)
+ info->reg_type |= MEM_USER;
+ if (strcmp(tag_value, "percpu") == 0)
+ info->reg_type |= MEM_PERCPU;
+ }
+
/* skip modifiers */
while (btf_type_is_modifier(t)) {
info->btf_id = t->type;
@@ -4813,11 +6102,11 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
if (!btf_type_is_struct(t)) {
bpf_log(log,
"func '%s' arg%d type %s is not a struct\n",
- tname, arg, btf_kind_str[BTF_INFO_KIND(t->info)]);
+ tname, arg, btf_type_str(t));
return false;
}
bpf_log(log, "func '%s' arg%d has btf_id %d type %s '%s'\n",
- tname, arg, info->btf_id, btf_kind_str[BTF_INFO_KIND(t->info)],
+ tname, arg, info->btf_id, btf_type_str(t),
__btf_name_by_offset(btf, t->name_off));
return true;
}
@@ -4831,14 +6120,16 @@ enum bpf_struct_walk_result {
static int btf_struct_walk(struct bpf_verifier_log *log, const struct btf *btf,
const struct btf_type *t, int off, int size,
- u32 *next_btf_id)
+ u32 *next_btf_id, enum bpf_type_flag *flag,
+ const char **field_name)
{
u32 i, moff, mtrue_end, msize = 0, total_nelems = 0;
const struct btf_type *mtype, *elem_type = NULL;
const struct btf_member *member;
- const char *tname, *mname;
+ const char *tname, *mname, *tag_value;
u32 vlen, elem_id, mid;
+ *flag = 0;
again:
tname = __btf_name_by_offset(btf, t->name_off);
if (!btf_type_is_struct(t)) {
@@ -4866,15 +6157,17 @@ again:
if (array_elem->nelems != 0)
goto error;
- moff = btf_member_bit_offset(t, member) / 8;
+ moff = __btf_member_bit_offset(t, member) / 8;
if (off < moff)
goto error;
- /* Only allow structure for now, can be relaxed for
- * other types later.
- */
+ /* allow structure and integer */
t = btf_type_skip_modifiers(btf, array_elem->type,
NULL);
+
+ if (btf_type_is_int(t))
+ return WALK_SCALAR;
+
if (!btf_type_is_struct(t))
goto error;
@@ -4889,14 +6182,14 @@ error:
for_each_member(i, t, member) {
/* offset of the field in bytes */
- moff = btf_member_bit_offset(t, member) / 8;
+ moff = __btf_member_bit_offset(t, member) / 8;
if (off + size <= moff)
/* won't find anything, field is already too far */
break;
- if (btf_member_bitfield_size(t, member)) {
- u32 end_bit = btf_member_bit_offset(t, member) +
- btf_member_bitfield_size(t, member);
+ if (__btf_member_bitfield_size(t, member)) {
+ u32 end_bit = __btf_member_bit_offset(t, member) +
+ __btf_member_bitfield_size(t, member);
/* off <= moff instead of off == moff because clang
* does not generate a BTF member for anonymous
@@ -5005,6 +6298,15 @@ error:
* of this field or inside of this struct
*/
if (btf_type_is_struct(mtype)) {
+ if (BTF_INFO_KIND(mtype->info) == BTF_KIND_UNION &&
+ btf_type_vlen(mtype) != 1)
+ /*
+ * walking unions yields untrusted pointers
+ * with exception of __bpf_md_ptr and other
+ * unions with a single member
+ */
+ *flag |= PTR_UNTRUSTED;
+
/* our field must be inside that union or struct */
t = mtype;
@@ -5020,7 +6322,8 @@ error:
}
if (btf_type_is_ptr(mtype)) {
- const struct btf_type *stype;
+ const struct btf_type *stype, *t;
+ enum bpf_type_flag tmp_flag = 0;
u32 id;
if (msize != size || off != moff) {
@@ -5029,9 +6332,28 @@ error:
mname, moff, tname, off, size);
return -EACCES;
}
+
+ /* check type tag */
+ t = btf_type_by_id(btf, mtype->type);
+ if (btf_type_is_type_tag(t)) {
+ tag_value = __btf_name_by_offset(btf, t->name_off);
+ /* check __user tag */
+ if (strcmp(tag_value, "user") == 0)
+ tmp_flag = MEM_USER;
+ /* check __percpu tag */
+ if (strcmp(tag_value, "percpu") == 0)
+ tmp_flag = MEM_PERCPU;
+ /* check __rcu tag */
+ if (strcmp(tag_value, "rcu") == 0)
+ tmp_flag = MEM_RCU;
+ }
+
stype = btf_type_skip_modifiers(btf, mtype->type, &id);
if (btf_type_is_struct(stype)) {
*next_btf_id = id;
+ *flag |= tmp_flag;
+ if (field_name)
+ *field_name = mname;
return WALK_PTR;
}
}
@@ -5055,23 +6377,56 @@ error:
return -EINVAL;
}
-int btf_struct_access(struct bpf_verifier_log *log, const struct btf *btf,
- const struct btf_type *t, int off, int size,
- enum bpf_access_type atype __maybe_unused,
- u32 *next_btf_id)
+int btf_struct_access(struct bpf_verifier_log *log,
+ const struct bpf_reg_state *reg,
+ int off, int size, enum bpf_access_type atype __maybe_unused,
+ u32 *next_btf_id, enum bpf_type_flag *flag,
+ const char **field_name)
{
+ const struct btf *btf = reg->btf;
+ enum bpf_type_flag tmp_flag = 0;
+ const struct btf_type *t;
+ u32 id = reg->btf_id;
int err;
- u32 id;
+ while (type_is_alloc(reg->type)) {
+ struct btf_struct_meta *meta;
+ struct btf_record *rec;
+ int i;
+
+ meta = btf_find_struct_meta(btf, id);
+ if (!meta)
+ break;
+ rec = meta->record;
+ for (i = 0; i < rec->cnt; i++) {
+ struct btf_field *field = &rec->fields[i];
+ u32 offset = field->offset;
+ if (off < offset + btf_field_type_size(field->type) && offset < off + size) {
+ bpf_log(log,
+ "direct access to %s is disallowed\n",
+ btf_field_type_name(field->type));
+ return -EACCES;
+ }
+ }
+ break;
+ }
+
+ t = btf_type_by_id(btf, id);
do {
- err = btf_struct_walk(log, btf, t, off, size, &id);
+ err = btf_struct_walk(log, btf, t, off, size, &id, &tmp_flag, field_name);
switch (err) {
case WALK_PTR:
+ /* For local types, the destination register cannot
+ * become a pointer again.
+ */
+ if (type_is_alloc(reg->type))
+ return SCALAR_VALUE;
/* If we found the pointer or scalar on t+off,
* we're done.
*/
*next_btf_id = id;
+ *flag = tmp_flag;
return PTR_TO_BTF_ID;
case WALK_SCALAR:
return SCALAR_VALUE;
@@ -5101,8 +6456,8 @@ int btf_struct_access(struct bpf_verifier_log *log, const struct btf *btf,
* end up with two different module BTFs, but IDs point to the common type in
* vmlinux BTF.
*/
-static bool btf_types_are_same(const struct btf *btf1, u32 id1,
- const struct btf *btf2, u32 id2)
+bool btf_types_are_same(const struct btf *btf1, u32 id1,
+ const struct btf *btf2, u32 id2)
{
if (id1 != id2)
return false;
@@ -5113,20 +6468,27 @@ static bool btf_types_are_same(const struct btf *btf1, u32 id1,
bool btf_struct_ids_match(struct bpf_verifier_log *log,
const struct btf *btf, u32 id, int off,
- const struct btf *need_btf, u32 need_type_id)
+ const struct btf *need_btf, u32 need_type_id,
+ bool strict)
{
const struct btf_type *type;
+ enum bpf_type_flag flag;
int err;
/* Are we already done? */
if (off == 0 && btf_types_are_same(btf, id, need_btf, need_type_id))
return true;
-
+ /* In case of strict type match, we do not walk struct, the top level
+ * type match must succeed. When strict is true, off should have already
+ * been 0.
+ */
+ if (strict)
+ return false;
again:
type = btf_type_by_id(btf, id);
if (!type)
return false;
- err = btf_struct_walk(log, btf, type, off, 1, &id);
+ err = btf_struct_walk(log, btf, type, off, 1, &id, &flag, NULL);
if (err != WALK_STRUCT)
return false;
@@ -5144,29 +6506,40 @@ again:
}
static int __get_type_size(struct btf *btf, u32 btf_id,
- const struct btf_type **bad_type)
+ const struct btf_type **ret_type)
{
const struct btf_type *t;
+ *ret_type = btf_type_by_id(btf, 0);
if (!btf_id)
/* void */
return 0;
t = btf_type_by_id(btf, btf_id);
while (t && btf_type_is_modifier(t))
t = btf_type_by_id(btf, t->type);
- if (!t) {
- *bad_type = btf_type_by_id(btf, 0);
+ if (!t)
return -EINVAL;
- }
+ *ret_type = t;
if (btf_type_is_ptr(t))
/* kernel size of pointer. Not BPF's size of pointer*/
return sizeof(void *);
- if (btf_type_is_int(t) || btf_type_is_enum(t))
+ if (btf_type_is_int(t) || btf_is_any_enum(t) || __btf_type_is_struct(t))
return t->size;
- *bad_type = t;
return -EINVAL;
}
+static u8 __get_type_fmodel_flags(const struct btf_type *t)
+{
+ u8 flags = 0;
+
+ if (__btf_type_is_struct(t))
+ flags |= BTF_FMODEL_STRUCT_ARG;
+ if (btf_type_is_signed_int(t))
+ flags |= BTF_FMODEL_SIGNED_ARG;
+
+ return flags;
+}
+
int btf_distill_func_proto(struct bpf_verifier_log *log,
struct btf *btf,
const struct btf_type *func,
@@ -5182,28 +6555,32 @@ int btf_distill_func_proto(struct bpf_verifier_log *log,
/* BTF function prototype doesn't match the verifier types.
* Fall back to MAX_BPF_FUNC_REG_ARGS u64 args.
*/
- for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++)
+ for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) {
m->arg_size[i] = 8;
+ m->arg_flags[i] = 0;
+ }
m->ret_size = 8;
+ m->ret_flags = 0;
m->nr_args = MAX_BPF_FUNC_REG_ARGS;
return 0;
}
args = (const struct btf_param *)(func + 1);
nargs = btf_type_vlen(func);
- if (nargs >= MAX_BPF_FUNC_ARGS) {
+ if (nargs > MAX_BPF_FUNC_ARGS) {
bpf_log(log,
"The function %s has %d arguments. Too many.\n",
tname, nargs);
return -EINVAL;
}
ret = __get_type_size(btf, func->type, &t);
- if (ret < 0) {
+ if (ret < 0 || __btf_type_is_struct(t)) {
bpf_log(log,
"The function %s return type %s is unsupported.\n",
- tname, btf_kind_str[BTF_INFO_KIND(t->info)]);
+ tname, btf_type_str(t));
return -EINVAL;
}
m->ret_size = ret;
+ m->ret_flags = __get_type_fmodel_flags(t);
for (i = 0; i < nargs; i++) {
if (i == nargs - 1 && args[i].type == 0) {
@@ -5213,10 +6590,12 @@ int btf_distill_func_proto(struct bpf_verifier_log *log,
return -EINVAL;
}
ret = __get_type_size(btf, args[i].type, &t);
- if (ret < 0) {
+
+ /* No support of struct argument size greater than 16 bytes */
+ if (ret < 0 || ret > 16) {
bpf_log(log,
"The function %s arg%d type %s is unsupported.\n",
- tname, i, btf_kind_str[BTF_INFO_KIND(t->info)]);
+ tname, i, btf_type_str(t));
return -EINVAL;
}
if (ret == 0) {
@@ -5226,6 +6605,7 @@ int btf_distill_func_proto(struct bpf_verifier_log *log,
return -EINVAL;
}
m->arg_size[i] = ret;
+ m->arg_flags[i] = __get_type_fmodel_flags(t);
}
m->nr_args = nargs;
return 0;
@@ -5309,7 +6689,7 @@ static int btf_check_func_type_match(struct bpf_verifier_log *log,
* to context only. And only global functions can be replaced.
* Hence type check only those types.
*/
- if (btf_type_is_int(t1) || btf_type_is_enum(t1))
+ if (btf_type_is_int(t1) || btf_is_any_enum(t1))
continue;
if (!btf_type_is_ptr(t1)) {
bpf_log(log,
@@ -5373,24 +6753,19 @@ int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *pr
return btf_check_func_type_match(log, btf1, t1, btf2, t2);
}
-static u32 *reg2btf_ids[__BPF_REG_TYPE_MAX] = {
-#ifdef CONFIG_NET
- [PTR_TO_SOCKET] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK],
- [PTR_TO_SOCK_COMMON] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON],
- [PTR_TO_TCP_SOCK] = &btf_sock_ids[BTF_SOCK_TYPE_TCP],
-#endif
-};
-
static int btf_check_func_arg_match(struct bpf_verifier_env *env,
const struct btf *btf, u32 func_id,
struct bpf_reg_state *regs,
- bool ptr_to_mem_ok)
+ bool ptr_to_mem_ok,
+ bool processing_call)
{
+ enum bpf_prog_type prog_type = resolve_prog_type(env->prog);
struct bpf_verifier_log *log = &env->log;
const char *func_name, *ref_tname;
const struct btf_type *t, *ref_t;
const struct btf_param *args;
u32 i, nargs, ref_id;
+ int ret;
t = btf_type_by_id(btf, func_id);
if (!t || !btf_type_is_func(t)) {
@@ -5420,6 +6795,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
* verifier sees.
*/
for (i = 0; i < nargs; i++) {
+ enum bpf_arg_type arg_type = ARG_DONTCARE;
u32 regno = i + 1;
struct bpf_reg_state *reg = &regs[regno];
@@ -5439,47 +6815,12 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
ref_t = btf_type_skip_modifiers(btf, t->type, &ref_id);
ref_tname = btf_name_by_offset(btf, ref_t->name_off);
- if (btf_is_kernel(btf)) {
- const struct btf_type *reg_ref_t;
- const struct btf *reg_btf;
- const char *reg_ref_tname;
- u32 reg_ref_id;
-
- if (!btf_type_is_struct(ref_t)) {
- bpf_log(log, "kernel function %s args#%d pointer type %s %s is not supported\n",
- func_name, i, btf_type_str(ref_t),
- ref_tname);
- return -EINVAL;
- }
- if (reg->type == PTR_TO_BTF_ID) {
- reg_btf = reg->btf;
- reg_ref_id = reg->btf_id;
- } else if (reg2btf_ids[reg->type]) {
- reg_btf = btf_vmlinux;
- reg_ref_id = *reg2btf_ids[reg->type];
- } else {
- bpf_log(log, "kernel function %s args#%d expected pointer to %s %s but R%d is not a pointer to btf_id\n",
- func_name, i,
- btf_type_str(ref_t), ref_tname, regno);
- return -EINVAL;
- }
+ ret = check_func_arg_reg_off(env, reg, regno, arg_type);
+ if (ret < 0)
+ return ret;
- reg_ref_t = btf_type_skip_modifiers(reg_btf, reg_ref_id,
- &reg_ref_id);
- reg_ref_tname = btf_name_by_offset(reg_btf,
- reg_ref_t->name_off);
- if (!btf_struct_ids_match(log, reg_btf, reg_ref_id,
- reg->off, btf, ref_id)) {
- bpf_log(log, "kernel function %s args#%d expected pointer to %s %s but R%d has a pointer to %s %s\n",
- func_name, i,
- btf_type_str(ref_t), ref_tname,
- regno, btf_type_str(reg_ref_t),
- reg_ref_tname);
- return -EINVAL;
- }
- } else if (btf_get_prog_ctx_type(log, btf, t,
- env->prog->type, i)) {
+ if (btf_get_prog_ctx_type(log, btf, t, prog_type, i)) {
/* If function expects ctx type in BTF check that caller
* is passing PTR_TO_CTX.
*/
@@ -5489,9 +6830,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
i, btf_type_str(t));
return -EINVAL;
}
- if (check_ctx_reg(env, reg, regno))
- return -EINVAL;
- } else if (ptr_to_mem_ok) {
+ } else if (ptr_to_mem_ok && processing_call) {
const struct btf_type *resolve_ret;
u32 type_size;
@@ -5507,6 +6846,8 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
if (check_mem_reg(env, reg, regno, type_size))
return -EINVAL;
} else {
+ bpf_log(log, "reg type unsupported for arg#%d function %s#%d\n", i,
+ func_name, func_id);
return -EINVAL;
}
}
@@ -5514,7 +6855,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
return 0;
}
-/* Compare BTF of a function with given bpf_reg_state.
+/* Compare BTF of a function declaration with given bpf_reg_state.
* Returns:
* EFAULT - there is a verifier bug. Abort verification.
* EINVAL - there is a type mismatch or BTF is not available.
@@ -5541,7 +6882,7 @@ int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog,
return -EINVAL;
is_global = prog->aux->func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL;
- err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global);
+ err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global, false);
/* Compiler optimizations can remove arguments from static functions
* or mismatched type can be passed into a global function.
@@ -5552,11 +6893,47 @@ int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog,
return err;
}
-int btf_check_kfunc_arg_match(struct bpf_verifier_env *env,
- const struct btf *btf, u32 func_id,
- struct bpf_reg_state *regs)
+/* Compare BTF of a function call with given bpf_reg_state.
+ * Returns:
+ * EFAULT - there is a verifier bug. Abort verification.
+ * EINVAL - there is a type mismatch or BTF is not available.
+ * 0 - BTF matches with what bpf_reg_state expects.
+ * Only PTR_TO_CTX and SCALAR_VALUE states are recognized.
+ *
+ * NOTE: the code is duplicated from btf_check_subprog_arg_match()
+ * because btf_check_func_arg_match() is still doing both. Once that
+ * function is split in 2, we can call from here btf_check_subprog_arg_match()
+ * first, and then treat the calling part in a new code path.
+ */
+int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog,
+ struct bpf_reg_state *regs)
{
- return btf_check_func_arg_match(env, btf, func_id, regs, false);
+ struct bpf_prog *prog = env->prog;
+ struct btf *btf = prog->aux->btf;
+ bool is_global;
+ u32 btf_id;
+ int err;
+
+ if (!prog->aux->func_info)
+ return -EINVAL;
+
+ btf_id = prog->aux->func_info[subprog].type_id;
+ if (!btf_id)
+ return -EFAULT;
+
+ if (prog->aux->func_info_aux[subprog].unreliable)
+ return -EINVAL;
+
+ is_global = prog->aux->func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL;
+ err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global, true);
+
+ /* Compiler optimizations can remove arguments from static functions
+ * or mismatched type can be passed into a global function.
+ * In such cases mark the function as unreliable from BTF point of view.
+ */
+ if (err)
+ prog->aux->func_info_aux[subprog].unreliable = true;
+ return err;
}
/* Convert BTF of a function into bpf_reg_state if possible
@@ -5628,7 +7005,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
t = btf_type_by_id(btf, t->type);
while (btf_type_is_modifier(t))
t = btf_type_by_id(btf, t->type);
- if (!btf_type_is_int(t) && !btf_type_is_enum(t)) {
+ if (!btf_type_is_int(t) && !btf_is_any_enum(t)) {
bpf_log(log,
"Global function %s() doesn't return scalar. Only those are supported.\n",
tname);
@@ -5643,7 +7020,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
t = btf_type_by_id(btf, args[i].type);
while (btf_type_is_modifier(t))
t = btf_type_by_id(btf, t->type);
- if (btf_type_is_int(t) || btf_type_is_enum(t)) {
+ if (btf_type_is_int(t) || btf_is_any_enum(t)) {
reg->type = SCALAR_VALUE;
continue;
}
@@ -5664,13 +7041,13 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
return -EINVAL;
}
- reg->type = PTR_TO_MEM_OR_NULL;
+ reg->type = PTR_TO_MEM | PTR_MAYBE_NULL;
reg->id = ++env->id_gen;
continue;
}
bpf_log(log, "Arg#%d type %s in %s() is not supported yet.\n",
- i, btf_kind_str[BTF_INFO_KIND(t->info)], tname);
+ i, btf_type_str(t), tname);
return -EINVAL;
}
return 0;
@@ -5733,7 +7110,7 @@ static void btf_snprintf_show(struct btf_show *show, const char *fmt,
if (len < 0) {
ssnprintf->len_left = 0;
ssnprintf->len = len;
- } else if (len > ssnprintf->len_left) {
+ } else if (len >= ssnprintf->len_left) {
/* no space, drive on to get length we would have written */
ssnprintf->len_left = 0;
ssnprintf->len += len;
@@ -5757,7 +7134,7 @@ int btf_type_snprintf_show(const struct btf *btf, u32 type_id, void *obj,
btf_type_show(btf, type_id, obj, (struct btf_show *)&ssnprintf);
- /* If we encontered an error, return it. */
+ /* If we encountered an error, return it. */
if (ssnprintf.show.state.status)
return ssnprintf.show.state.status;
@@ -5792,15 +7169,12 @@ static int __btf_new_fd(struct btf *btf)
return anon_inode_getfd("btf", &btf_fops, btf, O_RDONLY | O_CLOEXEC);
}
-int btf_new_fd(const union bpf_attr *attr, bpfptr_t uattr)
+int btf_new_fd(const union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
{
struct btf *btf;
int ret;
- btf = btf_parse(make_bpfptr(attr->btf, uattr.is_kernel),
- attr->btf_size, attr->btf_log_level,
- u64_to_user_ptr(attr->btf_log_buf),
- attr->btf_log_size);
+ btf = btf_parse(attr, uattr, uattr_size);
if (IS_ERR(btf))
return PTR_ERR(btf);
@@ -5941,17 +7315,9 @@ bool btf_is_module(const struct btf *btf)
return btf->kernel_btf && strcmp(btf->name, "vmlinux") != 0;
}
-static int btf_id_cmp_func(const void *a, const void *b)
-{
- const int *pa = a, *pb = b;
-
- return *pa - *pb;
-}
-
-bool btf_id_set_contains(const struct btf_id_set *set, u32 id)
-{
- return bsearch(&id, set->ids, set->cnt, sizeof(u32), btf_id_cmp_func) != NULL;
-}
+enum {
+ BTF_MODULE_F_LIVE = (1 << 0),
+};
#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
struct btf_module {
@@ -5959,6 +7325,7 @@ struct btf_module {
struct module *module;
struct btf *btf;
struct bin_attribute *sysfs_attr;
+ int flags;
};
static LIST_HEAD(btf_modules);
@@ -5975,6 +7342,8 @@ btf_module_read(struct file *file, struct kobject *kobj,
return len;
}
+static void purge_cand_cache(struct btf *btf);
+
static int btf_module_notify(struct notifier_block *nb, unsigned long op,
void *module)
{
@@ -5984,7 +7353,8 @@ static int btf_module_notify(struct notifier_block *nb, unsigned long op,
int err = 0;
if (mod->btf_data_size == 0 ||
- (op != MODULE_STATE_COMING && op != MODULE_STATE_GOING))
+ (op != MODULE_STATE_COMING && op != MODULE_STATE_LIVE &&
+ op != MODULE_STATE_GOING))
goto out;
switch (op) {
@@ -5996,10 +7366,14 @@ static int btf_module_notify(struct notifier_block *nb, unsigned long op,
}
btf = btf_parse_module(mod->name, mod->btf_data, mod->btf_data_size);
if (IS_ERR(btf)) {
- pr_warn("failed to validate module [%s] BTF: %ld\n",
- mod->name, PTR_ERR(btf));
kfree(btf_mod);
- err = PTR_ERR(btf);
+ if (!IS_ENABLED(CONFIG_MODULE_ALLOW_BTF_MISMATCH)) {
+ pr_warn("failed to validate module [%s] BTF: %ld\n",
+ mod->name, PTR_ERR(btf));
+ err = PTR_ERR(btf);
+ } else {
+ pr_warn_once("Kernel module BTF mismatch detected, BTF debug info may be unavailable for some modules\n");
+ }
goto out;
}
err = btf_alloc_id(btf);
@@ -6009,6 +7383,7 @@ static int btf_module_notify(struct notifier_block *nb, unsigned long op,
goto out;
}
+ purge_cand_cache(NULL);
mutex_lock(&btf_module_mutex);
btf_mod->module = module;
btf_mod->btf = btf;
@@ -6042,6 +7417,17 @@ static int btf_module_notify(struct notifier_block *nb, unsigned long op,
}
break;
+ case MODULE_STATE_LIVE:
+ mutex_lock(&btf_module_mutex);
+ list_for_each_entry_safe(btf_mod, tmp, &btf_modules, list) {
+ if (btf_mod->module != module)
+ continue;
+
+ btf_mod->flags |= BTF_MODULE_F_LIVE;
+ break;
+ }
+ mutex_unlock(&btf_module_mutex);
+ break;
case MODULE_STATE_GOING:
mutex_lock(&btf_module_mutex);
list_for_each_entry_safe(btf_mod, tmp, &btf_modules, list) {
@@ -6051,6 +7437,7 @@ static int btf_module_notify(struct notifier_block *nb, unsigned long op,
list_del(&btf_mod->list);
if (btf_mod->sysfs_attr)
sysfs_remove_bin_file(btf_kobj, btf_mod->sysfs_attr);
+ purge_cand_cache(btf_mod->btf);
btf_put(btf_mod->btf);
kfree(btf_mod->sysfs_attr);
kfree(btf_mod);
@@ -6087,7 +7474,12 @@ struct module *btf_try_get_module(const struct btf *btf)
if (btf_mod->btf != btf)
continue;
- if (try_module_get(btf_mod->module))
+ /* We must only consider module whose __init routine has
+ * finished, hence we must check for BTF_MODULE_F_LIVE flag,
+ * which is set from the notifier callback for
+ * MODULE_STATE_LIVE.
+ */
+ if ((btf_mod->flags & BTF_MODULE_F_LIVE) && try_module_get(btf_mod->module))
res = btf_mod->module;
break;
@@ -6098,9 +7490,43 @@ struct module *btf_try_get_module(const struct btf *btf)
return res;
}
+/* Returns struct btf corresponding to the struct module.
+ * This function can return NULL or ERR_PTR.
+ */
+static struct btf *btf_get_module_btf(const struct module *module)
+{
+#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
+ struct btf_module *btf_mod, *tmp;
+#endif
+ struct btf *btf = NULL;
+
+ if (!module) {
+ btf = bpf_get_btf_vmlinux();
+ if (!IS_ERR_OR_NULL(btf))
+ btf_get(btf);
+ return btf;
+ }
+
+#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
+ mutex_lock(&btf_module_mutex);
+ list_for_each_entry_safe(btf_mod, tmp, &btf_modules, list) {
+ if (btf_mod->module != module)
+ continue;
+
+ btf_get(btf_mod->btf);
+ btf = btf_mod->btf;
+ break;
+ }
+ mutex_unlock(&btf_module_mutex);
+#endif
+
+ return btf;
+}
+
BPF_CALL_4(bpf_btf_find_by_name_kind, char *, name, int, name_sz, u32, kind, int, flags)
{
- struct btf *btf;
+ struct btf *btf = NULL;
+ int btf_obj_fd = 0;
long ret;
if (flags)
@@ -6109,44 +7535,17 @@ BPF_CALL_4(bpf_btf_find_by_name_kind, char *, name, int, name_sz, u32, kind, int
if (name_sz <= 1 || name[name_sz - 1])
return -EINVAL;
- btf = bpf_get_btf_vmlinux();
- if (IS_ERR(btf))
- return PTR_ERR(btf);
-
- ret = btf_find_by_name_kind(btf, name, kind);
- /* ret is never zero, since btf_find_by_name_kind returns
- * positive btf_id or negative error.
- */
- if (ret < 0) {
- struct btf *mod_btf;
- int id;
-
- /* If name is not found in vmlinux's BTF then search in module's BTFs */
- spin_lock_bh(&btf_idr_lock);
- idr_for_each_entry(&btf_idr, mod_btf, id) {
- if (!btf_is_module(mod_btf))
- continue;
- /* linear search could be slow hence unlock/lock
- * the IDR to avoiding holding it for too long
- */
- btf_get(mod_btf);
- spin_unlock_bh(&btf_idr_lock);
- ret = btf_find_by_name_kind(mod_btf, name, kind);
- if (ret > 0) {
- int btf_obj_fd;
-
- btf_obj_fd = __btf_new_fd(mod_btf);
- if (btf_obj_fd < 0) {
- btf_put(mod_btf);
- return btf_obj_fd;
- }
- return ret | (((u64)btf_obj_fd) << 32);
- }
- spin_lock_bh(&btf_idr_lock);
- btf_put(mod_btf);
+ ret = bpf_find_btf_id(name, kind, &btf);
+ if (ret > 0 && btf_is_module(btf)) {
+ btf_obj_fd = __btf_new_fd(btf);
+ if (btf_obj_fd < 0) {
+ btf_put(btf);
+ return btf_obj_fd;
}
- spin_unlock_bh(&btf_idr_lock);
+ return ret | (((u64)btf_obj_fd) << 32);
}
+ if (ret > 0)
+ btf_put(btf);
return ret;
}
@@ -6154,8 +7553,994 @@ const struct bpf_func_proto bpf_btf_find_by_name_kind_proto = {
.func = bpf_btf_find_by_name_kind,
.gpl_only = false,
.ret_type = RET_INTEGER,
- .arg1_type = ARG_PTR_TO_MEM,
+ .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg2_type = ARG_CONST_SIZE,
.arg3_type = ARG_ANYTHING,
.arg4_type = ARG_ANYTHING,
};
+
+BTF_ID_LIST_GLOBAL(btf_tracing_ids, MAX_BTF_TRACING_TYPE)
+#define BTF_TRACING_TYPE(name, type) BTF_ID(struct, type)
+BTF_TRACING_TYPE_xxx
+#undef BTF_TRACING_TYPE
+
+static int btf_check_iter_kfuncs(struct btf *btf, const char *func_name,
+ const struct btf_type *func, u32 func_flags)
+{
+ u32 flags = func_flags & (KF_ITER_NEW | KF_ITER_NEXT | KF_ITER_DESTROY);
+ const char *name, *sfx, *iter_name;
+ const struct btf_param *arg;
+ const struct btf_type *t;
+ char exp_name[128];
+ u32 nr_args;
+
+ /* exactly one of KF_ITER_{NEW,NEXT,DESTROY} can be set */
+ if (!flags || (flags & (flags - 1)))
+ return -EINVAL;
+
+ /* any BPF iter kfunc should have `struct bpf_iter_<type> *` first arg */
+ nr_args = btf_type_vlen(func);
+ if (nr_args < 1)
+ return -EINVAL;
+
+ arg = &btf_params(func)[0];
+ t = btf_type_skip_modifiers(btf, arg->type, NULL);
+ if (!t || !btf_type_is_ptr(t))
+ return -EINVAL;
+ t = btf_type_skip_modifiers(btf, t->type, NULL);
+ if (!t || !__btf_type_is_struct(t))
+ return -EINVAL;
+
+ name = btf_name_by_offset(btf, t->name_off);
+ if (!name || strncmp(name, ITER_PREFIX, sizeof(ITER_PREFIX) - 1))
+ return -EINVAL;
+
+ /* sizeof(struct bpf_iter_<type>) should be a multiple of 8 to
+ * fit nicely in stack slots
+ */
+ if (t->size == 0 || (t->size % 8))
+ return -EINVAL;
+
+ /* validate bpf_iter_<type>_{new,next,destroy}(struct bpf_iter_<type> *)
+ * naming pattern
+ */
+ iter_name = name + sizeof(ITER_PREFIX) - 1;
+ if (flags & KF_ITER_NEW)
+ sfx = "new";
+ else if (flags & KF_ITER_NEXT)
+ sfx = "next";
+ else /* (flags & KF_ITER_DESTROY) */
+ sfx = "destroy";
+
+ snprintf(exp_name, sizeof(exp_name), "bpf_iter_%s_%s", iter_name, sfx);
+ if (strcmp(func_name, exp_name))
+ return -EINVAL;
+
+ /* only iter constructor should have extra arguments */
+ if (!(flags & KF_ITER_NEW) && nr_args != 1)
+ return -EINVAL;
+
+ if (flags & KF_ITER_NEXT) {
+ /* bpf_iter_<type>_next() should return pointer */
+ t = btf_type_skip_modifiers(btf, func->type, NULL);
+ if (!t || !btf_type_is_ptr(t))
+ return -EINVAL;
+ }
+
+ if (flags & KF_ITER_DESTROY) {
+ /* bpf_iter_<type>_destroy() should return void */
+ t = btf_type_by_id(btf, func->type);
+ if (!t || !btf_type_is_void(t))
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int btf_check_kfunc_protos(struct btf *btf, u32 func_id, u32 func_flags)
+{
+ const struct btf_type *func;
+ const char *func_name;
+ int err;
+
+ /* any kfunc should be FUNC -> FUNC_PROTO */
+ func = btf_type_by_id(btf, func_id);
+ if (!func || !btf_type_is_func(func))
+ return -EINVAL;
+
+ /* sanity check kfunc name */
+ func_name = btf_name_by_offset(btf, func->name_off);
+ if (!func_name || !func_name[0])
+ return -EINVAL;
+
+ func = btf_type_by_id(btf, func->type);
+ if (!func || !btf_type_is_func_proto(func))
+ return -EINVAL;
+
+ if (func_flags & (KF_ITER_NEW | KF_ITER_NEXT | KF_ITER_DESTROY)) {
+ err = btf_check_iter_kfuncs(btf, func_name, func, func_flags);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+/* Kernel Function (kfunc) BTF ID set registration API */
+
+static int btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook,
+ struct btf_id_set8 *add_set)
+{
+ bool vmlinux_set = !btf_is_module(btf);
+ struct btf_kfunc_set_tab *tab;
+ struct btf_id_set8 *set;
+ u32 set_cnt;
+ int ret;
+
+ if (hook >= BTF_KFUNC_HOOK_MAX) {
+ ret = -EINVAL;
+ goto end;
+ }
+
+ if (!add_set->cnt)
+ return 0;
+
+ tab = btf->kfunc_set_tab;
+ if (!tab) {
+ tab = kzalloc(sizeof(*tab), GFP_KERNEL | __GFP_NOWARN);
+ if (!tab)
+ return -ENOMEM;
+ btf->kfunc_set_tab = tab;
+ }
+
+ set = tab->sets[hook];
+ /* Warn when register_btf_kfunc_id_set is called twice for the same hook
+ * for module sets.
+ */
+ if (WARN_ON_ONCE(set && !vmlinux_set)) {
+ ret = -EINVAL;
+ goto end;
+ }
+
+ /* We don't need to allocate, concatenate, and sort module sets, because
+ * only one is allowed per hook. Hence, we can directly assign the
+ * pointer and return.
+ */
+ if (!vmlinux_set) {
+ tab->sets[hook] = add_set;
+ return 0;
+ }
+
+ /* In case of vmlinux sets, there may be more than one set being
+ * registered per hook. To create a unified set, we allocate a new set
+ * and concatenate all individual sets being registered. While each set
+ * is individually sorted, they may become unsorted when concatenated,
+ * hence re-sorting the final set again is required to make binary
+ * searching the set using btf_id_set8_contains function work.
+ */
+ set_cnt = set ? set->cnt : 0;
+
+ if (set_cnt > U32_MAX - add_set->cnt) {
+ ret = -EOVERFLOW;
+ goto end;
+ }
+
+ if (set_cnt + add_set->cnt > BTF_KFUNC_SET_MAX_CNT) {
+ ret = -E2BIG;
+ goto end;
+ }
+
+ /* Grow set */
+ set = krealloc(tab->sets[hook],
+ offsetof(struct btf_id_set8, pairs[set_cnt + add_set->cnt]),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!set) {
+ ret = -ENOMEM;
+ goto end;
+ }
+
+ /* For newly allocated set, initialize set->cnt to 0 */
+ if (!tab->sets[hook])
+ set->cnt = 0;
+ tab->sets[hook] = set;
+
+ /* Concatenate the two sets */
+ memcpy(set->pairs + set->cnt, add_set->pairs, add_set->cnt * sizeof(set->pairs[0]));
+ set->cnt += add_set->cnt;
+
+ sort(set->pairs, set->cnt, sizeof(set->pairs[0]), btf_id_cmp_func, NULL);
+
+ return 0;
+end:
+ btf_free_kfunc_set_tab(btf);
+ return ret;
+}
+
+static u32 *__btf_kfunc_id_set_contains(const struct btf *btf,
+ enum btf_kfunc_hook hook,
+ u32 kfunc_btf_id)
+{
+ struct btf_id_set8 *set;
+ u32 *id;
+
+ if (hook >= BTF_KFUNC_HOOK_MAX)
+ return NULL;
+ if (!btf->kfunc_set_tab)
+ return NULL;
+ set = btf->kfunc_set_tab->sets[hook];
+ if (!set)
+ return NULL;
+ id = btf_id_set8_contains(set, kfunc_btf_id);
+ if (!id)
+ return NULL;
+ /* The flags for BTF ID are located next to it */
+ return id + 1;
+}
+
+static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type)
+{
+ switch (prog_type) {
+ case BPF_PROG_TYPE_UNSPEC:
+ return BTF_KFUNC_HOOK_COMMON;
+ case BPF_PROG_TYPE_XDP:
+ return BTF_KFUNC_HOOK_XDP;
+ case BPF_PROG_TYPE_SCHED_CLS:
+ return BTF_KFUNC_HOOK_TC;
+ case BPF_PROG_TYPE_STRUCT_OPS:
+ return BTF_KFUNC_HOOK_STRUCT_OPS;
+ case BPF_PROG_TYPE_TRACING:
+ case BPF_PROG_TYPE_LSM:
+ return BTF_KFUNC_HOOK_TRACING;
+ case BPF_PROG_TYPE_SYSCALL:
+ return BTF_KFUNC_HOOK_SYSCALL;
+ case BPF_PROG_TYPE_CGROUP_SKB:
+ return BTF_KFUNC_HOOK_CGROUP_SKB;
+ case BPF_PROG_TYPE_SCHED_ACT:
+ return BTF_KFUNC_HOOK_SCHED_ACT;
+ case BPF_PROG_TYPE_SK_SKB:
+ return BTF_KFUNC_HOOK_SK_SKB;
+ case BPF_PROG_TYPE_SOCKET_FILTER:
+ return BTF_KFUNC_HOOK_SOCKET_FILTER;
+ case BPF_PROG_TYPE_LWT_OUT:
+ case BPF_PROG_TYPE_LWT_IN:
+ case BPF_PROG_TYPE_LWT_XMIT:
+ case BPF_PROG_TYPE_LWT_SEG6LOCAL:
+ return BTF_KFUNC_HOOK_LWT;
+ case BPF_PROG_TYPE_NETFILTER:
+ return BTF_KFUNC_HOOK_NETFILTER;
+ default:
+ return BTF_KFUNC_HOOK_MAX;
+ }
+}
+
+/* Caution:
+ * Reference to the module (obtained using btf_try_get_module) corresponding to
+ * the struct btf *MUST* be held when calling this function from verifier
+ * context. This is usually true as we stash references in prog's kfunc_btf_tab;
+ * keeping the reference for the duration of the call provides the necessary
+ * protection for looking up a well-formed btf->kfunc_set_tab.
+ */
+u32 *btf_kfunc_id_set_contains(const struct btf *btf,
+ enum bpf_prog_type prog_type,
+ u32 kfunc_btf_id)
+{
+ enum btf_kfunc_hook hook;
+ u32 *kfunc_flags;
+
+ kfunc_flags = __btf_kfunc_id_set_contains(btf, BTF_KFUNC_HOOK_COMMON, kfunc_btf_id);
+ if (kfunc_flags)
+ return kfunc_flags;
+
+ hook = bpf_prog_type_to_kfunc_hook(prog_type);
+ return __btf_kfunc_id_set_contains(btf, hook, kfunc_btf_id);
+}
+
+u32 *btf_kfunc_is_modify_return(const struct btf *btf, u32 kfunc_btf_id)
+{
+ return __btf_kfunc_id_set_contains(btf, BTF_KFUNC_HOOK_FMODRET, kfunc_btf_id);
+}
+
+static int __register_btf_kfunc_id_set(enum btf_kfunc_hook hook,
+ const struct btf_kfunc_id_set *kset)
+{
+ struct btf *btf;
+ int ret, i;
+
+ btf = btf_get_module_btf(kset->owner);
+ if (!btf) {
+ if (!kset->owner && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) {
+ pr_err("missing vmlinux BTF, cannot register kfuncs\n");
+ return -ENOENT;
+ }
+ if (kset->owner && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES)) {
+ pr_err("missing module BTF, cannot register kfuncs\n");
+ return -ENOENT;
+ }
+ return 0;
+ }
+ if (IS_ERR(btf))
+ return PTR_ERR(btf);
+
+ for (i = 0; i < kset->set->cnt; i++) {
+ ret = btf_check_kfunc_protos(btf, kset->set->pairs[i].id,
+ kset->set->pairs[i].flags);
+ if (ret)
+ goto err_out;
+ }
+
+ ret = btf_populate_kfunc_set(btf, hook, kset->set);
+err_out:
+ btf_put(btf);
+ return ret;
+}
+
+/* This function must be invoked only from initcalls/module init functions */
+int register_btf_kfunc_id_set(enum bpf_prog_type prog_type,
+ const struct btf_kfunc_id_set *kset)
+{
+ enum btf_kfunc_hook hook;
+
+ hook = bpf_prog_type_to_kfunc_hook(prog_type);
+ return __register_btf_kfunc_id_set(hook, kset);
+}
+EXPORT_SYMBOL_GPL(register_btf_kfunc_id_set);
+
+/* This function must be invoked only from initcalls/module init functions */
+int register_btf_fmodret_id_set(const struct btf_kfunc_id_set *kset)
+{
+ return __register_btf_kfunc_id_set(BTF_KFUNC_HOOK_FMODRET, kset);
+}
+EXPORT_SYMBOL_GPL(register_btf_fmodret_id_set);
+
+s32 btf_find_dtor_kfunc(struct btf *btf, u32 btf_id)
+{
+ struct btf_id_dtor_kfunc_tab *tab = btf->dtor_kfunc_tab;
+ struct btf_id_dtor_kfunc *dtor;
+
+ if (!tab)
+ return -ENOENT;
+ /* Even though the size of tab->dtors[0] is > sizeof(u32), we only need
+ * to compare the first u32 with btf_id, so we can reuse btf_id_cmp_func.
+ */
+ BUILD_BUG_ON(offsetof(struct btf_id_dtor_kfunc, btf_id) != 0);
+ dtor = bsearch(&btf_id, tab->dtors, tab->cnt, sizeof(tab->dtors[0]), btf_id_cmp_func);
+ if (!dtor)
+ return -ENOENT;
+ return dtor->kfunc_btf_id;
+}
+
+static int btf_check_dtor_kfuncs(struct btf *btf, const struct btf_id_dtor_kfunc *dtors, u32 cnt)
+{
+ const struct btf_type *dtor_func, *dtor_func_proto, *t;
+ const struct btf_param *args;
+ s32 dtor_btf_id;
+ u32 nr_args, i;
+
+ for (i = 0; i < cnt; i++) {
+ dtor_btf_id = dtors[i].kfunc_btf_id;
+
+ dtor_func = btf_type_by_id(btf, dtor_btf_id);
+ if (!dtor_func || !btf_type_is_func(dtor_func))
+ return -EINVAL;
+
+ dtor_func_proto = btf_type_by_id(btf, dtor_func->type);
+ if (!dtor_func_proto || !btf_type_is_func_proto(dtor_func_proto))
+ return -EINVAL;
+
+ /* Make sure the prototype of the destructor kfunc is 'void func(type *)' */
+ t = btf_type_by_id(btf, dtor_func_proto->type);
+ if (!t || !btf_type_is_void(t))
+ return -EINVAL;
+
+ nr_args = btf_type_vlen(dtor_func_proto);
+ if (nr_args != 1)
+ return -EINVAL;
+ args = btf_params(dtor_func_proto);
+ t = btf_type_by_id(btf, args[0].type);
+ /* Allow any pointer type, as width on targets Linux supports
+ * will be same for all pointer types (i.e. sizeof(void *))
+ */
+ if (!t || !btf_type_is_ptr(t))
+ return -EINVAL;
+ }
+ return 0;
+}
+
+/* This function must be invoked only from initcalls/module init functions */
+int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dtors, u32 add_cnt,
+ struct module *owner)
+{
+ struct btf_id_dtor_kfunc_tab *tab;
+ struct btf *btf;
+ u32 tab_cnt;
+ int ret;
+
+ btf = btf_get_module_btf(owner);
+ if (!btf) {
+ if (!owner && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) {
+ pr_err("missing vmlinux BTF, cannot register dtor kfuncs\n");
+ return -ENOENT;
+ }
+ if (owner && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES)) {
+ pr_err("missing module BTF, cannot register dtor kfuncs\n");
+ return -ENOENT;
+ }
+ return 0;
+ }
+ if (IS_ERR(btf))
+ return PTR_ERR(btf);
+
+ if (add_cnt >= BTF_DTOR_KFUNC_MAX_CNT) {
+ pr_err("cannot register more than %d kfunc destructors\n", BTF_DTOR_KFUNC_MAX_CNT);
+ ret = -E2BIG;
+ goto end;
+ }
+
+ /* Ensure that the prototype of dtor kfuncs being registered is sane */
+ ret = btf_check_dtor_kfuncs(btf, dtors, add_cnt);
+ if (ret < 0)
+ goto end;
+
+ tab = btf->dtor_kfunc_tab;
+ /* Only one call allowed for modules */
+ if (WARN_ON_ONCE(tab && btf_is_module(btf))) {
+ ret = -EINVAL;
+ goto end;
+ }
+
+ tab_cnt = tab ? tab->cnt : 0;
+ if (tab_cnt > U32_MAX - add_cnt) {
+ ret = -EOVERFLOW;
+ goto end;
+ }
+ if (tab_cnt + add_cnt >= BTF_DTOR_KFUNC_MAX_CNT) {
+ pr_err("cannot register more than %d kfunc destructors\n", BTF_DTOR_KFUNC_MAX_CNT);
+ ret = -E2BIG;
+ goto end;
+ }
+
+ tab = krealloc(btf->dtor_kfunc_tab,
+ offsetof(struct btf_id_dtor_kfunc_tab, dtors[tab_cnt + add_cnt]),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!tab) {
+ ret = -ENOMEM;
+ goto end;
+ }
+
+ if (!btf->dtor_kfunc_tab)
+ tab->cnt = 0;
+ btf->dtor_kfunc_tab = tab;
+
+ memcpy(tab->dtors + tab->cnt, dtors, add_cnt * sizeof(tab->dtors[0]));
+ tab->cnt += add_cnt;
+
+ sort(tab->dtors, tab->cnt, sizeof(tab->dtors[0]), btf_id_cmp_func, NULL);
+
+end:
+ if (ret)
+ btf_free_dtor_kfunc_tab(btf);
+ btf_put(btf);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(register_btf_id_dtor_kfuncs);
+
+#define MAX_TYPES_ARE_COMPAT_DEPTH 2
+
+/* Check local and target types for compatibility. This check is used for
+ * type-based CO-RE relocations and follow slightly different rules than
+ * field-based relocations. This function assumes that root types were already
+ * checked for name match. Beyond that initial root-level name check, names
+ * are completely ignored. Compatibility rules are as follows:
+ * - any two STRUCTs/UNIONs/FWDs/ENUMs/INTs/ENUM64s are considered compatible, but
+ * kind should match for local and target types (i.e., STRUCT is not
+ * compatible with UNION);
+ * - for ENUMs/ENUM64s, the size is ignored;
+ * - for INT, size and signedness are ignored;
+ * - for ARRAY, dimensionality is ignored, element types are checked for
+ * compatibility recursively;
+ * - CONST/VOLATILE/RESTRICT modifiers are ignored;
+ * - TYPEDEFs/PTRs are compatible if types they pointing to are compatible;
+ * - FUNC_PROTOs are compatible if they have compatible signature: same
+ * number of input args and compatible return and argument types.
+ * These rules are not set in stone and probably will be adjusted as we get
+ * more experience with using BPF CO-RE relocations.
+ */
+int bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id,
+ const struct btf *targ_btf, __u32 targ_id)
+{
+ return __bpf_core_types_are_compat(local_btf, local_id, targ_btf, targ_id,
+ MAX_TYPES_ARE_COMPAT_DEPTH);
+}
+
+#define MAX_TYPES_MATCH_DEPTH 2
+
+int bpf_core_types_match(const struct btf *local_btf, u32 local_id,
+ const struct btf *targ_btf, u32 targ_id)
+{
+ return __bpf_core_types_match(local_btf, local_id, targ_btf, targ_id, false,
+ MAX_TYPES_MATCH_DEPTH);
+}
+
+static bool bpf_core_is_flavor_sep(const char *s)
+{
+ /* check X___Y name pattern, where X and Y are not underscores */
+ return s[0] != '_' && /* X */
+ s[1] == '_' && s[2] == '_' && s[3] == '_' && /* ___ */
+ s[4] != '_'; /* Y */
+}
+
+size_t bpf_core_essential_name_len(const char *name)
+{
+ size_t n = strlen(name);
+ int i;
+
+ for (i = n - 5; i >= 0; i--) {
+ if (bpf_core_is_flavor_sep(name + i))
+ return i + 1;
+ }
+ return n;
+}
+
+struct bpf_cand_cache {
+ const char *name;
+ u32 name_len;
+ u16 kind;
+ u16 cnt;
+ struct {
+ const struct btf *btf;
+ u32 id;
+ } cands[];
+};
+
+static void bpf_free_cands(struct bpf_cand_cache *cands)
+{
+ if (!cands->cnt)
+ /* empty candidate array was allocated on stack */
+ return;
+ kfree(cands);
+}
+
+static void bpf_free_cands_from_cache(struct bpf_cand_cache *cands)
+{
+ kfree(cands->name);
+ kfree(cands);
+}
+
+#define VMLINUX_CAND_CACHE_SIZE 31
+static struct bpf_cand_cache *vmlinux_cand_cache[VMLINUX_CAND_CACHE_SIZE];
+
+#define MODULE_CAND_CACHE_SIZE 31
+static struct bpf_cand_cache *module_cand_cache[MODULE_CAND_CACHE_SIZE];
+
+static DEFINE_MUTEX(cand_cache_mutex);
+
+static void __print_cand_cache(struct bpf_verifier_log *log,
+ struct bpf_cand_cache **cache,
+ int cache_size)
+{
+ struct bpf_cand_cache *cc;
+ int i, j;
+
+ for (i = 0; i < cache_size; i++) {
+ cc = cache[i];
+ if (!cc)
+ continue;
+ bpf_log(log, "[%d]%s(", i, cc->name);
+ for (j = 0; j < cc->cnt; j++) {
+ bpf_log(log, "%d", cc->cands[j].id);
+ if (j < cc->cnt - 1)
+ bpf_log(log, " ");
+ }
+ bpf_log(log, "), ");
+ }
+}
+
+static void print_cand_cache(struct bpf_verifier_log *log)
+{
+ mutex_lock(&cand_cache_mutex);
+ bpf_log(log, "vmlinux_cand_cache:");
+ __print_cand_cache(log, vmlinux_cand_cache, VMLINUX_CAND_CACHE_SIZE);
+ bpf_log(log, "\nmodule_cand_cache:");
+ __print_cand_cache(log, module_cand_cache, MODULE_CAND_CACHE_SIZE);
+ bpf_log(log, "\n");
+ mutex_unlock(&cand_cache_mutex);
+}
+
+static u32 hash_cands(struct bpf_cand_cache *cands)
+{
+ return jhash(cands->name, cands->name_len, 0);
+}
+
+static struct bpf_cand_cache *check_cand_cache(struct bpf_cand_cache *cands,
+ struct bpf_cand_cache **cache,
+ int cache_size)
+{
+ struct bpf_cand_cache *cc = cache[hash_cands(cands) % cache_size];
+
+ if (cc && cc->name_len == cands->name_len &&
+ !strncmp(cc->name, cands->name, cands->name_len))
+ return cc;
+ return NULL;
+}
+
+static size_t sizeof_cands(int cnt)
+{
+ return offsetof(struct bpf_cand_cache, cands[cnt]);
+}
+
+static struct bpf_cand_cache *populate_cand_cache(struct bpf_cand_cache *cands,
+ struct bpf_cand_cache **cache,
+ int cache_size)
+{
+ struct bpf_cand_cache **cc = &cache[hash_cands(cands) % cache_size], *new_cands;
+
+ if (*cc) {
+ bpf_free_cands_from_cache(*cc);
+ *cc = NULL;
+ }
+ new_cands = kmemdup(cands, sizeof_cands(cands->cnt), GFP_KERNEL);
+ if (!new_cands) {
+ bpf_free_cands(cands);
+ return ERR_PTR(-ENOMEM);
+ }
+ /* strdup the name, since it will stay in cache.
+ * the cands->name points to strings in prog's BTF and the prog can be unloaded.
+ */
+ new_cands->name = kmemdup_nul(cands->name, cands->name_len, GFP_KERNEL);
+ bpf_free_cands(cands);
+ if (!new_cands->name) {
+ kfree(new_cands);
+ return ERR_PTR(-ENOMEM);
+ }
+ *cc = new_cands;
+ return new_cands;
+}
+
+#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
+static void __purge_cand_cache(struct btf *btf, struct bpf_cand_cache **cache,
+ int cache_size)
+{
+ struct bpf_cand_cache *cc;
+ int i, j;
+
+ for (i = 0; i < cache_size; i++) {
+ cc = cache[i];
+ if (!cc)
+ continue;
+ if (!btf) {
+ /* when new module is loaded purge all of module_cand_cache,
+ * since new module might have candidates with the name
+ * that matches cached cands.
+ */
+ bpf_free_cands_from_cache(cc);
+ cache[i] = NULL;
+ continue;
+ }
+ /* when module is unloaded purge cache entries
+ * that match module's btf
+ */
+ for (j = 0; j < cc->cnt; j++)
+ if (cc->cands[j].btf == btf) {
+ bpf_free_cands_from_cache(cc);
+ cache[i] = NULL;
+ break;
+ }
+ }
+
+}
+
+static void purge_cand_cache(struct btf *btf)
+{
+ mutex_lock(&cand_cache_mutex);
+ __purge_cand_cache(btf, module_cand_cache, MODULE_CAND_CACHE_SIZE);
+ mutex_unlock(&cand_cache_mutex);
+}
+#endif
+
+static struct bpf_cand_cache *
+bpf_core_add_cands(struct bpf_cand_cache *cands, const struct btf *targ_btf,
+ int targ_start_id)
+{
+ struct bpf_cand_cache *new_cands;
+ const struct btf_type *t;
+ const char *targ_name;
+ size_t targ_essent_len;
+ int n, i;
+
+ n = btf_nr_types(targ_btf);
+ for (i = targ_start_id; i < n; i++) {
+ t = btf_type_by_id(targ_btf, i);
+ if (btf_kind(t) != cands->kind)
+ continue;
+
+ targ_name = btf_name_by_offset(targ_btf, t->name_off);
+ if (!targ_name)
+ continue;
+
+ /* the resched point is before strncmp to make sure that search
+ * for non-existing name will have a chance to schedule().
+ */
+ cond_resched();
+
+ if (strncmp(cands->name, targ_name, cands->name_len) != 0)
+ continue;
+
+ targ_essent_len = bpf_core_essential_name_len(targ_name);
+ if (targ_essent_len != cands->name_len)
+ continue;
+
+ /* most of the time there is only one candidate for a given kind+name pair */
+ new_cands = kmalloc(sizeof_cands(cands->cnt + 1), GFP_KERNEL);
+ if (!new_cands) {
+ bpf_free_cands(cands);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ memcpy(new_cands, cands, sizeof_cands(cands->cnt));
+ bpf_free_cands(cands);
+ cands = new_cands;
+ cands->cands[cands->cnt].btf = targ_btf;
+ cands->cands[cands->cnt].id = i;
+ cands->cnt++;
+ }
+ return cands;
+}
+
+static struct bpf_cand_cache *
+bpf_core_find_cands(struct bpf_core_ctx *ctx, u32 local_type_id)
+{
+ struct bpf_cand_cache *cands, *cc, local_cand = {};
+ const struct btf *local_btf = ctx->btf;
+ const struct btf_type *local_type;
+ const struct btf *main_btf;
+ size_t local_essent_len;
+ struct btf *mod_btf;
+ const char *name;
+ int id;
+
+ main_btf = bpf_get_btf_vmlinux();
+ if (IS_ERR(main_btf))
+ return ERR_CAST(main_btf);
+ if (!main_btf)
+ return ERR_PTR(-EINVAL);
+
+ local_type = btf_type_by_id(local_btf, local_type_id);
+ if (!local_type)
+ return ERR_PTR(-EINVAL);
+
+ name = btf_name_by_offset(local_btf, local_type->name_off);
+ if (str_is_empty(name))
+ return ERR_PTR(-EINVAL);
+ local_essent_len = bpf_core_essential_name_len(name);
+
+ cands = &local_cand;
+ cands->name = name;
+ cands->kind = btf_kind(local_type);
+ cands->name_len = local_essent_len;
+
+ cc = check_cand_cache(cands, vmlinux_cand_cache, VMLINUX_CAND_CACHE_SIZE);
+ /* cands is a pointer to stack here */
+ if (cc) {
+ if (cc->cnt)
+ return cc;
+ goto check_modules;
+ }
+
+ /* Attempt to find target candidates in vmlinux BTF first */
+ cands = bpf_core_add_cands(cands, main_btf, 1);
+ if (IS_ERR(cands))
+ return ERR_CAST(cands);
+
+ /* cands is a pointer to kmalloced memory here if cands->cnt > 0 */
+
+ /* populate cache even when cands->cnt == 0 */
+ cc = populate_cand_cache(cands, vmlinux_cand_cache, VMLINUX_CAND_CACHE_SIZE);
+ if (IS_ERR(cc))
+ return ERR_CAST(cc);
+
+ /* if vmlinux BTF has any candidate, don't go for module BTFs */
+ if (cc->cnt)
+ return cc;
+
+check_modules:
+ /* cands is a pointer to stack here and cands->cnt == 0 */
+ cc = check_cand_cache(cands, module_cand_cache, MODULE_CAND_CACHE_SIZE);
+ if (cc)
+ /* if cache has it return it even if cc->cnt == 0 */
+ return cc;
+
+ /* If candidate is not found in vmlinux's BTF then search in module's BTFs */
+ spin_lock_bh(&btf_idr_lock);
+ idr_for_each_entry(&btf_idr, mod_btf, id) {
+ if (!btf_is_module(mod_btf))
+ continue;
+ /* linear search could be slow hence unlock/lock
+ * the IDR to avoiding holding it for too long
+ */
+ btf_get(mod_btf);
+ spin_unlock_bh(&btf_idr_lock);
+ cands = bpf_core_add_cands(cands, mod_btf, btf_nr_types(main_btf));
+ btf_put(mod_btf);
+ if (IS_ERR(cands))
+ return ERR_CAST(cands);
+ spin_lock_bh(&btf_idr_lock);
+ }
+ spin_unlock_bh(&btf_idr_lock);
+ /* cands is a pointer to kmalloced memory here if cands->cnt > 0
+ * or pointer to stack if cands->cnd == 0.
+ * Copy it into the cache even when cands->cnt == 0 and
+ * return the result.
+ */
+ return populate_cand_cache(cands, module_cand_cache, MODULE_CAND_CACHE_SIZE);
+}
+
+int bpf_core_apply(struct bpf_core_ctx *ctx, const struct bpf_core_relo *relo,
+ int relo_idx, void *insn)
+{
+ bool need_cands = relo->kind != BPF_CORE_TYPE_ID_LOCAL;
+ struct bpf_core_cand_list cands = {};
+ struct bpf_core_relo_res targ_res;
+ struct bpf_core_spec *specs;
+ int err;
+
+ /* ~4k of temp memory necessary to convert LLVM spec like "0:1:0:5"
+ * into arrays of btf_ids of struct fields and array indices.
+ */
+ specs = kcalloc(3, sizeof(*specs), GFP_KERNEL);
+ if (!specs)
+ return -ENOMEM;
+
+ if (need_cands) {
+ struct bpf_cand_cache *cc;
+ int i;
+
+ mutex_lock(&cand_cache_mutex);
+ cc = bpf_core_find_cands(ctx, relo->type_id);
+ if (IS_ERR(cc)) {
+ bpf_log(ctx->log, "target candidate search failed for %d\n",
+ relo->type_id);
+ err = PTR_ERR(cc);
+ goto out;
+ }
+ if (cc->cnt) {
+ cands.cands = kcalloc(cc->cnt, sizeof(*cands.cands), GFP_KERNEL);
+ if (!cands.cands) {
+ err = -ENOMEM;
+ goto out;
+ }
+ }
+ for (i = 0; i < cc->cnt; i++) {
+ bpf_log(ctx->log,
+ "CO-RE relocating %s %s: found target candidate [%d]\n",
+ btf_kind_str[cc->kind], cc->name, cc->cands[i].id);
+ cands.cands[i].btf = cc->cands[i].btf;
+ cands.cands[i].id = cc->cands[i].id;
+ }
+ cands.len = cc->cnt;
+ /* cand_cache_mutex needs to span the cache lookup and
+ * copy of btf pointer into bpf_core_cand_list,
+ * since module can be unloaded while bpf_core_calc_relo_insn
+ * is working with module's btf.
+ */
+ }
+
+ err = bpf_core_calc_relo_insn((void *)ctx->log, relo, relo_idx, ctx->btf, &cands, specs,
+ &targ_res);
+ if (err)
+ goto out;
+
+ err = bpf_core_patch_insn((void *)ctx->log, insn, relo->insn_off / 8, relo, relo_idx,
+ &targ_res);
+
+out:
+ kfree(specs);
+ if (need_cands) {
+ kfree(cands.cands);
+ mutex_unlock(&cand_cache_mutex);
+ if (ctx->log->level & BPF_LOG_LEVEL2)
+ print_cand_cache(ctx->log);
+ }
+ return err;
+}
+
+bool btf_nested_type_is_trusted(struct bpf_verifier_log *log,
+ const struct bpf_reg_state *reg,
+ const char *field_name, u32 btf_id, const char *suffix)
+{
+ struct btf *btf = reg->btf;
+ const struct btf_type *walk_type, *safe_type;
+ const char *tname;
+ char safe_tname[64];
+ long ret, safe_id;
+ const struct btf_member *member;
+ u32 i;
+
+ walk_type = btf_type_by_id(btf, reg->btf_id);
+ if (!walk_type)
+ return false;
+
+ tname = btf_name_by_offset(btf, walk_type->name_off);
+
+ ret = snprintf(safe_tname, sizeof(safe_tname), "%s%s", tname, suffix);
+ if (ret < 0)
+ return false;
+
+ safe_id = btf_find_by_name_kind(btf, safe_tname, BTF_INFO_KIND(walk_type->info));
+ if (safe_id < 0)
+ return false;
+
+ safe_type = btf_type_by_id(btf, safe_id);
+ if (!safe_type)
+ return false;
+
+ for_each_member(i, safe_type, member) {
+ const char *m_name = __btf_name_by_offset(btf, member->name_off);
+ const struct btf_type *mtype = btf_type_by_id(btf, member->type);
+ u32 id;
+
+ if (!btf_type_is_ptr(mtype))
+ continue;
+
+ btf_type_skip_modifiers(btf, mtype->type, &id);
+ /* If we match on both type and name, the field is considered trusted. */
+ if (btf_id == id && !strcmp(field_name, m_name))
+ return true;
+ }
+
+ return false;
+}
+
+bool btf_type_ids_nocast_alias(struct bpf_verifier_log *log,
+ const struct btf *reg_btf, u32 reg_id,
+ const struct btf *arg_btf, u32 arg_id)
+{
+ const char *reg_name, *arg_name, *search_needle;
+ const struct btf_type *reg_type, *arg_type;
+ int reg_len, arg_len, cmp_len;
+ size_t pattern_len = sizeof(NOCAST_ALIAS_SUFFIX) - sizeof(char);
+
+ reg_type = btf_type_by_id(reg_btf, reg_id);
+ if (!reg_type)
+ return false;
+
+ arg_type = btf_type_by_id(arg_btf, arg_id);
+ if (!arg_type)
+ return false;
+
+ reg_name = btf_name_by_offset(reg_btf, reg_type->name_off);
+ arg_name = btf_name_by_offset(arg_btf, arg_type->name_off);
+
+ reg_len = strlen(reg_name);
+ arg_len = strlen(arg_name);
+
+ /* Exactly one of the two type names may be suffixed with ___init, so
+ * if the strings are the same size, they can't possibly be no-cast
+ * aliases of one another. If you have two of the same type names, e.g.
+ * they're both nf_conn___init, it would be improper to return true
+ * because they are _not_ no-cast aliases, they are the same type.
+ */
+ if (reg_len == arg_len)
+ return false;
+
+ /* Either of the two names must be the other name, suffixed with ___init. */
+ if ((reg_len != arg_len + pattern_len) &&
+ (arg_len != reg_len + pattern_len))
+ return false;
+
+ if (reg_len < arg_len) {
+ search_needle = strstr(arg_name, NOCAST_ALIAS_SUFFIX);
+ cmp_len = reg_len;
+ } else {
+ search_needle = strstr(reg_name, NOCAST_ALIAS_SUFFIX);
+ cmp_len = arg_len;
+ }
+
+ if (!search_needle)
+ return false;
+
+ /* ___init suffix must come at the end of the name */
+ if (*(search_needle + pattern_len) != '\0')
+ return false;
+
+ return !strncmp(reg_name, arg_name, cmp_len);
+}
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index b567ca46555c..517b6a5928cc 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -14,14 +14,181 @@
#include <linux/string.h>
#include <linux/bpf.h>
#include <linux/bpf-cgroup.h>
+#include <linux/bpf_lsm.h>
+#include <linux/bpf_verifier.h>
#include <net/sock.h>
#include <net/bpf_sk_storage.h>
#include "../cgroup/cgroup-internal.h"
-DEFINE_STATIC_KEY_ARRAY_FALSE(cgroup_bpf_enabled_key, MAX_BPF_ATTACH_TYPE);
+DEFINE_STATIC_KEY_ARRAY_FALSE(cgroup_bpf_enabled_key, MAX_CGROUP_BPF_ATTACH_TYPE);
EXPORT_SYMBOL(cgroup_bpf_enabled_key);
+/* __always_inline is necessary to prevent indirect call through run_prog
+ * function pointer.
+ */
+static __always_inline int
+bpf_prog_run_array_cg(const struct cgroup_bpf *cgrp,
+ enum cgroup_bpf_attach_type atype,
+ const void *ctx, bpf_prog_run_fn run_prog,
+ int retval, u32 *ret_flags)
+{
+ const struct bpf_prog_array_item *item;
+ const struct bpf_prog *prog;
+ const struct bpf_prog_array *array;
+ struct bpf_run_ctx *old_run_ctx;
+ struct bpf_cg_run_ctx run_ctx;
+ u32 func_ret;
+
+ run_ctx.retval = retval;
+ migrate_disable();
+ rcu_read_lock();
+ array = rcu_dereference(cgrp->effective[atype]);
+ item = &array->items[0];
+ old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
+ while ((prog = READ_ONCE(item->prog))) {
+ run_ctx.prog_item = item;
+ func_ret = run_prog(prog, ctx);
+ if (ret_flags) {
+ *(ret_flags) |= (func_ret >> 1);
+ func_ret &= 1;
+ }
+ if (!func_ret && !IS_ERR_VALUE((long)run_ctx.retval))
+ run_ctx.retval = -EPERM;
+ item++;
+ }
+ bpf_reset_run_ctx(old_run_ctx);
+ rcu_read_unlock();
+ migrate_enable();
+ return run_ctx.retval;
+}
+
+unsigned int __cgroup_bpf_run_lsm_sock(const void *ctx,
+ const struct bpf_insn *insn)
+{
+ const struct bpf_prog *shim_prog;
+ struct sock *sk;
+ struct cgroup *cgrp;
+ int ret = 0;
+ u64 *args;
+
+ args = (u64 *)ctx;
+ sk = (void *)(unsigned long)args[0];
+ /*shim_prog = container_of(insn, struct bpf_prog, insnsi);*/
+ shim_prog = (const struct bpf_prog *)((void *)insn - offsetof(struct bpf_prog, insnsi));
+
+ cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+ if (likely(cgrp))
+ ret = bpf_prog_run_array_cg(&cgrp->bpf,
+ shim_prog->aux->cgroup_atype,
+ ctx, bpf_prog_run, 0, NULL);
+ return ret;
+}
+
+unsigned int __cgroup_bpf_run_lsm_socket(const void *ctx,
+ const struct bpf_insn *insn)
+{
+ const struct bpf_prog *shim_prog;
+ struct socket *sock;
+ struct cgroup *cgrp;
+ int ret = 0;
+ u64 *args;
+
+ args = (u64 *)ctx;
+ sock = (void *)(unsigned long)args[0];
+ /*shim_prog = container_of(insn, struct bpf_prog, insnsi);*/
+ shim_prog = (const struct bpf_prog *)((void *)insn - offsetof(struct bpf_prog, insnsi));
+
+ cgrp = sock_cgroup_ptr(&sock->sk->sk_cgrp_data);
+ if (likely(cgrp))
+ ret = bpf_prog_run_array_cg(&cgrp->bpf,
+ shim_prog->aux->cgroup_atype,
+ ctx, bpf_prog_run, 0, NULL);
+ return ret;
+}
+
+unsigned int __cgroup_bpf_run_lsm_current(const void *ctx,
+ const struct bpf_insn *insn)
+{
+ const struct bpf_prog *shim_prog;
+ struct cgroup *cgrp;
+ int ret = 0;
+
+ /*shim_prog = container_of(insn, struct bpf_prog, insnsi);*/
+ shim_prog = (const struct bpf_prog *)((void *)insn - offsetof(struct bpf_prog, insnsi));
+
+ /* We rely on trampoline's __bpf_prog_enter_lsm_cgroup to grab RCU read lock. */
+ cgrp = task_dfl_cgroup(current);
+ if (likely(cgrp))
+ ret = bpf_prog_run_array_cg(&cgrp->bpf,
+ shim_prog->aux->cgroup_atype,
+ ctx, bpf_prog_run, 0, NULL);
+ return ret;
+}
+
+#ifdef CONFIG_BPF_LSM
+struct cgroup_lsm_atype {
+ u32 attach_btf_id;
+ int refcnt;
+};
+
+static struct cgroup_lsm_atype cgroup_lsm_atype[CGROUP_LSM_NUM];
+
+static enum cgroup_bpf_attach_type
+bpf_cgroup_atype_find(enum bpf_attach_type attach_type, u32 attach_btf_id)
+{
+ int i;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ if (attach_type != BPF_LSM_CGROUP)
+ return to_cgroup_bpf_attach_type(attach_type);
+
+ for (i = 0; i < ARRAY_SIZE(cgroup_lsm_atype); i++)
+ if (cgroup_lsm_atype[i].attach_btf_id == attach_btf_id)
+ return CGROUP_LSM_START + i;
+
+ for (i = 0; i < ARRAY_SIZE(cgroup_lsm_atype); i++)
+ if (cgroup_lsm_atype[i].attach_btf_id == 0)
+ return CGROUP_LSM_START + i;
+
+ return -E2BIG;
+
+}
+
+void bpf_cgroup_atype_get(u32 attach_btf_id, int cgroup_atype)
+{
+ int i = cgroup_atype - CGROUP_LSM_START;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ WARN_ON_ONCE(cgroup_lsm_atype[i].attach_btf_id &&
+ cgroup_lsm_atype[i].attach_btf_id != attach_btf_id);
+
+ cgroup_lsm_atype[i].attach_btf_id = attach_btf_id;
+ cgroup_lsm_atype[i].refcnt++;
+}
+
+void bpf_cgroup_atype_put(int cgroup_atype)
+{
+ int i = cgroup_atype - CGROUP_LSM_START;
+
+ cgroup_lock();
+ if (--cgroup_lsm_atype[i].refcnt <= 0)
+ cgroup_lsm_atype[i].attach_btf_id = 0;
+ WARN_ON_ONCE(cgroup_lsm_atype[i].refcnt < 0);
+ cgroup_unlock();
+}
+#else
+static enum cgroup_bpf_attach_type
+bpf_cgroup_atype_find(enum bpf_attach_type attach_type, u32 attach_btf_id)
+{
+ if (attach_type != BPF_LSM_CGROUP)
+ return to_cgroup_bpf_attach_type(attach_type);
+ return -EOPNOTSUPP;
+}
+#endif /* CONFIG_BPF_LSM */
+
void cgroup_bpf_offline(struct cgroup *cgrp)
{
cgroup_get(cgrp);
@@ -113,25 +280,32 @@ static void cgroup_bpf_release(struct work_struct *work)
struct list_head *storages = &cgrp->bpf.storages;
struct bpf_cgroup_storage *storage, *stmp;
- unsigned int type;
+ unsigned int atype;
- mutex_lock(&cgroup_mutex);
+ cgroup_lock();
- for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) {
- struct list_head *progs = &cgrp->bpf.progs[type];
- struct bpf_prog_list *pl, *pltmp;
+ for (atype = 0; atype < ARRAY_SIZE(cgrp->bpf.progs); atype++) {
+ struct hlist_head *progs = &cgrp->bpf.progs[atype];
+ struct bpf_prog_list *pl;
+ struct hlist_node *pltmp;
- list_for_each_entry_safe(pl, pltmp, progs, node) {
- list_del(&pl->node);
- if (pl->prog)
+ hlist_for_each_entry_safe(pl, pltmp, progs, node) {
+ hlist_del(&pl->node);
+ if (pl->prog) {
+ if (pl->prog->expected_attach_type == BPF_LSM_CGROUP)
+ bpf_trampoline_unlink_cgroup_shim(pl->prog);
bpf_prog_put(pl->prog);
- if (pl->link)
+ }
+ if (pl->link) {
+ if (pl->link->link.prog->expected_attach_type == BPF_LSM_CGROUP)
+ bpf_trampoline_unlink_cgroup_shim(pl->link->link.prog);
bpf_cgroup_link_auto_detach(pl->link);
+ }
kfree(pl);
- static_branch_dec(&cgroup_bpf_enabled_key[type]);
+ static_branch_dec(&cgroup_bpf_enabled_key[atype]);
}
old_array = rcu_dereference_protected(
- cgrp->bpf.effective[type],
+ cgrp->bpf.effective[atype],
lockdep_is_held(&cgroup_mutex));
bpf_prog_array_free(old_array);
}
@@ -141,7 +315,7 @@ static void cgroup_bpf_release(struct work_struct *work)
bpf_cgroup_storage_free(storage);
}
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p))
cgroup_bpf_put(p);
@@ -178,12 +352,12 @@ static struct bpf_prog *prog_list_prog(struct bpf_prog_list *pl)
/* count number of elements in the list.
* it's slow but the list cannot be long
*/
-static u32 prog_list_length(struct list_head *head)
+static u32 prog_list_length(struct hlist_head *head)
{
struct bpf_prog_list *pl;
u32 cnt = 0;
- list_for_each_entry(pl, head, node) {
+ hlist_for_each_entry(pl, head, node) {
if (!prog_list_prog(pl))
continue;
cnt++;
@@ -196,7 +370,7 @@ static u32 prog_list_length(struct list_head *head)
* if parent has overridable or multi-prog, allow attaching
*/
static bool hierarchy_allows_attach(struct cgroup *cgrp,
- enum bpf_attach_type type)
+ enum cgroup_bpf_attach_type atype)
{
struct cgroup *p;
@@ -204,12 +378,12 @@ static bool hierarchy_allows_attach(struct cgroup *cgrp,
if (!p)
return true;
do {
- u32 flags = p->bpf.flags[type];
+ u32 flags = p->bpf.flags[atype];
u32 cnt;
if (flags & BPF_F_ALLOW_MULTI)
return true;
- cnt = prog_list_length(&p->bpf.progs[type]);
+ cnt = prog_list_length(&p->bpf.progs[atype]);
WARN_ON_ONCE(cnt > 1);
if (cnt == 1)
return !!(flags & BPF_F_ALLOW_OVERRIDE);
@@ -225,7 +399,7 @@ static bool hierarchy_allows_attach(struct cgroup *cgrp,
* to programs in this cgroup
*/
static int compute_effective_progs(struct cgroup *cgrp,
- enum bpf_attach_type type,
+ enum cgroup_bpf_attach_type atype,
struct bpf_prog_array **array)
{
struct bpf_prog_array_item *item;
@@ -236,8 +410,8 @@ static int compute_effective_progs(struct cgroup *cgrp,
/* count number of effective programs by walking parents */
do {
- if (cnt == 0 || (p->bpf.flags[type] & BPF_F_ALLOW_MULTI))
- cnt += prog_list_length(&p->bpf.progs[type]);
+ if (cnt == 0 || (p->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
+ cnt += prog_list_length(&p->bpf.progs[atype]);
p = cgroup_parent(p);
} while (p);
@@ -249,10 +423,10 @@ static int compute_effective_progs(struct cgroup *cgrp,
cnt = 0;
p = cgrp;
do {
- if (cnt > 0 && !(p->bpf.flags[type] & BPF_F_ALLOW_MULTI))
+ if (cnt > 0 && !(p->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
continue;
- list_for_each_entry(pl, &p->bpf.progs[type], node) {
+ hlist_for_each_entry(pl, &p->bpf.progs[atype], node) {
if (!prog_list_prog(pl))
continue;
@@ -269,10 +443,10 @@ static int compute_effective_progs(struct cgroup *cgrp,
}
static void activate_effective_progs(struct cgroup *cgrp,
- enum bpf_attach_type type,
+ enum cgroup_bpf_attach_type atype,
struct bpf_prog_array *old_array)
{
- old_array = rcu_replace_pointer(cgrp->bpf.effective[type], old_array,
+ old_array = rcu_replace_pointer(cgrp->bpf.effective[atype], old_array,
lockdep_is_held(&cgroup_mutex));
/* free prog array after grace period, since __cgroup_bpf_run_*()
* might be still walking the array
@@ -303,7 +477,7 @@ int cgroup_bpf_inherit(struct cgroup *cgrp)
cgroup_bpf_get(p);
for (i = 0; i < NR; i++)
- INIT_LIST_HEAD(&cgrp->bpf.progs[i]);
+ INIT_HLIST_HEAD(&cgrp->bpf.progs[i]);
INIT_LIST_HEAD(&cgrp->bpf.storages);
@@ -328,7 +502,7 @@ cleanup:
}
static int update_effective_progs(struct cgroup *cgrp,
- enum bpf_attach_type type)
+ enum cgroup_bpf_attach_type atype)
{
struct cgroup_subsys_state *css;
int err;
@@ -340,7 +514,7 @@ static int update_effective_progs(struct cgroup *cgrp,
if (percpu_ref_is_zero(&desc->bpf.refcnt))
continue;
- err = compute_effective_progs(desc, type, &desc->bpf.inactive);
+ err = compute_effective_progs(desc, atype, &desc->bpf.inactive);
if (err)
goto cleanup;
}
@@ -357,7 +531,7 @@ static int update_effective_progs(struct cgroup *cgrp,
continue;
}
- activate_effective_progs(desc, type, desc->bpf.inactive);
+ activate_effective_progs(desc, atype, desc->bpf.inactive);
desc->bpf.inactive = NULL;
}
@@ -379,7 +553,7 @@ cleanup:
#define BPF_CGROUP_MAX_PROGS 64
-static struct bpf_prog_list *find_attach_entry(struct list_head *progs,
+static struct bpf_prog_list *find_attach_entry(struct hlist_head *progs,
struct bpf_prog *prog,
struct bpf_cgroup_link *link,
struct bpf_prog *replace_prog,
@@ -389,12 +563,12 @@ static struct bpf_prog_list *find_attach_entry(struct list_head *progs,
/* single-attach case */
if (!allow_multi) {
- if (list_empty(progs))
+ if (hlist_empty(progs))
return NULL;
- return list_first_entry(progs, typeof(*pl), node);
+ return hlist_entry(progs->first, typeof(*pl), node);
}
- list_for_each_entry(pl, progs, node) {
+ hlist_for_each_entry(pl, progs, node) {
if (prog && pl->prog == prog && prog != replace_prog)
/* disallow attaching the same prog twice */
return ERR_PTR(-EINVAL);
@@ -405,7 +579,7 @@ static struct bpf_prog_list *find_attach_entry(struct list_head *progs,
/* direct prog multi-attach w/ replacement case */
if (replace_prog) {
- list_for_each_entry(pl, progs, node) {
+ hlist_for_each_entry(pl, progs, node) {
if (pl->prog == replace_prog)
/* a match found */
return pl;
@@ -430,17 +604,19 @@ static struct bpf_prog_list *find_attach_entry(struct list_head *progs,
* Exactly one of @prog or @link can be non-null.
* Must be called with cgroup_mutex held.
*/
-int __cgroup_bpf_attach(struct cgroup *cgrp,
- struct bpf_prog *prog, struct bpf_prog *replace_prog,
- struct bpf_cgroup_link *link,
- enum bpf_attach_type type, u32 flags)
+static int __cgroup_bpf_attach(struct cgroup *cgrp,
+ struct bpf_prog *prog, struct bpf_prog *replace_prog,
+ struct bpf_cgroup_link *link,
+ enum bpf_attach_type type, u32 flags)
{
u32 saved_flags = (flags & (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI));
- struct list_head *progs = &cgrp->bpf.progs[type];
struct bpf_prog *old_prog = NULL;
struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {};
struct bpf_cgroup_storage *new_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {};
+ struct bpf_prog *new_prog = prog ? : link->link.prog;
+ enum cgroup_bpf_attach_type atype;
struct bpf_prog_list *pl;
+ struct hlist_head *progs;
int err;
if (((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) ||
@@ -454,10 +630,16 @@ int __cgroup_bpf_attach(struct cgroup *cgrp,
/* replace_prog implies BPF_F_REPLACE, and vice versa */
return -EINVAL;
- if (!hierarchy_allows_attach(cgrp, type))
+ atype = bpf_cgroup_atype_find(type, new_prog->aux->attach_btf_id);
+ if (atype < 0)
+ return -EINVAL;
+
+ progs = &cgrp->bpf.progs[atype];
+
+ if (!hierarchy_allows_attach(cgrp, atype))
return -EPERM;
- if (!list_empty(progs) && cgrp->bpf.flags[type] != saved_flags)
+ if (!hlist_empty(progs) && cgrp->bpf.flags[atype] != saved_flags)
/* Disallow attaching non-overridable on top
* of existing overridable in this cgroup.
* Disallow attaching multi-prog if overridable or none
@@ -479,30 +661,53 @@ int __cgroup_bpf_attach(struct cgroup *cgrp,
if (pl) {
old_prog = pl->prog;
} else {
+ struct hlist_node *last = NULL;
+
pl = kmalloc(sizeof(*pl), GFP_KERNEL);
if (!pl) {
bpf_cgroup_storages_free(new_storage);
return -ENOMEM;
}
- list_add_tail(&pl->node, progs);
+ if (hlist_empty(progs))
+ hlist_add_head(&pl->node, progs);
+ else
+ hlist_for_each(last, progs) {
+ if (last->next)
+ continue;
+ hlist_add_behind(&pl->node, last);
+ break;
+ }
}
pl->prog = prog;
pl->link = link;
bpf_cgroup_storages_assign(pl->storage, storage);
- cgrp->bpf.flags[type] = saved_flags;
+ cgrp->bpf.flags[atype] = saved_flags;
+
+ if (type == BPF_LSM_CGROUP) {
+ err = bpf_trampoline_link_cgroup_shim(new_prog, atype);
+ if (err)
+ goto cleanup;
+ }
- err = update_effective_progs(cgrp, type);
+ err = update_effective_progs(cgrp, atype);
if (err)
- goto cleanup;
+ goto cleanup_trampoline;
- if (old_prog)
+ if (old_prog) {
+ if (type == BPF_LSM_CGROUP)
+ bpf_trampoline_unlink_cgroup_shim(old_prog);
bpf_prog_put(old_prog);
- else
- static_branch_inc(&cgroup_bpf_enabled_key[type]);
+ } else {
+ static_branch_inc(&cgroup_bpf_enabled_key[atype]);
+ }
bpf_cgroup_storages_link(new_storage, cgrp, type);
return 0;
+cleanup_trampoline:
+ if (type == BPF_LSM_CGROUP)
+ bpf_trampoline_unlink_cgroup_shim(new_prog);
+
cleanup:
if (old_prog) {
pl->prog = old_prog;
@@ -510,24 +715,38 @@ cleanup:
}
bpf_cgroup_storages_free(new_storage);
if (!old_prog) {
- list_del(&pl->node);
+ hlist_del(&pl->node);
kfree(pl);
}
return err;
}
+static int cgroup_bpf_attach(struct cgroup *cgrp,
+ struct bpf_prog *prog, struct bpf_prog *replace_prog,
+ struct bpf_cgroup_link *link,
+ enum bpf_attach_type type,
+ u32 flags)
+{
+ int ret;
+
+ cgroup_lock();
+ ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, link, type, flags);
+ cgroup_unlock();
+ return ret;
+}
+
/* Swap updated BPF program for given link in effective program arrays across
* all descendant cgroups. This function is guaranteed to succeed.
*/
static void replace_effective_prog(struct cgroup *cgrp,
- enum bpf_attach_type type,
+ enum cgroup_bpf_attach_type atype,
struct bpf_cgroup_link *link)
{
struct bpf_prog_array_item *item;
struct cgroup_subsys_state *css;
struct bpf_prog_array *progs;
struct bpf_prog_list *pl;
- struct list_head *head;
+ struct hlist_head *head;
struct cgroup *cg;
int pos;
@@ -539,11 +758,11 @@ static void replace_effective_prog(struct cgroup *cgrp,
/* find position of link in effective progs array */
for (pos = 0, cg = desc; cg; cg = cgroup_parent(cg)) {
- if (pos && !(cg->bpf.flags[type] & BPF_F_ALLOW_MULTI))
+ if (pos && !(cg->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
continue;
- head = &cg->bpf.progs[type];
- list_for_each_entry(pl, head, node) {
+ head = &cg->bpf.progs[atype];
+ hlist_for_each_entry(pl, head, node) {
if (!prog_list_prog(pl))
continue;
if (pl->link == link)
@@ -554,7 +773,7 @@ static void replace_effective_prog(struct cgroup *cgrp,
found:
BUG_ON(!cg);
progs = rcu_dereference_protected(
- desc->bpf.effective[type],
+ desc->bpf.effective[atype],
lockdep_is_held(&cgroup_mutex));
item = &progs->items[pos];
WRITE_ONCE(item->prog, link->link.prog);
@@ -574,15 +793,22 @@ static int __cgroup_bpf_replace(struct cgroup *cgrp,
struct bpf_cgroup_link *link,
struct bpf_prog *new_prog)
{
- struct list_head *progs = &cgrp->bpf.progs[link->type];
+ enum cgroup_bpf_attach_type atype;
struct bpf_prog *old_prog;
struct bpf_prog_list *pl;
+ struct hlist_head *progs;
bool found = false;
+ atype = bpf_cgroup_atype_find(link->type, new_prog->aux->attach_btf_id);
+ if (atype < 0)
+ return -EINVAL;
+
+ progs = &cgrp->bpf.progs[atype];
+
if (link->link.prog->type != new_prog->type)
return -EINVAL;
- list_for_each_entry(pl, progs, node) {
+ hlist_for_each_entry(pl, progs, node) {
if (pl->link == link) {
found = true;
break;
@@ -592,7 +818,7 @@ static int __cgroup_bpf_replace(struct cgroup *cgrp,
return -ENOENT;
old_prog = xchg(&link->link.prog, new_prog);
- replace_effective_prog(cgrp, link->type, link);
+ replace_effective_prog(cgrp, atype, link);
bpf_prog_put(old_prog);
return 0;
}
@@ -605,7 +831,7 @@ static int cgroup_bpf_replace(struct bpf_link *link, struct bpf_prog *new_prog,
cg_link = container_of(link, struct bpf_cgroup_link, link);
- mutex_lock(&cgroup_mutex);
+ cgroup_lock();
/* link might have been auto-released by dying cgroup, so fail */
if (!cg_link->cgroup) {
ret = -ENOLINK;
@@ -617,11 +843,11 @@ static int cgroup_bpf_replace(struct bpf_link *link, struct bpf_prog *new_prog,
}
ret = __cgroup_bpf_replace(cg_link->cgroup, cg_link, new_prog);
out_unlock:
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
return ret;
}
-static struct bpf_prog_list *find_detach_entry(struct list_head *progs,
+static struct bpf_prog_list *find_detach_entry(struct hlist_head *progs,
struct bpf_prog *prog,
struct bpf_cgroup_link *link,
bool allow_multi)
@@ -629,14 +855,14 @@ static struct bpf_prog_list *find_detach_entry(struct list_head *progs,
struct bpf_prog_list *pl;
if (!allow_multi) {
- if (list_empty(progs))
+ if (hlist_empty(progs))
/* report error when trying to detach and nothing is attached */
return ERR_PTR(-ENOENT);
/* to maintain backward compatibility NONE and OVERRIDE cgroups
* allow detaching with invalid FD (prog==NULL) in legacy mode
*/
- return list_first_entry(progs, typeof(*pl), node);
+ return hlist_entry(progs->first, typeof(*pl), node);
}
if (!prog && !link)
@@ -646,7 +872,7 @@ static struct bpf_prog_list *find_detach_entry(struct list_head *progs,
return ERR_PTR(-EINVAL);
/* find the prog or link and detach it */
- list_for_each_entry(pl, progs, node) {
+ hlist_for_each_entry(pl, progs, node) {
if (pl->prog == prog && pl->link == link)
return pl;
}
@@ -654,24 +880,93 @@ static struct bpf_prog_list *find_detach_entry(struct list_head *progs,
}
/**
+ * purge_effective_progs() - After compute_effective_progs fails to alloc new
+ * cgrp->bpf.inactive table we can recover by
+ * recomputing the array in place.
+ *
+ * @cgrp: The cgroup which descendants to travers
+ * @prog: A program to detach or NULL
+ * @link: A link to detach or NULL
+ * @atype: Type of detach operation
+ */
+static void purge_effective_progs(struct cgroup *cgrp, struct bpf_prog *prog,
+ struct bpf_cgroup_link *link,
+ enum cgroup_bpf_attach_type atype)
+{
+ struct cgroup_subsys_state *css;
+ struct bpf_prog_array *progs;
+ struct bpf_prog_list *pl;
+ struct hlist_head *head;
+ struct cgroup *cg;
+ int pos;
+
+ /* recompute effective prog array in place */
+ css_for_each_descendant_pre(css, &cgrp->self) {
+ struct cgroup *desc = container_of(css, struct cgroup, self);
+
+ if (percpu_ref_is_zero(&desc->bpf.refcnt))
+ continue;
+
+ /* find position of link or prog in effective progs array */
+ for (pos = 0, cg = desc; cg; cg = cgroup_parent(cg)) {
+ if (pos && !(cg->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
+ continue;
+
+ head = &cg->bpf.progs[atype];
+ hlist_for_each_entry(pl, head, node) {
+ if (!prog_list_prog(pl))
+ continue;
+ if (pl->prog == prog && pl->link == link)
+ goto found;
+ pos++;
+ }
+ }
+
+ /* no link or prog match, skip the cgroup of this layer */
+ continue;
+found:
+ progs = rcu_dereference_protected(
+ desc->bpf.effective[atype],
+ lockdep_is_held(&cgroup_mutex));
+
+ /* Remove the program from the array */
+ WARN_ONCE(bpf_prog_array_delete_safe_at(progs, pos),
+ "Failed to purge a prog from array at index %d", pos);
+ }
+}
+
+/**
* __cgroup_bpf_detach() - Detach the program or link from a cgroup, and
* propagate the change to descendants
* @cgrp: The cgroup which descendants to traverse
* @prog: A program to detach or NULL
- * @prog: A link to detach or NULL
+ * @link: A link to detach or NULL
* @type: Type of detach operation
*
* At most one of @prog or @link can be non-NULL.
* Must be called with cgroup_mutex held.
*/
-int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
- struct bpf_cgroup_link *link, enum bpf_attach_type type)
+static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
+ struct bpf_cgroup_link *link, enum bpf_attach_type type)
{
- struct list_head *progs = &cgrp->bpf.progs[type];
- u32 flags = cgrp->bpf.flags[type];
- struct bpf_prog_list *pl;
+ enum cgroup_bpf_attach_type atype;
struct bpf_prog *old_prog;
- int err;
+ struct bpf_prog_list *pl;
+ struct hlist_head *progs;
+ u32 attach_btf_id = 0;
+ u32 flags;
+
+ if (prog)
+ attach_btf_id = prog->aux->attach_btf_id;
+ if (link)
+ attach_btf_id = link->link.prog->aux->attach_btf_id;
+
+ atype = bpf_cgroup_atype_find(type, attach_btf_id);
+ if (atype < 0)
+ return -EINVAL;
+
+ progs = &cgrp->bpf.progs[atype];
+ flags = cgrp->bpf.flags[atype];
if (prog && link)
/* only one of prog or link can be specified */
@@ -686,79 +981,151 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
pl->prog = NULL;
pl->link = NULL;
- err = update_effective_progs(cgrp, type);
- if (err)
- goto cleanup;
+ if (update_effective_progs(cgrp, atype)) {
+ /* if update effective array failed replace the prog with a dummy prog*/
+ pl->prog = old_prog;
+ pl->link = link;
+ purge_effective_progs(cgrp, old_prog, link, atype);
+ }
/* now can actually delete it from this cgroup list */
- list_del(&pl->node);
+ hlist_del(&pl->node);
+
kfree(pl);
- if (list_empty(progs))
+ if (hlist_empty(progs))
/* last program was detached, reset flags to zero */
- cgrp->bpf.flags[type] = 0;
- if (old_prog)
+ cgrp->bpf.flags[atype] = 0;
+ if (old_prog) {
+ if (type == BPF_LSM_CGROUP)
+ bpf_trampoline_unlink_cgroup_shim(old_prog);
bpf_prog_put(old_prog);
- static_branch_dec(&cgroup_bpf_enabled_key[type]);
+ }
+ static_branch_dec(&cgroup_bpf_enabled_key[atype]);
return 0;
+}
-cleanup:
- /* restore back prog or link */
- pl->prog = old_prog;
- pl->link = link;
- return err;
+static int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
+ enum bpf_attach_type type)
+{
+ int ret;
+
+ cgroup_lock();
+ ret = __cgroup_bpf_detach(cgrp, prog, NULL, type);
+ cgroup_unlock();
+ return ret;
}
/* Must be called with cgroup_mutex held to avoid races. */
-int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
- union bpf_attr __user *uattr)
+static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
{
+ __u32 __user *prog_attach_flags = u64_to_user_ptr(attr->query.prog_attach_flags);
+ bool effective_query = attr->query.query_flags & BPF_F_QUERY_EFFECTIVE;
__u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids);
enum bpf_attach_type type = attr->query.attach_type;
- struct list_head *progs = &cgrp->bpf.progs[type];
- u32 flags = cgrp->bpf.flags[type];
+ enum cgroup_bpf_attach_type from_atype, to_atype;
+ enum cgroup_bpf_attach_type atype;
struct bpf_prog_array *effective;
- struct bpf_prog *prog;
int cnt, ret = 0, i;
+ int total_cnt = 0;
+ u32 flags;
- effective = rcu_dereference_protected(cgrp->bpf.effective[type],
- lockdep_is_held(&cgroup_mutex));
+ if (effective_query && prog_attach_flags)
+ return -EINVAL;
- if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE)
- cnt = bpf_prog_array_length(effective);
- else
- cnt = prog_list_length(progs);
+ if (type == BPF_LSM_CGROUP) {
+ if (!effective_query && attr->query.prog_cnt &&
+ prog_ids && !prog_attach_flags)
+ return -EINVAL;
+
+ from_atype = CGROUP_LSM_START;
+ to_atype = CGROUP_LSM_END;
+ flags = 0;
+ } else {
+ from_atype = to_cgroup_bpf_attach_type(type);
+ if (from_atype < 0)
+ return -EINVAL;
+ to_atype = from_atype;
+ flags = cgrp->bpf.flags[from_atype];
+ }
+ for (atype = from_atype; atype <= to_atype; atype++) {
+ if (effective_query) {
+ effective = rcu_dereference_protected(cgrp->bpf.effective[atype],
+ lockdep_is_held(&cgroup_mutex));
+ total_cnt += bpf_prog_array_length(effective);
+ } else {
+ total_cnt += prog_list_length(&cgrp->bpf.progs[atype]);
+ }
+ }
+
+ /* always output uattr->query.attach_flags as 0 during effective query */
+ flags = effective_query ? 0 : flags;
if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags)))
return -EFAULT;
- if (copy_to_user(&uattr->query.prog_cnt, &cnt, sizeof(cnt)))
+ if (copy_to_user(&uattr->query.prog_cnt, &total_cnt, sizeof(total_cnt)))
return -EFAULT;
- if (attr->query.prog_cnt == 0 || !prog_ids || !cnt)
+ if (attr->query.prog_cnt == 0 || !prog_ids || !total_cnt)
/* return early if user requested only program count + flags */
return 0;
- if (attr->query.prog_cnt < cnt) {
- cnt = attr->query.prog_cnt;
+
+ if (attr->query.prog_cnt < total_cnt) {
+ total_cnt = attr->query.prog_cnt;
ret = -ENOSPC;
}
- if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) {
- return bpf_prog_array_copy_to_user(effective, prog_ids, cnt);
- } else {
- struct bpf_prog_list *pl;
- u32 id;
-
- i = 0;
- list_for_each_entry(pl, progs, node) {
- prog = prog_list_prog(pl);
- id = prog->aux->id;
- if (copy_to_user(prog_ids + i, &id, sizeof(id)))
- return -EFAULT;
- if (++i == cnt)
- break;
+ for (atype = from_atype; atype <= to_atype && total_cnt; atype++) {
+ if (effective_query) {
+ effective = rcu_dereference_protected(cgrp->bpf.effective[atype],
+ lockdep_is_held(&cgroup_mutex));
+ cnt = min_t(int, bpf_prog_array_length(effective), total_cnt);
+ ret = bpf_prog_array_copy_to_user(effective, prog_ids, cnt);
+ } else {
+ struct hlist_head *progs;
+ struct bpf_prog_list *pl;
+ struct bpf_prog *prog;
+ u32 id;
+
+ progs = &cgrp->bpf.progs[atype];
+ cnt = min_t(int, prog_list_length(progs), total_cnt);
+ i = 0;
+ hlist_for_each_entry(pl, progs, node) {
+ prog = prog_list_prog(pl);
+ id = prog->aux->id;
+ if (copy_to_user(prog_ids + i, &id, sizeof(id)))
+ return -EFAULT;
+ if (++i == cnt)
+ break;
+ }
+
+ if (prog_attach_flags) {
+ flags = cgrp->bpf.flags[atype];
+
+ for (i = 0; i < cnt; i++)
+ if (copy_to_user(prog_attach_flags + i,
+ &flags, sizeof(flags)))
+ return -EFAULT;
+ prog_attach_flags += cnt;
+ }
}
+
+ prog_ids += cnt;
+ total_cnt -= cnt;
}
return ret;
}
+static int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ int ret;
+
+ cgroup_lock();
+ ret = __cgroup_bpf_query(cgrp, attr, uattr);
+ cgroup_unlock();
+ return ret;
+}
+
int cgroup_bpf_prog_attach(const union bpf_attr *attr,
enum bpf_prog_type ptype, struct bpf_prog *prog)
{
@@ -822,21 +1189,23 @@ static void bpf_cgroup_link_release(struct bpf_link *link)
if (!cg_link->cgroup)
return;
- mutex_lock(&cgroup_mutex);
+ cgroup_lock();
/* re-check cgroup under lock again */
if (!cg_link->cgroup) {
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
return;
}
WARN_ON(__cgroup_bpf_detach(cg_link->cgroup, NULL, cg_link,
cg_link->type));
+ if (cg_link->type == BPF_LSM_CGROUP)
+ bpf_trampoline_unlink_cgroup_shim(cg_link->link.prog);
cg = cg_link->cgroup;
cg_link->cgroup = NULL;
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
cgroup_put(cg);
}
@@ -863,10 +1232,10 @@ static void bpf_cgroup_link_show_fdinfo(const struct bpf_link *link,
container_of(link, struct bpf_cgroup_link, link);
u64 cg_id = 0;
- mutex_lock(&cgroup_mutex);
+ cgroup_lock();
if (cg_link->cgroup)
cg_id = cgroup_id(cg_link->cgroup);
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
seq_printf(seq,
"cgroup_id:\t%llu\n"
@@ -882,10 +1251,10 @@ static int bpf_cgroup_link_fill_link_info(const struct bpf_link *link,
container_of(link, struct bpf_cgroup_link, link);
u64 cg_id = 0;
- mutex_lock(&cgroup_mutex);
+ cgroup_lock();
if (cg_link->cgroup)
cg_id = cgroup_id(cg_link->cgroup);
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
info->cgroup.cgroup_id = cg_id;
info->cgroup.attach_type = cg_link->type;
@@ -925,14 +1294,14 @@ int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
link->cgroup = cgrp;
link->type = attr->link_create.attach_type;
- err = bpf_link_prime(&link->link, &link_primer);
+ err = bpf_link_prime(&link->link, &link_primer);
if (err) {
kfree(link);
goto out_put_cgroup;
}
- err = cgroup_bpf_attach(cgrp, NULL, NULL, link, link->type,
- BPF_F_ALLOW_MULTI);
+ err = cgroup_bpf_attach(cgrp, NULL, NULL, link,
+ link->type, BPF_F_ALLOW_MULTI);
if (err) {
bpf_link_cleanup(&link_primer);
goto out_put_cgroup;
@@ -965,7 +1334,7 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr,
* __cgroup_bpf_run_filter_skb() - Run a program for packet filtering
* @sk: The socket sending or receiving traffic
* @skb: The skb that is being sent or received
- * @type: The type of program to be exectuted
+ * @type: The type of program to be executed
*
* If no socket is passed, or the socket is not of type INET or INET6,
* this function does nothing and returns 0.
@@ -978,7 +1347,7 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr,
* NET_XMIT_DROP (1) - drop packet and notify TCP to call cwr
* NET_XMIT_CN (2) - continue with packet output and notify TCP
* to call cwr
- * -EPERM - drop packet
+ * -err - drop packet
*
* For ingress packets, this function will return -EPERM if any
* attached program was found and if it returned != 1 during execution.
@@ -986,7 +1355,7 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr,
*/
int __cgroup_bpf_run_filter_skb(struct sock *sk,
struct sk_buff *skb,
- enum bpf_attach_type type)
+ enum cgroup_bpf_attach_type atype)
{
unsigned int offset = skb->data - skb_network_header(skb);
struct sock *save_sk;
@@ -1008,13 +1377,41 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,
/* compute pointers for the bpf prog */
bpf_compute_and_save_data_end(skb, &saved_data_end);
- if (type == BPF_CGROUP_INET_EGRESS) {
- ret = BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY(
- cgrp->bpf.effective[type], skb, __bpf_prog_run_save_cb);
+ if (atype == CGROUP_INET_EGRESS) {
+ u32 flags = 0;
+ bool cn;
+
+ ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, skb,
+ __bpf_prog_run_save_cb, 0, &flags);
+
+ /* Return values of CGROUP EGRESS BPF programs are:
+ * 0: drop packet
+ * 1: keep packet
+ * 2: drop packet and cn
+ * 3: keep packet and cn
+ *
+ * The returned value is then converted to one of the NET_XMIT
+ * or an error code that is then interpreted as drop packet
+ * (and no cn):
+ * 0: NET_XMIT_SUCCESS skb should be transmitted
+ * 1: NET_XMIT_DROP skb should be dropped and cn
+ * 2: NET_XMIT_CN skb should be transmitted and cn
+ * 3: -err skb should be dropped
+ */
+
+ cn = flags & BPF_RET_SET_CN;
+ if (ret && !IS_ERR_VALUE((long)ret))
+ ret = -EFAULT;
+ if (!ret)
+ ret = (cn ? NET_XMIT_CN : NET_XMIT_SUCCESS);
+ else
+ ret = (cn ? NET_XMIT_DROP : ret);
} else {
- ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb,
- __bpf_prog_run_save_cb);
- ret = (ret == 1 ? 0 : -EPERM);
+ ret = bpf_prog_run_array_cg(&cgrp->bpf, atype,
+ skb, __bpf_prog_run_save_cb, 0,
+ NULL);
+ if (ret && !IS_ERR_VALUE((long)ret))
+ ret = -EFAULT;
}
bpf_restore_data_end(skb, saved_data_end);
__skb_pull(skb, offset);
@@ -1027,7 +1424,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb);
/**
* __cgroup_bpf_run_filter_sk() - Run a program on a sock
* @sk: sock structure to manipulate
- * @type: The type of program to be exectuted
+ * @type: The type of program to be executed
*
* socket is passed is expected to be of type INET or INET6.
*
@@ -1038,13 +1435,12 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb);
* and if it returned != 1 during execution. In all other cases, 0 is returned.
*/
int __cgroup_bpf_run_filter_sk(struct sock *sk,
- enum bpf_attach_type type)
+ enum cgroup_bpf_attach_type atype)
{
struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
- int ret;
- ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sk, BPF_PROG_RUN);
- return ret == 1 ? 0 : -EPERM;
+ return bpf_prog_run_array_cg(&cgrp->bpf, atype, sk, bpf_prog_run, 0,
+ NULL);
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
@@ -1053,7 +1449,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
* provided by user sockaddr
* @sk: sock struct that will use sockaddr
* @uaddr: sockaddr struct provided by user
- * @type: The type of program to be exectuted
+ * @type: The type of program to be executed
* @t_ctx: Pointer to attach type specific context
* @flags: Pointer to u32 which contains higher bits of BPF program
* return value (OR'ed together).
@@ -1065,7 +1461,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
*/
int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
struct sockaddr *uaddr,
- enum bpf_attach_type type,
+ enum cgroup_bpf_attach_type atype,
void *t_ctx,
u32 *flags)
{
@@ -1076,7 +1472,6 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
};
struct sockaddr_storage unspec;
struct cgroup *cgrp;
- int ret;
/* Check socket family since not all sockets represent network
* endpoint (e.g. AF_UNIX).
@@ -1090,10 +1485,8 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
}
cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
- ret = BPF_PROG_RUN_ARRAY_FLAGS(cgrp->bpf.effective[type], &ctx,
- BPF_PROG_RUN, flags);
-
- return ret == 1 ? 0 : -EPERM;
+ return bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run,
+ 0, flags);
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr);
@@ -1103,7 +1496,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr);
* @sock_ops: bpf_sock_ops_kern struct to pass to program. Contains
* sk with connection information (IP addresses, etc.) May not contain
* cgroup info if it is a req sock.
- * @type: The type of program to be exectuted
+ * @type: The type of program to be executed
*
* socket passed is expected to be of type INET or INET6.
*
@@ -1115,19 +1508,17 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr);
*/
int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
struct bpf_sock_ops_kern *sock_ops,
- enum bpf_attach_type type)
+ enum cgroup_bpf_attach_type atype)
{
struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
- int ret;
- ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sock_ops,
- BPF_PROG_RUN);
- return ret == 1 ? 0 : -EPERM;
+ return bpf_prog_run_array_cg(&cgrp->bpf, atype, sock_ops, bpf_prog_run,
+ 0, NULL);
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops);
int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
- short access, enum bpf_attach_type type)
+ short access, enum cgroup_bpf_attach_type atype)
{
struct cgroup *cgrp;
struct bpf_cgroup_dev_ctx ctx = {
@@ -1135,27 +1526,92 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
.major = major,
.minor = minor,
};
- int allow = 1;
+ int ret;
rcu_read_lock();
cgrp = task_dfl_cgroup(current);
- allow = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx,
- BPF_PROG_RUN);
+ ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run, 0,
+ NULL);
rcu_read_unlock();
- return !allow;
+ return ret;
+}
+
+BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags)
+{
+ /* flags argument is not used now,
+ * but provides an ability to extend the API.
+ * verifier checks that its value is correct.
+ */
+ enum bpf_cgroup_storage_type stype = cgroup_storage_type(map);
+ struct bpf_cgroup_storage *storage;
+ struct bpf_cg_run_ctx *ctx;
+ void *ptr;
+
+ /* get current cgroup storage from BPF run context */
+ ctx = container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx);
+ storage = ctx->prog_item->cgroup_storage[stype];
+
+ if (stype == BPF_CGROUP_STORAGE_SHARED)
+ ptr = &READ_ONCE(storage->buf)->data[0];
+ else
+ ptr = this_cpu_ptr(storage->percpu_buf);
+
+ return (unsigned long)ptr;
+}
+
+const struct bpf_func_proto bpf_get_local_storage_proto = {
+ .func = bpf_get_local_storage,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_MAP_VALUE,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_ANYTHING,
+};
+
+BPF_CALL_0(bpf_get_retval)
+{
+ struct bpf_cg_run_ctx *ctx =
+ container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx);
+
+ return ctx->retval;
+}
+
+const struct bpf_func_proto bpf_get_retval_proto = {
+ .func = bpf_get_retval,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+};
+
+BPF_CALL_1(bpf_set_retval, int, retval)
+{
+ struct bpf_cg_run_ctx *ctx =
+ container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx);
+
+ ctx->retval = retval;
+ return 0;
}
+const struct bpf_func_proto bpf_set_retval_proto = {
+ .func = bpf_set_retval,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_ANYTHING,
+};
+
static const struct bpf_func_proto *
-cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
+ const struct bpf_func_proto *func_proto;
+
+ func_proto = cgroup_common_func_proto(func_id, prog);
+ if (func_proto)
+ return func_proto;
+
+ func_proto = cgroup_current_func_proto(func_id, prog);
+ if (func_proto)
+ return func_proto;
+
switch (func_id) {
- case BPF_FUNC_get_current_uid_gid:
- return &bpf_get_current_uid_gid_proto;
- case BPF_FUNC_get_local_storage:
- return &bpf_get_local_storage_proto;
- case BPF_FUNC_get_current_cgroup_id:
- return &bpf_get_current_cgroup_id_proto;
case BPF_FUNC_perf_event_output:
return &bpf_event_output_data_proto;
default:
@@ -1163,12 +1619,6 @@ cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
}
}
-static const struct bpf_func_proto *
-cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
-{
- return cgroup_base_func_proto(func_id, prog);
-}
-
static bool cgroup_dev_is_valid_access(int off, int size,
enum bpf_access_type type,
const struct bpf_prog *prog,
@@ -1231,7 +1681,7 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = {
int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
struct ctl_table *table, int write,
char **buf, size_t *pcount, loff_t *ppos,
- enum bpf_attach_type type)
+ enum cgroup_bpf_attach_type atype)
{
struct bpf_sysctl_kern ctx = {
.head = head,
@@ -1271,7 +1721,8 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
rcu_read_lock();
cgrp = task_dfl_cgroup(current);
- ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN);
+ ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run, 0,
+ NULL);
rcu_read_unlock();
kfree(ctx.cur_val);
@@ -1284,24 +1735,10 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
kfree(ctx.new_val);
}
- return ret == 1 ? 0 : -EPERM;
+ return ret;
}
#ifdef CONFIG_NET
-static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp,
- enum bpf_attach_type attach_type)
-{
- struct bpf_prog_array *prog_array;
- bool empty;
-
- rcu_read_lock();
- prog_array = rcu_dereference(cgrp->bpf.effective[attach_type]);
- empty = bpf_prog_array_is_empty(prog_array);
- rcu_read_unlock();
-
- return empty;
-}
-
static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen,
struct bpf_sockopt_buf *buf)
{
@@ -1360,19 +1797,11 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
};
int ret, max_optlen;
- /* Opportunistic check to see whether we have any BPF program
- * attached to the hook so we don't waste time allocating
- * memory and locking the socket.
- */
- if (__cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_SETSOCKOPT))
- return 0;
-
/* Allocate a bit more than the initial user buffer for
* BPF program. The canonical use case is overriding
* TCP_CONGESTION(nv) to TCP_CONGESTION(cubic).
*/
max_optlen = max_t(int, 16, *optlen);
-
max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf);
if (max_optlen < 0)
return max_optlen;
@@ -1385,14 +1814,12 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
}
lock_sock(sk);
- ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_SETSOCKOPT],
- &ctx, BPF_PROG_RUN);
+ ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_SETSOCKOPT,
+ &ctx, bpf_prog_run, 0, NULL);
release_sock(sk);
- if (!ret) {
- ret = -EPERM;
+ if (ret)
goto out;
- }
if (ctx.optlen == -1) {
/* optlen set to -1, bypass kernel */
@@ -1452,19 +1879,11 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
.sk = sk,
.level = level,
.optname = optname,
- .retval = retval,
+ .current_task = current,
};
int ret;
- /* Opportunistic check to see whether we have any BPF program
- * attached to the hook so we don't waste time allocating
- * memory and locking the socket.
- */
- if (__cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_GETSOCKOPT))
- return retval;
-
ctx.optlen = max_optlen;
-
max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf);
if (max_optlen < 0)
return max_optlen;
@@ -1495,38 +1914,29 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
}
lock_sock(sk);
- ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT],
- &ctx, BPF_PROG_RUN);
+ ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_GETSOCKOPT,
+ &ctx, bpf_prog_run, retval, NULL);
release_sock(sk);
- if (!ret) {
- ret = -EPERM;
- goto out;
- }
-
- if (ctx.optlen > max_optlen || ctx.optlen < 0) {
- ret = -EFAULT;
+ if (ret < 0)
goto out;
- }
- /* BPF programs only allowed to set retval to 0, not some
- * arbitrary value.
- */
- if (ctx.retval != 0 && ctx.retval != retval) {
+ if (optval && (ctx.optlen > max_optlen || ctx.optlen < 0)) {
ret = -EFAULT;
goto out;
}
if (ctx.optlen != 0) {
- if (copy_to_user(optval, ctx.optval, ctx.optlen) ||
- put_user(ctx.optlen, optlen)) {
+ if (optval && copy_to_user(optval, ctx.optval, ctx.optlen)) {
+ ret = -EFAULT;
+ goto out;
+ }
+ if (put_user(ctx.optlen, optlen)) {
ret = -EFAULT;
goto out;
}
}
- ret = ctx.retval;
-
out:
sockopt_free_buf(&ctx, &buf);
return ret;
@@ -1541,10 +1951,10 @@ int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level,
.sk = sk,
.level = level,
.optname = optname,
- .retval = retval,
.optlen = *optlen,
.optval = optval,
.optval_end = optval + *optlen,
+ .current_task = current,
};
int ret;
@@ -1556,26 +1966,20 @@ int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level,
* be called if that data shouldn't be "exported".
*/
- ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT],
- &ctx, BPF_PROG_RUN);
- if (!ret)
- return -EPERM;
+ ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_GETSOCKOPT,
+ &ctx, bpf_prog_run, retval, NULL);
+ if (ret < 0)
+ return ret;
if (ctx.optlen > *optlen)
return -EFAULT;
- /* BPF programs only allowed to set retval to 0, not some
- * arbitrary value.
- */
- if (ctx.retval != 0 && ctx.retval != retval)
- return -EFAULT;
-
/* BPF programs can shrink the buffer, export the modifications.
*/
if (ctx.optlen != 0)
*optlen = ctx.optlen;
- return ctx.retval;
+ return ret;
}
#endif
@@ -1723,18 +2127,24 @@ static const struct bpf_func_proto bpf_sysctl_set_new_value_proto = {
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
- .arg2_type = ARG_PTR_TO_MEM,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg3_type = ARG_CONST_SIZE,
};
static const struct bpf_func_proto *
sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
+ const struct bpf_func_proto *func_proto;
+
+ func_proto = cgroup_common_func_proto(func_id, prog);
+ if (func_proto)
+ return func_proto;
+
+ func_proto = cgroup_current_func_proto(func_id, prog);
+ if (func_proto)
+ return func_proto;
+
switch (func_id) {
- case BPF_FUNC_strtol:
- return &bpf_strtol_proto;
- case BPF_FUNC_strtoul:
- return &bpf_strtoul_proto;
case BPF_FUNC_sysctl_get_name:
return &bpf_sysctl_get_name_proto;
case BPF_FUNC_sysctl_get_current_value:
@@ -1743,8 +2153,12 @@ sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_sysctl_get_new_value_proto;
case BPF_FUNC_sysctl_set_new_value:
return &bpf_sysctl_set_new_value_proto;
+ case BPF_FUNC_ktime_get_coarse_ns:
+ return &bpf_ktime_get_coarse_ns_proto;
+ case BPF_FUNC_perf_event_output:
+ return &bpf_event_output_data_proto;
default:
- return cgroup_base_func_proto(func_id, prog);
+ return bpf_base_func_proto(func_id);
}
}
@@ -1812,10 +2226,12 @@ static u32 sysctl_convert_ctx_access(enum bpf_access_type type,
BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos),
treg, si->dst_reg,
offsetof(struct bpf_sysctl_kern, ppos));
- *insn++ = BPF_STX_MEM(
- BPF_SIZEOF(u32), treg, si->src_reg,
+ *insn++ = BPF_RAW_INSN(
+ BPF_CLASS(si->code) | BPF_MEM | BPF_SIZEOF(u32),
+ treg, si->src_reg,
bpf_ctx_narrow_access_offset(
- 0, sizeof(u32), sizeof(loff_t)));
+ 0, sizeof(u32), sizeof(loff_t)),
+ si->imm);
*insn++ = BPF_LDX_MEM(
BPF_DW, treg, si->dst_reg,
offsetof(struct bpf_sysctl_kern, tmp_reg));
@@ -1846,22 +2262,60 @@ const struct bpf_verifier_ops cg_sysctl_verifier_ops = {
const struct bpf_prog_ops cg_sysctl_prog_ops = {
};
+#ifdef CONFIG_NET
+BPF_CALL_1(bpf_get_netns_cookie_sockopt, struct bpf_sockopt_kern *, ctx)
+{
+ const struct net *net = ctx ? sock_net(ctx->sk) : &init_net;
+
+ return net->net_cookie;
+}
+
+static const struct bpf_func_proto bpf_get_netns_cookie_sockopt_proto = {
+ .func = bpf_get_netns_cookie_sockopt,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX_OR_NULL,
+};
+#endif
+
static const struct bpf_func_proto *
cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
+ const struct bpf_func_proto *func_proto;
+
+ func_proto = cgroup_common_func_proto(func_id, prog);
+ if (func_proto)
+ return func_proto;
+
+ func_proto = cgroup_current_func_proto(func_id, prog);
+ if (func_proto)
+ return func_proto;
+
switch (func_id) {
#ifdef CONFIG_NET
+ case BPF_FUNC_get_netns_cookie:
+ return &bpf_get_netns_cookie_sockopt_proto;
case BPF_FUNC_sk_storage_get:
return &bpf_sk_storage_get_proto;
case BPF_FUNC_sk_storage_delete:
return &bpf_sk_storage_delete_proto;
+ case BPF_FUNC_setsockopt:
+ if (prog->expected_attach_type == BPF_CGROUP_SETSOCKOPT)
+ return &bpf_sk_setsockopt_proto;
+ return NULL;
+ case BPF_FUNC_getsockopt:
+ if (prog->expected_attach_type == BPF_CGROUP_SETSOCKOPT)
+ return &bpf_sk_getsockopt_proto;
+ return NULL;
#endif
#ifdef CONFIG_INET
case BPF_FUNC_tcp_sock:
return &bpf_tcp_sock_proto;
#endif
+ case BPF_FUNC_perf_event_output:
+ return &bpf_event_output_data_proto;
default:
- return cgroup_base_func_proto(func_id, prog);
+ return bpf_base_func_proto(func_id);
}
}
@@ -1927,10 +2381,17 @@ static bool cg_sockopt_is_valid_access(int off, int size,
return true;
}
-#define CG_SOCKOPT_ACCESS_FIELD(T, F) \
- T(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, F), \
- si->dst_reg, si->src_reg, \
- offsetof(struct bpf_sockopt_kern, F))
+#define CG_SOCKOPT_READ_FIELD(F) \
+ BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, F), \
+ si->dst_reg, si->src_reg, \
+ offsetof(struct bpf_sockopt_kern, F))
+
+#define CG_SOCKOPT_WRITE_FIELD(F) \
+ BPF_RAW_INSN((BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, F) | \
+ BPF_MEM | BPF_CLASS(si->code)), \
+ si->dst_reg, si->src_reg, \
+ offsetof(struct bpf_sockopt_kern, F), \
+ si->imm)
static u32 cg_sockopt_convert_ctx_access(enum bpf_access_type type,
const struct bpf_insn *si,
@@ -1942,37 +2403,68 @@ static u32 cg_sockopt_convert_ctx_access(enum bpf_access_type type,
switch (si->off) {
case offsetof(struct bpf_sockopt, sk):
- *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, sk);
+ *insn++ = CG_SOCKOPT_READ_FIELD(sk);
break;
case offsetof(struct bpf_sockopt, level):
if (type == BPF_WRITE)
- *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, level);
+ *insn++ = CG_SOCKOPT_WRITE_FIELD(level);
else
- *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, level);
+ *insn++ = CG_SOCKOPT_READ_FIELD(level);
break;
case offsetof(struct bpf_sockopt, optname):
if (type == BPF_WRITE)
- *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, optname);
+ *insn++ = CG_SOCKOPT_WRITE_FIELD(optname);
else
- *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optname);
+ *insn++ = CG_SOCKOPT_READ_FIELD(optname);
break;
case offsetof(struct bpf_sockopt, optlen):
if (type == BPF_WRITE)
- *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, optlen);
+ *insn++ = CG_SOCKOPT_WRITE_FIELD(optlen);
else
- *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optlen);
+ *insn++ = CG_SOCKOPT_READ_FIELD(optlen);
break;
case offsetof(struct bpf_sockopt, retval):
- if (type == BPF_WRITE)
- *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, retval);
- else
- *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, retval);
+ BUILD_BUG_ON(offsetof(struct bpf_cg_run_ctx, run_ctx) != 0);
+
+ if (type == BPF_WRITE) {
+ int treg = BPF_REG_9;
+
+ if (si->src_reg == treg || si->dst_reg == treg)
+ --treg;
+ if (si->src_reg == treg || si->dst_reg == treg)
+ --treg;
+ *insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, treg,
+ offsetof(struct bpf_sockopt_kern, tmp_reg));
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, current_task),
+ treg, si->dst_reg,
+ offsetof(struct bpf_sockopt_kern, current_task));
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct task_struct, bpf_ctx),
+ treg, treg,
+ offsetof(struct task_struct, bpf_ctx));
+ *insn++ = BPF_RAW_INSN(BPF_CLASS(si->code) | BPF_MEM |
+ BPF_FIELD_SIZEOF(struct bpf_cg_run_ctx, retval),
+ treg, si->src_reg,
+ offsetof(struct bpf_cg_run_ctx, retval),
+ si->imm);
+ *insn++ = BPF_LDX_MEM(BPF_DW, treg, si->dst_reg,
+ offsetof(struct bpf_sockopt_kern, tmp_reg));
+ } else {
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, current_task),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sockopt_kern, current_task));
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct task_struct, bpf_ctx),
+ si->dst_reg, si->dst_reg,
+ offsetof(struct task_struct, bpf_ctx));
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_cg_run_ctx, retval),
+ si->dst_reg, si->dst_reg,
+ offsetof(struct bpf_cg_run_ctx, retval));
+ }
break;
case offsetof(struct bpf_sockopt, optval):
- *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval);
+ *insn++ = CG_SOCKOPT_READ_FIELD(optval);
break;
case offsetof(struct bpf_sockopt, optval_end):
- *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval_end);
+ *insn++ = CG_SOCKOPT_READ_FIELD(optval_end);
break;
}
@@ -1997,3 +2489,65 @@ const struct bpf_verifier_ops cg_sockopt_verifier_ops = {
const struct bpf_prog_ops cg_sockopt_prog_ops = {
};
+
+/* Common helpers for cgroup hooks. */
+const struct bpf_func_proto *
+cgroup_common_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ switch (func_id) {
+ case BPF_FUNC_get_local_storage:
+ return &bpf_get_local_storage_proto;
+ case BPF_FUNC_get_retval:
+ switch (prog->expected_attach_type) {
+ case BPF_CGROUP_INET_INGRESS:
+ case BPF_CGROUP_INET_EGRESS:
+ case BPF_CGROUP_SOCK_OPS:
+ case BPF_CGROUP_UDP4_RECVMSG:
+ case BPF_CGROUP_UDP6_RECVMSG:
+ case BPF_CGROUP_INET4_GETPEERNAME:
+ case BPF_CGROUP_INET6_GETPEERNAME:
+ case BPF_CGROUP_INET4_GETSOCKNAME:
+ case BPF_CGROUP_INET6_GETSOCKNAME:
+ return NULL;
+ default:
+ return &bpf_get_retval_proto;
+ }
+ case BPF_FUNC_set_retval:
+ switch (prog->expected_attach_type) {
+ case BPF_CGROUP_INET_INGRESS:
+ case BPF_CGROUP_INET_EGRESS:
+ case BPF_CGROUP_SOCK_OPS:
+ case BPF_CGROUP_UDP4_RECVMSG:
+ case BPF_CGROUP_UDP6_RECVMSG:
+ case BPF_CGROUP_INET4_GETPEERNAME:
+ case BPF_CGROUP_INET6_GETPEERNAME:
+ case BPF_CGROUP_INET4_GETSOCKNAME:
+ case BPF_CGROUP_INET6_GETSOCKNAME:
+ return NULL;
+ default:
+ return &bpf_set_retval_proto;
+ }
+ default:
+ return NULL;
+ }
+}
+
+/* Common helpers for cgroup hooks with valid process context. */
+const struct bpf_func_proto *
+cgroup_current_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ switch (func_id) {
+ case BPF_FUNC_get_current_uid_gid:
+ return &bpf_get_current_uid_gid_proto;
+ case BPF_FUNC_get_current_pid_tgid:
+ return &bpf_get_current_pid_tgid_proto;
+ case BPF_FUNC_get_current_comm:
+ return &bpf_get_current_comm_proto;
+#ifdef CONFIG_CGROUP_NET_CLASSID
+ case BPF_FUNC_get_cgroup_classid:
+ return &bpf_get_cgroup_classid_curr_proto;
+#endif
+ default:
+ return NULL;
+ }
+}
diff --git a/kernel/bpf/cgroup_iter.c b/kernel/bpf/cgroup_iter.c
new file mode 100644
index 000000000000..810378f04fbc
--- /dev/null
+++ b/kernel/bpf/cgroup_iter.c
@@ -0,0 +1,296 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2022 Google */
+#include <linux/bpf.h>
+#include <linux/btf_ids.h>
+#include <linux/cgroup.h>
+#include <linux/kernel.h>
+#include <linux/seq_file.h>
+
+#include "../cgroup/cgroup-internal.h" /* cgroup_mutex and cgroup_is_dead */
+
+/* cgroup_iter provides four modes of traversal to the cgroup hierarchy.
+ *
+ * 1. Walk the descendants of a cgroup in pre-order.
+ * 2. Walk the descendants of a cgroup in post-order.
+ * 3. Walk the ancestors of a cgroup.
+ * 4. Show the given cgroup only.
+ *
+ * For walking descendants, cgroup_iter can walk in either pre-order or
+ * post-order. For walking ancestors, the iter walks up from a cgroup to
+ * the root.
+ *
+ * The iter program can terminate the walk early by returning 1. Walk
+ * continues if prog returns 0.
+ *
+ * The prog can check (seq->num == 0) to determine whether this is
+ * the first element. The prog may also be passed a NULL cgroup,
+ * which means the walk has completed and the prog has a chance to
+ * do post-processing, such as outputting an epilogue.
+ *
+ * Note: the iter_prog is called with cgroup_mutex held.
+ *
+ * Currently only one session is supported, which means, depending on the
+ * volume of data bpf program intends to send to user space, the number
+ * of cgroups that can be walked is limited. For example, given the current
+ * buffer size is 8 * PAGE_SIZE, if the program sends 64B data for each
+ * cgroup, assuming PAGE_SIZE is 4kb, the total number of cgroups that can
+ * be walked is 512. This is a limitation of cgroup_iter. If the output data
+ * is larger than the kernel buffer size, after all data in the kernel buffer
+ * is consumed by user space, the subsequent read() syscall will signal
+ * EOPNOTSUPP. In order to work around, the user may have to update their
+ * program to reduce the volume of data sent to output. For example, skip
+ * some uninteresting cgroups.
+ */
+
+struct bpf_iter__cgroup {
+ __bpf_md_ptr(struct bpf_iter_meta *, meta);
+ __bpf_md_ptr(struct cgroup *, cgroup);
+};
+
+struct cgroup_iter_priv {
+ struct cgroup_subsys_state *start_css;
+ bool visited_all;
+ bool terminate;
+ int order;
+};
+
+static void *cgroup_iter_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct cgroup_iter_priv *p = seq->private;
+
+ cgroup_lock();
+
+ /* cgroup_iter doesn't support read across multiple sessions. */
+ if (*pos > 0) {
+ if (p->visited_all)
+ return NULL;
+
+ /* Haven't visited all, but because cgroup_mutex has dropped,
+ * return -EOPNOTSUPP to indicate incomplete iteration.
+ */
+ return ERR_PTR(-EOPNOTSUPP);
+ }
+
+ ++*pos;
+ p->terminate = false;
+ p->visited_all = false;
+ if (p->order == BPF_CGROUP_ITER_DESCENDANTS_PRE)
+ return css_next_descendant_pre(NULL, p->start_css);
+ else if (p->order == BPF_CGROUP_ITER_DESCENDANTS_POST)
+ return css_next_descendant_post(NULL, p->start_css);
+ else /* BPF_CGROUP_ITER_SELF_ONLY and BPF_CGROUP_ITER_ANCESTORS_UP */
+ return p->start_css;
+}
+
+static int __cgroup_iter_seq_show(struct seq_file *seq,
+ struct cgroup_subsys_state *css, int in_stop);
+
+static void cgroup_iter_seq_stop(struct seq_file *seq, void *v)
+{
+ struct cgroup_iter_priv *p = seq->private;
+
+ cgroup_unlock();
+
+ /* pass NULL to the prog for post-processing */
+ if (!v) {
+ __cgroup_iter_seq_show(seq, NULL, true);
+ p->visited_all = true;
+ }
+}
+
+static void *cgroup_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct cgroup_subsys_state *curr = (struct cgroup_subsys_state *)v;
+ struct cgroup_iter_priv *p = seq->private;
+
+ ++*pos;
+ if (p->terminate)
+ return NULL;
+
+ if (p->order == BPF_CGROUP_ITER_DESCENDANTS_PRE)
+ return css_next_descendant_pre(curr, p->start_css);
+ else if (p->order == BPF_CGROUP_ITER_DESCENDANTS_POST)
+ return css_next_descendant_post(curr, p->start_css);
+ else if (p->order == BPF_CGROUP_ITER_ANCESTORS_UP)
+ return curr->parent;
+ else /* BPF_CGROUP_ITER_SELF_ONLY */
+ return NULL;
+}
+
+static int __cgroup_iter_seq_show(struct seq_file *seq,
+ struct cgroup_subsys_state *css, int in_stop)
+{
+ struct cgroup_iter_priv *p = seq->private;
+ struct bpf_iter__cgroup ctx;
+ struct bpf_iter_meta meta;
+ struct bpf_prog *prog;
+ int ret = 0;
+
+ /* cgroup is dead, skip this element */
+ if (css && cgroup_is_dead(css->cgroup))
+ return 0;
+
+ ctx.meta = &meta;
+ ctx.cgroup = css ? css->cgroup : NULL;
+ meta.seq = seq;
+ prog = bpf_iter_get_info(&meta, in_stop);
+ if (prog)
+ ret = bpf_iter_run_prog(prog, &ctx);
+
+ /* if prog returns > 0, terminate after this element. */
+ if (ret != 0)
+ p->terminate = true;
+
+ return 0;
+}
+
+static int cgroup_iter_seq_show(struct seq_file *seq, void *v)
+{
+ return __cgroup_iter_seq_show(seq, (struct cgroup_subsys_state *)v,
+ false);
+}
+
+static const struct seq_operations cgroup_iter_seq_ops = {
+ .start = cgroup_iter_seq_start,
+ .next = cgroup_iter_seq_next,
+ .stop = cgroup_iter_seq_stop,
+ .show = cgroup_iter_seq_show,
+};
+
+BTF_ID_LIST_GLOBAL_SINGLE(bpf_cgroup_btf_id, struct, cgroup)
+
+static int cgroup_iter_seq_init(void *priv, struct bpf_iter_aux_info *aux)
+{
+ struct cgroup_iter_priv *p = (struct cgroup_iter_priv *)priv;
+ struct cgroup *cgrp = aux->cgroup.start;
+
+ /* bpf_iter_attach_cgroup() has already acquired an extra reference
+ * for the start cgroup, but the reference may be released after
+ * cgroup_iter_seq_init(), so acquire another reference for the
+ * start cgroup.
+ */
+ p->start_css = &cgrp->self;
+ css_get(p->start_css);
+ p->terminate = false;
+ p->visited_all = false;
+ p->order = aux->cgroup.order;
+ return 0;
+}
+
+static void cgroup_iter_seq_fini(void *priv)
+{
+ struct cgroup_iter_priv *p = (struct cgroup_iter_priv *)priv;
+
+ css_put(p->start_css);
+}
+
+static const struct bpf_iter_seq_info cgroup_iter_seq_info = {
+ .seq_ops = &cgroup_iter_seq_ops,
+ .init_seq_private = cgroup_iter_seq_init,
+ .fini_seq_private = cgroup_iter_seq_fini,
+ .seq_priv_size = sizeof(struct cgroup_iter_priv),
+};
+
+static int bpf_iter_attach_cgroup(struct bpf_prog *prog,
+ union bpf_iter_link_info *linfo,
+ struct bpf_iter_aux_info *aux)
+{
+ int fd = linfo->cgroup.cgroup_fd;
+ u64 id = linfo->cgroup.cgroup_id;
+ int order = linfo->cgroup.order;
+ struct cgroup *cgrp;
+
+ if (order != BPF_CGROUP_ITER_DESCENDANTS_PRE &&
+ order != BPF_CGROUP_ITER_DESCENDANTS_POST &&
+ order != BPF_CGROUP_ITER_ANCESTORS_UP &&
+ order != BPF_CGROUP_ITER_SELF_ONLY)
+ return -EINVAL;
+
+ if (fd && id)
+ return -EINVAL;
+
+ if (fd)
+ cgrp = cgroup_v1v2_get_from_fd(fd);
+ else if (id)
+ cgrp = cgroup_get_from_id(id);
+ else /* walk the entire hierarchy by default. */
+ cgrp = cgroup_get_from_path("/");
+
+ if (IS_ERR(cgrp))
+ return PTR_ERR(cgrp);
+
+ aux->cgroup.start = cgrp;
+ aux->cgroup.order = order;
+ return 0;
+}
+
+static void bpf_iter_detach_cgroup(struct bpf_iter_aux_info *aux)
+{
+ cgroup_put(aux->cgroup.start);
+}
+
+static void bpf_iter_cgroup_show_fdinfo(const struct bpf_iter_aux_info *aux,
+ struct seq_file *seq)
+{
+ char *buf;
+
+ buf = kzalloc(PATH_MAX, GFP_KERNEL);
+ if (!buf) {
+ seq_puts(seq, "cgroup_path:\t<unknown>\n");
+ goto show_order;
+ }
+
+ /* If cgroup_path_ns() fails, buf will be an empty string, cgroup_path
+ * will print nothing.
+ *
+ * Path is in the calling process's cgroup namespace.
+ */
+ cgroup_path_ns(aux->cgroup.start, buf, PATH_MAX,
+ current->nsproxy->cgroup_ns);
+ seq_printf(seq, "cgroup_path:\t%s\n", buf);
+ kfree(buf);
+
+show_order:
+ if (aux->cgroup.order == BPF_CGROUP_ITER_DESCENDANTS_PRE)
+ seq_puts(seq, "order: descendants_pre\n");
+ else if (aux->cgroup.order == BPF_CGROUP_ITER_DESCENDANTS_POST)
+ seq_puts(seq, "order: descendants_post\n");
+ else if (aux->cgroup.order == BPF_CGROUP_ITER_ANCESTORS_UP)
+ seq_puts(seq, "order: ancestors_up\n");
+ else /* BPF_CGROUP_ITER_SELF_ONLY */
+ seq_puts(seq, "order: self_only\n");
+}
+
+static int bpf_iter_cgroup_fill_link_info(const struct bpf_iter_aux_info *aux,
+ struct bpf_link_info *info)
+{
+ info->iter.cgroup.order = aux->cgroup.order;
+ info->iter.cgroup.cgroup_id = cgroup_id(aux->cgroup.start);
+ return 0;
+}
+
+DEFINE_BPF_ITER_FUNC(cgroup, struct bpf_iter_meta *meta,
+ struct cgroup *cgroup)
+
+static struct bpf_iter_reg bpf_cgroup_reg_info = {
+ .target = "cgroup",
+ .feature = BPF_ITER_RESCHED,
+ .attach_target = bpf_iter_attach_cgroup,
+ .detach_target = bpf_iter_detach_cgroup,
+ .show_fdinfo = bpf_iter_cgroup_show_fdinfo,
+ .fill_link_info = bpf_iter_cgroup_fill_link_info,
+ .ctx_arg_info_size = 1,
+ .ctx_arg_info = {
+ { offsetof(struct bpf_iter__cgroup, cgroup),
+ PTR_TO_BTF_ID_OR_NULL },
+ },
+ .seq_info = &cgroup_iter_seq_info,
+};
+
+static int __init bpf_cgroup_iter_init(void)
+{
+ bpf_cgroup_reg_info.ctx_arg_info[0].btf_id = bpf_cgroup_btf_id[0];
+ return bpf_iter_reg_target(&bpf_cgroup_reg_info);
+}
+
+late_initcall(bpf_cgroup_iter_init);
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 9b1577498373..7421487422d4 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -32,6 +32,13 @@
#include <linux/perf_event.h>
#include <linux/extable.h>
#include <linux/log2.h>
+#include <linux/bpf_verifier.h>
+#include <linux/nodemask.h>
+#include <linux/nospec.h>
+#include <linux/bpf_mem_alloc.h>
+#include <linux/memcontrol.h>
+
+#include <asm/barrier.h>
#include <asm/unaligned.h>
/* Registers */
@@ -56,6 +63,9 @@
#define CTX regs[BPF_REG_CTX]
#define IMM insn->imm
+struct bpf_mem_alloc bpf_global_ma;
+bool bpf_global_ma_set;
+
/* No hurry in this branch
*
* Exported for the bpf jit load helper.
@@ -64,11 +74,13 @@ void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, uns
{
u8 *ptr = NULL;
- if (k >= SKF_NET_OFF)
+ if (k >= SKF_NET_OFF) {
ptr = skb_network_header(skb) + k - SKF_NET_OFF;
- else if (k >= SKF_LL_OFF)
+ } else if (k >= SKF_LL_OFF) {
+ if (unlikely(!skb_mac_header_was_set(skb)))
+ return NULL;
ptr = skb_mac_header(skb) + k - SKF_LL_OFF;
-
+ }
if (ptr >= skb->head && ptr + size <= skb_tail_pointer(skb))
return ptr;
@@ -77,7 +89,7 @@ void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, uns
struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flags)
{
- gfp_t gfp_flags = GFP_KERNEL_ACCOUNT | __GFP_ZERO | gfp_extra_flags;
+ gfp_t gfp_flags = bpf_memcg_flags(GFP_KERNEL | __GFP_ZERO | gfp_extra_flags);
struct bpf_prog_aux *aux;
struct bpf_prog *fp;
@@ -86,12 +98,12 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag
if (fp == NULL)
return NULL;
- aux = kzalloc(sizeof(*aux), GFP_KERNEL_ACCOUNT | gfp_extra_flags);
+ aux = kzalloc(sizeof(*aux), bpf_memcg_flags(GFP_KERNEL | gfp_extra_flags));
if (aux == NULL) {
vfree(fp);
return NULL;
}
- fp->active = alloc_percpu_gfp(int, GFP_KERNEL_ACCOUNT | gfp_extra_flags);
+ fp->active = alloc_percpu_gfp(int, bpf_memcg_flags(GFP_KERNEL | gfp_extra_flags));
if (!fp->active) {
vfree(fp);
kfree(aux);
@@ -102,6 +114,10 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag
fp->aux = aux;
fp->aux->prog = fp;
fp->jit_requested = ebpf_jit_enabled();
+ fp->blinding_requested = bpf_jit_blinding_enabled(fp);
+#ifdef CONFIG_CGROUP_BPF
+ aux->cgroup_atype = CGROUP_BPF_ATTACH_TYPE_INVALID;
+#endif
INIT_LIST_HEAD_RCU(&fp->aux->ksym.lnode);
mutex_init(&fp->aux->used_maps_mutex);
@@ -112,7 +128,7 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag
struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
{
- gfp_t gfp_flags = GFP_KERNEL_ACCOUNT | __GFP_ZERO | gfp_extra_flags;
+ gfp_t gfp_flags = bpf_memcg_flags(GFP_KERNEL | __GFP_ZERO | gfp_extra_flags);
struct bpf_prog *prog;
int cpu;
@@ -145,7 +161,7 @@ int bpf_prog_alloc_jited_linfo(struct bpf_prog *prog)
prog->aux->jited_linfo = kvcalloc(prog->aux->nr_linfo,
sizeof(*prog->aux->jited_linfo),
- GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
+ bpf_memcg_flags(GFP_KERNEL | __GFP_NOWARN));
if (!prog->aux->jited_linfo)
return -ENOMEM;
@@ -171,7 +187,7 @@ void bpf_prog_jit_attempt_done(struct bpf_prog *prog)
* here is relative to the prog itself instead of the main prog.
* This array has one entry for each xlated bpf insn.
*
- * jited_off is the byte off to the last byte of the jited insn.
+ * jited_off is the byte off to the end of the jited insn.
*
* Hence, with
* insn_start:
@@ -220,7 +236,7 @@ void bpf_prog_fill_jited_linfo(struct bpf_prog *prog,
struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
gfp_t gfp_extra_flags)
{
- gfp_t gfp_flags = GFP_KERNEL_ACCOUNT | __GFP_ZERO | gfp_extra_flags;
+ gfp_t gfp_flags = bpf_memcg_flags(GFP_KERNEL | __GFP_ZERO | gfp_extra_flags);
struct bpf_prog *fp;
u32 pages;
@@ -387,6 +403,13 @@ static int bpf_adj_branches(struct bpf_prog *prog, u32 pos, s32 end_old,
i = end_new;
insn = prog->insnsi + end_old;
}
+ if (bpf_pseudo_func(insn)) {
+ ret = bpf_adj_delta_to_imm(insn, pos, end_old,
+ end_new, i, probe_pass);
+ if (ret)
+ return ret;
+ continue;
+ }
code = insn->code;
if ((BPF_CLASS(code) != BPF_JMP &&
BPF_CLASS(code) != BPF_JMP32) ||
@@ -522,17 +545,15 @@ int bpf_jit_enable __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON);
int bpf_jit_kallsyms __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON);
int bpf_jit_harden __read_mostly;
long bpf_jit_limit __read_mostly;
+long bpf_jit_limit_max __read_mostly;
static void
bpf_prog_ksym_set_addr(struct bpf_prog *prog)
{
- const struct bpf_binary_header *hdr = bpf_jit_binary_hdr(prog);
- unsigned long addr = (unsigned long)hdr;
-
WARN_ON_ONCE(!bpf_prog_ebpf_jited(prog));
prog->aux->ksym.start = (unsigned long) prog->bpf_func;
- prog->aux->ksym.end = addr + hdr->pages * PAGE_SIZE;
+ prog->aux->ksym.end = prog->aux->ksym.start + prog->jited_len;
}
static void
@@ -637,12 +658,6 @@ static bool bpf_prog_kallsyms_candidate(const struct bpf_prog *fp)
return fp->jited && !bpf_prog_was_classic(fp);
}
-static bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp)
-{
- return list_empty(&fp->aux->ksym.lnode) ||
- fp->aux->ksym.lnode.prev == LIST_POISON2;
-}
-
void bpf_prog_kallsyms_add(struct bpf_prog *fp)
{
if (!bpf_prog_kallsyms_candidate(fp) ||
@@ -797,6 +812,147 @@ int bpf_jit_add_poke_descriptor(struct bpf_prog *prog,
return slot;
}
+/*
+ * BPF program pack allocator.
+ *
+ * Most BPF programs are pretty small. Allocating a hole page for each
+ * program is sometime a waste. Many small bpf program also adds pressure
+ * to instruction TLB. To solve this issue, we introduce a BPF program pack
+ * allocator. The prog_pack allocator uses HPAGE_PMD_SIZE page (2MB on x86)
+ * to host BPF programs.
+ */
+#define BPF_PROG_CHUNK_SHIFT 6
+#define BPF_PROG_CHUNK_SIZE (1 << BPF_PROG_CHUNK_SHIFT)
+#define BPF_PROG_CHUNK_MASK (~(BPF_PROG_CHUNK_SIZE - 1))
+
+struct bpf_prog_pack {
+ struct list_head list;
+ void *ptr;
+ unsigned long bitmap[];
+};
+
+void bpf_jit_fill_hole_with_zero(void *area, unsigned int size)
+{
+ memset(area, 0, size);
+}
+
+#define BPF_PROG_SIZE_TO_NBITS(size) (round_up(size, BPF_PROG_CHUNK_SIZE) / BPF_PROG_CHUNK_SIZE)
+
+static DEFINE_MUTEX(pack_mutex);
+static LIST_HEAD(pack_list);
+
+/* PMD_SIZE is not available in some special config, e.g. ARCH=arm with
+ * CONFIG_MMU=n. Use PAGE_SIZE in these cases.
+ */
+#ifdef PMD_SIZE
+#define BPF_PROG_PACK_SIZE (PMD_SIZE * num_possible_nodes())
+#else
+#define BPF_PROG_PACK_SIZE PAGE_SIZE
+#endif
+
+#define BPF_PROG_CHUNK_COUNT (BPF_PROG_PACK_SIZE / BPF_PROG_CHUNK_SIZE)
+
+static struct bpf_prog_pack *alloc_new_pack(bpf_jit_fill_hole_t bpf_fill_ill_insns)
+{
+ struct bpf_prog_pack *pack;
+
+ pack = kzalloc(struct_size(pack, bitmap, BITS_TO_LONGS(BPF_PROG_CHUNK_COUNT)),
+ GFP_KERNEL);
+ if (!pack)
+ return NULL;
+ pack->ptr = module_alloc(BPF_PROG_PACK_SIZE);
+ if (!pack->ptr) {
+ kfree(pack);
+ return NULL;
+ }
+ bpf_fill_ill_insns(pack->ptr, BPF_PROG_PACK_SIZE);
+ bitmap_zero(pack->bitmap, BPF_PROG_PACK_SIZE / BPF_PROG_CHUNK_SIZE);
+ list_add_tail(&pack->list, &pack_list);
+
+ set_vm_flush_reset_perms(pack->ptr);
+ set_memory_rox((unsigned long)pack->ptr, BPF_PROG_PACK_SIZE / PAGE_SIZE);
+ return pack;
+}
+
+void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns)
+{
+ unsigned int nbits = BPF_PROG_SIZE_TO_NBITS(size);
+ struct bpf_prog_pack *pack;
+ unsigned long pos;
+ void *ptr = NULL;
+
+ mutex_lock(&pack_mutex);
+ if (size > BPF_PROG_PACK_SIZE) {
+ size = round_up(size, PAGE_SIZE);
+ ptr = module_alloc(size);
+ if (ptr) {
+ bpf_fill_ill_insns(ptr, size);
+ set_vm_flush_reset_perms(ptr);
+ set_memory_rox((unsigned long)ptr, size / PAGE_SIZE);
+ }
+ goto out;
+ }
+ list_for_each_entry(pack, &pack_list, list) {
+ pos = bitmap_find_next_zero_area(pack->bitmap, BPF_PROG_CHUNK_COUNT, 0,
+ nbits, 0);
+ if (pos < BPF_PROG_CHUNK_COUNT)
+ goto found_free_area;
+ }
+
+ pack = alloc_new_pack(bpf_fill_ill_insns);
+ if (!pack)
+ goto out;
+
+ pos = 0;
+
+found_free_area:
+ bitmap_set(pack->bitmap, pos, nbits);
+ ptr = (void *)(pack->ptr) + (pos << BPF_PROG_CHUNK_SHIFT);
+
+out:
+ mutex_unlock(&pack_mutex);
+ return ptr;
+}
+
+void bpf_prog_pack_free(struct bpf_binary_header *hdr)
+{
+ struct bpf_prog_pack *pack = NULL, *tmp;
+ unsigned int nbits;
+ unsigned long pos;
+
+ mutex_lock(&pack_mutex);
+ if (hdr->size > BPF_PROG_PACK_SIZE) {
+ module_memfree(hdr);
+ goto out;
+ }
+
+ list_for_each_entry(tmp, &pack_list, list) {
+ if ((void *)hdr >= tmp->ptr && (tmp->ptr + BPF_PROG_PACK_SIZE) > (void *)hdr) {
+ pack = tmp;
+ break;
+ }
+ }
+
+ if (WARN_ONCE(!pack, "bpf_prog_pack bug\n"))
+ goto out;
+
+ nbits = BPF_PROG_SIZE_TO_NBITS(hdr->size);
+ pos = ((unsigned long)hdr - (unsigned long)pack->ptr) >> BPF_PROG_CHUNK_SHIFT;
+
+ WARN_ONCE(bpf_arch_text_invalidate(hdr, hdr->size),
+ "bpf_prog_pack bug: missing bpf_arch_text_invalidate?\n");
+
+ bitmap_clear(pack->bitmap, pos, nbits);
+ if (bitmap_find_next_zero_area(pack->bitmap, BPF_PROG_CHUNK_COUNT, 0,
+ BPF_PROG_CHUNK_COUNT, 0) == 0) {
+ list_del(&pack->list);
+ module_memfree(pack->ptr);
+ kfree(pack);
+ }
+out:
+ mutex_unlock(&pack_mutex);
+}
+
static atomic_long_t bpf_jit_current;
/* Can be overridden by an arch's JIT compiler if it has a custom,
@@ -815,18 +971,18 @@ u64 __weak bpf_jit_alloc_exec_limit(void)
static int __init bpf_jit_charge_init(void)
{
/* Only used as heuristic here to derive limit. */
- bpf_jit_limit = min_t(u64, round_up(bpf_jit_alloc_exec_limit() >> 2,
+ bpf_jit_limit_max = bpf_jit_alloc_exec_limit();
+ bpf_jit_limit = min_t(u64, round_up(bpf_jit_limit_max >> 1,
PAGE_SIZE), LONG_MAX);
return 0;
}
pure_initcall(bpf_jit_charge_init);
-int bpf_jit_charge_modmem(u32 pages)
+int bpf_jit_charge_modmem(u32 size)
{
- if (atomic_long_add_return(pages, &bpf_jit_current) >
- (bpf_jit_limit >> PAGE_SHIFT)) {
- if (!capable(CAP_SYS_ADMIN)) {
- atomic_long_sub(pages, &bpf_jit_current);
+ if (atomic_long_add_return(size, &bpf_jit_current) > READ_ONCE(bpf_jit_limit)) {
+ if (!bpf_capable()) {
+ atomic_long_sub(size, &bpf_jit_current);
return -EPERM;
}
}
@@ -834,9 +990,9 @@ int bpf_jit_charge_modmem(u32 pages)
return 0;
}
-void bpf_jit_uncharge_modmem(u32 pages)
+void bpf_jit_uncharge_modmem(u32 size)
{
- atomic_long_sub(pages, &bpf_jit_current);
+ atomic_long_sub(size, &bpf_jit_current);
}
void *__weak bpf_jit_alloc_exec(unsigned long size)
@@ -855,7 +1011,7 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
bpf_jit_fill_hole_t bpf_fill_ill_insns)
{
struct bpf_binary_header *hdr;
- u32 size, hole, start, pages;
+ u32 size, hole, start;
WARN_ON_ONCE(!is_power_of_2(alignment) ||
alignment > BPF_IMAGE_ALIGNMENT);
@@ -865,23 +1021,22 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
* random section of illegal instructions.
*/
size = round_up(proglen + sizeof(*hdr) + 128, PAGE_SIZE);
- pages = size / PAGE_SIZE;
- if (bpf_jit_charge_modmem(pages))
+ if (bpf_jit_charge_modmem(size))
return NULL;
hdr = bpf_jit_alloc_exec(size);
if (!hdr) {
- bpf_jit_uncharge_modmem(pages);
+ bpf_jit_uncharge_modmem(size);
return NULL;
}
/* Fill space with illegal/arch-dep instructions. */
bpf_fill_ill_insns(hdr, size);
- hdr->pages = pages;
+ hdr->size = size;
hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)),
PAGE_SIZE - sizeof(*hdr));
- start = (get_random_int() % hole) & ~(alignment - 1);
+ start = get_random_u32_below(hole) & ~(alignment - 1);
/* Leave a random number of instructions before BPF code. */
*image_ptr = &hdr->image[start];
@@ -891,10 +1046,122 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
void bpf_jit_binary_free(struct bpf_binary_header *hdr)
{
- u32 pages = hdr->pages;
+ u32 size = hdr->size;
bpf_jit_free_exec(hdr);
- bpf_jit_uncharge_modmem(pages);
+ bpf_jit_uncharge_modmem(size);
+}
+
+/* Allocate jit binary from bpf_prog_pack allocator.
+ * Since the allocated memory is RO+X, the JIT engine cannot write directly
+ * to the memory. To solve this problem, a RW buffer is also allocated at
+ * as the same time. The JIT engine should calculate offsets based on the
+ * RO memory address, but write JITed program to the RW buffer. Once the
+ * JIT engine finishes, it calls bpf_jit_binary_pack_finalize, which copies
+ * the JITed program to the RO memory.
+ */
+struct bpf_binary_header *
+bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **image_ptr,
+ unsigned int alignment,
+ struct bpf_binary_header **rw_header,
+ u8 **rw_image,
+ bpf_jit_fill_hole_t bpf_fill_ill_insns)
+{
+ struct bpf_binary_header *ro_header;
+ u32 size, hole, start;
+
+ WARN_ON_ONCE(!is_power_of_2(alignment) ||
+ alignment > BPF_IMAGE_ALIGNMENT);
+
+ /* add 16 bytes for a random section of illegal instructions */
+ size = round_up(proglen + sizeof(*ro_header) + 16, BPF_PROG_CHUNK_SIZE);
+
+ if (bpf_jit_charge_modmem(size))
+ return NULL;
+ ro_header = bpf_prog_pack_alloc(size, bpf_fill_ill_insns);
+ if (!ro_header) {
+ bpf_jit_uncharge_modmem(size);
+ return NULL;
+ }
+
+ *rw_header = kvmalloc(size, GFP_KERNEL);
+ if (!*rw_header) {
+ bpf_arch_text_copy(&ro_header->size, &size, sizeof(size));
+ bpf_prog_pack_free(ro_header);
+ bpf_jit_uncharge_modmem(size);
+ return NULL;
+ }
+
+ /* Fill space with illegal/arch-dep instructions. */
+ bpf_fill_ill_insns(*rw_header, size);
+ (*rw_header)->size = size;
+
+ hole = min_t(unsigned int, size - (proglen + sizeof(*ro_header)),
+ BPF_PROG_CHUNK_SIZE - sizeof(*ro_header));
+ start = get_random_u32_below(hole) & ~(alignment - 1);
+
+ *image_ptr = &ro_header->image[start];
+ *rw_image = &(*rw_header)->image[start];
+
+ return ro_header;
+}
+
+/* Copy JITed text from rw_header to its final location, the ro_header. */
+int bpf_jit_binary_pack_finalize(struct bpf_prog *prog,
+ struct bpf_binary_header *ro_header,
+ struct bpf_binary_header *rw_header)
+{
+ void *ptr;
+
+ ptr = bpf_arch_text_copy(ro_header, rw_header, rw_header->size);
+
+ kvfree(rw_header);
+
+ if (IS_ERR(ptr)) {
+ bpf_prog_pack_free(ro_header);
+ return PTR_ERR(ptr);
+ }
+ return 0;
+}
+
+/* bpf_jit_binary_pack_free is called in two different scenarios:
+ * 1) when the program is freed after;
+ * 2) when the JIT engine fails (before bpf_jit_binary_pack_finalize).
+ * For case 2), we need to free both the RO memory and the RW buffer.
+ *
+ * bpf_jit_binary_pack_free requires proper ro_header->size. However,
+ * bpf_jit_binary_pack_alloc does not set it. Therefore, ro_header->size
+ * must be set with either bpf_jit_binary_pack_finalize (normal path) or
+ * bpf_arch_text_copy (when jit fails).
+ */
+void bpf_jit_binary_pack_free(struct bpf_binary_header *ro_header,
+ struct bpf_binary_header *rw_header)
+{
+ u32 size = ro_header->size;
+
+ bpf_prog_pack_free(ro_header);
+ kvfree(rw_header);
+ bpf_jit_uncharge_modmem(size);
+}
+
+struct bpf_binary_header *
+bpf_jit_binary_pack_hdr(const struct bpf_prog *fp)
+{
+ unsigned long real_start = (unsigned long)fp->bpf_func;
+ unsigned long addr;
+
+ addr = real_start & BPF_PROG_CHUNK_MASK;
+ return (void *)addr;
+}
+
+static inline struct bpf_binary_header *
+bpf_jit_binary_hdr(const struct bpf_prog *fp)
+{
+ unsigned long real_start = (unsigned long)fp->bpf_func;
+ unsigned long addr;
+
+ addr = real_start & PAGE_MASK;
+ return (void *)addr;
}
/* This symbol is only overridden by archs that have different
@@ -907,7 +1174,6 @@ void __weak bpf_jit_free(struct bpf_prog *fp)
struct bpf_binary_header *hdr = bpf_jit_binary_hdr(fp);
bpf_jit_binary_free(hdr);
-
WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(fp));
}
@@ -921,6 +1187,7 @@ int bpf_jit_get_func_addr(const struct bpf_prog *prog,
s16 off = insn->off;
s32 imm = insn->imm;
u8 *addr;
+ int err;
*func_addr_fixed = insn->src_reg != BPF_PSEUDO_CALL;
if (!*func_addr_fixed) {
@@ -935,6 +1202,11 @@ int bpf_jit_get_func_addr(const struct bpf_prog *prog,
addr = (u8 *)prog->aux->func[off]->bpf_func;
else
return -EINVAL;
+ } else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL &&
+ bpf_jit_supports_far_kfunc_call()) {
+ err = bpf_get_kfunc_addr(prog, insn->imm, insn->off, &addr);
+ if (err)
+ return err;
} else {
/* Address of a BPF helper call. Since part of the core
* kernel, it's always at a fixed location. __bpf_call_base
@@ -954,7 +1226,7 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from,
bool emit_zext)
{
struct bpf_insn *to = to_buff;
- u32 imm_rnd = get_random_int();
+ u32 imm_rnd = get_random_u32();
s16 off;
BUILD_BUG_ON(BPF_REG_AX + 1 != MAX_BPF_JIT_REG);
@@ -1134,7 +1406,7 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog)
struct bpf_insn *insn;
int i, rewritten;
- if (!bpf_jit_blinding_enabled(prog) || prog->blinded)
+ if (!prog->blinding_requested || prog->blinded)
return prog;
clone = bpf_prog_clone_create(prog, GFP_USER);
@@ -1145,6 +1417,16 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog)
insn = clone->insnsi;
for (i = 0; i < insn_cnt; i++, insn++) {
+ if (bpf_pseudo_func(insn)) {
+ /* ld_imm64 with an address of bpf subprog is not
+ * a user controlled constant. Don't randomize it,
+ * since it will conflict with jit_subprogs() logic.
+ */
+ insn++;
+ i++;
+ continue;
+ }
+
/* We temporarily need to hold the original ld64 insn
* so that we can still access the first part in the
* second blinding run.
@@ -1360,11 +1642,13 @@ u64 __weak bpf_probe_read_kernel(void *dst, u32 size, const void *unsafe_ptr)
}
/**
- * __bpf_prog_run - run eBPF program on a given context
+ * ___bpf_prog_run - run eBPF program on a given context
* @regs: is the array of MAX_BPF_EXT_REG eBPF pseudo-registers
* @insn: is the array of eBPF instructions
*
* Decode and execute eBPF instructions.
+ *
+ * Return: whatever value is in %BPF_R0 at program exit
*/
static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn)
{
@@ -1377,6 +1661,7 @@ static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn)
/* Non-UAPI available opcodes. */
[BPF_JMP | BPF_CALL_ARGS] = &&JMP_CALL_ARGS,
[BPF_JMP | BPF_TAIL_CALL] = &&JMP_TAIL_CALL,
+ [BPF_ST | BPF_NOSPEC] = &&ST_NOSPEC,
[BPF_LDX | BPF_PROBE_MEM | BPF_B] = &&LDX_PROBE_MEM_B,
[BPF_LDX | BPF_PROBE_MEM | BPF_H] = &&LDX_PROBE_MEM_H,
[BPF_LDX | BPF_PROBE_MEM | BPF_W] = &&LDX_PROBE_MEM_W,
@@ -1559,7 +1844,8 @@ select_insn:
if (unlikely(index >= array->map.max_entries))
goto out;
- if (unlikely(tail_call_cnt > MAX_TAIL_CALL_CNT))
+
+ if (unlikely(tail_call_cnt >= MAX_TAIL_CALL_CNT))
goto out;
tail_call_cnt++;
@@ -1621,7 +1907,19 @@ out:
COND_JMP(s, JSGE, >=)
COND_JMP(s, JSLE, <=)
#undef COND_JMP
- /* STX and ST and LDX*/
+ /* ST, STX and LDX*/
+ ST_NOSPEC:
+ /* Speculation barrier for mitigating Speculative Store Bypass.
+ * In case of arm64, we rely on the firmware mitigation as
+ * controlled via the ssbd kernel parameter. Whenever the
+ * mitigation is enabled, it works for all of the kernel code
+ * with no need to provide any additional instructions here.
+ * In case of x86, we use 'lfence' insn for mitigation. We
+ * reuse preexisting logic from Spectre v1 mitigation that
+ * happens to produce the required code on x86 for v4 as well.
+ */
+ barrier_nospec();
+ CONT;
#define LDST(SIZEOP, SIZE) \
STX_MEM_##SIZEOP: \
*(SIZE *)(unsigned long) (DST + insn->off) = SRC; \
@@ -1631,6 +1929,11 @@ out:
CONT; \
LDX_MEM_##SIZEOP: \
DST = *(SIZE *)(unsigned long) (SRC + insn->off); \
+ CONT; \
+ LDX_PROBE_MEM_##SIZEOP: \
+ bpf_probe_read_kernel(&DST, sizeof(SIZE), \
+ (const void *)(long) (SRC + insn->off)); \
+ DST = *((SIZE *)&DST); \
CONT;
LDST(B, u8)
@@ -1638,15 +1941,6 @@ out:
LDST(W, u32)
LDST(DW, u64)
#undef LDST
-#define LDX_PROBE(SIZEOP, SIZE) \
- LDX_PROBE_MEM_##SIZEOP: \
- bpf_probe_read_kernel(&DST, SIZE, (const void *)(long) (SRC + insn->off)); \
- CONT;
- LDX_PROBE(B, 1)
- LDX_PROBE(H, 2)
- LDX_PROBE(W, 4)
- LDX_PROBE(DW, 8)
-#undef LDX_PROBE
#define ATOMIC_ALU_OP(BOP, KOP) \
case BOP: \
@@ -1721,7 +2015,7 @@ out:
static unsigned int PROG_NAME(stack_size)(const void *ctx, const struct bpf_insn *insn) \
{ \
u64 stack[stack_size / sizeof(u64)]; \
- u64 regs[MAX_BPF_EXT_REG]; \
+ u64 regs[MAX_BPF_EXT_REG] = {}; \
\
FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \
ARG1 = (u64) (unsigned long) ctx; \
@@ -1799,23 +2093,40 @@ static unsigned int __bpf_prog_ret0_warn(const void *ctx,
}
#endif
-bool bpf_prog_array_compatible(struct bpf_array *array,
- const struct bpf_prog *fp)
+bool bpf_prog_map_compatible(struct bpf_map *map,
+ const struct bpf_prog *fp)
{
+ enum bpf_prog_type prog_type = resolve_prog_type(fp);
+ bool ret;
+
if (fp->kprobe_override)
return false;
- if (!array->aux->type) {
+ /* XDP programs inserted into maps are not guaranteed to run on
+ * a particular netdev (and can run outside driver context entirely
+ * in the case of devmap and cpumap). Until device checks
+ * are implemented, prohibit adding dev-bound programs to program maps.
+ */
+ if (bpf_prog_is_dev_bound(fp->aux))
+ return false;
+
+ spin_lock(&map->owner.lock);
+ if (!map->owner.type) {
/* There's no owner yet where we could check for
* compatibility.
*/
- array->aux->type = fp->type;
- array->aux->jited = fp->jited;
- return true;
+ map->owner.type = prog_type;
+ map->owner.jited = fp->jited;
+ map->owner.xdp_has_frags = fp->aux->xdp_has_frags;
+ ret = true;
+ } else {
+ ret = map->owner.type == prog_type &&
+ map->owner.jited == fp->jited &&
+ map->owner.xdp_has_frags == fp->aux->xdp_has_frags;
}
+ spin_unlock(&map->owner.lock);
- return array->aux->type == fp->type &&
- array->aux->jited == fp->jited;
+ return ret;
}
static int bpf_check_tail_call(const struct bpf_prog *fp)
@@ -1826,13 +2137,11 @@ static int bpf_check_tail_call(const struct bpf_prog *fp)
mutex_lock(&aux->used_maps_mutex);
for (i = 0; i < aux->used_map_cnt; i++) {
struct bpf_map *map = aux->used_maps[i];
- struct bpf_array *array;
- if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY)
+ if (!map_type_contains_progs(map))
continue;
- array = container_of(map, struct bpf_array, map);
- if (!bpf_prog_array_compatible(array, fp)) {
+ if (!bpf_prog_map_compatible(map, fp)) {
ret = -EINVAL;
goto out;
}
@@ -1856,11 +2165,14 @@ static void bpf_prog_select_func(struct bpf_prog *fp)
/**
* bpf_prog_select_runtime - select exec runtime for BPF program
- * @fp: bpf_prog populated with internal BPF program
+ * @fp: bpf_prog populated with BPF program
* @err: pointer to error variable
*
* Try to JIT eBPF program, if JIT is not available, use interpreter.
- * The BPF program will be executed via BPF_PROG_RUN() macro.
+ * The BPF program will be executed via bpf_prog_run() function.
+ *
+ * Return: the &fp argument along with &err set to 0 for success or
+ * a negative errno code on failure
*/
struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
{
@@ -1884,7 +2196,7 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
* valid program, which in this case would simply not
* be JITed, but falls back to the interpreter.
*/
- if (!bpf_prog_is_dev_bound(fp->aux)) {
+ if (!bpf_prog_is_offloaded(fp->aux)) {
*err = bpf_prog_alloc_jited_linfo(fp);
if (*err)
return fp;
@@ -1929,18 +2241,10 @@ static struct bpf_prog_dummy {
},
};
-/* to avoid allocating empty bpf_prog_array for cgroups that
- * don't have bpf program attached use one global 'empty_prog_array'
- * It will not be modified the caller of bpf_prog_array_alloc()
- * (since caller requested prog_cnt == 0)
- * that pointer should be 'freed' by bpf_prog_array_free()
- */
-static struct {
- struct bpf_prog_array hdr;
- struct bpf_prog *null_prog;
-} empty_prog_array = {
+struct bpf_empty_prog_array bpf_empty_prog_array = {
.null_prog = NULL,
};
+EXPORT_SYMBOL(bpf_empty_prog_array);
struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags)
{
@@ -1950,16 +2254,37 @@ struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags)
(prog_cnt + 1),
flags);
- return &empty_prog_array.hdr;
+ return &bpf_empty_prog_array.hdr;
}
void bpf_prog_array_free(struct bpf_prog_array *progs)
{
- if (!progs || progs == &empty_prog_array.hdr)
+ if (!progs || progs == &bpf_empty_prog_array.hdr)
return;
kfree_rcu(progs, rcu);
}
+static void __bpf_prog_array_free_sleepable_cb(struct rcu_head *rcu)
+{
+ struct bpf_prog_array *progs;
+
+ /* If RCU Tasks Trace grace period implies RCU grace period, there is
+ * no need to call kfree_rcu(), just call kfree() directly.
+ */
+ progs = container_of(rcu, struct bpf_prog_array, rcu);
+ if (rcu_trace_implies_rcu_gp())
+ kfree(progs);
+ else
+ kfree_rcu(progs, rcu);
+}
+
+void bpf_prog_array_free_sleepable(struct bpf_prog_array *progs)
+{
+ if (!progs || progs == &bpf_empty_prog_array.hdr)
+ return;
+ call_rcu_tasks_trace(&progs->rcu, __bpf_prog_array_free_sleepable_cb);
+}
+
int bpf_prog_array_length(struct bpf_prog_array *array)
{
struct bpf_prog_array_item *item;
@@ -2097,13 +2422,13 @@ int bpf_prog_array_update_at(struct bpf_prog_array *array, int index,
int bpf_prog_array_copy(struct bpf_prog_array *old_array,
struct bpf_prog *exclude_prog,
struct bpf_prog *include_prog,
+ u64 bpf_cookie,
struct bpf_prog_array **new_array)
{
int new_prog_cnt, carry_prog_cnt = 0;
- struct bpf_prog_array_item *existing;
+ struct bpf_prog_array_item *existing, *new;
struct bpf_prog_array *array;
bool found_exclude = false;
- int new_prog_idx = 0;
/* Figure out how many existing progs we need to carry over to
* the new array.
@@ -2140,20 +2465,27 @@ int bpf_prog_array_copy(struct bpf_prog_array *old_array,
array = bpf_prog_array_alloc(new_prog_cnt + 1, GFP_KERNEL);
if (!array)
return -ENOMEM;
+ new = array->items;
/* Fill in the new prog array */
if (carry_prog_cnt) {
existing = old_array->items;
- for (; existing->prog; existing++)
- if (existing->prog != exclude_prog &&
- existing->prog != &dummy_bpf_prog.prog) {
- array->items[new_prog_idx++].prog =
- existing->prog;
- }
+ for (; existing->prog; existing++) {
+ if (existing->prog == exclude_prog ||
+ existing->prog == &dummy_bpf_prog.prog)
+ continue;
+
+ new->prog = existing->prog;
+ new->bpf_cookie = existing->bpf_cookie;
+ new++;
+ }
}
- if (include_prog)
- array->items[new_prog_idx++].prog = include_prog;
- array->items[new_prog_idx].prog = NULL;
+ if (include_prog) {
+ new->prog = include_prog;
+ new->bpf_cookie = bpf_cookie;
+ new++;
+ }
+ new->prog = NULL;
*new_array = array;
return 0;
}
@@ -2226,10 +2558,17 @@ static void bpf_prog_free_deferred(struct work_struct *work)
int i;
aux = container_of(work, struct bpf_prog_aux, work);
+#ifdef CONFIG_BPF_SYSCALL
+ bpf_free_kfunc_btf_tab(aux->kfunc_btf_tab);
+#endif
+#ifdef CONFIG_CGROUP_BPF
+ if (aux->cgroup_atype != CGROUP_BPF_ATTACH_TYPE_INVALID)
+ bpf_cgroup_atype_put(aux->cgroup_atype);
+#endif
bpf_free_used_maps(aux);
bpf_free_used_btfs(aux);
if (bpf_prog_is_dev_bound(aux))
- bpf_prog_offload_destroy(aux->prog);
+ bpf_prog_dev_bound_destroy(aux->prog);
#ifdef CONFIG_PERF_EVENTS
if (aux->prog->has_callchain_buf)
put_callchain_buffers();
@@ -2252,7 +2591,6 @@ static void bpf_prog_free_deferred(struct work_struct *work)
}
}
-/* Free internal BPF program */
void bpf_prog_free(struct bpf_prog *fp)
{
struct bpf_prog_aux *aux = fp->aux;
@@ -2302,6 +2640,7 @@ const struct bpf_func_proto bpf_map_delete_elem_proto __weak;
const struct bpf_func_proto bpf_map_push_elem_proto __weak;
const struct bpf_func_proto bpf_map_pop_elem_proto __weak;
const struct bpf_func_proto bpf_map_peek_elem_proto __weak;
+const struct bpf_func_proto bpf_map_lookup_percpu_elem_proto __weak;
const struct bpf_func_proto bpf_spin_lock_proto __weak;
const struct bpf_func_proto bpf_spin_unlock_proto __weak;
const struct bpf_func_proto bpf_jiffies64_proto __weak;
@@ -2312,6 +2651,7 @@ const struct bpf_func_proto bpf_get_numa_node_id_proto __weak;
const struct bpf_func_proto bpf_ktime_get_ns_proto __weak;
const struct bpf_func_proto bpf_ktime_get_boot_ns_proto __weak;
const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto __weak;
+const struct bpf_func_proto bpf_ktime_get_tai_ns_proto __weak;
const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak;
const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak;
@@ -2322,12 +2662,19 @@ const struct bpf_func_proto bpf_get_local_storage_proto __weak;
const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto __weak;
const struct bpf_func_proto bpf_snprintf_btf_proto __weak;
const struct bpf_func_proto bpf_seq_printf_btf_proto __weak;
+const struct bpf_func_proto bpf_set_retval_proto __weak;
+const struct bpf_func_proto bpf_get_retval_proto __weak;
const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void)
{
return NULL;
}
+const struct bpf_func_proto * __weak bpf_get_trace_vprintk_proto(void)
+{
+ return NULL;
+}
+
u64 __weak
bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy)
@@ -2380,11 +2727,22 @@ bool __weak bpf_jit_needs_zext(void)
return false;
}
+/* Return TRUE if the JIT backend supports mixing bpf2bpf and tailcalls. */
+bool __weak bpf_jit_supports_subprog_tailcalls(void)
+{
+ return false;
+}
+
bool __weak bpf_jit_supports_kfunc_call(void)
{
return false;
}
+bool __weak bpf_jit_supports_far_kfunc_call(void)
+{
+ return false;
+}
+
/* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call
* skb_copy_bits(), so provide a weak definition of it for NET-less config.
*/
@@ -2400,6 +2758,28 @@ int __weak bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
return -ENOTSUPP;
}
+void * __weak bpf_arch_text_copy(void *dst, void *src, size_t len)
+{
+ return ERR_PTR(-ENOTSUPP);
+}
+
+int __weak bpf_arch_text_invalidate(void *dst, size_t len)
+{
+ return -ENOTSUPP;
+}
+
+#ifdef CONFIG_BPF_SYSCALL
+static int __init bpf_global_ma_init(void)
+{
+ int ret;
+
+ ret = bpf_mem_alloc_init(&bpf_global_ma, 0, false);
+ bpf_global_ma_set = !ret;
+ return ret;
+}
+late_initcall(bpf_global_ma_init);
+#endif
+
DEFINE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
EXPORT_SYMBOL(bpf_stats_enabled_key);
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 480e936c54d0..8ec18faa74ac 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -4,18 +4,22 @@
* Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc.
*/
-/* The 'cpumap' is primarily used as a backend map for XDP BPF helper
+/**
+ * DOC: cpu map
+ * The 'cpumap' is primarily used as a backend map for XDP BPF helper
* call bpf_redirect_map() and XDP_REDIRECT action, like 'devmap'.
*
- * Unlike devmap which redirects XDP frames out another NIC device,
+ * Unlike devmap which redirects XDP frames out to another NIC device,
* this map type redirects raw XDP frames to another CPU. The remote
* CPU will do SKB-allocation and call the normal network stack.
- *
+ */
+/*
* This is a scalability and isolation mechanism, that allow
* separating the early driver network XDP layer, from the rest of the
* netstack, and assigning dedicated CPUs for this stage. This
* basically allows for 10G wirespeed pre-filtering via bpf.
*/
+#include <linux/bitops.h>
#include <linux/bpf.h>
#include <linux/filter.h>
#include <linux/ptr_ring.h>
@@ -26,6 +30,7 @@
#include <linux/kthread.h>
#include <linux/capability.h>
#include <trace/events/xdp.h>
+#include <linux/btf_ids.h>
#include <linux/netdevice.h> /* netif_receive_skb_list */
#include <linux/etherdevice.h> /* eth_type_trans */
@@ -83,7 +88,6 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
{
u32 value_size = attr->value_size;
struct bpf_cpu_map *cmap;
- int err = -ENOMEM;
if (!bpf_capable())
return ERR_PTR(-EPERM);
@@ -95,29 +99,26 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
attr->map_flags & ~BPF_F_NUMA_NODE)
return ERR_PTR(-EINVAL);
- cmap = kzalloc(sizeof(*cmap), GFP_USER | __GFP_ACCOUNT);
+ /* Pre-limit array size based on NR_CPUS, not final CPU check */
+ if (attr->max_entries > NR_CPUS)
+ return ERR_PTR(-E2BIG);
+
+ cmap = bpf_map_area_alloc(sizeof(*cmap), NUMA_NO_NODE);
if (!cmap)
return ERR_PTR(-ENOMEM);
bpf_map_init_from_attr(&cmap->map, attr);
- /* Pre-limit array size based on NR_CPUS, not final CPU check */
- if (cmap->map.max_entries > NR_CPUS) {
- err = -E2BIG;
- goto free_cmap;
- }
-
/* Alloc array for possible remote "destination" CPUs */
cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries *
sizeof(struct bpf_cpu_map_entry *),
cmap->map.numa_node);
- if (!cmap->cpu_map)
- goto free_cmap;
+ if (!cmap->cpu_map) {
+ bpf_map_area_free(cmap);
+ return ERR_PTR(-ENOMEM);
+ }
return &cmap->map;
-free_cmap:
- kfree(cmap);
- return ERR_PTR(err);
}
static void get_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
@@ -168,6 +169,46 @@ static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
}
}
+static void cpu_map_bpf_prog_run_skb(struct bpf_cpu_map_entry *rcpu,
+ struct list_head *listp,
+ struct xdp_cpumap_stats *stats)
+{
+ struct sk_buff *skb, *tmp;
+ struct xdp_buff xdp;
+ u32 act;
+ int err;
+
+ list_for_each_entry_safe(skb, tmp, listp, list) {
+ act = bpf_prog_run_generic_xdp(skb, &xdp, rcpu->prog);
+ switch (act) {
+ case XDP_PASS:
+ break;
+ case XDP_REDIRECT:
+ skb_list_del_init(skb);
+ err = xdp_do_generic_redirect(skb->dev, skb, &xdp,
+ rcpu->prog);
+ if (unlikely(err)) {
+ kfree_skb(skb);
+ stats->drop++;
+ } else {
+ stats->redirect++;
+ }
+ return;
+ default:
+ bpf_warn_invalid_xdp_action(NULL, rcpu->prog, act);
+ fallthrough;
+ case XDP_ABORTED:
+ trace_xdp_exception(skb->dev, rcpu->prog, act);
+ fallthrough;
+ case XDP_DROP:
+ skb_list_del_init(skb);
+ kfree_skb(skb);
+ stats->drop++;
+ return;
+ }
+ }
+}
+
static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu,
void **frames, int n,
struct xdp_cpumap_stats *stats)
@@ -176,11 +217,6 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu,
struct xdp_buff xdp;
int i, nframes = 0;
- if (!rcpu->prog)
- return n;
-
- rcu_read_lock_bh();
-
xdp_set_return_frame_no_direct();
xdp.rxq = &rxq;
@@ -218,7 +254,7 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu,
}
break;
default:
- bpf_warn_invalid_xdp_action(act);
+ bpf_warn_invalid_xdp_action(NULL, rcpu->prog, act);
fallthrough;
case XDP_DROP:
xdp_return_frame(xdpf);
@@ -227,17 +263,37 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu,
}
}
+ xdp_clear_return_frame_no_direct();
+
+ return nframes;
+}
+
+#define CPUMAP_BATCH 8
+
+static int cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames,
+ int xdp_n, struct xdp_cpumap_stats *stats,
+ struct list_head *list)
+{
+ int nframes;
+
+ if (!rcpu->prog)
+ return xdp_n;
+
+ rcu_read_lock_bh();
+
+ nframes = cpu_map_bpf_prog_run_xdp(rcpu, frames, xdp_n, stats);
+
if (stats->redirect)
- xdp_do_flush_map();
+ xdp_do_flush();
- xdp_clear_return_frame_no_direct();
+ if (unlikely(!list_empty(list)))
+ cpu_map_bpf_prog_run_skb(rcpu, list, stats);
rcu_read_unlock_bh(); /* resched point, may call do_softirq() */
return nframes;
}
-#define CPUMAP_BATCH 8
static int cpu_map_kthread_run(void *data)
{
@@ -254,9 +310,9 @@ static int cpu_map_kthread_run(void *data)
struct xdp_cpumap_stats stats = {}; /* zero stats */
unsigned int kmem_alloc_drops = 0, sched = 0;
gfp_t gfp = __GFP_ZERO | GFP_ATOMIC;
+ int i, n, m, nframes, xdp_n;
void *frames[CPUMAP_BATCH];
void *skbs[CPUMAP_BATCH];
- int i, n, m, nframes;
LIST_HEAD(list);
/* Release CPU reschedule checks */
@@ -280,9 +336,20 @@ static int cpu_map_kthread_run(void *data)
*/
n = __ptr_ring_consume_batched(rcpu->queue, frames,
CPUMAP_BATCH);
- for (i = 0; i < n; i++) {
+ for (i = 0, xdp_n = 0; i < n; i++) {
void *f = frames[i];
- struct page *page = virt_to_page(f);
+ struct page *page;
+
+ if (unlikely(__ptr_test_bit(0, &f))) {
+ struct sk_buff *skb = f;
+
+ __ptr_clear_bit(0, &skb);
+ list_add_tail(&skb->list, &list);
+ continue;
+ }
+
+ frames[xdp_n++] = f;
+ page = virt_to_page(f);
/* Bring struct page memory area to curr CPU. Read by
* build_skb_around via page_is_pfmemalloc(), and when
@@ -292,9 +359,9 @@ static int cpu_map_kthread_run(void *data)
}
/* Support running another XDP prog on this CPU */
- nframes = cpu_map_bpf_prog_run_xdp(rcpu, frames, n, &stats);
+ nframes = cpu_map_bpf_prog_run(rcpu, frames, xdp_n, &stats, &list);
if (nframes) {
- m = kmem_cache_alloc_bulk(skbuff_head_cache, gfp, nframes, skbs);
+ m = kmem_cache_alloc_bulk(skbuff_cache, gfp, nframes, skbs);
if (unlikely(m == 0)) {
for (i = 0; i < nframes; i++)
skbs[i] = NULL; /* effect: xdp_return_frame */
@@ -330,13 +397,8 @@ static int cpu_map_kthread_run(void *data)
return 0;
}
-bool cpu_map_prog_allowed(struct bpf_map *map)
-{
- return map->map_type == BPF_MAP_TYPE_CPUMAP &&
- map->value_size != offsetofend(struct bpf_cpumap_val, qsize);
-}
-
-static int __cpu_map_load_bpf_program(struct bpf_cpu_map_entry *rcpu, int fd)
+static int __cpu_map_load_bpf_program(struct bpf_cpu_map_entry *rcpu,
+ struct bpf_map *map, int fd)
{
struct bpf_prog *prog;
@@ -344,7 +406,8 @@ static int __cpu_map_load_bpf_program(struct bpf_cpu_map_entry *rcpu, int fd)
if (IS_ERR(prog))
return PTR_ERR(prog);
- if (prog->expected_attach_type != BPF_XDP_CPUMAP) {
+ if (prog->expected_attach_type != BPF_XDP_CPUMAP ||
+ !bpf_prog_map_compatible(map, prog)) {
bpf_prog_put(prog);
return -EINVAL;
}
@@ -396,7 +459,7 @@ __cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value,
rcpu->map_id = map->id;
rcpu->value.qsize = value->qsize;
- if (fd > 0 && __cpu_map_load_bpf_program(rcpu, fd))
+ if (fd > 0 && __cpu_map_load_bpf_program(rcpu, map, fd))
goto free_ptr_ring;
/* Setup kthread */
@@ -477,7 +540,7 @@ static void __cpu_map_entry_replace(struct bpf_cpu_map *cmap,
}
}
-static int cpu_map_delete_elem(struct bpf_map *map, void *key)
+static long cpu_map_delete_elem(struct bpf_map *map, void *key)
{
struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
u32 key_cpu = *(u32 *)key;
@@ -490,8 +553,8 @@ static int cpu_map_delete_elem(struct bpf_map *map, void *key)
return 0;
}
-static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value,
- u64 map_flags)
+static long cpu_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
{
struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
struct bpf_cpumap_val cpumap_value = {};
@@ -559,7 +622,7 @@ static void cpu_map_free(struct bpf_map *map)
__cpu_map_entry_replace(cmap, i, NULL); /* call_rcu */
}
bpf_map_area_free(cmap->cpu_map);
- kfree(cmap);
+ bpf_map_area_free(cmap);
}
/* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or
@@ -604,13 +667,22 @@ static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
return 0;
}
-static int cpu_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags)
+static long cpu_map_redirect(struct bpf_map *map, u64 index, u64 flags)
{
- return __bpf_xdp_redirect_map(map, ifindex, flags, 0,
+ return __bpf_xdp_redirect_map(map, index, flags, 0,
__cpu_map_lookup_elem);
}
-static int cpu_map_btf_id;
+static u64 cpu_map_mem_usage(const struct bpf_map *map)
+{
+ u64 usage = sizeof(struct bpf_cpu_map);
+
+ /* Currently the dynamically allocated elements are not counted */
+ usage += (u64)map->max_entries * sizeof(struct bpf_cpu_map_entry *);
+ return usage;
+}
+
+BTF_ID_LIST_SINGLE(cpu_map_btf_ids, struct, bpf_cpu_map)
const struct bpf_map_ops cpu_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
.map_alloc = cpu_map_alloc,
@@ -620,8 +692,8 @@ const struct bpf_map_ops cpu_map_ops = {
.map_lookup_elem = cpu_map_lookup_elem,
.map_get_next_key = cpu_map_get_next_key,
.map_check_btf = map_check_no_btf,
- .map_btf_name = "bpf_cpu_map",
- .map_btf_id = &cpu_map_btf_id,
+ .map_mem_usage = cpu_map_mem_usage,
+ .map_btf_id = &cpu_map_btf_ids[0],
.map_redirect = cpu_map_redirect,
};
@@ -685,15 +757,9 @@ static void bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf)
list_add(&bq->flush_node, flush_list);
}
-int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
+int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf,
struct net_device *dev_rx)
{
- struct xdp_frame *xdpf;
-
- xdpf = xdp_convert_buff_to_frame(xdp);
- if (unlikely(!xdpf))
- return -EOVERFLOW;
-
/* Info needed when constructing SKB on remote CPU */
xdpf->dev_rx = dev_rx;
@@ -701,6 +767,25 @@ int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
return 0;
}
+int cpu_map_generic_redirect(struct bpf_cpu_map_entry *rcpu,
+ struct sk_buff *skb)
+{
+ int ret;
+
+ __skb_pull(skb, skb->mac_len);
+ skb_set_redirected(skb, false);
+ __ptr_set_bit(0, &skb);
+
+ ret = ptr_ring_produce(rcpu->queue, skb);
+ if (ret < 0)
+ goto trace;
+
+ wake_up_process(rcpu->kthread);
+trace:
+ trace_xdp_cpumap_enqueue(rcpu->map_id, !ret, !!ret, rcpu->cpu);
+ return ret;
+}
+
void __cpu_map_flush(void)
{
struct list_head *flush_list = this_cpu_ptr(&cpu_map_flush_list);
diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c
new file mode 100644
index 000000000000..7efdf5d770ca
--- /dev/null
+++ b/kernel/bpf/cpumask.c
@@ -0,0 +1,456 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2023 Meta, Inc */
+#include <linux/bpf.h>
+#include <linux/bpf_mem_alloc.h>
+#include <linux/btf.h>
+#include <linux/btf_ids.h>
+#include <linux/cpumask.h>
+
+/**
+ * struct bpf_cpumask - refcounted BPF cpumask wrapper structure
+ * @cpumask: The actual cpumask embedded in the struct.
+ * @rcu: The RCU head used to free the cpumask with RCU safety.
+ * @usage: Object reference counter. When the refcount goes to 0, the
+ * memory is released back to the BPF allocator, which provides
+ * RCU safety.
+ *
+ * Note that we explicitly embed a cpumask_t rather than a cpumask_var_t. This
+ * is done to avoid confusing the verifier due to the typedef of cpumask_var_t
+ * changing depending on whether CONFIG_CPUMASK_OFFSTACK is defined or not. See
+ * the details in <linux/cpumask.h>. The consequence is that this structure is
+ * likely a bit larger than it needs to be when CONFIG_CPUMASK_OFFSTACK is
+ * defined due to embedding the whole NR_CPUS-size bitmap, but the extra memory
+ * overhead is minimal. For the more typical case of CONFIG_CPUMASK_OFFSTACK
+ * not being defined, the structure is the same size regardless.
+ */
+struct bpf_cpumask {
+ cpumask_t cpumask;
+ struct rcu_head rcu;
+ refcount_t usage;
+};
+
+static struct bpf_mem_alloc bpf_cpumask_ma;
+
+static bool cpu_valid(u32 cpu)
+{
+ return cpu < nr_cpu_ids;
+}
+
+__diag_push();
+__diag_ignore_all("-Wmissing-prototypes",
+ "Global kfuncs as their definitions will be in BTF");
+
+/**
+ * bpf_cpumask_create() - Create a mutable BPF cpumask.
+ *
+ * Allocates a cpumask that can be queried, mutated, acquired, and released by
+ * a BPF program. The cpumask returned by this function must either be embedded
+ * in a map as a kptr, or freed with bpf_cpumask_release().
+ *
+ * bpf_cpumask_create() allocates memory using the BPF memory allocator, and
+ * will not block. It may return NULL if no memory is available.
+ */
+__bpf_kfunc struct bpf_cpumask *bpf_cpumask_create(void)
+{
+ struct bpf_cpumask *cpumask;
+
+ /* cpumask must be the first element so struct bpf_cpumask be cast to struct cpumask. */
+ BUILD_BUG_ON(offsetof(struct bpf_cpumask, cpumask) != 0);
+
+ cpumask = bpf_mem_cache_alloc(&bpf_cpumask_ma);
+ if (!cpumask)
+ return NULL;
+
+ memset(cpumask, 0, sizeof(*cpumask));
+ refcount_set(&cpumask->usage, 1);
+
+ return cpumask;
+}
+
+/**
+ * bpf_cpumask_acquire() - Acquire a reference to a BPF cpumask.
+ * @cpumask: The BPF cpumask being acquired. The cpumask must be a trusted
+ * pointer.
+ *
+ * Acquires a reference to a BPF cpumask. The cpumask returned by this function
+ * must either be embedded in a map as a kptr, or freed with
+ * bpf_cpumask_release().
+ */
+__bpf_kfunc struct bpf_cpumask *bpf_cpumask_acquire(struct bpf_cpumask *cpumask)
+{
+ refcount_inc(&cpumask->usage);
+ return cpumask;
+}
+
+static void cpumask_free_cb(struct rcu_head *head)
+{
+ struct bpf_cpumask *cpumask;
+
+ cpumask = container_of(head, struct bpf_cpumask, rcu);
+ migrate_disable();
+ bpf_mem_cache_free(&bpf_cpumask_ma, cpumask);
+ migrate_enable();
+}
+
+/**
+ * bpf_cpumask_release() - Release a previously acquired BPF cpumask.
+ * @cpumask: The cpumask being released.
+ *
+ * Releases a previously acquired reference to a BPF cpumask. When the final
+ * reference of the BPF cpumask has been released, it is subsequently freed in
+ * an RCU callback in the BPF memory allocator.
+ */
+__bpf_kfunc void bpf_cpumask_release(struct bpf_cpumask *cpumask)
+{
+ if (refcount_dec_and_test(&cpumask->usage))
+ call_rcu(&cpumask->rcu, cpumask_free_cb);
+}
+
+/**
+ * bpf_cpumask_first() - Get the index of the first nonzero bit in the cpumask.
+ * @cpumask: The cpumask being queried.
+ *
+ * Find the index of the first nonzero bit of the cpumask. A struct bpf_cpumask
+ * pointer may be safely passed to this function.
+ */
+__bpf_kfunc u32 bpf_cpumask_first(const struct cpumask *cpumask)
+{
+ return cpumask_first(cpumask);
+}
+
+/**
+ * bpf_cpumask_first_zero() - Get the index of the first unset bit in the
+ * cpumask.
+ * @cpumask: The cpumask being queried.
+ *
+ * Find the index of the first unset bit of the cpumask. A struct bpf_cpumask
+ * pointer may be safely passed to this function.
+ */
+__bpf_kfunc u32 bpf_cpumask_first_zero(const struct cpumask *cpumask)
+{
+ return cpumask_first_zero(cpumask);
+}
+
+/**
+ * bpf_cpumask_set_cpu() - Set a bit for a CPU in a BPF cpumask.
+ * @cpu: The CPU to be set in the cpumask.
+ * @cpumask: The BPF cpumask in which a bit is being set.
+ */
+__bpf_kfunc void bpf_cpumask_set_cpu(u32 cpu, struct bpf_cpumask *cpumask)
+{
+ if (!cpu_valid(cpu))
+ return;
+
+ cpumask_set_cpu(cpu, (struct cpumask *)cpumask);
+}
+
+/**
+ * bpf_cpumask_clear_cpu() - Clear a bit for a CPU in a BPF cpumask.
+ * @cpu: The CPU to be cleared from the cpumask.
+ * @cpumask: The BPF cpumask in which a bit is being cleared.
+ */
+__bpf_kfunc void bpf_cpumask_clear_cpu(u32 cpu, struct bpf_cpumask *cpumask)
+{
+ if (!cpu_valid(cpu))
+ return;
+
+ cpumask_clear_cpu(cpu, (struct cpumask *)cpumask);
+}
+
+/**
+ * bpf_cpumask_test_cpu() - Test whether a CPU is set in a cpumask.
+ * @cpu: The CPU being queried for.
+ * @cpumask: The cpumask being queried for containing a CPU.
+ *
+ * Return:
+ * * true - @cpu is set in the cpumask
+ * * false - @cpu was not set in the cpumask, or @cpu is an invalid cpu.
+ */
+__bpf_kfunc bool bpf_cpumask_test_cpu(u32 cpu, const struct cpumask *cpumask)
+{
+ if (!cpu_valid(cpu))
+ return false;
+
+ return cpumask_test_cpu(cpu, (struct cpumask *)cpumask);
+}
+
+/**
+ * bpf_cpumask_test_and_set_cpu() - Atomically test and set a CPU in a BPF cpumask.
+ * @cpu: The CPU being set and queried for.
+ * @cpumask: The BPF cpumask being set and queried for containing a CPU.
+ *
+ * Return:
+ * * true - @cpu is set in the cpumask
+ * * false - @cpu was not set in the cpumask, or @cpu is invalid.
+ */
+__bpf_kfunc bool bpf_cpumask_test_and_set_cpu(u32 cpu, struct bpf_cpumask *cpumask)
+{
+ if (!cpu_valid(cpu))
+ return false;
+
+ return cpumask_test_and_set_cpu(cpu, (struct cpumask *)cpumask);
+}
+
+/**
+ * bpf_cpumask_test_and_clear_cpu() - Atomically test and clear a CPU in a BPF
+ * cpumask.
+ * @cpu: The CPU being cleared and queried for.
+ * @cpumask: The BPF cpumask being cleared and queried for containing a CPU.
+ *
+ * Return:
+ * * true - @cpu is set in the cpumask
+ * * false - @cpu was not set in the cpumask, or @cpu is invalid.
+ */
+__bpf_kfunc bool bpf_cpumask_test_and_clear_cpu(u32 cpu, struct bpf_cpumask *cpumask)
+{
+ if (!cpu_valid(cpu))
+ return false;
+
+ return cpumask_test_and_clear_cpu(cpu, (struct cpumask *)cpumask);
+}
+
+/**
+ * bpf_cpumask_setall() - Set all of the bits in a BPF cpumask.
+ * @cpumask: The BPF cpumask having all of its bits set.
+ */
+__bpf_kfunc void bpf_cpumask_setall(struct bpf_cpumask *cpumask)
+{
+ cpumask_setall((struct cpumask *)cpumask);
+}
+
+/**
+ * bpf_cpumask_clear() - Clear all of the bits in a BPF cpumask.
+ * @cpumask: The BPF cpumask being cleared.
+ */
+__bpf_kfunc void bpf_cpumask_clear(struct bpf_cpumask *cpumask)
+{
+ cpumask_clear((struct cpumask *)cpumask);
+}
+
+/**
+ * bpf_cpumask_and() - AND two cpumasks and store the result.
+ * @dst: The BPF cpumask where the result is being stored.
+ * @src1: The first input.
+ * @src2: The second input.
+ *
+ * Return:
+ * * true - @dst has at least one bit set following the operation
+ * * false - @dst is empty following the operation
+ *
+ * struct bpf_cpumask pointers may be safely passed to @src1 and @src2.
+ */
+__bpf_kfunc bool bpf_cpumask_and(struct bpf_cpumask *dst,
+ const struct cpumask *src1,
+ const struct cpumask *src2)
+{
+ return cpumask_and((struct cpumask *)dst, src1, src2);
+}
+
+/**
+ * bpf_cpumask_or() - OR two cpumasks and store the result.
+ * @dst: The BPF cpumask where the result is being stored.
+ * @src1: The first input.
+ * @src2: The second input.
+ *
+ * struct bpf_cpumask pointers may be safely passed to @src1 and @src2.
+ */
+__bpf_kfunc void bpf_cpumask_or(struct bpf_cpumask *dst,
+ const struct cpumask *src1,
+ const struct cpumask *src2)
+{
+ cpumask_or((struct cpumask *)dst, src1, src2);
+}
+
+/**
+ * bpf_cpumask_xor() - XOR two cpumasks and store the result.
+ * @dst: The BPF cpumask where the result is being stored.
+ * @src1: The first input.
+ * @src2: The second input.
+ *
+ * struct bpf_cpumask pointers may be safely passed to @src1 and @src2.
+ */
+__bpf_kfunc void bpf_cpumask_xor(struct bpf_cpumask *dst,
+ const struct cpumask *src1,
+ const struct cpumask *src2)
+{
+ cpumask_xor((struct cpumask *)dst, src1, src2);
+}
+
+/**
+ * bpf_cpumask_equal() - Check two cpumasks for equality.
+ * @src1: The first input.
+ * @src2: The second input.
+ *
+ * Return:
+ * * true - @src1 and @src2 have the same bits set.
+ * * false - @src1 and @src2 differ in at least one bit.
+ *
+ * struct bpf_cpumask pointers may be safely passed to @src1 and @src2.
+ */
+__bpf_kfunc bool bpf_cpumask_equal(const struct cpumask *src1, const struct cpumask *src2)
+{
+ return cpumask_equal(src1, src2);
+}
+
+/**
+ * bpf_cpumask_intersects() - Check two cpumasks for overlap.
+ * @src1: The first input.
+ * @src2: The second input.
+ *
+ * Return:
+ * * true - @src1 and @src2 have at least one of the same bits set.
+ * * false - @src1 and @src2 don't have any of the same bits set.
+ *
+ * struct bpf_cpumask pointers may be safely passed to @src1 and @src2.
+ */
+__bpf_kfunc bool bpf_cpumask_intersects(const struct cpumask *src1, const struct cpumask *src2)
+{
+ return cpumask_intersects(src1, src2);
+}
+
+/**
+ * bpf_cpumask_subset() - Check if a cpumask is a subset of another.
+ * @src1: The first cpumask being checked as a subset.
+ * @src2: The second cpumask being checked as a superset.
+ *
+ * Return:
+ * * true - All of the bits of @src1 are set in @src2.
+ * * false - At least one bit in @src1 is not set in @src2.
+ *
+ * struct bpf_cpumask pointers may be safely passed to @src1 and @src2.
+ */
+__bpf_kfunc bool bpf_cpumask_subset(const struct cpumask *src1, const struct cpumask *src2)
+{
+ return cpumask_subset(src1, src2);
+}
+
+/**
+ * bpf_cpumask_empty() - Check if a cpumask is empty.
+ * @cpumask: The cpumask being checked.
+ *
+ * Return:
+ * * true - None of the bits in @cpumask are set.
+ * * false - At least one bit in @cpumask is set.
+ *
+ * A struct bpf_cpumask pointer may be safely passed to @cpumask.
+ */
+__bpf_kfunc bool bpf_cpumask_empty(const struct cpumask *cpumask)
+{
+ return cpumask_empty(cpumask);
+}
+
+/**
+ * bpf_cpumask_full() - Check if a cpumask has all bits set.
+ * @cpumask: The cpumask being checked.
+ *
+ * Return:
+ * * true - All of the bits in @cpumask are set.
+ * * false - At least one bit in @cpumask is cleared.
+ *
+ * A struct bpf_cpumask pointer may be safely passed to @cpumask.
+ */
+__bpf_kfunc bool bpf_cpumask_full(const struct cpumask *cpumask)
+{
+ return cpumask_full(cpumask);
+}
+
+/**
+ * bpf_cpumask_copy() - Copy the contents of a cpumask into a BPF cpumask.
+ * @dst: The BPF cpumask being copied into.
+ * @src: The cpumask being copied.
+ *
+ * A struct bpf_cpumask pointer may be safely passed to @src.
+ */
+__bpf_kfunc void bpf_cpumask_copy(struct bpf_cpumask *dst, const struct cpumask *src)
+{
+ cpumask_copy((struct cpumask *)dst, src);
+}
+
+/**
+ * bpf_cpumask_any() - Return a random set CPU from a cpumask.
+ * @cpumask: The cpumask being queried.
+ *
+ * Return:
+ * * A random set bit within [0, num_cpus) if at least one bit is set.
+ * * >= num_cpus if no bit is set.
+ *
+ * A struct bpf_cpumask pointer may be safely passed to @src.
+ */
+__bpf_kfunc u32 bpf_cpumask_any(const struct cpumask *cpumask)
+{
+ return cpumask_any(cpumask);
+}
+
+/**
+ * bpf_cpumask_any_and() - Return a random set CPU from the AND of two
+ * cpumasks.
+ * @src1: The first cpumask.
+ * @src2: The second cpumask.
+ *
+ * Return:
+ * * A random set bit within [0, num_cpus) if at least one bit is set.
+ * * >= num_cpus if no bit is set.
+ *
+ * struct bpf_cpumask pointers may be safely passed to @src1 and @src2.
+ */
+__bpf_kfunc u32 bpf_cpumask_any_and(const struct cpumask *src1, const struct cpumask *src2)
+{
+ return cpumask_any_and(src1, src2);
+}
+
+__diag_pop();
+
+BTF_SET8_START(cpumask_kfunc_btf_ids)
+BTF_ID_FLAGS(func, bpf_cpumask_create, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_cpumask_release, KF_RELEASE)
+BTF_ID_FLAGS(func, bpf_cpumask_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_cpumask_first, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_first_zero, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_set_cpu, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_clear_cpu, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_test_cpu, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_test_and_set_cpu, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_test_and_clear_cpu, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_setall, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_clear, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_and, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_or, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_xor, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_equal, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_intersects, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_subset, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_empty, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_full, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_copy, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_any, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_any_and, KF_RCU)
+BTF_SET8_END(cpumask_kfunc_btf_ids)
+
+static const struct btf_kfunc_id_set cpumask_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &cpumask_kfunc_btf_ids,
+};
+
+BTF_ID_LIST(cpumask_dtor_ids)
+BTF_ID(struct, bpf_cpumask)
+BTF_ID(func, bpf_cpumask_release)
+
+static int __init cpumask_kfunc_init(void)
+{
+ int ret;
+ const struct btf_id_dtor_kfunc cpumask_dtors[] = {
+ {
+ .btf_id = cpumask_dtor_ids[0],
+ .kfunc_btf_id = cpumask_dtor_ids[1]
+ },
+ };
+
+ ret = bpf_mem_alloc_init(&bpf_cpumask_ma, sizeof(struct bpf_cpumask), false);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &cpumask_kfunc_set);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &cpumask_kfunc_set);
+ return ret ?: register_btf_id_dtor_kfuncs(cpumask_dtors,
+ ARRAY_SIZE(cpumask_dtors),
+ THIS_MODULE);
+}
+
+late_initcall(cpumask_kfunc_init);
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index fdc20892837c..802692fa3905 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -48,6 +48,7 @@
#include <net/xdp.h>
#include <linux/filter.h>
#include <trace/events/xdp.h>
+#include <linux/btf_ids.h>
#define DEV_CREATE_FLAG_MASK \
(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
@@ -162,13 +163,13 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
if (!capable(CAP_NET_ADMIN))
return ERR_PTR(-EPERM);
- dtab = kzalloc(sizeof(*dtab), GFP_USER | __GFP_ACCOUNT);
+ dtab = bpf_map_area_alloc(sizeof(*dtab), NUMA_NO_NODE);
if (!dtab)
return ERR_PTR(-ENOMEM);
err = dev_map_init_map(dtab, attr);
if (err) {
- kfree(dtab);
+ bpf_map_area_free(dtab);
return ERR_PTR(err);
}
@@ -239,7 +240,7 @@ static void dev_map_free(struct bpf_map *map)
bpf_map_area_free(dtab->netdev_map);
}
- kfree(dtab);
+ bpf_map_area_free(dtab);
}
static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
@@ -322,16 +323,6 @@ static int dev_map_hash_get_next_key(struct bpf_map *map, void *key,
return -ENOENT;
}
-bool dev_map_can_have_prog(struct bpf_map *map)
-{
- if ((map->map_type == BPF_MAP_TYPE_DEVMAP ||
- map->map_type == BPF_MAP_TYPE_DEVMAP_HASH) &&
- map->value_size != offsetofend(struct bpf_devmap_val, ifindex))
- return true;
-
- return false;
-}
-
static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog,
struct xdp_frame **frames, int n,
struct net_device *dev)
@@ -358,7 +349,7 @@ static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog,
frames[nframes++] = xdpf;
break;
default:
- bpf_warn_invalid_xdp_action(act);
+ bpf_warn_invalid_xdp_action(NULL, xdp_prog, act);
fallthrough;
case XDP_ABORTED:
trace_xdp_exception(dev, xdp_prog, act);
@@ -477,50 +468,85 @@ static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
bq->q[bq->count++] = xdpf;
}
-static inline int __xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
+static inline int __xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
struct net_device *dev_rx,
struct bpf_prog *xdp_prog)
{
- struct xdp_frame *xdpf;
int err;
- if (!dev->netdev_ops->ndo_xdp_xmit)
+ if (!(dev->xdp_features & NETDEV_XDP_ACT_NDO_XMIT))
+ return -EOPNOTSUPP;
+
+ if (unlikely(!(dev->xdp_features & NETDEV_XDP_ACT_NDO_XMIT_SG) &&
+ xdp_frame_has_frags(xdpf)))
return -EOPNOTSUPP;
- err = xdp_ok_fwd_dev(dev, xdp->data_end - xdp->data);
+ err = xdp_ok_fwd_dev(dev, xdp_get_frame_len(xdpf));
if (unlikely(err))
return err;
- xdpf = xdp_convert_buff_to_frame(xdp);
- if (unlikely(!xdpf))
- return -EOVERFLOW;
-
bq_enqueue(dev, xdpf, dev_rx, xdp_prog);
return 0;
}
-int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
+static u32 dev_map_bpf_prog_run_skb(struct sk_buff *skb, struct bpf_dtab_netdev *dst)
+{
+ struct xdp_txq_info txq = { .dev = dst->dev };
+ struct xdp_buff xdp;
+ u32 act;
+
+ if (!dst->xdp_prog)
+ return XDP_PASS;
+
+ __skb_pull(skb, skb->mac_len);
+ xdp.txq = &txq;
+
+ act = bpf_prog_run_generic_xdp(skb, &xdp, dst->xdp_prog);
+ switch (act) {
+ case XDP_PASS:
+ __skb_push(skb, skb->mac_len);
+ break;
+ default:
+ bpf_warn_invalid_xdp_action(NULL, dst->xdp_prog, act);
+ fallthrough;
+ case XDP_ABORTED:
+ trace_xdp_exception(dst->dev, dst->xdp_prog, act);
+ fallthrough;
+ case XDP_DROP:
+ kfree_skb(skb);
+ break;
+ }
+
+ return act;
+}
+
+int dev_xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
struct net_device *dev_rx)
{
- return __xdp_enqueue(dev, xdp, dev_rx, NULL);
+ return __xdp_enqueue(dev, xdpf, dev_rx, NULL);
}
-int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
+int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_frame *xdpf,
struct net_device *dev_rx)
{
struct net_device *dev = dst->dev;
- return __xdp_enqueue(dev, xdp, dev_rx, dst->xdp_prog);
+ return __xdp_enqueue(dev, xdpf, dev_rx, dst->xdp_prog);
}
-static bool is_valid_dst(struct bpf_dtab_netdev *obj, struct xdp_buff *xdp,
- int exclude_ifindex)
+static bool is_valid_dst(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf)
{
- if (!obj || obj->dev->ifindex == exclude_ifindex ||
- !obj->dev->netdev_ops->ndo_xdp_xmit)
+ if (!obj)
+ return false;
+
+ if (!(obj->dev->xdp_features & NETDEV_XDP_ACT_NDO_XMIT))
+ return false;
+
+ if (unlikely(!(obj->dev->xdp_features & NETDEV_XDP_ACT_NDO_XMIT_SG) &&
+ xdp_frame_has_frags(xdpf)))
return false;
- if (xdp_ok_fwd_dev(obj->dev, xdp->data_end - xdp->data))
+ if (xdp_ok_fwd_dev(obj->dev, xdp_get_frame_len(xdpf)))
return false;
return true;
@@ -541,26 +567,55 @@ static int dev_map_enqueue_clone(struct bpf_dtab_netdev *obj,
return 0;
}
-int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx,
+static inline bool is_ifindex_excluded(int *excluded, int num_excluded, int ifindex)
+{
+ while (num_excluded--) {
+ if (ifindex == excluded[num_excluded])
+ return true;
+ }
+ return false;
+}
+
+/* Get ifindex of each upper device. 'indexes' must be able to hold at
+ * least MAX_NEST_DEV elements.
+ * Returns the number of ifindexes added.
+ */
+static int get_upper_ifindexes(struct net_device *dev, int *indexes)
+{
+ struct net_device *upper;
+ struct list_head *iter;
+ int n = 0;
+
+ netdev_for_each_upper_dev_rcu(dev, upper, iter) {
+ indexes[n++] = upper->ifindex;
+ }
+ return n;
+}
+
+int dev_map_enqueue_multi(struct xdp_frame *xdpf, struct net_device *dev_rx,
struct bpf_map *map, bool exclude_ingress)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
- int exclude_ifindex = exclude_ingress ? dev_rx->ifindex : 0;
struct bpf_dtab_netdev *dst, *last_dst = NULL;
+ int excluded_devices[1+MAX_NEST_DEV];
struct hlist_head *head;
- struct xdp_frame *xdpf;
+ int num_excluded = 0;
unsigned int i;
int err;
- xdpf = xdp_convert_buff_to_frame(xdp);
- if (unlikely(!xdpf))
- return -EOVERFLOW;
+ if (exclude_ingress) {
+ num_excluded = get_upper_ifindexes(dev_rx, excluded_devices);
+ excluded_devices[num_excluded++] = dev_rx->ifindex;
+ }
if (map->map_type == BPF_MAP_TYPE_DEVMAP) {
for (i = 0; i < map->max_entries; i++) {
dst = rcu_dereference_check(dtab->netdev_map[i],
rcu_read_lock_bh_held());
- if (!is_valid_dst(dst, xdp, exclude_ifindex))
+ if (!is_valid_dst(dst, xdpf))
+ continue;
+
+ if (is_ifindex_excluded(excluded_devices, num_excluded, dst->dev->ifindex))
continue;
/* we only need n-1 clones; last_dst enqueued below */
@@ -580,7 +635,11 @@ int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx,
head = dev_map_index_hash(dtab, i);
hlist_for_each_entry_rcu(dst, head, index_hlist,
lockdep_is_held(&dtab->index_lock)) {
- if (!is_valid_dst(dst, xdp, exclude_ifindex))
+ if (!is_valid_dst(dst, xdpf))
+ continue;
+
+ if (is_ifindex_excluded(excluded_devices, num_excluded,
+ dst->dev->ifindex))
continue;
/* we only need n-1 clones; last_dst enqueued below */
@@ -615,6 +674,14 @@ int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
err = xdp_ok_fwd_dev(dst->dev, skb->len);
if (unlikely(err))
return err;
+
+ /* Redirect has already succeeded semantically at this point, so we just
+ * return 0 even if packet is dropped. Helper below takes care of
+ * freeing skb.
+ */
+ if (dev_map_bpf_prog_run_skb(skb, dst) != XDP_PASS)
+ return 0;
+
skb->dev = dst->dev;
generic_xdp_tx(skb, xdp_prog);
@@ -646,18 +713,27 @@ int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb,
bool exclude_ingress)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
- int exclude_ifindex = exclude_ingress ? dev->ifindex : 0;
struct bpf_dtab_netdev *dst, *last_dst = NULL;
+ int excluded_devices[1+MAX_NEST_DEV];
struct hlist_head *head;
struct hlist_node *next;
+ int num_excluded = 0;
unsigned int i;
int err;
+ if (exclude_ingress) {
+ num_excluded = get_upper_ifindexes(dev, excluded_devices);
+ excluded_devices[num_excluded++] = dev->ifindex;
+ }
+
if (map->map_type == BPF_MAP_TYPE_DEVMAP) {
for (i = 0; i < map->max_entries; i++) {
dst = rcu_dereference_check(dtab->netdev_map[i],
rcu_read_lock_bh_held());
- if (!dst || dst->dev->ifindex == exclude_ifindex)
+ if (!dst)
+ continue;
+
+ if (is_ifindex_excluded(excluded_devices, num_excluded, dst->dev->ifindex))
continue;
/* we only need n-1 clones; last_dst enqueued below */
@@ -671,12 +747,17 @@ int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb,
return err;
last_dst = dst;
+
}
} else { /* BPF_MAP_TYPE_DEVMAP_HASH */
for (i = 0; i < dtab->n_buckets; i++) {
head = dev_map_index_hash(dtab, i);
hlist_for_each_entry_safe(dst, next, head, index_hlist) {
- if (!dst || dst->dev->ifindex == exclude_ifindex)
+ if (!dst)
+ continue;
+
+ if (is_ifindex_excluded(excluded_devices, num_excluded,
+ dst->dev->ifindex))
continue;
/* we only need n-1 clones; last_dst enqueued below */
@@ -728,7 +809,7 @@ static void __dev_map_entry_free(struct rcu_head *rcu)
kfree(dev);
}
-static int dev_map_delete_elem(struct bpf_map *map, void *key)
+static long dev_map_delete_elem(struct bpf_map *map, void *key)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
struct bpf_dtab_netdev *old_dev;
@@ -738,12 +819,14 @@ static int dev_map_delete_elem(struct bpf_map *map, void *key)
return -EINVAL;
old_dev = unrcu_pointer(xchg(&dtab->netdev_map[k], NULL));
- if (old_dev)
+ if (old_dev) {
call_rcu(&old_dev->rcu, __dev_map_entry_free);
+ atomic_dec((atomic_t *)&dtab->items);
+ }
return 0;
}
-static int dev_map_hash_delete_elem(struct bpf_map *map, void *key)
+static long dev_map_hash_delete_elem(struct bpf_map *map, void *key)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
struct bpf_dtab_netdev *old_dev;
@@ -774,7 +857,7 @@ static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net,
struct bpf_dtab_netdev *dev;
dev = bpf_map_kmalloc_node(&dtab->map, sizeof(*dev),
- GFP_ATOMIC | __GFP_NOWARN,
+ GFP_NOWAIT | __GFP_NOWARN,
dtab->map.numa_node);
if (!dev)
return ERR_PTR(-ENOMEM);
@@ -788,7 +871,8 @@ static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net,
BPF_PROG_TYPE_XDP, false);
if (IS_ERR(prog))
goto err_put_dev;
- if (prog->expected_attach_type != BPF_XDP_DEVMAP)
+ if (prog->expected_attach_type != BPF_XDP_DEVMAP ||
+ !bpf_prog_map_compatible(&dtab->map, prog))
goto err_put_prog;
}
@@ -813,8 +897,8 @@ err_out:
return ERR_PTR(-EINVAL);
}
-static int __dev_map_update_elem(struct net *net, struct bpf_map *map,
- void *key, void *value, u64 map_flags)
+static long __dev_map_update_elem(struct net *net, struct bpf_map *map,
+ void *key, void *value, u64 map_flags)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
struct bpf_dtab_netdev *dev, *old_dev;
@@ -849,19 +933,21 @@ static int __dev_map_update_elem(struct net *net, struct bpf_map *map,
old_dev = unrcu_pointer(xchg(&dtab->netdev_map[i], RCU_INITIALIZER(dev)));
if (old_dev)
call_rcu(&old_dev->rcu, __dev_map_entry_free);
+ else
+ atomic_inc((atomic_t *)&dtab->items);
return 0;
}
-static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
- u64 map_flags)
+static long dev_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
{
return __dev_map_update_elem(current->nsproxy->net_ns,
map, key, value, map_flags);
}
-static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map,
- void *key, void *value, u64 map_flags)
+static long __dev_map_hash_update_elem(struct net *net, struct bpf_map *map,
+ void *key, void *value, u64 map_flags)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
struct bpf_dtab_netdev *dev, *old_dev;
@@ -913,28 +999,42 @@ out_err:
return err;
}
-static int dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value,
- u64 map_flags)
+static long dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
{
return __dev_map_hash_update_elem(current->nsproxy->net_ns,
map, key, value, map_flags);
}
-static int dev_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags)
+static long dev_map_redirect(struct bpf_map *map, u64 ifindex, u64 flags)
{
return __bpf_xdp_redirect_map(map, ifindex, flags,
BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS,
__dev_map_lookup_elem);
}
-static int dev_hash_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags)
+static long dev_hash_map_redirect(struct bpf_map *map, u64 ifindex, u64 flags)
{
return __bpf_xdp_redirect_map(map, ifindex, flags,
BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS,
__dev_map_hash_lookup_elem);
}
-static int dev_map_btf_id;
+static u64 dev_map_mem_usage(const struct bpf_map *map)
+{
+ struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+ u64 usage = sizeof(struct bpf_dtab);
+
+ if (map->map_type == BPF_MAP_TYPE_DEVMAP_HASH)
+ usage += (u64)dtab->n_buckets * sizeof(struct hlist_head);
+ else
+ usage += (u64)map->max_entries * sizeof(struct bpf_dtab_netdev *);
+ usage += atomic_read((atomic_t *)&dtab->items) *
+ (u64)sizeof(struct bpf_dtab_netdev);
+ return usage;
+}
+
+BTF_ID_LIST_SINGLE(dev_map_btf_ids, struct, bpf_dtab)
const struct bpf_map_ops dev_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
.map_alloc = dev_map_alloc,
@@ -944,12 +1044,11 @@ const struct bpf_map_ops dev_map_ops = {
.map_update_elem = dev_map_update_elem,
.map_delete_elem = dev_map_delete_elem,
.map_check_btf = map_check_no_btf,
- .map_btf_name = "bpf_dtab",
- .map_btf_id = &dev_map_btf_id,
+ .map_mem_usage = dev_map_mem_usage,
+ .map_btf_id = &dev_map_btf_ids[0],
.map_redirect = dev_map_redirect,
};
-static int dev_map_hash_map_btf_id;
const struct bpf_map_ops dev_map_hash_ops = {
.map_meta_equal = bpf_map_meta_equal,
.map_alloc = dev_map_alloc,
@@ -959,8 +1058,8 @@ const struct bpf_map_ops dev_map_hash_ops = {
.map_update_elem = dev_map_hash_update_elem,
.map_delete_elem = dev_map_hash_delete_elem,
.map_check_btf = map_check_no_btf,
- .map_btf_name = "bpf_dtab",
- .map_btf_id = &dev_map_hash_map_btf_id,
+ .map_mem_usage = dev_map_mem_usage,
+ .map_btf_id = &dev_map_btf_ids[0],
.map_redirect = dev_hash_map_redirect,
};
@@ -1030,9 +1129,11 @@ static int dev_map_notification(struct notifier_block *notifier,
if (!dev || netdev != dev->dev)
continue;
odev = unrcu_pointer(cmpxchg(&dtab->netdev_map[i], RCU_INITIALIZER(dev), NULL));
- if (dev == odev)
+ if (dev == odev) {
call_rcu(&dev->rcu,
__dev_map_entry_free);
+ atomic_dec((atomic_t *)&dtab->items);
+ }
}
}
rcu_read_unlock();
diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c
index bbfc6bb79240..7b4afb7d96db 100644
--- a/kernel/bpf/disasm.c
+++ b/kernel/bpf/disasm.c
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0-only
+// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
* Copyright (c) 2016 Facebook
*/
@@ -206,15 +206,17 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs,
verbose(cbs->private_data, "BUG_%02x\n", insn->code);
}
} else if (class == BPF_ST) {
- if (BPF_MODE(insn->code) != BPF_MEM) {
+ if (BPF_MODE(insn->code) == BPF_MEM) {
+ verbose(cbs->private_data, "(%02x) *(%s *)(r%d %+d) = %d\n",
+ insn->code,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->dst_reg,
+ insn->off, insn->imm);
+ } else if (BPF_MODE(insn->code) == 0xc0 /* BPF_NOSPEC, no UAPI */) {
+ verbose(cbs->private_data, "(%02x) nospec\n", insn->code);
+ } else {
verbose(cbs->private_data, "BUG_st_%02x\n", insn->code);
- return;
}
- verbose(cbs->private_data, "(%02x) *(%s *)(r%d %+d) = %d\n",
- insn->code,
- bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
- insn->dst_reg,
- insn->off, insn->imm);
} else if (class == BPF_LDX) {
if (BPF_MODE(insn->code) != BPF_MEM) {
verbose(cbs->private_data, "BUG_ldx_%02x\n", insn->code);
diff --git a/kernel/bpf/disasm.h b/kernel/bpf/disasm.h
index e546b18d27da..a4b040793f44 100644
--- a/kernel/bpf/disasm.h
+++ b/kernel/bpf/disasm.h
@@ -1,4 +1,4 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
* Copyright (c) 2016 Facebook
*/
diff --git a/kernel/bpf/dispatcher.c b/kernel/bpf/dispatcher.c
index 2444bd15cc2d..fa3e9225aedc 100644
--- a/kernel/bpf/dispatcher.c
+++ b/kernel/bpf/dispatcher.c
@@ -4,6 +4,7 @@
#include <linux/hash.h>
#include <linux/bpf.h>
#include <linux/filter.h>
+#include <linux/static_call.h>
/* The BPF dispatcher is a multiway branch code generator. The
* dispatcher is a mechanism to avoid the performance penalty of an
@@ -85,12 +86,12 @@ static bool bpf_dispatcher_remove_prog(struct bpf_dispatcher *d,
return false;
}
-int __weak arch_prepare_bpf_dispatcher(void *image, s64 *funcs, int num_funcs)
+int __weak arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_funcs)
{
return -ENOTSUPP;
}
-static int bpf_dispatcher_prepare(struct bpf_dispatcher *d, void *image)
+static int bpf_dispatcher_prepare(struct bpf_dispatcher *d, void *image, void *buf)
{
s64 ips[BPF_DISPATCHER_MAX] = {}, *ipsp = &ips[0];
int i;
@@ -99,34 +100,38 @@ static int bpf_dispatcher_prepare(struct bpf_dispatcher *d, void *image)
if (d->progs[i].prog)
*ipsp++ = (s64)(uintptr_t)d->progs[i].prog->bpf_func;
}
- return arch_prepare_bpf_dispatcher(image, &ips[0], d->num_progs);
+ return arch_prepare_bpf_dispatcher(image, buf, &ips[0], d->num_progs);
}
static void bpf_dispatcher_update(struct bpf_dispatcher *d, int prev_num_progs)
{
- void *old, *new;
- u32 noff;
- int err;
-
- if (!prev_num_progs) {
- old = NULL;
- noff = 0;
- } else {
- old = d->image + d->image_off;
+ void *new, *tmp;
+ u32 noff = 0;
+
+ if (prev_num_progs)
noff = d->image_off ^ (PAGE_SIZE / 2);
- }
new = d->num_progs ? d->image + noff : NULL;
+ tmp = d->num_progs ? d->rw_image + noff : NULL;
if (new) {
- if (bpf_dispatcher_prepare(d, new))
+ /* Prepare the dispatcher in d->rw_image. Then use
+ * bpf_arch_text_copy to update d->image, which is RO+X.
+ */
+ if (bpf_dispatcher_prepare(d, new, tmp))
+ return;
+ if (IS_ERR(bpf_arch_text_copy(new, tmp, PAGE_SIZE / 2)))
return;
}
- err = bpf_arch_text_poke(d->func, BPF_MOD_JUMP, old, new);
- if (err || !new)
- return;
+ __BPF_DISPATCHER_UPDATE(d, new ?: (void *)&bpf_dispatcher_nop_func);
- d->image_off = noff;
+ /* Make sure all the callers executing the previous/old half of the
+ * image leave it, so following update call can modify it safely.
+ */
+ synchronize_rcu();
+
+ if (new)
+ d->image_off = noff;
}
void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from,
@@ -140,9 +145,18 @@ void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from,
mutex_lock(&d->mutex);
if (!d->image) {
- d->image = bpf_jit_alloc_exec_page();
+ d->image = bpf_prog_pack_alloc(PAGE_SIZE, bpf_jit_fill_hole_with_zero);
if (!d->image)
goto out;
+ d->rw_image = bpf_jit_alloc_exec(PAGE_SIZE);
+ if (!d->rw_image) {
+ u32 size = PAGE_SIZE;
+
+ bpf_arch_text_copy(d->image, &size, sizeof(size));
+ bpf_prog_pack_free((struct bpf_binary_header *)d->image);
+ d->image = NULL;
+ goto out;
+ }
bpf_image_ksym_add(d->image, &d->ksym);
}
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 72c58cc516a3..00c253b84bf5 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -10,9 +10,11 @@
#include <linux/random.h>
#include <uapi/linux/btf.h>
#include <linux/rcupdate_trace.h>
+#include <linux/btf_ids.h>
#include "percpu_freelist.h"
#include "bpf_lru_list.h"
#include "map_in_map.h"
+#include <linux/bpf_mem_alloc.h>
#define HTAB_CREATE_FLAG_MASK \
(BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE | \
@@ -60,30 +62,22 @@
*
* As regular device interrupt handlers and soft interrupts are forced into
* thread context, the existing code which does
- * spin_lock*(); alloc(GPF_ATOMIC); spin_unlock*();
+ * spin_lock*(); alloc(GFP_ATOMIC); spin_unlock*();
* just works.
*
* In theory the BPF locks could be converted to regular spinlocks as well,
* but the bucket locks and percpu_freelist locks can be taken from
* arbitrary contexts (perf, kprobes, tracepoints) which are required to be
- * atomic contexts even on RT. These mechanisms require preallocated maps,
- * so there is no need to invoke memory allocations within the lock held
- * sections.
- *
- * BPF maps which need dynamic allocation are only used from (forced)
- * thread context on RT and can therefore use regular spinlocks which in
- * turn allows to invoke memory allocations from the lock held section.
- *
- * On a non RT kernel this distinction is neither possible nor required.
- * spinlock maps to raw_spinlock and the extra code is optimized out by the
- * compiler.
+ * atomic contexts even on RT. Before the introduction of bpf_mem_alloc,
+ * it is only safe to use raw spinlock for preallocated hash map on a RT kernel,
+ * because there is no memory allocation within the lock held sections. However
+ * after hash map was fully converted to use bpf_mem_alloc, there will be
+ * non-synchronous memory allocation for non-preallocated hash map, so it is
+ * safe to always use raw spinlock for bucket lock.
*/
struct bucket {
struct hlist_nulls_head head;
- union {
- raw_spinlock_t raw_lock;
- spinlock_t lock;
- };
+ raw_spinlock_t raw_lock;
};
#define HASHTAB_MAP_LOCK_COUNT 8
@@ -91,6 +85,8 @@ struct bucket {
struct bpf_htab {
struct bpf_map map;
+ struct bpf_mem_alloc ma;
+ struct bpf_mem_alloc pcpu_ma;
struct bucket *buckets;
void *elems;
union {
@@ -98,7 +94,12 @@ struct bpf_htab {
struct bpf_lru lru;
};
struct htab_elem *__percpu *extra_elems;
- atomic_t count; /* number of elements in this hashtable */
+ /* number of elements in non-preallocated hashtable are kept
+ * in either pcount or count
+ */
+ struct percpu_counter pcount;
+ atomic_t count;
+ bool use_percpu_counter;
u32 n_buckets; /* number of hash buckets */
u32 elem_size; /* size of each element in bytes */
u32 hashrnd;
@@ -113,14 +114,14 @@ struct htab_elem {
struct {
void *padding;
union {
- struct bpf_htab *htab;
struct pcpu_freelist_node fnode;
struct htab_elem *batch_flink;
};
};
};
union {
- struct rcu_head rcu;
+ /* pointer to per-cpu pointer */
+ void *ptr_to_pptr;
struct bpf_lru_node lru_node;
};
u32 hash;
@@ -132,26 +133,15 @@ static inline bool htab_is_prealloc(const struct bpf_htab *htab)
return !(htab->map.map_flags & BPF_F_NO_PREALLOC);
}
-static inline bool htab_use_raw_lock(const struct bpf_htab *htab)
-{
- return (!IS_ENABLED(CONFIG_PREEMPT_RT) || htab_is_prealloc(htab));
-}
-
static void htab_init_buckets(struct bpf_htab *htab)
{
- unsigned i;
+ unsigned int i;
for (i = 0; i < htab->n_buckets; i++) {
INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i);
- if (htab_use_raw_lock(htab)) {
- raw_spin_lock_init(&htab->buckets[i].raw_lock);
- lockdep_set_class(&htab->buckets[i].raw_lock,
+ raw_spin_lock_init(&htab->buckets[i].raw_lock);
+ lockdep_set_class(&htab->buckets[i].raw_lock,
&htab->lockdep_key);
- } else {
- spin_lock_init(&htab->buckets[i].lock);
- lockdep_set_class(&htab->buckets[i].lock,
- &htab->lockdep_key);
- }
cond_resched();
}
}
@@ -162,19 +152,16 @@ static inline int htab_lock_bucket(const struct bpf_htab *htab,
{
unsigned long flags;
- hash = hash & HASHTAB_MAP_LOCK_MASK;
+ hash = hash & min_t(u32, HASHTAB_MAP_LOCK_MASK, htab->n_buckets - 1);
- migrate_disable();
+ preempt_disable();
if (unlikely(__this_cpu_inc_return(*(htab->map_locked[hash])) != 1)) {
__this_cpu_dec(*(htab->map_locked[hash]));
- migrate_enable();
+ preempt_enable();
return -EBUSY;
}
- if (htab_use_raw_lock(htab))
- raw_spin_lock_irqsave(&b->raw_lock, flags);
- else
- spin_lock_irqsave(&b->lock, flags);
+ raw_spin_lock_irqsave(&b->raw_lock, flags);
*pflags = flags;
return 0;
@@ -184,13 +171,10 @@ static inline void htab_unlock_bucket(const struct bpf_htab *htab,
struct bucket *b, u32 hash,
unsigned long flags)
{
- hash = hash & HASHTAB_MAP_LOCK_MASK;
- if (htab_use_raw_lock(htab))
- raw_spin_unlock_irqrestore(&b->raw_lock, flags);
- else
- spin_unlock_irqrestore(&b->lock, flags);
+ hash = hash & min_t(u32, HASHTAB_MAP_LOCK_MASK, htab->n_buckets - 1);
+ raw_spin_unlock_irqrestore(&b->raw_lock, flags);
__this_cpu_dec(*(htab->map_locked[hash]));
- migrate_enable();
+ preempt_enable();
}
static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node);
@@ -228,6 +212,59 @@ static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i)
return (struct htab_elem *) (htab->elems + i * (u64)htab->elem_size);
}
+static bool htab_has_extra_elems(struct bpf_htab *htab)
+{
+ return !htab_is_percpu(htab) && !htab_is_lru(htab);
+}
+
+static void htab_free_prealloced_timers(struct bpf_htab *htab)
+{
+ u32 num_entries = htab->map.max_entries;
+ int i;
+
+ if (!btf_record_has_field(htab->map.record, BPF_TIMER))
+ return;
+ if (htab_has_extra_elems(htab))
+ num_entries += num_possible_cpus();
+
+ for (i = 0; i < num_entries; i++) {
+ struct htab_elem *elem;
+
+ elem = get_htab_elem(htab, i);
+ bpf_obj_free_timer(htab->map.record, elem->key + round_up(htab->map.key_size, 8));
+ cond_resched();
+ }
+}
+
+static void htab_free_prealloced_fields(struct bpf_htab *htab)
+{
+ u32 num_entries = htab->map.max_entries;
+ int i;
+
+ if (IS_ERR_OR_NULL(htab->map.record))
+ return;
+ if (htab_has_extra_elems(htab))
+ num_entries += num_possible_cpus();
+ for (i = 0; i < num_entries; i++) {
+ struct htab_elem *elem;
+
+ elem = get_htab_elem(htab, i);
+ if (htab_is_percpu(htab)) {
+ void __percpu *pptr = htab_elem_get_ptr(elem, htab->map.key_size);
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ bpf_obj_free_fields(htab->map.record, per_cpu_ptr(pptr, cpu));
+ cond_resched();
+ }
+ } else {
+ bpf_obj_free_fields(htab->map.record, elem->key + round_up(htab->map.key_size, 8));
+ cond_resched();
+ }
+ cond_resched();
+ }
+}
+
static void htab_free_elems(struct bpf_htab *htab)
{
int i;
@@ -278,7 +315,7 @@ static int prealloc_init(struct bpf_htab *htab)
u32 num_entries = htab->map.max_entries;
int err = -ENOMEM, i;
- if (!htab_is_percpu(htab) && !htab_is_lru(htab))
+ if (htab_has_extra_elems(htab))
num_entries += num_possible_cpus();
htab->elems = bpf_map_area_alloc((u64)htab->elem_size * num_entries,
@@ -382,8 +419,6 @@ static int htab_map_alloc_check(union bpf_attr *attr)
bool zero_seed = (attr->map_flags & BPF_F_ZERO_SEED);
int numa_node = bpf_map_attr_numa_node(attr);
- BUILD_BUG_ON(offsetof(struct htab_elem, htab) !=
- offsetof(struct htab_elem, hash_node.pprev));
BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) !=
offsetof(struct htab_elem, hash_node.pprev));
@@ -445,7 +480,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
struct bpf_htab *htab;
int err, i;
- htab = kzalloc(sizeof(*htab), GFP_USER | __GFP_ACCOUNT);
+ htab = bpf_map_area_alloc(sizeof(*htab), NUMA_NO_NODE);
if (!htab)
return ERR_PTR(-ENOMEM);
@@ -500,10 +535,33 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
if (htab->map.map_flags & BPF_F_ZERO_SEED)
htab->hashrnd = 0;
else
- htab->hashrnd = get_random_int();
+ htab->hashrnd = get_random_u32();
htab_init_buckets(htab);
+/* compute_batch_value() computes batch value as num_online_cpus() * 2
+ * and __percpu_counter_compare() needs
+ * htab->max_entries - cur_number_of_elems to be more than batch * num_online_cpus()
+ * for percpu_counter to be faster than atomic_t. In practice the average bpf
+ * hash map size is 10k, which means that a system with 64 cpus will fill
+ * hashmap to 20% of 10k before percpu_counter becomes ineffective. Therefore
+ * define our own batch count as 32 then 10k hash map can be filled up to 80%:
+ * 10k - 8k > 32 _batch_ * 64 _cpus_
+ * and __percpu_counter_compare() will still be fast. At that point hash map
+ * collisions will dominate its performance anyway. Assume that hash map filled
+ * to 50+% isn't going to be O(1) and use the following formula to choose
+ * between percpu_counter and atomic_t.
+ */
+#define PERCPU_COUNTER_BATCH 32
+ if (attr->max_entries / 2 > num_online_cpus() * PERCPU_COUNTER_BATCH)
+ htab->use_percpu_counter = true;
+
+ if (htab->use_percpu_counter) {
+ err = percpu_counter_init(&htab->pcount, 0, GFP_KERNEL);
+ if (err)
+ goto free_map_locked;
+ }
+
if (prealloc) {
err = prealloc_init(htab);
if (err)
@@ -517,6 +575,16 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
if (err)
goto free_prealloc;
}
+ } else {
+ err = bpf_mem_alloc_init(&htab->ma, htab->elem_size, false);
+ if (err)
+ goto free_map_locked;
+ if (percpu) {
+ err = bpf_mem_alloc_init(&htab->pcpu_ma,
+ round_up(htab->map.value_size, 8), true);
+ if (err)
+ goto free_map_locked;
+ }
}
return &htab->map;
@@ -524,17 +592,23 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
free_prealloc:
prealloc_destroy(htab);
free_map_locked:
+ if (htab->use_percpu_counter)
+ percpu_counter_destroy(&htab->pcount);
for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++)
free_percpu(htab->map_locked[i]);
bpf_map_area_free(htab->buckets);
+ bpf_mem_alloc_destroy(&htab->pcpu_ma);
+ bpf_mem_alloc_destroy(&htab->ma);
free_htab:
lockdep_unregister_key(&htab->lockdep_key);
- kfree(htab);
+ bpf_map_area_free(htab);
return ERR_PTR(err);
}
static inline u32 htab_map_hash(const void *key, u32 key_len, u32 hashrnd)
{
+ if (likely(key_len % 4 == 0))
+ return jhash2(key, key_len / 4, hashrnd);
return jhash(key, key_len, hashrnd);
}
@@ -638,7 +712,7 @@ static int htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
(void *(*)(struct bpf_map *map, void *key))NULL));
- *insn++ = BPF_EMIT_CALL(BPF_CAST_CALL(__htab_map_lookup_elem));
+ *insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem);
*insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1);
*insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
offsetof(struct htab_elem, key) +
@@ -679,7 +753,7 @@ static int htab_lru_map_gen_lookup(struct bpf_map *map,
BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
(void *(*)(struct bpf_map *map, void *key))NULL));
- *insn++ = BPF_EMIT_CALL(BPF_CAST_CALL(__htab_map_lookup_elem));
+ *insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem);
*insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 4);
*insn++ = BPF_LDX_MEM(BPF_B, ref_reg, ret,
offsetof(struct htab_elem, lru_node) +
@@ -695,12 +769,28 @@ static int htab_lru_map_gen_lookup(struct bpf_map *map,
return insn - insn_buf;
}
+static void check_and_free_fields(struct bpf_htab *htab,
+ struct htab_elem *elem)
+{
+ if (htab_is_percpu(htab)) {
+ void __percpu *pptr = htab_elem_get_ptr(elem, htab->map.key_size);
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ bpf_obj_free_fields(htab->map.record, per_cpu_ptr(pptr, cpu));
+ } else {
+ void *map_value = elem->key + round_up(htab->map.key_size, 8);
+
+ bpf_obj_free_fields(htab->map.record, map_value);
+ }
+}
+
/* It is called from the bpf_lru_list when the LRU needs to delete
* older elements from the htab.
*/
static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
{
- struct bpf_htab *htab = (struct bpf_htab *)arg;
+ struct bpf_htab *htab = arg;
struct htab_elem *l = NULL, *tgt_l;
struct hlist_nulls_head *head;
struct hlist_nulls_node *n;
@@ -719,6 +809,7 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
if (l == tgt_l) {
hlist_nulls_del_rcu(&l->hash_node);
+ check_and_free_fields(htab, l);
break;
}
@@ -788,17 +879,10 @@ find_first_elem:
static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l)
{
+ check_and_free_fields(htab, l);
if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)
- free_percpu(htab_elem_get_ptr(l, htab->map.key_size));
- kfree(l);
-}
-
-static void htab_elem_free_rcu(struct rcu_head *head)
-{
- struct htab_elem *l = container_of(head, struct htab_elem, rcu);
- struct bpf_htab *htab = l->htab;
-
- htab_elem_free(htab, l);
+ bpf_mem_cache_free(&htab->pcpu_ma, l->ptr_to_pptr);
+ bpf_mem_cache_free(&htab->ma, l);
}
static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l)
@@ -812,16 +896,41 @@ static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l)
}
}
+static bool is_map_full(struct bpf_htab *htab)
+{
+ if (htab->use_percpu_counter)
+ return __percpu_counter_compare(&htab->pcount, htab->map.max_entries,
+ PERCPU_COUNTER_BATCH) >= 0;
+ return atomic_read(&htab->count) >= htab->map.max_entries;
+}
+
+static void inc_elem_count(struct bpf_htab *htab)
+{
+ if (htab->use_percpu_counter)
+ percpu_counter_add_batch(&htab->pcount, 1, PERCPU_COUNTER_BATCH);
+ else
+ atomic_inc(&htab->count);
+}
+
+static void dec_elem_count(struct bpf_htab *htab)
+{
+ if (htab->use_percpu_counter)
+ percpu_counter_add_batch(&htab->pcount, -1, PERCPU_COUNTER_BATCH);
+ else
+ atomic_dec(&htab->count);
+}
+
+
static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
{
htab_put_fd_value(htab, l);
if (htab_is_prealloc(htab)) {
+ check_and_free_fields(htab, l);
__pcpu_freelist_push(&htab->freelist, &l->fnode);
} else {
- atomic_dec(&htab->count);
- l->htab = htab;
- call_rcu(&l->rcu, htab_elem_free_rcu);
+ dec_elem_count(htab);
+ htab_elem_free(htab, l);
}
}
@@ -830,14 +939,13 @@ static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr,
{
if (!onallcpus) {
/* copy true value_size bytes */
- memcpy(this_cpu_ptr(pptr), value, htab->map.value_size);
+ copy_map_value(&htab->map, this_cpu_ptr(pptr), value);
} else {
u32 size = round_up(htab->map.value_size, 8);
int off = 0, cpu;
for_each_possible_cpu(cpu) {
- bpf_long_memcpy(per_cpu_ptr(pptr, cpu),
- value + off, size);
+ copy_map_value_long(&htab->map, per_cpu_ptr(pptr, cpu), value + off);
off += size;
}
}
@@ -846,23 +954,20 @@ static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr,
static void pcpu_init_value(struct bpf_htab *htab, void __percpu *pptr,
void *value, bool onallcpus)
{
- /* When using prealloc and not setting the initial value on all cpus,
- * zero-fill element values for other cpus (just as what happens when
- * not using prealloc). Otherwise, bpf program has no way to ensure
+ /* When not setting the initial value on all cpus, zero-fill element
+ * values for other cpus. Otherwise, bpf program has no way to ensure
* known initial values for cpus other than current one
* (onallcpus=false always when coming from bpf prog).
*/
- if (htab_is_prealloc(htab) && !onallcpus) {
- u32 size = round_up(htab->map.value_size, 8);
+ if (!onallcpus) {
int current_cpu = raw_smp_processor_id();
int cpu;
for_each_possible_cpu(cpu) {
if (cpu == current_cpu)
- bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value,
- size);
- else
- memset(per_cpu_ptr(pptr, cpu), 0, size);
+ copy_map_value_long(&htab->map, per_cpu_ptr(pptr, cpu), value);
+ else /* Since elem is preallocated, we cannot touch special fields */
+ zero_map_value(&htab->map, per_cpu_ptr(pptr, cpu));
}
} else {
pcpu_copy_value(htab, pptr, value, onallcpus);
@@ -903,41 +1008,36 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
l_new = container_of(l, struct htab_elem, fnode);
}
} else {
- if (atomic_inc_return(&htab->count) > htab->map.max_entries)
- if (!old_elem) {
+ if (is_map_full(htab))
+ if (!old_elem)
/* when map is full and update() is replacing
* old element, it's ok to allocate, since
* old element will be freed immediately.
* Otherwise return an error
*/
- l_new = ERR_PTR(-E2BIG);
- goto dec_count;
- }
- l_new = bpf_map_kmalloc_node(&htab->map, htab->elem_size,
- GFP_ATOMIC | __GFP_NOWARN,
- htab->map.numa_node);
+ return ERR_PTR(-E2BIG);
+ inc_elem_count(htab);
+ l_new = bpf_mem_cache_alloc(&htab->ma);
if (!l_new) {
l_new = ERR_PTR(-ENOMEM);
goto dec_count;
}
- check_and_init_map_lock(&htab->map,
- l_new->key + round_up(key_size, 8));
}
memcpy(l_new->key, key, key_size);
if (percpu) {
- size = round_up(size, 8);
if (prealloc) {
pptr = htab_elem_get_ptr(l_new, key_size);
} else {
/* alloc_percpu zero-fills */
- pptr = bpf_map_alloc_percpu(&htab->map, size, 8,
- GFP_ATOMIC | __GFP_NOWARN);
+ pptr = bpf_mem_cache_alloc(&htab->pcpu_ma);
if (!pptr) {
- kfree(l_new);
+ bpf_mem_cache_free(&htab->ma, l_new);
l_new = ERR_PTR(-ENOMEM);
goto dec_count;
}
+ l_new->ptr_to_pptr = pptr;
+ pptr = *(void **)pptr;
}
pcpu_init_value(htab, pptr, value, onallcpus);
@@ -956,7 +1056,7 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
l_new->hash = hash;
return l_new;
dec_count:
- atomic_dec(&htab->count);
+ dec_elem_count(htab);
return l_new;
}
@@ -975,8 +1075,8 @@ static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old,
}
/* Called from syscall or from eBPF program */
-static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
- u64 map_flags)
+static long htab_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
struct htab_elem *l_new = NULL, *l_old;
@@ -1001,7 +1101,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
head = &b->head;
if (unlikely(map_flags & BPF_F_LOCK)) {
- if (unlikely(!map_value_has_spin_lock(map)))
+ if (unlikely(!btf_record_has_field(map->record, BPF_SPIN_LOCK)))
return -EINVAL;
/* find an element without taking the bucket lock */
l_old = lookup_nulls_elem_raw(head, hash, key, key_size,
@@ -1062,6 +1162,8 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
hlist_nulls_del_rcu(&l_old->hash_node);
if (!htab_is_prealloc(htab))
free_htab_elem(htab, l_old);
+ else
+ check_and_free_fields(htab, l_old);
}
ret = 0;
err:
@@ -1069,8 +1171,14 @@ err:
return ret;
}
-static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
- u64 map_flags)
+static void htab_lru_push_free(struct bpf_htab *htab, struct htab_elem *elem)
+{
+ check_and_free_fields(htab, elem);
+ bpf_lru_push_free(&htab->lru, &elem->lru_node);
+}
+
+static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
struct htab_elem *l_new, *l_old = NULL;
@@ -1102,7 +1210,8 @@ static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
l_new = prealloc_lru_pop(htab, key, hash);
if (!l_new)
return -ENOMEM;
- memcpy(l_new->key + round_up(map->key_size, 8), value, map->value_size);
+ copy_map_value(&htab->map,
+ l_new->key + round_up(map->key_size, 8), value);
ret = htab_lock_bucket(htab, b, hash, &flags);
if (ret)
@@ -1128,16 +1237,16 @@ err:
htab_unlock_bucket(htab, b, hash, flags);
if (ret)
- bpf_lru_push_free(&htab->lru, &l_new->lru_node);
+ htab_lru_push_free(htab, l_new);
else if (l_old)
- bpf_lru_push_free(&htab->lru, &l_old->lru_node);
+ htab_lru_push_free(htab, l_old);
return ret;
}
-static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
- void *value, u64 map_flags,
- bool onallcpus)
+static long __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 map_flags,
+ bool onallcpus)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
struct htab_elem *l_new = NULL, *l_old;
@@ -1190,9 +1299,9 @@ err:
return ret;
}
-static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
- void *value, u64 map_flags,
- bool onallcpus)
+static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 map_flags,
+ bool onallcpus)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
struct htab_elem *l_new = NULL, *l_old;
@@ -1257,21 +1366,21 @@ err:
return ret;
}
-static int htab_percpu_map_update_elem(struct bpf_map *map, void *key,
- void *value, u64 map_flags)
+static long htab_percpu_map_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 map_flags)
{
return __htab_percpu_map_update_elem(map, key, value, map_flags, false);
}
-static int htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
- void *value, u64 map_flags)
+static long htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 map_flags)
{
return __htab_lru_percpu_map_update_elem(map, key, value, map_flags,
false);
}
/* Called from syscall or from eBPF program */
-static int htab_map_delete_elem(struct bpf_map *map, void *key)
+static long htab_map_delete_elem(struct bpf_map *map, void *key)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
struct hlist_nulls_head *head;
@@ -1307,7 +1416,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
return ret;
}
-static int htab_lru_map_delete_elem(struct bpf_map *map, void *key)
+static long htab_lru_map_delete_elem(struct bpf_map *map, void *key)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
struct hlist_nulls_head *head;
@@ -1339,7 +1448,7 @@ static int htab_lru_map_delete_elem(struct bpf_map *map, void *key)
htab_unlock_bucket(htab, b, hash, flags);
if (l)
- bpf_lru_push_free(&htab->lru, &l->lru_node);
+ htab_lru_push_free(htab, l);
return ret;
}
@@ -1347,6 +1456,10 @@ static void delete_all_elements(struct bpf_htab *htab)
{
int i;
+ /* It's called from a worker thread, so disable migration here,
+ * since bpf_mem_cache_free() relies on that.
+ */
+ migrate_disable();
for (i = 0; i < htab->n_buckets; i++) {
struct hlist_nulls_head *head = select_bucket(htab, i);
struct hlist_nulls_node *n;
@@ -1357,6 +1470,39 @@ static void delete_all_elements(struct bpf_htab *htab)
htab_elem_free(htab, l);
}
}
+ migrate_enable();
+}
+
+static void htab_free_malloced_timers(struct bpf_htab *htab)
+{
+ int i;
+
+ rcu_read_lock();
+ for (i = 0; i < htab->n_buckets; i++) {
+ struct hlist_nulls_head *head = select_bucket(htab, i);
+ struct hlist_nulls_node *n;
+ struct htab_elem *l;
+
+ hlist_nulls_for_each_entry(l, n, head, hash_node) {
+ /* We only free timer on uref dropping to zero */
+ bpf_obj_free_timer(htab->map.record, l->key + round_up(htab->map.key_size, 8));
+ }
+ cond_resched_rcu();
+ }
+ rcu_read_unlock();
+}
+
+static void htab_map_free_timers(struct bpf_map *map)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+
+ /* We only free timer on uref dropping to zero */
+ if (!btf_record_has_field(htab->map.record, BPF_TIMER))
+ return;
+ if (!htab_is_prealloc(htab))
+ htab_free_malloced_timers(htab);
+ else
+ htab_free_prealloced_timers(htab);
}
/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
@@ -1370,21 +1516,27 @@ static void htab_map_free(struct bpf_map *map)
* There is no need to synchronize_rcu() here to protect map elements.
*/
- /* some of free_htab_elem() callbacks for elements of this map may
- * not have executed. Wait for them.
+ /* htab no longer uses call_rcu() directly. bpf_mem_alloc does it
+ * underneath and is reponsible for waiting for callbacks to finish
+ * during bpf_mem_alloc_destroy().
*/
- rcu_barrier();
- if (!htab_is_prealloc(htab))
+ if (!htab_is_prealloc(htab)) {
delete_all_elements(htab);
- else
+ } else {
+ htab_free_prealloced_fields(htab);
prealloc_destroy(htab);
+ }
free_percpu(htab->extra_elems);
bpf_map_area_free(htab->buckets);
+ bpf_mem_alloc_destroy(&htab->pcpu_ma);
+ bpf_mem_alloc_destroy(&htab->ma);
+ if (htab->use_percpu_counter)
+ percpu_counter_destroy(&htab->pcount);
for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++)
free_percpu(htab->map_locked[i]);
lockdep_unregister_key(&htab->lockdep_key);
- kfree(htab);
+ bpf_map_area_free(htab);
}
static void htab_map_seq_show_elem(struct bpf_map *map, void *key,
@@ -1441,9 +1593,8 @@ static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
pptr = htab_elem_get_ptr(l, key_size);
for_each_possible_cpu(cpu) {
- bpf_long_memcpy(value + off,
- per_cpu_ptr(pptr, cpu),
- roundup_value_size);
+ copy_map_value_long(&htab->map, value + off, per_cpu_ptr(pptr, cpu));
+ check_and_init_map_value(&htab->map, value + off);
off += roundup_value_size;
}
} else {
@@ -1456,7 +1607,8 @@ static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
else
copy_map_value(map, value, l->key +
roundup_key_size);
- check_and_init_map_lock(map, value);
+ /* Zeroing special fields in the temp buffer */
+ check_and_init_map_value(map, value);
}
hlist_nulls_del_rcu(&l->hash_node);
@@ -1467,7 +1619,7 @@ static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
htab_unlock_bucket(htab, b, hash, bflags);
if (is_lru_map && l)
- bpf_lru_push_free(&htab->lru, &l->lru_node);
+ htab_lru_push_free(htab, l);
return ret;
}
@@ -1515,7 +1667,7 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map,
void __user *uvalues = u64_to_user_ptr(attr->batch.values);
void __user *ukeys = u64_to_user_ptr(attr->batch.keys);
void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch);
- u32 batch, max_count, size, bucket_size;
+ u32 batch, max_count, size, bucket_size, map_id;
struct htab_elem *node_to_free = NULL;
u64 elem_map_flags, map_flags;
struct hlist_nulls_head *head;
@@ -1528,7 +1680,7 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map,
elem_map_flags = attr->batch.elem_flags;
if ((elem_map_flags & ~BPF_F_LOCK) ||
- ((elem_map_flags & BPF_F_LOCK) && !map_value_has_spin_lock(map)))
+ ((elem_map_flags & BPF_F_LOCK) && !btf_record_has_field(map->record, BPF_SPIN_LOCK)))
return -EINVAL;
map_flags = attr->batch.flags;
@@ -1557,7 +1709,7 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map,
value_size = size * num_possible_cpus();
total = 0;
/* while experimenting with hash tables with sizes ranging from 10 to
- * 1000, it was observed that a bucket can have upto 5 entries.
+ * 1000, it was observed that a bucket can have up to 5 entries.
*/
bucket_size = 5;
@@ -1565,8 +1717,8 @@ alloc:
/* We cannot do copy_from_user or copy_to_user inside
* the rcu_read_lock. Allocate enough space here.
*/
- keys = kvmalloc(key_size * bucket_size, GFP_USER | __GFP_NOWARN);
- values = kvmalloc(value_size * bucket_size, GFP_USER | __GFP_NOWARN);
+ keys = kvmalloc_array(key_size, bucket_size, GFP_USER | __GFP_NOWARN);
+ values = kvmalloc_array(value_size, bucket_size, GFP_USER | __GFP_NOWARN);
if (!keys || !values) {
ret = -ENOMEM;
goto after_loop;
@@ -1583,8 +1735,11 @@ again_nocopy:
/* do not grab the lock unless need it (bucket_cnt > 0). */
if (locked) {
ret = htab_lock_bucket(htab, b, batch, &flags);
- if (ret)
- goto next_batch;
+ if (ret) {
+ rcu_read_unlock();
+ bpf_enable_instrumentation();
+ goto after_loop;
+ }
}
bucket_cnt = 0;
@@ -1634,18 +1789,27 @@ again_nocopy:
pptr = htab_elem_get_ptr(l, map->key_size);
for_each_possible_cpu(cpu) {
- bpf_long_memcpy(dst_val + off,
- per_cpu_ptr(pptr, cpu), size);
+ copy_map_value_long(&htab->map, dst_val + off, per_cpu_ptr(pptr, cpu));
+ check_and_init_map_value(&htab->map, dst_val + off);
off += size;
}
} else {
value = l->key + roundup_key_size;
+ if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) {
+ struct bpf_map **inner_map = value;
+
+ /* Actual value is the id of the inner map */
+ map_id = map->ops->map_fd_sys_lookup_elem(*inner_map);
+ value = &map_id;
+ }
+
if (elem_map_flags & BPF_F_LOCK)
copy_map_value_locked(map, dst_val, value,
true);
else
copy_map_value(map, dst_val, value);
- check_and_init_map_lock(map, dst_val);
+ /* Zeroing special fields in the temp buffer */
+ check_and_init_map_value(map, dst_val);
}
if (do_delete) {
hlist_nulls_del_rcu(&l->hash_node);
@@ -1672,7 +1836,7 @@ again_nocopy:
while (node_to_free) {
l = node_to_free;
node_to_free = node_to_free->batch_flink;
- bpf_lru_push_free(&htab->lru, &l->lru_node);
+ htab_lru_push_free(htab, l);
}
next_batch:
@@ -1899,9 +2063,9 @@ static int __bpf_hash_map_seq_show(struct seq_file *seq, struct htab_elem *elem)
roundup_value_size = round_up(map->value_size, 8);
pptr = htab_elem_get_ptr(elem, map->key_size);
for_each_possible_cpu(cpu) {
- bpf_long_memcpy(info->percpu_value_buf + off,
- per_cpu_ptr(pptr, cpu),
- roundup_value_size);
+ copy_map_value_long(map, info->percpu_value_buf + off,
+ per_cpu_ptr(pptr, cpu));
+ check_and_init_map_value(map, info->percpu_value_buf + off);
off += roundup_value_size;
}
ctx.value = info->percpu_value_buf;
@@ -1944,6 +2108,7 @@ static int bpf_iter_init_hash_map(void *priv_data,
seq_info->percpu_value_buf = value_buf;
}
+ bpf_map_inc_with_uref(map);
seq_info->map = map;
seq_info->htab = container_of(map, struct bpf_htab, map);
return 0;
@@ -1953,6 +2118,7 @@ static void bpf_iter_fini_hash_map(void *priv_data)
{
struct bpf_iter_seq_hash_map_info *seq_info = priv_data;
+ bpf_map_put_with_uref(seq_info->map);
kfree(seq_info->percpu_value_buf);
}
@@ -1970,8 +2136,8 @@ static const struct bpf_iter_seq_info iter_seq_info = {
.seq_priv_size = sizeof(struct bpf_iter_seq_hash_map_info),
};
-static int bpf_for_each_hash_elem(struct bpf_map *map, void *callback_fn,
- void *callback_ctx, u64 flags)
+static long bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_fn,
+ void *callback_ctx, u64 flags)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
struct hlist_nulls_head *head;
@@ -2010,9 +2176,8 @@ static int bpf_for_each_hash_elem(struct bpf_map *map, void *callback_fn,
val = elem->key + roundup_key_size;
}
num_elems++;
- ret = BPF_CAST_CALL(callback_fn)((u64)(long)map,
- (u64)(long)key, (u64)(long)val,
- (u64)(long)callback_ctx, 0);
+ ret = callback_fn((u64)(long)map, (u64)(long)key,
+ (u64)(long)val, (u64)(long)callback_ctx, 0);
/* return value: 0 - continue, 1 - stop and return */
if (ret) {
rcu_read_unlock();
@@ -2027,13 +2192,52 @@ out:
return num_elems;
}
-static int htab_map_btf_id;
+static u64 htab_map_mem_usage(const struct bpf_map *map)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ u32 value_size = round_up(htab->map.value_size, 8);
+ bool prealloc = htab_is_prealloc(htab);
+ bool percpu = htab_is_percpu(htab);
+ bool lru = htab_is_lru(htab);
+ u64 num_entries;
+ u64 usage = sizeof(struct bpf_htab);
+
+ usage += sizeof(struct bucket) * htab->n_buckets;
+ usage += sizeof(int) * num_possible_cpus() * HASHTAB_MAP_LOCK_COUNT;
+ if (prealloc) {
+ num_entries = map->max_entries;
+ if (htab_has_extra_elems(htab))
+ num_entries += num_possible_cpus();
+
+ usage += htab->elem_size * num_entries;
+
+ if (percpu)
+ usage += value_size * num_possible_cpus() * num_entries;
+ else if (!lru)
+ usage += sizeof(struct htab_elem *) * num_possible_cpus();
+ } else {
+#define LLIST_NODE_SZ sizeof(struct llist_node)
+
+ num_entries = htab->use_percpu_counter ?
+ percpu_counter_sum(&htab->pcount) :
+ atomic_read(&htab->count);
+ usage += (htab->elem_size + LLIST_NODE_SZ) * num_entries;
+ if (percpu) {
+ usage += (LLIST_NODE_SZ + sizeof(void *)) * num_entries;
+ usage += value_size * num_possible_cpus() * num_entries;
+ }
+ }
+ return usage;
+}
+
+BTF_ID_LIST_SINGLE(htab_map_btf_ids, struct, bpf_htab)
const struct bpf_map_ops htab_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
.map_alloc_check = htab_map_alloc_check,
.map_alloc = htab_map_alloc,
.map_free = htab_map_free,
.map_get_next_key = htab_map_get_next_key,
+ .map_release_uref = htab_map_free_timers,
.map_lookup_elem = htab_map_lookup_elem,
.map_lookup_and_delete_elem = htab_map_lookup_and_delete_elem,
.map_update_elem = htab_map_update_elem,
@@ -2042,19 +2246,19 @@ const struct bpf_map_ops htab_map_ops = {
.map_seq_show_elem = htab_map_seq_show_elem,
.map_set_for_each_callback_args = map_set_for_each_callback_args,
.map_for_each_callback = bpf_for_each_hash_elem,
+ .map_mem_usage = htab_map_mem_usage,
BATCH_OPS(htab),
- .map_btf_name = "bpf_htab",
- .map_btf_id = &htab_map_btf_id,
+ .map_btf_id = &htab_map_btf_ids[0],
.iter_seq_info = &iter_seq_info,
};
-static int htab_lru_map_btf_id;
const struct bpf_map_ops htab_lru_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
.map_alloc_check = htab_map_alloc_check,
.map_alloc = htab_map_alloc,
.map_free = htab_map_free,
.map_get_next_key = htab_map_get_next_key,
+ .map_release_uref = htab_map_free_timers,
.map_lookup_elem = htab_lru_map_lookup_elem,
.map_lookup_and_delete_elem = htab_lru_map_lookup_and_delete_elem,
.map_lookup_elem_sys_only = htab_lru_map_lookup_elem_sys,
@@ -2064,9 +2268,9 @@ const struct bpf_map_ops htab_lru_map_ops = {
.map_seq_show_elem = htab_map_seq_show_elem,
.map_set_for_each_callback_args = map_set_for_each_callback_args,
.map_for_each_callback = bpf_for_each_hash_elem,
+ .map_mem_usage = htab_map_mem_usage,
BATCH_OPS(htab_lru),
- .map_btf_name = "bpf_htab",
- .map_btf_id = &htab_lru_map_btf_id,
+ .map_btf_id = &htab_map_btf_ids[0],
.iter_seq_info = &iter_seq_info,
};
@@ -2081,6 +2285,20 @@ static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key)
return NULL;
}
+static void *htab_percpu_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu)
+{
+ struct htab_elem *l;
+
+ if (cpu >= nr_cpu_ids)
+ return NULL;
+
+ l = __htab_map_lookup_elem(map, key);
+ if (l)
+ return per_cpu_ptr(htab_elem_get_ptr(l, map->key_size), cpu);
+ else
+ return NULL;
+}
+
static void *htab_lru_percpu_map_lookup_elem(struct bpf_map *map, void *key)
{
struct htab_elem *l = __htab_map_lookup_elem(map, key);
@@ -2093,6 +2311,22 @@ static void *htab_lru_percpu_map_lookup_elem(struct bpf_map *map, void *key)
return NULL;
}
+static void *htab_lru_percpu_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu)
+{
+ struct htab_elem *l;
+
+ if (cpu >= nr_cpu_ids)
+ return NULL;
+
+ l = __htab_map_lookup_elem(map, key);
+ if (l) {
+ bpf_lru_node_set_ref(&l->lru_node);
+ return per_cpu_ptr(htab_elem_get_ptr(l, map->key_size), cpu);
+ }
+
+ return NULL;
+}
+
int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value)
{
struct htab_elem *l;
@@ -2115,8 +2349,8 @@ int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value)
*/
pptr = htab_elem_get_ptr(l, map->key_size);
for_each_possible_cpu(cpu) {
- bpf_long_memcpy(value + off,
- per_cpu_ptr(pptr, cpu), size);
+ copy_map_value_long(map, value + off, per_cpu_ptr(pptr, cpu));
+ check_and_init_map_value(map, value + off);
off += size;
}
ret = 0;
@@ -2172,7 +2406,6 @@ static void htab_percpu_map_seq_show_elem(struct bpf_map *map, void *key,
rcu_read_unlock();
}
-static int htab_percpu_map_btf_id;
const struct bpf_map_ops htab_percpu_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
.map_alloc_check = htab_map_alloc_check,
@@ -2183,16 +2416,16 @@ const struct bpf_map_ops htab_percpu_map_ops = {
.map_lookup_and_delete_elem = htab_percpu_map_lookup_and_delete_elem,
.map_update_elem = htab_percpu_map_update_elem,
.map_delete_elem = htab_map_delete_elem,
+ .map_lookup_percpu_elem = htab_percpu_map_lookup_percpu_elem,
.map_seq_show_elem = htab_percpu_map_seq_show_elem,
.map_set_for_each_callback_args = map_set_for_each_callback_args,
.map_for_each_callback = bpf_for_each_hash_elem,
+ .map_mem_usage = htab_map_mem_usage,
BATCH_OPS(htab_percpu),
- .map_btf_name = "bpf_htab",
- .map_btf_id = &htab_percpu_map_btf_id,
+ .map_btf_id = &htab_map_btf_ids[0],
.iter_seq_info = &iter_seq_info,
};
-static int htab_lru_percpu_map_btf_id;
const struct bpf_map_ops htab_lru_percpu_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
.map_alloc_check = htab_map_alloc_check,
@@ -2203,12 +2436,13 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = {
.map_lookup_and_delete_elem = htab_lru_percpu_map_lookup_and_delete_elem,
.map_update_elem = htab_lru_percpu_map_update_elem,
.map_delete_elem = htab_lru_map_delete_elem,
+ .map_lookup_percpu_elem = htab_lru_percpu_map_lookup_percpu_elem,
.map_seq_show_elem = htab_percpu_map_seq_show_elem,
.map_set_for_each_callback_args = map_set_for_each_callback_args,
.map_for_each_callback = bpf_for_each_hash_elem,
+ .map_mem_usage = htab_map_mem_usage,
BATCH_OPS(htab_lru_percpu),
- .map_btf_name = "bpf_htab",
- .map_btf_id = &htab_lru_percpu_map_btf_id,
+ .map_btf_id = &htab_map_btf_ids[0],
.iter_seq_info = &iter_seq_info,
};
@@ -2316,7 +2550,7 @@ static int htab_of_map_gen_lookup(struct bpf_map *map,
BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
(void *(*)(struct bpf_map *map, void *key))NULL));
- *insn++ = BPF_EMIT_CALL(BPF_CAST_CALL(__htab_map_lookup_elem));
+ *insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem);
*insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 2);
*insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
offsetof(struct htab_elem, key) +
@@ -2332,7 +2566,6 @@ static void htab_of_map_free(struct bpf_map *map)
fd_htab_map_free(map);
}
-static int htab_of_maps_map_btf_id;
const struct bpf_map_ops htab_of_maps_map_ops = {
.map_alloc_check = fd_htab_map_alloc_check,
.map_alloc = htab_of_map_alloc,
@@ -2345,6 +2578,7 @@ const struct bpf_map_ops htab_of_maps_map_ops = {
.map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem,
.map_gen_lookup = htab_of_map_gen_lookup,
.map_check_btf = map_check_no_btf,
- .map_btf_name = "bpf_htab",
- .map_btf_id = &htab_of_maps_map_btf_id,
+ .map_mem_usage = htab_map_mem_usage,
+ BATCH_OPS(htab),
+ .map_btf_id = &htab_map_btf_ids[0],
};
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 62cf00383910..8d368fa353f9 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -2,6 +2,9 @@
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
*/
#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/bpf-cgroup.h>
+#include <linux/cgroup.h>
#include <linux/rcupdate.h>
#include <linux/random.h>
#include <linux/smp.h>
@@ -13,8 +16,12 @@
#include <linux/ctype.h>
#include <linux/jiffies.h>
#include <linux/pid_namespace.h>
+#include <linux/poison.h>
#include <linux/proc_ns.h>
+#include <linux/sched/task.h>
#include <linux/security.h>
+#include <linux/btf_ids.h>
+#include <linux/bpf_mem_alloc.h>
#include "../../lib/kstrtox.h"
@@ -100,7 +107,7 @@ const struct bpf_func_proto bpf_map_pop_elem_proto = {
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_CONST_MAP_PTR,
- .arg2_type = ARG_PTR_TO_UNINIT_MAP_VALUE,
+ .arg2_type = ARG_PTR_TO_MAP_VALUE | MEM_UNINIT,
};
BPF_CALL_2(bpf_map_peek_elem, struct bpf_map *, map, void *, value)
@@ -113,7 +120,23 @@ const struct bpf_func_proto bpf_map_peek_elem_proto = {
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_CONST_MAP_PTR,
- .arg2_type = ARG_PTR_TO_UNINIT_MAP_VALUE,
+ .arg2_type = ARG_PTR_TO_MAP_VALUE | MEM_UNINIT,
+};
+
+BPF_CALL_3(bpf_map_lookup_percpu_elem, struct bpf_map *, map, void *, key, u32, cpu)
+{
+ WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held());
+ return (unsigned long) map->ops->map_lookup_percpu_elem(map, key, cpu);
+}
+
+const struct bpf_func_proto bpf_map_lookup_percpu_elem_proto = {
+ .func = bpf_map_lookup_percpu_elem,
+ .gpl_only = false,
+ .pkt_access = true,
+ .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_MAP_KEY,
+ .arg3_type = ARG_ANYTHING,
};
const struct bpf_func_proto bpf_get_prandom_u32_proto = {
@@ -179,6 +202,18 @@ const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto = {
.ret_type = RET_INTEGER,
};
+BPF_CALL_0(bpf_ktime_get_tai_ns)
+{
+ /* NMI safe access to clock tai */
+ return ktime_get_tai_fast_ns();
+}
+
+const struct bpf_func_proto bpf_ktime_get_tai_ns_proto = {
+ .func = bpf_ktime_get_tai_ns,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+};
+
BPF_CALL_0(bpf_get_current_pid_tgid)
{
struct task_struct *task = current;
@@ -222,13 +257,8 @@ BPF_CALL_2(bpf_get_current_comm, char *, buf, u32, size)
if (unlikely(!task))
goto err_clear;
- strncpy(buf, task->comm, size);
-
- /* Verifier guarantees that size > 0. For task->comm exceeding
- * size, guarantee that buf is %NUL-terminated. Unconditionally
- * done here to save the size test.
- */
- buf[size - 1] = 0;
+ /* Verifier guarantees that size > 0 */
+ strscpy_pad(buf, task->comm, size);
return 0;
err_clear:
memset(buf, 0, size);
@@ -289,13 +319,18 @@ static inline void __bpf_spin_unlock(struct bpf_spin_lock *lock)
static DEFINE_PER_CPU(unsigned long, irqsave_flags);
-notrace BPF_CALL_1(bpf_spin_lock, struct bpf_spin_lock *, lock)
+static inline void __bpf_spin_lock_irqsave(struct bpf_spin_lock *lock)
{
unsigned long flags;
local_irq_save(flags);
__bpf_spin_lock(lock);
__this_cpu_write(irqsave_flags, flags);
+}
+
+notrace BPF_CALL_1(bpf_spin_lock, struct bpf_spin_lock *, lock)
+{
+ __bpf_spin_lock_irqsave(lock);
return 0;
}
@@ -304,15 +339,21 @@ const struct bpf_func_proto bpf_spin_lock_proto = {
.gpl_only = false,
.ret_type = RET_VOID,
.arg1_type = ARG_PTR_TO_SPIN_LOCK,
+ .arg1_btf_id = BPF_PTR_POISON,
};
-notrace BPF_CALL_1(bpf_spin_unlock, struct bpf_spin_lock *, lock)
+static inline void __bpf_spin_unlock_irqrestore(struct bpf_spin_lock *lock)
{
unsigned long flags;
flags = __this_cpu_read(irqsave_flags);
__bpf_spin_unlock(lock);
local_irq_restore(flags);
+}
+
+notrace BPF_CALL_1(bpf_spin_unlock, struct bpf_spin_lock *, lock)
+{
+ __bpf_spin_unlock_irqrestore(lock);
return 0;
}
@@ -321,6 +362,7 @@ const struct bpf_func_proto bpf_spin_unlock_proto = {
.gpl_only = false,
.ret_type = RET_VOID,
.arg1_type = ARG_PTR_TO_SPIN_LOCK,
+ .arg1_btf_id = BPF_PTR_POISON,
};
void copy_map_value_locked(struct bpf_map *map, void *dst, void *src,
@@ -329,13 +371,13 @@ void copy_map_value_locked(struct bpf_map *map, void *dst, void *src,
struct bpf_spin_lock *lock;
if (lock_src)
- lock = src + map->spin_lock_off;
+ lock = src + map->record->spin_lock_off;
else
- lock = dst + map->spin_lock_off;
+ lock = dst + map->record->spin_lock_off;
preempt_disable();
- ____bpf_spin_lock(lock);
+ __bpf_spin_lock_irqsave(lock);
copy_map_value(map, dst, src);
- ____bpf_spin_unlock(lock);
+ __bpf_spin_unlock_irqrestore(lock);
preempt_enable();
}
@@ -353,9 +395,15 @@ const struct bpf_func_proto bpf_jiffies64_proto = {
#ifdef CONFIG_CGROUPS
BPF_CALL_0(bpf_get_current_cgroup_id)
{
- struct cgroup *cgrp = task_dfl_cgroup(current);
+ struct cgroup *cgrp;
+ u64 cgrp_id;
- return cgroup_id(cgrp);
+ rcu_read_lock();
+ cgrp = task_dfl_cgroup(current);
+ cgrp_id = cgroup_id(cgrp);
+ rcu_read_unlock();
+
+ return cgrp_id;
}
const struct bpf_func_proto bpf_get_current_cgroup_id_proto = {
@@ -366,13 +414,17 @@ const struct bpf_func_proto bpf_get_current_cgroup_id_proto = {
BPF_CALL_1(bpf_get_current_ancestor_cgroup_id, int, ancestor_level)
{
- struct cgroup *cgrp = task_dfl_cgroup(current);
+ struct cgroup *cgrp;
struct cgroup *ancestor;
+ u64 cgrp_id;
+ rcu_read_lock();
+ cgrp = task_dfl_cgroup(current);
ancestor = cgroup_ancestor(cgrp, ancestor_level);
- if (!ancestor)
- return 0;
- return cgroup_id(ancestor);
+ cgrp_id = ancestor ? cgroup_id(ancestor) : 0;
+ rcu_read_unlock();
+
+ return cgrp_id;
}
const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto = {
@@ -381,46 +433,7 @@ const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto = {
.ret_type = RET_INTEGER,
.arg1_type = ARG_ANYTHING,
};
-
-#ifdef CONFIG_CGROUP_BPF
-DECLARE_PER_CPU(struct bpf_cgroup_storage_info,
- bpf_cgroup_storage_info[BPF_CGROUP_STORAGE_NEST_MAX]);
-
-BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags)
-{
- /* flags argument is not used now,
- * but provides an ability to extend the API.
- * verifier checks that its value is correct.
- */
- enum bpf_cgroup_storage_type stype = cgroup_storage_type(map);
- struct bpf_cgroup_storage *storage = NULL;
- void *ptr;
- int i;
-
- for (i = 0; i < BPF_CGROUP_STORAGE_NEST_MAX; i++) {
- if (unlikely(this_cpu_read(bpf_cgroup_storage_info[i].task) != current))
- continue;
-
- storage = this_cpu_read(bpf_cgroup_storage_info[i].storage[stype]);
- break;
- }
-
- if (stype == BPF_CGROUP_STORAGE_SHARED)
- ptr = &READ_ONCE(storage->buf)->data[0];
- else
- ptr = this_cpu_ptr(storage->percpu_buf);
-
- return (unsigned long)ptr;
-}
-
-const struct bpf_func_proto bpf_get_local_storage_proto = {
- .func = bpf_get_local_storage,
- .gpl_only = false,
- .ret_type = RET_PTR_TO_MAP_VALUE,
- .arg1_type = ARG_CONST_MAP_PTR,
- .arg2_type = ARG_ANYTHING,
-};
-#endif
+#endif /* CONFIG_CGROUPS */
#define BPF_STRTOX_BASE_MASK 0x1F
@@ -516,7 +529,7 @@ const struct bpf_func_proto bpf_strtol_proto = {
.func = bpf_strtol,
.gpl_only = false,
.ret_type = RET_INTEGER,
- .arg1_type = ARG_PTR_TO_MEM,
+ .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg2_type = ARG_CONST_SIZE,
.arg3_type = ARG_ANYTHING,
.arg4_type = ARG_PTR_TO_LONG,
@@ -544,12 +557,25 @@ const struct bpf_func_proto bpf_strtoul_proto = {
.func = bpf_strtoul,
.gpl_only = false,
.ret_type = RET_INTEGER,
- .arg1_type = ARG_PTR_TO_MEM,
+ .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg2_type = ARG_CONST_SIZE,
.arg3_type = ARG_ANYTHING,
.arg4_type = ARG_PTR_TO_LONG,
};
-#endif
+
+BPF_CALL_3(bpf_strncmp, const char *, s1, u32, s1_sz, const char *, s2)
+{
+ return strncmp(s1, s2, s1_sz);
+}
+
+static const struct bpf_func_proto bpf_strncmp_proto = {
+ .func = bpf_strncmp,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg2_type = ARG_CONST_SIZE,
+ .arg3_type = ARG_PTR_TO_CONST_STR,
+};
BPF_CALL_4(bpf_get_ns_current_pid_tgid, u64, dev, u64, ino,
struct bpf_pidns_info *, nsdata, u32, size)
@@ -616,7 +642,7 @@ const struct bpf_func_proto bpf_event_output_data_proto = {
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_CONST_MAP_PTR,
.arg3_type = ARG_ANYTHING,
- .arg4_type = ARG_PTR_TO_MEM,
+ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg5_type = ARG_CONST_SIZE_OR_ZERO,
};
@@ -636,10 +662,45 @@ BPF_CALL_3(bpf_copy_from_user, void *, dst, u32, size,
const struct bpf_func_proto bpf_copy_from_user_proto = {
.func = bpf_copy_from_user,
.gpl_only = false,
+ .might_sleep = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg2_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg3_type = ARG_ANYTHING,
+};
+
+BPF_CALL_5(bpf_copy_from_user_task, void *, dst, u32, size,
+ const void __user *, user_ptr, struct task_struct *, tsk, u64, flags)
+{
+ int ret;
+
+ /* flags is not used yet */
+ if (unlikely(flags))
+ return -EINVAL;
+
+ if (unlikely(!size))
+ return 0;
+
+ ret = access_process_vm(tsk, (unsigned long)user_ptr, dst, size, 0);
+ if (ret == size)
+ return 0;
+
+ memset(dst, 0, size);
+ /* Return -EFAULT for partial read */
+ return ret < 0 ? ret : -EFAULT;
+}
+
+const struct bpf_func_proto bpf_copy_from_user_task_proto = {
+ .func = bpf_copy_from_user_task,
+ .gpl_only = true,
+ .might_sleep = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_UNINIT_MEM,
.arg2_type = ARG_CONST_SIZE_OR_ZERO,
.arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_BTF_ID,
+ .arg4_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
+ .arg5_type = ARG_ANYTHING
};
BPF_CALL_2(bpf_per_cpu_ptr, const void *, ptr, u32, cpu)
@@ -653,7 +714,7 @@ BPF_CALL_2(bpf_per_cpu_ptr, const void *, ptr, u32, cpu)
const struct bpf_func_proto bpf_per_cpu_ptr_proto = {
.func = bpf_per_cpu_ptr,
.gpl_only = false,
- .ret_type = RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL,
+ .ret_type = RET_PTR_TO_MEM_OR_BTF_ID | PTR_MAYBE_NULL | MEM_RDONLY,
.arg1_type = ARG_PTR_TO_PERCPU_BTF_ID,
.arg2_type = ARG_ANYTHING,
};
@@ -666,7 +727,7 @@ BPF_CALL_1(bpf_this_cpu_ptr, const void *, percpu_ptr)
const struct bpf_func_proto bpf_this_cpu_ptr_proto = {
.func = bpf_this_cpu_ptr,
.gpl_only = false,
- .ret_type = RET_PTR_TO_MEM_OR_BTF_ID,
+ .ret_type = RET_PTR_TO_MEM_OR_BTF_ID | MEM_RDONLY,
.arg1_type = ARG_PTR_TO_PERCPU_BTF_ID,
};
@@ -696,19 +757,20 @@ static int bpf_trace_copy_string(char *buf, void *unsafe_ptr, char fmt_ptype,
/* Per-cpu temp buffers used by printf-like helpers to store the bprintf binary
* arguments representation.
*/
-#define MAX_BPRINTF_BUF_LEN 512
+#define MAX_BPRINTF_BIN_ARGS 512
/* Support executing three nested bprintf helper calls on a given CPU */
#define MAX_BPRINTF_NEST_LEVEL 3
struct bpf_bprintf_buffers {
- char tmp_bufs[MAX_BPRINTF_NEST_LEVEL][MAX_BPRINTF_BUF_LEN];
+ char bin_args[MAX_BPRINTF_BIN_ARGS];
+ char buf[MAX_BPRINTF_BUF];
};
-static DEFINE_PER_CPU(struct bpf_bprintf_buffers, bpf_bprintf_bufs);
+
+static DEFINE_PER_CPU(struct bpf_bprintf_buffers[MAX_BPRINTF_NEST_LEVEL], bpf_bprintf_bufs);
static DEFINE_PER_CPU(int, bpf_bprintf_nest_level);
-static int try_get_fmt_tmp_buf(char **tmp_buf)
+static int try_get_buffers(struct bpf_bprintf_buffers **bufs)
{
- struct bpf_bprintf_buffers *bufs;
int nest_level;
preempt_disable();
@@ -718,18 +780,19 @@ static int try_get_fmt_tmp_buf(char **tmp_buf)
preempt_enable();
return -EBUSY;
}
- bufs = this_cpu_ptr(&bpf_bprintf_bufs);
- *tmp_buf = bufs->tmp_bufs[nest_level - 1];
+ *bufs = this_cpu_ptr(&bpf_bprintf_bufs[nest_level - 1]);
return 0;
}
-void bpf_bprintf_cleanup(void)
+void bpf_bprintf_cleanup(struct bpf_bprintf_data *data)
{
- if (this_cpu_read(bpf_bprintf_nest_level)) {
- this_cpu_dec(bpf_bprintf_nest_level);
- preempt_enable();
- }
+ if (!data->bin_args && !data->buf)
+ return;
+ if (WARN_ON_ONCE(this_cpu_read(bpf_bprintf_nest_level) == 0))
+ return;
+ this_cpu_dec(bpf_bprintf_nest_level);
+ preempt_enable();
}
/*
@@ -738,18 +801,20 @@ void bpf_bprintf_cleanup(void)
* Returns a negative value if fmt is an invalid format string or 0 otherwise.
*
* This can be used in two ways:
- * - Format string verification only: when bin_args is NULL
+ * - Format string verification only: when data->get_bin_args is false
* - Arguments preparation: in addition to the above verification, it writes in
- * bin_args a binary representation of arguments usable by bstr_printf where
- * pointers from BPF have been sanitized.
+ * data->bin_args a binary representation of arguments usable by bstr_printf
+ * where pointers from BPF have been sanitized.
*
* In argument preparation mode, if 0 is returned, safe temporary buffers are
* allocated and bpf_bprintf_cleanup should be called to free them after use.
*/
int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args,
- u32 **bin_args, u32 num_args)
+ u32 num_args, struct bpf_bprintf_data *data)
{
+ bool get_buffers = (data->get_bin_args && num_args) || data->get_buf;
char *unsafe_ptr = NULL, *tmp_buf = NULL, *tmp_buf_end, *fmt_end;
+ struct bpf_bprintf_buffers *buffers = NULL;
size_t sizeof_cur_arg, sizeof_cur_ip;
int err, i, num_spec = 0;
u64 cur_arg;
@@ -760,14 +825,19 @@ int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args,
return -EINVAL;
fmt_size = fmt_end - fmt;
- if (bin_args) {
- if (num_args && try_get_fmt_tmp_buf(&tmp_buf))
- return -EBUSY;
+ if (get_buffers && try_get_buffers(&buffers))
+ return -EBUSY;
- tmp_buf_end = tmp_buf + MAX_BPRINTF_BUF_LEN;
- *bin_args = (u32 *)tmp_buf;
+ if (data->get_bin_args) {
+ if (num_args)
+ tmp_buf = buffers->bin_args;
+ tmp_buf_end = tmp_buf + MAX_BPRINTF_BIN_ARGS;
+ data->bin_args = (u32 *)tmp_buf;
}
+ if (data->get_buf)
+ data->buf = buffers->buf;
+
for (i = 0; i < fmt_size; i++) {
if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i])) {
err = -EINVAL;
@@ -904,6 +974,20 @@ fmt_str:
num_spec++;
continue;
+ } else if (fmt[i] == 'c') {
+ if (!tmp_buf)
+ goto nocopy_fmt;
+
+ if (tmp_buf_end == tmp_buf) {
+ err = -ENOSPC;
+ goto out;
+ }
+
+ *tmp_buf = raw_args[num_spec];
+ tmp_buf++;
+ num_spec++;
+
+ continue;
}
sizeof_cur_arg = sizeof(int);
@@ -947,33 +1031,33 @@ nocopy_fmt:
err = 0;
out:
if (err)
- bpf_bprintf_cleanup();
+ bpf_bprintf_cleanup(data);
return err;
}
-#define MAX_SNPRINTF_VARARGS 12
-
BPF_CALL_5(bpf_snprintf, char *, str, u32, str_size, char *, fmt,
- const void *, data, u32, data_len)
+ const void *, args, u32, data_len)
{
+ struct bpf_bprintf_data data = {
+ .get_bin_args = true,
+ };
int err, num_args;
- u32 *bin_args;
- if (data_len % 8 || data_len > MAX_SNPRINTF_VARARGS * 8 ||
- (data_len && !data))
+ if (data_len % 8 || data_len > MAX_BPRINTF_VARARGS * 8 ||
+ (data_len && !args))
return -EINVAL;
num_args = data_len / 8;
/* ARG_PTR_TO_CONST_STR guarantees that fmt is zero-terminated so we
* can safely give an unbounded size.
*/
- err = bpf_bprintf_prepare(fmt, UINT_MAX, data, &bin_args, num_args);
+ err = bpf_bprintf_prepare(fmt, UINT_MAX, args, num_args, &data);
if (err < 0)
return err;
- err = bstr_printf(str, str_size, fmt, bin_args);
+ err = bstr_printf(str, str_size, fmt, data.bin_args);
- bpf_bprintf_cleanup();
+ bpf_bprintf_cleanup(&data);
return err + 1;
}
@@ -985,15 +1069,591 @@ const struct bpf_func_proto bpf_snprintf_proto = {
.arg1_type = ARG_PTR_TO_MEM_OR_NULL,
.arg2_type = ARG_CONST_SIZE_OR_ZERO,
.arg3_type = ARG_PTR_TO_CONST_STR,
- .arg4_type = ARG_PTR_TO_MEM_OR_NULL,
+ .arg4_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY,
.arg5_type = ARG_CONST_SIZE_OR_ZERO,
};
+/* BPF map elements can contain 'struct bpf_timer'.
+ * Such map owns all of its BPF timers.
+ * 'struct bpf_timer' is allocated as part of map element allocation
+ * and it's zero initialized.
+ * That space is used to keep 'struct bpf_timer_kern'.
+ * bpf_timer_init() allocates 'struct bpf_hrtimer', inits hrtimer, and
+ * remembers 'struct bpf_map *' pointer it's part of.
+ * bpf_timer_set_callback() increments prog refcnt and assign bpf callback_fn.
+ * bpf_timer_start() arms the timer.
+ * If user space reference to a map goes to zero at this point
+ * ops->map_release_uref callback is responsible for cancelling the timers,
+ * freeing their memory, and decrementing prog's refcnts.
+ * bpf_timer_cancel() cancels the timer and decrements prog's refcnt.
+ * Inner maps can contain bpf timers as well. ops->map_release_uref is
+ * freeing the timers when inner map is replaced or deleted by user space.
+ */
+struct bpf_hrtimer {
+ struct hrtimer timer;
+ struct bpf_map *map;
+ struct bpf_prog *prog;
+ void __rcu *callback_fn;
+ void *value;
+};
+
+/* the actual struct hidden inside uapi struct bpf_timer */
+struct bpf_timer_kern {
+ struct bpf_hrtimer *timer;
+ /* bpf_spin_lock is used here instead of spinlock_t to make
+ * sure that it always fits into space reserved by struct bpf_timer
+ * regardless of LOCKDEP and spinlock debug flags.
+ */
+ struct bpf_spin_lock lock;
+} __attribute__((aligned(8)));
+
+static DEFINE_PER_CPU(struct bpf_hrtimer *, hrtimer_running);
+
+static enum hrtimer_restart bpf_timer_cb(struct hrtimer *hrtimer)
+{
+ struct bpf_hrtimer *t = container_of(hrtimer, struct bpf_hrtimer, timer);
+ struct bpf_map *map = t->map;
+ void *value = t->value;
+ bpf_callback_t callback_fn;
+ void *key;
+ u32 idx;
+
+ BTF_TYPE_EMIT(struct bpf_timer);
+ callback_fn = rcu_dereference_check(t->callback_fn, rcu_read_lock_bh_held());
+ if (!callback_fn)
+ goto out;
+
+ /* bpf_timer_cb() runs in hrtimer_run_softirq. It doesn't migrate and
+ * cannot be preempted by another bpf_timer_cb() on the same cpu.
+ * Remember the timer this callback is servicing to prevent
+ * deadlock if callback_fn() calls bpf_timer_cancel() or
+ * bpf_map_delete_elem() on the same timer.
+ */
+ this_cpu_write(hrtimer_running, t);
+ if (map->map_type == BPF_MAP_TYPE_ARRAY) {
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+
+ /* compute the key */
+ idx = ((char *)value - array->value) / array->elem_size;
+ key = &idx;
+ } else { /* hash or lru */
+ key = value - round_up(map->key_size, 8);
+ }
+
+ callback_fn((u64)(long)map, (u64)(long)key, (u64)(long)value, 0, 0);
+ /* The verifier checked that return value is zero. */
+
+ this_cpu_write(hrtimer_running, NULL);
+out:
+ return HRTIMER_NORESTART;
+}
+
+BPF_CALL_3(bpf_timer_init, struct bpf_timer_kern *, timer, struct bpf_map *, map,
+ u64, flags)
+{
+ clockid_t clockid = flags & (MAX_CLOCKS - 1);
+ struct bpf_hrtimer *t;
+ int ret = 0;
+
+ BUILD_BUG_ON(MAX_CLOCKS != 16);
+ BUILD_BUG_ON(sizeof(struct bpf_timer_kern) > sizeof(struct bpf_timer));
+ BUILD_BUG_ON(__alignof__(struct bpf_timer_kern) != __alignof__(struct bpf_timer));
+
+ if (in_nmi())
+ return -EOPNOTSUPP;
+
+ if (flags >= MAX_CLOCKS ||
+ /* similar to timerfd except _ALARM variants are not supported */
+ (clockid != CLOCK_MONOTONIC &&
+ clockid != CLOCK_REALTIME &&
+ clockid != CLOCK_BOOTTIME))
+ return -EINVAL;
+ __bpf_spin_lock_irqsave(&timer->lock);
+ t = timer->timer;
+ if (t) {
+ ret = -EBUSY;
+ goto out;
+ }
+ if (!atomic64_read(&map->usercnt)) {
+ /* maps with timers must be either held by user space
+ * or pinned in bpffs.
+ */
+ ret = -EPERM;
+ goto out;
+ }
+ /* allocate hrtimer via map_kmalloc to use memcg accounting */
+ t = bpf_map_kmalloc_node(map, sizeof(*t), GFP_ATOMIC, map->numa_node);
+ if (!t) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ t->value = (void *)timer - map->record->timer_off;
+ t->map = map;
+ t->prog = NULL;
+ rcu_assign_pointer(t->callback_fn, NULL);
+ hrtimer_init(&t->timer, clockid, HRTIMER_MODE_REL_SOFT);
+ t->timer.function = bpf_timer_cb;
+ timer->timer = t;
+out:
+ __bpf_spin_unlock_irqrestore(&timer->lock);
+ return ret;
+}
+
+static const struct bpf_func_proto bpf_timer_init_proto = {
+ .func = bpf_timer_init,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_TIMER,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_ANYTHING,
+};
+
+BPF_CALL_3(bpf_timer_set_callback, struct bpf_timer_kern *, timer, void *, callback_fn,
+ struct bpf_prog_aux *, aux)
+{
+ struct bpf_prog *prev, *prog = aux->prog;
+ struct bpf_hrtimer *t;
+ int ret = 0;
+
+ if (in_nmi())
+ return -EOPNOTSUPP;
+ __bpf_spin_lock_irqsave(&timer->lock);
+ t = timer->timer;
+ if (!t) {
+ ret = -EINVAL;
+ goto out;
+ }
+ if (!atomic64_read(&t->map->usercnt)) {
+ /* maps with timers must be either held by user space
+ * or pinned in bpffs. Otherwise timer might still be
+ * running even when bpf prog is detached and user space
+ * is gone, since map_release_uref won't ever be called.
+ */
+ ret = -EPERM;
+ goto out;
+ }
+ prev = t->prog;
+ if (prev != prog) {
+ /* Bump prog refcnt once. Every bpf_timer_set_callback()
+ * can pick different callback_fn-s within the same prog.
+ */
+ prog = bpf_prog_inc_not_zero(prog);
+ if (IS_ERR(prog)) {
+ ret = PTR_ERR(prog);
+ goto out;
+ }
+ if (prev)
+ /* Drop prev prog refcnt when swapping with new prog */
+ bpf_prog_put(prev);
+ t->prog = prog;
+ }
+ rcu_assign_pointer(t->callback_fn, callback_fn);
+out:
+ __bpf_spin_unlock_irqrestore(&timer->lock);
+ return ret;
+}
+
+static const struct bpf_func_proto bpf_timer_set_callback_proto = {
+ .func = bpf_timer_set_callback,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_TIMER,
+ .arg2_type = ARG_PTR_TO_FUNC,
+};
+
+BPF_CALL_3(bpf_timer_start, struct bpf_timer_kern *, timer, u64, nsecs, u64, flags)
+{
+ struct bpf_hrtimer *t;
+ int ret = 0;
+ enum hrtimer_mode mode;
+
+ if (in_nmi())
+ return -EOPNOTSUPP;
+ if (flags > BPF_F_TIMER_ABS)
+ return -EINVAL;
+ __bpf_spin_lock_irqsave(&timer->lock);
+ t = timer->timer;
+ if (!t || !t->prog) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (flags & BPF_F_TIMER_ABS)
+ mode = HRTIMER_MODE_ABS_SOFT;
+ else
+ mode = HRTIMER_MODE_REL_SOFT;
+
+ hrtimer_start(&t->timer, ns_to_ktime(nsecs), mode);
+out:
+ __bpf_spin_unlock_irqrestore(&timer->lock);
+ return ret;
+}
+
+static const struct bpf_func_proto bpf_timer_start_proto = {
+ .func = bpf_timer_start,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_TIMER,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+};
+
+static void drop_prog_refcnt(struct bpf_hrtimer *t)
+{
+ struct bpf_prog *prog = t->prog;
+
+ if (prog) {
+ bpf_prog_put(prog);
+ t->prog = NULL;
+ rcu_assign_pointer(t->callback_fn, NULL);
+ }
+}
+
+BPF_CALL_1(bpf_timer_cancel, struct bpf_timer_kern *, timer)
+{
+ struct bpf_hrtimer *t;
+ int ret = 0;
+
+ if (in_nmi())
+ return -EOPNOTSUPP;
+ __bpf_spin_lock_irqsave(&timer->lock);
+ t = timer->timer;
+ if (!t) {
+ ret = -EINVAL;
+ goto out;
+ }
+ if (this_cpu_read(hrtimer_running) == t) {
+ /* If bpf callback_fn is trying to bpf_timer_cancel()
+ * its own timer the hrtimer_cancel() will deadlock
+ * since it waits for callback_fn to finish
+ */
+ ret = -EDEADLK;
+ goto out;
+ }
+ drop_prog_refcnt(t);
+out:
+ __bpf_spin_unlock_irqrestore(&timer->lock);
+ /* Cancel the timer and wait for associated callback to finish
+ * if it was running.
+ */
+ ret = ret ?: hrtimer_cancel(&t->timer);
+ return ret;
+}
+
+static const struct bpf_func_proto bpf_timer_cancel_proto = {
+ .func = bpf_timer_cancel,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_TIMER,
+};
+
+/* This function is called by map_delete/update_elem for individual element and
+ * by ops->map_release_uref when the user space reference to a map reaches zero.
+ */
+void bpf_timer_cancel_and_free(void *val)
+{
+ struct bpf_timer_kern *timer = val;
+ struct bpf_hrtimer *t;
+
+ /* Performance optimization: read timer->timer without lock first. */
+ if (!READ_ONCE(timer->timer))
+ return;
+
+ __bpf_spin_lock_irqsave(&timer->lock);
+ /* re-read it under lock */
+ t = timer->timer;
+ if (!t)
+ goto out;
+ drop_prog_refcnt(t);
+ /* The subsequent bpf_timer_start/cancel() helpers won't be able to use
+ * this timer, since it won't be initialized.
+ */
+ timer->timer = NULL;
+out:
+ __bpf_spin_unlock_irqrestore(&timer->lock);
+ if (!t)
+ return;
+ /* Cancel the timer and wait for callback to complete if it was running.
+ * If hrtimer_cancel() can be safely called it's safe to call kfree(t)
+ * right after for both preallocated and non-preallocated maps.
+ * The timer->timer = NULL was already done and no code path can
+ * see address 't' anymore.
+ *
+ * Check that bpf_map_delete/update_elem() wasn't called from timer
+ * callback_fn. In such case don't call hrtimer_cancel() (since it will
+ * deadlock) and don't call hrtimer_try_to_cancel() (since it will just
+ * return -1). Though callback_fn is still running on this cpu it's
+ * safe to do kfree(t) because bpf_timer_cb() read everything it needed
+ * from 't'. The bpf subprog callback_fn won't be able to access 't',
+ * since timer->timer = NULL was already done. The timer will be
+ * effectively cancelled because bpf_timer_cb() will return
+ * HRTIMER_NORESTART.
+ */
+ if (this_cpu_read(hrtimer_running) != t)
+ hrtimer_cancel(&t->timer);
+ kfree(t);
+}
+
+BPF_CALL_2(bpf_kptr_xchg, void *, map_value, void *, ptr)
+{
+ unsigned long *kptr = map_value;
+
+ return xchg(kptr, (unsigned long)ptr);
+}
+
+/* Unlike other PTR_TO_BTF_ID helpers the btf_id in bpf_kptr_xchg()
+ * helper is determined dynamically by the verifier. Use BPF_PTR_POISON to
+ * denote type that verifier will determine.
+ */
+static const struct bpf_func_proto bpf_kptr_xchg_proto = {
+ .func = bpf_kptr_xchg,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_BTF_ID_OR_NULL,
+ .ret_btf_id = BPF_PTR_POISON,
+ .arg1_type = ARG_PTR_TO_KPTR,
+ .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL | OBJ_RELEASE,
+ .arg2_btf_id = BPF_PTR_POISON,
+};
+
+/* Since the upper 8 bits of dynptr->size is reserved, the
+ * maximum supported size is 2^24 - 1.
+ */
+#define DYNPTR_MAX_SIZE ((1UL << 24) - 1)
+#define DYNPTR_TYPE_SHIFT 28
+#define DYNPTR_SIZE_MASK 0xFFFFFF
+#define DYNPTR_RDONLY_BIT BIT(31)
+
+static bool bpf_dynptr_is_rdonly(const struct bpf_dynptr_kern *ptr)
+{
+ return ptr->size & DYNPTR_RDONLY_BIT;
+}
+
+void bpf_dynptr_set_rdonly(struct bpf_dynptr_kern *ptr)
+{
+ ptr->size |= DYNPTR_RDONLY_BIT;
+}
+
+static void bpf_dynptr_set_type(struct bpf_dynptr_kern *ptr, enum bpf_dynptr_type type)
+{
+ ptr->size |= type << DYNPTR_TYPE_SHIFT;
+}
+
+static enum bpf_dynptr_type bpf_dynptr_get_type(const struct bpf_dynptr_kern *ptr)
+{
+ return (ptr->size & ~(DYNPTR_RDONLY_BIT)) >> DYNPTR_TYPE_SHIFT;
+}
+
+u32 bpf_dynptr_get_size(const struct bpf_dynptr_kern *ptr)
+{
+ return ptr->size & DYNPTR_SIZE_MASK;
+}
+
+int bpf_dynptr_check_size(u32 size)
+{
+ return size > DYNPTR_MAX_SIZE ? -E2BIG : 0;
+}
+
+void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data,
+ enum bpf_dynptr_type type, u32 offset, u32 size)
+{
+ ptr->data = data;
+ ptr->offset = offset;
+ ptr->size = size;
+ bpf_dynptr_set_type(ptr, type);
+}
+
+void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr)
+{
+ memset(ptr, 0, sizeof(*ptr));
+}
+
+static int bpf_dynptr_check_off_len(const struct bpf_dynptr_kern *ptr, u32 offset, u32 len)
+{
+ u32 size = bpf_dynptr_get_size(ptr);
+
+ if (len > size || offset > size - len)
+ return -E2BIG;
+
+ return 0;
+}
+
+BPF_CALL_4(bpf_dynptr_from_mem, void *, data, u32, size, u64, flags, struct bpf_dynptr_kern *, ptr)
+{
+ int err;
+
+ BTF_TYPE_EMIT(struct bpf_dynptr);
+
+ err = bpf_dynptr_check_size(size);
+ if (err)
+ goto error;
+
+ /* flags is currently unsupported */
+ if (flags) {
+ err = -EINVAL;
+ goto error;
+ }
+
+ bpf_dynptr_init(ptr, data, BPF_DYNPTR_TYPE_LOCAL, 0, size);
+
+ return 0;
+
+error:
+ bpf_dynptr_set_null(ptr);
+ return err;
+}
+
+static const struct bpf_func_proto bpf_dynptr_from_mem_proto = {
+ .func = bpf_dynptr_from_mem,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg2_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL | MEM_UNINIT,
+};
+
+BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern *, src,
+ u32, offset, u64, flags)
+{
+ enum bpf_dynptr_type type;
+ int err;
+
+ if (!src->data || flags)
+ return -EINVAL;
+
+ err = bpf_dynptr_check_off_len(src, offset, len);
+ if (err)
+ return err;
+
+ type = bpf_dynptr_get_type(src);
+
+ switch (type) {
+ case BPF_DYNPTR_TYPE_LOCAL:
+ case BPF_DYNPTR_TYPE_RINGBUF:
+ /* Source and destination may possibly overlap, hence use memmove to
+ * copy the data. E.g. bpf_dynptr_from_mem may create two dynptr
+ * pointing to overlapping PTR_TO_MAP_VALUE regions.
+ */
+ memmove(dst, src->data + src->offset + offset, len);
+ return 0;
+ case BPF_DYNPTR_TYPE_SKB:
+ return __bpf_skb_load_bytes(src->data, src->offset + offset, dst, len);
+ case BPF_DYNPTR_TYPE_XDP:
+ return __bpf_xdp_load_bytes(src->data, src->offset + offset, dst, len);
+ default:
+ WARN_ONCE(true, "bpf_dynptr_read: unknown dynptr type %d\n", type);
+ return -EFAULT;
+ }
+}
+
+static const struct bpf_func_proto bpf_dynptr_read_proto = {
+ .func = bpf_dynptr_read,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg2_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg3_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY,
+ .arg4_type = ARG_ANYTHING,
+ .arg5_type = ARG_ANYTHING,
+};
+
+BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u32, offset, void *, src,
+ u32, len, u64, flags)
+{
+ enum bpf_dynptr_type type;
+ int err;
+
+ if (!dst->data || bpf_dynptr_is_rdonly(dst))
+ return -EINVAL;
+
+ err = bpf_dynptr_check_off_len(dst, offset, len);
+ if (err)
+ return err;
+
+ type = bpf_dynptr_get_type(dst);
+
+ switch (type) {
+ case BPF_DYNPTR_TYPE_LOCAL:
+ case BPF_DYNPTR_TYPE_RINGBUF:
+ if (flags)
+ return -EINVAL;
+ /* Source and destination may possibly overlap, hence use memmove to
+ * copy the data. E.g. bpf_dynptr_from_mem may create two dynptr
+ * pointing to overlapping PTR_TO_MAP_VALUE regions.
+ */
+ memmove(dst->data + dst->offset + offset, src, len);
+ return 0;
+ case BPF_DYNPTR_TYPE_SKB:
+ return __bpf_skb_store_bytes(dst->data, dst->offset + offset, src, len,
+ flags);
+ case BPF_DYNPTR_TYPE_XDP:
+ if (flags)
+ return -EINVAL;
+ return __bpf_xdp_store_bytes(dst->data, dst->offset + offset, src, len);
+ default:
+ WARN_ONCE(true, "bpf_dynptr_write: unknown dynptr type %d\n", type);
+ return -EFAULT;
+ }
+}
+
+static const struct bpf_func_proto bpf_dynptr_write_proto = {
+ .func = bpf_dynptr_write,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg4_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg5_type = ARG_ANYTHING,
+};
+
+BPF_CALL_3(bpf_dynptr_data, const struct bpf_dynptr_kern *, ptr, u32, offset, u32, len)
+{
+ enum bpf_dynptr_type type;
+ int err;
+
+ if (!ptr->data)
+ return 0;
+
+ err = bpf_dynptr_check_off_len(ptr, offset, len);
+ if (err)
+ return 0;
+
+ if (bpf_dynptr_is_rdonly(ptr))
+ return 0;
+
+ type = bpf_dynptr_get_type(ptr);
+
+ switch (type) {
+ case BPF_DYNPTR_TYPE_LOCAL:
+ case BPF_DYNPTR_TYPE_RINGBUF:
+ return (unsigned long)(ptr->data + ptr->offset + offset);
+ case BPF_DYNPTR_TYPE_SKB:
+ case BPF_DYNPTR_TYPE_XDP:
+ /* skb and xdp dynptrs should use bpf_dynptr_slice / bpf_dynptr_slice_rdwr */
+ return 0;
+ default:
+ WARN_ONCE(true, "bpf_dynptr_data: unknown dynptr type %d\n", type);
+ return 0;
+ }
+}
+
+static const struct bpf_func_proto bpf_dynptr_data_proto = {
+ .func = bpf_dynptr_data,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_DYNPTR_MEM_OR_NULL,
+ .arg1_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_CONST_ALLOC_SIZE_OR_ZERO,
+};
+
const struct bpf_func_proto bpf_get_current_task_proto __weak;
+const struct bpf_func_proto bpf_get_current_task_btf_proto __weak;
const struct bpf_func_proto bpf_probe_read_user_proto __weak;
const struct bpf_func_proto bpf_probe_read_user_str_proto __weak;
const struct bpf_func_proto bpf_probe_read_kernel_proto __weak;
const struct bpf_func_proto bpf_probe_read_kernel_str_proto __weak;
+const struct bpf_func_proto bpf_task_pt_regs_proto __weak;
const struct bpf_func_proto *
bpf_base_func_proto(enum bpf_func_id func_id)
@@ -1011,6 +1671,8 @@ bpf_base_func_proto(enum bpf_func_id func_id)
return &bpf_map_pop_elem_proto;
case BPF_FUNC_map_peek_elem:
return &bpf_map_peek_elem_proto;
+ case BPF_FUNC_map_lookup_percpu_elem:
+ return &bpf_map_lookup_percpu_elem_proto;
case BPF_FUNC_get_prandom_u32:
return &bpf_get_prandom_u32_proto;
case BPF_FUNC_get_smp_processor_id:
@@ -1023,8 +1685,8 @@ bpf_base_func_proto(enum bpf_func_id func_id)
return &bpf_ktime_get_ns_proto;
case BPF_FUNC_ktime_get_boot_ns:
return &bpf_ktime_get_boot_ns_proto;
- case BPF_FUNC_ktime_get_coarse_ns:
- return &bpf_ktime_get_coarse_ns_proto;
+ case BPF_FUNC_ktime_get_tai_ns:
+ return &bpf_ktime_get_tai_ns_proto;
case BPF_FUNC_ringbuf_output:
return &bpf_ringbuf_output_proto;
case BPF_FUNC_ringbuf_reserve:
@@ -1035,8 +1697,12 @@ bpf_base_func_proto(enum bpf_func_id func_id)
return &bpf_ringbuf_discard_proto;
case BPF_FUNC_ringbuf_query:
return &bpf_ringbuf_query_proto;
- case BPF_FUNC_for_each_map_elem:
- return &bpf_for_each_map_elem_proto;
+ case BPF_FUNC_strncmp:
+ return &bpf_strncmp_proto;
+ case BPF_FUNC_strtol:
+ return &bpf_strtol_proto;
+ case BPF_FUNC_strtoul:
+ return &bpf_strtoul_proto;
default:
break;
}
@@ -1055,6 +1721,46 @@ bpf_base_func_proto(enum bpf_func_id func_id)
return &bpf_per_cpu_ptr_proto;
case BPF_FUNC_this_cpu_ptr:
return &bpf_this_cpu_ptr_proto;
+ case BPF_FUNC_timer_init:
+ return &bpf_timer_init_proto;
+ case BPF_FUNC_timer_set_callback:
+ return &bpf_timer_set_callback_proto;
+ case BPF_FUNC_timer_start:
+ return &bpf_timer_start_proto;
+ case BPF_FUNC_timer_cancel:
+ return &bpf_timer_cancel_proto;
+ case BPF_FUNC_kptr_xchg:
+ return &bpf_kptr_xchg_proto;
+ case BPF_FUNC_for_each_map_elem:
+ return &bpf_for_each_map_elem_proto;
+ case BPF_FUNC_loop:
+ return &bpf_loop_proto;
+ case BPF_FUNC_user_ringbuf_drain:
+ return &bpf_user_ringbuf_drain_proto;
+ case BPF_FUNC_ringbuf_reserve_dynptr:
+ return &bpf_ringbuf_reserve_dynptr_proto;
+ case BPF_FUNC_ringbuf_submit_dynptr:
+ return &bpf_ringbuf_submit_dynptr_proto;
+ case BPF_FUNC_ringbuf_discard_dynptr:
+ return &bpf_ringbuf_discard_dynptr_proto;
+ case BPF_FUNC_dynptr_from_mem:
+ return &bpf_dynptr_from_mem_proto;
+ case BPF_FUNC_dynptr_read:
+ return &bpf_dynptr_read_proto;
+ case BPF_FUNC_dynptr_write:
+ return &bpf_dynptr_write_proto;
+ case BPF_FUNC_dynptr_data:
+ return &bpf_dynptr_data_proto;
+#ifdef CONFIG_CGROUPS
+ case BPF_FUNC_cgrp_storage_get:
+ return &bpf_cgrp_storage_get_proto;
+ case BPF_FUNC_cgrp_storage_delete:
+ return &bpf_cgrp_storage_delete_proto;
+ case BPF_FUNC_get_current_cgroup_id:
+ return &bpf_get_current_cgroup_id_proto;
+ case BPF_FUNC_get_current_ancestor_cgroup_id:
+ return &bpf_get_current_ancestor_cgroup_id_proto;
+#endif
default:
break;
}
@@ -1067,21 +1773,632 @@ bpf_base_func_proto(enum bpf_func_id func_id)
return bpf_get_trace_printk_proto();
case BPF_FUNC_get_current_task:
return &bpf_get_current_task_proto;
+ case BPF_FUNC_get_current_task_btf:
+ return &bpf_get_current_task_btf_proto;
case BPF_FUNC_probe_read_user:
return &bpf_probe_read_user_proto;
case BPF_FUNC_probe_read_kernel:
- return security_locked_down(LOCKDOWN_BPF_READ) < 0 ?
+ return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
NULL : &bpf_probe_read_kernel_proto;
case BPF_FUNC_probe_read_user_str:
return &bpf_probe_read_user_str_proto;
case BPF_FUNC_probe_read_kernel_str:
- return security_locked_down(LOCKDOWN_BPF_READ) < 0 ?
+ return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
NULL : &bpf_probe_read_kernel_str_proto;
case BPF_FUNC_snprintf_btf:
return &bpf_snprintf_btf_proto;
case BPF_FUNC_snprintf:
return &bpf_snprintf_proto;
+ case BPF_FUNC_task_pt_regs:
+ return &bpf_task_pt_regs_proto;
+ case BPF_FUNC_trace_vprintk:
+ return bpf_get_trace_vprintk_proto();
default:
return NULL;
}
}
+
+void __bpf_obj_drop_impl(void *p, const struct btf_record *rec);
+
+void bpf_list_head_free(const struct btf_field *field, void *list_head,
+ struct bpf_spin_lock *spin_lock)
+{
+ struct list_head *head = list_head, *orig_head = list_head;
+
+ BUILD_BUG_ON(sizeof(struct list_head) > sizeof(struct bpf_list_head));
+ BUILD_BUG_ON(__alignof__(struct list_head) > __alignof__(struct bpf_list_head));
+
+ /* Do the actual list draining outside the lock to not hold the lock for
+ * too long, and also prevent deadlocks if tracing programs end up
+ * executing on entry/exit of functions called inside the critical
+ * section, and end up doing map ops that call bpf_list_head_free for
+ * the same map value again.
+ */
+ __bpf_spin_lock_irqsave(spin_lock);
+ if (!head->next || list_empty(head))
+ goto unlock;
+ head = head->next;
+unlock:
+ INIT_LIST_HEAD(orig_head);
+ __bpf_spin_unlock_irqrestore(spin_lock);
+
+ while (head != orig_head) {
+ void *obj = head;
+
+ obj -= field->graph_root.node_offset;
+ head = head->next;
+ /* The contained type can also have resources, including a
+ * bpf_list_head which needs to be freed.
+ */
+ migrate_disable();
+ __bpf_obj_drop_impl(obj, field->graph_root.value_rec);
+ migrate_enable();
+ }
+}
+
+/* Like rbtree_postorder_for_each_entry_safe, but 'pos' and 'n' are
+ * 'rb_node *', so field name of rb_node within containing struct is not
+ * needed.
+ *
+ * Since bpf_rb_tree's node type has a corresponding struct btf_field with
+ * graph_root.node_offset, it's not necessary to know field name
+ * or type of node struct
+ */
+#define bpf_rbtree_postorder_for_each_entry_safe(pos, n, root) \
+ for (pos = rb_first_postorder(root); \
+ pos && ({ n = rb_next_postorder(pos); 1; }); \
+ pos = n)
+
+void bpf_rb_root_free(const struct btf_field *field, void *rb_root,
+ struct bpf_spin_lock *spin_lock)
+{
+ struct rb_root_cached orig_root, *root = rb_root;
+ struct rb_node *pos, *n;
+ void *obj;
+
+ BUILD_BUG_ON(sizeof(struct rb_root_cached) > sizeof(struct bpf_rb_root));
+ BUILD_BUG_ON(__alignof__(struct rb_root_cached) > __alignof__(struct bpf_rb_root));
+
+ __bpf_spin_lock_irqsave(spin_lock);
+ orig_root = *root;
+ *root = RB_ROOT_CACHED;
+ __bpf_spin_unlock_irqrestore(spin_lock);
+
+ bpf_rbtree_postorder_for_each_entry_safe(pos, n, &orig_root.rb_root) {
+ obj = pos;
+ obj -= field->graph_root.node_offset;
+
+
+ migrate_disable();
+ __bpf_obj_drop_impl(obj, field->graph_root.value_rec);
+ migrate_enable();
+ }
+}
+
+__diag_push();
+__diag_ignore_all("-Wmissing-prototypes",
+ "Global functions as their definitions will be in vmlinux BTF");
+
+__bpf_kfunc void *bpf_obj_new_impl(u64 local_type_id__k, void *meta__ign)
+{
+ struct btf_struct_meta *meta = meta__ign;
+ u64 size = local_type_id__k;
+ void *p;
+
+ p = bpf_mem_alloc(&bpf_global_ma, size);
+ if (!p)
+ return NULL;
+ if (meta)
+ bpf_obj_init(meta->record, p);
+ return p;
+}
+
+/* Must be called under migrate_disable(), as required by bpf_mem_free */
+void __bpf_obj_drop_impl(void *p, const struct btf_record *rec)
+{
+ if (rec && rec->refcount_off >= 0 &&
+ !refcount_dec_and_test((refcount_t *)(p + rec->refcount_off))) {
+ /* Object is refcounted and refcount_dec didn't result in 0
+ * refcount. Return without freeing the object
+ */
+ return;
+ }
+
+ if (rec)
+ bpf_obj_free_fields(rec, p);
+ bpf_mem_free(&bpf_global_ma, p);
+}
+
+__bpf_kfunc void bpf_obj_drop_impl(void *p__alloc, void *meta__ign)
+{
+ struct btf_struct_meta *meta = meta__ign;
+ void *p = p__alloc;
+
+ __bpf_obj_drop_impl(p, meta ? meta->record : NULL);
+}
+
+__bpf_kfunc void *bpf_refcount_acquire_impl(void *p__refcounted_kptr, void *meta__ign)
+{
+ struct btf_struct_meta *meta = meta__ign;
+ struct bpf_refcount *ref;
+
+ /* Could just cast directly to refcount_t *, but need some code using
+ * bpf_refcount type so that it is emitted in vmlinux BTF
+ */
+ ref = (struct bpf_refcount *)(p__refcounted_kptr + meta->record->refcount_off);
+
+ refcount_inc((refcount_t *)ref);
+ return (void *)p__refcounted_kptr;
+}
+
+static int __bpf_list_add(struct bpf_list_node *node, struct bpf_list_head *head,
+ bool tail, struct btf_record *rec, u64 off)
+{
+ struct list_head *n = (void *)node, *h = (void *)head;
+
+ /* If list_head was 0-initialized by map, bpf_obj_init_field wasn't
+ * called on its fields, so init here
+ */
+ if (unlikely(!h->next))
+ INIT_LIST_HEAD(h);
+ if (!list_empty(n)) {
+ /* Only called from BPF prog, no need to migrate_disable */
+ __bpf_obj_drop_impl(n - off, rec);
+ return -EINVAL;
+ }
+
+ tail ? list_add_tail(n, h) : list_add(n, h);
+
+ return 0;
+}
+
+__bpf_kfunc int bpf_list_push_front_impl(struct bpf_list_head *head,
+ struct bpf_list_node *node,
+ void *meta__ign, u64 off)
+{
+ struct btf_struct_meta *meta = meta__ign;
+
+ return __bpf_list_add(node, head, false,
+ meta ? meta->record : NULL, off);
+}
+
+__bpf_kfunc int bpf_list_push_back_impl(struct bpf_list_head *head,
+ struct bpf_list_node *node,
+ void *meta__ign, u64 off)
+{
+ struct btf_struct_meta *meta = meta__ign;
+
+ return __bpf_list_add(node, head, true,
+ meta ? meta->record : NULL, off);
+}
+
+static struct bpf_list_node *__bpf_list_del(struct bpf_list_head *head, bool tail)
+{
+ struct list_head *n, *h = (void *)head;
+
+ /* If list_head was 0-initialized by map, bpf_obj_init_field wasn't
+ * called on its fields, so init here
+ */
+ if (unlikely(!h->next))
+ INIT_LIST_HEAD(h);
+ if (list_empty(h))
+ return NULL;
+ n = tail ? h->prev : h->next;
+ list_del_init(n);
+ return (struct bpf_list_node *)n;
+}
+
+__bpf_kfunc struct bpf_list_node *bpf_list_pop_front(struct bpf_list_head *head)
+{
+ return __bpf_list_del(head, false);
+}
+
+__bpf_kfunc struct bpf_list_node *bpf_list_pop_back(struct bpf_list_head *head)
+{
+ return __bpf_list_del(head, true);
+}
+
+__bpf_kfunc struct bpf_rb_node *bpf_rbtree_remove(struct bpf_rb_root *root,
+ struct bpf_rb_node *node)
+{
+ struct rb_root_cached *r = (struct rb_root_cached *)root;
+ struct rb_node *n = (struct rb_node *)node;
+
+ if (RB_EMPTY_NODE(n))
+ return NULL;
+
+ rb_erase_cached(n, r);
+ RB_CLEAR_NODE(n);
+ return (struct bpf_rb_node *)n;
+}
+
+/* Need to copy rbtree_add_cached's logic here because our 'less' is a BPF
+ * program
+ */
+static int __bpf_rbtree_add(struct bpf_rb_root *root, struct bpf_rb_node *node,
+ void *less, struct btf_record *rec, u64 off)
+{
+ struct rb_node **link = &((struct rb_root_cached *)root)->rb_root.rb_node;
+ struct rb_node *parent = NULL, *n = (struct rb_node *)node;
+ bpf_callback_t cb = (bpf_callback_t)less;
+ bool leftmost = true;
+
+ if (!RB_EMPTY_NODE(n)) {
+ /* Only called from BPF prog, no need to migrate_disable */
+ __bpf_obj_drop_impl(n - off, rec);
+ return -EINVAL;
+ }
+
+ while (*link) {
+ parent = *link;
+ if (cb((uintptr_t)node, (uintptr_t)parent, 0, 0, 0)) {
+ link = &parent->rb_left;
+ } else {
+ link = &parent->rb_right;
+ leftmost = false;
+ }
+ }
+
+ rb_link_node(n, parent, link);
+ rb_insert_color_cached(n, (struct rb_root_cached *)root, leftmost);
+ return 0;
+}
+
+__bpf_kfunc int bpf_rbtree_add_impl(struct bpf_rb_root *root, struct bpf_rb_node *node,
+ bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b),
+ void *meta__ign, u64 off)
+{
+ struct btf_struct_meta *meta = meta__ign;
+
+ return __bpf_rbtree_add(root, node, (void *)less, meta ? meta->record : NULL, off);
+}
+
+__bpf_kfunc struct bpf_rb_node *bpf_rbtree_first(struct bpf_rb_root *root)
+{
+ struct rb_root_cached *r = (struct rb_root_cached *)root;
+
+ return (struct bpf_rb_node *)rb_first_cached(r);
+}
+
+/**
+ * bpf_task_acquire - Acquire a reference to a task. A task acquired by this
+ * kfunc which is not stored in a map as a kptr, must be released by calling
+ * bpf_task_release().
+ * @p: The task on which a reference is being acquired.
+ */
+__bpf_kfunc struct task_struct *bpf_task_acquire(struct task_struct *p)
+{
+ if (refcount_inc_not_zero(&p->rcu_users))
+ return p;
+ return NULL;
+}
+
+/**
+ * bpf_task_release - Release the reference acquired on a task.
+ * @p: The task on which a reference is being released.
+ */
+__bpf_kfunc void bpf_task_release(struct task_struct *p)
+{
+ put_task_struct_rcu_user(p);
+}
+
+#ifdef CONFIG_CGROUPS
+/**
+ * bpf_cgroup_acquire - Acquire a reference to a cgroup. A cgroup acquired by
+ * this kfunc which is not stored in a map as a kptr, must be released by
+ * calling bpf_cgroup_release().
+ * @cgrp: The cgroup on which a reference is being acquired.
+ */
+__bpf_kfunc struct cgroup *bpf_cgroup_acquire(struct cgroup *cgrp)
+{
+ return cgroup_tryget(cgrp) ? cgrp : NULL;
+}
+
+/**
+ * bpf_cgroup_release - Release the reference acquired on a cgroup.
+ * If this kfunc is invoked in an RCU read region, the cgroup is guaranteed to
+ * not be freed until the current grace period has ended, even if its refcount
+ * drops to 0.
+ * @cgrp: The cgroup on which a reference is being released.
+ */
+__bpf_kfunc void bpf_cgroup_release(struct cgroup *cgrp)
+{
+ cgroup_put(cgrp);
+}
+
+/**
+ * bpf_cgroup_ancestor - Perform a lookup on an entry in a cgroup's ancestor
+ * array. A cgroup returned by this kfunc which is not subsequently stored in a
+ * map, must be released by calling bpf_cgroup_release().
+ * @cgrp: The cgroup for which we're performing a lookup.
+ * @level: The level of ancestor to look up.
+ */
+__bpf_kfunc struct cgroup *bpf_cgroup_ancestor(struct cgroup *cgrp, int level)
+{
+ struct cgroup *ancestor;
+
+ if (level > cgrp->level || level < 0)
+ return NULL;
+
+ /* cgrp's refcnt could be 0 here, but ancestors can still be accessed */
+ ancestor = cgrp->ancestors[level];
+ if (!cgroup_tryget(ancestor))
+ return NULL;
+ return ancestor;
+}
+
+/**
+ * bpf_cgroup_from_id - Find a cgroup from its ID. A cgroup returned by this
+ * kfunc which is not subsequently stored in a map, must be released by calling
+ * bpf_cgroup_release().
+ * @cgid: cgroup id.
+ */
+__bpf_kfunc struct cgroup *bpf_cgroup_from_id(u64 cgid)
+{
+ struct cgroup *cgrp;
+
+ cgrp = cgroup_get_from_id(cgid);
+ if (IS_ERR(cgrp))
+ return NULL;
+ return cgrp;
+}
+#endif /* CONFIG_CGROUPS */
+
+/**
+ * bpf_task_from_pid - Find a struct task_struct from its pid by looking it up
+ * in the root pid namespace idr. If a task is returned, it must either be
+ * stored in a map, or released with bpf_task_release().
+ * @pid: The pid of the task being looked up.
+ */
+__bpf_kfunc struct task_struct *bpf_task_from_pid(s32 pid)
+{
+ struct task_struct *p;
+
+ rcu_read_lock();
+ p = find_task_by_pid_ns(pid, &init_pid_ns);
+ if (p)
+ p = bpf_task_acquire(p);
+ rcu_read_unlock();
+
+ return p;
+}
+
+/**
+ * bpf_dynptr_slice() - Obtain a read-only pointer to the dynptr data.
+ * @ptr: The dynptr whose data slice to retrieve
+ * @offset: Offset into the dynptr
+ * @buffer: User-provided buffer to copy contents into
+ * @buffer__szk: Size (in bytes) of the buffer. This is the length of the
+ * requested slice. This must be a constant.
+ *
+ * For non-skb and non-xdp type dynptrs, there is no difference between
+ * bpf_dynptr_slice and bpf_dynptr_data.
+ *
+ * If the intention is to write to the data slice, please use
+ * bpf_dynptr_slice_rdwr.
+ *
+ * The user must check that the returned pointer is not null before using it.
+ *
+ * Please note that in the case of skb and xdp dynptrs, bpf_dynptr_slice
+ * does not change the underlying packet data pointers, so a call to
+ * bpf_dynptr_slice will not invalidate any ctx->data/data_end pointers in
+ * the bpf program.
+ *
+ * Return: NULL if the call failed (eg invalid dynptr), pointer to a read-only
+ * data slice (can be either direct pointer to the data or a pointer to the user
+ * provided buffer, with its contents containing the data, if unable to obtain
+ * direct pointer)
+ */
+__bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr_kern *ptr, u32 offset,
+ void *buffer, u32 buffer__szk)
+{
+ enum bpf_dynptr_type type;
+ u32 len = buffer__szk;
+ int err;
+
+ if (!ptr->data)
+ return NULL;
+
+ err = bpf_dynptr_check_off_len(ptr, offset, len);
+ if (err)
+ return NULL;
+
+ type = bpf_dynptr_get_type(ptr);
+
+ switch (type) {
+ case BPF_DYNPTR_TYPE_LOCAL:
+ case BPF_DYNPTR_TYPE_RINGBUF:
+ return ptr->data + ptr->offset + offset;
+ case BPF_DYNPTR_TYPE_SKB:
+ return skb_header_pointer(ptr->data, ptr->offset + offset, len, buffer);
+ case BPF_DYNPTR_TYPE_XDP:
+ {
+ void *xdp_ptr = bpf_xdp_pointer(ptr->data, ptr->offset + offset, len);
+ if (xdp_ptr)
+ return xdp_ptr;
+
+ bpf_xdp_copy_buf(ptr->data, ptr->offset + offset, buffer, len, false);
+ return buffer;
+ }
+ default:
+ WARN_ONCE(true, "unknown dynptr type %d\n", type);
+ return NULL;
+ }
+}
+
+/**
+ * bpf_dynptr_slice_rdwr() - Obtain a writable pointer to the dynptr data.
+ * @ptr: The dynptr whose data slice to retrieve
+ * @offset: Offset into the dynptr
+ * @buffer: User-provided buffer to copy contents into
+ * @buffer__szk: Size (in bytes) of the buffer. This is the length of the
+ * requested slice. This must be a constant.
+ *
+ * For non-skb and non-xdp type dynptrs, there is no difference between
+ * bpf_dynptr_slice and bpf_dynptr_data.
+ *
+ * The returned pointer is writable and may point to either directly the dynptr
+ * data at the requested offset or to the buffer if unable to obtain a direct
+ * data pointer to (example: the requested slice is to the paged area of an skb
+ * packet). In the case where the returned pointer is to the buffer, the user
+ * is responsible for persisting writes through calling bpf_dynptr_write(). This
+ * usually looks something like this pattern:
+ *
+ * struct eth_hdr *eth = bpf_dynptr_slice_rdwr(&dynptr, 0, buffer, sizeof(buffer));
+ * if (!eth)
+ * return TC_ACT_SHOT;
+ *
+ * // mutate eth header //
+ *
+ * if (eth == buffer)
+ * bpf_dynptr_write(&ptr, 0, buffer, sizeof(buffer), 0);
+ *
+ * Please note that, as in the example above, the user must check that the
+ * returned pointer is not null before using it.
+ *
+ * Please also note that in the case of skb and xdp dynptrs, bpf_dynptr_slice_rdwr
+ * does not change the underlying packet data pointers, so a call to
+ * bpf_dynptr_slice_rdwr will not invalidate any ctx->data/data_end pointers in
+ * the bpf program.
+ *
+ * Return: NULL if the call failed (eg invalid dynptr), pointer to a
+ * data slice (can be either direct pointer to the data or a pointer to the user
+ * provided buffer, with its contents containing the data, if unable to obtain
+ * direct pointer)
+ */
+__bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr_kern *ptr, u32 offset,
+ void *buffer, u32 buffer__szk)
+{
+ if (!ptr->data || bpf_dynptr_is_rdonly(ptr))
+ return NULL;
+
+ /* bpf_dynptr_slice_rdwr is the same logic as bpf_dynptr_slice.
+ *
+ * For skb-type dynptrs, it is safe to write into the returned pointer
+ * if the bpf program allows skb data writes. There are two possiblities
+ * that may occur when calling bpf_dynptr_slice_rdwr:
+ *
+ * 1) The requested slice is in the head of the skb. In this case, the
+ * returned pointer is directly to skb data, and if the skb is cloned, the
+ * verifier will have uncloned it (see bpf_unclone_prologue()) already.
+ * The pointer can be directly written into.
+ *
+ * 2) Some portion of the requested slice is in the paged buffer area.
+ * In this case, the requested data will be copied out into the buffer
+ * and the returned pointer will be a pointer to the buffer. The skb
+ * will not be pulled. To persist the write, the user will need to call
+ * bpf_dynptr_write(), which will pull the skb and commit the write.
+ *
+ * Similarly for xdp programs, if the requested slice is not across xdp
+ * fragments, then a direct pointer will be returned, otherwise the data
+ * will be copied out into the buffer and the user will need to call
+ * bpf_dynptr_write() to commit changes.
+ */
+ return bpf_dynptr_slice(ptr, offset, buffer, buffer__szk);
+}
+
+__bpf_kfunc void *bpf_cast_to_kern_ctx(void *obj)
+{
+ return obj;
+}
+
+__bpf_kfunc void *bpf_rdonly_cast(void *obj__ign, u32 btf_id__k)
+{
+ return obj__ign;
+}
+
+__bpf_kfunc void bpf_rcu_read_lock(void)
+{
+ rcu_read_lock();
+}
+
+__bpf_kfunc void bpf_rcu_read_unlock(void)
+{
+ rcu_read_unlock();
+}
+
+__diag_pop();
+
+BTF_SET8_START(generic_btf_ids)
+#ifdef CONFIG_KEXEC_CORE
+BTF_ID_FLAGS(func, crash_kexec, KF_DESTRUCTIVE)
+#endif
+BTF_ID_FLAGS(func, bpf_obj_new_impl, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_obj_drop_impl, KF_RELEASE)
+BTF_ID_FLAGS(func, bpf_refcount_acquire_impl, KF_ACQUIRE)
+BTF_ID_FLAGS(func, bpf_list_push_front_impl)
+BTF_ID_FLAGS(func, bpf_list_push_back_impl)
+BTF_ID_FLAGS(func, bpf_list_pop_front, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_list_pop_back, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_task_release, KF_RELEASE)
+BTF_ID_FLAGS(func, bpf_rbtree_remove, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_rbtree_add_impl)
+BTF_ID_FLAGS(func, bpf_rbtree_first, KF_RET_NULL)
+
+#ifdef CONFIG_CGROUPS
+BTF_ID_FLAGS(func, bpf_cgroup_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_cgroup_release, KF_RELEASE)
+BTF_ID_FLAGS(func, bpf_cgroup_ancestor, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_cgroup_from_id, KF_ACQUIRE | KF_RET_NULL)
+#endif
+BTF_ID_FLAGS(func, bpf_task_from_pid, KF_ACQUIRE | KF_RET_NULL)
+BTF_SET8_END(generic_btf_ids)
+
+static const struct btf_kfunc_id_set generic_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &generic_btf_ids,
+};
+
+
+BTF_ID_LIST(generic_dtor_ids)
+BTF_ID(struct, task_struct)
+BTF_ID(func, bpf_task_release)
+#ifdef CONFIG_CGROUPS
+BTF_ID(struct, cgroup)
+BTF_ID(func, bpf_cgroup_release)
+#endif
+
+BTF_SET8_START(common_btf_ids)
+BTF_ID_FLAGS(func, bpf_cast_to_kern_ctx)
+BTF_ID_FLAGS(func, bpf_rdonly_cast)
+BTF_ID_FLAGS(func, bpf_rcu_read_lock)
+BTF_ID_FLAGS(func, bpf_rcu_read_unlock)
+BTF_ID_FLAGS(func, bpf_dynptr_slice, KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_dynptr_slice_rdwr, KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_iter_num_new, KF_ITER_NEW)
+BTF_ID_FLAGS(func, bpf_iter_num_next, KF_ITER_NEXT | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_iter_num_destroy, KF_ITER_DESTROY)
+BTF_SET8_END(common_btf_ids)
+
+static const struct btf_kfunc_id_set common_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &common_btf_ids,
+};
+
+static int __init kfunc_init(void)
+{
+ int ret;
+ const struct btf_id_dtor_kfunc generic_dtors[] = {
+ {
+ .btf_id = generic_dtor_ids[0],
+ .kfunc_btf_id = generic_dtor_ids[1]
+ },
+#ifdef CONFIG_CGROUPS
+ {
+ .btf_id = generic_dtor_ids[2],
+ .kfunc_btf_id = generic_dtor_ids[3]
+ },
+#endif
+ };
+
+ ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &generic_kfunc_set);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &generic_kfunc_set);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &generic_kfunc_set);
+ ret = ret ?: register_btf_id_dtor_kfuncs(generic_dtors,
+ ARRAY_SIZE(generic_dtors),
+ THIS_MODULE);
+ return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, &common_kfunc_set);
+}
+
+late_initcall(kfunc_init);
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 80da1db47c68..9948b542a470 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -122,7 +122,7 @@ static struct inode *bpf_get_inode(struct super_block *sb,
inode->i_mtime = inode->i_atime;
inode->i_ctime = inode->i_atime;
- inode_init_owner(&init_user_ns, inode, dir, mode);
+ inode_init_owner(&nop_mnt_idmap, inode, dir, mode);
return inode;
}
@@ -152,7 +152,7 @@ static void bpf_dentry_finalize(struct dentry *dentry, struct inode *inode,
dir->i_ctime = dir->i_mtime;
}
-static int bpf_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+static int bpf_mkdir(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *dentry, umode_t mode)
{
struct inode *inode;
@@ -382,7 +382,7 @@ bpf_lookup(struct inode *dir, struct dentry *dentry, unsigned flags)
return simple_lookup(dir, dentry, flags);
}
-static int bpf_symlink(struct user_namespace *mnt_userns, struct inode *dir,
+static int bpf_symlink(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *dentry, const char *target)
{
char *link = kstrdup(target, GFP_USER | __GFP_NOWARN);
@@ -559,7 +559,7 @@ int bpf_obj_get_user(const char __user *pathname, int flags)
static struct bpf_prog *__get_prog_inode(struct inode *inode, enum bpf_prog_type type)
{
struct bpf_prog *prog;
- int ret = inode_permission(&init_user_ns, inode, MAY_READ);
+ int ret = inode_permission(&nop_mnt_idmap, inode, MAY_READ);
if (ret)
return ERR_PTR(ret);
@@ -648,12 +648,22 @@ static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param)
int opt;
opt = fs_parse(fc, bpf_fs_parameters, param, &result);
- if (opt < 0)
+ if (opt < 0) {
/* We might like to report bad mount options here, but
* traditionally we've ignored all mount options, so we'd
* better continue to ignore non-existing options for bpf.
*/
- return opt == -ENOPARAM ? 0 : opt;
+ if (opt == -ENOPARAM) {
+ opt = vfs_parse_fs_param_source(fc, param);
+ if (opt != -ENOPARAM)
+ return opt;
+
+ return 0;
+ }
+
+ if (opt < 0)
+ return opt;
+ }
switch (opt) {
case OPT_MODE:
@@ -700,11 +710,10 @@ static DEFINE_MUTEX(bpf_preload_lock);
static int populate_bpffs(struct dentry *parent)
{
struct bpf_preload_info objs[BPF_PRELOAD_LINKS] = {};
- struct bpf_link *links[BPF_PRELOAD_LINKS] = {};
int err = 0, i;
/* grab the mutex to make sure the kernel interactions with bpf_preload
- * UMD are serialized
+ * are serialized
*/
mutex_lock(&bpf_preload_lock);
@@ -712,40 +721,22 @@ static int populate_bpffs(struct dentry *parent)
if (!bpf_preload_mod_get())
goto out;
- if (!bpf_preload_ops->info.tgid) {
- /* preload() will start UMD that will load BPF iterator programs */
- err = bpf_preload_ops->preload(objs);
- if (err)
+ err = bpf_preload_ops->preload(objs);
+ if (err)
+ goto out_put;
+ for (i = 0; i < BPF_PRELOAD_LINKS; i++) {
+ bpf_link_inc(objs[i].link);
+ err = bpf_iter_link_pin_kernel(parent,
+ objs[i].link_name, objs[i].link);
+ if (err) {
+ bpf_link_put(objs[i].link);
goto out_put;
- for (i = 0; i < BPF_PRELOAD_LINKS; i++) {
- links[i] = bpf_link_by_id(objs[i].link_id);
- if (IS_ERR(links[i])) {
- err = PTR_ERR(links[i]);
- goto out_put;
- }
- }
- for (i = 0; i < BPF_PRELOAD_LINKS; i++) {
- err = bpf_iter_link_pin_kernel(parent,
- objs[i].link_name, links[i]);
- if (err)
- goto out_put;
- /* do not unlink successfully pinned links even
- * if later link fails to pin
- */
- links[i] = NULL;
}
- /* finish() will tell UMD process to exit */
- err = bpf_preload_ops->finish();
- if (err)
- goto out_put;
}
out_put:
bpf_preload_mod_put();
out:
mutex_unlock(&bpf_preload_lock);
- for (i = 0; i < BPF_PRELOAD_LINKS && err; i++)
- if (!IS_ERR_OR_NULL(links[i]))
- bpf_link_put(links[i]);
return err;
}
diff --git a/kernel/bpf/link_iter.c b/kernel/bpf/link_iter.c
new file mode 100644
index 000000000000..fec8005a121c
--- /dev/null
+++ b/kernel/bpf/link_iter.c
@@ -0,0 +1,107 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2022 Red Hat, Inc. */
+#include <linux/bpf.h>
+#include <linux/fs.h>
+#include <linux/filter.h>
+#include <linux/kernel.h>
+#include <linux/btf_ids.h>
+
+struct bpf_iter_seq_link_info {
+ u32 link_id;
+};
+
+static void *bpf_link_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct bpf_iter_seq_link_info *info = seq->private;
+ struct bpf_link *link;
+
+ link = bpf_link_get_curr_or_next(&info->link_id);
+ if (!link)
+ return NULL;
+
+ if (*pos == 0)
+ ++*pos;
+ return link;
+}
+
+static void *bpf_link_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct bpf_iter_seq_link_info *info = seq->private;
+
+ ++*pos;
+ ++info->link_id;
+ bpf_link_put((struct bpf_link *)v);
+ return bpf_link_get_curr_or_next(&info->link_id);
+}
+
+struct bpf_iter__bpf_link {
+ __bpf_md_ptr(struct bpf_iter_meta *, meta);
+ __bpf_md_ptr(struct bpf_link *, link);
+};
+
+DEFINE_BPF_ITER_FUNC(bpf_link, struct bpf_iter_meta *meta, struct bpf_link *link)
+
+static int __bpf_link_seq_show(struct seq_file *seq, void *v, bool in_stop)
+{
+ struct bpf_iter__bpf_link ctx;
+ struct bpf_iter_meta meta;
+ struct bpf_prog *prog;
+ int ret = 0;
+
+ ctx.meta = &meta;
+ ctx.link = v;
+ meta.seq = seq;
+ prog = bpf_iter_get_info(&meta, in_stop);
+ if (prog)
+ ret = bpf_iter_run_prog(prog, &ctx);
+
+ return ret;
+}
+
+static int bpf_link_seq_show(struct seq_file *seq, void *v)
+{
+ return __bpf_link_seq_show(seq, v, false);
+}
+
+static void bpf_link_seq_stop(struct seq_file *seq, void *v)
+{
+ if (!v)
+ (void)__bpf_link_seq_show(seq, v, true);
+ else
+ bpf_link_put((struct bpf_link *)v);
+}
+
+static const struct seq_operations bpf_link_seq_ops = {
+ .start = bpf_link_seq_start,
+ .next = bpf_link_seq_next,
+ .stop = bpf_link_seq_stop,
+ .show = bpf_link_seq_show,
+};
+
+BTF_ID_LIST(btf_bpf_link_id)
+BTF_ID(struct, bpf_link)
+
+static const struct bpf_iter_seq_info bpf_link_seq_info = {
+ .seq_ops = &bpf_link_seq_ops,
+ .init_seq_private = NULL,
+ .fini_seq_private = NULL,
+ .seq_priv_size = sizeof(struct bpf_iter_seq_link_info),
+};
+
+static struct bpf_iter_reg bpf_link_reg_info = {
+ .target = "bpf_link",
+ .ctx_arg_info_size = 1,
+ .ctx_arg_info = {
+ { offsetof(struct bpf_iter__bpf_link, link),
+ PTR_TO_BTF_ID_OR_NULL },
+ },
+ .seq_info = &bpf_link_seq_info,
+};
+
+static int __init bpf_link_iter_init(void)
+{
+ bpf_link_reg_info.ctx_arg_info[0].btf_id = *btf_bpf_link_id;
+ return bpf_iter_reg_target(&bpf_link_reg_info);
+}
+
+late_initcall(bpf_link_iter_init);
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index bd11db9774c3..a04f505aefe9 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -1,6 +1,7 @@
-//SPDX-License-Identifier: GPL-2.0
+// SPDX-License-Identifier: GPL-2.0
#include <linux/bpf-cgroup.h>
#include <linux/bpf.h>
+#include <linux/bpf_local_storage.h>
#include <linux/btf.h>
#include <linux/bug.h>
#include <linux/filter.h>
@@ -8,12 +9,10 @@
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <uapi/linux/btf.h>
+#include <linux/btf_ids.h>
#ifdef CONFIG_CGROUP_BPF
-DEFINE_PER_CPU(struct bpf_cgroup_storage_info,
- bpf_cgroup_storage_info[BPF_CGROUP_STORAGE_NEST_MAX]);
-
#include "../cgroup/cgroup-internal.h"
#define LOCAL_STORAGE_CREATE_FLAG_MASK \
@@ -142,8 +141,8 @@ static void *cgroup_storage_lookup_elem(struct bpf_map *_map, void *key)
return &READ_ONCE(storage->buf)->data[0];
}
-static int cgroup_storage_update_elem(struct bpf_map *map, void *key,
- void *value, u64 flags)
+static long cgroup_storage_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 flags)
{
struct bpf_cgroup_storage *storage;
struct bpf_storage_buffer *new;
@@ -152,7 +151,7 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *key,
return -EINVAL;
if (unlikely((flags & BPF_F_LOCK) &&
- !map_value_has_spin_lock(map)))
+ !btf_record_has_field(map->record, BPF_SPIN_LOCK)))
return -EINVAL;
storage = cgroup_storage_lookup((struct bpf_cgroup_storage_map *)map,
@@ -165,15 +164,14 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *key,
return 0;
}
- new = bpf_map_kmalloc_node(map, sizeof(struct bpf_storage_buffer) +
- map->value_size,
- __GFP_ZERO | GFP_ATOMIC | __GFP_NOWARN,
+ new = bpf_map_kmalloc_node(map, struct_size(new, data, map->value_size),
+ __GFP_ZERO | GFP_NOWAIT | __GFP_NOWARN,
map->numa_node);
if (!new)
return -ENOMEM;
memcpy(&new->data[0], value, map->value_size);
- check_and_init_map_lock(map, new->data);
+ check_and_init_map_value(map, new->data);
new = xchg(&storage->buf, new);
kfree_rcu(new, rcu);
@@ -286,9 +284,17 @@ enoent:
static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr)
{
+ __u32 max_value_size = BPF_LOCAL_STORAGE_MAX_VALUE_SIZE;
int numa_node = bpf_map_attr_numa_node(attr);
struct bpf_cgroup_storage_map *map;
+ /* percpu is bound by PCPU_MIN_UNIT_SIZE, non-percu
+ * is the same as other local storages.
+ */
+ if (attr->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
+ max_value_size = min_t(__u32, max_value_size,
+ PCPU_MIN_UNIT_SIZE);
+
if (attr->key_size != sizeof(struct bpf_cgroup_storage_key) &&
attr->key_size != sizeof(__u64))
return ERR_PTR(-EINVAL);
@@ -296,7 +302,7 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr)
if (attr->value_size == 0)
return ERR_PTR(-EINVAL);
- if (attr->value_size > PAGE_SIZE)
+ if (attr->value_size > max_value_size)
return ERR_PTR(-E2BIG);
if (attr->map_flags & ~LOCAL_STORAGE_CREATE_FLAG_MASK ||
@@ -307,8 +313,7 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr)
/* max_entries is not used and enforced to be 0 */
return ERR_PTR(-EINVAL);
- map = kmalloc_node(sizeof(struct bpf_cgroup_storage_map),
- __GFP_ZERO | GFP_USER | __GFP_ACCOUNT, numa_node);
+ map = bpf_map_area_alloc(sizeof(struct bpf_cgroup_storage_map), numa_node);
if (!map)
return ERR_PTR(-ENOMEM);
@@ -328,22 +333,22 @@ static void cgroup_storage_map_free(struct bpf_map *_map)
struct list_head *storages = &map->list;
struct bpf_cgroup_storage *storage, *stmp;
- mutex_lock(&cgroup_mutex);
+ cgroup_lock();
list_for_each_entry_safe(storage, stmp, storages, list_map) {
bpf_cgroup_storage_unlink(storage);
bpf_cgroup_storage_free(storage);
}
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
WARN_ON(!RB_EMPTY_ROOT(&map->root));
WARN_ON(!list_empty(&map->list));
- kfree(map);
+ bpf_map_area_free(map);
}
-static int cgroup_storage_delete_elem(struct bpf_map *map, void *key)
+static long cgroup_storage_delete_elem(struct bpf_map *map, void *key)
{
return -EINVAL;
}
@@ -409,7 +414,7 @@ static int cgroup_storage_check_btf(const struct bpf_map *map,
static void cgroup_storage_seq_show_elem(struct bpf_map *map, void *key,
struct seq_file *m)
{
- enum bpf_cgroup_storage_type stype = cgroup_storage_type(map);
+ enum bpf_cgroup_storage_type stype;
struct bpf_cgroup_storage *storage;
int cpu;
@@ -441,7 +446,14 @@ static void cgroup_storage_seq_show_elem(struct bpf_map *map, void *key,
rcu_read_unlock();
}
-static int cgroup_storage_map_btf_id;
+static u64 cgroup_storage_map_usage(const struct bpf_map *map)
+{
+ /* Currently the dynamically allocated elements are not counted. */
+ return sizeof(struct bpf_cgroup_storage_map);
+}
+
+BTF_ID_LIST_SINGLE(cgroup_storage_map_btf_ids, struct,
+ bpf_cgroup_storage_map)
const struct bpf_map_ops cgroup_storage_map_ops = {
.map_alloc = cgroup_storage_map_alloc,
.map_free = cgroup_storage_map_free,
@@ -451,8 +463,8 @@ const struct bpf_map_ops cgroup_storage_map_ops = {
.map_delete_elem = cgroup_storage_delete_elem,
.map_check_btf = cgroup_storage_check_btf,
.map_seq_show_elem = cgroup_storage_seq_show_elem,
- .map_btf_name = "bpf_cgroup_storage_map",
- .map_btf_id = &cgroup_storage_map_btf_id,
+ .map_mem_usage = cgroup_storage_map_usage,
+ .map_btf_id = &cgroup_storage_map_btf_ids[0],
};
int bpf_cgroup_storage_assign(struct bpf_prog_aux *aux, struct bpf_map *_map)
@@ -509,7 +521,7 @@ struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog,
map->numa_node);
if (!storage->buf)
goto enomem;
- check_and_init_map_lock(map, storage->buf->data);
+ check_and_init_map_value(map, storage->buf->data);
} else {
storage->percpu_buf = bpf_map_alloc_percpu(map, size, 8, gfp);
if (!storage->percpu_buf)
diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c
new file mode 100644
index 000000000000..046ddff37a76
--- /dev/null
+++ b/kernel/bpf/log.c
@@ -0,0 +1,330 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ * Copyright (c) 2016 Facebook
+ * Copyright (c) 2018 Covalent IO, Inc. http://covalent.io
+ */
+#include <uapi/linux/btf.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/bpf.h>
+#include <linux/bpf_verifier.h>
+#include <linux/math64.h>
+
+static bool bpf_verifier_log_attr_valid(const struct bpf_verifier_log *log)
+{
+ /* ubuf and len_total should both be specified (or not) together */
+ if (!!log->ubuf != !!log->len_total)
+ return false;
+ /* log buf without log_level is meaningless */
+ if (log->ubuf && log->level == 0)
+ return false;
+ if (log->level & ~BPF_LOG_MASK)
+ return false;
+ if (log->len_total > UINT_MAX >> 2)
+ return false;
+ return true;
+}
+
+int bpf_vlog_init(struct bpf_verifier_log *log, u32 log_level,
+ char __user *log_buf, u32 log_size)
+{
+ log->level = log_level;
+ log->ubuf = log_buf;
+ log->len_total = log_size;
+
+ /* log attributes have to be sane */
+ if (!bpf_verifier_log_attr_valid(log))
+ return -EINVAL;
+
+ return 0;
+}
+
+static void bpf_vlog_update_len_max(struct bpf_verifier_log *log, u32 add_len)
+{
+ /* add_len includes terminal \0, so no need for +1. */
+ u64 len = log->end_pos + add_len;
+
+ /* log->len_max could be larger than our current len due to
+ * bpf_vlog_reset() calls, so we maintain the max of any length at any
+ * previous point
+ */
+ if (len > UINT_MAX)
+ log->len_max = UINT_MAX;
+ else if (len > log->len_max)
+ log->len_max = len;
+}
+
+void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt,
+ va_list args)
+{
+ u64 cur_pos;
+ u32 new_n, n;
+
+ n = vscnprintf(log->kbuf, BPF_VERIFIER_TMP_LOG_SIZE, fmt, args);
+
+ WARN_ONCE(n >= BPF_VERIFIER_TMP_LOG_SIZE - 1,
+ "verifier log line truncated - local buffer too short\n");
+
+ if (log->level == BPF_LOG_KERNEL) {
+ bool newline = n > 0 && log->kbuf[n - 1] == '\n';
+
+ pr_err("BPF: %s%s", log->kbuf, newline ? "" : "\n");
+ return;
+ }
+
+ n += 1; /* include terminating zero */
+ bpf_vlog_update_len_max(log, n);
+
+ if (log->level & BPF_LOG_FIXED) {
+ /* check if we have at least something to put into user buf */
+ new_n = 0;
+ if (log->end_pos < log->len_total) {
+ new_n = min_t(u32, log->len_total - log->end_pos, n);
+ log->kbuf[new_n - 1] = '\0';
+ }
+
+ cur_pos = log->end_pos;
+ log->end_pos += n - 1; /* don't count terminating '\0' */
+
+ if (log->ubuf && new_n &&
+ copy_to_user(log->ubuf + cur_pos, log->kbuf, new_n))
+ goto fail;
+ } else {
+ u64 new_end, new_start;
+ u32 buf_start, buf_end, new_n;
+
+ new_end = log->end_pos + n;
+ if (new_end - log->start_pos >= log->len_total)
+ new_start = new_end - log->len_total;
+ else
+ new_start = log->start_pos;
+
+ log->start_pos = new_start;
+ log->end_pos = new_end - 1; /* don't count terminating '\0' */
+
+ if (!log->ubuf)
+ return;
+
+ new_n = min(n, log->len_total);
+ cur_pos = new_end - new_n;
+ div_u64_rem(cur_pos, log->len_total, &buf_start);
+ div_u64_rem(new_end, log->len_total, &buf_end);
+ /* new_end and buf_end are exclusive indices, so if buf_end is
+ * exactly zero, then it actually points right to the end of
+ * ubuf and there is no wrap around
+ */
+ if (buf_end == 0)
+ buf_end = log->len_total;
+
+ /* if buf_start > buf_end, we wrapped around;
+ * if buf_start == buf_end, then we fill ubuf completely; we
+ * can't have buf_start == buf_end to mean that there is
+ * nothing to write, because we always write at least
+ * something, even if terminal '\0'
+ */
+ if (buf_start < buf_end) {
+ /* message fits within contiguous chunk of ubuf */
+ if (copy_to_user(log->ubuf + buf_start,
+ log->kbuf + n - new_n,
+ buf_end - buf_start))
+ goto fail;
+ } else {
+ /* message wraps around the end of ubuf, copy in two chunks */
+ if (copy_to_user(log->ubuf + buf_start,
+ log->kbuf + n - new_n,
+ log->len_total - buf_start))
+ goto fail;
+ if (copy_to_user(log->ubuf,
+ log->kbuf + n - buf_end,
+ buf_end))
+ goto fail;
+ }
+ }
+
+ return;
+fail:
+ log->ubuf = NULL;
+}
+
+void bpf_vlog_reset(struct bpf_verifier_log *log, u64 new_pos)
+{
+ char zero = 0;
+ u32 pos;
+
+ if (WARN_ON_ONCE(new_pos > log->end_pos))
+ return;
+
+ if (!bpf_verifier_log_needed(log) || log->level == BPF_LOG_KERNEL)
+ return;
+
+ /* if position to which we reset is beyond current log window,
+ * then we didn't preserve any useful content and should adjust
+ * start_pos to end up with an empty log (start_pos == end_pos)
+ */
+ log->end_pos = new_pos;
+ if (log->end_pos < log->start_pos)
+ log->start_pos = log->end_pos;
+
+ if (!log->ubuf)
+ return;
+
+ if (log->level & BPF_LOG_FIXED)
+ pos = log->end_pos + 1;
+ else
+ div_u64_rem(new_pos, log->len_total, &pos);
+
+ if (pos < log->len_total && put_user(zero, log->ubuf + pos))
+ log->ubuf = NULL;
+}
+
+static void bpf_vlog_reverse_kbuf(char *buf, int len)
+{
+ int i, j;
+
+ for (i = 0, j = len - 1; i < j; i++, j--)
+ swap(buf[i], buf[j]);
+}
+
+static int bpf_vlog_reverse_ubuf(struct bpf_verifier_log *log, int start, int end)
+{
+ /* we split log->kbuf into two equal parts for both ends of array */
+ int n = sizeof(log->kbuf) / 2, nn;
+ char *lbuf = log->kbuf, *rbuf = log->kbuf + n;
+
+ /* Read ubuf's section [start, end) two chunks at a time, from left
+ * and right side; within each chunk, swap all the bytes; after that
+ * reverse the order of lbuf and rbuf and write result back to ubuf.
+ * This way we'll end up with swapped contents of specified
+ * [start, end) ubuf segment.
+ */
+ while (end - start > 1) {
+ nn = min(n, (end - start ) / 2);
+
+ if (copy_from_user(lbuf, log->ubuf + start, nn))
+ return -EFAULT;
+ if (copy_from_user(rbuf, log->ubuf + end - nn, nn))
+ return -EFAULT;
+
+ bpf_vlog_reverse_kbuf(lbuf, nn);
+ bpf_vlog_reverse_kbuf(rbuf, nn);
+
+ /* we write lbuf to the right end of ubuf, while rbuf to the
+ * left one to end up with properly reversed overall ubuf
+ */
+ if (copy_to_user(log->ubuf + start, rbuf, nn))
+ return -EFAULT;
+ if (copy_to_user(log->ubuf + end - nn, lbuf, nn))
+ return -EFAULT;
+
+ start += nn;
+ end -= nn;
+ }
+
+ return 0;
+}
+
+int bpf_vlog_finalize(struct bpf_verifier_log *log, u32 *log_size_actual)
+{
+ u32 sublen;
+ int err;
+
+ *log_size_actual = 0;
+ if (!log || log->level == 0 || log->level == BPF_LOG_KERNEL)
+ return 0;
+
+ if (!log->ubuf)
+ goto skip_log_rotate;
+ /* If we never truncated log, there is nothing to move around. */
+ if (log->start_pos == 0)
+ goto skip_log_rotate;
+
+ /* Otherwise we need to rotate log contents to make it start from the
+ * buffer beginning and be a continuous zero-terminated string. Note
+ * that if log->start_pos != 0 then we definitely filled up entire log
+ * buffer with no gaps, and we just need to shift buffer contents to
+ * the left by (log->start_pos % log->len_total) bytes.
+ *
+ * Unfortunately, user buffer could be huge and we don't want to
+ * allocate temporary kernel memory of the same size just to shift
+ * contents in a straightforward fashion. Instead, we'll be clever and
+ * do in-place array rotation. This is a leetcode-style problem, which
+ * could be solved by three rotations.
+ *
+ * Let's say we have log buffer that has to be shifted left by 7 bytes
+ * (spaces and vertical bar is just for demonstrative purposes):
+ * E F G H I J K | A B C D
+ *
+ * First, we reverse entire array:
+ * D C B A | K J I H G F E
+ *
+ * Then we rotate first 4 bytes (DCBA) and separately last 7 bytes
+ * (KJIHGFE), resulting in a properly rotated array:
+ * A B C D | E F G H I J K
+ *
+ * We'll utilize log->kbuf to read user memory chunk by chunk, swap
+ * bytes, and write them back. Doing it byte-by-byte would be
+ * unnecessarily inefficient. Altogether we are going to read and
+ * write each byte twice, for total 4 memory copies between kernel and
+ * user space.
+ */
+
+ /* length of the chopped off part that will be the beginning;
+ * len(ABCD) in the example above
+ */
+ div_u64_rem(log->start_pos, log->len_total, &sublen);
+ sublen = log->len_total - sublen;
+
+ err = bpf_vlog_reverse_ubuf(log, 0, log->len_total);
+ err = err ?: bpf_vlog_reverse_ubuf(log, 0, sublen);
+ err = err ?: bpf_vlog_reverse_ubuf(log, sublen, log->len_total);
+ if (err)
+ log->ubuf = NULL;
+
+skip_log_rotate:
+ *log_size_actual = log->len_max;
+
+ /* properly initialized log has either both ubuf!=NULL and len_total>0
+ * or ubuf==NULL and len_total==0, so if this condition doesn't hold,
+ * we got a fault somewhere along the way, so report it back
+ */
+ if (!!log->ubuf != !!log->len_total)
+ return -EFAULT;
+
+ /* did truncation actually happen? */
+ if (log->ubuf && log->len_max > log->len_total)
+ return -ENOSPC;
+
+ return 0;
+}
+
+/* log_level controls verbosity level of eBPF verifier.
+ * bpf_verifier_log_write() is used to dump the verification trace to the log,
+ * so the user can figure out what's wrong with the program
+ */
+__printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env,
+ const char *fmt, ...)
+{
+ va_list args;
+
+ if (!bpf_verifier_log_needed(&env->log))
+ return;
+
+ va_start(args, fmt);
+ bpf_verifier_vlog(&env->log, fmt, args);
+ va_end(args);
+}
+EXPORT_SYMBOL_GPL(bpf_verifier_log_write);
+
+__printf(2, 3) void bpf_log(struct bpf_verifier_log *log,
+ const char *fmt, ...)
+{
+ va_list args;
+
+ if (!bpf_verifier_log_needed(log))
+ return;
+
+ va_start(args, fmt);
+ bpf_verifier_vlog(log, fmt, args);
+ va_end(args);
+}
+EXPORT_SYMBOL_GPL(bpf_log);
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index 423549d2c52e..e0d3ddf2037a 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -14,6 +14,7 @@
#include <linux/vmalloc.h>
#include <net/ipv6.h>
#include <uapi/linux/btf.h>
+#include <linux/btf_ids.h>
/* Intermediate node */
#define LPM_TREE_NODE_FLAG_IM BIT(0)
@@ -284,7 +285,7 @@ static struct lpm_trie_node *lpm_trie_node_alloc(const struct lpm_trie *trie,
if (value)
size += trie->map.value_size;
- node = bpf_map_kmalloc_node(&trie->map, size, GFP_ATOMIC | __GFP_NOWARN,
+ node = bpf_map_kmalloc_node(&trie->map, size, GFP_NOWAIT | __GFP_NOWARN,
trie->map.numa_node);
if (!node)
return NULL;
@@ -299,8 +300,8 @@ static struct lpm_trie_node *lpm_trie_node_alloc(const struct lpm_trie *trie,
}
/* Called from syscall or from eBPF program */
-static int trie_update_elem(struct bpf_map *map,
- void *_key, void *value, u64 flags)
+static long trie_update_elem(struct bpf_map *map,
+ void *_key, void *value, u64 flags)
{
struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
struct lpm_trie_node *node, *im_node = NULL, *new_node = NULL;
@@ -412,7 +413,7 @@ static int trie_update_elem(struct bpf_map *map,
rcu_assign_pointer(im_node->child[1], node);
}
- /* Finally, assign the intermediate node to the determined spot */
+ /* Finally, assign the intermediate node to the determined slot */
rcu_assign_pointer(*slot, im_node);
out:
@@ -430,7 +431,7 @@ out:
}
/* Called from syscall or from eBPF program */
-static int trie_delete_elem(struct bpf_map *map, void *_key)
+static long trie_delete_elem(struct bpf_map *map, void *_key)
{
struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
struct bpf_lpm_trie_key *key = _key;
@@ -557,7 +558,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr)
attr->value_size > LPM_VAL_SIZE_MAX)
return ERR_PTR(-EINVAL);
- trie = kzalloc(sizeof(*trie), GFP_USER | __GFP_NOWARN | __GFP_ACCOUNT);
+ trie = bpf_map_area_alloc(sizeof(*trie), NUMA_NO_NODE);
if (!trie)
return ERR_PTR(-ENOMEM);
@@ -608,7 +609,7 @@ static void trie_free(struct bpf_map *map)
}
out:
- kfree(trie);
+ bpf_map_area_free(trie);
}
static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key)
@@ -719,7 +720,17 @@ static int trie_check_btf(const struct bpf_map *map,
-EINVAL : 0;
}
-static int trie_map_btf_id;
+static u64 trie_mem_usage(const struct bpf_map *map)
+{
+ struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
+ u64 elem_size;
+
+ elem_size = sizeof(struct lpm_trie_node) + trie->data_size +
+ trie->map.value_size;
+ return elem_size * READ_ONCE(trie->n_entries);
+}
+
+BTF_ID_LIST_SINGLE(trie_map_btf_ids, struct, lpm_trie)
const struct bpf_map_ops trie_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
.map_alloc = trie_alloc,
@@ -732,6 +743,6 @@ const struct bpf_map_ops trie_map_ops = {
.map_update_batch = generic_map_update_batch,
.map_delete_batch = generic_map_delete_batch,
.map_check_btf = trie_check_btf,
- .map_btf_name = "lpm_trie",
- .map_btf_id = &trie_map_btf_id,
+ .map_mem_usage = trie_mem_usage,
+ .map_btf_id = &trie_map_btf_ids[0],
};
diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
index 39ab0b68cade..2c5c64c2a53b 100644
--- a/kernel/bpf/map_in_map.c
+++ b/kernel/bpf/map_in_map.c
@@ -3,6 +3,7 @@
*/
#include <linux/slab.h>
#include <linux/bpf.h>
+#include <linux/btf.h>
#include "map_in_map.h"
@@ -11,6 +12,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
struct bpf_map *inner_map, *inner_map_meta;
u32 inner_map_meta_size;
struct fd f;
+ int ret;
f = fdget(inner_map_ufd);
inner_map = __bpf_map_get(f);
@@ -19,18 +21,13 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
/* Does not support >1 level map-in-map */
if (inner_map->inner_map_meta) {
- fdput(f);
- return ERR_PTR(-EINVAL);
+ ret = -EINVAL;
+ goto put;
}
if (!inner_map->ops->map_meta_equal) {
- fdput(f);
- return ERR_PTR(-ENOTSUPP);
- }
-
- if (map_value_has_spin_lock(inner_map)) {
- fdput(f);
- return ERR_PTR(-ENOTSUPP);
+ ret = -ENOTSUPP;
+ goto put;
}
inner_map_meta_size = sizeof(*inner_map_meta);
@@ -40,8 +37,8 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
inner_map_meta = kzalloc(inner_map_meta_size, GFP_USER);
if (!inner_map_meta) {
- fdput(f);
- return ERR_PTR(-ENOMEM);
+ ret = -ENOMEM;
+ goto put;
}
inner_map_meta->map_type = inner_map->map_type;
@@ -49,7 +46,25 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
inner_map_meta->value_size = inner_map->value_size;
inner_map_meta->map_flags = inner_map->map_flags;
inner_map_meta->max_entries = inner_map->max_entries;
- inner_map_meta->spin_lock_off = inner_map->spin_lock_off;
+
+ inner_map_meta->record = btf_record_dup(inner_map->record);
+ if (IS_ERR(inner_map_meta->record)) {
+ /* btf_record_dup returns NULL or valid pointer in case of
+ * invalid/empty/valid, but ERR_PTR in case of errors. During
+ * equality NULL or IS_ERR is equivalent.
+ */
+ ret = PTR_ERR(inner_map_meta->record);
+ goto free;
+ }
+ /* Note: We must use the same BTF, as we also used btf_record_dup above
+ * which relies on BTF being same for both maps, as some members like
+ * record->fields.list_head have pointers like value_rec pointing into
+ * inner_map->btf.
+ */
+ if (inner_map->btf) {
+ btf_get(inner_map->btf);
+ inner_map_meta->btf = inner_map->btf;
+ }
/* Misc members not needed in bpf_map_meta_equal() check. */
inner_map_meta->ops = inner_map->ops;
@@ -61,10 +76,17 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
fdput(f);
return inner_map_meta;
+free:
+ kfree(inner_map_meta);
+put:
+ fdput(f);
+ return ERR_PTR(ret);
}
void bpf_map_meta_free(struct bpf_map *map_meta)
{
+ bpf_map_free_record(map_meta);
+ btf_put(map_meta->btf);
kfree(map_meta);
}
@@ -75,7 +97,8 @@ bool bpf_map_meta_equal(const struct bpf_map *meta0,
return meta0->map_type == meta1->map_type &&
meta0->key_size == meta1->key_size &&
meta0->value_size == meta1->value_size &&
- meta0->map_flags == meta1->map_flags;
+ meta0->map_flags == meta1->map_flags &&
+ btf_record_equal(meta0->record, meta1->record);
}
void *bpf_map_fd_get_ptr(struct bpf_map *map,
diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c
index 6a9542af4212..b0fa190b0979 100644
--- a/kernel/bpf/map_iter.c
+++ b/kernel/bpf/map_iter.c
@@ -174,9 +174,9 @@ static const struct bpf_iter_reg bpf_map_elem_reg_info = {
.ctx_arg_info_size = 2,
.ctx_arg_info = {
{ offsetof(struct bpf_iter__bpf_map_elem, key),
- PTR_TO_RDONLY_BUF_OR_NULL },
+ PTR_TO_BUF | PTR_MAYBE_NULL | MEM_RDONLY },
{ offsetof(struct bpf_iter__bpf_map_elem, value),
- PTR_TO_RDWR_BUF_OR_NULL },
+ PTR_TO_BUF | PTR_MAYBE_NULL },
},
};
diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
new file mode 100644
index 000000000000..410637c225fb
--- /dev/null
+++ b/kernel/bpf/memalloc.c
@@ -0,0 +1,719 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */
+#include <linux/mm.h>
+#include <linux/llist.h>
+#include <linux/bpf.h>
+#include <linux/irq_work.h>
+#include <linux/bpf_mem_alloc.h>
+#include <linux/memcontrol.h>
+#include <asm/local.h>
+
+/* Any context (including NMI) BPF specific memory allocator.
+ *
+ * Tracing BPF programs can attach to kprobe and fentry. Hence they
+ * run in unknown context where calling plain kmalloc() might not be safe.
+ *
+ * Front-end kmalloc() with per-cpu per-bucket cache of free elements.
+ * Refill this cache asynchronously from irq_work.
+ *
+ * CPU_0 buckets
+ * 16 32 64 96 128 196 256 512 1024 2048 4096
+ * ...
+ * CPU_N buckets
+ * 16 32 64 96 128 196 256 512 1024 2048 4096
+ *
+ * The buckets are prefilled at the start.
+ * BPF programs always run with migration disabled.
+ * It's safe to allocate from cache of the current cpu with irqs disabled.
+ * Free-ing is always done into bucket of the current cpu as well.
+ * irq_work trims extra free elements from buckets with kfree
+ * and refills them with kmalloc, so global kmalloc logic takes care
+ * of freeing objects allocated by one cpu and freed on another.
+ *
+ * Every allocated objected is padded with extra 8 bytes that contains
+ * struct llist_node.
+ */
+#define LLIST_NODE_SZ sizeof(struct llist_node)
+
+/* similar to kmalloc, but sizeof == 8 bucket is gone */
+static u8 size_index[24] __ro_after_init = {
+ 3, /* 8 */
+ 3, /* 16 */
+ 4, /* 24 */
+ 4, /* 32 */
+ 5, /* 40 */
+ 5, /* 48 */
+ 5, /* 56 */
+ 5, /* 64 */
+ 1, /* 72 */
+ 1, /* 80 */
+ 1, /* 88 */
+ 1, /* 96 */
+ 6, /* 104 */
+ 6, /* 112 */
+ 6, /* 120 */
+ 6, /* 128 */
+ 2, /* 136 */
+ 2, /* 144 */
+ 2, /* 152 */
+ 2, /* 160 */
+ 2, /* 168 */
+ 2, /* 176 */
+ 2, /* 184 */
+ 2 /* 192 */
+};
+
+static int bpf_mem_cache_idx(size_t size)
+{
+ if (!size || size > 4096)
+ return -1;
+
+ if (size <= 192)
+ return size_index[(size - 1) / 8] - 1;
+
+ return fls(size - 1) - 2;
+}
+
+#define NUM_CACHES 11
+
+struct bpf_mem_cache {
+ /* per-cpu list of free objects of size 'unit_size'.
+ * All accesses are done with interrupts disabled and 'active' counter
+ * protection with __llist_add() and __llist_del_first().
+ */
+ struct llist_head free_llist;
+ local_t active;
+
+ /* Operations on the free_list from unit_alloc/unit_free/bpf_mem_refill
+ * are sequenced by per-cpu 'active' counter. But unit_free() cannot
+ * fail. When 'active' is busy the unit_free() will add an object to
+ * free_llist_extra.
+ */
+ struct llist_head free_llist_extra;
+
+ struct irq_work refill_work;
+ struct obj_cgroup *objcg;
+ int unit_size;
+ /* count of objects in free_llist */
+ int free_cnt;
+ int low_watermark, high_watermark, batch;
+ int percpu_size;
+
+ struct rcu_head rcu;
+ struct llist_head free_by_rcu;
+ struct llist_head waiting_for_gp;
+ atomic_t call_rcu_in_progress;
+};
+
+struct bpf_mem_caches {
+ struct bpf_mem_cache cache[NUM_CACHES];
+};
+
+static struct llist_node notrace *__llist_del_first(struct llist_head *head)
+{
+ struct llist_node *entry, *next;
+
+ entry = head->first;
+ if (!entry)
+ return NULL;
+ next = entry->next;
+ head->first = next;
+ return entry;
+}
+
+static void *__alloc(struct bpf_mem_cache *c, int node, gfp_t flags)
+{
+ if (c->percpu_size) {
+ void **obj = kmalloc_node(c->percpu_size, flags, node);
+ void *pptr = __alloc_percpu_gfp(c->unit_size, 8, flags);
+
+ if (!obj || !pptr) {
+ free_percpu(pptr);
+ kfree(obj);
+ return NULL;
+ }
+ obj[1] = pptr;
+ return obj;
+ }
+
+ return kmalloc_node(c->unit_size, flags | __GFP_ZERO, node);
+}
+
+static struct mem_cgroup *get_memcg(const struct bpf_mem_cache *c)
+{
+#ifdef CONFIG_MEMCG_KMEM
+ if (c->objcg)
+ return get_mem_cgroup_from_objcg(c->objcg);
+#endif
+
+#ifdef CONFIG_MEMCG
+ return root_mem_cgroup;
+#else
+ return NULL;
+#endif
+}
+
+/* Mostly runs from irq_work except __init phase. */
+static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node)
+{
+ struct mem_cgroup *memcg = NULL, *old_memcg;
+ unsigned long flags;
+ void *obj;
+ int i;
+
+ memcg = get_memcg(c);
+ old_memcg = set_active_memcg(memcg);
+ for (i = 0; i < cnt; i++) {
+ /*
+ * free_by_rcu is only manipulated by irq work refill_work().
+ * IRQ works on the same CPU are called sequentially, so it is
+ * safe to use __llist_del_first() here. If alloc_bulk() is
+ * invoked by the initial prefill, there will be no running
+ * refill_work(), so __llist_del_first() is fine as well.
+ *
+ * In most cases, objects on free_by_rcu are from the same CPU.
+ * If some objects come from other CPUs, it doesn't incur any
+ * harm because NUMA_NO_NODE means the preference for current
+ * numa node and it is not a guarantee.
+ */
+ obj = __llist_del_first(&c->free_by_rcu);
+ if (!obj) {
+ /* Allocate, but don't deplete atomic reserves that typical
+ * GFP_ATOMIC would do. irq_work runs on this cpu and kmalloc
+ * will allocate from the current numa node which is what we
+ * want here.
+ */
+ obj = __alloc(c, node, GFP_NOWAIT | __GFP_NOWARN | __GFP_ACCOUNT);
+ if (!obj)
+ break;
+ }
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ /* In RT irq_work runs in per-cpu kthread, so disable
+ * interrupts to avoid preemption and interrupts and
+ * reduce the chance of bpf prog executing on this cpu
+ * when active counter is busy.
+ */
+ local_irq_save(flags);
+ /* alloc_bulk runs from irq_work which will not preempt a bpf
+ * program that does unit_alloc/unit_free since IRQs are
+ * disabled there. There is no race to increment 'active'
+ * counter. It protects free_llist from corruption in case NMI
+ * bpf prog preempted this loop.
+ */
+ WARN_ON_ONCE(local_inc_return(&c->active) != 1);
+ __llist_add(obj, &c->free_llist);
+ c->free_cnt++;
+ local_dec(&c->active);
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ local_irq_restore(flags);
+ }
+ set_active_memcg(old_memcg);
+ mem_cgroup_put(memcg);
+}
+
+static void free_one(struct bpf_mem_cache *c, void *obj)
+{
+ if (c->percpu_size) {
+ free_percpu(((void **)obj)[1]);
+ kfree(obj);
+ return;
+ }
+
+ kfree(obj);
+}
+
+static void __free_rcu(struct rcu_head *head)
+{
+ struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu);
+ struct llist_node *llnode = llist_del_all(&c->waiting_for_gp);
+ struct llist_node *pos, *t;
+
+ llist_for_each_safe(pos, t, llnode)
+ free_one(c, pos);
+ atomic_set(&c->call_rcu_in_progress, 0);
+}
+
+static void __free_rcu_tasks_trace(struct rcu_head *head)
+{
+ /* If RCU Tasks Trace grace period implies RCU grace period,
+ * there is no need to invoke call_rcu().
+ */
+ if (rcu_trace_implies_rcu_gp())
+ __free_rcu(head);
+ else
+ call_rcu(head, __free_rcu);
+}
+
+static void enque_to_free(struct bpf_mem_cache *c, void *obj)
+{
+ struct llist_node *llnode = obj;
+
+ /* bpf_mem_cache is a per-cpu object. Freeing happens in irq_work.
+ * Nothing races to add to free_by_rcu list.
+ */
+ __llist_add(llnode, &c->free_by_rcu);
+}
+
+static void do_call_rcu(struct bpf_mem_cache *c)
+{
+ struct llist_node *llnode, *t;
+
+ if (atomic_xchg(&c->call_rcu_in_progress, 1))
+ return;
+
+ WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp));
+ llist_for_each_safe(llnode, t, __llist_del_all(&c->free_by_rcu))
+ /* There is no concurrent __llist_add(waiting_for_gp) access.
+ * It doesn't race with llist_del_all either.
+ * But there could be two concurrent llist_del_all(waiting_for_gp):
+ * from __free_rcu() and from drain_mem_cache().
+ */
+ __llist_add(llnode, &c->waiting_for_gp);
+ /* Use call_rcu_tasks_trace() to wait for sleepable progs to finish.
+ * If RCU Tasks Trace grace period implies RCU grace period, free
+ * these elements directly, else use call_rcu() to wait for normal
+ * progs to finish and finally do free_one() on each element.
+ */
+ call_rcu_tasks_trace(&c->rcu, __free_rcu_tasks_trace);
+}
+
+static void free_bulk(struct bpf_mem_cache *c)
+{
+ struct llist_node *llnode, *t;
+ unsigned long flags;
+ int cnt;
+
+ do {
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ local_irq_save(flags);
+ WARN_ON_ONCE(local_inc_return(&c->active) != 1);
+ llnode = __llist_del_first(&c->free_llist);
+ if (llnode)
+ cnt = --c->free_cnt;
+ else
+ cnt = 0;
+ local_dec(&c->active);
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ local_irq_restore(flags);
+ if (llnode)
+ enque_to_free(c, llnode);
+ } while (cnt > (c->high_watermark + c->low_watermark) / 2);
+
+ /* and drain free_llist_extra */
+ llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra))
+ enque_to_free(c, llnode);
+ do_call_rcu(c);
+}
+
+static void bpf_mem_refill(struct irq_work *work)
+{
+ struct bpf_mem_cache *c = container_of(work, struct bpf_mem_cache, refill_work);
+ int cnt;
+
+ /* Racy access to free_cnt. It doesn't need to be 100% accurate */
+ cnt = c->free_cnt;
+ if (cnt < c->low_watermark)
+ /* irq_work runs on this cpu and kmalloc will allocate
+ * from the current numa node which is what we want here.
+ */
+ alloc_bulk(c, c->batch, NUMA_NO_NODE);
+ else if (cnt > c->high_watermark)
+ free_bulk(c);
+}
+
+static void notrace irq_work_raise(struct bpf_mem_cache *c)
+{
+ irq_work_queue(&c->refill_work);
+}
+
+/* For typical bpf map case that uses bpf_mem_cache_alloc and single bucket
+ * the freelist cache will be elem_size * 64 (or less) on each cpu.
+ *
+ * For bpf programs that don't have statically known allocation sizes and
+ * assuming (low_mark + high_mark) / 2 as an average number of elements per
+ * bucket and all buckets are used the total amount of memory in freelists
+ * on each cpu will be:
+ * 64*16 + 64*32 + 64*64 + 64*96 + 64*128 + 64*196 + 64*256 + 32*512 + 16*1024 + 8*2048 + 4*4096
+ * == ~ 116 Kbyte using below heuristic.
+ * Initialized, but unused bpf allocator (not bpf map specific one) will
+ * consume ~ 11 Kbyte per cpu.
+ * Typical case will be between 11K and 116K closer to 11K.
+ * bpf progs can and should share bpf_mem_cache when possible.
+ */
+
+static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu)
+{
+ init_irq_work(&c->refill_work, bpf_mem_refill);
+ if (c->unit_size <= 256) {
+ c->low_watermark = 32;
+ c->high_watermark = 96;
+ } else {
+ /* When page_size == 4k, order-0 cache will have low_mark == 2
+ * and high_mark == 6 with batch alloc of 3 individual pages at
+ * a time.
+ * 8k allocs and above low == 1, high == 3, batch == 1.
+ */
+ c->low_watermark = max(32 * 256 / c->unit_size, 1);
+ c->high_watermark = max(96 * 256 / c->unit_size, 3);
+ }
+ c->batch = max((c->high_watermark - c->low_watermark) / 4 * 3, 1);
+
+ /* To avoid consuming memory assume that 1st run of bpf
+ * prog won't be doing more than 4 map_update_elem from
+ * irq disabled region
+ */
+ alloc_bulk(c, c->unit_size <= 256 ? 4 : 1, cpu_to_node(cpu));
+}
+
+/* When size != 0 bpf_mem_cache for each cpu.
+ * This is typical bpf hash map use case when all elements have equal size.
+ *
+ * When size == 0 allocate 11 bpf_mem_cache-s for each cpu, then rely on
+ * kmalloc/kfree. Max allocation size is 4096 in this case.
+ * This is bpf_dynptr and bpf_kptr use case.
+ */
+int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
+{
+ static u16 sizes[NUM_CACHES] = {96, 192, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096};
+ struct bpf_mem_caches *cc, __percpu *pcc;
+ struct bpf_mem_cache *c, __percpu *pc;
+ struct obj_cgroup *objcg = NULL;
+ int cpu, i, unit_size, percpu_size = 0;
+
+ if (size) {
+ pc = __alloc_percpu_gfp(sizeof(*pc), 8, GFP_KERNEL);
+ if (!pc)
+ return -ENOMEM;
+
+ if (percpu)
+ /* room for llist_node and per-cpu pointer */
+ percpu_size = LLIST_NODE_SZ + sizeof(void *);
+ else
+ size += LLIST_NODE_SZ; /* room for llist_node */
+ unit_size = size;
+
+#ifdef CONFIG_MEMCG_KMEM
+ if (memcg_bpf_enabled())
+ objcg = get_obj_cgroup_from_current();
+#endif
+ for_each_possible_cpu(cpu) {
+ c = per_cpu_ptr(pc, cpu);
+ c->unit_size = unit_size;
+ c->objcg = objcg;
+ c->percpu_size = percpu_size;
+ prefill_mem_cache(c, cpu);
+ }
+ ma->cache = pc;
+ return 0;
+ }
+
+ /* size == 0 && percpu is an invalid combination */
+ if (WARN_ON_ONCE(percpu))
+ return -EINVAL;
+
+ pcc = __alloc_percpu_gfp(sizeof(*cc), 8, GFP_KERNEL);
+ if (!pcc)
+ return -ENOMEM;
+#ifdef CONFIG_MEMCG_KMEM
+ objcg = get_obj_cgroup_from_current();
+#endif
+ for_each_possible_cpu(cpu) {
+ cc = per_cpu_ptr(pcc, cpu);
+ for (i = 0; i < NUM_CACHES; i++) {
+ c = &cc->cache[i];
+ c->unit_size = sizes[i];
+ c->objcg = objcg;
+ prefill_mem_cache(c, cpu);
+ }
+ }
+ ma->caches = pcc;
+ return 0;
+}
+
+static void drain_mem_cache(struct bpf_mem_cache *c)
+{
+ struct llist_node *llnode, *t;
+
+ /* No progs are using this bpf_mem_cache, but htab_map_free() called
+ * bpf_mem_cache_free() for all remaining elements and they can be in
+ * free_by_rcu or in waiting_for_gp lists, so drain those lists now.
+ *
+ * Except for waiting_for_gp list, there are no concurrent operations
+ * on these lists, so it is safe to use __llist_del_all().
+ */
+ llist_for_each_safe(llnode, t, __llist_del_all(&c->free_by_rcu))
+ free_one(c, llnode);
+ llist_for_each_safe(llnode, t, llist_del_all(&c->waiting_for_gp))
+ free_one(c, llnode);
+ llist_for_each_safe(llnode, t, __llist_del_all(&c->free_llist))
+ free_one(c, llnode);
+ llist_for_each_safe(llnode, t, __llist_del_all(&c->free_llist_extra))
+ free_one(c, llnode);
+}
+
+static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma)
+{
+ free_percpu(ma->cache);
+ free_percpu(ma->caches);
+ ma->cache = NULL;
+ ma->caches = NULL;
+}
+
+static void free_mem_alloc(struct bpf_mem_alloc *ma)
+{
+ /* waiting_for_gp lists was drained, but __free_rcu might
+ * still execute. Wait for it now before we freeing percpu caches.
+ *
+ * rcu_barrier_tasks_trace() doesn't imply synchronize_rcu_tasks_trace(),
+ * but rcu_barrier_tasks_trace() and rcu_barrier() below are only used
+ * to wait for the pending __free_rcu_tasks_trace() and __free_rcu(),
+ * so if call_rcu(head, __free_rcu) is skipped due to
+ * rcu_trace_implies_rcu_gp(), it will be OK to skip rcu_barrier() by
+ * using rcu_trace_implies_rcu_gp() as well.
+ */
+ rcu_barrier_tasks_trace();
+ if (!rcu_trace_implies_rcu_gp())
+ rcu_barrier();
+ free_mem_alloc_no_barrier(ma);
+}
+
+static void free_mem_alloc_deferred(struct work_struct *work)
+{
+ struct bpf_mem_alloc *ma = container_of(work, struct bpf_mem_alloc, work);
+
+ free_mem_alloc(ma);
+ kfree(ma);
+}
+
+static void destroy_mem_alloc(struct bpf_mem_alloc *ma, int rcu_in_progress)
+{
+ struct bpf_mem_alloc *copy;
+
+ if (!rcu_in_progress) {
+ /* Fast path. No callbacks are pending, hence no need to do
+ * rcu_barrier-s.
+ */
+ free_mem_alloc_no_barrier(ma);
+ return;
+ }
+
+ copy = kmalloc(sizeof(*ma), GFP_KERNEL);
+ if (!copy) {
+ /* Slow path with inline barrier-s */
+ free_mem_alloc(ma);
+ return;
+ }
+
+ /* Defer barriers into worker to let the rest of map memory to be freed */
+ copy->cache = ma->cache;
+ ma->cache = NULL;
+ copy->caches = ma->caches;
+ ma->caches = NULL;
+ INIT_WORK(&copy->work, free_mem_alloc_deferred);
+ queue_work(system_unbound_wq, &copy->work);
+}
+
+void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)
+{
+ struct bpf_mem_caches *cc;
+ struct bpf_mem_cache *c;
+ int cpu, i, rcu_in_progress;
+
+ if (ma->cache) {
+ rcu_in_progress = 0;
+ for_each_possible_cpu(cpu) {
+ c = per_cpu_ptr(ma->cache, cpu);
+ /*
+ * refill_work may be unfinished for PREEMPT_RT kernel
+ * in which irq work is invoked in a per-CPU RT thread.
+ * It is also possible for kernel with
+ * arch_irq_work_has_interrupt() being false and irq
+ * work is invoked in timer interrupt. So waiting for
+ * the completion of irq work to ease the handling of
+ * concurrency.
+ */
+ irq_work_sync(&c->refill_work);
+ drain_mem_cache(c);
+ rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
+ }
+ /* objcg is the same across cpus */
+ if (c->objcg)
+ obj_cgroup_put(c->objcg);
+ destroy_mem_alloc(ma, rcu_in_progress);
+ }
+ if (ma->caches) {
+ rcu_in_progress = 0;
+ for_each_possible_cpu(cpu) {
+ cc = per_cpu_ptr(ma->caches, cpu);
+ for (i = 0; i < NUM_CACHES; i++) {
+ c = &cc->cache[i];
+ irq_work_sync(&c->refill_work);
+ drain_mem_cache(c);
+ rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
+ }
+ }
+ if (c->objcg)
+ obj_cgroup_put(c->objcg);
+ destroy_mem_alloc(ma, rcu_in_progress);
+ }
+}
+
+/* notrace is necessary here and in other functions to make sure
+ * bpf programs cannot attach to them and cause llist corruptions.
+ */
+static void notrace *unit_alloc(struct bpf_mem_cache *c)
+{
+ struct llist_node *llnode = NULL;
+ unsigned long flags;
+ int cnt = 0;
+
+ /* Disable irqs to prevent the following race for majority of prog types:
+ * prog_A
+ * bpf_mem_alloc
+ * preemption or irq -> prog_B
+ * bpf_mem_alloc
+ *
+ * but prog_B could be a perf_event NMI prog.
+ * Use per-cpu 'active' counter to order free_list access between
+ * unit_alloc/unit_free/bpf_mem_refill.
+ */
+ local_irq_save(flags);
+ if (local_inc_return(&c->active) == 1) {
+ llnode = __llist_del_first(&c->free_llist);
+ if (llnode)
+ cnt = --c->free_cnt;
+ }
+ local_dec(&c->active);
+ local_irq_restore(flags);
+
+ WARN_ON(cnt < 0);
+
+ if (cnt < c->low_watermark)
+ irq_work_raise(c);
+ return llnode;
+}
+
+/* Though 'ptr' object could have been allocated on a different cpu
+ * add it to the free_llist of the current cpu.
+ * Let kfree() logic deal with it when it's later called from irq_work.
+ */
+static void notrace unit_free(struct bpf_mem_cache *c, void *ptr)
+{
+ struct llist_node *llnode = ptr - LLIST_NODE_SZ;
+ unsigned long flags;
+ int cnt = 0;
+
+ BUILD_BUG_ON(LLIST_NODE_SZ > 8);
+
+ local_irq_save(flags);
+ if (local_inc_return(&c->active) == 1) {
+ __llist_add(llnode, &c->free_llist);
+ cnt = ++c->free_cnt;
+ } else {
+ /* unit_free() cannot fail. Therefore add an object to atomic
+ * llist. free_bulk() will drain it. Though free_llist_extra is
+ * a per-cpu list we have to use atomic llist_add here, since
+ * it also can be interrupted by bpf nmi prog that does another
+ * unit_free() into the same free_llist_extra.
+ */
+ llist_add(llnode, &c->free_llist_extra);
+ }
+ local_dec(&c->active);
+ local_irq_restore(flags);
+
+ if (cnt > c->high_watermark)
+ /* free few objects from current cpu into global kmalloc pool */
+ irq_work_raise(c);
+}
+
+/* Called from BPF program or from sys_bpf syscall.
+ * In both cases migration is disabled.
+ */
+void notrace *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size)
+{
+ int idx;
+ void *ret;
+
+ if (!size)
+ return ZERO_SIZE_PTR;
+
+ idx = bpf_mem_cache_idx(size + LLIST_NODE_SZ);
+ if (idx < 0)
+ return NULL;
+
+ ret = unit_alloc(this_cpu_ptr(ma->caches)->cache + idx);
+ return !ret ? NULL : ret + LLIST_NODE_SZ;
+}
+
+void notrace bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr)
+{
+ int idx;
+
+ if (!ptr)
+ return;
+
+ idx = bpf_mem_cache_idx(ksize(ptr - LLIST_NODE_SZ));
+ if (idx < 0)
+ return;
+
+ unit_free(this_cpu_ptr(ma->caches)->cache + idx, ptr);
+}
+
+void notrace *bpf_mem_cache_alloc(struct bpf_mem_alloc *ma)
+{
+ void *ret;
+
+ ret = unit_alloc(this_cpu_ptr(ma->cache));
+ return !ret ? NULL : ret + LLIST_NODE_SZ;
+}
+
+void notrace bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr)
+{
+ if (!ptr)
+ return;
+
+ unit_free(this_cpu_ptr(ma->cache), ptr);
+}
+
+/* Directly does a kfree() without putting 'ptr' back to the free_llist
+ * for reuse and without waiting for a rcu_tasks_trace gp.
+ * The caller must first go through the rcu_tasks_trace gp for 'ptr'
+ * before calling bpf_mem_cache_raw_free().
+ * It could be used when the rcu_tasks_trace callback does not have
+ * a hold on the original bpf_mem_alloc object that allocated the
+ * 'ptr'. This should only be used in the uncommon code path.
+ * Otherwise, the bpf_mem_alloc's free_llist cannot be refilled
+ * and may affect performance.
+ */
+void bpf_mem_cache_raw_free(void *ptr)
+{
+ if (!ptr)
+ return;
+
+ kfree(ptr - LLIST_NODE_SZ);
+}
+
+/* When flags == GFP_KERNEL, it signals that the caller will not cause
+ * deadlock when using kmalloc. bpf_mem_cache_alloc_flags() will use
+ * kmalloc if the free_llist is empty.
+ */
+void notrace *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags)
+{
+ struct bpf_mem_cache *c;
+ void *ret;
+
+ c = this_cpu_ptr(ma->cache);
+
+ ret = unit_alloc(c);
+ if (!ret && flags == GFP_KERNEL) {
+ struct mem_cgroup *memcg, *old_memcg;
+
+ memcg = get_memcg(c);
+ old_memcg = set_active_memcg(memcg);
+ ret = __alloc(c, NUMA_NO_NODE, GFP_KERNEL | __GFP_NOWARN | __GFP_ACCOUNT);
+ set_active_memcg(old_memcg);
+ mem_cgroup_put(memcg);
+ }
+
+ return !ret ? NULL : ret + LLIST_NODE_SZ;
+}
diff --git a/kernel/bpf/mmap_unlock_work.h b/kernel/bpf/mmap_unlock_work.h
new file mode 100644
index 000000000000..5d18d7d85bef
--- /dev/null
+++ b/kernel/bpf/mmap_unlock_work.h
@@ -0,0 +1,65 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright (c) 2021 Facebook
+ */
+
+#ifndef __MMAP_UNLOCK_WORK_H__
+#define __MMAP_UNLOCK_WORK_H__
+#include <linux/irq_work.h>
+
+/* irq_work to run mmap_read_unlock() in irq_work */
+struct mmap_unlock_irq_work {
+ struct irq_work irq_work;
+ struct mm_struct *mm;
+};
+
+DECLARE_PER_CPU(struct mmap_unlock_irq_work, mmap_unlock_work);
+
+/*
+ * We cannot do mmap_read_unlock() when the irq is disabled, because of
+ * risk to deadlock with rq_lock. To look up vma when the irqs are
+ * disabled, we need to run mmap_read_unlock() in irq_work. We use a
+ * percpu variable to do the irq_work. If the irq_work is already used
+ * by another lookup, we fall over.
+ */
+static inline bool bpf_mmap_unlock_get_irq_work(struct mmap_unlock_irq_work **work_ptr)
+{
+ struct mmap_unlock_irq_work *work = NULL;
+ bool irq_work_busy = false;
+
+ if (irqs_disabled()) {
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
+ work = this_cpu_ptr(&mmap_unlock_work);
+ if (irq_work_is_busy(&work->irq_work)) {
+ /* cannot queue more up_read, fallback */
+ irq_work_busy = true;
+ }
+ } else {
+ /*
+ * PREEMPT_RT does not allow to trylock mmap sem in
+ * interrupt disabled context. Force the fallback code.
+ */
+ irq_work_busy = true;
+ }
+ }
+
+ *work_ptr = work;
+ return irq_work_busy;
+}
+
+static inline void bpf_mmap_unlock_mm(struct mmap_unlock_irq_work *work, struct mm_struct *mm)
+{
+ if (!work) {
+ mmap_read_unlock(mm);
+ } else {
+ work->mm = mm;
+
+ /* The lock will be released once we're out of interrupt
+ * context. Tell lockdep that we've released it now so
+ * it doesn't complain that we forgot to release it.
+ */
+ rwsem_release(&mm->mmap_lock.dep_map, _RET_IP_);
+ irq_work_queue(&work->irq_work);
+ }
+}
+
+#endif /* __MMAP_UNLOCK_WORK_H__ */
diff --git a/kernel/bpf/net_namespace.c b/kernel/bpf/net_namespace.c
index 542f275bf252..868cc2c43899 100644
--- a/kernel/bpf/net_namespace.c
+++ b/kernel/bpf/net_namespace.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/bpf.h>
+#include <linux/bpf-netns.h>
#include <linux/filter.h>
#include <net/net_namespace.h>
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index bd09290e3648..d9c9f45e3529 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -41,7 +41,7 @@ struct bpf_offload_dev {
struct bpf_offload_netdev {
struct rhash_head l;
struct net_device *netdev;
- struct bpf_offload_dev *offdev;
+ struct bpf_offload_dev *offdev; /* NULL when bound-only */
struct list_head progs;
struct list_head maps;
struct list_head offdev_netdevs;
@@ -56,7 +56,6 @@ static const struct rhashtable_params offdevs_params = {
};
static struct rhashtable offdevs;
-static bool offdevs_inited;
static int bpf_dev_offload_check(struct net_device *netdev)
{
@@ -72,58 +71,218 @@ bpf_offload_find_netdev(struct net_device *netdev)
{
lockdep_assert_held(&bpf_devs_lock);
- if (!offdevs_inited)
- return NULL;
return rhashtable_lookup_fast(&offdevs, &netdev, offdevs_params);
}
-int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr)
+static int __bpf_offload_dev_netdev_register(struct bpf_offload_dev *offdev,
+ struct net_device *netdev)
{
struct bpf_offload_netdev *ondev;
- struct bpf_prog_offload *offload;
int err;
- if (attr->prog_type != BPF_PROG_TYPE_SCHED_CLS &&
- attr->prog_type != BPF_PROG_TYPE_XDP)
- return -EINVAL;
+ ondev = kzalloc(sizeof(*ondev), GFP_KERNEL);
+ if (!ondev)
+ return -ENOMEM;
- if (attr->prog_flags)
- return -EINVAL;
+ ondev->netdev = netdev;
+ ondev->offdev = offdev;
+ INIT_LIST_HEAD(&ondev->progs);
+ INIT_LIST_HEAD(&ondev->maps);
+
+ err = rhashtable_insert_fast(&offdevs, &ondev->l, offdevs_params);
+ if (err) {
+ netdev_warn(netdev, "failed to register for BPF offload\n");
+ goto err_free;
+ }
+
+ if (offdev)
+ list_add(&ondev->offdev_netdevs, &offdev->netdevs);
+ return 0;
+
+err_free:
+ kfree(ondev);
+ return err;
+}
+
+static void __bpf_prog_offload_destroy(struct bpf_prog *prog)
+{
+ struct bpf_prog_offload *offload = prog->aux->offload;
+
+ if (offload->dev_state)
+ offload->offdev->ops->destroy(prog);
+
+ list_del_init(&offload->offloads);
+ kfree(offload);
+ prog->aux->offload = NULL;
+}
+
+static int bpf_map_offload_ndo(struct bpf_offloaded_map *offmap,
+ enum bpf_netdev_command cmd)
+{
+ struct netdev_bpf data = {};
+ struct net_device *netdev;
+
+ ASSERT_RTNL();
+
+ data.command = cmd;
+ data.offmap = offmap;
+ /* Caller must make sure netdev is valid */
+ netdev = offmap->netdev;
+
+ return netdev->netdev_ops->ndo_bpf(netdev, &data);
+}
+
+static void __bpf_map_offload_destroy(struct bpf_offloaded_map *offmap)
+{
+ WARN_ON(bpf_map_offload_ndo(offmap, BPF_OFFLOAD_MAP_FREE));
+ /* Make sure BPF_MAP_GET_NEXT_ID can't find this dead map */
+ bpf_map_free_id(&offmap->map);
+ list_del_init(&offmap->offloads);
+ offmap->netdev = NULL;
+}
+
+static void __bpf_offload_dev_netdev_unregister(struct bpf_offload_dev *offdev,
+ struct net_device *netdev)
+{
+ struct bpf_offload_netdev *ondev, *altdev = NULL;
+ struct bpf_offloaded_map *offmap, *mtmp;
+ struct bpf_prog_offload *offload, *ptmp;
+
+ ASSERT_RTNL();
+
+ ondev = rhashtable_lookup_fast(&offdevs, &netdev, offdevs_params);
+ if (WARN_ON(!ondev))
+ return;
+
+ WARN_ON(rhashtable_remove_fast(&offdevs, &ondev->l, offdevs_params));
+
+ /* Try to move the objects to another netdev of the device */
+ if (offdev) {
+ list_del(&ondev->offdev_netdevs);
+ altdev = list_first_entry_or_null(&offdev->netdevs,
+ struct bpf_offload_netdev,
+ offdev_netdevs);
+ }
+
+ if (altdev) {
+ list_for_each_entry(offload, &ondev->progs, offloads)
+ offload->netdev = altdev->netdev;
+ list_splice_init(&ondev->progs, &altdev->progs);
+
+ list_for_each_entry(offmap, &ondev->maps, offloads)
+ offmap->netdev = altdev->netdev;
+ list_splice_init(&ondev->maps, &altdev->maps);
+ } else {
+ list_for_each_entry_safe(offload, ptmp, &ondev->progs, offloads)
+ __bpf_prog_offload_destroy(offload->prog);
+ list_for_each_entry_safe(offmap, mtmp, &ondev->maps, offloads)
+ __bpf_map_offload_destroy(offmap);
+ }
+
+ WARN_ON(!list_empty(&ondev->progs));
+ WARN_ON(!list_empty(&ondev->maps));
+ kfree(ondev);
+}
+
+static int __bpf_prog_dev_bound_init(struct bpf_prog *prog, struct net_device *netdev)
+{
+ struct bpf_offload_netdev *ondev;
+ struct bpf_prog_offload *offload;
+ int err;
offload = kzalloc(sizeof(*offload), GFP_USER);
if (!offload)
return -ENOMEM;
offload->prog = prog;
+ offload->netdev = netdev;
- offload->netdev = dev_get_by_index(current->nsproxy->net_ns,
- attr->prog_ifindex);
- err = bpf_dev_offload_check(offload->netdev);
- if (err)
- goto err_maybe_put;
-
- down_write(&bpf_devs_lock);
ondev = bpf_offload_find_netdev(offload->netdev);
if (!ondev) {
- err = -EINVAL;
- goto err_unlock;
+ if (bpf_prog_is_offloaded(prog->aux)) {
+ err = -EINVAL;
+ goto err_free;
+ }
+
+ /* When only binding to the device, explicitly
+ * create an entry in the hashtable.
+ */
+ err = __bpf_offload_dev_netdev_register(NULL, offload->netdev);
+ if (err)
+ goto err_free;
+ ondev = bpf_offload_find_netdev(offload->netdev);
}
offload->offdev = ondev->offdev;
prog->aux->offload = offload;
list_add_tail(&offload->offloads, &ondev->progs);
- dev_put(offload->netdev);
- up_write(&bpf_devs_lock);
return 0;
-err_unlock:
- up_write(&bpf_devs_lock);
-err_maybe_put:
- if (offload->netdev)
- dev_put(offload->netdev);
+err_free:
kfree(offload);
return err;
}
+int bpf_prog_dev_bound_init(struct bpf_prog *prog, union bpf_attr *attr)
+{
+ struct net_device *netdev;
+ int err;
+
+ if (attr->prog_type != BPF_PROG_TYPE_SCHED_CLS &&
+ attr->prog_type != BPF_PROG_TYPE_XDP)
+ return -EINVAL;
+
+ if (attr->prog_flags & ~BPF_F_XDP_DEV_BOUND_ONLY)
+ return -EINVAL;
+
+ if (attr->prog_type == BPF_PROG_TYPE_SCHED_CLS &&
+ attr->prog_flags & BPF_F_XDP_DEV_BOUND_ONLY)
+ return -EINVAL;
+
+ netdev = dev_get_by_index(current->nsproxy->net_ns, attr->prog_ifindex);
+ if (!netdev)
+ return -EINVAL;
+
+ err = bpf_dev_offload_check(netdev);
+ if (err)
+ goto out;
+
+ prog->aux->offload_requested = !(attr->prog_flags & BPF_F_XDP_DEV_BOUND_ONLY);
+
+ down_write(&bpf_devs_lock);
+ err = __bpf_prog_dev_bound_init(prog, netdev);
+ up_write(&bpf_devs_lock);
+
+out:
+ dev_put(netdev);
+ return err;
+}
+
+int bpf_prog_dev_bound_inherit(struct bpf_prog *new_prog, struct bpf_prog *old_prog)
+{
+ int err;
+
+ if (!bpf_prog_is_dev_bound(old_prog->aux))
+ return 0;
+
+ if (bpf_prog_is_offloaded(old_prog->aux))
+ return -EINVAL;
+
+ new_prog->aux->dev_bound = old_prog->aux->dev_bound;
+ new_prog->aux->offload_requested = old_prog->aux->offload_requested;
+
+ down_write(&bpf_devs_lock);
+ if (!old_prog->aux->offload) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ err = __bpf_prog_dev_bound_init(new_prog, old_prog->aux->offload->netdev);
+
+out:
+ up_write(&bpf_devs_lock);
+ return err;
+}
+
int bpf_prog_offload_verifier_prep(struct bpf_prog *prog)
{
struct bpf_prog_offload *offload;
@@ -209,27 +368,25 @@ bpf_prog_offload_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt)
up_read(&bpf_devs_lock);
}
-static void __bpf_prog_offload_destroy(struct bpf_prog *prog)
+void bpf_prog_dev_bound_destroy(struct bpf_prog *prog)
{
- struct bpf_prog_offload *offload = prog->aux->offload;
-
- if (offload->dev_state)
- offload->offdev->ops->destroy(prog);
-
- /* Make sure BPF_PROG_GET_NEXT_ID can't find this dead program */
- bpf_prog_free_id(prog, true);
-
- list_del_init(&offload->offloads);
- kfree(offload);
- prog->aux->offload = NULL;
-}
+ struct bpf_offload_netdev *ondev;
+ struct net_device *netdev;
-void bpf_prog_offload_destroy(struct bpf_prog *prog)
-{
+ rtnl_lock();
down_write(&bpf_devs_lock);
- if (prog->aux->offload)
+ if (prog->aux->offload) {
+ list_del_init(&prog->aux->offload->offloads);
+
+ netdev = prog->aux->offload->netdev;
__bpf_prog_offload_destroy(prog);
+
+ ondev = bpf_offload_find_netdev(netdev);
+ if (!ondev->offdev && list_empty(&ondev->progs))
+ __bpf_offload_dev_netdev_unregister(NULL, netdev);
+ }
up_write(&bpf_devs_lock);
+ rtnl_unlock();
}
static int bpf_prog_offload_translate(struct bpf_prog *prog)
@@ -343,22 +500,6 @@ int bpf_prog_offload_info_fill(struct bpf_prog_info *info,
const struct bpf_prog_ops bpf_offload_prog_ops = {
};
-static int bpf_map_offload_ndo(struct bpf_offloaded_map *offmap,
- enum bpf_netdev_command cmd)
-{
- struct netdev_bpf data = {};
- struct net_device *netdev;
-
- ASSERT_RTNL();
-
- data.command = cmd;
- data.offmap = offmap;
- /* Caller must make sure netdev is valid */
- netdev = offmap->netdev;
-
- return netdev->netdev_ops->ndo_bpf(netdev, &data);
-}
-
struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr)
{
struct net *net = current->nsproxy->net_ns;
@@ -372,7 +513,7 @@ struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr)
attr->map_type != BPF_MAP_TYPE_HASH)
return ERR_PTR(-EINVAL);
- offmap = kzalloc(sizeof(*offmap), GFP_USER);
+ offmap = bpf_map_area_alloc(sizeof(*offmap), NUMA_NO_NODE);
if (!offmap)
return ERR_PTR(-ENOMEM);
@@ -404,19 +545,10 @@ struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr)
err_unlock:
up_write(&bpf_devs_lock);
rtnl_unlock();
- kfree(offmap);
+ bpf_map_area_free(offmap);
return ERR_PTR(err);
}
-static void __bpf_map_offload_destroy(struct bpf_offloaded_map *offmap)
-{
- WARN_ON(bpf_map_offload_ndo(offmap, BPF_OFFLOAD_MAP_FREE));
- /* Make sure BPF_MAP_GET_NEXT_ID can't find this dead map */
- bpf_map_free_id(&offmap->map, true);
- list_del_init(&offmap->offloads);
- offmap->netdev = NULL;
-}
-
void bpf_map_offload_map_free(struct bpf_map *map)
{
struct bpf_offloaded_map *offmap = map_to_offmap(map);
@@ -428,7 +560,13 @@ void bpf_map_offload_map_free(struct bpf_map *map)
up_write(&bpf_devs_lock);
rtnl_unlock();
- kfree(offmap);
+ bpf_map_area_free(offmap);
+}
+
+u64 bpf_map_offload_map_mem_usage(const struct bpf_map *map)
+{
+ /* The memory dynamically allocated in netdev dev_ops is not counted */
+ return sizeof(struct bpf_offloaded_map);
}
int bpf_map_offload_lookup_elem(struct bpf_map *map, void *key, void *value)
@@ -576,12 +714,28 @@ bool bpf_offload_dev_match(struct bpf_prog *prog, struct net_device *netdev)
}
EXPORT_SYMBOL_GPL(bpf_offload_dev_match);
+bool bpf_prog_dev_bound_match(const struct bpf_prog *lhs, const struct bpf_prog *rhs)
+{
+ bool ret;
+
+ if (bpf_prog_is_offloaded(lhs->aux) != bpf_prog_is_offloaded(rhs->aux))
+ return false;
+
+ down_read(&bpf_devs_lock);
+ ret = lhs->aux->offload && rhs->aux->offload &&
+ lhs->aux->offload->netdev &&
+ lhs->aux->offload->netdev == rhs->aux->offload->netdev;
+ up_read(&bpf_devs_lock);
+
+ return ret;
+}
+
bool bpf_offload_prog_map_match(struct bpf_prog *prog, struct bpf_map *map)
{
struct bpf_offloaded_map *offmap;
bool ret;
- if (!bpf_map_is_dev_bound(map))
+ if (!bpf_map_is_offloaded(map))
return bpf_map_offload_neutral(map);
offmap = map_to_offmap(map);
@@ -595,32 +749,11 @@ bool bpf_offload_prog_map_match(struct bpf_prog *prog, struct bpf_map *map)
int bpf_offload_dev_netdev_register(struct bpf_offload_dev *offdev,
struct net_device *netdev)
{
- struct bpf_offload_netdev *ondev;
int err;
- ondev = kzalloc(sizeof(*ondev), GFP_KERNEL);
- if (!ondev)
- return -ENOMEM;
-
- ondev->netdev = netdev;
- ondev->offdev = offdev;
- INIT_LIST_HEAD(&ondev->progs);
- INIT_LIST_HEAD(&ondev->maps);
-
down_write(&bpf_devs_lock);
- err = rhashtable_insert_fast(&offdevs, &ondev->l, offdevs_params);
- if (err) {
- netdev_warn(netdev, "failed to register for BPF offload\n");
- goto err_unlock_free;
- }
-
- list_add(&ondev->offdev_netdevs, &offdev->netdevs);
+ err = __bpf_offload_dev_netdev_register(offdev, netdev);
up_write(&bpf_devs_lock);
- return 0;
-
-err_unlock_free:
- up_write(&bpf_devs_lock);
- kfree(ondev);
return err;
}
EXPORT_SYMBOL_GPL(bpf_offload_dev_netdev_register);
@@ -628,43 +761,8 @@ EXPORT_SYMBOL_GPL(bpf_offload_dev_netdev_register);
void bpf_offload_dev_netdev_unregister(struct bpf_offload_dev *offdev,
struct net_device *netdev)
{
- struct bpf_offload_netdev *ondev, *altdev;
- struct bpf_offloaded_map *offmap, *mtmp;
- struct bpf_prog_offload *offload, *ptmp;
-
- ASSERT_RTNL();
-
down_write(&bpf_devs_lock);
- ondev = rhashtable_lookup_fast(&offdevs, &netdev, offdevs_params);
- if (WARN_ON(!ondev))
- goto unlock;
-
- WARN_ON(rhashtable_remove_fast(&offdevs, &ondev->l, offdevs_params));
- list_del(&ondev->offdev_netdevs);
-
- /* Try to move the objects to another netdev of the device */
- altdev = list_first_entry_or_null(&offdev->netdevs,
- struct bpf_offload_netdev,
- offdev_netdevs);
- if (altdev) {
- list_for_each_entry(offload, &ondev->progs, offloads)
- offload->netdev = altdev->netdev;
- list_splice_init(&ondev->progs, &altdev->progs);
-
- list_for_each_entry(offmap, &ondev->maps, offloads)
- offmap->netdev = altdev->netdev;
- list_splice_init(&ondev->maps, &altdev->maps);
- } else {
- list_for_each_entry_safe(offload, ptmp, &ondev->progs, offloads)
- __bpf_prog_offload_destroy(offload->prog);
- list_for_each_entry_safe(offmap, mtmp, &ondev->maps, offloads)
- __bpf_map_offload_destroy(offmap);
- }
-
- WARN_ON(!list_empty(&ondev->progs));
- WARN_ON(!list_empty(&ondev->maps));
- kfree(ondev);
-unlock:
+ __bpf_offload_dev_netdev_unregister(offdev, netdev);
up_write(&bpf_devs_lock);
}
EXPORT_SYMBOL_GPL(bpf_offload_dev_netdev_unregister);
@@ -673,18 +771,6 @@ struct bpf_offload_dev *
bpf_offload_dev_create(const struct bpf_prog_offload_ops *ops, void *priv)
{
struct bpf_offload_dev *offdev;
- int err;
-
- down_write(&bpf_devs_lock);
- if (!offdevs_inited) {
- err = rhashtable_init(&offdevs, &offdevs_params);
- if (err) {
- up_write(&bpf_devs_lock);
- return ERR_PTR(err);
- }
- offdevs_inited = true;
- }
- up_write(&bpf_devs_lock);
offdev = kzalloc(sizeof(*offdev), GFP_KERNEL);
if (!offdev)
@@ -710,3 +796,67 @@ void *bpf_offload_dev_priv(struct bpf_offload_dev *offdev)
return offdev->priv;
}
EXPORT_SYMBOL_GPL(bpf_offload_dev_priv);
+
+void bpf_dev_bound_netdev_unregister(struct net_device *dev)
+{
+ struct bpf_offload_netdev *ondev;
+
+ ASSERT_RTNL();
+
+ down_write(&bpf_devs_lock);
+ ondev = bpf_offload_find_netdev(dev);
+ if (ondev && !ondev->offdev)
+ __bpf_offload_dev_netdev_unregister(NULL, ondev->netdev);
+ up_write(&bpf_devs_lock);
+}
+
+int bpf_dev_bound_kfunc_check(struct bpf_verifier_log *log,
+ struct bpf_prog_aux *prog_aux)
+{
+ if (!bpf_prog_is_dev_bound(prog_aux)) {
+ bpf_log(log, "metadata kfuncs require device-bound program\n");
+ return -EINVAL;
+ }
+
+ if (bpf_prog_is_offloaded(prog_aux)) {
+ bpf_log(log, "metadata kfuncs can't be offloaded\n");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+void *bpf_dev_bound_resolve_kfunc(struct bpf_prog *prog, u32 func_id)
+{
+ const struct xdp_metadata_ops *ops;
+ void *p = NULL;
+
+ /* We don't hold bpf_devs_lock while resolving several
+ * kfuncs and can race with the unregister_netdevice().
+ * We rely on bpf_dev_bound_match() check at attach
+ * to render this program unusable.
+ */
+ down_read(&bpf_devs_lock);
+ if (!prog->aux->offload)
+ goto out;
+
+ ops = prog->aux->offload->netdev->xdp_metadata_ops;
+ if (!ops)
+ goto out;
+
+ if (func_id == bpf_xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_RX_TIMESTAMP))
+ p = ops->xmo_rx_timestamp;
+ else if (func_id == bpf_xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_RX_HASH))
+ p = ops->xmo_rx_hash;
+out:
+ up_read(&bpf_devs_lock);
+
+ return p;
+}
+
+static int __init bpf_offload_init(void)
+{
+ return rhashtable_init(&offdevs, &offdevs_params);
+}
+
+late_initcall(bpf_offload_init);
diff --git a/kernel/bpf/percpu_freelist.c b/kernel/bpf/percpu_freelist.c
index 3d897de89061..034cf87b54e9 100644
--- a/kernel/bpf/percpu_freelist.c
+++ b/kernel/bpf/percpu_freelist.c
@@ -31,7 +31,7 @@ static inline void pcpu_freelist_push_node(struct pcpu_freelist_head *head,
struct pcpu_freelist_node *node)
{
node->next = head->first;
- head->first = node;
+ WRITE_ONCE(head->first, node);
}
static inline void ___pcpu_freelist_push(struct pcpu_freelist_head *head,
@@ -58,23 +58,21 @@ static inline void ___pcpu_freelist_push_nmi(struct pcpu_freelist *s,
{
int cpu, orig_cpu;
- orig_cpu = cpu = raw_smp_processor_id();
+ orig_cpu = raw_smp_processor_id();
while (1) {
- struct pcpu_freelist_head *head;
+ for_each_cpu_wrap(cpu, cpu_possible_mask, orig_cpu) {
+ struct pcpu_freelist_head *head;
- head = per_cpu_ptr(s->freelist, cpu);
- if (raw_spin_trylock(&head->lock)) {
- pcpu_freelist_push_node(head, node);
- raw_spin_unlock(&head->lock);
- return;
+ head = per_cpu_ptr(s->freelist, cpu);
+ if (raw_spin_trylock(&head->lock)) {
+ pcpu_freelist_push_node(head, node);
+ raw_spin_unlock(&head->lock);
+ return;
+ }
}
- cpu = cpumask_next(cpu, cpu_possible_mask);
- if (cpu >= nr_cpu_ids)
- cpu = 0;
/* cannot lock any per cpu lock, try extralist */
- if (cpu == orig_cpu &&
- pcpu_freelist_try_push_extra(s, node))
+ if (pcpu_freelist_try_push_extra(s, node))
return;
}
}
@@ -102,22 +100,21 @@ void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size,
u32 nr_elems)
{
struct pcpu_freelist_head *head;
- int i, cpu, pcpu_entries;
+ unsigned int cpu, cpu_idx, i, j, n, m;
- pcpu_entries = nr_elems / num_possible_cpus() + 1;
- i = 0;
+ n = nr_elems / num_possible_cpus();
+ m = nr_elems % num_possible_cpus();
+ cpu_idx = 0;
for_each_possible_cpu(cpu) {
-again:
head = per_cpu_ptr(s->freelist, cpu);
- /* No locking required as this is not visible yet. */
- pcpu_freelist_push_node(head, buf);
- i++;
- buf += elem_size;
- if (i == nr_elems)
- break;
- if (i % pcpu_entries)
- goto again;
+ j = n + (cpu_idx < m ? 1 : 0);
+ for (i = 0; i < j; i++) {
+ /* No locking required as this is not visible yet. */
+ pcpu_freelist_push_node(head, buf);
+ buf += elem_size;
+ }
+ cpu_idx++;
}
}
@@ -125,31 +122,29 @@ static struct pcpu_freelist_node *___pcpu_freelist_pop(struct pcpu_freelist *s)
{
struct pcpu_freelist_head *head;
struct pcpu_freelist_node *node;
- int orig_cpu, cpu;
+ int cpu;
- orig_cpu = cpu = raw_smp_processor_id();
- while (1) {
+ for_each_cpu_wrap(cpu, cpu_possible_mask, raw_smp_processor_id()) {
head = per_cpu_ptr(s->freelist, cpu);
+ if (!READ_ONCE(head->first))
+ continue;
raw_spin_lock(&head->lock);
node = head->first;
if (node) {
- head->first = node->next;
+ WRITE_ONCE(head->first, node->next);
raw_spin_unlock(&head->lock);
return node;
}
raw_spin_unlock(&head->lock);
- cpu = cpumask_next(cpu, cpu_possible_mask);
- if (cpu >= nr_cpu_ids)
- cpu = 0;
- if (cpu == orig_cpu)
- break;
}
/* per cpu lists are all empty, try extralist */
+ if (!READ_ONCE(s->extralist.first))
+ return NULL;
raw_spin_lock(&s->extralist.lock);
node = s->extralist.first;
if (node)
- s->extralist.first = node->next;
+ WRITE_ONCE(s->extralist.first, node->next);
raw_spin_unlock(&s->extralist.lock);
return node;
}
@@ -159,33 +154,29 @@ ___pcpu_freelist_pop_nmi(struct pcpu_freelist *s)
{
struct pcpu_freelist_head *head;
struct pcpu_freelist_node *node;
- int orig_cpu, cpu;
+ int cpu;
- orig_cpu = cpu = raw_smp_processor_id();
- while (1) {
+ for_each_cpu_wrap(cpu, cpu_possible_mask, raw_smp_processor_id()) {
head = per_cpu_ptr(s->freelist, cpu);
+ if (!READ_ONCE(head->first))
+ continue;
if (raw_spin_trylock(&head->lock)) {
node = head->first;
if (node) {
- head->first = node->next;
+ WRITE_ONCE(head->first, node->next);
raw_spin_unlock(&head->lock);
return node;
}
raw_spin_unlock(&head->lock);
}
- cpu = cpumask_next(cpu, cpu_possible_mask);
- if (cpu >= nr_cpu_ids)
- cpu = 0;
- if (cpu == orig_cpu)
- break;
}
/* cannot pop from per cpu lists, try extralist */
- if (!raw_spin_trylock(&s->extralist.lock))
+ if (!READ_ONCE(s->extralist.first) || !raw_spin_trylock(&s->extralist.lock))
return NULL;
node = s->extralist.first;
if (node)
- s->extralist.first = node->next;
+ WRITE_ONCE(s->extralist.first, node->next);
raw_spin_unlock(&s->extralist.lock);
return node;
}
diff --git a/kernel/bpf/preload/.gitignore b/kernel/bpf/preload/.gitignore
index 856a4c5ad0dd..9452322902a5 100644
--- a/kernel/bpf/preload/.gitignore
+++ b/kernel/bpf/preload/.gitignore
@@ -1,4 +1,2 @@
-/FEATURE-DUMP.libbpf
-/bpf_helper_defs.h
-/feature
+/libbpf
/bpf_preload_umd
diff --git a/kernel/bpf/preload/Kconfig b/kernel/bpf/preload/Kconfig
index 26bced262473..c9d45c9d6918 100644
--- a/kernel/bpf/preload/Kconfig
+++ b/kernel/bpf/preload/Kconfig
@@ -18,10 +18,9 @@ menuconfig BPF_PRELOAD
if BPF_PRELOAD
config BPF_PRELOAD_UMD
- tristate "bpf_preload kernel module with user mode driver"
- depends on CC_CAN_LINK
- depends on m || CC_CAN_LINK_STATIC
+ tristate "bpf_preload kernel module"
default m
help
- This builds bpf_preload kernel module with embedded user mode driver.
+ This builds bpf_preload kernel module with embedded BPF programs for
+ introspection in bpffs.
endif
diff --git a/kernel/bpf/preload/Makefile b/kernel/bpf/preload/Makefile
index 1951332dd15f..20f89cc0a0a6 100644
--- a/kernel/bpf/preload/Makefile
+++ b/kernel/bpf/preload/Makefile
@@ -1,28 +1,7 @@
# SPDX-License-Identifier: GPL-2.0
-LIBBPF_SRCS = $(srctree)/tools/lib/bpf/
-LIBBPF_A = $(obj)/libbpf.a
-LIBBPF_OUT = $(abspath $(obj))
-
-# Although not in use by libbpf's Makefile, set $(O) so that the "dummy" test
-# in tools/scripts/Makefile.include always succeeds when building the kernel
-# with $(O) pointing to a relative path, as in "make O=build bindeb-pkg".
-$(LIBBPF_A):
- $(Q)$(MAKE) -C $(LIBBPF_SRCS) O=$(LIBBPF_OUT)/ OUTPUT=$(LIBBPF_OUT)/ $(LIBBPF_OUT)/libbpf.a
-
-userccflags += -I $(srctree)/tools/include/ -I $(srctree)/tools/include/uapi \
- -I $(srctree)/tools/lib/ -Wno-unused-result
-
-userprogs := bpf_preload_umd
-
-clean-files := $(userprogs) bpf_helper_defs.h FEATURE-DUMP.libbpf staticobjs/ feature/
-
-bpf_preload_umd-objs := iterators/iterators.o
-bpf_preload_umd-userldlibs := $(LIBBPF_A) -lelf -lz
-
-$(obj)/bpf_preload_umd: $(LIBBPF_A)
-
-$(obj)/bpf_preload_umd_blob.o: $(obj)/bpf_preload_umd
+LIBBPF_INCLUDE = $(srctree)/tools/lib
obj-$(CONFIG_BPF_PRELOAD_UMD) += bpf_preload.o
-bpf_preload-objs += bpf_preload_kern.o bpf_preload_umd_blob.o
+CFLAGS_bpf_preload_kern.o += -I$(LIBBPF_INCLUDE)
+bpf_preload-objs += bpf_preload_kern.o
diff --git a/kernel/bpf/preload/bpf_preload.h b/kernel/bpf/preload/bpf_preload.h
index 2f9932276f2e..f065c91213a0 100644
--- a/kernel/bpf/preload/bpf_preload.h
+++ b/kernel/bpf/preload/bpf_preload.h
@@ -2,13 +2,13 @@
#ifndef _BPF_PRELOAD_H
#define _BPF_PRELOAD_H
-#include <linux/usermode_driver.h>
-#include "iterators/bpf_preload_common.h"
+struct bpf_preload_info {
+ char link_name[16];
+ struct bpf_link *link;
+};
struct bpf_preload_ops {
- struct umd_info info;
int (*preload)(struct bpf_preload_info *);
- int (*finish)(void);
struct module *owner;
};
extern struct bpf_preload_ops *bpf_preload_ops;
diff --git a/kernel/bpf/preload/bpf_preload_kern.c b/kernel/bpf/preload/bpf_preload_kern.c
index 53736e52c1df..b56f9f3314fd 100644
--- a/kernel/bpf/preload/bpf_preload_kern.c
+++ b/kernel/bpf/preload/bpf_preload_kern.c
@@ -2,101 +2,91 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/init.h>
#include <linux/module.h>
-#include <linux/pid.h>
-#include <linux/fs.h>
-#include <linux/sched/signal.h>
#include "bpf_preload.h"
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+#include "iterators/iterators.lskel-little-endian.h"
+#else
+#include "iterators/iterators.lskel-big-endian.h"
+#endif
-extern char bpf_preload_umd_start;
-extern char bpf_preload_umd_end;
+static struct bpf_link *maps_link, *progs_link;
+static struct iterators_bpf *skel;
-static int preload(struct bpf_preload_info *obj);
-static int finish(void);
+static void free_links_and_skel(void)
+{
+ if (!IS_ERR_OR_NULL(maps_link))
+ bpf_link_put(maps_link);
+ if (!IS_ERR_OR_NULL(progs_link))
+ bpf_link_put(progs_link);
+ iterators_bpf__destroy(skel);
+}
+
+static int preload(struct bpf_preload_info *obj)
+{
+ strlcpy(obj[0].link_name, "maps.debug", sizeof(obj[0].link_name));
+ obj[0].link = maps_link;
+ strlcpy(obj[1].link_name, "progs.debug", sizeof(obj[1].link_name));
+ obj[1].link = progs_link;
+ return 0;
+}
-static struct bpf_preload_ops umd_ops = {
- .info.driver_name = "bpf_preload",
+static struct bpf_preload_ops ops = {
.preload = preload,
- .finish = finish,
.owner = THIS_MODULE,
};
-static int preload(struct bpf_preload_info *obj)
+static int load_skel(void)
{
- int magic = BPF_PRELOAD_START;
- loff_t pos = 0;
- int i, err;
- ssize_t n;
+ int err;
- err = fork_usermode_driver(&umd_ops.info);
+ skel = iterators_bpf__open();
+ if (!skel)
+ return -ENOMEM;
+ err = iterators_bpf__load(skel);
if (err)
- return err;
-
- /* send the start magic to let UMD proceed with loading BPF progs */
- n = kernel_write(umd_ops.info.pipe_to_umh,
- &magic, sizeof(magic), &pos);
- if (n != sizeof(magic))
- return -EPIPE;
-
- /* receive bpf_link IDs and names from UMD */
- pos = 0;
- for (i = 0; i < BPF_PRELOAD_LINKS; i++) {
- n = kernel_read(umd_ops.info.pipe_from_umh,
- &obj[i], sizeof(*obj), &pos);
- if (n != sizeof(*obj))
- return -EPIPE;
+ goto out;
+ err = iterators_bpf__attach(skel);
+ if (err)
+ goto out;
+ maps_link = bpf_link_get_from_fd(skel->links.dump_bpf_map_fd);
+ if (IS_ERR(maps_link)) {
+ err = PTR_ERR(maps_link);
+ goto out;
}
- return 0;
-}
-
-static int finish(void)
-{
- int magic = BPF_PRELOAD_END;
- struct pid *tgid;
- loff_t pos = 0;
- ssize_t n;
-
- /* send the last magic to UMD. It will do a normal exit. */
- n = kernel_write(umd_ops.info.pipe_to_umh,
- &magic, sizeof(magic), &pos);
- if (n != sizeof(magic))
- return -EPIPE;
-
- tgid = umd_ops.info.tgid;
- if (tgid) {
- wait_event(tgid->wait_pidfd, thread_group_exited(tgid));
- umd_cleanup_helper(&umd_ops.info);
+ progs_link = bpf_link_get_from_fd(skel->links.dump_bpf_prog_fd);
+ if (IS_ERR(progs_link)) {
+ err = PTR_ERR(progs_link);
+ goto out;
}
+ /* Avoid taking over stdin/stdout/stderr of init process. Zeroing out
+ * makes skel_closenz() a no-op later in iterators_bpf__destroy().
+ */
+ close_fd(skel->links.dump_bpf_map_fd);
+ skel->links.dump_bpf_map_fd = 0;
+ close_fd(skel->links.dump_bpf_prog_fd);
+ skel->links.dump_bpf_prog_fd = 0;
return 0;
+out:
+ free_links_and_skel();
+ return err;
}
-static int __init load_umd(void)
+static int __init load(void)
{
int err;
- err = umd_load_blob(&umd_ops.info, &bpf_preload_umd_start,
- &bpf_preload_umd_end - &bpf_preload_umd_start);
+ err = load_skel();
if (err)
return err;
- bpf_preload_ops = &umd_ops;
+ bpf_preload_ops = &ops;
return err;
}
-static void __exit fini_umd(void)
+static void __exit fini(void)
{
- struct pid *tgid;
-
bpf_preload_ops = NULL;
-
- /* kill UMD in case it's still there due to earlier error */
- tgid = umd_ops.info.tgid;
- if (tgid) {
- kill_pid(tgid, SIGKILL, 1);
-
- wait_event(tgid->wait_pidfd, thread_group_exited(tgid));
- umd_cleanup_helper(&umd_ops.info);
- }
- umd_unload_blob(&umd_ops.info);
+ free_links_and_skel();
}
-late_initcall(load_umd);
-module_exit(fini_umd);
+late_initcall(load);
+module_exit(fini);
MODULE_LICENSE("GPL");
diff --git a/kernel/bpf/preload/bpf_preload_umd_blob.S b/kernel/bpf/preload/bpf_preload_umd_blob.S
deleted file mode 100644
index f1f40223b5c3..000000000000
--- a/kernel/bpf/preload/bpf_preload_umd_blob.S
+++ /dev/null
@@ -1,7 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
- .section .init.rodata, "a"
- .global bpf_preload_umd_start
-bpf_preload_umd_start:
- .incbin "kernel/bpf/preload/bpf_preload_umd"
- .global bpf_preload_umd_end
-bpf_preload_umd_end:
diff --git a/kernel/bpf/preload/iterators/Makefile b/kernel/bpf/preload/iterators/Makefile
index 28fa8c1440f4..8937dc6bc8d0 100644
--- a/kernel/bpf/preload/iterators/Makefile
+++ b/kernel/bpf/preload/iterators/Makefile
@@ -1,18 +1,26 @@
# SPDX-License-Identifier: GPL-2.0
OUTPUT := .output
+abs_out := $(abspath $(OUTPUT))
+
CLANG ?= clang
LLC ?= llc
LLVM_STRIP ?= llvm-strip
-DEFAULT_BPFTOOL := $(OUTPUT)/sbin/bpftool
+
+TOOLS_PATH := $(abspath ../../../../tools)
+BPFTOOL_SRC := $(TOOLS_PATH)/bpf/bpftool
+BPFTOOL_OUTPUT := $(abs_out)/bpftool
+DEFAULT_BPFTOOL := $(BPFTOOL_OUTPUT)/bootstrap/bpftool
BPFTOOL ?= $(DEFAULT_BPFTOOL)
-LIBBPF_SRC := $(abspath ../../../../tools/lib/bpf)
-BPFOBJ := $(OUTPUT)/libbpf.a
-BPF_INCLUDE := $(OUTPUT)
-INCLUDES := -I$(OUTPUT) -I$(BPF_INCLUDE) -I$(abspath ../../../../tools/lib) \
- -I$(abspath ../../../../tools/include/uapi)
+
+LIBBPF_SRC := $(TOOLS_PATH)/lib/bpf
+LIBBPF_OUTPUT := $(abs_out)/libbpf
+LIBBPF_DESTDIR := $(LIBBPF_OUTPUT)
+LIBBPF_INCLUDE := $(LIBBPF_DESTDIR)/include
+BPFOBJ := $(LIBBPF_OUTPUT)/libbpf.a
+
+INCLUDES := -I$(OUTPUT) -I$(LIBBPF_INCLUDE) -I$(TOOLS_PATH)/include/uapi
CFLAGS := -g -Wall
-abs_out := $(abspath $(OUTPUT))
ifeq ($(V),1)
Q =
msg =
@@ -27,31 +35,33 @@ endif
.PHONY: all clean
-all: iterators.skel.h
+all: iterators.lskel-little-endian.h
+
+big: iterators.lskel-big-endian.h
clean:
$(call msg,CLEAN)
$(Q)rm -rf $(OUTPUT) iterators
-iterators.skel.h: $(OUTPUT)/iterators.bpf.o | $(BPFTOOL)
+iterators.lskel-%.h: $(OUTPUT)/%/iterators.bpf.o | $(BPFTOOL)
$(call msg,GEN-SKEL,$@)
- $(Q)$(BPFTOOL) gen skeleton $< > $@
-
+ $(Q)$(BPFTOOL) gen skeleton -L $< > $@
-$(OUTPUT)/iterators.bpf.o: iterators.bpf.c $(BPFOBJ) | $(OUTPUT)
+$(OUTPUT)/%/iterators.bpf.o: iterators.bpf.c $(BPFOBJ) | $(OUTPUT)
$(call msg,BPF,$@)
- $(Q)$(CLANG) -g -O2 -target bpf $(INCLUDES) \
+ $(Q)mkdir -p $(@D)
+ $(Q)$(CLANG) -g -O2 -target bpf -m$* $(INCLUDES) \
-c $(filter %.c,$^) -o $@ && \
$(LLVM_STRIP) -g $@
-$(OUTPUT):
+$(OUTPUT) $(LIBBPF_OUTPUT) $(BPFTOOL_OUTPUT):
$(call msg,MKDIR,$@)
- $(Q)mkdir -p $(OUTPUT)
+ $(Q)mkdir -p $@
-$(BPFOBJ): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(OUTPUT)
+$(BPFOBJ): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(LIBBPF_OUTPUT)
$(Q)$(MAKE) $(submake_extras) -C $(LIBBPF_SRC) \
- OUTPUT=$(abspath $(dir $@))/ $(abspath $@)
+ OUTPUT=$(abspath $(dir $@))/ prefix= \
+ DESTDIR=$(LIBBPF_DESTDIR) $(abspath $@) install_headers
-$(DEFAULT_BPFTOOL):
- $(Q)$(MAKE) $(submake_extras) -C ../../../../tools/bpf/bpftool \
- prefix= OUTPUT=$(abs_out)/ DESTDIR=$(abs_out) install
+$(DEFAULT_BPFTOOL): | $(BPFTOOL_OUTPUT)
+ $(Q)$(MAKE) $(submake_extras) -C $(BPFTOOL_SRC) OUTPUT=$(BPFTOOL_OUTPUT)/ bootstrap
diff --git a/kernel/bpf/preload/iterators/README b/kernel/bpf/preload/iterators/README
index 7fd6d39a9ad2..98e7c90ea012 100644
--- a/kernel/bpf/preload/iterators/README
+++ b/kernel/bpf/preload/iterators/README
@@ -1,4 +1,7 @@
WARNING:
-If you change "iterators.bpf.c" do "make -j" in this directory to rebuild "iterators.skel.h".
+If you change "iterators.bpf.c" do "make -j" in this directory to
+rebuild "iterators.lskel-little-endian.h". Then, on a big-endian
+machine, do "make -j big" in this directory to rebuild
+"iterators.lskel-big-endian.h". Commit both resulting headers.
Make sure to have clang 10 installed.
See Documentation/bpf/bpf_devel_QA.rst
diff --git a/kernel/bpf/preload/iterators/bpf_preload_common.h b/kernel/bpf/preload/iterators/bpf_preload_common.h
deleted file mode 100644
index 8464d1a48c05..000000000000
--- a/kernel/bpf/preload/iterators/bpf_preload_common.h
+++ /dev/null
@@ -1,13 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BPF_PRELOAD_COMMON_H
-#define _BPF_PRELOAD_COMMON_H
-
-#define BPF_PRELOAD_START 0x5555
-#define BPF_PRELOAD_END 0xAAAA
-
-struct bpf_preload_info {
- char link_name[16];
- int link_id;
-};
-
-#endif
diff --git a/kernel/bpf/preload/iterators/iterators.c b/kernel/bpf/preload/iterators/iterators.c
deleted file mode 100644
index 5d872a705470..000000000000
--- a/kernel/bpf/preload/iterators/iterators.c
+++ /dev/null
@@ -1,94 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/* Copyright (c) 2020 Facebook */
-#include <errno.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <sys/resource.h>
-#include <bpf/libbpf.h>
-#include <bpf/bpf.h>
-#include <sys/mount.h>
-#include "iterators.skel.h"
-#include "bpf_preload_common.h"
-
-int to_kernel = -1;
-int from_kernel = 0;
-
-static int send_link_to_kernel(struct bpf_link *link, const char *link_name)
-{
- struct bpf_preload_info obj = {};
- struct bpf_link_info info = {};
- __u32 info_len = sizeof(info);
- int err;
-
- err = bpf_obj_get_info_by_fd(bpf_link__fd(link), &info, &info_len);
- if (err)
- return err;
- obj.link_id = info.id;
- if (strlen(link_name) >= sizeof(obj.link_name))
- return -E2BIG;
- strcpy(obj.link_name, link_name);
- if (write(to_kernel, &obj, sizeof(obj)) != sizeof(obj))
- return -EPIPE;
- return 0;
-}
-
-int main(int argc, char **argv)
-{
- struct rlimit rlim = { RLIM_INFINITY, RLIM_INFINITY };
- struct iterators_bpf *skel;
- int err, magic;
- int debug_fd;
-
- debug_fd = open("/dev/console", O_WRONLY | O_NOCTTY | O_CLOEXEC);
- if (debug_fd < 0)
- return 1;
- to_kernel = dup(1);
- close(1);
- dup(debug_fd);
- /* now stdin and stderr point to /dev/console */
-
- read(from_kernel, &magic, sizeof(magic));
- if (magic != BPF_PRELOAD_START) {
- printf("bad start magic %d\n", magic);
- return 1;
- }
- setrlimit(RLIMIT_MEMLOCK, &rlim);
- /* libbpf opens BPF object and loads it into the kernel */
- skel = iterators_bpf__open_and_load();
- if (!skel) {
- /* iterators.skel.h is little endian.
- * libbpf doesn't support automatic little->big conversion
- * of BPF bytecode yet.
- * The program load will fail in such case.
- */
- printf("Failed load could be due to wrong endianness\n");
- return 1;
- }
- err = iterators_bpf__attach(skel);
- if (err)
- goto cleanup;
-
- /* send two bpf_link IDs with names to the kernel */
- err = send_link_to_kernel(skel->links.dump_bpf_map, "maps.debug");
- if (err)
- goto cleanup;
- err = send_link_to_kernel(skel->links.dump_bpf_prog, "progs.debug");
- if (err)
- goto cleanup;
-
- /* The kernel will proceed with pinnging the links in bpffs.
- * UMD will wait on read from pipe.
- */
- read(from_kernel, &magic, sizeof(magic));
- if (magic != BPF_PRELOAD_END) {
- printf("bad final magic %d\n", magic);
- err = -EINVAL;
- }
-cleanup:
- iterators_bpf__destroy(skel);
-
- return err != 0;
-}
diff --git a/kernel/bpf/preload/iterators/iterators.lskel-big-endian.h b/kernel/bpf/preload/iterators/iterators.lskel-big-endian.h
new file mode 100644
index 000000000000..ebdc6c0cdb70
--- /dev/null
+++ b/kernel/bpf/preload/iterators/iterators.lskel-big-endian.h
@@ -0,0 +1,419 @@
+/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
+/* THIS FILE IS AUTOGENERATED BY BPFTOOL! */
+#ifndef __ITERATORS_BPF_SKEL_H__
+#define __ITERATORS_BPF_SKEL_H__
+
+#include <bpf/skel_internal.h>
+
+struct iterators_bpf {
+ struct bpf_loader_ctx ctx;
+ struct {
+ struct bpf_map_desc rodata;
+ } maps;
+ struct {
+ struct bpf_prog_desc dump_bpf_map;
+ struct bpf_prog_desc dump_bpf_prog;
+ } progs;
+ struct {
+ int dump_bpf_map_fd;
+ int dump_bpf_prog_fd;
+ } links;
+};
+
+static inline int
+iterators_bpf__dump_bpf_map__attach(struct iterators_bpf *skel)
+{
+ int prog_fd = skel->progs.dump_bpf_map.prog_fd;
+ int fd = skel_link_create(prog_fd, 0, BPF_TRACE_ITER);
+
+ if (fd > 0)
+ skel->links.dump_bpf_map_fd = fd;
+ return fd;
+}
+
+static inline int
+iterators_bpf__dump_bpf_prog__attach(struct iterators_bpf *skel)
+{
+ int prog_fd = skel->progs.dump_bpf_prog.prog_fd;
+ int fd = skel_link_create(prog_fd, 0, BPF_TRACE_ITER);
+
+ if (fd > 0)
+ skel->links.dump_bpf_prog_fd = fd;
+ return fd;
+}
+
+static inline int
+iterators_bpf__attach(struct iterators_bpf *skel)
+{
+ int ret = 0;
+
+ ret = ret < 0 ? ret : iterators_bpf__dump_bpf_map__attach(skel);
+ ret = ret < 0 ? ret : iterators_bpf__dump_bpf_prog__attach(skel);
+ return ret < 0 ? ret : 0;
+}
+
+static inline void
+iterators_bpf__detach(struct iterators_bpf *skel)
+{
+ skel_closenz(skel->links.dump_bpf_map_fd);
+ skel_closenz(skel->links.dump_bpf_prog_fd);
+}
+static void
+iterators_bpf__destroy(struct iterators_bpf *skel)
+{
+ if (!skel)
+ return;
+ iterators_bpf__detach(skel);
+ skel_closenz(skel->progs.dump_bpf_map.prog_fd);
+ skel_closenz(skel->progs.dump_bpf_prog.prog_fd);
+ skel_closenz(skel->maps.rodata.map_fd);
+ skel_free(skel);
+}
+static inline struct iterators_bpf *
+iterators_bpf__open(void)
+{
+ struct iterators_bpf *skel;
+
+ skel = skel_alloc(sizeof(*skel));
+ if (!skel)
+ goto cleanup;
+ skel->ctx.sz = (void *)&skel->links - (void *)skel;
+ return skel;
+cleanup:
+ iterators_bpf__destroy(skel);
+ return NULL;
+}
+
+static inline int
+iterators_bpf__load(struct iterators_bpf *skel)
+{
+ struct bpf_load_and_run_opts opts = {};
+ int err;
+
+ opts.ctx = (struct bpf_loader_ctx *)skel;
+ opts.data_sz = 6008;
+ opts.data = (void *)"\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xeb\x9f\x01\0\
+\0\0\0\x18\0\0\0\0\0\0\x04\x1c\0\0\x04\x1c\0\0\x05\x18\0\0\0\0\x02\0\0\0\0\0\0\
+\x02\0\0\0\x01\x04\0\0\x02\0\0\0\x10\0\0\0\x13\0\0\0\x03\0\0\0\0\0\0\0\x18\0\0\
+\0\x04\0\0\0\x40\0\0\0\0\x02\0\0\0\0\0\0\x08\0\0\0\0\x02\0\0\0\0\0\0\x0d\0\0\0\
+\0\x0d\0\0\x01\0\0\0\x06\0\0\0\x1c\0\0\0\x01\0\0\0\x20\x01\0\0\0\0\0\0\x04\x01\
+\0\0\x20\0\0\0\x24\x0c\0\0\x01\0\0\0\x05\0\0\0\xc2\x04\0\0\x03\0\0\0\x18\0\0\0\
+\xd0\0\0\0\x09\0\0\0\0\0\0\0\xd4\0\0\0\x0b\0\0\0\x40\0\0\0\xdf\0\0\0\x0b\0\0\0\
+\x80\0\0\0\0\x02\0\0\0\0\0\0\x0a\0\0\0\xe7\x07\0\0\0\0\0\0\0\0\0\0\xf0\x08\0\0\
+\0\0\0\0\x0c\0\0\0\xf6\x01\0\0\0\0\0\0\x08\0\0\0\x40\0\0\x01\xb3\x04\0\0\x03\0\
+\0\0\x18\0\0\x01\xbb\0\0\0\x0e\0\0\0\0\0\0\x01\xbe\0\0\0\x11\0\0\0\x20\0\0\x01\
+\xc3\0\0\0\x0e\0\0\0\xa0\0\0\x01\xcf\x08\0\0\0\0\0\0\x0f\0\0\x01\xd5\x01\0\0\0\
+\0\0\0\x04\0\0\0\x20\0\0\x01\xe2\x01\0\0\0\0\0\0\x01\x01\0\0\x08\0\0\0\0\x03\0\
+\0\0\0\0\0\0\0\0\0\x10\0\0\0\x12\0\0\0\x10\0\0\x01\xe7\x01\0\0\0\0\0\0\x04\0\0\
+\0\x20\0\0\0\0\x02\0\0\0\0\0\0\x14\0\0\x02\x4b\x04\0\0\x02\0\0\0\x10\0\0\0\x13\
+\0\0\0\x03\0\0\0\0\0\0\x02\x5e\0\0\0\x15\0\0\0\x40\0\0\0\0\x02\0\0\0\0\0\0\x18\
+\0\0\0\0\x0d\0\0\x01\0\0\0\x06\0\0\0\x1c\0\0\0\x13\0\0\x02\x63\x0c\0\0\x01\0\0\
+\0\x16\0\0\x02\xaf\x04\0\0\x01\0\0\0\x08\0\0\x02\xb8\0\0\0\x19\0\0\0\0\0\0\0\0\
+\x02\0\0\0\0\0\0\x1a\0\0\x03\x09\x04\0\0\x06\0\0\0\x38\0\0\x01\xbb\0\0\0\x0e\0\
+\0\0\0\0\0\x01\xbe\0\0\0\x11\0\0\0\x20\0\0\x03\x16\0\0\0\x1b\0\0\0\xc0\0\0\x03\
+\x27\0\0\0\x15\0\0\x01\0\0\0\x03\x30\0\0\0\x1d\0\0\x01\x40\0\0\x03\x3a\0\0\0\
+\x1e\0\0\x01\x80\0\0\0\0\x02\0\0\0\0\0\0\x1c\0\0\0\0\x0a\0\0\0\0\0\0\x10\0\0\0\
+\0\x02\0\0\0\0\0\0\x1f\0\0\0\0\x02\0\0\0\0\0\0\x20\0\0\x03\x84\x04\0\0\x02\0\0\
+\0\x08\0\0\x03\x92\0\0\0\x0e\0\0\0\0\0\0\x03\x9b\0\0\0\x0e\0\0\0\x20\0\0\x03\
+\x3a\x04\0\0\x03\0\0\0\x18\0\0\x03\xa5\0\0\0\x1b\0\0\0\0\0\0\x03\xad\0\0\0\x21\
+\0\0\0\x40\0\0\x03\xb3\0\0\0\x23\0\0\0\x80\0\0\0\0\x02\0\0\0\0\0\0\x22\0\0\0\0\
+\x02\0\0\0\0\0\0\x24\0\0\x03\xb7\x04\0\0\x01\0\0\0\x04\0\0\x03\xc2\0\0\0\x0e\0\
+\0\0\0\0\0\x04\x2b\x04\0\0\x01\0\0\0\x04\0\0\x04\x34\0\0\0\x0e\0\0\0\0\0\0\0\0\
+\x03\0\0\0\0\0\0\0\0\0\0\x1c\0\0\0\x12\0\0\0\x23\0\0\x04\xaa\x0e\0\0\0\0\0\0\
+\x25\0\0\0\0\0\0\0\0\x03\0\0\0\0\0\0\0\0\0\0\x1c\0\0\0\x12\0\0\0\x0e\0\0\x04\
+\xbe\x0e\0\0\0\0\0\0\x27\0\0\0\0\0\0\0\0\x03\0\0\0\0\0\0\0\0\0\0\x1c\0\0\0\x12\
+\0\0\0\x20\0\0\x04\xd4\x0e\0\0\0\0\0\0\x29\0\0\0\0\0\0\0\0\x03\0\0\0\0\0\0\0\0\
+\0\0\x1c\0\0\0\x12\0\0\0\x11\0\0\x04\xe9\x0e\0\0\0\0\0\0\x2b\0\0\0\0\0\0\0\0\
+\x03\0\0\0\0\0\0\0\0\0\0\x10\0\0\0\x12\0\0\0\x04\0\0\x05\0\x0e\0\0\0\0\0\0\x2d\
+\0\0\0\x01\0\0\x05\x08\x0f\0\0\x04\0\0\0\x62\0\0\0\x26\0\0\0\0\0\0\0\x23\0\0\0\
+\x28\0\0\0\x23\0\0\0\x0e\0\0\0\x2a\0\0\0\x31\0\0\0\x20\0\0\0\x2c\0\0\0\x51\0\0\
+\0\x11\0\0\x05\x10\x0f\0\0\x01\0\0\0\x04\0\0\0\x2e\0\0\0\0\0\0\0\x04\0\x62\x70\
+\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\x66\x5f\x6d\x61\x70\0\x6d\x65\x74\x61\
+\0\x6d\x61\x70\0\x63\x74\x78\0\x69\x6e\x74\0\x64\x75\x6d\x70\x5f\x62\x70\x66\
+\x5f\x6d\x61\x70\0\x69\x74\x65\x72\x2f\x62\x70\x66\x5f\x6d\x61\x70\0\x30\x3a\
+\x30\0\x2f\x68\x6f\x6d\x65\x2f\x69\x69\x69\x2f\x6c\x69\x6e\x75\x78\x2d\x6b\x65\
+\x72\x6e\x65\x6c\x2d\x74\x6f\x6f\x6c\x63\x68\x61\x69\x6e\x2f\x73\x72\x63\x2f\
+\x6c\x69\x6e\x75\x78\x2f\x6b\x65\x72\x6e\x65\x6c\x2f\x62\x70\x66\x2f\x70\x72\
+\x65\x6c\x6f\x61\x64\x2f\x69\x74\x65\x72\x61\x74\x6f\x72\x73\x2f\x69\x74\x65\
+\x72\x61\x74\x6f\x72\x73\x2e\x62\x70\x66\x2e\x63\0\x09\x73\x74\x72\x75\x63\x74\
+\x20\x73\x65\x71\x5f\x66\x69\x6c\x65\x20\x2a\x73\x65\x71\x20\x3d\x20\x63\x74\
+\x78\x2d\x3e\x6d\x65\x74\x61\x2d\x3e\x73\x65\x71\x3b\0\x62\x70\x66\x5f\x69\x74\
+\x65\x72\x5f\x6d\x65\x74\x61\0\x73\x65\x71\0\x73\x65\x73\x73\x69\x6f\x6e\x5f\
+\x69\x64\0\x73\x65\x71\x5f\x6e\x75\x6d\0\x73\x65\x71\x5f\x66\x69\x6c\x65\0\x5f\
+\x5f\x75\x36\x34\0\x75\x6e\x73\x69\x67\x6e\x65\x64\x20\x6c\x6f\x6e\x67\x20\x6c\
+\x6f\x6e\x67\0\x30\x3a\x31\0\x09\x73\x74\x72\x75\x63\x74\x20\x62\x70\x66\x5f\
+\x6d\x61\x70\x20\x2a\x6d\x61\x70\x20\x3d\x20\x63\x74\x78\x2d\x3e\x6d\x61\x70\
+\x3b\0\x09\x69\x66\x20\x28\x21\x6d\x61\x70\x29\0\x30\x3a\x32\0\x09\x5f\x5f\x75\
+\x36\x34\x20\x73\x65\x71\x5f\x6e\x75\x6d\x20\x3d\x20\x63\x74\x78\x2d\x3e\x6d\
+\x65\x74\x61\x2d\x3e\x73\x65\x71\x5f\x6e\x75\x6d\x3b\0\x09\x69\x66\x20\x28\x73\
+\x65\x71\x5f\x6e\x75\x6d\x20\x3d\x3d\x20\x30\x29\0\x09\x09\x42\x50\x46\x5f\x53\
+\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x20\x20\x69\
+\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\
+\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x5c\x6e\x22\x29\x3b\0\x62\x70\x66\
+\x5f\x6d\x61\x70\0\x69\x64\0\x6e\x61\x6d\x65\0\x6d\x61\x78\x5f\x65\x6e\x74\x72\
+\x69\x65\x73\0\x5f\x5f\x75\x33\x32\0\x75\x6e\x73\x69\x67\x6e\x65\x64\x20\x69\
+\x6e\x74\0\x63\x68\x61\x72\0\x5f\x5f\x41\x52\x52\x41\x59\x5f\x53\x49\x5a\x45\
+\x5f\x54\x59\x50\x45\x5f\x5f\0\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\
+\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x25\x34\x75\x20\x25\x2d\x31\x36\x73\
+\x25\x36\x64\x5c\x6e\x22\x2c\x20\x6d\x61\x70\x2d\x3e\x69\x64\x2c\x20\x6d\x61\
+\x70\x2d\x3e\x6e\x61\x6d\x65\x2c\x20\x6d\x61\x70\x2d\x3e\x6d\x61\x78\x5f\x65\
+\x6e\x74\x72\x69\x65\x73\x29\x3b\0\x7d\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\
+\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x70\x72\x6f\x67\0\x64\x75\x6d\x70\x5f\
+\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x69\x74\x65\x72\x2f\x62\x70\x66\x5f\x70\x72\
+\x6f\x67\0\x09\x73\x74\x72\x75\x63\x74\x20\x62\x70\x66\x5f\x70\x72\x6f\x67\x20\
+\x2a\x70\x72\x6f\x67\x20\x3d\x20\x63\x74\x78\x2d\x3e\x70\x72\x6f\x67\x3b\0\x09\
+\x69\x66\x20\x28\x21\x70\x72\x6f\x67\x29\0\x62\x70\x66\x5f\x70\x72\x6f\x67\0\
+\x61\x75\x78\0\x09\x61\x75\x78\x20\x3d\x20\x70\x72\x6f\x67\x2d\x3e\x61\x75\x78\
+\x3b\0\x09\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\
+\x65\x71\x2c\x20\x22\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\
+\x20\x20\x20\x20\x20\x20\x20\x20\x61\x74\x74\x61\x63\x68\x65\x64\x5c\x6e\x22\
+\x29\x3b\0\x62\x70\x66\x5f\x70\x72\x6f\x67\x5f\x61\x75\x78\0\x61\x74\x74\x61\
+\x63\x68\x5f\x66\x75\x6e\x63\x5f\x6e\x61\x6d\x65\0\x64\x73\x74\x5f\x70\x72\x6f\
+\x67\0\x66\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\x62\x74\x66\0\x09\x42\x50\x46\x5f\
+\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x25\x34\
+\x75\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\x25\x73\x5c\x6e\x22\x2c\x20\x61\
+\x75\x78\x2d\x3e\x69\x64\x2c\0\x30\x3a\x34\0\x30\x3a\x35\0\x09\x69\x66\x20\x28\
+\x21\x62\x74\x66\x29\0\x62\x70\x66\x5f\x66\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\
+\x69\x6e\x73\x6e\x5f\x6f\x66\x66\0\x74\x79\x70\x65\x5f\x69\x64\0\x30\0\x73\x74\
+\x72\x69\x6e\x67\x73\0\x74\x79\x70\x65\x73\0\x68\x64\x72\0\x62\x74\x66\x5f\x68\
+\x65\x61\x64\x65\x72\0\x73\x74\x72\x5f\x6c\x65\x6e\0\x09\x74\x79\x70\x65\x73\
+\x20\x3d\x20\x62\x74\x66\x2d\x3e\x74\x79\x70\x65\x73\x3b\0\x09\x62\x70\x66\x5f\
+\x70\x72\x6f\x62\x65\x5f\x72\x65\x61\x64\x5f\x6b\x65\x72\x6e\x65\x6c\x28\x26\
+\x74\x2c\x20\x73\x69\x7a\x65\x6f\x66\x28\x74\x29\x2c\x20\x74\x79\x70\x65\x73\
+\x20\x2b\x20\x62\x74\x66\x5f\x69\x64\x29\x3b\0\x09\x73\x74\x72\x20\x3d\x20\x62\
+\x74\x66\x2d\x3e\x73\x74\x72\x69\x6e\x67\x73\x3b\0\x62\x74\x66\x5f\x74\x79\x70\
+\x65\0\x6e\x61\x6d\x65\x5f\x6f\x66\x66\0\x09\x6e\x61\x6d\x65\x5f\x6f\x66\x66\
+\x20\x3d\x20\x42\x50\x46\x5f\x43\x4f\x52\x45\x5f\x52\x45\x41\x44\x28\x74\x2c\
+\x20\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x29\x3b\0\x30\x3a\x32\x3a\x30\0\x09\x69\
+\x66\x20\x28\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x20\x3e\x3d\x20\x62\x74\x66\x2d\
+\x3e\x68\x64\x72\x2e\x73\x74\x72\x5f\x6c\x65\x6e\x29\0\x09\x72\x65\x74\x75\x72\
+\x6e\x20\x73\x74\x72\x20\x2b\x20\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x3b\0\x30\x3a\
+\x33\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\
+\x74\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\
+\x74\x2e\x31\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\
+\x5f\x66\x6d\x74\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\x2e\x5f\
+\x5f\x5f\x66\x6d\x74\x2e\x32\0\x4c\x49\x43\x45\x4e\x53\x45\0\x2e\x72\x6f\x64\
+\x61\x74\x61\0\x6c\x69\x63\x65\x6e\x73\x65\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\x09\x4c\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x02\0\0\0\x04\0\0\0\x62\0\0\0\
+\x01\0\0\0\x80\0\0\0\0\0\0\0\0\x69\x74\x65\x72\x61\x74\x6f\x72\x2e\x72\x6f\x64\
+\x61\x74\x61\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x2f\0\0\0\0\0\0\0\0\0\0\0\0\x20\
+\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\
+\x20\x20\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x0a\0\x25\x34\x75\x20\x25\
+\x2d\x31\x36\x73\x25\x36\x64\x0a\0\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\
+\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x61\x74\x74\x61\x63\x68\x65\x64\
+\x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\x25\x73\x0a\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\x47\x50\x4c\0\0\0\0\0\x79\x21\0\0\0\0\0\0\x79\x62\0\0\
+\0\0\0\0\x79\x71\0\x08\0\0\0\0\x15\x70\0\x1a\0\0\0\0\x79\x12\0\x10\0\0\0\0\x55\
+\x10\0\x08\0\0\0\0\xbf\x4a\0\0\0\0\0\0\x07\x40\0\0\xff\xff\xff\xe8\xbf\x16\0\0\
+\0\0\0\0\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xb7\x30\0\0\0\0\0\x23\xb7\x50\0\0\
+\0\0\0\0\x85\0\0\0\0\0\0\x7e\x61\x17\0\0\0\0\0\0\x7b\xa1\xff\xe8\0\0\0\0\xb7\
+\x10\0\0\0\0\0\x04\xbf\x27\0\0\0\0\0\0\x0f\x21\0\0\0\0\0\0\x7b\xa2\xff\xf0\0\0\
+\0\0\x61\x17\0\x14\0\0\0\0\x7b\xa1\xff\xf8\0\0\0\0\xbf\x4a\0\0\0\0\0\0\x07\x40\
+\0\0\xff\xff\xff\xe8\xbf\x16\0\0\0\0\0\0\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\0\x23\
+\xb7\x30\0\0\0\0\0\x0e\xb7\x50\0\0\0\0\0\x18\x85\0\0\0\0\0\0\x7e\xb7\0\0\0\0\0\
+\0\0\x95\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x07\0\0\0\0\0\0\0\x42\0\0\0\x9a\0\x01\x3c\
+\x1e\0\0\0\x01\0\0\0\x42\0\0\0\x9a\0\x01\x3c\x24\0\0\0\x02\0\0\0\x42\0\0\x01\
+\x0d\0\x01\x44\x1d\0\0\0\x03\0\0\0\x42\0\0\x01\x2e\0\x01\x4c\x06\0\0\0\x04\0\0\
+\0\x42\0\0\x01\x3d\0\x01\x40\x1d\0\0\0\x05\0\0\0\x42\0\0\x01\x62\0\x01\x58\x06\
+\0\0\0\x07\0\0\0\x42\0\0\x01\x75\0\x01\x5c\x03\0\0\0\x0e\0\0\0\x42\0\0\x01\xfb\
+\0\x01\x64\x02\0\0\0\x1e\0\0\0\x42\0\0\x02\x49\0\x01\x6c\x01\0\0\0\0\0\0\0\x02\
+\0\0\0\x3e\0\0\0\0\0\0\0\x08\0\0\0\x08\0\0\0\x3e\0\0\0\0\0\0\0\x10\0\0\0\x02\0\
+\0\x01\x09\0\0\0\0\0\0\0\x20\0\0\0\x08\0\0\x01\x39\0\0\0\0\0\0\0\x70\0\0\0\x0d\
+\0\0\0\x3e\0\0\0\0\0\0\0\x80\0\0\0\x0d\0\0\x01\x09\0\0\0\0\0\0\0\xa0\0\0\0\x0d\
+\0\0\x01\x39\0\0\0\0\0\0\0\x1a\0\0\0\x20\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\
+\x6d\x61\x70\0\0\0\0\0\0\0\0\0\0\0\x1c\0\0\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\0\0\0\
+\x01\0\0\0\x10\0\0\0\0\0\0\0\0\0\0\0\x09\0\0\0\x01\0\0\0\0\0\0\0\x07\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x10\0\0\0\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\
+\x62\x70\x66\x5f\x6d\x61\x70\0\0\0\0\0\0\0\0\x47\x50\x4c\0\0\0\0\0\x79\x21\0\0\
+\0\0\0\0\x79\x62\0\0\0\0\0\0\x79\x11\0\x08\0\0\0\0\x15\x10\0\x3b\0\0\0\0\x79\
+\x71\0\0\0\0\0\0\x79\x12\0\x10\0\0\0\0\x55\x10\0\x08\0\0\0\0\xbf\x4a\0\0\0\0\0\
+\0\x07\x40\0\0\xff\xff\xff\xd0\xbf\x16\0\0\0\0\0\0\x18\x26\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\x31\xb7\x30\0\0\0\0\0\x20\xb7\x50\0\0\0\0\0\0\x85\0\0\0\0\0\0\x7e\x7b\
+\xa6\xff\xc8\0\0\0\0\x61\x17\0\0\0\0\0\0\x7b\xa1\xff\xd0\0\0\0\0\xb7\x30\0\0\0\
+\0\0\x04\xbf\x97\0\0\0\0\0\0\x0f\x93\0\0\0\0\0\0\x79\x17\0\x28\0\0\0\0\x79\x87\
+\0\x30\0\0\0\0\x15\x80\0\x18\0\0\0\0\xb7\x20\0\0\0\0\0\0\x0f\x12\0\0\0\0\0\0\
+\x61\x11\0\x04\0\0\0\0\x79\x38\0\x08\0\0\0\0\x67\x10\0\0\0\0\0\x03\x0f\x31\0\0\
+\0\0\0\0\x79\x68\0\0\0\0\0\0\xbf\x1a\0\0\0\0\0\0\x07\x10\0\0\xff\xff\xff\xf8\
+\xb7\x20\0\0\0\0\0\x08\x85\0\0\0\0\0\0\x71\xb7\x10\0\0\0\0\0\0\x79\x3a\xff\xf8\
+\0\0\0\0\x0f\x31\0\0\0\0\0\0\xbf\x1a\0\0\0\0\0\0\x07\x10\0\0\xff\xff\xff\xf4\
+\xb7\x20\0\0\0\0\0\x04\x85\0\0\0\0\0\0\x71\xb7\x30\0\0\0\0\0\x04\x61\x1a\xff\
+\xf4\0\0\0\0\x61\x28\0\x10\0\0\0\0\x3d\x12\0\x02\0\0\0\0\x0f\x61\0\0\0\0\0\0\
+\xbf\x96\0\0\0\0\0\0\x7b\xa9\xff\xd8\0\0\0\0\x79\x17\0\x18\0\0\0\0\x7b\xa1\xff\
+\xe0\0\0\0\0\x79\x17\0\x20\0\0\0\0\x79\x11\0\0\0\0\0\0\x0f\x13\0\0\0\0\0\0\x7b\
+\xa1\xff\xe8\0\0\0\0\xbf\x4a\0\0\0\0\0\0\x07\x40\0\0\xff\xff\xff\xd0\x79\x1a\
+\xff\xc8\0\0\0\0\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\0\x51\xb7\x30\0\0\0\0\0\x11\
+\xb7\x50\0\0\0\0\0\x20\x85\0\0\0\0\0\0\x7e\xb7\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\x17\0\0\0\0\0\0\0\x42\0\0\0\x9a\0\x01\x80\x1e\0\0\0\x01\0\0\0\
+\x42\0\0\0\x9a\0\x01\x80\x24\0\0\0\x02\0\0\0\x42\0\0\x02\x7f\0\x01\x88\x1f\0\0\
+\0\x03\0\0\0\x42\0\0\x02\xa3\0\x01\x94\x06\0\0\0\x04\0\0\0\x42\0\0\x02\xbc\0\
+\x01\xa0\x0e\0\0\0\x05\0\0\0\x42\0\0\x01\x3d\0\x01\x84\x1d\0\0\0\x06\0\0\0\x42\
+\0\0\x01\x62\0\x01\xa4\x06\0\0\0\x08\0\0\0\x42\0\0\x02\xce\0\x01\xa8\x03\0\0\0\
+\x10\0\0\0\x42\0\0\x03\x3e\0\x01\xb0\x02\0\0\0\x17\0\0\0\x42\0\0\x03\x79\0\x01\
+\x04\x06\0\0\0\x1a\0\0\0\x42\0\0\x03\x3e\0\x01\xb0\x02\0\0\0\x1b\0\0\0\x42\0\0\
+\x03\xca\0\x01\x10\x0f\0\0\0\x1c\0\0\0\x42\0\0\x03\xdf\0\x01\x14\x2d\0\0\0\x1e\
+\0\0\0\x42\0\0\x04\x16\0\x01\x0c\x0d\0\0\0\x20\0\0\0\x42\0\0\x03\x3e\0\x01\xb0\
+\x02\0\0\0\x21\0\0\0\x42\0\0\x03\xdf\0\x01\x14\x02\0\0\0\x24\0\0\0\x42\0\0\x04\
+\x3d\0\x01\x18\x0d\0\0\0\x27\0\0\0\x42\0\0\x03\x3e\0\x01\xb0\x02\0\0\0\x28\0\0\
+\0\x42\0\0\x04\x3d\0\x01\x18\x0d\0\0\0\x2b\0\0\0\x42\0\0\x04\x3d\0\x01\x18\x0d\
+\0\0\0\x2c\0\0\0\x42\0\0\x04\x6b\0\x01\x1c\x1b\0\0\0\x2d\0\0\0\x42\0\0\x04\x6b\
+\0\x01\x1c\x06\0\0\0\x2e\0\0\0\x42\0\0\x04\x8e\0\x01\x24\x0d\0\0\0\x30\0\0\0\
+\x42\0\0\x03\x3e\0\x01\xb0\x02\0\0\0\x3f\0\0\0\x42\0\0\x02\x49\0\x01\xc0\x01\0\
+\0\0\0\0\0\0\x14\0\0\0\x3e\0\0\0\0\0\0\0\x08\0\0\0\x08\0\0\0\x3e\0\0\0\0\0\0\0\
+\x10\0\0\0\x14\0\0\x01\x09\0\0\0\0\0\0\0\x20\0\0\0\x18\0\0\0\x3e\0\0\0\0\0\0\0\
+\x28\0\0\0\x08\0\0\x01\x39\0\0\0\0\0\0\0\x80\0\0\0\x1a\0\0\0\x3e\0\0\0\0\0\0\0\
+\x90\0\0\0\x1a\0\0\x01\x09\0\0\0\0\0\0\0\xa8\0\0\0\x1a\0\0\x03\x71\0\0\0\0\0\0\
+\0\xb0\0\0\0\x1a\0\0\x03\x75\0\0\0\0\0\0\0\xc0\0\0\0\x1f\0\0\x03\xa3\0\0\0\0\0\
+\0\0\xd8\0\0\0\x20\0\0\x01\x09\0\0\0\0\0\0\0\xf0\0\0\0\x20\0\0\0\x3e\0\0\0\0\0\
+\0\x01\x18\0\0\0\x24\0\0\0\x3e\0\0\0\0\0\0\x01\x50\0\0\0\x1a\0\0\x01\x09\0\0\0\
+\0\0\0\x01\x60\0\0\0\x20\0\0\x04\x65\0\0\0\0\0\0\x01\x88\0\0\0\x1a\0\0\x01\x39\
+\0\0\0\0\0\0\x01\x98\0\0\0\x1a\0\0\x04\xa6\0\0\0\0\0\0\x01\xa0\0\0\0\x18\0\0\0\
+\x3e\0\0\0\0\0\0\0\x1a\0\0\0\x41\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\
+\x6f\x67\0\0\0\0\0\0\0\0\0\0\x1c\0\0\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\0\0\0\x01\0\
+\0\0\x10\0\0\0\0\0\0\0\0\0\0\0\x19\0\0\0\x01\0\0\0\0\0\0\0\x12\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\x10\0\0\0\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x62\x70\
+\x66\x5f\x70\x72\x6f\x67\0\0\0\0\0\0\0";
+ opts.insns_sz = 2216;
+ opts.insns = (void *)"\
+\xbf\x61\0\0\0\0\0\0\xbf\x1a\0\0\0\0\0\0\x07\x10\0\0\xff\xff\xff\x78\xb7\x20\0\
+\0\0\0\0\x88\xb7\x30\0\0\0\0\0\0\x85\0\0\0\0\0\0\x71\x05\0\0\x14\0\0\0\0\x61\
+\x1a\xff\x78\0\0\0\0\xd5\x10\0\x01\0\0\0\0\x85\0\0\0\0\0\0\xa8\x61\x1a\xff\x7c\
+\0\0\0\0\xd5\x10\0\x01\0\0\0\0\x85\0\0\0\0\0\0\xa8\x61\x1a\xff\x80\0\0\0\0\xd5\
+\x10\0\x01\0\0\0\0\x85\0\0\0\0\0\0\xa8\x61\x1a\xff\x84\0\0\0\0\xd5\x10\0\x01\0\
+\0\0\0\x85\0\0\0\0\0\0\xa8\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x61\x10\0\0\0\0\
+\0\0\xd5\x10\0\x02\0\0\0\0\xbf\x91\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa8\xbf\x07\0\0\
+\0\0\0\0\x95\0\0\0\0\0\0\0\x61\x06\0\x08\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\
+\0\x0e\x68\x63\x10\0\0\0\0\0\0\x61\x06\0\x0c\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\
+\0\0\0\x0e\x64\x63\x10\0\0\0\0\0\0\x79\x06\0\x10\0\0\0\0\x18\x16\0\0\0\0\0\0\0\
+\0\0\0\0\0\x0e\x58\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x05\0\
+\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x0e\x50\x7b\x10\0\0\0\0\0\0\xb7\x10\0\0\0\0\0\
+\x12\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\x0e\x50\xb7\x30\0\0\0\0\0\x1c\x85\0\0\0\0\
+\0\0\xa6\xbf\x70\0\0\0\0\0\0\xc5\x70\xff\xd4\0\0\0\0\x63\xa7\xff\x78\0\0\0\0\
+\x61\x0a\xff\x78\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x0e\xa0\x63\x10\0\0\0\
+\0\0\0\x61\x06\0\x1c\0\0\0\0\x15\0\0\x03\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\
+\0\x0e\x7c\x63\x10\0\0\0\0\0\0\xb7\x10\0\0\0\0\0\0\x18\x26\0\0\0\0\0\0\0\0\0\0\
+\0\0\x0e\x70\xb7\x30\0\0\0\0\0\x48\x85\0\0\0\0\0\0\xa6\xbf\x70\0\0\0\0\0\0\xc5\
+\x70\xff\xc3\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x63\x17\0\0\0\0\0\0\
+\x79\x36\0\x20\0\0\0\0\x15\x30\0\x08\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\
+\x0e\xb8\xb7\x20\0\0\0\0\0\x62\x61\x06\0\x04\0\0\0\0\x45\0\0\x02\0\0\0\x01\x85\
+\0\0\0\0\0\0\x94\x05\0\0\x01\0\0\0\0\x85\0\0\0\0\0\0\x71\x18\x26\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\x61\x02\0\0\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x0f\x28\x63\
+\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x0f\x20\x18\x16\0\0\0\0\0\0\0\
+\0\0\0\0\0\x0f\x30\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x0e\xb8\
+\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x0f\x38\x7b\x10\0\0\0\0\0\0\xb7\x10\0\0\0\0\0\
+\x02\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\x0f\x28\xb7\x30\0\0\0\0\0\x20\x85\0\0\0\0\
+\0\0\xa6\xbf\x70\0\0\0\0\0\0\xc5\x70\xff\x9f\0\0\0\0\x18\x26\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\x61\x02\0\0\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x0f\x48\x63\x10\
+\0\0\0\0\0\0\xb7\x10\0\0\0\0\0\x16\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\x0f\x48\xb7\
+\x30\0\0\0\0\0\x04\x85\0\0\0\0\0\0\xa6\xbf\x70\0\0\0\0\0\0\xc5\x70\xff\x92\0\0\
+\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x0f\x50\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\
+\x11\x70\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x0f\x58\x18\x16\0\
+\0\0\0\0\0\0\0\0\0\0\0\x11\x68\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\
+\0\0\x10\x58\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x11\xb0\x7b\x10\0\0\0\0\0\0\x18\
+\x06\0\0\0\0\0\0\0\0\0\0\0\0\x10\x60\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x11\xc0\
+\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x10\xf0\x18\x16\0\0\0\0\0\
+\0\0\0\0\0\0\0\x11\xe0\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x11\xd8\x7b\x10\0\0\0\0\0\0\x61\x06\0\x08\0\0\
+\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x11\x78\x63\x10\0\0\0\0\0\0\x61\x06\0\x0c\
+\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x11\x7c\x63\x10\0\0\0\0\0\0\x79\x06\0\
+\x10\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x11\x80\x7b\x10\0\0\0\0\0\0\x61\
+\x0a\xff\x78\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x11\xa8\x63\x10\0\0\0\0\0\
+\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x11\xf0\xb7\x20\0\0\0\0\0\x11\xb7\x30\0\0\0\
+\0\0\x0c\xb7\x40\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa7\xbf\x70\0\0\0\0\0\0\xc5\x70\
+\xff\x5c\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x11\x60\x63\x07\0\x6c\0\0\0\0\
+\x77\x70\0\0\0\0\0\x20\x63\x07\0\x70\0\0\0\0\xb7\x10\0\0\0\0\0\x05\x18\x26\0\0\
+\0\0\0\0\0\0\0\0\0\0\x11\x60\xb7\x30\0\0\0\0\0\x8c\x85\0\0\0\0\0\0\xa6\xbf\x70\
+\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x11\xd0\x61\x10\0\0\0\0\0\0\xd5\
+\x10\0\x02\0\0\0\0\xbf\x91\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa8\xc5\x70\xff\x4a\0\0\
+\0\0\x63\xa7\xff\x80\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x12\x08\x18\x16\0\
+\0\0\0\0\0\0\0\0\0\0\0\x16\xe0\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\
+\0\0\x12\x10\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x16\xd8\x7b\x10\0\0\0\0\0\0\x18\
+\x06\0\0\0\0\0\0\0\0\0\0\0\0\x14\x18\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\x20\
+\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x14\x20\x18\x16\0\0\0\0\0\
+\0\0\0\0\0\0\0\x17\x30\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x15\
+\xb0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\x50\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\x48\x7b\x10\0\0\0\0\
+\0\0\x61\x06\0\x08\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x16\xe8\x63\x10\0\0\
+\0\0\0\0\x61\x06\0\x0c\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x16\xec\x63\x10\
+\0\0\0\0\0\0\x79\x06\0\x10\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x16\xf0\x7b\
+\x10\0\0\0\0\0\0\x61\x0a\xff\x78\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\
+\x18\x63\x10\0\0\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\x60\xb7\x20\0\0\0\
+\0\0\x12\xb7\x30\0\0\0\0\0\x0c\xb7\x40\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa7\xbf\x70\
+\0\0\0\0\0\0\xc5\x70\xff\x13\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x16\xd0\
+\x63\x07\0\x6c\0\0\0\0\x77\x70\0\0\0\0\0\x20\x63\x07\0\x70\0\0\0\0\xb7\x10\0\0\
+\0\0\0\x05\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\x16\xd0\xb7\x30\0\0\0\0\0\x8c\x85\0\
+\0\0\0\0\0\xa6\xbf\x70\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x17\x40\x61\
+\x10\0\0\0\0\0\0\xd5\x10\0\x02\0\0\0\0\xbf\x91\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa8\
+\xc5\x70\xff\x01\0\0\0\0\x63\xa7\xff\x84\0\0\0\0\x61\x1a\xff\x78\0\0\0\0\xd5\
+\x10\0\x02\0\0\0\0\xbf\x91\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa8\x61\x0a\xff\x80\0\0\
+\0\0\x63\x60\0\x28\0\0\0\0\x61\x0a\xff\x84\0\0\0\0\x63\x60\0\x2c\0\0\0\0\x18\
+\x16\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x61\x01\0\0\0\0\0\0\x63\x60\0\x18\0\0\0\0\xb7\
+\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0";
+ err = bpf_load_and_run(&opts);
+ if (err < 0)
+ return err;
+ return 0;
+}
+
+static inline struct iterators_bpf *
+iterators_bpf__open_and_load(void)
+{
+ struct iterators_bpf *skel;
+
+ skel = iterators_bpf__open();
+ if (!skel)
+ return NULL;
+ if (iterators_bpf__load(skel)) {
+ iterators_bpf__destroy(skel);
+ return NULL;
+ }
+ return skel;
+}
+
+__attribute__((unused)) static void
+iterators_bpf__assert(struct iterators_bpf *s __attribute__((unused)))
+{
+#ifdef __cplusplus
+#define _Static_assert static_assert
+#endif
+#ifdef __cplusplus
+#undef _Static_assert
+#endif
+}
+
+#endif /* __ITERATORS_BPF_SKEL_H__ */
diff --git a/kernel/bpf/preload/iterators/iterators.lskel-little-endian.h b/kernel/bpf/preload/iterators/iterators.lskel-little-endian.h
new file mode 100644
index 000000000000..70f236a82fe1
--- /dev/null
+++ b/kernel/bpf/preload/iterators/iterators.lskel-little-endian.h
@@ -0,0 +1,425 @@
+/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
+/* THIS FILE IS AUTOGENERATED! */
+#ifndef __ITERATORS_BPF_SKEL_H__
+#define __ITERATORS_BPF_SKEL_H__
+
+#include <bpf/skel_internal.h>
+
+struct iterators_bpf {
+ struct bpf_loader_ctx ctx;
+ struct {
+ struct bpf_map_desc rodata;
+ } maps;
+ struct {
+ struct bpf_prog_desc dump_bpf_map;
+ struct bpf_prog_desc dump_bpf_prog;
+ } progs;
+ struct {
+ int dump_bpf_map_fd;
+ int dump_bpf_prog_fd;
+ } links;
+ struct iterators_bpf__rodata {
+ } *rodata;
+};
+
+static inline int
+iterators_bpf__dump_bpf_map__attach(struct iterators_bpf *skel)
+{
+ int prog_fd = skel->progs.dump_bpf_map.prog_fd;
+ int fd = skel_link_create(prog_fd, 0, BPF_TRACE_ITER);
+
+ if (fd > 0)
+ skel->links.dump_bpf_map_fd = fd;
+ return fd;
+}
+
+static inline int
+iterators_bpf__dump_bpf_prog__attach(struct iterators_bpf *skel)
+{
+ int prog_fd = skel->progs.dump_bpf_prog.prog_fd;
+ int fd = skel_link_create(prog_fd, 0, BPF_TRACE_ITER);
+
+ if (fd > 0)
+ skel->links.dump_bpf_prog_fd = fd;
+ return fd;
+}
+
+static inline int
+iterators_bpf__attach(struct iterators_bpf *skel)
+{
+ int ret = 0;
+
+ ret = ret < 0 ? ret : iterators_bpf__dump_bpf_map__attach(skel);
+ ret = ret < 0 ? ret : iterators_bpf__dump_bpf_prog__attach(skel);
+ return ret < 0 ? ret : 0;
+}
+
+static inline void
+iterators_bpf__detach(struct iterators_bpf *skel)
+{
+ skel_closenz(skel->links.dump_bpf_map_fd);
+ skel_closenz(skel->links.dump_bpf_prog_fd);
+}
+static void
+iterators_bpf__destroy(struct iterators_bpf *skel)
+{
+ if (!skel)
+ return;
+ iterators_bpf__detach(skel);
+ skel_closenz(skel->progs.dump_bpf_map.prog_fd);
+ skel_closenz(skel->progs.dump_bpf_prog.prog_fd);
+ skel_free_map_data(skel->rodata, skel->maps.rodata.initial_value, 4096);
+ skel_closenz(skel->maps.rodata.map_fd);
+ skel_free(skel);
+}
+static inline struct iterators_bpf *
+iterators_bpf__open(void)
+{
+ struct iterators_bpf *skel;
+
+ skel = skel_alloc(sizeof(*skel));
+ if (!skel)
+ goto cleanup;
+ skel->ctx.sz = (void *)&skel->links - (void *)skel;
+ skel->rodata = skel_prep_map_data((void *)"\
+\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\
+\x20\x20\x20\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x0a\0\x25\x34\x75\x20\
+\x25\x2d\x31\x36\x73\x25\x36\x64\x0a\0\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\
+\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x61\x74\x74\x61\x63\x68\x65\
+\x64\x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\x25\x73\x0a\0", 4096, 98);
+ if (!skel->rodata)
+ goto cleanup;
+ skel->maps.rodata.initial_value = (__u64) (long) skel->rodata;
+ return skel;
+cleanup:
+ iterators_bpf__destroy(skel);
+ return NULL;
+}
+
+static inline int
+iterators_bpf__load(struct iterators_bpf *skel)
+{
+ struct bpf_load_and_run_opts opts = {};
+ int err;
+
+ opts.ctx = (struct bpf_loader_ctx *)skel;
+ opts.data_sz = 6056;
+ opts.data = (void *)"\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x9f\xeb\x01\0\
+\x18\0\0\0\0\0\0\0\x1c\x04\0\0\x1c\x04\0\0\xf9\x04\0\0\0\0\0\0\0\0\0\x02\x02\0\
+\0\0\x01\0\0\0\x02\0\0\x04\x10\0\0\0\x13\0\0\0\x03\0\0\0\0\0\0\0\x18\0\0\0\x04\
+\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\x02\x08\0\0\0\0\0\0\0\0\0\0\x02\x0d\0\0\0\0\0\0\
+\0\x01\0\0\x0d\x06\0\0\0\x1c\0\0\0\x01\0\0\0\x20\0\0\0\0\0\0\x01\x04\0\0\0\x20\
+\0\0\x01\x24\0\0\0\x01\0\0\x0c\x05\0\0\0\xa3\0\0\0\x03\0\0\x04\x18\0\0\0\xb1\0\
+\0\0\x09\0\0\0\0\0\0\0\xb5\0\0\0\x0b\0\0\0\x40\0\0\0\xc0\0\0\0\x0b\0\0\0\x80\0\
+\0\0\0\0\0\0\0\0\0\x02\x0a\0\0\0\xc8\0\0\0\0\0\0\x07\0\0\0\0\xd1\0\0\0\0\0\0\
+\x08\x0c\0\0\0\xd7\0\0\0\0\0\0\x01\x08\0\0\0\x40\0\0\0\x94\x01\0\0\x03\0\0\x04\
+\x18\0\0\0\x9c\x01\0\0\x0e\0\0\0\0\0\0\0\x9f\x01\0\0\x11\0\0\0\x20\0\0\0\xa4\
+\x01\0\0\x0e\0\0\0\xa0\0\0\0\xb0\x01\0\0\0\0\0\x08\x0f\0\0\0\xb6\x01\0\0\0\0\0\
+\x01\x04\0\0\0\x20\0\0\0\xc3\x01\0\0\0\0\0\x01\x01\0\0\0\x08\0\0\x01\0\0\0\0\0\
+\0\0\x03\0\0\0\0\x10\0\0\0\x12\0\0\0\x10\0\0\0\xc8\x01\0\0\0\0\0\x01\x04\0\0\0\
+\x20\0\0\0\0\0\0\0\0\0\0\x02\x14\0\0\0\x2c\x02\0\0\x02\0\0\x04\x10\0\0\0\x13\0\
+\0\0\x03\0\0\0\0\0\0\0\x3f\x02\0\0\x15\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\x02\x18\0\
+\0\0\0\0\0\0\x01\0\0\x0d\x06\0\0\0\x1c\0\0\0\x13\0\0\0\x44\x02\0\0\x01\0\0\x0c\
+\x16\0\0\0\x90\x02\0\0\x01\0\0\x04\x08\0\0\0\x99\x02\0\0\x19\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\x02\x1a\0\0\0\xea\x02\0\0\x06\0\0\x04\x38\0\0\0\x9c\x01\0\0\x0e\0\0\
+\0\0\0\0\0\x9f\x01\0\0\x11\0\0\0\x20\0\0\0\xf7\x02\0\0\x1b\0\0\0\xc0\0\0\0\x08\
+\x03\0\0\x15\0\0\0\0\x01\0\0\x11\x03\0\0\x1d\0\0\0\x40\x01\0\0\x1b\x03\0\0\x1e\
+\0\0\0\x80\x01\0\0\0\0\0\0\0\0\0\x02\x1c\0\0\0\0\0\0\0\0\0\0\x0a\x10\0\0\0\0\0\
+\0\0\0\0\0\x02\x1f\0\0\0\0\0\0\0\0\0\0\x02\x20\0\0\0\x65\x03\0\0\x02\0\0\x04\
+\x08\0\0\0\x73\x03\0\0\x0e\0\0\0\0\0\0\0\x7c\x03\0\0\x0e\0\0\0\x20\0\0\0\x1b\
+\x03\0\0\x03\0\0\x04\x18\0\0\0\x86\x03\0\0\x1b\0\0\0\0\0\0\0\x8e\x03\0\0\x21\0\
+\0\0\x40\0\0\0\x94\x03\0\0\x23\0\0\0\x80\0\0\0\0\0\0\0\0\0\0\x02\x22\0\0\0\0\0\
+\0\0\0\0\0\x02\x24\0\0\0\x98\x03\0\0\x01\0\0\x04\x04\0\0\0\xa3\x03\0\0\x0e\0\0\
+\0\0\0\0\0\x0c\x04\0\0\x01\0\0\x04\x04\0\0\0\x15\x04\0\0\x0e\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\x12\0\0\0\x23\0\0\0\x8b\x04\0\0\0\0\0\x0e\x25\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\x12\0\0\0\x0e\0\0\0\x9f\x04\
+\0\0\0\0\0\x0e\x27\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\x12\0\0\0\
+\x20\0\0\0\xb5\x04\0\0\0\0\0\x0e\x29\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\
+\x1c\0\0\0\x12\0\0\0\x11\0\0\0\xca\x04\0\0\0\0\0\x0e\x2b\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\x03\0\0\0\0\x10\0\0\0\x12\0\0\0\x04\0\0\0\xe1\x04\0\0\0\0\0\x0e\x2d\0\0\
+\0\x01\0\0\0\xe9\x04\0\0\x04\0\0\x0f\x62\0\0\0\x26\0\0\0\0\0\0\0\x23\0\0\0\x28\
+\0\0\0\x23\0\0\0\x0e\0\0\0\x2a\0\0\0\x31\0\0\0\x20\0\0\0\x2c\0\0\0\x51\0\0\0\
+\x11\0\0\0\xf1\x04\0\0\x01\0\0\x0f\x04\0\0\0\x2e\0\0\0\0\0\0\0\x04\0\0\0\0\x62\
+\x70\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\x66\x5f\x6d\x61\x70\0\x6d\x65\x74\
+\x61\0\x6d\x61\x70\0\x63\x74\x78\0\x69\x6e\x74\0\x64\x75\x6d\x70\x5f\x62\x70\
+\x66\x5f\x6d\x61\x70\0\x69\x74\x65\x72\x2f\x62\x70\x66\x5f\x6d\x61\x70\0\x30\
+\x3a\x30\0\x2f\x77\x2f\x6e\x65\x74\x2d\x6e\x65\x78\x74\x2f\x6b\x65\x72\x6e\x65\
+\x6c\x2f\x62\x70\x66\x2f\x70\x72\x65\x6c\x6f\x61\x64\x2f\x69\x74\x65\x72\x61\
+\x74\x6f\x72\x73\x2f\x69\x74\x65\x72\x61\x74\x6f\x72\x73\x2e\x62\x70\x66\x2e\
+\x63\0\x09\x73\x74\x72\x75\x63\x74\x20\x73\x65\x71\x5f\x66\x69\x6c\x65\x20\x2a\
+\x73\x65\x71\x20\x3d\x20\x63\x74\x78\x2d\x3e\x6d\x65\x74\x61\x2d\x3e\x73\x65\
+\x71\x3b\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x6d\x65\x74\x61\0\x73\x65\x71\0\
+\x73\x65\x73\x73\x69\x6f\x6e\x5f\x69\x64\0\x73\x65\x71\x5f\x6e\x75\x6d\0\x73\
+\x65\x71\x5f\x66\x69\x6c\x65\0\x5f\x5f\x75\x36\x34\0\x75\x6e\x73\x69\x67\x6e\
+\x65\x64\x20\x6c\x6f\x6e\x67\x20\x6c\x6f\x6e\x67\0\x30\x3a\x31\0\x09\x73\x74\
+\x72\x75\x63\x74\x20\x62\x70\x66\x5f\x6d\x61\x70\x20\x2a\x6d\x61\x70\x20\x3d\
+\x20\x63\x74\x78\x2d\x3e\x6d\x61\x70\x3b\0\x09\x69\x66\x20\x28\x21\x6d\x61\x70\
+\x29\0\x09\x5f\x5f\x75\x36\x34\x20\x73\x65\x71\x5f\x6e\x75\x6d\x20\x3d\x20\x63\
+\x74\x78\x2d\x3e\x6d\x65\x74\x61\x2d\x3e\x73\x65\x71\x5f\x6e\x75\x6d\x3b\0\x30\
+\x3a\x32\0\x09\x69\x66\x20\x28\x73\x65\x71\x5f\x6e\x75\x6d\x20\x3d\x3d\x20\x30\
+\x29\0\x09\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\
+\x65\x71\x2c\x20\x22\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\
+\x20\x20\x20\x20\x20\x20\x20\x20\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\
+\x5c\x6e\x22\x29\x3b\0\x62\x70\x66\x5f\x6d\x61\x70\0\x69\x64\0\x6e\x61\x6d\x65\
+\0\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\0\x5f\x5f\x75\x33\x32\0\x75\x6e\
+\x73\x69\x67\x6e\x65\x64\x20\x69\x6e\x74\0\x63\x68\x61\x72\0\x5f\x5f\x41\x52\
+\x52\x41\x59\x5f\x53\x49\x5a\x45\x5f\x54\x59\x50\x45\x5f\x5f\0\x09\x42\x50\x46\
+\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x25\
+\x34\x75\x20\x25\x2d\x31\x36\x73\x25\x36\x64\x5c\x6e\x22\x2c\x20\x6d\x61\x70\
+\x2d\x3e\x69\x64\x2c\x20\x6d\x61\x70\x2d\x3e\x6e\x61\x6d\x65\x2c\x20\x6d\x61\
+\x70\x2d\x3e\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x29\x3b\0\x7d\0\x62\
+\x70\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x70\x72\
+\x6f\x67\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x69\x74\x65\
+\x72\x2f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x09\x73\x74\x72\x75\x63\x74\x20\x62\
+\x70\x66\x5f\x70\x72\x6f\x67\x20\x2a\x70\x72\x6f\x67\x20\x3d\x20\x63\x74\x78\
+\x2d\x3e\x70\x72\x6f\x67\x3b\0\x09\x69\x66\x20\x28\x21\x70\x72\x6f\x67\x29\0\
+\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x61\x75\x78\0\x09\x61\x75\x78\x20\x3d\x20\
+\x70\x72\x6f\x67\x2d\x3e\x61\x75\x78\x3b\0\x09\x09\x42\x50\x46\x5f\x53\x45\x51\
+\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x20\x20\x69\x64\x20\
+\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x61\x74\
+\x74\x61\x63\x68\x65\x64\x5c\x6e\x22\x29\x3b\0\x62\x70\x66\x5f\x70\x72\x6f\x67\
+\x5f\x61\x75\x78\0\x61\x74\x74\x61\x63\x68\x5f\x66\x75\x6e\x63\x5f\x6e\x61\x6d\
+\x65\0\x64\x73\x74\x5f\x70\x72\x6f\x67\0\x66\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\
+\x62\x74\x66\0\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\
+\x73\x65\x71\x2c\x20\x22\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\
+\x25\x73\x5c\x6e\x22\x2c\x20\x61\x75\x78\x2d\x3e\x69\x64\x2c\0\x30\x3a\x34\0\
+\x30\x3a\x35\0\x09\x69\x66\x20\x28\x21\x62\x74\x66\x29\0\x62\x70\x66\x5f\x66\
+\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\x69\x6e\x73\x6e\x5f\x6f\x66\x66\0\x74\x79\
+\x70\x65\x5f\x69\x64\0\x30\0\x73\x74\x72\x69\x6e\x67\x73\0\x74\x79\x70\x65\x73\
+\0\x68\x64\x72\0\x62\x74\x66\x5f\x68\x65\x61\x64\x65\x72\0\x73\x74\x72\x5f\x6c\
+\x65\x6e\0\x09\x74\x79\x70\x65\x73\x20\x3d\x20\x62\x74\x66\x2d\x3e\x74\x79\x70\
+\x65\x73\x3b\0\x09\x62\x70\x66\x5f\x70\x72\x6f\x62\x65\x5f\x72\x65\x61\x64\x5f\
+\x6b\x65\x72\x6e\x65\x6c\x28\x26\x74\x2c\x20\x73\x69\x7a\x65\x6f\x66\x28\x74\
+\x29\x2c\x20\x74\x79\x70\x65\x73\x20\x2b\x20\x62\x74\x66\x5f\x69\x64\x29\x3b\0\
+\x09\x73\x74\x72\x20\x3d\x20\x62\x74\x66\x2d\x3e\x73\x74\x72\x69\x6e\x67\x73\
+\x3b\0\x62\x74\x66\x5f\x74\x79\x70\x65\0\x6e\x61\x6d\x65\x5f\x6f\x66\x66\0\x09\
+\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x20\x3d\x20\x42\x50\x46\x5f\x43\x4f\x52\x45\
+\x5f\x52\x45\x41\x44\x28\x74\x2c\x20\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x29\x3b\0\
+\x30\x3a\x32\x3a\x30\0\x09\x69\x66\x20\x28\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x20\
+\x3e\x3d\x20\x62\x74\x66\x2d\x3e\x68\x64\x72\x2e\x73\x74\x72\x5f\x6c\x65\x6e\
+\x29\0\x09\x72\x65\x74\x75\x72\x6e\x20\x73\x74\x72\x20\x2b\x20\x6e\x61\x6d\x65\
+\x5f\x6f\x66\x66\x3b\0\x30\x3a\x33\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\
+\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\
+\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x31\0\x64\x75\x6d\x70\x5f\x62\x70\x66\
+\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\x74\0\x64\x75\x6d\x70\x5f\x62\x70\
+\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x32\0\x4c\x49\x43\x45\
+\x4e\x53\x45\0\x2e\x72\x6f\x64\x61\x74\x61\0\x6c\x69\x63\x65\x6e\x73\x65\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x2d\x09\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x02\0\0\
+\0\x04\0\0\0\x62\0\0\0\x01\0\0\0\x80\x04\0\0\0\0\0\0\0\0\0\0\x69\x74\x65\x72\
+\x61\x74\x6f\x72\x2e\x72\x6f\x64\x61\x74\x61\0\0\0\0\0\0\0\0\0\0\0\0\0\x2f\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\
+\x20\x20\x20\x20\x20\x20\x20\x20\x20\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\
+\x73\x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x25\x36\x64\x0a\0\x20\x20\x69\
+\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\
+\x61\x74\x74\x61\x63\x68\x65\x64\x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\
+\x25\x73\x20\x25\x73\x0a\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x47\x50\x4c\0\0\0\0\0\
+\x79\x12\0\0\0\0\0\0\x79\x26\0\0\0\0\0\0\x79\x17\x08\0\0\0\0\0\x15\x07\x1b\0\0\
+\0\0\0\x79\x11\0\0\0\0\0\0\x79\x11\x10\0\0\0\0\0\x55\x01\x08\0\0\0\0\0\xbf\xa4\
+\0\0\0\0\0\0\x07\x04\0\0\xe8\xff\xff\xff\xbf\x61\0\0\0\0\0\0\x18\x62\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\xb7\x03\0\0\x23\0\0\0\xb7\x05\0\0\0\0\0\0\x85\0\0\0\x7e\0\0\
+\0\x61\x71\0\0\0\0\0\0\x7b\x1a\xe8\xff\0\0\0\0\xb7\x01\0\0\x04\0\0\0\xbf\x72\0\
+\0\0\0\0\0\x0f\x12\0\0\0\0\0\0\x7b\x2a\xf0\xff\0\0\0\0\x61\x71\x14\0\0\0\0\0\
+\x7b\x1a\xf8\xff\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\x04\0\0\xe8\xff\xff\xff\xbf\
+\x61\0\0\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x23\0\0\0\xb7\x03\0\0\x0e\0\0\0\
+\xb7\x05\0\0\x18\0\0\0\x85\0\0\0\x7e\0\0\0\xb7\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0\
+\0\0\0\0\x07\0\0\0\0\0\0\0\x42\0\0\0\x7b\0\0\0\x1e\x3c\x01\0\x01\0\0\0\x42\0\0\
+\0\x7b\0\0\0\x24\x3c\x01\0\x02\0\0\0\x42\0\0\0\xee\0\0\0\x1d\x44\x01\0\x03\0\0\
+\0\x42\0\0\0\x0f\x01\0\0\x06\x4c\x01\0\x04\0\0\0\x42\0\0\0\x1a\x01\0\0\x17\x40\
+\x01\0\x05\0\0\0\x42\0\0\0\x1a\x01\0\0\x1d\x40\x01\0\x06\0\0\0\x42\0\0\0\x43\
+\x01\0\0\x06\x58\x01\0\x08\0\0\0\x42\0\0\0\x56\x01\0\0\x03\x5c\x01\0\x0f\0\0\0\
+\x42\0\0\0\xdc\x01\0\0\x02\x64\x01\0\x1f\0\0\0\x42\0\0\0\x2a\x02\0\0\x01\x6c\
+\x01\0\0\0\0\0\x02\0\0\0\x3e\0\0\0\0\0\0\0\x08\0\0\0\x08\0\0\0\x3e\0\0\0\0\0\0\
+\0\x10\0\0\0\x02\0\0\0\xea\0\0\0\0\0\0\0\x20\0\0\0\x02\0\0\0\x3e\0\0\0\0\0\0\0\
+\x28\0\0\0\x08\0\0\0\x3f\x01\0\0\0\0\0\0\x78\0\0\0\x0d\0\0\0\x3e\0\0\0\0\0\0\0\
+\x88\0\0\0\x0d\0\0\0\xea\0\0\0\0\0\0\0\xa8\0\0\0\x0d\0\0\0\x3f\x01\0\0\0\0\0\0\
+\x1a\0\0\0\x21\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\0\0\0\0\
+\0\0\0\0\x1c\0\0\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\x10\0\0\0\0\0\0\
+\0\0\0\0\0\x0a\0\0\0\x01\0\0\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\x10\0\0\0\0\0\0\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x62\x70\x66\x5f\x6d\
+\x61\x70\0\0\0\0\0\0\0\0\x47\x50\x4c\0\0\0\0\0\x79\x12\0\0\0\0\0\0\x79\x26\0\0\
+\0\0\0\0\x79\x12\x08\0\0\0\0\0\x15\x02\x3c\0\0\0\0\0\x79\x11\0\0\0\0\0\0\x79\
+\x27\0\0\0\0\0\0\x79\x11\x10\0\0\0\0\0\x55\x01\x08\0\0\0\0\0\xbf\xa4\0\0\0\0\0\
+\0\x07\x04\0\0\xd0\xff\xff\xff\xbf\x61\0\0\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\
+\x31\0\0\0\xb7\x03\0\0\x20\0\0\0\xb7\x05\0\0\0\0\0\0\x85\0\0\0\x7e\0\0\0\x7b\
+\x6a\xc8\xff\0\0\0\0\x61\x71\0\0\0\0\0\0\x7b\x1a\xd0\xff\0\0\0\0\xb7\x03\0\0\
+\x04\0\0\0\xbf\x79\0\0\0\0\0\0\x0f\x39\0\0\0\0\0\0\x79\x71\x28\0\0\0\0\0\x79\
+\x78\x30\0\0\0\0\0\x15\x08\x18\0\0\0\0\0\xb7\x02\0\0\0\0\0\0\x0f\x21\0\0\0\0\0\
+\0\x61\x11\x04\0\0\0\0\0\x79\x83\x08\0\0\0\0\0\x67\x01\0\0\x03\0\0\0\x0f\x13\0\
+\0\0\0\0\0\x79\x86\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\x01\0\0\xf8\xff\xff\xff\
+\xb7\x02\0\0\x08\0\0\0\x85\0\0\0\x71\0\0\0\xb7\x01\0\0\0\0\0\0\x79\xa3\xf8\xff\
+\0\0\0\0\x0f\x13\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\x01\0\0\xf4\xff\xff\xff\
+\xb7\x02\0\0\x04\0\0\0\x85\0\0\0\x71\0\0\0\xb7\x03\0\0\x04\0\0\0\x61\xa1\xf4\
+\xff\0\0\0\0\x61\x82\x10\0\0\0\0\0\x3d\x21\x02\0\0\0\0\0\x0f\x16\0\0\0\0\0\0\
+\xbf\x69\0\0\0\0\0\0\x7b\x9a\xd8\xff\0\0\0\0\x79\x71\x18\0\0\0\0\0\x7b\x1a\xe0\
+\xff\0\0\0\0\x79\x71\x20\0\0\0\0\0\x79\x11\0\0\0\0\0\0\x0f\x31\0\0\0\0\0\0\x7b\
+\x1a\xe8\xff\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\x04\0\0\xd0\xff\xff\xff\x79\xa1\
+\xc8\xff\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x51\0\0\0\xb7\x03\0\0\x11\0\0\0\
+\xb7\x05\0\0\x20\0\0\0\x85\0\0\0\x7e\0\0\0\xb7\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0\
+\0\0\0\0\x17\0\0\0\0\0\0\0\x42\0\0\0\x7b\0\0\0\x1e\x80\x01\0\x01\0\0\0\x42\0\0\
+\0\x7b\0\0\0\x24\x80\x01\0\x02\0\0\0\x42\0\0\0\x60\x02\0\0\x1f\x88\x01\0\x03\0\
+\0\0\x42\0\0\0\x84\x02\0\0\x06\x94\x01\0\x04\0\0\0\x42\0\0\0\x1a\x01\0\0\x17\
+\x84\x01\0\x05\0\0\0\x42\0\0\0\x9d\x02\0\0\x0e\xa0\x01\0\x06\0\0\0\x42\0\0\0\
+\x1a\x01\0\0\x1d\x84\x01\0\x07\0\0\0\x42\0\0\0\x43\x01\0\0\x06\xa4\x01\0\x09\0\
+\0\0\x42\0\0\0\xaf\x02\0\0\x03\xa8\x01\0\x11\0\0\0\x42\0\0\0\x1f\x03\0\0\x02\
+\xb0\x01\0\x18\0\0\0\x42\0\0\0\x5a\x03\0\0\x06\x04\x01\0\x1b\0\0\0\x42\0\0\0\0\
+\0\0\0\0\0\0\0\x1c\0\0\0\x42\0\0\0\xab\x03\0\0\x0f\x10\x01\0\x1d\0\0\0\x42\0\0\
+\0\xc0\x03\0\0\x2d\x14\x01\0\x1f\0\0\0\x42\0\0\0\xf7\x03\0\0\x0d\x0c\x01\0\x21\
+\0\0\0\x42\0\0\0\0\0\0\0\0\0\0\0\x22\0\0\0\x42\0\0\0\xc0\x03\0\0\x02\x14\x01\0\
+\x25\0\0\0\x42\0\0\0\x1e\x04\0\0\x0d\x18\x01\0\x28\0\0\0\x42\0\0\0\0\0\0\0\0\0\
+\0\0\x29\0\0\0\x42\0\0\0\x1e\x04\0\0\x0d\x18\x01\0\x2c\0\0\0\x42\0\0\0\x1e\x04\
+\0\0\x0d\x18\x01\0\x2d\0\0\0\x42\0\0\0\x4c\x04\0\0\x1b\x1c\x01\0\x2e\0\0\0\x42\
+\0\0\0\x4c\x04\0\0\x06\x1c\x01\0\x2f\0\0\0\x42\0\0\0\x6f\x04\0\0\x0d\x24\x01\0\
+\x31\0\0\0\x42\0\0\0\x1f\x03\0\0\x02\xb0\x01\0\x40\0\0\0\x42\0\0\0\x2a\x02\0\0\
+\x01\xc0\x01\0\0\0\0\0\x14\0\0\0\x3e\0\0\0\0\0\0\0\x08\0\0\0\x08\0\0\0\x3e\0\0\
+\0\0\0\0\0\x10\0\0\0\x14\0\0\0\xea\0\0\0\0\0\0\0\x20\0\0\0\x14\0\0\0\x3e\0\0\0\
+\0\0\0\0\x28\0\0\0\x18\0\0\0\x3e\0\0\0\0\0\0\0\x30\0\0\0\x08\0\0\0\x3f\x01\0\0\
+\0\0\0\0\x88\0\0\0\x1a\0\0\0\x3e\0\0\0\0\0\0\0\x98\0\0\0\x1a\0\0\0\xea\0\0\0\0\
+\0\0\0\xb0\0\0\0\x1a\0\0\0\x52\x03\0\0\0\0\0\0\xb8\0\0\0\x1a\0\0\0\x56\x03\0\0\
+\0\0\0\0\xc8\0\0\0\x1f\0\0\0\x84\x03\0\0\0\0\0\0\xe0\0\0\0\x20\0\0\0\xea\0\0\0\
+\0\0\0\0\xf8\0\0\0\x20\0\0\0\x3e\0\0\0\0\0\0\0\x20\x01\0\0\x24\0\0\0\x3e\0\0\0\
+\0\0\0\0\x58\x01\0\0\x1a\0\0\0\xea\0\0\0\0\0\0\0\x68\x01\0\0\x20\0\0\0\x46\x04\
+\0\0\0\0\0\0\x90\x01\0\0\x1a\0\0\0\x3f\x01\0\0\0\0\0\0\xa0\x01\0\0\x1a\0\0\0\
+\x87\x04\0\0\0\0\0\0\xa8\x01\0\0\x18\0\0\0\x3e\0\0\0\0\0\0\0\x1a\0\0\0\x42\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\0\0\0\0\0\0\x1c\0\0\
+\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\x10\0\0\0\0\0\0\0\0\0\0\0\x1a\0\
+\0\0\x01\0\0\0\0\0\0\0\x13\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x10\0\0\0\0\0\
+\0\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\0\0\
+\0\0\0\0";
+ opts.insns_sz = 2216;
+ opts.insns = (void *)"\
+\xbf\x16\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\x01\0\0\x78\xff\xff\xff\xb7\x02\0\
+\0\x88\0\0\0\xb7\x03\0\0\0\0\0\0\x85\0\0\0\x71\0\0\0\x05\0\x14\0\0\0\0\0\x61\
+\xa1\x78\xff\0\0\0\0\xd5\x01\x01\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\x61\xa1\x7c\xff\
+\0\0\0\0\xd5\x01\x01\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\x61\xa1\x80\xff\0\0\0\0\xd5\
+\x01\x01\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\x61\xa1\x84\xff\0\0\0\0\xd5\x01\x01\0\0\
+\0\0\0\x85\0\0\0\xa8\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x61\x01\0\0\0\0\
+\0\0\xd5\x01\x02\0\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\xbf\x70\0\0\
+\0\0\0\0\x95\0\0\0\0\0\0\0\x61\x60\x08\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\
+\x48\x0e\0\0\x63\x01\0\0\0\0\0\0\x61\x60\x0c\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\
+\0\0\x44\x0e\0\0\x63\x01\0\0\0\0\0\0\x79\x60\x10\0\0\0\0\0\x18\x61\0\0\0\0\0\0\
+\0\0\0\0\x38\x0e\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\x05\0\0\
+\x18\x61\0\0\0\0\0\0\0\0\0\0\x30\x0e\0\0\x7b\x01\0\0\0\0\0\0\xb7\x01\0\0\x12\0\
+\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x30\x0e\0\0\xb7\x03\0\0\x1c\0\0\0\x85\0\0\0\
+\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\xc5\x07\xd4\xff\0\0\0\0\x63\x7a\x78\xff\0\0\0\0\
+\x61\xa0\x78\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x80\x0e\0\0\x63\x01\0\0\0\
+\0\0\0\x61\x60\x1c\0\0\0\0\0\x15\0\x03\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\
+\x5c\x0e\0\0\x63\x01\0\0\0\0\0\0\xb7\x01\0\0\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\
+\0\x50\x0e\0\0\xb7\x03\0\0\x48\0\0\0\x85\0\0\0\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\
+\xc5\x07\xc3\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x63\x71\0\0\0\0\0\
+\0\x79\x63\x20\0\0\0\0\0\x15\x03\x08\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x98\
+\x0e\0\0\xb7\x02\0\0\x62\0\0\0\x61\x60\x04\0\0\0\0\0\x45\0\x02\0\x01\0\0\0\x85\
+\0\0\0\x94\0\0\0\x05\0\x01\0\0\0\0\0\x85\0\0\0\x71\0\0\0\x18\x62\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\x61\x20\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x08\x0f\0\0\x63\
+\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\x0f\0\0\x18\x61\0\0\0\0\0\0\0\0\
+\0\0\x10\x0f\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x98\x0e\0\0\
+\x18\x61\0\0\0\0\0\0\0\0\0\0\x18\x0f\0\0\x7b\x01\0\0\0\0\0\0\xb7\x01\0\0\x02\0\
+\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x08\x0f\0\0\xb7\x03\0\0\x20\0\0\0\x85\0\0\0\
+\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\xc5\x07\x9f\xff\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\x61\x20\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x28\x0f\0\0\x63\
+\x01\0\0\0\0\0\0\xb7\x01\0\0\x16\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x28\x0f\0\0\
+\xb7\x03\0\0\x04\0\0\0\x85\0\0\0\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\xc5\x07\x92\xff\
+\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x30\x0f\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\
+\x78\x11\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x38\x0f\0\0\x18\
+\x61\0\0\0\0\0\0\0\0\0\0\x70\x11\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\
+\0\0\0\x40\x10\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xb8\x11\0\0\x7b\x01\0\0\0\0\0\0\
+\x18\x60\0\0\0\0\0\0\0\0\0\0\x48\x10\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xc8\x11\0\
+\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xe8\x10\0\0\x18\x61\0\0\0\0\
+\0\0\0\0\0\0\xe8\x11\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xe0\x11\0\0\x7b\x01\0\0\0\0\0\0\x61\x60\x08\0\0\
+\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x80\x11\0\0\x63\x01\0\0\0\0\0\0\x61\x60\x0c\
+\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x84\x11\0\0\x63\x01\0\0\0\0\0\0\x79\x60\
+\x10\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x88\x11\0\0\x7b\x01\0\0\0\0\0\0\x61\
+\xa0\x78\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xb0\x11\0\0\x63\x01\0\0\0\0\0\
+\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xf8\x11\0\0\xb7\x02\0\0\x11\0\0\0\xb7\x03\0\0\
+\x0c\0\0\0\xb7\x04\0\0\0\0\0\0\x85\0\0\0\xa7\0\0\0\xbf\x07\0\0\0\0\0\0\xc5\x07\
+\x5c\xff\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x68\x11\0\0\x63\x70\x6c\0\0\0\0\0\
+\x77\x07\0\0\x20\0\0\0\x63\x70\x70\0\0\0\0\0\xb7\x01\0\0\x05\0\0\0\x18\x62\0\0\
+\0\0\0\0\0\0\0\0\x68\x11\0\0\xb7\x03\0\0\x8c\0\0\0\x85\0\0\0\xa6\0\0\0\xbf\x07\
+\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xd8\x11\0\0\x61\x01\0\0\0\0\0\0\xd5\
+\x01\x02\0\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\xc5\x07\x4a\xff\0\0\
+\0\0\x63\x7a\x80\xff\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x10\x12\0\0\x18\x61\0\
+\0\0\0\0\0\0\0\0\0\x10\x17\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\
+\x18\x12\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x08\x17\0\0\x7b\x01\0\0\0\0\0\0\x18\
+\x60\0\0\0\0\0\0\0\0\0\0\x28\x14\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x50\x17\0\0\
+\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x30\x14\0\0\x18\x61\0\0\0\0\0\
+\0\0\0\0\0\x60\x17\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xd0\x15\
+\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x80\x17\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x78\x17\0\0\x7b\x01\0\0\0\0\
+\0\0\x61\x60\x08\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x18\x17\0\0\x63\x01\0\0\
+\0\0\0\0\x61\x60\x0c\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x1c\x17\0\0\x63\x01\
+\0\0\0\0\0\0\x79\x60\x10\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x20\x17\0\0\x7b\
+\x01\0\0\0\0\0\0\x61\xa0\x78\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x48\x17\0\
+\0\x63\x01\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x90\x17\0\0\xb7\x02\0\0\x12\
+\0\0\0\xb7\x03\0\0\x0c\0\0\0\xb7\x04\0\0\0\0\0\0\x85\0\0\0\xa7\0\0\0\xbf\x07\0\
+\0\0\0\0\0\xc5\x07\x13\xff\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\x17\0\0\x63\
+\x70\x6c\0\0\0\0\0\x77\x07\0\0\x20\0\0\0\x63\x70\x70\0\0\0\0\0\xb7\x01\0\0\x05\
+\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\0\x17\0\0\xb7\x03\0\0\x8c\0\0\0\x85\0\0\0\
+\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x70\x17\0\0\x61\x01\
+\0\0\0\0\0\0\xd5\x01\x02\0\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\xc5\
+\x07\x01\xff\0\0\0\0\x63\x7a\x84\xff\0\0\0\0\x61\xa1\x78\xff\0\0\0\0\xd5\x01\
+\x02\0\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\x61\xa0\x80\xff\0\0\0\0\
+\x63\x06\x28\0\0\0\0\0\x61\xa0\x84\xff\0\0\0\0\x63\x06\x2c\0\0\0\0\0\x18\x61\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\x61\x10\0\0\0\0\0\0\x63\x06\x18\0\0\0\0\0\xb7\0\0\0\
+\0\0\0\0\x95\0\0\0\0\0\0\0";
+ err = bpf_load_and_run(&opts);
+ if (err < 0)
+ return err;
+ skel->rodata = skel_finalize_map_data(&skel->maps.rodata.initial_value,
+ 4096, PROT_READ, skel->maps.rodata.map_fd);
+ if (!skel->rodata)
+ return -ENOMEM;
+ return 0;
+}
+
+static inline struct iterators_bpf *
+iterators_bpf__open_and_load(void)
+{
+ struct iterators_bpf *skel;
+
+ skel = iterators_bpf__open();
+ if (!skel)
+ return NULL;
+ if (iterators_bpf__load(skel)) {
+ iterators_bpf__destroy(skel);
+ return NULL;
+ }
+ return skel;
+}
+
+#endif /* __ITERATORS_BPF_SKEL_H__ */
diff --git a/kernel/bpf/preload/iterators/iterators.skel.h b/kernel/bpf/preload/iterators/iterators.skel.h
deleted file mode 100644
index cf9a6a94b3a4..000000000000
--- a/kernel/bpf/preload/iterators/iterators.skel.h
+++ /dev/null
@@ -1,412 +0,0 @@
-/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
-
-/* THIS FILE IS AUTOGENERATED! */
-#ifndef __ITERATORS_BPF_SKEL_H__
-#define __ITERATORS_BPF_SKEL_H__
-
-#include <stdlib.h>
-#include <bpf/libbpf.h>
-
-struct iterators_bpf {
- struct bpf_object_skeleton *skeleton;
- struct bpf_object *obj;
- struct {
- struct bpf_map *rodata;
- } maps;
- struct {
- struct bpf_program *dump_bpf_map;
- struct bpf_program *dump_bpf_prog;
- } progs;
- struct {
- struct bpf_link *dump_bpf_map;
- struct bpf_link *dump_bpf_prog;
- } links;
- struct iterators_bpf__rodata {
- char dump_bpf_map____fmt[35];
- char dump_bpf_map____fmt_1[14];
- char dump_bpf_prog____fmt[32];
- char dump_bpf_prog____fmt_2[17];
- } *rodata;
-};
-
-static void
-iterators_bpf__destroy(struct iterators_bpf *obj)
-{
- if (!obj)
- return;
- if (obj->skeleton)
- bpf_object__destroy_skeleton(obj->skeleton);
- free(obj);
-}
-
-static inline int
-iterators_bpf__create_skeleton(struct iterators_bpf *obj);
-
-static inline struct iterators_bpf *
-iterators_bpf__open_opts(const struct bpf_object_open_opts *opts)
-{
- struct iterators_bpf *obj;
-
- obj = (struct iterators_bpf *)calloc(1, sizeof(*obj));
- if (!obj)
- return NULL;
- if (iterators_bpf__create_skeleton(obj))
- goto err;
- if (bpf_object__open_skeleton(obj->skeleton, opts))
- goto err;
-
- return obj;
-err:
- iterators_bpf__destroy(obj);
- return NULL;
-}
-
-static inline struct iterators_bpf *
-iterators_bpf__open(void)
-{
- return iterators_bpf__open_opts(NULL);
-}
-
-static inline int
-iterators_bpf__load(struct iterators_bpf *obj)
-{
- return bpf_object__load_skeleton(obj->skeleton);
-}
-
-static inline struct iterators_bpf *
-iterators_bpf__open_and_load(void)
-{
- struct iterators_bpf *obj;
-
- obj = iterators_bpf__open();
- if (!obj)
- return NULL;
- if (iterators_bpf__load(obj)) {
- iterators_bpf__destroy(obj);
- return NULL;
- }
- return obj;
-}
-
-static inline int
-iterators_bpf__attach(struct iterators_bpf *obj)
-{
- return bpf_object__attach_skeleton(obj->skeleton);
-}
-
-static inline void
-iterators_bpf__detach(struct iterators_bpf *obj)
-{
- return bpf_object__detach_skeleton(obj->skeleton);
-}
-
-static inline int
-iterators_bpf__create_skeleton(struct iterators_bpf *obj)
-{
- struct bpf_object_skeleton *s;
-
- s = (struct bpf_object_skeleton *)calloc(1, sizeof(*s));
- if (!s)
- return -1;
- obj->skeleton = s;
-
- s->sz = sizeof(*s);
- s->name = "iterators_bpf";
- s->obj = &obj->obj;
-
- /* maps */
- s->map_cnt = 1;
- s->map_skel_sz = sizeof(*s->maps);
- s->maps = (struct bpf_map_skeleton *)calloc(s->map_cnt, s->map_skel_sz);
- if (!s->maps)
- goto err;
-
- s->maps[0].name = "iterator.rodata";
- s->maps[0].map = &obj->maps.rodata;
- s->maps[0].mmaped = (void **)&obj->rodata;
-
- /* programs */
- s->prog_cnt = 2;
- s->prog_skel_sz = sizeof(*s->progs);
- s->progs = (struct bpf_prog_skeleton *)calloc(s->prog_cnt, s->prog_skel_sz);
- if (!s->progs)
- goto err;
-
- s->progs[0].name = "dump_bpf_map";
- s->progs[0].prog = &obj->progs.dump_bpf_map;
- s->progs[0].link = &obj->links.dump_bpf_map;
-
- s->progs[1].name = "dump_bpf_prog";
- s->progs[1].prog = &obj->progs.dump_bpf_prog;
- s->progs[1].link = &obj->links.dump_bpf_prog;
-
- s->data_sz = 7176;
- s->data = (void *)"\
-\x7f\x45\x4c\x46\x02\x01\x01\0\0\0\0\0\0\0\0\0\x01\0\xf7\0\x01\0\0\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\0\0\0\0\x48\x18\0\0\0\0\0\0\0\0\0\0\x40\0\0\0\0\0\x40\0\x0f\0\
-\x0e\0\x79\x12\0\0\0\0\0\0\x79\x26\0\0\0\0\0\0\x79\x17\x08\0\0\0\0\0\x15\x07\
-\x1a\0\0\0\0\0\x79\x21\x10\0\0\0\0\0\x55\x01\x08\0\0\0\0\0\xbf\xa4\0\0\0\0\0\0\
-\x07\x04\0\0\xe8\xff\xff\xff\xbf\x61\0\0\0\0\0\0\x18\x02\0\0\0\0\0\0\0\0\0\0\0\
-\0\0\0\xb7\x03\0\0\x23\0\0\0\xb7\x05\0\0\0\0\0\0\x85\0\0\0\x7e\0\0\0\x61\x71\0\
-\0\0\0\0\0\x7b\x1a\xe8\xff\0\0\0\0\xb7\x01\0\0\x04\0\0\0\xbf\x72\0\0\0\0\0\0\
-\x0f\x12\0\0\0\0\0\0\x7b\x2a\xf0\xff\0\0\0\0\x61\x71\x14\0\0\0\0\0\x7b\x1a\xf8\
-\xff\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\x04\0\0\xe8\xff\xff\xff\xbf\x61\0\0\0\0\0\
-\0\x18\x02\0\0\x23\0\0\0\0\0\0\0\0\0\0\0\xb7\x03\0\0\x0e\0\0\0\xb7\x05\0\0\x18\
-\0\0\0\x85\0\0\0\x7e\0\0\0\xb7\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0\x79\x12\0\0\0\0\
-\0\0\x79\x26\0\0\0\0\0\0\x79\x11\x08\0\0\0\0\0\x15\x01\x3b\0\0\0\0\0\x79\x17\0\
-\0\0\0\0\0\x79\x21\x10\0\0\0\0\0\x55\x01\x08\0\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\
-\x04\0\0\xd0\xff\xff\xff\xbf\x61\0\0\0\0\0\0\x18\x02\0\0\x31\0\0\0\0\0\0\0\0\0\
-\0\0\xb7\x03\0\0\x20\0\0\0\xb7\x05\0\0\0\0\0\0\x85\0\0\0\x7e\0\0\0\x7b\x6a\xc8\
-\xff\0\0\0\0\x61\x71\0\0\0\0\0\0\x7b\x1a\xd0\xff\0\0\0\0\xb7\x03\0\0\x04\0\0\0\
-\xbf\x79\0\0\0\0\0\0\x0f\x39\0\0\0\0\0\0\x79\x71\x28\0\0\0\0\0\x79\x78\x30\0\0\
-\0\0\0\x15\x08\x18\0\0\0\0\0\xb7\x02\0\0\0\0\0\0\x0f\x21\0\0\0\0\0\0\x61\x11\
-\x04\0\0\0\0\0\x79\x83\x08\0\0\0\0\0\x67\x01\0\0\x03\0\0\0\x0f\x13\0\0\0\0\0\0\
-\x79\x86\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\x01\0\0\xf8\xff\xff\xff\xb7\x02\0\
-\0\x08\0\0\0\x85\0\0\0\x71\0\0\0\xb7\x01\0\0\0\0\0\0\x79\xa3\xf8\xff\0\0\0\0\
-\x0f\x13\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\x01\0\0\xf4\xff\xff\xff\xb7\x02\0\
-\0\x04\0\0\0\x85\0\0\0\x71\0\0\0\xb7\x03\0\0\x04\0\0\0\x61\xa1\xf4\xff\0\0\0\0\
-\x61\x82\x10\0\0\0\0\0\x3d\x21\x02\0\0\0\0\0\x0f\x16\0\0\0\0\0\0\xbf\x69\0\0\0\
-\0\0\0\x7b\x9a\xd8\xff\0\0\0\0\x79\x71\x18\0\0\0\0\0\x7b\x1a\xe0\xff\0\0\0\0\
-\x79\x71\x20\0\0\0\0\0\x79\x11\0\0\0\0\0\0\x0f\x31\0\0\0\0\0\0\x7b\x1a\xe8\xff\
-\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\x04\0\0\xd0\xff\xff\xff\x79\xa1\xc8\xff\0\0\0\
-\0\x18\x02\0\0\x51\0\0\0\0\0\0\0\0\0\0\0\xb7\x03\0\0\x11\0\0\0\xb7\x05\0\0\x20\
-\0\0\0\x85\0\0\0\x7e\0\0\0\xb7\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0\x20\x20\x69\x64\
-\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x6d\
-\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\
-\x73\x25\x36\x64\x0a\0\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\
-\x20\x20\x20\x20\x20\x20\x20\x20\x61\x74\x74\x61\x63\x68\x65\x64\x0a\0\x25\x34\
-\x75\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\x25\x73\x0a\0\x47\x50\x4c\0\x9f\
-\xeb\x01\0\x18\0\0\0\0\0\0\0\x1c\x04\0\0\x1c\x04\0\0\x09\x05\0\0\0\0\0\0\0\0\0\
-\x02\x02\0\0\0\x01\0\0\0\x02\0\0\x04\x10\0\0\0\x13\0\0\0\x03\0\0\0\0\0\0\0\x18\
-\0\0\0\x04\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\x02\x08\0\0\0\0\0\0\0\0\0\0\x02\x0d\0\
-\0\0\0\0\0\0\x01\0\0\x0d\x06\0\0\0\x1c\0\0\0\x01\0\0\0\x20\0\0\0\0\0\0\x01\x04\
-\0\0\0\x20\0\0\x01\x24\0\0\0\x01\0\0\x0c\x05\0\0\0\xaf\0\0\0\x03\0\0\x04\x18\0\
-\0\0\xbd\0\0\0\x09\0\0\0\0\0\0\0\xc1\0\0\0\x0b\0\0\0\x40\0\0\0\xcc\0\0\0\x0b\0\
-\0\0\x80\0\0\0\0\0\0\0\0\0\0\x02\x0a\0\0\0\xd4\0\0\0\0\0\0\x07\0\0\0\0\xdd\0\0\
-\0\0\0\0\x08\x0c\0\0\0\xe3\0\0\0\0\0\0\x01\x08\0\0\0\x40\0\0\0\xa4\x01\0\0\x03\
-\0\0\x04\x18\0\0\0\xac\x01\0\0\x0e\0\0\0\0\0\0\0\xaf\x01\0\0\x11\0\0\0\x20\0\0\
-\0\xb4\x01\0\0\x0e\0\0\0\xa0\0\0\0\xc0\x01\0\0\0\0\0\x08\x0f\0\0\0\xc6\x01\0\0\
-\0\0\0\x01\x04\0\0\0\x20\0\0\0\xd3\x01\0\0\0\0\0\x01\x01\0\0\0\x08\0\0\x01\0\0\
-\0\0\0\0\0\x03\0\0\0\0\x10\0\0\0\x12\0\0\0\x10\0\0\0\xd8\x01\0\0\0\0\0\x01\x04\
-\0\0\0\x20\0\0\0\0\0\0\0\0\0\0\x02\x14\0\0\0\x3c\x02\0\0\x02\0\0\x04\x10\0\0\0\
-\x13\0\0\0\x03\0\0\0\0\0\0\0\x4f\x02\0\0\x15\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\x02\
-\x18\0\0\0\0\0\0\0\x01\0\0\x0d\x06\0\0\0\x1c\0\0\0\x13\0\0\0\x54\x02\0\0\x01\0\
-\0\x0c\x16\0\0\0\xa0\x02\0\0\x01\0\0\x04\x08\0\0\0\xa9\x02\0\0\x19\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\0\x02\x1a\0\0\0\xfa\x02\0\0\x06\0\0\x04\x38\0\0\0\xac\x01\0\0\
-\x0e\0\0\0\0\0\0\0\xaf\x01\0\0\x11\0\0\0\x20\0\0\0\x07\x03\0\0\x1b\0\0\0\xc0\0\
-\0\0\x18\x03\0\0\x15\0\0\0\0\x01\0\0\x21\x03\0\0\x1d\0\0\0\x40\x01\0\0\x2b\x03\
-\0\0\x1e\0\0\0\x80\x01\0\0\0\0\0\0\0\0\0\x02\x1c\0\0\0\0\0\0\0\0\0\0\x0a\x10\0\
-\0\0\0\0\0\0\0\0\0\x02\x1f\0\0\0\0\0\0\0\0\0\0\x02\x20\0\0\0\x75\x03\0\0\x02\0\
-\0\x04\x08\0\0\0\x83\x03\0\0\x0e\0\0\0\0\0\0\0\x8c\x03\0\0\x0e\0\0\0\x20\0\0\0\
-\x2b\x03\0\0\x03\0\0\x04\x18\0\0\0\x96\x03\0\0\x1b\0\0\0\0\0\0\0\x9e\x03\0\0\
-\x21\0\0\0\x40\0\0\0\xa4\x03\0\0\x23\0\0\0\x80\0\0\0\0\0\0\0\0\0\0\x02\x22\0\0\
-\0\0\0\0\0\0\0\0\x02\x24\0\0\0\xa8\x03\0\0\x01\0\0\x04\x04\0\0\0\xb3\x03\0\0\
-\x0e\0\0\0\0\0\0\0\x1c\x04\0\0\x01\0\0\x04\x04\0\0\0\x25\x04\0\0\x0e\0\0\0\0\0\
-\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\x12\0\0\0\x23\0\0\0\x9b\x04\0\0\0\0\0\
-\x0e\x25\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\x12\0\0\0\x0e\0\0\0\
-\xaf\x04\0\0\0\0\0\x0e\x27\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\
-\x12\0\0\0\x20\0\0\0\xc5\x04\0\0\0\0\0\x0e\x29\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\
-\0\0\0\0\x1c\0\0\0\x12\0\0\0\x11\0\0\0\xda\x04\0\0\0\0\0\x0e\x2b\0\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\x03\0\0\0\0\x10\0\0\0\x12\0\0\0\x04\0\0\0\xf1\x04\0\0\0\0\0\x0e\
-\x2d\0\0\0\x01\0\0\0\xf9\x04\0\0\x04\0\0\x0f\0\0\0\0\x26\0\0\0\0\0\0\0\x23\0\0\
-\0\x28\0\0\0\x23\0\0\0\x0e\0\0\0\x2a\0\0\0\x31\0\0\0\x20\0\0\0\x2c\0\0\0\x51\0\
-\0\0\x11\0\0\0\x01\x05\0\0\x01\0\0\x0f\0\0\0\0\x2e\0\0\0\0\0\0\0\x04\0\0\0\0\
-\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\x66\x5f\x6d\x61\x70\0\x6d\x65\
-\x74\x61\0\x6d\x61\x70\0\x63\x74\x78\0\x69\x6e\x74\0\x64\x75\x6d\x70\x5f\x62\
-\x70\x66\x5f\x6d\x61\x70\0\x69\x74\x65\x72\x2f\x62\x70\x66\x5f\x6d\x61\x70\0\
-\x30\x3a\x30\0\x2f\x68\x6f\x6d\x65\x2f\x61\x6c\x72\x75\x61\x2f\x62\x75\x69\x6c\
-\x64\x2f\x6c\x69\x6e\x75\x78\x2f\x6b\x65\x72\x6e\x65\x6c\x2f\x62\x70\x66\x2f\
-\x70\x72\x65\x6c\x6f\x61\x64\x2f\x69\x74\x65\x72\x61\x74\x6f\x72\x73\x2f\x69\
-\x74\x65\x72\x61\x74\x6f\x72\x73\x2e\x62\x70\x66\x2e\x63\0\x09\x73\x74\x72\x75\
-\x63\x74\x20\x73\x65\x71\x5f\x66\x69\x6c\x65\x20\x2a\x73\x65\x71\x20\x3d\x20\
-\x63\x74\x78\x2d\x3e\x6d\x65\x74\x61\x2d\x3e\x73\x65\x71\x3b\0\x62\x70\x66\x5f\
-\x69\x74\x65\x72\x5f\x6d\x65\x74\x61\0\x73\x65\x71\0\x73\x65\x73\x73\x69\x6f\
-\x6e\x5f\x69\x64\0\x73\x65\x71\x5f\x6e\x75\x6d\0\x73\x65\x71\x5f\x66\x69\x6c\
-\x65\0\x5f\x5f\x75\x36\x34\0\x6c\x6f\x6e\x67\x20\x6c\x6f\x6e\x67\x20\x75\x6e\
-\x73\x69\x67\x6e\x65\x64\x20\x69\x6e\x74\0\x30\x3a\x31\0\x09\x73\x74\x72\x75\
-\x63\x74\x20\x62\x70\x66\x5f\x6d\x61\x70\x20\x2a\x6d\x61\x70\x20\x3d\x20\x63\
-\x74\x78\x2d\x3e\x6d\x61\x70\x3b\0\x09\x69\x66\x20\x28\x21\x6d\x61\x70\x29\0\
-\x30\x3a\x32\0\x09\x5f\x5f\x75\x36\x34\x20\x73\x65\x71\x5f\x6e\x75\x6d\x20\x3d\
-\x20\x63\x74\x78\x2d\x3e\x6d\x65\x74\x61\x2d\x3e\x73\x65\x71\x5f\x6e\x75\x6d\
-\x3b\0\x09\x69\x66\x20\x28\x73\x65\x71\x5f\x6e\x75\x6d\x20\x3d\x3d\x20\x30\x29\
-\0\x09\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\
-\x71\x2c\x20\x22\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\
-\x20\x20\x20\x20\x20\x20\x20\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x5c\
-\x6e\x22\x29\x3b\0\x62\x70\x66\x5f\x6d\x61\x70\0\x69\x64\0\x6e\x61\x6d\x65\0\
-\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\0\x5f\x5f\x75\x33\x32\0\x75\x6e\
-\x73\x69\x67\x6e\x65\x64\x20\x69\x6e\x74\0\x63\x68\x61\x72\0\x5f\x5f\x41\x52\
-\x52\x41\x59\x5f\x53\x49\x5a\x45\x5f\x54\x59\x50\x45\x5f\x5f\0\x09\x42\x50\x46\
-\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x25\
-\x34\x75\x20\x25\x2d\x31\x36\x73\x25\x36\x64\x5c\x6e\x22\x2c\x20\x6d\x61\x70\
-\x2d\x3e\x69\x64\x2c\x20\x6d\x61\x70\x2d\x3e\x6e\x61\x6d\x65\x2c\x20\x6d\x61\
-\x70\x2d\x3e\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x29\x3b\0\x7d\0\x62\
-\x70\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x70\x72\
-\x6f\x67\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x69\x74\x65\
-\x72\x2f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x09\x73\x74\x72\x75\x63\x74\x20\x62\
-\x70\x66\x5f\x70\x72\x6f\x67\x20\x2a\x70\x72\x6f\x67\x20\x3d\x20\x63\x74\x78\
-\x2d\x3e\x70\x72\x6f\x67\x3b\0\x09\x69\x66\x20\x28\x21\x70\x72\x6f\x67\x29\0\
-\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x61\x75\x78\0\x09\x61\x75\x78\x20\x3d\x20\
-\x70\x72\x6f\x67\x2d\x3e\x61\x75\x78\x3b\0\x09\x09\x42\x50\x46\x5f\x53\x45\x51\
-\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x20\x20\x69\x64\x20\
-\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x61\x74\
-\x74\x61\x63\x68\x65\x64\x5c\x6e\x22\x29\x3b\0\x62\x70\x66\x5f\x70\x72\x6f\x67\
-\x5f\x61\x75\x78\0\x61\x74\x74\x61\x63\x68\x5f\x66\x75\x6e\x63\x5f\x6e\x61\x6d\
-\x65\0\x64\x73\x74\x5f\x70\x72\x6f\x67\0\x66\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\
-\x62\x74\x66\0\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\
-\x73\x65\x71\x2c\x20\x22\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\
-\x25\x73\x5c\x6e\x22\x2c\x20\x61\x75\x78\x2d\x3e\x69\x64\x2c\0\x30\x3a\x34\0\
-\x30\x3a\x35\0\x09\x69\x66\x20\x28\x21\x62\x74\x66\x29\0\x62\x70\x66\x5f\x66\
-\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\x69\x6e\x73\x6e\x5f\x6f\x66\x66\0\x74\x79\
-\x70\x65\x5f\x69\x64\0\x30\0\x73\x74\x72\x69\x6e\x67\x73\0\x74\x79\x70\x65\x73\
-\0\x68\x64\x72\0\x62\x74\x66\x5f\x68\x65\x61\x64\x65\x72\0\x73\x74\x72\x5f\x6c\
-\x65\x6e\0\x09\x74\x79\x70\x65\x73\x20\x3d\x20\x62\x74\x66\x2d\x3e\x74\x79\x70\
-\x65\x73\x3b\0\x09\x62\x70\x66\x5f\x70\x72\x6f\x62\x65\x5f\x72\x65\x61\x64\x5f\
-\x6b\x65\x72\x6e\x65\x6c\x28\x26\x74\x2c\x20\x73\x69\x7a\x65\x6f\x66\x28\x74\
-\x29\x2c\x20\x74\x79\x70\x65\x73\x20\x2b\x20\x62\x74\x66\x5f\x69\x64\x29\x3b\0\
-\x09\x73\x74\x72\x20\x3d\x20\x62\x74\x66\x2d\x3e\x73\x74\x72\x69\x6e\x67\x73\
-\x3b\0\x62\x74\x66\x5f\x74\x79\x70\x65\0\x6e\x61\x6d\x65\x5f\x6f\x66\x66\0\x09\
-\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x20\x3d\x20\x42\x50\x46\x5f\x43\x4f\x52\x45\
-\x5f\x52\x45\x41\x44\x28\x74\x2c\x20\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x29\x3b\0\
-\x30\x3a\x32\x3a\x30\0\x09\x69\x66\x20\x28\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x20\
-\x3e\x3d\x20\x62\x74\x66\x2d\x3e\x68\x64\x72\x2e\x73\x74\x72\x5f\x6c\x65\x6e\
-\x29\0\x09\x72\x65\x74\x75\x72\x6e\x20\x73\x74\x72\x20\x2b\x20\x6e\x61\x6d\x65\
-\x5f\x6f\x66\x66\x3b\0\x30\x3a\x33\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\
-\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\
-\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x31\0\x64\x75\x6d\x70\x5f\x62\x70\x66\
-\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\x74\0\x64\x75\x6d\x70\x5f\x62\x70\
-\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x32\0\x4c\x49\x43\x45\
-\x4e\x53\x45\0\x2e\x72\x6f\x64\x61\x74\x61\0\x6c\x69\x63\x65\x6e\x73\x65\0\x9f\
-\xeb\x01\0\x20\0\0\0\0\0\0\0\x24\0\0\0\x24\0\0\0\x44\x02\0\0\x68\x02\0\0\xa4\
-\x01\0\0\x08\0\0\0\x31\0\0\0\x01\0\0\0\0\0\0\0\x07\0\0\0\x62\x02\0\0\x01\0\0\0\
-\0\0\0\0\x17\0\0\0\x10\0\0\0\x31\0\0\0\x09\0\0\0\0\0\0\0\x42\0\0\0\x87\0\0\0\
-\x1e\x40\x01\0\x08\0\0\0\x42\0\0\0\x87\0\0\0\x24\x40\x01\0\x10\0\0\0\x42\0\0\0\
-\xfe\0\0\0\x1d\x48\x01\0\x18\0\0\0\x42\0\0\0\x1f\x01\0\0\x06\x50\x01\0\x20\0\0\
-\0\x42\0\0\0\x2e\x01\0\0\x1d\x44\x01\0\x28\0\0\0\x42\0\0\0\x53\x01\0\0\x06\x5c\
-\x01\0\x38\0\0\0\x42\0\0\0\x66\x01\0\0\x03\x60\x01\0\x70\0\0\0\x42\0\0\0\xec\
-\x01\0\0\x02\x68\x01\0\xf0\0\0\0\x42\0\0\0\x3a\x02\0\0\x01\x70\x01\0\x62\x02\0\
-\0\x1a\0\0\0\0\0\0\0\x42\0\0\0\x87\0\0\0\x1e\x84\x01\0\x08\0\0\0\x42\0\0\0\x87\
-\0\0\0\x24\x84\x01\0\x10\0\0\0\x42\0\0\0\x70\x02\0\0\x1f\x8c\x01\0\x18\0\0\0\
-\x42\0\0\0\x94\x02\0\0\x06\x98\x01\0\x20\0\0\0\x42\0\0\0\xad\x02\0\0\x0e\xa4\
-\x01\0\x28\0\0\0\x42\0\0\0\x2e\x01\0\0\x1d\x88\x01\0\x30\0\0\0\x42\0\0\0\x53\
-\x01\0\0\x06\xa8\x01\0\x40\0\0\0\x42\0\0\0\xbf\x02\0\0\x03\xac\x01\0\x80\0\0\0\
-\x42\0\0\0\x2f\x03\0\0\x02\xb4\x01\0\xb8\0\0\0\x42\0\0\0\x6a\x03\0\0\x06\x08\
-\x01\0\xd0\0\0\0\x42\0\0\0\0\0\0\0\0\0\0\0\xd8\0\0\0\x42\0\0\0\xbb\x03\0\0\x0f\
-\x14\x01\0\xe0\0\0\0\x42\0\0\0\xd0\x03\0\0\x2d\x18\x01\0\xf0\0\0\0\x42\0\0\0\
-\x07\x04\0\0\x0d\x10\x01\0\0\x01\0\0\x42\0\0\0\0\0\0\0\0\0\0\0\x08\x01\0\0\x42\
-\0\0\0\xd0\x03\0\0\x02\x18\x01\0\x20\x01\0\0\x42\0\0\0\x2e\x04\0\0\x0d\x1c\x01\
-\0\x38\x01\0\0\x42\0\0\0\0\0\0\0\0\0\0\0\x40\x01\0\0\x42\0\0\0\x2e\x04\0\0\x0d\
-\x1c\x01\0\x58\x01\0\0\x42\0\0\0\x2e\x04\0\0\x0d\x1c\x01\0\x60\x01\0\0\x42\0\0\
-\0\x5c\x04\0\0\x1b\x20\x01\0\x68\x01\0\0\x42\0\0\0\x5c\x04\0\0\x06\x20\x01\0\
-\x70\x01\0\0\x42\0\0\0\x7f\x04\0\0\x0d\x28\x01\0\x78\x01\0\0\x42\0\0\0\0\0\0\0\
-\0\0\0\0\x80\x01\0\0\x42\0\0\0\x2f\x03\0\0\x02\xb4\x01\0\xf8\x01\0\0\x42\0\0\0\
-\x3a\x02\0\0\x01\xc4\x01\0\x10\0\0\0\x31\0\0\0\x07\0\0\0\0\0\0\0\x02\0\0\0\x3e\
-\0\0\0\0\0\0\0\x08\0\0\0\x08\0\0\0\x3e\0\0\0\0\0\0\0\x10\0\0\0\x02\0\0\0\xfa\0\
-\0\0\0\0\0\0\x20\0\0\0\x08\0\0\0\x2a\x01\0\0\0\0\0\0\x70\0\0\0\x0d\0\0\0\x3e\0\
-\0\0\0\0\0\0\x80\0\0\0\x0d\0\0\0\xfa\0\0\0\0\0\0\0\xa0\0\0\0\x0d\0\0\0\x2a\x01\
-\0\0\0\0\0\0\x62\x02\0\0\x12\0\0\0\0\0\0\0\x14\0\0\0\x3e\0\0\0\0\0\0\0\x08\0\0\
-\0\x08\0\0\0\x3e\0\0\0\0\0\0\0\x10\0\0\0\x14\0\0\0\xfa\0\0\0\0\0\0\0\x20\0\0\0\
-\x18\0\0\0\x3e\0\0\0\0\0\0\0\x28\0\0\0\x08\0\0\0\x2a\x01\0\0\0\0\0\0\x80\0\0\0\
-\x1a\0\0\0\x3e\0\0\0\0\0\0\0\x90\0\0\0\x1a\0\0\0\xfa\0\0\0\0\0\0\0\xa8\0\0\0\
-\x1a\0\0\0\x62\x03\0\0\0\0\0\0\xb0\0\0\0\x1a\0\0\0\x66\x03\0\0\0\0\0\0\xc0\0\0\
-\0\x1f\0\0\0\x94\x03\0\0\0\0\0\0\xd8\0\0\0\x20\0\0\0\xfa\0\0\0\0\0\0\0\xf0\0\0\
-\0\x20\0\0\0\x3e\0\0\0\0\0\0\0\x18\x01\0\0\x24\0\0\0\x3e\0\0\0\0\0\0\0\x50\x01\
-\0\0\x1a\0\0\0\xfa\0\0\0\0\0\0\0\x60\x01\0\0\x20\0\0\0\x56\x04\0\0\0\0\0\0\x88\
-\x01\0\0\x1a\0\0\0\x2a\x01\0\0\0\0\0\0\x98\x01\0\0\x1a\0\0\0\x97\x04\0\0\0\0\0\
-\0\xa0\x01\0\0\x18\0\0\0\x3e\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\x91\0\0\0\x04\0\xf1\xff\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xe6\0\0\
-\0\0\0\x02\0\x70\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xd8\0\0\0\0\0\x02\0\xf0\0\0\0\0\
-\0\0\0\0\0\0\0\0\0\0\0\xdf\0\0\0\0\0\x03\0\x78\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
-\xd1\0\0\0\0\0\x03\0\x80\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xca\0\0\0\0\0\x03\0\
-\xf8\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x14\0\0\0\x01\0\x04\0\0\0\0\0\0\0\0\0\x23\
-\0\0\0\0\0\0\0\x04\x01\0\0\x01\0\x04\0\x23\0\0\0\0\0\0\0\x0e\0\0\0\0\0\0\0\x28\
-\0\0\0\x01\0\x04\0\x31\0\0\0\0\0\0\0\x20\0\0\0\0\0\0\0\xed\0\0\0\x01\0\x04\0\
-\x51\0\0\0\0\0\0\0\x11\0\0\0\0\0\0\0\0\0\0\0\x03\0\x02\0\0\0\0\0\0\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\0\0\x03\0\x03\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\
-\x04\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xc2\0\0\0\x11\0\x05\0\0\0\0\0\0\0\0\0\
-\x04\0\0\0\0\0\0\0\x3d\0\0\0\x12\0\x02\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\x5b\
-\0\0\0\x12\0\x03\0\0\0\0\0\0\0\0\0\x08\x02\0\0\0\0\0\0\x48\0\0\0\0\0\0\0\x01\0\
-\0\0\x0d\0\0\0\xc8\0\0\0\0\0\0\0\x01\0\0\0\x0d\0\0\0\x50\0\0\0\0\0\0\0\x01\0\0\
-\0\x0d\0\0\0\xd0\x01\0\0\0\0\0\0\x01\0\0\0\x0d\0\0\0\xf0\x03\0\0\0\0\0\0\x0a\0\
-\0\0\x0d\0\0\0\xfc\x03\0\0\0\0\0\0\x0a\0\0\0\x0d\0\0\0\x08\x04\0\0\0\0\0\0\x0a\
-\0\0\0\x0d\0\0\0\x14\x04\0\0\0\0\0\0\x0a\0\0\0\x0d\0\0\0\x2c\x04\0\0\0\0\0\0\0\
-\0\0\0\x0e\0\0\0\x2c\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x3c\0\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\x50\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x60\0\0\0\0\0\0\0\0\0\0\0\x0b\0\
-\0\0\x70\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x80\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\
-\x90\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xa0\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xb0\0\
-\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xc0\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xd0\0\0\0\0\
-\0\0\0\0\0\0\0\x0b\0\0\0\xe8\0\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xf8\0\0\0\0\0\0\0\
-\0\0\0\0\x0c\0\0\0\x08\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x18\x01\0\0\0\0\0\0\0\
-\0\0\0\x0c\0\0\0\x28\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x38\x01\0\0\0\0\0\0\0\0\
-\0\0\x0c\0\0\0\x48\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x58\x01\0\0\0\0\0\0\0\0\0\
-\0\x0c\0\0\0\x68\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x78\x01\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\x88\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x98\x01\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\xa8\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xb8\x01\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\xc8\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xd8\x01\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\xe8\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xf8\x01\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\x08\x02\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x18\x02\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\x28\x02\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x38\x02\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\x48\x02\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x58\x02\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\x68\x02\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x78\x02\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\x94\x02\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xa4\x02\0\0\0\0\0\0\0\0\0\0\
-\x0b\0\0\0\xb4\x02\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xc4\x02\0\0\0\0\0\0\0\0\0\0\
-\x0b\0\0\0\xd4\x02\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xe4\x02\0\0\0\0\0\0\0\0\0\0\
-\x0b\0\0\0\xf4\x02\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x0c\x03\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\x1c\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x2c\x03\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\x3c\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x4c\x03\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\x5c\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x6c\x03\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\x7c\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x8c\x03\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\x9c\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xac\x03\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\xbc\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xcc\x03\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\xdc\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xec\x03\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\xfc\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x0c\x04\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\x1c\x04\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x4d\x4e\x40\x41\x42\x43\x4c\0\
-\x2e\x74\x65\x78\x74\0\x2e\x72\x65\x6c\x2e\x42\x54\x46\x2e\x65\x78\x74\0\x64\
-\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\0\x64\
-\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\x74\0\
-\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\0\x2e\x72\x65\x6c\x69\x74\x65\
-\x72\x2f\x62\x70\x66\x5f\x6d\x61\x70\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\
-\x72\x6f\x67\0\x2e\x72\x65\x6c\x69\x74\x65\x72\x2f\x62\x70\x66\x5f\x70\x72\x6f\
-\x67\0\x2e\x6c\x6c\x76\x6d\x5f\x61\x64\x64\x72\x73\x69\x67\0\x6c\x69\x63\x65\
-\x6e\x73\x65\0\x69\x74\x65\x72\x61\x74\x6f\x72\x73\x2e\x62\x70\x66\x2e\x63\0\
-\x2e\x73\x74\x72\x74\x61\x62\0\x2e\x73\x79\x6d\x74\x61\x62\0\x2e\x72\x6f\x64\
-\x61\x74\x61\0\x2e\x72\x65\x6c\x2e\x42\x54\x46\0\x4c\x49\x43\x45\x4e\x53\x45\0\
-\x4c\x42\x42\x31\x5f\x37\0\x4c\x42\x42\x31\x5f\x36\0\x4c\x42\x42\x30\x5f\x34\0\
-\x4c\x42\x42\x31\x5f\x33\0\x4c\x42\x42\x30\x5f\x33\0\x64\x75\x6d\x70\x5f\x62\
-\x70\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x32\0\x64\x75\x6d\
-\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x31\0\0\0\
-\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\x01\0\0\
-\0\x06\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
-\0\0\0\0\x04\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x4e\0\0\0\x01\0\0\0\x06\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\0\0\x40\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x08\0\0\
-\0\0\0\0\0\0\0\0\0\0\0\0\0\x6d\0\0\0\x01\0\0\0\x06\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
-\0\x40\x01\0\0\0\0\0\0\x08\x02\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\xb1\0\0\0\x01\0\0\0\x02\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x48\x03\0\
-\0\0\0\0\0\x62\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
-\x89\0\0\0\x01\0\0\0\x03\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xaa\x03\0\0\0\0\0\0\x04\
-\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xbd\0\0\0\x01\
-\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xae\x03\0\0\0\0\0\0\x3d\x09\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x01\0\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\0\0\0\0\0\xeb\x0c\0\0\0\0\0\0\x2c\x04\0\0\0\0\0\0\0\0\0\0\0\0\0\
-\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xa9\0\0\0\x02\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
-\0\0\0\0\0\x18\x11\0\0\0\0\0\0\x98\x01\0\0\0\0\0\0\x0e\0\0\0\x0e\0\0\0\x08\0\0\
-\0\0\0\0\0\x18\0\0\0\0\0\0\0\x4a\0\0\0\x09\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
-\0\xb0\x12\0\0\0\0\0\0\x20\0\0\0\0\0\0\0\x08\0\0\0\x02\0\0\0\x08\0\0\0\0\0\0\0\
-\x10\0\0\0\0\0\0\0\x69\0\0\0\x09\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xd0\x12\
-\0\0\0\0\0\0\x20\0\0\0\0\0\0\0\x08\0\0\0\x03\0\0\0\x08\0\0\0\0\0\0\0\x10\0\0\0\
-\0\0\0\0\xb9\0\0\0\x09\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xf0\x12\0\0\0\0\0\
-\0\x50\0\0\0\0\0\0\0\x08\0\0\0\x06\0\0\0\x08\0\0\0\0\0\0\0\x10\0\0\0\0\0\0\0\
-\x07\0\0\0\x09\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x40\x13\0\0\0\0\0\0\xe0\
-\x03\0\0\0\0\0\0\x08\0\0\0\x07\0\0\0\x08\0\0\0\0\0\0\0\x10\0\0\0\0\0\0\0\x7b\0\
-\0\0\x03\x4c\xff\x6f\0\0\0\x80\0\0\0\0\0\0\0\0\0\0\0\0\x20\x17\0\0\0\0\0\0\x07\
-\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xa1\0\0\0\x03\
-\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x27\x17\0\0\0\0\0\0\x1a\x01\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0";
-
- return 0;
-err:
- bpf_object__destroy_skeleton(s);
- return -1;
-}
-
-#endif /* __ITERATORS_BPF_SKEL_H__ */
diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c
index f9c734aaa990..601609164ef3 100644
--- a/kernel/bpf/queue_stack_maps.c
+++ b/kernel/bpf/queue_stack_maps.c
@@ -8,6 +8,7 @@
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/capability.h>
+#include <linux/btf_ids.h>
#include "percpu_freelist.h"
#define QUEUE_STACK_CREATE_FLAG_MASK \
@@ -77,8 +78,6 @@ static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr)
if (!qs)
return ERR_PTR(-ENOMEM);
- memset(qs, 0, sizeof(*qs));
-
bpf_map_init_from_attr(&qs->map, attr);
qs->size = size;
@@ -96,7 +95,7 @@ static void queue_stack_map_free(struct bpf_map *map)
bpf_map_area_free(qs);
}
-static int __queue_map_get(struct bpf_map *map, void *value, bool delete)
+static long __queue_map_get(struct bpf_map *map, void *value, bool delete)
{
struct bpf_queue_stack *qs = bpf_queue_stack(map);
unsigned long flags;
@@ -125,7 +124,7 @@ out:
}
-static int __stack_map_get(struct bpf_map *map, void *value, bool delete)
+static long __stack_map_get(struct bpf_map *map, void *value, bool delete)
{
struct bpf_queue_stack *qs = bpf_queue_stack(map);
unsigned long flags;
@@ -157,32 +156,32 @@ out:
}
/* Called from syscall or from eBPF program */
-static int queue_map_peek_elem(struct bpf_map *map, void *value)
+static long queue_map_peek_elem(struct bpf_map *map, void *value)
{
return __queue_map_get(map, value, false);
}
/* Called from syscall or from eBPF program */
-static int stack_map_peek_elem(struct bpf_map *map, void *value)
+static long stack_map_peek_elem(struct bpf_map *map, void *value)
{
return __stack_map_get(map, value, false);
}
/* Called from syscall or from eBPF program */
-static int queue_map_pop_elem(struct bpf_map *map, void *value)
+static long queue_map_pop_elem(struct bpf_map *map, void *value)
{
return __queue_map_get(map, value, true);
}
/* Called from syscall or from eBPF program */
-static int stack_map_pop_elem(struct bpf_map *map, void *value)
+static long stack_map_pop_elem(struct bpf_map *map, void *value)
{
return __stack_map_get(map, value, true);
}
/* Called from syscall or from eBPF program */
-static int queue_stack_map_push_elem(struct bpf_map *map, void *value,
- u64 flags)
+static long queue_stack_map_push_elem(struct bpf_map *map, void *value,
+ u64 flags)
{
struct bpf_queue_stack *qs = bpf_queue_stack(map);
unsigned long irq_flags;
@@ -228,14 +227,14 @@ static void *queue_stack_map_lookup_elem(struct bpf_map *map, void *key)
}
/* Called from syscall or from eBPF program */
-static int queue_stack_map_update_elem(struct bpf_map *map, void *key,
- void *value, u64 flags)
+static long queue_stack_map_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 flags)
{
return -EINVAL;
}
/* Called from syscall or from eBPF program */
-static int queue_stack_map_delete_elem(struct bpf_map *map, void *key)
+static long queue_stack_map_delete_elem(struct bpf_map *map, void *key)
{
return -EINVAL;
}
@@ -247,7 +246,15 @@ static int queue_stack_map_get_next_key(struct bpf_map *map, void *key,
return -EINVAL;
}
-static int queue_map_btf_id;
+static u64 queue_stack_map_mem_usage(const struct bpf_map *map)
+{
+ u64 usage = sizeof(struct bpf_queue_stack);
+
+ usage += ((u64)map->max_entries + 1) * map->value_size;
+ return usage;
+}
+
+BTF_ID_LIST_SINGLE(queue_map_btf_ids, struct, bpf_queue_stack)
const struct bpf_map_ops queue_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
.map_alloc_check = queue_stack_map_alloc_check,
@@ -260,11 +267,10 @@ const struct bpf_map_ops queue_map_ops = {
.map_pop_elem = queue_map_pop_elem,
.map_peek_elem = queue_map_peek_elem,
.map_get_next_key = queue_stack_map_get_next_key,
- .map_btf_name = "bpf_queue_stack",
- .map_btf_id = &queue_map_btf_id,
+ .map_mem_usage = queue_stack_map_mem_usage,
+ .map_btf_id = &queue_map_btf_ids[0],
};
-static int stack_map_btf_id;
const struct bpf_map_ops stack_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
.map_alloc_check = queue_stack_map_alloc_check,
@@ -277,6 +283,6 @@ const struct bpf_map_ops stack_map_ops = {
.map_pop_elem = stack_map_pop_elem,
.map_peek_elem = stack_map_peek_elem,
.map_get_next_key = queue_stack_map_get_next_key,
- .map_btf_name = "bpf_queue_stack",
- .map_btf_id = &stack_map_btf_id,
+ .map_mem_usage = queue_stack_map_mem_usage,
+ .map_btf_id = &queue_map_btf_ids[0],
};
diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c
index 93a55391791a..cbf2d8d784b8 100644
--- a/kernel/bpf/reuseport_array.c
+++ b/kernel/bpf/reuseport_array.c
@@ -6,6 +6,7 @@
#include <linux/err.h>
#include <linux/sock_diag.h>
#include <net/sock_reuseport.h>
+#include <linux/btf_ids.h>
struct reuseport_array {
struct bpf_map map;
@@ -20,14 +21,11 @@ static struct reuseport_array *reuseport_array(struct bpf_map *map)
/* The caller must hold the reuseport_lock */
void bpf_sk_reuseport_detach(struct sock *sk)
{
- uintptr_t sk_user_data;
+ struct sock __rcu **socks;
write_lock_bh(&sk->sk_callback_lock);
- sk_user_data = (uintptr_t)sk->sk_user_data;
- if (sk_user_data & SK_USER_DATA_BPF) {
- struct sock __rcu **socks;
-
- socks = (void *)(sk_user_data & SK_USER_DATA_PTRMASK);
+ socks = __locked_read_sk_user_data_with_flags(sk, SK_USER_DATA_BPF);
+ if (socks) {
WRITE_ONCE(sk->sk_user_data, NULL);
/*
* Do not move this NULL assignment outside of
@@ -61,7 +59,7 @@ static void *reuseport_array_lookup_elem(struct bpf_map *map, void *key)
}
/* Called from syscall only */
-static int reuseport_array_delete_elem(struct bpf_map *map, void *key)
+static long reuseport_array_delete_elem(struct bpf_map *map, void *key)
{
struct reuseport_array *array = reuseport_array(map);
u32 index = *(u32 *)key;
@@ -143,7 +141,7 @@ static void reuseport_array_free(struct bpf_map *map)
/*
* Once reaching here, all sk->sk_user_data is not
- * referenceing this "array". "array" can be freed now.
+ * referencing this "array". "array" can be freed now.
*/
bpf_map_area_free(array);
}
@@ -152,16 +150,12 @@ static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr)
{
int numa_node = bpf_map_attr_numa_node(attr);
struct reuseport_array *array;
- u64 array_size;
if (!bpf_capable())
return ERR_PTR(-EPERM);
- array_size = sizeof(*array);
- array_size += (u64)attr->max_entries * sizeof(struct sock *);
-
/* allocate all map elements and zero-initialize them */
- array = bpf_map_area_alloc(array_size, numa_node);
+ array = bpf_map_area_alloc(struct_size(array, ptrs, attr->max_entries), numa_node);
if (!array)
return ERR_PTR(-ENOMEM);
@@ -341,7 +335,14 @@ static int reuseport_array_get_next_key(struct bpf_map *map, void *key,
return 0;
}
-static int reuseport_array_map_btf_id;
+static u64 reuseport_array_mem_usage(const struct bpf_map *map)
+{
+ struct reuseport_array *array;
+
+ return struct_size(array, ptrs, map->max_entries);
+}
+
+BTF_ID_LIST_SINGLE(reuseport_array_map_btf_ids, struct, reuseport_array)
const struct bpf_map_ops reuseport_array_ops = {
.map_meta_equal = bpf_map_meta_equal,
.map_alloc_check = reuseport_array_alloc_check,
@@ -350,6 +351,6 @@ const struct bpf_map_ops reuseport_array_ops = {
.map_lookup_elem = reuseport_array_lookup_elem,
.map_get_next_key = reuseport_array_get_next_key,
.map_delete_elem = reuseport_array_delete_elem,
- .map_btf_name = "reuseport_array",
- .map_btf_id = &reuseport_array_map_btf_id,
+ .map_mem_usage = reuseport_array_mem_usage,
+ .map_btf_id = &reuseport_array_map_btf_ids[0],
};
diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c
index 9e0c10c6892a..875ac9b698d9 100644
--- a/kernel/bpf/ringbuf.c
+++ b/kernel/bpf/ringbuf.c
@@ -10,6 +10,7 @@
#include <linux/poll.h>
#include <linux/kmemleak.h>
#include <uapi/linux/btf.h>
+#include <linux/btf_ids.h>
#define RINGBUF_CREATE_FLAG_MASK (BPF_F_NUMA_NODE)
@@ -18,6 +19,7 @@
(offsetof(struct bpf_ringbuf, consumer_pos) >> PAGE_SHIFT)
/* consumer page and producer page */
#define RINGBUF_POS_PAGES 2
+#define RINGBUF_NR_META_PAGES (RINGBUF_PGOFF + RINGBUF_POS_PAGES)
#define RINGBUF_MAX_RECORD_SZ (UINT_MAX/4)
@@ -37,10 +39,43 @@ struct bpf_ringbuf {
struct page **pages;
int nr_pages;
spinlock_t spinlock ____cacheline_aligned_in_smp;
- /* Consumer and producer counters are put into separate pages to allow
- * mapping consumer page as r/w, but restrict producer page to r/o.
- * This protects producer position from being modified by user-space
- * application and ruining in-kernel position tracking.
+ /* For user-space producer ring buffers, an atomic_t busy bit is used
+ * to synchronize access to the ring buffers in the kernel, rather than
+ * the spinlock that is used for kernel-producer ring buffers. This is
+ * done because the ring buffer must hold a lock across a BPF program's
+ * callback:
+ *
+ * __bpf_user_ringbuf_peek() // lock acquired
+ * -> program callback_fn()
+ * -> __bpf_user_ringbuf_sample_release() // lock released
+ *
+ * It is unsafe and incorrect to hold an IRQ spinlock across what could
+ * be a long execution window, so we instead simply disallow concurrent
+ * access to the ring buffer by kernel consumers, and return -EBUSY from
+ * __bpf_user_ringbuf_peek() if the busy bit is held by another task.
+ */
+ atomic_t busy ____cacheline_aligned_in_smp;
+ /* Consumer and producer counters are put into separate pages to
+ * allow each position to be mapped with different permissions.
+ * This prevents a user-space application from modifying the
+ * position and ruining in-kernel tracking. The permissions of the
+ * pages depend on who is producing samples: user-space or the
+ * kernel.
+ *
+ * Kernel-producer
+ * ---------------
+ * The producer position and data pages are mapped as r/o in
+ * userspace. For this approach, bits in the header of samples are
+ * used to signal to user-space, and to other producers, whether a
+ * sample is currently being written.
+ *
+ * User-space producer
+ * -------------------
+ * Only the page containing the consumer position is mapped r/o in
+ * user-space. User-space producers also use bits of the header to
+ * communicate to the kernel, but the kernel must carefully check and
+ * validate each sample to ensure that they're correctly formatted, and
+ * fully contained within the ring buffer.
*/
unsigned long consumer_pos __aligned(PAGE_SIZE);
unsigned long producer_pos __aligned(PAGE_SIZE);
@@ -62,7 +97,7 @@ static struct bpf_ringbuf *bpf_ringbuf_area_alloc(size_t data_sz, int numa_node)
{
const gfp_t flags = GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL |
__GFP_NOWARN | __GFP_ZERO;
- int nr_meta_pages = RINGBUF_PGOFF + RINGBUF_POS_PAGES;
+ int nr_meta_pages = RINGBUF_NR_META_PAGES;
int nr_data_pages = data_sz >> PAGE_SHIFT;
int nr_pages = nr_meta_pages + nr_data_pages;
struct page **pages, *page;
@@ -104,7 +139,7 @@ static struct bpf_ringbuf *bpf_ringbuf_area_alloc(size_t data_sz, int numa_node)
}
rb = vmap(pages, nr_meta_pages + 2 * nr_data_pages,
- VM_ALLOC | VM_USERMAP, PAGE_KERNEL);
+ VM_MAP | VM_USERMAP, PAGE_KERNEL);
if (rb) {
kmemleak_not_leak(pages);
rb->pages = pages;
@@ -115,7 +150,7 @@ static struct bpf_ringbuf *bpf_ringbuf_area_alloc(size_t data_sz, int numa_node)
err_free_pages:
for (i = 0; i < nr_pages; i++)
__free_page(pages[i]);
- kvfree(pages);
+ bpf_map_area_free(pages);
return NULL;
}
@@ -135,6 +170,7 @@ static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node)
return NULL;
spin_lock_init(&rb->spinlock);
+ atomic_set(&rb->busy, 0);
init_waitqueue_head(&rb->waitq);
init_irq_work(&rb->work, bpf_ringbuf_notify);
@@ -163,7 +199,7 @@ static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr)
return ERR_PTR(-E2BIG);
#endif
- rb_map = kzalloc(sizeof(*rb_map), GFP_USER | __GFP_ACCOUNT);
+ rb_map = bpf_map_area_alloc(sizeof(*rb_map), NUMA_NO_NODE);
if (!rb_map)
return ERR_PTR(-ENOMEM);
@@ -171,7 +207,7 @@ static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr)
rb_map->rb = bpf_ringbuf_alloc(attr->max_entries, rb_map->map.numa_node);
if (!rb_map->rb) {
- kfree(rb_map);
+ bpf_map_area_free(rb_map);
return ERR_PTR(-ENOMEM);
}
@@ -189,7 +225,7 @@ static void bpf_ringbuf_free(struct bpf_ringbuf *rb)
vunmap(rb);
for (i = 0; i < nr_pages; i++)
__free_page(pages[i]);
- kvfree(pages);
+ bpf_map_area_free(pages);
}
static void ringbuf_map_free(struct bpf_map *map)
@@ -198,7 +234,7 @@ static void ringbuf_map_free(struct bpf_map *map)
rb_map = container_of(map, struct bpf_ringbuf_map, map);
bpf_ringbuf_free(rb_map->rb);
- kfree(rb_map);
+ bpf_map_area_free(rb_map);
}
static void *ringbuf_map_lookup_elem(struct bpf_map *map, void *key)
@@ -206,13 +242,13 @@ static void *ringbuf_map_lookup_elem(struct bpf_map *map, void *key)
return ERR_PTR(-ENOTSUPP);
}
-static int ringbuf_map_update_elem(struct bpf_map *map, void *key, void *value,
- u64 flags)
+static long ringbuf_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 flags)
{
return -ENOTSUPP;
}
-static int ringbuf_map_delete_elem(struct bpf_map *map, void *key)
+static long ringbuf_map_delete_elem(struct bpf_map *map, void *key)
{
return -ENOTSUPP;
}
@@ -223,7 +259,7 @@ static int ringbuf_map_get_next_key(struct bpf_map *map, void *key,
return -ENOTSUPP;
}
-static int ringbuf_map_mmap(struct bpf_map *map, struct vm_area_struct *vma)
+static int ringbuf_map_mmap_kern(struct bpf_map *map, struct vm_area_struct *vma)
{
struct bpf_ringbuf_map *rb_map;
@@ -234,13 +270,33 @@ static int ringbuf_map_mmap(struct bpf_map *map, struct vm_area_struct *vma)
if (vma->vm_pgoff != 0 || vma->vm_end - vma->vm_start != PAGE_SIZE)
return -EPERM;
} else {
- vma->vm_flags &= ~VM_MAYWRITE;
+ vm_flags_clear(vma, VM_MAYWRITE);
}
/* remap_vmalloc_range() checks size and offset constraints */
return remap_vmalloc_range(vma, rb_map->rb,
vma->vm_pgoff + RINGBUF_PGOFF);
}
+static int ringbuf_map_mmap_user(struct bpf_map *map, struct vm_area_struct *vma)
+{
+ struct bpf_ringbuf_map *rb_map;
+
+ rb_map = container_of(map, struct bpf_ringbuf_map, map);
+
+ if (vma->vm_flags & VM_WRITE) {
+ if (vma->vm_pgoff == 0)
+ /* Disallow writable mappings to the consumer pointer,
+ * and allow writable mappings to both the producer
+ * position, and the ring buffer data itself.
+ */
+ return -EPERM;
+ } else {
+ vm_flags_clear(vma, VM_MAYWRITE);
+ }
+ /* remap_vmalloc_range() checks size and offset constraints */
+ return remap_vmalloc_range(vma, rb_map->rb, vma->vm_pgoff + RINGBUF_PGOFF);
+}
+
static unsigned long ringbuf_avail_data_sz(struct bpf_ringbuf *rb)
{
unsigned long cons_pos, prod_pos;
@@ -250,8 +306,13 @@ static unsigned long ringbuf_avail_data_sz(struct bpf_ringbuf *rb)
return prod_pos - cons_pos;
}
-static __poll_t ringbuf_map_poll(struct bpf_map *map, struct file *filp,
- struct poll_table_struct *pts)
+static u32 ringbuf_total_data_sz(const struct bpf_ringbuf *rb)
+{
+ return rb->mask + 1;
+}
+
+static __poll_t ringbuf_map_poll_kern(struct bpf_map *map, struct file *filp,
+ struct poll_table_struct *pts)
{
struct bpf_ringbuf_map *rb_map;
@@ -263,19 +324,62 @@ static __poll_t ringbuf_map_poll(struct bpf_map *map, struct file *filp,
return 0;
}
-static int ringbuf_map_btf_id;
+static __poll_t ringbuf_map_poll_user(struct bpf_map *map, struct file *filp,
+ struct poll_table_struct *pts)
+{
+ struct bpf_ringbuf_map *rb_map;
+
+ rb_map = container_of(map, struct bpf_ringbuf_map, map);
+ poll_wait(filp, &rb_map->rb->waitq, pts);
+
+ if (ringbuf_avail_data_sz(rb_map->rb) < ringbuf_total_data_sz(rb_map->rb))
+ return EPOLLOUT | EPOLLWRNORM;
+ return 0;
+}
+
+static u64 ringbuf_map_mem_usage(const struct bpf_map *map)
+{
+ struct bpf_ringbuf *rb;
+ int nr_data_pages;
+ int nr_meta_pages;
+ u64 usage = sizeof(struct bpf_ringbuf_map);
+
+ rb = container_of(map, struct bpf_ringbuf_map, map)->rb;
+ usage += (u64)rb->nr_pages << PAGE_SHIFT;
+ nr_meta_pages = RINGBUF_NR_META_PAGES;
+ nr_data_pages = map->max_entries >> PAGE_SHIFT;
+ usage += (nr_meta_pages + 2 * nr_data_pages) * sizeof(struct page *);
+ return usage;
+}
+
+BTF_ID_LIST_SINGLE(ringbuf_map_btf_ids, struct, bpf_ringbuf_map)
const struct bpf_map_ops ringbuf_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
.map_alloc = ringbuf_map_alloc,
.map_free = ringbuf_map_free,
- .map_mmap = ringbuf_map_mmap,
- .map_poll = ringbuf_map_poll,
+ .map_mmap = ringbuf_map_mmap_kern,
+ .map_poll = ringbuf_map_poll_kern,
.map_lookup_elem = ringbuf_map_lookup_elem,
.map_update_elem = ringbuf_map_update_elem,
.map_delete_elem = ringbuf_map_delete_elem,
.map_get_next_key = ringbuf_map_get_next_key,
- .map_btf_name = "bpf_ringbuf_map",
- .map_btf_id = &ringbuf_map_btf_id,
+ .map_mem_usage = ringbuf_map_mem_usage,
+ .map_btf_id = &ringbuf_map_btf_ids[0],
+};
+
+BTF_ID_LIST_SINGLE(user_ringbuf_map_btf_ids, struct, bpf_ringbuf_map)
+const struct bpf_map_ops user_ringbuf_map_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
+ .map_alloc = ringbuf_map_alloc,
+ .map_free = ringbuf_map_free,
+ .map_mmap = ringbuf_map_mmap_user,
+ .map_poll = ringbuf_map_poll_user,
+ .map_lookup_elem = ringbuf_map_lookup_elem,
+ .map_update_elem = ringbuf_map_update_elem,
+ .map_delete_elem = ringbuf_map_delete_elem,
+ .map_get_next_key = ringbuf_map_get_next_key,
+ .map_mem_usage = ringbuf_map_mem_usage,
+ .map_btf_id = &user_ringbuf_map_btf_ids[0],
};
/* Given pointer to ring buffer record metadata and struct bpf_ringbuf itself,
@@ -312,7 +416,7 @@ static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size)
return NULL;
len = round_up(size + BPF_RINGBUF_HDR_SZ, 8);
- if (len > rb->mask + 1)
+ if (len > ringbuf_total_data_sz(rb))
return NULL;
cons_pos = smp_load_acquire(&rb->consumer_pos);
@@ -361,7 +465,7 @@ BPF_CALL_3(bpf_ringbuf_reserve, struct bpf_map *, map, u64, size, u64, flags)
const struct bpf_func_proto bpf_ringbuf_reserve_proto = {
.func = bpf_ringbuf_reserve,
- .ret_type = RET_PTR_TO_ALLOC_MEM_OR_NULL,
+ .ret_type = RET_PTR_TO_RINGBUF_MEM_OR_NULL,
.arg1_type = ARG_CONST_MAP_PTR,
.arg2_type = ARG_CONST_ALLOC_SIZE_OR_ZERO,
.arg3_type = ARG_ANYTHING,
@@ -404,7 +508,7 @@ BPF_CALL_2(bpf_ringbuf_submit, void *, sample, u64, flags)
const struct bpf_func_proto bpf_ringbuf_submit_proto = {
.func = bpf_ringbuf_submit,
.ret_type = RET_VOID,
- .arg1_type = ARG_PTR_TO_ALLOC_MEM,
+ .arg1_type = ARG_PTR_TO_RINGBUF_MEM | OBJ_RELEASE,
.arg2_type = ARG_ANYTHING,
};
@@ -417,7 +521,7 @@ BPF_CALL_2(bpf_ringbuf_discard, void *, sample, u64, flags)
const struct bpf_func_proto bpf_ringbuf_discard_proto = {
.func = bpf_ringbuf_discard,
.ret_type = RET_VOID,
- .arg1_type = ARG_PTR_TO_ALLOC_MEM,
+ .arg1_type = ARG_PTR_TO_RINGBUF_MEM | OBJ_RELEASE,
.arg2_type = ARG_ANYTHING,
};
@@ -444,7 +548,7 @@ const struct bpf_func_proto bpf_ringbuf_output_proto = {
.func = bpf_ringbuf_output,
.ret_type = RET_INTEGER,
.arg1_type = ARG_CONST_MAP_PTR,
- .arg2_type = ARG_PTR_TO_MEM,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg3_type = ARG_CONST_SIZE_OR_ZERO,
.arg4_type = ARG_ANYTHING,
};
@@ -459,7 +563,7 @@ BPF_CALL_2(bpf_ringbuf_query, struct bpf_map *, map, u64, flags)
case BPF_RB_AVAIL_DATA:
return ringbuf_avail_data_sz(rb);
case BPF_RB_RING_SIZE:
- return rb->mask + 1;
+ return ringbuf_total_data_sz(rb);
case BPF_RB_CONS_POS:
return smp_load_acquire(&rb->consumer_pos);
case BPF_RB_PROD_POS:
@@ -475,3 +579,216 @@ const struct bpf_func_proto bpf_ringbuf_query_proto = {
.arg1_type = ARG_CONST_MAP_PTR,
.arg2_type = ARG_ANYTHING,
};
+
+BPF_CALL_4(bpf_ringbuf_reserve_dynptr, struct bpf_map *, map, u32, size, u64, flags,
+ struct bpf_dynptr_kern *, ptr)
+{
+ struct bpf_ringbuf_map *rb_map;
+ void *sample;
+ int err;
+
+ if (unlikely(flags)) {
+ bpf_dynptr_set_null(ptr);
+ return -EINVAL;
+ }
+
+ err = bpf_dynptr_check_size(size);
+ if (err) {
+ bpf_dynptr_set_null(ptr);
+ return err;
+ }
+
+ rb_map = container_of(map, struct bpf_ringbuf_map, map);
+
+ sample = __bpf_ringbuf_reserve(rb_map->rb, size);
+ if (!sample) {
+ bpf_dynptr_set_null(ptr);
+ return -EINVAL;
+ }
+
+ bpf_dynptr_init(ptr, sample, BPF_DYNPTR_TYPE_RINGBUF, 0, size);
+
+ return 0;
+}
+
+const struct bpf_func_proto bpf_ringbuf_reserve_dynptr_proto = {
+ .func = bpf_ringbuf_reserve_dynptr,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | MEM_UNINIT,
+};
+
+BPF_CALL_2(bpf_ringbuf_submit_dynptr, struct bpf_dynptr_kern *, ptr, u64, flags)
+{
+ if (!ptr->data)
+ return 0;
+
+ bpf_ringbuf_commit(ptr->data, flags, false /* discard */);
+
+ bpf_dynptr_set_null(ptr);
+
+ return 0;
+}
+
+const struct bpf_func_proto bpf_ringbuf_submit_dynptr_proto = {
+ .func = bpf_ringbuf_submit_dynptr,
+ .ret_type = RET_VOID,
+ .arg1_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | OBJ_RELEASE,
+ .arg2_type = ARG_ANYTHING,
+};
+
+BPF_CALL_2(bpf_ringbuf_discard_dynptr, struct bpf_dynptr_kern *, ptr, u64, flags)
+{
+ if (!ptr->data)
+ return 0;
+
+ bpf_ringbuf_commit(ptr->data, flags, true /* discard */);
+
+ bpf_dynptr_set_null(ptr);
+
+ return 0;
+}
+
+const struct bpf_func_proto bpf_ringbuf_discard_dynptr_proto = {
+ .func = bpf_ringbuf_discard_dynptr,
+ .ret_type = RET_VOID,
+ .arg1_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | OBJ_RELEASE,
+ .arg2_type = ARG_ANYTHING,
+};
+
+static int __bpf_user_ringbuf_peek(struct bpf_ringbuf *rb, void **sample, u32 *size)
+{
+ int err;
+ u32 hdr_len, sample_len, total_len, flags, *hdr;
+ u64 cons_pos, prod_pos;
+
+ /* Synchronizes with smp_store_release() in user-space producer. */
+ prod_pos = smp_load_acquire(&rb->producer_pos);
+ if (prod_pos % 8)
+ return -EINVAL;
+
+ /* Synchronizes with smp_store_release() in __bpf_user_ringbuf_sample_release() */
+ cons_pos = smp_load_acquire(&rb->consumer_pos);
+ if (cons_pos >= prod_pos)
+ return -ENODATA;
+
+ hdr = (u32 *)((uintptr_t)rb->data + (uintptr_t)(cons_pos & rb->mask));
+ /* Synchronizes with smp_store_release() in user-space producer. */
+ hdr_len = smp_load_acquire(hdr);
+ flags = hdr_len & (BPF_RINGBUF_BUSY_BIT | BPF_RINGBUF_DISCARD_BIT);
+ sample_len = hdr_len & ~flags;
+ total_len = round_up(sample_len + BPF_RINGBUF_HDR_SZ, 8);
+
+ /* The sample must fit within the region advertised by the producer position. */
+ if (total_len > prod_pos - cons_pos)
+ return -EINVAL;
+
+ /* The sample must fit within the data region of the ring buffer. */
+ if (total_len > ringbuf_total_data_sz(rb))
+ return -E2BIG;
+
+ /* The sample must fit into a struct bpf_dynptr. */
+ err = bpf_dynptr_check_size(sample_len);
+ if (err)
+ return -E2BIG;
+
+ if (flags & BPF_RINGBUF_DISCARD_BIT) {
+ /* If the discard bit is set, the sample should be skipped.
+ *
+ * Update the consumer pos, and return -EAGAIN so the caller
+ * knows to skip this sample and try to read the next one.
+ */
+ smp_store_release(&rb->consumer_pos, cons_pos + total_len);
+ return -EAGAIN;
+ }
+
+ if (flags & BPF_RINGBUF_BUSY_BIT)
+ return -ENODATA;
+
+ *sample = (void *)((uintptr_t)rb->data +
+ (uintptr_t)((cons_pos + BPF_RINGBUF_HDR_SZ) & rb->mask));
+ *size = sample_len;
+ return 0;
+}
+
+static void __bpf_user_ringbuf_sample_release(struct bpf_ringbuf *rb, size_t size, u64 flags)
+{
+ u64 consumer_pos;
+ u32 rounded_size = round_up(size + BPF_RINGBUF_HDR_SZ, 8);
+
+ /* Using smp_load_acquire() is unnecessary here, as the busy-bit
+ * prevents another task from writing to consumer_pos after it was read
+ * by this task with smp_load_acquire() in __bpf_user_ringbuf_peek().
+ */
+ consumer_pos = rb->consumer_pos;
+ /* Synchronizes with smp_load_acquire() in user-space producer. */
+ smp_store_release(&rb->consumer_pos, consumer_pos + rounded_size);
+}
+
+BPF_CALL_4(bpf_user_ringbuf_drain, struct bpf_map *, map,
+ void *, callback_fn, void *, callback_ctx, u64, flags)
+{
+ struct bpf_ringbuf *rb;
+ long samples, discarded_samples = 0, ret = 0;
+ bpf_callback_t callback = (bpf_callback_t)callback_fn;
+ u64 wakeup_flags = BPF_RB_NO_WAKEUP | BPF_RB_FORCE_WAKEUP;
+ int busy = 0;
+
+ if (unlikely(flags & ~wakeup_flags))
+ return -EINVAL;
+
+ rb = container_of(map, struct bpf_ringbuf_map, map)->rb;
+
+ /* If another consumer is already consuming a sample, wait for them to finish. */
+ if (!atomic_try_cmpxchg(&rb->busy, &busy, 1))
+ return -EBUSY;
+
+ for (samples = 0; samples < BPF_MAX_USER_RINGBUF_SAMPLES && ret == 0; samples++) {
+ int err;
+ u32 size;
+ void *sample;
+ struct bpf_dynptr_kern dynptr;
+
+ err = __bpf_user_ringbuf_peek(rb, &sample, &size);
+ if (err) {
+ if (err == -ENODATA) {
+ break;
+ } else if (err == -EAGAIN) {
+ discarded_samples++;
+ continue;
+ } else {
+ ret = err;
+ goto schedule_work_return;
+ }
+ }
+
+ bpf_dynptr_init(&dynptr, sample, BPF_DYNPTR_TYPE_LOCAL, 0, size);
+ ret = callback((uintptr_t)&dynptr, (uintptr_t)callback_ctx, 0, 0, 0);
+ __bpf_user_ringbuf_sample_release(rb, size, flags);
+ }
+ ret = samples - discarded_samples;
+
+schedule_work_return:
+ /* Prevent the clearing of the busy-bit from being reordered before the
+ * storing of any rb consumer or producer positions.
+ */
+ smp_mb__before_atomic();
+ atomic_set(&rb->busy, 0);
+
+ if (flags & BPF_RB_FORCE_WAKEUP)
+ irq_work_queue(&rb->work);
+ else if (!(flags & BPF_RB_NO_WAKEUP) && samples > 0)
+ irq_work_queue(&rb->work);
+ return ret;
+}
+
+const struct bpf_func_proto bpf_user_ringbuf_drain_proto = {
+ .func = bpf_user_ringbuf_drain,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_FUNC,
+ .arg3_type = ARG_PTR_TO_STACK_OR_NULL,
+ .arg4_type = ARG_ANYTHING,
+};
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 6fbc2abe9c91..b25fce425b2c 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -7,10 +7,10 @@
#include <linux/kernel.h>
#include <linux/stacktrace.h>
#include <linux/perf_event.h>
-#include <linux/irq_work.h>
#include <linux/btf_ids.h>
#include <linux/buildid.h>
#include "percpu_freelist.h"
+#include "mmap_unlock_work.h"
#define STACK_CREATE_FLAG_MASK \
(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY | \
@@ -31,25 +31,6 @@ struct bpf_stack_map {
struct stack_map_bucket *buckets[];
};
-/* irq_work to run up_read() for build_id lookup in nmi context */
-struct stack_map_irq_work {
- struct irq_work irq_work;
- struct mm_struct *mm;
-};
-
-static void do_up_read(struct irq_work *entry)
-{
- struct stack_map_irq_work *work;
-
- if (WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_RT)))
- return;
-
- work = container_of(entry, struct stack_map_irq_work, irq_work);
- mmap_read_unlock_non_owner(work->mm);
-}
-
-static DEFINE_PER_CPU(struct stack_map_irq_work, up_read_work);
-
static inline bool stack_map_use_build_id(struct bpf_map *map)
{
return (map->map_flags & BPF_F_STACK_BUILD_ID);
@@ -63,7 +44,8 @@ static inline int stack_map_data_size(struct bpf_map *map)
static int prealloc_elems_and_freelist(struct bpf_stack_map *smap)
{
- u32 elem_size = sizeof(struct stack_map_bucket) + smap->map.value_size;
+ u64 elem_size = sizeof(struct stack_map_bucket) +
+ (u64)smap->map.value_size;
int err;
smap->elems = bpf_map_area_alloc(elem_size * smap->map.max_entries,
@@ -118,13 +100,11 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
return ERR_PTR(-E2BIG);
cost = n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap);
- cost += n_buckets * (value_size + sizeof(struct stack_map_bucket));
smap = bpf_map_area_alloc(cost, bpf_map_attr_numa_node(attr));
if (!smap)
return ERR_PTR(-ENOMEM);
bpf_map_init_from_attr(&smap->map, attr);
- smap->map.value_size = value_size;
smap->n_buckets = n_buckets;
err = get_callchain_buffers(sysctl_perf_event_max_stack);
@@ -148,38 +128,17 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
u64 *ips, u32 trace_nr, bool user)
{
int i;
- struct vm_area_struct *vma;
- bool irq_work_busy = false;
- struct stack_map_irq_work *work = NULL;
-
- if (irqs_disabled()) {
- if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
- work = this_cpu_ptr(&up_read_work);
- if (irq_work_is_busy(&work->irq_work)) {
- /* cannot queue more up_read, fallback */
- irq_work_busy = true;
- }
- } else {
- /*
- * PREEMPT_RT does not allow to trylock mmap sem in
- * interrupt disabled context. Force the fallback code.
- */
- irq_work_busy = true;
- }
- }
-
- /*
- * We cannot do up_read() when the irq is disabled, because of
- * risk to deadlock with rq_lock. To do build_id lookup when the
- * irqs are disabled, we need to run up_read() in irq_work. We use
- * a percpu variable to do the irq_work. If the irq_work is
- * already used by another lookup, we fall back to report ips.
- *
- * Same fallback is used for kernel stack (!user) on a stackmap
- * with build_id.
+ struct mmap_unlock_irq_work *work = NULL;
+ bool irq_work_busy = bpf_mmap_unlock_get_irq_work(&work);
+ struct vm_area_struct *vma, *prev_vma = NULL;
+ const char *prev_build_id;
+
+ /* If the irq_work is in use, fall back to report ips. Same
+ * fallback is used for kernel stack (!user) on a stackmap with
+ * build_id.
*/
if (!user || !current || !current->mm || irq_work_busy ||
- !mmap_read_trylock_non_owner(current->mm)) {
+ !mmap_read_trylock(current->mm)) {
/* cannot access current->mm, fall back to ips */
for (i = 0; i < trace_nr; i++) {
id_offs[i].status = BPF_STACK_BUILD_ID_IP;
@@ -190,6 +149,12 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
}
for (i = 0; i < trace_nr; i++) {
+ if (range_in_vma(prev_vma, ips[i], ips[i])) {
+ vma = prev_vma;
+ memcpy(id_offs[i].build_id, prev_build_id,
+ BUILD_ID_SIZE_MAX);
+ goto build_id_valid;
+ }
vma = find_vma(current->mm, ips[i]);
if (!vma || build_id_parse(vma, id_offs[i].build_id, NULL)) {
/* per entry fall back to ips */
@@ -198,21 +163,18 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
memset(id_offs[i].build_id, 0, BUILD_ID_SIZE_MAX);
continue;
}
+build_id_valid:
id_offs[i].offset = (vma->vm_pgoff << PAGE_SHIFT) + ips[i]
- vma->vm_start;
id_offs[i].status = BPF_STACK_BUILD_ID_VALID;
+ prev_vma = vma;
+ prev_build_id = id_offs[i].build_id;
}
-
- if (!work) {
- mmap_read_unlock_non_owner(current->mm);
- } else {
- work->mm = current->mm;
- irq_work_queue(&work->irq_work);
- }
+ bpf_mmap_unlock_mm(work, current->mm);
}
static struct perf_callchain_entry *
-get_callchain_entry_for_task(struct task_struct *task, u32 init_nr)
+get_callchain_entry_for_task(struct task_struct *task, u32 max_depth)
{
#ifdef CONFIG_STACKTRACE
struct perf_callchain_entry *entry;
@@ -223,9 +185,8 @@ get_callchain_entry_for_task(struct task_struct *task, u32 init_nr)
if (!entry)
return NULL;
- entry->nr = init_nr +
- stack_trace_save_tsk(task, (unsigned long *)(entry->ip + init_nr),
- sysctl_perf_event_max_stack - init_nr, 0);
+ entry->nr = stack_trace_save_tsk(task, (unsigned long *)entry->ip,
+ max_depth, 0);
/* stack_trace_save_tsk() works on unsigned long array, while
* perf_callchain_entry uses u64 array. For 32-bit systems, it is
@@ -237,7 +198,7 @@ get_callchain_entry_for_task(struct task_struct *task, u32 init_nr)
int i;
/* copy data from the end to avoid using extra buffer */
- for (i = entry->nr - 1; i >= (int)init_nr; i--)
+ for (i = entry->nr - 1; i >= 0; i--)
to[i] = (u64)(from[i]);
}
@@ -254,27 +215,19 @@ static long __bpf_get_stackid(struct bpf_map *map,
{
struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
struct stack_map_bucket *bucket, *new_bucket, *old_bucket;
- u32 max_depth = map->value_size / stack_map_data_size(map);
- /* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */
- u32 init_nr = sysctl_perf_event_max_stack - max_depth;
u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
u32 hash, id, trace_nr, trace_len;
bool user = flags & BPF_F_USER_STACK;
u64 *ips;
bool hash_matches;
- /* get_perf_callchain() guarantees that trace->nr >= init_nr
- * and trace-nr <= sysctl_perf_event_max_stack, so trace_nr <= max_depth
- */
- trace_nr = trace->nr - init_nr;
-
- if (trace_nr <= skip)
+ if (trace->nr <= skip)
/* skipping more than usable stack trace */
return -EFAULT;
- trace_nr -= skip;
+ trace_nr = trace->nr - skip;
trace_len = trace_nr * sizeof(u64);
- ips = trace->ip + skip + init_nr;
+ ips = trace->ip + skip;
hash = jhash2((u32 *)ips, trace_len / sizeof(u32), 0);
id = hash & (smap->n_buckets - 1);
bucket = READ_ONCE(smap->buckets[id]);
@@ -331,8 +284,7 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
u64, flags)
{
u32 max_depth = map->value_size / stack_map_data_size(map);
- /* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */
- u32 init_nr = sysctl_perf_event_max_stack - max_depth;
+ u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
bool user = flags & BPF_F_USER_STACK;
struct perf_callchain_entry *trace;
bool kernel = !user;
@@ -341,8 +293,12 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
return -EINVAL;
- trace = get_perf_callchain(regs, init_nr, kernel, user,
- sysctl_perf_event_max_stack, false, false);
+ max_depth += skip;
+ if (max_depth > sysctl_perf_event_max_stack)
+ max_depth = sysctl_perf_event_max_stack;
+
+ trace = get_perf_callchain(regs, 0, kernel, user, max_depth,
+ false, false);
if (unlikely(!trace))
/* couldn't fetch the stack trace */
@@ -382,7 +338,7 @@ BPF_CALL_3(bpf_get_stackid_pe, struct bpf_perf_event_data_kern *, ctx,
int ret;
/* perf_sample_data doesn't have callchain, use bpf_get_stackid */
- if (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY))
+ if (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN))
return bpf_get_stackid((unsigned long)(ctx->regs),
(unsigned long) map, flags, 0, 0);
@@ -433,7 +389,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
struct perf_callchain_entry *trace_in,
void *buf, u32 size, u64 flags)
{
- u32 init_nr, trace_nr, copy_len, elem_size, num_elem;
+ u32 trace_nr, copy_len, elem_size, num_elem, max_depth;
bool user_build_id = flags & BPF_F_USER_BUILD_ID;
u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
bool user = flags & BPF_F_USER_STACK;
@@ -458,30 +414,28 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
goto err_fault;
num_elem = size / elem_size;
- if (sysctl_perf_event_max_stack < num_elem)
- init_nr = 0;
- else
- init_nr = sysctl_perf_event_max_stack - num_elem;
+ max_depth = num_elem + skip;
+ if (sysctl_perf_event_max_stack < max_depth)
+ max_depth = sysctl_perf_event_max_stack;
if (trace_in)
trace = trace_in;
else if (kernel && task)
- trace = get_callchain_entry_for_task(task, init_nr);
+ trace = get_callchain_entry_for_task(task, max_depth);
else
- trace = get_perf_callchain(regs, init_nr, kernel, user,
- sysctl_perf_event_max_stack,
+ trace = get_perf_callchain(regs, 0, kernel, user, max_depth,
false, false);
if (unlikely(!trace))
goto err_fault;
- trace_nr = trace->nr - init_nr;
- if (trace_nr < skip)
+ if (trace->nr < skip)
goto err_fault;
- trace_nr -= skip;
+ trace_nr = trace->nr - skip;
trace_nr = (trace_nr <= num_elem) ? trace_nr : num_elem;
copy_len = trace_nr * elem_size;
- ips = trace->ip + skip + init_nr;
+
+ ips = trace->ip + skip;
if (user && user_build_id)
stack_map_get_build_id_offset(buf, ips, trace_nr, user);
else
@@ -518,26 +472,25 @@ BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf,
u32, size, u64, flags)
{
struct pt_regs *regs;
- long res;
+ long res = -EINVAL;
if (!try_get_task_stack(task))
return -EFAULT;
regs = task_pt_regs(task);
- res = __bpf_get_stack(regs, task, NULL, buf, size, flags);
+ if (regs)
+ res = __bpf_get_stack(regs, task, NULL, buf, size, flags);
put_task_stack(task);
return res;
}
-BTF_ID_LIST_SINGLE(bpf_get_task_stack_btf_ids, struct, task_struct)
-
const struct bpf_func_proto bpf_get_task_stack_proto = {
.func = bpf_get_task_stack,
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_BTF_ID,
- .arg1_btf_id = &bpf_get_task_stack_btf_ids[0],
+ .arg1_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
.arg2_type = ARG_PTR_TO_UNINIT_MEM,
.arg3_type = ARG_CONST_SIZE_OR_ZERO,
.arg4_type = ARG_ANYTHING,
@@ -553,7 +506,7 @@ BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx,
int err = -EINVAL;
__u64 nr_kernel;
- if (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY))
+ if (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN))
return __bpf_get_stack(regs, NULL, NULL, buf, size, flags);
if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
@@ -665,14 +618,14 @@ static int stack_map_get_next_key(struct bpf_map *map, void *key,
return 0;
}
-static int stack_map_update_elem(struct bpf_map *map, void *key, void *value,
- u64 map_flags)
+static long stack_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
{
return -EINVAL;
}
/* Called from syscall or from eBPF program */
-static int stack_map_delete_elem(struct bpf_map *map, void *key)
+static long stack_map_delete_elem(struct bpf_map *map, void *key)
{
struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
struct stack_map_bucket *old_bucket;
@@ -701,7 +654,20 @@ static void stack_map_free(struct bpf_map *map)
put_callchain_buffers();
}
-static int stack_trace_map_btf_id;
+static u64 stack_map_mem_usage(const struct bpf_map *map)
+{
+ struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
+ u64 value_size = map->value_size;
+ u64 n_buckets = smap->n_buckets;
+ u64 enties = map->max_entries;
+ u64 usage = sizeof(*smap);
+
+ usage += n_buckets * sizeof(struct stack_map_bucket *);
+ usage += enties * (sizeof(struct stack_map_bucket) + value_size);
+ return usage;
+}
+
+BTF_ID_LIST_SINGLE(stack_trace_map_btf_ids, struct, bpf_stack_map)
const struct bpf_map_ops stack_trace_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
.map_alloc = stack_map_alloc,
@@ -711,19 +677,6 @@ const struct bpf_map_ops stack_trace_map_ops = {
.map_update_elem = stack_map_update_elem,
.map_delete_elem = stack_map_delete_elem,
.map_check_btf = map_check_no_btf,
- .map_btf_name = "bpf_stack_map",
- .map_btf_id = &stack_trace_map_btf_id,
+ .map_mem_usage = stack_map_mem_usage,
+ .map_btf_id = &stack_trace_map_btf_ids[0],
};
-
-static int __init stack_map_init(void)
-{
- int cpu;
- struct stack_map_irq_work *work;
-
- for_each_possible_cpu(cpu) {
- work = per_cpu_ptr(&up_read_work, cpu);
- init_irq_work(&work->irq_work, do_up_read);
- }
- return 0;
-}
-subsys_initcall(stack_map_init);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index e343f158e556..14f39c1e573e 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2,9 +2,11 @@
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
*/
#include <linux/bpf.h>
+#include <linux/bpf-cgroup.h>
#include <linux/bpf_trace.h>
#include <linux/bpf_lirc.h>
#include <linux/bpf_verifier.h>
+#include <linux/bsearch.h>
#include <linux/btf.h>
#include <linux/syscalls.h>
#include <linux/slab.h>
@@ -28,9 +30,12 @@
#include <linux/pgtable.h>
#include <linux/bpf_lsm.h>
#include <linux/poll.h>
+#include <linux/sort.h>
#include <linux/bpf-netns.h>
#include <linux/rcupdate_trace.h>
#include <linux/memcontrol.h>
+#include <linux/trace_events.h>
+#include <net/netfilter/nf_bpf_link.h>
#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
(map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \
@@ -101,6 +106,7 @@ const struct bpf_map_ops bpf_map_offload_ops = {
.map_alloc = bpf_map_offload_map_alloc,
.map_free = bpf_map_offload_map_free,
.map_check_btf = map_check_no_btf,
+ .map_mem_usage = bpf_map_offload_map_mem_usage,
};
static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
@@ -124,6 +130,8 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
}
if (attr->map_ifindex)
ops = &bpf_map_offload_ops;
+ if (!ops->map_mem_usage)
+ return ERR_PTR(-EINVAL);
map = ops->map_alloc(attr);
if (IS_ERR(map))
return map;
@@ -132,6 +140,21 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
return map;
}
+static void bpf_map_write_active_inc(struct bpf_map *map)
+{
+ atomic64_inc(&map->writecnt);
+}
+
+static void bpf_map_write_active_dec(struct bpf_map *map)
+{
+ atomic64_dec(&map->writecnt);
+}
+
+bool bpf_map_write_active(const struct bpf_map *map)
+{
+ return atomic64_read(&map->writecnt) != 0;
+}
+
static u32 bpf_map_value_size(const struct bpf_map *map)
{
if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
@@ -156,13 +179,13 @@ static void maybe_wait_bpf_programs(struct bpf_map *map)
synchronize_rcu();
}
-static int bpf_map_update_value(struct bpf_map *map, struct fd f, void *key,
- void *value, __u64 flags)
+static int bpf_map_update_value(struct bpf_map *map, struct file *map_file,
+ void *key, void *value, __u64 flags)
{
int err;
/* Need to create a kthread, thus must support schedule */
- if (bpf_map_is_dev_bound(map)) {
+ if (bpf_map_is_offloaded(map)) {
return bpf_map_offload_update_elem(map, key, value, flags);
} else if (map->map_type == BPF_MAP_TYPE_CPUMAP ||
map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
@@ -171,7 +194,7 @@ static int bpf_map_update_value(struct bpf_map *map, struct fd f, void *key,
map->map_type == BPF_MAP_TYPE_SOCKMAP) {
return sock_map_update_elem_sys(map, key, value, flags);
} else if (IS_FD_PROG_ARRAY(map)) {
- return bpf_fd_array_map_update_elem(map, f.file, key, value,
+ return bpf_fd_array_map_update_elem(map, map_file, key, value,
flags);
}
@@ -186,12 +209,12 @@ static int bpf_map_update_value(struct bpf_map *map, struct fd f, void *key,
flags);
} else if (IS_FD_ARRAY(map)) {
rcu_read_lock();
- err = bpf_fd_array_map_update_elem(map, f.file, key, value,
+ err = bpf_fd_array_map_update_elem(map, map_file, key, value,
flags);
rcu_read_unlock();
} else if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) {
rcu_read_lock();
- err = bpf_fd_htab_map_update_elem(map, f.file, key, value,
+ err = bpf_fd_htab_map_update_elem(map, map_file, key, value,
flags);
rcu_read_unlock();
} else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
@@ -199,7 +222,8 @@ static int bpf_map_update_value(struct bpf_map *map, struct fd f, void *key,
err = bpf_fd_reuseport_array_update_elem(map, key, value,
flags);
} else if (map->map_type == BPF_MAP_TYPE_QUEUE ||
- map->map_type == BPF_MAP_TYPE_STACK) {
+ map->map_type == BPF_MAP_TYPE_STACK ||
+ map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) {
err = map->ops->map_push_elem(map, value, flags);
} else {
rcu_read_lock();
@@ -218,7 +242,7 @@ static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value,
void *ptr;
int err;
- if (bpf_map_is_dev_bound(map))
+ if (bpf_map_is_offloaded(map))
return bpf_map_offload_lookup_elem(map, key, value);
bpf_disable_instrumentation();
@@ -238,7 +262,8 @@ static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value,
} else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
err = bpf_fd_reuseport_array_lookup_elem(map, key, value);
} else if (map->map_type == BPF_MAP_TYPE_QUEUE ||
- map->map_type == BPF_MAP_TYPE_STACK) {
+ map->map_type == BPF_MAP_TYPE_STACK ||
+ map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) {
err = map->ops->map_peek_elem(map, value);
} else if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
/* struct_ops map requires directly updating "value" */
@@ -260,8 +285,8 @@ static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value,
copy_map_value_locked(map, value, ptr, true);
else
copy_map_value(map, value, ptr);
- /* mask lock, since value wasn't zero inited */
- check_and_init_map_lock(map, value);
+ /* mask lock and timer, since value wasn't zero inited */
+ check_and_init_map_value(map, value);
}
rcu_read_unlock();
}
@@ -288,7 +313,7 @@ static void *__bpf_map_area_alloc(u64 size, int numa_node, bool mmapable)
* __GFP_RETRY_MAYFAIL to avoid such situations.
*/
- const gfp_t gfp = __GFP_NOWARN | __GFP_ZERO | __GFP_ACCOUNT;
+ gfp_t gfp = bpf_memcg_flags(__GFP_NOWARN | __GFP_ZERO);
unsigned int flags = 0;
unsigned long align = 1;
void *area;
@@ -348,6 +373,7 @@ void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr)
map->max_entries = attr->max_entries;
map->map_flags = bpf_map_flags_retain_permanent(attr->map_flags);
map->numa_node = bpf_map_attr_numa_node(attr);
+ map->map_extra = attr->map_extra;
}
static int bpf_map_alloc_id(struct bpf_map *map)
@@ -368,7 +394,7 @@ static int bpf_map_alloc_id(struct bpf_map *map)
return id > 0 ? 0 : id;
}
-void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock)
+void bpf_map_free_id(struct bpf_map *map)
{
unsigned long flags;
@@ -380,52 +406,80 @@ void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock)
if (!map->id)
return;
- if (do_idr_lock)
- spin_lock_irqsave(&map_idr_lock, flags);
- else
- __acquire(&map_idr_lock);
+ spin_lock_irqsave(&map_idr_lock, flags);
idr_remove(&map_idr, map->id);
map->id = 0;
- if (do_idr_lock)
- spin_unlock_irqrestore(&map_idr_lock, flags);
- else
- __release(&map_idr_lock);
+ spin_unlock_irqrestore(&map_idr_lock, flags);
}
#ifdef CONFIG_MEMCG_KMEM
static void bpf_map_save_memcg(struct bpf_map *map)
{
- map->memcg = get_mem_cgroup_from_mm(current->mm);
+ /* Currently if a map is created by a process belonging to the root
+ * memory cgroup, get_obj_cgroup_from_current() will return NULL.
+ * So we have to check map->objcg for being NULL each time it's
+ * being used.
+ */
+ if (memcg_bpf_enabled())
+ map->objcg = get_obj_cgroup_from_current();
}
static void bpf_map_release_memcg(struct bpf_map *map)
{
- mem_cgroup_put(map->memcg);
+ if (map->objcg)
+ obj_cgroup_put(map->objcg);
+}
+
+static struct mem_cgroup *bpf_map_get_memcg(const struct bpf_map *map)
+{
+ if (map->objcg)
+ return get_mem_cgroup_from_objcg(map->objcg);
+
+ return root_mem_cgroup;
}
void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags,
int node)
{
- struct mem_cgroup *old_memcg;
+ struct mem_cgroup *memcg, *old_memcg;
void *ptr;
- old_memcg = set_active_memcg(map->memcg);
+ memcg = bpf_map_get_memcg(map);
+ old_memcg = set_active_memcg(memcg);
ptr = kmalloc_node(size, flags | __GFP_ACCOUNT, node);
set_active_memcg(old_memcg);
+ mem_cgroup_put(memcg);
return ptr;
}
void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags)
{
- struct mem_cgroup *old_memcg;
+ struct mem_cgroup *memcg, *old_memcg;
void *ptr;
- old_memcg = set_active_memcg(map->memcg);
+ memcg = bpf_map_get_memcg(map);
+ old_memcg = set_active_memcg(memcg);
ptr = kzalloc(size, flags | __GFP_ACCOUNT);
set_active_memcg(old_memcg);
+ mem_cgroup_put(memcg);
+
+ return ptr;
+}
+
+void *bpf_map_kvcalloc(struct bpf_map *map, size_t n, size_t size,
+ gfp_t flags)
+{
+ struct mem_cgroup *memcg, *old_memcg;
+ void *ptr;
+
+ memcg = bpf_map_get_memcg(map);
+ old_memcg = set_active_memcg(memcg);
+ ptr = kvcalloc(n, size, flags | __GFP_ACCOUNT);
+ set_active_memcg(old_memcg);
+ mem_cgroup_put(memcg);
return ptr;
}
@@ -433,12 +487,14 @@ void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags)
void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size,
size_t align, gfp_t flags)
{
- struct mem_cgroup *old_memcg;
+ struct mem_cgroup *memcg, *old_memcg;
void __percpu *ptr;
- old_memcg = set_active_memcg(map->memcg);
+ memcg = bpf_map_get_memcg(map);
+ old_memcg = set_active_memcg(memcg);
ptr = __alloc_percpu_gfp(size, align, flags | __GFP_ACCOUNT);
set_active_memcg(old_memcg);
+ mem_cgroup_put(memcg);
return ptr;
}
@@ -453,15 +509,234 @@ static void bpf_map_release_memcg(struct bpf_map *map)
}
#endif
+static int btf_field_cmp(const void *a, const void *b)
+{
+ const struct btf_field *f1 = a, *f2 = b;
+
+ if (f1->offset < f2->offset)
+ return -1;
+ else if (f1->offset > f2->offset)
+ return 1;
+ return 0;
+}
+
+struct btf_field *btf_record_find(const struct btf_record *rec, u32 offset,
+ u32 field_mask)
+{
+ struct btf_field *field;
+
+ if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & field_mask))
+ return NULL;
+ field = bsearch(&offset, rec->fields, rec->cnt, sizeof(rec->fields[0]), btf_field_cmp);
+ if (!field || !(field->type & field_mask))
+ return NULL;
+ return field;
+}
+
+void btf_record_free(struct btf_record *rec)
+{
+ int i;
+
+ if (IS_ERR_OR_NULL(rec))
+ return;
+ for (i = 0; i < rec->cnt; i++) {
+ switch (rec->fields[i].type) {
+ case BPF_KPTR_UNREF:
+ case BPF_KPTR_REF:
+ if (rec->fields[i].kptr.module)
+ module_put(rec->fields[i].kptr.module);
+ btf_put(rec->fields[i].kptr.btf);
+ break;
+ case BPF_LIST_HEAD:
+ case BPF_LIST_NODE:
+ case BPF_RB_ROOT:
+ case BPF_RB_NODE:
+ case BPF_SPIN_LOCK:
+ case BPF_TIMER:
+ case BPF_REFCOUNT:
+ /* Nothing to release */
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ continue;
+ }
+ }
+ kfree(rec);
+}
+
+void bpf_map_free_record(struct bpf_map *map)
+{
+ btf_record_free(map->record);
+ map->record = NULL;
+}
+
+struct btf_record *btf_record_dup(const struct btf_record *rec)
+{
+ const struct btf_field *fields;
+ struct btf_record *new_rec;
+ int ret, size, i;
+
+ if (IS_ERR_OR_NULL(rec))
+ return NULL;
+ size = offsetof(struct btf_record, fields[rec->cnt]);
+ new_rec = kmemdup(rec, size, GFP_KERNEL | __GFP_NOWARN);
+ if (!new_rec)
+ return ERR_PTR(-ENOMEM);
+ /* Do a deep copy of the btf_record */
+ fields = rec->fields;
+ new_rec->cnt = 0;
+ for (i = 0; i < rec->cnt; i++) {
+ switch (fields[i].type) {
+ case BPF_KPTR_UNREF:
+ case BPF_KPTR_REF:
+ btf_get(fields[i].kptr.btf);
+ if (fields[i].kptr.module && !try_module_get(fields[i].kptr.module)) {
+ ret = -ENXIO;
+ goto free;
+ }
+ break;
+ case BPF_LIST_HEAD:
+ case BPF_LIST_NODE:
+ case BPF_RB_ROOT:
+ case BPF_RB_NODE:
+ case BPF_SPIN_LOCK:
+ case BPF_TIMER:
+ case BPF_REFCOUNT:
+ /* Nothing to acquire */
+ break;
+ default:
+ ret = -EFAULT;
+ WARN_ON_ONCE(1);
+ goto free;
+ }
+ new_rec->cnt++;
+ }
+ return new_rec;
+free:
+ btf_record_free(new_rec);
+ return ERR_PTR(ret);
+}
+
+bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *rec_b)
+{
+ bool a_has_fields = !IS_ERR_OR_NULL(rec_a), b_has_fields = !IS_ERR_OR_NULL(rec_b);
+ int size;
+
+ if (!a_has_fields && !b_has_fields)
+ return true;
+ if (a_has_fields != b_has_fields)
+ return false;
+ if (rec_a->cnt != rec_b->cnt)
+ return false;
+ size = offsetof(struct btf_record, fields[rec_a->cnt]);
+ /* btf_parse_fields uses kzalloc to allocate a btf_record, so unused
+ * members are zeroed out. So memcmp is safe to do without worrying
+ * about padding/unused fields.
+ *
+ * While spin_lock, timer, and kptr have no relation to map BTF,
+ * list_head metadata is specific to map BTF, the btf and value_rec
+ * members in particular. btf is the map BTF, while value_rec points to
+ * btf_record in that map BTF.
+ *
+ * So while by default, we don't rely on the map BTF (which the records
+ * were parsed from) matching for both records, which is not backwards
+ * compatible, in case list_head is part of it, we implicitly rely on
+ * that by way of depending on memcmp succeeding for it.
+ */
+ return !memcmp(rec_a, rec_b, size);
+}
+
+void bpf_obj_free_timer(const struct btf_record *rec, void *obj)
+{
+ if (WARN_ON_ONCE(!btf_record_has_field(rec, BPF_TIMER)))
+ return;
+ bpf_timer_cancel_and_free(obj + rec->timer_off);
+}
+
+extern void __bpf_obj_drop_impl(void *p, const struct btf_record *rec);
+
+void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
+{
+ const struct btf_field *fields;
+ int i;
+
+ if (IS_ERR_OR_NULL(rec))
+ return;
+ fields = rec->fields;
+ for (i = 0; i < rec->cnt; i++) {
+ struct btf_struct_meta *pointee_struct_meta;
+ const struct btf_field *field = &fields[i];
+ void *field_ptr = obj + field->offset;
+ void *xchgd_field;
+
+ switch (fields[i].type) {
+ case BPF_SPIN_LOCK:
+ break;
+ case BPF_TIMER:
+ bpf_timer_cancel_and_free(field_ptr);
+ break;
+ case BPF_KPTR_UNREF:
+ WRITE_ONCE(*(u64 *)field_ptr, 0);
+ break;
+ case BPF_KPTR_REF:
+ xchgd_field = (void *)xchg((unsigned long *)field_ptr, 0);
+ if (!xchgd_field)
+ break;
+
+ if (!btf_is_kernel(field->kptr.btf)) {
+ pointee_struct_meta = btf_find_struct_meta(field->kptr.btf,
+ field->kptr.btf_id);
+ WARN_ON_ONCE(!pointee_struct_meta);
+ migrate_disable();
+ __bpf_obj_drop_impl(xchgd_field, pointee_struct_meta ?
+ pointee_struct_meta->record :
+ NULL);
+ migrate_enable();
+ } else {
+ field->kptr.dtor(xchgd_field);
+ }
+ break;
+ case BPF_LIST_HEAD:
+ if (WARN_ON_ONCE(rec->spin_lock_off < 0))
+ continue;
+ bpf_list_head_free(field, field_ptr, obj + rec->spin_lock_off);
+ break;
+ case BPF_RB_ROOT:
+ if (WARN_ON_ONCE(rec->spin_lock_off < 0))
+ continue;
+ bpf_rb_root_free(field, field_ptr, obj + rec->spin_lock_off);
+ break;
+ case BPF_LIST_NODE:
+ case BPF_RB_NODE:
+ case BPF_REFCOUNT:
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ continue;
+ }
+ }
+}
+
/* called from workqueue */
static void bpf_map_free_deferred(struct work_struct *work)
{
struct bpf_map *map = container_of(work, struct bpf_map, work);
+ struct btf_record *rec = map->record;
security_bpf_map_free(map);
bpf_map_release_memcg(map);
/* implementation dependent freeing */
map->ops->map_free(map);
+ /* Delay freeing of btf_record for maps, as map_free
+ * callback usually needs access to them. It is better to do it here
+ * than require each callback to do the free itself manually.
+ *
+ * Note that the btf_record stashed in map->inner_map_meta->record was
+ * already freed using the map_free callback for map in map case which
+ * eventually calls bpf_map_free_meta, since inner_map_meta is only a
+ * template bpf_map struct used during verification.
+ */
+ btf_record_free(rec);
}
static void bpf_map_put_uref(struct bpf_map *map)
@@ -473,23 +748,21 @@ static void bpf_map_put_uref(struct bpf_map *map)
}
/* decrement map refcnt and schedule it for freeing via workqueue
- * (unrelying map implementation ops->map_free() might sleep)
+ * (underlying map implementation ops->map_free() might sleep)
*/
-static void __bpf_map_put(struct bpf_map *map, bool do_idr_lock)
+void bpf_map_put(struct bpf_map *map)
{
if (atomic64_dec_and_test(&map->refcnt)) {
/* bpf_map_free_id() must be called first */
- bpf_map_free_id(map, do_idr_lock);
+ bpf_map_free_id(map);
btf_put(map->btf);
INIT_WORK(&map->work, bpf_map_free_deferred);
- schedule_work(&map->work);
+ /* Avoid spawning kworkers, since they all might contend
+ * for the same mutex like slab_mutex.
+ */
+ queue_work(system_unbound_wq, &map->work);
}
}
-
-void bpf_map_put(struct bpf_map *map)
-{
- __bpf_map_put(map, true);
-}
EXPORT_SYMBOL_GPL(bpf_map_put);
void bpf_map_put_with_uref(struct bpf_map *map)
@@ -522,29 +795,22 @@ static fmode_t map_get_sys_perms(struct bpf_map *map, struct fd f)
}
#ifdef CONFIG_PROC_FS
-/* Provides an approximation of the map's memory footprint.
- * Used only to provide a backward compatibility and display
- * a reasonable "memlock" info.
- */
-static unsigned long bpf_map_memory_footprint(const struct bpf_map *map)
+/* Show the memory usage of a bpf map */
+static u64 bpf_map_memory_usage(const struct bpf_map *map)
{
- unsigned long size;
-
- size = round_up(map->key_size + bpf_map_value_size(map), 8);
-
- return round_up(map->max_entries * size, PAGE_SIZE);
+ return map->ops->map_mem_usage(map);
}
static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
{
- const struct bpf_map *map = filp->private_data;
- const struct bpf_array *array;
+ struct bpf_map *map = filp->private_data;
u32 type = 0, jited = 0;
- if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY) {
- array = container_of(map, struct bpf_array, map);
- type = array->aux->type;
- jited = array->aux->jited;
+ if (map_type_contains_progs(map)) {
+ spin_lock(&map->owner.lock);
+ type = map->owner.type;
+ jited = map->owner.jited;
+ spin_unlock(&map->owner.lock);
}
seq_printf(m,
@@ -553,7 +819,8 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
"value_size:\t%u\n"
"max_entries:\t%u\n"
"map_flags:\t%#x\n"
- "memlock:\t%lu\n"
+ "map_extra:\t%#llx\n"
+ "memlock:\t%llu\n"
"map_id:\t%u\n"
"frozen:\t%u\n",
map->map_type,
@@ -561,7 +828,8 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
map->value_size,
map->max_entries,
map->map_flags,
- bpf_map_memory_footprint(map),
+ (unsigned long long)map->map_extra,
+ bpf_map_memory_usage(map),
map->id,
READ_ONCE(map->frozen));
if (type) {
@@ -594,11 +862,8 @@ static void bpf_map_mmap_open(struct vm_area_struct *vma)
{
struct bpf_map *map = vma->vm_file->private_data;
- if (vma->vm_flags & VM_MAYWRITE) {
- mutex_lock(&map->freeze_mutex);
- map->writecnt++;
- mutex_unlock(&map->freeze_mutex);
- }
+ if (vma->vm_flags & VM_MAYWRITE)
+ bpf_map_write_active_inc(map);
}
/* called for all unmapped memory region (including initial) */
@@ -606,11 +871,8 @@ static void bpf_map_mmap_close(struct vm_area_struct *vma)
{
struct bpf_map *map = vma->vm_file->private_data;
- if (vma->vm_flags & VM_MAYWRITE) {
- mutex_lock(&map->freeze_mutex);
- map->writecnt--;
- mutex_unlock(&map->freeze_mutex);
- }
+ if (vma->vm_flags & VM_MAYWRITE)
+ bpf_map_write_active_dec(map);
}
static const struct vm_operations_struct bpf_map_default_vmops = {
@@ -623,7 +885,7 @@ static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma)
struct bpf_map *map = filp->private_data;
int err;
- if (!map->ops->map_mmap || map_value_has_spin_lock(map))
+ if (!map->ops->map_mmap || !IS_ERR_OR_NULL(map->record))
return -ENOTSUPP;
if (!(vma->vm_flags & VM_SHARED))
@@ -650,17 +912,17 @@ static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma)
/* set default open/close callbacks */
vma->vm_ops = &bpf_map_default_vmops;
vma->vm_private_data = map;
- vma->vm_flags &= ~VM_MAYEXEC;
+ vm_flags_clear(vma, VM_MAYEXEC);
if (!(vma->vm_flags & VM_WRITE))
/* disallow re-mapping with PROT_WRITE */
- vma->vm_flags &= ~VM_MAYWRITE;
+ vm_flags_clear(vma, VM_MAYWRITE);
err = map->ops->map_mmap(map, vma);
if (err)
goto out;
if (vma->vm_flags & VM_MAYWRITE)
- map->writecnt++;
+ bpf_map_write_active_inc(map);
out:
mutex_unlock(&map->freeze_mutex);
return err;
@@ -772,34 +1034,96 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
if (!value_type || value_size != map->value_size)
return -EINVAL;
- map->spin_lock_off = btf_find_spin_lock(btf, value_type);
-
- if (map_value_has_spin_lock(map)) {
- if (map->map_flags & BPF_F_RDONLY_PROG)
- return -EACCES;
- if (map->map_type != BPF_MAP_TYPE_HASH &&
- map->map_type != BPF_MAP_TYPE_ARRAY &&
- map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE &&
- map->map_type != BPF_MAP_TYPE_SK_STORAGE &&
- map->map_type != BPF_MAP_TYPE_INODE_STORAGE &&
- map->map_type != BPF_MAP_TYPE_TASK_STORAGE)
- return -ENOTSUPP;
- if (map->spin_lock_off + sizeof(struct bpf_spin_lock) >
- map->value_size) {
- WARN_ONCE(1,
- "verifier bug spin_lock_off %d value_size %d\n",
- map->spin_lock_off, map->value_size);
- return -EFAULT;
+ map->record = btf_parse_fields(btf, value_type,
+ BPF_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD |
+ BPF_RB_ROOT | BPF_REFCOUNT,
+ map->value_size);
+ if (!IS_ERR_OR_NULL(map->record)) {
+ int i;
+
+ if (!bpf_capable()) {
+ ret = -EPERM;
+ goto free_map_tab;
+ }
+ if (map->map_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG)) {
+ ret = -EACCES;
+ goto free_map_tab;
+ }
+ for (i = 0; i < sizeof(map->record->field_mask) * 8; i++) {
+ switch (map->record->field_mask & (1 << i)) {
+ case 0:
+ continue;
+ case BPF_SPIN_LOCK:
+ if (map->map_type != BPF_MAP_TYPE_HASH &&
+ map->map_type != BPF_MAP_TYPE_ARRAY &&
+ map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE &&
+ map->map_type != BPF_MAP_TYPE_SK_STORAGE &&
+ map->map_type != BPF_MAP_TYPE_INODE_STORAGE &&
+ map->map_type != BPF_MAP_TYPE_TASK_STORAGE &&
+ map->map_type != BPF_MAP_TYPE_CGRP_STORAGE) {
+ ret = -EOPNOTSUPP;
+ goto free_map_tab;
+ }
+ break;
+ case BPF_TIMER:
+ if (map->map_type != BPF_MAP_TYPE_HASH &&
+ map->map_type != BPF_MAP_TYPE_LRU_HASH &&
+ map->map_type != BPF_MAP_TYPE_ARRAY) {
+ ret = -EOPNOTSUPP;
+ goto free_map_tab;
+ }
+ break;
+ case BPF_KPTR_UNREF:
+ case BPF_KPTR_REF:
+ case BPF_REFCOUNT:
+ if (map->map_type != BPF_MAP_TYPE_HASH &&
+ map->map_type != BPF_MAP_TYPE_PERCPU_HASH &&
+ map->map_type != BPF_MAP_TYPE_LRU_HASH &&
+ map->map_type != BPF_MAP_TYPE_LRU_PERCPU_HASH &&
+ map->map_type != BPF_MAP_TYPE_ARRAY &&
+ map->map_type != BPF_MAP_TYPE_PERCPU_ARRAY &&
+ map->map_type != BPF_MAP_TYPE_SK_STORAGE &&
+ map->map_type != BPF_MAP_TYPE_INODE_STORAGE &&
+ map->map_type != BPF_MAP_TYPE_TASK_STORAGE &&
+ map->map_type != BPF_MAP_TYPE_CGRP_STORAGE) {
+ ret = -EOPNOTSUPP;
+ goto free_map_tab;
+ }
+ break;
+ case BPF_LIST_HEAD:
+ case BPF_RB_ROOT:
+ if (map->map_type != BPF_MAP_TYPE_HASH &&
+ map->map_type != BPF_MAP_TYPE_LRU_HASH &&
+ map->map_type != BPF_MAP_TYPE_ARRAY) {
+ ret = -EOPNOTSUPP;
+ goto free_map_tab;
+ }
+ break;
+ default:
+ /* Fail if map_type checks are missing for a field type */
+ ret = -EOPNOTSUPP;
+ goto free_map_tab;
+ }
}
}
- if (map->ops->map_check_btf)
+ ret = btf_check_and_fixup_fields(btf, map->record);
+ if (ret < 0)
+ goto free_map_tab;
+
+ if (map->ops->map_check_btf) {
ret = map->ops->map_check_btf(map, btf, key_type, value_type);
+ if (ret < 0)
+ goto free_map_tab;
+ }
return ret;
+free_map_tab:
+ bpf_map_free_record(map);
+ return ret;
}
-#define BPF_MAP_CREATE_LAST_FIELD btf_vmlinux_value_type_id
+#define BPF_MAP_CREATE_LAST_FIELD map_extra
/* called via syscall */
static int map_create(union bpf_attr *attr)
{
@@ -820,6 +1144,10 @@ static int map_create(union bpf_attr *attr)
return -EINVAL;
}
+ if (attr->map_type != BPF_MAP_TYPE_BLOOM_FILTER &&
+ attr->map_extra != 0)
+ return -EINVAL;
+
f_flags = bpf_get_file_flag(attr->map_flags);
if (f_flags < 0)
return f_flags;
@@ -842,8 +1170,8 @@ static int map_create(union bpf_attr *attr)
atomic64_set(&map->refcnt, 1);
atomic64_set(&map->usercnt, 1);
mutex_init(&map->freeze_mutex);
+ spin_lock_init(&map->owner.lock);
- map->spin_lock_off = -EINVAL;
if (attr->btf_key_type_id || attr->btf_value_type_id ||
/* Even the map's value is a kernel's struct,
* the bpf_prog.o must have BTF to begin with
@@ -953,6 +1281,7 @@ struct bpf_map *bpf_map_get(u32 ufd)
return map;
}
+EXPORT_SYMBOL(bpf_map_get);
struct bpf_map *bpf_map_get_with_uref(u32 ufd)
{
@@ -969,8 +1298,10 @@ struct bpf_map *bpf_map_get_with_uref(u32 ufd)
return map;
}
-/* map_idr_lock should have been held */
-static struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref)
+/* map_idr_lock should have been held or the map should have been
+ * protected by rcu read lock.
+ */
+struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref)
{
int refold;
@@ -1001,7 +1332,7 @@ int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
static void *__bpf_copy_key(void __user *ukey, u64 key_size)
{
if (key_size)
- return memdup_user(ukey, key_size);
+ return vmemdup_user(ukey, key_size);
if (ukey)
return ERR_PTR(-EINVAL);
@@ -1012,7 +1343,7 @@ static void *__bpf_copy_key(void __user *ukey, u64 key_size)
static void *___bpf_copy_key(bpfptr_t ukey, u64 key_size)
{
if (key_size)
- return memdup_bpfptr(ukey, key_size);
+ return kvmemdup_bpfptr(ukey, key_size);
if (!bpfptr_is_null(ukey))
return ERR_PTR(-EINVAL);
@@ -1050,7 +1381,7 @@ static int map_lookup_elem(union bpf_attr *attr)
}
if ((attr->flags & BPF_F_LOCK) &&
- !map_value_has_spin_lock(map)) {
+ !btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
err = -EINVAL;
goto err_put;
}
@@ -1064,10 +1395,18 @@ static int map_lookup_elem(union bpf_attr *attr)
value_size = bpf_map_value_size(map);
err = -ENOMEM;
- value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
+ value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN);
if (!value)
goto free_key;
+ if (map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) {
+ if (copy_from_user(value, uvalue, value_size))
+ err = -EFAULT;
+ else
+ err = bpf_map_copy_value(map, key, value, attr->flags);
+ goto free_value;
+ }
+
err = bpf_map_copy_value(map, key, value, attr->flags);
if (err)
goto free_value;
@@ -1079,9 +1418,9 @@ static int map_lookup_elem(union bpf_attr *attr)
err = 0;
free_value:
- kfree(value);
+ kvfree(value);
free_key:
- kfree(key);
+ kvfree(key);
err_put:
fdput(f);
return err;
@@ -1108,13 +1447,14 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr)
map = __bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
+ bpf_map_write_active_inc(map);
if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
err = -EPERM;
goto err_put;
}
if ((attr->flags & BPF_F_LOCK) &&
- !map_value_has_spin_lock(map)) {
+ !btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
err = -EINVAL;
goto err_put;
}
@@ -1125,39 +1465,29 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr)
goto err_put;
}
- if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
- map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
- map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY ||
- map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
- value_size = round_up(map->value_size, 8) * num_possible_cpus();
- else
- value_size = map->value_size;
-
- err = -ENOMEM;
- value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
- if (!value)
+ value_size = bpf_map_value_size(map);
+ value = kvmemdup_bpfptr(uvalue, value_size);
+ if (IS_ERR(value)) {
+ err = PTR_ERR(value);
goto free_key;
+ }
- err = -EFAULT;
- if (copy_from_bpfptr(value, uvalue, value_size) != 0)
- goto free_value;
-
- err = bpf_map_update_value(map, f, key, value, attr->flags);
+ err = bpf_map_update_value(map, f.file, key, value, attr->flags);
-free_value:
- kfree(value);
+ kvfree(value);
free_key:
- kfree(key);
+ kvfree(key);
err_put:
+ bpf_map_write_active_dec(map);
fdput(f);
return err;
}
#define BPF_MAP_DELETE_ELEM_LAST_FIELD key
-static int map_delete_elem(union bpf_attr *attr)
+static int map_delete_elem(union bpf_attr *attr, bpfptr_t uattr)
{
- void __user *ukey = u64_to_user_ptr(attr->key);
+ bpfptr_t ukey = make_bpfptr(attr->key, uattr.is_kernel);
int ufd = attr->map_fd;
struct bpf_map *map;
struct fd f;
@@ -1171,18 +1501,19 @@ static int map_delete_elem(union bpf_attr *attr)
map = __bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
+ bpf_map_write_active_inc(map);
if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
err = -EPERM;
goto err_put;
}
- key = __bpf_copy_key(ukey, map->key_size);
+ key = ___bpf_copy_key(ukey, map->key_size);
if (IS_ERR(key)) {
err = PTR_ERR(key);
goto err_put;
}
- if (bpf_map_is_dev_bound(map)) {
+ if (bpf_map_is_offloaded(map)) {
err = bpf_map_offload_delete_elem(map, key);
goto out;
} else if (IS_FD_PROG_ARRAY(map) ||
@@ -1199,8 +1530,9 @@ static int map_delete_elem(union bpf_attr *attr)
bpf_enable_instrumentation();
maybe_wait_bpf_programs(map);
out:
- kfree(key);
+ kvfree(key);
err_put:
+ bpf_map_write_active_dec(map);
fdput(f);
return err;
}
@@ -1241,11 +1573,11 @@ static int map_get_next_key(union bpf_attr *attr)
}
err = -ENOMEM;
- next_key = kmalloc(map->key_size, GFP_USER);
+ next_key = kvmalloc(map->key_size, GFP_USER);
if (!next_key)
goto free_key;
- if (bpf_map_is_dev_bound(map)) {
+ if (bpf_map_is_offloaded(map)) {
err = bpf_map_offload_get_next_key(map, key, next_key);
goto out;
}
@@ -1264,9 +1596,9 @@ out:
err = 0;
free_next_key:
- kfree(next_key);
+ kvfree(next_key);
free_key:
- kfree(key);
+ kvfree(key);
err_put:
fdput(f);
return err;
@@ -1285,7 +1617,7 @@ int generic_map_delete_batch(struct bpf_map *map,
return -EINVAL;
if ((attr->batch.elem_flags & BPF_F_LOCK) &&
- !map_value_has_spin_lock(map)) {
+ !btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
return -EINVAL;
}
@@ -1293,7 +1625,7 @@ int generic_map_delete_batch(struct bpf_map *map,
if (!max_count)
return 0;
- key = kmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
+ key = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
if (!key)
return -ENOMEM;
@@ -1303,7 +1635,7 @@ int generic_map_delete_batch(struct bpf_map *map,
map->key_size))
break;
- if (bpf_map_is_dev_bound(map)) {
+ if (bpf_map_is_offloaded(map)) {
err = bpf_map_offload_delete_elem(map, key);
break;
}
@@ -1313,35 +1645,34 @@ int generic_map_delete_batch(struct bpf_map *map,
err = map->ops->map_delete_elem(map, key);
rcu_read_unlock();
bpf_enable_instrumentation();
- maybe_wait_bpf_programs(map);
if (err)
break;
+ cond_resched();
}
if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp)))
err = -EFAULT;
- kfree(key);
+ kvfree(key);
+
+ maybe_wait_bpf_programs(map);
return err;
}
-int generic_map_update_batch(struct bpf_map *map,
+int generic_map_update_batch(struct bpf_map *map, struct file *map_file,
const union bpf_attr *attr,
union bpf_attr __user *uattr)
{
void __user *values = u64_to_user_ptr(attr->batch.values);
void __user *keys = u64_to_user_ptr(attr->batch.keys);
u32 value_size, cp, max_count;
- int ufd = attr->map_fd;
void *key, *value;
- struct fd f;
int err = 0;
- f = fdget(ufd);
if (attr->batch.elem_flags & ~BPF_F_LOCK)
return -EINVAL;
if ((attr->batch.elem_flags & BPF_F_LOCK) &&
- !map_value_has_spin_lock(map)) {
+ !btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
return -EINVAL;
}
@@ -1351,13 +1682,13 @@ int generic_map_update_batch(struct bpf_map *map,
if (!max_count)
return 0;
- key = kmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
+ key = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
if (!key)
return -ENOMEM;
- value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
+ value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN);
if (!value) {
- kfree(key);
+ kvfree(key);
return -ENOMEM;
}
@@ -1368,18 +1699,19 @@ int generic_map_update_batch(struct bpf_map *map,
copy_from_user(value, values + cp * value_size, value_size))
break;
- err = bpf_map_update_value(map, f, key, value,
+ err = bpf_map_update_value(map, map_file, key, value,
attr->batch.elem_flags);
if (err)
break;
+ cond_resched();
}
if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp)))
err = -EFAULT;
- kfree(value);
- kfree(key);
+ kvfree(value);
+ kvfree(key);
return err;
}
@@ -1401,7 +1733,7 @@ int generic_map_lookup_batch(struct bpf_map *map,
return -EINVAL;
if ((attr->batch.elem_flags & BPF_F_LOCK) &&
- !map_value_has_spin_lock(map))
+ !btf_record_has_field(map->record, BPF_SPIN_LOCK))
return -EINVAL;
value_size = bpf_map_value_size(map);
@@ -1413,13 +1745,13 @@ int generic_map_lookup_batch(struct bpf_map *map,
if (put_user(0, &uattr->batch.count))
return -EFAULT;
- buf_prevkey = kmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
+ buf_prevkey = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
if (!buf_prevkey)
return -ENOMEM;
- buf = kmalloc(map->key_size + value_size, GFP_USER | __GFP_NOWARN);
+ buf = kvmalloc(map->key_size + value_size, GFP_USER | __GFP_NOWARN);
if (!buf) {
- kfree(buf_prevkey);
+ kvfree(buf_prevkey);
return -ENOMEM;
}
@@ -1469,6 +1801,7 @@ int generic_map_lookup_batch(struct bpf_map *map,
swap(prev_key, key);
retry = MAP_LOOKUP_RETRIES;
cp++;
+ cond_resched();
}
if (err == -EFAULT)
@@ -1479,8 +1812,8 @@ int generic_map_lookup_batch(struct bpf_map *map,
err = -EFAULT;
free_buf:
- kfree(buf_prevkey);
- kfree(buf);
+ kvfree(buf_prevkey);
+ kvfree(buf);
return err;
}
@@ -1507,6 +1840,7 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr)
map = __bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
+ bpf_map_write_active_inc(map);
if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ) ||
!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
err = -EPERM;
@@ -1521,7 +1855,7 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr)
}
if ((attr->flags & BPF_F_LOCK) &&
- !map_value_has_spin_lock(map)) {
+ !btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
err = -EINVAL;
goto err_put;
}
@@ -1535,7 +1869,7 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr)
value_size = bpf_map_value_size(map);
err = -ENOMEM;
- value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
+ value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN);
if (!value)
goto free_key;
@@ -1547,7 +1881,7 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr)
map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
map->map_type == BPF_MAP_TYPE_LRU_HASH ||
map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
- if (!bpf_map_is_dev_bound(map)) {
+ if (!bpf_map_is_offloaded(map)) {
bpf_disable_instrumentation();
rcu_read_lock();
err = map->ops->map_lookup_and_delete_elem(map, key, value, attr->flags);
@@ -1567,10 +1901,11 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr)
err = 0;
free_value:
- kfree(value);
+ kvfree(value);
free_key:
- kfree(key);
+ kvfree(key);
err_put:
+ bpf_map_write_active_dec(map);
fdput(f);
return err;
}
@@ -1591,14 +1926,13 @@ static int map_freeze(const union bpf_attr *attr)
if (IS_ERR(map))
return PTR_ERR(map);
- if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
+ if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS || !IS_ERR_OR_NULL(map->record)) {
fdput(f);
return -ENOTSUPP;
}
mutex_lock(&map->freeze_mutex);
-
- if (map->writecnt) {
+ if (bpf_map_write_active(map)) {
err = -EBUSY;
goto err_put;
}
@@ -1640,7 +1974,7 @@ static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)
if (!ops)
return -EINVAL;
- if (!bpf_prog_is_dev_bound(prog->aux))
+ if (!bpf_prog_is_offloaded(prog->aux))
prog->aux->ops = ops;
else
prog->aux->ops = &bpf_offload_prog_ops;
@@ -1668,7 +2002,7 @@ static void bpf_audit_prog(const struct bpf_prog *prog, unsigned int op)
return;
if (audit_enabled == AUDIT_OFF)
return;
- if (op == BPF_AUDIT_LOAD)
+ if (!in_irq() && !irqs_disabled())
ctx = audit_context();
ab = audit_log_start(ctx, GFP_ATOMIC, AUDIT_BPF);
if (unlikely(!ab))
@@ -1697,8 +2031,10 @@ static int bpf_prog_alloc_id(struct bpf_prog *prog)
return id > 0 ? 0 : id;
}
-void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock)
+void bpf_prog_free_id(struct bpf_prog *prog)
{
+ unsigned long flags;
+
/* cBPF to eBPF migrations are currently not in the idr store.
* Offloaded programs are removed from the store when their device
* disappears - even if someone grabs an fd to them they are unusable,
@@ -1707,18 +2043,10 @@ void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock)
if (!prog->aux->id)
return;
- if (do_idr_lock)
- spin_lock_bh(&prog_idr_lock);
- else
- __acquire(&prog_idr_lock);
-
+ spin_lock_irqsave(&prog_idr_lock, flags);
idr_remove(&prog_idr, prog->aux->id);
prog->aux->id = 0;
-
- if (do_idr_lock)
- spin_unlock_bh(&prog_idr_lock);
- else
- __release(&prog_idr_lock);
+ spin_unlock_irqrestore(&prog_idr_lock, flags);
}
static void __bpf_prog_put_rcu(struct rcu_head *rcu)
@@ -1736,6 +2064,7 @@ static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred)
{
bpf_prog_kallsyms_del_all(prog);
btf_put(prog->aux->btf);
+ module_put(prog->aux->mod);
kvfree(prog->aux->jited_linfo);
kvfree(prog->aux->linfo);
kfree(prog->aux->kfunc_tab);
@@ -1752,20 +2081,36 @@ static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred)
}
}
-static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock)
+static void bpf_prog_put_deferred(struct work_struct *work)
{
- if (atomic64_dec_and_test(&prog->aux->refcnt)) {
- perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0);
- bpf_audit_prog(prog, BPF_AUDIT_UNLOAD);
- /* bpf_prog_free_id() must be called first */
- bpf_prog_free_id(prog, do_idr_lock);
- __bpf_prog_put_noref(prog, true);
+ struct bpf_prog_aux *aux;
+ struct bpf_prog *prog;
+
+ aux = container_of(work, struct bpf_prog_aux, work);
+ prog = aux->prog;
+ perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0);
+ bpf_audit_prog(prog, BPF_AUDIT_UNLOAD);
+ bpf_prog_free_id(prog);
+ __bpf_prog_put_noref(prog, true);
+}
+
+static void __bpf_prog_put(struct bpf_prog *prog)
+{
+ struct bpf_prog_aux *aux = prog->aux;
+
+ if (atomic64_dec_and_test(&aux->refcnt)) {
+ if (in_irq() || irqs_disabled()) {
+ INIT_WORK(&aux->work, bpf_prog_put_deferred);
+ schedule_work(&aux->work);
+ } else {
+ bpf_prog_put_deferred(&aux->work);
+ }
}
}
void bpf_prog_put(struct bpf_prog *prog)
{
- __bpf_prog_put(prog, true);
+ __bpf_prog_put(prog);
}
EXPORT_SYMBOL_GPL(bpf_prog_put);
@@ -1777,8 +2122,25 @@ static int bpf_prog_release(struct inode *inode, struct file *filp)
return 0;
}
+struct bpf_prog_kstats {
+ u64 nsecs;
+ u64 cnt;
+ u64 misses;
+};
+
+void notrace bpf_prog_inc_misses_counter(struct bpf_prog *prog)
+{
+ struct bpf_prog_stats *stats;
+ unsigned int flags;
+
+ stats = this_cpu_ptr(prog->stats);
+ flags = u64_stats_update_begin_irqsave(&stats->syncp);
+ u64_stats_inc(&stats->misses);
+ u64_stats_update_end_irqrestore(&stats->syncp, flags);
+}
+
static void bpf_prog_get_stats(const struct bpf_prog *prog,
- struct bpf_prog_stats *stats)
+ struct bpf_prog_kstats *stats)
{
u64 nsecs = 0, cnt = 0, misses = 0;
int cpu;
@@ -1790,11 +2152,11 @@ static void bpf_prog_get_stats(const struct bpf_prog *prog,
st = per_cpu_ptr(prog->stats, cpu);
do {
- start = u64_stats_fetch_begin_irq(&st->syncp);
- tnsecs = st->nsecs;
- tcnt = st->cnt;
- tmisses = st->misses;
- } while (u64_stats_fetch_retry_irq(&st->syncp, start));
+ start = u64_stats_fetch_begin(&st->syncp);
+ tnsecs = u64_stats_read(&st->nsecs);
+ tcnt = u64_stats_read(&st->cnt);
+ tmisses = u64_stats_read(&st->misses);
+ } while (u64_stats_fetch_retry(&st->syncp, start));
nsecs += tnsecs;
cnt += tcnt;
misses += tmisses;
@@ -1809,7 +2171,7 @@ static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp)
{
const struct bpf_prog *prog = filp->private_data;
char prog_tag[sizeof(prog->tag) * 2 + 1] = { };
- struct bpf_prog_stats stats;
+ struct bpf_prog_kstats stats;
bpf_prog_get_stats(prog, &stats);
bin2hex(prog_tag, prog->tag, sizeof(prog->tag));
@@ -1821,7 +2183,8 @@ static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp)
"prog_id:\t%u\n"
"run_time_ns:\t%llu\n"
"run_cnt:\t%llu\n"
- "recursion_misses:\t%llu\n",
+ "recursion_misses:\t%llu\n"
+ "verified_insns:\t%u\n",
prog->type,
prog->jited,
prog_tag,
@@ -1829,7 +2192,8 @@ static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp)
prog->aux->id,
stats.nsecs,
stats.cnt,
- stats.misses);
+ stats.misses,
+ prog->aux->verified_insns);
}
#endif
@@ -1912,7 +2276,7 @@ bool bpf_prog_get_ok(struct bpf_prog *prog,
if (prog->type != *attach_type)
return false;
- if (bpf_prog_is_dev_bound(prog->aux) && !attach_drv)
+ if (bpf_prog_is_offloaded(prog->aux) && !attach_drv)
return false;
return true;
@@ -2091,7 +2455,6 @@ static bool is_net_admin_prog_type(enum bpf_prog_type prog_type)
case BPF_PROG_TYPE_LWT_SEG6LOCAL:
case BPF_PROG_TYPE_SK_SKB:
case BPF_PROG_TYPE_SK_MSG:
- case BPF_PROG_TYPE_LIRC_MODE2:
case BPF_PROG_TYPE_FLOW_DISSECTOR:
case BPF_PROG_TYPE_CGROUP_DEVICE:
case BPF_PROG_TYPE_CGROUP_SOCK:
@@ -2100,6 +2463,7 @@ static bool is_net_admin_prog_type(enum bpf_prog_type prog_type)
case BPF_PROG_TYPE_CGROUP_SYSCTL:
case BPF_PROG_TYPE_SOCK_OPS:
case BPF_PROG_TYPE_EXT: /* extends any prog */
+ case BPF_PROG_TYPE_NETFILTER:
return true;
case BPF_PROG_TYPE_CGROUP_SKB:
/* always unpriv */
@@ -2129,9 +2493,9 @@ static bool is_perfmon_prog_type(enum bpf_prog_type prog_type)
}
/* last field in 'union bpf_attr' used by this command */
-#define BPF_PROG_LOAD_LAST_FIELD fd_array
+#define BPF_PROG_LOAD_LAST_FIELD log_true_size
-static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr)
+static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
{
enum bpf_prog_type type = attr->prog_type;
struct bpf_prog *prog, *dst_prog = NULL;
@@ -2147,7 +2511,9 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr)
BPF_F_ANY_ALIGNMENT |
BPF_F_TEST_STATE_FREQ |
BPF_F_SLEEPABLE |
- BPF_F_TEST_RND_HI32))
+ BPF_F_TEST_RND_HI32 |
+ BPF_F_XDP_HAS_FRAGS |
+ BPF_F_XDP_DEV_BOUND_ONLY))
return -EINVAL;
if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
@@ -2231,8 +2597,9 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr)
prog->aux->attach_btf = attach_btf;
prog->aux->attach_btf_id = attr->attach_btf_id;
prog->aux->dst_prog = dst_prog;
- prog->aux->offload_requested = !!attr->prog_ifindex;
+ prog->aux->dev_bound = !!attr->prog_ifindex;
prog->aux->sleepable = attr->prog_flags & BPF_F_SLEEPABLE;
+ prog->aux->xdp_has_frags = attr->prog_flags & BPF_F_XDP_HAS_FRAGS;
err = security_bpf_prog_alloc(prog->aux);
if (err)
@@ -2254,7 +2621,14 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr)
prog->gpl_compatible = is_gpl ? 1 : 0;
if (bpf_prog_is_dev_bound(prog->aux)) {
- err = bpf_prog_offload_init(prog, attr);
+ err = bpf_prog_dev_bound_init(prog, attr);
+ if (err)
+ goto free_prog_sec;
+ }
+
+ if (type == BPF_PROG_TYPE_EXT && dst_prog &&
+ bpf_prog_is_dev_bound(dst_prog->aux)) {
+ err = bpf_prog_dev_bound_inherit(prog, dst_prog);
if (err)
goto free_prog_sec;
}
@@ -2271,7 +2645,7 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr)
goto free_prog_sec;
/* run eBPF verifier */
- err = bpf_check(&prog, attr, uattr);
+ err = bpf_check(&prog, attr, uattr, uattr_size);
if (err < 0)
goto free_used_maps;
@@ -2418,6 +2792,7 @@ void bpf_link_put(struct bpf_link *link)
bpf_link_free(link);
}
}
+EXPORT_SYMBOL(bpf_link_put);
static int bpf_link_release(struct inode *inode, struct file *filp)
{
@@ -2445,16 +2820,19 @@ static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp)
const struct bpf_prog *prog = link->prog;
char prog_tag[sizeof(prog->tag) * 2 + 1] = { };
- bin2hex(prog_tag, prog->tag, sizeof(prog->tag));
seq_printf(m,
"link_type:\t%s\n"
- "link_id:\t%u\n"
- "prog_tag:\t%s\n"
- "prog_id:\t%u\n",
+ "link_id:\t%u\n",
bpf_link_type_strs[link->type],
- link->id,
- prog_tag,
- prog->aux->id);
+ link->id);
+ if (prog) {
+ bin2hex(prog_tag, prog->tag, sizeof(prog->tag));
+ seq_printf(m,
+ "prog_tag:\t%s\n"
+ "prog_id:\t%u\n",
+ prog_tag,
+ prog->aux->id);
+ }
if (link->ops->show_fdinfo)
link->ops->show_fdinfo(link, m);
}
@@ -2489,7 +2867,7 @@ static int bpf_link_alloc_id(struct bpf_link *link)
* pre-allocated resources are to be freed with bpf_cleanup() call. All the
* transient state is passed around in struct bpf_link_primer.
* This is preferred way to create and initialize bpf_link, especially when
- * there are complicated and expensive operations inbetween creating bpf_link
+ * there are complicated and expensive operations in between creating bpf_link
* itself and attaching it to BPF hook. By using bpf_link_prime() and
* bpf_link_settle() kernel code using bpf_link doesn't have to perform
* expensive (and potentially failing) roll back operations in a rare case
@@ -2560,20 +2938,14 @@ struct bpf_link *bpf_link_get_from_fd(u32 ufd)
return link;
}
-
-struct bpf_tracing_link {
- struct bpf_link link;
- enum bpf_attach_type attach_type;
- struct bpf_trampoline *trampoline;
- struct bpf_prog *tgt_prog;
-};
+EXPORT_SYMBOL(bpf_link_get_from_fd);
static void bpf_tracing_link_release(struct bpf_link *link)
{
struct bpf_tracing_link *tr_link =
- container_of(link, struct bpf_tracing_link, link);
+ container_of(link, struct bpf_tracing_link, link.link);
- WARN_ON_ONCE(bpf_trampoline_unlink_prog(link->prog,
+ WARN_ON_ONCE(bpf_trampoline_unlink_prog(&tr_link->link,
tr_link->trampoline));
bpf_trampoline_put(tr_link->trampoline);
@@ -2586,7 +2958,7 @@ static void bpf_tracing_link_release(struct bpf_link *link)
static void bpf_tracing_link_dealloc(struct bpf_link *link)
{
struct bpf_tracing_link *tr_link =
- container_of(link, struct bpf_tracing_link, link);
+ container_of(link, struct bpf_tracing_link, link.link);
kfree(tr_link);
}
@@ -2595,7 +2967,7 @@ static void bpf_tracing_link_show_fdinfo(const struct bpf_link *link,
struct seq_file *seq)
{
struct bpf_tracing_link *tr_link =
- container_of(link, struct bpf_tracing_link, link);
+ container_of(link, struct bpf_tracing_link, link.link);
seq_printf(seq,
"attach_type:\t%d\n",
@@ -2606,7 +2978,7 @@ static int bpf_tracing_link_fill_link_info(const struct bpf_link *link,
struct bpf_link_info *info)
{
struct bpf_tracing_link *tr_link =
- container_of(link, struct bpf_tracing_link, link);
+ container_of(link, struct bpf_tracing_link, link.link);
info->tracing.attach_type = tr_link->attach_type;
bpf_trampoline_unpack_key(tr_link->trampoline->key,
@@ -2625,7 +2997,8 @@ static const struct bpf_link_ops bpf_tracing_link_lops = {
static int bpf_tracing_prog_attach(struct bpf_prog *prog,
int tgt_prog_fd,
- u32 btf_id)
+ u32 btf_id,
+ u64 bpf_cookie)
{
struct bpf_link_primer link_primer;
struct bpf_prog *tgt_prog = NULL;
@@ -2687,9 +3060,10 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog,
err = -ENOMEM;
goto out_put_prog;
}
- bpf_link_init(&link->link, BPF_LINK_TYPE_TRACING,
+ bpf_link_init(&link->link.link, BPF_LINK_TYPE_TRACING,
&bpf_tracing_link_lops, prog);
link->attach_type = prog->expected_attach_type;
+ link->link.cookie = bpf_cookie;
mutex_lock(&prog->aux->dst_mutex);
@@ -2740,6 +3114,11 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog,
if (err)
goto out_unlock;
+ if (tgt_info.tgt_mod) {
+ module_put(prog->aux->mod);
+ prog->aux->mod = tgt_info.tgt_mod;
+ }
+
tr = bpf_trampoline_get(key, &tgt_info);
if (!tr) {
err = -ENOMEM;
@@ -2757,11 +3136,11 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog,
tgt_prog = prog->aux->dst_prog;
}
- err = bpf_link_prime(&link->link, &link_primer);
+ err = bpf_link_prime(&link->link.link, &link_primer);
if (err)
goto out_unlock;
- err = bpf_trampoline_link_prog(prog, tr);
+ err = bpf_trampoline_link_prog(&link->link, tr);
if (err) {
bpf_link_cleanup(&link_primer);
link = NULL;
@@ -2873,66 +3252,123 @@ static const struct bpf_link_ops bpf_raw_tp_link_lops = {
.fill_link_info = bpf_raw_tp_link_fill_link_info,
};
-#define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd
+#ifdef CONFIG_PERF_EVENTS
+struct bpf_perf_link {
+ struct bpf_link link;
+ struct file *perf_file;
+};
-static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
+static void bpf_perf_link_release(struct bpf_link *link)
+{
+ struct bpf_perf_link *perf_link = container_of(link, struct bpf_perf_link, link);
+ struct perf_event *event = perf_link->perf_file->private_data;
+
+ perf_event_free_bpf_prog(event);
+ fput(perf_link->perf_file);
+}
+
+static void bpf_perf_link_dealloc(struct bpf_link *link)
+{
+ struct bpf_perf_link *perf_link = container_of(link, struct bpf_perf_link, link);
+
+ kfree(perf_link);
+}
+
+static const struct bpf_link_ops bpf_perf_link_lops = {
+ .release = bpf_perf_link_release,
+ .dealloc = bpf_perf_link_dealloc,
+};
+
+static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+ struct bpf_link_primer link_primer;
+ struct bpf_perf_link *link;
+ struct perf_event *event;
+ struct file *perf_file;
+ int err;
+
+ if (attr->link_create.flags)
+ return -EINVAL;
+
+ perf_file = perf_event_get(attr->link_create.target_fd);
+ if (IS_ERR(perf_file))
+ return PTR_ERR(perf_file);
+
+ link = kzalloc(sizeof(*link), GFP_USER);
+ if (!link) {
+ err = -ENOMEM;
+ goto out_put_file;
+ }
+ bpf_link_init(&link->link, BPF_LINK_TYPE_PERF_EVENT, &bpf_perf_link_lops, prog);
+ link->perf_file = perf_file;
+
+ err = bpf_link_prime(&link->link, &link_primer);
+ if (err) {
+ kfree(link);
+ goto out_put_file;
+ }
+
+ event = perf_file->private_data;
+ err = perf_event_set_bpf_prog(event, prog, attr->link_create.perf_event.bpf_cookie);
+ if (err) {
+ bpf_link_cleanup(&link_primer);
+ goto out_put_file;
+ }
+ /* perf_event_set_bpf_prog() doesn't take its own refcnt on prog */
+ bpf_prog_inc(prog);
+
+ return bpf_link_settle(&link_primer);
+
+out_put_file:
+ fput(perf_file);
+ return err;
+}
+#else
+static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+ return -EOPNOTSUPP;
+}
+#endif /* CONFIG_PERF_EVENTS */
+
+static int bpf_raw_tp_link_attach(struct bpf_prog *prog,
+ const char __user *user_tp_name)
{
struct bpf_link_primer link_primer;
struct bpf_raw_tp_link *link;
struct bpf_raw_event_map *btp;
- struct bpf_prog *prog;
const char *tp_name;
char buf[128];
int err;
- if (CHECK_ATTR(BPF_RAW_TRACEPOINT_OPEN))
- return -EINVAL;
-
- prog = bpf_prog_get(attr->raw_tracepoint.prog_fd);
- if (IS_ERR(prog))
- return PTR_ERR(prog);
-
switch (prog->type) {
case BPF_PROG_TYPE_TRACING:
case BPF_PROG_TYPE_EXT:
case BPF_PROG_TYPE_LSM:
- if (attr->raw_tracepoint.name) {
+ if (user_tp_name)
/* The attach point for this category of programs
* should be specified via btf_id during program load.
*/
- err = -EINVAL;
- goto out_put_prog;
- }
+ return -EINVAL;
if (prog->type == BPF_PROG_TYPE_TRACING &&
prog->expected_attach_type == BPF_TRACE_RAW_TP) {
tp_name = prog->aux->attach_func_name;
break;
}
- err = bpf_tracing_prog_attach(prog, 0, 0);
- if (err >= 0)
- return err;
- goto out_put_prog;
+ return bpf_tracing_prog_attach(prog, 0, 0, 0);
case BPF_PROG_TYPE_RAW_TRACEPOINT:
case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE:
- if (strncpy_from_user(buf,
- u64_to_user_ptr(attr->raw_tracepoint.name),
- sizeof(buf) - 1) < 0) {
- err = -EFAULT;
- goto out_put_prog;
- }
+ if (strncpy_from_user(buf, user_tp_name, sizeof(buf) - 1) < 0)
+ return -EFAULT;
buf[sizeof(buf) - 1] = 0;
tp_name = buf;
break;
default:
- err = -EINVAL;
- goto out_put_prog;
+ return -EINVAL;
}
btp = bpf_get_raw_tracepoint(tp_name);
- if (!btp) {
- err = -ENOENT;
- goto out_put_prog;
- }
+ if (!btp)
+ return -ENOENT;
link = kzalloc(sizeof(*link), GFP_USER);
if (!link) {
@@ -2959,11 +3395,29 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
out_put_btp:
bpf_put_raw_tracepoint(btp);
-out_put_prog:
- bpf_prog_put(prog);
return err;
}
+#define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd
+
+static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
+{
+ struct bpf_prog *prog;
+ int fd;
+
+ if (CHECK_ATTR(BPF_RAW_TRACEPOINT_OPEN))
+ return -EINVAL;
+
+ prog = bpf_prog_get(attr->raw_tracepoint.prog_fd);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ fd = bpf_raw_tp_link_attach(prog, u64_to_user_ptr(attr->raw_tracepoint.name));
+ if (fd < 0)
+ bpf_prog_put(prog);
+ return fd;
+}
+
static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
enum bpf_attach_type attach_type)
{
@@ -3032,11 +3486,19 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type)
case BPF_CGROUP_SETSOCKOPT:
return BPF_PROG_TYPE_CGROUP_SOCKOPT;
case BPF_TRACE_ITER:
+ case BPF_TRACE_RAW_TP:
+ case BPF_TRACE_FENTRY:
+ case BPF_TRACE_FEXIT:
+ case BPF_MODIFY_RETURN:
return BPF_PROG_TYPE_TRACING;
+ case BPF_LSM_MAC:
+ return BPF_PROG_TYPE_LSM;
case BPF_SK_LOOKUP:
return BPF_PROG_TYPE_SK_LOOKUP;
case BPF_XDP:
return BPF_PROG_TYPE_XDP;
+ case BPF_LSM_CGROUP:
+ return BPF_PROG_TYPE_LSM;
default:
return BPF_PROG_TYPE_UNSPEC;
}
@@ -3090,7 +3552,12 @@ static int bpf_prog_attach(const union bpf_attr *attr)
case BPF_PROG_TYPE_CGROUP_SOCKOPT:
case BPF_PROG_TYPE_CGROUP_SYSCTL:
case BPF_PROG_TYPE_SOCK_OPS:
- ret = cgroup_bpf_prog_attach(attr, ptype, prog);
+ case BPF_PROG_TYPE_LSM:
+ if (ptype == BPF_PROG_TYPE_LSM &&
+ prog->expected_attach_type != BPF_LSM_CGROUP)
+ ret = -EINVAL;
+ else
+ ret = cgroup_bpf_prog_attach(attr, ptype, prog);
break;
default:
ret = -EINVAL;
@@ -3127,13 +3594,14 @@ static int bpf_prog_detach(const union bpf_attr *attr)
case BPF_PROG_TYPE_CGROUP_SOCKOPT:
case BPF_PROG_TYPE_CGROUP_SYSCTL:
case BPF_PROG_TYPE_SOCK_OPS:
+ case BPF_PROG_TYPE_LSM:
return cgroup_bpf_prog_detach(attr, ptype);
default:
return -EINVAL;
}
}
-#define BPF_PROG_QUERY_LAST_FIELD query.prog_cnt
+#define BPF_PROG_QUERY_LAST_FIELD query.prog_attach_flags
static int bpf_prog_query(const union bpf_attr *attr,
union bpf_attr __user *uattr)
@@ -3169,18 +3637,24 @@ static int bpf_prog_query(const union bpf_attr *attr,
case BPF_CGROUP_SYSCTL:
case BPF_CGROUP_GETSOCKOPT:
case BPF_CGROUP_SETSOCKOPT:
+ case BPF_LSM_CGROUP:
return cgroup_bpf_prog_query(attr, uattr);
case BPF_LIRC_MODE2:
return lirc_prog_query(attr, uattr);
case BPF_FLOW_DISSECTOR:
case BPF_SK_LOOKUP:
return netns_bpf_prog_query(attr, uattr);
+ case BPF_SK_SKB_STREAM_PARSER:
+ case BPF_SK_SKB_STREAM_VERDICT:
+ case BPF_SK_MSG_VERDICT:
+ case BPF_SK_SKB_VERDICT:
+ return sock_map_bpf_prog_query(attr, uattr);
default:
return -EINVAL;
}
}
-#define BPF_PROG_TEST_RUN_LAST_FIELD test.cpu
+#define BPF_PROG_TEST_RUN_LAST_FIELD test.batch_size
static int bpf_prog_test_run(const union bpf_attr *attr,
union bpf_attr __user *uattr)
@@ -3473,9 +3947,10 @@ static int bpf_prog_get_info_by_fd(struct file *file,
union bpf_attr __user *uattr)
{
struct bpf_prog_info __user *uinfo = u64_to_user_ptr(attr->info.info);
+ struct btf *attach_btf = bpf_prog_get_target_btf(prog);
struct bpf_prog_info info;
u32 info_len = attr->info.info_len;
- struct bpf_prog_stats stats;
+ struct bpf_prog_kstats stats;
char __user *uinsns;
u32 ulen;
int err;
@@ -3525,6 +4000,8 @@ static int bpf_prog_get_info_by_fd(struct file *file,
info.run_cnt = stats.cnt;
info.recursion_misses = stats.misses;
+ info.verified_insns = prog->aux->verified_insns;
+
if (!bpf_capable()) {
info.jited_prog_len = 0;
info.xlated_prog_len = 0;
@@ -3557,7 +4034,7 @@ static int bpf_prog_get_info_by_fd(struct file *file,
return -EFAULT;
}
- if (bpf_prog_is_dev_bound(prog->aux)) {
+ if (bpf_prog_is_offloaded(prog->aux)) {
err = bpf_prog_offload_info_fill(&info, prog);
if (err)
return err;
@@ -3672,6 +4149,9 @@ static int bpf_prog_get_info_by_fd(struct file *file,
if (prog->aux->btf)
info.btf_id = btf_obj_id(prog->aux->btf);
+ info.attach_btf_id = prog->aux->attach_btf_id;
+ if (attach_btf)
+ info.attach_btf_obj_id = btf_obj_id(attach_btf);
ulen = info.nr_func_info;
info.nr_func_info = prog->aux->func_info_cnt;
@@ -3704,14 +4184,15 @@ static int bpf_prog_get_info_by_fd(struct file *file,
info.nr_jited_line_info = 0;
if (info.nr_jited_line_info && ulen) {
if (bpf_dump_raw_ok(file->f_cred)) {
+ unsigned long line_addr;
__u64 __user *user_linfo;
u32 i;
user_linfo = u64_to_user_ptr(info.jited_line_info);
ulen = min_t(u32, info.nr_jited_line_info, ulen);
for (i = 0; i < ulen; i++) {
- if (put_user((__u64)(long)prog->aux->jited_linfo[i],
- &user_linfo[i]))
+ line_addr = (unsigned long)prog->aux->jited_linfo[i];
+ if (put_user((__u64)line_addr, &user_linfo[i]))
return -EFAULT;
}
} else {
@@ -3771,6 +4252,7 @@ static int bpf_map_get_info_by_fd(struct file *file,
info.value_size = map->value_size;
info.max_entries = map->max_entries;
info.map_flags = map->map_flags;
+ info.map_extra = map->map_extra;
memcpy(info.name, map->name, sizeof(map->name));
if (map->btf) {
@@ -3780,7 +4262,7 @@ static int bpf_map_get_info_by_fd(struct file *file,
}
info.btf_vmlinux_value_type_id = map->btf_vmlinux_value_type_id;
- if (bpf_map_is_dev_bound(map)) {
+ if (bpf_map_is_offloaded(map)) {
err = bpf_map_offload_info_fill(&info, map);
if (err)
return err;
@@ -3830,7 +4312,8 @@ static int bpf_link_get_info_by_fd(struct file *file,
info.type = link->type;
info.id = link->id;
- info.prog_id = link->prog->aux->id;
+ if (link->prog)
+ info.prog_id = link->prog->aux->id;
if (link->ops->fill_link_info) {
err = link->ops->fill_link_info(link, &info);
@@ -3880,9 +4363,9 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
return err;
}
-#define BPF_BTF_LOAD_LAST_FIELD btf_log_level
+#define BPF_BTF_LOAD_LAST_FIELD btf_log_true_size
-static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr)
+static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size)
{
if (CHECK_ATTR(BPF_BTF_LOAD))
return -EINVAL;
@@ -3890,7 +4373,7 @@ static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr)
if (!bpf_capable())
return -EPERM;
- return btf_new_fd(attr, uattr);
+ return btf_new_fd(attr, uattr, uattr_size);
}
#define BPF_BTF_GET_FD_BY_ID_LAST_FIELD btf_id
@@ -3974,7 +4457,9 @@ static int bpf_task_fd_query(const union bpf_attr *attr,
if (attr->task_fd_query.flags != 0)
return -EINVAL;
+ rcu_read_lock();
task = get_pid_task(find_vpid(pid), PIDTYPE_PID);
+ rcu_read_unlock();
if (!task)
return -ENOENT;
@@ -4027,19 +4512,22 @@ put_file:
#define BPF_MAP_BATCH_LAST_FIELD batch.flags
-#define BPF_DO_BATCH(fn) \
+#define BPF_DO_BATCH(fn, ...) \
do { \
if (!fn) { \
err = -ENOTSUPP; \
goto err_put; \
} \
- err = fn(map, attr, uattr); \
+ err = fn(__VA_ARGS__); \
} while (0)
static int bpf_map_do_batch(const union bpf_attr *attr,
union bpf_attr __user *uattr,
int cmd)
{
+ bool has_read = cmd == BPF_MAP_LOOKUP_BATCH ||
+ cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH;
+ bool has_write = cmd != BPF_MAP_LOOKUP_BATCH;
struct bpf_map *map;
int err, ufd;
struct fd f;
@@ -4052,50 +4540,33 @@ static int bpf_map_do_batch(const union bpf_attr *attr,
map = __bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
-
- if ((cmd == BPF_MAP_LOOKUP_BATCH ||
- cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH) &&
- !(map_get_sys_perms(map, f) & FMODE_CAN_READ)) {
+ if (has_write)
+ bpf_map_write_active_inc(map);
+ if (has_read && !(map_get_sys_perms(map, f) & FMODE_CAN_READ)) {
err = -EPERM;
goto err_put;
}
-
- if (cmd != BPF_MAP_LOOKUP_BATCH &&
- !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
+ if (has_write && !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
err = -EPERM;
goto err_put;
}
if (cmd == BPF_MAP_LOOKUP_BATCH)
- BPF_DO_BATCH(map->ops->map_lookup_batch);
+ BPF_DO_BATCH(map->ops->map_lookup_batch, map, attr, uattr);
else if (cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH)
- BPF_DO_BATCH(map->ops->map_lookup_and_delete_batch);
+ BPF_DO_BATCH(map->ops->map_lookup_and_delete_batch, map, attr, uattr);
else if (cmd == BPF_MAP_UPDATE_BATCH)
- BPF_DO_BATCH(map->ops->map_update_batch);
+ BPF_DO_BATCH(map->ops->map_update_batch, map, f.file, attr, uattr);
else
- BPF_DO_BATCH(map->ops->map_delete_batch);
-
+ BPF_DO_BATCH(map->ops->map_delete_batch, map, attr, uattr);
err_put:
+ if (has_write)
+ bpf_map_write_active_dec(map);
fdput(f);
return err;
}
-static int tracing_bpf_link_attach(const union bpf_attr *attr, bpfptr_t uattr,
- struct bpf_prog *prog)
-{
- if (attr->link_create.attach_type != prog->expected_attach_type)
- return -EINVAL;
-
- if (prog->expected_attach_type == BPF_TRACE_ITER)
- return bpf_iter_link_attach(attr, uattr, prog);
- else if (prog->type == BPF_PROG_TYPE_EXT)
- return bpf_tracing_prog_attach(prog,
- attr->link_create.target_fd,
- attr->link_create.target_btf_id);
- return -EINVAL;
-}
-
-#define BPF_LINK_CREATE_LAST_FIELD link_create.iter_info_len
+#define BPF_LINK_CREATE_LAST_FIELD link_create.kprobe_multi.cookies
static int link_create(union bpf_attr *attr, bpfptr_t uattr)
{
enum bpf_prog_type ptype;
@@ -4105,6 +4576,9 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
if (CHECK_ATTR(BPF_LINK_CREATE))
return -EINVAL;
+ if (attr->link_create.attach_type == BPF_STRUCT_OPS)
+ return bpf_struct_ops_link_create(attr);
+
prog = bpf_prog_get(attr->link_create.prog_fd);
if (IS_ERR(prog))
return PTR_ERR(prog);
@@ -4114,18 +4588,34 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
if (ret)
goto out;
- if (prog->type == BPF_PROG_TYPE_EXT) {
- ret = tracing_bpf_link_attach(attr, uattr, prog);
- goto out;
- }
-
- ptype = attach_type_to_prog_type(attr->link_create.attach_type);
- if (ptype == BPF_PROG_TYPE_UNSPEC || ptype != prog->type) {
- ret = -EINVAL;
- goto out;
+ switch (prog->type) {
+ case BPF_PROG_TYPE_EXT:
+ case BPF_PROG_TYPE_NETFILTER:
+ break;
+ case BPF_PROG_TYPE_PERF_EVENT:
+ case BPF_PROG_TYPE_TRACEPOINT:
+ if (attr->link_create.attach_type != BPF_PERF_EVENT) {
+ ret = -EINVAL;
+ goto out;
+ }
+ break;
+ case BPF_PROG_TYPE_KPROBE:
+ if (attr->link_create.attach_type != BPF_PERF_EVENT &&
+ attr->link_create.attach_type != BPF_TRACE_KPROBE_MULTI) {
+ ret = -EINVAL;
+ goto out;
+ }
+ break;
+ default:
+ ptype = attach_type_to_prog_type(attr->link_create.attach_type);
+ if (ptype == BPF_PROG_TYPE_UNSPEC || ptype != prog->type) {
+ ret = -EINVAL;
+ goto out;
+ }
+ break;
}
- switch (ptype) {
+ switch (prog->type) {
case BPF_PROG_TYPE_CGROUP_SKB:
case BPF_PROG_TYPE_CGROUP_SOCK:
case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
@@ -4135,8 +4625,29 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
case BPF_PROG_TYPE_CGROUP_SOCKOPT:
ret = cgroup_bpf_link_attach(attr, prog);
break;
+ case BPF_PROG_TYPE_EXT:
+ ret = bpf_tracing_prog_attach(prog,
+ attr->link_create.target_fd,
+ attr->link_create.target_btf_id,
+ attr->link_create.tracing.cookie);
+ break;
+ case BPF_PROG_TYPE_LSM:
case BPF_PROG_TYPE_TRACING:
- ret = tracing_bpf_link_attach(attr, uattr, prog);
+ if (attr->link_create.attach_type != prog->expected_attach_type) {
+ ret = -EINVAL;
+ goto out;
+ }
+ if (prog->expected_attach_type == BPF_TRACE_RAW_TP)
+ ret = bpf_raw_tp_link_attach(prog, NULL);
+ else if (prog->expected_attach_type == BPF_TRACE_ITER)
+ ret = bpf_iter_link_attach(attr, uattr, prog);
+ else if (prog->expected_attach_type == BPF_LSM_CGROUP)
+ ret = cgroup_bpf_link_attach(attr, prog);
+ else
+ ret = bpf_tracing_prog_attach(prog,
+ attr->link_create.target_fd,
+ attr->link_create.target_btf_id,
+ attr->link_create.tracing.cookie);
break;
case BPF_PROG_TYPE_FLOW_DISSECTOR:
case BPF_PROG_TYPE_SK_LOOKUP:
@@ -4146,7 +4657,20 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
case BPF_PROG_TYPE_XDP:
ret = bpf_xdp_link_attach(attr, prog);
break;
+ case BPF_PROG_TYPE_NETFILTER:
+ ret = bpf_nf_link_attach(attr, prog);
+ break;
#endif
+ case BPF_PROG_TYPE_PERF_EVENT:
+ case BPF_PROG_TYPE_TRACEPOINT:
+ ret = bpf_perf_link_attach(attr, prog);
+ break;
+ case BPF_PROG_TYPE_KPROBE:
+ if (attr->link_create.attach_type == BPF_PERF_EVENT)
+ ret = bpf_perf_link_attach(attr, prog);
+ else
+ ret = bpf_kprobe_multi_link_attach(attr, prog);
+ break;
default:
ret = -EINVAL;
}
@@ -4157,6 +4681,35 @@ out:
return ret;
}
+static int link_update_map(struct bpf_link *link, union bpf_attr *attr)
+{
+ struct bpf_map *new_map, *old_map = NULL;
+ int ret;
+
+ new_map = bpf_map_get(attr->link_update.new_map_fd);
+ if (IS_ERR(new_map))
+ return PTR_ERR(new_map);
+
+ if (attr->link_update.flags & BPF_F_REPLACE) {
+ old_map = bpf_map_get(attr->link_update.old_map_fd);
+ if (IS_ERR(old_map)) {
+ ret = PTR_ERR(old_map);
+ goto out_put;
+ }
+ } else if (attr->link_update.old_map_fd) {
+ ret = -EINVAL;
+ goto out_put;
+ }
+
+ ret = link->ops->update_map(link, new_map, old_map);
+
+ if (old_map)
+ bpf_map_put(old_map);
+out_put:
+ bpf_map_put(new_map);
+ return ret;
+}
+
#define BPF_LINK_UPDATE_LAST_FIELD link_update.old_prog_fd
static int link_update(union bpf_attr *attr)
@@ -4177,6 +4730,11 @@ static int link_update(union bpf_attr *attr)
if (IS_ERR(link))
return PTR_ERR(link);
+ if (link->ops->update_map) {
+ ret = link_update_map(link, attr);
+ goto out_put_link;
+ }
+
new_prog = bpf_prog_get(attr->link_update.new_prog_fd);
if (IS_ERR(new_prog)) {
ret = PTR_ERR(new_prog);
@@ -4260,6 +4818,25 @@ struct bpf_link *bpf_link_by_id(u32 id)
return link;
}
+struct bpf_link *bpf_link_get_curr_or_next(u32 *id)
+{
+ struct bpf_link *link;
+
+ spin_lock_bh(&link_idr_lock);
+again:
+ link = idr_get_next(&link_idr, id);
+ if (link) {
+ link = bpf_link_inc_not_zero(link);
+ if (IS_ERR(link)) {
+ (*id)++;
+ goto again;
+ }
+ }
+ spin_unlock_bh(&link_idr_lock);
+
+ return link;
+}
+
#define BPF_LINK_GET_FD_BY_ID_LAST_FIELD link_id
static int bpf_link_get_fd_by_id(const union bpf_attr *attr)
@@ -4427,9 +5004,21 @@ out_prog_put:
static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size)
{
union bpf_attr attr;
+ bool capable;
int err;
- if (sysctl_unprivileged_bpf_disabled && !bpf_capable())
+ capable = bpf_capable() || !sysctl_unprivileged_bpf_disabled;
+
+ /* Intent here is for unprivileged_bpf_disabled to block key object
+ * creation commands for unprivileged users; other actions depend
+ * of fd availability and access to bpffs, so are dependent on
+ * object creation success. Capabilities are later verified for
+ * operations such as load and map create, so even with unprivileged
+ * BPF disabled, capability checks are still carried out for these
+ * and other operations.
+ */
+ if (!capable &&
+ (cmd == BPF_MAP_CREATE || cmd == BPF_PROG_LOAD))
return -EPERM;
err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size);
@@ -4457,7 +5046,7 @@ static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size)
err = map_update_elem(&attr, uattr);
break;
case BPF_MAP_DELETE_ELEM:
- err = map_delete_elem(&attr);
+ err = map_delete_elem(&attr, uattr);
break;
case BPF_MAP_GET_NEXT_KEY:
err = map_get_next_key(&attr);
@@ -4466,7 +5055,7 @@ static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size)
err = map_freeze(&attr);
break;
case BPF_PROG_LOAD:
- err = bpf_prog_load(&attr, uattr);
+ err = bpf_prog_load(&attr, uattr, size);
break;
case BPF_OBJ_PIN:
err = bpf_obj_pin(&attr);
@@ -4511,7 +5100,7 @@ static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size)
err = bpf_raw_tracepoint_open(&attr);
break;
case BPF_BTF_LOAD:
- err = bpf_btf_load(&attr, uattr);
+ err = bpf_btf_load(&attr, uattr, size);
break;
case BPF_BTF_GET_FD_BY_ID:
err = bpf_btf_get_fd_by_id(&attr);
@@ -4585,30 +5174,81 @@ static bool syscall_prog_is_valid_access(int off, int size,
return true;
}
-BPF_CALL_3(bpf_sys_bpf, int, cmd, void *, attr, u32, attr_size)
+BPF_CALL_3(bpf_sys_bpf, int, cmd, union bpf_attr *, attr, u32, attr_size)
{
switch (cmd) {
case BPF_MAP_CREATE:
+ case BPF_MAP_DELETE_ELEM:
case BPF_MAP_UPDATE_ELEM:
case BPF_MAP_FREEZE:
+ case BPF_MAP_GET_FD_BY_ID:
case BPF_PROG_LOAD:
case BPF_BTF_LOAD:
+ case BPF_LINK_CREATE:
+ case BPF_RAW_TRACEPOINT_OPEN:
break;
- /* case BPF_PROG_TEST_RUN:
- * is not part of this list to prevent recursive test_run
- */
default:
return -EINVAL;
}
return __sys_bpf(cmd, KERNEL_BPFPTR(attr), attr_size);
}
+
+/* To shut up -Wmissing-prototypes.
+ * This function is used by the kernel light skeleton
+ * to load bpf programs when modules are loaded or during kernel boot.
+ * See tools/lib/bpf/skel_internal.h
+ */
+int kern_sys_bpf(int cmd, union bpf_attr *attr, unsigned int size);
+
+int kern_sys_bpf(int cmd, union bpf_attr *attr, unsigned int size)
+{
+ struct bpf_prog * __maybe_unused prog;
+ struct bpf_tramp_run_ctx __maybe_unused run_ctx;
+
+ switch (cmd) {
+#ifdef CONFIG_BPF_JIT /* __bpf_prog_enter_sleepable used by trampoline and JIT */
+ case BPF_PROG_TEST_RUN:
+ if (attr->test.data_in || attr->test.data_out ||
+ attr->test.ctx_out || attr->test.duration ||
+ attr->test.repeat || attr->test.flags)
+ return -EINVAL;
+
+ prog = bpf_prog_get_type(attr->test.prog_fd, BPF_PROG_TYPE_SYSCALL);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ if (attr->test.ctx_size_in < prog->aux->max_ctx_offset ||
+ attr->test.ctx_size_in > U16_MAX) {
+ bpf_prog_put(prog);
+ return -EINVAL;
+ }
+
+ run_ctx.bpf_cookie = 0;
+ run_ctx.saved_run_ctx = NULL;
+ if (!__bpf_prog_enter_sleepable_recur(prog, &run_ctx)) {
+ /* recursion detected */
+ bpf_prog_put(prog);
+ return -EBUSY;
+ }
+ attr->test.retval = bpf_prog_run(prog, (void *) (long) attr->test.ctx_in);
+ __bpf_prog_exit_sleepable_recur(prog, 0 /* bpf_prog_run does runtime stats */,
+ &run_ctx);
+ bpf_prog_put(prog);
+ return 0;
+#endif
+ default:
+ return ____bpf_sys_bpf(cmd, attr, size);
+ }
+}
+EXPORT_SYMBOL(kern_sys_bpf);
+
static const struct bpf_func_proto bpf_sys_bpf_proto = {
.func = bpf_sys_bpf,
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_ANYTHING,
- .arg2_type = ARG_PTR_TO_MEM,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg3_type = ARG_CONST_SIZE,
};
@@ -4635,16 +5275,43 @@ static const struct bpf_func_proto bpf_sys_close_proto = {
.arg1_type = ARG_ANYTHING,
};
+BPF_CALL_4(bpf_kallsyms_lookup_name, const char *, name, int, name_sz, int, flags, u64 *, res)
+{
+ if (flags)
+ return -EINVAL;
+
+ if (name_sz <= 1 || name[name_sz - 1])
+ return -EINVAL;
+
+ if (!bpf_dump_raw_ok(current_cred()))
+ return -EPERM;
+
+ *res = kallsyms_lookup_name(name);
+ return *res ? 0 : -ENOENT;
+}
+
+static const struct bpf_func_proto bpf_kallsyms_lookup_name_proto = {
+ .func = bpf_kallsyms_lookup_name,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_MEM,
+ .arg2_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_LONG,
+};
+
static const struct bpf_func_proto *
syscall_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
switch (func_id) {
case BPF_FUNC_sys_bpf:
- return &bpf_sys_bpf_proto;
+ return !perfmon_capable() ? NULL : &bpf_sys_bpf_proto;
case BPF_FUNC_btf_find_by_name_kind:
return &bpf_btf_find_by_name_kind_proto;
case BPF_FUNC_sys_close:
return &bpf_sys_close_proto;
+ case BPF_FUNC_kallsyms_lookup_name:
+ return &bpf_kallsyms_lookup_name_proto;
default:
return tracing_prog_func_proto(func_id, prog);
}
@@ -4658,3 +5325,89 @@ const struct bpf_verifier_ops bpf_syscall_verifier_ops = {
const struct bpf_prog_ops bpf_syscall_prog_ops = {
.test_run = bpf_prog_test_run_syscall,
};
+
+#ifdef CONFIG_SYSCTL
+static int bpf_stats_handler(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct static_key *key = (struct static_key *)table->data;
+ static int saved_val;
+ int val, ret;
+ struct ctl_table tmp = {
+ .data = &val,
+ .maxlen = sizeof(val),
+ .mode = table->mode,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ };
+
+ if (write && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ mutex_lock(&bpf_stats_enabled_mutex);
+ val = saved_val;
+ ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+ if (write && !ret && val != saved_val) {
+ if (val)
+ static_key_slow_inc(key);
+ else
+ static_key_slow_dec(key);
+ saved_val = val;
+ }
+ mutex_unlock(&bpf_stats_enabled_mutex);
+ return ret;
+}
+
+void __weak unpriv_ebpf_notify(int new_state)
+{
+}
+
+static int bpf_unpriv_handler(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ int ret, unpriv_enable = *(int *)table->data;
+ bool locked_state = unpriv_enable == 1;
+ struct ctl_table tmp = *table;
+
+ if (write && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ tmp.data = &unpriv_enable;
+ ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+ if (write && !ret) {
+ if (locked_state && unpriv_enable != 1)
+ return -EPERM;
+ *(int *)table->data = unpriv_enable;
+ }
+
+ unpriv_ebpf_notify(unpriv_enable);
+
+ return ret;
+}
+
+static struct ctl_table bpf_syscall_table[] = {
+ {
+ .procname = "unprivileged_bpf_disabled",
+ .data = &sysctl_unprivileged_bpf_disabled,
+ .maxlen = sizeof(sysctl_unprivileged_bpf_disabled),
+ .mode = 0644,
+ .proc_handler = bpf_unpriv_handler,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_TWO,
+ },
+ {
+ .procname = "bpf_stats_enabled",
+ .data = &bpf_stats_enabled_key.key,
+ .mode = 0644,
+ .proc_handler = bpf_stats_handler,
+ },
+ { }
+};
+
+static int __init bpf_syscall_sysctl_init(void)
+{
+ register_sysctl_init("kernel", bpf_syscall_table);
+ return 0;
+}
+late_initcall(bpf_syscall_sysctl_init);
+#endif /* CONFIG_SYSCTL */
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index b68cb5d6d6eb..c4ab9d6cdbe9 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -8,9 +8,19 @@
#include <linux/fdtable.h>
#include <linux/filter.h>
#include <linux/btf_ids.h>
+#include "mmap_unlock_work.h"
+
+static const char * const iter_task_type_names[] = {
+ "ALL",
+ "TID",
+ "PID",
+};
struct bpf_iter_seq_task_common {
struct pid_namespace *ns;
+ enum bpf_iter_task_type type;
+ u32 pid;
+ u32 pid_visiting;
};
struct bpf_iter_seq_task_info {
@@ -21,18 +31,115 @@ struct bpf_iter_seq_task_info {
u32 tid;
};
-static struct task_struct *task_seq_get_next(struct pid_namespace *ns,
+static struct task_struct *task_group_seq_get_next(struct bpf_iter_seq_task_common *common,
+ u32 *tid,
+ bool skip_if_dup_files)
+{
+ struct task_struct *task, *next_task;
+ struct pid *pid;
+ u32 saved_tid;
+
+ if (!*tid) {
+ /* The first time, the iterator calls this function. */
+ pid = find_pid_ns(common->pid, common->ns);
+ if (!pid)
+ return NULL;
+
+ task = get_pid_task(pid, PIDTYPE_TGID);
+ if (!task)
+ return NULL;
+
+ *tid = common->pid;
+ common->pid_visiting = common->pid;
+
+ return task;
+ }
+
+ /* If the control returns to user space and comes back to the
+ * kernel again, *tid and common->pid_visiting should be the
+ * same for task_seq_start() to pick up the correct task.
+ */
+ if (*tid == common->pid_visiting) {
+ pid = find_pid_ns(common->pid_visiting, common->ns);
+ task = get_pid_task(pid, PIDTYPE_PID);
+
+ return task;
+ }
+
+ pid = find_pid_ns(common->pid_visiting, common->ns);
+ if (!pid)
+ return NULL;
+
+ task = get_pid_task(pid, PIDTYPE_PID);
+ if (!task)
+ return NULL;
+
+retry:
+ if (!pid_alive(task)) {
+ put_task_struct(task);
+ return NULL;
+ }
+
+ next_task = next_thread(task);
+ put_task_struct(task);
+ if (!next_task)
+ return NULL;
+
+ saved_tid = *tid;
+ *tid = __task_pid_nr_ns(next_task, PIDTYPE_PID, common->ns);
+ if (!*tid || *tid == common->pid) {
+ /* Run out of tasks of a process. The tasks of a
+ * thread_group are linked as circular linked list.
+ */
+ *tid = saved_tid;
+ return NULL;
+ }
+
+ get_task_struct(next_task);
+ common->pid_visiting = *tid;
+
+ if (skip_if_dup_files && task->files == task->group_leader->files) {
+ task = next_task;
+ goto retry;
+ }
+
+ return next_task;
+}
+
+static struct task_struct *task_seq_get_next(struct bpf_iter_seq_task_common *common,
u32 *tid,
bool skip_if_dup_files)
{
struct task_struct *task = NULL;
struct pid *pid;
+ if (common->type == BPF_TASK_ITER_TID) {
+ if (*tid && *tid != common->pid)
+ return NULL;
+ rcu_read_lock();
+ pid = find_pid_ns(common->pid, common->ns);
+ if (pid) {
+ task = get_pid_task(pid, PIDTYPE_TGID);
+ *tid = common->pid;
+ }
+ rcu_read_unlock();
+
+ return task;
+ }
+
+ if (common->type == BPF_TASK_ITER_TGID) {
+ rcu_read_lock();
+ task = task_group_seq_get_next(common, tid, skip_if_dup_files);
+ rcu_read_unlock();
+
+ return task;
+ }
+
rcu_read_lock();
retry:
- pid = find_ge_pid(*tid, ns);
+ pid = find_ge_pid(*tid, common->ns);
if (pid) {
- *tid = pid_nr_ns(pid, ns);
+ *tid = pid_nr_ns(pid, common->ns);
task = get_pid_task(pid, PIDTYPE_PID);
if (!task) {
++*tid;
@@ -55,7 +162,7 @@ static void *task_seq_start(struct seq_file *seq, loff_t *pos)
struct bpf_iter_seq_task_info *info = seq->private;
struct task_struct *task;
- task = task_seq_get_next(info->common.ns, &info->tid, false);
+ task = task_seq_get_next(&info->common, &info->tid, false);
if (!task)
return NULL;
@@ -72,7 +179,7 @@ static void *task_seq_next(struct seq_file *seq, void *v, loff_t *pos)
++*pos;
++info->tid;
put_task_struct((struct task_struct *)v);
- task = task_seq_get_next(info->common.ns, &info->tid, false);
+ task = task_seq_get_next(&info->common, &info->tid, false);
if (!task)
return NULL;
@@ -98,7 +205,6 @@ static int __task_seq_show(struct seq_file *seq, struct task_struct *task,
if (!prog)
return 0;
- meta.seq = seq;
ctx.meta = &meta;
ctx.task = task;
return bpf_iter_run_prog(prog, &ctx);
@@ -117,6 +223,41 @@ static void task_seq_stop(struct seq_file *seq, void *v)
put_task_struct((struct task_struct *)v);
}
+static int bpf_iter_attach_task(struct bpf_prog *prog,
+ union bpf_iter_link_info *linfo,
+ struct bpf_iter_aux_info *aux)
+{
+ unsigned int flags;
+ struct pid *pid;
+ pid_t tgid;
+
+ if ((!!linfo->task.tid + !!linfo->task.pid + !!linfo->task.pid_fd) > 1)
+ return -EINVAL;
+
+ aux->task.type = BPF_TASK_ITER_ALL;
+ if (linfo->task.tid != 0) {
+ aux->task.type = BPF_TASK_ITER_TID;
+ aux->task.pid = linfo->task.tid;
+ }
+ if (linfo->task.pid != 0) {
+ aux->task.type = BPF_TASK_ITER_TGID;
+ aux->task.pid = linfo->task.pid;
+ }
+ if (linfo->task.pid_fd != 0) {
+ aux->task.type = BPF_TASK_ITER_TGID;
+
+ pid = pidfd_get_pid(linfo->task.pid_fd, &flags);
+ if (IS_ERR(pid))
+ return PTR_ERR(pid);
+
+ tgid = pid_nr_ns(pid, task_active_pid_ns(current));
+ aux->task.pid = tgid;
+ put_pid(pid);
+ }
+
+ return 0;
+}
+
static const struct seq_operations task_seq_ops = {
.start = task_seq_start,
.next = task_seq_next,
@@ -137,8 +278,7 @@ struct bpf_iter_seq_task_file_info {
static struct file *
task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info)
{
- struct pid_namespace *ns = info->common.ns;
- u32 curr_tid = info->tid;
+ u32 saved_tid = info->tid;
struct task_struct *curr_task;
unsigned int curr_fd = info->fd;
@@ -151,21 +291,18 @@ again:
curr_task = info->task;
curr_fd = info->fd;
} else {
- curr_task = task_seq_get_next(ns, &curr_tid, true);
+ curr_task = task_seq_get_next(&info->common, &info->tid, true);
if (!curr_task) {
info->task = NULL;
- info->tid = curr_tid;
return NULL;
}
- /* set info->task and info->tid */
+ /* set info->task */
info->task = curr_task;
- if (curr_tid == info->tid) {
+ if (saved_tid == info->tid)
curr_fd = info->fd;
- } else {
- info->tid = curr_tid;
+ else
curr_fd = 0;
- }
}
rcu_read_lock();
@@ -186,9 +323,15 @@ again:
/* the current task is done, go to the next task */
rcu_read_unlock();
put_task_struct(curr_task);
+
+ if (info->common.type == BPF_TASK_ITER_TID) {
+ info->task = NULL;
+ return NULL;
+ }
+
info->task = NULL;
info->fd = 0;
- curr_tid = ++(info->tid);
+ saved_tid = ++(info->tid);
goto again;
}
@@ -269,6 +412,9 @@ static int init_seq_pidns(void *priv_data, struct bpf_iter_aux_info *aux)
struct bpf_iter_seq_task_common *common = priv_data;
common->ns = get_pid_ns(task_active_pid_ns(current));
+ common->type = aux->task.type;
+ common->pid = aux->task.pid;
+
return 0;
}
@@ -292,6 +438,7 @@ struct bpf_iter_seq_task_vma_info {
*/
struct bpf_iter_seq_task_common common;
struct task_struct *task;
+ struct mm_struct *mm;
struct vm_area_struct *vma;
u32 tid;
unsigned long prev_vm_start;
@@ -299,28 +446,30 @@ struct bpf_iter_seq_task_vma_info {
};
enum bpf_task_vma_iter_find_op {
- task_vma_iter_first_vma, /* use mm->mmap */
- task_vma_iter_next_vma, /* use curr_vma->vm_next */
+ task_vma_iter_first_vma, /* use find_vma() with addr 0 */
+ task_vma_iter_next_vma, /* use vma_next() with curr_vma */
task_vma_iter_find_vma, /* use find_vma() to find next vma */
};
static struct vm_area_struct *
task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info)
{
- struct pid_namespace *ns = info->common.ns;
enum bpf_task_vma_iter_find_op op;
struct vm_area_struct *curr_vma;
struct task_struct *curr_task;
- u32 curr_tid = info->tid;
+ struct mm_struct *curr_mm;
+ u32 saved_tid = info->tid;
/* If this function returns a non-NULL vma, it holds a reference to
- * the task_struct, and holds read lock on vma->mm->mmap_lock.
+ * the task_struct, holds a refcount on mm->mm_users, and holds
+ * read lock on vma->mm->mmap_lock.
* If this function returns NULL, it does not hold any reference or
* lock.
*/
if (info->task) {
curr_task = info->task;
curr_vma = info->vma;
+ curr_mm = info->mm;
/* In case of lock contention, drop mmap_lock to unblock
* the writer.
*
@@ -359,26 +508,27 @@ task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info)
* 4.2) VMA2 and VMA2' covers different ranges, process
* VMA2'.
*/
- if (mmap_lock_is_contended(curr_task->mm)) {
+ if (mmap_lock_is_contended(curr_mm)) {
info->prev_vm_start = curr_vma->vm_start;
info->prev_vm_end = curr_vma->vm_end;
op = task_vma_iter_find_vma;
- mmap_read_unlock(curr_task->mm);
- if (mmap_read_lock_killable(curr_task->mm))
+ mmap_read_unlock(curr_mm);
+ if (mmap_read_lock_killable(curr_mm)) {
+ mmput(curr_mm);
goto finish;
+ }
} else {
op = task_vma_iter_next_vma;
}
} else {
again:
- curr_task = task_seq_get_next(ns, &curr_tid, true);
+ curr_task = task_seq_get_next(&info->common, &info->tid, true);
if (!curr_task) {
- info->tid = curr_tid + 1;
+ info->tid++;
goto finish;
}
- if (curr_tid != info->tid) {
- info->tid = curr_tid;
+ if (saved_tid != info->tid) {
/* new task, process the first vma */
op = task_vma_iter_first_vma;
} else {
@@ -391,48 +541,57 @@ again:
op = task_vma_iter_find_vma;
}
- if (!curr_task->mm)
+ curr_mm = get_task_mm(curr_task);
+ if (!curr_mm)
goto next_task;
- if (mmap_read_lock_killable(curr_task->mm))
+ if (mmap_read_lock_killable(curr_mm)) {
+ mmput(curr_mm);
goto finish;
+ }
}
switch (op) {
case task_vma_iter_first_vma:
- curr_vma = curr_task->mm->mmap;
+ curr_vma = find_vma(curr_mm, 0);
break;
case task_vma_iter_next_vma:
- curr_vma = curr_vma->vm_next;
+ curr_vma = find_vma(curr_mm, curr_vma->vm_end);
break;
case task_vma_iter_find_vma:
/* We dropped mmap_lock so it is necessary to use find_vma
* to find the next vma. This is similar to the mechanism
* in show_smaps_rollup().
*/
- curr_vma = find_vma(curr_task->mm, info->prev_vm_end - 1);
+ curr_vma = find_vma(curr_mm, info->prev_vm_end - 1);
/* case 1) and 4.2) above just use curr_vma */
/* check for case 2) or case 4.1) above */
if (curr_vma &&
curr_vma->vm_start == info->prev_vm_start &&
curr_vma->vm_end == info->prev_vm_end)
- curr_vma = curr_vma->vm_next;
+ curr_vma = find_vma(curr_mm, curr_vma->vm_end);
break;
}
if (!curr_vma) {
/* case 3) above, or case 2) 4.1) with vma->next == NULL */
- mmap_read_unlock(curr_task->mm);
+ mmap_read_unlock(curr_mm);
+ mmput(curr_mm);
goto next_task;
}
info->task = curr_task;
info->vma = curr_vma;
+ info->mm = curr_mm;
return curr_vma;
next_task:
+ if (info->common.type == BPF_TASK_ITER_TID)
+ goto finish;
+
put_task_struct(curr_task);
info->task = NULL;
- curr_tid++;
+ info->mm = NULL;
+ info->tid++;
goto again;
finish:
@@ -440,6 +599,7 @@ finish:
put_task_struct(curr_task);
info->task = NULL;
info->vma = NULL;
+ info->mm = NULL;
return NULL;
}
@@ -511,7 +671,9 @@ static void task_vma_seq_stop(struct seq_file *seq, void *v)
*/
info->prev_vm_start = ~0UL;
info->prev_vm_end = info->vma->vm_end;
- mmap_read_unlock(info->task->mm);
+ mmap_read_unlock(info->mm);
+ mmput(info->mm);
+ info->mm = NULL;
put_task_struct(info->task);
info->task = NULL;
}
@@ -524,11 +686,6 @@ static const struct seq_operations task_vma_seq_ops = {
.show = task_vma_seq_show,
};
-BTF_ID_LIST(btf_task_file_ids)
-BTF_ID(struct, task_struct)
-BTF_ID(struct, file)
-BTF_ID(struct, vm_area_struct)
-
static const struct bpf_iter_seq_info task_seq_info = {
.seq_ops = &task_seq_ops,
.init_seq_private = init_seq_pidns,
@@ -536,8 +693,33 @@ static const struct bpf_iter_seq_info task_seq_info = {
.seq_priv_size = sizeof(struct bpf_iter_seq_task_info),
};
+static int bpf_iter_fill_link_info(const struct bpf_iter_aux_info *aux, struct bpf_link_info *info)
+{
+ switch (aux->task.type) {
+ case BPF_TASK_ITER_TID:
+ info->iter.task.tid = aux->task.pid;
+ break;
+ case BPF_TASK_ITER_TGID:
+ info->iter.task.pid = aux->task.pid;
+ break;
+ default:
+ break;
+ }
+ return 0;
+}
+
+static void bpf_iter_task_show_fdinfo(const struct bpf_iter_aux_info *aux, struct seq_file *seq)
+{
+ seq_printf(seq, "task_type:\t%s\n", iter_task_type_names[aux->task.type]);
+ if (aux->task.type == BPF_TASK_ITER_TID)
+ seq_printf(seq, "tid:\t%u\n", aux->task.pid);
+ else if (aux->task.type == BPF_TASK_ITER_TGID)
+ seq_printf(seq, "pid:\t%u\n", aux->task.pid);
+}
+
static struct bpf_iter_reg task_reg_info = {
.target = "task",
+ .attach_target = bpf_iter_attach_task,
.feature = BPF_ITER_RESCHED,
.ctx_arg_info_size = 1,
.ctx_arg_info = {
@@ -545,6 +727,8 @@ static struct bpf_iter_reg task_reg_info = {
PTR_TO_BTF_ID_OR_NULL },
},
.seq_info = &task_seq_info,
+ .fill_link_info = bpf_iter_fill_link_info,
+ .show_fdinfo = bpf_iter_task_show_fdinfo,
};
static const struct bpf_iter_seq_info task_file_seq_info = {
@@ -556,6 +740,7 @@ static const struct bpf_iter_seq_info task_file_seq_info = {
static struct bpf_iter_reg task_file_reg_info = {
.target = "task_file",
+ .attach_target = bpf_iter_attach_task,
.feature = BPF_ITER_RESCHED,
.ctx_arg_info_size = 2,
.ctx_arg_info = {
@@ -565,6 +750,8 @@ static struct bpf_iter_reg task_file_reg_info = {
PTR_TO_BTF_ID_OR_NULL },
},
.seq_info = &task_file_seq_info,
+ .fill_link_info = bpf_iter_fill_link_info,
+ .show_fdinfo = bpf_iter_task_show_fdinfo,
};
static const struct bpf_iter_seq_info task_vma_seq_info = {
@@ -576,6 +763,7 @@ static const struct bpf_iter_seq_info task_vma_seq_info = {
static struct bpf_iter_reg task_vma_reg_info = {
.target = "task_vma",
+ .attach_target = bpf_iter_attach_task,
.feature = BPF_ITER_RESCHED,
.ctx_arg_info_size = 2,
.ctx_arg_info = {
@@ -585,25 +773,92 @@ static struct bpf_iter_reg task_vma_reg_info = {
PTR_TO_BTF_ID_OR_NULL },
},
.seq_info = &task_vma_seq_info,
+ .fill_link_info = bpf_iter_fill_link_info,
+ .show_fdinfo = bpf_iter_task_show_fdinfo,
};
+BPF_CALL_5(bpf_find_vma, struct task_struct *, task, u64, start,
+ bpf_callback_t, callback_fn, void *, callback_ctx, u64, flags)
+{
+ struct mmap_unlock_irq_work *work = NULL;
+ struct vm_area_struct *vma;
+ bool irq_work_busy = false;
+ struct mm_struct *mm;
+ int ret = -ENOENT;
+
+ if (flags)
+ return -EINVAL;
+
+ if (!task)
+ return -ENOENT;
+
+ mm = task->mm;
+ if (!mm)
+ return -ENOENT;
+
+ irq_work_busy = bpf_mmap_unlock_get_irq_work(&work);
+
+ if (irq_work_busy || !mmap_read_trylock(mm))
+ return -EBUSY;
+
+ vma = find_vma(mm, start);
+
+ if (vma && vma->vm_start <= start && vma->vm_end > start) {
+ callback_fn((u64)(long)task, (u64)(long)vma,
+ (u64)(long)callback_ctx, 0, 0);
+ ret = 0;
+ }
+ bpf_mmap_unlock_mm(work, mm);
+ return ret;
+}
+
+const struct bpf_func_proto bpf_find_vma_proto = {
+ .func = bpf_find_vma,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_BTF_ID,
+ .arg1_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_PTR_TO_FUNC,
+ .arg4_type = ARG_PTR_TO_STACK_OR_NULL,
+ .arg5_type = ARG_ANYTHING,
+};
+
+DEFINE_PER_CPU(struct mmap_unlock_irq_work, mmap_unlock_work);
+
+static void do_mmap_read_unlock(struct irq_work *entry)
+{
+ struct mmap_unlock_irq_work *work;
+
+ if (WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_RT)))
+ return;
+
+ work = container_of(entry, struct mmap_unlock_irq_work, irq_work);
+ mmap_read_unlock_non_owner(work->mm);
+}
+
static int __init task_iter_init(void)
{
- int ret;
+ struct mmap_unlock_irq_work *work;
+ int ret, cpu;
+
+ for_each_possible_cpu(cpu) {
+ work = per_cpu_ptr(&mmap_unlock_work, cpu);
+ init_irq_work(&work->irq_work, do_mmap_read_unlock);
+ }
- task_reg_info.ctx_arg_info[0].btf_id = btf_task_file_ids[0];
+ task_reg_info.ctx_arg_info[0].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_TASK];
ret = bpf_iter_reg_target(&task_reg_info);
if (ret)
return ret;
- task_file_reg_info.ctx_arg_info[0].btf_id = btf_task_file_ids[0];
- task_file_reg_info.ctx_arg_info[1].btf_id = btf_task_file_ids[1];
+ task_file_reg_info.ctx_arg_info[0].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_TASK];
+ task_file_reg_info.ctx_arg_info[1].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_FILE];
ret = bpf_iter_reg_target(&task_file_reg_info);
if (ret)
return ret;
- task_vma_reg_info.ctx_arg_info[0].btf_id = btf_task_file_ids[0];
- task_vma_reg_info.ctx_arg_info[1].btf_id = btf_task_file_ids[2];
+ task_vma_reg_info.ctx_arg_info[0].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_TASK];
+ task_vma_reg_info.ctx_arg_info[1].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_VMA];
return bpf_iter_reg_target(&task_vma_reg_info);
}
late_initcall(task_iter_init);
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index 28a3630c48ee..ac021bc43a66 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -9,7 +9,10 @@
#include <linux/btf.h>
#include <linux/rcupdate_trace.h>
#include <linux/rcupdate_wait.h>
-#include <linux/module.h>
+#include <linux/static_call.h>
+#include <linux/bpf_verifier.h>
+#include <linux/bpf_lsm.h>
+#include <linux/delay.h>
/* dummy _ops. The verifier will operate on target program's ops. */
const struct bpf_verifier_ops bpf_extension_verifier_ops = {
@@ -26,20 +29,90 @@ static struct hlist_head trampoline_table[TRAMPOLINE_TABLE_SIZE];
/* serializes access to trampoline_table */
static DEFINE_MUTEX(trampoline_mutex);
-void *bpf_jit_alloc_exec_page(void)
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mutex);
+
+static int bpf_tramp_ftrace_ops_func(struct ftrace_ops *ops, enum ftrace_ops_cmd cmd)
{
- void *image;
+ struct bpf_trampoline *tr = ops->private;
+ int ret = 0;
- image = bpf_jit_alloc_exec(PAGE_SIZE);
- if (!image)
- return NULL;
+ if (cmd == FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_SELF) {
+ /* This is called inside register_ftrace_direct_multi(), so
+ * tr->mutex is already locked.
+ */
+ lockdep_assert_held_once(&tr->mutex);
- set_vm_flush_reset_perms(image);
- /* Keep image as writeable. The alternative is to keep flipping ro/rw
- * everytime new program is attached or detached.
+ /* Instead of updating the trampoline here, we propagate
+ * -EAGAIN to register_ftrace_direct(). Then we can
+ * retry register_ftrace_direct() after updating the
+ * trampoline.
+ */
+ if ((tr->flags & BPF_TRAMP_F_CALL_ORIG) &&
+ !(tr->flags & BPF_TRAMP_F_ORIG_STACK)) {
+ if (WARN_ON_ONCE(tr->flags & BPF_TRAMP_F_SHARE_IPMODIFY))
+ return -EBUSY;
+
+ tr->flags |= BPF_TRAMP_F_SHARE_IPMODIFY;
+ return -EAGAIN;
+ }
+
+ return 0;
+ }
+
+ /* The normal locking order is
+ * tr->mutex => direct_mutex (ftrace.c) => ftrace_lock (ftrace.c)
+ *
+ * The following two commands are called from
+ *
+ * prepare_direct_functions_for_ipmodify
+ * cleanup_direct_functions_after_ipmodify
+ *
+ * In both cases, direct_mutex is already locked. Use
+ * mutex_trylock(&tr->mutex) to avoid deadlock in race condition
+ * (something else is making changes to this same trampoline).
*/
- set_memory_x((long)image, 1);
- return image;
+ if (!mutex_trylock(&tr->mutex)) {
+ /* sleep 1 ms to make sure whatever holding tr->mutex makes
+ * some progress.
+ */
+ msleep(1);
+ return -EAGAIN;
+ }
+
+ switch (cmd) {
+ case FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_PEER:
+ tr->flags |= BPF_TRAMP_F_SHARE_IPMODIFY;
+
+ if ((tr->flags & BPF_TRAMP_F_CALL_ORIG) &&
+ !(tr->flags & BPF_TRAMP_F_ORIG_STACK))
+ ret = bpf_trampoline_update(tr, false /* lock_direct_mutex */);
+ break;
+ case FTRACE_OPS_CMD_DISABLE_SHARE_IPMODIFY_PEER:
+ tr->flags &= ~BPF_TRAMP_F_SHARE_IPMODIFY;
+
+ if (tr->flags & BPF_TRAMP_F_ORIG_STACK)
+ ret = bpf_trampoline_update(tr, false /* lock_direct_mutex */);
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+
+ mutex_unlock(&tr->mutex);
+ return ret;
+}
+#endif
+
+bool bpf_prog_has_trampoline(const struct bpf_prog *prog)
+{
+ enum bpf_attach_type eatype = prog->expected_attach_type;
+ enum bpf_prog_type ptype = prog->type;
+
+ return (ptype == BPF_PROG_TYPE_TRACING &&
+ (eatype == BPF_TRACE_FENTRY || eatype == BPF_TRACE_FEXIT ||
+ eatype == BPF_MODIFY_RETURN)) ||
+ (ptype == BPF_PROG_TYPE_LSM && eatype == BPF_LSM_MAC);
}
void bpf_image_ksym_add(void *data, struct bpf_ksym *ksym)
@@ -75,6 +148,16 @@ static struct bpf_trampoline *bpf_trampoline_lookup(u64 key)
tr = kzalloc(sizeof(*tr), GFP_KERNEL);
if (!tr)
goto out;
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+ tr->fops = kzalloc(sizeof(struct ftrace_ops), GFP_KERNEL);
+ if (!tr->fops) {
+ kfree(tr);
+ tr = NULL;
+ goto out;
+ }
+ tr->fops->private = tr;
+ tr->fops->ops_func = bpf_tramp_ftrace_ops_func;
+#endif
tr->key = key;
INIT_HLIST_NODE(&tr->hlist);
@@ -88,62 +171,33 @@ out:
return tr;
}
-static int bpf_trampoline_module_get(struct bpf_trampoline *tr)
-{
- struct module *mod;
- int err = 0;
-
- preempt_disable();
- mod = __module_text_address((unsigned long) tr->func.addr);
- if (mod && !try_module_get(mod))
- err = -ENOENT;
- preempt_enable();
- tr->mod = mod;
- return err;
-}
-
-static void bpf_trampoline_module_put(struct bpf_trampoline *tr)
-{
- module_put(tr->mod);
- tr->mod = NULL;
-}
-
-static int is_ftrace_location(void *ip)
-{
- long addr;
-
- addr = ftrace_location((long)ip);
- if (!addr)
- return 0;
- if (WARN_ON_ONCE(addr != (long)ip))
- return -EFAULT;
- return 1;
-}
-
static int unregister_fentry(struct bpf_trampoline *tr, void *old_addr)
{
void *ip = tr->func.addr;
int ret;
if (tr->func.ftrace_managed)
- ret = unregister_ftrace_direct((long)ip, (long)old_addr);
+ ret = unregister_ftrace_direct(tr->fops, (long)old_addr, false);
else
ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, NULL);
- if (!ret)
- bpf_trampoline_module_put(tr);
return ret;
}
-static int modify_fentry(struct bpf_trampoline *tr, void *old_addr, void *new_addr)
+static int modify_fentry(struct bpf_trampoline *tr, void *old_addr, void *new_addr,
+ bool lock_direct_mutex)
{
void *ip = tr->func.addr;
int ret;
- if (tr->func.ftrace_managed)
- ret = modify_ftrace_direct((long)ip, (long)old_addr, (long)new_addr);
- else
+ if (tr->func.ftrace_managed) {
+ if (lock_direct_mutex)
+ ret = modify_ftrace_direct(tr->fops, (long)new_addr);
+ else
+ ret = modify_ftrace_direct_nolock(tr->fops, (long)new_addr);
+ } else {
ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, new_addr);
+ }
return ret;
}
@@ -151,48 +205,50 @@ static int modify_fentry(struct bpf_trampoline *tr, void *old_addr, void *new_ad
static int register_fentry(struct bpf_trampoline *tr, void *new_addr)
{
void *ip = tr->func.addr;
+ unsigned long faddr;
int ret;
- ret = is_ftrace_location(ip);
- if (ret < 0)
- return ret;
- tr->func.ftrace_managed = ret;
-
- if (bpf_trampoline_module_get(tr))
- return -ENOENT;
+ faddr = ftrace_location((unsigned long)ip);
+ if (faddr) {
+ if (!tr->fops)
+ return -ENOTSUPP;
+ tr->func.ftrace_managed = true;
+ }
- if (tr->func.ftrace_managed)
- ret = register_ftrace_direct((long)ip, (long)new_addr);
- else
+ if (tr->func.ftrace_managed) {
+ ftrace_set_filter_ip(tr->fops, (unsigned long)ip, 0, 1);
+ ret = register_ftrace_direct(tr->fops, (long)new_addr);
+ } else {
ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, NULL, new_addr);
+ }
- if (ret)
- bpf_trampoline_module_put(tr);
return ret;
}
-static struct bpf_tramp_progs *
-bpf_trampoline_get_progs(const struct bpf_trampoline *tr, int *total)
+static struct bpf_tramp_links *
+bpf_trampoline_get_progs(const struct bpf_trampoline *tr, int *total, bool *ip_arg)
{
- const struct bpf_prog_aux *aux;
- struct bpf_tramp_progs *tprogs;
- struct bpf_prog **progs;
+ struct bpf_tramp_link *link;
+ struct bpf_tramp_links *tlinks;
+ struct bpf_tramp_link **links;
int kind;
*total = 0;
- tprogs = kcalloc(BPF_TRAMP_MAX, sizeof(*tprogs), GFP_KERNEL);
- if (!tprogs)
+ tlinks = kcalloc(BPF_TRAMP_MAX, sizeof(*tlinks), GFP_KERNEL);
+ if (!tlinks)
return ERR_PTR(-ENOMEM);
for (kind = 0; kind < BPF_TRAMP_MAX; kind++) {
- tprogs[kind].nr_progs = tr->progs_cnt[kind];
+ tlinks[kind].nr_links = tr->progs_cnt[kind];
*total += tr->progs_cnt[kind];
- progs = tprogs[kind].progs;
+ links = tlinks[kind].links;
- hlist_for_each_entry(aux, &tr->progs_hlist[kind], tramp_hlist)
- *progs++ = aux->prog;
+ hlist_for_each_entry(link, &tr->progs_hlist[kind], tramp_hlist) {
+ *ip_arg |= link->link.prog->call_get_func_ip;
+ *links++ = link;
+ }
}
- return tprogs;
+ return tlinks;
}
static void __bpf_tramp_image_put_deferred(struct work_struct *work)
@@ -202,7 +258,7 @@ static void __bpf_tramp_image_put_deferred(struct work_struct *work)
im = container_of(work, struct bpf_tramp_image, work);
bpf_image_ksym_del(&im->ksym);
bpf_jit_free_exec(im->image);
- bpf_jit_uncharge_modmem(1);
+ bpf_jit_uncharge_modmem(PAGE_SIZE);
percpu_ref_exit(&im->pcref);
kfree_rcu(im, rcu);
}
@@ -299,14 +355,15 @@ static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, u32 idx)
if (!im)
goto out;
- err = bpf_jit_charge_modmem(1);
+ err = bpf_jit_charge_modmem(PAGE_SIZE);
if (err)
goto out_free_im;
err = -ENOMEM;
- im->image = image = bpf_jit_alloc_exec_page();
+ im->image = image = bpf_jit_alloc_exec(PAGE_SIZE);
if (!image)
goto out_uncharge;
+ set_vm_flush_reset_perms(image);
err = percpu_ref_init(&im->pcref, __bpf_tramp_image_release, 0, GFP_KERNEL);
if (err)
@@ -321,23 +378,24 @@ static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, u32 idx)
out_free_image:
bpf_jit_free_exec(im->image);
out_uncharge:
- bpf_jit_uncharge_modmem(1);
+ bpf_jit_uncharge_modmem(PAGE_SIZE);
out_free_im:
kfree(im);
out:
return ERR_PTR(err);
}
-static int bpf_trampoline_update(struct bpf_trampoline *tr)
+static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mutex)
{
struct bpf_tramp_image *im;
- struct bpf_tramp_progs *tprogs;
- u32 flags = BPF_TRAMP_F_RESTORE_REGS;
+ struct bpf_tramp_links *tlinks;
+ u32 orig_flags = tr->flags;
+ bool ip_arg = false;
int err, total;
- tprogs = bpf_trampoline_get_progs(tr, &total);
- if (IS_ERR(tprogs))
- return PTR_ERR(tprogs);
+ tlinks = bpf_trampoline_get_progs(tr, &total, &ip_arg);
+ if (IS_ERR(tlinks))
+ return PTR_ERR(tlinks);
if (total == 0) {
err = unregister_fentry(tr, tr->cur_image->image);
@@ -353,32 +411,74 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr)
goto out;
}
- if (tprogs[BPF_TRAMP_FEXIT].nr_progs ||
- tprogs[BPF_TRAMP_MODIFY_RETURN].nr_progs)
- flags = BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME;
+ /* clear all bits except SHARE_IPMODIFY */
+ tr->flags &= BPF_TRAMP_F_SHARE_IPMODIFY;
+
+ if (tlinks[BPF_TRAMP_FEXIT].nr_links ||
+ tlinks[BPF_TRAMP_MODIFY_RETURN].nr_links) {
+ /* NOTE: BPF_TRAMP_F_RESTORE_REGS and BPF_TRAMP_F_SKIP_FRAME
+ * should not be set together.
+ */
+ tr->flags |= BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME;
+ } else {
+ tr->flags |= BPF_TRAMP_F_RESTORE_REGS;
+ }
+
+ if (ip_arg)
+ tr->flags |= BPF_TRAMP_F_IP_ARG;
+
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+again:
+ if ((tr->flags & BPF_TRAMP_F_SHARE_IPMODIFY) &&
+ (tr->flags & BPF_TRAMP_F_CALL_ORIG))
+ tr->flags |= BPF_TRAMP_F_ORIG_STACK;
+#endif
err = arch_prepare_bpf_trampoline(im, im->image, im->image + PAGE_SIZE,
- &tr->func.model, flags, tprogs,
+ &tr->func.model, tr->flags, tlinks,
tr->func.addr);
if (err < 0)
goto out;
+ set_memory_rox((long)im->image, 1);
+
WARN_ON(tr->cur_image && tr->selector == 0);
WARN_ON(!tr->cur_image && tr->selector);
if (tr->cur_image)
/* progs already running at this address */
- err = modify_fentry(tr, tr->cur_image->image, im->image);
+ err = modify_fentry(tr, tr->cur_image->image, im->image, lock_direct_mutex);
else
/* first time registering */
err = register_fentry(tr, im->image);
+
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+ if (err == -EAGAIN) {
+ /* -EAGAIN from bpf_tramp_ftrace_ops_func. Now
+ * BPF_TRAMP_F_SHARE_IPMODIFY is set, we can generate the
+ * trampoline again, and retry register.
+ */
+ /* reset fops->func and fops->trampoline for re-register */
+ tr->fops->func = NULL;
+ tr->fops->trampoline = 0;
+
+ /* reset im->image memory attr for arch_prepare_bpf_trampoline */
+ set_memory_nx((long)im->image, 1);
+ set_memory_rw((long)im->image, 1);
+ goto again;
+ }
+#endif
if (err)
goto out;
+
if (tr->cur_image)
bpf_tramp_image_put(tr->cur_image);
tr->cur_image = im;
tr->selector++;
out:
- kfree(tprogs);
+ /* If any error happens, restore previous flags */
+ if (err)
+ tr->flags = orig_flags;
+ kfree(tlinks);
return err;
}
@@ -404,77 +504,264 @@ static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(struct bpf_prog *prog)
}
}
-int bpf_trampoline_link_prog(struct bpf_prog *prog, struct bpf_trampoline *tr)
+static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr)
{
enum bpf_tramp_prog_type kind;
+ struct bpf_tramp_link *link_exiting;
int err = 0;
- int cnt;
+ int cnt = 0, i;
- kind = bpf_attach_type_to_tramp(prog);
- mutex_lock(&tr->mutex);
- if (tr->extension_prog) {
+ kind = bpf_attach_type_to_tramp(link->link.prog);
+ if (tr->extension_prog)
/* cannot attach fentry/fexit if extension prog is attached.
* cannot overwrite extension prog either.
*/
- err = -EBUSY;
- goto out;
- }
- cnt = tr->progs_cnt[BPF_TRAMP_FENTRY] + tr->progs_cnt[BPF_TRAMP_FEXIT];
+ return -EBUSY;
+
+ for (i = 0; i < BPF_TRAMP_MAX; i++)
+ cnt += tr->progs_cnt[i];
+
if (kind == BPF_TRAMP_REPLACE) {
/* Cannot attach extension if fentry/fexit are in use. */
- if (cnt) {
- err = -EBUSY;
- goto out;
- }
- tr->extension_prog = prog;
- err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP, NULL,
- prog->bpf_func);
- goto out;
- }
- if (cnt >= BPF_MAX_TRAMP_PROGS) {
- err = -E2BIG;
- goto out;
+ if (cnt)
+ return -EBUSY;
+ tr->extension_prog = link->link.prog;
+ return bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP, NULL,
+ link->link.prog->bpf_func);
}
- if (!hlist_unhashed(&prog->aux->tramp_hlist)) {
+ if (cnt >= BPF_MAX_TRAMP_LINKS)
+ return -E2BIG;
+ if (!hlist_unhashed(&link->tramp_hlist))
/* prog already linked */
- err = -EBUSY;
- goto out;
+ return -EBUSY;
+ hlist_for_each_entry(link_exiting, &tr->progs_hlist[kind], tramp_hlist) {
+ if (link_exiting->link.prog != link->link.prog)
+ continue;
+ /* prog already linked */
+ return -EBUSY;
}
- hlist_add_head(&prog->aux->tramp_hlist, &tr->progs_hlist[kind]);
+
+ hlist_add_head(&link->tramp_hlist, &tr->progs_hlist[kind]);
tr->progs_cnt[kind]++;
- err = bpf_trampoline_update(tr);
+ err = bpf_trampoline_update(tr, true /* lock_direct_mutex */);
if (err) {
- hlist_del_init(&prog->aux->tramp_hlist);
+ hlist_del_init(&link->tramp_hlist);
tr->progs_cnt[kind]--;
}
-out:
+ return err;
+}
+
+int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr)
+{
+ int err;
+
+ mutex_lock(&tr->mutex);
+ err = __bpf_trampoline_link_prog(link, tr);
mutex_unlock(&tr->mutex);
return err;
}
-/* bpf_trampoline_unlink_prog() should never fail. */
-int bpf_trampoline_unlink_prog(struct bpf_prog *prog, struct bpf_trampoline *tr)
+static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr)
{
enum bpf_tramp_prog_type kind;
int err;
- kind = bpf_attach_type_to_tramp(prog);
- mutex_lock(&tr->mutex);
+ kind = bpf_attach_type_to_tramp(link->link.prog);
if (kind == BPF_TRAMP_REPLACE) {
WARN_ON_ONCE(!tr->extension_prog);
err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP,
tr->extension_prog->bpf_func, NULL);
tr->extension_prog = NULL;
- goto out;
+ return err;
}
- hlist_del_init(&prog->aux->tramp_hlist);
+ hlist_del_init(&link->tramp_hlist);
tr->progs_cnt[kind]--;
- err = bpf_trampoline_update(tr);
-out:
+ return bpf_trampoline_update(tr, true /* lock_direct_mutex */);
+}
+
+/* bpf_trampoline_unlink_prog() should never fail. */
+int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr)
+{
+ int err;
+
+ mutex_lock(&tr->mutex);
+ err = __bpf_trampoline_unlink_prog(link, tr);
mutex_unlock(&tr->mutex);
return err;
}
+#if defined(CONFIG_CGROUP_BPF) && defined(CONFIG_BPF_LSM)
+static void bpf_shim_tramp_link_release(struct bpf_link *link)
+{
+ struct bpf_shim_tramp_link *shim_link =
+ container_of(link, struct bpf_shim_tramp_link, link.link);
+
+ /* paired with 'shim_link->trampoline = tr' in bpf_trampoline_link_cgroup_shim */
+ if (!shim_link->trampoline)
+ return;
+
+ WARN_ON_ONCE(bpf_trampoline_unlink_prog(&shim_link->link, shim_link->trampoline));
+ bpf_trampoline_put(shim_link->trampoline);
+}
+
+static void bpf_shim_tramp_link_dealloc(struct bpf_link *link)
+{
+ struct bpf_shim_tramp_link *shim_link =
+ container_of(link, struct bpf_shim_tramp_link, link.link);
+
+ kfree(shim_link);
+}
+
+static const struct bpf_link_ops bpf_shim_tramp_link_lops = {
+ .release = bpf_shim_tramp_link_release,
+ .dealloc = bpf_shim_tramp_link_dealloc,
+};
+
+static struct bpf_shim_tramp_link *cgroup_shim_alloc(const struct bpf_prog *prog,
+ bpf_func_t bpf_func,
+ int cgroup_atype)
+{
+ struct bpf_shim_tramp_link *shim_link = NULL;
+ struct bpf_prog *p;
+
+ shim_link = kzalloc(sizeof(*shim_link), GFP_USER);
+ if (!shim_link)
+ return NULL;
+
+ p = bpf_prog_alloc(1, 0);
+ if (!p) {
+ kfree(shim_link);
+ return NULL;
+ }
+
+ p->jited = false;
+ p->bpf_func = bpf_func;
+
+ p->aux->cgroup_atype = cgroup_atype;
+ p->aux->attach_func_proto = prog->aux->attach_func_proto;
+ p->aux->attach_btf_id = prog->aux->attach_btf_id;
+ p->aux->attach_btf = prog->aux->attach_btf;
+ btf_get(p->aux->attach_btf);
+ p->type = BPF_PROG_TYPE_LSM;
+ p->expected_attach_type = BPF_LSM_MAC;
+ bpf_prog_inc(p);
+ bpf_link_init(&shim_link->link.link, BPF_LINK_TYPE_UNSPEC,
+ &bpf_shim_tramp_link_lops, p);
+ bpf_cgroup_atype_get(p->aux->attach_btf_id, cgroup_atype);
+
+ return shim_link;
+}
+
+static struct bpf_shim_tramp_link *cgroup_shim_find(struct bpf_trampoline *tr,
+ bpf_func_t bpf_func)
+{
+ struct bpf_tramp_link *link;
+ int kind;
+
+ for (kind = 0; kind < BPF_TRAMP_MAX; kind++) {
+ hlist_for_each_entry(link, &tr->progs_hlist[kind], tramp_hlist) {
+ struct bpf_prog *p = link->link.prog;
+
+ if (p->bpf_func == bpf_func)
+ return container_of(link, struct bpf_shim_tramp_link, link);
+ }
+ }
+
+ return NULL;
+}
+
+int bpf_trampoline_link_cgroup_shim(struct bpf_prog *prog,
+ int cgroup_atype)
+{
+ struct bpf_shim_tramp_link *shim_link = NULL;
+ struct bpf_attach_target_info tgt_info = {};
+ struct bpf_trampoline *tr;
+ bpf_func_t bpf_func;
+ u64 key;
+ int err;
+
+ err = bpf_check_attach_target(NULL, prog, NULL,
+ prog->aux->attach_btf_id,
+ &tgt_info);
+ if (err)
+ return err;
+
+ key = bpf_trampoline_compute_key(NULL, prog->aux->attach_btf,
+ prog->aux->attach_btf_id);
+
+ bpf_lsm_find_cgroup_shim(prog, &bpf_func);
+ tr = bpf_trampoline_get(key, &tgt_info);
+ if (!tr)
+ return -ENOMEM;
+
+ mutex_lock(&tr->mutex);
+
+ shim_link = cgroup_shim_find(tr, bpf_func);
+ if (shim_link) {
+ /* Reusing existing shim attached by the other program. */
+ bpf_link_inc(&shim_link->link.link);
+
+ mutex_unlock(&tr->mutex);
+ bpf_trampoline_put(tr); /* bpf_trampoline_get above */
+ return 0;
+ }
+
+ /* Allocate and install new shim. */
+
+ shim_link = cgroup_shim_alloc(prog, bpf_func, cgroup_atype);
+ if (!shim_link) {
+ err = -ENOMEM;
+ goto err;
+ }
+
+ err = __bpf_trampoline_link_prog(&shim_link->link, tr);
+ if (err)
+ goto err;
+
+ shim_link->trampoline = tr;
+ /* note, we're still holding tr refcnt from above */
+
+ mutex_unlock(&tr->mutex);
+
+ return 0;
+err:
+ mutex_unlock(&tr->mutex);
+
+ if (shim_link)
+ bpf_link_put(&shim_link->link.link);
+
+ /* have to release tr while _not_ holding its mutex */
+ bpf_trampoline_put(tr); /* bpf_trampoline_get above */
+
+ return err;
+}
+
+void bpf_trampoline_unlink_cgroup_shim(struct bpf_prog *prog)
+{
+ struct bpf_shim_tramp_link *shim_link = NULL;
+ struct bpf_trampoline *tr;
+ bpf_func_t bpf_func;
+ u64 key;
+
+ key = bpf_trampoline_compute_key(NULL, prog->aux->attach_btf,
+ prog->aux->attach_btf_id);
+
+ bpf_lsm_find_cgroup_shim(prog, &bpf_func);
+ tr = bpf_trampoline_lookup(key);
+ if (WARN_ON_ONCE(!tr))
+ return;
+
+ mutex_lock(&tr->mutex);
+ shim_link = cgroup_shim_find(tr, bpf_func);
+ mutex_unlock(&tr->mutex);
+
+ if (shim_link)
+ bpf_link_put(&shim_link->link.link);
+
+ bpf_trampoline_put(tr); /* bpf_trampoline_lookup above */
+}
+#endif
+
struct bpf_trampoline *bpf_trampoline_get(u64 key,
struct bpf_attach_target_info *tgt_info)
{
@@ -497,16 +784,19 @@ out:
void bpf_trampoline_put(struct bpf_trampoline *tr)
{
+ int i;
+
if (!tr)
return;
mutex_lock(&trampoline_mutex);
if (!refcount_dec_and_test(&tr->refcnt))
goto out;
WARN_ON_ONCE(mutex_is_locked(&tr->mutex));
- if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[BPF_TRAMP_FENTRY])))
- goto out;
- if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[BPF_TRAMP_FEXIT])))
- goto out;
+
+ for (i = 0; i < BPF_TRAMP_MAX; i++)
+ if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[i])))
+ goto out;
+
/* This code will be executed even when the last bpf_tramp_image
* is alive. All progs are detached from the trampoline and the
* trampoline image is patched with jmp into epilogue to skip
@@ -514,13 +804,17 @@ void bpf_trampoline_put(struct bpf_trampoline *tr)
* multiple rcu callbacks.
*/
hlist_del(&tr->hlist);
+ if (tr->fops) {
+ ftrace_free_filter(tr->fops);
+ kfree(tr->fops);
+ }
kfree(tr);
out:
mutex_unlock(&trampoline_mutex);
}
#define NO_START_TIME 1
-static u64 notrace bpf_prog_start_time(void)
+static __always_inline u64 notrace bpf_prog_start_time(void)
{
u64 start = NO_START_TIME;
@@ -532,17 +826,7 @@ static u64 notrace bpf_prog_start_time(void)
return start;
}
-static void notrace inc_misses_counter(struct bpf_prog *prog)
-{
- struct bpf_prog_stats *stats;
-
- stats = this_cpu_ptr(prog->stats);
- u64_stats_update_begin(&stats->syncp);
- stats->misses++;
- u64_stats_update_end(&stats->syncp);
-}
-
-/* The logic is similar to BPF_PROG_RUN, but with an explicit
+/* The logic is similar to bpf_prog_run(), but with an explicit
* rcu_read_lock() and migrate_disable() which are required
* for the trampoline. The macro is split into
* call __bpf_prog_enter
@@ -555,13 +839,16 @@ static void notrace inc_misses_counter(struct bpf_prog *prog)
* [2..MAX_U64] - execute bpf prog and record execution time.
* This is start time.
*/
-u64 notrace __bpf_prog_enter(struct bpf_prog *prog)
+static u64 notrace __bpf_prog_enter_recur(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx)
__acquires(RCU)
{
rcu_read_lock();
migrate_disable();
- if (unlikely(__this_cpu_inc_return(*(prog->active)) != 1)) {
- inc_misses_counter(prog);
+
+ run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx);
+
+ if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) {
+ bpf_prog_inc_misses_counter(prog);
return 0;
}
return bpf_prog_start_time();
@@ -579,43 +866,126 @@ static void notrace update_prog_stats(struct bpf_prog *prog,
* Hence check that 'start' is valid.
*/
start > NO_START_TIME) {
+ unsigned long flags;
+
stats = this_cpu_ptr(prog->stats);
- u64_stats_update_begin(&stats->syncp);
- stats->cnt++;
- stats->nsecs += sched_clock() - start;
- u64_stats_update_end(&stats->syncp);
+ flags = u64_stats_update_begin_irqsave(&stats->syncp);
+ u64_stats_inc(&stats->cnt);
+ u64_stats_add(&stats->nsecs, sched_clock() - start);
+ u64_stats_update_end_irqrestore(&stats->syncp, flags);
}
}
-void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start)
+static void notrace __bpf_prog_exit_recur(struct bpf_prog *prog, u64 start,
+ struct bpf_tramp_run_ctx *run_ctx)
__releases(RCU)
{
+ bpf_reset_run_ctx(run_ctx->saved_run_ctx);
+
update_prog_stats(prog, start);
- __this_cpu_dec(*(prog->active));
+ this_cpu_dec(*(prog->active));
migrate_enable();
rcu_read_unlock();
}
-u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog)
+static u64 notrace __bpf_prog_enter_lsm_cgroup(struct bpf_prog *prog,
+ struct bpf_tramp_run_ctx *run_ctx)
+ __acquires(RCU)
+{
+ /* Runtime stats are exported via actual BPF_LSM_CGROUP
+ * programs, not the shims.
+ */
+ rcu_read_lock();
+ migrate_disable();
+
+ run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx);
+
+ return NO_START_TIME;
+}
+
+static void notrace __bpf_prog_exit_lsm_cgroup(struct bpf_prog *prog, u64 start,
+ struct bpf_tramp_run_ctx *run_ctx)
+ __releases(RCU)
+{
+ bpf_reset_run_ctx(run_ctx->saved_run_ctx);
+
+ migrate_enable();
+ rcu_read_unlock();
+}
+
+u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog,
+ struct bpf_tramp_run_ctx *run_ctx)
{
rcu_read_lock_trace();
migrate_disable();
might_fault();
- if (unlikely(__this_cpu_inc_return(*(prog->active)) != 1)) {
- inc_misses_counter(prog);
+
+ if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) {
+ bpf_prog_inc_misses_counter(prog);
return 0;
}
+
+ run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx);
+
+ return bpf_prog_start_time();
+}
+
+void notrace __bpf_prog_exit_sleepable_recur(struct bpf_prog *prog, u64 start,
+ struct bpf_tramp_run_ctx *run_ctx)
+{
+ bpf_reset_run_ctx(run_ctx->saved_run_ctx);
+
+ update_prog_stats(prog, start);
+ this_cpu_dec(*(prog->active));
+ migrate_enable();
+ rcu_read_unlock_trace();
+}
+
+static u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog,
+ struct bpf_tramp_run_ctx *run_ctx)
+{
+ rcu_read_lock_trace();
+ migrate_disable();
+ might_fault();
+
+ run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx);
+
return bpf_prog_start_time();
}
-void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start)
+static void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start,
+ struct bpf_tramp_run_ctx *run_ctx)
{
+ bpf_reset_run_ctx(run_ctx->saved_run_ctx);
+
update_prog_stats(prog, start);
- __this_cpu_dec(*(prog->active));
migrate_enable();
rcu_read_unlock_trace();
}
+static u64 notrace __bpf_prog_enter(struct bpf_prog *prog,
+ struct bpf_tramp_run_ctx *run_ctx)
+ __acquires(RCU)
+{
+ rcu_read_lock();
+ migrate_disable();
+
+ run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx);
+
+ return bpf_prog_start_time();
+}
+
+static void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start,
+ struct bpf_tramp_run_ctx *run_ctx)
+ __releases(RCU)
+{
+ bpf_reset_run_ctx(run_ctx->saved_run_ctx);
+
+ update_prog_stats(prog, start);
+ migrate_enable();
+ rcu_read_unlock();
+}
+
void notrace __bpf_tramp_enter(struct bpf_tramp_image *tr)
{
percpu_ref_get(&tr->pcref);
@@ -626,10 +996,40 @@ void notrace __bpf_tramp_exit(struct bpf_tramp_image *tr)
percpu_ref_put(&tr->pcref);
}
+bpf_trampoline_enter_t bpf_trampoline_enter(const struct bpf_prog *prog)
+{
+ bool sleepable = prog->aux->sleepable;
+
+ if (bpf_prog_check_recur(prog))
+ return sleepable ? __bpf_prog_enter_sleepable_recur :
+ __bpf_prog_enter_recur;
+
+ if (resolve_prog_type(prog) == BPF_PROG_TYPE_LSM &&
+ prog->expected_attach_type == BPF_LSM_CGROUP)
+ return __bpf_prog_enter_lsm_cgroup;
+
+ return sleepable ? __bpf_prog_enter_sleepable : __bpf_prog_enter;
+}
+
+bpf_trampoline_exit_t bpf_trampoline_exit(const struct bpf_prog *prog)
+{
+ bool sleepable = prog->aux->sleepable;
+
+ if (bpf_prog_check_recur(prog))
+ return sleepable ? __bpf_prog_exit_sleepable_recur :
+ __bpf_prog_exit_recur;
+
+ if (resolve_prog_type(prog) == BPF_PROG_TYPE_LSM &&
+ prog->expected_attach_type == BPF_LSM_CGROUP)
+ return __bpf_prog_exit_lsm_cgroup;
+
+ return sleepable ? __bpf_prog_exit_sleepable : __bpf_prog_exit;
+}
+
int __weak
arch_prepare_bpf_trampoline(struct bpf_tramp_image *tr, void *image, void *image_end,
const struct btf_func_model *m, u32 flags,
- struct bpf_tramp_progs *tprogs,
+ struct bpf_tramp_links *tlinks,
void *orig_call)
{
return -ENOTSUPP;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 42a4063de7cd..fbcf5a4e2fcd 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4,6 +4,7 @@
* Copyright (c) 2018 Covalent IO, Inc. http://covalent.io
*/
#include <uapi/linux/btf.h>
+#include <linux/bpf-cgroup.h>
#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/slab.h>
@@ -22,6 +23,8 @@
#include <linux/error-injection.h>
#include <linux/bpf_lsm.h>
#include <linux/btf_ids.h>
+#include <linux/poison.h>
+#include <linux/module.h>
#include "disasm.h"
@@ -186,6 +189,15 @@ struct bpf_verifier_stack_elem {
POISON_POINTER_DELTA))
#define BPF_MAP_PTR(X) ((struct bpf_map *)((X) & ~BPF_MAP_PTR_UNPRIV))
+static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx);
+static int release_reference(struct bpf_verifier_env *env, int ref_obj_id);
+static void invalidate_non_owning_refs(struct bpf_verifier_env *env);
+static bool in_rbtree_lock_required_cb(struct bpf_verifier_env *env);
+static int ref_set_non_owning(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg);
+static void specialize_kfunc(struct bpf_verifier_env *env,
+ u32 func_id, u16 offset, unsigned long *addr);
+
static bool bpf_map_ptr_poisoned(const struct bpf_insn_aux_data *aux)
{
return BPF_MAP_PTR(aux->map_ptr_state) == BPF_MAP_PTR_POISON;
@@ -240,27 +252,69 @@ static bool bpf_pseudo_kfunc_call(const struct bpf_insn *insn)
insn->src_reg == BPF_PSEUDO_KFUNC_CALL;
}
-static bool bpf_pseudo_func(const struct bpf_insn *insn)
-{
- return insn->code == (BPF_LD | BPF_IMM | BPF_DW) &&
- insn->src_reg == BPF_PSEUDO_FUNC;
-}
-
struct bpf_call_arg_meta {
struct bpf_map *map_ptr;
bool raw_mode;
bool pkt_access;
+ u8 release_regno;
int regno;
int access_size;
int mem_size;
u64 msize_max_value;
int ref_obj_id;
+ int dynptr_id;
+ int map_uid;
int func_id;
struct btf *btf;
u32 btf_id;
struct btf *ret_btf;
u32 ret_btf_id;
u32 subprogno;
+ struct btf_field *kptr_field;
+};
+
+struct btf_and_id {
+ struct btf *btf;
+ u32 btf_id;
+};
+
+struct bpf_kfunc_call_arg_meta {
+ /* In parameters */
+ struct btf *btf;
+ u32 func_id;
+ u32 kfunc_flags;
+ const struct btf_type *func_proto;
+ const char *func_name;
+ /* Out parameters */
+ u32 ref_obj_id;
+ u8 release_regno;
+ bool r0_rdonly;
+ u32 ret_btf_id;
+ u64 r0_size;
+ u32 subprogno;
+ struct {
+ u64 value;
+ bool found;
+ } arg_constant;
+ union {
+ struct btf_and_id arg_obj_drop;
+ struct btf_and_id arg_refcount_acquire;
+ };
+ struct {
+ struct btf_field *field;
+ } arg_list_head;
+ struct {
+ struct btf_field *field;
+ } arg_rbtree_root;
+ struct {
+ enum bpf_dynptr_type type;
+ u32 id;
+ } initialized_dynptr;
+ struct {
+ u8 spi;
+ u8 frameno;
+ } iter;
+ u64 mem_size;
};
struct btf *btf_vmlinux;
@@ -288,59 +342,6 @@ find_linfo(const struct bpf_verifier_env *env, u32 insn_off)
return &linfo[i - 1];
}
-void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt,
- va_list args)
-{
- unsigned int n;
-
- n = vscnprintf(log->kbuf, BPF_VERIFIER_TMP_LOG_SIZE, fmt, args);
-
- WARN_ONCE(n >= BPF_VERIFIER_TMP_LOG_SIZE - 1,
- "verifier log line truncated - local buffer too short\n");
-
- n = min(log->len_total - log->len_used - 1, n);
- log->kbuf[n] = '\0';
-
- if (log->level == BPF_LOG_KERNEL) {
- pr_err("BPF:%s\n", log->kbuf);
- return;
- }
- if (!copy_to_user(log->ubuf + log->len_used, log->kbuf, n + 1))
- log->len_used += n;
- else
- log->ubuf = NULL;
-}
-
-static void bpf_vlog_reset(struct bpf_verifier_log *log, u32 new_pos)
-{
- char zero = 0;
-
- if (!bpf_verifier_log_needed(log))
- return;
-
- log->len_used = new_pos;
- if (put_user(zero, log->ubuf + new_pos))
- log->ubuf = NULL;
-}
-
-/* log_level controls verbosity level of eBPF verifier.
- * bpf_verifier_log_write() is used to dump the verification trace to the log,
- * so the user can figure out what's wrong with the program
- */
-__printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env,
- const char *fmt, ...)
-{
- va_list args;
-
- if (!bpf_verifier_log_needed(&env->log))
- return;
-
- va_start(args, fmt);
- bpf_verifier_vlog(&env->log, fmt, args);
- va_end(args);
-}
-EXPORT_SYMBOL_GPL(bpf_verifier_log_write);
-
__printf(2, 3) static void verbose(void *private_data, const char *fmt, ...)
{
struct bpf_verifier_env *env = private_data;
@@ -354,19 +355,6 @@ __printf(2, 3) static void verbose(void *private_data, const char *fmt, ...)
va_end(args);
}
-__printf(2, 3) void bpf_log(struct bpf_verifier_log *log,
- const char *fmt, ...)
-{
- va_list args;
-
- if (!bpf_verifier_log_needed(log))
- return;
-
- va_start(args, fmt);
- bpf_verifier_vlog(log, fmt, args);
- va_end(args);
-}
-
static const char *ltrim(const char *s)
{
while (isspace(*s))
@@ -423,6 +411,7 @@ static void verbose_invalid_scalar(struct bpf_verifier_env *env,
static bool type_is_pkt_pointer(enum bpf_reg_type type)
{
+ type = base_type(type);
return type == PTR_TO_PACKET ||
type == PTR_TO_PACKET_META;
}
@@ -435,76 +424,58 @@ static bool type_is_sk_pointer(enum bpf_reg_type type)
type == PTR_TO_XDP_SOCK;
}
+static bool type_may_be_null(u32 type)
+{
+ return type & PTR_MAYBE_NULL;
+}
+
static bool reg_type_not_null(enum bpf_reg_type type)
{
+ if (type_may_be_null(type))
+ return false;
+
+ type = base_type(type);
return type == PTR_TO_SOCKET ||
type == PTR_TO_TCP_SOCK ||
type == PTR_TO_MAP_VALUE ||
type == PTR_TO_MAP_KEY ||
- type == PTR_TO_SOCK_COMMON;
-}
-
-static bool reg_type_may_be_null(enum bpf_reg_type type)
-{
- return type == PTR_TO_MAP_VALUE_OR_NULL ||
- type == PTR_TO_SOCKET_OR_NULL ||
- type == PTR_TO_SOCK_COMMON_OR_NULL ||
- type == PTR_TO_TCP_SOCK_OR_NULL ||
- type == PTR_TO_BTF_ID_OR_NULL ||
- type == PTR_TO_MEM_OR_NULL ||
- type == PTR_TO_RDONLY_BUF_OR_NULL ||
- type == PTR_TO_RDWR_BUF_OR_NULL;
+ type == PTR_TO_SOCK_COMMON ||
+ type == PTR_TO_MEM;
}
-static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg)
+static bool type_is_ptr_alloc_obj(u32 type)
{
- return reg->type == PTR_TO_MAP_VALUE &&
- map_value_has_spin_lock(reg->map_ptr);
+ return base_type(type) == PTR_TO_BTF_ID && type_flag(type) & MEM_ALLOC;
}
-static bool reg_type_may_be_refcounted_or_null(enum bpf_reg_type type)
+static bool type_is_non_owning_ref(u32 type)
{
- return type == PTR_TO_SOCKET ||
- type == PTR_TO_SOCKET_OR_NULL ||
- type == PTR_TO_TCP_SOCK ||
- type == PTR_TO_TCP_SOCK_OR_NULL ||
- type == PTR_TO_MEM ||
- type == PTR_TO_MEM_OR_NULL;
+ return type_is_ptr_alloc_obj(type) && type_flag(type) & NON_OWN_REF;
}
-static bool arg_type_may_be_refcounted(enum bpf_arg_type type)
+static struct btf_record *reg_btf_record(const struct bpf_reg_state *reg)
{
- return type == ARG_PTR_TO_SOCK_COMMON;
-}
+ struct btf_record *rec = NULL;
+ struct btf_struct_meta *meta;
-static bool arg_type_may_be_null(enum bpf_arg_type type)
-{
- return type == ARG_PTR_TO_MAP_VALUE_OR_NULL ||
- type == ARG_PTR_TO_MEM_OR_NULL ||
- type == ARG_PTR_TO_CTX_OR_NULL ||
- type == ARG_PTR_TO_SOCKET_OR_NULL ||
- type == ARG_PTR_TO_ALLOC_MEM_OR_NULL ||
- type == ARG_PTR_TO_STACK_OR_NULL;
+ if (reg->type == PTR_TO_MAP_VALUE) {
+ rec = reg->map_ptr->record;
+ } else if (type_is_ptr_alloc_obj(reg->type)) {
+ meta = btf_find_struct_meta(reg->btf, reg->btf_id);
+ if (meta)
+ rec = meta->record;
+ }
+ return rec;
}
-/* Determine whether the function releases some resources allocated by another
- * function call. The first reference type argument will be assumed to be
- * released by release_reference().
- */
-static bool is_release_function(enum bpf_func_id func_id)
+static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg)
{
- return func_id == BPF_FUNC_sk_release ||
- func_id == BPF_FUNC_ringbuf_submit ||
- func_id == BPF_FUNC_ringbuf_discard;
+ return btf_record_has_field(reg_btf_record(reg), BPF_SPIN_LOCK);
}
-static bool may_be_acquire_function(enum bpf_func_id func_id)
+static bool type_is_rdonly_mem(u32 type)
{
- return func_id == BPF_FUNC_sk_lookup_tcp ||
- func_id == BPF_FUNC_sk_lookup_udp ||
- func_id == BPF_FUNC_skc_lookup_tcp ||
- func_id == BPF_FUNC_map_lookup_elem ||
- func_id == BPF_FUNC_ringbuf_reserve;
+ return type & MEM_RDONLY;
}
static bool is_acquire_function(enum bpf_func_id func_id,
@@ -515,7 +486,8 @@ static bool is_acquire_function(enum bpf_func_id func_id,
if (func_id == BPF_FUNC_sk_lookup_tcp ||
func_id == BPF_FUNC_sk_lookup_udp ||
func_id == BPF_FUNC_skc_lookup_tcp ||
- func_id == BPF_FUNC_ringbuf_reserve)
+ func_id == BPF_FUNC_ringbuf_reserve ||
+ func_id == BPF_FUNC_kptr_xchg)
return true;
if (func_id == BPF_FUNC_map_lookup_elem &&
@@ -533,10 +505,48 @@ static bool is_ptr_cast_function(enum bpf_func_id func_id)
func_id == BPF_FUNC_skc_to_tcp_sock ||
func_id == BPF_FUNC_skc_to_tcp6_sock ||
func_id == BPF_FUNC_skc_to_udp6_sock ||
+ func_id == BPF_FUNC_skc_to_mptcp_sock ||
func_id == BPF_FUNC_skc_to_tcp_timewait_sock ||
func_id == BPF_FUNC_skc_to_tcp_request_sock;
}
+static bool is_dynptr_ref_function(enum bpf_func_id func_id)
+{
+ return func_id == BPF_FUNC_dynptr_data;
+}
+
+static bool is_callback_calling_function(enum bpf_func_id func_id)
+{
+ return func_id == BPF_FUNC_for_each_map_elem ||
+ func_id == BPF_FUNC_timer_set_callback ||
+ func_id == BPF_FUNC_find_vma ||
+ func_id == BPF_FUNC_loop ||
+ func_id == BPF_FUNC_user_ringbuf_drain;
+}
+
+static bool is_storage_get_function(enum bpf_func_id func_id)
+{
+ return func_id == BPF_FUNC_sk_storage_get ||
+ func_id == BPF_FUNC_inode_storage_get ||
+ func_id == BPF_FUNC_task_storage_get ||
+ func_id == BPF_FUNC_cgrp_storage_get;
+}
+
+static bool helper_multiple_ref_obj_use(enum bpf_func_id func_id,
+ const struct bpf_map *map)
+{
+ int ref_obj_uses = 0;
+
+ if (is_ptr_cast_function(func_id))
+ ref_obj_uses++;
+ if (is_acquire_function(func_id, map))
+ ref_obj_uses++;
+ if (is_dynptr_ref_function(func_id))
+ ref_obj_uses++;
+
+ return ref_obj_uses > 1;
+}
+
static bool is_cmpxchg_insn(const struct bpf_insn *insn)
{
return BPF_CLASS(insn->code) == BPF_STX &&
@@ -544,45 +554,68 @@ static bool is_cmpxchg_insn(const struct bpf_insn *insn)
insn->imm == BPF_CMPXCHG;
}
-/* string representation of 'enum bpf_reg_type' */
-static const char * const reg_type_str[] = {
- [NOT_INIT] = "?",
- [SCALAR_VALUE] = "inv",
- [PTR_TO_CTX] = "ctx",
- [CONST_PTR_TO_MAP] = "map_ptr",
- [PTR_TO_MAP_VALUE] = "map_value",
- [PTR_TO_MAP_VALUE_OR_NULL] = "map_value_or_null",
- [PTR_TO_STACK] = "fp",
- [PTR_TO_PACKET] = "pkt",
- [PTR_TO_PACKET_META] = "pkt_meta",
- [PTR_TO_PACKET_END] = "pkt_end",
- [PTR_TO_FLOW_KEYS] = "flow_keys",
- [PTR_TO_SOCKET] = "sock",
- [PTR_TO_SOCKET_OR_NULL] = "sock_or_null",
- [PTR_TO_SOCK_COMMON] = "sock_common",
- [PTR_TO_SOCK_COMMON_OR_NULL] = "sock_common_or_null",
- [PTR_TO_TCP_SOCK] = "tcp_sock",
- [PTR_TO_TCP_SOCK_OR_NULL] = "tcp_sock_or_null",
- [PTR_TO_TP_BUFFER] = "tp_buffer",
- [PTR_TO_XDP_SOCK] = "xdp_sock",
- [PTR_TO_BTF_ID] = "ptr_",
- [PTR_TO_BTF_ID_OR_NULL] = "ptr_or_null_",
- [PTR_TO_PERCPU_BTF_ID] = "percpu_ptr_",
- [PTR_TO_MEM] = "mem",
- [PTR_TO_MEM_OR_NULL] = "mem_or_null",
- [PTR_TO_RDONLY_BUF] = "rdonly_buf",
- [PTR_TO_RDONLY_BUF_OR_NULL] = "rdonly_buf_or_null",
- [PTR_TO_RDWR_BUF] = "rdwr_buf",
- [PTR_TO_RDWR_BUF_OR_NULL] = "rdwr_buf_or_null",
- [PTR_TO_FUNC] = "func",
- [PTR_TO_MAP_KEY] = "map_key",
-};
+/* string representation of 'enum bpf_reg_type'
+ *
+ * Note that reg_type_str() can not appear more than once in a single verbose()
+ * statement.
+ */
+static const char *reg_type_str(struct bpf_verifier_env *env,
+ enum bpf_reg_type type)
+{
+ char postfix[16] = {0}, prefix[64] = {0};
+ static const char * const str[] = {
+ [NOT_INIT] = "?",
+ [SCALAR_VALUE] = "scalar",
+ [PTR_TO_CTX] = "ctx",
+ [CONST_PTR_TO_MAP] = "map_ptr",
+ [PTR_TO_MAP_VALUE] = "map_value",
+ [PTR_TO_STACK] = "fp",
+ [PTR_TO_PACKET] = "pkt",
+ [PTR_TO_PACKET_META] = "pkt_meta",
+ [PTR_TO_PACKET_END] = "pkt_end",
+ [PTR_TO_FLOW_KEYS] = "flow_keys",
+ [PTR_TO_SOCKET] = "sock",
+ [PTR_TO_SOCK_COMMON] = "sock_common",
+ [PTR_TO_TCP_SOCK] = "tcp_sock",
+ [PTR_TO_TP_BUFFER] = "tp_buffer",
+ [PTR_TO_XDP_SOCK] = "xdp_sock",
+ [PTR_TO_BTF_ID] = "ptr_",
+ [PTR_TO_MEM] = "mem",
+ [PTR_TO_BUF] = "buf",
+ [PTR_TO_FUNC] = "func",
+ [PTR_TO_MAP_KEY] = "map_key",
+ [CONST_PTR_TO_DYNPTR] = "dynptr_ptr",
+ };
+
+ if (type & PTR_MAYBE_NULL) {
+ if (base_type(type) == PTR_TO_BTF_ID)
+ strncpy(postfix, "or_null_", 16);
+ else
+ strncpy(postfix, "_or_null", 16);
+ }
+
+ snprintf(prefix, sizeof(prefix), "%s%s%s%s%s%s%s",
+ type & MEM_RDONLY ? "rdonly_" : "",
+ type & MEM_RINGBUF ? "ringbuf_" : "",
+ type & MEM_USER ? "user_" : "",
+ type & MEM_PERCPU ? "percpu_" : "",
+ type & MEM_RCU ? "rcu_" : "",
+ type & PTR_UNTRUSTED ? "untrusted_" : "",
+ type & PTR_TRUSTED ? "trusted_" : ""
+ );
+
+ snprintf(env->type_str_buf, TYPE_STR_BUF_LEN, "%s%s%s",
+ prefix, str[base_type(type)], postfix);
+ return env->type_str_buf;
+}
static char slot_type_char[] = {
[STACK_INVALID] = '?',
[STACK_SPILL] = 'r',
[STACK_MISC] = 'm',
[STACK_ZERO] = '0',
+ [STACK_DYNPTR] = 'd',
+ [STACK_ITER] = 'i',
};
static void print_liveness(struct bpf_verifier_env *env,
@@ -598,6 +631,11 @@ static void print_liveness(struct bpf_verifier_env *env,
verbose(env, "D");
}
+static int __get_spi(s32 off)
+{
+ return (-off - 1) / BPF_REG_SIZE;
+}
+
static struct bpf_func_state *func(struct bpf_verifier_env *env,
const struct bpf_reg_state *reg)
{
@@ -606,13 +644,625 @@ static struct bpf_func_state *func(struct bpf_verifier_env *env,
return cur->frame[reg->frameno];
}
-static const char *kernel_type_name(const struct btf* btf, u32 id)
+static bool is_spi_bounds_valid(struct bpf_func_state *state, int spi, int nr_slots)
+{
+ int allocated_slots = state->allocated_stack / BPF_REG_SIZE;
+
+ /* We need to check that slots between [spi - nr_slots + 1, spi] are
+ * within [0, allocated_stack).
+ *
+ * Please note that the spi grows downwards. For example, a dynptr
+ * takes the size of two stack slots; the first slot will be at
+ * spi and the second slot will be at spi - 1.
+ */
+ return spi - nr_slots + 1 >= 0 && spi < allocated_slots;
+}
+
+static int stack_slot_obj_get_spi(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+ const char *obj_kind, int nr_slots)
+{
+ int off, spi;
+
+ if (!tnum_is_const(reg->var_off)) {
+ verbose(env, "%s has to be at a constant offset\n", obj_kind);
+ return -EINVAL;
+ }
+
+ off = reg->off + reg->var_off.value;
+ if (off % BPF_REG_SIZE) {
+ verbose(env, "cannot pass in %s at an offset=%d\n", obj_kind, off);
+ return -EINVAL;
+ }
+
+ spi = __get_spi(off);
+ if (spi + 1 < nr_slots) {
+ verbose(env, "cannot pass in %s at an offset=%d\n", obj_kind, off);
+ return -EINVAL;
+ }
+
+ if (!is_spi_bounds_valid(func(env, reg), spi, nr_slots))
+ return -ERANGE;
+ return spi;
+}
+
+static int dynptr_get_spi(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+ return stack_slot_obj_get_spi(env, reg, "dynptr", BPF_DYNPTR_NR_SLOTS);
+}
+
+static int iter_get_spi(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int nr_slots)
+{
+ return stack_slot_obj_get_spi(env, reg, "iter", nr_slots);
+}
+
+static const char *btf_type_name(const struct btf *btf, u32 id)
{
return btf_name_by_offset(btf, btf_type_by_id(btf, id)->name_off);
}
+static const char *dynptr_type_str(enum bpf_dynptr_type type)
+{
+ switch (type) {
+ case BPF_DYNPTR_TYPE_LOCAL:
+ return "local";
+ case BPF_DYNPTR_TYPE_RINGBUF:
+ return "ringbuf";
+ case BPF_DYNPTR_TYPE_SKB:
+ return "skb";
+ case BPF_DYNPTR_TYPE_XDP:
+ return "xdp";
+ case BPF_DYNPTR_TYPE_INVALID:
+ return "<invalid>";
+ default:
+ WARN_ONCE(1, "unknown dynptr type %d\n", type);
+ return "<unknown>";
+ }
+}
+
+static const char *iter_type_str(const struct btf *btf, u32 btf_id)
+{
+ if (!btf || btf_id == 0)
+ return "<invalid>";
+
+ /* we already validated that type is valid and has conforming name */
+ return btf_type_name(btf, btf_id) + sizeof(ITER_PREFIX) - 1;
+}
+
+static const char *iter_state_str(enum bpf_iter_state state)
+{
+ switch (state) {
+ case BPF_ITER_STATE_ACTIVE:
+ return "active";
+ case BPF_ITER_STATE_DRAINED:
+ return "drained";
+ case BPF_ITER_STATE_INVALID:
+ return "<invalid>";
+ default:
+ WARN_ONCE(1, "unknown iter state %d\n", state);
+ return "<unknown>";
+ }
+}
+
+static void mark_reg_scratched(struct bpf_verifier_env *env, u32 regno)
+{
+ env->scratched_regs |= 1U << regno;
+}
+
+static void mark_stack_slot_scratched(struct bpf_verifier_env *env, u32 spi)
+{
+ env->scratched_stack_slots |= 1ULL << spi;
+}
+
+static bool reg_scratched(const struct bpf_verifier_env *env, u32 regno)
+{
+ return (env->scratched_regs >> regno) & 1;
+}
+
+static bool stack_slot_scratched(const struct bpf_verifier_env *env, u64 regno)
+{
+ return (env->scratched_stack_slots >> regno) & 1;
+}
+
+static bool verifier_state_scratched(const struct bpf_verifier_env *env)
+{
+ return env->scratched_regs || env->scratched_stack_slots;
+}
+
+static void mark_verifier_state_clean(struct bpf_verifier_env *env)
+{
+ env->scratched_regs = 0U;
+ env->scratched_stack_slots = 0ULL;
+}
+
+/* Used for printing the entire verifier state. */
+static void mark_verifier_state_scratched(struct bpf_verifier_env *env)
+{
+ env->scratched_regs = ~0U;
+ env->scratched_stack_slots = ~0ULL;
+}
+
+static enum bpf_dynptr_type arg_to_dynptr_type(enum bpf_arg_type arg_type)
+{
+ switch (arg_type & DYNPTR_TYPE_FLAG_MASK) {
+ case DYNPTR_TYPE_LOCAL:
+ return BPF_DYNPTR_TYPE_LOCAL;
+ case DYNPTR_TYPE_RINGBUF:
+ return BPF_DYNPTR_TYPE_RINGBUF;
+ case DYNPTR_TYPE_SKB:
+ return BPF_DYNPTR_TYPE_SKB;
+ case DYNPTR_TYPE_XDP:
+ return BPF_DYNPTR_TYPE_XDP;
+ default:
+ return BPF_DYNPTR_TYPE_INVALID;
+ }
+}
+
+static enum bpf_type_flag get_dynptr_type_flag(enum bpf_dynptr_type type)
+{
+ switch (type) {
+ case BPF_DYNPTR_TYPE_LOCAL:
+ return DYNPTR_TYPE_LOCAL;
+ case BPF_DYNPTR_TYPE_RINGBUF:
+ return DYNPTR_TYPE_RINGBUF;
+ case BPF_DYNPTR_TYPE_SKB:
+ return DYNPTR_TYPE_SKB;
+ case BPF_DYNPTR_TYPE_XDP:
+ return DYNPTR_TYPE_XDP;
+ default:
+ return 0;
+ }
+}
+
+static bool dynptr_type_refcounted(enum bpf_dynptr_type type)
+{
+ return type == BPF_DYNPTR_TYPE_RINGBUF;
+}
+
+static void __mark_dynptr_reg(struct bpf_reg_state *reg,
+ enum bpf_dynptr_type type,
+ bool first_slot, int dynptr_id);
+
+static void __mark_reg_not_init(const struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg);
+
+static void mark_dynptr_stack_regs(struct bpf_verifier_env *env,
+ struct bpf_reg_state *sreg1,
+ struct bpf_reg_state *sreg2,
+ enum bpf_dynptr_type type)
+{
+ int id = ++env->id_gen;
+
+ __mark_dynptr_reg(sreg1, type, true, id);
+ __mark_dynptr_reg(sreg2, type, false, id);
+}
+
+static void mark_dynptr_cb_reg(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg,
+ enum bpf_dynptr_type type)
+{
+ __mark_dynptr_reg(reg, type, true, ++env->id_gen);
+}
+
+static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env,
+ struct bpf_func_state *state, int spi);
+
+static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+ enum bpf_arg_type arg_type, int insn_idx)
+{
+ struct bpf_func_state *state = func(env, reg);
+ enum bpf_dynptr_type type;
+ int spi, i, id, err;
+
+ spi = dynptr_get_spi(env, reg);
+ if (spi < 0)
+ return spi;
+
+ /* We cannot assume both spi and spi - 1 belong to the same dynptr,
+ * hence we need to call destroy_if_dynptr_stack_slot twice for both,
+ * to ensure that for the following example:
+ * [d1][d1][d2][d2]
+ * spi 3 2 1 0
+ * So marking spi = 2 should lead to destruction of both d1 and d2. In
+ * case they do belong to same dynptr, second call won't see slot_type
+ * as STACK_DYNPTR and will simply skip destruction.
+ */
+ err = destroy_if_dynptr_stack_slot(env, state, spi);
+ if (err)
+ return err;
+ err = destroy_if_dynptr_stack_slot(env, state, spi - 1);
+ if (err)
+ return err;
+
+ for (i = 0; i < BPF_REG_SIZE; i++) {
+ state->stack[spi].slot_type[i] = STACK_DYNPTR;
+ state->stack[spi - 1].slot_type[i] = STACK_DYNPTR;
+ }
+
+ type = arg_to_dynptr_type(arg_type);
+ if (type == BPF_DYNPTR_TYPE_INVALID)
+ return -EINVAL;
+
+ mark_dynptr_stack_regs(env, &state->stack[spi].spilled_ptr,
+ &state->stack[spi - 1].spilled_ptr, type);
+
+ if (dynptr_type_refcounted(type)) {
+ /* The id is used to track proper releasing */
+ id = acquire_reference_state(env, insn_idx);
+ if (id < 0)
+ return id;
+
+ state->stack[spi].spilled_ptr.ref_obj_id = id;
+ state->stack[spi - 1].spilled_ptr.ref_obj_id = id;
+ }
+
+ state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
+ state->stack[spi - 1].spilled_ptr.live |= REG_LIVE_WRITTEN;
+
+ return 0;
+}
+
+static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+ struct bpf_func_state *state = func(env, reg);
+ int spi, i;
+
+ spi = dynptr_get_spi(env, reg);
+ if (spi < 0)
+ return spi;
+
+ for (i = 0; i < BPF_REG_SIZE; i++) {
+ state->stack[spi].slot_type[i] = STACK_INVALID;
+ state->stack[spi - 1].slot_type[i] = STACK_INVALID;
+ }
+
+ /* Invalidate any slices associated with this dynptr */
+ if (dynptr_type_refcounted(state->stack[spi].spilled_ptr.dynptr.type))
+ WARN_ON_ONCE(release_reference(env, state->stack[spi].spilled_ptr.ref_obj_id));
+
+ __mark_reg_not_init(env, &state->stack[spi].spilled_ptr);
+ __mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr);
+
+ /* Why do we need to set REG_LIVE_WRITTEN for STACK_INVALID slot?
+ *
+ * While we don't allow reading STACK_INVALID, it is still possible to
+ * do <8 byte writes marking some but not all slots as STACK_MISC. Then,
+ * helpers or insns can do partial read of that part without failing,
+ * but check_stack_range_initialized, check_stack_read_var_off, and
+ * check_stack_read_fixed_off will do mark_reg_read for all 8-bytes of
+ * the slot conservatively. Hence we need to prevent those liveness
+ * marking walks.
+ *
+ * This was not a problem before because STACK_INVALID is only set by
+ * default (where the default reg state has its reg->parent as NULL), or
+ * in clean_live_states after REG_LIVE_DONE (at which point
+ * mark_reg_read won't walk reg->parent chain), but not randomly during
+ * verifier state exploration (like we did above). Hence, for our case
+ * parentage chain will still be live (i.e. reg->parent may be
+ * non-NULL), while earlier reg->parent was NULL, so we need
+ * REG_LIVE_WRITTEN to screen off read marker propagation when it is
+ * done later on reads or by mark_dynptr_read as well to unnecessary
+ * mark registers in verifier state.
+ */
+ state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
+ state->stack[spi - 1].spilled_ptr.live |= REG_LIVE_WRITTEN;
+
+ return 0;
+}
+
+static void __mark_reg_unknown(const struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg);
+
+static void mark_reg_invalid(const struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+ if (!env->allow_ptr_leaks)
+ __mark_reg_not_init(env, reg);
+ else
+ __mark_reg_unknown(env, reg);
+}
+
+static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env,
+ struct bpf_func_state *state, int spi)
+{
+ struct bpf_func_state *fstate;
+ struct bpf_reg_state *dreg;
+ int i, dynptr_id;
+
+ /* We always ensure that STACK_DYNPTR is never set partially,
+ * hence just checking for slot_type[0] is enough. This is
+ * different for STACK_SPILL, where it may be only set for
+ * 1 byte, so code has to use is_spilled_reg.
+ */
+ if (state->stack[spi].slot_type[0] != STACK_DYNPTR)
+ return 0;
+
+ /* Reposition spi to first slot */
+ if (!state->stack[spi].spilled_ptr.dynptr.first_slot)
+ spi = spi + 1;
+
+ if (dynptr_type_refcounted(state->stack[spi].spilled_ptr.dynptr.type)) {
+ verbose(env, "cannot overwrite referenced dynptr\n");
+ return -EINVAL;
+ }
+
+ mark_stack_slot_scratched(env, spi);
+ mark_stack_slot_scratched(env, spi - 1);
+
+ /* Writing partially to one dynptr stack slot destroys both. */
+ for (i = 0; i < BPF_REG_SIZE; i++) {
+ state->stack[spi].slot_type[i] = STACK_INVALID;
+ state->stack[spi - 1].slot_type[i] = STACK_INVALID;
+ }
+
+ dynptr_id = state->stack[spi].spilled_ptr.id;
+ /* Invalidate any slices associated with this dynptr */
+ bpf_for_each_reg_in_vstate(env->cur_state, fstate, dreg, ({
+ /* Dynptr slices are only PTR_TO_MEM_OR_NULL and PTR_TO_MEM */
+ if (dreg->type != (PTR_TO_MEM | PTR_MAYBE_NULL) && dreg->type != PTR_TO_MEM)
+ continue;
+ if (dreg->dynptr_id == dynptr_id)
+ mark_reg_invalid(env, dreg);
+ }));
+
+ /* Do not release reference state, we are destroying dynptr on stack,
+ * not using some helper to release it. Just reset register.
+ */
+ __mark_reg_not_init(env, &state->stack[spi].spilled_ptr);
+ __mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr);
+
+ /* Same reason as unmark_stack_slots_dynptr above */
+ state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
+ state->stack[spi - 1].spilled_ptr.live |= REG_LIVE_WRITTEN;
+
+ return 0;
+}
+
+static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+ int spi;
+
+ if (reg->type == CONST_PTR_TO_DYNPTR)
+ return false;
+
+ spi = dynptr_get_spi(env, reg);
+
+ /* -ERANGE (i.e. spi not falling into allocated stack slots) isn't an
+ * error because this just means the stack state hasn't been updated yet.
+ * We will do check_mem_access to check and update stack bounds later.
+ */
+ if (spi < 0 && spi != -ERANGE)
+ return false;
+
+ /* We don't need to check if the stack slots are marked by previous
+ * dynptr initializations because we allow overwriting existing unreferenced
+ * STACK_DYNPTR slots, see mark_stack_slots_dynptr which calls
+ * destroy_if_dynptr_stack_slot to ensure dynptr objects at the slots we are
+ * touching are completely destructed before we reinitialize them for a new
+ * one. For referenced ones, destroy_if_dynptr_stack_slot returns an error early
+ * instead of delaying it until the end where the user will get "Unreleased
+ * reference" error.
+ */
+ return true;
+}
+
+static bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+ struct bpf_func_state *state = func(env, reg);
+ int i, spi;
+
+ /* This already represents first slot of initialized bpf_dynptr.
+ *
+ * CONST_PTR_TO_DYNPTR already has fixed and var_off as 0 due to
+ * check_func_arg_reg_off's logic, so we don't need to check its
+ * offset and alignment.
+ */
+ if (reg->type == CONST_PTR_TO_DYNPTR)
+ return true;
+
+ spi = dynptr_get_spi(env, reg);
+ if (spi < 0)
+ return false;
+ if (!state->stack[spi].spilled_ptr.dynptr.first_slot)
+ return false;
+
+ for (i = 0; i < BPF_REG_SIZE; i++) {
+ if (state->stack[spi].slot_type[i] != STACK_DYNPTR ||
+ state->stack[spi - 1].slot_type[i] != STACK_DYNPTR)
+ return false;
+ }
+
+ return true;
+}
+
+static bool is_dynptr_type_expected(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+ enum bpf_arg_type arg_type)
+{
+ struct bpf_func_state *state = func(env, reg);
+ enum bpf_dynptr_type dynptr_type;
+ int spi;
+
+ /* ARG_PTR_TO_DYNPTR takes any type of dynptr */
+ if (arg_type == ARG_PTR_TO_DYNPTR)
+ return true;
+
+ dynptr_type = arg_to_dynptr_type(arg_type);
+ if (reg->type == CONST_PTR_TO_DYNPTR) {
+ return reg->dynptr.type == dynptr_type;
+ } else {
+ spi = dynptr_get_spi(env, reg);
+ if (spi < 0)
+ return false;
+ return state->stack[spi].spilled_ptr.dynptr.type == dynptr_type;
+ }
+}
+
+static void __mark_reg_known_zero(struct bpf_reg_state *reg);
+
+static int mark_stack_slots_iter(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg, int insn_idx,
+ struct btf *btf, u32 btf_id, int nr_slots)
+{
+ struct bpf_func_state *state = func(env, reg);
+ int spi, i, j, id;
+
+ spi = iter_get_spi(env, reg, nr_slots);
+ if (spi < 0)
+ return spi;
+
+ id = acquire_reference_state(env, insn_idx);
+ if (id < 0)
+ return id;
+
+ for (i = 0; i < nr_slots; i++) {
+ struct bpf_stack_state *slot = &state->stack[spi - i];
+ struct bpf_reg_state *st = &slot->spilled_ptr;
+
+ __mark_reg_known_zero(st);
+ st->type = PTR_TO_STACK; /* we don't have dedicated reg type */
+ st->live |= REG_LIVE_WRITTEN;
+ st->ref_obj_id = i == 0 ? id : 0;
+ st->iter.btf = btf;
+ st->iter.btf_id = btf_id;
+ st->iter.state = BPF_ITER_STATE_ACTIVE;
+ st->iter.depth = 0;
+
+ for (j = 0; j < BPF_REG_SIZE; j++)
+ slot->slot_type[j] = STACK_ITER;
+
+ mark_stack_slot_scratched(env, spi - i);
+ }
+
+ return 0;
+}
+
+static int unmark_stack_slots_iter(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg, int nr_slots)
+{
+ struct bpf_func_state *state = func(env, reg);
+ int spi, i, j;
+
+ spi = iter_get_spi(env, reg, nr_slots);
+ if (spi < 0)
+ return spi;
+
+ for (i = 0; i < nr_slots; i++) {
+ struct bpf_stack_state *slot = &state->stack[spi - i];
+ struct bpf_reg_state *st = &slot->spilled_ptr;
+
+ if (i == 0)
+ WARN_ON_ONCE(release_reference(env, st->ref_obj_id));
+
+ __mark_reg_not_init(env, st);
+
+ /* see unmark_stack_slots_dynptr() for why we need to set REG_LIVE_WRITTEN */
+ st->live |= REG_LIVE_WRITTEN;
+
+ for (j = 0; j < BPF_REG_SIZE; j++)
+ slot->slot_type[j] = STACK_INVALID;
+
+ mark_stack_slot_scratched(env, spi - i);
+ }
+
+ return 0;
+}
+
+static bool is_iter_reg_valid_uninit(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg, int nr_slots)
+{
+ struct bpf_func_state *state = func(env, reg);
+ int spi, i, j;
+
+ /* For -ERANGE (i.e. spi not falling into allocated stack slots), we
+ * will do check_mem_access to check and update stack bounds later, so
+ * return true for that case.
+ */
+ spi = iter_get_spi(env, reg, nr_slots);
+ if (spi == -ERANGE)
+ return true;
+ if (spi < 0)
+ return false;
+
+ for (i = 0; i < nr_slots; i++) {
+ struct bpf_stack_state *slot = &state->stack[spi - i];
+
+ for (j = 0; j < BPF_REG_SIZE; j++)
+ if (slot->slot_type[j] == STACK_ITER)
+ return false;
+ }
+
+ return true;
+}
+
+static bool is_iter_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+ struct btf *btf, u32 btf_id, int nr_slots)
+{
+ struct bpf_func_state *state = func(env, reg);
+ int spi, i, j;
+
+ spi = iter_get_spi(env, reg, nr_slots);
+ if (spi < 0)
+ return false;
+
+ for (i = 0; i < nr_slots; i++) {
+ struct bpf_stack_state *slot = &state->stack[spi - i];
+ struct bpf_reg_state *st = &slot->spilled_ptr;
+
+ /* only main (first) slot has ref_obj_id set */
+ if (i == 0 && !st->ref_obj_id)
+ return false;
+ if (i != 0 && st->ref_obj_id)
+ return false;
+ if (st->iter.btf != btf || st->iter.btf_id != btf_id)
+ return false;
+
+ for (j = 0; j < BPF_REG_SIZE; j++)
+ if (slot->slot_type[j] != STACK_ITER)
+ return false;
+ }
+
+ return true;
+}
+
+/* Check if given stack slot is "special":
+ * - spilled register state (STACK_SPILL);
+ * - dynptr state (STACK_DYNPTR);
+ * - iter state (STACK_ITER).
+ */
+static bool is_stack_slot_special(const struct bpf_stack_state *stack)
+{
+ enum bpf_stack_slot_type type = stack->slot_type[BPF_REG_SIZE - 1];
+
+ switch (type) {
+ case STACK_SPILL:
+ case STACK_DYNPTR:
+ case STACK_ITER:
+ return true;
+ case STACK_INVALID:
+ case STACK_MISC:
+ case STACK_ZERO:
+ return false;
+ default:
+ WARN_ONCE(1, "unknown stack slot type %d\n", type);
+ return true;
+ }
+}
+
+/* The reg state of a pointer or a bounded scalar was saved when
+ * it was spilled to the stack.
+ */
+static bool is_spilled_reg(const struct bpf_stack_state *stack)
+{
+ return stack->slot_type[BPF_REG_SIZE - 1] == STACK_SPILL;
+}
+
+static void scrub_spilled_slot(u8 *stype)
+{
+ if (*stype != STACK_INVALID)
+ *stype = STACK_MISC;
+}
+
static void print_verifier_state(struct bpf_verifier_env *env,
- const struct bpf_func_state *state)
+ const struct bpf_func_state *state,
+ bool print_all)
{
const struct bpf_reg_state *reg;
enum bpf_reg_type t;
@@ -625,78 +1275,85 @@ static void print_verifier_state(struct bpf_verifier_env *env,
t = reg->type;
if (t == NOT_INIT)
continue;
+ if (!print_all && !reg_scratched(env, i))
+ continue;
verbose(env, " R%d", i);
print_liveness(env, reg->live);
- verbose(env, "=%s", reg_type_str[t]);
+ verbose(env, "=");
if (t == SCALAR_VALUE && reg->precise)
verbose(env, "P");
if ((t == SCALAR_VALUE || t == PTR_TO_STACK) &&
tnum_is_const(reg->var_off)) {
/* reg->off should be 0 for SCALAR_VALUE */
+ verbose(env, "%s", t == SCALAR_VALUE ? "" : reg_type_str(env, t));
verbose(env, "%lld", reg->var_off.value + reg->off);
} else {
- if (t == PTR_TO_BTF_ID ||
- t == PTR_TO_BTF_ID_OR_NULL ||
- t == PTR_TO_PERCPU_BTF_ID)
- verbose(env, "%s", kernel_type_name(reg->btf, reg->btf_id));
- verbose(env, "(id=%d", reg->id);
- if (reg_type_may_be_refcounted_or_null(t))
- verbose(env, ",ref_obj_id=%d", reg->ref_obj_id);
+ const char *sep = "";
+
+ verbose(env, "%s", reg_type_str(env, t));
+ if (base_type(t) == PTR_TO_BTF_ID)
+ verbose(env, "%s", btf_type_name(reg->btf, reg->btf_id));
+ verbose(env, "(");
+/*
+ * _a stands for append, was shortened to avoid multiline statements below.
+ * This macro is used to output a comma separated list of attributes.
+ */
+#define verbose_a(fmt, ...) ({ verbose(env, "%s" fmt, sep, __VA_ARGS__); sep = ","; })
+
+ if (reg->id)
+ verbose_a("id=%d", reg->id);
+ if (reg->ref_obj_id)
+ verbose_a("ref_obj_id=%d", reg->ref_obj_id);
+ if (type_is_non_owning_ref(reg->type))
+ verbose_a("%s", "non_own_ref");
if (t != SCALAR_VALUE)
- verbose(env, ",off=%d", reg->off);
+ verbose_a("off=%d", reg->off);
if (type_is_pkt_pointer(t))
- verbose(env, ",r=%d", reg->range);
- else if (t == CONST_PTR_TO_MAP ||
- t == PTR_TO_MAP_KEY ||
- t == PTR_TO_MAP_VALUE ||
- t == PTR_TO_MAP_VALUE_OR_NULL)
- verbose(env, ",ks=%d,vs=%d",
- reg->map_ptr->key_size,
- reg->map_ptr->value_size);
+ verbose_a("r=%d", reg->range);
+ else if (base_type(t) == CONST_PTR_TO_MAP ||
+ base_type(t) == PTR_TO_MAP_KEY ||
+ base_type(t) == PTR_TO_MAP_VALUE)
+ verbose_a("ks=%d,vs=%d",
+ reg->map_ptr->key_size,
+ reg->map_ptr->value_size);
if (tnum_is_const(reg->var_off)) {
/* Typically an immediate SCALAR_VALUE, but
* could be a pointer whose offset is too big
* for reg->off
*/
- verbose(env, ",imm=%llx", reg->var_off.value);
+ verbose_a("imm=%llx", reg->var_off.value);
} else {
if (reg->smin_value != reg->umin_value &&
reg->smin_value != S64_MIN)
- verbose(env, ",smin_value=%lld",
- (long long)reg->smin_value);
+ verbose_a("smin=%lld", (long long)reg->smin_value);
if (reg->smax_value != reg->umax_value &&
reg->smax_value != S64_MAX)
- verbose(env, ",smax_value=%lld",
- (long long)reg->smax_value);
+ verbose_a("smax=%lld", (long long)reg->smax_value);
if (reg->umin_value != 0)
- verbose(env, ",umin_value=%llu",
- (unsigned long long)reg->umin_value);
+ verbose_a("umin=%llu", (unsigned long long)reg->umin_value);
if (reg->umax_value != U64_MAX)
- verbose(env, ",umax_value=%llu",
- (unsigned long long)reg->umax_value);
+ verbose_a("umax=%llu", (unsigned long long)reg->umax_value);
if (!tnum_is_unknown(reg->var_off)) {
char tn_buf[48];
tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
- verbose(env, ",var_off=%s", tn_buf);
+ verbose_a("var_off=%s", tn_buf);
}
if (reg->s32_min_value != reg->smin_value &&
reg->s32_min_value != S32_MIN)
- verbose(env, ",s32_min_value=%d",
- (int)(reg->s32_min_value));
+ verbose_a("s32_min=%d", (int)(reg->s32_min_value));
if (reg->s32_max_value != reg->smax_value &&
reg->s32_max_value != S32_MAX)
- verbose(env, ",s32_max_value=%d",
- (int)(reg->s32_max_value));
+ verbose_a("s32_max=%d", (int)(reg->s32_max_value));
if (reg->u32_min_value != reg->umin_value &&
reg->u32_min_value != U32_MIN)
- verbose(env, ",u32_min_value=%d",
- (int)(reg->u32_min_value));
+ verbose_a("u32_min=%d", (int)(reg->u32_min_value));
if (reg->u32_max_value != reg->umax_value &&
reg->u32_max_value != U32_MAX)
- verbose(env, ",u32_max_value=%d",
- (int)(reg->u32_max_value));
+ verbose_a("u32_max=%d", (int)(reg->u32_max_value));
}
+#undef verbose_a
+
verbose(env, ")");
}
}
@@ -708,24 +1365,62 @@ static void print_verifier_state(struct bpf_verifier_env *env,
for (j = 0; j < BPF_REG_SIZE; j++) {
if (state->stack[i].slot_type[j] != STACK_INVALID)
valid = true;
- types_buf[j] = slot_type_char[
- state->stack[i].slot_type[j]];
+ types_buf[j] = slot_type_char[state->stack[i].slot_type[j]];
}
types_buf[BPF_REG_SIZE] = 0;
if (!valid)
continue;
- verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE);
- print_liveness(env, state->stack[i].spilled_ptr.live);
- if (state->stack[i].slot_type[0] == STACK_SPILL) {
+ if (!print_all && !stack_slot_scratched(env, i))
+ continue;
+ switch (state->stack[i].slot_type[BPF_REG_SIZE - 1]) {
+ case STACK_SPILL:
reg = &state->stack[i].spilled_ptr;
t = reg->type;
- verbose(env, "=%s", reg_type_str[t]);
+
+ verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE);
+ print_liveness(env, reg->live);
+ verbose(env, "=%s", t == SCALAR_VALUE ? "" : reg_type_str(env, t));
if (t == SCALAR_VALUE && reg->precise)
verbose(env, "P");
if (t == SCALAR_VALUE && tnum_is_const(reg->var_off))
verbose(env, "%lld", reg->var_off.value + reg->off);
- } else {
+ break;
+ case STACK_DYNPTR:
+ i += BPF_DYNPTR_NR_SLOTS - 1;
+ reg = &state->stack[i].spilled_ptr;
+
+ verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE);
+ print_liveness(env, reg->live);
+ verbose(env, "=dynptr_%s", dynptr_type_str(reg->dynptr.type));
+ if (reg->ref_obj_id)
+ verbose(env, "(ref_id=%d)", reg->ref_obj_id);
+ break;
+ case STACK_ITER:
+ /* only main slot has ref_obj_id set; skip others */
+ reg = &state->stack[i].spilled_ptr;
+ if (!reg->ref_obj_id)
+ continue;
+
+ verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE);
+ print_liveness(env, reg->live);
+ verbose(env, "=iter_%s(ref_id=%d,state=%s,depth=%u)",
+ iter_type_str(reg->iter.btf, reg->iter.btf_id),
+ reg->ref_obj_id, iter_state_str(reg->iter.state),
+ reg->iter.depth);
+ break;
+ case STACK_MISC:
+ case STACK_ZERO:
+ default:
+ reg = &state->stack[i].spilled_ptr;
+
+ for (j = 0; j < BPF_REG_SIZE; j++)
+ types_buf[j] = slot_type_char[state->stack[i].slot_type[j]];
+ types_buf[BPF_REG_SIZE] = 0;
+
+ verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE);
+ print_liveness(env, reg->live);
verbose(env, "=%s", types_buf);
+ break;
}
}
if (state->acquired_refs && state->refs[0].id) {
@@ -734,7 +1429,31 @@ static void print_verifier_state(struct bpf_verifier_env *env,
if (state->refs[i].id)
verbose(env, ",%d", state->refs[i].id);
}
+ if (state->in_callback_fn)
+ verbose(env, " cb");
+ if (state->in_async_callback_fn)
+ verbose(env, " async_cb");
verbose(env, "\n");
+ mark_verifier_state_clean(env);
+}
+
+static inline u32 vlog_alignment(u32 pos)
+{
+ return round_up(max(pos + BPF_LOG_MIN_ALIGNMENT / 2, BPF_LOG_ALIGNMENT),
+ BPF_LOG_MIN_ALIGNMENT) - pos - 1;
+}
+
+static void print_insn_state(struct bpf_verifier_env *env,
+ const struct bpf_func_state *state)
+{
+ if (env->prev_log_pos && env->prev_log_pos == env->log.end_pos) {
+ /* remove new line character */
+ bpf_vlog_reset(&env->log, env->prev_log_pos - 1);
+ verbose(env, "%*c;", vlog_alignment(env->prev_insn_print_pos), ' ');
+ } else {
+ verbose(env, "%d:", env->insn_idx);
+ }
+ print_verifier_state(env, state, false);
}
/* copy array src of length n * size bytes to dst. dst is reallocated if it's too
@@ -746,6 +1465,8 @@ static void print_verifier_state(struct bpf_verifier_env *env,
*/
static void *copy_array(void *dst, const void *src, size_t n, size_t size, gfp_t flags)
{
+ size_t alloc_bytes;
+ void *orig = dst;
size_t bytes;
if (ZERO_OR_NULL_PTR(src))
@@ -754,11 +1475,11 @@ static void *copy_array(void *dst, const void *src, size_t n, size_t size, gfp_t
if (unlikely(check_mul_overflow(n, size, &bytes)))
return NULL;
- if (ksize(dst) < bytes) {
- kfree(dst);
- dst = kmalloc_track_caller(bytes, flags);
- if (!dst)
- return NULL;
+ alloc_bytes = max(ksize(orig), kmalloc_size_roundup(bytes));
+ dst = krealloc(orig, alloc_bytes, flags);
+ if (!dst) {
+ kfree(orig);
+ return NULL;
}
memcpy(dst, src, bytes);
@@ -773,12 +1494,19 @@ out:
*/
static void *realloc_array(void *arr, size_t old_n, size_t new_n, size_t size)
{
+ size_t alloc_size;
+ void *new_arr;
+
if (!new_n || old_n == new_n)
goto out;
- arr = krealloc_array(arr, new_n, size, GFP_KERNEL);
- if (!arr)
+ alloc_size = kmalloc_size_roundup(size_mul(new_n, size));
+ new_arr = krealloc(arr, alloc_size, GFP_KERNEL);
+ if (!new_arr) {
+ kfree(arr);
return NULL;
+ }
+ arr = new_arr;
if (new_n > old_n)
memset(arr + old_n * size, 0, (new_n - old_n) * size);
@@ -854,6 +1582,7 @@ static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx)
id = ++env->id_gen;
state->refs[new_ofs].id = id;
state->refs[new_ofs].insn_idx = insn_idx;
+ state->refs[new_ofs].callback_ref = state->in_callback_fn ? state->frameno : 0;
return id;
}
@@ -866,6 +1595,9 @@ static int release_reference_state(struct bpf_func_state *state, int ptr_id)
last_idx = state->acquired_refs - 1;
for (i = 0; i < state->acquired_refs; i++) {
if (state->refs[i].id == ptr_id) {
+ /* Cannot release caller references in callbacks */
+ if (state->in_callback_fn && state->refs[i].callback_ref != state->frameno)
+ return -EINVAL;
if (last_idx && i != last_idx)
memcpy(&state->refs[i], &state->refs[last_idx],
sizeof(*state->refs));
@@ -941,8 +1673,10 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
dst_state->frame[i] = NULL;
}
dst_state->speculative = src->speculative;
+ dst_state->active_rcu_lock = src->active_rcu_lock;
dst_state->curframe = src->curframe;
- dst_state->active_spin_lock = src->active_spin_lock;
+ dst_state->active_lock.ptr = src->active_lock.ptr;
+ dst_state->active_lock.id = src->active_lock.id;
dst_state->branches = src->branches;
dst_state->parent = src->parent;
dst_state->first_insn_idx = src->first_insn_idx;
@@ -1023,7 +1757,7 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env,
elem->insn_idx = insn_idx;
elem->prev_insn_idx = prev_insn_idx;
elem->next = env->head;
- elem->log_pos = env->log.len_used;
+ elem->log_pos = env->log.end_pos;
env->head = elem;
env->stack_size++;
err = copy_verifier_state(&elem->st, cur);
@@ -1061,9 +1795,6 @@ static const int caller_saved[CALLER_SAVED_REGS] = {
BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5
};
-static void __mark_reg_not_init(const struct bpf_verifier_env *env,
- struct bpf_reg_state *reg);
-
/* This helper doesn't clear reg->id */
static void ___mark_reg_known(struct bpf_reg_state *reg, u64 imm)
{
@@ -1084,9 +1815,11 @@ static void ___mark_reg_known(struct bpf_reg_state *reg, u64 imm)
*/
static void __mark_reg_known(struct bpf_reg_state *reg, u64 imm)
{
- /* Clear id, off, and union(map_ptr, range) */
+ /* Clear off and union(map_ptr, range) */
memset(((u8 *)reg) + sizeof(reg->type), 0,
offsetof(struct bpf_reg_state, var_off) - sizeof(reg->type));
+ reg->id = 0;
+ reg->ref_obj_id = 0;
___mark_reg_known(reg, imm);
}
@@ -1126,15 +1859,34 @@ static void mark_reg_known_zero(struct bpf_verifier_env *env,
__mark_reg_known_zero(regs + regno);
}
+static void __mark_dynptr_reg(struct bpf_reg_state *reg, enum bpf_dynptr_type type,
+ bool first_slot, int dynptr_id)
+{
+ /* reg->type has no meaning for STACK_DYNPTR, but when we set reg for
+ * callback arguments, it does need to be CONST_PTR_TO_DYNPTR, so simply
+ * set it unconditionally as it is ignored for STACK_DYNPTR anyway.
+ */
+ __mark_reg_known_zero(reg);
+ reg->type = CONST_PTR_TO_DYNPTR;
+ /* Give each dynptr a unique id to uniquely associate slices to it. */
+ reg->id = dynptr_id;
+ reg->dynptr.type = type;
+ reg->dynptr.first_slot = first_slot;
+}
+
static void mark_ptr_not_null_reg(struct bpf_reg_state *reg)
{
- switch (reg->type) {
- case PTR_TO_MAP_VALUE_OR_NULL: {
+ if (base_type(reg->type) == PTR_TO_MAP_VALUE) {
const struct bpf_map *map = reg->map_ptr;
if (map->inner_map_meta) {
reg->type = CONST_PTR_TO_MAP;
reg->map_ptr = map->inner_map_meta;
+ /* transfer reg's id which is unique for every map_lookup_elem
+ * as UID of the inner map.
+ */
+ if (btf_record_has_field(map->inner_map_meta->record, BPF_TIMER))
+ reg->map_uid = reg->id;
} else if (map->map_type == BPF_MAP_TYPE_XSKMAP) {
reg->type = PTR_TO_XDP_SOCK;
} else if (map->map_type == BPF_MAP_TYPE_SOCKMAP ||
@@ -1143,32 +1895,20 @@ static void mark_ptr_not_null_reg(struct bpf_reg_state *reg)
} else {
reg->type = PTR_TO_MAP_VALUE;
}
- break;
- }
- case PTR_TO_SOCKET_OR_NULL:
- reg->type = PTR_TO_SOCKET;
- break;
- case PTR_TO_SOCK_COMMON_OR_NULL:
- reg->type = PTR_TO_SOCK_COMMON;
- break;
- case PTR_TO_TCP_SOCK_OR_NULL:
- reg->type = PTR_TO_TCP_SOCK;
- break;
- case PTR_TO_BTF_ID_OR_NULL:
- reg->type = PTR_TO_BTF_ID;
- break;
- case PTR_TO_MEM_OR_NULL:
- reg->type = PTR_TO_MEM;
- break;
- case PTR_TO_RDONLY_BUF_OR_NULL:
- reg->type = PTR_TO_RDONLY_BUF;
- break;
- case PTR_TO_RDWR_BUF_OR_NULL:
- reg->type = PTR_TO_RDWR_BUF;
- break;
- default:
- WARN_ONCE(1, "unknown nullable register type");
+ return;
}
+
+ reg->type &= ~PTR_MAYBE_NULL;
+}
+
+static void mark_reg_graph_node(struct bpf_reg_state *regs, u32 regno,
+ struct btf_field_graph_root *ds_head)
+{
+ __mark_reg_known_zero(&regs[regno]);
+ regs[regno].type = PTR_TO_BTF_ID | MEM_ALLOC;
+ regs[regno].btf = ds_head->btf;
+ regs[regno].btf_id = ds_head->value_btf_id;
+ regs[regno].off = ds_head->node_offset;
}
static bool reg_is_pkt_pointer(const struct bpf_reg_state *reg)
@@ -1182,6 +1922,12 @@ static bool reg_is_pkt_pointer_any(const struct bpf_reg_state *reg)
reg->type == PTR_TO_PACKET_END;
}
+static bool reg_is_dynptr_slice_pkt(const struct bpf_reg_state *reg)
+{
+ return base_type(reg->type) == PTR_TO_MEM &&
+ (reg->type & DYNPTR_TYPE_SKB || reg->type & DYNPTR_TYPE_XDP);
+}
+
/* Unmodified PTR_TO_PACKET[_META,_END] register from ctx access. */
static bool reg_is_init_pkt_pointer(const struct bpf_reg_state *reg,
enum bpf_reg_type which)
@@ -1341,29 +2087,50 @@ static void __reg_bound_offset(struct bpf_reg_state *reg)
struct tnum var64_off = tnum_intersect(reg->var_off,
tnum_range(reg->umin_value,
reg->umax_value));
- struct tnum var32_off = tnum_intersect(tnum_subreg(reg->var_off),
- tnum_range(reg->u32_min_value,
- reg->u32_max_value));
+ struct tnum var32_off = tnum_intersect(tnum_subreg(var64_off),
+ tnum_range(reg->u32_min_value,
+ reg->u32_max_value));
reg->var_off = tnum_or(tnum_clear_subreg(var64_off), var32_off);
}
+static void reg_bounds_sync(struct bpf_reg_state *reg)
+{
+ /* We might have learned new bounds from the var_off. */
+ __update_reg_bounds(reg);
+ /* We might have learned something about the sign bit. */
+ __reg_deduce_bounds(reg);
+ /* We might have learned some bits from the bounds. */
+ __reg_bound_offset(reg);
+ /* Intersecting with the old var_off might have improved our bounds
+ * slightly, e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc),
+ * then new var_off is (0; 0x7f...fc) which improves our umax.
+ */
+ __update_reg_bounds(reg);
+}
+
+static bool __reg32_bound_s64(s32 a)
+{
+ return a >= 0 && a <= S32_MAX;
+}
+
static void __reg_assign_32_into_64(struct bpf_reg_state *reg)
{
reg->umin_value = reg->u32_min_value;
reg->umax_value = reg->u32_max_value;
- /* Attempt to pull 32-bit signed bounds into 64-bit bounds
- * but must be positive otherwise set to worse case bounds
- * and refine later from tnum.
+
+ /* Attempt to pull 32-bit signed bounds into 64-bit bounds but must
+ * be positive otherwise set to worse case bounds and refine later
+ * from tnum.
*/
- if (reg->s32_min_value >= 0 && reg->s32_max_value >= 0)
- reg->smax_value = reg->s32_max_value;
- else
- reg->smax_value = U32_MAX;
- if (reg->s32_min_value >= 0)
+ if (__reg32_bound_s64(reg->s32_min_value) &&
+ __reg32_bound_s64(reg->s32_max_value)) {
reg->smin_value = reg->s32_min_value;
- else
+ reg->smax_value = reg->s32_max_value;
+ } else {
reg->smin_value = 0;
+ reg->smax_value = U32_MAX;
+ }
}
static void __reg_combine_32_into_64(struct bpf_reg_state *reg)
@@ -1383,32 +2150,23 @@ static void __reg_combine_32_into_64(struct bpf_reg_state *reg)
* so they do not impact tnum bounds calculation.
*/
__mark_reg64_unbounded(reg);
- __update_reg_bounds(reg);
}
-
- /* Intersecting with the old var_off might have improved our bounds
- * slightly. e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc),
- * then new var_off is (0; 0x7f...fc) which improves our umax.
- */
- __reg_deduce_bounds(reg);
- __reg_bound_offset(reg);
- __update_reg_bounds(reg);
+ reg_bounds_sync(reg);
}
static bool __reg64_bound_s32(s64 a)
{
- return a > S32_MIN && a < S32_MAX;
+ return a >= S32_MIN && a <= S32_MAX;
}
static bool __reg64_bound_u32(u64 a)
{
- return a > U32_MIN && a < U32_MAX;
+ return a >= U32_MIN && a <= U32_MAX;
}
static void __reg_combine_64_into_32(struct bpf_reg_state *reg)
{
__mark_reg32_unbounded(reg);
-
if (__reg64_bound_s32(reg->smin_value) && __reg64_bound_s32(reg->smax_value)) {
reg->s32_min_value = (s32)reg->smin_value;
reg->s32_max_value = (s32)reg->smax_value;
@@ -1417,14 +2175,7 @@ static void __reg_combine_64_into_32(struct bpf_reg_state *reg)
reg->u32_min_value = (u32)reg->umin_value;
reg->u32_max_value = (u32)reg->umax_value;
}
-
- /* Intersecting with the old var_off might have improved our bounds
- * slightly. e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc),
- * then new var_off is (0; 0x7f...fc) which improves our umax.
- */
- __reg_deduce_bounds(reg);
- __reg_bound_offset(reg);
- __update_reg_bounds(reg);
+ reg_bounds_sync(reg);
}
/* Mark a register as having a completely unknown (scalar) value. */
@@ -1432,14 +2183,16 @@ static void __mark_reg_unknown(const struct bpf_verifier_env *env,
struct bpf_reg_state *reg)
{
/*
- * Clear type, id, off, and union(map_ptr, range) and
+ * Clear type, off, and union(map_ptr, range) and
* padding between 'type' and union
*/
memset(reg, 0, offsetof(struct bpf_reg_state, var_off));
reg->type = SCALAR_VALUE;
+ reg->id = 0;
+ reg->ref_obj_id = 0;
reg->var_off = tnum_unknown;
reg->frameno = 0;
- reg->precise = env->subprog_cnt > 1 || !env->bpf_capable;
+ reg->precise = !env->bpf_capable;
__mark_reg_unbounded(reg);
}
@@ -1479,14 +2232,15 @@ static void mark_reg_not_init(struct bpf_verifier_env *env,
static void mark_btf_ld_reg(struct bpf_verifier_env *env,
struct bpf_reg_state *regs, u32 regno,
enum bpf_reg_type reg_type,
- struct btf *btf, u32 btf_id)
+ struct btf *btf, u32 btf_id,
+ enum bpf_type_flag flag)
{
if (reg_type == SCALAR_VALUE) {
mark_reg_unknown(env, regs, regno);
return;
}
mark_reg_known_zero(env, regs, regno);
- regs[regno].type = PTR_TO_BTF_ID;
+ regs[regno].type = PTR_TO_BTF_ID | flag;
regs[regno].btf = btf;
regs[regno].btf_id = btf_id;
}
@@ -1519,9 +2273,59 @@ static void init_func_state(struct bpf_verifier_env *env,
state->callsite = callsite;
state->frameno = frameno;
state->subprogno = subprogno;
+ state->callback_ret_range = tnum_range(0, 0);
init_reg_state(env, state);
+ mark_verifier_state_scratched(env);
+}
+
+/* Similar to push_stack(), but for async callbacks */
+static struct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env,
+ int insn_idx, int prev_insn_idx,
+ int subprog)
+{
+ struct bpf_verifier_stack_elem *elem;
+ struct bpf_func_state *frame;
+
+ elem = kzalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL);
+ if (!elem)
+ goto err;
+
+ elem->insn_idx = insn_idx;
+ elem->prev_insn_idx = prev_insn_idx;
+ elem->next = env->head;
+ elem->log_pos = env->log.end_pos;
+ env->head = elem;
+ env->stack_size++;
+ if (env->stack_size > BPF_COMPLEXITY_LIMIT_JMP_SEQ) {
+ verbose(env,
+ "The sequence of %d jumps is too complex for async cb.\n",
+ env->stack_size);
+ goto err;
+ }
+ /* Unlike push_stack() do not copy_verifier_state().
+ * The caller state doesn't matter.
+ * This is async callback. It starts in a fresh stack.
+ * Initialize it similar to do_check_common().
+ */
+ elem->st.branches = 1;
+ frame = kzalloc(sizeof(*frame), GFP_KERNEL);
+ if (!frame)
+ goto err;
+ init_func_state(env, frame,
+ BPF_MAIN_FUNC /* callsite */,
+ 0 /* frameno within this callchain */,
+ subprog /* subprog number within this prog */);
+ elem->st.frame[0] = frame;
+ return &elem->st;
+err:
+ free_verifier_state(env->cur_state, true);
+ env->cur_state = NULL;
+ /* pop all elements and return */
+ while (!pop_stack(env, NULL, NULL, false));
+ return NULL;
}
+
enum reg_arg_type {
SRC_OP, /* register is used as source operand */
DST_OP, /* register is used as destination operand */
@@ -1569,52 +2373,184 @@ static int add_subprog(struct bpf_verifier_env *env, int off)
return env->subprog_cnt - 1;
}
+#define MAX_KFUNC_DESCS 256
+#define MAX_KFUNC_BTFS 256
+
struct bpf_kfunc_desc {
struct btf_func_model func_model;
u32 func_id;
s32 imm;
+ u16 offset;
+ unsigned long addr;
+};
+
+struct bpf_kfunc_btf {
+ struct btf *btf;
+ struct module *module;
+ u16 offset;
};
-#define MAX_KFUNC_DESCS 256
struct bpf_kfunc_desc_tab {
+ /* Sorted by func_id (BTF ID) and offset (fd_array offset) during
+ * verification. JITs do lookups by bpf_insn, where func_id may not be
+ * available, therefore at the end of verification do_misc_fixups()
+ * sorts this by imm and offset.
+ */
struct bpf_kfunc_desc descs[MAX_KFUNC_DESCS];
u32 nr_descs;
};
-static int kfunc_desc_cmp_by_id(const void *a, const void *b)
+struct bpf_kfunc_btf_tab {
+ struct bpf_kfunc_btf descs[MAX_KFUNC_BTFS];
+ u32 nr_descs;
+};
+
+static int kfunc_desc_cmp_by_id_off(const void *a, const void *b)
{
const struct bpf_kfunc_desc *d0 = a;
const struct bpf_kfunc_desc *d1 = b;
/* func_id is not greater than BTF_MAX_TYPE */
- return d0->func_id - d1->func_id;
+ return d0->func_id - d1->func_id ?: d0->offset - d1->offset;
+}
+
+static int kfunc_btf_cmp_by_off(const void *a, const void *b)
+{
+ const struct bpf_kfunc_btf *d0 = a;
+ const struct bpf_kfunc_btf *d1 = b;
+
+ return d0->offset - d1->offset;
}
static const struct bpf_kfunc_desc *
-find_kfunc_desc(const struct bpf_prog *prog, u32 func_id)
+find_kfunc_desc(const struct bpf_prog *prog, u32 func_id, u16 offset)
{
struct bpf_kfunc_desc desc = {
.func_id = func_id,
+ .offset = offset,
};
struct bpf_kfunc_desc_tab *tab;
tab = prog->aux->kfunc_tab;
return bsearch(&desc, tab->descs, tab->nr_descs,
- sizeof(tab->descs[0]), kfunc_desc_cmp_by_id);
+ sizeof(tab->descs[0]), kfunc_desc_cmp_by_id_off);
+}
+
+int bpf_get_kfunc_addr(const struct bpf_prog *prog, u32 func_id,
+ u16 btf_fd_idx, u8 **func_addr)
+{
+ const struct bpf_kfunc_desc *desc;
+
+ desc = find_kfunc_desc(prog, func_id, btf_fd_idx);
+ if (!desc)
+ return -EFAULT;
+
+ *func_addr = (u8 *)desc->addr;
+ return 0;
}
-static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id)
+static struct btf *__find_kfunc_desc_btf(struct bpf_verifier_env *env,
+ s16 offset)
+{
+ struct bpf_kfunc_btf kf_btf = { .offset = offset };
+ struct bpf_kfunc_btf_tab *tab;
+ struct bpf_kfunc_btf *b;
+ struct module *mod;
+ struct btf *btf;
+ int btf_fd;
+
+ tab = env->prog->aux->kfunc_btf_tab;
+ b = bsearch(&kf_btf, tab->descs, tab->nr_descs,
+ sizeof(tab->descs[0]), kfunc_btf_cmp_by_off);
+ if (!b) {
+ if (tab->nr_descs == MAX_KFUNC_BTFS) {
+ verbose(env, "too many different module BTFs\n");
+ return ERR_PTR(-E2BIG);
+ }
+
+ if (bpfptr_is_null(env->fd_array)) {
+ verbose(env, "kfunc offset > 0 without fd_array is invalid\n");
+ return ERR_PTR(-EPROTO);
+ }
+
+ if (copy_from_bpfptr_offset(&btf_fd, env->fd_array,
+ offset * sizeof(btf_fd),
+ sizeof(btf_fd)))
+ return ERR_PTR(-EFAULT);
+
+ btf = btf_get_by_fd(btf_fd);
+ if (IS_ERR(btf)) {
+ verbose(env, "invalid module BTF fd specified\n");
+ return btf;
+ }
+
+ if (!btf_is_module(btf)) {
+ verbose(env, "BTF fd for kfunc is not a module BTF\n");
+ btf_put(btf);
+ return ERR_PTR(-EINVAL);
+ }
+
+ mod = btf_try_get_module(btf);
+ if (!mod) {
+ btf_put(btf);
+ return ERR_PTR(-ENXIO);
+ }
+
+ b = &tab->descs[tab->nr_descs++];
+ b->btf = btf;
+ b->module = mod;
+ b->offset = offset;
+
+ sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]),
+ kfunc_btf_cmp_by_off, NULL);
+ }
+ return b->btf;
+}
+
+void bpf_free_kfunc_btf_tab(struct bpf_kfunc_btf_tab *tab)
+{
+ if (!tab)
+ return;
+
+ while (tab->nr_descs--) {
+ module_put(tab->descs[tab->nr_descs].module);
+ btf_put(tab->descs[tab->nr_descs].btf);
+ }
+ kfree(tab);
+}
+
+static struct btf *find_kfunc_desc_btf(struct bpf_verifier_env *env, s16 offset)
+{
+ if (offset) {
+ if (offset < 0) {
+ /* In the future, this can be allowed to increase limit
+ * of fd index into fd_array, interpreted as u16.
+ */
+ verbose(env, "negative offset disallowed for kernel module function call\n");
+ return ERR_PTR(-EINVAL);
+ }
+
+ return __find_kfunc_desc_btf(env, offset);
+ }
+ return btf_vmlinux ?: ERR_PTR(-ENOENT);
+}
+
+static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset)
{
const struct btf_type *func, *func_proto;
+ struct bpf_kfunc_btf_tab *btf_tab;
struct bpf_kfunc_desc_tab *tab;
struct bpf_prog_aux *prog_aux;
struct bpf_kfunc_desc *desc;
const char *func_name;
+ struct btf *desc_btf;
+ unsigned long call_imm;
unsigned long addr;
int err;
prog_aux = env->prog->aux;
tab = prog_aux->kfunc_tab;
+ btf_tab = prog_aux->kfunc_btf_tab;
if (!tab) {
if (!btf_vmlinux) {
verbose(env, "calling kernel function is not supported without CONFIG_DEBUG_INFO_BTF\n");
@@ -1642,7 +2578,29 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id)
prog_aux->kfunc_tab = tab;
}
- if (find_kfunc_desc(env->prog, func_id))
+ /* func_id == 0 is always invalid, but instead of returning an error, be
+ * conservative and wait until the code elimination pass before returning
+ * error, so that invalid calls that get pruned out can be in BPF programs
+ * loaded from userspace. It is also required that offset be untouched
+ * for such calls.
+ */
+ if (!func_id && !offset)
+ return 0;
+
+ if (!btf_tab && offset) {
+ btf_tab = kzalloc(sizeof(*btf_tab), GFP_KERNEL);
+ if (!btf_tab)
+ return -ENOMEM;
+ prog_aux->kfunc_btf_tab = btf_tab;
+ }
+
+ desc_btf = find_kfunc_desc_btf(env, offset);
+ if (IS_ERR(desc_btf)) {
+ verbose(env, "failed to find BTF for kernel function\n");
+ return PTR_ERR(desc_btf);
+ }
+
+ if (find_kfunc_desc(env->prog, func_id, offset))
return 0;
if (tab->nr_descs == MAX_KFUNC_DESCS) {
@@ -1650,52 +2608,73 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id)
return -E2BIG;
}
- func = btf_type_by_id(btf_vmlinux, func_id);
+ func = btf_type_by_id(desc_btf, func_id);
if (!func || !btf_type_is_func(func)) {
verbose(env, "kernel btf_id %u is not a function\n",
func_id);
return -EINVAL;
}
- func_proto = btf_type_by_id(btf_vmlinux, func->type);
+ func_proto = btf_type_by_id(desc_btf, func->type);
if (!func_proto || !btf_type_is_func_proto(func_proto)) {
verbose(env, "kernel function btf_id %u does not have a valid func_proto\n",
func_id);
return -EINVAL;
}
- func_name = btf_name_by_offset(btf_vmlinux, func->name_off);
+ func_name = btf_name_by_offset(desc_btf, func->name_off);
addr = kallsyms_lookup_name(func_name);
if (!addr) {
verbose(env, "cannot find address for kernel function %s\n",
func_name);
return -EINVAL;
}
+ specialize_kfunc(env, func_id, offset, &addr);
+
+ if (bpf_jit_supports_far_kfunc_call()) {
+ call_imm = func_id;
+ } else {
+ call_imm = BPF_CALL_IMM(addr);
+ /* Check whether the relative offset overflows desc->imm */
+ if ((unsigned long)(s32)call_imm != call_imm) {
+ verbose(env, "address of kernel function %s is out of range\n",
+ func_name);
+ return -EINVAL;
+ }
+ }
+
+ if (bpf_dev_bound_kfunc_id(func_id)) {
+ err = bpf_dev_bound_kfunc_check(&env->log, prog_aux);
+ if (err)
+ return err;
+ }
desc = &tab->descs[tab->nr_descs++];
desc->func_id = func_id;
- desc->imm = BPF_CAST_CALL(addr) - __bpf_call_base;
- err = btf_distill_func_proto(&env->log, btf_vmlinux,
+ desc->imm = call_imm;
+ desc->offset = offset;
+ desc->addr = addr;
+ err = btf_distill_func_proto(&env->log, desc_btf,
func_proto, func_name,
&desc->func_model);
if (!err)
sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]),
- kfunc_desc_cmp_by_id, NULL);
+ kfunc_desc_cmp_by_id_off, NULL);
return err;
}
-static int kfunc_desc_cmp_by_imm(const void *a, const void *b)
+static int kfunc_desc_cmp_by_imm_off(const void *a, const void *b)
{
const struct bpf_kfunc_desc *d0 = a;
const struct bpf_kfunc_desc *d1 = b;
- if (d0->imm > d1->imm)
- return 1;
- else if (d0->imm < d1->imm)
- return -1;
+ if (d0->imm != d1->imm)
+ return d0->imm < d1->imm ? -1 : 1;
+ if (d0->offset != d1->offset)
+ return d0->offset < d1->offset ? -1 : 1;
return 0;
}
-static void sort_kfunc_descs_by_imm(struct bpf_prog *prog)
+static void sort_kfunc_descs_by_imm_off(struct bpf_prog *prog)
{
struct bpf_kfunc_desc_tab *tab;
@@ -1704,7 +2683,7 @@ static void sort_kfunc_descs_by_imm(struct bpf_prog *prog)
return;
sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]),
- kfunc_desc_cmp_by_imm, NULL);
+ kfunc_desc_cmp_by_imm_off, NULL);
}
bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog)
@@ -1718,13 +2697,14 @@ bpf_jit_find_kfunc_model(const struct bpf_prog *prog,
{
const struct bpf_kfunc_desc desc = {
.imm = insn->imm,
+ .offset = insn->off,
};
const struct bpf_kfunc_desc *res;
struct bpf_kfunc_desc_tab *tab;
tab = prog->aux->kfunc_tab;
res = bsearch(&desc, tab->descs, tab->nr_descs,
- sizeof(tab->descs[0]), kfunc_desc_cmp_by_imm);
+ sizeof(tab->descs[0]), kfunc_desc_cmp_by_imm_off);
return res ? &res->func_model : NULL;
}
@@ -1750,16 +2730,10 @@ static int add_subprog_and_kfunc(struct bpf_verifier_env *env)
return -EPERM;
}
- if (bpf_pseudo_func(insn)) {
- ret = add_subprog(env, i + insn->imm + 1);
- if (ret >= 0)
- /* remember subprog */
- insn[1].imm = ret;
- } else if (bpf_pseudo_call(insn)) {
+ if (bpf_pseudo_func(insn) || bpf_pseudo_call(insn))
ret = add_subprog(env, i + insn->imm + 1);
- } else {
- ret = add_kfunc_call(env, insn->imm);
- }
+ else
+ ret = add_kfunc_call(env, insn->imm, insn->off);
if (ret < 0)
return ret;
@@ -1791,8 +2765,8 @@ static int check_subprogs(struct bpf_verifier_env *env)
u8 code = insn[i].code;
if (code == (BPF_JMP | BPF_CALL) &&
- insn[i].imm == BPF_FUNC_tail_call &&
- insn[i].src_reg != BPF_PSEUDO_CALL)
+ insn[i].src_reg == 0 &&
+ insn[i].imm == BPF_FUNC_tail_call)
subprog[cur_subprog].has_tail_call = true;
if (BPF_CLASS(code) == BPF_LD &&
(BPF_MODE(code) == BPF_ABS || BPF_MODE(code) == BPF_IND))
@@ -1842,7 +2816,7 @@ static int mark_reg_read(struct bpf_verifier_env *env,
break;
if (parent->live & REG_LIVE_DONE) {
verbose(env, "verifier BUG type %s var_off %lld off %d\n",
- reg_type_str[parent->type],
+ reg_type_str(env, parent->type),
parent->var_off.value, parent->off);
return -EFAULT;
}
@@ -1877,6 +2851,51 @@ static int mark_reg_read(struct bpf_verifier_env *env,
return 0;
}
+static int mark_dynptr_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+ struct bpf_func_state *state = func(env, reg);
+ int spi, ret;
+
+ /* For CONST_PTR_TO_DYNPTR, it must have already been done by
+ * check_reg_arg in check_helper_call and mark_btf_func_reg_size in
+ * check_kfunc_call.
+ */
+ if (reg->type == CONST_PTR_TO_DYNPTR)
+ return 0;
+ spi = dynptr_get_spi(env, reg);
+ if (spi < 0)
+ return spi;
+ /* Caller ensures dynptr is valid and initialized, which means spi is in
+ * bounds and spi is the first dynptr slot. Simply mark stack slot as
+ * read.
+ */
+ ret = mark_reg_read(env, &state->stack[spi].spilled_ptr,
+ state->stack[spi].spilled_ptr.parent, REG_LIVE_READ64);
+ if (ret)
+ return ret;
+ return mark_reg_read(env, &state->stack[spi - 1].spilled_ptr,
+ state->stack[spi - 1].spilled_ptr.parent, REG_LIVE_READ64);
+}
+
+static int mark_iter_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+ int spi, int nr_slots)
+{
+ struct bpf_func_state *state = func(env, reg);
+ int err, i;
+
+ for (i = 0; i < nr_slots; i++) {
+ struct bpf_reg_state *st = &state->stack[spi - i].spilled_ptr;
+
+ err = mark_reg_read(env, st, st->parent, REG_LIVE_READ64);
+ if (err)
+ return err;
+
+ mark_stack_slot_scratched(env, spi - i);
+ }
+
+ return 0;
+}
+
/* This function is supposed to be used by the following 32-bit optimization
* code only. It returns TRUE if the source or destination register operates
* on 64-bit, otherwise return FALSE.
@@ -2026,6 +3045,8 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
return -EINVAL;
}
+ mark_reg_scratched(env, regno);
+
reg = &regs[regno];
rw64 = is_reg64(env, insn, regno, reg, t);
if (t == SRC_OP) {
@@ -2057,15 +3078,30 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
return 0;
}
+static void mark_jmp_point(struct bpf_verifier_env *env, int idx)
+{
+ env->insn_aux_data[idx].jmp_point = true;
+}
+
+static bool is_jmp_point(struct bpf_verifier_env *env, int insn_idx)
+{
+ return env->insn_aux_data[insn_idx].jmp_point;
+}
+
/* for any branch, call, exit record the history of jmps in the given state */
static int push_jmp_history(struct bpf_verifier_env *env,
struct bpf_verifier_state *cur)
{
u32 cnt = cur->jmp_history_cnt;
struct bpf_idx_pair *p;
+ size_t alloc_size;
+
+ if (!is_jmp_point(env, env->insn_idx))
+ return 0;
cnt++;
- p = krealloc(cur->jmp_history, cnt * sizeof(*p), GFP_USER);
+ alloc_size = kmalloc_size_roundup(size_mul(cnt, sizeof(*p)));
+ p = krealloc(cur->jmp_history, alloc_size, GFP_USER);
if (!p)
return -ENOMEM;
p[cnt - 1].idx = env->insn_idx;
@@ -2095,12 +3131,17 @@ static int get_prev_insn_idx(struct bpf_verifier_state *st, int i,
static const char *disasm_kfunc_name(void *data, const struct bpf_insn *insn)
{
const struct btf_type *func;
+ struct btf *desc_btf;
if (insn->src_reg != BPF_PSEUDO_KFUNC_CALL)
return NULL;
- func = btf_type_by_id(btf_vmlinux, insn->imm);
- return btf_name_by_offset(btf_vmlinux, func->name_off);
+ desc_btf = find_kfunc_desc_btf(data, insn->off);
+ if (IS_ERR(desc_btf))
+ return "<error>";
+
+ func = btf_type_by_id(desc_btf, insn->imm);
+ return btf_name_by_offset(desc_btf, func->name_off);
}
/* For given verifier state backtrack_insn() is called from the last insn to
@@ -2125,7 +3166,7 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx,
if (insn->code == 0)
return 0;
- if (env->log.level & BPF_LOG_LEVEL) {
+ if (env->log.level & BPF_LOG_LEVEL2) {
verbose(env, "regs=%x stack=%llx before ", *reg_mask, *stack_mask);
verbose(env, "%d: ", idx);
print_bpf_insn(&cbs, insn, env->allow_ptr_leaks);
@@ -2175,8 +3216,6 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx,
*/
if (insn->src_reg != BPF_REG_FP)
return 0;
- if (BPF_SIZE(insn->code) != BPF_DW)
- return 0;
/* dreg = *(u64 *)[fp - off] was a fill from the stack.
* that [fp - off] slot contains scalar that needs to be
@@ -2199,8 +3238,6 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx,
/* scalars can only be spilled into stack */
if (insn->dst_reg != BPF_REG_FP)
return 0;
- if (BPF_SIZE(insn->code) != BPF_DW)
- return 0;
spi = (-insn->off - 1) / BPF_REG_SIZE;
if (spi >= 64) {
verbose(env, "BUG spi %d\n", spi);
@@ -2216,6 +3253,17 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx,
if (opcode == BPF_CALL) {
if (insn->src_reg == BPF_PSEUDO_CALL)
return -ENOTSUPP;
+ /* BPF helpers that invoke callback subprogs are
+ * equivalent to BPF_PSEUDO_CALL above
+ */
+ if (insn->src_reg == 0 && is_callback_calling_function(insn->imm))
+ return -ENOTSUPP;
+ /* kfunc with imm==0 is invalid and fixup_kfunc_call will
+ * catch this error later. Make backtracking conservative
+ * with ENOTSUPP.
+ */
+ if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL && insn->imm == 0)
+ return -ENOTSUPP;
/* regular helper call sets R0 */
*reg_mask &= ~1;
if (*reg_mask & 0x3f) {
@@ -2228,6 +3276,21 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx,
}
} else if (opcode == BPF_EXIT) {
return -ENOTSUPP;
+ } else if (BPF_SRC(insn->code) == BPF_X) {
+ if (!(*reg_mask & (dreg | sreg)))
+ return 0;
+ /* dreg <cond> sreg
+ * Both dreg and sreg need precision before
+ * this insn. If only sreg was marked precise
+ * before it would be equally necessary to
+ * propagate it to dreg.
+ */
+ *reg_mask |= (sreg | dreg);
+ /* else dreg <cond> K
+ * Only dreg still needs precision before
+ * this insn, so for the K-based conditional
+ * there is nothing new to be marked.
+ */
}
} else if (class == BPF_LD) {
if (!(*reg_mask & dreg))
@@ -2305,8 +3368,11 @@ static void mark_all_scalars_precise(struct bpf_verifier_env *env,
/* big hammer: mark all scalars precise in this path.
* pop_stack may still get !precise scalars.
+ * We also skip current state and go straight to first parent state,
+ * because precision markings in current non-checkpointed state are
+ * not needed. See why in the comment in __mark_chain_precision below.
*/
- for (; st; st = st->parent)
+ for (st = st->parent; st; st = st->parent) {
for (i = 0; i <= st->curframe; i++) {
func = st->frame[i];
for (j = 0; j < BPF_REG_FP; j++) {
@@ -2316,7 +3382,7 @@ static void mark_all_scalars_precise(struct bpf_verifier_env *env,
reg->precise = true;
}
for (j = 0; j < func->allocated_stack / BPF_REG_SIZE; j++) {
- if (func->stack[j].slot_type[0] != STACK_SPILL)
+ if (!is_spilled_reg(&func->stack[j]))
continue;
reg = &func->stack[j].spilled_ptr;
if (reg->type != SCALAR_VALUE)
@@ -2324,9 +3390,122 @@ static void mark_all_scalars_precise(struct bpf_verifier_env *env,
reg->precise = true;
}
}
+ }
}
-static int __mark_chain_precision(struct bpf_verifier_env *env, int regno,
+static void mark_all_scalars_imprecise(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
+{
+ struct bpf_func_state *func;
+ struct bpf_reg_state *reg;
+ int i, j;
+
+ for (i = 0; i <= st->curframe; i++) {
+ func = st->frame[i];
+ for (j = 0; j < BPF_REG_FP; j++) {
+ reg = &func->regs[j];
+ if (reg->type != SCALAR_VALUE)
+ continue;
+ reg->precise = false;
+ }
+ for (j = 0; j < func->allocated_stack / BPF_REG_SIZE; j++) {
+ if (!is_spilled_reg(&func->stack[j]))
+ continue;
+ reg = &func->stack[j].spilled_ptr;
+ if (reg->type != SCALAR_VALUE)
+ continue;
+ reg->precise = false;
+ }
+ }
+}
+
+/*
+ * __mark_chain_precision() backtracks BPF program instruction sequence and
+ * chain of verifier states making sure that register *regno* (if regno >= 0)
+ * and/or stack slot *spi* (if spi >= 0) are marked as precisely tracked
+ * SCALARS, as well as any other registers and slots that contribute to
+ * a tracked state of given registers/stack slots, depending on specific BPF
+ * assembly instructions (see backtrack_insns() for exact instruction handling
+ * logic). This backtracking relies on recorded jmp_history and is able to
+ * traverse entire chain of parent states. This process ends only when all the
+ * necessary registers/slots and their transitive dependencies are marked as
+ * precise.
+ *
+ * One important and subtle aspect is that precise marks *do not matter* in
+ * the currently verified state (current state). It is important to understand
+ * why this is the case.
+ *
+ * First, note that current state is the state that is not yet "checkpointed",
+ * i.e., it is not yet put into env->explored_states, and it has no children
+ * states as well. It's ephemeral, and can end up either a) being discarded if
+ * compatible explored state is found at some point or BPF_EXIT instruction is
+ * reached or b) checkpointed and put into env->explored_states, branching out
+ * into one or more children states.
+ *
+ * In the former case, precise markings in current state are completely
+ * ignored by state comparison code (see regsafe() for details). Only
+ * checkpointed ("old") state precise markings are important, and if old
+ * state's register/slot is precise, regsafe() assumes current state's
+ * register/slot as precise and checks value ranges exactly and precisely. If
+ * states turn out to be compatible, current state's necessary precise
+ * markings and any required parent states' precise markings are enforced
+ * after the fact with propagate_precision() logic, after the fact. But it's
+ * important to realize that in this case, even after marking current state
+ * registers/slots as precise, we immediately discard current state. So what
+ * actually matters is any of the precise markings propagated into current
+ * state's parent states, which are always checkpointed (due to b) case above).
+ * As such, for scenario a) it doesn't matter if current state has precise
+ * markings set or not.
+ *
+ * Now, for the scenario b), checkpointing and forking into child(ren)
+ * state(s). Note that before current state gets to checkpointing step, any
+ * processed instruction always assumes precise SCALAR register/slot
+ * knowledge: if precise value or range is useful to prune jump branch, BPF
+ * verifier takes this opportunity enthusiastically. Similarly, when
+ * register's value is used to calculate offset or memory address, exact
+ * knowledge of SCALAR range is assumed, checked, and enforced. So, similar to
+ * what we mentioned above about state comparison ignoring precise markings
+ * during state comparison, BPF verifier ignores and also assumes precise
+ * markings *at will* during instruction verification process. But as verifier
+ * assumes precision, it also propagates any precision dependencies across
+ * parent states, which are not yet finalized, so can be further restricted
+ * based on new knowledge gained from restrictions enforced by their children
+ * states. This is so that once those parent states are finalized, i.e., when
+ * they have no more active children state, state comparison logic in
+ * is_state_visited() would enforce strict and precise SCALAR ranges, if
+ * required for correctness.
+ *
+ * To build a bit more intuition, note also that once a state is checkpointed,
+ * the path we took to get to that state is not important. This is crucial
+ * property for state pruning. When state is checkpointed and finalized at
+ * some instruction index, it can be correctly and safely used to "short
+ * circuit" any *compatible* state that reaches exactly the same instruction
+ * index. I.e., if we jumped to that instruction from a completely different
+ * code path than original finalized state was derived from, it doesn't
+ * matter, current state can be discarded because from that instruction
+ * forward having a compatible state will ensure we will safely reach the
+ * exit. States describe preconditions for further exploration, but completely
+ * forget the history of how we got here.
+ *
+ * This also means that even if we needed precise SCALAR range to get to
+ * finalized state, but from that point forward *that same* SCALAR register is
+ * never used in a precise context (i.e., it's precise value is not needed for
+ * correctness), it's correct and safe to mark such register as "imprecise"
+ * (i.e., precise marking set to false). This is what we rely on when we do
+ * not set precise marking in current state. If no child state requires
+ * precision for any given SCALAR register, it's safe to dictate that it can
+ * be imprecise. If any child state does require this register to be precise,
+ * we'll mark it precise later retroactively during precise markings
+ * propagation from child state to parent states.
+ *
+ * Skipping precise marking setting in current state is a mild version of
+ * relying on the above observation. But we can utilize this property even
+ * more aggressively by proactively forgetting any precise marking in the
+ * current state (which we inherited from the parent state), right before we
+ * checkpoint it and branch off into new child state. This is done by
+ * mark_all_scalars_imprecise() to hopefully get more permissive and generic
+ * finalized states which help in short circuiting more future states.
+ */
+static int __mark_chain_precision(struct bpf_verifier_env *env, int frame, int regno,
int spi)
{
struct bpf_verifier_state *st = env->cur_state;
@@ -2343,22 +3522,22 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno,
if (!env->bpf_capable)
return 0;
- func = st->frame[st->curframe];
+ /* Do sanity checks against current state of register and/or stack
+ * slot, but don't set precise flag in current state, as precision
+ * tracking in the current state is unnecessary.
+ */
+ func = st->frame[frame];
if (regno >= 0) {
reg = &func->regs[regno];
if (reg->type != SCALAR_VALUE) {
WARN_ONCE(1, "backtracing misuse");
return -EFAULT;
}
- if (!reg->precise)
- new_marks = true;
- else
- reg_mask = 0;
- reg->precise = true;
+ new_marks = true;
}
while (spi >= 0) {
- if (func->stack[spi].slot_type[0] != STACK_SPILL) {
+ if (!is_spilled_reg(&func->stack[spi])) {
stack_mask = 0;
break;
}
@@ -2367,11 +3546,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno,
stack_mask = 0;
break;
}
- if (!reg->precise)
- new_marks = true;
- else
- stack_mask = 0;
- reg->precise = true;
+ new_marks = true;
break;
}
@@ -2379,12 +3554,42 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno,
return 0;
if (!reg_mask && !stack_mask)
return 0;
+
for (;;) {
DECLARE_BITMAP(mask, 64);
u32 history = st->jmp_history_cnt;
- if (env->log.level & BPF_LOG_LEVEL)
+ if (env->log.level & BPF_LOG_LEVEL2)
verbose(env, "last_idx %d first_idx %d\n", last_idx, first_idx);
+
+ if (last_idx < 0) {
+ /* we are at the entry into subprog, which
+ * is expected for global funcs, but only if
+ * requested precise registers are R1-R5
+ * (which are global func's input arguments)
+ */
+ if (st->curframe == 0 &&
+ st->frame[0]->subprogno > 0 &&
+ st->frame[0]->callsite == BPF_MAIN_FUNC &&
+ stack_mask == 0 && (reg_mask & ~0x3e) == 0) {
+ bitmap_from_u64(mask, reg_mask);
+ for_each_set_bit(i, mask, 32) {
+ reg = &st->frame[0]->regs[i];
+ if (reg->type != SCALAR_VALUE) {
+ reg_mask &= ~(1u << i);
+ continue;
+ }
+ reg->precise = true;
+ }
+ return 0;
+ }
+
+ verbose(env, "BUG backtracing func entry subprog %d reg_mask %x stack_mask %llx\n",
+ st->frame[0]->subprogno, reg_mask, stack_mask);
+ WARN_ONCE(1, "verifier backtracking bug");
+ return -EFAULT;
+ }
+
for (i = last_idx;;) {
if (skip_first) {
err = 0;
@@ -2424,7 +3629,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno,
break;
new_marks = false;
- func = st->frame[st->curframe];
+ func = st->frame[frame];
bitmap_from_u64(mask, reg_mask);
for_each_set_bit(i, mask, 32) {
reg = &func->regs[i];
@@ -2457,7 +3662,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno,
return 0;
}
- if (func->stack[i].slot_type[0] != STACK_SPILL) {
+ if (!is_spilled_reg(&func->stack[i])) {
stack_mask &= ~(1ull << i);
continue;
}
@@ -2470,11 +3675,11 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno,
new_marks = true;
reg->precise = true;
}
- if (env->log.level & BPF_LOG_LEVEL) {
- print_verifier_state(env, func);
- verbose(env, "parent %s regs=%x stack=%llx marks\n",
+ if (env->log.level & BPF_LOG_LEVEL2) {
+ verbose(env, "parent %s regs=%x stack=%llx marks:",
new_marks ? "didn't have" : "already had",
reg_mask, stack_mask);
+ print_verifier_state(env, func, true);
}
if (!reg_mask && !stack_mask)
@@ -2488,21 +3693,25 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno,
return 0;
}
-static int mark_chain_precision(struct bpf_verifier_env *env, int regno)
+int mark_chain_precision(struct bpf_verifier_env *env, int regno)
+{
+ return __mark_chain_precision(env, env->cur_state->curframe, regno, -1);
+}
+
+static int mark_chain_precision_frame(struct bpf_verifier_env *env, int frame, int regno)
{
- return __mark_chain_precision(env, regno, -1);
+ return __mark_chain_precision(env, frame, regno, -1);
}
-static int mark_chain_precision_stack(struct bpf_verifier_env *env, int spi)
+static int mark_chain_precision_stack_frame(struct bpf_verifier_env *env, int frame, int spi)
{
- return __mark_chain_precision(env, -1, spi);
+ return __mark_chain_precision(env, frame, -1, spi);
}
static bool is_spillable_regtype(enum bpf_reg_type type)
{
- switch (type) {
+ switch (base_type(type)) {
case PTR_TO_MAP_VALUE:
- case PTR_TO_MAP_VALUE_OR_NULL:
case PTR_TO_STACK:
case PTR_TO_CTX:
case PTR_TO_PACKET:
@@ -2511,21 +3720,12 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
case PTR_TO_FLOW_KEYS:
case CONST_PTR_TO_MAP:
case PTR_TO_SOCKET:
- case PTR_TO_SOCKET_OR_NULL:
case PTR_TO_SOCK_COMMON:
- case PTR_TO_SOCK_COMMON_OR_NULL:
case PTR_TO_TCP_SOCK:
- case PTR_TO_TCP_SOCK_OR_NULL:
case PTR_TO_XDP_SOCK:
case PTR_TO_BTF_ID:
- case PTR_TO_BTF_ID_OR_NULL:
- case PTR_TO_RDONLY_BUF:
- case PTR_TO_RDONLY_BUF_OR_NULL:
- case PTR_TO_RDWR_BUF:
- case PTR_TO_RDWR_BUF_OR_NULL:
- case PTR_TO_PERCPU_BTF_ID:
+ case PTR_TO_BUF:
case PTR_TO_MEM:
- case PTR_TO_MEM_OR_NULL:
case PTR_TO_FUNC:
case PTR_TO_MAP_KEY:
return true;
@@ -2568,16 +3768,38 @@ static bool __is_pointer_value(bool allow_ptr_leaks,
return reg->type != SCALAR_VALUE;
}
+/* Copy src state preserving dst->parent and dst->live fields */
+static void copy_register_state(struct bpf_reg_state *dst, const struct bpf_reg_state *src)
+{
+ struct bpf_reg_state *parent = dst->parent;
+ enum bpf_reg_liveness live = dst->live;
+
+ *dst = *src;
+ dst->parent = parent;
+ dst->live = live;
+}
+
static void save_register_state(struct bpf_func_state *state,
- int spi, struct bpf_reg_state *reg)
+ int spi, struct bpf_reg_state *reg,
+ int size)
{
int i;
- state->stack[spi].spilled_ptr = *reg;
- state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
+ copy_register_state(&state->stack[spi].spilled_ptr, reg);
+ if (size == BPF_REG_SIZE)
+ state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
+
+ for (i = BPF_REG_SIZE; i > BPF_REG_SIZE - size; i--)
+ state->stack[spi].slot_type[i - 1] = STACK_SPILL;
- for (i = 0; i < BPF_REG_SIZE; i++)
- state->stack[spi].slot_type[i] = STACK_SPILL;
+ /* size < 8 bytes spill */
+ for (; i; i--)
+ scrub_spilled_slot(&state->stack[spi].slot_type[i - 1]);
+}
+
+static bool is_bpf_st_mem(struct bpf_insn *insn)
+{
+ return BPF_CLASS(insn->code) == BPF_ST && BPF_MODE(insn->code) == BPF_MEM;
}
/* check_stack_{read,write}_fixed_off functions track spill/fill of registers,
@@ -2591,8 +3813,9 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
{
struct bpf_func_state *cur; /* state of the current function */
int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err;
- u32 dst_reg = env->prog->insnsi[insn_idx].dst_reg;
+ struct bpf_insn *insn = &env->prog->insnsi[insn_idx];
struct bpf_reg_state *reg = NULL;
+ u32 dst_reg = insn->dst_reg;
err = grow_stack_state(state, round_up(slot + 1, BPF_REG_SIZE));
if (err)
@@ -2610,8 +3833,28 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
cur = env->cur_state->frame[env->cur_state->curframe];
if (value_regno >= 0)
reg = &cur->regs[value_regno];
+ if (!env->bypass_spec_v4) {
+ bool sanitize = reg && is_spillable_regtype(reg->type);
- if (reg && size == BPF_REG_SIZE && register_is_bounded(reg) &&
+ for (i = 0; i < size; i++) {
+ u8 type = state->stack[spi].slot_type[i];
+
+ if (type != STACK_MISC && type != STACK_ZERO) {
+ sanitize = true;
+ break;
+ }
+ }
+
+ if (sanitize)
+ env->insn_aux_data[insn_idx].sanitize_stack_spill = true;
+ }
+
+ err = destroy_if_dynptr_stack_slot(env, state, spi);
+ if (err)
+ return err;
+
+ mark_stack_slot_scratched(env, spi);
+ if (reg && !(off % BPF_REG_SIZE) && register_is_bounded(reg) &&
!register_is_null(reg) && env->bpf_capable) {
if (dst_reg != BPF_REG_FP) {
/* The backtracking logic can only recognize explicit
@@ -2624,7 +3867,14 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
if (err)
return err;
}
- save_register_state(state, spi, reg);
+ save_register_state(state, spi, reg, size);
+ } else if (!reg && !(off % BPF_REG_SIZE) && is_bpf_st_mem(insn) &&
+ insn->imm != 0 && env->bpf_capable) {
+ struct bpf_reg_state fake_reg = {};
+
+ __mark_reg_known(&fake_reg, (u32)insn->imm);
+ fake_reg.type = SCALAR_VALUE;
+ save_register_state(state, spi, &fake_reg, size);
} else if (reg && is_spillable_regtype(reg->type)) {
/* register containing pointer is being spilled into stack */
if (size != BPF_REG_SIZE) {
@@ -2632,57 +3882,20 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
verbose(env, "invalid size of register spill\n");
return -EACCES;
}
-
if (state != cur && reg->type == PTR_TO_STACK) {
verbose(env, "cannot spill pointers to stack into stack frame of the caller\n");
return -EINVAL;
}
-
- if (!env->bypass_spec_v4) {
- bool sanitize = false;
-
- if (state->stack[spi].slot_type[0] == STACK_SPILL &&
- register_is_const(&state->stack[spi].spilled_ptr))
- sanitize = true;
- for (i = 0; i < BPF_REG_SIZE; i++)
- if (state->stack[spi].slot_type[i] == STACK_MISC) {
- sanitize = true;
- break;
- }
- if (sanitize) {
- int *poff = &env->insn_aux_data[insn_idx].sanitize_stack_off;
- int soff = (-spi - 1) * BPF_REG_SIZE;
-
- /* detected reuse of integer stack slot with a pointer
- * which means either llvm is reusing stack slot or
- * an attacker is trying to exploit CVE-2018-3639
- * (speculative store bypass)
- * Have to sanitize that slot with preemptive
- * store of zero.
- */
- if (*poff && *poff != soff) {
- /* disallow programs where single insn stores
- * into two different stack slots, since verifier
- * cannot sanitize them
- */
- verbose(env,
- "insn %d cannot access two stack slots fp%d and fp%d",
- insn_idx, *poff, soff);
- return -EINVAL;
- }
- *poff = soff;
- }
- }
- save_register_state(state, spi, reg);
+ save_register_state(state, spi, reg, size);
} else {
u8 type = STACK_MISC;
/* regular write of data into stack destroys any spilled ptr */
state->stack[spi].spilled_ptr.type = NOT_INIT;
- /* Mark slots as STACK_MISC if they belonged to spilled ptr. */
- if (state->stack[spi].slot_type[0] == STACK_SPILL)
+ /* Mark slots as STACK_MISC if they belonged to spilled ptr/dynptr/iter. */
+ if (is_stack_slot_special(&state->stack[spi]))
for (i = 0; i < BPF_REG_SIZE; i++)
- state->stack[spi].slot_type[i] = STACK_MISC;
+ scrub_spilled_slot(&state->stack[spi].slot_type[i]);
/* only mark the slot as written if all 8 bytes were written
* otherwise read propagation may incorrectly stop too soon
@@ -2696,7 +3909,8 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
/* when we zero initialize stack slots mark them as such */
- if (reg && register_is_null(reg)) {
+ if ((reg && register_is_null(reg)) ||
+ (!reg && is_bpf_st_mem(insn) && insn->imm == 0)) {
/* backtracking doesn't work for STACK_ZERO yet. */
err = mark_chain_precision(env, value_regno);
if (err)
@@ -2741,6 +3955,7 @@ static int check_stack_write_var_off(struct bpf_verifier_env *env,
int min_off, max_off;
int i, err;
struct bpf_reg_state *ptr_reg = NULL, *value_reg = NULL;
+ struct bpf_insn *insn = &env->prog->insnsi[insn_idx];
bool writing_zero = false;
/* set if the fact that we're writing a zero is used to let any
* stack slots remain STACK_ZERO
@@ -2753,13 +3968,22 @@ static int check_stack_write_var_off(struct bpf_verifier_env *env,
max_off = ptr_reg->smax_value + off + size;
if (value_regno >= 0)
value_reg = &cur->regs[value_regno];
- if (value_reg && register_is_null(value_reg))
+ if ((value_reg && register_is_null(value_reg)) ||
+ (!value_reg && is_bpf_st_mem(insn) && insn->imm == 0))
writing_zero = true;
err = grow_stack_state(state, round_up(-min_off, BPF_REG_SIZE));
if (err)
return err;
+ for (i = min_off; i < max_off; i++) {
+ int spi;
+
+ spi = __get_spi(i);
+ err = destroy_if_dynptr_stack_slot(env, state, spi);
+ if (err)
+ return err;
+ }
/* Variable offset writes destroy any spilled pointers in range. */
for (i = min_off; i < max_off; i++) {
@@ -2769,15 +3993,19 @@ static int check_stack_write_var_off(struct bpf_verifier_env *env,
slot = -i - 1;
spi = slot / BPF_REG_SIZE;
stype = &state->stack[spi].slot_type[slot % BPF_REG_SIZE];
-
- if (!env->allow_ptr_leaks
- && *stype != NOT_INIT
- && *stype != SCALAR_VALUE) {
- /* Reject the write if there's are spilled pointers in
- * range. If we didn't reject here, the ptr status
- * would be erased below (even though not all slots are
- * actually overwritten), possibly opening the door to
- * leaks.
+ mark_stack_slot_scratched(env, spi);
+
+ if (!env->allow_ptr_leaks && *stype != STACK_MISC && *stype != STACK_ZERO) {
+ /* Reject the write if range we may write to has not
+ * been initialized beforehand. If we didn't reject
+ * here, the ptr status would be erased below (even
+ * though not all slots are actually overwritten),
+ * possibly opening the door to leaks.
+ *
+ * We do however catch STACK_INVALID case below, and
+ * only allow reading possibly uninitialized memory
+ * later for CAP_PERFMON, as the write may not happen to
+ * that slot.
*/
verbose(env, "spilled ptr in range of var-offset stack write; insn %d, ptr off: %d",
insn_idx, i);
@@ -2885,35 +4113,58 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
struct bpf_func_state *state = vstate->frame[vstate->curframe];
int i, slot = -off - 1, spi = slot / BPF_REG_SIZE;
struct bpf_reg_state *reg;
- u8 *stype;
+ u8 *stype, type;
stype = reg_state->stack[spi].slot_type;
reg = &reg_state->stack[spi].spilled_ptr;
- if (stype[0] == STACK_SPILL) {
- if (size != BPF_REG_SIZE) {
+ if (is_spilled_reg(&reg_state->stack[spi])) {
+ u8 spill_size = 1;
+
+ for (i = BPF_REG_SIZE - 1; i > 0 && stype[i - 1] == STACK_SPILL; i--)
+ spill_size++;
+
+ if (size != BPF_REG_SIZE || spill_size != BPF_REG_SIZE) {
if (reg->type != SCALAR_VALUE) {
verbose_linfo(env, env->insn_idx, "; ");
verbose(env, "invalid size of register fill\n");
return -EACCES;
}
- if (dst_regno >= 0) {
+
+ mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64);
+ if (dst_regno < 0)
+ return 0;
+
+ if (!(off % BPF_REG_SIZE) && size == spill_size) {
+ /* The earlier check_reg_arg() has decided the
+ * subreg_def for this insn. Save it first.
+ */
+ s32 subreg_def = state->regs[dst_regno].subreg_def;
+
+ copy_register_state(&state->regs[dst_regno], reg);
+ state->regs[dst_regno].subreg_def = subreg_def;
+ } else {
+ for (i = 0; i < size; i++) {
+ type = stype[(slot - i) % BPF_REG_SIZE];
+ if (type == STACK_SPILL)
+ continue;
+ if (type == STACK_MISC)
+ continue;
+ if (type == STACK_INVALID && env->allow_uninit_stack)
+ continue;
+ verbose(env, "invalid read from stack off %d+%d size %d\n",
+ off, i, size);
+ return -EACCES;
+ }
mark_reg_unknown(env, state->regs, dst_regno);
- state->regs[dst_regno].live |= REG_LIVE_WRITTEN;
}
- mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64);
+ state->regs[dst_regno].live |= REG_LIVE_WRITTEN;
return 0;
}
- for (i = 1; i < BPF_REG_SIZE; i++) {
- if (stype[(slot - i) % BPF_REG_SIZE] != STACK_SPILL) {
- verbose(env, "corrupted spill memory\n");
- return -EACCES;
- }
- }
if (dst_regno >= 0) {
/* restore register state from stack */
- state->regs[dst_regno] = *reg;
+ copy_register_state(&state->regs[dst_regno], reg);
/* mark reg as written since spilled pointer state likely
* has its liveness marks cleared by is_state_visited()
* which resets stack/reg liveness for state transitions
@@ -2932,14 +4183,14 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
}
mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64);
} else {
- u8 type;
-
for (i = 0; i < size; i++) {
type = stype[(slot - i) % BPF_REG_SIZE];
if (type == STACK_MISC)
continue;
if (type == STACK_ZERO)
continue;
+ if (type == STACK_INVALID && env->allow_uninit_stack)
+ continue;
verbose(env, "invalid read from stack off %d+%d size %d\n",
off, i, size);
return -EACCES;
@@ -2951,7 +4202,7 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
return 0;
}
-enum stack_access_src {
+enum bpf_access_src {
ACCESS_DIRECT = 1, /* the access is performed by an instruction */
ACCESS_HELPER = 2, /* the access is performed by a helper */
};
@@ -2959,7 +4210,7 @@ enum stack_access_src {
static int check_stack_range_initialized(struct bpf_verifier_env *env,
int regno, int off, int access_size,
bool zero_size_allowed,
- enum stack_access_src type,
+ enum bpf_access_src type,
struct bpf_call_arg_meta *meta);
static struct bpf_reg_state *reg_state(struct bpf_verifier_env *env, int regno)
@@ -3035,17 +4286,13 @@ static int check_stack_read(struct bpf_verifier_env *env,
}
/* Variable offset is prohibited for unprivileged mode for simplicity
* since it requires corresponding support in Spectre masking for stack
- * ALU. See also retrieve_ptr_limit().
+ * ALU. See also retrieve_ptr_limit(). The check in
+ * check_stack_access_for_ptr_arithmetic() called by
+ * adjust_ptr_min_max_vals() prevents users from creating stack pointers
+ * with variable offsets, therefore no check is required here. Further,
+ * just checking it here would be insufficient as speculative stack
+ * writes could still lead to unsafe speculative behaviour.
*/
- if (!env->bypass_spec_v1 && var_off) {
- char tn_buf[48];
-
- tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
- verbose(env, "R%d variable offset stack access prohibited for !root, var_off=%s\n",
- ptr_regno, tn_buf);
- return -EACCES;
- }
-
if (!var_off) {
off += reg->var_off.value;
err = check_stack_read_fixed_off(env, state, off, size,
@@ -3166,11 +4413,8 @@ static int check_mem_region_access(struct bpf_verifier_env *env, u32 regno,
/* We may have adjusted the register pointing to memory region, so we
* need to try adding each of min_value and max_value to off
* to make sure our theoretical access will be safe.
- */
- if (env->log.level & BPF_LOG_LEVEL)
- print_verifier_state(env, state);
-
- /* The minimum value is only important with signed
+ *
+ * The minimum value is only important with signed
* comparisons where we can't assume the floor of a
* value is 0. If we are using signed variables for our
* index'es we need to make sure that whatever we use
@@ -3212,45 +4456,267 @@ static int check_mem_region_access(struct bpf_verifier_env *env, u32 regno,
return 0;
}
+static int __check_ptr_off_reg(struct bpf_verifier_env *env,
+ const struct bpf_reg_state *reg, int regno,
+ bool fixed_off_ok)
+{
+ /* Access to this pointer-typed register or passing it to a helper
+ * is only allowed in its original, unmodified form.
+ */
+
+ if (reg->off < 0) {
+ verbose(env, "negative offset %s ptr R%d off=%d disallowed\n",
+ reg_type_str(env, reg->type), regno, reg->off);
+ return -EACCES;
+ }
+
+ if (!fixed_off_ok && reg->off) {
+ verbose(env, "dereference of modified %s ptr R%d off=%d disallowed\n",
+ reg_type_str(env, reg->type), regno, reg->off);
+ return -EACCES;
+ }
+
+ if (!tnum_is_const(reg->var_off) || reg->var_off.value) {
+ char tn_buf[48];
+
+ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+ verbose(env, "variable %s access var_off=%s disallowed\n",
+ reg_type_str(env, reg->type), tn_buf);
+ return -EACCES;
+ }
+
+ return 0;
+}
+
+int check_ptr_off_reg(struct bpf_verifier_env *env,
+ const struct bpf_reg_state *reg, int regno)
+{
+ return __check_ptr_off_reg(env, reg, regno, false);
+}
+
+static int map_kptr_match_type(struct bpf_verifier_env *env,
+ struct btf_field *kptr_field,
+ struct bpf_reg_state *reg, u32 regno)
+{
+ const char *targ_name = btf_type_name(kptr_field->kptr.btf, kptr_field->kptr.btf_id);
+ int perm_flags = PTR_MAYBE_NULL | PTR_TRUSTED | MEM_RCU;
+ const char *reg_name = "";
+
+ /* Only unreferenced case accepts untrusted pointers */
+ if (kptr_field->type == BPF_KPTR_UNREF)
+ perm_flags |= PTR_UNTRUSTED;
+
+ if (base_type(reg->type) != PTR_TO_BTF_ID || (type_flag(reg->type) & ~perm_flags))
+ goto bad_type;
+
+ if (!btf_is_kernel(reg->btf)) {
+ verbose(env, "R%d must point to kernel BTF\n", regno);
+ return -EINVAL;
+ }
+ /* We need to verify reg->type and reg->btf, before accessing reg->btf */
+ reg_name = btf_type_name(reg->btf, reg->btf_id);
+
+ /* For ref_ptr case, release function check should ensure we get one
+ * referenced PTR_TO_BTF_ID, and that its fixed offset is 0. For the
+ * normal store of unreferenced kptr, we must ensure var_off is zero.
+ * Since ref_ptr cannot be accessed directly by BPF insns, checks for
+ * reg->off and reg->ref_obj_id are not needed here.
+ */
+ if (__check_ptr_off_reg(env, reg, regno, true))
+ return -EACCES;
+
+ /* A full type match is needed, as BTF can be vmlinux or module BTF, and
+ * we also need to take into account the reg->off.
+ *
+ * We want to support cases like:
+ *
+ * struct foo {
+ * struct bar br;
+ * struct baz bz;
+ * };
+ *
+ * struct foo *v;
+ * v = func(); // PTR_TO_BTF_ID
+ * val->foo = v; // reg->off is zero, btf and btf_id match type
+ * val->bar = &v->br; // reg->off is still zero, but we need to retry with
+ * // first member type of struct after comparison fails
+ * val->baz = &v->bz; // reg->off is non-zero, so struct needs to be walked
+ * // to match type
+ *
+ * In the kptr_ref case, check_func_arg_reg_off already ensures reg->off
+ * is zero. We must also ensure that btf_struct_ids_match does not walk
+ * the struct to match type against first member of struct, i.e. reject
+ * second case from above. Hence, when type is BPF_KPTR_REF, we set
+ * strict mode to true for type match.
+ */
+ if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off,
+ kptr_field->kptr.btf, kptr_field->kptr.btf_id,
+ kptr_field->type == BPF_KPTR_REF))
+ goto bad_type;
+ return 0;
+bad_type:
+ verbose(env, "invalid kptr access, R%d type=%s%s ", regno,
+ reg_type_str(env, reg->type), reg_name);
+ verbose(env, "expected=%s%s", reg_type_str(env, PTR_TO_BTF_ID), targ_name);
+ if (kptr_field->type == BPF_KPTR_UNREF)
+ verbose(env, " or %s%s\n", reg_type_str(env, PTR_TO_BTF_ID | PTR_UNTRUSTED),
+ targ_name);
+ else
+ verbose(env, "\n");
+ return -EINVAL;
+}
+
+/* The non-sleepable programs and sleepable programs with explicit bpf_rcu_read_lock()
+ * can dereference RCU protected pointers and result is PTR_TRUSTED.
+ */
+static bool in_rcu_cs(struct bpf_verifier_env *env)
+{
+ return env->cur_state->active_rcu_lock || !env->prog->aux->sleepable;
+}
+
+/* Once GCC supports btf_type_tag the following mechanism will be replaced with tag check */
+BTF_SET_START(rcu_protected_types)
+BTF_ID(struct, prog_test_ref_kfunc)
+BTF_ID(struct, cgroup)
+BTF_ID(struct, bpf_cpumask)
+BTF_ID(struct, task_struct)
+BTF_SET_END(rcu_protected_types)
+
+static bool rcu_protected_object(const struct btf *btf, u32 btf_id)
+{
+ if (!btf_is_kernel(btf))
+ return false;
+ return btf_id_set_contains(&rcu_protected_types, btf_id);
+}
+
+static bool rcu_safe_kptr(const struct btf_field *field)
+{
+ const struct btf_field_kptr *kptr = &field->kptr;
+
+ return field->type == BPF_KPTR_REF && rcu_protected_object(kptr->btf, kptr->btf_id);
+}
+
+static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno,
+ int value_regno, int insn_idx,
+ struct btf_field *kptr_field)
+{
+ struct bpf_insn *insn = &env->prog->insnsi[insn_idx];
+ int class = BPF_CLASS(insn->code);
+ struct bpf_reg_state *val_reg;
+
+ /* Things we already checked for in check_map_access and caller:
+ * - Reject cases where variable offset may touch kptr
+ * - size of access (must be BPF_DW)
+ * - tnum_is_const(reg->var_off)
+ * - kptr_field->offset == off + reg->var_off.value
+ */
+ /* Only BPF_[LDX,STX,ST] | BPF_MEM | BPF_DW is supported */
+ if (BPF_MODE(insn->code) != BPF_MEM) {
+ verbose(env, "kptr in map can only be accessed using BPF_MEM instruction mode\n");
+ return -EACCES;
+ }
+
+ /* We only allow loading referenced kptr, since it will be marked as
+ * untrusted, similar to unreferenced kptr.
+ */
+ if (class != BPF_LDX && kptr_field->type == BPF_KPTR_REF) {
+ verbose(env, "store to referenced kptr disallowed\n");
+ return -EACCES;
+ }
+
+ if (class == BPF_LDX) {
+ val_reg = reg_state(env, value_regno);
+ /* We can simply mark the value_regno receiving the pointer
+ * value from map as PTR_TO_BTF_ID, with the correct type.
+ */
+ mark_btf_ld_reg(env, cur_regs(env), value_regno, PTR_TO_BTF_ID, kptr_field->kptr.btf,
+ kptr_field->kptr.btf_id,
+ rcu_safe_kptr(kptr_field) && in_rcu_cs(env) ?
+ PTR_MAYBE_NULL | MEM_RCU :
+ PTR_MAYBE_NULL | PTR_UNTRUSTED);
+ /* For mark_ptr_or_null_reg */
+ val_reg->id = ++env->id_gen;
+ } else if (class == BPF_STX) {
+ val_reg = reg_state(env, value_regno);
+ if (!register_is_null(val_reg) &&
+ map_kptr_match_type(env, kptr_field, val_reg, value_regno))
+ return -EACCES;
+ } else if (class == BPF_ST) {
+ if (insn->imm) {
+ verbose(env, "BPF_ST imm must be 0 when storing to kptr at off=%u\n",
+ kptr_field->offset);
+ return -EACCES;
+ }
+ } else {
+ verbose(env, "kptr in map can only be accessed using BPF_LDX/BPF_STX/BPF_ST\n");
+ return -EACCES;
+ }
+ return 0;
+}
+
/* check read/write into a map element with possible variable offset */
static int check_map_access(struct bpf_verifier_env *env, u32 regno,
- int off, int size, bool zero_size_allowed)
+ int off, int size, bool zero_size_allowed,
+ enum bpf_access_src src)
{
struct bpf_verifier_state *vstate = env->cur_state;
struct bpf_func_state *state = vstate->frame[vstate->curframe];
struct bpf_reg_state *reg = &state->regs[regno];
struct bpf_map *map = reg->map_ptr;
- int err;
+ struct btf_record *rec;
+ int err, i;
err = check_mem_region_access(env, regno, off, size, map->value_size,
zero_size_allowed);
if (err)
return err;
- if (map_value_has_spin_lock(map)) {
- u32 lock = map->spin_lock_off;
+ if (IS_ERR_OR_NULL(map->record))
+ return 0;
+ rec = map->record;
+ for (i = 0; i < rec->cnt; i++) {
+ struct btf_field *field = &rec->fields[i];
+ u32 p = field->offset;
- /* if any part of struct bpf_spin_lock can be touched by
- * load/store reject this program.
- * To check that [x1, x2) overlaps with [y1, y2)
+ /* If any part of a field can be touched by load/store, reject
+ * this program. To check that [x1, x2) overlaps with [y1, y2),
* it is sufficient to check x1 < y2 && y1 < x2.
*/
- if (reg->smin_value + off < lock + sizeof(struct bpf_spin_lock) &&
- lock < reg->umax_value + off + size) {
- verbose(env, "bpf_spin_lock cannot be accessed directly by load/store\n");
- return -EACCES;
+ if (reg->smin_value + off < p + btf_field_type_size(field->type) &&
+ p < reg->umax_value + off + size) {
+ switch (field->type) {
+ case BPF_KPTR_UNREF:
+ case BPF_KPTR_REF:
+ if (src != ACCESS_DIRECT) {
+ verbose(env, "kptr cannot be accessed indirectly by helper\n");
+ return -EACCES;
+ }
+ if (!tnum_is_const(reg->var_off)) {
+ verbose(env, "kptr access cannot have variable offset\n");
+ return -EACCES;
+ }
+ if (p != off + reg->var_off.value) {
+ verbose(env, "kptr access misaligned expected=%u off=%llu\n",
+ p, off + reg->var_off.value);
+ return -EACCES;
+ }
+ if (size != bpf_size_to_bytes(BPF_DW)) {
+ verbose(env, "kptr access size must be BPF_DW\n");
+ return -EACCES;
+ }
+ break;
+ default:
+ verbose(env, "%s cannot be accessed directly by load/store\n",
+ btf_field_type_name(field->type));
+ return -EACCES;
+ }
}
}
- return err;
+ return 0;
}
#define MAX_PACKET_OFF 0xffff
-static enum bpf_prog_type resolve_prog_type(struct bpf_prog *prog)
-{
- return prog->aux->dst_prog ? prog->aux->dst_prog->type : prog->type;
-}
-
static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
const struct bpf_call_arg_meta *meta,
enum bpf_access_type t)
@@ -3356,7 +4822,7 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off,
*/
*reg_type = info.reg_type;
- if (*reg_type == PTR_TO_BTF_ID || *reg_type == PTR_TO_BTF_ID_OR_NULL) {
+ if (base_type(*reg_type) == PTR_TO_BTF_ID) {
*btf = info.btf;
*btf_id = info.btf_id;
} else {
@@ -3424,7 +4890,7 @@ static int check_sock_access(struct bpf_verifier_env *env, int insn_idx,
}
verbose(env, "R%d invalid %s access off=%d size=%d\n",
- regno, reg_type_str[reg->type], off, size);
+ regno, reg_type_str(env, reg->type), off, size);
return -EACCES;
}
@@ -3463,6 +4929,35 @@ static bool is_flow_key_reg(struct bpf_verifier_env *env, int regno)
return reg->type == PTR_TO_FLOW_KEYS;
}
+static bool is_trusted_reg(const struct bpf_reg_state *reg)
+{
+ /* A referenced register is always trusted. */
+ if (reg->ref_obj_id)
+ return true;
+
+ /* If a register is not referenced, it is trusted if it has the
+ * MEM_ALLOC or PTR_TRUSTED type modifiers, and no others. Some of the
+ * other type modifiers may be safe, but we elect to take an opt-in
+ * approach here as some (e.g. PTR_UNTRUSTED and PTR_MAYBE_NULL) are
+ * not.
+ *
+ * Eventually, we should make PTR_TRUSTED the single source of truth
+ * for whether a register is trusted.
+ */
+ return type_flag(reg->type) & BPF_REG_TRUSTED_MODIFIERS &&
+ !bpf_type_has_unsafe_modifiers(reg->type);
+}
+
+static bool is_rcu_reg(const struct bpf_reg_state *reg)
+{
+ return reg->type & MEM_RCU;
+}
+
+static void clear_trusted_flags(enum bpf_type_flag *flag)
+{
+ *flag &= ~(BPF_REG_TRUSTED_MODIFIERS | MEM_RCU);
+}
+
static int check_pkt_ptr_alignment(struct bpf_verifier_env *env,
const struct bpf_reg_state *reg,
int off, int size, bool strict)
@@ -3643,6 +5138,8 @@ process_func:
continue_func:
subprog_end = subprog[idx + 1].start;
for (; i < subprog_end; i++) {
+ int next_insn;
+
if (!bpf_pseudo_call(insn + i) && !bpf_pseudo_func(insn + i))
continue;
/* remember insn and function to return to */
@@ -3650,13 +5147,22 @@ continue_func:
ret_prog[frame] = idx;
/* find the callee */
- i = i + insn[i].imm + 1;
- idx = find_subprog(env, i);
+ next_insn = i + insn[i].imm + 1;
+ idx = find_subprog(env, next_insn);
if (idx < 0) {
WARN_ONCE(1, "verifier bug. No program starts at insn %d\n",
- i);
+ next_insn);
return -EFAULT;
}
+ if (subprog[idx].is_async_cb) {
+ if (subprog[idx].has_tail_call) {
+ verbose(env, "verifier bug. subprog has tail_call and async cb\n");
+ return -EFAULT;
+ }
+ /* async callbacks don't increase bpf prog stack size */
+ continue;
+ }
+ i = next_insn;
if (subprog[idx].has_tail_call)
tail_call_reachable = true;
@@ -3677,6 +5183,8 @@ continue_func:
if (tail_call_reachable)
for (j = 0; j < frame; j++)
subprog[ret_prog[j]].tail_call_reachable = true;
+ if (subprog[0].tail_call_reachable)
+ env->prog->aux->tail_call_reachable = true;
/* end of for() loop means the last insn of the 'subprog'
* was reached. Doesn't matter whether it was JA or EXIT
@@ -3706,30 +5214,6 @@ static int get_callee_stack_depth(struct bpf_verifier_env *env,
}
#endif
-int check_ctx_reg(struct bpf_verifier_env *env,
- const struct bpf_reg_state *reg, int regno)
-{
- /* Access to ctx or passing it to a helper is only allowed in
- * its original, unmodified form.
- */
-
- if (reg->off) {
- verbose(env, "dereference of modified ctx ptr R%d off=%d disallowed\n",
- regno, reg->off);
- return -EACCES;
- }
-
- if (!tnum_is_const(reg->var_off) || reg->var_off.value) {
- char tn_buf[48];
-
- tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
- verbose(env, "variable ctx access var_off=%s disallowed\n", tn_buf);
- return -EACCES;
- }
-
- return 0;
-}
-
static int __check_buffer_access(struct bpf_verifier_env *env,
const char *buf_info,
const struct bpf_reg_state *reg,
@@ -3774,9 +5258,9 @@ static int check_buffer_access(struct bpf_verifier_env *env,
const struct bpf_reg_state *reg,
int regno, int off, int size,
bool zero_size_allowed,
- const char *buf_info,
u32 *max_access)
{
+ const char *buf_info = type_is_rdonly_mem(reg->type) ? "rdonly" : "rdwr";
int err;
err = __check_buffer_access(env, buf_info, reg, regno, off, size);
@@ -3829,7 +5313,22 @@ static void coerce_reg_to_size(struct bpf_reg_state *reg, int size)
static bool bpf_map_is_rdonly(const struct bpf_map *map)
{
- return (map->map_flags & BPF_F_RDONLY_PROG) && map->frozen;
+ /* A map is considered read-only if the following condition are true:
+ *
+ * 1) BPF program side cannot change any of the map content. The
+ * BPF_F_RDONLY_PROG flag is throughout the lifetime of a map
+ * and was set at map creation time.
+ * 2) The map value(s) have been initialized from user space by a
+ * loader and then "frozen", such that no new map update/delete
+ * operations from syscall side are possible for the rest of
+ * the map's lifetime from that point onwards.
+ * 3) Any parallel/pending map update/delete operations from syscall
+ * side have been completed. Only after that point, it's safe to
+ * assume that map value(s) are immutable.
+ */
+ return (map->map_flags & BPF_F_RDONLY_PROG) &&
+ READ_ONCE(map->frozen) &&
+ !bpf_map_write_active(map);
}
static int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val)
@@ -3862,6 +5361,112 @@ static int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val)
return 0;
}
+#define BTF_TYPE_SAFE_RCU(__type) __PASTE(__type, __safe_rcu)
+#define BTF_TYPE_SAFE_RCU_OR_NULL(__type) __PASTE(__type, __safe_rcu_or_null)
+#define BTF_TYPE_SAFE_TRUSTED(__type) __PASTE(__type, __safe_trusted)
+
+/*
+ * Allow list few fields as RCU trusted or full trusted.
+ * This logic doesn't allow mix tagging and will be removed once GCC supports
+ * btf_type_tag.
+ */
+
+/* RCU trusted: these fields are trusted in RCU CS and never NULL */
+BTF_TYPE_SAFE_RCU(struct task_struct) {
+ const cpumask_t *cpus_ptr;
+ struct css_set __rcu *cgroups;
+ struct task_struct __rcu *real_parent;
+ struct task_struct *group_leader;
+};
+
+BTF_TYPE_SAFE_RCU(struct cgroup) {
+ /* cgrp->kn is always accessible as documented in kernel/cgroup/cgroup.c */
+ struct kernfs_node *kn;
+};
+
+BTF_TYPE_SAFE_RCU(struct css_set) {
+ struct cgroup *dfl_cgrp;
+};
+
+/* RCU trusted: these fields are trusted in RCU CS and can be NULL */
+BTF_TYPE_SAFE_RCU_OR_NULL(struct mm_struct) {
+ struct file __rcu *exe_file;
+};
+
+/* skb->sk, req->sk are not RCU protected, but we mark them as such
+ * because bpf prog accessible sockets are SOCK_RCU_FREE.
+ */
+BTF_TYPE_SAFE_RCU_OR_NULL(struct sk_buff) {
+ struct sock *sk;
+};
+
+BTF_TYPE_SAFE_RCU_OR_NULL(struct request_sock) {
+ struct sock *sk;
+};
+
+/* full trusted: these fields are trusted even outside of RCU CS and never NULL */
+BTF_TYPE_SAFE_TRUSTED(struct bpf_iter_meta) {
+ struct seq_file *seq;
+};
+
+BTF_TYPE_SAFE_TRUSTED(struct bpf_iter__task) {
+ struct bpf_iter_meta *meta;
+ struct task_struct *task;
+};
+
+BTF_TYPE_SAFE_TRUSTED(struct linux_binprm) {
+ struct file *file;
+};
+
+BTF_TYPE_SAFE_TRUSTED(struct file) {
+ struct inode *f_inode;
+};
+
+BTF_TYPE_SAFE_TRUSTED(struct dentry) {
+ /* no negative dentry-s in places where bpf can see it */
+ struct inode *d_inode;
+};
+
+BTF_TYPE_SAFE_TRUSTED(struct socket) {
+ struct sock *sk;
+};
+
+static bool type_is_rcu(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg,
+ const char *field_name, u32 btf_id)
+{
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU(struct task_struct));
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU(struct cgroup));
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU(struct css_set));
+
+ return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, "__safe_rcu");
+}
+
+static bool type_is_rcu_or_null(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg,
+ const char *field_name, u32 btf_id)
+{
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU_OR_NULL(struct mm_struct));
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU_OR_NULL(struct sk_buff));
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU_OR_NULL(struct request_sock));
+
+ return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, "__safe_rcu_or_null");
+}
+
+static bool type_is_trusted(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg,
+ const char *field_name, u32 btf_id)
+{
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct bpf_iter_meta));
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct bpf_iter__task));
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct linux_binprm));
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct file));
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct dentry));
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct socket));
+
+ return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, "__safe_trusted");
+}
+
static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
struct bpf_reg_state *regs,
int regno, int off, int size,
@@ -3871,9 +5476,23 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
struct bpf_reg_state *reg = regs + regno;
const struct btf_type *t = btf_type_by_id(reg->btf, reg->btf_id);
const char *tname = btf_name_by_offset(reg->btf, t->name_off);
- u32 btf_id;
+ const char *field_name = NULL;
+ enum bpf_type_flag flag = 0;
+ u32 btf_id = 0;
int ret;
+ if (!env->allow_ptr_leaks) {
+ verbose(env,
+ "'struct %s' access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN\n",
+ tname);
+ return -EPERM;
+ }
+ if (!env->prog->gpl_compatible && btf_is_kernel(reg->btf)) {
+ verbose(env,
+ "Cannot access kernel 'struct %s' from non-GPL compatible program\n",
+ tname);
+ return -EINVAL;
+ }
if (off < 0) {
verbose(env,
"R%d is ptr_%s invalid negative access: off=%d\n",
@@ -3890,24 +5509,103 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
return -EACCES;
}
- if (env->ops->btf_struct_access) {
- ret = env->ops->btf_struct_access(&env->log, reg->btf, t,
- off, size, atype, &btf_id);
+ if (reg->type & MEM_USER) {
+ verbose(env,
+ "R%d is ptr_%s access user memory: off=%d\n",
+ regno, tname, off);
+ return -EACCES;
+ }
+
+ if (reg->type & MEM_PERCPU) {
+ verbose(env,
+ "R%d is ptr_%s access percpu memory: off=%d\n",
+ regno, tname, off);
+ return -EACCES;
+ }
+
+ if (env->ops->btf_struct_access && !type_is_alloc(reg->type) && atype == BPF_WRITE) {
+ if (!btf_is_kernel(reg->btf)) {
+ verbose(env, "verifier internal error: reg->btf must be kernel btf\n");
+ return -EFAULT;
+ }
+ ret = env->ops->btf_struct_access(&env->log, reg, off, size);
} else {
- if (atype != BPF_READ) {
+ /* Writes are permitted with default btf_struct_access for
+ * program allocated objects (which always have ref_obj_id > 0),
+ * but not for untrusted PTR_TO_BTF_ID | MEM_ALLOC.
+ */
+ if (atype != BPF_READ && reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) {
verbose(env, "only read is supported\n");
return -EACCES;
}
- ret = btf_struct_access(&env->log, reg->btf, t, off, size,
- atype, &btf_id);
+ if (type_is_alloc(reg->type) && !type_is_non_owning_ref(reg->type) &&
+ !reg->ref_obj_id) {
+ verbose(env, "verifier internal error: ref_obj_id for allocated object must be non-zero\n");
+ return -EFAULT;
+ }
+
+ ret = btf_struct_access(&env->log, reg, off, size, atype, &btf_id, &flag, &field_name);
}
if (ret < 0)
return ret;
+ if (ret != PTR_TO_BTF_ID) {
+ /* just mark; */
+
+ } else if (type_flag(reg->type) & PTR_UNTRUSTED) {
+ /* If this is an untrusted pointer, all pointers formed by walking it
+ * also inherit the untrusted flag.
+ */
+ flag = PTR_UNTRUSTED;
+
+ } else if (is_trusted_reg(reg) || is_rcu_reg(reg)) {
+ /* By default any pointer obtained from walking a trusted pointer is no
+ * longer trusted, unless the field being accessed has explicitly been
+ * marked as inheriting its parent's state of trust (either full or RCU).
+ * For example:
+ * 'cgroups' pointer is untrusted if task->cgroups dereference
+ * happened in a sleepable program outside of bpf_rcu_read_lock()
+ * section. In a non-sleepable program it's trusted while in RCU CS (aka MEM_RCU).
+ * Note bpf_rcu_read_unlock() converts MEM_RCU pointers to PTR_UNTRUSTED.
+ *
+ * A regular RCU-protected pointer with __rcu tag can also be deemed
+ * trusted if we are in an RCU CS. Such pointer can be NULL.
+ */
+ if (type_is_trusted(env, reg, field_name, btf_id)) {
+ flag |= PTR_TRUSTED;
+ } else if (in_rcu_cs(env) && !type_may_be_null(reg->type)) {
+ if (type_is_rcu(env, reg, field_name, btf_id)) {
+ /* ignore __rcu tag and mark it MEM_RCU */
+ flag |= MEM_RCU;
+ } else if (flag & MEM_RCU ||
+ type_is_rcu_or_null(env, reg, field_name, btf_id)) {
+ /* __rcu tagged pointers can be NULL */
+ flag |= MEM_RCU | PTR_MAYBE_NULL;
+ } else if (flag & (MEM_PERCPU | MEM_USER)) {
+ /* keep as-is */
+ } else {
+ /* walking unknown pointers yields old deprecated PTR_TO_BTF_ID */
+ clear_trusted_flags(&flag);
+ }
+ } else {
+ /*
+ * If not in RCU CS or MEM_RCU pointer can be NULL then
+ * aggressively mark as untrusted otherwise such
+ * pointers will be plain PTR_TO_BTF_ID without flags
+ * and will be allowed to be passed into helpers for
+ * compat reasons.
+ */
+ flag = PTR_UNTRUSTED;
+ }
+ } else {
+ /* Old compat. Deprecated */
+ clear_trusted_flags(&flag);
+ }
+
if (atype == BPF_READ && value_regno >= 0)
- mark_btf_ld_reg(env, regs, value_regno, ret, reg->btf, btf_id);
+ mark_btf_ld_reg(env, regs, value_regno, ret, reg->btf, btf_id, flag);
return 0;
}
@@ -3920,6 +5618,8 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env,
{
struct bpf_reg_state *reg = regs + regno;
struct bpf_map *map = reg->map_ptr;
+ struct bpf_reg_state map_reg;
+ enum bpf_type_flag flag = 0;
const struct btf_type *t;
const char *tname;
u32 btf_id;
@@ -3939,9 +5639,9 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env,
t = btf_type_by_id(btf_vmlinux, *map->ops->map_btf_id);
tname = btf_name_by_offset(btf_vmlinux, t->name_off);
- if (!env->allow_ptr_to_map_access) {
+ if (!env->allow_ptr_leaks) {
verbose(env,
- "%s access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN\n",
+ "'struct %s' access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN\n",
tname);
return -EPERM;
}
@@ -3957,12 +5657,15 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env,
return -EACCES;
}
- ret = btf_struct_access(&env->log, btf_vmlinux, t, off, size, atype, &btf_id);
+ /* Simulate access to a PTR_TO_BTF_ID */
+ memset(&map_reg, 0, sizeof(map_reg));
+ mark_btf_ld_reg(env, &map_reg, 0, PTR_TO_BTF_ID, btf_vmlinux, *map->ops->map_btf_id, 0);
+ ret = btf_struct_access(&env->log, &map_reg, off, size, atype, &btf_id, &flag, NULL);
if (ret < 0)
return ret;
if (value_regno >= 0)
- mark_btf_ld_reg(env, regs, value_regno, ret, btf_vmlinux, btf_id);
+ mark_btf_ld_reg(env, regs, value_regno, ret, btf_vmlinux, btf_id, flag);
return 0;
}
@@ -3997,7 +5700,7 @@ static int check_stack_slot_within_bounds(int off,
static int check_stack_access_within_bounds(
struct bpf_verifier_env *env,
int regno, int off, int access_size,
- enum stack_access_src src, enum bpf_access_type type)
+ enum bpf_access_src src, enum bpf_access_type type)
{
struct bpf_reg_state *regs = cur_regs(env);
struct bpf_reg_state *reg = regs + regno;
@@ -4093,6 +5796,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
if (value_regno >= 0)
mark_reg_unknown(env, regs, value_regno);
} else if (reg->type == PTR_TO_MAP_VALUE) {
+ struct btf_field *kptr_field = NULL;
+
if (t == BPF_WRITE && value_regno >= 0 &&
is_pointer_value(env, value_regno)) {
verbose(env, "R%d leaks addr into map\n", value_regno);
@@ -4101,8 +5806,15 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
err = check_map_access_type(env, regno, off, size, t);
if (err)
return err;
- err = check_map_access(env, regno, off, size, false);
- if (!err && t == BPF_READ && value_regno >= 0) {
+ err = check_map_access(env, regno, off, size, false, ACCESS_DIRECT);
+ if (err)
+ return err;
+ if (tnum_is_const(reg->var_off))
+ kptr_field = btf_record_find(reg->map_ptr->record,
+ off + reg->var_off.value, BPF_KPTR);
+ if (kptr_field) {
+ err = check_map_kptr_access(env, regno, value_regno, insn_idx, kptr_field);
+ } else if (t == BPF_READ && value_regno >= 0) {
struct bpf_map *map = reg->map_ptr;
/* if map is read-only, track its contents as scalars */
@@ -4123,15 +5835,30 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
mark_reg_unknown(env, regs, value_regno);
}
}
- } else if (reg->type == PTR_TO_MEM) {
+ } else if (base_type(reg->type) == PTR_TO_MEM) {
+ bool rdonly_mem = type_is_rdonly_mem(reg->type);
+
+ if (type_may_be_null(reg->type)) {
+ verbose(env, "R%d invalid mem access '%s'\n", regno,
+ reg_type_str(env, reg->type));
+ return -EACCES;
+ }
+
+ if (t == BPF_WRITE && rdonly_mem) {
+ verbose(env, "R%d cannot write into %s\n",
+ regno, reg_type_str(env, reg->type));
+ return -EACCES;
+ }
+
if (t == BPF_WRITE && value_regno >= 0 &&
is_pointer_value(env, value_regno)) {
verbose(env, "R%d leaks addr into mem\n", value_regno);
return -EACCES;
}
+
err = check_mem_region_access(env, regno, off, size,
reg->mem_size, false);
- if (!err && t == BPF_READ && value_regno >= 0)
+ if (!err && value_regno >= 0 && (t == BPF_READ || rdonly_mem))
mark_reg_unknown(env, regs, value_regno);
} else if (reg->type == PTR_TO_CTX) {
enum bpf_reg_type reg_type = SCALAR_VALUE;
@@ -4144,11 +5871,12 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
return -EACCES;
}
- err = check_ctx_reg(env, reg, regno);
+ err = check_ptr_off_reg(env, reg, regno);
if (err < 0)
return err;
- err = check_ctx_access(env, insn_idx, off, size, t, &reg_type, &btf, &btf_id);
+ err = check_ctx_access(env, insn_idx, off, size, t, &reg_type, &btf,
+ &btf_id);
if (err)
verbose_linfo(env, insn_idx, "; ");
if (!err && t == BPF_READ && value_regno >= 0) {
@@ -4161,7 +5889,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
} else {
mark_reg_known_zero(env, regs,
value_regno);
- if (reg_type_may_be_null(reg_type))
+ if (type_may_be_null(reg_type))
regs[value_regno].id = ++env->id_gen;
/* A load of ctx field could have different
* actual load size with the one encoded in the
@@ -4169,8 +5897,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
* a sub-register.
*/
regs[value_regno].subreg_def = DEF_NOT_SUBREG;
- if (reg_type == PTR_TO_BTF_ID ||
- reg_type == PTR_TO_BTF_ID_OR_NULL) {
+ if (base_type(reg_type) == PTR_TO_BTF_ID) {
regs[value_regno].btf = btf;
regs[value_regno].btf_id = btf_id;
}
@@ -4223,7 +5950,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
} else if (type_is_sk_pointer(reg->type)) {
if (t == BPF_WRITE) {
verbose(env, "R%d cannot write into %s\n",
- regno, reg_type_str[reg->type]);
+ regno, reg_type_str(env, reg->type));
return -EACCES;
}
err = check_sock_access(env, insn_idx, regno, off, size, t);
@@ -4233,32 +5960,36 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
err = check_tp_buffer_access(env, reg, regno, off, size);
if (!err && t == BPF_READ && value_regno >= 0)
mark_reg_unknown(env, regs, value_regno);
- } else if (reg->type == PTR_TO_BTF_ID) {
+ } else if (base_type(reg->type) == PTR_TO_BTF_ID &&
+ !type_may_be_null(reg->type)) {
err = check_ptr_to_btf_access(env, regs, regno, off, size, t,
value_regno);
} else if (reg->type == CONST_PTR_TO_MAP) {
err = check_ptr_to_map_access(env, regs, regno, off, size, t,
value_regno);
- } else if (reg->type == PTR_TO_RDONLY_BUF) {
- if (t == BPF_WRITE) {
- verbose(env, "R%d cannot write into %s\n",
- regno, reg_type_str[reg->type]);
- return -EACCES;
+ } else if (base_type(reg->type) == PTR_TO_BUF) {
+ bool rdonly_mem = type_is_rdonly_mem(reg->type);
+ u32 *max_access;
+
+ if (rdonly_mem) {
+ if (t == BPF_WRITE) {
+ verbose(env, "R%d cannot write into %s\n",
+ regno, reg_type_str(env, reg->type));
+ return -EACCES;
+ }
+ max_access = &env->prog->aux->max_rdonly_access;
+ } else {
+ max_access = &env->prog->aux->max_rdwr_access;
}
+
err = check_buffer_access(env, reg, regno, off, size, false,
- "rdonly",
- &env->prog->aux->max_rdonly_access);
- if (!err && value_regno >= 0)
- mark_reg_unknown(env, regs, value_regno);
- } else if (reg->type == PTR_TO_RDWR_BUF) {
- err = check_buffer_access(env, reg, regno, off, size, false,
- "rdwr",
- &env->prog->aux->max_rdwr_access);
- if (!err && t == BPF_READ && value_regno >= 0)
+ max_access);
+
+ if (!err && value_regno >= 0 && (rdonly_mem || t == BPF_READ))
mark_reg_unknown(env, regs, value_regno);
} else {
verbose(env, "R%d invalid mem access '%s'\n", regno,
- reg_type_str[reg->type]);
+ reg_type_str(env, reg->type));
return -EACCES;
}
@@ -4309,9 +6040,16 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i
if (insn->imm == BPF_CMPXCHG) {
/* Check comparison of R0 with memory location */
- err = check_reg_arg(env, BPF_REG_0, SRC_OP);
+ const u32 aux_reg = BPF_REG_0;
+
+ err = check_reg_arg(env, aux_reg, SRC_OP);
if (err)
return err;
+
+ if (is_pointer_value(env, aux_reg)) {
+ verbose(env, "R%d leaks addr into mem\n", aux_reg);
+ return -EACCES;
+ }
}
if (is_pointer_value(env, insn->src_reg)) {
@@ -4325,7 +6063,7 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i
is_sk_reg(env, insn->dst_reg)) {
verbose(env, "BPF_ATOMIC stores into R%d %s is not allowed\n",
insn->dst_reg,
- reg_type_str[reg_state(env, insn->dst_reg)->type]);
+ reg_type_str(env, reg_state(env, insn->dst_reg)->type));
return -EACCES;
}
@@ -4346,13 +6084,19 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i
load_reg = -1;
}
- /* check whether we can read the memory */
+ /* Check whether we can read the memory, with second call for fetch
+ * case to simulate the register fill.
+ */
err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
- BPF_SIZE(insn->code), BPF_READ, load_reg, true);
+ BPF_SIZE(insn->code), BPF_READ, -1, true);
+ if (!err && load_reg >= 0)
+ err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
+ BPF_SIZE(insn->code), BPF_READ, load_reg,
+ true);
if (err)
return err;
- /* check whether we can write into the same memory */
+ /* Check whether we can write into the same memory. */
err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
BPF_SIZE(insn->code), BPF_WRITE, -1, true);
if (err)
@@ -4373,7 +6117,7 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i
static int check_stack_range_initialized(
struct bpf_verifier_env *env, int regno, int off,
int access_size, bool zero_size_allowed,
- enum stack_access_src type, struct bpf_call_arg_meta *meta)
+ enum bpf_access_src type, struct bpf_call_arg_meta *meta)
{
struct bpf_reg_state *reg = reg_state(env, regno);
struct bpf_func_state *state = func(env, reg);
@@ -4436,6 +6180,31 @@ static int check_stack_range_initialized(
}
if (meta && meta->raw_mode) {
+ /* Ensure we won't be overwriting dynptrs when simulating byte
+ * by byte access in check_helper_call using meta.access_size.
+ * This would be a problem if we have a helper in the future
+ * which takes:
+ *
+ * helper(uninit_mem, len, dynptr)
+ *
+ * Now, uninint_mem may overlap with dynptr pointer. Hence, it
+ * may end up writing to dynptr itself when touching memory from
+ * arg 1. This can be relaxed on a case by case basis for known
+ * safe cases, but reject due to the possibilitiy of aliasing by
+ * default.
+ */
+ for (i = min_off; i < max_off + access_size; i++) {
+ int stack_off = -i - 1;
+
+ spi = __get_spi(i);
+ /* raw_mode may write past allocated_stack */
+ if (state->allocated_stack <= stack_off)
+ continue;
+ if (state->stack[spi].slot_type[stack_off % BPF_REG_SIZE] == STACK_DYNPTR) {
+ verbose(env, "potential write to dynptr at off=%d disallowed\n", i);
+ return -EACCES;
+ }
+ }
meta->access_size = access_size;
meta->regno = regno;
return 0;
@@ -4451,7 +6220,8 @@ static int check_stack_range_initialized(
stype = &state->stack[spi].slot_type[slot % BPF_REG_SIZE];
if (*stype == STACK_MISC)
goto mark;
- if (*stype == STACK_ZERO) {
+ if ((*stype == STACK_ZERO) ||
+ (*stype == STACK_INVALID && env->allow_uninit_stack)) {
if (clobber) {
/* helper can write anything into the stack */
*stype = STACK_MISC;
@@ -4459,17 +6229,13 @@ static int check_stack_range_initialized(
goto mark;
}
- if (state->stack[spi].slot_type[0] == STACK_SPILL &&
- state->stack[spi].spilled_ptr.type == PTR_TO_BTF_ID)
- goto mark;
-
- if (state->stack[spi].slot_type[0] == STACK_SPILL &&
+ if (is_spilled_reg(&state->stack[spi]) &&
(state->stack[spi].spilled_ptr.type == SCALAR_VALUE ||
env->allow_ptr_leaks)) {
if (clobber) {
__mark_reg_unknown(env, &state->stack[spi].spilled_ptr);
for (j = 0; j < BPF_REG_SIZE; j++)
- state->stack[spi].slot_type[j] = STACK_MISC;
+ scrub_spilled_slot(&state->stack[spi].slot_type[j]);
}
goto mark;
}
@@ -4493,6 +6259,11 @@ mark:
mark_reg_read(env, &state->stack[spi].spilled_ptr,
state->stack[spi].spilled_ptr.parent,
REG_LIVE_READ64);
+ /* We do not set REG_LIVE_WRITTEN for stack slot, as we can not
+ * be sure that whether stack slot is written to or not. Hence,
+ * we must still conservatively propagate reads upwards even if
+ * helper may write to the entire memory range.
+ */
}
return update_stack_depth(env, state, min_off);
}
@@ -4502,13 +6273,19 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
struct bpf_call_arg_meta *meta)
{
struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+ u32 *max_access;
- switch (reg->type) {
+ switch (base_type(reg->type)) {
case PTR_TO_PACKET:
case PTR_TO_PACKET_META:
return check_packet_access(env, regno, reg->off, access_size,
zero_size_allowed);
case PTR_TO_MAP_KEY:
+ if (meta && meta->raw_mode) {
+ verbose(env, "R%d cannot write into %s\n", regno,
+ reg_type_str(env, reg->type));
+ return -EACCES;
+ }
return check_mem_region_access(env, regno, reg->off, access_size,
reg->map_ptr->key_size, false);
case PTR_TO_MAP_VALUE:
@@ -4517,82 +6294,209 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
BPF_READ))
return -EACCES;
return check_map_access(env, regno, reg->off, access_size,
- zero_size_allowed);
+ zero_size_allowed, ACCESS_HELPER);
case PTR_TO_MEM:
+ if (type_is_rdonly_mem(reg->type)) {
+ if (meta && meta->raw_mode) {
+ verbose(env, "R%d cannot write into %s\n", regno,
+ reg_type_str(env, reg->type));
+ return -EACCES;
+ }
+ }
return check_mem_region_access(env, regno, reg->off,
access_size, reg->mem_size,
zero_size_allowed);
- case PTR_TO_RDONLY_BUF:
- if (meta && meta->raw_mode)
- return -EACCES;
- return check_buffer_access(env, reg, regno, reg->off,
- access_size, zero_size_allowed,
- "rdonly",
- &env->prog->aux->max_rdonly_access);
- case PTR_TO_RDWR_BUF:
+ case PTR_TO_BUF:
+ if (type_is_rdonly_mem(reg->type)) {
+ if (meta && meta->raw_mode) {
+ verbose(env, "R%d cannot write into %s\n", regno,
+ reg_type_str(env, reg->type));
+ return -EACCES;
+ }
+
+ max_access = &env->prog->aux->max_rdonly_access;
+ } else {
+ max_access = &env->prog->aux->max_rdwr_access;
+ }
return check_buffer_access(env, reg, regno, reg->off,
access_size, zero_size_allowed,
- "rdwr",
- &env->prog->aux->max_rdwr_access);
+ max_access);
case PTR_TO_STACK:
return check_stack_range_initialized(
env,
regno, reg->off, access_size,
zero_size_allowed, ACCESS_HELPER, meta);
+ case PTR_TO_BTF_ID:
+ return check_ptr_to_btf_access(env, regs, regno, reg->off,
+ access_size, BPF_READ, -1);
+ case PTR_TO_CTX:
+ /* in case the function doesn't know how to access the context,
+ * (because we are in a program of type SYSCALL for example), we
+ * can not statically check its size.
+ * Dynamically check it now.
+ */
+ if (!env->ops->convert_ctx_access) {
+ enum bpf_access_type atype = meta && meta->raw_mode ? BPF_WRITE : BPF_READ;
+ int offset = access_size - 1;
+
+ /* Allow zero-byte read from PTR_TO_CTX */
+ if (access_size == 0)
+ return zero_size_allowed ? 0 : -EACCES;
+
+ return check_mem_access(env, env->insn_idx, regno, offset, BPF_B,
+ atype, -1, false);
+ }
+
+ fallthrough;
default: /* scalar_value or invalid ptr */
/* Allow zero-byte read from NULL, regardless of pointer type */
if (zero_size_allowed && access_size == 0 &&
register_is_null(reg))
return 0;
- verbose(env, "R%d type=%s expected=%s\n", regno,
- reg_type_str[reg->type],
- reg_type_str[PTR_TO_STACK]);
+ verbose(env, "R%d type=%s ", regno,
+ reg_type_str(env, reg->type));
+ verbose(env, "expected=%s\n", reg_type_str(env, PTR_TO_STACK));
return -EACCES;
}
}
+static int check_mem_size_reg(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg, u32 regno,
+ bool zero_size_allowed,
+ struct bpf_call_arg_meta *meta)
+{
+ int err;
+
+ /* This is used to refine r0 return value bounds for helpers
+ * that enforce this value as an upper bound on return values.
+ * See do_refine_retval_range() for helpers that can refine
+ * the return value. C type of helper is u32 so we pull register
+ * bound from umax_value however, if negative verifier errors
+ * out. Only upper bounds can be learned because retval is an
+ * int type and negative retvals are allowed.
+ */
+ meta->msize_max_value = reg->umax_value;
+
+ /* The register is SCALAR_VALUE; the access check
+ * happens using its boundaries.
+ */
+ if (!tnum_is_const(reg->var_off))
+ /* For unprivileged variable accesses, disable raw
+ * mode so that the program is required to
+ * initialize all the memory that the helper could
+ * just partially fill up.
+ */
+ meta = NULL;
+
+ if (reg->smin_value < 0) {
+ verbose(env, "R%d min value is negative, either use unsigned or 'var &= const'\n",
+ regno);
+ return -EACCES;
+ }
+
+ if (reg->umin_value == 0) {
+ err = check_helper_mem_access(env, regno - 1, 0,
+ zero_size_allowed,
+ meta);
+ if (err)
+ return err;
+ }
+
+ if (reg->umax_value >= BPF_MAX_VAR_SIZ) {
+ verbose(env, "R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n",
+ regno);
+ return -EACCES;
+ }
+ err = check_helper_mem_access(env, regno - 1,
+ reg->umax_value,
+ zero_size_allowed, meta);
+ if (!err)
+ err = mark_chain_precision(env, regno);
+ return err;
+}
+
int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
u32 regno, u32 mem_size)
{
+ bool may_be_null = type_may_be_null(reg->type);
+ struct bpf_reg_state saved_reg;
+ struct bpf_call_arg_meta meta;
+ int err;
+
if (register_is_null(reg))
return 0;
- if (reg_type_may_be_null(reg->type)) {
- /* Assuming that the register contains a value check if the memory
- * access is safe. Temporarily save and restore the register's state as
- * the conversion shouldn't be visible to a caller.
- */
- const struct bpf_reg_state saved_reg = *reg;
- int rv;
-
+ memset(&meta, 0, sizeof(meta));
+ /* Assuming that the register contains a value check if the memory
+ * access is safe. Temporarily save and restore the register's state as
+ * the conversion shouldn't be visible to a caller.
+ */
+ if (may_be_null) {
+ saved_reg = *reg;
mark_ptr_not_null_reg(reg);
- rv = check_helper_mem_access(env, regno, mem_size, true, NULL);
+ }
+
+ err = check_helper_mem_access(env, regno, mem_size, true, &meta);
+ /* Check access for BPF_WRITE */
+ meta.raw_mode = true;
+ err = err ?: check_helper_mem_access(env, regno, mem_size, true, &meta);
+
+ if (may_be_null)
*reg = saved_reg;
- return rv;
+
+ return err;
+}
+
+static int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+ u32 regno)
+{
+ struct bpf_reg_state *mem_reg = &cur_regs(env)[regno - 1];
+ bool may_be_null = type_may_be_null(mem_reg->type);
+ struct bpf_reg_state saved_reg;
+ struct bpf_call_arg_meta meta;
+ int err;
+
+ WARN_ON_ONCE(regno < BPF_REG_2 || regno > BPF_REG_5);
+
+ memset(&meta, 0, sizeof(meta));
+
+ if (may_be_null) {
+ saved_reg = *mem_reg;
+ mark_ptr_not_null_reg(mem_reg);
}
- return check_helper_mem_access(env, regno, mem_size, true, NULL);
+ err = check_mem_size_reg(env, reg, regno, true, &meta);
+ /* Check access for BPF_WRITE */
+ meta.raw_mode = true;
+ err = err ?: check_mem_size_reg(env, reg, regno, true, &meta);
+
+ if (may_be_null)
+ *mem_reg = saved_reg;
+ return err;
}
/* Implementation details:
- * bpf_map_lookup returns PTR_TO_MAP_VALUE_OR_NULL
+ * bpf_map_lookup returns PTR_TO_MAP_VALUE_OR_NULL.
+ * bpf_obj_new returns PTR_TO_BTF_ID | MEM_ALLOC | PTR_MAYBE_NULL.
* Two bpf_map_lookups (even with the same key) will have different reg->id.
- * For traditional PTR_TO_MAP_VALUE the verifier clears reg->id after
- * value_or_null->value transition, since the verifier only cares about
- * the range of access to valid map value pointer and doesn't care about actual
- * address of the map element.
+ * Two separate bpf_obj_new will also have different reg->id.
+ * For traditional PTR_TO_MAP_VALUE or PTR_TO_BTF_ID | MEM_ALLOC, the verifier
+ * clears reg->id after value_or_null->value transition, since the verifier only
+ * cares about the range of access to valid map value pointer and doesn't care
+ * about actual address of the map element.
* For maps with 'struct bpf_spin_lock' inside map value the verifier keeps
* reg->id > 0 after value_or_null->value transition. By doing so
* two bpf_map_lookups will be considered two different pointers that
- * point to different bpf_spin_locks.
+ * point to different bpf_spin_locks. Likewise for pointers to allocated objects
+ * returned from bpf_obj_new.
* The verifier allows taking only one bpf_spin_lock at a time to avoid
* dead-locks.
* Since only one bpf_spin_lock is allowed the checks are simpler than
* reg_is_refcounted() logic. The verifier needs to remember only
* one spin_lock instead of array of acquired_refs.
- * cur_state->active_spin_lock remembers which map value element got locked
- * and clears it after bpf_spin_unlock.
+ * cur_state->active_lock remembers which map value element or allocated
+ * object got locked and clears it after bpf_spin_unlock.
*/
static int process_spin_lock(struct bpf_verifier_env *env, int regno,
bool is_lock)
@@ -4600,8 +6504,10 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno,
struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
struct bpf_verifier_state *cur = env->cur_state;
bool is_const = tnum_is_const(reg->var_off);
- struct bpf_map *map = reg->map_ptr;
u64 val = reg->var_off.value;
+ struct bpf_map *map = NULL;
+ struct btf *btf = NULL;
+ struct btf_record *rec;
if (!is_const) {
verbose(env,
@@ -4609,58 +6515,435 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno,
regno);
return -EINVAL;
}
- if (!map->btf) {
- verbose(env,
- "map '%s' has to have BTF in order to use bpf_spin_lock\n",
- map->name);
- return -EINVAL;
- }
- if (!map_value_has_spin_lock(map)) {
- if (map->spin_lock_off == -E2BIG)
+ if (reg->type == PTR_TO_MAP_VALUE) {
+ map = reg->map_ptr;
+ if (!map->btf) {
verbose(env,
- "map '%s' has more than one 'struct bpf_spin_lock'\n",
- map->name);
- else if (map->spin_lock_off == -ENOENT)
- verbose(env,
- "map '%s' doesn't have 'struct bpf_spin_lock'\n",
- map->name);
- else
- verbose(env,
- "map '%s' is not a struct type or bpf_spin_lock is mangled\n",
+ "map '%s' has to have BTF in order to use bpf_spin_lock\n",
map->name);
+ return -EINVAL;
+ }
+ } else {
+ btf = reg->btf;
+ }
+
+ rec = reg_btf_record(reg);
+ if (!btf_record_has_field(rec, BPF_SPIN_LOCK)) {
+ verbose(env, "%s '%s' has no valid bpf_spin_lock\n", map ? "map" : "local",
+ map ? map->name : "kptr");
return -EINVAL;
}
- if (map->spin_lock_off != val + reg->off) {
- verbose(env, "off %lld doesn't point to 'struct bpf_spin_lock'\n",
- val + reg->off);
+ if (rec->spin_lock_off != val + reg->off) {
+ verbose(env, "off %lld doesn't point to 'struct bpf_spin_lock' that is at %d\n",
+ val + reg->off, rec->spin_lock_off);
return -EINVAL;
}
if (is_lock) {
- if (cur->active_spin_lock) {
+ if (cur->active_lock.ptr) {
verbose(env,
"Locking two bpf_spin_locks are not allowed\n");
return -EINVAL;
}
- cur->active_spin_lock = reg->id;
+ if (map)
+ cur->active_lock.ptr = map;
+ else
+ cur->active_lock.ptr = btf;
+ cur->active_lock.id = reg->id;
} else {
- if (!cur->active_spin_lock) {
+ void *ptr;
+
+ if (map)
+ ptr = map;
+ else
+ ptr = btf;
+
+ if (!cur->active_lock.ptr) {
verbose(env, "bpf_spin_unlock without taking a lock\n");
return -EINVAL;
}
- if (cur->active_spin_lock != reg->id) {
+ if (cur->active_lock.ptr != ptr ||
+ cur->active_lock.id != reg->id) {
verbose(env, "bpf_spin_unlock of different lock\n");
return -EINVAL;
}
- cur->active_spin_lock = 0;
+
+ invalidate_non_owning_refs(env);
+
+ cur->active_lock.ptr = NULL;
+ cur->active_lock.id = 0;
}
return 0;
}
-static bool arg_type_is_mem_ptr(enum bpf_arg_type type)
+static int process_timer_func(struct bpf_verifier_env *env, int regno,
+ struct bpf_call_arg_meta *meta)
{
- return type == ARG_PTR_TO_MEM ||
- type == ARG_PTR_TO_MEM_OR_NULL ||
- type == ARG_PTR_TO_UNINIT_MEM;
+ struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+ bool is_const = tnum_is_const(reg->var_off);
+ struct bpf_map *map = reg->map_ptr;
+ u64 val = reg->var_off.value;
+
+ if (!is_const) {
+ verbose(env,
+ "R%d doesn't have constant offset. bpf_timer has to be at the constant offset\n",
+ regno);
+ return -EINVAL;
+ }
+ if (!map->btf) {
+ verbose(env, "map '%s' has to have BTF in order to use bpf_timer\n",
+ map->name);
+ return -EINVAL;
+ }
+ if (!btf_record_has_field(map->record, BPF_TIMER)) {
+ verbose(env, "map '%s' has no valid bpf_timer\n", map->name);
+ return -EINVAL;
+ }
+ if (map->record->timer_off != val + reg->off) {
+ verbose(env, "off %lld doesn't point to 'struct bpf_timer' that is at %d\n",
+ val + reg->off, map->record->timer_off);
+ return -EINVAL;
+ }
+ if (meta->map_ptr) {
+ verbose(env, "verifier bug. Two map pointers in a timer helper\n");
+ return -EFAULT;
+ }
+ meta->map_uid = reg->map_uid;
+ meta->map_ptr = map;
+ return 0;
+}
+
+static int process_kptr_func(struct bpf_verifier_env *env, int regno,
+ struct bpf_call_arg_meta *meta)
+{
+ struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+ struct bpf_map *map_ptr = reg->map_ptr;
+ struct btf_field *kptr_field;
+ u32 kptr_off;
+
+ if (!tnum_is_const(reg->var_off)) {
+ verbose(env,
+ "R%d doesn't have constant offset. kptr has to be at the constant offset\n",
+ regno);
+ return -EINVAL;
+ }
+ if (!map_ptr->btf) {
+ verbose(env, "map '%s' has to have BTF in order to use bpf_kptr_xchg\n",
+ map_ptr->name);
+ return -EINVAL;
+ }
+ if (!btf_record_has_field(map_ptr->record, BPF_KPTR)) {
+ verbose(env, "map '%s' has no valid kptr\n", map_ptr->name);
+ return -EINVAL;
+ }
+
+ meta->map_ptr = map_ptr;
+ kptr_off = reg->off + reg->var_off.value;
+ kptr_field = btf_record_find(map_ptr->record, kptr_off, BPF_KPTR);
+ if (!kptr_field) {
+ verbose(env, "off=%d doesn't point to kptr\n", kptr_off);
+ return -EACCES;
+ }
+ if (kptr_field->type != BPF_KPTR_REF) {
+ verbose(env, "off=%d kptr isn't referenced kptr\n", kptr_off);
+ return -EACCES;
+ }
+ meta->kptr_field = kptr_field;
+ return 0;
+}
+
+/* There are two register types representing a bpf_dynptr, one is PTR_TO_STACK
+ * which points to a stack slot, and the other is CONST_PTR_TO_DYNPTR.
+ *
+ * In both cases we deal with the first 8 bytes, but need to mark the next 8
+ * bytes as STACK_DYNPTR in case of PTR_TO_STACK. In case of
+ * CONST_PTR_TO_DYNPTR, we are guaranteed to get the beginning of the object.
+ *
+ * Mutability of bpf_dynptr is at two levels, one is at the level of struct
+ * bpf_dynptr itself, i.e. whether the helper is receiving a pointer to struct
+ * bpf_dynptr or pointer to const struct bpf_dynptr. In the former case, it can
+ * mutate the view of the dynptr and also possibly destroy it. In the latter
+ * case, it cannot mutate the bpf_dynptr itself but it can still mutate the
+ * memory that dynptr points to.
+ *
+ * The verifier will keep track both levels of mutation (bpf_dynptr's in
+ * reg->type and the memory's in reg->dynptr.type), but there is no support for
+ * readonly dynptr view yet, hence only the first case is tracked and checked.
+ *
+ * This is consistent with how C applies the const modifier to a struct object,
+ * where the pointer itself inside bpf_dynptr becomes const but not what it
+ * points to.
+ *
+ * Helpers which do not mutate the bpf_dynptr set MEM_RDONLY in their argument
+ * type, and declare it as 'const struct bpf_dynptr *' in their prototype.
+ */
+static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn_idx,
+ enum bpf_arg_type arg_type)
+{
+ struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+ int err;
+
+ /* MEM_UNINIT and MEM_RDONLY are exclusive, when applied to an
+ * ARG_PTR_TO_DYNPTR (or ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_*):
+ */
+ if ((arg_type & (MEM_UNINIT | MEM_RDONLY)) == (MEM_UNINIT | MEM_RDONLY)) {
+ verbose(env, "verifier internal error: misconfigured dynptr helper type flags\n");
+ return -EFAULT;
+ }
+
+ /* MEM_UNINIT - Points to memory that is an appropriate candidate for
+ * constructing a mutable bpf_dynptr object.
+ *
+ * Currently, this is only possible with PTR_TO_STACK
+ * pointing to a region of at least 16 bytes which doesn't
+ * contain an existing bpf_dynptr.
+ *
+ * MEM_RDONLY - Points to a initialized bpf_dynptr that will not be
+ * mutated or destroyed. However, the memory it points to
+ * may be mutated.
+ *
+ * None - Points to a initialized dynptr that can be mutated and
+ * destroyed, including mutation of the memory it points
+ * to.
+ */
+ if (arg_type & MEM_UNINIT) {
+ int i;
+
+ if (!is_dynptr_reg_valid_uninit(env, reg)) {
+ verbose(env, "Dynptr has to be an uninitialized dynptr\n");
+ return -EINVAL;
+ }
+
+ /* we write BPF_DW bits (8 bytes) at a time */
+ for (i = 0; i < BPF_DYNPTR_SIZE; i += 8) {
+ err = check_mem_access(env, insn_idx, regno,
+ i, BPF_DW, BPF_WRITE, -1, false);
+ if (err)
+ return err;
+ }
+
+ err = mark_stack_slots_dynptr(env, reg, arg_type, insn_idx);
+ } else /* MEM_RDONLY and None case from above */ {
+ /* For the reg->type == PTR_TO_STACK case, bpf_dynptr is never const */
+ if (reg->type == CONST_PTR_TO_DYNPTR && !(arg_type & MEM_RDONLY)) {
+ verbose(env, "cannot pass pointer to const bpf_dynptr, the helper mutates it\n");
+ return -EINVAL;
+ }
+
+ if (!is_dynptr_reg_valid_init(env, reg)) {
+ verbose(env,
+ "Expected an initialized dynptr as arg #%d\n",
+ regno);
+ return -EINVAL;
+ }
+
+ /* Fold modifiers (in this case, MEM_RDONLY) when checking expected type */
+ if (!is_dynptr_type_expected(env, reg, arg_type & ~MEM_RDONLY)) {
+ verbose(env,
+ "Expected a dynptr of type %s as arg #%d\n",
+ dynptr_type_str(arg_to_dynptr_type(arg_type)), regno);
+ return -EINVAL;
+ }
+
+ err = mark_dynptr_read(env, reg);
+ }
+ return err;
+}
+
+static u32 iter_ref_obj_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int spi)
+{
+ struct bpf_func_state *state = func(env, reg);
+
+ return state->stack[spi].spilled_ptr.ref_obj_id;
+}
+
+static bool is_iter_kfunc(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->kfunc_flags & (KF_ITER_NEW | KF_ITER_NEXT | KF_ITER_DESTROY);
+}
+
+static bool is_iter_new_kfunc(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->kfunc_flags & KF_ITER_NEW;
+}
+
+static bool is_iter_next_kfunc(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->kfunc_flags & KF_ITER_NEXT;
+}
+
+static bool is_iter_destroy_kfunc(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->kfunc_flags & KF_ITER_DESTROY;
+}
+
+static bool is_kfunc_arg_iter(struct bpf_kfunc_call_arg_meta *meta, int arg)
+{
+ /* btf_check_iter_kfuncs() guarantees that first argument of any iter
+ * kfunc is iter state pointer
+ */
+ return arg == 0 && is_iter_kfunc(meta);
+}
+
+static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_idx,
+ struct bpf_kfunc_call_arg_meta *meta)
+{
+ struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+ const struct btf_type *t;
+ const struct btf_param *arg;
+ int spi, err, i, nr_slots;
+ u32 btf_id;
+
+ /* btf_check_iter_kfuncs() ensures we don't need to validate anything here */
+ arg = &btf_params(meta->func_proto)[0];
+ t = btf_type_skip_modifiers(meta->btf, arg->type, NULL); /* PTR */
+ t = btf_type_skip_modifiers(meta->btf, t->type, &btf_id); /* STRUCT */
+ nr_slots = t->size / BPF_REG_SIZE;
+
+ if (is_iter_new_kfunc(meta)) {
+ /* bpf_iter_<type>_new() expects pointer to uninit iter state */
+ if (!is_iter_reg_valid_uninit(env, reg, nr_slots)) {
+ verbose(env, "expected uninitialized iter_%s as arg #%d\n",
+ iter_type_str(meta->btf, btf_id), regno);
+ return -EINVAL;
+ }
+
+ for (i = 0; i < nr_slots * 8; i += BPF_REG_SIZE) {
+ err = check_mem_access(env, insn_idx, regno,
+ i, BPF_DW, BPF_WRITE, -1, false);
+ if (err)
+ return err;
+ }
+
+ err = mark_stack_slots_iter(env, reg, insn_idx, meta->btf, btf_id, nr_slots);
+ if (err)
+ return err;
+ } else {
+ /* iter_next() or iter_destroy() expect initialized iter state*/
+ if (!is_iter_reg_valid_init(env, reg, meta->btf, btf_id, nr_slots)) {
+ verbose(env, "expected an initialized iter_%s as arg #%d\n",
+ iter_type_str(meta->btf, btf_id), regno);
+ return -EINVAL;
+ }
+
+ spi = iter_get_spi(env, reg, nr_slots);
+ if (spi < 0)
+ return spi;
+
+ err = mark_iter_read(env, reg, spi, nr_slots);
+ if (err)
+ return err;
+
+ /* remember meta->iter info for process_iter_next_call() */
+ meta->iter.spi = spi;
+ meta->iter.frameno = reg->frameno;
+ meta->ref_obj_id = iter_ref_obj_id(env, reg, spi);
+
+ if (is_iter_destroy_kfunc(meta)) {
+ err = unmark_stack_slots_iter(env, reg, nr_slots);
+ if (err)
+ return err;
+ }
+ }
+
+ return 0;
+}
+
+/* process_iter_next_call() is called when verifier gets to iterator's next
+ * "method" (e.g., bpf_iter_num_next() for numbers iterator) call. We'll refer
+ * to it as just "iter_next()" in comments below.
+ *
+ * BPF verifier relies on a crucial contract for any iter_next()
+ * implementation: it should *eventually* return NULL, and once that happens
+ * it should keep returning NULL. That is, once iterator exhausts elements to
+ * iterate, it should never reset or spuriously return new elements.
+ *
+ * With the assumption of such contract, process_iter_next_call() simulates
+ * a fork in the verifier state to validate loop logic correctness and safety
+ * without having to simulate infinite amount of iterations.
+ *
+ * In current state, we first assume that iter_next() returned NULL and
+ * iterator state is set to DRAINED (BPF_ITER_STATE_DRAINED). In such
+ * conditions we should not form an infinite loop and should eventually reach
+ * exit.
+ *
+ * Besides that, we also fork current state and enqueue it for later
+ * verification. In a forked state we keep iterator state as ACTIVE
+ * (BPF_ITER_STATE_ACTIVE) and assume non-NULL return from iter_next(). We
+ * also bump iteration depth to prevent erroneous infinite loop detection
+ * later on (see iter_active_depths_differ() comment for details). In this
+ * state we assume that we'll eventually loop back to another iter_next()
+ * calls (it could be in exactly same location or in some other instruction,
+ * it doesn't matter, we don't make any unnecessary assumptions about this,
+ * everything revolves around iterator state in a stack slot, not which
+ * instruction is calling iter_next()). When that happens, we either will come
+ * to iter_next() with equivalent state and can conclude that next iteration
+ * will proceed in exactly the same way as we just verified, so it's safe to
+ * assume that loop converges. If not, we'll go on another iteration
+ * simulation with a different input state, until all possible starting states
+ * are validated or we reach maximum number of instructions limit.
+ *
+ * This way, we will either exhaustively discover all possible input states
+ * that iterator loop can start with and eventually will converge, or we'll
+ * effectively regress into bounded loop simulation logic and either reach
+ * maximum number of instructions if loop is not provably convergent, or there
+ * is some statically known limit on number of iterations (e.g., if there is
+ * an explicit `if n > 100 then break;` statement somewhere in the loop).
+ *
+ * One very subtle but very important aspect is that we *always* simulate NULL
+ * condition first (as the current state) before we simulate non-NULL case.
+ * This has to do with intricacies of scalar precision tracking. By simulating
+ * "exit condition" of iter_next() returning NULL first, we make sure all the
+ * relevant precision marks *that will be set **after** we exit iterator loop*
+ * are propagated backwards to common parent state of NULL and non-NULL
+ * branches. Thanks to that, state equivalence checks done later in forked
+ * state, when reaching iter_next() for ACTIVE iterator, can assume that
+ * precision marks are finalized and won't change. Because simulating another
+ * ACTIVE iterator iteration won't change them (because given same input
+ * states we'll end up with exactly same output states which we are currently
+ * comparing; and verification after the loop already propagated back what
+ * needs to be **additionally** tracked as precise). It's subtle, grok
+ * precision tracking for more intuitive understanding.
+ */
+static int process_iter_next_call(struct bpf_verifier_env *env, int insn_idx,
+ struct bpf_kfunc_call_arg_meta *meta)
+{
+ struct bpf_verifier_state *cur_st = env->cur_state, *queued_st;
+ struct bpf_func_state *cur_fr = cur_st->frame[cur_st->curframe], *queued_fr;
+ struct bpf_reg_state *cur_iter, *queued_iter;
+ int iter_frameno = meta->iter.frameno;
+ int iter_spi = meta->iter.spi;
+
+ BTF_TYPE_EMIT(struct bpf_iter);
+
+ cur_iter = &env->cur_state->frame[iter_frameno]->stack[iter_spi].spilled_ptr;
+
+ if (cur_iter->iter.state != BPF_ITER_STATE_ACTIVE &&
+ cur_iter->iter.state != BPF_ITER_STATE_DRAINED) {
+ verbose(env, "verifier internal error: unexpected iterator state %d (%s)\n",
+ cur_iter->iter.state, iter_state_str(cur_iter->iter.state));
+ return -EFAULT;
+ }
+
+ if (cur_iter->iter.state == BPF_ITER_STATE_ACTIVE) {
+ /* branch out active iter state */
+ queued_st = push_stack(env, insn_idx + 1, insn_idx, false);
+ if (!queued_st)
+ return -ENOMEM;
+
+ queued_iter = &queued_st->frame[iter_frameno]->stack[iter_spi].spilled_ptr;
+ queued_iter->iter.state = BPF_ITER_STATE_ACTIVE;
+ queued_iter->iter.depth++;
+
+ queued_fr = queued_st->frame[queued_st->curframe];
+ mark_ptr_not_null_reg(&queued_fr->regs[BPF_REG_0]);
+ }
+
+ /* switch to DRAINED state, but keep the depth unchanged */
+ /* mark current iter state as drained and assume returned NULL */
+ cur_iter->iter.state = BPF_ITER_STATE_DRAINED;
+ __mark_reg_const_zero(&cur_fr->regs[BPF_REG_0]);
+
+ return 0;
}
static bool arg_type_is_mem_size(enum bpf_arg_type type)
@@ -4669,15 +6952,14 @@ static bool arg_type_is_mem_size(enum bpf_arg_type type)
type == ARG_CONST_SIZE_OR_ZERO;
}
-static bool arg_type_is_alloc_size(enum bpf_arg_type type)
+static bool arg_type_is_release(enum bpf_arg_type type)
{
- return type == ARG_CONST_ALLOC_SIZE_OR_ZERO;
+ return type & OBJ_RELEASE;
}
-static bool arg_type_is_int_ptr(enum bpf_arg_type type)
+static bool arg_type_is_dynptr(enum bpf_arg_type type)
{
- return type == ARG_PTR_TO_INT ||
- type == ARG_PTR_TO_LONG;
+ return base_type(type) == ARG_PTR_TO_DYNPTR;
}
static int int_ptr_type_to_size(enum bpf_arg_type type)
@@ -4710,7 +6992,10 @@ static int resolve_map_arg_type(struct bpf_verifier_env *env,
return -EINVAL;
}
break;
-
+ case BPF_MAP_TYPE_BLOOM_FILTER:
+ if (meta->func_id == BPF_FUNC_map_peek_elem)
+ *arg_type = ARG_PTR_TO_MAP_VALUE;
+ break;
default:
break;
}
@@ -4722,16 +7007,6 @@ struct bpf_reg_types {
u32 *btf_id;
};
-static const struct bpf_reg_types map_key_value_types = {
- .types = {
- PTR_TO_STACK,
- PTR_TO_PACKET,
- PTR_TO_PACKET_META,
- PTR_TO_MAP_KEY,
- PTR_TO_MAP_VALUE,
- },
-};
-
static const struct bpf_reg_types sock_types = {
.types = {
PTR_TO_SOCK_COMMON,
@@ -4749,6 +7024,7 @@ static const struct bpf_reg_types btf_id_sock_common_types = {
PTR_TO_TCP_SOCK,
PTR_TO_XDP_SOCK,
PTR_TO_BTF_ID,
+ PTR_TO_BTF_ID | PTR_TRUSTED,
},
.btf_id = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON],
};
@@ -4762,8 +7038,9 @@ static const struct bpf_reg_types mem_types = {
PTR_TO_MAP_KEY,
PTR_TO_MAP_VALUE,
PTR_TO_MEM,
- PTR_TO_RDONLY_BUF,
- PTR_TO_RDWR_BUF,
+ PTR_TO_MEM | MEM_RINGBUF,
+ PTR_TO_BUF,
+ PTR_TO_BTF_ID | PTR_TRUSTED,
},
};
@@ -4777,65 +7054,105 @@ static const struct bpf_reg_types int_ptr_types = {
},
};
+static const struct bpf_reg_types spin_lock_types = {
+ .types = {
+ PTR_TO_MAP_VALUE,
+ PTR_TO_BTF_ID | MEM_ALLOC,
+ }
+};
+
static const struct bpf_reg_types fullsock_types = { .types = { PTR_TO_SOCKET } };
static const struct bpf_reg_types scalar_types = { .types = { SCALAR_VALUE } };
static const struct bpf_reg_types context_types = { .types = { PTR_TO_CTX } };
-static const struct bpf_reg_types alloc_mem_types = { .types = { PTR_TO_MEM } };
+static const struct bpf_reg_types ringbuf_mem_types = { .types = { PTR_TO_MEM | MEM_RINGBUF } };
static const struct bpf_reg_types const_map_ptr_types = { .types = { CONST_PTR_TO_MAP } };
-static const struct bpf_reg_types btf_ptr_types = { .types = { PTR_TO_BTF_ID } };
-static const struct bpf_reg_types spin_lock_types = { .types = { PTR_TO_MAP_VALUE } };
-static const struct bpf_reg_types percpu_btf_ptr_types = { .types = { PTR_TO_PERCPU_BTF_ID } };
+static const struct bpf_reg_types btf_ptr_types = {
+ .types = {
+ PTR_TO_BTF_ID,
+ PTR_TO_BTF_ID | PTR_TRUSTED,
+ PTR_TO_BTF_ID | MEM_RCU,
+ },
+};
+static const struct bpf_reg_types percpu_btf_ptr_types = {
+ .types = {
+ PTR_TO_BTF_ID | MEM_PERCPU,
+ PTR_TO_BTF_ID | MEM_PERCPU | PTR_TRUSTED,
+ }
+};
static const struct bpf_reg_types func_ptr_types = { .types = { PTR_TO_FUNC } };
static const struct bpf_reg_types stack_ptr_types = { .types = { PTR_TO_STACK } };
static const struct bpf_reg_types const_str_ptr_types = { .types = { PTR_TO_MAP_VALUE } };
+static const struct bpf_reg_types timer_types = { .types = { PTR_TO_MAP_VALUE } };
+static const struct bpf_reg_types kptr_types = { .types = { PTR_TO_MAP_VALUE } };
+static const struct bpf_reg_types dynptr_types = {
+ .types = {
+ PTR_TO_STACK,
+ CONST_PTR_TO_DYNPTR,
+ }
+};
static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = {
- [ARG_PTR_TO_MAP_KEY] = &map_key_value_types,
- [ARG_PTR_TO_MAP_VALUE] = &map_key_value_types,
- [ARG_PTR_TO_UNINIT_MAP_VALUE] = &map_key_value_types,
- [ARG_PTR_TO_MAP_VALUE_OR_NULL] = &map_key_value_types,
+ [ARG_PTR_TO_MAP_KEY] = &mem_types,
+ [ARG_PTR_TO_MAP_VALUE] = &mem_types,
[ARG_CONST_SIZE] = &scalar_types,
[ARG_CONST_SIZE_OR_ZERO] = &scalar_types,
[ARG_CONST_ALLOC_SIZE_OR_ZERO] = &scalar_types,
[ARG_CONST_MAP_PTR] = &const_map_ptr_types,
[ARG_PTR_TO_CTX] = &context_types,
- [ARG_PTR_TO_CTX_OR_NULL] = &context_types,
[ARG_PTR_TO_SOCK_COMMON] = &sock_types,
#ifdef CONFIG_NET
[ARG_PTR_TO_BTF_ID_SOCK_COMMON] = &btf_id_sock_common_types,
#endif
[ARG_PTR_TO_SOCKET] = &fullsock_types,
- [ARG_PTR_TO_SOCKET_OR_NULL] = &fullsock_types,
[ARG_PTR_TO_BTF_ID] = &btf_ptr_types,
[ARG_PTR_TO_SPIN_LOCK] = &spin_lock_types,
[ARG_PTR_TO_MEM] = &mem_types,
- [ARG_PTR_TO_MEM_OR_NULL] = &mem_types,
- [ARG_PTR_TO_UNINIT_MEM] = &mem_types,
- [ARG_PTR_TO_ALLOC_MEM] = &alloc_mem_types,
- [ARG_PTR_TO_ALLOC_MEM_OR_NULL] = &alloc_mem_types,
+ [ARG_PTR_TO_RINGBUF_MEM] = &ringbuf_mem_types,
[ARG_PTR_TO_INT] = &int_ptr_types,
[ARG_PTR_TO_LONG] = &int_ptr_types,
[ARG_PTR_TO_PERCPU_BTF_ID] = &percpu_btf_ptr_types,
[ARG_PTR_TO_FUNC] = &func_ptr_types,
- [ARG_PTR_TO_STACK_OR_NULL] = &stack_ptr_types,
+ [ARG_PTR_TO_STACK] = &stack_ptr_types,
[ARG_PTR_TO_CONST_STR] = &const_str_ptr_types,
+ [ARG_PTR_TO_TIMER] = &timer_types,
+ [ARG_PTR_TO_KPTR] = &kptr_types,
+ [ARG_PTR_TO_DYNPTR] = &dynptr_types,
};
static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
enum bpf_arg_type arg_type,
- const u32 *arg_btf_id)
+ const u32 *arg_btf_id,
+ struct bpf_call_arg_meta *meta)
{
struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
enum bpf_reg_type expected, type = reg->type;
const struct bpf_reg_types *compatible;
int i, j;
- compatible = compatible_reg_types[arg_type];
+ compatible = compatible_reg_types[base_type(arg_type)];
if (!compatible) {
verbose(env, "verifier internal error: unsupported arg type %d\n", arg_type);
return -EFAULT;
}
+ /* ARG_PTR_TO_MEM + RDONLY is compatible with PTR_TO_MEM and PTR_TO_MEM + RDONLY,
+ * but ARG_PTR_TO_MEM is compatible only with PTR_TO_MEM and NOT with PTR_TO_MEM + RDONLY
+ *
+ * Same for MAYBE_NULL:
+ *
+ * ARG_PTR_TO_MEM + MAYBE_NULL is compatible with PTR_TO_MEM and PTR_TO_MEM + MAYBE_NULL,
+ * but ARG_PTR_TO_MEM is compatible only with PTR_TO_MEM but NOT with PTR_TO_MEM + MAYBE_NULL
+ *
+ * Therefore we fold these flags depending on the arg_type before comparison.
+ */
+ if (arg_type & MEM_RDONLY)
+ type &= ~MEM_RDONLY;
+ if (arg_type & PTR_MAYBE_NULL)
+ type &= ~PTR_MAYBE_NULL;
+
+ if (meta->func_id == BPF_FUNC_kptr_xchg && type & MEM_ALLOC)
+ type &= ~MEM_ALLOC;
+
for (i = 0; i < ARRAY_SIZE(compatible->types); i++) {
expected = compatible->types[i];
if (expected == NOT_INIT)
@@ -4845,14 +7162,47 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
goto found;
}
- verbose(env, "R%d type=%s expected=", regno, reg_type_str[type]);
+ verbose(env, "R%d type=%s expected=", regno, reg_type_str(env, reg->type));
for (j = 0; j + 1 < i; j++)
- verbose(env, "%s, ", reg_type_str[compatible->types[j]]);
- verbose(env, "%s\n", reg_type_str[compatible->types[j]]);
+ verbose(env, "%s, ", reg_type_str(env, compatible->types[j]));
+ verbose(env, "%s\n", reg_type_str(env, compatible->types[j]));
return -EACCES;
found:
- if (type == PTR_TO_BTF_ID) {
+ if (base_type(reg->type) != PTR_TO_BTF_ID)
+ return 0;
+
+ if (compatible == &mem_types) {
+ if (!(arg_type & MEM_RDONLY)) {
+ verbose(env,
+ "%s() may write into memory pointed by R%d type=%s\n",
+ func_id_name(meta->func_id),
+ regno, reg_type_str(env, reg->type));
+ return -EACCES;
+ }
+ return 0;
+ }
+
+ switch ((int)reg->type) {
+ case PTR_TO_BTF_ID:
+ case PTR_TO_BTF_ID | PTR_TRUSTED:
+ case PTR_TO_BTF_ID | MEM_RCU:
+ case PTR_TO_BTF_ID | PTR_MAYBE_NULL:
+ case PTR_TO_BTF_ID | PTR_MAYBE_NULL | MEM_RCU:
+ {
+ /* For bpf_sk_release, it needs to match against first member
+ * 'struct sock_common', hence make an exception for it. This
+ * allows bpf_sk_release to work for multiple socket types.
+ */
+ bool strict_type_match = arg_type_is_release(arg_type) &&
+ meta->func_id != BPF_FUNC_sk_release;
+
+ if (type_may_be_null(reg->type) &&
+ (!type_may_be_null(arg_type) || arg_type_is_release(arg_type))) {
+ verbose(env, "Possibly NULL pointer passed to helper arg%d\n", regno);
+ return -EACCES;
+ }
+
if (!arg_btf_id) {
if (!compatible->btf_id) {
verbose(env, "verifier internal error: missing arg compatible BTF ID\n");
@@ -4861,32 +7211,220 @@ found:
arg_btf_id = compatible->btf_id;
}
- if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off,
- btf_vmlinux, *arg_btf_id)) {
- verbose(env, "R%d is of type %s but %s is expected\n",
- regno, kernel_type_name(reg->btf, reg->btf_id),
- kernel_type_name(btf_vmlinux, *arg_btf_id));
- return -EACCES;
+ if (meta->func_id == BPF_FUNC_kptr_xchg) {
+ if (map_kptr_match_type(env, meta->kptr_field, reg, regno))
+ return -EACCES;
+ } else {
+ if (arg_btf_id == BPF_PTR_POISON) {
+ verbose(env, "verifier internal error:");
+ verbose(env, "R%d has non-overwritten BPF_PTR_POISON type\n",
+ regno);
+ return -EACCES;
+ }
+
+ if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off,
+ btf_vmlinux, *arg_btf_id,
+ strict_type_match)) {
+ verbose(env, "R%d is of type %s but %s is expected\n",
+ regno, btf_type_name(reg->btf, reg->btf_id),
+ btf_type_name(btf_vmlinux, *arg_btf_id));
+ return -EACCES;
+ }
+ }
+ break;
+ }
+ case PTR_TO_BTF_ID | MEM_ALLOC:
+ if (meta->func_id != BPF_FUNC_spin_lock && meta->func_id != BPF_FUNC_spin_unlock &&
+ meta->func_id != BPF_FUNC_kptr_xchg) {
+ verbose(env, "verifier internal error: unimplemented handling of MEM_ALLOC\n");
+ return -EFAULT;
}
+ /* Handled by helper specific checks */
+ break;
+ case PTR_TO_BTF_ID | MEM_PERCPU:
+ case PTR_TO_BTF_ID | MEM_PERCPU | PTR_TRUSTED:
+ /* Handled by helper specific checks */
+ break;
+ default:
+ verbose(env, "verifier internal error: invalid PTR_TO_BTF_ID register for type match\n");
+ return -EFAULT;
+ }
+ return 0;
+}
+
+static struct btf_field *
+reg_find_field_offset(const struct bpf_reg_state *reg, s32 off, u32 fields)
+{
+ struct btf_field *field;
+ struct btf_record *rec;
- if (!tnum_is_const(reg->var_off) || reg->var_off.value) {
- verbose(env, "R%d is a pointer to in-kernel struct with non-zero offset\n",
+ rec = reg_btf_record(reg);
+ if (!rec)
+ return NULL;
+
+ field = btf_record_find(rec, off, fields);
+ if (!field)
+ return NULL;
+
+ return field;
+}
+
+int check_func_arg_reg_off(struct bpf_verifier_env *env,
+ const struct bpf_reg_state *reg, int regno,
+ enum bpf_arg_type arg_type)
+{
+ u32 type = reg->type;
+
+ /* When referenced register is passed to release function, its fixed
+ * offset must be 0.
+ *
+ * We will check arg_type_is_release reg has ref_obj_id when storing
+ * meta->release_regno.
+ */
+ if (arg_type_is_release(arg_type)) {
+ /* ARG_PTR_TO_DYNPTR with OBJ_RELEASE is a bit special, as it
+ * may not directly point to the object being released, but to
+ * dynptr pointing to such object, which might be at some offset
+ * on the stack. In that case, we simply to fallback to the
+ * default handling.
+ */
+ if (arg_type_is_dynptr(arg_type) && type == PTR_TO_STACK)
+ return 0;
+
+ if ((type_is_ptr_alloc_obj(type) || type_is_non_owning_ref(type)) && reg->off) {
+ if (reg_find_field_offset(reg, reg->off, BPF_GRAPH_NODE_OR_ROOT))
+ return __check_ptr_off_reg(env, reg, regno, true);
+
+ verbose(env, "R%d must have zero offset when passed to release func\n",
regno);
- return -EACCES;
+ verbose(env, "No graph node or root found at R%d type:%s off:%d\n", regno,
+ btf_type_name(reg->btf, reg->btf_id), reg->off);
+ return -EINVAL;
+ }
+
+ /* Doing check_ptr_off_reg check for the offset will catch this
+ * because fixed_off_ok is false, but checking here allows us
+ * to give the user a better error message.
+ */
+ if (reg->off) {
+ verbose(env, "R%d must have zero offset when passed to release func or trusted arg to kfunc\n",
+ regno);
+ return -EINVAL;
}
+ return __check_ptr_off_reg(env, reg, regno, false);
}
- return 0;
+ switch (type) {
+ /* Pointer types where both fixed and variable offset is explicitly allowed: */
+ case PTR_TO_STACK:
+ case PTR_TO_PACKET:
+ case PTR_TO_PACKET_META:
+ case PTR_TO_MAP_KEY:
+ case PTR_TO_MAP_VALUE:
+ case PTR_TO_MEM:
+ case PTR_TO_MEM | MEM_RDONLY:
+ case PTR_TO_MEM | MEM_RINGBUF:
+ case PTR_TO_BUF:
+ case PTR_TO_BUF | MEM_RDONLY:
+ case SCALAR_VALUE:
+ return 0;
+ /* All the rest must be rejected, except PTR_TO_BTF_ID which allows
+ * fixed offset.
+ */
+ case PTR_TO_BTF_ID:
+ case PTR_TO_BTF_ID | MEM_ALLOC:
+ case PTR_TO_BTF_ID | PTR_TRUSTED:
+ case PTR_TO_BTF_ID | MEM_RCU:
+ case PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF:
+ /* When referenced PTR_TO_BTF_ID is passed to release function,
+ * its fixed offset must be 0. In the other cases, fixed offset
+ * can be non-zero. This was already checked above. So pass
+ * fixed_off_ok as true to allow fixed offset for all other
+ * cases. var_off always must be 0 for PTR_TO_BTF_ID, hence we
+ * still need to do checks instead of returning.
+ */
+ return __check_ptr_off_reg(env, reg, regno, true);
+ default:
+ return __check_ptr_off_reg(env, reg, regno, false);
+ }
+}
+
+static struct bpf_reg_state *get_dynptr_arg_reg(struct bpf_verifier_env *env,
+ const struct bpf_func_proto *fn,
+ struct bpf_reg_state *regs)
+{
+ struct bpf_reg_state *state = NULL;
+ int i;
+
+ for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++)
+ if (arg_type_is_dynptr(fn->arg_type[i])) {
+ if (state) {
+ verbose(env, "verifier internal error: multiple dynptr args\n");
+ return NULL;
+ }
+ state = &regs[BPF_REG_1 + i];
+ }
+
+ if (!state)
+ verbose(env, "verifier internal error: no dynptr arg found\n");
+
+ return state;
+}
+
+static int dynptr_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+ struct bpf_func_state *state = func(env, reg);
+ int spi;
+
+ if (reg->type == CONST_PTR_TO_DYNPTR)
+ return reg->id;
+ spi = dynptr_get_spi(env, reg);
+ if (spi < 0)
+ return spi;
+ return state->stack[spi].spilled_ptr.id;
+}
+
+static int dynptr_ref_obj_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+ struct bpf_func_state *state = func(env, reg);
+ int spi;
+
+ if (reg->type == CONST_PTR_TO_DYNPTR)
+ return reg->ref_obj_id;
+ spi = dynptr_get_spi(env, reg);
+ if (spi < 0)
+ return spi;
+ return state->stack[spi].spilled_ptr.ref_obj_id;
+}
+
+static enum bpf_dynptr_type dynptr_get_type(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg)
+{
+ struct bpf_func_state *state = func(env, reg);
+ int spi;
+
+ if (reg->type == CONST_PTR_TO_DYNPTR)
+ return reg->dynptr.type;
+
+ spi = __get_spi(reg->off);
+ if (spi < 0) {
+ verbose(env, "verifier internal error: invalid spi when querying dynptr type\n");
+ return BPF_DYNPTR_TYPE_INVALID;
+ }
+
+ return state->stack[spi].spilled_ptr.dynptr.type;
}
static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
struct bpf_call_arg_meta *meta,
- const struct bpf_func_proto *fn)
+ const struct bpf_func_proto *fn,
+ int insn_idx)
{
u32 regno = BPF_REG_1 + arg;
struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
enum bpf_arg_type arg_type = fn->arg_type[arg];
enum bpf_reg_type type = reg->type;
+ u32 *arg_btf_id = NULL;
int err = 0;
if (arg_type == ARG_DONTCARE)
@@ -4911,31 +7449,64 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
return -EACCES;
}
- if (arg_type == ARG_PTR_TO_MAP_VALUE ||
- arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE ||
- arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL) {
+ if (base_type(arg_type) == ARG_PTR_TO_MAP_VALUE) {
err = resolve_map_arg_type(env, meta, &arg_type);
if (err)
return err;
}
- if (register_is_null(reg) && arg_type_may_be_null(arg_type))
+ if (register_is_null(reg) && type_may_be_null(arg_type))
/* A NULL register has a SCALAR_VALUE type, so skip
* type checking.
*/
goto skip_type_check;
- err = check_reg_type(env, regno, arg_type, fn->arg_btf_id[arg]);
+ /* arg_btf_id and arg_size are in a union. */
+ if (base_type(arg_type) == ARG_PTR_TO_BTF_ID ||
+ base_type(arg_type) == ARG_PTR_TO_SPIN_LOCK)
+ arg_btf_id = fn->arg_btf_id[arg];
+
+ err = check_reg_type(env, regno, arg_type, arg_btf_id, meta);
if (err)
return err;
- if (type == PTR_TO_CTX) {
- err = check_ctx_reg(env, reg, regno);
- if (err < 0)
- return err;
- }
+ err = check_func_arg_reg_off(env, reg, regno, arg_type);
+ if (err)
+ return err;
skip_type_check:
+ if (arg_type_is_release(arg_type)) {
+ if (arg_type_is_dynptr(arg_type)) {
+ struct bpf_func_state *state = func(env, reg);
+ int spi;
+
+ /* Only dynptr created on stack can be released, thus
+ * the get_spi and stack state checks for spilled_ptr
+ * should only be done before process_dynptr_func for
+ * PTR_TO_STACK.
+ */
+ if (reg->type == PTR_TO_STACK) {
+ spi = dynptr_get_spi(env, reg);
+ if (spi < 0 || !state->stack[spi].spilled_ptr.ref_obj_id) {
+ verbose(env, "arg %d is an unacquired reference\n", regno);
+ return -EINVAL;
+ }
+ } else {
+ verbose(env, "cannot release unowned const bpf_dynptr\n");
+ return -EINVAL;
+ }
+ } else if (!reg->ref_obj_id && !register_is_null(reg)) {
+ verbose(env, "R%d must be referenced when passed to release function\n",
+ regno);
+ return -EINVAL;
+ }
+ if (meta->release_regno) {
+ verbose(env, "verifier internal error: more than one release argument\n");
+ return -EFAULT;
+ }
+ meta->release_regno = regno;
+ }
+
if (reg->ref_obj_id) {
if (meta->ref_obj_id) {
verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n",
@@ -4946,10 +7517,34 @@ skip_type_check:
meta->ref_obj_id = reg->ref_obj_id;
}
- if (arg_type == ARG_CONST_MAP_PTR) {
+ switch (base_type(arg_type)) {
+ case ARG_CONST_MAP_PTR:
/* bpf_map_xxx(map_ptr) call: remember that map_ptr */
+ if (meta->map_ptr) {
+ /* Use map_uid (which is unique id of inner map) to reject:
+ * inner_map1 = bpf_map_lookup_elem(outer_map, key1)
+ * inner_map2 = bpf_map_lookup_elem(outer_map, key2)
+ * if (inner_map1 && inner_map2) {
+ * timer = bpf_map_lookup_elem(inner_map1);
+ * if (timer)
+ * // mismatch would have been allowed
+ * bpf_timer_init(timer, inner_map2);
+ * }
+ *
+ * Comparing map_ptr is enough to distinguish normal and outer maps.
+ */
+ if (meta->map_ptr != reg->map_ptr ||
+ meta->map_uid != reg->map_uid) {
+ verbose(env,
+ "timer pointer in R1 map_uid=%d doesn't match map pointer in R2 map_uid=%d\n",
+ meta->map_uid, reg->map_uid);
+ return -EINVAL;
+ }
+ }
meta->map_ptr = reg->map_ptr;
- } else if (arg_type == ARG_PTR_TO_MAP_KEY) {
+ meta->map_uid = reg->map_uid;
+ break;
+ case ARG_PTR_TO_MAP_KEY:
/* bpf_map_xxx(..., map_ptr, ..., key) call:
* check that [key, key + map->key_size) are within
* stack limits and initialized
@@ -4966,10 +7561,11 @@ skip_type_check:
err = check_helper_mem_access(env, regno,
meta->map_ptr->key_size, false,
NULL);
- } else if (arg_type == ARG_PTR_TO_MAP_VALUE ||
- (arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL &&
- !register_is_null(reg)) ||
- arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE) {
+ break;
+ case ARG_PTR_TO_MAP_VALUE:
+ if (type_may_be_null(arg_type) && register_is_null(reg))
+ return 0;
+
/* bpf_map_xxx(..., map_ptr, ..., value) call:
* check [value, value + map->value_size) validity
*/
@@ -4978,98 +7574,91 @@ skip_type_check:
verbose(env, "invalid map_ptr to access map->value\n");
return -EACCES;
}
- meta->raw_mode = (arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE);
+ meta->raw_mode = arg_type & MEM_UNINIT;
err = check_helper_mem_access(env, regno,
meta->map_ptr->value_size, false,
meta);
- } else if (arg_type == ARG_PTR_TO_PERCPU_BTF_ID) {
+ break;
+ case ARG_PTR_TO_PERCPU_BTF_ID:
if (!reg->btf_id) {
verbose(env, "Helper has invalid btf_id in R%d\n", regno);
return -EACCES;
}
meta->ret_btf = reg->btf;
meta->ret_btf_id = reg->btf_id;
- } else if (arg_type == ARG_PTR_TO_SPIN_LOCK) {
+ break;
+ case ARG_PTR_TO_SPIN_LOCK:
+ if (in_rbtree_lock_required_cb(env)) {
+ verbose(env, "can't spin_{lock,unlock} in rbtree cb\n");
+ return -EACCES;
+ }
if (meta->func_id == BPF_FUNC_spin_lock) {
- if (process_spin_lock(env, regno, true))
- return -EACCES;
+ err = process_spin_lock(env, regno, true);
+ if (err)
+ return err;
} else if (meta->func_id == BPF_FUNC_spin_unlock) {
- if (process_spin_lock(env, regno, false))
- return -EACCES;
+ err = process_spin_lock(env, regno, false);
+ if (err)
+ return err;
} else {
verbose(env, "verifier internal error\n");
return -EFAULT;
}
- } else if (arg_type == ARG_PTR_TO_FUNC) {
+ break;
+ case ARG_PTR_TO_TIMER:
+ err = process_timer_func(env, regno, meta);
+ if (err)
+ return err;
+ break;
+ case ARG_PTR_TO_FUNC:
meta->subprogno = reg->subprogno;
- } else if (arg_type_is_mem_ptr(arg_type)) {
+ break;
+ case ARG_PTR_TO_MEM:
/* The access to this pointer is only checked when we hit the
* next is_mem_size argument below.
*/
- meta->raw_mode = (arg_type == ARG_PTR_TO_UNINIT_MEM);
- } else if (arg_type_is_mem_size(arg_type)) {
- bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO);
-
- /* This is used to refine r0 return value bounds for helpers
- * that enforce this value as an upper bound on return values.
- * See do_refine_retval_range() for helpers that can refine
- * the return value. C type of helper is u32 so we pull register
- * bound from umax_value however, if negative verifier errors
- * out. Only upper bounds can be learned because retval is an
- * int type and negative retvals are allowed.
- */
- meta->msize_max_value = reg->umax_value;
-
- /* The register is SCALAR_VALUE; the access check
- * happens using its boundaries.
- */
- if (!tnum_is_const(reg->var_off))
- /* For unprivileged variable accesses, disable raw
- * mode so that the program is required to
- * initialize all the memory that the helper could
- * just partially fill up.
- */
- meta = NULL;
-
- if (reg->smin_value < 0) {
- verbose(env, "R%d min value is negative, either use unsigned or 'var &= const'\n",
- regno);
- return -EACCES;
- }
-
- if (reg->umin_value == 0) {
- err = check_helper_mem_access(env, regno - 1, 0,
- zero_size_allowed,
+ meta->raw_mode = arg_type & MEM_UNINIT;
+ if (arg_type & MEM_FIXED_SIZE) {
+ err = check_helper_mem_access(env, regno,
+ fn->arg_size[arg], false,
meta);
- if (err)
- return err;
- }
-
- if (reg->umax_value >= BPF_MAX_VAR_SIZ) {
- verbose(env, "R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n",
- regno);
- return -EACCES;
}
- err = check_helper_mem_access(env, regno - 1,
- reg->umax_value,
- zero_size_allowed, meta);
- if (!err)
- err = mark_chain_precision(env, regno);
- } else if (arg_type_is_alloc_size(arg_type)) {
+ break;
+ case ARG_CONST_SIZE:
+ err = check_mem_size_reg(env, reg, regno, false, meta);
+ break;
+ case ARG_CONST_SIZE_OR_ZERO:
+ err = check_mem_size_reg(env, reg, regno, true, meta);
+ break;
+ case ARG_PTR_TO_DYNPTR:
+ err = process_dynptr_func(env, regno, insn_idx, arg_type);
+ if (err)
+ return err;
+ break;
+ case ARG_CONST_ALLOC_SIZE_OR_ZERO:
if (!tnum_is_const(reg->var_off)) {
verbose(env, "R%d is not a known constant'\n",
regno);
return -EACCES;
}
meta->mem_size = reg->var_off.value;
- } else if (arg_type_is_int_ptr(arg_type)) {
+ err = mark_chain_precision(env, regno);
+ if (err)
+ return err;
+ break;
+ case ARG_PTR_TO_INT:
+ case ARG_PTR_TO_LONG:
+ {
int size = int_ptr_type_to_size(arg_type);
err = check_helper_mem_access(env, regno, size, false, meta);
if (err)
return err;
err = check_ptr_alignment(env, reg, 0, size, true);
- } else if (arg_type == ARG_PTR_TO_CONST_STR) {
+ break;
+ }
+ case ARG_PTR_TO_CONST_STR:
+ {
struct bpf_map *map = reg->map_ptr;
int map_off;
u64 map_addr;
@@ -5091,7 +7680,8 @@ skip_type_check:
}
err = check_map_access(env, regno, reg->off,
- map->value_size - reg->off, false);
+ map->value_size - reg->off, false,
+ ACCESS_HELPER);
if (err)
return err;
@@ -5107,6 +7697,13 @@ skip_type_check:
verbose(env, "string is not zero-terminated\n");
return -EINVAL;
}
+ break;
+ }
+ case ARG_PTR_TO_KPTR:
+ err = process_kptr_func(env, regno, meta);
+ if (err)
+ return err;
+ break;
}
return err;
@@ -5146,7 +7743,8 @@ static bool may_update_sockmap(struct bpf_verifier_env *env, int func_id)
static bool allow_tail_call_in_subprogs(struct bpf_verifier_env *env)
{
- return env->prog->jit_requested && IS_ENABLED(CONFIG_X86_64);
+ return env->prog->jit_requested &&
+ bpf_jit_supports_subprog_tailcalls();
}
static int check_map_func_compatibility(struct bpf_verifier_env *env,
@@ -5172,9 +7770,14 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
case BPF_MAP_TYPE_RINGBUF:
if (func_id != BPF_FUNC_ringbuf_output &&
func_id != BPF_FUNC_ringbuf_reserve &&
- func_id != BPF_FUNC_ringbuf_submit &&
- func_id != BPF_FUNC_ringbuf_discard &&
- func_id != BPF_FUNC_ringbuf_query)
+ func_id != BPF_FUNC_ringbuf_query &&
+ func_id != BPF_FUNC_ringbuf_reserve_dynptr &&
+ func_id != BPF_FUNC_ringbuf_submit_dynptr &&
+ func_id != BPF_FUNC_ringbuf_discard_dynptr)
+ goto error;
+ break;
+ case BPF_MAP_TYPE_USER_RINGBUF:
+ if (func_id != BPF_FUNC_user_ringbuf_drain)
goto error;
break;
case BPF_MAP_TYPE_STACK_TRACE:
@@ -5247,17 +7850,31 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
break;
case BPF_MAP_TYPE_SK_STORAGE:
if (func_id != BPF_FUNC_sk_storage_get &&
- func_id != BPF_FUNC_sk_storage_delete)
+ func_id != BPF_FUNC_sk_storage_delete &&
+ func_id != BPF_FUNC_kptr_xchg)
goto error;
break;
case BPF_MAP_TYPE_INODE_STORAGE:
if (func_id != BPF_FUNC_inode_storage_get &&
- func_id != BPF_FUNC_inode_storage_delete)
+ func_id != BPF_FUNC_inode_storage_delete &&
+ func_id != BPF_FUNC_kptr_xchg)
goto error;
break;
case BPF_MAP_TYPE_TASK_STORAGE:
if (func_id != BPF_FUNC_task_storage_get &&
- func_id != BPF_FUNC_task_storage_delete)
+ func_id != BPF_FUNC_task_storage_delete &&
+ func_id != BPF_FUNC_kptr_xchg)
+ goto error;
+ break;
+ case BPF_MAP_TYPE_CGRP_STORAGE:
+ if (func_id != BPF_FUNC_cgrp_storage_get &&
+ func_id != BPF_FUNC_cgrp_storage_delete &&
+ func_id != BPF_FUNC_kptr_xchg)
+ goto error;
+ break;
+ case BPF_MAP_TYPE_BLOOM_FILTER:
+ if (func_id != BPF_FUNC_map_peek_elem &&
+ func_id != BPF_FUNC_map_push_elem)
goto error;
break;
default:
@@ -5282,6 +7899,19 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
if (map->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY)
goto error;
break;
+ case BPF_FUNC_ringbuf_output:
+ case BPF_FUNC_ringbuf_reserve:
+ case BPF_FUNC_ringbuf_query:
+ case BPF_FUNC_ringbuf_reserve_dynptr:
+ case BPF_FUNC_ringbuf_submit_dynptr:
+ case BPF_FUNC_ringbuf_discard_dynptr:
+ if (map->map_type != BPF_MAP_TYPE_RINGBUF)
+ goto error;
+ break;
+ case BPF_FUNC_user_ringbuf_drain:
+ if (map->map_type != BPF_MAP_TYPE_USER_RINGBUF)
+ goto error;
+ break;
case BPF_FUNC_get_stackid:
if (map->map_type != BPF_MAP_TYPE_STACK_TRACE)
goto error;
@@ -5321,13 +7951,24 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
map->map_type != BPF_MAP_TYPE_SOCKHASH)
goto error;
break;
- case BPF_FUNC_map_peek_elem:
case BPF_FUNC_map_pop_elem:
- case BPF_FUNC_map_push_elem:
if (map->map_type != BPF_MAP_TYPE_QUEUE &&
map->map_type != BPF_MAP_TYPE_STACK)
goto error;
break;
+ case BPF_FUNC_map_peek_elem:
+ case BPF_FUNC_map_push_elem:
+ if (map->map_type != BPF_MAP_TYPE_QUEUE &&
+ map->map_type != BPF_MAP_TYPE_STACK &&
+ map->map_type != BPF_MAP_TYPE_BLOOM_FILTER)
+ goto error;
+ break;
+ case BPF_FUNC_map_lookup_percpu_elem:
+ if (map->map_type != BPF_MAP_TYPE_PERCPU_ARRAY &&
+ map->map_type != BPF_MAP_TYPE_PERCPU_HASH &&
+ map->map_type != BPF_MAP_TYPE_LRU_PERCPU_HASH)
+ goto error;
+ break;
case BPF_FUNC_sk_storage_get:
case BPF_FUNC_sk_storage_delete:
if (map->map_type != BPF_MAP_TYPE_SK_STORAGE)
@@ -5343,6 +7984,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
if (map->map_type != BPF_MAP_TYPE_TASK_STORAGE)
goto error;
break;
+ case BPF_FUNC_cgrp_storage_get:
+ case BPF_FUNC_cgrp_storage_delete:
+ if (map->map_type != BPF_MAP_TYPE_CGRP_STORAGE)
+ goto error;
+ break;
default:
break;
}
@@ -5376,13 +8022,19 @@ static bool check_raw_mode_ok(const struct bpf_func_proto *fn)
return count <= 1;
}
-static bool check_args_pair_invalid(enum bpf_arg_type arg_curr,
- enum bpf_arg_type arg_next)
+static bool check_args_pair_invalid(const struct bpf_func_proto *fn, int arg)
{
- return (arg_type_is_mem_ptr(arg_curr) &&
- !arg_type_is_mem_size(arg_next)) ||
- (!arg_type_is_mem_ptr(arg_curr) &&
- arg_type_is_mem_size(arg_next));
+ bool is_fixed = fn->arg_type[arg] & MEM_FIXED_SIZE;
+ bool has_size = fn->arg_size[arg] != 0;
+ bool is_next_size = false;
+
+ if (arg + 1 < ARRAY_SIZE(fn->arg_type))
+ is_next_size = arg_type_is_mem_size(fn->arg_type[arg + 1]);
+
+ if (base_type(fn->arg_type[arg]) != ARG_PTR_TO_MEM)
+ return is_next_size;
+
+ return has_size == is_next_size || is_next_size == is_fixed;
}
static bool check_arg_pair_ok(const struct bpf_func_proto *fn)
@@ -5393,52 +8045,29 @@ static bool check_arg_pair_ok(const struct bpf_func_proto *fn)
* helper function specification.
*/
if (arg_type_is_mem_size(fn->arg1_type) ||
- arg_type_is_mem_ptr(fn->arg5_type) ||
- check_args_pair_invalid(fn->arg1_type, fn->arg2_type) ||
- check_args_pair_invalid(fn->arg2_type, fn->arg3_type) ||
- check_args_pair_invalid(fn->arg3_type, fn->arg4_type) ||
- check_args_pair_invalid(fn->arg4_type, fn->arg5_type))
+ check_args_pair_invalid(fn, 0) ||
+ check_args_pair_invalid(fn, 1) ||
+ check_args_pair_invalid(fn, 2) ||
+ check_args_pair_invalid(fn, 3) ||
+ check_args_pair_invalid(fn, 4))
return false;
return true;
}
-static bool check_refcount_ok(const struct bpf_func_proto *fn, int func_id)
-{
- int count = 0;
-
- if (arg_type_may_be_refcounted(fn->arg1_type))
- count++;
- if (arg_type_may_be_refcounted(fn->arg2_type))
- count++;
- if (arg_type_may_be_refcounted(fn->arg3_type))
- count++;
- if (arg_type_may_be_refcounted(fn->arg4_type))
- count++;
- if (arg_type_may_be_refcounted(fn->arg5_type))
- count++;
-
- /* A reference acquiring function cannot acquire
- * another refcounted ptr.
- */
- if (may_be_acquire_function(func_id) && count)
- return false;
-
- /* We only support one arg being unreferenced at the moment,
- * which is sufficient for the helper functions we have right now.
- */
- return count <= 1;
-}
-
static bool check_btf_id_ok(const struct bpf_func_proto *fn)
{
int i;
for (i = 0; i < ARRAY_SIZE(fn->arg_type); i++) {
- if (fn->arg_type[i] == ARG_PTR_TO_BTF_ID && !fn->arg_btf_id[i])
- return false;
-
- if (fn->arg_type[i] != ARG_PTR_TO_BTF_ID && fn->arg_btf_id[i])
+ if (base_type(fn->arg_type[i]) == ARG_PTR_TO_BTF_ID)
+ return !!fn->arg_btf_id[i];
+ if (base_type(fn->arg_type[i]) == ARG_PTR_TO_SPIN_LOCK)
+ return fn->arg_btf_id[i] == BPF_PTR_POISON;
+ if (base_type(fn->arg_type[i]) != ARG_PTR_TO_BTF_ID && fn->arg_btf_id[i] &&
+ /* arg_btf_id and arg_size are in a union. */
+ (base_type(fn->arg_type[i]) != ARG_PTR_TO_MEM ||
+ !(fn->arg_type[i] & MEM_FIXED_SIZE)))
return false;
}
@@ -5449,38 +8078,24 @@ static int check_func_proto(const struct bpf_func_proto *fn, int func_id)
{
return check_raw_mode_ok(fn) &&
check_arg_pair_ok(fn) &&
- check_btf_id_ok(fn) &&
- check_refcount_ok(fn, func_id) ? 0 : -EINVAL;
+ check_btf_id_ok(fn) ? 0 : -EINVAL;
}
/* Packet data might have moved, any old PTR_TO_PACKET[_META,_END]
* are now invalid, so turn them into unknown SCALAR_VALUE.
+ *
+ * This also applies to dynptr slices belonging to skb and xdp dynptrs,
+ * since these slices point to packet data.
*/
-static void __clear_all_pkt_pointers(struct bpf_verifier_env *env,
- struct bpf_func_state *state)
-{
- struct bpf_reg_state *regs = state->regs, *reg;
- int i;
-
- for (i = 0; i < MAX_BPF_REG; i++)
- if (reg_is_pkt_pointer_any(&regs[i]))
- mark_reg_unknown(env, regs, i);
-
- bpf_for_each_spilled_reg(i, state, reg) {
- if (!reg)
- continue;
- if (reg_is_pkt_pointer_any(reg))
- __mark_reg_unknown(env, reg);
- }
-}
-
static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
{
- struct bpf_verifier_state *vstate = env->cur_state;
- int i;
+ struct bpf_func_state *state;
+ struct bpf_reg_state *reg;
- for (i = 0; i <= vstate->curframe; i++)
- __clear_all_pkt_pointers(env, vstate->frame[i]);
+ bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({
+ if (reg_is_pkt_pointer_any(reg) || reg_is_dynptr_slice_pkt(reg))
+ mark_reg_invalid(env, reg);
+ }));
}
enum {
@@ -5509,45 +8124,39 @@ static void mark_pkt_end(struct bpf_verifier_state *vstate, int regn, bool range
reg->range = AT_PKT_END;
}
-static void release_reg_references(struct bpf_verifier_env *env,
- struct bpf_func_state *state,
- int ref_obj_id)
-{
- struct bpf_reg_state *regs = state->regs, *reg;
- int i;
-
- for (i = 0; i < MAX_BPF_REG; i++)
- if (regs[i].ref_obj_id == ref_obj_id)
- mark_reg_unknown(env, regs, i);
-
- bpf_for_each_spilled_reg(i, state, reg) {
- if (!reg)
- continue;
- if (reg->ref_obj_id == ref_obj_id)
- __mark_reg_unknown(env, reg);
- }
-}
-
/* The pointer with the specified id has released its reference to kernel
* resources. Identify all copies of the same pointer and clear the reference.
*/
static int release_reference(struct bpf_verifier_env *env,
int ref_obj_id)
{
- struct bpf_verifier_state *vstate = env->cur_state;
+ struct bpf_func_state *state;
+ struct bpf_reg_state *reg;
int err;
- int i;
err = release_reference_state(cur_func(env), ref_obj_id);
if (err)
return err;
- for (i = 0; i <= vstate->curframe; i++)
- release_reg_references(env, vstate->frame[i], ref_obj_id);
+ bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({
+ if (reg->ref_obj_id == ref_obj_id)
+ mark_reg_invalid(env, reg);
+ }));
return 0;
}
+static void invalidate_non_owning_refs(struct bpf_verifier_env *env)
+{
+ struct bpf_func_state *unused;
+ struct bpf_reg_state *reg;
+
+ bpf_for_each_reg_in_vstate(env->cur_state, unused, reg, ({
+ if (type_is_non_owning_ref(reg->type))
+ mark_reg_invalid(env, reg);
+ }));
+}
+
static void clear_caller_saved_regs(struct bpf_verifier_env *env,
struct bpf_reg_state *regs)
{
@@ -5565,6 +8174,12 @@ typedef int (*set_callee_state_fn)(struct bpf_verifier_env *env,
struct bpf_func_state *callee,
int insn_idx);
+static int set_callee_state(struct bpf_verifier_env *env,
+ struct bpf_func_state *caller,
+ struct bpf_func_state *callee, int insn_idx);
+
+static bool is_callback_calling_kfunc(u32 btf_id);
+
static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
int *insn_idx, int subprog,
set_callee_state_fn set_callee_state_cb)
@@ -5591,7 +8206,7 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn
func_info_aux = env->prog->aux->func_info_aux;
if (func_info_aux)
is_global = func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL;
- err = btf_check_subprog_arg_match(env, subprog, caller->regs);
+ err = btf_check_subprog_call(env, subprog, caller->regs);
if (err == -EFAULT)
return err;
if (is_global) {
@@ -5615,6 +8230,50 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn
}
}
+ /* set_callee_state is used for direct subprog calls, but we are
+ * interested in validating only BPF helpers that can call subprogs as
+ * callbacks
+ */
+ if (set_callee_state_cb != set_callee_state) {
+ if (bpf_pseudo_kfunc_call(insn) &&
+ !is_callback_calling_kfunc(insn->imm)) {
+ verbose(env, "verifier bug: kfunc %s#%d not marked as callback-calling\n",
+ func_id_name(insn->imm), insn->imm);
+ return -EFAULT;
+ } else if (!bpf_pseudo_kfunc_call(insn) &&
+ !is_callback_calling_function(insn->imm)) { /* helper */
+ verbose(env, "verifier bug: helper %s#%d not marked as callback-calling\n",
+ func_id_name(insn->imm), insn->imm);
+ return -EFAULT;
+ }
+ }
+
+ if (insn->code == (BPF_JMP | BPF_CALL) &&
+ insn->src_reg == 0 &&
+ insn->imm == BPF_FUNC_timer_set_callback) {
+ struct bpf_verifier_state *async_cb;
+
+ /* there is no real recursion here. timer callbacks are async */
+ env->subprog_info[subprog].is_async_cb = true;
+ async_cb = push_async_cb(env, env->subprog_info[subprog].start,
+ *insn_idx, subprog);
+ if (!async_cb)
+ return -EFAULT;
+ callee = async_cb->frame[0];
+ callee->async_entry_cnt = caller->async_entry_cnt + 1;
+
+ /* Convert bpf_timer_set_callback() args into timer callback args */
+ err = set_callee_state_cb(env, caller, callee, *insn_idx);
+ if (err)
+ return err;
+
+ clear_caller_saved_regs(env, caller->regs);
+ mark_reg_unknown(env, caller->regs, BPF_REG_0);
+ caller->regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG;
+ /* continue with next insn after call */
+ return 0;
+ }
+
callee = kzalloc(sizeof(*callee), GFP_KERNEL);
if (!callee)
return -ENOMEM;
@@ -5633,11 +8292,11 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn
/* Transfer references to the callee */
err = copy_reference_state(callee, caller);
if (err)
- return err;
+ goto err_out;
err = set_callee_state_cb(env, caller, callee, *insn_idx);
if (err)
- return err;
+ goto err_out;
clear_caller_saved_regs(env, caller->regs);
@@ -5649,11 +8308,16 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn
if (env->log.level & BPF_LOG_LEVEL) {
verbose(env, "caller:\n");
- print_verifier_state(env, caller);
+ print_verifier_state(env, caller, true);
verbose(env, "callee:\n");
- print_verifier_state(env, callee);
+ print_verifier_state(env, callee, true);
}
return 0;
+
+err_out:
+ free_func_state(callee);
+ state->frame[state->curframe + 1] = NULL;
+ return err;
}
int map_set_for_each_callback_args(struct bpf_verifier_env *env,
@@ -5739,9 +8403,170 @@ static int set_map_elem_callback_state(struct bpf_verifier_env *env,
return err;
callee->in_callback_fn = true;
+ callee->callback_ret_range = tnum_range(0, 1);
return 0;
}
+static int set_loop_callback_state(struct bpf_verifier_env *env,
+ struct bpf_func_state *caller,
+ struct bpf_func_state *callee,
+ int insn_idx)
+{
+ /* bpf_loop(u32 nr_loops, void *callback_fn, void *callback_ctx,
+ * u64 flags);
+ * callback_fn(u32 index, void *callback_ctx);
+ */
+ callee->regs[BPF_REG_1].type = SCALAR_VALUE;
+ callee->regs[BPF_REG_2] = caller->regs[BPF_REG_3];
+
+ /* unused */
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_3]);
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
+
+ callee->in_callback_fn = true;
+ callee->callback_ret_range = tnum_range(0, 1);
+ return 0;
+}
+
+static int set_timer_callback_state(struct bpf_verifier_env *env,
+ struct bpf_func_state *caller,
+ struct bpf_func_state *callee,
+ int insn_idx)
+{
+ struct bpf_map *map_ptr = caller->regs[BPF_REG_1].map_ptr;
+
+ /* bpf_timer_set_callback(struct bpf_timer *timer, void *callback_fn);
+ * callback_fn(struct bpf_map *map, void *key, void *value);
+ */
+ callee->regs[BPF_REG_1].type = CONST_PTR_TO_MAP;
+ __mark_reg_known_zero(&callee->regs[BPF_REG_1]);
+ callee->regs[BPF_REG_1].map_ptr = map_ptr;
+
+ callee->regs[BPF_REG_2].type = PTR_TO_MAP_KEY;
+ __mark_reg_known_zero(&callee->regs[BPF_REG_2]);
+ callee->regs[BPF_REG_2].map_ptr = map_ptr;
+
+ callee->regs[BPF_REG_3].type = PTR_TO_MAP_VALUE;
+ __mark_reg_known_zero(&callee->regs[BPF_REG_3]);
+ callee->regs[BPF_REG_3].map_ptr = map_ptr;
+
+ /* unused */
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
+ callee->in_async_callback_fn = true;
+ callee->callback_ret_range = tnum_range(0, 1);
+ return 0;
+}
+
+static int set_find_vma_callback_state(struct bpf_verifier_env *env,
+ struct bpf_func_state *caller,
+ struct bpf_func_state *callee,
+ int insn_idx)
+{
+ /* bpf_find_vma(struct task_struct *task, u64 addr,
+ * void *callback_fn, void *callback_ctx, u64 flags)
+ * (callback_fn)(struct task_struct *task,
+ * struct vm_area_struct *vma, void *callback_ctx);
+ */
+ callee->regs[BPF_REG_1] = caller->regs[BPF_REG_1];
+
+ callee->regs[BPF_REG_2].type = PTR_TO_BTF_ID;
+ __mark_reg_known_zero(&callee->regs[BPF_REG_2]);
+ callee->regs[BPF_REG_2].btf = btf_vmlinux;
+ callee->regs[BPF_REG_2].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_VMA],
+
+ /* pointer to stack or null */
+ callee->regs[BPF_REG_3] = caller->regs[BPF_REG_4];
+
+ /* unused */
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
+ callee->in_callback_fn = true;
+ callee->callback_ret_range = tnum_range(0, 1);
+ return 0;
+}
+
+static int set_user_ringbuf_callback_state(struct bpf_verifier_env *env,
+ struct bpf_func_state *caller,
+ struct bpf_func_state *callee,
+ int insn_idx)
+{
+ /* bpf_user_ringbuf_drain(struct bpf_map *map, void *callback_fn, void
+ * callback_ctx, u64 flags);
+ * callback_fn(const struct bpf_dynptr_t* dynptr, void *callback_ctx);
+ */
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_0]);
+ mark_dynptr_cb_reg(env, &callee->regs[BPF_REG_1], BPF_DYNPTR_TYPE_LOCAL);
+ callee->regs[BPF_REG_2] = caller->regs[BPF_REG_3];
+
+ /* unused */
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_3]);
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
+
+ callee->in_callback_fn = true;
+ callee->callback_ret_range = tnum_range(0, 1);
+ return 0;
+}
+
+static int set_rbtree_add_callback_state(struct bpf_verifier_env *env,
+ struct bpf_func_state *caller,
+ struct bpf_func_state *callee,
+ int insn_idx)
+{
+ /* void bpf_rbtree_add_impl(struct bpf_rb_root *root, struct bpf_rb_node *node,
+ * bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b));
+ *
+ * 'struct bpf_rb_node *node' arg to bpf_rbtree_add_impl is the same PTR_TO_BTF_ID w/ offset
+ * that 'less' callback args will be receiving. However, 'node' arg was release_reference'd
+ * by this point, so look at 'root'
+ */
+ struct btf_field *field;
+
+ field = reg_find_field_offset(&caller->regs[BPF_REG_1], caller->regs[BPF_REG_1].off,
+ BPF_RB_ROOT);
+ if (!field || !field->graph_root.value_btf_id)
+ return -EFAULT;
+
+ mark_reg_graph_node(callee->regs, BPF_REG_1, &field->graph_root);
+ ref_set_non_owning(env, &callee->regs[BPF_REG_1]);
+ mark_reg_graph_node(callee->regs, BPF_REG_2, &field->graph_root);
+ ref_set_non_owning(env, &callee->regs[BPF_REG_2]);
+
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_3]);
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
+ callee->in_callback_fn = true;
+ callee->callback_ret_range = tnum_range(0, 1);
+ return 0;
+}
+
+static bool is_rbtree_lock_required_kfunc(u32 btf_id);
+
+/* Are we currently verifying the callback for a rbtree helper that must
+ * be called with lock held? If so, no need to complain about unreleased
+ * lock
+ */
+static bool in_rbtree_lock_required_cb(struct bpf_verifier_env *env)
+{
+ struct bpf_verifier_state *state = env->cur_state;
+ struct bpf_insn *insn = env->prog->insnsi;
+ struct bpf_func_state *callee;
+ int kfunc_btf_id;
+
+ if (!state->curframe)
+ return false;
+
+ callee = state->frame[state->curframe];
+
+ if (!callee->in_callback_fn)
+ return false;
+
+ kfunc_btf_id = insn[callee->callsite].imm;
+ return is_rbtree_lock_required_kfunc(kfunc_btf_id);
+}
+
static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
{
struct bpf_verifier_state *state = env->cur_state;
@@ -5762,11 +8587,10 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
return -EINVAL;
}
- state->curframe--;
- caller = state->frame[state->curframe];
+ caller = state->frame[state->curframe - 1];
if (callee->in_callback_fn) {
/* enforce R0 return value range [0, 1]. */
- struct tnum range = tnum_range(0, 1);
+ struct tnum range = callee->callback_ret_range;
if (r0->type != SCALAR_VALUE) {
verbose(env, "R0 not a scalar value\n");
@@ -5781,21 +8605,28 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
caller->regs[BPF_REG_0] = *r0;
}
- /* Transfer references to the caller */
- err = copy_reference_state(caller, callee);
- if (err)
- return err;
+ /* callback_fn frame should have released its own additions to parent's
+ * reference state at this point, or check_reference_leak would
+ * complain, hence it must be the same as the caller. There is no need
+ * to copy it back.
+ */
+ if (!callee->in_callback_fn) {
+ /* Transfer references to the caller */
+ err = copy_reference_state(caller, callee);
+ if (err)
+ return err;
+ }
*insn_idx = callee->callsite + 1;
if (env->log.level & BPF_LOG_LEVEL) {
verbose(env, "returning from callee:\n");
- print_verifier_state(env, callee);
+ print_verifier_state(env, callee, true);
verbose(env, "to caller at %d:\n", *insn_idx);
- print_verifier_state(env, caller);
+ print_verifier_state(env, caller, true);
}
/* clear everything in the callee */
free_func_state(callee);
- state->frame[state->curframe + 1] = NULL;
+ state->frame[state->curframe--] = NULL;
return 0;
}
@@ -5817,9 +8648,7 @@ static void do_refine_retval_range(struct bpf_reg_state *regs, int ret_type,
ret_reg->s32_max_value = meta->msize_max_value;
ret_reg->smin_value = -MAX_ERRNO;
ret_reg->s32_min_value = -MAX_ERRNO;
- __reg_deduce_bounds(ret_reg);
- __reg_bound_offset(ret_reg);
- __update_reg_bounds(ret_reg);
+ reg_bounds_sync(ret_reg);
}
static int
@@ -5837,7 +8666,8 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
func_id != BPF_FUNC_map_pop_elem &&
func_id != BPF_FUNC_map_peek_elem &&
func_id != BPF_FUNC_for_each_map_elem &&
- func_id != BPF_FUNC_redirect_map)
+ func_id != BPF_FUNC_redirect_map &&
+ func_id != BPF_FUNC_map_lookup_percpu_elem)
return 0;
if (map == NULL) {
@@ -5874,8 +8704,7 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx];
struct bpf_reg_state *regs = cur_regs(env), *reg;
struct bpf_map *map = meta->map_ptr;
- struct tnum range;
- u64 val;
+ u64 val, max;
int err;
if (func_id != BPF_FUNC_tail_call)
@@ -5885,10 +8714,11 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
return -EINVAL;
}
- range = tnum_range(0, map->max_entries - 1);
reg = &regs[BPF_REG_3];
+ val = reg->var_off.value;
+ max = map->max_entries;
- if (!register_is_const(reg) || !tnum_in(range, reg->var_off)) {
+ if (!(register_is_const(reg) && val < max)) {
bpf_map_key_store(aux, BPF_MAP_KEY_POISON);
return 0;
}
@@ -5896,8 +8726,6 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
err = mark_chain_precision(env, BPF_REG_3);
if (err)
return err;
-
- val = reg->var_off.value;
if (bpf_map_key_unseen(aux))
bpf_map_key_store(aux, val);
else if (!bpf_map_key_poisoned(aux) &&
@@ -5909,13 +8737,20 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
static int check_reference_leak(struct bpf_verifier_env *env)
{
struct bpf_func_state *state = cur_func(env);
+ bool refs_lingering = false;
int i;
+ if (state->frameno && !state->in_callback_fn)
+ return 0;
+
for (i = 0; i < state->acquired_refs; i++) {
+ if (state->in_callback_fn && state->refs[i].callback_ref != state->frameno)
+ continue;
verbose(env, "Unreleased reference id=%d alloc_insn=%d\n",
state->refs[i].id, state->refs[i].insn_idx);
+ refs_lingering = true;
}
- return state->acquired_refs ? -EINVAL : 0;
+ return refs_lingering ? -EINVAL : 0;
}
static int check_bpf_snprintf_call(struct bpf_verifier_env *env,
@@ -5924,6 +8759,7 @@ static int check_bpf_snprintf_call(struct bpf_verifier_env *env,
struct bpf_reg_state *fmt_reg = &regs[BPF_REG_3];
struct bpf_reg_state *data_len_reg = &regs[BPF_REG_5];
struct bpf_map *fmt_map = fmt_reg->map_ptr;
+ struct bpf_bprintf_data data = {};
int err, fmt_map_off, num_args;
u64 fmt_addr;
char *fmt;
@@ -5948,17 +8784,76 @@ static int check_bpf_snprintf_call(struct bpf_verifier_env *env,
/* We are also guaranteed that fmt+fmt_map_off is NULL terminated, we
* can focus on validating the format specifiers.
*/
- err = bpf_bprintf_prepare(fmt, UINT_MAX, NULL, NULL, num_args);
+ err = bpf_bprintf_prepare(fmt, UINT_MAX, NULL, num_args, &data);
if (err < 0)
verbose(env, "Invalid format string\n");
return err;
}
+static int check_get_func_ip(struct bpf_verifier_env *env)
+{
+ enum bpf_prog_type type = resolve_prog_type(env->prog);
+ int func_id = BPF_FUNC_get_func_ip;
+
+ if (type == BPF_PROG_TYPE_TRACING) {
+ if (!bpf_prog_has_trampoline(env->prog)) {
+ verbose(env, "func %s#%d supported only for fentry/fexit/fmod_ret programs\n",
+ func_id_name(func_id), func_id);
+ return -ENOTSUPP;
+ }
+ return 0;
+ } else if (type == BPF_PROG_TYPE_KPROBE) {
+ return 0;
+ }
+
+ verbose(env, "func %s#%d not supported for program type %d\n",
+ func_id_name(func_id), func_id, type);
+ return -ENOTSUPP;
+}
+
+static struct bpf_insn_aux_data *cur_aux(struct bpf_verifier_env *env)
+{
+ return &env->insn_aux_data[env->insn_idx];
+}
+
+static bool loop_flag_is_zero(struct bpf_verifier_env *env)
+{
+ struct bpf_reg_state *regs = cur_regs(env);
+ struct bpf_reg_state *reg = &regs[BPF_REG_4];
+ bool reg_is_null = register_is_null(reg);
+
+ if (reg_is_null)
+ mark_chain_precision(env, BPF_REG_4);
+
+ return reg_is_null;
+}
+
+static void update_loop_inline_state(struct bpf_verifier_env *env, u32 subprogno)
+{
+ struct bpf_loop_inline_state *state = &cur_aux(env)->loop_inline_state;
+
+ if (!state->initialized) {
+ state->initialized = 1;
+ state->fit_for_inline = loop_flag_is_zero(env);
+ state->callback_subprogno = subprogno;
+ return;
+ }
+
+ if (!state->fit_for_inline)
+ return;
+
+ state->fit_for_inline = (loop_flag_is_zero(env) &&
+ state->callback_subprogno == subprogno);
+}
+
static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
int *insn_idx_p)
{
+ enum bpf_prog_type prog_type = resolve_prog_type(env->prog);
const struct bpf_func_proto *fn = NULL;
+ enum bpf_return_type ret_type;
+ enum bpf_type_flag ret_flag;
struct bpf_reg_state *regs;
struct bpf_call_arg_meta meta;
int insn_idx = *insn_idx_p;
@@ -5992,6 +8887,11 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
return -EINVAL;
}
+ if (!env->prog->aux->sleepable && fn->might_sleep) {
+ verbose(env, "helper call might sleep in a non-sleepable prog\n");
+ return -EINVAL;
+ }
+
/* With LD_ABS/IND some JITs save/restore skb from r1. */
changes_data = bpf_helper_changes_pkt_data(fn->func);
if (changes_data && fn->arg1_type != ARG_PTR_TO_CTX) {
@@ -6010,10 +8910,21 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
return err;
}
+ if (env->cur_state->active_rcu_lock) {
+ if (fn->might_sleep) {
+ verbose(env, "sleepable helper %s#%d in rcu_read_lock region\n",
+ func_id_name(func_id), func_id);
+ return -EINVAL;
+ }
+
+ if (env->prog->aux->sleepable && is_storage_get_function(func_id))
+ env->insn_aux_data[insn_idx].storage_get_func_atomic = true;
+ }
+
meta.func_id = func_id;
/* check args */
for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) {
- err = check_func_arg(env, i, &meta, fn);
+ err = check_func_arg(env, i, &meta, fn, insn_idx);
if (err)
return err;
}
@@ -6036,14 +8947,28 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
return err;
}
- if (func_id == BPF_FUNC_tail_call) {
- err = check_reference_leak(env);
- if (err) {
- verbose(env, "tail_call would lead to reference leak\n");
- return err;
+ regs = cur_regs(env);
+
+ if (meta.release_regno) {
+ err = -EINVAL;
+ /* This can only be set for PTR_TO_STACK, as CONST_PTR_TO_DYNPTR cannot
+ * be released by any dynptr helper. Hence, unmark_stack_slots_dynptr
+ * is safe to do directly.
+ */
+ if (arg_type_is_dynptr(fn->arg_type[meta.release_regno - BPF_REG_1])) {
+ if (regs[meta.release_regno].type == CONST_PTR_TO_DYNPTR) {
+ verbose(env, "verifier internal error: CONST_PTR_TO_DYNPTR cannot be released\n");
+ return -EFAULT;
+ }
+ err = unmark_stack_slots_dynptr(env, &regs[meta.release_regno]);
+ } else if (meta.ref_obj_id) {
+ err = release_reference(env, meta.ref_obj_id);
+ } else if (register_is_null(&regs[meta.release_regno])) {
+ /* meta.ref_obj_id can only be 0 if register that is meant to be
+ * released is NULL, which must be > R0.
+ */
+ err = 0;
}
- } else if (is_release_function(func_id)) {
- err = release_reference(env, meta.ref_obj_id);
if (err) {
verbose(env, "func %s#%d reference has not been acquired before\n",
func_id_name(func_id), func_id);
@@ -6051,29 +8976,127 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
}
}
- regs = cur_regs(env);
+ switch (func_id) {
+ case BPF_FUNC_tail_call:
+ err = check_reference_leak(env);
+ if (err) {
+ verbose(env, "tail_call would lead to reference leak\n");
+ return err;
+ }
+ break;
+ case BPF_FUNC_get_local_storage:
+ /* check that flags argument in get_local_storage(map, flags) is 0,
+ * this is required because get_local_storage() can't return an error.
+ */
+ if (!register_is_null(&regs[BPF_REG_2])) {
+ verbose(env, "get_local_storage() doesn't support non-zero flags\n");
+ return -EINVAL;
+ }
+ break;
+ case BPF_FUNC_for_each_map_elem:
+ err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
+ set_map_elem_callback_state);
+ break;
+ case BPF_FUNC_timer_set_callback:
+ err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
+ set_timer_callback_state);
+ break;
+ case BPF_FUNC_find_vma:
+ err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
+ set_find_vma_callback_state);
+ break;
+ case BPF_FUNC_snprintf:
+ err = check_bpf_snprintf_call(env, regs);
+ break;
+ case BPF_FUNC_loop:
+ update_loop_inline_state(env, meta.subprogno);
+ err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
+ set_loop_callback_state);
+ break;
+ case BPF_FUNC_dynptr_from_mem:
+ if (regs[BPF_REG_1].type != PTR_TO_MAP_VALUE) {
+ verbose(env, "Unsupported reg type %s for bpf_dynptr_from_mem data\n",
+ reg_type_str(env, regs[BPF_REG_1].type));
+ return -EACCES;
+ }
+ break;
+ case BPF_FUNC_set_retval:
+ if (prog_type == BPF_PROG_TYPE_LSM &&
+ env->prog->expected_attach_type == BPF_LSM_CGROUP) {
+ if (!env->prog->aux->attach_func_proto->type) {
+ /* Make sure programs that attach to void
+ * hooks don't try to modify return value.
+ */
+ verbose(env, "BPF_LSM_CGROUP that attach to void LSM hooks can't modify return value!\n");
+ return -EINVAL;
+ }
+ }
+ break;
+ case BPF_FUNC_dynptr_data:
+ {
+ struct bpf_reg_state *reg;
+ int id, ref_obj_id;
- /* check that flags argument in get_local_storage(map, flags) is 0,
- * this is required because get_local_storage() can't return an error.
- */
- if (func_id == BPF_FUNC_get_local_storage &&
- !register_is_null(&regs[BPF_REG_2])) {
- verbose(env, "get_local_storage() doesn't support non-zero flags\n");
- return -EINVAL;
+ reg = get_dynptr_arg_reg(env, fn, regs);
+ if (!reg)
+ return -EFAULT;
+
+
+ if (meta.dynptr_id) {
+ verbose(env, "verifier internal error: meta.dynptr_id already set\n");
+ return -EFAULT;
+ }
+ if (meta.ref_obj_id) {
+ verbose(env, "verifier internal error: meta.ref_obj_id already set\n");
+ return -EFAULT;
+ }
+
+ id = dynptr_id(env, reg);
+ if (id < 0) {
+ verbose(env, "verifier internal error: failed to obtain dynptr id\n");
+ return id;
+ }
+
+ ref_obj_id = dynptr_ref_obj_id(env, reg);
+ if (ref_obj_id < 0) {
+ verbose(env, "verifier internal error: failed to obtain dynptr ref_obj_id\n");
+ return ref_obj_id;
+ }
+
+ meta.dynptr_id = id;
+ meta.ref_obj_id = ref_obj_id;
+
+ break;
}
+ case BPF_FUNC_dynptr_write:
+ {
+ enum bpf_dynptr_type dynptr_type;
+ struct bpf_reg_state *reg;
+
+ reg = get_dynptr_arg_reg(env, fn, regs);
+ if (!reg)
+ return -EFAULT;
- if (func_id == BPF_FUNC_for_each_map_elem) {
+ dynptr_type = dynptr_get_type(env, reg);
+ if (dynptr_type == BPF_DYNPTR_TYPE_INVALID)
+ return -EFAULT;
+
+ if (dynptr_type == BPF_DYNPTR_TYPE_SKB)
+ /* this will trigger clear_all_pkt_pointers(), which will
+ * invalidate all dynptr slices associated with the skb
+ */
+ changes_data = true;
+
+ break;
+ }
+ case BPF_FUNC_user_ringbuf_drain:
err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
- set_map_elem_callback_state);
- if (err < 0)
- return -EINVAL;
+ set_user_ringbuf_callback_state);
+ break;
}
- if (func_id == BPF_FUNC_snprintf) {
- err = check_bpf_snprintf_call(env, regs);
- if (err < 0)
- return err;
- }
+ if (err)
+ return err;
/* reset caller saved regs */
for (i = 0; i < CALLER_SAVED_REGS; i++) {
@@ -6085,13 +9108,18 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG;
/* update return register (already marked as written above) */
- if (fn->ret_type == RET_INTEGER) {
+ ret_type = fn->ret_type;
+ ret_flag = type_flag(ret_type);
+
+ switch (base_type(ret_type)) {
+ case RET_INTEGER:
/* sets type to SCALAR_VALUE */
mark_reg_unknown(env, regs, BPF_REG_0);
- } else if (fn->ret_type == RET_VOID) {
+ break;
+ case RET_VOID:
regs[BPF_REG_0].type = NOT_INIT;
- } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL ||
- fn->ret_type == RET_PTR_TO_MAP_VALUE) {
+ break;
+ case RET_PTR_TO_MAP_VALUE:
/* There is no offset yet applied, variable or fixed */
mark_reg_known_zero(env, regs, BPF_REG_0);
/* remember map_ptr, so that check_map_access()
@@ -6104,28 +9132,32 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
return -EINVAL;
}
regs[BPF_REG_0].map_ptr = meta.map_ptr;
- if (fn->ret_type == RET_PTR_TO_MAP_VALUE) {
- regs[BPF_REG_0].type = PTR_TO_MAP_VALUE;
- if (map_value_has_spin_lock(meta.map_ptr))
- regs[BPF_REG_0].id = ++env->id_gen;
- } else {
- regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL;
+ regs[BPF_REG_0].map_uid = meta.map_uid;
+ regs[BPF_REG_0].type = PTR_TO_MAP_VALUE | ret_flag;
+ if (!type_may_be_null(ret_type) &&
+ btf_record_has_field(meta.map_ptr->record, BPF_SPIN_LOCK)) {
+ regs[BPF_REG_0].id = ++env->id_gen;
}
- } else if (fn->ret_type == RET_PTR_TO_SOCKET_OR_NULL) {
+ break;
+ case RET_PTR_TO_SOCKET:
mark_reg_known_zero(env, regs, BPF_REG_0);
- regs[BPF_REG_0].type = PTR_TO_SOCKET_OR_NULL;
- } else if (fn->ret_type == RET_PTR_TO_SOCK_COMMON_OR_NULL) {
+ regs[BPF_REG_0].type = PTR_TO_SOCKET | ret_flag;
+ break;
+ case RET_PTR_TO_SOCK_COMMON:
mark_reg_known_zero(env, regs, BPF_REG_0);
- regs[BPF_REG_0].type = PTR_TO_SOCK_COMMON_OR_NULL;
- } else if (fn->ret_type == RET_PTR_TO_TCP_SOCK_OR_NULL) {
+ regs[BPF_REG_0].type = PTR_TO_SOCK_COMMON | ret_flag;
+ break;
+ case RET_PTR_TO_TCP_SOCK:
mark_reg_known_zero(env, regs, BPF_REG_0);
- regs[BPF_REG_0].type = PTR_TO_TCP_SOCK_OR_NULL;
- } else if (fn->ret_type == RET_PTR_TO_ALLOC_MEM_OR_NULL) {
+ regs[BPF_REG_0].type = PTR_TO_TCP_SOCK | ret_flag;
+ break;
+ case RET_PTR_TO_MEM:
mark_reg_known_zero(env, regs, BPF_REG_0);
- regs[BPF_REG_0].type = PTR_TO_MEM_OR_NULL;
+ regs[BPF_REG_0].type = PTR_TO_MEM | ret_flag;
regs[BPF_REG_0].mem_size = meta.mem_size;
- } else if (fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL ||
- fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID) {
+ break;
+ case RET_PTR_TO_MEM_OR_BTF_ID:
+ {
const struct btf_type *t;
mark_reg_known_zero(env, regs, BPF_REG_0);
@@ -6143,46 +9175,73 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
tname, PTR_ERR(ret));
return -EINVAL;
}
- regs[BPF_REG_0].type =
- fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID ?
- PTR_TO_MEM : PTR_TO_MEM_OR_NULL;
+ regs[BPF_REG_0].type = PTR_TO_MEM | ret_flag;
regs[BPF_REG_0].mem_size = tsize;
} else {
- regs[BPF_REG_0].type =
- fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID ?
- PTR_TO_BTF_ID : PTR_TO_BTF_ID_OR_NULL;
+ /* MEM_RDONLY may be carried from ret_flag, but it
+ * doesn't apply on PTR_TO_BTF_ID. Fold it, otherwise
+ * it will confuse the check of PTR_TO_BTF_ID in
+ * check_mem_access().
+ */
+ ret_flag &= ~MEM_RDONLY;
+
+ regs[BPF_REG_0].type = PTR_TO_BTF_ID | ret_flag;
regs[BPF_REG_0].btf = meta.ret_btf;
regs[BPF_REG_0].btf_id = meta.ret_btf_id;
}
- } else if (fn->ret_type == RET_PTR_TO_BTF_ID_OR_NULL ||
- fn->ret_type == RET_PTR_TO_BTF_ID) {
+ break;
+ }
+ case RET_PTR_TO_BTF_ID:
+ {
+ struct btf *ret_btf;
int ret_btf_id;
mark_reg_known_zero(env, regs, BPF_REG_0);
- regs[BPF_REG_0].type = fn->ret_type == RET_PTR_TO_BTF_ID ?
- PTR_TO_BTF_ID :
- PTR_TO_BTF_ID_OR_NULL;
- ret_btf_id = *fn->ret_btf_id;
+ regs[BPF_REG_0].type = PTR_TO_BTF_ID | ret_flag;
+ if (func_id == BPF_FUNC_kptr_xchg) {
+ ret_btf = meta.kptr_field->kptr.btf;
+ ret_btf_id = meta.kptr_field->kptr.btf_id;
+ if (!btf_is_kernel(ret_btf))
+ regs[BPF_REG_0].type |= MEM_ALLOC;
+ } else {
+ if (fn->ret_btf_id == BPF_PTR_POISON) {
+ verbose(env, "verifier internal error:");
+ verbose(env, "func %s has non-overwritten BPF_PTR_POISON return type\n",
+ func_id_name(func_id));
+ return -EINVAL;
+ }
+ ret_btf = btf_vmlinux;
+ ret_btf_id = *fn->ret_btf_id;
+ }
if (ret_btf_id == 0) {
- verbose(env, "invalid return type %d of func %s#%d\n",
- fn->ret_type, func_id_name(func_id), func_id);
+ verbose(env, "invalid return type %u of func %s#%d\n",
+ base_type(ret_type), func_id_name(func_id),
+ func_id);
return -EINVAL;
}
- /* current BPF helper definitions are only coming from
- * built-in code with type IDs from vmlinux BTF
- */
- regs[BPF_REG_0].btf = btf_vmlinux;
+ regs[BPF_REG_0].btf = ret_btf;
regs[BPF_REG_0].btf_id = ret_btf_id;
- } else {
- verbose(env, "unknown return type %d of func %s#%d\n",
- fn->ret_type, func_id_name(func_id), func_id);
+ break;
+ }
+ default:
+ verbose(env, "unknown return type %u of func %s#%d\n",
+ base_type(ret_type), func_id_name(func_id), func_id);
return -EINVAL;
}
- if (reg_type_may_be_null(regs[BPF_REG_0].type))
+ if (type_may_be_null(regs[BPF_REG_0].type))
regs[BPF_REG_0].id = ++env->id_gen;
- if (is_ptr_cast_function(func_id)) {
+ if (helper_multiple_ref_obj_use(func_id, meta.map_ptr)) {
+ verbose(env, "verifier internal error: func %s#%d sets ref_obj_id more than once\n",
+ func_id_name(func_id), func_id);
+ return -EFAULT;
+ }
+
+ if (is_dynptr_ref_function(func_id))
+ regs[BPF_REG_0].dynptr_id = meta.dynptr_id;
+
+ if (is_ptr_cast_function(func_id) || is_dynptr_ref_function(func_id)) {
/* For release_reference() */
regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id;
} else if (is_acquire_function(func_id, meta.map_ptr)) {
@@ -6225,6 +9284,12 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
if (func_id == BPF_FUNC_get_stackid || func_id == BPF_FUNC_get_stack)
env->prog->call_get_stack = true;
+ if (func_id == BPF_FUNC_get_func_ip) {
+ if (check_get_func_ip(env))
+ return -ENOTSUPP;
+ env->prog->call_get_func_ip = true;
+ }
+
if (changes_data)
clear_all_pkt_pointers(env);
return 0;
@@ -6254,64 +9319,1580 @@ static void mark_btf_func_reg_size(struct bpf_verifier_env *env, u32 regno,
}
}
-static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn)
+static bool is_kfunc_acquire(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->kfunc_flags & KF_ACQUIRE;
+}
+
+static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->kfunc_flags & KF_RET_NULL;
+}
+
+static bool is_kfunc_release(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->kfunc_flags & KF_RELEASE;
+}
+
+static bool is_kfunc_trusted_args(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return (meta->kfunc_flags & KF_TRUSTED_ARGS) || is_kfunc_release(meta);
+}
+
+static bool is_kfunc_sleepable(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->kfunc_flags & KF_SLEEPABLE;
+}
+
+static bool is_kfunc_destructive(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->kfunc_flags & KF_DESTRUCTIVE;
+}
+
+static bool is_kfunc_rcu(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->kfunc_flags & KF_RCU;
+}
+
+static bool __kfunc_param_match_suffix(const struct btf *btf,
+ const struct btf_param *arg,
+ const char *suffix)
+{
+ int suffix_len = strlen(suffix), len;
+ const char *param_name;
+
+ /* In the future, this can be ported to use BTF tagging */
+ param_name = btf_name_by_offset(btf, arg->name_off);
+ if (str_is_empty(param_name))
+ return false;
+ len = strlen(param_name);
+ if (len < suffix_len)
+ return false;
+ param_name += len - suffix_len;
+ return !strncmp(param_name, suffix, suffix_len);
+}
+
+static bool is_kfunc_arg_mem_size(const struct btf *btf,
+ const struct btf_param *arg,
+ const struct bpf_reg_state *reg)
+{
+ const struct btf_type *t;
+
+ t = btf_type_skip_modifiers(btf, arg->type, NULL);
+ if (!btf_type_is_scalar(t) || reg->type != SCALAR_VALUE)
+ return false;
+
+ return __kfunc_param_match_suffix(btf, arg, "__sz");
+}
+
+static bool is_kfunc_arg_const_mem_size(const struct btf *btf,
+ const struct btf_param *arg,
+ const struct bpf_reg_state *reg)
{
- const struct btf_type *t, *func, *func_proto, *ptr_type;
+ const struct btf_type *t;
+
+ t = btf_type_skip_modifiers(btf, arg->type, NULL);
+ if (!btf_type_is_scalar(t) || reg->type != SCALAR_VALUE)
+ return false;
+
+ return __kfunc_param_match_suffix(btf, arg, "__szk");
+}
+
+static bool is_kfunc_arg_constant(const struct btf *btf, const struct btf_param *arg)
+{
+ return __kfunc_param_match_suffix(btf, arg, "__k");
+}
+
+static bool is_kfunc_arg_ignore(const struct btf *btf, const struct btf_param *arg)
+{
+ return __kfunc_param_match_suffix(btf, arg, "__ign");
+}
+
+static bool is_kfunc_arg_alloc_obj(const struct btf *btf, const struct btf_param *arg)
+{
+ return __kfunc_param_match_suffix(btf, arg, "__alloc");
+}
+
+static bool is_kfunc_arg_uninit(const struct btf *btf, const struct btf_param *arg)
+{
+ return __kfunc_param_match_suffix(btf, arg, "__uninit");
+}
+
+static bool is_kfunc_arg_refcounted_kptr(const struct btf *btf, const struct btf_param *arg)
+{
+ return __kfunc_param_match_suffix(btf, arg, "__refcounted_kptr");
+}
+
+static bool is_kfunc_arg_scalar_with_name(const struct btf *btf,
+ const struct btf_param *arg,
+ const char *name)
+{
+ int len, target_len = strlen(name);
+ const char *param_name;
+
+ param_name = btf_name_by_offset(btf, arg->name_off);
+ if (str_is_empty(param_name))
+ return false;
+ len = strlen(param_name);
+ if (len != target_len)
+ return false;
+ if (strcmp(param_name, name))
+ return false;
+
+ return true;
+}
+
+enum {
+ KF_ARG_DYNPTR_ID,
+ KF_ARG_LIST_HEAD_ID,
+ KF_ARG_LIST_NODE_ID,
+ KF_ARG_RB_ROOT_ID,
+ KF_ARG_RB_NODE_ID,
+};
+
+BTF_ID_LIST(kf_arg_btf_ids)
+BTF_ID(struct, bpf_dynptr_kern)
+BTF_ID(struct, bpf_list_head)
+BTF_ID(struct, bpf_list_node)
+BTF_ID(struct, bpf_rb_root)
+BTF_ID(struct, bpf_rb_node)
+
+static bool __is_kfunc_ptr_arg_type(const struct btf *btf,
+ const struct btf_param *arg, int type)
+{
+ const struct btf_type *t;
+ u32 res_id;
+
+ t = btf_type_skip_modifiers(btf, arg->type, NULL);
+ if (!t)
+ return false;
+ if (!btf_type_is_ptr(t))
+ return false;
+ t = btf_type_skip_modifiers(btf, t->type, &res_id);
+ if (!t)
+ return false;
+ return btf_types_are_same(btf, res_id, btf_vmlinux, kf_arg_btf_ids[type]);
+}
+
+static bool is_kfunc_arg_dynptr(const struct btf *btf, const struct btf_param *arg)
+{
+ return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_DYNPTR_ID);
+}
+
+static bool is_kfunc_arg_list_head(const struct btf *btf, const struct btf_param *arg)
+{
+ return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_LIST_HEAD_ID);
+}
+
+static bool is_kfunc_arg_list_node(const struct btf *btf, const struct btf_param *arg)
+{
+ return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_LIST_NODE_ID);
+}
+
+static bool is_kfunc_arg_rbtree_root(const struct btf *btf, const struct btf_param *arg)
+{
+ return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_RB_ROOT_ID);
+}
+
+static bool is_kfunc_arg_rbtree_node(const struct btf *btf, const struct btf_param *arg)
+{
+ return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_RB_NODE_ID);
+}
+
+static bool is_kfunc_arg_callback(struct bpf_verifier_env *env, const struct btf *btf,
+ const struct btf_param *arg)
+{
+ const struct btf_type *t;
+
+ t = btf_type_resolve_func_ptr(btf, arg->type, NULL);
+ if (!t)
+ return false;
+
+ return true;
+}
+
+/* Returns true if struct is composed of scalars, 4 levels of nesting allowed */
+static bool __btf_type_is_scalar_struct(struct bpf_verifier_env *env,
+ const struct btf *btf,
+ const struct btf_type *t, int rec)
+{
+ const struct btf_type *member_type;
+ const struct btf_member *member;
+ u32 i;
+
+ if (!btf_type_is_struct(t))
+ return false;
+
+ for_each_member(i, t, member) {
+ const struct btf_array *array;
+
+ member_type = btf_type_skip_modifiers(btf, member->type, NULL);
+ if (btf_type_is_struct(member_type)) {
+ if (rec >= 3) {
+ verbose(env, "max struct nesting depth exceeded\n");
+ return false;
+ }
+ if (!__btf_type_is_scalar_struct(env, btf, member_type, rec + 1))
+ return false;
+ continue;
+ }
+ if (btf_type_is_array(member_type)) {
+ array = btf_array(member_type);
+ if (!array->nelems)
+ return false;
+ member_type = btf_type_skip_modifiers(btf, array->type, NULL);
+ if (!btf_type_is_scalar(member_type))
+ return false;
+ continue;
+ }
+ if (!btf_type_is_scalar(member_type))
+ return false;
+ }
+ return true;
+}
+
+
+static u32 *reg2btf_ids[__BPF_REG_TYPE_MAX] = {
+#ifdef CONFIG_NET
+ [PTR_TO_SOCKET] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK],
+ [PTR_TO_SOCK_COMMON] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON],
+ [PTR_TO_TCP_SOCK] = &btf_sock_ids[BTF_SOCK_TYPE_TCP],
+#endif
+};
+
+enum kfunc_ptr_arg_type {
+ KF_ARG_PTR_TO_CTX,
+ KF_ARG_PTR_TO_ALLOC_BTF_ID, /* Allocated object */
+ KF_ARG_PTR_TO_REFCOUNTED_KPTR, /* Refcounted local kptr */
+ KF_ARG_PTR_TO_DYNPTR,
+ KF_ARG_PTR_TO_ITER,
+ KF_ARG_PTR_TO_LIST_HEAD,
+ KF_ARG_PTR_TO_LIST_NODE,
+ KF_ARG_PTR_TO_BTF_ID, /* Also covers reg2btf_ids conversions */
+ KF_ARG_PTR_TO_MEM,
+ KF_ARG_PTR_TO_MEM_SIZE, /* Size derived from next argument, skip it */
+ KF_ARG_PTR_TO_CALLBACK,
+ KF_ARG_PTR_TO_RB_ROOT,
+ KF_ARG_PTR_TO_RB_NODE,
+};
+
+enum special_kfunc_type {
+ KF_bpf_obj_new_impl,
+ KF_bpf_obj_drop_impl,
+ KF_bpf_refcount_acquire_impl,
+ KF_bpf_list_push_front_impl,
+ KF_bpf_list_push_back_impl,
+ KF_bpf_list_pop_front,
+ KF_bpf_list_pop_back,
+ KF_bpf_cast_to_kern_ctx,
+ KF_bpf_rdonly_cast,
+ KF_bpf_rcu_read_lock,
+ KF_bpf_rcu_read_unlock,
+ KF_bpf_rbtree_remove,
+ KF_bpf_rbtree_add_impl,
+ KF_bpf_rbtree_first,
+ KF_bpf_dynptr_from_skb,
+ KF_bpf_dynptr_from_xdp,
+ KF_bpf_dynptr_slice,
+ KF_bpf_dynptr_slice_rdwr,
+};
+
+BTF_SET_START(special_kfunc_set)
+BTF_ID(func, bpf_obj_new_impl)
+BTF_ID(func, bpf_obj_drop_impl)
+BTF_ID(func, bpf_refcount_acquire_impl)
+BTF_ID(func, bpf_list_push_front_impl)
+BTF_ID(func, bpf_list_push_back_impl)
+BTF_ID(func, bpf_list_pop_front)
+BTF_ID(func, bpf_list_pop_back)
+BTF_ID(func, bpf_cast_to_kern_ctx)
+BTF_ID(func, bpf_rdonly_cast)
+BTF_ID(func, bpf_rbtree_remove)
+BTF_ID(func, bpf_rbtree_add_impl)
+BTF_ID(func, bpf_rbtree_first)
+BTF_ID(func, bpf_dynptr_from_skb)
+BTF_ID(func, bpf_dynptr_from_xdp)
+BTF_ID(func, bpf_dynptr_slice)
+BTF_ID(func, bpf_dynptr_slice_rdwr)
+BTF_SET_END(special_kfunc_set)
+
+BTF_ID_LIST(special_kfunc_list)
+BTF_ID(func, bpf_obj_new_impl)
+BTF_ID(func, bpf_obj_drop_impl)
+BTF_ID(func, bpf_refcount_acquire_impl)
+BTF_ID(func, bpf_list_push_front_impl)
+BTF_ID(func, bpf_list_push_back_impl)
+BTF_ID(func, bpf_list_pop_front)
+BTF_ID(func, bpf_list_pop_back)
+BTF_ID(func, bpf_cast_to_kern_ctx)
+BTF_ID(func, bpf_rdonly_cast)
+BTF_ID(func, bpf_rcu_read_lock)
+BTF_ID(func, bpf_rcu_read_unlock)
+BTF_ID(func, bpf_rbtree_remove)
+BTF_ID(func, bpf_rbtree_add_impl)
+BTF_ID(func, bpf_rbtree_first)
+BTF_ID(func, bpf_dynptr_from_skb)
+BTF_ID(func, bpf_dynptr_from_xdp)
+BTF_ID(func, bpf_dynptr_slice)
+BTF_ID(func, bpf_dynptr_slice_rdwr)
+
+static bool is_kfunc_bpf_rcu_read_lock(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->func_id == special_kfunc_list[KF_bpf_rcu_read_lock];
+}
+
+static bool is_kfunc_bpf_rcu_read_unlock(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->func_id == special_kfunc_list[KF_bpf_rcu_read_unlock];
+}
+
+static enum kfunc_ptr_arg_type
+get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,
+ struct bpf_kfunc_call_arg_meta *meta,
+ const struct btf_type *t, const struct btf_type *ref_t,
+ const char *ref_tname, const struct btf_param *args,
+ int argno, int nargs)
+{
+ u32 regno = argno + 1;
struct bpf_reg_state *regs = cur_regs(env);
- const char *func_name, *ptr_type_name;
- u32 i, nargs, func_id, ptr_type_id;
+ struct bpf_reg_state *reg = &regs[regno];
+ bool arg_mem_size = false;
+
+ if (meta->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx])
+ return KF_ARG_PTR_TO_CTX;
+
+ /* In this function, we verify the kfunc's BTF as per the argument type,
+ * leaving the rest of the verification with respect to the register
+ * type to our caller. When a set of conditions hold in the BTF type of
+ * arguments, we resolve it to a known kfunc_ptr_arg_type.
+ */
+ if (btf_get_prog_ctx_type(&env->log, meta->btf, t, resolve_prog_type(env->prog), argno))
+ return KF_ARG_PTR_TO_CTX;
+
+ if (is_kfunc_arg_alloc_obj(meta->btf, &args[argno]))
+ return KF_ARG_PTR_TO_ALLOC_BTF_ID;
+
+ if (is_kfunc_arg_refcounted_kptr(meta->btf, &args[argno]))
+ return KF_ARG_PTR_TO_REFCOUNTED_KPTR;
+
+ if (is_kfunc_arg_dynptr(meta->btf, &args[argno]))
+ return KF_ARG_PTR_TO_DYNPTR;
+
+ if (is_kfunc_arg_iter(meta, argno))
+ return KF_ARG_PTR_TO_ITER;
+
+ if (is_kfunc_arg_list_head(meta->btf, &args[argno]))
+ return KF_ARG_PTR_TO_LIST_HEAD;
+
+ if (is_kfunc_arg_list_node(meta->btf, &args[argno]))
+ return KF_ARG_PTR_TO_LIST_NODE;
+
+ if (is_kfunc_arg_rbtree_root(meta->btf, &args[argno]))
+ return KF_ARG_PTR_TO_RB_ROOT;
+
+ if (is_kfunc_arg_rbtree_node(meta->btf, &args[argno]))
+ return KF_ARG_PTR_TO_RB_NODE;
+
+ if ((base_type(reg->type) == PTR_TO_BTF_ID || reg2btf_ids[base_type(reg->type)])) {
+ if (!btf_type_is_struct(ref_t)) {
+ verbose(env, "kernel function %s args#%d pointer type %s %s is not supported\n",
+ meta->func_name, argno, btf_type_str(ref_t), ref_tname);
+ return -EINVAL;
+ }
+ return KF_ARG_PTR_TO_BTF_ID;
+ }
+
+ if (is_kfunc_arg_callback(env, meta->btf, &args[argno]))
+ return KF_ARG_PTR_TO_CALLBACK;
+
+
+ if (argno + 1 < nargs &&
+ (is_kfunc_arg_mem_size(meta->btf, &args[argno + 1], &regs[regno + 1]) ||
+ is_kfunc_arg_const_mem_size(meta->btf, &args[argno + 1], &regs[regno + 1])))
+ arg_mem_size = true;
+
+ /* This is the catch all argument type of register types supported by
+ * check_helper_mem_access. However, we only allow when argument type is
+ * pointer to scalar, or struct composed (recursively) of scalars. When
+ * arg_mem_size is true, the pointer can be void *.
+ */
+ if (!btf_type_is_scalar(ref_t) && !__btf_type_is_scalar_struct(env, meta->btf, ref_t, 0) &&
+ (arg_mem_size ? !btf_type_is_void(ref_t) : 1)) {
+ verbose(env, "arg#%d pointer type %s %s must point to %sscalar, or struct with scalar\n",
+ argno, btf_type_str(ref_t), ref_tname, arg_mem_size ? "void, " : "");
+ return -EINVAL;
+ }
+ return arg_mem_size ? KF_ARG_PTR_TO_MEM_SIZE : KF_ARG_PTR_TO_MEM;
+}
+
+static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg,
+ const struct btf_type *ref_t,
+ const char *ref_tname, u32 ref_id,
+ struct bpf_kfunc_call_arg_meta *meta,
+ int argno)
+{
+ const struct btf_type *reg_ref_t;
+ bool strict_type_match = false;
+ const struct btf *reg_btf;
+ const char *reg_ref_tname;
+ u32 reg_ref_id;
+
+ if (base_type(reg->type) == PTR_TO_BTF_ID) {
+ reg_btf = reg->btf;
+ reg_ref_id = reg->btf_id;
+ } else {
+ reg_btf = btf_vmlinux;
+ reg_ref_id = *reg2btf_ids[base_type(reg->type)];
+ }
+
+ /* Enforce strict type matching for calls to kfuncs that are acquiring
+ * or releasing a reference, or are no-cast aliases. We do _not_
+ * enforce strict matching for plain KF_TRUSTED_ARGS kfuncs by default,
+ * as we want to enable BPF programs to pass types that are bitwise
+ * equivalent without forcing them to explicitly cast with something
+ * like bpf_cast_to_kern_ctx().
+ *
+ * For example, say we had a type like the following:
+ *
+ * struct bpf_cpumask {
+ * cpumask_t cpumask;
+ * refcount_t usage;
+ * };
+ *
+ * Note that as specified in <linux/cpumask.h>, cpumask_t is typedef'ed
+ * to a struct cpumask, so it would be safe to pass a struct
+ * bpf_cpumask * to a kfunc expecting a struct cpumask *.
+ *
+ * The philosophy here is similar to how we allow scalars of different
+ * types to be passed to kfuncs as long as the size is the same. The
+ * only difference here is that we're simply allowing
+ * btf_struct_ids_match() to walk the struct at the 0th offset, and
+ * resolve types.
+ */
+ if (is_kfunc_acquire(meta) ||
+ (is_kfunc_release(meta) && reg->ref_obj_id) ||
+ btf_type_ids_nocast_alias(&env->log, reg_btf, reg_ref_id, meta->btf, ref_id))
+ strict_type_match = true;
+
+ WARN_ON_ONCE(is_kfunc_trusted_args(meta) && reg->off);
+
+ reg_ref_t = btf_type_skip_modifiers(reg_btf, reg_ref_id, &reg_ref_id);
+ reg_ref_tname = btf_name_by_offset(reg_btf, reg_ref_t->name_off);
+ if (!btf_struct_ids_match(&env->log, reg_btf, reg_ref_id, reg->off, meta->btf, ref_id, strict_type_match)) {
+ verbose(env, "kernel function %s args#%d expected pointer to %s %s but R%d has a pointer to %s %s\n",
+ meta->func_name, argno, btf_type_str(ref_t), ref_tname, argno + 1,
+ btf_type_str(reg_ref_t), reg_ref_tname);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+ struct bpf_verifier_state *state = env->cur_state;
+
+ if (!state->active_lock.ptr) {
+ verbose(env, "verifier internal error: ref_set_non_owning w/o active lock\n");
+ return -EFAULT;
+ }
+
+ if (type_flag(reg->type) & NON_OWN_REF) {
+ verbose(env, "verifier internal error: NON_OWN_REF already set\n");
+ return -EFAULT;
+ }
+
+ reg->type |= NON_OWN_REF;
+ return 0;
+}
+
+static int ref_convert_owning_non_owning(struct bpf_verifier_env *env, u32 ref_obj_id)
+{
+ struct bpf_func_state *state, *unused;
+ struct bpf_reg_state *reg;
+ int i;
+
+ state = cur_func(env);
+
+ if (!ref_obj_id) {
+ verbose(env, "verifier internal error: ref_obj_id is zero for "
+ "owning -> non-owning conversion\n");
+ return -EFAULT;
+ }
+
+ for (i = 0; i < state->acquired_refs; i++) {
+ if (state->refs[i].id != ref_obj_id)
+ continue;
+
+ /* Clear ref_obj_id here so release_reference doesn't clobber
+ * the whole reg
+ */
+ bpf_for_each_reg_in_vstate(env->cur_state, unused, reg, ({
+ if (reg->ref_obj_id == ref_obj_id) {
+ reg->ref_obj_id = 0;
+ ref_set_non_owning(env, reg);
+ }
+ }));
+ return 0;
+ }
+
+ verbose(env, "verifier internal error: ref state missing for ref_obj_id\n");
+ return -EFAULT;
+}
+
+/* Implementation details:
+ *
+ * Each register points to some region of memory, which we define as an
+ * allocation. Each allocation may embed a bpf_spin_lock which protects any
+ * special BPF objects (bpf_list_head, bpf_rb_root, etc.) part of the same
+ * allocation. The lock and the data it protects are colocated in the same
+ * memory region.
+ *
+ * Hence, everytime a register holds a pointer value pointing to such
+ * allocation, the verifier preserves a unique reg->id for it.
+ *
+ * The verifier remembers the lock 'ptr' and the lock 'id' whenever
+ * bpf_spin_lock is called.
+ *
+ * To enable this, lock state in the verifier captures two values:
+ * active_lock.ptr = Register's type specific pointer
+ * active_lock.id = A unique ID for each register pointer value
+ *
+ * Currently, PTR_TO_MAP_VALUE and PTR_TO_BTF_ID | MEM_ALLOC are the two
+ * supported register types.
+ *
+ * The active_lock.ptr in case of map values is the reg->map_ptr, and in case of
+ * allocated objects is the reg->btf pointer.
+ *
+ * The active_lock.id is non-unique for maps supporting direct_value_addr, as we
+ * can establish the provenance of the map value statically for each distinct
+ * lookup into such maps. They always contain a single map value hence unique
+ * IDs for each pseudo load pessimizes the algorithm and rejects valid programs.
+ *
+ * So, in case of global variables, they use array maps with max_entries = 1,
+ * hence their active_lock.ptr becomes map_ptr and id = 0 (since they all point
+ * into the same map value as max_entries is 1, as described above).
+ *
+ * In case of inner map lookups, the inner map pointer has same map_ptr as the
+ * outer map pointer (in verifier context), but each lookup into an inner map
+ * assigns a fresh reg->id to the lookup, so while lookups into distinct inner
+ * maps from the same outer map share the same map_ptr as active_lock.ptr, they
+ * will get different reg->id assigned to each lookup, hence different
+ * active_lock.id.
+ *
+ * In case of allocated objects, active_lock.ptr is the reg->btf, and the
+ * reg->id is a unique ID preserved after the NULL pointer check on the pointer
+ * returned from bpf_obj_new. Each allocation receives a new reg->id.
+ */
+static int check_reg_allocation_locked(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+ void *ptr;
+ u32 id;
+
+ switch ((int)reg->type) {
+ case PTR_TO_MAP_VALUE:
+ ptr = reg->map_ptr;
+ break;
+ case PTR_TO_BTF_ID | MEM_ALLOC:
+ ptr = reg->btf;
+ break;
+ default:
+ verbose(env, "verifier internal error: unknown reg type for lock check\n");
+ return -EFAULT;
+ }
+ id = reg->id;
+
+ if (!env->cur_state->active_lock.ptr)
+ return -EINVAL;
+ if (env->cur_state->active_lock.ptr != ptr ||
+ env->cur_state->active_lock.id != id) {
+ verbose(env, "held lock and object are not in the same allocation\n");
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static bool is_bpf_list_api_kfunc(u32 btf_id)
+{
+ return btf_id == special_kfunc_list[KF_bpf_list_push_front_impl] ||
+ btf_id == special_kfunc_list[KF_bpf_list_push_back_impl] ||
+ btf_id == special_kfunc_list[KF_bpf_list_pop_front] ||
+ btf_id == special_kfunc_list[KF_bpf_list_pop_back];
+}
+
+static bool is_bpf_rbtree_api_kfunc(u32 btf_id)
+{
+ return btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl] ||
+ btf_id == special_kfunc_list[KF_bpf_rbtree_remove] ||
+ btf_id == special_kfunc_list[KF_bpf_rbtree_first];
+}
+
+static bool is_bpf_graph_api_kfunc(u32 btf_id)
+{
+ return is_bpf_list_api_kfunc(btf_id) || is_bpf_rbtree_api_kfunc(btf_id) ||
+ btf_id == special_kfunc_list[KF_bpf_refcount_acquire_impl];
+}
+
+static bool is_callback_calling_kfunc(u32 btf_id)
+{
+ return btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl];
+}
+
+static bool is_rbtree_lock_required_kfunc(u32 btf_id)
+{
+ return is_bpf_rbtree_api_kfunc(btf_id);
+}
+
+static bool check_kfunc_is_graph_root_api(struct bpf_verifier_env *env,
+ enum btf_field_type head_field_type,
+ u32 kfunc_btf_id)
+{
+ bool ret;
+
+ switch (head_field_type) {
+ case BPF_LIST_HEAD:
+ ret = is_bpf_list_api_kfunc(kfunc_btf_id);
+ break;
+ case BPF_RB_ROOT:
+ ret = is_bpf_rbtree_api_kfunc(kfunc_btf_id);
+ break;
+ default:
+ verbose(env, "verifier internal error: unexpected graph root argument type %s\n",
+ btf_field_type_name(head_field_type));
+ return false;
+ }
+
+ if (!ret)
+ verbose(env, "verifier internal error: %s head arg for unknown kfunc\n",
+ btf_field_type_name(head_field_type));
+ return ret;
+}
+
+static bool check_kfunc_is_graph_node_api(struct bpf_verifier_env *env,
+ enum btf_field_type node_field_type,
+ u32 kfunc_btf_id)
+{
+ bool ret;
+
+ switch (node_field_type) {
+ case BPF_LIST_NODE:
+ ret = (kfunc_btf_id == special_kfunc_list[KF_bpf_list_push_front_impl] ||
+ kfunc_btf_id == special_kfunc_list[KF_bpf_list_push_back_impl]);
+ break;
+ case BPF_RB_NODE:
+ ret = (kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_remove] ||
+ kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl]);
+ break;
+ default:
+ verbose(env, "verifier internal error: unexpected graph node argument type %s\n",
+ btf_field_type_name(node_field_type));
+ return false;
+ }
+
+ if (!ret)
+ verbose(env, "verifier internal error: %s node arg for unknown kfunc\n",
+ btf_field_type_name(node_field_type));
+ return ret;
+}
+
+static int
+__process_kf_arg_ptr_to_graph_root(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg, u32 regno,
+ struct bpf_kfunc_call_arg_meta *meta,
+ enum btf_field_type head_field_type,
+ struct btf_field **head_field)
+{
+ const char *head_type_name;
+ struct btf_field *field;
+ struct btf_record *rec;
+ u32 head_off;
+
+ if (meta->btf != btf_vmlinux) {
+ verbose(env, "verifier internal error: unexpected btf mismatch in kfunc call\n");
+ return -EFAULT;
+ }
+
+ if (!check_kfunc_is_graph_root_api(env, head_field_type, meta->func_id))
+ return -EFAULT;
+
+ head_type_name = btf_field_type_name(head_field_type);
+ if (!tnum_is_const(reg->var_off)) {
+ verbose(env,
+ "R%d doesn't have constant offset. %s has to be at the constant offset\n",
+ regno, head_type_name);
+ return -EINVAL;
+ }
+
+ rec = reg_btf_record(reg);
+ head_off = reg->off + reg->var_off.value;
+ field = btf_record_find(rec, head_off, head_field_type);
+ if (!field) {
+ verbose(env, "%s not found at offset=%u\n", head_type_name, head_off);
+ return -EINVAL;
+ }
+
+ /* All functions require bpf_list_head to be protected using a bpf_spin_lock */
+ if (check_reg_allocation_locked(env, reg)) {
+ verbose(env, "bpf_spin_lock at off=%d must be held for %s\n",
+ rec->spin_lock_off, head_type_name);
+ return -EINVAL;
+ }
+
+ if (*head_field) {
+ verbose(env, "verifier internal error: repeating %s arg\n", head_type_name);
+ return -EFAULT;
+ }
+ *head_field = field;
+ return 0;
+}
+
+static int process_kf_arg_ptr_to_list_head(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg, u32 regno,
+ struct bpf_kfunc_call_arg_meta *meta)
+{
+ return __process_kf_arg_ptr_to_graph_root(env, reg, regno, meta, BPF_LIST_HEAD,
+ &meta->arg_list_head.field);
+}
+
+static int process_kf_arg_ptr_to_rbtree_root(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg, u32 regno,
+ struct bpf_kfunc_call_arg_meta *meta)
+{
+ return __process_kf_arg_ptr_to_graph_root(env, reg, regno, meta, BPF_RB_ROOT,
+ &meta->arg_rbtree_root.field);
+}
+
+static int
+__process_kf_arg_ptr_to_graph_node(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg, u32 regno,
+ struct bpf_kfunc_call_arg_meta *meta,
+ enum btf_field_type head_field_type,
+ enum btf_field_type node_field_type,
+ struct btf_field **node_field)
+{
+ const char *node_type_name;
+ const struct btf_type *et, *t;
+ struct btf_field *field;
+ u32 node_off;
+
+ if (meta->btf != btf_vmlinux) {
+ verbose(env, "verifier internal error: unexpected btf mismatch in kfunc call\n");
+ return -EFAULT;
+ }
+
+ if (!check_kfunc_is_graph_node_api(env, node_field_type, meta->func_id))
+ return -EFAULT;
+
+ node_type_name = btf_field_type_name(node_field_type);
+ if (!tnum_is_const(reg->var_off)) {
+ verbose(env,
+ "R%d doesn't have constant offset. %s has to be at the constant offset\n",
+ regno, node_type_name);
+ return -EINVAL;
+ }
+
+ node_off = reg->off + reg->var_off.value;
+ field = reg_find_field_offset(reg, node_off, node_field_type);
+ if (!field || field->offset != node_off) {
+ verbose(env, "%s not found at offset=%u\n", node_type_name, node_off);
+ return -EINVAL;
+ }
+
+ field = *node_field;
+
+ et = btf_type_by_id(field->graph_root.btf, field->graph_root.value_btf_id);
+ t = btf_type_by_id(reg->btf, reg->btf_id);
+ if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, 0, field->graph_root.btf,
+ field->graph_root.value_btf_id, true)) {
+ verbose(env, "operation on %s expects arg#1 %s at offset=%d "
+ "in struct %s, but arg is at offset=%d in struct %s\n",
+ btf_field_type_name(head_field_type),
+ btf_field_type_name(node_field_type),
+ field->graph_root.node_offset,
+ btf_name_by_offset(field->graph_root.btf, et->name_off),
+ node_off, btf_name_by_offset(reg->btf, t->name_off));
+ return -EINVAL;
+ }
+
+ if (node_off != field->graph_root.node_offset) {
+ verbose(env, "arg#1 offset=%d, but expected %s at offset=%d in struct %s\n",
+ node_off, btf_field_type_name(node_field_type),
+ field->graph_root.node_offset,
+ btf_name_by_offset(field->graph_root.btf, et->name_off));
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int process_kf_arg_ptr_to_list_node(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg, u32 regno,
+ struct bpf_kfunc_call_arg_meta *meta)
+{
+ return __process_kf_arg_ptr_to_graph_node(env, reg, regno, meta,
+ BPF_LIST_HEAD, BPF_LIST_NODE,
+ &meta->arg_list_head.field);
+}
+
+static int process_kf_arg_ptr_to_rbtree_node(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg, u32 regno,
+ struct bpf_kfunc_call_arg_meta *meta)
+{
+ return __process_kf_arg_ptr_to_graph_node(env, reg, regno, meta,
+ BPF_RB_ROOT, BPF_RB_NODE,
+ &meta->arg_rbtree_root.field);
+}
+
+static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_arg_meta *meta,
+ int insn_idx)
+{
+ const char *func_name = meta->func_name, *ref_tname;
+ const struct btf *btf = meta->btf;
const struct btf_param *args;
- int err;
+ struct btf_record *rec;
+ u32 i, nargs;
+ int ret;
- func_id = insn->imm;
- func = btf_type_by_id(btf_vmlinux, func_id);
- func_name = btf_name_by_offset(btf_vmlinux, func->name_off);
- func_proto = btf_type_by_id(btf_vmlinux, func->type);
+ args = (const struct btf_param *)(meta->func_proto + 1);
+ nargs = btf_type_vlen(meta->func_proto);
+ if (nargs > MAX_BPF_FUNC_REG_ARGS) {
+ verbose(env, "Function %s has %d > %d args\n", func_name, nargs,
+ MAX_BPF_FUNC_REG_ARGS);
+ return -EINVAL;
+ }
+
+ /* Check that BTF function arguments match actual types that the
+ * verifier sees.
+ */
+ for (i = 0; i < nargs; i++) {
+ struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[i + 1];
+ const struct btf_type *t, *ref_t, *resolve_ret;
+ enum bpf_arg_type arg_type = ARG_DONTCARE;
+ u32 regno = i + 1, ref_id, type_size;
+ bool is_ret_buf_sz = false;
+ int kf_arg_type;
+
+ t = btf_type_skip_modifiers(btf, args[i].type, NULL);
+
+ if (is_kfunc_arg_ignore(btf, &args[i]))
+ continue;
+
+ if (btf_type_is_scalar(t)) {
+ if (reg->type != SCALAR_VALUE) {
+ verbose(env, "R%d is not a scalar\n", regno);
+ return -EINVAL;
+ }
+
+ if (is_kfunc_arg_constant(meta->btf, &args[i])) {
+ if (meta->arg_constant.found) {
+ verbose(env, "verifier internal error: only one constant argument permitted\n");
+ return -EFAULT;
+ }
+ if (!tnum_is_const(reg->var_off)) {
+ verbose(env, "R%d must be a known constant\n", regno);
+ return -EINVAL;
+ }
+ ret = mark_chain_precision(env, regno);
+ if (ret < 0)
+ return ret;
+ meta->arg_constant.found = true;
+ meta->arg_constant.value = reg->var_off.value;
+ } else if (is_kfunc_arg_scalar_with_name(btf, &args[i], "rdonly_buf_size")) {
+ meta->r0_rdonly = true;
+ is_ret_buf_sz = true;
+ } else if (is_kfunc_arg_scalar_with_name(btf, &args[i], "rdwr_buf_size")) {
+ is_ret_buf_sz = true;
+ }
+
+ if (is_ret_buf_sz) {
+ if (meta->r0_size) {
+ verbose(env, "2 or more rdonly/rdwr_buf_size parameters for kfunc");
+ return -EINVAL;
+ }
+
+ if (!tnum_is_const(reg->var_off)) {
+ verbose(env, "R%d is not a const\n", regno);
+ return -EINVAL;
+ }
+
+ meta->r0_size = reg->var_off.value;
+ ret = mark_chain_precision(env, regno);
+ if (ret)
+ return ret;
+ }
+ continue;
+ }
+
+ if (!btf_type_is_ptr(t)) {
+ verbose(env, "Unrecognized arg#%d type %s\n", i, btf_type_str(t));
+ return -EINVAL;
+ }
+
+ if ((is_kfunc_trusted_args(meta) || is_kfunc_rcu(meta)) &&
+ (register_is_null(reg) || type_may_be_null(reg->type))) {
+ verbose(env, "Possibly NULL pointer passed to trusted arg%d\n", i);
+ return -EACCES;
+ }
+
+ if (reg->ref_obj_id) {
+ if (is_kfunc_release(meta) && meta->ref_obj_id) {
+ verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n",
+ regno, reg->ref_obj_id,
+ meta->ref_obj_id);
+ return -EFAULT;
+ }
+ meta->ref_obj_id = reg->ref_obj_id;
+ if (is_kfunc_release(meta))
+ meta->release_regno = regno;
+ }
- if (!env->ops->check_kfunc_call ||
- !env->ops->check_kfunc_call(func_id)) {
- verbose(env, "calling kernel function %s is not allowed\n",
+ ref_t = btf_type_skip_modifiers(btf, t->type, &ref_id);
+ ref_tname = btf_name_by_offset(btf, ref_t->name_off);
+
+ kf_arg_type = get_kfunc_ptr_arg_type(env, meta, t, ref_t, ref_tname, args, i, nargs);
+ if (kf_arg_type < 0)
+ return kf_arg_type;
+
+ switch (kf_arg_type) {
+ case KF_ARG_PTR_TO_ALLOC_BTF_ID:
+ case KF_ARG_PTR_TO_BTF_ID:
+ if (!is_kfunc_trusted_args(meta) && !is_kfunc_rcu(meta))
+ break;
+
+ if (!is_trusted_reg(reg)) {
+ if (!is_kfunc_rcu(meta)) {
+ verbose(env, "R%d must be referenced or trusted\n", regno);
+ return -EINVAL;
+ }
+ if (!is_rcu_reg(reg)) {
+ verbose(env, "R%d must be a rcu pointer\n", regno);
+ return -EINVAL;
+ }
+ }
+
+ fallthrough;
+ case KF_ARG_PTR_TO_CTX:
+ /* Trusted arguments have the same offset checks as release arguments */
+ arg_type |= OBJ_RELEASE;
+ break;
+ case KF_ARG_PTR_TO_DYNPTR:
+ case KF_ARG_PTR_TO_ITER:
+ case KF_ARG_PTR_TO_LIST_HEAD:
+ case KF_ARG_PTR_TO_LIST_NODE:
+ case KF_ARG_PTR_TO_RB_ROOT:
+ case KF_ARG_PTR_TO_RB_NODE:
+ case KF_ARG_PTR_TO_MEM:
+ case KF_ARG_PTR_TO_MEM_SIZE:
+ case KF_ARG_PTR_TO_CALLBACK:
+ case KF_ARG_PTR_TO_REFCOUNTED_KPTR:
+ /* Trusted by default */
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ return -EFAULT;
+ }
+
+ if (is_kfunc_release(meta) && reg->ref_obj_id)
+ arg_type |= OBJ_RELEASE;
+ ret = check_func_arg_reg_off(env, reg, regno, arg_type);
+ if (ret < 0)
+ return ret;
+
+ switch (kf_arg_type) {
+ case KF_ARG_PTR_TO_CTX:
+ if (reg->type != PTR_TO_CTX) {
+ verbose(env, "arg#%d expected pointer to ctx, but got %s\n", i, btf_type_str(t));
+ return -EINVAL;
+ }
+
+ if (meta->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx]) {
+ ret = get_kern_ctx_btf_id(&env->log, resolve_prog_type(env->prog));
+ if (ret < 0)
+ return -EINVAL;
+ meta->ret_btf_id = ret;
+ }
+ break;
+ case KF_ARG_PTR_TO_ALLOC_BTF_ID:
+ if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) {
+ verbose(env, "arg#%d expected pointer to allocated object\n", i);
+ return -EINVAL;
+ }
+ if (!reg->ref_obj_id) {
+ verbose(env, "allocated object must be referenced\n");
+ return -EINVAL;
+ }
+ if (meta->btf == btf_vmlinux &&
+ meta->func_id == special_kfunc_list[KF_bpf_obj_drop_impl]) {
+ meta->arg_obj_drop.btf = reg->btf;
+ meta->arg_obj_drop.btf_id = reg->btf_id;
+ }
+ break;
+ case KF_ARG_PTR_TO_DYNPTR:
+ {
+ enum bpf_arg_type dynptr_arg_type = ARG_PTR_TO_DYNPTR;
+
+ if (reg->type != PTR_TO_STACK &&
+ reg->type != CONST_PTR_TO_DYNPTR) {
+ verbose(env, "arg#%d expected pointer to stack or dynptr_ptr\n", i);
+ return -EINVAL;
+ }
+
+ if (reg->type == CONST_PTR_TO_DYNPTR)
+ dynptr_arg_type |= MEM_RDONLY;
+
+ if (is_kfunc_arg_uninit(btf, &args[i]))
+ dynptr_arg_type |= MEM_UNINIT;
+
+ if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_skb])
+ dynptr_arg_type |= DYNPTR_TYPE_SKB;
+ else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_xdp])
+ dynptr_arg_type |= DYNPTR_TYPE_XDP;
+
+ ret = process_dynptr_func(env, regno, insn_idx, dynptr_arg_type);
+ if (ret < 0)
+ return ret;
+
+ if (!(dynptr_arg_type & MEM_UNINIT)) {
+ int id = dynptr_id(env, reg);
+
+ if (id < 0) {
+ verbose(env, "verifier internal error: failed to obtain dynptr id\n");
+ return id;
+ }
+ meta->initialized_dynptr.id = id;
+ meta->initialized_dynptr.type = dynptr_get_type(env, reg);
+ }
+
+ break;
+ }
+ case KF_ARG_PTR_TO_ITER:
+ ret = process_iter_arg(env, regno, insn_idx, meta);
+ if (ret < 0)
+ return ret;
+ break;
+ case KF_ARG_PTR_TO_LIST_HEAD:
+ if (reg->type != PTR_TO_MAP_VALUE &&
+ reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) {
+ verbose(env, "arg#%d expected pointer to map value or allocated object\n", i);
+ return -EINVAL;
+ }
+ if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC) && !reg->ref_obj_id) {
+ verbose(env, "allocated object must be referenced\n");
+ return -EINVAL;
+ }
+ ret = process_kf_arg_ptr_to_list_head(env, reg, regno, meta);
+ if (ret < 0)
+ return ret;
+ break;
+ case KF_ARG_PTR_TO_RB_ROOT:
+ if (reg->type != PTR_TO_MAP_VALUE &&
+ reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) {
+ verbose(env, "arg#%d expected pointer to map value or allocated object\n", i);
+ return -EINVAL;
+ }
+ if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC) && !reg->ref_obj_id) {
+ verbose(env, "allocated object must be referenced\n");
+ return -EINVAL;
+ }
+ ret = process_kf_arg_ptr_to_rbtree_root(env, reg, regno, meta);
+ if (ret < 0)
+ return ret;
+ break;
+ case KF_ARG_PTR_TO_LIST_NODE:
+ if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) {
+ verbose(env, "arg#%d expected pointer to allocated object\n", i);
+ return -EINVAL;
+ }
+ if (!reg->ref_obj_id) {
+ verbose(env, "allocated object must be referenced\n");
+ return -EINVAL;
+ }
+ ret = process_kf_arg_ptr_to_list_node(env, reg, regno, meta);
+ if (ret < 0)
+ return ret;
+ break;
+ case KF_ARG_PTR_TO_RB_NODE:
+ if (meta->func_id == special_kfunc_list[KF_bpf_rbtree_remove]) {
+ if (!type_is_non_owning_ref(reg->type) || reg->ref_obj_id) {
+ verbose(env, "rbtree_remove node input must be non-owning ref\n");
+ return -EINVAL;
+ }
+ if (in_rbtree_lock_required_cb(env)) {
+ verbose(env, "rbtree_remove not allowed in rbtree cb\n");
+ return -EINVAL;
+ }
+ } else {
+ if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) {
+ verbose(env, "arg#%d expected pointer to allocated object\n", i);
+ return -EINVAL;
+ }
+ if (!reg->ref_obj_id) {
+ verbose(env, "allocated object must be referenced\n");
+ return -EINVAL;
+ }
+ }
+
+ ret = process_kf_arg_ptr_to_rbtree_node(env, reg, regno, meta);
+ if (ret < 0)
+ return ret;
+ break;
+ case KF_ARG_PTR_TO_BTF_ID:
+ /* Only base_type is checked, further checks are done here */
+ if ((base_type(reg->type) != PTR_TO_BTF_ID ||
+ (bpf_type_has_unsafe_modifiers(reg->type) && !is_rcu_reg(reg))) &&
+ !reg2btf_ids[base_type(reg->type)]) {
+ verbose(env, "arg#%d is %s ", i, reg_type_str(env, reg->type));
+ verbose(env, "expected %s or socket\n",
+ reg_type_str(env, base_type(reg->type) |
+ (type_flag(reg->type) & BPF_REG_TRUSTED_MODIFIERS)));
+ return -EINVAL;
+ }
+ ret = process_kf_arg_ptr_to_btf_id(env, reg, ref_t, ref_tname, ref_id, meta, i);
+ if (ret < 0)
+ return ret;
+ break;
+ case KF_ARG_PTR_TO_MEM:
+ resolve_ret = btf_resolve_size(btf, ref_t, &type_size);
+ if (IS_ERR(resolve_ret)) {
+ verbose(env, "arg#%d reference type('%s %s') size cannot be determined: %ld\n",
+ i, btf_type_str(ref_t), ref_tname, PTR_ERR(resolve_ret));
+ return -EINVAL;
+ }
+ ret = check_mem_reg(env, reg, regno, type_size);
+ if (ret < 0)
+ return ret;
+ break;
+ case KF_ARG_PTR_TO_MEM_SIZE:
+ {
+ struct bpf_reg_state *size_reg = &regs[regno + 1];
+ const struct btf_param *size_arg = &args[i + 1];
+
+ ret = check_kfunc_mem_size_reg(env, size_reg, regno + 1);
+ if (ret < 0) {
+ verbose(env, "arg#%d arg#%d memory, len pair leads to invalid memory access\n", i, i + 1);
+ return ret;
+ }
+
+ if (is_kfunc_arg_const_mem_size(meta->btf, size_arg, size_reg)) {
+ if (meta->arg_constant.found) {
+ verbose(env, "verifier internal error: only one constant argument permitted\n");
+ return -EFAULT;
+ }
+ if (!tnum_is_const(size_reg->var_off)) {
+ verbose(env, "R%d must be a known constant\n", regno + 1);
+ return -EINVAL;
+ }
+ meta->arg_constant.found = true;
+ meta->arg_constant.value = size_reg->var_off.value;
+ }
+
+ /* Skip next '__sz' or '__szk' argument */
+ i++;
+ break;
+ }
+ case KF_ARG_PTR_TO_CALLBACK:
+ meta->subprogno = reg->subprogno;
+ break;
+ case KF_ARG_PTR_TO_REFCOUNTED_KPTR:
+ if (!type_is_ptr_alloc_obj(reg->type) && !type_is_non_owning_ref(reg->type)) {
+ verbose(env, "arg#%d is neither owning or non-owning ref\n", i);
+ return -EINVAL;
+ }
+
+ rec = reg_btf_record(reg);
+ if (!rec) {
+ verbose(env, "verifier internal error: Couldn't find btf_record\n");
+ return -EFAULT;
+ }
+
+ if (rec->refcount_off < 0) {
+ verbose(env, "arg#%d doesn't point to a type with bpf_refcount field\n", i);
+ return -EINVAL;
+ }
+ if (rec->refcount_off >= 0) {
+ verbose(env, "bpf_refcount_acquire calls are disabled for now\n");
+ return -EINVAL;
+ }
+ meta->arg_refcount_acquire.btf = reg->btf;
+ meta->arg_refcount_acquire.btf_id = reg->btf_id;
+ break;
+ }
+ }
+
+ if (is_kfunc_release(meta) && !meta->release_regno) {
+ verbose(env, "release kernel function %s expects refcounted PTR_TO_BTF_ID\n",
func_name);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int fetch_kfunc_meta(struct bpf_verifier_env *env,
+ struct bpf_insn *insn,
+ struct bpf_kfunc_call_arg_meta *meta,
+ const char **kfunc_name)
+{
+ const struct btf_type *func, *func_proto;
+ u32 func_id, *kfunc_flags;
+ const char *func_name;
+ struct btf *desc_btf;
+
+ if (kfunc_name)
+ *kfunc_name = NULL;
+
+ if (!insn->imm)
+ return -EINVAL;
+
+ desc_btf = find_kfunc_desc_btf(env, insn->off);
+ if (IS_ERR(desc_btf))
+ return PTR_ERR(desc_btf);
+
+ func_id = insn->imm;
+ func = btf_type_by_id(desc_btf, func_id);
+ func_name = btf_name_by_offset(desc_btf, func->name_off);
+ if (kfunc_name)
+ *kfunc_name = func_name;
+ func_proto = btf_type_by_id(desc_btf, func->type);
+
+ kfunc_flags = btf_kfunc_id_set_contains(desc_btf, resolve_prog_type(env->prog), func_id);
+ if (!kfunc_flags) {
return -EACCES;
}
- /* Check the arguments */
- err = btf_check_kfunc_arg_match(env, btf_vmlinux, func_id, regs);
+ memset(meta, 0, sizeof(*meta));
+ meta->btf = desc_btf;
+ meta->func_id = func_id;
+ meta->kfunc_flags = *kfunc_flags;
+ meta->func_proto = func_proto;
+ meta->func_name = func_name;
+
+ return 0;
+}
+
+static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
+ int *insn_idx_p)
+{
+ const struct btf_type *t, *ptr_type;
+ u32 i, nargs, ptr_type_id, release_ref_obj_id;
+ struct bpf_reg_state *regs = cur_regs(env);
+ const char *func_name, *ptr_type_name;
+ bool sleepable, rcu_lock, rcu_unlock;
+ struct bpf_kfunc_call_arg_meta meta;
+ struct bpf_insn_aux_data *insn_aux;
+ int err, insn_idx = *insn_idx_p;
+ const struct btf_param *args;
+ const struct btf_type *ret_t;
+ struct btf *desc_btf;
+
+ /* skip for now, but return error when we find this in fixup_kfunc_call */
+ if (!insn->imm)
+ return 0;
+
+ err = fetch_kfunc_meta(env, insn, &meta, &func_name);
+ if (err == -EACCES && func_name)
+ verbose(env, "calling kernel function %s is not allowed\n", func_name);
if (err)
return err;
+ desc_btf = meta.btf;
+ insn_aux = &env->insn_aux_data[insn_idx];
+
+ insn_aux->is_iter_next = is_iter_next_kfunc(&meta);
+
+ if (is_kfunc_destructive(&meta) && !capable(CAP_SYS_BOOT)) {
+ verbose(env, "destructive kfunc calls require CAP_SYS_BOOT capability\n");
+ return -EACCES;
+ }
+
+ sleepable = is_kfunc_sleepable(&meta);
+ if (sleepable && !env->prog->aux->sleepable) {
+ verbose(env, "program must be sleepable to call sleepable kfunc %s\n", func_name);
+ return -EACCES;
+ }
+
+ rcu_lock = is_kfunc_bpf_rcu_read_lock(&meta);
+ rcu_unlock = is_kfunc_bpf_rcu_read_unlock(&meta);
+
+ if (env->cur_state->active_rcu_lock) {
+ struct bpf_func_state *state;
+ struct bpf_reg_state *reg;
+
+ if (rcu_lock) {
+ verbose(env, "nested rcu read lock (kernel function %s)\n", func_name);
+ return -EINVAL;
+ } else if (rcu_unlock) {
+ bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({
+ if (reg->type & MEM_RCU) {
+ reg->type &= ~(MEM_RCU | PTR_MAYBE_NULL);
+ reg->type |= PTR_UNTRUSTED;
+ }
+ }));
+ env->cur_state->active_rcu_lock = false;
+ } else if (sleepable) {
+ verbose(env, "kernel func %s is sleepable within rcu_read_lock region\n", func_name);
+ return -EACCES;
+ }
+ } else if (rcu_lock) {
+ env->cur_state->active_rcu_lock = true;
+ } else if (rcu_unlock) {
+ verbose(env, "unmatched rcu read unlock (kernel function %s)\n", func_name);
+ return -EINVAL;
+ }
+
+ /* Check the arguments */
+ err = check_kfunc_args(env, &meta, insn_idx);
+ if (err < 0)
+ return err;
+ /* In case of release function, we get register number of refcounted
+ * PTR_TO_BTF_ID in bpf_kfunc_arg_meta, do the release now.
+ */
+ if (meta.release_regno) {
+ err = release_reference(env, regs[meta.release_regno].ref_obj_id);
+ if (err) {
+ verbose(env, "kfunc %s#%d reference has not been acquired before\n",
+ func_name, meta.func_id);
+ return err;
+ }
+ }
+
+ if (meta.func_id == special_kfunc_list[KF_bpf_list_push_front_impl] ||
+ meta.func_id == special_kfunc_list[KF_bpf_list_push_back_impl] ||
+ meta.func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) {
+ release_ref_obj_id = regs[BPF_REG_2].ref_obj_id;
+ insn_aux->insert_off = regs[BPF_REG_2].off;
+ err = ref_convert_owning_non_owning(env, release_ref_obj_id);
+ if (err) {
+ verbose(env, "kfunc %s#%d conversion of owning ref to non-owning failed\n",
+ func_name, meta.func_id);
+ return err;
+ }
+
+ err = release_reference(env, release_ref_obj_id);
+ if (err) {
+ verbose(env, "kfunc %s#%d reference has not been acquired before\n",
+ func_name, meta.func_id);
+ return err;
+ }
+ }
+
+ if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) {
+ err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
+ set_rbtree_add_callback_state);
+ if (err) {
+ verbose(env, "kfunc %s#%d failed callback verification\n",
+ func_name, meta.func_id);
+ return err;
+ }
+ }
for (i = 0; i < CALLER_SAVED_REGS; i++)
mark_reg_not_init(env, regs, caller_saved[i]);
/* Check return type */
- t = btf_type_skip_modifiers(btf_vmlinux, func_proto->type, NULL);
+ t = btf_type_skip_modifiers(desc_btf, meta.func_proto->type, NULL);
+
+ if (is_kfunc_acquire(&meta) && !btf_type_is_struct_ptr(meta.btf, t)) {
+ /* Only exception is bpf_obj_new_impl */
+ if (meta.btf != btf_vmlinux ||
+ (meta.func_id != special_kfunc_list[KF_bpf_obj_new_impl] &&
+ meta.func_id != special_kfunc_list[KF_bpf_refcount_acquire_impl])) {
+ verbose(env, "acquire kernel function does not return PTR_TO_BTF_ID\n");
+ return -EINVAL;
+ }
+ }
+
if (btf_type_is_scalar(t)) {
mark_reg_unknown(env, regs, BPF_REG_0);
mark_btf_func_reg_size(env, BPF_REG_0, t->size);
} else if (btf_type_is_ptr(t)) {
- ptr_type = btf_type_skip_modifiers(btf_vmlinux, t->type,
- &ptr_type_id);
- if (!btf_type_is_struct(ptr_type)) {
- ptr_type_name = btf_name_by_offset(btf_vmlinux,
- ptr_type->name_off);
- verbose(env, "kernel function %s returns pointer type %s %s is not supported\n",
- func_name, btf_type_str(ptr_type),
- ptr_type_name);
- return -EINVAL;
+ ptr_type = btf_type_skip_modifiers(desc_btf, t->type, &ptr_type_id);
+
+ if (meta.btf == btf_vmlinux && btf_id_set_contains(&special_kfunc_set, meta.func_id)) {
+ if (meta.func_id == special_kfunc_list[KF_bpf_obj_new_impl]) {
+ struct btf *ret_btf;
+ u32 ret_btf_id;
+
+ if (unlikely(!bpf_global_ma_set))
+ return -ENOMEM;
+
+ if (((u64)(u32)meta.arg_constant.value) != meta.arg_constant.value) {
+ verbose(env, "local type ID argument must be in range [0, U32_MAX]\n");
+ return -EINVAL;
+ }
+
+ ret_btf = env->prog->aux->btf;
+ ret_btf_id = meta.arg_constant.value;
+
+ /* This may be NULL due to user not supplying a BTF */
+ if (!ret_btf) {
+ verbose(env, "bpf_obj_new requires prog BTF\n");
+ return -EINVAL;
+ }
+
+ ret_t = btf_type_by_id(ret_btf, ret_btf_id);
+ if (!ret_t || !__btf_type_is_struct(ret_t)) {
+ verbose(env, "bpf_obj_new type ID argument must be of a struct\n");
+ return -EINVAL;
+ }
+
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+ regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC;
+ regs[BPF_REG_0].btf = ret_btf;
+ regs[BPF_REG_0].btf_id = ret_btf_id;
+
+ insn_aux->obj_new_size = ret_t->size;
+ insn_aux->kptr_struct_meta =
+ btf_find_struct_meta(ret_btf, ret_btf_id);
+ } else if (meta.func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl]) {
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+ regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC;
+ regs[BPF_REG_0].btf = meta.arg_refcount_acquire.btf;
+ regs[BPF_REG_0].btf_id = meta.arg_refcount_acquire.btf_id;
+
+ insn_aux->kptr_struct_meta =
+ btf_find_struct_meta(meta.arg_refcount_acquire.btf,
+ meta.arg_refcount_acquire.btf_id);
+ } else if (meta.func_id == special_kfunc_list[KF_bpf_list_pop_front] ||
+ meta.func_id == special_kfunc_list[KF_bpf_list_pop_back]) {
+ struct btf_field *field = meta.arg_list_head.field;
+
+ mark_reg_graph_node(regs, BPF_REG_0, &field->graph_root);
+ } else if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_remove] ||
+ meta.func_id == special_kfunc_list[KF_bpf_rbtree_first]) {
+ struct btf_field *field = meta.arg_rbtree_root.field;
+
+ mark_reg_graph_node(regs, BPF_REG_0, &field->graph_root);
+ } else if (meta.func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx]) {
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+ regs[BPF_REG_0].type = PTR_TO_BTF_ID | PTR_TRUSTED;
+ regs[BPF_REG_0].btf = desc_btf;
+ regs[BPF_REG_0].btf_id = meta.ret_btf_id;
+ } else if (meta.func_id == special_kfunc_list[KF_bpf_rdonly_cast]) {
+ ret_t = btf_type_by_id(desc_btf, meta.arg_constant.value);
+ if (!ret_t || !btf_type_is_struct(ret_t)) {
+ verbose(env,
+ "kfunc bpf_rdonly_cast type ID argument must be of a struct\n");
+ return -EINVAL;
+ }
+
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+ regs[BPF_REG_0].type = PTR_TO_BTF_ID | PTR_UNTRUSTED;
+ regs[BPF_REG_0].btf = desc_btf;
+ regs[BPF_REG_0].btf_id = meta.arg_constant.value;
+ } else if (meta.func_id == special_kfunc_list[KF_bpf_dynptr_slice] ||
+ meta.func_id == special_kfunc_list[KF_bpf_dynptr_slice_rdwr]) {
+ enum bpf_type_flag type_flag = get_dynptr_type_flag(meta.initialized_dynptr.type);
+
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+
+ if (!meta.arg_constant.found) {
+ verbose(env, "verifier internal error: bpf_dynptr_slice(_rdwr) no constant size\n");
+ return -EFAULT;
+ }
+
+ regs[BPF_REG_0].mem_size = meta.arg_constant.value;
+
+ /* PTR_MAYBE_NULL will be added when is_kfunc_ret_null is checked */
+ regs[BPF_REG_0].type = PTR_TO_MEM | type_flag;
+
+ if (meta.func_id == special_kfunc_list[KF_bpf_dynptr_slice]) {
+ regs[BPF_REG_0].type |= MEM_RDONLY;
+ } else {
+ /* this will set env->seen_direct_write to true */
+ if (!may_access_direct_pkt_data(env, NULL, BPF_WRITE)) {
+ verbose(env, "the prog does not allow writes to packet data\n");
+ return -EINVAL;
+ }
+ }
+
+ if (!meta.initialized_dynptr.id) {
+ verbose(env, "verifier internal error: no dynptr id\n");
+ return -EFAULT;
+ }
+ regs[BPF_REG_0].dynptr_id = meta.initialized_dynptr.id;
+
+ /* we don't need to set BPF_REG_0's ref obj id
+ * because packet slices are not refcounted (see
+ * dynptr_type_refcounted)
+ */
+ } else {
+ verbose(env, "kernel function %s unhandled dynamic return type\n",
+ meta.func_name);
+ return -EFAULT;
+ }
+ } else if (!__btf_type_is_struct(ptr_type)) {
+ if (!meta.r0_size) {
+ __u32 sz;
+
+ if (!IS_ERR(btf_resolve_size(desc_btf, ptr_type, &sz))) {
+ meta.r0_size = sz;
+ meta.r0_rdonly = true;
+ }
+ }
+ if (!meta.r0_size) {
+ ptr_type_name = btf_name_by_offset(desc_btf,
+ ptr_type->name_off);
+ verbose(env,
+ "kernel function %s returns pointer type %s %s is not supported\n",
+ func_name,
+ btf_type_str(ptr_type),
+ ptr_type_name);
+ return -EINVAL;
+ }
+
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+ regs[BPF_REG_0].type = PTR_TO_MEM;
+ regs[BPF_REG_0].mem_size = meta.r0_size;
+
+ if (meta.r0_rdonly)
+ regs[BPF_REG_0].type |= MEM_RDONLY;
+
+ /* Ensures we don't access the memory after a release_reference() */
+ if (meta.ref_obj_id)
+ regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id;
+ } else {
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+ regs[BPF_REG_0].btf = desc_btf;
+ regs[BPF_REG_0].type = PTR_TO_BTF_ID;
+ regs[BPF_REG_0].btf_id = ptr_type_id;
+ }
+
+ if (is_kfunc_ret_null(&meta)) {
+ regs[BPF_REG_0].type |= PTR_MAYBE_NULL;
+ /* For mark_ptr_or_null_reg, see 93c230e3f5bd6 */
+ regs[BPF_REG_0].id = ++env->id_gen;
}
- mark_reg_known_zero(env, regs, BPF_REG_0);
- regs[BPF_REG_0].btf = btf_vmlinux;
- regs[BPF_REG_0].type = PTR_TO_BTF_ID;
- regs[BPF_REG_0].btf_id = ptr_type_id;
mark_btf_func_reg_size(env, BPF_REG_0, sizeof(void *));
- } /* else { add_kfunc_call() ensures it is btf_type_is_void(t) } */
+ if (is_kfunc_acquire(&meta)) {
+ int id = acquire_reference_state(env, insn_idx);
+
+ if (id < 0)
+ return id;
+ if (is_kfunc_ret_null(&meta))
+ regs[BPF_REG_0].id = id;
+ regs[BPF_REG_0].ref_obj_id = id;
+ } else if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_first]) {
+ ref_set_non_owning(env, &regs[BPF_REG_0]);
+ }
+
+ if (reg_may_point_to_spin_lock(&regs[BPF_REG_0]) && !regs[BPF_REG_0].id)
+ regs[BPF_REG_0].id = ++env->id_gen;
+ } else if (btf_type_is_void(t)) {
+ if (meta.btf == btf_vmlinux && btf_id_set_contains(&special_kfunc_set, meta.func_id)) {
+ if (meta.func_id == special_kfunc_list[KF_bpf_obj_drop_impl]) {
+ insn_aux->kptr_struct_meta =
+ btf_find_struct_meta(meta.arg_obj_drop.btf,
+ meta.arg_obj_drop.btf_id);
+ }
+ }
+ }
- nargs = btf_type_vlen(func_proto);
- args = (const struct btf_param *)(func_proto + 1);
+ nargs = btf_type_vlen(meta.func_proto);
+ args = (const struct btf_param *)(meta.func_proto + 1);
for (i = 0; i < nargs; i++) {
u32 regno = i + 1;
- t = btf_type_skip_modifiers(btf_vmlinux, args[i].type, NULL);
+ t = btf_type_skip_modifiers(desc_btf, args[i].type, NULL);
if (btf_type_is_ptr(t))
mark_btf_func_reg_size(env, regno, sizeof(void *));
else
@@ -6319,6 +10900,12 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn)
mark_btf_func_reg_size(env, regno, t->size);
}
+ if (is_iter_next_kfunc(&meta)) {
+ err = process_iter_next_call(env, insn_idx, &meta);
+ if (err)
+ return err;
+ }
+
return 0;
}
@@ -6372,36 +10959,31 @@ static bool check_reg_sane_offset(struct bpf_verifier_env *env,
if (known && (val >= BPF_MAX_VAR_OFF || val <= -BPF_MAX_VAR_OFF)) {
verbose(env, "math between %s pointer and %lld is not allowed\n",
- reg_type_str[type], val);
+ reg_type_str(env, type), val);
return false;
}
if (reg->off >= BPF_MAX_VAR_OFF || reg->off <= -BPF_MAX_VAR_OFF) {
verbose(env, "%s pointer offset %d is not allowed\n",
- reg_type_str[type], reg->off);
+ reg_type_str(env, type), reg->off);
return false;
}
if (smin == S64_MIN) {
verbose(env, "math between %s pointer and register with unbounded min value is not allowed\n",
- reg_type_str[type]);
+ reg_type_str(env, type));
return false;
}
if (smin >= BPF_MAX_VAR_OFF || smin <= -BPF_MAX_VAR_OFF) {
verbose(env, "value %lld makes %s pointer be out of bounds\n",
- smin, reg_type_str[type]);
+ smin, reg_type_str(env, type));
return false;
}
return true;
}
-static struct bpf_insn_aux_data *cur_aux(struct bpf_verifier_env *env)
-{
- return &env->insn_aux_data[env->insn_idx];
-}
-
enum {
REASON_BOUNDS = -1,
REASON_TYPE = -2,
@@ -6559,6 +11141,12 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env,
alu_state |= off_is_imm ? BPF_ALU_IMMEDIATE : 0;
alu_state |= ptr_is_dst_reg ?
BPF_ALU_SANITIZE_SRC : BPF_ALU_SANITIZE_DST;
+
+ /* Limit pruning on unknown scalars to enable deep search for
+ * potential masking differences from other program paths.
+ */
+ if (!off_is_imm)
+ env->explore_alu_limits = true;
}
err = update_alu_sanitation_state(aux, alu_state, alu_limit);
@@ -6587,7 +11175,7 @@ do_sim:
*/
if (!ptr_is_dst_reg) {
tmp = *dst_reg;
- *dst_reg = *ptr_reg;
+ copy_register_state(dst_reg, ptr_reg);
}
ret = sanitize_speculative_path(env, NULL, env->insn_idx + 1,
env->insn_idx);
@@ -6701,7 +11289,7 @@ static int sanitize_check_bounds(struct bpf_verifier_env *env,
return -EACCES;
break;
case PTR_TO_MAP_VALUE:
- if (check_map_access(env, dst, dst_reg->off, 1, false)) {
+ if (check_map_access(env, dst, dst_reg->off, 1, false, ACCESS_HELPER)) {
verbose(env, "R%d pointer arithmetic of map value goes out of range, "
"prohibited for !root\n", dst);
return -EACCES;
@@ -6761,11 +11349,13 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
return -EACCES;
}
- switch (ptr_reg->type) {
- case PTR_TO_MAP_VALUE_OR_NULL:
+ if (ptr_reg->type & PTR_MAYBE_NULL) {
verbose(env, "R%d pointer arithmetic on %s prohibited, null-check it first\n",
- dst, reg_type_str[ptr_reg->type]);
+ dst, reg_type_str(env, ptr_reg->type));
return -EACCES;
+ }
+
+ switch (base_type(ptr_reg->type)) {
case CONST_PTR_TO_MAP:
/* smin_val represents the known value */
if (known && smin_val == 0 && opcode == BPF_ADD)
@@ -6773,14 +11363,11 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
fallthrough;
case PTR_TO_PACKET_END:
case PTR_TO_SOCKET:
- case PTR_TO_SOCKET_OR_NULL:
case PTR_TO_SOCK_COMMON:
- case PTR_TO_SOCK_COMMON_OR_NULL:
case PTR_TO_TCP_SOCK:
- case PTR_TO_TCP_SOCK_OR_NULL:
case PTR_TO_XDP_SOCK:
verbose(env, "R%d pointer arithmetic on %s prohibited\n",
- dst, reg_type_str[ptr_reg->type]);
+ dst, reg_type_str(env, ptr_reg->type));
return -EACCES;
default:
break;
@@ -6933,11 +11520,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
if (!check_reg_sane_offset(env, dst_reg, ptr_reg->type))
return -EINVAL;
-
- __update_reg_bounds(dst_reg);
- __reg_deduce_bounds(dst_reg);
- __reg_bound_offset(dst_reg);
-
+ reg_bounds_sync(dst_reg);
if (sanitize_check_bounds(env, insn, dst_reg) < 0)
return -EACCES;
if (sanitize_needed(opcode)) {
@@ -7675,10 +12258,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
/* ALU32 ops are zero extended into 64bit register */
if (alu32)
zext_32_to_64(dst_reg);
-
- __update_reg_bounds(dst_reg);
- __reg_deduce_bounds(dst_reg);
- __reg_bound_offset(dst_reg);
+ reg_bounds_sync(dst_reg);
return 0;
}
@@ -7738,6 +12318,11 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
return err;
return adjust_ptr_min_max_vals(env, insn,
dst_reg, src_reg);
+ } else if (dst_reg->precise) {
+ /* if dst_reg is precise, src_reg should be precise as well */
+ err = mark_chain_precision(env, insn->src_reg);
+ if (err)
+ return err;
}
} else {
/* Pretend the src is a reg with a known value, since we only
@@ -7753,12 +12338,12 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
/* Got here implies adding two SCALAR_VALUEs */
if (WARN_ON_ONCE(ptr_reg)) {
- print_verifier_state(env, state);
+ print_verifier_state(env, state, true);
verbose(env, "verifier internal error: unexpected ptr_reg\n");
return -EINVAL;
}
if (WARN_ON(!src_reg)) {
- print_verifier_state(env, state);
+ print_verifier_state(env, state, true);
verbose(env, "verifier internal error: no src_reg\n");
return -EINVAL;
}
@@ -7774,7 +12359,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
if (opcode == BPF_END || opcode == BPF_NEG) {
if (opcode == BPF_NEG) {
- if (BPF_SRC(insn->code) != 0 ||
+ if (BPF_SRC(insn->code) != BPF_K ||
insn->src_reg != BPF_REG_0 ||
insn->off != 0 || insn->imm != 0) {
verbose(env, "BPF_NEG uses reserved fields\n");
@@ -7843,7 +12428,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
* to propagate min/max range.
*/
src_reg->id = ++env->id_gen;
- *dst_reg = *src_reg;
+ copy_register_state(dst_reg, src_reg);
dst_reg->live |= REG_LIVE_WRITTEN;
dst_reg->subreg_def = DEF_NOT_SUBREG;
} else {
@@ -7854,12 +12439,17 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
insn->src_reg);
return -EACCES;
} else if (src_reg->type == SCALAR_VALUE) {
- *dst_reg = *src_reg;
- /* Make sure ID is cleared otherwise
+ bool is_src_reg_u32 = src_reg->umax_value <= U32_MAX;
+
+ if (is_src_reg_u32 && !src_reg->id)
+ src_reg->id = ++env->id_gen;
+ copy_register_state(dst_reg, src_reg);
+ /* Make sure ID is cleared if src_reg is not in u32 range otherwise
* dst_reg min/max could be incorrectly
* propagated into src_reg by find_equal_scalars()
*/
- dst_reg->id = 0;
+ if (!is_src_reg_u32)
+ dst_reg->id = 0;
dst_reg->live |= REG_LIVE_WRITTEN;
dst_reg->subreg_def = env->insn_idx + 1;
} else {
@@ -7867,6 +12457,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
insn->dst_reg);
}
zext_32_to_64(dst_reg);
+ reg_bounds_sync(dst_reg);
}
} else {
/* case: R = imm
@@ -7938,34 +12529,14 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
return 0;
}
-static void __find_good_pkt_pointers(struct bpf_func_state *state,
- struct bpf_reg_state *dst_reg,
- enum bpf_reg_type type, int new_range)
-{
- struct bpf_reg_state *reg;
- int i;
-
- for (i = 0; i < MAX_BPF_REG; i++) {
- reg = &state->regs[i];
- if (reg->type == type && reg->id == dst_reg->id)
- /* keep the maximum range already checked */
- reg->range = max(reg->range, new_range);
- }
-
- bpf_for_each_spilled_reg(i, state, reg) {
- if (!reg)
- continue;
- if (reg->type == type && reg->id == dst_reg->id)
- reg->range = max(reg->range, new_range);
- }
-}
-
static void find_good_pkt_pointers(struct bpf_verifier_state *vstate,
struct bpf_reg_state *dst_reg,
enum bpf_reg_type type,
bool range_right_open)
{
- int new_range, i;
+ struct bpf_func_state *state;
+ struct bpf_reg_state *reg;
+ int new_range;
if (dst_reg->off < 0 ||
(dst_reg->off == 0 && range_right_open))
@@ -7981,7 +12552,7 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate,
new_range = dst_reg->off;
if (range_right_open)
- new_range--;
+ new_range++;
/* Examples for register markings:
*
@@ -8030,9 +12601,11 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate,
* the range won't allow anything.
* dst_reg->off is known < MAX_PACKET_OFF, therefore it fits in a u16.
*/
- for (i = 0; i <= vstate->curframe; i++)
- __find_good_pkt_pointers(vstate->frame[i], dst_reg, type,
- new_range);
+ bpf_for_each_reg_in_vstate(vstate, state, reg, ({
+ if (reg->type == type && reg->id == dst_reg->id)
+ /* keep the maximum range already checked */
+ reg->range = max(reg->range, new_range);
+ }));
}
static int is_branch32_taken(struct bpf_reg_state *reg, u32 val, u8 opcode)
@@ -8044,10 +12617,14 @@ static int is_branch32_taken(struct bpf_reg_state *reg, u32 val, u8 opcode)
case BPF_JEQ:
if (tnum_is_const(subreg))
return !!tnum_equals_const(subreg, val);
+ else if (val < reg->u32_min_value || val > reg->u32_max_value)
+ return 0;
break;
case BPF_JNE:
if (tnum_is_const(subreg))
return !tnum_equals_const(subreg, val);
+ else if (val < reg->u32_min_value || val > reg->u32_max_value)
+ return 1;
break;
case BPF_JSET:
if ((~subreg.mask & subreg.value) & val)
@@ -8117,10 +12694,14 @@ static int is_branch64_taken(struct bpf_reg_state *reg, u64 val, u8 opcode)
case BPF_JEQ:
if (tnum_is_const(reg->var_off))
return !!tnum_equals_const(reg->var_off, val);
+ else if (val < reg->umin_value || val > reg->umax_value)
+ return 0;
break;
case BPF_JNE:
if (tnum_is_const(reg->var_off))
return !tnum_equals_const(reg->var_off, val);
+ else if (val < reg->umin_value || val > reg->umax_value)
+ return 1;
break;
case BPF_JSET:
if ((~reg->var_off.mask & reg->var_off.value) & val)
@@ -8304,26 +12885,33 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg,
return;
switch (opcode) {
+ /* JEQ/JNE comparison doesn't change the register equivalence.
+ *
+ * r1 = r2;
+ * if (r1 == 42) goto label;
+ * ...
+ * label: // here both r1 and r2 are known to be 42.
+ *
+ * Hence when marking register as known preserve it's ID.
+ */
case BPF_JEQ:
+ if (is_jmp32) {
+ __mark_reg32_known(true_reg, val32);
+ true_32off = tnum_subreg(true_reg->var_off);
+ } else {
+ ___mark_reg_known(true_reg, val);
+ true_64off = true_reg->var_off;
+ }
+ break;
case BPF_JNE:
- {
- struct bpf_reg_state *reg =
- opcode == BPF_JEQ ? true_reg : false_reg;
-
- /* JEQ/JNE comparison doesn't change the register equivalence.
- * r1 = r2;
- * if (r1 == 42) goto label;
- * ...
- * label: // here both r1 and r2 are known to be 42.
- *
- * Hence when marking register as known preserve it's ID.
- */
- if (is_jmp32)
- __mark_reg32_known(reg, val32);
- else
- ___mark_reg_known(reg, val);
+ if (is_jmp32) {
+ __mark_reg32_known(false_reg, val32);
+ false_32off = tnum_subreg(false_reg->var_off);
+ } else {
+ ___mark_reg_known(false_reg, val);
+ false_64off = false_reg->var_off;
+ }
break;
- }
case BPF_JSET:
if (is_jmp32) {
false_32off = tnum_and(false_32off, tnum_const(~val32));
@@ -8462,21 +13050,8 @@ static void __reg_combine_min_max(struct bpf_reg_state *src_reg,
dst_reg->smax_value);
src_reg->var_off = dst_reg->var_off = tnum_intersect(src_reg->var_off,
dst_reg->var_off);
- /* We might have learned new bounds from the var_off. */
- __update_reg_bounds(src_reg);
- __update_reg_bounds(dst_reg);
- /* We might have learned something about the sign bit. */
- __reg_deduce_bounds(src_reg);
- __reg_deduce_bounds(dst_reg);
- /* We might have learned some bits from the bounds. */
- __reg_bound_offset(src_reg);
- __reg_bound_offset(dst_reg);
- /* Intersecting with the old var_off might have improved our bounds
- * slightly. e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc),
- * then new var_off is (0; 0x7f...fc) which improves our umax.
- */
- __update_reg_bounds(src_reg);
- __update_reg_bounds(dst_reg);
+ reg_bounds_sync(src_reg);
+ reg_bounds_sync(dst_reg);
}
static void reg_combine_min_max(struct bpf_reg_state *true_src,
@@ -8499,18 +13074,23 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state,
struct bpf_reg_state *reg, u32 id,
bool is_null)
{
- if (reg_type_may_be_null(reg->type) && reg->id == id &&
- !WARN_ON_ONCE(!reg->id)) {
- /* Old offset (both fixed and variable parts) should
- * have been known-zero, because we don't allow pointer
- * arithmetic on pointers that might be NULL.
+ if (type_may_be_null(reg->type) && reg->id == id &&
+ (is_rcu_reg(reg) || !WARN_ON_ONCE(!reg->id))) {
+ /* Old offset (both fixed and variable parts) should have been
+ * known-zero, because we don't allow pointer arithmetic on
+ * pointers that might be NULL. If we see this happening, don't
+ * convert the register.
+ *
+ * But in some cases, some helpers that return local kptrs
+ * advance offset for the returned pointer. In those cases, it
+ * is fine to expect to see reg->off.
*/
- if (WARN_ON_ONCE(reg->smin_value || reg->smax_value ||
- !tnum_equals_const(reg->var_off, 0) ||
- reg->off)) {
- __mark_reg_known_zero(reg);
- reg->off = 0;
- }
+ if (WARN_ON_ONCE(reg->smin_value || reg->smax_value || !tnum_equals_const(reg->var_off, 0)))
+ return;
+ if (!(type_is_ptr_alloc_obj(reg->type) || type_is_non_owning_ref(reg->type)) &&
+ WARN_ON_ONCE(reg->off))
+ return;
+
if (is_null) {
reg->type = SCALAR_VALUE;
/* We don't need id and ref_obj_id from this point
@@ -8527,7 +13107,7 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state,
if (!reg_may_point_to_spin_lock(reg)) {
/* For not-NULL ptr, reg->ref_obj_id will be reset
- * in release_reg_references().
+ * in release_reference().
*
* reg->id is still used by spin_lock ptr. Other
* than spin_lock ptr type, reg->id can be reset.
@@ -8537,22 +13117,6 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state,
}
}
-static void __mark_ptr_or_null_regs(struct bpf_func_state *state, u32 id,
- bool is_null)
-{
- struct bpf_reg_state *reg;
- int i;
-
- for (i = 0; i < MAX_BPF_REG; i++)
- mark_ptr_or_null_reg(state, &state->regs[i], id, is_null);
-
- bpf_for_each_spilled_reg(i, state, reg) {
- if (!reg)
- continue;
- mark_ptr_or_null_reg(state, reg, id, is_null);
- }
-}
-
/* The logic is similar to find_good_pkt_pointers(), both could eventually
* be folded together at some point.
*/
@@ -8560,10 +13124,9 @@ static void mark_ptr_or_null_regs(struct bpf_verifier_state *vstate, u32 regno,
bool is_null)
{
struct bpf_func_state *state = vstate->frame[vstate->curframe];
- struct bpf_reg_state *regs = state->regs;
+ struct bpf_reg_state *regs = state->regs, *reg;
u32 ref_obj_id = regs[regno].ref_obj_id;
u32 id = regs[regno].id;
- int i;
if (ref_obj_id && ref_obj_id == id && is_null)
/* regs[regno] is in the " == NULL" branch.
@@ -8572,8 +13135,9 @@ static void mark_ptr_or_null_regs(struct bpf_verifier_state *vstate, u32 regno,
*/
WARN_ON_ONCE(release_reference_state(state, id));
- for (i = 0; i <= vstate->curframe; i++)
- __mark_ptr_or_null_regs(vstate->frame[i], id, is_null);
+ bpf_for_each_reg_in_vstate(vstate, state, reg, ({
+ mark_ptr_or_null_reg(state, reg, id, is_null);
+ }));
}
static bool try_match_pkt_pointers(const struct bpf_insn *insn,
@@ -8686,23 +13250,11 @@ static void find_equal_scalars(struct bpf_verifier_state *vstate,
{
struct bpf_func_state *state;
struct bpf_reg_state *reg;
- int i, j;
- for (i = 0; i <= vstate->curframe; i++) {
- state = vstate->frame[i];
- for (j = 0; j < MAX_BPF_REG; j++) {
- reg = &state->regs[j];
- if (reg->type == SCALAR_VALUE && reg->id == known_reg->id)
- *reg = *known_reg;
- }
-
- bpf_for_each_spilled_reg(j, state, reg) {
- if (!reg)
- continue;
- if (reg->type == SCALAR_VALUE && reg->id == known_reg->id)
- *reg = *known_reg;
- }
- }
+ bpf_for_each_reg_in_vstate(vstate, state, reg, ({
+ if (reg->type == SCALAR_VALUE && reg->id == known_reg->id)
+ copy_register_state(reg, known_reg);
+ }));
}
static int check_cond_jmp_op(struct bpf_verifier_env *env,
@@ -8712,6 +13264,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
struct bpf_verifier_state *other_branch;
struct bpf_reg_state *regs = this_branch->frame[this_branch->curframe]->regs;
struct bpf_reg_state *dst_reg, *other_branch_regs, *src_reg = NULL;
+ struct bpf_reg_state *eq_branch_regs;
u8 opcode = BPF_OP(insn->code);
bool is_jmp32;
int pred = -1;
@@ -8769,6 +13322,18 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
src_reg->var_off.value,
opcode,
is_jmp32);
+ } else if (dst_reg->type == SCALAR_VALUE &&
+ is_jmp32 && tnum_is_const(tnum_subreg(dst_reg->var_off))) {
+ pred = is_branch_taken(src_reg,
+ tnum_subreg(dst_reg->var_off).value,
+ flip_opcode(opcode),
+ is_jmp32);
+ } else if (dst_reg->type == SCALAR_VALUE &&
+ !is_jmp32 && tnum_is_const(dst_reg->var_off)) {
+ pred = is_branch_taken(src_reg,
+ dst_reg->var_off.value,
+ flip_opcode(opcode),
+ is_jmp32);
} else if (reg_is_pkt_pointer_any(dst_reg) &&
reg_is_pkt_pointer_any(src_reg) &&
!is_jmp32) {
@@ -8821,8 +13386,8 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
/* detect if we are comparing against a constant value so we can adjust
* our min/max values for our dst register.
* this is only legit if both are scalars (or pointers to the same
- * object, I suppose, but we don't support that right now), because
- * otherwise the different base pointers mean the offsets aren't
+ * object, I suppose, see the PTR_MAYBE_NULL related if block below),
+ * because otherwise the different base pointers mean the offsets aren't
* comparable.
*/
if (BPF_SRC(insn->code) == BPF_X) {
@@ -8871,13 +13436,50 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
find_equal_scalars(other_branch, &other_branch_regs[insn->dst_reg]);
}
+ /* if one pointer register is compared to another pointer
+ * register check if PTR_MAYBE_NULL could be lifted.
+ * E.g. register A - maybe null
+ * register B - not null
+ * for JNE A, B, ... - A is not null in the false branch;
+ * for JEQ A, B, ... - A is not null in the true branch.
+ *
+ * Since PTR_TO_BTF_ID points to a kernel struct that does
+ * not need to be null checked by the BPF program, i.e.,
+ * could be null even without PTR_MAYBE_NULL marking, so
+ * only propagate nullness when neither reg is that type.
+ */
+ if (!is_jmp32 && BPF_SRC(insn->code) == BPF_X &&
+ __is_pointer_value(false, src_reg) && __is_pointer_value(false, dst_reg) &&
+ type_may_be_null(src_reg->type) != type_may_be_null(dst_reg->type) &&
+ base_type(src_reg->type) != PTR_TO_BTF_ID &&
+ base_type(dst_reg->type) != PTR_TO_BTF_ID) {
+ eq_branch_regs = NULL;
+ switch (opcode) {
+ case BPF_JEQ:
+ eq_branch_regs = other_branch_regs;
+ break;
+ case BPF_JNE:
+ eq_branch_regs = regs;
+ break;
+ default:
+ /* do nothing */
+ break;
+ }
+ if (eq_branch_regs) {
+ if (type_may_be_null(src_reg->type))
+ mark_ptr_not_null_reg(&eq_branch_regs[insn->src_reg]);
+ else
+ mark_ptr_not_null_reg(&eq_branch_regs[insn->dst_reg]);
+ }
+ }
+
/* detect if R == 0 where R is returned from bpf_map_lookup_elem().
* NOTE: these optimizations below are related with pointer comparison
* which will never be JMP32.
*/
if (!is_jmp32 && BPF_SRC(insn->code) == BPF_K &&
insn->imm == 0 && (opcode == BPF_JEQ || opcode == BPF_JNE) &&
- reg_type_may_be_null(dst_reg->type)) {
+ type_may_be_null(dst_reg->type)) {
/* Mark all identical registers in each branch as either
* safe or unknown depending R == 0 or R != 0 conditional.
*/
@@ -8893,7 +13495,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
return -EACCES;
}
if (env->log.level & BPF_LOG_LEVEL)
- print_verifier_state(env, this_branch->frame[this_branch->curframe]);
+ print_insn_state(env, this_branch->frame[this_branch->curframe]);
return 0;
}
@@ -8928,16 +13530,19 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
return 0;
}
- if (insn->src_reg == BPF_PSEUDO_BTF_ID) {
- mark_reg_known_zero(env, regs, insn->dst_reg);
+ /* All special src_reg cases are listed below. From this point onwards
+ * we either succeed and assign a corresponding dst_reg->type after
+ * zeroing the offset, or fail and reject the program.
+ */
+ mark_reg_known_zero(env, regs, insn->dst_reg);
+ if (insn->src_reg == BPF_PSEUDO_BTF_ID) {
dst_reg->type = aux->btf_var.reg_type;
- switch (dst_reg->type) {
+ switch (base_type(dst_reg->type)) {
case PTR_TO_MEM:
dst_reg->mem_size = aux->btf_var.mem_size;
break;
case PTR_TO_BTF_ID:
- case PTR_TO_PERCPU_BTF_ID:
dst_reg->btf = aux->btf_var.btf;
dst_reg->btf_id = aux->btf_var.btf_id;
break;
@@ -8950,7 +13555,8 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
if (insn->src_reg == BPF_PSEUDO_FUNC) {
struct bpf_prog_aux *aux = env->prog->aux;
- u32 subprogno = insn[1].imm;
+ u32 subprogno = find_subprog(env,
+ env->insn_idx + insn->imm + 1);
if (!aux->func_info) {
verbose(env, "missing btf func_info\n");
@@ -8967,15 +13573,14 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
}
map = env->used_maps[aux->map_index];
- mark_reg_known_zero(env, regs, insn->dst_reg);
dst_reg->map_ptr = map;
if (insn->src_reg == BPF_PSEUDO_MAP_VALUE ||
insn->src_reg == BPF_PSEUDO_MAP_IDX_VALUE) {
dst_reg->type = PTR_TO_MAP_VALUE;
dst_reg->off = aux->map_off;
- if (map_value_has_spin_lock(map))
- dst_reg->id = ++env->id_gen;
+ WARN_ON_ONCE(map->max_entries != 1);
+ /* We want reg->id to be same (0) as map_value is not distinct */
} else if (insn->src_reg == BPF_PSEUDO_MAP_FD ||
insn->src_reg == BPF_PSEUDO_MAP_IDX) {
dst_reg->type = CONST_PTR_TO_MAP;
@@ -9053,11 +13658,16 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
return err;
}
- if (env->cur_state->active_spin_lock) {
+ if (env->cur_state->active_lock.ptr) {
verbose(env, "BPF_LD_[ABS|IND] cannot be used inside bpf_spin_lock-ed region\n");
return -EINVAL;
}
+ if (env->cur_state->active_rcu_lock) {
+ verbose(env, "BPF_LD_[ABS|IND] cannot be used inside bpf_rcu_read_lock-ed region\n");
+ return -EINVAL;
+ }
+
if (regs[ctx_reg].type != PTR_TO_CTX) {
verbose(env,
"at the time of BPF_LD_ABS|IND R6 != pointer to skb\n");
@@ -9071,7 +13681,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
return err;
}
- err = check_ctx_reg(env, &regs[ctx_reg], ctx_reg);
+ err = check_ptr_off_reg(env, &regs[ctx_reg], ctx_reg);
if (err < 0)
return err;
@@ -9099,14 +13709,25 @@ static int check_return_code(struct bpf_verifier_env *env)
struct tnum range = tnum_range(0, 1);
enum bpf_prog_type prog_type = resolve_prog_type(env->prog);
int err;
- const bool is_subprog = env->cur_state->frame[0]->subprogno;
+ struct bpf_func_state *frame = env->cur_state->frame[0];
+ const bool is_subprog = frame->subprogno;
/* LSM and struct_ops func-ptr's return type could be "void" */
- if (!is_subprog &&
- (prog_type == BPF_PROG_TYPE_STRUCT_OPS ||
- prog_type == BPF_PROG_TYPE_LSM) &&
- !prog->aux->attach_func_proto->type)
- return 0;
+ if (!is_subprog) {
+ switch (prog_type) {
+ case BPF_PROG_TYPE_LSM:
+ if (prog->expected_attach_type == BPF_LSM_CGROUP)
+ /* See below, can be 0 or 0-1 depending on hook. */
+ break;
+ fallthrough;
+ case BPF_PROG_TYPE_STRUCT_OPS:
+ if (!prog->aux->attach_func_proto->type)
+ return 0;
+ break;
+ default:
+ break;
+ }
+ }
/* eBPF calling convention is such that R0 is used
* to return the value from eBPF program.
@@ -9124,10 +13745,26 @@ static int check_return_code(struct bpf_verifier_env *env)
}
reg = cur_regs(env) + BPF_REG_0;
+
+ if (frame->in_async_callback_fn) {
+ /* enforce return zero from async callbacks like timer */
+ if (reg->type != SCALAR_VALUE) {
+ verbose(env, "In async callback the register R0 is not a known value (%s)\n",
+ reg_type_str(env, reg->type));
+ return -EINVAL;
+ }
+
+ if (!tnum_in(tnum_const(0), reg->var_off)) {
+ verbose_invalid_scalar(env, reg, &range, "async callback", "R0");
+ return -EINVAL;
+ }
+ return 0;
+ }
+
if (is_subprog) {
if (reg->type != SCALAR_VALUE) {
verbose(env, "At subprogram exit the register R0 is not a scalar value (%s)\n",
- reg_type_str[reg->type]);
+ reg_type_str(env, reg->type));
return -EINVAL;
}
return 0;
@@ -9181,6 +13818,25 @@ static int check_return_code(struct bpf_verifier_env *env)
case BPF_PROG_TYPE_SK_LOOKUP:
range = tnum_range(SK_DROP, SK_PASS);
break;
+
+ case BPF_PROG_TYPE_LSM:
+ if (env->prog->expected_attach_type != BPF_LSM_CGROUP) {
+ /* Regular BPF_PROG_TYPE_LSM programs can return
+ * any value.
+ */
+ return 0;
+ }
+ if (!env->prog->aux->attach_func_proto->type) {
+ /* Make sure programs that attach to void
+ * hooks don't try to modify return value.
+ */
+ range = tnum_range(1, 1);
+ }
+ break;
+
+ case BPF_PROG_TYPE_NETFILTER:
+ range = tnum_range(NF_DROP, NF_ACCEPT);
+ break;
case BPF_PROG_TYPE_EXT:
/* freplace program can return anything as its return value
* depends on the to-be-replaced kernel func or bpf program.
@@ -9191,12 +13847,16 @@ static int check_return_code(struct bpf_verifier_env *env)
if (reg->type != SCALAR_VALUE) {
verbose(env, "At program exit the register R0 is not a known value (%s)\n",
- reg_type_str[reg->type]);
+ reg_type_str(env, reg->type));
return -EINVAL;
}
if (!tnum_in(range, reg->var_off)) {
verbose_invalid_scalar(env, reg, &range, "program exit", "R0");
+ if (prog->expected_attach_type == BPF_LSM_CGROUP &&
+ prog_type == BPF_PROG_TYPE_LSM &&
+ !prog->aux->attach_func_proto->type)
+ verbose(env, "Note, BPF_LSM_CGROUP that attach to void LSM hooks can't modify return value!\n");
return -EINVAL;
}
@@ -9212,7 +13872,7 @@ static int check_return_code(struct bpf_verifier_env *env)
* 3 let S be a stack
* 4 S.push(v)
* 5 while S is not empty
- * 6 t <- S.pop()
+ * 6 t <- S.peek()
* 7 if t is what we're looking for:
* 8 return t
* 9 for all edges e in G.adjacentEdges(t) do
@@ -9261,11 +13921,27 @@ static struct bpf_verifier_state_list **explored_state(
return &env->explored_states[(idx ^ state->callsite) % state_htab_size(env)];
}
-static void init_explored_state(struct bpf_verifier_env *env, int idx)
+static void mark_prune_point(struct bpf_verifier_env *env, int idx)
{
env->insn_aux_data[idx].prune_point = true;
}
+static bool is_prune_point(struct bpf_verifier_env *env, int insn_idx)
+{
+ return env->insn_aux_data[insn_idx].prune_point;
+}
+
+static void mark_force_checkpoint(struct bpf_verifier_env *env, int idx)
+{
+ env->insn_aux_data[idx].force_checkpoint = true;
+}
+
+static bool is_force_checkpoint(struct bpf_verifier_env *env, int insn_idx)
+{
+ return env->insn_aux_data[insn_idx].force_checkpoint;
+}
+
+
enum {
DONE_EXPLORING = 0,
KEEP_EXPLORING = 1,
@@ -9294,9 +13970,11 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env,
return -EINVAL;
}
- if (e == BRANCH)
+ if (e == BRANCH) {
/* mark branch target for state pruning */
- init_explored_state(env, w);
+ mark_prune_point(env, w);
+ mark_jmp_point(env, w);
+ }
if (insn_state[w] == 0) {
/* tree-edge */
@@ -9323,8 +14001,7 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env,
return DONE_EXPLORING;
}
-static int visit_func_call_insn(int t, int insn_cnt,
- struct bpf_insn *insns,
+static int visit_func_call_insn(int t, struct bpf_insn *insns,
struct bpf_verifier_env *env,
bool visit_callee)
{
@@ -9334,12 +14011,18 @@ static int visit_func_call_insn(int t, int insn_cnt,
if (ret)
return ret;
- if (t + 1 < insn_cnt)
- init_explored_state(env, t + 1);
+ mark_prune_point(env, t + 1);
+ /* when we exit from subprog, we need to record non-linear history */
+ mark_jmp_point(env, t + 1);
+
if (visit_callee) {
- init_explored_state(env, t);
- ret = push_insn(t, t + insns[t].imm + 1, BRANCH,
- env, false);
+ mark_prune_point(env, t);
+ ret = push_insn(t, t + insns[t].imm + 1, BRANCH, env,
+ /* It's ok to allow recursion from CFG point of
+ * view. __check_func_call() will do the actual
+ * check.
+ */
+ bpf_pseudo_func(insns + t));
}
return ret;
}
@@ -9349,58 +14032,77 @@ static int visit_func_call_insn(int t, int insn_cnt,
* DONE_EXPLORING - the instruction was fully explored
* KEEP_EXPLORING - there is still work to be done before it is fully explored
*/
-static int visit_insn(int t, int insn_cnt, struct bpf_verifier_env *env)
+static int visit_insn(int t, struct bpf_verifier_env *env)
{
- struct bpf_insn *insns = env->prog->insnsi;
+ struct bpf_insn *insns = env->prog->insnsi, *insn = &insns[t];
int ret;
- if (bpf_pseudo_func(insns + t))
- return visit_func_call_insn(t, insn_cnt, insns, env, true);
+ if (bpf_pseudo_func(insn))
+ return visit_func_call_insn(t, insns, env, true);
/* All non-branch instructions have a single fall-through edge. */
- if (BPF_CLASS(insns[t].code) != BPF_JMP &&
- BPF_CLASS(insns[t].code) != BPF_JMP32)
+ if (BPF_CLASS(insn->code) != BPF_JMP &&
+ BPF_CLASS(insn->code) != BPF_JMP32)
return push_insn(t, t + 1, FALLTHROUGH, env, false);
- switch (BPF_OP(insns[t].code)) {
+ switch (BPF_OP(insn->code)) {
case BPF_EXIT:
return DONE_EXPLORING;
case BPF_CALL:
- return visit_func_call_insn(t, insn_cnt, insns, env,
- insns[t].src_reg == BPF_PSEUDO_CALL);
+ if (insn->src_reg == 0 && insn->imm == BPF_FUNC_timer_set_callback)
+ /* Mark this call insn as a prune point to trigger
+ * is_state_visited() check before call itself is
+ * processed by __check_func_call(). Otherwise new
+ * async state will be pushed for further exploration.
+ */
+ mark_prune_point(env, t);
+ if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
+ struct bpf_kfunc_call_arg_meta meta;
+
+ ret = fetch_kfunc_meta(env, insn, &meta, NULL);
+ if (ret == 0 && is_iter_next_kfunc(&meta)) {
+ mark_prune_point(env, t);
+ /* Checking and saving state checkpoints at iter_next() call
+ * is crucial for fast convergence of open-coded iterator loop
+ * logic, so we need to force it. If we don't do that,
+ * is_state_visited() might skip saving a checkpoint, causing
+ * unnecessarily long sequence of not checkpointed
+ * instructions and jumps, leading to exhaustion of jump
+ * history buffer, and potentially other undesired outcomes.
+ * It is expected that with correct open-coded iterators
+ * convergence will happen quickly, so we don't run a risk of
+ * exhausting memory.
+ */
+ mark_force_checkpoint(env, t);
+ }
+ }
+ return visit_func_call_insn(t, insns, env, insn->src_reg == BPF_PSEUDO_CALL);
case BPF_JA:
- if (BPF_SRC(insns[t].code) != BPF_K)
+ if (BPF_SRC(insn->code) != BPF_K)
return -EINVAL;
/* unconditional jump with single edge */
- ret = push_insn(t, t + insns[t].off + 1, FALLTHROUGH, env,
+ ret = push_insn(t, t + insn->off + 1, FALLTHROUGH, env,
true);
if (ret)
return ret;
- /* unconditional jmp is not a good pruning point,
- * but it's marked, since backtracking needs
- * to record jmp history in is_state_visited().
- */
- init_explored_state(env, t + insns[t].off + 1);
- /* tell verifier to check for equivalent states
- * after every call and jump
- */
- if (t + 1 < insn_cnt)
- init_explored_state(env, t + 1);
+ mark_prune_point(env, t + insn->off + 1);
+ mark_jmp_point(env, t + insn->off + 1);
return ret;
default:
/* conditional jump with two edges */
- init_explored_state(env, t);
+ mark_prune_point(env, t);
+
ret = push_insn(t, t + 1, FALLTHROUGH, env, true);
if (ret)
return ret;
- return push_insn(t, t + insns[t].off + 1, BRANCH, env, true);
+ return push_insn(t, t + insn->off + 1, BRANCH, env, true);
}
}
@@ -9431,7 +14133,7 @@ static int check_cfg(struct bpf_verifier_env *env)
while (env->cfg.cur_stack > 0) {
int t = insn_stack[env->cfg.cur_stack - 1];
- ret = visit_insn(t, insn_cnt, env);
+ ret = visit_insn(t, env);
switch (ret) {
case DONE_EXPLORING:
insn_state[t] = EXPLORED;
@@ -9597,7 +14299,7 @@ static int check_btf_func(struct bpf_verifier_env *env,
goto err_free;
ret_type = btf_type_skip_modifiers(btf, func_proto->type, NULL);
scalar_return =
- btf_type_is_small_int(ret_type) || btf_type_is_enum(ret_type);
+ btf_type_is_small_int(ret_type) || btf_is_any_enum(ret_type);
if (i && !scalar_return && env->subprog_info[i].has_ld_abs) {
verbose(env, "LD_ABS is only allowed in functions that return 'int'.\n");
goto err_free;
@@ -9634,8 +14336,7 @@ static void adjust_btf_func(struct bpf_verifier_env *env)
aux->func_info[i].insn_off = env->subprog_info[i].start;
}
-#define MIN_BPF_LINEINFO_SIZE (offsetof(struct bpf_line_info, line_col) + \
- sizeof(((struct bpf_line_info *)(0))->line_col))
+#define MIN_BPF_LINEINFO_SIZE offsetofend(struct bpf_line_info, line_col)
#define MAX_LINEINFO_REC_SIZE MAX_FUNCINFO_REC_SIZE
static int check_btf_line(struct bpf_verifier_env *env,
@@ -9653,6 +14354,8 @@ static int check_btf_line(struct bpf_verifier_env *env,
nr_linfo = attr->line_info_cnt;
if (!nr_linfo)
return 0;
+ if (nr_linfo > INT_MAX / sizeof(struct bpf_line_info))
+ return -EINVAL;
rec_size = attr->line_info_rec_size;
if (rec_size < MIN_BPF_LINEINFO_SIZE ||
@@ -9761,6 +14464,78 @@ err_free:
return err;
}
+#define MIN_CORE_RELO_SIZE sizeof(struct bpf_core_relo)
+#define MAX_CORE_RELO_SIZE MAX_FUNCINFO_REC_SIZE
+
+static int check_core_relo(struct bpf_verifier_env *env,
+ const union bpf_attr *attr,
+ bpfptr_t uattr)
+{
+ u32 i, nr_core_relo, ncopy, expected_size, rec_size;
+ struct bpf_core_relo core_relo = {};
+ struct bpf_prog *prog = env->prog;
+ const struct btf *btf = prog->aux->btf;
+ struct bpf_core_ctx ctx = {
+ .log = &env->log,
+ .btf = btf,
+ };
+ bpfptr_t u_core_relo;
+ int err;
+
+ nr_core_relo = attr->core_relo_cnt;
+ if (!nr_core_relo)
+ return 0;
+ if (nr_core_relo > INT_MAX / sizeof(struct bpf_core_relo))
+ return -EINVAL;
+
+ rec_size = attr->core_relo_rec_size;
+ if (rec_size < MIN_CORE_RELO_SIZE ||
+ rec_size > MAX_CORE_RELO_SIZE ||
+ rec_size % sizeof(u32))
+ return -EINVAL;
+
+ u_core_relo = make_bpfptr(attr->core_relos, uattr.is_kernel);
+ expected_size = sizeof(struct bpf_core_relo);
+ ncopy = min_t(u32, expected_size, rec_size);
+
+ /* Unlike func_info and line_info, copy and apply each CO-RE
+ * relocation record one at a time.
+ */
+ for (i = 0; i < nr_core_relo; i++) {
+ /* future proofing when sizeof(bpf_core_relo) changes */
+ err = bpf_check_uarg_tail_zero(u_core_relo, expected_size, rec_size);
+ if (err) {
+ if (err == -E2BIG) {
+ verbose(env, "nonzero tailing record in core_relo");
+ if (copy_to_bpfptr_offset(uattr,
+ offsetof(union bpf_attr, core_relo_rec_size),
+ &expected_size, sizeof(expected_size)))
+ err = -EFAULT;
+ }
+ break;
+ }
+
+ if (copy_from_bpfptr(&core_relo, u_core_relo, ncopy)) {
+ err = -EFAULT;
+ break;
+ }
+
+ if (core_relo.insn_off % 8 || core_relo.insn_off / 8 >= prog->len) {
+ verbose(env, "Invalid core_relo[%u].insn_off:%u prog->len:%u\n",
+ i, core_relo.insn_off, prog->len);
+ err = -EINVAL;
+ break;
+ }
+
+ err = bpf_core_apply(&ctx, &core_relo, i,
+ &prog->insnsi[core_relo.insn_off / 8]);
+ if (err)
+ break;
+ bpfptr_add(&u_core_relo, rec_size);
+ }
+ return err;
+}
+
static int check_btf_info(struct bpf_verifier_env *env,
const union bpf_attr *attr,
bpfptr_t uattr)
@@ -9791,6 +14566,10 @@ static int check_btf_info(struct bpf_verifier_env *env,
if (err)
return err;
+ err = check_core_relo(env, attr, uattr);
+ if (err)
+ return err;
+
return 0;
}
@@ -9822,6 +14601,13 @@ static bool check_ids(u32 old_id, u32 cur_id, struct bpf_id_pair *idmap)
{
unsigned int i;
+ /* either both IDs should be set or both should be zero */
+ if (!!old_id != !!cur_id)
+ return false;
+
+ if (old_id == 0) /* cur_id == 0 as well */
+ return true;
+
for (i = 0; i < BPF_ID_MAP_SIZE; i++) {
if (!idmap[i].old) {
/* Reached an empty slot; haven't seen this id before */
@@ -9933,81 +14719,78 @@ next:
}
}
-/* Returns true if (rold safe implies rcur safe) */
-static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
- struct bpf_id_pair *idmap)
+static bool regs_exact(const struct bpf_reg_state *rold,
+ const struct bpf_reg_state *rcur,
+ struct bpf_id_pair *idmap)
{
- bool equal;
+ return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 &&
+ check_ids(rold->id, rcur->id, idmap) &&
+ check_ids(rold->ref_obj_id, rcur->ref_obj_id, idmap);
+}
+/* Returns true if (rold safe implies rcur safe) */
+static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
+ struct bpf_reg_state *rcur, struct bpf_id_pair *idmap)
+{
if (!(rold->live & REG_LIVE_READ))
/* explored state didn't use this */
return true;
-
- equal = memcmp(rold, rcur, offsetof(struct bpf_reg_state, parent)) == 0;
-
- if (rold->type == PTR_TO_STACK)
- /* two stack pointers are equal only if they're pointing to
- * the same stack frame, since fp-8 in foo != fp-8 in bar
- */
- return equal && rold->frameno == rcur->frameno;
-
- if (equal)
- return true;
-
if (rold->type == NOT_INIT)
/* explored state can't have used this */
return true;
if (rcur->type == NOT_INIT)
return false;
- switch (rold->type) {
+
+ /* Enforce that register types have to match exactly, including their
+ * modifiers (like PTR_MAYBE_NULL, MEM_RDONLY, etc), as a general
+ * rule.
+ *
+ * One can make a point that using a pointer register as unbounded
+ * SCALAR would be technically acceptable, but this could lead to
+ * pointer leaks because scalars are allowed to leak while pointers
+ * are not. We could make this safe in special cases if root is
+ * calling us, but it's probably not worth the hassle.
+ *
+ * Also, register types that are *not* MAYBE_NULL could technically be
+ * safe to use as their MAYBE_NULL variants (e.g., PTR_TO_MAP_VALUE
+ * is safe to be used as PTR_TO_MAP_VALUE_OR_NULL, provided both point
+ * to the same map).
+ * However, if the old MAYBE_NULL register then got NULL checked,
+ * doing so could have affected others with the same id, and we can't
+ * check for that because we lost the id when we converted to
+ * a non-MAYBE_NULL variant.
+ * So, as a general rule we don't allow mixing MAYBE_NULL and
+ * non-MAYBE_NULL registers as well.
+ */
+ if (rold->type != rcur->type)
+ return false;
+
+ switch (base_type(rold->type)) {
case SCALAR_VALUE:
- if (rcur->type == SCALAR_VALUE) {
- if (!rold->precise && !rcur->precise)
- return true;
- /* new val must satisfy old val knowledge */
- return range_within(rold, rcur) &&
- tnum_in(rold->var_off, rcur->var_off);
- } else {
- /* We're trying to use a pointer in place of a scalar.
- * Even if the scalar was unbounded, this could lead to
- * pointer leaks because scalars are allowed to leak
- * while pointers are not. We could make this safe in
- * special cases if root is calling us, but it's
- * probably not worth the hassle.
- */
+ if (regs_exact(rold, rcur, idmap))
+ return true;
+ if (env->explore_alu_limits)
return false;
- }
+ if (!rold->precise)
+ return true;
+ /* new val must satisfy old val knowledge */
+ return range_within(rold, rcur) &&
+ tnum_in(rold->var_off, rcur->var_off);
case PTR_TO_MAP_KEY:
case PTR_TO_MAP_VALUE:
+ case PTR_TO_MEM:
+ case PTR_TO_BUF:
+ case PTR_TO_TP_BUFFER:
/* If the new min/max/var_off satisfy the old ones and
* everything else matches, we are OK.
- * 'id' is not compared, since it's only used for maps with
- * bpf_spin_lock inside map element and in such cases if
- * the rest of the prog is valid for one map element then
- * it's valid for all map elements regardless of the key
- * used in bpf_map_lookup()
*/
- return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 &&
+ return memcmp(rold, rcur, offsetof(struct bpf_reg_state, var_off)) == 0 &&
range_within(rold, rcur) &&
- tnum_in(rold->var_off, rcur->var_off);
- case PTR_TO_MAP_VALUE_OR_NULL:
- /* a PTR_TO_MAP_VALUE could be safe to use as a
- * PTR_TO_MAP_VALUE_OR_NULL into the same map.
- * However, if the old PTR_TO_MAP_VALUE_OR_NULL then got NULL-
- * checked, doing so could have affected others with the same
- * id, and we can't check for that because we lost the id when
- * we converted to a PTR_TO_MAP_VALUE.
- */
- if (rcur->type != PTR_TO_MAP_VALUE_OR_NULL)
- return false;
- if (memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)))
- return false;
- /* Check our ids match any regs they're supposed to */
- return check_ids(rold->id, rcur->id, idmap);
+ tnum_in(rold->var_off, rcur->var_off) &&
+ check_ids(rold->id, rcur->id, idmap) &&
+ check_ids(rold->ref_obj_id, rcur->ref_obj_id, idmap);
case PTR_TO_PACKET_META:
case PTR_TO_PACKET:
- if (rcur->type != rold->type)
- return false;
/* We must have at least as much range as the old ptr
* did, so that any accesses which were safe before are
* still safe. This is true even if old range < old off,
@@ -10022,38 +14805,23 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
if (rold->off != rcur->off)
return false;
/* id relations must be preserved */
- if (rold->id && !check_ids(rold->id, rcur->id, idmap))
+ if (!check_ids(rold->id, rcur->id, idmap))
return false;
/* new val must satisfy old val knowledge */
return range_within(rold, rcur) &&
tnum_in(rold->var_off, rcur->var_off);
- case PTR_TO_CTX:
- case CONST_PTR_TO_MAP:
- case PTR_TO_PACKET_END:
- case PTR_TO_FLOW_KEYS:
- case PTR_TO_SOCKET:
- case PTR_TO_SOCKET_OR_NULL:
- case PTR_TO_SOCK_COMMON:
- case PTR_TO_SOCK_COMMON_OR_NULL:
- case PTR_TO_TCP_SOCK:
- case PTR_TO_TCP_SOCK_OR_NULL:
- case PTR_TO_XDP_SOCK:
- /* Only valid matches are exact, which memcmp() above
- * would have accepted
+ case PTR_TO_STACK:
+ /* two stack pointers are equal only if they're pointing to
+ * the same stack frame, since fp-8 in foo != fp-8 in bar
*/
+ return regs_exact(rold, rcur, idmap) && rold->frameno == rcur->frameno;
default:
- /* Don't know what's going on, just say it's not safe */
- return false;
+ return regs_exact(rold, rcur, idmap);
}
-
- /* Shouldn't get here; if we do, say it's not safe */
- WARN_ON_ONCE(1);
- return false;
}
-static bool stacksafe(struct bpf_func_state *old,
- struct bpf_func_state *cur,
- struct bpf_id_pair *idmap)
+static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
+ struct bpf_func_state *cur, struct bpf_id_pair *idmap)
{
int i, spi;
@@ -10062,6 +14830,8 @@ static bool stacksafe(struct bpf_func_state *old,
* didn't use them
*/
for (i = 0; i < old->allocated_stack; i++) {
+ struct bpf_reg_state *old_reg, *cur_reg;
+
spi = i / BPF_REG_SIZE;
if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ)) {
@@ -10073,6 +14843,10 @@ static bool stacksafe(struct bpf_func_state *old,
if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_INVALID)
continue;
+ if (env->allow_uninit_stack &&
+ old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_MISC)
+ continue;
+
/* explored stack has more populated slots than current stack
* and these slots were used
*/
@@ -10094,13 +14868,11 @@ static bool stacksafe(struct bpf_func_state *old,
* return false to continue verification of this path
*/
return false;
- if (i % BPF_REG_SIZE)
+ if (i % BPF_REG_SIZE != BPF_REG_SIZE - 1)
continue;
- if (old->stack[spi].slot_type[0] != STACK_SPILL)
- continue;
- if (!regsafe(&old->stack[spi].spilled_ptr,
- &cur->stack[spi].spilled_ptr,
- idmap))
+ /* Both old and cur are having same slot_type */
+ switch (old->stack[spi].slot_type[BPF_REG_SIZE - 1]) {
+ case STACK_SPILL:
/* when explored and current stack slot are both storing
* spilled registers, check that stored pointers types
* are the same as well.
@@ -10111,17 +14883,60 @@ static bool stacksafe(struct bpf_func_state *old,
* such verifier states are not equivalent.
* return false to continue verification of this path
*/
+ if (!regsafe(env, &old->stack[spi].spilled_ptr,
+ &cur->stack[spi].spilled_ptr, idmap))
+ return false;
+ break;
+ case STACK_DYNPTR:
+ old_reg = &old->stack[spi].spilled_ptr;
+ cur_reg = &cur->stack[spi].spilled_ptr;
+ if (old_reg->dynptr.type != cur_reg->dynptr.type ||
+ old_reg->dynptr.first_slot != cur_reg->dynptr.first_slot ||
+ !check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap))
+ return false;
+ break;
+ case STACK_ITER:
+ old_reg = &old->stack[spi].spilled_ptr;
+ cur_reg = &cur->stack[spi].spilled_ptr;
+ /* iter.depth is not compared between states as it
+ * doesn't matter for correctness and would otherwise
+ * prevent convergence; we maintain it only to prevent
+ * infinite loop check triggering, see
+ * iter_active_depths_differ()
+ */
+ if (old_reg->iter.btf != cur_reg->iter.btf ||
+ old_reg->iter.btf_id != cur_reg->iter.btf_id ||
+ old_reg->iter.state != cur_reg->iter.state ||
+ /* ignore {old_reg,cur_reg}->iter.depth, see above */
+ !check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap))
+ return false;
+ break;
+ case STACK_MISC:
+ case STACK_ZERO:
+ case STACK_INVALID:
+ continue;
+ /* Ensure that new unhandled slot types return false by default */
+ default:
return false;
+ }
}
return true;
}
-static bool refsafe(struct bpf_func_state *old, struct bpf_func_state *cur)
+static bool refsafe(struct bpf_func_state *old, struct bpf_func_state *cur,
+ struct bpf_id_pair *idmap)
{
+ int i;
+
if (old->acquired_refs != cur->acquired_refs)
return false;
- return !memcmp(old->refs, cur->refs,
- sizeof(*old->refs) * old->acquired_refs);
+
+ for (i = 0; i < old->acquired_refs; i++) {
+ if (!check_ids(old->refs[i].id, cur->refs[i].id, idmap))
+ return false;
+ }
+
+ return true;
}
/* compare two verifier states
@@ -10155,15 +14970,15 @@ static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_stat
{
int i;
- memset(env->idmap_scratch, 0, sizeof(env->idmap_scratch));
for (i = 0; i < MAX_BPF_REG; i++)
- if (!regsafe(&old->regs[i], &cur->regs[i], env->idmap_scratch))
+ if (!regsafe(env, &old->regs[i], &cur->regs[i],
+ env->idmap_scratch))
return false;
- if (!stacksafe(old, cur, env->idmap_scratch))
+ if (!stacksafe(env, old, cur, env->idmap_scratch))
return false;
- if (!refsafe(old, cur))
+ if (!refsafe(old, cur, env->idmap_scratch))
return false;
return true;
@@ -10178,13 +14993,28 @@ static bool states_equal(struct bpf_verifier_env *env,
if (old->curframe != cur->curframe)
return false;
+ memset(env->idmap_scratch, 0, sizeof(env->idmap_scratch));
+
/* Verification state from speculative execution simulation
* must never prune a non-speculative execution one.
*/
if (old->speculative && !cur->speculative)
return false;
- if (old->active_spin_lock != cur->active_spin_lock)
+ if (old->active_lock.ptr != cur->active_lock.ptr)
+ return false;
+
+ /* Old and cur active_lock's have to be either both present
+ * or both absent.
+ */
+ if (!!old->active_lock.id != !!cur->active_lock.id)
+ return false;
+
+ if (old->active_lock.id &&
+ !check_ids(old->active_lock.id, cur->active_lock.id, env->idmap_scratch))
+ return false;
+
+ if (old->active_rcu_lock != cur->active_rcu_lock)
return false;
/* for states to be equal callsites have to be the same
@@ -10287,34 +15117,38 @@ static int propagate_precision(struct bpf_verifier_env *env,
{
struct bpf_reg_state *state_reg;
struct bpf_func_state *state;
- int i, err = 0;
+ int i, err = 0, fr;
- state = old->frame[old->curframe];
- state_reg = state->regs;
- for (i = 0; i < BPF_REG_FP; i++, state_reg++) {
- if (state_reg->type != SCALAR_VALUE ||
- !state_reg->precise)
- continue;
- if (env->log.level & BPF_LOG_LEVEL2)
- verbose(env, "propagating r%d\n", i);
- err = mark_chain_precision(env, i);
- if (err < 0)
- return err;
- }
+ for (fr = old->curframe; fr >= 0; fr--) {
+ state = old->frame[fr];
+ state_reg = state->regs;
+ for (i = 0; i < BPF_REG_FP; i++, state_reg++) {
+ if (state_reg->type != SCALAR_VALUE ||
+ !state_reg->precise ||
+ !(state_reg->live & REG_LIVE_READ))
+ continue;
+ if (env->log.level & BPF_LOG_LEVEL2)
+ verbose(env, "frame %d: propagating r%d\n", fr, i);
+ err = mark_chain_precision_frame(env, fr, i);
+ if (err < 0)
+ return err;
+ }
- for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
- if (state->stack[i].slot_type[0] != STACK_SPILL)
- continue;
- state_reg = &state->stack[i].spilled_ptr;
- if (state_reg->type != SCALAR_VALUE ||
- !state_reg->precise)
- continue;
- if (env->log.level & BPF_LOG_LEVEL2)
- verbose(env, "propagating fp%d\n",
- (-i - 1) * BPF_REG_SIZE);
- err = mark_chain_precision_stack(env, i);
- if (err < 0)
- return err;
+ for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
+ if (!is_spilled_reg(&state->stack[i]))
+ continue;
+ state_reg = &state->stack[i].spilled_ptr;
+ if (state_reg->type != SCALAR_VALUE ||
+ !state_reg->precise ||
+ !(state_reg->live & REG_LIVE_READ))
+ continue;
+ if (env->log.level & BPF_LOG_LEVEL2)
+ verbose(env, "frame %d: propagating fp%d\n",
+ fr, (-i - 1) * BPF_REG_SIZE);
+ err = mark_chain_precision_stack_frame(env, fr, i);
+ if (err < 0)
+ return err;
+ }
}
return 0;
}
@@ -10337,6 +15171,92 @@ static bool states_maybe_looping(struct bpf_verifier_state *old,
return true;
}
+static bool is_iter_next_insn(struct bpf_verifier_env *env, int insn_idx)
+{
+ return env->insn_aux_data[insn_idx].is_iter_next;
+}
+
+/* is_state_visited() handles iter_next() (see process_iter_next_call() for
+ * terminology) calls specially: as opposed to bounded BPF loops, it *expects*
+ * states to match, which otherwise would look like an infinite loop. So while
+ * iter_next() calls are taken care of, we still need to be careful and
+ * prevent erroneous and too eager declaration of "ininite loop", when
+ * iterators are involved.
+ *
+ * Here's a situation in pseudo-BPF assembly form:
+ *
+ * 0: again: ; set up iter_next() call args
+ * 1: r1 = &it ; <CHECKPOINT HERE>
+ * 2: call bpf_iter_num_next ; this is iter_next() call
+ * 3: if r0 == 0 goto done
+ * 4: ... something useful here ...
+ * 5: goto again ; another iteration
+ * 6: done:
+ * 7: r1 = &it
+ * 8: call bpf_iter_num_destroy ; clean up iter state
+ * 9: exit
+ *
+ * This is a typical loop. Let's assume that we have a prune point at 1:,
+ * before we get to `call bpf_iter_num_next` (e.g., because of that `goto
+ * again`, assuming other heuristics don't get in a way).
+ *
+ * When we first time come to 1:, let's say we have some state X. We proceed
+ * to 2:, fork states, enqueue ACTIVE, validate NULL case successfully, exit.
+ * Now we come back to validate that forked ACTIVE state. We proceed through
+ * 3-5, come to goto, jump to 1:. Let's assume our state didn't change, so we
+ * are converging. But the problem is that we don't know that yet, as this
+ * convergence has to happen at iter_next() call site only. So if nothing is
+ * done, at 1: verifier will use bounded loop logic and declare infinite
+ * looping (and would be *technically* correct, if not for iterator's
+ * "eventual sticky NULL" contract, see process_iter_next_call()). But we
+ * don't want that. So what we do in process_iter_next_call() when we go on
+ * another ACTIVE iteration, we bump slot->iter.depth, to mark that it's
+ * a different iteration. So when we suspect an infinite loop, we additionally
+ * check if any of the *ACTIVE* iterator states depths differ. If yes, we
+ * pretend we are not looping and wait for next iter_next() call.
+ *
+ * This only applies to ACTIVE state. In DRAINED state we don't expect to
+ * loop, because that would actually mean infinite loop, as DRAINED state is
+ * "sticky", and so we'll keep returning into the same instruction with the
+ * same state (at least in one of possible code paths).
+ *
+ * This approach allows to keep infinite loop heuristic even in the face of
+ * active iterator. E.g., C snippet below is and will be detected as
+ * inifintely looping:
+ *
+ * struct bpf_iter_num it;
+ * int *p, x;
+ *
+ * bpf_iter_num_new(&it, 0, 10);
+ * while ((p = bpf_iter_num_next(&t))) {
+ * x = p;
+ * while (x--) {} // <<-- infinite loop here
+ * }
+ *
+ */
+static bool iter_active_depths_differ(struct bpf_verifier_state *old, struct bpf_verifier_state *cur)
+{
+ struct bpf_reg_state *slot, *cur_slot;
+ struct bpf_func_state *state;
+ int i, fr;
+
+ for (fr = old->curframe; fr >= 0; fr--) {
+ state = old->frame[fr];
+ for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
+ if (state->stack[i].slot_type[0] != STACK_ITER)
+ continue;
+
+ slot = &state->stack[i].spilled_ptr;
+ if (slot->iter.state != BPF_ITER_STATE_ACTIVE)
+ continue;
+
+ cur_slot = &cur->frame[fr]->stack[i].spilled_ptr;
+ if (cur_slot->iter.depth != slot->iter.depth)
+ return true;
+ }
+ }
+ return false;
+}
static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
{
@@ -10344,14 +15264,8 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
struct bpf_verifier_state_list *sl, **pprev;
struct bpf_verifier_state *cur = env->cur_state, *new;
int i, j, err, states_cnt = 0;
- bool add_new_state = env->test_state_freq ? true : false;
-
- cur->last_insn_idx = env->prev_insn_idx;
- if (!env->insn_aux_data[insn_idx].prune_point)
- /* this 'insn_idx' instruction wasn't marked, so we will not
- * be doing state search here
- */
- return 0;
+ bool force_new_state = env->test_state_freq || is_force_checkpoint(env, insn_idx);
+ bool add_new_state = force_new_state;
/* bpf progs typically have pruning point every 4 instructions
* http://vger.kernel.org/bpfconf2019.html#session-1
@@ -10374,9 +15288,63 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
states_cnt++;
if (sl->state.insn_idx != insn_idx)
goto next;
+
if (sl->state.branches) {
+ struct bpf_func_state *frame = sl->state.frame[sl->state.curframe];
+
+ if (frame->in_async_callback_fn &&
+ frame->async_entry_cnt != cur->frame[cur->curframe]->async_entry_cnt) {
+ /* Different async_entry_cnt means that the verifier is
+ * processing another entry into async callback.
+ * Seeing the same state is not an indication of infinite
+ * loop or infinite recursion.
+ * But finding the same state doesn't mean that it's safe
+ * to stop processing the current state. The previous state
+ * hasn't yet reached bpf_exit, since state.branches > 0.
+ * Checking in_async_callback_fn alone is not enough either.
+ * Since the verifier still needs to catch infinite loops
+ * inside async callbacks.
+ */
+ goto skip_inf_loop_check;
+ }
+ /* BPF open-coded iterators loop detection is special.
+ * states_maybe_looping() logic is too simplistic in detecting
+ * states that *might* be equivalent, because it doesn't know
+ * about ID remapping, so don't even perform it.
+ * See process_iter_next_call() and iter_active_depths_differ()
+ * for overview of the logic. When current and one of parent
+ * states are detected as equivalent, it's a good thing: we prove
+ * convergence and can stop simulating further iterations.
+ * It's safe to assume that iterator loop will finish, taking into
+ * account iter_next() contract of eventually returning
+ * sticky NULL result.
+ */
+ if (is_iter_next_insn(env, insn_idx)) {
+ if (states_equal(env, &sl->state, cur)) {
+ struct bpf_func_state *cur_frame;
+ struct bpf_reg_state *iter_state, *iter_reg;
+ int spi;
+
+ cur_frame = cur->frame[cur->curframe];
+ /* btf_check_iter_kfuncs() enforces that
+ * iter state pointer is always the first arg
+ */
+ iter_reg = &cur_frame->regs[BPF_REG_1];
+ /* current state is valid due to states_equal(),
+ * so we can assume valid iter and reg state,
+ * no need for extra (re-)validations
+ */
+ spi = __get_spi(iter_reg->off + iter_reg->var_off.value);
+ iter_state = &func(env, iter_reg)->stack[spi].spilled_ptr;
+ if (iter_state->iter.state == BPF_ITER_STATE_ACTIVE)
+ goto hit;
+ }
+ goto skip_inf_loop_check;
+ }
+ /* attempt to detect infinite loop to avoid unnecessary doomed work */
if (states_maybe_looping(&sl->state, cur) &&
- states_equal(env, &sl->state, cur)) {
+ states_equal(env, &sl->state, cur) &&
+ !iter_active_depths_differ(&sl->state, cur)) {
verbose_linfo(env, insn_idx, "; ");
verbose(env, "infinite loop detected at insn %d\n", insn_idx);
return -EINVAL;
@@ -10393,12 +15361,15 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
* This threshold shouldn't be too high either, since states
* at the end of the loop are likely to be useful in pruning.
*/
- if (env->jmps_processed - env->prev_jmps_processed < 20 &&
+skip_inf_loop_check:
+ if (!force_new_state &&
+ env->jmps_processed - env->prev_jmps_processed < 20 &&
env->insn_processed - env->prev_insn_processed < 100)
add_new_state = false;
goto miss;
}
if (states_equal(env, &sl->state, cur)) {
+hit:
sl->hit_cnt++;
/* reached equivalent register/stack state,
* prune the search.
@@ -10471,10 +15442,10 @@ next:
env->max_states_per_insn = states_cnt;
if (!env->bpf_capable && states_cnt > BPF_COMPLEXITY_LIMIT_STATES)
- return push_jmp_history(env, cur);
+ return 0;
if (!add_new_state)
- return push_jmp_history(env, cur);
+ return 0;
/* There were no equivalent states, remember the current one.
* Technically the current state is not proven to be safe yet,
@@ -10493,6 +15464,10 @@ next:
env->prev_jmps_processed = env->jmps_processed;
env->prev_insn_processed = env->insn_processed;
+ /* forget precise markings we inherited, see __mark_chain_precision */
+ if (env->bpf_capable)
+ mark_all_scalars_imprecise(env, cur);
+
/* add new state to the head of linked list */
new = &new_sl->state;
err = copy_verifier_state(new, cur);
@@ -10547,17 +15522,13 @@ next:
/* Return true if it's OK to have the same insn return a different type. */
static bool reg_type_mismatch_ok(enum bpf_reg_type type)
{
- switch (type) {
+ switch (base_type(type)) {
case PTR_TO_CTX:
case PTR_TO_SOCKET:
- case PTR_TO_SOCKET_OR_NULL:
case PTR_TO_SOCK_COMMON:
- case PTR_TO_SOCK_COMMON_OR_NULL:
case PTR_TO_TCP_SOCK:
- case PTR_TO_TCP_SOCK_OR_NULL:
case PTR_TO_XDP_SOCK:
case PTR_TO_BTF_ID:
- case PTR_TO_BTF_ID_OR_NULL:
return false;
default:
return true;
@@ -10582,6 +15553,44 @@ static bool reg_type_mismatch(enum bpf_reg_type src, enum bpf_reg_type prev)
!reg_type_mismatch_ok(prev));
}
+static int save_aux_ptr_type(struct bpf_verifier_env *env, enum bpf_reg_type type,
+ bool allow_trust_missmatch)
+{
+ enum bpf_reg_type *prev_type = &env->insn_aux_data[env->insn_idx].ptr_type;
+
+ if (*prev_type == NOT_INIT) {
+ /* Saw a valid insn
+ * dst_reg = *(u32 *)(src_reg + off)
+ * save type to validate intersecting paths
+ */
+ *prev_type = type;
+ } else if (reg_type_mismatch(type, *prev_type)) {
+ /* Abuser program is trying to use the same insn
+ * dst_reg = *(u32*) (src_reg + off)
+ * with different pointer types:
+ * src_reg == ctx in one branch and
+ * src_reg == stack|map in some other branch.
+ * Reject it.
+ */
+ if (allow_trust_missmatch &&
+ base_type(type) == PTR_TO_BTF_ID &&
+ base_type(*prev_type) == PTR_TO_BTF_ID) {
+ /*
+ * Have to support a use case when one path through
+ * the program yields TRUSTED pointer while another
+ * is UNTRUSTED. Fallback to UNTRUSTED to generate
+ * BPF_PROBE_MEM.
+ */
+ *prev_type = PTR_TO_BTF_ID | PTR_UNTRUSTED;
+ } else {
+ verbose(env, "same insn cannot be used with different pointers\n");
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
static int do_check(struct bpf_verifier_env *env)
{
bool pop_log = !(env->log.level & BPF_LOG_LEVEL2);
@@ -10614,21 +15623,31 @@ static int do_check(struct bpf_verifier_env *env)
return -E2BIG;
}
- err = is_state_visited(env, env->insn_idx);
- if (err < 0)
- return err;
- if (err == 1) {
- /* found equivalent state, can prune the search */
- if (env->log.level & BPF_LOG_LEVEL) {
- if (do_print_state)
- verbose(env, "\nfrom %d to %d%s: safe\n",
- env->prev_insn_idx, env->insn_idx,
- env->cur_state->speculative ?
- " (speculative execution)" : "");
- else
- verbose(env, "%d: safe\n", env->insn_idx);
+ state->last_insn_idx = env->prev_insn_idx;
+
+ if (is_prune_point(env, env->insn_idx)) {
+ err = is_state_visited(env, env->insn_idx);
+ if (err < 0)
+ return err;
+ if (err == 1) {
+ /* found equivalent state, can prune the search */
+ if (env->log.level & BPF_LOG_LEVEL) {
+ if (do_print_state)
+ verbose(env, "\nfrom %d to %d%s: safe\n",
+ env->prev_insn_idx, env->insn_idx,
+ env->cur_state->speculative ?
+ " (speculative execution)" : "");
+ else
+ verbose(env, "%d: safe\n", env->insn_idx);
+ }
+ goto process_bpf_exit;
}
- goto process_bpf_exit;
+ }
+
+ if (is_jmp_point(env, env->insn_idx)) {
+ err = push_jmp_history(env, state);
+ if (err)
+ return err;
}
if (signal_pending(current))
@@ -10637,16 +15656,12 @@ static int do_check(struct bpf_verifier_env *env)
if (need_resched())
cond_resched();
- if (env->log.level & BPF_LOG_LEVEL2 ||
- (env->log.level & BPF_LOG_LEVEL && do_print_state)) {
- if (env->log.level & BPF_LOG_LEVEL2)
- verbose(env, "%d:", env->insn_idx);
- else
- verbose(env, "\nfrom %d to %d%s:",
- env->prev_insn_idx, env->insn_idx,
- env->cur_state->speculative ?
- " (speculative execution)" : "");
- print_verifier_state(env, state->frame[state->curframe]);
+ if (env->log.level & BPF_LOG_LEVEL2 && do_print_state) {
+ verbose(env, "\nfrom %d to %d%s:",
+ env->prev_insn_idx, env->insn_idx,
+ env->cur_state->speculative ?
+ " (speculative execution)" : "");
+ print_verifier_state(env, state->frame[state->curframe], true);
do_print_state = false;
}
@@ -10657,12 +15672,18 @@ static int do_check(struct bpf_verifier_env *env)
.private_data = env,
};
+ if (verifier_state_scratched(env))
+ print_insn_state(env, state->frame[state->curframe]);
+
verbose_linfo(env, env->insn_idx, "; ");
+ env->prev_log_pos = env->log.end_pos;
verbose(env, "%d: ", env->insn_idx);
print_bpf_insn(&cbs, insn, env->allow_ptr_leaks);
+ env->prev_insn_print_pos = env->log.end_pos - env->prev_log_pos;
+ env->prev_log_pos = env->log.end_pos;
}
- if (bpf_prog_is_dev_bound(env->prog->aux)) {
+ if (bpf_prog_is_offloaded(env->prog->aux)) {
err = bpf_prog_offload_verify_insn(env, env->insn_idx,
env->prev_insn_idx);
if (err)
@@ -10679,7 +15700,7 @@ static int do_check(struct bpf_verifier_env *env)
return err;
} else if (class == BPF_LDX) {
- enum bpf_reg_type *prev_src_type, src_reg_type;
+ enum bpf_reg_type src_reg_type;
/* check for reserved fields is already done */
@@ -10703,29 +15724,11 @@ static int do_check(struct bpf_verifier_env *env)
if (err)
return err;
- prev_src_type = &env->insn_aux_data[env->insn_idx].ptr_type;
-
- if (*prev_src_type == NOT_INIT) {
- /* saw a valid insn
- * dst_reg = *(u32 *)(src_reg + off)
- * save type to validate intersecting paths
- */
- *prev_src_type = src_reg_type;
-
- } else if (reg_type_mismatch(src_reg_type, *prev_src_type)) {
- /* ABuser program is trying to use the same insn
- * dst_reg = *(u32*) (src_reg + off)
- * with different pointer types:
- * src_reg == ctx in one branch and
- * src_reg == stack|map in some other branch.
- * Reject it.
- */
- verbose(env, "same insn cannot be used with different pointers\n");
- return -EINVAL;
- }
-
+ err = save_aux_ptr_type(env, src_reg_type, true);
+ if (err)
+ return err;
} else if (class == BPF_STX) {
- enum bpf_reg_type *prev_dst_type, dst_reg_type;
+ enum bpf_reg_type dst_reg_type;
if (BPF_MODE(insn->code) == BPF_ATOMIC) {
err = check_atomic(env, env->insn_idx, insn);
@@ -10758,16 +15761,12 @@ static int do_check(struct bpf_verifier_env *env)
if (err)
return err;
- prev_dst_type = &env->insn_aux_data[env->insn_idx].ptr_type;
-
- if (*prev_dst_type == NOT_INIT) {
- *prev_dst_type = dst_reg_type;
- } else if (reg_type_mismatch(dst_reg_type, *prev_dst_type)) {
- verbose(env, "same insn cannot be used with different pointers\n");
- return -EINVAL;
- }
-
+ err = save_aux_ptr_type(env, dst_reg_type, false);
+ if (err)
+ return err;
} else if (class == BPF_ST) {
+ enum bpf_reg_type dst_reg_type;
+
if (BPF_MODE(insn->code) != BPF_MEM ||
insn->src_reg != BPF_REG_0) {
verbose(env, "BPF_ST uses reserved fields\n");
@@ -10778,12 +15777,7 @@ static int do_check(struct bpf_verifier_env *env)
if (err)
return err;
- if (is_ctx_reg(env, insn->dst_reg)) {
- verbose(env, "BPF_ST stores into R%d %s is not allowed\n",
- insn->dst_reg,
- reg_type_str[reg_state(env, insn->dst_reg)->type]);
- return -EACCES;
- }
+ dst_reg_type = regs[insn->dst_reg].type;
/* check that memory (dst_reg + off) is writeable */
err = check_mem_access(env, env->insn_idx, insn->dst_reg,
@@ -10792,13 +15786,17 @@ static int do_check(struct bpf_verifier_env *env)
if (err)
return err;
+ err = save_aux_ptr_type(env, dst_reg_type, false);
+ if (err)
+ return err;
} else if (class == BPF_JMP || class == BPF_JMP32) {
u8 opcode = BPF_OP(insn->code);
env->jmps_processed++;
if (opcode == BPF_CALL) {
if (BPF_SRC(insn->code) != BPF_K ||
- insn->off != 0 ||
+ (insn->src_reg != BPF_PSEUDO_KFUNC_CALL
+ && insn->off != 0) ||
(insn->src_reg != BPF_REG_0 &&
insn->src_reg != BPF_PSEUDO_CALL &&
insn->src_reg != BPF_PSEUDO_KFUNC_CALL) ||
@@ -10808,20 +15806,25 @@ static int do_check(struct bpf_verifier_env *env)
return -EINVAL;
}
- if (env->cur_state->active_spin_lock &&
- (insn->src_reg == BPF_PSEUDO_CALL ||
- insn->imm != BPF_FUNC_spin_unlock)) {
- verbose(env, "function calls are not allowed while holding a lock\n");
- return -EINVAL;
+ if (env->cur_state->active_lock.ptr) {
+ if ((insn->src_reg == BPF_REG_0 && insn->imm != BPF_FUNC_spin_unlock) ||
+ (insn->src_reg == BPF_PSEUDO_CALL) ||
+ (insn->src_reg == BPF_PSEUDO_KFUNC_CALL &&
+ (insn->off != 0 || !is_bpf_graph_api_kfunc(insn->imm)))) {
+ verbose(env, "function calls are not allowed while holding a lock\n");
+ return -EINVAL;
+ }
}
if (insn->src_reg == BPF_PSEUDO_CALL)
err = check_func_call(env, insn, &env->insn_idx);
else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL)
- err = check_kfunc_call(env, insn);
+ err = check_kfunc_call(env, insn, &env->insn_idx);
else
err = check_helper_call(env, insn, &env->insn_idx);
if (err)
return err;
+
+ mark_reg_scratched(env, BPF_REG_0);
} else if (opcode == BPF_JA) {
if (BPF_SRC(insn->code) != BPF_K ||
insn->imm != 0 ||
@@ -10845,11 +15848,27 @@ static int do_check(struct bpf_verifier_env *env)
return -EINVAL;
}
- if (env->cur_state->active_spin_lock) {
+ if (env->cur_state->active_lock.ptr &&
+ !in_rbtree_lock_required_cb(env)) {
verbose(env, "bpf_spin_unlock is missing\n");
return -EINVAL;
}
+ if (env->cur_state->active_rcu_lock) {
+ verbose(env, "bpf_rcu_read_unlock is missing\n");
+ return -EINVAL;
+ }
+
+ /* We must do check_reference_leak here before
+ * prepare_func_exit to handle the case when
+ * state->curframe > 0, it may be a callback
+ * function, for which reference_state must
+ * match caller reference state when it exits.
+ */
+ err = check_reference_leak(env);
+ if (err)
+ return err;
+
if (state->curframe) {
/* exit from nested function */
err = prepare_func_exit(env, &env->insn_idx);
@@ -10859,14 +15878,11 @@ static int do_check(struct bpf_verifier_env *env)
continue;
}
- err = check_reference_leak(env);
- if (err)
- return err;
-
err = check_return_code(env);
if (err)
return err;
process_bpf_exit:
+ mark_verifier_state_scratched(env);
update_branch_counts(env, env->cur_state);
err = pop_stack(env, &prev_insn_idx,
&env->insn_idx, pop_log);
@@ -10983,8 +15999,8 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env,
goto err_put;
}
- if (!btf_type_is_var(t)) {
- verbose(env, "pseudo btf_id %d in ldimm64 isn't KIND_VAR.\n", id);
+ if (!btf_type_is_var(t) && !btf_type_is_func(t)) {
+ verbose(env, "pseudo btf_id %d in ldimm64 isn't KIND_VAR or KIND_FUNC\n", id);
err = -EINVAL;
goto err_put;
}
@@ -10997,6 +16013,14 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env,
err = -ENOENT;
goto err_put;
}
+ insn[0].imm = (u32)addr;
+ insn[1].imm = addr >> 32;
+
+ if (btf_type_is_func(t)) {
+ aux->btf_var.reg_type = PTR_TO_MEM | MEM_RDONLY;
+ aux->btf_var.mem_size = 0;
+ goto check_btf;
+ }
datasec_id = find_btf_percpu_datasec(btf);
if (datasec_id > 0) {
@@ -11009,13 +16033,10 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env,
}
}
- insn[0].imm = (u32)addr;
- insn[1].imm = addr >> 32;
-
type = t->type;
t = btf_type_skip_modifiers(btf, type, NULL);
if (percpu) {
- aux->btf_var.reg_type = PTR_TO_PERCPU_BTF_ID;
+ aux->btf_var.reg_type = PTR_TO_BTF_ID | MEM_PERCPU;
aux->btf_var.btf = btf;
aux->btf_var.btf_id = type;
} else if (!btf_type_is_struct(t)) {
@@ -11032,14 +16053,14 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env,
err = -EINVAL;
goto err_put;
}
- aux->btf_var.reg_type = PTR_TO_MEM;
+ aux->btf_var.reg_type = PTR_TO_MEM | MEM_RDONLY;
aux->btf_var.mem_size = tsize;
} else {
aux->btf_var.reg_type = PTR_TO_BTF_ID;
aux->btf_var.btf = btf;
aux->btf_var.btf_id = type;
}
-
+check_btf:
/* check whether we recorded this BTF (and maybe module) already */
for (i = 0; i < env->used_btf_cnt; i++) {
if (env->used_btfs[i].btf == btf) {
@@ -11074,14 +16095,6 @@ err_put:
return err;
}
-static int check_map_prealloc(struct bpf_map *map)
-{
- return (map->map_type != BPF_MAP_TYPE_HASH &&
- map->map_type != BPF_MAP_TYPE_PERCPU_HASH &&
- map->map_type != BPF_MAP_TYPE_HASH_OF_MAPS) ||
- !(map->map_flags & BPF_F_NO_PREALLOC);
-}
-
static bool is_tracing_prog_type(enum bpf_prog_type type)
{
switch (type) {
@@ -11089,58 +16102,29 @@ static bool is_tracing_prog_type(enum bpf_prog_type type)
case BPF_PROG_TYPE_TRACEPOINT:
case BPF_PROG_TYPE_PERF_EVENT:
case BPF_PROG_TYPE_RAW_TRACEPOINT:
+ case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE:
return true;
default:
return false;
}
}
-static bool is_preallocated_map(struct bpf_map *map)
-{
- if (!check_map_prealloc(map))
- return false;
- if (map->inner_map_meta && !check_map_prealloc(map->inner_map_meta))
- return false;
- return true;
-}
-
static int check_map_prog_compatibility(struct bpf_verifier_env *env,
struct bpf_map *map,
struct bpf_prog *prog)
{
enum bpf_prog_type prog_type = resolve_prog_type(prog);
- /*
- * Validate that trace type programs use preallocated hash maps.
- *
- * For programs attached to PERF events this is mandatory as the
- * perf NMI can hit any arbitrary code sequence.
- *
- * All other trace types using preallocated hash maps are unsafe as
- * well because tracepoint or kprobes can be inside locked regions
- * of the memory allocator or at a place where a recursion into the
- * memory allocator would see inconsistent state.
- *
- * On RT enabled kernels run-time allocation of all trace type
- * programs is strictly prohibited due to lock type constraints. On
- * !RT kernels it is allowed for backwards compatibility reasons for
- * now, but warnings are emitted so developers are made aware of
- * the unsafety and can fix their programs before this is enforced.
- */
- if (is_tracing_prog_type(prog_type) && !is_preallocated_map(map)) {
- if (prog_type == BPF_PROG_TYPE_PERF_EVENT) {
- verbose(env, "perf_event programs can only use preallocated hash map\n");
- return -EINVAL;
- }
- if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
- verbose(env, "trace type programs can only use preallocated hash map\n");
+
+ if (btf_record_has_field(map->record, BPF_LIST_HEAD) ||
+ btf_record_has_field(map->record, BPF_RB_ROOT)) {
+ if (is_tracing_prog_type(prog_type)) {
+ verbose(env, "tracing progs cannot use bpf_{list_head,rb_root} yet\n");
return -EINVAL;
}
- WARN_ONCE(1, "trace type BPF program uses run-time allocation\n");
- verbose(env, "trace type programs with run-time allocated hash maps are unsafe. Switch to preallocated hash maps.\n");
}
- if (map_value_has_spin_lock(map)) {
+ if (btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
if (prog_type == BPF_PROG_TYPE_SOCKET_FILTER) {
verbose(env, "socket filter progs cannot use bpf_spin_lock yet\n");
return -EINVAL;
@@ -11157,7 +16141,14 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
}
}
- if ((bpf_prog_is_dev_bound(prog->aux) || bpf_map_is_dev_bound(map)) &&
+ if (btf_record_has_field(map->record, BPF_TIMER)) {
+ if (is_tracing_prog_type(prog_type)) {
+ verbose(env, "tracing progs cannot use bpf_timer yet\n");
+ return -EINVAL;
+ }
+ }
+
+ if ((bpf_prog_is_offloaded(prog->aux) || bpf_map_is_offloaded(map)) &&
!bpf_offload_prog_map_match(prog, map)) {
verbose(env, "offload device mismatch between prog and map\n");
return -EINVAL;
@@ -11178,17 +16169,16 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
case BPF_MAP_TYPE_LRU_PERCPU_HASH:
case BPF_MAP_TYPE_ARRAY_OF_MAPS:
case BPF_MAP_TYPE_HASH_OF_MAPS:
- if (!is_preallocated_map(map)) {
- verbose(env,
- "Sleepable programs can only use preallocated maps\n");
- return -EINVAL;
- }
- break;
case BPF_MAP_TYPE_RINGBUF:
+ case BPF_MAP_TYPE_USER_RINGBUF:
+ case BPF_MAP_TYPE_INODE_STORAGE:
+ case BPF_MAP_TYPE_SK_STORAGE:
+ case BPF_MAP_TYPE_TASK_STORAGE:
+ case BPF_MAP_TYPE_CGRP_STORAGE:
break;
default:
verbose(env,
- "Sleepable programs can only use array, hash, and ringbuf maps\n");
+ "Sleepable programs can only use array, hash, ringbuf and local storage maps\n");
return -EINVAL;
}
@@ -11425,10 +16415,11 @@ static void convert_pseudo_ld_imm64(struct bpf_verifier_env *env)
* insni[off, off + cnt). Adjust corresponding insn_aux_data by copying
* [0, off) and [off, end) to new locations, so the patched range stays zero
*/
-static int adjust_insn_aux_data(struct bpf_verifier_env *env,
- struct bpf_prog *new_prog, u32 off, u32 cnt)
+static void adjust_insn_aux_data(struct bpf_verifier_env *env,
+ struct bpf_insn_aux_data *new_data,
+ struct bpf_prog *new_prog, u32 off, u32 cnt)
{
- struct bpf_insn_aux_data *new_data, *old_data = env->insn_aux_data;
+ struct bpf_insn_aux_data *old_data = env->insn_aux_data;
struct bpf_insn *insn = new_prog->insnsi;
u32 old_seen = old_data[off].seen;
u32 prog_len;
@@ -11441,12 +16432,9 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env,
old_data[off].zext_dst = insn_has_def32(env, insn + off + cnt - 1);
if (cnt == 1)
- return 0;
+ return;
prog_len = new_prog->len;
- new_data = vzalloc(array_size(prog_len,
- sizeof(struct bpf_insn_aux_data)));
- if (!new_data)
- return -ENOMEM;
+
memcpy(new_data, old_data, sizeof(struct bpf_insn_aux_data) * off);
memcpy(new_data + off + cnt - 1, old_data + off,
sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1));
@@ -11457,7 +16445,6 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env,
}
env->insn_aux_data = new_data;
vfree(old_data);
- return 0;
}
static void adjust_subprog_starts(struct bpf_verifier_env *env, u32 off, u32 len)
@@ -11492,6 +16479,14 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of
const struct bpf_insn *patch, u32 len)
{
struct bpf_prog *new_prog;
+ struct bpf_insn_aux_data *new_data = NULL;
+
+ if (len > 1) {
+ new_data = vzalloc(array_size(env->prog->len + len - 1,
+ sizeof(struct bpf_insn_aux_data)));
+ if (!new_data)
+ return NULL;
+ }
new_prog = bpf_patch_insn_single(env->prog, off, patch, len);
if (IS_ERR(new_prog)) {
@@ -11499,10 +16494,10 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of
verbose(env,
"insn %d cannot be patched due to 16-bit range\n",
env->insn_aux_data[off].orig_idx);
+ vfree(new_data);
return NULL;
}
- if (adjust_insn_aux_data(env, new_prog, off, len))
- return NULL;
+ adjust_insn_aux_data(env, new_data, new_prog, off, len);
adjust_subprog_starts(env, off, len);
adjust_poke_descs(new_prog, off, len);
return new_prog;
@@ -11634,7 +16629,7 @@ static int verifier_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt)
unsigned int orig_prog_len = env->prog->len;
int err;
- if (bpf_prog_is_dev_bound(env->prog->aux))
+ if (bpf_prog_is_offloaded(env->prog->aux))
bpf_prog_offload_remove_insns(env, off, cnt);
err = bpf_remove_insns(env->prog, off, cnt);
@@ -11678,6 +16673,7 @@ static void sanitize_dead_code(struct bpf_verifier_env *env)
if (aux_data[i].seen)
continue;
memcpy(insn + i, &trap, sizeof(trap));
+ aux_data[i].zext_dst = false;
}
}
@@ -11714,7 +16710,7 @@ static void opt_hard_wire_dead_code_branches(struct bpf_verifier_env *env)
else
continue;
- if (bpf_prog_is_dev_bound(env->prog->aux))
+ if (bpf_prog_is_offloaded(env->prog->aux))
bpf_prog_offload_replace_insn(env, i, &ja);
memcpy(insn, &ja, sizeof(ja));
@@ -11816,7 +16812,7 @@ static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env,
aux[adj_idx].ptr_type == PTR_TO_CTX)
continue;
- imm_rnd = get_random_int();
+ imm_rnd = get_random_u32();
rnd_hi32_patch[0] = insn;
rnd_hi32_patch[1].imm = imm_rnd;
rnd_hi32_patch[3].dst_reg = load_reg;
@@ -11838,6 +16834,10 @@ static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env,
if (!bpf_jit_needs_zext() && !is_cmpxchg_insn(&insn))
continue;
+ /* Zero-extension is done by the caller. */
+ if (bpf_pseudo_kfunc_call(&insn))
+ continue;
+
if (WARN_ON(load_reg == -1)) {
verbose(env, "verifier bug. zext_dst is set, but no reg is defined\n");
return -EFAULT;
@@ -11897,7 +16897,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
}
}
- if (bpf_prog_is_dev_bound(env->prog->aux))
+ if (bpf_prog_is_offloaded(env->prog->aux))
return 0;
insn = env->prog->insnsi + delta;
@@ -11908,31 +16908,26 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
if (insn->code == (BPF_LDX | BPF_MEM | BPF_B) ||
insn->code == (BPF_LDX | BPF_MEM | BPF_H) ||
insn->code == (BPF_LDX | BPF_MEM | BPF_W) ||
- insn->code == (BPF_LDX | BPF_MEM | BPF_DW))
+ insn->code == (BPF_LDX | BPF_MEM | BPF_DW)) {
type = BPF_READ;
- else if (insn->code == (BPF_STX | BPF_MEM | BPF_B) ||
- insn->code == (BPF_STX | BPF_MEM | BPF_H) ||
- insn->code == (BPF_STX | BPF_MEM | BPF_W) ||
- insn->code == (BPF_STX | BPF_MEM | BPF_DW))
+ } else if (insn->code == (BPF_STX | BPF_MEM | BPF_B) ||
+ insn->code == (BPF_STX | BPF_MEM | BPF_H) ||
+ insn->code == (BPF_STX | BPF_MEM | BPF_W) ||
+ insn->code == (BPF_STX | BPF_MEM | BPF_DW) ||
+ insn->code == (BPF_ST | BPF_MEM | BPF_B) ||
+ insn->code == (BPF_ST | BPF_MEM | BPF_H) ||
+ insn->code == (BPF_ST | BPF_MEM | BPF_W) ||
+ insn->code == (BPF_ST | BPF_MEM | BPF_DW)) {
type = BPF_WRITE;
- else
+ } else {
continue;
+ }
if (type == BPF_WRITE &&
- env->insn_aux_data[i + delta].sanitize_stack_off) {
+ env->insn_aux_data[i + delta].sanitize_stack_spill) {
struct bpf_insn patch[] = {
- /* Sanitize suspicious stack slot with zero.
- * There are no memory dependencies for this store,
- * since it's only using frame pointer and immediate
- * constant of zero
- */
- BPF_ST_MEM(BPF_DW, BPF_REG_FP,
- env->insn_aux_data[i + delta].sanitize_stack_off,
- 0),
- /* the original STX instruction will immediately
- * overwrite the same stack slot with appropriate value
- */
*insn,
+ BPF_ST_NOSPEC(),
};
cnt = ARRAY_SIZE(patch);
@@ -11946,7 +16941,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
continue;
}
- switch (env->insn_aux_data[i + delta].ptr_type) {
+ switch ((int)env->insn_aux_data[i + delta].ptr_type) {
case PTR_TO_CTX:
if (!ops->convert_ctx_access)
continue;
@@ -11963,13 +16958,18 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
convert_ctx_access = bpf_xdp_sock_convert_ctx_access;
break;
case PTR_TO_BTF_ID:
+ case PTR_TO_BTF_ID | PTR_UNTRUSTED:
+ /* PTR_TO_BTF_ID | MEM_ALLOC always has a valid lifetime, unlike
+ * PTR_TO_BTF_ID, and an active ref_obj_id, but the same cannot
+ * be said once it is marked PTR_UNTRUSTED, hence we must handle
+ * any faults for loads into such types. BPF_WRITE is disallowed
+ * for this case.
+ */
+ case PTR_TO_BTF_ID | MEM_ALLOC | PTR_UNTRUSTED:
if (type == BPF_READ) {
insn->code = BPF_LDX | BPF_PROBE_MEM |
BPF_SIZE((insn)->code);
env->prog->aux->num_exentries++;
- } else if (resolve_prog_type(env->prog) != BPF_PROG_TYPE_STRUCT_OPS) {
- verbose(env, "Writes through BTF pointers are not allowed\n");
- return -EINVAL;
}
continue;
default:
@@ -12017,6 +17017,10 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
if (is_narrower_load && size < target_size) {
u8 shift = bpf_ctx_narrow_access_offset(
off, size, size_default) * 8;
+ if (shift && cnt + 1 >= ARRAY_SIZE(insn_buf)) {
+ verbose(env, "bpf verifier narrow ctx load misconfigured\n");
+ return -EINVAL;
+ }
if (ctx_field_size <= 4) {
if (shift)
insn_buf[cnt++] = BPF_ALU32_IMM(BPF_RSH,
@@ -12061,14 +17065,9 @@ static int jit_subprogs(struct bpf_verifier_env *env)
return 0;
for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
- if (bpf_pseudo_func(insn)) {
- env->insn_aux_data[i].call_imm = insn->imm;
- /* subprog is encoded in insn[1].imm */
+ if (!bpf_pseudo_func(insn) && !bpf_pseudo_call(insn))
continue;
- }
- if (!bpf_pseudo_call(insn))
- continue;
/* Upon error here we cannot fall back to interpreter but
* need a hard reject of the program. Thus -EFAULT is
* propagated in any case.
@@ -12089,6 +17088,12 @@ static int jit_subprogs(struct bpf_verifier_env *env)
env->insn_aux_data[i].call_imm = insn->imm;
/* point imm to __bpf_call_base+1 from JITs point of view */
insn->imm = 1;
+ if (bpf_pseudo_func(insn))
+ /* jit (e.g. x86_64) may emit fewer instructions
+ * if it learns a u32 imm is the same as a u64 imm.
+ * Force a non zero here.
+ */
+ insn[1].imm = 1;
}
err = bpf_prog_alloc_jited_linfo(prog);
@@ -12105,7 +17110,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
subprog_end = env->subprog_info[i + 1].start;
len = subprog_end - subprog_start;
- /* BPF_PROG_RUN doesn't call subprogs directly,
+ /* bpf_prog_run() doesn't call subprogs directly,
* hence main prog stats include the runtime of subprogs.
* subprogs don't have IDs and not reachable via prog_get_next_id
* func[i]->stats will never be accessed and stays NULL
@@ -12124,6 +17129,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
/* Below members will be freed only at prog->aux */
func[i]->aux->btf = prog->aux->btf;
func[i]->aux->func_info = prog->aux->func_info;
+ func[i]->aux->func_info_cnt = prog->aux->func_info_cnt;
func[i]->aux->poke_tab = prog->aux->poke_tab;
func[i]->aux->size_poke_tab = prog->aux->size_poke_tab;
@@ -12136,13 +17142,12 @@ static int jit_subprogs(struct bpf_verifier_env *env)
poke->aux = func[i]->aux;
}
- /* Use bpf_prog_F_tag to indicate functions in stack traces.
- * Long term would need debug info to populate names
- */
func[i]->aux->name[0] = 'F';
func[i]->aux->stack_depth = env->subprog_info[i].stack_depth;
func[i]->jit_requested = 1;
+ func[i]->blinding_requested = prog->blinding_requested;
func[i]->aux->kfunc_tab = prog->aux->kfunc_tab;
+ func[i]->aux->kfunc_btf_tab = prog->aux->kfunc_btf_tab;
func[i]->aux->linfo = prog->aux->linfo;
func[i]->aux->nr_linfo = prog->aux->nr_linfo;
func[i]->aux->jited_linfo = prog->aux->jited_linfo;
@@ -12172,7 +17177,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
insn = func[i]->insnsi;
for (j = 0; j < func[i]->len; j++, insn++) {
if (bpf_pseudo_func(insn)) {
- subprog = insn[1].imm;
+ subprog = insn->off;
insn[0].imm = (u32)(long)func[subprog]->bpf_func;
insn[1].imm = ((u64)(long)func[subprog]->bpf_func) >> 32;
continue;
@@ -12180,8 +17185,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
if (!bpf_pseudo_call(insn))
continue;
subprog = insn->off;
- insn->imm = BPF_CAST_CALL(func[subprog]->bpf_func) -
- __bpf_call_base;
+ insn->imm = BPF_CALL_IMM(func[subprog]->bpf_func);
}
/* we use the aux data to keep a list of the start addresses
@@ -12224,7 +17228,8 @@ static int jit_subprogs(struct bpf_verifier_env *env)
for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
if (bpf_pseudo_func(insn)) {
insn[0].imm = env->insn_aux_data[i].call_imm;
- insn[1].imm = find_subprog(env, i + insn[0].imm + 1);
+ insn[1].imm = insn->off;
+ insn->off = 0;
continue;
}
if (!bpf_pseudo_call(insn))
@@ -12236,6 +17241,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
prog->jited = 1;
prog->bpf_func = func[0]->bpf_func;
+ prog->jited_len = func[0]->jited_len;
prog->aux->func = func;
prog->aux->func_cnt = env->subprog_cnt;
bpf_prog_jit_attempt_done(prog);
@@ -12263,6 +17269,7 @@ out_free:
out_undo_insn:
/* cleanup main prog to be interpreted */
prog->jit_requested = 0;
+ prog->blinding_requested = 0;
for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
if (!bpf_pseudo_call(insn))
continue;
@@ -12284,7 +17291,7 @@ static int fixup_call_args(struct bpf_verifier_env *env)
int err = 0;
if (env->prog->jit_requested &&
- !bpf_prog_is_dev_bound(env->prog->aux)) {
+ !bpf_prog_is_offloaded(env->prog->aux)) {
err = jit_subprogs(env);
if (err == 0)
return 0;
@@ -12324,23 +17331,123 @@ static int fixup_call_args(struct bpf_verifier_env *env)
return err;
}
-static int fixup_kfunc_call(struct bpf_verifier_env *env,
- struct bpf_insn *insn)
+/* replace a generic kfunc with a specialized version if necessary */
+static void specialize_kfunc(struct bpf_verifier_env *env,
+ u32 func_id, u16 offset, unsigned long *addr)
+{
+ struct bpf_prog *prog = env->prog;
+ bool seen_direct_write;
+ void *xdp_kfunc;
+ bool is_rdonly;
+
+ if (bpf_dev_bound_kfunc_id(func_id)) {
+ xdp_kfunc = bpf_dev_bound_resolve_kfunc(prog, func_id);
+ if (xdp_kfunc) {
+ *addr = (unsigned long)xdp_kfunc;
+ return;
+ }
+ /* fallback to default kfunc when not supported by netdev */
+ }
+
+ if (offset)
+ return;
+
+ if (func_id == special_kfunc_list[KF_bpf_dynptr_from_skb]) {
+ seen_direct_write = env->seen_direct_write;
+ is_rdonly = !may_access_direct_pkt_data(env, NULL, BPF_WRITE);
+
+ if (is_rdonly)
+ *addr = (unsigned long)bpf_dynptr_from_skb_rdonly;
+
+ /* restore env->seen_direct_write to its original value, since
+ * may_access_direct_pkt_data mutates it
+ */
+ env->seen_direct_write = seen_direct_write;
+ }
+}
+
+static void __fixup_collection_insert_kfunc(struct bpf_insn_aux_data *insn_aux,
+ u16 struct_meta_reg,
+ u16 node_offset_reg,
+ struct bpf_insn *insn,
+ struct bpf_insn *insn_buf,
+ int *cnt)
+{
+ struct btf_struct_meta *kptr_struct_meta = insn_aux->kptr_struct_meta;
+ struct bpf_insn addr[2] = { BPF_LD_IMM64(struct_meta_reg, (long)kptr_struct_meta) };
+
+ insn_buf[0] = addr[0];
+ insn_buf[1] = addr[1];
+ insn_buf[2] = BPF_MOV64_IMM(node_offset_reg, insn_aux->insert_off);
+ insn_buf[3] = *insn;
+ *cnt = 4;
+}
+
+static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
+ struct bpf_insn *insn_buf, int insn_idx, int *cnt)
{
const struct bpf_kfunc_desc *desc;
- /* insn->imm has the btf func_id. Replace it with
- * an address (relative to __bpf_base_call).
+ if (!insn->imm) {
+ verbose(env, "invalid kernel function call not eliminated in verifier pass\n");
+ return -EINVAL;
+ }
+
+ *cnt = 0;
+
+ /* insn->imm has the btf func_id. Replace it with an offset relative to
+ * __bpf_call_base, unless the JIT needs to call functions that are
+ * further than 32 bits away (bpf_jit_supports_far_kfunc_call()).
*/
- desc = find_kfunc_desc(env->prog, insn->imm);
+ desc = find_kfunc_desc(env->prog, insn->imm, insn->off);
if (!desc) {
verbose(env, "verifier internal error: kernel function descriptor not found for func_id %u\n",
insn->imm);
return -EFAULT;
}
- insn->imm = desc->imm;
+ if (!bpf_jit_supports_far_kfunc_call())
+ insn->imm = BPF_CALL_IMM(desc->addr);
+ if (insn->off)
+ return 0;
+ if (desc->func_id == special_kfunc_list[KF_bpf_obj_new_impl]) {
+ struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta;
+ struct bpf_insn addr[2] = { BPF_LD_IMM64(BPF_REG_2, (long)kptr_struct_meta) };
+ u64 obj_new_size = env->insn_aux_data[insn_idx].obj_new_size;
+
+ insn_buf[0] = BPF_MOV64_IMM(BPF_REG_1, obj_new_size);
+ insn_buf[1] = addr[0];
+ insn_buf[2] = addr[1];
+ insn_buf[3] = *insn;
+ *cnt = 4;
+ } else if (desc->func_id == special_kfunc_list[KF_bpf_obj_drop_impl] ||
+ desc->func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl]) {
+ struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta;
+ struct bpf_insn addr[2] = { BPF_LD_IMM64(BPF_REG_2, (long)kptr_struct_meta) };
+
+ insn_buf[0] = addr[0];
+ insn_buf[1] = addr[1];
+ insn_buf[2] = *insn;
+ *cnt = 3;
+ } else if (desc->func_id == special_kfunc_list[KF_bpf_list_push_back_impl] ||
+ desc->func_id == special_kfunc_list[KF_bpf_list_push_front_impl] ||
+ desc->func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) {
+ int struct_meta_reg = BPF_REG_3;
+ int node_offset_reg = BPF_REG_4;
+
+ /* rbtree_add has extra 'less' arg, so args-to-fixup are in diff regs */
+ if (desc->func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) {
+ struct_meta_reg = BPF_REG_4;
+ node_offset_reg = BPF_REG_5;
+ }
+ __fixup_collection_insert_kfunc(&env->insn_aux_data[insn_idx], struct_meta_reg,
+ node_offset_reg, insn, insn_buf, cnt);
+ } else if (desc->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx] ||
+ desc->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) {
+ insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1);
+ *cnt = 1;
+ }
return 0;
}
@@ -12350,7 +17457,8 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env,
static int do_misc_fixups(struct bpf_verifier_env *env)
{
struct bpf_prog *prog = env->prog;
- bool expect_blinding = bpf_jit_blinding_enabled(prog);
+ enum bpf_attach_type eatype = prog->expected_attach_type;
+ enum bpf_prog_type prog_type = resolve_prog_type(prog);
struct bpf_insn *insn = prog->insnsi;
const struct bpf_func_proto *fn;
const int insn_cnt = prog->len;
@@ -12481,9 +17589,19 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
if (insn->src_reg == BPF_PSEUDO_CALL)
continue;
if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
- ret = fixup_kfunc_call(env, insn);
+ ret = fixup_kfunc_call(env, insn, insn_buf, i + delta, &cnt);
if (ret)
return ret;
+ if (cnt == 0)
+ continue;
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
continue;
}
@@ -12513,7 +17631,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
insn->code = BPF_JMP | BPF_TAIL_CALL;
aux = &env->insn_aux_data[i + delta];
- if (env->bpf_capable && !expect_blinding &&
+ if (env->bpf_capable && !prog->blinding_requested &&
prog->jit_requested &&
!bpf_map_key_poisoned(aux) &&
!bpf_map_ptr_poisoned(aux) &&
@@ -12568,6 +17686,58 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
continue;
}
+ if (insn->imm == BPF_FUNC_timer_set_callback) {
+ /* The verifier will process callback_fn as many times as necessary
+ * with different maps and the register states prepared by
+ * set_timer_callback_state will be accurate.
+ *
+ * The following use case is valid:
+ * map1 is shared by prog1, prog2, prog3.
+ * prog1 calls bpf_timer_init for some map1 elements
+ * prog2 calls bpf_timer_set_callback for some map1 elements.
+ * Those that were not bpf_timer_init-ed will return -EINVAL.
+ * prog3 calls bpf_timer_start for some map1 elements.
+ * Those that were not both bpf_timer_init-ed and
+ * bpf_timer_set_callback-ed will return -EINVAL.
+ */
+ struct bpf_insn ld_addrs[2] = {
+ BPF_LD_IMM64(BPF_REG_3, (long)prog->aux),
+ };
+
+ insn_buf[0] = ld_addrs[0];
+ insn_buf[1] = ld_addrs[1];
+ insn_buf[2] = *insn;
+ cnt = 3;
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto patch_call_imm;
+ }
+
+ if (is_storage_get_function(insn->imm)) {
+ if (!env->prog->aux->sleepable ||
+ env->insn_aux_data[i + delta].storage_get_func_atomic)
+ insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_ATOMIC);
+ else
+ insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_KERNEL);
+ insn_buf[1] = *insn;
+ cnt = 2;
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto patch_call_imm;
+ }
+
/* BPF_EMIT_CALL() assumptions in some of the map_gen_lookup
* and other inlining handlers are currently limited to 64 bit
* only.
@@ -12579,7 +17749,9 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
insn->imm == BPF_FUNC_map_push_elem ||
insn->imm == BPF_FUNC_map_pop_elem ||
insn->imm == BPF_FUNC_map_peek_elem ||
- insn->imm == BPF_FUNC_redirect_map)) {
+ insn->imm == BPF_FUNC_redirect_map ||
+ insn->imm == BPF_FUNC_for_each_map_elem ||
+ insn->imm == BPF_FUNC_map_lookup_percpu_elem)) {
aux = &env->insn_aux_data[i + delta];
if (bpf_map_ptr_poisoned(aux))
goto patch_call_imm;
@@ -12610,49 +17782,55 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
BUILD_BUG_ON(!__same_type(ops->map_lookup_elem,
(void *(*)(struct bpf_map *map, void *key))NULL));
BUILD_BUG_ON(!__same_type(ops->map_delete_elem,
- (int (*)(struct bpf_map *map, void *key))NULL));
+ (long (*)(struct bpf_map *map, void *key))NULL));
BUILD_BUG_ON(!__same_type(ops->map_update_elem,
- (int (*)(struct bpf_map *map, void *key, void *value,
+ (long (*)(struct bpf_map *map, void *key, void *value,
u64 flags))NULL));
BUILD_BUG_ON(!__same_type(ops->map_push_elem,
- (int (*)(struct bpf_map *map, void *value,
+ (long (*)(struct bpf_map *map, void *value,
u64 flags))NULL));
BUILD_BUG_ON(!__same_type(ops->map_pop_elem,
- (int (*)(struct bpf_map *map, void *value))NULL));
+ (long (*)(struct bpf_map *map, void *value))NULL));
BUILD_BUG_ON(!__same_type(ops->map_peek_elem,
- (int (*)(struct bpf_map *map, void *value))NULL));
+ (long (*)(struct bpf_map *map, void *value))NULL));
BUILD_BUG_ON(!__same_type(ops->map_redirect,
- (int (*)(struct bpf_map *map, u32 ifindex, u64 flags))NULL));
+ (long (*)(struct bpf_map *map, u64 index, u64 flags))NULL));
+ BUILD_BUG_ON(!__same_type(ops->map_for_each_callback,
+ (long (*)(struct bpf_map *map,
+ bpf_callback_t callback_fn,
+ void *callback_ctx,
+ u64 flags))NULL));
+ BUILD_BUG_ON(!__same_type(ops->map_lookup_percpu_elem,
+ (void *(*)(struct bpf_map *map, void *key, u32 cpu))NULL));
patch_map_ops_generic:
switch (insn->imm) {
case BPF_FUNC_map_lookup_elem:
- insn->imm = BPF_CAST_CALL(ops->map_lookup_elem) -
- __bpf_call_base;
+ insn->imm = BPF_CALL_IMM(ops->map_lookup_elem);
continue;
case BPF_FUNC_map_update_elem:
- insn->imm = BPF_CAST_CALL(ops->map_update_elem) -
- __bpf_call_base;
+ insn->imm = BPF_CALL_IMM(ops->map_update_elem);
continue;
case BPF_FUNC_map_delete_elem:
- insn->imm = BPF_CAST_CALL(ops->map_delete_elem) -
- __bpf_call_base;
+ insn->imm = BPF_CALL_IMM(ops->map_delete_elem);
continue;
case BPF_FUNC_map_push_elem:
- insn->imm = BPF_CAST_CALL(ops->map_push_elem) -
- __bpf_call_base;
+ insn->imm = BPF_CALL_IMM(ops->map_push_elem);
continue;
case BPF_FUNC_map_pop_elem:
- insn->imm = BPF_CAST_CALL(ops->map_pop_elem) -
- __bpf_call_base;
+ insn->imm = BPF_CALL_IMM(ops->map_pop_elem);
continue;
case BPF_FUNC_map_peek_elem:
- insn->imm = BPF_CAST_CALL(ops->map_peek_elem) -
- __bpf_call_base;
+ insn->imm = BPF_CALL_IMM(ops->map_peek_elem);
continue;
case BPF_FUNC_redirect_map:
- insn->imm = BPF_CAST_CALL(ops->map_redirect) -
- __bpf_call_base;
+ insn->imm = BPF_CALL_IMM(ops->map_redirect);
+ continue;
+ case BPF_FUNC_for_each_map_elem:
+ insn->imm = BPF_CALL_IMM(ops->map_for_each_callback);
+ continue;
+ case BPF_FUNC_map_lookup_percpu_elem:
+ insn->imm = BPF_CALL_IMM(ops->map_lookup_percpu_elem);
continue;
}
@@ -12684,6 +17862,89 @@ patch_map_ops_generic:
continue;
}
+ /* Implement bpf_get_func_arg inline. */
+ if (prog_type == BPF_PROG_TYPE_TRACING &&
+ insn->imm == BPF_FUNC_get_func_arg) {
+ /* Load nr_args from ctx - 8 */
+ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8);
+ insn_buf[1] = BPF_JMP32_REG(BPF_JGE, BPF_REG_2, BPF_REG_0, 6);
+ insn_buf[2] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_2, 3);
+ insn_buf[3] = BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_1);
+ insn_buf[4] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_2, 0);
+ insn_buf[5] = BPF_STX_MEM(BPF_DW, BPF_REG_3, BPF_REG_0, 0);
+ insn_buf[6] = BPF_MOV64_IMM(BPF_REG_0, 0);
+ insn_buf[7] = BPF_JMP_A(1);
+ insn_buf[8] = BPF_MOV64_IMM(BPF_REG_0, -EINVAL);
+ cnt = 9;
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ continue;
+ }
+
+ /* Implement bpf_get_func_ret inline. */
+ if (prog_type == BPF_PROG_TYPE_TRACING &&
+ insn->imm == BPF_FUNC_get_func_ret) {
+ if (eatype == BPF_TRACE_FEXIT ||
+ eatype == BPF_MODIFY_RETURN) {
+ /* Load nr_args from ctx - 8 */
+ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8);
+ insn_buf[1] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_0, 3);
+ insn_buf[2] = BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1);
+ insn_buf[3] = BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_0, 0);
+ insn_buf[4] = BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_3, 0);
+ insn_buf[5] = BPF_MOV64_IMM(BPF_REG_0, 0);
+ cnt = 6;
+ } else {
+ insn_buf[0] = BPF_MOV64_IMM(BPF_REG_0, -EOPNOTSUPP);
+ cnt = 1;
+ }
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ continue;
+ }
+
+ /* Implement get_func_arg_cnt inline. */
+ if (prog_type == BPF_PROG_TYPE_TRACING &&
+ insn->imm == BPF_FUNC_get_func_arg_cnt) {
+ /* Load nr_args from ctx - 8 */
+ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8);
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, 1);
+ if (!new_prog)
+ return -ENOMEM;
+
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ continue;
+ }
+
+ /* Implement bpf_get_func_ip inline. */
+ if (prog_type == BPF_PROG_TYPE_TRACING &&
+ insn->imm == BPF_FUNC_get_func_ip) {
+ /* Load IP address from ctx - 16 */
+ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -16);
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, 1);
+ if (!new_prog)
+ return -ENOMEM;
+
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ continue;
+ }
+
patch_call_imm:
fn = env->ops->get_func_proto(insn->imm, env->prog);
/* all functions that have prototype and verifier allowed
@@ -12715,7 +17976,143 @@ patch_call_imm:
}
}
- sort_kfunc_descs_by_imm(env->prog);
+ sort_kfunc_descs_by_imm_off(env->prog);
+
+ return 0;
+}
+
+static struct bpf_prog *inline_bpf_loop(struct bpf_verifier_env *env,
+ int position,
+ s32 stack_base,
+ u32 callback_subprogno,
+ u32 *cnt)
+{
+ s32 r6_offset = stack_base + 0 * BPF_REG_SIZE;
+ s32 r7_offset = stack_base + 1 * BPF_REG_SIZE;
+ s32 r8_offset = stack_base + 2 * BPF_REG_SIZE;
+ int reg_loop_max = BPF_REG_6;
+ int reg_loop_cnt = BPF_REG_7;
+ int reg_loop_ctx = BPF_REG_8;
+
+ struct bpf_prog *new_prog;
+ u32 callback_start;
+ u32 call_insn_offset;
+ s32 callback_offset;
+
+ /* This represents an inlined version of bpf_iter.c:bpf_loop,
+ * be careful to modify this code in sync.
+ */
+ struct bpf_insn insn_buf[] = {
+ /* Return error and jump to the end of the patch if
+ * expected number of iterations is too big.
+ */
+ BPF_JMP_IMM(BPF_JLE, BPF_REG_1, BPF_MAX_LOOPS, 2),
+ BPF_MOV32_IMM(BPF_REG_0, -E2BIG),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 16),
+ /* spill R6, R7, R8 to use these as loop vars */
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_6, r6_offset),
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_7, r7_offset),
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_8, r8_offset),
+ /* initialize loop vars */
+ BPF_MOV64_REG(reg_loop_max, BPF_REG_1),
+ BPF_MOV32_IMM(reg_loop_cnt, 0),
+ BPF_MOV64_REG(reg_loop_ctx, BPF_REG_3),
+ /* loop header,
+ * if reg_loop_cnt >= reg_loop_max skip the loop body
+ */
+ BPF_JMP_REG(BPF_JGE, reg_loop_cnt, reg_loop_max, 5),
+ /* callback call,
+ * correct callback offset would be set after patching
+ */
+ BPF_MOV64_REG(BPF_REG_1, reg_loop_cnt),
+ BPF_MOV64_REG(BPF_REG_2, reg_loop_ctx),
+ BPF_CALL_REL(0),
+ /* increment loop counter */
+ BPF_ALU64_IMM(BPF_ADD, reg_loop_cnt, 1),
+ /* jump to loop header if callback returned 0 */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, -6),
+ /* return value of bpf_loop,
+ * set R0 to the number of iterations
+ */
+ BPF_MOV64_REG(BPF_REG_0, reg_loop_cnt),
+ /* restore original values of R6, R7, R8 */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_10, r6_offset),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_10, r7_offset),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_8, BPF_REG_10, r8_offset),
+ };
+
+ *cnt = ARRAY_SIZE(insn_buf);
+ new_prog = bpf_patch_insn_data(env, position, insn_buf, *cnt);
+ if (!new_prog)
+ return new_prog;
+
+ /* callback start is known only after patching */
+ callback_start = env->subprog_info[callback_subprogno].start;
+ /* Note: insn_buf[12] is an offset of BPF_CALL_REL instruction */
+ call_insn_offset = position + 12;
+ callback_offset = callback_start - call_insn_offset - 1;
+ new_prog->insnsi[call_insn_offset].imm = callback_offset;
+
+ return new_prog;
+}
+
+static bool is_bpf_loop_call(struct bpf_insn *insn)
+{
+ return insn->code == (BPF_JMP | BPF_CALL) &&
+ insn->src_reg == 0 &&
+ insn->imm == BPF_FUNC_loop;
+}
+
+/* For all sub-programs in the program (including main) check
+ * insn_aux_data to see if there are bpf_loop calls that require
+ * inlining. If such calls are found the calls are replaced with a
+ * sequence of instructions produced by `inline_bpf_loop` function and
+ * subprog stack_depth is increased by the size of 3 registers.
+ * This stack space is used to spill values of the R6, R7, R8. These
+ * registers are used to store the loop bound, counter and context
+ * variables.
+ */
+static int optimize_bpf_loop(struct bpf_verifier_env *env)
+{
+ struct bpf_subprog_info *subprogs = env->subprog_info;
+ int i, cur_subprog = 0, cnt, delta = 0;
+ struct bpf_insn *insn = env->prog->insnsi;
+ int insn_cnt = env->prog->len;
+ u16 stack_depth = subprogs[cur_subprog].stack_depth;
+ u16 stack_depth_roundup = round_up(stack_depth, 8) - stack_depth;
+ u16 stack_depth_extra = 0;
+
+ for (i = 0; i < insn_cnt; i++, insn++) {
+ struct bpf_loop_inline_state *inline_state =
+ &env->insn_aux_data[i + delta].loop_inline_state;
+
+ if (is_bpf_loop_call(insn) && inline_state->fit_for_inline) {
+ struct bpf_prog *new_prog;
+
+ stack_depth_extra = BPF_REG_SIZE * 3 + stack_depth_roundup;
+ new_prog = inline_bpf_loop(env,
+ i + delta,
+ -(stack_depth + stack_depth_extra),
+ inline_state->callback_subprogno,
+ &cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ }
+
+ if (subprogs[cur_subprog + 1].start == i + delta + 1) {
+ subprogs[cur_subprog].stack_depth += stack_depth_extra;
+ cur_subprog++;
+ stack_depth = subprogs[cur_subprog].stack_depth;
+ stack_depth_roundup = round_up(stack_depth, 8) - stack_depth;
+ stack_depth_extra = 0;
+ }
+ }
+
+ env->prog->aux->stack_depth = env->subprog_info[0].stack_depth;
return 0;
}
@@ -12750,37 +18147,6 @@ static void free_states(struct bpf_verifier_env *env)
}
}
-/* The verifier is using insn_aux_data[] to store temporary data during
- * verification and to store information for passes that run after the
- * verification like dead code sanitization. do_check_common() for subprogram N
- * may analyze many other subprograms. sanitize_insn_aux_data() clears all
- * temporary data after do_check_common() finds that subprogram N cannot be
- * verified independently. pass_cnt counts the number of times
- * do_check_common() was run and insn->aux->seen tells the pass number
- * insn_aux_data was touched. These variables are compared to clear temporary
- * data from failed pass. For testing and experiments do_check_common() can be
- * run multiple times even when prior attempt to verify is unsuccessful.
- *
- * Note that special handling is needed on !env->bypass_spec_v1 if this is
- * ever called outside of error path with subsequent program rejection.
- */
-static void sanitize_insn_aux_data(struct bpf_verifier_env *env)
-{
- struct bpf_insn *insn = env->prog->insnsi;
- struct bpf_insn_aux_data *aux;
- int i, class;
-
- for (i = 0; i < env->prog->len; i++) {
- class = BPF_CLASS(insn[i].code);
- if (class != BPF_LDX && class != BPF_STX)
- continue;
- aux = &env->insn_aux_data[i];
- if (aux->seen != env->pass_cnt)
- continue;
- memset(aux, 0, offsetof(typeof(*aux), orig_idx));
- }
-}
-
static int do_check_common(struct bpf_verifier_env *env, int subprog)
{
bool pop_log = !(env->log.level & BPF_LOG_LEVEL2);
@@ -12807,6 +18173,8 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog)
BPF_MAIN_FUNC /* callsite */,
0 /* frameno */,
subprog);
+ state->first_insn_idx = env->subprog_info[subprog].start;
+ state->last_insn_idx = -1;
regs = state->frame[state->curframe]->regs;
if (subprog || env->prog->type == BPF_PROG_TYPE_EXT) {
@@ -12818,7 +18186,7 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog)
mark_reg_known_zero(env, regs, i);
else if (regs[i].type == SCALAR_VALUE)
mark_reg_unknown(env, regs, i);
- else if (regs[i].type == PTR_TO_MEM_OR_NULL) {
+ else if (base_type(regs[i].type) == PTR_TO_MEM) {
const u32 mem_size = regs[i].mem_size;
mark_reg_known_zero(env, regs, i);
@@ -12857,9 +18225,6 @@ out:
if (!ret && pop_log)
bpf_vlog_reset(&env->log, 0);
free_states(env);
- if (ret)
- /* clean aux data in case subprog was rejected */
- sanitize_insn_aux_data(env);
return ret;
}
@@ -12982,7 +18347,7 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env)
}
if (st_ops->check_member) {
- int err = st_ops->check_member(t, member);
+ int err = st_ops->check_member(t, member, prog);
if (err) {
verbose(env, "attach to unsupported member %s of struct %s\n",
@@ -13015,7 +18380,7 @@ BTF_SET_START(btf_non_sleepable_error_inject)
/* Three functions below can be called from sleepable and non-sleepable context.
* Assume non-sleepable from bpf safety point of view.
*/
-BTF_ID(func, __add_to_page_cache_locked)
+BTF_ID(func, __filemap_add_folio)
BTF_ID(func, should_fail_alloc_page)
BTF_ID(func, should_failslab)
BTF_SET_END(btf_non_sleepable_error_inject)
@@ -13039,6 +18404,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
const char *tname;
struct btf *btf;
long addr = 0;
+ struct module *mod = NULL;
if (!btf_id) {
bpf_log(log, "Tracing programs must provide btf_id\n");
@@ -13063,6 +18429,12 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
if (tgt_prog) {
struct bpf_prog_aux *aux = tgt_prog->aux;
+ if (bpf_prog_is_dev_bound(prog->aux) &&
+ !bpf_prog_dev_bound_match(prog, tgt_prog)) {
+ bpf_log(log, "Target program bound device mismatch");
+ return -EINVAL;
+ }
+
for (i = 0; i < aux->func_info_cnt; i++)
if (aux->func_info[i].type_id == btf_id) {
subprog = i;
@@ -13173,6 +18545,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
fallthrough;
case BPF_MODIFY_RETURN:
case BPF_LSM_MAC:
+ case BPF_LSM_CGROUP:
case BPF_TRACE_FENTRY:
case BPF_TRACE_FEXIT:
if (!btf_type_is_func(t)) {
@@ -13205,8 +18578,17 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
else
addr = (long) tgt_prog->aux->func[subprog]->bpf_func;
} else {
- addr = kallsyms_lookup_name(tname);
+ if (btf_is_module(btf)) {
+ mod = btf_try_get_module(btf);
+ if (mod)
+ addr = find_kallsyms_symbol_value(mod, tname);
+ else
+ addr = 0;
+ } else {
+ addr = kallsyms_lookup_name(tname);
+ }
if (!addr) {
+ module_put(mod);
bpf_log(log,
"The address of function %s cannot be found\n",
tname);
@@ -13218,12 +18600,22 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
ret = -EINVAL;
switch (prog->type) {
case BPF_PROG_TYPE_TRACING:
- /* fentry/fexit/fmod_ret progs can be sleepable only if they are
+
+ /* fentry/fexit/fmod_ret progs can be sleepable if they are
* attached to ALLOW_ERROR_INJECTION and are not in denylist.
*/
if (!check_non_sleepable_error_inject(btf_id) &&
within_error_injection_list(addr))
ret = 0;
+ /* fentry/fexit/fmod_ret progs can also be sleepable if they are
+ * in the fmodret id set with the KF_SLEEPABLE flag.
+ */
+ else {
+ u32 *flags = btf_kfunc_is_modify_return(btf, btf_id);
+
+ if (flags && (*flags & KF_SLEEPABLE))
+ ret = 0;
+ }
break;
case BPF_PROG_TYPE_LSM:
/* LSM progs check that they are attached to bpf_lsm_*() funcs.
@@ -13236,16 +18628,22 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
break;
}
if (ret) {
+ module_put(mod);
bpf_log(log, "%s is not sleepable\n", tname);
return ret;
}
} else if (prog->expected_attach_type == BPF_MODIFY_RETURN) {
if (tgt_prog) {
+ module_put(mod);
bpf_log(log, "can't modify return codes of BPF programs\n");
return -EINVAL;
}
- ret = check_attach_modify_return(addr, tname);
+ ret = -EINVAL;
+ if (btf_kfunc_is_modify_return(btf, btf_id) ||
+ !check_attach_modify_return(addr, tname))
+ ret = 0;
if (ret) {
+ module_put(mod);
bpf_log(log, "%s() is not modifiable\n", tname);
return ret;
}
@@ -13256,6 +18654,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
tgt_info->tgt_addr = addr;
tgt_info->tgt_name = tname;
tgt_info->tgt_type = t;
+ tgt_info->tgt_mod = mod;
return 0;
}
@@ -13268,8 +18667,34 @@ BTF_ID(func, migrate_enable)
#if !defined CONFIG_PREEMPT_RCU && !defined CONFIG_TINY_RCU
BTF_ID(func, rcu_read_unlock_strict)
#endif
+#if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_TRACE_PREEMPT_TOGGLE)
+BTF_ID(func, preempt_count_add)
+BTF_ID(func, preempt_count_sub)
+#endif
+#ifdef CONFIG_PREEMPT_RCU
+BTF_ID(func, __rcu_read_lock)
+BTF_ID(func, __rcu_read_unlock)
+#endif
BTF_SET_END(btf_id_deny)
+static bool can_be_sleepable(struct bpf_prog *prog)
+{
+ if (prog->type == BPF_PROG_TYPE_TRACING) {
+ switch (prog->expected_attach_type) {
+ case BPF_TRACE_FENTRY:
+ case BPF_TRACE_FEXIT:
+ case BPF_MODIFY_RETURN:
+ case BPF_TRACE_ITER:
+ return true;
+ default:
+ return false;
+ }
+ }
+ return prog->type == BPF_PROG_TYPE_LSM ||
+ prog->type == BPF_PROG_TYPE_KPROBE /* only for uprobes */ ||
+ prog->type == BPF_PROG_TYPE_STRUCT_OPS;
+}
+
static int check_attach_btf_id(struct bpf_verifier_env *env)
{
struct bpf_prog *prog = env->prog;
@@ -13288,9 +18713,8 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
return -EINVAL;
}
- if (prog->aux->sleepable && prog->type != BPF_PROG_TYPE_TRACING &&
- prog->type != BPF_PROG_TYPE_LSM) {
- verbose(env, "Only fentry/fexit/fmod_ret and lsm programs can be sleepable\n");
+ if (prog->aux->sleepable && !can_be_sleepable(prog)) {
+ verbose(env, "Only fentry/fexit/fmod_ret, lsm, iter, uprobe, and struct_ops programs can be sleepable\n");
return -EINVAL;
}
@@ -13318,6 +18742,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
/* store info about the attachment target that will be used later */
prog->aux->attach_func_proto = tgt_info.tgt_type;
prog->aux->attach_func_name = tgt_info.tgt_name;
+ prog->aux->mod = tgt_info.tgt_mod;
if (tgt_prog) {
prog->aux->saved_dst_prog_type = tgt_prog->type;
@@ -13362,12 +18787,12 @@ struct btf *bpf_get_btf_vmlinux(void)
return btf_vmlinux;
}
-int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr)
+int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size)
{
u64 start_time = ktime_get_ns();
struct bpf_verifier_env *env;
- struct bpf_verifier_log *log;
- int i, len, ret = -EINVAL;
+ int i, len, ret = -EINVAL, err;
+ u32 log_true_size;
bool is_priv;
/* no program is valid */
@@ -13380,7 +18805,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr)
env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL);
if (!env)
return -ENOMEM;
- log = &env->log;
len = (*prog)->len;
env->insn_aux_data =
@@ -13401,20 +18825,16 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr)
if (!is_priv)
mutex_lock(&bpf_verifier_lock);
- if (attr->log_level || attr->log_buf || attr->log_size) {
- /* user requested verbose verifier output
- * and supplied buffer to store the verification trace
- */
- log->level = attr->log_level;
- log->ubuf = (char __user *) (unsigned long) attr->log_buf;
- log->len_total = attr->log_size;
+ /* user could have requested verbose verifier output
+ * and supplied buffer to store the verification trace
+ */
+ ret = bpf_vlog_init(&env->log, attr->log_level,
+ (char __user *) (unsigned long) attr->log_buf,
+ attr->log_size);
+ if (ret)
+ goto err_unlock;
- ret = -EINVAL;
- /* log attributes have to be sane */
- if (log->len_total < 128 || log->len_total > UINT_MAX >> 2 ||
- !log->level || !log->ubuf || log->level & ~BPF_LOG_MASK)
- goto err_unlock;
- }
+ mark_verifier_state_clean(env);
if (IS_ERR(btf_vmlinux)) {
/* Either gcc or pahole or kernel are broken. */
@@ -13431,7 +18851,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr)
env->allow_ptr_leaks = bpf_allow_ptr_leaks();
env->allow_uninit_stack = bpf_allow_uninit_stack();
- env->allow_ptr_to_map_access = bpf_allow_ptr_to_map_access();
env->bypass_spec_v1 = bpf_bypass_spec_v1();
env->bypass_spec_v4 = bpf_bypass_spec_v4();
env->bpf_capable = bpf_capable();
@@ -13466,7 +18885,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr)
if (ret < 0)
goto skip_full_check;
- if (bpf_prog_is_dev_bound(env->prog->aux)) {
+ if (bpf_prog_is_offloaded(env->prog->aux)) {
ret = bpf_prog_offload_verifier_prep(env->prog);
if (ret)
goto skip_full_check;
@@ -13479,7 +18898,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr)
ret = do_check_subprogs(env);
ret = ret ?: do_check_main(env);
- if (ret == 0 && bpf_prog_is_dev_bound(env->prog->aux))
+ if (ret == 0 && bpf_prog_is_offloaded(env->prog->aux))
ret = bpf_prog_offload_finalize(env);
skip_full_check:
@@ -13489,6 +18908,9 @@ skip_full_check:
ret = check_max_stack_depth(env);
/* instruction rewrites happen after this point */
+ if (ret == 0)
+ ret = optimize_bpf_loop(env);
+
if (is_priv) {
if (ret == 0)
opt_hard_wire_dead_code_branches(env);
@@ -13511,7 +18933,7 @@ skip_full_check:
/* do 32-bit optimization after insn patching has done so those patched
* insns could be handled correctly.
*/
- if (ret == 0 && !bpf_prog_is_dev_bound(env->prog->aux)) {
+ if (ret == 0 && !bpf_prog_is_offloaded(env->prog->aux)) {
ret = opt_subreg_zext_lo32_rnd_hi32(env, attr);
env->prog->aux->verifier_zext = bpf_jit_needs_zext() ? !ret
: false;
@@ -13522,10 +18944,16 @@ skip_full_check:
env->verification_time = ktime_get_ns() - start_time;
print_verification_stats(env);
+ env->prog->aux->verified_insns = env->insn_processed;
+
+ /* preserve original error even if log finalization is successful */
+ err = bpf_vlog_finalize(&env->log, &log_true_size);
+ if (err)
+ ret = err;
- if (log->level && bpf_verifier_log_full(log))
- ret = -ENOSPC;
- if (log->level && !log->ubuf) {
+ if (uattr_size >= offsetofend(union bpf_attr, log_true_size) &&
+ copy_to_bpfptr_offset(uattr, offsetof(union bpf_attr, log_true_size),
+ &log_true_size, sizeof(log_true_size))) {
ret = -EFAULT;
goto err_release_maps;
}
diff --git a/kernel/capability.c b/kernel/capability.c
index 46a361dde042..3e058f41df32 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -20,13 +20,6 @@
#include <linux/user_namespace.h>
#include <linux/uaccess.h>
-/*
- * Leveraged for setting/resetting capabilities
- */
-
-const kernel_cap_t __cap_empty_set = CAP_EMPTY_SET;
-EXPORT_SYMBOL(__cap_empty_set);
-
int file_caps_enabled = 1;
static int __init file_caps_disable(char *str)
@@ -151,6 +144,7 @@ SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr)
pid_t pid;
unsigned tocopy;
kernel_cap_t pE, pI, pP;
+ struct __user_cap_data_struct kdata[2];
ret = cap_validate_magic(header, &tocopy);
if ((dataptr == NULL) || (ret != 0))
@@ -163,42 +157,46 @@ SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr)
return -EINVAL;
ret = cap_get_target_pid(pid, &pE, &pI, &pP);
- if (!ret) {
- struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S];
- unsigned i;
-
- for (i = 0; i < tocopy; i++) {
- kdata[i].effective = pE.cap[i];
- kdata[i].permitted = pP.cap[i];
- kdata[i].inheritable = pI.cap[i];
- }
-
- /*
- * Note, in the case, tocopy < _KERNEL_CAPABILITY_U32S,
- * we silently drop the upper capabilities here. This
- * has the effect of making older libcap
- * implementations implicitly drop upper capability
- * bits when they perform a: capget/modify/capset
- * sequence.
- *
- * This behavior is considered fail-safe
- * behavior. Upgrading the application to a newer
- * version of libcap will enable access to the newer
- * capabilities.
- *
- * An alternative would be to return an error here
- * (-ERANGE), but that causes legacy applications to
- * unexpectedly fail; the capget/modify/capset aborts
- * before modification is attempted and the application
- * fails.
- */
- if (copy_to_user(dataptr, kdata, tocopy
- * sizeof(struct __user_cap_data_struct))) {
- return -EFAULT;
- }
- }
+ if (ret)
+ return ret;
- return ret;
+ /*
+ * Annoying legacy format with 64-bit capabilities exposed
+ * as two sets of 32-bit fields, so we need to split the
+ * capability values up.
+ */
+ kdata[0].effective = pE.val; kdata[1].effective = pE.val >> 32;
+ kdata[0].permitted = pP.val; kdata[1].permitted = pP.val >> 32;
+ kdata[0].inheritable = pI.val; kdata[1].inheritable = pI.val >> 32;
+
+ /*
+ * Note, in the case, tocopy < _KERNEL_CAPABILITY_U32S,
+ * we silently drop the upper capabilities here. This
+ * has the effect of making older libcap
+ * implementations implicitly drop upper capability
+ * bits when they perform a: capget/modify/capset
+ * sequence.
+ *
+ * This behavior is considered fail-safe
+ * behavior. Upgrading the application to a newer
+ * version of libcap will enable access to the newer
+ * capabilities.
+ *
+ * An alternative would be to return an error here
+ * (-ERANGE), but that causes legacy applications to
+ * unexpectedly fail; the capget/modify/capset aborts
+ * before modification is attempted and the application
+ * fails.
+ */
+ if (copy_to_user(dataptr, kdata, tocopy * sizeof(kdata[0])))
+ return -EFAULT;
+
+ return 0;
+}
+
+static kernel_cap_t mk_kernel_cap(u32 low, u32 high)
+{
+ return (kernel_cap_t) { (low | ((u64)high << 32)) & CAP_VALID_MASK };
}
/**
@@ -221,8 +219,8 @@ SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr)
*/
SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data)
{
- struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S];
- unsigned i, tocopy, copybytes;
+ struct __user_cap_data_struct kdata[2] = { { 0, }, };
+ unsigned tocopy, copybytes;
kernel_cap_t inheritable, permitted, effective;
struct cred *new;
int ret;
@@ -246,21 +244,9 @@ SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data)
if (copy_from_user(&kdata, data, copybytes))
return -EFAULT;
- for (i = 0; i < tocopy; i++) {
- effective.cap[i] = kdata[i].effective;
- permitted.cap[i] = kdata[i].permitted;
- inheritable.cap[i] = kdata[i].inheritable;
- }
- while (i < _KERNEL_CAPABILITY_U32S) {
- effective.cap[i] = 0;
- permitted.cap[i] = 0;
- inheritable.cap[i] = 0;
- i++;
- }
-
- effective.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
- permitted.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
- inheritable.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
+ effective = mk_kernel_cap(kdata[0].effective, kdata[1].effective);
+ permitted = mk_kernel_cap(kdata[0].permitted, kdata[1].permitted);
+ inheritable = mk_kernel_cap(kdata[0].inheritable, kdata[1].inheritable);
new = prepare_creds();
if (!new)
@@ -360,6 +346,7 @@ bool has_capability_noaudit(struct task_struct *t, int cap)
{
return has_ns_capability_noaudit(t, &init_user_ns, cap);
}
+EXPORT_SYMBOL(has_capability_noaudit);
static bool ns_capable_common(struct user_namespace *ns,
int cap,
@@ -485,11 +472,11 @@ EXPORT_SYMBOL(file_ns_capable);
* Return true if the inode uid and gid are within the namespace.
*/
bool privileged_wrt_inode_uidgid(struct user_namespace *ns,
- struct user_namespace *mnt_userns,
+ struct mnt_idmap *idmap,
const struct inode *inode)
{
- return kuid_has_mapping(ns, i_uid_into_mnt(mnt_userns, inode)) &&
- kgid_has_mapping(ns, i_gid_into_mnt(mnt_userns, inode));
+ return vfsuid_has_mapping(ns, i_uid_into_vfsuid(idmap, inode)) &&
+ vfsgid_has_mapping(ns, i_gid_into_vfsgid(idmap, inode));
}
/**
@@ -501,13 +488,13 @@ bool privileged_wrt_inode_uidgid(struct user_namespace *ns,
* its own user namespace and that the given inode's uid and gid are
* mapped into the current user namespace.
*/
-bool capable_wrt_inode_uidgid(struct user_namespace *mnt_userns,
+bool capable_wrt_inode_uidgid(struct mnt_idmap *idmap,
const struct inode *inode, int cap)
{
struct user_namespace *ns = current_user_ns();
return ns_capable(ns, cap) &&
- privileged_wrt_inode_uidgid(ns, mnt_userns, inode);
+ privileged_wrt_inode_uidgid(ns, idmap, inode);
}
EXPORT_SYMBOL(capable_wrt_inode_uidgid);
diff --git a/kernel/cfi.c b/kernel/cfi.c
index e17a56639766..08caad776717 100644
--- a/kernel/cfi.c
+++ b/kernel/cfi.c
@@ -1,329 +1,101 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * Clang Control Flow Integrity (CFI) error and slowpath handling.
+ * Clang Control Flow Integrity (CFI) error handling.
*
- * Copyright (C) 2021 Google LLC
+ * Copyright (C) 2022 Google LLC
*/
-#include <linux/hardirq.h>
-#include <linux/kallsyms.h>
-#include <linux/module.h>
-#include <linux/mutex.h>
-#include <linux/printk.h>
-#include <linux/ratelimit.h>
-#include <linux/rcupdate.h>
-#include <linux/vmalloc.h>
-#include <asm/cacheflush.h>
-#include <asm/set_memory.h>
+#include <linux/cfi.h>
-/* Compiler-defined handler names */
-#ifdef CONFIG_CFI_PERMISSIVE
-#define cfi_failure_handler __ubsan_handle_cfi_check_fail
-#else
-#define cfi_failure_handler __ubsan_handle_cfi_check_fail_abort
-#endif
-
-static inline void handle_cfi_failure(void *ptr)
+enum bug_trap_type report_cfi_failure(struct pt_regs *regs, unsigned long addr,
+ unsigned long *target, u32 type)
{
- if (IS_ENABLED(CONFIG_CFI_PERMISSIVE))
- WARN_RATELIMIT(1, "CFI failure (target: %pS):\n", ptr);
+ if (target)
+ pr_err("CFI failure at %pS (target: %pS; expected type: 0x%08x)\n",
+ (void *)addr, (void *)*target, type);
else
- panic("CFI failure (target: %pS)\n", ptr);
-}
-
-#ifdef CONFIG_MODULES
-#ifdef CONFIG_CFI_CLANG_SHADOW
-/*
- * Index type. A 16-bit index can address at most (2^16)-2 pages (taking
- * into account SHADOW_INVALID), i.e. ~256M with 4k pages.
- */
-typedef u16 shadow_t;
-#define SHADOW_INVALID ((shadow_t)~0UL)
-
-struct cfi_shadow {
- /* Page index for the beginning of the shadow */
- unsigned long base;
- /* An array of __cfi_check locations (as indices to the shadow) */
- shadow_t shadow[1];
-} __packed;
-
-/*
- * The shadow covers ~128M from the beginning of the module region. If
- * the region is larger, we fall back to __module_address for the rest.
- */
-#define __SHADOW_RANGE (_UL(SZ_128M) >> PAGE_SHIFT)
-
-/* The in-memory size of struct cfi_shadow, always at least one page */
-#define __SHADOW_PAGES ((__SHADOW_RANGE * sizeof(shadow_t)) >> PAGE_SHIFT)
-#define SHADOW_PAGES max(1UL, __SHADOW_PAGES)
-#define SHADOW_SIZE (SHADOW_PAGES << PAGE_SHIFT)
-
-/* The actual size of the shadow array, minus metadata */
-#define SHADOW_ARR_SIZE (SHADOW_SIZE - offsetof(struct cfi_shadow, shadow))
-#define SHADOW_ARR_SLOTS (SHADOW_ARR_SIZE / sizeof(shadow_t))
-
-static DEFINE_MUTEX(shadow_update_lock);
-static struct cfi_shadow __rcu *cfi_shadow __read_mostly;
-
-/* Returns the index in the shadow for the given address */
-static inline int ptr_to_shadow(const struct cfi_shadow *s, unsigned long ptr)
-{
- unsigned long index;
- unsigned long page = ptr >> PAGE_SHIFT;
-
- if (unlikely(page < s->base))
- return -1; /* Outside of module area */
-
- index = page - s->base;
-
- if (index >= SHADOW_ARR_SLOTS)
- return -1; /* Cannot be addressed with shadow */
-
- return (int)index;
-}
-
-/* Returns the page address for an index in the shadow */
-static inline unsigned long shadow_to_ptr(const struct cfi_shadow *s,
- int index)
-{
- if (unlikely(index < 0 || index >= SHADOW_ARR_SLOTS))
- return 0;
-
- return (s->base + index) << PAGE_SHIFT;
-}
-
-/* Returns the __cfi_check function address for the given shadow location */
-static inline unsigned long shadow_to_check_fn(const struct cfi_shadow *s,
- int index)
-{
- if (unlikely(index < 0 || index >= SHADOW_ARR_SLOTS))
- return 0;
-
- if (unlikely(s->shadow[index] == SHADOW_INVALID))
- return 0;
-
- /* __cfi_check is always page aligned */
- return (s->base + s->shadow[index]) << PAGE_SHIFT;
-}
+ pr_err("CFI failure at %pS (no target information)\n",
+ (void *)addr);
-static void prepare_next_shadow(const struct cfi_shadow __rcu *prev,
- struct cfi_shadow *next)
-{
- int i, index, check;
-
- /* Mark everything invalid */
- memset(next->shadow, 0xFF, SHADOW_ARR_SIZE);
-
- if (!prev)
- return; /* No previous shadow */
-
- /* If the base address didn't change, an update is not needed */
- if (prev->base == next->base) {
- memcpy(next->shadow, prev->shadow, SHADOW_ARR_SIZE);
- return;
+ if (IS_ENABLED(CONFIG_CFI_PERMISSIVE)) {
+ __warn(NULL, 0, (void *)addr, 0, regs, NULL);
+ return BUG_TRAP_TYPE_WARN;
}
- /* Convert the previous shadow to the new address range */
- for (i = 0; i < SHADOW_ARR_SLOTS; ++i) {
- if (prev->shadow[i] == SHADOW_INVALID)
- continue;
-
- index = ptr_to_shadow(next, shadow_to_ptr(prev, i));
- if (index < 0)
- continue;
-
- check = ptr_to_shadow(next,
- shadow_to_check_fn(prev, prev->shadow[i]));
- if (check < 0)
- continue;
-
- next->shadow[index] = (shadow_t)check;
- }
+ return BUG_TRAP_TYPE_BUG;
}
-static void add_module_to_shadow(struct cfi_shadow *s, struct module *mod,
- unsigned long min_addr, unsigned long max_addr)
+#ifdef CONFIG_ARCH_USES_CFI_TRAPS
+static inline unsigned long trap_address(s32 *p)
{
- int check_index;
- unsigned long check = (unsigned long)mod->cfi_check;
- unsigned long ptr;
-
- if (unlikely(!PAGE_ALIGNED(check))) {
- pr_warn("cfi: not using shadow for module %s\n", mod->name);
- return;
- }
-
- check_index = ptr_to_shadow(s, check);
- if (check_index < 0)
- return; /* Module not addressable with shadow */
-
- /* For each page, store the check function index in the shadow */
- for (ptr = min_addr; ptr <= max_addr; ptr += PAGE_SIZE) {
- int index = ptr_to_shadow(s, ptr);
-
- if (index >= 0) {
- /* Each page must only contain one module */
- WARN_ON_ONCE(s->shadow[index] != SHADOW_INVALID);
- s->shadow[index] = (shadow_t)check_index;
- }
- }
+ return (unsigned long)((long)p + (long)*p);
}
-static void remove_module_from_shadow(struct cfi_shadow *s, struct module *mod,
- unsigned long min_addr, unsigned long max_addr)
+static bool is_trap(unsigned long addr, s32 *start, s32 *end)
{
- unsigned long ptr;
+ s32 *p;
- for (ptr = min_addr; ptr <= max_addr; ptr += PAGE_SIZE) {
- int index = ptr_to_shadow(s, ptr);
-
- if (index >= 0)
- s->shadow[index] = SHADOW_INVALID;
- }
-}
-
-typedef void (*update_shadow_fn)(struct cfi_shadow *, struct module *,
- unsigned long min_addr, unsigned long max_addr);
-
-static void update_shadow(struct module *mod, unsigned long base_addr,
- update_shadow_fn fn)
-{
- struct cfi_shadow *prev;
- struct cfi_shadow *next;
- unsigned long min_addr, max_addr;
-
- next = vmalloc(SHADOW_SIZE);
-
- mutex_lock(&shadow_update_lock);
- prev = rcu_dereference_protected(cfi_shadow,
- mutex_is_locked(&shadow_update_lock));
-
- if (next) {
- next->base = base_addr >> PAGE_SHIFT;
- prepare_next_shadow(prev, next);
-
- min_addr = (unsigned long)mod->core_layout.base;
- max_addr = min_addr + mod->core_layout.text_size;
- fn(next, mod, min_addr & PAGE_MASK, max_addr & PAGE_MASK);
-
- set_memory_ro((unsigned long)next, SHADOW_PAGES);
+ for (p = start; p < end; ++p) {
+ if (trap_address(p) == addr)
+ return true;
}
- rcu_assign_pointer(cfi_shadow, next);
- mutex_unlock(&shadow_update_lock);
- synchronize_rcu();
-
- if (prev) {
- set_memory_rw((unsigned long)prev, SHADOW_PAGES);
- vfree(prev);
- }
-}
-
-void cfi_module_add(struct module *mod, unsigned long base_addr)
-{
- update_shadow(mod, base_addr, add_module_to_shadow);
+ return false;
}
-void cfi_module_remove(struct module *mod, unsigned long base_addr)
-{
- update_shadow(mod, base_addr, remove_module_from_shadow);
-}
-
-static inline cfi_check_fn ptr_to_check_fn(const struct cfi_shadow __rcu *s,
- unsigned long ptr)
-{
- int index;
-
- if (unlikely(!s))
- return NULL; /* No shadow available */
-
- index = ptr_to_shadow(s, ptr);
- if (index < 0)
- return NULL; /* Cannot be addressed with shadow */
-
- return (cfi_check_fn)shadow_to_check_fn(s, index);
-}
-
-static inline cfi_check_fn find_shadow_check_fn(unsigned long ptr)
+#ifdef CONFIG_MODULES
+/* Populates `kcfi_trap(_end)?` fields in `struct module`. */
+void module_cfi_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs,
+ struct module *mod)
{
- cfi_check_fn fn;
+ char *secstrings;
+ unsigned int i;
- rcu_read_lock_sched();
- fn = ptr_to_check_fn(rcu_dereference_sched(cfi_shadow), ptr);
- rcu_read_unlock_sched();
+ mod->kcfi_traps = NULL;
+ mod->kcfi_traps_end = NULL;
- return fn;
-}
+ secstrings = (char *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
-#else /* !CONFIG_CFI_CLANG_SHADOW */
+ for (i = 1; i < hdr->e_shnum; i++) {
+ if (strcmp(secstrings + sechdrs[i].sh_name, "__kcfi_traps"))
+ continue;
-static inline cfi_check_fn find_shadow_check_fn(unsigned long ptr)
-{
- return NULL;
+ mod->kcfi_traps = (s32 *)sechdrs[i].sh_addr;
+ mod->kcfi_traps_end = (s32 *)(sechdrs[i].sh_addr + sechdrs[i].sh_size);
+ break;
+ }
}
-#endif /* CONFIG_CFI_CLANG_SHADOW */
-
-static inline cfi_check_fn find_module_check_fn(unsigned long ptr)
+static bool is_module_cfi_trap(unsigned long addr)
{
- cfi_check_fn fn = NULL;
struct module *mod;
+ bool found = false;
- rcu_read_lock_sched();
- mod = __module_address(ptr);
- if (mod)
- fn = mod->cfi_check;
- rcu_read_unlock_sched();
-
- return fn;
-}
-
-static inline cfi_check_fn find_check_fn(unsigned long ptr)
-{
- cfi_check_fn fn = NULL;
-
- if (is_kernel_text(ptr))
- return __cfi_check;
+ rcu_read_lock_sched_notrace();
- /*
- * Indirect call checks can happen when RCU is not watching. Both
- * the shadow and __module_address use RCU, so we need to wake it
- * up if necessary.
- */
- RCU_NONIDLE({
- if (IS_ENABLED(CONFIG_CFI_CLANG_SHADOW))
- fn = find_shadow_check_fn(ptr);
+ mod = __module_address(addr);
+ if (mod)
+ found = is_trap(addr, mod->kcfi_traps, mod->kcfi_traps_end);
- if (!fn)
- fn = find_module_check_fn(ptr);
- });
+ rcu_read_unlock_sched_notrace();
- return fn;
+ return found;
}
-
-void __cfi_slowpath_diag(uint64_t id, void *ptr, void *diag)
+#else /* CONFIG_MODULES */
+static inline bool is_module_cfi_trap(unsigned long addr)
{
- cfi_check_fn fn = find_check_fn((unsigned long)ptr);
-
- if (likely(fn))
- fn(id, ptr, diag);
- else /* Don't allow unchecked modules */
- handle_cfi_failure(ptr);
+ return false;
}
-EXPORT_SYMBOL(__cfi_slowpath_diag);
+#endif /* CONFIG_MODULES */
-#else /* !CONFIG_MODULES */
+extern s32 __start___kcfi_traps[];
+extern s32 __stop___kcfi_traps[];
-void __cfi_slowpath_diag(uint64_t id, void *ptr, void *diag)
+bool is_cfi_trap(unsigned long addr)
{
- handle_cfi_failure(ptr); /* No modules */
-}
-EXPORT_SYMBOL(__cfi_slowpath_diag);
+ if (is_trap(addr, __start___kcfi_traps, __stop___kcfi_traps))
+ return true;
-#endif /* CONFIG_MODULES */
-
-void cfi_failure_handler(void *data, void *ptr, void *vtable)
-{
- handle_cfi_failure(ptr);
+ return is_module_cfi_trap(addr);
}
-EXPORT_SYMBOL(cfi_failure_handler);
+#endif /* CONFIG_ARCH_USES_CFI_TRAPS */
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index bfbeabc17a9d..367b0a42ada9 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -12,7 +12,6 @@
#define TRACE_CGROUP_PATH_LEN 1024
extern spinlock_t trace_cgroup_path_lock;
extern char trace_cgroup_path[TRACE_CGROUP_PATH_LEN];
-extern bool cgroup_debug;
extern void __init enable_debug_cgroup(void);
/*
@@ -65,6 +64,25 @@ static inline struct cgroup_fs_context *cgroup_fc2context(struct fs_context *fc)
return container_of(kfc, struct cgroup_fs_context, kfc);
}
+struct cgroup_pidlist;
+
+struct cgroup_file_ctx {
+ struct cgroup_namespace *ns;
+
+ struct {
+ void *trigger;
+ } psi;
+
+ struct {
+ bool started;
+ struct css_task_iter iter;
+ } procs;
+
+ struct {
+ struct cgroup_pidlist *pidlist;
+ } procs1;
+};
+
/*
* A cgroup can be associated with multiple css_sets as different tasks may
* belong to different cgroups on different hierarchies. In the other
@@ -146,11 +164,9 @@ struct cgroup_mgctx {
#define DEFINE_CGROUP_MGCTX(name) \
struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name)
-extern struct mutex cgroup_mutex;
extern spinlock_t css_set_lock;
extern struct cgroup_subsys *cgroup_subsys[];
extern struct list_head cgroup_roots;
-extern struct file_system_type cgroup_fs_type;
/* iterate across the hierarchies */
#define for_each_root(root) \
@@ -215,6 +231,7 @@ void cgroup_kn_unlock(struct kernfs_node *kn);
int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
struct cgroup_namespace *ns);
+void cgroup_favor_dynmods(struct cgroup_root *root, bool favor);
void cgroup_free_root(struct cgroup_root *root);
void init_cgroup_root(struct cgroup_fs_context *ctx);
int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask);
@@ -231,6 +248,8 @@ int cgroup_migrate(struct task_struct *leader, bool threadgroup,
int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
bool threadgroup);
+void cgroup_attach_lock(bool lock_threadgroup);
+void cgroup_attach_unlock(bool lock_threadgroup);
struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
bool *locked)
__acquires(&cgroup_threadgroup_rwsem);
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 8d6bf56ed77a..aeef06c465ef 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -50,20 +50,19 @@ bool cgroup1_ssid_disabled(int ssid)
* cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
* @from: attach to all cgroups of a given task
* @tsk: the task to be attached
+ *
+ * Return: %0 on success or a negative errno code on failure
*/
int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
{
struct cgroup_root *root;
int retval = 0;
- mutex_lock(&cgroup_mutex);
- percpu_down_write(&cgroup_threadgroup_rwsem);
+ cgroup_lock();
+ cgroup_attach_lock(true);
for_each_root(root) {
struct cgroup *from_cgrp;
- if (root == &cgrp_dfl_root)
- continue;
-
spin_lock_irq(&css_set_lock);
from_cgrp = task_cgroup_from_root(from, root);
spin_unlock_irq(&css_set_lock);
@@ -72,15 +71,15 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
if (retval)
break;
}
- percpu_up_write(&cgroup_threadgroup_rwsem);
- mutex_unlock(&cgroup_mutex);
+ cgroup_attach_unlock(true);
+ cgroup_unlock();
return retval;
}
EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
/**
- * cgroup_trasnsfer_tasks - move tasks from one cgroup to another
+ * cgroup_transfer_tasks - move tasks from one cgroup to another
* @to: cgroup to which the tasks will be moved
* @from: cgroup in which the tasks currently reside
*
@@ -89,6 +88,8 @@ EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
* is guaranteed to be either visible in the source cgroup after the
* parent's migration is complete or put into the target cgroup. No task
* can slip out of migration through forking.
+ *
+ * Return: %0 on success or a negative errno code on failure
*/
int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
{
@@ -105,7 +106,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
if (ret)
return ret;
- mutex_lock(&cgroup_mutex);
+ cgroup_lock();
percpu_down_write(&cgroup_threadgroup_rwsem);
@@ -144,7 +145,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
out_err:
cgroup_migrate_finish(&mgctx);
percpu_up_write(&cgroup_threadgroup_rwsem);
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
return ret;
}
@@ -393,6 +394,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
* next pid to display, if any
*/
struct kernfs_open_file *of = s->private;
+ struct cgroup_file_ctx *ctx = of->priv;
struct cgroup *cgrp = seq_css(s)->cgroup;
struct cgroup_pidlist *l;
enum cgroup_filetype type = seq_cft(s)->private;
@@ -402,25 +404,24 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
mutex_lock(&cgrp->pidlist_mutex);
/*
- * !NULL @of->priv indicates that this isn't the first start()
- * after open. If the matching pidlist is around, we can use that.
- * Look for it. Note that @of->priv can't be used directly. It
- * could already have been destroyed.
+ * !NULL @ctx->procs1.pidlist indicates that this isn't the first
+ * start() after open. If the matching pidlist is around, we can use
+ * that. Look for it. Note that @ctx->procs1.pidlist can't be used
+ * directly. It could already have been destroyed.
*/
- if (of->priv)
- of->priv = cgroup_pidlist_find(cgrp, type);
+ if (ctx->procs1.pidlist)
+ ctx->procs1.pidlist = cgroup_pidlist_find(cgrp, type);
/*
* Either this is the first start() after open or the matching
* pidlist has been destroyed inbetween. Create a new one.
*/
- if (!of->priv) {
- ret = pidlist_array_load(cgrp, type,
- (struct cgroup_pidlist **)&of->priv);
+ if (!ctx->procs1.pidlist) {
+ ret = pidlist_array_load(cgrp, type, &ctx->procs1.pidlist);
if (ret)
return ERR_PTR(ret);
}
- l = of->priv;
+ l = ctx->procs1.pidlist;
if (pid) {
int end = l->length;
@@ -448,7 +449,8 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
static void cgroup_pidlist_stop(struct seq_file *s, void *v)
{
struct kernfs_open_file *of = s->private;
- struct cgroup_pidlist *l = of->priv;
+ struct cgroup_file_ctx *ctx = of->priv;
+ struct cgroup_pidlist *l = ctx->procs1.pidlist;
if (l)
mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,
@@ -459,7 +461,8 @@ static void cgroup_pidlist_stop(struct seq_file *s, void *v)
static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
{
struct kernfs_open_file *of = s->private;
- struct cgroup_pidlist *l = of->priv;
+ struct cgroup_file_ctx *ctx = of->priv;
+ struct cgroup_pidlist *l = ctx->procs1.pidlist;
pid_t *p = v;
pid_t *end = l->list + l->length;
/*
@@ -503,10 +506,11 @@ static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of,
goto out_unlock;
/*
- * Even if we're attaching all tasks in the thread group, we only
- * need to check permissions on one of them.
+ * Even if we're attaching all tasks in the thread group, we only need
+ * to check permissions on one of them. Check permissions using the
+ * credentials from file open to protect against inherited fd attacks.
*/
- cred = current_cred();
+ cred = of->file->f_cred;
tcred = get_task_cred(task);
if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
!uid_eq(cred->euid, tcred->uid) &&
@@ -542,9 +546,19 @@ static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
struct cgroup *cgrp;
+ struct cgroup_file_ctx *ctx;
BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
+ /*
+ * Release agent gets called with all capabilities,
+ * require capabilities to set release agent.
+ */
+ ctx = of->priv;
+ if ((ctx->ns->user_ns != &init_user_ns) ||
+ !file_ns_capable(of->file, &init_user_ns, CAP_SYS_ADMIN))
+ return -EPERM;
+
cgrp = cgroup_kn_lock_live(of->kn, false);
if (!cgrp)
return -ENODEV;
@@ -658,11 +672,9 @@ int proc_cgroupstats_show(struct seq_file *m, void *v)
seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
/*
- * ideally we don't want subsystems moving around while we do this.
- * cgroup_mutex is also necessary to guarantee an atomic snapshot of
- * subsys/hierarchy state.
+ * Grab the subsystems state racily. No need to add avenue to
+ * cgroup_mutex contention.
*/
- mutex_lock(&cgroup_mutex);
for_each_subsys(ss, i)
seq_printf(m, "%s\t%d\t%d\t%d\n",
@@ -670,7 +682,6 @@ int proc_cgroupstats_show(struct seq_file *m, void *v)
atomic_read(&ss->root->nr_cgrps),
cgroup_ssid_enabled(i));
- mutex_unlock(&cgroup_mutex);
return 0;
}
@@ -682,6 +693,8 @@ int proc_cgroupstats_show(struct seq_file *m, void *v)
*
* Build and fill cgroupstats so that taskstats can export it to user
* space.
+ *
+ * Return: %0 on success or a negative errno code on failure
*/
int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
{
@@ -695,8 +708,6 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
kernfs_type(kn) != KERNFS_DIR)
return -EINVAL;
- mutex_lock(&cgroup_mutex);
-
/*
* We aren't being called from kernfs and there's no guarantee on
* @kn->priv's validity. For this and css_tryget_online_from_dir(),
@@ -704,9 +715,8 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
*/
rcu_read_lock();
cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);
- if (!cgrp || cgroup_is_dead(cgrp)) {
+ if (!cgrp || !cgroup_tryget(cgrp)) {
rcu_read_unlock();
- mutex_unlock(&cgroup_mutex);
return -ENOENT;
}
rcu_read_unlock();
@@ -734,7 +744,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
}
css_task_iter_end(&it);
- mutex_unlock(&cgroup_mutex);
+ cgroup_put(cgrp);
return 0;
}
@@ -837,13 +847,13 @@ static int cgroup1_rename(struct kernfs_node *kn, struct kernfs_node *new_parent
kernfs_break_active_protection(new_parent);
kernfs_break_active_protection(kn);
- mutex_lock(&cgroup_mutex);
+ cgroup_lock();
ret = kernfs_rename(kn, new_parent, new_name_str);
if (!ret)
TRACE_CGROUP_PATH(rename, cgrp);
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
kernfs_unbreak_active_protection(kn);
kernfs_unbreak_active_protection(new_parent);
@@ -865,6 +875,8 @@ static int cgroup1_show_options(struct seq_file *seq, struct kernfs_root *kf_roo
seq_puts(seq, ",xattr");
if (root->flags & CGRP_ROOT_CPUSET_V2_MODE)
seq_puts(seq, ",cpuset_v2_mode");
+ if (root->flags & CGRP_ROOT_FAVOR_DYNMODS)
+ seq_puts(seq, ",favordynmods");
spin_lock(&release_agent_path_lock);
if (strlen(root->release_agent_path))
@@ -888,6 +900,8 @@ enum cgroup1_param {
Opt_noprefix,
Opt_release_agent,
Opt_xattr,
+ Opt_favordynmods,
+ Opt_nofavordynmods,
};
const struct fs_parameter_spec cgroup1_fs_parameters[] = {
@@ -899,6 +913,8 @@ const struct fs_parameter_spec cgroup1_fs_parameters[] = {
fsparam_flag ("noprefix", Opt_noprefix),
fsparam_string("release_agent", Opt_release_agent),
fsparam_flag ("xattr", Opt_xattr),
+ fsparam_flag ("favordynmods", Opt_favordynmods),
+ fsparam_flag ("nofavordynmods", Opt_nofavordynmods),
{}
};
@@ -950,10 +966,22 @@ int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param)
case Opt_xattr:
ctx->flags |= CGRP_ROOT_XATTR;
break;
+ case Opt_favordynmods:
+ ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS;
+ break;
+ case Opt_nofavordynmods:
+ ctx->flags &= ~CGRP_ROOT_FAVOR_DYNMODS;
+ break;
case Opt_release_agent:
/* Specifying two release agents is forbidden */
if (ctx->release_agent)
return invalfc(fc, "release_agent respecified");
+ /*
+ * Release agent gets called with all capabilities,
+ * require capabilities to set release agent.
+ */
+ if ((fc->user_ns != &init_user_ns) || !capable(CAP_SYS_ADMIN))
+ return invalfc(fc, "Setting release_agent not allowed");
ctx->release_agent = param->string;
param->string = NULL;
break;
@@ -1091,7 +1119,7 @@ int cgroup1_reconfigure(struct fs_context *fc)
trace_cgroup_remount(root);
out_unlock:
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
return ret;
}
@@ -1195,8 +1223,11 @@ static int cgroup1_root_to_use(struct fs_context *fc)
init_cgroup_root(ctx);
ret = cgroup_setup_root(root, ctx->subsys_mask);
- if (ret)
+ if (!ret)
+ cgroup_favor_dynmods(root, ctx->flags & CGRP_ROOT_FAVOR_DYNMODS);
+ else
cgroup_free_root(root);
+
return ret;
}
@@ -1215,15 +1246,13 @@ int cgroup1_get_tree(struct fs_context *fc)
if (!ret && !percpu_ref_tryget_live(&ctx->root->cgrp.self.refcnt))
ret = 1; /* restart */
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
if (!ret)
ret = cgroup_do_get_tree(fc);
if (!ret && percpu_ref_is_dying(&ctx->root->cgrp.self.refcnt)) {
- struct super_block *sb = fc->root->d_sb;
- dput(fc->root);
- deactivate_locked_super(sb);
+ fc_drop_locked(fc);
ret = 1;
}
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 3a0161c21b6b..625d7483951c 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -30,6 +30,7 @@
#include "cgroup-internal.h"
+#include <linux/bpf-cgroup.h>
#include <linux/cred.h>
#include <linux/errno.h>
#include <linux/init_task.h>
@@ -68,6 +69,14 @@
#define CGROUP_FILE_NOTIFY_MIN_INTV DIV_ROUND_UP(HZ, 100)
/*
+ * To avoid confusing the compiler (and generating warnings) with code
+ * that attempts to access what would be a 0-element array (i.e. sized
+ * to a potentially empty array when CGROUP_SUBSYS_COUNT == 0), this
+ * constant expression can be added.
+ */
+#define CGROUP_HAS_SUBSYS_CONFIG (CGROUP_SUBSYS_COUNT > 0)
+
+/*
* cgroup_mutex is the master lock. Any modification to cgroup or its
* hierarchy must be performed while holding it.
*
@@ -87,7 +96,7 @@ EXPORT_SYMBOL_GPL(css_set_lock);
DEFINE_SPINLOCK(trace_cgroup_path_lock);
char trace_cgroup_path[TRACE_CGROUP_PATH_LEN];
-bool cgroup_debug __read_mostly;
+static bool cgroup_debug __read_mostly;
/*
* Protects cgroup_idr and css_idr so that IDs can be released without
@@ -208,6 +217,7 @@ struct cgroup_namespace init_cgroup_ns = {
static struct file_system_type cgroup2_fs_type;
static struct cftype cgroup_base_files[];
+static struct cftype cgroup_psi_files[];
/* cgroup optional features */
enum cgroup_opt_features {
@@ -238,6 +248,12 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css,
struct cgroup *cgrp, struct cftype cfts[],
bool is_add);
+#ifdef CONFIG_DEBUG_CGROUP_REF
+#define CGROUP_REF_FN_ATTRS noinline
+#define CGROUP_REF_EXPORT(fn) EXPORT_SYMBOL_GPL(fn);
+#include <linux/cgroup_refcnt.h>
+#endif
+
/**
* cgroup_ssid_enabled - cgroup subsys enabled test by subsys ID
* @ssid: subsys ID of interest
@@ -248,7 +264,7 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css,
*/
bool cgroup_ssid_enabled(int ssid)
{
- if (CGROUP_SUBSYS_COUNT == 0)
+ if (!CGROUP_HAS_SUBSYS_CONFIG)
return false;
return static_key_enabled(cgroup_subsys_enabled_key[ssid]);
@@ -270,8 +286,6 @@ bool cgroup_ssid_enabled(int ssid)
*
* - When mounting an existing superblock, mount options should match.
*
- * - Remount is disallowed.
- *
* - rename(2) is disallowed.
*
* - "tasks" is removed. Everything should be at process granularity. Use
@@ -472,7 +486,7 @@ static u16 cgroup_ss_mask(struct cgroup *cgrp)
static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
struct cgroup_subsys *ss)
{
- if (ss)
+ if (CGROUP_HAS_SUBSYS_CONFIG && ss)
return rcu_dereference_check(cgrp->subsys[ss->id],
lockdep_is_held(&cgroup_mutex));
else
@@ -550,6 +564,9 @@ struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
{
struct cgroup_subsys_state *css;
+ if (!CGROUP_HAS_SUBSYS_CONFIG)
+ return NULL;
+
do {
css = cgroup_css(cgrp, ss);
@@ -577,6 +594,9 @@ struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgrp,
{
struct cgroup_subsys_state *css;
+ if (!CGROUP_HAS_SUBSYS_CONFIG)
+ return NULL;
+
rcu_read_lock();
do {
@@ -647,7 +667,7 @@ struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
* the matching css from the cgroup's subsys table is guaranteed to
* be and stay valid until the enclosing operation is complete.
*/
- if (cft->ss)
+ if (CGROUP_HAS_SUBSYS_CONFIG && cft->ss)
return rcu_dereference_raw(cgrp->subsys[cft->ss->id]);
else
return &cgrp->self;
@@ -695,7 +715,7 @@ EXPORT_SYMBOL_GPL(of_css);
*/
#define do_each_subsys_mask(ss, ssid, ss_mask) do { \
unsigned long __ss_mask = (ss_mask); \
- if (!CGROUP_SUBSYS_COUNT) { /* to avoid spurious gcc warning */ \
+ if (!CGROUP_HAS_SUBSYS_CONFIG) { \
(ssid) = 0; \
break; \
} \
@@ -750,7 +770,8 @@ struct css_set init_css_set = {
.task_iters = LIST_HEAD_INIT(init_css_set.task_iters),
.threaded_csets = LIST_HEAD_INIT(init_css_set.threaded_csets),
.cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links),
- .mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node),
+ .mg_src_preload_node = LIST_HEAD_INIT(init_css_set.mg_src_preload_node),
+ .mg_dst_preload_node = LIST_HEAD_INIT(init_css_set.mg_dst_preload_node),
.mg_node = LIST_HEAD_INIT(init_css_set.mg_node),
/*
@@ -1225,7 +1246,8 @@ static struct css_set *find_css_set(struct css_set *old_cset,
INIT_LIST_HEAD(&cset->threaded_csets);
INIT_HLIST_NODE(&cset->hlist);
INIT_LIST_HEAD(&cset->cgrp_links);
- INIT_LIST_HEAD(&cset->mg_preload_node);
+ INIT_LIST_HEAD(&cset->mg_src_preload_node);
+ INIT_LIST_HEAD(&cset->mg_dst_preload_node);
INIT_LIST_HEAD(&cset->mg_node);
/* Copy the set of subsystem state objects generated in
@@ -1287,11 +1309,25 @@ static struct css_set *find_css_set(struct css_set *old_cset,
struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
{
- struct cgroup *root_cgrp = kf_root->kn->priv;
+ struct cgroup *root_cgrp = kernfs_root_to_node(kf_root)->priv;
return root_cgrp->root;
}
+void cgroup_favor_dynmods(struct cgroup_root *root, bool favor)
+{
+ bool favoring = root->flags & CGRP_ROOT_FAVOR_DYNMODS;
+
+ /* see the comment above CGRP_ROOT_FAVOR_DYNMODS definition */
+ if (favor && !favoring) {
+ rcu_sync_enter(&cgroup_threadgroup_rwsem.rss);
+ root->flags |= CGRP_ROOT_FAVOR_DYNMODS;
+ } else if (!favor && favoring) {
+ rcu_sync_exit(&cgroup_threadgroup_rwsem.rss);
+ root->flags &= ~CGRP_ROOT_FAVOR_DYNMODS;
+ }
+}
+
static int cgroup_init_root_id(struct cgroup_root *root)
{
int id;
@@ -1352,9 +1388,10 @@ static void cgroup_destroy_root(struct cgroup_root *root)
cgroup_root_count--;
}
+ cgroup_favor_dynmods(root, false);
cgroup_exit_root_id(root);
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
cgroup_rstat_exit(cgrp);
kernfs_destroy_root(root->kf_root);
@@ -1362,73 +1399,97 @@ static void cgroup_destroy_root(struct cgroup_root *root)
}
/*
- * look up cgroup associated with current task's cgroup namespace on the
- * specified hierarchy
+ * Returned cgroup is without refcount but it's valid as long as cset pins it.
*/
-static struct cgroup *
-current_cgns_cgroup_from_root(struct cgroup_root *root)
+static inline struct cgroup *__cset_cgroup_from_root(struct css_set *cset,
+ struct cgroup_root *root)
{
- struct cgroup *res = NULL;
- struct css_set *cset;
-
- lockdep_assert_held(&css_set_lock);
+ struct cgroup *res_cgroup = NULL;
- rcu_read_lock();
-
- cset = current->nsproxy->cgroup_ns->root_cset;
if (cset == &init_css_set) {
- res = &root->cgrp;
+ res_cgroup = &root->cgrp;
} else if (root == &cgrp_dfl_root) {
- res = cset->dfl_cgrp;
+ res_cgroup = cset->dfl_cgrp;
} else {
struct cgrp_cset_link *link;
+ lockdep_assert_held(&css_set_lock);
list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
struct cgroup *c = link->cgrp;
if (c->root == root) {
- res = c;
+ res_cgroup = c;
break;
}
}
}
- rcu_read_unlock();
- BUG_ON(!res);
- return res;
+ BUG_ON(!res_cgroup);
+ return res_cgroup;
}
-/* look up cgroup associated with given css_set on the specified hierarchy */
-static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
- struct cgroup_root *root)
+/*
+ * look up cgroup associated with current task's cgroup namespace on the
+ * specified hierarchy
+ */
+static struct cgroup *
+current_cgns_cgroup_from_root(struct cgroup_root *root)
{
struct cgroup *res = NULL;
+ struct css_set *cset;
- lockdep_assert_held(&cgroup_mutex);
lockdep_assert_held(&css_set_lock);
- if (cset == &init_css_set) {
- res = &root->cgrp;
- } else if (root == &cgrp_dfl_root) {
- res = cset->dfl_cgrp;
- } else {
- struct cgrp_cset_link *link;
+ rcu_read_lock();
- list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
- struct cgroup *c = link->cgrp;
+ cset = current->nsproxy->cgroup_ns->root_cset;
+ res = __cset_cgroup_from_root(cset, root);
- if (c->root == root) {
- res = c;
- break;
- }
- }
- }
+ rcu_read_unlock();
- BUG_ON(!res);
return res;
}
/*
+ * Look up cgroup associated with current task's cgroup namespace on the default
+ * hierarchy.
+ *
+ * Unlike current_cgns_cgroup_from_root(), this doesn't need locks:
+ * - Internal rcu_read_lock is unnecessary because we don't dereference any rcu
+ * pointers.
+ * - css_set_lock is not needed because we just read cset->dfl_cgrp.
+ * - As a bonus returned cgrp is pinned with the current because it cannot
+ * switch cgroup_ns asynchronously.
+ */
+static struct cgroup *current_cgns_cgroup_dfl(void)
+{
+ struct css_set *cset;
+
+ if (current->nsproxy) {
+ cset = current->nsproxy->cgroup_ns->root_cset;
+ return __cset_cgroup_from_root(cset, &cgrp_dfl_root);
+ } else {
+ /*
+ * NOTE: This function may be called from bpf_cgroup_from_id()
+ * on a task which has already passed exit_task_namespaces() and
+ * nsproxy == NULL. Fall back to cgrp_dfl_root which will make all
+ * cgroups visible for lookups.
+ */
+ return &cgrp_dfl_root.cgrp;
+ }
+}
+
+/* look up cgroup associated with given css_set on the specified hierarchy */
+static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
+ struct cgroup_root *root)
+{
+ lockdep_assert_held(&cgroup_mutex);
+ lockdep_assert_held(&css_set_lock);
+
+ return __cset_cgroup_from_root(cset, root);
+}
+
+/*
* Return the cgroup for "task" from the given hierarchy. Must be
* called with cgroup_mutex and css_set_lock held.
*/
@@ -1574,7 +1635,7 @@ void cgroup_kn_unlock(struct kernfs_node *kn)
else
cgrp = kn->parent->priv;
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
kernfs_unbreak_active_protection(kn);
cgroup_put(cgrp);
@@ -1619,7 +1680,7 @@ struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, bool drain_offline)
if (drain_offline)
cgroup_lock_and_drain_offline(cgrp);
else
- mutex_lock(&cgroup_mutex);
+ cgroup_lock();
if (!cgroup_is_dead(cgrp))
return cgrp;
@@ -1663,12 +1724,16 @@ static void css_clear_dir(struct cgroup_subsys_state *css)
css->flags &= ~CSS_VISIBLE;
if (!css->ss) {
- if (cgroup_on_dfl(cgrp))
- cfts = cgroup_base_files;
- else
- cfts = cgroup1_base_files;
-
- cgroup_addrm_files(css, cgrp, cfts, false);
+ if (cgroup_on_dfl(cgrp)) {
+ cgroup_addrm_files(css, cgrp,
+ cgroup_base_files, false);
+ if (cgroup_psi_enabled())
+ cgroup_addrm_files(css, cgrp,
+ cgroup_psi_files, false);
+ } else {
+ cgroup_addrm_files(css, cgrp,
+ cgroup1_base_files, false);
+ }
} else {
list_for_each_entry(cfts, &css->ss->cfts, node)
cgroup_addrm_files(css, cgrp, cfts, false);
@@ -1691,14 +1756,22 @@ static int css_populate_dir(struct cgroup_subsys_state *css)
return 0;
if (!css->ss) {
- if (cgroup_on_dfl(cgrp))
- cfts = cgroup_base_files;
- else
- cfts = cgroup1_base_files;
-
- ret = cgroup_addrm_files(&cgrp->self, cgrp, cfts, true);
- if (ret < 0)
- return ret;
+ if (cgroup_on_dfl(cgrp)) {
+ ret = cgroup_addrm_files(&cgrp->self, cgrp,
+ cgroup_base_files, true);
+ if (ret < 0)
+ return ret;
+
+ if (cgroup_psi_enabled()) {
+ ret = cgroup_addrm_files(&cgrp->self, cgrp,
+ cgroup_psi_files, true);
+ if (ret < 0)
+ return ret;
+ }
+ } else {
+ cgroup_addrm_files(css, cgrp,
+ cgroup1_base_files, true);
+ }
} else {
list_for_each_entry(cfts, &css->ss->cfts, node) {
ret = cgroup_addrm_files(css, cgrp, cfts, true);
@@ -1726,6 +1799,7 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
struct cgroup *dcgrp = &dst_root->cgrp;
struct cgroup_subsys *ss;
int ssid, i, ret;
+ u16 dfl_disable_ss_mask = 0;
lockdep_assert_held(&cgroup_mutex);
@@ -1742,8 +1816,28 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
/* can't move between two non-dummy roots either */
if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root)
return -EBUSY;
+
+ /*
+ * Collect ssid's that need to be disabled from default
+ * hierarchy.
+ */
+ if (ss->root == &cgrp_dfl_root)
+ dfl_disable_ss_mask |= 1 << ssid;
+
} while_each_subsys_mask();
+ if (dfl_disable_ss_mask) {
+ struct cgroup *scgrp = &cgrp_dfl_root.cgrp;
+
+ /*
+ * Controllers from default hierarchy that need to be rebound
+ * are all disabled together in one go.
+ */
+ cgrp_dfl_root.subsys_mask &= ~dfl_disable_ss_mask;
+ WARN_ON(cgroup_apply_control(scgrp));
+ cgroup_finalize_control(scgrp, 0);
+ }
+
do_each_subsys_mask(ss, ssid, ss_mask) {
struct cgroup_root *src_root = ss->root;
struct cgroup *scgrp = &src_root->cgrp;
@@ -1752,10 +1846,12 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
WARN_ON(!css || cgroup_css(dcgrp, ss));
- /* disable from the source */
- src_root->subsys_mask &= ~(1 << ssid);
- WARN_ON(cgroup_apply_control(scgrp));
- cgroup_finalize_control(scgrp, 0);
+ if (src_root != &cgrp_dfl_root) {
+ /* disable from the source */
+ src_root->subsys_mask &= ~(1 << ssid);
+ WARN_ON(cgroup_apply_control(scgrp));
+ cgroup_finalize_control(scgrp, 0);
+ }
/* rebind */
RCU_INIT_POINTER(scgrp->subsys[ssid], NULL);
@@ -1771,6 +1867,7 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
if (ss->css_rstat_flush) {
list_del_rcu(&css->rstat_css_node);
+ synchronize_rcu();
list_add_rcu(&css->rstat_css_node,
&dcgrp->rstat_css_list);
}
@@ -1826,6 +1923,7 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
enum cgroup2_param {
Opt_nsdelegate,
+ Opt_favordynmods,
Opt_memory_localevents,
Opt_memory_recursiveprot,
nr__cgroup2_params
@@ -1833,6 +1931,7 @@ enum cgroup2_param {
static const struct fs_parameter_spec cgroup2_fs_parameters[] = {
fsparam_flag("nsdelegate", Opt_nsdelegate),
+ fsparam_flag("favordynmods", Opt_favordynmods),
fsparam_flag("memory_localevents", Opt_memory_localevents),
fsparam_flag("memory_recursiveprot", Opt_memory_recursiveprot),
{}
@@ -1852,6 +1951,9 @@ static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param
case Opt_nsdelegate:
ctx->flags |= CGRP_ROOT_NS_DELEGATE;
return 0;
+ case Opt_favordynmods:
+ ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS;
+ return 0;
case Opt_memory_localevents:
ctx->flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS;
return 0;
@@ -1870,6 +1972,9 @@ static void apply_cgroup_root_flags(unsigned int root_flags)
else
cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE;
+ cgroup_favor_dynmods(&cgrp_dfl_root,
+ root_flags & CGRP_ROOT_FAVOR_DYNMODS);
+
if (root_flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS;
else
@@ -1886,6 +1991,8 @@ static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root
{
if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE)
seq_puts(seq, ",nsdelegate");
+ if (cgrp_dfl_root.flags & CGRP_ROOT_FAVOR_DYNMODS)
+ seq_puts(seq, ",favordynmods");
if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
seq_puts(seq, ",memory_localevents");
if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT)
@@ -1936,7 +2043,8 @@ void init_cgroup_root(struct cgroup_fs_context *ctx)
cgrp->root = root;
init_cgroup_housekeeping(cgrp);
- root->flags = ctx->flags;
+ /* DYNMODS must be modified through cgroup_favor_dynmods() */
+ root->flags = ctx->flags & ~CGRP_ROOT_FAVOR_DYNMODS;
if (ctx->release_agent)
strscpy(root->release_agent_path, ctx->release_agent, PATH_MAX);
if (ctx->name)
@@ -1987,9 +2095,9 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
ret = PTR_ERR(root->kf_root);
goto exit_root_id;
}
- root_cgrp->kn = root->kf_root->kn;
+ root_cgrp->kn = kernfs_root_to_node(root->kf_root);
WARN_ON_ONCE(cgroup_ino(root_cgrp) != 1);
- root_cgrp->ancestor_ids[0] = cgroup_id(root_cgrp);
+ root_cgrp->ancestors[0] = root_cgrp;
ret = css_populate_dir(&root_cgrp->self);
if (ret)
@@ -2069,13 +2177,13 @@ int cgroup_do_get_tree(struct fs_context *fc)
struct super_block *sb = fc->root->d_sb;
struct cgroup *cgrp;
- mutex_lock(&cgroup_mutex);
+ cgroup_lock();
spin_lock_irq(&css_set_lock);
cgrp = cset_cgroup_from_root(ctx->ns->root_cset, ctx->root);
spin_unlock_irq(&css_set_lock);
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
nsdentry = kernfs_node_dentry(cgrp->kn, sb);
dput(fc->root);
@@ -2112,7 +2220,7 @@ static int cgroup_get_tree(struct fs_context *fc)
struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
int ret;
- cgrp_dfl_visible = true;
+ WRITE_ONCE(cgrp_dfl_visible, true);
cgroup_get_live(&cgrp_dfl_root.cgrp);
ctx->root = &cgrp_dfl_root;
@@ -2158,6 +2266,10 @@ static int cgroup_init_fs_context(struct fs_context *fc)
put_user_ns(fc->user_ns);
fc->user_ns = get_user_ns(ctx->ns->user_ns);
fc->global = true;
+
+#ifdef CONFIG_CGROUP_FAVOR_DYNMODS
+ ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS;
+#endif
return 0;
}
@@ -2169,13 +2281,14 @@ static void cgroup_kill_sb(struct super_block *sb)
/*
* If @root doesn't have any children, start killing it.
* This prevents new mounts by disabling percpu_ref_tryget_live().
- * cgroup_mount() may wait for @root's release.
*
* And don't kill the default root.
*/
if (list_empty(&root->cgrp.self.children) && root != &cgrp_dfl_root &&
- !percpu_ref_is_dying(&root->cgrp.self.refcnt))
+ !percpu_ref_is_dying(&root->cgrp.self.refcnt)) {
+ cgroup_bpf_offline(&root->cgrp);
percpu_ref_kill(&root->cgrp.self.refcnt);
+ }
cgroup_put(&root->cgrp);
kernfs_kill_sb(sb);
}
@@ -2253,13 +2366,13 @@ int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
{
int ret;
- mutex_lock(&cgroup_mutex);
+ cgroup_lock();
spin_lock_irq(&css_set_lock);
ret = cgroup_path_ns_locked(cgrp, buf, buflen, ns);
spin_unlock_irq(&css_set_lock);
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
return ret;
}
@@ -2285,7 +2398,7 @@ int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
int hierarchy_id = 1;
int ret;
- mutex_lock(&cgroup_mutex);
+ cgroup_lock();
spin_lock_irq(&css_set_lock);
root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id);
@@ -2295,16 +2408,57 @@ int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
ret = cgroup_path_ns_locked(cgrp, buf, buflen, &init_cgroup_ns);
} else {
/* if no hierarchy exists, everyone is in "/" */
- ret = strlcpy(buf, "/", buflen);
+ ret = strscpy(buf, "/", buflen);
}
spin_unlock_irq(&css_set_lock);
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
return ret;
}
EXPORT_SYMBOL_GPL(task_cgroup_path);
/**
+ * cgroup_attach_lock - Lock for ->attach()
+ * @lock_threadgroup: whether to down_write cgroup_threadgroup_rwsem
+ *
+ * cgroup migration sometimes needs to stabilize threadgroups against forks and
+ * exits by write-locking cgroup_threadgroup_rwsem. However, some ->attach()
+ * implementations (e.g. cpuset), also need to disable CPU hotplug.
+ * Unfortunately, letting ->attach() operations acquire cpus_read_lock() can
+ * lead to deadlocks.
+ *
+ * Bringing up a CPU may involve creating and destroying tasks which requires
+ * read-locking threadgroup_rwsem, so threadgroup_rwsem nests inside
+ * cpus_read_lock(). If we call an ->attach() which acquires the cpus lock while
+ * write-locking threadgroup_rwsem, the locking order is reversed and we end up
+ * waiting for an on-going CPU hotplug operation which in turn is waiting for
+ * the threadgroup_rwsem to be released to create new tasks. For more details:
+ *
+ * http://lkml.kernel.org/r/20220711174629.uehfmqegcwn2lqzu@wubuntu
+ *
+ * Resolve the situation by always acquiring cpus_read_lock() before optionally
+ * write-locking cgroup_threadgroup_rwsem. This allows ->attach() to assume that
+ * CPU hotplug is disabled on entry.
+ */
+void cgroup_attach_lock(bool lock_threadgroup)
+{
+ cpus_read_lock();
+ if (lock_threadgroup)
+ percpu_down_write(&cgroup_threadgroup_rwsem);
+}
+
+/**
+ * cgroup_attach_unlock - Undo cgroup_attach_lock()
+ * @lock_threadgroup: whether to up_write cgroup_threadgroup_rwsem
+ */
+void cgroup_attach_unlock(bool lock_threadgroup)
+{
+ if (lock_threadgroup)
+ percpu_up_write(&cgroup_threadgroup_rwsem);
+ cpus_read_unlock();
+}
+
+/**
* cgroup_migrate_add_task - add a migration target task to a migration context
* @task: target task
* @mgctx: target migration context
@@ -2373,7 +2527,7 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
struct css_set *cset = tset->cur_cset;
struct task_struct *task = tset->cur_task;
- while (&cset->mg_node != tset->csets) {
+ while (CGROUP_HAS_SUBSYS_CONFIG && &cset->mg_node != tset->csets) {
if (!task)
task = list_first_entry(&cset->mg_tasks,
struct task_struct, cg_list);
@@ -2531,10 +2685,6 @@ int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp)
if (!cgroup_is_valid_domain(dst_cgrp->dom_cgrp))
return -EOPNOTSUPP;
- /* mixables don't care */
- if (cgroup_is_mixable(dst_cgrp))
- return 0;
-
/*
* If @dst_cgrp is already or can become a thread root or is
* threaded, it doesn't matter.
@@ -2558,21 +2708,27 @@ int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp)
*/
void cgroup_migrate_finish(struct cgroup_mgctx *mgctx)
{
- LIST_HEAD(preloaded);
struct css_set *cset, *tmp_cset;
lockdep_assert_held(&cgroup_mutex);
spin_lock_irq(&css_set_lock);
- list_splice_tail_init(&mgctx->preloaded_src_csets, &preloaded);
- list_splice_tail_init(&mgctx->preloaded_dst_csets, &preloaded);
+ list_for_each_entry_safe(cset, tmp_cset, &mgctx->preloaded_src_csets,
+ mg_src_preload_node) {
+ cset->mg_src_cgrp = NULL;
+ cset->mg_dst_cgrp = NULL;
+ cset->mg_dst_cset = NULL;
+ list_del_init(&cset->mg_src_preload_node);
+ put_css_set_locked(cset);
+ }
- list_for_each_entry_safe(cset, tmp_cset, &preloaded, mg_preload_node) {
+ list_for_each_entry_safe(cset, tmp_cset, &mgctx->preloaded_dst_csets,
+ mg_dst_preload_node) {
cset->mg_src_cgrp = NULL;
cset->mg_dst_cgrp = NULL;
cset->mg_dst_cset = NULL;
- list_del_init(&cset->mg_preload_node);
+ list_del_init(&cset->mg_dst_preload_node);
put_css_set_locked(cset);
}
@@ -2612,11 +2768,11 @@ void cgroup_migrate_add_src(struct css_set *src_cset,
if (src_cset->dead)
return;
- src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
-
- if (!list_empty(&src_cset->mg_preload_node))
+ if (!list_empty(&src_cset->mg_src_preload_node))
return;
+ src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
+
WARN_ON(src_cset->mg_src_cgrp);
WARN_ON(src_cset->mg_dst_cgrp);
WARN_ON(!list_empty(&src_cset->mg_tasks));
@@ -2625,7 +2781,7 @@ void cgroup_migrate_add_src(struct css_set *src_cset,
src_cset->mg_src_cgrp = src_cgrp;
src_cset->mg_dst_cgrp = dst_cgrp;
get_css_set(src_cset);
- list_add_tail(&src_cset->mg_preload_node, &mgctx->preloaded_src_csets);
+ list_add_tail(&src_cset->mg_src_preload_node, &mgctx->preloaded_src_csets);
}
/**
@@ -2650,7 +2806,7 @@ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx)
/* look up the dst cset for each src cset and link it to src */
list_for_each_entry_safe(src_cset, tmp_cset, &mgctx->preloaded_src_csets,
- mg_preload_node) {
+ mg_src_preload_node) {
struct css_set *dst_cset;
struct cgroup_subsys *ss;
int ssid;
@@ -2669,7 +2825,7 @@ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx)
if (src_cset == dst_cset) {
src_cset->mg_src_cgrp = NULL;
src_cset->mg_dst_cgrp = NULL;
- list_del_init(&src_cset->mg_preload_node);
+ list_del_init(&src_cset->mg_src_preload_node);
put_css_set(src_cset);
put_css_set(dst_cset);
continue;
@@ -2677,8 +2833,8 @@ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx)
src_cset->mg_dst_cset = dst_cset;
- if (list_empty(&dst_cset->mg_preload_node))
- list_add_tail(&dst_cset->mg_preload_node,
+ if (list_empty(&dst_cset->mg_dst_preload_node))
+ list_add_tail(&dst_cset->mg_dst_preload_node,
&mgctx->preloaded_dst_csets);
else
put_css_set(dst_cset);
@@ -2720,14 +2876,12 @@ int cgroup_migrate(struct task_struct *leader, bool threadgroup,
* take an rcu_read_lock.
*/
spin_lock_irq(&css_set_lock);
- rcu_read_lock();
task = leader;
do {
cgroup_migrate_add_task(task, mgctx);
if (!threadgroup)
break;
} while_each_thread(leader, task);
- rcu_read_unlock();
spin_unlock_irq(&css_set_lock);
return cgroup_migrate_execute(mgctx);
@@ -2774,8 +2928,7 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
}
struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
- bool *locked)
- __acquires(&cgroup_threadgroup_rwsem)
+ bool *threadgroup_locked)
{
struct task_struct *tsk;
pid_t pid;
@@ -2792,12 +2945,8 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
* Therefore, we can skip the global lock.
*/
lockdep_assert_held(&cgroup_mutex);
- if (pid || threadgroup) {
- percpu_down_write(&cgroup_threadgroup_rwsem);
- *locked = true;
- } else {
- *locked = false;
- }
+ *threadgroup_locked = pid || threadgroup;
+ cgroup_attach_lock(*threadgroup_locked);
rcu_read_lock();
if (pid) {
@@ -2828,17 +2977,14 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
goto out_unlock_rcu;
out_unlock_threadgroup:
- if (*locked) {
- percpu_up_write(&cgroup_threadgroup_rwsem);
- *locked = false;
- }
+ cgroup_attach_unlock(*threadgroup_locked);
+ *threadgroup_locked = false;
out_unlock_rcu:
rcu_read_unlock();
return tsk;
}
-void cgroup_procs_write_finish(struct task_struct *task, bool locked)
- __releases(&cgroup_threadgroup_rwsem)
+void cgroup_procs_write_finish(struct task_struct *task, bool threadgroup_locked)
{
struct cgroup_subsys *ss;
int ssid;
@@ -2846,8 +2992,8 @@ void cgroup_procs_write_finish(struct task_struct *task, bool locked)
/* release reference from cgroup_procs_write_start() */
put_task_struct(task);
- if (locked)
- percpu_up_write(&cgroup_threadgroup_rwsem);
+ cgroup_attach_unlock(threadgroup_locked);
+
for_each_subsys(ss, ssid)
if (ss->post_attach)
ss->post_attach();
@@ -2902,29 +3048,47 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
struct cgroup_subsys_state *d_css;
struct cgroup *dsct;
struct css_set *src_cset;
+ bool has_tasks;
int ret;
lockdep_assert_held(&cgroup_mutex);
- percpu_down_write(&cgroup_threadgroup_rwsem);
-
/* look up all csses currently attached to @cgrp's subtree */
spin_lock_irq(&css_set_lock);
cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
struct cgrp_cset_link *link;
+ /*
+ * As cgroup_update_dfl_csses() is only called by
+ * cgroup_apply_control(). The csses associated with the
+ * given cgrp will not be affected by changes made to
+ * its subtree_control file. We can skip them.
+ */
+ if (dsct == cgrp)
+ continue;
+
list_for_each_entry(link, &dsct->cset_links, cset_link)
cgroup_migrate_add_src(link->cset, dsct, &mgctx);
}
spin_unlock_irq(&css_set_lock);
+ /*
+ * We need to write-lock threadgroup_rwsem while migrating tasks.
+ * However, if there are no source csets for @cgrp, changing its
+ * controllers isn't gonna produce any task migrations and the
+ * write-locking can be skipped safely.
+ */
+ has_tasks = !list_empty(&mgctx.preloaded_src_csets);
+ cgroup_attach_lock(has_tasks);
+
/* NULL dst indicates self on default hierarchy */
ret = cgroup_migrate_prepare_dst(&mgctx);
if (ret)
goto out_finish;
spin_lock_irq(&css_set_lock);
- list_for_each_entry(src_cset, &mgctx.preloaded_src_csets, mg_preload_node) {
+ list_for_each_entry(src_cset, &mgctx.preloaded_src_csets,
+ mg_src_preload_node) {
struct task_struct *task, *ntask;
/* all tasks in src_csets need to be migrated */
@@ -2936,7 +3100,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
ret = cgroup_migrate_execute(&mgctx);
out_finish:
cgroup_migrate_finish(&mgctx);
- percpu_up_write(&cgroup_threadgroup_rwsem);
+ cgroup_attach_unlock(has_tasks);
return ret;
}
@@ -2957,7 +3121,7 @@ void cgroup_lock_and_drain_offline(struct cgroup *cgrp)
int ssid;
restart:
- mutex_lock(&cgroup_mutex);
+ cgroup_lock();
cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
for_each_subsys(ss, ssid) {
@@ -2971,7 +3135,7 @@ restart:
prepare_to_wait(&dsct->offline_waitq, &wait,
TASK_UNINTERRUPTIBLE);
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
schedule();
finish_wait(&dsct->offline_waitq, &wait);
@@ -3173,11 +3337,7 @@ static int cgroup_apply_control(struct cgroup *cgrp)
* making the following cgroup_update_dfl_csses() properly update
* css associations of all tasks in the subtree.
*/
- ret = cgroup_update_dfl_csses(cgrp);
- if (ret)
- return ret;
-
- return 0;
+ return cgroup_update_dfl_csses(cgrp);
}
/**
@@ -3570,28 +3730,29 @@ static int cpu_stat_show(struct seq_file *seq, void *v)
static int cgroup_io_pressure_show(struct seq_file *seq, void *v)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
- struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
+ struct psi_group *psi = cgroup_psi(cgrp);
return psi_show(seq, psi, PSI_IO);
}
static int cgroup_memory_pressure_show(struct seq_file *seq, void *v)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
- struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
+ struct psi_group *psi = cgroup_psi(cgrp);
return psi_show(seq, psi, PSI_MEM);
}
static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
- struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
+ struct psi_group *psi = cgroup_psi(cgrp);
return psi_show(seq, psi, PSI_CPU);
}
-static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
- size_t nbytes, enum psi_res res)
+static ssize_t pressure_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, enum psi_res res)
{
+ struct cgroup_file_ctx *ctx = of->priv;
struct psi_trigger *new;
struct cgroup *cgrp;
struct psi_group *psi;
@@ -3603,15 +3764,20 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
cgroup_get(cgrp);
cgroup_kn_unlock(of->kn);
- psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
- new = psi_trigger_create(psi, buf, nbytes, res);
+ /* Allow only one trigger per file descriptor */
+ if (ctx->psi.trigger) {
+ cgroup_put(cgrp);
+ return -EBUSY;
+ }
+
+ psi = cgroup_psi(cgrp);
+ new = psi_trigger_create(psi, buf, res, of->file);
if (IS_ERR(new)) {
cgroup_put(cgrp);
return PTR_ERR(new);
}
- psi_trigger_replace(&of->priv, new);
-
+ smp_store_release(&ctx->psi.trigger, new);
cgroup_put(cgrp);
return nbytes;
@@ -3621,36 +3787,108 @@ static ssize_t cgroup_io_pressure_write(struct kernfs_open_file *of,
char *buf, size_t nbytes,
loff_t off)
{
- return cgroup_pressure_write(of, buf, nbytes, PSI_IO);
+ return pressure_write(of, buf, nbytes, PSI_IO);
}
static ssize_t cgroup_memory_pressure_write(struct kernfs_open_file *of,
char *buf, size_t nbytes,
loff_t off)
{
- return cgroup_pressure_write(of, buf, nbytes, PSI_MEM);
+ return pressure_write(of, buf, nbytes, PSI_MEM);
}
static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of,
char *buf, size_t nbytes,
loff_t off)
{
- return cgroup_pressure_write(of, buf, nbytes, PSI_CPU);
+ return pressure_write(of, buf, nbytes, PSI_CPU);
+}
+
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+static int cgroup_irq_pressure_show(struct seq_file *seq, void *v)
+{
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+ struct psi_group *psi = cgroup_psi(cgrp);
+
+ return psi_show(seq, psi, PSI_IRQ);
+}
+
+static ssize_t cgroup_irq_pressure_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes,
+ loff_t off)
+{
+ return pressure_write(of, buf, nbytes, PSI_IRQ);
+}
+#endif
+
+static int cgroup_pressure_show(struct seq_file *seq, void *v)
+{
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+ struct psi_group *psi = cgroup_psi(cgrp);
+
+ seq_printf(seq, "%d\n", psi->enabled);
+
+ return 0;
+}
+
+static ssize_t cgroup_pressure_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes,
+ loff_t off)
+{
+ ssize_t ret;
+ int enable;
+ struct cgroup *cgrp;
+ struct psi_group *psi;
+
+ ret = kstrtoint(strstrip(buf), 0, &enable);
+ if (ret)
+ return ret;
+
+ if (enable < 0 || enable > 1)
+ return -ERANGE;
+
+ cgrp = cgroup_kn_lock_live(of->kn, false);
+ if (!cgrp)
+ return -ENOENT;
+
+ psi = cgroup_psi(cgrp);
+ if (psi->enabled != enable) {
+ int i;
+
+ /* show or hide {cpu,memory,io,irq}.pressure files */
+ for (i = 0; i < NR_PSI_RESOURCES; i++)
+ cgroup_file_show(&cgrp->psi_files[i], enable);
+
+ psi->enabled = enable;
+ if (enable)
+ psi_cgroup_restart(psi);
+ }
+
+ cgroup_kn_unlock(of->kn);
+
+ return nbytes;
}
static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of,
poll_table *pt)
{
- return psi_trigger_poll(&of->priv, of->file, pt);
+ struct cgroup_file_ctx *ctx = of->priv;
+
+ return psi_trigger_poll(&ctx->psi.trigger, of->file, pt);
}
static void cgroup_pressure_release(struct kernfs_open_file *of)
{
- psi_trigger_replace(&of->priv, NULL);
+ struct cgroup_file_ctx *ctx = of->priv;
+
+ psi_trigger_destroy(ctx->psi.trigger);
}
bool cgroup_psi_enabled(void)
{
+ if (static_branch_likely(&psi_disabled))
+ return false;
+
return (cgroup_feature_disable_mask & (1 << OPT_FEATURE_PRESSURE)) == 0;
}
@@ -3773,24 +4011,43 @@ static ssize_t cgroup_kill_write(struct kernfs_open_file *of, char *buf,
static int cgroup_file_open(struct kernfs_open_file *of)
{
struct cftype *cft = of_cft(of);
+ struct cgroup_file_ctx *ctx;
+ int ret;
- if (cft->open)
- return cft->open(of);
- return 0;
+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ ctx->ns = current->nsproxy->cgroup_ns;
+ get_cgroup_ns(ctx->ns);
+ of->priv = ctx;
+
+ if (!cft->open)
+ return 0;
+
+ ret = cft->open(of);
+ if (ret) {
+ put_cgroup_ns(ctx->ns);
+ kfree(ctx);
+ }
+ return ret;
}
static void cgroup_file_release(struct kernfs_open_file *of)
{
struct cftype *cft = of_cft(of);
+ struct cgroup_file_ctx *ctx = of->priv;
if (cft->release)
cft->release(of);
+ put_cgroup_ns(ctx->ns);
+ kfree(ctx);
}
static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
size_t nbytes, loff_t off)
{
- struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
+ struct cgroup_file_ctx *ctx = of->priv;
struct cgroup *cgrp = of->kn->parent->priv;
struct cftype *cft = of_cft(of);
struct cgroup_subsys_state *css;
@@ -3807,7 +4064,7 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
*/
if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) &&
!(cft->flags & CFTYPE_NS_DELEGATABLE) &&
- ns != &init_cgroup_ns && ns->root_cset->dfl_cgrp == cgrp)
+ ctx->ns != &init_cgroup_ns && ctx->ns->root_cset->dfl_cgrp == cgrp)
return -EPERM;
if (cft->write)
@@ -3984,8 +4241,6 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css,
restart:
for (cft = cfts; cft != cft_end && cft->name[0] != '\0'; cft++) {
/* does cft->flags tell us to skip this file on @cgrp? */
- if ((cft->flags & CFTYPE_PRESSURE) && !cgroup_psi_enabled())
- continue;
if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
continue;
if ((cft->flags & __CFTYPE_NOT_ON_DFL) && cgroup_on_dfl(cgrp))
@@ -4050,21 +4305,25 @@ static void cgroup_exit_cftypes(struct cftype *cfts)
cft->ss = NULL;
/* revert flags set by cgroup core while adding @cfts */
- cft->flags &= ~(__CFTYPE_ONLY_ON_DFL | __CFTYPE_NOT_ON_DFL);
+ cft->flags &= ~(__CFTYPE_ONLY_ON_DFL | __CFTYPE_NOT_ON_DFL |
+ __CFTYPE_ADDED);
}
}
static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
{
struct cftype *cft;
+ int ret = 0;
for (cft = cfts; cft->name[0] != '\0'; cft++) {
struct kernfs_ops *kf_ops;
WARN_ON(cft->ss || cft->kf_ops);
- if ((cft->flags & CFTYPE_PRESSURE) && !cgroup_psi_enabled())
- continue;
+ if (cft->flags & __CFTYPE_ADDED) {
+ ret = -EBUSY;
+ break;
+ }
if (cft->seq_start)
kf_ops = &cgroup_kf_ops;
@@ -4078,26 +4337,26 @@ static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) {
kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL);
if (!kf_ops) {
- cgroup_exit_cftypes(cfts);
- return -ENOMEM;
+ ret = -ENOMEM;
+ break;
}
kf_ops->atomic_write_len = cft->max_write_len;
}
cft->kf_ops = kf_ops;
cft->ss = ss;
+ cft->flags |= __CFTYPE_ADDED;
}
- return 0;
+ if (ret)
+ cgroup_exit_cftypes(cfts);
+ return ret;
}
static int cgroup_rm_cftypes_locked(struct cftype *cfts)
{
lockdep_assert_held(&cgroup_mutex);
- if (!cfts || !cfts[0].ss)
- return -ENOENT;
-
list_del(&cfts->node);
cgroup_apply_cftypes(cfts, false);
cgroup_exit_cftypes(cfts);
@@ -4119,9 +4378,15 @@ int cgroup_rm_cftypes(struct cftype *cfts)
{
int ret;
- mutex_lock(&cgroup_mutex);
+ if (!cfts || cfts[0].name[0] == '\0')
+ return 0;
+
+ if (!(cfts[0].flags & __CFTYPE_ADDED))
+ return -ENOENT;
+
+ cgroup_lock();
ret = cgroup_rm_cftypes_locked(cfts);
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
return ret;
}
@@ -4153,14 +4418,14 @@ static int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
if (ret)
return ret;
- mutex_lock(&cgroup_mutex);
+ cgroup_lock();
list_add_tail(&cfts->node, &ss->cfts);
ret = cgroup_apply_cftypes(cfts, true);
if (ret)
cgroup_rm_cftypes_locked(cfts);
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
return ret;
}
@@ -4224,6 +4489,26 @@ void cgroup_file_notify(struct cgroup_file *cfile)
}
/**
+ * cgroup_file_show - show or hide a hidden cgroup file
+ * @cfile: target cgroup_file obtained by setting cftype->file_offset
+ * @show: whether to show or hide
+ */
+void cgroup_file_show(struct cgroup_file *cfile, bool show)
+{
+ struct kernfs_node *kn;
+
+ spin_lock_irq(&cgroup_file_kn_lock);
+ kn = cfile->kn;
+ kernfs_get(kn);
+ spin_unlock_irq(&cgroup_file_kn_lock);
+
+ if (kn)
+ kernfs_show(kn, show);
+
+ kernfs_put(kn);
+}
+
+/**
* css_next_child - find the next child of a given css
* @pos: the current position (%NULL to initiate traversal)
* @parent: css whose children to walk
@@ -4644,7 +4929,7 @@ void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
it->ss = css->ss;
it->flags = flags;
- if (it->ss)
+ if (CGROUP_HAS_SUBSYS_CONFIG && it->ss)
it->cset_pos = &css->cgroup->e_csets[css->ss->id];
else
it->cset_pos = &css->cgroup->cset_links;
@@ -4713,21 +4998,21 @@ void css_task_iter_end(struct css_task_iter *it)
static void cgroup_procs_release(struct kernfs_open_file *of)
{
- if (of->priv) {
- css_task_iter_end(of->priv);
- kfree(of->priv);
- }
+ struct cgroup_file_ctx *ctx = of->priv;
+
+ if (ctx->procs.started)
+ css_task_iter_end(&ctx->procs.iter);
}
static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos)
{
struct kernfs_open_file *of = s->private;
- struct css_task_iter *it = of->priv;
+ struct cgroup_file_ctx *ctx = of->priv;
if (pos)
(*pos)++;
- return css_task_iter_next(it);
+ return css_task_iter_next(&ctx->procs.iter);
}
static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos,
@@ -4735,21 +5020,18 @@ static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos,
{
struct kernfs_open_file *of = s->private;
struct cgroup *cgrp = seq_css(s)->cgroup;
- struct css_task_iter *it = of->priv;
+ struct cgroup_file_ctx *ctx = of->priv;
+ struct css_task_iter *it = &ctx->procs.iter;
/*
* When a seq_file is seeked, it's always traversed sequentially
* from position 0, so we can simply keep iterating on !0 *pos.
*/
- if (!it) {
+ if (!ctx->procs.started) {
if (WARN_ON_ONCE((*pos)))
return ERR_PTR(-EINVAL);
-
- it = kzalloc(sizeof(*it), GFP_KERNEL);
- if (!it)
- return ERR_PTR(-ENOMEM);
- of->priv = it;
css_task_iter_start(&cgrp->self, iter_flags, it);
+ ctx->procs.started = true;
} else if (!(*pos)) {
css_task_iter_end(it);
css_task_iter_start(&cgrp->self, iter_flags, it);
@@ -4793,16 +5075,16 @@ static int cgroup_may_write(const struct cgroup *cgrp, struct super_block *sb)
if (!inode)
return -ENOMEM;
- ret = inode_permission(&init_user_ns, inode, MAY_WRITE);
+ ret = inode_permission(&nop_mnt_idmap, inode, MAY_WRITE);
iput(inode);
return ret;
}
static int cgroup_procs_write_permission(struct cgroup *src_cgrp,
struct cgroup *dst_cgrp,
- struct super_block *sb)
+ struct super_block *sb,
+ struct cgroup_namespace *ns)
{
- struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
struct cgroup *com_cgrp = src_cgrp;
int ret;
@@ -4831,11 +5113,12 @@ static int cgroup_procs_write_permission(struct cgroup *src_cgrp,
static int cgroup_attach_permissions(struct cgroup *src_cgrp,
struct cgroup *dst_cgrp,
- struct super_block *sb, bool threadgroup)
+ struct super_block *sb, bool threadgroup,
+ struct cgroup_namespace *ns)
{
int ret = 0;
- ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, sb);
+ ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, sb, ns);
if (ret)
return ret;
@@ -4852,16 +5135,18 @@ static int cgroup_attach_permissions(struct cgroup *src_cgrp,
static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
bool threadgroup)
{
+ struct cgroup_file_ctx *ctx = of->priv;
struct cgroup *src_cgrp, *dst_cgrp;
struct task_struct *task;
+ const struct cred *saved_cred;
ssize_t ret;
- bool locked;
+ bool threadgroup_locked;
dst_cgrp = cgroup_kn_lock_live(of->kn, false);
if (!dst_cgrp)
return -ENODEV;
- task = cgroup_procs_write_start(buf, threadgroup, &locked);
+ task = cgroup_procs_write_start(buf, threadgroup, &threadgroup_locked);
ret = PTR_ERR_OR_ZERO(task);
if (ret)
goto out_unlock;
@@ -4871,16 +5156,23 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
spin_unlock_irq(&css_set_lock);
- /* process and thread migrations follow same delegation rule */
+ /*
+ * Process and thread migrations follow same delegation rule. Check
+ * permissions using the credentials from file open to protect against
+ * inherited fd attacks.
+ */
+ saved_cred = override_creds(of->file->f_cred);
ret = cgroup_attach_permissions(src_cgrp, dst_cgrp,
- of->file->f_path.dentry->d_sb, threadgroup);
+ of->file->f_path.dentry->d_sb,
+ threadgroup, ctx->ns);
+ revert_creds(saved_cred);
if (ret)
goto out_finish;
ret = cgroup_attach_task(dst_cgrp, task, threadgroup);
out_finish:
- cgroup_procs_write_finish(task, locked);
+ cgroup_procs_write_finish(task, threadgroup_locked);
out_unlock:
cgroup_kn_unlock(of->kn);
@@ -4976,10 +5268,14 @@ static struct cftype cgroup_base_files[] = {
.name = "cpu.stat",
.seq_show = cpu_stat_show,
},
+ { } /* terminate */
+};
+
+static struct cftype cgroup_psi_files[] = {
#ifdef CONFIG_PSI
{
.name = "io.pressure",
- .flags = CFTYPE_PRESSURE,
+ .file_offset = offsetof(struct cgroup, psi_files[PSI_IO]),
.seq_show = cgroup_io_pressure_show,
.write = cgroup_io_pressure_write,
.poll = cgroup_pressure_poll,
@@ -4987,7 +5283,7 @@ static struct cftype cgroup_base_files[] = {
},
{
.name = "memory.pressure",
- .flags = CFTYPE_PRESSURE,
+ .file_offset = offsetof(struct cgroup, psi_files[PSI_MEM]),
.seq_show = cgroup_memory_pressure_show,
.write = cgroup_memory_pressure_write,
.poll = cgroup_pressure_poll,
@@ -4995,12 +5291,27 @@ static struct cftype cgroup_base_files[] = {
},
{
.name = "cpu.pressure",
- .flags = CFTYPE_PRESSURE,
+ .file_offset = offsetof(struct cgroup, psi_files[PSI_CPU]),
.seq_show = cgroup_cpu_pressure_show,
.write = cgroup_cpu_pressure_write,
.poll = cgroup_pressure_poll,
.release = cgroup_pressure_release,
},
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+ {
+ .name = "irq.pressure",
+ .file_offset = offsetof(struct cgroup, psi_files[PSI_IRQ]),
+ .seq_show = cgroup_irq_pressure_show,
+ .write = cgroup_irq_pressure_write,
+ .poll = cgroup_pressure_poll,
+ .release = cgroup_pressure_release,
+ },
+#endif
+ {
+ .name = "cgroup.pressure",
+ .seq_show = cgroup_pressure_show,
+ .write = cgroup_pressure_write,
+ },
#endif /* CONFIG_PSI */
{ } /* terminate */
};
@@ -5052,6 +5363,7 @@ static void css_free_rwork_fn(struct work_struct *work)
atomic_dec(&cgrp->root->nr_cgrps);
cgroup1_pidlist_destroy_all(cgrp);
cancel_work_sync(&cgrp->release_agent_work);
+ bpf_cgrp_storage_free(cgrp);
if (cgroup_parent(cgrp)) {
/*
@@ -5083,7 +5395,7 @@ static void css_release_work_fn(struct work_struct *work)
struct cgroup_subsys *ss = css->ss;
struct cgroup *cgrp = css->cgroup;
- mutex_lock(&cgroup_mutex);
+ cgroup_lock();
css->flags |= CSS_RELEASED;
list_del_rcu(&css->sibling);
@@ -5124,7 +5436,7 @@ static void css_release_work_fn(struct work_struct *work)
NULL);
}
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn);
queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork);
@@ -5277,8 +5589,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
int ret;
/* allocate the cgroup and its ID, 0 is reserved for the root */
- cgrp = kzalloc(struct_size(cgrp, ancestor_ids, (level + 1)),
- GFP_KERNEL);
+ cgrp = kzalloc(struct_size(cgrp, ancestors, (level + 1)), GFP_KERNEL);
if (!cgrp)
return ERR_PTR(-ENOMEM);
@@ -5330,7 +5641,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
spin_lock_irq(&css_set_lock);
for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
- cgrp->ancestor_ids[tcgrp->level] = cgroup_id(tcgrp);
+ cgrp->ancestors[tcgrp->level] = tcgrp;
if (tcgrp != cgrp) {
tcgrp->nr_descendants++;
@@ -5473,7 +5784,7 @@ static void css_killed_work_fn(struct work_struct *work)
struct cgroup_subsys_state *css =
container_of(work, struct cgroup_subsys_state, destroy_work);
- mutex_lock(&cgroup_mutex);
+ cgroup_lock();
do {
offline_css(css);
@@ -5482,7 +5793,7 @@ static void css_killed_work_fn(struct work_struct *work)
css = css->parent;
} while (css && atomic_dec_and_test(&css->online_cnt));
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
}
/* css kill confirmation processing requires process context, bounce */
@@ -5610,7 +5921,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
css_clear_dir(&cgrp->self);
kernfs_remove(cgrp->kn);
- if (parent && cgroup_is_threaded(cgrp))
+ if (cgroup_is_threaded(cgrp))
parent->nr_threaded_children--;
spin_lock_irq(&css_set_lock);
@@ -5666,14 +5977,14 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
pr_debug("Initializing cgroup subsys %s\n", ss->name);
- mutex_lock(&cgroup_mutex);
+ cgroup_lock();
idr_init(&ss->css_idr);
INIT_LIST_HEAD(&ss->cfts);
/* Create the root cgroup state for this subsystem */
ss->root = &cgrp_dfl_root;
- css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss));
+ css = ss->css_alloc(NULL);
/* We don't handle early failures gracefully */
BUG_ON(IS_ERR(css));
init_and_link_css(css, ss, &cgrp_dfl_root.cgrp);
@@ -5710,7 +6021,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
BUG_ON(online_css(css));
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
}
/**
@@ -5763,19 +6074,14 @@ int __init cgroup_init(void)
BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16);
BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
+ BUG_ON(cgroup_init_cftypes(NULL, cgroup_psi_files));
BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files));
cgroup_rstat_boot();
- /*
- * The latency of the synchronize_rcu() is too high for cgroups,
- * avoid it at the cost of forcing all readers into the slow path.
- */
- rcu_sync_enter_start(&cgroup_threadgroup_rwsem.rss);
-
get_user_ns(init_cgroup_ns.user_ns);
- mutex_lock(&cgroup_mutex);
+ cgroup_lock();
/*
* Add init_css_set to the hash table so that dfl_root can link to
@@ -5786,7 +6092,7 @@ int __init cgroup_init(void)
BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0));
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
for_each_subsys(ss, ssid) {
if (ss->early_init) {
@@ -5838,9 +6144,9 @@ int __init cgroup_init(void)
if (ss->bind)
ss->bind(init_css_set.subsys[ssid]);
- mutex_lock(&cgroup_mutex);
+ cgroup_lock();
css_populate_dir(init_css_set.subsys[ssid]);
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
}
/* init_css_set.subsys[] has been updated, re-hash */
@@ -5889,24 +6195,41 @@ void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen)
/*
* cgroup_get_from_id : get the cgroup associated with cgroup id
* @id: cgroup id
- * On success return the cgrp, on failure return NULL
+ * On success return the cgrp or ERR_PTR on failure
+ * Only cgroups within current task's cgroup NS are valid.
*/
struct cgroup *cgroup_get_from_id(u64 id)
{
struct kernfs_node *kn;
- struct cgroup *cgrp = NULL;
+ struct cgroup *cgrp, *root_cgrp;
- mutex_lock(&cgroup_mutex);
kn = kernfs_find_and_get_node_by_id(cgrp_dfl_root.kf_root, id);
if (!kn)
- goto out_unlock;
+ return ERR_PTR(-ENOENT);
+
+ if (kernfs_type(kn) != KERNFS_DIR) {
+ kernfs_put(kn);
+ return ERR_PTR(-ENOENT);
+ }
- cgrp = kn->priv;
- if (cgroup_is_dead(cgrp) || !cgroup_tryget(cgrp))
+ rcu_read_lock();
+
+ cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);
+ if (cgrp && !cgroup_tryget(cgrp))
cgrp = NULL;
+
+ rcu_read_unlock();
kernfs_put(kn);
-out_unlock:
- mutex_unlock(&cgroup_mutex);
+
+ if (!cgrp)
+ return ERR_PTR(-ENOENT);
+
+ root_cgrp = current_cgns_cgroup_dfl();
+ if (!cgroup_is_descendant(cgrp, root_cgrp)) {
+ cgroup_put(cgrp);
+ return ERR_PTR(-ENOENT);
+ }
+
return cgrp;
}
EXPORT_SYMBOL_GPL(cgroup_get_from_id);
@@ -5928,7 +6251,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
if (!buf)
goto out;
- mutex_lock(&cgroup_mutex);
+ cgroup_lock();
spin_lock_irq(&css_set_lock);
for_each_root(root) {
@@ -5936,7 +6259,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
struct cgroup *cgrp;
int ssid, count = 0;
- if (root == &cgrp_dfl_root && !cgrp_dfl_visible)
+ if (root == &cgrp_dfl_root && !READ_ONCE(cgrp_dfl_visible))
continue;
seq_printf(m, "%d:", root->hierarchy_id);
@@ -5983,7 +6306,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
retval = 0;
out_unlock:
spin_unlock_irq(&css_set_lock);
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
kfree(buf);
out:
return retval;
@@ -6002,16 +6325,37 @@ void cgroup_fork(struct task_struct *child)
INIT_LIST_HEAD(&child->cg_list);
}
-static struct cgroup *cgroup_get_from_file(struct file *f)
+/**
+ * cgroup_v1v2_get_from_file - get a cgroup pointer from a file pointer
+ * @f: file corresponding to cgroup_dir
+ *
+ * Find the cgroup from a file pointer associated with a cgroup directory.
+ * Returns a pointer to the cgroup on success. ERR_PTR is returned if the
+ * cgroup cannot be found.
+ */
+static struct cgroup *cgroup_v1v2_get_from_file(struct file *f)
{
struct cgroup_subsys_state *css;
- struct cgroup *cgrp;
css = css_tryget_online_from_dir(f->f_path.dentry, NULL);
if (IS_ERR(css))
return ERR_CAST(css);
- cgrp = css->cgroup;
+ return css->cgroup;
+}
+
+/**
+ * cgroup_get_from_file - same as cgroup_v1v2_get_from_file, but only supports
+ * cgroup2.
+ * @f: file corresponding to cgroup2_dir
+ */
+static struct cgroup *cgroup_get_from_file(struct file *f)
+{
+ struct cgroup *cgrp = cgroup_v1v2_get_from_file(f);
+
+ if (IS_ERR(cgrp))
+ return ERR_CAST(cgrp);
+
if (!cgroup_on_dfl(cgrp)) {
cgroup_put(cgrp);
return ERR_PTR(-EBADF);
@@ -6046,7 +6390,7 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
struct file *f;
if (kargs->flags & CLONE_INTO_CGROUP)
- mutex_lock(&cgroup_mutex);
+ cgroup_lock();
cgroup_threadgroup_change_begin(current);
@@ -6088,8 +6432,23 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
if (ret)
goto err;
+ /*
+ * Spawning a task directly into a cgroup works by passing a file
+ * descriptor to the target cgroup directory. This can even be an O_PATH
+ * file descriptor. But it can never be a cgroup.procs file descriptor.
+ * This was done on purpose so spawning into a cgroup could be
+ * conceptualized as an atomic
+ *
+ * fd = openat(dfd_cgroup, "cgroup.procs", ...);
+ * write(fd, <child-pid>, ...);
+ *
+ * sequence, i.e. it's a shorthand for the caller opening and writing
+ * cgroup.procs of the cgroup indicated by @dfd_cgroup. This allows us
+ * to always use the caller's credentials.
+ */
ret = cgroup_attach_permissions(cset->dfl_cgrp, dst_cgrp, sb,
- !(kargs->flags & CLONE_THREAD));
+ !(kargs->flags & CLONE_THREAD),
+ current->nsproxy->cgroup_ns);
if (ret)
goto err;
@@ -6106,7 +6465,7 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
err:
cgroup_threadgroup_change_end(current);
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
if (f)
fput(f);
if (dst_cgrp)
@@ -6133,7 +6492,7 @@ static void cgroup_css_set_put_fork(struct kernel_clone_args *kargs)
struct cgroup *cgrp = kargs->cgrp;
struct css_set *cset = kargs->cset;
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
if (cset) {
put_css_set(cset);
@@ -6150,6 +6509,7 @@ static void cgroup_css_set_put_fork(struct kernel_clone_args *kargs)
/**
* cgroup_can_fork - called on a new task before the process is exposed
* @child: the child process
+ * @kargs: the arguments passed to create the child process
*
* This prepares a new css_set for the child process which the child will
* be attached to in cgroup_post_fork().
@@ -6212,6 +6572,7 @@ void cgroup_cancel_fork(struct task_struct *child,
/**
* cgroup_post_fork - finalize cgroup setup for the child process
* @child: the child process
+ * @kargs: the arguments passed to create the child process
*
* Attach the child process to its css_set calling the subsystem fork()
* callbacks.
@@ -6459,54 +6820,77 @@ struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
*
* Find the cgroup at @path on the default hierarchy, increment its
* reference count and return it. Returns pointer to the found cgroup on
- * success, ERR_PTR(-ENOENT) if @path doesn't exist and ERR_PTR(-ENOTDIR)
- * if @path points to a non-directory.
+ * success, ERR_PTR(-ENOENT) if @path doesn't exist or if the cgroup has already
+ * been released and ERR_PTR(-ENOTDIR) if @path points to a non-directory.
*/
struct cgroup *cgroup_get_from_path(const char *path)
{
struct kernfs_node *kn;
- struct cgroup *cgrp;
+ struct cgroup *cgrp = ERR_PTR(-ENOENT);
+ struct cgroup *root_cgrp;
- mutex_lock(&cgroup_mutex);
+ root_cgrp = current_cgns_cgroup_dfl();
+ kn = kernfs_walk_and_get(root_cgrp->kn, path);
+ if (!kn)
+ goto out;
- kn = kernfs_walk_and_get(cgrp_dfl_root.cgrp.kn, path);
- if (kn) {
- if (kernfs_type(kn) == KERNFS_DIR) {
- cgrp = kn->priv;
- cgroup_get_live(cgrp);
- } else {
- cgrp = ERR_PTR(-ENOTDIR);
- }
- kernfs_put(kn);
- } else {
- cgrp = ERR_PTR(-ENOENT);
+ if (kernfs_type(kn) != KERNFS_DIR) {
+ cgrp = ERR_PTR(-ENOTDIR);
+ goto out_kernfs;
}
- mutex_unlock(&cgroup_mutex);
+ rcu_read_lock();
+
+ cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);
+ if (!cgrp || !cgroup_tryget(cgrp))
+ cgrp = ERR_PTR(-ENOENT);
+
+ rcu_read_unlock();
+
+out_kernfs:
+ kernfs_put(kn);
+out:
return cgrp;
}
EXPORT_SYMBOL_GPL(cgroup_get_from_path);
/**
- * cgroup_get_from_fd - get a cgroup pointer from a fd
- * @fd: fd obtained by open(cgroup2_dir)
+ * cgroup_v1v2_get_from_fd - get a cgroup pointer from a fd
+ * @fd: fd obtained by open(cgroup_dir)
*
* Find the cgroup from a fd which should be obtained
* by opening a cgroup directory. Returns a pointer to the
* cgroup on success. ERR_PTR is returned if the cgroup
* cannot be found.
*/
-struct cgroup *cgroup_get_from_fd(int fd)
+struct cgroup *cgroup_v1v2_get_from_fd(int fd)
{
struct cgroup *cgrp;
- struct file *f;
-
- f = fget_raw(fd);
- if (!f)
+ struct fd f = fdget_raw(fd);
+ if (!f.file)
return ERR_PTR(-EBADF);
- cgrp = cgroup_get_from_file(f);
- fput(f);
+ cgrp = cgroup_v1v2_get_from_file(f.file);
+ fdput(f);
+ return cgrp;
+}
+
+/**
+ * cgroup_get_from_fd - same as cgroup_v1v2_get_from_fd, but only supports
+ * cgroup2.
+ * @fd: fd obtained by open(cgroup2_dir)
+ */
+struct cgroup *cgroup_get_from_fd(int fd)
+{
+ struct cgroup *cgrp = cgroup_v1v2_get_from_fd(fd);
+
+ if (IS_ERR(cgrp))
+ return ERR_CAST(cgrp);
+
+ if (!cgroup_on_dfl(cgrp)) {
+ cgroup_put(cgrp);
+ return ERR_PTR(-EBADF);
+ }
return cgrp;
}
EXPORT_SYMBOL_GPL(cgroup_get_from_fd);
@@ -6559,118 +6943,57 @@ int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v)
*/
#ifdef CONFIG_SOCK_CGROUP_DATA
-#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID)
-
-DEFINE_SPINLOCK(cgroup_sk_update_lock);
-static bool cgroup_sk_alloc_disabled __read_mostly;
-
-void cgroup_sk_alloc_disable(void)
-{
- if (cgroup_sk_alloc_disabled)
- return;
- pr_info("cgroup: disabling cgroup2 socket matching due to net_prio or net_cls activation\n");
- cgroup_sk_alloc_disabled = true;
-}
-
-#else
-
-#define cgroup_sk_alloc_disabled false
-
-#endif
-
void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
{
- if (cgroup_sk_alloc_disabled) {
- skcd->no_refcnt = 1;
- return;
- }
-
- /* Don't associate the sock with unrelated interrupted task's cgroup. */
- if (in_interrupt())
- return;
+ struct cgroup *cgroup;
rcu_read_lock();
+ /* Don't associate the sock with unrelated interrupted task's cgroup. */
+ if (in_interrupt()) {
+ cgroup = &cgrp_dfl_root.cgrp;
+ cgroup_get(cgroup);
+ goto out;
+ }
while (true) {
struct css_set *cset;
cset = task_css_set(current);
if (likely(cgroup_tryget(cset->dfl_cgrp))) {
- skcd->val = (unsigned long)cset->dfl_cgrp;
- cgroup_bpf_get(cset->dfl_cgrp);
+ cgroup = cset->dfl_cgrp;
break;
}
cpu_relax();
}
-
+out:
+ skcd->cgroup = cgroup;
+ cgroup_bpf_get(cgroup);
rcu_read_unlock();
}
void cgroup_sk_clone(struct sock_cgroup_data *skcd)
{
- if (skcd->val) {
- if (skcd->no_refcnt)
- return;
- /*
- * We might be cloning a socket which is left in an empty
- * cgroup and the cgroup might have already been rmdir'd.
- * Don't use cgroup_get_live().
- */
- cgroup_get(sock_cgroup_ptr(skcd));
- cgroup_bpf_get(sock_cgroup_ptr(skcd));
- }
+ struct cgroup *cgrp = sock_cgroup_ptr(skcd);
+
+ /*
+ * We might be cloning a socket which is left in an empty
+ * cgroup and the cgroup might have already been rmdir'd.
+ * Don't use cgroup_get_live().
+ */
+ cgroup_get(cgrp);
+ cgroup_bpf_get(cgrp);
}
void cgroup_sk_free(struct sock_cgroup_data *skcd)
{
struct cgroup *cgrp = sock_cgroup_ptr(skcd);
- if (skcd->no_refcnt)
- return;
cgroup_bpf_put(cgrp);
cgroup_put(cgrp);
}
#endif /* CONFIG_SOCK_CGROUP_DATA */
-#ifdef CONFIG_CGROUP_BPF
-int cgroup_bpf_attach(struct cgroup *cgrp,
- struct bpf_prog *prog, struct bpf_prog *replace_prog,
- struct bpf_cgroup_link *link,
- enum bpf_attach_type type,
- u32 flags)
-{
- int ret;
-
- mutex_lock(&cgroup_mutex);
- ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, link, type, flags);
- mutex_unlock(&cgroup_mutex);
- return ret;
-}
-
-int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
- enum bpf_attach_type type)
-{
- int ret;
-
- mutex_lock(&cgroup_mutex);
- ret = __cgroup_bpf_detach(cgrp, prog, NULL, type);
- mutex_unlock(&cgroup_mutex);
- return ret;
-}
-
-int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
- union bpf_attr __user *uattr)
-{
- int ret;
-
- mutex_lock(&cgroup_mutex);
- ret = __cgroup_bpf_query(cgrp, attr, uattr);
- mutex_unlock(&cgroup_mutex);
- return ret;
-}
-#endif /* CONFIG_CGROUP_BPF */
-
#ifdef CONFIG_SYSFS
static ssize_t show_delegatable_files(struct cftype *files, char *buf,
ssize_t size, const char *prefix)
@@ -6682,9 +7005,6 @@ static ssize_t show_delegatable_files(struct cftype *files, char *buf,
if (!(cft->flags & CFTYPE_NS_DELEGATABLE))
continue;
- if ((cft->flags & CFTYPE_PRESSURE) && !cgroup_psi_enabled())
- continue;
-
if (prefix)
ret += snprintf(buf + ret, size - ret, "%s.", prefix);
@@ -6704,8 +7024,11 @@ static ssize_t delegate_show(struct kobject *kobj, struct kobj_attribute *attr,
int ssid;
ssize_t ret = 0;
- ret = show_delegatable_files(cgroup_base_files, buf, PAGE_SIZE - ret,
- NULL);
+ ret = show_delegatable_files(cgroup_base_files, buf + ret,
+ PAGE_SIZE - ret, NULL);
+ if (cgroup_psi_enabled())
+ ret += show_delegatable_files(cgroup_psi_files, buf + ret,
+ PAGE_SIZE - ret, NULL);
for_each_subsys(ss, ssid)
ret += show_delegatable_files(ss->dfl_cftypes, buf + ret,
@@ -6721,6 +7044,7 @@ static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr,
{
return snprintf(buf, PAGE_SIZE,
"nsdelegate\n"
+ "favordynmods\n"
"memory_localevents\n"
"memory_recursiveprot\n");
}
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index adb5190c4429..e4ca2dd2b764 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -33,6 +33,7 @@
#include <linux/interrupt.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
+#include <linux/kthread.h>
#include <linux/list.h>
#include <linux/mempolicy.h>
#include <linux/mm.h>
@@ -69,6 +70,13 @@
DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key);
DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);
+/*
+ * There could be abnormal cpuset configurations for cpu or memory
+ * node binding, add this key to provide a quick low-cost judgment
+ * of the situation.
+ */
+DEFINE_STATIC_KEY_FALSE(cpusets_insane_config_key);
+
/* See "Frequency meter" comments, below. */
struct fmeter {
@@ -78,6 +86,30 @@ struct fmeter {
spinlock_t lock; /* guards read or write of above */
};
+/*
+ * Invalid partition error code
+ */
+enum prs_errcode {
+ PERR_NONE = 0,
+ PERR_INVCPUS,
+ PERR_INVPARENT,
+ PERR_NOTPART,
+ PERR_NOTEXCL,
+ PERR_NOCPUS,
+ PERR_HOTPLUG,
+ PERR_CPUSEMPTY,
+};
+
+static const char * const perr_strings[] = {
+ [PERR_INVCPUS] = "Invalid cpu list in cpuset.cpus",
+ [PERR_INVPARENT] = "Parent is an invalid partition root",
+ [PERR_NOTPART] = "Parent is not a partition root",
+ [PERR_NOTEXCL] = "Cpu list in cpuset.cpus not exclusive",
+ [PERR_NOCPUS] = "Parent unable to distribute cpu downstream",
+ [PERR_HOTPLUG] = "No cpu available due to hotplug",
+ [PERR_CPUSEMPTY] = "cpuset.cpus is empty",
+};
+
struct cpuset {
struct cgroup_subsys_state css;
@@ -160,25 +192,33 @@ struct cpuset {
*/
int use_parent_ecpus;
int child_ecpus_count;
+
+ /* Invalid partition error code, not lock protected */
+ enum prs_errcode prs_err;
+
+ /* Handle for cpuset.cpus.partition */
+ struct cgroup_file partition_file;
};
/*
* Partition root states:
*
- * 0 - not a partition root
- *
+ * 0 - member (not a partition root)
* 1 - partition root
- *
+ * 2 - partition root without load balancing (isolated)
* -1 - invalid partition root
- * None of the cpus in cpus_allowed can be put into the parent's
- * subparts_cpus. In this case, the cpuset is not a real partition
- * root anymore. However, the CPU_EXCLUSIVE bit will still be set
- * and the cpuset can be restored back to a partition root if the
- * parent cpuset can give more CPUs back to this child cpuset.
+ * -2 - invalid isolated partition root
*/
-#define PRS_DISABLED 0
-#define PRS_ENABLED 1
-#define PRS_ERROR -1
+#define PRS_MEMBER 0
+#define PRS_ROOT 1
+#define PRS_ISOLATED 2
+#define PRS_INVALID_ROOT -1
+#define PRS_INVALID_ISOLATED -2
+
+static inline bool is_prs_invalid(int prs_state)
+{
+ return prs_state < 0;
+}
/*
* Temporary cpumasks for working with partitions that are passed among
@@ -258,15 +298,43 @@ static inline int is_spread_slab(const struct cpuset *cs)
return test_bit(CS_SPREAD_SLAB, &cs->flags);
}
-static inline int is_partition_root(const struct cpuset *cs)
+static inline int is_partition_valid(const struct cpuset *cs)
{
return cs->partition_root_state > 0;
}
+static inline int is_partition_invalid(const struct cpuset *cs)
+{
+ return cs->partition_root_state < 0;
+}
+
+/*
+ * Callers should hold callback_lock to modify partition_root_state.
+ */
+static inline void make_partition_invalid(struct cpuset *cs)
+{
+ if (is_partition_valid(cs))
+ cs->partition_root_state = -cs->partition_root_state;
+}
+
+/*
+ * Send notification event of whenever partition_root_state changes.
+ */
+static inline void notify_partition_change(struct cpuset *cs, int old_prs)
+{
+ if (old_prs == cs->partition_root_state)
+ return;
+ cgroup_file_notify(&cs->partition_file);
+
+ /* Reset prs_err if not invalid */
+ if (is_partition_valid(cs))
+ WRITE_ONCE(cs->prs_err, PERR_NONE);
+}
+
static struct cpuset top_cpuset = {
.flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) |
(1 << CS_MEM_EXCLUSIVE)),
- .partition_root_state = PRS_ENABLED,
+ .partition_root_state = PRS_ROOT,
};
/**
@@ -298,17 +366,19 @@ static struct cpuset top_cpuset = {
if (is_cpuset_online(((des_cs) = css_cs((pos_css)))))
/*
- * There are two global locks guarding cpuset structures - cpuset_mutex and
+ * There are two global locks guarding cpuset structures - cpuset_rwsem and
* callback_lock. We also require taking task_lock() when dereferencing a
* task's cpuset pointer. See "The task_lock() exception", at the end of this
- * comment.
+ * comment. The cpuset code uses only cpuset_rwsem write lock. Other
+ * kernel subsystems can use cpuset_read_lock()/cpuset_read_unlock() to
+ * prevent change to cpuset structures.
*
* A task must hold both locks to modify cpusets. If a task holds
- * cpuset_mutex, then it blocks others wanting that mutex, ensuring that it
+ * cpuset_rwsem, it blocks others wanting that rwsem, ensuring that it
* is the only task able to also acquire callback_lock and be able to
* modify cpusets. It can perform various checks on the cpuset structure
* first, knowing nothing will change. It can also allocate memory while
- * just holding cpuset_mutex. While it is performing these checks, various
+ * just holding cpuset_rwsem. While it is performing these checks, various
* callback routines can briefly acquire callback_lock to query cpusets.
* Once it is ready to make the changes, it takes callback_lock, blocking
* everyone else.
@@ -357,6 +427,17 @@ static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);
static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);
+static inline void check_insane_mems_config(nodemask_t *nodes)
+{
+ if (!cpusets_insane_config() &&
+ movable_only_nodes(nodes)) {
+ static_branch_enable(&cpusets_insane_config_key);
+ pr_info("Unsupported (movable nodes only) cpuset configuration detected (nmask=%*pbl)!\n"
+ "Cpuset allocations might fail even with a lot of memory available.\n",
+ nodemask_pr_args(nodes));
+ }
+}
+
/*
* Cgroup v2 behavior is used on the "cpus" and "mems" control files when
* on default hierarchy or when the cpuset_v2_mode flag is set by mounting
@@ -371,19 +452,65 @@ static inline bool is_in_v2_mode(void)
(cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE);
}
+/**
+ * partition_is_populated - check if partition has tasks
+ * @cs: partition root to be checked
+ * @excluded_child: a child cpuset to be excluded in task checking
+ * Return: true if there are tasks, false otherwise
+ *
+ * It is assumed that @cs is a valid partition root. @excluded_child should
+ * be non-NULL when this cpuset is going to become a partition itself.
+ */
+static inline bool partition_is_populated(struct cpuset *cs,
+ struct cpuset *excluded_child)
+{
+ struct cgroup_subsys_state *css;
+ struct cpuset *child;
+
+ if (cs->css.cgroup->nr_populated_csets)
+ return true;
+ if (!excluded_child && !cs->nr_subparts_cpus)
+ return cgroup_is_populated(cs->css.cgroup);
+
+ rcu_read_lock();
+ cpuset_for_each_child(child, css, cs) {
+ if (child == excluded_child)
+ continue;
+ if (is_partition_valid(child))
+ continue;
+ if (cgroup_is_populated(child->css.cgroup)) {
+ rcu_read_unlock();
+ return true;
+ }
+ }
+ rcu_read_unlock();
+ return false;
+}
+
/*
- * Return in pmask the portion of a cpusets's cpus_allowed that
- * are online. If none are online, walk up the cpuset hierarchy
- * until we find one that does have some online cpus.
+ * Return in pmask the portion of a task's cpusets's cpus_allowed that
+ * are online and are capable of running the task. If none are found,
+ * walk up the cpuset hierarchy until we find one that does have some
+ * appropriate cpus.
*
* One way or another, we guarantee to return some non-empty subset
* of cpu_online_mask.
*
- * Call with callback_lock or cpuset_mutex held.
+ * Call with callback_lock or cpuset_rwsem held.
*/
-static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
+static void guarantee_online_cpus(struct task_struct *tsk,
+ struct cpumask *pmask)
{
- while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask)) {
+ const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);
+ struct cpuset *cs;
+
+ if (WARN_ON(!cpumask_and(pmask, possible_mask, cpu_online_mask)))
+ cpumask_copy(pmask, cpu_online_mask);
+
+ rcu_read_lock();
+ cs = task_cs(tsk);
+
+ while (!cpumask_intersects(cs->effective_cpus, pmask)) {
cs = parent_cs(cs);
if (unlikely(!cs)) {
/*
@@ -393,11 +520,13 @@ static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
* cpuset's effective_cpus is on its way to be
* identical to cpu_online_mask.
*/
- cpumask_copy(pmask, cpu_online_mask);
- return;
+ goto out_unlock;
}
}
- cpumask_and(pmask, cs->effective_cpus, cpu_online_mask);
+ cpumask_and(pmask, pmask, cs->effective_cpus);
+
+out_unlock:
+ rcu_read_unlock();
}
/*
@@ -409,7 +538,7 @@ static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
* One way or another, we guarantee to return some non-empty subset
* of node_states[N_MEMORY].
*
- * Call with callback_lock or cpuset_mutex held.
+ * Call with callback_lock or cpuset_rwsem held.
*/
static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
{
@@ -421,11 +550,15 @@ static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
/*
* update task's spread flag if cpuset's page/slab spread flag is set
*
- * Call with callback_lock or cpuset_mutex held.
+ * Call with callback_lock or cpuset_rwsem held. The check can be skipped
+ * if on default hierarchy.
*/
-static void cpuset_update_task_spread_flag(struct cpuset *cs,
+static void cpuset_update_task_spread_flags(struct cpuset *cs,
struct task_struct *tsk)
{
+ if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
+ return;
+
if (is_spread_page(cs))
task_set_spread_page(tsk);
else
@@ -442,7 +575,7 @@ static void cpuset_update_task_spread_flag(struct cpuset *cs,
*
* One cpuset is a subset of another if all its allowed CPUs and
* Memory Nodes are a subset of the other, and its exclusive flags
- * are only set if the other's are set. Call holding cpuset_mutex.
+ * are only set if the other's are set. Call holding cpuset_rwsem.
*/
static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
@@ -545,13 +678,42 @@ static inline void free_cpuset(struct cpuset *cs)
}
/*
+ * validate_change_legacy() - Validate conditions specific to legacy (v1)
+ * behavior.
+ */
+static int validate_change_legacy(struct cpuset *cur, struct cpuset *trial)
+{
+ struct cgroup_subsys_state *css;
+ struct cpuset *c, *par;
+ int ret;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ /* Each of our child cpusets must be a subset of us */
+ ret = -EBUSY;
+ cpuset_for_each_child(c, css, cur)
+ if (!is_cpuset_subset(c, trial))
+ goto out;
+
+ /* On legacy hierarchy, we must be a subset of our parent cpuset. */
+ ret = -EACCES;
+ par = parent_cs(cur);
+ if (par && !is_cpuset_subset(trial, par))
+ goto out;
+
+ ret = 0;
+out:
+ return ret;
+}
+
+/*
* validate_change() - Used to validate that any proposed cpuset change
* follows the structural rules for cpusets.
*
* If we replaced the flag and mask values of the current cpuset
* (cur) with those values in the trial cpuset (trial), would
* our various subset and exclusive rules still be valid? Presumes
- * cpuset_mutex held.
+ * cpuset_rwsem held.
*
* 'cur' is the address of an actual, in-use cpuset. Operations
* such as list traversal that depend on the actual address of the
@@ -568,44 +730,21 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
{
struct cgroup_subsys_state *css;
struct cpuset *c, *par;
- int ret;
+ int ret = 0;
rcu_read_lock();
- /* Each of our child cpusets must be a subset of us */
- ret = -EBUSY;
- cpuset_for_each_child(c, css, cur)
- if (!is_cpuset_subset(c, trial))
- goto out;
+ if (!is_in_v2_mode())
+ ret = validate_change_legacy(cur, trial);
+ if (ret)
+ goto out;
/* Remaining checks don't apply to root cpuset */
- ret = 0;
if (cur == &top_cpuset)
goto out;
par = parent_cs(cur);
- /* On legacy hierarchy, we must be a subset of our parent cpuset. */
- ret = -EACCES;
- if (!is_in_v2_mode() && !is_cpuset_subset(trial, par))
- goto out;
-
- /*
- * If either I or some sibling (!= me) is exclusive, we can't
- * overlap
- */
- ret = -EINVAL;
- cpuset_for_each_child(c, css, par) {
- if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
- c != cur &&
- cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
- goto out;
- if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
- c != cur &&
- nodes_intersects(trial->mems_allowed, c->mems_allowed))
- goto out;
- }
-
/*
* Cpusets with tasks - existing or newly being attached - can't
* be changed to have empty cpus_allowed or mems_allowed.
@@ -630,6 +769,22 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
trial->cpus_allowed))
goto out;
+ /*
+ * If either I or some sibling (!= me) is exclusive, we can't
+ * overlap
+ */
+ ret = -EINVAL;
+ cpuset_for_each_child(c, css, par) {
+ if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
+ c != cur &&
+ cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
+ goto out;
+ if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
+ c != cur &&
+ nodes_intersects(trial->mems_allowed, c->mems_allowed))
+ goto out;
+ }
+
ret = 0;
out:
rcu_read_unlock();
@@ -674,7 +829,7 @@ static void update_domain_attr_tree(struct sched_domain_attr *dattr,
rcu_read_unlock();
}
-/* Must be called with cpuset_mutex held. */
+/* Must be called with cpuset_rwsem held. */
static inline int nr_cpusets(void)
{
/* jump label reference count + the top-level cpuset */
@@ -700,7 +855,7 @@ static inline int nr_cpusets(void)
* domains when operating in the severe memory shortage situations
* that could cause allocation failures below.
*
- * Must be called with cpuset_mutex held.
+ * Must be called with cpuset_rwsem held.
*
* The three key local variables below are:
* cp - cpuset pointer, used (together with pos_css) to perform a
@@ -765,7 +920,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
update_domain_attr_tree(dattr, &top_cpuset);
}
cpumask_and(doms[0], top_cpuset.effective_cpus,
- housekeeping_cpumask(HK_FLAG_DOMAIN));
+ housekeeping_cpumask(HK_TYPE_DOMAIN));
goto done;
}
@@ -795,7 +950,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
if (!cpumask_empty(cp->cpus_allowed) &&
!(is_sched_load_balance(cp) &&
cpumask_intersects(cp->cpus_allowed,
- housekeeping_cpumask(HK_FLAG_DOMAIN))))
+ housekeeping_cpumask(HK_TYPE_DOMAIN))))
continue;
if (root_load_balance &&
@@ -807,7 +962,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
csa[csn++] = cp;
/* skip @cp's subtree if not a partition root */
- if (!is_partition_root(cp))
+ if (!is_partition_valid(cp))
pos_css = css_rightmost_descendant(pos_css);
}
rcu_read_unlock();
@@ -884,7 +1039,7 @@ restart:
if (apn == b->pn) {
cpumask_or(dp, dp, b->effective_cpus);
- cpumask_and(dp, dp, housekeeping_cpumask(HK_FLAG_DOMAIN));
+ cpumask_and(dp, dp, housekeeping_cpumask(HK_TYPE_DOMAIN));
if (dattr)
update_domain_attr_tree(dattr + nslot, b);
@@ -979,7 +1134,7 @@ partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
* 'cpus' is removed, then call this routine to rebuild the
* scheduler's dynamic sched domains.
*
- * Call with cpuset_mutex held. Takes get_online_cpus().
+ * Call with cpuset_rwsem held. Takes cpus_read_lock().
*/
static void rebuild_sched_domains_locked(void)
{
@@ -1013,7 +1168,7 @@ static void rebuild_sched_domains_locked(void)
if (top_cpuset.nr_subparts_cpus) {
rcu_read_lock();
cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
- if (!is_partition_root(cs)) {
+ if (!is_partition_valid(cs)) {
pos_css = css_rightmost_descendant(pos_css);
continue;
}
@@ -1040,29 +1195,46 @@ static void rebuild_sched_domains_locked(void)
void rebuild_sched_domains(void)
{
- get_online_cpus();
+ cpus_read_lock();
percpu_down_write(&cpuset_rwsem);
rebuild_sched_domains_locked();
percpu_up_write(&cpuset_rwsem);
- put_online_cpus();
+ cpus_read_unlock();
}
/**
* update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
* @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
+ * @new_cpus: the temp variable for the new effective_cpus mask
*
* Iterate through each task of @cs updating its cpus_allowed to the
- * effective cpuset's. As this function is called with cpuset_mutex held,
- * cpuset membership stays stable.
+ * effective cpuset's. As this function is called with cpuset_rwsem held,
+ * cpuset membership stays stable. For top_cpuset, task_cpu_possible_mask()
+ * is used instead of effective_cpus to make sure all offline CPUs are also
+ * included as hotplug code won't update cpumasks for tasks in top_cpuset.
*/
-static void update_tasks_cpumask(struct cpuset *cs)
+static void update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus)
{
struct css_task_iter it;
struct task_struct *task;
+ bool top_cs = cs == &top_cpuset;
css_task_iter_start(&cs->css, 0, &it);
- while ((task = css_task_iter_next(&it)))
- set_cpus_allowed_ptr(task, cs->effective_cpus);
+ while ((task = css_task_iter_next(&it))) {
+ const struct cpumask *possible_mask = task_cpu_possible_mask(task);
+
+ if (top_cs) {
+ /*
+ * Percpu kthreads in top_cpuset are ignored
+ */
+ if ((task->flags & PF_KTHREAD) && kthread_is_per_cpu(task))
+ continue;
+ cpumask_andnot(new_cpus, possible_mask, cs->subparts_cpus);
+ } else {
+ cpumask_and(new_cpus, possible_mask, cs->effective_cpus);
+ }
+ set_cpus_allowed_ptr(task, new_cpus);
+ }
css_task_iter_end(&it);
}
@@ -1097,15 +1269,18 @@ enum subparts_cmd {
partcmd_enable, /* Enable partition root */
partcmd_disable, /* Disable partition root */
partcmd_update, /* Update parent's subparts_cpus */
+ partcmd_invalidate, /* Make partition invalid */
};
+static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
+ int turning_on);
/**
* update_parent_subparts_cpumask - update subparts_cpus mask of parent cpuset
- * @cpuset: The cpuset that requests change in partition root state
+ * @cs: The cpuset that requests change in partition root state
* @cmd: Partition root state change command
* @newmask: Optional new cpumask for partcmd_update
* @tmp: Temporary addmask and delmask
- * Return: 0, 1 or an error code
+ * Return: 0 or a partition root state error code
*
* For partcmd_enable, the cpuset is being transformed from a non-partition
* root to a partition root. The cpus_allowed mask of the given cpuset will
@@ -1113,42 +1288,39 @@ enum subparts_cmd {
* effective_cpus. The function will return 0 if all the CPUs listed in
* cpus_allowed can be granted or an error code will be returned.
*
- * For partcmd_disable, the cpuset is being transofrmed from a partition
- * root back to a non-partition root. any CPUs in cpus_allowed that are in
+ * For partcmd_disable, the cpuset is being transformed from a partition
+ * root back to a non-partition root. Any CPUs in cpus_allowed that are in
* parent's subparts_cpus will be taken away from that cpumask and put back
- * into parent's effective_cpus. 0 should always be returned.
+ * into parent's effective_cpus. 0 will always be returned.
*
- * For partcmd_update, if the optional newmask is specified, the cpu
- * list is to be changed from cpus_allowed to newmask. Otherwise,
- * cpus_allowed is assumed to remain the same. The cpuset should either
- * be a partition root or an invalid partition root. The partition root
- * state may change if newmask is NULL and none of the requested CPUs can
- * be granted by the parent. The function will return 1 if changes to
- * parent's subparts_cpus and effective_cpus happen or 0 otherwise.
- * Error code should only be returned when newmask is non-NULL.
+ * For partcmd_update, if the optional newmask is specified, the cpu list is
+ * to be changed from cpus_allowed to newmask. Otherwise, cpus_allowed is
+ * assumed to remain the same. The cpuset should either be a valid or invalid
+ * partition root. The partition root state may change from valid to invalid
+ * or vice versa. An error code will only be returned if transitioning from
+ * invalid to valid violates the exclusivity rule.
*
- * The partcmd_enable and partcmd_disable commands are used by
- * update_prstate(). The partcmd_update command is used by
- * update_cpumasks_hier() with newmask NULL and update_cpumask() with
- * newmask set.
+ * For partcmd_invalidate, the current partition will be made invalid.
*
- * The checking is more strict when enabling partition root than the
- * other two commands.
+ * The partcmd_enable and partcmd_disable commands are used by
+ * update_prstate(). An error code may be returned and the caller will check
+ * for error.
*
- * Because of the implicit cpu exclusive nature of a partition root,
- * cpumask changes that violates the cpu exclusivity rule will not be
- * permitted when checked by validate_change(). The validate_change()
- * function will also prevent any changes to the cpu list if it is not
- * a superset of children's cpu lists.
+ * The partcmd_update command is used by update_cpumasks_hier() with newmask
+ * NULL and update_cpumask() with newmask set. The partcmd_invalidate is used
+ * by update_cpumask() with NULL newmask. In both cases, the callers won't
+ * check for error and so partition_root_state and prs_error will be updated
+ * directly.
*/
-static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
+static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd,
struct cpumask *newmask,
struct tmpmasks *tmp)
{
- struct cpuset *parent = parent_cs(cpuset);
+ struct cpuset *parent = parent_cs(cs);
int adding; /* Moving cpus from effective_cpus to subparts_cpus */
int deleting; /* Moving cpus from subparts_cpus to effective_cpus */
- bool part_error = false; /* Partition error? */
+ int old_prs, new_prs;
+ int part_error = PERR_NONE; /* Partition error? */
percpu_rwsem_assert_held(&cpuset_rwsem);
@@ -1157,124 +1329,164 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
* The new cpumask, if present, or the current cpus_allowed must
* not be empty.
*/
- if (!is_partition_root(parent) ||
- (newmask && cpumask_empty(newmask)) ||
- (!newmask && cpumask_empty(cpuset->cpus_allowed)))
- return -EINVAL;
-
- /*
- * Enabling/disabling partition root is not allowed if there are
- * online children.
- */
- if ((cmd != partcmd_update) && css_has_online_children(&cpuset->css))
- return -EBUSY;
-
- /*
- * Enabling partition root is not allowed if not all the CPUs
- * can be granted from parent's effective_cpus or at least one
- * CPU will be left after that.
- */
- if ((cmd == partcmd_enable) &&
- (!cpumask_subset(cpuset->cpus_allowed, parent->effective_cpus) ||
- cpumask_equal(cpuset->cpus_allowed, parent->effective_cpus)))
- return -EINVAL;
+ if (!is_partition_valid(parent)) {
+ return is_partition_invalid(parent)
+ ? PERR_INVPARENT : PERR_NOTPART;
+ }
+ if ((newmask && cpumask_empty(newmask)) ||
+ (!newmask && cpumask_empty(cs->cpus_allowed)))
+ return PERR_CPUSEMPTY;
/*
- * A cpumask update cannot make parent's effective_cpus become empty.
+ * new_prs will only be changed for the partcmd_update and
+ * partcmd_invalidate commands.
*/
adding = deleting = false;
+ old_prs = new_prs = cs->partition_root_state;
if (cmd == partcmd_enable) {
- cpumask_copy(tmp->addmask, cpuset->cpus_allowed);
+ /*
+ * Enabling partition root is not allowed if cpus_allowed
+ * doesn't overlap parent's cpus_allowed.
+ */
+ if (!cpumask_intersects(cs->cpus_allowed, parent->cpus_allowed))
+ return PERR_INVCPUS;
+
+ /*
+ * A parent can be left with no CPU as long as there is no
+ * task directly associated with the parent partition.
+ */
+ if (cpumask_subset(parent->effective_cpus, cs->cpus_allowed) &&
+ partition_is_populated(parent, cs))
+ return PERR_NOCPUS;
+
+ cpumask_copy(tmp->addmask, cs->cpus_allowed);
adding = true;
} else if (cmd == partcmd_disable) {
- deleting = cpumask_and(tmp->delmask, cpuset->cpus_allowed,
+ /*
+ * Need to remove cpus from parent's subparts_cpus for valid
+ * partition root.
+ */
+ deleting = !is_prs_invalid(old_prs) &&
+ cpumask_and(tmp->delmask, cs->cpus_allowed,
+ parent->subparts_cpus);
+ } else if (cmd == partcmd_invalidate) {
+ if (is_prs_invalid(old_prs))
+ return 0;
+
+ /*
+ * Make the current partition invalid. It is assumed that
+ * invalidation is caused by violating cpu exclusivity rule.
+ */
+ deleting = cpumask_and(tmp->delmask, cs->cpus_allowed,
parent->subparts_cpus);
+ if (old_prs > 0) {
+ new_prs = -old_prs;
+ part_error = PERR_NOTEXCL;
+ }
} else if (newmask) {
/*
* partcmd_update with newmask:
*
+ * Compute add/delete mask to/from subparts_cpus
+ *
* delmask = cpus_allowed & ~newmask & parent->subparts_cpus
- * addmask = newmask & parent->effective_cpus
+ * addmask = newmask & parent->cpus_allowed
* & ~parent->subparts_cpus
*/
- cpumask_andnot(tmp->delmask, cpuset->cpus_allowed, newmask);
+ cpumask_andnot(tmp->delmask, cs->cpus_allowed, newmask);
deleting = cpumask_and(tmp->delmask, tmp->delmask,
parent->subparts_cpus);
- cpumask_and(tmp->addmask, newmask, parent->effective_cpus);
+ cpumask_and(tmp->addmask, newmask, parent->cpus_allowed);
adding = cpumask_andnot(tmp->addmask, tmp->addmask,
parent->subparts_cpus);
/*
- * Return error if the new effective_cpus could become empty.
+ * Make partition invalid if parent's effective_cpus could
+ * become empty and there are tasks in the parent.
*/
if (adding &&
- cpumask_equal(parent->effective_cpus, tmp->addmask)) {
- if (!deleting)
- return -EINVAL;
- /*
- * As some of the CPUs in subparts_cpus might have
- * been offlined, we need to compute the real delmask
- * to confirm that.
- */
- if (!cpumask_and(tmp->addmask, tmp->delmask,
- cpu_active_mask))
- return -EINVAL;
- cpumask_copy(tmp->addmask, parent->effective_cpus);
+ cpumask_subset(parent->effective_cpus, tmp->addmask) &&
+ !cpumask_intersects(tmp->delmask, cpu_active_mask) &&
+ partition_is_populated(parent, cs)) {
+ part_error = PERR_NOCPUS;
+ adding = false;
+ deleting = cpumask_and(tmp->delmask, cs->cpus_allowed,
+ parent->subparts_cpus);
}
} else {
/*
* partcmd_update w/o newmask:
*
- * addmask = cpus_allowed & parent->effectiveb_cpus
+ * delmask = cpus_allowed & parent->subparts_cpus
+ * addmask = cpus_allowed & parent->cpus_allowed
+ * & ~parent->subparts_cpus
*
- * Note that parent's subparts_cpus may have been
- * pre-shrunk in case there is a change in the cpu list.
- * So no deletion is needed.
+ * This gets invoked either due to a hotplug event or from
+ * update_cpumasks_hier(). This can cause the state of a
+ * partition root to transition from valid to invalid or vice
+ * versa. So we still need to compute the addmask and delmask.
+
+ * A partition error happens when:
+ * 1) Cpuset is valid partition, but parent does not distribute
+ * out any CPUs.
+ * 2) Parent has tasks and all its effective CPUs will have
+ * to be distributed out.
*/
- adding = cpumask_and(tmp->addmask, cpuset->cpus_allowed,
- parent->effective_cpus);
- part_error = cpumask_equal(tmp->addmask,
- parent->effective_cpus);
+ cpumask_and(tmp->addmask, cs->cpus_allowed,
+ parent->cpus_allowed);
+ adding = cpumask_andnot(tmp->addmask, tmp->addmask,
+ parent->subparts_cpus);
+
+ if ((is_partition_valid(cs) && !parent->nr_subparts_cpus) ||
+ (adding &&
+ cpumask_subset(parent->effective_cpus, tmp->addmask) &&
+ partition_is_populated(parent, cs))) {
+ part_error = PERR_NOCPUS;
+ adding = false;
+ }
+
+ if (part_error && is_partition_valid(cs) &&
+ parent->nr_subparts_cpus)
+ deleting = cpumask_and(tmp->delmask, cs->cpus_allowed,
+ parent->subparts_cpus);
}
+ if (part_error)
+ WRITE_ONCE(cs->prs_err, part_error);
if (cmd == partcmd_update) {
- int prev_prs = cpuset->partition_root_state;
-
/*
- * Check for possible transition between PRS_ENABLED
- * and PRS_ERROR.
+ * Check for possible transition between valid and invalid
+ * partition root.
*/
- switch (cpuset->partition_root_state) {
- case PRS_ENABLED:
+ switch (cs->partition_root_state) {
+ case PRS_ROOT:
+ case PRS_ISOLATED:
if (part_error)
- cpuset->partition_root_state = PRS_ERROR;
+ new_prs = -old_prs;
break;
- case PRS_ERROR:
+ case PRS_INVALID_ROOT:
+ case PRS_INVALID_ISOLATED:
if (!part_error)
- cpuset->partition_root_state = PRS_ENABLED;
+ new_prs = -old_prs;
break;
}
- /*
- * Set part_error if previously in invalid state.
- */
- part_error = (prev_prs == PRS_ERROR);
}
- if (!part_error && (cpuset->partition_root_state == PRS_ERROR))
- return 0; /* Nothing need to be done */
+ if (!adding && !deleting && (new_prs == old_prs))
+ return 0;
- if (cpuset->partition_root_state == PRS_ERROR) {
- /*
- * Remove all its cpus from parent's subparts_cpus.
- */
- adding = false;
- deleting = cpumask_and(tmp->delmask, cpuset->cpus_allowed,
- parent->subparts_cpus);
+ /*
+ * Transitioning between invalid to valid or vice versa may require
+ * changing CS_CPU_EXCLUSIVE and CS_SCHED_LOAD_BALANCE.
+ */
+ if (old_prs != new_prs) {
+ if (is_prs_invalid(old_prs) && !is_cpu_exclusive(cs) &&
+ (update_flag(CS_CPU_EXCLUSIVE, cs, 1) < 0))
+ return PERR_NOTEXCL;
+ if (is_prs_invalid(new_prs) && is_cpu_exclusive(cs))
+ update_flag(CS_CPU_EXCLUSIVE, cs, 0);
}
- if (!adding && !deleting)
- return 0;
-
/*
* Change the parent's subparts_cpus.
* Newly added CPUs will be removed from effective_cpus and
@@ -1299,40 +1511,68 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
}
parent->nr_subparts_cpus = cpumask_weight(parent->subparts_cpus);
+
+ if (old_prs != new_prs)
+ cs->partition_root_state = new_prs;
+
spin_unlock_irq(&callback_lock);
- return cmd == partcmd_update;
+ if (adding || deleting)
+ update_tasks_cpumask(parent, tmp->addmask);
+
+ /*
+ * Set or clear CS_SCHED_LOAD_BALANCE when partcmd_update, if necessary.
+ * rebuild_sched_domains_locked() may be called.
+ */
+ if (old_prs != new_prs) {
+ if (old_prs == PRS_ISOLATED)
+ update_flag(CS_SCHED_LOAD_BALANCE, cs, 1);
+ else if (new_prs == PRS_ISOLATED)
+ update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
+ }
+ notify_partition_change(cs, old_prs);
+ return 0;
}
/*
* update_cpumasks_hier - Update effective cpumasks and tasks in the subtree
* @cs: the cpuset to consider
* @tmp: temp variables for calculating effective_cpus & partition setup
+ * @force: don't skip any descendant cpusets if set
*
* When configured cpumask is changed, the effective cpumasks of this cpuset
* and all its descendants need to be updated.
*
* On legacy hierarchy, effective_cpus will be the same with cpu_allowed.
*
- * Called with cpuset_mutex held
+ * Called with cpuset_rwsem held
*/
-static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp)
+static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
+ bool force)
{
struct cpuset *cp;
struct cgroup_subsys_state *pos_css;
bool need_rebuild_sched_domains = false;
+ int old_prs, new_prs;
rcu_read_lock();
cpuset_for_each_descendant_pre(cp, pos_css, cs) {
struct cpuset *parent = parent_cs(cp);
+ bool update_parent = false;
compute_effective_cpumask(tmp->new_cpus, cp, parent);
/*
* If it becomes empty, inherit the effective mask of the
- * parent, which is guaranteed to have some CPUs.
+ * parent, which is guaranteed to have some CPUs unless
+ * it is a partition root that has explicitly distributed
+ * out all its CPUs.
*/
if (is_in_v2_mode() && cpumask_empty(tmp->new_cpus)) {
+ if (is_partition_valid(cp) &&
+ cpumask_equal(cp->cpus_allowed, cp->subparts_cpus))
+ goto update_parent_subparts;
+
cpumask_copy(tmp->new_cpus, parent->effective_cpus);
if (!cp->use_parent_ecpus) {
cp->use_parent_ecpus = true;
@@ -1346,56 +1586,40 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp)
/*
* Skip the whole subtree if the cpumask remains the same
- * and has no partition root state.
+ * and has no partition root state and force flag not set.
*/
- if (!cp->partition_root_state &&
+ if (!cp->partition_root_state && !force &&
cpumask_equal(tmp->new_cpus, cp->effective_cpus)) {
pos_css = css_rightmost_descendant(pos_css);
continue;
}
+update_parent_subparts:
/*
* update_parent_subparts_cpumask() should have been called
* for cs already in update_cpumask(). We should also call
* update_tasks_cpumask() again for tasks in the parent
* cpuset if the parent's subparts_cpus changes.
*/
- if ((cp != cs) && cp->partition_root_state) {
+ old_prs = new_prs = cp->partition_root_state;
+ if ((cp != cs) && old_prs) {
switch (parent->partition_root_state) {
- case PRS_DISABLED:
- /*
- * If parent is not a partition root or an
- * invalid partition root, clear the state
- * state and the CS_CPU_EXCLUSIVE flag.
- */
- WARN_ON_ONCE(cp->partition_root_state
- != PRS_ERROR);
- cp->partition_root_state = 0;
-
- /*
- * clear_bit() is an atomic operation and
- * readers aren't interested in the state
- * of CS_CPU_EXCLUSIVE anyway. So we can
- * just update the flag without holding
- * the callback_lock.
- */
- clear_bit(CS_CPU_EXCLUSIVE, &cp->flags);
- break;
-
- case PRS_ENABLED:
- if (update_parent_subparts_cpumask(cp, partcmd_update, NULL, tmp))
- update_tasks_cpumask(parent);
+ case PRS_ROOT:
+ case PRS_ISOLATED:
+ update_parent = true;
break;
- case PRS_ERROR:
+ default:
/*
- * When parent is invalid, it has to be too.
+ * When parent is not a partition root or is
+ * invalid, child partition roots become
+ * invalid too.
*/
- cp->partition_root_state = PRS_ERROR;
- if (cp->nr_subparts_cpus) {
- cp->nr_subparts_cpus = 0;
- cpumask_clear(cp->subparts_cpus);
- }
+ if (is_partition_valid(cp))
+ new_prs = -cp->partition_root_state;
+ WRITE_ONCE(cp->prs_err,
+ is_partition_invalid(parent)
+ ? PERR_INVPARENT : PERR_NOTPART);
break;
}
}
@@ -1404,43 +1628,49 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp)
continue;
rcu_read_unlock();
+ if (update_parent) {
+ update_parent_subparts_cpumask(cp, partcmd_update, NULL,
+ tmp);
+ /*
+ * The cpuset partition_root_state may become
+ * invalid. Capture it.
+ */
+ new_prs = cp->partition_root_state;
+ }
+
spin_lock_irq(&callback_lock);
- cpumask_copy(cp->effective_cpus, tmp->new_cpus);
- if (cp->nr_subparts_cpus &&
- (cp->partition_root_state != PRS_ENABLED)) {
+ if (cp->nr_subparts_cpus && !is_partition_valid(cp)) {
+ /*
+ * Put all active subparts_cpus back to effective_cpus.
+ */
+ cpumask_or(tmp->new_cpus, tmp->new_cpus,
+ cp->subparts_cpus);
+ cpumask_and(tmp->new_cpus, tmp->new_cpus,
+ cpu_active_mask);
cp->nr_subparts_cpus = 0;
cpumask_clear(cp->subparts_cpus);
- } else if (cp->nr_subparts_cpus) {
+ }
+
+ cpumask_copy(cp->effective_cpus, tmp->new_cpus);
+ if (cp->nr_subparts_cpus) {
/*
* Make sure that effective_cpus & subparts_cpus
* are mutually exclusive.
- *
- * In the unlikely event that effective_cpus
- * becomes empty. we clear cp->nr_subparts_cpus and
- * let its child partition roots to compete for
- * CPUs again.
*/
cpumask_andnot(cp->effective_cpus, cp->effective_cpus,
cp->subparts_cpus);
- if (cpumask_empty(cp->effective_cpus)) {
- cpumask_copy(cp->effective_cpus, tmp->new_cpus);
- cpumask_clear(cp->subparts_cpus);
- cp->nr_subparts_cpus = 0;
- } else if (!cpumask_subset(cp->subparts_cpus,
- tmp->new_cpus)) {
- cpumask_andnot(cp->subparts_cpus,
- cp->subparts_cpus, tmp->new_cpus);
- cp->nr_subparts_cpus
- = cpumask_weight(cp->subparts_cpus);
- }
}
+
+ cp->partition_root_state = new_prs;
spin_unlock_irq(&callback_lock);
+ notify_partition_change(cp, old_prs);
+
WARN_ON(!is_in_v2_mode() &&
!cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
- update_tasks_cpumask(cp);
+ update_tasks_cpumask(cp, tmp->new_cpus);
/*
* On legacy hierarchy, if the effective cpumask of any non-
@@ -1451,7 +1681,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp)
if (!cpumask_empty(cp->cpus_allowed) &&
is_sched_load_balance(cp) &&
(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
- is_partition_root(cp)))
+ is_partition_valid(cp)))
need_rebuild_sched_domains = true;
rcu_read_lock();
@@ -1475,10 +1705,15 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
struct cpuset *sibling;
struct cgroup_subsys_state *pos_css;
+ percpu_rwsem_assert_held(&cpuset_rwsem);
+
/*
* Check all its siblings and call update_cpumasks_hier()
* if their use_parent_ecpus flag is set in order for them
* to use the right effective_cpus value.
+ *
+ * The update_cpumasks_hier() function may sleep. So we have to
+ * release the RCU read lock before calling it.
*/
rcu_read_lock();
cpuset_for_each_child(sibling, pos_css, parent) {
@@ -1486,8 +1721,13 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
continue;
if (!sibling->use_parent_ecpus)
continue;
+ if (!css_tryget_online(&sibling->css))
+ continue;
- update_cpumasks_hier(sibling, tmp);
+ rcu_read_unlock();
+ update_cpumasks_hier(sibling, tmp, false);
+ rcu_read_lock();
+ css_put(&sibling->css);
}
rcu_read_unlock();
}
@@ -1503,6 +1743,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
{
int retval;
struct tmpmasks tmp;
+ bool invalidate = false;
/* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
if (cs == &top_cpuset)
@@ -1530,43 +1771,88 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
return 0;
- retval = validate_change(cs, trialcs);
- if (retval < 0)
- return retval;
-
#ifdef CONFIG_CPUMASK_OFFSTACK
/*
* Use the cpumasks in trialcs for tmpmasks when they are pointers
* to allocated cpumasks.
+ *
+ * Note that update_parent_subparts_cpumask() uses only addmask &
+ * delmask, but not new_cpus.
*/
tmp.addmask = trialcs->subparts_cpus;
tmp.delmask = trialcs->effective_cpus;
- tmp.new_cpus = trialcs->cpus_allowed;
+ tmp.new_cpus = NULL;
#endif
+ retval = validate_change(cs, trialcs);
+
+ if ((retval == -EINVAL) && cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) {
+ struct cpuset *cp, *parent;
+ struct cgroup_subsys_state *css;
+
+ /*
+ * The -EINVAL error code indicates that partition sibling
+ * CPU exclusivity rule has been violated. We still allow
+ * the cpumask change to proceed while invalidating the
+ * partition. However, any conflicting sibling partitions
+ * have to be marked as invalid too.
+ */
+ invalidate = true;
+ rcu_read_lock();
+ parent = parent_cs(cs);
+ cpuset_for_each_child(cp, css, parent)
+ if (is_partition_valid(cp) &&
+ cpumask_intersects(trialcs->cpus_allowed, cp->cpus_allowed)) {
+ rcu_read_unlock();
+ update_parent_subparts_cpumask(cp, partcmd_invalidate, NULL, &tmp);
+ rcu_read_lock();
+ }
+ rcu_read_unlock();
+ retval = 0;
+ }
+ if (retval < 0)
+ return retval;
+
if (cs->partition_root_state) {
- /* Cpumask of a partition root cannot be empty */
- if (cpumask_empty(trialcs->cpus_allowed))
- return -EINVAL;
- if (update_parent_subparts_cpumask(cs, partcmd_update,
- trialcs->cpus_allowed, &tmp) < 0)
- return -EINVAL;
+ if (invalidate)
+ update_parent_subparts_cpumask(cs, partcmd_invalidate,
+ NULL, &tmp);
+ else
+ update_parent_subparts_cpumask(cs, partcmd_update,
+ trialcs->cpus_allowed, &tmp);
}
+ compute_effective_cpumask(trialcs->effective_cpus, trialcs,
+ parent_cs(cs));
spin_lock_irq(&callback_lock);
cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
/*
- * Make sure that subparts_cpus is a subset of cpus_allowed.
+ * Make sure that subparts_cpus, if not empty, is a subset of
+ * cpus_allowed. Clear subparts_cpus if partition not valid or
+ * empty effective cpus with tasks.
*/
if (cs->nr_subparts_cpus) {
- cpumask_andnot(cs->subparts_cpus, cs->subparts_cpus,
- cs->cpus_allowed);
- cs->nr_subparts_cpus = cpumask_weight(cs->subparts_cpus);
+ if (!is_partition_valid(cs) ||
+ (cpumask_subset(trialcs->effective_cpus, cs->subparts_cpus) &&
+ partition_is_populated(cs, NULL))) {
+ cs->nr_subparts_cpus = 0;
+ cpumask_clear(cs->subparts_cpus);
+ } else {
+ cpumask_and(cs->subparts_cpus, cs->subparts_cpus,
+ cs->cpus_allowed);
+ cs->nr_subparts_cpus = cpumask_weight(cs->subparts_cpus);
+ }
}
spin_unlock_irq(&callback_lock);
- update_cpumasks_hier(cs, &tmp);
+#ifdef CONFIG_CPUMASK_OFFSTACK
+ /* Now trialcs->cpus_allowed is available */
+ tmp.new_cpus = trialcs->cpus_allowed;
+#endif
+
+ /* effective_cpus will be updated here */
+ update_cpumasks_hier(cs, &tmp, false);
if (cs->partition_root_state) {
struct cpuset *parent = parent_cs(cs);
@@ -1612,6 +1898,11 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
{
struct cpuset_migrate_mm_work *mwork;
+ if (nodes_equal(*from, *to)) {
+ mmput(mm);
+ return;
+ }
+
mwork = kzalloc(sizeof(*mwork), GFP_KERNEL);
if (mwork) {
mwork->mm = mm;
@@ -1664,12 +1955,12 @@ static void *cpuset_being_rebound;
* @cs: the cpuset in which each task's mems_allowed mask needs to be changed
*
* Iterate through each task of @cs updating its mems_allowed to the
- * effective cpuset's. As this function is called with cpuset_mutex held,
+ * effective cpuset's. As this function is called with cpuset_rwsem held,
* cpuset membership stays stable.
*/
static void update_tasks_nodemask(struct cpuset *cs)
{
- static nodemask_t newmems; /* protected by cpuset_mutex */
+ static nodemask_t newmems; /* protected by cpuset_rwsem */
struct css_task_iter it;
struct task_struct *task;
@@ -1682,7 +1973,7 @@ static void update_tasks_nodemask(struct cpuset *cs)
* take while holding tasklist_lock. Forks can happen - the
* mpol_dup() cpuset_being_rebound check will catch such forks,
* and rebind their vma mempolicies too. Because we still hold
- * the global cpuset_mutex, we know that no other rebind effort
+ * the global cpuset_rwsem, we know that no other rebind effort
* will be contending for the global variable cpuset_being_rebound.
* It's ok if we rebind the same mm twice; mpol_rebind_mm()
* is idempotent. Also migrate pages in each mm to new nodes.
@@ -1728,7 +2019,7 @@ static void update_tasks_nodemask(struct cpuset *cs)
*
* On legacy hierarchy, effective_mems will be the same with mems_allowed.
*
- * Called with cpuset_mutex held
+ * Called with cpuset_rwsem held
*/
static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
{
@@ -1781,7 +2072,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
* mempolicies and if the cpuset is marked 'memory_migrate',
* migrate the tasks pages to the new memory.
*
- * Call with cpuset_mutex held. May take callback_lock during call.
+ * Call with cpuset_rwsem held. May take callback_lock during call.
* Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
* lock each such tasks mm->mmap_lock, scan its vma's and rebind
* their mempolicies to the cpusets new mems_allowed.
@@ -1828,6 +2119,8 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
if (retval < 0)
goto done;
+ check_insane_mems_config(&trialcs->mems_allowed);
+
spin_lock_irq(&callback_lock);
cs->mems_allowed = trialcs->mems_allowed;
spin_unlock_irq(&callback_lock);
@@ -1871,7 +2164,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
* @cs: the cpuset in which each task's spread flags needs to be changed
*
* Iterate through each task of @cs updating its spread flags. As this
- * function is called with cpuset_mutex held, cpuset membership stays
+ * function is called with cpuset_rwsem held, cpuset membership stays
* stable.
*/
static void update_tasks_flags(struct cpuset *cs)
@@ -1881,7 +2174,7 @@ static void update_tasks_flags(struct cpuset *cs)
css_task_iter_start(&cs->css, 0, &it);
while ((task = css_task_iter_next(&it)))
- cpuset_update_task_spread_flag(cs, task);
+ cpuset_update_task_spread_flags(cs, task);
css_task_iter_end(&it);
}
@@ -1891,7 +2184,7 @@ static void update_tasks_flags(struct cpuset *cs)
* cs: the cpuset to update
* turning_on: whether the flag is being set or cleared
*
- * Call with cpuset_mutex held.
+ * Call with cpuset_rwsem held.
*/
static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
@@ -1935,92 +2228,131 @@ out:
return err;
}
-/*
- * update_prstate - update partititon_root_state
- * cs: the cpuset to update
- * val: 0 - disabled, 1 - enabled
+/**
+ * update_prstate - update partition_root_state
+ * @cs: the cpuset to update
+ * @new_prs: new partition root state
+ * Return: 0 if successful, != 0 if error
*
- * Call with cpuset_mutex held.
+ * Call with cpuset_rwsem held.
*/
-static int update_prstate(struct cpuset *cs, int val)
+static int update_prstate(struct cpuset *cs, int new_prs)
{
- int err;
+ int err = PERR_NONE, old_prs = cs->partition_root_state;
+ bool sched_domain_rebuilt = false;
struct cpuset *parent = parent_cs(cs);
- struct tmpmasks tmp;
+ struct tmpmasks tmpmask;
- if ((val != 0) && (val != 1))
- return -EINVAL;
- if (val == cs->partition_root_state)
+ if (old_prs == new_prs)
return 0;
/*
- * Cannot force a partial or invalid partition root to a full
- * partition root.
+ * For a previously invalid partition root, leave it at being
+ * invalid if new_prs is not "member".
*/
- if (val && cs->partition_root_state)
- return -EINVAL;
+ if (new_prs && is_prs_invalid(old_prs)) {
+ cs->partition_root_state = -new_prs;
+ return 0;
+ }
- if (alloc_cpumasks(NULL, &tmp))
+ if (alloc_cpumasks(NULL, &tmpmask))
return -ENOMEM;
- err = -EINVAL;
- if (!cs->partition_root_state) {
+ if (!old_prs) {
/*
* Turning on partition root requires setting the
* CS_CPU_EXCLUSIVE bit implicitly as well and cpus_allowed
- * cannot be NULL.
+ * cannot be empty.
*/
- if (cpumask_empty(cs->cpus_allowed))
+ if (cpumask_empty(cs->cpus_allowed)) {
+ err = PERR_CPUSEMPTY;
goto out;
+ }
err = update_flag(CS_CPU_EXCLUSIVE, cs, 1);
- if (err)
+ if (err) {
+ err = PERR_NOTEXCL;
goto out;
+ }
err = update_parent_subparts_cpumask(cs, partcmd_enable,
- NULL, &tmp);
+ NULL, &tmpmask);
if (err) {
update_flag(CS_CPU_EXCLUSIVE, cs, 0);
goto out;
}
- cs->partition_root_state = PRS_ENABLED;
+
+ if (new_prs == PRS_ISOLATED) {
+ /*
+ * Disable the load balance flag should not return an
+ * error unless the system is running out of memory.
+ */
+ update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
+ sched_domain_rebuilt = true;
+ }
+ } else if (old_prs && new_prs) {
+ /*
+ * A change in load balance state only, no change in cpumasks.
+ */
+ update_flag(CS_SCHED_LOAD_BALANCE, cs, (new_prs != PRS_ISOLATED));
+ sched_domain_rebuilt = true;
+ goto out; /* Sched domain is rebuilt in update_flag() */
} else {
/*
- * Turning off partition root will clear the
- * CS_CPU_EXCLUSIVE bit.
+ * Switching back to member is always allowed even if it
+ * disables child partitions.
*/
- if (cs->partition_root_state == PRS_ERROR) {
- cs->partition_root_state = 0;
- update_flag(CS_CPU_EXCLUSIVE, cs, 0);
- err = 0;
- goto out;
- }
+ update_parent_subparts_cpumask(cs, partcmd_disable, NULL,
+ &tmpmask);
- err = update_parent_subparts_cpumask(cs, partcmd_disable,
- NULL, &tmp);
- if (err)
- goto out;
-
- cs->partition_root_state = 0;
+ /*
+ * If there are child partitions, they will all become invalid.
+ */
+ if (unlikely(cs->nr_subparts_cpus)) {
+ spin_lock_irq(&callback_lock);
+ cs->nr_subparts_cpus = 0;
+ cpumask_clear(cs->subparts_cpus);
+ compute_effective_cpumask(cs->effective_cpus, cs, parent);
+ spin_unlock_irq(&callback_lock);
+ }
/* Turning off CS_CPU_EXCLUSIVE will not return error */
update_flag(CS_CPU_EXCLUSIVE, cs, 0);
+
+ if (!is_sched_load_balance(cs)) {
+ /* Make sure load balance is on */
+ update_flag(CS_SCHED_LOAD_BALANCE, cs, 1);
+ sched_domain_rebuilt = true;
+ }
}
- /*
- * Update cpumask of parent's tasks except when it is the top
- * cpuset as some system daemons cannot be mapped to other CPUs.
- */
- if (parent != &top_cpuset)
- update_tasks_cpumask(parent);
+ update_tasks_cpumask(parent, tmpmask.new_cpus);
if (parent->child_ecpus_count)
- update_sibling_cpumasks(parent, cs, &tmp);
+ update_sibling_cpumasks(parent, cs, &tmpmask);
- rebuild_sched_domains_locked();
+ if (!sched_domain_rebuilt)
+ rebuild_sched_domains_locked();
out:
- free_cpumasks(NULL, &tmp);
- return err;
+ /*
+ * Make partition invalid if an error happen
+ */
+ if (err)
+ new_prs = -new_prs;
+ spin_lock_irq(&callback_lock);
+ cs->partition_root_state = new_prs;
+ WRITE_ONCE(cs->prs_err, err);
+ spin_unlock_irq(&callback_lock);
+ /*
+ * Update child cpusets, if present.
+ * Force update if switching back to member.
+ */
+ if (!list_empty(&cs->css.children))
+ update_cpumasks_hier(cs, &tmpmask, !new_prs);
+
+ notify_partition_change(cs, old_prs);
+ free_cpumasks(NULL, &tmpmask);
+ return 0;
}
/*
@@ -2126,7 +2458,21 @@ static int fmeter_getrate(struct fmeter *fmp)
static struct cpuset *cpuset_attach_old_cs;
-/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */
+/*
+ * Check to see if a cpuset can accept a new task
+ * For v1, cpus_allowed and mems_allowed can't be empty.
+ * For v2, effective_cpus can't be empty.
+ * Note that in v1, effective_cpus = cpus_allowed.
+ */
+static int cpuset_can_attach_check(struct cpuset *cs)
+{
+ if (cpumask_empty(cs->effective_cpus) ||
+ (!is_in_v2_mode() && nodes_empty(cs->mems_allowed)))
+ return -ENOSPC;
+ return 0;
+}
+
+/* Called by cgroups to determine if a cpuset is usable; cpuset_rwsem held */
static int cpuset_can_attach(struct cgroup_taskset *tset)
{
struct cgroup_subsys_state *css;
@@ -2140,14 +2486,13 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
percpu_down_write(&cpuset_rwsem);
- /* allow moving tasks into an empty cpuset if on default hierarchy */
- ret = -ENOSPC;
- if (!is_in_v2_mode() &&
- (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
+ /* Check to see if task is allowed in the cpuset */
+ ret = cpuset_can_attach_check(cs);
+ if (ret)
goto out_unlock;
cgroup_taskset_for_each(task, css, tset) {
- ret = task_can_attach(task, cs->cpus_allowed);
+ ret = task_can_attach(task, cs->effective_cpus);
if (ret)
goto out_unlock;
ret = security_task_setscheduler(task);
@@ -2160,7 +2505,6 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
* changes which zero cpus/mems_allowed.
*/
cs->attach_in_progress++;
- ret = 0;
out_unlock:
percpu_up_write(&cpuset_rwsem);
return ret;
@@ -2169,60 +2513,90 @@ out_unlock:
static void cpuset_cancel_attach(struct cgroup_taskset *tset)
{
struct cgroup_subsys_state *css;
+ struct cpuset *cs;
cgroup_taskset_first(tset, &css);
+ cs = css_cs(css);
percpu_down_write(&cpuset_rwsem);
- css_cs(css)->attach_in_progress--;
+ cs->attach_in_progress--;
+ if (!cs->attach_in_progress)
+ wake_up(&cpuset_attach_wq);
percpu_up_write(&cpuset_rwsem);
}
/*
- * Protected by cpuset_mutex. cpus_attach is used only by cpuset_attach()
+ * Protected by cpuset_rwsem. cpus_attach is used only by cpuset_attach_task()
* but we can't allocate it dynamically there. Define it global and
* allocate from cpuset_init().
*/
static cpumask_var_t cpus_attach;
+static nodemask_t cpuset_attach_nodemask_to;
+
+static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task)
+{
+ percpu_rwsem_assert_held(&cpuset_rwsem);
+
+ if (cs != &top_cpuset)
+ guarantee_online_cpus(task, cpus_attach);
+ else
+ cpumask_andnot(cpus_attach, task_cpu_possible_mask(task),
+ cs->subparts_cpus);
+ /*
+ * can_attach beforehand should guarantee that this doesn't
+ * fail. TODO: have a better way to handle failure here
+ */
+ WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));
+
+ cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
+ cpuset_update_task_spread_flags(cs, task);
+}
static void cpuset_attach(struct cgroup_taskset *tset)
{
- /* static buf protected by cpuset_mutex */
- static nodemask_t cpuset_attach_nodemask_to;
struct task_struct *task;
struct task_struct *leader;
struct cgroup_subsys_state *css;
struct cpuset *cs;
struct cpuset *oldcs = cpuset_attach_old_cs;
+ bool cpus_updated, mems_updated;
cgroup_taskset_first(tset, &css);
cs = css_cs(css);
+ lockdep_assert_cpus_held(); /* see cgroup_attach_lock() */
percpu_down_write(&cpuset_rwsem);
+ cpus_updated = !cpumask_equal(cs->effective_cpus,
+ oldcs->effective_cpus);
+ mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems);
- /* prepare for attach */
- if (cs == &top_cpuset)
- cpumask_copy(cpus_attach, cpu_possible_mask);
- else
- guarantee_online_cpus(cs, cpus_attach);
+ /*
+ * In the default hierarchy, enabling cpuset in the child cgroups
+ * will trigger a number of cpuset_attach() calls with no change
+ * in effective cpus and mems. In that case, we can optimize out
+ * by skipping the task iteration and update.
+ */
+ if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
+ !cpus_updated && !mems_updated) {
+ cpuset_attach_nodemask_to = cs->effective_mems;
+ goto out;
+ }
guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
- cgroup_taskset_for_each(task, css, tset) {
- /*
- * can_attach beforehand should guarantee that this doesn't
- * fail. TODO: have a better way to handle failure here
- */
- WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));
-
- cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
- cpuset_update_task_spread_flag(cs, task);
- }
+ cgroup_taskset_for_each(task, css, tset)
+ cpuset_attach_task(cs, task);
/*
* Change mm for all threadgroup leaders. This is expensive and may
- * sleep and should be moved outside migration path proper.
+ * sleep and should be moved outside migration path proper. Skip it
+ * if there is no change in effective_mems and CS_MEMORY_MIGRATE is
+ * not set.
*/
cpuset_attach_nodemask_to = cs->effective_mems;
+ if (!is_memory_migrate(cs) && !mems_updated)
+ goto out;
+
cgroup_taskset_for_each_leader(leader, css, tset) {
struct mm_struct *mm = get_task_mm(leader);
@@ -2245,6 +2619,7 @@ static void cpuset_attach(struct cgroup_taskset *tset)
}
}
+out:
cs->old_mems_allowed = cpuset_attach_nodemask_to;
cs->attach_in_progress--;
@@ -2282,7 +2657,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
cpuset_filetype_t type = cft->private;
int retval = 0;
- get_online_cpus();
+ cpus_read_lock();
percpu_down_write(&cpuset_rwsem);
if (!is_cpuset_online(cs)) {
retval = -ENODEV;
@@ -2320,7 +2695,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
}
out_unlock:
percpu_up_write(&cpuset_rwsem);
- put_online_cpus();
+ cpus_read_unlock();
return retval;
}
@@ -2331,7 +2706,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
cpuset_filetype_t type = cft->private;
int retval = -ENODEV;
- get_online_cpus();
+ cpus_read_lock();
percpu_down_write(&cpuset_rwsem);
if (!is_cpuset_online(cs))
goto out_unlock;
@@ -2346,7 +2721,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
}
out_unlock:
percpu_up_write(&cpuset_rwsem);
- put_online_cpus();
+ cpus_read_unlock();
return retval;
}
@@ -2378,14 +2753,14 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
* operation like this one can lead to a deadlock through kernfs
* active_ref protection. Let's break the protection. Losing the
* protection is okay as we check whether @cs is online after
- * grabbing cpuset_mutex anyway. This only happens on the legacy
+ * grabbing cpuset_rwsem anyway. This only happens on the legacy
* hierarchies.
*/
css_get(&cs->css);
kernfs_break_active_protection(of->kn);
flush_work(&cpuset_hotplug_work);
- get_online_cpus();
+ cpus_read_lock();
percpu_down_write(&cpuset_rwsem);
if (!is_cpuset_online(cs))
goto out_unlock;
@@ -2411,7 +2786,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
free_cpuset(trialcs);
out_unlock:
percpu_up_write(&cpuset_rwsem);
- put_online_cpus();
+ cpus_read_unlock();
kernfs_unbreak_active_protection(of->kn);
css_put(&cs->css);
flush_workqueue(cpuset_migrate_mm_wq);
@@ -2507,16 +2882,29 @@ static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
static int sched_partition_show(struct seq_file *seq, void *v)
{
struct cpuset *cs = css_cs(seq_css(seq));
+ const char *err, *type = NULL;
switch (cs->partition_root_state) {
- case PRS_ENABLED:
+ case PRS_ROOT:
seq_puts(seq, "root\n");
break;
- case PRS_DISABLED:
+ case PRS_ISOLATED:
+ seq_puts(seq, "isolated\n");
+ break;
+ case PRS_MEMBER:
seq_puts(seq, "member\n");
break;
- case PRS_ERROR:
- seq_puts(seq, "root invalid\n");
+ case PRS_INVALID_ROOT:
+ type = "root";
+ fallthrough;
+ case PRS_INVALID_ISOLATED:
+ if (!type)
+ type = "isolated";
+ err = perr_strings[READ_ONCE(cs->prs_err)];
+ if (err)
+ seq_printf(seq, "%s invalid (%s)\n", type, err);
+ else
+ seq_printf(seq, "%s invalid\n", type);
break;
}
return 0;
@@ -2535,14 +2923,16 @@ static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf,
* Convert "root" to ENABLED, and convert "member" to DISABLED.
*/
if (!strcmp(buf, "root"))
- val = PRS_ENABLED;
+ val = PRS_ROOT;
else if (!strcmp(buf, "member"))
- val = PRS_DISABLED;
+ val = PRS_MEMBER;
+ else if (!strcmp(buf, "isolated"))
+ val = PRS_ISOLATED;
else
return -EINVAL;
css_get(&cs->css);
- get_online_cpus();
+ cpus_read_lock();
percpu_down_write(&cpuset_rwsem);
if (!is_cpuset_online(cs))
goto out_unlock;
@@ -2550,7 +2940,7 @@ static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf,
retval = update_prstate(cs, val);
out_unlock:
percpu_up_write(&cpuset_rwsem);
- put_online_cpus();
+ cpus_read_unlock();
css_put(&cs->css);
return retval ?: nbytes;
}
@@ -2702,6 +3092,7 @@ static struct cftype dfl_files[] = {
.write = sched_partition_write,
.private = FILE_PARTITION_ROOT,
.flags = CFTYPE_NOT_ON_ROOT,
+ .file_offset = offsetof(struct cpuset, partition_file),
},
{
@@ -2715,11 +3106,15 @@ static struct cftype dfl_files[] = {
};
-/*
- * cpuset_css_alloc - allocate a cpuset css
- * cgrp: control group that the new cpuset will be part of
+/**
+ * cpuset_css_alloc - Allocate a cpuset css
+ * @parent_css: Parent css of the control group that the new cpuset will be
+ * part of
+ * Return: cpuset css on success, -ENOMEM on failure.
+ *
+ * Allocate and initialize a new cpuset css, for non-NULL @parent_css, return
+ * top cpuset css otherwise.
*/
-
static struct cgroup_subsys_state *
cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
{
@@ -2737,12 +3132,16 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
return ERR_PTR(-ENOMEM);
}
- set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
+ __set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
nodes_clear(cs->mems_allowed);
nodes_clear(cs->effective_mems);
fmeter_init(&cs->fmeter);
cs->relax_domain_level = -1;
+ /* Set CS_MEMORY_MIGRATE for default hierarchy */
+ if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
+ __set_bit(CS_MEMORY_MIGRATE, &cs->flags);
+
return &cs->css;
}
@@ -2756,7 +3155,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
if (!parent)
return 0;
- get_online_cpus();
+ cpus_read_lock();
percpu_down_write(&cpuset_rwsem);
set_bit(CS_ONLINE, &cs->flags);
@@ -2782,7 +3181,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
/*
* Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
* set. This flag handling is implemented in cgroup core for
- * histrical reasons - the flag may be specified during mount.
+ * historical reasons - the flag may be specified during mount.
*
* Currently, if any sibling cpusets have exclusive cpus or mem, we
* refuse to clone the configuration - thereby refusing the task to
@@ -2809,7 +3208,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
spin_unlock_irq(&callback_lock);
out_unlock:
percpu_up_write(&cpuset_rwsem);
- put_online_cpus();
+ cpus_read_unlock();
return 0;
}
@@ -2828,10 +3227,10 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
{
struct cpuset *cs = css_cs(css);
- get_online_cpus();
+ cpus_read_lock();
percpu_down_write(&cpuset_rwsem);
- if (is_partition_root(cs))
+ if (is_partition_valid(cs))
update_prstate(cs, 0);
if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
@@ -2849,7 +3248,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
clear_bit(CS_ONLINE, &cs->flags);
percpu_up_write(&cpuset_rwsem);
- put_online_cpus();
+ cpus_read_unlock();
}
static void cpuset_css_free(struct cgroup_subsys_state *css)
@@ -2878,17 +3277,101 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)
}
/*
+ * In case the child is cloned into a cpuset different from its parent,
+ * additional checks are done to see if the move is allowed.
+ */
+static int cpuset_can_fork(struct task_struct *task, struct css_set *cset)
+{
+ struct cpuset *cs = css_cs(cset->subsys[cpuset_cgrp_id]);
+ bool same_cs;
+ int ret;
+
+ rcu_read_lock();
+ same_cs = (cs == task_cs(current));
+ rcu_read_unlock();
+
+ if (same_cs)
+ return 0;
+
+ lockdep_assert_held(&cgroup_mutex);
+ percpu_down_write(&cpuset_rwsem);
+
+ /* Check to see if task is allowed in the cpuset */
+ ret = cpuset_can_attach_check(cs);
+ if (ret)
+ goto out_unlock;
+
+ ret = task_can_attach(task, cs->effective_cpus);
+ if (ret)
+ goto out_unlock;
+
+ ret = security_task_setscheduler(task);
+ if (ret)
+ goto out_unlock;
+
+ /*
+ * Mark attach is in progress. This makes validate_change() fail
+ * changes which zero cpus/mems_allowed.
+ */
+ cs->attach_in_progress++;
+out_unlock:
+ percpu_up_write(&cpuset_rwsem);
+ return ret;
+}
+
+static void cpuset_cancel_fork(struct task_struct *task, struct css_set *cset)
+{
+ struct cpuset *cs = css_cs(cset->subsys[cpuset_cgrp_id]);
+ bool same_cs;
+
+ rcu_read_lock();
+ same_cs = (cs == task_cs(current));
+ rcu_read_unlock();
+
+ if (same_cs)
+ return;
+
+ percpu_down_write(&cpuset_rwsem);
+ cs->attach_in_progress--;
+ if (!cs->attach_in_progress)
+ wake_up(&cpuset_attach_wq);
+ percpu_up_write(&cpuset_rwsem);
+}
+
+/*
* Make sure the new task conform to the current state of its parent,
* which could have been changed by cpuset just after it inherits the
* state from the parent and before it sits on the cgroup's task list.
*/
static void cpuset_fork(struct task_struct *task)
{
- if (task_css_is_root(task, cpuset_cgrp_id))
+ struct cpuset *cs;
+ bool same_cs;
+
+ rcu_read_lock();
+ cs = task_cs(task);
+ same_cs = (cs == task_cs(current));
+ rcu_read_unlock();
+
+ if (same_cs) {
+ if (cs == &top_cpuset)
+ return;
+
+ set_cpus_allowed_ptr(task, current->cpus_ptr);
+ task->mems_allowed = current->mems_allowed;
return;
+ }
+
+ /* CLONE_INTO_CGROUP */
+ percpu_down_write(&cpuset_rwsem);
+ guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
+ cpuset_attach_task(cs, task);
- set_cpus_allowed_ptr(task, current->cpus_ptr);
- task->mems_allowed = current->mems_allowed;
+ cs->attach_in_progress--;
+ if (!cs->attach_in_progress)
+ wake_up(&cpuset_attach_wq);
+
+ percpu_up_write(&cpuset_rwsem);
}
struct cgroup_subsys cpuset_cgrp_subsys = {
@@ -2901,6 +3384,8 @@ struct cgroup_subsys cpuset_cgrp_subsys = {
.attach = cpuset_attach,
.post_attach = cpuset_post_attach,
.bind = cpuset_bind,
+ .can_fork = cpuset_can_fork,
+ .cancel_fork = cpuset_cancel_fork,
.fork = cpuset_fork,
.legacy_cftypes = legacy_files,
.dfl_cftypes = dfl_files,
@@ -2916,8 +3401,6 @@ struct cgroup_subsys cpuset_cgrp_subsys = {
int __init cpuset_init(void)
{
- BUG_ON(percpu_init_rwsem(&cpuset_rwsem));
-
BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL));
BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL));
BUG_ON(!zalloc_cpumask_var(&top_cpuset.subparts_cpus, GFP_KERNEL));
@@ -2979,10 +3462,10 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
/*
* Don't call update_tasks_cpumask() if the cpuset becomes empty,
- * as the tasks will be migratecd to an ancestor.
+ * as the tasks will be migrated to an ancestor.
*/
if (cpus_updated && !cpumask_empty(cs->cpus_allowed))
- update_tasks_cpumask(cs);
+ update_tasks_cpumask(cs, new_cpus);
if (mems_updated && !nodes_empty(cs->mems_allowed))
update_tasks_nodemask(cs);
@@ -3007,7 +3490,8 @@ hotplug_update_tasks(struct cpuset *cs,
struct cpumask *new_cpus, nodemask_t *new_mems,
bool cpus_updated, bool mems_updated)
{
- if (cpumask_empty(new_cpus))
+ /* A partition root is allowed to have empty effective cpus */
+ if (cpumask_empty(new_cpus) && !is_partition_valid(cs))
cpumask_copy(new_cpus, parent_cs(cs)->effective_cpus);
if (nodes_empty(*new_mems))
*new_mems = parent_cs(cs)->effective_mems;
@@ -3018,7 +3502,7 @@ hotplug_update_tasks(struct cpuset *cs,
spin_unlock_irq(&callback_lock);
if (cpus_updated)
- update_tasks_cpumask(cs);
+ update_tasks_cpumask(cs, new_cpus);
if (mems_updated)
update_tasks_nodemask(cs);
}
@@ -3060,7 +3544,7 @@ retry:
goto retry;
}
- parent = parent_cs(cs);
+ parent = parent_cs(cs);
compute_effective_cpumask(&new_cpus, cs, parent);
nodes_and(new_mems, cs->mems_allowed, parent->effective_mems);
@@ -3076,46 +3560,74 @@ retry:
/*
* In the unlikely event that a partition root has empty
- * effective_cpus or its parent becomes erroneous, we have to
- * transition it to the erroneous state.
+ * effective_cpus with tasks, we will have to invalidate child
+ * partitions, if present, by setting nr_subparts_cpus to 0 to
+ * reclaim their cpus.
*/
- if (is_partition_root(cs) && (cpumask_empty(&new_cpus) ||
- (parent->partition_root_state == PRS_ERROR))) {
+ if (cs->nr_subparts_cpus && is_partition_valid(cs) &&
+ cpumask_empty(&new_cpus) && partition_is_populated(cs, NULL)) {
+ spin_lock_irq(&callback_lock);
+ cs->nr_subparts_cpus = 0;
+ cpumask_clear(cs->subparts_cpus);
+ spin_unlock_irq(&callback_lock);
+ compute_effective_cpumask(&new_cpus, cs, parent);
+ }
+
+ /*
+ * Force the partition to become invalid if either one of
+ * the following conditions hold:
+ * 1) empty effective cpus but not valid empty partition.
+ * 2) parent is invalid or doesn't grant any cpus to child
+ * partitions.
+ */
+ if (is_partition_valid(cs) && (!parent->nr_subparts_cpus ||
+ (cpumask_empty(&new_cpus) && partition_is_populated(cs, NULL)))) {
+ int old_prs, parent_prs;
+
+ update_parent_subparts_cpumask(cs, partcmd_disable, NULL, tmp);
if (cs->nr_subparts_cpus) {
+ spin_lock_irq(&callback_lock);
cs->nr_subparts_cpus = 0;
cpumask_clear(cs->subparts_cpus);
+ spin_unlock_irq(&callback_lock);
compute_effective_cpumask(&new_cpus, cs, parent);
}
- /*
- * If the effective_cpus is empty because the child
- * partitions take away all the CPUs, we can keep
- * the current partition and let the child partitions
- * fight for available CPUs.
- */
- if ((parent->partition_root_state == PRS_ERROR) ||
- cpumask_empty(&new_cpus)) {
- update_parent_subparts_cpumask(cs, partcmd_disable,
- NULL, tmp);
- cs->partition_root_state = PRS_ERROR;
+ old_prs = cs->partition_root_state;
+ parent_prs = parent->partition_root_state;
+ if (is_partition_valid(cs)) {
+ spin_lock_irq(&callback_lock);
+ make_partition_invalid(cs);
+ spin_unlock_irq(&callback_lock);
+ if (is_prs_invalid(parent_prs))
+ WRITE_ONCE(cs->prs_err, PERR_INVPARENT);
+ else if (!parent_prs)
+ WRITE_ONCE(cs->prs_err, PERR_NOTPART);
+ else
+ WRITE_ONCE(cs->prs_err, PERR_HOTPLUG);
+ notify_partition_change(cs, old_prs);
}
cpuset_force_rebuild();
}
/*
- * On the other hand, an erroneous partition root may be transitioned
- * back to a regular one or a partition root with no CPU allocated
- * from the parent may change to erroneous.
+ * On the other hand, an invalid partition root may be transitioned
+ * back to a regular one.
*/
- if (is_partition_root(parent) &&
- ((cs->partition_root_state == PRS_ERROR) ||
- !cpumask_intersects(&new_cpus, parent->subparts_cpus)) &&
- update_parent_subparts_cpumask(cs, partcmd_update, NULL, tmp))
- cpuset_force_rebuild();
+ else if (is_partition_valid(parent) && is_partition_invalid(cs)) {
+ update_parent_subparts_cpumask(cs, partcmd_update, NULL, tmp);
+ if (is_partition_valid(cs))
+ cpuset_force_rebuild();
+ }
update_tasks:
cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
mems_updated = !nodes_equal(new_mems, cs->effective_mems);
+ if (!cpus_updated && !mems_updated)
+ goto unlock; /* Hotplug doesn't affect this cpuset */
+
+ if (mems_updated)
+ check_insane_mems_config(&new_mems);
if (is_in_v2_mode())
hotplug_update_tasks(cs, &new_cpus, &new_mems,
@@ -3124,6 +3636,7 @@ update_tasks:
hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems,
cpus_updated, mems_updated);
+unlock:
percpu_up_write(&cpuset_rwsem);
}
@@ -3168,6 +3681,13 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus);
mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems);
+ /*
+ * In the rare case that hotplug removes all the cpus in subparts_cpus,
+ * we assumed that cpus are updated.
+ */
+ if (!cpus_updated && top_cpuset.nr_subparts_cpus)
+ cpus_updated = true;
+
/* synchronize cpus_allowed to cpu_active_mask */
if (cpus_updated) {
spin_lock_irq(&callback_lock);
@@ -3261,11 +3781,6 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
return NOTIFY_OK;
}
-static struct notifier_block cpuset_track_online_nodes_nb = {
- .notifier_call = cpuset_track_online_nodes,
- .priority = 10, /* ??! */
-};
-
/**
* cpuset_init_smp - initialize cpus_allowed
*
@@ -3273,14 +3788,17 @@ static struct notifier_block cpuset_track_online_nodes_nb = {
*/
void __init cpuset_init_smp(void)
{
- cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
- top_cpuset.mems_allowed = node_states[N_MEMORY];
+ /*
+ * cpus_allowd/mems_allowed set to v2 values in the initial
+ * cpuset_bind() call will be reset to v1 values in another
+ * cpuset_bind() call when v1 cpuset is mounted.
+ */
top_cpuset.old_mems_allowed = top_cpuset.mems_allowed;
cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask);
top_cpuset.effective_mems = node_states[N_MEMORY];
- register_hotmemory_notifier(&cpuset_track_online_nodes_nb);
+ hotplug_memory_notifier(cpuset_track_online_nodes, CPUSET_CALLBACK_PRI);
cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0);
BUG_ON(!cpuset_migrate_mm_wq);
@@ -3294,16 +3812,37 @@ void __init cpuset_init_smp(void)
* Description: Returns the cpumask_var_t cpus_allowed of the cpuset
* attached to the specified @tsk. Guaranteed to return some non-empty
* subset of cpu_online_mask, even if this means going outside the
- * tasks cpuset.
+ * tasks cpuset, except when the task is in the top cpuset.
**/
void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
{
unsigned long flags;
+ struct cpuset *cs;
spin_lock_irqsave(&callback_lock, flags);
rcu_read_lock();
- guarantee_online_cpus(task_cs(tsk), pmask);
+
+ cs = task_cs(tsk);
+ if (cs != &top_cpuset)
+ guarantee_online_cpus(tsk, pmask);
+ /*
+ * Tasks in the top cpuset won't get update to their cpumasks
+ * when a hotplug online/offline event happens. So we include all
+ * offline cpus in the allowed cpu list.
+ */
+ if ((cs == &top_cpuset) || cpumask_empty(pmask)) {
+ const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);
+
+ /*
+ * We first exclude cpus allocated to partitions. If there is no
+ * allowable online cpu left, we fall back to all possible cpus.
+ */
+ cpumask_andnot(pmask, possible_mask, top_cpuset.subparts_cpus);
+ if (!cpumask_intersects(pmask, cpu_online_mask))
+ cpumask_copy(pmask, possible_mask);
+ }
+
rcu_read_unlock();
spin_unlock_irqrestore(&callback_lock, flags);
}
@@ -3318,13 +3857,22 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
* which will not contain a sane cpumask during cases such as cpu hotplugging.
* This is the absolute last resort for the scheduler and it is only used if
* _every_ other avenue has been traveled.
+ *
+ * Returns true if the affinity of @tsk was changed, false otherwise.
**/
-void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
+bool cpuset_cpus_allowed_fallback(struct task_struct *tsk)
{
+ const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);
+ const struct cpumask *cs_mask;
+ bool changed = false;
+
rcu_read_lock();
- do_set_cpus_allowed(tsk, is_in_v2_mode() ?
- task_cs(tsk)->cpus_allowed : cpu_possible_mask);
+ cs_mask = task_cs(tsk)->cpus_allowed;
+ if (is_in_v2_mode() && cpumask_subset(cs_mask, possible_mask)) {
+ do_set_cpus_allowed(tsk, cs_mask);
+ changed = true;
+ }
rcu_read_unlock();
/*
@@ -3344,6 +3892,7 @@ void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
* select_fallback_rq() will fix things ups and set cpu_possible_mask
* if required.
*/
+ return changed;
}
void __init cpuset_init_current_mems_allowed(void)
@@ -3399,7 +3948,7 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
return cs;
}
-/**
+/*
* cpuset_node_allowed - Can we allocate on a memory node?
* @node: is this an allowed node?
* @gfp_mask: memory allocation flags
@@ -3439,10 +3988,10 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
* GFP_KERNEL - any node in enclosing hardwalled cpuset ok
* GFP_USER - only nodes in current tasks mems allowed ok.
*/
-bool __cpuset_node_allowed(int node, gfp_t gfp_mask)
+bool cpuset_node_allowed(int node, gfp_t gfp_mask)
{
struct cpuset *cs; /* current cpuset ancestors */
- int allowed; /* is allocation in zone z allowed? */
+ bool allowed; /* is allocation in zone z allowed? */
unsigned long flags;
if (in_interrupt())
@@ -3474,8 +4023,7 @@ bool __cpuset_node_allowed(int node, gfp_t gfp_mask)
}
/**
- * cpuset_mem_spread_node() - On which node to begin search for a file page
- * cpuset_slab_spread_node() - On which node to begin search for a slab page
+ * cpuset_spread_node() - On which node to begin search for a page
*
* If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for
* tasks in a cpuset with is_spread_page or is_spread_slab set),
@@ -3499,12 +4047,14 @@ bool __cpuset_node_allowed(int node, gfp_t gfp_mask)
* is passed an offline node, it will fall back to the local node.
* See kmem_cache_alloc_node().
*/
-
static int cpuset_spread_node(int *rotor)
{
return *rotor = next_node_in(*rotor, current->mems_allowed);
}
+/**
+ * cpuset_mem_spread_node() - On which node to begin search for a file page
+ */
int cpuset_mem_spread_node(void)
{
if (current->cpuset_mem_spread_rotor == NUMA_NO_NODE)
@@ -3514,6 +4064,9 @@ int cpuset_mem_spread_node(void)
return cpuset_spread_node(&current->cpuset_mem_spread_rotor);
}
+/**
+ * cpuset_slab_spread_node() - On which node to begin search for a slab page
+ */
int cpuset_slab_spread_node(void)
{
if (current->cpuset_slab_spread_rotor == NUMA_NO_NODE)
@@ -3522,7 +4075,6 @@ int cpuset_slab_spread_node(void)
return cpuset_spread_node(&current->cpuset_slab_spread_rotor);
}
-
EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
/**
@@ -3571,8 +4123,8 @@ void cpuset_print_current_mems_allowed(void)
int cpuset_memory_pressure_enabled __read_mostly;
-/**
- * cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
+/*
+ * __cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
*
* Keep a running average of the rate of synchronous (direct)
* page reclaim efforts initiated by tasks in each cpuset.
@@ -3587,7 +4139,7 @@ int cpuset_memory_pressure_enabled __read_mostly;
* "memory_pressure". Value displayed is an integer
* representing the recent rate of entry into the synchronous
* (direct) page reclaim by any task attached to the cpuset.
- **/
+ */
void __cpuset_memory_pressure_bump(void)
{
@@ -3603,7 +4155,7 @@ void __cpuset_memory_pressure_bump(void)
* - Used for /proc/<pid>/cpuset.
* - No need to task_lock(tsk) on this tsk->cpuset reference, as it
* doesn't really matter if tsk->cpuset changes after we read it,
- * and we take cpuset_mutex, keeping cpuset_attach() from changing it
+ * and we take cpuset_rwsem, keeping cpuset_attach() from changing it
* anyway.
*/
int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
diff --git a/kernel/cgroup/freezer.c b/kernel/cgroup/freezer.c
index 3984dd6b8ddb..617861a54793 100644
--- a/kernel/cgroup/freezer.c
+++ b/kernel/cgroup/freezer.c
@@ -1,4 +1,4 @@
-//SPDX-License-Identifier: GPL-2.0
+// SPDX-License-Identifier: GPL-2.0
#include <linux/cgroup.h>
#include <linux/sched.h>
#include <linux/sched/task.h>
diff --git a/kernel/cgroup/legacy_freezer.c b/kernel/cgroup/legacy_freezer.c
index 08236798d173..936473203a6b 100644
--- a/kernel/cgroup/legacy_freezer.c
+++ b/kernel/cgroup/legacy_freezer.c
@@ -22,6 +22,7 @@
#include <linux/freezer.h>
#include <linux/seq_file.h>
#include <linux/mutex.h>
+#include <linux/cpu.h>
/*
* A cgroup is freezing if any FREEZING flags are set. FREEZING_SELF is
@@ -113,7 +114,7 @@ static int freezer_css_online(struct cgroup_subsys_state *css)
if (parent && (parent->state & CGROUP_FREEZING)) {
freezer->state |= CGROUP_FREEZING_PARENT | CGROUP_FROZEN;
- atomic_inc(&system_freezing_cnt);
+ static_branch_inc(&freezer_active);
}
mutex_unlock(&freezer_mutex);
@@ -134,7 +135,7 @@ static void freezer_css_offline(struct cgroup_subsys_state *css)
mutex_lock(&freezer_mutex);
if (freezer->state & CGROUP_FREEZING)
- atomic_dec(&system_freezing_cnt);
+ static_branch_dec(&freezer_active);
freezer->state = 0;
@@ -179,6 +180,7 @@ static void freezer_attach(struct cgroup_taskset *tset)
__thaw_task(task);
} else {
freeze_task(task);
+
/* clear FROZEN and propagate upwards */
while (freezer && (freezer->state & CGROUP_FROZEN)) {
freezer->state &= ~CGROUP_FROZEN;
@@ -271,16 +273,8 @@ static void update_if_frozen(struct cgroup_subsys_state *css)
css_task_iter_start(css, 0, &it);
while ((task = css_task_iter_next(&it))) {
- if (freezing(task)) {
- /*
- * freezer_should_skip() indicates that the task
- * should be skipped when determining freezing
- * completion. Consider it frozen in addition to
- * the usual frozen condition.
- */
- if (!frozen(task) && !freezer_should_skip(task))
- goto out_iter_end;
- }
+ if (freezing(task) && !frozen(task))
+ goto out_iter_end;
}
freezer->state |= CGROUP_FROZEN;
@@ -357,7 +351,7 @@ static void freezer_apply_state(struct freezer *freezer, bool freeze,
if (freeze) {
if (!(freezer->state & CGROUP_FREEZING))
- atomic_inc(&system_freezing_cnt);
+ static_branch_inc_cpuslocked(&freezer_active);
freezer->state |= state;
freeze_cgroup(freezer);
} else {
@@ -366,9 +360,9 @@ static void freezer_apply_state(struct freezer *freezer, bool freeze,
freezer->state &= ~state;
if (!(freezer->state & CGROUP_FREEZING)) {
- if (was_freezing)
- atomic_dec(&system_freezing_cnt);
freezer->state &= ~CGROUP_FROZEN;
+ if (was_freezing)
+ static_branch_dec_cpuslocked(&freezer_active);
unfreeze_cgroup(freezer);
}
}
@@ -386,6 +380,7 @@ static void freezer_change_state(struct freezer *freezer, bool freeze)
{
struct cgroup_subsys_state *pos;
+ cpus_read_lock();
/*
* Update all its descendants in pre-order traversal. Each
* descendant will try to inherit its parent's FREEZING state as
@@ -414,6 +409,7 @@ static void freezer_change_state(struct freezer *freezer, bool freeze)
}
rcu_read_unlock();
mutex_unlock(&freezer_mutex);
+ cpus_read_unlock();
}
static ssize_t freezer_write(struct kernfs_open_file *of,
diff --git a/kernel/cgroup/misc.c b/kernel/cgroup/misc.c
index ec02d963cad1..fe3e8a0eb7ed 100644
--- a/kernel/cgroup/misc.c
+++ b/kernel/cgroup/misc.c
@@ -157,13 +157,6 @@ int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg,
new_usage = atomic_long_add_return(amount, &res->usage);
if (new_usage > READ_ONCE(res->max) ||
new_usage > READ_ONCE(misc_res_capacity[type])) {
- if (!res->failed) {
- pr_info("cgroup: charge rejected by the misc controller for %s resource in ",
- misc_res_name[type]);
- pr_cont_cgroup_path(i->css.cgroup);
- pr_cont("\n");
- res->failed = true;
- }
ret = -EBUSY;
goto err_charge;
}
@@ -171,6 +164,11 @@ int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg,
return 0;
err_charge:
+ for (j = i; j; j = parent_misc(j)) {
+ atomic_long_inc(&j->res[type].events);
+ cgroup_file_notify(&j->events_file);
+ }
+
for (j = cg; j != i; j = parent_misc(j))
misc_cg_cancel_charge(type, j, amount);
misc_cg_cancel_charge(type, i, amount);
@@ -335,6 +333,19 @@ static int misc_cg_capacity_show(struct seq_file *sf, void *v)
return 0;
}
+static int misc_events_show(struct seq_file *sf, void *v)
+{
+ struct misc_cg *cg = css_misc(seq_css(sf));
+ unsigned long events, i;
+
+ for (i = 0; i < MISC_CG_RES_TYPES; i++) {
+ events = atomic_long_read(&cg->res[i].events);
+ if (READ_ONCE(misc_res_capacity[i]) || events)
+ seq_printf(sf, "%s.max %lu\n", misc_res_name[i], events);
+ }
+ return 0;
+}
+
/* Misc cgroup interface files */
static struct cftype misc_cg_files[] = {
{
@@ -353,6 +364,12 @@ static struct cftype misc_cg_files[] = {
.seq_show = misc_cg_capacity_show,
.flags = CFTYPE_ONLY_ON_ROOT,
},
+ {
+ .name = "events",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .file_offset = offsetof(struct misc_cg, events_file),
+ .seq_show = misc_events_show,
+ },
{}
};
diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c
index f5e8828c109c..0d5c29879a50 100644
--- a/kernel/cgroup/namespace.c
+++ b/kernel/cgroup/namespace.c
@@ -24,7 +24,7 @@ static struct cgroup_namespace *alloc_cgroup_ns(void)
struct cgroup_namespace *new_ns;
int ret;
- new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL);
+ new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL_ACCOUNT);
if (!new_ns)
return ERR_PTR(-ENOMEM);
ret = ns_alloc_inum(&new_ns->ns);
diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c
index 511af87f685e..7695e60bcb40 100644
--- a/kernel/cgroup/pids.c
+++ b/kernel/cgroup/pids.c
@@ -47,6 +47,7 @@ struct pids_cgroup {
*/
atomic64_t counter;
atomic64_t limit;
+ int64_t watermark;
/* Handle for "pids.events" */
struct cgroup_file events_file;
@@ -85,6 +86,16 @@ static void pids_css_free(struct cgroup_subsys_state *css)
kfree(css_pids(css));
}
+static void pids_update_watermark(struct pids_cgroup *p, int64_t nr_pids)
+{
+ /*
+ * This is racy, but we don't need perfectly accurate tallying of
+ * the watermark, and this lets us avoid extra atomic overhead.
+ */
+ if (nr_pids > READ_ONCE(p->watermark))
+ WRITE_ONCE(p->watermark, nr_pids);
+}
+
/**
* pids_cancel - uncharge the local pid count
* @pids: the pid cgroup state
@@ -128,8 +139,11 @@ static void pids_charge(struct pids_cgroup *pids, int num)
{
struct pids_cgroup *p;
- for (p = pids; parent_pids(p); p = parent_pids(p))
- atomic64_add(num, &p->counter);
+ for (p = pids; parent_pids(p); p = parent_pids(p)) {
+ int64_t new = atomic64_add_return(num, &p->counter);
+
+ pids_update_watermark(p, new);
+ }
}
/**
@@ -156,6 +170,12 @@ static int pids_try_charge(struct pids_cgroup *pids, int num)
*/
if (new > limit)
goto revert;
+
+ /*
+ * Not technically accurate if we go over limit somewhere up
+ * the hierarchy, but that's tolerable for the watermark.
+ */
+ pids_update_watermark(p, new);
}
return 0;
@@ -311,6 +331,14 @@ static s64 pids_current_read(struct cgroup_subsys_state *css,
return atomic64_read(&pids->counter);
}
+static s64 pids_peak_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ struct pids_cgroup *pids = css_pids(css);
+
+ return READ_ONCE(pids->watermark);
+}
+
static int pids_events_show(struct seq_file *sf, void *v)
{
struct pids_cgroup *pids = css_pids(seq_css(sf));
@@ -332,6 +360,11 @@ static struct cftype pids_files[] = {
.flags = CFTYPE_NOT_ON_ROOT,
},
{
+ .name = "peak",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .read_s64 = pids_peak_read,
+ },
+ {
.name = "events",
.seq_show = pids_events_show,
.file_offset = offsetof(struct pids_cgroup, events_file),
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index 7f0e58917432..9c4c55228567 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -3,6 +3,10 @@
#include <linux/sched/cputime.h>
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/btf_ids.h>
+
static DEFINE_SPINLOCK(cgroup_rstat_lock);
static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock);
@@ -22,7 +26,7 @@ static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu)
* rstat_cpu->updated_children list. See the comment on top of
* cgroup_rstat_cpu definition for details.
*/
-void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
+__bpf_kfunc void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
{
raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu);
unsigned long flags;
@@ -35,7 +39,7 @@ void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
* instead of NULL, we can tell whether @cgrp is on the list by
* testing the next pointer for NULL.
*/
- if (cgroup_rstat_cpu(cgrp, cpu)->updated_next)
+ if (data_race(cgroup_rstat_cpu(cgrp, cpu)->updated_next))
return;
raw_spin_lock_irqsave(cpu_lock, flags);
@@ -88,6 +92,7 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
struct cgroup *root, int cpu)
{
struct cgroup_rstat_cpu *rstatc;
+ struct cgroup *parent;
if (pos == root)
return NULL;
@@ -96,10 +101,14 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
* We're gonna walk down to the first leaf and visit/remove it. We
* can pick whatever unvisited node as the starting point.
*/
- if (!pos)
+ if (!pos) {
pos = root;
- else
+ /* return NULL if this subtree is not on-list */
+ if (!cgroup_rstat_cpu(pos, cpu)->updated_next)
+ return NULL;
+ } else {
pos = cgroup_parent(pos);
+ }
/* walk down to the first leaf */
while (true) {
@@ -115,35 +124,52 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
* However, due to the way we traverse, @pos will be the first
* child in most cases. The only exception is @root.
*/
- if (rstatc->updated_next) {
- struct cgroup *parent = cgroup_parent(pos);
-
- if (parent) {
- struct cgroup_rstat_cpu *prstatc;
- struct cgroup **nextp;
-
- prstatc = cgroup_rstat_cpu(parent, cpu);
- nextp = &prstatc->updated_children;
- while (true) {
- struct cgroup_rstat_cpu *nrstatc;
-
- nrstatc = cgroup_rstat_cpu(*nextp, cpu);
- if (*nextp == pos)
- break;
- WARN_ON_ONCE(*nextp == parent);
- nextp = &nrstatc->updated_next;
- }
- *nextp = rstatc->updated_next;
- }
+ parent = cgroup_parent(pos);
+ if (parent) {
+ struct cgroup_rstat_cpu *prstatc;
+ struct cgroup **nextp;
+
+ prstatc = cgroup_rstat_cpu(parent, cpu);
+ nextp = &prstatc->updated_children;
+ while (*nextp != pos) {
+ struct cgroup_rstat_cpu *nrstatc;
- rstatc->updated_next = NULL;
- return pos;
+ nrstatc = cgroup_rstat_cpu(*nextp, cpu);
+ WARN_ON_ONCE(*nextp == parent);
+ nextp = &nrstatc->updated_next;
+ }
+ *nextp = rstatc->updated_next;
}
- /* only happens for @root */
- return NULL;
+ rstatc->updated_next = NULL;
+ return pos;
}
+/*
+ * A hook for bpf stat collectors to attach to and flush their stats.
+ * Together with providing bpf kfuncs for cgroup_rstat_updated() and
+ * cgroup_rstat_flush(), this enables a complete workflow where bpf progs that
+ * collect cgroup stats can integrate with rstat for efficient flushing.
+ *
+ * A static noinline declaration here could cause the compiler to optimize away
+ * the function. A global noinline declaration will keep the definition, but may
+ * optimize away the callsite. Therefore, __weak is needed to ensure that the
+ * call is still emitted, by telling the compiler that we don't know what the
+ * function might eventually be.
+ *
+ * __diag_* below are needed to dismiss the missing prototype warning.
+ */
+__diag_push();
+__diag_ignore_all("-Wmissing-prototypes",
+ "kfuncs which will be used in BPF programs");
+
+__weak noinline void bpf_rstat_flush(struct cgroup *cgrp,
+ struct cgroup *parent, int cpu)
+{
+}
+
+__diag_pop();
+
/* see cgroup_rstat_flush() */
static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep)
__releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock)
@@ -156,12 +182,22 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep)
raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock,
cpu);
struct cgroup *pos = NULL;
+ unsigned long flags;
- raw_spin_lock(cpu_lock);
+ /*
+ * The _irqsave() is needed because cgroup_rstat_lock is
+ * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
+ * this lock with the _irq() suffix only disables interrupts on
+ * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
+ * interrupts on both configurations. The _irqsave() ensures
+ * that interrupts are always disabled and later restored.
+ */
+ raw_spin_lock_irqsave(cpu_lock, flags);
while ((pos = cgroup_rstat_cpu_pop_updated(pos, cgrp, cpu))) {
struct cgroup_subsys_state *css;
cgroup_base_stat_flush(pos, cpu);
+ bpf_rstat_flush(pos, cgroup_parent(pos), cpu);
rcu_read_lock();
list_for_each_entry_rcu(css, &pos->rstat_css_list,
@@ -169,7 +205,7 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep)
css->ss->css_rstat_flush(css, cpu);
rcu_read_unlock();
}
- raw_spin_unlock(cpu_lock);
+ raw_spin_unlock_irqrestore(cpu_lock, flags);
/* if @may_sleep, play nice and yield if necessary */
if (may_sleep && (need_resched() ||
@@ -195,7 +231,7 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep)
*
* This function may block.
*/
-void cgroup_rstat_flush(struct cgroup *cgrp)
+__bpf_kfunc void cgroup_rstat_flush(struct cgroup *cgrp)
{
might_sleep();
@@ -205,12 +241,12 @@ void cgroup_rstat_flush(struct cgroup *cgrp)
}
/**
- * cgroup_rstat_flush_irqsafe - irqsafe version of cgroup_rstat_flush()
+ * cgroup_rstat_flush_atomic- atomic version of cgroup_rstat_flush()
* @cgrp: target cgroup
*
* This function can be called from any context.
*/
-void cgroup_rstat_flush_irqsafe(struct cgroup *cgrp)
+void cgroup_rstat_flush_atomic(struct cgroup *cgrp)
{
unsigned long flags;
@@ -304,6 +340,9 @@ static void cgroup_base_stat_add(struct cgroup_base_stat *dst_bstat,
dst_bstat->cputime.utime += src_bstat->cputime.utime;
dst_bstat->cputime.stime += src_bstat->cputime.stime;
dst_bstat->cputime.sum_exec_runtime += src_bstat->cputime.sum_exec_runtime;
+#ifdef CONFIG_SCHED_CORE
+ dst_bstat->forceidle_sum += src_bstat->forceidle_sum;
+#endif
}
static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat,
@@ -312,13 +351,16 @@ static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat,
dst_bstat->cputime.utime -= src_bstat->cputime.utime;
dst_bstat->cputime.stime -= src_bstat->cputime.stime;
dst_bstat->cputime.sum_exec_runtime -= src_bstat->cputime.sum_exec_runtime;
+#ifdef CONFIG_SCHED_CORE
+ dst_bstat->forceidle_sum -= src_bstat->forceidle_sum;
+#endif
}
static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
{
struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
struct cgroup *parent = cgroup_parent(cgrp);
- struct cgroup_base_stat cur, delta;
+ struct cgroup_base_stat delta;
unsigned seq;
/* Root-level stats are sourced from system-wide CPU stats */
@@ -328,11 +370,10 @@ static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
/* fetch the current per-cpu values */
do {
seq = __u64_stats_fetch_begin(&rstatc->bsync);
- cur.cputime = rstatc->bstat.cputime;
+ delta = rstatc->bstat;
} while (__u64_stats_fetch_retry(&rstatc->bsync, seq));
/* propagate percpu delta to global */
- delta = cur;
cgroup_base_stat_sub(&delta, &rstatc->last_bstat);
cgroup_base_stat_add(&cgrp->bstat, &delta);
cgroup_base_stat_add(&rstatc->last_bstat, &delta);
@@ -347,19 +388,20 @@ static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
}
static struct cgroup_rstat_cpu *
-cgroup_base_stat_cputime_account_begin(struct cgroup *cgrp)
+cgroup_base_stat_cputime_account_begin(struct cgroup *cgrp, unsigned long *flags)
{
struct cgroup_rstat_cpu *rstatc;
rstatc = get_cpu_ptr(cgrp->rstat_cpu);
- u64_stats_update_begin(&rstatc->bsync);
+ *flags = u64_stats_update_begin_irqsave(&rstatc->bsync);
return rstatc;
}
static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp,
- struct cgroup_rstat_cpu *rstatc)
+ struct cgroup_rstat_cpu *rstatc,
+ unsigned long flags)
{
- u64_stats_update_end(&rstatc->bsync);
+ u64_stats_update_end_irqrestore(&rstatc->bsync, flags);
cgroup_rstat_updated(cgrp, smp_processor_id());
put_cpu_ptr(rstatc);
}
@@ -367,18 +409,20 @@ static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp,
void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec)
{
struct cgroup_rstat_cpu *rstatc;
+ unsigned long flags;
- rstatc = cgroup_base_stat_cputime_account_begin(cgrp);
+ rstatc = cgroup_base_stat_cputime_account_begin(cgrp, &flags);
rstatc->bstat.cputime.sum_exec_runtime += delta_exec;
- cgroup_base_stat_cputime_account_end(cgrp, rstatc);
+ cgroup_base_stat_cputime_account_end(cgrp, rstatc, flags);
}
void __cgroup_account_cputime_field(struct cgroup *cgrp,
enum cpu_usage_stat index, u64 delta_exec)
{
struct cgroup_rstat_cpu *rstatc;
+ unsigned long flags;
- rstatc = cgroup_base_stat_cputime_account_begin(cgrp);
+ rstatc = cgroup_base_stat_cputime_account_begin(cgrp, &flags);
switch (index) {
case CPUTIME_USER:
@@ -390,11 +434,16 @@ void __cgroup_account_cputime_field(struct cgroup *cgrp,
case CPUTIME_SOFTIRQ:
rstatc->bstat.cputime.stime += delta_exec;
break;
+#ifdef CONFIG_SCHED_CORE
+ case CPUTIME_FORCEIDLE:
+ rstatc->bstat.forceidle_sum += delta_exec;
+ break;
+#endif
default:
break;
}
- cgroup_base_stat_cputime_account_end(cgrp, rstatc);
+ cgroup_base_stat_cputime_account_end(cgrp, rstatc, flags);
}
/*
@@ -403,13 +452,12 @@ void __cgroup_account_cputime_field(struct cgroup *cgrp,
* with how it is done by __cgroup_account_cputime_field for each bit of
* cpu time attributed to a cgroup.
*/
-static void root_cgroup_cputime(struct task_cputime *cputime)
+static void root_cgroup_cputime(struct cgroup_base_stat *bstat)
{
+ struct task_cputime *cputime = &bstat->cputime;
int i;
- cputime->stime = 0;
- cputime->utime = 0;
- cputime->sum_exec_runtime = 0;
+ memset(bstat, 0, sizeof(*bstat));
for_each_possible_cpu(i) {
struct kernel_cpustat kcpustat;
u64 *cpustat = kcpustat.cpustat;
@@ -430,8 +478,10 @@ static void root_cgroup_cputime(struct task_cputime *cputime)
cputime->sum_exec_runtime += user;
cputime->sum_exec_runtime += sys;
cputime->sum_exec_runtime += cpustat[CPUTIME_STEAL];
- cputime->sum_exec_runtime += cpustat[CPUTIME_GUEST];
- cputime->sum_exec_runtime += cpustat[CPUTIME_GUEST_NICE];
+
+#ifdef CONFIG_SCHED_CORE
+ bstat->forceidle_sum += cpustat[CPUTIME_FORCEIDLE];
+#endif
}
}
@@ -439,27 +489,61 @@ void cgroup_base_stat_cputime_show(struct seq_file *seq)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
u64 usage, utime, stime;
- struct task_cputime cputime;
+ struct cgroup_base_stat bstat;
+#ifdef CONFIG_SCHED_CORE
+ u64 forceidle_time;
+#endif
if (cgroup_parent(cgrp)) {
cgroup_rstat_flush_hold(cgrp);
usage = cgrp->bstat.cputime.sum_exec_runtime;
cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime,
&utime, &stime);
+#ifdef CONFIG_SCHED_CORE
+ forceidle_time = cgrp->bstat.forceidle_sum;
+#endif
cgroup_rstat_flush_release();
} else {
- root_cgroup_cputime(&cputime);
- usage = cputime.sum_exec_runtime;
- utime = cputime.utime;
- stime = cputime.stime;
+ root_cgroup_cputime(&bstat);
+ usage = bstat.cputime.sum_exec_runtime;
+ utime = bstat.cputime.utime;
+ stime = bstat.cputime.stime;
+#ifdef CONFIG_SCHED_CORE
+ forceidle_time = bstat.forceidle_sum;
+#endif
}
do_div(usage, NSEC_PER_USEC);
do_div(utime, NSEC_PER_USEC);
do_div(stime, NSEC_PER_USEC);
+#ifdef CONFIG_SCHED_CORE
+ do_div(forceidle_time, NSEC_PER_USEC);
+#endif
seq_printf(seq, "usage_usec %llu\n"
"user_usec %llu\n"
"system_usec %llu\n",
usage, utime, stime);
+
+#ifdef CONFIG_SCHED_CORE
+ seq_printf(seq, "core_sched.force_idle_usec %llu\n", forceidle_time);
+#endif
+}
+
+/* Add bpf kfuncs for cgroup_rstat_updated() and cgroup_rstat_flush() */
+BTF_SET8_START(bpf_rstat_kfunc_ids)
+BTF_ID_FLAGS(func, cgroup_rstat_updated)
+BTF_ID_FLAGS(func, cgroup_rstat_flush, KF_SLEEPABLE)
+BTF_SET8_END(bpf_rstat_kfunc_ids)
+
+static const struct btf_kfunc_id_set bpf_rstat_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &bpf_rstat_kfunc_ids,
+};
+
+static int __init bpf_rstat_kfunc_init(void)
+{
+ return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING,
+ &bpf_rstat_kfunc_set);
}
+late_initcall(bpf_rstat_kfunc_init);
diff --git a/kernel/compat.c b/kernel/compat.c
index 05adfd6fa8bf..fb50f29d9b36 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -152,7 +152,7 @@ COMPAT_SYSCALL_DEFINE3(sched_getaffinity, compat_pid_t, pid, unsigned int, len,
if (len & (sizeof(compat_ulong_t)-1))
return -EINVAL;
- if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+ if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
return -ENOMEM;
ret = sched_getaffinity(pid, mask);
@@ -269,24 +269,3 @@ get_compat_sigset(sigset_t *set, const compat_sigset_t __user *compat)
return 0;
}
EXPORT_SYMBOL_GPL(get_compat_sigset);
-
-/*
- * Allocate user-space memory for the duration of a single system call,
- * in order to marshall parameters inside a compat thunk.
- */
-void __user *compat_alloc_user_space(unsigned long len)
-{
- void __user *ptr;
-
- /* If len would occupy more than half of the entire compat space... */
- if (unlikely(len > (((compat_uptr_t)~0) >> 1)))
- return NULL;
-
- ptr = arch_compat_alloc_user_space(len);
-
- if (unlikely(!access_ok(ptr, len)))
- return NULL;
-
- return ptr;
-}
-EXPORT_SYMBOL_GPL(compat_alloc_user_space);
diff --git a/kernel/configs/android-base.config b/kernel/configs/android-base.config
deleted file mode 100644
index eb701b2ac72f..000000000000
--- a/kernel/configs/android-base.config
+++ /dev/null
@@ -1,160 +0,0 @@
-# KEEP ALPHABETICALLY SORTED
-# CONFIG_DEVMEM is not set
-# CONFIG_FHANDLE is not set
-# CONFIG_INET_LRO is not set
-# CONFIG_NFSD is not set
-# CONFIG_NFS_FS is not set
-# CONFIG_OABI_COMPAT is not set
-# CONFIG_SYSVIPC is not set
-# CONFIG_USELIB is not set
-CONFIG_ANDROID=y
-CONFIG_ANDROID_BINDER_IPC=y
-CONFIG_ANDROID_BINDER_DEVICES=binder,hwbinder,vndbinder
-CONFIG_ANDROID_LOW_MEMORY_KILLER=y
-CONFIG_ARMV8_DEPRECATED=y
-CONFIG_ASHMEM=y
-CONFIG_AUDIT=y
-CONFIG_BLK_DEV_INITRD=y
-CONFIG_CGROUPS=y
-CONFIG_CGROUP_BPF=y
-CONFIG_CGROUP_CPUACCT=y
-CONFIG_CGROUP_DEBUG=y
-CONFIG_CGROUP_FREEZER=y
-CONFIG_CGROUP_SCHED=y
-CONFIG_CP15_BARRIER_EMULATION=y
-CONFIG_DEFAULT_SECURITY_SELINUX=y
-CONFIG_EMBEDDED=y
-CONFIG_FB=y
-CONFIG_HARDENED_USERCOPY=y
-CONFIG_HIGH_RES_TIMERS=y
-CONFIG_IKCONFIG=y
-CONFIG_IKCONFIG_PROC=y
-CONFIG_INET6_AH=y
-CONFIG_INET6_ESP=y
-CONFIG_INET6_IPCOMP=y
-CONFIG_INET=y
-CONFIG_INET_DIAG_DESTROY=y
-CONFIG_INET_ESP=y
-CONFIG_INET_XFRM_MODE_TUNNEL=y
-CONFIG_IP6_NF_FILTER=y
-CONFIG_IP6_NF_IPTABLES=y
-CONFIG_IP6_NF_MANGLE=y
-CONFIG_IP6_NF_RAW=y
-CONFIG_IP6_NF_TARGET_REJECT=y
-CONFIG_IPV6=y
-CONFIG_IPV6_MIP6=y
-CONFIG_IPV6_MULTIPLE_TABLES=y
-CONFIG_IPV6_OPTIMISTIC_DAD=y
-CONFIG_IPV6_ROUTER_PREF=y
-CONFIG_IPV6_ROUTE_INFO=y
-CONFIG_IP_ADVANCED_ROUTER=y
-CONFIG_IP_MULTICAST=y
-CONFIG_IP_MULTIPLE_TABLES=y
-CONFIG_IP_NF_ARPFILTER=y
-CONFIG_IP_NF_ARPTABLES=y
-CONFIG_IP_NF_ARP_MANGLE=y
-CONFIG_IP_NF_FILTER=y
-CONFIG_IP_NF_IPTABLES=y
-CONFIG_IP_NF_MANGLE=y
-CONFIG_IP_NF_MATCH_AH=y
-CONFIG_IP_NF_MATCH_ECN=y
-CONFIG_IP_NF_MATCH_TTL=y
-CONFIG_IP_NF_NAT=y
-CONFIG_IP_NF_RAW=y
-CONFIG_IP_NF_SECURITY=y
-CONFIG_IP_NF_TARGET_MASQUERADE=y
-CONFIG_IP_NF_TARGET_NETMAP=y
-CONFIG_IP_NF_TARGET_REDIRECT=y
-CONFIG_IP_NF_TARGET_REJECT=y
-CONFIG_MODULES=y
-CONFIG_MODULE_UNLOAD=y
-CONFIG_MODVERSIONS=y
-CONFIG_NET=y
-CONFIG_NETDEVICES=y
-CONFIG_NETFILTER=y
-CONFIG_NETFILTER_TPROXY=y
-CONFIG_NETFILTER_XT_MATCH_COMMENT=y
-CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=y
-CONFIG_NETFILTER_XT_MATCH_CONNMARK=y
-CONFIG_NETFILTER_XT_MATCH_CONNTRACK=y
-CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=y
-CONFIG_NETFILTER_XT_MATCH_HELPER=y
-CONFIG_NETFILTER_XT_MATCH_IPRANGE=y
-CONFIG_NETFILTER_XT_MATCH_LENGTH=y
-CONFIG_NETFILTER_XT_MATCH_LIMIT=y
-CONFIG_NETFILTER_XT_MATCH_MAC=y
-CONFIG_NETFILTER_XT_MATCH_MARK=y
-CONFIG_NETFILTER_XT_MATCH_PKTTYPE=y
-CONFIG_NETFILTER_XT_MATCH_POLICY=y
-CONFIG_NETFILTER_XT_MATCH_QUOTA=y
-CONFIG_NETFILTER_XT_MATCH_SOCKET=y
-CONFIG_NETFILTER_XT_MATCH_STATE=y
-CONFIG_NETFILTER_XT_MATCH_STATISTIC=y
-CONFIG_NETFILTER_XT_MATCH_STRING=y
-CONFIG_NETFILTER_XT_MATCH_TIME=y
-CONFIG_NETFILTER_XT_MATCH_U32=y
-CONFIG_NETFILTER_XT_TARGET_CLASSIFY=y
-CONFIG_NETFILTER_XT_TARGET_CONNMARK=y
-CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=y
-CONFIG_NETFILTER_XT_TARGET_IDLETIMER=y
-CONFIG_NETFILTER_XT_TARGET_MARK=y
-CONFIG_NETFILTER_XT_TARGET_NFLOG=y
-CONFIG_NETFILTER_XT_TARGET_NFQUEUE=y
-CONFIG_NETFILTER_XT_TARGET_SECMARK=y
-CONFIG_NETFILTER_XT_TARGET_TCPMSS=y
-CONFIG_NETFILTER_XT_TARGET_TPROXY=y
-CONFIG_NETFILTER_XT_TARGET_TRACE=y
-CONFIG_NET_CLS_ACT=y
-CONFIG_NET_CLS_U32=y
-CONFIG_NET_EMATCH=y
-CONFIG_NET_EMATCH_U32=y
-CONFIG_NET_KEY=y
-CONFIG_NET_SCHED=y
-CONFIG_NET_SCH_HTB=y
-CONFIG_NF_CONNTRACK=y
-CONFIG_NF_CONNTRACK_AMANDA=y
-CONFIG_NF_CONNTRACK_EVENTS=y
-CONFIG_NF_CONNTRACK_FTP=y
-CONFIG_NF_CONNTRACK_H323=y
-CONFIG_NF_CONNTRACK_IPV4=y
-CONFIG_NF_CONNTRACK_IPV6=y
-CONFIG_NF_CONNTRACK_IRC=y
-CONFIG_NF_CONNTRACK_NETBIOS_NS=y
-CONFIG_NF_CONNTRACK_PPTP=y
-CONFIG_NF_CONNTRACK_SANE=y
-CONFIG_NF_CONNTRACK_SECMARK=y
-CONFIG_NF_CONNTRACK_TFTP=y
-CONFIG_NF_CT_NETLINK=y
-CONFIG_NF_CT_PROTO_DCCP=y
-CONFIG_NF_CT_PROTO_SCTP=y
-CONFIG_NF_CT_PROTO_UDPLITE=y
-CONFIG_NF_NAT=y
-CONFIG_NO_HZ=y
-CONFIG_PACKET=y
-CONFIG_PM_AUTOSLEEP=y
-CONFIG_PM_WAKELOCKS=y
-CONFIG_PPP=y
-CONFIG_PPP_BSDCOMP=y
-CONFIG_PPP_DEFLATE=y
-CONFIG_PPP_MPPE=y
-CONFIG_PREEMPT=y
-CONFIG_QUOTA=y
-CONFIG_RANDOMIZE_BASE=y
-CONFIG_RTC_CLASS=y
-CONFIG_RT_GROUP_SCHED=y
-CONFIG_SECCOMP=y
-CONFIG_SECURITY=y
-CONFIG_SECURITY_NETWORK=y
-CONFIG_SECURITY_SELINUX=y
-CONFIG_SETEND_EMULATION=y
-CONFIG_STAGING=y
-CONFIG_SWP_EMULATION=y
-CONFIG_SYNC=y
-CONFIG_TUN=y
-CONFIG_UNIX=y
-CONFIG_USB_GADGET=y
-CONFIG_USB_CONFIGFS=y
-CONFIG_USB_CONFIGFS_F_FS=y
-CONFIG_USB_CONFIGFS_F_MIDI=y
-CONFIG_USB_OTG_WAKELOCK=y
-CONFIG_XFRM_USER=y
diff --git a/kernel/configs/android-recommended.config b/kernel/configs/android-recommended.config
deleted file mode 100644
index eb0029c9a6a6..000000000000
--- a/kernel/configs/android-recommended.config
+++ /dev/null
@@ -1,127 +0,0 @@
-# KEEP ALPHABETICALLY SORTED
-# CONFIG_AIO is not set
-# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
-# CONFIG_INPUT_MOUSE is not set
-# CONFIG_LEGACY_PTYS is not set
-# CONFIG_NF_CONNTRACK_SIP is not set
-# CONFIG_PM_WAKELOCKS_GC is not set
-# CONFIG_VT is not set
-CONFIG_ARM64_SW_TTBR0_PAN=y
-CONFIG_BACKLIGHT_LCD_SUPPORT=y
-CONFIG_BLK_DEV_DM=y
-CONFIG_BLK_DEV_LOOP=y
-CONFIG_BLK_DEV_RAM=y
-CONFIG_BLK_DEV_RAM_SIZE=8192
-CONFIG_STACKPROTECTOR_STRONG=y
-CONFIG_COMPACTION=y
-CONFIG_CPU_SW_DOMAIN_PAN=y
-CONFIG_DM_CRYPT=y
-CONFIG_DM_UEVENT=y
-CONFIG_DM_VERITY=y
-CONFIG_DM_VERITY_FEC=y
-CONFIG_DRAGONRISE_FF=y
-CONFIG_ENABLE_DEFAULT_TRACERS=y
-CONFIG_EXT4_FS=y
-CONFIG_EXT4_FS_SECURITY=y
-CONFIG_FUSE_FS=y
-CONFIG_GREENASIA_FF=y
-CONFIG_HIDRAW=y
-CONFIG_HID_A4TECH=y
-CONFIG_HID_ACRUX=y
-CONFIG_HID_ACRUX_FF=y
-CONFIG_HID_APPLE=y
-CONFIG_HID_BELKIN=y
-CONFIG_HID_CHERRY=y
-CONFIG_HID_CHICONY=y
-CONFIG_HID_CYPRESS=y
-CONFIG_HID_DRAGONRISE=y
-CONFIG_HID_ELECOM=y
-CONFIG_HID_EMS_FF=y
-CONFIG_HID_EZKEY=y
-CONFIG_HID_GREENASIA=y
-CONFIG_HID_GYRATION=y
-CONFIG_HID_HOLTEK=y
-CONFIG_HID_KENSINGTON=y
-CONFIG_HID_KEYTOUCH=y
-CONFIG_HID_KYE=y
-CONFIG_HID_LCPOWER=y
-CONFIG_HID_LOGITECH=y
-CONFIG_HID_LOGITECH_DJ=y
-CONFIG_HID_MAGICMOUSE=y
-CONFIG_HID_MICROSOFT=y
-CONFIG_HID_MONTEREY=y
-CONFIG_HID_MULTITOUCH=y
-CONFIG_HID_NTRIG=y
-CONFIG_HID_ORTEK=y
-CONFIG_HID_PANTHERLORD=y
-CONFIG_HID_PETALYNX=y
-CONFIG_HID_PICOLCD=y
-CONFIG_HID_PRIMAX=y
-CONFIG_HID_PRODIKEYS=y
-CONFIG_HID_ROCCAT=y
-CONFIG_HID_SAITEK=y
-CONFIG_HID_SAMSUNG=y
-CONFIG_HID_SMARTJOYPLUS=y
-CONFIG_HID_SONY=y
-CONFIG_HID_SPEEDLINK=y
-CONFIG_HID_SUNPLUS=y
-CONFIG_HID_THRUSTMASTER=y
-CONFIG_HID_TIVO=y
-CONFIG_HID_TOPSEED=y
-CONFIG_HID_TWINHAN=y
-CONFIG_HID_UCLOGIC=y
-CONFIG_HID_WACOM=y
-CONFIG_HID_WALTOP=y
-CONFIG_HID_WIIMOTE=y
-CONFIG_HID_ZEROPLUS=y
-CONFIG_HID_ZYDACRON=y
-CONFIG_INPUT_EVDEV=y
-CONFIG_INPUT_GPIO=y
-CONFIG_INPUT_JOYSTICK=y
-CONFIG_INPUT_MISC=y
-CONFIG_INPUT_TABLET=y
-CONFIG_INPUT_UINPUT=y
-CONFIG_JOYSTICK_XPAD=y
-CONFIG_JOYSTICK_XPAD_FF=y
-CONFIG_JOYSTICK_XPAD_LEDS=y
-CONFIG_KALLSYMS_ALL=y
-CONFIG_KSM=y
-CONFIG_LOGIG940_FF=y
-CONFIG_LOGIRUMBLEPAD2_FF=y
-CONFIG_LOGITECH_FF=y
-CONFIG_MD=y
-CONFIG_MEDIA_SUPPORT=y
-CONFIG_MSDOS_FS=y
-CONFIG_PANIC_TIMEOUT=5
-CONFIG_PANTHERLORD_FF=y
-CONFIG_PERF_EVENTS=y
-CONFIG_PM_DEBUG=y
-CONFIG_PM_RUNTIME=y
-CONFIG_PM_WAKELOCKS_LIMIT=0
-CONFIG_POWER_SUPPLY=y
-CONFIG_PSTORE=y
-CONFIG_PSTORE_CONSOLE=y
-CONFIG_PSTORE_RAM=y
-CONFIG_SCHEDSTATS=y
-CONFIG_SMARTJOYPLUS_FF=y
-CONFIG_SND=y
-CONFIG_SOUND=y
-CONFIG_STRICT_KERNEL_RWX=y
-CONFIG_SUSPEND_TIME=y
-CONFIG_TABLET_USB_ACECAD=y
-CONFIG_TABLET_USB_AIPTEK=y
-CONFIG_TABLET_USB_HANWANG=y
-CONFIG_TABLET_USB_KBTAB=y
-CONFIG_TASKSTATS=y
-CONFIG_TASK_DELAY_ACCT=y
-CONFIG_TASK_IO_ACCOUNTING=y
-CONFIG_TASK_XACCT=y
-CONFIG_TIMER_STATS=y
-CONFIG_TMPFS=y
-CONFIG_TMPFS_POSIX_ACL=y
-CONFIG_UHID=y
-CONFIG_USB_ANNOUNCE_NEW_DEVICES=y
-CONFIG_USB_EHCI_HCD=y
-CONFIG_USB_HIDDEV=y
-CONFIG_USB_USBNET=y
-CONFIG_VFAT_FS=y
diff --git a/kernel/configs/debug.config b/kernel/configs/debug.config
new file mode 100644
index 000000000000..e8db8d938661
--- /dev/null
+++ b/kernel/configs/debug.config
@@ -0,0 +1,106 @@
+# The config is based on running daily CI for enterprise Linux distros to
+# seek regressions on linux-next builds on different bare-metal and virtual
+# platforms. It can be used for example,
+#
+# $ make ARCH=arm64 defconfig debug.config
+#
+# Keep alphabetically sorted inside each section.
+#
+# printk and dmesg options
+#
+CONFIG_DEBUG_BUGVERBOSE=y
+CONFIG_DYNAMIC_DEBUG=y
+CONFIG_PRINTK_CALLER=y
+CONFIG_PRINTK_TIME=y
+CONFIG_SYMBOLIC_ERRNAME=y
+#
+# Compile-time checks and compiler options
+#
+CONFIG_DEBUG_INFO=y
+CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y
+CONFIG_DEBUG_SECTION_MISMATCH=y
+CONFIG_FRAME_WARN=2048
+CONFIG_SECTION_MISMATCH_WARN_ONLY=y
+#
+# Generic Kernel Debugging Instruments
+#
+# CONFIG_UBSAN_ALIGNMENT is not set
+# CONFIG_UBSAN_DIV_ZERO is not set
+# CONFIG_UBSAN_TRAP is not set
+# CONFIG_WARN_ALL_UNSEEDED_RANDOM is not set
+CONFIG_DEBUG_FS=y
+CONFIG_DEBUG_FS_ALLOW_ALL=y
+CONFIG_DEBUG_IRQFLAGS=y
+CONFIG_UBSAN=y
+CONFIG_UBSAN_BOOL=y
+CONFIG_UBSAN_BOUNDS=y
+CONFIG_UBSAN_ENUM=y
+CONFIG_UBSAN_SHIFT=y
+CONFIG_UBSAN_UNREACHABLE=y
+#
+# Memory Debugging
+#
+# CONFIG_DEBUG_PAGEALLOC is not set
+# CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF is not set
+# CONFIG_DEBUG_RODATA_TEST is not set
+# CONFIG_DEBUG_WX is not set
+# CONFIG_KFENCE is not set
+# CONFIG_PAGE_POISONING is not set
+# CONFIG_SLUB_STATS is not set
+CONFIG_PAGE_EXTENSION=y
+CONFIG_PAGE_OWNER=y
+CONFIG_DEBUG_KMEMLEAK=y
+CONFIG_DEBUG_KMEMLEAK_AUTO_SCAN=y
+CONFIG_DEBUG_OBJECTS=y
+CONFIG_DEBUG_OBJECTS_ENABLE_DEFAULT=1
+CONFIG_DEBUG_OBJECTS_FREE=y
+CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER=y
+CONFIG_DEBUG_OBJECTS_RCU_HEAD=y
+CONFIG_DEBUG_OBJECTS_TIMERS=y
+CONFIG_DEBUG_OBJECTS_WORK=y
+CONFIG_DEBUG_PER_CPU_MAPS=y
+CONFIG_DEBUG_STACK_USAGE=y
+CONFIG_DEBUG_VIRTUAL=y
+CONFIG_DEBUG_VM=y
+CONFIG_DEBUG_VM_PGFLAGS=y
+CONFIG_DEBUG_VM_RB=y
+CONFIG_DEBUG_VM_VMACACHE=y
+CONFIG_GENERIC_PTDUMP=y
+CONFIG_KASAN=y
+CONFIG_KASAN_GENERIC=y
+CONFIG_KASAN_INLINE=y
+CONFIG_KASAN_VMALLOC=y
+CONFIG_PTDUMP_DEBUGFS=y
+CONFIG_SCHED_STACK_END_CHECK=y
+CONFIG_SLUB_DEBUG_ON=y
+#
+# Debug Oops, Lockups and Hangs
+#
+# CONFIG_BOOTPARAM_HUNG_TASK_PANIC is not set
+# CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC is not set
+CONFIG_DEBUG_ATOMIC_SLEEP=y
+CONFIG_DETECT_HUNG_TASK=y
+CONFIG_PANIC_ON_OOPS=y
+CONFIG_PANIC_TIMEOUT=0
+CONFIG_SOFTLOCKUP_DETECTOR=y
+#
+# Lock Debugging (spinlocks, mutexes, etc...)
+#
+# CONFIG_PROVE_RAW_LOCK_NESTING is not set
+CONFIG_PROVE_LOCKING=y
+#
+# Debug kernel data structures
+#
+CONFIG_BUG_ON_DATA_CORRUPTION=y
+#
+# RCU Debugging
+#
+CONFIG_PROVE_RCU=y
+CONFIG_PROVE_RCU_LIST=y
+#
+# Tracers
+#
+CONFIG_BRANCH_PROFILE_NONE=y
+CONFIG_DYNAMIC_FTRACE=y
+CONFIG_FTRACE=y
+CONFIG_FUNCTION_TRACER=y
diff --git a/kernel/configs/rust.config b/kernel/configs/rust.config
new file mode 100644
index 000000000000..38a7c5362c9c
--- /dev/null
+++ b/kernel/configs/rust.config
@@ -0,0 +1 @@
+CONFIG_RUST=y
diff --git a/kernel/configs/tiny.config b/kernel/configs/tiny.config
index 8a44b93da0f3..144b2bd86b14 100644
--- a/kernel/configs/tiny.config
+++ b/kernel/configs/tiny.config
@@ -7,5 +7,5 @@ CONFIG_KERNEL_XZ=y
# CONFIG_KERNEL_LZO is not set
# CONFIG_KERNEL_LZ4 is not set
# CONFIG_SLAB is not set
-# CONFIG_SLUB is not set
-CONFIG_SLOB=y
+CONFIG_SLUB=y
+CONFIG_SLUB_TINY=y
diff --git a/kernel/configs/x86_debug.config b/kernel/configs/x86_debug.config
new file mode 100644
index 000000000000..6fac5b405334
--- /dev/null
+++ b/kernel/configs/x86_debug.config
@@ -0,0 +1,17 @@
+CONFIG_X86_DEBUG_FPU=y
+CONFIG_LOCK_STAT=y
+CONFIG_DEBUG_VM=y
+CONFIG_DEBUG_VM_VMACACHE=y
+CONFIG_DEBUG_VM_RB=y
+CONFIG_DEBUG_SLAB=y
+CONFIG_DEBUG_KMEMLEAK=y
+CONFIG_DEBUG_PAGEALLOC=y
+CONFIG_SLUB_DEBUG_ON=y
+CONFIG_DEBUG_OBJECTS=y
+CONFIG_DEBUG_OBJECTS_ENABLE_DEFAULT=1
+CONFIG_GCOV_KERNEL=y
+CONFIG_LOCKDEP=y
+CONFIG_PROVE_LOCKING=y
+CONFIG_SCHEDSTATS=y
+CONFIG_NOINSTR_VALIDATION=y
+CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y
diff --git a/kernel/configs/xen.config b/kernel/configs/xen.config
index ff756221f112..436f806aa1ed 100644
--- a/kernel/configs/xen.config
+++ b/kernel/configs/xen.config
@@ -34,7 +34,6 @@ CONFIG_INPUT_XEN_KBDDEV_FRONTEND=m
CONFIG_XEN_SCSI_FRONTEND=m
# others
CONFIG_XEN_BALLOON=y
-CONFIG_XEN_SCRUB_PAGES=y
CONFIG_XEN_DEV_EVTCHN=m
CONFIG_XEN_BLKDEV_FRONTEND=m
CONFIG_XEN_NETDEV_FRONTEND=m
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 36a98c48aedc..a09f1c19336a 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -1,18 +1,20 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
- * Context tracking: Probe on high level context boundaries such as kernel
- * and userspace. This includes syscalls and exceptions entry/exit.
+ * Context tracking: Probe on high level context boundaries such as kernel,
+ * userspace, guest or idle.
*
* This is used by RCU to remove its dependency on the timer tick while a CPU
- * runs in userspace.
+ * runs in idle, userspace or guest mode.
*
- * Started by Frederic Weisbecker:
+ * User/guest tracking started by Frederic Weisbecker:
*
- * Copyright (C) 2012 Red Hat, Inc., Frederic Weisbecker <fweisbec@redhat.com>
+ * Copyright (C) 2012 Red Hat, Inc., Frederic Weisbecker
*
* Many thanks to Gilad Ben-Yossef, Paul McKenney, Ingo Molnar, Andrew Morton,
* Steven Rostedt, Peter Zijlstra for suggestions and improvements.
*
+ * RCU extended quiescent state bits imported from kernel/rcu/tree.c
+ * where the relevant authorship may be found.
*/
#include <linux/context_tracking.h>
@@ -21,6 +23,411 @@
#include <linux/hardirq.h>
#include <linux/export.h>
#include <linux/kprobes.h>
+#include <trace/events/rcu.h>
+
+
+DEFINE_PER_CPU(struct context_tracking, context_tracking) = {
+#ifdef CONFIG_CONTEXT_TRACKING_IDLE
+ .dynticks_nesting = 1,
+ .dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE,
+#endif
+ .state = ATOMIC_INIT(RCU_DYNTICKS_IDX),
+};
+EXPORT_SYMBOL_GPL(context_tracking);
+
+#ifdef CONFIG_CONTEXT_TRACKING_IDLE
+#define TPS(x) tracepoint_string(x)
+
+/* Record the current task on dyntick-idle entry. */
+static __always_inline void rcu_dynticks_task_enter(void)
+{
+#if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL)
+ WRITE_ONCE(current->rcu_tasks_idle_cpu, smp_processor_id());
+#endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */
+}
+
+/* Record no current task on dyntick-idle exit. */
+static __always_inline void rcu_dynticks_task_exit(void)
+{
+#if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL)
+ WRITE_ONCE(current->rcu_tasks_idle_cpu, -1);
+#endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */
+}
+
+/* Turn on heavyweight RCU tasks trace readers on idle/user entry. */
+static __always_inline void rcu_dynticks_task_trace_enter(void)
+{
+#ifdef CONFIG_TASKS_TRACE_RCU
+ if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB))
+ current->trc_reader_special.b.need_mb = true;
+#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
+}
+
+/* Turn off heavyweight RCU tasks trace readers on idle/user exit. */
+static __always_inline void rcu_dynticks_task_trace_exit(void)
+{
+#ifdef CONFIG_TASKS_TRACE_RCU
+ if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB))
+ current->trc_reader_special.b.need_mb = false;
+#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
+}
+
+/*
+ * Record entry into an extended quiescent state. This is only to be
+ * called when not already in an extended quiescent state, that is,
+ * RCU is watching prior to the call to this function and is no longer
+ * watching upon return.
+ */
+static noinstr void ct_kernel_exit_state(int offset)
+{
+ int seq;
+
+ /*
+ * CPUs seeing atomic_add_return() must see prior RCU read-side
+ * critical sections, and we also must force ordering with the
+ * next idle sojourn.
+ */
+ rcu_dynticks_task_trace_enter(); // Before ->dynticks update!
+ seq = ct_state_inc(offset);
+ // RCU is no longer watching. Better be in extended quiescent state!
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && (seq & RCU_DYNTICKS_IDX));
+}
+
+/*
+ * Record exit from an extended quiescent state. This is only to be
+ * called from an extended quiescent state, that is, RCU is not watching
+ * prior to the call to this function and is watching upon return.
+ */
+static noinstr void ct_kernel_enter_state(int offset)
+{
+ int seq;
+
+ /*
+ * CPUs seeing atomic_add_return() must see prior idle sojourns,
+ * and we also must force ordering with the next RCU read-side
+ * critical section.
+ */
+ seq = ct_state_inc(offset);
+ // RCU is now watching. Better not be in an extended quiescent state!
+ rcu_dynticks_task_trace_exit(); // After ->dynticks update!
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !(seq & RCU_DYNTICKS_IDX));
+}
+
+/*
+ * Enter an RCU extended quiescent state, which can be either the
+ * idle loop or adaptive-tickless usermode execution.
+ *
+ * We crowbar the ->dynticks_nmi_nesting field to zero to allow for
+ * the possibility of usermode upcalls having messed up our count
+ * of interrupt nesting level during the prior busy period.
+ */
+static void noinstr ct_kernel_exit(bool user, int offset)
+{
+ struct context_tracking *ct = this_cpu_ptr(&context_tracking);
+
+ WARN_ON_ONCE(ct_dynticks_nmi_nesting() != DYNTICK_IRQ_NONIDLE);
+ WRITE_ONCE(ct->dynticks_nmi_nesting, 0);
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
+ ct_dynticks_nesting() == 0);
+ if (ct_dynticks_nesting() != 1) {
+ // RCU will still be watching, so just do accounting and leave.
+ ct->dynticks_nesting--;
+ return;
+ }
+
+ instrumentation_begin();
+ lockdep_assert_irqs_disabled();
+ trace_rcu_dyntick(TPS("Start"), ct_dynticks_nesting(), 0, ct_dynticks());
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
+ rcu_preempt_deferred_qs(current);
+
+ // instrumentation for the noinstr ct_kernel_exit_state()
+ instrument_atomic_write(&ct->state, sizeof(ct->state));
+
+ instrumentation_end();
+ WRITE_ONCE(ct->dynticks_nesting, 0); /* Avoid irq-access tearing. */
+ // RCU is watching here ...
+ ct_kernel_exit_state(offset);
+ // ... but is no longer watching here.
+ rcu_dynticks_task_enter();
+}
+
+/*
+ * Exit an RCU extended quiescent state, which can be either the
+ * idle loop or adaptive-tickless usermode execution.
+ *
+ * We crowbar the ->dynticks_nmi_nesting field to DYNTICK_IRQ_NONIDLE to
+ * allow for the possibility of usermode upcalls messing up our count of
+ * interrupt nesting level during the busy period that is just now starting.
+ */
+static void noinstr ct_kernel_enter(bool user, int offset)
+{
+ struct context_tracking *ct = this_cpu_ptr(&context_tracking);
+ long oldval;
+
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !raw_irqs_disabled());
+ oldval = ct_dynticks_nesting();
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && oldval < 0);
+ if (oldval) {
+ // RCU was already watching, so just do accounting and leave.
+ ct->dynticks_nesting++;
+ return;
+ }
+ rcu_dynticks_task_exit();
+ // RCU is not watching here ...
+ ct_kernel_enter_state(offset);
+ // ... but is watching here.
+ instrumentation_begin();
+
+ // instrumentation for the noinstr ct_kernel_enter_state()
+ instrument_atomic_write(&ct->state, sizeof(ct->state));
+
+ trace_rcu_dyntick(TPS("End"), ct_dynticks_nesting(), 1, ct_dynticks());
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
+ WRITE_ONCE(ct->dynticks_nesting, 1);
+ WARN_ON_ONCE(ct_dynticks_nmi_nesting());
+ WRITE_ONCE(ct->dynticks_nmi_nesting, DYNTICK_IRQ_NONIDLE);
+ instrumentation_end();
+}
+
+/**
+ * ct_nmi_exit - inform RCU of exit from NMI context
+ *
+ * If we are returning from the outermost NMI handler that interrupted an
+ * RCU-idle period, update ct->state and ct->dynticks_nmi_nesting
+ * to let the RCU grace-period handling know that the CPU is back to
+ * being RCU-idle.
+ *
+ * If you add or remove a call to ct_nmi_exit(), be sure to test
+ * with CONFIG_RCU_EQS_DEBUG=y.
+ */
+void noinstr ct_nmi_exit(void)
+{
+ struct context_tracking *ct = this_cpu_ptr(&context_tracking);
+
+ instrumentation_begin();
+ /*
+ * Check for ->dynticks_nmi_nesting underflow and bad ->dynticks.
+ * (We are exiting an NMI handler, so RCU better be paying attention
+ * to us!)
+ */
+ WARN_ON_ONCE(ct_dynticks_nmi_nesting() <= 0);
+ WARN_ON_ONCE(rcu_dynticks_curr_cpu_in_eqs());
+
+ /*
+ * If the nesting level is not 1, the CPU wasn't RCU-idle, so
+ * leave it in non-RCU-idle state.
+ */
+ if (ct_dynticks_nmi_nesting() != 1) {
+ trace_rcu_dyntick(TPS("--="), ct_dynticks_nmi_nesting(), ct_dynticks_nmi_nesting() - 2,
+ ct_dynticks());
+ WRITE_ONCE(ct->dynticks_nmi_nesting, /* No store tearing. */
+ ct_dynticks_nmi_nesting() - 2);
+ instrumentation_end();
+ return;
+ }
+
+ /* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */
+ trace_rcu_dyntick(TPS("Startirq"), ct_dynticks_nmi_nesting(), 0, ct_dynticks());
+ WRITE_ONCE(ct->dynticks_nmi_nesting, 0); /* Avoid store tearing. */
+
+ // instrumentation for the noinstr ct_kernel_exit_state()
+ instrument_atomic_write(&ct->state, sizeof(ct->state));
+ instrumentation_end();
+
+ // RCU is watching here ...
+ ct_kernel_exit_state(RCU_DYNTICKS_IDX);
+ // ... but is no longer watching here.
+
+ if (!in_nmi())
+ rcu_dynticks_task_enter();
+}
+
+/**
+ * ct_nmi_enter - inform RCU of entry to NMI context
+ *
+ * If the CPU was idle from RCU's viewpoint, update ct->state and
+ * ct->dynticks_nmi_nesting to let the RCU grace-period handling know
+ * that the CPU is active. This implementation permits nested NMIs, as
+ * long as the nesting level does not overflow an int. (You will probably
+ * run out of stack space first.)
+ *
+ * If you add or remove a call to ct_nmi_enter(), be sure to test
+ * with CONFIG_RCU_EQS_DEBUG=y.
+ */
+void noinstr ct_nmi_enter(void)
+{
+ long incby = 2;
+ struct context_tracking *ct = this_cpu_ptr(&context_tracking);
+
+ /* Complain about underflow. */
+ WARN_ON_ONCE(ct_dynticks_nmi_nesting() < 0);
+
+ /*
+ * If idle from RCU viewpoint, atomically increment ->dynticks
+ * to mark non-idle and increment ->dynticks_nmi_nesting by one.
+ * Otherwise, increment ->dynticks_nmi_nesting by two. This means
+ * if ->dynticks_nmi_nesting is equal to one, we are guaranteed
+ * to be in the outermost NMI handler that interrupted an RCU-idle
+ * period (observation due to Andy Lutomirski).
+ */
+ if (rcu_dynticks_curr_cpu_in_eqs()) {
+
+ if (!in_nmi())
+ rcu_dynticks_task_exit();
+
+ // RCU is not watching here ...
+ ct_kernel_enter_state(RCU_DYNTICKS_IDX);
+ // ... but is watching here.
+
+ instrumentation_begin();
+ // instrumentation for the noinstr rcu_dynticks_curr_cpu_in_eqs()
+ instrument_atomic_read(&ct->state, sizeof(ct->state));
+ // instrumentation for the noinstr ct_kernel_enter_state()
+ instrument_atomic_write(&ct->state, sizeof(ct->state));
+
+ incby = 1;
+ } else if (!in_nmi()) {
+ instrumentation_begin();
+ rcu_irq_enter_check_tick();
+ } else {
+ instrumentation_begin();
+ }
+
+ trace_rcu_dyntick(incby == 1 ? TPS("Endirq") : TPS("++="),
+ ct_dynticks_nmi_nesting(),
+ ct_dynticks_nmi_nesting() + incby, ct_dynticks());
+ instrumentation_end();
+ WRITE_ONCE(ct->dynticks_nmi_nesting, /* Prevent store tearing. */
+ ct_dynticks_nmi_nesting() + incby);
+ barrier();
+}
+
+/**
+ * ct_idle_enter - inform RCU that current CPU is entering idle
+ *
+ * Enter idle mode, in other words, -leave- the mode in which RCU
+ * read-side critical sections can occur. (Though RCU read-side
+ * critical sections can occur in irq handlers in idle, a possibility
+ * handled by irq_enter() and irq_exit().)
+ *
+ * If you add or remove a call to ct_idle_enter(), be sure to test with
+ * CONFIG_RCU_EQS_DEBUG=y.
+ */
+void noinstr ct_idle_enter(void)
+{
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !raw_irqs_disabled());
+ ct_kernel_exit(false, RCU_DYNTICKS_IDX + CONTEXT_IDLE);
+}
+EXPORT_SYMBOL_GPL(ct_idle_enter);
+
+/**
+ * ct_idle_exit - inform RCU that current CPU is leaving idle
+ *
+ * Exit idle mode, in other words, -enter- the mode in which RCU
+ * read-side critical sections can occur.
+ *
+ * If you add or remove a call to ct_idle_exit(), be sure to test with
+ * CONFIG_RCU_EQS_DEBUG=y.
+ */
+void noinstr ct_idle_exit(void)
+{
+ unsigned long flags;
+
+ raw_local_irq_save(flags);
+ ct_kernel_enter(false, RCU_DYNTICKS_IDX - CONTEXT_IDLE);
+ raw_local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(ct_idle_exit);
+
+/**
+ * ct_irq_enter - inform RCU that current CPU is entering irq away from idle
+ *
+ * Enter an interrupt handler, which might possibly result in exiting
+ * idle mode, in other words, entering the mode in which read-side critical
+ * sections can occur. The caller must have disabled interrupts.
+ *
+ * Note that the Linux kernel is fully capable of entering an interrupt
+ * handler that it never exits, for example when doing upcalls to user mode!
+ * This code assumes that the idle loop never does upcalls to user mode.
+ * If your architecture's idle loop does do upcalls to user mode (or does
+ * anything else that results in unbalanced calls to the irq_enter() and
+ * irq_exit() functions), RCU will give you what you deserve, good and hard.
+ * But very infrequently and irreproducibly.
+ *
+ * Use things like work queues to work around this limitation.
+ *
+ * You have been warned.
+ *
+ * If you add or remove a call to ct_irq_enter(), be sure to test with
+ * CONFIG_RCU_EQS_DEBUG=y.
+ */
+noinstr void ct_irq_enter(void)
+{
+ lockdep_assert_irqs_disabled();
+ ct_nmi_enter();
+}
+
+/**
+ * ct_irq_exit - inform RCU that current CPU is exiting irq towards idle
+ *
+ * Exit from an interrupt handler, which might possibly result in entering
+ * idle mode, in other words, leaving the mode in which read-side critical
+ * sections can occur. The caller must have disabled interrupts.
+ *
+ * This code assumes that the idle loop never does anything that might
+ * result in unbalanced calls to irq_enter() and irq_exit(). If your
+ * architecture's idle loop violates this assumption, RCU will give you what
+ * you deserve, good and hard. But very infrequently and irreproducibly.
+ *
+ * Use things like work queues to work around this limitation.
+ *
+ * You have been warned.
+ *
+ * If you add or remove a call to ct_irq_exit(), be sure to test with
+ * CONFIG_RCU_EQS_DEBUG=y.
+ */
+noinstr void ct_irq_exit(void)
+{
+ lockdep_assert_irqs_disabled();
+ ct_nmi_exit();
+}
+
+/*
+ * Wrapper for ct_irq_enter() where interrupts are enabled.
+ *
+ * If you add or remove a call to ct_irq_enter_irqson(), be sure to test
+ * with CONFIG_RCU_EQS_DEBUG=y.
+ */
+void ct_irq_enter_irqson(void)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ ct_irq_enter();
+ local_irq_restore(flags);
+}
+
+/*
+ * Wrapper for ct_irq_exit() where interrupts are enabled.
+ *
+ * If you add or remove a call to ct_irq_exit_irqson(), be sure to test
+ * with CONFIG_RCU_EQS_DEBUG=y.
+ */
+void ct_irq_exit_irqson(void)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ ct_irq_exit();
+ local_irq_restore(flags);
+}
+#else
+static __always_inline void ct_kernel_exit(bool user, int offset) { }
+static __always_inline void ct_kernel_enter(bool user, int offset) { }
+#endif /* #ifdef CONFIG_CONTEXT_TRACKING_IDLE */
+
+#ifdef CONFIG_CONTEXT_TRACKING_USER
#define CREATE_TRACE_POINTS
#include <trace/events/context_tracking.h>
@@ -28,9 +435,6 @@
DEFINE_STATIC_KEY_FALSE(context_tracking_key);
EXPORT_SYMBOL_GPL(context_tracking_key);
-DEFINE_PER_CPU(struct context_tracking, context_tracking);
-EXPORT_SYMBOL_GPL(context_tracking);
-
static noinstr bool context_tracking_recursion_enter(void)
{
int recursion;
@@ -51,29 +455,32 @@ static __always_inline void context_tracking_recursion_exit(void)
}
/**
- * context_tracking_enter - Inform the context tracking that the CPU is going
- * enter user or guest space mode.
+ * __ct_user_enter - Inform the context tracking that the CPU is going
+ * to enter user or guest space mode.
*
* This function must be called right before we switch from the kernel
* to user or guest space, when it's guaranteed the remaining kernel
* instructions to execute won't use any RCU read side critical section
* because this function sets RCU in extended quiescent state.
*/
-void noinstr __context_tracking_enter(enum ctx_state state)
+void noinstr __ct_user_enter(enum ctx_state state)
{
+ struct context_tracking *ct = this_cpu_ptr(&context_tracking);
+ lockdep_assert_irqs_disabled();
+
/* Kernel threads aren't supposed to go to userspace */
WARN_ON_ONCE(!current->mm);
if (!context_tracking_recursion_enter())
return;
- if ( __this_cpu_read(context_tracking.state) != state) {
- if (__this_cpu_read(context_tracking.active)) {
+ if (__ct_state() != state) {
+ if (ct->active) {
/*
* At this stage, only low level arch entry code remains and
* then we'll run in userspace. We can assume there won't be
* any RCU read-side critical section until the next call to
- * user_exit() or rcu_irq_enter(). Let's remove RCU's dependency
+ * user_exit() or ct_irq_enter(). Let's remove RCU's dependency
* on the tick.
*/
if (state == CONTEXT_USER) {
@@ -82,35 +489,77 @@ void noinstr __context_tracking_enter(enum ctx_state state)
vtime_user_enter(current);
instrumentation_end();
}
- rcu_user_enter();
+ /*
+ * Other than generic entry implementation, we may be past the last
+ * rescheduling opportunity in the entry code. Trigger a self IPI
+ * that will fire and reschedule once we resume in user/guest mode.
+ */
+ rcu_irq_work_resched();
+
+ /*
+ * Enter RCU idle mode right before resuming userspace. No use of RCU
+ * is permitted between this call and rcu_eqs_exit(). This way the
+ * CPU doesn't need to maintain the tick for RCU maintenance purposes
+ * when the CPU runs in userspace.
+ */
+ ct_kernel_exit(true, RCU_DYNTICKS_IDX + state);
+
+ /*
+ * Special case if we only track user <-> kernel transitions for tickless
+ * cputime accounting but we don't support RCU extended quiescent state.
+ * In this we case we don't care about any concurrency/ordering.
+ */
+ if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE))
+ arch_atomic_set(&ct->state, state);
+ } else {
+ /*
+ * Even if context tracking is disabled on this CPU, because it's outside
+ * the full dynticks mask for example, we still have to keep track of the
+ * context transitions and states to prevent inconsistency on those of
+ * other CPUs.
+ * If a task triggers an exception in userspace, sleep on the exception
+ * handler and then migrate to another CPU, that new CPU must know where
+ * the exception returns by the time we call exception_exit().
+ * This information can only be provided by the previous CPU when it called
+ * exception_enter().
+ * OTOH we can spare the calls to vtime and RCU when context_tracking.active
+ * is false because we know that CPU is not tickless.
+ */
+ if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE)) {
+ /* Tracking for vtime only, no concurrent RCU EQS accounting */
+ arch_atomic_set(&ct->state, state);
+ } else {
+ /*
+ * Tracking for vtime and RCU EQS. Make sure we don't race
+ * with NMIs. OTOH we don't care about ordering here since
+ * RCU only requires RCU_DYNTICKS_IDX increments to be fully
+ * ordered.
+ */
+ arch_atomic_add(state, &ct->state);
+ }
}
- /*
- * Even if context tracking is disabled on this CPU, because it's outside
- * the full dynticks mask for example, we still have to keep track of the
- * context transitions and states to prevent inconsistency on those of
- * other CPUs.
- * If a task triggers an exception in userspace, sleep on the exception
- * handler and then migrate to another CPU, that new CPU must know where
- * the exception returns by the time we call exception_exit().
- * This information can only be provided by the previous CPU when it called
- * exception_enter().
- * OTOH we can spare the calls to vtime and RCU when context_tracking.active
- * is false because we know that CPU is not tickless.
- */
- __this_cpu_write(context_tracking.state, state);
}
context_tracking_recursion_exit();
}
-EXPORT_SYMBOL_GPL(__context_tracking_enter);
+EXPORT_SYMBOL_GPL(__ct_user_enter);
-void context_tracking_enter(enum ctx_state state)
+/*
+ * OBSOLETE:
+ * This function should be noinstr but the below local_irq_restore() is
+ * unsafe because it involves illegal RCU uses through tracing and lockdep.
+ * This is unlikely to be fixed as this function is obsolete. The preferred
+ * way is to call __context_tracking_enter() through user_enter_irqoff()
+ * or context_tracking_guest_enter(). It should be the arch entry code
+ * responsibility to call into context tracking with IRQs disabled.
+ */
+void ct_user_enter(enum ctx_state state)
{
unsigned long flags;
/*
* Some contexts may involve an exception occuring in an irq,
* leading to that nesting:
- * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit()
+ * ct_irq_enter() rcu_eqs_exit(true) rcu_eqs_enter(true) ct_irq_exit()
* This would mess up the dyntick_nesting count though. And rcu_irq_*()
* helpers are enough to protect RCU uses inside the exception. So
* just return immediately if we detect we are in an IRQ.
@@ -119,21 +568,32 @@ void context_tracking_enter(enum ctx_state state)
return;
local_irq_save(flags);
- __context_tracking_enter(state);
+ __ct_user_enter(state);
local_irq_restore(flags);
}
-NOKPROBE_SYMBOL(context_tracking_enter);
-EXPORT_SYMBOL_GPL(context_tracking_enter);
+NOKPROBE_SYMBOL(ct_user_enter);
+EXPORT_SYMBOL_GPL(ct_user_enter);
-void context_tracking_user_enter(void)
+/**
+ * user_enter_callable() - Unfortunate ASM callable version of user_enter() for
+ * archs that didn't manage to check the context tracking
+ * static key from low level code.
+ *
+ * This OBSOLETE function should be noinstr but it unsafely calls
+ * local_irq_restore(), involving illegal RCU uses through tracing and lockdep.
+ * This is unlikely to be fixed as this function is obsolete. The preferred
+ * way is to call user_enter_irqoff(). It should be the arch entry code
+ * responsibility to call into context tracking with IRQs disabled.
+ */
+void user_enter_callable(void)
{
user_enter();
}
-NOKPROBE_SYMBOL(context_tracking_user_enter);
+NOKPROBE_SYMBOL(user_enter_callable);
/**
- * context_tracking_exit - Inform the context tracking that the CPU is
- * exiting user or guest mode and entering the kernel.
+ * __ct_user_exit - Inform the context tracking that the CPU is
+ * exiting user or guest mode and entering the kernel.
*
* This function must be called after we entered the kernel from user or
* guest space before any use of RCU read side critical section. This
@@ -143,32 +603,64 @@ NOKPROBE_SYMBOL(context_tracking_user_enter);
* This call supports re-entrancy. This way it can be called from any exception
* handler without needing to know if we came from userspace or not.
*/
-void noinstr __context_tracking_exit(enum ctx_state state)
+void noinstr __ct_user_exit(enum ctx_state state)
{
+ struct context_tracking *ct = this_cpu_ptr(&context_tracking);
+
if (!context_tracking_recursion_enter())
return;
- if (__this_cpu_read(context_tracking.state) == state) {
- if (__this_cpu_read(context_tracking.active)) {
+ if (__ct_state() == state) {
+ if (ct->active) {
/*
- * We are going to run code that may use RCU. Inform
- * RCU core about that (ie: we may need the tick again).
+ * Exit RCU idle mode while entering the kernel because it can
+ * run a RCU read side critical section anytime.
*/
- rcu_user_exit();
+ ct_kernel_enter(true, RCU_DYNTICKS_IDX - state);
if (state == CONTEXT_USER) {
instrumentation_begin();
vtime_user_exit(current);
trace_user_exit(0);
instrumentation_end();
}
+
+ /*
+ * Special case if we only track user <-> kernel transitions for tickless
+ * cputime accounting but we don't support RCU extended quiescent state.
+ * In this we case we don't care about any concurrency/ordering.
+ */
+ if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE))
+ arch_atomic_set(&ct->state, CONTEXT_KERNEL);
+
+ } else {
+ if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE)) {
+ /* Tracking for vtime only, no concurrent RCU EQS accounting */
+ arch_atomic_set(&ct->state, CONTEXT_KERNEL);
+ } else {
+ /*
+ * Tracking for vtime and RCU EQS. Make sure we don't race
+ * with NMIs. OTOH we don't care about ordering here since
+ * RCU only requires RCU_DYNTICKS_IDX increments to be fully
+ * ordered.
+ */
+ arch_atomic_sub(state, &ct->state);
+ }
}
- __this_cpu_write(context_tracking.state, CONTEXT_KERNEL);
}
context_tracking_recursion_exit();
}
-EXPORT_SYMBOL_GPL(__context_tracking_exit);
+EXPORT_SYMBOL_GPL(__ct_user_exit);
-void context_tracking_exit(enum ctx_state state)
+/*
+ * OBSOLETE:
+ * This function should be noinstr but the below local_irq_save() is
+ * unsafe because it involves illegal RCU uses through tracing and lockdep.
+ * This is unlikely to be fixed as this function is obsolete. The preferred
+ * way is to call __context_tracking_exit() through user_exit_irqoff()
+ * or context_tracking_guest_exit(). It should be the arch entry code
+ * responsibility to call into context tracking with IRQs disabled.
+ */
+void ct_user_exit(enum ctx_state state)
{
unsigned long flags;
@@ -176,19 +668,30 @@ void context_tracking_exit(enum ctx_state state)
return;
local_irq_save(flags);
- __context_tracking_exit(state);
+ __ct_user_exit(state);
local_irq_restore(flags);
}
-NOKPROBE_SYMBOL(context_tracking_exit);
-EXPORT_SYMBOL_GPL(context_tracking_exit);
+NOKPROBE_SYMBOL(ct_user_exit);
+EXPORT_SYMBOL_GPL(ct_user_exit);
-void context_tracking_user_exit(void)
+/**
+ * user_exit_callable() - Unfortunate ASM callable version of user_exit() for
+ * archs that didn't manage to check the context tracking
+ * static key from low level code.
+ *
+ * This OBSOLETE function should be noinstr but it unsafely calls local_irq_save(),
+ * involving illegal RCU uses through tracing and lockdep. This is unlikely
+ * to be fixed as this function is obsolete. The preferred way is to call
+ * user_exit_irqoff(). It should be the arch entry code responsibility to
+ * call into context tracking with IRQs disabled.
+ */
+void user_exit_callable(void)
{
user_exit();
}
-NOKPROBE_SYMBOL(context_tracking_user_exit);
+NOKPROBE_SYMBOL(user_exit_callable);
-void __init context_tracking_cpu_set(int cpu)
+void __init ct_cpu_track_user(int cpu)
{
static __initdata bool initialized = false;
@@ -212,12 +715,14 @@ void __init context_tracking_cpu_set(int cpu)
initialized = true;
}
-#ifdef CONFIG_CONTEXT_TRACKING_FORCE
+#ifdef CONFIG_CONTEXT_TRACKING_USER_FORCE
void __init context_tracking_init(void)
{
int cpu;
for_each_possible_cpu(cpu)
- context_tracking_cpu_set(cpu);
+ ct_cpu_track_user(cpu);
}
#endif
+
+#endif /* #ifdef CONFIG_CONTEXT_TRACKING_USER */
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 804b847912dc..f4a2c5845bcb 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -31,8 +31,11 @@
#include <linux/smpboot.h>
#include <linux/relay.h>
#include <linux/slab.h>
+#include <linux/scs.h>
#include <linux/percpu-rwsem.h>
#include <linux/cpuset.h>
+#include <linux/random.h>
+#include <linux/cc_platform.h>
#include <trace/events/power.h>
#define CREATE_TRACE_POINTS
@@ -41,14 +44,19 @@
#include "smpboot.h"
/**
- * cpuhp_cpu_state - Per cpu hotplug state storage
+ * struct cpuhp_cpu_state - Per cpu hotplug state storage
* @state: The current cpu state
* @target: The target state
+ * @fail: Current CPU hotplug callback state
* @thread: Pointer to the hotplug thread
* @should_run: Thread should execute
* @rollback: Perform a rollback
* @single: Single callback invocation
* @bringup: Single callback bringup or teardown selector
+ * @cpu: CPU number
+ * @node: Remote CPU node; for multi-instance, do a
+ * single entry callback for install/remove
+ * @last: For multi-instance rollback, remember how far we got
* @cb_state: The state for a single callback (install/uninstall)
* @result: Result of the operation
* @done_up: Signal completion to the issuer of the task for cpu-up
@@ -64,7 +72,6 @@ struct cpuhp_cpu_state {
bool rollback;
bool single;
bool bringup;
- int cpu;
struct hlist_node *node;
struct hlist_node *last;
enum cpuhp_state cb_state;
@@ -106,11 +113,12 @@ static inline void cpuhp_lock_release(bool bringup) { }
#endif
/**
- * cpuhp_step - Hotplug state machine step
+ * struct cpuhp_step - Hotplug state machine step
* @name: Name of the step
* @startup: Startup function of the step
* @teardown: Teardown function of the step
* @cant_stop: Bringup/teardown can't be stopped at this step
+ * @multi_instance: State has multiple instances which get added afterwards
*/
struct cpuhp_step {
const char *name;
@@ -124,7 +132,9 @@ struct cpuhp_step {
int (*multi)(unsigned int cpu,
struct hlist_node *node);
} teardown;
+ /* private: */
struct hlist_head list;
+ /* public: */
bool cant_stop;
bool multi_instance;
};
@@ -143,7 +153,7 @@ static bool cpuhp_step_empty(bool bringup, struct cpuhp_step *step)
}
/**
- * cpuhp_invoke_callback _ Invoke the callbacks for a given state
+ * cpuhp_invoke_callback - Invoke the callbacks for a given state
* @cpu: The cpu for which the callback should be invoked
* @state: The state to do callbacks for
* @bringup: True if the bringup callback should be invoked
@@ -151,6 +161,8 @@ static bool cpuhp_step_empty(bool bringup, struct cpuhp_step *step)
* @lastp: For multi-instance rollback, remember how far we got
*
* Called from cpu hotplug and from the state register machinery.
+ *
+ * Return: %0 on success or a negative errno code
*/
static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state,
bool bringup, struct hlist_node *node,
@@ -463,7 +475,7 @@ static inline bool cpu_smt_allowed(unsigned int cpu) { return true; }
#endif
static inline enum cpuhp_state
-cpuhp_set_state(struct cpuhp_cpu_state *st, enum cpuhp_state target)
+cpuhp_set_state(int cpu, struct cpuhp_cpu_state *st, enum cpuhp_state target)
{
enum cpuhp_state prev_state = st->state;
bool bringup = st->state < target;
@@ -474,14 +486,15 @@ cpuhp_set_state(struct cpuhp_cpu_state *st, enum cpuhp_state target)
st->target = target;
st->single = false;
st->bringup = bringup;
- if (cpu_dying(st->cpu) != !bringup)
- set_cpu_dying(st->cpu, !bringup);
+ if (cpu_dying(cpu) != !bringup)
+ set_cpu_dying(cpu, !bringup);
return prev_state;
}
static inline void
-cpuhp_reset_state(struct cpuhp_cpu_state *st, enum cpuhp_state prev_state)
+cpuhp_reset_state(int cpu, struct cpuhp_cpu_state *st,
+ enum cpuhp_state prev_state)
{
bool bringup = !st->bringup;
@@ -508,8 +521,8 @@ cpuhp_reset_state(struct cpuhp_cpu_state *st, enum cpuhp_state prev_state)
}
st->bringup = bringup;
- if (cpu_dying(st->cpu) != !bringup)
- set_cpu_dying(st->cpu, !bringup);
+ if (cpu_dying(cpu) != !bringup)
+ set_cpu_dying(cpu, !bringup);
}
/* Regular hotplug invocation of the AP hotplug thread */
@@ -529,15 +542,16 @@ static void __cpuhp_kick_ap(struct cpuhp_cpu_state *st)
wait_for_ap_thread(st, st->bringup);
}
-static int cpuhp_kick_ap(struct cpuhp_cpu_state *st, enum cpuhp_state target)
+static int cpuhp_kick_ap(int cpu, struct cpuhp_cpu_state *st,
+ enum cpuhp_state target)
{
enum cpuhp_state prev_state;
int ret;
- prev_state = cpuhp_set_state(st, target);
+ prev_state = cpuhp_set_state(cpu, st, target);
__cpuhp_kick_ap(st);
if ((ret = st->result)) {
- cpuhp_reset_state(st, prev_state);
+ cpuhp_reset_state(cpu, st, prev_state);
__cpuhp_kick_ap(st);
}
@@ -569,7 +583,7 @@ static int bringup_wait_for_ap(unsigned int cpu)
if (st->target <= CPUHP_AP_ONLINE_IDLE)
return 0;
- return cpuhp_kick_ap(st, st->target);
+ return cpuhp_kick_ap(cpu, st, st->target);
}
static int bringup_cpu(unsigned int cpu)
@@ -578,6 +592,12 @@ static int bringup_cpu(unsigned int cpu)
int ret;
/*
+ * Reset stale stack state from the last time this CPU was online.
+ */
+ scs_task_reset(idle);
+ kasan_unpoison_task_stack(idle);
+
+ /*
* Some architectures have to walk the irq descriptors to
* setup the vector space for the cpu which comes online.
* Prevent irq alloc/free across the bringup.
@@ -603,7 +623,7 @@ static int finish_cpu(unsigned int cpu)
*/
if (mm != &init_mm)
idle->active_mm = &init_mm;
- mmdrop(mm);
+ mmdrop_lazy_tlb(mm);
return 0;
}
@@ -643,21 +663,51 @@ static bool cpuhp_next_state(bool bringup,
return true;
}
-static int cpuhp_invoke_callback_range(bool bringup,
- unsigned int cpu,
- struct cpuhp_cpu_state *st,
- enum cpuhp_state target)
+static int __cpuhp_invoke_callback_range(bool bringup,
+ unsigned int cpu,
+ struct cpuhp_cpu_state *st,
+ enum cpuhp_state target,
+ bool nofail)
{
enum cpuhp_state state;
- int err = 0;
+ int ret = 0;
while (cpuhp_next_state(bringup, &state, st, target)) {
+ int err;
+
err = cpuhp_invoke_callback(cpu, state, bringup, NULL, NULL);
- if (err)
+ if (!err)
+ continue;
+
+ if (nofail) {
+ pr_warn("CPU %u %s state %s (%d) failed (%d)\n",
+ cpu, bringup ? "UP" : "DOWN",
+ cpuhp_get_step(st->state)->name,
+ st->state, err);
+ ret = -1;
+ } else {
+ ret = err;
break;
+ }
}
- return err;
+ return ret;
+}
+
+static inline int cpuhp_invoke_callback_range(bool bringup,
+ unsigned int cpu,
+ struct cpuhp_cpu_state *st,
+ enum cpuhp_state target)
+{
+ return __cpuhp_invoke_callback_range(bringup, cpu, st, target, false);
+}
+
+static inline void cpuhp_invoke_callback_range_nofail(bool bringup,
+ unsigned int cpu,
+ struct cpuhp_cpu_state *st,
+ enum cpuhp_state target)
+{
+ __cpuhp_invoke_callback_range(bringup, cpu, st, target, true);
}
static inline bool can_rollback_cpu(struct cpuhp_cpu_state *st)
@@ -682,7 +732,11 @@ static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
ret = cpuhp_invoke_callback_range(true, cpu, st, target);
if (ret) {
- cpuhp_reset_state(st, prev_state);
+ pr_debug("CPU UP failed (%d) CPU %u state %s (%d)\n",
+ ret, cpu, cpuhp_get_step(st->state)->name,
+ st->state);
+
+ cpuhp_reset_state(cpu, st, prev_state);
if (can_rollback_cpu(st))
WARN_ON(cpuhp_invoke_callback_range(false, cpu, st,
prev_state));
@@ -693,15 +747,6 @@ static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
/*
* The cpu hotplug threads manage the bringup and teardown of the cpus
*/
-static void cpuhp_create(unsigned int cpu)
-{
- struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
-
- init_completion(&st->done_up);
- init_completion(&st->done_down);
- st->cpu = cpu;
-}
-
static int cpuhp_should_run(unsigned int cpu)
{
struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
@@ -853,7 +898,7 @@ static int cpuhp_kick_ap_work(unsigned int cpu)
cpuhp_lock_release(true);
trace_cpuhp_enter(cpu, st->target, prev_state, cpuhp_kick_ap_work);
- ret = cpuhp_kick_ap(st, st->target);
+ ret = cpuhp_kick_ap(cpu, st, st->target);
trace_cpuhp_exit(cpu, st->state, prev_state, ret);
return ret;
@@ -861,15 +906,27 @@ static int cpuhp_kick_ap_work(unsigned int cpu)
static struct smp_hotplug_thread cpuhp_threads = {
.store = &cpuhp_state.thread,
- .create = &cpuhp_create,
.thread_should_run = cpuhp_should_run,
.thread_fn = cpuhp_thread_fun,
.thread_comm = "cpuhp/%u",
.selfparking = true,
};
+static __init void cpuhp_init_state(void)
+{
+ struct cpuhp_cpu_state *st;
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ st = per_cpu_ptr(&cpuhp_state, cpu);
+ init_completion(&st->done_up);
+ init_completion(&st->done_down);
+ }
+}
+
void __init cpuhp_threads_init(void)
{
+ cpuhp_init_state();
BUG_ON(smpboot_register_percpu_thread(&cpuhp_threads));
kthread_unpark(this_cpu_read(cpuhp_state.thread));
}
@@ -972,7 +1029,6 @@ static int take_cpu_down(void *_param)
struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
enum cpuhp_state target = max((int)st->target, CPUHP_AP_OFFLINE);
int err, cpu = smp_processor_id();
- int ret;
/* Ensure this CPU doesn't handle any more interrupts. */
err = __cpu_disable();
@@ -985,13 +1041,10 @@ static int take_cpu_down(void *_param)
*/
WARN_ON(st->state != (CPUHP_TEARDOWN_CPU - 1));
- /* Invoke the former CPU_DYING callbacks */
- ret = cpuhp_invoke_callback_range(false, cpu, st, target);
-
/*
- * DYING must not fail!
+ * Invoke the former CPU_DYING callbacks. DYING must not fail!
*/
- WARN_ON_ONCE(ret);
+ cpuhp_invoke_callback_range_nofail(false, cpu, st, target);
/* Give up timekeeping duties */
tick_handover_do_timer();
@@ -1081,8 +1134,11 @@ static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
ret = cpuhp_invoke_callback_range(false, cpu, st, target);
if (ret) {
+ pr_debug("CPU DOWN failed (%d) CPU %u state %s (%d)\n",
+ ret, cpu, cpuhp_get_step(st->state)->name,
+ st->state);
- cpuhp_reset_state(st, prev_state);
+ cpuhp_reset_state(cpu, st, prev_state);
if (st->state < prev_state)
WARN_ON(cpuhp_invoke_callback_range(true, cpu, st,
@@ -1109,7 +1165,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
cpuhp_tasks_frozen = tasks_frozen;
- prev_state = cpuhp_set_state(st, target);
+ prev_state = cpuhp_set_state(cpu, st, target);
/*
* If the current CPU state is in the range of the AP hotplug thread,
* then we need to kick the thread.
@@ -1140,7 +1196,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
ret = cpuhp_down_callbacks(cpu, st, target);
if (ret && st->state < prev_state) {
if (st->state == CPUHP_TEARDOWN_CPU) {
- cpuhp_reset_state(st, prev_state);
+ cpuhp_reset_state(cpu, st, prev_state);
__cpuhp_kick_ap(st);
} else {
WARN(1, "DEAD callback error for CPU%d", cpu);
@@ -1161,6 +1217,12 @@ out:
static int cpu_down_maps_locked(unsigned int cpu, enum cpuhp_state target)
{
+ /*
+ * If the platform does not support hotplug, report it explicitly to
+ * differentiate it from a transient offlining failure.
+ */
+ if (cc_platform_has(CC_ATTR_HOTPLUG_DISABLED))
+ return -EOPNOTSUPP;
if (cpu_hotplug_disabled)
return -EBUSY;
return _cpu_down(cpu, 0, target);
@@ -1183,6 +1245,8 @@ static int cpu_down(unsigned int cpu, enum cpuhp_state target)
* This function is meant to be used by device core cpu subsystem only.
*
* Other subsystems should use remove_cpu() instead.
+ *
+ * Return: %0 on success or a negative errno code
*/
int cpu_device_down(struct device *dev)
{
@@ -1258,16 +1322,14 @@ void notify_cpu_starting(unsigned int cpu)
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
enum cpuhp_state target = min((int)st->target, CPUHP_AP_ONLINE);
- int ret;
rcu_cpu_starting(cpu); /* Enables RCU usage on this CPU. */
cpumask_set_cpu(cpu, &cpus_booted_once_mask);
- ret = cpuhp_invoke_callback_range(true, cpu, st, target);
/*
* STARTING must not fail!
*/
- WARN_ON_ONCE(ret);
+ cpuhp_invoke_callback_range_nofail(true, cpu, st, target);
}
/*
@@ -1325,7 +1387,7 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target)
cpuhp_tasks_frozen = tasks_frozen;
- cpuhp_set_state(st, target);
+ cpuhp_set_state(cpu, st, target);
/*
* If the current CPU state is in the range of the AP hotplug thread,
* then we need to kick the thread once more.
@@ -1395,6 +1457,8 @@ out:
* This function is meant to be used by device core cpu subsystem only.
*
* Other subsystems should use add_cpu() instead.
+ *
+ * Return: %0 on success or a negative errno code
*/
int cpu_device_up(struct device *dev)
{
@@ -1420,6 +1484,8 @@ EXPORT_SYMBOL_GPL(add_cpu);
* On some architectures like arm64, we can hibernate on any CPU, but on
* wake up the CPU we hibernated on might be offline as a side effect of
* using maxcpus= for example.
+ *
+ * Return: %0 on success or a negative errno code
*/
int bringup_hibernate_cpu(unsigned int sleep_cpu)
{
@@ -1458,8 +1524,8 @@ int freeze_secondary_cpus(int primary)
cpu_maps_update_begin();
if (primary == -1) {
primary = cpumask_first(cpu_online_mask);
- if (!housekeeping_cpu(primary, HK_FLAG_TIMER))
- primary = housekeeping_any_cpu(HK_FLAG_TIMER);
+ if (!housekeeping_cpu(primary, HK_TYPE_TIMER))
+ primary = housekeeping_any_cpu(HK_TYPE_TIMER);
} else {
if (!cpu_online(primary))
primary = cpumask_first(cpu_online_mask);
@@ -1629,6 +1695,11 @@ static struct cpuhp_step cpuhp_hp_states[] = {
.startup.single = perf_event_init_cpu,
.teardown.single = perf_event_exit_cpu,
},
+ [CPUHP_RANDOM_PREPARE] = {
+ .name = "random:prepare",
+ .startup.single = random_prepare_cpu,
+ .teardown.single = NULL,
+ },
[CPUHP_WORKQUEUE_PREP] = {
.name = "workqueue:prepare",
.startup.single = workqueue_prepare_cpu,
@@ -1752,6 +1823,11 @@ static struct cpuhp_step cpuhp_hp_states[] = {
.startup.single = workqueue_online_cpu,
.teardown.single = workqueue_offline_cpu,
},
+ [CPUHP_AP_RANDOM_ONLINE] = {
+ .name = "random:online",
+ .startup.single = random_online_cpu,
+ .teardown.single = NULL,
+ },
[CPUHP_AP_RCUTREE_ONLINE] = {
.name = "RCU/tree:online",
.startup.single = rcutree_online_cpu,
@@ -1976,6 +2052,7 @@ EXPORT_SYMBOL_GPL(__cpuhp_state_add_instance);
/**
* __cpuhp_setup_state_cpuslocked - Setup the callbacks for an hotplug machine state
* @state: The state to setup
+ * @name: Name of the step
* @invoke: If true, the startup function is invoked for cpus where
* cpu state >= @state
* @startup: startup callback function
@@ -1984,9 +2061,9 @@ EXPORT_SYMBOL_GPL(__cpuhp_state_add_instance);
* added afterwards.
*
* The caller needs to hold cpus read locked while calling this function.
- * Returns:
+ * Return:
* On success:
- * Positive state number if @state is CPUHP_AP_ONLINE_DYN
+ * Positive state number if @state is CPUHP_AP_ONLINE_DYN;
* 0 for all other states
* On failure: proper (negative) error code
*/
@@ -2232,18 +2309,17 @@ int cpuhp_smt_enable(void)
#endif
#if defined(CONFIG_SYSFS) && defined(CONFIG_HOTPLUG_CPU)
-static ssize_t show_cpuhp_state(struct device *dev,
- struct device_attribute *attr, char *buf)
+static ssize_t state_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id);
return sprintf(buf, "%d\n", st->state);
}
-static DEVICE_ATTR(state, 0444, show_cpuhp_state, NULL);
+static DEVICE_ATTR_RO(state);
-static ssize_t write_cpuhp_target(struct device *dev,
- struct device_attribute *attr,
- const char *buf, size_t count)
+static ssize_t target_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t count)
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id);
struct cpuhp_step *sp;
@@ -2274,26 +2350,26 @@ static ssize_t write_cpuhp_target(struct device *dev,
if (st->state < target)
ret = cpu_up(dev->id, target);
- else
+ else if (st->state > target)
ret = cpu_down(dev->id, target);
+ else if (WARN_ON(st->target != target))
+ st->target = target;
out:
unlock_device_hotplug();
return ret ? ret : count;
}
-static ssize_t show_cpuhp_target(struct device *dev,
- struct device_attribute *attr, char *buf)
+static ssize_t target_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id);
return sprintf(buf, "%d\n", st->target);
}
-static DEVICE_ATTR(target, 0644, show_cpuhp_target, write_cpuhp_target);
-
+static DEVICE_ATTR_RW(target);
-static ssize_t write_cpuhp_fail(struct device *dev,
- struct device_attribute *attr,
- const char *buf, size_t count)
+static ssize_t fail_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t count)
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id);
struct cpuhp_step *sp;
@@ -2342,15 +2418,15 @@ static ssize_t write_cpuhp_fail(struct device *dev,
return count;
}
-static ssize_t show_cpuhp_fail(struct device *dev,
- struct device_attribute *attr, char *buf)
+static ssize_t fail_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id);
return sprintf(buf, "%d\n", st->fail);
}
-static DEVICE_ATTR(fail, 0644, show_cpuhp_fail, write_cpuhp_fail);
+static DEVICE_ATTR_RW(fail);
static struct attribute *cpuhp_cpu_attrs[] = {
&dev_attr_state.attr,
@@ -2365,7 +2441,7 @@ static const struct attribute_group cpuhp_cpu_attr_group = {
NULL
};
-static ssize_t show_cpuhp_states(struct device *dev,
+static ssize_t states_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
ssize_t cur, res = 0;
@@ -2384,7 +2460,7 @@ static ssize_t show_cpuhp_states(struct device *dev,
mutex_unlock(&cpuhp_state_mutex);
return res;
}
-static DEVICE_ATTR(states, 0444, show_cpuhp_states, NULL);
+static DEVICE_ATTR_RO(states);
static struct attribute *cpuhp_cpu_root_attrs[] = {
&dev_attr_states.attr,
@@ -2457,28 +2533,27 @@ static const char *smt_states[] = {
[CPU_SMT_NOT_IMPLEMENTED] = "notimplemented",
};
-static ssize_t
-show_smt_control(struct device *dev, struct device_attribute *attr, char *buf)
+static ssize_t control_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
{
const char *state = smt_states[cpu_smt_control];
return snprintf(buf, PAGE_SIZE - 2, "%s\n", state);
}
-static ssize_t
-store_smt_control(struct device *dev, struct device_attribute *attr,
- const char *buf, size_t count)
+static ssize_t control_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t count)
{
return __store_smt_control(dev, attr, buf, count);
}
-static DEVICE_ATTR(control, 0644, show_smt_control, store_smt_control);
+static DEVICE_ATTR_RW(control);
-static ssize_t
-show_smt_active(struct device *dev, struct device_attribute *attr, char *buf)
+static ssize_t active_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
{
return snprintf(buf, PAGE_SIZE - 2, "%d\n", sched_smt_active());
}
-static DEVICE_ATTR(active, 0444, show_smt_active, NULL);
+static DEVICE_ATTR_RO(active);
static struct attribute *cpuhp_smt_attrs[] = {
&dev_attr_control.attr,
@@ -2494,22 +2569,33 @@ static const struct attribute_group cpuhp_smt_attr_group = {
static int __init cpu_smt_sysfs_init(void)
{
- return sysfs_create_group(&cpu_subsys.dev_root->kobj,
- &cpuhp_smt_attr_group);
+ struct device *dev_root;
+ int ret = -ENODEV;
+
+ dev_root = bus_get_dev_root(&cpu_subsys);
+ if (dev_root) {
+ ret = sysfs_create_group(&dev_root->kobj, &cpuhp_smt_attr_group);
+ put_device(dev_root);
+ }
+ return ret;
}
static int __init cpuhp_sysfs_init(void)
{
+ struct device *dev_root;
int cpu, ret;
ret = cpu_smt_sysfs_init();
if (ret)
return ret;
- ret = sysfs_create_group(&cpu_subsys.dev_root->kobj,
- &cpuhp_cpu_root_attr_group);
- if (ret)
- return ret;
+ dev_root = bus_get_dev_root(&cpu_subsys);
+ if (dev_root) {
+ ret = sysfs_create_group(&dev_root->kobj, &cpuhp_cpu_root_attr_group);
+ put_device(dev_root);
+ if (ret)
+ return ret;
+ }
for_each_possible_cpu(cpu) {
struct device *dev = get_cpu_device(cpu);
@@ -2639,6 +2725,7 @@ void __init boot_cpu_hotplug_init(void)
cpumask_set_cpu(smp_processor_id(), &cpus_booted_once_mask);
#endif
this_cpu_write(cpuhp_state.state, CPUHP_ONLINE);
+ this_cpu_write(cpuhp_state.target, CPUHP_ONLINE);
}
/*
diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c
index f7e1d0eccdbc..b0f0d15085db 100644
--- a/kernel/cpu_pm.c
+++ b/kernel/cpu_pm.c
@@ -13,31 +13,38 @@
#include <linux/spinlock.h>
#include <linux/syscore_ops.h>
-static ATOMIC_NOTIFIER_HEAD(cpu_pm_notifier_chain);
+/*
+ * atomic_notifiers use a spinlock_t, which can block under PREEMPT_RT.
+ * Notifications for cpu_pm will be issued by the idle task itself, which can
+ * never block, IOW it requires using a raw_spinlock_t.
+ */
+static struct {
+ struct raw_notifier_head chain;
+ raw_spinlock_t lock;
+} cpu_pm_notifier = {
+ .chain = RAW_NOTIFIER_INIT(cpu_pm_notifier.chain),
+ .lock = __RAW_SPIN_LOCK_UNLOCKED(cpu_pm_notifier.lock),
+};
static int cpu_pm_notify(enum cpu_pm_event event)
{
int ret;
- /*
- * atomic_notifier_call_chain has a RCU read critical section, which
- * could be disfunctional in cpu idle. Copy RCU_NONIDLE code to let
- * RCU know this.
- */
- rcu_irq_enter_irqson();
- ret = atomic_notifier_call_chain(&cpu_pm_notifier_chain, event, NULL);
- rcu_irq_exit_irqson();
+ rcu_read_lock();
+ ret = raw_notifier_call_chain(&cpu_pm_notifier.chain, event, NULL);
+ rcu_read_unlock();
return notifier_to_errno(ret);
}
static int cpu_pm_notify_robust(enum cpu_pm_event event_up, enum cpu_pm_event event_down)
{
+ unsigned long flags;
int ret;
- rcu_irq_enter_irqson();
- ret = atomic_notifier_call_chain_robust(&cpu_pm_notifier_chain, event_up, event_down, NULL);
- rcu_irq_exit_irqson();
+ raw_spin_lock_irqsave(&cpu_pm_notifier.lock, flags);
+ ret = raw_notifier_call_chain_robust(&cpu_pm_notifier.chain, event_up, event_down, NULL);
+ raw_spin_unlock_irqrestore(&cpu_pm_notifier.lock, flags);
return notifier_to_errno(ret);
}
@@ -49,12 +56,17 @@ static int cpu_pm_notify_robust(enum cpu_pm_event event_up, enum cpu_pm_event ev
* Add a driver to a list of drivers that are notified about
* CPU and CPU cluster low power entry and exit.
*
- * This function may sleep, and has the same return conditions as
- * raw_notifier_chain_register.
+ * This function has the same return conditions as raw_notifier_chain_register.
*/
int cpu_pm_register_notifier(struct notifier_block *nb)
{
- return atomic_notifier_chain_register(&cpu_pm_notifier_chain, nb);
+ unsigned long flags;
+ int ret;
+
+ raw_spin_lock_irqsave(&cpu_pm_notifier.lock, flags);
+ ret = raw_notifier_chain_register(&cpu_pm_notifier.chain, nb);
+ raw_spin_unlock_irqrestore(&cpu_pm_notifier.lock, flags);
+ return ret;
}
EXPORT_SYMBOL_GPL(cpu_pm_register_notifier);
@@ -64,12 +76,17 @@ EXPORT_SYMBOL_GPL(cpu_pm_register_notifier);
*
* Remove a driver from the CPU PM notifier list.
*
- * This function may sleep, and has the same return conditions as
- * raw_notifier_chain_unregister.
+ * This function has the same return conditions as raw_notifier_chain_unregister.
*/
int cpu_pm_unregister_notifier(struct notifier_block *nb)
{
- return atomic_notifier_chain_unregister(&cpu_pm_notifier_chain, nb);
+ unsigned long flags;
+ int ret;
+
+ raw_spin_lock_irqsave(&cpu_pm_notifier.lock, flags);
+ ret = raw_notifier_chain_unregister(&cpu_pm_notifier.chain, nb);
+ raw_spin_unlock_irqrestore(&cpu_pm_notifier.lock, flags);
+ return ret;
}
EXPORT_SYMBOL_GPL(cpu_pm_unregister_notifier);
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index eb53f5ec62c9..90ce1dfd591c 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -6,14 +6,18 @@
#include <linux/buildid.h>
#include <linux/crash_core.h>
+#include <linux/init.h>
#include <linux/utsname.h>
#include <linux/vmalloc.h>
+#include <linux/sizes.h>
#include <asm/page.h>
#include <asm/sections.h>
#include <crypto/sha1.h>
+#include "kallsyms_internal.h"
+
/* vmcoreinfo stuff */
unsigned char *vmcoreinfo_data;
size_t vmcoreinfo_size;
@@ -42,6 +46,15 @@ static int __init parse_crashkernel_mem(char *cmdline,
unsigned long long *crash_base)
{
char *cur = cmdline, *tmp;
+ unsigned long long total_mem = system_ram;
+
+ /*
+ * Firmware sometimes reserves some memory regions for its own use,
+ * so the system memory size is less than the actual physical memory
+ * size. Work around this by rounding up the total size to 128M,
+ * which is enough for most test cases.
+ */
+ total_mem = roundup(total_mem, SZ_128M);
/* for each entry of the comma-separated list */
do {
@@ -86,13 +99,13 @@ static int __init parse_crashkernel_mem(char *cmdline,
return -EINVAL;
}
cur = tmp;
- if (size >= system_ram) {
+ if (size >= total_mem) {
pr_warn("crashkernel: invalid size\n");
return -EINVAL;
}
/* match ? */
- if (system_ram >= start && system_ram < end) {
+ if (total_mem >= start && total_mem < end) {
*crash_size = size;
break;
}
@@ -221,9 +234,6 @@ next:
p = strstr(p+1, name);
}
- if (!ck_cmdline)
- return NULL;
-
return ck_cmdline;
}
@@ -242,9 +252,8 @@ static int __init __parse_crashkernel(char *cmdline,
*crash_base = 0;
ck_cmdline = get_last_crashkernel(cmdline, name, suffix);
-
if (!ck_cmdline)
- return -EINVAL;
+ return -ENOENT;
ck_cmdline += strlen(name);
@@ -295,6 +304,16 @@ int __init parse_crashkernel_low(char *cmdline,
"crashkernel=", suffix_tbl[SUFFIX_LOW]);
}
+/*
+ * Add a dummy early_param handler to mark crashkernel= as a known command line
+ * parameter and suppress incorrect warnings in init/main.c.
+ */
+static int __init parse_crashkernel_dummy(char *arg)
+{
+ return 0;
+}
+early_param("crashkernel", parse_crashkernel_dummy);
+
Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type,
void *data, size_t data_len)
{
@@ -364,6 +383,9 @@ void vmcoreinfo_append_str(const char *fmt, ...)
memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r);
vmcoreinfo_size += r;
+
+ WARN_ONCE(vmcoreinfo_size == VMCOREINFO_BYTES,
+ "vmcoreinfo data exceeds allocated size, truncating");
}
/*
@@ -433,8 +455,8 @@ static int __init crash_save_vmcoreinfo_init(void)
VMCOREINFO_OFFSET(page, lru);
VMCOREINFO_OFFSET(page, _mapcount);
VMCOREINFO_OFFSET(page, private);
- VMCOREINFO_OFFSET(page, compound_dtor);
- VMCOREINFO_OFFSET(page, compound_order);
+ VMCOREINFO_OFFSET(folio, _folio_dtor);
+ VMCOREINFO_OFFSET(folio, _folio_order);
VMCOREINFO_OFFSET(page, compound_head);
VMCOREINFO_OFFSET(pglist_data, node_zones);
VMCOREINFO_OFFSET(pglist_data, nr_zones);
@@ -452,7 +474,7 @@ static int __init crash_save_vmcoreinfo_init(void)
VMCOREINFO_OFFSET(list_head, prev);
VMCOREINFO_OFFSET(vmap_area, va_start);
VMCOREINFO_OFFSET(vmap_area, list);
- VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER);
+ VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER + 1);
log_buf_vmcoreinfo_setup();
VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
VMCOREINFO_NUMBER(NR_FREE_PAGES);
@@ -473,6 +495,19 @@ static int __init crash_save_vmcoreinfo_init(void)
VMCOREINFO_NUMBER(PAGE_OFFLINE_MAPCOUNT_VALUE);
#endif
+#ifdef CONFIG_KALLSYMS
+ VMCOREINFO_SYMBOL(kallsyms_names);
+ VMCOREINFO_SYMBOL(kallsyms_num_syms);
+ VMCOREINFO_SYMBOL(kallsyms_token_table);
+ VMCOREINFO_SYMBOL(kallsyms_token_index);
+#ifdef CONFIG_KALLSYMS_BASE_RELATIVE
+ VMCOREINFO_SYMBOL(kallsyms_offsets);
+ VMCOREINFO_SYMBOL(kallsyms_relative_base);
+#else
+ VMCOREINFO_SYMBOL(kallsyms_addresses);
+#endif /* CONFIG_KALLSYMS_BASE_RELATIVE */
+#endif /* CONFIG_KALLSYMS */
+
arch_crash_save_vmcoreinfo();
update_vmcoreinfo_note();
diff --git a/kernel/cred.c b/kernel/cred.c
index e6fd2b3fc31f..811ad654abd1 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -225,8 +225,6 @@ struct cred *cred_alloc_blank(void)
#ifdef CONFIG_DEBUG_CREDENTIALS
new->magic = CRED_MAGIC;
#endif
- new->ucounts = get_ucounts(&init_ucounts);
-
if (security_cred_alloc_blank(new, GFP_KERNEL_ACCOUNT) < 0)
goto error;
@@ -286,13 +284,13 @@ struct cred *prepare_creds(void)
new->security = NULL;
#endif
- if (security_prepare_creds(new, old, GFP_KERNEL_ACCOUNT) < 0)
- goto error;
-
new->ucounts = get_ucounts(new->ucounts);
if (!new->ucounts)
goto error;
+ if (security_prepare_creds(new, old, GFP_KERNEL_ACCOUNT) < 0)
+ goto error;
+
validate_creds(new);
return new;
@@ -501,7 +499,7 @@ int commit_creds(struct cred *new)
inc_rlimit_ucounts(new->ucounts, UCOUNT_RLIMIT_NPROC, 1);
rcu_assign_pointer(task->real_cred, new);
rcu_assign_pointer(task->cred, new);
- if (new->user != old->user)
+ if (new->user != old->user || new->user_ns != old->user_ns)
dec_rlimit_ucounts(old->ucounts, UCOUNT_RLIMIT_NPROC, 1);
alter_cred_subscribers(old, -2);
@@ -667,25 +665,20 @@ EXPORT_SYMBOL(cred_fscmp);
int set_cred_ucounts(struct cred *new)
{
- struct task_struct *task = current;
- const struct cred *old = task->real_cred;
- struct ucounts *old_ucounts = new->ucounts;
-
- if (new->user == old->user && new->user_ns == old->user_ns)
- return 0;
+ struct ucounts *new_ucounts, *old_ucounts = new->ucounts;
/*
* This optimization is needed because alloc_ucounts() uses locks
* for table lookups.
*/
- if (old_ucounts && old_ucounts->ns == new->user_ns && uid_eq(old_ucounts->uid, new->euid))
+ if (old_ucounts->ns == new->user_ns && uid_eq(old_ucounts->uid, new->uid))
return 0;
- if (!(new->ucounts = alloc_ucounts(new->user_ns, new->euid)))
+ if (!(new_ucounts = alloc_ucounts(new->user_ns, new->uid)))
return -EAGAIN;
- if (old_ucounts)
- put_ucounts(old_ucounts);
+ new->ucounts = new_ucounts;
+ put_ucounts(old_ucounts);
return 0;
}
@@ -708,9 +701,9 @@ void __init cred_init(void)
* override a task's own credentials so that work can be done on behalf of that
* task that requires a different subjective context.
*
- * @daemon is used to provide a base for the security record, but can be NULL.
- * If @daemon is supplied, then the security data will be derived from that;
- * otherwise they'll be set to 0 and no groups, full capabilities and no keys.
+ * @daemon is used to provide a base cred, with the security data derived from
+ * that; if this is "&init_task", they'll be set to 0, no groups, full
+ * capabilities, and no keys.
*
* The caller may change these controls afterwards if desired.
*
@@ -721,17 +714,16 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
const struct cred *old;
struct cred *new;
+ if (WARN_ON_ONCE(!daemon))
+ return NULL;
+
new = kmem_cache_alloc(cred_jar, GFP_KERNEL);
if (!new)
return NULL;
kdebug("prepare_kernel_cred() alloc %p", new);
- if (daemon)
- old = get_task_cred(daemon);
- else
- old = get_cred(&init_cred);
-
+ old = get_task_cred(daemon);
validate_creds(old);
*new = *old;
@@ -753,13 +745,13 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
#ifdef CONFIG_SECURITY
new->security = NULL;
#endif
- if (security_prepare_creds(new, old, GFP_KERNEL_ACCOUNT) < 0)
- goto error;
-
new->ucounts = get_ucounts(new->ucounts);
if (!new->ucounts)
goto error;
+ if (security_prepare_creds(new, old, GFP_KERNEL_ACCOUNT) < 0)
+ goto error;
+
put_cred(old);
validate_creds(new);
return new;
@@ -877,7 +869,7 @@ static void dump_invalid_creds(const struct cred *cred, const char *label,
/*
* report use of invalid credentials
*/
-void __invalid_creds(const struct cred *cred, const char *file, unsigned line)
+void __noreturn __invalid_creds(const struct cred *cred, const char *file, unsigned line)
{
printk(KERN_ERR "CRED: Invalid credentials\n");
printk(KERN_ERR "CRED: At %s:%u\n", file, line);
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index b4aa6bb6b2bd..d5e9ccde3ab8 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Kernel Debug Core
*
@@ -22,10 +23,6 @@
*
* Original KGDB stub: David Grothe <dave@gcom.com>,
* Tigran Aivazian <tigran@sco.com>
- *
- * This file is licensed under the terms of the GNU General Public License
- * version 2. This program is licensed "as is" without any warranty of any
- * kind, whether express or implied.
*/
#define pr_fmt(fmt) "KGDB: " fmt
@@ -53,9 +50,9 @@
#include <linux/pid.h>
#include <linux/smp.h>
#include <linux/mm.h>
-#include <linux/vmacache.h>
#include <linux/rcupdate.h>
#include <linux/irq.h>
+#include <linux/security.h>
#include <asm/cacheflush.h>
#include <asm/byteorder.h>
@@ -285,17 +282,6 @@ static void kgdb_flush_swbreak_addr(unsigned long addr)
if (!CACHE_FLUSH_IS_SAFE)
return;
- if (current->mm) {
- int i;
-
- for (i = 0; i < VMACACHE_SIZE; i++) {
- if (!current->vmacache.vmas[i])
- continue;
- flush_cache_range(current->vmacache.vmas[i],
- addr, addr + BREAK_INSTR_SIZE);
- }
- }
-
/* Force flush instruction cache if it was outside the mm */
flush_icache_range(addr, addr + BREAK_INSTR_SIZE);
}
@@ -755,6 +741,29 @@ cpu_master_loop:
continue;
kgdb_connected = 0;
} else {
+ /*
+ * This is a brutal way to interfere with the debugger
+ * and prevent gdb being used to poke at kernel memory.
+ * This could cause trouble if lockdown is applied when
+ * there is already an active gdb session. For now the
+ * answer is simply "don't do that". Typically lockdown
+ * *will* be applied before the debug core gets started
+ * so only developers using kgdb for fairly advanced
+ * early kernel debug can be biten by this. Hopefully
+ * they are sophisticated enough to take care of
+ * themselves, especially with help from the lockdown
+ * message printed on the console!
+ */
+ if (security_locked_down(LOCKDOWN_DBG_WRITE_KERNEL)) {
+ if (IS_ENABLED(CONFIG_KGDB_KDB)) {
+ /* Switch back to kdb if possible... */
+ dbg_kdb_mode = 1;
+ continue;
+ } else {
+ /* ... otherwise just bail */
+ break;
+ }
+ }
error = gdb_serial_stub(ks);
}
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
index b6f28fad4307..9d34d2364b5a 100644
--- a/kernel/debug/gdbstub.c
+++ b/kernel/debug/gdbstub.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Kernel Debug Core
*
@@ -22,10 +23,6 @@
*
* Original KGDB stub: David Grothe <dave@gcom.com>,
* Tigran Aivazian <tigran@sco.com>
- *
- * This file is licensed under the terms of the GNU General Public License
- * version 2. This program is licensed "as is" without any warranty of any
- * kind, whether express or implied.
*/
#include <linux/kernel.h>
diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c
index 2168f8dacb99..372025cf1ca3 100644
--- a/kernel/debug/kdb/kdb_bp.c
+++ b/kernel/debug/kdb/kdb_bp.c
@@ -523,51 +523,51 @@ static int kdb_ss(int argc, const char **argv)
}
static kdbtab_t bptab[] = {
- { .cmd_name = "bp",
- .cmd_func = kdb_bp,
- .cmd_usage = "[<vaddr>]",
- .cmd_help = "Set/Display breakpoints",
- .cmd_flags = KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS,
+ { .name = "bp",
+ .func = kdb_bp,
+ .usage = "[<vaddr>]",
+ .help = "Set/Display breakpoints",
+ .flags = KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS,
},
- { .cmd_name = "bl",
- .cmd_func = kdb_bp,
- .cmd_usage = "[<vaddr>]",
- .cmd_help = "Display breakpoints",
- .cmd_flags = KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS,
+ { .name = "bl",
+ .func = kdb_bp,
+ .usage = "[<vaddr>]",
+ .help = "Display breakpoints",
+ .flags = KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS,
},
- { .cmd_name = "bc",
- .cmd_func = kdb_bc,
- .cmd_usage = "<bpnum>",
- .cmd_help = "Clear Breakpoint",
- .cmd_flags = KDB_ENABLE_FLOW_CTRL,
+ { .name = "bc",
+ .func = kdb_bc,
+ .usage = "<bpnum>",
+ .help = "Clear Breakpoint",
+ .flags = KDB_ENABLE_FLOW_CTRL,
},
- { .cmd_name = "be",
- .cmd_func = kdb_bc,
- .cmd_usage = "<bpnum>",
- .cmd_help = "Enable Breakpoint",
- .cmd_flags = KDB_ENABLE_FLOW_CTRL,
+ { .name = "be",
+ .func = kdb_bc,
+ .usage = "<bpnum>",
+ .help = "Enable Breakpoint",
+ .flags = KDB_ENABLE_FLOW_CTRL,
},
- { .cmd_name = "bd",
- .cmd_func = kdb_bc,
- .cmd_usage = "<bpnum>",
- .cmd_help = "Disable Breakpoint",
- .cmd_flags = KDB_ENABLE_FLOW_CTRL,
+ { .name = "bd",
+ .func = kdb_bc,
+ .usage = "<bpnum>",
+ .help = "Disable Breakpoint",
+ .flags = KDB_ENABLE_FLOW_CTRL,
},
- { .cmd_name = "ss",
- .cmd_func = kdb_ss,
- .cmd_usage = "",
- .cmd_help = "Single Step",
- .cmd_minlen = 1,
- .cmd_flags = KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS,
+ { .name = "ss",
+ .func = kdb_ss,
+ .usage = "",
+ .help = "Single Step",
+ .minlen = 1,
+ .flags = KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS,
},
};
static kdbtab_t bphcmd = {
- .cmd_name = "bph",
- .cmd_func = kdb_bp,
- .cmd_usage = "[<vaddr>]",
- .cmd_help = "[datar [length]|dataw [length]] Set hw brk",
- .cmd_flags = KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS,
+ .name = "bph",
+ .func = kdb_bp,
+ .usage = "[<vaddr>]",
+ .help = "[datar [length]|dataw [length]] Set hw brk",
+ .flags = KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS,
};
/* Initialize the breakpoint table and register breakpoint commands. */
diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c
index 1f9f0e47aeda..10b454554ab0 100644
--- a/kernel/debug/kdb/kdb_bt.c
+++ b/kernel/debug/kdb/kdb_bt.c
@@ -46,7 +46,7 @@ static void kdb_show_stack(struct task_struct *p, void *addr)
* btp <pid> Kernel stack for <pid>
* btt <address-expression> Kernel stack for task structure at
* <address-expression>
- * bta [DRSTCZEUIMA] All useful processes, optionally
+ * bta [state_chars>|A] All useful processes, optionally
* filtered by state
* btc [<cpu>] The current process on one cpu,
* default is all cpus
@@ -74,7 +74,7 @@ static void kdb_show_stack(struct task_struct *p, void *addr)
*/
static int
-kdb_bt1(struct task_struct *p, unsigned long mask, bool btaprompt)
+kdb_bt1(struct task_struct *p, const char *mask, bool btaprompt)
{
char ch;
@@ -120,7 +120,7 @@ kdb_bt_cpu(unsigned long cpu)
return;
}
- kdb_bt1(kdb_tsk, ~0UL, false);
+ kdb_bt1(kdb_tsk, "A", false);
}
int
@@ -138,8 +138,8 @@ kdb_bt(int argc, const char **argv)
if (strcmp(argv[0], "bta") == 0) {
struct task_struct *g, *p;
unsigned long cpu;
- unsigned long mask = kdb_task_state_string(argc ? argv[1] :
- NULL);
+ const char *mask = argc ? argv[1] : kdbgetenv("PS");
+
if (argc == 0)
kdb_ps_suppressed();
/* Run the active tasks first */
@@ -167,7 +167,7 @@ kdb_bt(int argc, const char **argv)
return diag;
p = find_task_by_pid_ns(pid, &init_pid_ns);
if (p)
- return kdb_bt1(p, ~0UL, false);
+ return kdb_bt1(p, "A", false);
kdb_printf("No process with pid == %ld found\n", pid);
return 0;
} else if (strcmp(argv[0], "btt") == 0) {
@@ -176,7 +176,7 @@ kdb_bt(int argc, const char **argv)
diag = kdbgetularg((char *)argv[1], &addr);
if (diag)
return diag;
- return kdb_bt1((struct task_struct *)addr, ~0UL, false);
+ return kdb_bt1((struct task_struct *)addr, "A", false);
} else if (strcmp(argv[0], "btc") == 0) {
unsigned long cpu = ~0;
if (argc > 1)
@@ -212,7 +212,7 @@ kdb_bt(int argc, const char **argv)
kdb_show_stack(kdb_current_task, (void *)addr);
return 0;
} else {
- return kdb_bt1(kdb_current_task, ~0UL, false);
+ return kdb_bt1(kdb_current_task, "A", false);
}
}
diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c
index 0220afda3200..e91fc3e4edd5 100644
--- a/kernel/debug/kdb/kdb_debugger.c
+++ b/kernel/debug/kdb/kdb_debugger.c
@@ -140,7 +140,6 @@ int kdb_stub(struct kgdb_state *ks)
*/
kdb_common_deinit_state();
KDB_STATE_CLEAR(PAGER);
- kdbnearsym_cleanup();
if (error == KDB_CMD_KGDB) {
if (KDB_STATE(DOING_KGDB))
KDB_STATE_CLEAR(DOING_KGDB);
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index 6735ac36b718..5c7e9ba7cd6b 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -9,7 +9,6 @@
* Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
*/
-#include <linux/module.h>
#include <linux/types.h>
#include <linux/ctype.h>
#include <linux/kernel.h>
@@ -546,6 +545,7 @@ static void kdb_msg_write(const char *msg, int msg_len)
{
struct console *c;
const char *cp;
+ int cookie;
int len;
if (msg_len == 0)
@@ -559,8 +559,20 @@ static void kdb_msg_write(const char *msg, int msg_len)
cp++;
}
- for_each_console(c) {
- if (!(c->flags & CON_ENABLED))
+ /*
+ * The console_srcu_read_lock() only provides safe console list
+ * traversal. The use of the ->write() callback relies on all other
+ * CPUs being stopped at the moment and console drivers being able to
+ * handle reentrance when @oops_in_progress is set.
+ *
+ * There is no guarantee that every console driver can handle
+ * reentrance in this way; the developer deploying the debugger
+ * is responsible for ensuring that the console drivers they
+ * have selected handle reentrance appropriately.
+ */
+ cookie = console_srcu_read_lock();
+ for_each_console_srcu(c) {
+ if (!(console_srcu_read_flags(c) & CON_ENABLED))
continue;
if (c == dbg_io_ops->cons)
continue;
@@ -578,6 +590,7 @@ static void kdb_msg_write(const char *msg, int msg_len)
--oops_in_progress;
touch_nmi_watchdog();
}
+ console_srcu_read_unlock(cookie);
}
int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap)
diff --git a/kernel/debug/kdb/kdb_keyboard.c b/kernel/debug/kdb/kdb_keyboard.c
index f877a0a0d7cf..f87c750d3eb3 100644
--- a/kernel/debug/kdb/kdb_keyboard.c
+++ b/kernel/debug/kdb/kdb_keyboard.c
@@ -11,7 +11,6 @@
#include <linux/kdb.h>
#include <linux/keyboard.h>
#include <linux/ctype.h>
-#include <linux/module.h>
#include <linux/io.h>
/* Keyboard Controller Registers on normal PCs. */
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index d8ee5647b732..438b868cbfa9 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -26,14 +26,12 @@
#include <linux/utsname.h>
#include <linux/vmalloc.h>
#include <linux/atomic.h>
-#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/mm.h>
#include <linux/init.h>
#include <linux/kallsyms.h>
#include <linux/kgdb.h>
#include <linux/kdb.h>
-#include <linux/list.h>
#include <linux/notifier.h>
#include <linux/interrupt.h>
#include <linux/delay.h>
@@ -46,6 +44,7 @@
#include <linux/proc_fs.h>
#include <linux/uaccess.h>
#include <linux/slab.h>
+#include <linux/security.h>
#include "kdb_private.h"
#undef MODULE_PARAM_PREFIX
@@ -167,10 +166,62 @@ struct task_struct *kdb_curr_task(int cpu)
}
/*
- * Check whether the flags of the current command and the permissions
- * of the kdb console has allow a command to be run.
+ * Update the permissions flags (kdb_cmd_enabled) to match the
+ * current lockdown state.
+ *
+ * Within this function the calls to security_locked_down() are "lazy". We
+ * avoid calling them if the current value of kdb_cmd_enabled already excludes
+ * flags that might be subject to lockdown. Additionally we deliberately check
+ * the lockdown flags independently (even though read lockdown implies write
+ * lockdown) since that results in both simpler code and clearer messages to
+ * the user on first-time debugger entry.
+ *
+ * The permission masks during a read+write lockdown permits the following
+ * flags: INSPECT, SIGNAL, REBOOT (and ALWAYS_SAFE).
+ *
+ * The INSPECT commands are not blocked during lockdown because they are
+ * not arbitrary memory reads. INSPECT covers the backtrace family (sometimes
+ * forcing them to have no arguments) and lsmod. These commands do expose
+ * some kernel state but do not allow the developer seated at the console to
+ * choose what state is reported. SIGNAL and REBOOT should not be controversial,
+ * given these are allowed for root during lockdown already.
*/
-static inline bool kdb_check_flags(kdb_cmdflags_t flags, int permissions,
+static void kdb_check_for_lockdown(void)
+{
+ const int write_flags = KDB_ENABLE_MEM_WRITE |
+ KDB_ENABLE_REG_WRITE |
+ KDB_ENABLE_FLOW_CTRL;
+ const int read_flags = KDB_ENABLE_MEM_READ |
+ KDB_ENABLE_REG_READ;
+
+ bool need_to_lockdown_write = false;
+ bool need_to_lockdown_read = false;
+
+ if (kdb_cmd_enabled & (KDB_ENABLE_ALL | write_flags))
+ need_to_lockdown_write =
+ security_locked_down(LOCKDOWN_DBG_WRITE_KERNEL);
+
+ if (kdb_cmd_enabled & (KDB_ENABLE_ALL | read_flags))
+ need_to_lockdown_read =
+ security_locked_down(LOCKDOWN_DBG_READ_KERNEL);
+
+ /* De-compose KDB_ENABLE_ALL if required */
+ if (need_to_lockdown_write || need_to_lockdown_read)
+ if (kdb_cmd_enabled & KDB_ENABLE_ALL)
+ kdb_cmd_enabled = KDB_ENABLE_MASK & ~KDB_ENABLE_ALL;
+
+ if (need_to_lockdown_write)
+ kdb_cmd_enabled &= ~write_flags;
+
+ if (need_to_lockdown_read)
+ kdb_cmd_enabled &= ~read_flags;
+}
+
+/*
+ * Check whether the flags of the current command, the permissions of the kdb
+ * console and the lockdown state allow a command to be run.
+ */
+static bool kdb_check_flags(kdb_cmdflags_t flags, int permissions,
bool no_args)
{
/* permissions comes from userspace so needs massaging slightly */
@@ -654,16 +705,17 @@ static void kdb_cmderror(int diag)
* Returns:
* zero for success, a kdb diagnostic if error
*/
-struct defcmd_set {
- int count;
- bool usable;
- char *name;
- char *usage;
- char *help;
- char **command;
+struct kdb_macro {
+ kdbtab_t cmd; /* Macro command */
+ struct list_head statements; /* Associated statement list */
+};
+
+struct kdb_macro_statement {
+ char *statement; /* Statement text */
+ struct list_head list_node; /* Statement list node */
};
-static struct defcmd_set *defcmd_set;
-static int defcmd_set_count;
+
+static struct kdb_macro *kdb_macro;
static bool defcmd_in_progress;
/* Forward references */
@@ -671,53 +723,55 @@ static int kdb_exec_defcmd(int argc, const char **argv);
static int kdb_defcmd2(const char *cmdstr, const char *argv0)
{
- struct defcmd_set *s = defcmd_set + defcmd_set_count - 1;
- char **save_command = s->command;
+ struct kdb_macro_statement *kms;
+
+ if (!kdb_macro)
+ return KDB_NOTIMP;
+
if (strcmp(argv0, "endefcmd") == 0) {
defcmd_in_progress = false;
- if (!s->count)
- s->usable = false;
- if (s->usable)
- /* macros are always safe because when executed each
- * internal command re-enters kdb_parse() and is
- * safety checked individually.
- */
- kdb_register_flags(s->name, kdb_exec_defcmd, s->usage,
- s->help, 0,
- KDB_ENABLE_ALWAYS_SAFE);
+ if (!list_empty(&kdb_macro->statements))
+ kdb_register(&kdb_macro->cmd);
return 0;
}
- if (!s->usable)
- return KDB_NOTIMP;
- s->command = kcalloc(s->count + 1, sizeof(*(s->command)), GFP_KDB);
- if (!s->command) {
- kdb_printf("Could not allocate new kdb_defcmd table for %s\n",
+
+ kms = kmalloc(sizeof(*kms), GFP_KDB);
+ if (!kms) {
+ kdb_printf("Could not allocate new kdb macro command: %s\n",
cmdstr);
- s->usable = false;
return KDB_NOTIMP;
}
- memcpy(s->command, save_command, s->count * sizeof(*(s->command)));
- s->command[s->count++] = kdb_strdup(cmdstr, GFP_KDB);
- kfree(save_command);
+
+ kms->statement = kdb_strdup(cmdstr, GFP_KDB);
+ list_add_tail(&kms->list_node, &kdb_macro->statements);
+
return 0;
}
static int kdb_defcmd(int argc, const char **argv)
{
- struct defcmd_set *save_defcmd_set = defcmd_set, *s;
+ kdbtab_t *mp;
+
if (defcmd_in_progress) {
kdb_printf("kdb: nested defcmd detected, assuming missing "
"endefcmd\n");
kdb_defcmd2("endefcmd", "endefcmd");
}
if (argc == 0) {
- int i;
- for (s = defcmd_set; s < defcmd_set + defcmd_set_count; ++s) {
- kdb_printf("defcmd %s \"%s\" \"%s\"\n", s->name,
- s->usage, s->help);
- for (i = 0; i < s->count; ++i)
- kdb_printf("%s", s->command[i]);
- kdb_printf("endefcmd\n");
+ kdbtab_t *kp;
+ struct kdb_macro *kmp;
+ struct kdb_macro_statement *kms;
+
+ list_for_each_entry(kp, &kdb_cmds_head, list_node) {
+ if (kp->func == kdb_exec_defcmd) {
+ kdb_printf("defcmd %s \"%s\" \"%s\"\n",
+ kp->name, kp->usage, kp->help);
+ kmp = container_of(kp, struct kdb_macro, cmd);
+ list_for_each_entry(kms, &kmp->statements,
+ list_node)
+ kdb_printf("%s", kms->statement);
+ kdb_printf("endefcmd\n");
+ }
}
return 0;
}
@@ -727,45 +781,43 @@ static int kdb_defcmd(int argc, const char **argv)
kdb_printf("Command only available during kdb_init()\n");
return KDB_NOTIMP;
}
- defcmd_set = kmalloc_array(defcmd_set_count + 1, sizeof(*defcmd_set),
- GFP_KDB);
- if (!defcmd_set)
+ kdb_macro = kzalloc(sizeof(*kdb_macro), GFP_KDB);
+ if (!kdb_macro)
goto fail_defcmd;
- memcpy(defcmd_set, save_defcmd_set,
- defcmd_set_count * sizeof(*defcmd_set));
- s = defcmd_set + defcmd_set_count;
- memset(s, 0, sizeof(*s));
- s->usable = true;
- s->name = kdb_strdup(argv[1], GFP_KDB);
- if (!s->name)
+
+ mp = &kdb_macro->cmd;
+ mp->func = kdb_exec_defcmd;
+ mp->minlen = 0;
+ mp->flags = KDB_ENABLE_ALWAYS_SAFE;
+ mp->name = kdb_strdup(argv[1], GFP_KDB);
+ if (!mp->name)
goto fail_name;
- s->usage = kdb_strdup(argv[2], GFP_KDB);
- if (!s->usage)
+ mp->usage = kdb_strdup(argv[2], GFP_KDB);
+ if (!mp->usage)
goto fail_usage;
- s->help = kdb_strdup(argv[3], GFP_KDB);
- if (!s->help)
+ mp->help = kdb_strdup(argv[3], GFP_KDB);
+ if (!mp->help)
goto fail_help;
- if (s->usage[0] == '"') {
- strcpy(s->usage, argv[2]+1);
- s->usage[strlen(s->usage)-1] = '\0';
+ if (mp->usage[0] == '"') {
+ strcpy(mp->usage, argv[2]+1);
+ mp->usage[strlen(mp->usage)-1] = '\0';
}
- if (s->help[0] == '"') {
- strcpy(s->help, argv[3]+1);
- s->help[strlen(s->help)-1] = '\0';
+ if (mp->help[0] == '"') {
+ strcpy(mp->help, argv[3]+1);
+ mp->help[strlen(mp->help)-1] = '\0';
}
- ++defcmd_set_count;
+
+ INIT_LIST_HEAD(&kdb_macro->statements);
defcmd_in_progress = true;
- kfree(save_defcmd_set);
return 0;
fail_help:
- kfree(s->usage);
+ kfree(mp->usage);
fail_usage:
- kfree(s->name);
+ kfree(mp->name);
fail_name:
- kfree(defcmd_set);
+ kfree(kdb_macro);
fail_defcmd:
- kdb_printf("Could not allocate new defcmd_set entry for %s\n", argv[1]);
- defcmd_set = save_defcmd_set;
+ kdb_printf("Could not allocate new kdb_macro entry for %s\n", argv[1]);
return KDB_NOTIMP;
}
@@ -780,25 +832,31 @@ fail_defcmd:
*/
static int kdb_exec_defcmd(int argc, const char **argv)
{
- int i, ret;
- struct defcmd_set *s;
+ int ret;
+ kdbtab_t *kp;
+ struct kdb_macro *kmp;
+ struct kdb_macro_statement *kms;
+
if (argc != 0)
return KDB_ARGCOUNT;
- for (s = defcmd_set, i = 0; i < defcmd_set_count; ++i, ++s) {
- if (strcmp(s->name, argv[0]) == 0)
+
+ list_for_each_entry(kp, &kdb_cmds_head, list_node) {
+ if (strcmp(kp->name, argv[0]) == 0)
break;
}
- if (i == defcmd_set_count) {
+ if (list_entry_is_head(kp, &kdb_cmds_head, list_node)) {
kdb_printf("kdb_exec_defcmd: could not find commands for %s\n",
argv[0]);
return KDB_NOTIMP;
}
- for (i = 0; i < s->count; ++i) {
- /* Recursive use of kdb_parse, do not use argv after
- * this point */
+ kmp = container_of(kp, struct kdb_macro, cmd);
+ list_for_each_entry(kms, &kmp->statements, list_node) {
+ /*
+ * Recursive use of kdb_parse, do not use argv after this point.
+ */
argv = NULL;
- kdb_printf("[%s]kdb> %s\n", s->name, s->command[i]);
- ret = kdb_parse(s->command[i]);
+ kdb_printf("[%s]kdb> %s\n", kmp->cmd.name, kms->statement);
+ ret = kdb_parse(kms->statement);
if (ret)
return ret;
}
@@ -1009,11 +1067,11 @@ int kdb_parse(const char *cmdstr)
* If this command is allowed to be abbreviated,
* check to see if this is it.
*/
- if (tp->cmd_minlen && (strlen(argv[0]) <= tp->cmd_minlen) &&
- (strncmp(argv[0], tp->cmd_name, tp->cmd_minlen) == 0))
+ if (tp->minlen && (strlen(argv[0]) <= tp->minlen) &&
+ (strncmp(argv[0], tp->name, tp->minlen) == 0))
break;
- if (strcmp(argv[0], tp->cmd_name) == 0)
+ if (strcmp(argv[0], tp->name) == 0)
break;
}
@@ -1024,8 +1082,7 @@ int kdb_parse(const char *cmdstr)
*/
if (list_entry_is_head(tp, &kdb_cmds_head, list_node)) {
list_for_each_entry(tp, &kdb_cmds_head, list_node) {
- if (strncmp(argv[0], tp->cmd_name,
- strlen(tp->cmd_name)) == 0)
+ if (strncmp(argv[0], tp->name, strlen(tp->name)) == 0)
break;
}
}
@@ -1033,19 +1090,19 @@ int kdb_parse(const char *cmdstr)
if (!list_entry_is_head(tp, &kdb_cmds_head, list_node)) {
int result;
- if (!kdb_check_flags(tp->cmd_flags, kdb_cmd_enabled, argc <= 1))
+ if (!kdb_check_flags(tp->flags, kdb_cmd_enabled, argc <= 1))
return KDB_NOPERM;
KDB_STATE_SET(CMD);
- result = (*tp->cmd_func)(argc-1, (const char **)argv);
+ result = (*tp->func)(argc-1, (const char **)argv);
if (result && ignore_errors && result > KDB_CMD_GO)
result = 0;
KDB_STATE_CLEAR(CMD);
- if (tp->cmd_flags & KDB_REPEAT_WITH_ARGS)
+ if (tp->flags & KDB_REPEAT_WITH_ARGS)
return result;
- argc = tp->cmd_flags & KDB_REPEAT_NO_ARGS ? 1 : 0;
+ argc = tp->flags & KDB_REPEAT_NO_ARGS ? 1 : 0;
if (argv[argc])
*(argv[argc]) = '\0';
return result;
@@ -1175,6 +1232,9 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs,
kdb_curr_task(raw_smp_processor_id());
KDB_DEBUG_STATE("kdb_local 1", reason);
+
+ kdb_check_for_lockdown();
+
kdb_go_count = 0;
if (reason == KDB_REASON_DEBUG) {
/* special case below */
@@ -1999,54 +2059,6 @@ static int kdb_ef(int argc, const char **argv)
return 0;
}
-#if defined(CONFIG_MODULES)
-/*
- * kdb_lsmod - This function implements the 'lsmod' command. Lists
- * currently loaded kernel modules.
- * Mostly taken from userland lsmod.
- */
-static int kdb_lsmod(int argc, const char **argv)
-{
- struct module *mod;
-
- if (argc != 0)
- return KDB_ARGCOUNT;
-
- kdb_printf("Module Size modstruct Used by\n");
- list_for_each_entry(mod, kdb_modules, list) {
- if (mod->state == MODULE_STATE_UNFORMED)
- continue;
-
- kdb_printf("%-20s%8u 0x%px ", mod->name,
- mod->core_layout.size, (void *)mod);
-#ifdef CONFIG_MODULE_UNLOAD
- kdb_printf("%4d ", module_refcount(mod));
-#endif
- if (mod->state == MODULE_STATE_GOING)
- kdb_printf(" (Unloading)");
- else if (mod->state == MODULE_STATE_COMING)
- kdb_printf(" (Loading)");
- else
- kdb_printf(" (Live)");
- kdb_printf(" 0x%px", mod->core_layout.base);
-
-#ifdef CONFIG_MODULE_UNLOAD
- {
- struct module_use *use;
- kdb_printf(" [ ");
- list_for_each_entry(use, &mod->source_list,
- source_list)
- kdb_printf("%s ", use->target->name);
- kdb_printf("]\n");
- }
-#endif
- }
-
- return 0;
-}
-
-#endif /* CONFIG_MODULES */
-
/*
* kdb_env - This function implements the 'env' command. Display the
* current environment variables.
@@ -2198,8 +2210,8 @@ static void kdb_cpu_status(void)
state = 'D'; /* cpu is online but unresponsive */
} else {
state = ' '; /* cpu is responding to kdb */
- if (kdb_task_state_char(KDB_TSK(i)) == 'I')
- state = 'I'; /* idle task */
+ if (kdb_task_state_char(KDB_TSK(i)) == '-')
+ state = '-'; /* idle task */
}
if (state != prev_state) {
if (prev_state != '?') {
@@ -2266,37 +2278,30 @@ static int kdb_cpu(int argc, const char **argv)
void kdb_ps_suppressed(void)
{
int idle = 0, daemon = 0;
- unsigned long mask_I = kdb_task_state_string("I"),
- mask_M = kdb_task_state_string("M");
unsigned long cpu;
const struct task_struct *p, *g;
for_each_online_cpu(cpu) {
p = kdb_curr_task(cpu);
- if (kdb_task_state(p, mask_I))
+ if (kdb_task_state(p, "-"))
++idle;
}
for_each_process_thread(g, p) {
- if (kdb_task_state(p, mask_M))
+ if (kdb_task_state(p, "ims"))
++daemon;
}
if (idle || daemon) {
if (idle)
- kdb_printf("%d idle process%s (state I)%s\n",
+ kdb_printf("%d idle process%s (state -)%s\n",
idle, idle == 1 ? "" : "es",
daemon ? " and " : "");
if (daemon)
- kdb_printf("%d sleeping system daemon (state M) "
+ kdb_printf("%d sleeping system daemon (state [ims]) "
"process%s", daemon,
daemon == 1 ? "" : "es");
kdb_printf(" suppressed,\nuse 'ps A' to see all.\n");
}
}
-/*
- * kdb_ps - This function implements the 'ps' command which shows a
- * list of the active processes.
- * ps [DRSTCZEUIMA] All processes, optionally filtered by state
- */
void kdb_ps1(const struct task_struct *p)
{
int cpu;
@@ -2325,17 +2330,25 @@ void kdb_ps1(const struct task_struct *p)
}
}
+/*
+ * kdb_ps - This function implements the 'ps' command which shows a
+ * list of the active processes.
+ *
+ * ps [<state_chars>] Show processes, optionally selecting only those whose
+ * state character is found in <state_chars>.
+ */
static int kdb_ps(int argc, const char **argv)
{
struct task_struct *g, *p;
- unsigned long mask, cpu;
+ const char *mask;
+ unsigned long cpu;
if (argc == 0)
kdb_ps_suppressed();
kdb_printf("%-*s Pid Parent [*] cpu State %-*s Command\n",
(int)(2*sizeof(void *))+2, "Task Addr",
(int)(2*sizeof(void *))+2, "Thread");
- mask = kdb_task_state_string(argc ? argv[1] : NULL);
+ mask = argc ? argv[1] : kdbgetenv("PS");
/* Run the active tasks first */
for_each_online_cpu(cpu) {
if (KDB_FLAG(CMD_INTERRUPT))
@@ -2412,12 +2425,12 @@ static int kdb_help(int argc, const char **argv)
char *space = "";
if (KDB_FLAG(CMD_INTERRUPT))
return 0;
- if (!kdb_check_flags(kt->cmd_flags, kdb_cmd_enabled, true))
+ if (!kdb_check_flags(kt->flags, kdb_cmd_enabled, true))
continue;
- if (strlen(kt->cmd_usage) > 20)
+ if (strlen(kt->usage) > 20)
space = "\n ";
- kdb_printf("%-15.15s %-20s%s%s\n", kt->cmd_name,
- kt->cmd_usage, space, kt->cmd_help);
+ kdb_printf("%-15.15s %-20s%s%s\n", kt->name,
+ kt->usage, space, kt->help);
}
return 0;
}
@@ -2613,56 +2626,32 @@ static int kdb_grep_help(int argc, const char **argv)
return 0;
}
-/*
- * kdb_register_flags - This function is used to register a kernel
- * debugger command.
- * Inputs:
- * cmd Command name
- * func Function to execute the command
- * usage A simple usage string showing arguments
- * help A simple help string describing command
- * repeat Does the command auto repeat on enter?
- * Returns:
- * zero for success, one if a duplicate command.
+/**
+ * kdb_register() - This function is used to register a kernel debugger
+ * command.
+ * @cmd: pointer to kdb command
+ *
+ * Note that it's the job of the caller to keep the memory for the cmd
+ * allocated until unregister is called.
*/
-int kdb_register_flags(char *cmd,
- kdb_func_t func,
- char *usage,
- char *help,
- short minlen,
- kdb_cmdflags_t flags)
+int kdb_register(kdbtab_t *cmd)
{
kdbtab_t *kp;
list_for_each_entry(kp, &kdb_cmds_head, list_node) {
- if (strcmp(kp->cmd_name, cmd) == 0) {
- kdb_printf("Duplicate kdb command registered: "
- "%s, func %px help %s\n", cmd, func, help);
+ if (strcmp(kp->name, cmd->name) == 0) {
+ kdb_printf("Duplicate kdb cmd: %s, func %p help %s\n",
+ cmd->name, cmd->func, cmd->help);
return 1;
}
}
- kp = kmalloc(sizeof(*kp), GFP_KDB);
- if (!kp) {
- kdb_printf("Could not allocate new kdb_command table\n");
- return 1;
- }
-
- kp->cmd_name = cmd;
- kp->cmd_func = func;
- kp->cmd_usage = usage;
- kp->cmd_help = help;
- kp->cmd_minlen = minlen;
- kp->cmd_flags = flags;
- kp->is_dynamic = true;
-
- list_add_tail(&kp->list_node, &kdb_cmds_head);
-
+ list_add_tail(&cmd->list_node, &kdb_cmds_head);
return 0;
}
-EXPORT_SYMBOL_GPL(kdb_register_flags);
+EXPORT_SYMBOL_GPL(kdb_register);
-/*
+/**
* kdb_register_table() - This function is used to register a kdb command
* table.
* @kp: pointer to kdb command table
@@ -2676,266 +2665,231 @@ void kdb_register_table(kdbtab_t *kp, size_t len)
}
}
-/*
- * kdb_register - Compatibility register function for commands that do
- * not need to specify a repeat state. Equivalent to
- * kdb_register_flags with flags set to 0.
- * Inputs:
- * cmd Command name
- * func Function to execute the command
- * usage A simple usage string showing arguments
- * help A simple help string describing command
- * Returns:
- * zero for success, one if a duplicate command.
- */
-int kdb_register(char *cmd,
- kdb_func_t func,
- char *usage,
- char *help,
- short minlen)
-{
- return kdb_register_flags(cmd, func, usage, help, minlen, 0);
-}
-EXPORT_SYMBOL_GPL(kdb_register);
-
-/*
- * kdb_unregister - This function is used to unregister a kernel
- * debugger command. It is generally called when a module which
- * implements kdb commands is unloaded.
- * Inputs:
- * cmd Command name
- * Returns:
- * zero for success, one command not registered.
+/**
+ * kdb_unregister() - This function is used to unregister a kernel debugger
+ * command. It is generally called when a module which
+ * implements kdb command is unloaded.
+ * @cmd: pointer to kdb command
*/
-int kdb_unregister(char *cmd)
+void kdb_unregister(kdbtab_t *cmd)
{
- kdbtab_t *kp;
-
- /*
- * find the command.
- */
- list_for_each_entry(kp, &kdb_cmds_head, list_node) {
- if (strcmp(kp->cmd_name, cmd) == 0) {
- list_del(&kp->list_node);
- if (kp->is_dynamic)
- kfree(kp);
- return 0;
- }
- }
-
- /* Couldn't find it. */
- return 1;
+ list_del(&cmd->list_node);
}
EXPORT_SYMBOL_GPL(kdb_unregister);
static kdbtab_t maintab[] = {
- { .cmd_name = "md",
- .cmd_func = kdb_md,
- .cmd_usage = "<vaddr>",
- .cmd_help = "Display Memory Contents, also mdWcN, e.g. md8c1",
- .cmd_minlen = 1,
- .cmd_flags = KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS,
+ { .name = "md",
+ .func = kdb_md,
+ .usage = "<vaddr>",
+ .help = "Display Memory Contents, also mdWcN, e.g. md8c1",
+ .minlen = 1,
+ .flags = KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS,
},
- { .cmd_name = "mdr",
- .cmd_func = kdb_md,
- .cmd_usage = "<vaddr> <bytes>",
- .cmd_help = "Display Raw Memory",
- .cmd_flags = KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS,
+ { .name = "mdr",
+ .func = kdb_md,
+ .usage = "<vaddr> <bytes>",
+ .help = "Display Raw Memory",
+ .flags = KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS,
},
- { .cmd_name = "mdp",
- .cmd_func = kdb_md,
- .cmd_usage = "<paddr> <bytes>",
- .cmd_help = "Display Physical Memory",
- .cmd_flags = KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS,
+ { .name = "mdp",
+ .func = kdb_md,
+ .usage = "<paddr> <bytes>",
+ .help = "Display Physical Memory",
+ .flags = KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS,
},
- { .cmd_name = "mds",
- .cmd_func = kdb_md,
- .cmd_usage = "<vaddr>",
- .cmd_help = "Display Memory Symbolically",
- .cmd_flags = KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS,
+ { .name = "mds",
+ .func = kdb_md,
+ .usage = "<vaddr>",
+ .help = "Display Memory Symbolically",
+ .flags = KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS,
},
- { .cmd_name = "mm",
- .cmd_func = kdb_mm,
- .cmd_usage = "<vaddr> <contents>",
- .cmd_help = "Modify Memory Contents",
- .cmd_flags = KDB_ENABLE_MEM_WRITE | KDB_REPEAT_NO_ARGS,
+ { .name = "mm",
+ .func = kdb_mm,
+ .usage = "<vaddr> <contents>",
+ .help = "Modify Memory Contents",
+ .flags = KDB_ENABLE_MEM_WRITE | KDB_REPEAT_NO_ARGS,
},
- { .cmd_name = "go",
- .cmd_func = kdb_go,
- .cmd_usage = "[<vaddr>]",
- .cmd_help = "Continue Execution",
- .cmd_minlen = 1,
- .cmd_flags = KDB_ENABLE_REG_WRITE |
+ { .name = "go",
+ .func = kdb_go,
+ .usage = "[<vaddr>]",
+ .help = "Continue Execution",
+ .minlen = 1,
+ .flags = KDB_ENABLE_REG_WRITE |
KDB_ENABLE_ALWAYS_SAFE_NO_ARGS,
},
- { .cmd_name = "rd",
- .cmd_func = kdb_rd,
- .cmd_usage = "",
- .cmd_help = "Display Registers",
- .cmd_flags = KDB_ENABLE_REG_READ,
+ { .name = "rd",
+ .func = kdb_rd,
+ .usage = "",
+ .help = "Display Registers",
+ .flags = KDB_ENABLE_REG_READ,
},
- { .cmd_name = "rm",
- .cmd_func = kdb_rm,
- .cmd_usage = "<reg> <contents>",
- .cmd_help = "Modify Registers",
- .cmd_flags = KDB_ENABLE_REG_WRITE,
+ { .name = "rm",
+ .func = kdb_rm,
+ .usage = "<reg> <contents>",
+ .help = "Modify Registers",
+ .flags = KDB_ENABLE_REG_WRITE,
},
- { .cmd_name = "ef",
- .cmd_func = kdb_ef,
- .cmd_usage = "<vaddr>",
- .cmd_help = "Display exception frame",
- .cmd_flags = KDB_ENABLE_MEM_READ,
+ { .name = "ef",
+ .func = kdb_ef,
+ .usage = "<vaddr>",
+ .help = "Display exception frame",
+ .flags = KDB_ENABLE_MEM_READ,
},
- { .cmd_name = "bt",
- .cmd_func = kdb_bt,
- .cmd_usage = "[<vaddr>]",
- .cmd_help = "Stack traceback",
- .cmd_minlen = 1,
- .cmd_flags = KDB_ENABLE_MEM_READ | KDB_ENABLE_INSPECT_NO_ARGS,
+ { .name = "bt",
+ .func = kdb_bt,
+ .usage = "[<vaddr>]",
+ .help = "Stack traceback",
+ .minlen = 1,
+ .flags = KDB_ENABLE_MEM_READ | KDB_ENABLE_INSPECT_NO_ARGS,
},
- { .cmd_name = "btp",
- .cmd_func = kdb_bt,
- .cmd_usage = "<pid>",
- .cmd_help = "Display stack for process <pid>",
- .cmd_flags = KDB_ENABLE_INSPECT,
+ { .name = "btp",
+ .func = kdb_bt,
+ .usage = "<pid>",
+ .help = "Display stack for process <pid>",
+ .flags = KDB_ENABLE_INSPECT,
},
- { .cmd_name = "bta",
- .cmd_func = kdb_bt,
- .cmd_usage = "[D|R|S|T|C|Z|E|U|I|M|A]",
- .cmd_help = "Backtrace all processes matching state flag",
- .cmd_flags = KDB_ENABLE_INSPECT,
+ { .name = "bta",
+ .func = kdb_bt,
+ .usage = "[<state_chars>|A]",
+ .help = "Backtrace all processes whose state matches",
+ .flags = KDB_ENABLE_INSPECT,
},
- { .cmd_name = "btc",
- .cmd_func = kdb_bt,
- .cmd_usage = "",
- .cmd_help = "Backtrace current process on each cpu",
- .cmd_flags = KDB_ENABLE_INSPECT,
+ { .name = "btc",
+ .func = kdb_bt,
+ .usage = "",
+ .help = "Backtrace current process on each cpu",
+ .flags = KDB_ENABLE_INSPECT,
},
- { .cmd_name = "btt",
- .cmd_func = kdb_bt,
- .cmd_usage = "<vaddr>",
- .cmd_help = "Backtrace process given its struct task address",
- .cmd_flags = KDB_ENABLE_MEM_READ | KDB_ENABLE_INSPECT_NO_ARGS,
+ { .name = "btt",
+ .func = kdb_bt,
+ .usage = "<vaddr>",
+ .help = "Backtrace process given its struct task address",
+ .flags = KDB_ENABLE_MEM_READ | KDB_ENABLE_INSPECT_NO_ARGS,
},
- { .cmd_name = "env",
- .cmd_func = kdb_env,
- .cmd_usage = "",
- .cmd_help = "Show environment variables",
- .cmd_flags = KDB_ENABLE_ALWAYS_SAFE,
+ { .name = "env",
+ .func = kdb_env,
+ .usage = "",
+ .help = "Show environment variables",
+ .flags = KDB_ENABLE_ALWAYS_SAFE,
},
- { .cmd_name = "set",
- .cmd_func = kdb_set,
- .cmd_usage = "",
- .cmd_help = "Set environment variables",
- .cmd_flags = KDB_ENABLE_ALWAYS_SAFE,
+ { .name = "set",
+ .func = kdb_set,
+ .usage = "",
+ .help = "Set environment variables",
+ .flags = KDB_ENABLE_ALWAYS_SAFE,
},
- { .cmd_name = "help",
- .cmd_func = kdb_help,
- .cmd_usage = "",
- .cmd_help = "Display Help Message",
- .cmd_minlen = 1,
- .cmd_flags = KDB_ENABLE_ALWAYS_SAFE,
+ { .name = "help",
+ .func = kdb_help,
+ .usage = "",
+ .help = "Display Help Message",
+ .minlen = 1,
+ .flags = KDB_ENABLE_ALWAYS_SAFE,
},
- { .cmd_name = "?",
- .cmd_func = kdb_help,
- .cmd_usage = "",
- .cmd_help = "Display Help Message",
- .cmd_flags = KDB_ENABLE_ALWAYS_SAFE,
+ { .name = "?",
+ .func = kdb_help,
+ .usage = "",
+ .help = "Display Help Message",
+ .flags = KDB_ENABLE_ALWAYS_SAFE,
},
- { .cmd_name = "cpu",
- .cmd_func = kdb_cpu,
- .cmd_usage = "<cpunum>",
- .cmd_help = "Switch to new cpu",
- .cmd_flags = KDB_ENABLE_ALWAYS_SAFE_NO_ARGS,
+ { .name = "cpu",
+ .func = kdb_cpu,
+ .usage = "<cpunum>",
+ .help = "Switch to new cpu",
+ .flags = KDB_ENABLE_ALWAYS_SAFE_NO_ARGS,
},
- { .cmd_name = "kgdb",
- .cmd_func = kdb_kgdb,
- .cmd_usage = "",
- .cmd_help = "Enter kgdb mode",
- .cmd_flags = 0,
+ { .name = "kgdb",
+ .func = kdb_kgdb,
+ .usage = "",
+ .help = "Enter kgdb mode",
+ .flags = 0,
},
- { .cmd_name = "ps",
- .cmd_func = kdb_ps,
- .cmd_usage = "[<flags>|A]",
- .cmd_help = "Display active task list",
- .cmd_flags = KDB_ENABLE_INSPECT,
+ { .name = "ps",
+ .func = kdb_ps,
+ .usage = "[<state_chars>|A]",
+ .help = "Display active task list",
+ .flags = KDB_ENABLE_INSPECT,
},
- { .cmd_name = "pid",
- .cmd_func = kdb_pid,
- .cmd_usage = "<pidnum>",
- .cmd_help = "Switch to another task",
- .cmd_flags = KDB_ENABLE_INSPECT,
+ { .name = "pid",
+ .func = kdb_pid,
+ .usage = "<pidnum>",
+ .help = "Switch to another task",
+ .flags = KDB_ENABLE_INSPECT,
},
- { .cmd_name = "reboot",
- .cmd_func = kdb_reboot,
- .cmd_usage = "",
- .cmd_help = "Reboot the machine immediately",
- .cmd_flags = KDB_ENABLE_REBOOT,
+ { .name = "reboot",
+ .func = kdb_reboot,
+ .usage = "",
+ .help = "Reboot the machine immediately",
+ .flags = KDB_ENABLE_REBOOT,
},
#if defined(CONFIG_MODULES)
- { .cmd_name = "lsmod",
- .cmd_func = kdb_lsmod,
- .cmd_usage = "",
- .cmd_help = "List loaded kernel modules",
- .cmd_flags = KDB_ENABLE_INSPECT,
+ { .name = "lsmod",
+ .func = kdb_lsmod,
+ .usage = "",
+ .help = "List loaded kernel modules",
+ .flags = KDB_ENABLE_INSPECT,
},
#endif
#if defined(CONFIG_MAGIC_SYSRQ)
- { .cmd_name = "sr",
- .cmd_func = kdb_sr,
- .cmd_usage = "<key>",
- .cmd_help = "Magic SysRq key",
- .cmd_flags = KDB_ENABLE_ALWAYS_SAFE,
+ { .name = "sr",
+ .func = kdb_sr,
+ .usage = "<key>",
+ .help = "Magic SysRq key",
+ .flags = KDB_ENABLE_ALWAYS_SAFE,
},
#endif
#if defined(CONFIG_PRINTK)
- { .cmd_name = "dmesg",
- .cmd_func = kdb_dmesg,
- .cmd_usage = "[lines]",
- .cmd_help = "Display syslog buffer",
- .cmd_flags = KDB_ENABLE_ALWAYS_SAFE,
+ { .name = "dmesg",
+ .func = kdb_dmesg,
+ .usage = "[lines]",
+ .help = "Display syslog buffer",
+ .flags = KDB_ENABLE_ALWAYS_SAFE,
},
#endif
- { .cmd_name = "defcmd",
- .cmd_func = kdb_defcmd,
- .cmd_usage = "name \"usage\" \"help\"",
- .cmd_help = "Define a set of commands, down to endefcmd",
- .cmd_flags = KDB_ENABLE_ALWAYS_SAFE,
+ { .name = "defcmd",
+ .func = kdb_defcmd,
+ .usage = "name \"usage\" \"help\"",
+ .help = "Define a set of commands, down to endefcmd",
+ /*
+ * Macros are always safe because when executed each
+ * internal command re-enters kdb_parse() and is safety
+ * checked individually.
+ */
+ .flags = KDB_ENABLE_ALWAYS_SAFE,
},
- { .cmd_name = "kill",
- .cmd_func = kdb_kill,
- .cmd_usage = "<-signal> <pid>",
- .cmd_help = "Send a signal to a process",
- .cmd_flags = KDB_ENABLE_SIGNAL,
+ { .name = "kill",
+ .func = kdb_kill,
+ .usage = "<-signal> <pid>",
+ .help = "Send a signal to a process",
+ .flags = KDB_ENABLE_SIGNAL,
},
- { .cmd_name = "summary",
- .cmd_func = kdb_summary,
- .cmd_usage = "",
- .cmd_help = "Summarize the system",
- .cmd_minlen = 4,
- .cmd_flags = KDB_ENABLE_ALWAYS_SAFE,
+ { .name = "summary",
+ .func = kdb_summary,
+ .usage = "",
+ .help = "Summarize the system",
+ .minlen = 4,
+ .flags = KDB_ENABLE_ALWAYS_SAFE,
},
- { .cmd_name = "per_cpu",
- .cmd_func = kdb_per_cpu,
- .cmd_usage = "<sym> [<bytes>] [<cpu>]",
- .cmd_help = "Display per_cpu variables",
- .cmd_minlen = 3,
- .cmd_flags = KDB_ENABLE_MEM_READ,
+ { .name = "per_cpu",
+ .func = kdb_per_cpu,
+ .usage = "<sym> [<bytes>] [<cpu>]",
+ .help = "Display per_cpu variables",
+ .minlen = 3,
+ .flags = KDB_ENABLE_MEM_READ,
},
- { .cmd_name = "grephelp",
- .cmd_func = kdb_grep_help,
- .cmd_usage = "",
- .cmd_help = "Display help on | grep",
- .cmd_flags = KDB_ENABLE_ALWAYS_SAFE,
+ { .name = "grephelp",
+ .func = kdb_grep_help,
+ .usage = "",
+ .help = "Display help on | grep",
+ .flags = KDB_ENABLE_ALWAYS_SAFE,
},
};
static kdbtab_t nmicmd = {
- .cmd_name = "disable_nmi",
- .cmd_func = kdb_disable_nmi,
- .cmd_usage = "",
- .cmd_help = "Disable NMI entry to KDB",
- .cmd_flags = KDB_ENABLE_ALWAYS_SAFE,
+ .name = "disable_nmi",
+ .func = kdb_disable_nmi,
+ .usage = "",
+ .help = "Disable NMI entry to KDB",
+ .flags = KDB_ENABLE_ALWAYS_SAFE,
};
/* Initialize the kdb command table. */
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index 170c69aedebb..1f8c519a5f81 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
@@ -109,7 +109,6 @@ extern int kdbgetaddrarg(int, const char **, int*, unsigned long *,
long *, char **);
extern int kdbgetsymval(const char *, kdb_symtab_t *);
extern int kdbnearsym(unsigned long, kdb_symtab_t *);
-extern void kdbnearsym_cleanup(void);
extern char *kdb_strdup(const char *str, gfp_t type);
extern void kdb_symbol_print(unsigned long, const kdb_symtab_t *, unsigned int);
@@ -165,19 +164,6 @@ typedef struct _kdb_bp {
#ifdef CONFIG_KGDB_KDB
extern kdb_bp_t kdb_breakpoints[/* KDB_MAXBPT */];
-/* The KDB shell command table */
-typedef struct _kdbtab {
- char *cmd_name; /* Command name */
- kdb_func_t cmd_func; /* Function to execute command */
- char *cmd_usage; /* Usage String for this command */
- char *cmd_help; /* Help message for this command */
- short cmd_minlen; /* Minimum legal # command
- * chars required */
- kdb_cmdflags_t cmd_flags; /* Command behaviour flags */
- struct list_head list_node; /* Command list */
- bool is_dynamic; /* Command table allocation type */
-} kdbtab_t;
-
extern void kdb_register_table(kdbtab_t *kp, size_t len);
extern int kdb_bt(int, const char **); /* KDB display back trace */
@@ -204,10 +190,8 @@ extern char kdb_grep_string[];
extern int kdb_grep_leading;
extern int kdb_grep_trailing;
extern char *kdb_cmds[];
-extern unsigned long kdb_task_state_string(const char *);
extern char kdb_task_state_char (const struct task_struct *);
-extern unsigned long kdb_task_state(const struct task_struct *p,
- unsigned long mask);
+extern bool kdb_task_state(const struct task_struct *p, const char *mask);
extern void kdb_ps_suppressed(void);
extern void kdb_ps1(const struct task_struct *p);
extern void kdb_send_sig(struct task_struct *p, int sig);
@@ -233,10 +217,6 @@ extern struct task_struct *kdb_curr_task(int);
#define GFP_KDB (in_dbg_master() ? GFP_ATOMIC : GFP_KERNEL)
-extern void *debug_kmalloc(size_t size, gfp_t flags);
-extern void debug_kfree(void *);
-extern void debug_kusage(void);
-
extern struct task_struct *kdb_current_task;
extern struct pt_regs *kdb_current_regs;
@@ -246,10 +226,6 @@ extern void kdb_kbd_cleanup_state(void);
#define kdb_kbd_cleanup_state()
#endif /* ! CONFIG_KDB_KEYBOARD */
-#ifdef CONFIG_MODULES
-extern struct list_head *kdb_modules;
-#endif /* CONFIG_MODULES */
-
extern char kdb_prompt_str[];
#define KDB_WORD_SIZE ((int)sizeof(unsigned long))
diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c
index 9f50d22d68e6..0a39497140bf 100644
--- a/kernel/debug/kdb/kdb_support.c
+++ b/kernel/debug/kdb/kdb_support.c
@@ -10,7 +10,6 @@
* 03/02/13 added new 2.5 kallsyms <xavier.bru@bull.net>
*/
-#include <stdarg.h>
#include <linux/types.h>
#include <linux/sched.h>
#include <linux/mm.h>
@@ -18,13 +17,13 @@
#include <linux/stddef.h>
#include <linux/vmalloc.h>
#include <linux/ptrace.h>
-#include <linux/module.h>
#include <linux/highmem.h>
#include <linux/hardirq.h>
#include <linux/delay.h>
#include <linux/uaccess.h>
#include <linux/kdb.h>
#include <linux/slab.h>
+#include <linux/ctype.h>
#include "kdb_private.h"
/*
@@ -52,48 +51,48 @@ int kdbgetsymval(const char *symname, kdb_symtab_t *symtab)
}
EXPORT_SYMBOL(kdbgetsymval);
-static char *kdb_name_table[100]; /* arbitrary size */
-
-/*
- * kdbnearsym - Return the name of the symbol with the nearest address
- * less than 'addr'.
+/**
+ * kdbnearsym() - Return the name of the symbol with the nearest address
+ * less than @addr.
+ * @addr: Address to check for near symbol
+ * @symtab: Structure to receive results
*
- * Parameters:
- * addr Address to check for symbol near
- * symtab Structure to receive results
- * Returns:
- * 0 No sections contain this address, symtab zero filled
- * 1 Address mapped to module/symbol/section, data in symtab
- * Remarks:
- * 2.6 kallsyms has a "feature" where it unpacks the name into a
- * string. If that string is reused before the caller expects it
- * then the caller sees its string change without warning. To
- * avoid cluttering up the main kdb code with lots of kdb_strdup,
- * tests and kfree calls, kdbnearsym maintains an LRU list of the
- * last few unique strings. The list is sized large enough to
- * hold active strings, no kdb caller of kdbnearsym makes more
- * than ~20 later calls before using a saved value.
+ * WARNING: This function may return a pointer to a single statically
+ * allocated buffer (namebuf). kdb's unusual calling context (single
+ * threaded, all other CPUs halted) provides us sufficient locking for
+ * this to be safe. The only constraint imposed by the static buffer is
+ * that the caller must consume any previous reply prior to another call
+ * to lookup a new symbol.
+ *
+ * Note that, strictly speaking, some architectures may re-enter the kdb
+ * trap if the system turns out to be very badly damaged and this breaks
+ * the single-threaded assumption above. In these circumstances successful
+ * continuation and exit from the inner trap is unlikely to work and any
+ * user attempting this receives a prominent warning before being allowed
+ * to progress. In these circumstances we remain memory safe because
+ * namebuf[KSYM_NAME_LEN-1] will never change from '\0' although we do
+ * tolerate the possibility of garbled symbol display from the outer kdb
+ * trap.
+ *
+ * Return:
+ * * 0 - No sections contain this address, symtab zero filled
+ * * 1 - Address mapped to module/symbol/section, data in symtab
*/
int kdbnearsym(unsigned long addr, kdb_symtab_t *symtab)
{
int ret = 0;
unsigned long symbolsize = 0;
unsigned long offset = 0;
-#define knt1_size 128 /* must be >= kallsyms table size */
- char *knt1 = NULL;
+ static char namebuf[KSYM_NAME_LEN];
kdb_dbg_printf(AR, "addr=0x%lx, symtab=%px\n", addr, symtab);
memset(symtab, 0, sizeof(*symtab));
if (addr < 4096)
goto out;
- knt1 = debug_kmalloc(knt1_size, GFP_ATOMIC);
- if (!knt1) {
- kdb_func_printf("addr=0x%lx cannot kmalloc knt1\n", addr);
- goto out;
- }
+
symtab->sym_name = kallsyms_lookup(addr, &symbolsize , &offset,
- (char **)(&symtab->mod_name), knt1);
+ (char **)(&symtab->mod_name), namebuf);
if (offset > 8*1024*1024) {
symtab->sym_name = NULL;
addr = offset = symbolsize = 0;
@@ -102,63 +101,14 @@ int kdbnearsym(unsigned long addr, kdb_symtab_t *symtab)
symtab->sym_end = symtab->sym_start + symbolsize;
ret = symtab->sym_name != NULL && *(symtab->sym_name) != '\0';
- if (ret) {
- int i;
- /* Another 2.6 kallsyms "feature". Sometimes the sym_name is
- * set but the buffer passed into kallsyms_lookup is not used,
- * so it contains garbage. The caller has to work out which
- * buffer needs to be saved.
- *
- * What was Rusty smoking when he wrote that code?
- */
- if (symtab->sym_name != knt1) {
- strncpy(knt1, symtab->sym_name, knt1_size);
- knt1[knt1_size-1] = '\0';
- }
- for (i = 0; i < ARRAY_SIZE(kdb_name_table); ++i) {
- if (kdb_name_table[i] &&
- strcmp(kdb_name_table[i], knt1) == 0)
- break;
- }
- if (i >= ARRAY_SIZE(kdb_name_table)) {
- debug_kfree(kdb_name_table[0]);
- memmove(kdb_name_table, kdb_name_table+1,
- sizeof(kdb_name_table[0]) *
- (ARRAY_SIZE(kdb_name_table)-1));
- } else {
- debug_kfree(knt1);
- knt1 = kdb_name_table[i];
- memmove(kdb_name_table+i, kdb_name_table+i+1,
- sizeof(kdb_name_table[0]) *
- (ARRAY_SIZE(kdb_name_table)-i-1));
- }
- i = ARRAY_SIZE(kdb_name_table) - 1;
- kdb_name_table[i] = knt1;
- symtab->sym_name = kdb_name_table[i];
- knt1 = NULL;
- }
-
if (symtab->mod_name == NULL)
symtab->mod_name = "kernel";
kdb_dbg_printf(AR, "returns %d symtab->sym_start=0x%lx, symtab->mod_name=%px, symtab->sym_name=%px (%s)\n",
ret, symtab->sym_start, symtab->mod_name, symtab->sym_name, symtab->sym_name);
-
out:
- debug_kfree(knt1);
return ret;
}
-void kdbnearsym_cleanup(void)
-{
- int i;
- for (i = 0; i < ARRAY_SIZE(kdb_name_table); ++i) {
- if (kdb_name_table[i]) {
- debug_kfree(kdb_name_table[i]);
- kdb_name_table[i] = NULL;
- }
- }
-}
-
static char ks_namebuf[KSYM_NAME_LEN+1], ks_namebuf_prev[KSYM_NAME_LEN+1];
/*
@@ -340,7 +290,7 @@ int kdb_getarea_size(void *res, unsigned long addr, size_t size)
*/
int kdb_putarea_size(unsigned long addr, void *res, size_t size)
{
- int ret = copy_from_kernel_nofault((char *)addr, (char *)res, size);
+ int ret = copy_to_kernel_nofault((char *)addr, (char *)res, size);
if (ret) {
if (!KDB_STATE(SUPPRESS)) {
kdb_func_printf("Bad address 0x%lx\n", addr);
@@ -523,82 +473,7 @@ int kdb_putword(unsigned long addr, unsigned long word, size_t size)
return diag;
}
-/*
- * kdb_task_state_string - Convert a string containing any of the
- * letters DRSTCZEUIMA to a mask for the process state field and
- * return the value. If no argument is supplied, return the mask
- * that corresponds to environment variable PS, DRSTCZEU by
- * default.
- * Inputs:
- * s String to convert
- * Returns:
- * Mask for process state.
- * Notes:
- * The mask folds data from several sources into a single long value, so
- * be careful not to overlap the bits. TASK_* bits are in the LSB,
- * special cases like UNRUNNABLE are in the MSB. As of 2.6.10-rc1 there
- * is no overlap between TASK_* and EXIT_* but that may not always be
- * true, so EXIT_* bits are shifted left 16 bits before being stored in
- * the mask.
- */
-
-/* unrunnable is < 0 */
-#define UNRUNNABLE (1UL << (8*sizeof(unsigned long) - 1))
-#define RUNNING (1UL << (8*sizeof(unsigned long) - 2))
-#define IDLE (1UL << (8*sizeof(unsigned long) - 3))
-#define DAEMON (1UL << (8*sizeof(unsigned long) - 4))
-unsigned long kdb_task_state_string(const char *s)
-{
- long res = 0;
- if (!s) {
- s = kdbgetenv("PS");
- if (!s)
- s = "DRSTCZEU"; /* default value for ps */
- }
- while (*s) {
- switch (*s) {
- case 'D':
- res |= TASK_UNINTERRUPTIBLE;
- break;
- case 'R':
- res |= RUNNING;
- break;
- case 'S':
- res |= TASK_INTERRUPTIBLE;
- break;
- case 'T':
- res |= TASK_STOPPED;
- break;
- case 'C':
- res |= TASK_TRACED;
- break;
- case 'Z':
- res |= EXIT_ZOMBIE << 16;
- break;
- case 'E':
- res |= EXIT_DEAD << 16;
- break;
- case 'U':
- res |= UNRUNNABLE;
- break;
- case 'I':
- res |= IDLE;
- break;
- case 'M':
- res |= DAEMON;
- break;
- case 'A':
- res = ~0UL;
- break;
- default:
- kdb_func_printf("unknown flag '%c' ignored\n", *s);
- break;
- }
- ++s;
- }
- return res;
-}
/*
* kdb_task_state_char - Return the character that represents the task state.
@@ -609,7 +484,6 @@ unsigned long kdb_task_state_string(const char *s)
*/
char kdb_task_state_char (const struct task_struct *p)
{
- unsigned int p_state;
unsigned long tmp;
char state;
int cpu;
@@ -618,25 +492,18 @@ char kdb_task_state_char (const struct task_struct *p)
copy_from_kernel_nofault(&tmp, (char *)p, sizeof(unsigned long)))
return 'E';
- cpu = kdb_process_cpu(p);
- p_state = READ_ONCE(p->__state);
- state = (p_state == 0) ? 'R' :
- (p_state < 0) ? 'U' :
- (p_state & TASK_UNINTERRUPTIBLE) ? 'D' :
- (p_state & TASK_STOPPED) ? 'T' :
- (p_state & TASK_TRACED) ? 'C' :
- (p->exit_state & EXIT_ZOMBIE) ? 'Z' :
- (p->exit_state & EXIT_DEAD) ? 'E' :
- (p_state & TASK_INTERRUPTIBLE) ? 'S' : '?';
+ state = task_state_to_char((struct task_struct *) p);
+
if (is_idle_task(p)) {
/* Idle task. Is it really idle, apart from the kdb
* interrupt? */
+ cpu = kdb_process_cpu(p);
if (!kdb_task_has_cpu(p) || kgdb_info[cpu].irq_depth == 1) {
if (cpu != kdb_initial_cpu)
- state = 'I'; /* idle task */
+ state = '-'; /* idle task */
}
- } else if (!p->mm && state == 'S') {
- state = 'M'; /* sleeping system daemon */
+ } else if (!p->mm && strchr("IMS", state)) {
+ state = tolower(state); /* sleeping system daemon */
}
return state;
}
@@ -646,238 +513,28 @@ char kdb_task_state_char (const struct task_struct *p)
* given by the mask.
* Inputs:
* p struct task for the process
- * mask mask from kdb_task_state_string to select processes
+ * mask set of characters used to select processes; both NULL
+ * and the empty string mean adopt a default filter, which
+ * is to suppress sleeping system daemons and the idle tasks
* Returns:
* True if the process matches at least one criteria defined by the mask.
*/
-unsigned long kdb_task_state(const struct task_struct *p, unsigned long mask)
+bool kdb_task_state(const struct task_struct *p, const char *mask)
{
- char state[] = { kdb_task_state_char(p), '\0' };
- return (mask & kdb_task_state_string(state)) != 0;
-}
-
-/* Last ditch allocator for debugging, so we can still debug even when
- * the GFP_ATOMIC pool has been exhausted. The algorithms are tuned
- * for space usage, not for speed. One smallish memory pool, the free
- * chain is always in ascending address order to allow coalescing,
- * allocations are done in brute force best fit.
- */
-
-struct debug_alloc_header {
- u32 next; /* offset of next header from start of pool */
- u32 size;
- void *caller;
-};
-
-/* The memory returned by this allocator must be aligned, which means
- * so must the header size. Do not assume that sizeof(struct
- * debug_alloc_header) is a multiple of the alignment, explicitly
- * calculate the overhead of this header, including the alignment.
- * The rest of this code must not use sizeof() on any header or
- * pointer to a header.
- */
-#define dah_align 8
-#define dah_overhead ALIGN(sizeof(struct debug_alloc_header), dah_align)
-
-static u64 debug_alloc_pool_aligned[256*1024/dah_align]; /* 256K pool */
-static char *debug_alloc_pool = (char *)debug_alloc_pool_aligned;
-static u32 dah_first, dah_first_call = 1, dah_used, dah_used_max;
-
-/* Locking is awkward. The debug code is called from all contexts,
- * including non maskable interrupts. A normal spinlock is not safe
- * in NMI context. Try to get the debug allocator lock, if it cannot
- * be obtained after a second then give up. If the lock could not be
- * previously obtained on this cpu then only try once.
- *
- * sparse has no annotation for "this function _sometimes_ acquires a
- * lock", so fudge the acquire/release notation.
- */
-static DEFINE_SPINLOCK(dap_lock);
-static int get_dap_lock(void)
- __acquires(dap_lock)
-{
- static int dap_locked = -1;
- int count;
- if (dap_locked == smp_processor_id())
- count = 1;
- else
- count = 1000;
- while (1) {
- if (spin_trylock(&dap_lock)) {
- dap_locked = -1;
- return 1;
- }
- if (!count--)
- break;
- udelay(1000);
- }
- dap_locked = smp_processor_id();
- __acquire(dap_lock);
- return 0;
-}
+ char state = kdb_task_state_char(p);
-void *debug_kmalloc(size_t size, gfp_t flags)
-{
- unsigned int rem, h_offset;
- struct debug_alloc_header *best, *bestprev, *prev, *h;
- void *p = NULL;
- if (!get_dap_lock()) {
- __release(dap_lock); /* we never actually got it */
- return NULL;
- }
- h = (struct debug_alloc_header *)(debug_alloc_pool + dah_first);
- if (dah_first_call) {
- h->size = sizeof(debug_alloc_pool_aligned) - dah_overhead;
- dah_first_call = 0;
- }
- size = ALIGN(size, dah_align);
- prev = best = bestprev = NULL;
- while (1) {
- if (h->size >= size && (!best || h->size < best->size)) {
- best = h;
- bestprev = prev;
- if (h->size == size)
- break;
- }
- if (!h->next)
- break;
- prev = h;
- h = (struct debug_alloc_header *)(debug_alloc_pool + h->next);
- }
- if (!best)
- goto out;
- rem = best->size - size;
- /* The pool must always contain at least one header */
- if (best->next == 0 && bestprev == NULL && rem < dah_overhead)
- goto out;
- if (rem >= dah_overhead) {
- best->size = size;
- h_offset = ((char *)best - debug_alloc_pool) +
- dah_overhead + best->size;
- h = (struct debug_alloc_header *)(debug_alloc_pool + h_offset);
- h->size = rem - dah_overhead;
- h->next = best->next;
- } else
- h_offset = best->next;
- best->caller = __builtin_return_address(0);
- dah_used += best->size;
- dah_used_max = max(dah_used, dah_used_max);
- if (bestprev)
- bestprev->next = h_offset;
- else
- dah_first = h_offset;
- p = (char *)best + dah_overhead;
- memset(p, POISON_INUSE, best->size - 1);
- *((char *)p + best->size - 1) = POISON_END;
-out:
- spin_unlock(&dap_lock);
- return p;
-}
+ /* If there is no mask, then we will filter code that runs when the
+ * scheduler is idling and any system daemons that are currently
+ * sleeping.
+ */
+ if (!mask || mask[0] == '\0')
+ return !strchr("-ims", state);
-void debug_kfree(void *p)
-{
- struct debug_alloc_header *h;
- unsigned int h_offset;
- if (!p)
- return;
- if ((char *)p < debug_alloc_pool ||
- (char *)p >= debug_alloc_pool + sizeof(debug_alloc_pool_aligned)) {
- kfree(p);
- return;
- }
- if (!get_dap_lock()) {
- __release(dap_lock); /* we never actually got it */
- return; /* memory leak, cannot be helped */
- }
- h = (struct debug_alloc_header *)((char *)p - dah_overhead);
- memset(p, POISON_FREE, h->size - 1);
- *((char *)p + h->size - 1) = POISON_END;
- h->caller = NULL;
- dah_used -= h->size;
- h_offset = (char *)h - debug_alloc_pool;
- if (h_offset < dah_first) {
- h->next = dah_first;
- dah_first = h_offset;
- } else {
- struct debug_alloc_header *prev;
- unsigned int prev_offset;
- prev = (struct debug_alloc_header *)(debug_alloc_pool +
- dah_first);
- while (1) {
- if (!prev->next || prev->next > h_offset)
- break;
- prev = (struct debug_alloc_header *)
- (debug_alloc_pool + prev->next);
- }
- prev_offset = (char *)prev - debug_alloc_pool;
- if (prev_offset + dah_overhead + prev->size == h_offset) {
- prev->size += dah_overhead + h->size;
- memset(h, POISON_FREE, dah_overhead - 1);
- *((char *)h + dah_overhead - 1) = POISON_END;
- h = prev;
- h_offset = prev_offset;
- } else {
- h->next = prev->next;
- prev->next = h_offset;
- }
- }
- if (h_offset + dah_overhead + h->size == h->next) {
- struct debug_alloc_header *next;
- next = (struct debug_alloc_header *)
- (debug_alloc_pool + h->next);
- h->size += dah_overhead + next->size;
- h->next = next->next;
- memset(next, POISON_FREE, dah_overhead - 1);
- *((char *)next + dah_overhead - 1) = POISON_END;
- }
- spin_unlock(&dap_lock);
-}
+ /* A is a special case that matches all states */
+ if (strchr(mask, 'A'))
+ return true;
-void debug_kusage(void)
-{
- struct debug_alloc_header *h_free, *h_used;
-#ifdef CONFIG_IA64
- /* FIXME: using dah for ia64 unwind always results in a memory leak.
- * Fix that memory leak first, then set debug_kusage_one_time = 1 for
- * all architectures.
- */
- static int debug_kusage_one_time;
-#else
- static int debug_kusage_one_time = 1;
-#endif
- if (!get_dap_lock()) {
- __release(dap_lock); /* we never actually got it */
- return;
- }
- h_free = (struct debug_alloc_header *)(debug_alloc_pool + dah_first);
- if (dah_first == 0 &&
- (h_free->size == sizeof(debug_alloc_pool_aligned) - dah_overhead ||
- dah_first_call))
- goto out;
- if (!debug_kusage_one_time)
- goto out;
- debug_kusage_one_time = 0;
- kdb_func_printf("debug_kmalloc memory leak dah_first %d\n", dah_first);
- if (dah_first) {
- h_used = (struct debug_alloc_header *)debug_alloc_pool;
- kdb_func_printf("h_used %px size %d\n", h_used, h_used->size);
- }
- do {
- h_used = (struct debug_alloc_header *)
- ((char *)h_free + dah_overhead + h_free->size);
- kdb_func_printf("h_used %px size %d caller %px\n",
- h_used, h_used->size, h_used->caller);
- h_free = (struct debug_alloc_header *)
- (debug_alloc_pool + h_free->next);
- } while (h_free->next);
- h_used = (struct debug_alloc_header *)
- ((char *)h_free + dah_overhead + h_free->size);
- if ((char *)h_used - debug_alloc_pool !=
- sizeof(debug_alloc_pool_aligned))
- kdb_func_printf("h_used %px size %d caller %px\n",
- h_used, h_used->size, h_used->caller);
-out:
- spin_unlock(&dap_lock);
+ return strchr(mask, state);
}
/* Maintain a small stack of kdb_flags to allow recursion without disturbing
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 51530d5b15a8..6f0c358e73d8 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -44,7 +44,7 @@ void delayacct_init(void)
}
#ifdef CONFIG_PROC_SYSCTL
-int sysctl_delayacct(struct ctl_table *table, int write, void *buffer,
+static int sysctl_delayacct(struct ctl_table *table, int write, void *buffer,
size_t *lenp, loff_t *ppos)
{
int state = delayacct_on;
@@ -63,6 +63,26 @@ int sysctl_delayacct(struct ctl_table *table, int write, void *buffer,
set_delayacct(state);
return err;
}
+
+static struct ctl_table kern_delayacct_table[] = {
+ {
+ .procname = "task_delayacct",
+ .data = NULL,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sysctl_delayacct,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ { }
+};
+
+static __init int kernel_delayacct_sysctls_init(void)
+{
+ register_sysctl_init("kernel", kern_delayacct_table);
+ return 0;
+}
+late_initcall(kernel_delayacct_sysctls_init);
#endif
void __delayacct_tsk_init(struct task_struct *tsk)
@@ -100,19 +120,10 @@ void __delayacct_blkio_start(void)
*/
void __delayacct_blkio_end(struct task_struct *p)
{
- struct task_delay_info *delays = p->delays;
- u64 *total;
- u32 *count;
-
- if (p->delays->flags & DELAYACCT_PF_SWAPIN) {
- total = &delays->swapin_delay;
- count = &delays->swapin_count;
- } else {
- total = &delays->blkio_delay;
- count = &delays->blkio_count;
- }
-
- delayacct_end(&delays->lock, &delays->blkio_start, total, count);
+ delayacct_end(&p->delays->lock,
+ &p->delays->blkio_start,
+ &p->delays->blkio_delay,
+ &p->delays->blkio_count);
}
int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
@@ -164,10 +175,19 @@ int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
d->freepages_delay_total = (tmp < d->freepages_delay_total) ? 0 : tmp;
tmp = d->thrashing_delay_total + tsk->delays->thrashing_delay;
d->thrashing_delay_total = (tmp < d->thrashing_delay_total) ? 0 : tmp;
+ tmp = d->compact_delay_total + tsk->delays->compact_delay;
+ d->compact_delay_total = (tmp < d->compact_delay_total) ? 0 : tmp;
+ tmp = d->wpcopy_delay_total + tsk->delays->wpcopy_delay;
+ d->wpcopy_delay_total = (tmp < d->wpcopy_delay_total) ? 0 : tmp;
+ tmp = d->irq_delay_total + tsk->delays->irq_delay;
+ d->irq_delay_total = (tmp < d->irq_delay_total) ? 0 : tmp;
d->blkio_count += tsk->delays->blkio_count;
d->swapin_count += tsk->delays->swapin_count;
d->freepages_count += tsk->delays->freepages_count;
d->thrashing_count += tsk->delays->thrashing_count;
+ d->compact_count += tsk->delays->compact_count;
+ d->wpcopy_count += tsk->delays->wpcopy_count;
+ d->irq_count += tsk->delays->irq_count;
raw_spin_unlock_irqrestore(&tsk->delays->lock, flags);
return 0;
@@ -179,8 +199,7 @@ __u64 __delayacct_blkio_ticks(struct task_struct *tsk)
unsigned long flags;
raw_spin_lock_irqsave(&tsk->delays->lock, flags);
- ret = nsec_to_clock_t(tsk->delays->blkio_delay +
- tsk->delays->swapin_delay);
+ ret = nsec_to_clock_t(tsk->delays->blkio_delay);
raw_spin_unlock_irqrestore(&tsk->delays->lock, flags);
return ret;
}
@@ -198,15 +217,74 @@ void __delayacct_freepages_end(void)
&current->delays->freepages_count);
}
-void __delayacct_thrashing_start(void)
+void __delayacct_thrashing_start(bool *in_thrashing)
{
+ *in_thrashing = !!current->in_thrashing;
+ if (*in_thrashing)
+ return;
+
+ current->in_thrashing = 1;
current->delays->thrashing_start = local_clock();
}
-void __delayacct_thrashing_end(void)
+void __delayacct_thrashing_end(bool *in_thrashing)
{
+ if (*in_thrashing)
+ return;
+
+ current->in_thrashing = 0;
delayacct_end(&current->delays->lock,
&current->delays->thrashing_start,
&current->delays->thrashing_delay,
&current->delays->thrashing_count);
}
+
+void __delayacct_swapin_start(void)
+{
+ current->delays->swapin_start = local_clock();
+}
+
+void __delayacct_swapin_end(void)
+{
+ delayacct_end(&current->delays->lock,
+ &current->delays->swapin_start,
+ &current->delays->swapin_delay,
+ &current->delays->swapin_count);
+}
+
+void __delayacct_compact_start(void)
+{
+ current->delays->compact_start = local_clock();
+}
+
+void __delayacct_compact_end(void)
+{
+ delayacct_end(&current->delays->lock,
+ &current->delays->compact_start,
+ &current->delays->compact_delay,
+ &current->delays->compact_count);
+}
+
+void __delayacct_wpcopy_start(void)
+{
+ current->delays->wpcopy_start = local_clock();
+}
+
+void __delayacct_wpcopy_end(void)
+{
+ delayacct_end(&current->delays->lock,
+ &current->delays->wpcopy_start,
+ &current->delays->wpcopy_delay,
+ &current->delays->wpcopy_count);
+}
+
+void __delayacct_irq(struct task_struct *task, u32 delta)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&task->delays->lock, flags);
+ task->delays->irq_delay += delta;
+ task->delays->irq_count++;
+ raw_spin_unlock_irqrestore(&task->delays->lock, flags);
+}
+
diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig
index 77b405508743..6677d0e64d27 100644
--- a/kernel/dma/Kconfig
+++ b/kernel/dma/Kconfig
@@ -76,10 +76,30 @@ config ARCH_HAS_DMA_PREP_COHERENT
config ARCH_HAS_FORCE_DMA_UNENCRYPTED
bool
+#
+# Select this option if the architecture assumes DMA devices are coherent
+# by default.
+#
+config ARCH_DMA_DEFAULT_COHERENT
+ bool
+
config SWIOTLB
bool
select NEED_DMA_MAP_STATE
+config DMA_RESTRICTED_POOL
+ bool "DMA Restricted Pool"
+ depends on OF && OF_RESERVED_MEM && SWIOTLB
+ help
+ This enables support for restricted DMA pools which provide a level of
+ DMA memory protection on systems with limited hardware protection
+ capabilities, such as those lacking an IOMMU.
+
+ For more information see
+ <Documentation/devicetree/bindings/reserved-memory/reserved-memory.txt>
+ and <kernel/dma/swiotlb.c>.
+ If unsure, say "n".
+
#
# Should be selected if we can mmap non-coherent mappings to userspace.
# The only thing that is really required is a way to set an uncached bit
@@ -93,15 +113,14 @@ config DMA_COHERENT_POOL
select GENERIC_ALLOCATOR
bool
-config DMA_REMAP
+config DMA_GLOBAL_POOL
+ select DMA_DECLARE_COHERENT
bool
- depends on MMU
- select DMA_NONCOHERENT_MMAP
config DMA_DIRECT_REMAP
bool
- select DMA_REMAP
select DMA_COHERENT_POOL
+ select DMA_NONCOHERENT_MMAP
config DMA_CMA
bool "DMA Contiguous Memory Allocator"
diff --git a/kernel/dma/Makefile b/kernel/dma/Makefile
index 0dd65ec1d234..21926e46ef4f 100644
--- a/kernel/dma/Makefile
+++ b/kernel/dma/Makefile
@@ -8,5 +8,5 @@ obj-$(CONFIG_DMA_DECLARE_COHERENT) += coherent.o
obj-$(CONFIG_DMA_API_DEBUG) += debug.o
obj-$(CONFIG_SWIOTLB) += swiotlb.o
obj-$(CONFIG_DMA_COHERENT_POOL) += pool.o
-obj-$(CONFIG_DMA_REMAP) += remap.o
+obj-$(CONFIG_MMU) += remap.o
obj-$(CONFIG_DMA_MAP_BENCHMARK) += map_benchmark.o
diff --git a/kernel/dma/coherent.c b/kernel/dma/coherent.c
index 794e76b03b34..c21abc77c53e 100644
--- a/kernel/dma/coherent.c
+++ b/kernel/dma/coherent.c
@@ -20,8 +20,6 @@ struct dma_coherent_mem {
bool use_dev_dma_pfn_offset;
};
-static struct dma_coherent_mem *dma_coherent_default_memory __ro_after_init;
-
static inline struct dma_coherent_mem *dev_get_coherent_memory(struct device *dev)
{
if (dev && dev->dma_mem)
@@ -37,60 +35,52 @@ static inline dma_addr_t dma_get_device_base(struct device *dev,
return mem->device_base;
}
-static int dma_init_coherent_memory(phys_addr_t phys_addr,
- dma_addr_t device_addr, size_t size,
- struct dma_coherent_mem **mem)
+static struct dma_coherent_mem *dma_init_coherent_memory(phys_addr_t phys_addr,
+ dma_addr_t device_addr, size_t size, bool use_dma_pfn_offset)
{
- struct dma_coherent_mem *dma_mem = NULL;
- void *mem_base = NULL;
+ struct dma_coherent_mem *dma_mem;
int pages = size >> PAGE_SHIFT;
- int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long);
- int ret;
+ void *mem_base;
- if (!size) {
- ret = -EINVAL;
- goto out;
- }
+ if (!size)
+ return ERR_PTR(-EINVAL);
mem_base = memremap(phys_addr, size, MEMREMAP_WC);
- if (!mem_base) {
- ret = -EINVAL;
- goto out;
- }
+ if (!mem_base)
+ return ERR_PTR(-EINVAL);
+
dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL);
- if (!dma_mem) {
- ret = -ENOMEM;
- goto out;
- }
- dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL);
- if (!dma_mem->bitmap) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!dma_mem)
+ goto out_unmap_membase;
+ dma_mem->bitmap = bitmap_zalloc(pages, GFP_KERNEL);
+ if (!dma_mem->bitmap)
+ goto out_free_dma_mem;
dma_mem->virt_base = mem_base;
dma_mem->device_base = device_addr;
dma_mem->pfn_base = PFN_DOWN(phys_addr);
dma_mem->size = pages;
+ dma_mem->use_dev_dma_pfn_offset = use_dma_pfn_offset;
spin_lock_init(&dma_mem->spinlock);
- *mem = dma_mem;
- return 0;
+ return dma_mem;
-out:
+out_free_dma_mem:
kfree(dma_mem);
- if (mem_base)
- memunmap(mem_base);
- return ret;
+out_unmap_membase:
+ memunmap(mem_base);
+ pr_err("Reserved memory: failed to init DMA memory pool at %pa, size %zd MiB\n",
+ &phys_addr, size / SZ_1M);
+ return ERR_PTR(-ENOMEM);
}
-static void dma_release_coherent_memory(struct dma_coherent_mem *mem)
+static void _dma_release_coherent_memory(struct dma_coherent_mem *mem)
{
if (!mem)
return;
memunmap(mem->virt_base);
- kfree(mem->bitmap);
+ bitmap_free(mem->bitmap);
kfree(mem);
}
@@ -130,16 +120,22 @@ int dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr,
struct dma_coherent_mem *mem;
int ret;
- ret = dma_init_coherent_memory(phys_addr, device_addr, size, &mem);
- if (ret)
- return ret;
+ mem = dma_init_coherent_memory(phys_addr, device_addr, size, false);
+ if (IS_ERR(mem))
+ return PTR_ERR(mem);
ret = dma_assign_coherent_memory(dev, mem);
if (ret)
- dma_release_coherent_memory(mem);
+ _dma_release_coherent_memory(mem);
return ret;
}
+void dma_release_coherent_memory(struct device *dev)
+{
+ if (dev)
+ _dma_release_coherent_memory(dev->dma_mem);
+}
+
static void *__dma_alloc_from_coherent(struct device *dev,
struct dma_coherent_mem *mem,
ssize_t size, dma_addr_t *dma_handle)
@@ -198,16 +194,6 @@ int dma_alloc_from_dev_coherent(struct device *dev, ssize_t size,
return 1;
}
-void *dma_alloc_from_global_coherent(struct device *dev, ssize_t size,
- dma_addr_t *dma_handle)
-{
- if (!dma_coherent_default_memory)
- return NULL;
-
- return __dma_alloc_from_coherent(dev, dma_coherent_default_memory, size,
- dma_handle);
-}
-
static int __dma_release_from_coherent(struct dma_coherent_mem *mem,
int order, void *vaddr)
{
@@ -243,15 +229,6 @@ int dma_release_from_dev_coherent(struct device *dev, int order, void *vaddr)
return __dma_release_from_coherent(mem, order, vaddr);
}
-int dma_release_from_global_coherent(int order, void *vaddr)
-{
- if (!dma_coherent_default_memory)
- return 0;
-
- return __dma_release_from_coherent(dma_coherent_default_memory, order,
- vaddr);
-}
-
static int __dma_mmap_from_coherent(struct dma_coherent_mem *mem,
struct vm_area_struct *vma, void *vaddr, size_t size, int *ret)
{
@@ -297,6 +274,28 @@ int dma_mmap_from_dev_coherent(struct device *dev, struct vm_area_struct *vma,
return __dma_mmap_from_coherent(mem, vma, vaddr, size, ret);
}
+#ifdef CONFIG_DMA_GLOBAL_POOL
+static struct dma_coherent_mem *dma_coherent_default_memory __ro_after_init;
+
+void *dma_alloc_from_global_coherent(struct device *dev, ssize_t size,
+ dma_addr_t *dma_handle)
+{
+ if (!dma_coherent_default_memory)
+ return NULL;
+
+ return __dma_alloc_from_coherent(dev, dma_coherent_default_memory, size,
+ dma_handle);
+}
+
+int dma_release_from_global_coherent(int order, void *vaddr)
+{
+ if (!dma_coherent_default_memory)
+ return 0;
+
+ return __dma_release_from_coherent(dma_coherent_default_memory, order,
+ vaddr);
+}
+
int dma_mmap_from_global_coherent(struct vm_area_struct *vma, void *vaddr,
size_t size, int *ret)
{
@@ -307,6 +306,19 @@ int dma_mmap_from_global_coherent(struct vm_area_struct *vma, void *vaddr,
vaddr, size, ret);
}
+int dma_init_global_coherent(phys_addr_t phys_addr, size_t size)
+{
+ struct dma_coherent_mem *mem;
+
+ mem = dma_init_coherent_memory(phys_addr, phys_addr, size, true);
+ if (IS_ERR(mem))
+ return PTR_ERR(mem);
+ dma_coherent_default_memory = mem;
+ pr_info("DMA: default coherent area is set\n");
+ return 0;
+}
+#endif /* CONFIG_DMA_GLOBAL_POOL */
+
/*
* Support for reserved memory regions defined in device tree
*/
@@ -315,25 +327,22 @@ int dma_mmap_from_global_coherent(struct vm_area_struct *vma, void *vaddr,
#include <linux/of_fdt.h>
#include <linux/of_reserved_mem.h>
+#ifdef CONFIG_DMA_GLOBAL_POOL
static struct reserved_mem *dma_reserved_default_memory __initdata;
+#endif
static int rmem_dma_device_init(struct reserved_mem *rmem, struct device *dev)
{
- struct dma_coherent_mem *mem = rmem->priv;
- int ret;
-
- if (!mem) {
- ret = dma_init_coherent_memory(rmem->base, rmem->base,
- rmem->size, &mem);
- if (ret) {
- pr_err("Reserved memory: failed to init DMA memory pool at %pa, size %ld MiB\n",
- &rmem->base, (unsigned long)rmem->size / SZ_1M);
- return ret;
- }
+ if (!rmem->priv) {
+ struct dma_coherent_mem *mem;
+
+ mem = dma_init_coherent_memory(rmem->base, rmem->base,
+ rmem->size, true);
+ if (IS_ERR(mem))
+ return PTR_ERR(mem);
+ rmem->priv = mem;
}
- mem->use_dev_dma_pfn_offset = true;
- rmem->priv = mem;
- dma_assign_coherent_memory(dev, mem);
+ dma_assign_coherent_memory(dev, rmem->priv);
return 0;
}
@@ -361,7 +370,9 @@ static int __init rmem_dma_setup(struct reserved_mem *rmem)
pr_err("Reserved memory: regions without no-map are not yet supported\n");
return -EINVAL;
}
+#endif
+#ifdef CONFIG_DMA_GLOBAL_POOL
if (of_get_flat_dt_prop(node, "linux,dma-default", NULL)) {
WARN(dma_reserved_default_memory,
"Reserved memory: region for default DMA coherent area is redefined\n");
@@ -375,31 +386,16 @@ static int __init rmem_dma_setup(struct reserved_mem *rmem)
return 0;
}
+#ifdef CONFIG_DMA_GLOBAL_POOL
static int __init dma_init_reserved_memory(void)
{
- const struct reserved_mem_ops *ops;
- int ret;
-
if (!dma_reserved_default_memory)
return -ENOMEM;
-
- ops = dma_reserved_default_memory->ops;
-
- /*
- * We rely on rmem_dma_device_init() does not propagate error of
- * dma_assign_coherent_memory() for "NULL" device.
- */
- ret = ops->device_init(dma_reserved_default_memory, NULL);
-
- if (!ret) {
- dma_coherent_default_memory = dma_reserved_default_memory->priv;
- pr_info("DMA: default coherent area is set\n");
- }
-
- return ret;
+ return dma_init_global_coherent(dma_reserved_default_memory->base,
+ dma_reserved_default_memory->size);
}
-
core_initcall(dma_init_reserved_memory);
+#endif /* CONFIG_DMA_GLOBAL_POOL */
RESERVEDMEM_OF_DECLARE(dma, "shared-dma-pool", rmem_dma_setup);
#endif
diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c
index 3d63d91cba5c..6ea80ae42622 100644
--- a/kernel/dma/contiguous.c
+++ b/kernel/dma/contiguous.c
@@ -399,8 +399,6 @@ static const struct reserved_mem_ops rmem_cma_ops = {
static int __init rmem_cma_setup(struct reserved_mem *rmem)
{
- phys_addr_t align = PAGE_SIZE << max(MAX_ORDER - 1, pageblock_order);
- phys_addr_t mask = align - 1;
unsigned long node = rmem->fdt_node;
bool default_cma = of_get_flat_dt_prop(node, "linux,cma-default", NULL);
struct cma *cma;
@@ -416,7 +414,7 @@ static int __init rmem_cma_setup(struct reserved_mem *rmem)
of_get_flat_dt_prop(node, "no-map", NULL))
return -EINVAL;
- if ((rmem->base & mask) || (rmem->size & mask)) {
+ if (!IS_ALIGNED(rmem->base | rmem->size, CMA_MIN_ALIGNMENT_BYTES)) {
pr_err("Reserved memory: incorrect alignment of CMA region\n");
return -EINVAL;
}
diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c
index dadae6255d05..f190651bcadd 100644
--- a/kernel/dma/debug.c
+++ b/kernel/dma/debug.c
@@ -53,6 +53,7 @@ enum map_err_types {
* struct dma_debug_entry - track a dma_map* or dma_alloc_coherent mapping
* @list: node on pre-allocated free_entries list
* @dev: 'dev' argument to dma_map_{page|single|sg} or dma_alloc_coherent
+ * @dev_addr: dma address
* @size: length of the mapping
* @type: single, page, sg, coherent
* @direction: enum dma_data_direction
@@ -350,11 +351,10 @@ static struct dma_debug_entry *bucket_find_contain(struct hash_bucket **bucket,
unsigned long *flags)
{
- unsigned int max_range = dma_get_max_seg_size(ref->dev);
struct dma_debug_entry *entry, index = *ref;
- unsigned int range = 0;
+ int limit = min(HASH_SIZE, (index.dev_addr >> HASH_FN_SHIFT) + 1);
- while (range <= max_range) {
+ for (int i = 0; i < limit; i++) {
entry = __hash_bucket_find(*bucket, ref, containing_match);
if (entry)
@@ -364,7 +364,6 @@ static struct dma_debug_entry *bucket_find_contain(struct hash_bucket **bucket,
* Nothing found, go back a hash bucket
*/
put_hash_bucket(*bucket, *flags);
- range += (1 << HASH_FN_SHIFT);
index.dev_addr -= (1 << HASH_FN_SHIFT);
*bucket = get_hash_bucket(&index, flags);
}
@@ -398,37 +397,6 @@ static unsigned long long phys_addr(struct dma_debug_entry *entry)
}
/*
- * Dump mapping entries for debugging purposes
- */
-void debug_dma_dump_mappings(struct device *dev)
-{
- int idx;
-
- for (idx = 0; idx < HASH_SIZE; idx++) {
- struct hash_bucket *bucket = &dma_entry_hash[idx];
- struct dma_debug_entry *entry;
- unsigned long flags;
-
- spin_lock_irqsave(&bucket->lock, flags);
-
- list_for_each_entry(entry, &bucket->list, list) {
- if (!dev || dev == entry->dev) {
- dev_info(entry->dev,
- "%s idx %d P=%Lx N=%lx D=%Lx L=%Lx %s %s\n",
- type2name[entry->type], idx,
- phys_addr(entry), entry->pfn,
- entry->dev_addr, entry->size,
- dir2name[entry->direction],
- maperr2str[entry->map_err_type]);
- }
- }
-
- spin_unlock_irqrestore(&bucket->lock, flags);
- cond_resched();
- }
-}
-
-/*
* For each mapping (initial cacheline in the case of
* dma_alloc_coherent/dma_map_page, initial cacheline in each page of a
* scatterlist, or the cacheline specified in dma_map_single) insert
@@ -448,7 +416,7 @@ void debug_dma_dump_mappings(struct device *dev)
* other hand, consumes a single dma_debug_entry, but inserts 'nents'
* entries into the tree.
*/
-static RADIX_TREE(dma_active_cacheline, GFP_NOWAIT);
+static RADIX_TREE(dma_active_cacheline, GFP_ATOMIC);
static DEFINE_SPINLOCK(radix_lock);
#define ACTIVE_CACHELINE_MAX_OVERLAP ((1 << RADIX_TREE_MAX_TAGS) - 1)
#define CACHELINE_PER_PAGE_SHIFT (PAGE_SHIFT - L1_CACHE_SHIFT)
@@ -549,10 +517,74 @@ static void active_cacheline_remove(struct dma_debug_entry *entry)
}
/*
+ * Dump mappings entries on kernel space for debugging purposes
+ */
+void debug_dma_dump_mappings(struct device *dev)
+{
+ int idx;
+ phys_addr_t cln;
+
+ for (idx = 0; idx < HASH_SIZE; idx++) {
+ struct hash_bucket *bucket = &dma_entry_hash[idx];
+ struct dma_debug_entry *entry;
+ unsigned long flags;
+
+ spin_lock_irqsave(&bucket->lock, flags);
+ list_for_each_entry(entry, &bucket->list, list) {
+ if (!dev || dev == entry->dev) {
+ cln = to_cacheline_number(entry);
+ dev_info(entry->dev,
+ "%s idx %d P=%llx N=%lx D=%llx L=%llx cln=%pa %s %s\n",
+ type2name[entry->type], idx,
+ phys_addr(entry), entry->pfn,
+ entry->dev_addr, entry->size,
+ &cln, dir2name[entry->direction],
+ maperr2str[entry->map_err_type]);
+ }
+ }
+ spin_unlock_irqrestore(&bucket->lock, flags);
+
+ cond_resched();
+ }
+}
+
+/*
+ * Dump mappings entries on user space via debugfs
+ */
+static int dump_show(struct seq_file *seq, void *v)
+{
+ int idx;
+ phys_addr_t cln;
+
+ for (idx = 0; idx < HASH_SIZE; idx++) {
+ struct hash_bucket *bucket = &dma_entry_hash[idx];
+ struct dma_debug_entry *entry;
+ unsigned long flags;
+
+ spin_lock_irqsave(&bucket->lock, flags);
+ list_for_each_entry(entry, &bucket->list, list) {
+ cln = to_cacheline_number(entry);
+ seq_printf(seq,
+ "%s %s %s idx %d P=%llx N=%lx D=%llx L=%llx cln=%pa %s %s\n",
+ dev_driver_string(entry->dev),
+ dev_name(entry->dev),
+ type2name[entry->type], idx,
+ phys_addr(entry), entry->pfn,
+ entry->dev_addr, entry->size,
+ &cln, dir2name[entry->direction],
+ maperr2str[entry->map_err_type]);
+ }
+ spin_unlock_irqrestore(&bucket->lock, flags);
+ }
+ return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(dump);
+
+/*
* Wrapper function for adding an entry to the hash.
* This function takes care of locking itself.
*/
-static void add_dma_entry(struct dma_debug_entry *entry)
+static void add_dma_entry(struct dma_debug_entry *entry, unsigned long attrs)
{
struct hash_bucket *bucket;
unsigned long flags;
@@ -564,10 +596,11 @@ static void add_dma_entry(struct dma_debug_entry *entry)
rc = active_cacheline_insert(entry);
if (rc == -ENOMEM) {
- pr_err("cacheline tracking ENOMEM, dma-debug disabled\n");
+ pr_err_once("cacheline tracking ENOMEM, dma-debug disabled\n");
global_disable = true;
- } else if (rc == -EEXIST) {
- pr_err("cacheline tracking EEXIST, overlapping mappings aren't supported\n");
+ } else if (rc == -EEXIST && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) {
+ err_printk(entry->dev, entry,
+ "cacheline tracking EEXIST, overlapping mappings aren't supported\n");
}
}
@@ -765,34 +798,7 @@ static const struct file_operations filter_fops = {
.llseek = default_llseek,
};
-static int dump_show(struct seq_file *seq, void *v)
-{
- int idx;
-
- for (idx = 0; idx < HASH_SIZE; idx++) {
- struct hash_bucket *bucket = &dma_entry_hash[idx];
- struct dma_debug_entry *entry;
- unsigned long flags;
-
- spin_lock_irqsave(&bucket->lock, flags);
- list_for_each_entry(entry, &bucket->list, list) {
- seq_printf(seq,
- "%s %s %s idx %d P=%llx N=%lx D=%llx L=%llx %s %s\n",
- dev_name(entry->dev),
- dev_driver_string(entry->dev),
- type2name[entry->type], idx,
- phys_addr(entry), entry->pfn,
- entry->dev_addr, entry->size,
- dir2name[entry->direction],
- maperr2str[entry->map_err_type]);
- }
- spin_unlock_irqrestore(&bucket->lock, flags);
- }
- return 0;
-}
-DEFINE_SHOW_ATTRIBUTE(dump);
-
-static void dma_debug_fs_init(void)
+static int __init dma_debug_fs_init(void)
{
struct dentry *dentry = debugfs_create_dir("dma-api", NULL);
@@ -805,7 +811,10 @@ static void dma_debug_fs_init(void)
debugfs_create_u32("nr_total_entries", 0444, dentry, &nr_total_entries);
debugfs_create_file("driver_filter", 0644, dentry, NULL, &filter_fops);
debugfs_create_file("dump", 0444, dentry, NULL, &dump_fops);
+
+ return 0;
}
+core_initcall_sync(dma_debug_fs_init);
static int device_dma_allocations(struct device *dev, struct dma_debug_entry **out_entry)
{
@@ -890,8 +899,6 @@ static int dma_debug_init(void)
spin_lock_init(&dma_entry_hash[i].lock);
}
- dma_debug_fs_init();
-
nr_pages = DIV_ROUND_UP(nr_prealloc_entries, DMA_DEBUG_DYNAMIC_ENTRIES);
for (i = 0; i < nr_pages; ++i)
dma_debug_create_entries(GFP_KERNEL);
@@ -925,7 +932,7 @@ static __init int dma_debug_cmdline(char *str)
global_disable = true;
}
- return 0;
+ return 1;
}
static __init int dma_debug_entries_cmdline(char *str)
@@ -934,7 +941,7 @@ static __init int dma_debug_entries_cmdline(char *str)
return -EINVAL;
if (!get_option(&str, &nr_prealloc_entries))
nr_prealloc_entries = PREALLOC_DMA_DEBUG_ENTRIES;
- return 0;
+ return 1;
}
__setup("dma_debug=", dma_debug_cmdline);
@@ -1064,20 +1071,10 @@ static void check_for_stack(struct device *dev,
}
}
-static inline bool overlap(void *addr, unsigned long len, void *start, void *end)
-{
- unsigned long a1 = (unsigned long)addr;
- unsigned long b1 = a1 + len;
- unsigned long a2 = (unsigned long)start;
- unsigned long b2 = (unsigned long)end;
-
- return !(b1 <= a2 || a1 >= b2);
-}
-
static void check_for_illegal_area(struct device *dev, void *addr, unsigned long len)
{
- if (overlap(addr, len, _stext, _etext) ||
- overlap(addr, len, __start_rodata, __end_rodata))
+ if (memory_intersects(_stext, _etext, addr, len) ||
+ memory_intersects(__start_rodata, __end_rodata, addr, len))
err_printk(dev, NULL, "device driver maps memory from kernel text or rodata [addr=%p] [len=%lu]\n", addr, len);
}
@@ -1199,7 +1196,8 @@ void debug_dma_map_single(struct device *dev, const void *addr,
EXPORT_SYMBOL(debug_dma_map_single);
void debug_dma_map_page(struct device *dev, struct page *page, size_t offset,
- size_t size, int direction, dma_addr_t dma_addr)
+ size_t size, int direction, dma_addr_t dma_addr,
+ unsigned long attrs)
{
struct dma_debug_entry *entry;
@@ -1230,7 +1228,7 @@ void debug_dma_map_page(struct device *dev, struct page *page, size_t offset,
check_for_illegal_area(dev, addr, size);
}
- add_dma_entry(entry);
+ add_dma_entry(entry, attrs);
}
void debug_dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
@@ -1271,13 +1269,13 @@ void debug_dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
}
EXPORT_SYMBOL(debug_dma_mapping_error);
-void debug_dma_unmap_page(struct device *dev, dma_addr_t addr,
+void debug_dma_unmap_page(struct device *dev, dma_addr_t dma_addr,
size_t size, int direction)
{
struct dma_debug_entry ref = {
.type = dma_debug_single,
.dev = dev,
- .dev_addr = addr,
+ .dev_addr = dma_addr,
.size = size,
.direction = direction,
};
@@ -1288,7 +1286,8 @@ void debug_dma_unmap_page(struct device *dev, dma_addr_t addr,
}
void debug_dma_map_sg(struct device *dev, struct scatterlist *sg,
- int nents, int mapped_ents, int direction)
+ int nents, int mapped_ents, int direction,
+ unsigned long attrs)
{
struct dma_debug_entry *entry;
struct scatterlist *s;
@@ -1297,6 +1296,12 @@ void debug_dma_map_sg(struct device *dev, struct scatterlist *sg,
if (unlikely(dma_debug_disabled()))
return;
+ for_each_sg(sg, s, nents, i) {
+ check_for_stack(dev, sg_page(s), s->offset);
+ if (!PageHighMem(sg_page(s)))
+ check_for_illegal_area(dev, sg_virt(s), s->length);
+ }
+
for_each_sg(sg, s, mapped_ents, i) {
entry = dma_entry_alloc();
if (!entry)
@@ -1312,15 +1317,9 @@ void debug_dma_map_sg(struct device *dev, struct scatterlist *sg,
entry->sg_call_ents = nents;
entry->sg_mapped_ents = mapped_ents;
- check_for_stack(dev, sg_page(s), s->offset);
-
- if (!PageHighMem(sg_page(s))) {
- check_for_illegal_area(dev, sg_virt(s), sg_dma_len(s));
- }
-
check_sg_segment(dev, s);
- add_dma_entry(entry);
+ add_dma_entry(entry, attrs);
}
}
@@ -1376,7 +1375,8 @@ void debug_dma_unmap_sg(struct device *dev, struct scatterlist *sglist,
}
void debug_dma_alloc_coherent(struct device *dev, size_t size,
- dma_addr_t dma_addr, void *virt)
+ dma_addr_t dma_addr, void *virt,
+ unsigned long attrs)
{
struct dma_debug_entry *entry;
@@ -1406,17 +1406,17 @@ void debug_dma_alloc_coherent(struct device *dev, size_t size,
else
entry->pfn = page_to_pfn(virt_to_page(virt));
- add_dma_entry(entry);
+ add_dma_entry(entry, attrs);
}
void debug_dma_free_coherent(struct device *dev, size_t size,
- void *virt, dma_addr_t addr)
+ void *virt, dma_addr_t dma_addr)
{
struct dma_debug_entry ref = {
.type = dma_debug_coherent,
.dev = dev,
.offset = offset_in_page(virt),
- .dev_addr = addr,
+ .dev_addr = dma_addr,
.size = size,
.direction = DMA_BIDIRECTIONAL,
};
@@ -1437,7 +1437,8 @@ void debug_dma_free_coherent(struct device *dev, size_t size,
}
void debug_dma_map_resource(struct device *dev, phys_addr_t addr, size_t size,
- int direction, dma_addr_t dma_addr)
+ int direction, dma_addr_t dma_addr,
+ unsigned long attrs)
{
struct dma_debug_entry *entry;
@@ -1457,7 +1458,7 @@ void debug_dma_map_resource(struct device *dev, phys_addr_t addr, size_t size,
entry->direction = direction;
entry->map_err_type = MAP_ERR_NOT_CHECKED;
- add_dma_entry(entry);
+ add_dma_entry(entry, attrs);
}
void debug_dma_unmap_resource(struct device *dev, dma_addr_t dma_addr,
diff --git a/kernel/dma/debug.h b/kernel/dma/debug.h
index 83643b3010b2..f525197d3cae 100644
--- a/kernel/dma/debug.h
+++ b/kernel/dma/debug.h
@@ -11,26 +11,30 @@
#ifdef CONFIG_DMA_API_DEBUG
extern void debug_dma_map_page(struct device *dev, struct page *page,
size_t offset, size_t size,
- int direction, dma_addr_t dma_addr);
+ int direction, dma_addr_t dma_addr,
+ unsigned long attrs);
extern void debug_dma_unmap_page(struct device *dev, dma_addr_t addr,
size_t size, int direction);
extern void debug_dma_map_sg(struct device *dev, struct scatterlist *sg,
- int nents, int mapped_ents, int direction);
+ int nents, int mapped_ents, int direction,
+ unsigned long attrs);
extern void debug_dma_unmap_sg(struct device *dev, struct scatterlist *sglist,
int nelems, int dir);
extern void debug_dma_alloc_coherent(struct device *dev, size_t size,
- dma_addr_t dma_addr, void *virt);
+ dma_addr_t dma_addr, void *virt,
+ unsigned long attrs);
extern void debug_dma_free_coherent(struct device *dev, size_t size,
void *virt, dma_addr_t addr);
extern void debug_dma_map_resource(struct device *dev, phys_addr_t addr,
size_t size, int direction,
- dma_addr_t dma_addr);
+ dma_addr_t dma_addr,
+ unsigned long attrs);
extern void debug_dma_unmap_resource(struct device *dev, dma_addr_t dma_addr,
size_t size, int direction);
@@ -53,7 +57,8 @@ extern void debug_dma_sync_sg_for_device(struct device *dev,
#else /* CONFIG_DMA_API_DEBUG */
static inline void debug_dma_map_page(struct device *dev, struct page *page,
size_t offset, size_t size,
- int direction, dma_addr_t dma_addr)
+ int direction, dma_addr_t dma_addr,
+ unsigned long attrs)
{
}
@@ -63,7 +68,8 @@ static inline void debug_dma_unmap_page(struct device *dev, dma_addr_t addr,
}
static inline void debug_dma_map_sg(struct device *dev, struct scatterlist *sg,
- int nents, int mapped_ents, int direction)
+ int nents, int mapped_ents, int direction,
+ unsigned long attrs)
{
}
@@ -74,7 +80,8 @@ static inline void debug_dma_unmap_sg(struct device *dev,
}
static inline void debug_dma_alloc_coherent(struct device *dev, size_t size,
- dma_addr_t dma_addr, void *virt)
+ dma_addr_t dma_addr, void *virt,
+ unsigned long attrs)
{
}
@@ -85,7 +92,8 @@ static inline void debug_dma_free_coherent(struct device *dev, size_t size,
static inline void debug_dma_map_resource(struct device *dev, phys_addr_t addr,
size_t size, int direction,
- dma_addr_t dma_addr)
+ dma_addr_t dma_addr,
+ unsigned long attrs)
{
}
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index f737e3347059..5595d1d5cdcc 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -44,10 +44,11 @@ u64 dma_direct_get_required_mask(struct device *dev)
return (1ULL << (fls64(max_dma) - 1)) * 2 - 1;
}
-static gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask,
- u64 *phys_limit)
+static gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 *phys_limit)
{
- u64 dma_limit = min_not_zero(dma_mask, dev->bus_dma_limit);
+ u64 dma_limit = min_not_zero(
+ dev->coherent_dma_mask,
+ dev->bus_dma_limit);
/*
* Optimistically try the zone that the physical address mask falls
@@ -75,8 +76,47 @@ static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size)
min_not_zero(dev->coherent_dma_mask, dev->bus_dma_limit);
}
+static int dma_set_decrypted(struct device *dev, void *vaddr, size_t size)
+{
+ if (!force_dma_unencrypted(dev))
+ return 0;
+ return set_memory_decrypted((unsigned long)vaddr, PFN_UP(size));
+}
+
+static int dma_set_encrypted(struct device *dev, void *vaddr, size_t size)
+{
+ int ret;
+
+ if (!force_dma_unencrypted(dev))
+ return 0;
+ ret = set_memory_encrypted((unsigned long)vaddr, PFN_UP(size));
+ if (ret)
+ pr_warn_ratelimited("leaking DMA memory that can't be re-encrypted\n");
+ return ret;
+}
+
+static void __dma_direct_free_pages(struct device *dev, struct page *page,
+ size_t size)
+{
+ if (swiotlb_free(dev, page, size))
+ return;
+ dma_free_contiguous(dev, page, size);
+}
+
+static struct page *dma_direct_alloc_swiotlb(struct device *dev, size_t size)
+{
+ struct page *page = swiotlb_alloc(dev, size);
+
+ if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) {
+ swiotlb_free(dev, page, size);
+ return NULL;
+ }
+
+ return page;
+}
+
static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
- gfp_t gfp)
+ gfp_t gfp, bool allow_highmem)
{
int node = dev_to_node(dev);
struct page *page = NULL;
@@ -84,12 +124,17 @@ static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
WARN_ON_ONCE(!PAGE_ALIGNED(size));
- gfp |= dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask,
- &phys_limit);
+ if (is_swiotlb_for_alloc(dev))
+ return dma_direct_alloc_swiotlb(dev, size);
+
+ gfp |= dma_direct_optimal_gfp_mask(dev, &phys_limit);
page = dma_alloc_contiguous(dev, size, gfp);
- if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) {
- dma_free_contiguous(dev, page, size);
- page = NULL;
+ if (page) {
+ if (!dma_coherent_ok(dev, page_to_phys(page), size) ||
+ (!allow_highmem && PageHighMem(page))) {
+ dma_free_contiguous(dev, page, size);
+ page = NULL;
+ }
}
again:
if (!page)
@@ -114,15 +159,26 @@ again:
return page;
}
+/*
+ * Check if a potentially blocking operations needs to dip into the atomic
+ * pools for the given device/gfp.
+ */
+static bool dma_direct_use_pool(struct device *dev, gfp_t gfp)
+{
+ return !gfpflags_allow_blocking(gfp) && !is_swiotlb_for_alloc(dev);
+}
+
static void *dma_direct_alloc_from_pool(struct device *dev, size_t size,
dma_addr_t *dma_handle, gfp_t gfp)
{
struct page *page;
- u64 phys_mask;
+ u64 phys_limit;
void *ret;
- gfp |= dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask,
- &phys_mask);
+ if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_DMA_COHERENT_POOL)))
+ return NULL;
+
+ gfp |= dma_direct_optimal_gfp_mask(dev, &phys_limit);
page = dma_alloc_from_pool(dev, size, &ret, gfp, dma_coherent_ok);
if (!page)
return NULL;
@@ -130,114 +186,135 @@ static void *dma_direct_alloc_from_pool(struct device *dev, size_t size,
return ret;
}
+static void *dma_direct_alloc_no_mapping(struct device *dev, size_t size,
+ dma_addr_t *dma_handle, gfp_t gfp)
+{
+ struct page *page;
+
+ page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO, true);
+ if (!page)
+ return NULL;
+
+ /* remove any dirty cache lines on the kernel alias */
+ if (!PageHighMem(page))
+ arch_dma_prep_coherent(page, size);
+
+ /* return the page pointer as the opaque cookie */
+ *dma_handle = phys_to_dma_direct(dev, page_to_phys(page));
+ return page;
+}
+
void *dma_direct_alloc(struct device *dev, size_t size,
dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs)
{
+ bool remap = false, set_uncached = false;
struct page *page;
void *ret;
- int err;
size = PAGE_ALIGN(size);
if (attrs & DMA_ATTR_NO_WARN)
gfp |= __GFP_NOWARN;
if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) &&
- !force_dma_unencrypted(dev)) {
- page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO);
- if (!page)
- return NULL;
- /* remove any dirty cache lines on the kernel alias */
- if (!PageHighMem(page))
- arch_dma_prep_coherent(page, size);
- *dma_handle = phys_to_dma_direct(dev, page_to_phys(page));
- /* return the page pointer as the opaque cookie */
- return page;
- }
+ !force_dma_unencrypted(dev) && !is_swiotlb_for_alloc(dev))
+ return dma_direct_alloc_no_mapping(dev, size, dma_handle, gfp);
- if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) &&
- !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) &&
- !dev_is_dma_coherent(dev))
- return arch_dma_alloc(dev, size, dma_handle, gfp, attrs);
+ if (!dev_is_dma_coherent(dev)) {
+ /*
+ * Fallback to the arch handler if it exists. This should
+ * eventually go away.
+ */
+ if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) &&
+ !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) &&
+ !IS_ENABLED(CONFIG_DMA_GLOBAL_POOL) &&
+ !is_swiotlb_for_alloc(dev))
+ return arch_dma_alloc(dev, size, dma_handle, gfp,
+ attrs);
+
+ /*
+ * If there is a global pool, always allocate from it for
+ * non-coherent devices.
+ */
+ if (IS_ENABLED(CONFIG_DMA_GLOBAL_POOL))
+ return dma_alloc_from_global_coherent(dev, size,
+ dma_handle);
+
+ /*
+ * Otherwise remap if the architecture is asking for it. But
+ * given that remapping memory is a blocking operation we'll
+ * instead have to dip into the atomic pools.
+ */
+ remap = IS_ENABLED(CONFIG_DMA_DIRECT_REMAP);
+ if (remap) {
+ if (dma_direct_use_pool(dev, gfp))
+ return dma_direct_alloc_from_pool(dev, size,
+ dma_handle, gfp);
+ } else {
+ if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED))
+ return NULL;
+ set_uncached = true;
+ }
+ }
/*
- * Remapping or decrypting memory may block. If either is required and
- * we can't block, allocate the memory from the atomic pools.
+ * Decrypting memory may block, so allocate the memory from the atomic
+ * pools if we can't block.
*/
- if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) &&
- !gfpflags_allow_blocking(gfp) &&
- (force_dma_unencrypted(dev) ||
- (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && !dev_is_dma_coherent(dev))))
+ if (force_dma_unencrypted(dev) && dma_direct_use_pool(dev, gfp))
return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp);
/* we always manually zero the memory once we are done */
- page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO);
+ page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO, true);
if (!page)
return NULL;
- if ((IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) &&
- !dev_is_dma_coherent(dev)) ||
- (IS_ENABLED(CONFIG_DMA_REMAP) && PageHighMem(page))) {
+ /*
+ * dma_alloc_contiguous can return highmem pages depending on a
+ * combination the cma= arguments and per-arch setup. These need to be
+ * remapped to return a kernel virtual address.
+ */
+ if (PageHighMem(page)) {
+ remap = true;
+ set_uncached = false;
+ }
+
+ if (remap) {
+ pgprot_t prot = dma_pgprot(dev, PAGE_KERNEL, attrs);
+
+ if (force_dma_unencrypted(dev))
+ prot = pgprot_decrypted(prot);
+
/* remove any dirty cache lines on the kernel alias */
arch_dma_prep_coherent(page, size);
/* create a coherent mapping */
- ret = dma_common_contiguous_remap(page, size,
- dma_pgprot(dev, PAGE_KERNEL, attrs),
+ ret = dma_common_contiguous_remap(page, size, prot,
__builtin_return_address(0));
if (!ret)
goto out_free_pages;
- if (force_dma_unencrypted(dev)) {
- err = set_memory_decrypted((unsigned long)ret,
- 1 << get_order(size));
- if (err)
- goto out_free_pages;
- }
- memset(ret, 0, size);
- goto done;
- }
-
- if (PageHighMem(page)) {
- /*
- * Depending on the cma= arguments and per-arch setup
- * dma_alloc_contiguous could return highmem pages.
- * Without remapping there is no way to return them here,
- * so log an error and fail.
- */
- dev_info(dev, "Rejecting highmem page from CMA.\n");
- goto out_free_pages;
- }
-
- ret = page_address(page);
- if (force_dma_unencrypted(dev)) {
- err = set_memory_decrypted((unsigned long)ret,
- 1 << get_order(size));
- if (err)
+ } else {
+ ret = page_address(page);
+ if (dma_set_decrypted(dev, ret, size))
goto out_free_pages;
}
memset(ret, 0, size);
- if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) &&
- !dev_is_dma_coherent(dev)) {
+ if (set_uncached) {
arch_dma_prep_coherent(page, size);
ret = arch_dma_set_uncached(ret, size);
if (IS_ERR(ret))
goto out_encrypt_pages;
}
-done:
+
*dma_handle = phys_to_dma_direct(dev, page_to_phys(page));
return ret;
out_encrypt_pages:
- if (force_dma_unencrypted(dev)) {
- err = set_memory_encrypted((unsigned long)page_address(page),
- 1 << get_order(size));
- /* If memory cannot be re-encrypted, it must be leaked */
- if (err)
- return NULL;
- }
+ if (dma_set_encrypted(dev, page_address(page), size))
+ return NULL;
out_free_pages:
- dma_free_contiguous(dev, page, size);
+ __dma_direct_free_pages(dev, page, size);
return NULL;
}
@@ -247,7 +324,7 @@ void dma_direct_free(struct device *dev, size_t size,
unsigned int page_order = get_order(size);
if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) &&
- !force_dma_unencrypted(dev)) {
+ !force_dma_unencrypted(dev) && !is_swiotlb_for_alloc(dev)) {
/* cpu_addr is a struct page cookie, not a kernel address */
dma_free_contiguous(dev, cpu_addr, size);
return;
@@ -255,25 +332,35 @@ void dma_direct_free(struct device *dev, size_t size,
if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) &&
!IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) &&
- !dev_is_dma_coherent(dev)) {
+ !IS_ENABLED(CONFIG_DMA_GLOBAL_POOL) &&
+ !dev_is_dma_coherent(dev) &&
+ !is_swiotlb_for_alloc(dev)) {
arch_dma_free(dev, size, cpu_addr, dma_addr, attrs);
return;
}
+ if (IS_ENABLED(CONFIG_DMA_GLOBAL_POOL) &&
+ !dev_is_dma_coherent(dev)) {
+ if (!dma_release_from_global_coherent(page_order, cpu_addr))
+ WARN_ON_ONCE(1);
+ return;
+ }
+
/* If cpu_addr is not from an atomic pool, dma_free_from_pool() fails */
if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) &&
dma_free_from_pool(dev, cpu_addr, PAGE_ALIGN(size)))
return;
- if (force_dma_unencrypted(dev))
- set_memory_encrypted((unsigned long)cpu_addr, 1 << page_order);
-
- if (IS_ENABLED(CONFIG_DMA_REMAP) && is_vmalloc_addr(cpu_addr))
+ if (is_vmalloc_addr(cpu_addr)) {
vunmap(cpu_addr);
- else if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_CLEAR_UNCACHED))
- arch_dma_clear_uncached(cpu_addr, size);
+ } else {
+ if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_CLEAR_UNCACHED))
+ arch_dma_clear_uncached(cpu_addr, size);
+ if (dma_set_encrypted(dev, cpu_addr, size))
+ return;
+ }
- dma_free_contiguous(dev, dma_direct_to_page(dev, dma_addr), size);
+ __dma_direct_free_pages(dev, dma_direct_to_page(dev, dma_addr), size);
}
struct page *dma_direct_alloc_pages(struct device *dev, size_t size,
@@ -282,35 +369,21 @@ struct page *dma_direct_alloc_pages(struct device *dev, size_t size,
struct page *page;
void *ret;
- if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) &&
- force_dma_unencrypted(dev) && !gfpflags_allow_blocking(gfp))
+ if (force_dma_unencrypted(dev) && dma_direct_use_pool(dev, gfp))
return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp);
- page = __dma_direct_alloc_pages(dev, size, gfp);
+ page = __dma_direct_alloc_pages(dev, size, gfp, false);
if (!page)
return NULL;
- if (PageHighMem(page)) {
- /*
- * Depending on the cma= arguments and per-arch setup
- * dma_alloc_contiguous could return highmem pages.
- * Without remapping there is no way to return them here,
- * so log an error and fail.
- */
- dev_info(dev, "Rejecting highmem page from CMA.\n");
- goto out_free_pages;
- }
ret = page_address(page);
- if (force_dma_unencrypted(dev)) {
- if (set_memory_decrypted((unsigned long)ret,
- 1 << get_order(size)))
- goto out_free_pages;
- }
+ if (dma_set_decrypted(dev, ret, size))
+ goto out_free_pages;
memset(ret, 0, size);
*dma_handle = phys_to_dma_direct(dev, page_to_phys(page));
return page;
out_free_pages:
- dma_free_contiguous(dev, page, size);
+ __dma_direct_free_pages(dev, page, size);
return NULL;
}
@@ -318,7 +391,6 @@ void dma_direct_free_pages(struct device *dev, size_t size,
struct page *page, dma_addr_t dma_addr,
enum dma_data_direction dir)
{
- unsigned int page_order = get_order(size);
void *vaddr = page_address(page);
/* If cpu_addr is not from an atomic pool, dma_free_from_pool() fails */
@@ -326,10 +398,9 @@ void dma_direct_free_pages(struct device *dev, size_t size,
dma_free_from_pool(dev, vaddr, size))
return;
- if (force_dma_unencrypted(dev))
- set_memory_encrypted((unsigned long)vaddr, 1 << page_order);
-
- dma_free_contiguous(dev, page, size);
+ if (dma_set_encrypted(dev, vaddr, size))
+ return;
+ __dma_direct_free_pages(dev, page, size);
}
#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \
@@ -343,7 +414,7 @@ void dma_direct_sync_sg_for_device(struct device *dev,
for_each_sg(sgl, sg, nents, i) {
phys_addr_t paddr = dma_to_phys(dev, sg_dma_address(sg));
- if (unlikely(is_swiotlb_buffer(paddr)))
+ if (unlikely(is_swiotlb_buffer(dev, paddr)))
swiotlb_sync_single_for_device(dev, paddr, sg->length,
dir);
@@ -369,7 +440,7 @@ void dma_direct_sync_sg_for_cpu(struct device *dev,
if (!dev_is_dma_coherent(dev))
arch_sync_dma_for_cpu(paddr, sg->length, dir);
- if (unlikely(is_swiotlb_buffer(paddr)))
+ if (unlikely(is_swiotlb_buffer(dev, paddr)))
swiotlb_sync_single_for_cpu(dev, paddr, sg->length,
dir);
@@ -381,29 +452,60 @@ void dma_direct_sync_sg_for_cpu(struct device *dev,
arch_sync_dma_for_cpu_all();
}
+/*
+ * Unmaps segments, except for ones marked as pci_p2pdma which do not
+ * require any further action as they contain a bus address.
+ */
void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl,
int nents, enum dma_data_direction dir, unsigned long attrs)
{
struct scatterlist *sg;
int i;
- for_each_sg(sgl, sg, nents, i)
- dma_direct_unmap_page(dev, sg->dma_address, sg_dma_len(sg), dir,
- attrs);
+ for_each_sg(sgl, sg, nents, i) {
+ if (sg_is_dma_bus_address(sg))
+ sg_dma_unmark_bus_address(sg);
+ else
+ dma_direct_unmap_page(dev, sg->dma_address,
+ sg_dma_len(sg), dir, attrs);
+ }
}
#endif
int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
enum dma_data_direction dir, unsigned long attrs)
{
- int i;
+ struct pci_p2pdma_map_state p2pdma_state = {};
+ enum pci_p2pdma_map_type map;
struct scatterlist *sg;
+ int i, ret;
for_each_sg(sgl, sg, nents, i) {
+ if (is_pci_p2pdma_page(sg_page(sg))) {
+ map = pci_p2pdma_map_segment(&p2pdma_state, dev, sg);
+ switch (map) {
+ case PCI_P2PDMA_MAP_BUS_ADDR:
+ continue;
+ case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
+ /*
+ * Any P2P mapping that traverses the PCI
+ * host bridge must be mapped with CPU physical
+ * address and not PCI bus addresses. This is
+ * done with dma_direct_map_page() below.
+ */
+ break;
+ default:
+ ret = -EREMOTEIO;
+ goto out_unmap;
+ }
+ }
+
sg->dma_address = dma_direct_map_page(dev, sg_page(sg),
sg->offset, sg->length, dir, attrs);
- if (sg->dma_address == DMA_MAPPING_ERROR)
+ if (sg->dma_address == DMA_MAPPING_ERROR) {
+ ret = -EIO;
goto out_unmap;
+ }
sg_dma_len(sg) = sg->length;
}
@@ -411,7 +513,7 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
out_unmap:
dma_direct_unmap_sg(dev, sgl, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
- return 0;
+ return ret;
}
dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr,
@@ -459,9 +561,13 @@ int dma_direct_mmap(struct device *dev, struct vm_area_struct *vma,
int ret = -ENXIO;
vma->vm_page_prot = dma_pgprot(dev, vma->vm_page_prot, attrs);
+ if (force_dma_unencrypted(dev))
+ vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot);
if (dma_mmap_from_dev_coherent(dev, vma, cpu_addr, size, &ret))
return ret;
+ if (dma_mmap_from_global_coherent(vma, cpu_addr, size, &ret))
+ return ret;
if (vma->vm_pgoff >= count || user_count > count - vma->vm_pgoff)
return -ENXIO;
@@ -495,8 +601,8 @@ int dma_direct_supported(struct device *dev, u64 mask)
size_t dma_direct_max_mapping_size(struct device *dev)
{
/* If SWIOTLB is active, use its maximum mapping size */
- if (is_swiotlb_active() &&
- (dma_addressing_limited(dev) || swiotlb_force == SWIOTLB_FORCE))
+ if (is_swiotlb_active(dev) &&
+ (dma_addressing_limited(dev) || is_swiotlb_force_bounce(dev)))
return swiotlb_max_mapping_size(dev);
return SIZE_MAX;
}
@@ -504,7 +610,7 @@ size_t dma_direct_max_mapping_size(struct device *dev)
bool dma_direct_need_sync(struct device *dev, dma_addr_t dma_addr)
{
return !dev_is_dma_coherent(dev) ||
- is_swiotlb_buffer(dma_to_phys(dev, dma_addr));
+ is_swiotlb_buffer(dev, dma_to_phys(dev, dma_addr));
}
/**
diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h
index 50afc05b6f1d..e38ffc5e6bdd 100644
--- a/kernel/dma/direct.h
+++ b/kernel/dma/direct.h
@@ -8,6 +8,7 @@
#define _KERNEL_DMA_DIRECT_H
#include <linux/dma-direct.h>
+#include <linux/memremap.h>
int dma_direct_get_sgtable(struct device *dev, struct sg_table *sgt,
void *cpu_addr, dma_addr_t dma_addr, size_t size,
@@ -56,7 +57,7 @@ static inline void dma_direct_sync_single_for_device(struct device *dev,
{
phys_addr_t paddr = dma_to_phys(dev, addr);
- if (unlikely(is_swiotlb_buffer(paddr)))
+ if (unlikely(is_swiotlb_buffer(dev, paddr)))
swiotlb_sync_single_for_device(dev, paddr, size, dir);
if (!dev_is_dma_coherent(dev))
@@ -73,7 +74,7 @@ static inline void dma_direct_sync_single_for_cpu(struct device *dev,
arch_sync_dma_for_cpu_all();
}
- if (unlikely(is_swiotlb_buffer(paddr)))
+ if (unlikely(is_swiotlb_buffer(dev, paddr)))
swiotlb_sync_single_for_cpu(dev, paddr, size, dir);
if (dir == DMA_FROM_DEVICE)
@@ -87,11 +88,16 @@ static inline dma_addr_t dma_direct_map_page(struct device *dev,
phys_addr_t phys = page_to_phys(page) + offset;
dma_addr_t dma_addr = phys_to_dma(dev, phys);
- if (unlikely(swiotlb_force == SWIOTLB_FORCE))
+ if (is_swiotlb_force_bounce(dev)) {
+ if (is_pci_p2pdma_page(page))
+ return DMA_MAPPING_ERROR;
return swiotlb_map(dev, phys, size, dir, attrs);
+ }
if (unlikely(!dma_capable(dev, dma_addr, size, true))) {
- if (swiotlb_force != SWIOTLB_NO_FORCE)
+ if (is_pci_p2pdma_page(page))
+ return DMA_MAPPING_ERROR;
+ if (is_swiotlb_active(dev))
return swiotlb_map(dev, phys, size, dir, attrs);
dev_WARN_ONCE(dev, 1,
@@ -113,7 +119,8 @@ static inline void dma_direct_unmap_page(struct device *dev, dma_addr_t addr,
if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
dma_direct_sync_single_for_cpu(dev, addr, size, dir);
- if (unlikely(is_swiotlb_buffer(phys)))
- swiotlb_tbl_unmap_single(dev, phys, size, dir, attrs);
+ if (unlikely(is_swiotlb_buffer(dev, phys)))
+ swiotlb_tbl_unmap_single(dev, phys, size, dir,
+ attrs | DMA_ATTR_SKIP_CPU_SYNC);
}
#endif /* _KERNEL_DMA_DIRECT_H */
diff --git a/kernel/dma/dummy.c b/kernel/dma/dummy.c
index eacd4c5b10bf..b492d59ac77e 100644
--- a/kernel/dma/dummy.c
+++ b/kernel/dma/dummy.c
@@ -22,7 +22,7 @@ static int dma_dummy_map_sg(struct device *dev, struct scatterlist *sgl,
int nelems, enum dma_data_direction dir,
unsigned long attrs)
{
- return 0;
+ return -EINVAL;
}
static int dma_dummy_supported(struct device *hwdev, u64 mask)
diff --git a/kernel/dma/map_benchmark.c b/kernel/dma/map_benchmark.c
index 9b9af1bd6be3..02205ab53b7e 100644
--- a/kernel/dma/map_benchmark.c
+++ b/kernel/dma/map_benchmark.c
@@ -11,6 +11,7 @@
#include <linux/dma-mapping.h>
#include <linux/kernel.h>
#include <linux/kthread.h>
+#include <linux/map_benchmark.h>
#include <linux/math64.h>
#include <linux/module.h>
#include <linux/pci.h>
@@ -18,30 +19,6 @@
#include <linux/slab.h>
#include <linux/timekeeping.h>
-#define DMA_MAP_BENCHMARK _IOWR('d', 1, struct map_benchmark)
-#define DMA_MAP_MAX_THREADS 1024
-#define DMA_MAP_MAX_SECONDS 300
-#define DMA_MAP_MAX_TRANS_DELAY (10 * NSEC_PER_MSEC)
-
-#define DMA_MAP_BIDIRECTIONAL 0
-#define DMA_MAP_TO_DEVICE 1
-#define DMA_MAP_FROM_DEVICE 2
-
-struct map_benchmark {
- __u64 avg_map_100ns; /* average map latency in 100ns */
- __u64 map_stddev; /* standard deviation of map latency */
- __u64 avg_unmap_100ns; /* as above */
- __u64 unmap_stddev;
- __u32 threads; /* how many threads will do map/unmap in parallel */
- __u32 seconds; /* how long the test will last */
- __s32 node; /* which numa node this benchmark will run on */
- __u32 dma_bits; /* DMA addressing capability */
- __u32 dma_dir; /* DMA data direction */
- __u32 dma_trans_ns; /* time for DMA transmission in ns */
- __u32 granule; /* how many PAGE_SIZE will do map/unmap once a time */
- __u8 expansion[76]; /* For future use */
-};
-
struct map_benchmark_data {
struct map_benchmark bparam;
struct device *dev;
@@ -379,4 +356,3 @@ module_exit(map_benchmark_cleanup);
MODULE_AUTHOR("Barry Song <song.bao.hua@hisilicon.com>");
MODULE_DESCRIPTION("dma_map benchmark driver");
-MODULE_LICENSE("GPL");
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index 2b06a809d0b9..9a4db5cce600 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -10,13 +10,18 @@
#include <linux/dma-map-ops.h>
#include <linux/export.h>
#include <linux/gfp.h>
+#include <linux/kmsan.h>
#include <linux/of_device.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include "debug.h"
#include "direct.h"
-bool dma_default_coherent;
+#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \
+ defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \
+ defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL)
+bool dma_default_coherent = IS_ENABLED(CONFIG_ARCH_DMA_DEFAULT_COHERENT);
+#endif
/*
* Managed DMA API
@@ -156,7 +161,8 @@ dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page,
addr = dma_direct_map_page(dev, page, offset, size, dir, attrs);
else
addr = ops->map_page(dev, page, offset, size, dir, attrs);
- debug_dma_map_page(dev, page, offset, size, dir, addr);
+ kmsan_handle_dma(page, offset, size, dir);
+ debug_dma_map_page(dev, page, offset, size, dir, addr, attrs);
return addr;
}
@@ -177,12 +183,8 @@ void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size,
}
EXPORT_SYMBOL(dma_unmap_page_attrs);
-/*
- * dma_maps_sg_attrs returns 0 on error and > 0 on success.
- * It should never return a value < 0.
- */
-int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int nents,
- enum dma_data_direction dir, unsigned long attrs)
+static int __dma_map_sg_attrs(struct device *dev, struct scatterlist *sg,
+ int nents, enum dma_data_direction dir, unsigned long attrs)
{
const struct dma_map_ops *ops = get_dma_ops(dev);
int ents;
@@ -197,13 +199,87 @@ int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int nents,
ents = dma_direct_map_sg(dev, sg, nents, dir, attrs);
else
ents = ops->map_sg(dev, sg, nents, dir, attrs);
- BUG_ON(ents < 0);
- debug_dma_map_sg(dev, sg, nents, ents, dir);
+
+ if (ents > 0) {
+ kmsan_handle_dma_sg(sg, nents, dir);
+ debug_dma_map_sg(dev, sg, nents, ents, dir, attrs);
+ } else if (WARN_ON_ONCE(ents != -EINVAL && ents != -ENOMEM &&
+ ents != -EIO && ents != -EREMOTEIO)) {
+ return -EIO;
+ }
return ents;
}
+
+/**
+ * dma_map_sg_attrs - Map the given buffer for DMA
+ * @dev: The device for which to perform the DMA operation
+ * @sg: The sg_table object describing the buffer
+ * @nents: Number of entries to map
+ * @dir: DMA direction
+ * @attrs: Optional DMA attributes for the map operation
+ *
+ * Maps a buffer described by a scatterlist passed in the sg argument with
+ * nents segments for the @dir DMA operation by the @dev device.
+ *
+ * Returns the number of mapped entries (which can be less than nents)
+ * on success. Zero is returned for any error.
+ *
+ * dma_unmap_sg_attrs() should be used to unmap the buffer with the
+ * original sg and original nents (not the value returned by this funciton).
+ */
+unsigned int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg,
+ int nents, enum dma_data_direction dir, unsigned long attrs)
+{
+ int ret;
+
+ ret = __dma_map_sg_attrs(dev, sg, nents, dir, attrs);
+ if (ret < 0)
+ return 0;
+ return ret;
+}
EXPORT_SYMBOL(dma_map_sg_attrs);
+/**
+ * dma_map_sgtable - Map the given buffer for DMA
+ * @dev: The device for which to perform the DMA operation
+ * @sgt: The sg_table object describing the buffer
+ * @dir: DMA direction
+ * @attrs: Optional DMA attributes for the map operation
+ *
+ * Maps a buffer described by a scatterlist stored in the given sg_table
+ * object for the @dir DMA operation by the @dev device. After success, the
+ * ownership for the buffer is transferred to the DMA domain. One has to
+ * call dma_sync_sgtable_for_cpu() or dma_unmap_sgtable() to move the
+ * ownership of the buffer back to the CPU domain before touching the
+ * buffer by the CPU.
+ *
+ * Returns 0 on success or a negative error code on error. The following
+ * error codes are supported with the given meaning:
+ *
+ * -EINVAL An invalid argument, unaligned access or other error
+ * in usage. Will not succeed if retried.
+ * -ENOMEM Insufficient resources (like memory or IOVA space) to
+ * complete the mapping. Should succeed if retried later.
+ * -EIO Legacy error code with an unknown meaning. eg. this is
+ * returned if a lower level call returned
+ * DMA_MAPPING_ERROR.
+ * -EREMOTEIO The DMA device cannot access P2PDMA memory specified
+ * in the sg_table. This will not succeed if retried.
+ */
+int dma_map_sgtable(struct device *dev, struct sg_table *sgt,
+ enum dma_data_direction dir, unsigned long attrs)
+{
+ int nents;
+
+ nents = __dma_map_sg_attrs(dev, sgt->sgl, sgt->orig_nents, dir, attrs);
+ if (nents < 0)
+ return nents;
+ sgt->nents = nents;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(dma_map_sgtable);
+
void dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg,
int nents, enum dma_data_direction dir,
unsigned long attrs)
@@ -231,16 +307,12 @@ dma_addr_t dma_map_resource(struct device *dev, phys_addr_t phys_addr,
if (WARN_ON_ONCE(!dev->dma_mask))
return DMA_MAPPING_ERROR;
- /* Don't allow RAM to be mapped */
- if (WARN_ON_ONCE(pfn_valid(PHYS_PFN(phys_addr))))
- return DMA_MAPPING_ERROR;
-
if (dma_map_direct(dev, ops))
addr = dma_direct_map_resource(dev, phys_addr, size, dir, attrs);
else if (ops->map_resource)
addr = ops->map_resource(dev, phys_addr, size, dir, attrs);
- debug_dma_map_resource(dev, phys_addr, size, dir, addr);
+ debug_dma_map_resource(dev, phys_addr, size, dir, addr, attrs);
return addr;
}
EXPORT_SYMBOL(dma_map_resource);
@@ -346,8 +418,6 @@ EXPORT_SYMBOL(dma_get_sgtable_attrs);
*/
pgprot_t dma_pgprot(struct device *dev, pgprot_t prot, unsigned long attrs)
{
- if (force_dma_unencrypted(dev))
- prot = pgprot_decrypted(prot);
if (dev_is_dma_coherent(dev))
return prot;
#ifdef CONFIG_ARCH_HAS_DMA_WRITE_COMBINE
@@ -432,6 +502,14 @@ void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle,
WARN_ON_ONCE(!dev->coherent_dma_mask);
+ /*
+ * DMA allocations can never be turned back into a page pointer, so
+ * requesting compound pages doesn't make sense (and can't even be
+ * supported at all by various backends).
+ */
+ if (WARN_ON_ONCE(flag & __GFP_COMP))
+ return NULL;
+
if (dma_alloc_from_dev_coherent(dev, size, dma_handle, &cpu_addr))
return cpu_addr;
@@ -445,7 +523,7 @@ void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle,
else
return NULL;
- debug_dma_alloc_coherent(dev, size, *dma_handle, cpu_addr);
+ debug_dma_alloc_coherent(dev, size, *dma_handle, cpu_addr, attrs);
return cpu_addr;
}
EXPORT_SYMBOL(dma_alloc_attrs);
@@ -486,6 +564,8 @@ static struct page *__dma_alloc_pages(struct device *dev, size_t size,
return NULL;
if (WARN_ON_ONCE(gfp & (__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM)))
return NULL;
+ if (WARN_ON_ONCE(gfp & __GFP_COMP))
+ return NULL;
size = PAGE_ALIGN(size);
if (dma_alloc_direct(dev, ops))
@@ -501,7 +581,7 @@ struct page *dma_alloc_pages(struct device *dev, size_t size,
struct page *page = __dma_alloc_pages(dev, size, dma_handle, dir, gfp);
if (page)
- debug_dma_map_page(dev, page, 0, size, dir, *dma_handle);
+ debug_dma_map_page(dev, page, 0, size, dir, *dma_handle, 0);
return page;
}
EXPORT_SYMBOL_GPL(dma_alloc_pages);
@@ -571,6 +651,8 @@ struct sg_table *dma_alloc_noncontiguous(struct device *dev, size_t size,
if (WARN_ON_ONCE(attrs & ~DMA_ATTR_ALLOC_SINGLE_PAGES))
return NULL;
+ if (WARN_ON_ONCE(gfp & __GFP_COMP))
+ return NULL;
if (ops && ops->alloc_noncontiguous)
sgt = ops->alloc_noncontiguous(dev, size, dir, gfp, attrs);
@@ -579,7 +661,7 @@ struct sg_table *dma_alloc_noncontiguous(struct device *dev, size_t size,
if (sgt) {
sgt->nents = 1;
- debug_dma_map_sg(dev, sgt->sgl, sgt->orig_nents, 1, dir);
+ debug_dma_map_sg(dev, sgt->sgl, sgt->orig_nents, 1, dir, attrs);
}
return sgt;
}
@@ -645,7 +727,7 @@ int dma_mmap_noncontiguous(struct device *dev, struct vm_area_struct *vma,
}
EXPORT_SYMBOL_GPL(dma_mmap_noncontiguous);
-int dma_supported(struct device *dev, u64 mask)
+static int dma_supported(struct device *dev, u64 mask)
{
const struct dma_map_ops *ops = get_dma_ops(dev);
@@ -659,7 +741,24 @@ int dma_supported(struct device *dev, u64 mask)
return 1;
return ops->dma_supported(dev, mask);
}
-EXPORT_SYMBOL(dma_supported);
+
+bool dma_pci_p2pdma_supported(struct device *dev)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+
+ /* if ops is not set, dma direct will be used which supports P2PDMA */
+ if (!ops)
+ return true;
+
+ /*
+ * Note: dma_ops_bypass is not checked here because P2PDMA should
+ * not be used with dma mapping ops that do not have support even
+ * if the specific device is bypassing them.
+ */
+
+ return ops->flags & DMA_F_PCI_P2PDMA_SUPPORTED;
+}
+EXPORT_SYMBOL_GPL(dma_pci_p2pdma_supported);
#ifdef CONFIG_ARCH_HAS_DMA_SET_MASK
void arch_dma_set_mask(struct device *dev, u64 mask);
@@ -684,7 +783,6 @@ int dma_set_mask(struct device *dev, u64 mask)
}
EXPORT_SYMBOL(dma_set_mask);
-#ifndef CONFIG_ARCH_HAS_DMA_SET_COHERENT_MASK
int dma_set_coherent_mask(struct device *dev, u64 mask)
{
/*
@@ -700,7 +798,6 @@ int dma_set_coherent_mask(struct device *dev, u64 mask)
return 0;
}
EXPORT_SYMBOL(dma_set_coherent_mask);
-#endif
size_t dma_max_mapping_size(struct device *dev)
{
@@ -716,6 +813,18 @@ size_t dma_max_mapping_size(struct device *dev)
}
EXPORT_SYMBOL_GPL(dma_max_mapping_size);
+size_t dma_opt_mapping_size(struct device *dev)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+ size_t size = SIZE_MAX;
+
+ if (ops && ops->opt_mapping_size)
+ size = ops->opt_mapping_size();
+
+ return min(dma_max_mapping_size(dev), size);
+}
+EXPORT_SYMBOL_GPL(dma_opt_mapping_size);
+
bool dma_need_sync(struct device *dev, dma_addr_t dma_addr)
{
const struct dma_map_ops *ops = get_dma_ops(dev);
diff --git a/kernel/dma/ops_helpers.c b/kernel/dma/ops_helpers.c
index 910ae69cae77..af4a6ef48ce0 100644
--- a/kernel/dma/ops_helpers.c
+++ b/kernel/dma/ops_helpers.c
@@ -5,6 +5,13 @@
*/
#include <linux/dma-map-ops.h>
+static struct page *dma_common_vaddr_to_page(void *cpu_addr)
+{
+ if (is_vmalloc_addr(cpu_addr))
+ return vmalloc_to_page(cpu_addr);
+ return virt_to_page(cpu_addr);
+}
+
/*
* Create scatter-list for the already allocated DMA buffer.
*/
@@ -12,7 +19,7 @@ int dma_common_get_sgtable(struct device *dev, struct sg_table *sgt,
void *cpu_addr, dma_addr_t dma_addr, size_t size,
unsigned long attrs)
{
- struct page *page = virt_to_page(cpu_addr);
+ struct page *page = dma_common_vaddr_to_page(cpu_addr);
int ret;
ret = sg_alloc_table(sgt, 1, GFP_KERNEL);
@@ -32,6 +39,7 @@ int dma_common_mmap(struct device *dev, struct vm_area_struct *vma,
unsigned long user_count = vma_pages(vma);
unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT;
unsigned long off = vma->vm_pgoff;
+ struct page *page = dma_common_vaddr_to_page(cpu_addr);
int ret = -ENXIO;
vma->vm_page_prot = dma_pgprot(dev, vma->vm_page_prot, attrs);
@@ -43,7 +51,7 @@ int dma_common_mmap(struct device *dev, struct vm_area_struct *vma,
return -ENXIO;
return remap_pfn_range(vma, vma->vm_start,
- page_to_pfn(virt_to_page(cpu_addr)) + vma->vm_pgoff,
+ page_to_pfn(page) + vma->vm_pgoff,
user_count << PAGE_SHIFT, vma->vm_page_prot);
#else
return -ENXIO;
diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c
index 5f84e6cdb78e..1acec2e22827 100644
--- a/kernel/dma/pool.c
+++ b/kernel/dma/pool.c
@@ -84,8 +84,8 @@ static int atomic_pool_expand(struct gen_pool *pool, size_t pool_size,
void *addr;
int ret = -ENOMEM;
- /* Cannot allocate larger than MAX_ORDER-1 */
- order = min(get_order(pool_size), MAX_ORDER-1);
+ /* Cannot allocate larger than MAX_ORDER */
+ order = min(get_order(pool_size), MAX_ORDER);
do {
pool_size = 1 << (PAGE_SHIFT + order);
@@ -190,7 +190,7 @@ static int __init dma_atomic_pool_init(void)
/*
* If coherent_pool was not used on the command line, default the pool
- * sizes to 128KB per 1GB of memory, min 128KB, max MAX_ORDER-1.
+ * sizes to 128KB per 1GB of memory, min 128KB, max MAX_ORDER.
*/
if (!atomic_pool_size) {
unsigned long pages = totalram_pages() / (SZ_1G / SZ_128K);
@@ -203,7 +203,7 @@ static int __init dma_atomic_pool_init(void)
GFP_KERNEL);
if (!atomic_pool_kernel)
ret = -ENOMEM;
- if (IS_ENABLED(CONFIG_ZONE_DMA)) {
+ if (has_managed_dma()) {
atomic_pool_dma = __dma_atomic_pool_init(atomic_pool_size,
GFP_KERNEL | GFP_DMA);
if (!atomic_pool_dma)
@@ -226,7 +226,7 @@ static inline struct gen_pool *dma_guess_pool(struct gen_pool *prev, gfp_t gfp)
if (prev == NULL) {
if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32))
return atomic_pool_dma32;
- if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp & GFP_DMA))
+ if (atomic_pool_dma && (gfp & GFP_DMA))
return atomic_pool_dma;
return atomic_pool_kernel;
}
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index e50df8d8f87e..af2e304c672c 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -21,32 +21,33 @@
#define pr_fmt(fmt) "software IO TLB: " fmt
#include <linux/cache.h>
+#include <linux/cc_platform.h>
+#include <linux/ctype.h>
+#include <linux/debugfs.h>
#include <linux/dma-direct.h>
#include <linux/dma-map-ops.h>
-#include <linux/mm.h>
#include <linux/export.h>
+#include <linux/gfp.h>
+#include <linux/highmem.h>
+#include <linux/io.h>
+#include <linux/iommu-helper.h>
+#include <linux/init.h>
+#include <linux/memblock.h>
+#include <linux/mm.h>
+#include <linux/pfn.h>
+#include <linux/scatterlist.h>
+#include <linux/set_memory.h>
#include <linux/spinlock.h>
#include <linux/string.h>
#include <linux/swiotlb.h>
-#include <linux/pfn.h>
#include <linux/types.h>
-#include <linux/ctype.h>
-#include <linux/highmem.h>
-#include <linux/gfp.h>
-#include <linux/scatterlist.h>
-#include <linux/mem_encrypt.h>
-#include <linux/set_memory.h>
-#ifdef CONFIG_DEBUG_FS
-#include <linux/debugfs.h>
+#ifdef CONFIG_DMA_RESTRICTED_POOL
+#include <linux/of.h>
+#include <linux/of_fdt.h>
+#include <linux/of_reserved_mem.h>
+#include <linux/slab.h>
#endif
-#include <asm/io.h>
-#include <asm/dma.h>
-
-#include <linux/init.h>
-#include <linux/memblock.h>
-#include <linux/iommu-helper.h>
-
#define CREATE_TRACE_POINTS
#include <trace/events/swiotlb.h>
@@ -61,17 +62,74 @@
#define INVALID_PHYS_ADDR (~(phys_addr_t)0)
-enum swiotlb_force swiotlb_force;
+struct io_tlb_slot {
+ phys_addr_t orig_addr;
+ size_t alloc_size;
+ unsigned int list;
+};
+
+static bool swiotlb_force_bounce;
+static bool swiotlb_force_disable;
-struct io_tlb_mem *io_tlb_default_mem;
+struct io_tlb_mem io_tlb_default_mem;
+
+static unsigned long default_nslabs = IO_TLB_DEFAULT_SIZE >> IO_TLB_SHIFT;
+static unsigned long default_nareas;
+
+/**
+ * struct io_tlb_area - IO TLB memory area descriptor
+ *
+ * This is a single area with a single lock.
+ *
+ * @used: The number of used IO TLB block.
+ * @index: The slot index to start searching in this area for next round.
+ * @lock: The lock to protect the above data structures in the map and
+ * unmap calls.
+ */
+struct io_tlb_area {
+ unsigned long used;
+ unsigned int index;
+ spinlock_t lock;
+};
/*
- * Max segment that we can provide which (if pages are contingous) will
- * not be bounced (unless SWIOTLB_FORCE is set).
+ * Round up number of slabs to the next power of 2. The last area is going
+ * be smaller than the rest if default_nslabs is not power of two.
+ * The number of slot in an area should be a multiple of IO_TLB_SEGSIZE,
+ * otherwise a segment may span two or more areas. It conflicts with free
+ * contiguous slots tracking: free slots are treated contiguous no matter
+ * whether they cross an area boundary.
+ *
+ * Return true if default_nslabs is rounded up.
*/
-static unsigned int max_segment;
+static bool round_up_default_nslabs(void)
+{
+ if (!default_nareas)
+ return false;
+
+ if (default_nslabs < IO_TLB_SEGSIZE * default_nareas)
+ default_nslabs = IO_TLB_SEGSIZE * default_nareas;
+ else if (is_power_of_2(default_nslabs))
+ return false;
+ default_nslabs = roundup_pow_of_two(default_nslabs);
+ return true;
+}
-static unsigned long default_nslabs = IO_TLB_DEFAULT_SIZE >> IO_TLB_SHIFT;
+static void swiotlb_adjust_nareas(unsigned int nareas)
+{
+ /* use a single area when non is specified */
+ if (!nareas)
+ nareas = 1;
+ else if (!is_power_of_2(nareas))
+ nareas = roundup_pow_of_two(nareas);
+
+ default_nareas = nareas;
+
+ pr_info("area num %d.\n", nareas);
+ if (round_up_default_nslabs())
+ pr_info("SWIOTLB bounce buffer size roundup to %luMB",
+ (default_nslabs << IO_TLB_SHIFT) >> 20);
+}
static int __init
setup_io_tlb_npages(char *str)
@@ -83,29 +141,19 @@ setup_io_tlb_npages(char *str)
}
if (*str == ',')
++str;
+ if (isdigit(*str))
+ swiotlb_adjust_nareas(simple_strtoul(str, &str, 0));
+ if (*str == ',')
+ ++str;
if (!strcmp(str, "force"))
- swiotlb_force = SWIOTLB_FORCE;
+ swiotlb_force_bounce = true;
else if (!strcmp(str, "noforce"))
- swiotlb_force = SWIOTLB_NO_FORCE;
+ swiotlb_force_disable = true;
return 0;
}
early_param("swiotlb", setup_io_tlb_npages);
-unsigned int swiotlb_max_segment(void)
-{
- return io_tlb_default_mem ? max_segment : 0;
-}
-EXPORT_SYMBOL_GPL(swiotlb_max_segment);
-
-void swiotlb_set_max_segment(unsigned int val)
-{
- if (swiotlb_force == SWIOTLB_FORCE)
- max_segment = 1;
- else
- max_segment = rounddown(val, PAGE_SIZE);
-}
-
unsigned long swiotlb_size_or_default(void)
{
return default_nslabs << IO_TLB_SHIFT;
@@ -120,16 +168,19 @@ void __init swiotlb_adjust_size(unsigned long size)
*/
if (default_nslabs != IO_TLB_DEFAULT_SIZE >> IO_TLB_SHIFT)
return;
+
size = ALIGN(size, IO_TLB_SIZE);
default_nslabs = ALIGN(size >> IO_TLB_SHIFT, IO_TLB_SEGSIZE);
+ if (round_up_default_nslabs())
+ size = default_nslabs << IO_TLB_SHIFT;
pr_info("SWIOTLB bounce buffer size adjusted to %luMB", size >> 20);
}
void swiotlb_print_info(void)
{
- struct io_tlb_mem *mem = io_tlb_default_mem;
+ struct io_tlb_mem *mem = &io_tlb_default_mem;
- if (!mem) {
+ if (!mem->nslabs) {
pr_warn("No low mem\n");
return;
}
@@ -156,79 +207,142 @@ static inline unsigned long nr_slots(u64 val)
*/
void __init swiotlb_update_mem_attributes(void)
{
- struct io_tlb_mem *mem = io_tlb_default_mem;
- void *vaddr;
+ struct io_tlb_mem *mem = &io_tlb_default_mem;
unsigned long bytes;
- if (!mem || mem->late_alloc)
+ if (!mem->nslabs || mem->late_alloc)
return;
- vaddr = phys_to_virt(mem->start);
bytes = PAGE_ALIGN(mem->nslabs << IO_TLB_SHIFT);
- set_memory_decrypted((unsigned long)vaddr, bytes >> PAGE_SHIFT);
- memset(vaddr, 0, bytes);
+ set_memory_decrypted((unsigned long)mem->vaddr, bytes >> PAGE_SHIFT);
}
-int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose)
+static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start,
+ unsigned long nslabs, unsigned int flags,
+ bool late_alloc, unsigned int nareas)
{
+ void *vaddr = phys_to_virt(start);
unsigned long bytes = nslabs << IO_TLB_SHIFT, i;
- struct io_tlb_mem *mem;
- size_t alloc_size;
- if (swiotlb_force == SWIOTLB_NO_FORCE)
- return 0;
-
- /* protect against double initialization */
- if (WARN_ON_ONCE(io_tlb_default_mem))
- return -ENOMEM;
-
- alloc_size = PAGE_ALIGN(struct_size(mem, slots, nslabs));
- mem = memblock_alloc(alloc_size, PAGE_SIZE);
- if (!mem)
- panic("%s: Failed to allocate %zu bytes align=0x%lx\n",
- __func__, alloc_size, PAGE_SIZE);
mem->nslabs = nslabs;
- mem->start = __pa(tlb);
+ mem->start = start;
mem->end = mem->start + bytes;
- mem->index = 0;
- spin_lock_init(&mem->lock);
+ mem->late_alloc = late_alloc;
+ mem->nareas = nareas;
+ mem->area_nslabs = nslabs / mem->nareas;
+
+ mem->force_bounce = swiotlb_force_bounce || (flags & SWIOTLB_FORCE);
+
+ for (i = 0; i < mem->nareas; i++) {
+ spin_lock_init(&mem->areas[i].lock);
+ mem->areas[i].index = 0;
+ mem->areas[i].used = 0;
+ }
+
for (i = 0; i < mem->nslabs; i++) {
mem->slots[i].list = IO_TLB_SEGSIZE - io_tlb_offset(i);
mem->slots[i].orig_addr = INVALID_PHYS_ADDR;
mem->slots[i].alloc_size = 0;
}
- io_tlb_default_mem = mem;
- if (verbose)
- swiotlb_print_info();
- swiotlb_set_max_segment(mem->nslabs << IO_TLB_SHIFT);
- return 0;
+ memset(vaddr, 0, bytes);
+ mem->vaddr = vaddr;
+ return;
+}
+
+static void __init *swiotlb_memblock_alloc(unsigned long nslabs,
+ unsigned int flags,
+ int (*remap)(void *tlb, unsigned long nslabs))
+{
+ size_t bytes = PAGE_ALIGN(nslabs << IO_TLB_SHIFT);
+ void *tlb;
+
+ /*
+ * By default allocate the bounce buffer memory from low memory, but
+ * allow to pick a location everywhere for hypervisors with guest
+ * memory encryption.
+ */
+ if (flags & SWIOTLB_ANY)
+ tlb = memblock_alloc(bytes, PAGE_SIZE);
+ else
+ tlb = memblock_alloc_low(bytes, PAGE_SIZE);
+
+ if (!tlb) {
+ pr_warn("%s: Failed to allocate %zu bytes tlb structure\n",
+ __func__, bytes);
+ return NULL;
+ }
+
+ if (remap && remap(tlb, nslabs) < 0) {
+ memblock_free(tlb, PAGE_ALIGN(bytes));
+ pr_warn("%s: Failed to remap %zu bytes\n", __func__, bytes);
+ return NULL;
+ }
+
+ return tlb;
}
/*
* Statically reserve bounce buffer space and initialize bounce buffer data
* structures for the software IO TLB used to implement the DMA API.
*/
-void __init
-swiotlb_init(int verbose)
+void __init swiotlb_init_remap(bool addressing_limit, unsigned int flags,
+ int (*remap)(void *tlb, unsigned long nslabs))
{
- size_t bytes = PAGE_ALIGN(default_nslabs << IO_TLB_SHIFT);
+ struct io_tlb_mem *mem = &io_tlb_default_mem;
+ unsigned long nslabs;
+ size_t alloc_size;
void *tlb;
- if (swiotlb_force == SWIOTLB_NO_FORCE)
+ if (!addressing_limit && !swiotlb_force_bounce)
+ return;
+ if (swiotlb_force_disable)
return;
- /* Get IO TLB memory from the low pages */
- tlb = memblock_alloc_low(bytes, PAGE_SIZE);
- if (!tlb)
- goto fail;
- if (swiotlb_init_with_tbl(tlb, default_nslabs, verbose))
- goto fail_free_mem;
- return;
+ /*
+ * default_nslabs maybe changed when adjust area number.
+ * So allocate bounce buffer after adjusting area number.
+ */
+ if (!default_nareas)
+ swiotlb_adjust_nareas(num_possible_cpus());
+
+ nslabs = default_nslabs;
+ while ((tlb = swiotlb_memblock_alloc(nslabs, flags, remap)) == NULL) {
+ if (nslabs <= IO_TLB_MIN_SLABS)
+ return;
+ nslabs = ALIGN(nslabs >> 1, IO_TLB_SEGSIZE);
+ }
+
+ if (default_nslabs != nslabs) {
+ pr_info("SWIOTLB bounce buffer size adjusted %lu -> %lu slabs",
+ default_nslabs, nslabs);
+ default_nslabs = nslabs;
+ }
+
+ alloc_size = PAGE_ALIGN(array_size(sizeof(*mem->slots), nslabs));
+ mem->slots = memblock_alloc(alloc_size, PAGE_SIZE);
+ if (!mem->slots) {
+ pr_warn("%s: Failed to allocate %zu bytes align=0x%lx\n",
+ __func__, alloc_size, PAGE_SIZE);
+ return;
+ }
+
+ mem->areas = memblock_alloc(array_size(sizeof(struct io_tlb_area),
+ default_nareas), SMP_CACHE_BYTES);
+ if (!mem->areas) {
+ pr_warn("%s: Failed to allocate mem->areas.\n", __func__);
+ return;
+ }
+
+ swiotlb_init_io_tlb_mem(mem, __pa(tlb), nslabs, flags, false,
+ default_nareas);
+
+ if (flags & SWIOTLB_VERBOSE)
+ swiotlb_print_info();
+}
-fail_free_mem:
- memblock_free_early(__pa(tlb), bytes);
-fail:
- pr_warn("Cannot allocate buffer");
+void __init swiotlb_init(bool addressing_limit, unsigned int flags)
+{
+ swiotlb_init_remap(addressing_limit, flags, NULL);
}
/*
@@ -236,102 +350,116 @@ fail:
* initialize the swiotlb later using the slab allocator if needed.
* This should be just like above, but with some error catching.
*/
-int
-swiotlb_late_init_with_default_size(size_t default_size)
+int swiotlb_init_late(size_t size, gfp_t gfp_mask,
+ int (*remap)(void *tlb, unsigned long nslabs))
{
- unsigned long nslabs =
- ALIGN(default_size >> IO_TLB_SHIFT, IO_TLB_SEGSIZE);
- unsigned long bytes;
+ struct io_tlb_mem *mem = &io_tlb_default_mem;
+ unsigned long nslabs = ALIGN(size >> IO_TLB_SHIFT, IO_TLB_SEGSIZE);
unsigned char *vstart = NULL;
- unsigned int order;
+ unsigned int order, area_order;
+ bool retried = false;
int rc = 0;
- if (swiotlb_force == SWIOTLB_NO_FORCE)
+ if (swiotlb_force_disable)
return 0;
- /*
- * Get IO TLB memory from the low pages
- */
+retry:
order = get_order(nslabs << IO_TLB_SHIFT);
nslabs = SLABS_PER_PAGE << order;
- bytes = nslabs << IO_TLB_SHIFT;
while ((SLABS_PER_PAGE << order) > IO_TLB_MIN_SLABS) {
- vstart = (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN,
+ vstart = (void *)__get_free_pages(gfp_mask | __GFP_NOWARN,
order);
if (vstart)
break;
order--;
+ nslabs = SLABS_PER_PAGE << order;
+ retried = true;
}
if (!vstart)
return -ENOMEM;
- if (order != get_order(bytes)) {
- pr_warn("only able to allocate %ld MB\n",
- (PAGE_SIZE << order) >> 20);
- nslabs = SLABS_PER_PAGE << order;
- }
- rc = swiotlb_late_init_with_tbl(vstart, nslabs);
- if (rc)
+ if (remap)
+ rc = remap(vstart, nslabs);
+ if (rc) {
free_pages((unsigned long)vstart, order);
- return rc;
-}
-
-int
-swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs)
-{
- unsigned long bytes = nslabs << IO_TLB_SHIFT, i;
- struct io_tlb_mem *mem;
+ nslabs = ALIGN(nslabs >> 1, IO_TLB_SEGSIZE);
+ if (nslabs < IO_TLB_MIN_SLABS)
+ return rc;
+ retried = true;
+ goto retry;
+ }
- if (swiotlb_force == SWIOTLB_NO_FORCE)
- return 0;
+ if (retried) {
+ pr_warn("only able to allocate %ld MB\n",
+ (PAGE_SIZE << order) >> 20);
+ }
- /* protect against double initialization */
- if (WARN_ON_ONCE(io_tlb_default_mem))
- return -ENOMEM;
+ if (!default_nareas)
+ swiotlb_adjust_nareas(num_possible_cpus());
- mem = (void *)__get_free_pages(GFP_KERNEL,
- get_order(struct_size(mem, slots, nslabs)));
- if (!mem)
- return -ENOMEM;
+ area_order = get_order(array_size(sizeof(*mem->areas),
+ default_nareas));
+ mem->areas = (struct io_tlb_area *)
+ __get_free_pages(GFP_KERNEL | __GFP_ZERO, area_order);
+ if (!mem->areas)
+ goto error_area;
- mem->nslabs = nslabs;
- mem->start = virt_to_phys(tlb);
- mem->end = mem->start + bytes;
- mem->index = 0;
- mem->late_alloc = 1;
- spin_lock_init(&mem->lock);
- for (i = 0; i < mem->nslabs; i++) {
- mem->slots[i].list = IO_TLB_SEGSIZE - io_tlb_offset(i);
- mem->slots[i].orig_addr = INVALID_PHYS_ADDR;
- mem->slots[i].alloc_size = 0;
- }
+ mem->slots = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
+ get_order(array_size(sizeof(*mem->slots), nslabs)));
+ if (!mem->slots)
+ goto error_slots;
- set_memory_decrypted((unsigned long)tlb, bytes >> PAGE_SHIFT);
- memset(tlb, 0, bytes);
+ set_memory_decrypted((unsigned long)vstart,
+ (nslabs << IO_TLB_SHIFT) >> PAGE_SHIFT);
+ swiotlb_init_io_tlb_mem(mem, virt_to_phys(vstart), nslabs, 0, true,
+ default_nareas);
- io_tlb_default_mem = mem;
swiotlb_print_info();
- swiotlb_set_max_segment(mem->nslabs << IO_TLB_SHIFT);
return 0;
+
+error_slots:
+ free_pages((unsigned long)mem->areas, area_order);
+error_area:
+ free_pages((unsigned long)vstart, order);
+ return -ENOMEM;
}
void __init swiotlb_exit(void)
{
- struct io_tlb_mem *mem = io_tlb_default_mem;
- size_t size;
+ struct io_tlb_mem *mem = &io_tlb_default_mem;
+ unsigned long tbl_vaddr;
+ size_t tbl_size, slots_size;
+ unsigned int area_order;
- if (!mem)
+ if (swiotlb_force_bounce)
return;
- size = struct_size(mem, slots, mem->nslabs);
- if (mem->late_alloc)
- free_pages((unsigned long)mem, get_order(size));
- else
- memblock_free_late(__pa(mem), PAGE_ALIGN(size));
- io_tlb_default_mem = NULL;
+ if (!mem->nslabs)
+ return;
+
+ pr_info("tearing down default memory pool\n");
+ tbl_vaddr = (unsigned long)phys_to_virt(mem->start);
+ tbl_size = PAGE_ALIGN(mem->end - mem->start);
+ slots_size = PAGE_ALIGN(array_size(sizeof(*mem->slots), mem->nslabs));
+
+ set_memory_encrypted(tbl_vaddr, tbl_size >> PAGE_SHIFT);
+ if (mem->late_alloc) {
+ area_order = get_order(array_size(sizeof(*mem->areas),
+ mem->nareas));
+ free_pages((unsigned long)mem->areas, area_order);
+ free_pages(tbl_vaddr, get_order(tbl_size));
+ free_pages((unsigned long)mem->slots, get_order(slots_size));
+ } else {
+ memblock_free_late(__pa(mem->areas),
+ array_size(sizeof(*mem->areas), mem->nareas));
+ memblock_free_late(mem->start, tbl_size);
+ memblock_free_late(__pa(mem->slots), slots_size);
+ }
+
+ memset(mem, 0, sizeof(*mem));
}
/*
@@ -348,19 +476,33 @@ static unsigned int swiotlb_align_offset(struct device *dev, u64 addr)
static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size,
enum dma_data_direction dir)
{
- struct io_tlb_mem *mem = io_tlb_default_mem;
+ struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
int index = (tlb_addr - mem->start) >> IO_TLB_SHIFT;
phys_addr_t orig_addr = mem->slots[index].orig_addr;
size_t alloc_size = mem->slots[index].alloc_size;
unsigned long pfn = PFN_DOWN(orig_addr);
- unsigned char *vaddr = phys_to_virt(tlb_addr);
- unsigned int tlb_offset;
+ unsigned char *vaddr = mem->vaddr + tlb_addr - mem->start;
+ unsigned int tlb_offset, orig_addr_offset;
if (orig_addr == INVALID_PHYS_ADDR)
return;
- tlb_offset = (tlb_addr & (IO_TLB_SIZE - 1)) -
- swiotlb_align_offset(dev, orig_addr);
+ tlb_offset = tlb_addr & (IO_TLB_SIZE - 1);
+ orig_addr_offset = swiotlb_align_offset(dev, orig_addr);
+ if (tlb_offset < orig_addr_offset) {
+ dev_WARN_ONCE(dev, 1,
+ "Access before mapping start detected. orig offset %u, requested offset %u.\n",
+ orig_addr_offset, tlb_offset);
+ return;
+ }
+
+ tlb_offset -= orig_addr_offset;
+ if (tlb_offset > alloc_size) {
+ dev_WARN_ONCE(dev, 1,
+ "Buffer overflow detected. Allocation size: %zu. Mapping size: %zu+%u.\n",
+ alloc_size, size, tlb_offset);
+ return;
+ }
orig_addr += tlb_offset;
alloc_size -= tlb_offset;
@@ -373,9 +515,8 @@ static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size
}
if (PageHighMem(pfn_to_page(pfn))) {
- /* The buffer does not have a mapping. Map it in and copy */
unsigned int offset = orig_addr & ~PAGE_MASK;
- char *buffer;
+ struct page *page;
unsigned int sz = 0;
unsigned long flags;
@@ -383,12 +524,11 @@ static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size
sz = min_t(size_t, PAGE_SIZE - offset, size);
local_irq_save(flags);
- buffer = kmap_atomic(pfn_to_page(pfn));
+ page = pfn_to_page(pfn);
if (dir == DMA_TO_DEVICE)
- memcpy(vaddr, buffer + offset, sz);
+ memcpy_from_page(vaddr, page, offset, sz);
else
- memcpy(buffer + offset, vaddr, sz);
- kunmap_atomic(buffer);
+ memcpy_to_page(page, offset, vaddr, sz);
local_irq_restore(flags);
size -= sz;
@@ -403,7 +543,10 @@ static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size
}
}
-#define slot_addr(start, idx) ((start) + ((idx) << IO_TLB_SHIFT))
+static inline phys_addr_t slot_addr(phys_addr_t start, phys_addr_t idx)
+{
+ return start + (idx << IO_TLB_SHIFT);
+}
/*
* Carefully handle integer overflow which can occur when boundary_mask == ~0UL.
@@ -415,51 +558,102 @@ static inline unsigned long get_max_slots(unsigned long boundary_mask)
return nr_slots(boundary_mask + 1);
}
-static unsigned int wrap_index(struct io_tlb_mem *mem, unsigned int index)
+static unsigned int wrap_area_index(struct io_tlb_mem *mem, unsigned int index)
{
- if (index >= mem->nslabs)
+ if (index >= mem->area_nslabs)
return 0;
return index;
}
/*
+ * Track the total used slots with a global atomic value in order to have
+ * correct information to determine the high water mark. The mem_used()
+ * function gives imprecise results because there's no locking across
+ * multiple areas.
+ */
+#ifdef CONFIG_DEBUG_FS
+static void inc_used_and_hiwater(struct io_tlb_mem *mem, unsigned int nslots)
+{
+ unsigned long old_hiwater, new_used;
+
+ new_used = atomic_long_add_return(nslots, &mem->total_used);
+ old_hiwater = atomic_long_read(&mem->used_hiwater);
+ do {
+ if (new_used <= old_hiwater)
+ break;
+ } while (!atomic_long_try_cmpxchg(&mem->used_hiwater,
+ &old_hiwater, new_used));
+}
+
+static void dec_used(struct io_tlb_mem *mem, unsigned int nslots)
+{
+ atomic_long_sub(nslots, &mem->total_used);
+}
+
+#else /* !CONFIG_DEBUG_FS */
+static void inc_used_and_hiwater(struct io_tlb_mem *mem, unsigned int nslots)
+{
+}
+static void dec_used(struct io_tlb_mem *mem, unsigned int nslots)
+{
+}
+#endif /* CONFIG_DEBUG_FS */
+
+/*
* Find a suitable number of IO TLB entries size that will fit this request and
* allocate a buffer from that IO TLB pool.
*/
-static int find_slots(struct device *dev, phys_addr_t orig_addr,
- size_t alloc_size)
+static int swiotlb_do_find_slots(struct device *dev, int area_index,
+ phys_addr_t orig_addr, size_t alloc_size,
+ unsigned int alloc_align_mask)
{
- struct io_tlb_mem *mem = io_tlb_default_mem;
+ struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
+ struct io_tlb_area *area = mem->areas + area_index;
unsigned long boundary_mask = dma_get_seg_boundary(dev);
dma_addr_t tbl_dma_addr =
phys_to_dma_unencrypted(dev, mem->start) & boundary_mask;
unsigned long max_slots = get_max_slots(boundary_mask);
unsigned int iotlb_align_mask =
- dma_get_min_align_mask(dev) & ~(IO_TLB_SIZE - 1);
+ dma_get_min_align_mask(dev) | alloc_align_mask;
unsigned int nslots = nr_slots(alloc_size), stride;
- unsigned int index, wrap, count = 0, i;
+ unsigned int offset = swiotlb_align_offset(dev, orig_addr);
+ unsigned int index, slots_checked, count = 0, i;
unsigned long flags;
+ unsigned int slot_base;
+ unsigned int slot_index;
BUG_ON(!nslots);
+ BUG_ON(area_index >= mem->nareas);
+
+ /*
+ * For allocations of PAGE_SIZE or larger only look for page aligned
+ * allocations.
+ */
+ if (alloc_size >= PAGE_SIZE)
+ iotlb_align_mask |= ~PAGE_MASK;
+ iotlb_align_mask &= ~(IO_TLB_SIZE - 1);
/*
* For mappings with an alignment requirement don't bother looping to
- * unaligned slots once we found an aligned one. For allocations of
- * PAGE_SIZE or larger only look for page aligned allocations.
+ * unaligned slots once we found an aligned one.
*/
stride = (iotlb_align_mask >> IO_TLB_SHIFT) + 1;
- if (alloc_size >= PAGE_SIZE)
- stride = max(stride, stride << (PAGE_SHIFT - IO_TLB_SHIFT));
- spin_lock_irqsave(&mem->lock, flags);
- if (unlikely(nslots > mem->nslabs - mem->used))
+ spin_lock_irqsave(&area->lock, flags);
+ if (unlikely(nslots > mem->area_nslabs - area->used))
goto not_found;
- index = wrap = wrap_index(mem, ALIGN(mem->index, stride));
- do {
- if ((slot_addr(tbl_dma_addr, index) & iotlb_align_mask) !=
- (orig_addr & iotlb_align_mask)) {
- index = wrap_index(mem, index + 1);
+ slot_base = area_index * mem->area_nslabs;
+ index = area->index;
+
+ for (slots_checked = 0; slots_checked < mem->area_nslabs; ) {
+ slot_index = slot_base + index;
+
+ if (orig_addr &&
+ (slot_addr(tbl_dma_addr, slot_index) &
+ iotlb_align_mask) != (orig_addr & iotlb_align_mask)) {
+ index = wrap_area_index(mem, index + 1);
+ slots_checked++;
continue;
}
@@ -468,23 +662,27 @@ static int find_slots(struct device *dev, phys_addr_t orig_addr,
* contiguous buffers, we allocate the buffers from that slot
* and mark the entries as '0' indicating unavailable.
*/
- if (!iommu_is_span_boundary(index, nslots,
+ if (!iommu_is_span_boundary(slot_index, nslots,
nr_slots(tbl_dma_addr),
max_slots)) {
- if (mem->slots[index].list >= nslots)
+ if (mem->slots[slot_index].list >= nslots)
goto found;
}
- index = wrap_index(mem, index + stride);
- } while (index != wrap);
+ index = wrap_area_index(mem, index + stride);
+ slots_checked += stride;
+ }
not_found:
- spin_unlock_irqrestore(&mem->lock, flags);
+ spin_unlock_irqrestore(&area->lock, flags);
return -1;
found:
- for (i = index; i < index + nslots; i++)
+ for (i = slot_index; i < slot_index + nslots; i++) {
mem->slots[i].list = 0;
- for (i = index - 1;
+ mem->slots[i].alloc_size = alloc_size - (offset +
+ ((i - slot_index) << IO_TLB_SHIFT));
+ }
+ for (i = slot_index - 1;
io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 &&
mem->slots[i].list; i--)
mem->slots[i].list = ++count;
@@ -492,30 +690,61 @@ found:
/*
* Update the indices to avoid searching in the next round.
*/
- if (index + nslots < mem->nslabs)
- mem->index = index + nslots;
- else
- mem->index = 0;
- mem->used += nslots;
+ area->index = wrap_area_index(mem, index + nslots);
+ area->used += nslots;
+ spin_unlock_irqrestore(&area->lock, flags);
- spin_unlock_irqrestore(&mem->lock, flags);
- return index;
+ inc_used_and_hiwater(mem, nslots);
+ return slot_index;
+}
+
+static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr,
+ size_t alloc_size, unsigned int alloc_align_mask)
+{
+ struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
+ int start = raw_smp_processor_id() & (mem->nareas - 1);
+ int i = start, index;
+
+ do {
+ index = swiotlb_do_find_slots(dev, i, orig_addr, alloc_size,
+ alloc_align_mask);
+ if (index >= 0)
+ return index;
+ if (++i >= mem->nareas)
+ i = 0;
+ } while (i != start);
+
+ return -1;
+}
+
+static unsigned long mem_used(struct io_tlb_mem *mem)
+{
+ int i;
+ unsigned long used = 0;
+
+ for (i = 0; i < mem->nareas; i++)
+ used += mem->areas[i].used;
+ return used;
}
phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
size_t mapping_size, size_t alloc_size,
- enum dma_data_direction dir, unsigned long attrs)
+ unsigned int alloc_align_mask, enum dma_data_direction dir,
+ unsigned long attrs)
{
- struct io_tlb_mem *mem = io_tlb_default_mem;
+ struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
unsigned int offset = swiotlb_align_offset(dev, orig_addr);
unsigned int i;
int index;
phys_addr_t tlb_addr;
- if (!mem)
- panic("Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer");
+ if (!mem || !mem->nslabs) {
+ dev_warn_ratelimited(dev,
+ "Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer");
+ return (phys_addr_t)DMA_MAPPING_ERROR;
+ }
- if (mem_encrypt_active())
+ if (cc_platform_has(CC_ATTR_MEM_ENCRYPT))
pr_warn_once("Memory encryption is active and system is using DMA bounce buffers\n");
if (mapping_size > alloc_size) {
@@ -524,12 +753,13 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
return (phys_addr_t)DMA_MAPPING_ERROR;
}
- index = find_slots(dev, orig_addr, alloc_size + offset);
+ index = swiotlb_find_slots(dev, orig_addr,
+ alloc_size + offset, alloc_align_mask);
if (index == -1) {
if (!(attrs & DMA_ATTR_NO_WARN))
dev_warn_ratelimited(dev,
"swiotlb buffer is full (sz: %zd bytes), total %lu (slots), used %lu (slots)\n",
- alloc_size, mem->nslabs, mem->used);
+ alloc_size, mem->nslabs, mem_used(mem));
return (phys_addr_t)DMA_MAPPING_ERROR;
}
@@ -538,46 +768,40 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
* This is needed when we sync the memory. Then we sync the buffer if
* needed.
*/
- for (i = 0; i < nr_slots(alloc_size + offset); i++) {
+ for (i = 0; i < nr_slots(alloc_size + offset); i++)
mem->slots[index + i].orig_addr = slot_addr(orig_addr, i);
- mem->slots[index + i].alloc_size =
- alloc_size - (i << IO_TLB_SHIFT);
- }
tlb_addr = slot_addr(mem->start, index) + offset;
- if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
- (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL))
- swiotlb_bounce(dev, tlb_addr, mapping_size, DMA_TO_DEVICE);
+ /*
+ * When dir == DMA_FROM_DEVICE we could omit the copy from the orig
+ * to the tlb buffer, if we knew for sure the device will
+ * overwrite the entire current content. But we don't. Thus
+ * unconditional bounce may prevent leaking swiotlb content (i.e.
+ * kernel memory) to user-space.
+ */
+ swiotlb_bounce(dev, tlb_addr, mapping_size, DMA_TO_DEVICE);
return tlb_addr;
}
-/*
- * tlb_addr is the physical address of the bounce buffer to unmap.
- */
-void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr,
- size_t mapping_size, enum dma_data_direction dir,
- unsigned long attrs)
+static void swiotlb_release_slots(struct device *dev, phys_addr_t tlb_addr)
{
- struct io_tlb_mem *mem = io_tlb_default_mem;
+ struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
unsigned long flags;
- unsigned int offset = swiotlb_align_offset(hwdev, tlb_addr);
+ unsigned int offset = swiotlb_align_offset(dev, tlb_addr);
int index = (tlb_addr - offset - mem->start) >> IO_TLB_SHIFT;
int nslots = nr_slots(mem->slots[index].alloc_size + offset);
+ int aindex = index / mem->area_nslabs;
+ struct io_tlb_area *area = &mem->areas[aindex];
int count, i;
/*
- * First, sync the memory before unmapping the entry
- */
- if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
- (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL))
- swiotlb_bounce(hwdev, tlb_addr, mapping_size, DMA_FROM_DEVICE);
-
- /*
* Return the buffer to the free list by setting the corresponding
* entries to indicate the number of contiguous entries available.
* While returning the entries to the free list, we merge the entries
* with slots below and above the pool being returned.
*/
- spin_lock_irqsave(&mem->lock, flags);
+ BUG_ON(aindex >= mem->nareas);
+
+ spin_lock_irqsave(&area->lock, flags);
if (index + nslots < ALIGN(index + 1, IO_TLB_SEGSIZE))
count = mem->slots[index + nslots].list;
else
@@ -601,8 +825,27 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr,
io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 && mem->slots[i].list;
i--)
mem->slots[i].list = ++count;
- mem->used -= nslots;
- spin_unlock_irqrestore(&mem->lock, flags);
+ area->used -= nslots;
+ spin_unlock_irqrestore(&area->lock, flags);
+
+ dec_used(mem, nslots);
+}
+
+/*
+ * tlb_addr is the physical address of the bounce buffer to unmap.
+ */
+void swiotlb_tbl_unmap_single(struct device *dev, phys_addr_t tlb_addr,
+ size_t mapping_size, enum dma_data_direction dir,
+ unsigned long attrs)
+{
+ /*
+ * First, sync the memory before unmapping the entry
+ */
+ if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
+ (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL))
+ swiotlb_bounce(dev, tlb_addr, mapping_size, DMA_FROM_DEVICE);
+
+ swiotlb_release_slots(dev, tlb_addr);
}
void swiotlb_sync_single_for_device(struct device *dev, phys_addr_t tlb_addr,
@@ -633,10 +876,9 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t paddr, size_t size,
phys_addr_t swiotlb_addr;
dma_addr_t dma_addr;
- trace_swiotlb_bounced(dev, phys_to_dma(dev, paddr), size,
- swiotlb_force);
+ trace_swiotlb_bounced(dev, phys_to_dma(dev, paddr), size);
- swiotlb_addr = swiotlb_tbl_map_single(dev, paddr, size, size, dir,
+ swiotlb_addr = swiotlb_tbl_map_single(dev, paddr, size, size, 0, dir,
attrs);
if (swiotlb_addr == (phys_addr_t)DMA_MAPPING_ERROR)
return DMA_MAPPING_ERROR;
@@ -659,29 +901,208 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t paddr, size_t size,
size_t swiotlb_max_mapping_size(struct device *dev)
{
- return ((size_t)IO_TLB_SIZE) * IO_TLB_SEGSIZE;
+ int min_align_mask = dma_get_min_align_mask(dev);
+ int min_align = 0;
+
+ /*
+ * swiotlb_find_slots() skips slots according to
+ * min align mask. This affects max mapping size.
+ * Take it into acount here.
+ */
+ if (min_align_mask)
+ min_align = roundup(min_align_mask, IO_TLB_SIZE);
+
+ return ((size_t)IO_TLB_SIZE) * IO_TLB_SEGSIZE - min_align;
}
-bool is_swiotlb_active(void)
+bool is_swiotlb_active(struct device *dev)
{
- return io_tlb_default_mem != NULL;
+ struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
+
+ return mem && mem->nslabs;
}
EXPORT_SYMBOL_GPL(is_swiotlb_active);
#ifdef CONFIG_DEBUG_FS
-static int __init swiotlb_create_debugfs(void)
+static int io_tlb_used_get(void *data, u64 *val)
{
- struct io_tlb_mem *mem = io_tlb_default_mem;
+ struct io_tlb_mem *mem = data;
+
+ *val = mem_used(mem);
+ return 0;
+}
+
+static int io_tlb_hiwater_get(void *data, u64 *val)
+{
+ struct io_tlb_mem *mem = data;
+
+ *val = atomic_long_read(&mem->used_hiwater);
+ return 0;
+}
+
+static int io_tlb_hiwater_set(void *data, u64 val)
+{
+ struct io_tlb_mem *mem = data;
+
+ /* Only allow setting to zero */
+ if (val != 0)
+ return -EINVAL;
+
+ atomic_long_set(&mem->used_hiwater, val);
+ return 0;
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(fops_io_tlb_used, io_tlb_used_get, NULL, "%llu\n");
+DEFINE_DEBUGFS_ATTRIBUTE(fops_io_tlb_hiwater, io_tlb_hiwater_get,
+ io_tlb_hiwater_set, "%llu\n");
+
+static void swiotlb_create_debugfs_files(struct io_tlb_mem *mem,
+ const char *dirname)
+{
+ atomic_long_set(&mem->total_used, 0);
+ atomic_long_set(&mem->used_hiwater, 0);
+
+ mem->debugfs = debugfs_create_dir(dirname, io_tlb_default_mem.debugfs);
+ if (!mem->nslabs)
+ return;
- if (!mem)
- return 0;
- mem->debugfs = debugfs_create_dir("swiotlb", NULL);
debugfs_create_ulong("io_tlb_nslabs", 0400, mem->debugfs, &mem->nslabs);
- debugfs_create_ulong("io_tlb_used", 0400, mem->debugfs, &mem->used);
+ debugfs_create_file("io_tlb_used", 0400, mem->debugfs, mem,
+ &fops_io_tlb_used);
+ debugfs_create_file("io_tlb_used_hiwater", 0600, mem->debugfs, mem,
+ &fops_io_tlb_hiwater);
+}
+
+static int __init swiotlb_create_default_debugfs(void)
+{
+ swiotlb_create_debugfs_files(&io_tlb_default_mem, "swiotlb");
return 0;
}
-late_initcall(swiotlb_create_debugfs);
+late_initcall(swiotlb_create_default_debugfs);
-#endif
+#else /* !CONFIG_DEBUG_FS */
+
+static inline void swiotlb_create_debugfs_files(struct io_tlb_mem *mem,
+ const char *dirname)
+{
+}
+
+#endif /* CONFIG_DEBUG_FS */
+
+#ifdef CONFIG_DMA_RESTRICTED_POOL
+
+struct page *swiotlb_alloc(struct device *dev, size_t size)
+{
+ struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
+ phys_addr_t tlb_addr;
+ int index;
+
+ if (!mem)
+ return NULL;
+
+ index = swiotlb_find_slots(dev, 0, size, 0);
+ if (index == -1)
+ return NULL;
+
+ tlb_addr = slot_addr(mem->start, index);
+
+ return pfn_to_page(PFN_DOWN(tlb_addr));
+}
+
+bool swiotlb_free(struct device *dev, struct page *page, size_t size)
+{
+ phys_addr_t tlb_addr = page_to_phys(page);
+
+ if (!is_swiotlb_buffer(dev, tlb_addr))
+ return false;
+
+ swiotlb_release_slots(dev, tlb_addr);
+
+ return true;
+}
+
+static int rmem_swiotlb_device_init(struct reserved_mem *rmem,
+ struct device *dev)
+{
+ struct io_tlb_mem *mem = rmem->priv;
+ unsigned long nslabs = rmem->size >> IO_TLB_SHIFT;
+
+ /* Set Per-device io tlb area to one */
+ unsigned int nareas = 1;
+
+ if (PageHighMem(pfn_to_page(PHYS_PFN(rmem->base)))) {
+ dev_err(dev, "Restricted DMA pool must be accessible within the linear mapping.");
+ return -EINVAL;
+ }
+
+ /*
+ * Since multiple devices can share the same pool, the private data,
+ * io_tlb_mem struct, will be initialized by the first device attached
+ * to it.
+ */
+ if (!mem) {
+ mem = kzalloc(sizeof(*mem), GFP_KERNEL);
+ if (!mem)
+ return -ENOMEM;
+
+ mem->slots = kcalloc(nslabs, sizeof(*mem->slots), GFP_KERNEL);
+ if (!mem->slots) {
+ kfree(mem);
+ return -ENOMEM;
+ }
+
+ mem->areas = kcalloc(nareas, sizeof(*mem->areas),
+ GFP_KERNEL);
+ if (!mem->areas) {
+ kfree(mem->slots);
+ kfree(mem);
+ return -ENOMEM;
+ }
+
+ set_memory_decrypted((unsigned long)phys_to_virt(rmem->base),
+ rmem->size >> PAGE_SHIFT);
+ swiotlb_init_io_tlb_mem(mem, rmem->base, nslabs, SWIOTLB_FORCE,
+ false, nareas);
+ mem->for_alloc = true;
+
+ rmem->priv = mem;
+
+ swiotlb_create_debugfs_files(mem, rmem->name);
+ }
+
+ dev->dma_io_tlb_mem = mem;
+
+ return 0;
+}
+
+static void rmem_swiotlb_device_release(struct reserved_mem *rmem,
+ struct device *dev)
+{
+ dev->dma_io_tlb_mem = &io_tlb_default_mem;
+}
+
+static const struct reserved_mem_ops rmem_swiotlb_ops = {
+ .device_init = rmem_swiotlb_device_init,
+ .device_release = rmem_swiotlb_device_release,
+};
+
+static int __init rmem_swiotlb_setup(struct reserved_mem *rmem)
+{
+ unsigned long node = rmem->fdt_node;
+
+ if (of_get_flat_dt_prop(node, "reusable", NULL) ||
+ of_get_flat_dt_prop(node, "linux,cma-default", NULL) ||
+ of_get_flat_dt_prop(node, "linux,dma-default", NULL) ||
+ of_get_flat_dt_prop(node, "no-map", NULL))
+ return -EINVAL;
+
+ rmem->ops = &rmem_swiotlb_ops;
+ pr_info("Reserved memory: created restricted DMA pool at %pa, size %ld MiB\n",
+ &rmem->base, (unsigned long)rmem->size / SZ_1M);
+ return 0;
+}
+
+RESERVEDMEM_OF_DECLARE(dma, "restricted-dma-pool", rmem_swiotlb_setup);
+#endif /* CONFIG_DMA_RESTRICTED_POOL */
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index bf16395b9e13..be61332c66b5 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -2,7 +2,10 @@
#include <linux/context_tracking.h>
#include <linux/entry-common.h>
+#include <linux/resume_user_mode.h>
#include <linux/highmem.h>
+#include <linux/jump_label.h>
+#include <linux/kmsan.h>
#include <linux/livepatch.h>
#include <linux/audit.h>
#include <linux/tick.h>
@@ -15,13 +18,14 @@
/* See comment for enter_from_user_mode() in entry-common.h */
static __always_inline void __enter_from_user_mode(struct pt_regs *regs)
{
- arch_check_user_regs(regs);
+ arch_enter_from_user_mode(regs);
lockdep_hardirqs_off(CALLER_ADDR0);
- CT_WARN_ON(ct_state() != CONTEXT_USER);
+ CT_WARN_ON(__ct_state() != CONTEXT_USER);
user_exit_irqoff();
instrumentation_begin();
+ kmsan_unpoison_entry_regs(regs);
trace_hardirqs_off_finish();
instrumentation_end();
}
@@ -58,7 +62,7 @@ static long syscall_trace_enter(struct pt_regs *regs, long syscall,
/* Handle ptrace */
if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) {
- ret = arch_syscall_enter_tracehook(regs);
+ ret = ptrace_report_syscall_entry(regs);
if (ret || (work & SYSCALL_WORK_SYSCALL_EMU))
return -1L;
}
@@ -124,7 +128,7 @@ static __always_inline void __exit_to_user_mode(void)
{
instrumentation_begin();
trace_hardirqs_on_prepare();
- lockdep_hardirqs_on_prepare(CALLER_ADDR0);
+ lockdep_hardirqs_on_prepare();
instrumentation_end();
user_enter_irqoff();
@@ -138,15 +142,7 @@ void noinstr exit_to_user_mode(void)
}
/* Workaround to allow gradual conversion of architecture code */
-void __weak arch_do_signal_or_restart(struct pt_regs *regs, bool has_signal) { }
-
-static void handle_signal_work(struct pt_regs *regs, unsigned long ti_work)
-{
- if (ti_work & _TIF_NOTIFY_SIGNAL)
- tracehook_notify_signal();
-
- arch_do_signal_or_restart(regs, ti_work & _TIF_SIGPENDING);
-}
+void __weak arch_do_signal_or_restart(struct pt_regs *regs) { }
static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
unsigned long ti_work)
@@ -169,12 +165,10 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
klp_update_patch_state(current);
if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL))
- handle_signal_work(regs, ti_work);
+ arch_do_signal_or_restart(regs);
- if (ti_work & _TIF_NOTIFY_RESUME) {
- tracehook_notify_resume(regs);
- rseq_handle_notify_resume(NULL, regs);
- }
+ if (ti_work & _TIF_NOTIFY_RESUME)
+ resume_user_mode_work(regs);
/* Architecture specific TIF work */
arch_exit_to_user_mode_work(regs, ti_work);
@@ -189,7 +183,7 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
/* Check if any of the above work has queued a deferred wakeup */
tick_nohz_user_enter_prepare();
- ti_work = READ_ONCE(current_thread_info()->flags);
+ ti_work = read_thread_flags();
}
/* Return the latest work state for arch_exit_to_user_mode() */
@@ -198,13 +192,14 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
static void exit_to_user_mode_prepare(struct pt_regs *regs)
{
- unsigned long ti_work = READ_ONCE(current_thread_info()->flags);
+ unsigned long ti_work;
lockdep_assert_irqs_disabled();
/* Flush pending rcuog wakeup before the last need_resched() check */
tick_nohz_user_enter_prepare();
+ ti_work = read_thread_flags();
if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
ti_work = exit_to_user_mode_loop(regs, ti_work);
@@ -254,7 +249,7 @@ static void syscall_exit_work(struct pt_regs *regs, unsigned long work)
step = report_single_step(work);
if (step || work & SYSCALL_WORK_SYSCALL_TRACE)
- arch_syscall_exit_tracehook(regs, step);
+ ptrace_report_syscall_exit(regs, step);
}
/*
@@ -329,7 +324,7 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
}
/*
- * If this entry hit the idle task invoke rcu_irq_enter() whether
+ * If this entry hit the idle task invoke ct_irq_enter() whether
* RCU is watching or not.
*
* Interrupts can nest when the first interrupt invokes softirq
@@ -340,12 +335,12 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
* not nested into another interrupt.
*
* Checking for rcu_is_watching() here would prevent the nesting
- * interrupt to invoke rcu_irq_enter(). If that nested interrupt is
+ * interrupt to invoke ct_irq_enter(). If that nested interrupt is
* the tick then rcu_flavor_sched_clock_irq() would wrongfully
* assume that it is the first interrupt and eventually claim
* quiescent state and end grace periods prematurely.
*
- * Unconditionally invoke rcu_irq_enter() so RCU state stays
+ * Unconditionally invoke ct_irq_enter() so RCU state stays
* consistent.
*
* TINY_RCU does not support EQS, so let the compiler eliminate
@@ -358,8 +353,9 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
* as in irqentry_enter_from_user_mode().
*/
lockdep_hardirqs_off(CALLER_ADDR0);
- rcu_irq_enter();
+ ct_irq_enter();
instrumentation_begin();
+ kmsan_unpoison_entry_regs(regs);
trace_hardirqs_off_finish();
instrumentation_end();
@@ -375,6 +371,7 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
*/
lockdep_hardirqs_off(CALLER_ADDR0);
instrumentation_begin();
+ kmsan_unpoison_entry_regs(regs);
rcu_irq_enter_check_tick();
trace_hardirqs_off_finish();
instrumentation_end();
@@ -382,7 +379,7 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
return ret;
}
-void irqentry_exit_cond_resched(void)
+void raw_irqentry_exit_cond_resched(void)
{
if (!preempt_count()) {
/* Sanity check RCU and thread stack */
@@ -394,7 +391,17 @@ void irqentry_exit_cond_resched(void)
}
}
#ifdef CONFIG_PREEMPT_DYNAMIC
-DEFINE_STATIC_CALL(irqentry_exit_cond_resched, irqentry_exit_cond_resched);
+#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
+DEFINE_STATIC_CALL(irqentry_exit_cond_resched, raw_irqentry_exit_cond_resched);
+#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
+DEFINE_STATIC_KEY_TRUE(sk_dynamic_irqentry_exit_cond_resched);
+void dynamic_irqentry_exit_cond_resched(void)
+{
+ if (!static_branch_unlikely(&sk_dynamic_irqentry_exit_cond_resched))
+ return;
+ raw_irqentry_exit_cond_resched();
+}
+#endif
#endif
noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state)
@@ -414,21 +421,17 @@ noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state)
instrumentation_begin();
/* Tell the tracer that IRET will enable interrupts */
trace_hardirqs_on_prepare();
- lockdep_hardirqs_on_prepare(CALLER_ADDR0);
+ lockdep_hardirqs_on_prepare();
instrumentation_end();
- rcu_irq_exit();
+ ct_irq_exit();
lockdep_hardirqs_on(CALLER_ADDR0);
return;
}
instrumentation_begin();
- if (IS_ENABLED(CONFIG_PREEMPTION)) {
-#ifdef CONFIG_PREEMPT_DYNAMIC
- static_call(irqentry_exit_cond_resched)();
-#else
+ if (IS_ENABLED(CONFIG_PREEMPTION))
irqentry_exit_cond_resched();
-#endif
- }
+
/* Covers both tracing and lockdep */
trace_hardirqs_on();
instrumentation_end();
@@ -438,7 +441,7 @@ noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state)
* was not watching on entry.
*/
if (state.exit_rcu)
- rcu_irq_exit();
+ ct_irq_exit();
}
}
@@ -451,9 +454,10 @@ irqentry_state_t noinstr irqentry_nmi_enter(struct pt_regs *regs)
__nmi_enter();
lockdep_hardirqs_off(CALLER_ADDR0);
lockdep_hardirq_enter();
- rcu_nmi_enter();
+ ct_nmi_enter();
instrumentation_begin();
+ kmsan_unpoison_entry_regs(regs);
trace_hardirqs_off_finish();
ftrace_nmi_enter();
instrumentation_end();
@@ -467,11 +471,11 @@ void noinstr irqentry_nmi_exit(struct pt_regs *regs, irqentry_state_t irq_state)
ftrace_nmi_exit();
if (irq_state.lockdep) {
trace_hardirqs_on_prepare();
- lockdep_hardirqs_on_prepare(CALLER_ADDR0);
+ lockdep_hardirqs_on_prepare();
}
instrumentation_end();
- rcu_nmi_exit();
+ ct_nmi_exit();
lockdep_hardirq_exit();
if (irq_state.lockdep)
lockdep_hardirqs_on(CALLER_ADDR0);
diff --git a/kernel/entry/kvm.c b/kernel/entry/kvm.c
index 49972ee99aff..2e0f75bcb7fd 100644
--- a/kernel/entry/kvm.c
+++ b/kernel/entry/kvm.c
@@ -8,10 +8,7 @@ static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work)
do {
int ret;
- if (ti_work & _TIF_NOTIFY_SIGNAL)
- tracehook_notify_signal();
-
- if (ti_work & _TIF_SIGPENDING) {
+ if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) {
kvm_handle_signal_exit(vcpu);
return -EINTR;
}
@@ -20,13 +17,13 @@ static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work)
schedule();
if (ti_work & _TIF_NOTIFY_RESUME)
- tracehook_notify_resume(NULL);
+ resume_user_mode_work(NULL);
ret = arch_xfer_to_guest_mode_handle_work(vcpu, ti_work);
if (ret)
return ret;
- ti_work = READ_ONCE(current_thread_info()->flags);
+ ti_work = read_thread_flags();
} while (ti_work & XFER_TO_GUEST_MODE_WORK || need_resched());
return 0;
}
@@ -43,7 +40,7 @@ int xfer_to_guest_mode_handle_work(struct kvm_vcpu *vcpu)
* disabled in the inner loop before going into guest mode. No need
* to disable interrupts here.
*/
- ti_work = READ_ONCE(current_thread_info()->flags);
+ ti_work = read_thread_flags();
if (!(ti_work & XFER_TO_GUEST_MODE_WORK))
return 0;
diff --git a/kernel/entry/syscall_user_dispatch.c b/kernel/entry/syscall_user_dispatch.c
index c240302f56e2..5340c5aa89e7 100644
--- a/kernel/entry/syscall_user_dispatch.c
+++ b/kernel/entry/syscall_user_dispatch.c
@@ -4,6 +4,7 @@
*/
#include <linux/sched.h>
#include <linux/prctl.h>
+#include <linux/ptrace.h>
#include <linux/syscall_user_dispatch.h>
#include <linux/uaccess.h>
#include <linux/signal.h>
@@ -47,14 +48,18 @@ bool syscall_user_dispatch(struct pt_regs *regs)
* access_ok() is performed once, at prctl time, when
* the selector is loaded by userspace.
*/
- if (unlikely(__get_user(state, sd->selector)))
- do_exit(SIGSEGV);
+ if (unlikely(__get_user(state, sd->selector))) {
+ force_exit_sig(SIGSEGV);
+ return true;
+ }
if (likely(state == SYSCALL_DISPATCH_FILTER_ALLOW))
return false;
- if (state != SYSCALL_DISPATCH_FILTER_BLOCK)
- do_exit(SIGSYS);
+ if (state != SYSCALL_DISPATCH_FILTER_BLOCK) {
+ force_exit_sig(SIGSYS);
+ return true;
+ }
}
sd->on_dispatch = true;
@@ -64,8 +69,9 @@ bool syscall_user_dispatch(struct pt_regs *regs)
return true;
}
-int set_syscall_user_dispatch(unsigned long mode, unsigned long offset,
- unsigned long len, char __user *selector)
+static int task_set_syscall_user_dispatch(struct task_struct *task, unsigned long mode,
+ unsigned long offset, unsigned long len,
+ char __user *selector)
{
switch (mode) {
case PR_SYS_DISPATCH_OFF:
@@ -82,7 +88,16 @@ int set_syscall_user_dispatch(unsigned long mode, unsigned long offset,
if (offset && offset + len <= offset)
return -EINVAL;
- if (selector && !access_ok(selector, sizeof(*selector)))
+ /*
+ * access_ok() will clear memory tags for tagged addresses
+ * if current has memory tagging enabled.
+
+ * To enable a tracer to set a tracees selector the
+ * selector address must be untagged for access_ok(),
+ * otherwise an untagged tracer will always fail to set a
+ * tagged tracees selector.
+ */
+ if (selector && !access_ok(untagged_addr(selector), sizeof(*selector)))
return -EFAULT;
break;
@@ -90,15 +105,60 @@ int set_syscall_user_dispatch(unsigned long mode, unsigned long offset,
return -EINVAL;
}
- current->syscall_dispatch.selector = selector;
- current->syscall_dispatch.offset = offset;
- current->syscall_dispatch.len = len;
- current->syscall_dispatch.on_dispatch = false;
+ task->syscall_dispatch.selector = selector;
+ task->syscall_dispatch.offset = offset;
+ task->syscall_dispatch.len = len;
+ task->syscall_dispatch.on_dispatch = false;
if (mode == PR_SYS_DISPATCH_ON)
- set_syscall_work(SYSCALL_USER_DISPATCH);
+ set_task_syscall_work(task, SYSCALL_USER_DISPATCH);
+ else
+ clear_task_syscall_work(task, SYSCALL_USER_DISPATCH);
+
+ return 0;
+}
+
+int set_syscall_user_dispatch(unsigned long mode, unsigned long offset,
+ unsigned long len, char __user *selector)
+{
+ return task_set_syscall_user_dispatch(current, mode, offset, len, selector);
+}
+
+int syscall_user_dispatch_get_config(struct task_struct *task, unsigned long size,
+ void __user *data)
+{
+ struct syscall_user_dispatch *sd = &task->syscall_dispatch;
+ struct ptrace_sud_config cfg;
+
+ if (size != sizeof(cfg))
+ return -EINVAL;
+
+ if (test_task_syscall_work(task, SYSCALL_USER_DISPATCH))
+ cfg.mode = PR_SYS_DISPATCH_ON;
else
- clear_syscall_work(SYSCALL_USER_DISPATCH);
+ cfg.mode = PR_SYS_DISPATCH_OFF;
+
+ cfg.offset = sd->offset;
+ cfg.len = sd->len;
+ cfg.selector = (__u64)(uintptr_t)sd->selector;
+
+ if (copy_to_user(data, &cfg, sizeof(cfg)))
+ return -EFAULT;
return 0;
}
+
+int syscall_user_dispatch_set_config(struct task_struct *task, unsigned long size,
+ void __user *data)
+{
+ struct ptrace_sud_config cfg;
+
+ if (size != sizeof(cfg))
+ return -EINVAL;
+
+ if (copy_from_user(&cfg, data, sizeof(cfg)))
+ return -EFAULT;
+
+ return task_set_syscall_user_dispatch(task, cfg.mode, cfg.offset, cfg.len,
+ (char __user *)(uintptr_t)cfg.selector);
+}
diff --git a/kernel/events/Makefile b/kernel/events/Makefile
index 3c022e33c109..91a62f566743 100644
--- a/kernel/events/Makefile
+++ b/kernel/events/Makefile
@@ -1,10 +1,6 @@
# SPDX-License-Identifier: GPL-2.0
-ifdef CONFIG_FUNCTION_TRACER
-CFLAGS_REMOVE_core.o = $(CC_FLAGS_FTRACE)
-endif
-
obj-y := core.o ring_buffer.o callchain.o
obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
+obj-$(CONFIG_HW_BREAKPOINT_KUNIT_TEST) += hw_breakpoint_test.o
obj-$(CONFIG_UPROBES) += uprobes.o
-
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 58cbe357fb2b..1273be84392c 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -209,17 +209,13 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
}
if (regs) {
- mm_segment_t fs;
-
if (crosstask)
goto exit_put;
if (add_mark)
perf_callchain_store_context(&ctx, PERF_CONTEXT_USER);
- fs = force_uaccess_begin();
perf_callchain_user(&ctx, regs);
- force_uaccess_end(fs);
}
}
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 464917096e73..db016e418931 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -54,6 +54,7 @@
#include <linux/highmem.h>
#include <linux/pgtable.h>
#include <linux/buildid.h>
+#include <linux/task_work.h>
#include "internal.h"
@@ -154,12 +155,6 @@ static int cpu_function_call(int cpu, remote_function_f func, void *info)
return data.ret;
}
-static inline struct perf_cpu_context *
-__get_cpu_context(struct perf_event_context *ctx)
-{
- return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
-}
-
static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx)
{
@@ -183,6 +178,14 @@ static bool is_kernel_event(struct perf_event *event)
return READ_ONCE(event->owner) == TASK_TOMBSTONE;
}
+static DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
+
+struct perf_event_context *perf_cpu_task_ctx(void)
+{
+ lockdep_assert_irqs_disabled();
+ return this_cpu_ptr(&perf_cpu_context)->task_ctx;
+}
+
/*
* On task ctx scheduling...
*
@@ -216,7 +219,7 @@ static int event_function(void *info)
struct event_function_struct *efs = info;
struct perf_event *event = efs->event;
struct perf_event_context *ctx = event->ctx;
- struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
struct perf_event_context *task_ctx = cpuctx->task_ctx;
int ret = 0;
@@ -313,7 +316,7 @@ again:
static void event_function_local(struct perf_event *event, event_f func, void *data)
{
struct perf_event_context *ctx = event->ctx;
- struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
struct task_struct *task = READ_ONCE(ctx->task);
struct perf_event_context *task_ctx = NULL;
@@ -377,7 +380,6 @@ enum event_type_t {
/*
* perf_sched_events : >0 events exist
- * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
*/
static void perf_sched_delayed(struct work_struct *work);
@@ -386,8 +388,6 @@ static DECLARE_DELAYED_WORK(perf_sched_work, perf_sched_delayed);
static DEFINE_MUTEX(perf_sched_mutex);
static atomic_t perf_sched_count;
-static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
-static DEFINE_PER_CPU(int, perf_sched_cb_usages);
static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
static atomic_t nr_mmap_events __read_mostly;
@@ -447,7 +447,7 @@ static void update_perf_cpu_limits(void)
WRITE_ONCE(perf_sample_allowed_ns, tmp);
}
-static bool perf_rotate_context(struct perf_cpu_context *cpuctx);
+static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc);
int perf_proc_update_handler(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
@@ -570,13 +570,6 @@ void perf_sample_event_took(u64 sample_len_ns)
static atomic64_t perf_event_id;
-static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
- enum event_type_t event_type);
-
-static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
- enum event_type_t event_type,
- struct task_struct *task);
-
static void update_context_time(struct perf_event_context *ctx);
static u64 perf_event_time(struct perf_event *event);
@@ -674,13 +667,48 @@ perf_event_set_state(struct perf_event *event, enum perf_event_state state)
WRITE_ONCE(event->state, state);
}
+/*
+ * UP store-release, load-acquire
+ */
+
+#define __store_release(ptr, val) \
+do { \
+ barrier(); \
+ WRITE_ONCE(*(ptr), (val)); \
+} while (0)
+
+#define __load_acquire(ptr) \
+({ \
+ __unqual_scalar_typeof(*(ptr)) ___p = READ_ONCE(*(ptr)); \
+ barrier(); \
+ ___p; \
+})
+
+static void perf_ctx_disable(struct perf_event_context *ctx)
+{
+ struct perf_event_pmu_context *pmu_ctx;
+
+ list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry)
+ perf_pmu_disable(pmu_ctx->pmu);
+}
+
+static void perf_ctx_enable(struct perf_event_context *ctx)
+{
+ struct perf_event_pmu_context *pmu_ctx;
+
+ list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry)
+ perf_pmu_enable(pmu_ctx->pmu);
+}
+
+static void ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type);
+static void ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type);
+
#ifdef CONFIG_CGROUP_PERF
static inline bool
perf_cgroup_match(struct perf_event *event)
{
- struct perf_event_context *ctx = event->ctx;
- struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
/* @event doesn't care about cgroup */
if (!event->cgrp)
@@ -719,35 +747,51 @@ static inline u64 perf_cgroup_event_time(struct perf_event *event)
return t->time;
}
-static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
+static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now)
{
- struct perf_cgroup_info *info;
- u64 now;
-
- now = perf_clock();
+ struct perf_cgroup_info *t;
- info = this_cpu_ptr(cgrp->info);
+ t = per_cpu_ptr(event->cgrp->info, event->cpu);
+ if (!__load_acquire(&t->active))
+ return t->time;
+ now += READ_ONCE(t->timeoffset);
+ return now;
+}
- info->time += now - info->timestamp;
+static inline void __update_cgrp_time(struct perf_cgroup_info *info, u64 now, bool adv)
+{
+ if (adv)
+ info->time += now - info->timestamp;
info->timestamp = now;
+ /*
+ * see update_context_time()
+ */
+ WRITE_ONCE(info->timeoffset, info->time - info->timestamp);
}
-static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
+static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, bool final)
{
struct perf_cgroup *cgrp = cpuctx->cgrp;
struct cgroup_subsys_state *css;
+ struct perf_cgroup_info *info;
if (cgrp) {
+ u64 now = perf_clock();
+
for (css = &cgrp->css; css; css = css->parent) {
cgrp = container_of(css, struct perf_cgroup, css);
- __update_cgrp_time(cgrp);
+ info = this_cpu_ptr(cgrp->info);
+
+ __update_cgrp_time(info, now, true);
+ if (final)
+ __store_release(&info->active, 0);
}
}
}
static inline void update_cgrp_time_from_event(struct perf_event *event)
{
- struct perf_cgroup *cgrp;
+ struct perf_cgroup_info *info;
/*
* ensure we access cgroup data only when needed and
@@ -756,19 +800,19 @@ static inline void update_cgrp_time_from_event(struct perf_event *event)
if (!is_cgroup_event(event))
return;
- cgrp = perf_cgroup_from_task(current, event->ctx);
+ info = this_cpu_ptr(event->cgrp->info);
/*
* Do not update time when cgroup is not active
*/
- if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
- __update_cgrp_time(event->cgrp);
+ if (info->active)
+ __update_cgrp_time(info, perf_clock(), true);
}
static inline void
-perf_cgroup_set_timestamp(struct task_struct *task,
- struct perf_event_context *ctx)
+perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx)
{
- struct perf_cgroup *cgrp;
+ struct perf_event_context *ctx = &cpuctx->ctx;
+ struct perf_cgroup *cgrp = cpuctx->cgrp;
struct perf_cgroup_info *info;
struct cgroup_subsys_state *css;
@@ -777,127 +821,59 @@ perf_cgroup_set_timestamp(struct task_struct *task,
* ensure we do not access cgroup data
* unless we have the cgroup pinned (css_get)
*/
- if (!task || !ctx->nr_cgroups)
+ if (!cgrp)
return;
- cgrp = perf_cgroup_from_task(task, ctx);
+ WARN_ON_ONCE(!ctx->nr_cgroups);
for (css = &cgrp->css; css; css = css->parent) {
cgrp = container_of(css, struct perf_cgroup, css);
info = this_cpu_ptr(cgrp->info);
- info->timestamp = ctx->timestamp;
+ __update_cgrp_time(info, ctx->timestamp, false);
+ __store_release(&info->active, 1);
}
}
-static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list);
-
-#define PERF_CGROUP_SWOUT 0x1 /* cgroup switch out every event */
-#define PERF_CGROUP_SWIN 0x2 /* cgroup switch in events based on task */
-
/*
* reschedule events based on the cgroup constraint of task.
- *
- * mode SWOUT : schedule out everything
- * mode SWIN : schedule in based on cgroup for next
*/
-static void perf_cgroup_switch(struct task_struct *task, int mode)
-{
- struct perf_cpu_context *cpuctx;
- struct list_head *list;
- unsigned long flags;
-
- /*
- * Disable interrupts and preemption to avoid this CPU's
- * cgrp_cpuctx_entry to change under us.
- */
- local_irq_save(flags);
-
- list = this_cpu_ptr(&cgrp_cpuctx_list);
- list_for_each_entry(cpuctx, list, cgrp_cpuctx_entry) {
- WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
-
- perf_ctx_lock(cpuctx, cpuctx->task_ctx);
- perf_pmu_disable(cpuctx->ctx.pmu);
-
- if (mode & PERF_CGROUP_SWOUT) {
- cpu_ctx_sched_out(cpuctx, EVENT_ALL);
- /*
- * must not be done before ctxswout due
- * to event_filter_match() in event_sched_out()
- */
- cpuctx->cgrp = NULL;
- }
-
- if (mode & PERF_CGROUP_SWIN) {
- WARN_ON_ONCE(cpuctx->cgrp);
- /*
- * set cgrp before ctxsw in to allow
- * event_filter_match() to not have to pass
- * task around
- * we pass the cpuctx->ctx to perf_cgroup_from_task()
- * because cgorup events are only per-cpu
- */
- cpuctx->cgrp = perf_cgroup_from_task(task,
- &cpuctx->ctx);
- cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
- }
- perf_pmu_enable(cpuctx->ctx.pmu);
- perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
- }
-
- local_irq_restore(flags);
-}
-
-static inline void perf_cgroup_sched_out(struct task_struct *task,
- struct task_struct *next)
+static void perf_cgroup_switch(struct task_struct *task)
{
- struct perf_cgroup *cgrp1;
- struct perf_cgroup *cgrp2 = NULL;
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+ struct perf_cgroup *cgrp;
- rcu_read_lock();
/*
- * we come here when we know perf_cgroup_events > 0
- * we do not need to pass the ctx here because we know
- * we are holding the rcu lock
+ * cpuctx->cgrp is set when the first cgroup event enabled,
+ * and is cleared when the last cgroup event disabled.
*/
- cgrp1 = perf_cgroup_from_task(task, NULL);
- cgrp2 = perf_cgroup_from_task(next, NULL);
+ if (READ_ONCE(cpuctx->cgrp) == NULL)
+ return;
- /*
- * only schedule out current cgroup events if we know
- * that we are switching to a different cgroup. Otherwise,
- * do no touch the cgroup events.
- */
- if (cgrp1 != cgrp2)
- perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
+ WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
- rcu_read_unlock();
-}
+ cgrp = perf_cgroup_from_task(task, NULL);
+ if (READ_ONCE(cpuctx->cgrp) == cgrp)
+ return;
-static inline void perf_cgroup_sched_in(struct task_struct *prev,
- struct task_struct *task)
-{
- struct perf_cgroup *cgrp1;
- struct perf_cgroup *cgrp2 = NULL;
+ perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+ perf_ctx_disable(&cpuctx->ctx);
- rcu_read_lock();
+ ctx_sched_out(&cpuctx->ctx, EVENT_ALL);
/*
- * we come here when we know perf_cgroup_events > 0
- * we do not need to pass the ctx here because we know
- * we are holding the rcu lock
+ * must not be done before ctxswout due
+ * to update_cgrp_time_from_cpuctx() in
+ * ctx_sched_out()
*/
- cgrp1 = perf_cgroup_from_task(task, NULL);
- cgrp2 = perf_cgroup_from_task(prev, NULL);
-
+ cpuctx->cgrp = cgrp;
/*
- * only need to schedule in cgroup events if we are changing
- * cgroup during ctxsw. Cgroup events were not scheduled
- * out of ctxsw out if that was not the case.
+ * set cgrp before ctxsw in to allow
+ * perf_cgroup_set_timestamp() in ctx_sched_in()
+ * to not have to pass task around
*/
- if (cgrp1 != cgrp2)
- perf_cgroup_switch(task, PERF_CGROUP_SWIN);
+ ctx_sched_in(&cpuctx->ctx, EVENT_ALL);
- rcu_read_unlock();
+ perf_ctx_enable(&cpuctx->ctx);
+ perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
}
static int perf_cgroup_ensure_storage(struct perf_event *event,
@@ -915,7 +891,7 @@ static int perf_cgroup_ensure_storage(struct perf_event *event,
heap_size++;
for_each_possible_cpu(cpu) {
- cpuctx = per_cpu_ptr(event->pmu->pmu_cpu_context, cpu);
+ cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
if (heap_size <= cpuctx->heap_size)
continue;
@@ -982,14 +958,6 @@ out:
}
static inline void
-perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
-{
- struct perf_cgroup_info *t;
- t = per_cpu_ptr(event->cgrp->info, event->cpu);
- event->shadow_ctx_time = now - t->timestamp;
-}
-
-static inline void
perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx)
{
struct perf_cpu_context *cpuctx;
@@ -1003,24 +971,10 @@ perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ct
*/
cpuctx = container_of(ctx, struct perf_cpu_context, ctx);
- /*
- * Since setting cpuctx->cgrp is conditional on the current @cgrp
- * matching the event's cgroup, we must do this for every new event,
- * because if the first would mismatch, the second would not try again
- * and we would leave cpuctx->cgrp unset.
- */
- if (ctx->is_active && !cpuctx->cgrp) {
- struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);
-
- if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
- cpuctx->cgrp = cgrp;
- }
-
if (ctx->nr_cgroups++)
return;
- list_add(&cpuctx->cgrp_cpuctx_entry,
- per_cpu_ptr(&cgrp_cpuctx_list, event->cpu));
+ cpuctx->cgrp = perf_cgroup_from_task(current, ctx);
}
static inline void
@@ -1040,10 +994,7 @@ perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *c
if (--ctx->nr_cgroups)
return;
- if (ctx->is_active && cpuctx->cgrp)
- cpuctx->cgrp = NULL;
-
- list_del(&cpuctx->cgrp_cpuctx_entry);
+ cpuctx->cgrp = NULL;
}
#else /* !CONFIG_CGROUP_PERF */
@@ -1066,17 +1017,8 @@ static inline void update_cgrp_time_from_event(struct perf_event *event)
{
}
-static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
-{
-}
-
-static inline void perf_cgroup_sched_out(struct task_struct *task,
- struct task_struct *next)
-{
-}
-
-static inline void perf_cgroup_sched_in(struct task_struct *prev,
- struct task_struct *task)
+static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx,
+ bool final)
{
}
@@ -1088,22 +1030,16 @@ static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
}
static inline void
-perf_cgroup_set_timestamp(struct task_struct *task,
- struct perf_event_context *ctx)
-{
-}
-
-static inline void
-perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
+perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx)
{
}
-static inline void
-perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
+static inline u64 perf_cgroup_event_time(struct perf_event *event)
{
+ return 0;
}
-static inline u64 perf_cgroup_event_time(struct perf_event *event)
+static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now)
{
return 0;
}
@@ -1117,6 +1053,10 @@ static inline void
perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx)
{
}
+
+static void perf_cgroup_switch(struct task_struct *task)
+{
+}
#endif
/*
@@ -1129,34 +1069,30 @@ perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *c
*/
static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
{
- struct perf_cpu_context *cpuctx;
+ struct perf_cpu_pmu_context *cpc;
bool rotations;
lockdep_assert_irqs_disabled();
- cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
- rotations = perf_rotate_context(cpuctx);
+ cpc = container_of(hr, struct perf_cpu_pmu_context, hrtimer);
+ rotations = perf_rotate_context(cpc);
- raw_spin_lock(&cpuctx->hrtimer_lock);
+ raw_spin_lock(&cpc->hrtimer_lock);
if (rotations)
- hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
+ hrtimer_forward_now(hr, cpc->hrtimer_interval);
else
- cpuctx->hrtimer_active = 0;
- raw_spin_unlock(&cpuctx->hrtimer_lock);
+ cpc->hrtimer_active = 0;
+ raw_spin_unlock(&cpc->hrtimer_lock);
return rotations ? HRTIMER_RESTART : HRTIMER_NORESTART;
}
-static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
+static void __perf_mux_hrtimer_init(struct perf_cpu_pmu_context *cpc, int cpu)
{
- struct hrtimer *timer = &cpuctx->hrtimer;
- struct pmu *pmu = cpuctx->ctx.pmu;
+ struct hrtimer *timer = &cpc->hrtimer;
+ struct pmu *pmu = cpc->epc.pmu;
u64 interval;
- /* no multiplexing needed for SW PMU */
- if (pmu->task_ctx_nr == perf_sw_context)
- return;
-
/*
* check default is sane, if not set then force to
* default interval (1/tick)
@@ -1165,34 +1101,34 @@ static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
if (interval < 1)
interval = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
- cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
+ cpc->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
- raw_spin_lock_init(&cpuctx->hrtimer_lock);
+ raw_spin_lock_init(&cpc->hrtimer_lock);
hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD);
timer->function = perf_mux_hrtimer_handler;
}
-static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx)
+static int perf_mux_hrtimer_restart(struct perf_cpu_pmu_context *cpc)
{
- struct hrtimer *timer = &cpuctx->hrtimer;
- struct pmu *pmu = cpuctx->ctx.pmu;
+ struct hrtimer *timer = &cpc->hrtimer;
unsigned long flags;
- /* not for SW PMU */
- if (pmu->task_ctx_nr == perf_sw_context)
- return 0;
-
- raw_spin_lock_irqsave(&cpuctx->hrtimer_lock, flags);
- if (!cpuctx->hrtimer_active) {
- cpuctx->hrtimer_active = 1;
- hrtimer_forward_now(timer, cpuctx->hrtimer_interval);
+ raw_spin_lock_irqsave(&cpc->hrtimer_lock, flags);
+ if (!cpc->hrtimer_active) {
+ cpc->hrtimer_active = 1;
+ hrtimer_forward_now(timer, cpc->hrtimer_interval);
hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD);
}
- raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags);
+ raw_spin_unlock_irqrestore(&cpc->hrtimer_lock, flags);
return 0;
}
+static int perf_mux_hrtimer_restart_ipi(void *arg)
+{
+ return perf_mux_hrtimer_restart(arg);
+}
+
void perf_pmu_disable(struct pmu *pmu)
{
int *count = this_cpu_ptr(pmu->pmu_disable_count);
@@ -1207,32 +1143,9 @@ void perf_pmu_enable(struct pmu *pmu)
pmu->pmu_enable(pmu);
}
-static DEFINE_PER_CPU(struct list_head, active_ctx_list);
-
-/*
- * perf_event_ctx_activate(), perf_event_ctx_deactivate(), and
- * perf_event_task_tick() are fully serialized because they're strictly cpu
- * affine and perf_event_ctx{activate,deactivate} are called with IRQs
- * disabled, while perf_event_task_tick is called from IRQ context.
- */
-static void perf_event_ctx_activate(struct perf_event_context *ctx)
+static void perf_assert_pmu_disabled(struct pmu *pmu)
{
- struct list_head *head = this_cpu_ptr(&active_ctx_list);
-
- lockdep_assert_irqs_disabled();
-
- WARN_ON(!list_empty(&ctx->active_ctx_list));
-
- list_add(&ctx->active_ctx_list, head);
-}
-
-static void perf_event_ctx_deactivate(struct perf_event_context *ctx)
-{
- lockdep_assert_irqs_disabled();
-
- WARN_ON(list_empty(&ctx->active_ctx_list));
-
- list_del_init(&ctx->active_ctx_list);
+ WARN_ON_ONCE(*this_cpu_ptr(pmu->pmu_disable_count) == 0);
}
static void get_ctx(struct perf_event_context *ctx)
@@ -1259,7 +1172,6 @@ static void free_ctx(struct rcu_head *head)
struct perf_event_context *ctx;
ctx = container_of(head, struct perf_event_context, rcu_head);
- free_task_ctx_data(ctx->pmu, ctx->task_ctx_data);
kfree(ctx);
}
@@ -1444,7 +1356,7 @@ static u64 primary_event_id(struct perf_event *event)
* the context could get moved to another task.
*/
static struct perf_event_context *
-perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
+perf_lock_task_context(struct task_struct *task, unsigned long *flags)
{
struct perf_event_context *ctx;
@@ -1460,7 +1372,7 @@ retry:
*/
local_irq_save(*flags);
rcu_read_lock();
- ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
+ ctx = rcu_dereference(task->perf_event_ctxp);
if (ctx) {
/*
* If this context is a clone of another, it might
@@ -1473,7 +1385,7 @@ retry:
* can't get swapped on us any more.
*/
raw_spin_lock(&ctx->lock);
- if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
+ if (ctx != rcu_dereference(task->perf_event_ctxp)) {
raw_spin_unlock(&ctx->lock);
rcu_read_unlock();
local_irq_restore(*flags);
@@ -1500,12 +1412,12 @@ retry:
* reference count so that the context can't get freed.
*/
static struct perf_event_context *
-perf_pin_task_context(struct task_struct *task, int ctxn)
+perf_pin_task_context(struct task_struct *task)
{
struct perf_event_context *ctx;
unsigned long flags;
- ctx = perf_lock_task_context(task, ctxn, &flags);
+ ctx = perf_lock_task_context(task, &flags);
if (ctx) {
++ctx->pin_count;
raw_spin_unlock_irqrestore(&ctx->lock, flags);
@@ -1525,22 +1437,61 @@ static void perf_unpin_context(struct perf_event_context *ctx)
/*
* Update the record of the current time in a context.
*/
-static void update_context_time(struct perf_event_context *ctx)
+static void __update_context_time(struct perf_event_context *ctx, bool adv)
{
u64 now = perf_clock();
- ctx->time += now - ctx->timestamp;
+ lockdep_assert_held(&ctx->lock);
+
+ if (adv)
+ ctx->time += now - ctx->timestamp;
ctx->timestamp = now;
+
+ /*
+ * The above: time' = time + (now - timestamp), can be re-arranged
+ * into: time` = now + (time - timestamp), which gives a single value
+ * offset to compute future time without locks on.
+ *
+ * See perf_event_time_now(), which can be used from NMI context where
+ * it's (obviously) not possible to acquire ctx->lock in order to read
+ * both the above values in a consistent manner.
+ */
+ WRITE_ONCE(ctx->timeoffset, ctx->time - ctx->timestamp);
+}
+
+static void update_context_time(struct perf_event_context *ctx)
+{
+ __update_context_time(ctx, true);
}
static u64 perf_event_time(struct perf_event *event)
{
struct perf_event_context *ctx = event->ctx;
+ if (unlikely(!ctx))
+ return 0;
+
if (is_cgroup_event(event))
return perf_cgroup_event_time(event);
- return ctx ? ctx->time : 0;
+ return ctx->time;
+}
+
+static u64 perf_event_time_now(struct perf_event *event, u64 now)
+{
+ struct perf_event_context *ctx = event->ctx;
+
+ if (unlikely(!ctx))
+ return 0;
+
+ if (is_cgroup_event(event))
+ return perf_cgroup_event_time_now(event, now);
+
+ if (!(__load_acquire(&ctx->is_active) & EVENT_TIME))
+ return ctx->time;
+
+ now += READ_ONCE(ctx->timeoffset);
+ return now;
}
static enum event_type_t get_event_type(struct perf_event *event)
@@ -1614,14 +1565,22 @@ static inline struct cgroup *event_cgroup(const struct perf_event *event)
* which provides ordering when rotating groups for the same CPU.
*/
static __always_inline int
-perf_event_groups_cmp(const int left_cpu, const struct cgroup *left_cgroup,
- const u64 left_group_index, const struct perf_event *right)
+perf_event_groups_cmp(const int left_cpu, const struct pmu *left_pmu,
+ const struct cgroup *left_cgroup, const u64 left_group_index,
+ const struct perf_event *right)
{
if (left_cpu < right->cpu)
return -1;
if (left_cpu > right->cpu)
return 1;
+ if (left_pmu) {
+ if (left_pmu < right->pmu_ctx->pmu)
+ return -1;
+ if (left_pmu > right->pmu_ctx->pmu)
+ return 1;
+ }
+
#ifdef CONFIG_CGROUP_PERF
{
const struct cgroup *right_cgroup = event_cgroup(right);
@@ -1664,12 +1623,13 @@ perf_event_groups_cmp(const int left_cpu, const struct cgroup *left_cgroup,
static inline bool __group_less(struct rb_node *a, const struct rb_node *b)
{
struct perf_event *e = __node_2_pe(a);
- return perf_event_groups_cmp(e->cpu, event_cgroup(e), e->group_index,
- __node_2_pe(b)) < 0;
+ return perf_event_groups_cmp(e->cpu, e->pmu_ctx->pmu, event_cgroup(e),
+ e->group_index, __node_2_pe(b)) < 0;
}
struct __group_key {
int cpu;
+ struct pmu *pmu;
struct cgroup *cgroup;
};
@@ -1678,14 +1638,25 @@ static inline int __group_cmp(const void *key, const struct rb_node *node)
const struct __group_key *a = key;
const struct perf_event *b = __node_2_pe(node);
- /* partial/subtree match: @cpu, @cgroup; ignore: @group_index */
- return perf_event_groups_cmp(a->cpu, a->cgroup, b->group_index, b);
+ /* partial/subtree match: @cpu, @pmu, @cgroup; ignore: @group_index */
+ return perf_event_groups_cmp(a->cpu, a->pmu, a->cgroup, b->group_index, b);
+}
+
+static inline int
+__group_cmp_ignore_cgroup(const void *key, const struct rb_node *node)
+{
+ const struct __group_key *a = key;
+ const struct perf_event *b = __node_2_pe(node);
+
+ /* partial/subtree match: @cpu, @pmu, ignore: @cgroup, @group_index */
+ return perf_event_groups_cmp(a->cpu, a->pmu, event_cgroup(b),
+ b->group_index, b);
}
/*
- * Insert @event into @groups' tree; using {@event->cpu, ++@groups->index} for
- * key (see perf_event_groups_less). This places it last inside the CPU
- * subtree.
+ * Insert @event into @groups' tree; using
+ * {@event->cpu, @event->pmu_ctx->pmu, event_cgroup(@event), ++@groups->index}
+ * as key. This places it last inside the {cpu,pmu,cgroup} subtree.
*/
static void
perf_event_groups_insert(struct perf_event_groups *groups,
@@ -1735,14 +1706,15 @@ del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx)
}
/*
- * Get the leftmost event in the cpu/cgroup subtree.
+ * Get the leftmost event in the {cpu,pmu,cgroup} subtree.
*/
static struct perf_event *
perf_event_groups_first(struct perf_event_groups *groups, int cpu,
- struct cgroup *cgrp)
+ struct pmu *pmu, struct cgroup *cgrp)
{
struct __group_key key = {
.cpu = cpu,
+ .pmu = pmu,
.cgroup = cgrp,
};
struct rb_node *node;
@@ -1754,14 +1726,12 @@ perf_event_groups_first(struct perf_event_groups *groups, int cpu,
return NULL;
}
-/*
- * Like rb_entry_next_safe() for the @cpu subtree.
- */
static struct perf_event *
-perf_event_groups_next(struct perf_event *event)
+perf_event_groups_next(struct perf_event *event, struct pmu *pmu)
{
struct __group_key key = {
.cpu = event->cpu,
+ .pmu = pmu,
.cgroup = event_cgroup(event),
};
struct rb_node *next;
@@ -1773,6 +1743,10 @@ perf_event_groups_next(struct perf_event *event)
return NULL;
}
+#define perf_event_groups_for_cpu_pmu(event, groups, cpu, pmu) \
+ for (event = perf_event_groups_first(groups, cpu, pmu, NULL); \
+ event; event = perf_event_groups_next(event, pmu))
+
/*
* Iterate through the whole groups tree.
*/
@@ -1808,6 +1782,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
list_add_rcu(&event->event_entry, &ctx->event_list);
ctx->nr_events++;
+ if (event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT)
+ ctx->nr_user++;
if (event->attr.inherit_stat)
ctx->nr_stat++;
@@ -1815,6 +1791,7 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
perf_cgroup_event_enable(event, ctx);
ctx->generation++;
+ event->pmu_ctx->nr_events++;
}
/*
@@ -1841,6 +1818,9 @@ static void __perf_event_read_size(struct perf_event *event, int nr_siblings)
if (event->attr.read_format & PERF_FORMAT_ID)
entry += sizeof(u64);
+ if (event->attr.read_format & PERF_FORMAT_LOST)
+ entry += sizeof(u64);
+
if (event->attr.read_format & PERF_FORMAT_GROUP) {
nr += nr_siblings;
size += sizeof(u64);
@@ -1957,7 +1937,8 @@ static void perf_group_attach(struct perf_event *event)
lockdep_assert_held(&event->ctx->lock);
/*
- * We can have double attach due to group movement in perf_event_open.
+ * We can have double attach due to group movement (move_group) in
+ * perf_event_open().
*/
if (event->attach_state & PERF_ATTACH_GROUP)
return;
@@ -1999,6 +1980,8 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
event->attach_state &= ~PERF_ATTACH_CONTEXT;
ctx->nr_events--;
+ if (event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT)
+ ctx->nr_user--;
if (event->attr.inherit_stat)
ctx->nr_stat--;
@@ -2020,6 +2003,7 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
}
ctx->generation++;
+ event->pmu_ctx->nr_events--;
}
static int
@@ -2036,13 +2020,11 @@ perf_aux_output_match(struct perf_event *event, struct perf_event *aux_event)
static void put_event(struct perf_event *event);
static void event_sched_out(struct perf_event *event,
- struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx);
static void perf_put_aux_event(struct perf_event *event)
{
struct perf_event_context *ctx = event->ctx;
- struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
struct perf_event *iter;
/*
@@ -2071,7 +2053,7 @@ static void perf_put_aux_event(struct perf_event *event)
* state so that we don't try to schedule it again. Note
* that perf_event_enable() will clear the ERROR status.
*/
- event_sched_out(iter, cpuctx, ctx);
+ event_sched_out(iter, ctx);
perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
}
}
@@ -2122,8 +2104,8 @@ static int perf_get_aux_event(struct perf_event *event,
static inline struct list_head *get_event_list(struct perf_event *event)
{
- struct perf_event_context *ctx = event->ctx;
- return event->attr.pinned ? &ctx->pinned_active : &ctx->flexible_active;
+ return event->attr.pinned ? &event->pmu_ctx->pinned_active :
+ &event->pmu_ctx->flexible_active;
}
/*
@@ -2134,10 +2116,7 @@ static inline struct list_head *get_event_list(struct perf_event *event)
*/
static inline void perf_remove_sibling_event(struct perf_event *event)
{
- struct perf_event_context *ctx = event->ctx;
- struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
-
- event_sched_out(event, cpuctx, ctx);
+ event_sched_out(event, event->ctx);
perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
}
@@ -2184,7 +2163,7 @@ static void perf_group_detach(struct perf_event *event)
/* Inherit group flags from the previous leader */
sibling->group_caps = event->group_caps;
- if (!RB_EMPTY_NODE(&event->group_node)) {
+ if (sibling->attach_state & PERF_ATTACH_CONTEXT) {
add_event_to_groups(sibling, event->ctx);
if (sibling->state == PERF_EVENT_STATE_ACTIVE)
@@ -2226,47 +2205,22 @@ static bool is_orphaned_event(struct perf_event *event)
return event->state == PERF_EVENT_STATE_DEAD;
}
-static inline int __pmu_filter_match(struct perf_event *event)
-{
- struct pmu *pmu = event->pmu;
- return pmu->filter_match ? pmu->filter_match(event) : 1;
-}
-
-/*
- * Check whether we should attempt to schedule an event group based on
- * PMU-specific filtering. An event group can consist of HW and SW events,
- * potentially with a SW leader, so we must check all the filters, to
- * determine whether a group is schedulable:
- */
-static inline int pmu_filter_match(struct perf_event *event)
-{
- struct perf_event *sibling;
-
- if (!__pmu_filter_match(event))
- return 0;
-
- for_each_sibling_event(sibling, event) {
- if (!__pmu_filter_match(sibling))
- return 0;
- }
-
- return 1;
-}
-
static inline int
event_filter_match(struct perf_event *event)
{
return (event->cpu == -1 || event->cpu == smp_processor_id()) &&
- perf_cgroup_match(event) && pmu_filter_match(event);
+ perf_cgroup_match(event);
}
static void
-event_sched_out(struct perf_event *event,
- struct perf_cpu_context *cpuctx,
- struct perf_event_context *ctx)
+event_sched_out(struct perf_event *event, struct perf_event_context *ctx)
{
+ struct perf_event_pmu_context *epc = event->pmu_ctx;
+ struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context);
enum perf_event_state state = PERF_EVENT_STATE_INACTIVE;
+ // XXX cpc serialization, probably per-cpu IRQ disabled
+
WARN_ON_ONCE(event->ctx != ctx);
lockdep_assert_held(&ctx->lock);
@@ -2285,50 +2239,61 @@ event_sched_out(struct perf_event *event,
event->pmu->del(event, 0);
event->oncpu = -1;
- if (READ_ONCE(event->pending_disable) >= 0) {
- WRITE_ONCE(event->pending_disable, -1);
+ if (event->pending_disable) {
+ event->pending_disable = 0;
perf_cgroup_event_disable(event, ctx);
state = PERF_EVENT_STATE_OFF;
}
+
+ if (event->pending_sigtrap) {
+ bool dec = true;
+
+ event->pending_sigtrap = 0;
+ if (state != PERF_EVENT_STATE_OFF &&
+ !event->pending_work) {
+ event->pending_work = 1;
+ dec = false;
+ WARN_ON_ONCE(!atomic_long_inc_not_zero(&event->refcount));
+ task_work_add(current, &event->pending_task, TWA_RESUME);
+ }
+ if (dec)
+ local_dec(&event->ctx->nr_pending);
+ }
+
perf_event_set_state(event, state);
if (!is_software_event(event))
- cpuctx->active_oncpu--;
- if (!--ctx->nr_active)
- perf_event_ctx_deactivate(ctx);
+ cpc->active_oncpu--;
if (event->attr.freq && event->attr.sample_freq)
ctx->nr_freq--;
- if (event->attr.exclusive || !cpuctx->active_oncpu)
- cpuctx->exclusive = 0;
+ if (event->attr.exclusive || !cpc->active_oncpu)
+ cpc->exclusive = 0;
perf_pmu_enable(event->pmu);
}
static void
-group_sched_out(struct perf_event *group_event,
- struct perf_cpu_context *cpuctx,
- struct perf_event_context *ctx)
+group_sched_out(struct perf_event *group_event, struct perf_event_context *ctx)
{
struct perf_event *event;
if (group_event->state != PERF_EVENT_STATE_ACTIVE)
return;
- perf_pmu_disable(ctx->pmu);
+ perf_assert_pmu_disabled(group_event->pmu_ctx->pmu);
- event_sched_out(group_event, cpuctx, ctx);
+ event_sched_out(group_event, ctx);
/*
* Schedule out siblings (if any):
*/
for_each_sibling_event(event, group_event)
- event_sched_out(event, cpuctx, ctx);
-
- perf_pmu_enable(ctx->pmu);
+ event_sched_out(event, ctx);
}
#define DETACH_GROUP 0x01UL
#define DETACH_CHILD 0x02UL
+#define DETACH_DEAD 0x04UL
/*
* Cross CPU call to remove a performance event
@@ -2342,23 +2307,46 @@ __perf_remove_from_context(struct perf_event *event,
struct perf_event_context *ctx,
void *info)
{
+ struct perf_event_pmu_context *pmu_ctx = event->pmu_ctx;
unsigned long flags = (unsigned long)info;
if (ctx->is_active & EVENT_TIME) {
update_context_time(ctx);
- update_cgrp_time_from_cpuctx(cpuctx);
+ update_cgrp_time_from_cpuctx(cpuctx, false);
}
- event_sched_out(event, cpuctx, ctx);
+ /*
+ * Ensure event_sched_out() switches to OFF, at the very least
+ * this avoids raising perf_pending_task() at this time.
+ */
+ if (flags & DETACH_DEAD)
+ event->pending_disable = 1;
+ event_sched_out(event, ctx);
if (flags & DETACH_GROUP)
perf_group_detach(event);
if (flags & DETACH_CHILD)
perf_child_detach(event);
list_del_event(event, ctx);
+ if (flags & DETACH_DEAD)
+ event->state = PERF_EVENT_STATE_DEAD;
+
+ if (!pmu_ctx->nr_events) {
+ pmu_ctx->rotate_necessary = 0;
+
+ if (ctx->task && ctx->is_active) {
+ struct perf_cpu_pmu_context *cpc;
+
+ cpc = this_cpu_ptr(pmu_ctx->pmu->cpu_pmu_context);
+ WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx);
+ cpc->task_epc = NULL;
+ }
+ }
if (!ctx->nr_events && ctx->is_active) {
+ if (ctx == &cpuctx->ctx)
+ update_cgrp_time_from_cpuctx(cpuctx, true);
+
ctx->is_active = 0;
- ctx->rotate_necessary = 0;
if (ctx->task) {
WARN_ON_ONCE(cpuctx->task_ctx != ctx);
cpuctx->task_ctx = NULL;
@@ -2389,7 +2377,7 @@ static void perf_remove_from_context(struct perf_event *event, unsigned long fla
*/
raw_spin_lock_irq(&ctx->lock);
if (!ctx->is_active) {
- __perf_remove_from_context(event, __get_cpu_context(ctx),
+ __perf_remove_from_context(event, this_cpu_ptr(&perf_cpu_context),
ctx, (void *)flags);
raw_spin_unlock_irq(&ctx->lock);
return;
@@ -2415,13 +2403,17 @@ static void __perf_event_disable(struct perf_event *event,
update_cgrp_time_from_event(event);
}
+ perf_pmu_disable(event->pmu_ctx->pmu);
+
if (event == event->group_leader)
- group_sched_out(event, cpuctx, ctx);
+ group_sched_out(event, ctx);
else
- event_sched_out(event, cpuctx, ctx);
+ event_sched_out(event, ctx);
perf_event_set_state(event, PERF_EVENT_STATE_OFF);
perf_cgroup_event_disable(event, ctx);
+
+ perf_pmu_enable(event->pmu_ctx->pmu);
}
/*
@@ -2434,7 +2426,7 @@ static void __perf_event_disable(struct perf_event *event,
* hold the top-level event's child_mutex, so any descendant that
* goes to exit will block in perf_event_exit_event().
*
- * When called from perf_pending_event it's OK because event->ctx
+ * When called from perf_pending_irq it's OK because event->ctx
* is the current context on this CPU and preemption is disabled,
* hence we can't get into perf_event_task_sched_out for this context.
*/
@@ -2473,43 +2465,8 @@ EXPORT_SYMBOL_GPL(perf_event_disable);
void perf_event_disable_inatomic(struct perf_event *event)
{
- WRITE_ONCE(event->pending_disable, smp_processor_id());
- /* can fail, see perf_pending_event_disable() */
- irq_work_queue(&event->pending);
-}
-
-static void perf_set_shadow_time(struct perf_event *event,
- struct perf_event_context *ctx)
-{
- /*
- * use the correct time source for the time snapshot
- *
- * We could get by without this by leveraging the
- * fact that to get to this function, the caller
- * has most likely already called update_context_time()
- * and update_cgrp_time_xx() and thus both timestamp
- * are identical (or very close). Given that tstamp is,
- * already adjusted for cgroup, we could say that:
- * tstamp - ctx->timestamp
- * is equivalent to
- * tstamp - cgrp->timestamp.
- *
- * Then, in perf_output_read(), the calculation would
- * work with no changes because:
- * - event is guaranteed scheduled in
- * - no scheduled out in between
- * - thus the timestamp would be the same
- *
- * But this is a bit hairy.
- *
- * So instead, we have an explicit cgroup call to remain
- * within the time source all along. We believe it
- * is cleaner and simpler to understand.
- */
- if (is_cgroup_event(event))
- perf_cgroup_set_shadow_time(event, event->tstamp);
- else
- event->shadow_ctx_time = event->tstamp - ctx->timestamp;
+ event->pending_disable = 1;
+ irq_work_queue(&event->pending_irq);
}
#define MAX_INTERRUPTS (~0ULL)
@@ -2518,10 +2475,10 @@ static void perf_log_throttle(struct perf_event *event, int enable);
static void perf_log_itrace_start(struct perf_event *event);
static int
-event_sched_in(struct perf_event *event,
- struct perf_cpu_context *cpuctx,
- struct perf_event_context *ctx)
+event_sched_in(struct perf_event *event, struct perf_event_context *ctx)
{
+ struct perf_event_pmu_context *epc = event->pmu_ctx;
+ struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context);
int ret = 0;
WARN_ON_ONCE(event->ctx != ctx);
@@ -2552,8 +2509,6 @@ event_sched_in(struct perf_event *event,
perf_pmu_disable(event->pmu);
- perf_set_shadow_time(event, ctx);
-
perf_log_itrace_start(event);
if (event->pmu->add(event, PERF_EF_START)) {
@@ -2564,14 +2519,12 @@ event_sched_in(struct perf_event *event,
}
if (!is_software_event(event))
- cpuctx->active_oncpu++;
- if (!ctx->nr_active++)
- perf_event_ctx_activate(ctx);
+ cpc->active_oncpu++;
if (event->attr.freq && event->attr.sample_freq)
ctx->nr_freq++;
if (event->attr.exclusive)
- cpuctx->exclusive = 1;
+ cpc->exclusive = 1;
out:
perf_pmu_enable(event->pmu);
@@ -2580,26 +2533,24 @@ out:
}
static int
-group_sched_in(struct perf_event *group_event,
- struct perf_cpu_context *cpuctx,
- struct perf_event_context *ctx)
+group_sched_in(struct perf_event *group_event, struct perf_event_context *ctx)
{
struct perf_event *event, *partial_group = NULL;
- struct pmu *pmu = ctx->pmu;
+ struct pmu *pmu = group_event->pmu_ctx->pmu;
if (group_event->state == PERF_EVENT_STATE_OFF)
return 0;
pmu->start_txn(pmu, PERF_PMU_TXN_ADD);
- if (event_sched_in(group_event, cpuctx, ctx))
+ if (event_sched_in(group_event, ctx))
goto error;
/*
* Schedule in siblings as one group (if any):
*/
for_each_sibling_event(event, group_event) {
- if (event_sched_in(event, cpuctx, ctx)) {
+ if (event_sched_in(event, ctx)) {
partial_group = event;
goto group_error;
}
@@ -2618,9 +2569,9 @@ group_error:
if (event == partial_group)
break;
- event_sched_out(event, cpuctx, ctx);
+ event_sched_out(event, ctx);
}
- event_sched_out(group_event, cpuctx, ctx);
+ event_sched_out(group_event, ctx);
error:
pmu->cancel_txn(pmu);
@@ -2630,10 +2581,11 @@ error:
/*
* Work out whether we can put this event group on the CPU now.
*/
-static int group_can_go_on(struct perf_event *event,
- struct perf_cpu_context *cpuctx,
- int can_add_hw)
+static int group_can_go_on(struct perf_event *event, int can_add_hw)
{
+ struct perf_event_pmu_context *epc = event->pmu_ctx;
+ struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context);
+
/*
* Groups consisting entirely of software events can always go on.
*/
@@ -2643,7 +2595,7 @@ static int group_can_go_on(struct perf_event *event,
* If an exclusive group is already on, no other hardware
* events can go on.
*/
- if (cpuctx->exclusive)
+ if (cpc->exclusive)
return 0;
/*
* If this group is exclusive and there are already
@@ -2665,38 +2617,29 @@ static void add_event_to_ctx(struct perf_event *event,
perf_group_attach(event);
}
-static void ctx_sched_out(struct perf_event_context *ctx,
- struct perf_cpu_context *cpuctx,
- enum event_type_t event_type);
-static void
-ctx_sched_in(struct perf_event_context *ctx,
- struct perf_cpu_context *cpuctx,
- enum event_type_t event_type,
- struct task_struct *task);
-
-static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
- struct perf_event_context *ctx,
- enum event_type_t event_type)
+static void task_ctx_sched_out(struct perf_event_context *ctx,
+ enum event_type_t event_type)
{
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+
if (!cpuctx->task_ctx)
return;
if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
return;
- ctx_sched_out(ctx, cpuctx, event_type);
+ ctx_sched_out(ctx, event_type);
}
static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
- struct perf_event_context *ctx,
- struct task_struct *task)
+ struct perf_event_context *ctx)
{
- cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task);
+ ctx_sched_in(&cpuctx->ctx, EVENT_PINNED);
if (ctx)
- ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
- cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
+ ctx_sched_in(ctx, EVENT_PINNED);
+ ctx_sched_in(&cpuctx->ctx, EVENT_FLEXIBLE);
if (ctx)
- ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
+ ctx_sched_in(ctx, EVENT_FLEXIBLE);
}
/*
@@ -2714,11 +2657,15 @@ static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
* event_type is a bit mask of the types of events involved. For CPU events,
* event_type is only either EVENT_PINNED or EVENT_FLEXIBLE.
*/
+/*
+ * XXX: ctx_resched() reschedule entire perf_event_context while adding new
+ * event to the context or enabling existing event in the context. We can
+ * probably optimize it by rescheduling only affected pmu_ctx.
+ */
static void ctx_resched(struct perf_cpu_context *cpuctx,
struct perf_event_context *task_ctx,
enum event_type_t event_type)
{
- enum event_type_t ctx_event_type;
bool cpu_event = !!(event_type & EVENT_CPU);
/*
@@ -2728,11 +2675,13 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
if (event_type & EVENT_PINNED)
event_type |= EVENT_FLEXIBLE;
- ctx_event_type = event_type & EVENT_ALL;
+ event_type &= EVENT_ALL;
- perf_pmu_disable(cpuctx->ctx.pmu);
- if (task_ctx)
- task_ctx_sched_out(cpuctx, task_ctx, event_type);
+ perf_ctx_disable(&cpuctx->ctx);
+ if (task_ctx) {
+ perf_ctx_disable(task_ctx);
+ task_ctx_sched_out(task_ctx, event_type);
+ }
/*
* Decide which cpu ctx groups to schedule out based on the types
@@ -2742,17 +2691,20 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
* - otherwise, do nothing more.
*/
if (cpu_event)
- cpu_ctx_sched_out(cpuctx, ctx_event_type);
- else if (ctx_event_type & EVENT_PINNED)
- cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
+ ctx_sched_out(&cpuctx->ctx, event_type);
+ else if (event_type & EVENT_PINNED)
+ ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE);
- perf_event_sched_in(cpuctx, task_ctx, current);
- perf_pmu_enable(cpuctx->ctx.pmu);
+ perf_event_sched_in(cpuctx, task_ctx);
+
+ perf_ctx_enable(&cpuctx->ctx);
+ if (task_ctx)
+ perf_ctx_enable(task_ctx);
}
void perf_pmu_resched(struct pmu *pmu)
{
- struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
struct perf_event_context *task_ctx = cpuctx->task_ctx;
perf_ctx_lock(cpuctx, task_ctx);
@@ -2770,7 +2722,7 @@ static int __perf_install_in_context(void *info)
{
struct perf_event *event = info;
struct perf_event_context *ctx = event->ctx;
- struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
struct perf_event_context *task_ctx = cpuctx->task_ctx;
bool reprogram = true;
int ret = 0;
@@ -2812,7 +2764,7 @@ static int __perf_install_in_context(void *info)
#endif
if (reprogram) {
- ctx_sched_out(ctx, cpuctx, EVENT_TIME);
+ ctx_sched_out(ctx, EVENT_TIME);
add_event_to_ctx(event, ctx);
ctx_resched(cpuctx, task_ctx, get_event_type(event));
} else {
@@ -2845,7 +2797,7 @@ perf_install_in_context(struct perf_event_context *ctx,
WARN_ON_ONCE(!exclusive_event_installable(event, ctx));
if (event->cpu != -1)
- event->cpu = cpu;
+ WARN_ON_ONCE(event->cpu != cpu);
/*
* Ensures that if we can observe event->ctx, both the event and ctx
@@ -2861,7 +2813,8 @@ perf_install_in_context(struct perf_event_context *ctx,
* The IOC_ENABLE that is sure to follow the creation of a disabled
* event will issue the IPI and reprogram the hardware.
*/
- if (__perf_effective_state(event) == PERF_EVENT_STATE_OFF && ctx->nr_events) {
+ if (__perf_effective_state(event) == PERF_EVENT_STATE_OFF &&
+ ctx->nr_events && !is_cgroup_event(event)) {
raw_spin_lock_irq(&ctx->lock);
if (ctx->task == TASK_TOMBSTONE) {
raw_spin_unlock_irq(&ctx->lock);
@@ -2957,7 +2910,7 @@ static void __perf_event_enable(struct perf_event *event,
return;
if (ctx->is_active)
- ctx_sched_out(ctx, cpuctx, EVENT_TIME);
+ ctx_sched_out(ctx, EVENT_TIME);
perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
perf_cgroup_event_enable(event, ctx);
@@ -2966,7 +2919,7 @@ static void __perf_event_enable(struct perf_event *event,
return;
if (!event_filter_match(event)) {
- ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
+ ctx_sched_in(ctx, EVENT_TIME);
return;
}
@@ -2975,7 +2928,7 @@ static void __perf_event_enable(struct perf_event *event,
* then don't put it on unless the group is on.
*/
if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
- ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
+ ctx_sched_in(ctx, EVENT_TIME);
return;
}
@@ -3193,6 +3146,15 @@ static int perf_event_modify_breakpoint(struct perf_event *bp,
return err;
}
+/*
+ * Copy event-type-independent attributes that may be modified.
+ */
+static void perf_event_modify_copy_attr(struct perf_event_attr *to,
+ const struct perf_event_attr *from)
+{
+ to->sig_data = from->sig_data;
+}
+
static int perf_event_modify_attr(struct perf_event *event,
struct perf_event_attr *attr)
{
@@ -3215,10 +3177,17 @@ static int perf_event_modify_attr(struct perf_event *event,
WARN_ON_ONCE(event->ctx->parent_ctx);
mutex_lock(&event->child_mutex);
+ /*
+ * Event-type-independent attributes must be copied before event-type
+ * modification, which will validate that final attributes match the
+ * source attributes after all relevant attributes have been copied.
+ */
+ perf_event_modify_copy_attr(&event->attr, attr);
err = func(event, attr);
if (err)
goto out;
list_for_each_entry(child, &event->child_list, child_list) {
+ perf_event_modify_copy_attr(&child->attr, attr);
err = func(child, attr);
if (err)
goto out;
@@ -3228,11 +3197,52 @@ out:
return err;
}
-static void ctx_sched_out(struct perf_event_context *ctx,
- struct perf_cpu_context *cpuctx,
- enum event_type_t event_type)
+static void __pmu_ctx_sched_out(struct perf_event_pmu_context *pmu_ctx,
+ enum event_type_t event_type)
{
+ struct perf_event_context *ctx = pmu_ctx->ctx;
struct perf_event *event, *tmp;
+ struct pmu *pmu = pmu_ctx->pmu;
+
+ if (ctx->task && !ctx->is_active) {
+ struct perf_cpu_pmu_context *cpc;
+
+ cpc = this_cpu_ptr(pmu->cpu_pmu_context);
+ WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx);
+ cpc->task_epc = NULL;
+ }
+
+ if (!event_type)
+ return;
+
+ perf_pmu_disable(pmu);
+ if (event_type & EVENT_PINNED) {
+ list_for_each_entry_safe(event, tmp,
+ &pmu_ctx->pinned_active,
+ active_list)
+ group_sched_out(event, ctx);
+ }
+
+ if (event_type & EVENT_FLEXIBLE) {
+ list_for_each_entry_safe(event, tmp,
+ &pmu_ctx->flexible_active,
+ active_list)
+ group_sched_out(event, ctx);
+ /*
+ * Since we cleared EVENT_FLEXIBLE, also clear
+ * rotate_necessary, is will be reset by
+ * ctx_flexible_sched_in() when needed.
+ */
+ pmu_ctx->rotate_necessary = 0;
+ }
+ perf_pmu_enable(pmu);
+}
+
+static void
+ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type)
+{
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+ struct perf_event_pmu_context *pmu_ctx;
int is_active = ctx->is_active;
lockdep_assert_held(&ctx->lock);
@@ -3247,16 +3257,6 @@ static void ctx_sched_out(struct perf_event_context *ctx,
return;
}
- ctx->is_active &= ~event_type;
- if (!(ctx->is_active & EVENT_ALL))
- ctx->is_active = 0;
-
- if (ctx->task) {
- WARN_ON_ONCE(cpuctx->task_ctx != ctx);
- if (!ctx->is_active)
- cpuctx->task_ctx = NULL;
- }
-
/*
* Always update time if it was set; not only when it changes.
* Otherwise we can 'forget' to update time for any but the last
@@ -3270,32 +3270,28 @@ static void ctx_sched_out(struct perf_event_context *ctx,
if (is_active & EVENT_TIME) {
/* update (and stop) ctx time */
update_context_time(ctx);
- update_cgrp_time_from_cpuctx(cpuctx);
+ update_cgrp_time_from_cpuctx(cpuctx, ctx == &cpuctx->ctx);
+ /*
+ * CPU-release for the below ->is_active store,
+ * see __load_acquire() in perf_event_time_now()
+ */
+ barrier();
}
- is_active ^= ctx->is_active; /* changed bits */
-
- if (!ctx->nr_active || !(is_active & EVENT_ALL))
- return;
+ ctx->is_active &= ~event_type;
+ if (!(ctx->is_active & EVENT_ALL))
+ ctx->is_active = 0;
- perf_pmu_disable(ctx->pmu);
- if (is_active & EVENT_PINNED) {
- list_for_each_entry_safe(event, tmp, &ctx->pinned_active, active_list)
- group_sched_out(event, cpuctx, ctx);
+ if (ctx->task) {
+ WARN_ON_ONCE(cpuctx->task_ctx != ctx);
+ if (!ctx->is_active)
+ cpuctx->task_ctx = NULL;
}
- if (is_active & EVENT_FLEXIBLE) {
- list_for_each_entry_safe(event, tmp, &ctx->flexible_active, active_list)
- group_sched_out(event, cpuctx, ctx);
+ is_active ^= ctx->is_active; /* changed bits */
- /*
- * Since we cleared EVENT_FLEXIBLE, also clear
- * rotate_necessary, is will be reset by
- * ctx_flexible_sched_in() when needed.
- */
- ctx->rotate_necessary = 0;
- }
- perf_pmu_enable(ctx->pmu);
+ list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry)
+ __pmu_ctx_sched_out(pmu_ctx, is_active);
}
/*
@@ -3400,26 +3396,68 @@ static void perf_event_sync_stat(struct perf_event_context *ctx,
}
}
-static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
- struct task_struct *next)
+#define double_list_for_each_entry(pos1, pos2, head1, head2, member) \
+ for (pos1 = list_first_entry(head1, typeof(*pos1), member), \
+ pos2 = list_first_entry(head2, typeof(*pos2), member); \
+ !list_entry_is_head(pos1, head1, member) && \
+ !list_entry_is_head(pos2, head2, member); \
+ pos1 = list_next_entry(pos1, member), \
+ pos2 = list_next_entry(pos2, member))
+
+static void perf_event_swap_task_ctx_data(struct perf_event_context *prev_ctx,
+ struct perf_event_context *next_ctx)
+{
+ struct perf_event_pmu_context *prev_epc, *next_epc;
+
+ if (!prev_ctx->nr_task_data)
+ return;
+
+ double_list_for_each_entry(prev_epc, next_epc,
+ &prev_ctx->pmu_ctx_list, &next_ctx->pmu_ctx_list,
+ pmu_ctx_entry) {
+
+ if (WARN_ON_ONCE(prev_epc->pmu != next_epc->pmu))
+ continue;
+
+ /*
+ * PMU specific parts of task perf context can require
+ * additional synchronization. As an example of such
+ * synchronization see implementation details of Intel
+ * LBR call stack data profiling;
+ */
+ if (prev_epc->pmu->swap_task_ctx)
+ prev_epc->pmu->swap_task_ctx(prev_epc, next_epc);
+ else
+ swap(prev_epc->task_ctx_data, next_epc->task_ctx_data);
+ }
+}
+
+static void perf_ctx_sched_task_cb(struct perf_event_context *ctx, bool sched_in)
+{
+ struct perf_event_pmu_context *pmu_ctx;
+ struct perf_cpu_pmu_context *cpc;
+
+ list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+ cpc = this_cpu_ptr(pmu_ctx->pmu->cpu_pmu_context);
+
+ if (cpc->sched_cb_usage && pmu_ctx->pmu->sched_task)
+ pmu_ctx->pmu->sched_task(pmu_ctx, sched_in);
+ }
+}
+
+static void
+perf_event_context_sched_out(struct task_struct *task, struct task_struct *next)
{
- struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
+ struct perf_event_context *ctx = task->perf_event_ctxp;
struct perf_event_context *next_ctx;
struct perf_event_context *parent, *next_parent;
- struct perf_cpu_context *cpuctx;
int do_switch = 1;
- struct pmu *pmu;
if (likely(!ctx))
return;
- pmu = ctx->pmu;
- cpuctx = __get_cpu_context(ctx);
- if (!cpuctx->task_ctx)
- return;
-
rcu_read_lock();
- next_ctx = next->perf_event_ctxp[ctxn];
+ next_ctx = rcu_dereference(next->perf_event_ctxp);
if (!next_ctx)
goto unlock;
@@ -3444,26 +3482,27 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
if (context_equiv(ctx, next_ctx)) {
- WRITE_ONCE(ctx->task, next);
- WRITE_ONCE(next_ctx->task, task);
+ perf_ctx_disable(ctx);
- perf_pmu_disable(pmu);
+ /* PMIs are disabled; ctx->nr_pending is stable. */
+ if (local_read(&ctx->nr_pending) ||
+ local_read(&next_ctx->nr_pending)) {
+ /*
+ * Must not swap out ctx when there's pending
+ * events that rely on the ctx->task relation.
+ */
+ raw_spin_unlock(&next_ctx->lock);
+ rcu_read_unlock();
+ goto inside_switch;
+ }
- if (cpuctx->sched_cb_usage && pmu->sched_task)
- pmu->sched_task(ctx, false);
+ WRITE_ONCE(ctx->task, next);
+ WRITE_ONCE(next_ctx->task, task);
- /*
- * PMU specific parts of task perf context can require
- * additional synchronization. As an example of such
- * synchronization see implementation details of Intel
- * LBR call stack data profiling;
- */
- if (pmu->swap_task_ctx)
- pmu->swap_task_ctx(ctx, next_ctx);
- else
- swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
+ perf_ctx_sched_task_cb(ctx, false);
+ perf_event_swap_task_ctx_data(ctx, next_ctx);
- perf_pmu_enable(pmu);
+ perf_ctx_enable(ctx);
/*
* RCU_INIT_POINTER here is safe because we've not
@@ -3472,8 +3511,8 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
* since those values are always verified under
* ctx->lock which we're now holding.
*/
- RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], next_ctx);
- RCU_INIT_POINTER(next->perf_event_ctxp[ctxn], ctx);
+ RCU_INIT_POINTER(task->perf_event_ctxp, next_ctx);
+ RCU_INIT_POINTER(next->perf_event_ctxp, ctx);
do_switch = 0;
@@ -3487,37 +3526,40 @@ unlock:
if (do_switch) {
raw_spin_lock(&ctx->lock);
- perf_pmu_disable(pmu);
+ perf_ctx_disable(ctx);
- if (cpuctx->sched_cb_usage && pmu->sched_task)
- pmu->sched_task(ctx, false);
- task_ctx_sched_out(cpuctx, ctx, EVENT_ALL);
+inside_switch:
+ perf_ctx_sched_task_cb(ctx, false);
+ task_ctx_sched_out(ctx, EVENT_ALL);
- perf_pmu_enable(pmu);
+ perf_ctx_enable(ctx);
raw_spin_unlock(&ctx->lock);
}
}
static DEFINE_PER_CPU(struct list_head, sched_cb_list);
+static DEFINE_PER_CPU(int, perf_sched_cb_usages);
void perf_sched_cb_dec(struct pmu *pmu)
{
- struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+ struct perf_cpu_pmu_context *cpc = this_cpu_ptr(pmu->cpu_pmu_context);
this_cpu_dec(perf_sched_cb_usages);
+ barrier();
- if (!--cpuctx->sched_cb_usage)
- list_del(&cpuctx->sched_cb_entry);
+ if (!--cpc->sched_cb_usage)
+ list_del(&cpc->sched_cb_entry);
}
void perf_sched_cb_inc(struct pmu *pmu)
{
- struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+ struct perf_cpu_pmu_context *cpc = this_cpu_ptr(pmu->cpu_pmu_context);
- if (!cpuctx->sched_cb_usage++)
- list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
+ if (!cpc->sched_cb_usage++)
+ list_add(&cpc->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
+ barrier();
this_cpu_inc(perf_sched_cb_usages);
}
@@ -3529,19 +3571,21 @@ void perf_sched_cb_inc(struct pmu *pmu)
* PEBS requires this to provide PID/TID information. This requires we flush
* all queued PEBS records before we context switch to a new task.
*/
-static void __perf_pmu_sched_task(struct perf_cpu_context *cpuctx, bool sched_in)
+static void __perf_pmu_sched_task(struct perf_cpu_pmu_context *cpc, bool sched_in)
{
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
struct pmu *pmu;
- pmu = cpuctx->ctx.pmu; /* software PMUs will not have sched_task */
+ pmu = cpc->epc.pmu;
+ /* software PMUs will not have sched_task */
if (WARN_ON_ONCE(!pmu->sched_task))
return;
perf_ctx_lock(cpuctx, cpuctx->task_ctx);
perf_pmu_disable(pmu);
- pmu->sched_task(cpuctx->task_ctx, sched_in);
+ pmu->sched_task(cpc->task_epc, sched_in);
perf_pmu_enable(pmu);
perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
@@ -3551,26 +3595,20 @@ static void perf_pmu_sched_task(struct task_struct *prev,
struct task_struct *next,
bool sched_in)
{
- struct perf_cpu_context *cpuctx;
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+ struct perf_cpu_pmu_context *cpc;
- if (prev == next)
+ /* cpuctx->task_ctx will be handled in perf_event_context_sched_in/out */
+ if (prev == next || cpuctx->task_ctx)
return;
- list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) {
- /* will be handled in perf_event_context_sched_in/out */
- if (cpuctx->task_ctx)
- continue;
-
- __perf_pmu_sched_task(cpuctx, sched_in);
- }
+ list_for_each_entry(cpc, this_cpu_ptr(&sched_cb_list), sched_cb_entry)
+ __perf_pmu_sched_task(cpc, sched_in);
}
static void perf_event_switch(struct task_struct *task,
struct task_struct *next_prev, bool sched_in);
-#define for_each_task_context_nr(ctxn) \
- for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
-
/*
* Called from scheduler to remove the events of the current task,
* with interrupts disabled.
@@ -3585,33 +3623,20 @@ static void perf_event_switch(struct task_struct *task,
void __perf_event_task_sched_out(struct task_struct *task,
struct task_struct *next)
{
- int ctxn;
-
if (__this_cpu_read(perf_sched_cb_usages))
perf_pmu_sched_task(task, next, false);
if (atomic_read(&nr_switch_events))
perf_event_switch(task, next, false);
- for_each_task_context_nr(ctxn)
- perf_event_context_sched_out(task, ctxn, next);
+ perf_event_context_sched_out(task, next);
/*
* if cgroup events exist on this CPU, then we need
* to check if we have to switch out PMU state.
* cgroup event are system-wide mode only
*/
- if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
- perf_cgroup_sched_out(task, next);
-}
-
-/*
- * Called with IRQs disabled
- */
-static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
- enum event_type_t event_type)
-{
- ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
+ perf_cgroup_switch(next);
}
static bool perf_less_group_idx(const void *l, const void *r)
@@ -3645,21 +3670,39 @@ static void __heap_add(struct min_heap *heap, struct perf_event *event)
}
}
-static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx,
+static void __link_epc(struct perf_event_pmu_context *pmu_ctx)
+{
+ struct perf_cpu_pmu_context *cpc;
+
+ if (!pmu_ctx->ctx->task)
+ return;
+
+ cpc = this_cpu_ptr(pmu_ctx->pmu->cpu_pmu_context);
+ WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx);
+ cpc->task_epc = pmu_ctx;
+}
+
+static noinline int visit_groups_merge(struct perf_event_context *ctx,
struct perf_event_groups *groups, int cpu,
+ struct pmu *pmu,
int (*func)(struct perf_event *, void *),
void *data)
{
#ifdef CONFIG_CGROUP_PERF
struct cgroup_subsys_state *css = NULL;
#endif
+ struct perf_cpu_context *cpuctx = NULL;
/* Space for per CPU and/or any CPU event iterators. */
struct perf_event *itrs[2];
struct min_heap event_heap;
struct perf_event **evt;
int ret;
- if (cpuctx) {
+ if (pmu->filter && pmu->filter(pmu, cpu))
+ return 0;
+
+ if (!ctx->task) {
+ cpuctx = this_cpu_ptr(&perf_cpu_context);
event_heap = (struct min_heap){
.data = cpuctx->heap,
.nr = 0,
@@ -3679,17 +3722,22 @@ static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx,
.size = ARRAY_SIZE(itrs),
};
/* Events not within a CPU context may be on any CPU. */
- __heap_add(&event_heap, perf_event_groups_first(groups, -1, NULL));
+ __heap_add(&event_heap, perf_event_groups_first(groups, -1, pmu, NULL));
}
evt = event_heap.data;
- __heap_add(&event_heap, perf_event_groups_first(groups, cpu, NULL));
+ __heap_add(&event_heap, perf_event_groups_first(groups, cpu, pmu, NULL));
#ifdef CONFIG_CGROUP_PERF
for (; css; css = css->parent)
- __heap_add(&event_heap, perf_event_groups_first(groups, cpu, css->cgroup));
+ __heap_add(&event_heap, perf_event_groups_first(groups, cpu, pmu, css->cgroup));
#endif
+ if (event_heap.nr) {
+ __link_epc((*evt)->pmu_ctx);
+ perf_assert_pmu_disabled((*evt)->pmu_ctx->pmu);
+ }
+
min_heapify_all(&event_heap, &perf_min_heap);
while (event_heap.nr) {
@@ -3697,7 +3745,7 @@ static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx,
if (ret)
return ret;
- *evt = perf_event_groups_next(*evt);
+ *evt = perf_event_groups_next(*evt, pmu);
if (*evt)
min_heapify(&event_heap, 0, &perf_min_heap);
else
@@ -3707,10 +3755,38 @@ static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx,
return 0;
}
+/*
+ * Because the userpage is strictly per-event (there is no concept of context,
+ * so there cannot be a context indirection), every userpage must be updated
+ * when context time starts :-(
+ *
+ * IOW, we must not miss EVENT_TIME edges.
+ */
+static inline bool event_update_userpage(struct perf_event *event)
+{
+ if (likely(!atomic_read(&event->mmap_count)))
+ return false;
+
+ perf_event_update_time(event);
+ perf_event_update_userpage(event);
+
+ return true;
+}
+
+static inline void group_update_userpage(struct perf_event *group_event)
+{
+ struct perf_event *event;
+
+ if (!event_update_userpage(group_event))
+ return;
+
+ for_each_sibling_event(event, group_event)
+ event_update_userpage(event);
+}
+
static int merge_sched_in(struct perf_event *event, void *data)
{
struct perf_event_context *ctx = event->ctx;
- struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
int *can_add_hw = data;
if (event->state <= PERF_EVENT_STATE_OFF)
@@ -3719,67 +3795,94 @@ static int merge_sched_in(struct perf_event *event, void *data)
if (!event_filter_match(event))
return 0;
- if (group_can_go_on(event, cpuctx, *can_add_hw)) {
- if (!group_sched_in(event, cpuctx, ctx))
+ if (group_can_go_on(event, *can_add_hw)) {
+ if (!group_sched_in(event, ctx))
list_add_tail(&event->active_list, get_event_list(event));
}
if (event->state == PERF_EVENT_STATE_INACTIVE) {
+ *can_add_hw = 0;
if (event->attr.pinned) {
perf_cgroup_event_disable(event, ctx);
perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
- }
+ } else {
+ struct perf_cpu_pmu_context *cpc;
- *can_add_hw = 0;
- ctx->rotate_necessary = 1;
- perf_mux_hrtimer_restart(cpuctx);
+ event->pmu_ctx->rotate_necessary = 1;
+ cpc = this_cpu_ptr(event->pmu_ctx->pmu->cpu_pmu_context);
+ perf_mux_hrtimer_restart(cpc);
+ group_update_userpage(event);
+ }
}
return 0;
}
-static void
-ctx_pinned_sched_in(struct perf_event_context *ctx,
- struct perf_cpu_context *cpuctx)
+static void ctx_pinned_sched_in(struct perf_event_context *ctx, struct pmu *pmu)
{
+ struct perf_event_pmu_context *pmu_ctx;
int can_add_hw = 1;
- if (ctx != &cpuctx->ctx)
- cpuctx = NULL;
-
- visit_groups_merge(cpuctx, &ctx->pinned_groups,
- smp_processor_id(),
- merge_sched_in, &can_add_hw);
+ if (pmu) {
+ visit_groups_merge(ctx, &ctx->pinned_groups,
+ smp_processor_id(), pmu,
+ merge_sched_in, &can_add_hw);
+ } else {
+ list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+ can_add_hw = 1;
+ visit_groups_merge(ctx, &ctx->pinned_groups,
+ smp_processor_id(), pmu_ctx->pmu,
+ merge_sched_in, &can_add_hw);
+ }
+ }
}
-static void
-ctx_flexible_sched_in(struct perf_event_context *ctx,
- struct perf_cpu_context *cpuctx)
+static void ctx_flexible_sched_in(struct perf_event_context *ctx, struct pmu *pmu)
{
+ struct perf_event_pmu_context *pmu_ctx;
int can_add_hw = 1;
- if (ctx != &cpuctx->ctx)
- cpuctx = NULL;
+ if (pmu) {
+ visit_groups_merge(ctx, &ctx->flexible_groups,
+ smp_processor_id(), pmu,
+ merge_sched_in, &can_add_hw);
+ } else {
+ list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+ can_add_hw = 1;
+ visit_groups_merge(ctx, &ctx->flexible_groups,
+ smp_processor_id(), pmu_ctx->pmu,
+ merge_sched_in, &can_add_hw);
+ }
+ }
+}
- visit_groups_merge(cpuctx, &ctx->flexible_groups,
- smp_processor_id(),
- merge_sched_in, &can_add_hw);
+static void __pmu_ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu)
+{
+ ctx_flexible_sched_in(ctx, pmu);
}
static void
-ctx_sched_in(struct perf_event_context *ctx,
- struct perf_cpu_context *cpuctx,
- enum event_type_t event_type,
- struct task_struct *task)
+ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type)
{
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
int is_active = ctx->is_active;
- u64 now;
lockdep_assert_held(&ctx->lock);
if (likely(!ctx->nr_events))
return;
+ if (!(is_active & EVENT_TIME)) {
+ /* start ctx time */
+ __update_context_time(ctx, false);
+ perf_cgroup_set_timestamp(cpuctx);
+ /*
+ * CPU-release for the below ->is_active store,
+ * see __load_acquire() in perf_event_time_now()
+ */
+ barrier();
+ }
+
ctx->is_active |= (event_type | EVENT_TIME);
if (ctx->task) {
if (!is_active)
@@ -3790,52 +3893,37 @@ ctx_sched_in(struct perf_event_context *ctx,
is_active ^= ctx->is_active; /* changed bits */
- if (is_active & EVENT_TIME) {
- /* start ctx time */
- now = perf_clock();
- ctx->timestamp = now;
- perf_cgroup_set_timestamp(task, ctx);
- }
-
/*
* First go through the list and put on any pinned groups
* in order to give them the best chance of going on.
*/
if (is_active & EVENT_PINNED)
- ctx_pinned_sched_in(ctx, cpuctx);
+ ctx_pinned_sched_in(ctx, NULL);
/* Then walk through the lower prio flexible groups */
if (is_active & EVENT_FLEXIBLE)
- ctx_flexible_sched_in(ctx, cpuctx);
+ ctx_flexible_sched_in(ctx, NULL);
}
-static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
- enum event_type_t event_type,
- struct task_struct *task)
+static void perf_event_context_sched_in(struct task_struct *task)
{
- struct perf_event_context *ctx = &cpuctx->ctx;
-
- ctx_sched_in(ctx, cpuctx, event_type, task);
-}
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+ struct perf_event_context *ctx;
-static void perf_event_context_sched_in(struct perf_event_context *ctx,
- struct task_struct *task)
-{
- struct perf_cpu_context *cpuctx;
- struct pmu *pmu;
+ rcu_read_lock();
+ ctx = rcu_dereference(task->perf_event_ctxp);
+ if (!ctx)
+ goto rcu_unlock;
- cpuctx = __get_cpu_context(ctx);
+ if (cpuctx->task_ctx == ctx) {
+ perf_ctx_lock(cpuctx, ctx);
+ perf_ctx_disable(ctx);
- /*
- * HACK: for HETEROGENEOUS the task context might have switched to a
- * different PMU, force (re)set the context,
- */
- pmu = ctx->pmu = cpuctx->ctx.pmu;
+ perf_ctx_sched_task_cb(ctx, true);
- if (cpuctx->task_ctx == ctx) {
- if (cpuctx->sched_cb_usage)
- __perf_pmu_sched_task(cpuctx, true);
- return;
+ perf_ctx_enable(ctx);
+ perf_ctx_unlock(cpuctx, ctx);
+ goto rcu_unlock;
}
perf_ctx_lock(cpuctx, ctx);
@@ -3846,7 +3934,7 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
if (!ctx->nr_events)
goto unlock;
- perf_pmu_disable(pmu);
+ perf_ctx_disable(ctx);
/*
* We want to keep the following priority order:
* cpu pinned (that don't need to move), task pinned,
@@ -3855,17 +3943,24 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
* However, if task's ctx is not carrying any pinned
* events, no need to flip the cpuctx's events around.
*/
- if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
- cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
- perf_event_sched_in(cpuctx, ctx, task);
+ if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) {
+ perf_ctx_disable(&cpuctx->ctx);
+ ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE);
+ }
- if (cpuctx->sched_cb_usage && pmu->sched_task)
- pmu->sched_task(cpuctx->task_ctx, true);
+ perf_event_sched_in(cpuctx, ctx);
- perf_pmu_enable(pmu);
+ perf_ctx_sched_task_cb(cpuctx->task_ctx, true);
+
+ if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
+ perf_ctx_enable(&cpuctx->ctx);
+
+ perf_ctx_enable(ctx);
unlock:
perf_ctx_unlock(cpuctx, ctx);
+rcu_unlock:
+ rcu_read_unlock();
}
/*
@@ -3882,26 +3977,7 @@ unlock:
void __perf_event_task_sched_in(struct task_struct *prev,
struct task_struct *task)
{
- struct perf_event_context *ctx;
- int ctxn;
-
- /*
- * If cgroup events exist on this CPU, then we need to check if we have
- * to switch in PMU state; cgroup event are system-wide mode only.
- *
- * Since cgroup events are CPU events, we must schedule these in before
- * we schedule in the task events.
- */
- if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
- perf_cgroup_sched_in(prev, task);
-
- for_each_task_context_nr(ctxn) {
- ctx = task->perf_event_ctxp[ctxn];
- if (likely(!ctx))
- continue;
-
- perf_event_context_sched_in(ctx, task);
- }
+ perf_event_context_sched_in(task);
if (atomic_read(&nr_switch_events))
perf_event_switch(task, prev, true);
@@ -4020,8 +4096,8 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bo
* events. At the same time, make sure, having freq events does not change
* the rate of unthrottling as that would introduce bias.
*/
-static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
- int needs_unthr)
+static void
+perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
{
struct perf_event *event;
struct hw_perf_event *hwc;
@@ -4033,16 +4109,16 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
* - context have events in frequency mode (needs freq adjust)
* - there are events to unthrottle on this cpu
*/
- if (!(ctx->nr_freq || needs_unthr))
+ if (!(ctx->nr_freq || unthrottle))
return;
raw_spin_lock(&ctx->lock);
- perf_pmu_disable(ctx->pmu);
list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
if (event->state != PERF_EVENT_STATE_ACTIVE)
continue;
+ // XXX use visit thingy to avoid the -1,cpu match
if (!event_filter_match(event))
continue;
@@ -4083,7 +4159,6 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
perf_pmu_enable(event->pmu);
}
- perf_pmu_enable(ctx->pmu);
raw_spin_unlock(&ctx->lock);
}
@@ -4105,72 +4180,109 @@ static void rotate_ctx(struct perf_event_context *ctx, struct perf_event *event)
/* pick an event from the flexible_groups to rotate */
static inline struct perf_event *
-ctx_event_to_rotate(struct perf_event_context *ctx)
+ctx_event_to_rotate(struct perf_event_pmu_context *pmu_ctx)
{
struct perf_event *event;
+ struct rb_node *node;
+ struct rb_root *tree;
+ struct __group_key key = {
+ .pmu = pmu_ctx->pmu,
+ };
/* pick the first active flexible event */
- event = list_first_entry_or_null(&ctx->flexible_active,
+ event = list_first_entry_or_null(&pmu_ctx->flexible_active,
struct perf_event, active_list);
+ if (event)
+ goto out;
/* if no active flexible event, pick the first event */
- if (!event) {
- event = rb_entry_safe(rb_first(&ctx->flexible_groups.tree),
- typeof(*event), group_node);
+ tree = &pmu_ctx->ctx->flexible_groups.tree;
+
+ if (!pmu_ctx->ctx->task) {
+ key.cpu = smp_processor_id();
+
+ node = rb_find_first(&key, tree, __group_cmp_ignore_cgroup);
+ if (node)
+ event = __node_2_pe(node);
+ goto out;
+ }
+
+ key.cpu = -1;
+ node = rb_find_first(&key, tree, __group_cmp_ignore_cgroup);
+ if (node) {
+ event = __node_2_pe(node);
+ goto out;
}
+ key.cpu = smp_processor_id();
+ node = rb_find_first(&key, tree, __group_cmp_ignore_cgroup);
+ if (node)
+ event = __node_2_pe(node);
+
+out:
/*
* Unconditionally clear rotate_necessary; if ctx_flexible_sched_in()
* finds there are unschedulable events, it will set it again.
*/
- ctx->rotate_necessary = 0;
+ pmu_ctx->rotate_necessary = 0;
return event;
}
-static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
+static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc)
{
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+ struct perf_event_pmu_context *cpu_epc, *task_epc = NULL;
struct perf_event *cpu_event = NULL, *task_event = NULL;
- struct perf_event_context *task_ctx = NULL;
int cpu_rotate, task_rotate;
+ struct pmu *pmu;
/*
* Since we run this from IRQ context, nobody can install new
* events, thus the event count values are stable.
*/
- cpu_rotate = cpuctx->ctx.rotate_necessary;
- task_ctx = cpuctx->task_ctx;
- task_rotate = task_ctx ? task_ctx->rotate_necessary : 0;
+ cpu_epc = &cpc->epc;
+ pmu = cpu_epc->pmu;
+ task_epc = cpc->task_epc;
+
+ cpu_rotate = cpu_epc->rotate_necessary;
+ task_rotate = task_epc ? task_epc->rotate_necessary : 0;
if (!(cpu_rotate || task_rotate))
return false;
perf_ctx_lock(cpuctx, cpuctx->task_ctx);
- perf_pmu_disable(cpuctx->ctx.pmu);
+ perf_pmu_disable(pmu);
if (task_rotate)
- task_event = ctx_event_to_rotate(task_ctx);
+ task_event = ctx_event_to_rotate(task_epc);
if (cpu_rotate)
- cpu_event = ctx_event_to_rotate(&cpuctx->ctx);
+ cpu_event = ctx_event_to_rotate(cpu_epc);
/*
* As per the order given at ctx_resched() first 'pop' task flexible
* and then, if needed CPU flexible.
*/
- if (task_event || (task_ctx && cpu_event))
- ctx_sched_out(task_ctx, cpuctx, EVENT_FLEXIBLE);
- if (cpu_event)
- cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
+ if (task_event || (task_epc && cpu_event)) {
+ update_context_time(task_epc->ctx);
+ __pmu_ctx_sched_out(task_epc, EVENT_FLEXIBLE);
+ }
- if (task_event)
- rotate_ctx(task_ctx, task_event);
- if (cpu_event)
+ if (cpu_event) {
+ update_context_time(&cpuctx->ctx);
+ __pmu_ctx_sched_out(cpu_epc, EVENT_FLEXIBLE);
rotate_ctx(&cpuctx->ctx, cpu_event);
+ __pmu_ctx_sched_in(&cpuctx->ctx, pmu);
+ }
+
+ if (task_event)
+ rotate_ctx(task_epc->ctx, task_event);
- perf_event_sched_in(cpuctx, task_ctx, current);
+ if (task_event || (task_epc && cpu_event))
+ __pmu_ctx_sched_in(task_epc->ctx, pmu);
- perf_pmu_enable(cpuctx->ctx.pmu);
+ perf_pmu_enable(pmu);
perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
return true;
@@ -4178,8 +4290,8 @@ static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
void perf_event_task_tick(void)
{
- struct list_head *head = this_cpu_ptr(&active_ctx_list);
- struct perf_event_context *ctx, *tmp;
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+ struct perf_event_context *ctx;
int throttled;
lockdep_assert_irqs_disabled();
@@ -4188,8 +4300,13 @@ void perf_event_task_tick(void)
throttled = __this_cpu_xchg(perf_throttled_count, 0);
tick_dep_clear_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
- list_for_each_entry_safe(ctx, tmp, head, active_ctx_list)
- perf_adjust_freq_unthr_context(ctx, throttled);
+ perf_adjust_freq_unthr_context(&cpuctx->ctx, !!throttled);
+
+ rcu_read_lock();
+ ctx = rcu_dereference(current->perf_event_ctxp);
+ if (ctx)
+ perf_adjust_freq_unthr_context(ctx, !!throttled);
+ rcu_read_unlock();
}
static int event_enable_on_exec(struct perf_event *event,
@@ -4211,9 +4328,9 @@ static int event_enable_on_exec(struct perf_event *event,
* Enable all of a task's events that have been marked enable-on-exec.
* This expects task == current.
*/
-static void perf_event_enable_on_exec(int ctxn)
+static void perf_event_enable_on_exec(struct perf_event_context *ctx)
{
- struct perf_event_context *ctx, *clone_ctx = NULL;
+ struct perf_event_context *clone_ctx = NULL;
enum event_type_t event_type = 0;
struct perf_cpu_context *cpuctx;
struct perf_event *event;
@@ -4221,13 +4338,16 @@ static void perf_event_enable_on_exec(int ctxn)
int enabled = 0;
local_irq_save(flags);
- ctx = current->perf_event_ctxp[ctxn];
- if (!ctx || !ctx->nr_events)
+ if (WARN_ON_ONCE(current->perf_event_ctxp != ctx))
goto out;
- cpuctx = __get_cpu_context(ctx);
+ if (!ctx->nr_events)
+ goto out;
+
+ cpuctx = this_cpu_ptr(&perf_cpu_context);
perf_ctx_lock(cpuctx, ctx);
- ctx_sched_out(ctx, cpuctx, EVENT_TIME);
+ ctx_sched_out(ctx, EVENT_TIME);
+
list_for_each_entry(event, &ctx->event_list, event_entry) {
enabled |= event_enable_on_exec(event, ctx);
event_type |= get_event_type(event);
@@ -4240,7 +4360,7 @@ static void perf_event_enable_on_exec(int ctxn)
clone_ctx = unclone_ctx(ctx);
ctx_resched(cpuctx, ctx, event_type);
} else {
- ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
+ ctx_sched_in(ctx, EVENT_TIME);
}
perf_ctx_unlock(cpuctx, ctx);
@@ -4259,18 +4379,13 @@ static void perf_event_exit_event(struct perf_event *event,
* Removes all events from the current task that have been marked
* remove-on-exec, and feeds their values back to parent events.
*/
-static void perf_event_remove_on_exec(int ctxn)
+static void perf_event_remove_on_exec(struct perf_event_context *ctx)
{
- struct perf_event_context *ctx, *clone_ctx = NULL;
+ struct perf_event_context *clone_ctx = NULL;
struct perf_event *event, *next;
- LIST_HEAD(free_list);
unsigned long flags;
bool modified = false;
- ctx = perf_pin_task_context(current, ctxn);
- if (!ctx)
- return;
-
mutex_lock(&ctx->mutex);
if (WARN_ON_ONCE(ctx->task != current))
@@ -4291,13 +4406,11 @@ static void perf_event_remove_on_exec(int ctxn)
raw_spin_lock_irqsave(&ctx->lock, flags);
if (modified)
clone_ctx = unclone_ctx(ctx);
- --ctx->pin_count;
raw_spin_unlock_irqrestore(&ctx->lock, flags);
unlock:
mutex_unlock(&ctx->mutex);
- put_ctx(ctx);
if (clone_ctx)
put_ctx(clone_ctx);
}
@@ -4333,7 +4446,7 @@ static void __perf_event_read(void *info)
struct perf_read_data *data = info;
struct perf_event *sub, *event = data->event;
struct perf_event_context *ctx = event->ctx;
- struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
struct pmu *pmu = event->pmu;
/*
@@ -4390,6 +4503,18 @@ static inline u64 perf_event_count(struct perf_event *event)
return local64_read(&event->count) + atomic64_read(&event->child_count);
}
+static void calc_timer_values(struct perf_event *event,
+ u64 *now,
+ u64 *enabled,
+ u64 *running)
+{
+ u64 ctx_time;
+
+ *now = perf_clock();
+ ctx_time = perf_event_time_now(event, *now);
+ __perf_update_times(event, ctx_time, enabled, running);
+}
+
/*
* NMI-safe method to read a local event, that is an event that
* is:
@@ -4449,10 +4574,9 @@ int perf_event_read_local(struct perf_event *event, u64 *value,
*value = local64_read(&event->count);
if (enabled || running) {
- u64 now = event->shadow_ctx_time + perf_clock();
- u64 __enabled, __running;
+ u64 __enabled, __running, __now;
- __perf_update_times(event, now, &__enabled, &__running);
+ calc_timer_values(event, &__now, &__enabled, &__running);
if (enabled)
*enabled = __enabled;
if (running)
@@ -4548,17 +4672,25 @@ static void __perf_event_init_context(struct perf_event_context *ctx)
{
raw_spin_lock_init(&ctx->lock);
mutex_init(&ctx->mutex);
- INIT_LIST_HEAD(&ctx->active_ctx_list);
+ INIT_LIST_HEAD(&ctx->pmu_ctx_list);
perf_event_groups_init(&ctx->pinned_groups);
perf_event_groups_init(&ctx->flexible_groups);
INIT_LIST_HEAD(&ctx->event_list);
- INIT_LIST_HEAD(&ctx->pinned_active);
- INIT_LIST_HEAD(&ctx->flexible_active);
refcount_set(&ctx->refcount, 1);
}
+static void
+__perf_init_event_pmu_context(struct perf_event_pmu_context *epc, struct pmu *pmu)
+{
+ epc->pmu = pmu;
+ INIT_LIST_HEAD(&epc->pmu_ctx_entry);
+ INIT_LIST_HEAD(&epc->pinned_active);
+ INIT_LIST_HEAD(&epc->flexible_active);
+ atomic_set(&epc->refcount, 1);
+}
+
static struct perf_event_context *
-alloc_perf_context(struct pmu *pmu, struct task_struct *task)
+alloc_perf_context(struct task_struct *task)
{
struct perf_event_context *ctx;
@@ -4569,7 +4701,6 @@ alloc_perf_context(struct pmu *pmu, struct task_struct *task)
__perf_event_init_context(ctx);
if (task)
ctx->task = get_task_struct(task);
- ctx->pmu = pmu;
return ctx;
}
@@ -4598,15 +4729,12 @@ find_lively_task_by_vpid(pid_t vpid)
* Returns a matching context with refcount and pincount.
*/
static struct perf_event_context *
-find_get_context(struct pmu *pmu, struct task_struct *task,
- struct perf_event *event)
+find_get_context(struct task_struct *task, struct perf_event *event)
{
struct perf_event_context *ctx, *clone_ctx = NULL;
struct perf_cpu_context *cpuctx;
- void *task_ctx_data = NULL;
unsigned long flags;
- int ctxn, err;
- int cpu = event->cpu;
+ int err;
if (!task) {
/* Must be root to operate on a CPU event: */
@@ -4614,7 +4742,7 @@ find_get_context(struct pmu *pmu, struct task_struct *task,
if (err)
return ERR_PTR(err);
- cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
+ cpuctx = per_cpu_ptr(&perf_cpu_context, event->cpu);
ctx = &cpuctx->ctx;
get_ctx(ctx);
raw_spin_lock_irqsave(&ctx->lock, flags);
@@ -4625,43 +4753,22 @@ find_get_context(struct pmu *pmu, struct task_struct *task,
}
err = -EINVAL;
- ctxn = pmu->task_ctx_nr;
- if (ctxn < 0)
- goto errout;
-
- if (event->attach_state & PERF_ATTACH_TASK_DATA) {
- task_ctx_data = alloc_task_ctx_data(pmu);
- if (!task_ctx_data) {
- err = -ENOMEM;
- goto errout;
- }
- }
-
retry:
- ctx = perf_lock_task_context(task, ctxn, &flags);
+ ctx = perf_lock_task_context(task, &flags);
if (ctx) {
clone_ctx = unclone_ctx(ctx);
++ctx->pin_count;
- if (task_ctx_data && !ctx->task_ctx_data) {
- ctx->task_ctx_data = task_ctx_data;
- task_ctx_data = NULL;
- }
raw_spin_unlock_irqrestore(&ctx->lock, flags);
if (clone_ctx)
put_ctx(clone_ctx);
} else {
- ctx = alloc_perf_context(pmu, task);
+ ctx = alloc_perf_context(task);
err = -ENOMEM;
if (!ctx)
goto errout;
- if (task_ctx_data) {
- ctx->task_ctx_data = task_ctx_data;
- task_ctx_data = NULL;
- }
-
err = 0;
mutex_lock(&task->perf_event_mutex);
/*
@@ -4670,12 +4777,12 @@ retry:
*/
if (task->flags & PF_EXITING)
err = -ESRCH;
- else if (task->perf_event_ctxp[ctxn])
+ else if (task->perf_event_ctxp)
err = -EAGAIN;
else {
get_ctx(ctx);
++ctx->pin_count;
- rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
+ rcu_assign_pointer(task->perf_event_ctxp, ctx);
}
mutex_unlock(&task->perf_event_mutex);
@@ -4688,22 +4795,141 @@ retry:
}
}
- free_task_ctx_data(pmu, task_ctx_data);
return ctx;
errout:
- free_task_ctx_data(pmu, task_ctx_data);
return ERR_PTR(err);
}
+static struct perf_event_pmu_context *
+find_get_pmu_context(struct pmu *pmu, struct perf_event_context *ctx,
+ struct perf_event *event)
+{
+ struct perf_event_pmu_context *new = NULL, *epc;
+ void *task_ctx_data = NULL;
+
+ if (!ctx->task) {
+ struct perf_cpu_pmu_context *cpc;
+
+ cpc = per_cpu_ptr(pmu->cpu_pmu_context, event->cpu);
+ epc = &cpc->epc;
+ raw_spin_lock_irq(&ctx->lock);
+ if (!epc->ctx) {
+ atomic_set(&epc->refcount, 1);
+ epc->embedded = 1;
+ list_add(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list);
+ epc->ctx = ctx;
+ } else {
+ WARN_ON_ONCE(epc->ctx != ctx);
+ atomic_inc(&epc->refcount);
+ }
+ raw_spin_unlock_irq(&ctx->lock);
+ return epc;
+ }
+
+ new = kzalloc(sizeof(*epc), GFP_KERNEL);
+ if (!new)
+ return ERR_PTR(-ENOMEM);
+
+ if (event->attach_state & PERF_ATTACH_TASK_DATA) {
+ task_ctx_data = alloc_task_ctx_data(pmu);
+ if (!task_ctx_data) {
+ kfree(new);
+ return ERR_PTR(-ENOMEM);
+ }
+ }
+
+ __perf_init_event_pmu_context(new, pmu);
+
+ /*
+ * XXX
+ *
+ * lockdep_assert_held(&ctx->mutex);
+ *
+ * can't because perf_event_init_task() doesn't actually hold the
+ * child_ctx->mutex.
+ */
+
+ raw_spin_lock_irq(&ctx->lock);
+ list_for_each_entry(epc, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+ if (epc->pmu == pmu) {
+ WARN_ON_ONCE(epc->ctx != ctx);
+ atomic_inc(&epc->refcount);
+ goto found_epc;
+ }
+ }
+
+ epc = new;
+ new = NULL;
+
+ list_add(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list);
+ epc->ctx = ctx;
+
+found_epc:
+ if (task_ctx_data && !epc->task_ctx_data) {
+ epc->task_ctx_data = task_ctx_data;
+ task_ctx_data = NULL;
+ ctx->nr_task_data++;
+ }
+ raw_spin_unlock_irq(&ctx->lock);
+
+ free_task_ctx_data(pmu, task_ctx_data);
+ kfree(new);
+
+ return epc;
+}
+
+static void get_pmu_ctx(struct perf_event_pmu_context *epc)
+{
+ WARN_ON_ONCE(!atomic_inc_not_zero(&epc->refcount));
+}
+
+static void free_epc_rcu(struct rcu_head *head)
+{
+ struct perf_event_pmu_context *epc = container_of(head, typeof(*epc), rcu_head);
+
+ kfree(epc->task_ctx_data);
+ kfree(epc);
+}
+
+static void put_pmu_ctx(struct perf_event_pmu_context *epc)
+{
+ struct perf_event_context *ctx = epc->ctx;
+ unsigned long flags;
+
+ /*
+ * XXX
+ *
+ * lockdep_assert_held(&ctx->mutex);
+ *
+ * can't because of the call-site in _free_event()/put_event()
+ * which isn't always called under ctx->mutex.
+ */
+ if (!atomic_dec_and_raw_lock_irqsave(&epc->refcount, &ctx->lock, flags))
+ return;
+
+ WARN_ON_ONCE(list_empty(&epc->pmu_ctx_entry));
+
+ list_del_init(&epc->pmu_ctx_entry);
+ epc->ctx = NULL;
+
+ WARN_ON_ONCE(!list_empty(&epc->pinned_active));
+ WARN_ON_ONCE(!list_empty(&epc->flexible_active));
+
+ raw_spin_unlock_irqrestore(&ctx->lock, flags);
+
+ if (epc->embedded)
+ return;
+
+ call_rcu(&epc->rcu_head, free_epc_rcu);
+}
+
static void perf_event_free_filter(struct perf_event *event);
-static void perf_event_free_bpf_prog(struct perf_event *event);
static void free_event_rcu(struct rcu_head *head)
{
- struct perf_event *event;
+ struct perf_event *event = container_of(head, typeof(*event), rcu_head);
- event = container_of(head, struct perf_event, rcu_head);
if (event->ns)
put_pid_ns(event->ns);
perf_event_free_filter(event);
@@ -4747,15 +4973,6 @@ static void unaccount_pmu_sb_event(struct perf_event *event)
detach_sb_event(event);
}
-static void unaccount_event_cpu(struct perf_event *event, int cpu)
-{
- if (event->parent)
- return;
-
- if (is_cgroup_event(event))
- atomic_dec(&per_cpu(perf_cgroup_events, cpu));
-}
-
#ifdef CONFIG_NO_HZ_FULL
static DEFINE_SPINLOCK(nr_freq_lock);
#endif
@@ -4821,8 +5038,6 @@ static void unaccount_event(struct perf_event *event)
schedule_delayed_work(&perf_sched_work, HZ);
}
- unaccount_event_cpu(event, event->cpu);
-
unaccount_pmu_sb_event(event);
}
@@ -4841,7 +5056,7 @@ static void perf_sched_delayed(struct work_struct *work)
*
* 1) cpu-wide events in the presence of per-task events,
* 2) per-task events in the presence of cpu-wide events,
- * 3) two matching events on the same context.
+ * 3) two matching events on the same perf_event_context.
*
* The former two cases are handled in the allocation path (perf_event_alloc(),
* _free_event()), the latter -- before the first perf_install_in_context().
@@ -4925,7 +5140,7 @@ static void perf_addr_filters_splice(struct perf_event *event,
static void _free_event(struct perf_event *event)
{
- irq_work_sync(&event->pending);
+ irq_work_sync(&event->pending_irq);
unaccount_event(event);
@@ -4965,6 +5180,9 @@ static void _free_event(struct perf_event *event)
if (event->hw.target)
put_task_struct(event->hw.target);
+ if (event->pmu_ctx)
+ put_pmu_ctx(event->pmu_ctx);
+
/*
* perf_event_free_task() relies on put_ctx() being 'last', in particular
* all task references must be cleaned up.
@@ -5065,8 +5283,8 @@ int perf_event_release_kernel(struct perf_event *event)
LIST_HEAD(free_list);
/*
- * If we got here through err_file: fput(event_file); we will not have
- * attached to a context yet.
+ * If we got here through err_alloc: free_event(event); we will not
+ * have attached to a context yet.
*/
if (!ctx) {
WARN_ON_ONCE(event->attach_state &
@@ -5079,9 +5297,7 @@ int perf_event_release_kernel(struct perf_event *event)
ctx = perf_event_ctx_lock(event);
WARN_ON_ONCE(ctx->parent_ctx);
- perf_remove_from_context(event, DETACH_GROUP);
- raw_spin_lock_irq(&ctx->lock);
/*
* Mark this event as STATE_DEAD, there is no external reference to it
* anymore.
@@ -5093,8 +5309,7 @@ int perf_event_release_kernel(struct perf_event *event)
* Thus this guarantees that we will in fact observe and kill _ALL_
* child events.
*/
- event->state = PERF_EVENT_STATE_DEAD;
- raw_spin_unlock_irq(&ctx->lock);
+ perf_remove_from_context(event, DETACH_GROUP|DETACH_DEAD);
perf_event_ctx_unlock(event, ctx);
@@ -5257,11 +5472,15 @@ static int __perf_read_group_add(struct perf_event *leader,
values[n++] += perf_event_count(leader);
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(leader);
+ if (read_format & PERF_FORMAT_LOST)
+ values[n++] = atomic64_read(&leader->lost_samples);
for_each_sibling_event(sub, leader) {
values[n++] += perf_event_count(sub);
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(sub);
+ if (read_format & PERF_FORMAT_LOST)
+ values[n++] = atomic64_read(&sub->lost_samples);
}
raw_spin_unlock_irqrestore(&ctx->lock, flags);
@@ -5318,7 +5537,7 @@ static int perf_read_one(struct perf_event *event,
u64 read_format, char __user *buf)
{
u64 enabled, running;
- u64 values[4];
+ u64 values[5];
int n = 0;
values[n++] = __perf_event_read_value(event, &enabled, &running);
@@ -5328,6 +5547,8 @@ static int perf_read_one(struct perf_event *event,
values[n++] = running;
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(event);
+ if (read_format & PERF_FORMAT_LOST)
+ values[n++] = atomic64_read(&event->lost_samples);
if (copy_to_user(buf, values, n * sizeof(u64)))
return -EFAULT;
@@ -5495,7 +5716,7 @@ static void __perf_event_period(struct perf_event *event,
active = (event->state == PERF_EVENT_STATE_ACTIVE);
if (active) {
- perf_pmu_disable(ctx->pmu);
+ perf_pmu_disable(event->pmu);
/*
* We could be throttled; unthrottle now to avoid the tick
* trying to unthrottle while we already re-started the event.
@@ -5511,7 +5732,7 @@ static void __perf_event_period(struct perf_event *event,
if (active) {
event->pmu->start(event, PERF_EF_RELOAD);
- perf_pmu_enable(ctx->pmu);
+ perf_pmu_enable(event->pmu);
}
}
@@ -5574,7 +5795,6 @@ static inline int perf_fget_light(int fd, struct fd *p)
static int perf_event_set_output(struct perf_event *event,
struct perf_event *output_event);
static int perf_event_set_filter(struct perf_event *event, void __user *arg);
-static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd);
static int perf_copy_attr(struct perf_event_attr __user *uattr,
struct perf_event_attr *attr);
@@ -5637,7 +5857,22 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon
return perf_event_set_filter(event, (void __user *)arg);
case PERF_EVENT_IOC_SET_BPF:
- return perf_event_set_bpf_prog(event, arg);
+ {
+ struct bpf_prog *prog;
+ int err;
+
+ prog = bpf_prog_get(arg);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ err = perf_event_set_bpf_prog(event, prog, 0);
+ if (err) {
+ bpf_prog_put(prog);
+ return err;
+ }
+
+ return 0;
+ }
case PERF_EVENT_IOC_PAUSE_OUTPUT: {
struct perf_buffer *rb;
@@ -5761,18 +5996,6 @@ static int perf_event_index(struct perf_event *event)
return event->pmu->event_idx(event);
}
-static void calc_timer_values(struct perf_event *event,
- u64 *now,
- u64 *enabled,
- u64 *running)
-{
- u64 ctx_time;
-
- *now = perf_clock();
- ctx_time = event->shadow_ctx_time + *now;
- __perf_update_times(event, ctx_time, enabled, running);
-}
-
static void perf_event_init_userpage(struct perf_event *event)
{
struct perf_event_mmap_page *userpg;
@@ -5897,6 +6120,8 @@ static void ring_buffer_attach(struct perf_event *event,
struct perf_buffer *old_rb = NULL;
unsigned long flags;
+ WARN_ON_ONCE(event->parent);
+
if (event->rb) {
/*
* Should be impossible, we set this when removing
@@ -5954,6 +6179,9 @@ static void ring_buffer_wakeup(struct perf_event *event)
{
struct perf_buffer *rb;
+ if (event->parent)
+ event = event->parent;
+
rcu_read_lock();
rb = rcu_dereference(event->rb);
if (rb) {
@@ -5967,6 +6195,9 @@ struct perf_buffer *ring_buffer_get(struct perf_event *event)
{
struct perf_buffer *rb;
+ if (event->parent)
+ event = event->parent;
+
rcu_read_lock();
rb = rcu_dereference(event->rb);
if (rb) {
@@ -6233,17 +6464,17 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
again:
mutex_lock(&event->mmap_mutex);
if (event->rb) {
- if (event->rb->nr_pages != nr_pages) {
+ if (data_page_nr(event->rb) != nr_pages) {
ret = -EINVAL;
goto unlock;
}
if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
/*
- * Raced against perf_mmap_close() through
- * perf_event_set_output(). Try again, hope for better
- * luck.
+ * Raced against perf_mmap_close(); remove the
+ * event and try again.
*/
+ ring_buffer_attach(event, NULL);
mutex_unlock(&event->mmap_mutex);
goto again;
}
@@ -6311,6 +6542,7 @@ accounting:
ring_buffer_attach(event, rb);
+ perf_event_update_time(event);
perf_event_init_userpage(event);
perf_event_update_userpage(event);
} else {
@@ -6336,7 +6568,7 @@ aux_unlock:
* Since pinned accounting is per vm we cannot allow fork() to copy our
* vma.
*/
- vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
+ vm_flags_set(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP);
vma->vm_ops = &perf_mmap_vmops;
if (event->pmu->event_mapped)
@@ -6408,32 +6640,43 @@ static void perf_sigtrap(struct perf_event *event)
return;
/*
- * perf_pending_event() can race with the task exiting.
+ * Both perf_pending_task() and perf_pending_irq() can race with the
+ * task exiting.
*/
if (current->flags & PF_EXITING)
return;
- force_sig_perf((void __user *)event->pending_addr,
- event->attr.type, event->attr.sig_data);
+ send_sig_perf((void __user *)event->pending_addr,
+ event->attr.type, event->attr.sig_data);
}
-static void perf_pending_event_disable(struct perf_event *event)
+/*
+ * Deliver the pending work in-event-context or follow the context.
+ */
+static void __perf_pending_irq(struct perf_event *event)
{
- int cpu = READ_ONCE(event->pending_disable);
+ int cpu = READ_ONCE(event->oncpu);
+ /*
+ * If the event isn't running; we done. event_sched_out() will have
+ * taken care of things.
+ */
if (cpu < 0)
return;
+ /*
+ * Yay, we hit home and are in the context of the event.
+ */
if (cpu == smp_processor_id()) {
- WRITE_ONCE(event->pending_disable, -1);
-
- if (event->attr.sigtrap) {
+ if (event->pending_sigtrap) {
+ event->pending_sigtrap = 0;
perf_sigtrap(event);
- atomic_set_release(&event->event_limit, 1); /* rearm event */
- return;
+ local_dec(&event->ctx->nr_pending);
+ }
+ if (event->pending_disable) {
+ event->pending_disable = 0;
+ perf_event_disable_local(event);
}
-
- perf_event_disable_local(event);
return;
}
@@ -6453,55 +6696,101 @@ static void perf_pending_event_disable(struct perf_event *event)
* irq_work_queue(); // FAILS
*
* irq_work_run()
- * perf_pending_event()
+ * perf_pending_irq()
*
* But the event runs on CPU-B and wants disabling there.
*/
- irq_work_queue_on(&event->pending, cpu);
+ irq_work_queue_on(&event->pending_irq, cpu);
}
-static void perf_pending_event(struct irq_work *entry)
+static void perf_pending_irq(struct irq_work *entry)
{
- struct perf_event *event = container_of(entry, struct perf_event, pending);
+ struct perf_event *event = container_of(entry, struct perf_event, pending_irq);
int rctx;
- rctx = perf_swevent_get_recursion_context();
/*
* If we 'fail' here, that's OK, it means recursion is already disabled
* and we won't recurse 'further'.
*/
+ rctx = perf_swevent_get_recursion_context();
- perf_pending_event_disable(event);
-
+ /*
+ * The wakeup isn't bound to the context of the event -- it can happen
+ * irrespective of where the event is.
+ */
if (event->pending_wakeup) {
event->pending_wakeup = 0;
perf_event_wakeup(event);
}
+ __perf_pending_irq(event);
+
if (rctx >= 0)
perf_swevent_put_recursion_context(rctx);
}
-/*
- * We assume there is only KVM supporting the callbacks.
- * Later on, we might change it to a list if there is
- * another virtualization implementation supporting the callbacks.
- */
-struct perf_guest_info_callbacks *perf_guest_cbs;
+static void perf_pending_task(struct callback_head *head)
+{
+ struct perf_event *event = container_of(head, struct perf_event, pending_task);
+ int rctx;
+
+ /*
+ * If we 'fail' here, that's OK, it means recursion is already disabled
+ * and we won't recurse 'further'.
+ */
+ preempt_disable_notrace();
+ rctx = perf_swevent_get_recursion_context();
+
+ if (event->pending_work) {
+ event->pending_work = 0;
+ perf_sigtrap(event);
+ local_dec(&event->ctx->nr_pending);
+ }
+
+ if (rctx >= 0)
+ perf_swevent_put_recursion_context(rctx);
+ preempt_enable_notrace();
-int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
+ put_event(event);
+}
+
+#ifdef CONFIG_GUEST_PERF_EVENTS
+struct perf_guest_info_callbacks __rcu *perf_guest_cbs;
+
+DEFINE_STATIC_CALL_RET0(__perf_guest_state, *perf_guest_cbs->state);
+DEFINE_STATIC_CALL_RET0(__perf_guest_get_ip, *perf_guest_cbs->get_ip);
+DEFINE_STATIC_CALL_RET0(__perf_guest_handle_intel_pt_intr, *perf_guest_cbs->handle_intel_pt_intr);
+
+void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
{
- perf_guest_cbs = cbs;
- return 0;
+ if (WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs)))
+ return;
+
+ rcu_assign_pointer(perf_guest_cbs, cbs);
+ static_call_update(__perf_guest_state, cbs->state);
+ static_call_update(__perf_guest_get_ip, cbs->get_ip);
+
+ /* Implementing ->handle_intel_pt_intr is optional. */
+ if (cbs->handle_intel_pt_intr)
+ static_call_update(__perf_guest_handle_intel_pt_intr,
+ cbs->handle_intel_pt_intr);
}
EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
-int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
+void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
{
- perf_guest_cbs = NULL;
- return 0;
+ if (WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs) != cbs))
+ return;
+
+ rcu_assign_pointer(perf_guest_cbs, NULL);
+ static_call_update(__perf_guest_state, (void *)&__static_call_return0);
+ static_call_update(__perf_guest_get_ip, (void *)&__static_call_return0);
+ static_call_update(__perf_guest_handle_intel_pt_intr,
+ (void *)&__static_call_return0);
+ synchronize_rcu();
}
EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
+#endif
static void
perf_output_sample_regs(struct perf_output_handle *handle,
@@ -6609,7 +6898,6 @@ perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
unsigned long sp;
unsigned int rem;
u64 dyn_size;
- mm_segment_t fs;
/*
* We dump:
@@ -6627,9 +6915,7 @@ perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
/* Data. */
sp = perf_user_stack_pointer(regs);
- fs = force_uaccess_begin();
rem = __output_copy_user(handle, (void *) sp, dump_size);
- force_uaccess_end(fs);
dyn_size = dump_size - rem;
perf_output_skip(handle, rem);
@@ -6657,7 +6943,7 @@ static unsigned long perf_prepare_sample_aux(struct perf_event *event,
if (WARN_ON_ONCE(READ_ONCE(sampler->oncpu) != smp_processor_id()))
goto out;
- rb = ring_buffer_get(sampler->parent ? sampler->parent : sampler);
+ rb = ring_buffer_get(sampler);
if (!rb)
goto out;
@@ -6723,7 +7009,7 @@ static void perf_aux_sample_output(struct perf_event *event,
if (WARN_ON_ONCE(!sampler || !data->aux_size))
return;
- rb = ring_buffer_get(sampler->parent ? sampler->parent : sampler);
+ rb = ring_buffer_get(sampler);
if (!rb)
return;
@@ -6755,14 +7041,20 @@ out_put:
ring_buffer_put(rb);
}
-static void __perf_event_header__init_id(struct perf_event_header *header,
- struct perf_sample_data *data,
- struct perf_event *event)
-{
- u64 sample_type = event->attr.sample_type;
+/*
+ * A set of common sample data types saved even for non-sample records
+ * when event->attr.sample_id_all is set.
+ */
+#define PERF_SAMPLE_ID_ALL (PERF_SAMPLE_TID | PERF_SAMPLE_TIME | \
+ PERF_SAMPLE_ID | PERF_SAMPLE_STREAM_ID | \
+ PERF_SAMPLE_CPU | PERF_SAMPLE_IDENTIFIER)
- data->type = sample_type;
- header->size += event->id_header_size;
+static void __perf_event_header__init_id(struct perf_sample_data *data,
+ struct perf_event *event,
+ u64 sample_type)
+{
+ data->type = event->attr.sample_type;
+ data->sample_flags |= data->type & PERF_SAMPLE_ID_ALL;
if (sample_type & PERF_SAMPLE_TID) {
/* namespace issues */
@@ -6789,8 +7081,10 @@ void perf_event_header__init_id(struct perf_event_header *header,
struct perf_sample_data *data,
struct perf_event *event)
{
- if (event->attr.sample_id_all)
- __perf_event_header__init_id(header, data, event);
+ if (event->attr.sample_id_all) {
+ header->size += event->id_header_size;
+ __perf_event_header__init_id(data, event, event->attr.sample_type);
+ }
}
static void __perf_event__output_id_sample(struct perf_output_handle *handle,
@@ -6830,7 +7124,7 @@ static void perf_output_read_one(struct perf_output_handle *handle,
u64 enabled, u64 running)
{
u64 read_format = event->attr.read_format;
- u64 values[4];
+ u64 values[5];
int n = 0;
values[n++] = perf_event_count(event);
@@ -6844,6 +7138,8 @@ static void perf_output_read_one(struct perf_output_handle *handle,
}
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(event);
+ if (read_format & PERF_FORMAT_LOST)
+ values[n++] = atomic64_read(&event->lost_samples);
__output_copy(handle, values, n * sizeof(u64));
}
@@ -6854,9 +7150,16 @@ static void perf_output_read_group(struct perf_output_handle *handle,
{
struct perf_event *leader = event->group_leader, *sub;
u64 read_format = event->attr.read_format;
- u64 values[5];
+ unsigned long flags;
+ u64 values[6];
int n = 0;
+ /*
+ * Disabling interrupts avoids all counter scheduling
+ * (context switches, timer based rotation and IPIs).
+ */
+ local_irq_save(flags);
+
values[n++] = 1 + leader->nr_siblings;
if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
@@ -6872,6 +7175,8 @@ static void perf_output_read_group(struct perf_output_handle *handle,
values[n++] = perf_event_count(leader);
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(leader);
+ if (read_format & PERF_FORMAT_LOST)
+ values[n++] = atomic64_read(&leader->lost_samples);
__output_copy(handle, values, n * sizeof(u64));
@@ -6885,9 +7190,13 @@ static void perf_output_read_group(struct perf_output_handle *handle,
values[n++] = perf_event_count(sub);
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(sub);
+ if (read_format & PERF_FORMAT_LOST)
+ values[n++] = atomic64_read(&sub->lost_samples);
__output_copy(handle, values, n * sizeof(u64));
}
+
+ local_irq_restore(flags);
}
#define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\
@@ -6924,11 +7233,6 @@ static void perf_output_read(struct perf_output_handle *handle,
perf_output_read_one(handle, event, enabled, running);
}
-static inline bool perf_sample_save_hw_index(struct perf_event *event)
-{
- return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_HW_INDEX;
-}
-
void perf_output_sample(struct perf_output_handle *handle,
struct perf_event_header *header,
struct perf_sample_data *data,
@@ -7017,7 +7321,7 @@ void perf_output_sample(struct perf_output_handle *handle,
* sizeof(struct perf_branch_entry);
perf_output_put(handle, data->br_stack->nr);
- if (perf_sample_save_hw_index(event))
+ if (branch_sample_hw_index(event))
perf_output_put(handle, data->br_stack->hw_idx);
perf_output_copy(handle, data->br_stack->entries, size);
} else {
@@ -7115,7 +7419,6 @@ void perf_output_sample(struct perf_output_handle *handle,
static u64 perf_virt_to_phys(u64 virt)
{
u64 phys_addr = 0;
- struct page *p = NULL;
if (!virt)
return 0;
@@ -7134,14 +7437,15 @@ static u64 perf_virt_to_phys(u64 virt)
* If failed, leave phys_addr as 0.
*/
if (current->mm != NULL) {
+ struct page *p;
+
pagefault_disable();
- if (get_user_page_fast_only(virt, 0, &p))
+ if (get_user_page_fast_only(virt, 0, &p)) {
phys_addr = page_to_phys(p) + virt % PAGE_SIZE;
+ put_page(p);
+ }
pagefault_enable();
}
-
- if (p)
- put_page(p);
}
return phys_addr;
@@ -7186,7 +7490,7 @@ static u64 perf_get_pgtable_size(struct mm_struct *mm, unsigned long addr)
return pud_leaf_size(pud);
pmdp = pmd_offset_lockless(pudp, pud, addr);
- pmd = READ_ONCE(*pmdp);
+ pmd = pmdp_get_lockless(pmdp);
if (!pmd_present(pmd))
return 0;
@@ -7254,76 +7558,68 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs)
return callchain ?: &__empty_callchain;
}
-void perf_prepare_sample(struct perf_event_header *header,
- struct perf_sample_data *data,
+static __always_inline u64 __cond_set(u64 flags, u64 s, u64 d)
+{
+ return d * !!(flags & s);
+}
+
+void perf_prepare_sample(struct perf_sample_data *data,
struct perf_event *event,
struct pt_regs *regs)
{
u64 sample_type = event->attr.sample_type;
+ u64 filtered_sample_type;
- header->type = PERF_RECORD_SAMPLE;
- header->size = sizeof(*header) + event->header_size;
+ /*
+ * Add the sample flags that are dependent to others. And clear the
+ * sample flags that have already been done by the PMU driver.
+ */
+ filtered_sample_type = sample_type;
+ filtered_sample_type |= __cond_set(sample_type, PERF_SAMPLE_CODE_PAGE_SIZE,
+ PERF_SAMPLE_IP);
+ filtered_sample_type |= __cond_set(sample_type, PERF_SAMPLE_DATA_PAGE_SIZE |
+ PERF_SAMPLE_PHYS_ADDR, PERF_SAMPLE_ADDR);
+ filtered_sample_type |= __cond_set(sample_type, PERF_SAMPLE_STACK_USER,
+ PERF_SAMPLE_REGS_USER);
+ filtered_sample_type &= ~data->sample_flags;
- header->misc = 0;
- header->misc |= perf_misc_flags(regs);
+ if (filtered_sample_type == 0) {
+ /* Make sure it has the correct data->type for output */
+ data->type = event->attr.sample_type;
+ return;
+ }
- __perf_event_header__init_id(header, data, event);
+ __perf_event_header__init_id(data, event, filtered_sample_type);
- if (sample_type & (PERF_SAMPLE_IP | PERF_SAMPLE_CODE_PAGE_SIZE))
+ if (filtered_sample_type & PERF_SAMPLE_IP) {
data->ip = perf_instruction_pointer(regs);
-
- if (sample_type & PERF_SAMPLE_CALLCHAIN) {
- int size = 1;
-
- if (!(sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY))
- data->callchain = perf_callchain(event, regs);
-
- size += data->callchain->nr;
-
- header->size += size * sizeof(u64);
+ data->sample_flags |= PERF_SAMPLE_IP;
}
- if (sample_type & PERF_SAMPLE_RAW) {
- struct perf_raw_record *raw = data->raw;
- int size;
-
- if (raw) {
- struct perf_raw_frag *frag = &raw->frag;
- u32 sum = 0;
-
- do {
- sum += frag->size;
- if (perf_raw_frag_last(frag))
- break;
- frag = frag->next;
- } while (1);
-
- size = round_up(sum + sizeof(u32), sizeof(u64));
- raw->size = size - sizeof(u32);
- frag->pad = raw->size - sum;
- } else {
- size = sizeof(u64);
- }
+ if (filtered_sample_type & PERF_SAMPLE_CALLCHAIN)
+ perf_sample_save_callchain(data, event, regs);
- header->size += size;
+ if (filtered_sample_type & PERF_SAMPLE_RAW) {
+ data->raw = NULL;
+ data->dyn_size += sizeof(u64);
+ data->sample_flags |= PERF_SAMPLE_RAW;
}
- if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
- int size = sizeof(u64); /* nr */
- if (data->br_stack) {
- if (perf_sample_save_hw_index(event))
- size += sizeof(u64);
-
- size += data->br_stack->nr
- * sizeof(struct perf_branch_entry);
- }
- header->size += size;
+ if (filtered_sample_type & PERF_SAMPLE_BRANCH_STACK) {
+ data->br_stack = NULL;
+ data->dyn_size += sizeof(u64);
+ data->sample_flags |= PERF_SAMPLE_BRANCH_STACK;
}
- if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER))
+ if (filtered_sample_type & PERF_SAMPLE_REGS_USER)
perf_sample_regs_user(&data->regs_user, regs);
- if (sample_type & PERF_SAMPLE_REGS_USER) {
+ /*
+ * It cannot use the filtered_sample_type here as REGS_USER can be set
+ * by STACK_USER (using __cond_set() above) and we don't want to update
+ * the dyn_size if it's not requested by users.
+ */
+ if ((sample_type & ~data->sample_flags) & PERF_SAMPLE_REGS_USER) {
/* regs dump ABI info */
int size = sizeof(u64);
@@ -7332,10 +7628,11 @@ void perf_prepare_sample(struct perf_event_header *header,
size += hweight64(mask) * sizeof(u64);
}
- header->size += size;
+ data->dyn_size += size;
+ data->sample_flags |= PERF_SAMPLE_REGS_USER;
}
- if (sample_type & PERF_SAMPLE_STACK_USER) {
+ if (filtered_sample_type & PERF_SAMPLE_STACK_USER) {
/*
* Either we need PERF_SAMPLE_STACK_USER bit to be always
* processed as the last one or have additional check added
@@ -7343,9 +7640,10 @@ void perf_prepare_sample(struct perf_event_header *header,
* up the rest of the sample size.
*/
u16 stack_size = event->attr.sample_stack_user;
+ u16 header_size = perf_sample_data_size(data, event);
u16 size = sizeof(u64);
- stack_size = perf_sample_ustack_size(stack_size, header->size,
+ stack_size = perf_sample_ustack_size(stack_size, header_size,
data->regs_user.regs);
/*
@@ -7357,10 +7655,31 @@ void perf_prepare_sample(struct perf_event_header *header,
size += sizeof(u64) + stack_size;
data->stack_user_size = stack_size;
- header->size += size;
+ data->dyn_size += size;
+ data->sample_flags |= PERF_SAMPLE_STACK_USER;
}
- if (sample_type & PERF_SAMPLE_REGS_INTR) {
+ if (filtered_sample_type & PERF_SAMPLE_WEIGHT_TYPE) {
+ data->weight.full = 0;
+ data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE;
+ }
+
+ if (filtered_sample_type & PERF_SAMPLE_DATA_SRC) {
+ data->data_src.val = PERF_MEM_NA;
+ data->sample_flags |= PERF_SAMPLE_DATA_SRC;
+ }
+
+ if (filtered_sample_type & PERF_SAMPLE_TRANSACTION) {
+ data->txn = 0;
+ data->sample_flags |= PERF_SAMPLE_TRANSACTION;
+ }
+
+ if (filtered_sample_type & PERF_SAMPLE_ADDR) {
+ data->addr = 0;
+ data->sample_flags |= PERF_SAMPLE_ADDR;
+ }
+
+ if (filtered_sample_type & PERF_SAMPLE_REGS_INTR) {
/* regs dump ABI info */
int size = sizeof(u64);
@@ -7372,19 +7691,23 @@ void perf_prepare_sample(struct perf_event_header *header,
size += hweight64(mask) * sizeof(u64);
}
- header->size += size;
+ data->dyn_size += size;
+ data->sample_flags |= PERF_SAMPLE_REGS_INTR;
}
- if (sample_type & PERF_SAMPLE_PHYS_ADDR)
+ if (filtered_sample_type & PERF_SAMPLE_PHYS_ADDR) {
data->phys_addr = perf_virt_to_phys(data->addr);
+ data->sample_flags |= PERF_SAMPLE_PHYS_ADDR;
+ }
#ifdef CONFIG_CGROUP_PERF
- if (sample_type & PERF_SAMPLE_CGROUP) {
+ if (filtered_sample_type & PERF_SAMPLE_CGROUP) {
struct cgroup *cgrp;
/* protected by RCU */
cgrp = task_css_check(current, perf_event_cgrp_id, 1)->cgroup;
data->cgroup = cgroup_id(cgrp);
+ data->sample_flags |= PERF_SAMPLE_CGROUP;
}
#endif
@@ -7393,16 +7716,21 @@ void perf_prepare_sample(struct perf_event_header *header,
* require PERF_SAMPLE_ADDR, kernel implicitly retrieve the data->addr,
* but the value will not dump to the userspace.
*/
- if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE)
+ if (filtered_sample_type & PERF_SAMPLE_DATA_PAGE_SIZE) {
data->data_page_size = perf_get_page_size(data->addr);
+ data->sample_flags |= PERF_SAMPLE_DATA_PAGE_SIZE;
+ }
- if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE)
+ if (filtered_sample_type & PERF_SAMPLE_CODE_PAGE_SIZE) {
data->code_page_size = perf_get_page_size(data->ip);
+ data->sample_flags |= PERF_SAMPLE_CODE_PAGE_SIZE;
+ }
- if (sample_type & PERF_SAMPLE_AUX) {
+ if (filtered_sample_type & PERF_SAMPLE_AUX) {
u64 size;
+ u16 header_size = perf_sample_data_size(data, event);
- header->size += sizeof(u64); /* size */
+ header_size += sizeof(u64); /* size */
/*
* Given the 16bit nature of header::size, an AUX sample can
@@ -7410,14 +7738,26 @@ void perf_prepare_sample(struct perf_event_header *header,
* Make sure this doesn't happen by using up to U16_MAX bytes
* per sample in total (rounded down to 8 byte boundary).
*/
- size = min_t(size_t, U16_MAX - header->size,
+ size = min_t(size_t, U16_MAX - header_size,
event->attr.aux_sample_size);
size = rounddown(size, 8);
size = perf_prepare_sample_aux(event, data, size);
- WARN_ON_ONCE(size + header->size > U16_MAX);
- header->size += size;
+ WARN_ON_ONCE(size + header_size > U16_MAX);
+ data->dyn_size += size + sizeof(u64); /* size above */
+ data->sample_flags |= PERF_SAMPLE_AUX;
}
+}
+
+void perf_prepare_header(struct perf_event_header *header,
+ struct perf_sample_data *data,
+ struct perf_event *event,
+ struct pt_regs *regs)
+{
+ header->type = PERF_RECORD_SAMPLE;
+ header->size = perf_sample_data_size(data, event);
+ header->misc = perf_misc_flags(regs);
+
/*
* If you're adding more sample types here, you likely need to do
* something about the overflowing header::size, like repurpose the
@@ -7445,7 +7785,8 @@ __perf_event_output(struct perf_event *event,
/* protect the callchain buffers */
rcu_read_lock();
- perf_prepare_sample(&header, data, event, regs);
+ perf_prepare_sample(data, event, regs);
+ perf_prepare_header(&header, data, event, regs);
err = output_begin(&handle, data, event, header.size);
if (err)
@@ -7578,7 +7919,6 @@ perf_iterate_sb(perf_iterate_f output, void *data,
struct perf_event_context *task_ctx)
{
struct perf_event_context *ctx;
- int ctxn;
rcu_read_lock();
preempt_disable();
@@ -7595,11 +7935,9 @@ perf_iterate_sb(perf_iterate_f output, void *data,
perf_iterate_sb_cpu(output, data);
- for_each_task_context_nr(ctxn) {
- ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
- if (ctx)
- perf_iterate_ctx(ctx, output, data, false);
- }
+ ctx = rcu_dereference(current->perf_event_ctxp);
+ if (ctx)
+ perf_iterate_ctx(ctx, output, data, false);
done:
preempt_enable();
rcu_read_unlock();
@@ -7641,20 +7979,17 @@ static void perf_event_addr_filters_exec(struct perf_event *event, void *data)
void perf_event_exec(void)
{
struct perf_event_context *ctx;
- int ctxn;
- for_each_task_context_nr(ctxn) {
- perf_event_enable_on_exec(ctxn);
- perf_event_remove_on_exec(ctxn);
+ ctx = perf_pin_task_context(current);
+ if (!ctx)
+ return;
- rcu_read_lock();
- ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
- if (ctx) {
- perf_iterate_ctx(ctx, perf_event_addr_filters_exec,
- NULL, true);
- }
- rcu_read_unlock();
- }
+ perf_event_enable_on_exec(ctx);
+ perf_event_remove_on_exec(ctx);
+ perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL, true);
+
+ perf_unpin_context(ctx);
+ put_ctx(ctx);
}
struct remote_output {
@@ -7694,8 +8029,7 @@ static void __perf_event_output_stop(struct perf_event *event, void *data)
static int __perf_pmu_output_stop(void *info)
{
struct perf_event *event = info;
- struct pmu *pmu = event->ctx->pmu;
- struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
struct remote_output ro = {
.rb = event->rb,
};
@@ -8307,8 +8641,6 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
else
flags = MAP_PRIVATE;
- if (vma->vm_flags & VM_DENYWRITE)
- flags |= MAP_DENYWRITE;
if (vma->vm_flags & VM_LOCKED)
flags |= MAP_LOCKED;
if (is_vm_hugetlb_page(vma))
@@ -8486,7 +8818,6 @@ static void __perf_addr_filters_adjust(struct perf_event *event, void *data)
static void perf_addr_filters_adjust(struct vm_area_struct *vma)
{
struct perf_event_context *ctx;
- int ctxn;
/*
* Data tracing isn't supported yet and as such there is no need
@@ -8496,13 +8827,9 @@ static void perf_addr_filters_adjust(struct vm_area_struct *vma)
return;
rcu_read_lock();
- for_each_task_context_nr(ctxn) {
- ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
- if (!ctx)
- continue;
-
+ ctx = rcu_dereference(current->perf_event_ctxp);
+ if (ctx)
perf_iterate_ctx(ctx, __perf_addr_filters_adjust, vma, true);
- }
rcu_read_unlock();
}
@@ -8860,7 +9187,7 @@ static void perf_event_bpf_output(struct perf_event *event, void *data)
perf_event_header__init_id(&bpf_event->event_id.header,
&sample, event);
- ret = perf_output_begin(&handle, data, event,
+ ret = perf_output_begin(&handle, &sample, event,
bpf_event->event_id.header.size);
if (ret)
return;
@@ -8890,7 +9217,7 @@ static void perf_event_bpf_emit_ksymbols(struct bpf_prog *prog,
PERF_RECORD_KSYMBOL_TYPE_BPF,
(u64)(unsigned long)subprog->bpf_func,
subprog->jited_len, unregister,
- prog->aux->ksym.name);
+ subprog->aux->ksym.name);
}
}
}
@@ -9062,6 +9389,37 @@ static void perf_log_itrace_start(struct perf_event *event)
perf_output_end(&handle);
}
+void perf_report_aux_output_id(struct perf_event *event, u64 hw_id)
+{
+ struct perf_output_handle handle;
+ struct perf_sample_data sample;
+ struct perf_aux_event {
+ struct perf_event_header header;
+ u64 hw_id;
+ } rec;
+ int ret;
+
+ if (event->parent)
+ event = event->parent;
+
+ rec.header.type = PERF_RECORD_AUX_OUTPUT_HW_ID;
+ rec.header.misc = 0;
+ rec.header.size = sizeof(rec);
+ rec.hw_id = hw_id;
+
+ perf_event_header__init_id(&rec.header, &sample, event);
+ ret = perf_output_begin(&handle, &sample, event, rec.header.size);
+
+ if (ret)
+ return;
+
+ perf_output_put(&handle, rec);
+ perf_event__output_id_sample(event, &handle, &sample);
+
+ perf_output_end(&handle);
+}
+EXPORT_SYMBOL_GPL(perf_report_aux_output_id);
+
static int
__perf_event_account_interrupt(struct perf_event *event, int throttle)
{
@@ -9075,8 +9433,8 @@ __perf_event_account_interrupt(struct perf_event *event, int throttle)
hwc->interrupts = 1;
} else {
hwc->interrupts++;
- if (unlikely(throttle
- && hwc->interrupts >= max_samples_per_tick)) {
+ if (unlikely(throttle &&
+ hwc->interrupts > max_samples_per_tick)) {
__this_cpu_inc(perf_throttled_count);
tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
hwc->interrupts = MAX_INTERRUPTS;
@@ -9103,13 +9461,26 @@ int perf_event_account_interrupt(struct perf_event *event)
return __perf_event_account_interrupt(event, 1);
}
+static inline bool sample_is_allowed(struct perf_event *event, struct pt_regs *regs)
+{
+ /*
+ * Due to interrupt latency (AKA "skid"), we may enter the
+ * kernel before taking an overflow, even if the PMU is only
+ * counting user events.
+ */
+ if (event->attr.exclude_kernel && !user_mode(regs))
+ return false;
+
+ return true;
+}
+
/*
* Generic event overflow handling, sampling.
*/
static int __perf_event_overflow(struct perf_event *event,
- int throttle, struct perf_sample_data *data,
- struct pt_regs *regs)
+ int throttle, struct perf_sample_data *data,
+ struct pt_regs *regs)
{
int events = atomic_read(&event->event_limit);
int ret = 0;
@@ -9132,24 +9503,59 @@ static int __perf_event_overflow(struct perf_event *event,
if (events && atomic_dec_and_test(&event->event_limit)) {
ret = 1;
event->pending_kill = POLL_HUP;
- event->pending_addr = data->addr;
-
perf_event_disable_inatomic(event);
}
+ if (event->attr.sigtrap) {
+ /*
+ * The desired behaviour of sigtrap vs invalid samples is a bit
+ * tricky; on the one hand, one should not loose the SIGTRAP if
+ * it is the first event, on the other hand, we should also not
+ * trigger the WARN or override the data address.
+ */
+ bool valid_sample = sample_is_allowed(event, regs);
+ unsigned int pending_id = 1;
+
+ if (regs)
+ pending_id = hash32_ptr((void *)instruction_pointer(regs)) ?: 1;
+ if (!event->pending_sigtrap) {
+ event->pending_sigtrap = pending_id;
+ local_inc(&event->ctx->nr_pending);
+ } else if (event->attr.exclude_kernel && valid_sample) {
+ /*
+ * Should not be able to return to user space without
+ * consuming pending_sigtrap; with exceptions:
+ *
+ * 1. Where !exclude_kernel, events can overflow again
+ * in the kernel without returning to user space.
+ *
+ * 2. Events that can overflow again before the IRQ-
+ * work without user space progress (e.g. hrtimer).
+ * To approximate progress (with false negatives),
+ * check 32-bit hash of the current IP.
+ */
+ WARN_ON_ONCE(event->pending_sigtrap != pending_id);
+ }
+
+ event->pending_addr = 0;
+ if (valid_sample && (data->sample_flags & PERF_SAMPLE_ADDR))
+ event->pending_addr = data->addr;
+ irq_work_queue(&event->pending_irq);
+ }
+
READ_ONCE(event->overflow_handler)(event, data, regs);
if (*perf_event_fasync(event) && event->pending_kill) {
event->pending_wakeup = 1;
- irq_work_queue(&event->pending);
+ irq_work_queue(&event->pending_irq);
}
return ret;
}
int perf_event_overflow(struct perf_event *event,
- struct perf_sample_data *data,
- struct pt_regs *regs)
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
{
return __perf_event_overflow(event, 1, data, regs);
}
@@ -9599,6 +10005,44 @@ static struct pmu perf_swevent = {
#ifdef CONFIG_EVENT_TRACING
+static void tp_perf_event_destroy(struct perf_event *event)
+{
+ perf_trace_destroy(event);
+}
+
+static int perf_tp_event_init(struct perf_event *event)
+{
+ int err;
+
+ if (event->attr.type != PERF_TYPE_TRACEPOINT)
+ return -ENOENT;
+
+ /*
+ * no branch sampling for tracepoint events
+ */
+ if (has_branch_stack(event))
+ return -EOPNOTSUPP;
+
+ err = perf_trace_init(event);
+ if (err)
+ return err;
+
+ event->destroy = tp_perf_event_destroy;
+
+ return 0;
+}
+
+static struct pmu perf_tracepoint = {
+ .task_ctx_nr = perf_sw_context,
+
+ .event_init = perf_tp_event_init,
+ .add = perf_trace_add,
+ .del = perf_trace_del,
+ .start = perf_swevent_start,
+ .stop = perf_swevent_stop,
+ .read = perf_swevent_read,
+};
+
static int perf_tp_filter_match(struct perf_event *event,
struct perf_sample_data *data)
{
@@ -9648,6 +10092,44 @@ void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx,
}
EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit);
+static void __perf_tp_event_target_task(u64 count, void *record,
+ struct pt_regs *regs,
+ struct perf_sample_data *data,
+ struct perf_event *event)
+{
+ struct trace_entry *entry = record;
+
+ if (event->attr.config != entry->type)
+ return;
+ /* Cannot deliver synchronous signal to other task. */
+ if (event->attr.sigtrap)
+ return;
+ if (perf_tp_event_match(event, data, regs))
+ perf_swevent_event(event, count, data, regs);
+}
+
+static void perf_tp_event_target_task(u64 count, void *record,
+ struct pt_regs *regs,
+ struct perf_sample_data *data,
+ struct perf_event_context *ctx)
+{
+ unsigned int cpu = smp_processor_id();
+ struct pmu *pmu = &perf_tracepoint;
+ struct perf_event *event, *sibling;
+
+ perf_event_groups_for_cpu_pmu(event, &ctx->pinned_groups, cpu, pmu) {
+ __perf_tp_event_target_task(count, record, regs, data, event);
+ for_each_sibling_event(sibling, event)
+ __perf_tp_event_target_task(count, record, regs, data, sibling);
+ }
+
+ perf_event_groups_for_cpu_pmu(event, &ctx->flexible_groups, cpu, pmu) {
+ __perf_tp_event_target_task(count, record, regs, data, event);
+ for_each_sibling_event(sibling, event)
+ __perf_tp_event_target_task(count, record, regs, data, sibling);
+ }
+}
+
void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
struct pt_regs *regs, struct hlist_head *head, int rctx,
struct task_struct *task)
@@ -9663,13 +10145,25 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
};
perf_sample_data_init(&data, 0, 0);
- data.raw = &raw;
+ perf_sample_save_raw_data(&data, &raw);
perf_trace_buf_update(record, event_type);
hlist_for_each_entry_rcu(event, head, hlist_entry) {
- if (perf_tp_event_match(event, &data, regs))
+ if (perf_tp_event_match(event, &data, regs)) {
perf_swevent_event(event, count, &data, regs);
+
+ /*
+ * Here use the same on-stack perf_sample_data,
+ * some members in data are event-specific and
+ * need to be re-computed for different sweveents.
+ * Re-initialize data->sample_flags safely to avoid
+ * the problem that next event skips preparing data
+ * because data->sample_flags is set.
+ */
+ perf_sample_data_init(&data, 0, 0);
+ perf_sample_save_raw_data(&data, &raw);
+ }
}
/*
@@ -9678,23 +10172,15 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
*/
if (task && task != current) {
struct perf_event_context *ctx;
- struct trace_entry *entry = record;
rcu_read_lock();
- ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]);
+ ctx = rcu_dereference(task->perf_event_ctxp);
if (!ctx)
goto unlock;
- list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
- if (event->cpu != smp_processor_id())
- continue;
- if (event->attr.type != PERF_TYPE_TRACEPOINT)
- continue;
- if (event->attr.config != entry->type)
- continue;
- if (perf_tp_event_match(event, &data, regs))
- perf_swevent_event(event, count, &data, regs);
- }
+ raw_spin_lock(&ctx->lock);
+ perf_tp_event_target_task(count, record, regs, &data, ctx);
+ raw_spin_unlock(&ctx->lock);
unlock:
rcu_read_unlock();
}
@@ -9703,44 +10189,6 @@ unlock:
}
EXPORT_SYMBOL_GPL(perf_tp_event);
-static void tp_perf_event_destroy(struct perf_event *event)
-{
- perf_trace_destroy(event);
-}
-
-static int perf_tp_event_init(struct perf_event *event)
-{
- int err;
-
- if (event->attr.type != PERF_TYPE_TRACEPOINT)
- return -ENOENT;
-
- /*
- * no branch sampling for tracepoint events
- */
- if (has_branch_stack(event))
- return -EOPNOTSUPP;
-
- err = perf_trace_init(event);
- if (err)
- return err;
-
- event->destroy = tp_perf_event_destroy;
-
- return 0;
-}
-
-static struct pmu perf_tracepoint = {
- .task_ctx_nr = perf_sw_context,
-
- .event_init = perf_tp_event_init,
- .add = perf_trace_add,
- .del = perf_trace_del,
- .start = perf_swevent_start,
- .stop = perf_swevent_stop,
- .read = perf_swevent_read,
-};
-
#if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS)
/*
* Flags in config, used by dynamic PMU kprobe and uprobe
@@ -9907,13 +10355,18 @@ static void bpf_overflow_handler(struct perf_event *event,
.data = data,
.event = event,
};
+ struct bpf_prog *prog;
int ret = 0;
ctx.regs = perf_arch_bpf_user_pt_regs(regs);
if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1))
goto out;
rcu_read_lock();
- ret = BPF_PROG_RUN(event->prog, &ctx);
+ prog = READ_ONCE(event->prog);
+ if (prog) {
+ perf_prepare_sample(data, event, regs);
+ ret = bpf_prog_run(prog, &ctx);
+ }
rcu_read_unlock();
out:
__this_cpu_dec(bpf_prog_active);
@@ -9923,10 +10376,10 @@ out:
event->orig_overflow_handler(event, data, regs);
}
-static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd)
+static int perf_event_set_bpf_handler(struct perf_event *event,
+ struct bpf_prog *prog,
+ u64 bpf_cookie)
{
- struct bpf_prog *prog;
-
if (event->overflow_handler_context)
/* hw breakpoint or kernel counter */
return -EINVAL;
@@ -9934,13 +10387,12 @@ static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd)
if (event->prog)
return -EEXIST;
- prog = bpf_prog_get_type(prog_fd, BPF_PROG_TYPE_PERF_EVENT);
- if (IS_ERR(prog))
- return PTR_ERR(prog);
+ if (prog->type != BPF_PROG_TYPE_PERF_EVENT)
+ return -EINVAL;
if (event->attr.precise_ip &&
prog->call_get_stack &&
- (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY) ||
+ (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) ||
event->attr.exclude_callchain_kernel ||
event->attr.exclude_callchain_user)) {
/*
@@ -9952,11 +10404,11 @@ static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd)
* attached to perf_sample_data, do not allow attaching BPF
* program that calls bpf_get_[stack|stackid].
*/
- bpf_prog_put(prog);
return -EPROTO;
}
event->prog = prog;
+ event->bpf_cookie = bpf_cookie;
event->orig_overflow_handler = READ_ONCE(event->overflow_handler);
WRITE_ONCE(event->overflow_handler, bpf_overflow_handler);
return 0;
@@ -9974,7 +10426,9 @@ static void perf_event_free_bpf_handler(struct perf_event *event)
bpf_prog_put(prog);
}
#else
-static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd)
+static int perf_event_set_bpf_handler(struct perf_event *event,
+ struct bpf_prog *prog,
+ u64 bpf_cookie)
{
return -EOPNOTSUPP;
}
@@ -10002,57 +10456,46 @@ static inline bool perf_event_is_tracing(struct perf_event *event)
return false;
}
-static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
+int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog,
+ u64 bpf_cookie)
{
- bool is_kprobe, is_tracepoint, is_syscall_tp;
- struct bpf_prog *prog;
- int ret;
+ bool is_kprobe, is_uprobe, is_tracepoint, is_syscall_tp;
if (!perf_event_is_tracing(event))
- return perf_event_set_bpf_handler(event, prog_fd);
+ return perf_event_set_bpf_handler(event, prog, bpf_cookie);
- is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE;
+ is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_KPROBE;
+ is_uprobe = event->tp_event->flags & TRACE_EVENT_FL_UPROBE;
is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT;
is_syscall_tp = is_syscall_trace_event(event->tp_event);
- if (!is_kprobe && !is_tracepoint && !is_syscall_tp)
+ if (!is_kprobe && !is_uprobe && !is_tracepoint && !is_syscall_tp)
/* bpf programs can only be attached to u/kprobe or tracepoint */
return -EINVAL;
- prog = bpf_prog_get(prog_fd);
- if (IS_ERR(prog))
- return PTR_ERR(prog);
-
- if ((is_kprobe && prog->type != BPF_PROG_TYPE_KPROBE) ||
+ if (((is_kprobe || is_uprobe) && prog->type != BPF_PROG_TYPE_KPROBE) ||
(is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT) ||
- (is_syscall_tp && prog->type != BPF_PROG_TYPE_TRACEPOINT)) {
- /* valid fd, but invalid bpf program type */
- bpf_prog_put(prog);
+ (is_syscall_tp && prog->type != BPF_PROG_TYPE_TRACEPOINT))
+ return -EINVAL;
+
+ if (prog->type == BPF_PROG_TYPE_KPROBE && prog->aux->sleepable && !is_uprobe)
+ /* only uprobe programs are allowed to be sleepable */
return -EINVAL;
- }
/* Kprobe override only works for kprobes, not uprobes. */
- if (prog->kprobe_override &&
- !(event->tp_event->flags & TRACE_EVENT_FL_KPROBE)) {
- bpf_prog_put(prog);
+ if (prog->kprobe_override && !is_kprobe)
return -EINVAL;
- }
if (is_tracepoint || is_syscall_tp) {
int off = trace_event_get_offsets(event->tp_event);
- if (prog->aux->max_ctx_offset > off) {
- bpf_prog_put(prog);
+ if (prog->aux->max_ctx_offset > off)
return -EACCES;
- }
}
- ret = perf_event_attach_bpf_prog(event, prog);
- if (ret)
- bpf_prog_put(prog);
- return ret;
+ return perf_event_attach_bpf_prog(event, prog, bpf_cookie);
}
-static void perf_event_free_bpf_prog(struct perf_event *event)
+void perf_event_free_bpf_prog(struct perf_event *event)
{
if (!perf_event_is_tracing(event)) {
perf_event_free_bpf_handler(event);
@@ -10071,12 +10514,13 @@ static void perf_event_free_filter(struct perf_event *event)
{
}
-static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
+int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog,
+ u64 bpf_cookie)
{
return -ENOENT;
}
-static void perf_event_free_bpf_prog(struct perf_event *event)
+void perf_event_free_bpf_prog(struct perf_event *event)
{
}
#endif /* CONFIG_EVENT_TRACING */
@@ -10161,8 +10605,9 @@ static void perf_addr_filter_apply(struct perf_addr_filter *filter,
struct perf_addr_filter_range *fr)
{
struct vm_area_struct *vma;
+ VMA_ITERATOR(vmi, mm, 0);
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ for_each_vma(vmi, vma) {
if (!vma->vm_file)
continue;
@@ -10192,7 +10637,7 @@ static void perf_event_addr_filters_apply(struct perf_event *event)
return;
if (ifh->nr_file_filters) {
- mm = get_task_mm(event->ctx->task);
+ mm = get_task_mm(task);
if (!mm)
goto restart;
@@ -10373,8 +10818,6 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
*/
if (state == IF_STATE_END) {
ret = -EINVAL;
- if (kernel && event->attr.exclude_kernel)
- goto fail;
/*
* ACTION "filter" must have a non-zero length region
@@ -10416,8 +10859,11 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
}
/* ready to consume more filters */
+ kfree(filename);
+ filename = NULL;
state = IF_STATE_ACTION;
filter = NULL;
+ kernel = 0;
}
}
@@ -10823,36 +11269,9 @@ static int perf_event_idx_default(struct perf_event *event)
return 0;
}
-/*
- * Ensures all contexts with the same task_ctx_nr have the same
- * pmu_cpu_context too.
- */
-static struct perf_cpu_context __percpu *find_pmu_context(int ctxn)
-{
- struct pmu *pmu;
-
- if (ctxn < 0)
- return NULL;
-
- list_for_each_entry(pmu, &pmus, entry) {
- if (pmu->task_ctx_nr == ctxn)
- return pmu->pmu_cpu_context;
- }
-
- return NULL;
-}
-
static void free_pmu_context(struct pmu *pmu)
{
- /*
- * Static contexts such as perf_sw_context have a global lifetime
- * and may be shared between different PMUs. Avoid freeing them
- * when a single PMU is going away.
- */
- if (pmu->task_ctx_nr > perf_invalid_context)
- return;
-
- free_percpu(pmu->pmu_cpu_context);
+ free_percpu(pmu->cpu_pmu_context);
}
/*
@@ -10864,7 +11283,7 @@ static ssize_t nr_addr_filters_show(struct device *dev,
{
struct pmu *pmu = dev_get_drvdata(dev);
- return snprintf(page, PAGE_SIZE - 1, "%d\n", pmu->nr_addr_filters);
+ return scnprintf(page, PAGE_SIZE - 1, "%d\n", pmu->nr_addr_filters);
}
DEVICE_ATTR_RO(nr_addr_filters);
@@ -10875,7 +11294,7 @@ type_show(struct device *dev, struct device_attribute *attr, char *page)
{
struct pmu *pmu = dev_get_drvdata(dev);
- return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
+ return scnprintf(page, PAGE_SIZE - 1, "%d\n", pmu->type);
}
static DEVICE_ATTR_RO(type);
@@ -10886,7 +11305,7 @@ perf_event_mux_interval_ms_show(struct device *dev,
{
struct pmu *pmu = dev_get_drvdata(dev);
- return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms);
+ return scnprintf(page, PAGE_SIZE - 1, "%d\n", pmu->hrtimer_interval_ms);
}
static DEFINE_MUTEX(mux_interval_mutex);
@@ -10916,12 +11335,11 @@ perf_event_mux_interval_ms_store(struct device *dev,
/* update all cpuctx for this PMU */
cpus_read_lock();
for_each_online_cpu(cpu) {
- struct perf_cpu_context *cpuctx;
- cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
- cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
+ struct perf_cpu_pmu_context *cpc;
+ cpc = per_cpu_ptr(pmu->cpu_pmu_context, cpu);
+ cpc->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
- cpu_function_call(cpu,
- (remote_function_f)perf_mux_hrtimer_restart, cpuctx);
+ cpu_function_call(cpu, perf_mux_hrtimer_restart_ipi, cpc);
}
cpus_read_unlock();
mutex_unlock(&mux_interval_mutex);
@@ -10958,13 +11376,15 @@ static int pmu_dev_alloc(struct pmu *pmu)
pmu->dev->groups = pmu->attr_groups;
device_initialize(pmu->dev);
- ret = dev_set_name(pmu->dev, "%s", pmu->name);
- if (ret)
- goto free_dev;
dev_set_drvdata(pmu->dev, pmu);
pmu->dev->bus = &pmu_bus;
pmu->dev->release = pmu_dev_release;
+
+ ret = dev_set_name(pmu->dev, "%s", pmu->name);
+ if (ret)
+ goto free_dev;
+
ret = device_add(pmu->dev);
if (ret)
goto free_dev;
@@ -11032,47 +11452,19 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type)
}
skip_type:
- if (pmu->task_ctx_nr == perf_hw_context) {
- static int hw_context_taken = 0;
-
- /*
- * Other than systems with heterogeneous CPUs, it never makes
- * sense for two PMUs to share perf_hw_context. PMUs which are
- * uncore must use perf_invalid_context.
- */
- if (WARN_ON_ONCE(hw_context_taken &&
- !(pmu->capabilities & PERF_PMU_CAP_HETEROGENEOUS_CPUS)))
- pmu->task_ctx_nr = perf_invalid_context;
-
- hw_context_taken = 1;
- }
-
- pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
- if (pmu->pmu_cpu_context)
- goto got_cpu_context;
-
ret = -ENOMEM;
- pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
- if (!pmu->pmu_cpu_context)
+ pmu->cpu_pmu_context = alloc_percpu(struct perf_cpu_pmu_context);
+ if (!pmu->cpu_pmu_context)
goto free_dev;
for_each_possible_cpu(cpu) {
- struct perf_cpu_context *cpuctx;
-
- cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
- __perf_event_init_context(&cpuctx->ctx);
- lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
- lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
- cpuctx->ctx.pmu = pmu;
- cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask);
-
- __perf_mux_hrtimer_init(cpuctx, cpu);
+ struct perf_cpu_pmu_context *cpc;
- cpuctx->heap_size = ARRAY_SIZE(cpuctx->heap_default);
- cpuctx->heap = cpuctx->heap_default;
+ cpc = per_cpu_ptr(pmu->cpu_pmu_context, cpu);
+ __perf_init_event_pmu_context(&cpc->epc, pmu);
+ __perf_mux_hrtimer_init(cpc, cpu);
}
-got_cpu_context:
if (!pmu->start_txn) {
if (pmu->pmu_enable) {
/*
@@ -11305,15 +11697,6 @@ static void account_pmu_sb_event(struct perf_event *event)
attach_sb_event(event);
}
-static void account_event_cpu(struct perf_event *event, int cpu)
-{
- if (event->parent)
- return;
-
- if (is_cgroup_event(event))
- atomic_inc(&per_cpu(perf_cgroup_events, cpu));
-}
-
/* Freq events need the tick to stay alive (see perf_event_task_tick). */
static void account_freq_event_nohz(void)
{
@@ -11401,8 +11784,6 @@ static void account_event(struct perf_event *event)
}
enabled:
- account_event_cpu(event, event->cpu);
-
account_pmu_sb_event(event);
}
@@ -11459,8 +11840,8 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
init_waitqueue_head(&event->waitq);
- event->pending_disable = -1;
- init_irq_work(&event->pending, perf_pending_event);
+ init_irq_work(&event->pending_irq, perf_pending_irq);
+ init_task_work(&event->pending_task, perf_pending_task);
mutex_init(&event->mmap_mutex);
raw_spin_lock_init(&event->addr_filters.lock);
@@ -11479,8 +11860,8 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
event->state = PERF_EVENT_STATE_INACTIVE;
- if (event->attr.sigtrap)
- atomic_set(&event->event_limit, 1);
+ if (parent_event)
+ event->event_caps = parent_event->event_caps;
if (task) {
event->attach_state = PERF_ATTACH_TASK;
@@ -11551,10 +11932,11 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
}
/*
- * Disallow uncore-cgroup events, they don't make sense as the cgroup will
- * be different on other CPUs in the uncore mask.
+ * Disallow uncore-task events. Similarly, disallow uncore-cgroup
+ * events (they don't make sense as the cgroup will be different
+ * on other CPUs in the uncore mask).
*/
- if (pmu->task_ctx_nr == perf_invalid_context && cgroup_fd != -1) {
+ if (pmu->task_ctx_nr == perf_invalid_context && (task || cgroup_fd != -1)) {
err = -EINVAL;
goto err_pmu;
}
@@ -11637,11 +12019,9 @@ err_pmu:
event->destroy(event);
module_put(pmu->module);
err_ns:
- if (event->ns)
- put_pid_ns(event->ns);
if (event->hw.target)
put_task_struct(event->hw.target);
- kmem_cache_free(perf_event_cache, event);
+ call_rcu(&event->rcu_head, free_event_rcu);
return ERR_PTR(err);
}
@@ -11772,14 +12152,25 @@ err_size:
goto out;
}
+static void mutex_lock_double(struct mutex *a, struct mutex *b)
+{
+ if (b < a)
+ swap(a, b);
+
+ mutex_lock(a);
+ mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
+}
+
static int
perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
{
struct perf_buffer *rb = NULL;
int ret = -EINVAL;
- if (!output_event)
+ if (!output_event) {
+ mutex_lock(&event->mmap_mutex);
goto set;
+ }
/* don't allow circular references */
if (event == output_event)
@@ -11794,7 +12185,7 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
/*
* If its not a per-cpu rb, it must be the same task.
*/
- if (output_event->cpu == -1 && output_event->ctx != event->ctx)
+ if (output_event->cpu == -1 && output_event->hw.target != event->hw.target)
goto out;
/*
@@ -11817,8 +12208,15 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
event->pmu != output_event->pmu)
goto out;
+ /*
+ * Hold both mmap_mutex to serialize against perf_mmap_close(). Since
+ * output_event is already on rb->event_list, and the list iteration
+ * restarts after every removal, it is guaranteed this new event is
+ * observed *OR* if output_event is already removed, it's guaranteed we
+ * observe !rb->mmap_count.
+ */
+ mutex_lock_double(&event->mmap_mutex, &output_event->mmap_mutex);
set:
- mutex_lock(&event->mmap_mutex);
/* Can't redirect output if we've got an active mmap() */
if (atomic_read(&event->mmap_count))
goto unlock;
@@ -11828,6 +12226,12 @@ set:
rb = ring_buffer_get(output_event);
if (!rb)
goto unlock;
+
+ /* did we race against perf_mmap_close() */
+ if (!atomic_read(&rb->mmap_count)) {
+ ring_buffer_put(rb);
+ goto unlock;
+ }
}
ring_buffer_attach(event, rb);
@@ -11835,20 +12239,13 @@ set:
ret = 0;
unlock:
mutex_unlock(&event->mmap_mutex);
+ if (output_event)
+ mutex_unlock(&output_event->mmap_mutex);
out:
return ret;
}
-static void mutex_lock_double(struct mutex *a, struct mutex *b)
-{
- if (b < a)
- swap(a, b);
-
- mutex_lock(a);
- mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
-}
-
static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
{
bool nmi_safe = false;
@@ -11886,35 +12283,35 @@ static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
return 0;
}
-/*
- * Variation on perf_event_ctx_lock_nested(), except we take two context
- * mutexes.
- */
-static struct perf_event_context *
-__perf_event_ctx_lock_double(struct perf_event *group_leader,
- struct perf_event_context *ctx)
+static bool
+perf_check_permission(struct perf_event_attr *attr, struct task_struct *task)
{
- struct perf_event_context *gctx;
+ unsigned int ptrace_mode = PTRACE_MODE_READ_REALCREDS;
+ bool is_capable = perfmon_capable();
-again:
- rcu_read_lock();
- gctx = READ_ONCE(group_leader->ctx);
- if (!refcount_inc_not_zero(&gctx->refcount)) {
+ if (attr->sigtrap) {
+ /*
+ * perf_event_attr::sigtrap sends signals to the other task.
+ * Require the current task to also have CAP_KILL.
+ */
+ rcu_read_lock();
+ is_capable &= ns_capable(__task_cred(task)->user_ns, CAP_KILL);
rcu_read_unlock();
- goto again;
- }
- rcu_read_unlock();
- mutex_lock_double(&gctx->mutex, &ctx->mutex);
-
- if (group_leader->ctx != gctx) {
- mutex_unlock(&ctx->mutex);
- mutex_unlock(&gctx->mutex);
- put_ctx(gctx);
- goto again;
+ /*
+ * If the required capabilities aren't available, checks for
+ * ptrace permissions: upgrade to ATTACH, since sending signals
+ * can effectively change the target task.
+ */
+ ptrace_mode = PTRACE_MODE_ATTACH_REALCREDS;
}
- return gctx;
+ /*
+ * Preserve ptrace permission check for backwards compatibility. The
+ * ptrace check also includes checks that the current task and other
+ * task have matching uids, and is therefore not done here explicitly.
+ */
+ return is_capable || ptrace_may_access(task, ptrace_mode);
}
/**
@@ -11931,9 +12328,10 @@ SYSCALL_DEFINE5(perf_event_open,
pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
{
struct perf_event *group_leader = NULL, *output_event = NULL;
+ struct perf_event_pmu_context *pmu_ctx;
struct perf_event *event, *sibling;
struct perf_event_attr attr;
- struct perf_event_context *ctx, *gctx;
+ struct perf_event_context *ctx;
struct file *event_file = NULL;
struct fd group = {NULL, 0};
struct task_struct *task = NULL;
@@ -11948,12 +12346,12 @@ SYSCALL_DEFINE5(perf_event_open,
if (flags & ~PERF_FLAG_ALL)
return -EINVAL;
- /* Do we allow access to perf_event_open(2) ? */
- err = security_perf_event_open(&attr, PERF_SECURITY_OPEN);
+ err = perf_copy_attr(attr_uptr, &attr);
if (err)
return err;
- err = perf_copy_attr(attr_uptr, &attr);
+ /* Do we allow access to perf_event_open(2) ? */
+ err = security_perf_event_open(&attr, PERF_SECURITY_OPEN);
if (err)
return err;
@@ -12063,42 +12461,53 @@ SYSCALL_DEFINE5(perf_event_open,
if (pmu->task_ctx_nr == perf_sw_context)
event->event_caps |= PERF_EV_CAP_SOFTWARE;
- if (group_leader) {
- if (is_software_event(event) &&
- !in_software_context(group_leader)) {
- /*
- * If the event is a sw event, but the group_leader
- * is on hw context.
- *
- * Allow the addition of software events to hw
- * groups, this is safe because software events
- * never fail to schedule.
- */
- pmu = group_leader->ctx->pmu;
- } else if (!is_software_event(event) &&
- is_software_event(group_leader) &&
- (group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
- /*
- * In case the group is a pure software group, and we
- * try to add a hardware event, move the whole group to
- * the hardware context.
- */
- move_group = 1;
- }
+ if (task) {
+ err = down_read_interruptible(&task->signal->exec_update_lock);
+ if (err)
+ goto err_alloc;
+
+ /*
+ * We must hold exec_update_lock across this and any potential
+ * perf_install_in_context() call for this new event to
+ * serialize against exec() altering our credentials (and the
+ * perf_event_exit_task() that could imply).
+ */
+ err = -EACCES;
+ if (!perf_check_permission(&attr, task))
+ goto err_cred;
}
/*
* Get the target context (task or percpu):
*/
- ctx = find_get_context(pmu, task, event);
+ ctx = find_get_context(task, event);
if (IS_ERR(ctx)) {
err = PTR_ERR(ctx);
- goto err_alloc;
+ goto err_cred;
+ }
+
+ mutex_lock(&ctx->mutex);
+
+ if (ctx->task == TASK_TOMBSTONE) {
+ err = -ESRCH;
+ goto err_locked;
+ }
+
+ if (!task) {
+ /*
+ * Check if the @cpu we're creating an event for is online.
+ *
+ * We use the perf_cpu_context::ctx::mutex to serialize against
+ * the hotplug notifiers. See perf_event_{init,exit}_cpu().
+ */
+ struct perf_cpu_context *cpuctx = per_cpu_ptr(&perf_cpu_context, event->cpu);
+
+ if (!cpuctx->online) {
+ err = -ENODEV;
+ goto err_locked;
+ }
}
- /*
- * Look up the group leader (we will attach this event to it):
- */
if (group_leader) {
err = -EINVAL;
@@ -12107,11 +12516,11 @@ SYSCALL_DEFINE5(perf_event_open,
* becoming part of another group-sibling):
*/
if (group_leader->group_leader != group_leader)
- goto err_context;
+ goto err_locked;
/* All events in a group should have the same clock */
if (group_leader->clock != event->clock)
- goto err_context;
+ goto err_locked;
/*
* Make sure we're both events for the same CPU;
@@ -12119,133 +12528,76 @@ SYSCALL_DEFINE5(perf_event_open,
* you can never concurrently schedule them anyhow.
*/
if (group_leader->cpu != event->cpu)
- goto err_context;
-
- /*
- * Make sure we're both on the same task, or both
- * per-CPU events.
- */
- if (group_leader->ctx->task != ctx->task)
- goto err_context;
+ goto err_locked;
/*
- * Do not allow to attach to a group in a different task
- * or CPU context. If we're moving SW events, we'll fix
- * this up later, so allow that.
+ * Make sure we're both on the same context; either task or cpu.
*/
- if (!move_group && group_leader->ctx != ctx)
- goto err_context;
+ if (group_leader->ctx != ctx)
+ goto err_locked;
/*
* Only a group leader can be exclusive or pinned
*/
if (attr.exclusive || attr.pinned)
- goto err_context;
- }
-
- if (output_event) {
- err = perf_event_set_output(event, output_event);
- if (err)
- goto err_context;
- }
-
- event_file = anon_inode_getfile("[perf_event]", &perf_fops, event,
- f_flags);
- if (IS_ERR(event_file)) {
- err = PTR_ERR(event_file);
- event_file = NULL;
- goto err_context;
- }
-
- if (task) {
- err = down_read_interruptible(&task->signal->exec_update_lock);
- if (err)
- goto err_file;
-
- /*
- * Preserve ptrace permission check for backwards compatibility.
- *
- * We must hold exec_update_lock across this and any potential
- * perf_install_in_context() call for this new event to
- * serialize against exec() altering our credentials (and the
- * perf_event_exit_task() that could imply).
- */
- err = -EACCES;
- if (!perfmon_capable() && !ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS))
- goto err_cred;
- }
-
- if (move_group) {
- gctx = __perf_event_ctx_lock_double(group_leader, ctx);
-
- if (gctx->task == TASK_TOMBSTONE) {
- err = -ESRCH;
goto err_locked;
- }
- /*
- * Check if we raced against another sys_perf_event_open() call
- * moving the software group underneath us.
- */
- if (!(group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
+ if (is_software_event(event) &&
+ !in_software_context(group_leader)) {
/*
- * If someone moved the group out from under us, check
- * if this new event wound up on the same ctx, if so
- * its the regular !move_group case, otherwise fail.
+ * If the event is a sw event, but the group_leader
+ * is on hw context.
+ *
+ * Allow the addition of software events to hw
+ * groups, this is safe because software events
+ * never fail to schedule.
+ *
+ * Note the comment that goes with struct
+ * perf_event_pmu_context.
*/
- if (gctx != ctx) {
- err = -EINVAL;
- goto err_locked;
- } else {
- perf_event_ctx_unlock(group_leader, gctx);
- move_group = 0;
+ pmu = group_leader->pmu_ctx->pmu;
+ } else if (!is_software_event(event)) {
+ if (is_software_event(group_leader) &&
+ (group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
+ /*
+ * In case the group is a pure software group, and we
+ * try to add a hardware event, move the whole group to
+ * the hardware context.
+ */
+ move_group = 1;
}
- }
-
- /*
- * Failure to create exclusive events returns -EBUSY.
- */
- err = -EBUSY;
- if (!exclusive_event_installable(group_leader, ctx))
- goto err_locked;
- for_each_sibling_event(sibling, group_leader) {
- if (!exclusive_event_installable(sibling, ctx))
+ /* Don't allow group of multiple hw events from different pmus */
+ if (!in_software_context(group_leader) &&
+ group_leader->pmu_ctx->pmu != pmu)
goto err_locked;
}
- } else {
- mutex_lock(&ctx->mutex);
}
- if (ctx->task == TASK_TOMBSTONE) {
- err = -ESRCH;
+ /*
+ * Now that we're certain of the pmu; find the pmu_ctx.
+ */
+ pmu_ctx = find_get_pmu_context(pmu, ctx, event);
+ if (IS_ERR(pmu_ctx)) {
+ err = PTR_ERR(pmu_ctx);
goto err_locked;
}
+ event->pmu_ctx = pmu_ctx;
- if (!perf_event_validate_size(event)) {
- err = -E2BIG;
- goto err_locked;
+ if (output_event) {
+ err = perf_event_set_output(event, output_event);
+ if (err)
+ goto err_context;
}
- if (!task) {
- /*
- * Check if the @cpu we're creating an event for is online.
- *
- * We use the perf_cpu_context::ctx::mutex to serialize against
- * the hotplug notifiers. See perf_event_{init,exit}_cpu().
- */
- struct perf_cpu_context *cpuctx =
- container_of(ctx, struct perf_cpu_context, ctx);
-
- if (!cpuctx->online) {
- err = -ENODEV;
- goto err_locked;
- }
+ if (!perf_event_validate_size(event)) {
+ err = -E2BIG;
+ goto err_context;
}
if (perf_need_aux_event(event) && !perf_get_aux_event(event, group_leader)) {
err = -EINVAL;
- goto err_locked;
+ goto err_context;
}
/*
@@ -12254,36 +12606,33 @@ SYSCALL_DEFINE5(perf_event_open,
*/
if (!exclusive_event_installable(event, ctx)) {
err = -EBUSY;
- goto err_locked;
+ goto err_context;
}
WARN_ON_ONCE(ctx->parent_ctx);
+ event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, f_flags);
+ if (IS_ERR(event_file)) {
+ err = PTR_ERR(event_file);
+ event_file = NULL;
+ goto err_context;
+ }
+
/*
* This is the point on no return; we cannot fail hereafter. This is
* where we start modifying current state.
*/
if (move_group) {
- /*
- * See perf_event_ctx_lock() for comments on the details
- * of swizzling perf_event::ctx.
- */
perf_remove_from_context(group_leader, 0);
- put_ctx(gctx);
+ put_pmu_ctx(group_leader->pmu_ctx);
for_each_sibling_event(sibling, group_leader) {
perf_remove_from_context(sibling, 0);
- put_ctx(gctx);
+ put_pmu_ctx(sibling->pmu_ctx);
}
/*
- * Wait for everybody to stop referencing the events through
- * the old lists, before installing it on new lists.
- */
- synchronize_rcu();
-
- /*
* Install the group siblings before the group leader.
*
* Because a group leader will try and install the entire group
@@ -12294,9 +12643,10 @@ SYSCALL_DEFINE5(perf_event_open,
* reachable through the group lists.
*/
for_each_sibling_event(sibling, group_leader) {
+ sibling->pmu_ctx = pmu_ctx;
+ get_pmu_ctx(pmu_ctx);
perf_event__state_init(sibling);
perf_install_in_context(ctx, sibling, sibling->cpu);
- get_ctx(ctx);
}
/*
@@ -12304,9 +12654,10 @@ SYSCALL_DEFINE5(perf_event_open,
* event. What we want here is event in the initial
* startup state, ready to be add into new context.
*/
+ group_leader->pmu_ctx = pmu_ctx;
+ get_pmu_ctx(pmu_ctx);
perf_event__state_init(group_leader);
perf_install_in_context(ctx, group_leader, group_leader->cpu);
- get_ctx(ctx);
}
/*
@@ -12323,8 +12674,6 @@ SYSCALL_DEFINE5(perf_event_open,
perf_install_in_context(ctx, event, event->cpu);
perf_unpin_context(ctx);
- if (move_group)
- perf_event_ctx_unlock(group_leader, gctx);
mutex_unlock(&ctx->mutex);
if (task) {
@@ -12346,25 +12695,18 @@ SYSCALL_DEFINE5(perf_event_open,
fd_install(event_fd, event_file);
return event_fd;
+err_context:
+ put_pmu_ctx(event->pmu_ctx);
+ event->pmu_ctx = NULL; /* _free_event() */
err_locked:
- if (move_group)
- perf_event_ctx_unlock(group_leader, gctx);
mutex_unlock(&ctx->mutex);
+ perf_unpin_context(ctx);
+ put_ctx(ctx);
err_cred:
if (task)
up_read(&task->signal->exec_update_lock);
-err_file:
- fput(event_file);
-err_context:
- perf_unpin_context(ctx);
- put_ctx(ctx);
err_alloc:
- /*
- * If event_file is set, the fput() above will have called ->release()
- * and that will take care of freeing the event.
- */
- if (!event_file)
- free_event(event);
+ free_event(event);
err_task:
if (task)
put_task_struct(task);
@@ -12390,8 +12732,10 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
perf_overflow_handler_t overflow_handler,
void *context)
{
+ struct perf_event_pmu_context *pmu_ctx;
struct perf_event_context *ctx;
struct perf_event *event;
+ struct pmu *pmu;
int err;
/*
@@ -12410,14 +12754,18 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
/* Mark owner so we could distinguish it from user events. */
event->owner = TASK_TOMBSTONE;
+ pmu = event->pmu;
+
+ if (pmu->task_ctx_nr == perf_sw_context)
+ event->event_caps |= PERF_EV_CAP_SOFTWARE;
/*
* Get the target context (task or percpu):
*/
- ctx = find_get_context(event->pmu, task, event);
+ ctx = find_get_context(task, event);
if (IS_ERR(ctx)) {
err = PTR_ERR(ctx);
- goto err_free;
+ goto err_alloc;
}
WARN_ON_ONCE(ctx->parent_ctx);
@@ -12427,6 +12775,13 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
goto err_unlock;
}
+ pmu_ctx = find_get_pmu_context(pmu, ctx, event);
+ if (IS_ERR(pmu_ctx)) {
+ err = PTR_ERR(pmu_ctx);
+ goto err_unlock;
+ }
+ event->pmu_ctx = pmu_ctx;
+
if (!task) {
/*
* Check if the @cpu we're creating an event for is online.
@@ -12438,13 +12793,13 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
container_of(ctx, struct perf_cpu_context, ctx);
if (!cpuctx->online) {
err = -ENODEV;
- goto err_unlock;
+ goto err_pmu_ctx;
}
}
if (!exclusive_event_installable(event, ctx)) {
err = -EBUSY;
- goto err_unlock;
+ goto err_pmu_ctx;
}
perf_install_in_context(ctx, event, event->cpu);
@@ -12453,44 +12808,59 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
return event;
+err_pmu_ctx:
+ put_pmu_ctx(pmu_ctx);
+ event->pmu_ctx = NULL; /* _free_event() */
err_unlock:
mutex_unlock(&ctx->mutex);
perf_unpin_context(ctx);
put_ctx(ctx);
-err_free:
+err_alloc:
free_event(event);
err:
return ERR_PTR(err);
}
EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
-void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
+static void __perf_pmu_remove(struct perf_event_context *ctx,
+ int cpu, struct pmu *pmu,
+ struct perf_event_groups *groups,
+ struct list_head *events)
{
- struct perf_event_context *src_ctx;
- struct perf_event_context *dst_ctx;
- struct perf_event *event, *tmp;
- LIST_HEAD(events);
-
- src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx;
- dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx;
+ struct perf_event *event, *sibling;
- /*
- * See perf_event_ctx_lock() for comments on the details
- * of swizzling perf_event::ctx.
- */
- mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex);
- list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
- event_entry) {
+ perf_event_groups_for_cpu_pmu(event, groups, cpu, pmu) {
perf_remove_from_context(event, 0);
- unaccount_event_cpu(event, src_cpu);
- put_ctx(src_ctx);
- list_add(&event->migrate_entry, &events);
+ put_pmu_ctx(event->pmu_ctx);
+ list_add(&event->migrate_entry, events);
+
+ for_each_sibling_event(sibling, event) {
+ perf_remove_from_context(sibling, 0);
+ put_pmu_ctx(sibling->pmu_ctx);
+ list_add(&sibling->migrate_entry, events);
+ }
}
+}
- /*
- * Wait for the events to quiesce before re-instating them.
- */
- synchronize_rcu();
+static void __perf_pmu_install_event(struct pmu *pmu,
+ struct perf_event_context *ctx,
+ int cpu, struct perf_event *event)
+{
+ struct perf_event_pmu_context *epc;
+
+ event->cpu = cpu;
+ epc = find_get_pmu_context(pmu, ctx, event);
+ event->pmu_ctx = epc;
+
+ if (event->state >= PERF_EVENT_STATE_OFF)
+ event->state = PERF_EVENT_STATE_INACTIVE;
+ perf_install_in_context(ctx, event, cpu);
+}
+
+static void __perf_pmu_install(struct perf_event_context *ctx,
+ int cpu, struct pmu *pmu, struct list_head *events)
+{
+ struct perf_event *event, *tmp;
/*
* Re-instate events in 2 passes.
@@ -12500,30 +12870,50 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
* leader will enable its siblings, even if those are still on the old
* context.
*/
- list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
+ list_for_each_entry_safe(event, tmp, events, migrate_entry) {
if (event->group_leader == event)
continue;
list_del(&event->migrate_entry);
- if (event->state >= PERF_EVENT_STATE_OFF)
- event->state = PERF_EVENT_STATE_INACTIVE;
- account_event_cpu(event, dst_cpu);
- perf_install_in_context(dst_ctx, event, dst_cpu);
- get_ctx(dst_ctx);
+ __perf_pmu_install_event(pmu, ctx, cpu, event);
}
/*
* Once all the siblings are setup properly, install the group leaders
* to make it go.
*/
- list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
+ list_for_each_entry_safe(event, tmp, events, migrate_entry) {
list_del(&event->migrate_entry);
- if (event->state >= PERF_EVENT_STATE_OFF)
- event->state = PERF_EVENT_STATE_INACTIVE;
- account_event_cpu(event, dst_cpu);
- perf_install_in_context(dst_ctx, event, dst_cpu);
- get_ctx(dst_ctx);
+ __perf_pmu_install_event(pmu, ctx, cpu, event);
+ }
+}
+
+void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
+{
+ struct perf_event_context *src_ctx, *dst_ctx;
+ LIST_HEAD(events);
+
+ src_ctx = &per_cpu_ptr(&perf_cpu_context, src_cpu)->ctx;
+ dst_ctx = &per_cpu_ptr(&perf_cpu_context, dst_cpu)->ctx;
+
+ /*
+ * See perf_event_ctx_lock() for comments on the details
+ * of swizzling perf_event::ctx.
+ */
+ mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex);
+
+ __perf_pmu_remove(src_ctx, src_cpu, pmu, &src_ctx->pinned_groups, &events);
+ __perf_pmu_remove(src_ctx, src_cpu, pmu, &src_ctx->flexible_groups, &events);
+
+ if (!list_empty(&events)) {
+ /*
+ * Wait for the events to quiesce before re-instating them.
+ */
+ synchronize_rcu();
+
+ __perf_pmu_install(dst_ctx, dst_cpu, pmu, &events);
}
+
mutex_unlock(&dst_ctx->mutex);
mutex_unlock(&src_ctx->mutex);
}
@@ -12603,14 +12993,14 @@ perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx)
perf_event_wakeup(event);
}
-static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
+static void perf_event_exit_task_context(struct task_struct *child)
{
struct perf_event_context *child_ctx, *clone_ctx = NULL;
struct perf_event *child_event, *next;
WARN_ON_ONCE(child != current);
- child_ctx = perf_pin_task_context(child, ctxn);
+ child_ctx = perf_pin_task_context(child);
if (!child_ctx)
return;
@@ -12632,13 +13022,13 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
* in.
*/
raw_spin_lock_irq(&child_ctx->lock);
- task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx, EVENT_ALL);
+ task_ctx_sched_out(child_ctx, EVENT_ALL);
/*
* Now that the context is inactive, destroy the task <-> ctx relation
* and mark the context dead.
*/
- RCU_INIT_POINTER(child->perf_event_ctxp[ctxn], NULL);
+ RCU_INIT_POINTER(child->perf_event_ctxp, NULL);
put_ctx(child_ctx); /* cannot be last */
WRITE_ONCE(child_ctx->task, TASK_TOMBSTONE);
put_task_struct(current); /* cannot be last */
@@ -12673,7 +13063,6 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
void perf_event_exit_task(struct task_struct *child)
{
struct perf_event *event, *tmp;
- int ctxn;
mutex_lock(&child->perf_event_mutex);
list_for_each_entry_safe(event, tmp, &child->perf_event_list,
@@ -12689,8 +13078,7 @@ void perf_event_exit_task(struct task_struct *child)
}
mutex_unlock(&child->perf_event_mutex);
- for_each_task_context_nr(ctxn)
- perf_event_exit_task_context(child, ctxn);
+ perf_event_exit_task_context(child);
/*
* The perf_event_exit_task_context calls perf_event_task
@@ -12733,56 +13121,51 @@ void perf_event_free_task(struct task_struct *task)
{
struct perf_event_context *ctx;
struct perf_event *event, *tmp;
- int ctxn;
- for_each_task_context_nr(ctxn) {
- ctx = task->perf_event_ctxp[ctxn];
- if (!ctx)
- continue;
+ ctx = rcu_access_pointer(task->perf_event_ctxp);
+ if (!ctx)
+ return;
- mutex_lock(&ctx->mutex);
- raw_spin_lock_irq(&ctx->lock);
- /*
- * Destroy the task <-> ctx relation and mark the context dead.
- *
- * This is important because even though the task hasn't been
- * exposed yet the context has been (through child_list).
- */
- RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], NULL);
- WRITE_ONCE(ctx->task, TASK_TOMBSTONE);
- put_task_struct(task); /* cannot be last */
- raw_spin_unlock_irq(&ctx->lock);
+ mutex_lock(&ctx->mutex);
+ raw_spin_lock_irq(&ctx->lock);
+ /*
+ * Destroy the task <-> ctx relation and mark the context dead.
+ *
+ * This is important because even though the task hasn't been
+ * exposed yet the context has been (through child_list).
+ */
+ RCU_INIT_POINTER(task->perf_event_ctxp, NULL);
+ WRITE_ONCE(ctx->task, TASK_TOMBSTONE);
+ put_task_struct(task); /* cannot be last */
+ raw_spin_unlock_irq(&ctx->lock);
- list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry)
- perf_free_event(event, ctx);
- mutex_unlock(&ctx->mutex);
+ list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry)
+ perf_free_event(event, ctx);
- /*
- * perf_event_release_kernel() could've stolen some of our
- * child events and still have them on its free_list. In that
- * case we must wait for these events to have been freed (in
- * particular all their references to this task must've been
- * dropped).
- *
- * Without this copy_process() will unconditionally free this
- * task (irrespective of its reference count) and
- * _free_event()'s put_task_struct(event->hw.target) will be a
- * use-after-free.
- *
- * Wait for all events to drop their context reference.
- */
- wait_var_event(&ctx->refcount, refcount_read(&ctx->refcount) == 1);
- put_ctx(ctx); /* must be last */
- }
+ mutex_unlock(&ctx->mutex);
+
+ /*
+ * perf_event_release_kernel() could've stolen some of our
+ * child events and still have them on its free_list. In that
+ * case we must wait for these events to have been freed (in
+ * particular all their references to this task must've been
+ * dropped).
+ *
+ * Without this copy_process() will unconditionally free this
+ * task (irrespective of its reference count) and
+ * _free_event()'s put_task_struct(event->hw.target) will be a
+ * use-after-free.
+ *
+ * Wait for all events to drop their context reference.
+ */
+ wait_var_event(&ctx->refcount, refcount_read(&ctx->refcount) == 1);
+ put_ctx(ctx); /* must be last */
}
void perf_event_delayed_put(struct task_struct *task)
{
- int ctxn;
-
- for_each_task_context_nr(ctxn)
- WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
+ WARN_ON_ONCE(task->perf_event_ctxp);
}
struct file *perf_event_get(unsigned int fd)
@@ -12832,6 +13215,7 @@ inherit_event(struct perf_event *parent_event,
struct perf_event_context *child_ctx)
{
enum perf_event_state parent_state = parent_event->state;
+ struct perf_event_pmu_context *pmu_ctx;
struct perf_event *child_event;
unsigned long flags;
@@ -12852,17 +13236,12 @@ inherit_event(struct perf_event *parent_event,
if (IS_ERR(child_event))
return child_event;
-
- if ((child_event->attach_state & PERF_ATTACH_TASK_DATA) &&
- !child_ctx->task_ctx_data) {
- struct pmu *pmu = child_event->pmu;
-
- child_ctx->task_ctx_data = alloc_task_ctx_data(pmu);
- if (!child_ctx->task_ctx_data) {
- free_event(child_event);
- return ERR_PTR(-ENOMEM);
- }
+ pmu_ctx = find_get_pmu_context(child_event->pmu, child_ctx, child_event);
+ if (IS_ERR(pmu_ctx)) {
+ free_event(child_event);
+ return ERR_CAST(pmu_ctx);
}
+ child_event->pmu_ctx = pmu_ctx;
/*
* is_orphaned_event() and list_add_tail(&parent_event->child_list)
@@ -12985,11 +13364,11 @@ static int inherit_group(struct perf_event *parent_event,
static int
inherit_task_group(struct perf_event *event, struct task_struct *parent,
struct perf_event_context *parent_ctx,
- struct task_struct *child, int ctxn,
+ struct task_struct *child,
u64 clone_flags, int *inherited_all)
{
- int ret;
struct perf_event_context *child_ctx;
+ int ret;
if (!event->attr.inherit ||
(event->attr.inherit_thread && !(clone_flags & CLONE_THREAD)) ||
@@ -12999,7 +13378,7 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
return 0;
}
- child_ctx = child->perf_event_ctxp[ctxn];
+ child_ctx = child->perf_event_ctxp;
if (!child_ctx) {
/*
* This is executed from the parent task context, so
@@ -13007,16 +13386,14 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
* First allocate and initialize a context for the
* child.
*/
- child_ctx = alloc_perf_context(parent_ctx->pmu, child);
+ child_ctx = alloc_perf_context(child);
if (!child_ctx)
return -ENOMEM;
- child->perf_event_ctxp[ctxn] = child_ctx;
+ child->perf_event_ctxp = child_ctx;
}
- ret = inherit_group(event, parent, parent_ctx,
- child, child_ctx);
-
+ ret = inherit_group(event, parent, parent_ctx, child, child_ctx);
if (ret)
*inherited_all = 0;
@@ -13026,8 +13403,7 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
/*
* Initialize the perf_event context in task_struct
*/
-static int perf_event_init_context(struct task_struct *child, int ctxn,
- u64 clone_flags)
+static int perf_event_init_context(struct task_struct *child, u64 clone_flags)
{
struct perf_event_context *child_ctx, *parent_ctx;
struct perf_event_context *cloned_ctx;
@@ -13037,14 +13413,14 @@ static int perf_event_init_context(struct task_struct *child, int ctxn,
unsigned long flags;
int ret = 0;
- if (likely(!parent->perf_event_ctxp[ctxn]))
+ if (likely(!parent->perf_event_ctxp))
return 0;
/*
* If the parent's context is a clone, pin it so it won't get
* swapped under us.
*/
- parent_ctx = perf_pin_task_context(parent, ctxn);
+ parent_ctx = perf_pin_task_context(parent);
if (!parent_ctx)
return 0;
@@ -13067,8 +13443,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn,
*/
perf_event_groups_for_each(event, &parent_ctx->pinned_groups) {
ret = inherit_task_group(event, parent, parent_ctx,
- child, ctxn, clone_flags,
- &inherited_all);
+ child, clone_flags, &inherited_all);
if (ret)
goto out_unlock;
}
@@ -13084,8 +13459,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn,
perf_event_groups_for_each(event, &parent_ctx->flexible_groups) {
ret = inherit_task_group(event, parent, parent_ctx,
- child, ctxn, clone_flags,
- &inherited_all);
+ child, clone_flags, &inherited_all);
if (ret)
goto out_unlock;
}
@@ -13093,7 +13467,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn,
raw_spin_lock_irqsave(&parent_ctx->lock, flags);
parent_ctx->rotate_disable = 0;
- child_ctx = child->perf_event_ctxp[ctxn];
+ child_ctx = child->perf_event_ctxp;
if (child_ctx && inherited_all) {
/*
@@ -13129,18 +13503,16 @@ out_unlock:
*/
int perf_event_init_task(struct task_struct *child, u64 clone_flags)
{
- int ctxn, ret;
+ int ret;
- memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp));
+ child->perf_event_ctxp = NULL;
mutex_init(&child->perf_event_mutex);
INIT_LIST_HEAD(&child->perf_event_list);
- for_each_task_context_nr(ctxn) {
- ret = perf_event_init_context(child, ctxn, clone_flags);
- if (ret) {
- perf_event_free_task(child);
- return ret;
- }
+ ret = perf_event_init_context(child, clone_flags);
+ if (ret) {
+ perf_event_free_task(child);
+ return ret;
}
return 0;
@@ -13149,6 +13521,7 @@ int perf_event_init_task(struct task_struct *child, u64 clone_flags)
static void __init perf_event_init_all_cpus(void)
{
struct swevent_htable *swhash;
+ struct perf_cpu_context *cpuctx;
int cpu;
zalloc_cpumask_var(&perf_online_mask, GFP_KERNEL);
@@ -13156,15 +13529,19 @@ static void __init perf_event_init_all_cpus(void)
for_each_possible_cpu(cpu) {
swhash = &per_cpu(swevent_htable, cpu);
mutex_init(&swhash->hlist_mutex);
- INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu));
INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu));
raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu));
-#ifdef CONFIG_CGROUP_PERF
- INIT_LIST_HEAD(&per_cpu(cgrp_cpuctx_list, cpu));
-#endif
INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu));
+
+ cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
+ __perf_event_init_context(&cpuctx->ctx);
+ lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
+ lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
+ cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask);
+ cpuctx->heap_size = ARRAY_SIZE(cpuctx->heap_default);
+ cpuctx->heap = cpuctx->heap_default;
}
}
@@ -13186,12 +13563,12 @@ static void perf_swevent_init_cpu(unsigned int cpu)
#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE
static void __perf_event_exit_context(void *__info)
{
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
struct perf_event_context *ctx = __info;
- struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
struct perf_event *event;
raw_spin_lock(&ctx->lock);
- ctx_sched_out(ctx, cpuctx, EVENT_TIME);
+ ctx_sched_out(ctx, EVENT_TIME);
list_for_each_entry(event, &ctx->event_list, event_entry)
__perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP);
raw_spin_unlock(&ctx->lock);
@@ -13201,18 +13578,16 @@ static void perf_event_exit_cpu_context(int cpu)
{
struct perf_cpu_context *cpuctx;
struct perf_event_context *ctx;
- struct pmu *pmu;
+ // XXX simplify cpuctx->online
mutex_lock(&pmus_lock);
- list_for_each_entry(pmu, &pmus, entry) {
- cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
- ctx = &cpuctx->ctx;
+ cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
+ ctx = &cpuctx->ctx;
- mutex_lock(&ctx->mutex);
- smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
- cpuctx->online = 0;
- mutex_unlock(&ctx->mutex);
- }
+ mutex_lock(&ctx->mutex);
+ smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
+ cpuctx->online = 0;
+ mutex_unlock(&ctx->mutex);
cpumask_clear_cpu(cpu, perf_online_mask);
mutex_unlock(&pmus_lock);
}
@@ -13226,20 +13601,17 @@ int perf_event_init_cpu(unsigned int cpu)
{
struct perf_cpu_context *cpuctx;
struct perf_event_context *ctx;
- struct pmu *pmu;
perf_swevent_init_cpu(cpu);
mutex_lock(&pmus_lock);
cpumask_set_cpu(cpu, perf_online_mask);
- list_for_each_entry(pmu, &pmus, entry) {
- cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
- ctx = &cpuctx->ctx;
+ cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
+ ctx = &cpuctx->ctx;
- mutex_lock(&ctx->mutex);
- cpuctx->online = 1;
- mutex_unlock(&ctx->mutex);
- }
+ mutex_lock(&ctx->mutex);
+ cpuctx->online = 1;
+ mutex_unlock(&ctx->mutex);
mutex_unlock(&pmus_lock);
return 0;
@@ -13376,9 +13748,11 @@ static int perf_cgroup_css_online(struct cgroup_subsys_state *css)
static int __perf_cgroup_move(void *info)
{
struct task_struct *task = info;
- rcu_read_lock();
- perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN);
- rcu_read_unlock();
+
+ preempt_disable();
+ perf_cgroup_switch(task);
+ preempt_enable();
+
return 0;
}
@@ -13405,3 +13779,5 @@ struct cgroup_subsys perf_event_cgrp_subsys = {
.threaded = true,
};
#endif /* CONFIG_CGROUP_PERF */
+
+DEFINE_STATIC_CALL_RET0(perf_snapshot_branch_stack, perf_snapshot_branch_stack_t);
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index 835973444a1e..c3797701339c 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -17,61 +17,276 @@
* This file contains the arch-independent routines.
*/
+#include <linux/hw_breakpoint.h>
+
+#include <linux/atomic.h>
+#include <linux/bug.h>
+#include <linux/cpu.h>
+#include <linux/export.h>
+#include <linux/init.h>
#include <linux/irqflags.h>
-#include <linux/kallsyms.h>
-#include <linux/notifier.h>
-#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/kernel.h>
-#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/notifier.h>
+#include <linux/percpu-rwsem.h>
#include <linux/percpu.h>
+#include <linux/rhashtable.h>
#include <linux/sched.h>
-#include <linux/init.h>
#include <linux/slab.h>
-#include <linux/list.h>
-#include <linux/cpu.h>
-#include <linux/smp.h>
-#include <linux/bug.h>
-#include <linux/hw_breakpoint.h>
/*
- * Constraints data
+ * Datastructure to track the total uses of N slots across tasks or CPUs;
+ * bp_slots_histogram::count[N] is the number of assigned N+1 breakpoint slots.
+ */
+struct bp_slots_histogram {
+#ifdef hw_breakpoint_slots
+ atomic_t count[hw_breakpoint_slots(0)];
+#else
+ atomic_t *count;
+#endif
+};
+
+/*
+ * Per-CPU constraints data.
*/
struct bp_cpuinfo {
- /* Number of pinned cpu breakpoints in a cpu */
- unsigned int cpu_pinned;
- /* tsk_pinned[n] is the number of tasks having n+1 breakpoints */
- unsigned int *tsk_pinned;
- /* Number of non-pinned cpu/task breakpoints in a cpu */
- unsigned int flexible; /* XXX: placeholder, see fetch_this_slot() */
+ /* Number of pinned CPU breakpoints in a CPU. */
+ unsigned int cpu_pinned;
+ /* Histogram of pinned task breakpoints in a CPU. */
+ struct bp_slots_histogram tsk_pinned;
};
static DEFINE_PER_CPU(struct bp_cpuinfo, bp_cpuinfo[TYPE_MAX]);
-static int nr_slots[TYPE_MAX];
static struct bp_cpuinfo *get_bp_info(int cpu, enum bp_type_idx type)
{
return per_cpu_ptr(bp_cpuinfo + type, cpu);
}
+/* Number of pinned CPU breakpoints globally. */
+static struct bp_slots_histogram cpu_pinned[TYPE_MAX];
+/* Number of pinned CPU-independent task breakpoints. */
+static struct bp_slots_histogram tsk_pinned_all[TYPE_MAX];
+
/* Keep track of the breakpoints attached to tasks */
-static LIST_HEAD(bp_task_head);
+static struct rhltable task_bps_ht;
+static const struct rhashtable_params task_bps_ht_params = {
+ .head_offset = offsetof(struct hw_perf_event, bp_list),
+ .key_offset = offsetof(struct hw_perf_event, target),
+ .key_len = sizeof_field(struct hw_perf_event, target),
+ .automatic_shrinking = true,
+};
-static int constraints_initialized;
+static bool constraints_initialized __ro_after_init;
-/* Gather the number of total pinned and un-pinned bp in a cpuset */
-struct bp_busy_slots {
- unsigned int pinned;
- unsigned int flexible;
-};
+/*
+ * Synchronizes accesses to the per-CPU constraints; the locking rules are:
+ *
+ * 1. Atomic updates to bp_cpuinfo::tsk_pinned only require a held read-lock
+ * (due to bp_slots_histogram::count being atomic, no update are lost).
+ *
+ * 2. Holding a write-lock is required for computations that require a
+ * stable snapshot of all bp_cpuinfo::tsk_pinned.
+ *
+ * 3. In all other cases, non-atomic accesses require the appropriately held
+ * lock (read-lock for read-only accesses; write-lock for reads/writes).
+ */
+DEFINE_STATIC_PERCPU_RWSEM(bp_cpuinfo_sem);
-/* Serialize accesses to the above constraints */
-static DEFINE_MUTEX(nr_bp_mutex);
+/*
+ * Return mutex to serialize accesses to per-task lists in task_bps_ht. Since
+ * rhltable synchronizes concurrent insertions/deletions, independent tasks may
+ * insert/delete concurrently; therefore, a mutex per task is sufficient.
+ *
+ * Uses task_struct::perf_event_mutex, to avoid extending task_struct with a
+ * hw_breakpoint-only mutex, which may be infrequently used. The caveat here is
+ * that hw_breakpoint may contend with per-task perf event list management. The
+ * assumption is that perf usecases involving hw_breakpoints are very unlikely
+ * to result in unnecessary contention.
+ */
+static inline struct mutex *get_task_bps_mutex(struct perf_event *bp)
+{
+ struct task_struct *tsk = bp->hw.target;
-__weak int hw_breakpoint_weight(struct perf_event *bp)
+ return tsk ? &tsk->perf_event_mutex : NULL;
+}
+
+static struct mutex *bp_constraints_lock(struct perf_event *bp)
+{
+ struct mutex *tsk_mtx = get_task_bps_mutex(bp);
+
+ if (tsk_mtx) {
+ /*
+ * Fully analogous to the perf_try_init_event() nesting
+ * argument in the comment near perf_event_ctx_lock_nested();
+ * this child->perf_event_mutex cannot ever deadlock against
+ * the parent->perf_event_mutex usage from
+ * perf_event_task_{en,dis}able().
+ *
+ * Specifically, inherited events will never occur on
+ * ->perf_event_list.
+ */
+ mutex_lock_nested(tsk_mtx, SINGLE_DEPTH_NESTING);
+ percpu_down_read(&bp_cpuinfo_sem);
+ } else {
+ percpu_down_write(&bp_cpuinfo_sem);
+ }
+
+ return tsk_mtx;
+}
+
+static void bp_constraints_unlock(struct mutex *tsk_mtx)
+{
+ if (tsk_mtx) {
+ percpu_up_read(&bp_cpuinfo_sem);
+ mutex_unlock(tsk_mtx);
+ } else {
+ percpu_up_write(&bp_cpuinfo_sem);
+ }
+}
+
+static bool bp_constraints_is_locked(struct perf_event *bp)
+{
+ struct mutex *tsk_mtx = get_task_bps_mutex(bp);
+
+ return percpu_is_write_locked(&bp_cpuinfo_sem) ||
+ (tsk_mtx ? mutex_is_locked(tsk_mtx) :
+ percpu_is_read_locked(&bp_cpuinfo_sem));
+}
+
+static inline void assert_bp_constraints_lock_held(struct perf_event *bp)
+{
+ struct mutex *tsk_mtx = get_task_bps_mutex(bp);
+
+ if (tsk_mtx)
+ lockdep_assert_held(tsk_mtx);
+ lockdep_assert_held(&bp_cpuinfo_sem);
+}
+
+#ifdef hw_breakpoint_slots
+/*
+ * Number of breakpoint slots is constant, and the same for all types.
+ */
+static_assert(hw_breakpoint_slots(TYPE_INST) == hw_breakpoint_slots(TYPE_DATA));
+static inline int hw_breakpoint_slots_cached(int type) { return hw_breakpoint_slots(type); }
+static inline int init_breakpoint_slots(void) { return 0; }
+#else
+/*
+ * Dynamic number of breakpoint slots.
+ */
+static int __nr_bp_slots[TYPE_MAX] __ro_after_init;
+
+static inline int hw_breakpoint_slots_cached(int type)
+{
+ return __nr_bp_slots[type];
+}
+
+static __init bool
+bp_slots_histogram_alloc(struct bp_slots_histogram *hist, enum bp_type_idx type)
+{
+ hist->count = kcalloc(hw_breakpoint_slots_cached(type), sizeof(*hist->count), GFP_KERNEL);
+ return hist->count;
+}
+
+static __init void bp_slots_histogram_free(struct bp_slots_histogram *hist)
+{
+ kfree(hist->count);
+}
+
+static __init int init_breakpoint_slots(void)
+{
+ int i, cpu, err_cpu;
+
+ for (i = 0; i < TYPE_MAX; i++)
+ __nr_bp_slots[i] = hw_breakpoint_slots(i);
+
+ for_each_possible_cpu(cpu) {
+ for (i = 0; i < TYPE_MAX; i++) {
+ struct bp_cpuinfo *info = get_bp_info(cpu, i);
+
+ if (!bp_slots_histogram_alloc(&info->tsk_pinned, i))
+ goto err;
+ }
+ }
+ for (i = 0; i < TYPE_MAX; i++) {
+ if (!bp_slots_histogram_alloc(&cpu_pinned[i], i))
+ goto err;
+ if (!bp_slots_histogram_alloc(&tsk_pinned_all[i], i))
+ goto err;
+ }
+
+ return 0;
+err:
+ for_each_possible_cpu(err_cpu) {
+ for (i = 0; i < TYPE_MAX; i++)
+ bp_slots_histogram_free(&get_bp_info(err_cpu, i)->tsk_pinned);
+ if (err_cpu == cpu)
+ break;
+ }
+ for (i = 0; i < TYPE_MAX; i++) {
+ bp_slots_histogram_free(&cpu_pinned[i]);
+ bp_slots_histogram_free(&tsk_pinned_all[i]);
+ }
+
+ return -ENOMEM;
+}
+#endif
+
+static inline void
+bp_slots_histogram_add(struct bp_slots_histogram *hist, int old, int val)
+{
+ const int old_idx = old - 1;
+ const int new_idx = old_idx + val;
+
+ if (old_idx >= 0)
+ WARN_ON(atomic_dec_return_relaxed(&hist->count[old_idx]) < 0);
+ if (new_idx >= 0)
+ WARN_ON(atomic_inc_return_relaxed(&hist->count[new_idx]) < 0);
+}
+
+static int
+bp_slots_histogram_max(struct bp_slots_histogram *hist, enum bp_type_idx type)
+{
+ for (int i = hw_breakpoint_slots_cached(type) - 1; i >= 0; i--) {
+ const int count = atomic_read(&hist->count[i]);
+
+ /* Catch unexpected writers; we want a stable snapshot. */
+ ASSERT_EXCLUSIVE_WRITER(hist->count[i]);
+ if (count > 0)
+ return i + 1;
+ WARN(count < 0, "inconsistent breakpoint slots histogram");
+ }
+
+ return 0;
+}
+
+static int
+bp_slots_histogram_max_merge(struct bp_slots_histogram *hist1, struct bp_slots_histogram *hist2,
+ enum bp_type_idx type)
+{
+ for (int i = hw_breakpoint_slots_cached(type) - 1; i >= 0; i--) {
+ const int count1 = atomic_read(&hist1->count[i]);
+ const int count2 = atomic_read(&hist2->count[i]);
+
+ /* Catch unexpected writers; we want a stable snapshot. */
+ ASSERT_EXCLUSIVE_WRITER(hist1->count[i]);
+ ASSERT_EXCLUSIVE_WRITER(hist2->count[i]);
+ if (count1 + count2 > 0)
+ return i + 1;
+ WARN(count1 < 0, "inconsistent breakpoint slots histogram");
+ WARN(count2 < 0, "inconsistent breakpoint slots histogram");
+ }
+
+ return 0;
+}
+
+#ifndef hw_breakpoint_weight
+static inline int hw_breakpoint_weight(struct perf_event *bp)
{
return 1;
}
+#endif
static inline enum bp_type_idx find_slot_idx(u64 bp_type)
{
@@ -82,39 +297,61 @@ static inline enum bp_type_idx find_slot_idx(u64 bp_type)
}
/*
- * Report the maximum number of pinned breakpoints a task
- * have in this cpu
+ * Return the maximum number of pinned breakpoints a task has in this CPU.
*/
static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type)
{
- unsigned int *tsk_pinned = get_bp_info(cpu, type)->tsk_pinned;
- int i;
+ struct bp_slots_histogram *tsk_pinned = &get_bp_info(cpu, type)->tsk_pinned;
- for (i = nr_slots[type] - 1; i >= 0; i--) {
- if (tsk_pinned[i] > 0)
- return i + 1;
- }
-
- return 0;
+ /*
+ * At this point we want to have acquired the bp_cpuinfo_sem as a
+ * writer to ensure that there are no concurrent writers in
+ * toggle_bp_task_slot() to tsk_pinned, and we get a stable snapshot.
+ */
+ lockdep_assert_held_write(&bp_cpuinfo_sem);
+ return bp_slots_histogram_max_merge(tsk_pinned, &tsk_pinned_all[type], type);
}
/*
* Count the number of breakpoints of the same type and same task.
* The given event must be not on the list.
+ *
+ * If @cpu is -1, but the result of task_bp_pinned() is not CPU-independent,
+ * returns a negative value.
*/
static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type)
{
- struct task_struct *tsk = bp->hw.target;
+ struct rhlist_head *head, *pos;
struct perf_event *iter;
int count = 0;
- list_for_each_entry(iter, &bp_task_head, hw.bp_list) {
- if (iter->hw.target == tsk &&
- find_slot_idx(iter->attr.bp_type) == type &&
- (iter->cpu < 0 || cpu == iter->cpu))
- count += hw_breakpoint_weight(iter);
+ /*
+ * We need a stable snapshot of the per-task breakpoint list.
+ */
+ assert_bp_constraints_lock_held(bp);
+
+ rcu_read_lock();
+ head = rhltable_lookup(&task_bps_ht, &bp->hw.target, task_bps_ht_params);
+ if (!head)
+ goto out;
+
+ rhl_for_each_entry_rcu(iter, pos, head, hw.bp_list) {
+ if (find_slot_idx(iter->attr.bp_type) != type)
+ continue;
+
+ if (iter->cpu >= 0) {
+ if (cpu == -1) {
+ count = -1;
+ goto out;
+ } else if (cpu != iter->cpu)
+ continue;
+ }
+
+ count += hw_breakpoint_weight(iter);
}
+out:
+ rcu_read_unlock();
return count;
}
@@ -126,16 +363,29 @@ static const struct cpumask *cpumask_of_bp(struct perf_event *bp)
}
/*
- * Report the number of pinned/un-pinned breakpoints we have in
- * a given cpu (cpu > -1) or in all of them (cpu = -1).
+ * Returns the max pinned breakpoint slots in a given
+ * CPU (cpu > -1) or across all of them (cpu = -1).
*/
-static void
-fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
- enum bp_type_idx type)
+static int
+max_bp_pinned_slots(struct perf_event *bp, enum bp_type_idx type)
{
const struct cpumask *cpumask = cpumask_of_bp(bp);
+ int pinned_slots = 0;
int cpu;
+ if (bp->hw.target && bp->cpu < 0) {
+ int max_pinned = task_bp_pinned(-1, bp, type);
+
+ if (max_pinned >= 0) {
+ /*
+ * Fast path: task_bp_pinned() is CPU-independent and
+ * returns the same value for any CPU.
+ */
+ max_pinned += bp_slots_histogram_max(&cpu_pinned[type], type);
+ return max_pinned;
+ }
+ }
+
for_each_cpu(cpu, cpumask) {
struct bp_cpuinfo *info = get_bp_info(cpu, type);
int nr;
@@ -146,71 +396,131 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
else
nr += task_bp_pinned(cpu, bp, type);
- if (nr > slots->pinned)
- slots->pinned = nr;
-
- nr = info->flexible;
- if (nr > slots->flexible)
- slots->flexible = nr;
+ pinned_slots = max(nr, pinned_slots);
}
-}
-/*
- * For now, continue to consider flexible as pinned, until we can
- * ensure no flexible event can ever be scheduled before a pinned event
- * in a same cpu.
- */
-static void
-fetch_this_slot(struct bp_busy_slots *slots, int weight)
-{
- slots->pinned += weight;
-}
-
-/*
- * Add a pinned breakpoint for the given task in our constraint table
- */
-static void toggle_bp_task_slot(struct perf_event *bp, int cpu,
- enum bp_type_idx type, int weight)
-{
- unsigned int *tsk_pinned = get_bp_info(cpu, type)->tsk_pinned;
- int old_idx, new_idx;
-
- old_idx = task_bp_pinned(cpu, bp, type) - 1;
- new_idx = old_idx + weight;
-
- if (old_idx >= 0)
- tsk_pinned[old_idx]--;
- if (new_idx >= 0)
- tsk_pinned[new_idx]++;
+ return pinned_slots;
}
/*
* Add/remove the given breakpoint in our constraint table
*/
-static void
-toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type,
- int weight)
+static int
+toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, int weight)
{
- const struct cpumask *cpumask = cpumask_of_bp(bp);
- int cpu;
+ int cpu, next_tsk_pinned;
if (!enable)
weight = -weight;
- /* Pinned counter cpu profiling */
if (!bp->hw.target) {
- get_bp_info(bp->cpu, type)->cpu_pinned += weight;
- return;
+ /*
+ * Update the pinned CPU slots, in per-CPU bp_cpuinfo and in the
+ * global histogram.
+ */
+ struct bp_cpuinfo *info = get_bp_info(bp->cpu, type);
+
+ lockdep_assert_held_write(&bp_cpuinfo_sem);
+ bp_slots_histogram_add(&cpu_pinned[type], info->cpu_pinned, weight);
+ info->cpu_pinned += weight;
+ return 0;
+ }
+
+ /*
+ * If bp->hw.target, tsk_pinned is only modified, but not used
+ * otherwise. We can permit concurrent updates as long as there are no
+ * other uses: having acquired bp_cpuinfo_sem as a reader allows
+ * concurrent updates here. Uses of tsk_pinned will require acquiring
+ * bp_cpuinfo_sem as a writer to stabilize tsk_pinned's value.
+ */
+ lockdep_assert_held_read(&bp_cpuinfo_sem);
+
+ /*
+ * Update the pinned task slots, in per-CPU bp_cpuinfo and in the global
+ * histogram. We need to take care of 4 cases:
+ *
+ * 1. This breakpoint targets all CPUs (cpu < 0), and there may only
+ * exist other task breakpoints targeting all CPUs. In this case we
+ * can simply update the global slots histogram.
+ *
+ * 2. This breakpoint targets a specific CPU (cpu >= 0), but there may
+ * only exist other task breakpoints targeting all CPUs.
+ *
+ * a. On enable: remove the existing breakpoints from the global
+ * slots histogram and use the per-CPU histogram.
+ *
+ * b. On disable: re-insert the existing breakpoints into the global
+ * slots histogram and remove from per-CPU histogram.
+ *
+ * 3. Some other existing task breakpoints target specific CPUs. Only
+ * update the per-CPU slots histogram.
+ */
+
+ if (!enable) {
+ /*
+ * Remove before updating histograms so we can determine if this
+ * was the last task breakpoint for a specific CPU.
+ */
+ int ret = rhltable_remove(&task_bps_ht, &bp->hw.bp_list, task_bps_ht_params);
+
+ if (ret)
+ return ret;
+ }
+ /*
+ * Note: If !enable, next_tsk_pinned will not count the to-be-removed breakpoint.
+ */
+ next_tsk_pinned = task_bp_pinned(-1, bp, type);
+
+ if (next_tsk_pinned >= 0) {
+ if (bp->cpu < 0) { /* Case 1: fast path */
+ if (!enable)
+ next_tsk_pinned += hw_breakpoint_weight(bp);
+ bp_slots_histogram_add(&tsk_pinned_all[type], next_tsk_pinned, weight);
+ } else if (enable) { /* Case 2.a: slow path */
+ /* Add existing to per-CPU histograms. */
+ for_each_possible_cpu(cpu) {
+ bp_slots_histogram_add(&get_bp_info(cpu, type)->tsk_pinned,
+ 0, next_tsk_pinned);
+ }
+ /* Add this first CPU-pinned task breakpoint. */
+ bp_slots_histogram_add(&get_bp_info(bp->cpu, type)->tsk_pinned,
+ next_tsk_pinned, weight);
+ /* Rebalance global task pinned histogram. */
+ bp_slots_histogram_add(&tsk_pinned_all[type], next_tsk_pinned,
+ -next_tsk_pinned);
+ } else { /* Case 2.b: slow path */
+ /* Remove this last CPU-pinned task breakpoint. */
+ bp_slots_histogram_add(&get_bp_info(bp->cpu, type)->tsk_pinned,
+ next_tsk_pinned + hw_breakpoint_weight(bp), weight);
+ /* Remove all from per-CPU histograms. */
+ for_each_possible_cpu(cpu) {
+ bp_slots_histogram_add(&get_bp_info(cpu, type)->tsk_pinned,
+ next_tsk_pinned, -next_tsk_pinned);
+ }
+ /* Rebalance global task pinned histogram. */
+ bp_slots_histogram_add(&tsk_pinned_all[type], 0, next_tsk_pinned);
+ }
+ } else { /* Case 3: slow path */
+ const struct cpumask *cpumask = cpumask_of_bp(bp);
+
+ for_each_cpu(cpu, cpumask) {
+ next_tsk_pinned = task_bp_pinned(cpu, bp, type);
+ if (!enable)
+ next_tsk_pinned += hw_breakpoint_weight(bp);
+ bp_slots_histogram_add(&get_bp_info(cpu, type)->tsk_pinned,
+ next_tsk_pinned, weight);
+ }
}
- /* Pinned counter task profiling */
- for_each_cpu(cpu, cpumask)
- toggle_bp_task_slot(bp, cpu, type, weight);
+ /*
+ * Readers want a stable snapshot of the per-task breakpoint list.
+ */
+ assert_bp_constraints_lock_held(bp);
if (enable)
- list_add_tail(&bp->hw.bp_list, &bp_task_head);
- else
- list_del(&bp->hw.bp_list);
+ return rhltable_insert(&task_bps_ht, &bp->hw.bp_list, task_bps_ht_params);
+
+ return 0;
}
__weak int arch_reserve_bp_slot(struct perf_event *bp)
@@ -234,7 +544,12 @@ __weak void arch_unregister_hw_breakpoint(struct perf_event *bp)
}
/*
- * Constraints to check before allowing this new breakpoint counter:
+ * Constraints to check before allowing this new breakpoint counter.
+ *
+ * Note: Flexible breakpoints are currently unimplemented, but outlined in the
+ * below algorithm for completeness. The implementation treats flexible as
+ * pinned due to no guarantee that we currently always schedule flexible events
+ * before a pinned event in a same CPU.
*
* == Non-pinned counter == (Considered as pinned for now)
*
@@ -276,8 +591,8 @@ __weak void arch_unregister_hw_breakpoint(struct perf_event *bp)
*/
static int __reserve_bp_slot(struct perf_event *bp, u64 bp_type)
{
- struct bp_busy_slots slots = {0};
enum bp_type_idx type;
+ int max_pinned_slots;
int weight;
int ret;
@@ -293,36 +608,24 @@ static int __reserve_bp_slot(struct perf_event *bp, u64 bp_type)
type = find_slot_idx(bp_type);
weight = hw_breakpoint_weight(bp);
- fetch_bp_busy_slots(&slots, bp, type);
- /*
- * Simulate the addition of this breakpoint to the constraints
- * and see the result.
- */
- fetch_this_slot(&slots, weight);
-
- /* Flexible counters need to keep at least one slot */
- if (slots.pinned + (!!slots.flexible) > nr_slots[type])
+ /* Check if this new breakpoint can be satisfied across all CPUs. */
+ max_pinned_slots = max_bp_pinned_slots(bp, type) + weight;
+ if (max_pinned_slots > hw_breakpoint_slots_cached(type))
return -ENOSPC;
ret = arch_reserve_bp_slot(bp);
if (ret)
return ret;
- toggle_bp_slot(bp, true, type, weight);
-
- return 0;
+ return toggle_bp_slot(bp, true, type, weight);
}
int reserve_bp_slot(struct perf_event *bp)
{
- int ret;
-
- mutex_lock(&nr_bp_mutex);
-
- ret = __reserve_bp_slot(bp, bp->attr.bp_type);
-
- mutex_unlock(&nr_bp_mutex);
+ struct mutex *mtx = bp_constraints_lock(bp);
+ int ret = __reserve_bp_slot(bp, bp->attr.bp_type);
+ bp_constraints_unlock(mtx);
return ret;
}
@@ -335,17 +638,16 @@ static void __release_bp_slot(struct perf_event *bp, u64 bp_type)
type = find_slot_idx(bp_type);
weight = hw_breakpoint_weight(bp);
- toggle_bp_slot(bp, false, type, weight);
+ WARN_ON(toggle_bp_slot(bp, false, type, weight));
}
void release_bp_slot(struct perf_event *bp)
{
- mutex_lock(&nr_bp_mutex);
+ struct mutex *mtx = bp_constraints_lock(bp);
arch_unregister_hw_breakpoint(bp);
__release_bp_slot(bp, bp->attr.bp_type);
-
- mutex_unlock(&nr_bp_mutex);
+ bp_constraints_unlock(mtx);
}
static int __modify_bp_slot(struct perf_event *bp, u64 old_type, u64 new_type)
@@ -372,11 +674,10 @@ static int __modify_bp_slot(struct perf_event *bp, u64 old_type, u64 new_type)
static int modify_bp_slot(struct perf_event *bp, u64 old_type, u64 new_type)
{
- int ret;
+ struct mutex *mtx = bp_constraints_lock(bp);
+ int ret = __modify_bp_slot(bp, old_type, new_type);
- mutex_lock(&nr_bp_mutex);
- ret = __modify_bp_slot(bp, old_type, new_type);
- mutex_unlock(&nr_bp_mutex);
+ bp_constraints_unlock(mtx);
return ret;
}
@@ -387,18 +688,28 @@ static int modify_bp_slot(struct perf_event *bp, u64 old_type, u64 new_type)
*/
int dbg_reserve_bp_slot(struct perf_event *bp)
{
- if (mutex_is_locked(&nr_bp_mutex))
+ int ret;
+
+ if (bp_constraints_is_locked(bp))
return -1;
- return __reserve_bp_slot(bp, bp->attr.bp_type);
+ /* Locks aren't held; disable lockdep assert checking. */
+ lockdep_off();
+ ret = __reserve_bp_slot(bp, bp->attr.bp_type);
+ lockdep_on();
+
+ return ret;
}
int dbg_release_bp_slot(struct perf_event *bp)
{
- if (mutex_is_locked(&nr_bp_mutex))
+ if (bp_constraints_is_locked(bp))
return -1;
+ /* Locks aren't held; disable lockdep assert checking. */
+ lockdep_off();
__release_bp_slot(bp, bp->attr.bp_type);
+ lockdep_on();
return 0;
}
@@ -568,7 +879,7 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr,
if (!cpu_events)
return (void __percpu __force *)ERR_PTR(-ENOMEM);
- get_online_cpus();
+ cpus_read_lock();
for_each_online_cpu(cpu) {
bp = perf_event_create_kernel_counter(attr, cpu, NULL,
triggered, context);
@@ -579,7 +890,7 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr,
per_cpu(*cpu_events, cpu) = bp;
}
- put_online_cpus();
+ cpus_read_unlock();
if (likely(!err))
return cpu_events;
@@ -604,6 +915,50 @@ void unregister_wide_hw_breakpoint(struct perf_event * __percpu *cpu_events)
}
EXPORT_SYMBOL_GPL(unregister_wide_hw_breakpoint);
+/**
+ * hw_breakpoint_is_used - check if breakpoints are currently used
+ *
+ * Returns: true if breakpoints are used, false otherwise.
+ */
+bool hw_breakpoint_is_used(void)
+{
+ int cpu;
+
+ if (!constraints_initialized)
+ return false;
+
+ for_each_possible_cpu(cpu) {
+ for (int type = 0; type < TYPE_MAX; ++type) {
+ struct bp_cpuinfo *info = get_bp_info(cpu, type);
+
+ if (info->cpu_pinned)
+ return true;
+
+ for (int slot = 0; slot < hw_breakpoint_slots_cached(type); ++slot) {
+ if (atomic_read(&info->tsk_pinned.count[slot]))
+ return true;
+ }
+ }
+ }
+
+ for (int type = 0; type < TYPE_MAX; ++type) {
+ for (int slot = 0; slot < hw_breakpoint_slots_cached(type); ++slot) {
+ /*
+ * Warn, because if there are CPU pinned counters,
+ * should never get here; bp_cpuinfo::cpu_pinned should
+ * be consistent with the global cpu_pinned histogram.
+ */
+ if (WARN_ON(atomic_read(&cpu_pinned[type].count[slot])))
+ return true;
+
+ if (atomic_read(&tsk_pinned_all[type].count[slot]))
+ return true;
+ }
+ }
+
+ return false;
+}
+
static struct notifier_block hw_breakpoint_exceptions_nb = {
.notifier_call = hw_breakpoint_exceptions_notify,
/* we need to be notified first */
@@ -678,38 +1033,19 @@ static struct pmu perf_breakpoint = {
int __init init_hw_breakpoint(void)
{
- int cpu, err_cpu;
- int i;
-
- for (i = 0; i < TYPE_MAX; i++)
- nr_slots[i] = hw_breakpoint_slots(i);
+ int ret;
- for_each_possible_cpu(cpu) {
- for (i = 0; i < TYPE_MAX; i++) {
- struct bp_cpuinfo *info = get_bp_info(cpu, i);
+ ret = rhltable_init(&task_bps_ht, &task_bps_ht_params);
+ if (ret)
+ return ret;
- info->tsk_pinned = kcalloc(nr_slots[i], sizeof(int),
- GFP_KERNEL);
- if (!info->tsk_pinned)
- goto err_alloc;
- }
- }
+ ret = init_breakpoint_slots();
+ if (ret)
+ return ret;
- constraints_initialized = 1;
+ constraints_initialized = true;
perf_pmu_register(&perf_breakpoint, "breakpoint", PERF_TYPE_BREAKPOINT);
return register_die_notifier(&hw_breakpoint_exceptions_nb);
-
- err_alloc:
- for_each_possible_cpu(err_cpu) {
- for (i = 0; i < TYPE_MAX; i++)
- kfree(get_bp_info(err_cpu, i)->tsk_pinned);
- if (err_cpu == cpu)
- break;
- }
-
- return -ENOMEM;
}
-
-
diff --git a/kernel/events/hw_breakpoint_test.c b/kernel/events/hw_breakpoint_test.c
new file mode 100644
index 000000000000..2cfeeecf8de9
--- /dev/null
+++ b/kernel/events/hw_breakpoint_test.c
@@ -0,0 +1,332 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KUnit test for hw_breakpoint constraints accounting logic.
+ *
+ * Copyright (C) 2022, Google LLC.
+ */
+
+#include <kunit/test.h>
+#include <linux/cpumask.h>
+#include <linux/hw_breakpoint.h>
+#include <linux/kthread.h>
+#include <linux/perf_event.h>
+#include <asm/hw_breakpoint.h>
+
+#define TEST_REQUIRES_BP_SLOTS(test, slots) \
+ do { \
+ if ((slots) > get_test_bp_slots()) { \
+ kunit_skip((test), "Requires breakpoint slots: %d > %d", slots, \
+ get_test_bp_slots()); \
+ } \
+ } while (0)
+
+#define TEST_EXPECT_NOSPC(expr) KUNIT_EXPECT_EQ(test, -ENOSPC, PTR_ERR(expr))
+
+#define MAX_TEST_BREAKPOINTS 512
+
+static char break_vars[MAX_TEST_BREAKPOINTS];
+static struct perf_event *test_bps[MAX_TEST_BREAKPOINTS];
+static struct task_struct *__other_task;
+
+static struct perf_event *register_test_bp(int cpu, struct task_struct *tsk, int idx)
+{
+ struct perf_event_attr attr = {};
+
+ if (WARN_ON(idx < 0 || idx >= MAX_TEST_BREAKPOINTS))
+ return NULL;
+
+ hw_breakpoint_init(&attr);
+ attr.bp_addr = (unsigned long)&break_vars[idx];
+ attr.bp_len = HW_BREAKPOINT_LEN_1;
+ attr.bp_type = HW_BREAKPOINT_RW;
+ return perf_event_create_kernel_counter(&attr, cpu, tsk, NULL, NULL);
+}
+
+static void unregister_test_bp(struct perf_event **bp)
+{
+ if (WARN_ON(IS_ERR(*bp)))
+ return;
+ if (WARN_ON(!*bp))
+ return;
+ unregister_hw_breakpoint(*bp);
+ *bp = NULL;
+}
+
+static int get_test_bp_slots(void)
+{
+ static int slots;
+
+ if (!slots)
+ slots = hw_breakpoint_slots(TYPE_DATA);
+
+ return slots;
+}
+
+static void fill_one_bp_slot(struct kunit *test, int *id, int cpu, struct task_struct *tsk)
+{
+ struct perf_event *bp = register_test_bp(cpu, tsk, *id);
+
+ KUNIT_ASSERT_NOT_NULL(test, bp);
+ KUNIT_ASSERT_FALSE(test, IS_ERR(bp));
+ KUNIT_ASSERT_NULL(test, test_bps[*id]);
+ test_bps[(*id)++] = bp;
+}
+
+/*
+ * Fills up the given @cpu/@tsk with breakpoints, only leaving @skip slots free.
+ *
+ * Returns true if this can be called again, continuing at @id.
+ */
+static bool fill_bp_slots(struct kunit *test, int *id, int cpu, struct task_struct *tsk, int skip)
+{
+ for (int i = 0; i < get_test_bp_slots() - skip; ++i)
+ fill_one_bp_slot(test, id, cpu, tsk);
+
+ return *id + get_test_bp_slots() <= MAX_TEST_BREAKPOINTS;
+}
+
+static int dummy_kthread(void *arg)
+{
+ return 0;
+}
+
+static struct task_struct *get_other_task(struct kunit *test)
+{
+ struct task_struct *tsk;
+
+ if (__other_task)
+ return __other_task;
+
+ tsk = kthread_create(dummy_kthread, NULL, "hw_breakpoint_dummy_task");
+ KUNIT_ASSERT_FALSE(test, IS_ERR(tsk));
+ __other_task = tsk;
+ return __other_task;
+}
+
+static int get_test_cpu(int num)
+{
+ int cpu;
+
+ WARN_ON(num < 0);
+
+ for_each_online_cpu(cpu) {
+ if (num-- <= 0)
+ break;
+ }
+
+ return cpu;
+}
+
+/* ===== Test cases ===== */
+
+static void test_one_cpu(struct kunit *test)
+{
+ int idx = 0;
+
+ fill_bp_slots(test, &idx, get_test_cpu(0), NULL, 0);
+ TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx));
+}
+
+static void test_many_cpus(struct kunit *test)
+{
+ int idx = 0;
+ int cpu;
+
+ /* Test that CPUs are independent. */
+ for_each_online_cpu(cpu) {
+ bool do_continue = fill_bp_slots(test, &idx, cpu, NULL, 0);
+
+ TEST_EXPECT_NOSPC(register_test_bp(cpu, NULL, idx));
+ if (!do_continue)
+ break;
+ }
+}
+
+static void test_one_task_on_all_cpus(struct kunit *test)
+{
+ int idx = 0;
+
+ fill_bp_slots(test, &idx, -1, current, 0);
+ TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx));
+ /* Remove one and adding back CPU-target should work. */
+ unregister_test_bp(&test_bps[0]);
+ fill_one_bp_slot(test, &idx, get_test_cpu(0), NULL);
+}
+
+static void test_two_tasks_on_all_cpus(struct kunit *test)
+{
+ int idx = 0;
+
+ /* Test that tasks are independent. */
+ fill_bp_slots(test, &idx, -1, current, 0);
+ fill_bp_slots(test, &idx, -1, get_other_task(test), 0);
+
+ TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(-1, get_other_task(test), idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), get_other_task(test), idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx));
+ /* Remove one from first task and adding back CPU-target should not work. */
+ unregister_test_bp(&test_bps[0]);
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx));
+}
+
+static void test_one_task_on_one_cpu(struct kunit *test)
+{
+ int idx = 0;
+
+ fill_bp_slots(test, &idx, get_test_cpu(0), current, 0);
+ TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx));
+ /*
+ * Remove one and adding back CPU-target should work; this case is
+ * special vs. above because the task's constraints are CPU-dependent.
+ */
+ unregister_test_bp(&test_bps[0]);
+ fill_one_bp_slot(test, &idx, get_test_cpu(0), NULL);
+}
+
+static void test_one_task_mixed(struct kunit *test)
+{
+ int idx = 0;
+
+ TEST_REQUIRES_BP_SLOTS(test, 3);
+
+ fill_one_bp_slot(test, &idx, get_test_cpu(0), current);
+ fill_bp_slots(test, &idx, -1, current, 1);
+ TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx));
+
+ /* Transition from CPU-dependent pinned count to CPU-independent. */
+ unregister_test_bp(&test_bps[0]);
+ unregister_test_bp(&test_bps[1]);
+ fill_one_bp_slot(test, &idx, get_test_cpu(0), NULL);
+ fill_one_bp_slot(test, &idx, get_test_cpu(0), NULL);
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx));
+}
+
+static void test_two_tasks_on_one_cpu(struct kunit *test)
+{
+ int idx = 0;
+
+ fill_bp_slots(test, &idx, get_test_cpu(0), current, 0);
+ fill_bp_slots(test, &idx, get_test_cpu(0), get_other_task(test), 0);
+
+ TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(-1, get_other_task(test), idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), get_other_task(test), idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx));
+ /* Can still create breakpoints on some other CPU. */
+ fill_bp_slots(test, &idx, get_test_cpu(1), NULL, 0);
+}
+
+static void test_two_tasks_on_one_all_cpus(struct kunit *test)
+{
+ int idx = 0;
+
+ fill_bp_slots(test, &idx, get_test_cpu(0), current, 0);
+ fill_bp_slots(test, &idx, -1, get_other_task(test), 0);
+
+ TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(-1, get_other_task(test), idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), get_other_task(test), idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx));
+ /* Cannot create breakpoints on some other CPU either. */
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(1), NULL, idx));
+}
+
+static void test_task_on_all_and_one_cpu(struct kunit *test)
+{
+ int tsk_on_cpu_idx, cpu_idx;
+ int idx = 0;
+
+ TEST_REQUIRES_BP_SLOTS(test, 3);
+
+ fill_bp_slots(test, &idx, -1, current, 2);
+ /* Transitioning from only all CPU breakpoints to mixed. */
+ tsk_on_cpu_idx = idx;
+ fill_one_bp_slot(test, &idx, get_test_cpu(0), current);
+ fill_one_bp_slot(test, &idx, -1, current);
+
+ TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx));
+
+ /* We should still be able to use up another CPU's slots. */
+ cpu_idx = idx;
+ fill_one_bp_slot(test, &idx, get_test_cpu(1), NULL);
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(1), NULL, idx));
+
+ /* Transitioning back to task target on all CPUs. */
+ unregister_test_bp(&test_bps[tsk_on_cpu_idx]);
+ /* Still have a CPU target breakpoint in get_test_cpu(1). */
+ TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx));
+ /* Remove it and try again. */
+ unregister_test_bp(&test_bps[cpu_idx]);
+ fill_one_bp_slot(test, &idx, -1, current);
+
+ TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(1), NULL, idx));
+}
+
+static struct kunit_case hw_breakpoint_test_cases[] = {
+ KUNIT_CASE(test_one_cpu),
+ KUNIT_CASE(test_many_cpus),
+ KUNIT_CASE(test_one_task_on_all_cpus),
+ KUNIT_CASE(test_two_tasks_on_all_cpus),
+ KUNIT_CASE(test_one_task_on_one_cpu),
+ KUNIT_CASE(test_one_task_mixed),
+ KUNIT_CASE(test_two_tasks_on_one_cpu),
+ KUNIT_CASE(test_two_tasks_on_one_all_cpus),
+ KUNIT_CASE(test_task_on_all_and_one_cpu),
+ {},
+};
+
+static int test_init(struct kunit *test)
+{
+ /* Most test cases want 2 distinct CPUs. */
+ if (num_online_cpus() < 2)
+ kunit_skip(test, "not enough cpus");
+
+ /* Want the system to not use breakpoints elsewhere. */
+ if (hw_breakpoint_is_used())
+ kunit_skip(test, "hw breakpoint already in use");
+
+ return 0;
+}
+
+static void test_exit(struct kunit *test)
+{
+ for (int i = 0; i < MAX_TEST_BREAKPOINTS; ++i) {
+ if (test_bps[i])
+ unregister_test_bp(&test_bps[i]);
+ }
+
+ if (__other_task) {
+ kthread_stop(__other_task);
+ __other_task = NULL;
+ }
+
+ /* Verify that internal state agrees that no breakpoints are in use. */
+ KUNIT_EXPECT_FALSE(test, hw_breakpoint_is_used());
+}
+
+static struct kunit_suite hw_breakpoint_test_suite = {
+ .name = "hw_breakpoint",
+ .test_cases = hw_breakpoint_test_cases,
+ .init = test_init,
+ .exit = test_exit,
+};
+
+kunit_test_suites(&hw_breakpoint_test_suite);
+
+MODULE_AUTHOR("Marco Elver <elver@google.com>");
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 228801e20788..5150d5f84c03 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -116,6 +116,11 @@ static inline int page_order(struct perf_buffer *rb)
}
#endif
+static inline int data_page_nr(struct perf_buffer *rb)
+{
+ return rb->nr_pages << page_order(rb);
+}
+
static inline unsigned long perf_data_size(struct perf_buffer *rb)
{
return rb->nr_pages << (PAGE_SHIFT + page_order(rb));
@@ -205,12 +210,7 @@ DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user)
static inline int get_recursion_context(int *recursion)
{
- unsigned int pc = preempt_count();
- unsigned char rctx = 0;
-
- rctx += !!(pc & (NMI_MASK));
- rctx += !!(pc & (NMI_MASK | HARDIRQ_MASK));
- rctx += !!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET));
+ unsigned char rctx = interrupt_context_level();
if (recursion[rctx])
return -1;
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 52868716ec35..a0433f37b024 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -22,7 +22,7 @@ static void perf_output_wakeup(struct perf_output_handle *handle)
atomic_set(&handle->rb->poll, EPOLLIN);
handle->event->pending_wakeup = 1;
- irq_work_queue(&handle->event->pending);
+ irq_work_queue(&handle->event->pending_irq);
}
/*
@@ -172,8 +172,10 @@ __perf_output_begin(struct perf_output_handle *handle,
goto out;
if (unlikely(rb->paused)) {
- if (rb->nr_pages)
+ if (rb->nr_pages) {
local_inc(&rb->lost);
+ atomic64_inc(&event->lost_samples);
+ }
goto out;
}
@@ -254,6 +256,7 @@ __perf_output_begin(struct perf_output_handle *handle,
fail:
local_inc(&rb->lost);
+ atomic64_inc(&event->lost_samples);
perf_output_put_handle(handle);
out:
rcu_read_unlock();
@@ -811,7 +814,7 @@ struct perf_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
size = sizeof(struct perf_buffer);
size += nr_pages * sizeof(void *);
- if (order_base_2(size) >= PAGE_SHIFT+MAX_ORDER)
+ if (order_base_2(size) > PAGE_SHIFT+MAX_ORDER)
goto fail;
node = (cpu == -1) ? cpu : cpu_to_node(cpu);
@@ -859,11 +862,6 @@ void rb_free(struct perf_buffer *rb)
}
#else
-static int data_page_nr(struct perf_buffer *rb)
-{
- return rb->nr_pages << page_order(rb);
-}
-
static struct page *
__perf_mmap_to_page(struct perf_buffer *rb, unsigned long pgoff)
{
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index af24dc3febbe..59887c69d54c 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -19,10 +19,9 @@
#include <linux/export.h>
#include <linux/rmap.h> /* anon_vma_prepare */
#include <linux/mmu_notifier.h> /* set_pte_at_notify */
-#include <linux/swap.h> /* try_to_free_swap */
+#include <linux/swap.h> /* folio_free_swap */
#include <linux/ptrace.h> /* user_enable_single_step */
#include <linux/kdebug.h> /* notifier mechanism */
-#include "../../mm/internal.h" /* munlock_vma_page */
#include <linux/percpu-rwsem.h>
#include <linux/task_work.h>
#include <linux/shmem_fs.h>
@@ -154,26 +153,25 @@ static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr)
static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
struct page *old_page, struct page *new_page)
{
+ struct folio *old_folio = page_folio(old_page);
+ struct folio *new_folio;
struct mm_struct *mm = vma->vm_mm;
- struct page_vma_mapped_walk pvmw = {
- .page = compound_head(old_page),
- .vma = vma,
- .address = addr,
- };
+ DEFINE_FOLIO_VMA_WALK(pvmw, old_folio, vma, addr, 0);
int err;
struct mmu_notifier_range range;
- mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr,
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, addr,
addr + PAGE_SIZE);
if (new_page) {
- err = mem_cgroup_charge(new_page, vma->vm_mm, GFP_KERNEL);
+ new_folio = page_folio(new_page);
+ err = mem_cgroup_charge(new_folio, vma->vm_mm, GFP_KERNEL);
if (err)
return err;
}
- /* For try_to_free_swap() and munlock_vma_page() below */
- lock_page(old_page);
+ /* For folio_free_swap() below */
+ folio_lock(old_folio);
mmu_notifier_invalidate_range_start(&range);
err = -EAGAIN;
@@ -182,14 +180,14 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
VM_BUG_ON_PAGE(addr != pvmw.address, old_page);
if (new_page) {
- get_page(new_page);
- page_add_new_anon_rmap(new_page, vma, addr, false);
- lru_cache_add_inactive_or_unevictable(new_page, vma);
+ folio_get(new_folio);
+ page_add_new_anon_rmap(new_page, vma, addr);
+ folio_add_lru_vma(new_folio, vma);
} else
/* no new page, just dec_mm_counter for old_page */
dec_mm_counter(mm, MM_ANONPAGES);
- if (!PageAnon(old_page)) {
+ if (!folio_test_anon(old_folio)) {
dec_mm_counter(mm, mm_counter_file(old_page));
inc_mm_counter(mm, MM_ANONPAGES);
}
@@ -200,19 +198,16 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
set_pte_at_notify(mm, addr, pvmw.pte,
mk_pte(new_page, vma->vm_page_prot));
- page_remove_rmap(old_page, false);
- if (!page_mapped(old_page))
- try_to_free_swap(old_page);
+ page_remove_rmap(old_page, vma, false);
+ if (!folio_mapped(old_folio))
+ folio_free_swap(old_folio);
page_vma_mapped_walk_done(&pvmw);
-
- if ((vma->vm_flags & VM_LOCKED) && !PageCompound(old_page))
- munlock_vma_page(old_page);
- put_page(old_page);
+ folio_put(old_folio);
err = 0;
unlock:
mmu_notifier_invalidate_range_end(&range);
- unlock_page(old_page);
+ folio_unlock(old_folio);
return err;
}
@@ -355,9 +350,10 @@ static bool valid_ref_ctr_vma(struct uprobe *uprobe,
static struct vm_area_struct *
find_ref_ctr_vma(struct uprobe *uprobe, struct mm_struct *mm)
{
+ VMA_ITERATOR(vmi, mm, 0);
struct vm_area_struct *tmp;
- for (tmp = mm->mmap; tmp; tmp = tmp->vm_next)
+ for_each_vma(vmi, tmp)
if (valid_ref_ctr_vma(uprobe, tmp))
return tmp;
@@ -558,7 +554,7 @@ put_old:
/* try collapse pmd for compound page */
if (!ret && orig_page_huge)
- collapse_pte_mapped_thp(mm, vaddr);
+ collapse_pte_mapped_thp(mm, vaddr, false);
return ret;
}
@@ -793,10 +789,10 @@ static int __copy_insn(struct address_space *mapping, struct file *filp,
struct page *page;
/*
* Ensure that the page that has the original instruction is populated
- * and in page-cache. If ->readpage == NULL it must be shmem_mapping(),
+ * and in page-cache. If ->read_folio == NULL it must be shmem_mapping(),
* see uprobe_register().
*/
- if (mapping->a_ops->readpage)
+ if (mapping->a_ops->read_folio)
page = read_mapping_page(mapping, offset >> PAGE_SHIFT, filp);
else
page = shmem_read_mapping_page(mapping, offset >> PAGE_SHIFT);
@@ -1149,7 +1145,8 @@ static int __uprobe_register(struct inode *inode, loff_t offset,
return -EINVAL;
/* copy_insn() uses read_mapping_page() or shmem_read_mapping_page() */
- if (!inode->i_mapping->a_ops->readpage && !shmem_mapping(inode->i_mapping))
+ if (!inode->i_mapping->a_ops->read_folio &&
+ !shmem_mapping(inode->i_mapping))
return -EIO;
/* Racy, just to catch the obvious mistakes */
if (offset > i_size_read(inode))
@@ -1236,11 +1233,12 @@ int uprobe_apply(struct inode *inode, loff_t offset,
static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm)
{
+ VMA_ITERATOR(vmi, mm, 0);
struct vm_area_struct *vma;
int err = 0;
mmap_read_lock(mm);
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ for_each_vma(vmi, vma) {
unsigned long vaddr;
loff_t offset;
@@ -1353,7 +1351,7 @@ static int delayed_ref_ctr_inc(struct vm_area_struct *vma)
}
/*
- * Called from mmap_region/vma_adjust with mm->mmap_lock acquired.
+ * Called from mmap_region/vma_merge with mm->mmap_lock acquired.
*
* Currently we ignore all errors and always return 0, the callers
* can't handle the failure anyway.
@@ -1988,9 +1986,10 @@ bool uprobe_deny_signal(void)
static void mmf_recalc_uprobes(struct mm_struct *mm)
{
+ VMA_ITERATOR(vmi, mm, 0);
struct vm_area_struct *vma;
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ for_each_vma(vmi, vma) {
if (!valid_vma(vma, false))
continue;
/*
diff --git a/kernel/exit.c b/kernel/exit.c
index 9a89e7f36acb..34b90e2e7cf7 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -48,9 +48,9 @@
#include <linux/pipe_fs_i.h>
#include <linux/audit.h> /* for audit_free() */
#include <linux/resource.h>
-#include <linux/blkdev.h>
#include <linux/task_io_accounting_ops.h>
-#include <linux/tracehook.h>
+#include <linux/blkdev.h>
+#include <linux/task_work.h>
#include <linux/fs_struct.h>
#include <linux/init_task.h>
#include <linux/perf_event.h>
@@ -60,15 +60,66 @@
#include <linux/writeback.h>
#include <linux/shm.h>
#include <linux/kcov.h>
+#include <linux/kmsan.h>
#include <linux/random.h>
#include <linux/rcuwait.h>
#include <linux/compat.h>
#include <linux/io_uring.h>
+#include <linux/kprobes.h>
+#include <linux/rethook.h>
+#include <linux/sysfs.h>
+#include <linux/user_events.h>
#include <linux/uaccess.h>
#include <asm/unistd.h>
#include <asm/mmu_context.h>
+/*
+ * The default value should be high enough to not crash a system that randomly
+ * crashes its kernel from time to time, but low enough to at least not permit
+ * overflowing 32-bit refcounts or the ldsem writer count.
+ */
+static unsigned int oops_limit = 10000;
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table kern_exit_table[] = {
+ {
+ .procname = "oops_limit",
+ .data = &oops_limit,
+ .maxlen = sizeof(oops_limit),
+ .mode = 0644,
+ .proc_handler = proc_douintvec,
+ },
+ { }
+};
+
+static __init int kernel_exit_sysctls_init(void)
+{
+ register_sysctl_init("kernel", kern_exit_table);
+ return 0;
+}
+late_initcall(kernel_exit_sysctls_init);
+#endif
+
+static atomic_t oops_count = ATOMIC_INIT(0);
+
+#ifdef CONFIG_SYSFS
+static ssize_t oops_count_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *page)
+{
+ return sysfs_emit(page, "%d\n", atomic_read(&oops_count));
+}
+
+static struct kobj_attribute oops_count_attr = __ATTR_RO(oops_count);
+
+static __init int kernel_exit_sysfs_init(void)
+{
+ sysfs_add_file_to_group(kernel_kobj, &oops_count_attr.attr, NULL);
+ return 0;
+}
+late_initcall(kernel_exit_sysfs_init);
+#endif
+
static void __unhash_process(struct task_struct *p, bool group_dead)
{
nr_threads--;
@@ -116,7 +167,7 @@ static void __exit_signal(struct task_struct *tsk)
* then notify it:
*/
if (sig->notify_count > 0 && !--sig->notify_count)
- wake_up_process(sig->group_exit_task);
+ wake_up_process(sig->group_exec_task);
if (tsk == sig->curr_target)
sig->curr_target = next_thread(tsk);
@@ -168,6 +219,8 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
{
struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
+ kprobe_flush_task(tsk);
+ rethook_flush_task(tsk);
perf_event_delayed_put(tsk);
trace_sched_process_free(tsk);
put_task_struct(tsk);
@@ -179,6 +232,10 @@ void put_task_struct_rcu_user(struct task_struct *task)
call_rcu(&task->rcu, delayed_put_task_struct);
}
+void __weak release_thread(struct task_struct *dead_task)
+{
+}
+
void release_task(struct task_struct *p)
{
struct task_struct *leader;
@@ -339,6 +396,46 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)
}
}
+static void coredump_task_exit(struct task_struct *tsk)
+{
+ struct core_state *core_state;
+
+ /*
+ * Serialize with any possible pending coredump.
+ * We must hold siglock around checking core_state
+ * and setting PF_POSTCOREDUMP. The core-inducing thread
+ * will increment ->nr_threads for each thread in the
+ * group without PF_POSTCOREDUMP set.
+ */
+ spin_lock_irq(&tsk->sighand->siglock);
+ tsk->flags |= PF_POSTCOREDUMP;
+ core_state = tsk->signal->core_state;
+ spin_unlock_irq(&tsk->sighand->siglock);
+ if (core_state) {
+ struct core_thread self;
+
+ self.task = current;
+ if (self.task->flags & PF_SIGNALED)
+ self.next = xchg(&core_state->dumper.next, &self);
+ else
+ self.task = NULL;
+ /*
+ * Implies mb(), the result of xchg() must be visible
+ * to core_state->dumper.
+ */
+ if (atomic_dec_and_test(&core_state->nr_threads))
+ complete(&core_state->startup);
+
+ for (;;) {
+ set_current_state(TASK_UNINTERRUPTIBLE|TASK_FREEZABLE);
+ if (!self.task) /* see coredump_finish() */
+ break;
+ schedule();
+ }
+ __set_current_state(TASK_RUNNING);
+ }
+}
+
#ifdef CONFIG_MEMCG
/*
* A task is exiting. If it owned this mm, find a new owner for the mm.
@@ -422,6 +519,7 @@ assign_new_owner:
goto retry;
}
WRITE_ONCE(mm->owner, c);
+ lru_gen_migrate_mm(mm);
task_unlock(c);
put_task_struct(c);
}
@@ -434,48 +532,13 @@ assign_new_owner:
static void exit_mm(void)
{
struct mm_struct *mm = current->mm;
- struct core_state *core_state;
exit_mm_release(current, mm);
if (!mm)
return;
sync_mm_rss(mm);
- /*
- * Serialize with any possible pending coredump.
- * We must hold mmap_lock around checking core_state
- * and clearing tsk->mm. The core-inducing thread
- * will increment ->nr_threads for each thread in the
- * group with ->mm != NULL.
- */
mmap_read_lock(mm);
- core_state = mm->core_state;
- if (core_state) {
- struct core_thread self;
-
- mmap_read_unlock(mm);
-
- self.task = current;
- if (self.task->flags & PF_SIGNALED)
- self.next = xchg(&core_state->dumper.next, &self);
- else
- self.task = NULL;
- /*
- * Implies mb(), the result of xchg() must be visible
- * to core_state->dumper.
- */
- if (atomic_dec_and_test(&core_state->nr_threads))
- complete(&core_state->startup);
-
- for (;;) {
- set_current_state(TASK_UNINTERRUPTIBLE);
- if (!self.task) /* see coredump_finish() */
- break;
- freezable_schedule();
- }
- __set_current_state(TASK_RUNNING);
- mmap_read_lock(mm);
- }
- mmgrab(mm);
+ mmgrab_lazy_tlb(mm);
BUG_ON(mm != current->active_mm);
/* more a memory barrier than a real lock */
task_lock(current);
@@ -691,7 +754,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
/* mt-exec, de_thread() is waiting for group leader */
if (unlikely(tsk->signal->notify_count < 0))
- wake_up_process(tsk->signal->group_exit_task);
+ wake_up_process(tsk->signal->group_exec_task);
write_unlock_irq(&tasklist_lock);
list_for_each_entry_safe(p, n, &dead, ptrace_entry) {
@@ -724,60 +787,43 @@ static void check_stack_usage(void)
static inline void check_stack_usage(void) {}
#endif
+static void synchronize_group_exit(struct task_struct *tsk, long code)
+{
+ struct sighand_struct *sighand = tsk->sighand;
+ struct signal_struct *signal = tsk->signal;
+
+ spin_lock_irq(&sighand->siglock);
+ signal->quick_threads--;
+ if ((signal->quick_threads == 0) &&
+ !(signal->flags & SIGNAL_GROUP_EXIT)) {
+ signal->flags = SIGNAL_GROUP_EXIT;
+ signal->group_exit_code = code;
+ signal->group_stop_count = 0;
+ }
+ spin_unlock_irq(&sighand->siglock);
+}
+
void __noreturn do_exit(long code)
{
struct task_struct *tsk = current;
int group_dead;
- /*
- * We can get here from a kernel oops, sometimes with preemption off.
- * Start by checking for critical errors.
- * Then fix up important state like USER_DS and preemption.
- * Then do everything else.
- */
-
- WARN_ON(blk_needs_flush_plug(tsk));
-
- if (unlikely(in_interrupt()))
- panic("Aiee, killing interrupt handler!");
- if (unlikely(!tsk->pid))
- panic("Attempted to kill the idle task!");
+ WARN_ON(irqs_disabled());
- /*
- * If do_exit is called because this processes oopsed, it's possible
- * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before
- * continuing. Amongst other possible reasons, this is to prevent
- * mm_release()->clear_child_tid() from writing to a user-controlled
- * kernel address.
- */
- force_uaccess_begin();
+ synchronize_group_exit(tsk, code);
- if (unlikely(in_atomic())) {
- pr_info("note: %s[%d] exited with preempt_count %d\n",
- current->comm, task_pid_nr(current),
- preempt_count());
- preempt_count_set(PREEMPT_ENABLED);
- }
+ WARN_ON(tsk->plug);
- profile_task_exit(tsk);
kcov_task_exit(tsk);
+ kmsan_task_exit(tsk);
+ coredump_task_exit(tsk);
ptrace_event(PTRACE_EVENT_EXIT, code);
+ user_events_exit(tsk);
validate_creds_for_do_exit(tsk);
- /*
- * We're taking recursive faults here in do_exit. Safest is to just
- * leave this task alone and wait for reboot.
- */
- if (unlikely(tsk->flags & PF_EXITING)) {
- pr_alert("Fixing recursive fault but reboot is needed!\n");
- futex_exit_recursive(tsk);
- set_current_state(TASK_UNINTERRUPTIBLE);
- schedule();
- }
-
- io_uring_files_cancel(tsk->files);
+ io_uring_files_cancel();
exit_signals(tsk); /* sets PF_EXITING */
/* sync mm's RSS info before statistics gathering */
@@ -796,7 +842,7 @@ void __noreturn do_exit(long code)
#ifdef CONFIG_POSIX_TIMERS
hrtimer_cancel(&tsk->signal->real_timer);
- exit_itimers(tsk->signal);
+ exit_itimers(tsk);
#endif
if (tsk->mm)
setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm);
@@ -864,6 +910,7 @@ void __noreturn do_exit(long code)
put_page(tsk->task_frag.page);
validate_creds_for_do_exit(tsk);
+ exit_task_stack_account(tsk);
check_stack_usage();
preempt_disable();
@@ -875,16 +922,66 @@ void __noreturn do_exit(long code)
lockdep_free_task(tsk);
do_task_dead();
}
-EXPORT_SYMBOL_GPL(do_exit);
-void complete_and_exit(struct completion *comp, long code)
+void __noreturn make_task_dead(int signr)
{
- if (comp)
- complete(comp);
+ /*
+ * Take the task off the cpu after something catastrophic has
+ * happened.
+ *
+ * We can get here from a kernel oops, sometimes with preemption off.
+ * Start by checking for critical errors.
+ * Then fix up important state like USER_DS and preemption.
+ * Then do everything else.
+ */
+ struct task_struct *tsk = current;
+ unsigned int limit;
- do_exit(code);
+ if (unlikely(in_interrupt()))
+ panic("Aiee, killing interrupt handler!");
+ if (unlikely(!tsk->pid))
+ panic("Attempted to kill the idle task!");
+
+ if (unlikely(irqs_disabled())) {
+ pr_info("note: %s[%d] exited with irqs disabled\n",
+ current->comm, task_pid_nr(current));
+ local_irq_enable();
+ }
+ if (unlikely(in_atomic())) {
+ pr_info("note: %s[%d] exited with preempt_count %d\n",
+ current->comm, task_pid_nr(current),
+ preempt_count());
+ preempt_count_set(PREEMPT_ENABLED);
+ }
+
+ /*
+ * Every time the system oopses, if the oops happens while a reference
+ * to an object was held, the reference leaks.
+ * If the oops doesn't also leak memory, repeated oopsing can cause
+ * reference counters to wrap around (if they're not using refcount_t).
+ * This means that repeated oopsing can make unexploitable-looking bugs
+ * exploitable through repeated oopsing.
+ * To make sure this can't happen, place an upper bound on how often the
+ * kernel may oops without panic().
+ */
+ limit = READ_ONCE(oops_limit);
+ if (atomic_inc_return(&oops_count) >= limit && limit)
+ panic("Oopsed too often (kernel.oops_limit is %d)", limit);
+
+ /*
+ * We're taking recursive faults here in make_task_dead. Safest is to just
+ * leave this task alone and wait for reboot.
+ */
+ if (unlikely(tsk->flags & PF_EXITING)) {
+ pr_alert("Fixing recursive fault but reboot is needed!\n");
+ futex_exit_recursive(tsk);
+ tsk->exit_state = EXIT_DEAD;
+ refcount_inc(&tsk->rcu_users);
+ do_task_dead();
+ }
+
+ do_exit(signr);
}
-EXPORT_SYMBOL(complete_and_exit);
SYSCALL_DEFINE1(exit, int, error_code)
{
@@ -895,22 +992,24 @@ SYSCALL_DEFINE1(exit, int, error_code)
* Take down every thread in the group. This is called by fatal signals
* as well as by sys_exit_group (below).
*/
-void
+void __noreturn
do_group_exit(int exit_code)
{
struct signal_struct *sig = current->signal;
- BUG_ON(exit_code & 0x80); /* core dumps don't get here */
-
- if (signal_group_exit(sig))
+ if (sig->flags & SIGNAL_GROUP_EXIT)
exit_code = sig->group_exit_code;
- else if (!thread_group_empty(current)) {
+ else if (sig->group_exec_task)
+ exit_code = 0;
+ else {
struct sighand_struct *const sighand = current->sighand;
spin_lock_irq(&sighand->siglock);
- if (signal_group_exit(sig))
+ if (sig->flags & SIGNAL_GROUP_EXIT)
/* Another thread got here before we took the lock. */
exit_code = sig->group_exit_code;
+ else if (sig->group_exec_task)
+ exit_code = 0;
else {
sig->group_exit_code = exit_code;
sig->flags = SIGNAL_GROUP_EXIT;
@@ -1005,7 +1104,8 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
return 0;
if (unlikely(wo->wo_flags & WNOWAIT)) {
- status = p->exit_code;
+ status = (p->signal->flags & SIGNAL_GROUP_EXIT)
+ ? p->signal->group_exit_code : p->exit_code;
get_task_struct(p);
read_unlock(&tasklist_lock);
sched_annotate_sleep();
@@ -1047,7 +1147,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
* p->signal fields because the whole thread group is dead
* and nobody can change them.
*
- * psig->stats_lock also protects us from our sub-theads
+ * psig->stats_lock also protects us from our sub-threads
* which can reap other children at the same time. Until
* we change k_getrusage()-like users to rely on this lock
* we have to take ->siglock as well.
@@ -1807,7 +1907,14 @@ bool thread_group_exited(struct pid *pid)
}
EXPORT_SYMBOL(thread_group_exited);
-__weak void abort(void)
+/*
+ * This needs to be __function_aligned as GCC implicitly makes any
+ * implementation of abort() cold and drops alignment specified by
+ * -falign-functions=N.
+ *
+ * See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88345#c11
+ */
+__weak __function_aligned void abort(void)
{
BUG();
diff --git a/kernel/extable.c b/kernel/extable.c
index b0ea5eb0c3b4..71f482581cab 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -3,6 +3,7 @@
Copyright (C) 2001 Rusty Russell, 2002 Rusty Russell IBM.
*/
+#include <linux/elf.h>
#include <linux/ftrace.h>
#include <linux/memory.h>
#include <linux/extable.h>
@@ -62,40 +63,13 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr)
return e;
}
-int init_kernel_text(unsigned long addr)
-{
- if (addr >= (unsigned long)_sinittext &&
- addr < (unsigned long)_einittext)
- return 1;
- return 0;
-}
-
int notrace core_kernel_text(unsigned long addr)
{
- if (addr >= (unsigned long)_stext &&
- addr < (unsigned long)_etext)
+ if (is_kernel_text(addr))
return 1;
- if (system_state < SYSTEM_RUNNING &&
- init_kernel_text(addr))
- return 1;
- return 0;
-}
-
-/**
- * core_kernel_data - tell if addr points to kernel data
- * @addr: address to test
- *
- * Returns true if @addr passed in is from the core kernel data
- * section.
- *
- * Note: On some archs it may return true for core RODATA, and false
- * for others. But will always be true for core RW data.
- */
-int core_kernel_data(unsigned long addr)
-{
- if (addr >= (unsigned long)_sdata &&
- addr < (unsigned long)_edata)
+ if (system_state < SYSTEM_FREEING_INITMEM &&
+ is_kernel_inittext(addr))
return 1;
return 0;
}
@@ -112,7 +86,7 @@ int __kernel_text_address(unsigned long addr)
* Since we are after the module-symbols check, there's
* no danger of address overlap:
*/
- if (init_kernel_text(addr))
+ if (is_kernel_inittext(addr))
return 1;
return 0;
}
@@ -140,7 +114,7 @@ int kernel_text_address(unsigned long addr)
/* Treat this like an NMI as it can happen anywhere */
if (no_rcu)
- rcu_nmi_enter();
+ ct_nmi_enter();
if (is_module_text_address(addr))
goto out;
@@ -153,18 +127,39 @@ int kernel_text_address(unsigned long addr)
ret = 0;
out:
if (no_rcu)
- rcu_nmi_exit();
+ ct_nmi_exit();
return ret;
}
/*
- * On some architectures (PPC64, IA64) function pointers
+ * On some architectures (PPC64, IA64, PARISC) function pointers
* are actually only tokens to some data that then holds the
* real function address. As a result, to find if a function
* pointer is part of the kernel text, we need to do some
* special dereferencing first.
*/
+#ifdef CONFIG_HAVE_FUNCTION_DESCRIPTORS
+void *dereference_function_descriptor(void *ptr)
+{
+ func_desc_t *desc = ptr;
+ void *p;
+
+ if (!get_kernel_nofault(p, (void *)&desc->addr))
+ ptr = p;
+ return ptr;
+}
+EXPORT_SYMBOL_GPL(dereference_function_descriptor);
+
+void *dereference_kernel_function_descriptor(void *ptr)
+{
+ if (ptr < (void *)__start_opd || ptr >= (void *)__end_opd)
+ return ptr;
+
+ return dereference_function_descriptor(ptr);
+}
+#endif
+
int func_ptr_is_kernel_text(void *ptr)
{
unsigned long addr;
diff --git a/kernel/fail_function.c b/kernel/fail_function.c
index 60dc825ecc2b..d971a0189319 100644
--- a/kernel/fail_function.c
+++ b/kernel/fail_function.c
@@ -163,10 +163,7 @@ static void fei_debugfs_add_attr(struct fei_attr *attr)
static void fei_debugfs_remove_attr(struct fei_attr *attr)
{
- struct dentry *dir;
-
- dir = debugfs_lookup(attr->kp.symbol_name, fei_debugfs_dir);
- debugfs_remove_recursive(dir);
+ debugfs_lookup_and_remove(attr->kp.symbol_name, fei_debugfs_dir);
}
static int fei_kprobe_handler(struct kprobe *kp, struct pt_regs *regs)
@@ -247,15 +244,11 @@ static ssize_t fei_write(struct file *file, const char __user *buffer,
/* cut off if it is too long */
if (count > KSYM_NAME_LEN)
count = KSYM_NAME_LEN;
- buf = kmalloc(count + 1, GFP_KERNEL);
- if (!buf)
- return -ENOMEM;
- if (copy_from_user(buf, buffer, count)) {
- ret = -EFAULT;
- goto out_free;
- }
- buf[count] = '\0';
+ buf = memdup_user_nul(buffer, count);
+ if (IS_ERR(buf))
+ return PTR_ERR(buf);
+
sym = strstrip(buf);
mutex_lock(&fei_lock);
@@ -298,17 +291,15 @@ static ssize_t fei_write(struct file *file, const char __user *buffer,
}
ret = register_kprobe(&attr->kp);
- if (!ret)
- fei_debugfs_add_attr(attr);
- if (ret < 0)
- fei_attr_remove(attr);
- else {
- list_add_tail(&attr->list, &fei_attr_list);
- ret = count;
+ if (ret) {
+ fei_attr_free(attr);
+ goto out;
}
+ fei_debugfs_add_attr(attr);
+ list_add_tail(&attr->list, &fei_attr_list);
+ ret = count;
out:
mutex_unlock(&fei_lock);
-out_free:
kfree(buf);
return ret;
}
diff --git a/kernel/fork.c b/kernel/fork.c
index bc94b2cc5995..ed4e01daccaa 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -37,12 +37,13 @@
#include <linux/fdtable.h>
#include <linux/iocontext.h>
#include <linux/key.h>
+#include <linux/kmsan.h>
#include <linux/binfmts.h>
#include <linux/mman.h>
#include <linux/mmu_notifier.h>
#include <linux/fs.h>
#include <linux/mm.h>
-#include <linux/vmacache.h>
+#include <linux/mm_inline.h>
#include <linux/nsproxy.h>
#include <linux/capability.h>
#include <linux/cpu.h>
@@ -74,9 +75,7 @@
#include <linux/freezer.h>
#include <linux/delayacct.h>
#include <linux/taskstats_kern.h>
-#include <linux/random.h>
#include <linux/tty.h>
-#include <linux/blkdev.h>
#include <linux/fs_struct.h>
#include <linux/magic.h>
#include <linux/perf_event.h>
@@ -97,6 +96,9 @@
#include <linux/scs.h>
#include <linux/io_uring.h>
#include <linux/bpf.h>
+#include <linux/stackprotector.h>
+#include <linux/user_events.h>
+#include <linux/iommu.h>
#include <asm/pgalloc.h>
#include <linux/uaccess.h>
@@ -185,7 +187,7 @@ static inline void free_task_struct(struct task_struct *tsk)
*/
# if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)
-#ifdef CONFIG_VMAP_STACK
+# ifdef CONFIG_VMAP_STACK
/*
* vmalloc() is a bit slow, and calling vfree() enough times will force a TLB
* flush. Try to minimize the number of calls by caching stacks.
@@ -193,6 +195,41 @@ static inline void free_task_struct(struct task_struct *tsk)
#define NR_CACHED_STACKS 2
static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]);
+struct vm_stack {
+ struct rcu_head rcu;
+ struct vm_struct *stack_vm_area;
+};
+
+static bool try_release_thread_stack_to_cache(struct vm_struct *vm)
+{
+ unsigned int i;
+
+ for (i = 0; i < NR_CACHED_STACKS; i++) {
+ if (this_cpu_cmpxchg(cached_stacks[i], NULL, vm) != NULL)
+ continue;
+ return true;
+ }
+ return false;
+}
+
+static void thread_stack_free_rcu(struct rcu_head *rh)
+{
+ struct vm_stack *vm_stack = container_of(rh, struct vm_stack, rcu);
+
+ if (try_release_thread_stack_to_cache(vm_stack->stack_vm_area))
+ return;
+
+ vfree(vm_stack);
+}
+
+static void thread_stack_delayed_free(struct task_struct *tsk)
+{
+ struct vm_stack *vm_stack = tsk->stack;
+
+ vm_stack->stack_vm_area = tsk->stack_vm_area;
+ call_rcu(&vm_stack->rcu, thread_stack_free_rcu);
+}
+
static int free_vm_stack_cache(unsigned int cpu)
{
struct vm_struct **cached_vm_stacks = per_cpu_ptr(cached_stacks, cpu);
@@ -210,11 +247,35 @@ static int free_vm_stack_cache(unsigned int cpu)
return 0;
}
-#endif
-static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
+static int memcg_charge_kernel_stack(struct vm_struct *vm)
{
-#ifdef CONFIG_VMAP_STACK
+ int i;
+ int ret;
+
+ BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0);
+ BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);
+
+ for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
+ ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL, 0);
+ if (ret)
+ goto err;
+ }
+ return 0;
+err:
+ /*
+ * If memcg_kmem_charge_page() fails, page's memory cgroup pointer is
+ * NULL, and memcg_kmem_uncharge_page() in free_thread_stack() will
+ * ignore this page.
+ */
+ for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
+ memcg_kmem_uncharge_page(vm->pages[i], 0);
+ return ret;
+}
+
+static int alloc_thread_stack_node(struct task_struct *tsk, int node)
+{
+ struct vm_struct *vm;
void *stack;
int i;
@@ -226,15 +287,22 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
if (!s)
continue;
- /* Mark stack accessible for KASAN. */
+ /* Reset stack metadata. */
kasan_unpoison_range(s->addr, THREAD_SIZE);
+ stack = kasan_reset_tag(s->addr);
+
/* Clear stale pointers from reused stack. */
- memset(s->addr, 0, THREAD_SIZE);
+ memset(stack, 0, THREAD_SIZE);
+
+ if (memcg_charge_kernel_stack(s)) {
+ vfree(s->addr);
+ return -ENOMEM;
+ }
tsk->stack_vm_area = s;
- tsk->stack = s->addr;
- return s->addr;
+ tsk->stack = stack;
+ return 0;
}
/*
@@ -247,71 +315,96 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
THREADINFO_GFP & ~__GFP_ACCOUNT,
PAGE_KERNEL,
0, node, __builtin_return_address(0));
+ if (!stack)
+ return -ENOMEM;
+ vm = find_vm_area(stack);
+ if (memcg_charge_kernel_stack(vm)) {
+ vfree(stack);
+ return -ENOMEM;
+ }
/*
* We can't call find_vm_area() in interrupt context, and
* free_thread_stack() can be called in interrupt context,
* so cache the vm_struct.
*/
- if (stack) {
- tsk->stack_vm_area = find_vm_area(stack);
- tsk->stack = stack;
- }
- return stack;
-#else
+ tsk->stack_vm_area = vm;
+ stack = kasan_reset_tag(stack);
+ tsk->stack = stack;
+ return 0;
+}
+
+static void free_thread_stack(struct task_struct *tsk)
+{
+ if (!try_release_thread_stack_to_cache(tsk->stack_vm_area))
+ thread_stack_delayed_free(tsk);
+
+ tsk->stack = NULL;
+ tsk->stack_vm_area = NULL;
+}
+
+# else /* !CONFIG_VMAP_STACK */
+
+static void thread_stack_free_rcu(struct rcu_head *rh)
+{
+ __free_pages(virt_to_page(rh), THREAD_SIZE_ORDER);
+}
+
+static void thread_stack_delayed_free(struct task_struct *tsk)
+{
+ struct rcu_head *rh = tsk->stack;
+
+ call_rcu(rh, thread_stack_free_rcu);
+}
+
+static int alloc_thread_stack_node(struct task_struct *tsk, int node)
+{
struct page *page = alloc_pages_node(node, THREADINFO_GFP,
THREAD_SIZE_ORDER);
if (likely(page)) {
tsk->stack = kasan_reset_tag(page_address(page));
- return tsk->stack;
+ return 0;
}
- return NULL;
-#endif
+ return -ENOMEM;
}
-static inline void free_thread_stack(struct task_struct *tsk)
+static void free_thread_stack(struct task_struct *tsk)
{
-#ifdef CONFIG_VMAP_STACK
- struct vm_struct *vm = task_stack_vm_area(tsk);
-
- if (vm) {
- int i;
+ thread_stack_delayed_free(tsk);
+ tsk->stack = NULL;
+}
- for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
- memcg_kmem_uncharge_page(vm->pages[i], 0);
+# endif /* CONFIG_VMAP_STACK */
+# else /* !(THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)) */
- for (i = 0; i < NR_CACHED_STACKS; i++) {
- if (this_cpu_cmpxchg(cached_stacks[i],
- NULL, tsk->stack_vm_area) != NULL)
- continue;
+static struct kmem_cache *thread_stack_cache;
- return;
- }
+static void thread_stack_free_rcu(struct rcu_head *rh)
+{
+ kmem_cache_free(thread_stack_cache, rh);
+}
- vfree_atomic(tsk->stack);
- return;
- }
-#endif
+static void thread_stack_delayed_free(struct task_struct *tsk)
+{
+ struct rcu_head *rh = tsk->stack;
- __free_pages(virt_to_page(tsk->stack), THREAD_SIZE_ORDER);
+ call_rcu(rh, thread_stack_free_rcu);
}
-# else
-static struct kmem_cache *thread_stack_cache;
-static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
- int node)
+static int alloc_thread_stack_node(struct task_struct *tsk, int node)
{
unsigned long *stack;
stack = kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node);
stack = kasan_reset_tag(stack);
tsk->stack = stack;
- return stack;
+ return stack ? 0 : -ENOMEM;
}
static void free_thread_stack(struct task_struct *tsk)
{
- kmem_cache_free(thread_stack_cache, tsk->stack);
+ thread_stack_delayed_free(tsk);
+ tsk->stack = NULL;
}
void thread_stack_cache_init(void)
@@ -321,8 +414,26 @@ void thread_stack_cache_init(void)
THREAD_SIZE, NULL);
BUG_ON(thread_stack_cache == NULL);
}
-# endif
-#endif
+
+# endif /* THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK) */
+#else /* CONFIG_ARCH_THREAD_STACK_ALLOCATOR */
+
+static int alloc_thread_stack_node(struct task_struct *tsk, int node)
+{
+ unsigned long *stack;
+
+ stack = arch_alloc_thread_stack_node(tsk, node);
+ tsk->stack = stack;
+ return stack ? 0 : -ENOMEM;
+}
+
+static void free_thread_stack(struct task_struct *tsk)
+{
+ arch_free_thread_stack(tsk);
+ tsk->stack = NULL;
+}
+
+#endif /* !CONFIG_ARCH_THREAD_STACK_ALLOCATOR */
/* SLAB cache for signal_struct structures (tsk->signal) */
static struct kmem_cache *signal_cachep;
@@ -342,13 +453,49 @@ static struct kmem_cache *vm_area_cachep;
/* SLAB cache for mm_struct structures (tsk->mm) */
static struct kmem_cache *mm_cachep;
+#ifdef CONFIG_PER_VMA_LOCK
+
+/* SLAB cache for vm_area_struct.lock */
+static struct kmem_cache *vma_lock_cachep;
+
+static bool vma_lock_alloc(struct vm_area_struct *vma)
+{
+ vma->vm_lock = kmem_cache_alloc(vma_lock_cachep, GFP_KERNEL);
+ if (!vma->vm_lock)
+ return false;
+
+ init_rwsem(&vma->vm_lock->lock);
+ vma->vm_lock_seq = -1;
+
+ return true;
+}
+
+static inline void vma_lock_free(struct vm_area_struct *vma)
+{
+ kmem_cache_free(vma_lock_cachep, vma->vm_lock);
+}
+
+#else /* CONFIG_PER_VMA_LOCK */
+
+static inline bool vma_lock_alloc(struct vm_area_struct *vma) { return true; }
+static inline void vma_lock_free(struct vm_area_struct *vma) {}
+
+#endif /* CONFIG_PER_VMA_LOCK */
+
struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
{
struct vm_area_struct *vma;
vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
- if (vma)
- vma_init(vma, mm);
+ if (!vma)
+ return NULL;
+
+ vma_init(vma, mm);
+ if (!vma_lock_alloc(vma)) {
+ kmem_cache_free(vm_area_cachep, vma);
+ return NULL;
+ }
+
return vma;
}
@@ -356,71 +503,86 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
{
struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
- if (new) {
- ASSERT_EXCLUSIVE_WRITER(orig->vm_flags);
- ASSERT_EXCLUSIVE_WRITER(orig->vm_file);
- /*
- * orig->shared.rb may be modified concurrently, but the clone
- * will be reinitialized.
- */
- *new = data_race(*orig);
- INIT_LIST_HEAD(&new->anon_vma_chain);
- new->vm_next = new->vm_prev = NULL;
+ if (!new)
+ return NULL;
+
+ ASSERT_EXCLUSIVE_WRITER(orig->vm_flags);
+ ASSERT_EXCLUSIVE_WRITER(orig->vm_file);
+ /*
+ * orig->shared.rb may be modified concurrently, but the clone
+ * will be reinitialized.
+ */
+ data_race(memcpy(new, orig, sizeof(*new)));
+ if (!vma_lock_alloc(new)) {
+ kmem_cache_free(vm_area_cachep, new);
+ return NULL;
}
+ INIT_LIST_HEAD(&new->anon_vma_chain);
+ vma_numab_state_init(new);
+ dup_anon_vma_name(orig, new);
+
return new;
}
-void vm_area_free(struct vm_area_struct *vma)
+void __vm_area_free(struct vm_area_struct *vma)
{
+ vma_numab_state_free(vma);
+ free_anon_vma_name(vma);
+ vma_lock_free(vma);
kmem_cache_free(vm_area_cachep, vma);
}
-static void account_kernel_stack(struct task_struct *tsk, int account)
+#ifdef CONFIG_PER_VMA_LOCK
+static void vm_area_free_rcu_cb(struct rcu_head *head)
{
- void *stack = task_stack_page(tsk);
- struct vm_struct *vm = task_stack_vm_area(tsk);
+ struct vm_area_struct *vma = container_of(head, struct vm_area_struct,
+ vm_rcu);
- if (vm) {
+ /* The vma should not be locked while being destroyed. */
+ VM_BUG_ON_VMA(rwsem_is_locked(&vma->vm_lock->lock), vma);
+ __vm_area_free(vma);
+}
+#endif
+
+void vm_area_free(struct vm_area_struct *vma)
+{
+#ifdef CONFIG_PER_VMA_LOCK
+ call_rcu(&vma->vm_rcu, vm_area_free_rcu_cb);
+#else
+ __vm_area_free(vma);
+#endif
+}
+
+static void account_kernel_stack(struct task_struct *tsk, int account)
+{
+ if (IS_ENABLED(CONFIG_VMAP_STACK)) {
+ struct vm_struct *vm = task_stack_vm_area(tsk);
int i;
for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
mod_lruvec_page_state(vm->pages[i], NR_KERNEL_STACK_KB,
account * (PAGE_SIZE / 1024));
} else {
+ void *stack = task_stack_page(tsk);
+
/* All stack pages are in the same node. */
mod_lruvec_kmem_state(stack, NR_KERNEL_STACK_KB,
account * (THREAD_SIZE / 1024));
}
}
-static int memcg_charge_kernel_stack(struct task_struct *tsk)
+void exit_task_stack_account(struct task_struct *tsk)
{
-#ifdef CONFIG_VMAP_STACK
- struct vm_struct *vm = task_stack_vm_area(tsk);
- int ret;
-
- BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0);
+ account_kernel_stack(tsk, -1);
- if (vm) {
+ if (IS_ENABLED(CONFIG_VMAP_STACK)) {
+ struct vm_struct *vm;
int i;
- BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);
-
- for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
- /*
- * If memcg_kmem_charge_page() fails, page's
- * memory cgroup pointer is NULL, and
- * memcg_kmem_uncharge_page() in free_thread_stack()
- * will ignore this page.
- */
- ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL,
- 0);
- if (ret)
- return ret;
- }
+ vm = task_stack_vm_area(tsk);
+ for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
+ memcg_kmem_uncharge_page(vm->pages[i], 0);
}
-#endif
- return 0;
}
static void release_task_stack(struct task_struct *tsk)
@@ -428,12 +590,7 @@ static void release_task_stack(struct task_struct *tsk)
if (WARN_ON(READ_ONCE(tsk->__state) != TASK_DEAD))
return; /* Better to leak the stack than to free prematurely */
- account_kernel_stack(tsk, -1);
free_thread_stack(tsk);
- tsk->stack = NULL;
-#ifdef CONFIG_VMAP_STACK
- tsk->stack_vm_area = NULL;
-#endif
}
#ifdef CONFIG_THREAD_INFO_IN_TASK
@@ -446,6 +603,10 @@ void put_task_stack(struct task_struct *tsk)
void free_task(struct task_struct *tsk)
{
+#ifdef CONFIG_SECCOMP
+ WARN_ON_ONCE(tsk->seccomp.filter);
+#endif
+ release_user_cpus_ptr(tsk);
scs_release(tsk);
#ifndef CONFIG_THREAD_INFO_IN_TASK
@@ -470,15 +631,30 @@ void free_task(struct task_struct *tsk)
}
EXPORT_SYMBOL(free_task);
+static void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm)
+{
+ struct file *exe_file;
+
+ exe_file = get_mm_exe_file(oldmm);
+ RCU_INIT_POINTER(mm->exe_file, exe_file);
+ /*
+ * We depend on the oldmm having properly denied write access to the
+ * exe_file already.
+ */
+ if (exe_file && deny_write_access(exe_file))
+ pr_warn_once("deny_write_access() failed in %s\n", __func__);
+}
+
#ifdef CONFIG_MMU
static __latent_entropy int dup_mmap(struct mm_struct *mm,
struct mm_struct *oldmm)
{
- struct vm_area_struct *mpnt, *tmp, *prev, **pprev;
- struct rb_node **rb_link, *rb_parent;
+ struct vm_area_struct *mpnt, *tmp;
int retval;
- unsigned long charge;
+ unsigned long charge = 0;
LIST_HEAD(uf);
+ VMA_ITERATOR(old_vmi, oldmm, 0);
+ VMA_ITERATOR(vmi, mm, 0);
uprobe_start_dup_mmap();
if (mmap_write_lock_killable(oldmm)) {
@@ -493,25 +669,24 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
mmap_write_lock_nested(mm, SINGLE_DEPTH_NESTING);
/* No ordering required: file already has been exposed. */
- RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
+ dup_mm_exe_file(mm, oldmm);
mm->total_vm = oldmm->total_vm;
mm->data_vm = oldmm->data_vm;
mm->exec_vm = oldmm->exec_vm;
mm->stack_vm = oldmm->stack_vm;
- rb_link = &mm->mm_rb.rb_node;
- rb_parent = NULL;
- pprev = &mm->mmap;
retval = ksm_fork(mm, oldmm);
if (retval)
goto out;
- retval = khugepaged_fork(mm, oldmm);
+ khugepaged_fork(mm, oldmm);
+
+ retval = vma_iter_bulk_alloc(&vmi, oldmm->map_count);
if (retval)
goto out;
- prev = NULL;
- for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
+ mt_clear_in_rcu(vmi.mas.tree);
+ for_each_vma(old_vmi, mpnt) {
struct file *file;
if (mpnt->vm_flags & VM_DONTCOPY) {
@@ -525,7 +700,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
*/
if (fatal_signal_pending(current)) {
retval = -EINTR;
- goto out;
+ goto loop_out;
}
if (mpnt->vm_flags & VM_ACCOUNT) {
unsigned long len = vma_pages(mpnt);
@@ -553,15 +728,12 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
tmp->anon_vma = NULL;
} else if (anon_vma_fork(tmp, mpnt))
goto fail_nomem_anon_vma_fork;
- tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT);
+ vm_flags_clear(tmp, VM_LOCKED_MASK);
file = tmp->vm_file;
if (file) {
- struct inode *inode = file_inode(file);
struct address_space *mapping = file->f_mapping;
get_file(file);
- if (tmp->vm_flags & VM_DENYWRITE)
- put_write_access(inode);
i_mmap_lock_write(mapping);
if (tmp->vm_flags & VM_SHARED)
mapping_allow_writable(mapping);
@@ -574,24 +746,14 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
}
/*
- * Clear hugetlb-related page reserves for children. This only
- * affects MAP_PRIVATE mappings. Faults generated by the child
- * are not guaranteed to succeed, even if read-only
+ * Copy/update hugetlb private vma information.
*/
if (is_vm_hugetlb_page(tmp))
- reset_vma_resv_huge_pages(tmp);
+ hugetlb_dup_vma_private(tmp);
- /*
- * Link in the new vma and copy the page table entries.
- */
- *pprev = tmp;
- pprev = &tmp->vm_next;
- tmp->vm_prev = prev;
- prev = tmp;
-
- __vma_link_rb(mm, tmp, rb_link, rb_parent);
- rb_link = &tmp->vm_rb.rb_right;
- rb_parent = &tmp->vm_rb;
+ /* Link the vma into the MT */
+ if (vma_iter_bulk_store(&vmi, tmp))
+ goto fail_nomem_vmi_store;
mm->map_count++;
if (!(tmp->vm_flags & VM_WIPEONFORK))
@@ -601,10 +763,14 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
tmp->vm_ops->open(tmp);
if (retval)
- goto out;
+ goto loop_out;
}
/* a new mm has just been created */
retval = arch_dup_mmap(oldmm, mm);
+loop_out:
+ vma_iter_free(&vmi);
+ if (!retval)
+ mt_set_in_rcu(vmi.mas.tree);
out:
mmap_write_unlock(mm);
flush_tlb_mm(oldmm);
@@ -613,6 +779,9 @@ out:
fail_uprobe_end:
uprobe_end_dup_mmap();
return retval;
+
+fail_nomem_vmi_store:
+ unlink_anon_vmas(tmp);
fail_nomem_anon_vma_fork:
mpol_put(vma_policy(tmp));
fail_nomem_policy:
@@ -620,7 +789,7 @@ fail_nomem_policy:
fail_nomem:
retval = -ENOMEM;
vm_unacct_memory(charge);
- goto out;
+ goto loop_out;
}
static inline int mm_alloc_pgd(struct mm_struct *mm)
@@ -639,7 +808,7 @@ static inline void mm_free_pgd(struct mm_struct *mm)
static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
{
mmap_write_lock(oldmm);
- RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
+ dup_mm_exe_file(mm, oldmm);
mmap_write_unlock(oldmm);
return 0;
}
@@ -655,7 +824,7 @@ static void check_mm(struct mm_struct *mm)
"Please make sure 'struct resident_page_types[]' is updated as well");
for (i = 0; i < NR_MM_COUNTERS; i++) {
- long x = atomic_long_read(&mm->rss_stat.count[i]);
+ long x = percpu_counter_sum(&mm->rss_stat[i]);
if (unlikely(x))
pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld\n",
@@ -674,6 +843,67 @@ static void check_mm(struct mm_struct *mm)
#define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL))
#define free_mm(mm) (kmem_cache_free(mm_cachep, (mm)))
+static void do_check_lazy_tlb(void *arg)
+{
+ struct mm_struct *mm = arg;
+
+ WARN_ON_ONCE(current->active_mm == mm);
+}
+
+static void do_shoot_lazy_tlb(void *arg)
+{
+ struct mm_struct *mm = arg;
+
+ if (current->active_mm == mm) {
+ WARN_ON_ONCE(current->mm);
+ current->active_mm = &init_mm;
+ switch_mm(mm, &init_mm, current);
+ }
+}
+
+static void cleanup_lazy_tlbs(struct mm_struct *mm)
+{
+ if (!IS_ENABLED(CONFIG_MMU_LAZY_TLB_SHOOTDOWN)) {
+ /*
+ * In this case, lazy tlb mms are refounted and would not reach
+ * __mmdrop until all CPUs have switched away and mmdrop()ed.
+ */
+ return;
+ }
+
+ /*
+ * Lazy mm shootdown does not refcount "lazy tlb mm" usage, rather it
+ * requires lazy mm users to switch to another mm when the refcount
+ * drops to zero, before the mm is freed. This requires IPIs here to
+ * switch kernel threads to init_mm.
+ *
+ * archs that use IPIs to flush TLBs can piggy-back that lazy tlb mm
+ * switch with the final userspace teardown TLB flush which leaves the
+ * mm lazy on this CPU but no others, reducing the need for additional
+ * IPIs here. There are cases where a final IPI is still required here,
+ * such as the final mmdrop being performed on a different CPU than the
+ * one exiting, or kernel threads using the mm when userspace exits.
+ *
+ * IPI overheads have not found to be expensive, but they could be
+ * reduced in a number of possible ways, for example (roughly
+ * increasing order of complexity):
+ * - The last lazy reference created by exit_mm() could instead switch
+ * to init_mm, however it's probable this will run on the same CPU
+ * immediately afterwards, so this may not reduce IPIs much.
+ * - A batch of mms requiring IPIs could be gathered and freed at once.
+ * - CPUs store active_mm where it can be remotely checked without a
+ * lock, to filter out false-positives in the cpumask.
+ * - After mm_users or mm_count reaches zero, switching away from the
+ * mm could clear mm_cpumask to reduce some IPIs, perhaps together
+ * with some batching or delaying of the final IPIs.
+ * - A delayed freeing and RCU-like quiescing sequence based on mm
+ * switching to avoid IPIs completely.
+ */
+ on_each_cpu_mask(mm_cpumask(mm), do_shoot_lazy_tlb, (void *)mm, 1);
+ if (IS_ENABLED(CONFIG_DEBUG_VM_SHOOT_LAZIES))
+ on_each_cpu(do_check_lazy_tlb, (void *)mm, 1);
+}
+
/*
* Called when the last reference to the mm
* is dropped: either by a lazy thread or by
@@ -681,14 +911,25 @@ static void check_mm(struct mm_struct *mm)
*/
void __mmdrop(struct mm_struct *mm)
{
+ int i;
+
BUG_ON(mm == &init_mm);
WARN_ON_ONCE(mm == current->mm);
+
+ /* Ensure no CPUs are using this as their lazy tlb mm */
+ cleanup_lazy_tlbs(mm);
+
WARN_ON_ONCE(mm == current->active_mm);
mm_free_pgd(mm);
destroy_context(mm);
mmu_notifier_subscriptions_destroy(mm);
check_mm(mm);
put_user_ns(mm->user_ns);
+ mm_pasid_drop(mm);
+ mm_destroy_cid(mm);
+
+ for (i = 0; i < NR_MM_COUNTERS; i++)
+ percpu_counter_destroy(&mm->rss_stat[i]);
free_mm(mm);
}
EXPORT_SYMBOL_GPL(__mmdrop);
@@ -743,9 +984,7 @@ void __put_task_struct(struct task_struct *tsk)
delayacct_tsk_free(tsk);
put_signal_struct(tsk->signal);
sched_core_free(tsk);
-
- if (!profile_handoff_task(tsk))
- free_task(tsk);
+ free_task(tsk);
}
EXPORT_SYMBOL_GPL(__put_task_struct);
@@ -825,13 +1064,13 @@ void __init fork_init(void)
init_task.signal->rlim[RLIMIT_SIGPENDING] =
init_task.signal->rlim[RLIMIT_NPROC];
- for (i = 0; i < MAX_PER_NAMESPACE_UCOUNTS; i++)
+ for (i = 0; i < UCOUNT_COUNTS; i++)
init_user_ns.ucount_max[i] = max_threads/2;
- set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_NPROC, task_rlimit(&init_task, RLIMIT_NPROC));
- set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_MSGQUEUE, task_rlimit(&init_task, RLIMIT_MSGQUEUE));
- set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_SIGPENDING, task_rlimit(&init_task, RLIMIT_SIGPENDING));
- set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_MEMLOCK, task_rlimit(&init_task, RLIMIT_MEMLOCK));
+ set_userns_rlimit_max(&init_user_ns, UCOUNT_RLIMIT_NPROC, RLIM_INFINITY);
+ set_userns_rlimit_max(&init_user_ns, UCOUNT_RLIMIT_MSGQUEUE, RLIM_INFINITY);
+ set_userns_rlimit_max(&init_user_ns, UCOUNT_RLIMIT_SIGPENDING, RLIM_INFINITY);
+ set_userns_rlimit_max(&init_user_ns, UCOUNT_RLIMIT_MEMLOCK, RLIM_INFINITY);
#ifdef CONFIG_VMAP_STACK
cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache",
@@ -862,8 +1101,6 @@ void set_task_stack_end_magic(struct task_struct *tsk)
static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
{
struct task_struct *tsk;
- unsigned long *stack;
- struct vm_struct *stack_vm_area __maybe_unused;
int err;
if (node == NUMA_NO_NODE)
@@ -872,32 +1109,18 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
if (!tsk)
return NULL;
- stack = alloc_thread_stack_node(tsk, node);
- if (!stack)
+ err = arch_dup_task_struct(tsk, orig);
+ if (err)
goto free_tsk;
- if (memcg_charge_kernel_stack(tsk))
- goto free_stack;
-
- stack_vm_area = task_stack_vm_area(tsk);
-
- err = arch_dup_task_struct(tsk, orig);
+ err = alloc_thread_stack_node(tsk, node);
+ if (err)
+ goto free_tsk;
- /*
- * arch_dup_task_struct() clobbers the stack-related fields. Make
- * sure they're properly initialized before using any stack-related
- * functions again.
- */
- tsk->stack = stack;
-#ifdef CONFIG_VMAP_STACK
- tsk->stack_vm_area = stack_vm_area;
-#endif
#ifdef CONFIG_THREAD_INFO_IN_TASK
refcount_set(&tsk->stack_refcount, 1);
#endif
-
- if (err)
- goto free_stack;
+ account_kernel_stack(tsk, 1);
err = scs_prepare(tsk, node);
if (err)
@@ -924,6 +1147,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
#endif
if (orig->cpus_ptr == &orig->cpus_mask)
tsk->cpus_ptr = &tsk->cpus_mask;
+ dup_user_cpus_ptr(tsk, orig, node);
/*
* One for the user space visible state that goes away when reaped.
@@ -938,11 +1162,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
tsk->splice_pipe = NULL;
tsk->task_frag.page = NULL;
tsk->wake_q.next = NULL;
- tsk->pf_io_worker = NULL;
-
- account_kernel_stack(tsk, 1);
+ tsk->worker_private = NULL;
kcov_task_init(tsk);
+ kmsan_task_create(tsk);
kmap_local_fork(tsk);
#ifdef CONFIG_FAULT_INJECTION
@@ -950,16 +1173,32 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
#endif
#ifdef CONFIG_BLK_CGROUP
- tsk->throttle_queue = NULL;
+ tsk->throttle_disk = NULL;
tsk->use_memdelay = 0;
#endif
+#ifdef CONFIG_IOMMU_SVA
+ tsk->pasid_activated = 0;
+#endif
+
#ifdef CONFIG_MEMCG
tsk->active_memcg = NULL;
#endif
+
+#ifdef CONFIG_CPU_SUP_INTEL
+ tsk->reported_split_lock = 0;
+#endif
+
+#ifdef CONFIG_SCHED_MM_CID
+ tsk->mm_cid = -1;
+ tsk->last_mm_cid = -1;
+ tsk->mm_cid_active = 0;
+ tsk->migrate_from_cpu = -1;
+#endif
return tsk;
free_stack:
+ exit_task_stack_account(tsk);
free_thread_stack(tsk);
free_tsk:
free_task_struct(tsk);
@@ -1006,13 +1245,6 @@ static void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
#endif
}
-static void mm_init_pasid(struct mm_struct *mm)
-{
-#ifdef CONFIG_IOMMU_SUPPORT
- mm->pasid = INIT_PASID;
-#endif
-}
-
static void mm_init_uprobes_state(struct mm_struct *mm)
{
#ifdef CONFIG_UPROBES
@@ -1023,15 +1255,18 @@ static void mm_init_uprobes_state(struct mm_struct *mm)
static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
struct user_namespace *user_ns)
{
- mm->mmap = NULL;
- mm->mm_rb = RB_ROOT;
- mm->vmacache_seqnum = 0;
+ int i;
+
+ mt_init_flags(&mm->mm_mt, MM_MT_FLAGS);
+ mt_set_external_lock(&mm->mm_mt, &mm->mmap_lock);
atomic_set(&mm->mm_users, 1);
atomic_set(&mm->mm_count, 1);
seqcount_init(&mm->write_protect_seq);
mmap_init_lock(mm);
INIT_LIST_HEAD(&mm->mmlist);
- mm->core_state = NULL;
+#ifdef CONFIG_PER_VMA_LOCK
+ mm->mm_lock_seq = 0;
+#endif
mm_pgtables_bytes_init(mm);
mm->map_count = 0;
mm->locked_vm = 0;
@@ -1042,7 +1277,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
mm_init_cpumask(mm);
mm_init_aio(mm);
mm_init_owner(mm, p);
- mm_init_pasid(mm);
+ mm_pasid_init(mm);
RCU_INIT_POINTER(mm->exe_file, NULL);
mmu_notifier_subscriptions_init(mm);
init_tlb_flush_pending(mm);
@@ -1050,6 +1285,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
mm->pmd_huge_pte = NULL;
#endif
mm_init_uprobes_state(mm);
+ hugetlb_count_init(mm);
if (current->mm) {
mm->flags = current->mm->flags & MMF_INIT_MASK;
@@ -1065,9 +1301,23 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
if (init_new_context(p, mm))
goto fail_nocontext;
+ if (mm_alloc_cid(mm))
+ goto fail_cid;
+
+ for (i = 0; i < NR_MM_COUNTERS; i++)
+ if (percpu_counter_init(&mm->rss_stat[i], 0, GFP_KERNEL_ACCOUNT))
+ goto fail_pcpu;
+
mm->user_ns = get_user_ns(user_ns);
+ lru_gen_init_mm(mm);
return mm;
+fail_pcpu:
+ while (i > 0)
+ percpu_counter_destroy(&mm->rss_stat[--i]);
+ mm_destroy_cid(mm);
+fail_cid:
+ destroy_context(mm);
fail_nocontext:
mm_free_pgd(mm);
fail_nopgd:
@@ -1108,6 +1358,7 @@ static inline void __mmput(struct mm_struct *mm)
}
if (mm->binfmt)
module_put(mm->binfmt->module);
+ lru_gen_del_mm(mm);
mmdrop(mm);
}
@@ -1139,6 +1390,7 @@ void mmput_async(struct mm_struct *mm)
schedule_work(&mm->async_put_work);
}
}
+EXPORT_SYMBOL_GPL(mmput_async);
#endif
/**
@@ -1148,11 +1400,11 @@ void mmput_async(struct mm_struct *mm)
*
* Main users are mmput() and sys_execve(). Callers prevent concurrent
* invocations: in mmput() nobody alive left, in execve task is single
- * threaded. sys_prctl(PR_SET_MM_MAP/EXE_FILE) also needs to set the
- * mm->exe_file, but does so without using set_mm_exe_file() in order
- * to avoid the need for any locks.
+ * threaded.
+ *
+ * Can only fail if new_exe_file != NULL.
*/
-void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
+int set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
{
struct file *old_exe_file;
@@ -1163,11 +1415,76 @@ void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
*/
old_exe_file = rcu_dereference_raw(mm->exe_file);
- if (new_exe_file)
+ if (new_exe_file) {
+ /*
+ * We expect the caller (i.e., sys_execve) to already denied
+ * write access, so this is unlikely to fail.
+ */
+ if (unlikely(deny_write_access(new_exe_file)))
+ return -EACCES;
get_file(new_exe_file);
+ }
rcu_assign_pointer(mm->exe_file, new_exe_file);
- if (old_exe_file)
+ if (old_exe_file) {
+ allow_write_access(old_exe_file);
fput(old_exe_file);
+ }
+ return 0;
+}
+
+/**
+ * replace_mm_exe_file - replace a reference to the mm's executable file
+ *
+ * This changes mm's executable file (shown as symlink /proc/[pid]/exe),
+ * dealing with concurrent invocation and without grabbing the mmap lock in
+ * write mode.
+ *
+ * Main user is sys_prctl(PR_SET_MM_MAP/EXE_FILE).
+ */
+int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
+{
+ struct vm_area_struct *vma;
+ struct file *old_exe_file;
+ int ret = 0;
+
+ /* Forbid mm->exe_file change if old file still mapped. */
+ old_exe_file = get_mm_exe_file(mm);
+ if (old_exe_file) {
+ VMA_ITERATOR(vmi, mm, 0);
+ mmap_read_lock(mm);
+ for_each_vma(vmi, vma) {
+ if (!vma->vm_file)
+ continue;
+ if (path_equal(&vma->vm_file->f_path,
+ &old_exe_file->f_path)) {
+ ret = -EBUSY;
+ break;
+ }
+ }
+ mmap_read_unlock(mm);
+ fput(old_exe_file);
+ if (ret)
+ return ret;
+ }
+
+ /* set the new file, lockless */
+ ret = deny_write_access(new_exe_file);
+ if (ret)
+ return -EACCES;
+ get_file(new_exe_file);
+
+ old_exe_file = xchg(&mm->exe_file, new_exe_file);
+ if (old_exe_file) {
+ /*
+ * Don't race with dup_mmap() getting the file and disallowing
+ * write access while someone might open the file writable.
+ */
+ mmap_read_lock(mm);
+ allow_write_access(old_exe_file);
+ fput(old_exe_file);
+ mmap_read_unlock(mm);
+ }
+ return 0;
}
/**
@@ -1187,7 +1504,6 @@ struct file *get_mm_exe_file(struct mm_struct *mm)
rcu_read_unlock();
return exe_file;
}
-EXPORT_SYMBOL(get_mm_exe_file);
/**
* get_task_exe_file - acquire a reference to the task's executable file
@@ -1210,7 +1526,6 @@ struct file *get_task_exe_file(struct task_struct *task)
task_unlock(task);
return exe_file;
}
-EXPORT_SYMBOL(get_task_exe_file);
/**
* get_task_mm - acquire a reference to the task's mm
@@ -1274,13 +1589,12 @@ static void complete_vfork_done(struct task_struct *tsk)
static int wait_for_vfork_done(struct task_struct *child,
struct completion *vfork)
{
+ unsigned int state = TASK_UNINTERRUPTIBLE|TASK_KILLABLE|TASK_FREEZABLE;
int killed;
- freezer_do_not_count();
cgroup_enter_frozen();
- killed = wait_for_completion_killable(vfork);
+ killed = wait_for_completion_state(vfork, state);
cgroup_leave_frozen(false);
- freezer_count();
if (killed) {
task_lock(child);
@@ -1318,8 +1632,7 @@ static void mm_release(struct task_struct *tsk, struct mm_struct *mm)
* purposes.
*/
if (tsk->clear_child_tid) {
- if (!(tsk->signal->flags & SIGNAL_GROUP_COREDUMP) &&
- atomic_read(&mm->mm_users) > 1) {
+ if (atomic_read(&mm->mm_users) > 1) {
/*
* We don't check the error code - if userspace has
* not set up a proper pointer then tough luck.
@@ -1421,9 +1734,6 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
if (!oldmm)
return 0;
- /* initialize the new vmacache entries */
- vmacache_flush(tsk);
-
if (clone_flags & CLONE_VM) {
mmget(oldmm);
mm = oldmm;
@@ -1435,6 +1745,7 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
tsk->mm = mm;
tsk->active_mm = mm;
+ sched_mm_cid_fork(tsk);
return 0;
}
@@ -1458,7 +1769,8 @@ static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
return 0;
}
-static int copy_files(unsigned long clone_flags, struct task_struct *tsk)
+static int copy_files(unsigned long clone_flags, struct task_struct *tsk,
+ int no_files)
{
struct files_struct *oldf, *newf;
int error = 0;
@@ -1470,6 +1782,11 @@ static int copy_files(unsigned long clone_flags, struct task_struct *tsk)
if (!oldf)
goto out;
+ if (no_files) {
+ tsk->files = NULL;
+ goto out;
+ }
+
if (clone_flags & CLONE_FILES) {
atomic_inc(&oldf->count);
goto out;
@@ -1485,32 +1802,6 @@ out:
return error;
}
-static int copy_io(unsigned long clone_flags, struct task_struct *tsk)
-{
-#ifdef CONFIG_BLOCK
- struct io_context *ioc = current->io_context;
- struct io_context *new_ioc;
-
- if (!ioc)
- return 0;
- /*
- * Share io context with parent, if CLONE_IO is set
- */
- if (clone_flags & CLONE_IO) {
- ioc_task_link(ioc);
- tsk->io_context = ioc;
- } else if (ioprio_valid(ioc->ioprio)) {
- new_ioc = get_task_io_context(tsk, GFP_KERNEL, NUMA_NO_NODE);
- if (unlikely(!new_ioc))
- return -ENOMEM;
-
- new_ioc->ioprio = ioc->ioprio;
- put_io_context(new_ioc);
- }
-#endif
- return 0;
-}
-
static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
{
struct sighand_struct *sig;
@@ -1573,6 +1864,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
return -ENOMEM;
sig->nr_threads = 1;
+ sig->quick_threads = 1;
atomic_set(&sig->live, 1);
refcount_set(&sig->sigcnt, 1);
@@ -1695,6 +1987,7 @@ static inline void rcu_copy_process(struct task_struct *p)
p->trc_reader_nesting = 0;
p->trc_reader_special.s = 0;
INIT_LIST_HEAD(&p->trc_holdout_list);
+ INIT_LIST_HEAD(&p->trc_blkd_node);
#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
}
@@ -1811,6 +2104,91 @@ const struct file_operations pidfd_fops = {
#endif
};
+/**
+ * __pidfd_prepare - allocate a new pidfd_file and reserve a pidfd
+ * @pid: the struct pid for which to create a pidfd
+ * @flags: flags of the new @pidfd
+ * @pidfd: the pidfd to return
+ *
+ * Allocate a new file that stashes @pid and reserve a new pidfd number in the
+ * caller's file descriptor table. The pidfd is reserved but not installed yet.
+
+ * The helper doesn't perform checks on @pid which makes it useful for pidfds
+ * created via CLONE_PIDFD where @pid has no task attached when the pidfd and
+ * pidfd file are prepared.
+ *
+ * If this function returns successfully the caller is responsible to either
+ * call fd_install() passing the returned pidfd and pidfd file as arguments in
+ * order to install the pidfd into its file descriptor table or they must use
+ * put_unused_fd() and fput() on the returned pidfd and pidfd file
+ * respectively.
+ *
+ * This function is useful when a pidfd must already be reserved but there
+ * might still be points of failure afterwards and the caller wants to ensure
+ * that no pidfd is leaked into its file descriptor table.
+ *
+ * Return: On success, a reserved pidfd is returned from the function and a new
+ * pidfd file is returned in the last argument to the function. On
+ * error, a negative error code is returned from the function and the
+ * last argument remains unchanged.
+ */
+static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret)
+{
+ int pidfd;
+ struct file *pidfd_file;
+
+ if (flags & ~(O_NONBLOCK | O_RDWR | O_CLOEXEC))
+ return -EINVAL;
+
+ pidfd = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
+ if (pidfd < 0)
+ return pidfd;
+
+ pidfd_file = anon_inode_getfile("[pidfd]", &pidfd_fops, pid,
+ flags | O_RDWR | O_CLOEXEC);
+ if (IS_ERR(pidfd_file)) {
+ put_unused_fd(pidfd);
+ return PTR_ERR(pidfd_file);
+ }
+ get_pid(pid); /* held by pidfd_file now */
+ *ret = pidfd_file;
+ return pidfd;
+}
+
+/**
+ * pidfd_prepare - allocate a new pidfd_file and reserve a pidfd
+ * @pid: the struct pid for which to create a pidfd
+ * @flags: flags of the new @pidfd
+ * @pidfd: the pidfd to return
+ *
+ * Allocate a new file that stashes @pid and reserve a new pidfd number in the
+ * caller's file descriptor table. The pidfd is reserved but not installed yet.
+ *
+ * The helper verifies that @pid is used as a thread group leader.
+ *
+ * If this function returns successfully the caller is responsible to either
+ * call fd_install() passing the returned pidfd and pidfd file as arguments in
+ * order to install the pidfd into its file descriptor table or they must use
+ * put_unused_fd() and fput() on the returned pidfd and pidfd file
+ * respectively.
+ *
+ * This function is useful when a pidfd must already be reserved but there
+ * might still be points of failure afterwards and the caller wants to ensure
+ * that no pidfd is leaked into its file descriptor table.
+ *
+ * Return: On success, a reserved pidfd is returned from the function and a new
+ * pidfd file is returned in the last argument to the function. On
+ * error, a negative error code is returned from the function and the
+ * last argument remains unchanged.
+ */
+int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret)
+{
+ if (!pid || !pid_has_task(pid, PIDTYPE_TGID))
+ return -EINVAL;
+
+ return __pidfd_prepare(pid, flags, ret);
+}
+
static void __delayed_free_task(struct rcu_head *rhp)
{
struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
@@ -1845,6 +2223,18 @@ static void copy_oom_score_adj(u64 clone_flags, struct task_struct *tsk)
mutex_unlock(&oom_adj_mutex);
}
+#ifdef CONFIG_RV
+static void rv_task_fork(struct task_struct *p)
+{
+ int i;
+
+ for (i = 0; i < RV_PER_TASK_MONITORS; i++)
+ p->rv[i].da_mon.monitoring = false;
+}
+#else
+#define rv_task_fork(p) do {} while (0)
+#endif
+
/*
* This creates a new process as a copy of the old one,
* but does not actually start it yet.
@@ -1853,7 +2243,7 @@ static void copy_oom_score_adj(u64 clone_flags, struct task_struct *tsk)
* parts of the process environment (as per the clone
* flags). The actual kick-off is left to the caller.
*/
-static __latent_entropy struct task_struct *copy_process(
+__latent_entropy struct task_struct *copy_process(
struct pid *pid,
int trace,
int node,
@@ -1863,7 +2253,7 @@ static __latent_entropy struct task_struct *copy_process(
struct task_struct *p;
struct multiprocess_signals delayed;
struct file *pidfile = NULL;
- u64 clone_flags = args->flags;
+ const u64 clone_flags = args->flags;
struct nsproxy *nsp = current->nsproxy;
/*
@@ -1911,15 +2301,6 @@ static __latent_entropy struct task_struct *copy_process(
return ERR_PTR(-EINVAL);
}
- /*
- * If the new process will be in a different time namespace
- * do not allow it to share VM or a thread group with the forking task.
- */
- if (clone_flags & (CLONE_THREAD | CLONE_VM)) {
- if (nsp->time_ns != nsp->time_ns_for_children)
- return ERR_PTR(-EINVAL);
- }
-
if (clone_flags & CLONE_PIDFD) {
/*
* - CLONE_DETACHED is blocked so that we can potentially
@@ -1952,6 +2333,11 @@ static __latent_entropy struct task_struct *copy_process(
p = dup_task_struct(current, node);
if (!p)
goto fork_out;
+ p->flags &= ~PF_KTHREAD;
+ if (args->kthread)
+ p->flags |= PF_KTHREAD;
+ if (args->user_worker)
+ p->flags |= PF_USER_WORKER;
if (args->io_thread) {
/*
* Mark us an IO worker, and block any signal that isn't
@@ -1961,12 +2347,9 @@ static __latent_entropy struct task_struct *copy_process(
siginitsetinv(&p->blocked, sigmask(SIGKILL)|sigmask(SIGSTOP));
}
- /*
- * This _must_ happen before we call free_task(), i.e. before we jump
- * to any of the bad_fork_* labels. This is to avoid freeing
- * p->set_child_tid which is (ab)used as a kthread's data pointer for
- * kernel threads (PF_KTHREAD).
- */
+ if (args->name)
+ strscpy_pad(p->comm, args->name, sizeof(p->comm));
+
p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? args->child_tid : NULL;
/*
* Clear TID on mm_release()?
@@ -1981,18 +2364,18 @@ static __latent_entropy struct task_struct *copy_process(
#ifdef CONFIG_PROVE_LOCKING
DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
#endif
+ retval = copy_creds(p, clone_flags);
+ if (retval < 0)
+ goto bad_fork_free;
+
retval = -EAGAIN;
- if (is_ucounts_overlimit(task_ucounts(p), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) {
+ if (is_rlimit_overlimit(task_ucounts(p), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) {
if (p->real_cred->user != INIT_USER &&
!capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN))
- goto bad_fork_free;
+ goto bad_fork_cleanup_count;
}
current->flags &= ~PF_NPROC_EXCEEDED;
- retval = copy_creds(p, clone_flags);
- if (retval < 0)
- goto bad_fork_free;
-
/*
* If multiple threads are within copy_process(), then this check
* triggers too late. This doesn't hurt, the check is only there
@@ -2047,12 +2430,16 @@ static __latent_entropy struct task_struct *copy_process(
p->io_context = NULL;
audit_set_context(p, NULL);
cgroup_fork(p);
+ if (args->kthread) {
+ if (!set_kthread_struct(p))
+ goto bad_fork_cleanup_delayacct;
+ }
#ifdef CONFIG_NUMA
p->mempolicy = mpol_dup(p->mempolicy);
if (IS_ERR(p->mempolicy)) {
retval = PTR_ERR(p->mempolicy);
p->mempolicy = NULL;
- goto bad_fork_cleanup_threadgroup_lock;
+ goto bad_fork_cleanup_delayacct;
}
#endif
#ifdef CONFIG_CPUSETS
@@ -2083,6 +2470,7 @@ static __latent_entropy struct task_struct *copy_process(
#endif
#ifdef CONFIG_BPF_SYSCALL
RCU_INIT_POINTER(p->bpf_storage, NULL);
+ p->bpf_ctx = NULL;
#endif
/* Perform scheduler related setup. Assign this task to a CPU. */
@@ -2104,7 +2492,7 @@ static __latent_entropy struct task_struct *copy_process(
retval = copy_semundo(clone_flags, p);
if (retval)
goto bad_fork_cleanup_security;
- retval = copy_files(clone_flags, p);
+ retval = copy_files(clone_flags, p, args->no_files);
if (retval)
goto bad_fork_cleanup_semundo;
retval = copy_fs(clone_flags, p);
@@ -2125,10 +2513,13 @@ static __latent_entropy struct task_struct *copy_process(
retval = copy_io(clone_flags, p);
if (retval)
goto bad_fork_cleanup_namespaces;
- retval = copy_thread(clone_flags, args->stack, args->stack_size, p, args->tls);
+ retval = copy_thread(p, args);
if (retval)
goto bad_fork_cleanup_io;
+ if (args->ignore_signals)
+ ignore_signals(p);
+
stackleak_task_init(p);
if (pid != &init_struct_pid) {
@@ -2146,21 +2537,12 @@ static __latent_entropy struct task_struct *copy_process(
* if the fd table isn't shared).
*/
if (clone_flags & CLONE_PIDFD) {
- retval = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
+ /* Note that no task has been attached to @pid yet. */
+ retval = __pidfd_prepare(pid, O_RDWR | O_CLOEXEC, &pidfile);
if (retval < 0)
goto bad_fork_free_pid;
-
pidfd = retval;
- pidfile = anon_inode_getfile("[pidfd]", &pidfd_fops, pid,
- O_RDWR | O_CLOEXEC);
- if (IS_ERR(pidfile)) {
- put_unused_fd(pidfd);
- retval = PTR_ERR(pidfile);
- goto bad_fork_free_pid;
- }
- get_pid(pid); /* held by pidfile now */
-
retval = put_user(pidfd, args->pidfd);
if (retval)
goto bad_fork_put_pidfd;
@@ -2205,10 +2587,14 @@ static __latent_entropy struct task_struct *copy_process(
p->pdeath_signal = 0;
INIT_LIST_HEAD(&p->thread_group);
p->task_works = NULL;
+ clear_posix_cputimers_work(p);
#ifdef CONFIG_KRETPROBES
p->kretprobe_instances.first = NULL;
#endif
+#ifdef CONFIG_RETHOOK
+ p->rethooks.first = NULL;
+#endif
/*
* Ensure that the cgroup subsystem policies allow the new process to be
@@ -2221,6 +2607,17 @@ static __latent_entropy struct task_struct *copy_process(
goto bad_fork_put_pidfd;
/*
+ * Now that the cgroups are pinned, re-clone the parent cgroup and put
+ * the new task on the correct runqueue. All this *before* the task
+ * becomes visible.
+ *
+ * This isn't part of ->can_fork() because while the re-cloning is
+ * cgroup specific, it unconditionally needs to place the task on a
+ * runqueue.
+ */
+ sched_cgroup_fork(p, args);
+
+ /*
* From this point on we must avoid any synchronous user-space
* communication until we take the tasklist-lock. In particular, we do
* not want user-space to be able to predict the process start-time by
@@ -2257,11 +2654,7 @@ static __latent_entropy struct task_struct *copy_process(
spin_lock(&current->sighand->siglock);
- /*
- * Copy seccomp details explicitly here, in case they were changed
- * before holding sighand lock.
- */
- copy_seccomp(p);
+ rv_task_fork(p);
rseq_fork(p, clone_flags);
@@ -2277,9 +2670,13 @@ static __latent_entropy struct task_struct *copy_process(
goto bad_fork_cancel_cgroup;
}
- /* past the last point of failure */
- if (pidfile)
- fd_install(pidfd, pidfile);
+ /* No more failure paths after this point. */
+
+ /*
+ * Copy seccomp details explicitly here, in case they were changed
+ * before holding sighand lock.
+ */
+ copy_seccomp(p);
init_task_pid_links(p);
if (likely(p->pid)) {
@@ -2312,6 +2709,7 @@ static __latent_entropy struct task_struct *copy_process(
__this_cpu_inc(process_counts);
} else {
current->signal->nr_threads++;
+ current->signal->quick_threads++;
atomic_inc(&current->signal->live);
refcount_inc(&current->signal->sigcnt);
task_join_group_stop(p);
@@ -2329,6 +2727,9 @@ static __latent_entropy struct task_struct *copy_process(
syscall_tracepoint_update(p);
write_unlock_irq(&tasklist_lock);
+ if (pidfile)
+ fd_install(pidfd, pidfile);
+
proc_fork_connector(p);
sched_post_fork(p);
cgroup_post_fork(p, args);
@@ -2336,6 +2737,7 @@ static __latent_entropy struct task_struct *copy_process(
trace_task_newtask(p, clone_flags);
uprobe_copy_process(p, clone_flags);
+ user_events_fork(p, clone_flags);
copy_oom_score_adj(clone_flags, p);
@@ -2387,14 +2789,15 @@ bad_fork_cleanup_policy:
lockdep_free_task(p);
#ifdef CONFIG_NUMA
mpol_put(p->mempolicy);
-bad_fork_cleanup_threadgroup_lock:
#endif
+bad_fork_cleanup_delayacct:
delayacct_tsk_free(p);
bad_fork_cleanup_count:
dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
exit_creds(p);
bad_fork_free:
WRITE_ONCE(p->__state, TASK_DEAD);
+ exit_task_stack_account(p);
put_task_stack(p);
delayed_free_task(p);
fork_out:
@@ -2414,11 +2817,21 @@ static inline void init_idle_pids(struct task_struct *idle)
}
}
+static int idle_dummy(void *dummy)
+{
+ /* This function is never called */
+ return 0;
+}
+
struct task_struct * __init fork_idle(int cpu)
{
struct task_struct *task;
struct kernel_clone_args args = {
- .flags = CLONE_VM,
+ .flags = CLONE_VM,
+ .fn = &idle_dummy,
+ .fn_arg = NULL,
+ .kthread = 1,
+ .idle = 1,
};
task = copy_process(&init_struct_pid, 0, cpu_to_node(cpu), &args);
@@ -2430,11 +2843,6 @@ struct task_struct * __init fork_idle(int cpu)
return task;
}
-struct mm_struct *copy_init_mm(void)
-{
- return dup_mm(NULL, &init_mm);
-}
-
/*
* This is like kernel_clone(), but shaved down and tailored to just
* creating io_uring workers. It returns a created task, or an error pointer.
@@ -2449,9 +2857,10 @@ struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node)
.flags = ((lower_32_bits(flags) | CLONE_VM |
CLONE_UNTRACED) & ~CSIGNAL),
.exit_signal = (lower_32_bits(flags) & CSIGNAL),
- .stack = (unsigned long)fn,
- .stack_size = (unsigned long)arg,
+ .fn = fn,
+ .fn_arg = arg,
.io_thread = 1,
+ .user_worker = 1,
};
return copy_process(NULL, 0, node, &args);
@@ -2530,6 +2939,13 @@ pid_t kernel_clone(struct kernel_clone_args *args)
get_task_struct(p);
}
+ if (IS_ENABLED(CONFIG_LRU_GEN) && !(clone_flags & CLONE_VM)) {
+ /* lock the task to synchronize with memcg migration */
+ task_lock(p);
+ lru_gen_add_mm(p->mm);
+ task_unlock(p);
+ }
+
wake_up_new_task(p);
/* forking complete and child started to run, tell ptracer */
@@ -2548,14 +2964,33 @@ pid_t kernel_clone(struct kernel_clone_args *args)
/*
* Create a kernel thread.
*/
-pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
+pid_t kernel_thread(int (*fn)(void *), void *arg, const char *name,
+ unsigned long flags)
+{
+ struct kernel_clone_args args = {
+ .flags = ((lower_32_bits(flags) | CLONE_VM |
+ CLONE_UNTRACED) & ~CSIGNAL),
+ .exit_signal = (lower_32_bits(flags) & CSIGNAL),
+ .fn = fn,
+ .fn_arg = arg,
+ .name = name,
+ .kthread = 1,
+ };
+
+ return kernel_clone(&args);
+}
+
+/*
+ * Create a user mode thread.
+ */
+pid_t user_mode_thread(int (*fn)(void *), void *arg, unsigned long flags)
{
struct kernel_clone_args args = {
.flags = ((lower_32_bits(flags) | CLONE_VM |
CLONE_UNTRACED) & ~CSIGNAL),
.exit_signal = (lower_32_bits(flags) & CSIGNAL),
- .stack = (unsigned long)fn,
- .stack_size = (unsigned long)arg,
+ .fn = fn,
+ .fn_arg = arg,
};
return kernel_clone(&args);
@@ -2737,7 +3172,7 @@ static bool clone3_args_valid(struct kernel_clone_args *kargs)
* - make the CLONE_DETACHED bit reusable for clone3
* - make the CSIGNAL bits reusable for clone3
*/
- if (kargs->flags & (CLONE_DETACHED | CSIGNAL))
+ if (kargs->flags & (CLONE_DETACHED | (CSIGNAL & (~CLONE_NEWTIME))))
return false;
if ((kargs->flags & (CLONE_SIGHAND | CLONE_CLEAR_SIGHAND)) ==
@@ -2829,10 +3264,27 @@ static void sighand_ctor(void *data)
init_waitqueue_head(&sighand->signalfd_wqh);
}
-void __init proc_caches_init(void)
+void __init mm_cache_init(void)
{
unsigned int mm_size;
+ /*
+ * The mm_cpumask is located at the end of mm_struct, and is
+ * dynamically sized based on the maximum CPU number this system
+ * can have, taking hotplug into account (nr_cpu_ids).
+ */
+ mm_size = sizeof(struct mm_struct) + cpumask_size() + mm_cid_size();
+
+ mm_cachep = kmem_cache_create_usercopy("mm_struct",
+ mm_size, ARCH_MIN_MMSTRUCT_ALIGN,
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
+ offsetof(struct mm_struct, saved_auxv),
+ sizeof_field(struct mm_struct, saved_auxv),
+ NULL);
+}
+
+void __init proc_caches_init(void)
+{
sighand_cachep = kmem_cache_create("sighand_cache",
sizeof(struct sighand_struct), 0,
SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU|
@@ -2850,20 +3302,10 @@ void __init proc_caches_init(void)
SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
NULL);
- /*
- * The mm_cpumask is located at the end of mm_struct, and is
- * dynamically sized based on the maximum CPU number this system
- * can have, taking hotplug into account (nr_cpu_ids).
- */
- mm_size = sizeof(struct mm_struct) + cpumask_size();
-
- mm_cachep = kmem_cache_create_usercopy("mm_struct",
- mm_size, ARCH_MIN_MMSTRUCT_ALIGN,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
- offsetof(struct mm_struct, saved_auxv),
- sizeof_field(struct mm_struct, saved_auxv),
- NULL);
vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT);
+#ifdef CONFIG_PER_VMA_LOCK
+ vma_lock_cachep = KMEM_CACHE(vma_lock, SLAB_PANIC|SLAB_ACCOUNT);
+#endif
mmap_init();
nsproxy_cache_init();
}
@@ -2952,7 +3394,7 @@ int unshare_fd(unsigned long unshare_flags, unsigned int max_fds,
int ksys_unshare(unsigned long unshare_flags)
{
struct fs_struct *fs, *new_fs = NULL;
- struct files_struct *fd, *new_fd = NULL;
+ struct files_struct *new_fd = NULL;
struct cred *new_cred = NULL;
struct nsproxy *new_nsproxy = NULL;
int do_sysvsem = 0;
@@ -3039,11 +3481,8 @@ int ksys_unshare(unsigned long unshare_flags)
spin_unlock(&fs->lock);
}
- if (new_fd) {
- fd = current->files;
- current->files = new_fd;
- new_fd = fd;
- }
+ if (new_fd)
+ swap(current->files, new_fd);
task_unlock(current);
diff --git a/kernel/freezer.c b/kernel/freezer.c
index 45ab36ffd0e7..4fad0e6fca64 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -13,10 +13,11 @@
#include <linux/kthread.h>
/* total number of freezing conditions in effect */
-atomic_t system_freezing_cnt = ATOMIC_INIT(0);
-EXPORT_SYMBOL(system_freezing_cnt);
+DEFINE_STATIC_KEY_FALSE(freezer_active);
+EXPORT_SYMBOL(freezer_active);
-/* indicate whether PM freezing is in effect, protected by
+/*
+ * indicate whether PM freezing is in effect, protected by
* system_transition_mutex
*/
bool pm_freezing;
@@ -29,7 +30,7 @@ static DEFINE_SPINLOCK(freezer_lock);
* freezing_slow_path - slow path for testing whether a task needs to be frozen
* @p: task to be tested
*
- * This function is called by freezing() if system_freezing_cnt isn't zero
+ * This function is called by freezing() if freezer_active isn't zero
* and tests whether @p needs to enter and stay in frozen state. Can be
* called under any context. The freezers are responsible for ensuring the
* target tasks see the updated state.
@@ -52,41 +53,40 @@ bool freezing_slow_path(struct task_struct *p)
}
EXPORT_SYMBOL(freezing_slow_path);
+bool frozen(struct task_struct *p)
+{
+ return READ_ONCE(p->__state) & TASK_FROZEN;
+}
+
/* Refrigerator is place where frozen processes are stored :-). */
bool __refrigerator(bool check_kthr_stop)
{
- /* Hmm, should we be allowed to suspend when there are realtime
- processes around? */
+ unsigned int state = get_current_state();
bool was_frozen = false;
- unsigned int save = get_current_state();
pr_debug("%s entered refrigerator\n", current->comm);
+ WARN_ON_ONCE(state && !(state & TASK_NORMAL));
+
for (;;) {
- set_current_state(TASK_UNINTERRUPTIBLE);
+ bool freeze;
+
+ set_current_state(TASK_FROZEN);
spin_lock_irq(&freezer_lock);
- current->flags |= PF_FROZEN;
- if (!freezing(current) ||
- (check_kthr_stop && kthread_should_stop()))
- current->flags &= ~PF_FROZEN;
+ freeze = freezing(current) && !(check_kthr_stop && kthread_should_stop());
spin_unlock_irq(&freezer_lock);
- if (!(current->flags & PF_FROZEN))
+ if (!freeze)
break;
+
was_frozen = true;
schedule();
}
+ __set_current_state(TASK_RUNNING);
pr_debug("%s left refrigerator\n", current->comm);
- /*
- * Restore saved task state before returning. The mb'd version
- * needs to be used; otherwise, it might silently break
- * synchronization which depends on ordered task state change.
- */
- set_current_state(save);
-
return was_frozen;
}
EXPORT_SYMBOL(__refrigerator);
@@ -101,6 +101,44 @@ static void fake_signal_wake_up(struct task_struct *p)
}
}
+static int __set_task_frozen(struct task_struct *p, void *arg)
+{
+ unsigned int state = READ_ONCE(p->__state);
+
+ if (p->on_rq)
+ return 0;
+
+ if (p != current && task_curr(p))
+ return 0;
+
+ if (!(state & (TASK_FREEZABLE | __TASK_STOPPED | __TASK_TRACED)))
+ return 0;
+
+ /*
+ * Only TASK_NORMAL can be augmented with TASK_FREEZABLE, since they
+ * can suffer spurious wakeups.
+ */
+ if (state & TASK_FREEZABLE)
+ WARN_ON_ONCE(!(state & TASK_NORMAL));
+
+#ifdef CONFIG_LOCKDEP
+ /*
+ * It's dangerous to freeze with locks held; there be dragons there.
+ */
+ if (!(state & __TASK_FREEZABLE_UNSAFE))
+ WARN_ON_ONCE(debug_locks && p->lockdep_depth);
+#endif
+
+ WRITE_ONCE(p->__state, TASK_FROZEN);
+ return TASK_FROZEN;
+}
+
+static bool __freeze_task(struct task_struct *p)
+{
+ /* TASK_FREEZABLE|TASK_STOPPED|TASK_TRACED -> TASK_FROZEN */
+ return task_call_func(p, __set_task_frozen, NULL);
+}
+
/**
* freeze_task - send a freeze request to given task
* @p: task to send the request to
@@ -116,20 +154,8 @@ bool freeze_task(struct task_struct *p)
{
unsigned long flags;
- /*
- * This check can race with freezer_do_not_count, but worst case that
- * will result in an extra wakeup being sent to the task. It does not
- * race with freezer_count(), the barriers in freezer_count() and
- * freezer_should_skip() ensure that either freezer_count() sees
- * freezing == true in try_to_freeze() and freezes, or
- * freezer_should_skip() sees !PF_FREEZE_SKIP and freezes the task
- * normally.
- */
- if (freezer_should_skip(p))
- return false;
-
spin_lock_irqsave(&freezer_lock, flags);
- if (!freezing(p) || frozen(p)) {
+ if (!freezing(p) || frozen(p) || __freeze_task(p)) {
spin_unlock_irqrestore(&freezer_lock, flags);
return false;
}
@@ -137,19 +163,52 @@ bool freeze_task(struct task_struct *p)
if (!(p->flags & PF_KTHREAD))
fake_signal_wake_up(p);
else
- wake_up_state(p, TASK_INTERRUPTIBLE);
+ wake_up_state(p, TASK_NORMAL);
spin_unlock_irqrestore(&freezer_lock, flags);
return true;
}
+/*
+ * The special task states (TASK_STOPPED, TASK_TRACED) keep their canonical
+ * state in p->jobctl. If either of them got a wakeup that was missed because
+ * TASK_FROZEN, then their canonical state reflects that and the below will
+ * refuse to restore the special state and instead issue the wakeup.
+ */
+static int __set_task_special(struct task_struct *p, void *arg)
+{
+ unsigned int state = 0;
+
+ if (p->jobctl & JOBCTL_TRACED)
+ state = TASK_TRACED;
+
+ else if (p->jobctl & JOBCTL_STOPPED)
+ state = TASK_STOPPED;
+
+ if (state)
+ WRITE_ONCE(p->__state, state);
+
+ return state;
+}
+
void __thaw_task(struct task_struct *p)
{
- unsigned long flags;
+ unsigned long flags, flags2;
spin_lock_irqsave(&freezer_lock, flags);
- if (frozen(p))
- wake_up_process(p);
+ if (WARN_ON_ONCE(freezing(p)))
+ goto unlock;
+
+ if (lock_task_sighand(p, &flags2)) {
+ /* TASK_FROZEN -> TASK_{STOPPED,TRACED} */
+ bool ret = task_call_func(p, __set_task_special, NULL);
+ unlock_task_sighand(p, &flags2);
+ if (ret)
+ goto unlock;
+ }
+
+ wake_up_state(p, TASK_FROZEN);
+unlock:
spin_unlock_irqrestore(&freezer_lock, flags);
}
diff --git a/kernel/futex.c b/kernel/futex.c
deleted file mode 100644
index 2ecb07575055..000000000000
--- a/kernel/futex.c
+++ /dev/null
@@ -1,4038 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Fast Userspace Mutexes (which I call "Futexes!").
- * (C) Rusty Russell, IBM 2002
- *
- * Generalized futexes, futex requeueing, misc fixes by Ingo Molnar
- * (C) Copyright 2003 Red Hat Inc, All Rights Reserved
- *
- * Removed page pinning, fix privately mapped COW pages and other cleanups
- * (C) Copyright 2003, 2004 Jamie Lokier
- *
- * Robust futex support started by Ingo Molnar
- * (C) Copyright 2006 Red Hat Inc, All Rights Reserved
- * Thanks to Thomas Gleixner for suggestions, analysis and fixes.
- *
- * PI-futex support started by Ingo Molnar and Thomas Gleixner
- * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
- * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
- *
- * PRIVATE futexes by Eric Dumazet
- * Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com>
- *
- * Requeue-PI support by Darren Hart <dvhltc@us.ibm.com>
- * Copyright (C) IBM Corporation, 2009
- * Thanks to Thomas Gleixner for conceptual design and careful reviews.
- *
- * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
- * enough at me, Linus for the original (flawed) idea, Matthew
- * Kirkwood for proof-of-concept implementation.
- *
- * "The futexes are also cursed."
- * "But they come in a choice of three flavours!"
- */
-#include <linux/compat.h>
-#include <linux/jhash.h>
-#include <linux/pagemap.h>
-#include <linux/syscalls.h>
-#include <linux/freezer.h>
-#include <linux/memblock.h>
-#include <linux/fault-inject.h>
-#include <linux/time_namespace.h>
-
-#include <asm/futex.h>
-
-#include "locking/rtmutex_common.h"
-
-/*
- * READ this before attempting to hack on futexes!
- *
- * Basic futex operation and ordering guarantees
- * =============================================
- *
- * The waiter reads the futex value in user space and calls
- * futex_wait(). This function computes the hash bucket and acquires
- * the hash bucket lock. After that it reads the futex user space value
- * again and verifies that the data has not changed. If it has not changed
- * it enqueues itself into the hash bucket, releases the hash bucket lock
- * and schedules.
- *
- * The waker side modifies the user space value of the futex and calls
- * futex_wake(). This function computes the hash bucket and acquires the
- * hash bucket lock. Then it looks for waiters on that futex in the hash
- * bucket and wakes them.
- *
- * In futex wake up scenarios where no tasks are blocked on a futex, taking
- * the hb spinlock can be avoided and simply return. In order for this
- * optimization to work, ordering guarantees must exist so that the waiter
- * being added to the list is acknowledged when the list is concurrently being
- * checked by the waker, avoiding scenarios like the following:
- *
- * CPU 0 CPU 1
- * val = *futex;
- * sys_futex(WAIT, futex, val);
- * futex_wait(futex, val);
- * uval = *futex;
- * *futex = newval;
- * sys_futex(WAKE, futex);
- * futex_wake(futex);
- * if (queue_empty())
- * return;
- * if (uval == val)
- * lock(hash_bucket(futex));
- * queue();
- * unlock(hash_bucket(futex));
- * schedule();
- *
- * This would cause the waiter on CPU 0 to wait forever because it
- * missed the transition of the user space value from val to newval
- * and the waker did not find the waiter in the hash bucket queue.
- *
- * The correct serialization ensures that a waiter either observes
- * the changed user space value before blocking or is woken by a
- * concurrent waker:
- *
- * CPU 0 CPU 1
- * val = *futex;
- * sys_futex(WAIT, futex, val);
- * futex_wait(futex, val);
- *
- * waiters++; (a)
- * smp_mb(); (A) <-- paired with -.
- * |
- * lock(hash_bucket(futex)); |
- * |
- * uval = *futex; |
- * | *futex = newval;
- * | sys_futex(WAKE, futex);
- * | futex_wake(futex);
- * |
- * `--------> smp_mb(); (B)
- * if (uval == val)
- * queue();
- * unlock(hash_bucket(futex));
- * schedule(); if (waiters)
- * lock(hash_bucket(futex));
- * else wake_waiters(futex);
- * waiters--; (b) unlock(hash_bucket(futex));
- *
- * Where (A) orders the waiters increment and the futex value read through
- * atomic operations (see hb_waiters_inc) and where (B) orders the write
- * to futex and the waiters read (see hb_waiters_pending()).
- *
- * This yields the following case (where X:=waiters, Y:=futex):
- *
- * X = Y = 0
- *
- * w[X]=1 w[Y]=1
- * MB MB
- * r[Y]=y r[X]=x
- *
- * Which guarantees that x==0 && y==0 is impossible; which translates back into
- * the guarantee that we cannot both miss the futex variable change and the
- * enqueue.
- *
- * Note that a new waiter is accounted for in (a) even when it is possible that
- * the wait call can return error, in which case we backtrack from it in (b).
- * Refer to the comment in queue_lock().
- *
- * Similarly, in order to account for waiters being requeued on another
- * address we always increment the waiters for the destination bucket before
- * acquiring the lock. It then decrements them again after releasing it -
- * the code that actually moves the futex(es) between hash buckets (requeue_futex)
- * will do the additional required waiter count housekeeping. This is done for
- * double_lock_hb() and double_unlock_hb(), respectively.
- */
-
-#ifdef CONFIG_HAVE_FUTEX_CMPXCHG
-#define futex_cmpxchg_enabled 1
-#else
-static int __read_mostly futex_cmpxchg_enabled;
-#endif
-
-/*
- * Futex flags used to encode options to functions and preserve them across
- * restarts.
- */
-#ifdef CONFIG_MMU
-# define FLAGS_SHARED 0x01
-#else
-/*
- * NOMMU does not have per process address space. Let the compiler optimize
- * code away.
- */
-# define FLAGS_SHARED 0x00
-#endif
-#define FLAGS_CLOCKRT 0x02
-#define FLAGS_HAS_TIMEOUT 0x04
-
-/*
- * Priority Inheritance state:
- */
-struct futex_pi_state {
- /*
- * list of 'owned' pi_state instances - these have to be
- * cleaned up in do_exit() if the task exits prematurely:
- */
- struct list_head list;
-
- /*
- * The PI object:
- */
- struct rt_mutex pi_mutex;
-
- struct task_struct *owner;
- refcount_t refcount;
-
- union futex_key key;
-} __randomize_layout;
-
-/**
- * struct futex_q - The hashed futex queue entry, one per waiting task
- * @list: priority-sorted list of tasks waiting on this futex
- * @task: the task waiting on the futex
- * @lock_ptr: the hash bucket lock
- * @key: the key the futex is hashed on
- * @pi_state: optional priority inheritance state
- * @rt_waiter: rt_waiter storage for use with requeue_pi
- * @requeue_pi_key: the requeue_pi target futex key
- * @bitset: bitset for the optional bitmasked wakeup
- *
- * We use this hashed waitqueue, instead of a normal wait_queue_entry_t, so
- * we can wake only the relevant ones (hashed queues may be shared).
- *
- * A futex_q has a woken state, just like tasks have TASK_RUNNING.
- * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0.
- * The order of wakeup is always to make the first condition true, then
- * the second.
- *
- * PI futexes are typically woken before they are removed from the hash list via
- * the rt_mutex code. See unqueue_me_pi().
- */
-struct futex_q {
- struct plist_node list;
-
- struct task_struct *task;
- spinlock_t *lock_ptr;
- union futex_key key;
- struct futex_pi_state *pi_state;
- struct rt_mutex_waiter *rt_waiter;
- union futex_key *requeue_pi_key;
- u32 bitset;
-} __randomize_layout;
-
-static const struct futex_q futex_q_init = {
- /* list gets initialized in queue_me()*/
- .key = FUTEX_KEY_INIT,
- .bitset = FUTEX_BITSET_MATCH_ANY
-};
-
-/*
- * Hash buckets are shared by all the futex_keys that hash to the same
- * location. Each key may have multiple futex_q structures, one for each task
- * waiting on a futex.
- */
-struct futex_hash_bucket {
- atomic_t waiters;
- spinlock_t lock;
- struct plist_head chain;
-} ____cacheline_aligned_in_smp;
-
-/*
- * The base of the bucket array and its size are always used together
- * (after initialization only in hash_futex()), so ensure that they
- * reside in the same cacheline.
- */
-static struct {
- struct futex_hash_bucket *queues;
- unsigned long hashsize;
-} __futex_data __read_mostly __aligned(2*sizeof(long));
-#define futex_queues (__futex_data.queues)
-#define futex_hashsize (__futex_data.hashsize)
-
-
-/*
- * Fault injections for futexes.
- */
-#ifdef CONFIG_FAIL_FUTEX
-
-static struct {
- struct fault_attr attr;
-
- bool ignore_private;
-} fail_futex = {
- .attr = FAULT_ATTR_INITIALIZER,
- .ignore_private = false,
-};
-
-static int __init setup_fail_futex(char *str)
-{
- return setup_fault_attr(&fail_futex.attr, str);
-}
-__setup("fail_futex=", setup_fail_futex);
-
-static bool should_fail_futex(bool fshared)
-{
- if (fail_futex.ignore_private && !fshared)
- return false;
-
- return should_fail(&fail_futex.attr, 1);
-}
-
-#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
-
-static int __init fail_futex_debugfs(void)
-{
- umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
- struct dentry *dir;
-
- dir = fault_create_debugfs_attr("fail_futex", NULL,
- &fail_futex.attr);
- if (IS_ERR(dir))
- return PTR_ERR(dir);
-
- debugfs_create_bool("ignore-private", mode, dir,
- &fail_futex.ignore_private);
- return 0;
-}
-
-late_initcall(fail_futex_debugfs);
-
-#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
-
-#else
-static inline bool should_fail_futex(bool fshared)
-{
- return false;
-}
-#endif /* CONFIG_FAIL_FUTEX */
-
-#ifdef CONFIG_COMPAT
-static void compat_exit_robust_list(struct task_struct *curr);
-#endif
-
-/*
- * Reflects a new waiter being added to the waitqueue.
- */
-static inline void hb_waiters_inc(struct futex_hash_bucket *hb)
-{
-#ifdef CONFIG_SMP
- atomic_inc(&hb->waiters);
- /*
- * Full barrier (A), see the ordering comment above.
- */
- smp_mb__after_atomic();
-#endif
-}
-
-/*
- * Reflects a waiter being removed from the waitqueue by wakeup
- * paths.
- */
-static inline void hb_waiters_dec(struct futex_hash_bucket *hb)
-{
-#ifdef CONFIG_SMP
- atomic_dec(&hb->waiters);
-#endif
-}
-
-static inline int hb_waiters_pending(struct futex_hash_bucket *hb)
-{
-#ifdef CONFIG_SMP
- /*
- * Full barrier (B), see the ordering comment above.
- */
- smp_mb();
- return atomic_read(&hb->waiters);
-#else
- return 1;
-#endif
-}
-
-/**
- * hash_futex - Return the hash bucket in the global hash
- * @key: Pointer to the futex key for which the hash is calculated
- *
- * We hash on the keys returned from get_futex_key (see below) and return the
- * corresponding hash bucket in the global hash.
- */
-static struct futex_hash_bucket *hash_futex(union futex_key *key)
-{
- u32 hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / 4,
- key->both.offset);
-
- return &futex_queues[hash & (futex_hashsize - 1)];
-}
-
-
-/**
- * match_futex - Check whether two futex keys are equal
- * @key1: Pointer to key1
- * @key2: Pointer to key2
- *
- * Return 1 if two futex_keys are equal, 0 otherwise.
- */
-static inline int match_futex(union futex_key *key1, union futex_key *key2)
-{
- return (key1 && key2
- && key1->both.word == key2->both.word
- && key1->both.ptr == key2->both.ptr
- && key1->both.offset == key2->both.offset);
-}
-
-enum futex_access {
- FUTEX_READ,
- FUTEX_WRITE
-};
-
-/**
- * futex_setup_timer - set up the sleeping hrtimer.
- * @time: ptr to the given timeout value
- * @timeout: the hrtimer_sleeper structure to be set up
- * @flags: futex flags
- * @range_ns: optional range in ns
- *
- * Return: Initialized hrtimer_sleeper structure or NULL if no timeout
- * value given
- */
-static inline struct hrtimer_sleeper *
-futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout,
- int flags, u64 range_ns)
-{
- if (!time)
- return NULL;
-
- hrtimer_init_sleeper_on_stack(timeout, (flags & FLAGS_CLOCKRT) ?
- CLOCK_REALTIME : CLOCK_MONOTONIC,
- HRTIMER_MODE_ABS);
- /*
- * If range_ns is 0, calling hrtimer_set_expires_range_ns() is
- * effectively the same as calling hrtimer_set_expires().
- */
- hrtimer_set_expires_range_ns(&timeout->timer, *time, range_ns);
-
- return timeout;
-}
-
-/*
- * Generate a machine wide unique identifier for this inode.
- *
- * This relies on u64 not wrapping in the life-time of the machine; which with
- * 1ns resolution means almost 585 years.
- *
- * This further relies on the fact that a well formed program will not unmap
- * the file while it has a (shared) futex waiting on it. This mapping will have
- * a file reference which pins the mount and inode.
- *
- * If for some reason an inode gets evicted and read back in again, it will get
- * a new sequence number and will _NOT_ match, even though it is the exact same
- * file.
- *
- * It is important that match_futex() will never have a false-positive, esp.
- * for PI futexes that can mess up the state. The above argues that false-negatives
- * are only possible for malformed programs.
- */
-static u64 get_inode_sequence_number(struct inode *inode)
-{
- static atomic64_t i_seq;
- u64 old;
-
- /* Does the inode already have a sequence number? */
- old = atomic64_read(&inode->i_sequence);
- if (likely(old))
- return old;
-
- for (;;) {
- u64 new = atomic64_add_return(1, &i_seq);
- if (WARN_ON_ONCE(!new))
- continue;
-
- old = atomic64_cmpxchg_relaxed(&inode->i_sequence, 0, new);
- if (old)
- return old;
- return new;
- }
-}
-
-/**
- * get_futex_key() - Get parameters which are the keys for a futex
- * @uaddr: virtual address of the futex
- * @fshared: false for a PROCESS_PRIVATE futex, true for PROCESS_SHARED
- * @key: address where result is stored.
- * @rw: mapping needs to be read/write (values: FUTEX_READ,
- * FUTEX_WRITE)
- *
- * Return: a negative error code or 0
- *
- * The key words are stored in @key on success.
- *
- * For shared mappings (when @fshared), the key is:
- *
- * ( inode->i_sequence, page->index, offset_within_page )
- *
- * [ also see get_inode_sequence_number() ]
- *
- * For private mappings (or when !@fshared), the key is:
- *
- * ( current->mm, address, 0 )
- *
- * This allows (cross process, where applicable) identification of the futex
- * without keeping the page pinned for the duration of the FUTEX_WAIT.
- *
- * lock_page() might sleep, the caller should not hold a spinlock.
- */
-static int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key,
- enum futex_access rw)
-{
- unsigned long address = (unsigned long)uaddr;
- struct mm_struct *mm = current->mm;
- struct page *page, *tail;
- struct address_space *mapping;
- int err, ro = 0;
-
- /*
- * The futex address must be "naturally" aligned.
- */
- key->both.offset = address % PAGE_SIZE;
- if (unlikely((address % sizeof(u32)) != 0))
- return -EINVAL;
- address -= key->both.offset;
-
- if (unlikely(!access_ok(uaddr, sizeof(u32))))
- return -EFAULT;
-
- if (unlikely(should_fail_futex(fshared)))
- return -EFAULT;
-
- /*
- * PROCESS_PRIVATE futexes are fast.
- * As the mm cannot disappear under us and the 'key' only needs
- * virtual address, we dont even have to find the underlying vma.
- * Note : We do have to check 'uaddr' is a valid user address,
- * but access_ok() should be faster than find_vma()
- */
- if (!fshared) {
- key->private.mm = mm;
- key->private.address = address;
- return 0;
- }
-
-again:
- /* Ignore any VERIFY_READ mapping (futex common case) */
- if (unlikely(should_fail_futex(true)))
- return -EFAULT;
-
- err = get_user_pages_fast(address, 1, FOLL_WRITE, &page);
- /*
- * If write access is not required (eg. FUTEX_WAIT), try
- * and get read-only access.
- */
- if (err == -EFAULT && rw == FUTEX_READ) {
- err = get_user_pages_fast(address, 1, 0, &page);
- ro = 1;
- }
- if (err < 0)
- return err;
- else
- err = 0;
-
- /*
- * The treatment of mapping from this point on is critical. The page
- * lock protects many things but in this context the page lock
- * stabilizes mapping, prevents inode freeing in the shared
- * file-backed region case and guards against movement to swap cache.
- *
- * Strictly speaking the page lock is not needed in all cases being
- * considered here and page lock forces unnecessarily serialization
- * From this point on, mapping will be re-verified if necessary and
- * page lock will be acquired only if it is unavoidable
- *
- * Mapping checks require the head page for any compound page so the
- * head page and mapping is looked up now. For anonymous pages, it
- * does not matter if the page splits in the future as the key is
- * based on the address. For filesystem-backed pages, the tail is
- * required as the index of the page determines the key. For
- * base pages, there is no tail page and tail == page.
- */
- tail = page;
- page = compound_head(page);
- mapping = READ_ONCE(page->mapping);
-
- /*
- * If page->mapping is NULL, then it cannot be a PageAnon
- * page; but it might be the ZERO_PAGE or in the gate area or
- * in a special mapping (all cases which we are happy to fail);
- * or it may have been a good file page when get_user_pages_fast
- * found it, but truncated or holepunched or subjected to
- * invalidate_complete_page2 before we got the page lock (also
- * cases which we are happy to fail). And we hold a reference,
- * so refcount care in invalidate_complete_page's remove_mapping
- * prevents drop_caches from setting mapping to NULL beneath us.
- *
- * The case we do have to guard against is when memory pressure made
- * shmem_writepage move it from filecache to swapcache beneath us:
- * an unlikely race, but we do need to retry for page->mapping.
- */
- if (unlikely(!mapping)) {
- int shmem_swizzled;
-
- /*
- * Page lock is required to identify which special case above
- * applies. If this is really a shmem page then the page lock
- * will prevent unexpected transitions.
- */
- lock_page(page);
- shmem_swizzled = PageSwapCache(page) || page->mapping;
- unlock_page(page);
- put_page(page);
-
- if (shmem_swizzled)
- goto again;
-
- return -EFAULT;
- }
-
- /*
- * Private mappings are handled in a simple way.
- *
- * If the futex key is stored on an anonymous page, then the associated
- * object is the mm which is implicitly pinned by the calling process.
- *
- * NOTE: When userspace waits on a MAP_SHARED mapping, even if
- * it's a read-only handle, it's expected that futexes attach to
- * the object not the particular process.
- */
- if (PageAnon(page)) {
- /*
- * A RO anonymous page will never change and thus doesn't make
- * sense for futex operations.
- */
- if (unlikely(should_fail_futex(true)) || ro) {
- err = -EFAULT;
- goto out;
- }
-
- key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
- key->private.mm = mm;
- key->private.address = address;
-
- } else {
- struct inode *inode;
-
- /*
- * The associated futex object in this case is the inode and
- * the page->mapping must be traversed. Ordinarily this should
- * be stabilised under page lock but it's not strictly
- * necessary in this case as we just want to pin the inode, not
- * update the radix tree or anything like that.
- *
- * The RCU read lock is taken as the inode is finally freed
- * under RCU. If the mapping still matches expectations then the
- * mapping->host can be safely accessed as being a valid inode.
- */
- rcu_read_lock();
-
- if (READ_ONCE(page->mapping) != mapping) {
- rcu_read_unlock();
- put_page(page);
-
- goto again;
- }
-
- inode = READ_ONCE(mapping->host);
- if (!inode) {
- rcu_read_unlock();
- put_page(page);
-
- goto again;
- }
-
- key->both.offset |= FUT_OFF_INODE; /* inode-based key */
- key->shared.i_seq = get_inode_sequence_number(inode);
- key->shared.pgoff = page_to_pgoff(tail);
- rcu_read_unlock();
- }
-
-out:
- put_page(page);
- return err;
-}
-
-/**
- * fault_in_user_writeable() - Fault in user address and verify RW access
- * @uaddr: pointer to faulting user space address
- *
- * Slow path to fixup the fault we just took in the atomic write
- * access to @uaddr.
- *
- * We have no generic implementation of a non-destructive write to the
- * user address. We know that we faulted in the atomic pagefault
- * disabled section so we can as well avoid the #PF overhead by
- * calling get_user_pages() right away.
- */
-static int fault_in_user_writeable(u32 __user *uaddr)
-{
- struct mm_struct *mm = current->mm;
- int ret;
-
- mmap_read_lock(mm);
- ret = fixup_user_fault(mm, (unsigned long)uaddr,
- FAULT_FLAG_WRITE, NULL);
- mmap_read_unlock(mm);
-
- return ret < 0 ? ret : 0;
-}
-
-/**
- * futex_top_waiter() - Return the highest priority waiter on a futex
- * @hb: the hash bucket the futex_q's reside in
- * @key: the futex key (to distinguish it from other futex futex_q's)
- *
- * Must be called with the hb lock held.
- */
-static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb,
- union futex_key *key)
-{
- struct futex_q *this;
-
- plist_for_each_entry(this, &hb->chain, list) {
- if (match_futex(&this->key, key))
- return this;
- }
- return NULL;
-}
-
-static int cmpxchg_futex_value_locked(u32 *curval, u32 __user *uaddr,
- u32 uval, u32 newval)
-{
- int ret;
-
- pagefault_disable();
- ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval);
- pagefault_enable();
-
- return ret;
-}
-
-static int get_futex_value_locked(u32 *dest, u32 __user *from)
-{
- int ret;
-
- pagefault_disable();
- ret = __get_user(*dest, from);
- pagefault_enable();
-
- return ret ? -EFAULT : 0;
-}
-
-
-/*
- * PI code:
- */
-static int refill_pi_state_cache(void)
-{
- struct futex_pi_state *pi_state;
-
- if (likely(current->pi_state_cache))
- return 0;
-
- pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL);
-
- if (!pi_state)
- return -ENOMEM;
-
- INIT_LIST_HEAD(&pi_state->list);
- /* pi_mutex gets initialized later */
- pi_state->owner = NULL;
- refcount_set(&pi_state->refcount, 1);
- pi_state->key = FUTEX_KEY_INIT;
-
- current->pi_state_cache = pi_state;
-
- return 0;
-}
-
-static struct futex_pi_state *alloc_pi_state(void)
-{
- struct futex_pi_state *pi_state = current->pi_state_cache;
-
- WARN_ON(!pi_state);
- current->pi_state_cache = NULL;
-
- return pi_state;
-}
-
-static void pi_state_update_owner(struct futex_pi_state *pi_state,
- struct task_struct *new_owner)
-{
- struct task_struct *old_owner = pi_state->owner;
-
- lockdep_assert_held(&pi_state->pi_mutex.wait_lock);
-
- if (old_owner) {
- raw_spin_lock(&old_owner->pi_lock);
- WARN_ON(list_empty(&pi_state->list));
- list_del_init(&pi_state->list);
- raw_spin_unlock(&old_owner->pi_lock);
- }
-
- if (new_owner) {
- raw_spin_lock(&new_owner->pi_lock);
- WARN_ON(!list_empty(&pi_state->list));
- list_add(&pi_state->list, &new_owner->pi_state_list);
- pi_state->owner = new_owner;
- raw_spin_unlock(&new_owner->pi_lock);
- }
-}
-
-static void get_pi_state(struct futex_pi_state *pi_state)
-{
- WARN_ON_ONCE(!refcount_inc_not_zero(&pi_state->refcount));
-}
-
-/*
- * Drops a reference to the pi_state object and frees or caches it
- * when the last reference is gone.
- */
-static void put_pi_state(struct futex_pi_state *pi_state)
-{
- if (!pi_state)
- return;
-
- if (!refcount_dec_and_test(&pi_state->refcount))
- return;
-
- /*
- * If pi_state->owner is NULL, the owner is most probably dying
- * and has cleaned up the pi_state already
- */
- if (pi_state->owner) {
- unsigned long flags;
-
- raw_spin_lock_irqsave(&pi_state->pi_mutex.wait_lock, flags);
- pi_state_update_owner(pi_state, NULL);
- rt_mutex_proxy_unlock(&pi_state->pi_mutex);
- raw_spin_unlock_irqrestore(&pi_state->pi_mutex.wait_lock, flags);
- }
-
- if (current->pi_state_cache) {
- kfree(pi_state);
- } else {
- /*
- * pi_state->list is already empty.
- * clear pi_state->owner.
- * refcount is at 0 - put it back to 1.
- */
- pi_state->owner = NULL;
- refcount_set(&pi_state->refcount, 1);
- current->pi_state_cache = pi_state;
- }
-}
-
-#ifdef CONFIG_FUTEX_PI
-
-/*
- * This task is holding PI mutexes at exit time => bad.
- * Kernel cleans up PI-state, but userspace is likely hosed.
- * (Robust-futex cleanup is separate and might save the day for userspace.)
- */
-static void exit_pi_state_list(struct task_struct *curr)
-{
- struct list_head *next, *head = &curr->pi_state_list;
- struct futex_pi_state *pi_state;
- struct futex_hash_bucket *hb;
- union futex_key key = FUTEX_KEY_INIT;
-
- if (!futex_cmpxchg_enabled)
- return;
- /*
- * We are a ZOMBIE and nobody can enqueue itself on
- * pi_state_list anymore, but we have to be careful
- * versus waiters unqueueing themselves:
- */
- raw_spin_lock_irq(&curr->pi_lock);
- while (!list_empty(head)) {
- next = head->next;
- pi_state = list_entry(next, struct futex_pi_state, list);
- key = pi_state->key;
- hb = hash_futex(&key);
-
- /*
- * We can race against put_pi_state() removing itself from the
- * list (a waiter going away). put_pi_state() will first
- * decrement the reference count and then modify the list, so
- * its possible to see the list entry but fail this reference
- * acquire.
- *
- * In that case; drop the locks to let put_pi_state() make
- * progress and retry the loop.
- */
- if (!refcount_inc_not_zero(&pi_state->refcount)) {
- raw_spin_unlock_irq(&curr->pi_lock);
- cpu_relax();
- raw_spin_lock_irq(&curr->pi_lock);
- continue;
- }
- raw_spin_unlock_irq(&curr->pi_lock);
-
- spin_lock(&hb->lock);
- raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
- raw_spin_lock(&curr->pi_lock);
- /*
- * We dropped the pi-lock, so re-check whether this
- * task still owns the PI-state:
- */
- if (head->next != next) {
- /* retain curr->pi_lock for the loop invariant */
- raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
- spin_unlock(&hb->lock);
- put_pi_state(pi_state);
- continue;
- }
-
- WARN_ON(pi_state->owner != curr);
- WARN_ON(list_empty(&pi_state->list));
- list_del_init(&pi_state->list);
- pi_state->owner = NULL;
-
- raw_spin_unlock(&curr->pi_lock);
- raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
- spin_unlock(&hb->lock);
-
- rt_mutex_futex_unlock(&pi_state->pi_mutex);
- put_pi_state(pi_state);
-
- raw_spin_lock_irq(&curr->pi_lock);
- }
- raw_spin_unlock_irq(&curr->pi_lock);
-}
-#else
-static inline void exit_pi_state_list(struct task_struct *curr) { }
-#endif
-
-/*
- * We need to check the following states:
- *
- * Waiter | pi_state | pi->owner | uTID | uODIED | ?
- *
- * [1] NULL | --- | --- | 0 | 0/1 | Valid
- * [2] NULL | --- | --- | >0 | 0/1 | Valid
- *
- * [3] Found | NULL | -- | Any | 0/1 | Invalid
- *
- * [4] Found | Found | NULL | 0 | 1 | Valid
- * [5] Found | Found | NULL | >0 | 1 | Invalid
- *
- * [6] Found | Found | task | 0 | 1 | Valid
- *
- * [7] Found | Found | NULL | Any | 0 | Invalid
- *
- * [8] Found | Found | task | ==taskTID | 0/1 | Valid
- * [9] Found | Found | task | 0 | 0 | Invalid
- * [10] Found | Found | task | !=taskTID | 0/1 | Invalid
- *
- * [1] Indicates that the kernel can acquire the futex atomically. We
- * came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit.
- *
- * [2] Valid, if TID does not belong to a kernel thread. If no matching
- * thread is found then it indicates that the owner TID has died.
- *
- * [3] Invalid. The waiter is queued on a non PI futex
- *
- * [4] Valid state after exit_robust_list(), which sets the user space
- * value to FUTEX_WAITERS | FUTEX_OWNER_DIED.
- *
- * [5] The user space value got manipulated between exit_robust_list()
- * and exit_pi_state_list()
- *
- * [6] Valid state after exit_pi_state_list() which sets the new owner in
- * the pi_state but cannot access the user space value.
- *
- * [7] pi_state->owner can only be NULL when the OWNER_DIED bit is set.
- *
- * [8] Owner and user space value match
- *
- * [9] There is no transient state which sets the user space TID to 0
- * except exit_robust_list(), but this is indicated by the
- * FUTEX_OWNER_DIED bit. See [4]
- *
- * [10] There is no transient state which leaves owner and user space
- * TID out of sync. Except one error case where the kernel is denied
- * write access to the user address, see fixup_pi_state_owner().
- *
- *
- * Serialization and lifetime rules:
- *
- * hb->lock:
- *
- * hb -> futex_q, relation
- * futex_q -> pi_state, relation
- *
- * (cannot be raw because hb can contain arbitrary amount
- * of futex_q's)
- *
- * pi_mutex->wait_lock:
- *
- * {uval, pi_state}
- *
- * (and pi_mutex 'obviously')
- *
- * p->pi_lock:
- *
- * p->pi_state_list -> pi_state->list, relation
- * pi_mutex->owner -> pi_state->owner, relation
- *
- * pi_state->refcount:
- *
- * pi_state lifetime
- *
- *
- * Lock order:
- *
- * hb->lock
- * pi_mutex->wait_lock
- * p->pi_lock
- *
- */
-
-/*
- * Validate that the existing waiter has a pi_state and sanity check
- * the pi_state against the user space value. If correct, attach to
- * it.
- */
-static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
- struct futex_pi_state *pi_state,
- struct futex_pi_state **ps)
-{
- pid_t pid = uval & FUTEX_TID_MASK;
- u32 uval2;
- int ret;
-
- /*
- * Userspace might have messed up non-PI and PI futexes [3]
- */
- if (unlikely(!pi_state))
- return -EINVAL;
-
- /*
- * We get here with hb->lock held, and having found a
- * futex_top_waiter(). This means that futex_lock_pi() of said futex_q
- * has dropped the hb->lock in between queue_me() and unqueue_me_pi(),
- * which in turn means that futex_lock_pi() still has a reference on
- * our pi_state.
- *
- * The waiter holding a reference on @pi_state also protects against
- * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi()
- * and futex_wait_requeue_pi() as it cannot go to 0 and consequently
- * free pi_state before we can take a reference ourselves.
- */
- WARN_ON(!refcount_read(&pi_state->refcount));
-
- /*
- * Now that we have a pi_state, we can acquire wait_lock
- * and do the state validation.
- */
- raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
-
- /*
- * Since {uval, pi_state} is serialized by wait_lock, and our current
- * uval was read without holding it, it can have changed. Verify it
- * still is what we expect it to be, otherwise retry the entire
- * operation.
- */
- if (get_futex_value_locked(&uval2, uaddr))
- goto out_efault;
-
- if (uval != uval2)
- goto out_eagain;
-
- /*
- * Handle the owner died case:
- */
- if (uval & FUTEX_OWNER_DIED) {
- /*
- * exit_pi_state_list sets owner to NULL and wakes the
- * topmost waiter. The task which acquires the
- * pi_state->rt_mutex will fixup owner.
- */
- if (!pi_state->owner) {
- /*
- * No pi state owner, but the user space TID
- * is not 0. Inconsistent state. [5]
- */
- if (pid)
- goto out_einval;
- /*
- * Take a ref on the state and return success. [4]
- */
- goto out_attach;
- }
-
- /*
- * If TID is 0, then either the dying owner has not
- * yet executed exit_pi_state_list() or some waiter
- * acquired the rtmutex in the pi state, but did not
- * yet fixup the TID in user space.
- *
- * Take a ref on the state and return success. [6]
- */
- if (!pid)
- goto out_attach;
- } else {
- /*
- * If the owner died bit is not set, then the pi_state
- * must have an owner. [7]
- */
- if (!pi_state->owner)
- goto out_einval;
- }
-
- /*
- * Bail out if user space manipulated the futex value. If pi
- * state exists then the owner TID must be the same as the
- * user space TID. [9/10]
- */
- if (pid != task_pid_vnr(pi_state->owner))
- goto out_einval;
-
-out_attach:
- get_pi_state(pi_state);
- raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
- *ps = pi_state;
- return 0;
-
-out_einval:
- ret = -EINVAL;
- goto out_error;
-
-out_eagain:
- ret = -EAGAIN;
- goto out_error;
-
-out_efault:
- ret = -EFAULT;
- goto out_error;
-
-out_error:
- raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
- return ret;
-}
-
-/**
- * wait_for_owner_exiting - Block until the owner has exited
- * @ret: owner's current futex lock status
- * @exiting: Pointer to the exiting task
- *
- * Caller must hold a refcount on @exiting.
- */
-static void wait_for_owner_exiting(int ret, struct task_struct *exiting)
-{
- if (ret != -EBUSY) {
- WARN_ON_ONCE(exiting);
- return;
- }
-
- if (WARN_ON_ONCE(ret == -EBUSY && !exiting))
- return;
-
- mutex_lock(&exiting->futex_exit_mutex);
- /*
- * No point in doing state checking here. If the waiter got here
- * while the task was in exec()->exec_futex_release() then it can
- * have any FUTEX_STATE_* value when the waiter has acquired the
- * mutex. OK, if running, EXITING or DEAD if it reached exit()
- * already. Highly unlikely and not a problem. Just one more round
- * through the futex maze.
- */
- mutex_unlock(&exiting->futex_exit_mutex);
-
- put_task_struct(exiting);
-}
-
-static int handle_exit_race(u32 __user *uaddr, u32 uval,
- struct task_struct *tsk)
-{
- u32 uval2;
-
- /*
- * If the futex exit state is not yet FUTEX_STATE_DEAD, tell the
- * caller that the alleged owner is busy.
- */
- if (tsk && tsk->futex_state != FUTEX_STATE_DEAD)
- return -EBUSY;
-
- /*
- * Reread the user space value to handle the following situation:
- *
- * CPU0 CPU1
- *
- * sys_exit() sys_futex()
- * do_exit() futex_lock_pi()
- * futex_lock_pi_atomic()
- * exit_signals(tsk) No waiters:
- * tsk->flags |= PF_EXITING; *uaddr == 0x00000PID
- * mm_release(tsk) Set waiter bit
- * exit_robust_list(tsk) { *uaddr = 0x80000PID;
- * Set owner died attach_to_pi_owner() {
- * *uaddr = 0xC0000000; tsk = get_task(PID);
- * } if (!tsk->flags & PF_EXITING) {
- * ... attach();
- * tsk->futex_state = } else {
- * FUTEX_STATE_DEAD; if (tsk->futex_state !=
- * FUTEX_STATE_DEAD)
- * return -EAGAIN;
- * return -ESRCH; <--- FAIL
- * }
- *
- * Returning ESRCH unconditionally is wrong here because the
- * user space value has been changed by the exiting task.
- *
- * The same logic applies to the case where the exiting task is
- * already gone.
- */
- if (get_futex_value_locked(&uval2, uaddr))
- return -EFAULT;
-
- /* If the user space value has changed, try again. */
- if (uval2 != uval)
- return -EAGAIN;
-
- /*
- * The exiting task did not have a robust list, the robust list was
- * corrupted or the user space value in *uaddr is simply bogus.
- * Give up and tell user space.
- */
- return -ESRCH;
-}
-
-/*
- * Lookup the task for the TID provided from user space and attach to
- * it after doing proper sanity checks.
- */
-static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key,
- struct futex_pi_state **ps,
- struct task_struct **exiting)
-{
- pid_t pid = uval & FUTEX_TID_MASK;
- struct futex_pi_state *pi_state;
- struct task_struct *p;
-
- /*
- * We are the first waiter - try to look up the real owner and attach
- * the new pi_state to it, but bail out when TID = 0 [1]
- *
- * The !pid check is paranoid. None of the call sites should end up
- * with pid == 0, but better safe than sorry. Let the caller retry
- */
- if (!pid)
- return -EAGAIN;
- p = find_get_task_by_vpid(pid);
- if (!p)
- return handle_exit_race(uaddr, uval, NULL);
-
- if (unlikely(p->flags & PF_KTHREAD)) {
- put_task_struct(p);
- return -EPERM;
- }
-
- /*
- * We need to look at the task state to figure out, whether the
- * task is exiting. To protect against the change of the task state
- * in futex_exit_release(), we do this protected by p->pi_lock:
- */
- raw_spin_lock_irq(&p->pi_lock);
- if (unlikely(p->futex_state != FUTEX_STATE_OK)) {
- /*
- * The task is on the way out. When the futex state is
- * FUTEX_STATE_DEAD, we know that the task has finished
- * the cleanup:
- */
- int ret = handle_exit_race(uaddr, uval, p);
-
- raw_spin_unlock_irq(&p->pi_lock);
- /*
- * If the owner task is between FUTEX_STATE_EXITING and
- * FUTEX_STATE_DEAD then store the task pointer and keep
- * the reference on the task struct. The calling code will
- * drop all locks, wait for the task to reach
- * FUTEX_STATE_DEAD and then drop the refcount. This is
- * required to prevent a live lock when the current task
- * preempted the exiting task between the two states.
- */
- if (ret == -EBUSY)
- *exiting = p;
- else
- put_task_struct(p);
- return ret;
- }
-
- /*
- * No existing pi state. First waiter. [2]
- *
- * This creates pi_state, we have hb->lock held, this means nothing can
- * observe this state, wait_lock is irrelevant.
- */
- pi_state = alloc_pi_state();
-
- /*
- * Initialize the pi_mutex in locked state and make @p
- * the owner of it:
- */
- rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);
-
- /* Store the key for possible exit cleanups: */
- pi_state->key = *key;
-
- WARN_ON(!list_empty(&pi_state->list));
- list_add(&pi_state->list, &p->pi_state_list);
- /*
- * Assignment without holding pi_state->pi_mutex.wait_lock is safe
- * because there is no concurrency as the object is not published yet.
- */
- pi_state->owner = p;
- raw_spin_unlock_irq(&p->pi_lock);
-
- put_task_struct(p);
-
- *ps = pi_state;
-
- return 0;
-}
-
-static int lookup_pi_state(u32 __user *uaddr, u32 uval,
- struct futex_hash_bucket *hb,
- union futex_key *key, struct futex_pi_state **ps,
- struct task_struct **exiting)
-{
- struct futex_q *top_waiter = futex_top_waiter(hb, key);
-
- /*
- * If there is a waiter on that futex, validate it and
- * attach to the pi_state when the validation succeeds.
- */
- if (top_waiter)
- return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);
-
- /*
- * We are the first waiter - try to look up the owner based on
- * @uval and attach to it.
- */
- return attach_to_pi_owner(uaddr, uval, key, ps, exiting);
-}
-
-static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
-{
- int err;
- u32 curval;
-
- if (unlikely(should_fail_futex(true)))
- return -EFAULT;
-
- err = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval);
- if (unlikely(err))
- return err;
-
- /* If user space value changed, let the caller retry */
- return curval != uval ? -EAGAIN : 0;
-}
-
-/**
- * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex
- * @uaddr: the pi futex user address
- * @hb: the pi futex hash bucket
- * @key: the futex key associated with uaddr and hb
- * @ps: the pi_state pointer where we store the result of the
- * lookup
- * @task: the task to perform the atomic lock work for. This will
- * be "current" except in the case of requeue pi.
- * @exiting: Pointer to store the task pointer of the owner task
- * which is in the middle of exiting
- * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)
- *
- * Return:
- * - 0 - ready to wait;
- * - 1 - acquired the lock;
- * - <0 - error
- *
- * The hb->lock and futex_key refs shall be held by the caller.
- *
- * @exiting is only set when the return value is -EBUSY. If so, this holds
- * a refcount on the exiting task on return and the caller needs to drop it
- * after waiting for the exit to complete.
- */
-static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
- union futex_key *key,
- struct futex_pi_state **ps,
- struct task_struct *task,
- struct task_struct **exiting,
- int set_waiters)
-{
- u32 uval, newval, vpid = task_pid_vnr(task);
- struct futex_q *top_waiter;
- int ret;
-
- /*
- * Read the user space value first so we can validate a few
- * things before proceeding further.
- */
- if (get_futex_value_locked(&uval, uaddr))
- return -EFAULT;
-
- if (unlikely(should_fail_futex(true)))
- return -EFAULT;
-
- /*
- * Detect deadlocks.
- */
- if ((unlikely((uval & FUTEX_TID_MASK) == vpid)))
- return -EDEADLK;
-
- if ((unlikely(should_fail_futex(true))))
- return -EDEADLK;
-
- /*
- * Lookup existing state first. If it exists, try to attach to
- * its pi_state.
- */
- top_waiter = futex_top_waiter(hb, key);
- if (top_waiter)
- return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);
-
- /*
- * No waiter and user TID is 0. We are here because the
- * waiters or the owner died bit is set or called from
- * requeue_cmp_pi or for whatever reason something took the
- * syscall.
- */
- if (!(uval & FUTEX_TID_MASK)) {
- /*
- * We take over the futex. No other waiters and the user space
- * TID is 0. We preserve the owner died bit.
- */
- newval = uval & FUTEX_OWNER_DIED;
- newval |= vpid;
-
- /* The futex requeue_pi code can enforce the waiters bit */
- if (set_waiters)
- newval |= FUTEX_WAITERS;
-
- ret = lock_pi_update_atomic(uaddr, uval, newval);
- /* If the take over worked, return 1 */
- return ret < 0 ? ret : 1;
- }
-
- /*
- * First waiter. Set the waiters bit before attaching ourself to
- * the owner. If owner tries to unlock, it will be forced into
- * the kernel and blocked on hb->lock.
- */
- newval = uval | FUTEX_WAITERS;
- ret = lock_pi_update_atomic(uaddr, uval, newval);
- if (ret)
- return ret;
- /*
- * If the update of the user space value succeeded, we try to
- * attach to the owner. If that fails, no harm done, we only
- * set the FUTEX_WAITERS bit in the user space variable.
- */
- return attach_to_pi_owner(uaddr, newval, key, ps, exiting);
-}
-
-/**
- * __unqueue_futex() - Remove the futex_q from its futex_hash_bucket
- * @q: The futex_q to unqueue
- *
- * The q->lock_ptr must not be NULL and must be held by the caller.
- */
-static void __unqueue_futex(struct futex_q *q)
-{
- struct futex_hash_bucket *hb;
-
- if (WARN_ON_SMP(!q->lock_ptr) || WARN_ON(plist_node_empty(&q->list)))
- return;
- lockdep_assert_held(q->lock_ptr);
-
- hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock);
- plist_del(&q->list, &hb->chain);
- hb_waiters_dec(hb);
-}
-
-/*
- * The hash bucket lock must be held when this is called.
- * Afterwards, the futex_q must not be accessed. Callers
- * must ensure to later call wake_up_q() for the actual
- * wakeups to occur.
- */
-static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q)
-{
- struct task_struct *p = q->task;
-
- if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n"))
- return;
-
- get_task_struct(p);
- __unqueue_futex(q);
- /*
- * The waiting task can free the futex_q as soon as q->lock_ptr = NULL
- * is written, without taking any locks. This is possible in the event
- * of a spurious wakeup, for example. A memory barrier is required here
- * to prevent the following store to lock_ptr from getting ahead of the
- * plist_del in __unqueue_futex().
- */
- smp_store_release(&q->lock_ptr, NULL);
-
- /*
- * Queue the task for later wakeup for after we've released
- * the hb->lock.
- */
- wake_q_add_safe(wake_q, p);
-}
-
-/*
- * Caller must hold a reference on @pi_state.
- */
-static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state)
-{
- u32 curval, newval;
- struct rt_mutex_waiter *top_waiter;
- struct task_struct *new_owner;
- bool postunlock = false;
- DEFINE_WAKE_Q(wake_q);
- int ret = 0;
-
- top_waiter = rt_mutex_top_waiter(&pi_state->pi_mutex);
- if (WARN_ON_ONCE(!top_waiter)) {
- /*
- * As per the comment in futex_unlock_pi() this should not happen.
- *
- * When this happens, give up our locks and try again, giving
- * the futex_lock_pi() instance time to complete, either by
- * waiting on the rtmutex or removing itself from the futex
- * queue.
- */
- ret = -EAGAIN;
- goto out_unlock;
- }
-
- new_owner = top_waiter->task;
-
- /*
- * We pass it to the next owner. The WAITERS bit is always kept
- * enabled while there is PI state around. We cleanup the owner
- * died bit, because we are the owner.
- */
- newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
-
- if (unlikely(should_fail_futex(true))) {
- ret = -EFAULT;
- goto out_unlock;
- }
-
- ret = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval);
- if (!ret && (curval != uval)) {
- /*
- * If a unconditional UNLOCK_PI operation (user space did not
- * try the TID->0 transition) raced with a waiter setting the
- * FUTEX_WAITERS flag between get_user() and locking the hash
- * bucket lock, retry the operation.
- */
- if ((FUTEX_TID_MASK & curval) == uval)
- ret = -EAGAIN;
- else
- ret = -EINVAL;
- }
-
- if (!ret) {
- /*
- * This is a point of no return; once we modified the uval
- * there is no going back and subsequent operations must
- * not fail.
- */
- pi_state_update_owner(pi_state, new_owner);
- postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);
- }
-
-out_unlock:
- raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
-
- if (postunlock)
- rt_mutex_postunlock(&wake_q);
-
- return ret;
-}
-
-/*
- * Express the locking dependencies for lockdep:
- */
-static inline void
-double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
-{
- if (hb1 <= hb2) {
- spin_lock(&hb1->lock);
- if (hb1 < hb2)
- spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING);
- } else { /* hb1 > hb2 */
- spin_lock(&hb2->lock);
- spin_lock_nested(&hb1->lock, SINGLE_DEPTH_NESTING);
- }
-}
-
-static inline void
-double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
-{
- spin_unlock(&hb1->lock);
- if (hb1 != hb2)
- spin_unlock(&hb2->lock);
-}
-
-/*
- * Wake up waiters matching bitset queued on this futex (uaddr).
- */
-static int
-futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
-{
- struct futex_hash_bucket *hb;
- struct futex_q *this, *next;
- union futex_key key = FUTEX_KEY_INIT;
- int ret;
- DEFINE_WAKE_Q(wake_q);
-
- if (!bitset)
- return -EINVAL;
-
- ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_READ);
- if (unlikely(ret != 0))
- return ret;
-
- hb = hash_futex(&key);
-
- /* Make sure we really have tasks to wakeup */
- if (!hb_waiters_pending(hb))
- return ret;
-
- spin_lock(&hb->lock);
-
- plist_for_each_entry_safe(this, next, &hb->chain, list) {
- if (match_futex (&this->key, &key)) {
- if (this->pi_state || this->rt_waiter) {
- ret = -EINVAL;
- break;
- }
-
- /* Check if one of the bits is set in both bitsets */
- if (!(this->bitset & bitset))
- continue;
-
- mark_wake_futex(&wake_q, this);
- if (++ret >= nr_wake)
- break;
- }
- }
-
- spin_unlock(&hb->lock);
- wake_up_q(&wake_q);
- return ret;
-}
-
-static int futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr)
-{
- unsigned int op = (encoded_op & 0x70000000) >> 28;
- unsigned int cmp = (encoded_op & 0x0f000000) >> 24;
- int oparg = sign_extend32((encoded_op & 0x00fff000) >> 12, 11);
- int cmparg = sign_extend32(encoded_op & 0x00000fff, 11);
- int oldval, ret;
-
- if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) {
- if (oparg < 0 || oparg > 31) {
- char comm[sizeof(current->comm)];
- /*
- * kill this print and return -EINVAL when userspace
- * is sane again
- */
- pr_info_ratelimited("futex_wake_op: %s tries to shift op by %d; fix this program\n",
- get_task_comm(comm, current), oparg);
- oparg &= 31;
- }
- oparg = 1 << oparg;
- }
-
- pagefault_disable();
- ret = arch_futex_atomic_op_inuser(op, oparg, &oldval, uaddr);
- pagefault_enable();
- if (ret)
- return ret;
-
- switch (cmp) {
- case FUTEX_OP_CMP_EQ:
- return oldval == cmparg;
- case FUTEX_OP_CMP_NE:
- return oldval != cmparg;
- case FUTEX_OP_CMP_LT:
- return oldval < cmparg;
- case FUTEX_OP_CMP_GE:
- return oldval >= cmparg;
- case FUTEX_OP_CMP_LE:
- return oldval <= cmparg;
- case FUTEX_OP_CMP_GT:
- return oldval > cmparg;
- default:
- return -ENOSYS;
- }
-}
-
-/*
- * Wake up all waiters hashed on the physical page that is mapped
- * to this virtual address:
- */
-static int
-futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
- int nr_wake, int nr_wake2, int op)
-{
- union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
- struct futex_hash_bucket *hb1, *hb2;
- struct futex_q *this, *next;
- int ret, op_ret;
- DEFINE_WAKE_Q(wake_q);
-
-retry:
- ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ);
- if (unlikely(ret != 0))
- return ret;
- ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE);
- if (unlikely(ret != 0))
- return ret;
-
- hb1 = hash_futex(&key1);
- hb2 = hash_futex(&key2);
-
-retry_private:
- double_lock_hb(hb1, hb2);
- op_ret = futex_atomic_op_inuser(op, uaddr2);
- if (unlikely(op_ret < 0)) {
- double_unlock_hb(hb1, hb2);
-
- if (!IS_ENABLED(CONFIG_MMU) ||
- unlikely(op_ret != -EFAULT && op_ret != -EAGAIN)) {
- /*
- * we don't get EFAULT from MMU faults if we don't have
- * an MMU, but we might get them from range checking
- */
- ret = op_ret;
- return ret;
- }
-
- if (op_ret == -EFAULT) {
- ret = fault_in_user_writeable(uaddr2);
- if (ret)
- return ret;
- }
-
- cond_resched();
- if (!(flags & FLAGS_SHARED))
- goto retry_private;
- goto retry;
- }
-
- plist_for_each_entry_safe(this, next, &hb1->chain, list) {
- if (match_futex (&this->key, &key1)) {
- if (this->pi_state || this->rt_waiter) {
- ret = -EINVAL;
- goto out_unlock;
- }
- mark_wake_futex(&wake_q, this);
- if (++ret >= nr_wake)
- break;
- }
- }
-
- if (op_ret > 0) {
- op_ret = 0;
- plist_for_each_entry_safe(this, next, &hb2->chain, list) {
- if (match_futex (&this->key, &key2)) {
- if (this->pi_state || this->rt_waiter) {
- ret = -EINVAL;
- goto out_unlock;
- }
- mark_wake_futex(&wake_q, this);
- if (++op_ret >= nr_wake2)
- break;
- }
- }
- ret += op_ret;
- }
-
-out_unlock:
- double_unlock_hb(hb1, hb2);
- wake_up_q(&wake_q);
- return ret;
-}
-
-/**
- * requeue_futex() - Requeue a futex_q from one hb to another
- * @q: the futex_q to requeue
- * @hb1: the source hash_bucket
- * @hb2: the target hash_bucket
- * @key2: the new key for the requeued futex_q
- */
-static inline
-void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
- struct futex_hash_bucket *hb2, union futex_key *key2)
-{
-
- /*
- * If key1 and key2 hash to the same bucket, no need to
- * requeue.
- */
- if (likely(&hb1->chain != &hb2->chain)) {
- plist_del(&q->list, &hb1->chain);
- hb_waiters_dec(hb1);
- hb_waiters_inc(hb2);
- plist_add(&q->list, &hb2->chain);
- q->lock_ptr = &hb2->lock;
- }
- q->key = *key2;
-}
-
-/**
- * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue
- * @q: the futex_q
- * @key: the key of the requeue target futex
- * @hb: the hash_bucket of the requeue target futex
- *
- * During futex_requeue, with requeue_pi=1, it is possible to acquire the
- * target futex if it is uncontended or via a lock steal. Set the futex_q key
- * to the requeue target futex so the waiter can detect the wakeup on the right
- * futex, but remove it from the hb and NULL the rt_waiter so it can detect
- * atomic lock acquisition. Set the q->lock_ptr to the requeue target hb->lock
- * to protect access to the pi_state to fixup the owner later. Must be called
- * with both q->lock_ptr and hb->lock held.
- */
-static inline
-void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
- struct futex_hash_bucket *hb)
-{
- q->key = *key;
-
- __unqueue_futex(q);
-
- WARN_ON(!q->rt_waiter);
- q->rt_waiter = NULL;
-
- q->lock_ptr = &hb->lock;
-
- wake_up_state(q->task, TASK_NORMAL);
-}
-
-/**
- * futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter
- * @pifutex: the user address of the to futex
- * @hb1: the from futex hash bucket, must be locked by the caller
- * @hb2: the to futex hash bucket, must be locked by the caller
- * @key1: the from futex key
- * @key2: the to futex key
- * @ps: address to store the pi_state pointer
- * @exiting: Pointer to store the task pointer of the owner task
- * which is in the middle of exiting
- * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)
- *
- * Try and get the lock on behalf of the top waiter if we can do it atomically.
- * Wake the top waiter if we succeed. If the caller specified set_waiters,
- * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit.
- * hb1 and hb2 must be held by the caller.
- *
- * @exiting is only set when the return value is -EBUSY. If so, this holds
- * a refcount on the exiting task on return and the caller needs to drop it
- * after waiting for the exit to complete.
- *
- * Return:
- * - 0 - failed to acquire the lock atomically;
- * - >0 - acquired the lock, return value is vpid of the top_waiter
- * - <0 - error
- */
-static int
-futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1,
- struct futex_hash_bucket *hb2, union futex_key *key1,
- union futex_key *key2, struct futex_pi_state **ps,
- struct task_struct **exiting, int set_waiters)
-{
- struct futex_q *top_waiter = NULL;
- u32 curval;
- int ret, vpid;
-
- if (get_futex_value_locked(&curval, pifutex))
- return -EFAULT;
-
- if (unlikely(should_fail_futex(true)))
- return -EFAULT;
-
- /*
- * Find the top_waiter and determine if there are additional waiters.
- * If the caller intends to requeue more than 1 waiter to pifutex,
- * force futex_lock_pi_atomic() to set the FUTEX_WAITERS bit now,
- * as we have means to handle the possible fault. If not, don't set
- * the bit unnecessarily as it will force the subsequent unlock to enter
- * the kernel.
- */
- top_waiter = futex_top_waiter(hb1, key1);
-
- /* There are no waiters, nothing for us to do. */
- if (!top_waiter)
- return 0;
-
- /* Ensure we requeue to the expected futex. */
- if (!match_futex(top_waiter->requeue_pi_key, key2))
- return -EINVAL;
-
- /*
- * Try to take the lock for top_waiter. Set the FUTEX_WAITERS bit in
- * the contended case or if set_waiters is 1. The pi_state is returned
- * in ps in contended cases.
- */
- vpid = task_pid_vnr(top_waiter->task);
- ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
- exiting, set_waiters);
- if (ret == 1) {
- requeue_pi_wake_futex(top_waiter, key2, hb2);
- return vpid;
- }
- return ret;
-}
-
-/**
- * futex_requeue() - Requeue waiters from uaddr1 to uaddr2
- * @uaddr1: source futex user address
- * @flags: futex flags (FLAGS_SHARED, etc.)
- * @uaddr2: target futex user address
- * @nr_wake: number of waiters to wake (must be 1 for requeue_pi)
- * @nr_requeue: number of waiters to requeue (0-INT_MAX)
- * @cmpval: @uaddr1 expected value (or %NULL)
- * @requeue_pi: if we are attempting to requeue from a non-pi futex to a
- * pi futex (pi to pi requeue is not supported)
- *
- * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
- * uaddr2 atomically on behalf of the top waiter.
- *
- * Return:
- * - >=0 - on success, the number of tasks requeued or woken;
- * - <0 - on error
- */
-static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
- u32 __user *uaddr2, int nr_wake, int nr_requeue,
- u32 *cmpval, int requeue_pi)
-{
- union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
- int task_count = 0, ret;
- struct futex_pi_state *pi_state = NULL;
- struct futex_hash_bucket *hb1, *hb2;
- struct futex_q *this, *next;
- DEFINE_WAKE_Q(wake_q);
-
- if (nr_wake < 0 || nr_requeue < 0)
- return -EINVAL;
-
- /*
- * When PI not supported: return -ENOSYS if requeue_pi is true,
- * consequently the compiler knows requeue_pi is always false past
- * this point which will optimize away all the conditional code
- * further down.
- */
- if (!IS_ENABLED(CONFIG_FUTEX_PI) && requeue_pi)
- return -ENOSYS;
-
- if (requeue_pi) {
- /*
- * Requeue PI only works on two distinct uaddrs. This
- * check is only valid for private futexes. See below.
- */
- if (uaddr1 == uaddr2)
- return -EINVAL;
-
- /*
- * requeue_pi requires a pi_state, try to allocate it now
- * without any locks in case it fails.
- */
- if (refill_pi_state_cache())
- return -ENOMEM;
- /*
- * requeue_pi must wake as many tasks as it can, up to nr_wake
- * + nr_requeue, since it acquires the rt_mutex prior to
- * returning to userspace, so as to not leave the rt_mutex with
- * waiters and no owner. However, second and third wake-ups
- * cannot be predicted as they involve race conditions with the
- * first wake and a fault while looking up the pi_state. Both
- * pthread_cond_signal() and pthread_cond_broadcast() should
- * use nr_wake=1.
- */
- if (nr_wake != 1)
- return -EINVAL;
- }
-
-retry:
- ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ);
- if (unlikely(ret != 0))
- return ret;
- ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2,
- requeue_pi ? FUTEX_WRITE : FUTEX_READ);
- if (unlikely(ret != 0))
- return ret;
-
- /*
- * The check above which compares uaddrs is not sufficient for
- * shared futexes. We need to compare the keys:
- */
- if (requeue_pi && match_futex(&key1, &key2))
- return -EINVAL;
-
- hb1 = hash_futex(&key1);
- hb2 = hash_futex(&key2);
-
-retry_private:
- hb_waiters_inc(hb2);
- double_lock_hb(hb1, hb2);
-
- if (likely(cmpval != NULL)) {
- u32 curval;
-
- ret = get_futex_value_locked(&curval, uaddr1);
-
- if (unlikely(ret)) {
- double_unlock_hb(hb1, hb2);
- hb_waiters_dec(hb2);
-
- ret = get_user(curval, uaddr1);
- if (ret)
- return ret;
-
- if (!(flags & FLAGS_SHARED))
- goto retry_private;
-
- goto retry;
- }
- if (curval != *cmpval) {
- ret = -EAGAIN;
- goto out_unlock;
- }
- }
-
- if (requeue_pi && (task_count - nr_wake < nr_requeue)) {
- struct task_struct *exiting = NULL;
-
- /*
- * Attempt to acquire uaddr2 and wake the top waiter. If we
- * intend to requeue waiters, force setting the FUTEX_WAITERS
- * bit. We force this here where we are able to easily handle
- * faults rather in the requeue loop below.
- */
- ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1,
- &key2, &pi_state,
- &exiting, nr_requeue);
-
- /*
- * At this point the top_waiter has either taken uaddr2 or is
- * waiting on it. If the former, then the pi_state will not
- * exist yet, look it up one more time to ensure we have a
- * reference to it. If the lock was taken, ret contains the
- * vpid of the top waiter task.
- * If the lock was not taken, we have pi_state and an initial
- * refcount on it. In case of an error we have nothing.
- */
- if (ret > 0) {
- WARN_ON(pi_state);
- task_count++;
- /*
- * If we acquired the lock, then the user space value
- * of uaddr2 should be vpid. It cannot be changed by
- * the top waiter as it is blocked on hb2 lock if it
- * tries to do so. If something fiddled with it behind
- * our back the pi state lookup might unearth it. So
- * we rather use the known value than rereading and
- * handing potential crap to lookup_pi_state.
- *
- * If that call succeeds then we have pi_state and an
- * initial refcount on it.
- */
- ret = lookup_pi_state(uaddr2, ret, hb2, &key2,
- &pi_state, &exiting);
- }
-
- switch (ret) {
- case 0:
- /* We hold a reference on the pi state. */
- break;
-
- /* If the above failed, then pi_state is NULL */
- case -EFAULT:
- double_unlock_hb(hb1, hb2);
- hb_waiters_dec(hb2);
- ret = fault_in_user_writeable(uaddr2);
- if (!ret)
- goto retry;
- return ret;
- case -EBUSY:
- case -EAGAIN:
- /*
- * Two reasons for this:
- * - EBUSY: Owner is exiting and we just wait for the
- * exit to complete.
- * - EAGAIN: The user space value changed.
- */
- double_unlock_hb(hb1, hb2);
- hb_waiters_dec(hb2);
- /*
- * Handle the case where the owner is in the middle of
- * exiting. Wait for the exit to complete otherwise
- * this task might loop forever, aka. live lock.
- */
- wait_for_owner_exiting(ret, exiting);
- cond_resched();
- goto retry;
- default:
- goto out_unlock;
- }
- }
-
- plist_for_each_entry_safe(this, next, &hb1->chain, list) {
- if (task_count - nr_wake >= nr_requeue)
- break;
-
- if (!match_futex(&this->key, &key1))
- continue;
-
- /*
- * FUTEX_WAIT_REQUEUE_PI and FUTEX_CMP_REQUEUE_PI should always
- * be paired with each other and no other futex ops.
- *
- * We should never be requeueing a futex_q with a pi_state,
- * which is awaiting a futex_unlock_pi().
- */
- if ((requeue_pi && !this->rt_waiter) ||
- (!requeue_pi && this->rt_waiter) ||
- this->pi_state) {
- ret = -EINVAL;
- break;
- }
-
- /*
- * Wake nr_wake waiters. For requeue_pi, if we acquired the
- * lock, we already woke the top_waiter. If not, it will be
- * woken by futex_unlock_pi().
- */
- if (++task_count <= nr_wake && !requeue_pi) {
- mark_wake_futex(&wake_q, this);
- continue;
- }
-
- /* Ensure we requeue to the expected futex for requeue_pi. */
- if (requeue_pi && !match_futex(this->requeue_pi_key, &key2)) {
- ret = -EINVAL;
- break;
- }
-
- /*
- * Requeue nr_requeue waiters and possibly one more in the case
- * of requeue_pi if we couldn't acquire the lock atomically.
- */
- if (requeue_pi) {
- /*
- * Prepare the waiter to take the rt_mutex. Take a
- * refcount on the pi_state and store the pointer in
- * the futex_q object of the waiter.
- */
- get_pi_state(pi_state);
- this->pi_state = pi_state;
- ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
- this->rt_waiter,
- this->task);
- if (ret == 1) {
- /*
- * We got the lock. We do neither drop the
- * refcount on pi_state nor clear
- * this->pi_state because the waiter needs the
- * pi_state for cleaning up the user space
- * value. It will drop the refcount after
- * doing so.
- */
- requeue_pi_wake_futex(this, &key2, hb2);
- continue;
- } else if (ret) {
- /*
- * rt_mutex_start_proxy_lock() detected a
- * potential deadlock when we tried to queue
- * that waiter. Drop the pi_state reference
- * which we took above and remove the pointer
- * to the state from the waiters futex_q
- * object.
- */
- this->pi_state = NULL;
- put_pi_state(pi_state);
- /*
- * We stop queueing more waiters and let user
- * space deal with the mess.
- */
- break;
- }
- }
- requeue_futex(this, hb1, hb2, &key2);
- }
-
- /*
- * We took an extra initial reference to the pi_state either
- * in futex_proxy_trylock_atomic() or in lookup_pi_state(). We
- * need to drop it here again.
- */
- put_pi_state(pi_state);
-
-out_unlock:
- double_unlock_hb(hb1, hb2);
- wake_up_q(&wake_q);
- hb_waiters_dec(hb2);
- return ret ? ret : task_count;
-}
-
-/* The key must be already stored in q->key. */
-static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
- __acquires(&hb->lock)
-{
- struct futex_hash_bucket *hb;
-
- hb = hash_futex(&q->key);
-
- /*
- * Increment the counter before taking the lock so that
- * a potential waker won't miss a to-be-slept task that is
- * waiting for the spinlock. This is safe as all queue_lock()
- * users end up calling queue_me(). Similarly, for housekeeping,
- * decrement the counter at queue_unlock() when some error has
- * occurred and we don't end up adding the task to the list.
- */
- hb_waiters_inc(hb); /* implies smp_mb(); (A) */
-
- q->lock_ptr = &hb->lock;
-
- spin_lock(&hb->lock);
- return hb;
-}
-
-static inline void
-queue_unlock(struct futex_hash_bucket *hb)
- __releases(&hb->lock)
-{
- spin_unlock(&hb->lock);
- hb_waiters_dec(hb);
-}
-
-static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
-{
- int prio;
-
- /*
- * The priority used to register this element is
- * - either the real thread-priority for the real-time threads
- * (i.e. threads with a priority lower than MAX_RT_PRIO)
- * - or MAX_RT_PRIO for non-RT threads.
- * Thus, all RT-threads are woken first in priority order, and
- * the others are woken last, in FIFO order.
- */
- prio = min(current->normal_prio, MAX_RT_PRIO);
-
- plist_node_init(&q->list, prio);
- plist_add(&q->list, &hb->chain);
- q->task = current;
-}
-
-/**
- * queue_me() - Enqueue the futex_q on the futex_hash_bucket
- * @q: The futex_q to enqueue
- * @hb: The destination hash bucket
- *
- * The hb->lock must be held by the caller, and is released here. A call to
- * queue_me() is typically paired with exactly one call to unqueue_me(). The
- * exceptions involve the PI related operations, which may use unqueue_me_pi()
- * or nothing if the unqueue is done as part of the wake process and the unqueue
- * state is implicit in the state of woken task (see futex_wait_requeue_pi() for
- * an example).
- */
-static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
- __releases(&hb->lock)
-{
- __queue_me(q, hb);
- spin_unlock(&hb->lock);
-}
-
-/**
- * unqueue_me() - Remove the futex_q from its futex_hash_bucket
- * @q: The futex_q to unqueue
- *
- * The q->lock_ptr must not be held by the caller. A call to unqueue_me() must
- * be paired with exactly one earlier call to queue_me().
- *
- * Return:
- * - 1 - if the futex_q was still queued (and we removed unqueued it);
- * - 0 - if the futex_q was already removed by the waking thread
- */
-static int unqueue_me(struct futex_q *q)
-{
- spinlock_t *lock_ptr;
- int ret = 0;
-
- /* In the common case we don't take the spinlock, which is nice. */
-retry:
- /*
- * q->lock_ptr can change between this read and the following spin_lock.
- * Use READ_ONCE to forbid the compiler from reloading q->lock_ptr and
- * optimizing lock_ptr out of the logic below.
- */
- lock_ptr = READ_ONCE(q->lock_ptr);
- if (lock_ptr != NULL) {
- spin_lock(lock_ptr);
- /*
- * q->lock_ptr can change between reading it and
- * spin_lock(), causing us to take the wrong lock. This
- * corrects the race condition.
- *
- * Reasoning goes like this: if we have the wrong lock,
- * q->lock_ptr must have changed (maybe several times)
- * between reading it and the spin_lock(). It can
- * change again after the spin_lock() but only if it was
- * already changed before the spin_lock(). It cannot,
- * however, change back to the original value. Therefore
- * we can detect whether we acquired the correct lock.
- */
- if (unlikely(lock_ptr != q->lock_ptr)) {
- spin_unlock(lock_ptr);
- goto retry;
- }
- __unqueue_futex(q);
-
- BUG_ON(q->pi_state);
-
- spin_unlock(lock_ptr);
- ret = 1;
- }
-
- return ret;
-}
-
-/*
- * PI futexes can not be requeued and must remove themselves from the
- * hash bucket. The hash bucket lock (i.e. lock_ptr) is held.
- */
-static void unqueue_me_pi(struct futex_q *q)
-{
- __unqueue_futex(q);
-
- BUG_ON(!q->pi_state);
- put_pi_state(q->pi_state);
- q->pi_state = NULL;
-}
-
-static int __fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
- struct task_struct *argowner)
-{
- struct futex_pi_state *pi_state = q->pi_state;
- struct task_struct *oldowner, *newowner;
- u32 uval, curval, newval, newtid;
- int err = 0;
-
- oldowner = pi_state->owner;
-
- /*
- * We are here because either:
- *
- * - we stole the lock and pi_state->owner needs updating to reflect
- * that (@argowner == current),
- *
- * or:
- *
- * - someone stole our lock and we need to fix things to point to the
- * new owner (@argowner == NULL).
- *
- * Either way, we have to replace the TID in the user space variable.
- * This must be atomic as we have to preserve the owner died bit here.
- *
- * Note: We write the user space value _before_ changing the pi_state
- * because we can fault here. Imagine swapped out pages or a fork
- * that marked all the anonymous memory readonly for cow.
- *
- * Modifying pi_state _before_ the user space value would leave the
- * pi_state in an inconsistent state when we fault here, because we
- * need to drop the locks to handle the fault. This might be observed
- * in the PID check in lookup_pi_state.
- */
-retry:
- if (!argowner) {
- if (oldowner != current) {
- /*
- * We raced against a concurrent self; things are
- * already fixed up. Nothing to do.
- */
- return 0;
- }
-
- if (__rt_mutex_futex_trylock(&pi_state->pi_mutex)) {
- /* We got the lock. pi_state is correct. Tell caller. */
- return 1;
- }
-
- /*
- * The trylock just failed, so either there is an owner or
- * there is a higher priority waiter than this one.
- */
- newowner = rt_mutex_owner(&pi_state->pi_mutex);
- /*
- * If the higher priority waiter has not yet taken over the
- * rtmutex then newowner is NULL. We can't return here with
- * that state because it's inconsistent vs. the user space
- * state. So drop the locks and try again. It's a valid
- * situation and not any different from the other retry
- * conditions.
- */
- if (unlikely(!newowner)) {
- err = -EAGAIN;
- goto handle_err;
- }
- } else {
- WARN_ON_ONCE(argowner != current);
- if (oldowner == current) {
- /*
- * We raced against a concurrent self; things are
- * already fixed up. Nothing to do.
- */
- return 1;
- }
- newowner = argowner;
- }
-
- newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
- /* Owner died? */
- if (!pi_state->owner)
- newtid |= FUTEX_OWNER_DIED;
-
- err = get_futex_value_locked(&uval, uaddr);
- if (err)
- goto handle_err;
-
- for (;;) {
- newval = (uval & FUTEX_OWNER_DIED) | newtid;
-
- err = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval);
- if (err)
- goto handle_err;
-
- if (curval == uval)
- break;
- uval = curval;
- }
-
- /*
- * We fixed up user space. Now we need to fix the pi_state
- * itself.
- */
- pi_state_update_owner(pi_state, newowner);
-
- return argowner == current;
-
- /*
- * In order to reschedule or handle a page fault, we need to drop the
- * locks here. In the case of a fault, this gives the other task
- * (either the highest priority waiter itself or the task which stole
- * the rtmutex) the chance to try the fixup of the pi_state. So once we
- * are back from handling the fault we need to check the pi_state after
- * reacquiring the locks and before trying to do another fixup. When
- * the fixup has been done already we simply return.
- *
- * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely
- * drop hb->lock since the caller owns the hb -> futex_q relation.
- * Dropping the pi_mutex->wait_lock requires the state revalidate.
- */
-handle_err:
- raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
- spin_unlock(q->lock_ptr);
-
- switch (err) {
- case -EFAULT:
- err = fault_in_user_writeable(uaddr);
- break;
-
- case -EAGAIN:
- cond_resched();
- err = 0;
- break;
-
- default:
- WARN_ON_ONCE(1);
- break;
- }
-
- spin_lock(q->lock_ptr);
- raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
-
- /*
- * Check if someone else fixed it for us:
- */
- if (pi_state->owner != oldowner)
- return argowner == current;
-
- /* Retry if err was -EAGAIN or the fault in succeeded */
- if (!err)
- goto retry;
-
- /*
- * fault_in_user_writeable() failed so user state is immutable. At
- * best we can make the kernel state consistent but user state will
- * be most likely hosed and any subsequent unlock operation will be
- * rejected due to PI futex rule [10].
- *
- * Ensure that the rtmutex owner is also the pi_state owner despite
- * the user space value claiming something different. There is no
- * point in unlocking the rtmutex if current is the owner as it
- * would need to wait until the next waiter has taken the rtmutex
- * to guarantee consistent state. Keep it simple. Userspace asked
- * for this wreckaged state.
- *
- * The rtmutex has an owner - either current or some other
- * task. See the EAGAIN loop above.
- */
- pi_state_update_owner(pi_state, rt_mutex_owner(&pi_state->pi_mutex));
-
- return err;
-}
-
-static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
- struct task_struct *argowner)
-{
- struct futex_pi_state *pi_state = q->pi_state;
- int ret;
-
- lockdep_assert_held(q->lock_ptr);
-
- raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
- ret = __fixup_pi_state_owner(uaddr, q, argowner);
- raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
- return ret;
-}
-
-static long futex_wait_restart(struct restart_block *restart);
-
-/**
- * fixup_owner() - Post lock pi_state and corner case management
- * @uaddr: user address of the futex
- * @q: futex_q (contains pi_state and access to the rt_mutex)
- * @locked: if the attempt to take the rt_mutex succeeded (1) or not (0)
- *
- * After attempting to lock an rt_mutex, this function is called to cleanup
- * the pi_state owner as well as handle race conditions that may allow us to
- * acquire the lock. Must be called with the hb lock held.
- *
- * Return:
- * - 1 - success, lock taken;
- * - 0 - success, lock not taken;
- * - <0 - on error (-EFAULT)
- */
-static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
-{
- if (locked) {
- /*
- * Got the lock. We might not be the anticipated owner if we
- * did a lock-steal - fix up the PI-state in that case:
- *
- * Speculative pi_state->owner read (we don't hold wait_lock);
- * since we own the lock pi_state->owner == current is the
- * stable state, anything else needs more attention.
- */
- if (q->pi_state->owner != current)
- return fixup_pi_state_owner(uaddr, q, current);
- return 1;
- }
-
- /*
- * If we didn't get the lock; check if anybody stole it from us. In
- * that case, we need to fix up the uval to point to them instead of
- * us, otherwise bad things happen. [10]
- *
- * Another speculative read; pi_state->owner == current is unstable
- * but needs our attention.
- */
- if (q->pi_state->owner == current)
- return fixup_pi_state_owner(uaddr, q, NULL);
-
- /*
- * Paranoia check. If we did not take the lock, then we should not be
- * the owner of the rt_mutex. Warn and establish consistent state.
- */
- if (WARN_ON_ONCE(rt_mutex_owner(&q->pi_state->pi_mutex) == current))
- return fixup_pi_state_owner(uaddr, q, current);
-
- return 0;
-}
-
-/**
- * futex_wait_queue_me() - queue_me() and wait for wakeup, timeout, or signal
- * @hb: the futex hash bucket, must be locked by the caller
- * @q: the futex_q to queue up on
- * @timeout: the prepared hrtimer_sleeper, or null for no timeout
- */
-static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
- struct hrtimer_sleeper *timeout)
-{
- /*
- * The task state is guaranteed to be set before another task can
- * wake it. set_current_state() is implemented using smp_store_mb() and
- * queue_me() calls spin_unlock() upon completion, both serializing
- * access to the hash list and forcing another memory barrier.
- */
- set_current_state(TASK_INTERRUPTIBLE);
- queue_me(q, hb);
-
- /* Arm the timer */
- if (timeout)
- hrtimer_sleeper_start_expires(timeout, HRTIMER_MODE_ABS);
-
- /*
- * If we have been removed from the hash list, then another task
- * has tried to wake us, and we can skip the call to schedule().
- */
- if (likely(!plist_node_empty(&q->list))) {
- /*
- * If the timer has already expired, current will already be
- * flagged for rescheduling. Only call schedule if there
- * is no timeout, or if it has yet to expire.
- */
- if (!timeout || timeout->task)
- freezable_schedule();
- }
- __set_current_state(TASK_RUNNING);
-}
-
-/**
- * futex_wait_setup() - Prepare to wait on a futex
- * @uaddr: the futex userspace address
- * @val: the expected value
- * @flags: futex flags (FLAGS_SHARED, etc.)
- * @q: the associated futex_q
- * @hb: storage for hash_bucket pointer to be returned to caller
- *
- * Setup the futex_q and locate the hash_bucket. Get the futex value and
- * compare it with the expected value. Handle atomic faults internally.
- * Return with the hb lock held and a q.key reference on success, and unlocked
- * with no q.key reference on failure.
- *
- * Return:
- * - 0 - uaddr contains val and hb has been locked;
- * - <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked
- */
-static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
- struct futex_q *q, struct futex_hash_bucket **hb)
-{
- u32 uval;
- int ret;
-
- /*
- * Access the page AFTER the hash-bucket is locked.
- * Order is important:
- *
- * Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val);
- * Userspace waker: if (cond(var)) { var = new; futex_wake(&var); }
- *
- * The basic logical guarantee of a futex is that it blocks ONLY
- * if cond(var) is known to be true at the time of blocking, for
- * any cond. If we locked the hash-bucket after testing *uaddr, that
- * would open a race condition where we could block indefinitely with
- * cond(var) false, which would violate the guarantee.
- *
- * On the other hand, we insert q and release the hash-bucket only
- * after testing *uaddr. This guarantees that futex_wait() will NOT
- * absorb a wakeup if *uaddr does not match the desired values
- * while the syscall executes.
- */
-retry:
- ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, FUTEX_READ);
- if (unlikely(ret != 0))
- return ret;
-
-retry_private:
- *hb = queue_lock(q);
-
- ret = get_futex_value_locked(&uval, uaddr);
-
- if (ret) {
- queue_unlock(*hb);
-
- ret = get_user(uval, uaddr);
- if (ret)
- return ret;
-
- if (!(flags & FLAGS_SHARED))
- goto retry_private;
-
- goto retry;
- }
-
- if (uval != val) {
- queue_unlock(*hb);
- ret = -EWOULDBLOCK;
- }
-
- return ret;
-}
-
-static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
- ktime_t *abs_time, u32 bitset)
-{
- struct hrtimer_sleeper timeout, *to;
- struct restart_block *restart;
- struct futex_hash_bucket *hb;
- struct futex_q q = futex_q_init;
- int ret;
-
- if (!bitset)
- return -EINVAL;
- q.bitset = bitset;
-
- to = futex_setup_timer(abs_time, &timeout, flags,
- current->timer_slack_ns);
-retry:
- /*
- * Prepare to wait on uaddr. On success, holds hb lock and increments
- * q.key refs.
- */
- ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
- if (ret)
- goto out;
-
- /* queue_me and wait for wakeup, timeout, or a signal. */
- futex_wait_queue_me(hb, &q, to);
-
- /* If we were woken (and unqueued), we succeeded, whatever. */
- ret = 0;
- /* unqueue_me() drops q.key ref */
- if (!unqueue_me(&q))
- goto out;
- ret = -ETIMEDOUT;
- if (to && !to->task)
- goto out;
-
- /*
- * We expect signal_pending(current), but we might be the
- * victim of a spurious wakeup as well.
- */
- if (!signal_pending(current))
- goto retry;
-
- ret = -ERESTARTSYS;
- if (!abs_time)
- goto out;
-
- restart = &current->restart_block;
- restart->futex.uaddr = uaddr;
- restart->futex.val = val;
- restart->futex.time = *abs_time;
- restart->futex.bitset = bitset;
- restart->futex.flags = flags | FLAGS_HAS_TIMEOUT;
-
- ret = set_restart_fn(restart, futex_wait_restart);
-
-out:
- if (to) {
- hrtimer_cancel(&to->timer);
- destroy_hrtimer_on_stack(&to->timer);
- }
- return ret;
-}
-
-
-static long futex_wait_restart(struct restart_block *restart)
-{
- u32 __user *uaddr = restart->futex.uaddr;
- ktime_t t, *tp = NULL;
-
- if (restart->futex.flags & FLAGS_HAS_TIMEOUT) {
- t = restart->futex.time;
- tp = &t;
- }
- restart->fn = do_no_restart_syscall;
-
- return (long)futex_wait(uaddr, restart->futex.flags,
- restart->futex.val, tp, restart->futex.bitset);
-}
-
-
-/*
- * Userspace tried a 0 -> TID atomic transition of the futex value
- * and failed. The kernel side here does the whole locking operation:
- * if there are waiters then it will block as a consequence of relying
- * on rt-mutexes, it does PI, etc. (Due to races the kernel might see
- * a 0 value of the futex too.).
- *
- * Also serves as futex trylock_pi()'ing, and due semantics.
- */
-static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
- ktime_t *time, int trylock)
-{
- struct hrtimer_sleeper timeout, *to;
- struct task_struct *exiting = NULL;
- struct rt_mutex_waiter rt_waiter;
- struct futex_hash_bucket *hb;
- struct futex_q q = futex_q_init;
- int res, ret;
-
- if (!IS_ENABLED(CONFIG_FUTEX_PI))
- return -ENOSYS;
-
- if (refill_pi_state_cache())
- return -ENOMEM;
-
- to = futex_setup_timer(time, &timeout, flags, 0);
-
-retry:
- ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, FUTEX_WRITE);
- if (unlikely(ret != 0))
- goto out;
-
-retry_private:
- hb = queue_lock(&q);
-
- ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current,
- &exiting, 0);
- if (unlikely(ret)) {
- /*
- * Atomic work succeeded and we got the lock,
- * or failed. Either way, we do _not_ block.
- */
- switch (ret) {
- case 1:
- /* We got the lock. */
- ret = 0;
- goto out_unlock_put_key;
- case -EFAULT:
- goto uaddr_faulted;
- case -EBUSY:
- case -EAGAIN:
- /*
- * Two reasons for this:
- * - EBUSY: Task is exiting and we just wait for the
- * exit to complete.
- * - EAGAIN: The user space value changed.
- */
- queue_unlock(hb);
- /*
- * Handle the case where the owner is in the middle of
- * exiting. Wait for the exit to complete otherwise
- * this task might loop forever, aka. live lock.
- */
- wait_for_owner_exiting(ret, exiting);
- cond_resched();
- goto retry;
- default:
- goto out_unlock_put_key;
- }
- }
-
- WARN_ON(!q.pi_state);
-
- /*
- * Only actually queue now that the atomic ops are done:
- */
- __queue_me(&q, hb);
-
- if (trylock) {
- ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex);
- /* Fixup the trylock return value: */
- ret = ret ? 0 : -EWOULDBLOCK;
- goto no_block;
- }
-
- rt_mutex_init_waiter(&rt_waiter);
-
- /*
- * On PREEMPT_RT_FULL, when hb->lock becomes an rt_mutex, we must not
- * hold it while doing rt_mutex_start_proxy(), because then it will
- * include hb->lock in the blocking chain, even through we'll not in
- * fact hold it while blocking. This will lead it to report -EDEADLK
- * and BUG when futex_unlock_pi() interleaves with this.
- *
- * Therefore acquire wait_lock while holding hb->lock, but drop the
- * latter before calling __rt_mutex_start_proxy_lock(). This
- * interleaves with futex_unlock_pi() -- which does a similar lock
- * handoff -- such that the latter can observe the futex_q::pi_state
- * before __rt_mutex_start_proxy_lock() is done.
- */
- raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
- spin_unlock(q.lock_ptr);
- /*
- * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter
- * such that futex_unlock_pi() is guaranteed to observe the waiter when
- * it sees the futex_q::pi_state.
- */
- ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
- raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);
-
- if (ret) {
- if (ret == 1)
- ret = 0;
- goto cleanup;
- }
-
- if (unlikely(to))
- hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS);
-
- ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);
-
-cleanup:
- spin_lock(q.lock_ptr);
- /*
- * If we failed to acquire the lock (deadlock/signal/timeout), we must
- * first acquire the hb->lock before removing the lock from the
- * rt_mutex waitqueue, such that we can keep the hb and rt_mutex wait
- * lists consistent.
- *
- * In particular; it is important that futex_unlock_pi() can not
- * observe this inconsistency.
- */
- if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
- ret = 0;
-
-no_block:
- /*
- * Fixup the pi_state owner and possibly acquire the lock if we
- * haven't already.
- */
- res = fixup_owner(uaddr, &q, !ret);
- /*
- * If fixup_owner() returned an error, propagate that. If it acquired
- * the lock, clear our -ETIMEDOUT or -EINTR.
- */
- if (res)
- ret = (res < 0) ? res : 0;
-
- unqueue_me_pi(&q);
- spin_unlock(q.lock_ptr);
- goto out;
-
-out_unlock_put_key:
- queue_unlock(hb);
-
-out:
- if (to) {
- hrtimer_cancel(&to->timer);
- destroy_hrtimer_on_stack(&to->timer);
- }
- return ret != -EINTR ? ret : -ERESTARTNOINTR;
-
-uaddr_faulted:
- queue_unlock(hb);
-
- ret = fault_in_user_writeable(uaddr);
- if (ret)
- goto out;
-
- if (!(flags & FLAGS_SHARED))
- goto retry_private;
-
- goto retry;
-}
-
-/*
- * Userspace attempted a TID -> 0 atomic transition, and failed.
- * This is the in-kernel slowpath: we look up the PI state (if any),
- * and do the rt-mutex unlock.
- */
-static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
-{
- u32 curval, uval, vpid = task_pid_vnr(current);
- union futex_key key = FUTEX_KEY_INIT;
- struct futex_hash_bucket *hb;
- struct futex_q *top_waiter;
- int ret;
-
- if (!IS_ENABLED(CONFIG_FUTEX_PI))
- return -ENOSYS;
-
-retry:
- if (get_user(uval, uaddr))
- return -EFAULT;
- /*
- * We release only a lock we actually own:
- */
- if ((uval & FUTEX_TID_MASK) != vpid)
- return -EPERM;
-
- ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_WRITE);
- if (ret)
- return ret;
-
- hb = hash_futex(&key);
- spin_lock(&hb->lock);
-
- /*
- * Check waiters first. We do not trust user space values at
- * all and we at least want to know if user space fiddled
- * with the futex value instead of blindly unlocking.
- */
- top_waiter = futex_top_waiter(hb, &key);
- if (top_waiter) {
- struct futex_pi_state *pi_state = top_waiter->pi_state;
-
- ret = -EINVAL;
- if (!pi_state)
- goto out_unlock;
-
- /*
- * If current does not own the pi_state then the futex is
- * inconsistent and user space fiddled with the futex value.
- */
- if (pi_state->owner != current)
- goto out_unlock;
-
- get_pi_state(pi_state);
- /*
- * By taking wait_lock while still holding hb->lock, we ensure
- * there is no point where we hold neither; and therefore
- * wake_futex_pi() must observe a state consistent with what we
- * observed.
- *
- * In particular; this forces __rt_mutex_start_proxy() to
- * complete such that we're guaranteed to observe the
- * rt_waiter. Also see the WARN in wake_futex_pi().
- */
- raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
- spin_unlock(&hb->lock);
-
- /* drops pi_state->pi_mutex.wait_lock */
- ret = wake_futex_pi(uaddr, uval, pi_state);
-
- put_pi_state(pi_state);
-
- /*
- * Success, we're done! No tricky corner cases.
- */
- if (!ret)
- return ret;
- /*
- * The atomic access to the futex value generated a
- * pagefault, so retry the user-access and the wakeup:
- */
- if (ret == -EFAULT)
- goto pi_faulted;
- /*
- * A unconditional UNLOCK_PI op raced against a waiter
- * setting the FUTEX_WAITERS bit. Try again.
- */
- if (ret == -EAGAIN)
- goto pi_retry;
- /*
- * wake_futex_pi has detected invalid state. Tell user
- * space.
- */
- return ret;
- }
-
- /*
- * We have no kernel internal state, i.e. no waiters in the
- * kernel. Waiters which are about to queue themselves are stuck
- * on hb->lock. So we can safely ignore them. We do neither
- * preserve the WAITERS bit not the OWNER_DIED one. We are the
- * owner.
- */
- if ((ret = cmpxchg_futex_value_locked(&curval, uaddr, uval, 0))) {
- spin_unlock(&hb->lock);
- switch (ret) {
- case -EFAULT:
- goto pi_faulted;
-
- case -EAGAIN:
- goto pi_retry;
-
- default:
- WARN_ON_ONCE(1);
- return ret;
- }
- }
-
- /*
- * If uval has changed, let user space handle it.
- */
- ret = (curval == uval) ? 0 : -EAGAIN;
-
-out_unlock:
- spin_unlock(&hb->lock);
- return ret;
-
-pi_retry:
- cond_resched();
- goto retry;
-
-pi_faulted:
-
- ret = fault_in_user_writeable(uaddr);
- if (!ret)
- goto retry;
-
- return ret;
-}
-
-/**
- * handle_early_requeue_pi_wakeup() - Detect early wakeup on the initial futex
- * @hb: the hash_bucket futex_q was original enqueued on
- * @q: the futex_q woken while waiting to be requeued
- * @key2: the futex_key of the requeue target futex
- * @timeout: the timeout associated with the wait (NULL if none)
- *
- * Detect if the task was woken on the initial futex as opposed to the requeue
- * target futex. If so, determine if it was a timeout or a signal that caused
- * the wakeup and return the appropriate error code to the caller. Must be
- * called with the hb lock held.
- *
- * Return:
- * - 0 = no early wakeup detected;
- * - <0 = -ETIMEDOUT or -ERESTARTNOINTR
- */
-static inline
-int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
- struct futex_q *q, union futex_key *key2,
- struct hrtimer_sleeper *timeout)
-{
- int ret = 0;
-
- /*
- * With the hb lock held, we avoid races while we process the wakeup.
- * We only need to hold hb (and not hb2) to ensure atomicity as the
- * wakeup code can't change q.key from uaddr to uaddr2 if we hold hb.
- * It can't be requeued from uaddr2 to something else since we don't
- * support a PI aware source futex for requeue.
- */
- if (!match_futex(&q->key, key2)) {
- WARN_ON(q->lock_ptr && (&hb->lock != q->lock_ptr));
- /*
- * We were woken prior to requeue by a timeout or a signal.
- * Unqueue the futex_q and determine which it was.
- */
- plist_del(&q->list, &hb->chain);
- hb_waiters_dec(hb);
-
- /* Handle spurious wakeups gracefully */
- ret = -EWOULDBLOCK;
- if (timeout && !timeout->task)
- ret = -ETIMEDOUT;
- else if (signal_pending(current))
- ret = -ERESTARTNOINTR;
- }
- return ret;
-}
-
-/**
- * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2
- * @uaddr: the futex we initially wait on (non-pi)
- * @flags: futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be
- * the same type, no requeueing from private to shared, etc.
- * @val: the expected value of uaddr
- * @abs_time: absolute timeout
- * @bitset: 32 bit wakeup bitset set by userspace, defaults to all
- * @uaddr2: the pi futex we will take prior to returning to user-space
- *
- * The caller will wait on uaddr and will be requeued by futex_requeue() to
- * uaddr2 which must be PI aware and unique from uaddr. Normal wakeup will wake
- * on uaddr2 and complete the acquisition of the rt_mutex prior to returning to
- * userspace. This ensures the rt_mutex maintains an owner when it has waiters;
- * without one, the pi logic would not know which task to boost/deboost, if
- * there was a need to.
- *
- * We call schedule in futex_wait_queue_me() when we enqueue and return there
- * via the following--
- * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue()
- * 2) wakeup on uaddr2 after a requeue
- * 3) signal
- * 4) timeout
- *
- * If 3, cleanup and return -ERESTARTNOINTR.
- *
- * If 2, we may then block on trying to take the rt_mutex and return via:
- * 5) successful lock
- * 6) signal
- * 7) timeout
- * 8) other lock acquisition failure
- *
- * If 6, return -EWOULDBLOCK (restarting the syscall would do the same).
- *
- * If 4 or 7, we cleanup and return with -ETIMEDOUT.
- *
- * Return:
- * - 0 - On success;
- * - <0 - On error
- */
-static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
- u32 val, ktime_t *abs_time, u32 bitset,
- u32 __user *uaddr2)
-{
- struct hrtimer_sleeper timeout, *to;
- struct rt_mutex_waiter rt_waiter;
- struct futex_hash_bucket *hb;
- union futex_key key2 = FUTEX_KEY_INIT;
- struct futex_q q = futex_q_init;
- int res, ret;
-
- if (!IS_ENABLED(CONFIG_FUTEX_PI))
- return -ENOSYS;
-
- if (uaddr == uaddr2)
- return -EINVAL;
-
- if (!bitset)
- return -EINVAL;
-
- to = futex_setup_timer(abs_time, &timeout, flags,
- current->timer_slack_ns);
-
- /*
- * The waiter is allocated on our stack, manipulated by the requeue
- * code while we sleep on uaddr.
- */
- rt_mutex_init_waiter(&rt_waiter);
-
- ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE);
- if (unlikely(ret != 0))
- goto out;
-
- q.bitset = bitset;
- q.rt_waiter = &rt_waiter;
- q.requeue_pi_key = &key2;
-
- /*
- * Prepare to wait on uaddr. On success, increments q.key (key1) ref
- * count.
- */
- ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
- if (ret)
- goto out;
-
- /*
- * The check above which compares uaddrs is not sufficient for
- * shared futexes. We need to compare the keys:
- */
- if (match_futex(&q.key, &key2)) {
- queue_unlock(hb);
- ret = -EINVAL;
- goto out;
- }
-
- /* Queue the futex_q, drop the hb lock, wait for wakeup. */
- futex_wait_queue_me(hb, &q, to);
-
- spin_lock(&hb->lock);
- ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
- spin_unlock(&hb->lock);
- if (ret)
- goto out;
-
- /*
- * In order for us to be here, we know our q.key == key2, and since
- * we took the hb->lock above, we also know that futex_requeue() has
- * completed and we no longer have to concern ourselves with a wakeup
- * race with the atomic proxy lock acquisition by the requeue code. The
- * futex_requeue dropped our key1 reference and incremented our key2
- * reference count.
- */
-
- /*
- * Check if the requeue code acquired the second futex for us and do
- * any pertinent fixup.
- */
- if (!q.rt_waiter) {
- if (q.pi_state && (q.pi_state->owner != current)) {
- spin_lock(q.lock_ptr);
- ret = fixup_owner(uaddr2, &q, true);
- /*
- * Drop the reference to the pi state which
- * the requeue_pi() code acquired for us.
- */
- put_pi_state(q.pi_state);
- spin_unlock(q.lock_ptr);
- /*
- * Adjust the return value. It's either -EFAULT or
- * success (1) but the caller expects 0 for success.
- */
- ret = ret < 0 ? ret : 0;
- }
- } else {
- struct rt_mutex *pi_mutex;
-
- /*
- * We have been woken up by futex_unlock_pi(), a timeout, or a
- * signal. futex_unlock_pi() will not destroy the lock_ptr nor
- * the pi_state.
- */
- WARN_ON(!q.pi_state);
- pi_mutex = &q.pi_state->pi_mutex;
- ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter);
-
- spin_lock(q.lock_ptr);
- if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter))
- ret = 0;
-
- debug_rt_mutex_free_waiter(&rt_waiter);
- /*
- * Fixup the pi_state owner and possibly acquire the lock if we
- * haven't already.
- */
- res = fixup_owner(uaddr2, &q, !ret);
- /*
- * If fixup_owner() returned an error, propagate that. If it
- * acquired the lock, clear -ETIMEDOUT or -EINTR.
- */
- if (res)
- ret = (res < 0) ? res : 0;
-
- unqueue_me_pi(&q);
- spin_unlock(q.lock_ptr);
- }
-
- if (ret == -EINTR) {
- /*
- * We've already been requeued, but cannot restart by calling
- * futex_lock_pi() directly. We could restart this syscall, but
- * it would detect that the user space "val" changed and return
- * -EWOULDBLOCK. Save the overhead of the restart and return
- * -EWOULDBLOCK directly.
- */
- ret = -EWOULDBLOCK;
- }
-
-out:
- if (to) {
- hrtimer_cancel(&to->timer);
- destroy_hrtimer_on_stack(&to->timer);
- }
- return ret;
-}
-
-/*
- * Support for robust futexes: the kernel cleans up held futexes at
- * thread exit time.
- *
- * Implementation: user-space maintains a per-thread list of locks it
- * is holding. Upon do_exit(), the kernel carefully walks this list,
- * and marks all locks that are owned by this thread with the
- * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is
- * always manipulated with the lock held, so the list is private and
- * per-thread. Userspace also maintains a per-thread 'list_op_pending'
- * field, to allow the kernel to clean up if the thread dies after
- * acquiring the lock, but just before it could have added itself to
- * the list. There can only be one such pending lock.
- */
-
-/**
- * sys_set_robust_list() - Set the robust-futex list head of a task
- * @head: pointer to the list-head
- * @len: length of the list-head, as userspace expects
- */
-SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head,
- size_t, len)
-{
- if (!futex_cmpxchg_enabled)
- return -ENOSYS;
- /*
- * The kernel knows only one size for now:
- */
- if (unlikely(len != sizeof(*head)))
- return -EINVAL;
-
- current->robust_list = head;
-
- return 0;
-}
-
-/**
- * sys_get_robust_list() - Get the robust-futex list head of a task
- * @pid: pid of the process [zero for current task]
- * @head_ptr: pointer to a list-head pointer, the kernel fills it in
- * @len_ptr: pointer to a length field, the kernel fills in the header size
- */
-SYSCALL_DEFINE3(get_robust_list, int, pid,
- struct robust_list_head __user * __user *, head_ptr,
- size_t __user *, len_ptr)
-{
- struct robust_list_head __user *head;
- unsigned long ret;
- struct task_struct *p;
-
- if (!futex_cmpxchg_enabled)
- return -ENOSYS;
-
- rcu_read_lock();
-
- ret = -ESRCH;
- if (!pid)
- p = current;
- else {
- p = find_task_by_vpid(pid);
- if (!p)
- goto err_unlock;
- }
-
- ret = -EPERM;
- if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS))
- goto err_unlock;
-
- head = p->robust_list;
- rcu_read_unlock();
-
- if (put_user(sizeof(*head), len_ptr))
- return -EFAULT;
- return put_user(head, head_ptr);
-
-err_unlock:
- rcu_read_unlock();
-
- return ret;
-}
-
-/* Constants for the pending_op argument of handle_futex_death */
-#define HANDLE_DEATH_PENDING true
-#define HANDLE_DEATH_LIST false
-
-/*
- * Process a futex-list entry, check whether it's owned by the
- * dying task, and do notification if so:
- */
-static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr,
- bool pi, bool pending_op)
-{
- u32 uval, nval, mval;
- int err;
-
- /* Futex address must be 32bit aligned */
- if ((((unsigned long)uaddr) % sizeof(*uaddr)) != 0)
- return -1;
-
-retry:
- if (get_user(uval, uaddr))
- return -1;
-
- /*
- * Special case for regular (non PI) futexes. The unlock path in
- * user space has two race scenarios:
- *
- * 1. The unlock path releases the user space futex value and
- * before it can execute the futex() syscall to wake up
- * waiters it is killed.
- *
- * 2. A woken up waiter is killed before it can acquire the
- * futex in user space.
- *
- * In both cases the TID validation below prevents a wakeup of
- * potential waiters which can cause these waiters to block
- * forever.
- *
- * In both cases the following conditions are met:
- *
- * 1) task->robust_list->list_op_pending != NULL
- * @pending_op == true
- * 2) User space futex value == 0
- * 3) Regular futex: @pi == false
- *
- * If these conditions are met, it is safe to attempt waking up a
- * potential waiter without touching the user space futex value and
- * trying to set the OWNER_DIED bit. The user space futex value is
- * uncontended and the rest of the user space mutex state is
- * consistent, so a woken waiter will just take over the
- * uncontended futex. Setting the OWNER_DIED bit would create
- * inconsistent state and malfunction of the user space owner died
- * handling.
- */
- if (pending_op && !pi && !uval) {
- futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
- return 0;
- }
-
- if ((uval & FUTEX_TID_MASK) != task_pid_vnr(curr))
- return 0;
-
- /*
- * Ok, this dying thread is truly holding a futex
- * of interest. Set the OWNER_DIED bit atomically
- * via cmpxchg, and if the value had FUTEX_WAITERS
- * set, wake up a waiter (if any). (We have to do a
- * futex_wake() even if OWNER_DIED is already set -
- * to handle the rare but possible case of recursive
- * thread-death.) The rest of the cleanup is done in
- * userspace.
- */
- mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
-
- /*
- * We are not holding a lock here, but we want to have
- * the pagefault_disable/enable() protection because
- * we want to handle the fault gracefully. If the
- * access fails we try to fault in the futex with R/W
- * verification via get_user_pages. get_user() above
- * does not guarantee R/W access. If that fails we
- * give up and leave the futex locked.
- */
- if ((err = cmpxchg_futex_value_locked(&nval, uaddr, uval, mval))) {
- switch (err) {
- case -EFAULT:
- if (fault_in_user_writeable(uaddr))
- return -1;
- goto retry;
-
- case -EAGAIN:
- cond_resched();
- goto retry;
-
- default:
- WARN_ON_ONCE(1);
- return err;
- }
- }
-
- if (nval != uval)
- goto retry;
-
- /*
- * Wake robust non-PI futexes here. The wakeup of
- * PI futexes happens in exit_pi_state():
- */
- if (!pi && (uval & FUTEX_WAITERS))
- futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
-
- return 0;
-}
-
-/*
- * Fetch a robust-list pointer. Bit 0 signals PI futexes:
- */
-static inline int fetch_robust_entry(struct robust_list __user **entry,
- struct robust_list __user * __user *head,
- unsigned int *pi)
-{
- unsigned long uentry;
-
- if (get_user(uentry, (unsigned long __user *)head))
- return -EFAULT;
-
- *entry = (void __user *)(uentry & ~1UL);
- *pi = uentry & 1;
-
- return 0;
-}
-
-/*
- * Walk curr->robust_list (very carefully, it's a userspace list!)
- * and mark any locks found there dead, and notify any waiters.
- *
- * We silently return on any sign of list-walking problem.
- */
-static void exit_robust_list(struct task_struct *curr)
-{
- struct robust_list_head __user *head = curr->robust_list;
- struct robust_list __user *entry, *next_entry, *pending;
- unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
- unsigned int next_pi;
- unsigned long futex_offset;
- int rc;
-
- if (!futex_cmpxchg_enabled)
- return;
-
- /*
- * Fetch the list head (which was registered earlier, via
- * sys_set_robust_list()):
- */
- if (fetch_robust_entry(&entry, &head->list.next, &pi))
- return;
- /*
- * Fetch the relative futex offset:
- */
- if (get_user(futex_offset, &head->futex_offset))
- return;
- /*
- * Fetch any possibly pending lock-add first, and handle it
- * if it exists:
- */
- if (fetch_robust_entry(&pending, &head->list_op_pending, &pip))
- return;
-
- next_entry = NULL; /* avoid warning with gcc */
- while (entry != &head->list) {
- /*
- * Fetch the next entry in the list before calling
- * handle_futex_death:
- */
- rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi);
- /*
- * A pending lock might already be on the list, so
- * don't process it twice:
- */
- if (entry != pending) {
- if (handle_futex_death((void __user *)entry + futex_offset,
- curr, pi, HANDLE_DEATH_LIST))
- return;
- }
- if (rc)
- return;
- entry = next_entry;
- pi = next_pi;
- /*
- * Avoid excessively long or circular lists:
- */
- if (!--limit)
- break;
-
- cond_resched();
- }
-
- if (pending) {
- handle_futex_death((void __user *)pending + futex_offset,
- curr, pip, HANDLE_DEATH_PENDING);
- }
-}
-
-static void futex_cleanup(struct task_struct *tsk)
-{
- if (unlikely(tsk->robust_list)) {
- exit_robust_list(tsk);
- tsk->robust_list = NULL;
- }
-
-#ifdef CONFIG_COMPAT
- if (unlikely(tsk->compat_robust_list)) {
- compat_exit_robust_list(tsk);
- tsk->compat_robust_list = NULL;
- }
-#endif
-
- if (unlikely(!list_empty(&tsk->pi_state_list)))
- exit_pi_state_list(tsk);
-}
-
-/**
- * futex_exit_recursive - Set the tasks futex state to FUTEX_STATE_DEAD
- * @tsk: task to set the state on
- *
- * Set the futex exit state of the task lockless. The futex waiter code
- * observes that state when a task is exiting and loops until the task has
- * actually finished the futex cleanup. The worst case for this is that the
- * waiter runs through the wait loop until the state becomes visible.
- *
- * This is called from the recursive fault handling path in do_exit().
- *
- * This is best effort. Either the futex exit code has run already or
- * not. If the OWNER_DIED bit has been set on the futex then the waiter can
- * take it over. If not, the problem is pushed back to user space. If the
- * futex exit code did not run yet, then an already queued waiter might
- * block forever, but there is nothing which can be done about that.
- */
-void futex_exit_recursive(struct task_struct *tsk)
-{
- /* If the state is FUTEX_STATE_EXITING then futex_exit_mutex is held */
- if (tsk->futex_state == FUTEX_STATE_EXITING)
- mutex_unlock(&tsk->futex_exit_mutex);
- tsk->futex_state = FUTEX_STATE_DEAD;
-}
-
-static void futex_cleanup_begin(struct task_struct *tsk)
-{
- /*
- * Prevent various race issues against a concurrent incoming waiter
- * including live locks by forcing the waiter to block on
- * tsk->futex_exit_mutex when it observes FUTEX_STATE_EXITING in
- * attach_to_pi_owner().
- */
- mutex_lock(&tsk->futex_exit_mutex);
-
- /*
- * Switch the state to FUTEX_STATE_EXITING under tsk->pi_lock.
- *
- * This ensures that all subsequent checks of tsk->futex_state in
- * attach_to_pi_owner() must observe FUTEX_STATE_EXITING with
- * tsk->pi_lock held.
- *
- * It guarantees also that a pi_state which was queued right before
- * the state change under tsk->pi_lock by a concurrent waiter must
- * be observed in exit_pi_state_list().
- */
- raw_spin_lock_irq(&tsk->pi_lock);
- tsk->futex_state = FUTEX_STATE_EXITING;
- raw_spin_unlock_irq(&tsk->pi_lock);
-}
-
-static void futex_cleanup_end(struct task_struct *tsk, int state)
-{
- /*
- * Lockless store. The only side effect is that an observer might
- * take another loop until it becomes visible.
- */
- tsk->futex_state = state;
- /*
- * Drop the exit protection. This unblocks waiters which observed
- * FUTEX_STATE_EXITING to reevaluate the state.
- */
- mutex_unlock(&tsk->futex_exit_mutex);
-}
-
-void futex_exec_release(struct task_struct *tsk)
-{
- /*
- * The state handling is done for consistency, but in the case of
- * exec() there is no way to prevent further damage as the PID stays
- * the same. But for the unlikely and arguably buggy case that a
- * futex is held on exec(), this provides at least as much state
- * consistency protection which is possible.
- */
- futex_cleanup_begin(tsk);
- futex_cleanup(tsk);
- /*
- * Reset the state to FUTEX_STATE_OK. The task is alive and about
- * exec a new binary.
- */
- futex_cleanup_end(tsk, FUTEX_STATE_OK);
-}
-
-void futex_exit_release(struct task_struct *tsk)
-{
- futex_cleanup_begin(tsk);
- futex_cleanup(tsk);
- futex_cleanup_end(tsk, FUTEX_STATE_DEAD);
-}
-
-long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
- u32 __user *uaddr2, u32 val2, u32 val3)
-{
- int cmd = op & FUTEX_CMD_MASK;
- unsigned int flags = 0;
-
- if (!(op & FUTEX_PRIVATE_FLAG))
- flags |= FLAGS_SHARED;
-
- if (op & FUTEX_CLOCK_REALTIME) {
- flags |= FLAGS_CLOCKRT;
- if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI &&
- cmd != FUTEX_LOCK_PI2)
- return -ENOSYS;
- }
-
- switch (cmd) {
- case FUTEX_LOCK_PI:
- case FUTEX_LOCK_PI2:
- case FUTEX_UNLOCK_PI:
- case FUTEX_TRYLOCK_PI:
- case FUTEX_WAIT_REQUEUE_PI:
- case FUTEX_CMP_REQUEUE_PI:
- if (!futex_cmpxchg_enabled)
- return -ENOSYS;
- }
-
- switch (cmd) {
- case FUTEX_WAIT:
- val3 = FUTEX_BITSET_MATCH_ANY;
- fallthrough;
- case FUTEX_WAIT_BITSET:
- return futex_wait(uaddr, flags, val, timeout, val3);
- case FUTEX_WAKE:
- val3 = FUTEX_BITSET_MATCH_ANY;
- fallthrough;
- case FUTEX_WAKE_BITSET:
- return futex_wake(uaddr, flags, val, val3);
- case FUTEX_REQUEUE:
- return futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0);
- case FUTEX_CMP_REQUEUE:
- return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0);
- case FUTEX_WAKE_OP:
- return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);
- case FUTEX_LOCK_PI:
- flags |= FLAGS_CLOCKRT;
- fallthrough;
- case FUTEX_LOCK_PI2:
- return futex_lock_pi(uaddr, flags, timeout, 0);
- case FUTEX_UNLOCK_PI:
- return futex_unlock_pi(uaddr, flags);
- case FUTEX_TRYLOCK_PI:
- return futex_lock_pi(uaddr, flags, NULL, 1);
- case FUTEX_WAIT_REQUEUE_PI:
- val3 = FUTEX_BITSET_MATCH_ANY;
- return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3,
- uaddr2);
- case FUTEX_CMP_REQUEUE_PI:
- return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1);
- }
- return -ENOSYS;
-}
-
-static __always_inline bool futex_cmd_has_timeout(u32 cmd)
-{
- switch (cmd) {
- case FUTEX_WAIT:
- case FUTEX_LOCK_PI:
- case FUTEX_LOCK_PI2:
- case FUTEX_WAIT_BITSET:
- case FUTEX_WAIT_REQUEUE_PI:
- return true;
- }
- return false;
-}
-
-static __always_inline int
-futex_init_timeout(u32 cmd, u32 op, struct timespec64 *ts, ktime_t *t)
-{
- if (!timespec64_valid(ts))
- return -EINVAL;
-
- *t = timespec64_to_ktime(*ts);
- if (cmd == FUTEX_WAIT)
- *t = ktime_add_safe(ktime_get(), *t);
- else if (cmd != FUTEX_LOCK_PI && !(op & FUTEX_CLOCK_REALTIME))
- *t = timens_ktime_to_host(CLOCK_MONOTONIC, *t);
- return 0;
-}
-
-SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
- const struct __kernel_timespec __user *, utime,
- u32 __user *, uaddr2, u32, val3)
-{
- int ret, cmd = op & FUTEX_CMD_MASK;
- ktime_t t, *tp = NULL;
- struct timespec64 ts;
-
- if (utime && futex_cmd_has_timeout(cmd)) {
- if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG))))
- return -EFAULT;
- if (get_timespec64(&ts, utime))
- return -EFAULT;
- ret = futex_init_timeout(cmd, op, &ts, &t);
- if (ret)
- return ret;
- tp = &t;
- }
-
- return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3);
-}
-
-#ifdef CONFIG_COMPAT
-/*
- * Fetch a robust-list pointer. Bit 0 signals PI futexes:
- */
-static inline int
-compat_fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
- compat_uptr_t __user *head, unsigned int *pi)
-{
- if (get_user(*uentry, head))
- return -EFAULT;
-
- *entry = compat_ptr((*uentry) & ~1);
- *pi = (unsigned int)(*uentry) & 1;
-
- return 0;
-}
-
-static void __user *futex_uaddr(struct robust_list __user *entry,
- compat_long_t futex_offset)
-{
- compat_uptr_t base = ptr_to_compat(entry);
- void __user *uaddr = compat_ptr(base + futex_offset);
-
- return uaddr;
-}
-
-/*
- * Walk curr->robust_list (very carefully, it's a userspace list!)
- * and mark any locks found there dead, and notify any waiters.
- *
- * We silently return on any sign of list-walking problem.
- */
-static void compat_exit_robust_list(struct task_struct *curr)
-{
- struct compat_robust_list_head __user *head = curr->compat_robust_list;
- struct robust_list __user *entry, *next_entry, *pending;
- unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
- unsigned int next_pi;
- compat_uptr_t uentry, next_uentry, upending;
- compat_long_t futex_offset;
- int rc;
-
- if (!futex_cmpxchg_enabled)
- return;
-
- /*
- * Fetch the list head (which was registered earlier, via
- * sys_set_robust_list()):
- */
- if (compat_fetch_robust_entry(&uentry, &entry, &head->list.next, &pi))
- return;
- /*
- * Fetch the relative futex offset:
- */
- if (get_user(futex_offset, &head->futex_offset))
- return;
- /*
- * Fetch any possibly pending lock-add first, and handle it
- * if it exists:
- */
- if (compat_fetch_robust_entry(&upending, &pending,
- &head->list_op_pending, &pip))
- return;
-
- next_entry = NULL; /* avoid warning with gcc */
- while (entry != (struct robust_list __user *) &head->list) {
- /*
- * Fetch the next entry in the list before calling
- * handle_futex_death:
- */
- rc = compat_fetch_robust_entry(&next_uentry, &next_entry,
- (compat_uptr_t __user *)&entry->next, &next_pi);
- /*
- * A pending lock might already be on the list, so
- * dont process it twice:
- */
- if (entry != pending) {
- void __user *uaddr = futex_uaddr(entry, futex_offset);
-
- if (handle_futex_death(uaddr, curr, pi,
- HANDLE_DEATH_LIST))
- return;
- }
- if (rc)
- return;
- uentry = next_uentry;
- entry = next_entry;
- pi = next_pi;
- /*
- * Avoid excessively long or circular lists:
- */
- if (!--limit)
- break;
-
- cond_resched();
- }
- if (pending) {
- void __user *uaddr = futex_uaddr(pending, futex_offset);
-
- handle_futex_death(uaddr, curr, pip, HANDLE_DEATH_PENDING);
- }
-}
-
-COMPAT_SYSCALL_DEFINE2(set_robust_list,
- struct compat_robust_list_head __user *, head,
- compat_size_t, len)
-{
- if (!futex_cmpxchg_enabled)
- return -ENOSYS;
-
- if (unlikely(len != sizeof(*head)))
- return -EINVAL;
-
- current->compat_robust_list = head;
-
- return 0;
-}
-
-COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid,
- compat_uptr_t __user *, head_ptr,
- compat_size_t __user *, len_ptr)
-{
- struct compat_robust_list_head __user *head;
- unsigned long ret;
- struct task_struct *p;
-
- if (!futex_cmpxchg_enabled)
- return -ENOSYS;
-
- rcu_read_lock();
-
- ret = -ESRCH;
- if (!pid)
- p = current;
- else {
- p = find_task_by_vpid(pid);
- if (!p)
- goto err_unlock;
- }
-
- ret = -EPERM;
- if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS))
- goto err_unlock;
-
- head = p->compat_robust_list;
- rcu_read_unlock();
-
- if (put_user(sizeof(*head), len_ptr))
- return -EFAULT;
- return put_user(ptr_to_compat(head), head_ptr);
-
-err_unlock:
- rcu_read_unlock();
-
- return ret;
-}
-#endif /* CONFIG_COMPAT */
-
-#ifdef CONFIG_COMPAT_32BIT_TIME
-SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val,
- const struct old_timespec32 __user *, utime, u32 __user *, uaddr2,
- u32, val3)
-{
- int ret, cmd = op & FUTEX_CMD_MASK;
- ktime_t t, *tp = NULL;
- struct timespec64 ts;
-
- if (utime && futex_cmd_has_timeout(cmd)) {
- if (get_old_timespec32(&ts, utime))
- return -EFAULT;
- ret = futex_init_timeout(cmd, op, &ts, &t);
- if (ret)
- return ret;
- tp = &t;
- }
-
- return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3);
-}
-#endif /* CONFIG_COMPAT_32BIT_TIME */
-
-static void __init futex_detect_cmpxchg(void)
-{
-#ifndef CONFIG_HAVE_FUTEX_CMPXCHG
- u32 curval;
-
- /*
- * This will fail and we want it. Some arch implementations do
- * runtime detection of the futex_atomic_cmpxchg_inatomic()
- * functionality. We want to know that before we call in any
- * of the complex code paths. Also we want to prevent
- * registration of robust lists in that case. NULL is
- * guaranteed to fault and we get -EFAULT on functional
- * implementation, the non-functional ones will return
- * -ENOSYS.
- */
- if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT)
- futex_cmpxchg_enabled = 1;
-#endif
-}
-
-static int __init futex_init(void)
-{
- unsigned int futex_shift;
- unsigned long i;
-
-#if CONFIG_BASE_SMALL
- futex_hashsize = 16;
-#else
- futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus());
-#endif
-
- futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues),
- futex_hashsize, 0,
- futex_hashsize < 256 ? HASH_SMALL : 0,
- &futex_shift, NULL,
- futex_hashsize, futex_hashsize);
- futex_hashsize = 1UL << futex_shift;
-
- futex_detect_cmpxchg();
-
- for (i = 0; i < futex_hashsize; i++) {
- atomic_set(&futex_queues[i].waiters, 0);
- plist_head_init(&futex_queues[i].chain);
- spin_lock_init(&futex_queues[i].lock);
- }
-
- return 0;
-}
-core_initcall(futex_init);
diff --git a/kernel/futex/Makefile b/kernel/futex/Makefile
new file mode 100644
index 000000000000..b77188d1fa07
--- /dev/null
+++ b/kernel/futex/Makefile
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0
+
+obj-y += core.o syscalls.o pi.o requeue.o waitwake.o
diff --git a/kernel/futex/core.c b/kernel/futex/core.c
new file mode 100644
index 000000000000..514e4582b863
--- /dev/null
+++ b/kernel/futex/core.c
@@ -0,0 +1,1149 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Fast Userspace Mutexes (which I call "Futexes!").
+ * (C) Rusty Russell, IBM 2002
+ *
+ * Generalized futexes, futex requeueing, misc fixes by Ingo Molnar
+ * (C) Copyright 2003 Red Hat Inc, All Rights Reserved
+ *
+ * Removed page pinning, fix privately mapped COW pages and other cleanups
+ * (C) Copyright 2003, 2004 Jamie Lokier
+ *
+ * Robust futex support started by Ingo Molnar
+ * (C) Copyright 2006 Red Hat Inc, All Rights Reserved
+ * Thanks to Thomas Gleixner for suggestions, analysis and fixes.
+ *
+ * PI-futex support started by Ingo Molnar and Thomas Gleixner
+ * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
+ *
+ * PRIVATE futexes by Eric Dumazet
+ * Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com>
+ *
+ * Requeue-PI support by Darren Hart <dvhltc@us.ibm.com>
+ * Copyright (C) IBM Corporation, 2009
+ * Thanks to Thomas Gleixner for conceptual design and careful reviews.
+ *
+ * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
+ * enough at me, Linus for the original (flawed) idea, Matthew
+ * Kirkwood for proof-of-concept implementation.
+ *
+ * "The futexes are also cursed."
+ * "But they come in a choice of three flavours!"
+ */
+#include <linux/compat.h>
+#include <linux/jhash.h>
+#include <linux/pagemap.h>
+#include <linux/memblock.h>
+#include <linux/fault-inject.h>
+#include <linux/slab.h>
+
+#include "futex.h"
+#include "../locking/rtmutex_common.h"
+
+/*
+ * The base of the bucket array and its size are always used together
+ * (after initialization only in futex_hash()), so ensure that they
+ * reside in the same cacheline.
+ */
+static struct {
+ struct futex_hash_bucket *queues;
+ unsigned long hashsize;
+} __futex_data __read_mostly __aligned(2*sizeof(long));
+#define futex_queues (__futex_data.queues)
+#define futex_hashsize (__futex_data.hashsize)
+
+
+/*
+ * Fault injections for futexes.
+ */
+#ifdef CONFIG_FAIL_FUTEX
+
+static struct {
+ struct fault_attr attr;
+
+ bool ignore_private;
+} fail_futex = {
+ .attr = FAULT_ATTR_INITIALIZER,
+ .ignore_private = false,
+};
+
+static int __init setup_fail_futex(char *str)
+{
+ return setup_fault_attr(&fail_futex.attr, str);
+}
+__setup("fail_futex=", setup_fail_futex);
+
+bool should_fail_futex(bool fshared)
+{
+ if (fail_futex.ignore_private && !fshared)
+ return false;
+
+ return should_fail(&fail_futex.attr, 1);
+}
+
+#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
+
+static int __init fail_futex_debugfs(void)
+{
+ umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
+ struct dentry *dir;
+
+ dir = fault_create_debugfs_attr("fail_futex", NULL,
+ &fail_futex.attr);
+ if (IS_ERR(dir))
+ return PTR_ERR(dir);
+
+ debugfs_create_bool("ignore-private", mode, dir,
+ &fail_futex.ignore_private);
+ return 0;
+}
+
+late_initcall(fail_futex_debugfs);
+
+#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
+
+#endif /* CONFIG_FAIL_FUTEX */
+
+/**
+ * futex_hash - Return the hash bucket in the global hash
+ * @key: Pointer to the futex key for which the hash is calculated
+ *
+ * We hash on the keys returned from get_futex_key (see below) and return the
+ * corresponding hash bucket in the global hash.
+ */
+struct futex_hash_bucket *futex_hash(union futex_key *key)
+{
+ u32 hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / 4,
+ key->both.offset);
+
+ return &futex_queues[hash & (futex_hashsize - 1)];
+}
+
+
+/**
+ * futex_setup_timer - set up the sleeping hrtimer.
+ * @time: ptr to the given timeout value
+ * @timeout: the hrtimer_sleeper structure to be set up
+ * @flags: futex flags
+ * @range_ns: optional range in ns
+ *
+ * Return: Initialized hrtimer_sleeper structure or NULL if no timeout
+ * value given
+ */
+struct hrtimer_sleeper *
+futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout,
+ int flags, u64 range_ns)
+{
+ if (!time)
+ return NULL;
+
+ hrtimer_init_sleeper_on_stack(timeout, (flags & FLAGS_CLOCKRT) ?
+ CLOCK_REALTIME : CLOCK_MONOTONIC,
+ HRTIMER_MODE_ABS);
+ /*
+ * If range_ns is 0, calling hrtimer_set_expires_range_ns() is
+ * effectively the same as calling hrtimer_set_expires().
+ */
+ hrtimer_set_expires_range_ns(&timeout->timer, *time, range_ns);
+
+ return timeout;
+}
+
+/*
+ * Generate a machine wide unique identifier for this inode.
+ *
+ * This relies on u64 not wrapping in the life-time of the machine; which with
+ * 1ns resolution means almost 585 years.
+ *
+ * This further relies on the fact that a well formed program will not unmap
+ * the file while it has a (shared) futex waiting on it. This mapping will have
+ * a file reference which pins the mount and inode.
+ *
+ * If for some reason an inode gets evicted and read back in again, it will get
+ * a new sequence number and will _NOT_ match, even though it is the exact same
+ * file.
+ *
+ * It is important that futex_match() will never have a false-positive, esp.
+ * for PI futexes that can mess up the state. The above argues that false-negatives
+ * are only possible for malformed programs.
+ */
+static u64 get_inode_sequence_number(struct inode *inode)
+{
+ static atomic64_t i_seq;
+ u64 old;
+
+ /* Does the inode already have a sequence number? */
+ old = atomic64_read(&inode->i_sequence);
+ if (likely(old))
+ return old;
+
+ for (;;) {
+ u64 new = atomic64_add_return(1, &i_seq);
+ if (WARN_ON_ONCE(!new))
+ continue;
+
+ old = atomic64_cmpxchg_relaxed(&inode->i_sequence, 0, new);
+ if (old)
+ return old;
+ return new;
+ }
+}
+
+/**
+ * get_futex_key() - Get parameters which are the keys for a futex
+ * @uaddr: virtual address of the futex
+ * @fshared: false for a PROCESS_PRIVATE futex, true for PROCESS_SHARED
+ * @key: address where result is stored.
+ * @rw: mapping needs to be read/write (values: FUTEX_READ,
+ * FUTEX_WRITE)
+ *
+ * Return: a negative error code or 0
+ *
+ * The key words are stored in @key on success.
+ *
+ * For shared mappings (when @fshared), the key is:
+ *
+ * ( inode->i_sequence, page->index, offset_within_page )
+ *
+ * [ also see get_inode_sequence_number() ]
+ *
+ * For private mappings (or when !@fshared), the key is:
+ *
+ * ( current->mm, address, 0 )
+ *
+ * This allows (cross process, where applicable) identification of the futex
+ * without keeping the page pinned for the duration of the FUTEX_WAIT.
+ *
+ * lock_page() might sleep, the caller should not hold a spinlock.
+ */
+int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key,
+ enum futex_access rw)
+{
+ unsigned long address = (unsigned long)uaddr;
+ struct mm_struct *mm = current->mm;
+ struct page *page, *tail;
+ struct address_space *mapping;
+ int err, ro = 0;
+
+ /*
+ * The futex address must be "naturally" aligned.
+ */
+ key->both.offset = address % PAGE_SIZE;
+ if (unlikely((address % sizeof(u32)) != 0))
+ return -EINVAL;
+ address -= key->both.offset;
+
+ if (unlikely(!access_ok(uaddr, sizeof(u32))))
+ return -EFAULT;
+
+ if (unlikely(should_fail_futex(fshared)))
+ return -EFAULT;
+
+ /*
+ * PROCESS_PRIVATE futexes are fast.
+ * As the mm cannot disappear under us and the 'key' only needs
+ * virtual address, we dont even have to find the underlying vma.
+ * Note : We do have to check 'uaddr' is a valid user address,
+ * but access_ok() should be faster than find_vma()
+ */
+ if (!fshared) {
+ key->private.mm = mm;
+ key->private.address = address;
+ return 0;
+ }
+
+again:
+ /* Ignore any VERIFY_READ mapping (futex common case) */
+ if (unlikely(should_fail_futex(true)))
+ return -EFAULT;
+
+ err = get_user_pages_fast(address, 1, FOLL_WRITE, &page);
+ /*
+ * If write access is not required (eg. FUTEX_WAIT), try
+ * and get read-only access.
+ */
+ if (err == -EFAULT && rw == FUTEX_READ) {
+ err = get_user_pages_fast(address, 1, 0, &page);
+ ro = 1;
+ }
+ if (err < 0)
+ return err;
+ else
+ err = 0;
+
+ /*
+ * The treatment of mapping from this point on is critical. The page
+ * lock protects many things but in this context the page lock
+ * stabilizes mapping, prevents inode freeing in the shared
+ * file-backed region case and guards against movement to swap cache.
+ *
+ * Strictly speaking the page lock is not needed in all cases being
+ * considered here and page lock forces unnecessarily serialization
+ * From this point on, mapping will be re-verified if necessary and
+ * page lock will be acquired only if it is unavoidable
+ *
+ * Mapping checks require the head page for any compound page so the
+ * head page and mapping is looked up now. For anonymous pages, it
+ * does not matter if the page splits in the future as the key is
+ * based on the address. For filesystem-backed pages, the tail is
+ * required as the index of the page determines the key. For
+ * base pages, there is no tail page and tail == page.
+ */
+ tail = page;
+ page = compound_head(page);
+ mapping = READ_ONCE(page->mapping);
+
+ /*
+ * If page->mapping is NULL, then it cannot be a PageAnon
+ * page; but it might be the ZERO_PAGE or in the gate area or
+ * in a special mapping (all cases which we are happy to fail);
+ * or it may have been a good file page when get_user_pages_fast
+ * found it, but truncated or holepunched or subjected to
+ * invalidate_complete_page2 before we got the page lock (also
+ * cases which we are happy to fail). And we hold a reference,
+ * so refcount care in invalidate_inode_page's remove_mapping
+ * prevents drop_caches from setting mapping to NULL beneath us.
+ *
+ * The case we do have to guard against is when memory pressure made
+ * shmem_writepage move it from filecache to swapcache beneath us:
+ * an unlikely race, but we do need to retry for page->mapping.
+ */
+ if (unlikely(!mapping)) {
+ int shmem_swizzled;
+
+ /*
+ * Page lock is required to identify which special case above
+ * applies. If this is really a shmem page then the page lock
+ * will prevent unexpected transitions.
+ */
+ lock_page(page);
+ shmem_swizzled = PageSwapCache(page) || page->mapping;
+ unlock_page(page);
+ put_page(page);
+
+ if (shmem_swizzled)
+ goto again;
+
+ return -EFAULT;
+ }
+
+ /*
+ * Private mappings are handled in a simple way.
+ *
+ * If the futex key is stored on an anonymous page, then the associated
+ * object is the mm which is implicitly pinned by the calling process.
+ *
+ * NOTE: When userspace waits on a MAP_SHARED mapping, even if
+ * it's a read-only handle, it's expected that futexes attach to
+ * the object not the particular process.
+ */
+ if (PageAnon(page)) {
+ /*
+ * A RO anonymous page will never change and thus doesn't make
+ * sense for futex operations.
+ */
+ if (unlikely(should_fail_futex(true)) || ro) {
+ err = -EFAULT;
+ goto out;
+ }
+
+ key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
+ key->private.mm = mm;
+ key->private.address = address;
+
+ } else {
+ struct inode *inode;
+
+ /*
+ * The associated futex object in this case is the inode and
+ * the page->mapping must be traversed. Ordinarily this should
+ * be stabilised under page lock but it's not strictly
+ * necessary in this case as we just want to pin the inode, not
+ * update the radix tree or anything like that.
+ *
+ * The RCU read lock is taken as the inode is finally freed
+ * under RCU. If the mapping still matches expectations then the
+ * mapping->host can be safely accessed as being a valid inode.
+ */
+ rcu_read_lock();
+
+ if (READ_ONCE(page->mapping) != mapping) {
+ rcu_read_unlock();
+ put_page(page);
+
+ goto again;
+ }
+
+ inode = READ_ONCE(mapping->host);
+ if (!inode) {
+ rcu_read_unlock();
+ put_page(page);
+
+ goto again;
+ }
+
+ key->both.offset |= FUT_OFF_INODE; /* inode-based key */
+ key->shared.i_seq = get_inode_sequence_number(inode);
+ key->shared.pgoff = page_to_pgoff(tail);
+ rcu_read_unlock();
+ }
+
+out:
+ put_page(page);
+ return err;
+}
+
+/**
+ * fault_in_user_writeable() - Fault in user address and verify RW access
+ * @uaddr: pointer to faulting user space address
+ *
+ * Slow path to fixup the fault we just took in the atomic write
+ * access to @uaddr.
+ *
+ * We have no generic implementation of a non-destructive write to the
+ * user address. We know that we faulted in the atomic pagefault
+ * disabled section so we can as well avoid the #PF overhead by
+ * calling get_user_pages() right away.
+ */
+int fault_in_user_writeable(u32 __user *uaddr)
+{
+ struct mm_struct *mm = current->mm;
+ int ret;
+
+ mmap_read_lock(mm);
+ ret = fixup_user_fault(mm, (unsigned long)uaddr,
+ FAULT_FLAG_WRITE, NULL);
+ mmap_read_unlock(mm);
+
+ return ret < 0 ? ret : 0;
+}
+
+/**
+ * futex_top_waiter() - Return the highest priority waiter on a futex
+ * @hb: the hash bucket the futex_q's reside in
+ * @key: the futex key (to distinguish it from other futex futex_q's)
+ *
+ * Must be called with the hb lock held.
+ */
+struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, union futex_key *key)
+{
+ struct futex_q *this;
+
+ plist_for_each_entry(this, &hb->chain, list) {
+ if (futex_match(&this->key, key))
+ return this;
+ }
+ return NULL;
+}
+
+int futex_cmpxchg_value_locked(u32 *curval, u32 __user *uaddr, u32 uval, u32 newval)
+{
+ int ret;
+
+ pagefault_disable();
+ ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval);
+ pagefault_enable();
+
+ return ret;
+}
+
+int futex_get_value_locked(u32 *dest, u32 __user *from)
+{
+ int ret;
+
+ pagefault_disable();
+ ret = __get_user(*dest, from);
+ pagefault_enable();
+
+ return ret ? -EFAULT : 0;
+}
+
+/**
+ * wait_for_owner_exiting - Block until the owner has exited
+ * @ret: owner's current futex lock status
+ * @exiting: Pointer to the exiting task
+ *
+ * Caller must hold a refcount on @exiting.
+ */
+void wait_for_owner_exiting(int ret, struct task_struct *exiting)
+{
+ if (ret != -EBUSY) {
+ WARN_ON_ONCE(exiting);
+ return;
+ }
+
+ if (WARN_ON_ONCE(ret == -EBUSY && !exiting))
+ return;
+
+ mutex_lock(&exiting->futex_exit_mutex);
+ /*
+ * No point in doing state checking here. If the waiter got here
+ * while the task was in exec()->exec_futex_release() then it can
+ * have any FUTEX_STATE_* value when the waiter has acquired the
+ * mutex. OK, if running, EXITING or DEAD if it reached exit()
+ * already. Highly unlikely and not a problem. Just one more round
+ * through the futex maze.
+ */
+ mutex_unlock(&exiting->futex_exit_mutex);
+
+ put_task_struct(exiting);
+}
+
+/**
+ * __futex_unqueue() - Remove the futex_q from its futex_hash_bucket
+ * @q: The futex_q to unqueue
+ *
+ * The q->lock_ptr must not be NULL and must be held by the caller.
+ */
+void __futex_unqueue(struct futex_q *q)
+{
+ struct futex_hash_bucket *hb;
+
+ if (WARN_ON_SMP(!q->lock_ptr) || WARN_ON(plist_node_empty(&q->list)))
+ return;
+ lockdep_assert_held(q->lock_ptr);
+
+ hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock);
+ plist_del(&q->list, &hb->chain);
+ futex_hb_waiters_dec(hb);
+}
+
+/* The key must be already stored in q->key. */
+struct futex_hash_bucket *futex_q_lock(struct futex_q *q)
+ __acquires(&hb->lock)
+{
+ struct futex_hash_bucket *hb;
+
+ hb = futex_hash(&q->key);
+
+ /*
+ * Increment the counter before taking the lock so that
+ * a potential waker won't miss a to-be-slept task that is
+ * waiting for the spinlock. This is safe as all futex_q_lock()
+ * users end up calling futex_queue(). Similarly, for housekeeping,
+ * decrement the counter at futex_q_unlock() when some error has
+ * occurred and we don't end up adding the task to the list.
+ */
+ futex_hb_waiters_inc(hb); /* implies smp_mb(); (A) */
+
+ q->lock_ptr = &hb->lock;
+
+ spin_lock(&hb->lock);
+ return hb;
+}
+
+void futex_q_unlock(struct futex_hash_bucket *hb)
+ __releases(&hb->lock)
+{
+ spin_unlock(&hb->lock);
+ futex_hb_waiters_dec(hb);
+}
+
+void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb)
+{
+ int prio;
+
+ /*
+ * The priority used to register this element is
+ * - either the real thread-priority for the real-time threads
+ * (i.e. threads with a priority lower than MAX_RT_PRIO)
+ * - or MAX_RT_PRIO for non-RT threads.
+ * Thus, all RT-threads are woken first in priority order, and
+ * the others are woken last, in FIFO order.
+ */
+ prio = min(current->normal_prio, MAX_RT_PRIO);
+
+ plist_node_init(&q->list, prio);
+ plist_add(&q->list, &hb->chain);
+ q->task = current;
+}
+
+/**
+ * futex_unqueue() - Remove the futex_q from its futex_hash_bucket
+ * @q: The futex_q to unqueue
+ *
+ * The q->lock_ptr must not be held by the caller. A call to futex_unqueue() must
+ * be paired with exactly one earlier call to futex_queue().
+ *
+ * Return:
+ * - 1 - if the futex_q was still queued (and we removed unqueued it);
+ * - 0 - if the futex_q was already removed by the waking thread
+ */
+int futex_unqueue(struct futex_q *q)
+{
+ spinlock_t *lock_ptr;
+ int ret = 0;
+
+ /* In the common case we don't take the spinlock, which is nice. */
+retry:
+ /*
+ * q->lock_ptr can change between this read and the following spin_lock.
+ * Use READ_ONCE to forbid the compiler from reloading q->lock_ptr and
+ * optimizing lock_ptr out of the logic below.
+ */
+ lock_ptr = READ_ONCE(q->lock_ptr);
+ if (lock_ptr != NULL) {
+ spin_lock(lock_ptr);
+ /*
+ * q->lock_ptr can change between reading it and
+ * spin_lock(), causing us to take the wrong lock. This
+ * corrects the race condition.
+ *
+ * Reasoning goes like this: if we have the wrong lock,
+ * q->lock_ptr must have changed (maybe several times)
+ * between reading it and the spin_lock(). It can
+ * change again after the spin_lock() but only if it was
+ * already changed before the spin_lock(). It cannot,
+ * however, change back to the original value. Therefore
+ * we can detect whether we acquired the correct lock.
+ */
+ if (unlikely(lock_ptr != q->lock_ptr)) {
+ spin_unlock(lock_ptr);
+ goto retry;
+ }
+ __futex_unqueue(q);
+
+ BUG_ON(q->pi_state);
+
+ spin_unlock(lock_ptr);
+ ret = 1;
+ }
+
+ return ret;
+}
+
+/*
+ * PI futexes can not be requeued and must remove themselves from the
+ * hash bucket. The hash bucket lock (i.e. lock_ptr) is held.
+ */
+void futex_unqueue_pi(struct futex_q *q)
+{
+ __futex_unqueue(q);
+
+ BUG_ON(!q->pi_state);
+ put_pi_state(q->pi_state);
+ q->pi_state = NULL;
+}
+
+/* Constants for the pending_op argument of handle_futex_death */
+#define HANDLE_DEATH_PENDING true
+#define HANDLE_DEATH_LIST false
+
+/*
+ * Process a futex-list entry, check whether it's owned by the
+ * dying task, and do notification if so:
+ */
+static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr,
+ bool pi, bool pending_op)
+{
+ u32 uval, nval, mval;
+ pid_t owner;
+ int err;
+
+ /* Futex address must be 32bit aligned */
+ if ((((unsigned long)uaddr) % sizeof(*uaddr)) != 0)
+ return -1;
+
+retry:
+ if (get_user(uval, uaddr))
+ return -1;
+
+ /*
+ * Special case for regular (non PI) futexes. The unlock path in
+ * user space has two race scenarios:
+ *
+ * 1. The unlock path releases the user space futex value and
+ * before it can execute the futex() syscall to wake up
+ * waiters it is killed.
+ *
+ * 2. A woken up waiter is killed before it can acquire the
+ * futex in user space.
+ *
+ * In the second case, the wake up notification could be generated
+ * by the unlock path in user space after setting the futex value
+ * to zero or by the kernel after setting the OWNER_DIED bit below.
+ *
+ * In both cases the TID validation below prevents a wakeup of
+ * potential waiters which can cause these waiters to block
+ * forever.
+ *
+ * In both cases the following conditions are met:
+ *
+ * 1) task->robust_list->list_op_pending != NULL
+ * @pending_op == true
+ * 2) The owner part of user space futex value == 0
+ * 3) Regular futex: @pi == false
+ *
+ * If these conditions are met, it is safe to attempt waking up a
+ * potential waiter without touching the user space futex value and
+ * trying to set the OWNER_DIED bit. If the futex value is zero,
+ * the rest of the user space mutex state is consistent, so a woken
+ * waiter will just take over the uncontended futex. Setting the
+ * OWNER_DIED bit would create inconsistent state and malfunction
+ * of the user space owner died handling. Otherwise, the OWNER_DIED
+ * bit is already set, and the woken waiter is expected to deal with
+ * this.
+ */
+ owner = uval & FUTEX_TID_MASK;
+
+ if (pending_op && !pi && !owner) {
+ futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
+ return 0;
+ }
+
+ if (owner != task_pid_vnr(curr))
+ return 0;
+
+ /*
+ * Ok, this dying thread is truly holding a futex
+ * of interest. Set the OWNER_DIED bit atomically
+ * via cmpxchg, and if the value had FUTEX_WAITERS
+ * set, wake up a waiter (if any). (We have to do a
+ * futex_wake() even if OWNER_DIED is already set -
+ * to handle the rare but possible case of recursive
+ * thread-death.) The rest of the cleanup is done in
+ * userspace.
+ */
+ mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
+
+ /*
+ * We are not holding a lock here, but we want to have
+ * the pagefault_disable/enable() protection because
+ * we want to handle the fault gracefully. If the
+ * access fails we try to fault in the futex with R/W
+ * verification via get_user_pages. get_user() above
+ * does not guarantee R/W access. If that fails we
+ * give up and leave the futex locked.
+ */
+ if ((err = futex_cmpxchg_value_locked(&nval, uaddr, uval, mval))) {
+ switch (err) {
+ case -EFAULT:
+ if (fault_in_user_writeable(uaddr))
+ return -1;
+ goto retry;
+
+ case -EAGAIN:
+ cond_resched();
+ goto retry;
+
+ default:
+ WARN_ON_ONCE(1);
+ return err;
+ }
+ }
+
+ if (nval != uval)
+ goto retry;
+
+ /*
+ * Wake robust non-PI futexes here. The wakeup of
+ * PI futexes happens in exit_pi_state():
+ */
+ if (!pi && (uval & FUTEX_WAITERS))
+ futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
+
+ return 0;
+}
+
+/*
+ * Fetch a robust-list pointer. Bit 0 signals PI futexes:
+ */
+static inline int fetch_robust_entry(struct robust_list __user **entry,
+ struct robust_list __user * __user *head,
+ unsigned int *pi)
+{
+ unsigned long uentry;
+
+ if (get_user(uentry, (unsigned long __user *)head))
+ return -EFAULT;
+
+ *entry = (void __user *)(uentry & ~1UL);
+ *pi = uentry & 1;
+
+ return 0;
+}
+
+/*
+ * Walk curr->robust_list (very carefully, it's a userspace list!)
+ * and mark any locks found there dead, and notify any waiters.
+ *
+ * We silently return on any sign of list-walking problem.
+ */
+static void exit_robust_list(struct task_struct *curr)
+{
+ struct robust_list_head __user *head = curr->robust_list;
+ struct robust_list __user *entry, *next_entry, *pending;
+ unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
+ unsigned int next_pi;
+ unsigned long futex_offset;
+ int rc;
+
+ /*
+ * Fetch the list head (which was registered earlier, via
+ * sys_set_robust_list()):
+ */
+ if (fetch_robust_entry(&entry, &head->list.next, &pi))
+ return;
+ /*
+ * Fetch the relative futex offset:
+ */
+ if (get_user(futex_offset, &head->futex_offset))
+ return;
+ /*
+ * Fetch any possibly pending lock-add first, and handle it
+ * if it exists:
+ */
+ if (fetch_robust_entry(&pending, &head->list_op_pending, &pip))
+ return;
+
+ next_entry = NULL; /* avoid warning with gcc */
+ while (entry != &head->list) {
+ /*
+ * Fetch the next entry in the list before calling
+ * handle_futex_death:
+ */
+ rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi);
+ /*
+ * A pending lock might already be on the list, so
+ * don't process it twice:
+ */
+ if (entry != pending) {
+ if (handle_futex_death((void __user *)entry + futex_offset,
+ curr, pi, HANDLE_DEATH_LIST))
+ return;
+ }
+ if (rc)
+ return;
+ entry = next_entry;
+ pi = next_pi;
+ /*
+ * Avoid excessively long or circular lists:
+ */
+ if (!--limit)
+ break;
+
+ cond_resched();
+ }
+
+ if (pending) {
+ handle_futex_death((void __user *)pending + futex_offset,
+ curr, pip, HANDLE_DEATH_PENDING);
+ }
+}
+
+#ifdef CONFIG_COMPAT
+static void __user *futex_uaddr(struct robust_list __user *entry,
+ compat_long_t futex_offset)
+{
+ compat_uptr_t base = ptr_to_compat(entry);
+ void __user *uaddr = compat_ptr(base + futex_offset);
+
+ return uaddr;
+}
+
+/*
+ * Fetch a robust-list pointer. Bit 0 signals PI futexes:
+ */
+static inline int
+compat_fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
+ compat_uptr_t __user *head, unsigned int *pi)
+{
+ if (get_user(*uentry, head))
+ return -EFAULT;
+
+ *entry = compat_ptr((*uentry) & ~1);
+ *pi = (unsigned int)(*uentry) & 1;
+
+ return 0;
+}
+
+/*
+ * Walk curr->robust_list (very carefully, it's a userspace list!)
+ * and mark any locks found there dead, and notify any waiters.
+ *
+ * We silently return on any sign of list-walking problem.
+ */
+static void compat_exit_robust_list(struct task_struct *curr)
+{
+ struct compat_robust_list_head __user *head = curr->compat_robust_list;
+ struct robust_list __user *entry, *next_entry, *pending;
+ unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
+ unsigned int next_pi;
+ compat_uptr_t uentry, next_uentry, upending;
+ compat_long_t futex_offset;
+ int rc;
+
+ /*
+ * Fetch the list head (which was registered earlier, via
+ * sys_set_robust_list()):
+ */
+ if (compat_fetch_robust_entry(&uentry, &entry, &head->list.next, &pi))
+ return;
+ /*
+ * Fetch the relative futex offset:
+ */
+ if (get_user(futex_offset, &head->futex_offset))
+ return;
+ /*
+ * Fetch any possibly pending lock-add first, and handle it
+ * if it exists:
+ */
+ if (compat_fetch_robust_entry(&upending, &pending,
+ &head->list_op_pending, &pip))
+ return;
+
+ next_entry = NULL; /* avoid warning with gcc */
+ while (entry != (struct robust_list __user *) &head->list) {
+ /*
+ * Fetch the next entry in the list before calling
+ * handle_futex_death:
+ */
+ rc = compat_fetch_robust_entry(&next_uentry, &next_entry,
+ (compat_uptr_t __user *)&entry->next, &next_pi);
+ /*
+ * A pending lock might already be on the list, so
+ * dont process it twice:
+ */
+ if (entry != pending) {
+ void __user *uaddr = futex_uaddr(entry, futex_offset);
+
+ if (handle_futex_death(uaddr, curr, pi,
+ HANDLE_DEATH_LIST))
+ return;
+ }
+ if (rc)
+ return;
+ uentry = next_uentry;
+ entry = next_entry;
+ pi = next_pi;
+ /*
+ * Avoid excessively long or circular lists:
+ */
+ if (!--limit)
+ break;
+
+ cond_resched();
+ }
+ if (pending) {
+ void __user *uaddr = futex_uaddr(pending, futex_offset);
+
+ handle_futex_death(uaddr, curr, pip, HANDLE_DEATH_PENDING);
+ }
+}
+#endif
+
+#ifdef CONFIG_FUTEX_PI
+
+/*
+ * This task is holding PI mutexes at exit time => bad.
+ * Kernel cleans up PI-state, but userspace is likely hosed.
+ * (Robust-futex cleanup is separate and might save the day for userspace.)
+ */
+static void exit_pi_state_list(struct task_struct *curr)
+{
+ struct list_head *next, *head = &curr->pi_state_list;
+ struct futex_pi_state *pi_state;
+ struct futex_hash_bucket *hb;
+ union futex_key key = FUTEX_KEY_INIT;
+
+ /*
+ * We are a ZOMBIE and nobody can enqueue itself on
+ * pi_state_list anymore, but we have to be careful
+ * versus waiters unqueueing themselves:
+ */
+ raw_spin_lock_irq(&curr->pi_lock);
+ while (!list_empty(head)) {
+ next = head->next;
+ pi_state = list_entry(next, struct futex_pi_state, list);
+ key = pi_state->key;
+ hb = futex_hash(&key);
+
+ /*
+ * We can race against put_pi_state() removing itself from the
+ * list (a waiter going away). put_pi_state() will first
+ * decrement the reference count and then modify the list, so
+ * its possible to see the list entry but fail this reference
+ * acquire.
+ *
+ * In that case; drop the locks to let put_pi_state() make
+ * progress and retry the loop.
+ */
+ if (!refcount_inc_not_zero(&pi_state->refcount)) {
+ raw_spin_unlock_irq(&curr->pi_lock);
+ cpu_relax();
+ raw_spin_lock_irq(&curr->pi_lock);
+ continue;
+ }
+ raw_spin_unlock_irq(&curr->pi_lock);
+
+ spin_lock(&hb->lock);
+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+ raw_spin_lock(&curr->pi_lock);
+ /*
+ * We dropped the pi-lock, so re-check whether this
+ * task still owns the PI-state:
+ */
+ if (head->next != next) {
+ /* retain curr->pi_lock for the loop invariant */
+ raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
+ spin_unlock(&hb->lock);
+ put_pi_state(pi_state);
+ continue;
+ }
+
+ WARN_ON(pi_state->owner != curr);
+ WARN_ON(list_empty(&pi_state->list));
+ list_del_init(&pi_state->list);
+ pi_state->owner = NULL;
+
+ raw_spin_unlock(&curr->pi_lock);
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+ spin_unlock(&hb->lock);
+
+ rt_mutex_futex_unlock(&pi_state->pi_mutex);
+ put_pi_state(pi_state);
+
+ raw_spin_lock_irq(&curr->pi_lock);
+ }
+ raw_spin_unlock_irq(&curr->pi_lock);
+}
+#else
+static inline void exit_pi_state_list(struct task_struct *curr) { }
+#endif
+
+static void futex_cleanup(struct task_struct *tsk)
+{
+ if (unlikely(tsk->robust_list)) {
+ exit_robust_list(tsk);
+ tsk->robust_list = NULL;
+ }
+
+#ifdef CONFIG_COMPAT
+ if (unlikely(tsk->compat_robust_list)) {
+ compat_exit_robust_list(tsk);
+ tsk->compat_robust_list = NULL;
+ }
+#endif
+
+ if (unlikely(!list_empty(&tsk->pi_state_list)))
+ exit_pi_state_list(tsk);
+}
+
+/**
+ * futex_exit_recursive - Set the tasks futex state to FUTEX_STATE_DEAD
+ * @tsk: task to set the state on
+ *
+ * Set the futex exit state of the task lockless. The futex waiter code
+ * observes that state when a task is exiting and loops until the task has
+ * actually finished the futex cleanup. The worst case for this is that the
+ * waiter runs through the wait loop until the state becomes visible.
+ *
+ * This is called from the recursive fault handling path in make_task_dead().
+ *
+ * This is best effort. Either the futex exit code has run already or
+ * not. If the OWNER_DIED bit has been set on the futex then the waiter can
+ * take it over. If not, the problem is pushed back to user space. If the
+ * futex exit code did not run yet, then an already queued waiter might
+ * block forever, but there is nothing which can be done about that.
+ */
+void futex_exit_recursive(struct task_struct *tsk)
+{
+ /* If the state is FUTEX_STATE_EXITING then futex_exit_mutex is held */
+ if (tsk->futex_state == FUTEX_STATE_EXITING)
+ mutex_unlock(&tsk->futex_exit_mutex);
+ tsk->futex_state = FUTEX_STATE_DEAD;
+}
+
+static void futex_cleanup_begin(struct task_struct *tsk)
+{
+ /*
+ * Prevent various race issues against a concurrent incoming waiter
+ * including live locks by forcing the waiter to block on
+ * tsk->futex_exit_mutex when it observes FUTEX_STATE_EXITING in
+ * attach_to_pi_owner().
+ */
+ mutex_lock(&tsk->futex_exit_mutex);
+
+ /*
+ * Switch the state to FUTEX_STATE_EXITING under tsk->pi_lock.
+ *
+ * This ensures that all subsequent checks of tsk->futex_state in
+ * attach_to_pi_owner() must observe FUTEX_STATE_EXITING with
+ * tsk->pi_lock held.
+ *
+ * It guarantees also that a pi_state which was queued right before
+ * the state change under tsk->pi_lock by a concurrent waiter must
+ * be observed in exit_pi_state_list().
+ */
+ raw_spin_lock_irq(&tsk->pi_lock);
+ tsk->futex_state = FUTEX_STATE_EXITING;
+ raw_spin_unlock_irq(&tsk->pi_lock);
+}
+
+static void futex_cleanup_end(struct task_struct *tsk, int state)
+{
+ /*
+ * Lockless store. The only side effect is that an observer might
+ * take another loop until it becomes visible.
+ */
+ tsk->futex_state = state;
+ /*
+ * Drop the exit protection. This unblocks waiters which observed
+ * FUTEX_STATE_EXITING to reevaluate the state.
+ */
+ mutex_unlock(&tsk->futex_exit_mutex);
+}
+
+void futex_exec_release(struct task_struct *tsk)
+{
+ /*
+ * The state handling is done for consistency, but in the case of
+ * exec() there is no way to prevent further damage as the PID stays
+ * the same. But for the unlikely and arguably buggy case that a
+ * futex is held on exec(), this provides at least as much state
+ * consistency protection which is possible.
+ */
+ futex_cleanup_begin(tsk);
+ futex_cleanup(tsk);
+ /*
+ * Reset the state to FUTEX_STATE_OK. The task is alive and about
+ * exec a new binary.
+ */
+ futex_cleanup_end(tsk, FUTEX_STATE_OK);
+}
+
+void futex_exit_release(struct task_struct *tsk)
+{
+ futex_cleanup_begin(tsk);
+ futex_cleanup(tsk);
+ futex_cleanup_end(tsk, FUTEX_STATE_DEAD);
+}
+
+static int __init futex_init(void)
+{
+ unsigned int futex_shift;
+ unsigned long i;
+
+#if CONFIG_BASE_SMALL
+ futex_hashsize = 16;
+#else
+ futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus());
+#endif
+
+ futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues),
+ futex_hashsize, 0,
+ futex_hashsize < 256 ? HASH_SMALL : 0,
+ &futex_shift, NULL,
+ futex_hashsize, futex_hashsize);
+ futex_hashsize = 1UL << futex_shift;
+
+ for (i = 0; i < futex_hashsize; i++) {
+ atomic_set(&futex_queues[i].waiters, 0);
+ plist_head_init(&futex_queues[i].chain);
+ spin_lock_init(&futex_queues[i].lock);
+ }
+
+ return 0;
+}
+core_initcall(futex_init);
diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h
new file mode 100644
index 000000000000..b5379c0e6d6d
--- /dev/null
+++ b/kernel/futex/futex.h
@@ -0,0 +1,294 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _FUTEX_H
+#define _FUTEX_H
+
+#include <linux/futex.h>
+#include <linux/rtmutex.h>
+#include <linux/sched/wake_q.h>
+
+#ifdef CONFIG_PREEMPT_RT
+#include <linux/rcuwait.h>
+#endif
+
+#include <asm/futex.h>
+
+/*
+ * Futex flags used to encode options to functions and preserve them across
+ * restarts.
+ */
+#ifdef CONFIG_MMU
+# define FLAGS_SHARED 0x01
+#else
+/*
+ * NOMMU does not have per process address space. Let the compiler optimize
+ * code away.
+ */
+# define FLAGS_SHARED 0x00
+#endif
+#define FLAGS_CLOCKRT 0x02
+#define FLAGS_HAS_TIMEOUT 0x04
+
+#ifdef CONFIG_FAIL_FUTEX
+extern bool should_fail_futex(bool fshared);
+#else
+static inline bool should_fail_futex(bool fshared)
+{
+ return false;
+}
+#endif
+
+/*
+ * Hash buckets are shared by all the futex_keys that hash to the same
+ * location. Each key may have multiple futex_q structures, one for each task
+ * waiting on a futex.
+ */
+struct futex_hash_bucket {
+ atomic_t waiters;
+ spinlock_t lock;
+ struct plist_head chain;
+} ____cacheline_aligned_in_smp;
+
+/*
+ * Priority Inheritance state:
+ */
+struct futex_pi_state {
+ /*
+ * list of 'owned' pi_state instances - these have to be
+ * cleaned up in do_exit() if the task exits prematurely:
+ */
+ struct list_head list;
+
+ /*
+ * The PI object:
+ */
+ struct rt_mutex_base pi_mutex;
+
+ struct task_struct *owner;
+ refcount_t refcount;
+
+ union futex_key key;
+} __randomize_layout;
+
+/**
+ * struct futex_q - The hashed futex queue entry, one per waiting task
+ * @list: priority-sorted list of tasks waiting on this futex
+ * @task: the task waiting on the futex
+ * @lock_ptr: the hash bucket lock
+ * @key: the key the futex is hashed on
+ * @pi_state: optional priority inheritance state
+ * @rt_waiter: rt_waiter storage for use with requeue_pi
+ * @requeue_pi_key: the requeue_pi target futex key
+ * @bitset: bitset for the optional bitmasked wakeup
+ * @requeue_state: State field for futex_requeue_pi()
+ * @requeue_wait: RCU wait for futex_requeue_pi() (RT only)
+ *
+ * We use this hashed waitqueue, instead of a normal wait_queue_entry_t, so
+ * we can wake only the relevant ones (hashed queues may be shared).
+ *
+ * A futex_q has a woken state, just like tasks have TASK_RUNNING.
+ * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0.
+ * The order of wakeup is always to make the first condition true, then
+ * the second.
+ *
+ * PI futexes are typically woken before they are removed from the hash list via
+ * the rt_mutex code. See futex_unqueue_pi().
+ */
+struct futex_q {
+ struct plist_node list;
+
+ struct task_struct *task;
+ spinlock_t *lock_ptr;
+ union futex_key key;
+ struct futex_pi_state *pi_state;
+ struct rt_mutex_waiter *rt_waiter;
+ union futex_key *requeue_pi_key;
+ u32 bitset;
+ atomic_t requeue_state;
+#ifdef CONFIG_PREEMPT_RT
+ struct rcuwait requeue_wait;
+#endif
+} __randomize_layout;
+
+extern const struct futex_q futex_q_init;
+
+enum futex_access {
+ FUTEX_READ,
+ FUTEX_WRITE
+};
+
+extern int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key,
+ enum futex_access rw);
+
+extern struct hrtimer_sleeper *
+futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout,
+ int flags, u64 range_ns);
+
+extern struct futex_hash_bucket *futex_hash(union futex_key *key);
+
+/**
+ * futex_match - Check whether two futex keys are equal
+ * @key1: Pointer to key1
+ * @key2: Pointer to key2
+ *
+ * Return 1 if two futex_keys are equal, 0 otherwise.
+ */
+static inline int futex_match(union futex_key *key1, union futex_key *key2)
+{
+ return (key1 && key2
+ && key1->both.word == key2->both.word
+ && key1->both.ptr == key2->both.ptr
+ && key1->both.offset == key2->both.offset);
+}
+
+extern int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
+ struct futex_q *q, struct futex_hash_bucket **hb);
+extern void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q,
+ struct hrtimer_sleeper *timeout);
+extern void futex_wake_mark(struct wake_q_head *wake_q, struct futex_q *q);
+
+extern int fault_in_user_writeable(u32 __user *uaddr);
+extern int futex_cmpxchg_value_locked(u32 *curval, u32 __user *uaddr, u32 uval, u32 newval);
+extern int futex_get_value_locked(u32 *dest, u32 __user *from);
+extern struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, union futex_key *key);
+
+extern void __futex_unqueue(struct futex_q *q);
+extern void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb);
+extern int futex_unqueue(struct futex_q *q);
+
+/**
+ * futex_queue() - Enqueue the futex_q on the futex_hash_bucket
+ * @q: The futex_q to enqueue
+ * @hb: The destination hash bucket
+ *
+ * The hb->lock must be held by the caller, and is released here. A call to
+ * futex_queue() is typically paired with exactly one call to futex_unqueue(). The
+ * exceptions involve the PI related operations, which may use futex_unqueue_pi()
+ * or nothing if the unqueue is done as part of the wake process and the unqueue
+ * state is implicit in the state of woken task (see futex_wait_requeue_pi() for
+ * an example).
+ */
+static inline void futex_queue(struct futex_q *q, struct futex_hash_bucket *hb)
+ __releases(&hb->lock)
+{
+ __futex_queue(q, hb);
+ spin_unlock(&hb->lock);
+}
+
+extern void futex_unqueue_pi(struct futex_q *q);
+
+extern void wait_for_owner_exiting(int ret, struct task_struct *exiting);
+
+/*
+ * Reflects a new waiter being added to the waitqueue.
+ */
+static inline void futex_hb_waiters_inc(struct futex_hash_bucket *hb)
+{
+#ifdef CONFIG_SMP
+ atomic_inc(&hb->waiters);
+ /*
+ * Full barrier (A), see the ordering comment above.
+ */
+ smp_mb__after_atomic();
+#endif
+}
+
+/*
+ * Reflects a waiter being removed from the waitqueue by wakeup
+ * paths.
+ */
+static inline void futex_hb_waiters_dec(struct futex_hash_bucket *hb)
+{
+#ifdef CONFIG_SMP
+ atomic_dec(&hb->waiters);
+#endif
+}
+
+static inline int futex_hb_waiters_pending(struct futex_hash_bucket *hb)
+{
+#ifdef CONFIG_SMP
+ /*
+ * Full barrier (B), see the ordering comment above.
+ */
+ smp_mb();
+ return atomic_read(&hb->waiters);
+#else
+ return 1;
+#endif
+}
+
+extern struct futex_hash_bucket *futex_q_lock(struct futex_q *q);
+extern void futex_q_unlock(struct futex_hash_bucket *hb);
+
+
+extern int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
+ union futex_key *key,
+ struct futex_pi_state **ps,
+ struct task_struct *task,
+ struct task_struct **exiting,
+ int set_waiters);
+
+extern int refill_pi_state_cache(void);
+extern void get_pi_state(struct futex_pi_state *pi_state);
+extern void put_pi_state(struct futex_pi_state *pi_state);
+extern int fixup_pi_owner(u32 __user *uaddr, struct futex_q *q, int locked);
+
+/*
+ * Express the locking dependencies for lockdep:
+ */
+static inline void
+double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
+{
+ if (hb1 > hb2)
+ swap(hb1, hb2);
+
+ spin_lock(&hb1->lock);
+ if (hb1 != hb2)
+ spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING);
+}
+
+static inline void
+double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
+{
+ spin_unlock(&hb1->lock);
+ if (hb1 != hb2)
+ spin_unlock(&hb2->lock);
+}
+
+/* syscalls */
+
+extern int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, u32
+ val, ktime_t *abs_time, u32 bitset, u32 __user
+ *uaddr2);
+
+extern int futex_requeue(u32 __user *uaddr1, unsigned int flags,
+ u32 __user *uaddr2, int nr_wake, int nr_requeue,
+ u32 *cmpval, int requeue_pi);
+
+extern int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
+ ktime_t *abs_time, u32 bitset);
+
+/**
+ * struct futex_vector - Auxiliary struct for futex_waitv()
+ * @w: Userspace provided data
+ * @q: Kernel side data
+ *
+ * Struct used to build an array with all data need for futex_waitv()
+ */
+struct futex_vector {
+ struct futex_waitv w;
+ struct futex_q q;
+};
+
+extern int futex_wait_multiple(struct futex_vector *vs, unsigned int count,
+ struct hrtimer_sleeper *to);
+
+extern int futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset);
+
+extern int futex_wake_op(u32 __user *uaddr1, unsigned int flags,
+ u32 __user *uaddr2, int nr_wake, int nr_wake2, int op);
+
+extern int futex_unlock_pi(u32 __user *uaddr, unsigned int flags);
+
+extern int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock);
+
+#endif /* _FUTEX_H */
diff --git a/kernel/futex/pi.c b/kernel/futex/pi.c
new file mode 100644
index 000000000000..ce2889f12375
--- /dev/null
+++ b/kernel/futex/pi.c
@@ -0,0 +1,1233 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <linux/slab.h>
+#include <linux/sched/task.h>
+
+#include "futex.h"
+#include "../locking/rtmutex_common.h"
+
+/*
+ * PI code:
+ */
+int refill_pi_state_cache(void)
+{
+ struct futex_pi_state *pi_state;
+
+ if (likely(current->pi_state_cache))
+ return 0;
+
+ pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL);
+
+ if (!pi_state)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&pi_state->list);
+ /* pi_mutex gets initialized later */
+ pi_state->owner = NULL;
+ refcount_set(&pi_state->refcount, 1);
+ pi_state->key = FUTEX_KEY_INIT;
+
+ current->pi_state_cache = pi_state;
+
+ return 0;
+}
+
+static struct futex_pi_state *alloc_pi_state(void)
+{
+ struct futex_pi_state *pi_state = current->pi_state_cache;
+
+ WARN_ON(!pi_state);
+ current->pi_state_cache = NULL;
+
+ return pi_state;
+}
+
+static void pi_state_update_owner(struct futex_pi_state *pi_state,
+ struct task_struct *new_owner)
+{
+ struct task_struct *old_owner = pi_state->owner;
+
+ lockdep_assert_held(&pi_state->pi_mutex.wait_lock);
+
+ if (old_owner) {
+ raw_spin_lock(&old_owner->pi_lock);
+ WARN_ON(list_empty(&pi_state->list));
+ list_del_init(&pi_state->list);
+ raw_spin_unlock(&old_owner->pi_lock);
+ }
+
+ if (new_owner) {
+ raw_spin_lock(&new_owner->pi_lock);
+ WARN_ON(!list_empty(&pi_state->list));
+ list_add(&pi_state->list, &new_owner->pi_state_list);
+ pi_state->owner = new_owner;
+ raw_spin_unlock(&new_owner->pi_lock);
+ }
+}
+
+void get_pi_state(struct futex_pi_state *pi_state)
+{
+ WARN_ON_ONCE(!refcount_inc_not_zero(&pi_state->refcount));
+}
+
+/*
+ * Drops a reference to the pi_state object and frees or caches it
+ * when the last reference is gone.
+ */
+void put_pi_state(struct futex_pi_state *pi_state)
+{
+ if (!pi_state)
+ return;
+
+ if (!refcount_dec_and_test(&pi_state->refcount))
+ return;
+
+ /*
+ * If pi_state->owner is NULL, the owner is most probably dying
+ * and has cleaned up the pi_state already
+ */
+ if (pi_state->owner) {
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&pi_state->pi_mutex.wait_lock, flags);
+ pi_state_update_owner(pi_state, NULL);
+ rt_mutex_proxy_unlock(&pi_state->pi_mutex);
+ raw_spin_unlock_irqrestore(&pi_state->pi_mutex.wait_lock, flags);
+ }
+
+ if (current->pi_state_cache) {
+ kfree(pi_state);
+ } else {
+ /*
+ * pi_state->list is already empty.
+ * clear pi_state->owner.
+ * refcount is at 0 - put it back to 1.
+ */
+ pi_state->owner = NULL;
+ refcount_set(&pi_state->refcount, 1);
+ current->pi_state_cache = pi_state;
+ }
+}
+
+/*
+ * We need to check the following states:
+ *
+ * Waiter | pi_state | pi->owner | uTID | uODIED | ?
+ *
+ * [1] NULL | --- | --- | 0 | 0/1 | Valid
+ * [2] NULL | --- | --- | >0 | 0/1 | Valid
+ *
+ * [3] Found | NULL | -- | Any | 0/1 | Invalid
+ *
+ * [4] Found | Found | NULL | 0 | 1 | Valid
+ * [5] Found | Found | NULL | >0 | 1 | Invalid
+ *
+ * [6] Found | Found | task | 0 | 1 | Valid
+ *
+ * [7] Found | Found | NULL | Any | 0 | Invalid
+ *
+ * [8] Found | Found | task | ==taskTID | 0/1 | Valid
+ * [9] Found | Found | task | 0 | 0 | Invalid
+ * [10] Found | Found | task | !=taskTID | 0/1 | Invalid
+ *
+ * [1] Indicates that the kernel can acquire the futex atomically. We
+ * came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit.
+ *
+ * [2] Valid, if TID does not belong to a kernel thread. If no matching
+ * thread is found then it indicates that the owner TID has died.
+ *
+ * [3] Invalid. The waiter is queued on a non PI futex
+ *
+ * [4] Valid state after exit_robust_list(), which sets the user space
+ * value to FUTEX_WAITERS | FUTEX_OWNER_DIED.
+ *
+ * [5] The user space value got manipulated between exit_robust_list()
+ * and exit_pi_state_list()
+ *
+ * [6] Valid state after exit_pi_state_list() which sets the new owner in
+ * the pi_state but cannot access the user space value.
+ *
+ * [7] pi_state->owner can only be NULL when the OWNER_DIED bit is set.
+ *
+ * [8] Owner and user space value match
+ *
+ * [9] There is no transient state which sets the user space TID to 0
+ * except exit_robust_list(), but this is indicated by the
+ * FUTEX_OWNER_DIED bit. See [4]
+ *
+ * [10] There is no transient state which leaves owner and user space
+ * TID out of sync. Except one error case where the kernel is denied
+ * write access to the user address, see fixup_pi_state_owner().
+ *
+ *
+ * Serialization and lifetime rules:
+ *
+ * hb->lock:
+ *
+ * hb -> futex_q, relation
+ * futex_q -> pi_state, relation
+ *
+ * (cannot be raw because hb can contain arbitrary amount
+ * of futex_q's)
+ *
+ * pi_mutex->wait_lock:
+ *
+ * {uval, pi_state}
+ *
+ * (and pi_mutex 'obviously')
+ *
+ * p->pi_lock:
+ *
+ * p->pi_state_list -> pi_state->list, relation
+ * pi_mutex->owner -> pi_state->owner, relation
+ *
+ * pi_state->refcount:
+ *
+ * pi_state lifetime
+ *
+ *
+ * Lock order:
+ *
+ * hb->lock
+ * pi_mutex->wait_lock
+ * p->pi_lock
+ *
+ */
+
+/*
+ * Validate that the existing waiter has a pi_state and sanity check
+ * the pi_state against the user space value. If correct, attach to
+ * it.
+ */
+static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
+ struct futex_pi_state *pi_state,
+ struct futex_pi_state **ps)
+{
+ pid_t pid = uval & FUTEX_TID_MASK;
+ u32 uval2;
+ int ret;
+
+ /*
+ * Userspace might have messed up non-PI and PI futexes [3]
+ */
+ if (unlikely(!pi_state))
+ return -EINVAL;
+
+ /*
+ * We get here with hb->lock held, and having found a
+ * futex_top_waiter(). This means that futex_lock_pi() of said futex_q
+ * has dropped the hb->lock in between futex_queue() and futex_unqueue_pi(),
+ * which in turn means that futex_lock_pi() still has a reference on
+ * our pi_state.
+ *
+ * The waiter holding a reference on @pi_state also protects against
+ * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi()
+ * and futex_wait_requeue_pi() as it cannot go to 0 and consequently
+ * free pi_state before we can take a reference ourselves.
+ */
+ WARN_ON(!refcount_read(&pi_state->refcount));
+
+ /*
+ * Now that we have a pi_state, we can acquire wait_lock
+ * and do the state validation.
+ */
+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+
+ /*
+ * Since {uval, pi_state} is serialized by wait_lock, and our current
+ * uval was read without holding it, it can have changed. Verify it
+ * still is what we expect it to be, otherwise retry the entire
+ * operation.
+ */
+ if (futex_get_value_locked(&uval2, uaddr))
+ goto out_efault;
+
+ if (uval != uval2)
+ goto out_eagain;
+
+ /*
+ * Handle the owner died case:
+ */
+ if (uval & FUTEX_OWNER_DIED) {
+ /*
+ * exit_pi_state_list sets owner to NULL and wakes the
+ * topmost waiter. The task which acquires the
+ * pi_state->rt_mutex will fixup owner.
+ */
+ if (!pi_state->owner) {
+ /*
+ * No pi state owner, but the user space TID
+ * is not 0. Inconsistent state. [5]
+ */
+ if (pid)
+ goto out_einval;
+ /*
+ * Take a ref on the state and return success. [4]
+ */
+ goto out_attach;
+ }
+
+ /*
+ * If TID is 0, then either the dying owner has not
+ * yet executed exit_pi_state_list() or some waiter
+ * acquired the rtmutex in the pi state, but did not
+ * yet fixup the TID in user space.
+ *
+ * Take a ref on the state and return success. [6]
+ */
+ if (!pid)
+ goto out_attach;
+ } else {
+ /*
+ * If the owner died bit is not set, then the pi_state
+ * must have an owner. [7]
+ */
+ if (!pi_state->owner)
+ goto out_einval;
+ }
+
+ /*
+ * Bail out if user space manipulated the futex value. If pi
+ * state exists then the owner TID must be the same as the
+ * user space TID. [9/10]
+ */
+ if (pid != task_pid_vnr(pi_state->owner))
+ goto out_einval;
+
+out_attach:
+ get_pi_state(pi_state);
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+ *ps = pi_state;
+ return 0;
+
+out_einval:
+ ret = -EINVAL;
+ goto out_error;
+
+out_eagain:
+ ret = -EAGAIN;
+ goto out_error;
+
+out_efault:
+ ret = -EFAULT;
+ goto out_error;
+
+out_error:
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+ return ret;
+}
+
+static int handle_exit_race(u32 __user *uaddr, u32 uval,
+ struct task_struct *tsk)
+{
+ u32 uval2;
+
+ /*
+ * If the futex exit state is not yet FUTEX_STATE_DEAD, tell the
+ * caller that the alleged owner is busy.
+ */
+ if (tsk && tsk->futex_state != FUTEX_STATE_DEAD)
+ return -EBUSY;
+
+ /*
+ * Reread the user space value to handle the following situation:
+ *
+ * CPU0 CPU1
+ *
+ * sys_exit() sys_futex()
+ * do_exit() futex_lock_pi()
+ * futex_lock_pi_atomic()
+ * exit_signals(tsk) No waiters:
+ * tsk->flags |= PF_EXITING; *uaddr == 0x00000PID
+ * mm_release(tsk) Set waiter bit
+ * exit_robust_list(tsk) { *uaddr = 0x80000PID;
+ * Set owner died attach_to_pi_owner() {
+ * *uaddr = 0xC0000000; tsk = get_task(PID);
+ * } if (!tsk->flags & PF_EXITING) {
+ * ... attach();
+ * tsk->futex_state = } else {
+ * FUTEX_STATE_DEAD; if (tsk->futex_state !=
+ * FUTEX_STATE_DEAD)
+ * return -EAGAIN;
+ * return -ESRCH; <--- FAIL
+ * }
+ *
+ * Returning ESRCH unconditionally is wrong here because the
+ * user space value has been changed by the exiting task.
+ *
+ * The same logic applies to the case where the exiting task is
+ * already gone.
+ */
+ if (futex_get_value_locked(&uval2, uaddr))
+ return -EFAULT;
+
+ /* If the user space value has changed, try again. */
+ if (uval2 != uval)
+ return -EAGAIN;
+
+ /*
+ * The exiting task did not have a robust list, the robust list was
+ * corrupted or the user space value in *uaddr is simply bogus.
+ * Give up and tell user space.
+ */
+ return -ESRCH;
+}
+
+static void __attach_to_pi_owner(struct task_struct *p, union futex_key *key,
+ struct futex_pi_state **ps)
+{
+ /*
+ * No existing pi state. First waiter. [2]
+ *
+ * This creates pi_state, we have hb->lock held, this means nothing can
+ * observe this state, wait_lock is irrelevant.
+ */
+ struct futex_pi_state *pi_state = alloc_pi_state();
+
+ /*
+ * Initialize the pi_mutex in locked state and make @p
+ * the owner of it:
+ */
+ rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);
+
+ /* Store the key for possible exit cleanups: */
+ pi_state->key = *key;
+
+ WARN_ON(!list_empty(&pi_state->list));
+ list_add(&pi_state->list, &p->pi_state_list);
+ /*
+ * Assignment without holding pi_state->pi_mutex.wait_lock is safe
+ * because there is no concurrency as the object is not published yet.
+ */
+ pi_state->owner = p;
+
+ *ps = pi_state;
+}
+/*
+ * Lookup the task for the TID provided from user space and attach to
+ * it after doing proper sanity checks.
+ */
+static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key,
+ struct futex_pi_state **ps,
+ struct task_struct **exiting)
+{
+ pid_t pid = uval & FUTEX_TID_MASK;
+ struct task_struct *p;
+
+ /*
+ * We are the first waiter - try to look up the real owner and attach
+ * the new pi_state to it, but bail out when TID = 0 [1]
+ *
+ * The !pid check is paranoid. None of the call sites should end up
+ * with pid == 0, but better safe than sorry. Let the caller retry
+ */
+ if (!pid)
+ return -EAGAIN;
+ p = find_get_task_by_vpid(pid);
+ if (!p)
+ return handle_exit_race(uaddr, uval, NULL);
+
+ if (unlikely(p->flags & PF_KTHREAD)) {
+ put_task_struct(p);
+ return -EPERM;
+ }
+
+ /*
+ * We need to look at the task state to figure out, whether the
+ * task is exiting. To protect against the change of the task state
+ * in futex_exit_release(), we do this protected by p->pi_lock:
+ */
+ raw_spin_lock_irq(&p->pi_lock);
+ if (unlikely(p->futex_state != FUTEX_STATE_OK)) {
+ /*
+ * The task is on the way out. When the futex state is
+ * FUTEX_STATE_DEAD, we know that the task has finished
+ * the cleanup:
+ */
+ int ret = handle_exit_race(uaddr, uval, p);
+
+ raw_spin_unlock_irq(&p->pi_lock);
+ /*
+ * If the owner task is between FUTEX_STATE_EXITING and
+ * FUTEX_STATE_DEAD then store the task pointer and keep
+ * the reference on the task struct. The calling code will
+ * drop all locks, wait for the task to reach
+ * FUTEX_STATE_DEAD and then drop the refcount. This is
+ * required to prevent a live lock when the current task
+ * preempted the exiting task between the two states.
+ */
+ if (ret == -EBUSY)
+ *exiting = p;
+ else
+ put_task_struct(p);
+ return ret;
+ }
+
+ __attach_to_pi_owner(p, key, ps);
+ raw_spin_unlock_irq(&p->pi_lock);
+
+ put_task_struct(p);
+
+ return 0;
+}
+
+static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
+{
+ int err;
+ u32 curval;
+
+ if (unlikely(should_fail_futex(true)))
+ return -EFAULT;
+
+ err = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
+ if (unlikely(err))
+ return err;
+
+ /* If user space value changed, let the caller retry */
+ return curval != uval ? -EAGAIN : 0;
+}
+
+/**
+ * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex
+ * @uaddr: the pi futex user address
+ * @hb: the pi futex hash bucket
+ * @key: the futex key associated with uaddr and hb
+ * @ps: the pi_state pointer where we store the result of the
+ * lookup
+ * @task: the task to perform the atomic lock work for. This will
+ * be "current" except in the case of requeue pi.
+ * @exiting: Pointer to store the task pointer of the owner task
+ * which is in the middle of exiting
+ * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)
+ *
+ * Return:
+ * - 0 - ready to wait;
+ * - 1 - acquired the lock;
+ * - <0 - error
+ *
+ * The hb->lock must be held by the caller.
+ *
+ * @exiting is only set when the return value is -EBUSY. If so, this holds
+ * a refcount on the exiting task on return and the caller needs to drop it
+ * after waiting for the exit to complete.
+ */
+int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
+ union futex_key *key,
+ struct futex_pi_state **ps,
+ struct task_struct *task,
+ struct task_struct **exiting,
+ int set_waiters)
+{
+ u32 uval, newval, vpid = task_pid_vnr(task);
+ struct futex_q *top_waiter;
+ int ret;
+
+ /*
+ * Read the user space value first so we can validate a few
+ * things before proceeding further.
+ */
+ if (futex_get_value_locked(&uval, uaddr))
+ return -EFAULT;
+
+ if (unlikely(should_fail_futex(true)))
+ return -EFAULT;
+
+ /*
+ * Detect deadlocks.
+ */
+ if ((unlikely((uval & FUTEX_TID_MASK) == vpid)))
+ return -EDEADLK;
+
+ if ((unlikely(should_fail_futex(true))))
+ return -EDEADLK;
+
+ /*
+ * Lookup existing state first. If it exists, try to attach to
+ * its pi_state.
+ */
+ top_waiter = futex_top_waiter(hb, key);
+ if (top_waiter)
+ return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);
+
+ /*
+ * No waiter and user TID is 0. We are here because the
+ * waiters or the owner died bit is set or called from
+ * requeue_cmp_pi or for whatever reason something took the
+ * syscall.
+ */
+ if (!(uval & FUTEX_TID_MASK)) {
+ /*
+ * We take over the futex. No other waiters and the user space
+ * TID is 0. We preserve the owner died bit.
+ */
+ newval = uval & FUTEX_OWNER_DIED;
+ newval |= vpid;
+
+ /* The futex requeue_pi code can enforce the waiters bit */
+ if (set_waiters)
+ newval |= FUTEX_WAITERS;
+
+ ret = lock_pi_update_atomic(uaddr, uval, newval);
+ if (ret)
+ return ret;
+
+ /*
+ * If the waiter bit was requested the caller also needs PI
+ * state attached to the new owner of the user space futex.
+ *
+ * @task is guaranteed to be alive and it cannot be exiting
+ * because it is either sleeping or waiting in
+ * futex_requeue_pi_wakeup_sync().
+ *
+ * No need to do the full attach_to_pi_owner() exercise
+ * because @task is known and valid.
+ */
+ if (set_waiters) {
+ raw_spin_lock_irq(&task->pi_lock);
+ __attach_to_pi_owner(task, key, ps);
+ raw_spin_unlock_irq(&task->pi_lock);
+ }
+ return 1;
+ }
+
+ /*
+ * First waiter. Set the waiters bit before attaching ourself to
+ * the owner. If owner tries to unlock, it will be forced into
+ * the kernel and blocked on hb->lock.
+ */
+ newval = uval | FUTEX_WAITERS;
+ ret = lock_pi_update_atomic(uaddr, uval, newval);
+ if (ret)
+ return ret;
+ /*
+ * If the update of the user space value succeeded, we try to
+ * attach to the owner. If that fails, no harm done, we only
+ * set the FUTEX_WAITERS bit in the user space variable.
+ */
+ return attach_to_pi_owner(uaddr, newval, key, ps, exiting);
+}
+
+/*
+ * Caller must hold a reference on @pi_state.
+ */
+static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state)
+{
+ struct rt_mutex_waiter *top_waiter;
+ struct task_struct *new_owner;
+ bool postunlock = false;
+ DEFINE_RT_WAKE_Q(wqh);
+ u32 curval, newval;
+ int ret = 0;
+
+ top_waiter = rt_mutex_top_waiter(&pi_state->pi_mutex);
+ if (WARN_ON_ONCE(!top_waiter)) {
+ /*
+ * As per the comment in futex_unlock_pi() this should not happen.
+ *
+ * When this happens, give up our locks and try again, giving
+ * the futex_lock_pi() instance time to complete, either by
+ * waiting on the rtmutex or removing itself from the futex
+ * queue.
+ */
+ ret = -EAGAIN;
+ goto out_unlock;
+ }
+
+ new_owner = top_waiter->task;
+
+ /*
+ * We pass it to the next owner. The WAITERS bit is always kept
+ * enabled while there is PI state around. We cleanup the owner
+ * died bit, because we are the owner.
+ */
+ newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
+
+ if (unlikely(should_fail_futex(true))) {
+ ret = -EFAULT;
+ goto out_unlock;
+ }
+
+ ret = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
+ if (!ret && (curval != uval)) {
+ /*
+ * If a unconditional UNLOCK_PI operation (user space did not
+ * try the TID->0 transition) raced with a waiter setting the
+ * FUTEX_WAITERS flag between get_user() and locking the hash
+ * bucket lock, retry the operation.
+ */
+ if ((FUTEX_TID_MASK & curval) == uval)
+ ret = -EAGAIN;
+ else
+ ret = -EINVAL;
+ }
+
+ if (!ret) {
+ /*
+ * This is a point of no return; once we modified the uval
+ * there is no going back and subsequent operations must
+ * not fail.
+ */
+ pi_state_update_owner(pi_state, new_owner);
+ postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wqh);
+ }
+
+out_unlock:
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+
+ if (postunlock)
+ rt_mutex_postunlock(&wqh);
+
+ return ret;
+}
+
+static int __fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
+ struct task_struct *argowner)
+{
+ struct futex_pi_state *pi_state = q->pi_state;
+ struct task_struct *oldowner, *newowner;
+ u32 uval, curval, newval, newtid;
+ int err = 0;
+
+ oldowner = pi_state->owner;
+
+ /*
+ * We are here because either:
+ *
+ * - we stole the lock and pi_state->owner needs updating to reflect
+ * that (@argowner == current),
+ *
+ * or:
+ *
+ * - someone stole our lock and we need to fix things to point to the
+ * new owner (@argowner == NULL).
+ *
+ * Either way, we have to replace the TID in the user space variable.
+ * This must be atomic as we have to preserve the owner died bit here.
+ *
+ * Note: We write the user space value _before_ changing the pi_state
+ * because we can fault here. Imagine swapped out pages or a fork
+ * that marked all the anonymous memory readonly for cow.
+ *
+ * Modifying pi_state _before_ the user space value would leave the
+ * pi_state in an inconsistent state when we fault here, because we
+ * need to drop the locks to handle the fault. This might be observed
+ * in the PID checks when attaching to PI state .
+ */
+retry:
+ if (!argowner) {
+ if (oldowner != current) {
+ /*
+ * We raced against a concurrent self; things are
+ * already fixed up. Nothing to do.
+ */
+ return 0;
+ }
+
+ if (__rt_mutex_futex_trylock(&pi_state->pi_mutex)) {
+ /* We got the lock. pi_state is correct. Tell caller. */
+ return 1;
+ }
+
+ /*
+ * The trylock just failed, so either there is an owner or
+ * there is a higher priority waiter than this one.
+ */
+ newowner = rt_mutex_owner(&pi_state->pi_mutex);
+ /*
+ * If the higher priority waiter has not yet taken over the
+ * rtmutex then newowner is NULL. We can't return here with
+ * that state because it's inconsistent vs. the user space
+ * state. So drop the locks and try again. It's a valid
+ * situation and not any different from the other retry
+ * conditions.
+ */
+ if (unlikely(!newowner)) {
+ err = -EAGAIN;
+ goto handle_err;
+ }
+ } else {
+ WARN_ON_ONCE(argowner != current);
+ if (oldowner == current) {
+ /*
+ * We raced against a concurrent self; things are
+ * already fixed up. Nothing to do.
+ */
+ return 1;
+ }
+ newowner = argowner;
+ }
+
+ newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
+ /* Owner died? */
+ if (!pi_state->owner)
+ newtid |= FUTEX_OWNER_DIED;
+
+ err = futex_get_value_locked(&uval, uaddr);
+ if (err)
+ goto handle_err;
+
+ for (;;) {
+ newval = (uval & FUTEX_OWNER_DIED) | newtid;
+
+ err = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
+ if (err)
+ goto handle_err;
+
+ if (curval == uval)
+ break;
+ uval = curval;
+ }
+
+ /*
+ * We fixed up user space. Now we need to fix the pi_state
+ * itself.
+ */
+ pi_state_update_owner(pi_state, newowner);
+
+ return argowner == current;
+
+ /*
+ * In order to reschedule or handle a page fault, we need to drop the
+ * locks here. In the case of a fault, this gives the other task
+ * (either the highest priority waiter itself or the task which stole
+ * the rtmutex) the chance to try the fixup of the pi_state. So once we
+ * are back from handling the fault we need to check the pi_state after
+ * reacquiring the locks and before trying to do another fixup. When
+ * the fixup has been done already we simply return.
+ *
+ * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely
+ * drop hb->lock since the caller owns the hb -> futex_q relation.
+ * Dropping the pi_mutex->wait_lock requires the state revalidate.
+ */
+handle_err:
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+ spin_unlock(q->lock_ptr);
+
+ switch (err) {
+ case -EFAULT:
+ err = fault_in_user_writeable(uaddr);
+ break;
+
+ case -EAGAIN:
+ cond_resched();
+ err = 0;
+ break;
+
+ default:
+ WARN_ON_ONCE(1);
+ break;
+ }
+
+ spin_lock(q->lock_ptr);
+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+
+ /*
+ * Check if someone else fixed it for us:
+ */
+ if (pi_state->owner != oldowner)
+ return argowner == current;
+
+ /* Retry if err was -EAGAIN or the fault in succeeded */
+ if (!err)
+ goto retry;
+
+ /*
+ * fault_in_user_writeable() failed so user state is immutable. At
+ * best we can make the kernel state consistent but user state will
+ * be most likely hosed and any subsequent unlock operation will be
+ * rejected due to PI futex rule [10].
+ *
+ * Ensure that the rtmutex owner is also the pi_state owner despite
+ * the user space value claiming something different. There is no
+ * point in unlocking the rtmutex if current is the owner as it
+ * would need to wait until the next waiter has taken the rtmutex
+ * to guarantee consistent state. Keep it simple. Userspace asked
+ * for this wreckaged state.
+ *
+ * The rtmutex has an owner - either current or some other
+ * task. See the EAGAIN loop above.
+ */
+ pi_state_update_owner(pi_state, rt_mutex_owner(&pi_state->pi_mutex));
+
+ return err;
+}
+
+static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
+ struct task_struct *argowner)
+{
+ struct futex_pi_state *pi_state = q->pi_state;
+ int ret;
+
+ lockdep_assert_held(q->lock_ptr);
+
+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+ ret = __fixup_pi_state_owner(uaddr, q, argowner);
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+ return ret;
+}
+
+/**
+ * fixup_pi_owner() - Post lock pi_state and corner case management
+ * @uaddr: user address of the futex
+ * @q: futex_q (contains pi_state and access to the rt_mutex)
+ * @locked: if the attempt to take the rt_mutex succeeded (1) or not (0)
+ *
+ * After attempting to lock an rt_mutex, this function is called to cleanup
+ * the pi_state owner as well as handle race conditions that may allow us to
+ * acquire the lock. Must be called with the hb lock held.
+ *
+ * Return:
+ * - 1 - success, lock taken;
+ * - 0 - success, lock not taken;
+ * - <0 - on error (-EFAULT)
+ */
+int fixup_pi_owner(u32 __user *uaddr, struct futex_q *q, int locked)
+{
+ if (locked) {
+ /*
+ * Got the lock. We might not be the anticipated owner if we
+ * did a lock-steal - fix up the PI-state in that case:
+ *
+ * Speculative pi_state->owner read (we don't hold wait_lock);
+ * since we own the lock pi_state->owner == current is the
+ * stable state, anything else needs more attention.
+ */
+ if (q->pi_state->owner != current)
+ return fixup_pi_state_owner(uaddr, q, current);
+ return 1;
+ }
+
+ /*
+ * If we didn't get the lock; check if anybody stole it from us. In
+ * that case, we need to fix up the uval to point to them instead of
+ * us, otherwise bad things happen. [10]
+ *
+ * Another speculative read; pi_state->owner == current is unstable
+ * but needs our attention.
+ */
+ if (q->pi_state->owner == current)
+ return fixup_pi_state_owner(uaddr, q, NULL);
+
+ /*
+ * Paranoia check. If we did not take the lock, then we should not be
+ * the owner of the rt_mutex. Warn and establish consistent state.
+ */
+ if (WARN_ON_ONCE(rt_mutex_owner(&q->pi_state->pi_mutex) == current))
+ return fixup_pi_state_owner(uaddr, q, current);
+
+ return 0;
+}
+
+/*
+ * Userspace tried a 0 -> TID atomic transition of the futex value
+ * and failed. The kernel side here does the whole locking operation:
+ * if there are waiters then it will block as a consequence of relying
+ * on rt-mutexes, it does PI, etc. (Due to races the kernel might see
+ * a 0 value of the futex too.).
+ *
+ * Also serves as futex trylock_pi()'ing, and due semantics.
+ */
+int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock)
+{
+ struct hrtimer_sleeper timeout, *to;
+ struct task_struct *exiting = NULL;
+ struct rt_mutex_waiter rt_waiter;
+ struct futex_hash_bucket *hb;
+ struct futex_q q = futex_q_init;
+ int res, ret;
+
+ if (!IS_ENABLED(CONFIG_FUTEX_PI))
+ return -ENOSYS;
+
+ if (refill_pi_state_cache())
+ return -ENOMEM;
+
+ to = futex_setup_timer(time, &timeout, flags, 0);
+
+retry:
+ ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, FUTEX_WRITE);
+ if (unlikely(ret != 0))
+ goto out;
+
+retry_private:
+ hb = futex_q_lock(&q);
+
+ ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current,
+ &exiting, 0);
+ if (unlikely(ret)) {
+ /*
+ * Atomic work succeeded and we got the lock,
+ * or failed. Either way, we do _not_ block.
+ */
+ switch (ret) {
+ case 1:
+ /* We got the lock. */
+ ret = 0;
+ goto out_unlock_put_key;
+ case -EFAULT:
+ goto uaddr_faulted;
+ case -EBUSY:
+ case -EAGAIN:
+ /*
+ * Two reasons for this:
+ * - EBUSY: Task is exiting and we just wait for the
+ * exit to complete.
+ * - EAGAIN: The user space value changed.
+ */
+ futex_q_unlock(hb);
+ /*
+ * Handle the case where the owner is in the middle of
+ * exiting. Wait for the exit to complete otherwise
+ * this task might loop forever, aka. live lock.
+ */
+ wait_for_owner_exiting(ret, exiting);
+ cond_resched();
+ goto retry;
+ default:
+ goto out_unlock_put_key;
+ }
+ }
+
+ WARN_ON(!q.pi_state);
+
+ /*
+ * Only actually queue now that the atomic ops are done:
+ */
+ __futex_queue(&q, hb);
+
+ if (trylock) {
+ ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex);
+ /* Fixup the trylock return value: */
+ ret = ret ? 0 : -EWOULDBLOCK;
+ goto no_block;
+ }
+
+ rt_mutex_init_waiter(&rt_waiter);
+
+ /*
+ * On PREEMPT_RT, when hb->lock becomes an rt_mutex, we must not
+ * hold it while doing rt_mutex_start_proxy(), because then it will
+ * include hb->lock in the blocking chain, even through we'll not in
+ * fact hold it while blocking. This will lead it to report -EDEADLK
+ * and BUG when futex_unlock_pi() interleaves with this.
+ *
+ * Therefore acquire wait_lock while holding hb->lock, but drop the
+ * latter before calling __rt_mutex_start_proxy_lock(). This
+ * interleaves with futex_unlock_pi() -- which does a similar lock
+ * handoff -- such that the latter can observe the futex_q::pi_state
+ * before __rt_mutex_start_proxy_lock() is done.
+ */
+ raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
+ spin_unlock(q.lock_ptr);
+ /*
+ * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter
+ * such that futex_unlock_pi() is guaranteed to observe the waiter when
+ * it sees the futex_q::pi_state.
+ */
+ ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
+ raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);
+
+ if (ret) {
+ if (ret == 1)
+ ret = 0;
+ goto cleanup;
+ }
+
+ if (unlikely(to))
+ hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS);
+
+ ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);
+
+cleanup:
+ spin_lock(q.lock_ptr);
+ /*
+ * If we failed to acquire the lock (deadlock/signal/timeout), we must
+ * first acquire the hb->lock before removing the lock from the
+ * rt_mutex waitqueue, such that we can keep the hb and rt_mutex wait
+ * lists consistent.
+ *
+ * In particular; it is important that futex_unlock_pi() can not
+ * observe this inconsistency.
+ */
+ if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
+ ret = 0;
+
+no_block:
+ /*
+ * Fixup the pi_state owner and possibly acquire the lock if we
+ * haven't already.
+ */
+ res = fixup_pi_owner(uaddr, &q, !ret);
+ /*
+ * If fixup_pi_owner() returned an error, propagate that. If it acquired
+ * the lock, clear our -ETIMEDOUT or -EINTR.
+ */
+ if (res)
+ ret = (res < 0) ? res : 0;
+
+ futex_unqueue_pi(&q);
+ spin_unlock(q.lock_ptr);
+ goto out;
+
+out_unlock_put_key:
+ futex_q_unlock(hb);
+
+out:
+ if (to) {
+ hrtimer_cancel(&to->timer);
+ destroy_hrtimer_on_stack(&to->timer);
+ }
+ return ret != -EINTR ? ret : -ERESTARTNOINTR;
+
+uaddr_faulted:
+ futex_q_unlock(hb);
+
+ ret = fault_in_user_writeable(uaddr);
+ if (ret)
+ goto out;
+
+ if (!(flags & FLAGS_SHARED))
+ goto retry_private;
+
+ goto retry;
+}
+
+/*
+ * Userspace attempted a TID -> 0 atomic transition, and failed.
+ * This is the in-kernel slowpath: we look up the PI state (if any),
+ * and do the rt-mutex unlock.
+ */
+int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
+{
+ u32 curval, uval, vpid = task_pid_vnr(current);
+ union futex_key key = FUTEX_KEY_INIT;
+ struct futex_hash_bucket *hb;
+ struct futex_q *top_waiter;
+ int ret;
+
+ if (!IS_ENABLED(CONFIG_FUTEX_PI))
+ return -ENOSYS;
+
+retry:
+ if (get_user(uval, uaddr))
+ return -EFAULT;
+ /*
+ * We release only a lock we actually own:
+ */
+ if ((uval & FUTEX_TID_MASK) != vpid)
+ return -EPERM;
+
+ ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_WRITE);
+ if (ret)
+ return ret;
+
+ hb = futex_hash(&key);
+ spin_lock(&hb->lock);
+
+ /*
+ * Check waiters first. We do not trust user space values at
+ * all and we at least want to know if user space fiddled
+ * with the futex value instead of blindly unlocking.
+ */
+ top_waiter = futex_top_waiter(hb, &key);
+ if (top_waiter) {
+ struct futex_pi_state *pi_state = top_waiter->pi_state;
+
+ ret = -EINVAL;
+ if (!pi_state)
+ goto out_unlock;
+
+ /*
+ * If current does not own the pi_state then the futex is
+ * inconsistent and user space fiddled with the futex value.
+ */
+ if (pi_state->owner != current)
+ goto out_unlock;
+
+ get_pi_state(pi_state);
+ /*
+ * By taking wait_lock while still holding hb->lock, we ensure
+ * there is no point where we hold neither; and therefore
+ * wake_futex_p() must observe a state consistent with what we
+ * observed.
+ *
+ * In particular; this forces __rt_mutex_start_proxy() to
+ * complete such that we're guaranteed to observe the
+ * rt_waiter. Also see the WARN in wake_futex_pi().
+ */
+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+ spin_unlock(&hb->lock);
+
+ /* drops pi_state->pi_mutex.wait_lock */
+ ret = wake_futex_pi(uaddr, uval, pi_state);
+
+ put_pi_state(pi_state);
+
+ /*
+ * Success, we're done! No tricky corner cases.
+ */
+ if (!ret)
+ return ret;
+ /*
+ * The atomic access to the futex value generated a
+ * pagefault, so retry the user-access and the wakeup:
+ */
+ if (ret == -EFAULT)
+ goto pi_faulted;
+ /*
+ * A unconditional UNLOCK_PI op raced against a waiter
+ * setting the FUTEX_WAITERS bit. Try again.
+ */
+ if (ret == -EAGAIN)
+ goto pi_retry;
+ /*
+ * wake_futex_pi has detected invalid state. Tell user
+ * space.
+ */
+ return ret;
+ }
+
+ /*
+ * We have no kernel internal state, i.e. no waiters in the
+ * kernel. Waiters which are about to queue themselves are stuck
+ * on hb->lock. So we can safely ignore them. We do neither
+ * preserve the WAITERS bit not the OWNER_DIED one. We are the
+ * owner.
+ */
+ if ((ret = futex_cmpxchg_value_locked(&curval, uaddr, uval, 0))) {
+ spin_unlock(&hb->lock);
+ switch (ret) {
+ case -EFAULT:
+ goto pi_faulted;
+
+ case -EAGAIN:
+ goto pi_retry;
+
+ default:
+ WARN_ON_ONCE(1);
+ return ret;
+ }
+ }
+
+ /*
+ * If uval has changed, let user space handle it.
+ */
+ ret = (curval == uval) ? 0 : -EAGAIN;
+
+out_unlock:
+ spin_unlock(&hb->lock);
+ return ret;
+
+pi_retry:
+ cond_resched();
+ goto retry;
+
+pi_faulted:
+
+ ret = fault_in_user_writeable(uaddr);
+ if (!ret)
+ goto retry;
+
+ return ret;
+}
+
diff --git a/kernel/futex/requeue.c b/kernel/futex/requeue.c
new file mode 100644
index 000000000000..cba8b1a6a4cc
--- /dev/null
+++ b/kernel/futex/requeue.c
@@ -0,0 +1,897 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <linux/sched/signal.h>
+
+#include "futex.h"
+#include "../locking/rtmutex_common.h"
+
+/*
+ * On PREEMPT_RT, the hash bucket lock is a 'sleeping' spinlock with an
+ * underlying rtmutex. The task which is about to be requeued could have
+ * just woken up (timeout, signal). After the wake up the task has to
+ * acquire hash bucket lock, which is held by the requeue code. As a task
+ * can only be blocked on _ONE_ rtmutex at a time, the proxy lock blocking
+ * and the hash bucket lock blocking would collide and corrupt state.
+ *
+ * On !PREEMPT_RT this is not a problem and everything could be serialized
+ * on hash bucket lock, but aside of having the benefit of common code,
+ * this allows to avoid doing the requeue when the task is already on the
+ * way out and taking the hash bucket lock of the original uaddr1 when the
+ * requeue has been completed.
+ *
+ * The following state transitions are valid:
+ *
+ * On the waiter side:
+ * Q_REQUEUE_PI_NONE -> Q_REQUEUE_PI_IGNORE
+ * Q_REQUEUE_PI_IN_PROGRESS -> Q_REQUEUE_PI_WAIT
+ *
+ * On the requeue side:
+ * Q_REQUEUE_PI_NONE -> Q_REQUEUE_PI_INPROGRESS
+ * Q_REQUEUE_PI_IN_PROGRESS -> Q_REQUEUE_PI_DONE/LOCKED
+ * Q_REQUEUE_PI_IN_PROGRESS -> Q_REQUEUE_PI_NONE (requeue failed)
+ * Q_REQUEUE_PI_WAIT -> Q_REQUEUE_PI_DONE/LOCKED
+ * Q_REQUEUE_PI_WAIT -> Q_REQUEUE_PI_IGNORE (requeue failed)
+ *
+ * The requeue side ignores a waiter with state Q_REQUEUE_PI_IGNORE as this
+ * signals that the waiter is already on the way out. It also means that
+ * the waiter is still on the 'wait' futex, i.e. uaddr1.
+ *
+ * The waiter side signals early wakeup to the requeue side either through
+ * setting state to Q_REQUEUE_PI_IGNORE or to Q_REQUEUE_PI_WAIT depending
+ * on the current state. In case of Q_REQUEUE_PI_IGNORE it can immediately
+ * proceed to take the hash bucket lock of uaddr1. If it set state to WAIT,
+ * which means the wakeup is interleaving with a requeue in progress it has
+ * to wait for the requeue side to change the state. Either to DONE/LOCKED
+ * or to IGNORE. DONE/LOCKED means the waiter q is now on the uaddr2 futex
+ * and either blocked (DONE) or has acquired it (LOCKED). IGNORE is set by
+ * the requeue side when the requeue attempt failed via deadlock detection
+ * and therefore the waiter q is still on the uaddr1 futex.
+ */
+enum {
+ Q_REQUEUE_PI_NONE = 0,
+ Q_REQUEUE_PI_IGNORE,
+ Q_REQUEUE_PI_IN_PROGRESS,
+ Q_REQUEUE_PI_WAIT,
+ Q_REQUEUE_PI_DONE,
+ Q_REQUEUE_PI_LOCKED,
+};
+
+const struct futex_q futex_q_init = {
+ /* list gets initialized in futex_queue()*/
+ .key = FUTEX_KEY_INIT,
+ .bitset = FUTEX_BITSET_MATCH_ANY,
+ .requeue_state = ATOMIC_INIT(Q_REQUEUE_PI_NONE),
+};
+
+/**
+ * requeue_futex() - Requeue a futex_q from one hb to another
+ * @q: the futex_q to requeue
+ * @hb1: the source hash_bucket
+ * @hb2: the target hash_bucket
+ * @key2: the new key for the requeued futex_q
+ */
+static inline
+void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
+ struct futex_hash_bucket *hb2, union futex_key *key2)
+{
+
+ /*
+ * If key1 and key2 hash to the same bucket, no need to
+ * requeue.
+ */
+ if (likely(&hb1->chain != &hb2->chain)) {
+ plist_del(&q->list, &hb1->chain);
+ futex_hb_waiters_dec(hb1);
+ futex_hb_waiters_inc(hb2);
+ plist_add(&q->list, &hb2->chain);
+ q->lock_ptr = &hb2->lock;
+ }
+ q->key = *key2;
+}
+
+static inline bool futex_requeue_pi_prepare(struct futex_q *q,
+ struct futex_pi_state *pi_state)
+{
+ int old, new;
+
+ /*
+ * Set state to Q_REQUEUE_PI_IN_PROGRESS unless an early wakeup has
+ * already set Q_REQUEUE_PI_IGNORE to signal that requeue should
+ * ignore the waiter.
+ */
+ old = atomic_read_acquire(&q->requeue_state);
+ do {
+ if (old == Q_REQUEUE_PI_IGNORE)
+ return false;
+
+ /*
+ * futex_proxy_trylock_atomic() might have set it to
+ * IN_PROGRESS and a interleaved early wake to WAIT.
+ *
+ * It was considered to have an extra state for that
+ * trylock, but that would just add more conditionals
+ * all over the place for a dubious value.
+ */
+ if (old != Q_REQUEUE_PI_NONE)
+ break;
+
+ new = Q_REQUEUE_PI_IN_PROGRESS;
+ } while (!atomic_try_cmpxchg(&q->requeue_state, &old, new));
+
+ q->pi_state = pi_state;
+ return true;
+}
+
+static inline void futex_requeue_pi_complete(struct futex_q *q, int locked)
+{
+ int old, new;
+
+ old = atomic_read_acquire(&q->requeue_state);
+ do {
+ if (old == Q_REQUEUE_PI_IGNORE)
+ return;
+
+ if (locked >= 0) {
+ /* Requeue succeeded. Set DONE or LOCKED */
+ WARN_ON_ONCE(old != Q_REQUEUE_PI_IN_PROGRESS &&
+ old != Q_REQUEUE_PI_WAIT);
+ new = Q_REQUEUE_PI_DONE + locked;
+ } else if (old == Q_REQUEUE_PI_IN_PROGRESS) {
+ /* Deadlock, no early wakeup interleave */
+ new = Q_REQUEUE_PI_NONE;
+ } else {
+ /* Deadlock, early wakeup interleave. */
+ WARN_ON_ONCE(old != Q_REQUEUE_PI_WAIT);
+ new = Q_REQUEUE_PI_IGNORE;
+ }
+ } while (!atomic_try_cmpxchg(&q->requeue_state, &old, new));
+
+#ifdef CONFIG_PREEMPT_RT
+ /* If the waiter interleaved with the requeue let it know */
+ if (unlikely(old == Q_REQUEUE_PI_WAIT))
+ rcuwait_wake_up(&q->requeue_wait);
+#endif
+}
+
+static inline int futex_requeue_pi_wakeup_sync(struct futex_q *q)
+{
+ int old, new;
+
+ old = atomic_read_acquire(&q->requeue_state);
+ do {
+ /* Is requeue done already? */
+ if (old >= Q_REQUEUE_PI_DONE)
+ return old;
+
+ /*
+ * If not done, then tell the requeue code to either ignore
+ * the waiter or to wake it up once the requeue is done.
+ */
+ new = Q_REQUEUE_PI_WAIT;
+ if (old == Q_REQUEUE_PI_NONE)
+ new = Q_REQUEUE_PI_IGNORE;
+ } while (!atomic_try_cmpxchg(&q->requeue_state, &old, new));
+
+ /* If the requeue was in progress, wait for it to complete */
+ if (old == Q_REQUEUE_PI_IN_PROGRESS) {
+#ifdef CONFIG_PREEMPT_RT
+ rcuwait_wait_event(&q->requeue_wait,
+ atomic_read(&q->requeue_state) != Q_REQUEUE_PI_WAIT,
+ TASK_UNINTERRUPTIBLE);
+#else
+ (void)atomic_cond_read_relaxed(&q->requeue_state, VAL != Q_REQUEUE_PI_WAIT);
+#endif
+ }
+
+ /*
+ * Requeue is now either prohibited or complete. Reread state
+ * because during the wait above it might have changed. Nothing
+ * will modify q->requeue_state after this point.
+ */
+ return atomic_read(&q->requeue_state);
+}
+
+/**
+ * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue
+ * @q: the futex_q
+ * @key: the key of the requeue target futex
+ * @hb: the hash_bucket of the requeue target futex
+ *
+ * During futex_requeue, with requeue_pi=1, it is possible to acquire the
+ * target futex if it is uncontended or via a lock steal.
+ *
+ * 1) Set @q::key to the requeue target futex key so the waiter can detect
+ * the wakeup on the right futex.
+ *
+ * 2) Dequeue @q from the hash bucket.
+ *
+ * 3) Set @q::rt_waiter to NULL so the woken up task can detect atomic lock
+ * acquisition.
+ *
+ * 4) Set the q->lock_ptr to the requeue target hb->lock for the case that
+ * the waiter has to fixup the pi state.
+ *
+ * 5) Complete the requeue state so the waiter can make progress. After
+ * this point the waiter task can return from the syscall immediately in
+ * case that the pi state does not have to be fixed up.
+ *
+ * 6) Wake the waiter task.
+ *
+ * Must be called with both q->lock_ptr and hb->lock held.
+ */
+static inline
+void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
+ struct futex_hash_bucket *hb)
+{
+ q->key = *key;
+
+ __futex_unqueue(q);
+
+ WARN_ON(!q->rt_waiter);
+ q->rt_waiter = NULL;
+
+ q->lock_ptr = &hb->lock;
+
+ /* Signal locked state to the waiter */
+ futex_requeue_pi_complete(q, 1);
+ wake_up_state(q->task, TASK_NORMAL);
+}
+
+/**
+ * futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter
+ * @pifutex: the user address of the to futex
+ * @hb1: the from futex hash bucket, must be locked by the caller
+ * @hb2: the to futex hash bucket, must be locked by the caller
+ * @key1: the from futex key
+ * @key2: the to futex key
+ * @ps: address to store the pi_state pointer
+ * @exiting: Pointer to store the task pointer of the owner task
+ * which is in the middle of exiting
+ * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)
+ *
+ * Try and get the lock on behalf of the top waiter if we can do it atomically.
+ * Wake the top waiter if we succeed. If the caller specified set_waiters,
+ * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit.
+ * hb1 and hb2 must be held by the caller.
+ *
+ * @exiting is only set when the return value is -EBUSY. If so, this holds
+ * a refcount on the exiting task on return and the caller needs to drop it
+ * after waiting for the exit to complete.
+ *
+ * Return:
+ * - 0 - failed to acquire the lock atomically;
+ * - >0 - acquired the lock, return value is vpid of the top_waiter
+ * - <0 - error
+ */
+static int
+futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1,
+ struct futex_hash_bucket *hb2, union futex_key *key1,
+ union futex_key *key2, struct futex_pi_state **ps,
+ struct task_struct **exiting, int set_waiters)
+{
+ struct futex_q *top_waiter = NULL;
+ u32 curval;
+ int ret;
+
+ if (futex_get_value_locked(&curval, pifutex))
+ return -EFAULT;
+
+ if (unlikely(should_fail_futex(true)))
+ return -EFAULT;
+
+ /*
+ * Find the top_waiter and determine if there are additional waiters.
+ * If the caller intends to requeue more than 1 waiter to pifutex,
+ * force futex_lock_pi_atomic() to set the FUTEX_WAITERS bit now,
+ * as we have means to handle the possible fault. If not, don't set
+ * the bit unnecessarily as it will force the subsequent unlock to enter
+ * the kernel.
+ */
+ top_waiter = futex_top_waiter(hb1, key1);
+
+ /* There are no waiters, nothing for us to do. */
+ if (!top_waiter)
+ return 0;
+
+ /*
+ * Ensure that this is a waiter sitting in futex_wait_requeue_pi()
+ * and waiting on the 'waitqueue' futex which is always !PI.
+ */
+ if (!top_waiter->rt_waiter || top_waiter->pi_state)
+ return -EINVAL;
+
+ /* Ensure we requeue to the expected futex. */
+ if (!futex_match(top_waiter->requeue_pi_key, key2))
+ return -EINVAL;
+
+ /* Ensure that this does not race against an early wakeup */
+ if (!futex_requeue_pi_prepare(top_waiter, NULL))
+ return -EAGAIN;
+
+ /*
+ * Try to take the lock for top_waiter and set the FUTEX_WAITERS bit
+ * in the contended case or if @set_waiters is true.
+ *
+ * In the contended case PI state is attached to the lock owner. If
+ * the user space lock can be acquired then PI state is attached to
+ * the new owner (@top_waiter->task) when @set_waiters is true.
+ */
+ ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
+ exiting, set_waiters);
+ if (ret == 1) {
+ /*
+ * Lock was acquired in user space and PI state was
+ * attached to @top_waiter->task. That means state is fully
+ * consistent and the waiter can return to user space
+ * immediately after the wakeup.
+ */
+ requeue_pi_wake_futex(top_waiter, key2, hb2);
+ } else if (ret < 0) {
+ /* Rewind top_waiter::requeue_state */
+ futex_requeue_pi_complete(top_waiter, ret);
+ } else {
+ /*
+ * futex_lock_pi_atomic() did not acquire the user space
+ * futex, but managed to establish the proxy lock and pi
+ * state. top_waiter::requeue_state cannot be fixed up here
+ * because the waiter is not enqueued on the rtmutex
+ * yet. This is handled at the callsite depending on the
+ * result of rt_mutex_start_proxy_lock() which is
+ * guaranteed to be reached with this function returning 0.
+ */
+ }
+ return ret;
+}
+
+/**
+ * futex_requeue() - Requeue waiters from uaddr1 to uaddr2
+ * @uaddr1: source futex user address
+ * @flags: futex flags (FLAGS_SHARED, etc.)
+ * @uaddr2: target futex user address
+ * @nr_wake: number of waiters to wake (must be 1 for requeue_pi)
+ * @nr_requeue: number of waiters to requeue (0-INT_MAX)
+ * @cmpval: @uaddr1 expected value (or %NULL)
+ * @requeue_pi: if we are attempting to requeue from a non-pi futex to a
+ * pi futex (pi to pi requeue is not supported)
+ *
+ * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
+ * uaddr2 atomically on behalf of the top waiter.
+ *
+ * Return:
+ * - >=0 - on success, the number of tasks requeued or woken;
+ * - <0 - on error
+ */
+int futex_requeue(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
+ int nr_wake, int nr_requeue, u32 *cmpval, int requeue_pi)
+{
+ union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
+ int task_count = 0, ret;
+ struct futex_pi_state *pi_state = NULL;
+ struct futex_hash_bucket *hb1, *hb2;
+ struct futex_q *this, *next;
+ DEFINE_WAKE_Q(wake_q);
+
+ if (nr_wake < 0 || nr_requeue < 0)
+ return -EINVAL;
+
+ /*
+ * When PI not supported: return -ENOSYS if requeue_pi is true,
+ * consequently the compiler knows requeue_pi is always false past
+ * this point which will optimize away all the conditional code
+ * further down.
+ */
+ if (!IS_ENABLED(CONFIG_FUTEX_PI) && requeue_pi)
+ return -ENOSYS;
+
+ if (requeue_pi) {
+ /*
+ * Requeue PI only works on two distinct uaddrs. This
+ * check is only valid for private futexes. See below.
+ */
+ if (uaddr1 == uaddr2)
+ return -EINVAL;
+
+ /*
+ * futex_requeue() allows the caller to define the number
+ * of waiters to wake up via the @nr_wake argument. With
+ * REQUEUE_PI, waking up more than one waiter is creating
+ * more problems than it solves. Waking up a waiter makes
+ * only sense if the PI futex @uaddr2 is uncontended as
+ * this allows the requeue code to acquire the futex
+ * @uaddr2 before waking the waiter. The waiter can then
+ * return to user space without further action. A secondary
+ * wakeup would just make the futex_wait_requeue_pi()
+ * handling more complex, because that code would have to
+ * look up pi_state and do more or less all the handling
+ * which the requeue code has to do for the to be requeued
+ * waiters. So restrict the number of waiters to wake to
+ * one, and only wake it up when the PI futex is
+ * uncontended. Otherwise requeue it and let the unlock of
+ * the PI futex handle the wakeup.
+ *
+ * All REQUEUE_PI users, e.g. pthread_cond_signal() and
+ * pthread_cond_broadcast() must use nr_wake=1.
+ */
+ if (nr_wake != 1)
+ return -EINVAL;
+
+ /*
+ * requeue_pi requires a pi_state, try to allocate it now
+ * without any locks in case it fails.
+ */
+ if (refill_pi_state_cache())
+ return -ENOMEM;
+ }
+
+retry:
+ ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ);
+ if (unlikely(ret != 0))
+ return ret;
+ ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2,
+ requeue_pi ? FUTEX_WRITE : FUTEX_READ);
+ if (unlikely(ret != 0))
+ return ret;
+
+ /*
+ * The check above which compares uaddrs is not sufficient for
+ * shared futexes. We need to compare the keys:
+ */
+ if (requeue_pi && futex_match(&key1, &key2))
+ return -EINVAL;
+
+ hb1 = futex_hash(&key1);
+ hb2 = futex_hash(&key2);
+
+retry_private:
+ futex_hb_waiters_inc(hb2);
+ double_lock_hb(hb1, hb2);
+
+ if (likely(cmpval != NULL)) {
+ u32 curval;
+
+ ret = futex_get_value_locked(&curval, uaddr1);
+
+ if (unlikely(ret)) {
+ double_unlock_hb(hb1, hb2);
+ futex_hb_waiters_dec(hb2);
+
+ ret = get_user(curval, uaddr1);
+ if (ret)
+ return ret;
+
+ if (!(flags & FLAGS_SHARED))
+ goto retry_private;
+
+ goto retry;
+ }
+ if (curval != *cmpval) {
+ ret = -EAGAIN;
+ goto out_unlock;
+ }
+ }
+
+ if (requeue_pi) {
+ struct task_struct *exiting = NULL;
+
+ /*
+ * Attempt to acquire uaddr2 and wake the top waiter. If we
+ * intend to requeue waiters, force setting the FUTEX_WAITERS
+ * bit. We force this here where we are able to easily handle
+ * faults rather in the requeue loop below.
+ *
+ * Updates topwaiter::requeue_state if a top waiter exists.
+ */
+ ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1,
+ &key2, &pi_state,
+ &exiting, nr_requeue);
+
+ /*
+ * At this point the top_waiter has either taken uaddr2 or
+ * is waiting on it. In both cases pi_state has been
+ * established and an initial refcount on it. In case of an
+ * error there's nothing.
+ *
+ * The top waiter's requeue_state is up to date:
+ *
+ * - If the lock was acquired atomically (ret == 1), then
+ * the state is Q_REQUEUE_PI_LOCKED.
+ *
+ * The top waiter has been dequeued and woken up and can
+ * return to user space immediately. The kernel/user
+ * space state is consistent. In case that there must be
+ * more waiters requeued the WAITERS bit in the user
+ * space futex is set so the top waiter task has to go
+ * into the syscall slowpath to unlock the futex. This
+ * will block until this requeue operation has been
+ * completed and the hash bucket locks have been
+ * dropped.
+ *
+ * - If the trylock failed with an error (ret < 0) then
+ * the state is either Q_REQUEUE_PI_NONE, i.e. "nothing
+ * happened", or Q_REQUEUE_PI_IGNORE when there was an
+ * interleaved early wakeup.
+ *
+ * - If the trylock did not succeed (ret == 0) then the
+ * state is either Q_REQUEUE_PI_IN_PROGRESS or
+ * Q_REQUEUE_PI_WAIT if an early wakeup interleaved.
+ * This will be cleaned up in the loop below, which
+ * cannot fail because futex_proxy_trylock_atomic() did
+ * the same sanity checks for requeue_pi as the loop
+ * below does.
+ */
+ switch (ret) {
+ case 0:
+ /* We hold a reference on the pi state. */
+ break;
+
+ case 1:
+ /*
+ * futex_proxy_trylock_atomic() acquired the user space
+ * futex. Adjust task_count.
+ */
+ task_count++;
+ ret = 0;
+ break;
+
+ /*
+ * If the above failed, then pi_state is NULL and
+ * waiter::requeue_state is correct.
+ */
+ case -EFAULT:
+ double_unlock_hb(hb1, hb2);
+ futex_hb_waiters_dec(hb2);
+ ret = fault_in_user_writeable(uaddr2);
+ if (!ret)
+ goto retry;
+ return ret;
+ case -EBUSY:
+ case -EAGAIN:
+ /*
+ * Two reasons for this:
+ * - EBUSY: Owner is exiting and we just wait for the
+ * exit to complete.
+ * - EAGAIN: The user space value changed.
+ */
+ double_unlock_hb(hb1, hb2);
+ futex_hb_waiters_dec(hb2);
+ /*
+ * Handle the case where the owner is in the middle of
+ * exiting. Wait for the exit to complete otherwise
+ * this task might loop forever, aka. live lock.
+ */
+ wait_for_owner_exiting(ret, exiting);
+ cond_resched();
+ goto retry;
+ default:
+ goto out_unlock;
+ }
+ }
+
+ plist_for_each_entry_safe(this, next, &hb1->chain, list) {
+ if (task_count - nr_wake >= nr_requeue)
+ break;
+
+ if (!futex_match(&this->key, &key1))
+ continue;
+
+ /*
+ * FUTEX_WAIT_REQUEUE_PI and FUTEX_CMP_REQUEUE_PI should always
+ * be paired with each other and no other futex ops.
+ *
+ * We should never be requeueing a futex_q with a pi_state,
+ * which is awaiting a futex_unlock_pi().
+ */
+ if ((requeue_pi && !this->rt_waiter) ||
+ (!requeue_pi && this->rt_waiter) ||
+ this->pi_state) {
+ ret = -EINVAL;
+ break;
+ }
+
+ /* Plain futexes just wake or requeue and are done */
+ if (!requeue_pi) {
+ if (++task_count <= nr_wake)
+ futex_wake_mark(&wake_q, this);
+ else
+ requeue_futex(this, hb1, hb2, &key2);
+ continue;
+ }
+
+ /* Ensure we requeue to the expected futex for requeue_pi. */
+ if (!futex_match(this->requeue_pi_key, &key2)) {
+ ret = -EINVAL;
+ break;
+ }
+
+ /*
+ * Requeue nr_requeue waiters and possibly one more in the case
+ * of requeue_pi if we couldn't acquire the lock atomically.
+ *
+ * Prepare the waiter to take the rt_mutex. Take a refcount
+ * on the pi_state and store the pointer in the futex_q
+ * object of the waiter.
+ */
+ get_pi_state(pi_state);
+
+ /* Don't requeue when the waiter is already on the way out. */
+ if (!futex_requeue_pi_prepare(this, pi_state)) {
+ /*
+ * Early woken waiter signaled that it is on the
+ * way out. Drop the pi_state reference and try the
+ * next waiter. @this->pi_state is still NULL.
+ */
+ put_pi_state(pi_state);
+ continue;
+ }
+
+ ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
+ this->rt_waiter,
+ this->task);
+
+ if (ret == 1) {
+ /*
+ * We got the lock. We do neither drop the refcount
+ * on pi_state nor clear this->pi_state because the
+ * waiter needs the pi_state for cleaning up the
+ * user space value. It will drop the refcount
+ * after doing so. this::requeue_state is updated
+ * in the wakeup as well.
+ */
+ requeue_pi_wake_futex(this, &key2, hb2);
+ task_count++;
+ } else if (!ret) {
+ /* Waiter is queued, move it to hb2 */
+ requeue_futex(this, hb1, hb2, &key2);
+ futex_requeue_pi_complete(this, 0);
+ task_count++;
+ } else {
+ /*
+ * rt_mutex_start_proxy_lock() detected a potential
+ * deadlock when we tried to queue that waiter.
+ * Drop the pi_state reference which we took above
+ * and remove the pointer to the state from the
+ * waiters futex_q object.
+ */
+ this->pi_state = NULL;
+ put_pi_state(pi_state);
+ futex_requeue_pi_complete(this, ret);
+ /*
+ * We stop queueing more waiters and let user space
+ * deal with the mess.
+ */
+ break;
+ }
+ }
+
+ /*
+ * We took an extra initial reference to the pi_state in
+ * futex_proxy_trylock_atomic(). We need to drop it here again.
+ */
+ put_pi_state(pi_state);
+
+out_unlock:
+ double_unlock_hb(hb1, hb2);
+ wake_up_q(&wake_q);
+ futex_hb_waiters_dec(hb2);
+ return ret ? ret : task_count;
+}
+
+/**
+ * handle_early_requeue_pi_wakeup() - Handle early wakeup on the initial futex
+ * @hb: the hash_bucket futex_q was original enqueued on
+ * @q: the futex_q woken while waiting to be requeued
+ * @timeout: the timeout associated with the wait (NULL if none)
+ *
+ * Determine the cause for the early wakeup.
+ *
+ * Return:
+ * -EWOULDBLOCK or -ETIMEDOUT or -ERESTARTNOINTR
+ */
+static inline
+int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
+ struct futex_q *q,
+ struct hrtimer_sleeper *timeout)
+{
+ int ret;
+
+ /*
+ * With the hb lock held, we avoid races while we process the wakeup.
+ * We only need to hold hb (and not hb2) to ensure atomicity as the
+ * wakeup code can't change q.key from uaddr to uaddr2 if we hold hb.
+ * It can't be requeued from uaddr2 to something else since we don't
+ * support a PI aware source futex for requeue.
+ */
+ WARN_ON_ONCE(&hb->lock != q->lock_ptr);
+
+ /*
+ * We were woken prior to requeue by a timeout or a signal.
+ * Unqueue the futex_q and determine which it was.
+ */
+ plist_del(&q->list, &hb->chain);
+ futex_hb_waiters_dec(hb);
+
+ /* Handle spurious wakeups gracefully */
+ ret = -EWOULDBLOCK;
+ if (timeout && !timeout->task)
+ ret = -ETIMEDOUT;
+ else if (signal_pending(current))
+ ret = -ERESTARTNOINTR;
+ return ret;
+}
+
+/**
+ * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2
+ * @uaddr: the futex we initially wait on (non-pi)
+ * @flags: futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be
+ * the same type, no requeueing from private to shared, etc.
+ * @val: the expected value of uaddr
+ * @abs_time: absolute timeout
+ * @bitset: 32 bit wakeup bitset set by userspace, defaults to all
+ * @uaddr2: the pi futex we will take prior to returning to user-space
+ *
+ * The caller will wait on uaddr and will be requeued by futex_requeue() to
+ * uaddr2 which must be PI aware and unique from uaddr. Normal wakeup will wake
+ * on uaddr2 and complete the acquisition of the rt_mutex prior to returning to
+ * userspace. This ensures the rt_mutex maintains an owner when it has waiters;
+ * without one, the pi logic would not know which task to boost/deboost, if
+ * there was a need to.
+ *
+ * We call schedule in futex_wait_queue() when we enqueue and return there
+ * via the following--
+ * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue()
+ * 2) wakeup on uaddr2 after a requeue
+ * 3) signal
+ * 4) timeout
+ *
+ * If 3, cleanup and return -ERESTARTNOINTR.
+ *
+ * If 2, we may then block on trying to take the rt_mutex and return via:
+ * 5) successful lock
+ * 6) signal
+ * 7) timeout
+ * 8) other lock acquisition failure
+ *
+ * If 6, return -EWOULDBLOCK (restarting the syscall would do the same).
+ *
+ * If 4 or 7, we cleanup and return with -ETIMEDOUT.
+ *
+ * Return:
+ * - 0 - On success;
+ * - <0 - On error
+ */
+int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
+ u32 val, ktime_t *abs_time, u32 bitset,
+ u32 __user *uaddr2)
+{
+ struct hrtimer_sleeper timeout, *to;
+ struct rt_mutex_waiter rt_waiter;
+ struct futex_hash_bucket *hb;
+ union futex_key key2 = FUTEX_KEY_INIT;
+ struct futex_q q = futex_q_init;
+ struct rt_mutex_base *pi_mutex;
+ int res, ret;
+
+ if (!IS_ENABLED(CONFIG_FUTEX_PI))
+ return -ENOSYS;
+
+ if (uaddr == uaddr2)
+ return -EINVAL;
+
+ if (!bitset)
+ return -EINVAL;
+
+ to = futex_setup_timer(abs_time, &timeout, flags,
+ current->timer_slack_ns);
+
+ /*
+ * The waiter is allocated on our stack, manipulated by the requeue
+ * code while we sleep on uaddr.
+ */
+ rt_mutex_init_waiter(&rt_waiter);
+
+ ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE);
+ if (unlikely(ret != 0))
+ goto out;
+
+ q.bitset = bitset;
+ q.rt_waiter = &rt_waiter;
+ q.requeue_pi_key = &key2;
+
+ /*
+ * Prepare to wait on uaddr. On success, it holds hb->lock and q
+ * is initialized.
+ */
+ ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
+ if (ret)
+ goto out;
+
+ /*
+ * The check above which compares uaddrs is not sufficient for
+ * shared futexes. We need to compare the keys:
+ */
+ if (futex_match(&q.key, &key2)) {
+ futex_q_unlock(hb);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* Queue the futex_q, drop the hb lock, wait for wakeup. */
+ futex_wait_queue(hb, &q, to);
+
+ switch (futex_requeue_pi_wakeup_sync(&q)) {
+ case Q_REQUEUE_PI_IGNORE:
+ /* The waiter is still on uaddr1 */
+ spin_lock(&hb->lock);
+ ret = handle_early_requeue_pi_wakeup(hb, &q, to);
+ spin_unlock(&hb->lock);
+ break;
+
+ case Q_REQUEUE_PI_LOCKED:
+ /* The requeue acquired the lock */
+ if (q.pi_state && (q.pi_state->owner != current)) {
+ spin_lock(q.lock_ptr);
+ ret = fixup_pi_owner(uaddr2, &q, true);
+ /*
+ * Drop the reference to the pi state which the
+ * requeue_pi() code acquired for us.
+ */
+ put_pi_state(q.pi_state);
+ spin_unlock(q.lock_ptr);
+ /*
+ * Adjust the return value. It's either -EFAULT or
+ * success (1) but the caller expects 0 for success.
+ */
+ ret = ret < 0 ? ret : 0;
+ }
+ break;
+
+ case Q_REQUEUE_PI_DONE:
+ /* Requeue completed. Current is 'pi_blocked_on' the rtmutex */
+ pi_mutex = &q.pi_state->pi_mutex;
+ ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter);
+
+ /* Current is not longer pi_blocked_on */
+ spin_lock(q.lock_ptr);
+ if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter))
+ ret = 0;
+
+ debug_rt_mutex_free_waiter(&rt_waiter);
+ /*
+ * Fixup the pi_state owner and possibly acquire the lock if we
+ * haven't already.
+ */
+ res = fixup_pi_owner(uaddr2, &q, !ret);
+ /*
+ * If fixup_pi_owner() returned an error, propagate that. If it
+ * acquired the lock, clear -ETIMEDOUT or -EINTR.
+ */
+ if (res)
+ ret = (res < 0) ? res : 0;
+
+ futex_unqueue_pi(&q);
+ spin_unlock(q.lock_ptr);
+
+ if (ret == -EINTR) {
+ /*
+ * We've already been requeued, but cannot restart
+ * by calling futex_lock_pi() directly. We could
+ * restart this syscall, but it would detect that
+ * the user space "val" changed and return
+ * -EWOULDBLOCK. Save the overhead of the restart
+ * and return -EWOULDBLOCK directly.
+ */
+ ret = -EWOULDBLOCK;
+ }
+ break;
+ default:
+ BUG();
+ }
+
+out:
+ if (to) {
+ hrtimer_cancel(&to->timer);
+ destroy_hrtimer_on_stack(&to->timer);
+ }
+ return ret;
+}
+
diff --git a/kernel/futex/syscalls.c b/kernel/futex/syscalls.c
new file mode 100644
index 000000000000..a8074079b09e
--- /dev/null
+++ b/kernel/futex/syscalls.c
@@ -0,0 +1,379 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <linux/compat.h>
+#include <linux/syscalls.h>
+#include <linux/time_namespace.h>
+
+#include "futex.h"
+
+/*
+ * Support for robust futexes: the kernel cleans up held futexes at
+ * thread exit time.
+ *
+ * Implementation: user-space maintains a per-thread list of locks it
+ * is holding. Upon do_exit(), the kernel carefully walks this list,
+ * and marks all locks that are owned by this thread with the
+ * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is
+ * always manipulated with the lock held, so the list is private and
+ * per-thread. Userspace also maintains a per-thread 'list_op_pending'
+ * field, to allow the kernel to clean up if the thread dies after
+ * acquiring the lock, but just before it could have added itself to
+ * the list. There can only be one such pending lock.
+ */
+
+/**
+ * sys_set_robust_list() - Set the robust-futex list head of a task
+ * @head: pointer to the list-head
+ * @len: length of the list-head, as userspace expects
+ */
+SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head,
+ size_t, len)
+{
+ /*
+ * The kernel knows only one size for now:
+ */
+ if (unlikely(len != sizeof(*head)))
+ return -EINVAL;
+
+ current->robust_list = head;
+
+ return 0;
+}
+
+/**
+ * sys_get_robust_list() - Get the robust-futex list head of a task
+ * @pid: pid of the process [zero for current task]
+ * @head_ptr: pointer to a list-head pointer, the kernel fills it in
+ * @len_ptr: pointer to a length field, the kernel fills in the header size
+ */
+SYSCALL_DEFINE3(get_robust_list, int, pid,
+ struct robust_list_head __user * __user *, head_ptr,
+ size_t __user *, len_ptr)
+{
+ struct robust_list_head __user *head;
+ unsigned long ret;
+ struct task_struct *p;
+
+ rcu_read_lock();
+
+ ret = -ESRCH;
+ if (!pid)
+ p = current;
+ else {
+ p = find_task_by_vpid(pid);
+ if (!p)
+ goto err_unlock;
+ }
+
+ ret = -EPERM;
+ if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS))
+ goto err_unlock;
+
+ head = p->robust_list;
+ rcu_read_unlock();
+
+ if (put_user(sizeof(*head), len_ptr))
+ return -EFAULT;
+ return put_user(head, head_ptr);
+
+err_unlock:
+ rcu_read_unlock();
+
+ return ret;
+}
+
+long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
+ u32 __user *uaddr2, u32 val2, u32 val3)
+{
+ int cmd = op & FUTEX_CMD_MASK;
+ unsigned int flags = 0;
+
+ if (!(op & FUTEX_PRIVATE_FLAG))
+ flags |= FLAGS_SHARED;
+
+ if (op & FUTEX_CLOCK_REALTIME) {
+ flags |= FLAGS_CLOCKRT;
+ if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI &&
+ cmd != FUTEX_LOCK_PI2)
+ return -ENOSYS;
+ }
+
+ switch (cmd) {
+ case FUTEX_WAIT:
+ val3 = FUTEX_BITSET_MATCH_ANY;
+ fallthrough;
+ case FUTEX_WAIT_BITSET:
+ return futex_wait(uaddr, flags, val, timeout, val3);
+ case FUTEX_WAKE:
+ val3 = FUTEX_BITSET_MATCH_ANY;
+ fallthrough;
+ case FUTEX_WAKE_BITSET:
+ return futex_wake(uaddr, flags, val, val3);
+ case FUTEX_REQUEUE:
+ return futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0);
+ case FUTEX_CMP_REQUEUE:
+ return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0);
+ case FUTEX_WAKE_OP:
+ return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);
+ case FUTEX_LOCK_PI:
+ flags |= FLAGS_CLOCKRT;
+ fallthrough;
+ case FUTEX_LOCK_PI2:
+ return futex_lock_pi(uaddr, flags, timeout, 0);
+ case FUTEX_UNLOCK_PI:
+ return futex_unlock_pi(uaddr, flags);
+ case FUTEX_TRYLOCK_PI:
+ return futex_lock_pi(uaddr, flags, NULL, 1);
+ case FUTEX_WAIT_REQUEUE_PI:
+ val3 = FUTEX_BITSET_MATCH_ANY;
+ return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3,
+ uaddr2);
+ case FUTEX_CMP_REQUEUE_PI:
+ return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1);
+ }
+ return -ENOSYS;
+}
+
+static __always_inline bool futex_cmd_has_timeout(u32 cmd)
+{
+ switch (cmd) {
+ case FUTEX_WAIT:
+ case FUTEX_LOCK_PI:
+ case FUTEX_LOCK_PI2:
+ case FUTEX_WAIT_BITSET:
+ case FUTEX_WAIT_REQUEUE_PI:
+ return true;
+ }
+ return false;
+}
+
+static __always_inline int
+futex_init_timeout(u32 cmd, u32 op, struct timespec64 *ts, ktime_t *t)
+{
+ if (!timespec64_valid(ts))
+ return -EINVAL;
+
+ *t = timespec64_to_ktime(*ts);
+ if (cmd == FUTEX_WAIT)
+ *t = ktime_add_safe(ktime_get(), *t);
+ else if (cmd != FUTEX_LOCK_PI && !(op & FUTEX_CLOCK_REALTIME))
+ *t = timens_ktime_to_host(CLOCK_MONOTONIC, *t);
+ return 0;
+}
+
+SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
+ const struct __kernel_timespec __user *, utime,
+ u32 __user *, uaddr2, u32, val3)
+{
+ int ret, cmd = op & FUTEX_CMD_MASK;
+ ktime_t t, *tp = NULL;
+ struct timespec64 ts;
+
+ if (utime && futex_cmd_has_timeout(cmd)) {
+ if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG))))
+ return -EFAULT;
+ if (get_timespec64(&ts, utime))
+ return -EFAULT;
+ ret = futex_init_timeout(cmd, op, &ts, &t);
+ if (ret)
+ return ret;
+ tp = &t;
+ }
+
+ return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3);
+}
+
+/* Mask of available flags for each futex in futex_waitv list */
+#define FUTEXV_WAITER_MASK (FUTEX_32 | FUTEX_PRIVATE_FLAG)
+
+/**
+ * futex_parse_waitv - Parse a waitv array from userspace
+ * @futexv: Kernel side list of waiters to be filled
+ * @uwaitv: Userspace list to be parsed
+ * @nr_futexes: Length of futexv
+ *
+ * Return: Error code on failure, 0 on success
+ */
+static int futex_parse_waitv(struct futex_vector *futexv,
+ struct futex_waitv __user *uwaitv,
+ unsigned int nr_futexes)
+{
+ struct futex_waitv aux;
+ unsigned int i;
+
+ for (i = 0; i < nr_futexes; i++) {
+ if (copy_from_user(&aux, &uwaitv[i], sizeof(aux)))
+ return -EFAULT;
+
+ if ((aux.flags & ~FUTEXV_WAITER_MASK) || aux.__reserved)
+ return -EINVAL;
+
+ if (!(aux.flags & FUTEX_32))
+ return -EINVAL;
+
+ futexv[i].w.flags = aux.flags;
+ futexv[i].w.val = aux.val;
+ futexv[i].w.uaddr = aux.uaddr;
+ futexv[i].q = futex_q_init;
+ }
+
+ return 0;
+}
+
+/**
+ * sys_futex_waitv - Wait on a list of futexes
+ * @waiters: List of futexes to wait on
+ * @nr_futexes: Length of futexv
+ * @flags: Flag for timeout (monotonic/realtime)
+ * @timeout: Optional absolute timeout.
+ * @clockid: Clock to be used for the timeout, realtime or monotonic.
+ *
+ * Given an array of `struct futex_waitv`, wait on each uaddr. The thread wakes
+ * if a futex_wake() is performed at any uaddr. The syscall returns immediately
+ * if any waiter has *uaddr != val. *timeout is an optional timeout value for
+ * the operation. Each waiter has individual flags. The `flags` argument for
+ * the syscall should be used solely for specifying the timeout as realtime, if
+ * needed. Flags for private futexes, sizes, etc. should be used on the
+ * individual flags of each waiter.
+ *
+ * Returns the array index of one of the woken futexes. No further information
+ * is provided: any number of other futexes may also have been woken by the
+ * same event, and if more than one futex was woken, the retrned index may
+ * refer to any one of them. (It is not necessaryily the futex with the
+ * smallest index, nor the one most recently woken, nor...)
+ */
+
+SYSCALL_DEFINE5(futex_waitv, struct futex_waitv __user *, waiters,
+ unsigned int, nr_futexes, unsigned int, flags,
+ struct __kernel_timespec __user *, timeout, clockid_t, clockid)
+{
+ struct hrtimer_sleeper to;
+ struct futex_vector *futexv;
+ struct timespec64 ts;
+ ktime_t time;
+ int ret;
+
+ /* This syscall supports no flags for now */
+ if (flags)
+ return -EINVAL;
+
+ if (!nr_futexes || nr_futexes > FUTEX_WAITV_MAX || !waiters)
+ return -EINVAL;
+
+ if (timeout) {
+ int flag_clkid = 0, flag_init = 0;
+
+ if (clockid == CLOCK_REALTIME) {
+ flag_clkid = FLAGS_CLOCKRT;
+ flag_init = FUTEX_CLOCK_REALTIME;
+ }
+
+ if (clockid != CLOCK_REALTIME && clockid != CLOCK_MONOTONIC)
+ return -EINVAL;
+
+ if (get_timespec64(&ts, timeout))
+ return -EFAULT;
+
+ /*
+ * Since there's no opcode for futex_waitv, use
+ * FUTEX_WAIT_BITSET that uses absolute timeout as well
+ */
+ ret = futex_init_timeout(FUTEX_WAIT_BITSET, flag_init, &ts, &time);
+ if (ret)
+ return ret;
+
+ futex_setup_timer(&time, &to, flag_clkid, 0);
+ }
+
+ futexv = kcalloc(nr_futexes, sizeof(*futexv), GFP_KERNEL);
+ if (!futexv) {
+ ret = -ENOMEM;
+ goto destroy_timer;
+ }
+
+ ret = futex_parse_waitv(futexv, waiters, nr_futexes);
+ if (!ret)
+ ret = futex_wait_multiple(futexv, nr_futexes, timeout ? &to : NULL);
+
+ kfree(futexv);
+
+destroy_timer:
+ if (timeout) {
+ hrtimer_cancel(&to.timer);
+ destroy_hrtimer_on_stack(&to.timer);
+ }
+ return ret;
+}
+
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE2(set_robust_list,
+ struct compat_robust_list_head __user *, head,
+ compat_size_t, len)
+{
+ if (unlikely(len != sizeof(*head)))
+ return -EINVAL;
+
+ current->compat_robust_list = head;
+
+ return 0;
+}
+
+COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid,
+ compat_uptr_t __user *, head_ptr,
+ compat_size_t __user *, len_ptr)
+{
+ struct compat_robust_list_head __user *head;
+ unsigned long ret;
+ struct task_struct *p;
+
+ rcu_read_lock();
+
+ ret = -ESRCH;
+ if (!pid)
+ p = current;
+ else {
+ p = find_task_by_vpid(pid);
+ if (!p)
+ goto err_unlock;
+ }
+
+ ret = -EPERM;
+ if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS))
+ goto err_unlock;
+
+ head = p->compat_robust_list;
+ rcu_read_unlock();
+
+ if (put_user(sizeof(*head), len_ptr))
+ return -EFAULT;
+ return put_user(ptr_to_compat(head), head_ptr);
+
+err_unlock:
+ rcu_read_unlock();
+
+ return ret;
+}
+#endif /* CONFIG_COMPAT */
+
+#ifdef CONFIG_COMPAT_32BIT_TIME
+SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val,
+ const struct old_timespec32 __user *, utime, u32 __user *, uaddr2,
+ u32, val3)
+{
+ int ret, cmd = op & FUTEX_CMD_MASK;
+ ktime_t t, *tp = NULL;
+ struct timespec64 ts;
+
+ if (utime && futex_cmd_has_timeout(cmd)) {
+ if (get_old_timespec32(&ts, utime))
+ return -EFAULT;
+ ret = futex_init_timeout(cmd, op, &ts, &t);
+ if (ret)
+ return ret;
+ tp = &t;
+ }
+
+ return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3);
+}
+#endif /* CONFIG_COMPAT_32BIT_TIME */
+
diff --git a/kernel/futex/waitwake.c b/kernel/futex/waitwake.c
new file mode 100644
index 000000000000..ba01b9408203
--- /dev/null
+++ b/kernel/futex/waitwake.c
@@ -0,0 +1,708 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <linux/sched/task.h>
+#include <linux/sched/signal.h>
+#include <linux/freezer.h>
+
+#include "futex.h"
+
+/*
+ * READ this before attempting to hack on futexes!
+ *
+ * Basic futex operation and ordering guarantees
+ * =============================================
+ *
+ * The waiter reads the futex value in user space and calls
+ * futex_wait(). This function computes the hash bucket and acquires
+ * the hash bucket lock. After that it reads the futex user space value
+ * again and verifies that the data has not changed. If it has not changed
+ * it enqueues itself into the hash bucket, releases the hash bucket lock
+ * and schedules.
+ *
+ * The waker side modifies the user space value of the futex and calls
+ * futex_wake(). This function computes the hash bucket and acquires the
+ * hash bucket lock. Then it looks for waiters on that futex in the hash
+ * bucket and wakes them.
+ *
+ * In futex wake up scenarios where no tasks are blocked on a futex, taking
+ * the hb spinlock can be avoided and simply return. In order for this
+ * optimization to work, ordering guarantees must exist so that the waiter
+ * being added to the list is acknowledged when the list is concurrently being
+ * checked by the waker, avoiding scenarios like the following:
+ *
+ * CPU 0 CPU 1
+ * val = *futex;
+ * sys_futex(WAIT, futex, val);
+ * futex_wait(futex, val);
+ * uval = *futex;
+ * *futex = newval;
+ * sys_futex(WAKE, futex);
+ * futex_wake(futex);
+ * if (queue_empty())
+ * return;
+ * if (uval == val)
+ * lock(hash_bucket(futex));
+ * queue();
+ * unlock(hash_bucket(futex));
+ * schedule();
+ *
+ * This would cause the waiter on CPU 0 to wait forever because it
+ * missed the transition of the user space value from val to newval
+ * and the waker did not find the waiter in the hash bucket queue.
+ *
+ * The correct serialization ensures that a waiter either observes
+ * the changed user space value before blocking or is woken by a
+ * concurrent waker:
+ *
+ * CPU 0 CPU 1
+ * val = *futex;
+ * sys_futex(WAIT, futex, val);
+ * futex_wait(futex, val);
+ *
+ * waiters++; (a)
+ * smp_mb(); (A) <-- paired with -.
+ * |
+ * lock(hash_bucket(futex)); |
+ * |
+ * uval = *futex; |
+ * | *futex = newval;
+ * | sys_futex(WAKE, futex);
+ * | futex_wake(futex);
+ * |
+ * `--------> smp_mb(); (B)
+ * if (uval == val)
+ * queue();
+ * unlock(hash_bucket(futex));
+ * schedule(); if (waiters)
+ * lock(hash_bucket(futex));
+ * else wake_waiters(futex);
+ * waiters--; (b) unlock(hash_bucket(futex));
+ *
+ * Where (A) orders the waiters increment and the futex value read through
+ * atomic operations (see futex_hb_waiters_inc) and where (B) orders the write
+ * to futex and the waiters read (see futex_hb_waiters_pending()).
+ *
+ * This yields the following case (where X:=waiters, Y:=futex):
+ *
+ * X = Y = 0
+ *
+ * w[X]=1 w[Y]=1
+ * MB MB
+ * r[Y]=y r[X]=x
+ *
+ * Which guarantees that x==0 && y==0 is impossible; which translates back into
+ * the guarantee that we cannot both miss the futex variable change and the
+ * enqueue.
+ *
+ * Note that a new waiter is accounted for in (a) even when it is possible that
+ * the wait call can return error, in which case we backtrack from it in (b).
+ * Refer to the comment in futex_q_lock().
+ *
+ * Similarly, in order to account for waiters being requeued on another
+ * address we always increment the waiters for the destination bucket before
+ * acquiring the lock. It then decrements them again after releasing it -
+ * the code that actually moves the futex(es) between hash buckets (requeue_futex)
+ * will do the additional required waiter count housekeeping. This is done for
+ * double_lock_hb() and double_unlock_hb(), respectively.
+ */
+
+/*
+ * The hash bucket lock must be held when this is called.
+ * Afterwards, the futex_q must not be accessed. Callers
+ * must ensure to later call wake_up_q() for the actual
+ * wakeups to occur.
+ */
+void futex_wake_mark(struct wake_q_head *wake_q, struct futex_q *q)
+{
+ struct task_struct *p = q->task;
+
+ if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n"))
+ return;
+
+ get_task_struct(p);
+ __futex_unqueue(q);
+ /*
+ * The waiting task can free the futex_q as soon as q->lock_ptr = NULL
+ * is written, without taking any locks. This is possible in the event
+ * of a spurious wakeup, for example. A memory barrier is required here
+ * to prevent the following store to lock_ptr from getting ahead of the
+ * plist_del in __futex_unqueue().
+ */
+ smp_store_release(&q->lock_ptr, NULL);
+
+ /*
+ * Queue the task for later wakeup for after we've released
+ * the hb->lock.
+ */
+ wake_q_add_safe(wake_q, p);
+}
+
+/*
+ * Wake up waiters matching bitset queued on this futex (uaddr).
+ */
+int futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
+{
+ struct futex_hash_bucket *hb;
+ struct futex_q *this, *next;
+ union futex_key key = FUTEX_KEY_INIT;
+ int ret;
+ DEFINE_WAKE_Q(wake_q);
+
+ if (!bitset)
+ return -EINVAL;
+
+ ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_READ);
+ if (unlikely(ret != 0))
+ return ret;
+
+ hb = futex_hash(&key);
+
+ /* Make sure we really have tasks to wakeup */
+ if (!futex_hb_waiters_pending(hb))
+ return ret;
+
+ spin_lock(&hb->lock);
+
+ plist_for_each_entry_safe(this, next, &hb->chain, list) {
+ if (futex_match (&this->key, &key)) {
+ if (this->pi_state || this->rt_waiter) {
+ ret = -EINVAL;
+ break;
+ }
+
+ /* Check if one of the bits is set in both bitsets */
+ if (!(this->bitset & bitset))
+ continue;
+
+ futex_wake_mark(&wake_q, this);
+ if (++ret >= nr_wake)
+ break;
+ }
+ }
+
+ spin_unlock(&hb->lock);
+ wake_up_q(&wake_q);
+ return ret;
+}
+
+static int futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr)
+{
+ unsigned int op = (encoded_op & 0x70000000) >> 28;
+ unsigned int cmp = (encoded_op & 0x0f000000) >> 24;
+ int oparg = sign_extend32((encoded_op & 0x00fff000) >> 12, 11);
+ int cmparg = sign_extend32(encoded_op & 0x00000fff, 11);
+ int oldval, ret;
+
+ if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) {
+ if (oparg < 0 || oparg > 31) {
+ char comm[sizeof(current->comm)];
+ /*
+ * kill this print and return -EINVAL when userspace
+ * is sane again
+ */
+ pr_info_ratelimited("futex_wake_op: %s tries to shift op by %d; fix this program\n",
+ get_task_comm(comm, current), oparg);
+ oparg &= 31;
+ }
+ oparg = 1 << oparg;
+ }
+
+ pagefault_disable();
+ ret = arch_futex_atomic_op_inuser(op, oparg, &oldval, uaddr);
+ pagefault_enable();
+ if (ret)
+ return ret;
+
+ switch (cmp) {
+ case FUTEX_OP_CMP_EQ:
+ return oldval == cmparg;
+ case FUTEX_OP_CMP_NE:
+ return oldval != cmparg;
+ case FUTEX_OP_CMP_LT:
+ return oldval < cmparg;
+ case FUTEX_OP_CMP_GE:
+ return oldval >= cmparg;
+ case FUTEX_OP_CMP_LE:
+ return oldval <= cmparg;
+ case FUTEX_OP_CMP_GT:
+ return oldval > cmparg;
+ default:
+ return -ENOSYS;
+ }
+}
+
+/*
+ * Wake up all waiters hashed on the physical page that is mapped
+ * to this virtual address:
+ */
+int futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
+ int nr_wake, int nr_wake2, int op)
+{
+ union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
+ struct futex_hash_bucket *hb1, *hb2;
+ struct futex_q *this, *next;
+ int ret, op_ret;
+ DEFINE_WAKE_Q(wake_q);
+
+retry:
+ ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ);
+ if (unlikely(ret != 0))
+ return ret;
+ ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE);
+ if (unlikely(ret != 0))
+ return ret;
+
+ hb1 = futex_hash(&key1);
+ hb2 = futex_hash(&key2);
+
+retry_private:
+ double_lock_hb(hb1, hb2);
+ op_ret = futex_atomic_op_inuser(op, uaddr2);
+ if (unlikely(op_ret < 0)) {
+ double_unlock_hb(hb1, hb2);
+
+ if (!IS_ENABLED(CONFIG_MMU) ||
+ unlikely(op_ret != -EFAULT && op_ret != -EAGAIN)) {
+ /*
+ * we don't get EFAULT from MMU faults if we don't have
+ * an MMU, but we might get them from range checking
+ */
+ ret = op_ret;
+ return ret;
+ }
+
+ if (op_ret == -EFAULT) {
+ ret = fault_in_user_writeable(uaddr2);
+ if (ret)
+ return ret;
+ }
+
+ cond_resched();
+ if (!(flags & FLAGS_SHARED))
+ goto retry_private;
+ goto retry;
+ }
+
+ plist_for_each_entry_safe(this, next, &hb1->chain, list) {
+ if (futex_match (&this->key, &key1)) {
+ if (this->pi_state || this->rt_waiter) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+ futex_wake_mark(&wake_q, this);
+ if (++ret >= nr_wake)
+ break;
+ }
+ }
+
+ if (op_ret > 0) {
+ op_ret = 0;
+ plist_for_each_entry_safe(this, next, &hb2->chain, list) {
+ if (futex_match (&this->key, &key2)) {
+ if (this->pi_state || this->rt_waiter) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+ futex_wake_mark(&wake_q, this);
+ if (++op_ret >= nr_wake2)
+ break;
+ }
+ }
+ ret += op_ret;
+ }
+
+out_unlock:
+ double_unlock_hb(hb1, hb2);
+ wake_up_q(&wake_q);
+ return ret;
+}
+
+static long futex_wait_restart(struct restart_block *restart);
+
+/**
+ * futex_wait_queue() - futex_queue() and wait for wakeup, timeout, or signal
+ * @hb: the futex hash bucket, must be locked by the caller
+ * @q: the futex_q to queue up on
+ * @timeout: the prepared hrtimer_sleeper, or null for no timeout
+ */
+void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q,
+ struct hrtimer_sleeper *timeout)
+{
+ /*
+ * The task state is guaranteed to be set before another task can
+ * wake it. set_current_state() is implemented using smp_store_mb() and
+ * futex_queue() calls spin_unlock() upon completion, both serializing
+ * access to the hash list and forcing another memory barrier.
+ */
+ set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
+ futex_queue(q, hb);
+
+ /* Arm the timer */
+ if (timeout)
+ hrtimer_sleeper_start_expires(timeout, HRTIMER_MODE_ABS);
+
+ /*
+ * If we have been removed from the hash list, then another task
+ * has tried to wake us, and we can skip the call to schedule().
+ */
+ if (likely(!plist_node_empty(&q->list))) {
+ /*
+ * If the timer has already expired, current will already be
+ * flagged for rescheduling. Only call schedule if there
+ * is no timeout, or if it has yet to expire.
+ */
+ if (!timeout || timeout->task)
+ schedule();
+ }
+ __set_current_state(TASK_RUNNING);
+}
+
+/**
+ * unqueue_multiple - Remove various futexes from their hash bucket
+ * @v: The list of futexes to unqueue
+ * @count: Number of futexes in the list
+ *
+ * Helper to unqueue a list of futexes. This can't fail.
+ *
+ * Return:
+ * - >=0 - Index of the last futex that was awoken;
+ * - -1 - No futex was awoken
+ */
+static int unqueue_multiple(struct futex_vector *v, int count)
+{
+ int ret = -1, i;
+
+ for (i = 0; i < count; i++) {
+ if (!futex_unqueue(&v[i].q))
+ ret = i;
+ }
+
+ return ret;
+}
+
+/**
+ * futex_wait_multiple_setup - Prepare to wait and enqueue multiple futexes
+ * @vs: The futex list to wait on
+ * @count: The size of the list
+ * @woken: Index of the last woken futex, if any. Used to notify the
+ * caller that it can return this index to userspace (return parameter)
+ *
+ * Prepare multiple futexes in a single step and enqueue them. This may fail if
+ * the futex list is invalid or if any futex was already awoken. On success the
+ * task is ready to interruptible sleep.
+ *
+ * Return:
+ * - 1 - One of the futexes was woken by another thread
+ * - 0 - Success
+ * - <0 - -EFAULT, -EWOULDBLOCK or -EINVAL
+ */
+static int futex_wait_multiple_setup(struct futex_vector *vs, int count, int *woken)
+{
+ struct futex_hash_bucket *hb;
+ bool retry = false;
+ int ret, i;
+ u32 uval;
+
+ /*
+ * Enqueuing multiple futexes is tricky, because we need to enqueue
+ * each futex on the list before dealing with the next one to avoid
+ * deadlocking on the hash bucket. But, before enqueuing, we need to
+ * make sure that current->state is TASK_INTERRUPTIBLE, so we don't
+ * lose any wake events, which cannot be done before the get_futex_key
+ * of the next key, because it calls get_user_pages, which can sleep.
+ * Thus, we fetch the list of futexes keys in two steps, by first
+ * pinning all the memory keys in the futex key, and only then we read
+ * each key and queue the corresponding futex.
+ *
+ * Private futexes doesn't need to recalculate hash in retry, so skip
+ * get_futex_key() when retrying.
+ */
+retry:
+ for (i = 0; i < count; i++) {
+ if ((vs[i].w.flags & FUTEX_PRIVATE_FLAG) && retry)
+ continue;
+
+ ret = get_futex_key(u64_to_user_ptr(vs[i].w.uaddr),
+ !(vs[i].w.flags & FUTEX_PRIVATE_FLAG),
+ &vs[i].q.key, FUTEX_READ);
+
+ if (unlikely(ret))
+ return ret;
+ }
+
+ set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
+
+ for (i = 0; i < count; i++) {
+ u32 __user *uaddr = (u32 __user *)(unsigned long)vs[i].w.uaddr;
+ struct futex_q *q = &vs[i].q;
+ u32 val = (u32)vs[i].w.val;
+
+ hb = futex_q_lock(q);
+ ret = futex_get_value_locked(&uval, uaddr);
+
+ if (!ret && uval == val) {
+ /*
+ * The bucket lock can't be held while dealing with the
+ * next futex. Queue each futex at this moment so hb can
+ * be unlocked.
+ */
+ futex_queue(q, hb);
+ continue;
+ }
+
+ futex_q_unlock(hb);
+ __set_current_state(TASK_RUNNING);
+
+ /*
+ * Even if something went wrong, if we find out that a futex
+ * was woken, we don't return error and return this index to
+ * userspace
+ */
+ *woken = unqueue_multiple(vs, i);
+ if (*woken >= 0)
+ return 1;
+
+ if (ret) {
+ /*
+ * If we need to handle a page fault, we need to do so
+ * without any lock and any enqueued futex (otherwise
+ * we could lose some wakeup). So we do it here, after
+ * undoing all the work done so far. In success, we
+ * retry all the work.
+ */
+ if (get_user(uval, uaddr))
+ return -EFAULT;
+
+ retry = true;
+ goto retry;
+ }
+
+ if (uval != val)
+ return -EWOULDBLOCK;
+ }
+
+ return 0;
+}
+
+/**
+ * futex_sleep_multiple - Check sleeping conditions and sleep
+ * @vs: List of futexes to wait for
+ * @count: Length of vs
+ * @to: Timeout
+ *
+ * Sleep if and only if the timeout hasn't expired and no futex on the list has
+ * been woken up.
+ */
+static void futex_sleep_multiple(struct futex_vector *vs, unsigned int count,
+ struct hrtimer_sleeper *to)
+{
+ if (to && !to->task)
+ return;
+
+ for (; count; count--, vs++) {
+ if (!READ_ONCE(vs->q.lock_ptr))
+ return;
+ }
+
+ schedule();
+}
+
+/**
+ * futex_wait_multiple - Prepare to wait on and enqueue several futexes
+ * @vs: The list of futexes to wait on
+ * @count: The number of objects
+ * @to: Timeout before giving up and returning to userspace
+ *
+ * Entry point for the FUTEX_WAIT_MULTIPLE futex operation, this function
+ * sleeps on a group of futexes and returns on the first futex that is
+ * wake, or after the timeout has elapsed.
+ *
+ * Return:
+ * - >=0 - Hint to the futex that was awoken
+ * - <0 - On error
+ */
+int futex_wait_multiple(struct futex_vector *vs, unsigned int count,
+ struct hrtimer_sleeper *to)
+{
+ int ret, hint = 0;
+
+ if (to)
+ hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS);
+
+ while (1) {
+ ret = futex_wait_multiple_setup(vs, count, &hint);
+ if (ret) {
+ if (ret > 0) {
+ /* A futex was woken during setup */
+ ret = hint;
+ }
+ return ret;
+ }
+
+ futex_sleep_multiple(vs, count, to);
+
+ __set_current_state(TASK_RUNNING);
+
+ ret = unqueue_multiple(vs, count);
+ if (ret >= 0)
+ return ret;
+
+ if (to && !to->task)
+ return -ETIMEDOUT;
+ else if (signal_pending(current))
+ return -ERESTARTSYS;
+ /*
+ * The final case is a spurious wakeup, for
+ * which just retry.
+ */
+ }
+}
+
+/**
+ * futex_wait_setup() - Prepare to wait on a futex
+ * @uaddr: the futex userspace address
+ * @val: the expected value
+ * @flags: futex flags (FLAGS_SHARED, etc.)
+ * @q: the associated futex_q
+ * @hb: storage for hash_bucket pointer to be returned to caller
+ *
+ * Setup the futex_q and locate the hash_bucket. Get the futex value and
+ * compare it with the expected value. Handle atomic faults internally.
+ * Return with the hb lock held on success, and unlocked on failure.
+ *
+ * Return:
+ * - 0 - uaddr contains val and hb has been locked;
+ * - <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked
+ */
+int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
+ struct futex_q *q, struct futex_hash_bucket **hb)
+{
+ u32 uval;
+ int ret;
+
+ /*
+ * Access the page AFTER the hash-bucket is locked.
+ * Order is important:
+ *
+ * Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val);
+ * Userspace waker: if (cond(var)) { var = new; futex_wake(&var); }
+ *
+ * The basic logical guarantee of a futex is that it blocks ONLY
+ * if cond(var) is known to be true at the time of blocking, for
+ * any cond. If we locked the hash-bucket after testing *uaddr, that
+ * would open a race condition where we could block indefinitely with
+ * cond(var) false, which would violate the guarantee.
+ *
+ * On the other hand, we insert q and release the hash-bucket only
+ * after testing *uaddr. This guarantees that futex_wait() will NOT
+ * absorb a wakeup if *uaddr does not match the desired values
+ * while the syscall executes.
+ */
+retry:
+ ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, FUTEX_READ);
+ if (unlikely(ret != 0))
+ return ret;
+
+retry_private:
+ *hb = futex_q_lock(q);
+
+ ret = futex_get_value_locked(&uval, uaddr);
+
+ if (ret) {
+ futex_q_unlock(*hb);
+
+ ret = get_user(uval, uaddr);
+ if (ret)
+ return ret;
+
+ if (!(flags & FLAGS_SHARED))
+ goto retry_private;
+
+ goto retry;
+ }
+
+ if (uval != val) {
+ futex_q_unlock(*hb);
+ ret = -EWOULDBLOCK;
+ }
+
+ return ret;
+}
+
+int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time, u32 bitset)
+{
+ struct hrtimer_sleeper timeout, *to;
+ struct restart_block *restart;
+ struct futex_hash_bucket *hb;
+ struct futex_q q = futex_q_init;
+ int ret;
+
+ if (!bitset)
+ return -EINVAL;
+ q.bitset = bitset;
+
+ to = futex_setup_timer(abs_time, &timeout, flags,
+ current->timer_slack_ns);
+retry:
+ /*
+ * Prepare to wait on uaddr. On success, it holds hb->lock and q
+ * is initialized.
+ */
+ ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
+ if (ret)
+ goto out;
+
+ /* futex_queue and wait for wakeup, timeout, or a signal. */
+ futex_wait_queue(hb, &q, to);
+
+ /* If we were woken (and unqueued), we succeeded, whatever. */
+ ret = 0;
+ if (!futex_unqueue(&q))
+ goto out;
+ ret = -ETIMEDOUT;
+ if (to && !to->task)
+ goto out;
+
+ /*
+ * We expect signal_pending(current), but we might be the
+ * victim of a spurious wakeup as well.
+ */
+ if (!signal_pending(current))
+ goto retry;
+
+ ret = -ERESTARTSYS;
+ if (!abs_time)
+ goto out;
+
+ restart = &current->restart_block;
+ restart->futex.uaddr = uaddr;
+ restart->futex.val = val;
+ restart->futex.time = *abs_time;
+ restart->futex.bitset = bitset;
+ restart->futex.flags = flags | FLAGS_HAS_TIMEOUT;
+
+ ret = set_restart_fn(restart, futex_wait_restart);
+
+out:
+ if (to) {
+ hrtimer_cancel(&to->timer);
+ destroy_hrtimer_on_stack(&to->timer);
+ }
+ return ret;
+}
+
+static long futex_wait_restart(struct restart_block *restart)
+{
+ u32 __user *uaddr = restart->futex.uaddr;
+ ktime_t t, *tp = NULL;
+
+ if (restart->futex.flags & FLAGS_HAS_TIMEOUT) {
+ t = restart->futex.time;
+ tp = &t;
+ }
+ restart->fn = do_no_restart_syscall;
+
+ return (long)futex_wait(uaddr, restart->futex.flags,
+ restart->futex.val, tp, restart->futex.bitset);
+}
+
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
index 053447183ac5..04f4ebdc3cf5 100644
--- a/kernel/gcov/Kconfig
+++ b/kernel/gcov/Kconfig
@@ -4,7 +4,6 @@ menu "GCOV-based kernel profiling"
config GCOV_KERNEL
bool "Enable gcov-based kernel profiling"
depends on DEBUG_FS
- depends on !CC_IS_CLANG || CLANG_VERSION >= 110000
depends on !ARCH_WANTS_NO_INSTR || CC_HAS_NO_PROFILE_FN_ATTR
select CONSTRUCTORS
default n
diff --git a/kernel/gcov/clang.c b/kernel/gcov/clang.c
index cbb0bed958ab..7670a811a565 100644
--- a/kernel/gcov/clang.c
+++ b/kernel/gcov/clang.c
@@ -280,6 +280,8 @@ void gcov_info_add(struct gcov_info *dst, struct gcov_info *src)
for (i = 0; i < sfn_ptr->num_counters; i++)
dfn_ptr->counters[i] += sfn_ptr->counters[i];
+
+ sfn_ptr = list_next_entry(sfn_ptr, head);
}
}
diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c
index 460c12b7dfea..74a4ef1da9ad 100644
--- a/kernel/gcov/gcc_4_7.c
+++ b/kernel/gcov/gcc_4_7.c
@@ -30,6 +30,13 @@
#define GCOV_TAG_FUNCTION_LENGTH 3
+/* Since GCC 12.1 sizes are in BYTES and not in WORDS (4B). */
+#if (__GNUC__ >= 12)
+#define GCOV_UNIT_SIZE 4
+#else
+#define GCOV_UNIT_SIZE 1
+#endif
+
static struct gcov_info *gcov_info_head;
/**
@@ -75,6 +82,7 @@ struct gcov_fn_info {
* @version: gcov version magic indicating the gcc version used for compilation
* @next: list head for a singly-linked list
* @stamp: uniquifying time stamp
+ * @checksum: unique object checksum
* @filename: name of the associated gcov data file
* @merge: merge functions (null for unused counter type)
* @n_functions: number of instrumented functions
@@ -87,6 +95,10 @@ struct gcov_info {
unsigned int version;
struct gcov_info *next;
unsigned int stamp;
+ /* Since GCC 12.1 a checksum field is added. */
+#if (__GNUC__ >= 12)
+ unsigned int checksum;
+#endif
const char *filename;
void (*merge[GCOV_COUNTERS])(gcov_type *, unsigned int);
unsigned int n_functions;
@@ -383,12 +395,18 @@ size_t convert_to_gcda(char *buffer, struct gcov_info *info)
pos += store_gcov_u32(buffer, pos, info->version);
pos += store_gcov_u32(buffer, pos, info->stamp);
+#if (__GNUC__ >= 12)
+ /* Use zero as checksum of the compilation unit. */
+ pos += store_gcov_u32(buffer, pos, 0);
+#endif
+
for (fi_idx = 0; fi_idx < info->n_functions; fi_idx++) {
fi_ptr = info->functions[fi_idx];
/* Function record. */
pos += store_gcov_u32(buffer, pos, GCOV_TAG_FUNCTION);
- pos += store_gcov_u32(buffer, pos, GCOV_TAG_FUNCTION_LENGTH);
+ pos += store_gcov_u32(buffer, pos,
+ GCOV_TAG_FUNCTION_LENGTH * GCOV_UNIT_SIZE);
pos += store_gcov_u32(buffer, pos, fi_ptr->ident);
pos += store_gcov_u32(buffer, pos, fi_ptr->lineno_checksum);
pos += store_gcov_u32(buffer, pos, fi_ptr->cfg_checksum);
@@ -402,7 +420,8 @@ size_t convert_to_gcda(char *buffer, struct gcov_info *info)
/* Counter record. */
pos += store_gcov_u32(buffer, pos,
GCOV_TAG_FOR_COUNTER(ct_idx));
- pos += store_gcov_u32(buffer, pos, ci_ptr->num * 2);
+ pos += store_gcov_u32(buffer, pos,
+ ci_ptr->num * 2 * GCOV_UNIT_SIZE);
for (cv_idx = 0; cv_idx < ci_ptr->num; cv_idx++) {
pos += store_gcov_u64(buffer, pos,
diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh
index 1966a749e0d9..1ef9a87511f5 100755
--- a/kernel/gen_kheaders.sh
+++ b/kernel/gen_kheaders.sh
@@ -7,13 +7,15 @@ set -e
sfile="$(readlink -f "$0")"
outdir="$(pwd)"
tarfile=$1
-cpio_dir=$outdir/$tarfile.tmp
+cpio_dir=$outdir/${tarfile%/*}/.tmp_cpio_dir
dir_list="
include/
arch/$SRCARCH/include/
"
+type cpio > /dev/null
+
# Support incremental builds by skipping archive generation
# if timestamps of files being archived are not changed.
@@ -31,8 +33,8 @@ if [ "$building_out_of_srctree" ]; then
fi
all_dirs="$all_dirs $dir_list"
-# include/generated/compile.h is ignored because it is touched even when none
-# of the source files changed.
+# include/generated/utsversion.h is ignored because it is generated after this
+# script is executed. (utsversion.h is unneeded for kheaders)
#
# When Kconfig regenerates include/generated/autoconf.h, its timestamp is
# updated, but the contents might be still the same. When any CONFIG option is
@@ -42,7 +44,7 @@ all_dirs="$all_dirs $dir_list"
#
# Ignore them for md5 calculation to avoid pointless regeneration.
headers_md5="$(find $all_dirs -name "*.h" |
- grep -v "include/generated/compile.h" |
+ grep -v "include/generated/utsversion.h" |
grep -v "include/generated/autoconf.h" |
xargs ls -l | md5sum | cut -d ' ' -f1)"
@@ -74,7 +76,7 @@ fi
# of tree builds having stale headers in srctree. Just silence CPIO for now.
for f in $dir_list;
do find "$f" -name "*.h";
-done | cpio --quiet -pd $cpio_dir >/dev/null 2>&1
+done | cpio --quiet -pdu $cpio_dir >/dev/null 2>&1
# Remove comments except SDPX lines
find $cpio_dir -type f -print0 |
diff --git a/kernel/groups.c b/kernel/groups.c
index 787b381c7c00..9aaed2a31073 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -134,13 +134,26 @@ EXPORT_SYMBOL(set_groups);
int set_current_groups(struct group_info *group_info)
{
struct cred *new;
+ const struct cred *old;
+ int retval;
new = prepare_creds();
if (!new)
return -ENOMEM;
+ old = current_cred();
+
set_groups(new, group_info);
+
+ retval = security_task_fix_setgroups(new, old);
+ if (retval < 0)
+ goto error;
+
return commit_creds(new);
+
+error:
+ abort_creds(new);
+ return retval;
}
EXPORT_SYMBOL(set_current_groups);
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 9888e2bc8c76..9a24574988d2 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -28,7 +28,7 @@
/*
* The number of tasks checked:
*/
-int __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT;
+static int __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT;
/*
* Limit number of tasks checked in a batch.
@@ -47,9 +47,9 @@ unsigned long __read_mostly sysctl_hung_task_timeout_secs = CONFIG_DEFAULT_HUNG_
/*
* Zero (default value) means use sysctl_hung_task_timeout_secs:
*/
-unsigned long __read_mostly sysctl_hung_task_check_interval_secs;
+static unsigned long __read_mostly sysctl_hung_task_check_interval_secs;
-int __read_mostly sysctl_hung_task_warnings = 10;
+static int __read_mostly sysctl_hung_task_warnings = 10;
static int __read_mostly did_panic;
static bool hung_task_show_lock;
@@ -63,15 +63,17 @@ static struct task_struct *watchdog_task;
* Should we dump all CPUs backtraces in a hung task event?
* Defaults to 0, can be changed via sysctl.
*/
-unsigned int __read_mostly sysctl_hung_task_all_cpu_backtrace;
+static unsigned int __read_mostly sysctl_hung_task_all_cpu_backtrace;
+#else
+#define sysctl_hung_task_all_cpu_backtrace 0
#endif /* CONFIG_SMP */
/*
* Should we panic (and reboot, if panic_timeout= is set) when a
* hung task is detected:
*/
-unsigned int __read_mostly sysctl_hung_task_panic =
- CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE;
+static unsigned int __read_mostly sysctl_hung_task_panic =
+ IS_ENABLED(CONFIG_BOOTPARAM_HUNG_TASK_PANIC);
static int
hung_task_panic(struct notifier_block *this, unsigned long event, void *ptr)
@@ -93,8 +95,8 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
* Ensure the task is not frozen.
* Also, skip vfork and any other user process that freezer should skip.
*/
- if (unlikely(t->flags & (PF_FROZEN | PF_FREEZER_SKIP)))
- return;
+ if (unlikely(READ_ONCE(t->__state) & TASK_FROZEN))
+ return;
/*
* When a freshly created task is scheduled once, changes its state to
@@ -140,6 +142,8 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
if (sysctl_hung_task_all_cpu_backtrace)
hung_task_show_all_bt = true;
+ if (!sysctl_hung_task_warnings)
+ pr_info("Future hung task reports are suppressed, see sysctl kernel.hung_task_warnings\n");
}
touch_nmi_watchdog();
@@ -189,6 +193,8 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
hung_task_show_lock = false;
rcu_read_lock();
for_each_process_thread(g, t) {
+ unsigned int state;
+
if (!max_count--)
goto unlock;
if (time_after(jiffies, last_break + HUNG_TASK_LOCK_BREAK)) {
@@ -196,8 +202,14 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
goto unlock;
last_break = jiffies;
}
- /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
- if (READ_ONCE(t->__state) == TASK_UNINTERRUPTIBLE)
+ /*
+ * skip the TASK_KILLABLE tasks -- these can be killed
+ * skip the TASK_IDLE tasks -- those are genuinely idle
+ */
+ state = READ_ONCE(t->__state);
+ if ((state & TASK_UNINTERRUPTIBLE) &&
+ !(state & TASK_WAKEKILL) &&
+ !(state & TASK_NOLOAD))
check_hung_task(t, timeout);
}
unlock:
@@ -222,11 +234,13 @@ static long hung_timeout_jiffies(unsigned long last_checked,
MAX_SCHEDULE_TIMEOUT;
}
+#ifdef CONFIG_SYSCTL
/*
* Process updating of timeout sysctl
*/
-int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
- void *buffer, size_t *lenp, loff_t *ppos)
+static int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
+ void *buffer,
+ size_t *lenp, loff_t *ppos)
{
int ret;
@@ -241,6 +255,76 @@ int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
return ret;
}
+/*
+ * This is needed for proc_doulongvec_minmax of sysctl_hung_task_timeout_secs
+ * and hung_task_check_interval_secs
+ */
+static const unsigned long hung_task_timeout_max = (LONG_MAX / HZ);
+static struct ctl_table hung_task_sysctls[] = {
+#ifdef CONFIG_SMP
+ {
+ .procname = "hung_task_all_cpu_backtrace",
+ .data = &sysctl_hung_task_all_cpu_backtrace,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+#endif /* CONFIG_SMP */
+ {
+ .procname = "hung_task_panic",
+ .data = &sysctl_hung_task_panic,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ {
+ .procname = "hung_task_check_count",
+ .data = &sysctl_hung_task_check_count,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ },
+ {
+ .procname = "hung_task_timeout_secs",
+ .data = &sysctl_hung_task_timeout_secs,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = proc_dohung_task_timeout_secs,
+ .extra2 = (void *)&hung_task_timeout_max,
+ },
+ {
+ .procname = "hung_task_check_interval_secs",
+ .data = &sysctl_hung_task_check_interval_secs,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = proc_dohung_task_timeout_secs,
+ .extra2 = (void *)&hung_task_timeout_max,
+ },
+ {
+ .procname = "hung_task_warnings",
+ .data = &sysctl_hung_task_warnings,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_NEG_ONE,
+ },
+ {}
+};
+
+static void __init hung_task_sysctl_init(void)
+{
+ register_sysctl_init("kernel", hung_task_sysctls);
+}
+#else
+#define hung_task_sysctl_init() do { } while (0)
+#endif /* CONFIG_SYSCTL */
+
+
static atomic_t reset_hung_task = ATOMIC_INIT(0);
void reset_hung_task_detector(void)
@@ -310,6 +394,7 @@ static int __init hung_task_init(void)
pm_notifier(hungtask_pm_notify, 0);
watchdog_task = kthread_run(watchdog, NULL, "khungtaskd");
+ hung_task_sysctl_init();
return 0;
}
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index fbc54c2a7f23..2531f3496ab6 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -24,6 +24,7 @@ config GENERIC_IRQ_SHOW_LEVEL
# Supports effective affinity mask
config GENERIC_IRQ_EFFECTIVE_AFF_MASK
+ depends on SMP
bool
# Support for delayed migration from interrupt context
@@ -82,24 +83,22 @@ config IRQ_FASTEOI_HIERARCHY_HANDLERS
# Generic IRQ IPI support
config GENERIC_IRQ_IPI
bool
+ depends on SMP
select IRQ_DOMAIN_HIERARCHY
-# Generic MSI interrupt support
-config GENERIC_MSI_IRQ
+# Generic IRQ IPI Mux support
+config GENERIC_IRQ_IPI_MUX
bool
+ depends on SMP
# Generic MSI hierarchical interrupt domain support
-config GENERIC_MSI_IRQ_DOMAIN
+config GENERIC_MSI_IRQ
bool
select IRQ_DOMAIN_HIERARCHY
- select GENERIC_MSI_IRQ
config IRQ_MSI_IOMMU
bool
-config HANDLE_DOMAIN_IRQ
- bool
-
config IRQ_TIMINGS
bool
@@ -144,3 +143,10 @@ config GENERIC_IRQ_MULTI_HANDLER
bool
help
Allow to specify the low level IRQ handler at run time.
+
+# Cavium Octeon is the last system to use this deprecated option
+# Do not even think of enabling this on any new platform
+config DEPRECATED_IRQ_CPU_ONOFFLINE
+ bool
+ depends on CAVIUM_OCTEON_SOC
+ default CAVIUM_OCTEON_SOC
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index b4f53717d143..f19d3080bf11 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -15,6 +15,7 @@ obj-$(CONFIG_GENERIC_IRQ_MIGRATION) += cpuhotplug.o
obj-$(CONFIG_PM_SLEEP) += pm.o
obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o
obj-$(CONFIG_GENERIC_IRQ_IPI) += ipi.o
+obj-$(CONFIG_GENERIC_IRQ_IPI_MUX) += ipi-mux.o
obj-$(CONFIG_SMP) += affinity.o
obj-$(CONFIG_GENERIC_IRQ_DEBUGFS) += debugfs.o
obj-$(CONFIG_GENERIC_IRQ_MATRIX_ALLOCATOR) += matrix.o
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index 4d89ad4fae3b..44a4eba80315 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -7,397 +7,7 @@
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/cpu.h>
-#include <linux/sort.h>
-
-static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk,
- unsigned int cpus_per_vec)
-{
- const struct cpumask *siblmsk;
- int cpu, sibl;
-
- for ( ; cpus_per_vec > 0; ) {
- cpu = cpumask_first(nmsk);
-
- /* Should not happen, but I'm too lazy to think about it */
- if (cpu >= nr_cpu_ids)
- return;
-
- cpumask_clear_cpu(cpu, nmsk);
- cpumask_set_cpu(cpu, irqmsk);
- cpus_per_vec--;
-
- /* If the cpu has siblings, use them first */
- siblmsk = topology_sibling_cpumask(cpu);
- for (sibl = -1; cpus_per_vec > 0; ) {
- sibl = cpumask_next(sibl, siblmsk);
- if (sibl >= nr_cpu_ids)
- break;
- if (!cpumask_test_and_clear_cpu(sibl, nmsk))
- continue;
- cpumask_set_cpu(sibl, irqmsk);
- cpus_per_vec--;
- }
- }
-}
-
-static cpumask_var_t *alloc_node_to_cpumask(void)
-{
- cpumask_var_t *masks;
- int node;
-
- masks = kcalloc(nr_node_ids, sizeof(cpumask_var_t), GFP_KERNEL);
- if (!masks)
- return NULL;
-
- for (node = 0; node < nr_node_ids; node++) {
- if (!zalloc_cpumask_var(&masks[node], GFP_KERNEL))
- goto out_unwind;
- }
-
- return masks;
-
-out_unwind:
- while (--node >= 0)
- free_cpumask_var(masks[node]);
- kfree(masks);
- return NULL;
-}
-
-static void free_node_to_cpumask(cpumask_var_t *masks)
-{
- int node;
-
- for (node = 0; node < nr_node_ids; node++)
- free_cpumask_var(masks[node]);
- kfree(masks);
-}
-
-static void build_node_to_cpumask(cpumask_var_t *masks)
-{
- int cpu;
-
- for_each_possible_cpu(cpu)
- cpumask_set_cpu(cpu, masks[cpu_to_node(cpu)]);
-}
-
-static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask,
- const struct cpumask *mask, nodemask_t *nodemsk)
-{
- int n, nodes = 0;
-
- /* Calculate the number of nodes in the supplied affinity mask */
- for_each_node(n) {
- if (cpumask_intersects(mask, node_to_cpumask[n])) {
- node_set(n, *nodemsk);
- nodes++;
- }
- }
- return nodes;
-}
-
-struct node_vectors {
- unsigned id;
-
- union {
- unsigned nvectors;
- unsigned ncpus;
- };
-};
-
-static int ncpus_cmp_func(const void *l, const void *r)
-{
- const struct node_vectors *ln = l;
- const struct node_vectors *rn = r;
-
- return ln->ncpus - rn->ncpus;
-}
-
-/*
- * Allocate vector number for each node, so that for each node:
- *
- * 1) the allocated number is >= 1
- *
- * 2) the allocated numbver is <= active CPU number of this node
- *
- * The actual allocated total vectors may be less than @numvecs when
- * active total CPU number is less than @numvecs.
- *
- * Active CPUs means the CPUs in '@cpu_mask AND @node_to_cpumask[]'
- * for each node.
- */
-static void alloc_nodes_vectors(unsigned int numvecs,
- cpumask_var_t *node_to_cpumask,
- const struct cpumask *cpu_mask,
- const nodemask_t nodemsk,
- struct cpumask *nmsk,
- struct node_vectors *node_vectors)
-{
- unsigned n, remaining_ncpus = 0;
-
- for (n = 0; n < nr_node_ids; n++) {
- node_vectors[n].id = n;
- node_vectors[n].ncpus = UINT_MAX;
- }
-
- for_each_node_mask(n, nodemsk) {
- unsigned ncpus;
-
- cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
- ncpus = cpumask_weight(nmsk);
-
- if (!ncpus)
- continue;
- remaining_ncpus += ncpus;
- node_vectors[n].ncpus = ncpus;
- }
-
- numvecs = min_t(unsigned, remaining_ncpus, numvecs);
-
- sort(node_vectors, nr_node_ids, sizeof(node_vectors[0]),
- ncpus_cmp_func, NULL);
-
- /*
- * Allocate vectors for each node according to the ratio of this
- * node's nr_cpus to remaining un-assigned ncpus. 'numvecs' is
- * bigger than number of active numa nodes. Always start the
- * allocation from the node with minimized nr_cpus.
- *
- * This way guarantees that each active node gets allocated at
- * least one vector, and the theory is simple: over-allocation
- * is only done when this node is assigned by one vector, so
- * other nodes will be allocated >= 1 vector, since 'numvecs' is
- * bigger than number of numa nodes.
- *
- * One perfect invariant is that number of allocated vectors for
- * each node is <= CPU count of this node:
- *
- * 1) suppose there are two nodes: A and B
- * ncpu(X) is CPU count of node X
- * vecs(X) is the vector count allocated to node X via this
- * algorithm
- *
- * ncpu(A) <= ncpu(B)
- * ncpu(A) + ncpu(B) = N
- * vecs(A) + vecs(B) = V
- *
- * vecs(A) = max(1, round_down(V * ncpu(A) / N))
- * vecs(B) = V - vecs(A)
- *
- * both N and V are integer, and 2 <= V <= N, suppose
- * V = N - delta, and 0 <= delta <= N - 2
- *
- * 2) obviously vecs(A) <= ncpu(A) because:
- *
- * if vecs(A) is 1, then vecs(A) <= ncpu(A) given
- * ncpu(A) >= 1
- *
- * otherwise,
- * vecs(A) <= V * ncpu(A) / N <= ncpu(A), given V <= N
- *
- * 3) prove how vecs(B) <= ncpu(B):
- *
- * if round_down(V * ncpu(A) / N) == 0, vecs(B) won't be
- * over-allocated, so vecs(B) <= ncpu(B),
- *
- * otherwise:
- *
- * vecs(A) =
- * round_down(V * ncpu(A) / N) =
- * round_down((N - delta) * ncpu(A) / N) =
- * round_down((N * ncpu(A) - delta * ncpu(A)) / N) >=
- * round_down((N * ncpu(A) - delta * N) / N) =
- * cpu(A) - delta
- *
- * then:
- *
- * vecs(A) - V >= ncpu(A) - delta - V
- * =>
- * V - vecs(A) <= V + delta - ncpu(A)
- * =>
- * vecs(B) <= N - ncpu(A)
- * =>
- * vecs(B) <= cpu(B)
- *
- * For nodes >= 3, it can be thought as one node and another big
- * node given that is exactly what this algorithm is implemented,
- * and we always re-calculate 'remaining_ncpus' & 'numvecs', and
- * finally for each node X: vecs(X) <= ncpu(X).
- *
- */
- for (n = 0; n < nr_node_ids; n++) {
- unsigned nvectors, ncpus;
-
- if (node_vectors[n].ncpus == UINT_MAX)
- continue;
-
- WARN_ON_ONCE(numvecs == 0);
-
- ncpus = node_vectors[n].ncpus;
- nvectors = max_t(unsigned, 1,
- numvecs * ncpus / remaining_ncpus);
- WARN_ON_ONCE(nvectors > ncpus);
-
- node_vectors[n].nvectors = nvectors;
-
- remaining_ncpus -= ncpus;
- numvecs -= nvectors;
- }
-}
-
-static int __irq_build_affinity_masks(unsigned int startvec,
- unsigned int numvecs,
- unsigned int firstvec,
- cpumask_var_t *node_to_cpumask,
- const struct cpumask *cpu_mask,
- struct cpumask *nmsk,
- struct irq_affinity_desc *masks)
-{
- unsigned int i, n, nodes, cpus_per_vec, extra_vecs, done = 0;
- unsigned int last_affv = firstvec + numvecs;
- unsigned int curvec = startvec;
- nodemask_t nodemsk = NODE_MASK_NONE;
- struct node_vectors *node_vectors;
-
- if (!cpumask_weight(cpu_mask))
- return 0;
-
- nodes = get_nodes_in_cpumask(node_to_cpumask, cpu_mask, &nodemsk);
-
- /*
- * If the number of nodes in the mask is greater than or equal the
- * number of vectors we just spread the vectors across the nodes.
- */
- if (numvecs <= nodes) {
- for_each_node_mask(n, nodemsk) {
- cpumask_or(&masks[curvec].mask, &masks[curvec].mask,
- node_to_cpumask[n]);
- if (++curvec == last_affv)
- curvec = firstvec;
- }
- return numvecs;
- }
-
- node_vectors = kcalloc(nr_node_ids,
- sizeof(struct node_vectors),
- GFP_KERNEL);
- if (!node_vectors)
- return -ENOMEM;
-
- /* allocate vector number for each node */
- alloc_nodes_vectors(numvecs, node_to_cpumask, cpu_mask,
- nodemsk, nmsk, node_vectors);
-
- for (i = 0; i < nr_node_ids; i++) {
- unsigned int ncpus, v;
- struct node_vectors *nv = &node_vectors[i];
-
- if (nv->nvectors == UINT_MAX)
- continue;
-
- /* Get the cpus on this node which are in the mask */
- cpumask_and(nmsk, cpu_mask, node_to_cpumask[nv->id]);
- ncpus = cpumask_weight(nmsk);
- if (!ncpus)
- continue;
-
- WARN_ON_ONCE(nv->nvectors > ncpus);
-
- /* Account for rounding errors */
- extra_vecs = ncpus - nv->nvectors * (ncpus / nv->nvectors);
-
- /* Spread allocated vectors on CPUs of the current node */
- for (v = 0; v < nv->nvectors; v++, curvec++) {
- cpus_per_vec = ncpus / nv->nvectors;
-
- /* Account for extra vectors to compensate rounding errors */
- if (extra_vecs) {
- cpus_per_vec++;
- --extra_vecs;
- }
-
- /*
- * wrapping has to be considered given 'startvec'
- * may start anywhere
- */
- if (curvec >= last_affv)
- curvec = firstvec;
- irq_spread_init_one(&masks[curvec].mask, nmsk,
- cpus_per_vec);
- }
- done += nv->nvectors;
- }
- kfree(node_vectors);
- return done;
-}
-
-/*
- * build affinity in two stages:
- * 1) spread present CPU on these vectors
- * 2) spread other possible CPUs on these vectors
- */
-static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs,
- unsigned int firstvec,
- struct irq_affinity_desc *masks)
-{
- unsigned int curvec = startvec, nr_present = 0, nr_others = 0;
- cpumask_var_t *node_to_cpumask;
- cpumask_var_t nmsk, npresmsk;
- int ret = -ENOMEM;
-
- if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL))
- return ret;
-
- if (!zalloc_cpumask_var(&npresmsk, GFP_KERNEL))
- goto fail_nmsk;
-
- node_to_cpumask = alloc_node_to_cpumask();
- if (!node_to_cpumask)
- goto fail_npresmsk;
-
- /* Stabilize the cpumasks */
- get_online_cpus();
- build_node_to_cpumask(node_to_cpumask);
-
- /* Spread on present CPUs starting from affd->pre_vectors */
- ret = __irq_build_affinity_masks(curvec, numvecs, firstvec,
- node_to_cpumask, cpu_present_mask,
- nmsk, masks);
- if (ret < 0)
- goto fail_build_affinity;
- nr_present = ret;
-
- /*
- * Spread on non present CPUs starting from the next vector to be
- * handled. If the spreading of present CPUs already exhausted the
- * vector space, assign the non present CPUs to the already spread
- * out vectors.
- */
- if (nr_present >= numvecs)
- curvec = firstvec;
- else
- curvec = firstvec + nr_present;
- cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask);
- ret = __irq_build_affinity_masks(curvec, numvecs, firstvec,
- node_to_cpumask, npresmsk, nmsk,
- masks);
- if (ret >= 0)
- nr_others = ret;
-
- fail_build_affinity:
- put_online_cpus();
-
- if (ret >= 0)
- WARN_ON(nr_present + nr_others < numvecs);
-
- free_node_to_cpumask(node_to_cpumask);
-
- fail_npresmsk:
- free_cpumask_var(npresmsk);
-
- fail_nmsk:
- free_cpumask_var(nmsk);
- return ret < 0 ? ret : 0;
-}
+#include <linux/group_cpus.h>
static void default_calc_sets(struct irq_affinity *affd, unsigned int affvecs)
{
@@ -460,14 +70,18 @@ irq_create_affinity_masks(unsigned int nvecs, struct irq_affinity *affd)
*/
for (i = 0, usedvecs = 0; i < affd->nr_sets; i++) {
unsigned int this_vecs = affd->set_size[i];
- int ret;
+ int j;
+ struct cpumask *result = group_cpus_evenly(this_vecs);
- ret = irq_build_affinity_masks(curvec, this_vecs,
- curvec, masks);
- if (ret) {
+ if (!result) {
kfree(masks);
return NULL;
}
+
+ for (j = 0; j < this_vecs; j++)
+ cpumask_copy(&masks[curvec + j].mask, &result[j]);
+ kfree(result);
+
curvec += this_vecs;
usedvecs += this_vecs;
}
@@ -505,9 +119,9 @@ unsigned int irq_calc_affinity_vectors(unsigned int minvec, unsigned int maxvec,
if (affd->calc_sets) {
set_vecs = maxvec - resv;
} else {
- get_online_cpus();
+ cpus_read_lock();
set_vecs = cpumask_weight(cpu_possible_mask);
- put_online_cpus();
+ cpus_read_unlock();
}
return resv + min(set_vecs, maxvec - resv);
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 7f04c7d8296e..49e7bc871fec 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -38,7 +38,7 @@ struct irqaction chained_action = {
* @irq: irq number
* @chip: pointer to irq chip description structure
*/
-int irq_set_chip(unsigned int irq, struct irq_chip *chip)
+int irq_set_chip(unsigned int irq, const struct irq_chip *chip)
{
unsigned long flags;
struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
@@ -46,10 +46,7 @@ int irq_set_chip(unsigned int irq, struct irq_chip *chip)
if (!desc)
return -EINVAL;
- if (!chip)
- chip = &no_irq_chip;
-
- desc->irq_data.chip = chip;
+ desc->irq_data.chip = (struct irq_chip *)(chip ?: &no_irq_chip);
irq_put_desc_unlock(desc, flags);
/*
* For !CONFIG_SPARSE_IRQ make the irq show up in
@@ -191,7 +188,8 @@ enum {
#ifdef CONFIG_SMP
static int
-__irq_startup_managed(struct irq_desc *desc, struct cpumask *aff, bool force)
+__irq_startup_managed(struct irq_desc *desc, const struct cpumask *aff,
+ bool force)
{
struct irq_data *d = irq_desc_get_irq_data(desc);
@@ -227,7 +225,8 @@ __irq_startup_managed(struct irq_desc *desc, struct cpumask *aff, bool force)
}
#else
static __always_inline int
-__irq_startup_managed(struct irq_desc *desc, struct cpumask *aff, bool force)
+__irq_startup_managed(struct irq_desc *desc, const struct cpumask *aff,
+ bool force)
{
return IRQ_STARTUP_NORMAL;
}
@@ -255,7 +254,7 @@ static int __irq_startup(struct irq_desc *desc)
int irq_startup(struct irq_desc *desc, bool resend, bool force)
{
struct irq_data *d = irq_desc_get_irq_data(desc);
- struct cpumask *aff = irq_data_get_affinity_mask(d);
+ const struct cpumask *aff = irq_data_get_affinity_mask(d);
int ret = 0;
desc->depth = 0;
@@ -265,8 +264,11 @@ int irq_startup(struct irq_desc *desc, bool resend, bool force)
} else {
switch (__irq_startup_managed(desc, aff, force)) {
case IRQ_STARTUP_NORMAL:
+ if (d->chip->flags & IRQCHIP_AFFINITY_PRE_STARTUP)
+ irq_setup_affinity(desc);
ret = __irq_startup(desc);
- irq_setup_affinity(desc);
+ if (!(d->chip->flags & IRQCHIP_AFFINITY_PRE_STARTUP))
+ irq_setup_affinity(desc);
break;
case IRQ_STARTUP_MANAGED:
irq_do_set_affinity(d, aff, false);
@@ -572,8 +574,6 @@ EXPORT_SYMBOL_GPL(handle_simple_irq);
*/
void handle_untracked_irq(struct irq_desc *desc)
{
- unsigned int flags = 0;
-
raw_spin_lock(&desc->lock);
if (!irq_may_run(desc))
@@ -590,7 +590,7 @@ void handle_untracked_irq(struct irq_desc *desc)
irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS);
raw_spin_unlock(&desc->lock);
- __handle_irq_event_percpu(desc, &flags);
+ __handle_irq_event_percpu(desc);
raw_spin_lock(&desc->lock);
irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS);
@@ -1008,8 +1008,10 @@ __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle,
if (desc->irq_data.chip != &no_irq_chip)
mask_ack_irq(desc);
irq_state_set_disabled(desc);
- if (is_chained)
+ if (is_chained) {
desc->action = NULL;
+ WARN_ON(irq_chip_pm_put(irq_desc_get_irq_data(desc)));
+ }
desc->depth = 1;
}
desc->handle_irq = handle;
@@ -1035,6 +1037,7 @@ __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle,
irq_settings_set_norequest(desc);
irq_settings_set_nothread(desc);
desc->action = &chained_action;
+ WARN_ON(irq_chip_pm_get(irq_desc_get_irq_data(desc)));
irq_activate_and_startup(desc, IRQ_RESEND);
}
}
@@ -1072,7 +1075,7 @@ irq_set_chained_handler_and_data(unsigned int irq, irq_flow_handler_t handle,
EXPORT_SYMBOL_GPL(irq_set_chained_handler_and_data);
void
-irq_set_chip_and_handler_name(unsigned int irq, struct irq_chip *chip,
+irq_set_chip_and_handler_name(unsigned int irq, const struct irq_chip *chip,
irq_flow_handler_t handle, const char *name)
{
irq_set_chip(irq, chip);
@@ -1119,6 +1122,7 @@ void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set)
}
EXPORT_SYMBOL_GPL(irq_modify_status);
+#ifdef CONFIG_DEPRECATED_IRQ_CPU_ONOFFLINE
/**
* irq_cpu_online - Invoke all irq_cpu_online functions.
*
@@ -1178,6 +1182,7 @@ void irq_cpu_offline(void)
raw_spin_unlock_irqrestore(&desc->lock, flags);
}
}
+#endif
#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
@@ -1513,7 +1518,8 @@ int irq_chip_request_resources_parent(struct irq_data *data)
if (data->chip->irq_request_resources)
return data->chip->irq_request_resources(data);
- return -ENOSYS;
+ /* no error on missing optional irq_chip::irq_request_resources */
+ return 0;
}
EXPORT_SYMBOL_GPL(irq_chip_request_resources_parent);
@@ -1555,6 +1561,14 @@ int irq_chip_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
return 0;
}
+static struct device *irq_get_pm_device(struct irq_data *data)
+{
+ if (data->domain)
+ return data->domain->pm_dev;
+
+ return NULL;
+}
+
/**
* irq_chip_pm_get - Enable power for an IRQ chip
* @data: Pointer to interrupt specific data
@@ -1564,17 +1578,13 @@ int irq_chip_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
*/
int irq_chip_pm_get(struct irq_data *data)
{
- int retval;
+ struct device *dev = irq_get_pm_device(data);
+ int retval = 0;
- if (IS_ENABLED(CONFIG_PM) && data->chip->parent_device) {
- retval = pm_runtime_get_sync(data->chip->parent_device);
- if (retval < 0) {
- pm_runtime_put_noidle(data->chip->parent_device);
- return retval;
- }
- }
+ if (IS_ENABLED(CONFIG_PM) && dev)
+ retval = pm_runtime_resume_and_get(dev);
- return 0;
+ return retval;
}
/**
@@ -1587,10 +1597,11 @@ int irq_chip_pm_get(struct irq_data *data)
*/
int irq_chip_pm_put(struct irq_data *data)
{
+ struct device *dev = irq_get_pm_device(data);
int retval = 0;
- if (IS_ENABLED(CONFIG_PM) && data->chip->parent_device)
- retval = pm_runtime_put(data->chip->parent_device);
+ if (IS_ENABLED(CONFIG_PM) && dev)
+ retval = pm_runtime_put(dev);
return (retval < 0) ? retval : 0;
}
diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c
index 02236b13b359..1ed2b1739363 100644
--- a/kernel/irq/cpuhotplug.c
+++ b/kernel/irq/cpuhotplug.c
@@ -166,7 +166,7 @@ void irq_migrate_all_off_this_cpu(void)
raw_spin_unlock(&desc->lock);
if (affinity_broken) {
- pr_warn_ratelimited("IRQ %u: no longer affine to CPU%u\n",
+ pr_debug_ratelimited("IRQ %u: no longer affine to CPU%u\n",
irq, smp_processor_id());
}
}
@@ -176,10 +176,10 @@ static bool hk_should_isolate(struct irq_data *data, unsigned int cpu)
{
const struct cpumask *hk_mask;
- if (!housekeeping_enabled(HK_FLAG_MANAGED_IRQ))
+ if (!housekeeping_enabled(HK_TYPE_MANAGED_IRQ))
return false;
- hk_mask = housekeeping_cpumask(HK_FLAG_MANAGED_IRQ);
+ hk_mask = housekeeping_cpumask(HK_TYPE_MANAGED_IRQ);
if (cpumask_subset(irq_data_get_effective_affinity_mask(data), hk_mask))
return false;
diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c
index e4cff358b437..bbcaac64038e 100644
--- a/kernel/irq/debugfs.c
+++ b/kernel/irq/debugfs.c
@@ -30,7 +30,7 @@ static void irq_debug_show_bits(struct seq_file *m, int ind, unsigned int state,
static void irq_debug_show_masks(struct seq_file *m, struct irq_desc *desc)
{
struct irq_data *data = irq_desc_get_irq_data(desc);
- struct cpumask *msk;
+ const struct cpumask *msk;
msk = irq_data_get_affinity_mask(data);
seq_printf(m, "affinity: %*pbl\n", cpumask_pr_args(msk));
@@ -58,6 +58,7 @@ static const struct irq_bit_descr irqchip_flags[] = {
BIT_MASK_DESCR(IRQCHIP_SUPPORTS_LEVEL_MSI),
BIT_MASK_DESCR(IRQCHIP_SUPPORTS_NMI),
BIT_MASK_DESCR(IRQCHIP_ENABLE_WAKEUP_ON_SUSPEND),
+ BIT_MASK_DESCR(IRQCHIP_IMMUTABLE),
};
static void
@@ -69,8 +70,12 @@ irq_debug_show_chip(struct seq_file *m, struct irq_data *data, int ind)
seq_printf(m, "chip: None\n");
return;
}
- seq_printf(m, "%*schip: %s\n", ind, "", chip->name);
- seq_printf(m, "%*sflags: 0x%lx\n", ind + 1, "", chip->flags);
+ seq_printf(m, "%*schip: ", ind, "");
+ if (chip->irq_print_chip)
+ chip->irq_print_chip(data, m);
+ else
+ seq_printf(m, "%s", chip->name);
+ seq_printf(m, "\n%*sflags: 0x%lx\n", ind + 1, "", chip->flags);
irq_debug_show_bits(m, ind, chip->flags, irqchip_flags,
ARRAY_SIZE(irqchip_flags));
}
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index f8f23af6ab0d..c653cd31548d 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -25,6 +25,7 @@ static DEFINE_RAW_SPINLOCK(gc_lock);
void irq_gc_noop(struct irq_data *d)
{
}
+EXPORT_SYMBOL_GPL(irq_gc_noop);
/**
* irq_gc_mask_disable_reg - Mask chip via disable register
@@ -44,6 +45,7 @@ void irq_gc_mask_disable_reg(struct irq_data *d)
*ct->mask_cache &= ~mask;
irq_gc_unlock(gc);
}
+EXPORT_SYMBOL_GPL(irq_gc_mask_disable_reg);
/**
* irq_gc_mask_set_bit - Mask chip via setting bit in mask register
@@ -103,6 +105,7 @@ void irq_gc_unmask_enable_reg(struct irq_data *d)
*ct->mask_cache |= mask;
irq_gc_unlock(gc);
}
+EXPORT_SYMBOL_GPL(irq_gc_unmask_enable_reg);
/**
* irq_gc_ack_set_bit - Ack pending interrupt via setting bit
@@ -240,9 +243,8 @@ irq_alloc_generic_chip(const char *name, int num_ct, unsigned int irq_base,
void __iomem *reg_base, irq_flow_handler_t handler)
{
struct irq_chip_generic *gc;
- unsigned long sz = sizeof(*gc) + num_ct * sizeof(struct irq_chip_type);
- gc = kzalloc(sz, GFP_KERNEL);
+ gc = kzalloc(struct_size(gc, chip_types, num_ct), GFP_KERNEL);
if (gc) {
irq_init_generic_chip(gc, name, num_ct, irq_base, reg_base,
handler);
@@ -288,8 +290,11 @@ int __irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip,
{
struct irq_domain_chip_generic *dgc;
struct irq_chip_generic *gc;
- int numchips, sz, i;
unsigned long flags;
+ int numchips, i;
+ size_t dgc_sz;
+ size_t gc_sz;
+ size_t sz;
void *tmp;
if (d->gc)
@@ -300,8 +305,9 @@ int __irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip,
return -EINVAL;
/* Allocate a pointer, generic chip and chiptypes for each chip */
- sz = sizeof(*dgc) + numchips * sizeof(gc);
- sz += numchips * (sizeof(*gc) + num_ct * sizeof(struct irq_chip_type));
+ gc_sz = struct_size(gc, chip_types, num_ct);
+ dgc_sz = struct_size(dgc, gc, numchips);
+ sz = dgc_sz + numchips * gc_sz;
tmp = dgc = kzalloc(sz, GFP_KERNEL);
if (!dgc)
@@ -314,7 +320,7 @@ int __irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip,
d->gc = dgc;
/* Calc pointer to the first generic chip */
- tmp += sizeof(*dgc) + numchips * sizeof(gc);
+ tmp += dgc_sz;
for (i = 0; i < numchips; i++) {
/* Store the pointer to the generic chip */
dgc->gc[i] = gc = tmp;
@@ -331,7 +337,7 @@ int __irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip,
list_add_tail(&gc->list, &gc_list);
raw_spin_unlock_irqrestore(&gc_lock, flags);
/* Calc pointer to the next generic chip */
- tmp += sizeof(*gc) + num_ct * sizeof(struct irq_chip_type);
+ tmp += gc_sz;
}
return 0;
}
@@ -425,7 +431,7 @@ int irq_map_generic_chip(struct irq_domain *d, unsigned int virq,
return 0;
}
-static void irq_unmap_generic_chip(struct irq_domain *d, unsigned int virq)
+void irq_unmap_generic_chip(struct irq_domain *d, unsigned int virq)
{
struct irq_data *data = irq_domain_get_irq_data(d, virq);
struct irq_domain_chip_generic *dgc = d->gc;
@@ -445,7 +451,7 @@ static void irq_unmap_generic_chip(struct irq_domain *d, unsigned int virq)
}
-struct irq_domain_ops irq_generic_chip_ops = {
+const struct irq_domain_ops irq_generic_chip_ops = {
.map = irq_map_generic_chip,
.unmap = irq_unmap_generic_chip,
.xlate = irq_domain_xlate_onetwocell,
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 221d80c31e94..9489f93b3db3 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -14,6 +14,8 @@
#include <linux/interrupt.h>
#include <linux/kernel_stat.h>
+#include <asm/irq_regs.h>
+
#include <trace/events/irq.h>
#include "internals.h"
@@ -134,7 +136,7 @@ void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action)
wake_up_process(action->thread);
}
-irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags)
+irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc)
{
irqreturn_t retval = IRQ_NONE;
unsigned int irq = desc->irq_data.irq;
@@ -172,10 +174,6 @@ irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags
}
__irq_wake_thread(desc, action);
-
- fallthrough; /* to add to randomness */
- case IRQ_HANDLED:
- *flags |= action->flags;
break;
default:
@@ -191,11 +189,10 @@ irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags
irqreturn_t handle_irq_event_percpu(struct irq_desc *desc)
{
irqreturn_t retval;
- unsigned int flags = 0;
- retval = __handle_irq_event_percpu(desc, &flags);
+ retval = __handle_irq_event_percpu(desc);
- add_interrupt_randomness(desc->irq_data.irq, flags);
+ add_interrupt_randomness(desc->irq_data.irq);
if (!irq_settings_no_debug(desc))
note_interrupt(desc, retval);
@@ -226,4 +223,20 @@ int __init set_handle_irq(void (*handle_irq)(struct pt_regs *))
handle_arch_irq = handle_irq;
return 0;
}
+
+/**
+ * generic_handle_arch_irq - root irq handler for architectures which do no
+ * entry accounting themselves
+ * @regs: Register file coming from the low-level handling code
+ */
+asmlinkage void noinstr generic_handle_arch_irq(struct pt_regs *regs)
+{
+ struct pt_regs *old_regs;
+
+ irq_enter();
+ old_regs = set_irq_regs(regs);
+ handle_arch_irq(regs);
+ set_irq_regs(old_regs);
+ irq_exit();
+}
#endif
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 54363527feea..5fdc0b557579 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -29,12 +29,14 @@ extern struct irqaction chained_action;
* IRQTF_WARNED - warning "IRQ_WAKE_THREAD w/o thread_fn" has been printed
* IRQTF_AFFINITY - irq thread is requested to adjust affinity
* IRQTF_FORCED_THREAD - irq action is force threaded
+ * IRQTF_READY - signals that irq thread is ready
*/
enum {
IRQTF_RUNTHREAD,
IRQTF_WARNED,
IRQTF_AFFINITY,
IRQTF_FORCED_THREAD,
+ IRQTF_READY,
};
/*
@@ -50,6 +52,7 @@ enum {
* IRQS_PENDING - irq is pending and replayed later
* IRQS_SUSPENDED - irq is suspended
* IRQS_NMI - irq line is used to deliver NMIs
+ * IRQS_SYSFS - descriptor has been added to sysfs
*/
enum {
IRQS_AUTODETECT = 0x00000001,
@@ -62,6 +65,7 @@ enum {
IRQS_SUSPENDED = 0x00000800,
IRQS_TIMINGS = 0x00001000,
IRQS_NMI = 0x00002000,
+ IRQS_SYSFS = 0x00004000,
};
#include "debug.h"
@@ -103,7 +107,7 @@ extern int __irq_get_irqchip_state(struct irq_data *data,
extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
-irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags);
+irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc);
irqreturn_t handle_irq_event_percpu(struct irq_desc *desc);
irqreturn_t handle_irq_event(struct irq_desc *desc);
diff --git a/kernel/irq/ipi-mux.c b/kernel/irq/ipi-mux.c
new file mode 100644
index 000000000000..fa4fc18c6131
--- /dev/null
+++ b/kernel/irq/ipi-mux.c
@@ -0,0 +1,206 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Multiplex several virtual IPIs over a single HW IPI.
+ *
+ * Copyright The Asahi Linux Contributors
+ * Copyright (c) 2022 Ventana Micro Systems Inc.
+ */
+
+#define pr_fmt(fmt) "ipi-mux: " fmt
+#include <linux/cpu.h>
+#include <linux/init.h>
+#include <linux/irq.h>
+#include <linux/irqchip.h>
+#include <linux/irqchip/chained_irq.h>
+#include <linux/irqdomain.h>
+#include <linux/jump_label.h>
+#include <linux/percpu.h>
+#include <linux/smp.h>
+
+struct ipi_mux_cpu {
+ atomic_t enable;
+ atomic_t bits;
+};
+
+static struct ipi_mux_cpu __percpu *ipi_mux_pcpu;
+static struct irq_domain *ipi_mux_domain;
+static void (*ipi_mux_send)(unsigned int cpu);
+
+static void ipi_mux_mask(struct irq_data *d)
+{
+ struct ipi_mux_cpu *icpu = this_cpu_ptr(ipi_mux_pcpu);
+
+ atomic_andnot(BIT(irqd_to_hwirq(d)), &icpu->enable);
+}
+
+static void ipi_mux_unmask(struct irq_data *d)
+{
+ struct ipi_mux_cpu *icpu = this_cpu_ptr(ipi_mux_pcpu);
+ u32 ibit = BIT(irqd_to_hwirq(d));
+
+ atomic_or(ibit, &icpu->enable);
+
+ /*
+ * The atomic_or() above must complete before the atomic_read()
+ * below to avoid racing ipi_mux_send_mask().
+ */
+ smp_mb__after_atomic();
+
+ /* If a pending IPI was unmasked, raise a parent IPI immediately. */
+ if (atomic_read(&icpu->bits) & ibit)
+ ipi_mux_send(smp_processor_id());
+}
+
+static void ipi_mux_send_mask(struct irq_data *d, const struct cpumask *mask)
+{
+ struct ipi_mux_cpu *icpu = this_cpu_ptr(ipi_mux_pcpu);
+ u32 ibit = BIT(irqd_to_hwirq(d));
+ unsigned long pending;
+ int cpu;
+
+ for_each_cpu(cpu, mask) {
+ icpu = per_cpu_ptr(ipi_mux_pcpu, cpu);
+
+ /*
+ * This sequence is the mirror of the one in ipi_mux_unmask();
+ * see the comment there. Additionally, release semantics
+ * ensure that the vIPI flag set is ordered after any shared
+ * memory accesses that precede it. This therefore also pairs
+ * with the atomic_fetch_andnot in ipi_mux_process().
+ */
+ pending = atomic_fetch_or_release(ibit, &icpu->bits);
+
+ /*
+ * The atomic_fetch_or_release() above must complete
+ * before the atomic_read() below to avoid racing with
+ * ipi_mux_unmask().
+ */
+ smp_mb__after_atomic();
+
+ /*
+ * The flag writes must complete before the physical IPI is
+ * issued to another CPU. This is implied by the control
+ * dependency on the result of atomic_read() below, which is
+ * itself already ordered after the vIPI flag write.
+ */
+ if (!(pending & ibit) && (atomic_read(&icpu->enable) & ibit))
+ ipi_mux_send(cpu);
+ }
+}
+
+static const struct irq_chip ipi_mux_chip = {
+ .name = "IPI Mux",
+ .irq_mask = ipi_mux_mask,
+ .irq_unmask = ipi_mux_unmask,
+ .ipi_send_mask = ipi_mux_send_mask,
+};
+
+static int ipi_mux_domain_alloc(struct irq_domain *d, unsigned int virq,
+ unsigned int nr_irqs, void *arg)
+{
+ int i;
+
+ for (i = 0; i < nr_irqs; i++) {
+ irq_set_percpu_devid(virq + i);
+ irq_domain_set_info(d, virq + i, i, &ipi_mux_chip, NULL,
+ handle_percpu_devid_irq, NULL, NULL);
+ }
+
+ return 0;
+}
+
+static const struct irq_domain_ops ipi_mux_domain_ops = {
+ .alloc = ipi_mux_domain_alloc,
+ .free = irq_domain_free_irqs_top,
+};
+
+/**
+ * ipi_mux_process - Process multiplexed virtual IPIs
+ */
+void ipi_mux_process(void)
+{
+ struct ipi_mux_cpu *icpu = this_cpu_ptr(ipi_mux_pcpu);
+ irq_hw_number_t hwirq;
+ unsigned long ipis;
+ unsigned int en;
+
+ /*
+ * Reading enable mask does not need to be ordered as long as
+ * this function is called from interrupt handler because only
+ * the CPU itself can change it's own enable mask.
+ */
+ en = atomic_read(&icpu->enable);
+
+ /*
+ * Clear the IPIs we are about to handle. This pairs with the
+ * atomic_fetch_or_release() in ipi_mux_send_mask().
+ */
+ ipis = atomic_fetch_andnot(en, &icpu->bits) & en;
+
+ for_each_set_bit(hwirq, &ipis, BITS_PER_TYPE(int))
+ generic_handle_domain_irq(ipi_mux_domain, hwirq);
+}
+
+/**
+ * ipi_mux_create - Create virtual IPIs multiplexed on top of a single
+ * parent IPI.
+ * @nr_ipi: number of virtual IPIs to create. This should
+ * be <= BITS_PER_TYPE(int)
+ * @mux_send: callback to trigger parent IPI for a particular CPU
+ *
+ * Returns first virq of the newly created virtual IPIs upon success
+ * or <=0 upon failure
+ */
+int ipi_mux_create(unsigned int nr_ipi, void (*mux_send)(unsigned int cpu))
+{
+ struct fwnode_handle *fwnode;
+ struct irq_domain *domain;
+ int rc;
+
+ if (ipi_mux_domain)
+ return -EEXIST;
+
+ if (BITS_PER_TYPE(int) < nr_ipi || !mux_send)
+ return -EINVAL;
+
+ ipi_mux_pcpu = alloc_percpu(typeof(*ipi_mux_pcpu));
+ if (!ipi_mux_pcpu)
+ return -ENOMEM;
+
+ fwnode = irq_domain_alloc_named_fwnode("IPI-Mux");
+ if (!fwnode) {
+ pr_err("unable to create IPI Mux fwnode\n");
+ rc = -ENOMEM;
+ goto fail_free_cpu;
+ }
+
+ domain = irq_domain_create_linear(fwnode, nr_ipi,
+ &ipi_mux_domain_ops, NULL);
+ if (!domain) {
+ pr_err("unable to add IPI Mux domain\n");
+ rc = -ENOMEM;
+ goto fail_free_fwnode;
+ }
+
+ domain->flags |= IRQ_DOMAIN_FLAG_IPI_SINGLE;
+ irq_domain_update_bus_token(domain, DOMAIN_BUS_IPI);
+
+ rc = irq_domain_alloc_irqs(domain, nr_ipi, NUMA_NO_NODE, NULL);
+ if (rc <= 0) {
+ pr_err("unable to alloc IRQs from IPI Mux domain\n");
+ goto fail_free_domain;
+ }
+
+ ipi_mux_domain = domain;
+ ipi_mux_send = mux_send;
+
+ return rc;
+
+fail_free_domain:
+ irq_domain_remove(domain);
+fail_free_fwnode:
+ irq_domain_free_fwnode(fwnode);
+fail_free_cpu:
+ free_percpu(ipi_mux_pcpu);
+ return rc;
+}
diff --git a/kernel/irq/ipi.c b/kernel/irq/ipi.c
index 52f11c791bf8..961d4af76af3 100644
--- a/kernel/irq/ipi.c
+++ b/kernel/irq/ipi.c
@@ -14,11 +14,11 @@
/**
* irq_reserve_ipi() - Setup an IPI to destination cpumask
* @domain: IPI domain
- * @dest: cpumask of cpus which can receive the IPI
+ * @dest: cpumask of CPUs which can receive the IPI
*
* Allocate a virq that can be used to send IPI to any CPU in dest mask.
*
- * On success it'll return linux irq number and error code on failure
+ * Return: Linux IRQ number on success or error code on failure
*/
int irq_reserve_ipi(struct irq_domain *domain,
const struct cpumask *dest)
@@ -104,22 +104,22 @@ free_descs:
/**
* irq_destroy_ipi() - unreserve an IPI that was previously allocated
- * @irq: linux irq number to be destroyed
- * @dest: cpumask of cpus which should have the IPI removed
+ * @irq: Linux IRQ number to be destroyed
+ * @dest: cpumask of CPUs which should have the IPI removed
*
* The IPIs allocated with irq_reserve_ipi() are returned to the system
* destroying all virqs associated with them.
*
- * Return 0 on success or error code on failure.
+ * Return: %0 on success or error code on failure.
*/
int irq_destroy_ipi(unsigned int irq, const struct cpumask *dest)
{
struct irq_data *data = irq_get_irq_data(irq);
- struct cpumask *ipimask = data ? irq_data_get_affinity_mask(data) : NULL;
+ const struct cpumask *ipimask;
struct irq_domain *domain;
unsigned int nr_irqs;
- if (!irq || !data || !ipimask)
+ if (!irq || !data)
return -EINVAL;
domain = data->domain;
@@ -131,7 +131,8 @@ int irq_destroy_ipi(unsigned int irq, const struct cpumask *dest)
return -EINVAL;
}
- if (WARN_ON(!cpumask_subset(dest, ipimask)))
+ ipimask = irq_data_get_affinity_mask(data);
+ if (!ipimask || WARN_ON(!cpumask_subset(dest, ipimask)))
/*
* Must be destroying a subset of CPUs to which this IPI
* was set up to target
@@ -150,24 +151,25 @@ int irq_destroy_ipi(unsigned int irq, const struct cpumask *dest)
}
/**
- * ipi_get_hwirq - Get the hwirq associated with an IPI to a cpu
- * @irq: linux irq number
- * @cpu: the target cpu
+ * ipi_get_hwirq - Get the hwirq associated with an IPI to a CPU
+ * @irq: Linux IRQ number
+ * @cpu: the target CPU
*
* When dealing with coprocessors IPI, we need to inform the coprocessor of
* the hwirq it needs to use to receive and send IPIs.
*
- * Returns hwirq value on success and INVALID_HWIRQ on failure.
+ * Return: hwirq value on success or INVALID_HWIRQ on failure.
*/
irq_hw_number_t ipi_get_hwirq(unsigned int irq, unsigned int cpu)
{
struct irq_data *data = irq_get_irq_data(irq);
- struct cpumask *ipimask = data ? irq_data_get_affinity_mask(data) : NULL;
+ const struct cpumask *ipimask;
- if (!data || !ipimask || cpu >= nr_cpu_ids)
+ if (!data || cpu >= nr_cpu_ids)
return INVALID_HWIRQ;
- if (!cpumask_test_cpu(cpu, ipimask))
+ ipimask = irq_data_get_affinity_mask(data);
+ if (!ipimask || !cpumask_test_cpu(cpu, ipimask))
return INVALID_HWIRQ;
/*
@@ -186,9 +188,9 @@ EXPORT_SYMBOL_GPL(ipi_get_hwirq);
static int ipi_send_verify(struct irq_chip *chip, struct irq_data *data,
const struct cpumask *dest, unsigned int cpu)
{
- struct cpumask *ipimask = irq_data_get_affinity_mask(data);
+ const struct cpumask *ipimask;
- if (!chip || !ipimask)
+ if (!chip || !data)
return -EINVAL;
if (!chip->ipi_send_single && !chip->ipi_send_mask)
@@ -197,6 +199,10 @@ static int ipi_send_verify(struct irq_chip *chip, struct irq_data *data,
if (cpu >= nr_cpu_ids)
return -EINVAL;
+ ipimask = irq_data_get_affinity_mask(data);
+ if (!ipimask)
+ return -EINVAL;
+
if (dest) {
if (!cpumask_subset(dest, ipimask))
return -EINVAL;
@@ -216,7 +222,7 @@ static int ipi_send_verify(struct irq_chip *chip, struct irq_data *data,
* This function is for architecture or core code to speed up IPI sending. Not
* usable from driver code.
*
- * Returns zero on success and negative error number on failure.
+ * Return: %0 on success or negative error number on failure.
*/
int __ipi_send_single(struct irq_desc *desc, unsigned int cpu)
{
@@ -250,7 +256,7 @@ int __ipi_send_single(struct irq_desc *desc, unsigned int cpu)
}
/**
- * ipi_send_mask - send an IPI to target Linux SMP CPU(s)
+ * __ipi_send_mask - send an IPI to target Linux SMP CPU(s)
* @desc: pointer to irq_desc of the IRQ
* @dest: dest CPU(s), must be a subset of the mask passed to
* irq_reserve_ipi()
@@ -258,7 +264,7 @@ int __ipi_send_single(struct irq_desc *desc, unsigned int cpu)
* This function is for architecture or core code to speed up IPI sending. Not
* usable from driver code.
*
- * Returns zero on success and negative error number on failure.
+ * Return: %0 on success or negative error number on failure.
*/
int __ipi_send_mask(struct irq_desc *desc, const struct cpumask *dest)
{
@@ -298,11 +304,11 @@ int __ipi_send_mask(struct irq_desc *desc, const struct cpumask *dest)
/**
* ipi_send_single - Send an IPI to a single CPU
- * @virq: linux irq number from irq_reserve_ipi()
+ * @virq: Linux IRQ number from irq_reserve_ipi()
* @cpu: destination CPU, must in the destination mask passed to
* irq_reserve_ipi()
*
- * Returns zero on success and negative error number on failure.
+ * Return: %0 on success or negative error number on failure.
*/
int ipi_send_single(unsigned int virq, unsigned int cpu)
{
@@ -319,11 +325,11 @@ EXPORT_SYMBOL_GPL(ipi_send_single);
/**
* ipi_send_mask - Send an IPI to target CPU(s)
- * @virq: linux irq number from irq_reserve_ipi()
+ * @virq: Linux IRQ number from irq_reserve_ipi()
* @dest: dest CPU(s), must be a subset of the mask passed to
* irq_reserve_ipi()
*
- * Returns zero on success and negative error number on failure.
+ * Return: %0 on success or negative error number on failure.
*/
int ipi_send_mask(unsigned int virq, const struct cpumask *dest)
{
diff --git a/kernel/irq/irq_sim.c b/kernel/irq/irq_sim.c
index 0cd02efa3a74..dd76323ea3fd 100644
--- a/kernel/irq/irq_sim.c
+++ b/kernel/irq/irq_sim.c
@@ -181,7 +181,7 @@ struct irq_domain *irq_domain_create_sim(struct fwnode_handle *fwnode,
goto err_free_bitmap;
work_ctx->irq_count = num_irqs;
- init_irq_work(&work_ctx->work, irq_sim_handle_irq);
+ work_ctx->work = IRQ_WORK_INIT_HARD(irq_sim_handle_irq);
return work_ctx->domain;
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index fadb93766020..240e145e969f 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -188,7 +188,7 @@ static ssize_t hwirq_show(struct kobject *kobj,
raw_spin_lock_irq(&desc->lock);
if (desc->irq_data.domain)
- ret = sprintf(buf, "%d\n", (int)desc->irq_data.hwirq);
+ ret = sprintf(buf, "%lu\n", desc->irq_data.hwirq);
raw_spin_unlock_irq(&desc->lock);
return ret;
@@ -251,7 +251,7 @@ static ssize_t actions_show(struct kobject *kobj,
char *p = "";
raw_spin_lock_irq(&desc->lock);
- for (action = desc->action; action != NULL; action = action->next) {
+ for_each_action_of_desc(desc, action) {
ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%s",
p, action->name);
p = ",";
@@ -277,7 +277,7 @@ static struct attribute *irq_attrs[] = {
};
ATTRIBUTE_GROUPS(irq);
-static struct kobj_type irq_kobj_type = {
+static const struct kobj_type irq_kobj_type = {
.release = irq_kobj_release,
.sysfs_ops = &kobj_sysfs_ops,
.default_groups = irq_groups,
@@ -288,22 +288,25 @@ static void irq_sysfs_add(int irq, struct irq_desc *desc)
if (irq_kobj_base) {
/*
* Continue even in case of failure as this is nothing
- * crucial.
+ * crucial and failures in the late irq_sysfs_init()
+ * cannot be rolled back.
*/
if (kobject_add(&desc->kobj, irq_kobj_base, "%d", irq))
pr_warn("Failed to add kobject for irq %d\n", irq);
+ else
+ desc->istate |= IRQS_SYSFS;
}
}
static void irq_sysfs_del(struct irq_desc *desc)
{
/*
- * If irq_sysfs_init() has not yet been invoked (early boot), then
- * irq_kobj_base is NULL and the descriptor was never added.
- * kobject_del() complains about a object with no parent, so make
- * it conditional.
+ * Only invoke kobject_del() when kobject_add() was successfully
+ * invoked for the descriptor. This covers both early boot, where
+ * sysfs is not initialized yet, and the case of a failed
+ * kobject_add() invocation.
*/
- if (irq_kobj_base)
+ if (desc->istate & IRQS_SYSFS)
kobject_del(&desc->kobj);
}
@@ -332,7 +335,7 @@ postcore_initcall(irq_sysfs_init);
#else /* !CONFIG_SYSFS */
-static struct kobj_type irq_kobj_type = {
+static const struct kobj_type irq_kobj_type = {
.release = irq_kobj_release,
};
@@ -407,6 +410,7 @@ static struct irq_desc *alloc_desc(int irq, int node, unsigned int flags,
lockdep_set_class(&desc->lock, &irq_desc_lock_class);
mutex_init(&desc->request_mutex);
init_rcu_head(&desc->rcu);
+ init_waitqueue_head(&desc->wait_for_threads);
desc_set_defaults(irq, desc, node, affinity, owner);
irqd_set(&desc->irq_data, flags);
@@ -575,6 +579,7 @@ int __init early_irq_init(void)
raw_spin_lock_init(&desc[i].lock);
lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
mutex_init(&desc[i].request_mutex);
+ init_waitqueue_head(&desc[i].wait_for_threads);
desc_set_defaults(i, &desc[i], node, NULL, NULL);
}
return arch_early_irq_init();
@@ -640,35 +645,62 @@ int handle_irq_desc(struct irq_desc *desc)
return -EINVAL;
data = irq_desc_get_irq_data(desc);
- if (WARN_ON_ONCE(!in_irq() && handle_enforce_irqctx(data)))
+ if (WARN_ON_ONCE(!in_hardirq() && handle_enforce_irqctx(data)))
return -EPERM;
generic_handle_irq_desc(desc);
return 0;
}
-EXPORT_SYMBOL_GPL(handle_irq_desc);
/**
* generic_handle_irq - Invoke the handler for a particular irq
* @irq: The irq number to handle
*
- */
+ * Returns: 0 on success, or -EINVAL if conversion has failed
+ *
+ * This function must be called from an IRQ context with irq regs
+ * initialized.
+ */
int generic_handle_irq(unsigned int irq)
{
return handle_irq_desc(irq_to_desc(irq));
}
EXPORT_SYMBOL_GPL(generic_handle_irq);
+/**
+ * generic_handle_irq_safe - Invoke the handler for a particular irq from any
+ * context.
+ * @irq: The irq number to handle
+ *
+ * Returns: 0 on success, a negative value on error.
+ *
+ * This function can be called from any context (IRQ or process context). It
+ * will report an error if not invoked from IRQ context and the irq has been
+ * marked to enforce IRQ-context only.
+ */
+int generic_handle_irq_safe(unsigned int irq)
+{
+ unsigned long flags;
+ int ret;
+
+ local_irq_save(flags);
+ ret = handle_irq_desc(irq_to_desc(irq));
+ local_irq_restore(flags);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(generic_handle_irq_safe);
+
#ifdef CONFIG_IRQ_DOMAIN
/**
* generic_handle_domain_irq - Invoke the handler for a HW irq belonging
- * to a domain, usually for a non-root interrupt
- * controller
+ * to a domain.
* @domain: The domain where to perform the lookup
* @hwirq: The HW irq number to convert to a logical one
*
* Returns: 0 on success, or -EINVAL if conversion has failed
*
+ * This function must be called from an IRQ context with irq regs
+ * initialized.
*/
int generic_handle_domain_irq(struct irq_domain *domain, unsigned int hwirq)
{
@@ -676,75 +708,47 @@ int generic_handle_domain_irq(struct irq_domain *domain, unsigned int hwirq)
}
EXPORT_SYMBOL_GPL(generic_handle_domain_irq);
-#ifdef CONFIG_HANDLE_DOMAIN_IRQ
-/**
- * handle_domain_irq - Invoke the handler for a HW irq belonging to a domain,
- * usually for a root interrupt controller
+ /**
+ * generic_handle_irq_safe - Invoke the handler for a HW irq belonging
+ * to a domain from any context.
* @domain: The domain where to perform the lookup
* @hwirq: The HW irq number to convert to a logical one
- * @regs: Register file coming from the low-level handling code
*
- * Returns: 0 on success, or -EINVAL if conversion has failed
+ * Returns: 0 on success, a negative value on error.
+ *
+ * This function can be called from any context (IRQ or process
+ * context). If the interrupt is marked as 'enforce IRQ-context only' then
+ * the function must be invoked from hard interrupt context.
*/
-int handle_domain_irq(struct irq_domain *domain,
- unsigned int hwirq, struct pt_regs *regs)
+int generic_handle_domain_irq_safe(struct irq_domain *domain, unsigned int hwirq)
{
- struct pt_regs *old_regs = set_irq_regs(regs);
- struct irq_desc *desc;
- int ret = 0;
-
- irq_enter();
-
- /* The irqdomain code provides boundary checks */
- desc = irq_resolve_mapping(domain, hwirq);
- if (likely(desc))
- handle_irq_desc(desc);
- else
- ret = -EINVAL;
+ unsigned long flags;
+ int ret;
- irq_exit();
- set_irq_regs(old_regs);
+ local_irq_save(flags);
+ ret = handle_irq_desc(irq_resolve_mapping(domain, hwirq));
+ local_irq_restore(flags);
return ret;
}
+EXPORT_SYMBOL_GPL(generic_handle_domain_irq_safe);
/**
- * handle_domain_nmi - Invoke the handler for a HW irq belonging to a domain
+ * generic_handle_domain_nmi - Invoke the handler for a HW nmi belonging
+ * to a domain.
* @domain: The domain where to perform the lookup
* @hwirq: The HW irq number to convert to a logical one
- * @regs: Register file coming from the low-level handling code
- *
- * This function must be called from an NMI context.
*
* Returns: 0 on success, or -EINVAL if conversion has failed
- */
-int handle_domain_nmi(struct irq_domain *domain, unsigned int hwirq,
- struct pt_regs *regs)
+ *
+ * This function must be called from an NMI context with irq regs
+ * initialized.
+ **/
+int generic_handle_domain_nmi(struct irq_domain *domain, unsigned int hwirq)
{
- struct pt_regs *old_regs = set_irq_regs(regs);
- struct irq_desc *desc;
- int ret = 0;
-
- /*
- * NMI context needs to be setup earlier in order to deal with tracing.
- */
- WARN_ON(!in_nmi());
-
- desc = irq_resolve_mapping(domain, hwirq);
-
- /*
- * ack_bad_irq is not NMI-safe, just report
- * an invalid interrupt.
- */
- if (likely(desc))
- handle_irq_desc(desc);
- else
- ret = -EINVAL;
-
- set_irq_regs(old_regs);
- return ret;
+ WARN_ON_ONCE(!in_nmi());
+ return handle_irq_desc(irq_resolve_mapping(domain, hwirq));
}
#endif
-#endif
/* Dynamic interrupt handling */
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 51c483ce2447..f34760a1e222 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -25,6 +25,9 @@ static DEFINE_MUTEX(irq_domain_mutex);
static struct irq_domain *irq_default_domain;
+static int irq_domain_alloc_irqs_locked(struct irq_domain *domain, int irq_base,
+ unsigned int nr_irqs, int node, void *arg,
+ bool realloc, const struct irq_affinity_desc *affinity);
static void irq_domain_check_hierarchy(struct irq_domain *domain);
struct irqchip_fwid {
@@ -114,7 +117,7 @@ void irq_domain_free_fwnode(struct fwnode_handle *fwnode)
{
struct irqchip_fwid *fwid;
- if (WARN_ON(!is_fwnode_irqchip(fwnode)))
+ if (!fwnode || WARN_ON(!is_fwnode_irqchip(fwnode)))
return;
fwid = container_of(fwnode, struct irqchip_fwid, fwnode);
@@ -123,23 +126,12 @@ void irq_domain_free_fwnode(struct fwnode_handle *fwnode)
}
EXPORT_SYMBOL_GPL(irq_domain_free_fwnode);
-/**
- * __irq_domain_add() - Allocate a new irq_domain data structure
- * @fwnode: firmware node for the interrupt controller
- * @size: Size of linear map; 0 for radix mapping only
- * @hwirq_max: Maximum number of interrupts supported by controller
- * @direct_max: Maximum value of direct maps; Use ~0 for no limit; 0 for no
- * direct mapping
- * @ops: domain callbacks
- * @host_data: Controller private data pointer
- *
- * Allocates and initializes an irq_domain structure.
- * Returns pointer to IRQ domain, or NULL on failure.
- */
-struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size,
- irq_hw_number_t hwirq_max, int direct_max,
- const struct irq_domain_ops *ops,
- void *host_data)
+static struct irq_domain *__irq_domain_create(struct fwnode_handle *fwnode,
+ unsigned int size,
+ irq_hw_number_t hwirq_max,
+ int direct_max,
+ const struct irq_domain_ops *ops,
+ void *host_data)
{
struct irqchip_fwid *fwid;
struct irq_domain *domain;
@@ -147,7 +139,8 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size,
static atomic_t unknown_domains;
if (WARN_ON((size && direct_max) ||
- (!IS_ENABLED(CONFIG_IRQ_DOMAIN_NOMAP) && direct_max)))
+ (!IS_ENABLED(CONFIG_IRQ_DOMAIN_NOMAP) && direct_max) ||
+ (direct_max && (direct_max != hwirq_max))))
return NULL;
domain = kzalloc_node(struct_size(domain, revmap, size),
@@ -213,26 +206,66 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size,
/* Fill structure */
INIT_RADIX_TREE(&domain->revmap_tree, GFP_KERNEL);
- mutex_init(&domain->revmap_mutex);
domain->ops = ops;
domain->host_data = host_data;
domain->hwirq_max = hwirq_max;
- if (direct_max) {
- size = direct_max;
+ if (direct_max)
domain->flags |= IRQ_DOMAIN_FLAG_NO_MAP;
- }
domain->revmap_size = size;
+ /*
+ * Hierarchical domains use the domain lock of the root domain
+ * (innermost domain).
+ *
+ * For non-hierarchical domains (as for root domains), the root
+ * pointer is set to the domain itself so that &domain->root->mutex
+ * always points to the right lock.
+ */
+ mutex_init(&domain->mutex);
+ domain->root = domain;
+
irq_domain_check_hierarchy(domain);
+ return domain;
+}
+
+static void __irq_domain_publish(struct irq_domain *domain)
+{
mutex_lock(&irq_domain_mutex);
debugfs_add_domain_dir(domain);
list_add(&domain->link, &irq_domain_list);
mutex_unlock(&irq_domain_mutex);
pr_debug("Added domain %s\n", domain->name);
+}
+
+/**
+ * __irq_domain_add() - Allocate a new irq_domain data structure
+ * @fwnode: firmware node for the interrupt controller
+ * @size: Size of linear map; 0 for radix mapping only
+ * @hwirq_max: Maximum number of interrupts supported by controller
+ * @direct_max: Maximum value of direct maps; Use ~0 for no limit; 0 for no
+ * direct mapping
+ * @ops: domain callbacks
+ * @host_data: Controller private data pointer
+ *
+ * Allocates and initializes an irq_domain structure.
+ * Returns pointer to IRQ domain, or NULL on failure.
+ */
+struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, unsigned int size,
+ irq_hw_number_t hwirq_max, int direct_max,
+ const struct irq_domain_ops *ops,
+ void *host_data)
+{
+ struct irq_domain *domain;
+
+ domain = __irq_domain_create(fwnode, size, hwirq_max, direct_max,
+ ops, host_data);
+ if (domain)
+ __irq_domain_publish(domain);
+
return domain;
}
EXPORT_SYMBOL_GPL(__irq_domain_add);
@@ -437,31 +470,6 @@ struct irq_domain *irq_find_matching_fwspec(struct irq_fwspec *fwspec,
EXPORT_SYMBOL_GPL(irq_find_matching_fwspec);
/**
- * irq_domain_check_msi_remap - Check whether all MSI irq domains implement
- * IRQ remapping
- *
- * Return: false if any MSI irq domain does not support IRQ remapping,
- * true otherwise (including if there is no MSI irq domain)
- */
-bool irq_domain_check_msi_remap(void)
-{
- struct irq_domain *h;
- bool ret = true;
-
- mutex_lock(&irq_domain_mutex);
- list_for_each_entry(h, &irq_domain_list, link) {
- if (irq_domain_is_msi(h) &&
- !irq_domain_hierarchical_is_msi_remap(h)) {
- ret = false;
- break;
- }
- }
- mutex_unlock(&irq_domain_mutex);
- return ret;
-}
-EXPORT_SYMBOL_GPL(irq_domain_check_msi_remap);
-
-/**
* irq_set_default_host() - Set a "default" irq domain
* @domain: default domain pointer
*
@@ -491,6 +499,7 @@ struct irq_domain *irq_get_default_host(void)
{
return irq_default_domain;
}
+EXPORT_SYMBOL_GPL(irq_get_default_host);
static bool irq_domain_is_nomap(struct irq_domain *domain)
{
@@ -501,30 +510,34 @@ static bool irq_domain_is_nomap(struct irq_domain *domain)
static void irq_domain_clear_mapping(struct irq_domain *domain,
irq_hw_number_t hwirq)
{
+ lockdep_assert_held(&domain->root->mutex);
+
if (irq_domain_is_nomap(domain))
return;
- mutex_lock(&domain->revmap_mutex);
if (hwirq < domain->revmap_size)
rcu_assign_pointer(domain->revmap[hwirq], NULL);
else
radix_tree_delete(&domain->revmap_tree, hwirq);
- mutex_unlock(&domain->revmap_mutex);
}
static void irq_domain_set_mapping(struct irq_domain *domain,
irq_hw_number_t hwirq,
struct irq_data *irq_data)
{
+ /*
+ * This also makes sure that all domains point to the same root when
+ * called from irq_domain_insert_irq() for each domain in a hierarchy.
+ */
+ lockdep_assert_held(&domain->root->mutex);
+
if (irq_domain_is_nomap(domain))
return;
- mutex_lock(&domain->revmap_mutex);
if (hwirq < domain->revmap_size)
rcu_assign_pointer(domain->revmap[hwirq], irq_data);
else
radix_tree_insert(&domain->revmap_tree, hwirq, irq_data);
- mutex_unlock(&domain->revmap_mutex);
}
static void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq)
@@ -537,6 +550,9 @@ static void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq)
return;
hwirq = irq_data->hwirq;
+
+ mutex_lock(&domain->root->mutex);
+
irq_set_status_flags(irq, IRQ_NOREQUEST);
/* remove chip and handler */
@@ -556,10 +572,12 @@ static void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq)
/* Clear reverse map for this hwirq */
irq_domain_clear_mapping(domain, hwirq);
+
+ mutex_unlock(&domain->root->mutex);
}
-int irq_domain_associate(struct irq_domain *domain, unsigned int virq,
- irq_hw_number_t hwirq)
+static int irq_domain_associate_locked(struct irq_domain *domain, unsigned int virq,
+ irq_hw_number_t hwirq)
{
struct irq_data *irq_data = irq_get_irq_data(virq);
int ret;
@@ -572,7 +590,6 @@ int irq_domain_associate(struct irq_domain *domain, unsigned int virq,
if (WARN(irq_data->domain, "error: virq%i is already associated", virq))
return -EINVAL;
- mutex_lock(&irq_domain_mutex);
irq_data->hwirq = hwirq;
irq_data->domain = domain;
if (domain->ops->map) {
@@ -589,23 +606,29 @@ int irq_domain_associate(struct irq_domain *domain, unsigned int virq,
}
irq_data->domain = NULL;
irq_data->hwirq = 0;
- mutex_unlock(&irq_domain_mutex);
return ret;
}
-
- /* If not already assigned, give the domain the chip's name */
- if (!domain->name && irq_data->chip)
- domain->name = irq_data->chip->name;
}
domain->mapcount++;
irq_domain_set_mapping(domain, hwirq, irq_data);
- mutex_unlock(&irq_domain_mutex);
irq_clear_status_flags(virq, IRQ_NOREQUEST);
return 0;
}
+
+int irq_domain_associate(struct irq_domain *domain, unsigned int virq,
+ irq_hw_number_t hwirq)
+{
+ int ret;
+
+ mutex_lock(&domain->root->mutex);
+ ret = irq_domain_associate_locked(domain, virq, hwirq);
+ mutex_unlock(&domain->root->mutex);
+
+ return ret;
+}
EXPORT_SYMBOL_GPL(irq_domain_associate);
void irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base,
@@ -618,9 +641,8 @@ void irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base,
pr_debug("%s(%s, irqbase=%i, hwbase=%i, count=%i)\n", __func__,
of_node_full_name(of_node), irq_base, (int)hwirq_base, count);
- for (i = 0; i < count; i++) {
+ for (i = 0; i < count; i++)
irq_domain_associate(domain, irq_base + i, hwirq_base + i);
- }
}
EXPORT_SYMBOL_GPL(irq_domain_associate_many);
@@ -649,9 +671,9 @@ unsigned int irq_create_direct_mapping(struct irq_domain *domain)
pr_debug("create_direct virq allocation failed\n");
return 0;
}
- if (virq >= domain->revmap_size) {
- pr_err("ERROR: no free irqs available below %i maximum\n",
- domain->revmap_size);
+ if (virq >= domain->hwirq_max) {
+ pr_err("ERROR: no free irqs available below %lu maximum\n",
+ domain->hwirq_max);
irq_free_desc(virq);
return 0;
}
@@ -667,6 +689,34 @@ unsigned int irq_create_direct_mapping(struct irq_domain *domain)
EXPORT_SYMBOL_GPL(irq_create_direct_mapping);
#endif
+static unsigned int irq_create_mapping_affinity_locked(struct irq_domain *domain,
+ irq_hw_number_t hwirq,
+ const struct irq_affinity_desc *affinity)
+{
+ struct device_node *of_node = irq_domain_get_of_node(domain);
+ int virq;
+
+ pr_debug("irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq);
+
+ /* Allocate a virtual interrupt number */
+ virq = irq_domain_alloc_descs(-1, 1, hwirq, of_node_to_nid(of_node),
+ affinity);
+ if (virq <= 0) {
+ pr_debug("-> virq allocation failed\n");
+ return 0;
+ }
+
+ if (irq_domain_associate_locked(domain, virq, hwirq)) {
+ irq_free_desc(virq);
+ return 0;
+ }
+
+ pr_debug("irq %lu on domain %s mapped to virtual irq %u\n",
+ hwirq, of_node_full_name(of_node), virq);
+
+ return virq;
+}
+
/**
* irq_create_mapping_affinity() - Map a hardware interrupt into linux irq space
* @domain: domain owning this hardware interrupt or NULL for default domain
@@ -679,14 +729,11 @@ EXPORT_SYMBOL_GPL(irq_create_direct_mapping);
* on the number returned from that call.
*/
unsigned int irq_create_mapping_affinity(struct irq_domain *domain,
- irq_hw_number_t hwirq,
- const struct irq_affinity_desc *affinity)
+ irq_hw_number_t hwirq,
+ const struct irq_affinity_desc *affinity)
{
- struct device_node *of_node;
int virq;
- pr_debug("irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq);
-
/* Look for default domain if necessary */
if (domain == NULL)
domain = irq_default_domain;
@@ -694,32 +741,19 @@ unsigned int irq_create_mapping_affinity(struct irq_domain *domain,
WARN(1, "%s(, %lx) called with NULL domain\n", __func__, hwirq);
return 0;
}
- pr_debug("-> using domain @%p\n", domain);
- of_node = irq_domain_get_of_node(domain);
+ mutex_lock(&domain->root->mutex);
/* Check if mapping already exists */
virq = irq_find_mapping(domain, hwirq);
if (virq) {
- pr_debug("-> existing mapping on virq %d\n", virq);
- return virq;
- }
-
- /* Allocate a virtual interrupt number */
- virq = irq_domain_alloc_descs(-1, 1, hwirq, of_node_to_nid(of_node),
- affinity);
- if (virq <= 0) {
- pr_debug("-> virq allocation failed\n");
- return 0;
+ pr_debug("existing mapping on virq %d\n", virq);
+ goto out;
}
- if (irq_domain_associate(domain, virq, hwirq)) {
- irq_free_desc(virq);
- return 0;
- }
-
- pr_debug("irq %lu on domain %s mapped to virtual irq %u\n",
- hwirq, of_node_full_name(of_node), virq);
+ virq = irq_create_mapping_affinity_locked(domain, hwirq, affinity);
+out:
+ mutex_unlock(&domain->root->mutex);
return virq;
}
@@ -743,9 +777,8 @@ static int irq_domain_translate(struct irq_domain *d,
return 0;
}
-static void of_phandle_args_to_fwspec(struct device_node *np, const u32 *args,
- unsigned int count,
- struct irq_fwspec *fwspec)
+void of_phandle_args_to_fwspec(struct device_node *np, const u32 *args,
+ unsigned int count, struct irq_fwspec *fwspec)
{
int i;
@@ -755,6 +788,7 @@ static void of_phandle_args_to_fwspec(struct device_node *np, const u32 *args,
for (i = 0; i < count; i++)
fwspec->param[i] = args[i];
}
+EXPORT_SYMBOL_GPL(of_phandle_args_to_fwspec);
unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec)
{
@@ -788,6 +822,8 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec)
if (WARN_ON(type & ~IRQ_TYPE_SENSE_MASK))
type &= IRQ_TYPE_SENSE_MASK;
+ mutex_lock(&domain->root->mutex);
+
/*
* If we've already configured this interrupt,
* don't do it again, or hell will break loose.
@@ -800,7 +836,7 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec)
* interrupt number.
*/
if (type == IRQ_TYPE_NONE || type == irq_get_trigger_type(virq))
- return virq;
+ goto out;
/*
* If the trigger type has not been set yet, then set
@@ -808,40 +844,45 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec)
*/
if (irq_get_trigger_type(virq) == IRQ_TYPE_NONE) {
irq_data = irq_get_irq_data(virq);
- if (!irq_data)
- return 0;
+ if (!irq_data) {
+ virq = 0;
+ goto out;
+ }
irqd_set_trigger_type(irq_data, type);
- return virq;
+ goto out;
}
pr_warn("type mismatch, failed to map hwirq-%lu for %s!\n",
hwirq, of_node_full_name(to_of_node(fwspec->fwnode)));
- return 0;
+ virq = 0;
+ goto out;
}
if (irq_domain_is_hierarchy(domain)) {
- virq = irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, fwspec);
- if (virq <= 0)
- return 0;
+ virq = irq_domain_alloc_irqs_locked(domain, -1, 1, NUMA_NO_NODE,
+ fwspec, false, NULL);
+ if (virq <= 0) {
+ virq = 0;
+ goto out;
+ }
} else {
/* Create mapping */
- virq = irq_create_mapping(domain, hwirq);
+ virq = irq_create_mapping_affinity_locked(domain, hwirq, NULL);
if (!virq)
- return virq;
+ goto out;
}
irq_data = irq_get_irq_data(virq);
- if (!irq_data) {
- if (irq_domain_is_hierarchy(domain))
- irq_domain_free_irqs(virq, 1);
- else
- irq_dispose_mapping(virq);
- return 0;
+ if (WARN_ON(!irq_data)) {
+ virq = 0;
+ goto out;
}
/* Store trigger type */
irqd_set_trigger_type(irq_data, type);
+out:
+ mutex_unlock(&domain->root->mutex);
return virq;
}
@@ -905,10 +946,12 @@ struct irq_desc *__irq_resolve_mapping(struct irq_domain *domain,
return desc;
if (irq_domain_is_nomap(domain)) {
- if (hwirq < domain->revmap_size) {
+ if (hwirq < domain->hwirq_max) {
data = irq_domain_get_irq_data(domain, hwirq);
if (data && data->hwirq == hwirq)
desc = irq_data_to_desc(data);
+ if (irq && desc)
+ *irq = hwirq;
}
return desc;
@@ -1099,12 +1142,17 @@ struct irq_domain *irq_domain_create_hierarchy(struct irq_domain *parent,
struct irq_domain *domain;
if (size)
- domain = irq_domain_create_linear(fwnode, size, ops, host_data);
+ domain = __irq_domain_create(fwnode, size, size, 0, ops, host_data);
else
- domain = irq_domain_create_tree(fwnode, ops, host_data);
+ domain = __irq_domain_create(fwnode, 0, ~0, 0, ops, host_data);
+
if (domain) {
+ if (parent)
+ domain->root = parent->root;
domain->parent = parent;
domain->flags |= flags;
+
+ __irq_domain_publish(domain);
}
return domain;
@@ -1120,10 +1168,6 @@ static void irq_domain_insert_irq(int virq)
domain->mapcount++;
irq_domain_set_mapping(domain, data->hwirq, data);
-
- /* If not already assigned, give the domain the chip's name */
- if (!domain->name && data->chip)
- domain->name = data->chip->name;
}
irq_clear_status_flags(virq, IRQ_NOREQUEST);
@@ -1215,6 +1259,7 @@ int irq_domain_disconnect_hierarchy(struct irq_domain *domain,
irqd->chip = ERR_PTR(-ENOTCONN);
return 0;
}
+EXPORT_SYMBOL_GPL(irq_domain_disconnect_hierarchy);
static int irq_domain_trim_hierarchy(unsigned int virq)
{
@@ -1317,7 +1362,8 @@ EXPORT_SYMBOL_GPL(irq_domain_get_irq_data);
* @chip_data: The associated chip data
*/
int irq_domain_set_hwirq_and_chip(struct irq_domain *domain, unsigned int virq,
- irq_hw_number_t hwirq, struct irq_chip *chip,
+ irq_hw_number_t hwirq,
+ const struct irq_chip *chip,
void *chip_data)
{
struct irq_data *irq_data = irq_domain_get_irq_data(domain, virq);
@@ -1326,7 +1372,7 @@ int irq_domain_set_hwirq_and_chip(struct irq_domain *domain, unsigned int virq,
return -ENOENT;
irq_data->hwirq = hwirq;
- irq_data->chip = chip ? chip : &no_irq_chip;
+ irq_data->chip = (struct irq_chip *)(chip ? chip : &no_irq_chip);
irq_data->chip_data = chip_data;
return 0;
@@ -1345,7 +1391,7 @@ EXPORT_SYMBOL_GPL(irq_domain_set_hwirq_and_chip);
* @handler_name: The interrupt handler name
*/
void irq_domain_set_info(struct irq_domain *domain, unsigned int virq,
- irq_hw_number_t hwirq, struct irq_chip *chip,
+ irq_hw_number_t hwirq, const struct irq_chip *chip,
void *chip_data, irq_flow_handler_t handler,
void *handler_data, const char *handler_name)
{
@@ -1421,40 +1467,12 @@ int irq_domain_alloc_irqs_hierarchy(struct irq_domain *domain,
return domain->ops->alloc(domain, irq_base, nr_irqs, arg);
}
-/**
- * __irq_domain_alloc_irqs - Allocate IRQs from domain
- * @domain: domain to allocate from
- * @irq_base: allocate specified IRQ number if irq_base >= 0
- * @nr_irqs: number of IRQs to allocate
- * @node: NUMA node id for memory allocation
- * @arg: domain specific argument
- * @realloc: IRQ descriptors have already been allocated if true
- * @affinity: Optional irq affinity mask for multiqueue devices
- *
- * Allocate IRQ numbers and initialized all data structures to support
- * hierarchy IRQ domains.
- * Parameter @realloc is mainly to support legacy IRQs.
- * Returns error code or allocated IRQ number
- *
- * The whole process to setup an IRQ has been split into two steps.
- * The first step, __irq_domain_alloc_irqs(), is to allocate IRQ
- * descriptor and required hardware resources. The second step,
- * irq_domain_activate_irq(), is to program the hardware with preallocated
- * resources. In this way, it's easier to rollback when failing to
- * allocate resources.
- */
-int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base,
- unsigned int nr_irqs, int node, void *arg,
- bool realloc, const struct irq_affinity_desc *affinity)
+static int irq_domain_alloc_irqs_locked(struct irq_domain *domain, int irq_base,
+ unsigned int nr_irqs, int node, void *arg,
+ bool realloc, const struct irq_affinity_desc *affinity)
{
int i, ret, virq;
- if (domain == NULL) {
- domain = irq_default_domain;
- if (WARN(!domain, "domain is NULL; cannot allocate IRQ\n"))
- return -EINVAL;
- }
-
if (realloc && irq_base >= 0) {
virq = irq_base;
} else {
@@ -1473,24 +1491,18 @@ int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base,
goto out_free_desc;
}
- mutex_lock(&irq_domain_mutex);
ret = irq_domain_alloc_irqs_hierarchy(domain, virq, nr_irqs, arg);
- if (ret < 0) {
- mutex_unlock(&irq_domain_mutex);
+ if (ret < 0)
goto out_free_irq_data;
- }
for (i = 0; i < nr_irqs; i++) {
ret = irq_domain_trim_hierarchy(virq + i);
- if (ret) {
- mutex_unlock(&irq_domain_mutex);
+ if (ret)
goto out_free_irq_data;
- }
}
-
+
for (i = 0; i < nr_irqs; i++)
irq_domain_insert_irq(virq + i);
- mutex_unlock(&irq_domain_mutex);
return virq;
@@ -1501,16 +1513,60 @@ out_free_desc:
return ret;
}
+/**
+ * __irq_domain_alloc_irqs - Allocate IRQs from domain
+ * @domain: domain to allocate from
+ * @irq_base: allocate specified IRQ number if irq_base >= 0
+ * @nr_irqs: number of IRQs to allocate
+ * @node: NUMA node id for memory allocation
+ * @arg: domain specific argument
+ * @realloc: IRQ descriptors have already been allocated if true
+ * @affinity: Optional irq affinity mask for multiqueue devices
+ *
+ * Allocate IRQ numbers and initialized all data structures to support
+ * hierarchy IRQ domains.
+ * Parameter @realloc is mainly to support legacy IRQs.
+ * Returns error code or allocated IRQ number
+ *
+ * The whole process to setup an IRQ has been split into two steps.
+ * The first step, __irq_domain_alloc_irqs(), is to allocate IRQ
+ * descriptor and required hardware resources. The second step,
+ * irq_domain_activate_irq(), is to program the hardware with preallocated
+ * resources. In this way, it's easier to rollback when failing to
+ * allocate resources.
+ */
+int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base,
+ unsigned int nr_irqs, int node, void *arg,
+ bool realloc, const struct irq_affinity_desc *affinity)
+{
+ int ret;
+
+ if (domain == NULL) {
+ domain = irq_default_domain;
+ if (WARN(!domain, "domain is NULL; cannot allocate IRQ\n"))
+ return -EINVAL;
+ }
+
+ mutex_lock(&domain->root->mutex);
+ ret = irq_domain_alloc_irqs_locked(domain, irq_base, nr_irqs, node, arg,
+ realloc, affinity);
+ mutex_unlock(&domain->root->mutex);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(__irq_domain_alloc_irqs);
+
/* The irq_data was moved, fix the revmap to refer to the new location */
static void irq_domain_fix_revmap(struct irq_data *d)
{
void __rcu **slot;
+ lockdep_assert_held(&d->domain->root->mutex);
+
if (irq_domain_is_nomap(d->domain))
return;
/* Fix up the revmap. */
- mutex_lock(&d->domain->revmap_mutex);
if (d->hwirq < d->domain->revmap_size) {
/* Not using radix tree */
rcu_assign_pointer(d->domain->revmap[d->hwirq], d);
@@ -1519,7 +1575,6 @@ static void irq_domain_fix_revmap(struct irq_data *d)
if (slot)
radix_tree_replace_slot(&d->domain->revmap_tree, slot, d);
}
- mutex_unlock(&d->domain->revmap_mutex);
}
/**
@@ -1535,8 +1590,8 @@ static void irq_domain_fix_revmap(struct irq_data *d)
*/
int irq_domain_push_irq(struct irq_domain *domain, int virq, void *arg)
{
- struct irq_data *child_irq_data;
- struct irq_data *root_irq_data = irq_get_irq_data(virq);
+ struct irq_data *irq_data = irq_get_irq_data(virq);
+ struct irq_data *parent_irq_data;
struct irq_desc *desc;
int rv = 0;
@@ -1561,47 +1616,46 @@ int irq_domain_push_irq(struct irq_domain *domain, int virq, void *arg)
if (WARN_ON(!irq_domain_is_hierarchy(domain)))
return -EINVAL;
- if (!root_irq_data)
+ if (!irq_data)
return -EINVAL;
- if (domain->parent != root_irq_data->domain)
+ if (domain->parent != irq_data->domain)
return -EINVAL;
- child_irq_data = kzalloc_node(sizeof(*child_irq_data), GFP_KERNEL,
- irq_data_get_node(root_irq_data));
- if (!child_irq_data)
+ parent_irq_data = kzalloc_node(sizeof(*parent_irq_data), GFP_KERNEL,
+ irq_data_get_node(irq_data));
+ if (!parent_irq_data)
return -ENOMEM;
- mutex_lock(&irq_domain_mutex);
+ mutex_lock(&domain->root->mutex);
/* Copy the original irq_data. */
- *child_irq_data = *root_irq_data;
+ *parent_irq_data = *irq_data;
/*
- * Overwrite the root_irq_data, which is embedded in struct
- * irq_desc, with values for this domain.
+ * Overwrite the irq_data, which is embedded in struct irq_desc, with
+ * values for this domain.
*/
- root_irq_data->parent_data = child_irq_data;
- root_irq_data->domain = domain;
- root_irq_data->mask = 0;
- root_irq_data->hwirq = 0;
- root_irq_data->chip = NULL;
- root_irq_data->chip_data = NULL;
+ irq_data->parent_data = parent_irq_data;
+ irq_data->domain = domain;
+ irq_data->mask = 0;
+ irq_data->hwirq = 0;
+ irq_data->chip = NULL;
+ irq_data->chip_data = NULL;
/* May (probably does) set hwirq, chip, etc. */
rv = irq_domain_alloc_irqs_hierarchy(domain, virq, 1, arg);
if (rv) {
/* Restore the original irq_data. */
- *root_irq_data = *child_irq_data;
- kfree(child_irq_data);
+ *irq_data = *parent_irq_data;
+ kfree(parent_irq_data);
goto error;
}
- irq_domain_fix_revmap(child_irq_data);
- irq_domain_set_mapping(domain, root_irq_data->hwirq, root_irq_data);
-
+ irq_domain_fix_revmap(parent_irq_data);
+ irq_domain_set_mapping(domain, irq_data->hwirq, irq_data);
error:
- mutex_unlock(&irq_domain_mutex);
+ mutex_unlock(&domain->root->mutex);
return rv;
}
@@ -1617,8 +1671,8 @@ EXPORT_SYMBOL_GPL(irq_domain_push_irq);
*/
int irq_domain_pop_irq(struct irq_domain *domain, int virq)
{
- struct irq_data *root_irq_data = irq_get_irq_data(virq);
- struct irq_data *child_irq_data;
+ struct irq_data *irq_data = irq_get_irq_data(virq);
+ struct irq_data *parent_irq_data;
struct irq_data *tmp_irq_data;
struct irq_desc *desc;
@@ -1640,37 +1694,37 @@ int irq_domain_pop_irq(struct irq_domain *domain, int virq)
if (domain == NULL)
return -EINVAL;
- if (!root_irq_data)
+ if (!irq_data)
return -EINVAL;
tmp_irq_data = irq_domain_get_irq_data(domain, virq);
/* We can only "pop" if this domain is at the top of the list */
- if (WARN_ON(root_irq_data != tmp_irq_data))
+ if (WARN_ON(irq_data != tmp_irq_data))
return -EINVAL;
- if (WARN_ON(root_irq_data->domain != domain))
+ if (WARN_ON(irq_data->domain != domain))
return -EINVAL;
- child_irq_data = root_irq_data->parent_data;
- if (WARN_ON(!child_irq_data))
+ parent_irq_data = irq_data->parent_data;
+ if (WARN_ON(!parent_irq_data))
return -EINVAL;
- mutex_lock(&irq_domain_mutex);
+ mutex_lock(&domain->root->mutex);
- root_irq_data->parent_data = NULL;
+ irq_data->parent_data = NULL;
- irq_domain_clear_mapping(domain, root_irq_data->hwirq);
+ irq_domain_clear_mapping(domain, irq_data->hwirq);
irq_domain_free_irqs_hierarchy(domain, virq, 1);
/* Restore the original irq_data. */
- *root_irq_data = *child_irq_data;
+ *irq_data = *parent_irq_data;
- irq_domain_fix_revmap(root_irq_data);
+ irq_domain_fix_revmap(irq_data);
- mutex_unlock(&irq_domain_mutex);
+ mutex_unlock(&domain->root->mutex);
- kfree(child_irq_data);
+ kfree(parent_irq_data);
return 0;
}
@@ -1684,17 +1738,20 @@ EXPORT_SYMBOL_GPL(irq_domain_pop_irq);
void irq_domain_free_irqs(unsigned int virq, unsigned int nr_irqs)
{
struct irq_data *data = irq_get_irq_data(virq);
+ struct irq_domain *domain;
int i;
if (WARN(!data || !data->domain || !data->domain->ops->free,
"NULL pointer, cannot free irq\n"))
return;
- mutex_lock(&irq_domain_mutex);
+ domain = data->domain;
+
+ mutex_lock(&domain->root->mutex);
for (i = 0; i < nr_irqs; i++)
irq_domain_remove_irq(virq + i);
- irq_domain_free_irqs_hierarchy(data->domain, virq, nr_irqs);
- mutex_unlock(&irq_domain_mutex);
+ irq_domain_free_irqs_hierarchy(domain, virq, nr_irqs);
+ mutex_unlock(&domain->root->mutex);
irq_domain_free_irq_data(virq, nr_irqs);
irq_free_descs(virq, nr_irqs);
@@ -1809,20 +1866,6 @@ static void irq_domain_check_hierarchy(struct irq_domain *domain)
if (domain->ops->alloc)
domain->flags |= IRQ_DOMAIN_FLAG_HIERARCHY;
}
-
-/**
- * irq_domain_hierarchical_is_msi_remap - Check if the domain or any
- * parent has MSI remapping support
- * @domain: domain pointer
- */
-bool irq_domain_hierarchical_is_msi_remap(struct irq_domain *domain)
-{
- for (; domain; domain = domain->parent) {
- if (irq_domain_is_msi_remap(domain))
- return true;
- }
- return false;
-}
#else /* CONFIG_IRQ_DOMAIN_HIERARCHY */
/**
* irq_domain_get_irq_data - Get irq_data associated with @virq and @domain
@@ -1850,7 +1893,7 @@ EXPORT_SYMBOL_GPL(irq_domain_get_irq_data);
* @handler_name: The interrupt handler name
*/
void irq_domain_set_info(struct irq_domain *domain, unsigned int virq,
- irq_hw_number_t hwirq, struct irq_chip *chip,
+ irq_hw_number_t hwirq, const struct irq_chip *chip,
void *chip_data, irq_flow_handler_t handler,
void *handler_data, const char *handler_name)
{
@@ -1859,6 +1902,13 @@ void irq_domain_set_info(struct irq_domain *domain, unsigned int virq,
irq_set_handler_data(virq, handler_data);
}
+static int irq_domain_alloc_irqs_locked(struct irq_domain *domain, int irq_base,
+ unsigned int nr_irqs, int node, void *arg,
+ bool realloc, const struct irq_affinity_desc *affinity)
+{
+ return -EINVAL;
+}
+
static void irq_domain_check_hierarchy(struct irq_domain *domain)
{
}
@@ -1909,7 +1959,7 @@ static void debugfs_add_domain_dir(struct irq_domain *d)
static void debugfs_remove_domain_dir(struct irq_domain *d)
{
- debugfs_remove(debugfs_lookup(d->name, domain_dir));
+ debugfs_lookup_and_remove(d->name, domain_dir);
}
void __init irq_domain_debugfs_init(struct dentry *root)
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index ef30b4762947..d2742af0f0fd 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -25,12 +25,11 @@
#include "internals.h"
#if defined(CONFIG_IRQ_FORCED_THREADING) && !defined(CONFIG_PREEMPT_RT)
-__read_mostly bool force_irqthreads;
-EXPORT_SYMBOL_GPL(force_irqthreads);
+DEFINE_STATIC_KEY_FALSE(force_irqthreads_key);
static int __init setup_forced_irqthreads(char *arg)
{
- force_irqthreads = true;
+ static_branch_enable(&force_irqthreads_key);
return 0;
}
early_param("threadirqs", setup_forced_irqthreads);
@@ -190,9 +189,12 @@ void irq_set_thread_affinity(struct irq_desc *desc)
{
struct irqaction *action;
- for_each_action_of_desc(desc, action)
+ for_each_action_of_desc(desc, action) {
if (action->thread)
set_bit(IRQTF_AFFINITY, &action->thread_flags);
+ if (action->secondary && action->secondary->thread)
+ set_bit(IRQTF_AFFINITY, &action->secondary->thread_flags);
+ }
}
#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK
@@ -206,16 +208,8 @@ static void irq_validate_effective_affinity(struct irq_data *data)
pr_warn_once("irq_chip %s did not update eff. affinity mask of irq %u\n",
chip->name, data->irq);
}
-
-static inline void irq_init_effective_affinity(struct irq_data *data,
- const struct cpumask *mask)
-{
- cpumask_copy(irq_data_get_effective_affinity_mask(data), mask);
-}
#else
static inline void irq_validate_effective_affinity(struct irq_data *data) { }
-static inline void irq_init_effective_affinity(struct irq_data *data,
- const struct cpumask *mask) { }
#endif
int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
@@ -223,11 +217,16 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
{
struct irq_desc *desc = irq_data_to_desc(data);
struct irq_chip *chip = irq_data_get_irq_chip(data);
+ const struct cpumask *prog_mask;
int ret;
+ static DEFINE_RAW_SPINLOCK(tmp_mask_lock);
+ static struct cpumask tmp_mask;
+
if (!chip || !chip->irq_set_affinity)
return -EINVAL;
+ raw_spin_lock(&tmp_mask_lock);
/*
* If this is a managed interrupt and housekeeping is enabled on
* it check whether the requested affinity mask intersects with
@@ -248,25 +247,35 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
* online.
*/
if (irqd_affinity_is_managed(data) &&
- housekeeping_enabled(HK_FLAG_MANAGED_IRQ)) {
- const struct cpumask *hk_mask, *prog_mask;
+ housekeeping_enabled(HK_TYPE_MANAGED_IRQ)) {
+ const struct cpumask *hk_mask;
- static DEFINE_RAW_SPINLOCK(tmp_mask_lock);
- static struct cpumask tmp_mask;
+ hk_mask = housekeeping_cpumask(HK_TYPE_MANAGED_IRQ);
- hk_mask = housekeeping_cpumask(HK_FLAG_MANAGED_IRQ);
-
- raw_spin_lock(&tmp_mask_lock);
cpumask_and(&tmp_mask, mask, hk_mask);
if (!cpumask_intersects(&tmp_mask, cpu_online_mask))
prog_mask = mask;
else
prog_mask = &tmp_mask;
- ret = chip->irq_set_affinity(data, prog_mask, force);
- raw_spin_unlock(&tmp_mask_lock);
} else {
- ret = chip->irq_set_affinity(data, mask, force);
+ prog_mask = mask;
}
+
+ /*
+ * Make sure we only provide online CPUs to the irqchip,
+ * unless we are being asked to force the affinity (in which
+ * case we do as we are told).
+ */
+ cpumask_and(&tmp_mask, prog_mask, cpu_online_mask);
+ if (!force && !cpumask_empty(&tmp_mask))
+ ret = chip->irq_set_affinity(data, &tmp_mask, force);
+ else if (force)
+ ret = chip->irq_set_affinity(data, mask, force);
+ else
+ ret = -EINVAL;
+
+ raw_spin_unlock(&tmp_mask_lock);
+
switch (ret) {
case IRQ_SET_MASK_OK:
case IRQ_SET_MASK_OK_DONE:
@@ -315,7 +324,7 @@ static int irq_try_set_affinity(struct irq_data *data,
}
static bool irq_set_affinity_deactivated(struct irq_data *data,
- const struct cpumask *mask, bool force)
+ const struct cpumask *mask)
{
struct irq_desc *desc = irq_data_to_desc(data);
@@ -333,7 +342,7 @@ static bool irq_set_affinity_deactivated(struct irq_data *data,
return false;
cpumask_copy(desc->irq_common_data.affinity, mask);
- irq_init_effective_affinity(data, mask);
+ irq_data_update_effective_affinity(data, mask);
irqd_set(data, IRQD_AFFINITY_SET);
return true;
}
@@ -348,7 +357,7 @@ int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask,
if (!chip || !chip->irq_set_affinity)
return -EINVAL;
- if (irq_set_affinity_deactivated(data, mask, force))
+ if (irq_set_affinity_deactivated(data, mask))
return 0;
if (irq_can_move_pcntxt(data) && !irqd_is_setaffinity_pending(data)) {
@@ -487,7 +496,8 @@ int irq_force_affinity(unsigned int irq, const struct cpumask *cpumask)
}
EXPORT_SYMBOL_GPL(irq_force_affinity);
-int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m)
+int __irq_apply_affinity_hint(unsigned int irq, const struct cpumask *m,
+ bool setaffinity)
{
unsigned long flags;
struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
@@ -496,12 +506,11 @@ int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m)
return -EINVAL;
desc->affinity_hint = m;
irq_put_desc_unlock(desc, flags);
- /* set the initial affinity to prevent every interrupt being on CPU0 */
- if (m)
+ if (m && setaffinity)
__irq_set_affinity(irq, m, false);
return 0;
}
-EXPORT_SYMBOL_GPL(irq_set_affinity_hint);
+EXPORT_SYMBOL_GPL(__irq_apply_affinity_hint);
static void irq_affinity_notify(struct work_struct *work)
{
@@ -717,10 +726,13 @@ EXPORT_SYMBOL(disable_irq_nosync);
* to complete before returning. If you use this function while
* holding a resource the IRQ handler may need you will deadlock.
*
- * This function may be called - with care - from IRQ context.
+ * Can only be called from preemptible code as it might sleep when
+ * an interrupt thread is associated to @irq.
+ *
*/
void disable_irq(unsigned int irq)
{
+ might_sleep();
if (!__disable_irq_nosync(irq))
synchronize_irq(irq);
}
@@ -1250,6 +1262,31 @@ static void irq_wake_secondary(struct irq_desc *desc, struct irqaction *action)
}
/*
+ * Internal function to notify that a interrupt thread is ready.
+ */
+static void irq_thread_set_ready(struct irq_desc *desc,
+ struct irqaction *action)
+{
+ set_bit(IRQTF_READY, &action->thread_flags);
+ wake_up(&desc->wait_for_threads);
+}
+
+/*
+ * Internal function to wake up a interrupt thread and wait until it is
+ * ready.
+ */
+static void wake_up_and_wait_for_irq_thread_ready(struct irq_desc *desc,
+ struct irqaction *action)
+{
+ if (!action || !action->thread)
+ return;
+
+ wake_up_process(action->thread);
+ wait_event(desc->wait_for_threads,
+ test_bit(IRQTF_READY, &action->thread_flags));
+}
+
+/*
* Interrupt handler thread
*/
static int irq_thread(void *data)
@@ -1260,8 +1297,12 @@ static int irq_thread(void *data)
irqreturn_t (*handler_fn)(struct irq_desc *desc,
struct irqaction *action);
- if (force_irqthreads && test_bit(IRQTF_FORCED_THREAD,
- &action->thread_flags))
+ irq_thread_set_ready(desc, action);
+
+ sched_set_fifo(current);
+
+ if (force_irqthreads() && test_bit(IRQTF_FORCED_THREAD,
+ &action->thread_flags))
handler_fn = irq_forced_thread_fn;
else
handler_fn = irq_thread_fn;
@@ -1322,7 +1363,7 @@ EXPORT_SYMBOL_GPL(irq_wake_thread);
static int irq_setup_forced_threading(struct irqaction *new)
{
- if (!force_irqthreads)
+ if (!force_irqthreads())
return 0;
if (new->flags & (IRQF_NO_THREAD | IRQF_PERCPU | IRQF_ONESHOT))
return 0;
@@ -1425,8 +1466,6 @@ setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary)
if (IS_ERR(t))
return PTR_ERR(t);
- sched_set_fifo(t);
-
/*
* We keep the reference to the task struct even if
* the thread dies to avoid that the interrupt code
@@ -1684,8 +1723,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
}
if (!shared) {
- init_waitqueue_head(&desc->wait_for_threads);
-
/* Setup the type (level, edge polarity) if configured: */
if (new->flags & IRQF_TRIGGER_MASK) {
ret = __irq_set_trigger(desc,
@@ -1781,14 +1818,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
irq_setup_timings(desc, new);
- /*
- * Strictly no need to wake it up, but hung_task complains
- * when no hard interrupt wakes the thread up.
- */
- if (new->thread)
- wake_up_process(new->thread);
- if (new->secondary)
- wake_up_process(new->secondary->thread);
+ wake_up_and_wait_for_irq_thread_ready(desc, new);
+ wake_up_and_wait_for_irq_thread_ready(desc, new->secondary);
register_irq_proc(irq, desc);
new->dir = NULL;
@@ -2072,9 +2103,9 @@ const void *free_nmi(unsigned int irq, void *dev_id)
* request_threaded_irq - allocate an interrupt line
* @irq: Interrupt line to allocate
* @handler: Function to be called when the IRQ occurs.
- * Primary handler for threaded interrupts
- * If NULL and thread_fn != NULL the default
- * primary handler is installed
+ * Primary handler for threaded interrupts.
+ * If handler is NULL and thread_fn != NULL
+ * the default primary handler is installed.
* @thread_fn: Function called from the irq handler thread
* If NULL, no irq thread is created
* @irqflags: Interrupt type flags
@@ -2108,7 +2139,7 @@ const void *free_nmi(unsigned int irq, void *dev_id)
*
* IRQF_SHARED Interrupt is shared
* IRQF_TRIGGER_* Specify active edge(s) or level
- *
+ * IRQF_ONESHOT Run thread_fn with interrupt line masked
*/
int request_threaded_irq(unsigned int irq, irq_handler_t handler,
irq_handler_t thread_fn, unsigned long irqflags,
@@ -2828,7 +2859,7 @@ EXPORT_SYMBOL_GPL(irq_get_irqchip_state);
* This call sets the internal irqchip state of an interrupt,
* depending on the value of @which.
*
- * This function should be called with preemption disabled if the
+ * This function should be called with migration disabled if the
* interrupt controller has per-cpu registers.
*/
int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
diff --git a/kernel/irq/matrix.c b/kernel/irq/matrix.c
index 578596e41cb6..1698e77645ac 100644
--- a/kernel/irq/matrix.c
+++ b/kernel/irq/matrix.c
@@ -280,12 +280,13 @@ void irq_matrix_remove_managed(struct irq_matrix *m, const struct cpumask *msk)
/**
* irq_matrix_alloc_managed - Allocate a managed interrupt in a CPU map
* @m: Matrix pointer
- * @cpu: On which CPU the interrupt should be allocated
+ * @msk: Which CPUs to search in
+ * @mapped_cpu: Pointer to store the CPU for which the irq was allocated
*/
int irq_matrix_alloc_managed(struct irq_matrix *m, const struct cpumask *msk,
unsigned int *mapped_cpu)
{
- unsigned int bit, cpu, end = m->alloc_end;
+ unsigned int bit, cpu, end;
struct cpumap *cm;
if (cpumask_empty(msk))
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index c41965e348b5..7a97bcb086bf 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -14,46 +14,243 @@
#include <linux/irqdomain.h>
#include <linux/msi.h>
#include <linux/slab.h>
+#include <linux/sysfs.h>
+#include <linux/pci.h>
#include "internals.h"
/**
- * alloc_msi_entry - Allocate an initialize msi_entry
+ * struct msi_ctrl - MSI internal management control structure
+ * @domid: ID of the domain on which management operations should be done
+ * @first: First (hardware) slot index to operate on
+ * @last: Last (hardware) slot index to operate on
+ * @nirqs: The number of Linux interrupts to allocate. Can be larger
+ * than the range due to PCI/multi-MSI.
+ */
+struct msi_ctrl {
+ unsigned int domid;
+ unsigned int first;
+ unsigned int last;
+ unsigned int nirqs;
+};
+
+/* Invalid Xarray index which is outside of any searchable range */
+#define MSI_XA_MAX_INDEX (ULONG_MAX - 1)
+/* The maximum domain size */
+#define MSI_XA_DOMAIN_SIZE (MSI_MAX_INDEX + 1)
+
+static void msi_domain_free_locked(struct device *dev, struct msi_ctrl *ctrl);
+static unsigned int msi_domain_get_hwsize(struct device *dev, unsigned int domid);
+static inline int msi_sysfs_create_group(struct device *dev);
+
+
+/**
+ * msi_alloc_desc - Allocate an initialized msi_desc
* @dev: Pointer to the device for which this is allocated
* @nvec: The number of vectors used in this entry
* @affinity: Optional pointer to an affinity mask array size of @nvec
*
- * If @affinity is not NULL then an affinity array[@nvec] is allocated
+ * If @affinity is not %NULL then an affinity array[@nvec] is allocated
* and the affinity masks and flags from @affinity are copied.
+ *
+ * Return: pointer to allocated &msi_desc on success or %NULL on failure
*/
-struct msi_desc *alloc_msi_entry(struct device *dev, int nvec,
- const struct irq_affinity_desc *affinity)
+static struct msi_desc *msi_alloc_desc(struct device *dev, int nvec,
+ const struct irq_affinity_desc *affinity)
{
- struct msi_desc *desc;
+ struct msi_desc *desc = kzalloc(sizeof(*desc), GFP_KERNEL);
- desc = kzalloc(sizeof(*desc), GFP_KERNEL);
if (!desc)
return NULL;
- INIT_LIST_HEAD(&desc->list);
desc->dev = dev;
desc->nvec_used = nvec;
if (affinity) {
- desc->affinity = kmemdup(affinity,
- nvec * sizeof(*desc->affinity), GFP_KERNEL);
+ desc->affinity = kmemdup(affinity, nvec * sizeof(*desc->affinity), GFP_KERNEL);
if (!desc->affinity) {
kfree(desc);
return NULL;
}
}
-
return desc;
}
-void free_msi_entry(struct msi_desc *entry)
+static void msi_free_desc(struct msi_desc *desc)
+{
+ kfree(desc->affinity);
+ kfree(desc);
+}
+
+static int msi_insert_desc(struct device *dev, struct msi_desc *desc,
+ unsigned int domid, unsigned int index)
+{
+ struct msi_device_data *md = dev->msi.data;
+ struct xarray *xa = &md->__domains[domid].store;
+ unsigned int hwsize;
+ int ret;
+
+ hwsize = msi_domain_get_hwsize(dev, domid);
+
+ if (index == MSI_ANY_INDEX) {
+ struct xa_limit limit = { .min = 0, .max = hwsize - 1 };
+ unsigned int index;
+
+ /* Let the xarray allocate a free index within the limit */
+ ret = xa_alloc(xa, &index, desc, limit, GFP_KERNEL);
+ if (ret)
+ goto fail;
+
+ desc->msi_index = index;
+ return 0;
+ } else {
+ if (index >= hwsize) {
+ ret = -ERANGE;
+ goto fail;
+ }
+
+ desc->msi_index = index;
+ ret = xa_insert(xa, index, desc, GFP_KERNEL);
+ if (ret)
+ goto fail;
+ return 0;
+ }
+fail:
+ msi_free_desc(desc);
+ return ret;
+}
+
+/**
+ * msi_domain_insert_msi_desc - Allocate and initialize a MSI descriptor and
+ * insert it at @init_desc->msi_index
+ *
+ * @dev: Pointer to the device for which the descriptor is allocated
+ * @domid: The id of the interrupt domain to which the desriptor is added
+ * @init_desc: Pointer to an MSI descriptor to initialize the new descriptor
+ *
+ * Return: 0 on success or an appropriate failure code.
+ */
+int msi_domain_insert_msi_desc(struct device *dev, unsigned int domid,
+ struct msi_desc *init_desc)
+{
+ struct msi_desc *desc;
+
+ lockdep_assert_held(&dev->msi.data->mutex);
+
+ desc = msi_alloc_desc(dev, init_desc->nvec_used, init_desc->affinity);
+ if (!desc)
+ return -ENOMEM;
+
+ /* Copy type specific data to the new descriptor. */
+ desc->pci = init_desc->pci;
+
+ return msi_insert_desc(dev, desc, domid, init_desc->msi_index);
+}
+
+static bool msi_desc_match(struct msi_desc *desc, enum msi_desc_filter filter)
{
- kfree(entry->affinity);
- kfree(entry);
+ switch (filter) {
+ case MSI_DESC_ALL:
+ return true;
+ case MSI_DESC_NOTASSOCIATED:
+ return !desc->irq;
+ case MSI_DESC_ASSOCIATED:
+ return !!desc->irq;
+ }
+ WARN_ON_ONCE(1);
+ return false;
+}
+
+static bool msi_ctrl_valid(struct device *dev, struct msi_ctrl *ctrl)
+{
+ unsigned int hwsize;
+
+ if (WARN_ON_ONCE(ctrl->domid >= MSI_MAX_DEVICE_IRQDOMAINS ||
+ (dev->msi.domain &&
+ !dev->msi.data->__domains[ctrl->domid].domain)))
+ return false;
+
+ hwsize = msi_domain_get_hwsize(dev, ctrl->domid);
+ if (WARN_ON_ONCE(ctrl->first > ctrl->last ||
+ ctrl->first >= hwsize ||
+ ctrl->last >= hwsize))
+ return false;
+ return true;
+}
+
+static void msi_domain_free_descs(struct device *dev, struct msi_ctrl *ctrl)
+{
+ struct msi_desc *desc;
+ struct xarray *xa;
+ unsigned long idx;
+
+ lockdep_assert_held(&dev->msi.data->mutex);
+
+ if (!msi_ctrl_valid(dev, ctrl))
+ return;
+
+ xa = &dev->msi.data->__domains[ctrl->domid].store;
+ xa_for_each_range(xa, idx, desc, ctrl->first, ctrl->last) {
+ xa_erase(xa, idx);
+
+ /* Leak the descriptor when it is still referenced */
+ if (WARN_ON_ONCE(msi_desc_match(desc, MSI_DESC_ASSOCIATED)))
+ continue;
+ msi_free_desc(desc);
+ }
+}
+
+/**
+ * msi_domain_free_msi_descs_range - Free a range of MSI descriptors of a device in an irqdomain
+ * @dev: Device for which to free the descriptors
+ * @domid: Id of the domain to operate on
+ * @first: Index to start freeing from (inclusive)
+ * @last: Last index to be freed (inclusive)
+ */
+void msi_domain_free_msi_descs_range(struct device *dev, unsigned int domid,
+ unsigned int first, unsigned int last)
+{
+ struct msi_ctrl ctrl = {
+ .domid = domid,
+ .first = first,
+ .last = last,
+ };
+
+ msi_domain_free_descs(dev, &ctrl);
+}
+
+/**
+ * msi_domain_add_simple_msi_descs - Allocate and initialize MSI descriptors
+ * @dev: Pointer to the device for which the descriptors are allocated
+ * @ctrl: Allocation control struct
+ *
+ * Return: 0 on success or an appropriate failure code.
+ */
+static int msi_domain_add_simple_msi_descs(struct device *dev, struct msi_ctrl *ctrl)
+{
+ struct msi_desc *desc;
+ unsigned int idx;
+ int ret;
+
+ lockdep_assert_held(&dev->msi.data->mutex);
+
+ if (!msi_ctrl_valid(dev, ctrl))
+ return -EINVAL;
+
+ for (idx = ctrl->first; idx <= ctrl->last; idx++) {
+ desc = msi_alloc_desc(dev, 1, NULL);
+ if (!desc)
+ goto fail_mem;
+ ret = msi_insert_desc(dev, desc, ctrl->domid, idx);
+ if (ret)
+ goto fail;
+ }
+ return 0;
+
+fail_mem:
+ ret = -ENOMEM;
+fail:
+ msi_domain_free_descs(dev, ctrl);
+ return ret;
}
void __get_cached_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
@@ -69,7 +266,354 @@ void get_cached_msi_msg(unsigned int irq, struct msi_msg *msg)
}
EXPORT_SYMBOL_GPL(get_cached_msi_msg);
-#ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN
+static void msi_device_data_release(struct device *dev, void *res)
+{
+ struct msi_device_data *md = res;
+ int i;
+
+ for (i = 0; i < MSI_MAX_DEVICE_IRQDOMAINS; i++) {
+ msi_remove_device_irq_domain(dev, i);
+ WARN_ON_ONCE(!xa_empty(&md->__domains[i].store));
+ xa_destroy(&md->__domains[i].store);
+ }
+ dev->msi.data = NULL;
+}
+
+/**
+ * msi_setup_device_data - Setup MSI device data
+ * @dev: Device for which MSI device data should be set up
+ *
+ * Return: 0 on success, appropriate error code otherwise
+ *
+ * This can be called more than once for @dev. If the MSI device data is
+ * already allocated the call succeeds. The allocated memory is
+ * automatically released when the device is destroyed.
+ */
+int msi_setup_device_data(struct device *dev)
+{
+ struct msi_device_data *md;
+ int ret, i;
+
+ if (dev->msi.data)
+ return 0;
+
+ md = devres_alloc(msi_device_data_release, sizeof(*md), GFP_KERNEL);
+ if (!md)
+ return -ENOMEM;
+
+ ret = msi_sysfs_create_group(dev);
+ if (ret) {
+ devres_free(md);
+ return ret;
+ }
+
+ for (i = 0; i < MSI_MAX_DEVICE_IRQDOMAINS; i++)
+ xa_init_flags(&md->__domains[i].store, XA_FLAGS_ALLOC);
+
+ /*
+ * If @dev::msi::domain is set and is a global MSI domain, copy the
+ * pointer into the domain array so all code can operate on domain
+ * ids. The NULL pointer check is required to keep the legacy
+ * architecture specific PCI/MSI support working.
+ */
+ if (dev->msi.domain && !irq_domain_is_msi_parent(dev->msi.domain))
+ md->__domains[MSI_DEFAULT_DOMAIN].domain = dev->msi.domain;
+
+ mutex_init(&md->mutex);
+ dev->msi.data = md;
+ devres_add(dev, md);
+ return 0;
+}
+
+/**
+ * msi_lock_descs - Lock the MSI descriptor storage of a device
+ * @dev: Device to operate on
+ */
+void msi_lock_descs(struct device *dev)
+{
+ mutex_lock(&dev->msi.data->mutex);
+}
+EXPORT_SYMBOL_GPL(msi_lock_descs);
+
+/**
+ * msi_unlock_descs - Unlock the MSI descriptor storage of a device
+ * @dev: Device to operate on
+ */
+void msi_unlock_descs(struct device *dev)
+{
+ /* Invalidate the index which was cached by the iterator */
+ dev->msi.data->__iter_idx = MSI_XA_MAX_INDEX;
+ mutex_unlock(&dev->msi.data->mutex);
+}
+EXPORT_SYMBOL_GPL(msi_unlock_descs);
+
+static struct msi_desc *msi_find_desc(struct msi_device_data *md, unsigned int domid,
+ enum msi_desc_filter filter)
+{
+ struct xarray *xa = &md->__domains[domid].store;
+ struct msi_desc *desc;
+
+ xa_for_each_start(xa, md->__iter_idx, desc, md->__iter_idx) {
+ if (msi_desc_match(desc, filter))
+ return desc;
+ }
+ md->__iter_idx = MSI_XA_MAX_INDEX;
+ return NULL;
+}
+
+/**
+ * msi_domain_first_desc - Get the first MSI descriptor of an irqdomain associated to a device
+ * @dev: Device to operate on
+ * @domid: The id of the interrupt domain which should be walked.
+ * @filter: Descriptor state filter
+ *
+ * Must be called with the MSI descriptor mutex held, i.e. msi_lock_descs()
+ * must be invoked before the call.
+ *
+ * Return: Pointer to the first MSI descriptor matching the search
+ * criteria, NULL if none found.
+ */
+struct msi_desc *msi_domain_first_desc(struct device *dev, unsigned int domid,
+ enum msi_desc_filter filter)
+{
+ struct msi_device_data *md = dev->msi.data;
+
+ if (WARN_ON_ONCE(!md || domid >= MSI_MAX_DEVICE_IRQDOMAINS))
+ return NULL;
+
+ lockdep_assert_held(&md->mutex);
+
+ md->__iter_idx = 0;
+ return msi_find_desc(md, domid, filter);
+}
+EXPORT_SYMBOL_GPL(msi_domain_first_desc);
+
+/**
+ * msi_next_desc - Get the next MSI descriptor of a device
+ * @dev: Device to operate on
+ * @domid: The id of the interrupt domain which should be walked.
+ * @filter: Descriptor state filter
+ *
+ * The first invocation of msi_next_desc() has to be preceeded by a
+ * successful invocation of __msi_first_desc(). Consecutive invocations are
+ * only valid if the previous one was successful. All these operations have
+ * to be done within the same MSI mutex held region.
+ *
+ * Return: Pointer to the next MSI descriptor matching the search
+ * criteria, NULL if none found.
+ */
+struct msi_desc *msi_next_desc(struct device *dev, unsigned int domid,
+ enum msi_desc_filter filter)
+{
+ struct msi_device_data *md = dev->msi.data;
+
+ if (WARN_ON_ONCE(!md || domid >= MSI_MAX_DEVICE_IRQDOMAINS))
+ return NULL;
+
+ lockdep_assert_held(&md->mutex);
+
+ if (md->__iter_idx >= (unsigned long)MSI_MAX_INDEX)
+ return NULL;
+
+ md->__iter_idx++;
+ return msi_find_desc(md, domid, filter);
+}
+EXPORT_SYMBOL_GPL(msi_next_desc);
+
+/**
+ * msi_domain_get_virq - Lookup the Linux interrupt number for a MSI index on a interrupt domain
+ * @dev: Device to operate on
+ * @domid: Domain ID of the interrupt domain associated to the device
+ * @index: MSI interrupt index to look for (0-based)
+ *
+ * Return: The Linux interrupt number on success (> 0), 0 if not found
+ */
+unsigned int msi_domain_get_virq(struct device *dev, unsigned int domid, unsigned int index)
+{
+ struct msi_desc *desc;
+ unsigned int ret = 0;
+ bool pcimsi = false;
+ struct xarray *xa;
+
+ if (!dev->msi.data)
+ return 0;
+
+ if (WARN_ON_ONCE(index > MSI_MAX_INDEX || domid >= MSI_MAX_DEVICE_IRQDOMAINS))
+ return 0;
+
+ /* This check is only valid for the PCI default MSI domain */
+ if (dev_is_pci(dev) && domid == MSI_DEFAULT_DOMAIN)
+ pcimsi = to_pci_dev(dev)->msi_enabled;
+
+ msi_lock_descs(dev);
+ xa = &dev->msi.data->__domains[domid].store;
+ desc = xa_load(xa, pcimsi ? 0 : index);
+ if (desc && desc->irq) {
+ /*
+ * PCI-MSI has only one descriptor for multiple interrupts.
+ * PCI-MSIX and platform MSI use a descriptor per
+ * interrupt.
+ */
+ if (pcimsi) {
+ if (index < desc->nvec_used)
+ ret = desc->irq + index;
+ } else {
+ ret = desc->irq;
+ }
+ }
+
+ msi_unlock_descs(dev);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(msi_domain_get_virq);
+
+#ifdef CONFIG_SYSFS
+static struct attribute *msi_dev_attrs[] = {
+ NULL
+};
+
+static const struct attribute_group msi_irqs_group = {
+ .name = "msi_irqs",
+ .attrs = msi_dev_attrs,
+};
+
+static inline int msi_sysfs_create_group(struct device *dev)
+{
+ return devm_device_add_group(dev, &msi_irqs_group);
+}
+
+static ssize_t msi_mode_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ /* MSI vs. MSIX is per device not per interrupt */
+ bool is_msix = dev_is_pci(dev) ? to_pci_dev(dev)->msix_enabled : false;
+
+ return sysfs_emit(buf, "%s\n", is_msix ? "msix" : "msi");
+}
+
+static void msi_sysfs_remove_desc(struct device *dev, struct msi_desc *desc)
+{
+ struct device_attribute *attrs = desc->sysfs_attrs;
+ int i;
+
+ if (!attrs)
+ return;
+
+ desc->sysfs_attrs = NULL;
+ for (i = 0; i < desc->nvec_used; i++) {
+ if (attrs[i].show)
+ sysfs_remove_file_from_group(&dev->kobj, &attrs[i].attr, msi_irqs_group.name);
+ kfree(attrs[i].attr.name);
+ }
+ kfree(attrs);
+}
+
+static int msi_sysfs_populate_desc(struct device *dev, struct msi_desc *desc)
+{
+ struct device_attribute *attrs;
+ int ret, i;
+
+ attrs = kcalloc(desc->nvec_used, sizeof(*attrs), GFP_KERNEL);
+ if (!attrs)
+ return -ENOMEM;
+
+ desc->sysfs_attrs = attrs;
+ for (i = 0; i < desc->nvec_used; i++) {
+ sysfs_attr_init(&attrs[i].attr);
+ attrs[i].attr.name = kasprintf(GFP_KERNEL, "%d", desc->irq + i);
+ if (!attrs[i].attr.name) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+
+ attrs[i].attr.mode = 0444;
+ attrs[i].show = msi_mode_show;
+
+ ret = sysfs_add_file_to_group(&dev->kobj, &attrs[i].attr, msi_irqs_group.name);
+ if (ret) {
+ attrs[i].show = NULL;
+ goto fail;
+ }
+ }
+ return 0;
+
+fail:
+ msi_sysfs_remove_desc(dev, desc);
+ return ret;
+}
+
+#ifdef CONFIG_PCI_MSI_ARCH_FALLBACKS
+/**
+ * msi_device_populate_sysfs - Populate msi_irqs sysfs entries for a device
+ * @dev: The device (PCI, platform etc) which will get sysfs entries
+ */
+int msi_device_populate_sysfs(struct device *dev)
+{
+ struct msi_desc *desc;
+ int ret;
+
+ msi_for_each_desc(desc, dev, MSI_DESC_ASSOCIATED) {
+ if (desc->sysfs_attrs)
+ continue;
+ ret = msi_sysfs_populate_desc(dev, desc);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+
+/**
+ * msi_device_destroy_sysfs - Destroy msi_irqs sysfs entries for a device
+ * @dev: The device (PCI, platform etc) for which to remove
+ * sysfs entries
+ */
+void msi_device_destroy_sysfs(struct device *dev)
+{
+ struct msi_desc *desc;
+
+ msi_for_each_desc(desc, dev, MSI_DESC_ALL)
+ msi_sysfs_remove_desc(dev, desc);
+}
+#endif /* CONFIG_PCI_MSI_ARCH_FALLBACK */
+#else /* CONFIG_SYSFS */
+static inline int msi_sysfs_create_group(struct device *dev) { return 0; }
+static inline int msi_sysfs_populate_desc(struct device *dev, struct msi_desc *desc) { return 0; }
+static inline void msi_sysfs_remove_desc(struct device *dev, struct msi_desc *desc) { }
+#endif /* !CONFIG_SYSFS */
+
+static struct irq_domain *msi_get_device_domain(struct device *dev, unsigned int domid)
+{
+ struct irq_domain *domain;
+
+ lockdep_assert_held(&dev->msi.data->mutex);
+
+ if (WARN_ON_ONCE(domid >= MSI_MAX_DEVICE_IRQDOMAINS))
+ return NULL;
+
+ domain = dev->msi.data->__domains[domid].domain;
+ if (!domain)
+ return NULL;
+
+ if (WARN_ON_ONCE(irq_domain_is_msi_parent(domain)))
+ return NULL;
+
+ return domain;
+}
+
+static unsigned int msi_domain_get_hwsize(struct device *dev, unsigned int domid)
+{
+ struct msi_domain_info *info;
+ struct irq_domain *domain;
+
+ domain = msi_get_device_domain(dev, domid);
+ if (domain) {
+ info = domain->host_data;
+ return info->hwsize;
+ }
+ /* No domain, default to MSI_XA_DOMAIN_SIZE */
+ return MSI_XA_DOMAIN_SIZE;
+}
+
static inline void irq_chip_write_msi_msg(struct irq_data *data,
struct msi_msg *msg)
{
@@ -97,6 +641,8 @@ static void msi_check_level(struct irq_domain *domain, struct msi_msg *msg)
*
* Intended to be used by MSI interrupt controllers which are
* implemented with hierarchical domains.
+ *
+ * Return: IRQ_SET_MASK_* result code
*/
int msi_domain_set_affinity(struct irq_data *irq_data,
const struct cpumask *mask, bool force)
@@ -221,21 +767,11 @@ static int msi_domain_ops_init(struct irq_domain *domain,
return 0;
}
-static int msi_domain_ops_check(struct irq_domain *domain,
- struct msi_domain_info *info,
- struct device *dev)
-{
- return 0;
-}
-
static struct msi_domain_ops msi_domain_ops_default = {
.get_hwirq = msi_domain_ops_get_hwirq,
.msi_init = msi_domain_ops_init,
- .msi_check = msi_domain_ops_check,
.msi_prepare = msi_domain_ops_prepare,
.set_desc = msi_domain_ops_set_desc,
- .domain_alloc_irqs = __msi_domain_alloc_irqs,
- .domain_free_irqs = __msi_domain_free_irqs,
};
static void msi_domain_update_dom_ops(struct msi_domain_info *info)
@@ -247,11 +783,6 @@ static void msi_domain_update_dom_ops(struct msi_domain_info *info)
return;
}
- if (ops->domain_alloc_irqs == NULL)
- ops->domain_alloc_irqs = msi_domain_ops_default.domain_alloc_irqs;
- if (ops->domain_free_irqs == NULL)
- ops->domain_free_irqs = msi_domain_ops_default.domain_free_irqs;
-
if (!(info->flags & MSI_FLAG_USE_DEF_DOM_OPS))
return;
@@ -259,8 +790,6 @@ static void msi_domain_update_dom_ops(struct msi_domain_info *info)
ops->get_hwirq = msi_domain_ops_default.get_hwirq;
if (ops->msi_init == NULL)
ops->msi_init = msi_domain_ops_default.msi_init;
- if (ops->msi_check == NULL)
- ops->msi_check = msi_domain_ops_default.msi_check;
if (ops->msi_prepare == NULL)
ops->msi_prepare = msi_domain_ops_default.msi_prepare;
if (ops->set_desc == NULL)
@@ -276,29 +805,257 @@ static void msi_domain_update_chip_ops(struct msi_domain_info *info)
chip->irq_set_affinity = msi_domain_set_affinity;
}
+static struct irq_domain *__msi_create_irq_domain(struct fwnode_handle *fwnode,
+ struct msi_domain_info *info,
+ unsigned int flags,
+ struct irq_domain *parent)
+{
+ struct irq_domain *domain;
+
+ if (info->hwsize > MSI_XA_DOMAIN_SIZE)
+ return NULL;
+
+ /*
+ * Hardware size 0 is valid for backwards compatibility and for
+ * domains which are not backed by a hardware table. Grant the
+ * maximum index space.
+ */
+ if (!info->hwsize)
+ info->hwsize = MSI_XA_DOMAIN_SIZE;
+
+ msi_domain_update_dom_ops(info);
+ if (info->flags & MSI_FLAG_USE_DEF_CHIP_OPS)
+ msi_domain_update_chip_ops(info);
+
+ domain = irq_domain_create_hierarchy(parent, flags | IRQ_DOMAIN_FLAG_MSI, 0,
+ fwnode, &msi_domain_ops, info);
+
+ if (domain)
+ irq_domain_update_bus_token(domain, info->bus_token);
+
+ return domain;
+}
+
/**
- * msi_create_irq_domain - Create a MSI interrupt domain
+ * msi_create_irq_domain - Create an MSI interrupt domain
* @fwnode: Optional fwnode of the interrupt controller
* @info: MSI domain info
* @parent: Parent irq domain
+ *
+ * Return: pointer to the created &struct irq_domain or %NULL on failure
*/
struct irq_domain *msi_create_irq_domain(struct fwnode_handle *fwnode,
struct msi_domain_info *info,
struct irq_domain *parent)
{
+ return __msi_create_irq_domain(fwnode, info, 0, parent);
+}
+
+/**
+ * msi_parent_init_dev_msi_info - Delegate initialization of device MSI info down
+ * in the domain hierarchy
+ * @dev: The device for which the domain should be created
+ * @domain: The domain in the hierarchy this op is being called on
+ * @msi_parent_domain: The IRQ_DOMAIN_FLAG_MSI_PARENT domain for the child to
+ * be created
+ * @msi_child_info: The MSI domain info of the IRQ_DOMAIN_FLAG_MSI_DEVICE
+ * domain to be created
+ *
+ * Return: true on success, false otherwise
+ *
+ * This is the most complex problem of per device MSI domains and the
+ * underlying interrupt domain hierarchy:
+ *
+ * The device domain to be initialized requests the broadest feature set
+ * possible and the underlying domain hierarchy puts restrictions on it.
+ *
+ * That's trivial for a simple parent->child relationship, but it gets
+ * interesting with an intermediate domain: root->parent->child. The
+ * intermediate 'parent' can expand the capabilities which the 'root'
+ * domain is providing. So that creates a classic hen and egg problem:
+ * Which entity is doing the restrictions/expansions?
+ *
+ * One solution is to let the root domain handle the initialization that's
+ * why there is the @domain and the @msi_parent_domain pointer.
+ */
+bool msi_parent_init_dev_msi_info(struct device *dev, struct irq_domain *domain,
+ struct irq_domain *msi_parent_domain,
+ struct msi_domain_info *msi_child_info)
+{
+ struct irq_domain *parent = domain->parent;
+
+ if (WARN_ON_ONCE(!parent || !parent->msi_parent_ops ||
+ !parent->msi_parent_ops->init_dev_msi_info))
+ return false;
+
+ return parent->msi_parent_ops->init_dev_msi_info(dev, parent, msi_parent_domain,
+ msi_child_info);
+}
+
+/**
+ * msi_create_device_irq_domain - Create a device MSI interrupt domain
+ * @dev: Pointer to the device
+ * @domid: Domain id
+ * @template: MSI domain info bundle used as template
+ * @hwsize: Maximum number of MSI table entries (0 if unknown or unlimited)
+ * @domain_data: Optional pointer to domain specific data which is set in
+ * msi_domain_info::data
+ * @chip_data: Optional pointer to chip specific data which is set in
+ * msi_domain_info::chip_data
+ *
+ * Return: True on success, false otherwise
+ *
+ * There is no firmware node required for this interface because the per
+ * device domains are software constructs which are actually closer to the
+ * hardware reality than any firmware can describe them.
+ *
+ * The domain name and the irq chip name for a MSI device domain are
+ * composed by: "$(PREFIX)$(CHIPNAME)-$(DEVNAME)"
+ *
+ * $PREFIX: Optional prefix provided by the underlying MSI parent domain
+ * via msi_parent_ops::prefix. If that pointer is NULL the prefix
+ * is empty.
+ * $CHIPNAME: The name of the irq_chip in @template
+ * $DEVNAME: The name of the device
+ *
+ * This results in understandable chip names and hardware interrupt numbers
+ * in e.g. /proc/interrupts
+ *
+ * PCI-MSI-0000:00:1c.0 0-edge Parent domain has no prefix
+ * IR-PCI-MSI-0000:00:1c.4 0-edge Same with interrupt remapping prefix 'IR-'
+ *
+ * IR-PCI-MSIX-0000:3d:00.0 0-edge Hardware interrupt numbers reflect
+ * IR-PCI-MSIX-0000:3d:00.0 1-edge the real MSI-X index on that device
+ * IR-PCI-MSIX-0000:3d:00.0 2-edge
+ *
+ * On IMS domains the hardware interrupt number is either a table entry
+ * index or a purely software managed index but it is guaranteed to be
+ * unique.
+ *
+ * The domain pointer is stored in @dev::msi::data::__irqdomains[]. All
+ * subsequent operations on the domain depend on the domain id.
+ *
+ * The domain is automatically freed when the device is removed via devres
+ * in the context of @dev::msi::data freeing, but it can also be
+ * independently removed via @msi_remove_device_irq_domain().
+ */
+bool msi_create_device_irq_domain(struct device *dev, unsigned int domid,
+ const struct msi_domain_template *template,
+ unsigned int hwsize, void *domain_data,
+ void *chip_data)
+{
+ struct irq_domain *domain, *parent = dev->msi.domain;
+ const struct msi_parent_ops *pops;
+ struct msi_domain_template *bundle;
+ struct fwnode_handle *fwnode;
+
+ if (!irq_domain_is_msi_parent(parent))
+ return false;
+
+ if (domid >= MSI_MAX_DEVICE_IRQDOMAINS)
+ return false;
+
+ bundle = kmemdup(template, sizeof(*bundle), GFP_KERNEL);
+ if (!bundle)
+ return false;
+
+ bundle->info.hwsize = hwsize;
+ bundle->info.chip = &bundle->chip;
+ bundle->info.ops = &bundle->ops;
+ bundle->info.data = domain_data;
+ bundle->info.chip_data = chip_data;
+
+ pops = parent->msi_parent_ops;
+ snprintf(bundle->name, sizeof(bundle->name), "%s%s-%s",
+ pops->prefix ? : "", bundle->chip.name, dev_name(dev));
+ bundle->chip.name = bundle->name;
+
+ fwnode = irq_domain_alloc_named_fwnode(bundle->name);
+ if (!fwnode)
+ goto free_bundle;
+
+ if (msi_setup_device_data(dev))
+ goto free_fwnode;
+
+ msi_lock_descs(dev);
+
+ if (WARN_ON_ONCE(msi_get_device_domain(dev, domid)))
+ goto fail;
+
+ if (!pops->init_dev_msi_info(dev, parent, parent, &bundle->info))
+ goto fail;
+
+ domain = __msi_create_irq_domain(fwnode, &bundle->info, IRQ_DOMAIN_FLAG_MSI_DEVICE, parent);
+ if (!domain)
+ goto fail;
+
+ domain->dev = dev;
+ dev->msi.data->__domains[domid].domain = domain;
+ msi_unlock_descs(dev);
+ return true;
+
+fail:
+ msi_unlock_descs(dev);
+free_fwnode:
+ irq_domain_free_fwnode(fwnode);
+free_bundle:
+ kfree(bundle);
+ return false;
+}
+
+/**
+ * msi_remove_device_irq_domain - Free a device MSI interrupt domain
+ * @dev: Pointer to the device
+ * @domid: Domain id
+ */
+void msi_remove_device_irq_domain(struct device *dev, unsigned int domid)
+{
+ struct fwnode_handle *fwnode = NULL;
+ struct msi_domain_info *info;
struct irq_domain *domain;
- msi_domain_update_dom_ops(info);
- if (info->flags & MSI_FLAG_USE_DEF_CHIP_OPS)
- msi_domain_update_chip_ops(info);
+ msi_lock_descs(dev);
- domain = irq_domain_create_hierarchy(parent, IRQ_DOMAIN_FLAG_MSI, 0,
- fwnode, &msi_domain_ops, info);
+ domain = msi_get_device_domain(dev, domid);
- if (domain && !domain->name && info->chip)
- domain->name = info->chip->name;
+ if (!domain || !irq_domain_is_msi_device(domain))
+ goto unlock;
- return domain;
+ dev->msi.data->__domains[domid].domain = NULL;
+ info = domain->host_data;
+ if (irq_domain_is_msi_device(domain))
+ fwnode = domain->fwnode;
+ irq_domain_remove(domain);
+ irq_domain_free_fwnode(fwnode);
+ kfree(container_of(info, struct msi_domain_template, info));
+
+unlock:
+ msi_unlock_descs(dev);
+}
+
+/**
+ * msi_match_device_irq_domain - Match a device irq domain against a bus token
+ * @dev: Pointer to the device
+ * @domid: Domain id
+ * @bus_token: Bus token to match against the domain bus token
+ *
+ * Return: True if device domain exists and bus tokens match.
+ */
+bool msi_match_device_irq_domain(struct device *dev, unsigned int domid,
+ enum irq_domain_bus_token bus_token)
+{
+ struct msi_domain_info *info;
+ struct irq_domain *domain;
+ bool ret = false;
+
+ msi_lock_descs(dev);
+ domain = msi_get_device_domain(dev, domid);
+ if (domain && irq_domain_is_msi_device(domain)) {
+ info = domain->host_data;
+ ret = info->bus_token == bus_token;
+ }
+ msi_unlock_descs(dev);
+ return ret;
}
int msi_domain_prepare_irqs(struct irq_domain *domain, struct device *dev,
@@ -306,56 +1063,81 @@ int msi_domain_prepare_irqs(struct irq_domain *domain, struct device *dev,
{
struct msi_domain_info *info = domain->host_data;
struct msi_domain_ops *ops = info->ops;
- int ret;
-
- ret = ops->msi_check(domain, info, dev);
- if (ret == 0)
- ret = ops->msi_prepare(domain, dev, nvec, arg);
- return ret;
+ return ops->msi_prepare(domain, dev, nvec, arg);
}
int msi_domain_populate_irqs(struct irq_domain *domain, struct device *dev,
- int virq, int nvec, msi_alloc_info_t *arg)
+ int virq_base, int nvec, msi_alloc_info_t *arg)
{
struct msi_domain_info *info = domain->host_data;
struct msi_domain_ops *ops = info->ops;
+ struct msi_ctrl ctrl = {
+ .domid = MSI_DEFAULT_DOMAIN,
+ .first = virq_base,
+ .last = virq_base + nvec - 1,
+ };
struct msi_desc *desc;
- int ret = 0;
+ struct xarray *xa;
+ int ret, virq;
- for_each_msi_entry(desc, dev) {
- /* Don't even try the multi-MSI brain damage. */
- if (WARN_ON(!desc->irq || desc->nvec_used != 1)) {
- ret = -EINVAL;
- break;
- }
+ msi_lock_descs(dev);
- if (!(desc->irq >= virq && desc->irq < (virq + nvec)))
- continue;
+ if (!msi_ctrl_valid(dev, &ctrl)) {
+ ret = -EINVAL;
+ goto unlock;
+ }
+
+ ret = msi_domain_add_simple_msi_descs(dev, &ctrl);
+ if (ret)
+ goto unlock;
+
+ xa = &dev->msi.data->__domains[ctrl.domid].store;
+
+ for (virq = virq_base; virq < virq_base + nvec; virq++) {
+ desc = xa_load(xa, virq);
+ desc->irq = virq;
ops->set_desc(arg, desc);
- /* Assumes the domain mutex is held! */
- ret = irq_domain_alloc_irqs_hierarchy(domain, desc->irq, 1,
- arg);
+ ret = irq_domain_alloc_irqs_hierarchy(domain, virq, 1, arg);
if (ret)
- break;
+ goto fail;
- irq_set_msi_desc_off(desc->irq, 0, desc);
+ irq_set_msi_desc(virq, desc);
}
+ msi_unlock_descs(dev);
+ return 0;
- if (ret) {
- /* Mop up the damage */
- for_each_msi_entry(desc, dev) {
- if (!(desc->irq >= virq && desc->irq < (virq + nvec)))
- continue;
-
- irq_domain_free_irqs_common(domain, desc->irq, 1);
- }
+fail:
+ for (--virq; virq >= virq_base; virq--) {
+ msi_domain_depopulate_descs(dev, virq, 1);
+ irq_domain_free_irqs_common(domain, virq, 1);
}
-
+ msi_domain_free_descs(dev, &ctrl);
+unlock:
+ msi_unlock_descs(dev);
return ret;
}
+void msi_domain_depopulate_descs(struct device *dev, int virq_base, int nvec)
+{
+ struct msi_ctrl ctrl = {
+ .domid = MSI_DEFAULT_DOMAIN,
+ .first = virq_base,
+ .last = virq_base + nvec - 1,
+ };
+ struct msi_desc *desc;
+ struct xarray *xa;
+ unsigned long idx;
+
+ if (!msi_ctrl_valid(dev, &ctrl))
+ return;
+
+ xa = &dev->msi.data->__domains[ctrl.domid].store;
+ xa_for_each_range(xa, idx, desc, ctrl.first, ctrl.last)
+ desc->irq = 0;
+}
+
/*
* Carefully check whether the device can use reservation mode. If
* reservation mode is enabled then the early activation will assign a
@@ -375,6 +1157,8 @@ static bool msi_check_reservation_mode(struct irq_domain *domain,
switch(domain->bus_token) {
case DOMAIN_BUS_PCI_MSI:
+ case DOMAIN_BUS_PCI_DEVICE_MSI:
+ case DOMAIN_BUS_PCI_DEVICE_MSIX:
case DOMAIN_BUS_VMD_MSI:
break;
default:
@@ -389,161 +1173,505 @@ static bool msi_check_reservation_mode(struct irq_domain *domain,
/*
* Checking the first MSI descriptor is sufficient. MSIX supports
- * masking and MSI does so when the maskbit is set.
+ * masking and MSI does so when the can_mask attribute is set.
*/
- desc = first_msi_entry(dev);
- return desc->msi_attrib.is_msix || desc->msi_attrib.maskbit;
+ desc = msi_first_desc(dev, MSI_DESC_ALL);
+ return desc->pci.msi_attrib.is_msix || desc->pci.msi_attrib.can_mask;
}
-int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
- int nvec)
+static int msi_handle_pci_fail(struct irq_domain *domain, struct msi_desc *desc,
+ int allocated)
{
+ switch(domain->bus_token) {
+ case DOMAIN_BUS_PCI_MSI:
+ case DOMAIN_BUS_PCI_DEVICE_MSI:
+ case DOMAIN_BUS_PCI_DEVICE_MSIX:
+ case DOMAIN_BUS_VMD_MSI:
+ if (IS_ENABLED(CONFIG_PCI_MSI))
+ break;
+ fallthrough;
+ default:
+ return -ENOSPC;
+ }
+
+ /* Let a failed PCI multi MSI allocation retry */
+ if (desc->nvec_used > 1)
+ return 1;
+
+ /* If there was a successful allocation let the caller know */
+ return allocated ? allocated : -ENOSPC;
+}
+
+#define VIRQ_CAN_RESERVE 0x01
+#define VIRQ_ACTIVATE 0x02
+#define VIRQ_NOMASK_QUIRK 0x04
+
+static int msi_init_virq(struct irq_domain *domain, int virq, unsigned int vflags)
+{
+ struct irq_data *irqd = irq_domain_get_irq_data(domain, virq);
+ int ret;
+
+ if (!(vflags & VIRQ_CAN_RESERVE)) {
+ irqd_clr_can_reserve(irqd);
+ if (vflags & VIRQ_NOMASK_QUIRK)
+ irqd_set_msi_nomask_quirk(irqd);
+
+ /*
+ * If the interrupt is managed but no CPU is available to
+ * service it, shut it down until better times. Note that
+ * we only do this on the !RESERVE path as x86 (the only
+ * architecture using this flag) deals with this in a
+ * different way by using a catch-all vector.
+ */
+ if ((vflags & VIRQ_ACTIVATE) &&
+ irqd_affinity_is_managed(irqd) &&
+ !cpumask_intersects(irq_data_get_affinity_mask(irqd),
+ cpu_online_mask)) {
+ irqd_set_managed_shutdown(irqd);
+ return 0;
+ }
+ }
+
+ if (!(vflags & VIRQ_ACTIVATE))
+ return 0;
+
+ ret = irq_domain_activate_irq(irqd, vflags & VIRQ_CAN_RESERVE);
+ if (ret)
+ return ret;
+ /*
+ * If the interrupt uses reservation mode, clear the activated bit
+ * so request_irq() will assign the final vector.
+ */
+ if (vflags & VIRQ_CAN_RESERVE)
+ irqd_clr_activated(irqd);
+ return 0;
+}
+
+static int __msi_domain_alloc_irqs(struct device *dev, struct irq_domain *domain,
+ struct msi_ctrl *ctrl)
+{
+ struct xarray *xa = &dev->msi.data->__domains[ctrl->domid].store;
struct msi_domain_info *info = domain->host_data;
struct msi_domain_ops *ops = info->ops;
- struct irq_data *irq_data;
- struct msi_desc *desc;
+ unsigned int vflags = 0, allocated = 0;
msi_alloc_info_t arg = { };
+ struct msi_desc *desc;
+ unsigned long idx;
int i, ret, virq;
- bool can_reserve;
- ret = msi_domain_prepare_irqs(domain, dev, nvec, &arg);
+ ret = msi_domain_prepare_irqs(domain, dev, ctrl->nirqs, &arg);
if (ret)
return ret;
- for_each_msi_entry(desc, dev) {
+ /*
+ * This flag is set by the PCI layer as we need to activate
+ * the MSI entries before the PCI layer enables MSI in the
+ * card. Otherwise the card latches a random msi message.
+ */
+ if (info->flags & MSI_FLAG_ACTIVATE_EARLY)
+ vflags |= VIRQ_ACTIVATE;
+
+ /*
+ * Interrupt can use a reserved vector and will not occupy
+ * a real device vector until the interrupt is requested.
+ */
+ if (msi_check_reservation_mode(domain, info, dev)) {
+ vflags |= VIRQ_CAN_RESERVE;
+ /*
+ * MSI affinity setting requires a special quirk (X86) when
+ * reservation mode is active.
+ */
+ if (info->flags & MSI_FLAG_NOMASK_QUIRK)
+ vflags |= VIRQ_NOMASK_QUIRK;
+ }
+
+ xa_for_each_range(xa, idx, desc, ctrl->first, ctrl->last) {
+ if (!msi_desc_match(desc, MSI_DESC_NOTASSOCIATED))
+ continue;
+
+ /* This should return -ECONFUSED... */
+ if (WARN_ON_ONCE(allocated >= ctrl->nirqs))
+ return -EINVAL;
+
+ if (ops->prepare_desc)
+ ops->prepare_desc(domain, &arg, desc);
+
ops->set_desc(&arg, desc);
virq = __irq_domain_alloc_irqs(domain, -1, desc->nvec_used,
dev_to_node(dev), &arg, false,
desc->affinity);
- if (virq < 0) {
- ret = -ENOSPC;
- if (ops->handle_error)
- ret = ops->handle_error(domain, desc, ret);
- if (ops->msi_finish)
- ops->msi_finish(&arg, ret);
- return ret;
- }
+ if (virq < 0)
+ return msi_handle_pci_fail(domain, desc, allocated);
for (i = 0; i < desc->nvec_used; i++) {
irq_set_msi_desc_off(virq, i, desc);
irq_debugfs_copy_devname(virq + i, dev);
+ ret = msi_init_virq(domain, virq + i, vflags);
+ if (ret)
+ return ret;
}
+ if (info->flags & MSI_FLAG_DEV_SYSFS) {
+ ret = msi_sysfs_populate_desc(dev, desc);
+ if (ret)
+ return ret;
+ }
+ allocated++;
}
+ return 0;
+}
- if (ops->msi_finish)
- ops->msi_finish(&arg, 0);
+static int msi_domain_alloc_simple_msi_descs(struct device *dev,
+ struct msi_domain_info *info,
+ struct msi_ctrl *ctrl)
+{
+ if (!(info->flags & MSI_FLAG_ALLOC_SIMPLE_MSI_DESCS))
+ return 0;
- can_reserve = msi_check_reservation_mode(domain, info, dev);
+ return msi_domain_add_simple_msi_descs(dev, ctrl);
+}
- /*
- * This flag is set by the PCI layer as we need to activate
- * the MSI entries before the PCI layer enables MSI in the
- * card. Otherwise the card latches a random msi message.
- */
- if (!(info->flags & MSI_FLAG_ACTIVATE_EARLY))
- goto skip_activate;
-
- for_each_msi_vector(desc, i, dev) {
- if (desc->irq == i) {
- virq = desc->irq;
- dev_dbg(dev, "irq [%d-%d] for MSI\n",
- virq, virq + desc->nvec_used - 1);
- }
+static int __msi_domain_alloc_locked(struct device *dev, struct msi_ctrl *ctrl)
+{
+ struct msi_domain_info *info;
+ struct msi_domain_ops *ops;
+ struct irq_domain *domain;
+ int ret;
- irq_data = irq_domain_get_irq_data(domain, i);
- if (!can_reserve) {
- irqd_clr_can_reserve(irq_data);
- if (domain->flags & IRQ_DOMAIN_MSI_NOMASK_QUIRK)
- irqd_set_msi_nomask_quirk(irq_data);
- }
- ret = irq_domain_activate_irq(irq_data, can_reserve);
- if (ret)
- goto cleanup;
- }
+ if (!msi_ctrl_valid(dev, ctrl))
+ return -EINVAL;
-skip_activate:
- /*
- * If these interrupts use reservation mode, clear the activated bit
- * so request_irq() will assign the final vector.
- */
- if (can_reserve) {
- for_each_msi_vector(desc, i, dev) {
- irq_data = irq_domain_get_irq_data(domain, i);
- irqd_clr_activated(irq_data);
- }
- }
- return 0;
+ domain = msi_get_device_domain(dev, ctrl->domid);
+ if (!domain)
+ return -ENODEV;
-cleanup:
- for_each_msi_vector(desc, i, dev) {
- irq_data = irq_domain_get_irq_data(domain, i);
- if (irqd_is_activated(irq_data))
- irq_domain_deactivate_irq(irq_data);
- }
- msi_domain_free_irqs(domain, dev);
+ info = domain->host_data;
+
+ ret = msi_domain_alloc_simple_msi_descs(dev, info, ctrl);
+ if (ret)
+ return ret;
+
+ ops = info->ops;
+ if (ops->domain_alloc_irqs)
+ return ops->domain_alloc_irqs(domain, dev, ctrl->nirqs);
+
+ return __msi_domain_alloc_irqs(dev, domain, ctrl);
+}
+
+static int msi_domain_alloc_locked(struct device *dev, struct msi_ctrl *ctrl)
+{
+ int ret = __msi_domain_alloc_locked(dev, ctrl);
+
+ if (ret)
+ msi_domain_free_locked(dev, ctrl);
return ret;
}
/**
- * msi_domain_alloc_irqs - Allocate interrupts from a MSI interrupt domain
- * @domain: The domain to allocate from
+ * msi_domain_alloc_irqs_range_locked - Allocate interrupts from a MSI interrupt domain
* @dev: Pointer to device struct of the device for which the interrupts
* are allocated
- * @nvec: The number of interrupts to allocate
+ * @domid: Id of the interrupt domain to operate on
+ * @first: First index to allocate (inclusive)
+ * @last: Last index to allocate (inclusive)
+ *
+ * Must be invoked from within a msi_lock_descs() / msi_unlock_descs()
+ * pair. Use this for MSI irqdomains which implement their own descriptor
+ * allocation/free.
*
- * Returns 0 on success or an error code.
+ * Return: %0 on success or an error code.
*/
-int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
- int nvec)
+int msi_domain_alloc_irqs_range_locked(struct device *dev, unsigned int domid,
+ unsigned int first, unsigned int last)
{
- struct msi_domain_info *info = domain->host_data;
- struct msi_domain_ops *ops = info->ops;
+ struct msi_ctrl ctrl = {
+ .domid = domid,
+ .first = first,
+ .last = last,
+ .nirqs = last + 1 - first,
+ };
+
+ return msi_domain_alloc_locked(dev, &ctrl);
+}
- return ops->domain_alloc_irqs(domain, dev, nvec);
+/**
+ * msi_domain_alloc_irqs_range - Allocate interrupts from a MSI interrupt domain
+ * @dev: Pointer to device struct of the device for which the interrupts
+ * are allocated
+ * @domid: Id of the interrupt domain to operate on
+ * @first: First index to allocate (inclusive)
+ * @last: Last index to allocate (inclusive)
+ *
+ * Return: %0 on success or an error code.
+ */
+int msi_domain_alloc_irqs_range(struct device *dev, unsigned int domid,
+ unsigned int first, unsigned int last)
+{
+ int ret;
+
+ msi_lock_descs(dev);
+ ret = msi_domain_alloc_irqs_range_locked(dev, domid, first, last);
+ msi_unlock_descs(dev);
+ return ret;
}
-void __msi_domain_free_irqs(struct irq_domain *domain, struct device *dev)
+/**
+ * msi_domain_alloc_irqs_all_locked - Allocate all interrupts from a MSI interrupt domain
+ *
+ * @dev: Pointer to device struct of the device for which the interrupts
+ * are allocated
+ * @domid: Id of the interrupt domain to operate on
+ * @nirqs: The number of interrupts to allocate
+ *
+ * This function scans all MSI descriptors of the MSI domain and allocates interrupts
+ * for all unassigned ones. That function is to be used for MSI domain usage where
+ * the descriptor allocation is handled at the call site, e.g. PCI/MSI[X].
+ *
+ * Return: %0 on success or an error code.
+ */
+int msi_domain_alloc_irqs_all_locked(struct device *dev, unsigned int domid, int nirqs)
+{
+ struct msi_ctrl ctrl = {
+ .domid = domid,
+ .first = 0,
+ .last = msi_domain_get_hwsize(dev, domid) - 1,
+ .nirqs = nirqs,
+ };
+
+ return msi_domain_alloc_locked(dev, &ctrl);
+}
+
+/**
+ * msi_domain_alloc_irq_at - Allocate an interrupt from a MSI interrupt domain at
+ * a given index - or at the next free index
+ *
+ * @dev: Pointer to device struct of the device for which the interrupts
+ * are allocated
+ * @domid: Id of the interrupt domain to operate on
+ * @index: Index for allocation. If @index == %MSI_ANY_INDEX the allocation
+ * uses the next free index.
+ * @affdesc: Optional pointer to an interrupt affinity descriptor structure
+ * @icookie: Optional pointer to a domain specific per instance cookie. If
+ * non-NULL the content of the cookie is stored in msi_desc::data.
+ * Must be NULL for MSI-X allocations
+ *
+ * This requires a MSI interrupt domain which lets the core code manage the
+ * MSI descriptors.
+ *
+ * Return: struct msi_map
+ *
+ * On success msi_map::index contains the allocated index number and
+ * msi_map::virq the corresponding Linux interrupt number
+ *
+ * On failure msi_map::index contains the error code and msi_map::virq
+ * is %0.
+ */
+struct msi_map msi_domain_alloc_irq_at(struct device *dev, unsigned int domid, unsigned int index,
+ const struct irq_affinity_desc *affdesc,
+ union msi_instance_cookie *icookie)
{
+ struct msi_ctrl ctrl = { .domid = domid, .nirqs = 1, };
+ struct irq_domain *domain;
+ struct msi_map map = { };
struct msi_desc *desc;
+ int ret;
- for_each_msi_entry(desc, dev) {
- /*
- * We might have failed to allocate an MSI early
- * enough that there is no IRQ associated to this
- * entry. If that's the case, don't do anything.
- */
- if (desc->irq) {
- irq_domain_free_irqs(desc->irq, desc->nvec_used);
- desc->irq = 0;
+ msi_lock_descs(dev);
+ domain = msi_get_device_domain(dev, domid);
+ if (!domain) {
+ map.index = -ENODEV;
+ goto unlock;
+ }
+
+ desc = msi_alloc_desc(dev, 1, affdesc);
+ if (!desc) {
+ map.index = -ENOMEM;
+ goto unlock;
+ }
+
+ if (icookie)
+ desc->data.icookie = *icookie;
+
+ ret = msi_insert_desc(dev, desc, domid, index);
+ if (ret) {
+ map.index = ret;
+ goto unlock;
+ }
+
+ ctrl.first = ctrl.last = desc->msi_index;
+
+ ret = __msi_domain_alloc_irqs(dev, domain, &ctrl);
+ if (ret) {
+ map.index = ret;
+ msi_domain_free_locked(dev, &ctrl);
+ } else {
+ map.index = desc->msi_index;
+ map.virq = desc->irq;
+ }
+unlock:
+ msi_unlock_descs(dev);
+ return map;
+}
+
+static void __msi_domain_free_irqs(struct device *dev, struct irq_domain *domain,
+ struct msi_ctrl *ctrl)
+{
+ struct xarray *xa = &dev->msi.data->__domains[ctrl->domid].store;
+ struct msi_domain_info *info = domain->host_data;
+ struct irq_data *irqd;
+ struct msi_desc *desc;
+ unsigned long idx;
+ int i;
+
+ xa_for_each_range(xa, idx, desc, ctrl->first, ctrl->last) {
+ /* Only handle MSI entries which have an interrupt associated */
+ if (!msi_desc_match(desc, MSI_DESC_ASSOCIATED))
+ continue;
+
+ /* Make sure all interrupts are deactivated */
+ for (i = 0; i < desc->nvec_used; i++) {
+ irqd = irq_domain_get_irq_data(domain, desc->irq + i);
+ if (irqd && irqd_is_activated(irqd))
+ irq_domain_deactivate_irq(irqd);
}
+
+ irq_domain_free_irqs(desc->irq, desc->nvec_used);
+ if (info->flags & MSI_FLAG_DEV_SYSFS)
+ msi_sysfs_remove_desc(dev, desc);
+ desc->irq = 0;
}
}
+static void msi_domain_free_locked(struct device *dev, struct msi_ctrl *ctrl)
+{
+ struct msi_domain_info *info;
+ struct msi_domain_ops *ops;
+ struct irq_domain *domain;
+
+ if (!msi_ctrl_valid(dev, ctrl))
+ return;
+
+ domain = msi_get_device_domain(dev, ctrl->domid);
+ if (!domain)
+ return;
+
+ info = domain->host_data;
+ ops = info->ops;
+
+ if (ops->domain_free_irqs)
+ ops->domain_free_irqs(domain, dev);
+ else
+ __msi_domain_free_irqs(dev, domain, ctrl);
+
+ if (ops->msi_post_free)
+ ops->msi_post_free(domain, dev);
+
+ if (info->flags & MSI_FLAG_FREE_MSI_DESCS)
+ msi_domain_free_descs(dev, ctrl);
+}
+
/**
- * __msi_domain_free_irqs - Free interrupts from a MSI interrupt @domain associated tp @dev
- * @domain: The domain to managing the interrupts
+ * msi_domain_free_irqs_range_locked - Free a range of interrupts from a MSI interrupt domain
+ * associated to @dev with msi_lock held
* @dev: Pointer to device struct of the device for which the interrupts
- * are free
+ * are freed
+ * @domid: Id of the interrupt domain to operate on
+ * @first: First index to free (inclusive)
+ * @last: Last index to free (inclusive)
*/
-void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev)
+void msi_domain_free_irqs_range_locked(struct device *dev, unsigned int domid,
+ unsigned int first, unsigned int last)
{
- struct msi_domain_info *info = domain->host_data;
- struct msi_domain_ops *ops = info->ops;
+ struct msi_ctrl ctrl = {
+ .domid = domid,
+ .first = first,
+ .last = last,
+ };
+ msi_domain_free_locked(dev, &ctrl);
+}
+
+/**
+ * msi_domain_free_irqs_range - Free a range of interrupts from a MSI interrupt domain
+ * associated to @dev
+ * @dev: Pointer to device struct of the device for which the interrupts
+ * are freed
+ * @domid: Id of the interrupt domain to operate on
+ * @first: First index to free (inclusive)
+ * @last: Last index to free (inclusive)
+ */
+void msi_domain_free_irqs_range(struct device *dev, unsigned int domid,
+ unsigned int first, unsigned int last)
+{
+ msi_lock_descs(dev);
+ msi_domain_free_irqs_range_locked(dev, domid, first, last);
+ msi_unlock_descs(dev);
+}
+
+/**
+ * msi_domain_free_irqs_all_locked - Free all interrupts from a MSI interrupt domain
+ * associated to a device
+ * @dev: Pointer to device struct of the device for which the interrupts
+ * are freed
+ * @domid: The id of the domain to operate on
+ *
+ * Must be invoked from within a msi_lock_descs() / msi_unlock_descs()
+ * pair. Use this for MSI irqdomains which implement their own vector
+ * allocation.
+ */
+void msi_domain_free_irqs_all_locked(struct device *dev, unsigned int domid)
+{
+ msi_domain_free_irqs_range_locked(dev, domid, 0,
+ msi_domain_get_hwsize(dev, domid) - 1);
+}
- return ops->domain_free_irqs(domain, dev);
+/**
+ * msi_domain_free_irqs_all - Free all interrupts from a MSI interrupt domain
+ * associated to a device
+ * @dev: Pointer to device struct of the device for which the interrupts
+ * are freed
+ * @domid: The id of the domain to operate on
+ */
+void msi_domain_free_irqs_all(struct device *dev, unsigned int domid)
+{
+ msi_lock_descs(dev);
+ msi_domain_free_irqs_all_locked(dev, domid);
+ msi_unlock_descs(dev);
}
/**
* msi_get_domain_info - Get the MSI interrupt domain info for @domain
* @domain: The interrupt domain to retrieve data from
*
- * Returns the pointer to the msi_domain_info stored in
- * @domain->host_data.
+ * Return: the pointer to the msi_domain_info stored in @domain->host_data.
*/
struct msi_domain_info *msi_get_domain_info(struct irq_domain *domain)
{
return (struct msi_domain_info *)domain->host_data;
}
-#endif /* CONFIG_GENERIC_MSI_IRQ_DOMAIN */
+/**
+ * msi_device_has_isolated_msi - True if the device has isolated MSI
+ * @dev: The device to check
+ *
+ * Isolated MSI means that HW modeled by an irq_domain on the path from the
+ * initiating device to the CPU will validate that the MSI message specifies an
+ * interrupt number that the device is authorized to trigger. This must block
+ * devices from triggering interrupts they are not authorized to trigger.
+ * Currently authorization means the MSI vector is one assigned to the device.
+ *
+ * This is interesting for securing VFIO use cases where a rouge MSI (eg created
+ * by abusing a normal PCI MemWr DMA) must not allow the VFIO userspace to
+ * impact outside its security domain, eg userspace triggering interrupts on
+ * kernel drivers, a VM triggering interrupts on the hypervisor, or a VM
+ * triggering interrupts on another VM.
+ */
+bool msi_device_has_isolated_msi(struct device *dev)
+{
+ struct irq_domain *domain = dev_get_msi_domain(dev);
+
+ for (; domain; domain = domain->parent)
+ if (domain->flags & IRQ_DOMAIN_FLAG_ISOLATED_MSI)
+ return true;
+ return arch_is_isolated_msi();
+}
+EXPORT_SYMBOL_GPL(msi_device_has_isolated_msi);
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index ce0adb22ee96..c556bc49d213 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -147,7 +147,6 @@ void suspend_device_irqs(void)
synchronize_irq(irq);
}
}
-EXPORT_SYMBOL_GPL(suspend_device_irqs);
static void resume_irq(struct irq_desc *desc)
{
@@ -227,7 +226,7 @@ unlock:
}
/**
- * irq_pm_syscore_ops - enable interrupt lines early
+ * irq_pm_syscore_resume - enable interrupt lines early
*
* Enable all interrupt lines with %IRQF_EARLY_RESUME set.
*/
@@ -259,4 +258,3 @@ void resume_device_irqs(void)
{
resume_irqs(false);
}
-EXPORT_SYMBOL_GPL(resume_device_irqs);
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 7c5cd42df3b9..623b8136e9af 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -137,7 +137,7 @@ static inline int irq_select_affinity_usr(unsigned int irq)
static ssize_t write_irq_affinity(int type, struct file *file,
const char __user *buffer, size_t count, loff_t *pos)
{
- unsigned int irq = (int)(long)PDE_DATA(file_inode(file));
+ unsigned int irq = (int)(long)pde_data(file_inode(file));
cpumask_var_t new_value;
int err;
@@ -190,12 +190,12 @@ static ssize_t irq_affinity_list_proc_write(struct file *file,
static int irq_affinity_proc_open(struct inode *inode, struct file *file)
{
- return single_open(file, irq_affinity_proc_show, PDE_DATA(inode));
+ return single_open(file, irq_affinity_proc_show, pde_data(inode));
}
static int irq_affinity_list_proc_open(struct inode *inode, struct file *file)
{
- return single_open(file, irq_affinity_list_proc_show, PDE_DATA(inode));
+ return single_open(file, irq_affinity_list_proc_show, pde_data(inode));
}
static const struct proc_ops irq_affinity_proc_ops = {
@@ -265,7 +265,7 @@ out:
static int default_affinity_open(struct inode *inode, struct file *file)
{
- return single_open(file, default_affinity_show, PDE_DATA(inode));
+ return single_open(file, default_affinity_show, pde_data(inode));
}
static const struct proc_ops default_affinity_proc_ops = {
@@ -513,7 +513,7 @@ int show_interrupts(struct seq_file *p, void *v)
seq_printf(p, " %8s", "None");
}
if (desc->irq_data.domain)
- seq_printf(p, " %*d", prec, (int) desc->irq_data.hwirq);
+ seq_printf(p, " %*lu", prec, desc->irq_data.hwirq);
else
seq_printf(p, " %*s", prec, "");
#ifdef CONFIG_GENERIC_IRQ_SHOW_LEVEL
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index c481d8458325..02b2daf07441 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -447,6 +447,10 @@ MODULE_PARM_DESC(noirqdebug, "Disable irq lockup detection when true");
static int __init irqfixup_setup(char *str)
{
+ if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
+ pr_warn("irqfixup boot option not supported with PREEMPT_RT\n");
+ return 1;
+ }
irqfixup = 1;
printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n");
printk(KERN_WARNING "This may impact system performance.\n");
@@ -459,6 +463,10 @@ module_param(irqfixup, int, 0644);
static int __init irqpoll_setup(char *str)
{
+ if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
+ pr_warn("irqpoll boot option not supported with PREEMPT_RT\n");
+ return 1;
+ }
irqfixup = 2;
printk(KERN_WARNING "Misrouted IRQ fixup and polling support "
"enabled\n");
diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c
index d309d6fbf5bd..c43e2ac2f8de 100644
--- a/kernel/irq/timings.c
+++ b/kernel/irq/timings.c
@@ -453,6 +453,11 @@ static __always_inline void __irq_timings_store(int irq, struct irqt_stat *irqs,
*/
index = irq_timings_interval_index(interval);
+ if (index > PREDICTION_BUFFER_SIZE - 1) {
+ irqs->count = 0;
+ return;
+ }
+
/*
* Store the index as an element of the pattern in another
* circular array.
@@ -794,12 +799,14 @@ static int __init irq_timings_test_irqs(struct timings_intervals *ti)
__irq_timings_store(irq, irqs, ti->intervals[i]);
if (irqs->circ_timings[i & IRQ_TIMINGS_MASK] != index) {
+ ret = -EBADSLT;
pr_err("Failed to store in the circular buffer\n");
goto out;
}
}
if (irqs->count != ti->count) {
+ ret = -ERANGE;
pr_err("Count differs\n");
goto out;
}
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index db8c248ebc8c..2f4fb336dda1 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -18,11 +18,38 @@
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/smp.h>
+#include <linux/smpboot.h>
#include <asm/processor.h>
#include <linux/kasan.h>
+#include <trace/events/ipi.h>
+
static DEFINE_PER_CPU(struct llist_head, raised_list);
static DEFINE_PER_CPU(struct llist_head, lazy_list);
+static DEFINE_PER_CPU(struct task_struct *, irq_workd);
+
+static void wake_irq_workd(void)
+{
+ struct task_struct *tsk = __this_cpu_read(irq_workd);
+
+ if (!llist_empty(this_cpu_ptr(&lazy_list)) && tsk)
+ wake_up_process(tsk);
+}
+
+#ifdef CONFIG_SMP
+static void irq_work_wake(struct irq_work *entry)
+{
+ wake_irq_workd();
+}
+
+static DEFINE_PER_CPU(struct irq_work, irq_work_wakeup) =
+ IRQ_WORK_INIT_HARD(irq_work_wake);
+#endif
+
+static int irq_workd_should_run(unsigned int cpu)
+{
+ return !llist_empty(this_cpu_ptr(&lazy_list));
+}
/*
* Claim the entry so that no one else will poke at it.
@@ -49,18 +76,40 @@ void __weak arch_irq_work_raise(void)
*/
}
+static __always_inline void irq_work_raise(struct irq_work *work)
+{
+ if (trace_ipi_send_cpu_enabled() && arch_irq_work_has_interrupt())
+ trace_ipi_send_cpu(smp_processor_id(), _RET_IP_, work->func);
+
+ arch_irq_work_raise();
+}
+
/* Enqueue on current CPU, work must already be claimed and preempt disabled */
static void __irq_work_queue_local(struct irq_work *work)
{
+ struct llist_head *list;
+ bool rt_lazy_work = false;
+ bool lazy_work = false;
+ int work_flags;
+
+ work_flags = atomic_read(&work->node.a_flags);
+ if (work_flags & IRQ_WORK_LAZY)
+ lazy_work = true;
+ else if (IS_ENABLED(CONFIG_PREEMPT_RT) &&
+ !(work_flags & IRQ_WORK_HARD_IRQ))
+ rt_lazy_work = true;
+
+ if (lazy_work || rt_lazy_work)
+ list = this_cpu_ptr(&lazy_list);
+ else
+ list = this_cpu_ptr(&raised_list);
+
+ if (!llist_add(&work->node.llist, list))
+ return;
+
/* If the work is "lazy", handle it from next tick if any */
- if (atomic_read(&work->node.a_flags) & IRQ_WORK_LAZY) {
- if (llist_add(&work->node.llist, this_cpu_ptr(&lazy_list)) &&
- tick_nohz_tick_stopped())
- arch_irq_work_raise();
- } else {
- if (llist_add(&work->node.llist, this_cpu_ptr(&raised_list)))
- arch_irq_work_raise();
- }
+ if (!lazy_work || tick_nohz_tick_stopped())
+ irq_work_raise(work);
}
/* Enqueue the irq work @work on the current CPU */
@@ -98,23 +147,40 @@ bool irq_work_queue_on(struct irq_work *work, int cpu)
if (!irq_work_claim(work))
return false;
- kasan_record_aux_stack(work);
+ kasan_record_aux_stack_noalloc(work);
preempt_disable();
if (cpu != smp_processor_id()) {
/* Arch remote IPI send/receive backend aren't NMI safe */
WARN_ON_ONCE(in_nmi());
+
+ /*
+ * On PREEMPT_RT the items which are not marked as
+ * IRQ_WORK_HARD_IRQ are added to the lazy list and a HARD work
+ * item is used on the remote CPU to wake the thread.
+ */
+ if (IS_ENABLED(CONFIG_PREEMPT_RT) &&
+ !(atomic_read(&work->node.a_flags) & IRQ_WORK_HARD_IRQ)) {
+
+ if (!llist_add(&work->node.llist, &per_cpu(lazy_list, cpu)))
+ goto out;
+
+ work = &per_cpu(irq_work_wakeup, cpu);
+ if (!irq_work_claim(work))
+ goto out;
+ }
+
__smp_call_single_queue(cpu, &work->node.llist);
} else {
__irq_work_queue_local(work);
}
+out:
preempt_enable();
return true;
#endif /* CONFIG_SMP */
}
-
bool irq_work_needs_cpu(void)
{
struct llist_head *raised, *lazy;
@@ -160,6 +226,10 @@ void irq_work_single(void *arg)
* else claimed it meanwhile.
*/
(void)atomic_cmpxchg(&work->node.a_flags, flags, flags & ~IRQ_WORK_BUSY);
+
+ if ((IS_ENABLED(CONFIG_PREEMPT_RT) && !irq_work_is_hard(work)) ||
+ !arch_irq_work_has_interrupt())
+ rcuwait_wake_up(&work->irqwait);
}
static void irq_work_run_list(struct llist_head *list)
@@ -167,7 +237,12 @@ static void irq_work_run_list(struct llist_head *list)
struct irq_work *work, *tmp;
struct llist_node *llnode;
- BUG_ON(!irqs_disabled());
+ /*
+ * On PREEMPT_RT IRQ-work which is not marked as HARD will be processed
+ * in a per-CPU thread in preemptible context. Only the items which are
+ * marked as IRQ_WORK_HARD_IRQ will be processed in hardirq context.
+ */
+ BUG_ON(!irqs_disabled() && !IS_ENABLED(CONFIG_PREEMPT_RT));
if (llist_empty(list))
return;
@@ -184,7 +259,10 @@ static void irq_work_run_list(struct llist_head *list)
void irq_work_run(void)
{
irq_work_run_list(this_cpu_ptr(&raised_list));
- irq_work_run_list(this_cpu_ptr(&lazy_list));
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ irq_work_run_list(this_cpu_ptr(&lazy_list));
+ else
+ wake_irq_workd();
}
EXPORT_SYMBOL_GPL(irq_work_run);
@@ -194,7 +272,11 @@ void irq_work_tick(void)
if (!llist_empty(raised) && !arch_irq_work_has_interrupt())
irq_work_run_list(raised);
- irq_work_run_list(this_cpu_ptr(&lazy_list));
+
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ irq_work_run_list(this_cpu_ptr(&lazy_list));
+ else
+ wake_irq_workd();
}
/*
@@ -204,8 +286,42 @@ void irq_work_tick(void)
void irq_work_sync(struct irq_work *work)
{
lockdep_assert_irqs_enabled();
+ might_sleep();
+
+ if ((IS_ENABLED(CONFIG_PREEMPT_RT) && !irq_work_is_hard(work)) ||
+ !arch_irq_work_has_interrupt()) {
+ rcuwait_wait_event(&work->irqwait, !irq_work_is_busy(work),
+ TASK_UNINTERRUPTIBLE);
+ return;
+ }
while (irq_work_is_busy(work))
cpu_relax();
}
EXPORT_SYMBOL_GPL(irq_work_sync);
+
+static void run_irq_workd(unsigned int cpu)
+{
+ irq_work_run_list(this_cpu_ptr(&lazy_list));
+}
+
+static void irq_workd_setup(unsigned int cpu)
+{
+ sched_set_fifo_low(current);
+}
+
+static struct smp_hotplug_thread irqwork_threads = {
+ .store = &irq_workd,
+ .setup = irq_workd_setup,
+ .thread_should_run = irq_workd_should_run,
+ .thread_fn = run_irq_workd,
+ .thread_comm = "irq_work/%u",
+};
+
+static __init int irq_work_init_threads(void)
+{
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ BUG_ON(smpboot_register_percpu_thread(&irqwork_threads));
+ return 0;
+}
+early_initcall(irq_work_init_threads);
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index b156e152d6b4..d9c822bbffb8 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -113,11 +113,40 @@ int static_key_count(struct static_key *key)
}
EXPORT_SYMBOL_GPL(static_key_count);
-void static_key_slow_inc_cpuslocked(struct static_key *key)
+/*
+ * static_key_fast_inc_not_disabled - adds a user for a static key
+ * @key: static key that must be already enabled
+ *
+ * The caller must make sure that the static key can't get disabled while
+ * in this function. It doesn't patch jump labels, only adds a user to
+ * an already enabled static key.
+ *
+ * Returns true if the increment was done. Unlike refcount_t the ref counter
+ * is not saturated, but will fail to increment on overflow.
+ */
+bool static_key_fast_inc_not_disabled(struct static_key *key)
{
- int v, v1;
+ int v;
STATIC_KEY_CHECK_USE(key);
+ /*
+ * Negative key->enabled has a special meaning: it sends
+ * static_key_slow_inc() down the slow path, and it is non-zero
+ * so it counts as "enabled" in jump_label_update(). Note that
+ * atomic_inc_unless_negative() checks >= 0, so roll our own.
+ */
+ v = atomic_read(&key->enabled);
+ do {
+ if (v <= 0 || (v + 1) < 0)
+ return false;
+ } while (!likely(atomic_try_cmpxchg(&key->enabled, &v, v + 1)));
+
+ return true;
+}
+EXPORT_SYMBOL_GPL(static_key_fast_inc_not_disabled);
+
+bool static_key_slow_inc_cpuslocked(struct static_key *key)
+{
lockdep_assert_cpus_held();
/*
@@ -126,17 +155,9 @@ void static_key_slow_inc_cpuslocked(struct static_key *key)
* jump_label_update() process. At the same time, however,
* the jump_label_update() call below wants to see
* static_key_enabled(&key) for jumps to be updated properly.
- *
- * So give a special meaning to negative key->enabled: it sends
- * static_key_slow_inc() down the slow path, and it is non-zero
- * so it counts as "enabled" in jump_label_update(). Note that
- * atomic_inc_unless_negative() checks >= 0, so roll our own.
*/
- for (v = atomic_read(&key->enabled); v > 0; v = v1) {
- v1 = atomic_cmpxchg(&key->enabled, v, v + 1);
- if (likely(v1 == v))
- return;
- }
+ if (static_key_fast_inc_not_disabled(key))
+ return true;
jump_label_lock();
if (atomic_read(&key->enabled) == 0) {
@@ -148,16 +169,23 @@ void static_key_slow_inc_cpuslocked(struct static_key *key)
*/
atomic_set_release(&key->enabled, 1);
} else {
- atomic_inc(&key->enabled);
+ if (WARN_ON_ONCE(!static_key_fast_inc_not_disabled(key))) {
+ jump_label_unlock();
+ return false;
+ }
}
jump_label_unlock();
+ return true;
}
-void static_key_slow_inc(struct static_key *key)
+bool static_key_slow_inc(struct static_key *key)
{
+ bool ret;
+
cpus_read_lock();
- static_key_slow_inc_cpuslocked(key);
+ ret = static_key_slow_inc_cpuslocked(key);
cpus_read_unlock();
+ return ret;
}
EXPORT_SYMBOL_GPL(static_key_slow_inc);
@@ -332,17 +360,13 @@ static int __jump_label_text_reserved(struct jump_entry *iter_start,
return 0;
}
-/*
- * Update code which is definitely not currently executing.
- * Architectures which need heavyweight synchronization to modify
- * running code can override this to make the non-live update case
- * cheaper.
- */
-void __weak __init_or_module arch_jump_label_transform_static(struct jump_entry *entry,
- enum jump_label_type type)
+#ifndef arch_jump_label_transform_static
+static void arch_jump_label_transform_static(struct jump_entry *entry,
+ enum jump_label_type type)
{
- arch_jump_label_transform(entry, type);
+ /* nothing to do on most architectures */
}
+#endif
static inline struct jump_entry *static_key_entries(struct static_key *key)
{
@@ -508,7 +532,7 @@ void __init jump_label_init(void)
#ifdef CONFIG_MODULES
-static enum jump_label_type jump_label_init_type(struct jump_entry *entry)
+enum jump_label_type jump_label_init_type(struct jump_entry *entry)
{
struct static_key *key = jump_entry_key(entry);
bool type = static_key_type(key);
@@ -596,31 +620,6 @@ static void __jump_label_mod_update(struct static_key *key)
}
}
-/***
- * apply_jump_label_nops - patch module jump labels with arch_get_jump_label_nop()
- * @mod: module to patch
- *
- * Allow for run-time selection of the optimal nops. Before the module
- * loads patch these with arch_get_jump_label_nop(), which is specified by
- * the arch specific jump label code.
- */
-void jump_label_apply_nops(struct module *mod)
-{
- struct jump_entry *iter_start = mod->jump_entries;
- struct jump_entry *iter_stop = iter_start + mod->num_jump_entries;
- struct jump_entry *iter;
-
- /* if the module doesn't have jump label entries, just return */
- if (iter_start == iter_stop)
- return;
-
- for (iter = iter_start; iter < iter_stop; iter++) {
- /* Only write NOPs for arch_branch_static(). */
- if (jump_label_init_type(iter) == JUMP_LABEL_NOP)
- arch_jump_label_transform_static(iter, JUMP_LABEL_NOP);
- }
-}
-
static int jump_label_add_module(struct module *mod)
{
struct jump_entry *iter_start = mod->jump_entries;
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 0ba87982d017..77747391f49b 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -29,29 +29,10 @@
#include <linux/compiler.h>
#include <linux/module.h>
#include <linux/kernel.h>
+#include <linux/bsearch.h>
+#include <linux/btf_ids.h>
-/*
- * These will be re-linked against their real values
- * during the second link stage.
- */
-extern const unsigned long kallsyms_addresses[] __weak;
-extern const int kallsyms_offsets[] __weak;
-extern const u8 kallsyms_names[] __weak;
-
-/*
- * Tell the compiler that the count isn't in the small data section if the arch
- * has one (eg: FRV).
- */
-extern const unsigned int kallsyms_num_syms
-__section(".rodata") __attribute__((weak));
-
-extern const unsigned long kallsyms_relative_base
-__section(".rodata") __attribute__((weak));
-
-extern const char kallsyms_token_table[] __weak;
-extern const u16 kallsyms_token_index[] __weak;
-
-extern const unsigned int kallsyms_markers[] __weak;
+#include "kallsyms_internal.h"
/*
* Expand a compressed symbol data into the resulting uncompressed string,
@@ -69,12 +50,20 @@ static unsigned int kallsyms_expand_symbol(unsigned int off,
data = &kallsyms_names[off];
len = *data;
data++;
+ off++;
+
+ /* If MSB is 1, it is a "big" symbol, so needs an additional byte. */
+ if ((len & 0x80) != 0) {
+ len = (len & 0x7F) | (*data << 7);
+ data++;
+ off++;
+ }
/*
* Update the offset to return the offset for the next symbol on
* the compressed stream.
*/
- off += len + 1;
+ off += len;
/*
* For every byte on the compressed symbol data, copy the table
@@ -127,7 +116,7 @@ static char kallsyms_get_symbol_type(unsigned int off)
static unsigned int get_symbol_offset(unsigned long pos)
{
const u8 *name;
- int i;
+ int i, len;
/*
* Use the closest marker we have. We have markers every 256 positions,
@@ -141,13 +130,23 @@ static unsigned int get_symbol_offset(unsigned long pos)
* so we just need to add the len to the current pointer for every
* symbol we wish to skip.
*/
- for (i = 0; i < (pos & 0xFF); i++)
- name = name + (*name) + 1;
+ for (i = 0; i < (pos & 0xFF); i++) {
+ len = *name;
+
+ /*
+ * If MSB is 1, it is a "big" symbol, so we need to look into
+ * the next byte (and skip it, too).
+ */
+ if ((len & 0x80) != 0)
+ len = ((len & 0x7F) | (name[1] << 7)) + 1;
+
+ name = name + len + 1;
+ }
return name - kallsyms_names;
}
-static unsigned long kallsyms_sym_address(int idx)
+unsigned long kallsyms_sym_address(int idx)
{
if (!IS_ENABLED(CONFIG_KALLSYMS_BASE_RELATIVE))
return kallsyms_addresses[idx];
@@ -164,53 +163,132 @@ static unsigned long kallsyms_sym_address(int idx)
return kallsyms_relative_base - 1 - kallsyms_offsets[idx];
}
-#if defined(CONFIG_CFI_CLANG) && defined(CONFIG_LTO_CLANG_THIN)
-/*
- * LLVM appends a hash to static function names when ThinLTO and CFI are
- * both enabled, i.e. foo() becomes foo$707af9a22804d33c81801f27dcfe489b.
- * This causes confusion and potentially breaks user space tools, so we
- * strip the suffix from expanded symbol names.
- */
-static inline bool cleanup_symbol_name(char *s)
+static bool cleanup_symbol_name(char *s)
{
char *res;
- res = strrchr(s, '$');
- if (res)
+ if (!IS_ENABLED(CONFIG_LTO_CLANG))
+ return false;
+
+ /*
+ * LLVM appends various suffixes for local functions and variables that
+ * must be promoted to global scope as part of LTO. This can break
+ * hooking of static functions with kprobes. '.' is not a valid
+ * character in an identifier in C. Suffixes observed:
+ * - foo.llvm.[0-9a-f]+
+ * - foo.[0-9a-f]+
+ */
+ res = strchr(s, '.');
+ if (res) {
*res = '\0';
+ return true;
+ }
- return res != NULL;
+ return false;
}
-#else
-static inline bool cleanup_symbol_name(char *s) { return false; }
-#endif
-/* Lookup the address for this symbol. Returns 0 if not found. */
-unsigned long kallsyms_lookup_name(const char *name)
+static int compare_symbol_name(const char *name, char *namebuf)
+{
+ int ret;
+
+ ret = strcmp(name, namebuf);
+ if (!ret)
+ return ret;
+
+ if (cleanup_symbol_name(namebuf) && !strcmp(name, namebuf))
+ return 0;
+
+ return ret;
+}
+
+static unsigned int get_symbol_seq(int index)
{
+ unsigned int i, seq = 0;
+
+ for (i = 0; i < 3; i++)
+ seq = (seq << 8) | kallsyms_seqs_of_names[3 * index + i];
+
+ return seq;
+}
+
+static int kallsyms_lookup_names(const char *name,
+ unsigned int *start,
+ unsigned int *end)
+{
+ int ret;
+ int low, mid, high;
+ unsigned int seq, off;
char namebuf[KSYM_NAME_LEN];
- unsigned long i;
- unsigned int off;
- for (i = 0, off = 0; i < kallsyms_num_syms; i++) {
- off = kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf));
+ low = 0;
+ high = kallsyms_num_syms - 1;
+
+ while (low <= high) {
+ mid = low + (high - low) / 2;
+ seq = get_symbol_seq(mid);
+ off = get_symbol_offset(seq);
+ kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf));
+ ret = compare_symbol_name(name, namebuf);
+ if (ret > 0)
+ low = mid + 1;
+ else if (ret < 0)
+ high = mid - 1;
+ else
+ break;
+ }
- if (strcmp(namebuf, name) == 0)
- return kallsyms_sym_address(i);
+ if (low > high)
+ return -ESRCH;
- if (cleanup_symbol_name(namebuf) && strcmp(namebuf, name) == 0)
- return kallsyms_sym_address(i);
+ low = mid;
+ while (low) {
+ seq = get_symbol_seq(low - 1);
+ off = get_symbol_offset(seq);
+ kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf));
+ if (compare_symbol_name(name, namebuf))
+ break;
+ low--;
+ }
+ *start = low;
+
+ if (end) {
+ high = mid;
+ while (high < kallsyms_num_syms - 1) {
+ seq = get_symbol_seq(high + 1);
+ off = get_symbol_offset(seq);
+ kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf));
+ if (compare_symbol_name(name, namebuf))
+ break;
+ high++;
+ }
+ *end = high;
}
+
+ return 0;
+}
+
+/* Lookup the address for this symbol. Returns 0 if not found. */
+unsigned long kallsyms_lookup_name(const char *name)
+{
+ int ret;
+ unsigned int i;
+
+ /* Skip the search for empty string. */
+ if (!*name)
+ return 0;
+
+ ret = kallsyms_lookup_names(name, &i, NULL);
+ if (!ret)
+ return kallsyms_sym_address(get_symbol_seq(i));
+
return module_kallsyms_lookup_name(name);
}
-#ifdef CONFIG_LIVEPATCH
/*
* Iterate over all symbols in vmlinux. For symbols from modules use
* module_kallsyms_on_each_symbol instead.
*/
-int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *,
- unsigned long),
+int kallsyms_on_each_symbol(int (*fn)(void *, const char *, unsigned long),
void *data)
{
char namebuf[KSYM_NAME_LEN];
@@ -220,13 +298,31 @@ int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *,
for (i = 0, off = 0; i < kallsyms_num_syms; i++) {
off = kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf));
- ret = fn(data, namebuf, NULL, kallsyms_sym_address(i));
+ ret = fn(data, namebuf, kallsyms_sym_address(i));
if (ret != 0)
return ret;
+ cond_resched();
}
return 0;
}
-#endif /* CONFIG_LIVEPATCH */
+
+int kallsyms_on_each_match_symbol(int (*fn)(void *, unsigned long),
+ const char *name, void *data)
+{
+ int ret;
+ unsigned int i, start, end;
+
+ ret = kallsyms_lookup_names(name, &start, &end);
+ if (ret)
+ return 0;
+
+ for (i = start; !ret && i <= end; i++) {
+ ret = fn(data, kallsyms_sym_address(get_symbol_seq(i)));
+ cond_resched();
+ }
+
+ return ret;
+}
static unsigned long get_symbol_pos(unsigned long addr,
unsigned long *symbolsize,
@@ -775,6 +871,96 @@ static const struct seq_operations kallsyms_op = {
.show = s_show
};
+#ifdef CONFIG_BPF_SYSCALL
+
+struct bpf_iter__ksym {
+ __bpf_md_ptr(struct bpf_iter_meta *, meta);
+ __bpf_md_ptr(struct kallsym_iter *, ksym);
+};
+
+static int ksym_prog_seq_show(struct seq_file *m, bool in_stop)
+{
+ struct bpf_iter__ksym ctx;
+ struct bpf_iter_meta meta;
+ struct bpf_prog *prog;
+
+ meta.seq = m;
+ prog = bpf_iter_get_info(&meta, in_stop);
+ if (!prog)
+ return 0;
+
+ ctx.meta = &meta;
+ ctx.ksym = m ? m->private : NULL;
+ return bpf_iter_run_prog(prog, &ctx);
+}
+
+static int bpf_iter_ksym_seq_show(struct seq_file *m, void *p)
+{
+ return ksym_prog_seq_show(m, false);
+}
+
+static void bpf_iter_ksym_seq_stop(struct seq_file *m, void *p)
+{
+ if (!p)
+ (void) ksym_prog_seq_show(m, true);
+ else
+ s_stop(m, p);
+}
+
+static const struct seq_operations bpf_iter_ksym_ops = {
+ .start = s_start,
+ .next = s_next,
+ .stop = bpf_iter_ksym_seq_stop,
+ .show = bpf_iter_ksym_seq_show,
+};
+
+static int bpf_iter_ksym_init(void *priv_data, struct bpf_iter_aux_info *aux)
+{
+ struct kallsym_iter *iter = priv_data;
+
+ reset_iter(iter, 0);
+
+ /* cache here as in kallsyms_open() case; use current process
+ * credentials to tell BPF iterators if values should be shown.
+ */
+ iter->show_value = kallsyms_show_value(current_cred());
+
+ return 0;
+}
+
+DEFINE_BPF_ITER_FUNC(ksym, struct bpf_iter_meta *meta, struct kallsym_iter *ksym)
+
+static const struct bpf_iter_seq_info ksym_iter_seq_info = {
+ .seq_ops = &bpf_iter_ksym_ops,
+ .init_seq_private = bpf_iter_ksym_init,
+ .fini_seq_private = NULL,
+ .seq_priv_size = sizeof(struct kallsym_iter),
+};
+
+static struct bpf_iter_reg ksym_iter_reg_info = {
+ .target = "ksym",
+ .feature = BPF_ITER_RESCHED,
+ .ctx_arg_info_size = 1,
+ .ctx_arg_info = {
+ { offsetof(struct bpf_iter__ksym, ksym),
+ PTR_TO_BTF_ID_OR_NULL },
+ },
+ .seq_info = &ksym_iter_seq_info,
+};
+
+BTF_ID_LIST(btf_ksym_iter_id)
+BTF_ID(struct, kallsym_iter)
+
+static int __init bpf_ksym_iter_register(void)
+{
+ ksym_iter_reg_info.ctx_arg_info[0].btf_id = *btf_ksym_iter_id;
+ return bpf_iter_reg_target(&ksym_iter_reg_info);
+}
+
+late_initcall(bpf_ksym_iter_register);
+
+#endif /* CONFIG_BPF_SYSCALL */
+
static inline int kallsyms_for_perf(void)
{
#ifdef CONFIG_PERF_EVENTS
diff --git a/kernel/kallsyms_internal.h b/kernel/kallsyms_internal.h
new file mode 100644
index 000000000000..27fabdcc40f5
--- /dev/null
+++ b/kernel/kallsyms_internal.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef LINUX_KALLSYMS_INTERNAL_H_
+#define LINUX_KALLSYMS_INTERNAL_H_
+
+#include <linux/types.h>
+
+/*
+ * These will be re-linked against their real values
+ * during the second link stage.
+ */
+extern const unsigned long kallsyms_addresses[] __weak;
+extern const int kallsyms_offsets[] __weak;
+extern const u8 kallsyms_names[] __weak;
+
+/*
+ * Tell the compiler that the count isn't in the small data section if the arch
+ * has one (eg: FRV).
+ */
+extern const unsigned int kallsyms_num_syms
+__section(".rodata") __attribute__((weak));
+
+extern const unsigned long kallsyms_relative_base
+__section(".rodata") __attribute__((weak));
+
+extern const char kallsyms_token_table[] __weak;
+extern const u16 kallsyms_token_index[] __weak;
+
+extern const unsigned int kallsyms_markers[] __weak;
+extern const u8 kallsyms_seqs_of_names[] __weak;
+
+#endif // LINUX_KALLSYMS_INTERNAL_H_
diff --git a/kernel/kallsyms_selftest.c b/kernel/kallsyms_selftest.c
new file mode 100644
index 000000000000..a2e3745d15c4
--- /dev/null
+++ b/kernel/kallsyms_selftest.c
@@ -0,0 +1,476 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Test the function and performance of kallsyms
+ *
+ * Copyright (C) Huawei Technologies Co., Ltd., 2022
+ *
+ * Authors: Zhen Lei <thunder.leizhen@huawei.com> Huawei
+ */
+
+#define pr_fmt(fmt) "kallsyms_selftest: " fmt
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/kallsyms.h>
+#include <linux/random.h>
+#include <linux/sched/clock.h>
+#include <linux/kthread.h>
+#include <linux/vmalloc.h>
+
+#include "kallsyms_internal.h"
+#include "kallsyms_selftest.h"
+
+
+#define MAX_NUM_OF_RECORDS 64
+
+struct test_stat {
+ int min;
+ int max;
+ int save_cnt;
+ int real_cnt;
+ int perf;
+ u64 sum;
+ char *name;
+ unsigned long addr;
+ unsigned long addrs[MAX_NUM_OF_RECORDS];
+};
+
+struct test_item {
+ char *name;
+ unsigned long addr;
+};
+
+#define ITEM_FUNC(s) \
+ { \
+ .name = #s, \
+ .addr = (unsigned long)s, \
+ }
+
+#define ITEM_DATA(s) \
+ { \
+ .name = #s, \
+ .addr = (unsigned long)&s, \
+ }
+
+
+static int kallsyms_test_var_bss_static;
+static int kallsyms_test_var_data_static = 1;
+int kallsyms_test_var_bss;
+int kallsyms_test_var_data = 1;
+
+static int kallsyms_test_func_static(void)
+{
+ kallsyms_test_var_bss_static++;
+ kallsyms_test_var_data_static++;
+
+ return 0;
+}
+
+int kallsyms_test_func(void)
+{
+ return kallsyms_test_func_static();
+}
+
+__weak int kallsyms_test_func_weak(void)
+{
+ kallsyms_test_var_bss++;
+ kallsyms_test_var_data++;
+ return 0;
+}
+
+static struct test_item test_items[] = {
+ ITEM_FUNC(kallsyms_test_func_static),
+ ITEM_FUNC(kallsyms_test_func),
+ ITEM_FUNC(kallsyms_test_func_weak),
+ ITEM_FUNC(vmalloc),
+ ITEM_FUNC(vfree),
+#ifdef CONFIG_KALLSYMS_ALL
+ ITEM_DATA(kallsyms_test_var_bss_static),
+ ITEM_DATA(kallsyms_test_var_data_static),
+ ITEM_DATA(kallsyms_test_var_bss),
+ ITEM_DATA(kallsyms_test_var_data),
+ ITEM_DATA(vmap_area_list),
+#endif
+};
+
+static char stub_name[KSYM_NAME_LEN];
+
+static int stat_symbol_len(void *data, const char *name, unsigned long addr)
+{
+ *(u32 *)data += strlen(name);
+
+ return 0;
+}
+
+static void test_kallsyms_compression_ratio(void)
+{
+ u32 pos, off, len, num;
+ u32 ratio, total_size, total_len = 0;
+
+ kallsyms_on_each_symbol(stat_symbol_len, &total_len);
+
+ /*
+ * A symbol name cannot start with a number. This stub name helps us
+ * traverse the entire symbol table without finding a match. It's used
+ * for subsequent performance tests, and its length is the average
+ * length of all symbol names.
+ */
+ memset(stub_name, '4', sizeof(stub_name));
+ pos = total_len / kallsyms_num_syms;
+ stub_name[pos] = 0;
+
+ pos = 0;
+ num = 0;
+ off = 0;
+ while (pos < kallsyms_num_syms) {
+ len = kallsyms_names[off];
+ num++;
+ off++;
+ pos++;
+ if ((len & 0x80) != 0) {
+ len = (len & 0x7f) | (kallsyms_names[off] << 7);
+ num++;
+ off++;
+ }
+ off += len;
+ }
+
+ /*
+ * 1. The length fields is not counted
+ * 2. The memory occupied by array kallsyms_token_table[] and
+ * kallsyms_token_index[] needs to be counted.
+ */
+ total_size = off - num;
+ pos = kallsyms_token_index[0xff];
+ total_size += pos + strlen(&kallsyms_token_table[pos]) + 1;
+ total_size += 0x100 * sizeof(u16);
+
+ pr_info(" ---------------------------------------------------------\n");
+ pr_info("| nr_symbols | compressed size | original size | ratio(%%) |\n");
+ pr_info("|---------------------------------------------------------|\n");
+ ratio = (u32)div_u64(10000ULL * total_size, total_len);
+ pr_info("| %10d | %10d | %10d | %2d.%-2d |\n",
+ kallsyms_num_syms, total_size, total_len, ratio / 100, ratio % 100);
+ pr_info(" ---------------------------------------------------------\n");
+}
+
+static int lookup_name(void *data, const char *name, unsigned long addr)
+{
+ u64 t0, t1, t;
+ struct test_stat *stat = (struct test_stat *)data;
+
+ t0 = ktime_get_ns();
+ (void)kallsyms_lookup_name(name);
+ t1 = ktime_get_ns();
+
+ t = t1 - t0;
+ if (t < stat->min)
+ stat->min = t;
+
+ if (t > stat->max)
+ stat->max = t;
+
+ stat->real_cnt++;
+ stat->sum += t;
+
+ return 0;
+}
+
+static void test_perf_kallsyms_lookup_name(void)
+{
+ struct test_stat stat;
+
+ memset(&stat, 0, sizeof(stat));
+ stat.min = INT_MAX;
+ kallsyms_on_each_symbol(lookup_name, &stat);
+ pr_info("kallsyms_lookup_name() looked up %d symbols\n", stat.real_cnt);
+ pr_info("The time spent on each symbol is (ns): min=%d, max=%d, avg=%lld\n",
+ stat.min, stat.max, div_u64(stat.sum, stat.real_cnt));
+}
+
+static bool match_cleanup_name(const char *s, const char *name)
+{
+ char *p;
+ int len;
+
+ if (!IS_ENABLED(CONFIG_LTO_CLANG))
+ return false;
+
+ p = strchr(s, '.');
+ if (!p)
+ return false;
+
+ len = strlen(name);
+ if (p - s != len)
+ return false;
+
+ return !strncmp(s, name, len);
+}
+
+static int find_symbol(void *data, const char *name, unsigned long addr)
+{
+ struct test_stat *stat = (struct test_stat *)data;
+
+ if (strcmp(name, stat->name) == 0 ||
+ (!stat->perf && match_cleanup_name(name, stat->name))) {
+ stat->real_cnt++;
+ stat->addr = addr;
+
+ if (stat->save_cnt < MAX_NUM_OF_RECORDS) {
+ stat->addrs[stat->save_cnt] = addr;
+ stat->save_cnt++;
+ }
+
+ if (stat->real_cnt == stat->max)
+ return 1;
+ }
+
+ return 0;
+}
+
+static void test_perf_kallsyms_on_each_symbol(void)
+{
+ u64 t0, t1;
+ struct test_stat stat;
+
+ memset(&stat, 0, sizeof(stat));
+ stat.max = INT_MAX;
+ stat.name = stub_name;
+ stat.perf = 1;
+ t0 = ktime_get_ns();
+ kallsyms_on_each_symbol(find_symbol, &stat);
+ t1 = ktime_get_ns();
+ pr_info("kallsyms_on_each_symbol() traverse all: %lld ns\n", t1 - t0);
+}
+
+static int match_symbol(void *data, unsigned long addr)
+{
+ struct test_stat *stat = (struct test_stat *)data;
+
+ stat->real_cnt++;
+ stat->addr = addr;
+
+ if (stat->save_cnt < MAX_NUM_OF_RECORDS) {
+ stat->addrs[stat->save_cnt] = addr;
+ stat->save_cnt++;
+ }
+
+ if (stat->real_cnt == stat->max)
+ return 1;
+
+ return 0;
+}
+
+static void test_perf_kallsyms_on_each_match_symbol(void)
+{
+ u64 t0, t1;
+ struct test_stat stat;
+
+ memset(&stat, 0, sizeof(stat));
+ stat.max = INT_MAX;
+ stat.name = stub_name;
+ t0 = ktime_get_ns();
+ kallsyms_on_each_match_symbol(match_symbol, stat.name, &stat);
+ t1 = ktime_get_ns();
+ pr_info("kallsyms_on_each_match_symbol() traverse all: %lld ns\n", t1 - t0);
+}
+
+static int test_kallsyms_basic_function(void)
+{
+ int i, j, ret;
+ int next = 0, nr_failed = 0;
+ char *prefix;
+ unsigned short rand;
+ unsigned long addr, lookup_addr;
+ char namebuf[KSYM_NAME_LEN];
+ struct test_stat *stat, *stat2;
+
+ stat = kmalloc(sizeof(*stat) * 2, GFP_KERNEL);
+ if (!stat)
+ return -ENOMEM;
+ stat2 = stat + 1;
+
+ prefix = "kallsyms_lookup_name() for";
+ for (i = 0; i < ARRAY_SIZE(test_items); i++) {
+ addr = kallsyms_lookup_name(test_items[i].name);
+ if (addr != test_items[i].addr) {
+ nr_failed++;
+ pr_info("%s %s failed: addr=%lx, expect %lx\n",
+ prefix, test_items[i].name, addr, test_items[i].addr);
+ }
+ }
+
+ prefix = "kallsyms_on_each_symbol() for";
+ for (i = 0; i < ARRAY_SIZE(test_items); i++) {
+ memset(stat, 0, sizeof(*stat));
+ stat->max = INT_MAX;
+ stat->name = test_items[i].name;
+ kallsyms_on_each_symbol(find_symbol, stat);
+ if (stat->addr != test_items[i].addr || stat->real_cnt != 1) {
+ nr_failed++;
+ pr_info("%s %s failed: count=%d, addr=%lx, expect %lx\n",
+ prefix, test_items[i].name,
+ stat->real_cnt, stat->addr, test_items[i].addr);
+ }
+ }
+
+ prefix = "kallsyms_on_each_match_symbol() for";
+ for (i = 0; i < ARRAY_SIZE(test_items); i++) {
+ memset(stat, 0, sizeof(*stat));
+ stat->max = INT_MAX;
+ stat->name = test_items[i].name;
+ kallsyms_on_each_match_symbol(match_symbol, test_items[i].name, stat);
+ if (stat->addr != test_items[i].addr || stat->real_cnt != 1) {
+ nr_failed++;
+ pr_info("%s %s failed: count=%d, addr=%lx, expect %lx\n",
+ prefix, test_items[i].name,
+ stat->real_cnt, stat->addr, test_items[i].addr);
+ }
+ }
+
+ if (nr_failed) {
+ kfree(stat);
+ return -ESRCH;
+ }
+
+ for (i = 0; i < kallsyms_num_syms; i++) {
+ addr = kallsyms_sym_address(i);
+ if (!is_ksym_addr(addr))
+ continue;
+
+ ret = lookup_symbol_name(addr, namebuf);
+ if (unlikely(ret)) {
+ namebuf[0] = 0;
+ goto failed;
+ }
+
+ /*
+ * The first '.' may be the initial letter, in which case the
+ * entire symbol name will be truncated to an empty string in
+ * cleanup_symbol_name(). Do not test these symbols.
+ *
+ * For example:
+ * cat /proc/kallsyms | awk '{print $3}' | grep -E "^\." | head
+ * .E_read_words
+ * .E_leading_bytes
+ * .E_trailing_bytes
+ * .E_write_words
+ * .E_copy
+ * .str.292.llvm.12122243386960820698
+ * .str.24.llvm.12122243386960820698
+ * .str.29.llvm.12122243386960820698
+ * .str.75.llvm.12122243386960820698
+ * .str.99.llvm.12122243386960820698
+ */
+ if (IS_ENABLED(CONFIG_LTO_CLANG) && !namebuf[0])
+ continue;
+
+ lookup_addr = kallsyms_lookup_name(namebuf);
+
+ memset(stat, 0, sizeof(*stat));
+ stat->max = INT_MAX;
+ kallsyms_on_each_match_symbol(match_symbol, namebuf, stat);
+
+ /*
+ * kallsyms_on_each_symbol() is too slow, randomly select some
+ * symbols for test.
+ */
+ if (i >= next) {
+ memset(stat2, 0, sizeof(*stat2));
+ stat2->max = INT_MAX;
+ stat2->name = namebuf;
+ kallsyms_on_each_symbol(find_symbol, stat2);
+
+ /*
+ * kallsyms_on_each_symbol() and kallsyms_on_each_match_symbol()
+ * need to get the same traversal result.
+ */
+ if (stat->addr != stat2->addr ||
+ stat->real_cnt != stat2->real_cnt ||
+ memcmp(stat->addrs, stat2->addrs,
+ stat->save_cnt * sizeof(stat->addrs[0])))
+ goto failed;
+
+ /*
+ * The average of random increments is 128, that is, one of
+ * them is tested every 128 symbols.
+ */
+ get_random_bytes(&rand, sizeof(rand));
+ next = i + (rand & 0xff) + 1;
+ }
+
+ /* Need to be found at least once */
+ if (!stat->real_cnt)
+ goto failed;
+
+ /*
+ * kallsyms_lookup_name() returns the address of the first
+ * symbol found and cannot be NULL.
+ */
+ if (!lookup_addr || lookup_addr != stat->addrs[0])
+ goto failed;
+
+ /*
+ * If the addresses of all matching symbols are recorded, the
+ * target address needs to be exist.
+ */
+ if (stat->real_cnt <= MAX_NUM_OF_RECORDS) {
+ for (j = 0; j < stat->save_cnt; j++) {
+ if (stat->addrs[j] == addr)
+ break;
+ }
+
+ if (j == stat->save_cnt)
+ goto failed;
+ }
+ }
+
+ kfree(stat);
+
+ return 0;
+
+failed:
+ pr_info("Test for %dth symbol failed: (%s) addr=%lx", i, namebuf, addr);
+ kfree(stat);
+ return -ESRCH;
+}
+
+static int test_entry(void *p)
+{
+ int ret;
+
+ do {
+ schedule_timeout(5 * HZ);
+ } while (system_state != SYSTEM_RUNNING);
+
+ pr_info("start\n");
+ ret = test_kallsyms_basic_function();
+ if (ret) {
+ pr_info("abort\n");
+ return 0;
+ }
+
+ test_kallsyms_compression_ratio();
+ test_perf_kallsyms_lookup_name();
+ test_perf_kallsyms_on_each_symbol();
+ test_perf_kallsyms_on_each_match_symbol();
+ pr_info("finish\n");
+
+ return 0;
+}
+
+static int __init kallsyms_test_init(void)
+{
+ struct task_struct *t;
+
+ t = kthread_create(test_entry, NULL, "kallsyms_test");
+ if (IS_ERR(t)) {
+ pr_info("Create kallsyms selftest task failed\n");
+ return PTR_ERR(t);
+ }
+ kthread_bind(t, 0);
+ wake_up_process(t);
+
+ return 0;
+}
+late_initcall(kallsyms_test_init);
diff --git a/kernel/kallsyms_selftest.h b/kernel/kallsyms_selftest.h
new file mode 100644
index 000000000000..c0ca548e2a22
--- /dev/null
+++ b/kernel/kallsyms_selftest.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef LINUX_KALLSYMS_SELFTEST_H_
+#define LINUX_KALLSYMS_SELFTEST_H_
+
+#include <linux/types.h>
+
+extern int kallsyms_test_var_bss;
+extern int kallsyms_test_var_data;
+
+extern int kallsyms_test_func(void);
+extern int kallsyms_test_func_weak(void);
+
+#endif // LINUX_KALLSYMS_SELFTEST_H_
diff --git a/kernel/kcov.c b/kernel/kcov.c
index 80bfe71bbe13..84c717337df0 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -11,6 +11,7 @@
#include <linux/fs.h>
#include <linux/hashtable.h>
#include <linux/init.h>
+#include <linux/kmsan-checks.h>
#include <linux/mm.h>
#include <linux/preempt.h>
#include <linux/printk.h>
@@ -88,6 +89,7 @@ static struct list_head kcov_remote_areas = LIST_HEAD_INIT(kcov_remote_areas);
struct kcov_percpu_data {
void *irq_area;
+ local_lock_t lock;
unsigned int saved_mode;
unsigned int saved_size;
@@ -96,7 +98,9 @@ struct kcov_percpu_data {
int saved_sequence;
};
-static DEFINE_PER_CPU(struct kcov_percpu_data, kcov_percpu_data);
+static DEFINE_PER_CPU(struct kcov_percpu_data, kcov_percpu_data) = {
+ .lock = INIT_LOCAL_LOCK(lock),
+};
/* Must be called with kcov_remote_lock locked. */
static struct kcov_remote *kcov_remote_find(u64 handle)
@@ -149,6 +153,12 @@ static void kcov_remote_area_put(struct kcov_remote_area *area,
INIT_LIST_HEAD(&area->list);
area->size = size;
list_add(&area->list, &kcov_remote_areas);
+ /*
+ * KMSAN doesn't instrument this file, so it may not know area->list
+ * is initialized. Unpoison it explicitly to avoid reports in
+ * kcov_remote_area_get().
+ */
+ kmsan_unpoison_memory(&area->list, sizeof(area->list));
}
static notrace bool check_kcov_mode(enum kcov_mode needed_mode, struct task_struct *t)
@@ -201,8 +211,16 @@ void notrace __sanitizer_cov_trace_pc(void)
/* The first 64-bit word is the number of subsequent PCs. */
pos = READ_ONCE(area[0]) + 1;
if (likely(pos < t->kcov_size)) {
- area[pos] = ip;
+ /* Previously we write pc before updating pos. However, some
+ * early interrupt code could bypass check_kcov_mode() check
+ * and invoke __sanitizer_cov_trace_pc(). If such interrupt is
+ * raised between writing pc and updating pos, the pc could be
+ * overitten by the recursive __sanitizer_cov_trace_pc().
+ * Update pos before writing pc to avoid such interleaving.
+ */
WRITE_ONCE(area[0], pos);
+ barrier();
+ area[pos] = ip;
}
}
EXPORT_SYMBOL(__sanitizer_cov_trace_pc);
@@ -233,11 +251,13 @@ static void notrace write_comp_data(u64 type, u64 arg1, u64 arg2, u64 ip)
start_index = 1 + count * KCOV_WORDS_PER_CMP;
end_pos = (start_index + KCOV_WORDS_PER_CMP) * sizeof(u64);
if (likely(end_pos <= max_pos)) {
+ /* See comment in __sanitizer_cov_trace_pc(). */
+ WRITE_ONCE(area[0], count + 1);
+ barrier();
area[start_index] = type;
area[start_index + 1] = arg1;
area[start_index + 2] = arg2;
area[start_index + 3] = ip;
- WRITE_ONCE(area[0], count + 1);
}
}
@@ -456,37 +476,31 @@ void kcov_task_exit(struct task_struct *t)
static int kcov_mmap(struct file *filep, struct vm_area_struct *vma)
{
int res = 0;
- void *area;
struct kcov *kcov = vma->vm_file->private_data;
unsigned long size, off;
struct page *page;
unsigned long flags;
- area = vmalloc_user(vma->vm_end - vma->vm_start);
- if (!area)
- return -ENOMEM;
-
spin_lock_irqsave(&kcov->lock, flags);
size = kcov->size * sizeof(unsigned long);
- if (kcov->mode != KCOV_MODE_INIT || vma->vm_pgoff != 0 ||
+ if (kcov->area == NULL || vma->vm_pgoff != 0 ||
vma->vm_end - vma->vm_start != size) {
res = -EINVAL;
goto exit;
}
- if (!kcov->area) {
- kcov->area = area;
- vma->vm_flags |= VM_DONTEXPAND;
- spin_unlock_irqrestore(&kcov->lock, flags);
- for (off = 0; off < size; off += PAGE_SIZE) {
- page = vmalloc_to_page(kcov->area + off);
- if (vm_insert_page(vma, vma->vm_start + off, page))
- WARN_ONCE(1, "vm_insert_page() failed");
+ spin_unlock_irqrestore(&kcov->lock, flags);
+ vm_flags_set(vma, VM_DONTEXPAND);
+ for (off = 0; off < size; off += PAGE_SIZE) {
+ page = vmalloc_to_page(kcov->area + off);
+ res = vm_insert_page(vma, vma->vm_start + off, page);
+ if (res) {
+ pr_warn_once("kcov: vm_insert_page() failed\n");
+ return res;
}
- return 0;
}
+ return 0;
exit:
spin_unlock_irqrestore(&kcov->lock, flags);
- vfree(area);
return res;
}
@@ -561,31 +575,12 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd,
unsigned long arg)
{
struct task_struct *t;
- unsigned long size, unused;
+ unsigned long flags, unused;
int mode, i;
struct kcov_remote_arg *remote_arg;
struct kcov_remote *remote;
- unsigned long flags;
switch (cmd) {
- case KCOV_INIT_TRACE:
- /*
- * Enable kcov in trace mode and setup buffer size.
- * Must happen before anything else.
- */
- if (kcov->mode != KCOV_MODE_DISABLED)
- return -EBUSY;
- /*
- * Size must be at least 2 to hold current position and one PC.
- * Later we allocate size * sizeof(unsigned long) memory,
- * that must not overflow.
- */
- size = arg;
- if (size < 2 || size > INT_MAX / sizeof(unsigned long))
- return -EINVAL;
- kcov->size = size;
- kcov->mode = KCOV_MODE_INIT;
- return 0;
case KCOV_ENABLE:
/*
* Enable coverage for the current task.
@@ -689,9 +684,37 @@ static long kcov_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
struct kcov_remote_arg *remote_arg = NULL;
unsigned int remote_num_handles;
unsigned long remote_arg_size;
- unsigned long flags;
+ unsigned long size, flags;
+ void *area;
- if (cmd == KCOV_REMOTE_ENABLE) {
+ kcov = filep->private_data;
+ switch (cmd) {
+ case KCOV_INIT_TRACE:
+ /*
+ * Enable kcov in trace mode and setup buffer size.
+ * Must happen before anything else.
+ *
+ * First check the size argument - it must be at least 2
+ * to hold the current position and one PC.
+ */
+ size = arg;
+ if (size < 2 || size > INT_MAX / sizeof(unsigned long))
+ return -EINVAL;
+ area = vmalloc_user(size * sizeof(unsigned long));
+ if (area == NULL)
+ return -ENOMEM;
+ spin_lock_irqsave(&kcov->lock, flags);
+ if (kcov->mode != KCOV_MODE_DISABLED) {
+ spin_unlock_irqrestore(&kcov->lock, flags);
+ vfree(area);
+ return -EBUSY;
+ }
+ kcov->area = area;
+ kcov->size = size;
+ kcov->mode = KCOV_MODE_INIT;
+ spin_unlock_irqrestore(&kcov->lock, flags);
+ return 0;
+ case KCOV_REMOTE_ENABLE:
if (get_user(remote_num_handles, (unsigned __user *)(arg +
offsetof(struct kcov_remote_arg, num_handles))))
return -EFAULT;
@@ -707,16 +730,18 @@ static long kcov_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
return -EINVAL;
}
arg = (unsigned long)remote_arg;
+ fallthrough;
+ default:
+ /*
+ * All other commands can be normally executed under a spin lock, so we
+ * obtain and release it here in order to simplify kcov_ioctl_locked().
+ */
+ spin_lock_irqsave(&kcov->lock, flags);
+ res = kcov_ioctl_locked(kcov, cmd, arg);
+ spin_unlock_irqrestore(&kcov->lock, flags);
+ kfree(remote_arg);
+ return res;
}
-
- kcov = filep->private_data;
- spin_lock_irqsave(&kcov->lock, flags);
- res = kcov_ioctl_locked(kcov, cmd, arg);
- spin_unlock_irqrestore(&kcov->lock, flags);
-
- kfree(remote_arg);
-
- return res;
}
static const struct file_operations kcov_fops = {
@@ -824,7 +849,7 @@ void kcov_remote_start(u64 handle)
if (!in_task() && !in_serving_softirq())
return;
- local_irq_save(flags);
+ local_lock_irqsave(&kcov_percpu_data.lock, flags);
/*
* Check that kcov_remote_start() is not called twice in background
@@ -832,7 +857,7 @@ void kcov_remote_start(u64 handle)
*/
mode = READ_ONCE(t->kcov_mode);
if (WARN_ON(in_task() && kcov_mode_enabled(mode))) {
- local_irq_restore(flags);
+ local_unlock_irqrestore(&kcov_percpu_data.lock, flags);
return;
}
/*
@@ -841,14 +866,15 @@ void kcov_remote_start(u64 handle)
* happened while collecting coverage from a background thread.
*/
if (WARN_ON(in_serving_softirq() && t->kcov_softirq)) {
- local_irq_restore(flags);
+ local_unlock_irqrestore(&kcov_percpu_data.lock, flags);
return;
}
spin_lock(&kcov_remote_lock);
remote = kcov_remote_find(handle);
if (!remote) {
- spin_unlock_irqrestore(&kcov_remote_lock, flags);
+ spin_unlock(&kcov_remote_lock);
+ local_unlock_irqrestore(&kcov_percpu_data.lock, flags);
return;
}
kcov_debug("handle = %llx, context: %s\n", handle,
@@ -869,19 +895,19 @@ void kcov_remote_start(u64 handle)
size = CONFIG_KCOV_IRQ_AREA_SIZE;
area = this_cpu_ptr(&kcov_percpu_data)->irq_area;
}
- spin_unlock_irqrestore(&kcov_remote_lock, flags);
+ spin_unlock(&kcov_remote_lock);
/* Can only happen when in_task(). */
if (!area) {
+ local_unlock_irqrestore(&kcov_percpu_data.lock, flags);
area = vmalloc(size * sizeof(unsigned long));
if (!area) {
kcov_put(kcov);
return;
}
+ local_lock_irqsave(&kcov_percpu_data.lock, flags);
}
- local_irq_save(flags);
-
/* Reset coverage size. */
*(u64 *)area = 0;
@@ -891,7 +917,7 @@ void kcov_remote_start(u64 handle)
}
kcov_start(t, kcov, size, area, mode, sequence);
- local_irq_restore(flags);
+ local_unlock_irqrestore(&kcov_percpu_data.lock, flags);
}
EXPORT_SYMBOL(kcov_remote_start);
@@ -965,12 +991,12 @@ void kcov_remote_stop(void)
if (!in_task() && !in_serving_softirq())
return;
- local_irq_save(flags);
+ local_lock_irqsave(&kcov_percpu_data.lock, flags);
mode = READ_ONCE(t->kcov_mode);
barrier();
if (!kcov_mode_enabled(mode)) {
- local_irq_restore(flags);
+ local_unlock_irqrestore(&kcov_percpu_data.lock, flags);
return;
}
/*
@@ -978,12 +1004,12 @@ void kcov_remote_stop(void)
* actually found the remote handle and started collecting coverage.
*/
if (in_serving_softirq() && !t->kcov_softirq) {
- local_irq_restore(flags);
+ local_unlock_irqrestore(&kcov_percpu_data.lock, flags);
return;
}
/* Make sure that kcov_softirq is only set when in softirq. */
if (WARN_ON(!in_serving_softirq() && t->kcov_softirq)) {
- local_irq_restore(flags);
+ local_unlock_irqrestore(&kcov_percpu_data.lock, flags);
return;
}
@@ -1013,7 +1039,7 @@ void kcov_remote_stop(void)
spin_unlock(&kcov_remote_lock);
}
- local_irq_restore(flags);
+ local_unlock_irqrestore(&kcov_percpu_data.lock, flags);
/* Get in kcov_remote_start(). */
kcov_put(kcov);
@@ -1034,8 +1060,8 @@ static int __init kcov_init(void)
int cpu;
for_each_possible_cpu(cpu) {
- void *area = vmalloc(CONFIG_KCOV_IRQ_AREA_SIZE *
- sizeof(unsigned long));
+ void *area = vmalloc_node(CONFIG_KCOV_IRQ_AREA_SIZE *
+ sizeof(unsigned long), cpu_to_node(cpu));
if (!area)
return -ENOMEM;
per_cpu_ptr(&kcov_percpu_data, cpu)->irq_area = area;
diff --git a/kernel/kcsan/.kunitconfig b/kernel/kcsan/.kunitconfig
new file mode 100644
index 000000000000..e82f0f52ab0a
--- /dev/null
+++ b/kernel/kcsan/.kunitconfig
@@ -0,0 +1,24 @@
+# Note that the KCSAN tests need to run on an SMP setup.
+# Under kunit_tool, this can be done by using the --qemu_args
+# option to configure a machine with several cores. For example:
+# ./tools/testing/kunit/kunit.py run --kunitconfig=kernel/kcsan \
+# --arch=x86_64 --qemu_args="-smp 8"
+
+CONFIG_KUNIT=y
+
+CONFIG_DEBUG_KERNEL=y
+
+# Need some level of concurrency to test a concurrency sanitizer.
+CONFIG_SMP=y
+
+CONFIG_KCSAN=y
+CONFIG_KCSAN_KUNIT_TEST=y
+
+# Set these if you want to run test_barrier_nothreads
+#CONFIG_KCSAN_STRICT=y
+#CONFIG_KCSAN_WEAK_MEMORY=y
+
+# This prevents the test from timing out on many setups. Feel free to remove
+# (or alter) this, in conjunction with setting a different test timeout with,
+# for example, the --timeout kunit_tool option.
+CONFIG_KCSAN_REPORT_ONCE_IN_MS=100
diff --git a/kernel/kcsan/Makefile b/kernel/kcsan/Makefile
index c2bb07f5bcc7..a45f3dfc8d14 100644
--- a/kernel/kcsan/Makefile
+++ b/kernel/kcsan/Makefile
@@ -8,10 +8,14 @@ CFLAGS_REMOVE_debugfs.o = $(CC_FLAGS_FTRACE)
CFLAGS_REMOVE_report.o = $(CC_FLAGS_FTRACE)
CFLAGS_core.o := $(call cc-option,-fno-conserve-stack) \
+ $(call cc-option,-mno-outline-atomics) \
-fno-stack-protector -DDISABLE_BRANCH_PROFILING
obj-y := core.o debugfs.o report.o
+
+KCSAN_INSTRUMENT_BARRIERS_selftest.o := y
obj-$(CONFIG_KCSAN_SELFTEST) += selftest.o
-CFLAGS_kcsan_test.o := $(CFLAGS_KCSAN) -g -fno-omit-frame-pointer
+CFLAGS_kcsan_test.o := $(CFLAGS_KCSAN) -fno-omit-frame-pointer
+CFLAGS_kcsan_test.o += $(DISABLE_STRUCTLEAK_PLUGIN)
obj-$(CONFIG_KCSAN_KUNIT_TEST) += kcsan_test.o
diff --git a/kernel/kcsan/atomic.h b/kernel/kcsan/atomic.h
deleted file mode 100644
index 530ae1bda8e7..000000000000
--- a/kernel/kcsan/atomic.h
+++ /dev/null
@@ -1,23 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Rules for implicitly atomic memory accesses.
- *
- * Copyright (C) 2019, Google LLC.
- */
-
-#ifndef _KERNEL_KCSAN_ATOMIC_H
-#define _KERNEL_KCSAN_ATOMIC_H
-
-#include <linux/types.h>
-
-/*
- * Special rules for certain memory where concurrent conflicting accesses are
- * common, however, the current convention is to not mark them; returns true if
- * access to @ptr should be considered atomic. Called from slow-path.
- */
-static bool kcsan_is_atomic_special(const volatile void *ptr)
-{
- return false;
-}
-
-#endif /* _KERNEL_KCSAN_ATOMIC_H */
diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c
index 26709ea65c71..5a60cc52adc0 100644
--- a/kernel/kcsan/core.c
+++ b/kernel/kcsan/core.c
@@ -14,15 +14,17 @@
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/list.h>
+#include <linux/minmax.h>
#include <linux/moduleparam.h>
#include <linux/percpu.h>
#include <linux/preempt.h>
#include <linux/sched.h>
+#include <linux/string.h>
#include <linux/uaccess.h>
-#include "atomic.h"
#include "encoding.h"
#include "kcsan.h"
+#include "permissive.h"
static bool kcsan_early_enable = IS_ENABLED(CONFIG_KCSAN_EARLY_ENABLE);
unsigned int kcsan_udelay_task = CONFIG_KCSAN_UDELAY_TASK;
@@ -40,15 +42,17 @@ module_param_named(udelay_interrupt, kcsan_udelay_interrupt, uint, 0644);
module_param_named(skip_watch, kcsan_skip_watch, long, 0644);
module_param_named(interrupt_watcher, kcsan_interrupt_watcher, bool, 0444);
+#ifdef CONFIG_KCSAN_WEAK_MEMORY
+static bool kcsan_weak_memory = true;
+module_param_named(weak_memory, kcsan_weak_memory, bool, 0644);
+#else
+#define kcsan_weak_memory false
+#endif
+
bool kcsan_enabled;
/* Per-CPU kcsan_ctx for interrupts */
static DEFINE_PER_CPU(struct kcsan_ctx, kcsan_cpu_ctx) = {
- .disable_count = 0,
- .atomic_next = 0,
- .atomic_nest_count = 0,
- .in_flat_atomic = false,
- .access_mask = 0,
.scoped_accesses = {LIST_POISON1, NULL},
};
@@ -202,22 +206,29 @@ static __always_inline struct kcsan_ctx *get_ctx(void)
return in_task() ? &current->kcsan_ctx : raw_cpu_ptr(&kcsan_cpu_ctx);
}
+static __always_inline void
+check_access(const volatile void *ptr, size_t size, int type, unsigned long ip);
+
/* Check scoped accesses; never inline because this is a slow-path! */
static noinline void kcsan_check_scoped_accesses(void)
{
struct kcsan_ctx *ctx = get_ctx();
- struct list_head *prev_save = ctx->scoped_accesses.prev;
struct kcsan_scoped_access *scoped_access;
- ctx->scoped_accesses.prev = NULL; /* Avoid recursion. */
- list_for_each_entry(scoped_access, &ctx->scoped_accesses, list)
- __kcsan_check_access(scoped_access->ptr, scoped_access->size, scoped_access->type);
- ctx->scoped_accesses.prev = prev_save;
+ if (ctx->disable_scoped)
+ return;
+
+ ctx->disable_scoped++;
+ list_for_each_entry(scoped_access, &ctx->scoped_accesses, list) {
+ check_access(scoped_access->ptr, scoped_access->size,
+ scoped_access->type, scoped_access->ip);
+ }
+ ctx->disable_scoped--;
}
/* Rules for generic atomic accesses. Called from fast-path. */
static __always_inline bool
-is_atomic(const volatile void *ptr, size_t size, int type, struct kcsan_ctx *ctx)
+is_atomic(struct kcsan_ctx *ctx, const volatile void *ptr, size_t size, int type)
{
if (type & KCSAN_ACCESS_ATOMIC)
return true;
@@ -254,7 +265,7 @@ is_atomic(const volatile void *ptr, size_t size, int type, struct kcsan_ctx *ctx
}
static __always_inline bool
-should_watch(const volatile void *ptr, size_t size, int type, struct kcsan_ctx *ctx)
+should_watch(struct kcsan_ctx *ctx, const volatile void *ptr, size_t size, int type)
{
/*
* Never set up watchpoints when memory operations are atomic.
@@ -263,7 +274,7 @@ should_watch(const volatile void *ptr, size_t size, int type, struct kcsan_ctx *
* should not count towards skipped instructions, and (2) to actually
* decrement kcsan_atomic_next for consecutive instruction stream.
*/
- if (is_atomic(ptr, size, type, ctx))
+ if (is_atomic(ctx, ptr, size, type))
return false;
if (this_cpu_dec_return(kcsan_skip) >= 0)
@@ -301,9 +312,9 @@ static inline void reset_kcsan_skip(void)
this_cpu_write(kcsan_skip, skip_count);
}
-static __always_inline bool kcsan_is_enabled(void)
+static __always_inline bool kcsan_is_enabled(struct kcsan_ctx *ctx)
{
- return READ_ONCE(kcsan_enabled) && get_ctx()->disable_count == 0;
+ return READ_ONCE(kcsan_enabled) && !ctx->disable_count;
}
/* Introduce delay depending on context and configuration. */
@@ -320,6 +331,30 @@ static void delay_access(int type)
udelay(delay);
}
+/*
+ * Reads the instrumented memory for value change detection; value change
+ * detection is currently done for accesses up to a size of 8 bytes.
+ */
+static __always_inline u64 read_instrumented_memory(const volatile void *ptr, size_t size)
+{
+ /*
+ * In the below we don't necessarily need the read of the location to
+ * be atomic, and we don't use READ_ONCE(), since all we need for race
+ * detection is to observe 2 different values.
+ *
+ * Furthermore, on certain architectures (such as arm64), READ_ONCE()
+ * may turn into more complex instructions than a plain load that cannot
+ * do unaligned accesses.
+ */
+ switch (size) {
+ case 1: return *(const volatile u8 *)ptr;
+ case 2: return *(const volatile u16 *)ptr;
+ case 4: return *(const volatile u32 *)ptr;
+ case 8: return *(const volatile u64 *)ptr;
+ default: return 0; /* Ignore; we do not diff the values. */
+ }
+}
+
void kcsan_save_irqtrace(struct task_struct *task)
{
#ifdef CONFIG_TRACE_IRQFLAGS
@@ -334,6 +369,76 @@ void kcsan_restore_irqtrace(struct task_struct *task)
#endif
}
+static __always_inline int get_kcsan_stack_depth(void)
+{
+#ifdef CONFIG_KCSAN_WEAK_MEMORY
+ return current->kcsan_stack_depth;
+#else
+ BUILD_BUG();
+ return 0;
+#endif
+}
+
+static __always_inline void add_kcsan_stack_depth(int val)
+{
+#ifdef CONFIG_KCSAN_WEAK_MEMORY
+ current->kcsan_stack_depth += val;
+#else
+ BUILD_BUG();
+#endif
+}
+
+static __always_inline struct kcsan_scoped_access *get_reorder_access(struct kcsan_ctx *ctx)
+{
+#ifdef CONFIG_KCSAN_WEAK_MEMORY
+ return ctx->disable_scoped ? NULL : &ctx->reorder_access;
+#else
+ return NULL;
+#endif
+}
+
+static __always_inline bool
+find_reorder_access(struct kcsan_ctx *ctx, const volatile void *ptr, size_t size,
+ int type, unsigned long ip)
+{
+ struct kcsan_scoped_access *reorder_access = get_reorder_access(ctx);
+
+ if (!reorder_access)
+ return false;
+
+ /*
+ * Note: If accesses are repeated while reorder_access is identical,
+ * never matches the new access, because !(type & KCSAN_ACCESS_SCOPED).
+ */
+ return reorder_access->ptr == ptr && reorder_access->size == size &&
+ reorder_access->type == type && reorder_access->ip == ip;
+}
+
+static inline void
+set_reorder_access(struct kcsan_ctx *ctx, const volatile void *ptr, size_t size,
+ int type, unsigned long ip)
+{
+ struct kcsan_scoped_access *reorder_access = get_reorder_access(ctx);
+
+ if (!reorder_access || !kcsan_weak_memory)
+ return;
+
+ /*
+ * To avoid nested interrupts or scheduler (which share kcsan_ctx)
+ * reading an inconsistent reorder_access, ensure that the below has
+ * exclusive access to reorder_access by disallowing concurrent use.
+ */
+ ctx->disable_scoped++;
+ barrier();
+ reorder_access->ptr = ptr;
+ reorder_access->size = size;
+ reorder_access->type = type | KCSAN_ACCESS_SCOPED;
+ reorder_access->ip = ip;
+ reorder_access->stack_depth = get_kcsan_stack_depth();
+ barrier();
+ ctx->disable_scoped--;
+}
+
/*
* Pull everything together: check_access() below contains the performance
* critical operations; the fast-path (including check_access) functions should
@@ -350,28 +455,47 @@ void kcsan_restore_irqtrace(struct task_struct *task)
static noinline void kcsan_found_watchpoint(const volatile void *ptr,
size_t size,
int type,
+ unsigned long ip,
atomic_long_t *watchpoint,
long encoded_watchpoint)
{
+ const bool is_assert = (type & KCSAN_ACCESS_ASSERT) != 0;
+ struct kcsan_ctx *ctx = get_ctx();
unsigned long flags;
bool consumed;
- if (!kcsan_is_enabled())
+ /*
+ * We know a watchpoint exists. Let's try to keep the race-window
+ * between here and finally consuming the watchpoint below as small as
+ * possible -- avoid unneccessarily complex code until consumed.
+ */
+
+ if (!kcsan_is_enabled(ctx))
return;
/*
* The access_mask check relies on value-change comparison. To avoid
* reporting a race where e.g. the writer set up the watchpoint, but the
* reader has access_mask!=0, we have to ignore the found watchpoint.
+ *
+ * reorder_access is never created from an access with access_mask set.
+ */
+ if (ctx->access_mask && !find_reorder_access(ctx, ptr, size, type, ip))
+ return;
+
+ /*
+ * If the other thread does not want to ignore the access, and there was
+ * a value change as a result of this thread's operation, we will still
+ * generate a report of unknown origin.
+ *
+ * Use CONFIG_KCSAN_REPORT_RACE_UNKNOWN_ORIGIN=n to filter.
*/
- if (get_ctx()->access_mask != 0)
+ if (!is_assert && kcsan_ignore_address(ptr))
return;
/*
- * Consume the watchpoint as soon as possible, to minimize the chances
- * of !consumed. Consuming the watchpoint must always be guarded by
- * kcsan_is_enabled() check, as otherwise we might erroneously
- * triggering reports when disabled.
+ * Consuming the watchpoint must be guarded by kcsan_is_enabled() to
+ * avoid erroneously triggering reports if the context is disabled.
*/
consumed = try_consume_watchpoint(watchpoint, encoded_watchpoint);
@@ -380,7 +504,7 @@ static noinline void kcsan_found_watchpoint(const volatile void *ptr,
if (consumed) {
kcsan_save_irqtrace(current);
- kcsan_report_set_info(ptr, size, type, watchpoint - watchpoints);
+ kcsan_report_set_info(ptr, size, type, ip, watchpoint - watchpoints);
kcsan_restore_irqtrace(current);
} else {
/*
@@ -391,7 +515,7 @@ static noinline void kcsan_found_watchpoint(const volatile void *ptr,
atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_REPORT_RACES]);
}
- if ((type & KCSAN_ACCESS_ASSERT) != 0)
+ if (is_assert)
atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_ASSERT_FAILURES]);
else
atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_DATA_RACES]);
@@ -400,16 +524,19 @@ static noinline void kcsan_found_watchpoint(const volatile void *ptr,
}
static noinline void
-kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type)
+kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type, unsigned long ip)
{
const bool is_write = (type & KCSAN_ACCESS_WRITE) != 0;
const bool is_assert = (type & KCSAN_ACCESS_ASSERT) != 0;
atomic_long_t *watchpoint;
u64 old, new, diff;
- unsigned long access_mask;
enum kcsan_value_change value_change = KCSAN_VALUE_CHANGE_MAYBE;
+ bool interrupt_watcher = kcsan_interrupt_watcher;
unsigned long ua_flags = user_access_save();
+ struct kcsan_ctx *ctx = get_ctx();
+ unsigned long access_mask = ctx->access_mask;
unsigned long irq_flags = 0;
+ bool is_reorder_access;
/*
* Always reset kcsan_skip counter in slow-path to avoid underflow; see
@@ -417,16 +544,14 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type)
*/
reset_kcsan_skip();
- if (!kcsan_is_enabled())
+ if (!kcsan_is_enabled(ctx))
goto out;
/*
- * Special atomic rules: unlikely to be true, so we check them here in
- * the slow-path, and not in the fast-path in is_atomic(). Call after
- * kcsan_is_enabled(), as we may access memory that is not yet
- * initialized during early boot.
+ * Check to-ignore addresses after kcsan_is_enabled(), as we may access
+ * memory that is not yet initialized during early boot.
*/
- if (!is_assert && kcsan_is_atomic_special(ptr))
+ if (!is_assert && kcsan_ignore_address(ptr))
goto out;
if (!check_encodable((unsigned long)ptr, size)) {
@@ -435,12 +560,32 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type)
}
/*
+ * The local CPU cannot observe reordering of its own accesses, and
+ * therefore we need to take care of 2 cases to avoid false positives:
+ *
+ * 1. Races of the reordered access with interrupts. To avoid, if
+ * the current access is reorder_access, disable interrupts.
+ * 2. Avoid races of scoped accesses from nested interrupts (below).
+ */
+ is_reorder_access = find_reorder_access(ctx, ptr, size, type, ip);
+ if (is_reorder_access)
+ interrupt_watcher = false;
+ /*
+ * Avoid races of scoped accesses from nested interrupts (or scheduler).
+ * Assume setting up a watchpoint for a non-scoped (normal) access that
+ * also conflicts with a current scoped access. In a nested interrupt,
+ * which shares the context, it would check a conflicting scoped access.
+ * To avoid, disable scoped access checking.
+ */
+ ctx->disable_scoped++;
+
+ /*
* Save and restore the IRQ state trace touched by KCSAN, since KCSAN's
* runtime is entered for every memory access, and potentially useful
* information is lost if dirtied by KCSAN.
*/
kcsan_save_irqtrace(current);
- if (!kcsan_interrupt_watcher)
+ if (!interrupt_watcher)
local_irq_save(irq_flags);
watchpoint = insert_watchpoint((unsigned long)ptr, size, is_write);
@@ -461,32 +606,7 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type)
* Read the current value, to later check and infer a race if the data
* was modified via a non-instrumented access, e.g. from a device.
*/
- old = 0;
- switch (size) {
- case 1:
- old = READ_ONCE(*(const u8 *)ptr);
- break;
- case 2:
- old = READ_ONCE(*(const u16 *)ptr);
- break;
- case 4:
- old = READ_ONCE(*(const u32 *)ptr);
- break;
- case 8:
- old = READ_ONCE(*(const u64 *)ptr);
- break;
- default:
- break; /* ignore; we do not diff the values */
- }
-
- if (IS_ENABLED(CONFIG_KCSAN_DEBUG)) {
- kcsan_disable_current();
- pr_err("watching %s, size: %zu, addr: %px [slot: %d, encoded: %lx]\n",
- is_write ? "write" : "read", size, ptr,
- watchpoint_slot((unsigned long)ptr),
- encode_watchpoint((unsigned long)ptr, size, is_write));
- kcsan_enable_current();
- }
+ old = is_reorder_access ? 0 : read_instrumented_memory(ptr, size);
/*
* Delay this thread, to increase probability of observing a racy
@@ -498,31 +618,30 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type)
* Re-read value, and check if it is as expected; if not, we infer a
* racy access.
*/
- access_mask = get_ctx()->access_mask;
- new = 0;
- switch (size) {
- case 1:
- new = READ_ONCE(*(const u8 *)ptr);
- break;
- case 2:
- new = READ_ONCE(*(const u16 *)ptr);
- break;
- case 4:
- new = READ_ONCE(*(const u32 *)ptr);
- break;
- case 8:
- new = READ_ONCE(*(const u64 *)ptr);
- break;
- default:
- break; /* ignore; we do not diff the values */
+ if (!is_reorder_access) {
+ new = read_instrumented_memory(ptr, size);
+ } else {
+ /*
+ * Reordered accesses cannot be used for value change detection,
+ * because the memory location may no longer be accessible and
+ * could result in a fault.
+ */
+ new = 0;
+ access_mask = 0;
}
diff = old ^ new;
if (access_mask)
diff &= access_mask;
- /* Were we able to observe a value-change? */
- if (diff != 0)
+ /*
+ * Check if we observed a value change.
+ *
+ * Also check if the data race should be ignored (the rules depend on
+ * non-zero diff); if it is to be ignored, the below rules for
+ * KCSAN_VALUE_CHANGE_MAYBE apply.
+ */
+ if (diff && !kcsan_ignore_data_race(size, type, old, new, diff))
value_change = KCSAN_VALUE_CHANGE_TRUE;
/* Check if this access raced with another. */
@@ -556,8 +675,8 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type)
if (is_assert && value_change == KCSAN_VALUE_CHANGE_TRUE)
atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_ASSERT_FAILURES]);
- kcsan_report_known_origin(ptr, size, type, value_change,
- watchpoint - watchpoints,
+ kcsan_report_known_origin(ptr, size, type, ip,
+ value_change, watchpoint - watchpoints,
old, new, access_mask);
} else if (value_change == KCSAN_VALUE_CHANGE_TRUE) {
/* Inferring a race, since the value should not have changed. */
@@ -566,8 +685,10 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type)
if (is_assert)
atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_ASSERT_FAILURES]);
- if (IS_ENABLED(CONFIG_KCSAN_REPORT_RACE_UNKNOWN_ORIGIN) || is_assert)
- kcsan_report_unknown_origin(ptr, size, type, old, new, access_mask);
+ if (IS_ENABLED(CONFIG_KCSAN_REPORT_RACE_UNKNOWN_ORIGIN) || is_assert) {
+ kcsan_report_unknown_origin(ptr, size, type, ip,
+ old, new, access_mask);
+ }
}
/*
@@ -576,18 +697,27 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type)
*/
remove_watchpoint(watchpoint);
atomic_long_dec(&kcsan_counters[KCSAN_COUNTER_USED_WATCHPOINTS]);
+
out_unlock:
- if (!kcsan_interrupt_watcher)
+ if (!interrupt_watcher)
local_irq_restore(irq_flags);
kcsan_restore_irqtrace(current);
+ ctx->disable_scoped--;
+
+ /*
+ * Reordered accesses cannot be used for value change detection,
+ * therefore never consider for reordering if access_mask is set.
+ * ASSERT_EXCLUSIVE are not real accesses, ignore them as well.
+ */
+ if (!access_mask && !is_assert)
+ set_reorder_access(ctx, ptr, size, type, ip);
out:
user_access_restore(ua_flags);
}
-static __always_inline void check_access(const volatile void *ptr, size_t size,
- int type)
+static __always_inline void
+check_access(const volatile void *ptr, size_t size, int type, unsigned long ip)
{
- const bool is_write = (type & KCSAN_ACCESS_WRITE) != 0;
atomic_long_t *watchpoint;
long encoded_watchpoint;
@@ -598,12 +728,14 @@ static __always_inline void check_access(const volatile void *ptr, size_t size,
if (unlikely(size == 0))
return;
+again:
/*
* Avoid user_access_save in fast-path: find_watchpoint is safe without
* user_access_save, as the address that ptr points to is only used to
* check if a watchpoint exists; ptr is never dereferenced.
*/
- watchpoint = find_watchpoint((unsigned long)ptr, size, !is_write,
+ watchpoint = find_watchpoint((unsigned long)ptr, size,
+ !(type & KCSAN_ACCESS_WRITE),
&encoded_watchpoint);
/*
* It is safe to check kcsan_is_enabled() after find_watchpoint in the
@@ -613,14 +745,46 @@ static __always_inline void check_access(const volatile void *ptr, size_t size,
*/
if (unlikely(watchpoint != NULL))
- kcsan_found_watchpoint(ptr, size, type, watchpoint,
- encoded_watchpoint);
+ kcsan_found_watchpoint(ptr, size, type, ip, watchpoint, encoded_watchpoint);
else {
struct kcsan_ctx *ctx = get_ctx(); /* Call only once in fast-path. */
- if (unlikely(should_watch(ptr, size, type, ctx)))
- kcsan_setup_watchpoint(ptr, size, type);
- else if (unlikely(ctx->scoped_accesses.prev))
+ if (unlikely(should_watch(ctx, ptr, size, type))) {
+ kcsan_setup_watchpoint(ptr, size, type, ip);
+ return;
+ }
+
+ if (!(type & KCSAN_ACCESS_SCOPED)) {
+ struct kcsan_scoped_access *reorder_access = get_reorder_access(ctx);
+
+ if (reorder_access) {
+ /*
+ * reorder_access check: simulates reordering of
+ * the access after subsequent operations.
+ */
+ ptr = reorder_access->ptr;
+ type = reorder_access->type;
+ ip = reorder_access->ip;
+ /*
+ * Upon a nested interrupt, this context's
+ * reorder_access can be modified (shared ctx).
+ * We know that upon return, reorder_access is
+ * always invalidated by setting size to 0 via
+ * __tsan_func_exit(). Therefore we must read
+ * and check size after the other fields.
+ */
+ barrier();
+ size = READ_ONCE(reorder_access->size);
+ if (size)
+ goto again;
+ }
+ }
+
+ /*
+ * Always checked last, right before returning from runtime;
+ * if reorder_access is valid, checked after it was checked.
+ */
+ if (unlikely(ctx->scoped_accesses.prev))
kcsan_check_scoped_accesses();
}
}
@@ -644,6 +808,15 @@ void __init kcsan_init(void)
pr_info("enabled early\n");
WRITE_ONCE(kcsan_enabled, true);
}
+
+ if (IS_ENABLED(CONFIG_KCSAN_REPORT_VALUE_CHANGE_ONLY) ||
+ IS_ENABLED(CONFIG_KCSAN_ASSUME_PLAIN_WRITES_ATOMIC) ||
+ IS_ENABLED(CONFIG_KCSAN_PERMISSIVE) ||
+ IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) {
+ pr_warn("non-strict mode configured - use CONFIG_KCSAN_STRICT=y to see all data races\n");
+ } else {
+ pr_info("strict mode configured\n");
+ }
}
/* === Exported interface =================================================== */
@@ -736,7 +909,7 @@ kcsan_begin_scoped_access(const volatile void *ptr, size_t size, int type,
{
struct kcsan_ctx *ctx = get_ctx();
- __kcsan_check_access(ptr, size, type);
+ check_access(ptr, size, type, _RET_IP_);
ctx->disable_count++; /* Disable KCSAN, in case list debugging is on. */
@@ -744,6 +917,7 @@ kcsan_begin_scoped_access(const volatile void *ptr, size_t size, int type,
sa->ptr = ptr;
sa->size = size;
sa->type = type;
+ sa->ip = _RET_IP_;
if (!ctx->scoped_accesses.prev) /* Lazy initialize list head. */
INIT_LIST_HEAD(&ctx->scoped_accesses);
@@ -775,16 +949,32 @@ void kcsan_end_scoped_access(struct kcsan_scoped_access *sa)
ctx->disable_count--;
- __kcsan_check_access(sa->ptr, sa->size, sa->type);
+ check_access(sa->ptr, sa->size, sa->type, sa->ip);
}
EXPORT_SYMBOL(kcsan_end_scoped_access);
void __kcsan_check_access(const volatile void *ptr, size_t size, int type)
{
- check_access(ptr, size, type);
+ check_access(ptr, size, type, _RET_IP_);
}
EXPORT_SYMBOL(__kcsan_check_access);
+#define DEFINE_MEMORY_BARRIER(name, order_before_cond) \
+ void __kcsan_##name(void) \
+ { \
+ struct kcsan_scoped_access *sa = get_reorder_access(get_ctx()); \
+ if (!sa) \
+ return; \
+ if (order_before_cond) \
+ sa->size = 0; \
+ } \
+ EXPORT_SYMBOL(__kcsan_##name)
+
+DEFINE_MEMORY_BARRIER(mb, true);
+DEFINE_MEMORY_BARRIER(wmb, sa->type & (KCSAN_ACCESS_WRITE | KCSAN_ACCESS_COMPOUND));
+DEFINE_MEMORY_BARRIER(rmb, !(sa->type & KCSAN_ACCESS_WRITE) || (sa->type & KCSAN_ACCESS_COMPOUND));
+DEFINE_MEMORY_BARRIER(release, true);
+
/*
* KCSAN uses the same instrumentation that is emitted by supported compilers
* for ThreadSanitizer (TSAN).
@@ -802,7 +992,7 @@ EXPORT_SYMBOL(__kcsan_check_access);
void __tsan_read##size(void *ptr); \
void __tsan_read##size(void *ptr) \
{ \
- check_access(ptr, size, 0); \
+ check_access(ptr, size, 0, _RET_IP_); \
} \
EXPORT_SYMBOL(__tsan_read##size); \
void __tsan_unaligned_read##size(void *ptr) \
@@ -811,7 +1001,7 @@ EXPORT_SYMBOL(__kcsan_check_access);
void __tsan_write##size(void *ptr); \
void __tsan_write##size(void *ptr) \
{ \
- check_access(ptr, size, KCSAN_ACCESS_WRITE); \
+ check_access(ptr, size, KCSAN_ACCESS_WRITE, _RET_IP_); \
} \
EXPORT_SYMBOL(__tsan_write##size); \
void __tsan_unaligned_write##size(void *ptr) \
@@ -821,7 +1011,8 @@ EXPORT_SYMBOL(__kcsan_check_access);
void __tsan_read_write##size(void *ptr) \
{ \
check_access(ptr, size, \
- KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE); \
+ KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE, \
+ _RET_IP_); \
} \
EXPORT_SYMBOL(__tsan_read_write##size); \
void __tsan_unaligned_read_write##size(void *ptr) \
@@ -837,14 +1028,14 @@ DEFINE_TSAN_READ_WRITE(16);
void __tsan_read_range(void *ptr, size_t size);
void __tsan_read_range(void *ptr, size_t size)
{
- check_access(ptr, size, 0);
+ check_access(ptr, size, 0, _RET_IP_);
}
EXPORT_SYMBOL(__tsan_read_range);
void __tsan_write_range(void *ptr, size_t size);
void __tsan_write_range(void *ptr, size_t size)
{
- check_access(ptr, size, KCSAN_ACCESS_WRITE);
+ check_access(ptr, size, KCSAN_ACCESS_WRITE, _RET_IP_);
}
EXPORT_SYMBOL(__tsan_write_range);
@@ -865,7 +1056,8 @@ EXPORT_SYMBOL(__tsan_write_range);
IS_ALIGNED((unsigned long)ptr, size); \
if (IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS) && is_atomic) \
return; \
- check_access(ptr, size, is_atomic ? KCSAN_ACCESS_ATOMIC : 0); \
+ check_access(ptr, size, is_atomic ? KCSAN_ACCESS_ATOMIC : 0, \
+ _RET_IP_); \
} \
EXPORT_SYMBOL(__tsan_volatile_read##size); \
void __tsan_unaligned_volatile_read##size(void *ptr) \
@@ -880,7 +1072,8 @@ EXPORT_SYMBOL(__tsan_write_range);
return; \
check_access(ptr, size, \
KCSAN_ACCESS_WRITE | \
- (is_atomic ? KCSAN_ACCESS_ATOMIC : 0)); \
+ (is_atomic ? KCSAN_ACCESS_ATOMIC : 0), \
+ _RET_IP_); \
} \
EXPORT_SYMBOL(__tsan_volatile_write##size); \
void __tsan_unaligned_volatile_write##size(void *ptr) \
@@ -894,19 +1087,56 @@ DEFINE_TSAN_VOLATILE_READ_WRITE(8);
DEFINE_TSAN_VOLATILE_READ_WRITE(16);
/*
- * The below are not required by KCSAN, but can still be emitted by the
- * compiler.
+ * Function entry and exit are used to determine the validty of reorder_access.
+ * Reordering of the access ends at the end of the function scope where the
+ * access happened. This is done for two reasons:
+ *
+ * 1. Artificially limits the scope where missing barriers are detected.
+ * This minimizes false positives due to uninstrumented functions that
+ * contain the required barriers but were missed.
+ *
+ * 2. Simplifies generating the stack trace of the access.
*/
void __tsan_func_entry(void *call_pc);
-void __tsan_func_entry(void *call_pc)
+noinline void __tsan_func_entry(void *call_pc)
{
+ if (!IS_ENABLED(CONFIG_KCSAN_WEAK_MEMORY))
+ return;
+
+ add_kcsan_stack_depth(1);
}
EXPORT_SYMBOL(__tsan_func_entry);
+
void __tsan_func_exit(void);
-void __tsan_func_exit(void)
+noinline void __tsan_func_exit(void)
{
+ struct kcsan_scoped_access *reorder_access;
+
+ if (!IS_ENABLED(CONFIG_KCSAN_WEAK_MEMORY))
+ return;
+
+ reorder_access = get_reorder_access(get_ctx());
+ if (!reorder_access)
+ goto out;
+
+ if (get_kcsan_stack_depth() <= reorder_access->stack_depth) {
+ /*
+ * Access check to catch cases where write without a barrier
+ * (supposed release) was last access in function: because
+ * instrumentation is inserted before the real access, a data
+ * race due to the write giving up a c-s would only be caught if
+ * we do the conflicting access after.
+ */
+ check_access(reorder_access->ptr, reorder_access->size,
+ reorder_access->type, reorder_access->ip);
+ reorder_access->size = 0;
+ reorder_access->stack_depth = INT_MIN;
+ }
+out:
+ add_kcsan_stack_depth(-1);
}
EXPORT_SYMBOL(__tsan_func_exit);
+
void __tsan_init(void);
void __tsan_init(void)
{
@@ -929,12 +1159,21 @@ EXPORT_SYMBOL(__tsan_init);
* functions, whose job is to also execute the operation itself.
*/
+static __always_inline void kcsan_atomic_builtin_memorder(int memorder)
+{
+ if (memorder == __ATOMIC_RELEASE ||
+ memorder == __ATOMIC_SEQ_CST ||
+ memorder == __ATOMIC_ACQ_REL)
+ __kcsan_release();
+}
+
#define DEFINE_TSAN_ATOMIC_LOAD_STORE(bits) \
u##bits __tsan_atomic##bits##_load(const u##bits *ptr, int memorder); \
u##bits __tsan_atomic##bits##_load(const u##bits *ptr, int memorder) \
{ \
+ kcsan_atomic_builtin_memorder(memorder); \
if (!IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { \
- check_access(ptr, bits / BITS_PER_BYTE, KCSAN_ACCESS_ATOMIC); \
+ check_access(ptr, bits / BITS_PER_BYTE, KCSAN_ACCESS_ATOMIC, _RET_IP_); \
} \
return __atomic_load_n(ptr, memorder); \
} \
@@ -942,9 +1181,10 @@ EXPORT_SYMBOL(__tsan_init);
void __tsan_atomic##bits##_store(u##bits *ptr, u##bits v, int memorder); \
void __tsan_atomic##bits##_store(u##bits *ptr, u##bits v, int memorder) \
{ \
+ kcsan_atomic_builtin_memorder(memorder); \
if (!IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { \
check_access(ptr, bits / BITS_PER_BYTE, \
- KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC); \
+ KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC, _RET_IP_); \
} \
__atomic_store_n(ptr, v, memorder); \
} \
@@ -954,10 +1194,11 @@ EXPORT_SYMBOL(__tsan_init);
u##bits __tsan_atomic##bits##_##op(u##bits *ptr, u##bits v, int memorder); \
u##bits __tsan_atomic##bits##_##op(u##bits *ptr, u##bits v, int memorder) \
{ \
+ kcsan_atomic_builtin_memorder(memorder); \
if (!IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { \
check_access(ptr, bits / BITS_PER_BYTE, \
KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | \
- KCSAN_ACCESS_ATOMIC); \
+ KCSAN_ACCESS_ATOMIC, _RET_IP_); \
} \
return __atomic_##op##suffix(ptr, v, memorder); \
} \
@@ -986,10 +1227,11 @@ EXPORT_SYMBOL(__tsan_init);
int __tsan_atomic##bits##_compare_exchange_##strength(u##bits *ptr, u##bits *exp, \
u##bits val, int mo, int fail_mo) \
{ \
+ kcsan_atomic_builtin_memorder(mo); \
if (!IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { \
check_access(ptr, bits / BITS_PER_BYTE, \
KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | \
- KCSAN_ACCESS_ATOMIC); \
+ KCSAN_ACCESS_ATOMIC, _RET_IP_); \
} \
return __atomic_compare_exchange_n(ptr, exp, val, weak, mo, fail_mo); \
} \
@@ -1001,10 +1243,11 @@ EXPORT_SYMBOL(__tsan_init);
u##bits __tsan_atomic##bits##_compare_exchange_val(u##bits *ptr, u##bits exp, u##bits val, \
int mo, int fail_mo) \
{ \
+ kcsan_atomic_builtin_memorder(mo); \
if (!IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { \
check_access(ptr, bits / BITS_PER_BYTE, \
KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | \
- KCSAN_ACCESS_ATOMIC); \
+ KCSAN_ACCESS_ATOMIC, _RET_IP_); \
} \
__atomic_compare_exchange_n(ptr, &exp, val, 0, mo, fail_mo); \
return exp; \
@@ -1032,10 +1275,95 @@ DEFINE_TSAN_ATOMIC_OPS(64);
void __tsan_atomic_thread_fence(int memorder);
void __tsan_atomic_thread_fence(int memorder)
{
+ kcsan_atomic_builtin_memorder(memorder);
__atomic_thread_fence(memorder);
}
EXPORT_SYMBOL(__tsan_atomic_thread_fence);
+/*
+ * In instrumented files, we emit instrumentation for barriers by mapping the
+ * kernel barriers to an __atomic_signal_fence(), which is interpreted specially
+ * and otherwise has no relation to a real __atomic_signal_fence(). No known
+ * kernel code uses __atomic_signal_fence().
+ *
+ * Since fsanitize=thread instrumentation handles __atomic_signal_fence(), which
+ * are turned into calls to __tsan_atomic_signal_fence(), such instrumentation
+ * can be disabled via the __no_kcsan function attribute (vs. an explicit call
+ * which could not). When __no_kcsan is requested, __atomic_signal_fence()
+ * generates no code.
+ *
+ * Note: The result of using __atomic_signal_fence() with KCSAN enabled is
+ * potentially limiting the compiler's ability to reorder operations; however,
+ * if barriers were instrumented with explicit calls (without LTO), the compiler
+ * couldn't optimize much anyway. The result of a hypothetical architecture
+ * using __atomic_signal_fence() in normal code would be KCSAN false negatives.
+ */
void __tsan_atomic_signal_fence(int memorder);
-void __tsan_atomic_signal_fence(int memorder) { }
+noinline void __tsan_atomic_signal_fence(int memorder)
+{
+ switch (memorder) {
+ case __KCSAN_BARRIER_TO_SIGNAL_FENCE_mb:
+ __kcsan_mb();
+ break;
+ case __KCSAN_BARRIER_TO_SIGNAL_FENCE_wmb:
+ __kcsan_wmb();
+ break;
+ case __KCSAN_BARRIER_TO_SIGNAL_FENCE_rmb:
+ __kcsan_rmb();
+ break;
+ case __KCSAN_BARRIER_TO_SIGNAL_FENCE_release:
+ __kcsan_release();
+ break;
+ default:
+ break;
+ }
+}
EXPORT_SYMBOL(__tsan_atomic_signal_fence);
+
+#ifdef __HAVE_ARCH_MEMSET
+void *__tsan_memset(void *s, int c, size_t count);
+noinline void *__tsan_memset(void *s, int c, size_t count)
+{
+ /*
+ * Instead of not setting up watchpoints where accessed size is greater
+ * than MAX_ENCODABLE_SIZE, truncate checked size to MAX_ENCODABLE_SIZE.
+ */
+ size_t check_len = min_t(size_t, count, MAX_ENCODABLE_SIZE);
+
+ check_access(s, check_len, KCSAN_ACCESS_WRITE, _RET_IP_);
+ return memset(s, c, count);
+}
+#else
+void *__tsan_memset(void *s, int c, size_t count) __alias(memset);
+#endif
+EXPORT_SYMBOL(__tsan_memset);
+
+#ifdef __HAVE_ARCH_MEMMOVE
+void *__tsan_memmove(void *dst, const void *src, size_t len);
+noinline void *__tsan_memmove(void *dst, const void *src, size_t len)
+{
+ size_t check_len = min_t(size_t, len, MAX_ENCODABLE_SIZE);
+
+ check_access(dst, check_len, KCSAN_ACCESS_WRITE, _RET_IP_);
+ check_access(src, check_len, 0, _RET_IP_);
+ return memmove(dst, src, len);
+}
+#else
+void *__tsan_memmove(void *dst, const void *src, size_t len) __alias(memmove);
+#endif
+EXPORT_SYMBOL(__tsan_memmove);
+
+#ifdef __HAVE_ARCH_MEMCPY
+void *__tsan_memcpy(void *dst, const void *src, size_t len);
+noinline void *__tsan_memcpy(void *dst, const void *src, size_t len)
+{
+ size_t check_len = min_t(size_t, len, MAX_ENCODABLE_SIZE);
+
+ check_access(dst, check_len, KCSAN_ACCESS_WRITE, _RET_IP_);
+ check_access(src, check_len, 0, _RET_IP_);
+ return memcpy(dst, src, len);
+}
+#else
+void *__tsan_memcpy(void *dst, const void *src, size_t len) __alias(memcpy);
+#endif
+EXPORT_SYMBOL(__tsan_memcpy);
diff --git a/kernel/kcsan/debugfs.c b/kernel/kcsan/debugfs.c
index e65de172ccf7..1d1d1b0e4248 100644
--- a/kernel/kcsan/debugfs.c
+++ b/kernel/kcsan/debugfs.c
@@ -64,7 +64,7 @@ static noinline void microbenchmark(unsigned long iters)
{
const struct kcsan_ctx ctx_save = current->kcsan_ctx;
const bool was_enabled = READ_ONCE(kcsan_enabled);
- cycles_t cycles;
+ u64 cycles;
/* We may have been called from an atomic region; reset context. */
memset(&current->kcsan_ctx, 0, sizeof(current->kcsan_ctx));
diff --git a/kernel/kcsan/kcsan.h b/kernel/kcsan/kcsan.h
index f36e25c497ed..ae33c2a7f07e 100644
--- a/kernel/kcsan/kcsan.h
+++ b/kernel/kcsan/kcsan.h
@@ -121,7 +121,7 @@ enum kcsan_value_change {
* to be consumed by the reporting thread. No report is printed yet.
*/
void kcsan_report_set_info(const volatile void *ptr, size_t size, int access_type,
- int watchpoint_idx);
+ unsigned long ip, int watchpoint_idx);
/*
* The calling thread observed that the watchpoint it set up was hit and
@@ -129,14 +129,14 @@ void kcsan_report_set_info(const volatile void *ptr, size_t size, int access_typ
* thread.
*/
void kcsan_report_known_origin(const volatile void *ptr, size_t size, int access_type,
- enum kcsan_value_change value_change, int watchpoint_idx,
- u64 old, u64 new, u64 mask);
+ unsigned long ip, enum kcsan_value_change value_change,
+ int watchpoint_idx, u64 old, u64 new, u64 mask);
/*
* No other thread was observed to race with the access, but the data value
* before and after the stall differs. Reports a race of "unknown origin".
*/
void kcsan_report_unknown_origin(const volatile void *ptr, size_t size, int access_type,
- u64 old, u64 new, u64 mask);
+ unsigned long ip, u64 old, u64 new, u64 mask);
#endif /* _KERNEL_KCSAN_KCSAN_H */
diff --git a/kernel/kcsan/kcsan_test.c b/kernel/kcsan/kcsan_test.c
index 8bcffbdef3d3..0ddbdab5903d 100644
--- a/kernel/kcsan/kcsan_test.c
+++ b/kernel/kcsan/kcsan_test.c
@@ -16,9 +16,12 @@
#define pr_fmt(fmt) "kcsan_test: " fmt
#include <kunit/test.h>
+#include <linux/atomic.h>
+#include <linux/bitops.h>
#include <linux/jiffies.h>
#include <linux/kcsan-checks.h>
#include <linux/kernel.h>
+#include <linux/mutex.h>
#include <linux/sched.h>
#include <linux/seqlock.h>
#include <linux/spinlock.h>
@@ -29,6 +32,11 @@
#include <linux/types.h>
#include <trace/events/printk.h>
+#define KCSAN_TEST_REQUIRES(test, cond) do { \
+ if (!(cond)) \
+ kunit_skip((test), "Test requires: " #cond); \
+} while (0)
+
#ifdef CONFIG_CC_HAS_TSAN_COMPOUND_READ_BEFORE_WRITE
#define __KCSAN_ACCESS_RW(alt) (KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE)
#else
@@ -146,12 +154,12 @@ struct expect_report {
/* Check observed report matches information in @r. */
__no_kcsan
-static bool report_matches(const struct expect_report *r)
+static bool __report_matches(const struct expect_report *r)
{
const bool is_assert = (r->access[0].type | r->access[1].type) & KCSAN_ACCESS_ASSERT;
bool ret = false;
unsigned long flags;
- typeof(observed.lines) expect;
+ typeof(*observed.lines) *expect;
const char *end;
char *cur;
int i;
@@ -160,6 +168,10 @@ static bool report_matches(const struct expect_report *r)
if (!report_available())
return false;
+ expect = kmalloc(sizeof(observed.lines), GFP_KERNEL);
+ if (WARN_ON(!expect))
+ return false;
+
/* Generate expected report contents. */
/* Title */
@@ -205,10 +217,12 @@ static bool report_matches(const struct expect_report *r)
"read-write" :
"write") :
"read");
+ const bool is_atomic = (ty & KCSAN_ACCESS_ATOMIC);
+ const bool is_scoped = (ty & KCSAN_ACCESS_SCOPED);
const char *const access_type_aux =
- (ty & KCSAN_ACCESS_ATOMIC) ?
- " (marked)" :
- ((ty & KCSAN_ACCESS_SCOPED) ? " (scoped)" : "");
+ (is_atomic && is_scoped) ? " (marked, reordered)"
+ : (is_atomic ? " (marked)"
+ : (is_scoped ? " (reordered)" : ""));
if (i == 1) {
/* Access 2 */
@@ -243,9 +257,44 @@ static bool report_matches(const struct expect_report *r)
strstr(observed.lines[2], expect[1])));
out:
spin_unlock_irqrestore(&observed.lock, flags);
+ kfree(expect);
return ret;
}
+static __always_inline const struct expect_report *
+__report_set_scoped(struct expect_report *r, int accesses)
+{
+ BUILD_BUG_ON(accesses > 3);
+
+ if (accesses & 1)
+ r->access[0].type |= KCSAN_ACCESS_SCOPED;
+ else
+ r->access[0].type &= ~KCSAN_ACCESS_SCOPED;
+
+ if (accesses & 2)
+ r->access[1].type |= KCSAN_ACCESS_SCOPED;
+ else
+ r->access[1].type &= ~KCSAN_ACCESS_SCOPED;
+
+ return r;
+}
+
+__no_kcsan
+static bool report_matches_any_reordered(struct expect_report *r)
+{
+ return __report_matches(__report_set_scoped(r, 0)) ||
+ __report_matches(__report_set_scoped(r, 1)) ||
+ __report_matches(__report_set_scoped(r, 2)) ||
+ __report_matches(__report_set_scoped(r, 3));
+}
+
+#ifdef CONFIG_KCSAN_WEAK_MEMORY
+/* Due to reordering accesses, any access may appear as "(reordered)". */
+#define report_matches report_matches_any_reordered
+#else
+#define report_matches __report_matches
+#endif
+
/* ===== Test kernels ===== */
static long test_sink;
@@ -256,6 +305,8 @@ static struct {
long val[8];
} test_struct;
static DEFINE_SEQLOCK(test_seqlock);
+static DEFINE_SPINLOCK(test_spinlock);
+static DEFINE_MUTEX(test_mutex);
/*
* Helper to avoid compiler optimizing out reads, and to generate source values
@@ -264,6 +315,16 @@ static DEFINE_SEQLOCK(test_seqlock);
__no_kcsan
static noinline void sink_value(long v) { WRITE_ONCE(test_sink, v); }
+/*
+ * Generates a delay and some accesses that enter the runtime but do not produce
+ * data races.
+ */
+static noinline void test_delay(int iter)
+{
+ while (iter--)
+ sink_value(READ_ONCE(test_sink));
+}
+
static noinline void test_kernel_read(void) { sink_value(test_var); }
static noinline void test_kernel_write(void)
@@ -333,7 +394,10 @@ static noinline void test_kernel_assert_bits_nochange(void)
ASSERT_EXCLUSIVE_BITS(test_var, ~TEST_CHANGE_BITS);
}
-/* To check that scoped assertions do trigger anywhere in scope. */
+/*
+ * Scoped assertions do trigger anywhere in scope. However, the report should
+ * still only point at the start of the scope.
+ */
static noinline void test_enter_scope(void)
{
int x = 0;
@@ -414,19 +478,247 @@ static noinline void test_kernel_atomic_builtins(void)
__atomic_load_n(&test_var, __ATOMIC_RELAXED);
}
+static noinline void test_kernel_xor_1bit(void)
+{
+ /* Do not report data races between the read-writes. */
+ kcsan_nestable_atomic_begin();
+ test_var ^= 0x10000;
+ kcsan_nestable_atomic_end();
+}
+
+#define TEST_KERNEL_LOCKED(name, acquire, release) \
+ static noinline void test_kernel_##name(void) \
+ { \
+ long *flag = &test_struct.val[0]; \
+ long v = 0; \
+ if (!(acquire)) \
+ return; \
+ while (v++ < 100) { \
+ test_var++; \
+ barrier(); \
+ } \
+ release; \
+ test_delay(10); \
+ }
+
+TEST_KERNEL_LOCKED(with_memorder,
+ cmpxchg_acquire(flag, 0, 1) == 0,
+ smp_store_release(flag, 0));
+TEST_KERNEL_LOCKED(wrong_memorder,
+ cmpxchg_relaxed(flag, 0, 1) == 0,
+ WRITE_ONCE(*flag, 0));
+TEST_KERNEL_LOCKED(atomic_builtin_with_memorder,
+ __atomic_compare_exchange_n(flag, &v, 1, 0, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED),
+ __atomic_store_n(flag, 0, __ATOMIC_RELEASE));
+TEST_KERNEL_LOCKED(atomic_builtin_wrong_memorder,
+ __atomic_compare_exchange_n(flag, &v, 1, 0, __ATOMIC_RELAXED, __ATOMIC_RELAXED),
+ __atomic_store_n(flag, 0, __ATOMIC_RELAXED));
+
/* ===== Test cases ===== */
+/*
+ * Tests that various barriers have the expected effect on internal state. Not
+ * exhaustive on atomic_t operations. Unlike the selftest, also checks for
+ * too-strict barrier instrumentation; these can be tolerated, because it does
+ * not cause false positives, but at least we should be aware of such cases.
+ */
+static void test_barrier_nothreads(struct kunit *test)
+{
+#ifdef CONFIG_KCSAN_WEAK_MEMORY
+ struct kcsan_scoped_access *reorder_access = &current->kcsan_ctx.reorder_access;
+#else
+ struct kcsan_scoped_access *reorder_access = NULL;
+#endif
+ arch_spinlock_t arch_spinlock = __ARCH_SPIN_LOCK_UNLOCKED;
+ atomic_t dummy;
+
+ KCSAN_TEST_REQUIRES(test, reorder_access != NULL);
+ KCSAN_TEST_REQUIRES(test, IS_ENABLED(CONFIG_SMP));
+
+#define __KCSAN_EXPECT_BARRIER(access_type, barrier, order_before, name) \
+ do { \
+ reorder_access->type = (access_type) | KCSAN_ACCESS_SCOPED; \
+ reorder_access->size = sizeof(test_var); \
+ barrier; \
+ KUNIT_EXPECT_EQ_MSG(test, reorder_access->size, \
+ order_before ? 0 : sizeof(test_var), \
+ "improperly instrumented type=(" #access_type "): " name); \
+ } while (0)
+#define KCSAN_EXPECT_READ_BARRIER(b, o) __KCSAN_EXPECT_BARRIER(0, b, o, #b)
+#define KCSAN_EXPECT_WRITE_BARRIER(b, o) __KCSAN_EXPECT_BARRIER(KCSAN_ACCESS_WRITE, b, o, #b)
+#define KCSAN_EXPECT_RW_BARRIER(b, o) __KCSAN_EXPECT_BARRIER(KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE, b, o, #b)
+
+ /*
+ * Lockdep initialization can strengthen certain locking operations due
+ * to calling into instrumented files; "warm up" our locks.
+ */
+ spin_lock(&test_spinlock);
+ spin_unlock(&test_spinlock);
+ mutex_lock(&test_mutex);
+ mutex_unlock(&test_mutex);
+
+ /* Force creating a valid entry in reorder_access first. */
+ test_var = 0;
+ while (test_var++ < 1000000 && reorder_access->size != sizeof(test_var))
+ __kcsan_check_read(&test_var, sizeof(test_var));
+ KUNIT_ASSERT_EQ(test, reorder_access->size, sizeof(test_var));
+
+ kcsan_nestable_atomic_begin(); /* No watchpoints in called functions. */
+
+ KCSAN_EXPECT_READ_BARRIER(mb(), true);
+ KCSAN_EXPECT_READ_BARRIER(wmb(), false);
+ KCSAN_EXPECT_READ_BARRIER(rmb(), true);
+ KCSAN_EXPECT_READ_BARRIER(smp_mb(), true);
+ KCSAN_EXPECT_READ_BARRIER(smp_wmb(), false);
+ KCSAN_EXPECT_READ_BARRIER(smp_rmb(), true);
+ KCSAN_EXPECT_READ_BARRIER(dma_wmb(), false);
+ KCSAN_EXPECT_READ_BARRIER(dma_rmb(), true);
+ KCSAN_EXPECT_READ_BARRIER(smp_mb__before_atomic(), true);
+ KCSAN_EXPECT_READ_BARRIER(smp_mb__after_atomic(), true);
+ KCSAN_EXPECT_READ_BARRIER(smp_mb__after_spinlock(), true);
+ KCSAN_EXPECT_READ_BARRIER(smp_store_mb(test_var, 0), true);
+ KCSAN_EXPECT_READ_BARRIER(smp_load_acquire(&test_var), false);
+ KCSAN_EXPECT_READ_BARRIER(smp_store_release(&test_var, 0), true);
+ KCSAN_EXPECT_READ_BARRIER(xchg(&test_var, 0), true);
+ KCSAN_EXPECT_READ_BARRIER(xchg_release(&test_var, 0), true);
+ KCSAN_EXPECT_READ_BARRIER(xchg_relaxed(&test_var, 0), false);
+ KCSAN_EXPECT_READ_BARRIER(cmpxchg(&test_var, 0, 0), true);
+ KCSAN_EXPECT_READ_BARRIER(cmpxchg_release(&test_var, 0, 0), true);
+ KCSAN_EXPECT_READ_BARRIER(cmpxchg_relaxed(&test_var, 0, 0), false);
+ KCSAN_EXPECT_READ_BARRIER(atomic_read(&dummy), false);
+ KCSAN_EXPECT_READ_BARRIER(atomic_read_acquire(&dummy), false);
+ KCSAN_EXPECT_READ_BARRIER(atomic_set(&dummy, 0), false);
+ KCSAN_EXPECT_READ_BARRIER(atomic_set_release(&dummy, 0), true);
+ KCSAN_EXPECT_READ_BARRIER(atomic_add(1, &dummy), false);
+ KCSAN_EXPECT_READ_BARRIER(atomic_add_return(1, &dummy), true);
+ KCSAN_EXPECT_READ_BARRIER(atomic_add_return_acquire(1, &dummy), false);
+ KCSAN_EXPECT_READ_BARRIER(atomic_add_return_release(1, &dummy), true);
+ KCSAN_EXPECT_READ_BARRIER(atomic_add_return_relaxed(1, &dummy), false);
+ KCSAN_EXPECT_READ_BARRIER(atomic_fetch_add(1, &dummy), true);
+ KCSAN_EXPECT_READ_BARRIER(atomic_fetch_add_acquire(1, &dummy), false);
+ KCSAN_EXPECT_READ_BARRIER(atomic_fetch_add_release(1, &dummy), true);
+ KCSAN_EXPECT_READ_BARRIER(atomic_fetch_add_relaxed(1, &dummy), false);
+ KCSAN_EXPECT_READ_BARRIER(test_and_set_bit(0, &test_var), true);
+ KCSAN_EXPECT_READ_BARRIER(test_and_clear_bit(0, &test_var), true);
+ KCSAN_EXPECT_READ_BARRIER(test_and_change_bit(0, &test_var), true);
+ KCSAN_EXPECT_READ_BARRIER(clear_bit_unlock(0, &test_var), true);
+ KCSAN_EXPECT_READ_BARRIER(__clear_bit_unlock(0, &test_var), true);
+ KCSAN_EXPECT_READ_BARRIER(arch_spin_lock(&arch_spinlock), false);
+ KCSAN_EXPECT_READ_BARRIER(arch_spin_unlock(&arch_spinlock), true);
+ KCSAN_EXPECT_READ_BARRIER(spin_lock(&test_spinlock), false);
+ KCSAN_EXPECT_READ_BARRIER(spin_unlock(&test_spinlock), true);
+ KCSAN_EXPECT_READ_BARRIER(mutex_lock(&test_mutex), false);
+ KCSAN_EXPECT_READ_BARRIER(mutex_unlock(&test_mutex), true);
+
+ KCSAN_EXPECT_WRITE_BARRIER(mb(), true);
+ KCSAN_EXPECT_WRITE_BARRIER(wmb(), true);
+ KCSAN_EXPECT_WRITE_BARRIER(rmb(), false);
+ KCSAN_EXPECT_WRITE_BARRIER(smp_mb(), true);
+ KCSAN_EXPECT_WRITE_BARRIER(smp_wmb(), true);
+ KCSAN_EXPECT_WRITE_BARRIER(smp_rmb(), false);
+ KCSAN_EXPECT_WRITE_BARRIER(dma_wmb(), true);
+ KCSAN_EXPECT_WRITE_BARRIER(dma_rmb(), false);
+ KCSAN_EXPECT_WRITE_BARRIER(smp_mb__before_atomic(), true);
+ KCSAN_EXPECT_WRITE_BARRIER(smp_mb__after_atomic(), true);
+ KCSAN_EXPECT_WRITE_BARRIER(smp_mb__after_spinlock(), true);
+ KCSAN_EXPECT_WRITE_BARRIER(smp_store_mb(test_var, 0), true);
+ KCSAN_EXPECT_WRITE_BARRIER(smp_load_acquire(&test_var), false);
+ KCSAN_EXPECT_WRITE_BARRIER(smp_store_release(&test_var, 0), true);
+ KCSAN_EXPECT_WRITE_BARRIER(xchg(&test_var, 0), true);
+ KCSAN_EXPECT_WRITE_BARRIER(xchg_release(&test_var, 0), true);
+ KCSAN_EXPECT_WRITE_BARRIER(xchg_relaxed(&test_var, 0), false);
+ KCSAN_EXPECT_WRITE_BARRIER(cmpxchg(&test_var, 0, 0), true);
+ KCSAN_EXPECT_WRITE_BARRIER(cmpxchg_release(&test_var, 0, 0), true);
+ KCSAN_EXPECT_WRITE_BARRIER(cmpxchg_relaxed(&test_var, 0, 0), false);
+ KCSAN_EXPECT_WRITE_BARRIER(atomic_read(&dummy), false);
+ KCSAN_EXPECT_WRITE_BARRIER(atomic_read_acquire(&dummy), false);
+ KCSAN_EXPECT_WRITE_BARRIER(atomic_set(&dummy, 0), false);
+ KCSAN_EXPECT_WRITE_BARRIER(atomic_set_release(&dummy, 0), true);
+ KCSAN_EXPECT_WRITE_BARRIER(atomic_add(1, &dummy), false);
+ KCSAN_EXPECT_WRITE_BARRIER(atomic_add_return(1, &dummy), true);
+ KCSAN_EXPECT_WRITE_BARRIER(atomic_add_return_acquire(1, &dummy), false);
+ KCSAN_EXPECT_WRITE_BARRIER(atomic_add_return_release(1, &dummy), true);
+ KCSAN_EXPECT_WRITE_BARRIER(atomic_add_return_relaxed(1, &dummy), false);
+ KCSAN_EXPECT_WRITE_BARRIER(atomic_fetch_add(1, &dummy), true);
+ KCSAN_EXPECT_WRITE_BARRIER(atomic_fetch_add_acquire(1, &dummy), false);
+ KCSAN_EXPECT_WRITE_BARRIER(atomic_fetch_add_release(1, &dummy), true);
+ KCSAN_EXPECT_WRITE_BARRIER(atomic_fetch_add_relaxed(1, &dummy), false);
+ KCSAN_EXPECT_WRITE_BARRIER(test_and_set_bit(0, &test_var), true);
+ KCSAN_EXPECT_WRITE_BARRIER(test_and_clear_bit(0, &test_var), true);
+ KCSAN_EXPECT_WRITE_BARRIER(test_and_change_bit(0, &test_var), true);
+ KCSAN_EXPECT_WRITE_BARRIER(clear_bit_unlock(0, &test_var), true);
+ KCSAN_EXPECT_WRITE_BARRIER(__clear_bit_unlock(0, &test_var), true);
+ KCSAN_EXPECT_WRITE_BARRIER(arch_spin_lock(&arch_spinlock), false);
+ KCSAN_EXPECT_WRITE_BARRIER(arch_spin_unlock(&arch_spinlock), true);
+ KCSAN_EXPECT_WRITE_BARRIER(spin_lock(&test_spinlock), false);
+ KCSAN_EXPECT_WRITE_BARRIER(spin_unlock(&test_spinlock), true);
+ KCSAN_EXPECT_WRITE_BARRIER(mutex_lock(&test_mutex), false);
+ KCSAN_EXPECT_WRITE_BARRIER(mutex_unlock(&test_mutex), true);
+
+ KCSAN_EXPECT_RW_BARRIER(mb(), true);
+ KCSAN_EXPECT_RW_BARRIER(wmb(), true);
+ KCSAN_EXPECT_RW_BARRIER(rmb(), true);
+ KCSAN_EXPECT_RW_BARRIER(smp_mb(), true);
+ KCSAN_EXPECT_RW_BARRIER(smp_wmb(), true);
+ KCSAN_EXPECT_RW_BARRIER(smp_rmb(), true);
+ KCSAN_EXPECT_RW_BARRIER(dma_wmb(), true);
+ KCSAN_EXPECT_RW_BARRIER(dma_rmb(), true);
+ KCSAN_EXPECT_RW_BARRIER(smp_mb__before_atomic(), true);
+ KCSAN_EXPECT_RW_BARRIER(smp_mb__after_atomic(), true);
+ KCSAN_EXPECT_RW_BARRIER(smp_mb__after_spinlock(), true);
+ KCSAN_EXPECT_RW_BARRIER(smp_store_mb(test_var, 0), true);
+ KCSAN_EXPECT_RW_BARRIER(smp_load_acquire(&test_var), false);
+ KCSAN_EXPECT_RW_BARRIER(smp_store_release(&test_var, 0), true);
+ KCSAN_EXPECT_RW_BARRIER(xchg(&test_var, 0), true);
+ KCSAN_EXPECT_RW_BARRIER(xchg_release(&test_var, 0), true);
+ KCSAN_EXPECT_RW_BARRIER(xchg_relaxed(&test_var, 0), false);
+ KCSAN_EXPECT_RW_BARRIER(cmpxchg(&test_var, 0, 0), true);
+ KCSAN_EXPECT_RW_BARRIER(cmpxchg_release(&test_var, 0, 0), true);
+ KCSAN_EXPECT_RW_BARRIER(cmpxchg_relaxed(&test_var, 0, 0), false);
+ KCSAN_EXPECT_RW_BARRIER(atomic_read(&dummy), false);
+ KCSAN_EXPECT_RW_BARRIER(atomic_read_acquire(&dummy), false);
+ KCSAN_EXPECT_RW_BARRIER(atomic_set(&dummy, 0), false);
+ KCSAN_EXPECT_RW_BARRIER(atomic_set_release(&dummy, 0), true);
+ KCSAN_EXPECT_RW_BARRIER(atomic_add(1, &dummy), false);
+ KCSAN_EXPECT_RW_BARRIER(atomic_add_return(1, &dummy), true);
+ KCSAN_EXPECT_RW_BARRIER(atomic_add_return_acquire(1, &dummy), false);
+ KCSAN_EXPECT_RW_BARRIER(atomic_add_return_release(1, &dummy), true);
+ KCSAN_EXPECT_RW_BARRIER(atomic_add_return_relaxed(1, &dummy), false);
+ KCSAN_EXPECT_RW_BARRIER(atomic_fetch_add(1, &dummy), true);
+ KCSAN_EXPECT_RW_BARRIER(atomic_fetch_add_acquire(1, &dummy), false);
+ KCSAN_EXPECT_RW_BARRIER(atomic_fetch_add_release(1, &dummy), true);
+ KCSAN_EXPECT_RW_BARRIER(atomic_fetch_add_relaxed(1, &dummy), false);
+ KCSAN_EXPECT_RW_BARRIER(test_and_set_bit(0, &test_var), true);
+ KCSAN_EXPECT_RW_BARRIER(test_and_clear_bit(0, &test_var), true);
+ KCSAN_EXPECT_RW_BARRIER(test_and_change_bit(0, &test_var), true);
+ KCSAN_EXPECT_RW_BARRIER(clear_bit_unlock(0, &test_var), true);
+ KCSAN_EXPECT_RW_BARRIER(__clear_bit_unlock(0, &test_var), true);
+ KCSAN_EXPECT_RW_BARRIER(arch_spin_lock(&arch_spinlock), false);
+ KCSAN_EXPECT_RW_BARRIER(arch_spin_unlock(&arch_spinlock), true);
+ KCSAN_EXPECT_RW_BARRIER(spin_lock(&test_spinlock), false);
+ KCSAN_EXPECT_RW_BARRIER(spin_unlock(&test_spinlock), true);
+ KCSAN_EXPECT_RW_BARRIER(mutex_lock(&test_mutex), false);
+ KCSAN_EXPECT_RW_BARRIER(mutex_unlock(&test_mutex), true);
+
+#ifdef clear_bit_unlock_is_negative_byte
+ KCSAN_EXPECT_READ_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var), true);
+ KCSAN_EXPECT_WRITE_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var), true);
+ KCSAN_EXPECT_RW_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var), true);
+#endif
+ kcsan_nestable_atomic_end();
+}
+
/* Simple test with normal data race. */
__no_kcsan
static void test_basic(struct kunit *test)
{
- const struct expect_report expect = {
+ struct expect_report expect = {
.access = {
{ test_kernel_write, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
{ test_kernel_read, &test_var, sizeof(test_var), 0 },
},
};
- static const struct expect_report never = {
+ struct expect_report never = {
.access = {
{ test_kernel_read, &test_var, sizeof(test_var), 0 },
{ test_kernel_read, &test_var, sizeof(test_var), 0 },
@@ -451,14 +743,14 @@ static void test_basic(struct kunit *test)
__no_kcsan
static void test_concurrent_races(struct kunit *test)
{
- const struct expect_report expect = {
+ struct expect_report expect = {
.access = {
/* NULL will match any address. */
{ test_kernel_rmw_array, NULL, 0, __KCSAN_ACCESS_RW(KCSAN_ACCESS_WRITE) },
{ test_kernel_rmw_array, NULL, 0, __KCSAN_ACCESS_RW(0) },
},
};
- static const struct expect_report never = {
+ struct expect_report never = {
.access = {
{ test_kernel_rmw_array, NULL, 0, 0 },
{ test_kernel_rmw_array, NULL, 0, 0 },
@@ -480,17 +772,24 @@ static void test_concurrent_races(struct kunit *test)
__no_kcsan
static void test_novalue_change(struct kunit *test)
{
- const struct expect_report expect = {
+ struct expect_report expect_rw = {
.access = {
{ test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
{ test_kernel_read, &test_var, sizeof(test_var), 0 },
},
};
+ struct expect_report expect_ww = {
+ .access = {
+ { test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
+ { test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
+ },
+ };
bool match_expect = false;
+ test_kernel_write_nochange(); /* Reset value. */
begin_test_checks(test_kernel_write_nochange, test_kernel_read);
do {
- match_expect = report_matches(&expect);
+ match_expect = report_matches(&expect_rw) || report_matches(&expect_ww);
} while (!end_test_checks(match_expect));
if (IS_ENABLED(CONFIG_KCSAN_REPORT_VALUE_CHANGE_ONLY))
KUNIT_EXPECT_FALSE(test, match_expect);
@@ -505,17 +804,24 @@ static void test_novalue_change(struct kunit *test)
__no_kcsan
static void test_novalue_change_exception(struct kunit *test)
{
- const struct expect_report expect = {
+ struct expect_report expect_rw = {
.access = {
{ test_kernel_write_nochange_rcu, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
{ test_kernel_read, &test_var, sizeof(test_var), 0 },
},
};
+ struct expect_report expect_ww = {
+ .access = {
+ { test_kernel_write_nochange_rcu, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
+ { test_kernel_write_nochange_rcu, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
+ },
+ };
bool match_expect = false;
+ test_kernel_write_nochange_rcu(); /* Reset value. */
begin_test_checks(test_kernel_write_nochange_rcu, test_kernel_read);
do {
- match_expect = report_matches(&expect);
+ match_expect = report_matches(&expect_rw) || report_matches(&expect_ww);
} while (!end_test_checks(match_expect));
KUNIT_EXPECT_TRUE(test, match_expect);
}
@@ -524,7 +830,7 @@ static void test_novalue_change_exception(struct kunit *test)
__no_kcsan
static void test_unknown_origin(struct kunit *test)
{
- const struct expect_report expect = {
+ struct expect_report expect = {
.access = {
{ test_kernel_read, &test_var, sizeof(test_var), 0 },
{ NULL },
@@ -546,7 +852,7 @@ static void test_unknown_origin(struct kunit *test)
__no_kcsan
static void test_write_write_assume_atomic(struct kunit *test)
{
- const struct expect_report expect = {
+ struct expect_report expect = {
.access = {
{ test_kernel_write, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
{ test_kernel_write, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
@@ -572,7 +878,7 @@ static void test_write_write_assume_atomic(struct kunit *test)
__no_kcsan
static void test_write_write_struct(struct kunit *test)
{
- const struct expect_report expect = {
+ struct expect_report expect = {
.access = {
{ test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE },
{ test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE },
@@ -594,7 +900,7 @@ static void test_write_write_struct(struct kunit *test)
__no_kcsan
static void test_write_write_struct_part(struct kunit *test)
{
- const struct expect_report expect = {
+ struct expect_report expect = {
.access = {
{ test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE },
{ test_kernel_write_struct_part, &test_struct.val[3], sizeof(test_struct.val[3]), KCSAN_ACCESS_WRITE },
@@ -626,7 +932,7 @@ static void test_read_atomic_write_atomic(struct kunit *test)
__no_kcsan
static void test_read_plain_atomic_write(struct kunit *test)
{
- const struct expect_report expect = {
+ struct expect_report expect = {
.access = {
{ test_kernel_read, &test_var, sizeof(test_var), 0 },
{ test_kernel_write_atomic, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC },
@@ -634,8 +940,7 @@ static void test_read_plain_atomic_write(struct kunit *test)
};
bool match_expect = false;
- if (IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS))
- return;
+ KCSAN_TEST_REQUIRES(test, !IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS));
begin_test_checks(test_kernel_read, test_kernel_write_atomic);
do {
@@ -648,7 +953,7 @@ static void test_read_plain_atomic_write(struct kunit *test)
__no_kcsan
static void test_read_plain_atomic_rmw(struct kunit *test)
{
- const struct expect_report expect = {
+ struct expect_report expect = {
.access = {
{ test_kernel_read, &test_var, sizeof(test_var), 0 },
{ test_kernel_atomic_rmw, &test_var, sizeof(test_var),
@@ -657,8 +962,7 @@ static void test_read_plain_atomic_rmw(struct kunit *test)
};
bool match_expect = false;
- if (IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS))
- return;
+ KCSAN_TEST_REQUIRES(test, !IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS));
begin_test_checks(test_kernel_read, test_kernel_atomic_rmw);
do {
@@ -671,13 +975,13 @@ static void test_read_plain_atomic_rmw(struct kunit *test)
__no_kcsan
static void test_zero_size_access(struct kunit *test)
{
- const struct expect_report expect = {
+ struct expect_report expect = {
.access = {
{ test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE },
{ test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE },
},
};
- const struct expect_report never = {
+ struct expect_report never = {
.access = {
{ test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE },
{ test_kernel_read_struct_zero_size, &test_struct.val[3], 0, 0 },
@@ -711,7 +1015,7 @@ static void test_data_race(struct kunit *test)
__no_kcsan
static void test_assert_exclusive_writer(struct kunit *test)
{
- const struct expect_report expect = {
+ struct expect_report expect = {
.access = {
{ test_kernel_assert_writer, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT },
{ test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
@@ -729,7 +1033,7 @@ static void test_assert_exclusive_writer(struct kunit *test)
__no_kcsan
static void test_assert_exclusive_access(struct kunit *test)
{
- const struct expect_report expect = {
+ struct expect_report expect = {
.access = {
{ test_kernel_assert_access, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE },
{ test_kernel_read, &test_var, sizeof(test_var), 0 },
@@ -747,19 +1051,19 @@ static void test_assert_exclusive_access(struct kunit *test)
__no_kcsan
static void test_assert_exclusive_access_writer(struct kunit *test)
{
- const struct expect_report expect_access_writer = {
+ struct expect_report expect_access_writer = {
.access = {
{ test_kernel_assert_access, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE },
{ test_kernel_assert_writer, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT },
},
};
- const struct expect_report expect_access_access = {
+ struct expect_report expect_access_access = {
.access = {
{ test_kernel_assert_access, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE },
{ test_kernel_assert_access, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE },
},
};
- const struct expect_report never = {
+ struct expect_report never = {
.access = {
{ test_kernel_assert_writer, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT },
{ test_kernel_assert_writer, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT },
@@ -783,7 +1087,7 @@ static void test_assert_exclusive_access_writer(struct kunit *test)
__no_kcsan
static void test_assert_exclusive_bits_change(struct kunit *test)
{
- const struct expect_report expect = {
+ struct expect_report expect = {
.access = {
{ test_kernel_assert_bits_change, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT },
{ test_kernel_change_bits, &test_var, sizeof(test_var),
@@ -814,43 +1118,43 @@ static void test_assert_exclusive_bits_nochange(struct kunit *test)
__no_kcsan
static void test_assert_exclusive_writer_scoped(struct kunit *test)
{
- const struct expect_report expect_start = {
+ struct expect_report expect_start = {
.access = {
{ test_kernel_assert_writer_scoped, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_SCOPED },
{ test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
},
};
- const struct expect_report expect_anywhere = {
+ struct expect_report expect_inscope = {
.access = {
{ test_enter_scope, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_SCOPED },
{ test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
},
};
bool match_expect_start = false;
- bool match_expect_anywhere = false;
+ bool match_expect_inscope = false;
begin_test_checks(test_kernel_assert_writer_scoped, test_kernel_write_nochange);
do {
match_expect_start |= report_matches(&expect_start);
- match_expect_anywhere |= report_matches(&expect_anywhere);
- } while (!end_test_checks(match_expect_start && match_expect_anywhere));
+ match_expect_inscope |= report_matches(&expect_inscope);
+ } while (!end_test_checks(match_expect_inscope));
KUNIT_EXPECT_TRUE(test, match_expect_start);
- KUNIT_EXPECT_TRUE(test, match_expect_anywhere);
+ KUNIT_EXPECT_FALSE(test, match_expect_inscope);
}
__no_kcsan
static void test_assert_exclusive_access_scoped(struct kunit *test)
{
- const struct expect_report expect_start1 = {
+ struct expect_report expect_start1 = {
.access = {
{ test_kernel_assert_access_scoped, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_SCOPED },
{ test_kernel_read, &test_var, sizeof(test_var), 0 },
},
};
- const struct expect_report expect_start2 = {
+ struct expect_report expect_start2 = {
.access = { expect_start1.access[0], expect_start1.access[0] },
};
- const struct expect_report expect_inscope = {
+ struct expect_report expect_inscope = {
.access = {
{ test_enter_scope, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_SCOPED },
{ test_kernel_read, &test_var, sizeof(test_var), 0 },
@@ -864,9 +1168,9 @@ static void test_assert_exclusive_access_scoped(struct kunit *test)
do {
match_expect_start |= report_matches(&expect_start1) || report_matches(&expect_start2);
match_expect_inscope |= report_matches(&expect_inscope);
- } while (!end_test_checks(match_expect_start && match_expect_inscope));
+ } while (!end_test_checks(match_expect_inscope));
KUNIT_EXPECT_TRUE(test, match_expect_start);
- KUNIT_EXPECT_TRUE(test, match_expect_inscope);
+ KUNIT_EXPECT_FALSE(test, match_expect_inscope);
}
/*
@@ -952,6 +1256,113 @@ static void test_atomic_builtins(struct kunit *test)
KUNIT_EXPECT_FALSE(test, match_never);
}
+__no_kcsan
+static void test_1bit_value_change(struct kunit *test)
+{
+ struct expect_report expect = {
+ .access = {
+ { test_kernel_read, &test_var, sizeof(test_var), 0 },
+ { test_kernel_xor_1bit, &test_var, sizeof(test_var), __KCSAN_ACCESS_RW(KCSAN_ACCESS_WRITE) },
+ },
+ };
+ bool match = false;
+
+ begin_test_checks(test_kernel_read, test_kernel_xor_1bit);
+ do {
+ match = IS_ENABLED(CONFIG_KCSAN_PERMISSIVE)
+ ? report_available()
+ : report_matches(&expect);
+ } while (!end_test_checks(match));
+ if (IS_ENABLED(CONFIG_KCSAN_PERMISSIVE))
+ KUNIT_EXPECT_FALSE(test, match);
+ else
+ KUNIT_EXPECT_TRUE(test, match);
+}
+
+__no_kcsan
+static void test_correct_barrier(struct kunit *test)
+{
+ struct expect_report expect = {
+ .access = {
+ { test_kernel_with_memorder, &test_var, sizeof(test_var), __KCSAN_ACCESS_RW(KCSAN_ACCESS_WRITE) },
+ { test_kernel_with_memorder, &test_var, sizeof(test_var), __KCSAN_ACCESS_RW(0) },
+ },
+ };
+ bool match_expect = false;
+
+ test_struct.val[0] = 0; /* init unlocked */
+ begin_test_checks(test_kernel_with_memorder, test_kernel_with_memorder);
+ do {
+ match_expect = report_matches_any_reordered(&expect);
+ } while (!end_test_checks(match_expect));
+ KUNIT_EXPECT_FALSE(test, match_expect);
+}
+
+__no_kcsan
+static void test_missing_barrier(struct kunit *test)
+{
+ struct expect_report expect = {
+ .access = {
+ { test_kernel_wrong_memorder, &test_var, sizeof(test_var), __KCSAN_ACCESS_RW(KCSAN_ACCESS_WRITE) },
+ { test_kernel_wrong_memorder, &test_var, sizeof(test_var), __KCSAN_ACCESS_RW(0) },
+ },
+ };
+ bool match_expect = false;
+
+ test_struct.val[0] = 0; /* init unlocked */
+ begin_test_checks(test_kernel_wrong_memorder, test_kernel_wrong_memorder);
+ do {
+ match_expect = report_matches_any_reordered(&expect);
+ } while (!end_test_checks(match_expect));
+ if (IS_ENABLED(CONFIG_KCSAN_WEAK_MEMORY))
+ KUNIT_EXPECT_TRUE(test, match_expect);
+ else
+ KUNIT_EXPECT_FALSE(test, match_expect);
+}
+
+__no_kcsan
+static void test_atomic_builtins_correct_barrier(struct kunit *test)
+{
+ struct expect_report expect = {
+ .access = {
+ { test_kernel_atomic_builtin_with_memorder, &test_var, sizeof(test_var), __KCSAN_ACCESS_RW(KCSAN_ACCESS_WRITE) },
+ { test_kernel_atomic_builtin_with_memorder, &test_var, sizeof(test_var), __KCSAN_ACCESS_RW(0) },
+ },
+ };
+ bool match_expect = false;
+
+ test_struct.val[0] = 0; /* init unlocked */
+ begin_test_checks(test_kernel_atomic_builtin_with_memorder,
+ test_kernel_atomic_builtin_with_memorder);
+ do {
+ match_expect = report_matches_any_reordered(&expect);
+ } while (!end_test_checks(match_expect));
+ KUNIT_EXPECT_FALSE(test, match_expect);
+}
+
+__no_kcsan
+static void test_atomic_builtins_missing_barrier(struct kunit *test)
+{
+ struct expect_report expect = {
+ .access = {
+ { test_kernel_atomic_builtin_wrong_memorder, &test_var, sizeof(test_var), __KCSAN_ACCESS_RW(KCSAN_ACCESS_WRITE) },
+ { test_kernel_atomic_builtin_wrong_memorder, &test_var, sizeof(test_var), __KCSAN_ACCESS_RW(0) },
+ },
+ };
+ bool match_expect = false;
+
+ test_struct.val[0] = 0; /* init unlocked */
+ begin_test_checks(test_kernel_atomic_builtin_wrong_memorder,
+ test_kernel_atomic_builtin_wrong_memorder);
+ do {
+ match_expect = report_matches_any_reordered(&expect);
+ } while (!end_test_checks(match_expect));
+ if (IS_ENABLED(CONFIG_KCSAN_WEAK_MEMORY))
+ KUNIT_EXPECT_TRUE(test, match_expect);
+ else
+ KUNIT_EXPECT_FALSE(test, match_expect);
+}
+
/*
* Generate thread counts for all test cases. Values generated are in interval
* [2, 5] followed by exponentially increasing thread counts from 8 to 32.
@@ -974,13 +1385,14 @@ static const void *nthreads_gen_params(const void *prev, char *desc)
else
nthreads *= 2;
- if (!IS_ENABLED(CONFIG_PREEMPT) || !IS_ENABLED(CONFIG_KCSAN_INTERRUPT_WATCHER)) {
+ if (!preempt_model_preemptible() ||
+ !IS_ENABLED(CONFIG_KCSAN_INTERRUPT_WATCHER)) {
/*
* Without any preemption, keep 2 CPUs free for other tasks, one
* of which is the main test case function checking for
* completion or failure.
*/
- const long min_unused_cpus = IS_ENABLED(CONFIG_PREEMPT_NONE) ? 2 : 0;
+ const long min_unused_cpus = preempt_model_none() ? 2 : 0;
const long min_required_cpus = 2 + min_unused_cpus;
if (num_online_cpus() < min_required_cpus) {
@@ -1001,6 +1413,7 @@ static const void *nthreads_gen_params(const void *prev, char *desc)
#define KCSAN_KUNIT_CASE(test_name) KUNIT_CASE_PARAM(test_name, nthreads_gen_params)
static struct kunit_case kcsan_test_cases[] = {
+ KUNIT_CASE(test_barrier_nothreads),
KCSAN_KUNIT_CASE(test_basic),
KCSAN_KUNIT_CASE(test_concurrent_races),
KCSAN_KUNIT_CASE(test_novalue_change),
@@ -1024,6 +1437,11 @@ static struct kunit_case kcsan_test_cases[] = {
KCSAN_KUNIT_CASE(test_jiffies_noreport),
KCSAN_KUNIT_CASE(test_seqlock_noreport),
KCSAN_KUNIT_CASE(test_atomic_builtins),
+ KCSAN_KUNIT_CASE(test_1bit_value_change),
+ KCSAN_KUNIT_CASE(test_correct_barrier),
+ KCSAN_KUNIT_CASE(test_missing_barrier),
+ KCSAN_KUNIT_CASE(test_atomic_builtins_correct_barrier),
+ KCSAN_KUNIT_CASE(test_atomic_builtins_missing_barrier),
{},
};
@@ -1088,6 +1506,9 @@ static int test_init(struct kunit *test)
observed.nlines = 0;
spin_unlock_irqrestore(&observed.lock, flags);
+ if (strstr(test->name, "nothreads"))
+ return 0;
+
if (!torture_init_begin((char *)test->name, 1))
return -EBUSY;
@@ -1130,6 +1551,9 @@ static void test_exit(struct kunit *test)
struct task_struct **stop_thread;
int i;
+ if (strstr(test->name, "nothreads"))
+ return;
+
if (torture_cleanup_begin())
return;
@@ -1147,53 +1571,40 @@ static void test_exit(struct kunit *test)
torture_cleanup_end();
}
-static struct kunit_suite kcsan_test_suite = {
- .name = "kcsan",
- .test_cases = kcsan_test_cases,
- .init = test_init,
- .exit = test_exit,
-};
-static struct kunit_suite *kcsan_test_suites[] = { &kcsan_test_suite, NULL };
-
__no_kcsan
-static void register_tracepoints(struct tracepoint *tp, void *ignore)
+static void register_tracepoints(void)
{
- check_trace_callback_type_console(probe_console);
- if (!strcmp(tp->name, "console"))
- WARN_ON(tracepoint_probe_register(tp, probe_console, NULL));
+ register_trace_console(probe_console, NULL);
}
__no_kcsan
-static void unregister_tracepoints(struct tracepoint *tp, void *ignore)
+static void unregister_tracepoints(void)
{
- if (!strcmp(tp->name, "console"))
- tracepoint_probe_unregister(tp, probe_console, NULL);
+ unregister_trace_console(probe_console, NULL);
}
-/*
- * We only want to do tracepoints setup and teardown once, therefore we have to
- * customize the init and exit functions and cannot rely on kunit_test_suite().
- */
-static int __init kcsan_test_init(void)
+static int kcsan_suite_init(struct kunit_suite *suite)
{
- /*
- * Because we want to be able to build the test as a module, we need to
- * iterate through all known tracepoints, since the static registration
- * won't work here.
- */
- for_each_kernel_tracepoint(register_tracepoints, NULL);
- return __kunit_test_suites_init(kcsan_test_suites);
+ register_tracepoints();
+ return 0;
}
-static void kcsan_test_exit(void)
+static void kcsan_suite_exit(struct kunit_suite *suite)
{
- __kunit_test_suites_exit(kcsan_test_suites);
- for_each_kernel_tracepoint(unregister_tracepoints, NULL);
+ unregister_tracepoints();
tracepoint_synchronize_unregister();
}
-late_initcall(kcsan_test_init);
-module_exit(kcsan_test_exit);
+static struct kunit_suite kcsan_test_suite = {
+ .name = "kcsan",
+ .test_cases = kcsan_test_cases,
+ .init = test_init,
+ .exit = test_exit,
+ .suite_init = kcsan_suite_init,
+ .suite_exit = kcsan_suite_exit,
+};
+
+kunit_test_suites(&kcsan_test_suite);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR("Marco Elver <elver@google.com>");
diff --git a/kernel/kcsan/permissive.h b/kernel/kcsan/permissive.h
new file mode 100644
index 000000000000..2c01fe4a59ee
--- /dev/null
+++ b/kernel/kcsan/permissive.h
@@ -0,0 +1,94 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Special rules for ignoring entire classes of data-racy memory accesses. None
+ * of the rules here imply that such data races are generally safe!
+ *
+ * All rules in this file can be configured via CONFIG_KCSAN_PERMISSIVE. Keep
+ * them separate from core code to make it easier to audit.
+ *
+ * Copyright (C) 2019, Google LLC.
+ */
+
+#ifndef _KERNEL_KCSAN_PERMISSIVE_H
+#define _KERNEL_KCSAN_PERMISSIVE_H
+
+#include <linux/bitops.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+
+/*
+ * Access ignore rules based on address.
+ */
+static __always_inline bool kcsan_ignore_address(const volatile void *ptr)
+{
+ if (!IS_ENABLED(CONFIG_KCSAN_PERMISSIVE))
+ return false;
+
+ /*
+ * Data-racy bitops on current->flags are too common, ignore completely
+ * for now.
+ */
+ return ptr == &current->flags;
+}
+
+/*
+ * Data race ignore rules based on access type and value change patterns.
+ */
+static bool
+kcsan_ignore_data_race(size_t size, int type, u64 old, u64 new, u64 diff)
+{
+ if (!IS_ENABLED(CONFIG_KCSAN_PERMISSIVE))
+ return false;
+
+ /*
+ * Rules here are only for plain read accesses, so that we still report
+ * data races between plain read-write accesses.
+ */
+ if (type || size > sizeof(long))
+ return false;
+
+ /*
+ * A common pattern is checking/setting just 1 bit in a variable; for
+ * example:
+ *
+ * if (flags & SOME_FLAG) { ... }
+ *
+ * and elsewhere flags is updated concurrently:
+ *
+ * flags |= SOME_OTHER_FLAG; // just 1 bit
+ *
+ * While it is still recommended that such accesses be marked
+ * appropriately, in many cases these types of data races are so common
+ * that marking them all is often unrealistic and left to maintainer
+ * preference.
+ *
+ * The assumption in all cases is that with all known compiler
+ * optimizations (including those that tear accesses), because no more
+ * than 1 bit changed, the plain accesses are safe despite the presence
+ * of data races.
+ *
+ * The rules here will ignore the data races if we observe no more than
+ * 1 bit changed.
+ *
+ * Of course many operations can effecively change just 1 bit, but the
+ * general assuption that data races involving 1-bit changes can be
+ * tolerated still applies.
+ *
+ * And in case a true bug is missed, the bug likely manifests as a
+ * reportable data race elsewhere.
+ */
+ if (hweight64(diff) == 1) {
+ /*
+ * Exception: Report data races where the values look like
+ * ordinary booleans (one of them was 0 and the 0th bit was
+ * changed) More often than not, they come with interesting
+ * memory ordering requirements, so let's report them.
+ */
+ if (!((!old || !new) && diff == 1))
+ return true;
+ }
+
+ return false;
+}
+
+#endif /* _KERNEL_KCSAN_PERMISSIVE_H */
diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c
index 21137929d428..e95ce7d7a76e 100644
--- a/kernel/kcsan/report.c
+++ b/kernel/kcsan/report.c
@@ -8,6 +8,7 @@
#include <linux/debug_locks.h>
#include <linux/delay.h>
#include <linux/jiffies.h>
+#include <linux/kallsyms.h>
#include <linux/kernel.h>
#include <linux/lockdep.h>
#include <linux/preempt.h>
@@ -31,6 +32,7 @@ struct access_info {
int access_type;
int task_pid;
int cpu_id;
+ unsigned long ip;
};
/*
@@ -213,9 +215,9 @@ static const char *get_access_type(int type)
if (type & KCSAN_ACCESS_ASSERT) {
if (type & KCSAN_ACCESS_SCOPED) {
if (type & KCSAN_ACCESS_WRITE)
- return "assert no accesses (scoped)";
+ return "assert no accesses (reordered)";
else
- return "assert no writes (scoped)";
+ return "assert no writes (reordered)";
} else {
if (type & KCSAN_ACCESS_WRITE)
return "assert no accesses";
@@ -238,13 +240,17 @@ static const char *get_access_type(int type)
case KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC:
return "read-write (marked)";
case KCSAN_ACCESS_SCOPED:
- return "read (scoped)";
+ return "read (reordered)";
case KCSAN_ACCESS_SCOPED | KCSAN_ACCESS_ATOMIC:
- return "read (marked, scoped)";
+ return "read (marked, reordered)";
case KCSAN_ACCESS_SCOPED | KCSAN_ACCESS_WRITE:
- return "write (scoped)";
+ return "write (reordered)";
case KCSAN_ACCESS_SCOPED | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC:
- return "write (marked, scoped)";
+ return "write (marked, reordered)";
+ case KCSAN_ACCESS_SCOPED | KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE:
+ return "read-write (reordered)";
+ case KCSAN_ACCESS_SCOPED | KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC:
+ return "read-write (marked, reordered)";
default:
BUG();
}
@@ -300,6 +306,52 @@ static int get_stack_skipnr(const unsigned long stack_entries[], int num_entries
return skip;
}
+/*
+ * Skips to the first entry that matches the function of @ip, and then replaces
+ * that entry with @ip, returning the entries to skip with @replaced containing
+ * the replaced entry.
+ */
+static int
+replace_stack_entry(unsigned long stack_entries[], int num_entries, unsigned long ip,
+ unsigned long *replaced)
+{
+ unsigned long symbolsize, offset;
+ unsigned long target_func;
+ int skip;
+
+ if (kallsyms_lookup_size_offset(ip, &symbolsize, &offset))
+ target_func = ip - offset;
+ else
+ goto fallback;
+
+ for (skip = 0; skip < num_entries; ++skip) {
+ unsigned long func = stack_entries[skip];
+
+ if (!kallsyms_lookup_size_offset(func, &symbolsize, &offset))
+ goto fallback;
+ func -= offset;
+
+ if (func == target_func) {
+ *replaced = stack_entries[skip];
+ stack_entries[skip] = ip;
+ return skip;
+ }
+ }
+
+fallback:
+ /* Should not happen; the resulting stack trace is likely misleading. */
+ WARN_ONCE(1, "Cannot find frame for %pS in stack trace", (void *)ip);
+ return get_stack_skipnr(stack_entries, num_entries);
+}
+
+static int
+sanitize_stack_entries(unsigned long stack_entries[], int num_entries, unsigned long ip,
+ unsigned long *replaced)
+{
+ return ip ? replace_stack_entry(stack_entries, num_entries, ip, replaced) :
+ get_stack_skipnr(stack_entries, num_entries);
+}
+
/* Compares symbolized strings of addr1 and addr2. */
static int sym_strcmp(void *addr1, void *addr2)
{
@@ -312,6 +364,14 @@ static int sym_strcmp(void *addr1, void *addr2)
return strncmp(buf1, buf2, sizeof(buf1));
}
+static void
+print_stack_trace(unsigned long stack_entries[], int num_entries, unsigned long reordered_to)
+{
+ stack_trace_print(stack_entries, num_entries, 0);
+ if (reordered_to)
+ pr_err(" |\n +-> reordered to: %pS\n", (void *)reordered_to);
+}
+
static void print_verbose_info(struct task_struct *task)
{
if (!task)
@@ -327,13 +387,15 @@ static void print_verbose_info(struct task_struct *task)
static void print_report(enum kcsan_value_change value_change,
const struct access_info *ai,
- const struct other_info *other_info,
+ struct other_info *other_info,
u64 old, u64 new, u64 mask)
{
+ unsigned long reordered_to = 0;
unsigned long stack_entries[NUM_STACK_ENTRIES] = { 0 };
int num_stack_entries = stack_trace_save(stack_entries, NUM_STACK_ENTRIES, 1);
- int skipnr = get_stack_skipnr(stack_entries, num_stack_entries);
+ int skipnr = sanitize_stack_entries(stack_entries, num_stack_entries, ai->ip, &reordered_to);
unsigned long this_frame = stack_entries[skipnr];
+ unsigned long other_reordered_to = 0;
unsigned long other_frame = 0;
int other_skipnr = 0; /* silence uninit warnings */
@@ -344,8 +406,9 @@ static void print_report(enum kcsan_value_change value_change,
return;
if (other_info) {
- other_skipnr = get_stack_skipnr(other_info->stack_entries,
- other_info->num_stack_entries);
+ other_skipnr = sanitize_stack_entries(other_info->stack_entries,
+ other_info->num_stack_entries,
+ other_info->ai.ip, &other_reordered_to);
other_frame = other_info->stack_entries[other_skipnr];
/* @value_change is only known for the other thread */
@@ -385,10 +448,9 @@ static void print_report(enum kcsan_value_change value_change,
other_info->ai.cpu_id);
/* Print the other thread's stack trace. */
- stack_trace_print(other_info->stack_entries + other_skipnr,
+ print_stack_trace(other_info->stack_entries + other_skipnr,
other_info->num_stack_entries - other_skipnr,
- 0);
-
+ other_reordered_to);
if (IS_ENABLED(CONFIG_KCSAN_VERBOSE))
print_verbose_info(other_info->task);
@@ -402,9 +464,7 @@ static void print_report(enum kcsan_value_change value_change,
get_thread_desc(ai->task_pid), ai->cpu_id);
}
/* Print stack trace of this thread. */
- stack_trace_print(stack_entries + skipnr, num_stack_entries - skipnr,
- 0);
-
+ print_stack_trace(stack_entries + skipnr, num_stack_entries - skipnr, reordered_to);
if (IS_ENABLED(CONFIG_KCSAN_VERBOSE))
print_verbose_info(current);
@@ -432,8 +492,7 @@ static void print_report(enum kcsan_value_change value_change,
dump_stack_print_info(KERN_DEFAULT);
pr_err("==================================================================\n");
- if (panic_on_warn)
- panic("panic_on_warn set ...\n");
+ check_panic_on_warn("KCSAN");
}
static void release_report(unsigned long *flags, struct other_info *other_info)
@@ -576,21 +635,23 @@ discard:
}
static struct access_info prepare_access_info(const volatile void *ptr, size_t size,
- int access_type)
+ int access_type, unsigned long ip)
{
return (struct access_info) {
.ptr = ptr,
.size = size,
.access_type = access_type,
.task_pid = in_task() ? task_pid_nr(current) : -1,
- .cpu_id = raw_smp_processor_id()
+ .cpu_id = raw_smp_processor_id(),
+ /* Only replace stack entry with @ip if scoped access. */
+ .ip = (access_type & KCSAN_ACCESS_SCOPED) ? ip : 0,
};
}
void kcsan_report_set_info(const volatile void *ptr, size_t size, int access_type,
- int watchpoint_idx)
+ unsigned long ip, int watchpoint_idx)
{
- const struct access_info ai = prepare_access_info(ptr, size, access_type);
+ const struct access_info ai = prepare_access_info(ptr, size, access_type, ip);
unsigned long flags;
kcsan_disable_current();
@@ -603,10 +664,10 @@ void kcsan_report_set_info(const volatile void *ptr, size_t size, int access_typ
}
void kcsan_report_known_origin(const volatile void *ptr, size_t size, int access_type,
- enum kcsan_value_change value_change, int watchpoint_idx,
- u64 old, u64 new, u64 mask)
+ unsigned long ip, enum kcsan_value_change value_change,
+ int watchpoint_idx, u64 old, u64 new, u64 mask)
{
- const struct access_info ai = prepare_access_info(ptr, size, access_type);
+ const struct access_info ai = prepare_access_info(ptr, size, access_type, ip);
struct other_info *other_info = &other_infos[watchpoint_idx];
unsigned long flags = 0;
@@ -637,9 +698,9 @@ out:
}
void kcsan_report_unknown_origin(const volatile void *ptr, size_t size, int access_type,
- u64 old, u64 new, u64 mask)
+ unsigned long ip, u64 old, u64 new, u64 mask)
{
- const struct access_info ai = prepare_access_info(ptr, size, access_type);
+ const struct access_info ai = prepare_access_info(ptr, size, access_type, ip);
unsigned long flags;
kcsan_disable_current();
diff --git a/kernel/kcsan/selftest.c b/kernel/kcsan/selftest.c
index 7f29cb0f5e63..8679322450f2 100644
--- a/kernel/kcsan/selftest.c
+++ b/kernel/kcsan/selftest.c
@@ -7,90 +7,76 @@
#define pr_fmt(fmt) "kcsan: " fmt
+#include <linux/atomic.h>
+#include <linux/bitops.h>
#include <linux/init.h>
+#include <linux/kcsan-checks.h>
#include <linux/kernel.h>
#include <linux/printk.h>
#include <linux/random.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
#include <linux/types.h>
#include "encoding.h"
#define ITERS_PER_TEST 2000
-/* Test requirements. */
-static bool test_requires(void)
-{
- /* random should be initialized for the below tests */
- return prandom_u32() + prandom_u32() != 0;
-}
-
/*
* Test watchpoint encode and decode: check that encoding some access's info,
* and then subsequent decode preserves the access's info.
*/
-static bool test_encode_decode(void)
+static bool __init test_encode_decode(void)
{
int i;
for (i = 0; i < ITERS_PER_TEST; ++i) {
- size_t size = prandom_u32_max(MAX_ENCODABLE_SIZE) + 1;
- bool is_write = !!prandom_u32_max(2);
+ size_t size = get_random_u32_inclusive(1, MAX_ENCODABLE_SIZE);
+ bool is_write = !!get_random_u32_below(2);
+ unsigned long verif_masked_addr;
+ long encoded_watchpoint;
+ bool verif_is_write;
unsigned long addr;
+ size_t verif_size;
- prandom_bytes(&addr, sizeof(addr));
+ get_random_bytes(&addr, sizeof(addr));
if (addr < PAGE_SIZE)
addr = PAGE_SIZE;
if (WARN_ON(!check_encodable(addr, size)))
return false;
- /* Encode and decode */
- {
- const long encoded_watchpoint =
- encode_watchpoint(addr, size, is_write);
- unsigned long verif_masked_addr;
- size_t verif_size;
- bool verif_is_write;
-
- /* Check special watchpoints */
- if (WARN_ON(decode_watchpoint(
- INVALID_WATCHPOINT, &verif_masked_addr,
- &verif_size, &verif_is_write)))
- return false;
- if (WARN_ON(decode_watchpoint(
- CONSUMED_WATCHPOINT, &verif_masked_addr,
- &verif_size, &verif_is_write)))
- return false;
-
- /* Check decoding watchpoint returns same data */
- if (WARN_ON(!decode_watchpoint(
- encoded_watchpoint, &verif_masked_addr,
- &verif_size, &verif_is_write)))
- return false;
- if (WARN_ON(verif_masked_addr !=
- (addr & WATCHPOINT_ADDR_MASK)))
- goto fail;
- if (WARN_ON(verif_size != size))
- goto fail;
- if (WARN_ON(is_write != verif_is_write))
- goto fail;
-
- continue;
-fail:
- pr_err("%s fail: %s %zu bytes @ %lx -> encoded: %lx -> %s %zu bytes @ %lx\n",
- __func__, is_write ? "write" : "read", size,
- addr, encoded_watchpoint,
- verif_is_write ? "write" : "read", verif_size,
- verif_masked_addr);
+ encoded_watchpoint = encode_watchpoint(addr, size, is_write);
+
+ /* Check special watchpoints */
+ if (WARN_ON(decode_watchpoint(INVALID_WATCHPOINT, &verif_masked_addr, &verif_size, &verif_is_write)))
+ return false;
+ if (WARN_ON(decode_watchpoint(CONSUMED_WATCHPOINT, &verif_masked_addr, &verif_size, &verif_is_write)))
return false;
- }
+
+ /* Check decoding watchpoint returns same data */
+ if (WARN_ON(!decode_watchpoint(encoded_watchpoint, &verif_masked_addr, &verif_size, &verif_is_write)))
+ return false;
+ if (WARN_ON(verif_masked_addr != (addr & WATCHPOINT_ADDR_MASK)))
+ goto fail;
+ if (WARN_ON(verif_size != size))
+ goto fail;
+ if (WARN_ON(is_write != verif_is_write))
+ goto fail;
+
+ continue;
+fail:
+ pr_err("%s fail: %s %zu bytes @ %lx -> encoded: %lx -> %s %zu bytes @ %lx\n",
+ __func__, is_write ? "write" : "read", size, addr, encoded_watchpoint,
+ verif_is_write ? "write" : "read", verif_size, verif_masked_addr);
+ return false;
}
return true;
}
/* Test access matching function. */
-static bool test_matching_access(void)
+static bool __init test_matching_access(void)
{
if (WARN_ON(!matching_access(10, 1, 10, 1)))
return false;
@@ -115,6 +101,143 @@ static bool test_matching_access(void)
return true;
}
+/*
+ * Correct memory barrier instrumentation is critical to avoiding false
+ * positives: simple test to check at boot certain barriers are always properly
+ * instrumented. See kcsan_test for a more complete test.
+ */
+static DEFINE_SPINLOCK(test_spinlock);
+static bool __init test_barrier(void)
+{
+#ifdef CONFIG_KCSAN_WEAK_MEMORY
+ struct kcsan_scoped_access *reorder_access = &current->kcsan_ctx.reorder_access;
+#else
+ struct kcsan_scoped_access *reorder_access = NULL;
+#endif
+ bool ret = true;
+ arch_spinlock_t arch_spinlock = __ARCH_SPIN_LOCK_UNLOCKED;
+ atomic_t dummy;
+ long test_var;
+
+ if (!reorder_access || !IS_ENABLED(CONFIG_SMP))
+ return true;
+
+#define __KCSAN_CHECK_BARRIER(access_type, barrier, name) \
+ do { \
+ reorder_access->type = (access_type) | KCSAN_ACCESS_SCOPED; \
+ reorder_access->size = 1; \
+ barrier; \
+ if (reorder_access->size != 0) { \
+ pr_err("improperly instrumented type=(" #access_type "): " name "\n"); \
+ ret = false; \
+ } \
+ } while (0)
+#define KCSAN_CHECK_READ_BARRIER(b) __KCSAN_CHECK_BARRIER(0, b, #b)
+#define KCSAN_CHECK_WRITE_BARRIER(b) __KCSAN_CHECK_BARRIER(KCSAN_ACCESS_WRITE, b, #b)
+#define KCSAN_CHECK_RW_BARRIER(b) __KCSAN_CHECK_BARRIER(KCSAN_ACCESS_WRITE | KCSAN_ACCESS_COMPOUND, b, #b)
+
+ kcsan_nestable_atomic_begin(); /* No watchpoints in called functions. */
+
+ KCSAN_CHECK_READ_BARRIER(mb());
+ KCSAN_CHECK_READ_BARRIER(rmb());
+ KCSAN_CHECK_READ_BARRIER(smp_mb());
+ KCSAN_CHECK_READ_BARRIER(smp_rmb());
+ KCSAN_CHECK_READ_BARRIER(dma_rmb());
+ KCSAN_CHECK_READ_BARRIER(smp_mb__before_atomic());
+ KCSAN_CHECK_READ_BARRIER(smp_mb__after_atomic());
+ KCSAN_CHECK_READ_BARRIER(smp_mb__after_spinlock());
+ KCSAN_CHECK_READ_BARRIER(smp_store_mb(test_var, 0));
+ KCSAN_CHECK_READ_BARRIER(smp_store_release(&test_var, 0));
+ KCSAN_CHECK_READ_BARRIER(xchg(&test_var, 0));
+ KCSAN_CHECK_READ_BARRIER(xchg_release(&test_var, 0));
+ KCSAN_CHECK_READ_BARRIER(cmpxchg(&test_var, 0, 0));
+ KCSAN_CHECK_READ_BARRIER(cmpxchg_release(&test_var, 0, 0));
+ KCSAN_CHECK_READ_BARRIER(atomic_set_release(&dummy, 0));
+ KCSAN_CHECK_READ_BARRIER(atomic_add_return(1, &dummy));
+ KCSAN_CHECK_READ_BARRIER(atomic_add_return_release(1, &dummy));
+ KCSAN_CHECK_READ_BARRIER(atomic_fetch_add(1, &dummy));
+ KCSAN_CHECK_READ_BARRIER(atomic_fetch_add_release(1, &dummy));
+ KCSAN_CHECK_READ_BARRIER(test_and_set_bit(0, &test_var));
+ KCSAN_CHECK_READ_BARRIER(test_and_clear_bit(0, &test_var));
+ KCSAN_CHECK_READ_BARRIER(test_and_change_bit(0, &test_var));
+ KCSAN_CHECK_READ_BARRIER(clear_bit_unlock(0, &test_var));
+ KCSAN_CHECK_READ_BARRIER(__clear_bit_unlock(0, &test_var));
+ arch_spin_lock(&arch_spinlock);
+ KCSAN_CHECK_READ_BARRIER(arch_spin_unlock(&arch_spinlock));
+ spin_lock(&test_spinlock);
+ KCSAN_CHECK_READ_BARRIER(spin_unlock(&test_spinlock));
+
+ KCSAN_CHECK_WRITE_BARRIER(mb());
+ KCSAN_CHECK_WRITE_BARRIER(wmb());
+ KCSAN_CHECK_WRITE_BARRIER(smp_mb());
+ KCSAN_CHECK_WRITE_BARRIER(smp_wmb());
+ KCSAN_CHECK_WRITE_BARRIER(dma_wmb());
+ KCSAN_CHECK_WRITE_BARRIER(smp_mb__before_atomic());
+ KCSAN_CHECK_WRITE_BARRIER(smp_mb__after_atomic());
+ KCSAN_CHECK_WRITE_BARRIER(smp_mb__after_spinlock());
+ KCSAN_CHECK_WRITE_BARRIER(smp_store_mb(test_var, 0));
+ KCSAN_CHECK_WRITE_BARRIER(smp_store_release(&test_var, 0));
+ KCSAN_CHECK_WRITE_BARRIER(xchg(&test_var, 0));
+ KCSAN_CHECK_WRITE_BARRIER(xchg_release(&test_var, 0));
+ KCSAN_CHECK_WRITE_BARRIER(cmpxchg(&test_var, 0, 0));
+ KCSAN_CHECK_WRITE_BARRIER(cmpxchg_release(&test_var, 0, 0));
+ KCSAN_CHECK_WRITE_BARRIER(atomic_set_release(&dummy, 0));
+ KCSAN_CHECK_WRITE_BARRIER(atomic_add_return(1, &dummy));
+ KCSAN_CHECK_WRITE_BARRIER(atomic_add_return_release(1, &dummy));
+ KCSAN_CHECK_WRITE_BARRIER(atomic_fetch_add(1, &dummy));
+ KCSAN_CHECK_WRITE_BARRIER(atomic_fetch_add_release(1, &dummy));
+ KCSAN_CHECK_WRITE_BARRIER(test_and_set_bit(0, &test_var));
+ KCSAN_CHECK_WRITE_BARRIER(test_and_clear_bit(0, &test_var));
+ KCSAN_CHECK_WRITE_BARRIER(test_and_change_bit(0, &test_var));
+ KCSAN_CHECK_WRITE_BARRIER(clear_bit_unlock(0, &test_var));
+ KCSAN_CHECK_WRITE_BARRIER(__clear_bit_unlock(0, &test_var));
+ arch_spin_lock(&arch_spinlock);
+ KCSAN_CHECK_WRITE_BARRIER(arch_spin_unlock(&arch_spinlock));
+ spin_lock(&test_spinlock);
+ KCSAN_CHECK_WRITE_BARRIER(spin_unlock(&test_spinlock));
+
+ KCSAN_CHECK_RW_BARRIER(mb());
+ KCSAN_CHECK_RW_BARRIER(wmb());
+ KCSAN_CHECK_RW_BARRIER(rmb());
+ KCSAN_CHECK_RW_BARRIER(smp_mb());
+ KCSAN_CHECK_RW_BARRIER(smp_wmb());
+ KCSAN_CHECK_RW_BARRIER(smp_rmb());
+ KCSAN_CHECK_RW_BARRIER(dma_wmb());
+ KCSAN_CHECK_RW_BARRIER(dma_rmb());
+ KCSAN_CHECK_RW_BARRIER(smp_mb__before_atomic());
+ KCSAN_CHECK_RW_BARRIER(smp_mb__after_atomic());
+ KCSAN_CHECK_RW_BARRIER(smp_mb__after_spinlock());
+ KCSAN_CHECK_RW_BARRIER(smp_store_mb(test_var, 0));
+ KCSAN_CHECK_RW_BARRIER(smp_store_release(&test_var, 0));
+ KCSAN_CHECK_RW_BARRIER(xchg(&test_var, 0));
+ KCSAN_CHECK_RW_BARRIER(xchg_release(&test_var, 0));
+ KCSAN_CHECK_RW_BARRIER(cmpxchg(&test_var, 0, 0));
+ KCSAN_CHECK_RW_BARRIER(cmpxchg_release(&test_var, 0, 0));
+ KCSAN_CHECK_RW_BARRIER(atomic_set_release(&dummy, 0));
+ KCSAN_CHECK_RW_BARRIER(atomic_add_return(1, &dummy));
+ KCSAN_CHECK_RW_BARRIER(atomic_add_return_release(1, &dummy));
+ KCSAN_CHECK_RW_BARRIER(atomic_fetch_add(1, &dummy));
+ KCSAN_CHECK_RW_BARRIER(atomic_fetch_add_release(1, &dummy));
+ KCSAN_CHECK_RW_BARRIER(test_and_set_bit(0, &test_var));
+ KCSAN_CHECK_RW_BARRIER(test_and_clear_bit(0, &test_var));
+ KCSAN_CHECK_RW_BARRIER(test_and_change_bit(0, &test_var));
+ KCSAN_CHECK_RW_BARRIER(clear_bit_unlock(0, &test_var));
+ KCSAN_CHECK_RW_BARRIER(__clear_bit_unlock(0, &test_var));
+ arch_spin_lock(&arch_spinlock);
+ KCSAN_CHECK_RW_BARRIER(arch_spin_unlock(&arch_spinlock));
+ spin_lock(&test_spinlock);
+ KCSAN_CHECK_RW_BARRIER(spin_unlock(&test_spinlock));
+
+#ifdef clear_bit_unlock_is_negative_byte
+ KCSAN_CHECK_RW_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var));
+ KCSAN_CHECK_READ_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var));
+ KCSAN_CHECK_WRITE_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var));
+#endif
+ kcsan_nestable_atomic_end();
+
+ return ret;
+}
+
static int __init kcsan_selftest(void)
{
int passed = 0;
@@ -129,9 +252,9 @@ static int __init kcsan_selftest(void)
pr_err("selftest: " #do_test " failed"); \
} while (0)
- RUN_TEST(test_requires);
RUN_TEST(test_encode_decode);
RUN_TEST(test_matching_access);
+ RUN_TEST(test_barrier);
pr_info("selftest: %d/%d tests passed\n", passed, total);
if (passed != total)
diff --git a/kernel/kexec.c b/kernel/kexec.c
index c82c6c06f051..92d301f98776 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -19,26 +19,9 @@
#include "kexec_internal.h"
-static int copy_user_segment_list(struct kimage *image,
- unsigned long nr_segments,
- struct kexec_segment __user *segments)
-{
- int ret;
- size_t segment_bytes;
-
- /* Read in the segments */
- image->nr_segments = nr_segments;
- segment_bytes = nr_segments * sizeof(*segments);
- ret = copy_from_user(image->segment, segments, segment_bytes);
- if (ret)
- ret = -EFAULT;
-
- return ret;
-}
-
static int kimage_alloc_init(struct kimage **rimage, unsigned long entry,
unsigned long nr_segments,
- struct kexec_segment __user *segments,
+ struct kexec_segment *segments,
unsigned long flags)
{
int ret;
@@ -58,10 +41,8 @@ static int kimage_alloc_init(struct kimage **rimage, unsigned long entry,
return -ENOMEM;
image->start = entry;
-
- ret = copy_user_segment_list(image, nr_segments, segments);
- if (ret)
- goto out_free_image;
+ image->nr_segments = nr_segments;
+ memcpy(image->segment, segments, nr_segments * sizeof(*segments));
if (kexec_on_panic) {
/* Enable special crash kernel control page alloc policy. */
@@ -104,12 +85,20 @@ out_free_image:
}
static int do_kexec_load(unsigned long entry, unsigned long nr_segments,
- struct kexec_segment __user *segments, unsigned long flags)
+ struct kexec_segment *segments, unsigned long flags)
{
struct kimage **dest_image, *image;
unsigned long i;
int ret;
+ /*
+ * Because we write directly to the reserved memory region when loading
+ * crash kernels we need a serialization here to prevent multiple crash
+ * kernels from attempting to load simultaneously.
+ */
+ if (!kexec_trylock())
+ return -EBUSY;
+
if (flags & KEXEC_ON_CRASH) {
dest_image = &kexec_crash_image;
if (kexec_crash_image)
@@ -121,7 +110,8 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments,
if (nr_segments == 0) {
/* Uninstall image */
kimage_free(xchg(dest_image, NULL));
- return 0;
+ ret = 0;
+ goto out_unlock;
}
if (flags & KEXEC_ON_CRASH) {
/*
@@ -134,7 +124,7 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments,
ret = kimage_alloc_init(&image, entry, nr_segments, segments, flags);
if (ret)
- return ret;
+ goto out_unlock;
if (flags & KEXEC_PRESERVE_CONTEXT)
image->preserve_context = 1;
@@ -171,6 +161,8 @@ out:
arch_kexec_protect_crashkres();
kimage_free(image);
+out_unlock:
+ kexec_unlock();
return ret;
}
@@ -198,10 +190,12 @@ out:
static inline int kexec_load_check(unsigned long nr_segments,
unsigned long flags)
{
+ int image_type = (flags & KEXEC_ON_CRASH) ?
+ KEXEC_TYPE_CRASH : KEXEC_TYPE_DEFAULT;
int result;
/* We only trust the superuser with rebooting the system. */
- if (!capable(CAP_SYS_BOOT) || kexec_load_disabled)
+ if (!kexec_load_permitted(image_type))
return -EPERM;
/* Permit LSMs and IMA to fail the kexec */
@@ -236,7 +230,8 @@ static inline int kexec_load_check(unsigned long nr_segments,
SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
struct kexec_segment __user *, segments, unsigned long, flags)
{
- int result;
+ struct kexec_segment *ksegments;
+ unsigned long result;
result = kexec_load_check(nr_segments, flags);
if (result)
@@ -247,20 +242,12 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT))
return -EINVAL;
- /* Because we write directly to the reserved memory
- * region when loading crash kernels we need a mutex here to
- * prevent multiple crash kernels from attempting to load
- * simultaneously, and to prevent a crash kernel from loading
- * over the top of a in use crash kernel.
- *
- * KISS: always take the mutex.
- */
- if (!mutex_trylock(&kexec_mutex))
- return -EBUSY;
-
- result = do_kexec_load(entry, nr_segments, segments, flags);
+ ksegments = memdup_user(segments, nr_segments * sizeof(ksegments[0]));
+ if (IS_ERR(ksegments))
+ return PTR_ERR(ksegments);
- mutex_unlock(&kexec_mutex);
+ result = do_kexec_load(entry, nr_segments, ksegments, flags);
+ kfree(ksegments);
return result;
}
@@ -272,7 +259,7 @@ COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry,
compat_ulong_t, flags)
{
struct compat_kexec_segment in;
- struct kexec_segment out, __user *ksegments;
+ struct kexec_segment *ksegments;
unsigned long i, result;
result = kexec_load_check(nr_segments, flags);
@@ -285,37 +272,26 @@ COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry,
if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT)
return -EINVAL;
- ksegments = compat_alloc_user_space(nr_segments * sizeof(out));
+ ksegments = kmalloc_array(nr_segments, sizeof(ksegments[0]),
+ GFP_KERNEL);
+ if (!ksegments)
+ return -ENOMEM;
+
for (i = 0; i < nr_segments; i++) {
result = copy_from_user(&in, &segments[i], sizeof(in));
if (result)
- return -EFAULT;
-
- out.buf = compat_ptr(in.buf);
- out.bufsz = in.bufsz;
- out.mem = in.mem;
- out.memsz = in.memsz;
+ goto fail;
- result = copy_to_user(&ksegments[i], &out, sizeof(out));
- if (result)
- return -EFAULT;
+ ksegments[i].buf = compat_ptr(in.buf);
+ ksegments[i].bufsz = in.bufsz;
+ ksegments[i].mem = in.mem;
+ ksegments[i].memsz = in.memsz;
}
- /* Because we write directly to the reserved memory
- * region when loading crash kernels we need a mutex here to
- * prevent multiple crash kernels from attempting to load
- * simultaneously, and to prevent a crash kernel from loading
- * over the top of a in use crash kernel.
- *
- * KISS: always take the mutex.
- */
- if (!mutex_trylock(&kexec_mutex))
- return -EBUSY;
-
result = do_kexec_load(entry, nr_segments, ksegments, flags);
- mutex_unlock(&kexec_mutex);
-
+fail:
+ kfree(ksegments);
return result;
}
#endif
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 4b34a9aa32bc..3d578c6fefee 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -6,6 +6,7 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/btf.h>
#include <linux/capability.h>
#include <linux/mm.h>
#include <linux/file.h>
@@ -46,7 +47,7 @@
#include <crypto/hash.h>
#include "kexec_internal.h"
-DEFINE_MUTEX(kexec_mutex);
+atomic_t __kexec_lock = ATOMIC_INIT(0);
/* Per cpu memory for storing cpu states in case of system crash. */
note_buf_t __percpu *crash_notes;
@@ -81,7 +82,7 @@ int kexec_should_crash(struct task_struct *p)
if (crash_kexec_post_notifiers)
return 0;
/*
- * There are 4 panic() calls in do_exit() path, each of which
+ * There are 4 panic() calls in make_task_dead() path, each of which
* corresponds to each of these 4 conditions.
*/
if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops)
@@ -561,23 +562,17 @@ static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
static int kimage_set_destination(struct kimage *image,
unsigned long destination)
{
- int result;
-
destination &= PAGE_MASK;
- result = kimage_add_entry(image, destination | IND_DESTINATION);
- return result;
+ return kimage_add_entry(image, destination | IND_DESTINATION);
}
static int kimage_add_page(struct kimage *image, unsigned long page)
{
- int result;
-
page &= PAGE_MASK;
- result = kimage_add_entry(image, page | IND_SOURCE);
- return result;
+ return kimage_add_entry(image, page | IND_SOURCE);
}
@@ -591,11 +586,6 @@ static void kimage_free_extra_pages(struct kimage *image)
}
-int __weak machine_kexec_post_load(struct kimage *image)
-{
- return 0;
-}
-
void kimage_terminate(struct kimage *image)
{
if (*image->entry != 0)
@@ -768,7 +758,6 @@ static struct page *kimage_alloc_page(struct kimage *image,
kimage_free_pages(old_page);
continue;
}
- addr = old_addr;
page = old_page;
break;
}
@@ -788,7 +777,6 @@ static int kimage_load_normal_segment(struct kimage *image,
unsigned char __user *buf = NULL;
unsigned char *kbuf = NULL;
- result = 0;
if (image->file_mode)
kbuf = segment->kbuf;
else
@@ -816,7 +804,7 @@ static int kimage_load_normal_segment(struct kimage *image,
if (result < 0)
goto out;
- ptr = kmap(page);
+ ptr = kmap_local_page(page);
/* Start with a clear page */
clear_page(ptr);
ptr += maddr & ~PAGE_MASK;
@@ -829,7 +817,7 @@ static int kimage_load_normal_segment(struct kimage *image,
memcpy(ptr, kbuf, uchunk);
else
result = copy_from_user(ptr, buf, uchunk);
- kunmap(page);
+ kunmap_local(ptr);
if (result) {
result = -EFAULT;
goto out;
@@ -880,7 +868,7 @@ static int kimage_load_crash_segment(struct kimage *image,
goto out;
}
arch_kexec_post_alloc_pages(page_address(page), 1, 0);
- ptr = kmap(page);
+ ptr = kmap_local_page(page);
ptr += maddr & ~PAGE_MASK;
mchunk = min_t(size_t, mbytes,
PAGE_SIZE - (maddr & ~PAGE_MASK));
@@ -896,7 +884,7 @@ static int kimage_load_crash_segment(struct kimage *image,
else
result = copy_from_user(ptr, buf, uchunk);
kexec_flush_icache_page(page);
- kunmap(page);
+ kunmap_local(ptr);
arch_kexec_pre_free_pages(page_address(page), 1);
if (result) {
result = -EFAULT;
@@ -933,9 +921,123 @@ int kimage_load_segment(struct kimage *image,
return result;
}
+struct kexec_load_limit {
+ /* Mutex protects the limit count. */
+ struct mutex mutex;
+ int limit;
+};
+
+static struct kexec_load_limit load_limit_reboot = {
+ .mutex = __MUTEX_INITIALIZER(load_limit_reboot.mutex),
+ .limit = -1,
+};
+
+static struct kexec_load_limit load_limit_panic = {
+ .mutex = __MUTEX_INITIALIZER(load_limit_panic.mutex),
+ .limit = -1,
+};
+
struct kimage *kexec_image;
struct kimage *kexec_crash_image;
-int kexec_load_disabled;
+static int kexec_load_disabled;
+
+#ifdef CONFIG_SYSCTL
+static int kexec_limit_handler(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct kexec_load_limit *limit = table->data;
+ int val;
+ struct ctl_table tmp = {
+ .data = &val,
+ .maxlen = sizeof(val),
+ .mode = table->mode,
+ };
+ int ret;
+
+ if (write) {
+ ret = proc_dointvec(&tmp, write, buffer, lenp, ppos);
+ if (ret)
+ return ret;
+
+ if (val < 0)
+ return -EINVAL;
+
+ mutex_lock(&limit->mutex);
+ if (limit->limit != -1 && val >= limit->limit)
+ ret = -EINVAL;
+ else
+ limit->limit = val;
+ mutex_unlock(&limit->mutex);
+
+ return ret;
+ }
+
+ mutex_lock(&limit->mutex);
+ val = limit->limit;
+ mutex_unlock(&limit->mutex);
+
+ return proc_dointvec(&tmp, write, buffer, lenp, ppos);
+}
+
+static struct ctl_table kexec_core_sysctls[] = {
+ {
+ .procname = "kexec_load_disabled",
+ .data = &kexec_load_disabled,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ /* only handle a transition from default "0" to "1" */
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ONE,
+ .extra2 = SYSCTL_ONE,
+ },
+ {
+ .procname = "kexec_load_limit_panic",
+ .data = &load_limit_panic,
+ .mode = 0644,
+ .proc_handler = kexec_limit_handler,
+ },
+ {
+ .procname = "kexec_load_limit_reboot",
+ .data = &load_limit_reboot,
+ .mode = 0644,
+ .proc_handler = kexec_limit_handler,
+ },
+ { }
+};
+
+static int __init kexec_core_sysctl_init(void)
+{
+ register_sysctl_init("kernel", kexec_core_sysctls);
+ return 0;
+}
+late_initcall(kexec_core_sysctl_init);
+#endif
+
+bool kexec_load_permitted(int kexec_image_type)
+{
+ struct kexec_load_limit *limit;
+
+ /*
+ * Only the superuser can use the kexec syscall and if it has not
+ * been disabled.
+ */
+ if (!capable(CAP_SYS_BOOT) || kexec_load_disabled)
+ return false;
+
+ /* Check limit counter and decrease it.*/
+ limit = (kexec_image_type == KEXEC_TYPE_CRASH) ?
+ &load_limit_panic : &load_limit_reboot;
+ mutex_lock(&limit->mutex);
+ if (!limit->limit) {
+ mutex_unlock(&limit->mutex);
+ return false;
+ }
+ if (limit->limit != -1)
+ limit->limit--;
+ mutex_unlock(&limit->mutex);
+
+ return true;
+}
/*
* No panic_cpu check version of crash_kexec(). This function is called
@@ -944,7 +1046,7 @@ int kexec_load_disabled;
*/
void __noclone __crash_kexec(struct pt_regs *regs)
{
- /* Take the kexec_mutex here to prevent sys_kexec_load
+ /* Take the kexec_lock here to prevent sys_kexec_load
* running on one cpu from replacing the crash kernel
* we are using after a panic on a different cpu.
*
@@ -952,7 +1054,7 @@ void __noclone __crash_kexec(struct pt_regs *regs)
* of memory the xchg(&kexec_crash_image) would be
* sufficient. But since I reuse the memory...
*/
- if (mutex_trylock(&kexec_mutex)) {
+ if (kexec_trylock()) {
if (kexec_crash_image) {
struct pt_regs fixed_regs;
@@ -961,12 +1063,12 @@ void __noclone __crash_kexec(struct pt_regs *regs)
machine_crash_shutdown(&fixed_regs);
machine_kexec(kexec_crash_image);
}
- mutex_unlock(&kexec_mutex);
+ kexec_unlock();
}
}
STACK_FRAME_NON_STANDARD(__crash_kexec);
-void crash_kexec(struct pt_regs *regs)
+__bpf_kfunc void crash_kexec(struct pt_regs *regs)
{
int old_cpu, this_cpu;
@@ -979,7 +1081,6 @@ void crash_kexec(struct pt_regs *regs)
old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, this_cpu);
if (old_cpu == PANIC_CPU_INVALID) {
/* This is the 1st CPU which comes here, so go ahead. */
- printk_safe_flush_on_panic();
__crash_kexec(regs);
/*
@@ -990,24 +1091,18 @@ void crash_kexec(struct pt_regs *regs)
}
}
-size_t crash_get_memory_size(void)
+ssize_t crash_get_memory_size(void)
{
- size_t size = 0;
+ ssize_t size = 0;
+
+ if (!kexec_trylock())
+ return -EBUSY;
- mutex_lock(&kexec_mutex);
if (crashk_res.end != crashk_res.start)
size = resource_size(&crashk_res);
- mutex_unlock(&kexec_mutex);
- return size;
-}
-
-void __weak crash_free_reserved_phys_range(unsigned long begin,
- unsigned long end)
-{
- unsigned long addr;
- for (addr = begin; addr < end; addr += PAGE_SIZE)
- free_reserved_page(boot_pfn_to_page(addr >> PAGE_SHIFT));
+ kexec_unlock();
+ return size;
}
int crash_shrink_memory(unsigned long new_size)
@@ -1017,7 +1112,8 @@ int crash_shrink_memory(unsigned long new_size)
unsigned long old_size;
struct resource *ram_res;
- mutex_lock(&kexec_mutex);
+ if (!kexec_trylock())
+ return -EBUSY;
if (kexec_crash_image) {
ret = -ENOENT;
@@ -1055,7 +1151,7 @@ int crash_shrink_memory(unsigned long new_size)
insert_resource(&iomem_resource, ram_res);
unlock:
- mutex_unlock(&kexec_mutex);
+ kexec_unlock();
return ret;
}
@@ -1079,7 +1175,7 @@ void crash_save_cpu(struct pt_regs *regs, int cpu)
return;
memset(&prstatus, 0, sizeof(prstatus));
prstatus.common.pr_pid = current->pid;
- elf_core_copy_kernel_regs(&prstatus.pr_reg, regs);
+ elf_core_copy_regs(&prstatus.pr_reg, regs);
buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS,
&prstatus, sizeof(prstatus));
final_note(buf);
@@ -1127,7 +1223,7 @@ int kernel_kexec(void)
{
int error = 0;
- if (!mutex_trylock(&kexec_mutex))
+ if (!kexec_trylock())
return -EBUSY;
if (!kexec_image) {
error = -EINVAL;
@@ -1203,19 +1299,6 @@ int kernel_kexec(void)
#endif
Unlock:
- mutex_unlock(&kexec_mutex);
+ kexec_unlock();
return error;
}
-
-/*
- * Protection mechanism for crashkernel reserved memory after
- * the kdump kernel is loaded.
- *
- * Provide an empty default implementation here -- architecture
- * code may override this
- */
-void __weak arch_kexec_protect_crashkres(void)
-{}
-
-void __weak arch_kexec_unprotect_crashkres(void)
-{}
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index 33400ff051a8..f989f5f1933b 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -29,8 +29,20 @@
#include <linux/vmalloc.h>
#include "kexec_internal.h"
+#ifdef CONFIG_KEXEC_SIG
+static bool sig_enforce = IS_ENABLED(CONFIG_KEXEC_SIG_FORCE);
+
+void set_kexec_sig_enforced(void)
+{
+ sig_enforce = true;
+}
+#endif
+
static int kexec_calculate_store_digests(struct kimage *image);
+/* Maximum size in bytes for kernel/initrd files. */
+#define KEXEC_FILE_SIZE_MAX min_t(s64, 4LL << 30, SSIZE_MAX)
+
/*
* Currently this is the only default function that is exported as some
* architectures need it to do additional handlings.
@@ -53,13 +65,6 @@ int kexec_image_probe_default(struct kimage *image, void *buf,
return ret;
}
-/* Architectures can provide this probe function */
-int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
- unsigned long buf_len)
-{
- return kexec_image_probe_default(image, buf, buf_len);
-}
-
static void *kexec_image_load_default(struct kimage *image)
{
if (!image->fops || !image->fops->load)
@@ -71,11 +76,6 @@ static void *kexec_image_load_default(struct kimage *image)
image->cmdline_buf_len);
}
-void * __weak arch_kexec_kernel_image_load(struct kimage *image)
-{
- return kexec_image_load_default(image);
-}
-
int kexec_image_post_load_cleanup_default(struct kimage *image)
{
if (!image->fops || !image->fops->cleanup)
@@ -84,64 +84,6 @@ int kexec_image_post_load_cleanup_default(struct kimage *image)
return image->fops->cleanup(image->image_loader_data);
}
-int __weak arch_kimage_file_post_load_cleanup(struct kimage *image)
-{
- return kexec_image_post_load_cleanup_default(image);
-}
-
-#ifdef CONFIG_KEXEC_SIG
-static int kexec_image_verify_sig_default(struct kimage *image, void *buf,
- unsigned long buf_len)
-{
- if (!image->fops || !image->fops->verify_sig) {
- pr_debug("kernel loader does not support signature verification.\n");
- return -EKEYREJECTED;
- }
-
- return image->fops->verify_sig(buf, buf_len);
-}
-
-int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf,
- unsigned long buf_len)
-{
- return kexec_image_verify_sig_default(image, buf, buf_len);
-}
-#endif
-
-/*
- * arch_kexec_apply_relocations_add - apply relocations of type RELA
- * @pi: Purgatory to be relocated.
- * @section: Section relocations applying to.
- * @relsec: Section containing RELAs.
- * @symtab: Corresponding symtab.
- *
- * Return: 0 on success, negative errno on error.
- */
-int __weak
-arch_kexec_apply_relocations_add(struct purgatory_info *pi, Elf_Shdr *section,
- const Elf_Shdr *relsec, const Elf_Shdr *symtab)
-{
- pr_err("RELA relocation unsupported.\n");
- return -ENOEXEC;
-}
-
-/*
- * arch_kexec_apply_relocations - apply relocations of type REL
- * @pi: Purgatory to be relocated.
- * @section: Section relocations applying to.
- * @relsec: Section containing RELs.
- * @symtab: Corresponding symtab.
- *
- * Return: 0 on success, negative errno on error.
- */
-int __weak
-arch_kexec_apply_relocations(struct purgatory_info *pi, Elf_Shdr *section,
- const Elf_Shdr *relsec, const Elf_Shdr *symtab)
-{
- pr_err("REL relocation unsupported.\n");
- return -ENOEXEC;
-}
-
/*
* Free up memory used by kernel, initrd, and command line. This is temporary
* memory allocation which is not needed any more after these buffers have
@@ -184,16 +126,44 @@ void kimage_file_post_load_cleanup(struct kimage *image)
}
#ifdef CONFIG_KEXEC_SIG
+#ifdef CONFIG_SIGNED_PE_FILE_VERIFICATION
+int kexec_kernel_verify_pe_sig(const char *kernel, unsigned long kernel_len)
+{
+ int ret;
+
+ ret = verify_pefile_signature(kernel, kernel_len,
+ VERIFY_USE_SECONDARY_KEYRING,
+ VERIFYING_KEXEC_PE_SIGNATURE);
+ if (ret == -ENOKEY && IS_ENABLED(CONFIG_INTEGRITY_PLATFORM_KEYRING)) {
+ ret = verify_pefile_signature(kernel, kernel_len,
+ VERIFY_USE_PLATFORM_KEYRING,
+ VERIFYING_KEXEC_PE_SIGNATURE);
+ }
+ return ret;
+}
+#endif
+
+static int kexec_image_verify_sig(struct kimage *image, void *buf,
+ unsigned long buf_len)
+{
+ if (!image->fops || !image->fops->verify_sig) {
+ pr_debug("kernel loader does not support signature verification.\n");
+ return -EKEYREJECTED;
+ }
+
+ return image->fops->verify_sig(buf, buf_len);
+}
+
static int
kimage_validate_signature(struct kimage *image)
{
int ret;
- ret = arch_kexec_kernel_verify_sig(image, image->kernel_buf,
- image->kernel_buf_len);
+ ret = kexec_image_verify_sig(image, image->kernel_buf,
+ image->kernel_buf_len);
if (ret) {
- if (IS_ENABLED(CONFIG_KEXEC_SIG_FORCE)) {
+ if (sig_enforce) {
pr_notice("Enforced kernel signature verification failed (%d).\n", ret);
return ret;
}
@@ -223,11 +193,12 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
const char __user *cmdline_ptr,
unsigned long cmdline_len, unsigned flags)
{
- int ret;
+ ssize_t ret;
void *ldata;
ret = kernel_read_file_from_fd(kernel_fd, 0, &image->kernel_buf,
- INT_MAX, NULL, READING_KEXEC_IMAGE);
+ KEXEC_FILE_SIZE_MAX, NULL,
+ READING_KEXEC_IMAGE);
if (ret < 0)
return ret;
image->kernel_buf_len = ret;
@@ -247,7 +218,7 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
/* It is possible that there no initramfs is being loaded */
if (!(flags & KEXEC_FILE_NO_INITRAMFS)) {
ret = kernel_read_file_from_fd(initrd_fd, 0, &image->initrd_buf,
- INT_MAX, NULL,
+ KEXEC_FILE_SIZE_MAX, NULL,
READING_KEXEC_INITRAMFS);
if (ret < 0)
goto out;
@@ -278,8 +249,8 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
/* IMA needs to pass the measurement list to the next kernel. */
ima_add_kexec_buffer(image);
- /* Call arch image load handlers */
- ldata = arch_kexec_kernel_image_load(image);
+ /* Call image load handler */
+ ldata = kexec_image_load_default(image);
if (IS_ERR(ldata)) {
ret = PTR_ERR(ldata);
@@ -355,11 +326,13 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
unsigned long, cmdline_len, const char __user *, cmdline_ptr,
unsigned long, flags)
{
- int ret = 0, i;
+ int image_type = (flags & KEXEC_FILE_ON_CRASH) ?
+ KEXEC_TYPE_CRASH : KEXEC_TYPE_DEFAULT;
struct kimage **dest_image, *image;
+ int ret = 0, i;
/* We only trust the superuser with rebooting the system. */
- if (!capable(CAP_SYS_BOOT) || kexec_load_disabled)
+ if (!kexec_load_permitted(image_type))
return -EPERM;
/* Make sure we have a legal set of flags */
@@ -368,14 +341,15 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
image = NULL;
- if (!mutex_trylock(&kexec_mutex))
+ if (!kexec_trylock())
return -EBUSY;
- dest_image = &kexec_image;
- if (flags & KEXEC_FILE_ON_CRASH) {
+ if (image_type == KEXEC_TYPE_CRASH) {
dest_image = &kexec_crash_image;
if (kexec_crash_image)
arch_kexec_unprotect_crashkres();
+ } else {
+ dest_image = &kexec_image;
}
if (flags & KEXEC_FILE_UNLOAD)
@@ -440,7 +414,7 @@ out:
if ((flags & KEXEC_FILE_ON_CRASH) && kexec_crash_image)
arch_kexec_protect_crashkres();
- mutex_unlock(&kexec_mutex);
+ kexec_unlock();
kimage_free(image);
return ret;
}
@@ -556,6 +530,11 @@ static int kexec_walk_memblock(struct kexec_buf *kbuf,
if (kbuf->image->type == KEXEC_TYPE_CRASH)
return func(&crashk_res, kbuf);
+ /*
+ * Using MEMBLOCK_NONE will properly skip MEMBLOCK_DRIVER_MANAGED. See
+ * IORESOURCE_SYSRAM_DRIVER_MANAGED handling in
+ * locate_mem_hole_callback().
+ */
if (kbuf->top_down) {
for_each_free_mem_range_reverse(i, NUMA_NO_NODE, MEMBLOCK_NONE,
&mstart, &mend, NULL) {
@@ -642,19 +621,6 @@ int kexec_locate_mem_hole(struct kexec_buf *kbuf)
}
/**
- * arch_kexec_locate_mem_hole - Find free memory to place the segments.
- * @kbuf: Parameters for the memory search.
- *
- * On success, kbuf->mem will have the start address of the memory region found.
- *
- * Return: 0 on success, negative errno on error.
- */
-int __weak arch_kexec_locate_mem_hole(struct kexec_buf *kbuf)
-{
- return kexec_locate_mem_hole(kbuf);
-}
-
-/**
* kexec_add_buffer - place a buffer in a kexec segment
* @kbuf: Buffer contents and memory parameters.
*
@@ -1178,7 +1144,7 @@ int crash_exclude_mem_range(struct crash_mem *mem,
{
int i, j;
unsigned long long start, end, p_start, p_end;
- struct crash_mem_range temp_range = {0, 0};
+ struct range temp_range = {0, 0};
for (i = 0; i < mem->nr_ranges; i++) {
start = mem->ranges[i].start;
@@ -1255,7 +1221,7 @@ int crash_exclude_mem_range(struct crash_mem *mem,
return 0;
}
-int crash_prepare_elf64_headers(struct crash_mem *mem, int kernel_map,
+int crash_prepare_elf64_headers(struct crash_mem *mem, int need_kernel_map,
void **addr, unsigned long *sz)
{
Elf64_Ehdr *ehdr;
@@ -1319,7 +1285,7 @@ int crash_prepare_elf64_headers(struct crash_mem *mem, int kernel_map,
phdr++;
/* Prepare PT_LOAD type program header for kernel text region */
- if (kernel_map) {
+ if (need_kernel_map) {
phdr->p_type = PT_LOAD;
phdr->p_flags = PF_R|PF_W|PF_X;
phdr->p_vaddr = (unsigned long) _text;
diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h
index 48aaf2ac0d0d..74da1409cd14 100644
--- a/kernel/kexec_internal.h
+++ b/kernel/kexec_internal.h
@@ -13,7 +13,20 @@ void kimage_terminate(struct kimage *image);
int kimage_is_destination_range(struct kimage *image,
unsigned long start, unsigned long end);
-extern struct mutex kexec_mutex;
+/*
+ * Whatever is used to serialize accesses to the kexec_crash_image needs to be
+ * NMI safe, as __crash_kexec() can happen during nmi_panic(), so here we use a
+ * "simple" atomic variable that is acquired with a cmpxchg().
+ */
+extern atomic_t __kexec_lock;
+static inline bool kexec_trylock(void)
+{
+ return atomic_cmpxchg_acquire(&__kexec_lock, 0, 1) == 0;
+}
+static inline void kexec_unlock(void)
+{
+ atomic_set_release(&__kexec_lock, 0);
+}
#ifdef CONFIG_KEXEC_FILE
#include <linux/purgatory.h>
diff --git a/kernel/kheaders.c b/kernel/kheaders.c
index 8f69772af77b..42163c9e94e5 100644
--- a/kernel/kheaders.c
+++ b/kernel/kheaders.c
@@ -26,15 +26,15 @@ asm (
" .popsection \n"
);
-extern char kernel_headers_data;
-extern char kernel_headers_data_end;
+extern char kernel_headers_data[];
+extern char kernel_headers_data_end[];
static ssize_t
ikheaders_read(struct file *file, struct kobject *kobj,
struct bin_attribute *bin_attr,
char *buf, loff_t off, size_t len)
{
- memcpy(buf, &kernel_headers_data + off, len);
+ memcpy(buf, &kernel_headers_data[off], len);
return len;
}
@@ -48,8 +48,8 @@ static struct bin_attribute kheaders_attr __ro_after_init = {
static int __init ikheaders_init(void)
{
- kheaders_attr.size = (&kernel_headers_data_end -
- &kernel_headers_data);
+ kheaders_attr.size = (kernel_headers_data_end -
+ kernel_headers_data);
return sysfs_create_bin_file(kernel_kobj, &kheaders_attr);
}
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 790a573bbe00..00e177de91cc 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1,7 +1,6 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Kernel Probes (KProbes)
- * kernel/kprobes.c
*
* Copyright (C) IBM Corporation, 2002, 2004
*
@@ -18,6 +17,9 @@
* <jkenisto@us.ibm.com> and Prasanna S Panchamukhi
* <prasanna@in.ibm.com> added function-return probes.
*/
+
+#define pr_fmt(fmt) "kprobes: " fmt
+
#include <linux/kprobes.h>
#include <linux/hash.h>
#include <linux/init.h>
@@ -46,21 +48,24 @@
#define KPROBE_HASH_BITS 6
#define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS)
+#if !defined(CONFIG_OPTPROBES) || !defined(CONFIG_SYSCTL)
+#define kprobe_sysctls_init() do { } while (0)
+#endif
static int kprobes_initialized;
/* kprobe_table can be accessed by
- * - Normal hlist traversal and RCU add/del under kprobe_mutex is held.
+ * - Normal hlist traversal and RCU add/del under 'kprobe_mutex' is held.
* Or
* - RCU hlist traversal under disabling preempt (breakpoint handlers)
*/
static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
-/* NOTE: change this value only with kprobe_mutex held */
+/* NOTE: change this value only with 'kprobe_mutex' held */
static bool kprobes_all_disarmed;
-/* This protects kprobe_table and optimizing_list */
+/* This protects 'kprobe_table' and 'optimizing_list' */
static DEFINE_MUTEX(kprobe_mutex);
-static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
+static DEFINE_PER_CPU(struct kprobe *, kprobe_instance);
kprobe_opcode_t * __weak kprobe_lookup_name(const char *name,
unsigned int __unused)
@@ -68,12 +73,15 @@ kprobe_opcode_t * __weak kprobe_lookup_name(const char *name,
return ((kprobe_opcode_t *)(kallsyms_lookup_name(name)));
}
-/* Blacklist -- list of struct kprobe_blacklist_entry */
+/*
+ * Blacklist -- list of 'struct kprobe_blacklist_entry' to store info where
+ * kprobes can not probe.
+ */
static LIST_HEAD(kprobe_blacklist);
#ifdef __ARCH_WANT_KPROBES_INSN_SLOT
/*
- * kprobe->ainsn.insn points to the copy of the instruction to be
+ * 'kprobe::ainsn.insn' points to the copy of the instruction to be
* single-stepped. x86_64, POWER4 and above have no-exec support and
* stepping on the instruction on a vmalloced/kmalloced/data page
* is a recipe for disaster
@@ -104,6 +112,12 @@ enum kprobe_slot_state {
void __weak *alloc_insn_page(void)
{
+ /*
+ * Use module_alloc() so this page is within +/- 2GB of where the
+ * kernel image and loaded module images reside. This is required
+ * for most of the architectures.
+ * (e.g. x86-64 needs this to handle the %rip-relative fixups.)
+ */
return module_alloc(PAGE_SIZE);
}
@@ -139,6 +153,7 @@ kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c)
list_for_each_entry_rcu(kip, &c->pages, list) {
if (kip->nused < slots_per_page(c)) {
int i;
+
for (i = 0; i < slots_per_page(c); i++) {
if (kip->slot_used[i] == SLOT_CLEAN) {
kip->slot_used[i] = SLOT_USED;
@@ -164,11 +179,6 @@ kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c)
if (!kip)
goto out;
- /*
- * Use module_alloc so this page is within +/- 2GB of where the
- * kernel image and loaded module images reside. This is required
- * so x86_64 can correctly handle the %rip-relative fixups.
- */
kip->insns = c->alloc();
if (!kip->insns) {
kfree(kip);
@@ -191,8 +201,8 @@ out:
return slot;
}
-/* Return 1 if all garbages are collected, otherwise 0. */
-static int collect_one_slot(struct kprobe_insn_page *kip, int idx)
+/* Return true if all garbages are collected, otherwise false. */
+static bool collect_one_slot(struct kprobe_insn_page *kip, int idx)
{
kip->slot_used[idx] = SLOT_CLEAN;
kip->nused--;
@@ -216,9 +226,9 @@ static int collect_one_slot(struct kprobe_insn_page *kip, int idx)
kip->cache->free(kip->insns);
kfree(kip);
}
- return 1;
+ return true;
}
- return 0;
+ return false;
}
static int collect_garbage_slots(struct kprobe_insn_cache *c)
@@ -230,6 +240,7 @@ static int collect_garbage_slots(struct kprobe_insn_cache *c)
list_for_each_entry_safe(kip, next, &c->pages, list) {
int i;
+
if (kip->ngarbage == 0)
continue;
kip->ngarbage = 0; /* we will collect all garbages */
@@ -310,7 +321,7 @@ int kprobe_cache_get_kallsym(struct kprobe_insn_cache *c, unsigned int *symnum,
list_for_each_entry_rcu(kip, &c->pages, list) {
if ((*symnum)--)
continue;
- strlcpy(sym, c->sym, KSYM_NAME_LEN);
+ strscpy(sym, c->sym, KSYM_NAME_LEN);
*type = 't';
*value = (unsigned long)kip->insns;
ret = 0;
@@ -358,9 +369,9 @@ static inline void reset_kprobe_instance(void)
/*
* This routine is called either:
- * - under the kprobe_mutex - during kprobe_[un]register()
- * OR
- * - with preemption disabled - from arch/xxx/kernel/kprobes.c
+ * - under the 'kprobe_mutex' - during kprobe_[un]register().
+ * OR
+ * - with preemption disabled - from architecture specific code.
*/
struct kprobe *get_kprobe(void *addr)
{
@@ -380,22 +391,20 @@ NOKPROBE_SYMBOL(get_kprobe);
static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs);
-/* Return true if the kprobe is an aggregator */
-static inline int kprobe_aggrprobe(struct kprobe *p)
+/* Return true if 'p' is an aggregator */
+static inline bool kprobe_aggrprobe(struct kprobe *p)
{
return p->pre_handler == aggr_pre_handler;
}
-/* Return true(!0) if the kprobe is unused */
-static inline int kprobe_unused(struct kprobe *p)
+/* Return true if 'p' is unused */
+static inline bool kprobe_unused(struct kprobe *p)
{
return kprobe_aggrprobe(p) && kprobe_disabled(p) &&
list_empty(&p->list);
}
-/*
- * Keep all fields in the kprobe consistent
- */
+/* Keep all fields in the kprobe consistent. */
static inline void copy_kprobe(struct kprobe *ap, struct kprobe *p)
{
memcpy(&p->opcode, &ap->opcode, sizeof(kprobe_opcode_t));
@@ -403,11 +412,11 @@ static inline void copy_kprobe(struct kprobe *ap, struct kprobe *p)
}
#ifdef CONFIG_OPTPROBES
-/* NOTE: change this value only with kprobe_mutex held */
+/* NOTE: This is protected by 'kprobe_mutex'. */
static bool kprobes_allow_optimization;
/*
- * Call all pre_handler on the list, but ignores its return value.
+ * Call all 'kprobe::pre_handler' on the list, but ignores its return value.
* This must be called from arch-dep optimized caller.
*/
void opt_pre_handler(struct kprobe *p, struct pt_regs *regs)
@@ -435,7 +444,7 @@ static void free_aggr_kprobe(struct kprobe *p)
kfree(op);
}
-/* Return true(!0) if the kprobe is ready for optimization. */
+/* Return true if the kprobe is ready for optimization. */
static inline int kprobe_optready(struct kprobe *p)
{
struct optimized_kprobe *op;
@@ -448,8 +457,8 @@ static inline int kprobe_optready(struct kprobe *p)
return 0;
}
-/* Return true(!0) if the kprobe is disarmed. Note: p must be on hash list */
-static inline int kprobe_disarmed(struct kprobe *p)
+/* Return true if the kprobe is disarmed. Note: p must be on hash list */
+bool kprobe_disarmed(struct kprobe *p)
{
struct optimized_kprobe *op;
@@ -462,32 +471,32 @@ static inline int kprobe_disarmed(struct kprobe *p)
return kprobe_disabled(p) && list_empty(&op->list);
}
-/* Return true(!0) if the probe is queued on (un)optimizing lists */
-static int kprobe_queued(struct kprobe *p)
+/* Return true if the probe is queued on (un)optimizing lists */
+static bool kprobe_queued(struct kprobe *p)
{
struct optimized_kprobe *op;
if (kprobe_aggrprobe(p)) {
op = container_of(p, struct optimized_kprobe, kp);
if (!list_empty(&op->list))
- return 1;
+ return true;
}
- return 0;
+ return false;
}
/*
* Return an optimized kprobe whose optimizing code replaces
- * instructions including addr (exclude breakpoint).
+ * instructions including 'addr' (exclude breakpoint).
*/
-static struct kprobe *get_optimized_kprobe(unsigned long addr)
+static struct kprobe *get_optimized_kprobe(kprobe_opcode_t *addr)
{
int i;
struct kprobe *p = NULL;
struct optimized_kprobe *op;
/* Don't check i == 0, since that is a breakpoint case. */
- for (i = 1; !p && i < MAX_OPTIMIZED_LENGTH; i++)
- p = get_kprobe((void *)(addr - i));
+ for (i = 1; !p && i < MAX_OPTIMIZED_LENGTH / sizeof(kprobe_opcode_t); i++)
+ p = get_kprobe(addr - i);
if (p && kprobe_optready(p)) {
op = container_of(p, struct optimized_kprobe, kp);
@@ -498,7 +507,7 @@ static struct kprobe *get_optimized_kprobe(unsigned long addr)
return NULL;
}
-/* Optimization staging list, protected by kprobe_mutex */
+/* Optimization staging list, protected by 'kprobe_mutex' */
static LIST_HEAD(optimizing_list);
static LIST_HEAD(unoptimizing_list);
static LIST_HEAD(freeing_list);
@@ -509,20 +518,20 @@ static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer);
/*
* Optimize (replace a breakpoint with a jump) kprobes listed on
- * optimizing_list.
+ * 'optimizing_list'.
*/
static void do_optimize_kprobes(void)
{
lockdep_assert_held(&text_mutex);
/*
- * The optimization/unoptimization refers online_cpus via
- * stop_machine() and cpu-hotplug modifies online_cpus.
- * And same time, text_mutex will be held in cpu-hotplug and here.
- * This combination can cause a deadlock (cpu-hotplug try to lock
- * text_mutex but stop_machine can not be done because online_cpus
- * has been changed)
- * To avoid this deadlock, caller must have locked cpu hotplug
- * for preventing cpu-hotplug outside of text_mutex locking.
+ * The optimization/unoptimization refers 'online_cpus' via
+ * stop_machine() and cpu-hotplug modifies the 'online_cpus'.
+ * And same time, 'text_mutex' will be held in cpu-hotplug and here.
+ * This combination can cause a deadlock (cpu-hotplug tries to lock
+ * 'text_mutex' but stop_machine() can not be done because
+ * the 'online_cpus' has been changed)
+ * To avoid this deadlock, caller must have locked cpu-hotplug
+ * for preventing cpu-hotplug outside of 'text_mutex' locking.
*/
lockdep_assert_cpus_held();
@@ -536,7 +545,7 @@ static void do_optimize_kprobes(void)
/*
* Unoptimize (replace a jump with a breakpoint and remove the breakpoint
- * if need) kprobes listed on unoptimizing_list.
+ * if need) kprobes listed on 'unoptimizing_list'.
*/
static void do_unoptimize_kprobes(void)
{
@@ -546,23 +555,21 @@ static void do_unoptimize_kprobes(void)
/* See comment in do_optimize_kprobes() */
lockdep_assert_cpus_held();
- /* Unoptimization must be done anytime */
- if (list_empty(&unoptimizing_list))
- return;
+ if (!list_empty(&unoptimizing_list))
+ arch_unoptimize_kprobes(&unoptimizing_list, &freeing_list);
- arch_unoptimize_kprobes(&unoptimizing_list, &freeing_list);
- /* Loop free_list for disarming */
+ /* Loop on 'freeing_list' for disarming and removing from kprobe hash list */
list_for_each_entry_safe(op, tmp, &freeing_list, list) {
/* Switching from detour code to origin */
op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
- /* Disarm probes if marked disabled */
- if (kprobe_disabled(&op->kp))
+ /* Disarm probes if marked disabled and not gone */
+ if (kprobe_disabled(&op->kp) && !kprobe_gone(&op->kp))
arch_disarm_kprobe(&op->kp);
if (kprobe_unused(&op->kp)) {
/*
* Remove unused probes from hash list. After waiting
* for synchronization, these probes are reclaimed.
- * (reclaiming is done by do_free_cleaned_kprobes.)
+ * (reclaiming is done by do_free_cleaned_kprobes().)
*/
hlist_del_rcu(&op->kp.hlist);
} else
@@ -570,7 +577,7 @@ static void do_unoptimize_kprobes(void)
}
}
-/* Reclaim all kprobes on the free_list */
+/* Reclaim all kprobes on the 'freeing_list' */
static void do_free_cleaned_kprobes(void)
{
struct optimized_kprobe *op, *tmp;
@@ -642,9 +649,9 @@ void wait_for_kprobe_optimizer(void)
while (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list)) {
mutex_unlock(&kprobe_mutex);
- /* this will also make optimizing_work execute immmediately */
+ /* This will also make 'optimizing_work' execute immmediately */
flush_delayed_work(&optimizing_work);
- /* @optimizing_work might not have been queued yet, relax */
+ /* 'optimizing_work' might not have been queued yet, relax */
cpu_relax();
mutex_lock(&kprobe_mutex);
@@ -653,7 +660,7 @@ void wait_for_kprobe_optimizer(void)
mutex_unlock(&kprobe_mutex);
}
-static bool optprobe_queued_unopt(struct optimized_kprobe *op)
+bool optprobe_queued_unopt(struct optimized_kprobe *op)
{
struct optimized_kprobe *_op;
@@ -675,7 +682,7 @@ static void optimize_kprobe(struct kprobe *p)
(kprobe_disabled(p) || kprobes_all_disarmed))
return;
- /* kprobes with post_handler can not be optimized */
+ /* kprobes with 'post_handler' can not be optimized */
if (p->post_handler)
return;
@@ -695,7 +702,10 @@ static void optimize_kprobe(struct kprobe *p)
}
op->kp.flags |= KPROBE_FLAG_OPTIMIZED;
- /* On unoptimizing/optimizing_list, op must have OPTIMIZED flag */
+ /*
+ * On the 'unoptimizing_list' and 'optimizing_list',
+ * 'op' must have OPTIMIZED flag
+ */
if (WARN_ON_ONCE(!list_empty(&op->list)))
return;
@@ -765,7 +775,7 @@ static int reuse_unused_kprobe(struct kprobe *ap)
WARN_ON_ONCE(list_empty(&op->list));
/* Enable the probe again */
ap->flags &= ~KPROBE_FLAG_DISABLED;
- /* Optimize it again (remove from op->list) */
+ /* Optimize it again. (remove from 'op->list') */
if (!kprobe_optready(ap))
return -EINVAL;
@@ -785,14 +795,13 @@ static void kill_optimized_kprobe(struct kprobe *p)
op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
if (kprobe_unused(p)) {
- /* Enqueue if it is unused */
- list_add(&op->list, &freeing_list);
/*
- * Remove unused probes from the hash list. After waiting
- * for synchronization, this probe is reclaimed.
- * (reclaiming is done by do_free_cleaned_kprobes().)
+ * Unused kprobe is on unoptimizing or freeing list. We move it
+ * to freeing_list and let the kprobe_optimizer() remove it from
+ * the kprobe hash list and free it.
*/
- hlist_del_rcu(&op->kp.hlist);
+ if (optprobe_queued_unopt(op))
+ list_move(&op->list, &freeing_list);
}
/* Don't touch the code, because it is already freed. */
@@ -815,7 +824,7 @@ static void prepare_optimized_kprobe(struct kprobe *p)
__prepare_optimized_kprobe(op, p);
}
-/* Allocate new optimized_kprobe and try to prepare optimized instructions */
+/* Allocate new optimized_kprobe and try to prepare optimized instructions. */
static struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
{
struct optimized_kprobe *op;
@@ -834,19 +843,19 @@ static struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
static void init_aggr_kprobe(struct kprobe *ap, struct kprobe *p);
/*
- * Prepare an optimized_kprobe and optimize it
- * NOTE: p must be a normal registered kprobe
+ * Prepare an optimized_kprobe and optimize it.
+ * NOTE: 'p' must be a normal registered kprobe.
*/
static void try_to_optimize_kprobe(struct kprobe *p)
{
struct kprobe *ap;
struct optimized_kprobe *op;
- /* Impossible to optimize ftrace-based kprobe */
+ /* Impossible to optimize ftrace-based kprobe. */
if (kprobe_ftrace(p))
return;
- /* For preparing optimization, jump_label_text_reserved() is called */
+ /* For preparing optimization, jump_label_text_reserved() is called. */
cpus_read_lock();
jump_label_lock();
mutex_lock(&text_mutex);
@@ -857,14 +866,14 @@ static void try_to_optimize_kprobe(struct kprobe *p)
op = container_of(ap, struct optimized_kprobe, kp);
if (!arch_prepared_optinsn(&op->optinsn)) {
- /* If failed to setup optimizing, fallback to kprobe */
+ /* If failed to setup optimizing, fallback to kprobe. */
arch_remove_optimized_kprobe(op);
kfree(op);
goto out;
}
init_aggr_kprobe(ap, p);
- optimize_kprobe(ap); /* This just kicks optimizer thread */
+ optimize_kprobe(ap); /* This just kicks optimizer thread. */
out:
mutex_unlock(&text_mutex);
@@ -879,7 +888,7 @@ static void optimize_all_kprobes(void)
unsigned int i;
mutex_lock(&kprobe_mutex);
- /* If optimization is already allowed, just return */
+ /* If optimization is already allowed, just return. */
if (kprobes_allow_optimization)
goto out;
@@ -892,7 +901,7 @@ static void optimize_all_kprobes(void)
optimize_kprobe(p);
}
cpus_read_unlock();
- printk(KERN_INFO "Kprobes globally optimized\n");
+ pr_info("kprobe jump-optimization is enabled. All kprobes are optimized if possible.\n");
out:
mutex_unlock(&kprobe_mutex);
}
@@ -905,7 +914,7 @@ static void unoptimize_all_kprobes(void)
unsigned int i;
mutex_lock(&kprobe_mutex);
- /* If optimization is already prohibited, just return */
+ /* If optimization is already prohibited, just return. */
if (!kprobes_allow_optimization) {
mutex_unlock(&kprobe_mutex);
return;
@@ -923,16 +932,16 @@ static void unoptimize_all_kprobes(void)
cpus_read_unlock();
mutex_unlock(&kprobe_mutex);
- /* Wait for unoptimizing completion */
+ /* Wait for unoptimizing completion. */
wait_for_kprobe_optimizer();
- printk(KERN_INFO "Kprobes globally unoptimized\n");
+ pr_info("kprobe jump-optimization is disabled. All kprobes are based on software breakpoint.\n");
}
static DEFINE_MUTEX(kprobe_sysctl_mutex);
-int sysctl_kprobes_optimization;
-int proc_kprobes_optimization_handler(struct ctl_table *table, int write,
- void *buffer, size_t *length,
- loff_t *ppos)
+static int sysctl_kprobes_optimization;
+static int proc_kprobes_optimization_handler(struct ctl_table *table,
+ int write, void *buffer,
+ size_t *length, loff_t *ppos)
{
int ret;
@@ -948,15 +957,35 @@ int proc_kprobes_optimization_handler(struct ctl_table *table, int write,
return ret;
}
+
+static struct ctl_table kprobe_sysctls[] = {
+ {
+ .procname = "kprobes-optimization",
+ .data = &sysctl_kprobes_optimization,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_kprobes_optimization_handler,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ {}
+};
+
+static void __init kprobe_sysctls_init(void)
+{
+ register_sysctl_init("debug", kprobe_sysctls);
+}
#endif /* CONFIG_SYSCTL */
-/* Put a breakpoint for a probe. Must be called with text_mutex locked */
+/* Put a breakpoint for a probe. */
static void __arm_kprobe(struct kprobe *p)
{
struct kprobe *_p;
- /* Check collision with other optimized kprobes */
- _p = get_optimized_kprobe((unsigned long)p->addr);
+ lockdep_assert_held(&text_mutex);
+
+ /* Find the overlapping optimized kprobes. */
+ _p = get_optimized_kprobe(p->addr);
if (unlikely(_p))
/* Fallback to unoptimized kprobe */
unoptimize_kprobe(_p, true);
@@ -965,22 +994,29 @@ static void __arm_kprobe(struct kprobe *p)
optimize_kprobe(p); /* Try to optimize (add kprobe to a list) */
}
-/* Remove the breakpoint of a probe. Must be called with text_mutex locked */
+/* Remove the breakpoint of a probe. */
static void __disarm_kprobe(struct kprobe *p, bool reopt)
{
struct kprobe *_p;
+ lockdep_assert_held(&text_mutex);
+
/* Try to unoptimize */
unoptimize_kprobe(p, kprobes_all_disarmed);
if (!kprobe_queued(p)) {
arch_disarm_kprobe(p);
- /* If another kprobe was blocked, optimize it. */
- _p = get_optimized_kprobe((unsigned long)p->addr);
+ /* If another kprobe was blocked, re-optimize it. */
+ _p = get_optimized_kprobe(p->addr);
if (unlikely(_p) && reopt)
optimize_kprobe(_p);
}
- /* TODO: reoptimize others after unoptimized this probe */
+ /*
+ * TODO: Since unoptimization and real disarming will be done by
+ * the worker thread, we can not check whether another probe are
+ * unoptimized because of this probe here. It should be re-optimized
+ * by the worker thread.
+ */
}
#else /* !CONFIG_OPTPROBES */
@@ -1003,7 +1039,7 @@ static int reuse_unused_kprobe(struct kprobe *ap)
* unregistered.
* Thus there should be no chance to reuse unused kprobe.
*/
- printk(KERN_ERR "Error: There should be no unused kprobe here.\n");
+ WARN_ON_ONCE(1);
return -EINVAL;
}
@@ -1033,34 +1069,21 @@ static struct ftrace_ops kprobe_ipmodify_ops __read_mostly = {
static int kprobe_ipmodify_enabled;
static int kprobe_ftrace_enabled;
-/* Must ensure p->addr is really on ftrace */
-static int prepare_kprobe(struct kprobe *p)
-{
- if (!kprobe_ftrace(p))
- return arch_prepare_kprobe(p);
-
- return arch_prepare_kprobe_ftrace(p);
-}
-
-/* Caller must lock kprobe_mutex */
static int __arm_kprobe_ftrace(struct kprobe *p, struct ftrace_ops *ops,
int *cnt)
{
int ret = 0;
+ lockdep_assert_held(&kprobe_mutex);
+
ret = ftrace_set_filter_ip(ops, (unsigned long)p->addr, 0, 0);
- if (ret) {
- pr_debug("Failed to arm kprobe-ftrace at %pS (%d)\n",
- p->addr, ret);
+ if (WARN_ONCE(ret < 0, "Failed to arm kprobe-ftrace at %pS (error %d)\n", p->addr, ret))
return ret;
- }
if (*cnt == 0) {
ret = register_ftrace_function(ops);
- if (ret) {
- pr_debug("Failed to init kprobe-ftrace (%d)\n", ret);
+ if (WARN(ret < 0, "Failed to register kprobe-ftrace (error %d)\n", ret))
goto err_ftrace;
- }
}
(*cnt)++;
@@ -1084,22 +1107,23 @@ static int arm_kprobe_ftrace(struct kprobe *p)
ipmodify ? &kprobe_ipmodify_enabled : &kprobe_ftrace_enabled);
}
-/* Caller must lock kprobe_mutex */
static int __disarm_kprobe_ftrace(struct kprobe *p, struct ftrace_ops *ops,
int *cnt)
{
int ret = 0;
+ lockdep_assert_held(&kprobe_mutex);
+
if (*cnt == 1) {
ret = unregister_ftrace_function(ops);
- if (WARN(ret < 0, "Failed to unregister kprobe-ftrace (%d)\n", ret))
+ if (WARN(ret < 0, "Failed to unregister kprobe-ftrace (error %d)\n", ret))
return ret;
}
(*cnt)--;
ret = ftrace_set_filter_ip(ops, (unsigned long)p->addr, 1, 0);
- WARN_ONCE(ret < 0, "Failed to disarm kprobe-ftrace at %pS (%d)\n",
+ WARN_ONCE(ret < 0, "Failed to disarm kprobe-ftrace at %pS (error %d)\n",
p->addr, ret);
return ret;
}
@@ -1113,11 +1137,6 @@ static int disarm_kprobe_ftrace(struct kprobe *p)
ipmodify ? &kprobe_ipmodify_enabled : &kprobe_ftrace_enabled);
}
#else /* !CONFIG_KPROBES_ON_FTRACE */
-static inline int prepare_kprobe(struct kprobe *p)
-{
- return arch_prepare_kprobe(p);
-}
-
static inline int arm_kprobe_ftrace(struct kprobe *p)
{
return -ENODEV;
@@ -1129,7 +1148,15 @@ static inline int disarm_kprobe_ftrace(struct kprobe *p)
}
#endif
-/* Arm a kprobe with text_mutex */
+static int prepare_kprobe(struct kprobe *p)
+{
+ /* Must ensure p->addr is really on ftrace */
+ if (kprobe_ftrace(p))
+ return arch_prepare_kprobe_ftrace(p);
+
+ return arch_prepare_kprobe(p);
+}
+
static int arm_kprobe(struct kprobe *kp)
{
if (unlikely(kprobe_ftrace(kp)))
@@ -1144,7 +1171,6 @@ static int arm_kprobe(struct kprobe *kp)
return 0;
}
-/* Disarm a kprobe with text_mutex */
static int disarm_kprobe(struct kprobe *kp, bool reopt)
{
if (unlikely(kprobe_ftrace(kp)))
@@ -1194,41 +1220,20 @@ static void aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
}
NOKPROBE_SYMBOL(aggr_post_handler);
-/* Walks the list and increments nmissed count for multiprobe case */
+/* Walks the list and increments 'nmissed' if 'p' has child probes. */
void kprobes_inc_nmissed_count(struct kprobe *p)
{
struct kprobe *kp;
+
if (!kprobe_aggrprobe(p)) {
p->nmissed++;
} else {
list_for_each_entry_rcu(kp, &p->list, list)
kp->nmissed++;
}
- return;
}
NOKPROBE_SYMBOL(kprobes_inc_nmissed_count);
-static void free_rp_inst_rcu(struct rcu_head *head)
-{
- struct kretprobe_instance *ri = container_of(head, struct kretprobe_instance, rcu);
-
- if (refcount_dec_and_test(&ri->rph->ref))
- kfree(ri->rph);
- kfree(ri);
-}
-NOKPROBE_SYMBOL(free_rp_inst_rcu);
-
-static void recycle_rp_inst(struct kretprobe_instance *ri)
-{
- struct kretprobe *rp = get_kretprobe(ri);
-
- if (likely(rp)) {
- freelist_add(&ri->freelist, &rp->freelist);
- } else
- call_rcu(&ri->rcu, free_rp_inst_rcu);
-}
-NOKPROBE_SYMBOL(recycle_rp_inst);
-
static struct kprobe kprobe_busy = {
.addr = (void *) get_kprobe,
};
@@ -1249,57 +1254,7 @@ void kprobe_busy_end(void)
preempt_enable();
}
-/*
- * This function is called from finish_task_switch when task tk becomes dead,
- * so that we can recycle any function-return probe instances associated
- * with this task. These left over instances represent probed functions
- * that have been called but will never return.
- */
-void kprobe_flush_task(struct task_struct *tk)
-{
- struct kretprobe_instance *ri;
- struct llist_node *node;
-
- /* Early boot, not yet initialized. */
- if (unlikely(!kprobes_initialized))
- return;
-
- kprobe_busy_begin();
-
- node = __llist_del_all(&tk->kretprobe_instances);
- while (node) {
- ri = container_of(node, struct kretprobe_instance, llist);
- node = node->next;
-
- recycle_rp_inst(ri);
- }
-
- kprobe_busy_end();
-}
-NOKPROBE_SYMBOL(kprobe_flush_task);
-
-static inline void free_rp_inst(struct kretprobe *rp)
-{
- struct kretprobe_instance *ri;
- struct freelist_node *node;
- int count = 0;
-
- node = rp->freelist.head;
- while (node) {
- ri = container_of(node, struct kretprobe_instance, freelist);
- node = node->next;
-
- kfree(ri);
- count++;
- }
-
- if (refcount_sub_and_test(count, &rp->rph->ref)) {
- kfree(rp->rph);
- rp->rph = NULL;
- }
-}
-
-/* Add the new probe to ap->list */
+/* Add the new probe to 'ap->list'. */
static int add_new_kprobe(struct kprobe *ap, struct kprobe *p)
{
if (p->post_handler)
@@ -1313,12 +1268,12 @@ static int add_new_kprobe(struct kprobe *ap, struct kprobe *p)
}
/*
- * Fill in the required fields of the "manager kprobe". Replace the
- * earlier kprobe in the hlist with the manager kprobe
+ * Fill in the required fields of the aggregator kprobe. Replace the
+ * earlier kprobe in the hlist with the aggregator kprobe.
*/
static void init_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
{
- /* Copy p's insn slot to ap */
+ /* Copy the insn slot of 'p' to 'ap'. */
copy_kprobe(p, ap);
flush_insn_slot(ap);
ap->addr = p->addr;
@@ -1336,8 +1291,7 @@ static void init_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
}
/*
- * This is the second or subsequent kprobe at the address - handle
- * the intricacies
+ * This registers the second or subsequent kprobe at the same address.
*/
static int register_aggr_kprobe(struct kprobe *orig_p, struct kprobe *p)
{
@@ -1351,7 +1305,7 @@ static int register_aggr_kprobe(struct kprobe *orig_p, struct kprobe *p)
mutex_lock(&text_mutex);
if (!kprobe_aggrprobe(orig_p)) {
- /* If orig_p is not an aggr_kprobe, create new aggr_kprobe. */
+ /* If 'orig_p' is not an 'aggr_kprobe', create new one. */
ap = alloc_aggr_kprobe(orig_p);
if (!ap) {
ret = -ENOMEM;
@@ -1376,8 +1330,8 @@ static int register_aggr_kprobe(struct kprobe *orig_p, struct kprobe *p)
if (ret)
/*
* Even if fail to allocate new slot, don't need to
- * free aggr_probe. It will be used next time, or
- * freed by unregister_kprobe.
+ * free the 'ap'. It will be used next time, or
+ * freed by unregister_kprobe().
*/
goto out;
@@ -1392,7 +1346,7 @@ static int register_aggr_kprobe(struct kprobe *orig_p, struct kprobe *p)
| KPROBE_FLAG_DISABLED;
}
- /* Copy ap's insn slot to p */
+ /* Copy the insn slot of 'p' to 'ap'. */
copy_kprobe(ap, p);
ret = add_new_kprobe(ap, p);
@@ -1418,7 +1372,7 @@ out:
bool __weak arch_within_kprobe_blacklist(unsigned long addr)
{
- /* The __kprobes marked functions and entry code must not be probed */
+ /* The '__kprobes' functions and entry code must not be probed. */
return addr >= (unsigned long)__kprobes_text_start &&
addr < (unsigned long)__kprobes_text_end;
}
@@ -1430,8 +1384,8 @@ static bool __within_kprobe_blacklist(unsigned long addr)
if (arch_within_kprobe_blacklist(addr))
return true;
/*
- * If there exists a kprobe_blacklist, verify and
- * fail any probe registration in the prohibited area
+ * If 'kprobe_blacklist' is defined, check the address and
+ * reject any probe registration in the prohibited area.
*/
list_for_each_entry(ent, &kprobe_blacklist, list) {
if (addr >= ent->start_addr && addr < ent->end_addr)
@@ -1461,24 +1415,68 @@ bool within_kprobe_blacklist(unsigned long addr)
}
/*
- * If we have a symbol_name argument, look it up and add the offset field
+ * arch_adjust_kprobe_addr - adjust the address
+ * @addr: symbol base address
+ * @offset: offset within the symbol
+ * @on_func_entry: was this @addr+@offset on the function entry
+ *
+ * Typically returns @addr + @offset, except for special cases where the
+ * function might be prefixed by a CFI landing pad, in that case any offset
+ * inside the landing pad is mapped to the first 'real' instruction of the
+ * symbol.
+ *
+ * Specifically, for things like IBT/BTI, skip the resp. ENDBR/BTI.C
+ * instruction at +0.
+ */
+kprobe_opcode_t *__weak arch_adjust_kprobe_addr(unsigned long addr,
+ unsigned long offset,
+ bool *on_func_entry)
+{
+ *on_func_entry = !offset;
+ return (kprobe_opcode_t *)(addr + offset);
+}
+
+/*
+ * If 'symbol_name' is specified, look it up and add the 'offset'
* to it. This way, we can specify a relative address to a symbol.
* This returns encoded errors if it fails to look up symbol or invalid
* combination of parameters.
*/
-static kprobe_opcode_t *_kprobe_addr(kprobe_opcode_t *addr,
- const char *symbol_name, unsigned int offset)
+static kprobe_opcode_t *
+_kprobe_addr(kprobe_opcode_t *addr, const char *symbol_name,
+ unsigned long offset, bool *on_func_entry)
{
if ((symbol_name && addr) || (!symbol_name && !addr))
goto invalid;
if (symbol_name) {
+ /*
+ * Input: @sym + @offset
+ * Output: @addr + @offset
+ *
+ * NOTE: kprobe_lookup_name() does *NOT* fold the offset
+ * argument into it's output!
+ */
addr = kprobe_lookup_name(symbol_name, offset);
if (!addr)
return ERR_PTR(-ENOENT);
}
- addr = (kprobe_opcode_t *)(((char *)addr) + offset);
+ /*
+ * So here we have @addr + @offset, displace it into a new
+ * @addr' + @offset' where @addr' is the symbol start address.
+ */
+ addr = (void *)addr + offset;
+ if (!kallsyms_lookup_size_offset((unsigned long)addr, NULL, &offset))
+ return ERR_PTR(-ENOENT);
+ addr = (void *)addr - offset;
+
+ /*
+ * Then ask the architecture to re-combine them, taking care of
+ * magical function entry details while telling us if this was indeed
+ * at the start of the function.
+ */
+ addr = arch_adjust_kprobe_addr((unsigned long)addr, offset, on_func_entry);
if (addr)
return addr;
@@ -1488,10 +1486,14 @@ invalid:
static kprobe_opcode_t *kprobe_addr(struct kprobe *p)
{
- return _kprobe_addr(p->addr, p->symbol_name, p->offset);
+ bool on_func_entry;
+ return _kprobe_addr(p->addr, p->symbol_name, p->offset, &on_func_entry);
}
-/* Check passed kprobe is valid and return kprobe in kprobe_table. */
+/*
+ * Check the 'p' is valid and return the aggregator kprobe
+ * at the same address.
+ */
static struct kprobe *__get_valid_kprobe(struct kprobe *p)
{
struct kprobe *ap, *list_p;
@@ -1529,16 +1531,12 @@ static inline int warn_kprobe_rereg(struct kprobe *p)
return ret;
}
-int __weak arch_check_ftrace_location(struct kprobe *p)
+static int check_ftrace_location(struct kprobe *p)
{
- unsigned long ftrace_addr;
+ unsigned long addr = (unsigned long)p->addr;
- ftrace_addr = ftrace_location((unsigned long)p->addr);
- if (ftrace_addr) {
+ if (ftrace_location(addr) == addr) {
#ifdef CONFIG_KPROBES_ON_FTRACE
- /* Given address is not on the instruction boundary */
- if ((unsigned long)p->addr != ftrace_addr)
- return -EILSEQ;
p->flags |= KPROBE_FLAG_FTRACE;
#else /* !CONFIG_KPROBES_ON_FTRACE */
return -EINVAL;
@@ -1552,14 +1550,16 @@ static int check_kprobe_address_safe(struct kprobe *p,
{
int ret;
- ret = arch_check_ftrace_location(p);
+ ret = check_ftrace_location(p);
if (ret)
return ret;
jump_label_lock();
preempt_disable();
/* Ensure it is not in reserved area nor out of text */
- if (!kernel_text_address((unsigned long) p->addr) ||
+ if (!(core_kernel_text((unsigned long) p->addr) ||
+ is_module_text_address((unsigned long) p->addr)) ||
+ in_gate_area_no_mm((unsigned long) p->addr) ||
within_kprobe_blacklist((unsigned long) p->addr) ||
jump_label_text_reserved(p->addr, p->addr) ||
static_call_text_reserved(p->addr, p->addr) ||
@@ -1568,7 +1568,7 @@ static int check_kprobe_address_safe(struct kprobe *p,
goto out;
}
- /* Check if are we probing a module */
+ /* Check if 'p' is probing a module. */
*probed_mod = __module_text_address((unsigned long) p->addr);
if (*probed_mod) {
/*
@@ -1581,7 +1581,7 @@ static int check_kprobe_address_safe(struct kprobe *p,
}
/*
- * If the module freed .init.text, we couldn't insert
+ * If the module freed '.init.text', we couldn't insert
* kprobes in there.
*/
if (within_module_init((unsigned long)p->addr, *probed_mod) &&
@@ -1604,9 +1604,10 @@ int register_kprobe(struct kprobe *p)
struct kprobe *old_p;
struct module *probed_mod;
kprobe_opcode_t *addr;
+ bool on_func_entry;
/* Adjust probe address from symbol */
- addr = kprobe_addr(p);
+ addr = _kprobe_addr(p->addr, p->symbol_name, p->offset, &on_func_entry);
if (IS_ERR(addr))
return PTR_ERR(addr);
p->addr = addr;
@@ -1626,9 +1627,12 @@ int register_kprobe(struct kprobe *p)
mutex_lock(&kprobe_mutex);
+ if (on_func_entry)
+ p->flags |= KPROBE_FLAG_ON_FUNC_ENTRY;
+
old_p = get_kprobe(p->addr);
if (old_p) {
- /* Since this may unoptimize old_p, locking text_mutex. */
+ /* Since this may unoptimize 'old_p', locking 'text_mutex'. */
ret = register_aggr_kprobe(old_p, p);
goto out;
}
@@ -1667,8 +1671,8 @@ out:
}
EXPORT_SYMBOL_GPL(register_kprobe);
-/* Check if all probes on the aggrprobe are disabled */
-static int aggr_kprobe_disabled(struct kprobe *ap)
+/* Check if all probes on the 'ap' are disabled. */
+static bool aggr_kprobe_disabled(struct kprobe *ap)
{
struct kprobe *kp;
@@ -1677,20 +1681,21 @@ static int aggr_kprobe_disabled(struct kprobe *ap)
list_for_each_entry(kp, &ap->list, list)
if (!kprobe_disabled(kp))
/*
- * There is an active probe on the list.
- * We can't disable this ap.
+ * Since there is an active probe on the list,
+ * we can't disable this 'ap'.
*/
- return 0;
+ return false;
- return 1;
+ return true;
}
-/* Disable one kprobe: Make sure called under kprobe_mutex is locked */
static struct kprobe *__disable_kprobe(struct kprobe *p)
{
struct kprobe *orig_p;
int ret;
+ lockdep_assert_held(&kprobe_mutex);
+
/* Get an original kprobe for return */
orig_p = __get_valid_kprobe(p);
if (unlikely(orig_p == NULL))
@@ -1704,11 +1709,12 @@ static struct kprobe *__disable_kprobe(struct kprobe *p)
/* Try to disarm and disable this/parent probe */
if (p == orig_p || aggr_kprobe_disabled(orig_p)) {
/*
- * If kprobes_all_disarmed is set, orig_p
- * should have already been disarmed, so
- * skip unneed disarming process.
+ * Don't be lazy here. Even if 'kprobes_all_disarmed'
+ * is false, 'orig_p' might not have been armed yet.
+ * Note arm_all_kprobes() __tries__ to arm all kprobes
+ * on the best effort basis.
*/
- if (!kprobes_all_disarmed) {
+ if (!kprobes_all_disarmed && !kprobe_disabled(orig_p)) {
ret = disarm_kprobe(orig_p, true);
if (ret) {
p->flags &= ~KPROBE_FLAG_DISABLED;
@@ -1757,7 +1763,13 @@ static int __unregister_kprobe_top(struct kprobe *p)
if ((list_p != p) && (list_p->post_handler))
goto noclean;
}
- ap->post_handler = NULL;
+ /*
+ * For the kprobe-on-ftrace case, we keep the
+ * post_handler setting to identify this aggrprobe
+ * armed with kprobe_ipmodify_ops.
+ */
+ if (!kprobe_ftrace(ap))
+ ap->post_handler = NULL;
}
noclean:
/*
@@ -1850,53 +1862,177 @@ static struct notifier_block kprobe_exceptions_nb = {
.priority = 0x7fffffff /* we need to be notified first */
};
-unsigned long __weak arch_deref_entry_point(void *entry)
+#ifdef CONFIG_KRETPROBES
+
+#if !defined(CONFIG_KRETPROBE_ON_RETHOOK)
+static void free_rp_inst_rcu(struct rcu_head *head)
{
- return (unsigned long)entry;
+ struct kretprobe_instance *ri = container_of(head, struct kretprobe_instance, rcu);
+
+ if (refcount_dec_and_test(&ri->rph->ref))
+ kfree(ri->rph);
+ kfree(ri);
}
+NOKPROBE_SYMBOL(free_rp_inst_rcu);
-#ifdef CONFIG_KRETPROBES
+static void recycle_rp_inst(struct kretprobe_instance *ri)
+{
+ struct kretprobe *rp = get_kretprobe(ri);
-unsigned long __kretprobe_trampoline_handler(struct pt_regs *regs,
- void *trampoline_address,
- void *frame_pointer)
+ if (likely(rp))
+ freelist_add(&ri->freelist, &rp->freelist);
+ else
+ call_rcu(&ri->rcu, free_rp_inst_rcu);
+}
+NOKPROBE_SYMBOL(recycle_rp_inst);
+
+/*
+ * This function is called from delayed_put_task_struct() when a task is
+ * dead and cleaned up to recycle any kretprobe instances associated with
+ * this task. These left over instances represent probed functions that
+ * have been called but will never return.
+ */
+void kprobe_flush_task(struct task_struct *tk)
{
- kprobe_opcode_t *correct_ret_addr = NULL;
- struct kretprobe_instance *ri = NULL;
- struct llist_node *first, *node;
- struct kretprobe *rp;
+ struct kretprobe_instance *ri;
+ struct llist_node *node;
- /* Find all nodes for this frame. */
- first = node = current->kretprobe_instances.first;
+ /* Early boot, not yet initialized. */
+ if (unlikely(!kprobes_initialized))
+ return;
+
+ kprobe_busy_begin();
+
+ node = __llist_del_all(&tk->kretprobe_instances);
while (node) {
ri = container_of(node, struct kretprobe_instance, llist);
+ node = node->next;
- BUG_ON(ri->fp != frame_pointer);
+ recycle_rp_inst(ri);
+ }
- if (ri->ret_addr != trampoline_address) {
- correct_ret_addr = ri->ret_addr;
- /*
- * This is the real return address. Any other
- * instances associated with this task are for
- * other calls deeper on the call stack
- */
- goto found;
- }
+ kprobe_busy_end();
+}
+NOKPROBE_SYMBOL(kprobe_flush_task);
+
+static inline void free_rp_inst(struct kretprobe *rp)
+{
+ struct kretprobe_instance *ri;
+ struct freelist_node *node;
+ int count = 0;
+ node = rp->freelist.head;
+ while (node) {
+ ri = container_of(node, struct kretprobe_instance, freelist);
node = node->next;
+
+ kfree(ri);
+ count++;
}
- pr_err("Oops! Kretprobe fails to find correct return address.\n");
- BUG_ON(1);
-found:
- /* Unlink all nodes for this frame. */
- current->kretprobe_instances.first = node->next;
- node->next = NULL;
+ if (refcount_sub_and_test(count, &rp->rph->ref)) {
+ kfree(rp->rph);
+ rp->rph = NULL;
+ }
+}
+
+/* This assumes the 'tsk' is the current task or the is not running. */
+static kprobe_opcode_t *__kretprobe_find_ret_addr(struct task_struct *tsk,
+ struct llist_node **cur)
+{
+ struct kretprobe_instance *ri = NULL;
+ struct llist_node *node = *cur;
+
+ if (!node)
+ node = tsk->kretprobe_instances.first;
+ else
+ node = node->next;
+
+ while (node) {
+ ri = container_of(node, struct kretprobe_instance, llist);
+ if (ri->ret_addr != kretprobe_trampoline_addr()) {
+ *cur = node;
+ return ri->ret_addr;
+ }
+ node = node->next;
+ }
+ return NULL;
+}
+NOKPROBE_SYMBOL(__kretprobe_find_ret_addr);
+
+/**
+ * kretprobe_find_ret_addr -- Find correct return address modified by kretprobe
+ * @tsk: Target task
+ * @fp: A frame pointer
+ * @cur: a storage of the loop cursor llist_node pointer for next call
+ *
+ * Find the correct return address modified by a kretprobe on @tsk in unsigned
+ * long type. If it finds the return address, this returns that address value,
+ * or this returns 0.
+ * The @tsk must be 'current' or a task which is not running. @fp is a hint
+ * to get the currect return address - which is compared with the
+ * kretprobe_instance::fp field. The @cur is a loop cursor for searching the
+ * kretprobe return addresses on the @tsk. The '*@cur' should be NULL at the
+ * first call, but '@cur' itself must NOT NULL.
+ */
+unsigned long kretprobe_find_ret_addr(struct task_struct *tsk, void *fp,
+ struct llist_node **cur)
+{
+ struct kretprobe_instance *ri = NULL;
+ kprobe_opcode_t *ret;
+
+ if (WARN_ON_ONCE(!cur))
+ return 0;
+
+ do {
+ ret = __kretprobe_find_ret_addr(tsk, cur);
+ if (!ret)
+ break;
+ ri = container_of(*cur, struct kretprobe_instance, llist);
+ } while (ri->fp != fp);
+
+ return (unsigned long)ret;
+}
+NOKPROBE_SYMBOL(kretprobe_find_ret_addr);
+
+void __weak arch_kretprobe_fixup_return(struct pt_regs *regs,
+ kprobe_opcode_t *correct_ret_addr)
+{
+ /*
+ * Do nothing by default. Please fill this to update the fake return
+ * address on the stack with the correct one on each arch if possible.
+ */
+}
+
+unsigned long __kretprobe_trampoline_handler(struct pt_regs *regs,
+ void *frame_pointer)
+{
+ kprobe_opcode_t *correct_ret_addr = NULL;
+ struct kretprobe_instance *ri = NULL;
+ struct llist_node *first, *node = NULL;
+ struct kretprobe *rp;
+
+ /* Find correct address and all nodes for this frame. */
+ correct_ret_addr = __kretprobe_find_ret_addr(current, &node);
+ if (!correct_ret_addr) {
+ pr_err("kretprobe: Return address not found, not execute handler. Maybe there is a bug in the kernel.\n");
+ BUG_ON(1);
+ }
+
+ /*
+ * Set the return address as the instruction pointer, because if the
+ * user handler calls stack_trace_save_regs() with this 'regs',
+ * the stack trace will start from the instruction pointer.
+ */
+ instruction_pointer_set(regs, (unsigned long)correct_ret_addr);
- /* Run them.. */
+ /* Run the user handler of the nodes. */
+ first = current->kretprobe_instances.first;
while (first) {
ri = container_of(first, struct kretprobe_instance, llist);
- first = first->next;
+
+ if (WARN_ON_ONCE(ri->fp != frame_pointer))
+ break;
rp = get_kretprobe(ri);
if (rp && rp->handler) {
@@ -1907,6 +2043,23 @@ found:
rp->handler(ri, regs);
__this_cpu_write(current_kprobe, prev);
}
+ if (first == node)
+ break;
+
+ first = first->next;
+ }
+
+ arch_kretprobe_fixup_return(regs, correct_ret_addr);
+
+ /* Unlink all nodes for this frame. */
+ first = current->kretprobe_instances.first;
+ current->kretprobe_instances.first = node->next;
+ node->next = NULL;
+
+ /* Recycle free instances. */
+ while (first) {
+ ri = container_of(first, struct kretprobe_instance, llist);
+ first = first->next;
recycle_rp_inst(ri);
}
@@ -1945,11 +2098,57 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
return 0;
}
NOKPROBE_SYMBOL(pre_handler_kretprobe);
+#else /* CONFIG_KRETPROBE_ON_RETHOOK */
+/*
+ * This kprobe pre_handler is registered with every kretprobe. When probe
+ * hits it will set up the return probe.
+ */
+static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
+{
+ struct kretprobe *rp = container_of(p, struct kretprobe, kp);
+ struct kretprobe_instance *ri;
+ struct rethook_node *rhn;
+
+ rhn = rethook_try_get(rp->rh);
+ if (!rhn) {
+ rp->nmissed++;
+ return 0;
+ }
+
+ ri = container_of(rhn, struct kretprobe_instance, node);
+
+ if (rp->entry_handler && rp->entry_handler(ri, regs))
+ rethook_recycle(rhn);
+ else
+ rethook_hook(rhn, regs, kprobe_ftrace(p));
+
+ return 0;
+}
+NOKPROBE_SYMBOL(pre_handler_kretprobe);
-bool __weak arch_kprobe_on_func_entry(unsigned long offset)
+static void kretprobe_rethook_handler(struct rethook_node *rh, void *data,
+ struct pt_regs *regs)
{
- return !offset;
+ struct kretprobe *rp = (struct kretprobe *)data;
+ struct kretprobe_instance *ri;
+ struct kprobe_ctlblk *kcb;
+
+ /* The data must NOT be null. This means rethook data structure is broken. */
+ if (WARN_ON_ONCE(!data) || !rp->handler)
+ return;
+
+ __this_cpu_write(current_kprobe, &rp->kp);
+ kcb = get_kprobe_ctlblk();
+ kcb->kprobe_status = KPROBE_HIT_ACTIVE;
+
+ ri = container_of(rh, struct kretprobe_instance, node);
+ rp->handler(ri, regs);
+
+ __this_cpu_write(current_kprobe, NULL);
}
+NOKPROBE_SYMBOL(kretprobe_rethook_handler);
+
+#endif /* !CONFIG_KRETPROBE_ON_RETHOOK */
/**
* kprobe_on_func_entry() -- check whether given address is function entry
@@ -1966,15 +2165,13 @@ bool __weak arch_kprobe_on_func_entry(unsigned long offset)
*/
int kprobe_on_func_entry(kprobe_opcode_t *addr, const char *sym, unsigned long offset)
{
- kprobe_opcode_t *kp_addr = _kprobe_addr(addr, sym, offset);
+ bool on_func_entry;
+ kprobe_opcode_t *kp_addr = _kprobe_addr(addr, sym, offset, &on_func_entry);
if (IS_ERR(kp_addr))
return PTR_ERR(kp_addr);
- if (!kallsyms_lookup_size_offset((unsigned long)kp_addr, NULL, &offset))
- return -ENOENT;
-
- if (!arch_kprobe_on_func_entry(offset))
+ if (!on_func_entry)
return -EINVAL;
return 0;
@@ -1991,7 +2188,7 @@ int register_kretprobe(struct kretprobe *rp)
if (ret)
return ret;
- /* If only rp->kp.addr is specified, check reregistering kprobes */
+ /* If only 'rp->kp.addr' is specified, check reregistering kprobes */
if (rp->kp.addr && warn_kprobe_rereg(&rp->kp))
return -EINVAL;
@@ -2006,17 +2203,39 @@ int register_kretprobe(struct kretprobe *rp)
}
}
+ if (rp->data_size > KRETPROBE_MAX_DATA_SIZE)
+ return -E2BIG;
+
rp->kp.pre_handler = pre_handler_kretprobe;
rp->kp.post_handler = NULL;
/* Pre-allocate memory for max kretprobe instances */
- if (rp->maxactive <= 0) {
-#ifdef CONFIG_PREEMPTION
+ if (rp->maxactive <= 0)
rp->maxactive = max_t(unsigned int, 10, 2*num_possible_cpus());
-#else
- rp->maxactive = num_possible_cpus();
-#endif
+
+#ifdef CONFIG_KRETPROBE_ON_RETHOOK
+ rp->rh = rethook_alloc((void *)rp, kretprobe_rethook_handler);
+ if (!rp->rh)
+ return -ENOMEM;
+
+ for (i = 0; i < rp->maxactive; i++) {
+ inst = kzalloc(sizeof(struct kretprobe_instance) +
+ rp->data_size, GFP_KERNEL);
+ if (inst == NULL) {
+ rethook_free(rp->rh);
+ rp->rh = NULL;
+ return -ENOMEM;
+ }
+ rethook_add_node(rp->rh, &inst->node);
}
+ rp->nmissed = 0;
+ /* Establish function entry probe point */
+ ret = register_kprobe(&rp->kp);
+ if (ret != 0) {
+ rethook_free(rp->rh);
+ rp->rh = NULL;
+ }
+#else /* !CONFIG_KRETPROBE_ON_RETHOOK */
rp->freelist.head = NULL;
rp->rph = kzalloc(sizeof(struct kretprobe_holder), GFP_KERNEL);
if (!rp->rph)
@@ -2041,6 +2260,7 @@ int register_kretprobe(struct kretprobe *rp)
ret = register_kprobe(&rp->kp);
if (ret != 0)
free_rp_inst(rp);
+#endif
return ret;
}
EXPORT_SYMBOL_GPL(register_kretprobe);
@@ -2079,7 +2299,11 @@ void unregister_kretprobes(struct kretprobe **rps, int num)
for (i = 0; i < num; i++) {
if (__unregister_kprobe_top(&rps[i]->kp) < 0)
rps[i]->kp.addr = NULL;
+#ifdef CONFIG_KRETPROBE_ON_RETHOOK
+ rethook_free(rps[i]->rh);
+#else
rps[i]->rph->rp = NULL;
+#endif
}
mutex_unlock(&kprobe_mutex);
@@ -2087,7 +2311,9 @@ void unregister_kretprobes(struct kretprobe **rps, int num)
for (i = 0; i < num; i++) {
if (rps[i]->kp.addr) {
__unregister_kprobe_bottom(&rps[i]->kp);
+#ifndef CONFIG_KRETPROBE_ON_RETHOOK
free_rp_inst(rps[i]);
+#endif
}
}
}
@@ -2096,13 +2322,13 @@ EXPORT_SYMBOL_GPL(unregister_kretprobes);
#else /* CONFIG_KRETPROBES */
int register_kretprobe(struct kretprobe *rp)
{
- return -ENOSYS;
+ return -EOPNOTSUPP;
}
EXPORT_SYMBOL_GPL(register_kretprobe);
int register_kretprobes(struct kretprobe **rps, int num)
{
- return -ENOSYS;
+ return -EOPNOTSUPP;
}
EXPORT_SYMBOL_GPL(register_kretprobes);
@@ -2131,6 +2357,14 @@ static void kill_kprobe(struct kprobe *p)
lockdep_assert_held(&kprobe_mutex);
+ /*
+ * The module is going away. We should disarm the kprobe which
+ * is using ftrace, because ftrace framework is still available at
+ * 'MODULE_STATE_GOING' notification.
+ */
+ if (kprobe_ftrace(p) && !kprobe_disabled(p) && !kprobes_all_disarmed)
+ disarm_kprobe_ftrace(p);
+
p->flags |= KPROBE_FLAG_GONE;
if (kprobe_aggrprobe(p)) {
/*
@@ -2147,14 +2381,6 @@ static void kill_kprobe(struct kprobe *p)
* the original probed function (which will be freed soon) any more.
*/
arch_remove_kprobe(p);
-
- /*
- * The module is going away. We should disarm the kprobe which
- * is using ftrace, because ftrace framework is still available at
- * MODULE_STATE_GOING notification.
- */
- if (kprobe_ftrace(p) && !kprobe_disabled(p) && !kprobes_all_disarmed)
- disarm_kprobe_ftrace(p);
}
/* Disable one kprobe */
@@ -2202,8 +2428,11 @@ int enable_kprobe(struct kprobe *kp)
if (!kprobes_all_disarmed && kprobe_disabled(p)) {
p->flags &= ~KPROBE_FLAG_DISABLED;
ret = arm_kprobe(p);
- if (ret)
+ if (ret) {
p->flags |= KPROBE_FLAG_DISABLED;
+ if (p != kp)
+ kp->flags |= KPROBE_FLAG_DISABLED;
+ }
}
out:
mutex_unlock(&kprobe_mutex);
@@ -2214,8 +2443,7 @@ EXPORT_SYMBOL_GPL(enable_kprobe);
/* Caller must NOT call this in usual path. This is only for critical case */
void dump_kprobe(struct kprobe *kp)
{
- pr_err("Dumping kprobe:\n");
- pr_err("Name: %s\nOffset: %x\nAddress: %pS\n",
+ pr_err("Dump kprobe:\n.symbol_name = %s, .offset = %x, .addr = %pS\n",
kp->symbol_name, kp->offset, kp->addr);
}
NOKPROBE_SYMBOL(dump_kprobe);
@@ -2317,7 +2545,7 @@ static int __init populate_kprobe_blacklist(unsigned long *start,
int ret;
for (iter = start; iter < end; iter++) {
- entry = arch_deref_entry_point((void *)*iter);
+ entry = (unsigned long)dereference_symbol_descriptor((void *)*iter);
ret = kprobe_add_ksym_blacklist(entry);
if (ret == -EINVAL)
continue;
@@ -2325,13 +2553,13 @@ static int __init populate_kprobe_blacklist(unsigned long *start,
return ret;
}
- /* Symbols in __kprobes_text are blacklisted */
+ /* Symbols in '__kprobes_text' are blacklisted */
ret = kprobe_add_area_blacklist((unsigned long)__kprobes_text_start,
(unsigned long)__kprobes_text_end);
if (ret)
return ret;
- /* Symbols in noinstr section are blacklisted */
+ /* Symbols in 'noinstr' section are blacklisted */
ret = kprobe_add_area_blacklist((unsigned long)__noinstr_text_start,
(unsigned long)__noinstr_text_end);
@@ -2403,9 +2631,9 @@ static int kprobes_module_callback(struct notifier_block *nb,
return NOTIFY_DONE;
/*
- * When MODULE_STATE_GOING was notified, both of module .text and
- * .init.text sections would be freed. When MODULE_STATE_LIVE was
- * notified, only .init.text section would be freed. We need to
+ * When 'MODULE_STATE_GOING' was notified, both of module '.text' and
+ * '.init.text' sections would be freed. When 'MODULE_STATE_LIVE' was
+ * notified, only '.init.text' section would be freed. We need to
* disable kprobes which have been inserted in the sections.
*/
mutex_lock(&kprobe_mutex);
@@ -2422,9 +2650,9 @@ static int kprobes_module_callback(struct notifier_block *nb,
*
* Note, this will also move any optimized probes
* that are pending to be removed from their
- * corresponding lists to the freeing_list and
+ * corresponding lists to the 'freeing_list' and
* will not be touched by the delayed
- * kprobe_optimizer work handler.
+ * kprobe_optimizer() work handler.
*/
kill_kprobe(p);
}
@@ -2440,10 +2668,6 @@ static struct notifier_block kprobe_module_nb = {
.priority = 0
};
-/* Markers of _kprobe_blacklist section */
-extern unsigned long __start_kprobe_blacklist[];
-extern unsigned long __stop_kprobe_blacklist[];
-
void kprobe_free_init_mem(void)
{
void *start = (void *)(&__init_begin);
@@ -2454,7 +2678,7 @@ void kprobe_free_init_mem(void)
mutex_lock(&kprobe_mutex);
- /* Kill all kprobes on initmem */
+ /* Kill all kprobes on initmem because the target code has been freed. */
for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
head = &kprobe_table[i];
hlist_for_each_entry(p, head, hlist) {
@@ -2477,10 +2701,8 @@ static int __init init_kprobes(void)
err = populate_kprobe_blacklist(__start_kprobe_blacklist,
__stop_kprobe_blacklist);
- if (err) {
- pr_err("kprobes: failed to populate blacklist: %d\n", err);
- pr_err("Please take care of using kprobes.\n");
- }
+ if (err)
+ pr_err("Failed to populate blacklist (error %d), kprobes not restricted, be careful using them!\n", err);
if (kretprobe_blacklist_size) {
/* lookup the function address from its name */
@@ -2488,7 +2710,7 @@ static int __init init_kprobes(void)
kretprobe_blacklist[i].addr =
kprobe_lookup_name(kretprobe_blacklist[i].name, 0);
if (!kretprobe_blacklist[i].addr)
- printk("kretprobe: lookup failed: %s\n",
+ pr_err("Failed to lookup symbol '%s' for kretprobe blacklist. Maybe the target function is removed or renamed.\n",
kretprobe_blacklist[i].name);
}
}
@@ -2497,7 +2719,7 @@ static int __init init_kprobes(void)
kprobes_all_disarmed = false;
#if defined(CONFIG_OPTPROBES) && defined(__ARCH_WANT_KPROBES_INSN_SLOT)
- /* Init kprobe_optinsn_slots for allocation */
+ /* Init 'kprobe_optinsn_slots' for allocation */
kprobe_optinsn_slots.insn_size = MAX_OPTINSN_SIZE;
#endif
@@ -2508,9 +2730,7 @@ static int __init init_kprobes(void)
err = register_module_notifier(&kprobe_module_nb);
kprobes_initialized = (err == 0);
-
- if (!err)
- init_test_probes();
+ kprobe_sysctls_init();
return err;
}
early_initcall(init_kprobes);
@@ -2631,7 +2851,7 @@ static int kprobe_blacklist_seq_show(struct seq_file *m, void *v)
list_entry(v, struct kprobe_blacklist_entry, list);
/*
- * If /proc/kallsyms is not showing kernel address, we won't
+ * If '/proc/kallsyms' is not showing kernel address, we won't
* show them here either.
*/
if (!kallsyms_show_value(m->file->f_cred))
@@ -2692,7 +2912,7 @@ static int arm_all_kprobes(void)
}
if (errors)
- pr_warn("Kprobes globally enabled, but failed to arm %d out of %d probes\n",
+ pr_warn("Kprobes globally enabled, but failed to enable %d out of %d probes. Please check which kprobes are kept disabled via debugfs.\n",
errors, total);
else
pr_info("Kprobes globally enabled\n");
@@ -2735,7 +2955,7 @@ static int disarm_all_kprobes(void)
}
if (errors)
- pr_warn("Kprobes globally disabled, but failed to disarm %d out of %d probes\n",
+ pr_warn("Kprobes globally disabled, but failed to disable %d out of %d probes. Please check which kprobes are kept enabled via debugfs.\n",
errors, total);
else
pr_info("Kprobes globally disabled\n");
@@ -2770,30 +2990,14 @@ static ssize_t read_enabled_file_bool(struct file *file,
static ssize_t write_enabled_file_bool(struct file *file,
const char __user *user_buf, size_t count, loff_t *ppos)
{
- char buf[32];
- size_t buf_size;
- int ret = 0;
-
- buf_size = min(count, (sizeof(buf)-1));
- if (copy_from_user(buf, user_buf, buf_size))
- return -EFAULT;
+ bool enable;
+ int ret;
- buf[buf_size] = '\0';
- switch (buf[0]) {
- case 'y':
- case 'Y':
- case '1':
- ret = arm_all_kprobes();
- break;
- case 'n':
- case 'N':
- case '0':
- ret = disarm_all_kprobes();
- break;
- default:
- return -EINVAL;
- }
+ ret = kstrtobool_from_user(user_buf, count, &enable);
+ if (ret)
+ return ret;
+ ret = enable ? arm_all_kprobes() : disarm_all_kprobes();
if (ret)
return ret;
@@ -2809,13 +3013,12 @@ static const struct file_operations fops_kp = {
static int __init debugfs_kprobe_init(void)
{
struct dentry *dir;
- unsigned int value = 1;
dir = debugfs_create_dir("kprobes", NULL);
debugfs_create_file("list", 0400, dir, NULL, &kprobes_fops);
- debugfs_create_file("enabled", 0600, dir, &value, &fops_kp);
+ debugfs_create_file("enabled", 0600, dir, NULL, &fops_kp);
debugfs_create_file("blacklist", 0400, dir, NULL,
&kprobe_blacklist_fops);
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 35859da8bd4f..aad7a3bfd846 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -6,6 +6,7 @@
* Copyright (C) 2004 Kay Sievers <kay.sievers@vrfy.org>
*/
+#include <asm/byteorder.h>
#include <linux/kobject.h>
#include <linux/string.h>
#include <linux/sysfs.h>
@@ -20,27 +21,50 @@
#include <linux/rcupdate.h> /* rcu_expedited and rcu_normal */
+#if defined(__LITTLE_ENDIAN)
+#define CPU_BYTEORDER_STRING "little"
+#elif defined(__BIG_ENDIAN)
+#define CPU_BYTEORDER_STRING "big"
+#else
+#error Unknown byteorder
+#endif
+
#define KERNEL_ATTR_RO(_name) \
static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
#define KERNEL_ATTR_RW(_name) \
-static struct kobj_attribute _name##_attr = \
- __ATTR(_name, 0644, _name##_show, _name##_store)
+static struct kobj_attribute _name##_attr = __ATTR_RW(_name)
/* current uevent sequence number */
static ssize_t uevent_seqnum_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%llu\n", (unsigned long long)uevent_seqnum);
+ return sysfs_emit(buf, "%llu\n", (unsigned long long)uevent_seqnum);
}
KERNEL_ATTR_RO(uevent_seqnum);
+/* cpu byteorder */
+static ssize_t cpu_byteorder_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%s\n", CPU_BYTEORDER_STRING);
+}
+KERNEL_ATTR_RO(cpu_byteorder);
+
+/* address bits */
+static ssize_t address_bits_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%zu\n", sizeof(void *) * 8 /* CHAR_BIT */);
+}
+KERNEL_ATTR_RO(address_bits);
+
#ifdef CONFIG_UEVENT_HELPER
/* uevent helper program, used during early boot */
static ssize_t uevent_helper_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%s\n", uevent_helper);
+ return sysfs_emit(buf, "%s\n", uevent_helper);
}
static ssize_t uevent_helper_store(struct kobject *kobj,
struct kobj_attribute *attr,
@@ -61,7 +85,7 @@ KERNEL_ATTR_RW(uevent_helper);
static ssize_t profiling_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%d\n", prof_on);
+ return sysfs_emit(buf, "%d\n", prof_on);
}
static ssize_t profiling_store(struct kobject *kobj,
struct kobj_attribute *attr,
@@ -92,21 +116,26 @@ KERNEL_ATTR_RW(profiling);
static ssize_t kexec_loaded_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%d\n", !!kexec_image);
+ return sysfs_emit(buf, "%d\n", !!kexec_image);
}
KERNEL_ATTR_RO(kexec_loaded);
static ssize_t kexec_crash_loaded_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%d\n", kexec_crash_loaded());
+ return sysfs_emit(buf, "%d\n", kexec_crash_loaded());
}
KERNEL_ATTR_RO(kexec_crash_loaded);
static ssize_t kexec_crash_size_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%zu\n", crash_get_memory_size());
+ ssize_t size = crash_get_memory_size();
+
+ if (size < 0)
+ return size;
+
+ return sysfs_emit(buf, "%zd\n", size);
}
static ssize_t kexec_crash_size_store(struct kobject *kobj,
struct kobj_attribute *attr,
@@ -131,8 +160,8 @@ static ssize_t vmcoreinfo_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
phys_addr_t vmcore_base = paddr_vmcoreinfo_note();
- return sprintf(buf, "%pa %x\n", &vmcore_base,
- (unsigned int)VMCOREINFO_NOTE_SIZE);
+ return sysfs_emit(buf, "%pa %x\n", &vmcore_base,
+ (unsigned int)VMCOREINFO_NOTE_SIZE);
}
KERNEL_ATTR_RO(vmcoreinfo);
@@ -142,7 +171,7 @@ KERNEL_ATTR_RO(vmcoreinfo);
static ssize_t fscaps_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%d\n", file_caps_enabled);
+ return sysfs_emit(buf, "%d\n", file_caps_enabled);
}
KERNEL_ATTR_RO(fscaps);
@@ -151,7 +180,7 @@ int rcu_expedited;
static ssize_t rcu_expedited_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%d\n", READ_ONCE(rcu_expedited));
+ return sysfs_emit(buf, "%d\n", READ_ONCE(rcu_expedited));
}
static ssize_t rcu_expedited_store(struct kobject *kobj,
struct kobj_attribute *attr,
@@ -168,7 +197,7 @@ int rcu_normal;
static ssize_t rcu_normal_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%d\n", READ_ONCE(rcu_normal));
+ return sysfs_emit(buf, "%d\n", READ_ONCE(rcu_normal));
}
static ssize_t rcu_normal_store(struct kobject *kobj,
struct kobj_attribute *attr,
@@ -211,6 +240,8 @@ EXPORT_SYMBOL_GPL(kernel_kobj);
static struct attribute * kernel_attrs[] = {
&fscaps_attr.attr,
&uevent_seqnum_attr.attr,
+ &cpu_byteorder_attr.attr,
+ &address_bits_attr.attr,
#ifdef CONFIG_UEVENT_HELPER
&uevent_helper_attr.attr,
#endif
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 5b37a8567168..490792b1066e 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -38,6 +38,7 @@ struct task_struct *kthreadd_task;
struct kthread_create_info
{
/* Information passed to kthread() from kthreadd. */
+ char *full_name;
int (*threadfn)(void *data);
void *data;
int node;
@@ -52,14 +53,16 @@ struct kthread_create_info
struct kthread {
unsigned long flags;
unsigned int cpu;
+ int result;
int (*threadfn)(void *);
void *data;
- mm_segment_t oldfs;
struct completion parked;
struct completion exited;
#ifdef CONFIG_BLK_CGROUP
struct cgroup_subsys_state *blkcg_css;
#endif
+ /* To store the full name if task comm is truncated. */
+ char *full_name;
};
enum KTHREAD_BITS {
@@ -71,7 +74,7 @@ enum KTHREAD_BITS {
static inline struct kthread *to_kthread(struct task_struct *k)
{
WARN_ON(!(k->flags & PF_KTHREAD));
- return (__force void *)k->set_child_tid;
+ return k->worker_private;
}
/*
@@ -79,7 +82,7 @@ static inline struct kthread *to_kthread(struct task_struct *k)
*
* Per construction; when:
*
- * (p->flags & PF_KTHREAD) && p->set_child_tid
+ * (p->flags & PF_KTHREAD) && p->worker_private
*
* the task is both a kthread and struct kthread is persistent. However
* PF_KTHREAD on it's own is not, kernel_thread() can exec() (See umh.c and
@@ -87,26 +90,41 @@ static inline struct kthread *to_kthread(struct task_struct *k)
*/
static inline struct kthread *__to_kthread(struct task_struct *p)
{
- void *kthread = (__force void *)p->set_child_tid;
+ void *kthread = p->worker_private;
if (kthread && !(p->flags & PF_KTHREAD))
kthread = NULL;
return kthread;
}
-void set_kthread_struct(struct task_struct *p)
+void get_kthread_comm(char *buf, size_t buf_size, struct task_struct *tsk)
{
- struct kthread *kthread;
+ struct kthread *kthread = to_kthread(tsk);
- if (__to_kthread(p))
+ if (!kthread || !kthread->full_name) {
+ __get_task_comm(buf, buf_size, tsk);
return;
+ }
+
+ strscpy_pad(buf, kthread->full_name, buf_size);
+}
+
+bool set_kthread_struct(struct task_struct *p)
+{
+ struct kthread *kthread;
+
+ if (WARN_ON_ONCE(to_kthread(p)))
+ return false;
kthread = kzalloc(sizeof(*kthread), GFP_KERNEL);
- /*
- * We abuse ->set_child_tid to avoid the new member and because it
- * can't be wrongly copied by copy_process(). We also rely on fact
- * that the caller can't exec, so PF_KTHREAD can't be cleared.
- */
- p->set_child_tid = (__force void __user *)kthread;
+ if (!kthread)
+ return false;
+
+ init_completion(&kthread->exited);
+ init_completion(&kthread->parked);
+ p->vfork_done = &kthread->exited;
+
+ p->worker_private = kthread;
+ return true;
}
void free_kthread_struct(struct task_struct *k)
@@ -114,13 +132,17 @@ void free_kthread_struct(struct task_struct *k)
struct kthread *kthread;
/*
- * Can be NULL if this kthread was created by kernel_thread()
- * or if kmalloc() in kthread() failed.
+ * Can be NULL if kmalloc() in set_kthread_struct() failed.
*/
kthread = to_kthread(k);
+ if (!kthread)
+ return;
+
#ifdef CONFIG_BLK_CGROUP
- WARN_ON_ONCE(kthread && kthread->blkcg_css);
+ WARN_ON_ONCE(kthread->blkcg_css);
#endif
+ k->worker_private = NULL;
+ kfree(kthread->full_name);
kfree(kthread);
}
@@ -268,8 +290,47 @@ void kthread_parkme(void)
}
EXPORT_SYMBOL_GPL(kthread_parkme);
+/**
+ * kthread_exit - Cause the current kthread return @result to kthread_stop().
+ * @result: The integer value to return to kthread_stop().
+ *
+ * While kthread_exit can be called directly, it exists so that
+ * functions which do some additional work in non-modular code such as
+ * module_put_and_kthread_exit can be implemented.
+ *
+ * Does not return.
+ */
+void __noreturn kthread_exit(long result)
+{
+ struct kthread *kthread = to_kthread(current);
+ kthread->result = result;
+ do_exit(0);
+}
+
+/**
+ * kthread_complete_and_exit - Exit the current kthread.
+ * @comp: Completion to complete
+ * @code: The integer value to return to kthread_stop().
+ *
+ * If present complete @comp and the reuturn code to kthread_stop().
+ *
+ * A kernel thread whose module may be removed after the completion of
+ * @comp can use this function exit safely.
+ *
+ * Does not return.
+ */
+void __noreturn kthread_complete_and_exit(struct completion *comp, long code)
+{
+ if (comp)
+ complete(comp);
+
+ kthread_exit(code);
+}
+EXPORT_SYMBOL(kthread_complete_and_exit);
+
static int kthread(void *_create)
{
+ static const struct sched_param param = { .sched_priority = 0 };
/* Copy data: it's on kthread's stack */
struct kthread_create_info *create = _create;
int (*threadfn)(void *data) = create->threadfn;
@@ -278,27 +339,26 @@ static int kthread(void *_create)
struct kthread *self;
int ret;
- set_kthread_struct(current);
self = to_kthread(current);
- /* If user was SIGKILLed, I release the structure. */
+ /* Release the structure when caller killed by a fatal signal. */
done = xchg(&create->done, NULL);
if (!done) {
+ kfree(create->full_name);
kfree(create);
- do_exit(-EINTR);
- }
-
- if (!self) {
- create->result = ERR_PTR(-ENOMEM);
- complete(done);
- do_exit(-ENOMEM);
+ kthread_exit(-EINTR);
}
+ self->full_name = create->full_name;
self->threadfn = threadfn;
self->data = data;
- init_completion(&self->exited);
- init_completion(&self->parked);
- current->vfork_done = &self->exited;
+
+ /*
+ * The new thread inherited kthreadd's priority and CPU mask. Reset
+ * back to default in case they have been changed.
+ */
+ sched_setscheduler_nocheck(current, SCHED_NORMAL, &param);
+ set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_TYPE_KTHREAD));
/* OK, tell user we're spawned, wait for stop or wakeup */
__set_current_state(TASK_UNINTERRUPTIBLE);
@@ -318,7 +378,7 @@ static int kthread(void *_create)
__kthread_parkme(self);
ret = threadfn(data);
}
- do_exit(ret);
+ kthread_exit(ret);
}
/* called from kernel_clone() to get node information for about to be created task */
@@ -339,11 +399,13 @@ static void create_kthread(struct kthread_create_info *create)
current->pref_node_fork = create->node;
#endif
/* We want our own signal handler (we take no signals by default). */
- pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD);
+ pid = kernel_thread(kthread, create, create->full_name,
+ CLONE_FS | CLONE_FILES | SIGCHLD);
if (pid < 0) {
- /* If user was SIGKILLed, I release the structure. */
+ /* Release the structure when caller killed by a fatal signal. */
struct completion *done = xchg(&create->done, NULL);
+ kfree(create->full_name);
if (!done) {
kfree(create);
return;
@@ -370,6 +432,11 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
create->data = data;
create->node = node;
create->done = &done;
+ create->full_name = kvasprintf(GFP_KERNEL, namefmt, args);
+ if (!create->full_name) {
+ task = ERR_PTR(-ENOMEM);
+ goto free_create;
+ }
spin_lock(&kthread_create_lock);
list_add_tail(&create->list, &kthread_create_list);
@@ -383,9 +450,9 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
*/
if (unlikely(wait_for_completion_killable(&done))) {
/*
- * If I was SIGKILLed before kthreadd (or new kernel thread)
- * calls complete(), leave the cleanup of this structure to
- * that thread.
+ * If I was killed by a fatal signal before kthreadd (or new
+ * kernel thread) calls complete(), leave the cleanup of this
+ * structure to that thread.
*/
if (xchg(&create->done, NULL))
return ERR_PTR(-EINTR);
@@ -396,24 +463,7 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
wait_for_completion(&done);
}
task = create->result;
- if (!IS_ERR(task)) {
- static const struct sched_param param = { .sched_priority = 0 };
- char name[TASK_COMM_LEN];
-
- /*
- * task is already visible to other tasks, so updating
- * COMM must be protected.
- */
- vsnprintf(name, sizeof(name), namefmt, args);
- set_task_comm(task, name);
- /*
- * root may have changed our (kthreadd's) priority or CPU mask.
- * The kernel thread should not inherit these properties.
- */
- sched_setscheduler_nocheck(task, SCHED_NORMAL, &param);
- set_cpus_allowed_ptr(task,
- housekeeping_cpumask(HK_FLAG_KTHREAD));
- }
+free_create:
kfree(create);
return task;
}
@@ -433,7 +483,7 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
* If thread is going to be bound on a particular cpu, give its node
* in @node, to get NUMA affinity for kthread stack, or else give NUMA_NO_NODE.
* When woken, the thread will run @threadfn() with @data as its
- * argument. @threadfn() can either call do_exit() directly if it is a
+ * argument. @threadfn() can either return directly if it is a
* standalone thread for which no one will call kthread_stop(), or
* return when 'kthread_should_stop()' is true (which means
* kthread_stop() has been called). The return value should be zero
@@ -523,6 +573,7 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
to_kthread(p)->cpu = cpu;
return p;
}
+EXPORT_SYMBOL(kthread_create_on_cpu);
void kthread_set_per_cpu(struct task_struct *k, int cpu)
{
@@ -627,7 +678,7 @@ EXPORT_SYMBOL_GPL(kthread_park);
* instead of calling wake_up_process(): the thread will exit without
* calling threadfn().
*
- * If threadfn() may call do_exit() itself, the caller must ensure
+ * If threadfn() may call kthread_exit() itself, the caller must ensure
* task_struct can't go away.
*
* Returns the result of threadfn(), or %-EINTR if wake_up_process()
@@ -644,9 +695,10 @@ int kthread_stop(struct task_struct *k)
kthread = to_kthread(k);
set_bit(KTHREAD_SHOULD_STOP, &kthread->flags);
kthread_unpark(k);
+ set_tsk_thread_flag(k, TIF_NOTIFY_SIGNAL);
wake_up_process(k);
wait_for_completion(&kthread->exited);
- ret = k->exit_code;
+ ret = kthread->result;
put_task_struct(k);
trace_sched_kthread_stop_ret(ret);
@@ -661,7 +713,7 @@ int kthreadd(void *unused)
/* Setup a clean context for our children to inherit. */
set_task_comm(tsk, "kthreadd");
ignore_signals(tsk);
- set_cpus_allowed_ptr(tsk, housekeeping_cpumask(HK_FLAG_KTHREAD));
+ set_cpus_allowed_ptr(tsk, housekeeping_cpumask(HK_TYPE_KTHREAD));
set_mems_allowed(node_states[N_MEMORY]);
current->flags |= PF_NOFREEZE;
@@ -816,7 +868,7 @@ fail_task:
*
* Returns a pointer to the allocated worker on success, ERR_PTR(-ENOMEM)
* when the needed structures could not get allocated, and ERR_PTR(-EINTR)
- * when the worker was SIGKILLed.
+ * when the caller was killed by a fatal signal.
*/
struct kthread_worker *
kthread_create_worker(unsigned int flags, const char namefmt[], ...)
@@ -865,7 +917,7 @@ EXPORT_SYMBOL(kthread_create_worker);
* Return:
* The pointer to the allocated worker on success, ERR_PTR(-ENOMEM)
* when the needed structures could not get allocated, and ERR_PTR(-EINTR)
- * when the worker was SIGKILLed.
+ * when the caller was killed by a fatal signal.
*/
struct kthread_worker *
kthread_create_worker_on_cpu(int cpu, unsigned int flags,
@@ -990,8 +1042,7 @@ static void __kthread_queue_delayed_work(struct kthread_worker *worker,
struct timer_list *timer = &dwork->timer;
struct kthread_work *work = &dwork->work;
- WARN_ON_FUNCTION_MISMATCH(timer->function,
- kthread_delayed_work_timer_fn);
+ WARN_ON_ONCE(timer->function != kthread_delayed_work_timer_fn);
/*
* If @delay is 0, queue @dwork->work immediately. This is for
@@ -1322,6 +1373,10 @@ EXPORT_SYMBOL_GPL(kthread_flush_worker);
* Flush and destroy @worker. The simple flush is enough because the kthread
* worker API is used only in trivial scenarios. There are no multi-step state
* machines needed.
+ *
+ * Note that this function is not responsible for handling delayed work, so
+ * caller should be responsible for queuing or canceling all delayed work items
+ * before invoke this function.
*/
void kthread_destroy_worker(struct kthread_worker *worker)
{
@@ -1333,6 +1388,7 @@ void kthread_destroy_worker(struct kthread_worker *worker)
kthread_flush_worker(worker);
kthread_stop(task);
+ WARN_ON(!list_empty(&worker->delayed_work_list));
WARN_ON(!list_empty(&worker->work_list));
kfree(worker);
}
@@ -1350,14 +1406,18 @@ void kthread_use_mm(struct mm_struct *mm)
WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD));
WARN_ON_ONCE(tsk->mm);
+ /*
+ * It is possible for mm to be the same as tsk->active_mm, but
+ * we must still mmgrab(mm) and mmdrop_lazy_tlb(active_mm),
+ * because these references are not equivalent.
+ */
+ mmgrab(mm);
+
task_lock(tsk);
/* Hold off tlb flush IPIs while switching mm's */
local_irq_disable();
active_mm = tsk->active_mm;
- if (active_mm != mm) {
- mmgrab(mm);
- tsk->active_mm = mm;
- }
+ tsk->active_mm = mm;
tsk->mm = mm;
membarrier_update_current_mm(mm);
switch_mm_irqs_off(active_mm, mm, tsk);
@@ -1374,14 +1434,9 @@ void kthread_use_mm(struct mm_struct *mm)
* memory barrier after storing to tsk->mm, before accessing
* user-space memory. A full memory barrier for membarrier
* {PRIVATE,GLOBAL}_EXPEDITED is implicitly provided by
- * mmdrop(), or explicitly with smp_mb().
+ * mmdrop_lazy_tlb().
*/
- if (active_mm != mm)
- mmdrop(active_mm);
- else
- smp_mb();
-
- to_kthread(tsk)->oldfs = force_uaccess_begin();
+ mmdrop_lazy_tlb(active_mm);
}
EXPORT_SYMBOL_GPL(kthread_use_mm);
@@ -1396,8 +1451,6 @@ void kthread_unuse_mm(struct mm_struct *mm)
WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD));
WARN_ON_ONCE(!tsk->mm);
- force_uaccess_end(to_kthread(tsk)->oldfs);
-
task_lock(tsk);
/*
* When a kthread stops operating on an address space, the loop
@@ -1411,10 +1464,13 @@ void kthread_unuse_mm(struct mm_struct *mm)
local_irq_disable();
tsk->mm = NULL;
membarrier_update_current_mm(NULL);
+ mmgrab_lazy_tlb(mm);
/* active_mm is still 'mm' */
enter_lazy_tlb(mm, tsk);
local_irq_enable();
task_unlock(tsk);
+
+ mmdrop(mm);
}
EXPORT_SYMBOL_GPL(kthread_unuse_mm);
@@ -1466,5 +1522,4 @@ struct cgroup_subsys_state *kthread_blkcg(void)
}
return NULL;
}
-EXPORT_SYMBOL(kthread_blkcg);
#endif
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index 166d7bf49666..781249098cb6 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -55,6 +55,7 @@
#include <linux/sched/stat.h>
#include <linux/list.h>
#include <linux/stacktrace.h>
+#include <linux/sysctl.h>
static DEFINE_RAW_SPINLOCK(latency_lock);
@@ -63,6 +64,31 @@ static struct latency_record latency_record[MAXLR];
int latencytop_enabled;
+#ifdef CONFIG_SYSCTL
+static int sysctl_latencytop(struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ int err;
+
+ err = proc_dointvec(table, write, buffer, lenp, ppos);
+ if (latencytop_enabled)
+ force_schedstat_enabled();
+
+ return err;
+}
+
+static struct ctl_table latencytop_sysctl[] = {
+ {
+ .procname = "latencytop",
+ .data = &latencytop_enabled,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = sysctl_latencytop,
+ },
+ {}
+};
+#endif
+
void clear_tsk_latency_tracing(struct task_struct *p)
{
unsigned long flags;
@@ -86,7 +112,7 @@ static void __sched
account_global_scheduler_latency(struct task_struct *tsk,
struct latency_record *lat)
{
- int firstnonnull = MAXLR + 1;
+ int firstnonnull = MAXLR;
int i;
/* skip kernel threads for now */
@@ -124,7 +150,7 @@ account_global_scheduler_latency(struct task_struct *tsk,
}
i = firstnonnull;
- if (i >= MAXLR - 1)
+ if (i >= MAXLR)
return;
/* Allocted a new one: */
@@ -266,18 +292,9 @@ static const struct proc_ops lstats_proc_ops = {
static int __init init_lstats_procfs(void)
{
proc_create("latency_stats", 0644, NULL, &lstats_proc_ops);
+#ifdef CONFIG_SYSCTL
+ register_sysctl_init("kernel", latencytop_sysctl);
+#endif
return 0;
}
-
-int sysctl_latencytop(struct ctl_table *table, int write, void *buffer,
- size_t *lenp, loff_t *ppos)
-{
- int err;
-
- err = proc_dointvec(table, write, buffer, lenp, ppos);
- if (latencytop_enabled)
- force_schedstat_enabled();
-
- return err;
-}
device_initcall(init_lstats_procfs);
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index 335d988bd811..61328328c474 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -33,6 +33,7 @@
*
* - klp_ftrace_handler()
* - klp_update_patch_state()
+ * - __klp_sched_try_switch()
*/
DEFINE_MUTEX(klp_mutex);
@@ -118,27 +119,16 @@ static struct klp_object *klp_find_object(struct klp_patch *patch,
}
struct klp_find_arg {
- const char *objname;
const char *name;
unsigned long addr;
unsigned long count;
unsigned long pos;
};
-static int klp_find_callback(void *data, const char *name,
- struct module *mod, unsigned long addr)
+static int klp_match_callback(void *data, unsigned long addr)
{
struct klp_find_arg *args = data;
- if ((mod && !args->objname) || (!mod && args->objname))
- return 0;
-
- if (strcmp(args->name, name))
- return 0;
-
- if (args->objname && strcmp(args->objname, mod->name))
- return 0;
-
args->addr = addr;
args->count++;
@@ -153,11 +143,20 @@ static int klp_find_callback(void *data, const char *name,
return 0;
}
+static int klp_find_callback(void *data, const char *name, unsigned long addr)
+{
+ struct klp_find_arg *args = data;
+
+ if (strcmp(args->name, name))
+ return 0;
+
+ return klp_match_callback(data, addr);
+}
+
static int klp_find_object_symbol(const char *objname, const char *name,
unsigned long sympos, unsigned long *addr)
{
struct klp_find_arg args = {
- .objname = objname,
.name = name,
.addr = 0,
.count = 0,
@@ -165,9 +164,9 @@ static int klp_find_object_symbol(const char *objname, const char *name,
};
if (objname)
- module_kallsyms_on_each_symbol(klp_find_callback, &args);
+ module_kallsyms_on_each_symbol(objname, klp_find_callback, &args);
else
- kallsyms_on_each_symbol(klp_find_callback, &args);
+ kallsyms_on_each_match_symbol(klp_match_callback, name, &args);
/*
* Ensure an address was found. If sympos is 0, ensure symbol is unique;
@@ -190,7 +189,7 @@ static int klp_find_object_symbol(const char *objname, const char *name,
return -EINVAL;
}
-static int klp_resolve_symbols(Elf64_Shdr *sechdrs, const char *strtab,
+static int klp_resolve_symbols(Elf_Shdr *sechdrs, const char *strtab,
unsigned int symndx, Elf_Shdr *relasec,
const char *sec_objname)
{
@@ -213,12 +212,12 @@ static int klp_resolve_symbols(Elf64_Shdr *sechdrs, const char *strtab,
* we use the smallest/strictest upper bound possible (56, based on
* the current definition of MODULE_NAME_LEN) to prevent overflows.
*/
- BUILD_BUG_ON(MODULE_NAME_LEN < 56 || KSYM_NAME_LEN != 128);
+ BUILD_BUG_ON(MODULE_NAME_LEN < 56 || KSYM_NAME_LEN != 512);
relas = (Elf_Rela *) relasec->sh_addr;
/* For each rela in this klp relocation section */
for (i = 0; i < relasec->sh_size / sizeof(Elf_Rela); i++) {
- sym = (Elf64_Sym *)sechdrs[symndx].sh_addr + ELF_R_SYM(relas[i].r_info);
+ sym = (Elf_Sym *)sechdrs[symndx].sh_addr + ELF_R_SYM(relas[i].r_info);
if (sym->st_shndx != SHN_LIVEPATCH) {
pr_err("symbol %s is not marked as a livepatch symbol\n",
strtab + sym->st_name);
@@ -227,7 +226,7 @@ static int klp_resolve_symbols(Elf64_Shdr *sechdrs, const char *strtab,
/* Format: .klp.sym.sym_objname.sym_name,sympos */
cnt = sscanf(strtab + sym->st_name,
- ".klp.sym.%55[^.].%127[^,],%lu",
+ ".klp.sym.%55[^.].%511[^,],%lu",
sym_objname, sym_name, &sympos);
if (cnt != 3) {
pr_err("symbol %s has an incorrectly formatted name\n",
@@ -261,6 +260,14 @@ static int klp_resolve_symbols(Elf64_Shdr *sechdrs, const char *strtab,
return 0;
}
+void __weak clear_relocate_add(Elf_Shdr *sechdrs,
+ const char *strtab,
+ unsigned int symindex,
+ unsigned int relsec,
+ struct module *me)
+{
+}
+
/*
* At a high-level, there are two types of klp relocation sections: those which
* reference symbols which live in vmlinux; and those which reference symbols
@@ -284,10 +291,10 @@ static int klp_resolve_symbols(Elf64_Shdr *sechdrs, const char *strtab,
* the to-be-patched module to be loaded and patched sometime *after* the
* klp module is loaded.
*/
-int klp_apply_section_relocs(struct module *pmod, Elf_Shdr *sechdrs,
- const char *shstrtab, const char *strtab,
- unsigned int symndx, unsigned int secndx,
- const char *objname)
+static int klp_write_section_relocs(struct module *pmod, Elf_Shdr *sechdrs,
+ const char *shstrtab, const char *strtab,
+ unsigned int symndx, unsigned int secndx,
+ const char *objname, bool apply)
{
int cnt, ret;
char sec_objname[MODULE_NAME_LEN];
@@ -309,11 +316,26 @@ int klp_apply_section_relocs(struct module *pmod, Elf_Shdr *sechdrs,
if (strcmp(objname ? objname : "vmlinux", sec_objname))
return 0;
- ret = klp_resolve_symbols(sechdrs, strtab, symndx, sec, sec_objname);
- if (ret)
- return ret;
+ if (apply) {
+ ret = klp_resolve_symbols(sechdrs, strtab, symndx,
+ sec, sec_objname);
+ if (ret)
+ return ret;
- return apply_relocate_add(sechdrs, strtab, symndx, secndx, pmod);
+ return apply_relocate_add(sechdrs, strtab, symndx, secndx, pmod);
+ }
+
+ clear_relocate_add(sechdrs, strtab, symndx, secndx, pmod);
+ return 0;
+}
+
+int klp_apply_section_relocs(struct module *pmod, Elf_Shdr *sechdrs,
+ const char *shstrtab, const char *strtab,
+ unsigned int symndx, unsigned int secndx,
+ const char *objname)
+{
+ return klp_write_section_relocs(pmod, sechdrs, shstrtab, strtab, symndx,
+ secndx, objname, true);
}
/*
@@ -325,6 +347,7 @@ int klp_apply_section_relocs(struct module *pmod, Elf_Shdr *sechdrs,
* /sys/kernel/livepatch/<patch>/transition
* /sys/kernel/livepatch/<patch>/force
* /sys/kernel/livepatch/<patch>/<object>
+ * /sys/kernel/livepatch/<patch>/<object>/patched
* /sys/kernel/livepatch/<patch>/<object>/<function,sympos>
*/
static int __klp_disable_patch(struct klp_patch *patch);
@@ -431,6 +454,22 @@ static struct attribute *klp_patch_attrs[] = {
};
ATTRIBUTE_GROUPS(klp_patch);
+static ssize_t patched_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct klp_object *obj;
+
+ obj = container_of(kobj, struct klp_object, kobj);
+ return sysfs_emit(buf, "%d\n", obj->patched);
+}
+
+static struct kobj_attribute patched_kobj_attr = __ATTR_RO(patched);
+static struct attribute *klp_object_attrs[] = {
+ &patched_kobj_attr.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(klp_object);
+
static void klp_free_object_dynamic(struct klp_object *obj)
{
kfree(obj->name);
@@ -557,7 +596,7 @@ static void klp_kobj_release_patch(struct kobject *kobj)
complete(&patch->finish);
}
-static struct kobj_type klp_ktype_patch = {
+static const struct kobj_type klp_ktype_patch = {
.release = klp_kobj_release_patch,
.sysfs_ops = &kobj_sysfs_ops,
.default_groups = klp_patch_groups,
@@ -573,9 +612,10 @@ static void klp_kobj_release_object(struct kobject *kobj)
klp_free_object_dynamic(obj);
}
-static struct kobj_type klp_ktype_object = {
+static const struct kobj_type klp_ktype_object = {
.release = klp_kobj_release_object,
.sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = klp_object_groups,
};
static void klp_kobj_release_func(struct kobject *kobj)
@@ -588,7 +628,7 @@ static void klp_kobj_release_func(struct kobject *kobj)
klp_free_func_nop(func);
}
-static struct kobj_type klp_ktype_func = {
+static const struct kobj_type klp_ktype_func = {
.release = klp_kobj_release_func,
.sysfs_ops = &kobj_sysfs_ops,
};
@@ -744,8 +784,9 @@ static int klp_init_func(struct klp_object *obj, struct klp_func *func)
func->old_sympos ? func->old_sympos : 1);
}
-static int klp_apply_object_relocs(struct klp_patch *patch,
- struct klp_object *obj)
+static int klp_write_object_relocs(struct klp_patch *patch,
+ struct klp_object *obj,
+ bool apply)
{
int i, ret;
struct klp_modinfo *info = patch->mod->klp_info;
@@ -756,10 +797,10 @@ static int klp_apply_object_relocs(struct klp_patch *patch,
if (!(sec->sh_flags & SHF_RELA_LIVEPATCH))
continue;
- ret = klp_apply_section_relocs(patch->mod, info->sechdrs,
+ ret = klp_write_section_relocs(patch->mod, info->sechdrs,
info->secstrings,
patch->mod->core_kallsyms.strtab,
- info->symndx, i, obj->name);
+ info->symndx, i, obj->name, apply);
if (ret)
return ret;
}
@@ -767,6 +808,18 @@ static int klp_apply_object_relocs(struct klp_patch *patch,
return 0;
}
+static int klp_apply_object_relocs(struct klp_patch *patch,
+ struct klp_object *obj)
+{
+ return klp_write_object_relocs(patch, obj, true);
+}
+
+static void klp_clear_object_relocs(struct klp_patch *patch,
+ struct klp_object *obj)
+{
+ klp_write_object_relocs(patch, obj, false);
+}
+
/* parts of the initialization that is done only when the object is loaded */
static int klp_init_object_loaded(struct klp_patch *patch,
struct klp_object *obj)
@@ -862,14 +915,11 @@ static void klp_init_object_early(struct klp_patch *patch,
list_add_tail(&obj->node, &patch->obj_list);
}
-static int klp_init_patch_early(struct klp_patch *patch)
+static void klp_init_patch_early(struct klp_patch *patch)
{
struct klp_object *obj;
struct klp_func *func;
- if (!patch->objs)
- return -EINVAL;
-
INIT_LIST_HEAD(&patch->list);
INIT_LIST_HEAD(&patch->obj_list);
kobject_init(&patch->kobj, &klp_ktype_patch);
@@ -879,20 +929,12 @@ static int klp_init_patch_early(struct klp_patch *patch)
init_completion(&patch->finish);
klp_for_each_object_static(patch, obj) {
- if (!obj->funcs)
- return -EINVAL;
-
klp_init_object_early(patch, obj);
klp_for_each_func_static(obj, func) {
klp_init_func_early(obj, func);
}
}
-
- if (!try_module_get(patch->mod))
- return -ENODEV;
-
- return 0;
}
static int klp_init_patch(struct klp_patch *patch)
@@ -1024,10 +1066,17 @@ err:
int klp_enable_patch(struct klp_patch *patch)
{
int ret;
+ struct klp_object *obj;
- if (!patch || !patch->mod)
+ if (!patch || !patch->mod || !patch->objs)
return -EINVAL;
+ klp_for_each_object_static(patch, obj) {
+ if (!obj->funcs)
+ return -EINVAL;
+ }
+
+
if (!is_livepatch_module(patch->mod)) {
pr_err("module %s is not marked as a livepatch module\n",
patch->mod->name);
@@ -1051,12 +1100,13 @@ int klp_enable_patch(struct klp_patch *patch)
return -EINVAL;
}
- ret = klp_init_patch_early(patch);
- if (ret) {
+ if (!try_module_get(patch->mod)) {
mutex_unlock(&klp_mutex);
- return ret;
+ return -ENODEV;
}
+ klp_init_patch_early(patch);
+
ret = klp_init_patch(patch);
if (ret)
goto err;
@@ -1157,7 +1207,7 @@ static void klp_cleanup_module_patches_limited(struct module *mod,
klp_unpatch_object(obj);
klp_post_unpatch_callback(obj);
-
+ klp_clear_object_relocs(patch, obj);
klp_free_object_loaded(obj);
break;
}
@@ -1174,7 +1224,7 @@ int klp_module_coming(struct module *mod)
return -EINVAL;
if (!strcmp(mod->name, "vmlinux")) {
- pr_err("vmlinux.ko: invalid module name");
+ pr_err("vmlinux.ko: invalid module name\n");
return -EINVAL;
}
diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c
index e8029aea67f1..4152c71507e2 100644
--- a/kernel/livepatch/patch.c
+++ b/kernel/livepatch/patch.c
@@ -49,14 +49,15 @@ static void notrace klp_ftrace_handler(unsigned long ip,
ops = container_of(fops, struct klp_ops, fops);
+ /*
+ * The ftrace_test_recursion_trylock() will disable preemption,
+ * which is required for the variant of synchronize_rcu() that is
+ * used to allow patching functions where RCU is not watching.
+ * See klp_synchronize_transition() for more details.
+ */
bit = ftrace_test_recursion_trylock(ip, parent_ip);
if (WARN_ON_ONCE(bit < 0))
return;
- /*
- * A variant of synchronize_rcu() is used to allow patching functions
- * where RCU is not watching, see klp_synchronize_transition().
- */
- preempt_disable_notrace();
func = list_first_or_null_rcu(&ops->func_stack, struct klp_func,
stack_node);
@@ -117,26 +118,12 @@ static void notrace klp_ftrace_handler(unsigned long ip,
if (func->nop)
goto unlock;
- klp_arch_set_pc(fregs, (unsigned long)func->new_func);
+ ftrace_regs_set_instruction_pointer(fregs, (unsigned long)func->new_func);
unlock:
- preempt_enable_notrace();
ftrace_test_recursion_unlock(bit);
}
-/*
- * Convert a function address into the appropriate ftrace location.
- *
- * Usually this is just the address of the function, but on some architectures
- * it's more complicated so allow them to provide a custom behaviour.
- */
-#ifndef klp_get_ftrace_location
-static unsigned long klp_get_ftrace_location(unsigned long faddr)
-{
- return faddr;
-}
-#endif
-
static void klp_unpatch_func(struct klp_func *func)
{
struct klp_ops *ops;
@@ -153,8 +140,7 @@ static void klp_unpatch_func(struct klp_func *func)
if (list_is_singular(&ops->func_stack)) {
unsigned long ftrace_loc;
- ftrace_loc =
- klp_get_ftrace_location((unsigned long)func->old_func);
+ ftrace_loc = ftrace_location((unsigned long)func->old_func);
if (WARN_ON(!ftrace_loc))
return;
@@ -186,8 +172,7 @@ static int klp_patch_func(struct klp_func *func)
if (!ops) {
unsigned long ftrace_loc;
- ftrace_loc =
- klp_get_ftrace_location((unsigned long)func->old_func);
+ ftrace_loc = ftrace_location((unsigned long)func->old_func);
if (!ftrace_loc) {
pr_err("failed to find location for function '%s'\n",
func->old_name);
diff --git a/kernel/livepatch/shadow.c b/kernel/livepatch/shadow.c
index e5c9fb295ba9..c2e724d97ddf 100644
--- a/kernel/livepatch/shadow.c
+++ b/kernel/livepatch/shadow.c
@@ -272,12 +272,12 @@ void klp_shadow_free(void *obj, unsigned long id, klp_shadow_dtor_t dtor)
EXPORT_SYMBOL_GPL(klp_shadow_free);
/**
- * klp_shadow_free_all() - detach and free all <*, id> shadow variables
+ * klp_shadow_free_all() - detach and free all <_, id> shadow variables
* @id: data identifier
* @dtor: custom callback that can be used to unregister the variable
* and/or free data that the shadow variable points to (optional)
*
- * This function releases the memory for all <*, id> shadow variable
+ * This function releases the memory for all <_, id> shadow variable
* instances, callers should stop referencing them accordingly.
*/
void klp_shadow_free_all(unsigned long id, klp_shadow_dtor_t dtor)
@@ -288,7 +288,7 @@ void klp_shadow_free_all(unsigned long id, klp_shadow_dtor_t dtor)
spin_lock_irqsave(&klp_shadow_lock, flags);
- /* Delete all <*, id> from hash */
+ /* Delete all <_, id> from hash */
hash_for_each(klp_shadow_hash, i, shadow, node) {
if (klp_shadow_match(shadow, shadow->obj, id))
klp_shadow_free_struct(shadow, dtor);
diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c
index 3a4beb9395c4..e9fd83a02228 100644
--- a/kernel/livepatch/transition.c
+++ b/kernel/livepatch/transition.c
@@ -9,13 +9,14 @@
#include <linux/cpu.h>
#include <linux/stacktrace.h>
-#include <linux/tracehook.h>
+#include <linux/static_call.h>
#include "core.h"
#include "patch.h"
#include "transition.h"
-#include "../sched/sched.h"
#define MAX_STACK_ENTRIES 100
+DEFINE_PER_CPU(unsigned long[MAX_STACK_ENTRIES], klp_stack_entries);
+
#define STACK_ERR_BUF_SIZE 128
#define SIGNALS_TIMEOUT 15
@@ -27,6 +28,25 @@ static int klp_target_state = KLP_UNDEFINED;
static unsigned int klp_signals_cnt;
/*
+ * When a livepatch is in progress, enable klp stack checking in
+ * cond_resched(). This helps CPU-bound kthreads get patched.
+ */
+#if defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
+
+#define klp_cond_resched_enable() sched_dynamic_klp_enable()
+#define klp_cond_resched_disable() sched_dynamic_klp_disable()
+
+#else /* !CONFIG_PREEMPT_DYNAMIC || !CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */
+
+DEFINE_STATIC_KEY_FALSE(klp_sched_try_switch_key);
+EXPORT_SYMBOL(klp_sched_try_switch_key);
+
+#define klp_cond_resched_enable() static_branch_enable(&klp_sched_try_switch_key)
+#define klp_cond_resched_disable() static_branch_disable(&klp_sched_try_switch_key)
+
+#endif /* CONFIG_PREEMPT_DYNAMIC && CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */
+
+/*
* This work can be performed periodically to finish patching or unpatching any
* "straggler" tasks which failed to transition in the first attempt.
*/
@@ -174,8 +194,8 @@ void klp_update_patch_state(struct task_struct *task)
* barrier (smp_rmb) for two cases:
*
* 1) Enforce the order of the TIF_PATCH_PENDING read and the
- * klp_target_state read. The corresponding write barrier is in
- * klp_init_transition().
+ * klp_target_state read. The corresponding write barriers are in
+ * klp_init_transition() and klp_reverse_transition().
*
* 2) Enforce the order of the TIF_PATCH_PENDING read and a future read
* of func->transition, if klp_ftrace_handler() is called later on
@@ -198,36 +218,36 @@ static int klp_check_stack_func(struct klp_func *func, unsigned long *entries,
struct klp_ops *ops;
int i;
- for (i = 0; i < nr_entries; i++) {
- address = entries[i];
+ if (klp_target_state == KLP_UNPATCHED) {
+ /*
+ * Check for the to-be-unpatched function
+ * (the func itself).
+ */
+ func_addr = (unsigned long)func->new_func;
+ func_size = func->new_size;
+ } else {
+ /*
+ * Check for the to-be-patched function
+ * (the previous func).
+ */
+ ops = klp_find_ops(func->old_func);
- if (klp_target_state == KLP_UNPATCHED) {
- /*
- * Check for the to-be-unpatched function
- * (the func itself).
- */
- func_addr = (unsigned long)func->new_func;
- func_size = func->new_size;
+ if (list_is_singular(&ops->func_stack)) {
+ /* original function */
+ func_addr = (unsigned long)func->old_func;
+ func_size = func->old_size;
} else {
- /*
- * Check for the to-be-patched function
- * (the previous func).
- */
- ops = klp_find_ops(func->old_func);
-
- if (list_is_singular(&ops->func_stack)) {
- /* original function */
- func_addr = (unsigned long)func->old_func;
- func_size = func->old_size;
- } else {
- /* previously patched function */
- struct klp_func *prev;
-
- prev = list_next_entry(func, stack_node);
- func_addr = (unsigned long)prev->new_func;
- func_size = prev->new_size;
- }
+ /* previously patched function */
+ struct klp_func *prev;
+
+ prev = list_next_entry(func, stack_node);
+ func_addr = (unsigned long)prev->new_func;
+ func_size = prev->new_size;
}
+ }
+
+ for (i = 0; i < nr_entries; i++) {
+ address = entries[i];
if (address >= func_addr && address < func_addr + func_size)
return -EAGAIN;
@@ -240,20 +260,19 @@ static int klp_check_stack_func(struct klp_func *func, unsigned long *entries,
* Determine whether it's safe to transition the task to the target patch state
* by looking for any to-be-patched or to-be-unpatched functions on its stack.
*/
-static int klp_check_stack(struct task_struct *task, char *err_buf)
+static int klp_check_stack(struct task_struct *task, const char **oldname)
{
- static unsigned long entries[MAX_STACK_ENTRIES];
+ unsigned long *entries = this_cpu_ptr(klp_stack_entries);
struct klp_object *obj;
struct klp_func *func;
int ret, nr_entries;
- ret = stack_trace_save_tsk_reliable(task, entries, ARRAY_SIZE(entries));
- if (ret < 0) {
- snprintf(err_buf, STACK_ERR_BUF_SIZE,
- "%s: %s:%d has an unreliable stack\n",
- __func__, task->comm, task->pid);
- return ret;
- }
+ /* Protect 'klp_stack_entries' */
+ lockdep_assert_preemption_disabled();
+
+ ret = stack_trace_save_tsk_reliable(task, entries, MAX_STACK_ENTRIES);
+ if (ret < 0)
+ return -EINVAL;
nr_entries = ret;
klp_for_each_object(klp_transition_patch, obj) {
@@ -262,11 +281,8 @@ static int klp_check_stack(struct task_struct *task, char *err_buf)
klp_for_each_func(obj, func) {
ret = klp_check_stack_func(func, entries, nr_entries);
if (ret) {
- snprintf(err_buf, STACK_ERR_BUF_SIZE,
- "%s: %s:%d is sleeping on function %s\n",
- __func__, task->comm, task->pid,
- func->old_name);
- return ret;
+ *oldname = func->old_name;
+ return -EADDRINUSE;
}
}
}
@@ -274,6 +290,22 @@ static int klp_check_stack(struct task_struct *task, char *err_buf)
return 0;
}
+static int klp_check_and_switch_task(struct task_struct *task, void *arg)
+{
+ int ret;
+
+ if (task_curr(task) && task != current)
+ return -EBUSY;
+
+ ret = klp_check_stack(task, arg);
+ if (ret)
+ return ret;
+
+ clear_tsk_thread_flag(task, TIF_PATCH_PENDING);
+ task->patch_state = klp_target_state;
+ return 0;
+}
+
/*
* Try to safely switch a task to the target patch state. If it's currently
* running, or it's sleeping on a to-be-patched or to-be-unpatched function, or
@@ -281,13 +313,8 @@ static int klp_check_stack(struct task_struct *task, char *err_buf)
*/
static bool klp_try_switch_task(struct task_struct *task)
{
- static char err_buf[STACK_ERR_BUF_SIZE];
- struct rq *rq;
- struct rq_flags flags;
+ const char *old_name;
int ret;
- bool success = false;
-
- err_buf[0] = '\0';
/* check if this task has already switched over */
if (task->patch_state == klp_target_state)
@@ -305,37 +332,74 @@ static bool klp_try_switch_task(struct task_struct *task)
* functions. If all goes well, switch the task to the target patch
* state.
*/
- rq = task_rq_lock(task, &flags);
+ if (task == current)
+ ret = klp_check_and_switch_task(current, &old_name);
+ else
+ ret = task_call_func(task, klp_check_and_switch_task, &old_name);
+
+ switch (ret) {
+ case 0: /* success */
+ break;
- if (task_running(rq, task) && task != current) {
- snprintf(err_buf, STACK_ERR_BUF_SIZE,
- "%s: %s:%d is running\n", __func__, task->comm,
- task->pid);
- goto done;
+ case -EBUSY: /* klp_check_and_switch_task() */
+ pr_debug("%s: %s:%d is running\n",
+ __func__, task->comm, task->pid);
+ break;
+ case -EINVAL: /* klp_check_and_switch_task() */
+ pr_debug("%s: %s:%d has an unreliable stack\n",
+ __func__, task->comm, task->pid);
+ break;
+ case -EADDRINUSE: /* klp_check_and_switch_task() */
+ pr_debug("%s: %s:%d is sleeping on function %s\n",
+ __func__, task->comm, task->pid, old_name);
+ break;
+
+ default:
+ pr_debug("%s: Unknown error code (%d) when trying to switch %s:%d\n",
+ __func__, ret, task->comm, task->pid);
+ break;
}
- ret = klp_check_stack(task, err_buf);
- if (ret)
- goto done;
+ return !ret;
+}
- success = true;
+void __klp_sched_try_switch(void)
+{
+ if (likely(!klp_patch_pending(current)))
+ return;
- clear_tsk_thread_flag(task, TIF_PATCH_PENDING);
- task->patch_state = klp_target_state;
+ /*
+ * This function is called from cond_resched() which is called in many
+ * places throughout the kernel. Using the klp_mutex here might
+ * deadlock.
+ *
+ * Instead, disable preemption to prevent racing with other callers of
+ * klp_try_switch_task(). Thanks to task_call_func() they won't be
+ * able to switch this task while it's running.
+ */
+ preempt_disable();
-done:
- task_rq_unlock(rq, task, &flags);
+ /*
+ * Make sure current didn't get patched between the above check and
+ * preempt_disable().
+ */
+ if (unlikely(!klp_patch_pending(current)))
+ goto out;
/*
- * Due to console deadlock issues, pr_debug() can't be used while
- * holding the task rq lock. Instead we have to use a temporary buffer
- * and print the debug message after releasing the lock.
+ * Enforce the order of the TIF_PATCH_PENDING read above and the
+ * klp_target_state read in klp_try_switch_task(). The corresponding
+ * write barriers are in klp_init_transition() and
+ * klp_reverse_transition().
*/
- if (err_buf[0] != '\0')
- pr_debug("%s", err_buf);
+ smp_rmb();
- return success;
+ klp_try_switch_task(current);
+
+out:
+ preempt_enable();
}
+EXPORT_SYMBOL(__klp_sched_try_switch);
/*
* Sends a fake signal to all non-kthread tasks with TIF_PATCH_PENDING set.
@@ -411,19 +475,22 @@ void klp_try_complete_transition(void)
/*
* Ditto for the idle "swapper" tasks.
*/
- get_online_cpus();
+ cpus_read_lock();
for_each_possible_cpu(cpu) {
task = idle_task(cpu);
if (cpu_online(cpu)) {
- if (!klp_try_switch_task(task))
+ if (!klp_try_switch_task(task)) {
complete = false;
+ /* Make idle task go through the main loop. */
+ wake_up_if_idle(cpu);
+ }
} else if (task->patch_state != klp_target_state) {
/* offline idle tasks can be switched immediately */
clear_tsk_thread_flag(task, TIF_PATCH_PENDING);
task->patch_state = klp_target_state;
}
}
- put_online_cpus();
+ cpus_read_unlock();
if (!complete) {
if (klp_signals_cnt && !(klp_signals_cnt % SIGNALS_TIMEOUT))
@@ -440,7 +507,8 @@ void klp_try_complete_transition(void)
return;
}
- /* we're done, now cleanup the data structures */
+ /* Done! Now cleanup the data structures. */
+ klp_cond_resched_disable();
patch = klp_transition_patch;
klp_complete_transition();
@@ -492,6 +560,8 @@ void klp_start_transition(void)
set_tsk_thread_flag(task, TIF_PATCH_PENDING);
}
+ klp_cond_resched_enable();
+
klp_signals_cnt = 0;
}
@@ -547,8 +617,9 @@ void klp_init_transition(struct klp_patch *patch, int state)
* see a func in transition with a task->patch_state of KLP_UNDEFINED.
*
* Also enforce the order of the klp_target_state write and future
- * TIF_PATCH_PENDING writes to ensure klp_update_patch_state() doesn't
- * set a task->patch_state to KLP_UNDEFINED.
+ * TIF_PATCH_PENDING writes to ensure klp_update_patch_state() and
+ * __klp_sched_try_switch() don't set a task->patch_state to
+ * KLP_UNDEFINED.
*/
smp_wmb();
@@ -584,14 +655,10 @@ void klp_reverse_transition(void)
klp_target_state == KLP_PATCHED ? "patching to unpatching" :
"unpatching to patching");
- klp_transition_patch->enabled = !klp_transition_patch->enabled;
-
- klp_target_state = !klp_target_state;
-
/*
* Clear all TIF_PATCH_PENDING flags to prevent races caused by
- * klp_update_patch_state() running in parallel with
- * klp_start_transition().
+ * klp_update_patch_state() or __klp_sched_try_switch() running in
+ * parallel with the reverse transition.
*/
read_lock(&tasklist_lock);
for_each_process_thread(g, task)
@@ -601,18 +668,51 @@ void klp_reverse_transition(void)
for_each_possible_cpu(cpu)
clear_tsk_thread_flag(idle_task(cpu), TIF_PATCH_PENDING);
- /* Let any remaining calls to klp_update_patch_state() complete */
+ /*
+ * Make sure all existing invocations of klp_update_patch_state() and
+ * __klp_sched_try_switch() see the cleared TIF_PATCH_PENDING before
+ * starting the reverse transition.
+ */
klp_synchronize_transition();
+ /*
+ * All patching has stopped, now re-initialize the global variables to
+ * prepare for the reverse transition.
+ */
+ klp_transition_patch->enabled = !klp_transition_patch->enabled;
+ klp_target_state = !klp_target_state;
+
+ /*
+ * Enforce the order of the klp_target_state write and the
+ * TIF_PATCH_PENDING writes in klp_start_transition() to ensure
+ * klp_update_patch_state() and __klp_sched_try_switch() don't set
+ * task->patch_state to the wrong value.
+ */
+ smp_wmb();
+
klp_start_transition();
}
/* Called from copy_process() during fork */
void klp_copy_process(struct task_struct *child)
{
- child->patch_state = current->patch_state;
- /* TIF_PATCH_PENDING gets copied in setup_thread_stack() */
+ /*
+ * The parent process may have gone through a KLP transition since
+ * the thread flag was copied in setup_thread_stack earlier. Bring
+ * the task flag up to date with the parent here.
+ *
+ * The operation is serialized against all klp_*_transition()
+ * operations by the tasklist_lock. The only exceptions are
+ * klp_update_patch_state(current) and __klp_sched_try_switch(), but we
+ * cannot race with them because we are current.
+ */
+ if (test_tsk_thread_flag(current, TIF_PATCH_PENDING))
+ set_tsk_thread_flag(child, TIF_PATCH_PENDING);
+ else
+ clear_tsk_thread_flag(child, TIF_PATCH_PENDING);
+
+ child->patch_state = current->patch_state;
}
/*
@@ -640,6 +740,13 @@ void klp_force_transition(void)
for_each_possible_cpu(cpu)
klp_update_patch_state(idle_task(cpu));
- klp_for_each_patch(patch)
- patch->forced = true;
+ /* Set forced flag for patches being removed. */
+ if (klp_target_state == KLP_UNPATCHED)
+ klp_transition_patch->forced = true;
+ else if (klp_transition_patch->replace) {
+ klp_for_each_patch(patch) {
+ if (patch != klp_transition_patch)
+ patch->forced = true;
+ }
+ }
}
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
index 3572808223e4..0db4093d17b8 100644
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -5,7 +5,7 @@ KCOV_INSTRUMENT := n
obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o
-# Avoid recursion lockdep -> KCSAN -> ... -> lockdep.
+# Avoid recursion lockdep -> sanitizer -> ... -> lockdep.
KCSAN_SANITIZE_lockdep.o := n
ifdef CONFIG_FUNCTION_TRACER
@@ -24,7 +24,8 @@ obj-$(CONFIG_SMP) += spinlock.o
obj-$(CONFIG_LOCK_SPIN_ON_OWNER) += osq_lock.o
obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
obj-$(CONFIG_QUEUED_SPINLOCKS) += qspinlock.o
-obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
+obj-$(CONFIG_RT_MUTEXES) += rtmutex_api.o
+obj-$(CONFIG_PREEMPT_RT) += spinlock_rt.o ww_rt_mutex.o
obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o
obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index bf1c00c881e4..dcd1d5bfc1e0 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -55,28 +55,59 @@
#include <linux/rcupdate.h>
#include <linux/kprobes.h>
#include <linux/lockdep.h>
+#include <linux/context_tracking.h>
#include <asm/sections.h>
#include "lockdep_internals.h"
-#define CREATE_TRACE_POINTS
#include <trace/events/lock.h>
#ifdef CONFIG_PROVE_LOCKING
-int prove_locking = 1;
+static int prove_locking = 1;
module_param(prove_locking, int, 0644);
#else
#define prove_locking 0
#endif
#ifdef CONFIG_LOCK_STAT
-int lock_stat = 1;
+static int lock_stat = 1;
module_param(lock_stat, int, 0644);
#else
#define lock_stat 0
#endif
+#ifdef CONFIG_SYSCTL
+static struct ctl_table kern_lockdep_table[] = {
+#ifdef CONFIG_PROVE_LOCKING
+ {
+ .procname = "prove_locking",
+ .data = &prove_locking,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#endif /* CONFIG_PROVE_LOCKING */
+#ifdef CONFIG_LOCK_STAT
+ {
+ .procname = "lock_stat",
+ .data = &lock_stat,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#endif /* CONFIG_LOCK_STAT */
+ { }
+};
+
+static __init int kernel_lockdep_sysctls_init(void)
+{
+ register_sysctl_init("kernel", kern_lockdep_table);
+ return 0;
+}
+late_initcall(kernel_lockdep_sysctls_init);
+#endif /* CONFIG_SYSCTL */
+
DEFINE_PER_CPU(unsigned int, lockdep_recursion);
EXPORT_PER_CPU_SYMBOL_GPL(lockdep_recursion);
@@ -183,11 +214,9 @@ static DECLARE_BITMAP(list_entries_in_use, MAX_LOCKDEP_ENTRIES);
static struct hlist_head lock_keys_hash[KEYHASH_SIZE];
unsigned long nr_lock_classes;
unsigned long nr_zapped_classes;
-#ifndef CONFIG_DEBUG_LOCKDEP
-static
-#endif
+unsigned long max_lock_class_idx;
struct lock_class lock_classes[MAX_LOCKDEP_KEYS];
-static DECLARE_BITMAP(lock_classes_in_use, MAX_LOCKDEP_KEYS);
+DECLARE_BITMAP(lock_classes_in_use, MAX_LOCKDEP_KEYS);
static inline struct lock_class *hlock_class(struct held_lock *hlock)
{
@@ -338,7 +367,7 @@ static inline void lock_release_holdtime(struct held_lock *hlock)
* elements. These elements are linked together by the lock_entry member in
* struct lock_class.
*/
-LIST_HEAD(all_lock_classes);
+static LIST_HEAD(all_lock_classes);
static LIST_HEAD(free_lock_classes);
/**
@@ -788,6 +817,21 @@ static int very_verbose(struct lock_class *class)
* Is this the address of a static object:
*/
#ifdef __KERNEL__
+/*
+ * Check if an address is part of freed initmem. After initmem is freed,
+ * memory can be allocated from it, and such allocations would then have
+ * addresses within the range [_stext, _end].
+ */
+#ifndef arch_is_kernel_initmem_freed
+static int arch_is_kernel_initmem_freed(unsigned long addr)
+{
+ if (system_state < SYSTEM_FREEING_INITMEM)
+ return 0;
+
+ return init_section_contains((void *)addr, 1);
+}
+#endif
+
static int static_obj(const void *obj)
{
unsigned long start = (unsigned long) &_stext,
@@ -803,9 +847,6 @@ static int static_obj(const void *obj)
if ((addr >= start) && (addr < end))
return 1;
- if (arch_is_kernel_data(addr))
- return 1;
-
/*
* in-kernel percpu var?
*/
@@ -888,14 +929,16 @@ look_up_lock_class(const struct lockdep_map *lock, unsigned int subclass)
if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
return NULL;
- hlist_for_each_entry_rcu(class, hash_head, hash_entry) {
+ hlist_for_each_entry_rcu_notrace(class, hash_head, hash_entry) {
if (class->key == key) {
/*
* Huh! same key, different name? Did someone trample
* on some memory? We're most confused.
*/
- WARN_ON_ONCE(class->name != lock->name &&
- lock->key != &__lockdep_no_validate__);
+ WARN_ONCE(class->name != lock->name &&
+ lock->key != &__lockdep_no_validate__,
+ "Looking for class \"%s\" with key %ps, but found a different class \"%s\" with the same key\n",
+ lock->name, lock->key, class->name);
return class;
}
}
@@ -1240,6 +1283,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
struct lockdep_subclass_key *key;
struct hlist_head *hash_head;
struct lock_class *class;
+ int idx;
DEBUG_LOCKS_WARN_ON(!irqs_disabled());
@@ -1305,6 +1349,9 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
* of classes.
*/
list_move_tail(&class->lock_entry, &all_lock_classes);
+ idx = class - lock_classes;
+ if (idx > max_lock_class_idx)
+ max_lock_class_idx = idx;
if (verbose(class)) {
graph_unlock();
@@ -1366,7 +1413,7 @@ static struct lock_list *alloc_list_entry(void)
*/
static int add_lock_to_list(struct lock_class *this,
struct lock_class *links_to, struct list_head *head,
- unsigned long ip, u16 distance, u8 dep,
+ u16 distance, u8 dep,
const struct lock_trace *trace)
{
struct lock_list *entry;
@@ -1834,6 +1881,8 @@ print_circular_lock_scenario(struct held_lock *src,
struct lock_class *source = hlock_class(src);
struct lock_class *target = hlock_class(tgt);
struct lock_class *parent = prt->class;
+ int src_read = src->read;
+ int tgt_read = tgt->read;
/*
* A direct locking problem where unsafe_class lock is taken
@@ -1861,7 +1910,10 @@ print_circular_lock_scenario(struct held_lock *src,
printk(" Possible unsafe locking scenario:\n\n");
printk(" CPU0 CPU1\n");
printk(" ---- ----\n");
- printk(" lock(");
+ if (tgt_read != 0)
+ printk(" rlock(");
+ else
+ printk(" lock(");
__print_lock_name(target);
printk(KERN_CONT ");\n");
printk(" lock(");
@@ -1870,7 +1922,12 @@ print_circular_lock_scenario(struct held_lock *src,
printk(" lock(");
__print_lock_name(target);
printk(KERN_CONT ");\n");
- printk(" lock(");
+ if (src_read != 0)
+ printk(" rlock(");
+ else if (src->sync)
+ printk(" sync(");
+ else
+ printk(" lock(");
__print_lock_name(source);
printk(KERN_CONT ");\n");
printk("\n *** DEADLOCK ***\n\n");
@@ -3119,19 +3176,15 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
* to the previous lock's dependency list:
*/
ret = add_lock_to_list(hlock_class(next), hlock_class(prev),
- &hlock_class(prev)->locks_after,
- next->acquire_ip, distance,
- calc_dep(prev, next),
- *trace);
+ &hlock_class(prev)->locks_after, distance,
+ calc_dep(prev, next), *trace);
if (!ret)
return 0;
ret = add_lock_to_list(hlock_class(prev), hlock_class(next),
- &hlock_class(next)->locks_before,
- next->acquire_ip, distance,
- calc_depb(prev, next),
- *trace);
+ &hlock_class(next)->locks_before, distance,
+ calc_depb(prev, next), *trace);
if (!ret)
return 0;
@@ -3450,7 +3503,7 @@ struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i)
u16 chain_hlock = chain_hlocks[chain->base + i];
unsigned int class_idx = chain_hlock_class_idx(chain_hlock);
- return lock_classes + class_idx - 1;
+ return lock_classes + class_idx;
}
/*
@@ -3518,7 +3571,7 @@ static void print_chain_keys_chain(struct lock_chain *chain)
hlock_id = chain_hlocks[chain->base + i];
chain_key = print_chain_key_iteration(hlock_id, chain_key);
- print_lock_name(lock_classes + chain_hlock_class_idx(hlock_id) - 1);
+ print_lock_name(lock_classes + chain_hlock_class_idx(hlock_id));
printk("\n");
}
}
@@ -4222,14 +4275,13 @@ static void __trace_hardirqs_on_caller(void)
/**
* lockdep_hardirqs_on_prepare - Prepare for enabling interrupts
- * @ip: Caller address
*
* Invoked before a possible transition to RCU idle from exit to user or
* guest mode. This ensures that all RCU operations are done before RCU
* stops watching. After the RCU transition lockdep_hardirqs_on() has to be
* invoked to set the final state.
*/
-void lockdep_hardirqs_on_prepare(unsigned long ip)
+void lockdep_hardirqs_on_prepare(void)
{
if (unlikely(!debug_locks))
return;
@@ -4489,7 +4541,13 @@ mark_usage(struct task_struct *curr, struct held_lock *hlock, int check)
return 0;
}
}
- if (!hlock->hardirqs_off) {
+
+ /*
+ * For lock_sync(), don't mark the ENABLED usage, since lock_sync()
+ * creates no critical section and no extra dependency can be introduced
+ * by interrupts
+ */
+ if (!hlock->hardirqs_off && !hlock->sync) {
if (hlock->read) {
if (!mark_lock(curr, hlock,
LOCK_ENABLED_HARDIRQ_READ))
@@ -4671,7 +4729,7 @@ print_lock_invalid_wait_context(struct task_struct *curr,
/*
* Verify the wait_type context.
*
- * This check validates we takes locks in the right wait-type order; that is it
+ * This check validates we take locks in the right wait-type order; that is it
* ensures that we do not take mutexes inside spinlocks and do not attempt to
* acquire spinlocks inside raw_spinlocks and the sort.
*
@@ -4826,8 +4884,7 @@ EXPORT_SYMBOL_GPL(__lockdep_no_validate__);
static void
print_lock_nested_lock_not_held(struct task_struct *curr,
- struct held_lock *hlock,
- unsigned long ip)
+ struct held_lock *hlock)
{
if (!debug_locks_off())
return;
@@ -4869,7 +4926,7 @@ static int __lock_is_held(const struct lockdep_map *lock, int read);
static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
int trylock, int read, int check, int hardirqs_off,
struct lockdep_map *nest_lock, unsigned long ip,
- int references, int pin_count)
+ int references, int pin_count, int sync)
{
struct task_struct *curr = current;
struct lock_class *class = NULL;
@@ -4920,7 +4977,8 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
class_idx = class - lock_classes;
- if (depth) { /* we're holding locks */
+ if (depth && !sync) {
+ /* we're holding locks and the new held lock is not a sync */
hlock = curr->held_locks + depth - 1;
if (hlock->class_idx == class_idx && nest_lock) {
if (!references)
@@ -4954,6 +5012,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
hlock->trylock = trylock;
hlock->read = read;
hlock->check = check;
+ hlock->sync = !!sync;
hlock->hardirqs_off = !!hardirqs_off;
hlock->references = references;
#ifdef CONFIG_LOCK_STAT
@@ -5003,7 +5062,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
chain_key = iterate_chain_key(chain_key, hlock_id(hlock));
if (nest_lock && !__lock_is_held(nest_lock, -1)) {
- print_lock_nested_lock_not_held(curr, hlock, ip);
+ print_lock_nested_lock_not_held(curr, hlock);
return 0;
}
@@ -5015,6 +5074,10 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
if (!validate_chain(curr, hlock, chain_head, chain_key))
return 0;
+ /* For lock_sync(), we are done here since no actual critical section */
+ if (hlock->sync)
+ return 1;
+
curr->curr_chain_key = chain_key;
curr->lockdep_depth++;
check_chain_key(curr);
@@ -5156,7 +5219,7 @@ static int reacquire_held_locks(struct task_struct *curr, unsigned int depth,
hlock->read, hlock->check,
hlock->hardirqs_off,
hlock->nest_lock, hlock->acquire_ip,
- hlock->references, hlock->pin_count)) {
+ hlock->references, hlock->pin_count, 0)) {
case 0:
return 1;
case 1:
@@ -5200,9 +5263,10 @@ __lock_set_class(struct lockdep_map *lock, const char *name,
return 0;
}
- lockdep_init_map_waits(lock, name, key, 0,
- lock->wait_type_inner,
- lock->wait_type_outer);
+ lockdep_init_map_type(lock, name, key, 0,
+ lock->wait_type_inner,
+ lock->wait_type_outer,
+ lock->lock_type);
class = register_lock_class(lock, subclass, 0);
hlock->class_idx = class - lock_classes;
@@ -5366,7 +5430,7 @@ int __lock_is_held(const struct lockdep_map *lock, int read)
struct held_lock *hlock = curr->held_locks + i;
if (match_held_lock(hlock, lock)) {
- if (read == -1 || hlock->read == read)
+ if (read == -1 || !!hlock->read == read)
return LOCK_STATE_HELD;
return LOCK_STATE_NOT_HELD;
@@ -5394,7 +5458,7 @@ static struct pin_cookie __lock_pin_lock(struct lockdep_map *lock)
* be guessable and still allows some pin nesting in
* our u32 pin_count.
*/
- cookie.val = 1 + (prandom_u32() >> 16);
+ cookie.val = 1 + (sched_clock() & 0xffff);
hlock->pin_count += cookie.val;
return cookie;
}
@@ -5473,6 +5537,7 @@ static noinstr void check_flags(unsigned long flags)
}
}
+#ifndef CONFIG_PREEMPT_RT
/*
* We dont accurately track softirq state in e.g.
* hardirq contexts (such as on 4KSTACKS), so only
@@ -5487,6 +5552,7 @@ static noinstr void check_flags(unsigned long flags)
DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled);
}
}
+#endif
if (!debug_locks)
print_irqtrace_events(current);
@@ -5623,7 +5689,7 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
lockdep_recursion_inc();
__lock_acquire(lock, subclass, trylock, read, check,
- irqs_disabled_flags(flags), nest_lock, ip, 0, 0);
+ irqs_disabled_flags(flags), nest_lock, ip, 0, 0, 0);
lockdep_recursion_finish();
raw_local_irq_restore(flags);
}
@@ -5649,6 +5715,34 @@ void lock_release(struct lockdep_map *lock, unsigned long ip)
}
EXPORT_SYMBOL_GPL(lock_release);
+/*
+ * lock_sync() - A special annotation for synchronize_{s,}rcu()-like API.
+ *
+ * No actual critical section is created by the APIs annotated with this: these
+ * APIs are used to wait for one or multiple critical sections (on other CPUs
+ * or threads), and it means that calling these APIs inside these critical
+ * sections is potential deadlock.
+ */
+void lock_sync(struct lockdep_map *lock, unsigned subclass, int read,
+ int check, struct lockdep_map *nest_lock, unsigned long ip)
+{
+ unsigned long flags;
+
+ if (unlikely(!lockdep_enabled()))
+ return;
+
+ raw_local_irq_save(flags);
+ check_flags(flags);
+
+ lockdep_recursion_inc();
+ __lock_acquire(lock, subclass, 0, read, check,
+ irqs_disabled_flags(flags), nest_lock, ip, 0, 0, 1);
+ check_chain_key(current);
+ lockdep_recursion_finish();
+ raw_local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(lock_sync);
+
noinstr int lock_is_held_type(const struct lockdep_map *lock, int read)
{
unsigned long flags;
@@ -5986,6 +6080,8 @@ static void zap_class(struct pending_free *pf, struct lock_class *class)
WRITE_ONCE(class->name, NULL);
nr_lock_classes--;
__clear_bit(class - lock_classes, lock_classes_in_use);
+ if (class - lock_classes == max_lock_class_idx)
+ max_lock_class_idx--;
} else {
WARN_ONCE(true, "%s() failed for class %s\n", __func__,
class->name);
@@ -5997,13 +6093,10 @@ static void zap_class(struct pending_free *pf, struct lock_class *class)
static void reinit_class(struct lock_class *class)
{
- void *const p = class;
- const unsigned int offset = offsetof(struct lock_class, key);
-
WARN_ON_ONCE(!class->lock_entry.next);
WARN_ON_ONCE(!list_empty(&class->locks_after));
WARN_ON_ONCE(!list_empty(&class->locks_before));
- memset(p + offset, 0, sizeof(*class) - offset);
+ memset_startat(class, 0, key);
WARN_ON_ONCE(!class->lock_entry.next);
WARN_ON_ONCE(!list_empty(&class->locks_after));
WARN_ON_ONCE(!list_empty(&class->locks_before));
@@ -6276,7 +6369,13 @@ void lockdep_reset_lock(struct lockdep_map *lock)
lockdep_reset_lock_reg(lock);
}
-/* Unregister a dynamically allocated key. */
+/*
+ * Unregister a dynamically allocated key.
+ *
+ * Unlike lockdep_register_key(), a search is always done to find a matching
+ * key irrespective of debug_locks to avoid potential invalid access to freed
+ * memory in lock_class entry.
+ */
void lockdep_unregister_key(struct lock_class_key *key)
{
struct hlist_head *hash_head = keyhashentry(key);
@@ -6291,10 +6390,8 @@ void lockdep_unregister_key(struct lock_class_key *key)
return;
raw_local_irq_save(flags);
- if (!graph_lock())
- goto out_irq;
+ lockdep_lock();
- pf = get_pending_free();
hlist_for_each_entry_rcu(k, hash_head, hash_entry) {
if (k == key) {
hlist_del_rcu(&k->hash_entry);
@@ -6302,11 +6399,13 @@ void lockdep_unregister_key(struct lock_class_key *key)
break;
}
}
- WARN_ON_ONCE(!found);
- __lockdep_free_key_range(pf, key, 1);
- call_rcu_zapped(pf);
- graph_unlock();
-out_irq:
+ WARN_ON_ONCE(!found && debug_locks);
+ if (found) {
+ pf = get_pending_free();
+ __lockdep_free_key_range(pf, key, 1);
+ call_rcu_zapped(pf);
+ }
+ lockdep_unlock();
raw_local_irq_restore(flags);
/* Wait until is_dynamic_key() has finished accessing k->hash_entry. */
@@ -6507,6 +6606,7 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
{
struct task_struct *curr = current;
int dl = READ_ONCE(debug_locks);
+ bool rcu = warn_rcu_enter();
/* Note: the following can be executed concurrently, so be careful. */
pr_warn("\n");
@@ -6525,7 +6625,7 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
/*
* If a CPU is in the RCU-free window in idle (ie: in the section
- * between rcu_idle_enter() and rcu_idle_exit(), then RCU
+ * between ct_idle_enter() and ct_idle_exit(), then RCU
* considers that CPU to be in an "extended quiescent state",
* which means that RCU will be completely ignoring that CPU.
* Therefore, rcu_read_lock() and friends have absolutely no
@@ -6547,5 +6647,6 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
lockdep_print_held_locks(curr);
pr_warn("\nstack backtrace:\n");
dump_stack();
+ warn_rcu_exit(rcu);
}
EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious);
diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h
index ecb8662e7a4e..bbe9000260d0 100644
--- a/kernel/locking/lockdep_internals.h
+++ b/kernel/locking/lockdep_internals.h
@@ -121,7 +121,6 @@ static const unsigned long LOCKF_USED_IN_IRQ_READ =
#define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5)
-extern struct list_head all_lock_classes;
extern struct lock_chain lock_chains[];
#define LOCK_USAGE_CHARS (2*XXX_LOCK_USAGE_STATES + 1)
@@ -151,6 +150,10 @@ extern unsigned int nr_large_chain_blocks;
extern unsigned int max_lockdep_depth;
extern unsigned int max_bfs_queue_depth;
+extern unsigned long max_lock_class_idx;
+
+extern struct lock_class lock_classes[MAX_LOCKDEP_KEYS];
+extern unsigned long lock_classes_in_use[];
#ifdef CONFIG_PROVE_LOCKING
extern unsigned long lockdep_count_forward_deps(struct lock_class *);
@@ -205,7 +208,6 @@ struct lockdep_stats {
};
DECLARE_PER_CPU(struct lockdep_stats, lockdep_stats);
-extern struct lock_class lock_classes[MAX_LOCKDEP_KEYS];
#define __debug_atomic_inc(ptr) \
this_cpu_inc(lockdep_stats.ptr);
diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c
index b8d9a050c337..15fdc7fa5c68 100644
--- a/kernel/locking/lockdep_proc.c
+++ b/kernel/locking/lockdep_proc.c
@@ -24,14 +24,33 @@
#include "lockdep_internals.h"
+/*
+ * Since iteration of lock_classes is done without holding the lockdep lock,
+ * it is not safe to iterate all_lock_classes list directly as the iteration
+ * may branch off to free_lock_classes or the zapped list. Iteration is done
+ * directly on the lock_classes array by checking the lock_classes_in_use
+ * bitmap and max_lock_class_idx.
+ */
+#define iterate_lock_classes(idx, class) \
+ for (idx = 0, class = lock_classes; idx <= max_lock_class_idx; \
+ idx++, class++)
+
static void *l_next(struct seq_file *m, void *v, loff_t *pos)
{
- return seq_list_next(v, &all_lock_classes, pos);
+ struct lock_class *class = v;
+
+ ++class;
+ *pos = class - lock_classes;
+ return (*pos > max_lock_class_idx) ? NULL : class;
}
static void *l_start(struct seq_file *m, loff_t *pos)
{
- return seq_list_start_head(&all_lock_classes, *pos);
+ unsigned long idx = *pos;
+
+ if (idx > max_lock_class_idx)
+ return NULL;
+ return lock_classes + idx;
}
static void l_stop(struct seq_file *m, void *v)
@@ -57,14 +76,16 @@ static void print_name(struct seq_file *m, struct lock_class *class)
static int l_show(struct seq_file *m, void *v)
{
- struct lock_class *class = list_entry(v, struct lock_class, lock_entry);
+ struct lock_class *class = v;
struct lock_list *entry;
char usage[LOCK_USAGE_CHARS];
+ int idx = class - lock_classes;
- if (v == &all_lock_classes) {
+ if (v == lock_classes)
seq_printf(m, "all lock classes:\n");
+
+ if (!test_bit(idx, lock_classes_in_use))
return 0;
- }
seq_printf(m, "%p", class->key);
#ifdef CONFIG_DEBUG_LOCKDEP
@@ -220,8 +241,11 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
#ifdef CONFIG_PROVE_LOCKING
struct lock_class *class;
+ unsigned long idx;
- list_for_each_entry(class, &all_lock_classes, lock_entry) {
+ iterate_lock_classes(idx, class) {
+ if (!test_bit(idx, lock_classes_in_use))
+ continue;
if (class->usage_mask == 0)
nr_unused++;
@@ -254,6 +278,7 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
sum_forward_deps += lockdep_count_forward_deps(class);
}
+
#ifdef CONFIG_DEBUG_LOCKDEP
DEBUG_LOCKS_WARN_ON(debug_atomic_read(nr_unused_locks) != nr_unused);
#endif
@@ -345,6 +370,8 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
seq_printf(m, " max bfs queue depth: %11u\n",
max_bfs_queue_depth);
#endif
+ seq_printf(m, " max lock class index: %11lu\n",
+ max_lock_class_idx);
lockdep_stats_debug_show(m);
seq_printf(m, " debug_locks: %11u\n",
debug_locks);
@@ -622,12 +649,16 @@ static int lock_stat_open(struct inode *inode, struct file *file)
if (!res) {
struct lock_stat_data *iter = data->stats;
struct seq_file *m = file->private_data;
+ unsigned long idx;
- list_for_each_entry(class, &all_lock_classes, lock_entry) {
+ iterate_lock_classes(idx, class) {
+ if (!test_bit(idx, lock_classes_in_use))
+ continue;
iter->class = class;
iter->stats = lock_stats(class);
iter++;
}
+
data->iter_end = iter;
sort(data->stats, data->iter_end - data->stats,
@@ -645,6 +676,7 @@ static ssize_t lock_stat_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
struct lock_class *class;
+ unsigned long idx;
char c;
if (count) {
@@ -654,8 +686,11 @@ static ssize_t lock_stat_write(struct file *file, const char __user *buf,
if (c != '0')
return count;
- list_for_each_entry(class, &all_lock_classes, lock_entry)
+ iterate_lock_classes(idx, class) {
+ if (!test_bit(idx, lock_classes_in_use))
+ continue;
clear_lock_stats(class);
+ }
}
return count;
}
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index b3adb40549bf..153ddc4c47ef 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -46,10 +46,16 @@ torture_param(int, shutdown_secs, 0, "Shutdown time (j), <= zero to disable.");
torture_param(int, stat_interval, 60,
"Number of seconds between stats printk()s");
torture_param(int, stutter, 5, "Number of jiffies to run/halt test, 0=disable");
+torture_param(int, rt_boost, 2,
+ "Do periodic rt-boost. 0=Disable, 1=Only for rt_mutex, 2=For all lock types.");
+torture_param(int, rt_boost_factor, 50, "A factor determining how often rt-boost happens.");
torture_param(int, verbose, 1,
"Enable verbose debugging printk()s");
+torture_param(int, nested_locks, 0, "Number of nested locks (max = 8)");
+/* Going much higher trips "BUG: MAX_LOCKDEP_CHAIN_HLOCKS too low!" errors */
+#define MAX_NESTED_LOCKS 8
-static char *torture_type = "spin_lock";
+static char *torture_type = IS_ENABLED(CONFIG_PREEMPT_RT) ? "raw_spin_lock" : "spin_lock";
module_param(torture_type, charp, 0444);
MODULE_PARM_DESC(torture_type,
"Type of lock to torture (spin_lock, spin_lock_irq, mutex_lock, ...)");
@@ -59,7 +65,7 @@ static struct task_struct **writer_tasks;
static struct task_struct **reader_tasks;
static bool lock_is_write_held;
-static bool lock_is_read_held;
+static atomic_t lock_is_read_held;
static unsigned long last_lock_release;
struct lock_stress_stats {
@@ -76,10 +82,12 @@ static void lock_torture_cleanup(void);
struct lock_torture_ops {
void (*init)(void);
void (*exit)(void);
+ int (*nested_lock)(int tid, u32 lockset);
int (*writelock)(int tid);
void (*write_delay)(struct torture_random_state *trsp);
void (*task_boost)(struct torture_random_state *trsp);
void (*writeunlock)(int tid);
+ void (*nested_unlock)(int tid, u32 lockset);
int (*readlock)(int tid);
void (*read_delay)(struct torture_random_state *trsp);
void (*readunlock)(int tid);
@@ -127,15 +135,50 @@ static void torture_lock_busted_write_unlock(int tid __maybe_unused)
/* BUGGY, do not use in real life!!! */
}
-static void torture_boost_dummy(struct torture_random_state *trsp)
+static void __torture_rt_boost(struct torture_random_state *trsp)
{
- /* Only rtmutexes care about priority */
+ const unsigned int factor = rt_boost_factor;
+
+ if (!rt_task(current)) {
+ /*
+ * Boost priority once every rt_boost_factor operations. When
+ * the task tries to take the lock, the rtmutex it will account
+ * for the new priority, and do any corresponding pi-dance.
+ */
+ if (trsp && !(torture_random(trsp) %
+ (cxt.nrealwriters_stress * factor))) {
+ sched_set_fifo(current);
+ } else /* common case, do nothing */
+ return;
+ } else {
+ /*
+ * The task will remain boosted for another 10 * rt_boost_factor
+ * operations, then restored back to its original prio, and so
+ * forth.
+ *
+ * When @trsp is nil, we want to force-reset the task for
+ * stopping the kthread.
+ */
+ if (!trsp || !(torture_random(trsp) %
+ (cxt.nrealwriters_stress * factor * 2))) {
+ sched_set_normal(current, 0);
+ } else /* common case, do nothing */
+ return;
+ }
+}
+
+static void torture_rt_boost(struct torture_random_state *trsp)
+{
+ if (rt_boost != 2)
+ return;
+
+ __torture_rt_boost(trsp);
}
static struct lock_torture_ops lock_busted_ops = {
.writelock = torture_lock_busted_write_lock,
.write_delay = torture_lock_busted_write_delay,
- .task_boost = torture_boost_dummy,
+ .task_boost = torture_rt_boost,
.writeunlock = torture_lock_busted_write_unlock,
.readlock = NULL,
.read_delay = NULL,
@@ -179,7 +222,7 @@ __releases(torture_spinlock)
static struct lock_torture_ops spin_lock_ops = {
.writelock = torture_spin_lock_write_lock,
.write_delay = torture_spin_lock_write_delay,
- .task_boost = torture_boost_dummy,
+ .task_boost = torture_rt_boost,
.writeunlock = torture_spin_lock_write_unlock,
.readlock = NULL,
.read_delay = NULL,
@@ -206,7 +249,7 @@ __releases(torture_spinlock)
static struct lock_torture_ops spin_lock_irq_ops = {
.writelock = torture_spin_lock_write_lock_irq,
.write_delay = torture_spin_lock_write_delay,
- .task_boost = torture_boost_dummy,
+ .task_boost = torture_rt_boost,
.writeunlock = torture_lock_spin_write_unlock_irq,
.readlock = NULL,
.read_delay = NULL,
@@ -214,6 +257,59 @@ static struct lock_torture_ops spin_lock_irq_ops = {
.name = "spin_lock_irq"
};
+static DEFINE_RAW_SPINLOCK(torture_raw_spinlock);
+
+static int torture_raw_spin_lock_write_lock(int tid __maybe_unused)
+__acquires(torture_raw_spinlock)
+{
+ raw_spin_lock(&torture_raw_spinlock);
+ return 0;
+}
+
+static void torture_raw_spin_lock_write_unlock(int tid __maybe_unused)
+__releases(torture_raw_spinlock)
+{
+ raw_spin_unlock(&torture_raw_spinlock);
+}
+
+static struct lock_torture_ops raw_spin_lock_ops = {
+ .writelock = torture_raw_spin_lock_write_lock,
+ .write_delay = torture_spin_lock_write_delay,
+ .task_boost = torture_rt_boost,
+ .writeunlock = torture_raw_spin_lock_write_unlock,
+ .readlock = NULL,
+ .read_delay = NULL,
+ .readunlock = NULL,
+ .name = "raw_spin_lock"
+};
+
+static int torture_raw_spin_lock_write_lock_irq(int tid __maybe_unused)
+__acquires(torture_raw_spinlock)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&torture_raw_spinlock, flags);
+ cxt.cur_ops->flags = flags;
+ return 0;
+}
+
+static void torture_raw_spin_lock_write_unlock_irq(int tid __maybe_unused)
+__releases(torture_raw_spinlock)
+{
+ raw_spin_unlock_irqrestore(&torture_raw_spinlock, cxt.cur_ops->flags);
+}
+
+static struct lock_torture_ops raw_spin_lock_irq_ops = {
+ .writelock = torture_raw_spin_lock_write_lock_irq,
+ .write_delay = torture_spin_lock_write_delay,
+ .task_boost = torture_rt_boost,
+ .writeunlock = torture_raw_spin_lock_write_unlock_irq,
+ .readlock = NULL,
+ .read_delay = NULL,
+ .readunlock = NULL,
+ .name = "raw_spin_lock_irq"
+};
+
static DEFINE_RWLOCK(torture_rwlock);
static int torture_rwlock_write_lock(int tid __maybe_unused)
@@ -275,7 +371,7 @@ __releases(torture_rwlock)
static struct lock_torture_ops rw_lock_ops = {
.writelock = torture_rwlock_write_lock,
.write_delay = torture_rwlock_write_delay,
- .task_boost = torture_boost_dummy,
+ .task_boost = torture_rt_boost,
.writeunlock = torture_rwlock_write_unlock,
.readlock = torture_rwlock_read_lock,
.read_delay = torture_rwlock_read_delay,
@@ -318,7 +414,7 @@ __releases(torture_rwlock)
static struct lock_torture_ops rw_lock_irq_ops = {
.writelock = torture_rwlock_write_lock_irq,
.write_delay = torture_rwlock_write_delay,
- .task_boost = torture_boost_dummy,
+ .task_boost = torture_rt_boost,
.writeunlock = torture_rwlock_write_unlock_irq,
.readlock = torture_rwlock_read_lock_irq,
.read_delay = torture_rwlock_read_delay,
@@ -327,6 +423,28 @@ static struct lock_torture_ops rw_lock_irq_ops = {
};
static DEFINE_MUTEX(torture_mutex);
+static struct mutex torture_nested_mutexes[MAX_NESTED_LOCKS];
+static struct lock_class_key nested_mutex_keys[MAX_NESTED_LOCKS];
+
+static void torture_mutex_init(void)
+{
+ int i;
+
+ for (i = 0; i < MAX_NESTED_LOCKS; i++)
+ __mutex_init(&torture_nested_mutexes[i], __func__,
+ &nested_mutex_keys[i]);
+}
+
+static int torture_mutex_nested_lock(int tid __maybe_unused,
+ u32 lockset)
+{
+ int i;
+
+ for (i = 0; i < nested_locks; i++)
+ if (lockset & (1 << i))
+ mutex_lock(&torture_nested_mutexes[i]);
+ return 0;
+}
static int torture_mutex_lock(int tid __maybe_unused)
__acquires(torture_mutex)
@@ -355,11 +473,24 @@ __releases(torture_mutex)
mutex_unlock(&torture_mutex);
}
+static void torture_mutex_nested_unlock(int tid __maybe_unused,
+ u32 lockset)
+{
+ int i;
+
+ for (i = nested_locks - 1; i >= 0; i--)
+ if (lockset & (1 << i))
+ mutex_unlock(&torture_nested_mutexes[i]);
+}
+
static struct lock_torture_ops mutex_lock_ops = {
+ .init = torture_mutex_init,
+ .nested_lock = torture_mutex_nested_lock,
.writelock = torture_mutex_lock,
.write_delay = torture_mutex_delay,
- .task_boost = torture_boost_dummy,
+ .task_boost = torture_rt_boost,
.writeunlock = torture_mutex_unlock,
+ .nested_unlock = torture_mutex_nested_unlock,
.readlock = NULL,
.read_delay = NULL,
.readunlock = NULL,
@@ -456,7 +587,7 @@ static struct lock_torture_ops ww_mutex_lock_ops = {
.exit = torture_ww_mutex_exit,
.writelock = torture_ww_mutex_lock,
.write_delay = torture_mutex_delay,
- .task_boost = torture_boost_dummy,
+ .task_boost = torture_rt_boost,
.writeunlock = torture_ww_mutex_unlock,
.readlock = NULL,
.read_delay = NULL,
@@ -466,43 +597,34 @@ static struct lock_torture_ops ww_mutex_lock_ops = {
#ifdef CONFIG_RT_MUTEXES
static DEFINE_RT_MUTEX(torture_rtmutex);
+static struct rt_mutex torture_nested_rtmutexes[MAX_NESTED_LOCKS];
+static struct lock_class_key nested_rtmutex_keys[MAX_NESTED_LOCKS];
-static int torture_rtmutex_lock(int tid __maybe_unused)
-__acquires(torture_rtmutex)
+static void torture_rtmutex_init(void)
{
- rt_mutex_lock(&torture_rtmutex);
- return 0;
+ int i;
+
+ for (i = 0; i < MAX_NESTED_LOCKS; i++)
+ __rt_mutex_init(&torture_nested_rtmutexes[i], __func__,
+ &nested_rtmutex_keys[i]);
}
-static void torture_rtmutex_boost(struct torture_random_state *trsp)
+static int torture_rtmutex_nested_lock(int tid __maybe_unused,
+ u32 lockset)
{
- const unsigned int factor = 50000; /* yes, quite arbitrary */
+ int i;
- if (!rt_task(current)) {
- /*
- * Boost priority once every ~50k operations. When the
- * task tries to take the lock, the rtmutex it will account
- * for the new priority, and do any corresponding pi-dance.
- */
- if (trsp && !(torture_random(trsp) %
- (cxt.nrealwriters_stress * factor))) {
- sched_set_fifo(current);
- } else /* common case, do nothing */
- return;
- } else {
- /*
- * The task will remain boosted for another ~500k operations,
- * then restored back to its original prio, and so forth.
- *
- * When @trsp is nil, we want to force-reset the task for
- * stopping the kthread.
- */
- if (!trsp || !(torture_random(trsp) %
- (cxt.nrealwriters_stress * factor * 2))) {
- sched_set_normal(current, 0);
- } else /* common case, do nothing */
- return;
- }
+ for (i = 0; i < nested_locks; i++)
+ if (lockset & (1 << i))
+ rt_mutex_lock(&torture_nested_rtmutexes[i]);
+ return 0;
+}
+
+static int torture_rtmutex_lock(int tid __maybe_unused)
+__acquires(torture_rtmutex)
+{
+ rt_mutex_lock(&torture_rtmutex);
+ return 0;
}
static void torture_rtmutex_delay(struct torture_random_state *trsp)
@@ -530,11 +652,32 @@ __releases(torture_rtmutex)
rt_mutex_unlock(&torture_rtmutex);
}
+static void torture_rt_boost_rtmutex(struct torture_random_state *trsp)
+{
+ if (!rt_boost)
+ return;
+
+ __torture_rt_boost(trsp);
+}
+
+static void torture_rtmutex_nested_unlock(int tid __maybe_unused,
+ u32 lockset)
+{
+ int i;
+
+ for (i = nested_locks - 1; i >= 0; i--)
+ if (lockset & (1 << i))
+ rt_mutex_unlock(&torture_nested_rtmutexes[i]);
+}
+
static struct lock_torture_ops rtmutex_lock_ops = {
+ .init = torture_rtmutex_init,
+ .nested_lock = torture_rtmutex_nested_lock,
.writelock = torture_rtmutex_lock,
.write_delay = torture_rtmutex_delay,
- .task_boost = torture_rtmutex_boost,
+ .task_boost = torture_rt_boost_rtmutex,
.writeunlock = torture_rtmutex_unlock,
+ .nested_unlock = torture_rtmutex_nested_unlock,
.readlock = NULL,
.read_delay = NULL,
.readunlock = NULL,
@@ -600,7 +743,7 @@ __releases(torture_rwsem)
static struct lock_torture_ops rwsem_lock_ops = {
.writelock = torture_rwsem_down_write,
.write_delay = torture_rwsem_write_delay,
- .task_boost = torture_boost_dummy,
+ .task_boost = torture_rt_boost,
.writeunlock = torture_rwsem_up_write,
.readlock = torture_rwsem_down_read,
.read_delay = torture_rwsem_read_delay,
@@ -652,7 +795,7 @@ static struct lock_torture_ops percpu_rwsem_lock_ops = {
.exit = torture_percpu_rwsem_exit,
.writelock = torture_percpu_rwsem_down_write,
.write_delay = torture_rwsem_write_delay,
- .task_boost = torture_boost_dummy,
+ .task_boost = torture_rt_boost,
.writeunlock = torture_percpu_rwsem_up_write,
.readlock = torture_percpu_rwsem_down_read,
.read_delay = torture_rwsem_read_delay,
@@ -669,6 +812,8 @@ static int lock_torture_writer(void *arg)
struct lock_stress_stats *lwsp = arg;
int tid = lwsp - cxt.lwsa;
DEFINE_TORTURE_RANDOM(rand);
+ u32 lockset_mask;
+ bool skip_main_lock;
VERBOSE_TOROUT_STRING("lock_torture_writer task started");
set_user_nice(current, MAX_NICE);
@@ -677,19 +822,40 @@ static int lock_torture_writer(void *arg)
if ((torture_random(&rand) & 0xfffff) == 0)
schedule_timeout_uninterruptible(1);
- cxt.cur_ops->task_boost(&rand);
- cxt.cur_ops->writelock(tid);
- if (WARN_ON_ONCE(lock_is_write_held))
- lwsp->n_lock_fail++;
- lock_is_write_held = true;
- if (WARN_ON_ONCE(lock_is_read_held))
- lwsp->n_lock_fail++; /* rare, but... */
+ lockset_mask = torture_random(&rand);
+ /*
+ * When using nested_locks, we want to occasionally
+ * skip the main lock so we can avoid always serializing
+ * the lock chains on that central lock. By skipping the
+ * main lock occasionally, we can create different
+ * contention patterns (allowing for multiple disjoint
+ * blocked trees)
+ */
+ skip_main_lock = (nested_locks &&
+ !(torture_random(&rand) % 100));
- lwsp->n_lock_acquired++;
+ cxt.cur_ops->task_boost(&rand);
+ if (cxt.cur_ops->nested_lock)
+ cxt.cur_ops->nested_lock(tid, lockset_mask);
+
+ if (!skip_main_lock) {
+ cxt.cur_ops->writelock(tid);
+ if (WARN_ON_ONCE(lock_is_write_held))
+ lwsp->n_lock_fail++;
+ lock_is_write_held = true;
+ if (WARN_ON_ONCE(atomic_read(&lock_is_read_held)))
+ lwsp->n_lock_fail++; /* rare, but... */
+
+ lwsp->n_lock_acquired++;
+ }
cxt.cur_ops->write_delay(&rand);
- lock_is_write_held = false;
- WRITE_ONCE(last_lock_release, jiffies);
- cxt.cur_ops->writeunlock(tid);
+ if (!skip_main_lock) {
+ lock_is_write_held = false;
+ WRITE_ONCE(last_lock_release, jiffies);
+ cxt.cur_ops->writeunlock(tid);
+ }
+ if (cxt.cur_ops->nested_unlock)
+ cxt.cur_ops->nested_unlock(tid, lockset_mask);
stutter_wait("lock_torture_writer");
} while (!torture_must_stop());
@@ -717,13 +883,13 @@ static int lock_torture_reader(void *arg)
schedule_timeout_uninterruptible(1);
cxt.cur_ops->readlock(tid);
- lock_is_read_held = true;
+ atomic_inc(&lock_is_read_held);
if (WARN_ON_ONCE(lock_is_write_held))
lrsp->n_lock_fail++; /* rare, but... */
lrsp->n_lock_acquired++;
cxt.cur_ops->read_delay(&rand);
- lock_is_read_held = false;
+ atomic_dec(&lock_is_read_held);
cxt.cur_ops->readunlock(tid);
stutter_wait("lock_torture_reader");
@@ -738,20 +904,22 @@ static int lock_torture_reader(void *arg)
static void __torture_print_stats(char *page,
struct lock_stress_stats *statp, bool write)
{
+ long cur;
bool fail = false;
int i, n_stress;
- long max = 0, min = statp ? statp[0].n_lock_acquired : 0;
+ long max = 0, min = statp ? data_race(statp[0].n_lock_acquired) : 0;
long long sum = 0;
n_stress = write ? cxt.nrealwriters_stress : cxt.nrealreaders_stress;
for (i = 0; i < n_stress; i++) {
- if (statp[i].n_lock_fail)
+ if (data_race(statp[i].n_lock_fail))
fail = true;
- sum += statp[i].n_lock_acquired;
- if (max < statp[i].n_lock_acquired)
- max = statp[i].n_lock_acquired;
- if (min > statp[i].n_lock_acquired)
- min = statp[i].n_lock_acquired;
+ cur = data_race(statp[i].n_lock_acquired);
+ sum += cur;
+ if (max < cur)
+ max = cur;
+ if (min > cur)
+ min = cur;
}
page += sprintf(page,
"%s: Total: %lld Max/Min: %ld/%ld %s Fail: %d %s\n",
@@ -828,11 +996,11 @@ lock_torture_print_module_parms(struct lock_torture_ops *cur_ops,
const char *tag)
{
pr_alert("%s" TORTURE_FLAG
- "--- %s%s: nwriters_stress=%d nreaders_stress=%d stat_interval=%d verbose=%d shuffle_interval=%d stutter=%d shutdown_secs=%d onoff_interval=%d onoff_holdoff=%d\n",
+ "--- %s%s: nwriters_stress=%d nreaders_stress=%d nested_locks=%d stat_interval=%d verbose=%d shuffle_interval=%d stutter=%d shutdown_secs=%d onoff_interval=%d onoff_holdoff=%d\n",
torture_type, tag, cxt.debug_lock ? " [debug]": "",
- cxt.nrealwriters_stress, cxt.nrealreaders_stress, stat_interval,
- verbose, shuffle_interval, stutter, shutdown_secs,
- onoff_interval, onoff_holdoff);
+ cxt.nrealwriters_stress, cxt.nrealreaders_stress,
+ nested_locks, stat_interval, verbose, shuffle_interval,
+ stutter, shutdown_secs, onoff_interval, onoff_holdoff);
}
static void lock_torture_cleanup(void)
@@ -902,6 +1070,7 @@ static int __init lock_torture_init(void)
static struct lock_torture_ops *torture_ops[] = {
&lock_busted_ops,
&spin_lock_ops, &spin_lock_irq_ops,
+ &raw_spin_lock_ops, &raw_spin_lock_irq_ops,
&rw_lock_ops, &rw_lock_irq_ops,
&mutex_lock_ops,
&ww_mutex_lock_ops,
@@ -996,7 +1165,6 @@ static int __init lock_torture_init(void)
}
if (nreaders_stress) {
- lock_is_read_held = false;
cxt.lrsa = kmalloc_array(cxt.nrealreaders_stress,
sizeof(*cxt.lrsa),
GFP_KERNEL);
@@ -1021,23 +1189,23 @@ static int __init lock_torture_init(void)
if (onoff_interval > 0) {
firsterr = torture_onoff_init(onoff_holdoff * HZ,
onoff_interval * HZ, NULL);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
}
if (shuffle_interval > 0) {
firsterr = torture_shuffle_init(shuffle_interval);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
}
if (shutdown_secs > 0) {
firsterr = torture_shutdown_init(shutdown_secs,
lock_torture_cleanup);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
}
if (stutter > 0) {
firsterr = torture_stutter_init(stutter, stutter);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
}
@@ -1046,18 +1214,22 @@ static int __init lock_torture_init(void)
sizeof(writer_tasks[0]),
GFP_KERNEL);
if (writer_tasks == NULL) {
- VERBOSE_TOROUT_ERRSTRING("writer_tasks: Out of memory");
+ TOROUT_ERRSTRING("writer_tasks: Out of memory");
firsterr = -ENOMEM;
goto unwind;
}
}
+ /* cap nested_locks to MAX_NESTED_LOCKS */
+ if (nested_locks > MAX_NESTED_LOCKS)
+ nested_locks = MAX_NESTED_LOCKS;
+
if (cxt.cur_ops->readlock) {
reader_tasks = kcalloc(cxt.nrealreaders_stress,
sizeof(reader_tasks[0]),
GFP_KERNEL);
if (reader_tasks == NULL) {
- VERBOSE_TOROUT_ERRSTRING("reader_tasks: Out of memory");
+ TOROUT_ERRSTRING("reader_tasks: Out of memory");
kfree(writer_tasks);
writer_tasks = NULL;
firsterr = -ENOMEM;
@@ -1081,7 +1253,7 @@ static int __init lock_torture_init(void)
/* Create writer. */
firsterr = torture_create_kthread(lock_torture_writer, &cxt.lwsa[i],
writer_tasks[i]);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
create_reader:
@@ -1090,13 +1262,13 @@ static int __init lock_torture_init(void)
/* Create reader. */
firsterr = torture_create_kthread(lock_torture_reader, &cxt.lrsa[j],
reader_tasks[j]);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
}
if (stat_interval > 0) {
firsterr = torture_create_kthread(lock_torture_stats, NULL,
stats_task);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
}
torture_init_end();
diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c
index db9301591e3f..bc8abb8549d2 100644
--- a/kernel/locking/mutex-debug.c
+++ b/kernel/locking/mutex-debug.c
@@ -1,6 +1,4 @@
/*
- * kernel/mutex-debug.c
- *
* Debugging code for mutexes
*
* Started by Ingo Molnar:
@@ -22,7 +20,7 @@
#include <linux/interrupt.h>
#include <linux/debug_locks.h>
-#include "mutex-debug.h"
+#include "mutex.h"
/*
* Must be called with lock->wait_lock held.
@@ -32,6 +30,7 @@ void debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter)
memset(waiter, MUTEX_DEBUG_INIT, sizeof(*waiter));
waiter->magic = waiter;
INIT_LIST_HEAD(&waiter->list);
+ waiter->ww_ctx = MUTEX_POISON_WW_CTX;
}
void debug_mutex_wake_waiter(struct mutex *lock, struct mutex_waiter *waiter)
diff --git a/kernel/locking/mutex-debug.h b/kernel/locking/mutex-debug.h
deleted file mode 100644
index 53e631e1d76d..000000000000
--- a/kernel/locking/mutex-debug.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Mutexes: blocking mutual exclusion locks
- *
- * started by Ingo Molnar:
- *
- * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
- *
- * This file contains mutex debugging related internal declarations,
- * prototypes and inline functions, for the CONFIG_DEBUG_MUTEXES case.
- * More details are in kernel/mutex-debug.c.
- */
-
-/*
- * This must be called with lock->wait_lock held.
- */
-extern void debug_mutex_lock_common(struct mutex *lock,
- struct mutex_waiter *waiter);
-extern void debug_mutex_wake_waiter(struct mutex *lock,
- struct mutex_waiter *waiter);
-extern void debug_mutex_free_waiter(struct mutex_waiter *waiter);
-extern void debug_mutex_add_waiter(struct mutex *lock,
- struct mutex_waiter *waiter,
- struct task_struct *task);
-extern void debug_mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
- struct task_struct *task);
-extern void debug_mutex_unlock(struct mutex *lock);
-extern void debug_mutex_init(struct mutex *lock, const char *name,
- struct lock_class_key *key);
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index d2df5e68b503..d973fe6041bf 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -30,17 +30,23 @@
#include <linux/debug_locks.h>
#include <linux/osq_lock.h>
+#define CREATE_TRACE_POINTS
+#include <trace/events/lock.h>
+
+#ifndef CONFIG_PREEMPT_RT
+#include "mutex.h"
+
#ifdef CONFIG_DEBUG_MUTEXES
-# include "mutex-debug.h"
+# define MUTEX_WARN_ON(cond) DEBUG_LOCKS_WARN_ON(cond)
#else
-# include "mutex.h"
+# define MUTEX_WARN_ON(cond)
#endif
void
__mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
{
atomic_long_set(&lock->owner, 0);
- spin_lock_init(&lock->wait_lock);
+ raw_spin_lock_init(&lock->wait_lock);
INIT_LIST_HEAD(&lock->wait_list);
#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
osq_lock_init(&lock->osq);
@@ -92,54 +98,58 @@ static inline unsigned long __owner_flags(unsigned long owner)
}
/*
- * Trylock variant that returns the owning task on failure.
+ * Returns: __mutex_owner(lock) on failure or NULL on success.
*/
-static inline struct task_struct *__mutex_trylock_or_owner(struct mutex *lock)
+static inline struct task_struct *__mutex_trylock_common(struct mutex *lock, bool handoff)
{
unsigned long owner, curr = (unsigned long)current;
owner = atomic_long_read(&lock->owner);
for (;;) { /* must loop, can race against a flag */
- unsigned long old, flags = __owner_flags(owner);
+ unsigned long flags = __owner_flags(owner);
unsigned long task = owner & ~MUTEX_FLAGS;
if (task) {
- if (likely(task != curr))
+ if (flags & MUTEX_FLAG_PICKUP) {
+ if (task != curr)
+ break;
+ flags &= ~MUTEX_FLAG_PICKUP;
+ } else if (handoff) {
+ if (flags & MUTEX_FLAG_HANDOFF)
+ break;
+ flags |= MUTEX_FLAG_HANDOFF;
+ } else {
break;
-
- if (likely(!(flags & MUTEX_FLAG_PICKUP)))
- break;
-
- flags &= ~MUTEX_FLAG_PICKUP;
+ }
} else {
-#ifdef CONFIG_DEBUG_MUTEXES
- DEBUG_LOCKS_WARN_ON(flags & MUTEX_FLAG_PICKUP);
-#endif
+ MUTEX_WARN_ON(flags & (MUTEX_FLAG_HANDOFF | MUTEX_FLAG_PICKUP));
+ task = curr;
}
- /*
- * We set the HANDOFF bit, we must make sure it doesn't live
- * past the point where we acquire it. This would be possible
- * if we (accidentally) set the bit on an unlocked mutex.
- */
- flags &= ~MUTEX_FLAG_HANDOFF;
-
- old = atomic_long_cmpxchg_acquire(&lock->owner, owner, curr | flags);
- if (old == owner)
- return NULL;
-
- owner = old;
+ if (atomic_long_try_cmpxchg_acquire(&lock->owner, &owner, task | flags)) {
+ if (task == curr)
+ return NULL;
+ break;
+ }
}
return __owner_task(owner);
}
/*
+ * Trylock or set HANDOFF
+ */
+static inline bool __mutex_trylock_or_handoff(struct mutex *lock, bool handoff)
+{
+ return !__mutex_trylock_common(lock, handoff);
+}
+
+/*
* Actual trylock that will work on any unlocked state.
*/
static inline bool __mutex_trylock(struct mutex *lock)
{
- return !__mutex_trylock_or_owner(lock);
+ return !__mutex_trylock_common(lock, false);
}
#ifndef CONFIG_DEBUG_LOCK_ALLOC
@@ -168,10 +178,7 @@ static __always_inline bool __mutex_unlock_fast(struct mutex *lock)
{
unsigned long curr = (unsigned long)current;
- if (atomic_long_cmpxchg_release(&lock->owner, curr, 0UL) == curr)
- return true;
-
- return false;
+ return atomic_long_try_cmpxchg_release(&lock->owner, &curr, 0UL);
}
#endif
@@ -226,23 +233,18 @@ static void __mutex_handoff(struct mutex *lock, struct task_struct *task)
unsigned long owner = atomic_long_read(&lock->owner);
for (;;) {
- unsigned long old, new;
+ unsigned long new;
-#ifdef CONFIG_DEBUG_MUTEXES
- DEBUG_LOCKS_WARN_ON(__owner_task(owner) != current);
- DEBUG_LOCKS_WARN_ON(owner & MUTEX_FLAG_PICKUP);
-#endif
+ MUTEX_WARN_ON(__owner_task(owner) != current);
+ MUTEX_WARN_ON(owner & MUTEX_FLAG_PICKUP);
new = (owner & MUTEX_FLAG_WAITERS);
new |= (unsigned long)task;
if (task)
new |= MUTEX_FLAG_PICKUP;
- old = atomic_long_cmpxchg_release(&lock->owner, owner, new);
- if (old == owner)
+ if (atomic_long_try_cmpxchg_release(&lock->owner, &owner, new))
break;
-
- owner = old;
}
}
@@ -286,218 +288,18 @@ void __sched mutex_lock(struct mutex *lock)
EXPORT_SYMBOL(mutex_lock);
#endif
-/*
- * Wait-Die:
- * The newer transactions are killed when:
- * It (the new transaction) makes a request for a lock being held
- * by an older transaction.
- *
- * Wound-Wait:
- * The newer transactions are wounded when:
- * An older transaction makes a request for a lock being held by
- * the newer transaction.
- */
-
-/*
- * Associate the ww_mutex @ww with the context @ww_ctx under which we acquired
- * it.
- */
-static __always_inline void
-ww_mutex_lock_acquired(struct ww_mutex *ww, struct ww_acquire_ctx *ww_ctx)
-{
-#ifdef CONFIG_DEBUG_MUTEXES
- /*
- * If this WARN_ON triggers, you used ww_mutex_lock to acquire,
- * but released with a normal mutex_unlock in this call.
- *
- * This should never happen, always use ww_mutex_unlock.
- */
- DEBUG_LOCKS_WARN_ON(ww->ctx);
-
- /*
- * Not quite done after calling ww_acquire_done() ?
- */
- DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire);
-
- if (ww_ctx->contending_lock) {
- /*
- * After -EDEADLK you tried to
- * acquire a different ww_mutex? Bad!
- */
- DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww);
-
- /*
- * You called ww_mutex_lock after receiving -EDEADLK,
- * but 'forgot' to unlock everything else first?
- */
- DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0);
- ww_ctx->contending_lock = NULL;
- }
-
- /*
- * Naughty, using a different class will lead to undefined behavior!
- */
- DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class);
-#endif
- ww_ctx->acquired++;
- ww->ctx = ww_ctx;
-}
-
-/*
- * Determine if context @a is 'after' context @b. IOW, @a is a younger
- * transaction than @b and depending on algorithm either needs to wait for
- * @b or die.
- */
-static inline bool __sched
-__ww_ctx_stamp_after(struct ww_acquire_ctx *a, struct ww_acquire_ctx *b)
-{
-
- return (signed long)(a->stamp - b->stamp) > 0;
-}
+#include "ww_mutex.h"
-/*
- * Wait-Die; wake a younger waiter context (when locks held) such that it can
- * die.
- *
- * Among waiters with context, only the first one can have other locks acquired
- * already (ctx->acquired > 0), because __ww_mutex_add_waiter() and
- * __ww_mutex_check_kill() wake any but the earliest context.
- */
-static bool __sched
-__ww_mutex_die(struct mutex *lock, struct mutex_waiter *waiter,
- struct ww_acquire_ctx *ww_ctx)
-{
- if (!ww_ctx->is_wait_die)
- return false;
-
- if (waiter->ww_ctx->acquired > 0 &&
- __ww_ctx_stamp_after(waiter->ww_ctx, ww_ctx)) {
- debug_mutex_wake_waiter(lock, waiter);
- wake_up_process(waiter->task);
- }
-
- return true;
-}
-
-/*
- * Wound-Wait; wound a younger @hold_ctx if it holds the lock.
- *
- * Wound the lock holder if there are waiters with older transactions than
- * the lock holders. Even if multiple waiters may wound the lock holder,
- * it's sufficient that only one does.
- */
-static bool __ww_mutex_wound(struct mutex *lock,
- struct ww_acquire_ctx *ww_ctx,
- struct ww_acquire_ctx *hold_ctx)
-{
- struct task_struct *owner = __mutex_owner(lock);
-
- lockdep_assert_held(&lock->wait_lock);
-
- /*
- * Possible through __ww_mutex_add_waiter() when we race with
- * ww_mutex_set_context_fastpath(). In that case we'll get here again
- * through __ww_mutex_check_waiters().
- */
- if (!hold_ctx)
- return false;
-
- /*
- * Can have !owner because of __mutex_unlock_slowpath(), but if owner,
- * it cannot go away because we'll have FLAG_WAITERS set and hold
- * wait_lock.
- */
- if (!owner)
- return false;
-
- if (ww_ctx->acquired > 0 && __ww_ctx_stamp_after(hold_ctx, ww_ctx)) {
- hold_ctx->wounded = 1;
-
- /*
- * wake_up_process() paired with set_current_state()
- * inserts sufficient barriers to make sure @owner either sees
- * it's wounded in __ww_mutex_check_kill() or has a
- * wakeup pending to re-read the wounded state.
- */
- if (owner != current)
- wake_up_process(owner);
-
- return true;
- }
-
- return false;
-}
-
-/*
- * We just acquired @lock under @ww_ctx, if there are later contexts waiting
- * behind us on the wait-list, check if they need to die, or wound us.
- *
- * See __ww_mutex_add_waiter() for the list-order construction; basically the
- * list is ordered by stamp, smallest (oldest) first.
- *
- * This relies on never mixing wait-die/wound-wait on the same wait-list;
- * which is currently ensured by that being a ww_class property.
- *
- * The current task must not be on the wait list.
- */
-static void __sched
-__ww_mutex_check_waiters(struct mutex *lock, struct ww_acquire_ctx *ww_ctx)
-{
- struct mutex_waiter *cur;
-
- lockdep_assert_held(&lock->wait_lock);
-
- list_for_each_entry(cur, &lock->wait_list, list) {
- if (!cur->ww_ctx)
- continue;
-
- if (__ww_mutex_die(lock, cur, ww_ctx) ||
- __ww_mutex_wound(lock, cur->ww_ctx, ww_ctx))
- break;
- }
-}
+#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
/*
- * After acquiring lock with fastpath, where we do not hold wait_lock, set ctx
- * and wake up any waiters so they can recheck.
+ * Trylock variant that returns the owning task on failure.
*/
-static __always_inline void
-ww_mutex_set_context_fastpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
+static inline struct task_struct *__mutex_trylock_or_owner(struct mutex *lock)
{
- ww_mutex_lock_acquired(lock, ctx);
-
- /*
- * The lock->ctx update should be visible on all cores before
- * the WAITERS check is done, otherwise contended waiters might be
- * missed. The contended waiters will either see ww_ctx == NULL
- * and keep spinning, or it will acquire wait_lock, add itself
- * to waiter list and sleep.
- */
- smp_mb(); /* See comments above and below. */
-
- /*
- * [W] ww->ctx = ctx [W] MUTEX_FLAG_WAITERS
- * MB MB
- * [R] MUTEX_FLAG_WAITERS [R] ww->ctx
- *
- * The memory barrier above pairs with the memory barrier in
- * __ww_mutex_add_waiter() and makes sure we either observe ww->ctx
- * and/or !empty list.
- */
- if (likely(!(atomic_long_read(&lock->base.owner) & MUTEX_FLAG_WAITERS)))
- return;
-
- /*
- * Uh oh, we raced in fastpath, check if any of the waiters need to
- * die or wound us.
- */
- spin_lock(&lock->base.wait_lock);
- __ww_mutex_check_waiters(&lock->base, ctx);
- spin_unlock(&lock->base.wait_lock);
+ return __mutex_trylock_common(lock, false);
}
-#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
-
static inline
bool ww_mutex_spin_on_owner(struct mutex *lock, struct ww_acquire_ctx *ww_ctx,
struct mutex_waiter *waiter)
@@ -552,21 +354,23 @@ bool mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner,
{
bool ret = true;
- rcu_read_lock();
+ lockdep_assert_preemption_disabled();
+
while (__mutex_owner(lock) == owner) {
/*
* Ensure we emit the owner->on_cpu, dereference _after_
- * checking lock->owner still matches owner. If that fails,
- * owner might point to freed memory. If it still matches,
- * the rcu_read_lock() ensures the memory stays valid.
+ * checking lock->owner still matches owner. And we already
+ * disabled preemption which is equal to the RCU read-side
+ * crital section in optimistic spinning code. Thus the
+ * task_strcut structure won't go away during the spinning
+ * period
*/
barrier();
/*
* Use vcpu_is_preempted to detect lock holder preemption issue.
*/
- if (!owner->on_cpu || need_resched() ||
- vcpu_is_preempted(task_cpu(owner))) {
+ if (!owner_on_cpu(owner) || need_resched()) {
ret = false;
break;
}
@@ -578,7 +382,6 @@ bool mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner,
cpu_relax();
}
- rcu_read_unlock();
return ret;
}
@@ -591,19 +394,19 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock)
struct task_struct *owner;
int retval = 1;
+ lockdep_assert_preemption_disabled();
+
if (need_resched())
return 0;
- rcu_read_lock();
- owner = __mutex_owner(lock);
-
/*
- * As lock holder preemption issue, we both skip spinning if task is not
- * on cpu or its cpu is preempted
+ * We already disabled preemption which is equal to the RCU read-side
+ * crital section in optimistic spinning code. Thus the task_strcut
+ * structure won't go away during the spinning period.
*/
+ owner = __mutex_owner(lock);
if (owner)
- retval = owner->on_cpu && !vcpu_is_preempted(task_cpu(owner));
- rcu_read_unlock();
+ retval = owner_on_cpu(owner);
/*
* If lock->owner is not set, the mutex has been released. Return true
@@ -754,171 +557,11 @@ EXPORT_SYMBOL(mutex_unlock);
*/
void __sched ww_mutex_unlock(struct ww_mutex *lock)
{
- /*
- * The unlocking fastpath is the 0->1 transition from 'locked'
- * into 'unlocked' state:
- */
- if (lock->ctx) {
-#ifdef CONFIG_DEBUG_MUTEXES
- DEBUG_LOCKS_WARN_ON(!lock->ctx->acquired);
-#endif
- if (lock->ctx->acquired > 0)
- lock->ctx->acquired--;
- lock->ctx = NULL;
- }
-
+ __ww_mutex_unlock(lock);
mutex_unlock(&lock->base);
}
EXPORT_SYMBOL(ww_mutex_unlock);
-
-static __always_inline int __sched
-__ww_mutex_kill(struct mutex *lock, struct ww_acquire_ctx *ww_ctx)
-{
- if (ww_ctx->acquired > 0) {
-#ifdef CONFIG_DEBUG_MUTEXES
- struct ww_mutex *ww;
-
- ww = container_of(lock, struct ww_mutex, base);
- DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock);
- ww_ctx->contending_lock = ww;
-#endif
- return -EDEADLK;
- }
-
- return 0;
-}
-
-
-/*
- * Check the wound condition for the current lock acquire.
- *
- * Wound-Wait: If we're wounded, kill ourself.
- *
- * Wait-Die: If we're trying to acquire a lock already held by an older
- * context, kill ourselves.
- *
- * Since __ww_mutex_add_waiter() orders the wait-list on stamp, we only have to
- * look at waiters before us in the wait-list.
- */
-static inline int __sched
-__ww_mutex_check_kill(struct mutex *lock, struct mutex_waiter *waiter,
- struct ww_acquire_ctx *ctx)
-{
- struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
- struct ww_acquire_ctx *hold_ctx = READ_ONCE(ww->ctx);
- struct mutex_waiter *cur;
-
- if (ctx->acquired == 0)
- return 0;
-
- if (!ctx->is_wait_die) {
- if (ctx->wounded)
- return __ww_mutex_kill(lock, ctx);
-
- return 0;
- }
-
- if (hold_ctx && __ww_ctx_stamp_after(ctx, hold_ctx))
- return __ww_mutex_kill(lock, ctx);
-
- /*
- * If there is a waiter in front of us that has a context, then its
- * stamp is earlier than ours and we must kill ourself.
- */
- cur = waiter;
- list_for_each_entry_continue_reverse(cur, &lock->wait_list, list) {
- if (!cur->ww_ctx)
- continue;
-
- return __ww_mutex_kill(lock, ctx);
- }
-
- return 0;
-}
-
-/*
- * Add @waiter to the wait-list, keep the wait-list ordered by stamp, smallest
- * first. Such that older contexts are preferred to acquire the lock over
- * younger contexts.
- *
- * Waiters without context are interspersed in FIFO order.
- *
- * Furthermore, for Wait-Die kill ourself immediately when possible (there are
- * older contexts already waiting) to avoid unnecessary waiting and for
- * Wound-Wait ensure we wound the owning context when it is younger.
- */
-static inline int __sched
-__ww_mutex_add_waiter(struct mutex_waiter *waiter,
- struct mutex *lock,
- struct ww_acquire_ctx *ww_ctx)
-{
- struct mutex_waiter *cur;
- struct list_head *pos;
- bool is_wait_die;
-
- if (!ww_ctx) {
- __mutex_add_waiter(lock, waiter, &lock->wait_list);
- return 0;
- }
-
- is_wait_die = ww_ctx->is_wait_die;
-
- /*
- * Add the waiter before the first waiter with a higher stamp.
- * Waiters without a context are skipped to avoid starving
- * them. Wait-Die waiters may die here. Wound-Wait waiters
- * never die here, but they are sorted in stamp order and
- * may wound the lock holder.
- */
- pos = &lock->wait_list;
- list_for_each_entry_reverse(cur, &lock->wait_list, list) {
- if (!cur->ww_ctx)
- continue;
-
- if (__ww_ctx_stamp_after(ww_ctx, cur->ww_ctx)) {
- /*
- * Wait-Die: if we find an older context waiting, there
- * is no point in queueing behind it, as we'd have to
- * die the moment it would acquire the lock.
- */
- if (is_wait_die) {
- int ret = __ww_mutex_kill(lock, ww_ctx);
-
- if (ret)
- return ret;
- }
-
- break;
- }
-
- pos = &cur->list;
-
- /* Wait-Die: ensure younger waiters die. */
- __ww_mutex_die(lock, cur, ww_ctx);
- }
-
- __mutex_add_waiter(lock, waiter, pos);
-
- /*
- * Wound-Wait: if we're blocking on a mutex owned by a younger context,
- * wound that such that we might proceed.
- */
- if (!is_wait_die) {
- struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
-
- /*
- * See ww_mutex_set_context_fastpath(). Orders setting
- * MUTEX_FLAG_WAITERS vs the ww->ctx load,
- * such that either we or the fastpath will wound @ww->ctx.
- */
- smp_mb();
- __ww_mutex_wound(lock, ww_ctx, ww->ctx);
- }
-
- return 0;
-}
-
/*
* Lock a mutex (possibly interruptible), slowpath:
*/
@@ -928,7 +571,6 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx)
{
struct mutex_waiter waiter;
- bool first = false;
struct ww_mutex *ww;
int ret;
@@ -937,9 +579,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
might_sleep();
-#ifdef CONFIG_DEBUG_MUTEXES
- DEBUG_LOCKS_WARN_ON(lock->magic != lock);
-#endif
+ MUTEX_WARN_ON(lock->magic != lock);
ww = container_of(lock, struct ww_mutex, base);
if (ww_ctx) {
@@ -953,22 +593,28 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
*/
if (ww_ctx->acquired == 0)
ww_ctx->wounded = 0;
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ nest_lock = &ww_ctx->dep_map;
+#endif
}
preempt_disable();
mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip);
+ trace_contention_begin(lock, LCB_F_MUTEX | LCB_F_SPIN);
if (__mutex_trylock(lock) ||
mutex_optimistic_spin(lock, ww_ctx, NULL)) {
/* got the lock, yay! */
lock_acquired(&lock->dep_map, ip);
if (ww_ctx)
ww_mutex_set_context_fastpath(ww, ww_ctx);
+ trace_contention_end(lock, 0);
preempt_enable();
return 0;
}
- spin_lock(&lock->wait_lock);
+ raw_spin_lock(&lock->wait_lock);
/*
* After waiting to acquire the wait_lock, try again.
*/
@@ -980,17 +626,15 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
}
debug_mutex_lock_common(lock, &waiter);
+ waiter.task = current;
+ if (use_ww_ctx)
+ waiter.ww_ctx = ww_ctx;
lock_contended(&lock->dep_map, ip);
if (!use_ww_ctx) {
/* add waiting tasks to the end of the waitqueue (FIFO): */
__mutex_add_waiter(lock, &waiter, &lock->wait_list);
-
-
-#ifdef CONFIG_DEBUG_MUTEXES
- waiter.ww_ctx = MUTEX_POISON_WW_CTX;
-#endif
} else {
/*
* Add in stamp order, waking up waiters that must kill
@@ -999,14 +643,13 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
ret = __ww_mutex_add_waiter(&waiter, lock, ww_ctx);
if (ret)
goto err_early_kill;
-
- waiter.ww_ctx = ww_ctx;
}
- waiter.task = current;
-
set_current_state(state);
+ trace_contention_begin(lock, LCB_F_MUTEX);
for (;;) {
+ bool first;
+
/*
* Once we hold wait_lock, we're serialized against
* mutex_unlock() handing the lock off to us, do a trylock
@@ -1032,18 +675,10 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
goto err;
}
- spin_unlock(&lock->wait_lock);
+ raw_spin_unlock(&lock->wait_lock);
schedule_preempt_disabled();
- /*
- * ww_mutex needs to always recheck its position since its waiter
- * list is not FIFO ordered.
- */
- if (ww_ctx || !first) {
- first = __mutex_waiter_is_first(lock, &waiter);
- if (first)
- __mutex_set_flag(lock, MUTEX_FLAG_HANDOFF);
- }
+ first = __mutex_waiter_is_first(lock, &waiter);
set_current_state(state);
/*
@@ -1051,13 +686,19 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
* state back to RUNNING and fall through the next schedule(),
* or we must see its unlock and acquire.
*/
- if (__mutex_trylock(lock) ||
- (first && mutex_optimistic_spin(lock, ww_ctx, &waiter)))
+ if (__mutex_trylock_or_handoff(lock, first))
break;
- spin_lock(&lock->wait_lock);
+ if (first) {
+ trace_contention_begin(lock, LCB_F_MUTEX | LCB_F_SPIN);
+ if (mutex_optimistic_spin(lock, ww_ctx, &waiter))
+ break;
+ trace_contention_begin(lock, LCB_F_MUTEX);
+ }
+
+ raw_spin_lock(&lock->wait_lock);
}
- spin_lock(&lock->wait_lock);
+ raw_spin_lock(&lock->wait_lock);
acquired:
__set_current_state(TASK_RUNNING);
@@ -1078,11 +719,12 @@ acquired:
skip_wait:
/* got the lock - cleanup and rejoice! */
lock_acquired(&lock->dep_map, ip);
+ trace_contention_end(lock, 0);
if (ww_ctx)
ww_mutex_lock_acquired(ww, ww_ctx);
- spin_unlock(&lock->wait_lock);
+ raw_spin_unlock(&lock->wait_lock);
preempt_enable();
return 0;
@@ -1090,7 +732,8 @@ err:
__set_current_state(TASK_RUNNING);
__mutex_remove_waiter(lock, &waiter);
err_early_kill:
- spin_unlock(&lock->wait_lock);
+ trace_contention_end(lock, ret);
+ raw_spin_unlock(&lock->wait_lock);
debug_mutex_free_waiter(&waiter);
mutex_release(&lock->dep_map, ip);
preempt_enable();
@@ -1106,11 +749,48 @@ __mutex_lock(struct mutex *lock, unsigned int state, unsigned int subclass,
static int __sched
__ww_mutex_lock(struct mutex *lock, unsigned int state, unsigned int subclass,
- struct lockdep_map *nest_lock, unsigned long ip,
- struct ww_acquire_ctx *ww_ctx)
+ unsigned long ip, struct ww_acquire_ctx *ww_ctx)
+{
+ return __mutex_lock_common(lock, state, subclass, NULL, ip, ww_ctx, true);
+}
+
+/**
+ * ww_mutex_trylock - tries to acquire the w/w mutex with optional acquire context
+ * @ww: mutex to lock
+ * @ww_ctx: optional w/w acquire context
+ *
+ * Trylocks a mutex with the optional acquire context; no deadlock detection is
+ * possible. Returns 1 if the mutex has been acquired successfully, 0 otherwise.
+ *
+ * Unlike ww_mutex_lock, no deadlock handling is performed. However, if a @ctx is
+ * specified, -EALREADY handling may happen in calls to ww_mutex_trylock.
+ *
+ * A mutex acquired with this function must be released with ww_mutex_unlock.
+ */
+int ww_mutex_trylock(struct ww_mutex *ww, struct ww_acquire_ctx *ww_ctx)
{
- return __mutex_lock_common(lock, state, subclass, nest_lock, ip, ww_ctx, true);
+ if (!ww_ctx)
+ return mutex_trylock(&ww->base);
+
+ MUTEX_WARN_ON(ww->base.magic != &ww->base);
+
+ /*
+ * Reset the wounded flag after a kill. No other process can
+ * race and wound us here, since they can't have a valid owner
+ * pointer if we don't have any locks held.
+ */
+ if (ww_ctx->acquired == 0)
+ ww_ctx->wounded = 0;
+
+ if (__mutex_trylock(&ww->base)) {
+ ww_mutex_set_context_fastpath(ww, ww_ctx);
+ mutex_acquire_nest(&ww->base.dep_map, 0, 1, &ww_ctx->dep_map, _RET_IP_);
+ return 1;
+ }
+
+ return 0;
}
+EXPORT_SYMBOL(ww_mutex_trylock);
#ifdef CONFIG_DEBUG_LOCK_ALLOC
void __sched
@@ -1189,8 +869,7 @@ ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
might_sleep();
ret = __ww_mutex_lock(&lock->base, TASK_UNINTERRUPTIBLE,
- 0, ctx ? &ctx->dep_map : NULL, _RET_IP_,
- ctx);
+ 0, _RET_IP_, ctx);
if (!ret && ctx && ctx->acquired > 1)
return ww_mutex_deadlock_injection(lock, ctx);
@@ -1205,8 +884,7 @@ ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
might_sleep();
ret = __ww_mutex_lock(&lock->base, TASK_INTERRUPTIBLE,
- 0, ctx ? &ctx->dep_map : NULL, _RET_IP_,
- ctx);
+ 0, _RET_IP_, ctx);
if (!ret && ctx && ctx->acquired > 1)
return ww_mutex_deadlock_injection(lock, ctx);
@@ -1237,29 +915,21 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne
*/
owner = atomic_long_read(&lock->owner);
for (;;) {
- unsigned long old;
-
-#ifdef CONFIG_DEBUG_MUTEXES
- DEBUG_LOCKS_WARN_ON(__owner_task(owner) != current);
- DEBUG_LOCKS_WARN_ON(owner & MUTEX_FLAG_PICKUP);
-#endif
+ MUTEX_WARN_ON(__owner_task(owner) != current);
+ MUTEX_WARN_ON(owner & MUTEX_FLAG_PICKUP);
if (owner & MUTEX_FLAG_HANDOFF)
break;
- old = atomic_long_cmpxchg_release(&lock->owner, owner,
- __owner_flags(owner));
- if (old == owner) {
+ if (atomic_long_try_cmpxchg_release(&lock->owner, &owner, __owner_flags(owner))) {
if (owner & MUTEX_FLAG_WAITERS)
break;
return;
}
-
- owner = old;
}
- spin_lock(&lock->wait_lock);
+ raw_spin_lock(&lock->wait_lock);
debug_mutex_unlock(lock);
if (!list_empty(&lock->wait_list)) {
/* get the first entry from the wait-list: */
@@ -1276,7 +946,7 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne
if (owner & MUTEX_FLAG_HANDOFF)
__mutex_handoff(lock, next);
- spin_unlock(&lock->wait_lock);
+ raw_spin_unlock(&lock->wait_lock);
wake_up_q(&wake_q);
}
@@ -1380,7 +1050,7 @@ __mutex_lock_interruptible_slowpath(struct mutex *lock)
static noinline int __sched
__ww_mutex_lock_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
{
- return __ww_mutex_lock(&lock->base, TASK_UNINTERRUPTIBLE, 0, NULL,
+ return __ww_mutex_lock(&lock->base, TASK_UNINTERRUPTIBLE, 0,
_RET_IP_, ctx);
}
@@ -1388,7 +1058,7 @@ static noinline int __sched
__ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock,
struct ww_acquire_ctx *ctx)
{
- return __ww_mutex_lock(&lock->base, TASK_INTERRUPTIBLE, 0, NULL,
+ return __ww_mutex_lock(&lock->base, TASK_INTERRUPTIBLE, 0,
_RET_IP_, ctx);
}
@@ -1412,9 +1082,7 @@ int __sched mutex_trylock(struct mutex *lock)
{
bool locked;
-#ifdef CONFIG_DEBUG_MUTEXES
- DEBUG_LOCKS_WARN_ON(lock->magic != lock);
-#endif
+ MUTEX_WARN_ON(lock->magic != lock);
locked = __mutex_trylock(lock);
if (locked)
@@ -1455,7 +1123,8 @@ ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
}
EXPORT_SYMBOL(ww_mutex_lock_interruptible);
-#endif
+#endif /* !CONFIG_DEBUG_LOCK_ALLOC */
+#endif /* !CONFIG_PREEMPT_RT */
/**
* atomic_dec_and_mutex_lock - return holding mutex if we dec to 0
diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h
index f0c710b1d192..0b2a79c4013b 100644
--- a/kernel/locking/mutex.h
+++ b/kernel/locking/mutex.h
@@ -5,19 +5,41 @@
* started by Ingo Molnar:
*
* Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
- *
- * This file contains mutex debugging related internal prototypes, for the
- * !CONFIG_DEBUG_MUTEXES case. Most of them are NOPs:
*/
-#define debug_mutex_wake_waiter(lock, waiter) do { } while (0)
-#define debug_mutex_free_waiter(waiter) do { } while (0)
-#define debug_mutex_add_waiter(lock, waiter, ti) do { } while (0)
-#define debug_mutex_remove_waiter(lock, waiter, ti) do { } while (0)
-#define debug_mutex_unlock(lock) do { } while (0)
-#define debug_mutex_init(lock, name, key) do { } while (0)
+/*
+ * This is the control structure for tasks blocked on mutex, which resides
+ * on the blocked task's kernel stack:
+ */
+struct mutex_waiter {
+ struct list_head list;
+ struct task_struct *task;
+ struct ww_acquire_ctx *ww_ctx;
+#ifdef CONFIG_DEBUG_MUTEXES
+ void *magic;
+#endif
+};
-static inline void
-debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter)
-{
-}
+#ifdef CONFIG_DEBUG_MUTEXES
+extern void debug_mutex_lock_common(struct mutex *lock,
+ struct mutex_waiter *waiter);
+extern void debug_mutex_wake_waiter(struct mutex *lock,
+ struct mutex_waiter *waiter);
+extern void debug_mutex_free_waiter(struct mutex_waiter *waiter);
+extern void debug_mutex_add_waiter(struct mutex *lock,
+ struct mutex_waiter *waiter,
+ struct task_struct *task);
+extern void debug_mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
+ struct task_struct *task);
+extern void debug_mutex_unlock(struct mutex *lock);
+extern void debug_mutex_init(struct mutex *lock, const char *name,
+ struct lock_class_key *key);
+#else /* CONFIG_DEBUG_MUTEXES */
+# define debug_mutex_lock_common(lock, waiter) do { } while (0)
+# define debug_mutex_wake_waiter(lock, waiter) do { } while (0)
+# define debug_mutex_free_waiter(waiter) do { } while (0)
+# define debug_mutex_add_waiter(lock, waiter, ti) do { } while (0)
+# define debug_mutex_remove_waiter(lock, waiter, ti) do { } while (0)
+# define debug_mutex_unlock(lock) do { } while (0)
+# define debug_mutex_init(lock, name, key) do { } while (0)
+#endif /* !CONFIG_DEBUG_MUTEXES */
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index 70a32a576f3f..185bd1c906b0 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -7,7 +7,9 @@
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/sched/task.h>
+#include <linux/sched/debug.h>
#include <linux/errno.h>
+#include <trace/events/lock.h>
int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
const char *name, struct lock_class_key *key)
@@ -162,7 +164,7 @@ static void percpu_rwsem_wait(struct percpu_rw_semaphore *sem, bool reader)
__set_current_state(TASK_RUNNING);
}
-bool __percpu_down_read(struct percpu_rw_semaphore *sem, bool try)
+bool __sched __percpu_down_read(struct percpu_rw_semaphore *sem, bool try)
{
if (__percpu_down_read_trylock(sem))
return true;
@@ -170,9 +172,11 @@ bool __percpu_down_read(struct percpu_rw_semaphore *sem, bool try)
if (try)
return false;
+ trace_contention_begin(sem, LCB_F_PERCPU | LCB_F_READ);
preempt_enable();
percpu_rwsem_wait(sem, /* .reader = */ true);
preempt_disable();
+ trace_contention_end(sem, 0);
return true;
}
@@ -188,6 +192,12 @@ EXPORT_SYMBOL_GPL(__percpu_down_read);
__sum; \
})
+bool percpu_is_read_locked(struct percpu_rw_semaphore *sem)
+{
+ return per_cpu_sum(*sem->read_count) != 0 && !atomic_read(&sem->block);
+}
+EXPORT_SYMBOL_GPL(percpu_is_read_locked);
+
/*
* Return true if the modular sum of the sem->read_count per-CPU variable is
* zero. If this sum is zero, then it is stable due to the fact that if any
@@ -211,10 +221,11 @@ static bool readers_active_check(struct percpu_rw_semaphore *sem)
return true;
}
-void percpu_down_write(struct percpu_rw_semaphore *sem)
+void __sched percpu_down_write(struct percpu_rw_semaphore *sem)
{
might_sleep();
rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
+ trace_contention_begin(sem, LCB_F_PERCPU | LCB_F_WRITE);
/* Notify readers to take the slow path. */
rcu_sync_enter(&sem->rss);
@@ -236,6 +247,7 @@ void percpu_down_write(struct percpu_rw_semaphore *sem)
/* Wait for all active readers to complete. */
rcuwait_wait_event(&sem->writer, readers_active_check(sem), TASK_UNINTERRUPTIBLE);
+ trace_contention_end(sem, 0);
}
EXPORT_SYMBOL_GPL(percpu_down_write);
diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
index ec36b73f4733..d2ef312a8611 100644
--- a/kernel/locking/qrwlock.c
+++ b/kernel/locking/qrwlock.c
@@ -12,12 +12,13 @@
#include <linux/percpu.h>
#include <linux/hardirq.h>
#include <linux/spinlock.h>
+#include <trace/events/lock.h>
/**
- * queued_read_lock_slowpath - acquire read lock of a queue rwlock
- * @lock: Pointer to queue rwlock structure
+ * queued_read_lock_slowpath - acquire read lock of a queued rwlock
+ * @lock: Pointer to queued rwlock structure
*/
-void queued_read_lock_slowpath(struct qrwlock *lock)
+void __lockfunc queued_read_lock_slowpath(struct qrwlock *lock)
{
/*
* Readers come here when they cannot get the lock without waiting
@@ -34,6 +35,8 @@ void queued_read_lock_slowpath(struct qrwlock *lock)
}
atomic_sub(_QR_BIAS, &lock->cnts);
+ trace_contention_begin(lock, LCB_F_SPIN | LCB_F_READ);
+
/*
* Put the reader into the wait queue
*/
@@ -51,17 +54,21 @@ void queued_read_lock_slowpath(struct qrwlock *lock)
* Signal the next one in queue to become queue head
*/
arch_spin_unlock(&lock->wait_lock);
+
+ trace_contention_end(lock, 0);
}
EXPORT_SYMBOL(queued_read_lock_slowpath);
/**
- * queued_write_lock_slowpath - acquire write lock of a queue rwlock
- * @lock : Pointer to queue rwlock structure
+ * queued_write_lock_slowpath - acquire write lock of a queued rwlock
+ * @lock : Pointer to queued rwlock structure
*/
-void queued_write_lock_slowpath(struct qrwlock *lock)
+void __lockfunc queued_write_lock_slowpath(struct qrwlock *lock)
{
int cnts;
+ trace_contention_begin(lock, LCB_F_SPIN | LCB_F_WRITE);
+
/* Put the writer into the wait queue */
arch_spin_lock(&lock->wait_lock);
@@ -79,5 +86,7 @@ void queued_write_lock_slowpath(struct qrwlock *lock)
} while (!atomic_try_cmpxchg_acquire(&lock->cnts, &cnts, _QW_LOCKED));
unlock:
arch_spin_unlock(&lock->wait_lock);
+
+ trace_contention_end(lock, 0);
}
EXPORT_SYMBOL(queued_write_lock_slowpath);
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index cbff6ba53d56..ebe6b8ec7cb3 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -22,6 +22,7 @@
#include <linux/prefetch.h>
#include <asm/byteorder.h>
#include <asm/qspinlock.h>
+#include <trace/events/lock.h>
/*
* Include queued spinlock statistics code
@@ -312,7 +313,7 @@ static __always_inline u32 __pv_wait_head_or_lock(struct qspinlock *lock,
* contended : (*,x,y) +--> (*,0,0) ---> (*,0,1) -' :
* queue : ^--' :
*/
-void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
+void __lockfunc queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
{
struct mcs_spinlock *prev, *next, *node;
u32 old, tail;
@@ -370,7 +371,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
/*
* We're pending, wait for the owner to go away.
*
- * 0,1,1 -> 0,1,0
+ * 0,1,1 -> *,1,0
*
* this wait loop must be a load-acquire such that we match the
* store-release that clears the locked bit and create lock
@@ -379,7 +380,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
* barriers.
*/
if (val & _Q_LOCKED_MASK)
- atomic_cond_read_acquire(&lock->val, !(VAL & _Q_LOCKED_MASK));
+ smp_cond_load_acquire(&lock->locked, !VAL);
/*
* take ownership and clear the pending bit.
@@ -401,6 +402,8 @@ pv_queue:
idx = node->count++;
tail = encode_tail(smp_processor_id(), idx);
+ trace_contention_begin(lock, LCB_F_SPIN);
+
/*
* 4 nodes are allocated based on the assumption that there will
* not be nested NMIs taking spinlocks. That may not be true in
@@ -554,6 +557,8 @@ locked:
pv_kick_node(lock, next);
release:
+ trace_contention_end(lock, 0);
+
/*
* release the node
*/
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index e84d21aa0722..6afc249ce697 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -489,7 +489,7 @@ gotlock:
* PV versions of the unlock fastpath and slowpath functions to be used
* instead of queued_spin_unlock().
*/
-__visible void
+__visible __lockfunc void
__pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked)
{
struct pv_node *node;
@@ -544,7 +544,7 @@ __pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked)
#include <asm/qspinlock_paravirt.h>
#ifndef __pv_queued_spin_unlock
-__visible void __pv_queued_spin_unlock(struct qspinlock *lock)
+__visible __lockfunc void __pv_queued_spin_unlock(struct qspinlock *lock)
{
u8 locked;
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index b5d9bb5202c6..728f434de2bb 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -8,20 +8,60 @@
* Copyright (C) 2005-2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
* Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt
* Copyright (C) 2006 Esben Nielsen
+ * Adaptive Spinlocks:
+ * Copyright (C) 2008 Novell, Inc., Gregory Haskins, Sven Dietrich,
+ * and Peter Morreale,
+ * Adaptive Spinlocks simplification:
+ * Copyright (C) 2008 Red Hat, Inc., Steven Rostedt <srostedt@redhat.com>
*
* See Documentation/locking/rt-mutex-design.rst for details.
*/
-#include <linux/spinlock.h>
-#include <linux/export.h>
+#include <linux/sched.h>
+#include <linux/sched/debug.h>
+#include <linux/sched/deadline.h>
#include <linux/sched/signal.h>
#include <linux/sched/rt.h>
-#include <linux/sched/deadline.h>
#include <linux/sched/wake_q.h>
-#include <linux/sched/debug.h>
-#include <linux/timer.h>
+#include <linux/ww_mutex.h>
+
+#include <trace/events/lock.h>
#include "rtmutex_common.h"
+#ifndef WW_RT
+# define build_ww_mutex() (false)
+# define ww_container_of(rtm) NULL
+
+static inline int __ww_mutex_add_waiter(struct rt_mutex_waiter *waiter,
+ struct rt_mutex *lock,
+ struct ww_acquire_ctx *ww_ctx)
+{
+ return 0;
+}
+
+static inline void __ww_mutex_check_waiters(struct rt_mutex *lock,
+ struct ww_acquire_ctx *ww_ctx)
+{
+}
+
+static inline void ww_mutex_lock_acquired(struct ww_mutex *lock,
+ struct ww_acquire_ctx *ww_ctx)
+{
+}
+
+static inline int __ww_mutex_check_kill(struct rt_mutex *lock,
+ struct rt_mutex_waiter *waiter,
+ struct ww_acquire_ctx *ww_ctx)
+{
+ return 0;
+}
+
+#else
+# define build_ww_mutex() (true)
+# define ww_container_of(rtm) container_of(rtm, struct ww_mutex, base)
+# include "ww_mutex.h"
+#endif
+
/*
* lock->owner state tracking:
*
@@ -49,24 +89,41 @@
* set this bit before looking at the lock.
*/
-static __always_inline void
-rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner)
+static __always_inline struct task_struct *
+rt_mutex_owner_encode(struct rt_mutex_base *lock, struct task_struct *owner)
{
unsigned long val = (unsigned long)owner;
if (rt_mutex_has_waiters(lock))
val |= RT_MUTEX_HAS_WAITERS;
- WRITE_ONCE(lock->owner, (struct task_struct *)val);
+ return (struct task_struct *)val;
}
-static __always_inline void clear_rt_mutex_waiters(struct rt_mutex *lock)
+static __always_inline void
+rt_mutex_set_owner(struct rt_mutex_base *lock, struct task_struct *owner)
+{
+ /*
+ * lock->wait_lock is held but explicit acquire semantics are needed
+ * for a new lock owner so WRITE_ONCE is insufficient.
+ */
+ xchg_acquire(&lock->owner, rt_mutex_owner_encode(lock, owner));
+}
+
+static __always_inline void rt_mutex_clear_owner(struct rt_mutex_base *lock)
+{
+ /* lock->wait_lock is held so the unlock provides release semantics. */
+ WRITE_ONCE(lock->owner, rt_mutex_owner_encode(lock, NULL));
+}
+
+static __always_inline void clear_rt_mutex_waiters(struct rt_mutex_base *lock)
{
lock->owner = (struct task_struct *)
((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS);
}
-static __always_inline void fixup_rt_mutex_waiters(struct rt_mutex *lock)
+static __always_inline void
+fixup_rt_mutex_waiters(struct rt_mutex_base *lock, bool acquire_lock)
{
unsigned long owner, *p = (unsigned long *) &lock->owner;
@@ -132,8 +189,21 @@ static __always_inline void fixup_rt_mutex_waiters(struct rt_mutex *lock)
* still set.
*/
owner = READ_ONCE(*p);
- if (owner & RT_MUTEX_HAS_WAITERS)
- WRITE_ONCE(*p, owner & ~RT_MUTEX_HAS_WAITERS);
+ if (owner & RT_MUTEX_HAS_WAITERS) {
+ /*
+ * See rt_mutex_set_owner() and rt_mutex_clear_owner() on
+ * why xchg_acquire() is used for updating owner for
+ * locking and WRITE_ONCE() for unlocking.
+ *
+ * WRITE_ONCE() would work for the acquire case too, but
+ * in case that the lock acquisition failed it might
+ * force other lockers into the slow path unnecessarily.
+ */
+ if (acquire_lock)
+ xchg_acquire(p, owner & ~RT_MUTEX_HAS_WAITERS);
+ else
+ WRITE_ONCE(*p, owner & ~RT_MUTEX_HAS_WAITERS);
+ }
}
/*
@@ -141,15 +211,26 @@ static __always_inline void fixup_rt_mutex_waiters(struct rt_mutex *lock)
* set up.
*/
#ifndef CONFIG_DEBUG_RT_MUTEXES
-# define rt_mutex_cmpxchg_acquire(l,c,n) (cmpxchg_acquire(&l->owner, c, n) == c)
-# define rt_mutex_cmpxchg_release(l,c,n) (cmpxchg_release(&l->owner, c, n) == c)
+static __always_inline bool rt_mutex_cmpxchg_acquire(struct rt_mutex_base *lock,
+ struct task_struct *old,
+ struct task_struct *new)
+{
+ return try_cmpxchg_acquire(&lock->owner, &old, new);
+}
+
+static __always_inline bool rt_mutex_cmpxchg_release(struct rt_mutex_base *lock,
+ struct task_struct *old,
+ struct task_struct *new)
+{
+ return try_cmpxchg_release(&lock->owner, &old, new);
+}
/*
* Callers must hold the ->wait_lock -- which is the whole purpose as we force
* all future threads that attempt to [Rmw] the lock to the slowpath. As such
* relaxed semantics suffice.
*/
-static __always_inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
+static __always_inline void mark_rt_mutex_waiters(struct rt_mutex_base *lock)
{
unsigned long owner, *p = (unsigned long *) &lock->owner;
@@ -157,6 +238,13 @@ static __always_inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
owner = *p;
} while (cmpxchg_relaxed(p, owner,
owner | RT_MUTEX_HAS_WAITERS) != owner);
+
+ /*
+ * The cmpxchg loop above is relaxed to avoid back-to-back ACQUIRE
+ * operations in the event of contention. Ensure the successful
+ * cmpxchg is visible.
+ */
+ smp_mb__after_atomic();
}
/*
@@ -165,7 +253,7 @@ static __always_inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
* 2) Drop lock->wait_lock
* 3) Try to unlock the lock with cmpxchg
*/
-static __always_inline bool unlock_rt_mutex_safe(struct rt_mutex *lock,
+static __always_inline bool unlock_rt_mutex_safe(struct rt_mutex_base *lock,
unsigned long flags)
__releases(lock->wait_lock)
{
@@ -201,10 +289,22 @@ static __always_inline bool unlock_rt_mutex_safe(struct rt_mutex *lock,
}
#else
-# define rt_mutex_cmpxchg_acquire(l,c,n) (0)
-# define rt_mutex_cmpxchg_release(l,c,n) (0)
+static __always_inline bool rt_mutex_cmpxchg_acquire(struct rt_mutex_base *lock,
+ struct task_struct *old,
+ struct task_struct *new)
+{
+ return false;
+
+}
-static __always_inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
+static __always_inline bool rt_mutex_cmpxchg_release(struct rt_mutex_base *lock,
+ struct task_struct *old,
+ struct task_struct *new)
+{
+ return false;
+}
+
+static __always_inline void mark_rt_mutex_waiters(struct rt_mutex_base *lock)
{
lock->owner = (struct task_struct *)
((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS);
@@ -213,7 +313,7 @@ static __always_inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
/*
* Simple slow path only version: lock->owner is protected by lock->wait_lock.
*/
-static __always_inline bool unlock_rt_mutex_safe(struct rt_mutex *lock,
+static __always_inline bool unlock_rt_mutex_safe(struct rt_mutex_base *lock,
unsigned long flags)
__releases(lock->wait_lock)
{
@@ -223,11 +323,28 @@ static __always_inline bool unlock_rt_mutex_safe(struct rt_mutex *lock,
}
#endif
+static __always_inline int __waiter_prio(struct task_struct *task)
+{
+ int prio = task->prio;
+
+ if (!rt_prio(prio))
+ return DEFAULT_PRIO;
+
+ return prio;
+}
+
+static __always_inline void
+waiter_update_prio(struct rt_mutex_waiter *waiter, struct task_struct *task)
+{
+ waiter->prio = __waiter_prio(task);
+ waiter->deadline = task->dl.deadline;
+}
+
/*
* Only use with rt_mutex_waiter_{less,equal}()
*/
#define task_to_waiter(p) \
- &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = (p)->dl.deadline }
+ &(struct rt_mutex_waiter){ .prio = __waiter_prio(p), .deadline = (p)->dl.deadline }
static __always_inline int rt_mutex_waiter_less(struct rt_mutex_waiter *left,
struct rt_mutex_waiter *right)
@@ -265,22 +382,63 @@ static __always_inline int rt_mutex_waiter_equal(struct rt_mutex_waiter *left,
return 1;
}
+static inline bool rt_mutex_steal(struct rt_mutex_waiter *waiter,
+ struct rt_mutex_waiter *top_waiter)
+{
+ if (rt_mutex_waiter_less(waiter, top_waiter))
+ return true;
+
+#ifdef RT_MUTEX_BUILD_SPINLOCKS
+ /*
+ * Note that RT tasks are excluded from same priority (lateral)
+ * steals to prevent the introduction of an unbounded latency.
+ */
+ if (rt_prio(waiter->prio) || dl_prio(waiter->prio))
+ return false;
+
+ return rt_mutex_waiter_equal(waiter, top_waiter);
+#else
+ return false;
+#endif
+}
+
#define __node_2_waiter(node) \
rb_entry((node), struct rt_mutex_waiter, tree_entry)
static __always_inline bool __waiter_less(struct rb_node *a, const struct rb_node *b)
{
- return rt_mutex_waiter_less(__node_2_waiter(a), __node_2_waiter(b));
+ struct rt_mutex_waiter *aw = __node_2_waiter(a);
+ struct rt_mutex_waiter *bw = __node_2_waiter(b);
+
+ if (rt_mutex_waiter_less(aw, bw))
+ return 1;
+
+ if (!build_ww_mutex())
+ return 0;
+
+ if (rt_mutex_waiter_less(bw, aw))
+ return 0;
+
+ /* NOTE: relies on waiter->ww_ctx being set before insertion */
+ if (aw->ww_ctx) {
+ if (!bw->ww_ctx)
+ return 1;
+
+ return (signed long)(aw->ww_ctx->stamp -
+ bw->ww_ctx->stamp) < 0;
+ }
+
+ return 0;
}
static __always_inline void
-rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter)
+rt_mutex_enqueue(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter)
{
rb_add_cached(&waiter->tree_entry, &lock->waiters, __waiter_less);
}
static __always_inline void
-rt_mutex_dequeue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter)
+rt_mutex_dequeue(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter)
{
if (RB_EMPTY_NODE(&waiter->tree_entry))
return;
@@ -326,6 +484,42 @@ static __always_inline void rt_mutex_adjust_prio(struct task_struct *p)
rt_mutex_setprio(p, pi_task);
}
+/* RT mutex specific wake_q wrappers */
+static __always_inline void rt_mutex_wake_q_add_task(struct rt_wake_q_head *wqh,
+ struct task_struct *task,
+ unsigned int wake_state)
+{
+ if (IS_ENABLED(CONFIG_PREEMPT_RT) && wake_state == TASK_RTLOCK_WAIT) {
+ if (IS_ENABLED(CONFIG_PROVE_LOCKING))
+ WARN_ON_ONCE(wqh->rtlock_task);
+ get_task_struct(task);
+ wqh->rtlock_task = task;
+ } else {
+ wake_q_add(&wqh->head, task);
+ }
+}
+
+static __always_inline void rt_mutex_wake_q_add(struct rt_wake_q_head *wqh,
+ struct rt_mutex_waiter *w)
+{
+ rt_mutex_wake_q_add_task(wqh, w->task, w->wake_state);
+}
+
+static __always_inline void rt_mutex_wake_up_q(struct rt_wake_q_head *wqh)
+{
+ if (IS_ENABLED(CONFIG_PREEMPT_RT) && wqh->rtlock_task) {
+ wake_up_state(wqh->rtlock_task, TASK_RTLOCK_WAIT);
+ put_task_struct(wqh->rtlock_task);
+ wqh->rtlock_task = NULL;
+ }
+
+ if (!wake_q_empty(&wqh->head))
+ wake_up_q(&wqh->head);
+
+ /* Pairs with preempt_disable() in mark_wakeup_next_waiter() */
+ preempt_enable();
+}
+
/*
* Deadlock detection is conditional:
*
@@ -343,17 +537,12 @@ static __always_inline bool
rt_mutex_cond_detect_deadlock(struct rt_mutex_waiter *waiter,
enum rtmutex_chainwalk chwalk)
{
- if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEX))
+ if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES))
return waiter != NULL;
return chwalk == RT_MUTEX_FULL_CHAINWALK;
}
-/*
- * Max number of times we'll walk the boosting chain:
- */
-int max_lock_depth = 1024;
-
-static __always_inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p)
+static __always_inline struct rt_mutex_base *task_blocked_on_lock(struct task_struct *p)
{
return p->pi_blocked_on ? p->pi_blocked_on->lock : NULL;
}
@@ -423,15 +612,15 @@ static __always_inline struct rt_mutex *task_blocked_on_lock(struct task_struct
*/
static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task,
enum rtmutex_chainwalk chwalk,
- struct rt_mutex *orig_lock,
- struct rt_mutex *next_lock,
+ struct rt_mutex_base *orig_lock,
+ struct rt_mutex_base *next_lock,
struct rt_mutex_waiter *orig_waiter,
struct task_struct *top_task)
{
struct rt_mutex_waiter *waiter, *top_waiter = orig_waiter;
struct rt_mutex_waiter *prerequeue_top_waiter;
int ret = 0, depth = 0;
- struct rt_mutex *lock;
+ struct rt_mutex_base *lock;
bool detect_deadlock;
bool requeue = true;
@@ -514,6 +703,31 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task,
goto out_unlock_pi;
/*
+ * There could be 'spurious' loops in the lock graph due to ww_mutex,
+ * consider:
+ *
+ * P1: A, ww_A, ww_B
+ * P2: ww_B, ww_A
+ * P3: A
+ *
+ * P3 should not return -EDEADLK because it gets trapped in the cycle
+ * created by P1 and P2 (which will resolve -- and runs into
+ * max_lock_depth above). Therefore disable detect_deadlock such that
+ * the below termination condition can trigger once all relevant tasks
+ * are boosted.
+ *
+ * Even when we start with ww_mutex we can disable deadlock detection,
+ * since we would supress a ww_mutex induced deadlock at [6] anyway.
+ * Supressing it here however is not sufficient since we might still
+ * hit [6] due to adjustment driven iteration.
+ *
+ * NOTE: if someone were to create a deadlock between 2 ww_classes we'd
+ * utterly fail to report it; lockdep should.
+ */
+ if (IS_ENABLED(CONFIG_PREEMPT_RT) && waiter->ww_ctx && detect_deadlock)
+ detect_deadlock = false;
+
+ /*
* Drop out, when the task has no waiters. Note,
* top_waiter can be NULL, when we are in the deboosting
* mode!
@@ -574,8 +788,21 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task,
* walk, we detected a deadlock.
*/
if (lock == orig_lock || rt_mutex_owner(lock) == top_task) {
- raw_spin_unlock(&lock->wait_lock);
ret = -EDEADLK;
+
+ /*
+ * When the deadlock is due to ww_mutex; also see above. Don't
+ * report the deadlock and instead let the ww_mutex wound/die
+ * logic pick which of the contending threads gets -EDEADLK.
+ *
+ * NOTE: assumes the cycle only contains a single ww_class; any
+ * other configuration and we fail to report; also, see
+ * lockdep.
+ */
+ if (IS_ENABLED(CONFIG_PREEMPT_RT) && orig_waiter && orig_waiter->ww_ctx)
+ ret = 0;
+
+ raw_spin_unlock(&lock->wait_lock);
goto out_unlock_pi;
}
@@ -653,8 +880,7 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task,
* serializes all pi_waiters access and rb_erase() does not care about
* the values of the node being removed.
*/
- waiter->prio = task->prio;
- waiter->deadline = task->dl.deadline;
+ waiter_update_prio(waiter, task);
rt_mutex_enqueue(lock, waiter);
@@ -675,8 +901,9 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task,
* then we need to wake the new top waiter up to try
* to get the lock.
*/
- if (prerequeue_top_waiter != rt_mutex_top_waiter(lock))
- wake_up_process(rt_mutex_top_waiter(lock)->task);
+ top_waiter = rt_mutex_top_waiter(lock);
+ if (prerequeue_top_waiter != top_waiter)
+ wake_up_state(top_waiter->task, top_waiter->wake_state);
raw_spin_unlock_irq(&lock->wait_lock);
return 0;
}
@@ -779,7 +1006,7 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task,
* callsite called task_blocked_on_lock(), otherwise NULL
*/
static int __sched
-try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
+try_to_take_rt_mutex(struct rt_mutex_base *lock, struct task_struct *task,
struct rt_mutex_waiter *waiter)
{
lockdep_assert_held(&lock->wait_lock);
@@ -815,19 +1042,21 @@ try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
* trylock attempt.
*/
if (waiter) {
- /*
- * If waiter is not the highest priority waiter of
- * @lock, give up.
- */
- if (waiter != rt_mutex_top_waiter(lock))
- return 0;
+ struct rt_mutex_waiter *top_waiter = rt_mutex_top_waiter(lock);
/*
- * We can acquire the lock. Remove the waiter from the
- * lock waiters tree.
+ * If waiter is the highest priority waiter of @lock,
+ * or allowed to steal it, take it over.
*/
- rt_mutex_dequeue(lock, waiter);
-
+ if (waiter == top_waiter || rt_mutex_steal(waiter, top_waiter)) {
+ /*
+ * We can acquire the lock. Remove the waiter from the
+ * lock waiters tree.
+ */
+ rt_mutex_dequeue(lock, waiter);
+ } else {
+ return 0;
+ }
} else {
/*
* If the lock has waiters already we check whether @task is
@@ -838,13 +1067,9 @@ try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
* not need to be dequeued.
*/
if (rt_mutex_has_waiters(lock)) {
- /*
- * If @task->prio is greater than or equal to
- * the top waiter priority (kernel view),
- * @task lost.
- */
- if (!rt_mutex_waiter_less(task_to_waiter(task),
- rt_mutex_top_waiter(lock)))
+ /* Check whether the trylock can steal it. */
+ if (!rt_mutex_steal(task_to_waiter(task),
+ rt_mutex_top_waiter(lock)))
return 0;
/*
@@ -897,14 +1122,15 @@ takeit:
*
* This must be called with lock->wait_lock held and interrupts disabled
*/
-static int __sched task_blocks_on_rt_mutex(struct rt_mutex *lock,
+static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock,
struct rt_mutex_waiter *waiter,
struct task_struct *task,
+ struct ww_acquire_ctx *ww_ctx,
enum rtmutex_chainwalk chwalk)
{
struct task_struct *owner = rt_mutex_owner(lock);
struct rt_mutex_waiter *top_waiter = waiter;
- struct rt_mutex *next_lock;
+ struct rt_mutex_base *next_lock;
int chain_walk = 0, res;
lockdep_assert_held(&lock->wait_lock);
@@ -917,15 +1143,17 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex *lock,
* the other will detect the deadlock and return -EDEADLOCK,
* which is wrong, as the other waiter is not in a deadlock
* situation.
+ *
+ * Except for ww_mutex, in that case the chain walk must already deal
+ * with spurious cycles, see the comments at [3] and [6].
*/
- if (owner == task)
+ if (owner == task && !(build_ww_mutex() && ww_ctx))
return -EDEADLK;
raw_spin_lock(&task->pi_lock);
waiter->task = task;
waiter->lock = lock;
- waiter->prio = task->prio;
- waiter->deadline = task->dl.deadline;
+ waiter_update_prio(waiter, task);
/* Get the top priority waiter on the lock */
if (rt_mutex_has_waiters(lock))
@@ -936,6 +1164,21 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex *lock,
raw_spin_unlock(&task->pi_lock);
+ if (build_ww_mutex() && ww_ctx) {
+ struct rt_mutex *rtm;
+
+ /* Check whether the waiter should back out immediately */
+ rtm = container_of(lock, struct rt_mutex, rtmutex);
+ res = __ww_mutex_add_waiter(waiter, rtm, ww_ctx);
+ if (res) {
+ raw_spin_lock(&task->pi_lock);
+ rt_mutex_dequeue(lock, waiter);
+ task->pi_blocked_on = NULL;
+ raw_spin_unlock(&task->pi_lock);
+ return res;
+ }
+ }
+
if (!owner)
return 0;
@@ -986,8 +1229,8 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex *lock,
*
* Called with lock->wait_lock held and interrupts disabled.
*/
-static void __sched mark_wakeup_next_waiter(struct wake_q_head *wake_q,
- struct rt_mutex *lock)
+static void __sched mark_wakeup_next_waiter(struct rt_wake_q_head *wqh,
+ struct rt_mutex_base *lock)
{
struct rt_mutex_waiter *waiter;
@@ -1023,25 +1266,200 @@ static void __sched mark_wakeup_next_waiter(struct wake_q_head *wake_q,
* deboost but before waking our donor task, hence the preempt_disable()
* before unlock.
*
- * Pairs with preempt_enable() in rt_mutex_postunlock();
+ * Pairs with preempt_enable() in rt_mutex_wake_up_q();
*/
preempt_disable();
- wake_q_add(wake_q, waiter->task);
+ rt_mutex_wake_q_add(wqh, waiter);
raw_spin_unlock(&current->pi_lock);
}
+static int __sched __rt_mutex_slowtrylock(struct rt_mutex_base *lock)
+{
+ int ret = try_to_take_rt_mutex(lock, current, NULL);
+
+ /*
+ * try_to_take_rt_mutex() sets the lock waiters bit
+ * unconditionally. Clean this up.
+ */
+ fixup_rt_mutex_waiters(lock, true);
+
+ return ret;
+}
+
+/*
+ * Slow path try-lock function:
+ */
+static int __sched rt_mutex_slowtrylock(struct rt_mutex_base *lock)
+{
+ unsigned long flags;
+ int ret;
+
+ /*
+ * If the lock already has an owner we fail to get the lock.
+ * This can be done without taking the @lock->wait_lock as
+ * it is only being read, and this is a trylock anyway.
+ */
+ if (rt_mutex_owner(lock))
+ return 0;
+
+ /*
+ * The mutex has currently no owner. Lock the wait lock and try to
+ * acquire the lock. We use irqsave here to support early boot calls.
+ */
+ raw_spin_lock_irqsave(&lock->wait_lock, flags);
+
+ ret = __rt_mutex_slowtrylock(lock);
+
+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
+
+ return ret;
+}
+
+static __always_inline int __rt_mutex_trylock(struct rt_mutex_base *lock)
+{
+ if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
+ return 1;
+
+ return rt_mutex_slowtrylock(lock);
+}
+
+/*
+ * Slow path to release a rt-mutex.
+ */
+static void __sched rt_mutex_slowunlock(struct rt_mutex_base *lock)
+{
+ DEFINE_RT_WAKE_Q(wqh);
+ unsigned long flags;
+
+ /* irqsave required to support early boot calls */
+ raw_spin_lock_irqsave(&lock->wait_lock, flags);
+
+ debug_rt_mutex_unlock(lock);
+
+ /*
+ * We must be careful here if the fast path is enabled. If we
+ * have no waiters queued we cannot set owner to NULL here
+ * because of:
+ *
+ * foo->lock->owner = NULL;
+ * rtmutex_lock(foo->lock); <- fast path
+ * free = atomic_dec_and_test(foo->refcnt);
+ * rtmutex_unlock(foo->lock); <- fast path
+ * if (free)
+ * kfree(foo);
+ * raw_spin_unlock(foo->lock->wait_lock);
+ *
+ * So for the fastpath enabled kernel:
+ *
+ * Nothing can set the waiters bit as long as we hold
+ * lock->wait_lock. So we do the following sequence:
+ *
+ * owner = rt_mutex_owner(lock);
+ * clear_rt_mutex_waiters(lock);
+ * raw_spin_unlock(&lock->wait_lock);
+ * if (cmpxchg(&lock->owner, owner, 0) == owner)
+ * return;
+ * goto retry;
+ *
+ * The fastpath disabled variant is simple as all access to
+ * lock->owner is serialized by lock->wait_lock:
+ *
+ * lock->owner = NULL;
+ * raw_spin_unlock(&lock->wait_lock);
+ */
+ while (!rt_mutex_has_waiters(lock)) {
+ /* Drops lock->wait_lock ! */
+ if (unlock_rt_mutex_safe(lock, flags) == true)
+ return;
+ /* Relock the rtmutex and try again */
+ raw_spin_lock_irqsave(&lock->wait_lock, flags);
+ }
+
+ /*
+ * The wakeup next waiter path does not suffer from the above
+ * race. See the comments there.
+ *
+ * Queue the next waiter for wakeup once we release the wait_lock.
+ */
+ mark_wakeup_next_waiter(&wqh, lock);
+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
+
+ rt_mutex_wake_up_q(&wqh);
+}
+
+static __always_inline void __rt_mutex_unlock(struct rt_mutex_base *lock)
+{
+ if (likely(rt_mutex_cmpxchg_release(lock, current, NULL)))
+ return;
+
+ rt_mutex_slowunlock(lock);
+}
+
+#ifdef CONFIG_SMP
+static bool rtmutex_spin_on_owner(struct rt_mutex_base *lock,
+ struct rt_mutex_waiter *waiter,
+ struct task_struct *owner)
+{
+ bool res = true;
+
+ rcu_read_lock();
+ for (;;) {
+ /* If owner changed, trylock again. */
+ if (owner != rt_mutex_owner(lock))
+ break;
+ /*
+ * Ensure that @owner is dereferenced after checking that
+ * the lock owner still matches @owner. If that fails,
+ * @owner might point to freed memory. If it still matches,
+ * the rcu_read_lock() ensures the memory stays valid.
+ */
+ barrier();
+ /*
+ * Stop spinning when:
+ * - the lock owner has been scheduled out
+ * - current is not longer the top waiter
+ * - current is requested to reschedule (redundant
+ * for CONFIG_PREEMPT_RCU=y)
+ * - the VCPU on which owner runs is preempted
+ */
+ if (!owner_on_cpu(owner) || need_resched() ||
+ !rt_mutex_waiter_is_top_waiter(lock, waiter)) {
+ res = false;
+ break;
+ }
+ cpu_relax();
+ }
+ rcu_read_unlock();
+ return res;
+}
+#else
+static bool rtmutex_spin_on_owner(struct rt_mutex_base *lock,
+ struct rt_mutex_waiter *waiter,
+ struct task_struct *owner)
+{
+ return false;
+}
+#endif
+
+#ifdef RT_MUTEX_BUILD_MUTEX
+/*
+ * Functions required for:
+ * - rtmutex, futex on all kernels
+ * - mutex and rwsem substitutions on RT kernels
+ */
+
/*
* Remove a waiter from a lock and give up
*
- * Must be called with lock->wait_lock held and interrupts disabled. I must
+ * Must be called with lock->wait_lock held and interrupts disabled. It must
* have just failed to try_to_take_rt_mutex().
*/
-static void __sched remove_waiter(struct rt_mutex *lock,
+static void __sched remove_waiter(struct rt_mutex_base *lock,
struct rt_mutex_waiter *waiter)
{
bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock));
struct task_struct *owner = rt_mutex_owner(lock);
- struct rt_mutex *next_lock;
+ struct rt_mutex_base *next_lock;
lockdep_assert_held(&lock->wait_lock);
@@ -1089,45 +1507,10 @@ static void __sched remove_waiter(struct rt_mutex *lock,
raw_spin_lock_irq(&lock->wait_lock);
}
-/*
- * Recheck the pi chain, in case we got a priority setting
- *
- * Called from sched_setscheduler
- */
-void __sched rt_mutex_adjust_pi(struct task_struct *task)
-{
- struct rt_mutex_waiter *waiter;
- struct rt_mutex *next_lock;
- unsigned long flags;
-
- raw_spin_lock_irqsave(&task->pi_lock, flags);
-
- waiter = task->pi_blocked_on;
- if (!waiter || rt_mutex_waiter_equal(waiter, task_to_waiter(task))) {
- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
- return;
- }
- next_lock = waiter->lock;
- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
-
- /* gets dropped in rt_mutex_adjust_prio_chain()! */
- get_task_struct(task);
-
- rt_mutex_adjust_prio_chain(task, RT_MUTEX_MIN_CHAINWALK, NULL,
- next_lock, NULL, task);
-}
-
-void __sched rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
-{
- debug_rt_mutex_init_waiter(waiter);
- RB_CLEAR_NODE(&waiter->pi_tree_entry);
- RB_CLEAR_NODE(&waiter->tree_entry);
- waiter->task = NULL;
-}
-
/**
- * __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop
+ * rt_mutex_slowlock_block() - Perform the wait-wake-try-to-take loop
* @lock: the rt_mutex to take
+ * @ww_ctx: WW mutex context pointer
* @state: the state the task should block in (TASK_INTERRUPTIBLE
* or TASK_UNINTERRUPTIBLE)
* @timeout: the pre-initialized and started timer, or NULL for none
@@ -1135,10 +1518,14 @@ void __sched rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
*
* Must be called with lock->wait_lock held and interrupts disabled
*/
-static int __sched __rt_mutex_slowlock(struct rt_mutex *lock, unsigned int state,
- struct hrtimer_sleeper *timeout,
- struct rt_mutex_waiter *waiter)
+static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock,
+ struct ww_acquire_ctx *ww_ctx,
+ unsigned int state,
+ struct hrtimer_sleeper *timeout,
+ struct rt_mutex_waiter *waiter)
{
+ struct rt_mutex *rtm = container_of(lock, struct rt_mutex, rtmutex);
+ struct task_struct *owner;
int ret = 0;
for (;;) {
@@ -1155,9 +1542,20 @@ static int __sched __rt_mutex_slowlock(struct rt_mutex *lock, unsigned int state
break;
}
+ if (build_ww_mutex() && ww_ctx) {
+ ret = __ww_mutex_check_kill(rtm, waiter, ww_ctx);
+ if (ret)
+ break;
+ }
+
+ if (waiter == rt_mutex_top_waiter(lock))
+ owner = rt_mutex_owner(lock);
+ else
+ owner = NULL;
raw_spin_unlock_irq(&lock->wait_lock);
- schedule();
+ if (!owner || !rtmutex_spin_on_owner(lock, waiter, owner))
+ schedule();
raw_spin_lock_irq(&lock->wait_lock);
set_current_state(state);
@@ -1177,6 +1575,9 @@ static void __sched rt_mutex_handle_deadlock(int res, int detect_deadlock,
if (res != -EDEADLOCK || detect_deadlock)
return;
+ if (build_ww_mutex() && w->ww_ctx)
+ return;
+
/*
* Yell loudly and stop the task right here.
*/
@@ -1187,599 +1588,188 @@ static void __sched rt_mutex_handle_deadlock(int res, int detect_deadlock,
}
}
-/*
- * Slow path lock function:
+/**
+ * __rt_mutex_slowlock - Locking slowpath invoked with lock::wait_lock held
+ * @lock: The rtmutex to block lock
+ * @ww_ctx: WW mutex context pointer
+ * @state: The task state for sleeping
+ * @chwalk: Indicator whether full or partial chainwalk is requested
+ * @waiter: Initializer waiter for blocking
*/
-static int __sched rt_mutex_slowlock(struct rt_mutex *lock, unsigned int state,
- struct hrtimer_sleeper *timeout,
- enum rtmutex_chainwalk chwalk)
+static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock,
+ struct ww_acquire_ctx *ww_ctx,
+ unsigned int state,
+ enum rtmutex_chainwalk chwalk,
+ struct rt_mutex_waiter *waiter)
{
- struct rt_mutex_waiter waiter;
- unsigned long flags;
- int ret = 0;
-
- rt_mutex_init_waiter(&waiter);
+ struct rt_mutex *rtm = container_of(lock, struct rt_mutex, rtmutex);
+ struct ww_mutex *ww = ww_container_of(rtm);
+ int ret;
- /*
- * Technically we could use raw_spin_[un]lock_irq() here, but this can
- * be called in early boot if the cmpxchg() fast path is disabled
- * (debug, no architecture support). In this case we will acquire the
- * rtmutex with lock->wait_lock held. But we cannot unconditionally
- * enable interrupts in that early boot case. So we need to use the
- * irqsave/restore variants.
- */
- raw_spin_lock_irqsave(&lock->wait_lock, flags);
+ lockdep_assert_held(&lock->wait_lock);
/* Try to acquire the lock again: */
if (try_to_take_rt_mutex(lock, current, NULL)) {
- raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
+ if (build_ww_mutex() && ww_ctx) {
+ __ww_mutex_check_waiters(rtm, ww_ctx);
+ ww_mutex_lock_acquired(ww, ww_ctx);
+ }
return 0;
}
set_current_state(state);
- /* Setup the timer, when timeout != NULL */
- if (unlikely(timeout))
- hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
-
- ret = task_blocks_on_rt_mutex(lock, &waiter, current, chwalk);
+ trace_contention_begin(lock, LCB_F_RT);
+ ret = task_blocks_on_rt_mutex(lock, waiter, current, ww_ctx, chwalk);
if (likely(!ret))
- /* sleep on the mutex */
- ret = __rt_mutex_slowlock(lock, state, timeout, &waiter);
-
- if (unlikely(ret)) {
+ ret = rt_mutex_slowlock_block(lock, ww_ctx, state, NULL, waiter);
+
+ if (likely(!ret)) {
+ /* acquired the lock */
+ if (build_ww_mutex() && ww_ctx) {
+ if (!ww_ctx->is_wait_die)
+ __ww_mutex_check_waiters(rtm, ww_ctx);
+ ww_mutex_lock_acquired(ww, ww_ctx);
+ }
+ } else {
__set_current_state(TASK_RUNNING);
- remove_waiter(lock, &waiter);
- rt_mutex_handle_deadlock(ret, chwalk, &waiter);
+ remove_waiter(lock, waiter);
+ rt_mutex_handle_deadlock(ret, chwalk, waiter);
}
/*
* try_to_take_rt_mutex() sets the waiter bit
* unconditionally. We might have to fix that up.
*/
- fixup_rt_mutex_waiters(lock);
-
- raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
+ fixup_rt_mutex_waiters(lock, true);
- /* Remove pending timer: */
- if (unlikely(timeout))
- hrtimer_cancel(&timeout->timer);
-
- debug_rt_mutex_free_waiter(&waiter);
+ trace_contention_end(lock, ret);
return ret;
}
-static int __sched __rt_mutex_slowtrylock(struct rt_mutex *lock)
+static inline int __rt_mutex_slowlock_locked(struct rt_mutex_base *lock,
+ struct ww_acquire_ctx *ww_ctx,
+ unsigned int state)
{
- int ret = try_to_take_rt_mutex(lock, current, NULL);
+ struct rt_mutex_waiter waiter;
+ int ret;
- /*
- * try_to_take_rt_mutex() sets the lock waiters bit
- * unconditionally. Clean this up.
- */
- fixup_rt_mutex_waiters(lock);
+ rt_mutex_init_waiter(&waiter);
+ waiter.ww_ctx = ww_ctx;
+ ret = __rt_mutex_slowlock(lock, ww_ctx, state, RT_MUTEX_MIN_CHAINWALK,
+ &waiter);
+
+ debug_rt_mutex_free_waiter(&waiter);
return ret;
}
/*
- * Slow path try-lock function:
+ * rt_mutex_slowlock - Locking slowpath invoked when fast path fails
+ * @lock: The rtmutex to block lock
+ * @ww_ctx: WW mutex context pointer
+ * @state: The task state for sleeping
*/
-static int __sched rt_mutex_slowtrylock(struct rt_mutex *lock)
+static int __sched rt_mutex_slowlock(struct rt_mutex_base *lock,
+ struct ww_acquire_ctx *ww_ctx,
+ unsigned int state)
{
unsigned long flags;
int ret;
/*
- * If the lock already has an owner we fail to get the lock.
- * This can be done without taking the @lock->wait_lock as
- * it is only being read, and this is a trylock anyway.
- */
- if (rt_mutex_owner(lock))
- return 0;
-
- /*
- * The mutex has currently no owner. Lock the wait lock and try to
- * acquire the lock. We use irqsave here to support early boot calls.
+ * Technically we could use raw_spin_[un]lock_irq() here, but this can
+ * be called in early boot if the cmpxchg() fast path is disabled
+ * (debug, no architecture support). In this case we will acquire the
+ * rtmutex with lock->wait_lock held. But we cannot unconditionally
+ * enable interrupts in that early boot case. So we need to use the
+ * irqsave/restore variants.
*/
raw_spin_lock_irqsave(&lock->wait_lock, flags);
-
- ret = __rt_mutex_slowtrylock(lock);
-
+ ret = __rt_mutex_slowlock_locked(lock, ww_ctx, state);
raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
return ret;
}
-/*
- * Performs the wakeup of the top-waiter and re-enables preemption.
- */
-void __sched rt_mutex_postunlock(struct wake_q_head *wake_q)
-{
- wake_up_q(wake_q);
-
- /* Pairs with preempt_disable() in mark_wakeup_next_waiter() */
- preempt_enable();
-}
-
-/*
- * Slow path to release a rt-mutex.
- *
- * Return whether the current task needs to call rt_mutex_postunlock().
- */
-static void __sched rt_mutex_slowunlock(struct rt_mutex *lock)
+static __always_inline int __rt_mutex_lock(struct rt_mutex_base *lock,
+ unsigned int state)
{
- DEFINE_WAKE_Q(wake_q);
- unsigned long flags;
-
- /* irqsave required to support early boot calls */
- raw_spin_lock_irqsave(&lock->wait_lock, flags);
-
- debug_rt_mutex_unlock(lock);
-
- /*
- * We must be careful here if the fast path is enabled. If we
- * have no waiters queued we cannot set owner to NULL here
- * because of:
- *
- * foo->lock->owner = NULL;
- * rtmutex_lock(foo->lock); <- fast path
- * free = atomic_dec_and_test(foo->refcnt);
- * rtmutex_unlock(foo->lock); <- fast path
- * if (free)
- * kfree(foo);
- * raw_spin_unlock(foo->lock->wait_lock);
- *
- * So for the fastpath enabled kernel:
- *
- * Nothing can set the waiters bit as long as we hold
- * lock->wait_lock. So we do the following sequence:
- *
- * owner = rt_mutex_owner(lock);
- * clear_rt_mutex_waiters(lock);
- * raw_spin_unlock(&lock->wait_lock);
- * if (cmpxchg(&lock->owner, owner, 0) == owner)
- * return;
- * goto retry;
- *
- * The fastpath disabled variant is simple as all access to
- * lock->owner is serialized by lock->wait_lock:
- *
- * lock->owner = NULL;
- * raw_spin_unlock(&lock->wait_lock);
- */
- while (!rt_mutex_has_waiters(lock)) {
- /* Drops lock->wait_lock ! */
- if (unlock_rt_mutex_safe(lock, flags) == true)
- return;
- /* Relock the rtmutex and try again */
- raw_spin_lock_irqsave(&lock->wait_lock, flags);
- }
-
- /*
- * The wakeup next waiter path does not suffer from the above
- * race. See the comments there.
- *
- * Queue the next waiter for wakeup once we release the wait_lock.
- */
- mark_wakeup_next_waiter(&wake_q, lock);
- raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
-
- rt_mutex_postunlock(&wake_q);
-}
-
-/*
- * debug aware fast / slowpath lock,trylock,unlock
- *
- * The atomic acquire/release ops are compiled away, when either the
- * architecture does not support cmpxchg or when debugging is enabled.
- */
-static __always_inline int __rt_mutex_lock(struct rt_mutex *lock, long state,
- unsigned int subclass)
-{
- int ret;
-
- might_sleep();
- mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
-
if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
return 0;
- ret = rt_mutex_slowlock(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK);
- if (ret)
- mutex_release(&lock->dep_map, _RET_IP_);
- return ret;
+ return rt_mutex_slowlock(lock, NULL, state);
}
+#endif /* RT_MUTEX_BUILD_MUTEX */
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-/**
- * rt_mutex_lock_nested - lock a rt_mutex
- *
- * @lock: the rt_mutex to be locked
- * @subclass: the lockdep subclass
- */
-void __sched rt_mutex_lock_nested(struct rt_mutex *lock, unsigned int subclass)
-{
- __rt_mutex_lock(lock, TASK_UNINTERRUPTIBLE, subclass);
-}
-EXPORT_SYMBOL_GPL(rt_mutex_lock_nested);
-
-#else /* !CONFIG_DEBUG_LOCK_ALLOC */
-
-/**
- * rt_mutex_lock - lock a rt_mutex
- *
- * @lock: the rt_mutex to be locked
+#ifdef RT_MUTEX_BUILD_SPINLOCKS
+/*
+ * Functions required for spin/rw_lock substitution on RT kernels
*/
-void __sched rt_mutex_lock(struct rt_mutex *lock)
-{
- __rt_mutex_lock(lock, TASK_UNINTERRUPTIBLE, 0);
-}
-EXPORT_SYMBOL_GPL(rt_mutex_lock);
-#endif
/**
- * rt_mutex_lock_interruptible - lock a rt_mutex interruptible
- *
- * @lock: the rt_mutex to be locked
- *
- * Returns:
- * 0 on success
- * -EINTR when interrupted by a signal
+ * rtlock_slowlock_locked - Slow path lock acquisition for RT locks
+ * @lock: The underlying RT mutex
*/
-int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock)
+static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock)
{
- return __rt_mutex_lock(lock, TASK_INTERRUPTIBLE, 0);
-}
-EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
-
-/**
- * rt_mutex_trylock - try to lock a rt_mutex
- *
- * @lock: the rt_mutex to be locked
- *
- * This function can only be called in thread context. It's safe to call it
- * from atomic regions, but not from hard or soft interrupt context.
- *
- * Returns:
- * 1 on success
- * 0 on contention
- */
-int __sched rt_mutex_trylock(struct rt_mutex *lock)
-{
- int ret;
-
- if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES) && WARN_ON_ONCE(!in_task()))
- return 0;
+ struct rt_mutex_waiter waiter;
+ struct task_struct *owner;
- /*
- * No lockdep annotation required because lockdep disables the fast
- * path.
- */
- if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
- return 1;
+ lockdep_assert_held(&lock->wait_lock);
- ret = rt_mutex_slowtrylock(lock);
- if (ret)
- mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
+ if (try_to_take_rt_mutex(lock, current, NULL))
+ return;
- return ret;
-}
-EXPORT_SYMBOL_GPL(rt_mutex_trylock);
+ rt_mutex_init_rtlock_waiter(&waiter);
-/**
- * rt_mutex_unlock - unlock a rt_mutex
- *
- * @lock: the rt_mutex to be unlocked
- */
-void __sched rt_mutex_unlock(struct rt_mutex *lock)
-{
- mutex_release(&lock->dep_map, _RET_IP_);
- if (likely(rt_mutex_cmpxchg_release(lock, current, NULL)))
- return;
+ /* Save current state and set state to TASK_RTLOCK_WAIT */
+ current_save_and_set_rtlock_wait_state();
- rt_mutex_slowunlock(lock);
-}
-EXPORT_SYMBOL_GPL(rt_mutex_unlock);
+ trace_contention_begin(lock, LCB_F_RT);
-/*
- * Futex variants, must not use fastpath.
- */
-int __sched rt_mutex_futex_trylock(struct rt_mutex *lock)
-{
- return rt_mutex_slowtrylock(lock);
-}
+ task_blocks_on_rt_mutex(lock, &waiter, current, NULL, RT_MUTEX_MIN_CHAINWALK);
-int __sched __rt_mutex_futex_trylock(struct rt_mutex *lock)
-{
- return __rt_mutex_slowtrylock(lock);
-}
+ for (;;) {
+ /* Try to acquire the lock again */
+ if (try_to_take_rt_mutex(lock, current, &waiter))
+ break;
-/**
- * __rt_mutex_futex_unlock - Futex variant, that since futex variants
- * do not use the fast-path, can be simple and will not need to retry.
- *
- * @lock: The rt_mutex to be unlocked
- * @wake_q: The wake queue head from which to get the next lock waiter
- */
-bool __sched __rt_mutex_futex_unlock(struct rt_mutex *lock,
- struct wake_q_head *wake_q)
-{
- lockdep_assert_held(&lock->wait_lock);
+ if (&waiter == rt_mutex_top_waiter(lock))
+ owner = rt_mutex_owner(lock);
+ else
+ owner = NULL;
+ raw_spin_unlock_irq(&lock->wait_lock);
- debug_rt_mutex_unlock(lock);
+ if (!owner || !rtmutex_spin_on_owner(lock, &waiter, owner))
+ schedule_rtlock();
- if (!rt_mutex_has_waiters(lock)) {
- lock->owner = NULL;
- return false; /* done */
+ raw_spin_lock_irq(&lock->wait_lock);
+ set_current_state(TASK_RTLOCK_WAIT);
}
+ /* Restore the task state */
+ current_restore_rtlock_saved_state();
+
/*
- * We've already deboosted, mark_wakeup_next_waiter() will
- * retain preempt_disabled when we drop the wait_lock, to
- * avoid inversion prior to the wakeup. preempt_disable()
- * therein pairs with rt_mutex_postunlock().
+ * try_to_take_rt_mutex() sets the waiter bit unconditionally.
+ * We might have to fix that up:
*/
- mark_wakeup_next_waiter(wake_q, lock);
+ fixup_rt_mutex_waiters(lock, true);
+ debug_rt_mutex_free_waiter(&waiter);
- return true; /* call postunlock() */
+ trace_contention_end(lock, 0);
}
-void __sched rt_mutex_futex_unlock(struct rt_mutex *lock)
+static __always_inline void __sched rtlock_slowlock(struct rt_mutex_base *lock)
{
- DEFINE_WAKE_Q(wake_q);
unsigned long flags;
- bool postunlock;
raw_spin_lock_irqsave(&lock->wait_lock, flags);
- postunlock = __rt_mutex_futex_unlock(lock, &wake_q);
+ rtlock_slowlock_locked(lock);
raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
-
- if (postunlock)
- rt_mutex_postunlock(&wake_q);
-}
-
-/**
- * __rt_mutex_init - initialize the rt_mutex
- *
- * @lock: The rt_mutex to be initialized
- * @name: The lock name used for debugging
- * @key: The lock class key used for debugging
- *
- * Initialize the rt_mutex to unlocked state.
- *
- * Initializing of a locked rt_mutex is not allowed
- */
-void __sched __rt_mutex_init(struct rt_mutex *lock, const char *name,
- struct lock_class_key *key)
-{
- debug_check_no_locks_freed((void *)lock, sizeof(*lock));
- lockdep_init_map(&lock->dep_map, name, key, 0);
-
- __rt_mutex_basic_init(lock);
-}
-EXPORT_SYMBOL_GPL(__rt_mutex_init);
-
-/**
- * rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a
- * proxy owner
- *
- * @lock: the rt_mutex to be locked
- * @proxy_owner:the task to set as owner
- *
- * No locking. Caller has to do serializing itself
- *
- * Special API call for PI-futex support. This initializes the rtmutex and
- * assigns it to @proxy_owner. Concurrent operations on the rtmutex are not
- * possible at this point because the pi_state which contains the rtmutex
- * is not yet visible to other tasks.
- */
-void __sched rt_mutex_init_proxy_locked(struct rt_mutex *lock,
- struct task_struct *proxy_owner)
-{
- __rt_mutex_basic_init(lock);
- rt_mutex_set_owner(lock, proxy_owner);
-}
-
-/**
- * rt_mutex_proxy_unlock - release a lock on behalf of owner
- *
- * @lock: the rt_mutex to be locked
- *
- * No locking. Caller has to do serializing itself
- *
- * Special API call for PI-futex support. This merrily cleans up the rtmutex
- * (debugging) state. Concurrent operations on this rt_mutex are not
- * possible because it belongs to the pi_state which is about to be freed
- * and it is not longer visible to other tasks.
- */
-void __sched rt_mutex_proxy_unlock(struct rt_mutex *lock)
-{
- debug_rt_mutex_proxy_unlock(lock);
- rt_mutex_set_owner(lock, NULL);
-}
-
-/**
- * __rt_mutex_start_proxy_lock() - Start lock acquisition for another task
- * @lock: the rt_mutex to take
- * @waiter: the pre-initialized rt_mutex_waiter
- * @task: the task to prepare
- *
- * Starts the rt_mutex acquire; it enqueues the @waiter and does deadlock
- * detection. It does not wait, see rt_mutex_wait_proxy_lock() for that.
- *
- * NOTE: does _NOT_ remove the @waiter on failure; must either call
- * rt_mutex_wait_proxy_lock() or rt_mutex_cleanup_proxy_lock() after this.
- *
- * Returns:
- * 0 - task blocked on lock
- * 1 - acquired the lock for task, caller should wake it up
- * <0 - error
- *
- * Special API call for PI-futex support.
- */
-int __sched __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
- struct rt_mutex_waiter *waiter,
- struct task_struct *task)
-{
- int ret;
-
- lockdep_assert_held(&lock->wait_lock);
-
- if (try_to_take_rt_mutex(lock, task, NULL))
- return 1;
-
- /* We enforce deadlock detection for futexes */
- ret = task_blocks_on_rt_mutex(lock, waiter, task,
- RT_MUTEX_FULL_CHAINWALK);
-
- if (ret && !rt_mutex_owner(lock)) {
- /*
- * Reset the return value. We might have
- * returned with -EDEADLK and the owner
- * released the lock while we were walking the
- * pi chain. Let the waiter sort it out.
- */
- ret = 0;
- }
-
- return ret;
-}
-
-/**
- * rt_mutex_start_proxy_lock() - Start lock acquisition for another task
- * @lock: the rt_mutex to take
- * @waiter: the pre-initialized rt_mutex_waiter
- * @task: the task to prepare
- *
- * Starts the rt_mutex acquire; it enqueues the @waiter and does deadlock
- * detection. It does not wait, see rt_mutex_wait_proxy_lock() for that.
- *
- * NOTE: unlike __rt_mutex_start_proxy_lock this _DOES_ remove the @waiter
- * on failure.
- *
- * Returns:
- * 0 - task blocked on lock
- * 1 - acquired the lock for task, caller should wake it up
- * <0 - error
- *
- * Special API call for PI-futex support.
- */
-int __sched rt_mutex_start_proxy_lock(struct rt_mutex *lock,
- struct rt_mutex_waiter *waiter,
- struct task_struct *task)
-{
- int ret;
-
- raw_spin_lock_irq(&lock->wait_lock);
- ret = __rt_mutex_start_proxy_lock(lock, waiter, task);
- if (unlikely(ret))
- remove_waiter(lock, waiter);
- raw_spin_unlock_irq(&lock->wait_lock);
-
- return ret;
}
-/**
- * rt_mutex_wait_proxy_lock() - Wait for lock acquisition
- * @lock: the rt_mutex we were woken on
- * @to: the timeout, null if none. hrtimer should already have
- * been started.
- * @waiter: the pre-initialized rt_mutex_waiter
- *
- * Wait for the lock acquisition started on our behalf by
- * rt_mutex_start_proxy_lock(). Upon failure, the caller must call
- * rt_mutex_cleanup_proxy_lock().
- *
- * Returns:
- * 0 - success
- * <0 - error, one of -EINTR, -ETIMEDOUT
- *
- * Special API call for PI-futex support
- */
-int __sched rt_mutex_wait_proxy_lock(struct rt_mutex *lock,
- struct hrtimer_sleeper *to,
- struct rt_mutex_waiter *waiter)
-{
- int ret;
-
- raw_spin_lock_irq(&lock->wait_lock);
- /* sleep on the mutex */
- set_current_state(TASK_INTERRUPTIBLE);
- ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter);
- /*
- * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
- * have to fix that up.
- */
- fixup_rt_mutex_waiters(lock);
- raw_spin_unlock_irq(&lock->wait_lock);
-
- return ret;
-}
-
-/**
- * rt_mutex_cleanup_proxy_lock() - Cleanup failed lock acquisition
- * @lock: the rt_mutex we were woken on
- * @waiter: the pre-initialized rt_mutex_waiter
- *
- * Attempt to clean up after a failed __rt_mutex_start_proxy_lock() or
- * rt_mutex_wait_proxy_lock().
- *
- * Unless we acquired the lock; we're still enqueued on the wait-list and can
- * in fact still be granted ownership until we're removed. Therefore we can
- * find we are in fact the owner and must disregard the
- * rt_mutex_wait_proxy_lock() failure.
- *
- * Returns:
- * true - did the cleanup, we done.
- * false - we acquired the lock after rt_mutex_wait_proxy_lock() returned,
- * caller should disregards its return value.
- *
- * Special API call for PI-futex support
- */
-bool __sched rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock,
- struct rt_mutex_waiter *waiter)
-{
- bool cleanup = false;
-
- raw_spin_lock_irq(&lock->wait_lock);
- /*
- * Do an unconditional try-lock, this deals with the lock stealing
- * state where __rt_mutex_futex_unlock() -> mark_wakeup_next_waiter()
- * sets a NULL owner.
- *
- * We're not interested in the return value, because the subsequent
- * test on rt_mutex_owner() will infer that. If the trylock succeeded,
- * we will own the lock and it will have removed the waiter. If we
- * failed the trylock, we're still not owner and we need to remove
- * ourselves.
- */
- try_to_take_rt_mutex(lock, current, waiter);
- /*
- * Unless we're the owner; we're still enqueued on the wait_list.
- * So check if we became owner, if not, take us off the wait_list.
- */
- if (rt_mutex_owner(lock) != current) {
- remove_waiter(lock, waiter);
- cleanup = true;
- }
- /*
- * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
- * have to fix that up.
- */
- fixup_rt_mutex_waiters(lock);
-
- raw_spin_unlock_irq(&lock->wait_lock);
-
- return cleanup;
-}
-
-#ifdef CONFIG_DEBUG_RT_MUTEXES
-void rt_mutex_debug_task_free(struct task_struct *task)
-{
- DEBUG_LOCKS_WARN_ON(!RB_EMPTY_ROOT(&task->pi_waiters.rb_root));
- DEBUG_LOCKS_WARN_ON(task->pi_blocked_on);
-}
-#endif
+#endif /* RT_MUTEX_BUILD_SPINLOCKS */
diff --git a/kernel/locking/rtmutex_api.c b/kernel/locking/rtmutex_api.c
new file mode 100644
index 000000000000..cb9fdff76a8a
--- /dev/null
+++ b/kernel/locking/rtmutex_api.c
@@ -0,0 +1,612 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * rtmutex API
+ */
+#include <linux/spinlock.h>
+#include <linux/export.h>
+
+#define RT_MUTEX_BUILD_MUTEX
+#include "rtmutex.c"
+
+/*
+ * Max number of times we'll walk the boosting chain:
+ */
+int max_lock_depth = 1024;
+
+/*
+ * Debug aware fast / slowpath lock,trylock,unlock
+ *
+ * The atomic acquire/release ops are compiled away, when either the
+ * architecture does not support cmpxchg or when debugging is enabled.
+ */
+static __always_inline int __rt_mutex_lock_common(struct rt_mutex *lock,
+ unsigned int state,
+ struct lockdep_map *nest_lock,
+ unsigned int subclass)
+{
+ int ret;
+
+ might_sleep();
+ mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, _RET_IP_);
+ ret = __rt_mutex_lock(&lock->rtmutex, state);
+ if (ret)
+ mutex_release(&lock->dep_map, _RET_IP_);
+ return ret;
+}
+
+void rt_mutex_base_init(struct rt_mutex_base *rtb)
+{
+ __rt_mutex_base_init(rtb);
+}
+EXPORT_SYMBOL(rt_mutex_base_init);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+/**
+ * rt_mutex_lock_nested - lock a rt_mutex
+ *
+ * @lock: the rt_mutex to be locked
+ * @subclass: the lockdep subclass
+ */
+void __sched rt_mutex_lock_nested(struct rt_mutex *lock, unsigned int subclass)
+{
+ __rt_mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, NULL, subclass);
+}
+EXPORT_SYMBOL_GPL(rt_mutex_lock_nested);
+
+void __sched _rt_mutex_lock_nest_lock(struct rt_mutex *lock, struct lockdep_map *nest_lock)
+{
+ __rt_mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, nest_lock, 0);
+}
+EXPORT_SYMBOL_GPL(_rt_mutex_lock_nest_lock);
+
+#else /* !CONFIG_DEBUG_LOCK_ALLOC */
+
+/**
+ * rt_mutex_lock - lock a rt_mutex
+ *
+ * @lock: the rt_mutex to be locked
+ */
+void __sched rt_mutex_lock(struct rt_mutex *lock)
+{
+ __rt_mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, NULL, 0);
+}
+EXPORT_SYMBOL_GPL(rt_mutex_lock);
+#endif
+
+/**
+ * rt_mutex_lock_interruptible - lock a rt_mutex interruptible
+ *
+ * @lock: the rt_mutex to be locked
+ *
+ * Returns:
+ * 0 on success
+ * -EINTR when interrupted by a signal
+ */
+int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock)
+{
+ return __rt_mutex_lock_common(lock, TASK_INTERRUPTIBLE, NULL, 0);
+}
+EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
+
+/**
+ * rt_mutex_lock_killable - lock a rt_mutex killable
+ *
+ * @lock: the rt_mutex to be locked
+ *
+ * Returns:
+ * 0 on success
+ * -EINTR when interrupted by a signal
+ */
+int __sched rt_mutex_lock_killable(struct rt_mutex *lock)
+{
+ return __rt_mutex_lock_common(lock, TASK_KILLABLE, NULL, 0);
+}
+EXPORT_SYMBOL_GPL(rt_mutex_lock_killable);
+
+/**
+ * rt_mutex_trylock - try to lock a rt_mutex
+ *
+ * @lock: the rt_mutex to be locked
+ *
+ * This function can only be called in thread context. It's safe to call it
+ * from atomic regions, but not from hard or soft interrupt context.
+ *
+ * Returns:
+ * 1 on success
+ * 0 on contention
+ */
+int __sched rt_mutex_trylock(struct rt_mutex *lock)
+{
+ int ret;
+
+ if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES) && WARN_ON_ONCE(!in_task()))
+ return 0;
+
+ ret = __rt_mutex_trylock(&lock->rtmutex);
+ if (ret)
+ mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(rt_mutex_trylock);
+
+/**
+ * rt_mutex_unlock - unlock a rt_mutex
+ *
+ * @lock: the rt_mutex to be unlocked
+ */
+void __sched rt_mutex_unlock(struct rt_mutex *lock)
+{
+ mutex_release(&lock->dep_map, _RET_IP_);
+ __rt_mutex_unlock(&lock->rtmutex);
+}
+EXPORT_SYMBOL_GPL(rt_mutex_unlock);
+
+/*
+ * Futex variants, must not use fastpath.
+ */
+int __sched rt_mutex_futex_trylock(struct rt_mutex_base *lock)
+{
+ return rt_mutex_slowtrylock(lock);
+}
+
+int __sched __rt_mutex_futex_trylock(struct rt_mutex_base *lock)
+{
+ return __rt_mutex_slowtrylock(lock);
+}
+
+/**
+ * __rt_mutex_futex_unlock - Futex variant, that since futex variants
+ * do not use the fast-path, can be simple and will not need to retry.
+ *
+ * @lock: The rt_mutex to be unlocked
+ * @wqh: The wake queue head from which to get the next lock waiter
+ */
+bool __sched __rt_mutex_futex_unlock(struct rt_mutex_base *lock,
+ struct rt_wake_q_head *wqh)
+{
+ lockdep_assert_held(&lock->wait_lock);
+
+ debug_rt_mutex_unlock(lock);
+
+ if (!rt_mutex_has_waiters(lock)) {
+ lock->owner = NULL;
+ return false; /* done */
+ }
+
+ /*
+ * We've already deboosted, mark_wakeup_next_waiter() will
+ * retain preempt_disabled when we drop the wait_lock, to
+ * avoid inversion prior to the wakeup. preempt_disable()
+ * therein pairs with rt_mutex_postunlock().
+ */
+ mark_wakeup_next_waiter(wqh, lock);
+
+ return true; /* call postunlock() */
+}
+
+void __sched rt_mutex_futex_unlock(struct rt_mutex_base *lock)
+{
+ DEFINE_RT_WAKE_Q(wqh);
+ unsigned long flags;
+ bool postunlock;
+
+ raw_spin_lock_irqsave(&lock->wait_lock, flags);
+ postunlock = __rt_mutex_futex_unlock(lock, &wqh);
+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
+
+ if (postunlock)
+ rt_mutex_postunlock(&wqh);
+}
+
+/**
+ * __rt_mutex_init - initialize the rt_mutex
+ *
+ * @lock: The rt_mutex to be initialized
+ * @name: The lock name used for debugging
+ * @key: The lock class key used for debugging
+ *
+ * Initialize the rt_mutex to unlocked state.
+ *
+ * Initializing of a locked rt_mutex is not allowed
+ */
+void __sched __rt_mutex_init(struct rt_mutex *lock, const char *name,
+ struct lock_class_key *key)
+{
+ debug_check_no_locks_freed((void *)lock, sizeof(*lock));
+ __rt_mutex_base_init(&lock->rtmutex);
+ lockdep_init_map_wait(&lock->dep_map, name, key, 0, LD_WAIT_SLEEP);
+}
+EXPORT_SYMBOL_GPL(__rt_mutex_init);
+
+/**
+ * rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a
+ * proxy owner
+ *
+ * @lock: the rt_mutex to be locked
+ * @proxy_owner:the task to set as owner
+ *
+ * No locking. Caller has to do serializing itself
+ *
+ * Special API call for PI-futex support. This initializes the rtmutex and
+ * assigns it to @proxy_owner. Concurrent operations on the rtmutex are not
+ * possible at this point because the pi_state which contains the rtmutex
+ * is not yet visible to other tasks.
+ */
+void __sched rt_mutex_init_proxy_locked(struct rt_mutex_base *lock,
+ struct task_struct *proxy_owner)
+{
+ static struct lock_class_key pi_futex_key;
+
+ __rt_mutex_base_init(lock);
+ /*
+ * On PREEMPT_RT the futex hashbucket spinlock becomes 'sleeping'
+ * and rtmutex based. That causes a lockdep false positive, because
+ * some of the futex functions invoke spin_unlock(&hb->lock) with
+ * the wait_lock of the rtmutex associated to the pi_futex held.
+ * spin_unlock() in turn takes wait_lock of the rtmutex on which
+ * the spinlock is based, which makes lockdep notice a lock
+ * recursion. Give the futex/rtmutex wait_lock a separate key.
+ */
+ lockdep_set_class(&lock->wait_lock, &pi_futex_key);
+ rt_mutex_set_owner(lock, proxy_owner);
+}
+
+/**
+ * rt_mutex_proxy_unlock - release a lock on behalf of owner
+ *
+ * @lock: the rt_mutex to be locked
+ *
+ * No locking. Caller has to do serializing itself
+ *
+ * Special API call for PI-futex support. This just cleans up the rtmutex
+ * (debugging) state. Concurrent operations on this rt_mutex are not
+ * possible because it belongs to the pi_state which is about to be freed
+ * and it is not longer visible to other tasks.
+ */
+void __sched rt_mutex_proxy_unlock(struct rt_mutex_base *lock)
+{
+ debug_rt_mutex_proxy_unlock(lock);
+ rt_mutex_clear_owner(lock);
+}
+
+/**
+ * __rt_mutex_start_proxy_lock() - Start lock acquisition for another task
+ * @lock: the rt_mutex to take
+ * @waiter: the pre-initialized rt_mutex_waiter
+ * @task: the task to prepare
+ *
+ * Starts the rt_mutex acquire; it enqueues the @waiter and does deadlock
+ * detection. It does not wait, see rt_mutex_wait_proxy_lock() for that.
+ *
+ * NOTE: does _NOT_ remove the @waiter on failure; must either call
+ * rt_mutex_wait_proxy_lock() or rt_mutex_cleanup_proxy_lock() after this.
+ *
+ * Returns:
+ * 0 - task blocked on lock
+ * 1 - acquired the lock for task, caller should wake it up
+ * <0 - error
+ *
+ * Special API call for PI-futex support.
+ */
+int __sched __rt_mutex_start_proxy_lock(struct rt_mutex_base *lock,
+ struct rt_mutex_waiter *waiter,
+ struct task_struct *task)
+{
+ int ret;
+
+ lockdep_assert_held(&lock->wait_lock);
+
+ if (try_to_take_rt_mutex(lock, task, NULL))
+ return 1;
+
+ /* We enforce deadlock detection for futexes */
+ ret = task_blocks_on_rt_mutex(lock, waiter, task, NULL,
+ RT_MUTEX_FULL_CHAINWALK);
+
+ if (ret && !rt_mutex_owner(lock)) {
+ /*
+ * Reset the return value. We might have
+ * returned with -EDEADLK and the owner
+ * released the lock while we were walking the
+ * pi chain. Let the waiter sort it out.
+ */
+ ret = 0;
+ }
+
+ return ret;
+}
+
+/**
+ * rt_mutex_start_proxy_lock() - Start lock acquisition for another task
+ * @lock: the rt_mutex to take
+ * @waiter: the pre-initialized rt_mutex_waiter
+ * @task: the task to prepare
+ *
+ * Starts the rt_mutex acquire; it enqueues the @waiter and does deadlock
+ * detection. It does not wait, see rt_mutex_wait_proxy_lock() for that.
+ *
+ * NOTE: unlike __rt_mutex_start_proxy_lock this _DOES_ remove the @waiter
+ * on failure.
+ *
+ * Returns:
+ * 0 - task blocked on lock
+ * 1 - acquired the lock for task, caller should wake it up
+ * <0 - error
+ *
+ * Special API call for PI-futex support.
+ */
+int __sched rt_mutex_start_proxy_lock(struct rt_mutex_base *lock,
+ struct rt_mutex_waiter *waiter,
+ struct task_struct *task)
+{
+ int ret;
+
+ raw_spin_lock_irq(&lock->wait_lock);
+ ret = __rt_mutex_start_proxy_lock(lock, waiter, task);
+ if (unlikely(ret))
+ remove_waiter(lock, waiter);
+ raw_spin_unlock_irq(&lock->wait_lock);
+
+ return ret;
+}
+
+/**
+ * rt_mutex_wait_proxy_lock() - Wait for lock acquisition
+ * @lock: the rt_mutex we were woken on
+ * @to: the timeout, null if none. hrtimer should already have
+ * been started.
+ * @waiter: the pre-initialized rt_mutex_waiter
+ *
+ * Wait for the lock acquisition started on our behalf by
+ * rt_mutex_start_proxy_lock(). Upon failure, the caller must call
+ * rt_mutex_cleanup_proxy_lock().
+ *
+ * Returns:
+ * 0 - success
+ * <0 - error, one of -EINTR, -ETIMEDOUT
+ *
+ * Special API call for PI-futex support
+ */
+int __sched rt_mutex_wait_proxy_lock(struct rt_mutex_base *lock,
+ struct hrtimer_sleeper *to,
+ struct rt_mutex_waiter *waiter)
+{
+ int ret;
+
+ raw_spin_lock_irq(&lock->wait_lock);
+ /* sleep on the mutex */
+ set_current_state(TASK_INTERRUPTIBLE);
+ ret = rt_mutex_slowlock_block(lock, NULL, TASK_INTERRUPTIBLE, to, waiter);
+ /*
+ * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
+ * have to fix that up.
+ */
+ fixup_rt_mutex_waiters(lock, true);
+ raw_spin_unlock_irq(&lock->wait_lock);
+
+ return ret;
+}
+
+/**
+ * rt_mutex_cleanup_proxy_lock() - Cleanup failed lock acquisition
+ * @lock: the rt_mutex we were woken on
+ * @waiter: the pre-initialized rt_mutex_waiter
+ *
+ * Attempt to clean up after a failed __rt_mutex_start_proxy_lock() or
+ * rt_mutex_wait_proxy_lock().
+ *
+ * Unless we acquired the lock; we're still enqueued on the wait-list and can
+ * in fact still be granted ownership until we're removed. Therefore we can
+ * find we are in fact the owner and must disregard the
+ * rt_mutex_wait_proxy_lock() failure.
+ *
+ * Returns:
+ * true - did the cleanup, we done.
+ * false - we acquired the lock after rt_mutex_wait_proxy_lock() returned,
+ * caller should disregards its return value.
+ *
+ * Special API call for PI-futex support
+ */
+bool __sched rt_mutex_cleanup_proxy_lock(struct rt_mutex_base *lock,
+ struct rt_mutex_waiter *waiter)
+{
+ bool cleanup = false;
+
+ raw_spin_lock_irq(&lock->wait_lock);
+ /*
+ * Do an unconditional try-lock, this deals with the lock stealing
+ * state where __rt_mutex_futex_unlock() -> mark_wakeup_next_waiter()
+ * sets a NULL owner.
+ *
+ * We're not interested in the return value, because the subsequent
+ * test on rt_mutex_owner() will infer that. If the trylock succeeded,
+ * we will own the lock and it will have removed the waiter. If we
+ * failed the trylock, we're still not owner and we need to remove
+ * ourselves.
+ */
+ try_to_take_rt_mutex(lock, current, waiter);
+ /*
+ * Unless we're the owner; we're still enqueued on the wait_list.
+ * So check if we became owner, if not, take us off the wait_list.
+ */
+ if (rt_mutex_owner(lock) != current) {
+ remove_waiter(lock, waiter);
+ cleanup = true;
+ }
+ /*
+ * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
+ * have to fix that up.
+ */
+ fixup_rt_mutex_waiters(lock, false);
+
+ raw_spin_unlock_irq(&lock->wait_lock);
+
+ return cleanup;
+}
+
+/*
+ * Recheck the pi chain, in case we got a priority setting
+ *
+ * Called from sched_setscheduler
+ */
+void __sched rt_mutex_adjust_pi(struct task_struct *task)
+{
+ struct rt_mutex_waiter *waiter;
+ struct rt_mutex_base *next_lock;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&task->pi_lock, flags);
+
+ waiter = task->pi_blocked_on;
+ if (!waiter || rt_mutex_waiter_equal(waiter, task_to_waiter(task))) {
+ raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+ return;
+ }
+ next_lock = waiter->lock;
+ raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+
+ /* gets dropped in rt_mutex_adjust_prio_chain()! */
+ get_task_struct(task);
+
+ rt_mutex_adjust_prio_chain(task, RT_MUTEX_MIN_CHAINWALK, NULL,
+ next_lock, NULL, task);
+}
+
+/*
+ * Performs the wakeup of the top-waiter and re-enables preemption.
+ */
+void __sched rt_mutex_postunlock(struct rt_wake_q_head *wqh)
+{
+ rt_mutex_wake_up_q(wqh);
+}
+
+#ifdef CONFIG_DEBUG_RT_MUTEXES
+void rt_mutex_debug_task_free(struct task_struct *task)
+{
+ DEBUG_LOCKS_WARN_ON(!RB_EMPTY_ROOT(&task->pi_waiters.rb_root));
+ DEBUG_LOCKS_WARN_ON(task->pi_blocked_on);
+}
+#endif
+
+#ifdef CONFIG_PREEMPT_RT
+/* Mutexes */
+void __mutex_rt_init(struct mutex *mutex, const char *name,
+ struct lock_class_key *key)
+{
+ debug_check_no_locks_freed((void *)mutex, sizeof(*mutex));
+ lockdep_init_map_wait(&mutex->dep_map, name, key, 0, LD_WAIT_SLEEP);
+}
+EXPORT_SYMBOL(__mutex_rt_init);
+
+static __always_inline int __mutex_lock_common(struct mutex *lock,
+ unsigned int state,
+ unsigned int subclass,
+ struct lockdep_map *nest_lock,
+ unsigned long ip)
+{
+ int ret;
+
+ might_sleep();
+ mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip);
+ ret = __rt_mutex_lock(&lock->rtmutex, state);
+ if (ret)
+ mutex_release(&lock->dep_map, ip);
+ else
+ lock_acquired(&lock->dep_map, ip);
+ return ret;
+}
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+void __sched mutex_lock_nested(struct mutex *lock, unsigned int subclass)
+{
+ __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, NULL, _RET_IP_);
+}
+EXPORT_SYMBOL_GPL(mutex_lock_nested);
+
+void __sched _mutex_lock_nest_lock(struct mutex *lock,
+ struct lockdep_map *nest_lock)
+{
+ __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, nest_lock, _RET_IP_);
+}
+EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock);
+
+int __sched mutex_lock_interruptible_nested(struct mutex *lock,
+ unsigned int subclass)
+{
+ return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, subclass, NULL, _RET_IP_);
+}
+EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested);
+
+int __sched mutex_lock_killable_nested(struct mutex *lock,
+ unsigned int subclass)
+{
+ return __mutex_lock_common(lock, TASK_KILLABLE, subclass, NULL, _RET_IP_);
+}
+EXPORT_SYMBOL_GPL(mutex_lock_killable_nested);
+
+void __sched mutex_lock_io_nested(struct mutex *lock, unsigned int subclass)
+{
+ int token;
+
+ might_sleep();
+
+ token = io_schedule_prepare();
+ __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, NULL, _RET_IP_);
+ io_schedule_finish(token);
+}
+EXPORT_SYMBOL_GPL(mutex_lock_io_nested);
+
+#else /* CONFIG_DEBUG_LOCK_ALLOC */
+
+void __sched mutex_lock(struct mutex *lock)
+{
+ __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, NULL, _RET_IP_);
+}
+EXPORT_SYMBOL(mutex_lock);
+
+int __sched mutex_lock_interruptible(struct mutex *lock)
+{
+ return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, NULL, _RET_IP_);
+}
+EXPORT_SYMBOL(mutex_lock_interruptible);
+
+int __sched mutex_lock_killable(struct mutex *lock)
+{
+ return __mutex_lock_common(lock, TASK_KILLABLE, 0, NULL, _RET_IP_);
+}
+EXPORT_SYMBOL(mutex_lock_killable);
+
+void __sched mutex_lock_io(struct mutex *lock)
+{
+ int token = io_schedule_prepare();
+
+ __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, NULL, _RET_IP_);
+ io_schedule_finish(token);
+}
+EXPORT_SYMBOL(mutex_lock_io);
+#endif /* !CONFIG_DEBUG_LOCK_ALLOC */
+
+int __sched mutex_trylock(struct mutex *lock)
+{
+ int ret;
+
+ if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES) && WARN_ON_ONCE(!in_task()))
+ return 0;
+
+ ret = __rt_mutex_trylock(&lock->rtmutex);
+ if (ret)
+ mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
+
+ return ret;
+}
+EXPORT_SYMBOL(mutex_trylock);
+
+void __sched mutex_unlock(struct mutex *lock)
+{
+ mutex_release(&lock->dep_map, _RET_IP_);
+ __rt_mutex_unlock(&lock->rtmutex);
+}
+EXPORT_SYMBOL(mutex_unlock);
+
+#endif /* CONFIG_PREEMPT_RT */
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index a90c22abdbca..c47e8361bfb5 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -25,29 +25,90 @@
* @pi_tree_entry: pi node to enqueue into the mutex owner waiters tree
* @task: task reference to the blocked task
* @lock: Pointer to the rt_mutex on which the waiter blocks
+ * @wake_state: Wakeup state to use (TASK_NORMAL or TASK_RTLOCK_WAIT)
* @prio: Priority of the waiter
* @deadline: Deadline of the waiter if applicable
+ * @ww_ctx: WW context pointer
*/
struct rt_mutex_waiter {
struct rb_node tree_entry;
struct rb_node pi_tree_entry;
struct task_struct *task;
- struct rt_mutex *lock;
+ struct rt_mutex_base *lock;
+ unsigned int wake_state;
int prio;
u64 deadline;
+ struct ww_acquire_ctx *ww_ctx;
};
+/**
+ * rt_wake_q_head - Wrapper around regular wake_q_head to support
+ * "sleeping" spinlocks on RT
+ * @head: The regular wake_q_head for sleeping lock variants
+ * @rtlock_task: Task pointer for RT lock (spin/rwlock) wakeups
+ */
+struct rt_wake_q_head {
+ struct wake_q_head head;
+ struct task_struct *rtlock_task;
+};
+
+#define DEFINE_RT_WAKE_Q(name) \
+ struct rt_wake_q_head name = { \
+ .head = WAKE_Q_HEAD_INITIALIZER(name.head), \
+ .rtlock_task = NULL, \
+ }
+
+/*
+ * PI-futex support (proxy locking functions, etc.):
+ */
+extern void rt_mutex_init_proxy_locked(struct rt_mutex_base *lock,
+ struct task_struct *proxy_owner);
+extern void rt_mutex_proxy_unlock(struct rt_mutex_base *lock);
+extern int __rt_mutex_start_proxy_lock(struct rt_mutex_base *lock,
+ struct rt_mutex_waiter *waiter,
+ struct task_struct *task);
+extern int rt_mutex_start_proxy_lock(struct rt_mutex_base *lock,
+ struct rt_mutex_waiter *waiter,
+ struct task_struct *task);
+extern int rt_mutex_wait_proxy_lock(struct rt_mutex_base *lock,
+ struct hrtimer_sleeper *to,
+ struct rt_mutex_waiter *waiter);
+extern bool rt_mutex_cleanup_proxy_lock(struct rt_mutex_base *lock,
+ struct rt_mutex_waiter *waiter);
+
+extern int rt_mutex_futex_trylock(struct rt_mutex_base *l);
+extern int __rt_mutex_futex_trylock(struct rt_mutex_base *l);
+
+extern void rt_mutex_futex_unlock(struct rt_mutex_base *lock);
+extern bool __rt_mutex_futex_unlock(struct rt_mutex_base *lock,
+ struct rt_wake_q_head *wqh);
+
+extern void rt_mutex_postunlock(struct rt_wake_q_head *wqh);
+
/*
* Must be guarded because this header is included from rcu/tree_plugin.h
* unconditionally.
*/
#ifdef CONFIG_RT_MUTEXES
-static inline int rt_mutex_has_waiters(struct rt_mutex *lock)
+static inline int rt_mutex_has_waiters(struct rt_mutex_base *lock)
{
return !RB_EMPTY_ROOT(&lock->waiters.rb_root);
}
-static inline struct rt_mutex_waiter *rt_mutex_top_waiter(struct rt_mutex *lock)
+/*
+ * Lockless speculative check whether @waiter is still the top waiter on
+ * @lock. This is solely comparing pointers and not derefencing the
+ * leftmost entry which might be about to vanish.
+ */
+static inline bool rt_mutex_waiter_is_top_waiter(struct rt_mutex_base *lock,
+ struct rt_mutex_waiter *waiter)
+{
+ struct rb_node *leftmost = rb_first_cached(&lock->waiters);
+
+ return rb_entry(leftmost, struct rt_mutex_waiter, tree_entry) == waiter;
+}
+
+static inline struct rt_mutex_waiter *rt_mutex_top_waiter(struct rt_mutex_base *lock)
{
struct rb_node *leftmost = rb_first_cached(&lock->waiters);
struct rt_mutex_waiter *w = NULL;
@@ -72,19 +133,12 @@ static inline struct rt_mutex_waiter *task_top_pi_waiter(struct task_struct *p)
#define RT_MUTEX_HAS_WAITERS 1UL
-static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock)
+static inline struct task_struct *rt_mutex_owner(struct rt_mutex_base *lock)
{
unsigned long owner = (unsigned long) READ_ONCE(lock->owner);
return (struct task_struct *) (owner & ~RT_MUTEX_HAS_WAITERS);
}
-#else /* CONFIG_RT_MUTEXES */
-/* Used in rcu/tree_plugin.h */
-static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock)
-{
- return NULL;
-}
-#endif /* !CONFIG_RT_MUTEXES */
/*
* Constants for rt mutex functions which have a selectable deadlock
@@ -101,49 +155,21 @@ enum rtmutex_chainwalk {
RT_MUTEX_FULL_CHAINWALK,
};
-static inline void __rt_mutex_basic_init(struct rt_mutex *lock)
+static inline void __rt_mutex_base_init(struct rt_mutex_base *lock)
{
- lock->owner = NULL;
raw_spin_lock_init(&lock->wait_lock);
lock->waiters = RB_ROOT_CACHED;
+ lock->owner = NULL;
}
-/*
- * PI-futex support (proxy locking functions, etc.):
- */
-extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
- struct task_struct *proxy_owner);
-extern void rt_mutex_proxy_unlock(struct rt_mutex *lock);
-extern void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter);
-extern int __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
- struct rt_mutex_waiter *waiter,
- struct task_struct *task);
-extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
- struct rt_mutex_waiter *waiter,
- struct task_struct *task);
-extern int rt_mutex_wait_proxy_lock(struct rt_mutex *lock,
- struct hrtimer_sleeper *to,
- struct rt_mutex_waiter *waiter);
-extern bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock,
- struct rt_mutex_waiter *waiter);
-
-extern int rt_mutex_futex_trylock(struct rt_mutex *l);
-extern int __rt_mutex_futex_trylock(struct rt_mutex *l);
-
-extern void rt_mutex_futex_unlock(struct rt_mutex *lock);
-extern bool __rt_mutex_futex_unlock(struct rt_mutex *lock,
- struct wake_q_head *wqh);
-
-extern void rt_mutex_postunlock(struct wake_q_head *wake_q);
-
/* Debug functions */
-static inline void debug_rt_mutex_unlock(struct rt_mutex *lock)
+static inline void debug_rt_mutex_unlock(struct rt_mutex_base *lock)
{
if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES))
DEBUG_LOCKS_WARN_ON(rt_mutex_owner(lock) != current);
}
-static inline void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock)
+static inline void debug_rt_mutex_proxy_unlock(struct rt_mutex_base *lock)
{
if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES))
DEBUG_LOCKS_WARN_ON(!rt_mutex_owner(lock));
@@ -161,4 +187,27 @@ static inline void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter)
memset(waiter, 0x22, sizeof(*waiter));
}
+static inline void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
+{
+ debug_rt_mutex_init_waiter(waiter);
+ RB_CLEAR_NODE(&waiter->pi_tree_entry);
+ RB_CLEAR_NODE(&waiter->tree_entry);
+ waiter->wake_state = TASK_NORMAL;
+ waiter->task = NULL;
+}
+
+static inline void rt_mutex_init_rtlock_waiter(struct rt_mutex_waiter *waiter)
+{
+ rt_mutex_init_waiter(waiter);
+ waiter->wake_state = TASK_RTLOCK_WAIT;
+}
+
+#else /* CONFIG_RT_MUTEXES */
+/* Used in rcu/tree_plugin.h */
+static inline struct task_struct *rt_mutex_owner(struct rt_mutex_base *lock)
+{
+ return NULL;
+}
+#endif /* !CONFIG_RT_MUTEXES */
+
#endif
diff --git a/kernel/locking/rwbase_rt.c b/kernel/locking/rwbase_rt.c
new file mode 100644
index 000000000000..25ec0239477c
--- /dev/null
+++ b/kernel/locking/rwbase_rt.c
@@ -0,0 +1,289 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+/*
+ * RT-specific reader/writer semaphores and reader/writer locks
+ *
+ * down_write/write_lock()
+ * 1) Lock rtmutex
+ * 2) Remove the reader BIAS to force readers into the slow path
+ * 3) Wait until all readers have left the critical section
+ * 4) Mark it write locked
+ *
+ * up_write/write_unlock()
+ * 1) Remove the write locked marker
+ * 2) Set the reader BIAS, so readers can use the fast path again
+ * 3) Unlock rtmutex, to release blocked readers
+ *
+ * down_read/read_lock()
+ * 1) Try fast path acquisition (reader BIAS is set)
+ * 2) Take tmutex::wait_lock, which protects the writelocked flag
+ * 3) If !writelocked, acquire it for read
+ * 4) If writelocked, block on tmutex
+ * 5) unlock rtmutex, goto 1)
+ *
+ * up_read/read_unlock()
+ * 1) Try fast path release (reader count != 1)
+ * 2) Wake the writer waiting in down_write()/write_lock() #3
+ *
+ * down_read/read_lock()#3 has the consequence, that rw semaphores and rw
+ * locks on RT are not writer fair, but writers, which should be avoided in
+ * RT tasks (think mmap_sem), are subject to the rtmutex priority/DL
+ * inheritance mechanism.
+ *
+ * It's possible to make the rw primitives writer fair by keeping a list of
+ * active readers. A blocked writer would force all newly incoming readers
+ * to block on the rtmutex, but the rtmutex would have to be proxy locked
+ * for one reader after the other. We can't use multi-reader inheritance
+ * because there is no way to support that with SCHED_DEADLINE.
+ * Implementing the one by one reader boosting/handover mechanism is a
+ * major surgery for a very dubious value.
+ *
+ * The risk of writer starvation is there, but the pathological use cases
+ * which trigger it are not necessarily the typical RT workloads.
+ *
+ * Fast-path orderings:
+ * The lock/unlock of readers can run in fast paths: lock and unlock are only
+ * atomic ops, and there is no inner lock to provide ACQUIRE and RELEASE
+ * semantics of rwbase_rt. Atomic ops should thus provide _acquire()
+ * and _release() (or stronger).
+ *
+ * Common code shared between RT rw_semaphore and rwlock
+ */
+
+static __always_inline int rwbase_read_trylock(struct rwbase_rt *rwb)
+{
+ int r;
+
+ /*
+ * Increment reader count, if sem->readers < 0, i.e. READER_BIAS is
+ * set.
+ */
+ for (r = atomic_read(&rwb->readers); r < 0;) {
+ if (likely(atomic_try_cmpxchg_acquire(&rwb->readers, &r, r + 1)))
+ return 1;
+ }
+ return 0;
+}
+
+static int __sched __rwbase_read_lock(struct rwbase_rt *rwb,
+ unsigned int state)
+{
+ struct rt_mutex_base *rtm = &rwb->rtmutex;
+ int ret;
+
+ raw_spin_lock_irq(&rtm->wait_lock);
+
+ /*
+ * Call into the slow lock path with the rtmutex->wait_lock
+ * held, so this can't result in the following race:
+ *
+ * Reader1 Reader2 Writer
+ * down_read()
+ * down_write()
+ * rtmutex_lock(m)
+ * wait()
+ * down_read()
+ * unlock(m->wait_lock)
+ * up_read()
+ * wake(Writer)
+ * lock(m->wait_lock)
+ * sem->writelocked=true
+ * unlock(m->wait_lock)
+ *
+ * up_write()
+ * sem->writelocked=false
+ * rtmutex_unlock(m)
+ * down_read()
+ * down_write()
+ * rtmutex_lock(m)
+ * wait()
+ * rtmutex_lock(m)
+ *
+ * That would put Reader1 behind the writer waiting on
+ * Reader2 to call up_read(), which might be unbound.
+ */
+
+ trace_contention_begin(rwb, LCB_F_RT | LCB_F_READ);
+
+ /*
+ * For rwlocks this returns 0 unconditionally, so the below
+ * !ret conditionals are optimized out.
+ */
+ ret = rwbase_rtmutex_slowlock_locked(rtm, state);
+
+ /*
+ * On success the rtmutex is held, so there can't be a writer
+ * active. Increment the reader count and immediately drop the
+ * rtmutex again.
+ *
+ * rtmutex->wait_lock has to be unlocked in any case of course.
+ */
+ if (!ret)
+ atomic_inc(&rwb->readers);
+ raw_spin_unlock_irq(&rtm->wait_lock);
+ if (!ret)
+ rwbase_rtmutex_unlock(rtm);
+
+ trace_contention_end(rwb, ret);
+ return ret;
+}
+
+static __always_inline int rwbase_read_lock(struct rwbase_rt *rwb,
+ unsigned int state)
+{
+ if (rwbase_read_trylock(rwb))
+ return 0;
+
+ return __rwbase_read_lock(rwb, state);
+}
+
+static void __sched __rwbase_read_unlock(struct rwbase_rt *rwb,
+ unsigned int state)
+{
+ struct rt_mutex_base *rtm = &rwb->rtmutex;
+ struct task_struct *owner;
+ DEFINE_RT_WAKE_Q(wqh);
+
+ raw_spin_lock_irq(&rtm->wait_lock);
+ /*
+ * Wake the writer, i.e. the rtmutex owner. It might release the
+ * rtmutex concurrently in the fast path (due to a signal), but to
+ * clean up rwb->readers it needs to acquire rtm->wait_lock. The
+ * worst case which can happen is a spurious wakeup.
+ */
+ owner = rt_mutex_owner(rtm);
+ if (owner)
+ rt_mutex_wake_q_add_task(&wqh, owner, state);
+
+ /* Pairs with the preempt_enable in rt_mutex_wake_up_q() */
+ preempt_disable();
+ raw_spin_unlock_irq(&rtm->wait_lock);
+ rt_mutex_wake_up_q(&wqh);
+}
+
+static __always_inline void rwbase_read_unlock(struct rwbase_rt *rwb,
+ unsigned int state)
+{
+ /*
+ * rwb->readers can only hit 0 when a writer is waiting for the
+ * active readers to leave the critical section.
+ *
+ * dec_and_test() is fully ordered, provides RELEASE.
+ */
+ if (unlikely(atomic_dec_and_test(&rwb->readers)))
+ __rwbase_read_unlock(rwb, state);
+}
+
+static inline void __rwbase_write_unlock(struct rwbase_rt *rwb, int bias,
+ unsigned long flags)
+{
+ struct rt_mutex_base *rtm = &rwb->rtmutex;
+
+ /*
+ * _release() is needed in case that reader is in fast path, pairing
+ * with atomic_try_cmpxchg_acquire() in rwbase_read_trylock().
+ */
+ (void)atomic_add_return_release(READER_BIAS - bias, &rwb->readers);
+ raw_spin_unlock_irqrestore(&rtm->wait_lock, flags);
+ rwbase_rtmutex_unlock(rtm);
+}
+
+static inline void rwbase_write_unlock(struct rwbase_rt *rwb)
+{
+ struct rt_mutex_base *rtm = &rwb->rtmutex;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&rtm->wait_lock, flags);
+ __rwbase_write_unlock(rwb, WRITER_BIAS, flags);
+}
+
+static inline void rwbase_write_downgrade(struct rwbase_rt *rwb)
+{
+ struct rt_mutex_base *rtm = &rwb->rtmutex;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&rtm->wait_lock, flags);
+ /* Release it and account current as reader */
+ __rwbase_write_unlock(rwb, WRITER_BIAS - 1, flags);
+}
+
+static inline bool __rwbase_write_trylock(struct rwbase_rt *rwb)
+{
+ /* Can do without CAS because we're serialized by wait_lock. */
+ lockdep_assert_held(&rwb->rtmutex.wait_lock);
+
+ /*
+ * _acquire is needed in case the reader is in the fast path, pairing
+ * with rwbase_read_unlock(), provides ACQUIRE.
+ */
+ if (!atomic_read_acquire(&rwb->readers)) {
+ atomic_set(&rwb->readers, WRITER_BIAS);
+ return 1;
+ }
+
+ return 0;
+}
+
+static int __sched rwbase_write_lock(struct rwbase_rt *rwb,
+ unsigned int state)
+{
+ struct rt_mutex_base *rtm = &rwb->rtmutex;
+ unsigned long flags;
+
+ /* Take the rtmutex as a first step */
+ if (rwbase_rtmutex_lock_state(rtm, state))
+ return -EINTR;
+
+ /* Force readers into slow path */
+ atomic_sub(READER_BIAS, &rwb->readers);
+
+ raw_spin_lock_irqsave(&rtm->wait_lock, flags);
+ if (__rwbase_write_trylock(rwb))
+ goto out_unlock;
+
+ rwbase_set_and_save_current_state(state);
+ trace_contention_begin(rwb, LCB_F_RT | LCB_F_WRITE);
+ for (;;) {
+ /* Optimized out for rwlocks */
+ if (rwbase_signal_pending_state(state, current)) {
+ rwbase_restore_current_state();
+ __rwbase_write_unlock(rwb, 0, flags);
+ trace_contention_end(rwb, -EINTR);
+ return -EINTR;
+ }
+
+ if (__rwbase_write_trylock(rwb))
+ break;
+
+ raw_spin_unlock_irqrestore(&rtm->wait_lock, flags);
+ rwbase_schedule();
+ raw_spin_lock_irqsave(&rtm->wait_lock, flags);
+
+ set_current_state(state);
+ }
+ rwbase_restore_current_state();
+ trace_contention_end(rwb, 0);
+
+out_unlock:
+ raw_spin_unlock_irqrestore(&rtm->wait_lock, flags);
+ return 0;
+}
+
+static inline int rwbase_write_trylock(struct rwbase_rt *rwb)
+{
+ struct rt_mutex_base *rtm = &rwb->rtmutex;
+ unsigned long flags;
+
+ if (!rwbase_rtmutex_trylock(rtm))
+ return 0;
+
+ atomic_sub(READER_BIAS, &rwb->readers);
+
+ raw_spin_lock_irqsave(&rtm->wait_lock, flags);
+ if (__rwbase_write_trylock(rwb)) {
+ raw_spin_unlock_irqrestore(&rtm->wait_lock, flags);
+ return 1;
+ }
+ __rwbase_write_unlock(rwb, 0, flags);
+ return 0;
+}
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 16bfbb10c74d..9eabd585ce7a 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -27,7 +27,9 @@
#include <linux/export.h>
#include <linux/rwsem.h>
#include <linux/atomic.h>
+#include <trace/events/lock.h>
+#ifndef CONFIG_PREEMPT_RT
#include "lock_events.h"
/*
@@ -55,7 +57,6 @@
*
* A fast path reader optimistic lock stealing is supported when the rwsem
* is previously owned by a writer and the following conditions are met:
- * - OSQ is empty
* - rwsem is not currently writer owned
* - the handoff isn't set.
*/
@@ -105,9 +106,9 @@
* atomic_long_cmpxchg() will be used to obtain writer lock.
*
* There are three places where the lock handoff bit may be set or cleared.
- * 1) rwsem_mark_wake() for readers.
- * 2) rwsem_try_write_lock() for writers.
- * 3) Error path of rwsem_down_write_slowpath().
+ * 1) rwsem_mark_wake() for readers -- set, clear
+ * 2) rwsem_try_write_lock() for writers -- set, clear
+ * 3) rwsem_del_waiter() -- clear
*
* For all the above cases, wait_lock will be held. A writer must also
* be the first one in the wait_list to be eligible for setting the handoff
@@ -132,14 +133,19 @@
* the owner value concurrently without lock. Read from owner, however,
* may not need READ_ONCE() as long as the pointer value is only used
* for comparison and isn't being dereferenced.
+ *
+ * Both rwsem_{set,clear}_owner() functions should be in the same
+ * preempt disable section as the atomic op that changes sem->count.
*/
static inline void rwsem_set_owner(struct rw_semaphore *sem)
{
+ lockdep_assert_preemption_disabled();
atomic_long_set(&sem->owner, (long)current);
}
static inline void rwsem_clear_owner(struct rw_semaphore *sem)
{
+ lockdep_assert_preemption_disabled();
atomic_long_set(&sem->owner, 0);
}
@@ -334,6 +340,7 @@ struct rwsem_waiter {
struct task_struct *task;
enum rwsem_waiter_type type;
unsigned long timeout;
+ bool handoff_set;
};
#define rwsem_first_waiter(sem) \
list_first_entry(&sem->wait_list, struct rwsem_waiter, list)
@@ -344,12 +351,6 @@ enum rwsem_wake_type {
RWSEM_WAKE_READ_OWNED /* Waker thread holds the read lock */
};
-enum writer_wait_state {
- WRITER_NOT_FIRST, /* Writer is not first in wait list */
- WRITER_FIRST, /* Writer is first in wait list */
- WRITER_HANDOFF /* Writer is first & handoff needed */
-};
-
/*
* The typical HZ value is either 250 or 1000. So set the minimum waiting
* time to at least 4ms or 1 jiffy (if it is higher than 4ms) in the wait
@@ -365,6 +366,34 @@ enum writer_wait_state {
*/
#define MAX_READERS_WAKEUP 0x100
+static inline void
+rwsem_add_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter)
+{
+ lockdep_assert_held(&sem->wait_lock);
+ list_add_tail(&waiter->list, &sem->wait_list);
+ /* caller will set RWSEM_FLAG_WAITERS */
+}
+
+/*
+ * Remove a waiter from the wait_list and clear flags.
+ *
+ * Both rwsem_mark_wake() and rwsem_try_write_lock() contain a full 'copy' of
+ * this function. Modify with care.
+ *
+ * Return: true if wait_list isn't empty and false otherwise
+ */
+static inline bool
+rwsem_del_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter)
+{
+ lockdep_assert_held(&sem->wait_lock);
+ list_del(&waiter->list);
+ if (likely(!list_empty(&sem->wait_list)))
+ return true;
+
+ atomic_long_andnot(RWSEM_FLAG_HANDOFF | RWSEM_FLAG_WAITERS, &sem->count);
+ return false;
+}
+
/*
* handle the lock release when processes blocked on it that can now run
* - if we come here from up_xxxx(), then the RWSEM_FLAG_WAITERS bit must
@@ -376,6 +405,8 @@ enum writer_wait_state {
* preferably when the wait_lock is released
* - woken process blocks are discarded from the list after having task zeroed
* - writers are only marked woken if downgrading is false
+ *
+ * Implies rwsem_del_waiter() for all woken readers.
*/
static void rwsem_mark_wake(struct rw_semaphore *sem,
enum rwsem_wake_type wake_type,
@@ -431,10 +462,12 @@ static void rwsem_mark_wake(struct rw_semaphore *sem,
* to give up the lock), request a HANDOFF to
* force the issue.
*/
- if (!(oldcount & RWSEM_FLAG_HANDOFF) &&
- time_after(jiffies, waiter->timeout)) {
- adjustment -= RWSEM_FLAG_HANDOFF;
- lockevent_inc(rwsem_rlock_handoff);
+ if (time_after(jiffies, waiter->timeout)) {
+ if (!(oldcount & RWSEM_FLAG_HANDOFF)) {
+ adjustment -= RWSEM_FLAG_HANDOFF;
+ lockevent_inc(rwsem_rlock_handoff);
+ }
+ waiter->handoff_set = true;
}
atomic_long_add(-adjustment, &sem->count);
@@ -484,24 +517,31 @@ static void rwsem_mark_wake(struct rw_semaphore *sem,
/*
* Limit # of readers that can be woken up per wakeup call.
*/
- if (woken >= MAX_READERS_WAKEUP)
+ if (unlikely(woken >= MAX_READERS_WAKEUP))
break;
}
adjustment = woken * RWSEM_READER_BIAS - adjustment;
lockevent_cond_inc(rwsem_wake_reader, woken);
+
+ oldcount = atomic_long_read(&sem->count);
if (list_empty(&sem->wait_list)) {
- /* hit end of list above */
+ /*
+ * Combined with list_move_tail() above, this implies
+ * rwsem_del_waiter().
+ */
adjustment -= RWSEM_FLAG_WAITERS;
+ if (oldcount & RWSEM_FLAG_HANDOFF)
+ adjustment -= RWSEM_FLAG_HANDOFF;
+ } else if (woken) {
+ /*
+ * When we've woken a reader, we no longer need to force
+ * writers to give up the lock and we can clear HANDOFF.
+ */
+ if (oldcount & RWSEM_FLAG_HANDOFF)
+ adjustment -= RWSEM_FLAG_HANDOFF;
}
- /*
- * When we've woken a reader, we no longer need to force writers
- * to give up the lock and we can clear HANDOFF.
- */
- if (woken && (atomic_long_read(&sem->count) & RWSEM_FLAG_HANDOFF))
- adjustment -= RWSEM_FLAG_HANDOFF;
-
if (adjustment)
atomic_long_add(adjustment, &sem->count);
@@ -528,16 +568,43 @@ static void rwsem_mark_wake(struct rw_semaphore *sem,
}
/*
+ * Remove a waiter and try to wake up other waiters in the wait queue
+ * This function is called from the out_nolock path of both the reader and
+ * writer slowpaths with wait_lock held. It releases the wait_lock and
+ * optionally wake up waiters before it returns.
+ */
+static inline void
+rwsem_del_wake_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter,
+ struct wake_q_head *wake_q)
+ __releases(&sem->wait_lock)
+{
+ bool first = rwsem_first_waiter(sem) == waiter;
+
+ wake_q_init(wake_q);
+
+ /*
+ * If the wait_list isn't empty and the waiter to be deleted is
+ * the first waiter, we wake up the remaining waiters as they may
+ * be eligible to acquire or spin on the lock.
+ */
+ if (rwsem_del_waiter(sem, waiter) && first)
+ rwsem_mark_wake(sem, RWSEM_WAKE_ANY, wake_q);
+ raw_spin_unlock_irq(&sem->wait_lock);
+ if (!wake_q_empty(wake_q))
+ wake_up_q(wake_q);
+}
+
+/*
* This function must be called with the sem->wait_lock held to prevent
* race conditions between checking the rwsem wait list and setting the
* sem->count accordingly.
*
- * If wstate is WRITER_HANDOFF, it will make sure that either the handoff
- * bit is set or the lock is acquired with handoff bit cleared.
+ * Implies rwsem_del_waiter() on success.
*/
static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
- enum writer_wait_state wstate)
+ struct rwsem_waiter *waiter)
{
+ struct rwsem_waiter *first = rwsem_first_waiter(sem);
long count, new;
lockdep_assert_held(&sem->wait_lock);
@@ -546,13 +613,26 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
do {
bool has_handoff = !!(count & RWSEM_FLAG_HANDOFF);
- if (has_handoff && wstate == WRITER_NOT_FIRST)
- return false;
+ if (has_handoff) {
+ /*
+ * Honor handoff bit and yield only when the first
+ * waiter is the one that set it. Otherwisee, we
+ * still try to acquire the rwsem.
+ */
+ if (first->handoff_set && (waiter != first))
+ return false;
+ }
new = count;
if (count & RWSEM_LOCK_MASK) {
- if (has_handoff || (wstate != WRITER_HANDOFF))
+ /*
+ * A waiter (first or not) can set the handoff bit
+ * if it is an RT task or wait in the wait queue
+ * for too long.
+ */
+ if (has_handoff || (!rt_task(waiter->task) &&
+ !time_after(jiffies, waiter->timeout)))
return false;
new |= RWSEM_FLAG_HANDOFF;
@@ -566,16 +646,43 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
} while (!atomic_long_try_cmpxchg_acquire(&sem->count, &count, new));
/*
- * We have either acquired the lock with handoff bit cleared or
- * set the handoff bit.
+ * We have either acquired the lock with handoff bit cleared or set
+ * the handoff bit. Only the first waiter can have its handoff_set
+ * set here to enable optimistic spinning in slowpath loop.
*/
- if (new & RWSEM_FLAG_HANDOFF)
+ if (new & RWSEM_FLAG_HANDOFF) {
+ first->handoff_set = true;
+ lockevent_inc(rwsem_wlock_handoff);
return false;
+ }
+ /*
+ * Have rwsem_try_write_lock() fully imply rwsem_del_waiter() on
+ * success.
+ */
+ list_del(&waiter->list);
rwsem_set_owner(sem);
return true;
}
+/*
+ * The rwsem_spin_on_owner() function returns the following 4 values
+ * depending on the lock owner state.
+ * OWNER_NULL : owner is currently NULL
+ * OWNER_WRITER: when owner changes and is a writer
+ * OWNER_READER: when owner changes and the new owner may be a reader.
+ * OWNER_NONSPINNABLE:
+ * when optimistic spinning has to stop because either the
+ * owner stops running, is unknown, or its timeslice has
+ * been used up.
+ */
+enum owner_state {
+ OWNER_NULL = 1 << 0,
+ OWNER_WRITER = 1 << 1,
+ OWNER_READER = 1 << 2,
+ OWNER_NONSPINNABLE = 1 << 3,
+};
+
#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
/*
* Try to acquire write lock before the writer has been put on wait queue.
@@ -595,15 +702,6 @@ static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
return false;
}
-static inline bool owner_on_cpu(struct task_struct *owner)
-{
- /*
- * As lock holder preemption issue, we both skip spinning if
- * task is not on cpu or its cpu is preempted
- */
- return owner->on_cpu && !vcpu_is_preempted(task_cpu(owner));
-}
-
static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
{
struct task_struct *owner;
@@ -615,8 +713,10 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
return false;
}
- preempt_disable();
- rcu_read_lock();
+ /*
+ * Disable preemption is equal to the RCU read-side crital section,
+ * thus the task_strcut structure won't go away.
+ */
owner = rwsem_owner_flags(sem, &flags);
/*
* Don't check the read-owner as the entry may be stale.
@@ -624,30 +724,11 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
if ((flags & RWSEM_NONSPINNABLE) ||
(owner && !(flags & RWSEM_READER_OWNED) && !owner_on_cpu(owner)))
ret = false;
- rcu_read_unlock();
- preempt_enable();
lockevent_cond_inc(rwsem_opt_fail, !ret);
return ret;
}
-/*
- * The rwsem_spin_on_owner() function returns the following 4 values
- * depending on the lock owner state.
- * OWNER_NULL : owner is currently NULL
- * OWNER_WRITER: when owner changes and is a writer
- * OWNER_READER: when owner changes and the new owner may be a reader.
- * OWNER_NONSPINNABLE:
- * when optimistic spinning has to stop because either the
- * owner stops running, is unknown, or its timeslice has
- * been used up.
- */
-enum owner_state {
- OWNER_NULL = 1 << 0,
- OWNER_WRITER = 1 << 1,
- OWNER_READER = 1 << 2,
- OWNER_NONSPINNABLE = 1 << 3,
-};
#define OWNER_SPINNABLE (OWNER_NULL | OWNER_WRITER | OWNER_READER)
static inline enum owner_state
@@ -669,12 +750,13 @@ rwsem_spin_on_owner(struct rw_semaphore *sem)
unsigned long flags, new_flags;
enum owner_state state;
+ lockdep_assert_preemption_disabled();
+
owner = rwsem_owner_flags(sem, &flags);
state = rwsem_owner_state(owner, flags);
if (state != OWNER_WRITER)
return state;
- rcu_read_lock();
for (;;) {
/*
* When a waiting writer set the handoff flag, it may spin
@@ -692,7 +774,9 @@ rwsem_spin_on_owner(struct rw_semaphore *sem)
* Ensure we emit the owner->on_cpu, dereference _after_
* checking sem->owner still matches owner, if that fails,
* owner might point to free()d memory, if it still matches,
- * the rcu_read_lock() ensures the memory stays valid.
+ * our spinning context already disabled preemption which is
+ * equal to RCU read-side crital section ensures the memory
+ * stays valid.
*/
barrier();
@@ -703,7 +787,6 @@ rwsem_spin_on_owner(struct rw_semaphore *sem)
cpu_relax();
}
- rcu_read_unlock();
return state;
}
@@ -740,8 +823,6 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
int loop = 0;
u64 rspin_threshold = 0;
- preempt_disable();
-
/* sem->wait_lock should not be held when doing optimistic spinning */
if (!osq_lock(&sem->osq))
goto done;
@@ -849,7 +930,6 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
}
osq_unlock(&sem->osq);
done:
- preempt_enable();
lockevent_cond_inc(rwsem_opt_fail, !taken);
return taken;
}
@@ -860,7 +940,7 @@ done:
*/
static inline void clear_nonspinnable(struct rw_semaphore *sem)
{
- if (rwsem_test_oflags(sem, RWSEM_NONSPINNABLE))
+ if (unlikely(rwsem_test_oflags(sem, RWSEM_NONSPINNABLE)))
atomic_long_andnot(RWSEM_NONSPINNABLE, &sem->owner);
}
@@ -877,15 +957,39 @@ static inline bool rwsem_optimistic_spin(struct rw_semaphore *sem)
static inline void clear_nonspinnable(struct rw_semaphore *sem) { }
-static inline int
+static inline enum owner_state
rwsem_spin_on_owner(struct rw_semaphore *sem)
{
- return 0;
+ return OWNER_NONSPINNABLE;
}
-#define OWNER_NULL 1
#endif
/*
+ * Prepare to wake up waiter(s) in the wait queue by putting them into the
+ * given wake_q if the rwsem lock owner isn't a writer. If rwsem is likely
+ * reader-owned, wake up read lock waiters in queue front or wake up any
+ * front waiter otherwise.
+
+ * This is being called from both reader and writer slow paths.
+ */
+static inline void rwsem_cond_wake_waiter(struct rw_semaphore *sem, long count,
+ struct wake_q_head *wake_q)
+{
+ enum rwsem_wake_type wake_type;
+
+ if (count & RWSEM_WRITER_MASK)
+ return;
+
+ if (count & RWSEM_READER_MASK) {
+ wake_type = RWSEM_WAKE_READERS;
+ } else {
+ wake_type = RWSEM_WAKE_ANY;
+ clear_nonspinnable(sem);
+ }
+ rwsem_mark_wake(sem, wake_type, wake_q);
+}
+
+/*
* Wait for the read lock to be granted
*/
static struct rw_semaphore __sched *
@@ -895,7 +999,6 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, long count, unsigned int stat
long rcnt = (count >> RWSEM_READER_SHIFT);
struct rwsem_waiter waiter;
DEFINE_WAKE_Q(wake_q);
- bool wake = false;
/*
* To prevent a constant stream of readers from starving a sleeping
@@ -932,17 +1035,17 @@ queue:
waiter.task = current;
waiter.type = RWSEM_WAITING_FOR_READ;
waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT;
+ waiter.handoff_set = false;
raw_spin_lock_irq(&sem->wait_lock);
if (list_empty(&sem->wait_list)) {
/*
* In case the wait queue is empty and the lock isn't owned
- * by a writer or has the handoff bit set, this reader can
- * exit the slowpath and return immediately as its
- * RWSEM_READER_BIAS has already been set in the count.
+ * by a writer, this reader can exit the slowpath and return
+ * immediately as its RWSEM_READER_BIAS has already been set
+ * in the count.
*/
- if (!(atomic_long_read(&sem->count) &
- (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF))) {
+ if (!(atomic_long_read(&sem->count) & RWSEM_WRITER_MASK)) {
/* Provide lock ACQUIRE */
smp_acquire__after_ctrl_dep();
raw_spin_unlock_irq(&sem->wait_lock);
@@ -952,27 +1055,18 @@ queue:
}
adjustment += RWSEM_FLAG_WAITERS;
}
- list_add_tail(&waiter.list, &sem->wait_list);
+ rwsem_add_waiter(sem, &waiter);
/* we're now waiting on the lock, but no longer actively locking */
count = atomic_long_add_return(adjustment, &sem->count);
- /*
- * If there are no active locks, wake the front queued process(es).
- *
- * If there are no writers and we are first in the queue,
- * wake our own waiter to join the existing active readers !
- */
- if (!(count & RWSEM_LOCK_MASK)) {
- clear_nonspinnable(sem);
- wake = true;
- }
- if (wake || (!(count & RWSEM_WRITER_MASK) &&
- (adjustment & RWSEM_FLAG_WAITERS)))
- rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
-
+ rwsem_cond_wake_waiter(sem, count, &wake_q);
raw_spin_unlock_irq(&sem->wait_lock);
- wake_up_q(&wake_q);
+
+ if (!wake_q_empty(&wake_q))
+ wake_up_q(&wake_q);
+
+ trace_contention_begin(sem, LCB_F_READ);
/* wait to be given the lock */
for (;;) {
@@ -989,36 +1083,30 @@ queue:
/* Ordered by sem->wait_lock against rwsem_mark_wake(). */
break;
}
- schedule();
+ schedule_preempt_disabled();
lockevent_inc(rwsem_sleep_reader);
}
__set_current_state(TASK_RUNNING);
lockevent_inc(rwsem_rlock);
+ trace_contention_end(sem, 0);
return sem;
out_nolock:
- list_del(&waiter.list);
- if (list_empty(&sem->wait_list)) {
- atomic_long_andnot(RWSEM_FLAG_WAITERS|RWSEM_FLAG_HANDOFF,
- &sem->count);
- }
- raw_spin_unlock_irq(&sem->wait_lock);
+ rwsem_del_wake_waiter(sem, &waiter, &wake_q);
__set_current_state(TASK_RUNNING);
lockevent_inc(rwsem_rlock_fail);
+ trace_contention_end(sem, -EINTR);
return ERR_PTR(-EINTR);
}
/*
* Wait until we successfully acquire the write lock
*/
-static struct rw_semaphore *
+static struct rw_semaphore __sched *
rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
{
- long count;
- enum writer_wait_state wstate;
struct rwsem_waiter waiter;
- struct rw_semaphore *ret = sem;
DEFINE_WAKE_Q(wake_q);
/* do optimistic spinning and steal lock if possible */
@@ -1034,33 +1122,15 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
waiter.task = current;
waiter.type = RWSEM_WAITING_FOR_WRITE;
waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT;
+ waiter.handoff_set = false;
raw_spin_lock_irq(&sem->wait_lock);
-
- /* account for this before adding a new element to the list */
- wstate = list_empty(&sem->wait_list) ? WRITER_FIRST : WRITER_NOT_FIRST;
-
- list_add_tail(&waiter.list, &sem->wait_list);
+ rwsem_add_waiter(sem, &waiter);
/* we're now waiting on the lock */
- if (wstate == WRITER_NOT_FIRST) {
- count = atomic_long_read(&sem->count);
-
- /*
- * If there were already threads queued before us and:
- * 1) there are no active locks, wake the front
- * queued process(es) as the handoff bit might be set.
- * 2) there are no active writers and some readers, the lock
- * must be read owned; so we try to wake any read lock
- * waiters that were queued ahead of us.
- */
- if (count & RWSEM_WRITER_MASK)
- goto wait;
-
- rwsem_mark_wake(sem, (count & RWSEM_READER_MASK)
- ? RWSEM_WAKE_READERS
- : RWSEM_WAKE_ANY, &wake_q);
-
+ if (rwsem_first_waiter(sem) != &waiter) {
+ rwsem_cond_wake_waiter(sem, atomic_long_read(&sem->count),
+ &wake_q);
if (!wake_q_empty(&wake_q)) {
/*
* We want to minimize wait_lock hold time especially
@@ -1068,24 +1138,27 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
*/
raw_spin_unlock_irq(&sem->wait_lock);
wake_up_q(&wake_q);
- wake_q_init(&wake_q); /* Used again, reinit */
raw_spin_lock_irq(&sem->wait_lock);
}
} else {
atomic_long_or(RWSEM_FLAG_WAITERS, &sem->count);
}
-wait:
/* wait until we successfully acquire the lock */
set_current_state(state);
+ trace_contention_begin(sem, LCB_F_WRITE);
+
for (;;) {
- if (rwsem_try_write_lock(sem, wstate)) {
+ if (rwsem_try_write_lock(sem, &waiter)) {
/* rwsem_try_write_lock() implies ACQUIRE on success */
break;
}
raw_spin_unlock_irq(&sem->wait_lock);
+ if (signal_pending_state(state, current))
+ goto out_nolock;
+
/*
* After setting the handoff bit and failing to acquire
* the lock, attempt to spin on owner to accelerate lock
@@ -1094,70 +1167,32 @@ wait:
* In this case, we attempt to acquire the lock again
* without sleeping.
*/
- if (wstate == WRITER_HANDOFF &&
- rwsem_spin_on_owner(sem) == OWNER_NULL)
- goto trylock_again;
+ if (waiter.handoff_set) {
+ enum owner_state owner_state;
- /* Block until there are no active lockers. */
- for (;;) {
- if (signal_pending_state(state, current))
- goto out_nolock;
-
- schedule();
- lockevent_inc(rwsem_sleep_writer);
- set_current_state(state);
- /*
- * If HANDOFF bit is set, unconditionally do
- * a trylock.
- */
- if (wstate == WRITER_HANDOFF)
- break;
-
- if ((wstate == WRITER_NOT_FIRST) &&
- (rwsem_first_waiter(sem) == &waiter))
- wstate = WRITER_FIRST;
-
- count = atomic_long_read(&sem->count);
- if (!(count & RWSEM_LOCK_MASK))
- break;
-
- /*
- * The setting of the handoff bit is deferred
- * until rwsem_try_write_lock() is called.
- */
- if ((wstate == WRITER_FIRST) && (rt_task(current) ||
- time_after(jiffies, waiter.timeout))) {
- wstate = WRITER_HANDOFF;
- lockevent_inc(rwsem_wlock_handoff);
- break;
- }
+ owner_state = rwsem_spin_on_owner(sem);
+ if (owner_state == OWNER_NULL)
+ goto trylock_again;
}
+
+ schedule_preempt_disabled();
+ lockevent_inc(rwsem_sleep_writer);
+ set_current_state(state);
trylock_again:
raw_spin_lock_irq(&sem->wait_lock);
}
__set_current_state(TASK_RUNNING);
- list_del(&waiter.list);
raw_spin_unlock_irq(&sem->wait_lock);
lockevent_inc(rwsem_wlock);
-
- return ret;
+ trace_contention_end(sem, 0);
+ return sem;
out_nolock:
__set_current_state(TASK_RUNNING);
raw_spin_lock_irq(&sem->wait_lock);
- list_del(&waiter.list);
-
- if (unlikely(wstate == WRITER_HANDOFF))
- atomic_long_add(-RWSEM_FLAG_HANDOFF, &sem->count);
-
- if (list_empty(&sem->wait_list))
- atomic_long_andnot(RWSEM_FLAG_WAITERS, &sem->count);
- else
- rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
- raw_spin_unlock_irq(&sem->wait_lock);
- wake_up_q(&wake_q);
+ rwsem_del_wake_waiter(sem, &waiter, &wake_q);
lockevent_inc(rwsem_wlock_fail);
-
+ trace_contention_end(sem, -EINTR);
return ERR_PTR(-EINTR);
}
@@ -1165,7 +1200,7 @@ out_nolock:
* handle waking up a waiter on the semaphore
* - up_read/up_write has decremented the active part of count if we come here
*/
-static struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem, long count)
+static struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
{
unsigned long flags;
DEFINE_WAKE_Q(wake_q);
@@ -1205,51 +1240,58 @@ static struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem)
/*
* lock for reading
*/
-static inline int __down_read_common(struct rw_semaphore *sem, int state)
+static __always_inline int __down_read_common(struct rw_semaphore *sem, int state)
{
+ int ret = 0;
long count;
+ preempt_disable();
if (!rwsem_read_trylock(sem, &count)) {
- if (IS_ERR(rwsem_down_read_slowpath(sem, count, state)))
- return -EINTR;
+ if (IS_ERR(rwsem_down_read_slowpath(sem, count, state))) {
+ ret = -EINTR;
+ goto out;
+ }
DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem);
}
- return 0;
+out:
+ preempt_enable();
+ return ret;
}
-static inline void __down_read(struct rw_semaphore *sem)
+static __always_inline void __down_read(struct rw_semaphore *sem)
{
__down_read_common(sem, TASK_UNINTERRUPTIBLE);
}
-static inline int __down_read_interruptible(struct rw_semaphore *sem)
+static __always_inline int __down_read_interruptible(struct rw_semaphore *sem)
{
return __down_read_common(sem, TASK_INTERRUPTIBLE);
}
-static inline int __down_read_killable(struct rw_semaphore *sem)
+static __always_inline int __down_read_killable(struct rw_semaphore *sem)
{
return __down_read_common(sem, TASK_KILLABLE);
}
static inline int __down_read_trylock(struct rw_semaphore *sem)
{
+ int ret = 0;
long tmp;
DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);
- /*
- * Optimize for the case when the rwsem is not locked at all.
- */
- tmp = RWSEM_UNLOCKED_VALUE;
- do {
+ preempt_disable();
+ tmp = atomic_long_read(&sem->count);
+ while (!(tmp & RWSEM_READ_FAILED_MASK)) {
if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp,
- tmp + RWSEM_READER_BIAS)) {
+ tmp + RWSEM_READER_BIAS)) {
rwsem_set_reader_owned(sem);
- return 1;
+ ret = 1;
+ break;
}
- } while (!(tmp & RWSEM_READ_FAILED_MASK));
- return 0;
+ }
+ preempt_enable();
+ return ret;
}
/*
@@ -1257,12 +1299,15 @@ static inline int __down_read_trylock(struct rw_semaphore *sem)
*/
static inline int __down_write_common(struct rw_semaphore *sem, int state)
{
+ int ret = 0;
+
+ preempt_disable();
if (unlikely(!rwsem_write_trylock(sem))) {
if (IS_ERR(rwsem_down_write_slowpath(sem, state)))
- return -EINTR;
+ ret = -EINTR;
}
-
- return 0;
+ preempt_enable();
+ return ret;
}
static inline void __down_write(struct rw_semaphore *sem)
@@ -1277,8 +1322,14 @@ static inline int __down_write_killable(struct rw_semaphore *sem)
static inline int __down_write_trylock(struct rw_semaphore *sem)
{
+ int ret;
+
+ preempt_disable();
DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);
- return rwsem_write_trylock(sem);
+ ret = rwsem_write_trylock(sem);
+ preempt_enable();
+
+ return ret;
}
/*
@@ -1291,14 +1342,16 @@ static inline void __up_read(struct rw_semaphore *sem)
DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);
DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem);
+ preempt_disable();
rwsem_clear_reader_owned(sem);
tmp = atomic_long_add_return_release(-RWSEM_READER_BIAS, &sem->count);
DEBUG_RWSEMS_WARN_ON(tmp < 0, sem);
if (unlikely((tmp & (RWSEM_LOCK_MASK|RWSEM_FLAG_WAITERS)) ==
RWSEM_FLAG_WAITERS)) {
clear_nonspinnable(sem);
- rwsem_wake(sem, tmp);
+ rwsem_wake(sem);
}
+ preempt_enable();
}
/*
@@ -1316,10 +1369,12 @@ static inline void __up_write(struct rw_semaphore *sem)
DEBUG_RWSEMS_WARN_ON((rwsem_owner(sem) != current) &&
!rwsem_test_oflags(sem, RWSEM_NONSPINNABLE), sem);
+ preempt_disable();
rwsem_clear_owner(sem);
tmp = atomic_long_fetch_add_release(-RWSEM_WRITER_LOCKED, &sem->count);
if (unlikely(tmp & RWSEM_FLAG_WAITERS))
- rwsem_wake(sem, tmp);
+ rwsem_wake(sem);
+ preempt_enable();
}
/*
@@ -1337,13 +1392,125 @@ static inline void __downgrade_write(struct rw_semaphore *sem)
* write side. As such, rely on RELEASE semantics.
*/
DEBUG_RWSEMS_WARN_ON(rwsem_owner(sem) != current, sem);
+ preempt_disable();
tmp = atomic_long_fetch_add_release(
-RWSEM_WRITER_LOCKED+RWSEM_READER_BIAS, &sem->count);
rwsem_set_reader_owned(sem);
if (tmp & RWSEM_FLAG_WAITERS)
rwsem_downgrade_wake(sem);
+ preempt_enable();
+}
+
+#else /* !CONFIG_PREEMPT_RT */
+
+#define RT_MUTEX_BUILD_MUTEX
+#include "rtmutex.c"
+
+#define rwbase_set_and_save_current_state(state) \
+ set_current_state(state)
+
+#define rwbase_restore_current_state() \
+ __set_current_state(TASK_RUNNING)
+
+#define rwbase_rtmutex_lock_state(rtm, state) \
+ __rt_mutex_lock(rtm, state)
+
+#define rwbase_rtmutex_slowlock_locked(rtm, state) \
+ __rt_mutex_slowlock_locked(rtm, NULL, state)
+
+#define rwbase_rtmutex_unlock(rtm) \
+ __rt_mutex_unlock(rtm)
+
+#define rwbase_rtmutex_trylock(rtm) \
+ __rt_mutex_trylock(rtm)
+
+#define rwbase_signal_pending_state(state, current) \
+ signal_pending_state(state, current)
+
+#define rwbase_schedule() \
+ schedule()
+
+#include "rwbase_rt.c"
+
+void __init_rwsem(struct rw_semaphore *sem, const char *name,
+ struct lock_class_key *key)
+{
+ init_rwbase_rt(&(sem)->rwbase);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ debug_check_no_locks_freed((void *)sem, sizeof(*sem));
+ lockdep_init_map_wait(&sem->dep_map, name, key, 0, LD_WAIT_SLEEP);
+#endif
+}
+EXPORT_SYMBOL(__init_rwsem);
+
+static inline void __down_read(struct rw_semaphore *sem)
+{
+ rwbase_read_lock(&sem->rwbase, TASK_UNINTERRUPTIBLE);
+}
+
+static inline int __down_read_interruptible(struct rw_semaphore *sem)
+{
+ return rwbase_read_lock(&sem->rwbase, TASK_INTERRUPTIBLE);
+}
+
+static inline int __down_read_killable(struct rw_semaphore *sem)
+{
+ return rwbase_read_lock(&sem->rwbase, TASK_KILLABLE);
+}
+
+static inline int __down_read_trylock(struct rw_semaphore *sem)
+{
+ return rwbase_read_trylock(&sem->rwbase);
+}
+
+static inline void __up_read(struct rw_semaphore *sem)
+{
+ rwbase_read_unlock(&sem->rwbase, TASK_NORMAL);
}
+static inline void __sched __down_write(struct rw_semaphore *sem)
+{
+ rwbase_write_lock(&sem->rwbase, TASK_UNINTERRUPTIBLE);
+}
+
+static inline int __sched __down_write_killable(struct rw_semaphore *sem)
+{
+ return rwbase_write_lock(&sem->rwbase, TASK_KILLABLE);
+}
+
+static inline int __down_write_trylock(struct rw_semaphore *sem)
+{
+ return rwbase_write_trylock(&sem->rwbase);
+}
+
+static inline void __up_write(struct rw_semaphore *sem)
+{
+ rwbase_write_unlock(&sem->rwbase);
+}
+
+static inline void __downgrade_write(struct rw_semaphore *sem)
+{
+ rwbase_write_downgrade(&sem->rwbase);
+}
+
+/* Debug stubs for the common API */
+#define DEBUG_RWSEMS_WARN_ON(c, sem)
+
+static inline void __rwsem_set_reader_owned(struct rw_semaphore *sem,
+ struct task_struct *owner)
+{
+}
+
+static inline bool is_rwsem_reader_owned(struct rw_semaphore *sem)
+{
+ int count = atomic_read(&sem->rwbase.readers);
+
+ return count < 0 && count != READER_BIAS;
+}
+
+#endif /* CONFIG_PREEMPT_RT */
+
/*
* lock for reading
*/
@@ -1506,6 +1673,12 @@ void down_read_non_owner(struct rw_semaphore *sem)
{
might_sleep();
__down_read(sem);
+ /*
+ * The owner value for a reader-owned lock is mostly for debugging
+ * purpose only and is not critical to the correct functioning of
+ * rwsem. So it is perfectly fine to set it in a preempt-enabled
+ * context here.
+ */
__rwsem_set_reader_owned(sem, NULL);
}
EXPORT_SYMBOL(down_read_non_owner);
diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c
index 9aa855a96c4a..34bfae72f295 100644
--- a/kernel/locking/semaphore.c
+++ b/kernel/locking/semaphore.c
@@ -32,6 +32,7 @@
#include <linux/semaphore.h>
#include <linux/spinlock.h>
#include <linux/ftrace.h>
+#include <trace/events/lock.h>
static noinline void __down(struct semaphore *sem);
static noinline int __down_interruptible(struct semaphore *sem);
@@ -50,10 +51,11 @@ static noinline void __up(struct semaphore *sem);
* Use of this function is deprecated, please use down_interruptible() or
* down_killable() instead.
*/
-void down(struct semaphore *sem)
+void __sched down(struct semaphore *sem)
{
unsigned long flags;
+ might_sleep();
raw_spin_lock_irqsave(&sem->lock, flags);
if (likely(sem->count > 0))
sem->count--;
@@ -72,11 +74,12 @@ EXPORT_SYMBOL(down);
* If the sleep is interrupted by a signal, this function will return -EINTR.
* If the semaphore is successfully acquired, this function returns 0.
*/
-int down_interruptible(struct semaphore *sem)
+int __sched down_interruptible(struct semaphore *sem)
{
unsigned long flags;
int result = 0;
+ might_sleep();
raw_spin_lock_irqsave(&sem->lock, flags);
if (likely(sem->count > 0))
sem->count--;
@@ -98,11 +101,12 @@ EXPORT_SYMBOL(down_interruptible);
* -EINTR. If the semaphore is successfully acquired, this function returns
* 0.
*/
-int down_killable(struct semaphore *sem)
+int __sched down_killable(struct semaphore *sem)
{
unsigned long flags;
int result = 0;
+ might_sleep();
raw_spin_lock_irqsave(&sem->lock, flags);
if (likely(sem->count > 0))
sem->count--;
@@ -127,7 +131,7 @@ EXPORT_SYMBOL(down_killable);
* Unlike mutex_trylock, this function can be used from interrupt context,
* and the semaphore can be released by any task or interrupt.
*/
-int down_trylock(struct semaphore *sem)
+int __sched down_trylock(struct semaphore *sem)
{
unsigned long flags;
int count;
@@ -152,11 +156,12 @@ EXPORT_SYMBOL(down_trylock);
* If the semaphore is not released within the specified number of jiffies,
* this function returns -ETIME. It returns 0 if the semaphore was acquired.
*/
-int down_timeout(struct semaphore *sem, long timeout)
+int __sched down_timeout(struct semaphore *sem, long timeout)
{
unsigned long flags;
int result = 0;
+ might_sleep();
raw_spin_lock_irqsave(&sem->lock, flags);
if (likely(sem->count > 0))
sem->count--;
@@ -175,7 +180,7 @@ EXPORT_SYMBOL(down_timeout);
* Release the semaphore. Unlike mutexes, up() may be called from any
* context and even by tasks which have never called down().
*/
-void up(struct semaphore *sem)
+void __sched up(struct semaphore *sem)
{
unsigned long flags;
@@ -201,7 +206,7 @@ struct semaphore_waiter {
* constant, and thus optimised away by the compiler. Likewise the
* 'timeout' parameter for the cases without timeouts.
*/
-static inline int __sched __down_common(struct semaphore *sem, long state,
+static inline int __sched ___down_common(struct semaphore *sem, long state,
long timeout)
{
struct semaphore_waiter waiter;
@@ -232,6 +237,18 @@ static inline int __sched __down_common(struct semaphore *sem, long state,
return -EINTR;
}
+static inline int __sched __down_common(struct semaphore *sem, long state,
+ long timeout)
+{
+ int ret;
+
+ trace_contention_begin(sem, 0);
+ ret = ___down_common(sem, state, timeout);
+ trace_contention_end(sem, ret);
+
+ return ret;
+}
+
static noinline void __sched __down(struct semaphore *sem)
{
__down_common(sem, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c
index c8d7ad9fb9b2..8475a0794f8c 100644
--- a/kernel/locking/spinlock.c
+++ b/kernel/locking/spinlock.c
@@ -124,13 +124,16 @@ void __lockfunc __raw_##op##_lock_bh(locktype##_t *lock) \
* __[spin|read|write]_lock_bh()
*/
BUILD_LOCK_OPS(spin, raw_spinlock);
+
+#ifndef CONFIG_PREEMPT_RT
BUILD_LOCK_OPS(read, rwlock);
BUILD_LOCK_OPS(write, rwlock);
+#endif
#endif
#ifndef CONFIG_INLINE_SPIN_TRYLOCK
-int __lockfunc _raw_spin_trylock(raw_spinlock_t *lock)
+noinline int __lockfunc _raw_spin_trylock(raw_spinlock_t *lock)
{
return __raw_spin_trylock(lock);
}
@@ -138,7 +141,7 @@ EXPORT_SYMBOL(_raw_spin_trylock);
#endif
#ifndef CONFIG_INLINE_SPIN_TRYLOCK_BH
-int __lockfunc _raw_spin_trylock_bh(raw_spinlock_t *lock)
+noinline int __lockfunc _raw_spin_trylock_bh(raw_spinlock_t *lock)
{
return __raw_spin_trylock_bh(lock);
}
@@ -146,7 +149,7 @@ EXPORT_SYMBOL(_raw_spin_trylock_bh);
#endif
#ifndef CONFIG_INLINE_SPIN_LOCK
-void __lockfunc _raw_spin_lock(raw_spinlock_t *lock)
+noinline void __lockfunc _raw_spin_lock(raw_spinlock_t *lock)
{
__raw_spin_lock(lock);
}
@@ -154,7 +157,7 @@ EXPORT_SYMBOL(_raw_spin_lock);
#endif
#ifndef CONFIG_INLINE_SPIN_LOCK_IRQSAVE
-unsigned long __lockfunc _raw_spin_lock_irqsave(raw_spinlock_t *lock)
+noinline unsigned long __lockfunc _raw_spin_lock_irqsave(raw_spinlock_t *lock)
{
return __raw_spin_lock_irqsave(lock);
}
@@ -162,7 +165,7 @@ EXPORT_SYMBOL(_raw_spin_lock_irqsave);
#endif
#ifndef CONFIG_INLINE_SPIN_LOCK_IRQ
-void __lockfunc _raw_spin_lock_irq(raw_spinlock_t *lock)
+noinline void __lockfunc _raw_spin_lock_irq(raw_spinlock_t *lock)
{
__raw_spin_lock_irq(lock);
}
@@ -170,7 +173,7 @@ EXPORT_SYMBOL(_raw_spin_lock_irq);
#endif
#ifndef CONFIG_INLINE_SPIN_LOCK_BH
-void __lockfunc _raw_spin_lock_bh(raw_spinlock_t *lock)
+noinline void __lockfunc _raw_spin_lock_bh(raw_spinlock_t *lock)
{
__raw_spin_lock_bh(lock);
}
@@ -178,7 +181,7 @@ EXPORT_SYMBOL(_raw_spin_lock_bh);
#endif
#ifdef CONFIG_UNINLINE_SPIN_UNLOCK
-void __lockfunc _raw_spin_unlock(raw_spinlock_t *lock)
+noinline void __lockfunc _raw_spin_unlock(raw_spinlock_t *lock)
{
__raw_spin_unlock(lock);
}
@@ -186,7 +189,7 @@ EXPORT_SYMBOL(_raw_spin_unlock);
#endif
#ifndef CONFIG_INLINE_SPIN_UNLOCK_IRQRESTORE
-void __lockfunc _raw_spin_unlock_irqrestore(raw_spinlock_t *lock, unsigned long flags)
+noinline void __lockfunc _raw_spin_unlock_irqrestore(raw_spinlock_t *lock, unsigned long flags)
{
__raw_spin_unlock_irqrestore(lock, flags);
}
@@ -194,7 +197,7 @@ EXPORT_SYMBOL(_raw_spin_unlock_irqrestore);
#endif
#ifndef CONFIG_INLINE_SPIN_UNLOCK_IRQ
-void __lockfunc _raw_spin_unlock_irq(raw_spinlock_t *lock)
+noinline void __lockfunc _raw_spin_unlock_irq(raw_spinlock_t *lock)
{
__raw_spin_unlock_irq(lock);
}
@@ -202,15 +205,17 @@ EXPORT_SYMBOL(_raw_spin_unlock_irq);
#endif
#ifndef CONFIG_INLINE_SPIN_UNLOCK_BH
-void __lockfunc _raw_spin_unlock_bh(raw_spinlock_t *lock)
+noinline void __lockfunc _raw_spin_unlock_bh(raw_spinlock_t *lock)
{
__raw_spin_unlock_bh(lock);
}
EXPORT_SYMBOL(_raw_spin_unlock_bh);
#endif
+#ifndef CONFIG_PREEMPT_RT
+
#ifndef CONFIG_INLINE_READ_TRYLOCK
-int __lockfunc _raw_read_trylock(rwlock_t *lock)
+noinline int __lockfunc _raw_read_trylock(rwlock_t *lock)
{
return __raw_read_trylock(lock);
}
@@ -218,7 +223,7 @@ EXPORT_SYMBOL(_raw_read_trylock);
#endif
#ifndef CONFIG_INLINE_READ_LOCK
-void __lockfunc _raw_read_lock(rwlock_t *lock)
+noinline void __lockfunc _raw_read_lock(rwlock_t *lock)
{
__raw_read_lock(lock);
}
@@ -226,7 +231,7 @@ EXPORT_SYMBOL(_raw_read_lock);
#endif
#ifndef CONFIG_INLINE_READ_LOCK_IRQSAVE
-unsigned long __lockfunc _raw_read_lock_irqsave(rwlock_t *lock)
+noinline unsigned long __lockfunc _raw_read_lock_irqsave(rwlock_t *lock)
{
return __raw_read_lock_irqsave(lock);
}
@@ -234,7 +239,7 @@ EXPORT_SYMBOL(_raw_read_lock_irqsave);
#endif
#ifndef CONFIG_INLINE_READ_LOCK_IRQ
-void __lockfunc _raw_read_lock_irq(rwlock_t *lock)
+noinline void __lockfunc _raw_read_lock_irq(rwlock_t *lock)
{
__raw_read_lock_irq(lock);
}
@@ -242,7 +247,7 @@ EXPORT_SYMBOL(_raw_read_lock_irq);
#endif
#ifndef CONFIG_INLINE_READ_LOCK_BH
-void __lockfunc _raw_read_lock_bh(rwlock_t *lock)
+noinline void __lockfunc _raw_read_lock_bh(rwlock_t *lock)
{
__raw_read_lock_bh(lock);
}
@@ -250,7 +255,7 @@ EXPORT_SYMBOL(_raw_read_lock_bh);
#endif
#ifndef CONFIG_INLINE_READ_UNLOCK
-void __lockfunc _raw_read_unlock(rwlock_t *lock)
+noinline void __lockfunc _raw_read_unlock(rwlock_t *lock)
{
__raw_read_unlock(lock);
}
@@ -258,7 +263,7 @@ EXPORT_SYMBOL(_raw_read_unlock);
#endif
#ifndef CONFIG_INLINE_READ_UNLOCK_IRQRESTORE
-void __lockfunc _raw_read_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
+noinline void __lockfunc _raw_read_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
{
__raw_read_unlock_irqrestore(lock, flags);
}
@@ -266,7 +271,7 @@ EXPORT_SYMBOL(_raw_read_unlock_irqrestore);
#endif
#ifndef CONFIG_INLINE_READ_UNLOCK_IRQ
-void __lockfunc _raw_read_unlock_irq(rwlock_t *lock)
+noinline void __lockfunc _raw_read_unlock_irq(rwlock_t *lock)
{
__raw_read_unlock_irq(lock);
}
@@ -274,7 +279,7 @@ EXPORT_SYMBOL(_raw_read_unlock_irq);
#endif
#ifndef CONFIG_INLINE_READ_UNLOCK_BH
-void __lockfunc _raw_read_unlock_bh(rwlock_t *lock)
+noinline void __lockfunc _raw_read_unlock_bh(rwlock_t *lock)
{
__raw_read_unlock_bh(lock);
}
@@ -282,7 +287,7 @@ EXPORT_SYMBOL(_raw_read_unlock_bh);
#endif
#ifndef CONFIG_INLINE_WRITE_TRYLOCK
-int __lockfunc _raw_write_trylock(rwlock_t *lock)
+noinline int __lockfunc _raw_write_trylock(rwlock_t *lock)
{
return __raw_write_trylock(lock);
}
@@ -290,15 +295,25 @@ EXPORT_SYMBOL(_raw_write_trylock);
#endif
#ifndef CONFIG_INLINE_WRITE_LOCK
-void __lockfunc _raw_write_lock(rwlock_t *lock)
+noinline void __lockfunc _raw_write_lock(rwlock_t *lock)
{
__raw_write_lock(lock);
}
EXPORT_SYMBOL(_raw_write_lock);
+
+#ifndef CONFIG_DEBUG_LOCK_ALLOC
+#define __raw_write_lock_nested(lock, subclass) __raw_write_lock(((void)(subclass), (lock)))
+#endif
+
+void __lockfunc _raw_write_lock_nested(rwlock_t *lock, int subclass)
+{
+ __raw_write_lock_nested(lock, subclass);
+}
+EXPORT_SYMBOL(_raw_write_lock_nested);
#endif
#ifndef CONFIG_INLINE_WRITE_LOCK_IRQSAVE
-unsigned long __lockfunc _raw_write_lock_irqsave(rwlock_t *lock)
+noinline unsigned long __lockfunc _raw_write_lock_irqsave(rwlock_t *lock)
{
return __raw_write_lock_irqsave(lock);
}
@@ -306,7 +321,7 @@ EXPORT_SYMBOL(_raw_write_lock_irqsave);
#endif
#ifndef CONFIG_INLINE_WRITE_LOCK_IRQ
-void __lockfunc _raw_write_lock_irq(rwlock_t *lock)
+noinline void __lockfunc _raw_write_lock_irq(rwlock_t *lock)
{
__raw_write_lock_irq(lock);
}
@@ -314,7 +329,7 @@ EXPORT_SYMBOL(_raw_write_lock_irq);
#endif
#ifndef CONFIG_INLINE_WRITE_LOCK_BH
-void __lockfunc _raw_write_lock_bh(rwlock_t *lock)
+noinline void __lockfunc _raw_write_lock_bh(rwlock_t *lock)
{
__raw_write_lock_bh(lock);
}
@@ -322,7 +337,7 @@ EXPORT_SYMBOL(_raw_write_lock_bh);
#endif
#ifndef CONFIG_INLINE_WRITE_UNLOCK
-void __lockfunc _raw_write_unlock(rwlock_t *lock)
+noinline void __lockfunc _raw_write_unlock(rwlock_t *lock)
{
__raw_write_unlock(lock);
}
@@ -330,7 +345,7 @@ EXPORT_SYMBOL(_raw_write_unlock);
#endif
#ifndef CONFIG_INLINE_WRITE_UNLOCK_IRQRESTORE
-void __lockfunc _raw_write_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
+noinline void __lockfunc _raw_write_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
{
__raw_write_unlock_irqrestore(lock, flags);
}
@@ -338,7 +353,7 @@ EXPORT_SYMBOL(_raw_write_unlock_irqrestore);
#endif
#ifndef CONFIG_INLINE_WRITE_UNLOCK_IRQ
-void __lockfunc _raw_write_unlock_irq(rwlock_t *lock)
+noinline void __lockfunc _raw_write_unlock_irq(rwlock_t *lock)
{
__raw_write_unlock_irq(lock);
}
@@ -346,13 +361,15 @@ EXPORT_SYMBOL(_raw_write_unlock_irq);
#endif
#ifndef CONFIG_INLINE_WRITE_UNLOCK_BH
-void __lockfunc _raw_write_unlock_bh(rwlock_t *lock)
+noinline void __lockfunc _raw_write_unlock_bh(rwlock_t *lock)
{
__raw_write_unlock_bh(lock);
}
EXPORT_SYMBOL(_raw_write_unlock_bh);
#endif
+#endif /* !CONFIG_PREEMPT_RT */
+
#ifdef CONFIG_DEBUG_LOCK_ALLOC
void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass)
@@ -371,8 +388,7 @@ unsigned long __lockfunc _raw_spin_lock_irqsave_nested(raw_spinlock_t *lock,
local_irq_save(flags);
preempt_disable();
spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
- LOCK_CONTENDED_FLAGS(lock, do_raw_spin_trylock, do_raw_spin_lock,
- do_raw_spin_lock_flags, &flags);
+ LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock);
return flags;
}
EXPORT_SYMBOL(_raw_spin_lock_irqsave_nested);
diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c
index b9d93087ee66..14235671a1a7 100644
--- a/kernel/locking/spinlock_debug.c
+++ b/kernel/locking/spinlock_debug.c
@@ -31,6 +31,7 @@ void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name,
EXPORT_SYMBOL(__raw_spin_lock_init);
+#ifndef CONFIG_PREEMPT_RT
void __rwlock_init(rwlock_t *lock, const char *name,
struct lock_class_key *key)
{
@@ -48,6 +49,7 @@ void __rwlock_init(rwlock_t *lock, const char *name,
}
EXPORT_SYMBOL(__rwlock_init);
+#endif
static void spin_dump(raw_spinlock_t *lock, const char *msg)
{
@@ -139,6 +141,7 @@ void do_raw_spin_unlock(raw_spinlock_t *lock)
arch_spin_unlock(&lock->raw_lock);
}
+#ifndef CONFIG_PREEMPT_RT
static void rwlock_bug(rwlock_t *lock, const char *msg)
{
if (!debug_locks_off())
@@ -228,3 +231,5 @@ void do_raw_write_unlock(rwlock_t *lock)
debug_write_unlock(lock);
arch_write_unlock(&lock->raw_lock);
}
+
+#endif /* !CONFIG_PREEMPT_RT */
diff --git a/kernel/locking/spinlock_rt.c b/kernel/locking/spinlock_rt.c
new file mode 100644
index 000000000000..48a19ed8486d
--- /dev/null
+++ b/kernel/locking/spinlock_rt.c
@@ -0,0 +1,280 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * PREEMPT_RT substitution for spin/rw_locks
+ *
+ * spinlocks and rwlocks on RT are based on rtmutexes, with a few twists to
+ * resemble the non RT semantics:
+ *
+ * - Contrary to plain rtmutexes, spinlocks and rwlocks are state
+ * preserving. The task state is saved before blocking on the underlying
+ * rtmutex, and restored when the lock has been acquired. Regular wakeups
+ * during that time are redirected to the saved state so no wake up is
+ * missed.
+ *
+ * - Non RT spin/rwlocks disable preemption and eventually interrupts.
+ * Disabling preemption has the side effect of disabling migration and
+ * preventing RCU grace periods.
+ *
+ * The RT substitutions explicitly disable migration and take
+ * rcu_read_lock() across the lock held section.
+ */
+#include <linux/spinlock.h>
+#include <linux/export.h>
+
+#define RT_MUTEX_BUILD_SPINLOCKS
+#include "rtmutex.c"
+
+/*
+ * __might_resched() skips the state check as rtlocks are state
+ * preserving. Take RCU nesting into account as spin/read/write_lock() can
+ * legitimately nest into an RCU read side critical section.
+ */
+#define RTLOCK_RESCHED_OFFSETS \
+ (rcu_preempt_depth() << MIGHT_RESCHED_RCU_SHIFT)
+
+#define rtlock_might_resched() \
+ __might_resched(__FILE__, __LINE__, RTLOCK_RESCHED_OFFSETS)
+
+static __always_inline void rtlock_lock(struct rt_mutex_base *rtm)
+{
+ if (unlikely(!rt_mutex_cmpxchg_acquire(rtm, NULL, current)))
+ rtlock_slowlock(rtm);
+}
+
+static __always_inline void __rt_spin_lock(spinlock_t *lock)
+{
+ rtlock_might_resched();
+ rtlock_lock(&lock->lock);
+ rcu_read_lock();
+ migrate_disable();
+}
+
+void __sched rt_spin_lock(spinlock_t *lock)
+{
+ spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
+ __rt_spin_lock(lock);
+}
+EXPORT_SYMBOL(rt_spin_lock);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+void __sched rt_spin_lock_nested(spinlock_t *lock, int subclass)
+{
+ spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
+ __rt_spin_lock(lock);
+}
+EXPORT_SYMBOL(rt_spin_lock_nested);
+
+void __sched rt_spin_lock_nest_lock(spinlock_t *lock,
+ struct lockdep_map *nest_lock)
+{
+ spin_acquire_nest(&lock->dep_map, 0, 0, nest_lock, _RET_IP_);
+ __rt_spin_lock(lock);
+}
+EXPORT_SYMBOL(rt_spin_lock_nest_lock);
+#endif
+
+void __sched rt_spin_unlock(spinlock_t *lock)
+{
+ spin_release(&lock->dep_map, _RET_IP_);
+ migrate_enable();
+ rcu_read_unlock();
+
+ if (unlikely(!rt_mutex_cmpxchg_release(&lock->lock, current, NULL)))
+ rt_mutex_slowunlock(&lock->lock);
+}
+EXPORT_SYMBOL(rt_spin_unlock);
+
+/*
+ * Wait for the lock to get unlocked: instead of polling for an unlock
+ * (like raw spinlocks do), lock and unlock, to force the kernel to
+ * schedule if there's contention:
+ */
+void __sched rt_spin_lock_unlock(spinlock_t *lock)
+{
+ spin_lock(lock);
+ spin_unlock(lock);
+}
+EXPORT_SYMBOL(rt_spin_lock_unlock);
+
+static __always_inline int __rt_spin_trylock(spinlock_t *lock)
+{
+ int ret = 1;
+
+ if (unlikely(!rt_mutex_cmpxchg_acquire(&lock->lock, NULL, current)))
+ ret = rt_mutex_slowtrylock(&lock->lock);
+
+ if (ret) {
+ spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
+ rcu_read_lock();
+ migrate_disable();
+ }
+ return ret;
+}
+
+int __sched rt_spin_trylock(spinlock_t *lock)
+{
+ return __rt_spin_trylock(lock);
+}
+EXPORT_SYMBOL(rt_spin_trylock);
+
+int __sched rt_spin_trylock_bh(spinlock_t *lock)
+{
+ int ret;
+
+ local_bh_disable();
+ ret = __rt_spin_trylock(lock);
+ if (!ret)
+ local_bh_enable();
+ return ret;
+}
+EXPORT_SYMBOL(rt_spin_trylock_bh);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+void __rt_spin_lock_init(spinlock_t *lock, const char *name,
+ struct lock_class_key *key, bool percpu)
+{
+ u8 type = percpu ? LD_LOCK_PERCPU : LD_LOCK_NORMAL;
+
+ debug_check_no_locks_freed((void *)lock, sizeof(*lock));
+ lockdep_init_map_type(&lock->dep_map, name, key, 0, LD_WAIT_CONFIG,
+ LD_WAIT_INV, type);
+}
+EXPORT_SYMBOL(__rt_spin_lock_init);
+#endif
+
+/*
+ * RT-specific reader/writer locks
+ */
+#define rwbase_set_and_save_current_state(state) \
+ current_save_and_set_rtlock_wait_state()
+
+#define rwbase_restore_current_state() \
+ current_restore_rtlock_saved_state()
+
+static __always_inline int
+rwbase_rtmutex_lock_state(struct rt_mutex_base *rtm, unsigned int state)
+{
+ if (unlikely(!rt_mutex_cmpxchg_acquire(rtm, NULL, current)))
+ rtlock_slowlock(rtm);
+ return 0;
+}
+
+static __always_inline int
+rwbase_rtmutex_slowlock_locked(struct rt_mutex_base *rtm, unsigned int state)
+{
+ rtlock_slowlock_locked(rtm);
+ return 0;
+}
+
+static __always_inline void rwbase_rtmutex_unlock(struct rt_mutex_base *rtm)
+{
+ if (likely(rt_mutex_cmpxchg_acquire(rtm, current, NULL)))
+ return;
+
+ rt_mutex_slowunlock(rtm);
+}
+
+static __always_inline int rwbase_rtmutex_trylock(struct rt_mutex_base *rtm)
+{
+ if (likely(rt_mutex_cmpxchg_acquire(rtm, NULL, current)))
+ return 1;
+
+ return rt_mutex_slowtrylock(rtm);
+}
+
+#define rwbase_signal_pending_state(state, current) (0)
+
+#define rwbase_schedule() \
+ schedule_rtlock()
+
+#include "rwbase_rt.c"
+/*
+ * The common functions which get wrapped into the rwlock API.
+ */
+int __sched rt_read_trylock(rwlock_t *rwlock)
+{
+ int ret;
+
+ ret = rwbase_read_trylock(&rwlock->rwbase);
+ if (ret) {
+ rwlock_acquire_read(&rwlock->dep_map, 0, 1, _RET_IP_);
+ rcu_read_lock();
+ migrate_disable();
+ }
+ return ret;
+}
+EXPORT_SYMBOL(rt_read_trylock);
+
+int __sched rt_write_trylock(rwlock_t *rwlock)
+{
+ int ret;
+
+ ret = rwbase_write_trylock(&rwlock->rwbase);
+ if (ret) {
+ rwlock_acquire(&rwlock->dep_map, 0, 1, _RET_IP_);
+ rcu_read_lock();
+ migrate_disable();
+ }
+ return ret;
+}
+EXPORT_SYMBOL(rt_write_trylock);
+
+void __sched rt_read_lock(rwlock_t *rwlock)
+{
+ rtlock_might_resched();
+ rwlock_acquire_read(&rwlock->dep_map, 0, 0, _RET_IP_);
+ rwbase_read_lock(&rwlock->rwbase, TASK_RTLOCK_WAIT);
+ rcu_read_lock();
+ migrate_disable();
+}
+EXPORT_SYMBOL(rt_read_lock);
+
+void __sched rt_write_lock(rwlock_t *rwlock)
+{
+ rtlock_might_resched();
+ rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_);
+ rwbase_write_lock(&rwlock->rwbase, TASK_RTLOCK_WAIT);
+ rcu_read_lock();
+ migrate_disable();
+}
+EXPORT_SYMBOL(rt_write_lock);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+void __sched rt_write_lock_nested(rwlock_t *rwlock, int subclass)
+{
+ rtlock_might_resched();
+ rwlock_acquire(&rwlock->dep_map, subclass, 0, _RET_IP_);
+ rwbase_write_lock(&rwlock->rwbase, TASK_RTLOCK_WAIT);
+ rcu_read_lock();
+ migrate_disable();
+}
+EXPORT_SYMBOL(rt_write_lock_nested);
+#endif
+
+void __sched rt_read_unlock(rwlock_t *rwlock)
+{
+ rwlock_release(&rwlock->dep_map, _RET_IP_);
+ migrate_enable();
+ rcu_read_unlock();
+ rwbase_read_unlock(&rwlock->rwbase, TASK_RTLOCK_WAIT);
+}
+EXPORT_SYMBOL(rt_read_unlock);
+
+void __sched rt_write_unlock(rwlock_t *rwlock)
+{
+ rwlock_release(&rwlock->dep_map, _RET_IP_);
+ rcu_read_unlock();
+ migrate_enable();
+ rwbase_write_unlock(&rwlock->rwbase);
+}
+EXPORT_SYMBOL(rt_write_unlock);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+void __rt_rwlock_init(rwlock_t *rwlock, const char *name,
+ struct lock_class_key *key)
+{
+ debug_check_no_locks_freed((void *)rwlock, sizeof(*rwlock));
+ lockdep_init_map_wait(&rwlock->dep_map, name, key, 0, LD_WAIT_CONFIG);
+}
+EXPORT_SYMBOL(__rt_rwlock_init);
+#endif
diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c
index 3e82f449b4ff..93cca6e69860 100644
--- a/kernel/locking/test-ww_mutex.c
+++ b/kernel/locking/test-ww_mutex.c
@@ -16,6 +16,15 @@
static DEFINE_WD_CLASS(ww_class);
struct workqueue_struct *wq;
+#ifdef CONFIG_DEBUG_WW_MUTEX_SLOWPATH
+#define ww_acquire_init_noinject(a, b) do { \
+ ww_acquire_init((a), (b)); \
+ (a)->deadlock_inject_countdown = ~0U; \
+ } while (0)
+#else
+#define ww_acquire_init_noinject(a, b) ww_acquire_init((a), (b))
+#endif
+
struct test_mutex {
struct work_struct work;
struct ww_mutex mutex;
@@ -36,7 +45,7 @@ static void test_mutex_work(struct work_struct *work)
wait_for_completion(&mtx->go);
if (mtx->flags & TEST_MTX_TRY) {
- while (!ww_mutex_trylock(&mtx->mutex))
+ while (!ww_mutex_trylock(&mtx->mutex, NULL))
cond_resched();
} else {
ww_mutex_lock(&mtx->mutex, NULL);
@@ -109,19 +118,39 @@ static int test_mutex(void)
return 0;
}
-static int test_aa(void)
+static int test_aa(bool trylock)
{
struct ww_mutex mutex;
struct ww_acquire_ctx ctx;
int ret;
+ const char *from = trylock ? "trylock" : "lock";
ww_mutex_init(&mutex, &ww_class);
ww_acquire_init(&ctx, &ww_class);
- ww_mutex_lock(&mutex, &ctx);
+ if (!trylock) {
+ ret = ww_mutex_lock(&mutex, &ctx);
+ if (ret) {
+ pr_err("%s: initial lock failed!\n", __func__);
+ goto out;
+ }
+ } else {
+ ret = !ww_mutex_trylock(&mutex, &ctx);
+ if (ret) {
+ pr_err("%s: initial trylock failed!\n", __func__);
+ goto out;
+ }
+ }
- if (ww_mutex_trylock(&mutex)) {
- pr_err("%s: trylocked itself!\n", __func__);
+ if (ww_mutex_trylock(&mutex, NULL)) {
+ pr_err("%s: trylocked itself without context from %s!\n", __func__, from);
+ ww_mutex_unlock(&mutex);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (ww_mutex_trylock(&mutex, &ctx)) {
+ pr_err("%s: trylocked itself with context from %s!\n", __func__, from);
ww_mutex_unlock(&mutex);
ret = -EINVAL;
goto out;
@@ -129,17 +158,17 @@ static int test_aa(void)
ret = ww_mutex_lock(&mutex, &ctx);
if (ret != -EALREADY) {
- pr_err("%s: missed deadlock for recursing, ret=%d\n",
- __func__, ret);
+ pr_err("%s: missed deadlock for recursing, ret=%d from %s\n",
+ __func__, ret, from);
if (!ret)
ww_mutex_unlock(&mutex);
ret = -EINVAL;
goto out;
}
+ ww_mutex_unlock(&mutex);
ret = 0;
out:
- ww_mutex_unlock(&mutex);
ww_acquire_fini(&ctx);
return ret;
}
@@ -150,7 +179,7 @@ struct test_abba {
struct ww_mutex b_mutex;
struct completion a_ready;
struct completion b_ready;
- bool resolve;
+ bool resolve, trylock;
int result;
};
@@ -160,8 +189,13 @@ static void test_abba_work(struct work_struct *work)
struct ww_acquire_ctx ctx;
int err;
- ww_acquire_init(&ctx, &ww_class);
- ww_mutex_lock(&abba->b_mutex, &ctx);
+ ww_acquire_init_noinject(&ctx, &ww_class);
+ if (!abba->trylock)
+ ww_mutex_lock(&abba->b_mutex, &ctx);
+ else
+ WARN_ON(!ww_mutex_trylock(&abba->b_mutex, &ctx));
+
+ WARN_ON(READ_ONCE(abba->b_mutex.ctx) != &ctx);
complete(&abba->b_ready);
wait_for_completion(&abba->a_ready);
@@ -181,7 +215,7 @@ static void test_abba_work(struct work_struct *work)
abba->result = err;
}
-static int test_abba(bool resolve)
+static int test_abba(bool trylock, bool resolve)
{
struct test_abba abba;
struct ww_acquire_ctx ctx;
@@ -192,12 +226,18 @@ static int test_abba(bool resolve)
INIT_WORK_ONSTACK(&abba.work, test_abba_work);
init_completion(&abba.a_ready);
init_completion(&abba.b_ready);
+ abba.trylock = trylock;
abba.resolve = resolve;
schedule_work(&abba.work);
- ww_acquire_init(&ctx, &ww_class);
- ww_mutex_lock(&abba.a_mutex, &ctx);
+ ww_acquire_init_noinject(&ctx, &ww_class);
+ if (!trylock)
+ ww_mutex_lock(&abba.a_mutex, &ctx);
+ else
+ WARN_ON(!ww_mutex_trylock(&abba.a_mutex, &ctx));
+
+ WARN_ON(READ_ONCE(abba.a_mutex.ctx) != &ctx);
complete(&abba.a_ready);
wait_for_completion(&abba.b_ready);
@@ -249,7 +289,7 @@ static void test_cycle_work(struct work_struct *work)
struct ww_acquire_ctx ctx;
int err, erra = 0;
- ww_acquire_init(&ctx, &ww_class);
+ ww_acquire_init_noinject(&ctx, &ww_class);
ww_mutex_lock(&cycle->a_mutex, &ctx);
complete(cycle->a_signal);
@@ -359,7 +399,7 @@ static int *get_random_order(int count)
order[n] = n;
for (n = count - 1; n > 1; n--) {
- r = get_random_int() % (n + 1);
+ r = get_random_u32_below(n + 1);
if (r != n) {
tmp = order[n];
order[n] = order[r];
@@ -498,7 +538,7 @@ static void stress_one_work(struct work_struct *work)
{
struct stress *stress = container_of(work, typeof(*stress), work);
const int nlocks = stress->nlocks;
- struct ww_mutex *lock = stress->locks + (get_random_int() % nlocks);
+ struct ww_mutex *lock = stress->locks + get_random_u32_below(nlocks);
int err;
do {
@@ -581,7 +621,9 @@ static int stress(int nlocks, int nthreads, unsigned int flags)
static int __init test_ww_mutex_init(void)
{
int ncpus = num_online_cpus();
- int ret;
+ int ret, i;
+
+ printk(KERN_INFO "Beginning ww mutex selftests\n");
wq = alloc_workqueue("test-ww_mutex", WQ_UNBOUND, 0);
if (!wq)
@@ -591,17 +633,19 @@ static int __init test_ww_mutex_init(void)
if (ret)
return ret;
- ret = test_aa();
+ ret = test_aa(false);
if (ret)
return ret;
- ret = test_abba(false);
+ ret = test_aa(true);
if (ret)
return ret;
- ret = test_abba(true);
- if (ret)
- return ret;
+ for (i = 0; i < 4; i++) {
+ ret = test_abba(i & 1, i & 2);
+ if (ret)
+ return ret;
+ }
ret = test_cycle(ncpus);
if (ret)
@@ -615,10 +659,11 @@ static int __init test_ww_mutex_init(void)
if (ret)
return ret;
- ret = stress(4095, hweight32(STRESS_ALL)*ncpus, STRESS_ALL);
+ ret = stress(2047, hweight32(STRESS_ALL)*ncpus, STRESS_ALL);
if (ret)
return ret;
+ printk(KERN_INFO "All ww mutex selftests passed\n");
return 0;
}
diff --git a/kernel/locking/ww_mutex.h b/kernel/locking/ww_mutex.h
new file mode 100644
index 000000000000..56f139201f24
--- /dev/null
+++ b/kernel/locking/ww_mutex.h
@@ -0,0 +1,569 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#ifndef WW_RT
+
+#define MUTEX mutex
+#define MUTEX_WAITER mutex_waiter
+
+static inline struct mutex_waiter *
+__ww_waiter_first(struct mutex *lock)
+{
+ struct mutex_waiter *w;
+
+ w = list_first_entry(&lock->wait_list, struct mutex_waiter, list);
+ if (list_entry_is_head(w, &lock->wait_list, list))
+ return NULL;
+
+ return w;
+}
+
+static inline struct mutex_waiter *
+__ww_waiter_next(struct mutex *lock, struct mutex_waiter *w)
+{
+ w = list_next_entry(w, list);
+ if (list_entry_is_head(w, &lock->wait_list, list))
+ return NULL;
+
+ return w;
+}
+
+static inline struct mutex_waiter *
+__ww_waiter_prev(struct mutex *lock, struct mutex_waiter *w)
+{
+ w = list_prev_entry(w, list);
+ if (list_entry_is_head(w, &lock->wait_list, list))
+ return NULL;
+
+ return w;
+}
+
+static inline struct mutex_waiter *
+__ww_waiter_last(struct mutex *lock)
+{
+ struct mutex_waiter *w;
+
+ w = list_last_entry(&lock->wait_list, struct mutex_waiter, list);
+ if (list_entry_is_head(w, &lock->wait_list, list))
+ return NULL;
+
+ return w;
+}
+
+static inline void
+__ww_waiter_add(struct mutex *lock, struct mutex_waiter *waiter, struct mutex_waiter *pos)
+{
+ struct list_head *p = &lock->wait_list;
+ if (pos)
+ p = &pos->list;
+ __mutex_add_waiter(lock, waiter, p);
+}
+
+static inline struct task_struct *
+__ww_mutex_owner(struct mutex *lock)
+{
+ return __mutex_owner(lock);
+}
+
+static inline bool
+__ww_mutex_has_waiters(struct mutex *lock)
+{
+ return atomic_long_read(&lock->owner) & MUTEX_FLAG_WAITERS;
+}
+
+static inline void lock_wait_lock(struct mutex *lock)
+{
+ raw_spin_lock(&lock->wait_lock);
+}
+
+static inline void unlock_wait_lock(struct mutex *lock)
+{
+ raw_spin_unlock(&lock->wait_lock);
+}
+
+static inline void lockdep_assert_wait_lock_held(struct mutex *lock)
+{
+ lockdep_assert_held(&lock->wait_lock);
+}
+
+#else /* WW_RT */
+
+#define MUTEX rt_mutex
+#define MUTEX_WAITER rt_mutex_waiter
+
+static inline struct rt_mutex_waiter *
+__ww_waiter_first(struct rt_mutex *lock)
+{
+ struct rb_node *n = rb_first(&lock->rtmutex.waiters.rb_root);
+ if (!n)
+ return NULL;
+ return rb_entry(n, struct rt_mutex_waiter, tree_entry);
+}
+
+static inline struct rt_mutex_waiter *
+__ww_waiter_next(struct rt_mutex *lock, struct rt_mutex_waiter *w)
+{
+ struct rb_node *n = rb_next(&w->tree_entry);
+ if (!n)
+ return NULL;
+ return rb_entry(n, struct rt_mutex_waiter, tree_entry);
+}
+
+static inline struct rt_mutex_waiter *
+__ww_waiter_prev(struct rt_mutex *lock, struct rt_mutex_waiter *w)
+{
+ struct rb_node *n = rb_prev(&w->tree_entry);
+ if (!n)
+ return NULL;
+ return rb_entry(n, struct rt_mutex_waiter, tree_entry);
+}
+
+static inline struct rt_mutex_waiter *
+__ww_waiter_last(struct rt_mutex *lock)
+{
+ struct rb_node *n = rb_last(&lock->rtmutex.waiters.rb_root);
+ if (!n)
+ return NULL;
+ return rb_entry(n, struct rt_mutex_waiter, tree_entry);
+}
+
+static inline void
+__ww_waiter_add(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, struct rt_mutex_waiter *pos)
+{
+ /* RT unconditionally adds the waiter first and then removes it on error */
+}
+
+static inline struct task_struct *
+__ww_mutex_owner(struct rt_mutex *lock)
+{
+ return rt_mutex_owner(&lock->rtmutex);
+}
+
+static inline bool
+__ww_mutex_has_waiters(struct rt_mutex *lock)
+{
+ return rt_mutex_has_waiters(&lock->rtmutex);
+}
+
+static inline void lock_wait_lock(struct rt_mutex *lock)
+{
+ raw_spin_lock(&lock->rtmutex.wait_lock);
+}
+
+static inline void unlock_wait_lock(struct rt_mutex *lock)
+{
+ raw_spin_unlock(&lock->rtmutex.wait_lock);
+}
+
+static inline void lockdep_assert_wait_lock_held(struct rt_mutex *lock)
+{
+ lockdep_assert_held(&lock->rtmutex.wait_lock);
+}
+
+#endif /* WW_RT */
+
+/*
+ * Wait-Die:
+ * The newer transactions are killed when:
+ * It (the new transaction) makes a request for a lock being held
+ * by an older transaction.
+ *
+ * Wound-Wait:
+ * The newer transactions are wounded when:
+ * An older transaction makes a request for a lock being held by
+ * the newer transaction.
+ */
+
+/*
+ * Associate the ww_mutex @ww with the context @ww_ctx under which we acquired
+ * it.
+ */
+static __always_inline void
+ww_mutex_lock_acquired(struct ww_mutex *ww, struct ww_acquire_ctx *ww_ctx)
+{
+#ifdef DEBUG_WW_MUTEXES
+ /*
+ * If this WARN_ON triggers, you used ww_mutex_lock to acquire,
+ * but released with a normal mutex_unlock in this call.
+ *
+ * This should never happen, always use ww_mutex_unlock.
+ */
+ DEBUG_LOCKS_WARN_ON(ww->ctx);
+
+ /*
+ * Not quite done after calling ww_acquire_done() ?
+ */
+ DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire);
+
+ if (ww_ctx->contending_lock) {
+ /*
+ * After -EDEADLK you tried to
+ * acquire a different ww_mutex? Bad!
+ */
+ DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww);
+
+ /*
+ * You called ww_mutex_lock after receiving -EDEADLK,
+ * but 'forgot' to unlock everything else first?
+ */
+ DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0);
+ ww_ctx->contending_lock = NULL;
+ }
+
+ /*
+ * Naughty, using a different class will lead to undefined behavior!
+ */
+ DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class);
+#endif
+ ww_ctx->acquired++;
+ ww->ctx = ww_ctx;
+}
+
+/*
+ * Determine if @a is 'less' than @b. IOW, either @a is a lower priority task
+ * or, when of equal priority, a younger transaction than @b.
+ *
+ * Depending on the algorithm, @a will either need to wait for @b, or die.
+ */
+static inline bool
+__ww_ctx_less(struct ww_acquire_ctx *a, struct ww_acquire_ctx *b)
+{
+/*
+ * Can only do the RT prio for WW_RT, because task->prio isn't stable due to PI,
+ * so the wait_list ordering will go wobbly. rt_mutex re-queues the waiter and
+ * isn't affected by this.
+ */
+#ifdef WW_RT
+ /* kernel prio; less is more */
+ int a_prio = a->task->prio;
+ int b_prio = b->task->prio;
+
+ if (rt_prio(a_prio) || rt_prio(b_prio)) {
+
+ if (a_prio > b_prio)
+ return true;
+
+ if (a_prio < b_prio)
+ return false;
+
+ /* equal static prio */
+
+ if (dl_prio(a_prio)) {
+ if (dl_time_before(b->task->dl.deadline,
+ a->task->dl.deadline))
+ return true;
+
+ if (dl_time_before(a->task->dl.deadline,
+ b->task->dl.deadline))
+ return false;
+ }
+
+ /* equal prio */
+ }
+#endif
+
+ /* FIFO order tie break -- bigger is younger */
+ return (signed long)(a->stamp - b->stamp) > 0;
+}
+
+/*
+ * Wait-Die; wake a lesser waiter context (when locks held) such that it can
+ * die.
+ *
+ * Among waiters with context, only the first one can have other locks acquired
+ * already (ctx->acquired > 0), because __ww_mutex_add_waiter() and
+ * __ww_mutex_check_kill() wake any but the earliest context.
+ */
+static bool
+__ww_mutex_die(struct MUTEX *lock, struct MUTEX_WAITER *waiter,
+ struct ww_acquire_ctx *ww_ctx)
+{
+ if (!ww_ctx->is_wait_die)
+ return false;
+
+ if (waiter->ww_ctx->acquired > 0 && __ww_ctx_less(waiter->ww_ctx, ww_ctx)) {
+#ifndef WW_RT
+ debug_mutex_wake_waiter(lock, waiter);
+#endif
+ wake_up_process(waiter->task);
+ }
+
+ return true;
+}
+
+/*
+ * Wound-Wait; wound a lesser @hold_ctx if it holds the lock.
+ *
+ * Wound the lock holder if there are waiters with more important transactions
+ * than the lock holders. Even if multiple waiters may wound the lock holder,
+ * it's sufficient that only one does.
+ */
+static bool __ww_mutex_wound(struct MUTEX *lock,
+ struct ww_acquire_ctx *ww_ctx,
+ struct ww_acquire_ctx *hold_ctx)
+{
+ struct task_struct *owner = __ww_mutex_owner(lock);
+
+ lockdep_assert_wait_lock_held(lock);
+
+ /*
+ * Possible through __ww_mutex_add_waiter() when we race with
+ * ww_mutex_set_context_fastpath(). In that case we'll get here again
+ * through __ww_mutex_check_waiters().
+ */
+ if (!hold_ctx)
+ return false;
+
+ /*
+ * Can have !owner because of __mutex_unlock_slowpath(), but if owner,
+ * it cannot go away because we'll have FLAG_WAITERS set and hold
+ * wait_lock.
+ */
+ if (!owner)
+ return false;
+
+ if (ww_ctx->acquired > 0 && __ww_ctx_less(hold_ctx, ww_ctx)) {
+ hold_ctx->wounded = 1;
+
+ /*
+ * wake_up_process() paired with set_current_state()
+ * inserts sufficient barriers to make sure @owner either sees
+ * it's wounded in __ww_mutex_check_kill() or has a
+ * wakeup pending to re-read the wounded state.
+ */
+ if (owner != current)
+ wake_up_process(owner);
+
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * We just acquired @lock under @ww_ctx, if there are more important contexts
+ * waiting behind us on the wait-list, check if they need to die, or wound us.
+ *
+ * See __ww_mutex_add_waiter() for the list-order construction; basically the
+ * list is ordered by stamp, smallest (oldest) first.
+ *
+ * This relies on never mixing wait-die/wound-wait on the same wait-list;
+ * which is currently ensured by that being a ww_class property.
+ *
+ * The current task must not be on the wait list.
+ */
+static void
+__ww_mutex_check_waiters(struct MUTEX *lock, struct ww_acquire_ctx *ww_ctx)
+{
+ struct MUTEX_WAITER *cur;
+
+ lockdep_assert_wait_lock_held(lock);
+
+ for (cur = __ww_waiter_first(lock); cur;
+ cur = __ww_waiter_next(lock, cur)) {
+
+ if (!cur->ww_ctx)
+ continue;
+
+ if (__ww_mutex_die(lock, cur, ww_ctx) ||
+ __ww_mutex_wound(lock, cur->ww_ctx, ww_ctx))
+ break;
+ }
+}
+
+/*
+ * After acquiring lock with fastpath, where we do not hold wait_lock, set ctx
+ * and wake up any waiters so they can recheck.
+ */
+static __always_inline void
+ww_mutex_set_context_fastpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
+{
+ ww_mutex_lock_acquired(lock, ctx);
+
+ /*
+ * The lock->ctx update should be visible on all cores before
+ * the WAITERS check is done, otherwise contended waiters might be
+ * missed. The contended waiters will either see ww_ctx == NULL
+ * and keep spinning, or it will acquire wait_lock, add itself
+ * to waiter list and sleep.
+ */
+ smp_mb(); /* See comments above and below. */
+
+ /*
+ * [W] ww->ctx = ctx [W] MUTEX_FLAG_WAITERS
+ * MB MB
+ * [R] MUTEX_FLAG_WAITERS [R] ww->ctx
+ *
+ * The memory barrier above pairs with the memory barrier in
+ * __ww_mutex_add_waiter() and makes sure we either observe ww->ctx
+ * and/or !empty list.
+ */
+ if (likely(!__ww_mutex_has_waiters(&lock->base)))
+ return;
+
+ /*
+ * Uh oh, we raced in fastpath, check if any of the waiters need to
+ * die or wound us.
+ */
+ lock_wait_lock(&lock->base);
+ __ww_mutex_check_waiters(&lock->base, ctx);
+ unlock_wait_lock(&lock->base);
+}
+
+static __always_inline int
+__ww_mutex_kill(struct MUTEX *lock, struct ww_acquire_ctx *ww_ctx)
+{
+ if (ww_ctx->acquired > 0) {
+#ifdef DEBUG_WW_MUTEXES
+ struct ww_mutex *ww;
+
+ ww = container_of(lock, struct ww_mutex, base);
+ DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock);
+ ww_ctx->contending_lock = ww;
+#endif
+ return -EDEADLK;
+ }
+
+ return 0;
+}
+
+/*
+ * Check the wound condition for the current lock acquire.
+ *
+ * Wound-Wait: If we're wounded, kill ourself.
+ *
+ * Wait-Die: If we're trying to acquire a lock already held by an older
+ * context, kill ourselves.
+ *
+ * Since __ww_mutex_add_waiter() orders the wait-list on stamp, we only have to
+ * look at waiters before us in the wait-list.
+ */
+static inline int
+__ww_mutex_check_kill(struct MUTEX *lock, struct MUTEX_WAITER *waiter,
+ struct ww_acquire_ctx *ctx)
+{
+ struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
+ struct ww_acquire_ctx *hold_ctx = READ_ONCE(ww->ctx);
+ struct MUTEX_WAITER *cur;
+
+ if (ctx->acquired == 0)
+ return 0;
+
+ if (!ctx->is_wait_die) {
+ if (ctx->wounded)
+ return __ww_mutex_kill(lock, ctx);
+
+ return 0;
+ }
+
+ if (hold_ctx && __ww_ctx_less(ctx, hold_ctx))
+ return __ww_mutex_kill(lock, ctx);
+
+ /*
+ * If there is a waiter in front of us that has a context, then its
+ * stamp is earlier than ours and we must kill ourself.
+ */
+ for (cur = __ww_waiter_prev(lock, waiter); cur;
+ cur = __ww_waiter_prev(lock, cur)) {
+
+ if (!cur->ww_ctx)
+ continue;
+
+ return __ww_mutex_kill(lock, ctx);
+ }
+
+ return 0;
+}
+
+/*
+ * Add @waiter to the wait-list, keep the wait-list ordered by stamp, smallest
+ * first. Such that older contexts are preferred to acquire the lock over
+ * younger contexts.
+ *
+ * Waiters without context are interspersed in FIFO order.
+ *
+ * Furthermore, for Wait-Die kill ourself immediately when possible (there are
+ * older contexts already waiting) to avoid unnecessary waiting and for
+ * Wound-Wait ensure we wound the owning context when it is younger.
+ */
+static inline int
+__ww_mutex_add_waiter(struct MUTEX_WAITER *waiter,
+ struct MUTEX *lock,
+ struct ww_acquire_ctx *ww_ctx)
+{
+ struct MUTEX_WAITER *cur, *pos = NULL;
+ bool is_wait_die;
+
+ if (!ww_ctx) {
+ __ww_waiter_add(lock, waiter, NULL);
+ return 0;
+ }
+
+ is_wait_die = ww_ctx->is_wait_die;
+
+ /*
+ * Add the waiter before the first waiter with a higher stamp.
+ * Waiters without a context are skipped to avoid starving
+ * them. Wait-Die waiters may die here. Wound-Wait waiters
+ * never die here, but they are sorted in stamp order and
+ * may wound the lock holder.
+ */
+ for (cur = __ww_waiter_last(lock); cur;
+ cur = __ww_waiter_prev(lock, cur)) {
+
+ if (!cur->ww_ctx)
+ continue;
+
+ if (__ww_ctx_less(ww_ctx, cur->ww_ctx)) {
+ /*
+ * Wait-Die: if we find an older context waiting, there
+ * is no point in queueing behind it, as we'd have to
+ * die the moment it would acquire the lock.
+ */
+ if (is_wait_die) {
+ int ret = __ww_mutex_kill(lock, ww_ctx);
+
+ if (ret)
+ return ret;
+ }
+
+ break;
+ }
+
+ pos = cur;
+
+ /* Wait-Die: ensure younger waiters die. */
+ __ww_mutex_die(lock, cur, ww_ctx);
+ }
+
+ __ww_waiter_add(lock, waiter, pos);
+
+ /*
+ * Wound-Wait: if we're blocking on a mutex owned by a younger context,
+ * wound that such that we might proceed.
+ */
+ if (!is_wait_die) {
+ struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
+
+ /*
+ * See ww_mutex_set_context_fastpath(). Orders setting
+ * MUTEX_FLAG_WAITERS vs the ww->ctx load,
+ * such that either we or the fastpath will wound @ww->ctx.
+ */
+ smp_mb();
+ __ww_mutex_wound(lock, ww_ctx, ww->ctx);
+ }
+
+ return 0;
+}
+
+static inline void __ww_mutex_unlock(struct ww_mutex *lock)
+{
+ if (lock->ctx) {
+#ifdef DEBUG_WW_MUTEXES
+ DEBUG_LOCKS_WARN_ON(!lock->ctx->acquired);
+#endif
+ if (lock->ctx->acquired > 0)
+ lock->ctx->acquired--;
+ lock->ctx = NULL;
+ }
+}
diff --git a/kernel/locking/ww_rt_mutex.c b/kernel/locking/ww_rt_mutex.c
new file mode 100644
index 000000000000..d1473c624105
--- /dev/null
+++ b/kernel/locking/ww_rt_mutex.c
@@ -0,0 +1,101 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * rtmutex API
+ */
+#include <linux/spinlock.h>
+#include <linux/export.h>
+
+#define RT_MUTEX_BUILD_MUTEX
+#define WW_RT
+#include "rtmutex.c"
+
+int ww_mutex_trylock(struct ww_mutex *lock, struct ww_acquire_ctx *ww_ctx)
+{
+ struct rt_mutex *rtm = &lock->base;
+
+ if (!ww_ctx)
+ return rt_mutex_trylock(rtm);
+
+ /*
+ * Reset the wounded flag after a kill. No other process can
+ * race and wound us here, since they can't have a valid owner
+ * pointer if we don't have any locks held.
+ */
+ if (ww_ctx->acquired == 0)
+ ww_ctx->wounded = 0;
+
+ if (__rt_mutex_trylock(&rtm->rtmutex)) {
+ ww_mutex_set_context_fastpath(lock, ww_ctx);
+ mutex_acquire_nest(&rtm->dep_map, 0, 1, &ww_ctx->dep_map, _RET_IP_);
+ return 1;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(ww_mutex_trylock);
+
+static int __sched
+__ww_rt_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ww_ctx,
+ unsigned int state, unsigned long ip)
+{
+ struct lockdep_map __maybe_unused *nest_lock = NULL;
+ struct rt_mutex *rtm = &lock->base;
+ int ret;
+
+ might_sleep();
+
+ if (ww_ctx) {
+ if (unlikely(ww_ctx == READ_ONCE(lock->ctx)))
+ return -EALREADY;
+
+ /*
+ * Reset the wounded flag after a kill. No other process can
+ * race and wound us here, since they can't have a valid owner
+ * pointer if we don't have any locks held.
+ */
+ if (ww_ctx->acquired == 0)
+ ww_ctx->wounded = 0;
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ nest_lock = &ww_ctx->dep_map;
+#endif
+ }
+ mutex_acquire_nest(&rtm->dep_map, 0, 0, nest_lock, ip);
+
+ if (likely(rt_mutex_cmpxchg_acquire(&rtm->rtmutex, NULL, current))) {
+ if (ww_ctx)
+ ww_mutex_set_context_fastpath(lock, ww_ctx);
+ return 0;
+ }
+
+ ret = rt_mutex_slowlock(&rtm->rtmutex, ww_ctx, state);
+
+ if (ret)
+ mutex_release(&rtm->dep_map, ip);
+ return ret;
+}
+
+int __sched
+ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
+{
+ return __ww_rt_mutex_lock(lock, ctx, TASK_UNINTERRUPTIBLE, _RET_IP_);
+}
+EXPORT_SYMBOL(ww_mutex_lock);
+
+int __sched
+ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
+{
+ return __ww_rt_mutex_lock(lock, ctx, TASK_INTERRUPTIBLE, _RET_IP_);
+}
+EXPORT_SYMBOL(ww_mutex_lock_interruptible);
+
+void __sched ww_mutex_unlock(struct ww_mutex *lock)
+{
+ struct rt_mutex *rtm = &lock->base;
+
+ __ww_mutex_unlock(lock);
+
+ mutex_release(&rtm->dep_map, _RET_IP_);
+ __rt_mutex_unlock(&rtm->rtmutex);
+}
+EXPORT_SYMBOL(ww_mutex_unlock);
diff --git a/kernel/module-internal.h b/kernel/module-internal.h
deleted file mode 100644
index 33783abc377b..000000000000
--- a/kernel/module-internal.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/* Module internals
- *
- * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- */
-
-#include <linux/elf.h>
-#include <asm/module.h>
-
-struct load_info {
- const char *name;
- /* pointer to module in temporary copy, freed at end of load_module() */
- struct module *mod;
- Elf_Ehdr *hdr;
- unsigned long len;
- Elf_Shdr *sechdrs;
- char *secstrings, *strtab;
- unsigned long symoffs, stroffs, init_typeoffs, core_typeoffs;
- struct _ddebug *debug;
- unsigned int num_debug;
- bool sig_ok;
-#ifdef CONFIG_KALLSYMS
- unsigned long mod_kallsyms_init_off;
-#endif
- struct {
- unsigned int sym, str, mod, vers, info, pcpu;
- } index;
-};
-
-extern int mod_verify_sig(const void *mod, struct load_info *info);
diff --git a/kernel/module/Kconfig b/kernel/module/Kconfig
new file mode 100644
index 000000000000..33a2e991f608
--- /dev/null
+++ b/kernel/module/Kconfig
@@ -0,0 +1,392 @@
+# SPDX-License-Identifier: GPL-2.0-only
+menuconfig MODULES
+ bool "Enable loadable module support"
+ modules
+ help
+ Kernel modules are small pieces of compiled code which can
+ be inserted in the running kernel, rather than being
+ permanently built into the kernel. You use the "modprobe"
+ tool to add (and sometimes remove) them. If you say Y here,
+ many parts of the kernel can be built as modules (by
+ answering M instead of Y where indicated): this is most
+ useful for infrequently used options which are not required
+ for booting. For more information, see the man pages for
+ modprobe, lsmod, modinfo, insmod and rmmod.
+
+ If you say Y here, you will need to run "make
+ modules_install" to put the modules under /lib/modules/
+ where modprobe can find them (you may need to be root to do
+ this).
+
+ If unsure, say Y.
+
+if MODULES
+
+config MODULE_DEBUGFS
+ bool
+
+config MODULE_DEBUG
+ bool "Module debugging"
+ depends on DEBUG_FS
+ help
+ Allows you to enable / disable features which can help you debug
+ modules. You don't need these options on production systems.
+
+if MODULE_DEBUG
+
+config MODULE_STATS
+ bool "Module statistics"
+ depends on DEBUG_FS
+ select MODULE_DEBUGFS
+ help
+ This option allows you to maintain a record of module statistics.
+ For example, size of all modules, average size, text size, a list
+ of failed modules and the size for each of those. For failed
+ modules we keep track of modules which failed due to either the
+ existing module taking too long to load or that module was already
+ loaded.
+
+ You should enable this if you are debugging production loads
+ and want to see if userspace or the kernel is doing stupid things
+ with loading modules when it shouldn't or if you want to help
+ optimize userspace / kernel space module autoloading schemes.
+ You might want to do this because failed modules tend to use
+ up significant amount of memory, and so you'd be doing everyone a
+ favor in avoiding these failures proactively.
+
+ This functionality is also useful for those experimenting with
+ module .text ELF section optimization.
+
+ If unsure, say N.
+
+config MODULE_DEBUG_AUTOLOAD_DUPS
+ bool "Debug duplicate modules with auto-loading"
+ help
+ Module autoloading allows in-kernel code to request modules through
+ the *request_module*() API calls. This in turn just calls userspace
+ modprobe. Although modprobe checks to see if a module is already
+ loaded before trying to load a module there is a small time window in
+ which multiple duplicate requests can end up in userspace and multiple
+ modprobe calls race calling finit_module() around the same time for
+ duplicate modules. The finit_module() system call can consume in the
+ worst case more than twice the respective module size in virtual
+ memory for each duplicate module requests. Although duplicate module
+ requests are non-fatal virtual memory is a limited resource and each
+ duplicate module request ends up just unnecessarily straining virtual
+ memory.
+
+ This debugging facility will create pr_warn() splats for duplicate
+ module requests to help identify if module auto-loading may be the
+ culprit to your early boot virtual memory pressure. Since virtual
+ memory abuse caused by duplicate module requests could render a
+ system unusable this functionality will also converge races in
+ requests for the same module to a single request. You can boot with
+ the module.enable_dups_trace=1 kernel parameter to use WARN_ON()
+ instead of the pr_warn().
+
+ If the first module request used request_module_nowait() we cannot
+ use that as the anchor to wait for duplicate module requests, since
+ users of request_module() do want a proper return value. If a call
+ for the same module happened earlier with request_module() though,
+ then a duplicate request_module_nowait() would be detected. The
+ non-wait request_module() call is synchronous and waits until modprobe
+ completes. Subsequent auto-loading requests for the same module do
+ not trigger a new finit_module() calls and do not strain virtual
+ memory, and so as soon as modprobe successfully completes we remove
+ tracking for duplicates for that module.
+
+ Enable this functionality to try to debug virtual memory abuse during
+ boot on systems which are failing to boot or if you suspect you may be
+ straining virtual memory during boot, and you want to identify if the
+ abuse was due to module auto-loading. These issues are currently only
+ known to occur on systems with many CPUs (over 400) and is likely the
+ result of udev issuing duplicate module requests for each CPU, and so
+ module auto-loading is not the culprit. There may very well still be
+ many duplicate module auto-loading requests which could be optimized
+ for and this debugging facility can be used to help identify them.
+
+ Only enable this for debugging system functionality, never have it
+ enabled on real systems.
+
+config MODULE_DEBUG_AUTOLOAD_DUPS_TRACE
+ bool "Force full stack trace when duplicates are found"
+ depends on MODULE_DEBUG_AUTOLOAD_DUPS
+ help
+ Enabling this will force a full stack trace for duplicate module
+ auto-loading requests using WARN_ON() instead of pr_warn(). You
+ should keep this disabled at all times unless you are a developer
+ and are doing a manual inspection and want to debug exactly why
+ these duplicates occur.
+
+endif # MODULE_DEBUG
+
+config MODULE_FORCE_LOAD
+ bool "Forced module loading"
+ default n
+ help
+ Allow loading of modules without version information (ie. modprobe
+ --force). Forced module loading sets the 'F' (forced) taint flag and
+ is usually a really bad idea.
+
+config MODULE_UNLOAD
+ bool "Module unloading"
+ help
+ Without this option you will not be able to unload any
+ modules (note that some modules may not be unloadable
+ anyway), which makes your kernel smaller, faster
+ and simpler. If unsure, say Y.
+
+config MODULE_FORCE_UNLOAD
+ bool "Forced module unloading"
+ depends on MODULE_UNLOAD
+ help
+ This option allows you to force a module to unload, even if the
+ kernel believes it is unsafe: the kernel will remove the module
+ without waiting for anyone to stop using it (using the -f option to
+ rmmod). This is mainly for kernel developers and desperate users.
+ If unsure, say N.
+
+config MODULE_UNLOAD_TAINT_TRACKING
+ bool "Tainted module unload tracking"
+ depends on MODULE_UNLOAD
+ select MODULE_DEBUGFS
+ help
+ This option allows you to maintain a record of each unloaded
+ module that tainted the kernel. In addition to displaying a
+ list of linked (or loaded) modules e.g. on detection of a bad
+ page (see bad_page()), the aforementioned details are also
+ shown. If unsure, say N.
+
+config MODVERSIONS
+ bool "Module versioning support"
+ help
+ Usually, you have to use modules compiled with your kernel.
+ Saying Y here makes it sometimes possible to use modules
+ compiled for different kernels, by adding enough information
+ to the modules to (hopefully) spot any changes which would
+ make them incompatible with the kernel you are running. If
+ unsure, say N.
+
+config ASM_MODVERSIONS
+ bool
+ default HAVE_ASM_MODVERSIONS && MODVERSIONS
+ help
+ This enables module versioning for exported symbols also from
+ assembly. This can be enabled only when the target architecture
+ supports it.
+
+config MODULE_SRCVERSION_ALL
+ bool "Source checksum for all modules"
+ help
+ Modules which contain a MODULE_VERSION get an extra "srcversion"
+ field inserted into their modinfo section, which contains a
+ sum of the source files which made it. This helps maintainers
+ see exactly which source was used to build a module (since
+ others sometimes change the module source without updating
+ the version). With this option, such a "srcversion" field
+ will be created for all modules. If unsure, say N.
+
+config MODULE_SIG
+ bool "Module signature verification"
+ select MODULE_SIG_FORMAT
+ help
+ Check modules for valid signatures upon load: the signature
+ is simply appended to the module. For more information see
+ <file:Documentation/admin-guide/module-signing.rst>.
+
+ Note that this option adds the OpenSSL development packages as a
+ kernel build dependency so that the signing tool can use its crypto
+ library.
+
+ You should enable this option if you wish to use either
+ CONFIG_SECURITY_LOCKDOWN_LSM or lockdown functionality imposed via
+ another LSM - otherwise unsigned modules will be loadable regardless
+ of the lockdown policy.
+
+ !!!WARNING!!! If you enable this option, you MUST make sure that the
+ module DOES NOT get stripped after being signed. This includes the
+ debuginfo strip done by some packagers (such as rpmbuild) and
+ inclusion into an initramfs that wants the module size reduced.
+
+config MODULE_SIG_FORCE
+ bool "Require modules to be validly signed"
+ depends on MODULE_SIG
+ help
+ Reject unsigned modules or signed modules for which we don't have a
+ key. Without this, such modules will simply taint the kernel.
+
+config MODULE_SIG_ALL
+ bool "Automatically sign all modules"
+ default y
+ depends on MODULE_SIG || IMA_APPRAISE_MODSIG
+ help
+ Sign all modules during make modules_install. Without this option,
+ modules must be signed manually, using the scripts/sign-file tool.
+
+comment "Do not forget to sign required modules with scripts/sign-file"
+ depends on MODULE_SIG_FORCE && !MODULE_SIG_ALL
+
+choice
+ prompt "Which hash algorithm should modules be signed with?"
+ depends on MODULE_SIG || IMA_APPRAISE_MODSIG
+ help
+ This determines which sort of hashing algorithm will be used during
+ signature generation. This algorithm _must_ be built into the kernel
+ directly so that signature verification can take place. It is not
+ possible to load a signed module containing the algorithm to check
+ the signature on that module.
+
+config MODULE_SIG_SHA1
+ bool "Sign modules with SHA-1"
+ select CRYPTO_SHA1
+
+config MODULE_SIG_SHA224
+ bool "Sign modules with SHA-224"
+ select CRYPTO_SHA256
+
+config MODULE_SIG_SHA256
+ bool "Sign modules with SHA-256"
+ select CRYPTO_SHA256
+
+config MODULE_SIG_SHA384
+ bool "Sign modules with SHA-384"
+ select CRYPTO_SHA512
+
+config MODULE_SIG_SHA512
+ bool "Sign modules with SHA-512"
+ select CRYPTO_SHA512
+
+endchoice
+
+config MODULE_SIG_HASH
+ string
+ depends on MODULE_SIG || IMA_APPRAISE_MODSIG
+ default "sha1" if MODULE_SIG_SHA1
+ default "sha224" if MODULE_SIG_SHA224
+ default "sha256" if MODULE_SIG_SHA256
+ default "sha384" if MODULE_SIG_SHA384
+ default "sha512" if MODULE_SIG_SHA512
+
+choice
+ prompt "Module compression mode"
+ help
+ This option allows you to choose the algorithm which will be used to
+ compress modules when 'make modules_install' is run. (or, you can
+ choose to not compress modules at all.)
+
+ External modules will also be compressed in the same way during the
+ installation.
+
+ For modules inside an initrd or initramfs, it's more efficient to
+ compress the whole initrd or initramfs instead.
+
+ This is fully compatible with signed modules.
+
+ Please note that the tool used to load modules needs to support the
+ corresponding algorithm. module-init-tools MAY support gzip, and kmod
+ MAY support gzip, xz and zstd.
+
+ Your build system needs to provide the appropriate compression tool
+ to compress the modules.
+
+ If in doubt, select 'None'.
+
+config MODULE_COMPRESS_NONE
+ bool "None"
+ help
+ Do not compress modules. The installed modules are suffixed
+ with .ko.
+
+config MODULE_COMPRESS_GZIP
+ bool "GZIP"
+ help
+ Compress modules with GZIP. The installed modules are suffixed
+ with .ko.gz.
+
+config MODULE_COMPRESS_XZ
+ bool "XZ"
+ help
+ Compress modules with XZ. The installed modules are suffixed
+ with .ko.xz.
+
+config MODULE_COMPRESS_ZSTD
+ bool "ZSTD"
+ help
+ Compress modules with ZSTD. The installed modules are suffixed
+ with .ko.zst.
+
+endchoice
+
+config MODULE_DECOMPRESS
+ bool "Support in-kernel module decompression"
+ depends on MODULE_COMPRESS_GZIP || MODULE_COMPRESS_XZ || MODULE_COMPRESS_ZSTD
+ select ZLIB_INFLATE if MODULE_COMPRESS_GZIP
+ select XZ_DEC if MODULE_COMPRESS_XZ
+ select ZSTD_DECOMPRESS if MODULE_COMPRESS_ZSTD
+ help
+
+ Support for decompressing kernel modules by the kernel itself
+ instead of relying on userspace to perform this task. Useful when
+ load pinning security policy is enabled.
+
+ If unsure, say N.
+
+config MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS
+ bool "Allow loading of modules with missing namespace imports"
+ help
+ Symbols exported with EXPORT_SYMBOL_NS*() are considered exported in
+ a namespace. A module that makes use of a symbol exported with such a
+ namespace is required to import the namespace via MODULE_IMPORT_NS().
+ There is no technical reason to enforce correct namespace imports,
+ but it creates consistency between symbols defining namespaces and
+ users importing namespaces they make use of. This option relaxes this
+ requirement and lifts the enforcement when loading a module.
+
+ If unsure, say N.
+
+config MODPROBE_PATH
+ string "Path to modprobe binary"
+ default "/sbin/modprobe"
+ help
+ When kernel code requests a module, it does so by calling
+ the "modprobe" userspace utility. This option allows you to
+ set the path where that binary is found. This can be changed
+ at runtime via the sysctl file
+ /proc/sys/kernel/modprobe. Setting this to the empty string
+ removes the kernel's ability to request modules (but
+ userspace can still load modules explicitly).
+
+config TRIM_UNUSED_KSYMS
+ bool "Trim unused exported kernel symbols" if EXPERT
+ depends on !COMPILE_TEST
+ help
+ The kernel and some modules make many symbols available for
+ other modules to use via EXPORT_SYMBOL() and variants. Depending
+ on the set of modules being selected in your kernel configuration,
+ many of those exported symbols might never be used.
+
+ This option allows for unused exported symbols to be dropped from
+ the build. In turn, this provides the compiler more opportunities
+ (especially when using LTO) for optimizing the code and reducing
+ binary size. This might have some security advantages as well.
+
+ If unsure, or if you need to build out-of-tree modules, say N.
+
+config UNUSED_KSYMS_WHITELIST
+ string "Whitelist of symbols to keep in ksymtab"
+ depends on TRIM_UNUSED_KSYMS
+ help
+ By default, all unused exported symbols will be un-exported from the
+ build when TRIM_UNUSED_KSYMS is selected.
+
+ UNUSED_KSYMS_WHITELIST allows to whitelist symbols that must be kept
+ exported at all times, even in absence of in-tree users. The value to
+ set here is the path to a text file containing the list of symbols,
+ one per line. The path can be absolute, or relative to the kernel
+ source tree.
+
+config MODULES_TREE_LOOKUP
+ def_bool y
+ depends on PERF_EVENTS || TRACING || CFI_CLANG
+
+endif # MODULES
diff --git a/kernel/module/Makefile b/kernel/module/Makefile
new file mode 100644
index 000000000000..a10b2b9a6fdf
--- /dev/null
+++ b/kernel/module/Makefile
@@ -0,0 +1,25 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# Makefile for linux kernel module support
+#
+
+# These are called from save_stack_trace() on slub debug path,
+# and produce insane amounts of uninteresting coverage.
+KCOV_INSTRUMENT_module.o := n
+
+obj-y += main.o
+obj-y += strict_rwx.o
+obj-y += kmod.o
+obj-$(CONFIG_MODULE_DEBUG_AUTOLOAD_DUPS) += dups.o
+obj-$(CONFIG_MODULE_DECOMPRESS) += decompress.o
+obj-$(CONFIG_MODULE_SIG) += signing.o
+obj-$(CONFIG_LIVEPATCH) += livepatch.o
+obj-$(CONFIG_MODULES_TREE_LOOKUP) += tree_lookup.o
+obj-$(CONFIG_DEBUG_KMEMLEAK) += debug_kmemleak.o
+obj-$(CONFIG_KALLSYMS) += kallsyms.o
+obj-$(CONFIG_PROC_FS) += procfs.o
+obj-$(CONFIG_SYSFS) += sysfs.o
+obj-$(CONFIG_KGDB_KDB) += kdb.o
+obj-$(CONFIG_MODVERSIONS) += version.o
+obj-$(CONFIG_MODULE_UNLOAD_TAINT_TRACKING) += tracking.o
+obj-$(CONFIG_MODULE_STATS) += stats.o
diff --git a/kernel/module/debug_kmemleak.c b/kernel/module/debug_kmemleak.c
new file mode 100644
index 000000000000..12a569d361e8
--- /dev/null
+++ b/kernel/module/debug_kmemleak.c
@@ -0,0 +1,30 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Module kmemleak support
+ *
+ * Copyright (C) 2009 Catalin Marinas
+ */
+
+#include <linux/module.h>
+#include <linux/kmemleak.h>
+#include "internal.h"
+
+void kmemleak_load_module(const struct module *mod,
+ const struct load_info *info)
+{
+ unsigned int i;
+
+ /* only scan the sections containing data */
+ kmemleak_scan_area(mod, sizeof(struct module), GFP_KERNEL);
+
+ for (i = 1; i < info->hdr->e_shnum; i++) {
+ /* Scan all writable sections that's not executable */
+ if (!(info->sechdrs[i].sh_flags & SHF_ALLOC) ||
+ !(info->sechdrs[i].sh_flags & SHF_WRITE) ||
+ (info->sechdrs[i].sh_flags & SHF_EXECINSTR))
+ continue;
+
+ kmemleak_scan_area((void *)info->sechdrs[i].sh_addr,
+ info->sechdrs[i].sh_size, GFP_KERNEL);
+ }
+}
diff --git a/kernel/module/decompress.c b/kernel/module/decompress.c
new file mode 100644
index 000000000000..e97232b125eb
--- /dev/null
+++ b/kernel/module/decompress.c
@@ -0,0 +1,368 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright 2021 Google LLC.
+ */
+
+#include <linux/init.h>
+#include <linux/highmem.h>
+#include <linux/kobject.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/sysfs.h>
+#include <linux/vmalloc.h>
+
+#include "internal.h"
+
+static int module_extend_max_pages(struct load_info *info, unsigned int extent)
+{
+ struct page **new_pages;
+
+ new_pages = kvmalloc_array(info->max_pages + extent,
+ sizeof(info->pages), GFP_KERNEL);
+ if (!new_pages)
+ return -ENOMEM;
+
+ memcpy(new_pages, info->pages, info->max_pages * sizeof(info->pages));
+ kvfree(info->pages);
+ info->pages = new_pages;
+ info->max_pages += extent;
+
+ return 0;
+}
+
+static struct page *module_get_next_page(struct load_info *info)
+{
+ struct page *page;
+ int error;
+
+ if (info->max_pages == info->used_pages) {
+ error = module_extend_max_pages(info, info->used_pages);
+ if (error)
+ return ERR_PTR(error);
+ }
+
+ page = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
+ if (!page)
+ return ERR_PTR(-ENOMEM);
+
+ info->pages[info->used_pages++] = page;
+ return page;
+}
+
+#if defined(CONFIG_MODULE_COMPRESS_GZIP)
+#include <linux/zlib.h>
+#define MODULE_COMPRESSION gzip
+#define MODULE_DECOMPRESS_FN module_gzip_decompress
+
+/*
+ * Calculate length of the header which consists of signature, header
+ * flags, time stamp and operating system ID (10 bytes total), plus
+ * an optional filename.
+ */
+static size_t module_gzip_header_len(const u8 *buf, size_t size)
+{
+ const u8 signature[] = { 0x1f, 0x8b, 0x08 };
+ size_t len = 10;
+
+ if (size < len || memcmp(buf, signature, sizeof(signature)))
+ return 0;
+
+ if (buf[3] & 0x08) {
+ do {
+ /*
+ * If we can't find the end of the file name we must
+ * be dealing with a corrupted file.
+ */
+ if (len == size)
+ return 0;
+ } while (buf[len++] != '\0');
+ }
+
+ return len;
+}
+
+static ssize_t module_gzip_decompress(struct load_info *info,
+ const void *buf, size_t size)
+{
+ struct z_stream_s s = { 0 };
+ size_t new_size = 0;
+ size_t gzip_hdr_len;
+ ssize_t retval;
+ int rc;
+
+ gzip_hdr_len = module_gzip_header_len(buf, size);
+ if (!gzip_hdr_len) {
+ pr_err("not a gzip compressed module\n");
+ return -EINVAL;
+ }
+
+ s.next_in = buf + gzip_hdr_len;
+ s.avail_in = size - gzip_hdr_len;
+
+ s.workspace = kmalloc(zlib_inflate_workspacesize(), GFP_KERNEL);
+ if (!s.workspace)
+ return -ENOMEM;
+
+ rc = zlib_inflateInit2(&s, -MAX_WBITS);
+ if (rc != Z_OK) {
+ pr_err("failed to initialize decompressor: %d\n", rc);
+ retval = -EINVAL;
+ goto out;
+ }
+
+ do {
+ struct page *page = module_get_next_page(info);
+
+ if (IS_ERR(page)) {
+ retval = PTR_ERR(page);
+ goto out_inflate_end;
+ }
+
+ s.next_out = kmap_local_page(page);
+ s.avail_out = PAGE_SIZE;
+ rc = zlib_inflate(&s, 0);
+ kunmap_local(s.next_out);
+
+ new_size += PAGE_SIZE - s.avail_out;
+ } while (rc == Z_OK);
+
+ if (rc != Z_STREAM_END) {
+ pr_err("decompression failed with status %d\n", rc);
+ retval = -EINVAL;
+ goto out_inflate_end;
+ }
+
+ retval = new_size;
+
+out_inflate_end:
+ zlib_inflateEnd(&s);
+out:
+ kfree(s.workspace);
+ return retval;
+}
+#elif defined(CONFIG_MODULE_COMPRESS_XZ)
+#include <linux/xz.h>
+#define MODULE_COMPRESSION xz
+#define MODULE_DECOMPRESS_FN module_xz_decompress
+
+static ssize_t module_xz_decompress(struct load_info *info,
+ const void *buf, size_t size)
+{
+ static const u8 signature[] = { 0xfd, '7', 'z', 'X', 'Z', 0 };
+ struct xz_dec *xz_dec;
+ struct xz_buf xz_buf;
+ enum xz_ret xz_ret;
+ size_t new_size = 0;
+ ssize_t retval;
+
+ if (size < sizeof(signature) ||
+ memcmp(buf, signature, sizeof(signature))) {
+ pr_err("not an xz compressed module\n");
+ return -EINVAL;
+ }
+
+ xz_dec = xz_dec_init(XZ_DYNALLOC, (u32)-1);
+ if (!xz_dec)
+ return -ENOMEM;
+
+ xz_buf.in_size = size;
+ xz_buf.in = buf;
+ xz_buf.in_pos = 0;
+
+ do {
+ struct page *page = module_get_next_page(info);
+
+ if (IS_ERR(page)) {
+ retval = PTR_ERR(page);
+ goto out;
+ }
+
+ xz_buf.out = kmap_local_page(page);
+ xz_buf.out_pos = 0;
+ xz_buf.out_size = PAGE_SIZE;
+ xz_ret = xz_dec_run(xz_dec, &xz_buf);
+ kunmap_local(xz_buf.out);
+
+ new_size += xz_buf.out_pos;
+ } while (xz_buf.out_pos == PAGE_SIZE && xz_ret == XZ_OK);
+
+ if (xz_ret != XZ_STREAM_END) {
+ pr_err("decompression failed with status %d\n", xz_ret);
+ retval = -EINVAL;
+ goto out;
+ }
+
+ retval = new_size;
+
+ out:
+ xz_dec_end(xz_dec);
+ return retval;
+}
+#elif defined(CONFIG_MODULE_COMPRESS_ZSTD)
+#include <linux/zstd.h>
+#define MODULE_COMPRESSION zstd
+#define MODULE_DECOMPRESS_FN module_zstd_decompress
+
+static ssize_t module_zstd_decompress(struct load_info *info,
+ const void *buf, size_t size)
+{
+ static const u8 signature[] = { 0x28, 0xb5, 0x2f, 0xfd };
+ ZSTD_outBuffer zstd_dec;
+ ZSTD_inBuffer zstd_buf;
+ zstd_frame_header header;
+ size_t wksp_size;
+ void *wksp = NULL;
+ ZSTD_DStream *dstream;
+ size_t ret;
+ size_t new_size = 0;
+ int retval;
+
+ if (size < sizeof(signature) ||
+ memcmp(buf, signature, sizeof(signature))) {
+ pr_err("not a zstd compressed module\n");
+ return -EINVAL;
+ }
+
+ zstd_buf.src = buf;
+ zstd_buf.pos = 0;
+ zstd_buf.size = size;
+
+ ret = zstd_get_frame_header(&header, zstd_buf.src, zstd_buf.size);
+ if (ret != 0) {
+ pr_err("ZSTD-compressed data has an incomplete frame header\n");
+ retval = -EINVAL;
+ goto out;
+ }
+ if (header.windowSize > (1 << ZSTD_WINDOWLOG_MAX)) {
+ pr_err("ZSTD-compressed data has too large a window size\n");
+ retval = -EINVAL;
+ goto out;
+ }
+
+ wksp_size = zstd_dstream_workspace_bound(header.windowSize);
+ wksp = kmalloc(wksp_size, GFP_KERNEL);
+ if (!wksp) {
+ retval = -ENOMEM;
+ goto out;
+ }
+
+ dstream = zstd_init_dstream(header.windowSize, wksp, wksp_size);
+ if (!dstream) {
+ pr_err("Can't initialize ZSTD stream\n");
+ retval = -ENOMEM;
+ goto out;
+ }
+
+ do {
+ struct page *page = module_get_next_page(info);
+
+ if (!IS_ERR(page)) {
+ retval = PTR_ERR(page);
+ goto out;
+ }
+
+ zstd_dec.dst = kmap_local_page(page);
+ zstd_dec.pos = 0;
+ zstd_dec.size = PAGE_SIZE;
+
+ ret = zstd_decompress_stream(dstream, &zstd_dec, &zstd_buf);
+ kunmap_local(zstd_dec.dst);
+ retval = zstd_get_error_code(ret);
+ if (retval)
+ break;
+
+ new_size += zstd_dec.pos;
+ } while (zstd_dec.pos == PAGE_SIZE && ret != 0);
+
+ if (retval) {
+ pr_err("ZSTD-decompression failed with status %d\n", retval);
+ retval = -EINVAL;
+ goto out;
+ }
+
+ retval = new_size;
+
+ out:
+ kfree(wksp);
+ return retval;
+}
+#else
+#error "Unexpected configuration for CONFIG_MODULE_DECOMPRESS"
+#endif
+
+int module_decompress(struct load_info *info, const void *buf, size_t size)
+{
+ unsigned int n_pages;
+ ssize_t data_size;
+ int error;
+
+#if defined(CONFIG_MODULE_STATS)
+ info->compressed_len = size;
+#endif
+
+ /*
+ * Start with number of pages twice as big as needed for
+ * compressed data.
+ */
+ n_pages = DIV_ROUND_UP(size, PAGE_SIZE) * 2;
+ error = module_extend_max_pages(info, n_pages);
+
+ data_size = MODULE_DECOMPRESS_FN(info, buf, size);
+ if (data_size < 0) {
+ error = data_size;
+ goto err;
+ }
+
+ info->hdr = vmap(info->pages, info->used_pages, VM_MAP, PAGE_KERNEL);
+ if (!info->hdr) {
+ error = -ENOMEM;
+ goto err;
+ }
+
+ info->len = data_size;
+ return 0;
+
+err:
+ module_decompress_cleanup(info);
+ return error;
+}
+
+void module_decompress_cleanup(struct load_info *info)
+{
+ int i;
+
+ if (info->hdr)
+ vunmap(info->hdr);
+
+ for (i = 0; i < info->used_pages; i++)
+ __free_page(info->pages[i]);
+
+ kvfree(info->pages);
+
+ info->pages = NULL;
+ info->max_pages = info->used_pages = 0;
+}
+
+#ifdef CONFIG_SYSFS
+static ssize_t compression_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, __stringify(MODULE_COMPRESSION) "\n");
+}
+
+static struct kobj_attribute module_compression_attr = __ATTR_RO(compression);
+
+static int __init module_decompress_sysfs_init(void)
+{
+ int error;
+
+ error = sysfs_create_file(&module_kset->kobj,
+ &module_compression_attr.attr);
+ if (error)
+ pr_warn("Failed to create 'compression' attribute");
+
+ return 0;
+}
+late_initcall(module_decompress_sysfs_init);
+#endif
diff --git a/kernel/module/dups.c b/kernel/module/dups.c
new file mode 100644
index 000000000000..f3d7ea1e96d8
--- /dev/null
+++ b/kernel/module/dups.c
@@ -0,0 +1,248 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * kmod dups - the kernel module autoloader duplicate suppressor
+ *
+ * Copyright (C) 2023 Luis Chamberlain <mcgrof@kernel.org>
+ */
+
+#define pr_fmt(fmt) "module: " fmt
+
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/sched/task.h>
+#include <linux/binfmts.h>
+#include <linux/syscalls.h>
+#include <linux/unistd.h>
+#include <linux/kmod.h>
+#include <linux/slab.h>
+#include <linux/completion.h>
+#include <linux/cred.h>
+#include <linux/file.h>
+#include <linux/fdtable.h>
+#include <linux/workqueue.h>
+#include <linux/security.h>
+#include <linux/mount.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/resource.h>
+#include <linux/notifier.h>
+#include <linux/suspend.h>
+#include <linux/rwsem.h>
+#include <linux/ptrace.h>
+#include <linux/async.h>
+#include <linux/uaccess.h>
+
+#include "internal.h"
+
+#undef MODULE_PARAM_PREFIX
+#define MODULE_PARAM_PREFIX "module."
+static bool enable_dups_trace = IS_ENABLED(CONFIG_MODULE_DEBUG_AUTOLOAD_DUPS_TRACE);
+module_param(enable_dups_trace, bool_enable_only, 0644);
+
+/*
+ * Protects dup_kmod_reqs list, adds / removals with RCU.
+ */
+static DEFINE_MUTEX(kmod_dup_mutex);
+static LIST_HEAD(dup_kmod_reqs);
+
+struct kmod_dup_req {
+ struct list_head list;
+ char name[MODULE_NAME_LEN];
+ struct completion first_req_done;
+ struct work_struct complete_work;
+ struct delayed_work delete_work;
+ int dup_ret;
+};
+
+static struct kmod_dup_req *kmod_dup_request_lookup(char *module_name)
+{
+ struct kmod_dup_req *kmod_req;
+
+ list_for_each_entry_rcu(kmod_req, &dup_kmod_reqs, list,
+ lockdep_is_held(&kmod_dup_mutex)) {
+ if (strlen(kmod_req->name) == strlen(module_name) &&
+ !memcmp(kmod_req->name, module_name, strlen(module_name))) {
+ return kmod_req;
+ }
+ }
+
+ return NULL;
+}
+
+static void kmod_dup_request_delete(struct work_struct *work)
+{
+ struct kmod_dup_req *kmod_req;
+ kmod_req = container_of(to_delayed_work(work), struct kmod_dup_req, delete_work);
+
+ /*
+ * The typical situation is a module successully loaded. In that
+ * situation the module will be present already in userspace. If
+ * new requests come in after that, userspace will already know the
+ * module is loaded so will just return 0 right away. There is still
+ * a small chance right after we delete this entry new request_module()
+ * calls may happen after that, they can happen. These heuristics
+ * are to protect finit_module() abuse for auto-loading, if modules
+ * are still tryign to auto-load even if a module is already loaded,
+ * that's on them, and those inneficiencies should not be fixed by
+ * kmod. The inneficies there are a call to modprobe and modprobe
+ * just returning 0.
+ */
+ mutex_lock(&kmod_dup_mutex);
+ list_del_rcu(&kmod_req->list);
+ synchronize_rcu();
+ mutex_unlock(&kmod_dup_mutex);
+ kfree(kmod_req);
+}
+
+static void kmod_dup_request_complete(struct work_struct *work)
+{
+ struct kmod_dup_req *kmod_req;
+
+ kmod_req = container_of(work, struct kmod_dup_req, complete_work);
+
+ /*
+ * This will ensure that the kernel will let all the waiters get
+ * informed its time to check the return value. It's time to
+ * go home.
+ */
+ complete_all(&kmod_req->first_req_done);
+
+ /*
+ * Now that we have allowed prior request_module() calls to go on
+ * with life, let's schedule deleting this entry. We don't have
+ * to do it right away, but we *eventually* want to do it so to not
+ * let this linger forever as this is just a boot optimization for
+ * possible abuses of vmalloc() incurred by finit_module() thrashing.
+ */
+ queue_delayed_work(system_wq, &kmod_req->delete_work, 60 * HZ);
+}
+
+bool kmod_dup_request_exists_wait(char *module_name, bool wait, int *dup_ret)
+{
+ struct kmod_dup_req *kmod_req, *new_kmod_req;
+ int ret;
+
+ /*
+ * Pre-allocate the entry in case we have to use it later
+ * to avoid contention with the mutex.
+ */
+ new_kmod_req = kzalloc(sizeof(*new_kmod_req), GFP_KERNEL);
+ if (!new_kmod_req)
+ return false;
+
+ memcpy(new_kmod_req->name, module_name, strlen(module_name));
+ INIT_WORK(&new_kmod_req->complete_work, kmod_dup_request_complete);
+ INIT_DELAYED_WORK(&new_kmod_req->delete_work, kmod_dup_request_delete);
+ init_completion(&new_kmod_req->first_req_done);
+
+ mutex_lock(&kmod_dup_mutex);
+
+ kmod_req = kmod_dup_request_lookup(module_name);
+ if (!kmod_req) {
+ /*
+ * If the first request that came through for a module
+ * was with request_module_nowait() we cannot wait for it
+ * and share its return value with other users which may
+ * have used request_module() and need a proper return value
+ * so just skip using them as an anchor.
+ *
+ * If a prior request to this one came through with
+ * request_module() though, then a request_module_nowait()
+ * would benefit from duplicate detection.
+ */
+ if (!wait) {
+ kfree(new_kmod_req);
+ pr_debug("New request_module_nowait() for %s -- cannot track duplicates for this request\n", module_name);
+ mutex_unlock(&kmod_dup_mutex);
+ return false;
+ }
+
+ /*
+ * There was no duplicate, just add the request so we can
+ * keep tab on duplicates later.
+ */
+ pr_debug("New request_module() for %s\n", module_name);
+ list_add_rcu(&new_kmod_req->list, &dup_kmod_reqs);
+ mutex_unlock(&kmod_dup_mutex);
+ return false;
+ }
+ mutex_unlock(&kmod_dup_mutex);
+
+ /* We are dealing with a duplicate request now */
+ kfree(new_kmod_req);
+
+ /*
+ * To fix these try to use try_then_request_module() instead as that
+ * will check if the component you are looking for is present or not.
+ * You could also just queue a single request to load the module once,
+ * instead of having each and everything you need try to request for
+ * the module.
+ *
+ * Duplicate request_module() calls can cause quite a bit of wasted
+ * vmalloc() space when racing with userspace.
+ */
+ if (enable_dups_trace)
+ WARN(1, "module-autoload: duplicate request for module %s\n", module_name);
+ else
+ pr_warn("module-autoload: duplicate request for module %s\n", module_name);
+
+ if (!wait) {
+ /*
+ * If request_module_nowait() was used then the user just
+ * wanted to issue the request and if another module request
+ * was already its way with the same name we don't care for
+ * the return value either. Let duplicate request_module_nowait()
+ * calls bail out right away.
+ */
+ *dup_ret = 0;
+ return true;
+ }
+
+ /*
+ * If a duplicate request_module() was used they *may* care for
+ * the return value, so we have no other option but to wait for
+ * the first caller to complete. If the first caller used
+ * the request_module_nowait() call, subsquent callers will
+ * deal with the comprmise of getting a successful call with this
+ * optimization enabled ...
+ */
+ ret = wait_for_completion_state(&kmod_req->first_req_done,
+ TASK_UNINTERRUPTIBLE | TASK_KILLABLE);
+ if (ret) {
+ *dup_ret = ret;
+ return true;
+ }
+
+ /* Now the duplicate request has the same exact return value as the first request */
+ *dup_ret = kmod_req->dup_ret;
+
+ return true;
+}
+
+void kmod_dup_request_announce(char *module_name, int ret)
+{
+ struct kmod_dup_req *kmod_req;
+
+ mutex_lock(&kmod_dup_mutex);
+
+ kmod_req = kmod_dup_request_lookup(module_name);
+ if (!kmod_req)
+ goto out;
+
+ kmod_req->dup_ret = ret;
+
+ /*
+ * If we complete() here we may allow duplicate threads
+ * to continue before the first one that submitted the
+ * request. We're in no rush also, given that each and
+ * every bounce back to userspace is slow we avoid that
+ * with a slight delay here. So queueue up the completion
+ * and let duplicates suffer, just wait a tad bit longer.
+ * There is no rush. But we also don't want to hold the
+ * caller up forever or introduce any boot delays.
+ */
+ queue_work(system_wq, &kmod_req->complete_work);
+
+out:
+ mutex_unlock(&kmod_dup_mutex);
+}
diff --git a/kernel/module/internal.h b/kernel/module/internal.h
new file mode 100644
index 000000000000..dc7b0160c480
--- /dev/null
+++ b/kernel/module/internal.h
@@ -0,0 +1,394 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* Module internals
+ *
+ * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ * Copyright (C) 2023 Luis Chamberlain <mcgrof@kernel.org>
+ */
+
+#include <linux/elf.h>
+#include <linux/compiler.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/rculist.h>
+#include <linux/rcupdate.h>
+#include <linux/mm.h>
+
+#ifndef ARCH_SHF_SMALL
+#define ARCH_SHF_SMALL 0
+#endif
+
+/*
+ * Use highest 4 bits of sh_entsize to store the mod_mem_type of this
+ * section. This leaves 28 bits for offset on 32-bit systems, which is
+ * about 256 MiB (WARN_ON_ONCE if we exceed that).
+ */
+
+#define SH_ENTSIZE_TYPE_BITS 4
+#define SH_ENTSIZE_TYPE_SHIFT (BITS_PER_LONG - SH_ENTSIZE_TYPE_BITS)
+#define SH_ENTSIZE_TYPE_MASK ((1UL << SH_ENTSIZE_TYPE_BITS) - 1)
+#define SH_ENTSIZE_OFFSET_MASK ((1UL << (BITS_PER_LONG - SH_ENTSIZE_TYPE_BITS)) - 1)
+
+/* Maximum number of characters written by module_flags() */
+#define MODULE_FLAGS_BUF_SIZE (TAINT_FLAGS_COUNT + 4)
+
+extern struct mutex module_mutex;
+extern struct list_head modules;
+
+extern struct module_attribute *modinfo_attrs[];
+extern size_t modinfo_attrs_count;
+
+/* Provided by the linker */
+extern const struct kernel_symbol __start___ksymtab[];
+extern const struct kernel_symbol __stop___ksymtab[];
+extern const struct kernel_symbol __start___ksymtab_gpl[];
+extern const struct kernel_symbol __stop___ksymtab_gpl[];
+extern const s32 __start___kcrctab[];
+extern const s32 __start___kcrctab_gpl[];
+
+struct load_info {
+ const char *name;
+ /* pointer to module in temporary copy, freed at end of load_module() */
+ struct module *mod;
+ Elf_Ehdr *hdr;
+ unsigned long len;
+ Elf_Shdr *sechdrs;
+ char *secstrings, *strtab;
+ unsigned long symoffs, stroffs, init_typeoffs, core_typeoffs;
+ bool sig_ok;
+#ifdef CONFIG_KALLSYMS
+ unsigned long mod_kallsyms_init_off;
+#endif
+#ifdef CONFIG_MODULE_DECOMPRESS
+#ifdef CONFIG_MODULE_STATS
+ unsigned long compressed_len;
+#endif
+ struct page **pages;
+ unsigned int max_pages;
+ unsigned int used_pages;
+#endif
+ struct {
+ unsigned int sym, str, mod, vers, info, pcpu;
+ } index;
+};
+
+enum mod_license {
+ NOT_GPL_ONLY,
+ GPL_ONLY,
+};
+
+struct find_symbol_arg {
+ /* Input */
+ const char *name;
+ bool gplok;
+ bool warn;
+
+ /* Output */
+ struct module *owner;
+ const s32 *crc;
+ const struct kernel_symbol *sym;
+ enum mod_license license;
+};
+
+int mod_verify_sig(const void *mod, struct load_info *info);
+int try_to_force_load(struct module *mod, const char *reason);
+bool find_symbol(struct find_symbol_arg *fsa);
+struct module *find_module_all(const char *name, size_t len, bool even_unformed);
+int cmp_name(const void *name, const void *sym);
+long module_get_offset_and_type(struct module *mod, enum mod_mem_type type,
+ Elf_Shdr *sechdr, unsigned int section);
+char *module_flags(struct module *mod, char *buf, bool show_state);
+size_t module_flags_taint(unsigned long taints, char *buf);
+
+char *module_next_tag_pair(char *string, unsigned long *secsize);
+
+#define for_each_modinfo_entry(entry, info, name) \
+ for (entry = get_modinfo(info, name); entry; entry = get_next_modinfo(info, name, entry))
+
+static inline void module_assert_mutex_or_preempt(void)
+{
+#ifdef CONFIG_LOCKDEP
+ if (unlikely(!debug_locks))
+ return;
+
+ WARN_ON_ONCE(!rcu_read_lock_sched_held() &&
+ !lockdep_is_held(&module_mutex));
+#endif
+}
+
+static inline unsigned long kernel_symbol_value(const struct kernel_symbol *sym)
+{
+#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS
+ return (unsigned long)offset_to_ptr(&sym->value_offset);
+#else
+ return sym->value;
+#endif
+}
+
+#ifdef CONFIG_LIVEPATCH
+int copy_module_elf(struct module *mod, struct load_info *info);
+void free_module_elf(struct module *mod);
+#else /* !CONFIG_LIVEPATCH */
+static inline int copy_module_elf(struct module *mod, struct load_info *info)
+{
+ return 0;
+}
+
+static inline void free_module_elf(struct module *mod) { }
+#endif /* CONFIG_LIVEPATCH */
+
+static inline bool set_livepatch_module(struct module *mod)
+{
+#ifdef CONFIG_LIVEPATCH
+ mod->klp = true;
+ return true;
+#else
+ return false;
+#endif
+}
+
+/**
+ * enum fail_dup_mod_reason - state at which a duplicate module was detected
+ *
+ * @FAIL_DUP_MOD_BECOMING: the module is read properly, passes all checks but
+ * we've determined that another module with the same name is already loaded
+ * or being processed on our &modules list. This happens on early_mod_check()
+ * right before layout_and_allocate(). The kernel would have already
+ * vmalloc()'d space for the entire module through finit_module(). If
+ * decompression was used two vmap() spaces were used. These failures can
+ * happen when userspace has not seen the module present on the kernel and
+ * tries to load the module multiple times at same time.
+ * @FAIL_DUP_MOD_LOAD: the module has been read properly, passes all validation
+ * checks and the kernel determines that the module was unique and because
+ * of this allocated yet another private kernel copy of the module space in
+ * layout_and_allocate() but after this determined in add_unformed_module()
+ * that another module with the same name is already loaded or being processed.
+ * These failures should be mitigated as much as possible and are indicative
+ * of really fast races in loading modules. Without module decompression
+ * they waste twice as much vmap space. With module decompression three
+ * times the module's size vmap space is wasted.
+ */
+enum fail_dup_mod_reason {
+ FAIL_DUP_MOD_BECOMING = 0,
+ FAIL_DUP_MOD_LOAD,
+};
+
+#ifdef CONFIG_MODULE_DEBUGFS
+extern struct dentry *mod_debugfs_root;
+#endif
+
+#ifdef CONFIG_MODULE_STATS
+
+#define mod_stat_add_long(count, var) atomic_long_add(count, var)
+#define mod_stat_inc(name) atomic_inc(name)
+
+extern atomic_long_t total_mod_size;
+extern atomic_long_t total_text_size;
+extern atomic_long_t invalid_kread_bytes;
+extern atomic_long_t invalid_decompress_bytes;
+
+extern atomic_t modcount;
+extern atomic_t failed_kreads;
+extern atomic_t failed_decompress;
+struct mod_fail_load {
+ struct list_head list;
+ char name[MODULE_NAME_LEN];
+ atomic_long_t count;
+ unsigned long dup_fail_mask;
+};
+
+int try_add_failed_module(const char *name, enum fail_dup_mod_reason reason);
+void mod_stat_bump_invalid(struct load_info *info, int flags);
+void mod_stat_bump_becoming(struct load_info *info, int flags);
+
+#else
+
+#define mod_stat_add_long(name, var)
+#define mod_stat_inc(name)
+
+static inline int try_add_failed_module(const char *name,
+ enum fail_dup_mod_reason reason)
+{
+ return 0;
+}
+
+static inline void mod_stat_bump_invalid(struct load_info *info, int flags)
+{
+}
+
+static inline void mod_stat_bump_becoming(struct load_info *info, int flags)
+{
+}
+
+#endif /* CONFIG_MODULE_STATS */
+
+#ifdef CONFIG_MODULE_DEBUG_AUTOLOAD_DUPS
+bool kmod_dup_request_exists_wait(char *module_name, bool wait, int *dup_ret);
+void kmod_dup_request_announce(char *module_name, int ret);
+#else
+static inline bool kmod_dup_request_exists_wait(char *module_name, bool wait, int *dup_ret)
+{
+ return false;
+}
+
+static inline void kmod_dup_request_announce(char *module_name, int ret)
+{
+}
+#endif
+
+#ifdef CONFIG_MODULE_UNLOAD_TAINT_TRACKING
+struct mod_unload_taint {
+ struct list_head list;
+ char name[MODULE_NAME_LEN];
+ unsigned long taints;
+ u64 count;
+};
+
+int try_add_tainted_module(struct module *mod);
+void print_unloaded_tainted_modules(void);
+#else /* !CONFIG_MODULE_UNLOAD_TAINT_TRACKING */
+static inline int try_add_tainted_module(struct module *mod)
+{
+ return 0;
+}
+
+static inline void print_unloaded_tainted_modules(void)
+{
+}
+#endif /* CONFIG_MODULE_UNLOAD_TAINT_TRACKING */
+
+#ifdef CONFIG_MODULE_DECOMPRESS
+int module_decompress(struct load_info *info, const void *buf, size_t size);
+void module_decompress_cleanup(struct load_info *info);
+#else
+static inline int module_decompress(struct load_info *info,
+ const void *buf, size_t size)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline void module_decompress_cleanup(struct load_info *info)
+{
+}
+#endif
+
+struct mod_tree_root {
+#ifdef CONFIG_MODULES_TREE_LOOKUP
+ struct latch_tree_root root;
+#endif
+ unsigned long addr_min;
+ unsigned long addr_max;
+#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
+ unsigned long data_addr_min;
+ unsigned long data_addr_max;
+#endif
+};
+
+extern struct mod_tree_root mod_tree;
+
+#ifdef CONFIG_MODULES_TREE_LOOKUP
+void mod_tree_insert(struct module *mod);
+void mod_tree_remove_init(struct module *mod);
+void mod_tree_remove(struct module *mod);
+struct module *mod_find(unsigned long addr, struct mod_tree_root *tree);
+#else /* !CONFIG_MODULES_TREE_LOOKUP */
+
+static inline void mod_tree_insert(struct module *mod) { }
+static inline void mod_tree_remove_init(struct module *mod) { }
+static inline void mod_tree_remove(struct module *mod) { }
+static inline struct module *mod_find(unsigned long addr, struct mod_tree_root *tree)
+{
+ struct module *mod;
+
+ list_for_each_entry_rcu(mod, &modules, list,
+ lockdep_is_held(&module_mutex)) {
+ if (within_module(addr, mod))
+ return mod;
+ }
+
+ return NULL;
+}
+#endif /* CONFIG_MODULES_TREE_LOOKUP */
+
+void module_enable_ro(const struct module *mod, bool after_init);
+void module_enable_nx(const struct module *mod);
+void module_enable_x(const struct module *mod);
+int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
+ char *secstrings, struct module *mod);
+
+#ifdef CONFIG_MODULE_SIG
+int module_sig_check(struct load_info *info, int flags);
+#else /* !CONFIG_MODULE_SIG */
+static inline int module_sig_check(struct load_info *info, int flags)
+{
+ return 0;
+}
+#endif /* !CONFIG_MODULE_SIG */
+
+#ifdef CONFIG_DEBUG_KMEMLEAK
+void kmemleak_load_module(const struct module *mod, const struct load_info *info);
+#else /* !CONFIG_DEBUG_KMEMLEAK */
+static inline void kmemleak_load_module(const struct module *mod,
+ const struct load_info *info) { }
+#endif /* CONFIG_DEBUG_KMEMLEAK */
+
+#ifdef CONFIG_KALLSYMS
+void init_build_id(struct module *mod, const struct load_info *info);
+void layout_symtab(struct module *mod, struct load_info *info);
+void add_kallsyms(struct module *mod, const struct load_info *info);
+
+static inline bool sect_empty(const Elf_Shdr *sect)
+{
+ return !(sect->sh_flags & SHF_ALLOC) || sect->sh_size == 0;
+}
+#else /* !CONFIG_KALLSYMS */
+static inline void init_build_id(struct module *mod, const struct load_info *info) { }
+static inline void layout_symtab(struct module *mod, struct load_info *info) { }
+static inline void add_kallsyms(struct module *mod, const struct load_info *info) { }
+#endif /* CONFIG_KALLSYMS */
+
+#ifdef CONFIG_SYSFS
+int mod_sysfs_setup(struct module *mod, const struct load_info *info,
+ struct kernel_param *kparam, unsigned int num_params);
+void mod_sysfs_teardown(struct module *mod);
+void init_param_lock(struct module *mod);
+#else /* !CONFIG_SYSFS */
+static inline int mod_sysfs_setup(struct module *mod,
+ const struct load_info *info,
+ struct kernel_param *kparam,
+ unsigned int num_params)
+{
+ return 0;
+}
+
+static inline void mod_sysfs_teardown(struct module *mod) { }
+static inline void init_param_lock(struct module *mod) { }
+#endif /* CONFIG_SYSFS */
+
+#ifdef CONFIG_MODVERSIONS
+int check_version(const struct load_info *info,
+ const char *symname, struct module *mod, const s32 *crc);
+void module_layout(struct module *mod, struct modversion_info *ver, struct kernel_param *kp,
+ struct kernel_symbol *ks, struct tracepoint * const *tp);
+int check_modstruct_version(const struct load_info *info, struct module *mod);
+int same_magic(const char *amagic, const char *bmagic, bool has_crcs);
+#else /* !CONFIG_MODVERSIONS */
+static inline int check_version(const struct load_info *info,
+ const char *symname,
+ struct module *mod,
+ const s32 *crc)
+{
+ return 1;
+}
+
+static inline int check_modstruct_version(const struct load_info *info,
+ struct module *mod)
+{
+ return 1;
+}
+
+static inline int same_magic(const char *amagic, const char *bmagic, bool has_crcs)
+{
+ return strcmp(amagic, bmagic) == 0;
+}
+#endif /* CONFIG_MODVERSIONS */
diff --git a/kernel/module/kallsyms.c b/kernel/module/kallsyms.c
new file mode 100644
index 000000000000..c550d7d45f2f
--- /dev/null
+++ b/kernel/module/kallsyms.c
@@ -0,0 +1,549 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Module kallsyms support
+ *
+ * Copyright (C) 2010 Rusty Russell
+ */
+
+#include <linux/module.h>
+#include <linux/module_symbol.h>
+#include <linux/kallsyms.h>
+#include <linux/buildid.h>
+#include <linux/bsearch.h>
+#include "internal.h"
+
+/* Lookup exported symbol in given range of kernel_symbols */
+static const struct kernel_symbol *lookup_exported_symbol(const char *name,
+ const struct kernel_symbol *start,
+ const struct kernel_symbol *stop)
+{
+ return bsearch(name, start, stop - start,
+ sizeof(struct kernel_symbol), cmp_name);
+}
+
+static int is_exported(const char *name, unsigned long value,
+ const struct module *mod)
+{
+ const struct kernel_symbol *ks;
+
+ if (!mod)
+ ks = lookup_exported_symbol(name, __start___ksymtab, __stop___ksymtab);
+ else
+ ks = lookup_exported_symbol(name, mod->syms, mod->syms + mod->num_syms);
+
+ return ks && kernel_symbol_value(ks) == value;
+}
+
+/* As per nm */
+static char elf_type(const Elf_Sym *sym, const struct load_info *info)
+{
+ const Elf_Shdr *sechdrs = info->sechdrs;
+
+ if (ELF_ST_BIND(sym->st_info) == STB_WEAK) {
+ if (ELF_ST_TYPE(sym->st_info) == STT_OBJECT)
+ return 'v';
+ else
+ return 'w';
+ }
+ if (sym->st_shndx == SHN_UNDEF)
+ return 'U';
+ if (sym->st_shndx == SHN_ABS || sym->st_shndx == info->index.pcpu)
+ return 'a';
+ if (sym->st_shndx >= SHN_LORESERVE)
+ return '?';
+ if (sechdrs[sym->st_shndx].sh_flags & SHF_EXECINSTR)
+ return 't';
+ if (sechdrs[sym->st_shndx].sh_flags & SHF_ALLOC &&
+ sechdrs[sym->st_shndx].sh_type != SHT_NOBITS) {
+ if (!(sechdrs[sym->st_shndx].sh_flags & SHF_WRITE))
+ return 'r';
+ else if (sechdrs[sym->st_shndx].sh_flags & ARCH_SHF_SMALL)
+ return 'g';
+ else
+ return 'd';
+ }
+ if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) {
+ if (sechdrs[sym->st_shndx].sh_flags & ARCH_SHF_SMALL)
+ return 's';
+ else
+ return 'b';
+ }
+ if (strstarts(info->secstrings + sechdrs[sym->st_shndx].sh_name,
+ ".debug")) {
+ return 'n';
+ }
+ return '?';
+}
+
+static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs,
+ unsigned int shnum, unsigned int pcpundx)
+{
+ const Elf_Shdr *sec;
+ enum mod_mem_type type;
+
+ if (src->st_shndx == SHN_UNDEF ||
+ src->st_shndx >= shnum ||
+ !src->st_name)
+ return false;
+
+#ifdef CONFIG_KALLSYMS_ALL
+ if (src->st_shndx == pcpundx)
+ return true;
+#endif
+
+ sec = sechdrs + src->st_shndx;
+ type = sec->sh_entsize >> SH_ENTSIZE_TYPE_SHIFT;
+ if (!(sec->sh_flags & SHF_ALLOC)
+#ifndef CONFIG_KALLSYMS_ALL
+ || !(sec->sh_flags & SHF_EXECINSTR)
+#endif
+ || mod_mem_type_is_init(type))
+ return false;
+
+ return true;
+}
+
+/*
+ * We only allocate and copy the strings needed by the parts of symtab
+ * we keep. This is simple, but has the effect of making multiple
+ * copies of duplicates. We could be more sophisticated, see
+ * linux-kernel thread starting with
+ * <73defb5e4bca04a6431392cc341112b1@localhost>.
+ */
+void layout_symtab(struct module *mod, struct load_info *info)
+{
+ Elf_Shdr *symsect = info->sechdrs + info->index.sym;
+ Elf_Shdr *strsect = info->sechdrs + info->index.str;
+ const Elf_Sym *src;
+ unsigned int i, nsrc, ndst, strtab_size = 0;
+ struct module_memory *mod_mem_data = &mod->mem[MOD_DATA];
+ struct module_memory *mod_mem_init_data = &mod->mem[MOD_INIT_DATA];
+
+ /* Put symbol section at end of init part of module. */
+ symsect->sh_flags |= SHF_ALLOC;
+ symsect->sh_entsize = module_get_offset_and_type(mod, MOD_INIT_DATA,
+ symsect, info->index.sym);
+ pr_debug("\t%s\n", info->secstrings + symsect->sh_name);
+
+ src = (void *)info->hdr + symsect->sh_offset;
+ nsrc = symsect->sh_size / sizeof(*src);
+
+ /* Compute total space required for the core symbols' strtab. */
+ for (ndst = i = 0; i < nsrc; i++) {
+ if (i == 0 || is_livepatch_module(mod) ||
+ is_core_symbol(src + i, info->sechdrs, info->hdr->e_shnum,
+ info->index.pcpu)) {
+ strtab_size += strlen(&info->strtab[src[i].st_name]) + 1;
+ ndst++;
+ }
+ }
+
+ /* Append room for core symbols at end of core part. */
+ info->symoffs = ALIGN(mod_mem_data->size, symsect->sh_addralign ?: 1);
+ info->stroffs = mod_mem_data->size = info->symoffs + ndst * sizeof(Elf_Sym);
+ mod_mem_data->size += strtab_size;
+ /* Note add_kallsyms() computes strtab_size as core_typeoffs - stroffs */
+ info->core_typeoffs = mod_mem_data->size;
+ mod_mem_data->size += ndst * sizeof(char);
+
+ /* Put string table section at end of init part of module. */
+ strsect->sh_flags |= SHF_ALLOC;
+ strsect->sh_entsize = module_get_offset_and_type(mod, MOD_INIT_DATA,
+ strsect, info->index.str);
+ pr_debug("\t%s\n", info->secstrings + strsect->sh_name);
+
+ /* We'll tack temporary mod_kallsyms on the end. */
+ mod_mem_init_data->size = ALIGN(mod_mem_init_data->size,
+ __alignof__(struct mod_kallsyms));
+ info->mod_kallsyms_init_off = mod_mem_init_data->size;
+
+ mod_mem_init_data->size += sizeof(struct mod_kallsyms);
+ info->init_typeoffs = mod_mem_init_data->size;
+ mod_mem_init_data->size += nsrc * sizeof(char);
+}
+
+/*
+ * We use the full symtab and strtab which layout_symtab arranged to
+ * be appended to the init section. Later we switch to the cut-down
+ * core-only ones.
+ */
+void add_kallsyms(struct module *mod, const struct load_info *info)
+{
+ unsigned int i, ndst;
+ const Elf_Sym *src;
+ Elf_Sym *dst;
+ char *s;
+ Elf_Shdr *symsec = &info->sechdrs[info->index.sym];
+ unsigned long strtab_size;
+ void *data_base = mod->mem[MOD_DATA].base;
+ void *init_data_base = mod->mem[MOD_INIT_DATA].base;
+
+ /* Set up to point into init section. */
+ mod->kallsyms = (void __rcu *)init_data_base +
+ info->mod_kallsyms_init_off;
+
+ rcu_read_lock();
+ /* The following is safe since this pointer cannot change */
+ rcu_dereference(mod->kallsyms)->symtab = (void *)symsec->sh_addr;
+ rcu_dereference(mod->kallsyms)->num_symtab = symsec->sh_size / sizeof(Elf_Sym);
+ /* Make sure we get permanent strtab: don't use info->strtab. */
+ rcu_dereference(mod->kallsyms)->strtab =
+ (void *)info->sechdrs[info->index.str].sh_addr;
+ rcu_dereference(mod->kallsyms)->typetab = init_data_base + info->init_typeoffs;
+
+ /*
+ * Now populate the cut down core kallsyms for after init
+ * and set types up while we still have access to sections.
+ */
+ mod->core_kallsyms.symtab = dst = data_base + info->symoffs;
+ mod->core_kallsyms.strtab = s = data_base + info->stroffs;
+ mod->core_kallsyms.typetab = data_base + info->core_typeoffs;
+ strtab_size = info->core_typeoffs - info->stroffs;
+ src = rcu_dereference(mod->kallsyms)->symtab;
+ for (ndst = i = 0; i < rcu_dereference(mod->kallsyms)->num_symtab; i++) {
+ rcu_dereference(mod->kallsyms)->typetab[i] = elf_type(src + i, info);
+ if (i == 0 || is_livepatch_module(mod) ||
+ is_core_symbol(src + i, info->sechdrs, info->hdr->e_shnum,
+ info->index.pcpu)) {
+ ssize_t ret;
+
+ mod->core_kallsyms.typetab[ndst] =
+ rcu_dereference(mod->kallsyms)->typetab[i];
+ dst[ndst] = src[i];
+ dst[ndst++].st_name = s - mod->core_kallsyms.strtab;
+ ret = strscpy(s,
+ &rcu_dereference(mod->kallsyms)->strtab[src[i].st_name],
+ strtab_size);
+ if (ret < 0)
+ break;
+ s += ret + 1;
+ strtab_size -= ret + 1;
+ }
+ }
+ rcu_read_unlock();
+ mod->core_kallsyms.num_symtab = ndst;
+}
+
+#if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID)
+void init_build_id(struct module *mod, const struct load_info *info)
+{
+ const Elf_Shdr *sechdr;
+ unsigned int i;
+
+ for (i = 0; i < info->hdr->e_shnum; i++) {
+ sechdr = &info->sechdrs[i];
+ if (!sect_empty(sechdr) && sechdr->sh_type == SHT_NOTE &&
+ !build_id_parse_buf((void *)sechdr->sh_addr, mod->build_id,
+ sechdr->sh_size))
+ break;
+ }
+}
+#else
+void init_build_id(struct module *mod, const struct load_info *info)
+{
+}
+#endif
+
+static const char *kallsyms_symbol_name(struct mod_kallsyms *kallsyms, unsigned int symnum)
+{
+ return kallsyms->strtab + kallsyms->symtab[symnum].st_name;
+}
+
+/*
+ * Given a module and address, find the corresponding symbol and return its name
+ * while providing its size and offset if needed.
+ */
+static const char *find_kallsyms_symbol(struct module *mod,
+ unsigned long addr,
+ unsigned long *size,
+ unsigned long *offset)
+{
+ unsigned int i, best = 0;
+ unsigned long nextval, bestval;
+ struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms);
+ struct module_memory *mod_mem;
+
+ /* At worse, next value is at end of module */
+ if (within_module_init(addr, mod))
+ mod_mem = &mod->mem[MOD_INIT_TEXT];
+ else
+ mod_mem = &mod->mem[MOD_TEXT];
+
+ nextval = (unsigned long)mod_mem->base + mod_mem->size;
+
+ bestval = kallsyms_symbol_value(&kallsyms->symtab[best]);
+
+ /*
+ * Scan for closest preceding symbol, and next symbol. (ELF
+ * starts real symbols at 1).
+ */
+ for (i = 1; i < kallsyms->num_symtab; i++) {
+ const Elf_Sym *sym = &kallsyms->symtab[i];
+ unsigned long thisval = kallsyms_symbol_value(sym);
+
+ if (sym->st_shndx == SHN_UNDEF)
+ continue;
+
+ /*
+ * We ignore unnamed symbols: they're uninformative
+ * and inserted at a whim.
+ */
+ if (*kallsyms_symbol_name(kallsyms, i) == '\0' ||
+ is_mapping_symbol(kallsyms_symbol_name(kallsyms, i)))
+ continue;
+
+ if (thisval <= addr && thisval > bestval) {
+ best = i;
+ bestval = thisval;
+ }
+ if (thisval > addr && thisval < nextval)
+ nextval = thisval;
+ }
+
+ if (!best)
+ return NULL;
+
+ if (size)
+ *size = nextval - bestval;
+ if (offset)
+ *offset = addr - bestval;
+
+ return kallsyms_symbol_name(kallsyms, best);
+}
+
+void * __weak dereference_module_function_descriptor(struct module *mod,
+ void *ptr)
+{
+ return ptr;
+}
+
+/*
+ * For kallsyms to ask for address resolution. NULL means not found. Careful
+ * not to lock to avoid deadlock on oopses, simply disable preemption.
+ */
+const char *module_address_lookup(unsigned long addr,
+ unsigned long *size,
+ unsigned long *offset,
+ char **modname,
+ const unsigned char **modbuildid,
+ char *namebuf)
+{
+ const char *ret = NULL;
+ struct module *mod;
+
+ preempt_disable();
+ mod = __module_address(addr);
+ if (mod) {
+ if (modname)
+ *modname = mod->name;
+ if (modbuildid) {
+#if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID)
+ *modbuildid = mod->build_id;
+#else
+ *modbuildid = NULL;
+#endif
+ }
+
+ ret = find_kallsyms_symbol(mod, addr, size, offset);
+ }
+ /* Make a copy in here where it's safe */
+ if (ret) {
+ strncpy(namebuf, ret, KSYM_NAME_LEN - 1);
+ ret = namebuf;
+ }
+ preempt_enable();
+
+ return ret;
+}
+
+int lookup_module_symbol_name(unsigned long addr, char *symname)
+{
+ struct module *mod;
+
+ preempt_disable();
+ list_for_each_entry_rcu(mod, &modules, list) {
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
+ if (within_module(addr, mod)) {
+ const char *sym;
+
+ sym = find_kallsyms_symbol(mod, addr, NULL, NULL);
+ if (!sym)
+ goto out;
+
+ strscpy(symname, sym, KSYM_NAME_LEN);
+ preempt_enable();
+ return 0;
+ }
+ }
+out:
+ preempt_enable();
+ return -ERANGE;
+}
+
+int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size,
+ unsigned long *offset, char *modname, char *name)
+{
+ struct module *mod;
+
+ preempt_disable();
+ list_for_each_entry_rcu(mod, &modules, list) {
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
+ if (within_module(addr, mod)) {
+ const char *sym;
+
+ sym = find_kallsyms_symbol(mod, addr, size, offset);
+ if (!sym)
+ goto out;
+ if (modname)
+ strscpy(modname, mod->name, MODULE_NAME_LEN);
+ if (name)
+ strscpy(name, sym, KSYM_NAME_LEN);
+ preempt_enable();
+ return 0;
+ }
+ }
+out:
+ preempt_enable();
+ return -ERANGE;
+}
+
+int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
+ char *name, char *module_name, int *exported)
+{
+ struct module *mod;
+
+ preempt_disable();
+ list_for_each_entry_rcu(mod, &modules, list) {
+ struct mod_kallsyms *kallsyms;
+
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
+ kallsyms = rcu_dereference_sched(mod->kallsyms);
+ if (symnum < kallsyms->num_symtab) {
+ const Elf_Sym *sym = &kallsyms->symtab[symnum];
+
+ *value = kallsyms_symbol_value(sym);
+ *type = kallsyms->typetab[symnum];
+ strscpy(name, kallsyms_symbol_name(kallsyms, symnum), KSYM_NAME_LEN);
+ strscpy(module_name, mod->name, MODULE_NAME_LEN);
+ *exported = is_exported(name, *value, mod);
+ preempt_enable();
+ return 0;
+ }
+ symnum -= kallsyms->num_symtab;
+ }
+ preempt_enable();
+ return -ERANGE;
+}
+
+/* Given a module and name of symbol, find and return the symbol's value */
+static unsigned long __find_kallsyms_symbol_value(struct module *mod, const char *name)
+{
+ unsigned int i;
+ struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms);
+
+ for (i = 0; i < kallsyms->num_symtab; i++) {
+ const Elf_Sym *sym = &kallsyms->symtab[i];
+
+ if (strcmp(name, kallsyms_symbol_name(kallsyms, i)) == 0 &&
+ sym->st_shndx != SHN_UNDEF)
+ return kallsyms_symbol_value(sym);
+ }
+ return 0;
+}
+
+static unsigned long __module_kallsyms_lookup_name(const char *name)
+{
+ struct module *mod;
+ char *colon;
+
+ colon = strnchr(name, MODULE_NAME_LEN, ':');
+ if (colon) {
+ mod = find_module_all(name, colon - name, false);
+ if (mod)
+ return __find_kallsyms_symbol_value(mod, colon + 1);
+ return 0;
+ }
+
+ list_for_each_entry_rcu(mod, &modules, list) {
+ unsigned long ret;
+
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
+ ret = __find_kallsyms_symbol_value(mod, name);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+
+/* Look for this name: can be of form module:name. */
+unsigned long module_kallsyms_lookup_name(const char *name)
+{
+ unsigned long ret;
+
+ /* Don't lock: we're in enough trouble already. */
+ preempt_disable();
+ ret = __module_kallsyms_lookup_name(name);
+ preempt_enable();
+ return ret;
+}
+
+unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name)
+{
+ unsigned long ret;
+
+ preempt_disable();
+ ret = __find_kallsyms_symbol_value(mod, name);
+ preempt_enable();
+ return ret;
+}
+
+int module_kallsyms_on_each_symbol(const char *modname,
+ int (*fn)(void *, const char *, unsigned long),
+ void *data)
+{
+ struct module *mod;
+ unsigned int i;
+ int ret = 0;
+
+ mutex_lock(&module_mutex);
+ list_for_each_entry(mod, &modules, list) {
+ struct mod_kallsyms *kallsyms;
+
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
+
+ if (modname && strcmp(modname, mod->name))
+ continue;
+
+ /* Use rcu_dereference_sched() to remain compliant with the sparse tool */
+ preempt_disable();
+ kallsyms = rcu_dereference_sched(mod->kallsyms);
+ preempt_enable();
+
+ for (i = 0; i < kallsyms->num_symtab; i++) {
+ const Elf_Sym *sym = &kallsyms->symtab[i];
+
+ if (sym->st_shndx == SHN_UNDEF)
+ continue;
+
+ ret = fn(data, kallsyms_symbol_name(kallsyms, i),
+ kallsyms_symbol_value(sym));
+ if (ret != 0)
+ goto out;
+ }
+
+ /*
+ * The given module is found, the subsequent modules do not
+ * need to be compared.
+ */
+ if (modname)
+ break;
+ }
+out:
+ mutex_unlock(&module_mutex);
+ return ret;
+}
diff --git a/kernel/module/kdb.c b/kernel/module/kdb.c
new file mode 100644
index 000000000000..995c32d3698f
--- /dev/null
+++ b/kernel/module/kdb.c
@@ -0,0 +1,63 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Module kdb support
+ *
+ * Copyright (C) 2010 Jason Wessel
+ */
+
+#include <linux/module.h>
+#include <linux/kdb.h>
+#include "internal.h"
+
+/*
+ * kdb_lsmod - This function implements the 'lsmod' command. Lists
+ * currently loaded kernel modules.
+ * Mostly taken from userland lsmod.
+ */
+int kdb_lsmod(int argc, const char **argv)
+{
+ struct module *mod;
+
+ if (argc != 0)
+ return KDB_ARGCOUNT;
+
+ kdb_printf("Module Size modstruct Used by\n");
+ list_for_each_entry(mod, &modules, list) {
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
+
+ kdb_printf("%-20s%8u", mod->name, mod->mem[MOD_TEXT].size);
+ kdb_printf("/%8u", mod->mem[MOD_RODATA].size);
+ kdb_printf("/%8u", mod->mem[MOD_RO_AFTER_INIT].size);
+ kdb_printf("/%8u", mod->mem[MOD_DATA].size);
+
+ kdb_printf(" 0x%px ", (void *)mod);
+#ifdef CONFIG_MODULE_UNLOAD
+ kdb_printf("%4d ", module_refcount(mod));
+#endif
+ if (mod->state == MODULE_STATE_GOING)
+ kdb_printf(" (Unloading)");
+ else if (mod->state == MODULE_STATE_COMING)
+ kdb_printf(" (Loading)");
+ else
+ kdb_printf(" (Live)");
+ kdb_printf(" 0x%px", mod->mem[MOD_TEXT].base);
+ kdb_printf("/0x%px", mod->mem[MOD_RODATA].base);
+ kdb_printf("/0x%px", mod->mem[MOD_RO_AFTER_INIT].base);
+ kdb_printf("/0x%px", mod->mem[MOD_DATA].base);
+
+#ifdef CONFIG_MODULE_UNLOAD
+ {
+ struct module_use *use;
+
+ kdb_printf(" [ ");
+ list_for_each_entry(use, &mod->source_list,
+ source_list)
+ kdb_printf("%s ", use->target->name);
+ kdb_printf("]\n");
+ }
+#endif
+ }
+
+ return 0;
+}
diff --git a/kernel/kmod.c b/kernel/module/kmod.c
index b717134ebe17..0800d9891692 100644
--- a/kernel/kmod.c
+++ b/kernel/module/kmod.c
@@ -1,6 +1,9 @@
/*
* kmod - the kernel module loader
+ *
+ * Copyright (C) 2023 Luis Chamberlain <mcgrof@kernel.org>
*/
+
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/sched/task.h>
@@ -27,6 +30,7 @@
#include <linux/uaccess.h>
#include <trace/events/module.h>
+#include "internal.h"
/*
* Assuming:
@@ -40,8 +44,7 @@
* effect. Systems like these are very unlikely if modules are enabled.
*/
#define MAX_KMOD_CONCURRENT 50
-static atomic_t kmod_concurrent_max = ATOMIC_INIT(MAX_KMOD_CONCURRENT);
-static DECLARE_WAIT_QUEUE_HEAD(kmod_wq);
+static DEFINE_SEMAPHORE(kmod_concurrent_max, MAX_KMOD_CONCURRENT);
/*
* This is a restriction on having *all* MAX_KMOD_CONCURRENT threads
@@ -66,7 +69,7 @@ static void free_modprobe_argv(struct subprocess_info *info)
kfree(info->argv);
}
-static int call_modprobe(char *module_name, int wait)
+static int call_modprobe(char *orig_module_name, int wait)
{
struct subprocess_info *info;
static char *envp[] = {
@@ -75,12 +78,14 @@ static int call_modprobe(char *module_name, int wait)
"PATH=/sbin:/usr/sbin:/bin:/usr/bin",
NULL
};
+ char *module_name;
+ int ret;
char **argv = kmalloc(sizeof(char *[5]), GFP_KERNEL);
if (!argv)
goto out;
- module_name = kstrdup(module_name, GFP_KERNEL);
+ module_name = kstrdup(orig_module_name, GFP_KERNEL);
if (!module_name)
goto free_argv;
@@ -95,13 +100,16 @@ static int call_modprobe(char *module_name, int wait)
if (!info)
goto free_module_name;
- return call_usermodehelper_exec(info, wait | UMH_KILLABLE);
+ ret = call_usermodehelper_exec(info, wait | UMH_KILLABLE);
+ kmod_dup_request_announce(orig_module_name, ret);
+ return ret;
free_module_name:
kfree(module_name);
free_argv:
kfree(argv);
out:
+ kmod_dup_request_announce(orig_module_name, -ENOMEM);
return -ENOMEM;
}
@@ -125,7 +133,7 @@ int __request_module(bool wait, const char *fmt, ...)
{
va_list args;
char module_name[MODULE_NAME_LEN];
- int ret;
+ int ret, dup_ret;
/*
* We don't allow synchronous module loading from async. Module
@@ -148,29 +156,24 @@ int __request_module(bool wait, const char *fmt, ...)
if (ret)
return ret;
- if (atomic_dec_if_positive(&kmod_concurrent_max) < 0) {
- pr_warn_ratelimited("request_module: kmod_concurrent_max (%u) close to 0 (max_modprobes: %u), for module %s, throttling...",
- atomic_read(&kmod_concurrent_max),
- MAX_KMOD_CONCURRENT, module_name);
- ret = wait_event_killable_timeout(kmod_wq,
- atomic_dec_if_positive(&kmod_concurrent_max) >= 0,
- MAX_KMOD_ALL_BUSY_TIMEOUT * HZ);
- if (!ret) {
- pr_warn_ratelimited("request_module: modprobe %s cannot be processed, kmod busy with %d threads for more than %d seconds now",
- module_name, MAX_KMOD_CONCURRENT, MAX_KMOD_ALL_BUSY_TIMEOUT);
- return -ETIME;
- } else if (ret == -ERESTARTSYS) {
- pr_warn_ratelimited("request_module: sigkill sent for modprobe %s, giving up", module_name);
- return ret;
- }
+ ret = down_timeout(&kmod_concurrent_max, MAX_KMOD_ALL_BUSY_TIMEOUT * HZ);
+ if (ret) {
+ pr_warn_ratelimited("request_module: modprobe %s cannot be processed, kmod busy with %d threads for more than %d seconds now",
+ module_name, MAX_KMOD_CONCURRENT, MAX_KMOD_ALL_BUSY_TIMEOUT);
+ return ret;
}
trace_module_request(module_name, wait, _RET_IP_);
+ if (kmod_dup_request_exists_wait(module_name, wait, &dup_ret)) {
+ ret = dup_ret;
+ goto out;
+ }
+
ret = call_modprobe(module_name, wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC);
- atomic_inc(&kmod_concurrent_max);
- wake_up(&kmod_wq);
+out:
+ up(&kmod_concurrent_max);
return ret;
}
diff --git a/kernel/module/livepatch.c b/kernel/module/livepatch.c
new file mode 100644
index 000000000000..a89f01e1d6b7
--- /dev/null
+++ b/kernel/module/livepatch.c
@@ -0,0 +1,74 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Module livepatch support
+ *
+ * Copyright (C) 2016 Jessica Yu <jeyu@redhat.com>
+ */
+
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include "internal.h"
+
+/*
+ * Persist ELF information about a module. Copy the ELF header,
+ * section header table, section string table, and symtab section
+ * index from info to mod->klp_info.
+ */
+int copy_module_elf(struct module *mod, struct load_info *info)
+{
+ unsigned int size, symndx;
+ int ret;
+
+ size = sizeof(*mod->klp_info);
+ mod->klp_info = kmalloc(size, GFP_KERNEL);
+ if (!mod->klp_info)
+ return -ENOMEM;
+
+ /* ELF header */
+ size = sizeof(mod->klp_info->hdr);
+ memcpy(&mod->klp_info->hdr, info->hdr, size);
+
+ /* ELF section header table */
+ size = sizeof(*info->sechdrs) * info->hdr->e_shnum;
+ mod->klp_info->sechdrs = kmemdup(info->sechdrs, size, GFP_KERNEL);
+ if (!mod->klp_info->sechdrs) {
+ ret = -ENOMEM;
+ goto free_info;
+ }
+
+ /* ELF section name string table */
+ size = info->sechdrs[info->hdr->e_shstrndx].sh_size;
+ mod->klp_info->secstrings = kmemdup(info->secstrings, size, GFP_KERNEL);
+ if (!mod->klp_info->secstrings) {
+ ret = -ENOMEM;
+ goto free_sechdrs;
+ }
+
+ /* ELF symbol section index */
+ symndx = info->index.sym;
+ mod->klp_info->symndx = symndx;
+
+ /*
+ * For livepatch modules, core_kallsyms.symtab is a complete
+ * copy of the original symbol table. Adjust sh_addr to point
+ * to core_kallsyms.symtab since the copy of the symtab in module
+ * init memory is freed at the end of do_init_module().
+ */
+ mod->klp_info->sechdrs[symndx].sh_addr = (unsigned long)mod->core_kallsyms.symtab;
+
+ return 0;
+
+free_sechdrs:
+ kfree(mod->klp_info->sechdrs);
+free_info:
+ kfree(mod->klp_info);
+ return ret;
+}
+
+void free_module_elf(struct module *mod)
+{
+ kfree(mod->klp_info->sechdrs);
+ kfree(mod->klp_info->secstrings);
+ kfree(mod->klp_info);
+}
diff --git a/kernel/module.c b/kernel/module/main.c
index ed13917ea5f3..044aa2c9e3cb 100644
--- a/kernel/module.c
+++ b/kernel/module/main.c
@@ -2,6 +2,7 @@
/*
* Copyright (C) 2002 Richard Henderson
* Copyright (C) 2001 Rusty Russell, 2002, 2010 Rusty Russell IBM.
+ * Copyright (C) 2023 Luis Chamberlain <mcgrof@kernel.org>
*/
#define INCLUDE_VERMAGIC
@@ -14,16 +15,13 @@
#include <linux/init.h>
#include <linux/kallsyms.h>
#include <linux/buildid.h>
-#include <linux/file.h>
#include <linux/fs.h>
-#include <linux/sysfs.h>
#include <linux/kernel.h>
#include <linux/kernel_read_file.h>
+#include <linux/kstrtox.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/elf.h>
-#include <linux/proc_fs.h>
-#include <linux/security.h>
#include <linux/seq_file.h>
#include <linux/syscalls.h>
#include <linux/fcntl.h>
@@ -57,240 +55,76 @@
#include <linux/bsearch.h>
#include <linux/dynamic_debug.h>
#include <linux/audit.h>
+#include <linux/cfi.h>
+#include <linux/debugfs.h>
#include <uapi/linux/module.h>
-#include "module-internal.h"
+#include "internal.h"
#define CREATE_TRACE_POINTS
#include <trace/events/module.h>
-#ifndef ARCH_SHF_SMALL
-#define ARCH_SHF_SMALL 0
-#endif
-
-/*
- * Modules' sections will be aligned on page boundaries
- * to ensure complete separation of code and data, but
- * only when CONFIG_ARCH_HAS_STRICT_MODULE_RWX=y
- */
-#ifdef CONFIG_ARCH_HAS_STRICT_MODULE_RWX
-# define debug_align(X) ALIGN(X, PAGE_SIZE)
-#else
-# define debug_align(X) (X)
-#endif
-
-/* If this is set, the section belongs in the init part of the module */
-#define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1))
-
/*
* Mutex protects:
* 1) List of modules (also safely readable with preempt_disable),
* 2) module_use links,
- * 3) module_addr_min/module_addr_max.
+ * 3) mod_tree.addr_min/mod_tree.addr_max.
* (delete and add uses RCU list operations).
*/
-static DEFINE_MUTEX(module_mutex);
-static LIST_HEAD(modules);
+DEFINE_MUTEX(module_mutex);
+LIST_HEAD(modules);
/* Work queue for freeing init sections in success case */
static void do_free_init(struct work_struct *w);
static DECLARE_WORK(init_free_wq, do_free_init);
static LLIST_HEAD(init_free_list);
-#ifdef CONFIG_MODULES_TREE_LOOKUP
-
-/*
- * Use a latched RB-tree for __module_address(); this allows us to use
- * RCU-sched lookups of the address from any context.
- *
- * This is conditional on PERF_EVENTS || TRACING because those can really hit
- * __module_address() hard by doing a lot of stack unwinding; potentially from
- * NMI context.
- */
-
-static __always_inline unsigned long __mod_tree_val(struct latch_tree_node *n)
-{
- struct module_layout *layout = container_of(n, struct module_layout, mtn.node);
-
- return (unsigned long)layout->base;
-}
-
-static __always_inline unsigned long __mod_tree_size(struct latch_tree_node *n)
-{
- struct module_layout *layout = container_of(n, struct module_layout, mtn.node);
-
- return (unsigned long)layout->size;
-}
-
-static __always_inline bool
-mod_tree_less(struct latch_tree_node *a, struct latch_tree_node *b)
-{
- return __mod_tree_val(a) < __mod_tree_val(b);
-}
-
-static __always_inline int
-mod_tree_comp(void *key, struct latch_tree_node *n)
-{
- unsigned long val = (unsigned long)key;
- unsigned long start, end;
-
- start = __mod_tree_val(n);
- if (val < start)
- return -1;
-
- end = start + __mod_tree_size(n);
- if (val >= end)
- return 1;
-
- return 0;
-}
-
-static const struct latch_tree_ops mod_tree_ops = {
- .less = mod_tree_less,
- .comp = mod_tree_comp,
-};
-
-static struct mod_tree_root {
- struct latch_tree_root root;
- unsigned long addr_min;
- unsigned long addr_max;
-} mod_tree __cacheline_aligned = {
+struct mod_tree_root mod_tree __cacheline_aligned = {
.addr_min = -1UL,
};
-#define module_addr_min mod_tree.addr_min
-#define module_addr_max mod_tree.addr_max
-
-static noinline void __mod_tree_insert(struct mod_tree_node *node)
-{
- latch_tree_insert(&node->node, &mod_tree.root, &mod_tree_ops);
-}
-
-static void __mod_tree_remove(struct mod_tree_node *node)
-{
- latch_tree_erase(&node->node, &mod_tree.root, &mod_tree_ops);
-}
-
-/*
- * These modifications: insert, remove_init and remove; are serialized by the
- * module_mutex.
- */
-static void mod_tree_insert(struct module *mod)
-{
- mod->core_layout.mtn.mod = mod;
- mod->init_layout.mtn.mod = mod;
-
- __mod_tree_insert(&mod->core_layout.mtn);
- if (mod->init_layout.size)
- __mod_tree_insert(&mod->init_layout.mtn);
-}
-
-static void mod_tree_remove_init(struct module *mod)
-{
- if (mod->init_layout.size)
- __mod_tree_remove(&mod->init_layout.mtn);
-}
-
-static void mod_tree_remove(struct module *mod)
-{
- __mod_tree_remove(&mod->core_layout.mtn);
- mod_tree_remove_init(mod);
-}
-
-static struct module *mod_find(unsigned long addr)
-{
- struct latch_tree_node *ltn;
-
- ltn = latch_tree_find((void *)addr, &mod_tree.root, &mod_tree_ops);
- if (!ltn)
- return NULL;
-
- return container_of(ltn, struct mod_tree_node, node)->mod;
-}
-
-#else /* MODULES_TREE_LOOKUP */
-
-static unsigned long module_addr_min = -1UL, module_addr_max = 0;
-
-static void mod_tree_insert(struct module *mod) { }
-static void mod_tree_remove_init(struct module *mod) { }
-static void mod_tree_remove(struct module *mod) { }
-
-static struct module *mod_find(unsigned long addr)
-{
- struct module *mod;
-
- list_for_each_entry_rcu(mod, &modules, list,
- lockdep_is_held(&module_mutex)) {
- if (within_module(addr, mod))
- return mod;
- }
-
- return NULL;
-}
-
-#endif /* MODULES_TREE_LOOKUP */
+struct symsearch {
+ const struct kernel_symbol *start, *stop;
+ const s32 *crcs;
+ enum mod_license license;
+};
/*
- * Bounds of module text, for speeding up __module_address.
+ * Bounds of module memory, for speeding up __module_address.
* Protected by module_mutex.
*/
-static void __mod_update_bounds(void *base, unsigned int size)
+static void __mod_update_bounds(enum mod_mem_type type __maybe_unused, void *base,
+ unsigned int size, struct mod_tree_root *tree)
{
unsigned long min = (unsigned long)base;
unsigned long max = min + size;
- if (min < module_addr_min)
- module_addr_min = min;
- if (max > module_addr_max)
- module_addr_max = max;
-}
-
-static void mod_update_bounds(struct module *mod)
-{
- __mod_update_bounds(mod->core_layout.base, mod->core_layout.size);
- if (mod->init_layout.size)
- __mod_update_bounds(mod->init_layout.base, mod->init_layout.size);
-}
-
-#ifdef CONFIG_KGDB_KDB
-struct list_head *kdb_modules = &modules; /* kdb needs the list of modules */
-#endif /* CONFIG_KGDB_KDB */
-
-static void module_assert_mutex_or_preempt(void)
-{
-#ifdef CONFIG_LOCKDEP
- if (unlikely(!debug_locks))
+#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
+ if (mod_mem_type_is_core_data(type)) {
+ if (min < tree->data_addr_min)
+ tree->data_addr_min = min;
+ if (max > tree->data_addr_max)
+ tree->data_addr_max = max;
return;
-
- WARN_ON_ONCE(!rcu_read_lock_sched_held() &&
- !lockdep_is_held(&module_mutex));
+ }
#endif
+ if (min < tree->addr_min)
+ tree->addr_min = min;
+ if (max > tree->addr_max)
+ tree->addr_max = max;
}
-#ifdef CONFIG_MODULE_SIG
-static bool sig_enforce = IS_ENABLED(CONFIG_MODULE_SIG_FORCE);
-module_param(sig_enforce, bool_enable_only, 0644);
-
-void set_module_sig_enforced(void)
+static void mod_update_bounds(struct module *mod)
{
- sig_enforce = true;
-}
-#else
-#define sig_enforce false
-#endif
+ for_each_mod_mem_type(type) {
+ struct module_memory *mod_mem = &mod->mem[type];
-/*
- * Export sig_enforce kernel cmdline parameter to allow other subsystems rely
- * on that instead of directly to CONFIG_MODULE_SIG_FORCE config.
- */
-bool is_module_sig_enforced(void)
-{
- return sig_enforce;
+ if (mod_mem->size)
+ __mod_update_bounds(type, mod_mem->base, mod_mem->size, &mod_tree);
+ }
}
-EXPORT_SYMBOL(is_module_sig_enforced);
/* Block module loading/unloading? */
-int modules_disabled = 0;
+int modules_disabled;
core_param(nomodule, modules_disabled, bint, 0);
/* Waiting for a module to finish initializing? */
@@ -335,14 +169,14 @@ static inline void add_taint_module(struct module *mod, unsigned flag,
/*
* A thread that wants to hold a reference to a module only while it
- * is running can call this to safely exit. nfsd and lockd use this.
+ * is running can call this to safely exit.
*/
-void __noreturn __module_put_and_exit(struct module *mod, long code)
+void __noreturn __module_put_and_kthread_exit(struct module *mod, long code)
{
module_put(mod);
- do_exit(code);
+ kthread_exit(code);
}
-EXPORT_SYMBOL(__module_put_and_exit);
+EXPORT_SYMBOL(__module_put_and_kthread_exit);
/* Find a module section: 0 means not found. */
static unsigned int find_sec(const struct load_info *info, const char *name)
@@ -408,66 +242,12 @@ static __maybe_unused void *any_section_objs(const struct load_info *info,
return (void *)info->sechdrs[sec].sh_addr;
}
-/* Provided by the linker */
-extern const struct kernel_symbol __start___ksymtab[];
-extern const struct kernel_symbol __stop___ksymtab[];
-extern const struct kernel_symbol __start___ksymtab_gpl[];
-extern const struct kernel_symbol __stop___ksymtab_gpl[];
-extern const s32 __start___kcrctab[];
-extern const s32 __start___kcrctab_gpl[];
-
#ifndef CONFIG_MODVERSIONS
#define symversion(base, idx) NULL
#else
#define symversion(base, idx) ((base != NULL) ? ((base) + (idx)) : NULL)
#endif
-struct symsearch {
- const struct kernel_symbol *start, *stop;
- const s32 *crcs;
- enum mod_license {
- NOT_GPL_ONLY,
- GPL_ONLY,
- } license;
-};
-
-struct find_symbol_arg {
- /* Input */
- const char *name;
- bool gplok;
- bool warn;
-
- /* Output */
- struct module *owner;
- const s32 *crc;
- const struct kernel_symbol *sym;
- enum mod_license license;
-};
-
-static bool check_exported_symbol(const struct symsearch *syms,
- struct module *owner,
- unsigned int symnum, void *data)
-{
- struct find_symbol_arg *fsa = data;
-
- if (!fsa->gplok && syms->license == GPL_ONLY)
- return false;
- fsa->owner = owner;
- fsa->crc = symversion(syms->crcs, symnum);
- fsa->sym = &syms->start[symnum];
- fsa->license = syms->license;
- return true;
-}
-
-static unsigned long kernel_symbol_value(const struct kernel_symbol *sym)
-{
-#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS
- return (unsigned long)offset_to_ptr(&sym->value_offset);
-#else
- return sym->value;
-#endif
-}
-
static const char *kernel_symbol_name(const struct kernel_symbol *sym)
{
#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS
@@ -488,33 +268,38 @@ static const char *kernel_symbol_namespace(const struct kernel_symbol *sym)
#endif
}
-static int cmp_name(const void *name, const void *sym)
+int cmp_name(const void *name, const void *sym)
{
return strcmp(name, kernel_symbol_name(sym));
}
static bool find_exported_symbol_in_section(const struct symsearch *syms,
struct module *owner,
- void *data)
+ struct find_symbol_arg *fsa)
{
- struct find_symbol_arg *fsa = data;
struct kernel_symbol *sym;
+ if (!fsa->gplok && syms->license == GPL_ONLY)
+ return false;
+
sym = bsearch(fsa->name, syms->start, syms->stop - syms->start,
sizeof(struct kernel_symbol), cmp_name);
+ if (!sym)
+ return false;
- if (sym != NULL && check_exported_symbol(syms, owner,
- sym - syms->start, data))
- return true;
+ fsa->owner = owner;
+ fsa->crc = symversion(syms->crcs, sym - syms->start);
+ fsa->sym = sym;
+ fsa->license = syms->license;
- return false;
+ return true;
}
/*
* Find an exported symbol and return it, along with, (optional) crc and
* (optional) module which owns it. Needs preempt disabled or module_mutex.
*/
-static bool find_symbol(struct find_symbol_arg *fsa)
+bool find_symbol(struct find_symbol_arg *fsa)
{
static const struct symsearch arr[] = {
{ __start___ksymtab, __stop___ksymtab, __start___kcrctab,
@@ -558,8 +343,8 @@ static bool find_symbol(struct find_symbol_arg *fsa)
* Search for module by name: must hold module_mutex (or preempt disabled
* for read-only access).
*/
-static struct module *find_module_all(const char *name, size_t len,
- bool even_unformed)
+struct module *find_module_all(const char *name, size_t len,
+ bool even_unformed)
{
struct module *mod;
@@ -744,7 +529,10 @@ static struct module_attribute modinfo_##field = { \
MODINFO_ATTR(version);
MODINFO_ATTR(srcversion);
-static char last_unloaded_module[MODULE_NAME_LEN+1];
+static struct {
+ char name[MODULE_NAME_LEN + 1];
+ char taints[MODULE_FLAGS_BUF_SIZE];
+} last_unloaded_module;
#ifdef CONFIG_MODULE_UNLOAD
@@ -777,10 +565,8 @@ static int already_uses(struct module *a, struct module *b)
struct module_use *use;
list_for_each_entry(use, &b->source_list, source_list) {
- if (use->source == a) {
- pr_debug("%s uses %s!\n", a->name, b->name);
+ if (use->source == a)
return 1;
- }
}
pr_debug("%s does not use %s!\n", a->name, b->name);
return 0;
@@ -914,6 +700,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
{
struct module *mod;
char name[MODULE_NAME_LEN];
+ char buf[MODULE_FLAGS_BUF_SIZE];
int ret, forced = 0;
if (!capable(CAP_SYS_MODULE) || modules_disabled)
@@ -958,7 +745,6 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
}
}
- /* Stop the machine so refcounts can't move and disable module. */
ret = try_stop_module(mod, flags, &forced);
if (ret != 0)
goto out;
@@ -974,8 +760,9 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
async_synchronize_full();
- /* Store the name of the last unloaded module for diagnostic purposes */
- strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module));
+ /* Store the name and taints of the last unloaded module for diagnostic purposes */
+ strscpy(last_unloaded_module.name, mod->name, sizeof(last_unloaded_module.name));
+ strscpy(last_unloaded_module.taints, module_flags(mod, buf, false), sizeof(last_unloaded_module.taints));
free_module(mod);
/* someone could wait for the module in add_unformed_module() */
@@ -986,31 +773,6 @@ out:
return ret;
}
-static inline void print_unload_info(struct seq_file *m, struct module *mod)
-{
- struct module_use *use;
- int printed_something = 0;
-
- seq_printf(m, " %i ", module_refcount(mod));
-
- /*
- * Always include a trailing , so userspace can differentiate
- * between this and the old multi-field proc format.
- */
- list_for_each_entry(use, &mod->source_list, source_list) {
- printed_something = 1;
- seq_printf(m, "%s,", use->source->name);
- }
-
- if (mod->init != NULL && mod->exit == NULL) {
- printed_something = 1;
- seq_puts(m, "[permanent],");
- }
-
- if (!printed_something)
- seq_puts(m, "-");
-}
-
void __symbol_put(const char *symbol)
{
struct find_symbol_arg fsa = {
@@ -1100,12 +862,6 @@ void module_put(struct module *module)
EXPORT_SYMBOL(module_put);
#else /* !CONFIG_MODULE_UNLOAD */
-static inline void print_unload_info(struct seq_file *m, struct module *mod)
-{
- /* We don't know the usage count, or what modules are using. */
- seq_puts(m, " - -");
-}
-
static inline void module_unload_free(struct module *mod)
{
}
@@ -1121,13 +877,13 @@ static inline int module_unload_init(struct module *mod)
}
#endif /* CONFIG_MODULE_UNLOAD */
-static size_t module_flags_taint(struct module *mod, char *buf)
+size_t module_flags_taint(unsigned long taints, char *buf)
{
size_t l = 0;
int i;
for (i = 0; i < TAINT_FLAGS_COUNT; i++) {
- if (taint_flags[i].module && test_bit(i, &mod->taints))
+ if (taint_flags[i].module && test_bit(i, &taints))
buf[l++] = taint_flags[i].c_true;
}
@@ -1174,16 +930,41 @@ struct module_attribute module_uevent =
static ssize_t show_coresize(struct module_attribute *mattr,
struct module_kobject *mk, char *buffer)
{
- return sprintf(buffer, "%u\n", mk->mod->core_layout.size);
+ unsigned int size = mk->mod->mem[MOD_TEXT].size;
+
+ if (!IS_ENABLED(CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC)) {
+ for_class_mod_mem_type(type, core_data)
+ size += mk->mod->mem[type].size;
+ }
+ return sprintf(buffer, "%u\n", size);
}
static struct module_attribute modinfo_coresize =
__ATTR(coresize, 0444, show_coresize, NULL);
+#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
+static ssize_t show_datasize(struct module_attribute *mattr,
+ struct module_kobject *mk, char *buffer)
+{
+ unsigned int size = 0;
+
+ for_class_mod_mem_type(type, core_data)
+ size += mk->mod->mem[type].size;
+ return sprintf(buffer, "%u\n", size);
+}
+
+static struct module_attribute modinfo_datasize =
+ __ATTR(datasize, 0444, show_datasize, NULL);
+#endif
+
static ssize_t show_initsize(struct module_attribute *mattr,
struct module_kobject *mk, char *buffer)
{
- return sprintf(buffer, "%u\n", mk->mod->init_layout.size);
+ unsigned int size = 0;
+
+ for_class_mod_mem_type(type, init)
+ size += mk->mod->mem[type].size;
+ return sprintf(buffer, "%u\n", size);
}
static struct module_attribute modinfo_initsize =
@@ -1194,7 +975,7 @@ static ssize_t show_taint(struct module_attribute *mattr,
{
size_t l;
- l = module_flags_taint(mk->mod, buffer);
+ l = module_flags_taint(mk->mod->taints, buffer);
buffer[l++] = '\n';
return l;
}
@@ -1202,12 +983,15 @@ static ssize_t show_taint(struct module_attribute *mattr,
static struct module_attribute modinfo_taint =
__ATTR(taint, 0444, show_taint, NULL);
-static struct module_attribute *modinfo_attrs[] = {
+struct module_attribute *modinfo_attrs[] = {
&module_uevent,
&modinfo_version,
&modinfo_srcversion,
&modinfo_initstate,
&modinfo_coresize,
+#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
+ &modinfo_datasize,
+#endif
&modinfo_initsize,
&modinfo_taint,
#ifdef CONFIG_MODULE_UNLOAD
@@ -1216,9 +1000,11 @@ static struct module_attribute *modinfo_attrs[] = {
NULL,
};
+size_t modinfo_attrs_count = ARRAY_SIZE(modinfo_attrs);
+
static const char vermagic[] = VERMAGIC_STRING;
-static int try_to_force_load(struct module *mod, const char *reason)
+int try_to_force_load(struct module *mod, const char *reason)
{
#ifdef CONFIG_MODULE_FORCE_LOAD
if (!test_taint(TAINT_FORCED_MODULE))
@@ -1230,118 +1016,55 @@ static int try_to_force_load(struct module *mod, const char *reason)
#endif
}
-#ifdef CONFIG_MODVERSIONS
-
-static u32 resolve_rel_crc(const s32 *crc)
-{
- return *(u32 *)((void *)crc + *crc);
-}
-
-static int check_version(const struct load_info *info,
- const char *symname,
- struct module *mod,
- const s32 *crc)
+/* Parse tag=value strings from .modinfo section */
+char *module_next_tag_pair(char *string, unsigned long *secsize)
{
- Elf_Shdr *sechdrs = info->sechdrs;
- unsigned int versindex = info->index.vers;
- unsigned int i, num_versions;
- struct modversion_info *versions;
-
- /* Exporting module didn't supply crcs? OK, we're already tainted. */
- if (!crc)
- return 1;
-
- /* No versions at all? modprobe --force does this. */
- if (versindex == 0)
- return try_to_force_load(mod, symname) == 0;
-
- versions = (void *) sechdrs[versindex].sh_addr;
- num_versions = sechdrs[versindex].sh_size
- / sizeof(struct modversion_info);
-
- for (i = 0; i < num_versions; i++) {
- u32 crcval;
-
- if (strcmp(versions[i].name, symname) != 0)
- continue;
-
- if (IS_ENABLED(CONFIG_MODULE_REL_CRCS))
- crcval = resolve_rel_crc(crc);
- else
- crcval = *crc;
- if (versions[i].crc == crcval)
- return 1;
- pr_debug("Found checksum %X vs module %lX\n",
- crcval, versions[i].crc);
- goto bad_version;
+ /* Skip non-zero chars */
+ while (string[0]) {
+ string++;
+ if ((*secsize)-- <= 1)
+ return NULL;
}
- /* Broken toolchain. Warn once, then let it go.. */
- pr_warn_once("%s: no symbol version for %s\n", info->name, symname);
- return 1;
-
-bad_version:
- pr_warn("%s: disagrees about version of symbol %s\n",
- info->name, symname);
- return 0;
+ /* Skip any zero padding. */
+ while (!string[0]) {
+ string++;
+ if ((*secsize)-- <= 1)
+ return NULL;
+ }
+ return string;
}
-static inline int check_modstruct_version(const struct load_info *info,
- struct module *mod)
+static char *get_next_modinfo(const struct load_info *info, const char *tag,
+ char *prev)
{
- struct find_symbol_arg fsa = {
- .name = "module_layout",
- .gplok = true,
- };
+ char *p;
+ unsigned int taglen = strlen(tag);
+ Elf_Shdr *infosec = &info->sechdrs[info->index.info];
+ unsigned long size = infosec->sh_size;
/*
- * Since this should be found in kernel (which can't be removed), no
- * locking is necessary -- use preempt_disable() to placate lockdep.
+ * get_modinfo() calls made before rewrite_section_headers()
+ * must use sh_offset, as sh_addr isn't set!
*/
- preempt_disable();
- if (!find_symbol(&fsa)) {
- preempt_enable();
- BUG();
- }
- preempt_enable();
- return check_version(info, "module_layout", mod, fsa.crc);
-}
+ char *modinfo = (char *)info->hdr + infosec->sh_offset;
-/* First part is kernel version, which we ignore if module has crcs. */
-static inline int same_magic(const char *amagic, const char *bmagic,
- bool has_crcs)
-{
- if (has_crcs) {
- amagic += strcspn(amagic, " ");
- bmagic += strcspn(bmagic, " ");
+ if (prev) {
+ size -= prev - modinfo;
+ modinfo = module_next_tag_pair(prev, &size);
}
- return strcmp(amagic, bmagic) == 0;
-}
-#else
-static inline int check_version(const struct load_info *info,
- const char *symname,
- struct module *mod,
- const s32 *crc)
-{
- return 1;
-}
-static inline int check_modstruct_version(const struct load_info *info,
- struct module *mod)
-{
- return 1;
+ for (p = modinfo; p; p = module_next_tag_pair(p, &size)) {
+ if (strncmp(p, tag, taglen) == 0 && p[taglen] == '=')
+ return p + taglen + 1;
+ }
+ return NULL;
}
-static inline int same_magic(const char *amagic, const char *bmagic,
- bool has_crcs)
+static char *get_modinfo(const struct load_info *info, const char *tag)
{
- return strcmp(amagic, bmagic) == 0;
+ return get_next_modinfo(info, tag, NULL);
}
-#endif /* CONFIG_MODVERSIONS */
-
-static char *get_modinfo(const struct load_info *info, const char *tag);
-static char *get_next_modinfo(const struct load_info *info, const char *tag,
- char *prev);
static int verify_namespace_is_imported(const struct load_info *info,
const struct kernel_symbol *sym,
@@ -1352,12 +1075,9 @@ static int verify_namespace_is_imported(const struct load_info *info,
namespace = kernel_symbol_namespace(sym);
if (namespace && namespace[0]) {
- imported_namespace = get_modinfo(info, "import_ns");
- while (imported_namespace) {
+ for_each_modinfo_entry(imported_namespace, info, "import_ns") {
if (strcmp(namespace, imported_namespace) == 0)
return 0;
- imported_namespace = get_next_modinfo(
- info, "import_ns", imported_namespace);
}
#ifdef CONFIG_MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS
pr_warn(
@@ -1373,20 +1093,20 @@ static int verify_namespace_is_imported(const struct load_info *info,
return 0;
}
-static bool inherit_taint(struct module *mod, struct module *owner)
+static bool inherit_taint(struct module *mod, struct module *owner, const char *name)
{
if (!owner || !test_bit(TAINT_PROPRIETARY_MODULE, &owner->taints))
return true;
if (mod->using_gplonly_symbols) {
- pr_err("%s: module using GPL-only symbols uses symbols from proprietary module %s.\n",
- mod->name, owner->name);
+ pr_err("%s: module using GPL-only symbols uses symbols %s from proprietary module %s.\n",
+ mod->name, name, owner->name);
return false;
}
if (!test_bit(TAINT_PROPRIETARY_MODULE, &mod->taints)) {
- pr_warn("%s: module uses symbols from proprietary module %s, inheriting taint.\n",
- mod->name, owner->name);
+ pr_warn("%s: module uses symbols %s from proprietary module %s, inheriting taint.\n",
+ mod->name, name, owner->name);
set_bit(TAINT_PROPRIETARY_MODULE, &mod->taints);
}
return true;
@@ -1418,7 +1138,7 @@ static const struct kernel_symbol *resolve_symbol(struct module *mod,
if (fsa.license == GPL_ONLY)
mod->using_gplonly_symbols = true;
- if (!inherit_taint(mod, fsa.owner)) {
+ if (!inherit_taint(mod, fsa.owner, name)) {
fsa.sym = NULL;
goto getname;
}
@@ -1466,693 +1186,63 @@ resolve_symbol_wait(struct module *mod,
return ksym;
}
-#ifdef CONFIG_KALLSYMS
-static inline bool sect_empty(const Elf_Shdr *sect)
-{
- return !(sect->sh_flags & SHF_ALLOC) || sect->sh_size == 0;
-}
-#endif
-
-/*
- * /sys/module/foo/sections stuff
- * J. Corbet <corbet@lwn.net>
- */
-#ifdef CONFIG_SYSFS
-
-#ifdef CONFIG_KALLSYMS
-struct module_sect_attr {
- struct bin_attribute battr;
- unsigned long address;
-};
-
-struct module_sect_attrs {
- struct attribute_group grp;
- unsigned int nsections;
- struct module_sect_attr attrs[];
-};
-
-#define MODULE_SECT_READ_SIZE (3 /* "0x", "\n" */ + (BITS_PER_LONG / 4))
-static ssize_t module_sect_read(struct file *file, struct kobject *kobj,
- struct bin_attribute *battr,
- char *buf, loff_t pos, size_t count)
-{
- struct module_sect_attr *sattr =
- container_of(battr, struct module_sect_attr, battr);
- char bounce[MODULE_SECT_READ_SIZE + 1];
- size_t wrote;
-
- if (pos != 0)
- return -EINVAL;
-
- /*
- * Since we're a binary read handler, we must account for the
- * trailing NUL byte that sprintf will write: if "buf" is
- * too small to hold the NUL, or the NUL is exactly the last
- * byte, the read will look like it got truncated by one byte.
- * Since there is no way to ask sprintf nicely to not write
- * the NUL, we have to use a bounce buffer.
- */
- wrote = scnprintf(bounce, sizeof(bounce), "0x%px\n",
- kallsyms_show_value(file->f_cred)
- ? (void *)sattr->address : NULL);
- count = min(count, wrote);
- memcpy(buf, bounce, count);
-
- return count;
-}
-
-static void free_sect_attrs(struct module_sect_attrs *sect_attrs)
-{
- unsigned int section;
-
- for (section = 0; section < sect_attrs->nsections; section++)
- kfree(sect_attrs->attrs[section].battr.attr.name);
- kfree(sect_attrs);
-}
-
-static void add_sect_attrs(struct module *mod, const struct load_info *info)
-{
- unsigned int nloaded = 0, i, size[2];
- struct module_sect_attrs *sect_attrs;
- struct module_sect_attr *sattr;
- struct bin_attribute **gattr;
-
- /* Count loaded sections and allocate structures */
- for (i = 0; i < info->hdr->e_shnum; i++)
- if (!sect_empty(&info->sechdrs[i]))
- nloaded++;
- size[0] = ALIGN(struct_size(sect_attrs, attrs, nloaded),
- sizeof(sect_attrs->grp.bin_attrs[0]));
- size[1] = (nloaded + 1) * sizeof(sect_attrs->grp.bin_attrs[0]);
- sect_attrs = kzalloc(size[0] + size[1], GFP_KERNEL);
- if (sect_attrs == NULL)
- return;
-
- /* Setup section attributes. */
- sect_attrs->grp.name = "sections";
- sect_attrs->grp.bin_attrs = (void *)sect_attrs + size[0];
-
- sect_attrs->nsections = 0;
- sattr = &sect_attrs->attrs[0];
- gattr = &sect_attrs->grp.bin_attrs[0];
- for (i = 0; i < info->hdr->e_shnum; i++) {
- Elf_Shdr *sec = &info->sechdrs[i];
- if (sect_empty(sec))
- continue;
- sysfs_bin_attr_init(&sattr->battr);
- sattr->address = sec->sh_addr;
- sattr->battr.attr.name =
- kstrdup(info->secstrings + sec->sh_name, GFP_KERNEL);
- if (sattr->battr.attr.name == NULL)
- goto out;
- sect_attrs->nsections++;
- sattr->battr.read = module_sect_read;
- sattr->battr.size = MODULE_SECT_READ_SIZE;
- sattr->battr.attr.mode = 0400;
- *(gattr++) = &(sattr++)->battr;
- }
- *gattr = NULL;
-
- if (sysfs_create_group(&mod->mkobj.kobj, &sect_attrs->grp))
- goto out;
-
- mod->sect_attrs = sect_attrs;
- return;
- out:
- free_sect_attrs(sect_attrs);
-}
-
-static void remove_sect_attrs(struct module *mod)
-{
- if (mod->sect_attrs) {
- sysfs_remove_group(&mod->mkobj.kobj,
- &mod->sect_attrs->grp);
- /*
- * We are positive that no one is using any sect attrs
- * at this point. Deallocate immediately.
- */
- free_sect_attrs(mod->sect_attrs);
- mod->sect_attrs = NULL;
- }
-}
-
-/*
- * /sys/module/foo/notes/.section.name gives contents of SHT_NOTE sections.
- */
-
-struct module_notes_attrs {
- struct kobject *dir;
- unsigned int notes;
- struct bin_attribute attrs[];
-};
-
-static ssize_t module_notes_read(struct file *filp, struct kobject *kobj,
- struct bin_attribute *bin_attr,
- char *buf, loff_t pos, size_t count)
+void __weak module_memfree(void *module_region)
{
/*
- * The caller checked the pos and count against our size.
+ * This memory may be RO, and freeing RO memory in an interrupt is not
+ * supported by vmalloc.
*/
- memcpy(buf, bin_attr->private + pos, count);
- return count;
-}
-
-static void free_notes_attrs(struct module_notes_attrs *notes_attrs,
- unsigned int i)
-{
- if (notes_attrs->dir) {
- while (i-- > 0)
- sysfs_remove_bin_file(notes_attrs->dir,
- &notes_attrs->attrs[i]);
- kobject_put(notes_attrs->dir);
- }
- kfree(notes_attrs);
-}
-
-static void add_notes_attrs(struct module *mod, const struct load_info *info)
-{
- unsigned int notes, loaded, i;
- struct module_notes_attrs *notes_attrs;
- struct bin_attribute *nattr;
-
- /* failed to create section attributes, so can't create notes */
- if (!mod->sect_attrs)
- return;
-
- /* Count notes sections and allocate structures. */
- notes = 0;
- for (i = 0; i < info->hdr->e_shnum; i++)
- if (!sect_empty(&info->sechdrs[i]) &&
- (info->sechdrs[i].sh_type == SHT_NOTE))
- ++notes;
-
- if (notes == 0)
- return;
-
- notes_attrs = kzalloc(struct_size(notes_attrs, attrs, notes),
- GFP_KERNEL);
- if (notes_attrs == NULL)
- return;
-
- notes_attrs->notes = notes;
- nattr = &notes_attrs->attrs[0];
- for (loaded = i = 0; i < info->hdr->e_shnum; ++i) {
- if (sect_empty(&info->sechdrs[i]))
- continue;
- if (info->sechdrs[i].sh_type == SHT_NOTE) {
- sysfs_bin_attr_init(nattr);
- nattr->attr.name = mod->sect_attrs->attrs[loaded].battr.attr.name;
- nattr->attr.mode = S_IRUGO;
- nattr->size = info->sechdrs[i].sh_size;
- nattr->private = (void *) info->sechdrs[i].sh_addr;
- nattr->read = module_notes_read;
- ++nattr;
- }
- ++loaded;
- }
-
- notes_attrs->dir = kobject_create_and_add("notes", &mod->mkobj.kobj);
- if (!notes_attrs->dir)
- goto out;
-
- for (i = 0; i < notes; ++i)
- if (sysfs_create_bin_file(notes_attrs->dir,
- &notes_attrs->attrs[i]))
- goto out;
-
- mod->notes_attrs = notes_attrs;
- return;
-
- out:
- free_notes_attrs(notes_attrs, i);
-}
-
-static void remove_notes_attrs(struct module *mod)
-{
- if (mod->notes_attrs)
- free_notes_attrs(mod->notes_attrs, mod->notes_attrs->notes);
-}
-
-#else
-
-static inline void add_sect_attrs(struct module *mod,
- const struct load_info *info)
-{
+ WARN_ON(in_interrupt());
+ vfree(module_region);
}
-static inline void remove_sect_attrs(struct module *mod)
+void __weak module_arch_cleanup(struct module *mod)
{
}
-static inline void add_notes_attrs(struct module *mod,
- const struct load_info *info)
+void __weak module_arch_freeing_init(struct module *mod)
{
}
-static inline void remove_notes_attrs(struct module *mod)
+static bool mod_mem_use_vmalloc(enum mod_mem_type type)
{
+ return IS_ENABLED(CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC) &&
+ mod_mem_type_is_core_data(type);
}
-#endif /* CONFIG_KALLSYMS */
-static void del_usage_links(struct module *mod)
+static void *module_memory_alloc(unsigned int size, enum mod_mem_type type)
{
-#ifdef CONFIG_MODULE_UNLOAD
- struct module_use *use;
-
- mutex_lock(&module_mutex);
- list_for_each_entry(use, &mod->target_list, target_list)
- sysfs_remove_link(use->target->holders_dir, mod->name);
- mutex_unlock(&module_mutex);
-#endif
+ if (mod_mem_use_vmalloc(type))
+ return vzalloc(size);
+ return module_alloc(size);
}
-static int add_usage_links(struct module *mod)
+static void module_memory_free(void *ptr, enum mod_mem_type type)
{
- int ret = 0;
-#ifdef CONFIG_MODULE_UNLOAD
- struct module_use *use;
-
- mutex_lock(&module_mutex);
- list_for_each_entry(use, &mod->target_list, target_list) {
- ret = sysfs_create_link(use->target->holders_dir,
- &mod->mkobj.kobj, mod->name);
- if (ret)
- break;
- }
- mutex_unlock(&module_mutex);
- if (ret)
- del_usage_links(mod);
-#endif
- return ret;
-}
-
-static void module_remove_modinfo_attrs(struct module *mod, int end);
-
-static int module_add_modinfo_attrs(struct module *mod)
-{
- struct module_attribute *attr;
- struct module_attribute *temp_attr;
- int error = 0;
- int i;
-
- mod->modinfo_attrs = kzalloc((sizeof(struct module_attribute) *
- (ARRAY_SIZE(modinfo_attrs) + 1)),
- GFP_KERNEL);
- if (!mod->modinfo_attrs)
- return -ENOMEM;
-
- temp_attr = mod->modinfo_attrs;
- for (i = 0; (attr = modinfo_attrs[i]); i++) {
- if (!attr->test || attr->test(mod)) {
- memcpy(temp_attr, attr, sizeof(*temp_attr));
- sysfs_attr_init(&temp_attr->attr);
- error = sysfs_create_file(&mod->mkobj.kobj,
- &temp_attr->attr);
- if (error)
- goto error_out;
- ++temp_attr;
- }
- }
-
- return 0;
-
-error_out:
- if (i > 0)
- module_remove_modinfo_attrs(mod, --i);
+ if (mod_mem_use_vmalloc(type))
+ vfree(ptr);
else
- kfree(mod->modinfo_attrs);
- return error;
-}
-
-static void module_remove_modinfo_attrs(struct module *mod, int end)
-{
- struct module_attribute *attr;
- int i;
-
- for (i = 0; (attr = &mod->modinfo_attrs[i]); i++) {
- if (end >= 0 && i > end)
- break;
- /* pick a field to test for end of list */
- if (!attr->attr.name)
- break;
- sysfs_remove_file(&mod->mkobj.kobj, &attr->attr);
- if (attr->free)
- attr->free(mod);
- }
- kfree(mod->modinfo_attrs);
-}
-
-static void mod_kobject_put(struct module *mod)
-{
- DECLARE_COMPLETION_ONSTACK(c);
- mod->mkobj.kobj_completion = &c;
- kobject_put(&mod->mkobj.kobj);
- wait_for_completion(&c);
+ module_memfree(ptr);
}
-static int mod_sysfs_init(struct module *mod)
+static void free_mod_mem(struct module *mod)
{
- int err;
- struct kobject *kobj;
-
- if (!module_sysfs_initialized) {
- pr_err("%s: module sysfs not initialized\n", mod->name);
- err = -EINVAL;
- goto out;
- }
-
- kobj = kset_find_obj(module_kset, mod->name);
- if (kobj) {
- pr_err("%s: module is already loaded\n", mod->name);
- kobject_put(kobj);
- err = -EINVAL;
- goto out;
- }
-
- mod->mkobj.mod = mod;
-
- memset(&mod->mkobj.kobj, 0, sizeof(mod->mkobj.kobj));
- mod->mkobj.kobj.kset = module_kset;
- err = kobject_init_and_add(&mod->mkobj.kobj, &module_ktype, NULL,
- "%s", mod->name);
- if (err)
- mod_kobject_put(mod);
-
-out:
- return err;
-}
-
-static int mod_sysfs_setup(struct module *mod,
- const struct load_info *info,
- struct kernel_param *kparam,
- unsigned int num_params)
-{
- int err;
-
- err = mod_sysfs_init(mod);
- if (err)
- goto out;
-
- mod->holders_dir = kobject_create_and_add("holders", &mod->mkobj.kobj);
- if (!mod->holders_dir) {
- err = -ENOMEM;
- goto out_unreg;
- }
-
- err = module_param_sysfs_setup(mod, kparam, num_params);
- if (err)
- goto out_unreg_holders;
-
- err = module_add_modinfo_attrs(mod);
- if (err)
- goto out_unreg_param;
-
- err = add_usage_links(mod);
- if (err)
- goto out_unreg_modinfo_attrs;
-
- add_sect_attrs(mod, info);
- add_notes_attrs(mod, info);
-
- return 0;
-
-out_unreg_modinfo_attrs:
- module_remove_modinfo_attrs(mod, -1);
-out_unreg_param:
- module_param_sysfs_remove(mod);
-out_unreg_holders:
- kobject_put(mod->holders_dir);
-out_unreg:
- mod_kobject_put(mod);
-out:
- return err;
-}
-
-static void mod_sysfs_fini(struct module *mod)
-{
- remove_notes_attrs(mod);
- remove_sect_attrs(mod);
- mod_kobject_put(mod);
-}
-
-static void init_param_lock(struct module *mod)
-{
- mutex_init(&mod->param_lock);
-}
-#else /* !CONFIG_SYSFS */
-
-static int mod_sysfs_setup(struct module *mod,
- const struct load_info *info,
- struct kernel_param *kparam,
- unsigned int num_params)
-{
- return 0;
-}
-
-static void mod_sysfs_fini(struct module *mod)
-{
-}
-
-static void module_remove_modinfo_attrs(struct module *mod, int end)
-{
-}
+ for_each_mod_mem_type(type) {
+ struct module_memory *mod_mem = &mod->mem[type];
-static void del_usage_links(struct module *mod)
-{
-}
-
-static void init_param_lock(struct module *mod)
-{
-}
-#endif /* CONFIG_SYSFS */
-
-static void mod_sysfs_teardown(struct module *mod)
-{
- del_usage_links(mod);
- module_remove_modinfo_attrs(mod, -1);
- module_param_sysfs_remove(mod);
- kobject_put(mod->mkobj.drivers_dir);
- kobject_put(mod->holders_dir);
- mod_sysfs_fini(mod);
-}
-
-/*
- * LKM RO/NX protection: protect module's text/ro-data
- * from modification and any data from execution.
- *
- * General layout of module is:
- * [text] [read-only-data] [ro-after-init] [writable data]
- * text_size -----^ ^ ^ ^
- * ro_size ------------------------| | |
- * ro_after_init_size -----------------------------| |
- * size -----------------------------------------------------------|
- *
- * These values are always page-aligned (as is base)
- */
-
-/*
- * Since some arches are moving towards PAGE_KERNEL module allocations instead
- * of PAGE_KERNEL_EXEC, keep frob_text() and module_enable_x() outside of the
- * CONFIG_STRICT_MODULE_RWX block below because they are needed regardless of
- * whether we are strict.
- */
-#ifdef CONFIG_ARCH_HAS_STRICT_MODULE_RWX
-static void frob_text(const struct module_layout *layout,
- int (*set_memory)(unsigned long start, int num_pages))
-{
- BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1));
- BUG_ON((unsigned long)layout->text_size & (PAGE_SIZE-1));
- set_memory((unsigned long)layout->base,
- layout->text_size >> PAGE_SHIFT);
-}
-
-static void module_enable_x(const struct module *mod)
-{
- frob_text(&mod->core_layout, set_memory_x);
- frob_text(&mod->init_layout, set_memory_x);
-}
-#else /* !CONFIG_ARCH_HAS_STRICT_MODULE_RWX */
-static void module_enable_x(const struct module *mod) { }
-#endif /* CONFIG_ARCH_HAS_STRICT_MODULE_RWX */
-
-#ifdef CONFIG_STRICT_MODULE_RWX
-static void frob_rodata(const struct module_layout *layout,
- int (*set_memory)(unsigned long start, int num_pages))
-{
- BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1));
- BUG_ON((unsigned long)layout->text_size & (PAGE_SIZE-1));
- BUG_ON((unsigned long)layout->ro_size & (PAGE_SIZE-1));
- set_memory((unsigned long)layout->base + layout->text_size,
- (layout->ro_size - layout->text_size) >> PAGE_SHIFT);
-}
-
-static void frob_ro_after_init(const struct module_layout *layout,
- int (*set_memory)(unsigned long start, int num_pages))
-{
- BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1));
- BUG_ON((unsigned long)layout->ro_size & (PAGE_SIZE-1));
- BUG_ON((unsigned long)layout->ro_after_init_size & (PAGE_SIZE-1));
- set_memory((unsigned long)layout->base + layout->ro_size,
- (layout->ro_after_init_size - layout->ro_size) >> PAGE_SHIFT);
-}
-
-static void frob_writable_data(const struct module_layout *layout,
- int (*set_memory)(unsigned long start, int num_pages))
-{
- BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1));
- BUG_ON((unsigned long)layout->ro_after_init_size & (PAGE_SIZE-1));
- BUG_ON((unsigned long)layout->size & (PAGE_SIZE-1));
- set_memory((unsigned long)layout->base + layout->ro_after_init_size,
- (layout->size - layout->ro_after_init_size) >> PAGE_SHIFT);
-}
-
-static void module_enable_ro(const struct module *mod, bool after_init)
-{
- if (!rodata_enabled)
- return;
-
- set_vm_flush_reset_perms(mod->core_layout.base);
- set_vm_flush_reset_perms(mod->init_layout.base);
- frob_text(&mod->core_layout, set_memory_ro);
-
- frob_rodata(&mod->core_layout, set_memory_ro);
- frob_text(&mod->init_layout, set_memory_ro);
- frob_rodata(&mod->init_layout, set_memory_ro);
-
- if (after_init)
- frob_ro_after_init(&mod->core_layout, set_memory_ro);
-}
-
-static void module_enable_nx(const struct module *mod)
-{
- frob_rodata(&mod->core_layout, set_memory_nx);
- frob_ro_after_init(&mod->core_layout, set_memory_nx);
- frob_writable_data(&mod->core_layout, set_memory_nx);
- frob_rodata(&mod->init_layout, set_memory_nx);
- frob_writable_data(&mod->init_layout, set_memory_nx);
-}
-
-static int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
- char *secstrings, struct module *mod)
-{
- const unsigned long shf_wx = SHF_WRITE|SHF_EXECINSTR;
- int i;
-
- for (i = 0; i < hdr->e_shnum; i++) {
- if ((sechdrs[i].sh_flags & shf_wx) == shf_wx) {
- pr_err("%s: section %s (index %d) has invalid WRITE|EXEC flags\n",
- mod->name, secstrings + sechdrs[i].sh_name, i);
- return -ENOEXEC;
- }
- }
-
- return 0;
-}
-
-#else /* !CONFIG_STRICT_MODULE_RWX */
-static void module_enable_nx(const struct module *mod) { }
-static void module_enable_ro(const struct module *mod, bool after_init) {}
-static int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
- char *secstrings, struct module *mod)
-{
- return 0;
-}
-#endif /* CONFIG_STRICT_MODULE_RWX */
-
-#ifdef CONFIG_LIVEPATCH
-/*
- * Persist Elf information about a module. Copy the Elf header,
- * section header table, section string table, and symtab section
- * index from info to mod->klp_info.
- */
-static int copy_module_elf(struct module *mod, struct load_info *info)
-{
- unsigned int size, symndx;
- int ret;
-
- size = sizeof(*mod->klp_info);
- mod->klp_info = kmalloc(size, GFP_KERNEL);
- if (mod->klp_info == NULL)
- return -ENOMEM;
-
- /* Elf header */
- size = sizeof(mod->klp_info->hdr);
- memcpy(&mod->klp_info->hdr, info->hdr, size);
-
- /* Elf section header table */
- size = sizeof(*info->sechdrs) * info->hdr->e_shnum;
- mod->klp_info->sechdrs = kmemdup(info->sechdrs, size, GFP_KERNEL);
- if (mod->klp_info->sechdrs == NULL) {
- ret = -ENOMEM;
- goto free_info;
- }
+ if (type == MOD_DATA)
+ continue;
- /* Elf section name string table */
- size = info->sechdrs[info->hdr->e_shstrndx].sh_size;
- mod->klp_info->secstrings = kmemdup(info->secstrings, size, GFP_KERNEL);
- if (mod->klp_info->secstrings == NULL) {
- ret = -ENOMEM;
- goto free_sechdrs;
+ /* Free lock-classes; relies on the preceding sync_rcu(). */
+ lockdep_free_key_range(mod_mem->base, mod_mem->size);
+ if (mod_mem->size)
+ module_memory_free(mod_mem->base, type);
}
- /* Elf symbol section index */
- symndx = info->index.sym;
- mod->klp_info->symndx = symndx;
-
- /*
- * For livepatch modules, core_kallsyms.symtab is a complete
- * copy of the original symbol table. Adjust sh_addr to point
- * to core_kallsyms.symtab since the copy of the symtab in module
- * init memory is freed at the end of do_init_module().
- */
- mod->klp_info->sechdrs[symndx].sh_addr = \
- (unsigned long) mod->core_kallsyms.symtab;
-
- return 0;
-
-free_sechdrs:
- kfree(mod->klp_info->sechdrs);
-free_info:
- kfree(mod->klp_info);
- return ret;
-}
-
-static void free_module_elf(struct module *mod)
-{
- kfree(mod->klp_info->sechdrs);
- kfree(mod->klp_info->secstrings);
- kfree(mod->klp_info);
-}
-#else /* !CONFIG_LIVEPATCH */
-static int copy_module_elf(struct module *mod, struct load_info *info)
-{
- return 0;
-}
-
-static void free_module_elf(struct module *mod)
-{
+ /* MOD_DATA hosts mod, so free it at last */
+ lockdep_free_key_range(mod->mem[MOD_DATA].base, mod->mem[MOD_DATA].size);
+ module_memory_free(mod->mem[MOD_DATA].base, MOD_DATA);
}
-#endif /* CONFIG_LIVEPATCH */
-
-void __weak module_memfree(void *module_region)
-{
- /*
- * This memory may be RO, and freeing RO memory in an interrupt is not
- * supported by vmalloc.
- */
- WARN_ON(in_interrupt());
- vfree(module_region);
-}
-
-void __weak module_arch_cleanup(struct module *mod)
-{
-}
-
-void __weak module_arch_freeing_init(struct module *mod)
-{
-}
-
-static void cfi_cleanup(struct module *mod);
/* Free a module, remove from lists, etc. */
static void free_module(struct module *mod)
@@ -2169,9 +1259,6 @@ static void free_module(struct module *mod)
mod->state = MODULE_STATE_UNFORMED;
mutex_unlock(&module_mutex);
- /* Remove dynamic debug info */
- ddebug_remove_module(mod->name);
-
/* Arch-specific cleanup. */
module_arch_cleanup(mod);
@@ -2193,22 +1280,17 @@ static void free_module(struct module *mod)
module_bug_cleanup(mod);
/* Wait for RCU-sched synchronizing before releasing mod->list and buglist. */
synchronize_rcu();
+ if (try_add_tainted_module(mod))
+ pr_err("%s: adding tainted module to the unloaded tainted modules list failed.\n",
+ mod->name);
mutex_unlock(&module_mutex);
- /* Clean up CFI for the module. */
- cfi_cleanup(mod);
-
/* This may be empty, but that's OK */
module_arch_freeing_init(mod);
- module_memfree(mod->init_layout.base);
kfree(mod->args);
percpu_modfree(mod);
- /* Free lock-classes; relies on the preceding sync_rcu(). */
- lockdep_free_key_range(mod->core_layout.base, mod->core_layout.size);
-
- /* Finally, free the core (containing the module structure) */
- module_memfree(mod->core_layout.base);
+ free_mod_mem(mod);
}
void *__symbol_get(const char *symbol)
@@ -2311,8 +1393,8 @@ static int simplify_symbols(struct module *mod, const struct load_info *info)
case SHN_ABS:
/* Don't need to do anything */
- pr_debug("Absolute symbol: 0x%08lx\n",
- (long)sym[i].st_value);
+ pr_debug("Absolute symbol: 0x%08lx %s\n",
+ (long)sym[i].st_value, name);
break;
case SHN_LIVEPATCH:
@@ -2395,16 +1477,18 @@ unsigned int __weak arch_mod_section_prepend(struct module *mod,
return 0;
}
-/* Update size with this section: return offset. */
-static long get_offset(struct module *mod, unsigned int *size,
- Elf_Shdr *sechdr, unsigned int section)
+long module_get_offset_and_type(struct module *mod, enum mod_mem_type type,
+ Elf_Shdr *sechdr, unsigned int section)
{
- long ret;
+ long offset;
+ long mask = ((unsigned long)(type) & SH_ENTSIZE_TYPE_MASK) << SH_ENTSIZE_TYPE_SHIFT;
- *size += arch_mod_section_prepend(mod, section);
- ret = ALIGN(*size, sechdr->sh_addralign ?: 1);
- *size = ret + sechdr->sh_size;
- return ret;
+ mod->mem[type].size += arch_mod_section_prepend(mod, section);
+ offset = ALIGN(mod->mem[type].size, sechdr->sh_addralign ?: 1);
+ mod->mem[type].size = offset + sechdr->sh_size;
+
+ WARN_ON_ONCE(offset & mask);
+ return offset | mask;
}
static bool module_init_layout_section(const char *sname)
@@ -2416,15 +1500,11 @@ static bool module_init_layout_section(const char *sname)
return module_init_section(sname);
}
-/*
- * Lay out the SHF_ALLOC sections in a way not dissimilar to how ld
- * might -- code, read-only data, read-write data, small data. Tally
- * sizes, and place the offsets into sh_entsize fields: high bit means it
- * belongs in init.
- */
-static void layout_sections(struct module *mod, struct load_info *info)
+static void __layout_sections(struct module *mod, struct load_info *info, bool is_init)
{
- static unsigned long const masks[][2] = {
+ unsigned int m, i;
+
+ static const unsigned long masks[][2] = {
/*
* NOTE: all executable code must be the first section
* in this array; otherwise modify the text_size
@@ -2436,13 +1516,24 @@ static void layout_sections(struct module *mod, struct load_info *info)
{ SHF_WRITE | SHF_ALLOC, ARCH_SHF_SMALL },
{ ARCH_SHF_SMALL | SHF_ALLOC, 0 }
};
- unsigned int m, i;
-
- for (i = 0; i < info->hdr->e_shnum; i++)
- info->sechdrs[i].sh_entsize = ~0UL;
+ static const int core_m_to_mem_type[] = {
+ MOD_TEXT,
+ MOD_RODATA,
+ MOD_RO_AFTER_INIT,
+ MOD_DATA,
+ MOD_INVALID, /* This is needed to match the masks array */
+ };
+ static const int init_m_to_mem_type[] = {
+ MOD_INIT_TEXT,
+ MOD_INIT_RODATA,
+ MOD_INVALID,
+ MOD_INIT_DATA,
+ MOD_INVALID, /* This is needed to match the masks array */
+ };
- pr_debug("Core section allocation order:\n");
for (m = 0; m < ARRAY_SIZE(masks); ++m) {
+ enum mod_mem_type type = is_init ? init_m_to_mem_type[m] : core_m_to_mem_type[m];
+
for (i = 0; i < info->hdr->e_shnum; ++i) {
Elf_Shdr *s = &info->sechdrs[i];
const char *sname = info->secstrings + s->sh_name;
@@ -2450,69 +1541,39 @@ static void layout_sections(struct module *mod, struct load_info *info)
if ((s->sh_flags & masks[m][0]) != masks[m][0]
|| (s->sh_flags & masks[m][1])
|| s->sh_entsize != ~0UL
- || module_init_layout_section(sname))
+ || is_init != module_init_layout_section(sname))
continue;
- s->sh_entsize = get_offset(mod, &mod->core_layout.size, s, i);
- pr_debug("\t%s\n", sname);
- }
- switch (m) {
- case 0: /* executable */
- mod->core_layout.size = debug_align(mod->core_layout.size);
- mod->core_layout.text_size = mod->core_layout.size;
- break;
- case 1: /* RO: text and ro-data */
- mod->core_layout.size = debug_align(mod->core_layout.size);
- mod->core_layout.ro_size = mod->core_layout.size;
- break;
- case 2: /* RO after init */
- mod->core_layout.size = debug_align(mod->core_layout.size);
- mod->core_layout.ro_after_init_size = mod->core_layout.size;
- break;
- case 4: /* whole core */
- mod->core_layout.size = debug_align(mod->core_layout.size);
- break;
- }
- }
- pr_debug("Init section allocation order:\n");
- for (m = 0; m < ARRAY_SIZE(masks); ++m) {
- for (i = 0; i < info->hdr->e_shnum; ++i) {
- Elf_Shdr *s = &info->sechdrs[i];
- const char *sname = info->secstrings + s->sh_name;
-
- if ((s->sh_flags & masks[m][0]) != masks[m][0]
- || (s->sh_flags & masks[m][1])
- || s->sh_entsize != ~0UL
- || !module_init_layout_section(sname))
+ if (WARN_ON_ONCE(type == MOD_INVALID))
continue;
- s->sh_entsize = (get_offset(mod, &mod->init_layout.size, s, i)
- | INIT_OFFSET_MASK);
+
+ s->sh_entsize = module_get_offset_and_type(mod, type, s, i);
pr_debug("\t%s\n", sname);
}
- switch (m) {
- case 0: /* executable */
- mod->init_layout.size = debug_align(mod->init_layout.size);
- mod->init_layout.text_size = mod->init_layout.size;
- break;
- case 1: /* RO: text and ro-data */
- mod->init_layout.size = debug_align(mod->init_layout.size);
- mod->init_layout.ro_size = mod->init_layout.size;
- break;
- case 2:
- /*
- * RO after init doesn't apply to init_layout (only
- * core_layout), so it just takes the value of ro_size.
- */
- mod->init_layout.ro_after_init_size = mod->init_layout.ro_size;
- break;
- case 4: /* whole init */
- mod->init_layout.size = debug_align(mod->init_layout.size);
- break;
- }
}
}
-static void set_license(struct module *mod, const char *license)
+/*
+ * Lay out the SHF_ALLOC sections in a way not dissimilar to how ld
+ * might -- code, read-only data, read-write data, small data. Tally
+ * sizes, and place the offsets into sh_entsize fields: high bit means it
+ * belongs in init.
+ */
+static void layout_sections(struct module *mod, struct load_info *info)
+{
+ unsigned int i;
+
+ for (i = 0; i < info->hdr->e_shnum; i++)
+ info->sechdrs[i].sh_entsize = ~0UL;
+
+ pr_debug("Core section allocation order for %s:\n", mod->name);
+ __layout_sections(mod, info, false);
+
+ pr_debug("Init section allocation order for %s:\n", mod->name);
+ __layout_sections(mod, info, true);
+}
+
+static void module_license_taint_check(struct module *mod, const char *license)
{
if (!license)
license = "unspecified";
@@ -2526,56 +1587,6 @@ static void set_license(struct module *mod, const char *license)
}
}
-/* Parse tag=value strings from .modinfo section */
-static char *next_string(char *string, unsigned long *secsize)
-{
- /* Skip non-zero chars */
- while (string[0]) {
- string++;
- if ((*secsize)-- <= 1)
- return NULL;
- }
-
- /* Skip any zero padding. */
- while (!string[0]) {
- string++;
- if ((*secsize)-- <= 1)
- return NULL;
- }
- return string;
-}
-
-static char *get_next_modinfo(const struct load_info *info, const char *tag,
- char *prev)
-{
- char *p;
- unsigned int taglen = strlen(tag);
- Elf_Shdr *infosec = &info->sechdrs[info->index.info];
- unsigned long size = infosec->sh_size;
-
- /*
- * get_modinfo() calls made before rewrite_section_headers()
- * must use sh_offset, as sh_addr isn't set!
- */
- char *modinfo = (char *)info->hdr + infosec->sh_offset;
-
- if (prev) {
- size -= prev - modinfo;
- modinfo = next_string(prev, &size);
- }
-
- for (p = modinfo; p; p = next_string(p, &size)) {
- if (strncmp(p, tag, taglen) == 0 && p[taglen] == '=')
- return p + taglen + 1;
- }
- return NULL;
-}
-
-static char *get_modinfo(const struct load_info *info, const char *tag)
-{
- return get_next_modinfo(info, tag, NULL);
-}
-
static void setup_modinfo(struct module *mod, struct load_info *info)
{
struct module_attribute *attr;
@@ -2598,241 +1609,6 @@ static void free_modinfo(struct module *mod)
}
}
-#ifdef CONFIG_KALLSYMS
-
-/* Lookup exported symbol in given range of kernel_symbols */
-static const struct kernel_symbol *lookup_exported_symbol(const char *name,
- const struct kernel_symbol *start,
- const struct kernel_symbol *stop)
-{
- return bsearch(name, start, stop - start,
- sizeof(struct kernel_symbol), cmp_name);
-}
-
-static int is_exported(const char *name, unsigned long value,
- const struct module *mod)
-{
- const struct kernel_symbol *ks;
- if (!mod)
- ks = lookup_exported_symbol(name, __start___ksymtab, __stop___ksymtab);
- else
- ks = lookup_exported_symbol(name, mod->syms, mod->syms + mod->num_syms);
-
- return ks != NULL && kernel_symbol_value(ks) == value;
-}
-
-/* As per nm */
-static char elf_type(const Elf_Sym *sym, const struct load_info *info)
-{
- const Elf_Shdr *sechdrs = info->sechdrs;
-
- if (ELF_ST_BIND(sym->st_info) == STB_WEAK) {
- if (ELF_ST_TYPE(sym->st_info) == STT_OBJECT)
- return 'v';
- else
- return 'w';
- }
- if (sym->st_shndx == SHN_UNDEF)
- return 'U';
- if (sym->st_shndx == SHN_ABS || sym->st_shndx == info->index.pcpu)
- return 'a';
- if (sym->st_shndx >= SHN_LORESERVE)
- return '?';
- if (sechdrs[sym->st_shndx].sh_flags & SHF_EXECINSTR)
- return 't';
- if (sechdrs[sym->st_shndx].sh_flags & SHF_ALLOC
- && sechdrs[sym->st_shndx].sh_type != SHT_NOBITS) {
- if (!(sechdrs[sym->st_shndx].sh_flags & SHF_WRITE))
- return 'r';
- else if (sechdrs[sym->st_shndx].sh_flags & ARCH_SHF_SMALL)
- return 'g';
- else
- return 'd';
- }
- if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) {
- if (sechdrs[sym->st_shndx].sh_flags & ARCH_SHF_SMALL)
- return 's';
- else
- return 'b';
- }
- if (strstarts(info->secstrings + sechdrs[sym->st_shndx].sh_name,
- ".debug")) {
- return 'n';
- }
- return '?';
-}
-
-static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs,
- unsigned int shnum, unsigned int pcpundx)
-{
- const Elf_Shdr *sec;
-
- if (src->st_shndx == SHN_UNDEF
- || src->st_shndx >= shnum
- || !src->st_name)
- return false;
-
-#ifdef CONFIG_KALLSYMS_ALL
- if (src->st_shndx == pcpundx)
- return true;
-#endif
-
- sec = sechdrs + src->st_shndx;
- if (!(sec->sh_flags & SHF_ALLOC)
-#ifndef CONFIG_KALLSYMS_ALL
- || !(sec->sh_flags & SHF_EXECINSTR)
-#endif
- || (sec->sh_entsize & INIT_OFFSET_MASK))
- return false;
-
- return true;
-}
-
-/*
- * We only allocate and copy the strings needed by the parts of symtab
- * we keep. This is simple, but has the effect of making multiple
- * copies of duplicates. We could be more sophisticated, see
- * linux-kernel thread starting with
- * <73defb5e4bca04a6431392cc341112b1@localhost>.
- */
-static void layout_symtab(struct module *mod, struct load_info *info)
-{
- Elf_Shdr *symsect = info->sechdrs + info->index.sym;
- Elf_Shdr *strsect = info->sechdrs + info->index.str;
- const Elf_Sym *src;
- unsigned int i, nsrc, ndst, strtab_size = 0;
-
- /* Put symbol section at end of init part of module. */
- symsect->sh_flags |= SHF_ALLOC;
- symsect->sh_entsize = get_offset(mod, &mod->init_layout.size, symsect,
- info->index.sym) | INIT_OFFSET_MASK;
- pr_debug("\t%s\n", info->secstrings + symsect->sh_name);
-
- src = (void *)info->hdr + symsect->sh_offset;
- nsrc = symsect->sh_size / sizeof(*src);
-
- /* Compute total space required for the core symbols' strtab. */
- for (ndst = i = 0; i < nsrc; i++) {
- if (i == 0 || is_livepatch_module(mod) ||
- is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum,
- info->index.pcpu)) {
- strtab_size += strlen(&info->strtab[src[i].st_name])+1;
- ndst++;
- }
- }
-
- /* Append room for core symbols at end of core part. */
- info->symoffs = ALIGN(mod->core_layout.size, symsect->sh_addralign ?: 1);
- info->stroffs = mod->core_layout.size = info->symoffs + ndst * sizeof(Elf_Sym);
- mod->core_layout.size += strtab_size;
- info->core_typeoffs = mod->core_layout.size;
- mod->core_layout.size += ndst * sizeof(char);
- mod->core_layout.size = debug_align(mod->core_layout.size);
-
- /* Put string table section at end of init part of module. */
- strsect->sh_flags |= SHF_ALLOC;
- strsect->sh_entsize = get_offset(mod, &mod->init_layout.size, strsect,
- info->index.str) | INIT_OFFSET_MASK;
- pr_debug("\t%s\n", info->secstrings + strsect->sh_name);
-
- /* We'll tack temporary mod_kallsyms on the end. */
- mod->init_layout.size = ALIGN(mod->init_layout.size,
- __alignof__(struct mod_kallsyms));
- info->mod_kallsyms_init_off = mod->init_layout.size;
- mod->init_layout.size += sizeof(struct mod_kallsyms);
- info->init_typeoffs = mod->init_layout.size;
- mod->init_layout.size += nsrc * sizeof(char);
- mod->init_layout.size = debug_align(mod->init_layout.size);
-}
-
-/*
- * We use the full symtab and strtab which layout_symtab arranged to
- * be appended to the init section. Later we switch to the cut-down
- * core-only ones.
- */
-static void add_kallsyms(struct module *mod, const struct load_info *info)
-{
- unsigned int i, ndst;
- const Elf_Sym *src;
- Elf_Sym *dst;
- char *s;
- Elf_Shdr *symsec = &info->sechdrs[info->index.sym];
-
- /* Set up to point into init section. */
- mod->kallsyms = mod->init_layout.base + info->mod_kallsyms_init_off;
-
- mod->kallsyms->symtab = (void *)symsec->sh_addr;
- mod->kallsyms->num_symtab = symsec->sh_size / sizeof(Elf_Sym);
- /* Make sure we get permanent strtab: don't use info->strtab. */
- mod->kallsyms->strtab = (void *)info->sechdrs[info->index.str].sh_addr;
- mod->kallsyms->typetab = mod->init_layout.base + info->init_typeoffs;
-
- /*
- * Now populate the cut down core kallsyms for after init
- * and set types up while we still have access to sections.
- */
- mod->core_kallsyms.symtab = dst = mod->core_layout.base + info->symoffs;
- mod->core_kallsyms.strtab = s = mod->core_layout.base + info->stroffs;
- mod->core_kallsyms.typetab = mod->core_layout.base + info->core_typeoffs;
- src = mod->kallsyms->symtab;
- for (ndst = i = 0; i < mod->kallsyms->num_symtab; i++) {
- mod->kallsyms->typetab[i] = elf_type(src + i, info);
- if (i == 0 || is_livepatch_module(mod) ||
- is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum,
- info->index.pcpu)) {
- mod->core_kallsyms.typetab[ndst] =
- mod->kallsyms->typetab[i];
- dst[ndst] = src[i];
- dst[ndst++].st_name = s - mod->core_kallsyms.strtab;
- s += strlcpy(s, &mod->kallsyms->strtab[src[i].st_name],
- KSYM_NAME_LEN) + 1;
- }
- }
- mod->core_kallsyms.num_symtab = ndst;
-}
-#else
-static inline void layout_symtab(struct module *mod, struct load_info *info)
-{
-}
-
-static void add_kallsyms(struct module *mod, const struct load_info *info)
-{
-}
-#endif /* CONFIG_KALLSYMS */
-
-#if IS_ENABLED(CONFIG_KALLSYMS) && IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID)
-static void init_build_id(struct module *mod, const struct load_info *info)
-{
- const Elf_Shdr *sechdr;
- unsigned int i;
-
- for (i = 0; i < info->hdr->e_shnum; i++) {
- sechdr = &info->sechdrs[i];
- if (!sect_empty(sechdr) && sechdr->sh_type == SHT_NOTE &&
- !build_id_parse_buf((void *)sechdr->sh_addr, mod->build_id,
- sechdr->sh_size))
- break;
- }
-}
-#else
-static void init_build_id(struct module *mod, const struct load_info *info)
-{
-}
-#endif
-
-static void dynamic_debug_setup(struct module *mod, struct _ddebug *debug, unsigned int num)
-{
- if (!debug)
- return;
- ddebug_add_module(debug, num, mod->name);
-}
-
-static void dynamic_debug_remove(struct module *mod, struct _ddebug *debug)
-{
- if (debug)
- ddebug_remove_module(mod->name);
-}
-
void * __weak module_alloc(unsigned long size)
{
return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
@@ -2850,99 +1626,13 @@ bool __weak module_exit_section(const char *name)
return strstarts(name, ".exit");
}
-#ifdef CONFIG_DEBUG_KMEMLEAK
-static void kmemleak_load_module(const struct module *mod,
- const struct load_info *info)
-{
- unsigned int i;
-
- /* only scan the sections containing data */
- kmemleak_scan_area(mod, sizeof(struct module), GFP_KERNEL);
-
- for (i = 1; i < info->hdr->e_shnum; i++) {
- /* Scan all writable sections that's not executable */
- if (!(info->sechdrs[i].sh_flags & SHF_ALLOC) ||
- !(info->sechdrs[i].sh_flags & SHF_WRITE) ||
- (info->sechdrs[i].sh_flags & SHF_EXECINSTR))
- continue;
-
- kmemleak_scan_area((void *)info->sechdrs[i].sh_addr,
- info->sechdrs[i].sh_size, GFP_KERNEL);
- }
-}
-#else
-static inline void kmemleak_load_module(const struct module *mod,
- const struct load_info *info)
-{
-}
-#endif
-
-#ifdef CONFIG_MODULE_SIG
-static int module_sig_check(struct load_info *info, int flags)
-{
- int err = -ENODATA;
- const unsigned long markerlen = sizeof(MODULE_SIG_STRING) - 1;
- const char *reason;
- const void *mod = info->hdr;
-
- /*
- * Require flags == 0, as a module with version information
- * removed is no longer the module that was signed
- */
- if (flags == 0 &&
- info->len > markerlen &&
- memcmp(mod + info->len - markerlen, MODULE_SIG_STRING, markerlen) == 0) {
- /* We truncate the module to discard the signature */
- info->len -= markerlen;
- err = mod_verify_sig(mod, info);
- if (!err) {
- info->sig_ok = true;
- return 0;
- }
- }
-
- /*
- * We don't permit modules to be loaded into the trusted kernels
- * without a valid signature on them, but if we're not enforcing,
- * certain errors are non-fatal.
- */
- switch (err) {
- case -ENODATA:
- reason = "unsigned module";
- break;
- case -ENOPKG:
- reason = "module with unsupported crypto";
- break;
- case -ENOKEY:
- reason = "module with unavailable key";
- break;
-
- default:
- /*
- * All other errors are fatal, including lack of memory,
- * unparseable signatures, and signature check failures --
- * even if signatures aren't required.
- */
- return err;
- }
-
- if (is_module_sig_enforced()) {
- pr_notice("Loading of %s is rejected\n", reason);
- return -EKEYREJECTED;
- }
-
- return security_locked_down(LOCKDOWN_MODULE_SIGNATURE);
-}
-#else /* !CONFIG_MODULE_SIG */
-static int module_sig_check(struct load_info *info, int flags)
-{
- return 0;
-}
-#endif /* !CONFIG_MODULE_SIG */
-
static int validate_section_offset(struct load_info *info, Elf_Shdr *shdr)
{
+#if defined(CONFIG_64BIT)
+ unsigned long long secend;
+#else
unsigned long secend;
+#endif
/*
* Check for both overflow and offset/size being
@@ -2956,25 +1646,62 @@ static int validate_section_offset(struct load_info *info, Elf_Shdr *shdr)
}
/*
- * Sanity checks against invalid binaries, wrong arch, weird elf version.
+ * Check userspace passed ELF module against our expectations, and cache
+ * useful variables for further processing as we go.
*
- * Also do basic validity checks against section offsets and sizes, the
+ * This does basic validity checks against section offsets and sizes, the
* section name string table, and the indices used for it (sh_name).
+ *
+ * As a last step, since we're already checking the ELF sections we cache
+ * useful variables which will be used later for our convenience:
+ *
+ * o pointers to section headers
+ * o cache the modinfo symbol section
+ * o cache the string symbol section
+ * o cache the module section
+ *
+ * As a last step we set info->mod to the temporary copy of the module in
+ * info->hdr. The final one will be allocated in move_module(). Any
+ * modifications we make to our copy of the module will be carried over
+ * to the final minted module.
*/
-static int elf_validity_check(struct load_info *info)
+static int elf_validity_cache_copy(struct load_info *info, int flags)
{
unsigned int i;
Elf_Shdr *shdr, *strhdr;
int err;
+ unsigned int num_mod_secs = 0, mod_idx;
+ unsigned int num_info_secs = 0, info_idx;
+ unsigned int num_sym_secs = 0, sym_idx;
- if (info->len < sizeof(*(info->hdr)))
- return -ENOEXEC;
+ if (info->len < sizeof(*(info->hdr))) {
+ pr_err("Invalid ELF header len %lu\n", info->len);
+ goto no_exec;
+ }
- if (memcmp(info->hdr->e_ident, ELFMAG, SELFMAG) != 0
- || info->hdr->e_type != ET_REL
- || !elf_check_arch(info->hdr)
- || info->hdr->e_shentsize != sizeof(Elf_Shdr))
- return -ENOEXEC;
+ if (memcmp(info->hdr->e_ident, ELFMAG, SELFMAG) != 0) {
+ pr_err("Invalid ELF header magic: != %s\n", ELFMAG);
+ goto no_exec;
+ }
+ if (info->hdr->e_type != ET_REL) {
+ pr_err("Invalid ELF header type: %u != %u\n",
+ info->hdr->e_type, ET_REL);
+ goto no_exec;
+ }
+ if (!elf_check_arch(info->hdr)) {
+ pr_err("Invalid architecture in ELF header: %u\n",
+ info->hdr->e_machine);
+ goto no_exec;
+ }
+ if (!module_elf_check_arch(info->hdr)) {
+ pr_err("Invalid module architecture in ELF header: %u\n",
+ info->hdr->e_machine);
+ goto no_exec;
+ }
+ if (info->hdr->e_shentsize != sizeof(Elf_Shdr)) {
+ pr_err("Invalid ELF section header size\n");
+ goto no_exec;
+ }
/*
* e_shnum is 16 bits, and sizeof(Elf_Shdr) is
@@ -2983,8 +1710,10 @@ static int elf_validity_check(struct load_info *info)
*/
if (info->hdr->e_shoff >= info->len
|| (info->hdr->e_shnum * sizeof(Elf_Shdr) >
- info->len - info->hdr->e_shoff))
- return -ENOEXEC;
+ info->len - info->hdr->e_shoff)) {
+ pr_err("Invalid ELF section header overflow\n");
+ goto no_exec;
+ }
info->sechdrs = (void *)info->hdr + info->hdr->e_shoff;
@@ -2992,13 +1721,19 @@ static int elf_validity_check(struct load_info *info)
* Verify if the section name table index is valid.
*/
if (info->hdr->e_shstrndx == SHN_UNDEF
- || info->hdr->e_shstrndx >= info->hdr->e_shnum)
- return -ENOEXEC;
+ || info->hdr->e_shstrndx >= info->hdr->e_shnum) {
+ pr_err("Invalid ELF section name index: %d || e_shstrndx (%d) >= e_shnum (%d)\n",
+ info->hdr->e_shstrndx, info->hdr->e_shstrndx,
+ info->hdr->e_shnum);
+ goto no_exec;
+ }
strhdr = &info->sechdrs[info->hdr->e_shstrndx];
err = validate_section_offset(info, strhdr);
- if (err < 0)
+ if (err < 0) {
+ pr_err("Invalid ELF section hdr(type %u)\n", strhdr->sh_type);
return err;
+ }
/*
* The section name table must be NUL-terminated, as required
@@ -3006,8 +1741,14 @@ static int elf_validity_check(struct load_info *info)
* strings in the section safe.
*/
info->secstrings = (void *)info->hdr + strhdr->sh_offset;
- if (info->secstrings[strhdr->sh_size - 1] != '\0')
- return -ENOEXEC;
+ if (strhdr->sh_size == 0) {
+ pr_err("empty section name table\n");
+ goto no_exec;
+ }
+ if (info->secstrings[strhdr->sh_size - 1] != '\0') {
+ pr_err("ELF Spec violation: section name table isn't null terminated\n");
+ goto no_exec;
+ }
/*
* The code assumes that section 0 has a length of zero and
@@ -3015,8 +1756,11 @@ static int elf_validity_check(struct load_info *info)
*/
if (info->sechdrs[0].sh_type != SHT_NULL
|| info->sechdrs[0].sh_size != 0
- || info->sechdrs[0].sh_addr != 0)
- return -ENOEXEC;
+ || info->sechdrs[0].sh_addr != 0) {
+ pr_err("ELF Spec violation: section 0 type(%d)!=SH_NULL or non-zero len or addr\n",
+ info->sechdrs[0].sh_type);
+ goto no_exec;
+ }
for (i = 1; i < info->hdr->e_shnum; i++) {
shdr = &info->sechdrs[i];
@@ -3026,8 +1770,14 @@ static int elf_validity_check(struct load_info *info)
continue;
case SHT_SYMTAB:
if (shdr->sh_link == SHN_UNDEF
- || shdr->sh_link >= info->hdr->e_shnum)
- return -ENOEXEC;
+ || shdr->sh_link >= info->hdr->e_shnum) {
+ pr_err("Invalid ELF sh_link!=SHN_UNDEF(%d) or (sh_link(%d) >= hdr->e_shnum(%d)\n",
+ shdr->sh_link, shdr->sh_link,
+ info->hdr->e_shnum);
+ goto no_exec;
+ }
+ num_sym_secs++;
+ sym_idx = i;
fallthrough;
default:
err = validate_section_offset(info, shdr);
@@ -3036,6 +1786,15 @@ static int elf_validity_check(struct load_info *info)
i, shdr->sh_type);
return err;
}
+ if (strcmp(info->secstrings + shdr->sh_name,
+ ".gnu.linkonce.this_module") == 0) {
+ num_mod_secs++;
+ mod_idx = i;
+ } else if (strcmp(info->secstrings + shdr->sh_name,
+ ".modinfo") == 0) {
+ num_info_secs++;
+ info_idx = i;
+ }
if (shdr->sh_flags & SHF_ALLOC) {
if (shdr->sh_name >= strhdr->sh_size) {
@@ -3048,7 +1807,95 @@ static int elf_validity_check(struct load_info *info)
}
}
+ if (num_info_secs > 1) {
+ pr_err("Only one .modinfo section must exist.\n");
+ goto no_exec;
+ } else if (num_info_secs == 1) {
+ /* Try to find a name early so we can log errors with a module name */
+ info->index.info = info_idx;
+ info->name = get_modinfo(info, "name");
+ }
+
+ if (num_sym_secs != 1) {
+ pr_warn("%s: module has no symbols (stripped?)\n",
+ info->name ?: "(missing .modinfo section or name field)");
+ goto no_exec;
+ }
+
+ /* Sets internal symbols and strings. */
+ info->index.sym = sym_idx;
+ shdr = &info->sechdrs[sym_idx];
+ info->index.str = shdr->sh_link;
+ info->strtab = (char *)info->hdr + info->sechdrs[info->index.str].sh_offset;
+
+ /*
+ * The ".gnu.linkonce.this_module" ELF section is special. It is
+ * what modpost uses to refer to __this_module and let's use rely
+ * on THIS_MODULE to point to &__this_module properly. The kernel's
+ * modpost declares it on each modules's *.mod.c file. If the struct
+ * module of the kernel changes a full kernel rebuild is required.
+ *
+ * We have a few expectaions for this special section, the following
+ * code validates all this for us:
+ *
+ * o Only one section must exist
+ * o We expect the kernel to always have to allocate it: SHF_ALLOC
+ * o The section size must match the kernel's run time's struct module
+ * size
+ */
+ if (num_mod_secs != 1) {
+ pr_err("module %s: Only one .gnu.linkonce.this_module section must exist.\n",
+ info->name ?: "(missing .modinfo section or name field)");
+ goto no_exec;
+ }
+
+ shdr = &info->sechdrs[mod_idx];
+
+ /*
+ * This is already implied on the switch above, however let's be
+ * pedantic about it.
+ */
+ if (shdr->sh_type == SHT_NOBITS) {
+ pr_err("module %s: .gnu.linkonce.this_module section must have a size set\n",
+ info->name ?: "(missing .modinfo section or name field)");
+ goto no_exec;
+ }
+
+ if (!(shdr->sh_flags & SHF_ALLOC)) {
+ pr_err("module %s: .gnu.linkonce.this_module must occupy memory during process execution\n",
+ info->name ?: "(missing .modinfo section or name field)");
+ goto no_exec;
+ }
+
+ if (shdr->sh_size != sizeof(struct module)) {
+ pr_err("module %s: .gnu.linkonce.this_module section size must match the kernel's built struct module size at run time\n",
+ info->name ?: "(missing .modinfo section or name field)");
+ goto no_exec;
+ }
+
+ info->index.mod = mod_idx;
+
+ /* This is temporary: point mod into copy of data. */
+ info->mod = (void *)info->hdr + shdr->sh_offset;
+
+ /*
+ * If we didn't load the .modinfo 'name' field earlier, fall back to
+ * on-disk struct mod 'name' field.
+ */
+ if (!info->name)
+ info->name = info->mod->name;
+
+ if (flags & MODULE_INIT_IGNORE_MODVERSIONS)
+ info->index.vers = 0; /* Pretend no __versions section! */
+ else
+ info->index.vers = find_sec(info, "__versions");
+
+ info->index.pcpu = find_pcpusec(info);
+
return 0;
+
+no_exec:
+ return -ENOEXEC;
}
#define COPY_CHUNK_SIZE (16*PAGE_SIZE)
@@ -3068,30 +1915,19 @@ static int copy_chunked_from_user(void *dst, const void __user *usrc, unsigned l
return 0;
}
-#ifdef CONFIG_LIVEPATCH
static int check_modinfo_livepatch(struct module *mod, struct load_info *info)
{
- if (get_modinfo(info, "livepatch")) {
- mod->klp = true;
- add_taint_module(mod, TAINT_LIVEPATCH, LOCKDEP_STILL_OK);
- pr_notice_once("%s: tainting kernel with TAINT_LIVEPATCH\n",
- mod->name);
- }
+ if (!get_modinfo(info, "livepatch"))
+ /* Nothing more to do */
+ return 0;
- return 0;
-}
-#else /* !CONFIG_LIVEPATCH */
-static int check_modinfo_livepatch(struct module *mod, struct load_info *info)
-{
- if (get_modinfo(info, "livepatch")) {
- pr_err("%s: module is marked as livepatch module, but livepatch support is disabled",
- mod->name);
- return -ENOEXEC;
- }
+ if (set_livepatch_module(mod))
+ return 0;
- return 0;
+ pr_err("%s: module is marked as livepatch module, but livepatch support is disabled",
+ mod->name);
+ return -ENOEXEC;
}
-#endif /* CONFIG_LIVEPATCH */
static void check_modinfo_retpoline(struct module *mod, struct load_info *info)
{
@@ -3135,9 +1971,12 @@ out:
return err;
}
-static void free_copy(struct load_info *info)
+static void free_copy(struct load_info *info, int flags)
{
- vfree(info->hdr);
+ if (flags & MODULE_INIT_COMPRESSED_FILE)
+ module_decompress_cleanup(info);
+ else
+ vfree(info->hdr);
}
static int rewrite_section_headers(struct load_info *info, int flags)
@@ -3166,63 +2005,71 @@ static int rewrite_section_headers(struct load_info *info, int flags)
}
/*
- * Set up our basic convenience variables (pointers to section headers,
- * search for module section index etc), and do some basic section
- * verification.
- *
- * Set info->mod to the temporary copy of the module in info->hdr. The final one
- * will be allocated in move_module().
- */
-static int setup_load_info(struct load_info *info, int flags)
+ * These calls taint the kernel depending certain module circumstances */
+static void module_augment_kernel_taints(struct module *mod, struct load_info *info)
{
- unsigned int i;
+ int prev_taint = test_taint(TAINT_PROPRIETARY_MODULE);
- /* Try to find a name early so we can log errors with a module name */
- info->index.info = find_sec(info, ".modinfo");
- if (info->index.info)
- info->name = get_modinfo(info, "name");
+ if (!get_modinfo(info, "intree")) {
+ if (!test_taint(TAINT_OOT_MODULE))
+ pr_warn("%s: loading out-of-tree module taints kernel.\n",
+ mod->name);
+ add_taint_module(mod, TAINT_OOT_MODULE, LOCKDEP_STILL_OK);
+ }
- /* Find internal symbols and strings. */
- for (i = 1; i < info->hdr->e_shnum; i++) {
- if (info->sechdrs[i].sh_type == SHT_SYMTAB) {
- info->index.sym = i;
- info->index.str = info->sechdrs[i].sh_link;
- info->strtab = (char *)info->hdr
- + info->sechdrs[info->index.str].sh_offset;
- break;
- }
+ check_modinfo_retpoline(mod, info);
+
+ if (get_modinfo(info, "staging")) {
+ add_taint_module(mod, TAINT_CRAP, LOCKDEP_STILL_OK);
+ pr_warn("%s: module is from the staging directory, the quality "
+ "is unknown, you have been warned.\n", mod->name);
}
- if (info->index.sym == 0) {
- pr_warn("%s: module has no symbols (stripped?)\n",
- info->name ?: "(missing .modinfo section or name field)");
- return -ENOEXEC;
+ if (is_livepatch_module(mod)) {
+ add_taint_module(mod, TAINT_LIVEPATCH, LOCKDEP_STILL_OK);
+ pr_notice_once("%s: tainting kernel with TAINT_LIVEPATCH\n",
+ mod->name);
}
- info->index.mod = find_sec(info, ".gnu.linkonce.this_module");
- if (!info->index.mod) {
- pr_warn("%s: No module found in object\n",
- info->name ?: "(missing .modinfo section or name field)");
- return -ENOEXEC;
+ module_license_taint_check(mod, get_modinfo(info, "license"));
+
+ if (get_modinfo(info, "test")) {
+ if (!test_taint(TAINT_TEST))
+ pr_warn("%s: loading test module taints kernel.\n",
+ mod->name);
+ add_taint_module(mod, TAINT_TEST, LOCKDEP_STILL_OK);
}
- /* This is temporary: point mod into copy of data. */
- info->mod = (void *)info->hdr + info->sechdrs[info->index.mod].sh_offset;
+#ifdef CONFIG_MODULE_SIG
+ mod->sig_ok = info->sig_ok;
+ if (!mod->sig_ok) {
+ pr_notice_once("%s: module verification failed: signature "
+ "and/or required key missing - tainting "
+ "kernel\n", mod->name);
+ add_taint_module(mod, TAINT_UNSIGNED_MODULE, LOCKDEP_STILL_OK);
+ }
+#endif
/*
- * If we didn't load the .modinfo 'name' field earlier, fall back to
- * on-disk struct mod 'name' field.
+ * ndiswrapper is under GPL by itself, but loads proprietary modules.
+ * Don't use add_taint_module(), as it would prevent ndiswrapper from
+ * using GPL-only symbols it needs.
*/
- if (!info->name)
- info->name = info->mod->name;
+ if (strcmp(mod->name, "ndiswrapper") == 0)
+ add_taint(TAINT_PROPRIETARY_MODULE, LOCKDEP_NOW_UNRELIABLE);
- if (flags & MODULE_INIT_IGNORE_MODVERSIONS)
- info->index.vers = 0; /* Pretend no __versions section! */
- else
- info->index.vers = find_sec(info, "__versions");
+ /* driverloader was caught wrongly pretending to be under GPL */
+ if (strcmp(mod->name, "driverloader") == 0)
+ add_taint_module(mod, TAINT_PROPRIETARY_MODULE,
+ LOCKDEP_NOW_UNRELIABLE);
- info->index.pcpu = find_pcpusec(info);
+ /* lve claims to be GPL but upstream won't provide source */
+ if (strcmp(mod->name, "lve") == 0)
+ add_taint_module(mod, TAINT_PROPRIETARY_MODULE,
+ LOCKDEP_NOW_UNRELIABLE);
+
+ if (!prev_taint && test_taint(TAINT_PROPRIETARY_MODULE))
+ pr_warn("%s: module license taints kernel.\n", mod->name);
- return 0;
}
static int check_modinfo(struct module *mod, struct load_info *info, int flags)
@@ -3244,28 +2091,10 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags)
return -ENOEXEC;
}
- if (!get_modinfo(info, "intree")) {
- if (!test_taint(TAINT_OOT_MODULE))
- pr_warn("%s: loading out-of-tree module taints kernel.\n",
- mod->name);
- add_taint_module(mod, TAINT_OOT_MODULE, LOCKDEP_STILL_OK);
- }
-
- check_modinfo_retpoline(mod, info);
-
- if (get_modinfo(info, "staging")) {
- add_taint_module(mod, TAINT_CRAP, LOCKDEP_STILL_OK);
- pr_warn("%s: module is from the staging directory, the quality "
- "is unknown, you have been warned.\n", mod->name);
- }
-
err = check_modinfo_livepatch(mod, info);
if (err)
return err;
- /* Set up license info based on the info section */
- set_license(mod, get_modinfo(info, "license"));
-
return 0;
}
@@ -3355,19 +2184,36 @@ static int find_module_sections(struct module *mod, struct load_info *info)
sizeof(unsigned long),
&mod->num_kprobe_blacklist);
#endif
+#ifdef CONFIG_PRINTK_INDEX
+ mod->printk_index_start = section_objs(info, ".printk_index",
+ sizeof(*mod->printk_index_start),
+ &mod->printk_index_size);
+#endif
#ifdef CONFIG_HAVE_STATIC_CALL_INLINE
mod->static_call_sites = section_objs(info, ".static_call_sites",
sizeof(*mod->static_call_sites),
&mod->num_static_call_sites);
#endif
+#if IS_ENABLED(CONFIG_KUNIT)
+ mod->kunit_suites = section_objs(info, ".kunit_test_suites",
+ sizeof(*mod->kunit_suites),
+ &mod->num_kunit_suites);
+#endif
+
mod->extable = section_objs(info, "__ex_table",
sizeof(*mod->extable), &mod->num_exentries);
if (section_addr(info, "__obsparm"))
pr_warn("%s: Ignoring obsolete parameters\n", mod->name);
- info->debug = section_objs(info, "__dyndbg",
- sizeof(*info->debug), &info->num_debug);
+#ifdef CONFIG_DYNAMIC_DEBUG_CORE
+ mod->dyndbg_info.descs = section_objs(info, "__dyndbg",
+ sizeof(*mod->dyndbg_info.descs),
+ &mod->dyndbg_info.num_descs);
+ mod->dyndbg_info.classes = section_objs(info, "__dyndbg_classes",
+ sizeof(*mod->dyndbg_info.classes),
+ &mod->dyndbg_info.num_classes);
+#endif
return 0;
}
@@ -3376,90 +2222,82 @@ static int move_module(struct module *mod, struct load_info *info)
{
int i;
void *ptr;
+ enum mod_mem_type t = 0;
+ int ret = -ENOMEM;
- /* Do the allocs. */
- ptr = module_alloc(mod->core_layout.size);
- /*
- * The pointer to this block is stored in the module structure
- * which is inside the block. Just mark it as not being a
- * leak.
- */
- kmemleak_not_leak(ptr);
- if (!ptr)
- return -ENOMEM;
-
- memset(ptr, 0, mod->core_layout.size);
- mod->core_layout.base = ptr;
-
- if (mod->init_layout.size) {
- ptr = module_alloc(mod->init_layout.size);
+ for_each_mod_mem_type(type) {
+ if (!mod->mem[type].size) {
+ mod->mem[type].base = NULL;
+ continue;
+ }
+ mod->mem[type].size = PAGE_ALIGN(mod->mem[type].size);
+ ptr = module_memory_alloc(mod->mem[type].size, type);
/*
- * The pointer to this block is stored in the module structure
- * which is inside the block. This block doesn't need to be
- * scanned as it contains data and code that will be freed
- * after the module is initialized.
+ * The pointer to these blocks of memory are stored on the module
+ * structure and we keep that around so long as the module is
+ * around. We only free that memory when we unload the module.
+ * Just mark them as not being a leak then. The .init* ELF
+ * sections *do* get freed after boot so we *could* treat them
+ * slightly differently with kmemleak_ignore() and only grey
+ * them out as they work as typical memory allocations which
+ * *do* eventually get freed, but let's just keep things simple
+ * and avoid *any* false positives.
*/
- kmemleak_ignore(ptr);
+ kmemleak_not_leak(ptr);
if (!ptr) {
- module_memfree(mod->core_layout.base);
- return -ENOMEM;
+ t = type;
+ goto out_enomem;
}
- memset(ptr, 0, mod->init_layout.size);
- mod->init_layout.base = ptr;
- } else
- mod->init_layout.base = NULL;
+ memset(ptr, 0, mod->mem[type].size);
+ mod->mem[type].base = ptr;
+ }
/* Transfer each section which specifies SHF_ALLOC */
- pr_debug("final section addresses:\n");
+ pr_debug("Final section addresses for %s:\n", mod->name);
for (i = 0; i < info->hdr->e_shnum; i++) {
void *dest;
Elf_Shdr *shdr = &info->sechdrs[i];
+ enum mod_mem_type type = shdr->sh_entsize >> SH_ENTSIZE_TYPE_SHIFT;
if (!(shdr->sh_flags & SHF_ALLOC))
continue;
- if (shdr->sh_entsize & INIT_OFFSET_MASK)
- dest = mod->init_layout.base
- + (shdr->sh_entsize & ~INIT_OFFSET_MASK);
- else
- dest = mod->core_layout.base + shdr->sh_entsize;
+ dest = mod->mem[type].base + (shdr->sh_entsize & SH_ENTSIZE_OFFSET_MASK);
- if (shdr->sh_type != SHT_NOBITS)
+ if (shdr->sh_type != SHT_NOBITS) {
+ /*
+ * Our ELF checker already validated this, but let's
+ * be pedantic and make the goal clearer. We actually
+ * end up copying over all modifications made to the
+ * userspace copy of the entire struct module.
+ */
+ if (i == info->index.mod &&
+ (WARN_ON_ONCE(shdr->sh_size != sizeof(struct module)))) {
+ ret = -ENOEXEC;
+ goto out_enomem;
+ }
memcpy(dest, (void *)shdr->sh_addr, shdr->sh_size);
- /* Update sh_addr to point to copy in image. */
+ }
+ /*
+ * Update the userspace copy's ELF section address to point to
+ * our newly allocated memory as a pure convenience so that
+ * users of info can keep taking advantage and using the newly
+ * minted official memory area.
+ */
shdr->sh_addr = (unsigned long)dest;
- pr_debug("\t0x%lx %s\n",
- (long)shdr->sh_addr, info->secstrings + shdr->sh_name);
+ pr_debug("\t0x%lx 0x%.8lx %s\n", (long)shdr->sh_addr,
+ (long)shdr->sh_size, info->secstrings + shdr->sh_name);
}
return 0;
+out_enomem:
+ for (t--; t >= 0; t--)
+ module_memory_free(mod->mem[t].base, t);
+ return ret;
}
-static int check_module_license_and_versions(struct module *mod)
+static int check_export_symbol_versions(struct module *mod)
{
- int prev_taint = test_taint(TAINT_PROPRIETARY_MODULE);
-
- /*
- * ndiswrapper is under GPL by itself, but loads proprietary modules.
- * Don't use add_taint_module(), as it would prevent ndiswrapper from
- * using GPL-only symbols it needs.
- */
- if (strcmp(mod->name, "ndiswrapper") == 0)
- add_taint(TAINT_PROPRIETARY_MODULE, LOCKDEP_NOW_UNRELIABLE);
-
- /* driverloader was caught wrongly pretending to be under GPL */
- if (strcmp(mod->name, "driverloader") == 0)
- add_taint_module(mod, TAINT_PROPRIETARY_MODULE,
- LOCKDEP_NOW_UNRELIABLE);
-
- /* lve claims to be GPL but upstream won't provide source */
- if (strcmp(mod->name, "lve") == 0)
- add_taint_module(mod, TAINT_PROPRIETARY_MODULE,
- LOCKDEP_NOW_UNRELIABLE);
-
- if (!prev_taint && test_taint(TAINT_PROPRIETARY_MODULE))
- pr_warn("%s: module license taints kernel.\n", mod->name);
-
#ifdef CONFIG_MODVERSIONS
if ((mod->num_syms && !mod->crcs) ||
(mod->num_gpl_syms && !mod->gpl_crcs)) {
@@ -3477,12 +2315,19 @@ static void flush_module_icache(const struct module *mod)
* Do it before processing of module parameters, so the module
* can provide parameter accessor functions of its own.
*/
- if (mod->init_layout.base)
- flush_icache_range((unsigned long)mod->init_layout.base,
- (unsigned long)mod->init_layout.base
- + mod->init_layout.size);
- flush_icache_range((unsigned long)mod->core_layout.base,
- (unsigned long)mod->core_layout.base + mod->core_layout.size);
+ for_each_mod_mem_type(type) {
+ const struct module_memory *mod_mem = &mod->mem[type];
+
+ if (mod_mem->size) {
+ flush_icache_range((unsigned long)mod_mem->base,
+ (unsigned long)mod_mem->base + mod_mem->size);
+ }
+ }
+}
+
+bool __weak module_elf_check_arch(Elf_Ehdr *hdr)
+{
+ return true;
}
int __weak module_frob_arch_sections(Elf_Ehdr *hdr,
@@ -3520,10 +2365,6 @@ static struct module *layout_and_allocate(struct load_info *info, int flags)
unsigned int ndx;
int err;
- err = check_modinfo(info->mod, info, flags);
- if (err)
- return ERR_PTR(err);
-
/* Allow arches to frob section contents and sizes. */
err = module_frob_arch_sections(info->hdr, info->sechdrs,
info->secstrings, info->mod);
@@ -3580,8 +2421,8 @@ static void module_deallocate(struct module *mod, struct load_info *info)
{
percpu_modfree(mod);
module_arch_freeing_init(mod);
- module_memfree(mod->init_layout.base);
- module_memfree(mod->core_layout.base);
+
+ free_mod_mem(mod);
}
int __weak module_finalize(const Elf_Ehdr *hdr,
@@ -3607,26 +2448,6 @@ static int post_relocation(struct module *mod, const struct load_info *info)
return module_finalize(info->hdr, info->sechdrs, mod);
}
-/* Is this module of this name done loading? No locks held. */
-static bool finished_loading(const char *name)
-{
- struct module *mod;
- bool ret;
-
- /*
- * The module_mutex should not be a heavily contended lock;
- * if we get the occasional sleep here, we'll go an extra iteration
- * in the wait_event_interruptible(), which is harmless.
- */
- sched_annotate_sleep();
- mutex_lock(&module_mutex);
- mod = find_module_all(name, strlen(name), true);
- ret = !mod || mod->state == MODULE_STATE_LIVE;
- mutex_unlock(&module_mutex);
-
- return ret;
-}
-
/* Call module constructors. */
static void do_mod_ctors(struct module *mod)
{
@@ -3641,7 +2462,9 @@ static void do_mod_ctors(struct module *mod)
/* For freeing module_init on success, in case kallsyms traversing */
struct mod_initfree {
struct llist_node node;
- void *module_init;
+ void *init_text;
+ void *init_data;
+ void *init_rodata;
};
static void do_free_init(struct work_struct *w)
@@ -3655,11 +2478,19 @@ static void do_free_init(struct work_struct *w)
llist_for_each_safe(pos, n, list) {
initfree = container_of(pos, struct mod_initfree, node);
- module_memfree(initfree->module_init);
+ module_memfree(initfree->init_text);
+ module_memfree(initfree->init_data);
+ module_memfree(initfree->init_rodata);
kfree(initfree);
}
}
+#undef MODULE_PARAM_PREFIX
+#define MODULE_PARAM_PREFIX "module."
+/* Default value for module->async_probe_requested */
+static bool async_probe;
+module_param(async_probe, bool, 0644);
+
/*
* This is where the real work happens.
*
@@ -3670,19 +2501,27 @@ static noinline int do_init_module(struct module *mod)
{
int ret = 0;
struct mod_initfree *freeinit;
+#if defined(CONFIG_MODULE_STATS)
+ unsigned int text_size = 0, total_size = 0;
+
+ for_each_mod_mem_type(type) {
+ const struct module_memory *mod_mem = &mod->mem[type];
+ if (mod_mem->size) {
+ total_size += mod_mem->size;
+ if (type == MOD_TEXT || type == MOD_INIT_TEXT)
+ text_size += mod_mem->size;
+ }
+ }
+#endif
freeinit = kmalloc(sizeof(*freeinit), GFP_KERNEL);
if (!freeinit) {
ret = -ENOMEM;
goto fail;
}
- freeinit->module_init = mod->init_layout.base;
-
- /*
- * We want to find out whether @mod uses async during init. Clear
- * PF_USED_ASYNC. async_schedule*() will set it.
- */
- current->flags &= ~PF_USED_ASYNC;
+ freeinit->init_text = mod->mem[MOD_INIT_TEXT].base;
+ freeinit->init_data = mod->mem[MOD_INIT_DATA].base;
+ freeinit->init_rodata = mod->mem[MOD_INIT_RODATA].base;
do_mod_ctors(mod);
/* Start the module */
@@ -3709,26 +2548,17 @@ static noinline int do_init_module(struct module *mod)
/*
* We need to finish all async code before the module init sequence
- * is done. This has potential to deadlock. For example, a newly
- * detected block device can trigger request_module() of the
- * default iosched from async probing task. Once userland helper
- * reaches here, async_synchronize_full() will wait on the async
- * task waiting on request_module() and deadlock.
- *
- * This deadlock is avoided by perfomring async_synchronize_full()
- * iff module init queued any async jobs. This isn't a full
- * solution as it will deadlock the same if module loading from
- * async jobs nests more than once; however, due to the various
- * constraints, this hack seems to be the best option for now.
- * Please refer to the following thread for details.
+ * is done. This has potential to deadlock if synchronous module
+ * loading is requested from async (which is not allowed!).
*
- * http://thread.gmane.org/gmane.linux.kernel/1420814
+ * See commit 0fdff3ec6d87 ("async, kmod: warn on synchronous
+ * request_module() from async workers") for more details.
*/
- if (!mod->async_probe_requested && (current->flags & PF_USED_ASYNC))
+ if (!mod->async_probe_requested)
async_synchronize_full();
- ftrace_free_mem(mod, mod->init_layout.base, mod->init_layout.base +
- mod->init_layout.size);
+ ftrace_free_mem(mod, mod->mem[MOD_INIT_TEXT].base,
+ mod->mem[MOD_INIT_TEXT].base + mod->mem[MOD_INIT_TEXT].size);
mutex_lock(&module_mutex);
/* Drop initial reference. */
module_put(mod);
@@ -3740,11 +2570,11 @@ static noinline int do_init_module(struct module *mod)
module_enable_ro(mod, true);
mod_tree_remove_init(mod);
module_arch_freeing_init(mod);
- mod->init_layout.base = NULL;
- mod->init_layout.size = 0;
- mod->init_layout.ro_size = 0;
- mod->init_layout.ro_after_init_size = 0;
- mod->init_layout.text_size = 0;
+ for_class_mod_mem_type(type, init) {
+ mod->mem[type].base = NULL;
+ mod->mem[type].size = 0;
+ }
+
#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
/* .BTF is not SHF_ALLOC and will get removed, so sanitize pointer */
mod->btf_data = NULL;
@@ -3768,6 +2598,11 @@ static noinline int do_init_module(struct module *mod)
mutex_unlock(&module_mutex);
wake_up_all(&module_wq);
+ mod_stat_add_long(text_size, &total_text_size);
+ mod_stat_add_long(total_size, &total_mod_size);
+
+ mod_stat_inc(&modcount);
+
return 0;
fail_free_freeinit:
@@ -3783,6 +2618,7 @@ fail:
ftrace_release_mod(mod);
free_module(mod);
wake_up_all(&module_wq);
+
return ret;
}
@@ -3794,6 +2630,67 @@ static int may_init_module(void)
return 0;
}
+/* Is this module of this name done loading? No locks held. */
+static bool finished_loading(const char *name)
+{
+ struct module *mod;
+ bool ret;
+
+ /*
+ * The module_mutex should not be a heavily contended lock;
+ * if we get the occasional sleep here, we'll go an extra iteration
+ * in the wait_event_interruptible(), which is harmless.
+ */
+ sched_annotate_sleep();
+ mutex_lock(&module_mutex);
+ mod = find_module_all(name, strlen(name), true);
+ ret = !mod || mod->state == MODULE_STATE_LIVE
+ || mod->state == MODULE_STATE_GOING;
+ mutex_unlock(&module_mutex);
+
+ return ret;
+}
+
+/* Must be called with module_mutex held */
+static int module_patient_check_exists(const char *name,
+ enum fail_dup_mod_reason reason)
+{
+ struct module *old;
+ int err = 0;
+
+ old = find_module_all(name, strlen(name), true);
+ if (old == NULL)
+ return 0;
+
+ if (old->state == MODULE_STATE_COMING ||
+ old->state == MODULE_STATE_UNFORMED) {
+ /* Wait in case it fails to load. */
+ mutex_unlock(&module_mutex);
+ err = wait_event_interruptible(module_wq,
+ finished_loading(name));
+ mutex_lock(&module_mutex);
+ if (err)
+ return err;
+
+ /* The module might have gone in the meantime. */
+ old = find_module_all(name, strlen(name), true);
+ }
+
+ if (try_add_failed_module(name, reason))
+ pr_warn("Could not add fail-tracking for module: %s\n", name);
+
+ /*
+ * We are here only when the same module was being loaded. Do
+ * not try to load it again right now. It prevents long delays
+ * caused by serialized module load failures. It might happen
+ * when more devices of the same type trigger load of
+ * a particular module.
+ */
+ if (old && old->state == MODULE_STATE_LIVE)
+ return -EEXIST;
+ return -EBUSY;
+}
+
/*
* We try to place it in the list now to make sure it's unique before
* we dedicate too many resources. In particular, temporary percpu
@@ -3802,26 +2699,14 @@ static int may_init_module(void)
static int add_unformed_module(struct module *mod)
{
int err;
- struct module *old;
mod->state = MODULE_STATE_UNFORMED;
-again:
mutex_lock(&module_mutex);
- old = find_module_all(mod->name, strlen(mod->name), true);
- if (old != NULL) {
- if (old->state != MODULE_STATE_LIVE) {
- /* Wait in case it fails to load. */
- mutex_unlock(&module_mutex);
- err = wait_event_interruptible(module_wq,
- finished_loading(mod->name));
- if (err)
- goto out_unlocked;
- goto again;
- }
- err = -EEXIST;
+ err = module_patient_check_exists(mod->name, FAIL_DUP_MOD_LOAD);
+ if (err)
goto out;
- }
+
mod_update_bounds(mod);
list_add_rcu(&mod->list, &modules);
mod_tree_insert(mod);
@@ -3829,7 +2714,6 @@ again:
out:
mutex_unlock(&module_mutex);
-out_unlocked:
return err;
}
@@ -3844,8 +2728,9 @@ static int complete_formation(struct module *mod, struct load_info *info)
if (err < 0)
goto out;
- /* This relies on module_mutex for list integrity. */
+ /* These rely on module_mutex for list integrity. */
module_bug_finalize(info->hdr, info->sechdrs, mod);
+ module_cfi_finalize(info->hdr, info->sechdrs, mod);
module_enable_ro(mod, false);
module_enable_nx(mod);
@@ -3890,7 +2775,8 @@ static int unknown_module_param_cb(char *param, char *val, const char *modname,
int ret;
if (strcmp(param, "async_probe") == 0) {
- mod->async_probe_requested = true;
+ if (kstrtobool(val, &mod->async_probe_requested))
+ mod->async_probe_requested = true;
return 0;
}
@@ -3901,7 +2787,38 @@ static int unknown_module_param_cb(char *param, char *val, const char *modname,
return 0;
}
-static void cfi_init(struct module *mod);
+/* Module within temporary copy, this doesn't do any allocation */
+static int early_mod_check(struct load_info *info, int flags)
+{
+ int err;
+
+ /*
+ * Now that we know we have the correct module name, check
+ * if it's blacklisted.
+ */
+ if (blacklisted(info->name)) {
+ pr_err("Module %s is blacklisted\n", info->name);
+ return -EPERM;
+ }
+
+ err = rewrite_section_headers(info, flags);
+ if (err)
+ return err;
+
+ /* Check module struct version now, before we try to use module. */
+ if (!check_modstruct_version(info, info->mod))
+ return -ENOEXEC;
+
+ err = check_modinfo(info->mod, info, flags);
+ if (err)
+ return err;
+
+ mutex_lock(&module_mutex);
+ err = module_patient_check_exists(info->mod->name, FAIL_DUP_MOD_BECOMING);
+ mutex_unlock(&module_mutex);
+
+ return err;
+}
/*
* Allocate and load the module: note that size of section 0 is always
@@ -3911,6 +2828,7 @@ static int load_module(struct load_info *info, const char __user *uargs,
int flags)
{
struct module *mod;
+ bool module_allocated = false;
long err = 0;
char *after_dashes;
@@ -3932,42 +2850,17 @@ static int load_module(struct load_info *info, const char __user *uargs,
/*
* Do basic sanity checks against the ELF header and
- * sections.
+ * sections. Cache useful sections and set the
+ * info->mod to the userspace passed struct module.
*/
- err = elf_validity_check(info);
- if (err) {
- pr_err("Module has invalid ELF structures\n");
- goto free_copy;
- }
-
- /*
- * Everything checks out, so set up the section info
- * in the info structure.
- */
- err = setup_load_info(info, flags);
+ err = elf_validity_cache_copy(info, flags);
if (err)
goto free_copy;
- /*
- * Now that we know we have the correct module name, check
- * if it's blacklisted.
- */
- if (blacklisted(info->name)) {
- err = -EPERM;
- pr_err("Module %s is blacklisted\n", info->name);
- goto free_copy;
- }
-
- err = rewrite_section_headers(info, flags);
+ err = early_mod_check(info, flags);
if (err)
goto free_copy;
- /* Check module struct version now, before we try to use module. */
- if (!check_modstruct_version(info, info->mod)) {
- err = -ENOEXEC;
- goto free_copy;
- }
-
/* Figure out module layout, and allocate all the memory. */
mod = layout_and_allocate(info, flags);
if (IS_ERR(mod)) {
@@ -3975,6 +2868,8 @@ static int load_module(struct load_info *info, const char __user *uargs,
goto free_copy;
}
+ module_allocated = true;
+
audit_log_kern_module(mod->name);
/* Reserve our place in the list. */
@@ -3982,15 +2877,11 @@ static int load_module(struct load_info *info, const char __user *uargs,
if (err)
goto free_module;
-#ifdef CONFIG_MODULE_SIG
- mod->sig_ok = info->sig_ok;
- if (!mod->sig_ok) {
- pr_notice_once("%s: module verification failed: signature "
- "and/or required key missing - tainting "
- "kernel\n", mod->name);
- add_taint_module(mod, TAINT_UNSIGNED_MODULE, LOCKDEP_STILL_OK);
- }
-#endif
+ /*
+ * We are tainting your kernel if your module gets into
+ * the modules linked list somehow.
+ */
+ module_augment_kernel_taints(mod, info);
/* To avoid stressing percpu allocator, do this once we're unique. */
err = percpu_modalloc(mod, info);
@@ -4012,7 +2903,7 @@ static int load_module(struct load_info *info, const char __user *uargs,
if (err)
goto free_unload;
- err = check_module_license_and_versions(mod);
+ err = check_export_symbol_versions(mod);
if (err)
goto free_unload;
@@ -4034,9 +2925,6 @@ static int load_module(struct load_info *info, const char __user *uargs,
flush_module_icache(mod);
- /* Setup CFI for the module. */
- cfi_init(mod);
-
/* Now copy in args */
mod->args = strndup_user(uargs, ~0UL >> 1);
if (IS_ERR(mod->args)) {
@@ -4045,7 +2933,6 @@ static int load_module(struct load_info *info, const char __user *uargs,
}
init_build_id(mod, info);
- dynamic_debug_setup(mod, info->debug, info->num_debug);
/* Ftrace init must be called in the MODULE_STATE_UNFORMED state */
ftrace_module_init(mod);
@@ -4059,6 +2946,8 @@ static int load_module(struct load_info *info, const char __user *uargs,
if (err)
goto bug_cleanup;
+ mod->async_probe_requested = async_probe;
+
/* Module is ready to execute: parsing args may do that. */
after_dashes = parse_args(mod->name, mod->args, mod->kp, mod->num_kp,
-32768, 32767, mod,
@@ -4083,7 +2972,7 @@ static int load_module(struct load_info *info, const char __user *uargs,
}
/* Get rid of temporary copy. */
- free_copy(info);
+ free_copy(info, flags);
/* Done! */
trace_module_load(mod);
@@ -4107,11 +2996,9 @@ static int load_module(struct load_info *info, const char __user *uargs,
ddebug_cleanup:
ftrace_release_mod(mod);
- dynamic_debug_remove(mod, info->debug);
synchronize_rcu();
kfree(mod->args);
free_arch_cleanup:
- cfi_cleanup(mod);
module_arch_cleanup(mod);
free_modinfo:
free_modinfo(mod);
@@ -4127,12 +3014,23 @@ static int load_module(struct load_info *info, const char __user *uargs,
synchronize_rcu();
mutex_unlock(&module_mutex);
free_module:
+ mod_stat_bump_invalid(info, flags);
/* Free lock-classes; relies on the preceding sync_rcu() */
- lockdep_free_key_range(mod->core_layout.base, mod->core_layout.size);
+ for_class_mod_mem_type(type, core_data) {
+ lockdep_free_key_range(mod->mem[type].base,
+ mod->mem[type].size);
+ }
module_deallocate(mod, info);
free_copy:
- free_copy(info);
+ /*
+ * The info->len is always set. We distinguish between
+ * failures once the proper module was allocated and
+ * before that.
+ */
+ if (!module_allocated)
+ mod_stat_bump_becoming(info, flags);
+ free_copy(info, flags);
return err;
}
@@ -4150,8 +3048,11 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
umod, len, uargs);
err = copy_module_from_user(umod, len, &info);
- if (err)
+ if (err) {
+ mod_stat_inc(&failed_kreads);
+ mod_stat_add_long(len, &invalid_kread_bytes);
return err;
+ }
return load_module(&info, uargs, 0);
}
@@ -4159,7 +3060,8 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags)
{
struct load_info info = { };
- void *hdr = NULL;
+ void *buf = NULL;
+ int len;
int err;
err = may_init_module();
@@ -4169,459 +3071,61 @@ SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags)
pr_debug("finit_module: fd=%d, uargs=%p, flags=%i\n", fd, uargs, flags);
if (flags & ~(MODULE_INIT_IGNORE_MODVERSIONS
- |MODULE_INIT_IGNORE_VERMAGIC))
+ |MODULE_INIT_IGNORE_VERMAGIC
+ |MODULE_INIT_COMPRESSED_FILE))
return -EINVAL;
- err = kernel_read_file_from_fd(fd, 0, &hdr, INT_MAX, NULL,
+ len = kernel_read_file_from_fd(fd, 0, &buf, INT_MAX, NULL,
READING_MODULE);
- if (err < 0)
- return err;
- info.hdr = hdr;
- info.len = err;
-
- return load_module(&info, uargs, flags);
-}
-
-static inline int within(unsigned long addr, void *start, unsigned long size)
-{
- return ((void *)addr >= start && (void *)addr < start + size);
-}
-
-#ifdef CONFIG_KALLSYMS
-/*
- * This ignores the intensely annoying "mapping symbols" found
- * in ARM ELF files: $a, $t and $d.
- */
-static inline int is_arm_mapping_symbol(const char *str)
-{
- if (str[0] == '.' && str[1] == 'L')
- return true;
- return str[0] == '$' && strchr("axtd", str[1])
- && (str[2] == '\0' || str[2] == '.');
-}
-
-static const char *kallsyms_symbol_name(struct mod_kallsyms *kallsyms, unsigned int symnum)
-{
- return kallsyms->strtab + kallsyms->symtab[symnum].st_name;
-}
-
-/*
- * Given a module and address, find the corresponding symbol and return its name
- * while providing its size and offset if needed.
- */
-static const char *find_kallsyms_symbol(struct module *mod,
- unsigned long addr,
- unsigned long *size,
- unsigned long *offset)
-{
- unsigned int i, best = 0;
- unsigned long nextval, bestval;
- struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms);
-
- /* At worse, next value is at end of module */
- if (within_module_init(addr, mod))
- nextval = (unsigned long)mod->init_layout.base+mod->init_layout.text_size;
- else
- nextval = (unsigned long)mod->core_layout.base+mod->core_layout.text_size;
-
- bestval = kallsyms_symbol_value(&kallsyms->symtab[best]);
-
- /*
- * Scan for closest preceding symbol, and next symbol. (ELF
- * starts real symbols at 1).
- */
- for (i = 1; i < kallsyms->num_symtab; i++) {
- const Elf_Sym *sym = &kallsyms->symtab[i];
- unsigned long thisval = kallsyms_symbol_value(sym);
-
- if (sym->st_shndx == SHN_UNDEF)
- continue;
-
- /*
- * We ignore unnamed symbols: they're uninformative
- * and inserted at a whim.
- */
- if (*kallsyms_symbol_name(kallsyms, i) == '\0'
- || is_arm_mapping_symbol(kallsyms_symbol_name(kallsyms, i)))
- continue;
-
- if (thisval <= addr && thisval > bestval) {
- best = i;
- bestval = thisval;
- }
- if (thisval > addr && thisval < nextval)
- nextval = thisval;
- }
-
- if (!best)
- return NULL;
-
- if (size)
- *size = nextval - bestval;
- if (offset)
- *offset = addr - bestval;
-
- return kallsyms_symbol_name(kallsyms, best);
-}
-
-void * __weak dereference_module_function_descriptor(struct module *mod,
- void *ptr)
-{
- return ptr;
-}
-
-/*
- * For kallsyms to ask for address resolution. NULL means not found. Careful
- * not to lock to avoid deadlock on oopses, simply disable preemption.
- */
-const char *module_address_lookup(unsigned long addr,
- unsigned long *size,
- unsigned long *offset,
- char **modname,
- const unsigned char **modbuildid,
- char *namebuf)
-{
- const char *ret = NULL;
- struct module *mod;
-
- preempt_disable();
- mod = __module_address(addr);
- if (mod) {
- if (modname)
- *modname = mod->name;
- if (modbuildid) {
-#if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID)
- *modbuildid = mod->build_id;
-#else
- *modbuildid = NULL;
-#endif
- }
-
- ret = find_kallsyms_symbol(mod, addr, size, offset);
- }
- /* Make a copy in here where it's safe */
- if (ret) {
- strncpy(namebuf, ret, KSYM_NAME_LEN - 1);
- ret = namebuf;
- }
- preempt_enable();
-
- return ret;
-}
-
-int lookup_module_symbol_name(unsigned long addr, char *symname)
-{
- struct module *mod;
-
- preempt_disable();
- list_for_each_entry_rcu(mod, &modules, list) {
- if (mod->state == MODULE_STATE_UNFORMED)
- continue;
- if (within_module(addr, mod)) {
- const char *sym;
-
- sym = find_kallsyms_symbol(mod, addr, NULL, NULL);
- if (!sym)
- goto out;
-
- strlcpy(symname, sym, KSYM_NAME_LEN);
- preempt_enable();
- return 0;
- }
- }
-out:
- preempt_enable();
- return -ERANGE;
-}
-
-int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size,
- unsigned long *offset, char *modname, char *name)
-{
- struct module *mod;
-
- preempt_disable();
- list_for_each_entry_rcu(mod, &modules, list) {
- if (mod->state == MODULE_STATE_UNFORMED)
- continue;
- if (within_module(addr, mod)) {
- const char *sym;
-
- sym = find_kallsyms_symbol(mod, addr, size, offset);
- if (!sym)
- goto out;
- if (modname)
- strlcpy(modname, mod->name, MODULE_NAME_LEN);
- if (name)
- strlcpy(name, sym, KSYM_NAME_LEN);
- preempt_enable();
- return 0;
- }
- }
-out:
- preempt_enable();
- return -ERANGE;
-}
-
-int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
- char *name, char *module_name, int *exported)
-{
- struct module *mod;
-
- preempt_disable();
- list_for_each_entry_rcu(mod, &modules, list) {
- struct mod_kallsyms *kallsyms;
-
- if (mod->state == MODULE_STATE_UNFORMED)
- continue;
- kallsyms = rcu_dereference_sched(mod->kallsyms);
- if (symnum < kallsyms->num_symtab) {
- const Elf_Sym *sym = &kallsyms->symtab[symnum];
-
- *value = kallsyms_symbol_value(sym);
- *type = kallsyms->typetab[symnum];
- strlcpy(name, kallsyms_symbol_name(kallsyms, symnum), KSYM_NAME_LEN);
- strlcpy(module_name, mod->name, MODULE_NAME_LEN);
- *exported = is_exported(name, *value, mod);
- preempt_enable();
- return 0;
+ if (len < 0) {
+ mod_stat_inc(&failed_kreads);
+ mod_stat_add_long(len, &invalid_kread_bytes);
+ return len;
+ }
+
+ if (flags & MODULE_INIT_COMPRESSED_FILE) {
+ err = module_decompress(&info, buf, len);
+ vfree(buf); /* compressed data is no longer needed */
+ if (err) {
+ mod_stat_inc(&failed_decompress);
+ mod_stat_add_long(len, &invalid_decompress_bytes);
+ return err;
}
- symnum -= kallsyms->num_symtab;
- }
- preempt_enable();
- return -ERANGE;
-}
-
-/* Given a module and name of symbol, find and return the symbol's value */
-static unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name)
-{
- unsigned int i;
- struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms);
-
- for (i = 0; i < kallsyms->num_symtab; i++) {
- const Elf_Sym *sym = &kallsyms->symtab[i];
-
- if (strcmp(name, kallsyms_symbol_name(kallsyms, i)) == 0 &&
- sym->st_shndx != SHN_UNDEF)
- return kallsyms_symbol_value(sym);
- }
- return 0;
-}
-
-/* Look for this name: can be of form module:name. */
-unsigned long module_kallsyms_lookup_name(const char *name)
-{
- struct module *mod;
- char *colon;
- unsigned long ret = 0;
-
- /* Don't lock: we're in enough trouble already. */
- preempt_disable();
- if ((colon = strnchr(name, MODULE_NAME_LEN, ':')) != NULL) {
- if ((mod = find_module_all(name, colon - name, false)) != NULL)
- ret = find_kallsyms_symbol_value(mod, colon+1);
} else {
- list_for_each_entry_rcu(mod, &modules, list) {
- if (mod->state == MODULE_STATE_UNFORMED)
- continue;
- if ((ret = find_kallsyms_symbol_value(mod, name)) != 0)
- break;
- }
- }
- preempt_enable();
- return ret;
-}
-
-#ifdef CONFIG_LIVEPATCH
-int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *,
- struct module *, unsigned long),
- void *data)
-{
- struct module *mod;
- unsigned int i;
- int ret = 0;
-
- mutex_lock(&module_mutex);
- list_for_each_entry(mod, &modules, list) {
- /* We hold module_mutex: no need for rcu_dereference_sched */
- struct mod_kallsyms *kallsyms = mod->kallsyms;
-
- if (mod->state == MODULE_STATE_UNFORMED)
- continue;
- for (i = 0; i < kallsyms->num_symtab; i++) {
- const Elf_Sym *sym = &kallsyms->symtab[i];
-
- if (sym->st_shndx == SHN_UNDEF)
- continue;
-
- ret = fn(data, kallsyms_symbol_name(kallsyms, i),
- mod, kallsyms_symbol_value(sym));
- if (ret != 0)
- goto out;
- }
+ info.hdr = buf;
+ info.len = len;
}
-out:
- mutex_unlock(&module_mutex);
- return ret;
-}
-#endif /* CONFIG_LIVEPATCH */
-#endif /* CONFIG_KALLSYMS */
-
-static void cfi_init(struct module *mod)
-{
-#ifdef CONFIG_CFI_CLANG
- initcall_t *init;
- exitcall_t *exit;
-
- rcu_read_lock_sched();
- mod->cfi_check = (cfi_check_fn)
- find_kallsyms_symbol_value(mod, "__cfi_check");
- init = (initcall_t *)
- find_kallsyms_symbol_value(mod, "__cfi_jt_init_module");
- exit = (exitcall_t *)
- find_kallsyms_symbol_value(mod, "__cfi_jt_cleanup_module");
- rcu_read_unlock_sched();
-
- /* Fix init/exit functions to point to the CFI jump table */
- if (init)
- mod->init = *init;
- if (exit)
- mod->exit = *exit;
-
- cfi_module_add(mod, module_addr_min);
-#endif
-}
-static void cfi_cleanup(struct module *mod)
-{
-#ifdef CONFIG_CFI_CLANG
- cfi_module_remove(mod, module_addr_min);
-#endif
+ return load_module(&info, uargs, flags);
}
-/* Maximum number of characters written by module_flags() */
-#define MODULE_FLAGS_BUF_SIZE (TAINT_FLAGS_COUNT + 4)
-
/* Keep in sync with MODULE_FLAGS_BUF_SIZE !!! */
-static char *module_flags(struct module *mod, char *buf)
+char *module_flags(struct module *mod, char *buf, bool show_state)
{
int bx = 0;
BUG_ON(mod->state == MODULE_STATE_UNFORMED);
+ if (!mod->taints && !show_state)
+ goto out;
if (mod->taints ||
mod->state == MODULE_STATE_GOING ||
mod->state == MODULE_STATE_COMING) {
buf[bx++] = '(';
- bx += module_flags_taint(mod, buf + bx);
+ bx += module_flags_taint(mod->taints, buf + bx);
/* Show a - for module-is-being-unloaded */
- if (mod->state == MODULE_STATE_GOING)
+ if (mod->state == MODULE_STATE_GOING && show_state)
buf[bx++] = '-';
/* Show a + for module-is-being-loaded */
- if (mod->state == MODULE_STATE_COMING)
+ if (mod->state == MODULE_STATE_COMING && show_state)
buf[bx++] = '+';
buf[bx++] = ')';
}
+out:
buf[bx] = '\0';
return buf;
}
-#ifdef CONFIG_PROC_FS
-/* Called by the /proc file system to return a list of modules. */
-static void *m_start(struct seq_file *m, loff_t *pos)
-{
- mutex_lock(&module_mutex);
- return seq_list_start(&modules, *pos);
-}
-
-static void *m_next(struct seq_file *m, void *p, loff_t *pos)
-{
- return seq_list_next(p, &modules, pos);
-}
-
-static void m_stop(struct seq_file *m, void *p)
-{
- mutex_unlock(&module_mutex);
-}
-
-static int m_show(struct seq_file *m, void *p)
-{
- struct module *mod = list_entry(p, struct module, list);
- char buf[MODULE_FLAGS_BUF_SIZE];
- void *value;
-
- /* We always ignore unformed modules. */
- if (mod->state == MODULE_STATE_UNFORMED)
- return 0;
-
- seq_printf(m, "%s %u",
- mod->name, mod->init_layout.size + mod->core_layout.size);
- print_unload_info(m, mod);
-
- /* Informative for users. */
- seq_printf(m, " %s",
- mod->state == MODULE_STATE_GOING ? "Unloading" :
- mod->state == MODULE_STATE_COMING ? "Loading" :
- "Live");
- /* Used by oprofile and other similar tools. */
- value = m->private ? NULL : mod->core_layout.base;
- seq_printf(m, " 0x%px", value);
-
- /* Taints info */
- if (mod->taints)
- seq_printf(m, " %s", module_flags(mod, buf));
-
- seq_puts(m, "\n");
- return 0;
-}
-
-/*
- * Format: modulename size refcount deps address
- *
- * Where refcount is a number or -, and deps is a comma-separated list
- * of depends or -.
- */
-static const struct seq_operations modules_op = {
- .start = m_start,
- .next = m_next,
- .stop = m_stop,
- .show = m_show
-};
-
-/*
- * This also sets the "private" pointer to non-NULL if the
- * kernel pointers should be hidden (so you can just test
- * "m->private" to see if you should keep the values private).
- *
- * We use the same logic as for /proc/kallsyms.
- */
-static int modules_open(struct inode *inode, struct file *file)
-{
- int err = seq_open(file, &modules_op);
-
- if (!err) {
- struct seq_file *m = file->private_data;
- m->private = kallsyms_show_value(file->f_cred) ? NULL : (void *)8ul;
- }
-
- return err;
-}
-
-static const struct proc_ops modules_proc_ops = {
- .proc_flags = PROC_ENTRY_PERMANENT,
- .proc_open = modules_open,
- .proc_read = seq_read,
- .proc_lseek = seq_lseek,
- .proc_release = seq_release,
-};
-
-static int __init proc_modules_init(void)
-{
- proc_create("modules", 0, NULL, &modules_proc_ops);
- return 0;
-}
-module_init(proc_modules_init);
-#endif
-
/* Given an address, look for it in the module exception tables. */
const struct exception_table_entry *search_module_extables(unsigned long addr)
{
@@ -4678,12 +3182,20 @@ struct module *__module_address(unsigned long addr)
{
struct module *mod;
- if (addr < module_addr_min || addr > module_addr_max)
- return NULL;
+ if (addr >= mod_tree.addr_min && addr <= mod_tree.addr_max)
+ goto lookup;
+#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
+ if (addr >= mod_tree.data_addr_min && addr <= mod_tree.data_addr_max)
+ goto lookup;
+#endif
+
+ return NULL;
+
+lookup:
module_assert_mutex_or_preempt();
- mod = mod_find(addr);
+ mod = mod_find(addr, &mod_tree);
if (mod) {
BUG_ON(!within_module(addr, mod));
if (mod->state == MODULE_STATE_UNFORMED)
@@ -4723,8 +3235,8 @@ struct module *__module_text_address(unsigned long addr)
struct module *mod = __module_address(addr);
if (mod) {
/* Make sure it's within the text section. */
- if (!within(addr, mod->init_layout.base, mod->init_layout.text_size)
- && !within(addr, mod->core_layout.base, mod->core_layout.text_size))
+ if (!within_module_mem_type(addr, mod, MOD_TEXT) &&
+ !within_module_mem_type(addr, mod, MOD_INIT_TEXT))
mod = NULL;
}
return mod;
@@ -4742,25 +3254,24 @@ void print_modules(void)
list_for_each_entry_rcu(mod, &modules, list) {
if (mod->state == MODULE_STATE_UNFORMED)
continue;
- pr_cont(" %s%s", mod->name, module_flags(mod, buf));
+ pr_cont(" %s%s", mod->name, module_flags(mod, buf, true));
}
+
+ print_unloaded_tainted_modules();
preempt_enable();
- if (last_unloaded_module[0])
- pr_cont(" [last unloaded: %s]", last_unloaded_module);
+ if (last_unloaded_module.name[0])
+ pr_cont(" [last unloaded: %s%s]", last_unloaded_module.name,
+ last_unloaded_module.taints);
pr_cont("\n");
}
-#ifdef CONFIG_MODVERSIONS
-/*
- * Generate the signature for all relevant module structures here.
- * If these change, we don't want to try to parse the module.
- */
-void module_layout(struct module *mod,
- struct modversion_info *ver,
- struct kernel_param *kp,
- struct kernel_symbol *ks,
- struct tracepoint * const *tp)
+#ifdef CONFIG_MODULE_DEBUGFS
+struct dentry *mod_debugfs_root;
+
+static int module_debugfs_init(void)
{
+ mod_debugfs_root = debugfs_create_dir("modules", NULL);
+ return 0;
}
-EXPORT_SYMBOL(module_layout);
+module_init(module_debugfs_init);
#endif
diff --git a/kernel/module/procfs.c b/kernel/module/procfs.c
new file mode 100644
index 000000000000..0a4841e88adb
--- /dev/null
+++ b/kernel/module/procfs.c
@@ -0,0 +1,152 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Module proc support
+ *
+ * Copyright (C) 2008 Alexey Dobriyan
+ */
+
+#include <linux/module.h>
+#include <linux/kallsyms.h>
+#include <linux/mutex.h>
+#include <linux/seq_file.h>
+#include <linux/proc_fs.h>
+#include "internal.h"
+
+#ifdef CONFIG_MODULE_UNLOAD
+static inline void print_unload_info(struct seq_file *m, struct module *mod)
+{
+ struct module_use *use;
+ int printed_something = 0;
+
+ seq_printf(m, " %i ", module_refcount(mod));
+
+ /*
+ * Always include a trailing , so userspace can differentiate
+ * between this and the old multi-field proc format.
+ */
+ list_for_each_entry(use, &mod->source_list, source_list) {
+ printed_something = 1;
+ seq_printf(m, "%s,", use->source->name);
+ }
+
+ if (mod->init && !mod->exit) {
+ printed_something = 1;
+ seq_puts(m, "[permanent],");
+ }
+
+ if (!printed_something)
+ seq_puts(m, "-");
+}
+#else /* !CONFIG_MODULE_UNLOAD */
+static inline void print_unload_info(struct seq_file *m, struct module *mod)
+{
+ /* We don't know the usage count, or what modules are using. */
+ seq_puts(m, " - -");
+}
+#endif /* CONFIG_MODULE_UNLOAD */
+
+/* Called by the /proc file system to return a list of modules. */
+static void *m_start(struct seq_file *m, loff_t *pos)
+{
+ mutex_lock(&module_mutex);
+ return seq_list_start(&modules, *pos);
+}
+
+static void *m_next(struct seq_file *m, void *p, loff_t *pos)
+{
+ return seq_list_next(p, &modules, pos);
+}
+
+static void m_stop(struct seq_file *m, void *p)
+{
+ mutex_unlock(&module_mutex);
+}
+
+static unsigned int module_total_size(struct module *mod)
+{
+ int size = 0;
+
+ for_each_mod_mem_type(type)
+ size += mod->mem[type].size;
+ return size;
+}
+
+static int m_show(struct seq_file *m, void *p)
+{
+ struct module *mod = list_entry(p, struct module, list);
+ char buf[MODULE_FLAGS_BUF_SIZE];
+ void *value;
+ unsigned int size;
+
+ /* We always ignore unformed modules. */
+ if (mod->state == MODULE_STATE_UNFORMED)
+ return 0;
+
+ size = module_total_size(mod);
+ seq_printf(m, "%s %u", mod->name, size);
+ print_unload_info(m, mod);
+
+ /* Informative for users. */
+ seq_printf(m, " %s",
+ mod->state == MODULE_STATE_GOING ? "Unloading" :
+ mod->state == MODULE_STATE_COMING ? "Loading" :
+ "Live");
+ /* Used by oprofile and other similar tools. */
+ value = m->private ? NULL : mod->mem[MOD_TEXT].base;
+ seq_printf(m, " 0x%px", value);
+
+ /* Taints info */
+ if (mod->taints)
+ seq_printf(m, " %s", module_flags(mod, buf, true));
+
+ seq_puts(m, "\n");
+ return 0;
+}
+
+/*
+ * Format: modulename size refcount deps address
+ *
+ * Where refcount is a number or -, and deps is a comma-separated list
+ * of depends or -.
+ */
+static const struct seq_operations modules_op = {
+ .start = m_start,
+ .next = m_next,
+ .stop = m_stop,
+ .show = m_show
+};
+
+/*
+ * This also sets the "private" pointer to non-NULL if the
+ * kernel pointers should be hidden (so you can just test
+ * "m->private" to see if you should keep the values private).
+ *
+ * We use the same logic as for /proc/kallsyms.
+ */
+static int modules_open(struct inode *inode, struct file *file)
+{
+ int err = seq_open(file, &modules_op);
+
+ if (!err) {
+ struct seq_file *m = file->private_data;
+
+ m->private = kallsyms_show_value(file->f_cred) ? NULL : (void *)8ul;
+ }
+
+ return err;
+}
+
+static const struct proc_ops modules_proc_ops = {
+ .proc_flags = PROC_ENTRY_PERMANENT,
+ .proc_open = modules_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_release = seq_release,
+};
+
+static int __init proc_modules_init(void)
+{
+ proc_create("modules", 0, NULL, &modules_proc_ops);
+ return 0;
+}
+module_init(proc_modules_init);
diff --git a/kernel/module/signing.c b/kernel/module/signing.c
new file mode 100644
index 000000000000..a2ff4242e623
--- /dev/null
+++ b/kernel/module/signing.c
@@ -0,0 +1,125 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* Module signature checker
+ *
+ * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/module.h>
+#include <linux/module_signature.h>
+#include <linux/string.h>
+#include <linux/verification.h>
+#include <linux/security.h>
+#include <crypto/public_key.h>
+#include <uapi/linux/module.h>
+#include "internal.h"
+
+#undef MODULE_PARAM_PREFIX
+#define MODULE_PARAM_PREFIX "module."
+
+static bool sig_enforce = IS_ENABLED(CONFIG_MODULE_SIG_FORCE);
+module_param(sig_enforce, bool_enable_only, 0644);
+
+/*
+ * Export sig_enforce kernel cmdline parameter to allow other subsystems rely
+ * on that instead of directly to CONFIG_MODULE_SIG_FORCE config.
+ */
+bool is_module_sig_enforced(void)
+{
+ return sig_enforce;
+}
+EXPORT_SYMBOL(is_module_sig_enforced);
+
+void set_module_sig_enforced(void)
+{
+ sig_enforce = true;
+}
+
+/*
+ * Verify the signature on a module.
+ */
+int mod_verify_sig(const void *mod, struct load_info *info)
+{
+ struct module_signature ms;
+ size_t sig_len, modlen = info->len;
+ int ret;
+
+ pr_devel("==>%s(,%zu)\n", __func__, modlen);
+
+ if (modlen <= sizeof(ms))
+ return -EBADMSG;
+
+ memcpy(&ms, mod + (modlen - sizeof(ms)), sizeof(ms));
+
+ ret = mod_check_sig(&ms, modlen, "module");
+ if (ret)
+ return ret;
+
+ sig_len = be32_to_cpu(ms.sig_len);
+ modlen -= sig_len + sizeof(ms);
+ info->len = modlen;
+
+ return verify_pkcs7_signature(mod, modlen, mod + modlen, sig_len,
+ VERIFY_USE_SECONDARY_KEYRING,
+ VERIFYING_MODULE_SIGNATURE,
+ NULL, NULL);
+}
+
+int module_sig_check(struct load_info *info, int flags)
+{
+ int err = -ENODATA;
+ const unsigned long markerlen = sizeof(MODULE_SIG_STRING) - 1;
+ const char *reason;
+ const void *mod = info->hdr;
+ bool mangled_module = flags & (MODULE_INIT_IGNORE_MODVERSIONS |
+ MODULE_INIT_IGNORE_VERMAGIC);
+ /*
+ * Do not allow mangled modules as a module with version information
+ * removed is no longer the module that was signed.
+ */
+ if (!mangled_module &&
+ info->len > markerlen &&
+ memcmp(mod + info->len - markerlen, MODULE_SIG_STRING, markerlen) == 0) {
+ /* We truncate the module to discard the signature */
+ info->len -= markerlen;
+ err = mod_verify_sig(mod, info);
+ if (!err) {
+ info->sig_ok = true;
+ return 0;
+ }
+ }
+
+ /*
+ * We don't permit modules to be loaded into the trusted kernels
+ * without a valid signature on them, but if we're not enforcing,
+ * certain errors are non-fatal.
+ */
+ switch (err) {
+ case -ENODATA:
+ reason = "unsigned module";
+ break;
+ case -ENOPKG:
+ reason = "module with unsupported crypto";
+ break;
+ case -ENOKEY:
+ reason = "module with unavailable key";
+ break;
+
+ default:
+ /*
+ * All other errors are fatal, including lack of memory,
+ * unparseable signatures, and signature check failures --
+ * even if signatures aren't required.
+ */
+ return err;
+ }
+
+ if (is_module_sig_enforced()) {
+ pr_notice("Loading of %s is rejected\n", reason);
+ return -EKEYREJECTED;
+ }
+
+ return security_locked_down(LOCKDOWN_MODULE_SIGNATURE);
+}
diff --git a/kernel/module/stats.c b/kernel/module/stats.c
new file mode 100644
index 000000000000..ad7b6ada29f2
--- /dev/null
+++ b/kernel/module/stats.c
@@ -0,0 +1,430 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Debugging module statistics.
+ *
+ * Copyright (C) 2023 Luis Chamberlain <mcgrof@kernel.org>
+ */
+
+#include <linux/module.h>
+#include <uapi/linux/module.h>
+#include <linux/string.h>
+#include <linux/printk.h>
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/debugfs.h>
+#include <linux/rculist.h>
+#include <linux/math.h>
+
+#include "internal.h"
+
+/**
+ * DOC: module debugging statistics overview
+ *
+ * Enabling CONFIG_MODULE_STATS enables module debugging statistics which
+ * are useful to monitor and root cause memory pressure issues with module
+ * loading. These statistics are useful to allow us to improve production
+ * workloads.
+ *
+ * The current module debugging statistics supported help keep track of module
+ * loading failures to enable improvements either for kernel module auto-loading
+ * usage (request_module()) or interactions with userspace. Statistics are
+ * provided to track all possible failures in the finit_module() path and memory
+ * wasted in this process space. Each of the failure counters are associated
+ * to a type of module loading failure which is known to incur a certain amount
+ * of memory allocation loss. In the worst case loading a module will fail after
+ * a 3 step memory allocation process:
+ *
+ * a) memory allocated with kernel_read_file_from_fd()
+ * b) module decompression processes the file read from
+ * kernel_read_file_from_fd(), and vmap() is used to map
+ * the decompressed module to a new local buffer which represents
+ * a copy of the decompressed module passed from userspace. The buffer
+ * from kernel_read_file_from_fd() is freed right away.
+ * c) layout_and_allocate() allocates space for the final resting
+ * place where we would keep the module if it were to be processed
+ * successfully.
+ *
+ * If a failure occurs after these three different allocations only one
+ * counter will be incremented with the summation of the allocated bytes freed
+ * incurred during this failure. Likewise, if module loading failed only after
+ * step b) a separate counter is used and incremented for the bytes freed and
+ * not used during both of those allocations.
+ *
+ * Virtual memory space can be limited, for example on x86 virtual memory size
+ * defaults to 128 MiB. We should strive to limit and avoid wasting virtual
+ * memory allocations when possible. These module debugging statistics help
+ * to evaluate how much memory is being wasted on bootup due to module loading
+ * failures.
+ *
+ * All counters are designed to be incremental. Atomic counters are used so to
+ * remain simple and avoid delays and deadlocks.
+ */
+
+/**
+ * DOC: dup_failed_modules - tracks duplicate failed modules
+ *
+ * Linked list of modules which failed to be loaded because an already existing
+ * module with the same name was already being processed or already loaded.
+ * The finit_module() system call incurs heavy virtual memory allocations. In
+ * the worst case an finit_module() system call can end up allocating virtual
+ * memory 3 times:
+ *
+ * 1) kernel_read_file_from_fd() call uses vmalloc()
+ * 2) optional module decompression uses vmap()
+ * 3) layout_and allocate() can use vzalloc() or an arch specific variation of
+ * vmalloc to deal with ELF sections requiring special permissions
+ *
+ * In practice on a typical boot today most finit_module() calls fail due to
+ * the module with the same name already being loaded or about to be processed.
+ * All virtual memory allocated to these failed modules will be freed with
+ * no functional use.
+ *
+ * To help with this the dup_failed_modules allows us to track modules which
+ * failed to load due to the fact that a module was already loaded or being
+ * processed. There are only two points at which we can fail such calls,
+ * we list them below along with the number of virtual memory allocation
+ * calls:
+ *
+ * a) FAIL_DUP_MOD_BECOMING: at the end of early_mod_check() before
+ * layout_and_allocate().
+ * - with module decompression: 2 virtual memory allocation calls
+ * - without module decompression: 1 virtual memory allocation calls
+ * b) FAIL_DUP_MOD_LOAD: after layout_and_allocate() on add_unformed_module()
+ * - with module decompression 3 virtual memory allocation calls
+ * - without module decompression 2 virtual memory allocation calls
+ *
+ * We should strive to get this list to be as small as possible. If this list
+ * is not empty it is a reflection of possible work or optimizations possible
+ * either in-kernel or in userspace.
+ */
+static LIST_HEAD(dup_failed_modules);
+
+/**
+ * DOC: module statistics debugfs counters
+ *
+ * The total amount of wasted virtual memory allocation space during module
+ * loading can be computed by adding the total from the summation:
+ *
+ * * @invalid_kread_bytes +
+ * @invalid_decompress_bytes +
+ * @invalid_becoming_bytes +
+ * @invalid_mod_bytes
+ *
+ * The following debugfs counters are available to inspect module loading
+ * failures:
+ *
+ * * total_mod_size: total bytes ever used by all modules we've dealt with on
+ * this system
+ * * total_text_size: total bytes of the .text and .init.text ELF section
+ * sizes we've dealt with on this system
+ * * invalid_kread_bytes: bytes allocated and then freed on failures which
+ * happen due to the initial kernel_read_file_from_fd(). kernel_read_file_from_fd()
+ * uses vmalloc(). These should typically not happen unless your system is
+ * under memory pressure.
+ * * invalid_decompress_bytes: number of bytes allocated and freed due to
+ * memory allocations in the module decompression path that use vmap().
+ * These typically should not happen unless your system is under memory
+ * pressure.
+ * * invalid_becoming_bytes: total number of bytes allocated and freed used
+ * used to read the kernel module userspace wants us to read before we
+ * promote it to be processed to be added to our @modules linked list. These
+ * failures can happen if we had a check in between a successful kernel_read_file_from_fd()
+ * call and right before we allocate the our private memory for the module
+ * which would be kept if the module is successfully loaded. The most common
+ * reason for this failure is when userspace is racing to load a module
+ * which it does not yet see loaded. The first module to succeed in
+ * add_unformed_module() will add a module to our &modules list and
+ * subsequent loads of modules with the same name will error out at the
+ * end of early_mod_check(). The check for module_patient_check_exists()
+ * at the end of early_mod_check() prevents duplicate allocations
+ * on layout_and_allocate() for modules already being processed. These
+ * duplicate failed modules are non-fatal, however they typically are
+ * indicative of userspace not seeing a module in userspace loaded yet and
+ * unnecessarily trying to load a module before the kernel even has a chance
+ * to begin to process prior requests. Although duplicate failures can be
+ * non-fatal, we should try to reduce vmalloc() pressure proactively, so
+ * ideally after boot this will be close to as 0 as possible. If module
+ * decompression was used we also add to this counter the cost of the
+ * initial kernel_read_file_from_fd() of the compressed module. If module
+ * decompression was not used the value represents the total allocated and
+ * freed bytes in kernel_read_file_from_fd() calls for these type of
+ * failures. These failures can occur because:
+ *
+ * * module_sig_check() - module signature checks
+ * * elf_validity_cache_copy() - some ELF validation issue
+ * * early_mod_check():
+ *
+ * * blacklisting
+ * * failed to rewrite section headers
+ * * version magic
+ * * live patch requirements didn't check out
+ * * the module was detected as being already present
+ *
+ * * invalid_mod_bytes: these are the total number of bytes allocated and
+ * freed due to failures after we did all the sanity checks of the module
+ * which userspace passed to us and after our first check that the module
+ * is unique. A module can still fail to load if we detect the module is
+ * loaded after we allocate space for it with layout_and_allocate(), we do
+ * this check right before processing the module as live and run its
+ * initialization routines. Note that you have a failure of this type it
+ * also means the respective kernel_read_file_from_fd() memory space was
+ * also freed and not used, and so we increment this counter with twice
+ * the size of the module. Additionally if you used module decompression
+ * the size of the compressed module is also added to this counter.
+ *
+ * * modcount: how many modules we've loaded in our kernel life time
+ * * failed_kreads: how many modules failed due to failed kernel_read_file_from_fd()
+ * * failed_decompress: how many failed module decompression attempts we've had.
+ * These really should not happen unless your compression / decompression
+ * might be broken.
+ * * failed_becoming: how many modules failed after we kernel_read_file_from_fd()
+ * it and before we allocate memory for it with layout_and_allocate(). This
+ * counter is never incremented if you manage to validate the module and
+ * call layout_and_allocate() for it.
+ * * failed_load_modules: how many modules failed once we've allocated our
+ * private space for our module using layout_and_allocate(). These failures
+ * should hopefully mostly be dealt with already. Races in theory could
+ * still exist here, but it would just mean the kernel had started processing
+ * two threads concurrently up to early_mod_check() and one thread won.
+ * These failures are good signs the kernel or userspace is doing something
+ * seriously stupid or that could be improved. We should strive to fix these,
+ * but it is perhaps not easy to fix them. A recent example are the modules
+ * requests incurred for frequency modules, a separate module request was
+ * being issued for each CPU on a system.
+ */
+
+atomic_long_t total_mod_size;
+atomic_long_t total_text_size;
+atomic_long_t invalid_kread_bytes;
+atomic_long_t invalid_decompress_bytes;
+static atomic_long_t invalid_becoming_bytes;
+static atomic_long_t invalid_mod_bytes;
+atomic_t modcount;
+atomic_t failed_kreads;
+atomic_t failed_decompress;
+static atomic_t failed_becoming;
+static atomic_t failed_load_modules;
+
+static const char *mod_fail_to_str(struct mod_fail_load *mod_fail)
+{
+ if (test_bit(FAIL_DUP_MOD_BECOMING, &mod_fail->dup_fail_mask) &&
+ test_bit(FAIL_DUP_MOD_LOAD, &mod_fail->dup_fail_mask))
+ return "Becoming & Load";
+ if (test_bit(FAIL_DUP_MOD_BECOMING, &mod_fail->dup_fail_mask))
+ return "Becoming";
+ if (test_bit(FAIL_DUP_MOD_LOAD, &mod_fail->dup_fail_mask))
+ return "Load";
+ return "Bug-on-stats";
+}
+
+void mod_stat_bump_invalid(struct load_info *info, int flags)
+{
+ atomic_long_add(info->len * 2, &invalid_mod_bytes);
+ atomic_inc(&failed_load_modules);
+#if defined(CONFIG_MODULE_DECOMPRESS)
+ if (flags & MODULE_INIT_COMPRESSED_FILE)
+ atomic_long_add(info->compressed_len, &invalid_mod_bytes);
+#endif
+}
+
+void mod_stat_bump_becoming(struct load_info *info, int flags)
+{
+ atomic_inc(&failed_becoming);
+ atomic_long_add(info->len, &invalid_becoming_bytes);
+#if defined(CONFIG_MODULE_DECOMPRESS)
+ if (flags & MODULE_INIT_COMPRESSED_FILE)
+ atomic_long_add(info->compressed_len, &invalid_becoming_bytes);
+#endif
+}
+
+int try_add_failed_module(const char *name, enum fail_dup_mod_reason reason)
+{
+ struct mod_fail_load *mod_fail;
+
+ list_for_each_entry_rcu(mod_fail, &dup_failed_modules, list,
+ lockdep_is_held(&module_mutex)) {
+ if (!strcmp(mod_fail->name, name)) {
+ atomic_long_inc(&mod_fail->count);
+ __set_bit(reason, &mod_fail->dup_fail_mask);
+ goto out;
+ }
+ }
+
+ mod_fail = kzalloc(sizeof(*mod_fail), GFP_KERNEL);
+ if (!mod_fail)
+ return -ENOMEM;
+ memcpy(mod_fail->name, name, strlen(name));
+ __set_bit(reason, &mod_fail->dup_fail_mask);
+ atomic_long_inc(&mod_fail->count);
+ list_add_rcu(&mod_fail->list, &dup_failed_modules);
+out:
+ return 0;
+}
+
+/*
+ * At 64 bytes per module and assuming a 1024 bytes preamble we can fit the
+ * 112 module prints within 8k.
+ *
+ * 1024 + (64*112) = 8k
+ */
+#define MAX_PREAMBLE 1024
+#define MAX_FAILED_MOD_PRINT 112
+#define MAX_BYTES_PER_MOD 64
+static ssize_t read_file_mod_stats(struct file *file, char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ struct mod_fail_load *mod_fail;
+ unsigned int len, size, count_failed = 0;
+ char *buf;
+ u32 live_mod_count, fkreads, fdecompress, fbecoming, floads;
+ unsigned long total_size, text_size, ikread_bytes, ibecoming_bytes,
+ idecompress_bytes, imod_bytes, total_virtual_lost;
+
+ live_mod_count = atomic_read(&modcount);
+ fkreads = atomic_read(&failed_kreads);
+ fdecompress = atomic_read(&failed_decompress);
+ fbecoming = atomic_read(&failed_becoming);
+ floads = atomic_read(&failed_load_modules);
+
+ total_size = atomic_long_read(&total_mod_size);
+ text_size = atomic_long_read(&total_text_size);
+ ikread_bytes = atomic_long_read(&invalid_kread_bytes);
+ idecompress_bytes = atomic_long_read(&invalid_decompress_bytes);
+ ibecoming_bytes = atomic_long_read(&invalid_becoming_bytes);
+ imod_bytes = atomic_long_read(&invalid_mod_bytes);
+
+ total_virtual_lost = ikread_bytes + idecompress_bytes + ibecoming_bytes + imod_bytes;
+
+ size = MAX_PREAMBLE + min((unsigned int)(floads + fbecoming),
+ (unsigned int)MAX_FAILED_MOD_PRINT) * MAX_BYTES_PER_MOD;
+ buf = kzalloc(size, GFP_KERNEL);
+ if (buf == NULL)
+ return -ENOMEM;
+
+ /* The beginning of our debug preamble */
+ len = scnprintf(buf, size, "%25s\t%u\n", "Mods ever loaded", live_mod_count);
+
+ len += scnprintf(buf + len, size - len, "%25s\t%u\n", "Mods failed on kread", fkreads);
+
+ len += scnprintf(buf + len, size - len, "%25s\t%u\n", "Mods failed on decompress",
+ fdecompress);
+ len += scnprintf(buf + len, size - len, "%25s\t%u\n", "Mods failed on becoming", fbecoming);
+
+ len += scnprintf(buf + len, size - len, "%25s\t%u\n", "Mods failed on load", floads);
+
+ len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Total module size", total_size);
+ len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Total mod text size", text_size);
+
+ len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Failed kread bytes", ikread_bytes);
+
+ len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Failed decompress bytes",
+ idecompress_bytes);
+
+ len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Failed becoming bytes", ibecoming_bytes);
+
+ len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Failed kmod bytes", imod_bytes);
+
+ len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Virtual mem wasted bytes", total_virtual_lost);
+
+ if (live_mod_count && total_size) {
+ len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Average mod size",
+ DIV_ROUND_UP(total_size, live_mod_count));
+ }
+
+ if (live_mod_count && text_size) {
+ len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Average mod text size",
+ DIV_ROUND_UP(text_size, live_mod_count));
+ }
+
+ /*
+ * We use WARN_ON_ONCE() for the counters to ensure we always have parity
+ * for keeping tabs on a type of failure with one type of byte counter.
+ * The counters for imod_bytes does not increase for fkreads failures
+ * for example, and so on.
+ */
+
+ WARN_ON_ONCE(ikread_bytes && !fkreads);
+ if (fkreads && ikread_bytes) {
+ len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Avg fail kread bytes",
+ DIV_ROUND_UP(ikread_bytes, fkreads));
+ }
+
+ WARN_ON_ONCE(ibecoming_bytes && !fbecoming);
+ if (fbecoming && ibecoming_bytes) {
+ len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Avg fail becoming bytes",
+ DIV_ROUND_UP(ibecoming_bytes, fbecoming));
+ }
+
+ WARN_ON_ONCE(idecompress_bytes && !fdecompress);
+ if (fdecompress && idecompress_bytes) {
+ len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Avg fail decomp bytes",
+ DIV_ROUND_UP(idecompress_bytes, fdecompress));
+ }
+
+ WARN_ON_ONCE(imod_bytes && !floads);
+ if (floads && imod_bytes) {
+ len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Average fail load bytes",
+ DIV_ROUND_UP(imod_bytes, floads));
+ }
+
+ /* End of our debug preamble header. */
+
+ /* Catch when we've gone beyond our expected preamble */
+ WARN_ON_ONCE(len >= MAX_PREAMBLE);
+
+ if (list_empty(&dup_failed_modules))
+ goto out;
+
+ len += scnprintf(buf + len, size - len, "Duplicate failed modules:\n");
+ len += scnprintf(buf + len, size - len, "%25s\t%15s\t%25s\n",
+ "Module-name", "How-many-times", "Reason");
+ mutex_lock(&module_mutex);
+
+
+ list_for_each_entry_rcu(mod_fail, &dup_failed_modules, list) {
+ if (WARN_ON_ONCE(++count_failed >= MAX_FAILED_MOD_PRINT))
+ goto out_unlock;
+ len += scnprintf(buf + len, size - len, "%25s\t%15lu\t%25s\n", mod_fail->name,
+ atomic_long_read(&mod_fail->count), mod_fail_to_str(mod_fail));
+ }
+out_unlock:
+ mutex_unlock(&module_mutex);
+out:
+ kfree(buf);
+ return simple_read_from_buffer(user_buf, count, ppos, buf, len);
+}
+#undef MAX_PREAMBLE
+#undef MAX_FAILED_MOD_PRINT
+#undef MAX_BYTES_PER_MOD
+
+static const struct file_operations fops_mod_stats = {
+ .read = read_file_mod_stats,
+ .open = simple_open,
+ .owner = THIS_MODULE,
+ .llseek = default_llseek,
+};
+
+#define mod_debug_add_ulong(name) debugfs_create_ulong(#name, 0400, mod_debugfs_root, (unsigned long *) &name.counter)
+#define mod_debug_add_atomic(name) debugfs_create_atomic_t(#name, 0400, mod_debugfs_root, &name)
+static int __init module_stats_init(void)
+{
+ mod_debug_add_ulong(total_mod_size);
+ mod_debug_add_ulong(total_text_size);
+ mod_debug_add_ulong(invalid_kread_bytes);
+ mod_debug_add_ulong(invalid_decompress_bytes);
+ mod_debug_add_ulong(invalid_becoming_bytes);
+ mod_debug_add_ulong(invalid_mod_bytes);
+
+ mod_debug_add_atomic(modcount);
+ mod_debug_add_atomic(failed_kreads);
+ mod_debug_add_atomic(failed_decompress);
+ mod_debug_add_atomic(failed_becoming);
+ mod_debug_add_atomic(failed_load_modules);
+
+ debugfs_create_file("stats", 0400, mod_debugfs_root, mod_debugfs_root, &fops_mod_stats);
+
+ return 0;
+}
+#undef mod_debug_add_ulong
+#undef mod_debug_add_atomic
+module_init(module_stats_init);
diff --git a/kernel/module/strict_rwx.c b/kernel/module/strict_rwx.c
new file mode 100644
index 000000000000..a2b656b4e3d2
--- /dev/null
+++ b/kernel/module/strict_rwx.c
@@ -0,0 +1,80 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Module strict rwx
+ *
+ * Copyright (C) 2015 Rusty Russell
+ */
+
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+#include <linux/set_memory.h>
+#include "internal.h"
+
+static void module_set_memory(const struct module *mod, enum mod_mem_type type,
+ int (*set_memory)(unsigned long start, int num_pages))
+{
+ const struct module_memory *mod_mem = &mod->mem[type];
+
+ set_vm_flush_reset_perms(mod_mem->base);
+ set_memory((unsigned long)mod_mem->base, mod_mem->size >> PAGE_SHIFT);
+}
+
+/*
+ * Since some arches are moving towards PAGE_KERNEL module allocations instead
+ * of PAGE_KERNEL_EXEC, keep module_enable_x() independent of
+ * CONFIG_STRICT_MODULE_RWX because they are needed regardless of whether we
+ * are strict.
+ */
+void module_enable_x(const struct module *mod)
+{
+ for_class_mod_mem_type(type, text)
+ module_set_memory(mod, type, set_memory_x);
+}
+
+void module_enable_ro(const struct module *mod, bool after_init)
+{
+ if (!IS_ENABLED(CONFIG_STRICT_MODULE_RWX))
+ return;
+#ifdef CONFIG_STRICT_MODULE_RWX
+ if (!rodata_enabled)
+ return;
+#endif
+
+ module_set_memory(mod, MOD_TEXT, set_memory_ro);
+ module_set_memory(mod, MOD_INIT_TEXT, set_memory_ro);
+ module_set_memory(mod, MOD_RODATA, set_memory_ro);
+ module_set_memory(mod, MOD_INIT_RODATA, set_memory_ro);
+
+ if (after_init)
+ module_set_memory(mod, MOD_RO_AFTER_INIT, set_memory_ro);
+}
+
+void module_enable_nx(const struct module *mod)
+{
+ if (!IS_ENABLED(CONFIG_STRICT_MODULE_RWX))
+ return;
+
+ for_class_mod_mem_type(type, data)
+ module_set_memory(mod, type, set_memory_nx);
+}
+
+int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
+ char *secstrings, struct module *mod)
+{
+ const unsigned long shf_wx = SHF_WRITE | SHF_EXECINSTR;
+ int i;
+
+ if (!IS_ENABLED(CONFIG_STRICT_MODULE_RWX))
+ return 0;
+
+ for (i = 0; i < hdr->e_shnum; i++) {
+ if ((sechdrs[i].sh_flags & shf_wx) == shf_wx) {
+ pr_err("%s: section %s (index %d) has invalid WRITE|EXEC flags\n",
+ mod->name, secstrings + sechdrs[i].sh_name, i);
+ return -ENOEXEC;
+ }
+ }
+
+ return 0;
+}
diff --git a/kernel/module/sysfs.c b/kernel/module/sysfs.c
new file mode 100644
index 000000000000..c921bf044050
--- /dev/null
+++ b/kernel/module/sysfs.c
@@ -0,0 +1,436 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Module sysfs support
+ *
+ * Copyright (C) 2008 Rusty Russell
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/sysfs.h>
+#include <linux/slab.h>
+#include <linux/kallsyms.h>
+#include <linux/mutex.h>
+#include "internal.h"
+
+/*
+ * /sys/module/foo/sections stuff
+ * J. Corbet <corbet@lwn.net>
+ */
+#ifdef CONFIG_KALLSYMS
+struct module_sect_attr {
+ struct bin_attribute battr;
+ unsigned long address;
+};
+
+struct module_sect_attrs {
+ struct attribute_group grp;
+ unsigned int nsections;
+ struct module_sect_attr attrs[];
+};
+
+#define MODULE_SECT_READ_SIZE (3 /* "0x", "\n" */ + (BITS_PER_LONG / 4))
+static ssize_t module_sect_read(struct file *file, struct kobject *kobj,
+ struct bin_attribute *battr,
+ char *buf, loff_t pos, size_t count)
+{
+ struct module_sect_attr *sattr =
+ container_of(battr, struct module_sect_attr, battr);
+ char bounce[MODULE_SECT_READ_SIZE + 1];
+ size_t wrote;
+
+ if (pos != 0)
+ return -EINVAL;
+
+ /*
+ * Since we're a binary read handler, we must account for the
+ * trailing NUL byte that sprintf will write: if "buf" is
+ * too small to hold the NUL, or the NUL is exactly the last
+ * byte, the read will look like it got truncated by one byte.
+ * Since there is no way to ask sprintf nicely to not write
+ * the NUL, we have to use a bounce buffer.
+ */
+ wrote = scnprintf(bounce, sizeof(bounce), "0x%px\n",
+ kallsyms_show_value(file->f_cred)
+ ? (void *)sattr->address : NULL);
+ count = min(count, wrote);
+ memcpy(buf, bounce, count);
+
+ return count;
+}
+
+static void free_sect_attrs(struct module_sect_attrs *sect_attrs)
+{
+ unsigned int section;
+
+ for (section = 0; section < sect_attrs->nsections; section++)
+ kfree(sect_attrs->attrs[section].battr.attr.name);
+ kfree(sect_attrs);
+}
+
+static void add_sect_attrs(struct module *mod, const struct load_info *info)
+{
+ unsigned int nloaded = 0, i, size[2];
+ struct module_sect_attrs *sect_attrs;
+ struct module_sect_attr *sattr;
+ struct bin_attribute **gattr;
+
+ /* Count loaded sections and allocate structures */
+ for (i = 0; i < info->hdr->e_shnum; i++)
+ if (!sect_empty(&info->sechdrs[i]))
+ nloaded++;
+ size[0] = ALIGN(struct_size(sect_attrs, attrs, nloaded),
+ sizeof(sect_attrs->grp.bin_attrs[0]));
+ size[1] = (nloaded + 1) * sizeof(sect_attrs->grp.bin_attrs[0]);
+ sect_attrs = kzalloc(size[0] + size[1], GFP_KERNEL);
+ if (!sect_attrs)
+ return;
+
+ /* Setup section attributes. */
+ sect_attrs->grp.name = "sections";
+ sect_attrs->grp.bin_attrs = (void *)sect_attrs + size[0];
+
+ sect_attrs->nsections = 0;
+ sattr = &sect_attrs->attrs[0];
+ gattr = &sect_attrs->grp.bin_attrs[0];
+ for (i = 0; i < info->hdr->e_shnum; i++) {
+ Elf_Shdr *sec = &info->sechdrs[i];
+
+ if (sect_empty(sec))
+ continue;
+ sysfs_bin_attr_init(&sattr->battr);
+ sattr->address = sec->sh_addr;
+ sattr->battr.attr.name =
+ kstrdup(info->secstrings + sec->sh_name, GFP_KERNEL);
+ if (!sattr->battr.attr.name)
+ goto out;
+ sect_attrs->nsections++;
+ sattr->battr.read = module_sect_read;
+ sattr->battr.size = MODULE_SECT_READ_SIZE;
+ sattr->battr.attr.mode = 0400;
+ *(gattr++) = &(sattr++)->battr;
+ }
+ *gattr = NULL;
+
+ if (sysfs_create_group(&mod->mkobj.kobj, &sect_attrs->grp))
+ goto out;
+
+ mod->sect_attrs = sect_attrs;
+ return;
+out:
+ free_sect_attrs(sect_attrs);
+}
+
+static void remove_sect_attrs(struct module *mod)
+{
+ if (mod->sect_attrs) {
+ sysfs_remove_group(&mod->mkobj.kobj,
+ &mod->sect_attrs->grp);
+ /*
+ * We are positive that no one is using any sect attrs
+ * at this point. Deallocate immediately.
+ */
+ free_sect_attrs(mod->sect_attrs);
+ mod->sect_attrs = NULL;
+ }
+}
+
+/*
+ * /sys/module/foo/notes/.section.name gives contents of SHT_NOTE sections.
+ */
+
+struct module_notes_attrs {
+ struct kobject *dir;
+ unsigned int notes;
+ struct bin_attribute attrs[];
+};
+
+static ssize_t module_notes_read(struct file *filp, struct kobject *kobj,
+ struct bin_attribute *bin_attr,
+ char *buf, loff_t pos, size_t count)
+{
+ /*
+ * The caller checked the pos and count against our size.
+ */
+ memcpy(buf, bin_attr->private + pos, count);
+ return count;
+}
+
+static void free_notes_attrs(struct module_notes_attrs *notes_attrs,
+ unsigned int i)
+{
+ if (notes_attrs->dir) {
+ while (i-- > 0)
+ sysfs_remove_bin_file(notes_attrs->dir,
+ &notes_attrs->attrs[i]);
+ kobject_put(notes_attrs->dir);
+ }
+ kfree(notes_attrs);
+}
+
+static void add_notes_attrs(struct module *mod, const struct load_info *info)
+{
+ unsigned int notes, loaded, i;
+ struct module_notes_attrs *notes_attrs;
+ struct bin_attribute *nattr;
+
+ /* failed to create section attributes, so can't create notes */
+ if (!mod->sect_attrs)
+ return;
+
+ /* Count notes sections and allocate structures. */
+ notes = 0;
+ for (i = 0; i < info->hdr->e_shnum; i++)
+ if (!sect_empty(&info->sechdrs[i]) &&
+ info->sechdrs[i].sh_type == SHT_NOTE)
+ ++notes;
+
+ if (notes == 0)
+ return;
+
+ notes_attrs = kzalloc(struct_size(notes_attrs, attrs, notes),
+ GFP_KERNEL);
+ if (!notes_attrs)
+ return;
+
+ notes_attrs->notes = notes;
+ nattr = &notes_attrs->attrs[0];
+ for (loaded = i = 0; i < info->hdr->e_shnum; ++i) {
+ if (sect_empty(&info->sechdrs[i]))
+ continue;
+ if (info->sechdrs[i].sh_type == SHT_NOTE) {
+ sysfs_bin_attr_init(nattr);
+ nattr->attr.name = mod->sect_attrs->attrs[loaded].battr.attr.name;
+ nattr->attr.mode = 0444;
+ nattr->size = info->sechdrs[i].sh_size;
+ nattr->private = (void *)info->sechdrs[i].sh_addr;
+ nattr->read = module_notes_read;
+ ++nattr;
+ }
+ ++loaded;
+ }
+
+ notes_attrs->dir = kobject_create_and_add("notes", &mod->mkobj.kobj);
+ if (!notes_attrs->dir)
+ goto out;
+
+ for (i = 0; i < notes; ++i)
+ if (sysfs_create_bin_file(notes_attrs->dir,
+ &notes_attrs->attrs[i]))
+ goto out;
+
+ mod->notes_attrs = notes_attrs;
+ return;
+
+out:
+ free_notes_attrs(notes_attrs, i);
+}
+
+static void remove_notes_attrs(struct module *mod)
+{
+ if (mod->notes_attrs)
+ free_notes_attrs(mod->notes_attrs, mod->notes_attrs->notes);
+}
+
+#else /* !CONFIG_KALLSYMS */
+static inline void add_sect_attrs(struct module *mod, const struct load_info *info) { }
+static inline void remove_sect_attrs(struct module *mod) { }
+static inline void add_notes_attrs(struct module *mod, const struct load_info *info) { }
+static inline void remove_notes_attrs(struct module *mod) { }
+#endif /* CONFIG_KALLSYMS */
+
+static void del_usage_links(struct module *mod)
+{
+#ifdef CONFIG_MODULE_UNLOAD
+ struct module_use *use;
+
+ mutex_lock(&module_mutex);
+ list_for_each_entry(use, &mod->target_list, target_list)
+ sysfs_remove_link(use->target->holders_dir, mod->name);
+ mutex_unlock(&module_mutex);
+#endif
+}
+
+static int add_usage_links(struct module *mod)
+{
+ int ret = 0;
+#ifdef CONFIG_MODULE_UNLOAD
+ struct module_use *use;
+
+ mutex_lock(&module_mutex);
+ list_for_each_entry(use, &mod->target_list, target_list) {
+ ret = sysfs_create_link(use->target->holders_dir,
+ &mod->mkobj.kobj, mod->name);
+ if (ret)
+ break;
+ }
+ mutex_unlock(&module_mutex);
+ if (ret)
+ del_usage_links(mod);
+#endif
+ return ret;
+}
+
+static void module_remove_modinfo_attrs(struct module *mod, int end)
+{
+ struct module_attribute *attr;
+ int i;
+
+ for (i = 0; (attr = &mod->modinfo_attrs[i]); i++) {
+ if (end >= 0 && i > end)
+ break;
+ /* pick a field to test for end of list */
+ if (!attr->attr.name)
+ break;
+ sysfs_remove_file(&mod->mkobj.kobj, &attr->attr);
+ if (attr->free)
+ attr->free(mod);
+ }
+ kfree(mod->modinfo_attrs);
+}
+
+static int module_add_modinfo_attrs(struct module *mod)
+{
+ struct module_attribute *attr;
+ struct module_attribute *temp_attr;
+ int error = 0;
+ int i;
+
+ mod->modinfo_attrs = kzalloc((sizeof(struct module_attribute) *
+ (modinfo_attrs_count + 1)),
+ GFP_KERNEL);
+ if (!mod->modinfo_attrs)
+ return -ENOMEM;
+
+ temp_attr = mod->modinfo_attrs;
+ for (i = 0; (attr = modinfo_attrs[i]); i++) {
+ if (!attr->test || attr->test(mod)) {
+ memcpy(temp_attr, attr, sizeof(*temp_attr));
+ sysfs_attr_init(&temp_attr->attr);
+ error = sysfs_create_file(&mod->mkobj.kobj,
+ &temp_attr->attr);
+ if (error)
+ goto error_out;
+ ++temp_attr;
+ }
+ }
+
+ return 0;
+
+error_out:
+ if (i > 0)
+ module_remove_modinfo_attrs(mod, --i);
+ else
+ kfree(mod->modinfo_attrs);
+ return error;
+}
+
+static void mod_kobject_put(struct module *mod)
+{
+ DECLARE_COMPLETION_ONSTACK(c);
+
+ mod->mkobj.kobj_completion = &c;
+ kobject_put(&mod->mkobj.kobj);
+ wait_for_completion(&c);
+}
+
+static int mod_sysfs_init(struct module *mod)
+{
+ int err;
+ struct kobject *kobj;
+
+ if (!module_kset) {
+ pr_err("%s: module sysfs not initialized\n", mod->name);
+ err = -EINVAL;
+ goto out;
+ }
+
+ kobj = kset_find_obj(module_kset, mod->name);
+ if (kobj) {
+ pr_err("%s: module is already loaded\n", mod->name);
+ kobject_put(kobj);
+ err = -EINVAL;
+ goto out;
+ }
+
+ mod->mkobj.mod = mod;
+
+ memset(&mod->mkobj.kobj, 0, sizeof(mod->mkobj.kobj));
+ mod->mkobj.kobj.kset = module_kset;
+ err = kobject_init_and_add(&mod->mkobj.kobj, &module_ktype, NULL,
+ "%s", mod->name);
+ if (err)
+ mod_kobject_put(mod);
+
+out:
+ return err;
+}
+
+int mod_sysfs_setup(struct module *mod,
+ const struct load_info *info,
+ struct kernel_param *kparam,
+ unsigned int num_params)
+{
+ int err;
+
+ err = mod_sysfs_init(mod);
+ if (err)
+ goto out;
+
+ mod->holders_dir = kobject_create_and_add("holders", &mod->mkobj.kobj);
+ if (!mod->holders_dir) {
+ err = -ENOMEM;
+ goto out_unreg;
+ }
+
+ err = module_param_sysfs_setup(mod, kparam, num_params);
+ if (err)
+ goto out_unreg_holders;
+
+ err = module_add_modinfo_attrs(mod);
+ if (err)
+ goto out_unreg_param;
+
+ err = add_usage_links(mod);
+ if (err)
+ goto out_unreg_modinfo_attrs;
+
+ add_sect_attrs(mod, info);
+ add_notes_attrs(mod, info);
+
+ return 0;
+
+out_unreg_modinfo_attrs:
+ module_remove_modinfo_attrs(mod, -1);
+out_unreg_param:
+ module_param_sysfs_remove(mod);
+out_unreg_holders:
+ kobject_put(mod->holders_dir);
+out_unreg:
+ mod_kobject_put(mod);
+out:
+ return err;
+}
+
+static void mod_sysfs_fini(struct module *mod)
+{
+ remove_notes_attrs(mod);
+ remove_sect_attrs(mod);
+ mod_kobject_put(mod);
+}
+
+void mod_sysfs_teardown(struct module *mod)
+{
+ del_usage_links(mod);
+ module_remove_modinfo_attrs(mod, -1);
+ module_param_sysfs_remove(mod);
+ kobject_put(mod->mkobj.drivers_dir);
+ kobject_put(mod->holders_dir);
+ mod_sysfs_fini(mod);
+}
+
+void init_param_lock(struct module *mod)
+{
+ mutex_init(&mod->param_lock);
+}
diff --git a/kernel/module/tracking.c b/kernel/module/tracking.c
new file mode 100644
index 000000000000..16742d1c630c
--- /dev/null
+++ b/kernel/module/tracking.c
@@ -0,0 +1,129 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Module taint unload tracking support
+ *
+ * Copyright (C) 2022 Aaron Tomlin
+ */
+
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/printk.h>
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/debugfs.h>
+#include <linux/rculist.h>
+#include "internal.h"
+
+static LIST_HEAD(unloaded_tainted_modules);
+extern struct dentry *mod_debugfs_root;
+
+int try_add_tainted_module(struct module *mod)
+{
+ struct mod_unload_taint *mod_taint;
+
+ module_assert_mutex_or_preempt();
+
+ if (!mod->taints)
+ goto out;
+
+ list_for_each_entry_rcu(mod_taint, &unloaded_tainted_modules, list,
+ lockdep_is_held(&module_mutex)) {
+ if (!strcmp(mod_taint->name, mod->name) &&
+ mod_taint->taints & mod->taints) {
+ mod_taint->count++;
+ goto out;
+ }
+ }
+
+ mod_taint = kmalloc(sizeof(*mod_taint), GFP_KERNEL);
+ if (unlikely(!mod_taint))
+ return -ENOMEM;
+ strscpy(mod_taint->name, mod->name, MODULE_NAME_LEN);
+ mod_taint->taints = mod->taints;
+ list_add_rcu(&mod_taint->list, &unloaded_tainted_modules);
+ mod_taint->count = 1;
+out:
+ return 0;
+}
+
+void print_unloaded_tainted_modules(void)
+{
+ struct mod_unload_taint *mod_taint;
+ char buf[MODULE_FLAGS_BUF_SIZE];
+
+ if (!list_empty(&unloaded_tainted_modules)) {
+ printk(KERN_DEFAULT "Unloaded tainted modules:");
+ list_for_each_entry_rcu(mod_taint, &unloaded_tainted_modules,
+ list) {
+ size_t l;
+
+ l = module_flags_taint(mod_taint->taints, buf);
+ buf[l++] = '\0';
+ pr_cont(" %s(%s):%llu", mod_taint->name, buf,
+ mod_taint->count);
+ }
+ }
+}
+
+#ifdef CONFIG_DEBUG_FS
+static void *unloaded_tainted_modules_seq_start(struct seq_file *m, loff_t *pos)
+ __acquires(rcu)
+{
+ rcu_read_lock();
+ return seq_list_start_rcu(&unloaded_tainted_modules, *pos);
+}
+
+static void *unloaded_tainted_modules_seq_next(struct seq_file *m, void *p, loff_t *pos)
+{
+ return seq_list_next_rcu(p, &unloaded_tainted_modules, pos);
+}
+
+static void unloaded_tainted_modules_seq_stop(struct seq_file *m, void *p)
+ __releases(rcu)
+{
+ rcu_read_unlock();
+}
+
+static int unloaded_tainted_modules_seq_show(struct seq_file *m, void *p)
+{
+ struct mod_unload_taint *mod_taint;
+ char buf[MODULE_FLAGS_BUF_SIZE];
+ size_t l;
+
+ mod_taint = list_entry(p, struct mod_unload_taint, list);
+ l = module_flags_taint(mod_taint->taints, buf);
+ buf[l++] = '\0';
+
+ seq_printf(m, "%s (%s) %llu", mod_taint->name, buf, mod_taint->count);
+ seq_puts(m, "\n");
+
+ return 0;
+}
+
+static const struct seq_operations unloaded_tainted_modules_seq_ops = {
+ .start = unloaded_tainted_modules_seq_start,
+ .next = unloaded_tainted_modules_seq_next,
+ .stop = unloaded_tainted_modules_seq_stop,
+ .show = unloaded_tainted_modules_seq_show,
+};
+
+static int unloaded_tainted_modules_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &unloaded_tainted_modules_seq_ops);
+}
+
+static const struct file_operations unloaded_tainted_modules_fops = {
+ .open = unloaded_tainted_modules_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static int __init unloaded_tainted_modules_init(void)
+{
+ debugfs_create_file("unloaded_tainted", 0444, mod_debugfs_root, NULL,
+ &unloaded_tainted_modules_fops);
+ return 0;
+}
+module_init(unloaded_tainted_modules_init);
+#endif /* CONFIG_DEBUG_FS */
diff --git a/kernel/module/tree_lookup.c b/kernel/module/tree_lookup.c
new file mode 100644
index 000000000000..277197977d43
--- /dev/null
+++ b/kernel/module/tree_lookup.c
@@ -0,0 +1,112 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Modules tree lookup
+ *
+ * Copyright (C) 2015 Peter Zijlstra
+ * Copyright (C) 2015 Rusty Russell
+ */
+
+#include <linux/module.h>
+#include <linux/rbtree_latch.h>
+#include "internal.h"
+
+/*
+ * Use a latched RB-tree for __module_address(); this allows us to use
+ * RCU-sched lookups of the address from any context.
+ *
+ * This is conditional on PERF_EVENTS || TRACING because those can really hit
+ * __module_address() hard by doing a lot of stack unwinding; potentially from
+ * NMI context.
+ */
+
+static __always_inline unsigned long __mod_tree_val(struct latch_tree_node *n)
+{
+ struct module_memory *mod_mem = container_of(n, struct module_memory, mtn.node);
+
+ return (unsigned long)mod_mem->base;
+}
+
+static __always_inline unsigned long __mod_tree_size(struct latch_tree_node *n)
+{
+ struct module_memory *mod_mem = container_of(n, struct module_memory, mtn.node);
+
+ return (unsigned long)mod_mem->size;
+}
+
+static __always_inline bool
+mod_tree_less(struct latch_tree_node *a, struct latch_tree_node *b)
+{
+ return __mod_tree_val(a) < __mod_tree_val(b);
+}
+
+static __always_inline int
+mod_tree_comp(void *key, struct latch_tree_node *n)
+{
+ unsigned long val = (unsigned long)key;
+ unsigned long start, end;
+
+ start = __mod_tree_val(n);
+ if (val < start)
+ return -1;
+
+ end = start + __mod_tree_size(n);
+ if (val >= end)
+ return 1;
+
+ return 0;
+}
+
+static const struct latch_tree_ops mod_tree_ops = {
+ .less = mod_tree_less,
+ .comp = mod_tree_comp,
+};
+
+static noinline void __mod_tree_insert(struct mod_tree_node *node, struct mod_tree_root *tree)
+{
+ latch_tree_insert(&node->node, &tree->root, &mod_tree_ops);
+}
+
+static void __mod_tree_remove(struct mod_tree_node *node, struct mod_tree_root *tree)
+{
+ latch_tree_erase(&node->node, &tree->root, &mod_tree_ops);
+}
+
+/*
+ * These modifications: insert, remove_init and remove; are serialized by the
+ * module_mutex.
+ */
+void mod_tree_insert(struct module *mod)
+{
+ for_each_mod_mem_type(type) {
+ mod->mem[type].mtn.mod = mod;
+ if (mod->mem[type].size)
+ __mod_tree_insert(&mod->mem[type].mtn, &mod_tree);
+ }
+}
+
+void mod_tree_remove_init(struct module *mod)
+{
+ for_class_mod_mem_type(type, init) {
+ if (mod->mem[type].size)
+ __mod_tree_remove(&mod->mem[type].mtn, &mod_tree);
+ }
+}
+
+void mod_tree_remove(struct module *mod)
+{
+ for_each_mod_mem_type(type) {
+ if (mod->mem[type].size)
+ __mod_tree_remove(&mod->mem[type].mtn, &mod_tree);
+ }
+}
+
+struct module *mod_find(unsigned long addr, struct mod_tree_root *tree)
+{
+ struct latch_tree_node *ltn;
+
+ ltn = latch_tree_find((void *)addr, &tree->root, &mod_tree_ops);
+ if (!ltn)
+ return NULL;
+
+ return container_of(ltn, struct mod_tree_node, node)->mod;
+}
diff --git a/kernel/module/version.c b/kernel/module/version.c
new file mode 100644
index 000000000000..53f43ac5a73e
--- /dev/null
+++ b/kernel/module/version.c
@@ -0,0 +1,101 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Module version support
+ *
+ * Copyright (C) 2008 Rusty Russell
+ */
+
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/printk.h>
+#include "internal.h"
+
+int check_version(const struct load_info *info,
+ const char *symname,
+ struct module *mod,
+ const s32 *crc)
+{
+ Elf_Shdr *sechdrs = info->sechdrs;
+ unsigned int versindex = info->index.vers;
+ unsigned int i, num_versions;
+ struct modversion_info *versions;
+
+ /* Exporting module didn't supply crcs? OK, we're already tainted. */
+ if (!crc)
+ return 1;
+
+ /* No versions at all? modprobe --force does this. */
+ if (versindex == 0)
+ return try_to_force_load(mod, symname) == 0;
+
+ versions = (void *)sechdrs[versindex].sh_addr;
+ num_versions = sechdrs[versindex].sh_size
+ / sizeof(struct modversion_info);
+
+ for (i = 0; i < num_versions; i++) {
+ u32 crcval;
+
+ if (strcmp(versions[i].name, symname) != 0)
+ continue;
+
+ crcval = *crc;
+ if (versions[i].crc == crcval)
+ return 1;
+ pr_debug("Found checksum %X vs module %lX\n",
+ crcval, versions[i].crc);
+ goto bad_version;
+ }
+
+ /* Broken toolchain. Warn once, then let it go.. */
+ pr_warn_once("%s: no symbol version for %s\n", info->name, symname);
+ return 1;
+
+bad_version:
+ pr_warn("%s: disagrees about version of symbol %s\n", info->name, symname);
+ return 0;
+}
+
+int check_modstruct_version(const struct load_info *info,
+ struct module *mod)
+{
+ struct find_symbol_arg fsa = {
+ .name = "module_layout",
+ .gplok = true,
+ };
+
+ /*
+ * Since this should be found in kernel (which can't be removed), no
+ * locking is necessary -- use preempt_disable() to placate lockdep.
+ */
+ preempt_disable();
+ if (!find_symbol(&fsa)) {
+ preempt_enable();
+ BUG();
+ }
+ preempt_enable();
+ return check_version(info, "module_layout", mod, fsa.crc);
+}
+
+/* First part is kernel version, which we ignore if module has crcs. */
+int same_magic(const char *amagic, const char *bmagic,
+ bool has_crcs)
+{
+ if (has_crcs) {
+ amagic += strcspn(amagic, " ");
+ bmagic += strcspn(bmagic, " ");
+ }
+ return strcmp(amagic, bmagic) == 0;
+}
+
+/*
+ * Generate the signature for all relevant module structures here.
+ * If these change, we don't want to try to parse the module.
+ */
+void module_layout(struct module *mod,
+ struct modversion_info *ver,
+ struct kernel_param *kp,
+ struct kernel_symbol *ks,
+ struct tracepoint * const *tp)
+{
+}
+EXPORT_SYMBOL(module_layout);
diff --git a/kernel/module_signing.c b/kernel/module_signing.c
deleted file mode 100644
index 8723ae70ea1f..000000000000
--- a/kernel/module_signing.c
+++ /dev/null
@@ -1,45 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/* Module signature checker
- *
- * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- */
-
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/module.h>
-#include <linux/module_signature.h>
-#include <linux/string.h>
-#include <linux/verification.h>
-#include <crypto/public_key.h>
-#include "module-internal.h"
-
-/*
- * Verify the signature on a module.
- */
-int mod_verify_sig(const void *mod, struct load_info *info)
-{
- struct module_signature ms;
- size_t sig_len, modlen = info->len;
- int ret;
-
- pr_devel("==>%s(,%zu)\n", __func__, modlen);
-
- if (modlen <= sizeof(ms))
- return -EBADMSG;
-
- memcpy(&ms, mod + (modlen - sizeof(ms)), sizeof(ms));
-
- ret = mod_check_sig(&ms, modlen, "module");
- if (ret)
- return ret;
-
- sig_len = be32_to_cpu(ms.sig_len);
- modlen -= sig_len + sizeof(ms);
- info->len = modlen;
-
- return verify_pkcs7_signature(mod, modlen, mod + modlen, sig_len,
- VERIFY_USE_SECONDARY_KEYRING,
- VERIFYING_MODULE_SIGNATURE,
- NULL, NULL);
-}
diff --git a/kernel/notifier.c b/kernel/notifier.c
index 1b019cbca594..b3ce28f39eb6 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -7,6 +7,9 @@
#include <linux/vmalloc.h>
#include <linux/reboot.h>
+#define CREATE_TRACE_POINTS
+#include <trace/events/notifier.h>
+
/*
* Notifier list for kernel code which wants to be called
* at shutdown. This is used to stop any idling DMA operations
@@ -20,19 +23,24 @@ BLOCKING_NOTIFIER_HEAD(reboot_notifier_list);
*/
static int notifier_chain_register(struct notifier_block **nl,
- struct notifier_block *n)
+ struct notifier_block *n,
+ bool unique_priority)
{
while ((*nl) != NULL) {
if (unlikely((*nl) == n)) {
- WARN(1, "double register detected");
- return 0;
+ WARN(1, "notifier callback %ps already registered",
+ n->notifier_call);
+ return -EEXIST;
}
if (n->priority > (*nl)->priority)
break;
+ if (n->priority == (*nl)->priority && unique_priority)
+ return -EBUSY;
nl = &((*nl)->next);
}
n->next = *nl;
rcu_assign_pointer(*nl, n);
+ trace_notifier_register((void *)n->notifier_call);
return 0;
}
@@ -42,6 +50,7 @@ static int notifier_chain_unregister(struct notifier_block **nl,
while ((*nl) != NULL) {
if ((*nl) == n) {
rcu_assign_pointer(*nl, n->next);
+ trace_notifier_unregister((void *)n->notifier_call);
return 0;
}
nl = &((*nl)->next);
@@ -58,7 +67,7 @@ static int notifier_chain_unregister(struct notifier_block **nl,
* value of this parameter is -1.
* @nr_calls: Records the number of notifications sent. Don't care
* value of this field is NULL.
- * @returns: notifier_call_chain returns the value returned by the
+ * Return: notifier_call_chain returns the value returned by the
* last notifier function called.
*/
static int notifier_call_chain(struct notifier_block **nl,
@@ -80,6 +89,7 @@ static int notifier_call_chain(struct notifier_block **nl,
continue;
}
#endif
+ trace_notifier_run((void *)nb->notifier_call);
ret = nb->notifier_call(nb, val, v);
if (nr_calls)
@@ -101,13 +111,13 @@ NOKPROBE_SYMBOL(notifier_call_chain);
* @val_up: Value passed unmodified to the notifier function
* @val_down: Value passed unmodified to the notifier function when recovering
* from an error on @val_up
- * @v Pointer passed unmodified to the notifier function
+ * @v: Pointer passed unmodified to the notifier function
*
* NOTE: It is important the @nl chain doesn't change between the two
* invocations of notifier_call_chain() such that we visit the
* exact same notifier callbacks; this rules out any RCU usage.
*
- * Returns: the return value of the @val_up call.
+ * Return: the return value of the @val_up call.
*/
static int notifier_call_chain_robust(struct notifier_block **nl,
unsigned long val_up, unsigned long val_down,
@@ -134,7 +144,7 @@ static int notifier_call_chain_robust(struct notifier_block **nl,
*
* Adds a notifier to an atomic notifier chain.
*
- * Currently always returns zero.
+ * Returns 0 on success, %-EEXIST on error.
*/
int atomic_notifier_chain_register(struct atomic_notifier_head *nh,
struct notifier_block *n)
@@ -143,53 +153,57 @@ int atomic_notifier_chain_register(struct atomic_notifier_head *nh,
int ret;
spin_lock_irqsave(&nh->lock, flags);
- ret = notifier_chain_register(&nh->head, n);
+ ret = notifier_chain_register(&nh->head, n, false);
spin_unlock_irqrestore(&nh->lock, flags);
return ret;
}
EXPORT_SYMBOL_GPL(atomic_notifier_chain_register);
/**
- * atomic_notifier_chain_unregister - Remove notifier from an atomic notifier chain
+ * atomic_notifier_chain_register_unique_prio - Add notifier to an atomic notifier chain
* @nh: Pointer to head of the atomic notifier chain
- * @n: Entry to remove from notifier chain
+ * @n: New entry in notifier chain
*
- * Removes a notifier from an atomic notifier chain.
+ * Adds a notifier to an atomic notifier chain if there is no other
+ * notifier registered using the same priority.
*
- * Returns zero on success or %-ENOENT on failure.
+ * Returns 0 on success, %-EEXIST or %-EBUSY on error.
*/
-int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh,
- struct notifier_block *n)
+int atomic_notifier_chain_register_unique_prio(struct atomic_notifier_head *nh,
+ struct notifier_block *n)
{
unsigned long flags;
int ret;
spin_lock_irqsave(&nh->lock, flags);
- ret = notifier_chain_unregister(&nh->head, n);
+ ret = notifier_chain_register(&nh->head, n, true);
spin_unlock_irqrestore(&nh->lock, flags);
- synchronize_rcu();
return ret;
}
-EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister);
+EXPORT_SYMBOL_GPL(atomic_notifier_chain_register_unique_prio);
-int atomic_notifier_call_chain_robust(struct atomic_notifier_head *nh,
- unsigned long val_up, unsigned long val_down, void *v)
+/**
+ * atomic_notifier_chain_unregister - Remove notifier from an atomic notifier chain
+ * @nh: Pointer to head of the atomic notifier chain
+ * @n: Entry to remove from notifier chain
+ *
+ * Removes a notifier from an atomic notifier chain.
+ *
+ * Returns zero on success or %-ENOENT on failure.
+ */
+int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh,
+ struct notifier_block *n)
{
unsigned long flags;
int ret;
- /*
- * Musn't use RCU; because then the notifier list can
- * change between the up and down traversal.
- */
spin_lock_irqsave(&nh->lock, flags);
- ret = notifier_call_chain_robust(&nh->head, val_up, val_down, v);
+ ret = notifier_chain_unregister(&nh->head, n);
spin_unlock_irqrestore(&nh->lock, flags);
-
+ synchronize_rcu();
return ret;
}
-EXPORT_SYMBOL_GPL(atomic_notifier_call_chain_robust);
-NOKPROBE_SYMBOL(atomic_notifier_call_chain_robust);
+EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister);
/**
* atomic_notifier_call_chain - Call functions in an atomic notifier chain
@@ -222,23 +236,27 @@ int atomic_notifier_call_chain(struct atomic_notifier_head *nh,
EXPORT_SYMBOL_GPL(atomic_notifier_call_chain);
NOKPROBE_SYMBOL(atomic_notifier_call_chain);
+/**
+ * atomic_notifier_call_chain_is_empty - Check whether notifier chain is empty
+ * @nh: Pointer to head of the atomic notifier chain
+ *
+ * Checks whether notifier chain is empty.
+ *
+ * Returns true is notifier chain is empty, false otherwise.
+ */
+bool atomic_notifier_call_chain_is_empty(struct atomic_notifier_head *nh)
+{
+ return !rcu_access_pointer(nh->head);
+}
+
/*
* Blocking notifier chain routines. All access to the chain is
* synchronized by an rwsem.
*/
-/**
- * blocking_notifier_chain_register - Add notifier to a blocking notifier chain
- * @nh: Pointer to head of the blocking notifier chain
- * @n: New entry in notifier chain
- *
- * Adds a notifier to a blocking notifier chain.
- * Must be called in process context.
- *
- * Currently always returns zero.
- */
-int blocking_notifier_chain_register(struct blocking_notifier_head *nh,
- struct notifier_block *n)
+static int __blocking_notifier_chain_register(struct blocking_notifier_head *nh,
+ struct notifier_block *n,
+ bool unique_priority)
{
int ret;
@@ -248,16 +266,49 @@ int blocking_notifier_chain_register(struct blocking_notifier_head *nh,
* such times we must not call down_write().
*/
if (unlikely(system_state == SYSTEM_BOOTING))
- return notifier_chain_register(&nh->head, n);
+ return notifier_chain_register(&nh->head, n, unique_priority);
down_write(&nh->rwsem);
- ret = notifier_chain_register(&nh->head, n);
+ ret = notifier_chain_register(&nh->head, n, unique_priority);
up_write(&nh->rwsem);
return ret;
}
+
+/**
+ * blocking_notifier_chain_register - Add notifier to a blocking notifier chain
+ * @nh: Pointer to head of the blocking notifier chain
+ * @n: New entry in notifier chain
+ *
+ * Adds a notifier to a blocking notifier chain.
+ * Must be called in process context.
+ *
+ * Returns 0 on success, %-EEXIST on error.
+ */
+int blocking_notifier_chain_register(struct blocking_notifier_head *nh,
+ struct notifier_block *n)
+{
+ return __blocking_notifier_chain_register(nh, n, false);
+}
EXPORT_SYMBOL_GPL(blocking_notifier_chain_register);
/**
+ * blocking_notifier_chain_register_unique_prio - Add notifier to a blocking notifier chain
+ * @nh: Pointer to head of the blocking notifier chain
+ * @n: New entry in notifier chain
+ *
+ * Adds a notifier to an blocking notifier chain if there is no other
+ * notifier registered using the same priority.
+ *
+ * Returns 0 on success, %-EEXIST or %-EBUSY on error.
+ */
+int blocking_notifier_chain_register_unique_prio(struct blocking_notifier_head *nh,
+ struct notifier_block *n)
+{
+ return __blocking_notifier_chain_register(nh, n, true);
+}
+EXPORT_SYMBOL_GPL(blocking_notifier_chain_register_unique_prio);
+
+/**
* blocking_notifier_chain_unregister - Remove notifier from a blocking notifier chain
* @nh: Pointer to head of the blocking notifier chain
* @n: Entry to remove from notifier chain
@@ -354,12 +405,12 @@ EXPORT_SYMBOL_GPL(blocking_notifier_call_chain);
* Adds a notifier to a raw notifier chain.
* All locking must be provided by the caller.
*
- * Currently always returns zero.
+ * Returns 0 on success, %-EEXIST on error.
*/
int raw_notifier_chain_register(struct raw_notifier_head *nh,
struct notifier_block *n)
{
- return notifier_chain_register(&nh->head, n);
+ return notifier_chain_register(&nh->head, n, false);
}
EXPORT_SYMBOL_GPL(raw_notifier_chain_register);
@@ -411,7 +462,6 @@ int raw_notifier_call_chain(struct raw_notifier_head *nh,
}
EXPORT_SYMBOL_GPL(raw_notifier_call_chain);
-#ifdef CONFIG_SRCU
/*
* SRCU notifier chain routines. Registration and unregistration
* use a mutex, and call_chain is synchronized by SRCU (no locks).
@@ -425,7 +475,7 @@ EXPORT_SYMBOL_GPL(raw_notifier_call_chain);
* Adds a notifier to an SRCU notifier chain.
* Must be called in process context.
*
- * Currently always returns zero.
+ * Returns 0 on success, %-EEXIST on error.
*/
int srcu_notifier_chain_register(struct srcu_notifier_head *nh,
struct notifier_block *n)
@@ -438,10 +488,10 @@ int srcu_notifier_chain_register(struct srcu_notifier_head *nh,
* such times we must not call mutex_lock().
*/
if (unlikely(system_state == SYSTEM_BOOTING))
- return notifier_chain_register(&nh->head, n);
+ return notifier_chain_register(&nh->head, n, false);
mutex_lock(&nh->mutex);
- ret = notifier_chain_register(&nh->head, n);
+ ret = notifier_chain_register(&nh->head, n, false);
mutex_unlock(&nh->mutex);
return ret;
}
@@ -528,8 +578,6 @@ void srcu_init_notifier_head(struct srcu_notifier_head *nh)
}
EXPORT_SYMBOL_GPL(srcu_init_notifier_head);
-#endif /* CONFIG_SRCU */
-
static ATOMIC_NOTIFIER_HEAD(die_chain);
int notrace notify_die(enum die_val val, const char *str,
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index abc01fcad8c7..80d9c6d77a45 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -157,7 +157,8 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
if (likely(!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
CLONE_NEWPID | CLONE_NEWNET |
CLONE_NEWCGROUP | CLONE_NEWTIME)))) {
- if (likely(old_ns->time_ns_for_children == old_ns->time_ns)) {
+ if ((flags & CLONE_VM) ||
+ likely(old_ns->time_ns_for_children == old_ns->time_ns)) {
get_nsproxy(old_ns);
return 0;
}
@@ -179,7 +180,8 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
if (IS_ERR(new_ns))
return PTR_ERR(new_ns);
- timens_on_fork(new_ns, tsk);
+ if ((flags & CLONE_VM) == 0)
+ timens_on_fork(new_ns, tsk);
tsk->nsproxy = new_ns;
return 0;
@@ -254,6 +256,23 @@ void exit_task_namespaces(struct task_struct *p)
switch_task_namespaces(p, NULL);
}
+int exec_task_namespaces(void)
+{
+ struct task_struct *tsk = current;
+ struct nsproxy *new;
+
+ if (tsk->nsproxy->time_ns_for_children == tsk->nsproxy->time_ns)
+ return 0;
+
+ new = create_new_namespaces(0, tsk, current_user_ns(), tsk->fs);
+ if (IS_ERR(new))
+ return PTR_ERR(new);
+
+ timens_on_fork(new, tsk);
+ switch_task_namespaces(tsk, new);
+ return 0;
+}
+
static int check_setns_flags(unsigned long flags)
{
if (!flags || (flags & ~(CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
@@ -526,21 +545,20 @@ static void commit_nsset(struct nsset *nsset)
SYSCALL_DEFINE2(setns, int, fd, int, flags)
{
- struct file *file;
+ struct fd f = fdget(fd);
struct ns_common *ns = NULL;
struct nsset nsset = {};
int err = 0;
- file = fget(fd);
- if (!file)
+ if (!f.file)
return -EBADF;
- if (proc_ns_file(file)) {
- ns = get_proc_ns(file_inode(file));
+ if (proc_ns_file(f.file)) {
+ ns = get_proc_ns(file_inode(f.file));
if (flags && (ns->ops->type != flags))
err = -EINVAL;
flags = ns->ops->type;
- } else if (!IS_ERR(pidfd_pid(file))) {
+ } else if (!IS_ERR(pidfd_pid(f.file))) {
err = check_setns_flags(flags);
} else {
err = -EINVAL;
@@ -552,22 +570,22 @@ SYSCALL_DEFINE2(setns, int, fd, int, flags)
if (err)
goto out;
- if (proc_ns_file(file))
+ if (proc_ns_file(f.file))
err = validate_ns(&nsset, ns);
else
- err = validate_nsset(&nsset, file->private_data);
+ err = validate_nsset(&nsset, f.file->private_data);
if (!err) {
commit_nsset(&nsset);
perf_event_namespaces(current);
}
put_nsset(&nsset);
out:
- fput(file);
+ fdput(f);
return err;
}
int __init nsproxy_cache_init(void)
{
- nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC);
+ nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC|SLAB_ACCOUNT);
return 0;
}
diff --git a/kernel/padata.c b/kernel/padata.c
index d4d3ba6e1728..222d60195de6 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -9,19 +9,6 @@
*
* Copyright (c) 2020 Oracle and/or its affiliates.
* Author: Daniel Jordan <daniel.m.jordan@oracle.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
*/
#include <linux/completion.h>
@@ -96,8 +83,16 @@ static struct padata_work *padata_work_alloc(void)
return pw;
}
-static void padata_work_init(struct padata_work *pw, work_func_t work_fn,
- void *data, int flags)
+/*
+ * This function is marked __ref because this function may be optimized in such
+ * a way that it directly refers to work_fn's address, which causes modpost to
+ * complain when work_fn is marked __init. This scenario was observed with clang
+ * LTO, where padata_work_init() was optimized to refer directly to
+ * padata_mt_helper() because the calls to padata_work_init() with other work_fn
+ * values were eliminated or inlined.
+ */
+static void __ref padata_work_init(struct padata_work *pw, work_func_t work_fn,
+ void *data, int flags)
{
if (flags & PADATA_WORK_ONSTACK)
INIT_WORK_ONSTACK(&pw->pw_work, work_fn);
@@ -194,7 +189,7 @@ int padata_do_parallel(struct padata_shell *ps,
goto out;
if (!cpumask_test_cpu(*cb_cpu, pd->cpumask.cbcpu)) {
- if (!cpumask_weight(pd->cpumask.cbcpu))
+ if (cpumask_empty(pd->cpumask.cbcpu))
goto out;
/* Select an alternate fallback CPU and notify the caller. */
@@ -211,7 +206,7 @@ int padata_do_parallel(struct padata_shell *ps,
if ((pinst->flags & PADATA_RESET))
goto out;
- atomic_inc(&pd->refcnt);
+ refcount_inc(&pd->refcnt);
padata->pd = pd;
padata->cb_cpu = *cb_cpu;
@@ -220,14 +215,16 @@ int padata_do_parallel(struct padata_shell *ps,
pw = padata_work_alloc();
spin_unlock(&padata_works_lock);
+ if (!pw) {
+ /* Maximum works limit exceeded, run in the current task. */
+ padata->parallel(padata);
+ }
+
rcu_read_unlock_bh();
if (pw) {
padata_work_init(pw, padata_parallel_worker, padata, 0);
queue_work(pinst->parallel_wq, &pw->pw_work);
- } else {
- /* Maximum works limit exceeded, run in the current task. */
- padata->parallel(padata);
}
return 0;
@@ -383,7 +380,7 @@ static void padata_serial_worker(struct work_struct *serial_work)
}
local_bh_enable();
- if (atomic_sub_and_test(cnt, &pd->refcnt))
+ if (refcount_sub_and_test(cnt, &pd->refcnt))
padata_free_pd(pd);
}
@@ -401,13 +398,16 @@ void padata_do_serial(struct padata_priv *padata)
int hashed_cpu = padata_cpu_hash(pd, padata->seq_nr);
struct padata_list *reorder = per_cpu_ptr(pd->reorder_list, hashed_cpu);
struct padata_priv *cur;
+ struct list_head *pos;
spin_lock(&reorder->lock);
/* Sort in ascending order of sequence number. */
- list_for_each_entry_reverse(cur, &reorder->list, list)
+ list_for_each_prev(pos, &reorder->list) {
+ cur = list_entry(pos, struct padata_priv, list);
if (cur->seq_nr < padata->seq_nr)
break;
- list_add(&padata->list, &cur->list);
+ }
+ list_add(&padata->list, pos);
spin_unlock(&reorder->lock);
/*
@@ -491,7 +491,7 @@ void __init padata_do_multithreaded(struct padata_mt_job *job)
return;
/* Ensure at least one thread when size < min_chunk. */
- nworks = max(job->size / job->min_chunk, 1ul);
+ nworks = max(job->size / max(job->min_chunk, job->align), 1ul);
nworks = min(nworks, job->max_threads);
if (nworks == 1) {
@@ -593,7 +593,7 @@ static struct parallel_data *padata_alloc_pd(struct padata_shell *ps)
padata_init_reorder_list(pd);
padata_init_squeues(pd);
pd->seq_nr = -1;
- atomic_set(&pd->refcnt, 1);
+ refcount_set(&pd->refcnt, 1);
spin_lock_init(&pd->lock);
pd->cpu = cpumask_first(pd->cpumask.pcpu);
INIT_WORK(&pd->reorder_work, invoke_padata_reorder);
@@ -667,7 +667,7 @@ static int padata_replace(struct padata_instance *pinst)
synchronize_rcu();
list_for_each_entry_continue_reverse(ps, &pinst->pslist, list)
- if (atomic_dec_and_test(&ps->opd->refcnt))
+ if (refcount_dec_and_test(&ps->opd->refcnt))
padata_free_pd(ps->opd);
pinst->flags &= ~PADATA_RESET;
@@ -733,7 +733,7 @@ int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type,
struct cpumask *serial_mask, *parallel_mask;
int err = -EINVAL;
- get_online_cpus();
+ cpus_read_lock();
mutex_lock(&pinst->lock);
switch (cpumask_type) {
@@ -753,7 +753,7 @@ int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type,
out:
mutex_unlock(&pinst->lock);
- put_online_cpus();
+ cpus_read_unlock();
return err;
}
@@ -967,7 +967,7 @@ static const struct sysfs_ops padata_sysfs_ops = {
.store = padata_sysfs_store,
};
-static struct kobj_type padata_attr_type = {
+static const struct kobj_type padata_attr_type = {
.sysfs_ops = &padata_sysfs_ops,
.default_groups = padata_default_groups,
.release = padata_sysfs_release,
@@ -992,7 +992,7 @@ struct padata_instance *padata_alloc(const char *name)
if (!pinst->parallel_wq)
goto err_free_inst;
- get_online_cpus();
+ cpus_read_lock();
pinst->serial_wq = alloc_workqueue("%s_serial", WQ_MEM_RECLAIM |
WQ_CPU_INTENSIVE, 1, name);
@@ -1026,7 +1026,7 @@ struct padata_instance *padata_alloc(const char *name)
&pinst->cpu_dead_node);
#endif
- put_online_cpus();
+ cpus_read_unlock();
return pinst;
@@ -1036,7 +1036,7 @@ err_free_masks:
err_free_serial_wq:
destroy_workqueue(pinst->serial_wq);
err_put_cpus:
- put_online_cpus();
+ cpus_read_unlock();
destroy_workqueue(pinst->parallel_wq);
err_free_inst:
kfree(pinst);
@@ -1074,9 +1074,9 @@ struct padata_shell *padata_alloc_shell(struct padata_instance *pinst)
ps->pinst = pinst;
- get_online_cpus();
+ cpus_read_lock();
pd = padata_alloc_pd(ps);
- put_online_cpus();
+ cpus_read_unlock();
if (!pd)
goto out_free_ps;
diff --git a/kernel/panic.c b/kernel/panic.c
index edad89660a2b..886d2ebd0a0d 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -25,6 +25,7 @@
#include <linux/kexec.h>
#include <linux/panic_notifier.h>
#include <linux/sched.h>
+#include <linux/string_helpers.h>
#include <linux/sysrq.h>
#include <linux/init.h>
#include <linux/nmi.h>
@@ -32,6 +33,9 @@
#include <linux/bug.h>
#include <linux/ratelimit.h>
#include <linux/debugfs.h>
+#include <linux/sysfs.h>
+#include <linux/context_tracking.h>
+#include <trace/events/error_report.h>
#include <asm/sections.h>
#define PANIC_TIMER_STEP 100
@@ -42,12 +46,14 @@
* Should we dump all CPUs backtraces in an oops event?
* Defaults to 0, can be changed via sysctl.
*/
-unsigned int __read_mostly sysctl_oops_all_cpu_backtrace;
+static unsigned int __read_mostly sysctl_oops_all_cpu_backtrace;
+#else
+#define sysctl_oops_all_cpu_backtrace 0
#endif /* CONFIG_SMP */
int panic_on_oops = CONFIG_PANIC_ON_OOPS_VALUE;
static unsigned long tainted_mask =
- IS_ENABLED(CONFIG_GCC_PLUGIN_RANDSTRUCT) ? (1 << TAINT_RANDSTRUCT) : 0;
+ IS_ENABLED(CONFIG_RANDSTRUCT) ? (1 << TAINT_RANDSTRUCT) : 0;
static int pause_on_oops;
static int pause_on_oops_flag;
static DEFINE_SPINLOCK(pause_on_oops_lock);
@@ -55,6 +61,7 @@ bool crash_kexec_post_notifiers;
int panic_on_warn __read_mostly;
unsigned long panic_on_taint;
bool panic_on_taint_nousertaint = false;
+static unsigned int warn_limit __read_mostly;
int panic_timeout = CONFIG_PANIC_TIMEOUT;
EXPORT_SYMBOL_GPL(panic_timeout);
@@ -65,12 +72,63 @@ EXPORT_SYMBOL_GPL(panic_timeout);
#define PANIC_PRINT_LOCK_INFO 0x00000008
#define PANIC_PRINT_FTRACE_INFO 0x00000010
#define PANIC_PRINT_ALL_PRINTK_MSG 0x00000020
+#define PANIC_PRINT_ALL_CPU_BT 0x00000040
unsigned long panic_print;
ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
EXPORT_SYMBOL(panic_notifier_list);
+#ifdef CONFIG_SYSCTL
+static struct ctl_table kern_panic_table[] = {
+#ifdef CONFIG_SMP
+ {
+ .procname = "oops_all_cpu_backtrace",
+ .data = &sysctl_oops_all_cpu_backtrace,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+#endif
+ {
+ .procname = "warn_limit",
+ .data = &warn_limit,
+ .maxlen = sizeof(warn_limit),
+ .mode = 0644,
+ .proc_handler = proc_douintvec,
+ },
+ { }
+};
+
+static __init int kernel_panic_sysctls_init(void)
+{
+ register_sysctl_init("kernel", kern_panic_table);
+ return 0;
+}
+late_initcall(kernel_panic_sysctls_init);
+#endif
+
+static atomic_t warn_count = ATOMIC_INIT(0);
+
+#ifdef CONFIG_SYSFS
+static ssize_t warn_count_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *page)
+{
+ return sysfs_emit(page, "%d\n", atomic_read(&warn_count));
+}
+
+static struct kobj_attribute warn_count_attr = __ATTR_RO(warn_count);
+
+static __init int kernel_panic_sysfs_init(void)
+{
+ sysfs_add_file_to_group(kernel_kobj, &warn_count_attr.attr, NULL);
+ return 0;
+}
+late_initcall(kernel_panic_sysfs_init);
+#endif
+
static long no_blink(int state)
{
return 0;
@@ -83,7 +141,7 @@ EXPORT_SYMBOL(panic_blink);
/*
* Stop ourself in panic -- architecture code may override this
*/
-void __weak panic_smp_self_stop(void)
+void __weak __noreturn panic_smp_self_stop(void)
{
while (1)
cpu_relax();
@@ -93,7 +151,7 @@ void __weak panic_smp_self_stop(void)
* Stop ourselves in NMI context if another CPU has already panicked. Arch code
* may override this to prepare for crash dumping, e.g. save regs info.
*/
-void __weak nmi_panic_self_stop(struct pt_regs *regs)
+void __weak __noreturn nmi_panic_self_stop(struct pt_regs *regs)
{
panic_smp_self_stop();
}
@@ -146,10 +204,13 @@ void nmi_panic(struct pt_regs *regs, const char *msg)
}
EXPORT_SYMBOL(nmi_panic);
-static void panic_print_sys_info(void)
+static void panic_print_sys_info(bool console_flush)
{
- if (panic_print & PANIC_PRINT_ALL_PRINTK_MSG)
- console_flush_on_panic(CONSOLE_REPLAY_ALL);
+ if (console_flush) {
+ if (panic_print & PANIC_PRINT_ALL_PRINTK_MSG)
+ console_flush_on_panic(CONSOLE_REPLAY_ALL);
+ return;
+ }
if (panic_print & PANIC_PRINT_TASK_INFO)
show_state();
@@ -167,6 +228,43 @@ static void panic_print_sys_info(void)
ftrace_dump(DUMP_ALL);
}
+void check_panic_on_warn(const char *origin)
+{
+ unsigned int limit;
+
+ if (panic_on_warn)
+ panic("%s: panic_on_warn set ...\n", origin);
+
+ limit = READ_ONCE(warn_limit);
+ if (atomic_inc_return(&warn_count) >= limit && limit)
+ panic("%s: system warned too often (kernel.warn_limit is %d)",
+ origin, limit);
+}
+
+/*
+ * Helper that triggers the NMI backtrace (if set in panic_print)
+ * and then performs the secondary CPUs shutdown - we cannot have
+ * the NMI backtrace after the CPUs are off!
+ */
+static void panic_other_cpus_shutdown(bool crash_kexec)
+{
+ if (panic_print & PANIC_PRINT_ALL_CPU_BT)
+ trigger_all_cpu_backtrace();
+
+ /*
+ * Note that smp_send_stop() is the usual SMP shutdown function,
+ * which unfortunately may not be hardened to work in a panic
+ * situation. If we want to do crash dump after notifier calls
+ * and kmsg_dump, we will need architecture dependent extra
+ * bits in addition to stopping other CPUs, hence we rely on
+ * crash_smp_send_stop() for that.
+ */
+ if (!crash_kexec)
+ smp_send_stop();
+ else
+ crash_smp_send_stop();
+}
+
/**
* panic - halt the system
* @fmt: The text string to print
@@ -184,6 +282,16 @@ void panic(const char *fmt, ...)
int old_cpu, this_cpu;
bool _crash_kexec_post_notifiers = crash_kexec_post_notifiers;
+ if (panic_on_warn) {
+ /*
+ * This thread may hit another WARN() in the panic path.
+ * Resetting this prevents additional WARN() from panicking the
+ * system on this thread. Other threads are blocked by the
+ * panic_mutex in panic().
+ */
+ panic_on_warn = 0;
+ }
+
/*
* Disable local interrupts. This will prevent panic_smp_self_stop
* from deadlocking the first cpu that invokes the panic, since
@@ -247,24 +355,10 @@ void panic(const char *fmt, ...)
*
* Bypass the panic_cpu check and call __crash_kexec directly.
*/
- if (!_crash_kexec_post_notifiers) {
- printk_safe_flush_on_panic();
+ if (!_crash_kexec_post_notifiers)
__crash_kexec(NULL);
- /*
- * Note smp_send_stop is the usual smp shutdown function, which
- * unfortunately means it may not be hardened to work in a
- * panic situation.
- */
- smp_send_stop();
- } else {
- /*
- * If we want to do crash dump after notifier calls and
- * kmsg_dump, we will need architecture dependent extra
- * works in addition to stopping other CPUs.
- */
- crash_smp_send_stop();
- }
+ panic_other_cpus_shutdown(_crash_kexec_post_notifiers);
/*
* Run any panic handlers, including those that might need to
@@ -272,8 +366,8 @@ void panic(const char *fmt, ...)
*/
atomic_notifier_call_chain(&panic_notifier_list, 0, buf);
- /* Call flush even twice. It tries harder with a single online CPU */
- printk_safe_flush_on_panic();
+ panic_print_sys_info(false);
+
kmsg_dump(KMSG_DUMP_PANIC);
/*
@@ -288,9 +382,6 @@ void panic(const char *fmt, ...)
if (_crash_kexec_post_notifiers)
__crash_kexec(NULL);
-#ifdef CONFIG_VT
- unblank_screen();
-#endif
console_unblank();
/*
@@ -304,7 +395,7 @@ void panic(const char *fmt, ...)
debug_locks_off();
console_flush_on_panic(CONSOLE_FLUSH_PENDING);
- panic_print_sys_info();
+ panic_print_sys_info(true);
if (!panic_blink)
panic_blink = no_blink;
@@ -387,6 +478,7 @@ const struct taint_flag taint_flags[TAINT_FLAGS_COUNT] = {
[ TAINT_LIVEPATCH ] = { 'K', ' ', true },
[ TAINT_AUX ] = { 'X', ' ', true },
[ TAINT_RANDSTRUCT ] = { 'T', ' ', true },
+ [ TAINT_TEST ] = { 'N', ' ', true },
};
/**
@@ -536,26 +628,9 @@ void oops_enter(void)
trigger_all_cpu_backtrace();
}
-/*
- * 64-bit random ID for oopses:
- */
-static u64 oops_id;
-
-static int init_oops_id(void)
-{
- if (!oops_id)
- get_random_bytes(&oops_id, sizeof(oops_id));
- else
- oops_id++;
-
- return 0;
-}
-late_initcall(init_oops_id);
-
static void print_oops_end_marker(void)
{
- init_oops_id();
- pr_warn("---[ end trace %016llx ]---\n", (unsigned long long)oops_id);
+ pr_warn("---[ end trace %016llx ]---\n", 0ULL);
}
/*
@@ -595,16 +670,7 @@ void __warn(const char *file, int line, void *caller, unsigned taint,
if (regs)
show_regs(regs);
- if (panic_on_warn) {
- /*
- * This thread may hit another WARN() in the panic path.
- * Resetting this prevents additional WARN() from panicking the
- * system on this thread. Other threads are blocked by the
- * panic_mutex in panic().
- */
- panic_on_warn = 0;
- panic("panic_on_warn set ...\n");
- }
+ check_panic_on_warn("kernel");
if (!regs)
dump_stack();
@@ -612,6 +678,7 @@ void __warn(const char *file, int line, void *caller, unsigned taint,
print_irqtrace_events(current);
print_oops_end_marker();
+ trace_error_report_end(ERROR_DETECTOR_WARN, (unsigned long)caller);
/* Just a warning, don't kill lockdep. */
add_taint(taint, LOCKDEP_STILL_OK);
@@ -621,6 +688,7 @@ void __warn(const char *file, int line, void *caller, unsigned taint,
void warn_slowpath_fmt(const char *file, int line, unsigned taint,
const char *fmt, ...)
{
+ bool rcu = warn_rcu_enter();
struct warn_args args;
pr_warn(CUT_HERE);
@@ -635,11 +703,13 @@ void warn_slowpath_fmt(const char *file, int line, unsigned taint,
va_start(args.args, fmt);
__warn(file, line, __builtin_return_address(0), taint, NULL, &args);
va_end(args.args);
+ warn_rcu_exit(rcu);
}
EXPORT_SYMBOL(warn_slowpath_fmt);
#else
void __warn_printk(const char *fmt, ...)
{
+ bool rcu = warn_rcu_enter();
va_list args;
pr_warn(CUT_HERE);
@@ -647,6 +717,7 @@ void __warn_printk(const char *fmt, ...)
va_start(args, fmt);
vprintk(fmt, args);
va_end(args);
+ warn_rcu_exit(rcu);
}
EXPORT_SYMBOL(__warn_printk);
#endif
@@ -729,8 +800,8 @@ static int __init panic_on_taint_setup(char *s)
if (s && !strcmp(s, "nousertaint"))
panic_on_taint_nousertaint = true;
- pr_info("panic_on_taint: bitmask=0x%lx nousertaint_mode=%sabled\n",
- panic_on_taint, panic_on_taint_nousertaint ? "en" : "dis");
+ pr_info("panic_on_taint: bitmask=0x%lx nousertaint_mode=%s\n",
+ panic_on_taint, str_enabled_disabled(panic_on_taint_nousertaint));
return 0;
}
diff --git a/kernel/params.c b/kernel/params.c
index 2daa2780a92c..6a7548979aa9 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -4,6 +4,7 @@
*/
#include <linux/kernel.h>
+#include <linux/kstrtox.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/module.h>
@@ -243,6 +244,24 @@ STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", kstrtoul);
STANDARD_PARAM_DEF(ullong, unsigned long long, "%llu", kstrtoull);
STANDARD_PARAM_DEF(hexint, unsigned int, "%#08x", kstrtouint);
+int param_set_uint_minmax(const char *val, const struct kernel_param *kp,
+ unsigned int min, unsigned int max)
+{
+ unsigned int num;
+ int ret;
+
+ if (!val)
+ return -EINVAL;
+ ret = kstrtouint(val, 0, &num);
+ if (ret)
+ return ret;
+ if (num < min || num > max)
+ return -EINVAL;
+ *((unsigned int *)kp->arg) = num;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(param_set_uint_minmax);
+
int param_set_charp(const char *val, const struct kernel_param *kp)
{
if (strlen(val) > 1024) {
@@ -292,7 +311,7 @@ int param_set_bool(const char *val, const struct kernel_param *kp)
if (!val) val = "1";
/* One of =[yYnN01] */
- return strtobool(val, kp->arg);
+ return kstrtobool(val, kp->arg);
}
EXPORT_SYMBOL(param_set_bool);
@@ -908,9 +927,9 @@ static const struct sysfs_ops module_sysfs_ops = {
.store = module_attr_store,
};
-static int uevent_filter(struct kset *kset, struct kobject *kobj)
+static int uevent_filter(const struct kobject *kobj)
{
- struct kobj_type *ktype = get_ktype(kobj);
+ const struct kobj_type *ktype = get_ktype(kobj);
if (ktype == &module_ktype)
return 1;
@@ -922,7 +941,6 @@ static const struct kset_uevent_ops module_uevent_ops = {
};
struct kset *module_kset;
-int module_sysfs_initialized;
static void module_kobj_release(struct kobject *kobj)
{
@@ -930,13 +948,17 @@ static void module_kobj_release(struct kobject *kobj)
complete(mk->kobj_completion);
}
-struct kobj_type module_ktype = {
+const struct kobj_type module_ktype = {
.release = module_kobj_release,
.sysfs_ops = &module_sysfs_ops,
};
/*
- * param_sysfs_init - wrapper for built-in params support
+ * param_sysfs_init - create "module" kset
+ *
+ * This must be done before the initramfs is unpacked and
+ * request_module() thus becomes possible, because otherwise the
+ * module load would fail in mod_sysfs_init.
*/
static int __init param_sysfs_init(void)
{
@@ -946,13 +968,25 @@ static int __init param_sysfs_init(void)
__FILE__, __LINE__);
return -ENOMEM;
}
- module_sysfs_initialized = 1;
+
+ return 0;
+}
+subsys_initcall(param_sysfs_init);
+
+/*
+ * param_sysfs_builtin_init - add sysfs version and parameter
+ * attributes for built-in modules
+ */
+static int __init param_sysfs_builtin_init(void)
+{
+ if (!module_kset)
+ return -ENOMEM;
version_sysfs_builtin();
param_sysfs_builtin();
return 0;
}
-subsys_initcall(param_sysfs_init);
+late_initcall(param_sysfs_builtin_init);
#endif /* CONFIG_SYSFS */
diff --git a/kernel/pid.c b/kernel/pid.c
index ebdf9c60cd0b..f93954a0384d 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -519,6 +519,7 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
{
return idr_get_next(&ns->idr, &nr);
}
+EXPORT_SYMBOL_GPL(find_ge_pid);
struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags)
{
@@ -540,6 +541,42 @@ struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags)
}
/**
+ * pidfd_get_task() - Get the task associated with a pidfd
+ *
+ * @pidfd: pidfd for which to get the task
+ * @flags: flags associated with this pidfd
+ *
+ * Return the task associated with @pidfd. The function takes a reference on
+ * the returned task. The caller is responsible for releasing that reference.
+ *
+ * Currently, the process identified by @pidfd is always a thread-group leader.
+ * This restriction currently exists for all aspects of pidfds including pidfd
+ * creation (CLONE_PIDFD cannot be used with CLONE_THREAD) and pidfd polling
+ * (only supports thread group leaders).
+ *
+ * Return: On success, the task_struct associated with the pidfd.
+ * On error, a negative errno number will be returned.
+ */
+struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags)
+{
+ unsigned int f_flags;
+ struct pid *pid;
+ struct task_struct *task;
+
+ pid = pidfd_get_pid(pidfd, &f_flags);
+ if (IS_ERR(pid))
+ return ERR_CAST(pid);
+
+ task = get_pid_task(pid, PIDTYPE_TGID);
+ put_pid(pid);
+ if (!task)
+ return ERR_PTR(-ESRCH);
+
+ *flags = f_flags;
+ return task;
+}
+
+/**
* pidfd_create() - Create a new pid file descriptor.
*
* @pid: struct pid that the pidfd will reference
@@ -550,19 +587,22 @@ struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags)
* Note, that this function can only be called after the fd table has
* been unshared to avoid leaking the pidfd to the new process.
*
+ * This symbol should not be explicitly exported to loadable modules.
+ *
* Return: On success, a cloexec pidfd is returned.
* On error, a negative errno number will be returned.
*/
-static int pidfd_create(struct pid *pid, unsigned int flags)
+int pidfd_create(struct pid *pid, unsigned int flags)
{
- int fd;
+ int pidfd;
+ struct file *pidfd_file;
- fd = anon_inode_getfd("[pidfd]", &pidfd_fops, get_pid(pid),
- flags | O_RDWR | O_CLOEXEC);
- if (fd < 0)
- put_pid(pid);
+ pidfd = pidfd_prepare(pid, flags, &pidfd_file);
+ if (pidfd < 0)
+ return pidfd;
- return fd;
+ fd_install(pidfd, pidfd_file);
+ return pidfd;
}
/**
@@ -596,10 +636,7 @@ SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
if (!p)
return -ESRCH;
- if (pid_has_task(p, PIDTYPE_TGID))
- fd = pidfd_create(p, flags);
- else
- fd = -EINVAL;
+ fd = pidfd_create(p, flags);
put_pid(p);
return fd;
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index ca43239a255a..b43eee07b00c 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -23,6 +23,7 @@
#include <linux/sched/task.h>
#include <linux/sched/signal.h>
#include <linux/idr.h>
+#include "pid_sysctl.h"
static DEFINE_MUTEX(pid_caches_mutex);
static struct kmem_cache *pid_ns_cachep;
@@ -51,7 +52,8 @@ static struct kmem_cache *create_pid_cachep(unsigned int level)
mutex_lock(&pid_caches_mutex);
/* Name collision forces to do allocation under mutex. */
if (!*pkc)
- *pkc = kmem_cache_create(name, len, 0, SLAB_HWCACHE_ALIGN, 0);
+ *pkc = kmem_cache_create(name, len, 0,
+ SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL);
mutex_unlock(&pid_caches_mutex);
/* current can fail, but someone else can succeed. */
return READ_ONCE(*pkc);
@@ -109,6 +111,8 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
ns->ucounts = ucounts;
ns->pid_allocated = PIDNS_ADDING;
+ initialize_memfd_noexec_scope(ns);
+
return ns;
out_free_idr:
@@ -243,7 +247,24 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
set_current_state(TASK_INTERRUPTIBLE);
if (pid_ns->pid_allocated == init_pids)
break;
+ /*
+ * Release tasks_rcu_exit_srcu to avoid following deadlock:
+ *
+ * 1) TASK A unshare(CLONE_NEWPID)
+ * 2) TASK A fork() twice -> TASK B (child reaper for new ns)
+ * and TASK C
+ * 3) TASK B exits, kills TASK C, waits for TASK A to reap it
+ * 4) TASK A calls synchronize_rcu_tasks()
+ * -> synchronize_srcu(tasks_rcu_exit_srcu)
+ * 5) *DEADLOCK*
+ *
+ * It is considered safe to release tasks_rcu_exit_srcu here
+ * because we assume the current task can not be concurrently
+ * reaped at this point.
+ */
+ exit_tasks_rcu_stop();
schedule();
+ exit_tasks_rcu_start();
}
__set_current_state(TASK_RUNNING);
@@ -293,7 +314,6 @@ static struct ctl_table pid_ns_ctl_table[] = {
},
{ }
};
-static struct ctl_path kern_path[] = { { .procname = "kernel", }, { } };
#endif /* CONFIG_CHECKPOINT_RESTORE */
int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
@@ -449,11 +469,13 @@ const struct proc_ns_operations pidns_for_children_operations = {
static __init int pid_namespaces_init(void)
{
- pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC);
+ pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC | SLAB_ACCOUNT);
#ifdef CONFIG_CHECKPOINT_RESTORE
- register_sysctl_paths(kern_path, pid_ns_ctl_table);
+ register_sysctl_init("kernel", pid_ns_ctl_table);
#endif
+
+ register_pid_ns_sysctl_table_vm();
return 0;
}
diff --git a/kernel/pid_sysctl.h b/kernel/pid_sysctl.h
new file mode 100644
index 000000000000..d67a4d45bb42
--- /dev/null
+++ b/kernel/pid_sysctl.h
@@ -0,0 +1,59 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef LINUX_PID_SYSCTL_H
+#define LINUX_PID_SYSCTL_H
+
+#include <linux/pid_namespace.h>
+
+#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
+static inline void initialize_memfd_noexec_scope(struct pid_namespace *ns)
+{
+ ns->memfd_noexec_scope =
+ task_active_pid_ns(current)->memfd_noexec_scope;
+}
+
+static int pid_mfd_noexec_dointvec_minmax(struct ctl_table *table,
+ int write, void *buf, size_t *lenp, loff_t *ppos)
+{
+ struct pid_namespace *ns = task_active_pid_ns(current);
+ struct ctl_table table_copy;
+
+ if (write && !ns_capable(ns->user_ns, CAP_SYS_ADMIN))
+ return -EPERM;
+
+ table_copy = *table;
+ if (ns != &init_pid_ns)
+ table_copy.data = &ns->memfd_noexec_scope;
+
+ /*
+ * set minimum to current value, the effect is only bigger
+ * value is accepted.
+ */
+ if (*(int *)table_copy.data > *(int *)table_copy.extra1)
+ table_copy.extra1 = table_copy.data;
+
+ return proc_dointvec_minmax(&table_copy, write, buf, lenp, ppos);
+}
+
+static struct ctl_table pid_ns_ctl_table_vm[] = {
+ {
+ .procname = "memfd_noexec",
+ .data = &init_pid_ns.memfd_noexec_scope,
+ .maxlen = sizeof(init_pid_ns.memfd_noexec_scope),
+ .mode = 0644,
+ .proc_handler = pid_mfd_noexec_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_TWO,
+ },
+ { }
+};
+static inline void register_pid_ns_sysctl_table_vm(void)
+{
+ register_sysctl("vm", pid_ns_ctl_table_vm);
+}
+#else
+static inline void initialize_memfd_noexec_scope(struct pid_namespace *ns) {}
+static inline void set_memfd_noexec_scope(struct pid_namespace *ns) {}
+static inline void register_pid_ns_sysctl_table_vm(void) {}
+#endif
+
+#endif /* LINUX_PID_SYSCTL_H */
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index a12779650f15..4b31629c5be4 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -118,7 +118,6 @@ config PM_SLEEP
def_bool y
depends on SUSPEND || HIBERNATE_CALLBACKS
select PM
- select SRCU
config PM_SLEEP_SMP
def_bool y
@@ -143,6 +142,26 @@ config PM_AUTOSLEEP
Allow the kernel to trigger a system transition into a global sleep
state automatically whenever there are no active wakeup sources.
+config PM_USERSPACE_AUTOSLEEP
+ bool "Userspace opportunistic sleep"
+ depends on PM_SLEEP
+ help
+ Notify kernel of aggressive userspace autosleep power management policy.
+
+ This option changes the behavior of various sleep-sensitive code to deal
+ with frequent userspace-initiated transitions into a global sleep state.
+
+ Saying Y here, disables code paths that most users really should keep
+ enabled. In particular, only enable this if it is very common to be
+ asleep/awake for very short periods of time (<= 2 seconds).
+
+ Only platforms, such as Android, that implement opportunistic sleep from
+ a userspace power manager service should enable this option; and not
+ other machines. Therefore, you should say N here, unless you are
+ extremely certain that this is what you want. The option otherwise has
+ bad, undesirable effects, and should not be enabled just for fun.
+
+
config PM_WAKELOCKS
bool "User space wakeup sources interface"
depends on PM_SLEEP
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index 5899260a8bef..874ad834dc8d 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -1,6 +1,10 @@
# SPDX-License-Identifier: GPL-2.0
-ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG
+ifeq ($(CONFIG_DYNAMIC_DEBUG), y)
+CFLAGS_swap.o := -DDEBUG
+CFLAGS_snapshot.o := -DDEBUG
+CFLAGS_energy_model.o := -DDEBUG
+endif
KASAN_SANITIZE_snapshot.o := n
diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c
index 0f4530b3a8cd..7b44f5b89fa1 100644
--- a/kernel/power/energy_model.c
+++ b/kernel/power/energy_model.c
@@ -2,7 +2,7 @@
/*
* Energy Model of devices
*
- * Copyright (c) 2018-2020, Arm ltd.
+ * Copyright (c) 2018-2021, Arm ltd.
* Written by: Quentin Perret, Arm ltd.
* Improvements provided by: Lukasz Luba, Arm ltd.
*/
@@ -10,6 +10,7 @@
#define pr_fmt(fmt) "energy_model: " fmt
#include <linux/cpu.h>
+#include <linux/cpufreq.h>
#include <linux/cpumask.h>
#include <linux/debugfs.h>
#include <linux/energy_model.h>
@@ -42,6 +43,7 @@ static void em_debug_create_ps(struct em_perf_state *ps, struct dentry *pd)
debugfs_create_ulong("frequency", 0444, d, &ps->frequency);
debugfs_create_ulong("power", 0444, d, &ps->power);
debugfs_create_ulong("cost", 0444, d, &ps->cost);
+ debugfs_create_ulong("inefficient", 0444, d, &ps->flags);
}
static int em_debug_cpus_show(struct seq_file *s, void *unused)
@@ -52,16 +54,15 @@ static int em_debug_cpus_show(struct seq_file *s, void *unused)
}
DEFINE_SHOW_ATTRIBUTE(em_debug_cpus);
-static int em_debug_units_show(struct seq_file *s, void *unused)
+static int em_debug_flags_show(struct seq_file *s, void *unused)
{
struct em_perf_domain *pd = s->private;
- char *units = pd->milliwatts ? "milliWatts" : "bogoWatts";
- seq_printf(s, "%s\n", units);
+ seq_printf(s, "%#lx\n", pd->flags);
return 0;
}
-DEFINE_SHOW_ATTRIBUTE(em_debug_units);
+DEFINE_SHOW_ATTRIBUTE(em_debug_flags);
static void em_debug_create_pd(struct device *dev)
{
@@ -75,7 +76,8 @@ static void em_debug_create_pd(struct device *dev)
debugfs_create_file("cpus", 0444, d, dev->em_pd->cpus,
&em_debug_cpus_fops);
- debugfs_create_file("units", 0444, d, dev->em_pd, &em_debug_units_fops);
+ debugfs_create_file("flags", 0444, d, dev->em_pd,
+ &em_debug_flags_fops);
/* Create a sub-directory for each performance state */
for (i = 0; i < dev->em_pd->nr_perf_states; i++)
@@ -85,10 +87,7 @@ static void em_debug_create_pd(struct device *dev)
static void em_debug_remove_pd(struct device *dev)
{
- struct dentry *debug_dir;
-
- debug_dir = debugfs_lookup(dev_name(dev), rootdir);
- debugfs_remove_recursive(debug_dir);
+ debugfs_lookup_and_remove(dev_name(dev), rootdir);
}
static int __init em_debug_init(void)
@@ -105,10 +104,10 @@ static void em_debug_remove_pd(struct device *dev) {}
#endif
static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd,
- int nr_states, struct em_data_callback *cb)
+ int nr_states, struct em_data_callback *cb,
+ unsigned long flags)
{
- unsigned long opp_eff, prev_opp_eff = ULONG_MAX;
- unsigned long power, freq, prev_freq = 0;
+ unsigned long power, freq, prev_freq = 0, prev_cost = ULONG_MAX;
struct em_perf_state *table;
int i, ret;
u64 fmax;
@@ -124,7 +123,7 @@ static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd,
* lowest performance state of 'dev' above 'freq' and updates
* 'power' and 'freq' accordingly.
*/
- ret = cb->active_power(&power, &freq, dev);
+ ret = cb->active_power(dev, &power, &freq);
if (ret) {
dev_err(dev, "EM: invalid perf. state: %d\n",
ret);
@@ -143,7 +142,7 @@ static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd,
/*
* The power returned by active_state() is expected to be
- * positive and to fit into 16 bits.
+ * positive and be in range.
*/
if (!power || power > EM_MAX_POWER) {
dev_err(dev, "EM: invalid power: %lu\n",
@@ -153,25 +152,34 @@ static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd,
table[i].power = power;
table[i].frequency = prev_freq = freq;
-
- /*
- * The hertz/watts efficiency ratio should decrease as the
- * frequency grows on sane platforms. But this isn't always
- * true in practice so warn the user if a higher OPP is more
- * power efficient than a lower one.
- */
- opp_eff = freq / power;
- if (opp_eff >= prev_opp_eff)
- dev_dbg(dev, "EM: hertz/watts ratio non-monotonically decreasing: em_perf_state %d >= em_perf_state%d\n",
- i, i - 1);
- prev_opp_eff = opp_eff;
}
/* Compute the cost of each performance state. */
fmax = (u64) table[nr_states - 1].frequency;
- for (i = 0; i < nr_states; i++) {
- table[i].cost = div64_u64(fmax * table[i].power,
- table[i].frequency);
+ for (i = nr_states - 1; i >= 0; i--) {
+ unsigned long power_res, cost;
+
+ if (flags & EM_PERF_DOMAIN_ARTIFICIAL) {
+ ret = cb->get_cost(dev, table[i].frequency, &cost);
+ if (ret || !cost || cost > EM_MAX_POWER) {
+ dev_err(dev, "EM: invalid cost %lu %d\n",
+ cost, ret);
+ goto free_ps_table;
+ }
+ } else {
+ power_res = table[i].power;
+ cost = div64_u64(fmax * power_res, table[i].frequency);
+ }
+
+ table[i].cost = cost;
+
+ if (table[i].cost >= prev_cost) {
+ table[i].flags = EM_PERF_STATE_INEFFICIENT;
+ dev_dbg(dev, "EM: OPP:%lu is inefficient\n",
+ table[i].frequency);
+ } else {
+ prev_cost = table[i].cost;
+ }
}
pd->table = table;
@@ -185,13 +193,22 @@ free_ps_table:
}
static int em_create_pd(struct device *dev, int nr_states,
- struct em_data_callback *cb, cpumask_t *cpus)
+ struct em_data_callback *cb, cpumask_t *cpus,
+ unsigned long flags)
{
struct em_perf_domain *pd;
struct device *cpu_dev;
- int cpu, ret;
+ int cpu, ret, num_cpus;
if (_is_cpu_device(dev)) {
+ num_cpus = cpumask_weight(cpus);
+
+ /* Prevent max possible energy calculation to not overflow */
+ if (num_cpus > EM_MAX_NUM_CPUS) {
+ dev_err(dev, "EM: too many CPUs, overflow possible\n");
+ return -EINVAL;
+ }
+
pd = kzalloc(sizeof(*pd) + cpumask_size(), GFP_KERNEL);
if (!pd)
return -ENOMEM;
@@ -203,7 +220,7 @@ static int em_create_pd(struct device *dev, int nr_states,
return -ENOMEM;
}
- ret = em_create_perf_table(dev, pd, nr_states, cb);
+ ret = em_create_perf_table(dev, pd, nr_states, cb, flags);
if (ret) {
kfree(pd);
return ret;
@@ -220,6 +237,45 @@ static int em_create_pd(struct device *dev, int nr_states,
return 0;
}
+static void em_cpufreq_update_efficiencies(struct device *dev)
+{
+ struct em_perf_domain *pd = dev->em_pd;
+ struct em_perf_state *table;
+ struct cpufreq_policy *policy;
+ int found = 0;
+ int i;
+
+ if (!_is_cpu_device(dev) || !pd)
+ return;
+
+ policy = cpufreq_cpu_get(cpumask_first(em_span_cpus(pd)));
+ if (!policy) {
+ dev_warn(dev, "EM: Access to CPUFreq policy failed");
+ return;
+ }
+
+ table = pd->table;
+
+ for (i = 0; i < pd->nr_perf_states; i++) {
+ if (!(table[i].flags & EM_PERF_STATE_INEFFICIENT))
+ continue;
+
+ if (!cpufreq_table_set_inefficient(policy, table[i].frequency))
+ found++;
+ }
+
+ cpufreq_cpu_put(policy);
+
+ if (!found)
+ return;
+
+ /*
+ * Efficiencies have been installed in CPUFreq, inefficient frequencies
+ * will be skipped. The EM can do the same.
+ */
+ pd->flags |= EM_PERF_DOMAIN_SKIP_INEFFICIENCIES;
+}
+
/**
* em_pd_get() - Return the performance domain for a device
* @dev : Device to find the performance domain for
@@ -263,13 +319,13 @@ EXPORT_SYMBOL_GPL(em_cpu_get);
* @cpus : Pointer to cpumask_t, which in case of a CPU device is
* obligatory. It can be taken from i.e. 'policy->cpus'. For other
* type of devices this should be set to NULL.
- * @milliwatts : Flag indicating that the power values are in milliWatts or
+ * @microwatts : Flag indicating that the power values are in micro-Watts or
* in some other scale. It must be set properly.
*
* Create Energy Model tables for a performance domain using the callbacks
* defined in cb.
*
- * The @milliwatts is important to set with correct value. Some kernel
+ * The @microwatts is important to set with correct value. Some kernel
* sub-systems might rely on this flag and check if all devices in the EM are
* using the same scale.
*
@@ -280,9 +336,10 @@ EXPORT_SYMBOL_GPL(em_cpu_get);
*/
int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
struct em_data_callback *cb, cpumask_t *cpus,
- bool milliwatts)
+ bool microwatts)
{
unsigned long cap, prev_cap = 0;
+ unsigned long flags = 0;
int cpu, ret;
if (!dev || !nr_states || !cb)
@@ -329,11 +386,18 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
}
}
- ret = em_create_pd(dev, nr_states, cb, cpus);
+ if (microwatts)
+ flags |= EM_PERF_DOMAIN_MICROWATTS;
+ else if (cb->get_cost)
+ flags |= EM_PERF_DOMAIN_ARTIFICIAL;
+
+ ret = em_create_pd(dev, nr_states, cb, cpus, flags);
if (ret)
goto unlock;
- dev->em_pd->milliwatts = milliwatts;
+ dev->em_pd->flags |= flags;
+
+ em_cpufreq_update_efficiencies(dev);
em_debug_create_pd(dev);
dev_info(dev, "EM: created perf domain\n");
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 559acef3fddb..30d1274f03f6 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -28,7 +28,6 @@
#include <linux/gfp.h>
#include <linux/syscore_ops.h>
#include <linux/ctype.h>
-#include <linux/genhd.h>
#include <linux/ktime.h>
#include <linux/security.h>
#include <linux/secretmem.h>
@@ -65,6 +64,7 @@ enum {
static int hibernation_mode = HIBERNATION_SHUTDOWN;
bool freezer_test_done;
+bool snapshot_test;
static const struct platform_hibernation_ops *hibernation_ops;
@@ -84,7 +84,7 @@ bool hibernation_available(void)
{
return nohibernate == 0 &&
!security_locked_down(LOCKDOWN_HIBERNATION) &&
- !secretmem_active();
+ !secretmem_active() && !cxl_mem_active();
}
/**
@@ -93,20 +93,24 @@ bool hibernation_available(void)
*/
void hibernation_set_ops(const struct platform_hibernation_ops *ops)
{
+ unsigned int sleep_flags;
+
if (ops && !(ops->begin && ops->end && ops->pre_snapshot
&& ops->prepare && ops->finish && ops->enter && ops->pre_restore
&& ops->restore_cleanup && ops->leave)) {
WARN_ON(1);
return;
}
- lock_system_sleep();
+
+ sleep_flags = lock_system_sleep();
+
hibernation_ops = ops;
if (ops)
hibernation_mode = HIBERNATION_PLATFORM;
else if (hibernation_mode == HIBERNATION_PLATFORM)
hibernation_mode = HIBERNATION_SHUTDOWN;
- unlock_system_sleep();
+ unlock_system_sleep(sleep_flags);
}
EXPORT_SYMBOL_GPL(hibernation_set_ops);
@@ -300,7 +304,7 @@ static int create_image(int platform_mode)
if (error || hibernation_test(TEST_PLATFORM))
goto Platform_finish;
- error = suspend_disable_secondary_cpus();
+ error = pm_sleep_disable_secondary_cpus();
if (error || hibernation_test(TEST_CPUS))
goto Enable_cpus;
@@ -342,7 +346,7 @@ static int create_image(int platform_mode)
local_irq_enable();
Enable_cpus:
- suspend_enable_secondary_cpus();
+ pm_sleep_enable_secondary_cpus();
/* Allow architectures to do nosmt-specific post-resume dances */
if (!in_suspend)
@@ -466,6 +470,8 @@ static int resume_target_kernel(bool platform_mode)
if (error)
goto Cleanup;
+ cpuidle_pause();
+
error = hibernate_resume_nonboot_cpu_disable();
if (error)
goto Enable_cpus;
@@ -509,7 +515,7 @@ static int resume_target_kernel(bool platform_mode)
local_irq_enable();
Enable_cpus:
- suspend_enable_secondary_cpus();
+ pm_sleep_enable_secondary_cpus();
Cleanup:
platform_restore_cleanup(platform_mode);
@@ -587,7 +593,7 @@ int hibernation_platform_enter(void)
if (error)
goto Platform_finish;
- error = suspend_disable_secondary_cpus();
+ error = pm_sleep_disable_secondary_cpus();
if (error)
goto Enable_cpus;
@@ -609,7 +615,7 @@ int hibernation_platform_enter(void)
local_irq_enable();
Enable_cpus:
- suspend_enable_secondary_cpus();
+ pm_sleep_enable_secondary_cpus();
Platform_finish:
hibernation_ops->finish();
@@ -640,7 +646,7 @@ static void power_down(void)
int error;
if (hibernation_mode == HIBERNATION_SUSPEND) {
- error = suspend_devices_and_enter(PM_SUSPEND_MEM);
+ error = suspend_devices_and_enter(mem_sleep_current);
if (error) {
hibernation_mode = hibernation_ops ?
HIBERNATION_PLATFORM :
@@ -664,7 +670,7 @@ static void power_down(void)
hibernation_platform_enter();
fallthrough;
case HIBERNATION_SHUTDOWN:
- if (pm_power_off)
+ if (kernel_can_power_off())
kernel_power_off();
break;
}
@@ -682,16 +688,22 @@ static int load_image_and_restore(void)
{
int error;
unsigned int flags;
+ fmode_t mode = FMODE_READ;
+
+ if (snapshot_test)
+ mode |= FMODE_EXCL;
pm_pr_dbg("Loading hibernation image.\n");
lock_device_hotplug();
error = create_basic_memory_bitmaps();
- if (error)
+ if (error) {
+ swsusp_close(mode);
goto Unlock;
+ }
error = swsusp_read(&flags);
- swsusp_close(FMODE_READ);
+ swsusp_close(mode);
if (!error)
error = hibernation_restore(flags & SF_PLATFORM_MODE);
@@ -709,7 +721,7 @@ static int load_image_and_restore(void)
*/
int hibernate(void)
{
- bool snapshot_test = false;
+ unsigned int sleep_flags;
int error;
if (!hibernation_available()) {
@@ -717,7 +729,7 @@ int hibernate(void)
return -EPERM;
}
- lock_system_sleep();
+ sleep_flags = lock_system_sleep();
/* The snapshot device should not be opened while we're running */
if (!hibernate_acquire()) {
error = -EBUSY;
@@ -736,6 +748,9 @@ int hibernate(void)
if (error)
goto Exit;
+ /* protected by system_transition_mutex */
+ snapshot_test = false;
+
lock_device_hotplug();
/* Allocate memory management structures */
error = create_basic_memory_bitmaps();
@@ -791,7 +806,7 @@ int hibernate(void)
pm_restore_console();
hibernate_release();
Unlock:
- unlock_system_sleep();
+ unlock_system_sleep(sleep_flags);
pr_info("hibernation exit\n");
return error;
@@ -806,9 +821,10 @@ int hibernate(void)
*/
int hibernate_quiet_exec(int (*func)(void *data), void *data)
{
+ unsigned int sleep_flags;
int error;
- lock_system_sleep();
+ sleep_flags = lock_system_sleep();
if (!hibernate_acquire()) {
error = -EBUSY;
@@ -888,7 +904,7 @@ restore:
hibernate_release();
unlock:
- unlock_system_sleep();
+ unlock_system_sleep(sleep_flags);
return error;
}
@@ -931,6 +947,8 @@ static int software_resume(void)
*/
mutex_lock_nested(&system_transition_mutex, SINGLE_DEPTH_NESTING);
+ snapshot_test = false;
+
if (swsusp_resume_device)
goto Check_image;
@@ -981,7 +999,7 @@ static int software_resume(void)
/* The snapshot device should not be opened while we're running */
if (!hibernate_acquire()) {
error = -EBUSY;
- swsusp_close(FMODE_READ);
+ swsusp_close(FMODE_READ | FMODE_EXCL);
goto Unlock;
}
@@ -1016,7 +1034,7 @@ static int software_resume(void)
pm_pr_dbg("Hibernation image not present or could not be loaded.\n");
return error;
Close_Finish:
- swsusp_close(FMODE_READ);
+ swsusp_close(FMODE_READ | FMODE_EXCL);
goto Finish;
}
@@ -1097,11 +1115,12 @@ static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr,
static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
const char *buf, size_t n)
{
+ int mode = HIBERNATION_INVALID;
+ unsigned int sleep_flags;
int error = 0;
- int i;
int len;
char *p;
- int mode = HIBERNATION_INVALID;
+ int i;
if (!hibernation_available())
return -EPERM;
@@ -1109,7 +1128,7 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
p = memchr(buf, '\n', n);
len = p ? p - buf : n;
- lock_system_sleep();
+ sleep_flags = lock_system_sleep();
for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) {
if (len == strlen(hibernation_modes[i])
&& !strncmp(buf, hibernation_modes[i], len)) {
@@ -1139,7 +1158,7 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
if (!error)
pm_pr_dbg("Hibernation mode set to '%s'\n",
hibernation_modes[mode]);
- unlock_system_sleep();
+ unlock_system_sleep(sleep_flags);
return error ? error : n;
}
@@ -1155,9 +1174,10 @@ static ssize_t resume_show(struct kobject *kobj, struct kobj_attribute *attr,
static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr,
const char *buf, size_t n)
{
- dev_t res;
+ unsigned int sleep_flags;
int len = n;
char *name;
+ dev_t res;
if (len && buf[len-1] == '\n')
len--;
@@ -1170,9 +1190,10 @@ static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr,
if (!res)
return -EINVAL;
- lock_system_sleep();
+ sleep_flags = lock_system_sleep();
swsusp_resume_device = res;
- unlock_system_sleep();
+ unlock_system_sleep(sleep_flags);
+
pm_pr_dbg("Configured hibernation resume from disk to %u\n",
swsusp_resume_device);
noresume = 0;
@@ -1326,7 +1347,7 @@ static int __init resumedelay_setup(char *str)
int rc = kstrtouint(str, 0, &resume_delay);
if (rc)
- return rc;
+ pr_warn("resumedelay: bad option string '%s'\n", str);
return 1;
}
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 12c7e1bb442f..3113ec2f1db4 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -6,6 +6,7 @@
* Copyright (c) 2003 Open Source Development Lab
*/
+#include <linux/acpi.h>
#include <linux/export.h>
#include <linux/kobject.h>
#include <linux/string.h>
@@ -21,14 +22,16 @@
#ifdef CONFIG_PM_SLEEP
-void lock_system_sleep(void)
+unsigned int lock_system_sleep(void)
{
- current->flags |= PF_FREEZER_SKIP;
+ unsigned int flags = current->flags;
+ current->flags |= PF_NOFREEZE;
mutex_lock(&system_transition_mutex);
+ return flags;
}
EXPORT_SYMBOL_GPL(lock_system_sleep);
-void unlock_system_sleep(void)
+void unlock_system_sleep(unsigned int flags)
{
/*
* Don't use freezer_count() because we don't want the call to
@@ -46,7 +49,8 @@ void unlock_system_sleep(void)
* Which means, if we use try_to_freeze() here, it would make them
* enter the refrigerator, thus causing hibernation to lockup.
*/
- current->flags &= ~PF_FREEZER_SKIP;
+ if (!(flags & PF_NOFREEZE))
+ current->flags &= ~PF_NOFREEZE;
mutex_unlock(&system_transition_mutex);
}
EXPORT_SYMBOL_GPL(unlock_system_sleep);
@@ -80,6 +84,19 @@ int unregister_pm_notifier(struct notifier_block *nb)
}
EXPORT_SYMBOL_GPL(unregister_pm_notifier);
+void pm_report_hw_sleep_time(u64 t)
+{
+ suspend_stats.last_hw_sleep = t;
+ suspend_stats.total_hw_sleep += t;
+}
+EXPORT_SYMBOL_GPL(pm_report_hw_sleep_time);
+
+void pm_report_max_hw_sleep(u64 t)
+{
+ suspend_stats.max_hw_sleep = t;
+}
+EXPORT_SYMBOL_GPL(pm_report_max_hw_sleep);
+
int pm_notifier_call_chain_robust(unsigned long val_up, unsigned long val_down)
{
int ret;
@@ -127,7 +144,9 @@ static ssize_t mem_sleep_show(struct kobject *kobj, struct kobj_attribute *attr,
char *s = buf;
suspend_state_t i;
- for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++)
+ for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) {
+ if (i >= PM_SUSPEND_MEM && cxl_mem_active())
+ continue;
if (mem_sleep_states[i]) {
const char *label = mem_sleep_states[i];
@@ -136,6 +155,7 @@ static ssize_t mem_sleep_show(struct kobject *kobj, struct kobj_attribute *attr,
else
s += sprintf(s, "%s ", label);
}
+ }
/* Convert the last space to a newline if needed. */
if (s != buf)
@@ -260,16 +280,17 @@ static ssize_t pm_test_show(struct kobject *kobj, struct kobj_attribute *attr,
static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr,
const char *buf, size_t n)
{
+ unsigned int sleep_flags;
const char * const *s;
+ int error = -EINVAL;
int level;
char *p;
int len;
- int error = -EINVAL;
p = memchr(buf, '\n', n);
len = p ? p - buf : n;
- lock_system_sleep();
+ sleep_flags = lock_system_sleep();
level = TEST_FIRST;
for (s = &pm_tests[level]; level <= TEST_MAX; s++, level++)
@@ -279,7 +300,7 @@ static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr,
break;
}
- unlock_system_sleep();
+ unlock_system_sleep(sleep_flags);
return error ? error : n;
}
@@ -307,24 +328,27 @@ static char *suspend_step_name(enum suspend_stat_step step)
}
}
-#define suspend_attr(_name) \
+#define suspend_attr(_name, format_str) \
static ssize_t _name##_show(struct kobject *kobj, \
struct kobj_attribute *attr, char *buf) \
{ \
- return sprintf(buf, "%d\n", suspend_stats._name); \
+ return sprintf(buf, format_str, suspend_stats._name); \
} \
static struct kobj_attribute _name = __ATTR_RO(_name)
-suspend_attr(success);
-suspend_attr(fail);
-suspend_attr(failed_freeze);
-suspend_attr(failed_prepare);
-suspend_attr(failed_suspend);
-suspend_attr(failed_suspend_late);
-suspend_attr(failed_suspend_noirq);
-suspend_attr(failed_resume);
-suspend_attr(failed_resume_early);
-suspend_attr(failed_resume_noirq);
+suspend_attr(success, "%d\n");
+suspend_attr(fail, "%d\n");
+suspend_attr(failed_freeze, "%d\n");
+suspend_attr(failed_prepare, "%d\n");
+suspend_attr(failed_suspend, "%d\n");
+suspend_attr(failed_suspend_late, "%d\n");
+suspend_attr(failed_suspend_noirq, "%d\n");
+suspend_attr(failed_resume, "%d\n");
+suspend_attr(failed_resume_early, "%d\n");
+suspend_attr(failed_resume_noirq, "%d\n");
+suspend_attr(last_hw_sleep, "%llu\n");
+suspend_attr(total_hw_sleep, "%llu\n");
+suspend_attr(max_hw_sleep, "%llu\n");
static ssize_t last_failed_dev_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
@@ -384,12 +408,30 @@ static struct attribute *suspend_attrs[] = {
&last_failed_dev.attr,
&last_failed_errno.attr,
&last_failed_step.attr,
+ &last_hw_sleep.attr,
+ &total_hw_sleep.attr,
+ &max_hw_sleep.attr,
NULL,
};
+static umode_t suspend_attr_is_visible(struct kobject *kobj, struct attribute *attr, int idx)
+{
+ if (attr != &last_hw_sleep.attr &&
+ attr != &total_hw_sleep.attr &&
+ attr != &max_hw_sleep.attr)
+ return 0444;
+
+#ifdef CONFIG_ACPI
+ if (acpi_gbl_FADT.flags & ACPI_FADT_LOW_POWER_S0)
+ return 0444;
+#endif
+ return 0;
+}
+
static const struct attribute_group suspend_attr_group = {
.name = "suspend_stats",
.attrs = suspend_attrs,
+ .is_visible = suspend_attr_is_visible,
};
#ifdef CONFIG_DEBUG_FS
@@ -504,7 +546,10 @@ static ssize_t pm_wakeup_irq_show(struct kobject *kobj,
struct kobj_attribute *attr,
char *buf)
{
- return pm_wakeup_irq ? sprintf(buf, "%u\n", pm_wakeup_irq) : -ENODATA;
+ if (!pm_wakeup_irq())
+ return -ENODATA;
+
+ return sprintf(buf, "%u\n", pm_wakeup_irq());
}
power_attr_ro(pm_wakeup_irq);
@@ -542,42 +587,13 @@ static int __init pm_debug_messages_setup(char *str)
}
__setup("pm_debug_messages", pm_debug_messages_setup);
-/**
- * __pm_pr_dbg - Print a suspend debug message to the kernel log.
- * @defer: Whether or not to use printk_deferred() to print the message.
- * @fmt: Message format.
- *
- * The message will be emitted if enabled through the pm_debug_messages
- * sysfs attribute.
- */
-void __pm_pr_dbg(bool defer, const char *fmt, ...)
-{
- struct va_format vaf;
- va_list args;
-
- if (!pm_debug_messages_on)
- return;
-
- va_start(args, fmt);
-
- vaf.fmt = fmt;
- vaf.va = &args;
-
- if (defer)
- printk_deferred(KERN_DEBUG "PM: %pV", &vaf);
- else
- printk(KERN_DEBUG "PM: %pV", &vaf);
-
- va_end(args);
-}
-
#else /* !CONFIG_PM_SLEEP_DEBUG */
static inline void pm_print_times_init(void) {}
#endif /* CONFIG_PM_SLEEP_DEBUG */
struct kobject *power_kobj;
-/**
+/*
* state - control system sleep states.
*
* show() returns available sleep state labels, which may be "mem", "standby",
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 778bf431ec02..b83c8d5e188d 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -4,6 +4,8 @@
#include <linux/utsname.h>
#include <linux/freezer.h>
#include <linux/compiler.h>
+#include <linux/cpu.h>
+#include <linux/cpuidle.h>
struct swsusp_info {
struct new_utsname uts;
@@ -57,6 +59,7 @@ asmlinkage int swsusp_save(void);
/* kernel/power/hibernate.c */
extern bool freezer_test_done;
+extern bool snapshot_test;
extern int hibernation_snapshot(int platform_mode);
extern int hibernation_restore(int platform_mode);
@@ -168,6 +171,7 @@ extern int swsusp_swap_in_use(void);
#define SF_PLATFORM_MODE 1
#define SF_NOCOMPRESS_MODE 2
#define SF_CRC32_MODE 4
+#define SF_HW_SIG 8
/* kernel/power/hibernate.c */
extern int swsusp_check(void);
@@ -310,3 +314,15 @@ extern int pm_wake_lock(const char *buf);
extern int pm_wake_unlock(const char *buf);
#endif /* !CONFIG_PM_WAKELOCKS */
+
+static inline int pm_sleep_disable_secondary_cpus(void)
+{
+ cpuidle_pause();
+ return suspend_disable_secondary_cpus();
+}
+
+static inline void pm_sleep_enable_secondary_cpus(void)
+{
+ suspend_enable_secondary_cpus();
+ cpuidle_resume();
+}
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 37401c99b7d7..cae81a87cc91 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -6,9 +6,6 @@
* Originally from swsusp.
*/
-
-#undef DEBUG
-
#include <linux/interrupt.h>
#include <linux/oom.h>
#include <linux/suspend.h>
@@ -30,6 +27,8 @@ unsigned int __read_mostly freeze_timeout_msecs = 20 * MSEC_PER_SEC;
static int try_to_freeze_tasks(bool user_only)
{
+ const char *what = user_only ? "user space processes" :
+ "remaining freezable tasks";
struct task_struct *g, *p;
unsigned long end_time;
unsigned int todo;
@@ -39,6 +38,8 @@ static int try_to_freeze_tasks(bool user_only)
bool wakeup = false;
int sleep_usecs = USEC_PER_MSEC;
+ pr_info("Freezing %s\n", what);
+
start = ktime_get_boottime();
end_time = jiffies + msecs_to_jiffies(freeze_timeout_msecs);
@@ -53,8 +54,7 @@ static int try_to_freeze_tasks(bool user_only)
if (p == current || !freeze_task(p))
continue;
- if (!freezer_should_skip(p))
- todo++;
+ todo++;
}
read_unlock(&tasklist_lock);
@@ -86,28 +86,26 @@ static int try_to_freeze_tasks(bool user_only)
elapsed_msecs = ktime_to_ms(elapsed);
if (todo) {
- pr_cont("\n");
- pr_err("Freezing of tasks %s after %d.%03d seconds "
- "(%d tasks refusing to freeze, wq_busy=%d):\n",
+ pr_err("Freezing %s %s after %d.%03d seconds "
+ "(%d tasks refusing to freeze, wq_busy=%d):\n", what,
wakeup ? "aborted" : "failed",
elapsed_msecs / 1000, elapsed_msecs % 1000,
todo - wq_busy, wq_busy);
if (wq_busy)
- show_workqueue_state();
+ show_freezable_workqueues();
if (!wakeup || pm_debug_messages_on) {
read_lock(&tasklist_lock);
for_each_process_thread(g, p) {
- if (p != current && !freezer_should_skip(p)
- && freezing(p) && !frozen(p))
+ if (p != current && freezing(p) && !frozen(p))
sched_show_task(p);
}
read_unlock(&tasklist_lock);
}
} else {
- pr_cont("(elapsed %d.%03d seconds) ", elapsed_msecs / 1000,
- elapsed_msecs % 1000);
+ pr_info("Freezing %s completed (elapsed %d.%03d seconds)\n",
+ what, elapsed_msecs / 1000, elapsed_msecs % 1000);
}
return todo ? -EBUSY : 0;
@@ -132,17 +130,14 @@ int freeze_processes(void)
current->flags |= PF_SUSPEND_TASK;
if (!pm_freezing)
- atomic_inc(&system_freezing_cnt);
+ static_branch_inc(&freezer_active);
- pm_wakeup_clear(true);
- pr_info("Freezing user space processes ... ");
+ pm_wakeup_clear(0);
pm_freezing = true;
error = try_to_freeze_tasks(true);
- if (!error) {
+ if (!error)
__usermodehelper_set_disable_depth(UMH_DISABLED);
- pr_cont("done.");
- }
- pr_cont("\n");
+
BUG_ON(in_atomic());
/*
@@ -171,14 +166,9 @@ int freeze_kernel_threads(void)
{
int error;
- pr_info("Freezing remaining freezable tasks ... ");
-
pm_nosig_freezing = true;
error = try_to_freeze_tasks(false);
- if (!error)
- pr_cont("done.");
- pr_cont("\n");
BUG_ON(in_atomic());
if (error)
@@ -193,7 +183,7 @@ void thaw_processes(void)
trace_suspend_resume(TPS("thaw_processes"), 0, true);
if (pm_freezing)
- atomic_dec(&system_freezing_cnt);
+ static_branch_dec(&freezer_active);
pm_freezing = false;
pm_nosig_freezing = false;
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index ec7e1e85923e..af51ed6d45ef 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -531,7 +531,7 @@ int freq_qos_add_request(struct freq_constraints *qos,
{
int ret;
- if (IS_ERR_OR_NULL(qos) || !req)
+ if (IS_ERR_OR_NULL(qos) || !req || value < 0)
return -EINVAL;
if (WARN(freq_qos_request_active(req),
@@ -563,7 +563,7 @@ EXPORT_SYMBOL_GPL(freq_qos_add_request);
*/
int freq_qos_update_request(struct freq_qos_request *req, s32 new_value)
{
- if (!req)
+ if (!req || new_value < 0)
return -EINVAL;
if (WARN(!freq_qos_request_active(req),
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index f7a986078213..cd8b7b35f1e8 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -326,7 +326,7 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size)
return ret;
}
-/**
+/*
* Data types related to memory bitmaps.
*
* Memory bitmap is a structure consisting of many linked lists of
@@ -427,6 +427,10 @@ struct memory_bitmap {
/**
* alloc_rtree_node - Allocate a new node and add it to the radix tree.
+ * @gfp_mask: GFP mask for the allocation.
+ * @safe_needed: Get pages not used before hibernation (restore only)
+ * @ca: Pointer to a linked list of pages ("a chain") to allocate from
+ * @list: Radix Tree node to add.
*
* This function is used to allocate inner nodes as well as the
* leave nodes of the radix tree. It also adds the node to the
@@ -902,7 +906,7 @@ static bool rtree_next_node(struct memory_bitmap *bm)
}
/**
- * memory_bm_rtree_next_pfn - Find the next set bit in a memory bitmap.
+ * memory_bm_next_pfn - Find the next set bit in a memory bitmap.
* @bm: Memory bitmap.
*
* Starting from the last returned position this function searches for the next
@@ -978,8 +982,7 @@ static void memory_bm_recycle(struct memory_bitmap *bm)
* Register a range of page frames the contents of which should not be saved
* during hibernation (to be used in the early initialization code).
*/
-void __init __register_nosave_region(unsigned long start_pfn,
- unsigned long end_pfn, int use_kmalloc)
+void __init register_nosave_region(unsigned long start_pfn, unsigned long end_pfn)
{
struct nosave_region *region;
@@ -995,18 +998,12 @@ void __init __register_nosave_region(unsigned long start_pfn,
goto Report;
}
}
- if (use_kmalloc) {
- /* During init, this shouldn't fail */
- region = kmalloc(sizeof(struct nosave_region), GFP_KERNEL);
- BUG_ON(!region);
- } else {
- /* This allocation cannot fail */
- region = memblock_alloc(sizeof(struct nosave_region),
- SMP_CACHE_BYTES);
- if (!region)
- panic("%s: Failed to allocate %zu bytes\n", __func__,
- sizeof(struct nosave_region));
- }
+ /* This allocation cannot fail */
+ region = memblock_alloc(sizeof(struct nosave_region),
+ SMP_CACHE_BYTES);
+ if (!region)
+ panic("%s: Failed to allocate %zu bytes\n", __func__,
+ sizeof(struct nosave_region));
region->start_pfn = start_pfn;
region->end_pfn = end_pfn;
list_add_tail(&region->list, &nosave_regions);
@@ -1726,8 +1723,8 @@ static unsigned long minimum_image_size(unsigned long saveable)
* /sys/power/reserved_size, respectively). To make this happen, we compute the
* total number of available page frames and allocate at least
*
- * ([page frames total] + PAGES_FOR_IO + [metadata pages]) / 2
- * + 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE)
+ * ([page frames total] - PAGES_FOR_IO - [metadata pages]) / 2
+ * - 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE)
*
* of them, which corresponds to the maximum size of a hibernation image.
*
@@ -1944,7 +1941,7 @@ static inline int get_highmem_buffer(int safe_needed)
}
/**
- * alloc_highmem_image_pages - Allocate some highmem pages for the image.
+ * alloc_highmem_pages - Allocate some highmem pages for the image.
*
* Try to allocate as many pages as needed, but if the number of free highmem
* pages is less than that, allocate them all.
@@ -2231,7 +2228,7 @@ static int check_header(struct swsusp_info *info)
}
/**
- * load header - Check the image header and copy the data from it.
+ * load_header - Check the image header and copy the data from it.
*/
static int load_header(struct swsusp_info *info)
{
@@ -2262,10 +2259,14 @@ static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
if (unlikely(buf[j] == BM_END_OF_MAP))
break;
- if (pfn_valid(buf[j]) && memory_bm_pfn_present(bm, buf[j]))
+ if (pfn_valid(buf[j]) && memory_bm_pfn_present(bm, buf[j])) {
memory_bm_set_bit(bm, buf[j]);
- else
+ } else {
+ if (!pfn_valid(buf[j]))
+ pr_err(FW_BUG "Memory map mismatch at 0x%llx after hibernation\n",
+ (unsigned long long)PFN_PHYS(buf[j]));
return -EFAULT;
+ }
}
return 0;
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index d8cae434f9eb..fa3bf161d13f 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -75,9 +75,11 @@ EXPORT_SYMBOL_GPL(pm_suspend_default_s2idle);
void s2idle_set_ops(const struct platform_s2idle_ops *ops)
{
- lock_system_sleep();
+ unsigned int sleep_flags;
+
+ sleep_flags = lock_system_sleep();
s2idle_ops = ops;
- unlock_system_sleep();
+ unlock_system_sleep(sleep_flags);
}
static void s2idle_begin(void)
@@ -96,8 +98,7 @@ static void s2idle_enter(void)
s2idle_state = S2IDLE_STATE_ENTER;
raw_spin_unlock_irq(&s2idle_lock);
- get_online_cpus();
- cpuidle_resume();
+ cpus_read_lock();
/* Push all the CPUs into the idle loop. */
wake_up_all_idle_cpus();
@@ -105,8 +106,7 @@ static void s2idle_enter(void)
swait_event_exclusive(s2idle_wait_head,
s2idle_state == S2IDLE_STATE_WAKE);
- cpuidle_pause();
- put_online_cpus();
+ cpus_read_unlock();
raw_spin_lock_irq(&s2idle_lock);
@@ -138,7 +138,8 @@ static void s2idle_loop(void)
break;
}
- pm_wakeup_clear(false);
+ if (s2idle_ops && s2idle_ops->check)
+ s2idle_ops->check();
s2idle_enter();
}
@@ -162,11 +163,13 @@ EXPORT_SYMBOL_GPL(s2idle_wake);
static bool valid_state(suspend_state_t state)
{
/*
- * PM_SUSPEND_STANDBY and PM_SUSPEND_MEM states need low level
- * support and need to be valid to the low level
- * implementation, no valid callback implies that none are valid.
+ * The PM_SUSPEND_STANDBY and PM_SUSPEND_MEM states require low-level
+ * support and need to be valid to the low-level implementation.
+ *
+ * No ->valid() or ->enter() callback implies that none are valid.
*/
- return suspend_ops && suspend_ops->valid && suspend_ops->valid(state);
+ return suspend_ops && suspend_ops->valid && suspend_ops->valid(state) &&
+ suspend_ops->enter;
}
void __init pm_states_init(void)
@@ -202,7 +205,9 @@ __setup("mem_sleep_default=", mem_sleep_default_setup);
*/
void suspend_set_ops(const struct platform_suspend_ops *ops)
{
- lock_system_sleep();
+ unsigned int sleep_flags;
+
+ sleep_flags = lock_system_sleep();
suspend_ops = ops;
@@ -218,7 +223,7 @@ void suspend_set_ops(const struct platform_suspend_ops *ops)
mem_sleep_current = PM_SUSPEND_MEM;
}
- unlock_system_sleep();
+ unlock_system_sleep(sleep_flags);
}
EXPORT_SYMBOL_GPL(suspend_set_ops);
@@ -238,7 +243,8 @@ EXPORT_SYMBOL_GPL(suspend_valid_only_mem);
static bool sleep_state_supported(suspend_state_t state)
{
- return state == PM_SUSPEND_TO_IDLE || (suspend_ops && suspend_ops->enter);
+ return state == PM_SUSPEND_TO_IDLE ||
+ (valid_state(state) && !cxl_mem_active());
}
static int platform_suspend_prepare(suspend_state_t state)
@@ -422,7 +428,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
goto Platform_wake;
}
- error = suspend_disable_secondary_cpus();
+ error = pm_sleep_disable_secondary_cpus();
if (error || suspend_test(TEST_CPUS))
goto Enable_cpus;
@@ -452,7 +458,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
BUG_ON(irqs_disabled());
Enable_cpus:
- suspend_enable_secondary_cpus();
+ pm_sleep_enable_secondary_cpus();
Platform_wake:
platform_resume_noirq(state);
diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c
index e1ed58adb69e..b663a97f5867 100644
--- a/kernel/power/suspend_test.c
+++ b/kernel/power/suspend_test.c
@@ -129,7 +129,7 @@ static int __init has_wakealarm(struct device *dev, const void *data)
{
struct rtc_device *candidate = to_rtc_device(dev);
- if (!candidate->ops->set_alarm)
+ if (!test_bit(RTC_FEATURE_ALARM, candidate->features))
return 0;
if (!device_may_wakeup(candidate->dev.parent))
return 0;
@@ -157,22 +157,22 @@ static int __init setup_test_suspend(char *value)
value++;
suspend_type = strsep(&value, ",");
if (!suspend_type)
- return 0;
+ return 1;
repeat = strsep(&value, ",");
if (repeat) {
if (kstrtou32(repeat, 0, &test_repeat_count_max))
- return 0;
+ return 1;
}
for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++)
if (!strcmp(pm_labels[i], suspend_type)) {
test_state_label = pm_labels[i];
- return 0;
+ return 1;
}
printk(warn_bad_state, suspend_type);
- return 0;
+ return 1;
}
__setup("test_suspend", setup_test_suspend);
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 3cb89baebc79..92e41ed292ad 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -16,7 +16,6 @@
#include <linux/file.h>
#include <linux/delay.h>
#include <linux/bitops.h>
-#include <linux/genhd.h>
#include <linux/device.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
@@ -36,6 +35,8 @@
#define HIBERNATE_SIG "S1SUSPEND"
+u32 swsusp_hardware_signature;
+
/*
* When reading an {un,}compressed image, we may restore pages in place,
* in which case some architectures need these pages cleaning before they
@@ -87,7 +88,7 @@ struct swap_map_page_list {
struct swap_map_page_list *next;
};
-/**
+/*
* The swap_map_handle structure is used for handling swap in
* a file-alike way
*/
@@ -104,7 +105,8 @@ struct swap_map_handle {
struct swsusp_header {
char reserved[PAGE_SIZE - 20 - sizeof(sector_t) - sizeof(int) -
- sizeof(u32)];
+ sizeof(u32) - sizeof(u32)];
+ u32 hw_sig;
u32 crc32;
sector_t image;
unsigned int flags; /* Flags to pass to the "boot" kernel */
@@ -114,7 +116,7 @@ struct swsusp_header {
static struct swsusp_header *swsusp_header;
-/**
+/*
* The following functions are used for tracing the allocated
* swap pages, so that they can be freed in case of an error.
*/
@@ -168,7 +170,7 @@ static int swsusp_extents_insert(unsigned long swap_offset)
return 0;
}
-/**
+/*
* alloc_swapdev_block - allocate a swap page and register that it has
* been allocated, so that it can be freed in case of an error.
*/
@@ -187,7 +189,7 @@ sector_t alloc_swapdev_block(int swap)
return 0;
}
-/**
+/*
* free_all_swap_pages - free swap pages allocated for saving image data.
* It also frees the extents used to register which swap entries had been
* allocated.
@@ -267,17 +269,15 @@ static void hib_end_io(struct bio *bio)
bio_put(bio);
}
-static int hib_submit_io(int op, int op_flags, pgoff_t page_off, void *addr,
- struct hib_bio_batch *hb)
+static int hib_submit_io(blk_opf_t opf, pgoff_t page_off, void *addr,
+ struct hib_bio_batch *hb)
{
struct page *page = virt_to_page(addr);
struct bio *bio;
int error = 0;
- bio = bio_alloc(GFP_NOIO | __GFP_HIGH, 1);
+ bio = bio_alloc(hib_resume_bdev, 1, opf, GFP_NOIO | __GFP_HIGH);
bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9);
- bio_set_dev(bio, hib_resume_bdev);
- bio_set_op_attrs(bio, op, op_flags);
if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
pr_err("Adding page to bio failed at %llu\n",
@@ -299,7 +299,7 @@ static int hib_submit_io(int op, int op_flags, pgoff_t page_off, void *addr,
return error;
}
-static blk_status_t hib_wait_io(struct hib_bio_batch *hb)
+static int hib_wait_io(struct hib_bio_batch *hb)
{
/*
* We are relying on the behavior of blk_plug that a thread with
@@ -312,22 +312,24 @@ static blk_status_t hib_wait_io(struct hib_bio_batch *hb)
/*
* Saving part
*/
-
static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags)
{
int error;
- hib_submit_io(REQ_OP_READ, 0, swsusp_resume_block,
- swsusp_header, NULL);
+ hib_submit_io(REQ_OP_READ, swsusp_resume_block, swsusp_header, NULL);
if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) ||
!memcmp("SWAPSPACE2",swsusp_header->sig, 10)) {
memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10);
memcpy(swsusp_header->sig, HIBERNATE_SIG, 10);
swsusp_header->image = handle->first_sector;
+ if (swsusp_hardware_signature) {
+ swsusp_header->hw_sig = swsusp_hardware_signature;
+ flags |= SF_HW_SIG;
+ }
swsusp_header->flags = flags;
if (flags & SF_CRC32_MODE)
swsusp_header->crc32 = handle->crc32;
- error = hib_submit_io(REQ_OP_WRITE, REQ_SYNC,
+ error = hib_submit_io(REQ_OP_WRITE | REQ_SYNC,
swsusp_resume_block, swsusp_header, NULL);
} else {
pr_err("Swap header not found!\n");
@@ -404,7 +406,7 @@ static int write_page(void *buf, sector_t offset, struct hib_bio_batch *hb)
} else {
src = buf;
}
- return hib_submit_io(REQ_OP_WRITE, REQ_SYNC, offset, src, hb);
+ return hib_submit_io(REQ_OP_WRITE | REQ_SYNC, offset, src, hb);
}
static void release_swap_writer(struct swap_map_handle *handle)
@@ -579,7 +581,7 @@ static int save_image(struct swap_map_handle *handle,
return ret;
}
-/**
+/*
* Structure used for CRC32.
*/
struct crc_data {
@@ -594,7 +596,7 @@ struct crc_data {
unsigned char *unc[LZO_THREADS]; /* uncompressed data */
};
-/**
+/*
* CRC32 update function that runs in its own thread.
*/
static int crc32_threadfn(void *data)
@@ -621,7 +623,7 @@ static int crc32_threadfn(void *data)
}
return 0;
}
-/**
+/*
* Structure used for LZO data compression.
*/
struct cmp_data {
@@ -638,7 +640,7 @@ struct cmp_data {
unsigned char wrk[LZO1X_1_MEM_COMPRESS]; /* compression workspace */
};
-/**
+/*
* Compression function that runs in its own thread.
*/
static int lzo_compress_threadfn(void *data)
@@ -705,22 +707,19 @@ static int save_image_lzo(struct swap_map_handle *handle,
goto out_clean;
}
- data = vmalloc(array_size(nr_threads, sizeof(*data)));
+ data = vzalloc(array_size(nr_threads, sizeof(*data)));
if (!data) {
pr_err("Failed to allocate LZO data\n");
ret = -ENOMEM;
goto out_clean;
}
- for (thr = 0; thr < nr_threads; thr++)
- memset(&data[thr], 0, offsetof(struct cmp_data, go));
- crc = kmalloc(sizeof(*crc), GFP_KERNEL);
+ crc = kzalloc(sizeof(*crc), GFP_KERNEL);
if (!crc) {
pr_err("Failed to allocate crc\n");
ret = -ENOMEM;
goto out_clean;
}
- memset(crc, 0, offsetof(struct crc_data, go));
/*
* Start the compression threads.
@@ -949,9 +948,9 @@ out_finish:
return error;
}
-/**
+/*
* The following functions allow us to read data using a swap map
- * in a file-alike way
+ * in a file-like way.
*/
static void release_swap_reader(struct swap_map_handle *handle)
@@ -1002,7 +1001,7 @@ static int get_swap_reader(struct swap_map_handle *handle,
return -ENOMEM;
}
- error = hib_submit_io(REQ_OP_READ, 0, offset, tmp->map, NULL);
+ error = hib_submit_io(REQ_OP_READ, offset, tmp->map, NULL);
if (error) {
release_swap_reader(handle);
return error;
@@ -1026,7 +1025,7 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf,
offset = handle->cur->entries[handle->k];
if (!offset)
return -EFAULT;
- error = hib_submit_io(REQ_OP_READ, 0, offset, buf, hb);
+ error = hib_submit_io(REQ_OP_READ, offset, buf, hb);
if (error)
return error;
if (++handle->k >= MAP_PAGE_ENTRIES) {
@@ -1108,7 +1107,7 @@ static int load_image(struct swap_map_handle *handle,
return ret;
}
-/**
+/*
* Structure used for LZO data decompression.
*/
struct dec_data {
@@ -1124,7 +1123,7 @@ struct dec_data {
unsigned char cmp[LZO_CMP_SIZE]; /* compressed buffer */
};
-/**
+/*
* Decompression function that runs in its own thread.
*/
static int lzo_decompress_threadfn(void *data)
@@ -1198,22 +1197,19 @@ static int load_image_lzo(struct swap_map_handle *handle,
goto out_clean;
}
- data = vmalloc(array_size(nr_threads, sizeof(*data)));
+ data = vzalloc(array_size(nr_threads, sizeof(*data)));
if (!data) {
pr_err("Failed to allocate LZO data\n");
ret = -ENOMEM;
goto out_clean;
}
- for (thr = 0; thr < nr_threads; thr++)
- memset(&data[thr], 0, offsetof(struct dec_data, go));
- crc = kmalloc(sizeof(*crc), GFP_KERNEL);
+ crc = kzalloc(sizeof(*crc), GFP_KERNEL);
if (!crc) {
pr_err("Failed to allocate crc\n");
ret = -ENOMEM;
goto out_clean;
}
- memset(crc, 0, offsetof(struct crc_data, go));
clean_pages_on_decompress = true;
@@ -1521,14 +1517,18 @@ end:
int swsusp_check(void)
{
int error;
+ void *holder;
+ fmode_t mode = FMODE_READ;
+
+ if (snapshot_test)
+ mode |= FMODE_EXCL;
hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device,
- FMODE_READ, NULL);
+ mode, &holder);
if (!IS_ERR(hib_resume_bdev)) {
set_blocksize(hib_resume_bdev, PAGE_SIZE);
clear_page(swsusp_header);
- error = hib_submit_io(REQ_OP_READ, 0,
- swsusp_resume_block,
+ error = hib_submit_io(REQ_OP_READ, swsusp_resume_block,
swsusp_header, NULL);
if (error)
goto put;
@@ -1536,16 +1536,22 @@ int swsusp_check(void)
if (!memcmp(HIBERNATE_SIG, swsusp_header->sig, 10)) {
memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10);
/* Reset swap signature now */
- error = hib_submit_io(REQ_OP_WRITE, REQ_SYNC,
+ error = hib_submit_io(REQ_OP_WRITE | REQ_SYNC,
swsusp_resume_block,
swsusp_header, NULL);
} else {
error = -EINVAL;
}
+ if (!error && swsusp_header->flags & SF_HW_SIG &&
+ swsusp_header->hw_sig != swsusp_hardware_signature) {
+ pr_info("Suspend image hardware signature mismatch (%08x now %08x); aborting resume.\n",
+ swsusp_header->hw_sig, swsusp_hardware_signature);
+ error = -EINVAL;
+ }
put:
if (error)
- blkdev_put(hib_resume_bdev, FMODE_READ);
+ blkdev_put(hib_resume_bdev, mode);
else
pr_debug("Image signature found, resuming\n");
} else {
@@ -1581,11 +1587,11 @@ int swsusp_unmark(void)
{
int error;
- hib_submit_io(REQ_OP_READ, 0, swsusp_resume_block,
- swsusp_header, NULL);
+ hib_submit_io(REQ_OP_READ, swsusp_resume_block,
+ swsusp_header, NULL);
if (!memcmp(HIBERNATE_SIG,swsusp_header->sig, 10)) {
memcpy(swsusp_header->sig,swsusp_header->orig_sig, 10);
- error = hib_submit_io(REQ_OP_WRITE, REQ_SYNC,
+ error = hib_submit_io(REQ_OP_WRITE | REQ_SYNC,
swsusp_resume_block,
swsusp_header, NULL);
} else {
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 740723bb3885..3a4e70366f35 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -26,6 +26,7 @@
#include "power.h"
+static bool need_wait;
static struct snapshot_data {
struct snapshot_handle handle;
@@ -46,12 +47,13 @@ int is_hibernate_resume_dev(dev_t dev)
static int snapshot_open(struct inode *inode, struct file *filp)
{
struct snapshot_data *data;
+ unsigned int sleep_flags;
int error;
if (!hibernation_available())
return -EPERM;
- lock_system_sleep();
+ sleep_flags = lock_system_sleep();
if (!hibernate_acquire()) {
error = -EBUSY;
@@ -78,7 +80,7 @@ static int snapshot_open(struct inode *inode, struct file *filp)
* Resuming. We may need to wait for the image device to
* appear.
*/
- wait_for_device_probe();
+ need_wait = true;
data->swap = -1;
data->mode = O_WRONLY;
@@ -97,7 +99,7 @@ static int snapshot_open(struct inode *inode, struct file *filp)
data->dev = 0;
Unlock:
- unlock_system_sleep();
+ unlock_system_sleep(sleep_flags);
return error;
}
@@ -105,8 +107,9 @@ static int snapshot_open(struct inode *inode, struct file *filp)
static int snapshot_release(struct inode *inode, struct file *filp)
{
struct snapshot_data *data;
+ unsigned int sleep_flags;
- lock_system_sleep();
+ sleep_flags = lock_system_sleep();
swsusp_free();
data = filp->private_data;
@@ -123,7 +126,7 @@ static int snapshot_release(struct inode *inode, struct file *filp)
PM_POST_HIBERNATION : PM_POST_RESTORE);
hibernate_release();
- unlock_system_sleep();
+ unlock_system_sleep(sleep_flags);
return 0;
}
@@ -131,11 +134,12 @@ static int snapshot_release(struct inode *inode, struct file *filp)
static ssize_t snapshot_read(struct file *filp, char __user *buf,
size_t count, loff_t *offp)
{
+ loff_t pg_offp = *offp & ~PAGE_MASK;
struct snapshot_data *data;
+ unsigned int sleep_flags;
ssize_t res;
- loff_t pg_offp = *offp & ~PAGE_MASK;
- lock_system_sleep();
+ sleep_flags = lock_system_sleep();
data = filp->private_data;
if (!data->ready) {
@@ -156,7 +160,7 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf,
*offp += res;
Unlock:
- unlock_system_sleep();
+ unlock_system_sleep(sleep_flags);
return res;
}
@@ -164,11 +168,17 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf,
static ssize_t snapshot_write(struct file *filp, const char __user *buf,
size_t count, loff_t *offp)
{
+ loff_t pg_offp = *offp & ~PAGE_MASK;
struct snapshot_data *data;
+ unsigned long sleep_flags;
ssize_t res;
- loff_t pg_offp = *offp & ~PAGE_MASK;
- lock_system_sleep();
+ if (need_wait) {
+ wait_for_device_probe();
+ need_wait = false;
+ }
+
+ sleep_flags = lock_system_sleep();
data = filp->private_data;
@@ -177,7 +187,7 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf,
if (res <= 0)
goto unlock;
} else {
- res = PAGE_SIZE - pg_offp;
+ res = PAGE_SIZE;
}
if (!data_of(data->handle)) {
@@ -190,7 +200,7 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf,
if (res > 0)
*offp += res;
unlock:
- unlock_system_sleep();
+ unlock_system_sleep(sleep_flags);
return res;
}
@@ -244,6 +254,11 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
loff_t size;
sector_t offset;
+ if (need_wait) {
+ wait_for_device_probe();
+ need_wait = false;
+ }
+
if (_IOC_TYPE(cmd) != SNAPSHOT_IOC_MAGIC)
return -ENOTTY;
if (_IOC_NR(cmd) > SNAPSHOT_IOC_MAXNR)
diff --git a/kernel/power/wakelock.c b/kernel/power/wakelock.c
index 105df4dfc783..52571dcad768 100644
--- a/kernel/power/wakelock.c
+++ b/kernel/power/wakelock.c
@@ -39,23 +39,20 @@ ssize_t pm_show_wakelocks(char *buf, bool show_active)
{
struct rb_node *node;
struct wakelock *wl;
- char *str = buf;
- char *end = buf + PAGE_SIZE;
+ int len = 0;
mutex_lock(&wakelocks_lock);
for (node = rb_first(&wakelocks_tree); node; node = rb_next(node)) {
wl = rb_entry(node, struct wakelock, node);
if (wl->ws->active == show_active)
- str += scnprintf(str, end - str, "%s ", wl->name);
+ len += sysfs_emit_at(buf, len, "%s ", wl->name);
}
- if (str > buf)
- str--;
- str += scnprintf(str, end - str, "\n");
+ len += sysfs_emit_at(buf, len, "\n");
mutex_unlock(&wakelocks_lock);
- return (str - buf);
+ return len;
}
#if CONFIG_PM_WAKELOCKS_LIMIT > 0
diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile
index eee3dc9b60a9..f5b388e810b9 100644
--- a/kernel/printk/Makefile
+++ b/kernel/printk/Makefile
@@ -2,4 +2,8 @@
obj-y = printk.o
obj-$(CONFIG_PRINTK) += printk_safe.o
obj-$(CONFIG_A11Y_BRAILLE_CONSOLE) += braille.o
-obj-$(CONFIG_PRINTK) += printk_ringbuffer.o
+obj-$(CONFIG_PRINTK_INDEX) += index.o
+
+obj-$(CONFIG_PRINTK) += printk_support.o
+printk_support-y := printk_ringbuffer.o
+printk_support-$(CONFIG_SYSCTL) += sysctl.o
diff --git a/kernel/printk/index.c b/kernel/printk/index.c
new file mode 100644
index 000000000000..a6b27526baaf
--- /dev/null
+++ b/kernel/printk/index.c
@@ -0,0 +1,194 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Userspace indexing of printk formats
+ */
+
+#include <linux/debugfs.h>
+#include <linux/module.h>
+#include <linux/printk.h>
+#include <linux/slab.h>
+#include <linux/string_helpers.h>
+
+#include "internal.h"
+
+extern struct pi_entry *__start_printk_index[];
+extern struct pi_entry *__stop_printk_index[];
+
+/* The base dir for module formats, typically debugfs/printk/index/ */
+static struct dentry *dfs_index;
+
+static struct pi_entry *pi_get_entry(const struct module *mod, loff_t pos)
+{
+ struct pi_entry **entries;
+ unsigned int nr_entries;
+
+#ifdef CONFIG_MODULES
+ if (mod) {
+ entries = mod->printk_index_start;
+ nr_entries = mod->printk_index_size;
+ } else
+#endif
+ {
+ /* vmlinux, comes from linker symbols */
+ entries = __start_printk_index;
+ nr_entries = __stop_printk_index - __start_printk_index;
+ }
+
+ if (pos >= nr_entries)
+ return NULL;
+
+ return entries[pos];
+}
+
+static void *pi_next(struct seq_file *s, void *v, loff_t *pos)
+{
+ const struct module *mod = s->file->f_inode->i_private;
+ struct pi_entry *entry = pi_get_entry(mod, *pos);
+
+ (*pos)++;
+
+ return entry;
+}
+
+static void *pi_start(struct seq_file *s, loff_t *pos)
+{
+ /*
+ * Make show() print the header line. Do not update *pos because
+ * pi_next() still has to return the entry at index 0 later.
+ */
+ if (*pos == 0)
+ return SEQ_START_TOKEN;
+
+ return pi_next(s, NULL, pos);
+}
+
+/*
+ * We need both ESCAPE_ANY and explicit characters from ESCAPE_SPECIAL in @only
+ * because otherwise ESCAPE_NAP will cause double quotes and backslashes to be
+ * ignored for quoting.
+ */
+#define seq_escape_printf_format(s, src) \
+ seq_escape_str(s, src, ESCAPE_ANY | ESCAPE_NAP | ESCAPE_APPEND, "\"\\")
+
+static int pi_show(struct seq_file *s, void *v)
+{
+ const struct pi_entry *entry = v;
+ int level = LOGLEVEL_DEFAULT;
+ enum printk_info_flags flags = 0;
+ u16 prefix_len = 0;
+
+ if (v == SEQ_START_TOKEN) {
+ seq_puts(s, "# <level/flags> filename:line function \"format\"\n");
+ return 0;
+ }
+
+ if (!entry->fmt)
+ return 0;
+
+ if (entry->level)
+ printk_parse_prefix(entry->level, &level, &flags);
+ else
+ prefix_len = printk_parse_prefix(entry->fmt, &level, &flags);
+
+
+ if (flags & LOG_CONT) {
+ /*
+ * LOGLEVEL_DEFAULT here means "use the same level as the
+ * message we're continuing from", not the default message
+ * loglevel, so don't display it as such.
+ */
+ if (level == LOGLEVEL_DEFAULT)
+ seq_puts(s, "<c>");
+ else
+ seq_printf(s, "<%d,c>", level);
+ } else
+ seq_printf(s, "<%d>", level);
+
+ seq_printf(s, " %s:%d %s \"", entry->file, entry->line, entry->func);
+ if (entry->subsys_fmt_prefix)
+ seq_escape_printf_format(s, entry->subsys_fmt_prefix);
+ seq_escape_printf_format(s, entry->fmt + prefix_len);
+ seq_puts(s, "\"\n");
+
+ return 0;
+}
+
+static void pi_stop(struct seq_file *p, void *v) { }
+
+static const struct seq_operations dfs_index_sops = {
+ .start = pi_start,
+ .next = pi_next,
+ .show = pi_show,
+ .stop = pi_stop,
+};
+
+DEFINE_SEQ_ATTRIBUTE(dfs_index);
+
+#ifdef CONFIG_MODULES
+static const char *pi_get_module_name(struct module *mod)
+{
+ return mod ? mod->name : "vmlinux";
+}
+#else
+static const char *pi_get_module_name(struct module *mod)
+{
+ return "vmlinux";
+}
+#endif
+
+static void pi_create_file(struct module *mod)
+{
+ debugfs_create_file(pi_get_module_name(mod), 0444, dfs_index,
+ mod, &dfs_index_fops);
+}
+
+#ifdef CONFIG_MODULES
+static void pi_remove_file(struct module *mod)
+{
+ debugfs_lookup_and_remove(pi_get_module_name(mod), dfs_index);
+}
+
+static int pi_module_notify(struct notifier_block *nb, unsigned long op,
+ void *data)
+{
+ struct module *mod = data;
+
+ switch (op) {
+ case MODULE_STATE_COMING:
+ pi_create_file(mod);
+ break;
+ case MODULE_STATE_GOING:
+ pi_remove_file(mod);
+ break;
+ default: /* we don't care about other module states */
+ break;
+ }
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block module_printk_fmts_nb = {
+ .notifier_call = pi_module_notify,
+};
+
+static void __init pi_setup_module_notifier(void)
+{
+ register_module_notifier(&module_printk_fmts_nb);
+}
+#else
+static inline void __init pi_setup_module_notifier(void) { }
+#endif
+
+static int __init pi_init(void)
+{
+ struct dentry *dfs_root = debugfs_create_dir("printk", NULL);
+
+ dfs_index = debugfs_create_dir("index", dfs_root);
+ pi_setup_module_notifier();
+ pi_create_file(NULL);
+
+ return 0;
+}
+
+/* debugfs comes up on core and must be initialised first */
+postcore_initcall(pi_init);
diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h
index 51615c909b2f..2a17704136f1 100644
--- a/kernel/printk/internal.h
+++ b/kernel/printk/internal.h
@@ -4,13 +4,36 @@
*/
#include <linux/percpu.h>
+#if defined(CONFIG_PRINTK) && defined(CONFIG_SYSCTL)
+void __init printk_sysctl_init(void);
+int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos);
+#else
+#define printk_sysctl_init() do { } while (0)
+#endif
+
#ifdef CONFIG_PRINTK
-#define PRINTK_SAFE_CONTEXT_MASK 0x007ffffff
-#define PRINTK_NMI_DIRECT_CONTEXT_MASK 0x008000000
-#define PRINTK_NMI_CONTEXT_MASK 0xff0000000
+#ifdef CONFIG_PRINTK_CALLER
+#define PRINTK_PREFIX_MAX 48
+#else
+#define PRINTK_PREFIX_MAX 32
+#endif
+
+/*
+ * the maximum size of a formatted record (i.e. with prefix added
+ * per line and dropped messages or in extended message format)
+ */
+#define PRINTK_MESSAGE_MAX 2048
-#define PRINTK_NMI_CONTEXT_OFFSET 0x010000000
+/* the maximum size allowed to be reserved for a record */
+#define PRINTKRB_RECORD_MAX 1024
+
+/* Flags for a single printk record. */
+enum printk_info_flags {
+ LOG_NEWLINE = 2, /* text ended with a newline */
+ LOG_CONT = 8, /* text is a fragment of a continuation line */
+};
__printf(4, 0)
int vprintk_store(int facility, int level,
@@ -19,10 +42,7 @@ int vprintk_store(int facility, int level,
__printf(1, 0) int vprintk_default(const char *fmt, va_list args);
__printf(1, 0) int vprintk_deferred(const char *fmt, va_list args);
-void __printk_safe_enter(void);
-void __printk_safe_exit(void);
-void printk_safe_init(void);
bool printk_percpu_data_ready(void);
#define printk_safe_enter_irqsave(flags) \
@@ -37,22 +57,16 @@ bool printk_percpu_data_ready(void);
local_irq_restore(flags); \
} while (0)
-#define printk_safe_enter_irq() \
- do { \
- local_irq_disable(); \
- __printk_safe_enter(); \
- } while (0)
-
-#define printk_safe_exit_irq() \
- do { \
- __printk_safe_exit(); \
- local_irq_enable(); \
- } while (0)
-
void defer_console_output(void);
+u16 printk_parse_prefix(const char *text, int *level,
+ enum printk_info_flags *flags);
#else
+#define PRINTK_PREFIX_MAX 0
+#define PRINTK_MESSAGE_MAX 0
+#define PRINTKRB_RECORD_MAX 0
+
/*
* In !PRINTK builds we still export console_sem
* semaphore and some of console functions (console_unlock()/etc.), so
@@ -61,9 +75,31 @@ void defer_console_output(void);
#define printk_safe_enter_irqsave(flags) local_irq_save(flags)
#define printk_safe_exit_irqrestore(flags) local_irq_restore(flags)
-#define printk_safe_enter_irq() local_irq_disable()
-#define printk_safe_exit_irq() local_irq_enable()
-
-static inline void printk_safe_init(void) { }
static inline bool printk_percpu_data_ready(void) { return false; }
#endif /* CONFIG_PRINTK */
+
+/**
+ * struct printk_buffers - Buffers to read/format/output printk messages.
+ * @outbuf: After formatting, contains text to output.
+ * @scratchbuf: Used as temporary ringbuffer reading and string-print space.
+ */
+struct printk_buffers {
+ char outbuf[PRINTK_MESSAGE_MAX];
+ char scratchbuf[PRINTKRB_RECORD_MAX];
+};
+
+/**
+ * struct printk_message - Container for a prepared printk message.
+ * @pbufs: printk buffers used to prepare the message.
+ * @outbuf_len: The length of prepared text in @pbufs->outbuf to output. This
+ * does not count the terminator. A value of 0 means there is
+ * nothing to output and this record should be skipped.
+ * @seq: The sequence number of the record used for @pbufs->outbuf.
+ * @dropped: The number of dropped records from reading @seq.
+ */
+struct printk_message {
+ struct printk_buffers *pbufs;
+ unsigned int outbuf_len;
+ u64 seq;
+ unsigned long dropped;
+};
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 142a58d124d9..6a333adce3b3 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -71,6 +71,8 @@ EXPORT_SYMBOL_GPL(console_printk);
atomic_t ignore_console_lock_warning __read_mostly = ATOMIC_INIT(0);
EXPORT_SYMBOL(ignore_console_lock_warning);
+EXPORT_TRACEPOINT_SYMBOL_GPL(console);
+
/*
* Low level drivers may need that to know if they can schedule in
* their unblank() callback or not. So let's export it.
@@ -79,13 +81,20 @@ int oops_in_progress;
EXPORT_SYMBOL(oops_in_progress);
/*
- * console_sem protects the console_drivers list, and also
- * provides serialisation for access to the entire console
- * driver system.
+ * console_mutex protects console_list updates and console->flags updates.
+ * The flags are synchronized only for consoles that are registered, i.e.
+ * accessible via the console list.
*/
-static DEFINE_SEMAPHORE(console_sem);
-struct console *console_drivers;
-EXPORT_SYMBOL_GPL(console_drivers);
+static DEFINE_MUTEX(console_mutex);
+
+/*
+ * console_sem protects updates to console->seq and console_suspended,
+ * and also provides serialization for console printing.
+ */
+static DEFINE_SEMAPHORE(console_sem, 1);
+HLIST_HEAD(console_list);
+EXPORT_SYMBOL_GPL(console_list);
+DEFINE_STATIC_SRCU(console_srcu);
/*
* System may need to suppress printk message under certain
@@ -93,10 +102,30 @@ EXPORT_SYMBOL_GPL(console_drivers);
*/
int __read_mostly suppress_printk;
+/*
+ * During panic, heavy printk by other CPUs can delay the
+ * panic and risk deadlock on console resources.
+ */
+static int __read_mostly suppress_panic_printk;
+
#ifdef CONFIG_LOCKDEP
static struct lockdep_map console_lock_dep_map = {
.name = "console_lock"
};
+
+void lockdep_assert_console_list_lock_held(void)
+{
+ lockdep_assert_held(&console_mutex);
+}
+EXPORT_SYMBOL(lockdep_assert_console_list_lock_held);
+#endif
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+bool console_srcu_read_lock_is_held(void)
+{
+ return srcu_read_lock_held(&console_srcu);
+}
+EXPORT_SYMBOL(console_srcu_read_lock_is_held);
#endif
enum devkmsg_log_bits {
@@ -146,8 +175,10 @@ static int __control_devkmsg(char *str)
static int __init control_devkmsg(char *str)
{
- if (__control_devkmsg(str) < 0)
+ if (__control_devkmsg(str) < 0) {
+ pr_warn("printk.devkmsg: bad option string '%s'\n", str);
return 1;
+ }
/*
* Set sysctl string accordingly:
@@ -166,12 +197,12 @@ static int __init control_devkmsg(char *str)
*/
devkmsg_log |= DEVKMSG_LOG_MASK_LOCK;
- return 0;
+ return 1;
}
__setup("printk.devkmsg=", control_devkmsg);
char devkmsg_log_str[DEVKMSG_STR_MAX_SIZE] = "ratelimit";
-
+#if defined(CONFIG_PRINTK) && defined(CONFIG_SYSCTL)
int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
@@ -210,9 +241,70 @@ int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write,
return 0;
}
+#endif /* CONFIG_PRINTK && CONFIG_SYSCTL */
-/* Number of registered extended console drivers. */
-static int nr_ext_console_drivers;
+/**
+ * console_list_lock - Lock the console list
+ *
+ * For console list or console->flags updates
+ */
+void console_list_lock(void)
+{
+ /*
+ * In unregister_console() and console_force_preferred_locked(),
+ * synchronize_srcu() is called with the console_list_lock held.
+ * Therefore it is not allowed that the console_list_lock is taken
+ * with the srcu_lock held.
+ *
+ * Detecting if this context is really in the read-side critical
+ * section is only possible if the appropriate debug options are
+ * enabled.
+ */
+ WARN_ON_ONCE(debug_lockdep_rcu_enabled() &&
+ srcu_read_lock_held(&console_srcu));
+
+ mutex_lock(&console_mutex);
+}
+EXPORT_SYMBOL(console_list_lock);
+
+/**
+ * console_list_unlock - Unlock the console list
+ *
+ * Counterpart to console_list_lock()
+ */
+void console_list_unlock(void)
+{
+ mutex_unlock(&console_mutex);
+}
+EXPORT_SYMBOL(console_list_unlock);
+
+/**
+ * console_srcu_read_lock - Register a new reader for the
+ * SRCU-protected console list
+ *
+ * Use for_each_console_srcu() to iterate the console list
+ *
+ * Context: Any context.
+ * Return: A cookie to pass to console_srcu_read_unlock().
+ */
+int console_srcu_read_lock(void)
+{
+ return srcu_read_lock_nmisafe(&console_srcu);
+}
+EXPORT_SYMBOL(console_srcu_read_lock);
+
+/**
+ * console_srcu_read_unlock - Unregister an old reader from
+ * the SRCU-protected console list
+ * @cookie: cookie returned from console_srcu_read_lock()
+ *
+ * Counterpart to console_srcu_read_lock()
+ */
+void console_srcu_read_unlock(int cookie)
+{
+ srcu_read_unlock_nmisafe(&console_srcu, cookie);
+}
+EXPORT_SYMBOL(console_srcu_read_unlock);
/*
* Helper macros to handle lockdep when locking/unlocking console_sem. We use
@@ -256,6 +348,11 @@ static void __up_console_sem(unsigned long ip)
}
#define up_console_sem() __up_console_sem(_RET_IP_)
+static bool panic_in_progress(void)
+{
+ return unlikely(atomic_read(&panic_cpu) != PANIC_CPU_INVALID);
+}
+
/*
* This is used for debugging the mess that is the VT code by
* keeping track if we have the console semaphore held. It's
@@ -267,11 +364,6 @@ static void __up_console_sem(unsigned long ip)
static int console_locked, console_suspended;
/*
- * If exclusive_console is non-NULL then only this console is to be printed to.
- */
-static struct console *exclusive_console;
-
-/*
* Array of consoles built from command line options (console=)
*/
@@ -280,7 +372,6 @@ static struct console *exclusive_console;
static struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES];
static int preferred_console = -1;
-static bool has_preferred_console;
int console_set_on_cmdline;
EXPORT_SYMBOL(console_set_on_cmdline);
@@ -350,13 +441,8 @@ static int console_msg_format = MSG_FORMAT_DEFAULT;
* non-prinatable characters are escaped in the "\xff" notation.
*/
-enum log_flags {
- LOG_NEWLINE = 2, /* text ended with a newline */
- LOG_CONT = 8, /* text is a fragment of a continuation line */
-};
-
/* syslog_lock protects syslog_* variables and write access to clear_seq. */
-static DEFINE_RAW_SPINLOCK(syslog_lock);
+static DEFINE_MUTEX(syslog_lock);
#ifdef CONFIG_PRINTK
DECLARE_WAIT_QUEUE_HEAD(log_wait);
@@ -366,12 +452,6 @@ static u64 syslog_seq;
static size_t syslog_partial;
static bool syslog_time;
-/* All 3 protected by @console_sem. */
-/* the next printk record to write to the console */
-static u64 console_seq;
-static u64 exclusive_console_stop_seq;
-static unsigned long console_dropped;
-
struct latched_seq {
seqcount_latch_t latch;
u64 val[2];
@@ -388,18 +468,6 @@ static struct latched_seq clear_seq = {
.val[1] = 0,
};
-#ifdef CONFIG_PRINTK_CALLER
-#define PREFIX_MAX 48
-#else
-#define PREFIX_MAX 32
-#endif
-
-/* the maximum size of a formatted record (i.e. with prefix added per line) */
-#define CONSOLE_LOG_MAX 1024
-
-/* the maximum size allowed to be reserved for a record */
-#define LOG_LINE_MAX (CONSOLE_LOG_MAX - PREFIX_MAX)
-
#define LOG_LEVEL(v) ((v) & 0x07)
#define LOG_FACILITY(v) ((v) >> 3 & 0xff)
@@ -433,7 +501,7 @@ static struct printk_ringbuffer *prb = &printk_rb_static;
* per_cpu_areas are initialised. This variable is set to true when
* it's safe to access per-CPU data.
*/
-static bool __printk_percpu_data_ready __read_mostly;
+static bool __printk_percpu_data_ready __ro_after_init;
bool printk_percpu_data_ready(void)
{
@@ -630,16 +698,15 @@ out:
return len;
}
+static bool printk_get_next_message(struct printk_message *pmsg, u64 seq,
+ bool is_extended, bool may_supress);
+
/* /dev/kmsg - userspace message inject/listen interface */
struct devkmsg_user {
atomic64_t seq;
struct ratelimit_state rs;
struct mutex lock;
- char buf[CONSOLE_EXT_LOG_MAX];
-
- struct printk_info info;
- char text_buf[CONSOLE_EXT_LOG_MAX];
- struct printk_record record;
+ struct printk_buffers pbufs;
};
static __printf(3, 4) __cold
@@ -665,7 +732,7 @@ static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from)
size_t len = iov_iter_count(from);
ssize_t ret = len;
- if (!user || len > LOG_LINE_MAX)
+ if (len > PRINTKRB_RECORD_MAX)
return -EINVAL;
/* Ignore when user logging is disabled. */
@@ -721,59 +788,58 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
{
struct devkmsg_user *user = file->private_data;
- struct printk_record *r = &user->record;
- size_t len;
+ char *outbuf = &user->pbufs.outbuf[0];
+ struct printk_message pmsg = {
+ .pbufs = &user->pbufs,
+ };
ssize_t ret;
- if (!user)
- return -EBADF;
-
ret = mutex_lock_interruptible(&user->lock);
if (ret)
return ret;
- printk_safe_enter_irq();
- if (!prb_read_valid(prb, atomic64_read(&user->seq), r)) {
+ if (!printk_get_next_message(&pmsg, atomic64_read(&user->seq), true, false)) {
if (file->f_flags & O_NONBLOCK) {
ret = -EAGAIN;
- printk_safe_exit_irq();
goto out;
}
- printk_safe_exit_irq();
+ /*
+ * Guarantee this task is visible on the waitqueue before
+ * checking the wake condition.
+ *
+ * The full memory barrier within set_current_state() of
+ * prepare_to_wait_event() pairs with the full memory barrier
+ * within wq_has_sleeper().
+ *
+ * This pairs with __wake_up_klogd:A.
+ */
ret = wait_event_interruptible(log_wait,
- prb_read_valid(prb, atomic64_read(&user->seq), r));
+ printk_get_next_message(&pmsg, atomic64_read(&user->seq), true,
+ false)); /* LMM(devkmsg_read:A) */
if (ret)
goto out;
- printk_safe_enter_irq();
}
- if (r->info->seq != atomic64_read(&user->seq)) {
+ if (pmsg.dropped) {
/* our last seen message is gone, return error and reset */
- atomic64_set(&user->seq, r->info->seq);
+ atomic64_set(&user->seq, pmsg.seq);
ret = -EPIPE;
- printk_safe_exit_irq();
goto out;
}
- len = info_print_ext_header(user->buf, sizeof(user->buf), r->info);
- len += msg_print_ext_body(user->buf + len, sizeof(user->buf) - len,
- &r->text_buf[0], r->info->text_len,
- &r->info->dev_info);
-
- atomic64_set(&user->seq, r->info->seq + 1);
- printk_safe_exit_irq();
+ atomic64_set(&user->seq, pmsg.seq + 1);
- if (len > count) {
+ if (pmsg.outbuf_len > count) {
ret = -EINVAL;
goto out;
}
- if (copy_to_user(buf, user->buf, len)) {
+ if (copy_to_user(buf, outbuf, pmsg.outbuf_len)) {
ret = -EFAULT;
goto out;
}
- ret = len;
+ ret = pmsg.outbuf_len;
out:
mutex_unlock(&user->lock);
return ret;
@@ -792,12 +858,9 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
struct devkmsg_user *user = file->private_data;
loff_t ret = 0;
- if (!user)
- return -EBADF;
if (offset)
return -ESPIPE;
- printk_safe_enter_irq();
switch (whence) {
case SEEK_SET:
/* the first record */
@@ -818,7 +881,6 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
default:
ret = -EINVAL;
}
- printk_safe_exit_irq();
return ret;
}
@@ -828,12 +890,8 @@ static __poll_t devkmsg_poll(struct file *file, poll_table *wait)
struct printk_info info;
__poll_t ret = 0;
- if (!user)
- return EPOLLERR|EPOLLNVAL;
-
poll_wait(file, &log_wait, wait);
- printk_safe_enter_irq();
if (prb_read_valid_info(prb, atomic64_read(&user->seq), &info, NULL)) {
/* return error when data has vanished underneath us */
if (info.seq != atomic64_read(&user->seq))
@@ -841,7 +899,6 @@ static __poll_t devkmsg_poll(struct file *file, poll_table *wait)
else
ret = EPOLLIN|EPOLLRDNORM;
}
- printk_safe_exit_irq();
return ret;
}
@@ -862,7 +919,7 @@ static int devkmsg_open(struct inode *inode, struct file *file)
return err;
}
- user = kmalloc(sizeof(struct devkmsg_user), GFP_KERNEL);
+ user = kvmalloc(sizeof(struct devkmsg_user), GFP_KERNEL);
if (!user)
return -ENOMEM;
@@ -871,12 +928,7 @@ static int devkmsg_open(struct inode *inode, struct file *file)
mutex_init(&user->lock);
- prb_rec_init_rd(&user->record, &user->info,
- &user->text_buf[0], sizeof(user->text_buf));
-
- printk_safe_enter_irq();
atomic64_set(&user->seq, prb_first_valid_seq(prb));
- printk_safe_exit_irq();
file->private_data = user;
return 0;
@@ -886,13 +938,10 @@ static int devkmsg_release(struct inode *inode, struct file *file)
{
struct devkmsg_user *user = file->private_data;
- if (!user)
- return 0;
-
ratelimit_state_exit(&user->rs);
mutex_destroy(&user->lock);
- kfree(user);
+ kvfree(user);
return 0;
}
@@ -1042,9 +1091,6 @@ static inline void log_buf_add_cpu(void) {}
static void __init set_percpu_data_ready(void)
{
- printk_safe_init();
- /* Make sure we set this flag only after printk_safe() init is done */
- barrier();
__printk_percpu_data_ready = true;
}
@@ -1073,7 +1119,7 @@ static unsigned int __init add_to_rb(struct printk_ringbuffer *rb,
return prb_record_text_space(&e);
}
-static char setup_text_buf[LOG_LINE_MAX] __initdata;
+static char setup_text_buf[PRINTKRB_RECORD_MAX] __initdata;
void __init setup_log_buf(int early)
{
@@ -1082,6 +1128,7 @@ void __init setup_log_buf(int early)
struct prb_desc *new_descs;
struct printk_info info;
struct printk_record r;
+ unsigned int text_size;
size_t new_descs_size;
size_t new_infos_size;
unsigned long flags;
@@ -1142,24 +1189,37 @@ void __init setup_log_buf(int early)
new_descs, ilog2(new_descs_count),
new_infos);
- printk_safe_enter_irqsave(flags);
+ local_irq_save(flags);
log_buf_len = new_log_buf_len;
log_buf = new_log_buf;
new_log_buf_len = 0;
free = __LOG_BUF_LEN;
- prb_for_each_record(0, &printk_rb_static, seq, &r)
- free -= add_to_rb(&printk_rb_dynamic, &r);
+ prb_for_each_record(0, &printk_rb_static, seq, &r) {
+ text_size = add_to_rb(&printk_rb_dynamic, &r);
+ if (text_size > free)
+ free = 0;
+ else
+ free -= text_size;
+ }
- /*
- * This is early enough that everything is still running on the
- * boot CPU and interrupts are disabled. So no new messages will
- * appear during the transition to the dynamic buffer.
- */
prb = &printk_rb_dynamic;
- printk_safe_exit_irqrestore(flags);
+ local_irq_restore(flags);
+
+ /*
+ * Copy any remaining messages that might have appeared from
+ * NMI context after copying but before switching to the
+ * dynamic buffer.
+ */
+ prb_for_each_record(seq, &printk_rb_static, seq, &r) {
+ text_size = add_to_rb(&printk_rb_dynamic, &r);
+ if (text_size > free)
+ free = 0;
+ else
+ free -= text_size;
+ }
if (seq != prb_next_seq(&printk_rb_static)) {
pr_err("dropped %llu messages\n",
@@ -1172,9 +1232,9 @@ void __init setup_log_buf(int early)
return;
err_free_descs:
- memblock_free(__pa(new_descs), new_descs_size);
+ memblock_free(new_descs, new_descs_size);
err_free_log_buf:
- memblock_free(__pa(new_log_buf), new_log_buf_len);
+ memblock_free(new_log_buf, new_log_buf_len);
}
static bool __read_mostly ignore_loglevel;
@@ -1325,7 +1385,7 @@ static size_t record_print_text(struct printk_record *r, bool syslog,
size_t text_len = r->info->text_len;
size_t buf_size = r->text_buf_size;
char *text = r->text_buf;
- char prefix[PREFIX_MAX];
+ char prefix[PRINTK_PREFIX_MAX];
bool truncated = false;
size_t prefix_len;
size_t line_len;
@@ -1424,7 +1484,7 @@ static size_t get_record_print_text_size(struct printk_info *info,
unsigned int line_count,
bool syslog, bool time)
{
- char prefix[PREFIX_MAX];
+ char prefix[PRINTK_PREFIX_MAX];
size_t prefix_len;
prefix_len = info_print_prefix(info, syslog, time, prefix);
@@ -1481,30 +1541,61 @@ static u64 find_first_fitting_seq(u64 start_seq, u64 max_seq, size_t size,
return seq;
}
+/* The caller is responsible for making sure @size is greater than 0. */
static int syslog_print(char __user *buf, int size)
{
struct printk_info info;
struct printk_record r;
char *text;
int len = 0;
+ u64 seq;
- text = kmalloc(CONSOLE_LOG_MAX, GFP_KERNEL);
+ text = kmalloc(PRINTK_MESSAGE_MAX, GFP_KERNEL);
if (!text)
return -ENOMEM;
- prb_rec_init_rd(&r, &info, text, CONSOLE_LOG_MAX);
+ prb_rec_init_rd(&r, &info, text, PRINTK_MESSAGE_MAX);
- while (size > 0) {
+ mutex_lock(&syslog_lock);
+
+ /*
+ * Wait for the @syslog_seq record to be available. @syslog_seq may
+ * change while waiting.
+ */
+ do {
+ seq = syslog_seq;
+
+ mutex_unlock(&syslog_lock);
+ /*
+ * Guarantee this task is visible on the waitqueue before
+ * checking the wake condition.
+ *
+ * The full memory barrier within set_current_state() of
+ * prepare_to_wait_event() pairs with the full memory barrier
+ * within wq_has_sleeper().
+ *
+ * This pairs with __wake_up_klogd:A.
+ */
+ len = wait_event_interruptible(log_wait,
+ prb_read_valid(prb, seq, NULL)); /* LMM(syslog_print:A) */
+ mutex_lock(&syslog_lock);
+
+ if (len)
+ goto out;
+ } while (syslog_seq != seq);
+
+ /*
+ * Copy records that fit into the buffer. The above cycle makes sure
+ * that the first record is always available.
+ */
+ do {
size_t n;
size_t skip;
+ int err;
- printk_safe_enter_irq();
- raw_spin_lock(&syslog_lock);
- if (!prb_read_valid(prb, syslog_seq, &r)) {
- raw_spin_unlock(&syslog_lock);
- printk_safe_exit_irq();
+ if (!prb_read_valid(prb, syslog_seq, &r))
break;
- }
+
if (r.info->seq != syslog_seq) {
/* message is gone, move to next valid one */
syslog_seq = r.info->seq;
@@ -1531,13 +1622,15 @@ static int syslog_print(char __user *buf, int size)
syslog_partial += n;
} else
n = 0;
- raw_spin_unlock(&syslog_lock);
- printk_safe_exit_irq();
if (!n)
break;
- if (copy_to_user(buf, text + skip, n)) {
+ mutex_unlock(&syslog_lock);
+ err = copy_to_user(buf, text + skip, n);
+ mutex_lock(&syslog_lock);
+
+ if (err) {
if (!len)
len = -EFAULT;
break;
@@ -1546,8 +1639,9 @@ static int syslog_print(char __user *buf, int size)
len += n;
size -= n;
buf += n;
- }
-
+ } while (size);
+out:
+ mutex_unlock(&syslog_lock);
kfree(text);
return len;
}
@@ -1561,12 +1655,11 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
u64 seq;
bool time;
- text = kmalloc(CONSOLE_LOG_MAX, GFP_KERNEL);
+ text = kmalloc(PRINTK_MESSAGE_MAX, GFP_KERNEL);
if (!text)
return -ENOMEM;
time = printk_time;
- printk_safe_enter_irq();
/*
* Find first record that fits, including all following records,
* into the user-provided buffer for this dump.
@@ -1574,7 +1667,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
seq = find_first_fitting_seq(latched_seq_read_nolock(&clear_seq), -1,
size, true, time);
- prb_rec_init_rd(&r, &info, text, CONSOLE_LOG_MAX);
+ prb_rec_init_rd(&r, &info, text, PRINTK_MESSAGE_MAX);
len = 0;
prb_for_each_record(seq, prb, seq, &r) {
@@ -1587,23 +1680,20 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
break;
}
- printk_safe_exit_irq();
if (copy_to_user(buf + len, text, textlen))
len = -EFAULT;
else
len += textlen;
- printk_safe_enter_irq();
if (len < 0)
break;
}
if (clear) {
- raw_spin_lock(&syslog_lock);
+ mutex_lock(&syslog_lock);
latched_seq_write(&clear_seq, seq);
- raw_spin_unlock(&syslog_lock);
+ mutex_unlock(&syslog_lock);
}
- printk_safe_exit_irq();
kfree(text);
return len;
@@ -1611,23 +1701,9 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
static void syslog_clear(void)
{
- printk_safe_enter_irq();
- raw_spin_lock(&syslog_lock);
+ mutex_lock(&syslog_lock);
latched_seq_write(&clear_seq, prb_next_seq(prb));
- raw_spin_unlock(&syslog_lock);
- printk_safe_exit_irq();
-}
-
-/* Return a consistent copy of @syslog_seq. */
-static u64 read_syslog_seq_irq(void)
-{
- u64 seq;
-
- raw_spin_lock_irq(&syslog_lock);
- seq = syslog_seq;
- raw_spin_unlock_irq(&syslog_lock);
-
- return seq;
+ mutex_unlock(&syslog_lock);
}
int do_syslog(int type, char __user *buf, int len, int source)
@@ -1653,11 +1729,6 @@ int do_syslog(int type, char __user *buf, int len, int source)
return 0;
if (!access_ok(buf, len))
return -EFAULT;
-
- error = wait_event_interruptible(log_wait,
- prb_read_valid(prb, read_syslog_seq_irq(), NULL));
- if (error)
- return error;
error = syslog_print(buf, len);
break;
/* Read/clear last kernel messages */
@@ -1703,12 +1774,10 @@ int do_syslog(int type, char __user *buf, int len, int source)
break;
/* Number of chars in the log buffer */
case SYSLOG_ACTION_SIZE_UNREAD:
- printk_safe_enter_irq();
- raw_spin_lock(&syslog_lock);
+ mutex_lock(&syslog_lock);
if (!prb_read_valid_info(prb, syslog_seq, &info, NULL)) {
/* No unread messages. */
- raw_spin_unlock(&syslog_lock);
- printk_safe_exit_irq();
+ mutex_unlock(&syslog_lock);
return 0;
}
if (info.seq != syslog_seq) {
@@ -1736,8 +1805,7 @@ int do_syslog(int type, char __user *buf, int len, int source)
}
error -= syslog_partial;
}
- raw_spin_unlock(&syslog_lock);
- printk_safe_exit_irq();
+ mutex_unlock(&syslog_lock);
break;
/* Size of the log buffer */
case SYSLOG_ACTION_SIZE_BUFFER:
@@ -1793,19 +1861,20 @@ static void console_lock_spinning_enable(void)
/**
* console_lock_spinning_disable_and_check - mark end of code where another
* thread was able to busy wait and check if there is a waiter
+ * @cookie: cookie returned from console_srcu_read_lock()
*
* This is called at the end of the section where spinning is allowed.
* It has two functions. First, it is a signal that it is no longer
* safe to start busy waiting for the lock. Second, it checks if
* there is a busy waiter and passes the lock rights to her.
*
- * Important: Callers lose the lock if there was a busy waiter.
- * They must not touch items synchronized by console_lock
- * in this case.
+ * Important: Callers lose both the console_lock and the SRCU read lock if
+ * there was a busy waiter. They must not touch items synchronized by
+ * console_lock or SRCU read lock in this case.
*
* Return: 1 if the lock rights were passed, 0 otherwise.
*/
-static int console_lock_spinning_disable_and_check(void)
+static int console_lock_spinning_disable_and_check(int cookie)
{
int waiter;
@@ -1825,6 +1894,12 @@ static int console_lock_spinning_disable_and_check(void)
spin_release(&console_owner_dep_map, _THIS_IP_);
/*
+ * Preserve lockdep lock ordering. Release the SRCU read lock before
+ * releasing the console_lock.
+ */
+ console_srcu_read_unlock(cookie);
+
+ /*
* Hand off console_lock to waiter. The waiter will perform
* the up(). After this, the waiter is the console_lock owner.
*/
@@ -1852,6 +1927,16 @@ static int console_trylock_spinning(void)
if (console_trylock())
return 1;
+ /*
+ * It's unsafe to spin once a panic has begun. If we are the
+ * panic CPU, we may have already halted the owner of the
+ * console_sem. If we are not the panic CPU, then we should
+ * avoid taking console_sem, so the panic CPU has a better
+ * chance of cleanly acquiring it later.
+ */
+ if (panic_in_progress())
+ return 0;
+
printk_safe_enter_irqsave(flags);
raw_spin_lock(&console_owner_lock);
@@ -1897,53 +1982,81 @@ static int console_trylock_spinning(void)
}
/*
- * Call the console drivers, asking them to write out
- * log_buf[start] to log_buf[end - 1].
- * The console_lock must be held.
+ * Recursion is tracked separately on each CPU. If NMIs are supported, an
+ * additional NMI context per CPU is also separately tracked. Until per-CPU
+ * is available, a separate "early tracking" is performed.
*/
-static void call_console_drivers(const char *ext_text, size_t ext_len,
- const char *text, size_t len)
-{
- static char dropped_text[64];
- size_t dropped_len = 0;
- struct console *con;
-
- trace_console_rcuidle(text, len);
-
- if (!console_drivers)
- return;
+static DEFINE_PER_CPU(u8, printk_count);
+static u8 printk_count_early;
+#ifdef CONFIG_HAVE_NMI
+static DEFINE_PER_CPU(u8, printk_count_nmi);
+static u8 printk_count_nmi_early;
+#endif
- if (console_dropped) {
- dropped_len = snprintf(dropped_text, sizeof(dropped_text),
- "** %lu printk messages dropped **\n",
- console_dropped);
- console_dropped = 0;
- }
+/*
+ * Recursion is limited to keep the output sane. printk() should not require
+ * more than 1 level of recursion (allowing, for example, printk() to trigger
+ * a WARN), but a higher value is used in case some printk-internal errors
+ * exist, such as the ringbuffer validation checks failing.
+ */
+#define PRINTK_MAX_RECURSION 3
- for_each_console(con) {
- if (exclusive_console && con != exclusive_console)
- continue;
- if (!(con->flags & CON_ENABLED))
- continue;
- if (!con->write)
- continue;
- if (!cpu_online(smp_processor_id()) &&
- !(con->flags & CON_ANYTIME))
- continue;
- if (con->flags & CON_EXTENDED)
- con->write(con, ext_text, ext_len);
- else {
- if (dropped_len)
- con->write(con, dropped_text, dropped_len);
- con->write(con, text, len);
- }
+/*
+ * Return a pointer to the dedicated counter for the CPU+context of the
+ * caller.
+ */
+static u8 *__printk_recursion_counter(void)
+{
+#ifdef CONFIG_HAVE_NMI
+ if (in_nmi()) {
+ if (printk_percpu_data_ready())
+ return this_cpu_ptr(&printk_count_nmi);
+ return &printk_count_nmi_early;
}
+#endif
+ if (printk_percpu_data_ready())
+ return this_cpu_ptr(&printk_count);
+ return &printk_count_early;
}
+/*
+ * Enter recursion tracking. Interrupts are disabled to simplify tracking.
+ * The caller must check the boolean return value to see if the recursion is
+ * allowed. On failure, interrupts are not disabled.
+ *
+ * @recursion_ptr must be a variable of type (u8 *) and is the same variable
+ * that is passed to printk_exit_irqrestore().
+ */
+#define printk_enter_irqsave(recursion_ptr, flags) \
+({ \
+ bool success = true; \
+ \
+ typecheck(u8 *, recursion_ptr); \
+ local_irq_save(flags); \
+ (recursion_ptr) = __printk_recursion_counter(); \
+ if (*(recursion_ptr) > PRINTK_MAX_RECURSION) { \
+ local_irq_restore(flags); \
+ success = false; \
+ } else { \
+ (*(recursion_ptr))++; \
+ } \
+ success; \
+})
+
+/* Exit recursion tracking, restoring interrupts. */
+#define printk_exit_irqrestore(recursion_ptr, flags) \
+ do { \
+ typecheck(u8 *, recursion_ptr); \
+ (*(recursion_ptr))--; \
+ local_irq_restore(flags); \
+ } while (0)
+
int printk_delay_msec __read_mostly;
-static inline void printk_delay(void)
+static inline void printk_delay(int level)
{
+ boot_delay_msec(level);
+
if (unlikely(printk_delay_msec)) {
int m = printk_delay_msec;
@@ -1957,27 +2070,28 @@ static inline void printk_delay(void)
static inline u32 printk_caller_id(void)
{
return in_task() ? task_pid_nr(current) :
- 0x80000000 + raw_smp_processor_id();
+ 0x80000000 + smp_processor_id();
}
/**
- * parse_prefix - Parse level and control flags.
+ * printk_parse_prefix - Parse level and control flags.
*
* @text: The terminated text message.
* @level: A pointer to the current level value, will be updated.
- * @lflags: A pointer to the current log flags, will be updated.
+ * @flags: A pointer to the current printk_info flags, will be updated.
*
* @level may be NULL if the caller is not interested in the parsed value.
* Otherwise the variable pointed to by @level must be set to
* LOGLEVEL_DEFAULT in order to be updated with the parsed value.
*
- * @lflags may be NULL if the caller is not interested in the parsed value.
- * Otherwise the variable pointed to by @lflags will be OR'd with the parsed
+ * @flags may be NULL if the caller is not interested in the parsed value.
+ * Otherwise the variable pointed to by @flags will be OR'd with the parsed
* value.
*
* Return: The length of the parsed level and control flags.
*/
-static u16 parse_prefix(char *text, int *level, enum log_flags *lflags)
+u16 printk_parse_prefix(const char *text, int *level,
+ enum printk_info_flags *flags)
{
u16 prefix_len = 0;
int kern_level;
@@ -1993,8 +2107,8 @@ static u16 parse_prefix(char *text, int *level, enum log_flags *lflags)
*level = kern_level - '0';
break;
case 'c': /* KERN_CONT */
- if (lflags)
- *lflags |= LOG_CONT;
+ if (flags)
+ *flags |= LOG_CONT;
}
prefix_len += 2;
@@ -2004,8 +2118,10 @@ static u16 parse_prefix(char *text, int *level, enum log_flags *lflags)
return prefix_len;
}
-static u16 printk_sprint(char *text, u16 size, int facility, enum log_flags *lflags,
- const char *fmt, va_list args)
+__printf(5, 0)
+static u16 printk_sprint(char *text, u16 size, int facility,
+ enum printk_info_flags *flags, const char *fmt,
+ va_list args)
{
u16 text_len;
@@ -2014,20 +2130,22 @@ static u16 printk_sprint(char *text, u16 size, int facility, enum log_flags *lfl
/* Mark and strip a trailing newline. */
if (text_len && text[text_len - 1] == '\n') {
text_len--;
- *lflags |= LOG_NEWLINE;
+ *flags |= LOG_NEWLINE;
}
/* Strip log level and control flags. */
if (facility == 0) {
u16 prefix_len;
- prefix_len = parse_prefix(text, NULL, NULL);
+ prefix_len = printk_parse_prefix(text, NULL, NULL);
if (prefix_len) {
text_len -= prefix_len;
memmove(text, text + prefix_len, text_len);
}
}
+ trace_console(text, text_len);
+
return text_len;
}
@@ -2036,17 +2154,23 @@ int vprintk_store(int facility, int level,
const struct dev_printk_info *dev_info,
const char *fmt, va_list args)
{
- const u32 caller_id = printk_caller_id();
struct prb_reserved_entry e;
- enum log_flags lflags = 0;
+ enum printk_info_flags flags = 0;
struct printk_record r;
+ unsigned long irqflags;
u16 trunc_msg_len = 0;
char prefix_buf[8];
+ u8 *recursion_ptr;
u16 reserve_size;
va_list args2;
+ u32 caller_id;
u16 text_len;
+ int ret = 0;
u64 ts_nsec;
+ if (!printk_enter_irqsave(recursion_ptr, irqflags))
+ return 0;
+
/*
* Since the duration of printk() can vary depending on the message
* and state of the ringbuffer, grab the timestamp now so that it is
@@ -2055,6 +2179,8 @@ int vprintk_store(int facility, int level,
*/
ts_nsec = local_clock();
+ caller_id = printk_caller_id();
+
/*
* The sprintf needs to come first since the syslog prefix might be
* passed in as a parameter. An extra byte must be reserved so that
@@ -2065,34 +2191,35 @@ int vprintk_store(int facility, int level,
reserve_size = vsnprintf(&prefix_buf[0], sizeof(prefix_buf), fmt, args2) + 1;
va_end(args2);
- if (reserve_size > LOG_LINE_MAX)
- reserve_size = LOG_LINE_MAX;
+ if (reserve_size > PRINTKRB_RECORD_MAX)
+ reserve_size = PRINTKRB_RECORD_MAX;
/* Extract log level or control flags. */
if (facility == 0)
- parse_prefix(&prefix_buf[0], &level, &lflags);
+ printk_parse_prefix(&prefix_buf[0], &level, &flags);
if (level == LOGLEVEL_DEFAULT)
level = default_message_loglevel;
if (dev_info)
- lflags |= LOG_NEWLINE;
+ flags |= LOG_NEWLINE;
- if (lflags & LOG_CONT) {
+ if (flags & LOG_CONT) {
prb_rec_init_wr(&r, reserve_size);
- if (prb_reserve_in_last(&e, prb, &r, caller_id, LOG_LINE_MAX)) {
+ if (prb_reserve_in_last(&e, prb, &r, caller_id, PRINTKRB_RECORD_MAX)) {
text_len = printk_sprint(&r.text_buf[r.info->text_len], reserve_size,
- facility, &lflags, fmt, args);
+ facility, &flags, fmt, args);
r.info->text_len += text_len;
- if (lflags & LOG_NEWLINE) {
+ if (flags & LOG_NEWLINE) {
r.info->flags |= LOG_NEWLINE;
prb_final_commit(&e);
} else {
prb_commit(&e);
}
- return text_len;
+ ret = text_len;
+ goto out;
}
}
@@ -2108,29 +2235,32 @@ int vprintk_store(int facility, int level,
prb_rec_init_wr(&r, reserve_size + trunc_msg_len);
if (!prb_reserve(&e, prb, &r))
- return 0;
+ goto out;
}
/* fill message */
- text_len = printk_sprint(&r.text_buf[0], reserve_size, facility, &lflags, fmt, args);
+ text_len = printk_sprint(&r.text_buf[0], reserve_size, facility, &flags, fmt, args);
if (trunc_msg_len)
memcpy(&r.text_buf[text_len], trunc_msg, trunc_msg_len);
r.info->text_len = text_len + trunc_msg_len;
r.info->facility = facility;
r.info->level = level & 7;
- r.info->flags = lflags & 0x1f;
+ r.info->flags = flags & 0x1f;
r.info->ts_nsec = ts_nsec;
r.info->caller_id = caller_id;
if (dev_info)
memcpy(&r.info->dev_info, dev_info, sizeof(r.info->dev_info));
/* A message without a trailing newline can be continued. */
- if (!(lflags & LOG_NEWLINE))
+ if (!(flags & LOG_NEWLINE))
prb_commit(&e);
else
prb_final_commit(&e);
- return (text_len + trunc_msg_len);
+ ret = text_len + trunc_msg_len;
+out:
+ printk_exit_irqrestore(recursion_ptr, irqflags);
+ return ret;
}
asmlinkage int vprintk_emit(int facility, int level,
@@ -2139,36 +2269,39 @@ asmlinkage int vprintk_emit(int facility, int level,
{
int printed_len;
bool in_sched = false;
- unsigned long flags;
/* Suppress unimportant messages after panic happens */
if (unlikely(suppress_printk))
return 0;
+ if (unlikely(suppress_panic_printk) &&
+ atomic_read(&panic_cpu) != raw_smp_processor_id())
+ return 0;
+
if (level == LOGLEVEL_SCHED) {
level = LOGLEVEL_DEFAULT;
in_sched = true;
}
- boot_delay_msec(level);
- printk_delay();
+ printk_delay(level);
- printk_safe_enter_irqsave(flags);
printed_len = vprintk_store(facility, level, dev_info, fmt, args);
- printk_safe_exit_irqrestore(flags);
/* If called from the scheduler, we can not call up(). */
if (!in_sched) {
/*
- * Disable preemption to avoid being preempted while holding
- * console_sem which would prevent anyone from printing to
- * console
+ * The caller may be holding system-critical or
+ * timing-sensitive locks. Disable preemption during
+ * printing of all remaining records to all consoles so that
+ * this context can return as soon as possible. Hopefully
+ * another printk() caller will take over the printing.
*/
preempt_disable();
/*
* Try to acquire and then immediately release the console
- * semaphore. The release will print out buffers and wake up
- * /dev/kmsg and syslog() users.
+ * semaphore. The release will print out buffers. With the
+ * spinning variant, this context tries to take over the
+ * printing from another printing context.
*/
if (console_trylock_spinning())
console_unlock();
@@ -2186,28 +2319,7 @@ int vprintk_default(const char *fmt, va_list args)
}
EXPORT_SYMBOL_GPL(vprintk_default);
-/**
- * printk - print a kernel message
- * @fmt: format string
- *
- * This is printk(). It can be called from any context. We want it to work.
- *
- * We try to grab the console_lock. If we succeed, it's easy - we log the
- * output and call the console drivers. If we fail to get the semaphore, we
- * place the output into the log buffer and return. The current holder of
- * the console_sem will notice the new output in console_unlock(); and will
- * send it to the consoles before releasing the lock.
- *
- * One effect of this deferred printing is that code which calls printk() and
- * then changes console_loglevel may break. This is because console_loglevel
- * is inspected when the actual printing occurs.
- *
- * See also:
- * printf(3)
- *
- * See the vsnprintf() documentation for format string extensions over C99.
- */
-asmlinkage __visible int printk(const char *fmt, ...)
+asmlinkage __visible int _printk(const char *fmt, ...)
{
va_list args;
int r;
@@ -2218,20 +2330,20 @@ asmlinkage __visible int printk(const char *fmt, ...)
return r;
}
-EXPORT_SYMBOL(printk);
+EXPORT_SYMBOL(_printk);
+
+static bool pr_flush(int timeout_ms, bool reset_on_progress);
+static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress);
#else /* CONFIG_PRINTK */
-#define CONSOLE_LOG_MAX 0
#define printk_time false
#define prb_read_valid(rb, seq, r) false
#define prb_first_valid_seq(rb) 0
+#define prb_next_seq(rb) 0
static u64 syslog_seq;
-static u64 console_seq;
-static u64 exclusive_console_stop_seq;
-static unsigned long console_dropped;
static size_t record_print_text(const struct printk_record *r,
bool syslog, bool time)
@@ -2247,10 +2359,10 @@ static ssize_t msg_print_ext_body(char *buf, size_t size,
char *text, size_t text_len,
struct dev_printk_info *dev_info) { return 0; }
static void console_lock_spinning_enable(void) { }
-static int console_lock_spinning_disable_and_check(void) { return 0; }
-static void call_console_drivers(const char *ext_text, size_t ext_len,
- const char *text, size_t len) {}
+static int console_lock_spinning_disable_and_check(int cookie) { return 0; }
static bool suppress_message_printing(int level) { return false; }
+static bool pr_flush(int timeout_ms, bool reset_on_progress) { return true; }
+static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress) { return true; }
#endif /* CONFIG_PRINTK */
@@ -2274,6 +2386,20 @@ asmlinkage __visible void early_printk(const char *fmt, ...)
}
#endif
+static void set_user_specified(struct console_cmdline *c, bool user_specified)
+{
+ if (!user_specified)
+ return;
+
+ /*
+ * @c console was defined by the user on the command line.
+ * Do not clear when added twice also by SPCR or the device tree.
+ */
+ c->user_specified = true;
+ /* At least one console defined by the user on the command line. */
+ console_set_on_cmdline = 1;
+}
+
static int __add_preferred_console(char *name, int idx, char *options,
char *brl_options, bool user_specified)
{
@@ -2290,8 +2416,7 @@ static int __add_preferred_console(char *name, int idx, char *options,
if (strcmp(c->name, name) == 0 && c->index == idx) {
if (!brl_options)
preferred_console = i;
- if (user_specified)
- c->user_specified = true;
+ set_user_specified(c, user_specified);
return 0;
}
}
@@ -2299,9 +2424,9 @@ static int __add_preferred_console(char *name, int idx, char *options,
return -E2BIG;
if (!brl_options)
preferred_console = i;
- strlcpy(c->name, name, sizeof(c->name));
+ strscpy(c->name, name, sizeof(c->name));
c->options = options;
- c->user_specified = user_specified;
+ set_user_specified(c, user_specified);
braille_set_options(c, brl_options);
c->index = idx;
@@ -2367,7 +2492,6 @@ static int __init console_setup(char *str)
*s = 0;
__add_preferred_console(buf, idx, options, brl_options, true);
- console_set_on_cmdline = 1;
return 1;
}
__setup("console=", console_setup);
@@ -2404,6 +2528,18 @@ module_param_named(console_suspend, console_suspend_enabled,
MODULE_PARM_DESC(console_suspend, "suspend console during suspend"
" and hibernate operations");
+static bool printk_console_no_auto_verbose;
+
+void console_verbose(void)
+{
+ if (console_loglevel && !printk_console_no_auto_verbose)
+ console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH;
+}
+EXPORT_SYMBOL_GPL(console_verbose);
+
+module_param_named(console_no_auto_verbose, printk_console_no_auto_verbose, bool, 0644);
+MODULE_PARM_DESC(console_no_auto_verbose, "Disable console loglevel raise to highest on oops/panic/etc");
+
/**
* suspend_console - suspend the console subsystem
*
@@ -2414,6 +2550,7 @@ void suspend_console(void)
if (!console_suspend_enabled)
return;
pr_info("Suspending console(s) (use no_console_suspend to debug)\n");
+ pr_flush(1000, true);
console_lock();
console_suspended = 1;
up_console_sem();
@@ -2426,6 +2563,7 @@ void resume_console(void)
down_console_sem();
console_suspended = 0;
console_unlock();
+ pr_flush(1000, true);
}
/**
@@ -2448,10 +2586,10 @@ static int console_cpu_notify(unsigned int cpu)
}
/**
- * console_lock - lock the console system for exclusive use.
+ * console_lock - block the console subsystem from printing
*
- * Acquires a lock which guarantees that the caller has
- * exclusive access to the console system and the console_drivers list.
+ * Acquires a lock which guarantees that no consoles will
+ * be in or enter their write() callback.
*
* Can sleep, returns nothing.
*/
@@ -2468,10 +2606,10 @@ void console_lock(void)
EXPORT_SYMBOL(console_lock);
/**
- * console_trylock - try to lock the console system for exclusive use.
+ * console_trylock - try to block the console subsystem from printing
*
- * Try to acquire a lock which guarantees that the caller has exclusive
- * access to the console system and the console_drivers list.
+ * Try to acquire a lock which guarantees that no consoles will
+ * be in or enter their write() callback.
*
* returns 1 on success, and 0 on failure to acquire the lock.
*/
@@ -2496,179 +2634,396 @@ int is_console_locked(void)
EXPORT_SYMBOL(is_console_locked);
/*
- * Check if we have any console that is capable of printing while cpu is
- * booting or shutting down. Requires console_sem.
+ * Return true when this CPU should unlock console_sem without pushing all
+ * messages to the console. This reduces the chance that the console is
+ * locked when the panic CPU tries to use it.
*/
-static int have_callable_console(void)
+static bool abandon_console_lock_in_panic(void)
{
- struct console *con;
-
- for_each_console(con)
- if ((con->flags & CON_ENABLED) &&
- (con->flags & CON_ANYTIME))
- return 1;
+ if (!panic_in_progress())
+ return false;
- return 0;
+ /*
+ * We can use raw_smp_processor_id() here because it is impossible for
+ * the task to be migrated to the panic_cpu, or away from it. If
+ * panic_cpu has already been set, and we're not currently executing on
+ * that CPU, then we never will be.
+ */
+ return atomic_read(&panic_cpu) != raw_smp_processor_id();
}
/*
- * Can we actually use the console at this time on this cpu?
+ * Check if the given console is currently capable and allowed to print
+ * records.
*
- * Console drivers may assume that per-cpu resources have been allocated. So
- * unless they're explicitly marked as being able to cope (CON_ANYTIME) don't
- * call them until this CPU is officially up.
+ * Requires the console_srcu_read_lock.
*/
-static inline int can_use_console(void)
+static inline bool console_is_usable(struct console *con)
{
- return cpu_online(raw_smp_processor_id()) || have_callable_console();
+ short flags = console_srcu_read_flags(con);
+
+ if (!(flags & CON_ENABLED))
+ return false;
+
+ if (!con->write)
+ return false;
+
+ /*
+ * Console drivers may assume that per-cpu resources have been
+ * allocated. So unless they're explicitly marked as being able to
+ * cope (CON_ANYTIME) don't call them until this CPU is officially up.
+ */
+ if (!cpu_online(raw_smp_processor_id()) && !(flags & CON_ANYTIME))
+ return false;
+
+ return true;
}
-/**
- * console_unlock - unlock the console system
+static void __console_unlock(void)
+{
+ console_locked = 0;
+ up_console_sem();
+}
+
+/*
+ * Prepend the message in @pmsg->pbufs->outbuf with a "dropped message". This
+ * is achieved by shifting the existing message over and inserting the dropped
+ * message.
*
- * Releases the console_lock which the caller holds on the console system
- * and the console driver list.
+ * @pmsg is the printk message to prepend.
*
- * While the console_lock was held, console output may have been buffered
- * by printk(). If this is the case, console_unlock(); emits
- * the output prior to releasing the lock.
+ * @dropped is the dropped count to report in the dropped message.
*
- * If there is output waiting, we wake /dev/kmsg and syslog() users.
+ * If the message text in @pmsg->pbufs->outbuf does not have enough space for
+ * the dropped message, the message text will be sufficiently truncated.
*
- * console_unlock(); may be called from any context.
+ * If @pmsg->pbufs->outbuf is modified, @pmsg->outbuf_len is updated.
*/
-void console_unlock(void)
+#ifdef CONFIG_PRINTK
+static void console_prepend_dropped(struct printk_message *pmsg, unsigned long dropped)
{
- static char ext_text[CONSOLE_EXT_LOG_MAX];
- static char text[CONSOLE_LOG_MAX];
- unsigned long flags;
- bool do_cond_resched, retry;
- struct printk_info info;
- struct printk_record r;
+ struct printk_buffers *pbufs = pmsg->pbufs;
+ const size_t scratchbuf_sz = sizeof(pbufs->scratchbuf);
+ const size_t outbuf_sz = sizeof(pbufs->outbuf);
+ char *scratchbuf = &pbufs->scratchbuf[0];
+ char *outbuf = &pbufs->outbuf[0];
+ size_t len;
- if (console_suspended) {
- up_console_sem();
+ len = scnprintf(scratchbuf, scratchbuf_sz,
+ "** %lu printk messages dropped **\n", dropped);
+
+ /*
+ * Make sure outbuf is sufficiently large before prepending.
+ * Keep at least the prefix when the message must be truncated.
+ * It is a rather theoretical problem when someone tries to
+ * use a minimalist buffer.
+ */
+ if (WARN_ON_ONCE(len + PRINTK_PREFIX_MAX >= outbuf_sz))
return;
+
+ if (pmsg->outbuf_len + len >= outbuf_sz) {
+ /* Truncate the message, but keep it terminated. */
+ pmsg->outbuf_len = outbuf_sz - (len + 1);
+ outbuf[pmsg->outbuf_len] = 0;
}
- prb_rec_init_rd(&r, &info, text, sizeof(text));
+ memmove(outbuf + len, outbuf, pmsg->outbuf_len + 1);
+ memcpy(outbuf, scratchbuf, len);
+ pmsg->outbuf_len += len;
+}
+#else
+#define console_prepend_dropped(pmsg, dropped)
+#endif /* CONFIG_PRINTK */
+
+/*
+ * Read and format the specified record (or a later record if the specified
+ * record is not available).
+ *
+ * @pmsg will contain the formatted result. @pmsg->pbufs must point to a
+ * struct printk_buffers.
+ *
+ * @seq is the record to read and format. If it is not available, the next
+ * valid record is read.
+ *
+ * @is_extended specifies if the message should be formatted for extended
+ * console output.
+ *
+ * @may_supress specifies if records may be skipped based on loglevel.
+ *
+ * Returns false if no record is available. Otherwise true and all fields
+ * of @pmsg are valid. (See the documentation of struct printk_message
+ * for information about the @pmsg fields.)
+ */
+static bool printk_get_next_message(struct printk_message *pmsg, u64 seq,
+ bool is_extended, bool may_suppress)
+{
+ static int panic_console_dropped;
+
+ struct printk_buffers *pbufs = pmsg->pbufs;
+ const size_t scratchbuf_sz = sizeof(pbufs->scratchbuf);
+ const size_t outbuf_sz = sizeof(pbufs->outbuf);
+ char *scratchbuf = &pbufs->scratchbuf[0];
+ char *outbuf = &pbufs->outbuf[0];
+ struct printk_info info;
+ struct printk_record r;
+ size_t len = 0;
/*
- * Console drivers are called with interrupts disabled, so
- * @console_may_schedule should be cleared before; however, we may
- * end up dumping a lot of lines, for example, if called from
- * console registration path, and should invoke cond_resched()
- * between lines if allowable. Not doing so can cause a very long
- * scheduling stall on a slow console leading to RCU stall and
- * softlockup warnings which exacerbate the issue with more
- * messages practically incapacitating the system.
+ * Formatting extended messages requires a separate buffer, so use the
+ * scratch buffer to read in the ringbuffer text.
*
- * console_trylock() is not able to detect the preemptive
- * context reliably. Therefore the value must be stored before
- * and cleared after the "again" goto label.
+ * Formatting normal messages is done in-place, so read the ringbuffer
+ * text directly into the output buffer.
*/
- do_cond_resched = console_may_schedule;
-again:
- console_may_schedule = 0;
+ if (is_extended)
+ prb_rec_init_rd(&r, &info, scratchbuf, scratchbuf_sz);
+ else
+ prb_rec_init_rd(&r, &info, outbuf, outbuf_sz);
+
+ if (!prb_read_valid(prb, seq, &r))
+ return false;
+
+ pmsg->seq = r.info->seq;
+ pmsg->dropped = r.info->seq - seq;
/*
- * We released the console_sem lock, so we need to recheck if
- * cpu is online and (if not) is there at least one CON_ANYTIME
- * console.
+ * Check for dropped messages in panic here so that printk
+ * suppression can occur as early as possible if necessary.
*/
- if (!can_use_console()) {
- console_locked = 0;
- up_console_sem();
- return;
+ if (pmsg->dropped &&
+ panic_in_progress() &&
+ panic_console_dropped++ > 10) {
+ suppress_panic_printk = 1;
+ pr_warn_once("Too many dropped messages. Suppress messages on non-panic CPUs to prevent livelock.\n");
}
- for (;;) {
- size_t ext_len = 0;
- size_t len;
+ /* Skip record that has level above the console loglevel. */
+ if (may_suppress && suppress_message_printing(r.info->level))
+ goto out;
+
+ if (is_extended) {
+ len = info_print_ext_header(outbuf, outbuf_sz, r.info);
+ len += msg_print_ext_body(outbuf + len, outbuf_sz - len,
+ &r.text_buf[0], r.info->text_len, &r.info->dev_info);
+ } else {
+ len = record_print_text(&r, console_msg_format & MSG_FORMAT_SYSLOG, printk_time);
+ }
+out:
+ pmsg->outbuf_len = len;
+ return true;
+}
+
+/*
+ * Print one record for the given console. The record printed is whatever
+ * record is the next available record for the given console.
+ *
+ * @handover will be set to true if a printk waiter has taken over the
+ * console_lock, in which case the caller is no longer holding both the
+ * console_lock and the SRCU read lock. Otherwise it is set to false.
+ *
+ * @cookie is the cookie from the SRCU read lock.
+ *
+ * Returns false if the given console has no next record to print, otherwise
+ * true.
+ *
+ * Requires the console_lock and the SRCU read lock.
+ */
+static bool console_emit_next_record(struct console *con, bool *handover, int cookie)
+{
+ static struct printk_buffers pbufs;
+
+ bool is_extended = console_srcu_read_flags(con) & CON_EXTENDED;
+ char *outbuf = &pbufs.outbuf[0];
+ struct printk_message pmsg = {
+ .pbufs = &pbufs,
+ };
+ unsigned long flags;
+
+ *handover = false;
+
+ if (!printk_get_next_message(&pmsg, con->seq, is_extended, true))
+ return false;
+
+ con->dropped += pmsg.dropped;
- printk_safe_enter_irqsave(flags);
+ /* Skip messages of formatted length 0. */
+ if (pmsg.outbuf_len == 0) {
+ con->seq = pmsg.seq + 1;
+ goto skip;
+ }
+
+ if (con->dropped && !is_extended) {
+ console_prepend_dropped(&pmsg, con->dropped);
+ con->dropped = 0;
+ }
+
+ /*
+ * While actively printing out messages, if another printk()
+ * were to occur on another CPU, it may wait for this one to
+ * finish. This task can not be preempted if there is a
+ * waiter waiting to take over.
+ *
+ * Interrupts are disabled because the hand over to a waiter
+ * must not be interrupted until the hand over is completed
+ * (@console_waiter is cleared).
+ */
+ printk_safe_enter_irqsave(flags);
+ console_lock_spinning_enable();
+
+ /* Do not trace print latency. */
+ stop_critical_timings();
+
+ /* Write everything out to the hardware. */
+ con->write(con, outbuf, pmsg.outbuf_len);
+
+ start_critical_timings();
+
+ con->seq = pmsg.seq + 1;
+
+ *handover = console_lock_spinning_disable_and_check(cookie);
+ printk_safe_exit_irqrestore(flags);
skip:
- if (!prb_read_valid(prb, console_seq, &r))
- break;
+ return true;
+}
- if (console_seq != r.info->seq) {
- console_dropped += r.info->seq - console_seq;
- console_seq = r.info->seq;
- }
+/*
+ * Print out all remaining records to all consoles.
+ *
+ * @do_cond_resched is set by the caller. It can be true only in schedulable
+ * context.
+ *
+ * @next_seq is set to the sequence number after the last available record.
+ * The value is valid only when this function returns true. It means that all
+ * usable consoles are completely flushed.
+ *
+ * @handover will be set to true if a printk waiter has taken over the
+ * console_lock, in which case the caller is no longer holding the
+ * console_lock. Otherwise it is set to false.
+ *
+ * Returns true when there was at least one usable console and all messages
+ * were flushed to all usable consoles. A returned false informs the caller
+ * that everything was not flushed (either there were no usable consoles or
+ * another context has taken over printing or it is a panic situation and this
+ * is not the panic CPU). Regardless the reason, the caller should assume it
+ * is not useful to immediately try again.
+ *
+ * Requires the console_lock.
+ */
+static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handover)
+{
+ bool any_usable = false;
+ struct console *con;
+ bool any_progress;
+ int cookie;
+
+ *next_seq = 0;
+ *handover = false;
+
+ do {
+ any_progress = false;
+
+ cookie = console_srcu_read_lock();
+ for_each_console_srcu(con) {
+ bool progress;
+
+ if (!console_is_usable(con))
+ continue;
+ any_usable = true;
+
+ progress = console_emit_next_record(con, handover, cookie);
- if (suppress_message_printing(r.info->level)) {
/*
- * Skip record we have buffered and already printed
- * directly to the console when we received it, and
- * record that has level above the console loglevel.
+ * If a handover has occurred, the SRCU read lock
+ * is already released.
*/
- console_seq++;
- goto skip;
- }
+ if (*handover)
+ return false;
- /* Output to all consoles once old messages replayed. */
- if (unlikely(exclusive_console &&
- console_seq >= exclusive_console_stop_seq)) {
- exclusive_console = NULL;
- }
+ /* Track the next of the highest seq flushed. */
+ if (con->seq > *next_seq)
+ *next_seq = con->seq;
- /*
- * Handle extended console text first because later
- * record_print_text() will modify the record buffer in-place.
- */
- if (nr_ext_console_drivers) {
- ext_len = info_print_ext_header(ext_text,
- sizeof(ext_text),
- r.info);
- ext_len += msg_print_ext_body(ext_text + ext_len,
- sizeof(ext_text) - ext_len,
- &r.text_buf[0],
- r.info->text_len,
- &r.info->dev_info);
- }
- len = record_print_text(&r,
- console_msg_format & MSG_FORMAT_SYSLOG,
- printk_time);
- console_seq++;
-
- /*
- * While actively printing out messages, if another printk()
- * were to occur on another CPU, it may wait for this one to
- * finish. This task can not be preempted if there is a
- * waiter waiting to take over.
- */
- console_lock_spinning_enable();
+ if (!progress)
+ continue;
+ any_progress = true;
- stop_critical_timings(); /* don't trace print latency */
- call_console_drivers(ext_text, ext_len, text, len);
- start_critical_timings();
+ /* Allow panic_cpu to take over the consoles safely. */
+ if (abandon_console_lock_in_panic())
+ goto abandon;
- if (console_lock_spinning_disable_and_check()) {
- printk_safe_exit_irqrestore(flags);
- return;
+ if (do_cond_resched)
+ cond_resched();
}
+ console_srcu_read_unlock(cookie);
+ } while (any_progress);
- printk_safe_exit_irqrestore(flags);
+ return any_usable;
- if (do_cond_resched)
- cond_resched();
- }
+abandon:
+ console_srcu_read_unlock(cookie);
+ return false;
+}
- console_locked = 0;
+/**
+ * console_unlock - unblock the console subsystem from printing
+ *
+ * Releases the console_lock which the caller holds to block printing of
+ * the console subsystem.
+ *
+ * While the console_lock was held, console output may have been buffered
+ * by printk(). If this is the case, console_unlock(); emits
+ * the output prior to releasing the lock.
+ *
+ * console_unlock(); may be called from any context.
+ */
+void console_unlock(void)
+{
+ bool do_cond_resched;
+ bool handover;
+ bool flushed;
+ u64 next_seq;
- up_console_sem();
+ if (console_suspended) {
+ up_console_sem();
+ return;
+ }
/*
- * Someone could have filled up the buffer again, so re-check if there's
- * something to flush. In case we cannot trylock the console_sem again,
- * there's a new owner and the console_unlock() from them will do the
- * flush, no worries.
+ * Console drivers are called with interrupts disabled, so
+ * @console_may_schedule should be cleared before; however, we may
+ * end up dumping a lot of lines, for example, if called from
+ * console registration path, and should invoke cond_resched()
+ * between lines if allowable. Not doing so can cause a very long
+ * scheduling stall on a slow console leading to RCU stall and
+ * softlockup warnings which exacerbate the issue with more
+ * messages practically incapacitating the system. Therefore, create
+ * a local to use for the printing loop.
*/
- retry = prb_read_valid(prb, console_seq, NULL);
- printk_safe_exit_irqrestore(flags);
+ do_cond_resched = console_may_schedule;
+
+ do {
+ console_may_schedule = 0;
+
+ flushed = console_flush_all(do_cond_resched, &next_seq, &handover);
+ if (!handover)
+ __console_unlock();
+
+ /*
+ * Abort if there was a failure to flush all messages to all
+ * usable consoles. Either it is not possible to flush (in
+ * which case it would be an infinite loop of retrying) or
+ * another context has taken over printing.
+ */
+ if (!flushed)
+ break;
- if (retry && console_trylock())
- goto again;
+ /*
+ * Some context may have added new records after
+ * console_flush_all() but before unlocking the console.
+ * Re-check if there is a new record to flush. If the trylock
+ * fails, another context is already handling the printing.
+ */
+ } while (prb_read_valid(prb, next_seq, NULL) && console_trylock());
}
EXPORT_SYMBOL(console_unlock);
@@ -2691,10 +3046,14 @@ EXPORT_SYMBOL(console_conditional_schedule);
void console_unblank(void)
{
struct console *c;
+ int cookie;
/*
- * console_unblank can no longer be called in interrupt context unless
- * oops_in_progress is set to 1..
+ * Stop console printing because the unblank() callback may
+ * assume the console is not within its write() callback.
+ *
+ * If @oops_in_progress is set, this may be an atomic context.
+ * In that case, attempt a trylock as best-effort.
*/
if (oops_in_progress) {
if (down_trylock_console_sem() != 0)
@@ -2704,10 +3063,18 @@ void console_unblank(void)
console_locked = 1;
console_may_schedule = 0;
- for_each_console(c)
- if ((c->flags & CON_ENABLED) && c->unblank)
+
+ cookie = console_srcu_read_lock();
+ for_each_console_srcu(c) {
+ if ((console_srcu_read_flags(c) & CON_ENABLED) && c->unblank)
c->unblank();
+ }
+ console_srcu_read_unlock(cookie);
+
console_unlock();
+
+ if (!oops_in_progress)
+ pr_flush(1000, true);
}
/**
@@ -2729,11 +3096,22 @@ void console_flush_on_panic(enum con_flush_mode mode)
console_may_schedule = 0;
if (mode == CONSOLE_REPLAY_ALL) {
- unsigned long flags;
+ struct console *c;
+ int cookie;
+ u64 seq;
- printk_safe_enter_irqsave(flags);
- console_seq = prb_first_valid_seq(prb);
- printk_safe_exit_irqrestore(flags);
+ seq = prb_first_valid_seq(prb);
+
+ cookie = console_srcu_read_lock();
+ for_each_console_srcu(c) {
+ /*
+ * If the above console_trylock() failed, this is an
+ * unsynchronized assignment. But in that case, the
+ * kernel is in "hope and pray" mode anyway.
+ */
+ c->seq = seq;
+ }
+ console_srcu_read_unlock(cookie);
}
console_unlock();
}
@@ -2745,15 +3123,25 @@ struct tty_driver *console_device(int *index)
{
struct console *c;
struct tty_driver *driver = NULL;
+ int cookie;
+ /*
+ * Take console_lock to serialize device() callback with
+ * other console operations. For example, fg_console is
+ * modified under console_lock when switching vt.
+ */
console_lock();
- for_each_console(c) {
+
+ cookie = console_srcu_read_lock();
+ for_each_console_srcu(c) {
if (!c->device)
continue;
driver = c->device(c, index);
if (driver)
break;
}
+ console_srcu_read_unlock(cookie);
+
console_unlock();
return driver;
}
@@ -2765,17 +3153,27 @@ struct tty_driver *console_device(int *index)
*/
void console_stop(struct console *console)
{
- console_lock();
- console->flags &= ~CON_ENABLED;
- console_unlock();
+ __pr_flush(console, 1000, true);
+ console_list_lock();
+ console_srcu_write_flags(console, console->flags & ~CON_ENABLED);
+ console_list_unlock();
+
+ /*
+ * Ensure that all SRCU list walks have completed. All contexts must
+ * be able to see that this console is disabled so that (for example)
+ * the caller can suspend the port without risk of another context
+ * using the port.
+ */
+ synchronize_srcu(&console_srcu);
}
EXPORT_SYMBOL(console_stop);
void console_start(struct console *console)
{
- console_lock();
- console->flags |= CON_ENABLED;
- console_unlock();
+ console_list_lock();
+ console_srcu_write_flags(console, console->flags | CON_ENABLED);
+ console_list_unlock();
+ __pr_flush(console, 1000, true);
}
EXPORT_SYMBOL(console_start);
@@ -2800,7 +3198,8 @@ early_param("keep_bootcon", keep_bootcon_setup);
* Care need to be taken with consoles that are statically
* enabled such as netconsole
*/
-static int try_enable_new_console(struct console *newcon, bool user_specified)
+static int try_enable_preferred_console(struct console *newcon,
+ bool user_specified)
{
struct console_cmdline *c;
int i, err;
@@ -2830,10 +3229,8 @@ static int try_enable_new_console(struct console *newcon, bool user_specified)
return err;
}
newcon->flags |= CON_ENABLED;
- if (i == preferred_console) {
+ if (i == preferred_console)
newcon->flags |= CON_CONSDEV;
- has_preferred_console = true;
- }
return 0;
}
@@ -2848,6 +3245,92 @@ static int try_enable_new_console(struct console *newcon, bool user_specified)
return -ENOENT;
}
+/* Try to enable the console unconditionally */
+static void try_enable_default_console(struct console *newcon)
+{
+ if (newcon->index < 0)
+ newcon->index = 0;
+
+ if (newcon->setup && newcon->setup(newcon, NULL) != 0)
+ return;
+
+ newcon->flags |= CON_ENABLED;
+
+ if (newcon->device)
+ newcon->flags |= CON_CONSDEV;
+}
+
+#define con_printk(lvl, con, fmt, ...) \
+ printk(lvl pr_fmt("%sconsole [%s%d] " fmt), \
+ (con->flags & CON_BOOT) ? "boot" : "", \
+ con->name, con->index, ##__VA_ARGS__)
+
+static void console_init_seq(struct console *newcon, bool bootcon_registered)
+{
+ struct console *con;
+ bool handover;
+
+ if (newcon->flags & (CON_PRINTBUFFER | CON_BOOT)) {
+ /* Get a consistent copy of @syslog_seq. */
+ mutex_lock(&syslog_lock);
+ newcon->seq = syslog_seq;
+ mutex_unlock(&syslog_lock);
+ } else {
+ /* Begin with next message added to ringbuffer. */
+ newcon->seq = prb_next_seq(prb);
+
+ /*
+ * If any enabled boot consoles are due to be unregistered
+ * shortly, some may not be caught up and may be the same
+ * device as @newcon. Since it is not known which boot console
+ * is the same device, flush all consoles and, if necessary,
+ * start with the message of the enabled boot console that is
+ * the furthest behind.
+ */
+ if (bootcon_registered && !keep_bootcon) {
+ /*
+ * Hold the console_lock to stop console printing and
+ * guarantee safe access to console->seq.
+ */
+ console_lock();
+
+ /*
+ * Flush all consoles and set the console to start at
+ * the next unprinted sequence number.
+ */
+ if (!console_flush_all(true, &newcon->seq, &handover)) {
+ /*
+ * Flushing failed. Just choose the lowest
+ * sequence of the enabled boot consoles.
+ */
+
+ /*
+ * If there was a handover, this context no
+ * longer holds the console_lock.
+ */
+ if (handover)
+ console_lock();
+
+ newcon->seq = prb_next_seq(prb);
+ for_each_console(con) {
+ if ((con->flags & CON_BOOT) &&
+ (con->flags & CON_ENABLED) &&
+ con->seq < newcon->seq) {
+ newcon->seq = con->seq;
+ }
+ }
+ }
+
+ console_unlock();
+ }
+ }
+}
+
+#define console_first() \
+ hlist_entry(console_list.first, struct console, node)
+
+static int unregister_console_locked(struct console *console);
+
/*
* The console driver calls this routine during kernel initialization
* to register the console printing procedure with printk() and to
@@ -2869,64 +3352,61 @@ static int try_enable_new_console(struct console *newcon, bool user_specified)
*/
void register_console(struct console *newcon)
{
- unsigned long flags;
- struct console *bcon = NULL;
+ struct console *con;
+ bool bootcon_registered = false;
+ bool realcon_registered = false;
int err;
- for_each_console(bcon) {
- if (WARN(bcon == newcon, "console '%s%d' already registered\n",
- bcon->name, bcon->index))
- return;
- }
+ console_list_lock();
- /*
- * before we register a new CON_BOOT console, make sure we don't
- * already have a valid console
- */
- if (newcon->flags & CON_BOOT) {
- for_each_console(bcon) {
- if (!(bcon->flags & CON_BOOT)) {
- pr_info("Too late to register bootconsole %s%d\n",
- newcon->name, newcon->index);
- return;
- }
+ for_each_console(con) {
+ if (WARN(con == newcon, "console '%s%d' already registered\n",
+ con->name, con->index)) {
+ goto unlock;
}
- }
- if (console_drivers && console_drivers->flags & CON_BOOT)
- bcon = console_drivers;
+ if (con->flags & CON_BOOT)
+ bootcon_registered = true;
+ else
+ realcon_registered = true;
+ }
- if (!has_preferred_console || bcon || !console_drivers)
- has_preferred_console = preferred_console >= 0;
+ /* Do not register boot consoles when there already is a real one. */
+ if ((newcon->flags & CON_BOOT) && realcon_registered) {
+ pr_info("Too late to register bootconsole %s%d\n",
+ newcon->name, newcon->index);
+ goto unlock;
+ }
/*
- * See if we want to use this console driver. If we
- * didn't select a console we take the first one
- * that registers here.
+ * See if we want to enable this console driver by default.
+ *
+ * Nope when a console is preferred by the command line, device
+ * tree, or SPCR.
+ *
+ * The first real console with tty binding (driver) wins. More
+ * consoles might get enabled before the right one is found.
+ *
+ * Note that a console with tty binding will have CON_CONSDEV
+ * flag set and will be first in the list.
*/
- if (!has_preferred_console) {
- if (newcon->index < 0)
- newcon->index = 0;
- if (newcon->setup == NULL ||
- newcon->setup(newcon, NULL) == 0) {
- newcon->flags |= CON_ENABLED;
- if (newcon->device) {
- newcon->flags |= CON_CONSDEV;
- has_preferred_console = true;
- }
+ if (preferred_console < 0) {
+ if (hlist_empty(&console_list) || !console_first()->device ||
+ console_first()->flags & CON_BOOT) {
+ try_enable_default_console(newcon);
}
}
/* See if this console matches one we selected on the command line */
- err = try_enable_new_console(newcon, true);
+ err = try_enable_preferred_console(newcon, true);
/* If not, try to match against the platform default(s) */
if (err == -ENOENT)
- err = try_enable_new_console(newcon, false);
+ err = try_enable_preferred_console(newcon, false);
/* printk() messages are not printed to the Braille console. */
if (err || newcon->flags & CON_BRL)
- return;
+ goto unlock;
/*
* If we have a bootconsole, and are switching to a real console,
@@ -2934,51 +3414,38 @@ void register_console(struct console *newcon)
* the real console are the same physical device, it's annoying to
* see the beginning boot messages twice
*/
- if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV))
+ if (bootcon_registered &&
+ ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) {
newcon->flags &= ~CON_PRINTBUFFER;
+ }
+
+ newcon->dropped = 0;
+ console_init_seq(newcon, bootcon_registered);
/*
- * Put this console in the list - keep the
- * preferred driver at the head of the list.
+ * Put this console in the list - keep the
+ * preferred driver at the head of the list.
*/
- console_lock();
- if ((newcon->flags & CON_CONSDEV) || console_drivers == NULL) {
- newcon->next = console_drivers;
- console_drivers = newcon;
- if (newcon->next)
- newcon->next->flags &= ~CON_CONSDEV;
- /* Ensure this flag is always set for the head of the list */
+ if (hlist_empty(&console_list)) {
+ /* Ensure CON_CONSDEV is always set for the head. */
newcon->flags |= CON_CONSDEV;
+ hlist_add_head_rcu(&newcon->node, &console_list);
+
+ } else if (newcon->flags & CON_CONSDEV) {
+ /* Only the new head can have CON_CONSDEV set. */
+ console_srcu_write_flags(console_first(), console_first()->flags & ~CON_CONSDEV);
+ hlist_add_head_rcu(&newcon->node, &console_list);
+
} else {
- newcon->next = console_drivers->next;
- console_drivers->next = newcon;
+ hlist_add_behind_rcu(&newcon->node, console_list.first);
}
- if (newcon->flags & CON_EXTENDED)
- nr_ext_console_drivers++;
-
- if (newcon->flags & CON_PRINTBUFFER) {
- /*
- * console_unlock(); will print out the buffered messages
- * for us.
- *
- * We're about to replay the log buffer. Only do this to the
- * just-registered console to avoid excessive message spam to
- * the already-registered consoles.
- *
- * Set exclusive_console with disabled interrupts to reduce
- * race window with eventual console_flush_on_panic() that
- * ignores console_lock.
- */
- exclusive_console = newcon;
- exclusive_console_stop_seq = console_seq;
+ /*
+ * No need to synchronize SRCU here! The caller does not rely
+ * on all contexts being able to see the new console before
+ * register_console() completes.
+ */
- /* Get a consistent copy of @syslog_seq. */
- raw_spin_lock_irqsave(&syslog_lock, flags);
- console_seq = syslog_seq;
- raw_spin_unlock_irqrestore(&syslog_lock, flags);
- }
- console_unlock();
console_sysfs_notify();
/*
@@ -2988,30 +3455,30 @@ void register_console(struct console *newcon)
* users know there might be something in the kernel's log buffer that
* went to the bootconsole (that they do not see on the real console)
*/
- pr_info("%sconsole [%s%d] enabled\n",
- (newcon->flags & CON_BOOT) ? "boot" : "" ,
- newcon->name, newcon->index);
- if (bcon &&
+ con_printk(KERN_INFO, newcon, "enabled\n");
+ if (bootcon_registered &&
((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV) &&
!keep_bootcon) {
- /* We need to iterate through all boot consoles, to make
- * sure we print everything out, before we unregister them.
- */
- for_each_console(bcon)
- if (bcon->flags & CON_BOOT)
- unregister_console(bcon);
+ struct hlist_node *tmp;
+
+ hlist_for_each_entry_safe(con, tmp, &console_list, node) {
+ if (con->flags & CON_BOOT)
+ unregister_console_locked(con);
+ }
}
+unlock:
+ console_list_unlock();
}
EXPORT_SYMBOL(register_console);
-int unregister_console(struct console *console)
+/* Must be called under console_list_lock(). */
+static int unregister_console_locked(struct console *console)
{
- struct console *con;
int res;
- pr_info("%sconsole [%s%d] disabled\n",
- (console->flags & CON_BOOT) ? "boot" : "" ,
- console->name, console->index);
+ lockdep_assert_console_list_lock_held();
+
+ con_printk(KERN_INFO, console, "disabled\n");
res = _braille_unregister_console(console);
if (res < 0)
@@ -3019,51 +3486,94 @@ int unregister_console(struct console *console)
if (res > 0)
return 0;
- res = -ENODEV;
- console_lock();
- if (console_drivers == console) {
- console_drivers=console->next;
- res = 0;
- } else {
- for_each_console(con) {
- if (con->next == console) {
- con->next = console->next;
- res = 0;
- break;
- }
- }
- }
+ /* Disable it unconditionally */
+ console_srcu_write_flags(console, console->flags & ~CON_ENABLED);
- if (res)
- goto out_disable_unlock;
+ if (!console_is_registered_locked(console))
+ return -ENODEV;
- if (console->flags & CON_EXTENDED)
- nr_ext_console_drivers--;
+ hlist_del_init_rcu(&console->node);
/*
+ * <HISTORICAL>
* If this isn't the last console and it has CON_CONSDEV set, we
* need to set it on the next preferred console.
+ * </HISTORICAL>
+ *
+ * The above makes no sense as there is no guarantee that the next
+ * console has any device attached. Oh well....
*/
- if (console_drivers != NULL && console->flags & CON_CONSDEV)
- console_drivers->flags |= CON_CONSDEV;
+ if (!hlist_empty(&console_list) && console->flags & CON_CONSDEV)
+ console_srcu_write_flags(console_first(), console_first()->flags | CON_CONSDEV);
+
+ /*
+ * Ensure that all SRCU list walks have completed. All contexts
+ * must not be able to see this console in the list so that any
+ * exit/cleanup routines can be performed safely.
+ */
+ synchronize_srcu(&console_srcu);
- console->flags &= ~CON_ENABLED;
- console_unlock();
console_sysfs_notify();
if (console->exit)
res = console->exit(console);
return res;
+}
-out_disable_unlock:
- console->flags &= ~CON_ENABLED;
- console_unlock();
+int unregister_console(struct console *console)
+{
+ int res;
+ console_list_lock();
+ res = unregister_console_locked(console);
+ console_list_unlock();
return res;
}
EXPORT_SYMBOL(unregister_console);
+/**
+ * console_force_preferred_locked - force a registered console preferred
+ * @con: The registered console to force preferred.
+ *
+ * Must be called under console_list_lock().
+ */
+void console_force_preferred_locked(struct console *con)
+{
+ struct console *cur_pref_con;
+
+ if (!console_is_registered_locked(con))
+ return;
+
+ cur_pref_con = console_first();
+
+ /* Already preferred? */
+ if (cur_pref_con == con)
+ return;
+
+ /*
+ * Delete, but do not re-initialize the entry. This allows the console
+ * to continue to appear registered (via any hlist_unhashed_lockless()
+ * checks), even though it was briefly removed from the console list.
+ */
+ hlist_del_rcu(&con->node);
+
+ /*
+ * Ensure that all SRCU list walks have completed so that the console
+ * can be added to the beginning of the console list and its forward
+ * list pointer can be re-initialized.
+ */
+ synchronize_srcu(&console_srcu);
+
+ con->flags |= CON_CONSDEV;
+ WARN_ON(!con->device);
+
+ /* Only the new head can have CON_CONSDEV set. */
+ console_srcu_write_flags(cur_pref_con, cur_pref_con->flags & ~CON_CONSDEV);
+ hlist_add_head_rcu(&con->node, &console_list);
+}
+EXPORT_SYMBOL(console_force_preferred_locked);
+
/*
* Initialize the console device. This is called *early*, so
* we can't necessarily depend on lots of kernel help here.
@@ -3110,10 +3620,12 @@ void __init console_init(void)
*/
static int __init printk_late_init(void)
{
+ struct hlist_node *tmp;
struct console *con;
int ret;
- for_each_console(con) {
+ console_list_lock();
+ hlist_for_each_entry_safe(con, tmp, &console_list, node) {
if (!(con->flags & CON_BOOT))
continue;
@@ -3130,20 +3642,112 @@ static int __init printk_late_init(void)
*/
pr_warn("bootconsole [%s%d] uses init memory and must be disabled even before the real one is ready\n",
con->name, con->index);
- unregister_console(con);
+ unregister_console_locked(con);
}
}
+ console_list_unlock();
+
ret = cpuhp_setup_state_nocalls(CPUHP_PRINTK_DEAD, "printk:dead", NULL,
console_cpu_notify);
WARN_ON(ret < 0);
ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "printk:online",
console_cpu_notify, NULL);
WARN_ON(ret < 0);
+ printk_sysctl_init();
return 0;
}
late_initcall(printk_late_init);
#if defined CONFIG_PRINTK
+/* If @con is specified, only wait for that console. Otherwise wait for all. */
+static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress)
+{
+ int remaining = timeout_ms;
+ struct console *c;
+ u64 last_diff = 0;
+ u64 printk_seq;
+ int cookie;
+ u64 diff;
+ u64 seq;
+
+ might_sleep();
+
+ seq = prb_next_seq(prb);
+
+ for (;;) {
+ diff = 0;
+
+ /*
+ * Hold the console_lock to guarantee safe access to
+ * console->seq and to prevent changes to @console_suspended
+ * until all consoles have been processed.
+ */
+ console_lock();
+
+ cookie = console_srcu_read_lock();
+ for_each_console_srcu(c) {
+ if (con && con != c)
+ continue;
+ if (!console_is_usable(c))
+ continue;
+ printk_seq = c->seq;
+ if (printk_seq < seq)
+ diff += seq - printk_seq;
+ }
+ console_srcu_read_unlock(cookie);
+
+ /*
+ * If consoles are suspended, it cannot be expected that they
+ * make forward progress, so timeout immediately. @diff is
+ * still used to return a valid flush status.
+ */
+ if (console_suspended)
+ remaining = 0;
+ else if (diff != last_diff && reset_on_progress)
+ remaining = timeout_ms;
+
+ console_unlock();
+
+ if (diff == 0 || remaining == 0)
+ break;
+
+ if (remaining < 0) {
+ /* no timeout limit */
+ msleep(100);
+ } else if (remaining < 100) {
+ msleep(remaining);
+ remaining = 0;
+ } else {
+ msleep(100);
+ remaining -= 100;
+ }
+
+ last_diff = diff;
+ }
+
+ return (diff == 0);
+}
+
+/**
+ * pr_flush() - Wait for printing threads to catch up.
+ *
+ * @timeout_ms: The maximum time (in ms) to wait.
+ * @reset_on_progress: Reset the timeout if forward progress is seen.
+ *
+ * A value of 0 for @timeout_ms means no waiting will occur. A value of -1
+ * represents infinite waiting.
+ *
+ * If @reset_on_progress is true, the timeout will be reset whenever any
+ * printer has been seen to make some forward progress.
+ *
+ * Context: Process context. May sleep while acquiring console lock.
+ * Return: true if all enabled printers are caught up.
+ */
+static bool pr_flush(int timeout_ms, bool reset_on_progress)
+{
+ return __pr_flush(NULL, timeout_ms, reset_on_progress);
+}
+
/*
* Delayed printk version, for scheduler-internal messages:
*/
@@ -3154,7 +3758,7 @@ static DEFINE_PER_CPU(int, printk_pending);
static void wake_up_klogd_work_func(struct irq_work *irq_work)
{
- int pending = __this_cpu_xchg(printk_pending, 0);
+ int pending = this_cpu_xchg(printk_pending, 0);
if (pending & PRINTK_PENDING_OUTPUT) {
/* If trylock fails, someone else is doing the printing */
@@ -3169,28 +3773,48 @@ static void wake_up_klogd_work_func(struct irq_work *irq_work)
static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) =
IRQ_WORK_INIT_LAZY(wake_up_klogd_work_func);
-void wake_up_klogd(void)
+static void __wake_up_klogd(int val)
{
if (!printk_percpu_data_ready())
return;
preempt_disable();
- if (waitqueue_active(&log_wait)) {
- this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP);
+ /*
+ * Guarantee any new records can be seen by tasks preparing to wait
+ * before this context checks if the wait queue is empty.
+ *
+ * The full memory barrier within wq_has_sleeper() pairs with the full
+ * memory barrier within set_current_state() of
+ * prepare_to_wait_event(), which is called after ___wait_event() adds
+ * the waiter but before it has checked the wait condition.
+ *
+ * This pairs with devkmsg_read:A and syslog_print:A.
+ */
+ if (wq_has_sleeper(&log_wait) || /* LMM(__wake_up_klogd:A) */
+ (val & PRINTK_PENDING_OUTPUT)) {
+ this_cpu_or(printk_pending, val);
irq_work_queue(this_cpu_ptr(&wake_up_klogd_work));
}
preempt_enable();
}
+void wake_up_klogd(void)
+{
+ __wake_up_klogd(PRINTK_PENDING_WAKEUP);
+}
+
void defer_console_output(void)
{
- if (!printk_percpu_data_ready())
- return;
+ /*
+ * New messages may have been added directly to the ringbuffer
+ * using vprintk_store(), so wake any waiters as well.
+ */
+ __wake_up_klogd(PRINTK_PENDING_WAKEUP | PRINTK_PENDING_OUTPUT);
+}
- preempt_disable();
- __this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT);
- irq_work_queue(this_cpu_ptr(&wake_up_klogd_work));
- preempt_enable();
+void printk_trigger_flush(void)
+{
+ defer_console_output();
}
int vprintk_deferred(const char *fmt, va_list args)
@@ -3203,7 +3827,7 @@ int vprintk_deferred(const char *fmt, va_list args)
return r;
}
-int printk_deferred(const char *fmt, ...)
+int _printk_deferred(const char *fmt, ...)
{
va_list args;
int r;
@@ -3386,14 +4010,12 @@ bool kmsg_dump_get_line(struct kmsg_dump_iter *iter, bool syslog,
struct printk_info info;
unsigned int line_count;
struct printk_record r;
- unsigned long flags;
size_t l = 0;
bool ret = false;
if (iter->cur_seq < min_seq)
iter->cur_seq = min_seq;
- printk_safe_enter_irqsave(flags);
prb_rec_init_rd(&r, &info, line, size);
/* Read text or count text lines? */
@@ -3414,7 +4036,6 @@ bool kmsg_dump_get_line(struct kmsg_dump_iter *iter, bool syslog,
iter->cur_seq = r.info->seq + 1;
ret = true;
out:
- printk_safe_exit_irqrestore(flags);
if (len)
*len = l;
return ret;
@@ -3446,7 +4067,6 @@ bool kmsg_dump_get_buffer(struct kmsg_dump_iter *iter, bool syslog,
u64 min_seq = latched_seq_read_nolock(&clear_seq);
struct printk_info info;
struct printk_record r;
- unsigned long flags;
u64 seq;
u64 next_seq;
size_t len = 0;
@@ -3459,7 +4079,6 @@ bool kmsg_dump_get_buffer(struct kmsg_dump_iter *iter, bool syslog,
if (iter->cur_seq < min_seq)
iter->cur_seq = min_seq;
- printk_safe_enter_irqsave(flags);
if (prb_read_valid_info(prb, iter->cur_seq, &info, NULL)) {
if (info.seq != iter->cur_seq) {
/* messages are gone, move to first available one */
@@ -3468,10 +4087,8 @@ bool kmsg_dump_get_buffer(struct kmsg_dump_iter *iter, bool syslog,
}
/* last entry */
- if (iter->cur_seq >= iter->next_seq) {
- printk_safe_exit_irqrestore(flags);
+ if (iter->cur_seq >= iter->next_seq)
goto out;
- }
/*
* Find first record that fits, including all following records,
@@ -3503,7 +4120,6 @@ bool kmsg_dump_get_buffer(struct kmsg_dump_iter *iter, bool syslog,
iter->next_seq = next_seq;
ret = true;
- printk_safe_exit_irqrestore(flags);
out:
if (len_out)
*len_out = len;
@@ -3521,38 +4137,34 @@ EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer);
*/
void kmsg_dump_rewind(struct kmsg_dump_iter *iter)
{
- unsigned long flags;
-
- printk_safe_enter_irqsave(flags);
iter->cur_seq = latched_seq_read_nolock(&clear_seq);
iter->next_seq = prb_next_seq(prb);
- printk_safe_exit_irqrestore(flags);
}
EXPORT_SYMBOL_GPL(kmsg_dump_rewind);
#endif
#ifdef CONFIG_SMP
-static atomic_t printk_cpulock_owner = ATOMIC_INIT(-1);
-static atomic_t printk_cpulock_nested = ATOMIC_INIT(0);
+static atomic_t printk_cpu_sync_owner = ATOMIC_INIT(-1);
+static atomic_t printk_cpu_sync_nested = ATOMIC_INIT(0);
/**
- * __printk_wait_on_cpu_lock() - Busy wait until the printk cpu-reentrant
- * spinning lock is not owned by any CPU.
+ * __printk_cpu_sync_wait() - Busy wait until the printk cpu-reentrant
+ * spinning lock is not owned by any CPU.
*
* Context: Any context.
*/
-void __printk_wait_on_cpu_lock(void)
+void __printk_cpu_sync_wait(void)
{
do {
cpu_relax();
- } while (atomic_read(&printk_cpulock_owner) != -1);
+ } while (atomic_read(&printk_cpu_sync_owner) != -1);
}
-EXPORT_SYMBOL(__printk_wait_on_cpu_lock);
+EXPORT_SYMBOL(__printk_cpu_sync_wait);
/**
- * __printk_cpu_trylock() - Try to acquire the printk cpu-reentrant
- * spinning lock.
+ * __printk_cpu_sync_try_get() - Try to acquire the printk cpu-reentrant
+ * spinning lock.
*
* If no processor has the lock, the calling processor takes the lock and
* becomes the owner. If the calling processor is already the owner of the
@@ -3561,7 +4173,7 @@ EXPORT_SYMBOL(__printk_wait_on_cpu_lock);
* Context: Any context. Expects interrupts to be disabled.
* Return: 1 on success, otherwise 0.
*/
-int __printk_cpu_trylock(void)
+int __printk_cpu_sync_try_get(void)
{
int cpu;
int old;
@@ -3571,79 +4183,80 @@ int __printk_cpu_trylock(void)
/*
* Guarantee loads and stores from this CPU when it is the lock owner
* are _not_ visible to the previous lock owner. This pairs with
- * __printk_cpu_unlock:B.
+ * __printk_cpu_sync_put:B.
*
* Memory barrier involvement:
*
- * If __printk_cpu_trylock:A reads from __printk_cpu_unlock:B, then
- * __printk_cpu_unlock:A can never read from __printk_cpu_trylock:B.
+ * If __printk_cpu_sync_try_get:A reads from __printk_cpu_sync_put:B,
+ * then __printk_cpu_sync_put:A can never read from
+ * __printk_cpu_sync_try_get:B.
*
* Relies on:
*
- * RELEASE from __printk_cpu_unlock:A to __printk_cpu_unlock:B
+ * RELEASE from __printk_cpu_sync_put:A to __printk_cpu_sync_put:B
* of the previous CPU
* matching
- * ACQUIRE from __printk_cpu_trylock:A to __printk_cpu_trylock:B
- * of this CPU
+ * ACQUIRE from __printk_cpu_sync_try_get:A to
+ * __printk_cpu_sync_try_get:B of this CPU
*/
- old = atomic_cmpxchg_acquire(&printk_cpulock_owner, -1,
- cpu); /* LMM(__printk_cpu_trylock:A) */
+ old = atomic_cmpxchg_acquire(&printk_cpu_sync_owner, -1,
+ cpu); /* LMM(__printk_cpu_sync_try_get:A) */
if (old == -1) {
/*
* This CPU is now the owner and begins loading/storing
- * data: LMM(__printk_cpu_trylock:B)
+ * data: LMM(__printk_cpu_sync_try_get:B)
*/
return 1;
} else if (old == cpu) {
/* This CPU is already the owner. */
- atomic_inc(&printk_cpulock_nested);
+ atomic_inc(&printk_cpu_sync_nested);
return 1;
}
return 0;
}
-EXPORT_SYMBOL(__printk_cpu_trylock);
+EXPORT_SYMBOL(__printk_cpu_sync_try_get);
/**
- * __printk_cpu_unlock() - Release the printk cpu-reentrant spinning lock.
+ * __printk_cpu_sync_put() - Release the printk cpu-reentrant spinning lock.
*
* The calling processor must be the owner of the lock.
*
* Context: Any context. Expects interrupts to be disabled.
*/
-void __printk_cpu_unlock(void)
+void __printk_cpu_sync_put(void)
{
- if (atomic_read(&printk_cpulock_nested)) {
- atomic_dec(&printk_cpulock_nested);
+ if (atomic_read(&printk_cpu_sync_nested)) {
+ atomic_dec(&printk_cpu_sync_nested);
return;
}
/*
* This CPU is finished loading/storing data:
- * LMM(__printk_cpu_unlock:A)
+ * LMM(__printk_cpu_sync_put:A)
*/
/*
* Guarantee loads and stores from this CPU when it was the
* lock owner are visible to the next lock owner. This pairs
- * with __printk_cpu_trylock:A.
+ * with __printk_cpu_sync_try_get:A.
*
* Memory barrier involvement:
*
- * If __printk_cpu_trylock:A reads from __printk_cpu_unlock:B,
- * then __printk_cpu_trylock:B reads from __printk_cpu_unlock:A.
+ * If __printk_cpu_sync_try_get:A reads from __printk_cpu_sync_put:B,
+ * then __printk_cpu_sync_try_get:B reads from __printk_cpu_sync_put:A.
*
* Relies on:
*
- * RELEASE from __printk_cpu_unlock:A to __printk_cpu_unlock:B
+ * RELEASE from __printk_cpu_sync_put:A to __printk_cpu_sync_put:B
* of this CPU
* matching
- * ACQUIRE from __printk_cpu_trylock:A to __printk_cpu_trylock:B
- * of the next CPU
+ * ACQUIRE from __printk_cpu_sync_try_get:A to
+ * __printk_cpu_sync_try_get:B of the next CPU
*/
- atomic_set_release(&printk_cpulock_owner,
- -1); /* LMM(__printk_cpu_unlock:B) */
+ atomic_set_release(&printk_cpu_sync_owner,
+ -1); /* LMM(__printk_cpu_sync_put:B) */
}
-EXPORT_SYMBOL(__printk_cpu_unlock);
+EXPORT_SYMBOL(__printk_cpu_sync_put);
#endif /* CONFIG_SMP */
diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c
index 8a7b7362c0dd..2dc4d5a1f1ff 100644
--- a/kernel/printk/printk_ringbuffer.c
+++ b/kernel/printk/printk_ringbuffer.c
@@ -203,7 +203,7 @@
* prb_rec_init_wr(&r, 5);
*
* // try to extend, but only if it does not exceed 32 bytes
- * if (prb_reserve_in_last(&e, &test_rb, &r, printk_caller_id()), 32) {
+ * if (prb_reserve_in_last(&e, &test_rb, &r, printk_caller_id(), 32)) {
* snprintf(&r.text_buf[r.info->text_len],
* r.text_buf_size - r.info->text_len, "hello");
*
@@ -474,8 +474,10 @@ static enum desc_state desc_read(struct prb_desc_ring *desc_ring,
* state has been re-checked. A memcpy() for all of @desc
* cannot be used because of the atomic_t @state_var field.
*/
- memcpy(&desc_out->text_blk_lpos, &desc->text_blk_lpos,
- sizeof(desc_out->text_blk_lpos)); /* LMM(desc_read:C) */
+ if (desc_out) {
+ memcpy(&desc_out->text_blk_lpos, &desc->text_blk_lpos,
+ sizeof(desc_out->text_blk_lpos)); /* LMM(desc_read:C) */
+ }
if (seq_out)
*seq_out = info->seq; /* also part of desc_read:C */
if (caller_id_out)
@@ -528,7 +530,8 @@ static enum desc_state desc_read(struct prb_desc_ring *desc_ring,
state_val = atomic_long_read(state_var); /* LMM(desc_read:E) */
d_state = get_desc_state(id, state_val);
out:
- atomic_long_set(&desc_out->state_var, state_val);
+ if (desc_out)
+ atomic_long_set(&desc_out->state_var, state_val);
return d_state;
}
@@ -1449,6 +1452,9 @@ static void desc_make_final(struct prb_desc_ring *desc_ring, unsigned long id)
atomic_long_cmpxchg_relaxed(&d->state_var, prev_state_val,
DESC_SV(id, desc_finalized)); /* LMM(desc_make_final:A) */
+
+ /* Best effort to remember the last finalized @id. */
+ atomic_long_set(&desc_ring->last_finalized_id, id);
}
/**
@@ -1657,7 +1663,12 @@ void prb_commit(struct prb_reserved_entry *e)
*/
void prb_final_commit(struct prb_reserved_entry *e)
{
+ struct prb_desc_ring *desc_ring = &e->rb->desc_ring;
+
_prb_commit(e, desc_finalized);
+
+ /* Best effort to remember the last finalized @id. */
+ atomic_long_set(&desc_ring->last_finalized_id, e->id);
}
/*
@@ -2005,9 +2016,39 @@ u64 prb_first_valid_seq(struct printk_ringbuffer *rb)
*/
u64 prb_next_seq(struct printk_ringbuffer *rb)
{
- u64 seq = 0;
+ struct prb_desc_ring *desc_ring = &rb->desc_ring;
+ enum desc_state d_state;
+ unsigned long id;
+ u64 seq;
+
+ /* Check if the cached @id still points to a valid @seq. */
+ id = atomic_long_read(&desc_ring->last_finalized_id);
+ d_state = desc_read(desc_ring, id, NULL, &seq, NULL);
- /* Search forward from the oldest descriptor. */
+ if (d_state == desc_finalized || d_state == desc_reusable) {
+ /*
+ * Begin searching after the last finalized record.
+ *
+ * On 0, the search must begin at 0 because of hack#2
+ * of the bootstrapping phase it is not known if a
+ * record at index 0 exists.
+ */
+ if (seq != 0)
+ seq++;
+ } else {
+ /*
+ * The information about the last finalized sequence number
+ * has gone. It should happen only when there is a flood of
+ * new messages and the ringbuffer is rapidly recycled.
+ * Give up and start from the beginning.
+ */
+ seq = 0;
+ }
+
+ /*
+ * The information about the last finalized @seq might be inaccurate.
+ * Search forward to find the current one.
+ */
while (_prb_read_valid(rb, &seq, NULL, NULL))
seq++;
@@ -2044,6 +2085,7 @@ void prb_init(struct printk_ringbuffer *rb,
rb->desc_ring.infos = infos;
atomic_long_set(&rb->desc_ring.head_id, DESC0_ID(descbits));
atomic_long_set(&rb->desc_ring.tail_id, DESC0_ID(descbits));
+ atomic_long_set(&rb->desc_ring.last_finalized_id, DESC0_ID(descbits));
rb->text_data_ring.size_bits = textbits;
rb->text_data_ring.data = text_buf;
diff --git a/kernel/printk/printk_ringbuffer.h b/kernel/printk/printk_ringbuffer.h
index 73cc80e01cef..18cd25e489b8 100644
--- a/kernel/printk/printk_ringbuffer.h
+++ b/kernel/printk/printk_ringbuffer.h
@@ -75,6 +75,7 @@ struct prb_desc_ring {
struct printk_info *infos;
atomic_long_t head_id;
atomic_long_t tail_id;
+ atomic_long_t last_finalized_id;
};
/*
@@ -258,6 +259,7 @@ static struct printk_ringbuffer name = { \
.infos = &_##name##_infos[0], \
.head_id = ATOMIC_INIT(DESC0_ID(descbits)), \
.tail_id = ATOMIC_INIT(DESC0_ID(descbits)), \
+ .last_finalized_id = ATOMIC_INIT(DESC0_ID(descbits)), \
}, \
.text_data_ring = { \
.size_bits = (avgtextbits) + (descbits), \
diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c
index 94232186fccb..ef0f9a2044da 100644
--- a/kernel/printk/printk_safe.c
+++ b/kernel/printk/printk_safe.c
@@ -4,347 +4,16 @@
*/
#include <linux/preempt.h>
-#include <linux/spinlock.h>
-#include <linux/debug_locks.h>
#include <linux/kdb.h>
#include <linux/smp.h>
#include <linux/cpumask.h>
-#include <linux/irq_work.h>
#include <linux/printk.h>
#include <linux/kprobes.h>
#include "internal.h"
-/*
- * In NMI and safe mode, printk() avoids taking locks. Instead,
- * it uses an alternative implementation that temporary stores
- * the strings into a per-CPU buffer. The content of the buffer
- * is later flushed into the main ring buffer via IRQ work.
- *
- * The alternative implementation is chosen transparently
- * by examining current printk() context mask stored in @printk_context
- * per-CPU variable.
- *
- * The implementation allows to flush the strings also from another CPU.
- * There are situations when we want to make sure that all buffers
- * were handled or when IRQs are blocked.
- */
-
-#define SAFE_LOG_BUF_LEN ((1 << CONFIG_PRINTK_SAFE_LOG_BUF_SHIFT) - \
- sizeof(atomic_t) - \
- sizeof(atomic_t) - \
- sizeof(struct irq_work))
-
-struct printk_safe_seq_buf {
- atomic_t len; /* length of written data */
- atomic_t message_lost;
- struct irq_work work; /* IRQ work that flushes the buffer */
- unsigned char buffer[SAFE_LOG_BUF_LEN];
-};
-
-static DEFINE_PER_CPU(struct printk_safe_seq_buf, safe_print_seq);
static DEFINE_PER_CPU(int, printk_context);
-static DEFINE_RAW_SPINLOCK(safe_read_lock);
-
-#ifdef CONFIG_PRINTK_NMI
-static DEFINE_PER_CPU(struct printk_safe_seq_buf, nmi_print_seq);
-#endif
-
-/* Get flushed in a more safe context. */
-static void queue_flush_work(struct printk_safe_seq_buf *s)
-{
- if (printk_percpu_data_ready())
- irq_work_queue(&s->work);
-}
-
-/*
- * Add a message to per-CPU context-dependent buffer. NMI and printk-safe
- * have dedicated buffers, because otherwise printk-safe preempted by
- * NMI-printk would have overwritten the NMI messages.
- *
- * The messages are flushed from irq work (or from panic()), possibly,
- * from other CPU, concurrently with printk_safe_log_store(). Should this
- * happen, printk_safe_log_store() will notice the buffer->len mismatch
- * and repeat the write.
- */
-static __printf(2, 0) int printk_safe_log_store(struct printk_safe_seq_buf *s,
- const char *fmt, va_list args)
-{
- int add;
- size_t len;
- va_list ap;
-
-again:
- len = atomic_read(&s->len);
-
- /* The trailing '\0' is not counted into len. */
- if (len >= sizeof(s->buffer) - 1) {
- atomic_inc(&s->message_lost);
- queue_flush_work(s);
- return 0;
- }
-
- /*
- * Make sure that all old data have been read before the buffer
- * was reset. This is not needed when we just append data.
- */
- if (!len)
- smp_rmb();
-
- va_copy(ap, args);
- add = vscnprintf(s->buffer + len, sizeof(s->buffer) - len, fmt, ap);
- va_end(ap);
- if (!add)
- return 0;
-
- /*
- * Do it once again if the buffer has been flushed in the meantime.
- * Note that atomic_cmpxchg() is an implicit memory barrier that
- * makes sure that the data were written before updating s->len.
- */
- if (atomic_cmpxchg(&s->len, len, len + add) != len)
- goto again;
-
- queue_flush_work(s);
- return add;
-}
-
-static inline void printk_safe_flush_line(const char *text, int len)
-{
- /*
- * Avoid any console drivers calls from here, because we may be
- * in NMI or printk_safe context (when in panic). The messages
- * must go only into the ring buffer at this stage. Consoles will
- * get explicitly called later when a crashdump is not generated.
- */
- printk_deferred("%.*s", len, text);
-}
-
-/* printk part of the temporary buffer line by line */
-static int printk_safe_flush_buffer(const char *start, size_t len)
-{
- const char *c, *end;
- bool header;
-
- c = start;
- end = start + len;
- header = true;
-
- /* Print line by line. */
- while (c < end) {
- if (*c == '\n') {
- printk_safe_flush_line(start, c - start + 1);
- start = ++c;
- header = true;
- continue;
- }
-
- /* Handle continuous lines or missing new line. */
- if ((c + 1 < end) && printk_get_level(c)) {
- if (header) {
- c = printk_skip_level(c);
- continue;
- }
-
- printk_safe_flush_line(start, c - start);
- start = c++;
- header = true;
- continue;
- }
-
- header = false;
- c++;
- }
-
- /* Check if there was a partial line. Ignore pure header. */
- if (start < end && !header) {
- static const char newline[] = KERN_CONT "\n";
-
- printk_safe_flush_line(start, end - start);
- printk_safe_flush_line(newline, strlen(newline));
- }
-
- return len;
-}
-
-static void report_message_lost(struct printk_safe_seq_buf *s)
-{
- int lost = atomic_xchg(&s->message_lost, 0);
-
- if (lost)
- printk_deferred("Lost %d message(s)!\n", lost);
-}
-
-/*
- * Flush data from the associated per-CPU buffer. The function
- * can be called either via IRQ work or independently.
- */
-static void __printk_safe_flush(struct irq_work *work)
-{
- struct printk_safe_seq_buf *s =
- container_of(work, struct printk_safe_seq_buf, work);
- unsigned long flags;
- size_t len;
- int i;
-
- /*
- * The lock has two functions. First, one reader has to flush all
- * available message to make the lockless synchronization with
- * writers easier. Second, we do not want to mix messages from
- * different CPUs. This is especially important when printing
- * a backtrace.
- */
- raw_spin_lock_irqsave(&safe_read_lock, flags);
-
- i = 0;
-more:
- len = atomic_read(&s->len);
-
- /*
- * This is just a paranoid check that nobody has manipulated
- * the buffer an unexpected way. If we printed something then
- * @len must only increase. Also it should never overflow the
- * buffer size.
- */
- if ((i && i >= len) || len > sizeof(s->buffer)) {
- const char *msg = "printk_safe_flush: internal error\n";
-
- printk_safe_flush_line(msg, strlen(msg));
- len = 0;
- }
-
- if (!len)
- goto out; /* Someone else has already flushed the buffer. */
-
- /* Make sure that data has been written up to the @len */
- smp_rmb();
- i += printk_safe_flush_buffer(s->buffer + i, len - i);
-
- /*
- * Check that nothing has got added in the meantime and truncate
- * the buffer. Note that atomic_cmpxchg() is an implicit memory
- * barrier that makes sure that the data were copied before
- * updating s->len.
- */
- if (atomic_cmpxchg(&s->len, len, 0) != len)
- goto more;
-
-out:
- report_message_lost(s);
- raw_spin_unlock_irqrestore(&safe_read_lock, flags);
-}
-
-/**
- * printk_safe_flush - flush all per-cpu nmi buffers.
- *
- * The buffers are flushed automatically via IRQ work. This function
- * is useful only when someone wants to be sure that all buffers have
- * been flushed at some point.
- */
-void printk_safe_flush(void)
-{
- int cpu;
-
- for_each_possible_cpu(cpu) {
-#ifdef CONFIG_PRINTK_NMI
- __printk_safe_flush(&per_cpu(nmi_print_seq, cpu).work);
-#endif
- __printk_safe_flush(&per_cpu(safe_print_seq, cpu).work);
- }
-}
-
-/**
- * printk_safe_flush_on_panic - flush all per-cpu nmi buffers when the system
- * goes down.
- *
- * Similar to printk_safe_flush() but it can be called even in NMI context when
- * the system goes down. It does the best effort to get NMI messages into
- * the main ring buffer.
- *
- * Note that it could try harder when there is only one CPU online.
- */
-void printk_safe_flush_on_panic(void)
-{
- /*
- * Make sure that we could access the safe buffers.
- * Do not risk a double release when more CPUs are up.
- */
- if (raw_spin_is_locked(&safe_read_lock)) {
- if (num_online_cpus() > 1)
- return;
-
- debug_locks_off();
- raw_spin_lock_init(&safe_read_lock);
- }
-
- printk_safe_flush();
-}
-
-#ifdef CONFIG_PRINTK_NMI
-/*
- * Safe printk() for NMI context. It uses a per-CPU buffer to
- * store the message. NMIs are not nested, so there is always only
- * one writer running. But the buffer might get flushed from another
- * CPU, so we need to be careful.
- */
-static __printf(1, 0) int vprintk_nmi(const char *fmt, va_list args)
-{
- struct printk_safe_seq_buf *s = this_cpu_ptr(&nmi_print_seq);
-
- return printk_safe_log_store(s, fmt, args);
-}
-
-void noinstr printk_nmi_enter(void)
-{
- this_cpu_add(printk_context, PRINTK_NMI_CONTEXT_OFFSET);
-}
-
-void noinstr printk_nmi_exit(void)
-{
- this_cpu_sub(printk_context, PRINTK_NMI_CONTEXT_OFFSET);
-}
-
-/*
- * Marks a code that might produce many messages in NMI context
- * and the risk of losing them is more critical than eventual
- * reordering.
- *
- * It has effect only when called in NMI context. Then printk()
- * will store the messages into the main logbuf directly.
- */
-void printk_nmi_direct_enter(void)
-{
- if (this_cpu_read(printk_context) & PRINTK_NMI_CONTEXT_MASK)
- this_cpu_or(printk_context, PRINTK_NMI_DIRECT_CONTEXT_MASK);
-}
-
-void printk_nmi_direct_exit(void)
-{
- this_cpu_and(printk_context, ~PRINTK_NMI_DIRECT_CONTEXT_MASK);
-}
-
-#else
-
-static __printf(1, 0) int vprintk_nmi(const char *fmt, va_list args)
-{
- return 0;
-}
-
-#endif /* CONFIG_PRINTK_NMI */
-
-/*
- * Lock-less printk(), to avoid deadlocks should the printk() recurse
- * into itself. It uses a per-CPU buffer to store the message, just like
- * NMI.
- */
-static __printf(1, 0) int vprintk_safe(const char *fmt, va_list args)
-{
- struct printk_safe_seq_buf *s = this_cpu_ptr(&safe_print_seq);
-
- return printk_safe_log_store(s, fmt, args);
-}
-
/* Can be preempted by NMI. */
void __printk_safe_enter(void)
{
@@ -369,46 +38,15 @@ asmlinkage int vprintk(const char *fmt, va_list args)
* Use the main logbuf even in NMI. But avoid calling console
* drivers that might have their own locks.
*/
- if ((this_cpu_read(printk_context) & PRINTK_NMI_DIRECT_CONTEXT_MASK)) {
- unsigned long flags;
+ if (this_cpu_read(printk_context) || in_nmi()) {
int len;
- printk_safe_enter_irqsave(flags);
len = vprintk_store(0, LOGLEVEL_DEFAULT, NULL, fmt, args);
- printk_safe_exit_irqrestore(flags);
defer_console_output();
return len;
}
- /* Use extra buffer in NMI. */
- if (this_cpu_read(printk_context) & PRINTK_NMI_CONTEXT_MASK)
- return vprintk_nmi(fmt, args);
-
- /* Use extra buffer to prevent a recursion deadlock in safe mode. */
- if (this_cpu_read(printk_context) & PRINTK_SAFE_CONTEXT_MASK)
- return vprintk_safe(fmt, args);
-
/* No obstacles. */
return vprintk_default(fmt, args);
}
EXPORT_SYMBOL(vprintk);
-
-void __init printk_safe_init(void)
-{
- int cpu;
-
- for_each_possible_cpu(cpu) {
- struct printk_safe_seq_buf *s;
-
- s = &per_cpu(safe_print_seq, cpu);
- init_irq_work(&s->work, __printk_safe_flush);
-
-#ifdef CONFIG_PRINTK_NMI
- s = &per_cpu(nmi_print_seq, cpu);
- init_irq_work(&s->work, __printk_safe_flush);
-#endif
- }
-
- /* Flush pending messages that did not have scheduled IRQ works. */
- printk_safe_flush();
-}
diff --git a/kernel/printk/sysctl.c b/kernel/printk/sysctl.c
new file mode 100644
index 000000000000..c228343eeb97
--- /dev/null
+++ b/kernel/printk/sysctl.c
@@ -0,0 +1,85 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * sysctl.c: General linux system control interface
+ */
+
+#include <linux/sysctl.h>
+#include <linux/printk.h>
+#include <linux/capability.h>
+#include <linux/ratelimit.h>
+#include "internal.h"
+
+static const int ten_thousand = 10000;
+
+static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ if (write && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+}
+
+static struct ctl_table printk_sysctls[] = {
+ {
+ .procname = "printk",
+ .data = &console_loglevel,
+ .maxlen = 4*sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "printk_ratelimit",
+ .data = &printk_ratelimit_state.interval,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "printk_ratelimit_burst",
+ .data = &printk_ratelimit_state.burst,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "printk_delay",
+ .data = &printk_delay_msec,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = (void *)&ten_thousand,
+ },
+ {
+ .procname = "printk_devkmsg",
+ .data = devkmsg_log_str,
+ .maxlen = DEVKMSG_STR_MAX_SIZE,
+ .mode = 0644,
+ .proc_handler = devkmsg_sysctl_set_loglvl,
+ },
+ {
+ .procname = "dmesg_restrict",
+ .data = &dmesg_restrict,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax_sysadmin,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ {
+ .procname = "kptr_restrict",
+ .data = &kptr_restrict,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax_sysadmin,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_TWO,
+ },
+ {}
+};
+
+void __init printk_sysctl_init(void)
+{
+ register_sysctl_init("kernel", printk_sysctls);
+}
diff --git a/kernel/profile.c b/kernel/profile.c
index c2ebddb5e974..8a77769bc4b4 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -41,7 +41,8 @@ struct profile_hit {
#define NR_PROFILE_GRP (NR_PROFILE_HIT/PROFILE_GRPSZ)
static atomic_t *prof_buffer;
-static unsigned long prof_len, prof_shift;
+static unsigned long prof_len;
+static unsigned short int prof_shift;
int prof_on __read_mostly;
EXPORT_SYMBOL_GPL(prof_on);
@@ -58,43 +59,39 @@ int profile_setup(char *str)
static const char schedstr[] = "schedule";
static const char sleepstr[] = "sleep";
static const char kvmstr[] = "kvm";
+ const char *select = NULL;
int par;
if (!strncmp(str, sleepstr, strlen(sleepstr))) {
#ifdef CONFIG_SCHEDSTATS
force_schedstat_enabled();
prof_on = SLEEP_PROFILING;
- if (str[strlen(sleepstr)] == ',')
- str += strlen(sleepstr) + 1;
- if (get_option(&str, &par))
- prof_shift = par;
- pr_info("kernel sleep profiling enabled (shift: %ld)\n",
- prof_shift);
+ select = sleepstr;
#else
pr_warn("kernel sleep profiling requires CONFIG_SCHEDSTATS\n");
#endif /* CONFIG_SCHEDSTATS */
} else if (!strncmp(str, schedstr, strlen(schedstr))) {
prof_on = SCHED_PROFILING;
- if (str[strlen(schedstr)] == ',')
- str += strlen(schedstr) + 1;
- if (get_option(&str, &par))
- prof_shift = par;
- pr_info("kernel schedule profiling enabled (shift: %ld)\n",
- prof_shift);
+ select = schedstr;
} else if (!strncmp(str, kvmstr, strlen(kvmstr))) {
prof_on = KVM_PROFILING;
- if (str[strlen(kvmstr)] == ',')
- str += strlen(kvmstr) + 1;
- if (get_option(&str, &par))
- prof_shift = par;
- pr_info("kernel KVM profiling enabled (shift: %ld)\n",
- prof_shift);
+ select = kvmstr;
} else if (get_option(&str, &par)) {
- prof_shift = par;
+ prof_shift = clamp(par, 0, BITS_PER_LONG - 1);
prof_on = CPU_PROFILING;
- pr_info("kernel profiling enabled (shift: %ld)\n",
+ pr_info("kernel profiling enabled (shift: %u)\n",
prof_shift);
}
+
+ if (select) {
+ if (str[strlen(select)] == ',')
+ str += strlen(select) + 1;
+ if (get_option(&str, &par))
+ prof_shift = clamp(par, 0, BITS_PER_LONG - 1);
+ pr_info("kernel %s profiling enabled (shift: %u)\n",
+ select, prof_shift);
+ }
+
return 1;
}
__setup("profile=", profile_setup);
@@ -108,6 +105,13 @@ int __ref profile_init(void)
/* only text is profiled */
prof_len = (_etext - _stext) >> prof_shift;
+
+ if (!prof_len) {
+ pr_warn("profiling shift: %u too large\n", prof_shift);
+ prof_on = 0;
+ return -EINVAL;
+ }
+
buffer_bytes = prof_len*sizeof(atomic_t);
if (!alloc_cpumask_var(&prof_cpu_mask, GFP_KERNEL))
@@ -132,79 +136,6 @@ int __ref profile_init(void)
return -ENOMEM;
}
-/* Profile event notifications */
-
-static BLOCKING_NOTIFIER_HEAD(task_exit_notifier);
-static ATOMIC_NOTIFIER_HEAD(task_free_notifier);
-static BLOCKING_NOTIFIER_HEAD(munmap_notifier);
-
-void profile_task_exit(struct task_struct *task)
-{
- blocking_notifier_call_chain(&task_exit_notifier, 0, task);
-}
-
-int profile_handoff_task(struct task_struct *task)
-{
- int ret;
- ret = atomic_notifier_call_chain(&task_free_notifier, 0, task);
- return (ret == NOTIFY_OK) ? 1 : 0;
-}
-
-void profile_munmap(unsigned long addr)
-{
- blocking_notifier_call_chain(&munmap_notifier, 0, (void *)addr);
-}
-
-int task_handoff_register(struct notifier_block *n)
-{
- return atomic_notifier_chain_register(&task_free_notifier, n);
-}
-EXPORT_SYMBOL_GPL(task_handoff_register);
-
-int task_handoff_unregister(struct notifier_block *n)
-{
- return atomic_notifier_chain_unregister(&task_free_notifier, n);
-}
-EXPORT_SYMBOL_GPL(task_handoff_unregister);
-
-int profile_event_register(enum profile_type type, struct notifier_block *n)
-{
- int err = -EINVAL;
-
- switch (type) {
- case PROFILE_TASK_EXIT:
- err = blocking_notifier_chain_register(
- &task_exit_notifier, n);
- break;
- case PROFILE_MUNMAP:
- err = blocking_notifier_chain_register(
- &munmap_notifier, n);
- break;
- }
-
- return err;
-}
-EXPORT_SYMBOL_GPL(profile_event_register);
-
-int profile_event_unregister(enum profile_type type, struct notifier_block *n)
-{
- int err = -EINVAL;
-
- switch (type) {
- case PROFILE_TASK_EXIT:
- err = blocking_notifier_chain_unregister(
- &task_exit_notifier, n);
- break;
- case PROFILE_MUNMAP:
- err = blocking_notifier_chain_unregister(
- &munmap_notifier, n);
- break;
- }
-
- return err;
-}
-EXPORT_SYMBOL_GPL(profile_event_unregister);
-
#if defined(CONFIG_SMP) && defined(CONFIG_PROC_FS)
/*
* Each cpu has a pair of open-addressed hashtables for pending
@@ -468,7 +399,7 @@ read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos)
unsigned long p = *ppos;
ssize_t read;
char *pnt;
- unsigned int sample_step = 1 << prof_shift;
+ unsigned long sample_step = 1UL << prof_shift;
profile_flip_buffers();
if (p >= (prof_len+1)*sizeof(unsigned int))
@@ -490,6 +421,12 @@ read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos)
return read;
}
+/* default is to not implement this call */
+int __weak setup_profiling_timer(unsigned mult)
+{
+ return -EINVAL;
+}
+
/*
* Writing to /proc/profile resets the counters
*
@@ -500,8 +437,6 @@ static ssize_t write_profile(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
#ifdef CONFIG_SMP
- extern int setup_profiling_timer(unsigned int multiplier);
-
if (count == sizeof(int)) {
unsigned int multiplier;
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index f8589bf8d7dc..443057bee87c 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -32,6 +32,7 @@
#include <linux/compat.h>
#include <linux/sched/signal.h>
#include <linux/minmax.h>
+#include <linux/syscall_user_dispatch.h>
#include <asm/syscall.h> /* for syscall_get_* */
@@ -185,7 +186,12 @@ static bool looks_like_a_spurious_pid(struct task_struct *task)
return true;
}
-/* Ensure that nothing can wake it up, even SIGKILL */
+/*
+ * Ensure that nothing can wake it up, even SIGKILL
+ *
+ * A task is switched to this state while a ptrace operation is in progress;
+ * such that the ptrace operation is uninterruptible.
+ */
static bool ptrace_freeze_traced(struct task_struct *task)
{
bool ret = false;
@@ -197,7 +203,7 @@ static bool ptrace_freeze_traced(struct task_struct *task)
spin_lock_irq(&task->sighand->siglock);
if (task_is_traced(task) && !looks_like_a_spurious_pid(task) &&
!__fatal_signal_pending(task)) {
- WRITE_ONCE(task->__state, __TASK_TRACED);
+ task->jobctl |= JOBCTL_PTRACE_FROZEN;
ret = true;
}
spin_unlock_irq(&task->sighand->siglock);
@@ -207,23 +213,21 @@ static bool ptrace_freeze_traced(struct task_struct *task)
static void ptrace_unfreeze_traced(struct task_struct *task)
{
- if (READ_ONCE(task->__state) != __TASK_TRACED)
- return;
-
- WARN_ON(!task->ptrace || task->parent != current);
+ unsigned long flags;
/*
- * PTRACE_LISTEN can allow ptrace_trap_notify to wake us up remotely.
- * Recheck state under the lock to close this race.
+ * The child may be awake and may have cleared
+ * JOBCTL_PTRACE_FROZEN (see ptrace_resume). The child will
+ * not set JOBCTL_PTRACE_FROZEN or enter __TASK_TRACED anew.
*/
- spin_lock_irq(&task->sighand->siglock);
- if (READ_ONCE(task->__state) == __TASK_TRACED) {
- if (__fatal_signal_pending(task))
+ if (lock_task_sighand(task, &flags)) {
+ task->jobctl &= ~JOBCTL_PTRACE_FROZEN;
+ if (__fatal_signal_pending(task)) {
+ task->jobctl &= ~JOBCTL_TRACED;
wake_up_state(task, __TASK_TRACED);
- else
- WRITE_ONCE(task->__state, TASK_TRACED);
+ }
+ unlock_task_sighand(task, &flags);
}
- spin_unlock_irq(&task->sighand->siglock);
}
/**
@@ -256,7 +260,6 @@ static int ptrace_check_attach(struct task_struct *child, bool ignore_state)
*/
read_lock(&tasklist_lock);
if (child->ptrace && child->parent == current) {
- WARN_ON(READ_ONCE(child->__state) == __TASK_TRACED);
/*
* child->sighand can't be NULL, release_task()
* does ptrace_unlink() before __exit_signal().
@@ -266,17 +269,9 @@ static int ptrace_check_attach(struct task_struct *child, bool ignore_state)
}
read_unlock(&tasklist_lock);
- if (!ret && !ignore_state) {
- if (!wait_task_inactive(child, __TASK_TRACED)) {
- /*
- * This can only happen if may_ptrace_stop() fails and
- * ptrace_stop() changes ->state back to TASK_RUNNING,
- * so we should not worry about leaking __TASK_TRACED.
- */
- WARN_ON(READ_ONCE(child->__state) == __TASK_TRACED);
- ret = -ESRCH;
- }
- }
+ if (!ret && !ignore_state &&
+ WARN_ON_ONCE(!wait_task_inactive(child, __TASK_TRACED|TASK_FROZEN)))
+ ret = -ESRCH;
return ret;
}
@@ -371,6 +366,26 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode)
return !err;
}
+static int check_ptrace_options(unsigned long data)
+{
+ if (data & ~(unsigned long)PTRACE_O_MASK)
+ return -EINVAL;
+
+ if (unlikely(data & PTRACE_O_SUSPEND_SECCOMP)) {
+ if (!IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) ||
+ !IS_ENABLED(CONFIG_SECCOMP))
+ return -EINVAL;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (seccomp_mode(&current->seccomp) != SECCOMP_MODE_DISABLED ||
+ current->ptrace & PT_SUSPEND_SECCOMP)
+ return -EPERM;
+ }
+ return 0;
+}
+
static int ptrace_attach(struct task_struct *task, long request,
unsigned long addr,
unsigned long flags)
@@ -382,8 +397,16 @@ static int ptrace_attach(struct task_struct *task, long request,
if (seize) {
if (addr != 0)
goto out;
+ /*
+ * This duplicates the check in check_ptrace_options() because
+ * ptrace_attach() and ptrace_setoptions() have historically
+ * used different error codes for unknown ptrace options.
+ */
if (flags & ~(unsigned long)PTRACE_O_MASK)
goto out;
+ retval = check_ptrace_options(flags);
+ if (retval)
+ return retval;
flags = PT_PTRACED | PT_SEIZED | (flags << PT_OPT_FLAG_SHIFT);
} else {
flags = PT_PTRACED;
@@ -419,8 +442,6 @@ static int ptrace_attach(struct task_struct *task, long request,
if (task->ptrace)
goto unlock_tasklist;
- if (seize)
- flags |= PT_SEIZED;
task->ptrace = flags;
ptrace_link(task, current);
@@ -449,8 +470,10 @@ static int ptrace_attach(struct task_struct *task, long request,
* in and out of STOPPED are protected by siglock.
*/
if (task_is_stopped(task) &&
- task_set_jobctl_pending(task, JOBCTL_TRAP_STOP | JOBCTL_TRAPPING))
+ task_set_jobctl_pending(task, JOBCTL_TRAP_STOP | JOBCTL_TRAPPING)) {
+ task->jobctl &= ~JOBCTL_STOPPED;
signal_wake_up_state(task, __TASK_STOPPED);
+ }
spin_unlock(&task->sighand->siglock);
@@ -656,22 +679,11 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds
static int ptrace_setoptions(struct task_struct *child, unsigned long data)
{
unsigned flags;
+ int ret;
- if (data & ~(unsigned long)PTRACE_O_MASK)
- return -EINVAL;
-
- if (unlikely(data & PTRACE_O_SUSPEND_SECCOMP)) {
- if (!IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) ||
- !IS_ENABLED(CONFIG_SECCOMP))
- return -EINVAL;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if (seccomp_mode(&current->seccomp) != SECCOMP_MODE_DISABLED ||
- current->ptrace & PT_SUSPEND_SECCOMP)
- return -EPERM;
- }
+ ret = check_ptrace_options(data);
+ if (ret)
+ return ret;
/* Avoid intermediate state when all opts are cleared */
flags = child->ptrace;
@@ -802,7 +814,7 @@ static long ptrace_get_rseq_configuration(struct task_struct *task,
{
struct ptrace_rseq_configuration conf = {
.rseq_abi_pointer = (u64)(uintptr_t)task->rseq,
- .rseq_abi_size = sizeof(*task->rseq),
+ .rseq_abi_size = task->rseq_len,
.signature = task->rseq_sig,
.flags = 0,
};
@@ -814,11 +826,7 @@ static long ptrace_get_rseq_configuration(struct task_struct *task,
}
#endif
-#ifdef PTRACE_SINGLESTEP
#define is_singlestep(request) ((request) == PTRACE_SINGLESTEP)
-#else
-#define is_singlestep(request) 0
-#endif
#ifdef PTRACE_SINGLEBLOCK
#define is_singleblock(request) ((request) == PTRACE_SINGLEBLOCK)
@@ -835,8 +843,6 @@ static long ptrace_get_rseq_configuration(struct task_struct *task,
static int ptrace_resume(struct task_struct *child, long request,
unsigned long data)
{
- bool need_siglock;
-
if (!valid_signal(data))
return -EIO;
@@ -872,18 +878,12 @@ static int ptrace_resume(struct task_struct *child, long request,
* Note that we need siglock even if ->exit_code == data and/or this
* status was not reported yet, the new status must not be cleared by
* wait_task_stopped() after resume.
- *
- * If data == 0 we do not care if wait_task_stopped() reports the old
- * status and clears the code too; this can't race with the tracee, it
- * takes siglock after resume.
*/
- need_siglock = data && !thread_group_empty(current);
- if (need_siglock)
- spin_lock_irq(&child->sighand->siglock);
+ spin_lock_irq(&child->sighand->siglock);
child->exit_code = data;
+ child->jobctl &= ~JOBCTL_TRACED;
wake_up_state(child, __TASK_TRACED);
- if (need_siglock)
- spin_unlock_irq(&child->sighand->siglock);
+ spin_unlock_irq(&child->sighand->siglock);
return 0;
}
@@ -1206,9 +1206,7 @@ int ptrace_request(struct task_struct *child, long request,
}
#endif
-#ifdef PTRACE_SINGLESTEP
case PTRACE_SINGLESTEP:
-#endif
#ifdef PTRACE_SINGLEBLOCK
case PTRACE_SINGLEBLOCK:
#endif
@@ -1221,9 +1219,8 @@ int ptrace_request(struct task_struct *child, long request,
return ptrace_resume(child, request, data);
case PTRACE_KILL:
- if (child->exit_state) /* already dead */
- return 0;
- return ptrace_resume(child, request, SIGKILL);
+ send_sig_info(SIGKILL, SEND_SIG_NOINFO, child);
+ return 0;
#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
case PTRACE_GETREGSET:
@@ -1263,6 +1260,14 @@ int ptrace_request(struct task_struct *child, long request,
break;
#endif
+ case PTRACE_SET_SYSCALL_USER_DISPATCH_CONFIG:
+ ret = syscall_user_dispatch_set_config(child, addr, datavp);
+ break;
+
+ case PTRACE_GET_SYSCALL_USER_DISPATCH_CONFIG:
+ ret = syscall_user_dispatch_get_config(child, addr, datavp);
+ break;
+
default:
break;
}
@@ -1270,10 +1275,6 @@ int ptrace_request(struct task_struct *child, long request,
return ret;
}
-#ifndef arch_ptrace_attach
-#define arch_ptrace_attach(child) do { } while (0)
-#endif
-
SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr,
unsigned long, data)
{
@@ -1282,8 +1283,6 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr,
if (request == PTRACE_TRACEME) {
ret = ptrace_traceme();
- if (!ret)
- arch_ptrace_attach(current);
goto out;
}
@@ -1295,12 +1294,6 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr,
if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) {
ret = ptrace_attach(child, request, addr, data);
- /*
- * Some architectures need to do book-keeping after
- * a ptrace attach.
- */
- if (!ret)
- arch_ptrace_attach(child);
goto out_put_task_struct;
}
@@ -1440,12 +1433,6 @@ COMPAT_SYSCALL_DEFINE4(ptrace, compat_long_t, request, compat_long_t, pid,
if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) {
ret = ptrace_attach(child, request, addr, data);
- /*
- * Some architectures need to do book-keeping after
- * a ptrace attach.
- */
- if (!ret)
- arch_ptrace_attach(child);
goto out_put_task_struct;
}
diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
index 3128b7cf8e1f..9071182b1284 100644
--- a/kernel/rcu/Kconfig
+++ b/kernel/rcu/Kconfig
@@ -8,6 +8,8 @@ menu "RCU Subsystem"
config TREE_RCU
bool
default y if SMP
+ # Dynticks-idle tracking
+ select CONTEXT_TRACKING_IDLE
help
This option selects the RCU implementation that is
designed for very large SMP system with hundreds or
@@ -51,57 +53,77 @@ config RCU_EXPERT
Say N if you are unsure.
-config SRCU
- bool
- help
- This option selects the sleepable version of RCU. This version
- permits arbitrary sleeping or blocking within RCU read-side critical
- sections.
-
config TINY_SRCU
bool
- default y if SRCU && TINY_RCU
+ default y if TINY_RCU
help
This option selects the single-CPU non-preemptible version of SRCU.
config TREE_SRCU
bool
- default y if SRCU && !TINY_RCU
+ default y if !TINY_RCU
help
This option selects the full-fledged version of SRCU.
+config NEED_SRCU_NMI_SAFE
+ def_bool HAVE_NMI && !ARCH_HAS_NMI_SAFE_THIS_CPU_OPS && !TINY_SRCU
+
config TASKS_RCU_GENERIC
def_bool TASKS_RCU || TASKS_RUDE_RCU || TASKS_TRACE_RCU
- select SRCU
help
This option enables generic infrastructure code supporting
task-based RCU implementations. Not for manual selection.
+config FORCE_TASKS_RCU
+ bool "Force selection of TASKS_RCU"
+ depends on RCU_EXPERT
+ select TASKS_RCU
+ default n
+ help
+ This option force-enables a task-based RCU implementation
+ that uses only voluntary context switch (not preemption!),
+ idle, and user-mode execution as quiescent states. Not for
+ manual selection in most cases.
+
config TASKS_RCU
- def_bool PREEMPTION
+ bool
+ default n
+ select IRQ_WORK
+
+config FORCE_TASKS_RUDE_RCU
+ bool "Force selection of Tasks Rude RCU"
+ depends on RCU_EXPERT
+ select TASKS_RUDE_RCU
+ default n
help
- This option enables a task-based RCU implementation that uses
- only voluntary context switch (not preemption!), idle, and
- user-mode execution as quiescent states. Not for manual selection.
+ This option force-enables a task-based RCU implementation
+ that uses only context switch (including preemption) and
+ user-mode execution as quiescent states. It forces IPIs and
+ context switches on all online CPUs, including idle ones,
+ so use with caution. Not for manual selection in most cases.
config TASKS_RUDE_RCU
- def_bool 0
+ bool
+ default n
+ select IRQ_WORK
+
+config FORCE_TASKS_TRACE_RCU
+ bool "Force selection of Tasks Trace RCU"
+ depends on RCU_EXPERT
+ select TASKS_TRACE_RCU
+ default n
help
This option enables a task-based RCU implementation that uses
- only context switch (including preemption) and user-mode
- execution as quiescent states. It forces IPIs and context
- switches on all online CPUs, including idle ones, so use
- with caution.
+ explicit rcu_read_lock_trace() read-side markers, and allows
+ these readers to appear in the idle loop as well as on the
+ CPU hotplug code paths. It can force IPIs on online CPUs,
+ including idle ones, so use with caution. Not for manual
+ selection in most cases.
config TASKS_TRACE_RCU
- def_bool 0
+ bool
+ default n
select IRQ_WORK
- help
- This option enables a task-based RCU implementation that uses
- explicit rcu_read_lock_trace() read-side markers, and allows
- these readers to appear in the idle loop as well as on the CPU
- hotplug code paths. It can force IPIs on online CPUs, including
- idle ones, so use with caution.
config RCU_STALL_COMMON
def_bool TREE_RCU
@@ -112,7 +134,7 @@ config RCU_STALL_COMMON
making these warnings mandatory for the tree variants.
config RCU_NEED_SEGCBLIST
- def_bool ( TREE_RCU || TREE_SRCU )
+ def_bool ( TREE_RCU || TREE_SRCU || TASKS_RCU_GENERIC )
config RCU_FANOUT
int "Tree-based hierarchical RCU fanout value"
@@ -169,24 +191,6 @@ config RCU_FANOUT_LEAF
Take the default if unsure.
-config RCU_FAST_NO_HZ
- bool "Accelerate last non-dyntick-idle CPU's grace periods"
- depends on NO_HZ_COMMON && SMP && RCU_EXPERT
- default n
- help
- This option permits CPUs to enter dynticks-idle state even if
- they have RCU callbacks queued, and prevents RCU from waking
- these CPUs up more than roughly once every four jiffies (by
- default, you can adjust this using the rcutree.rcu_idle_gp_delay
- parameter), thus improving energy efficiency. On the other
- hand, this option increases the duration of RCU grace periods,
- for example, slowing down synchronize_rcu().
-
- Say Y if energy efficiency is critically important, and you
- don't care about increased grace-period durations.
-
- Say N if you are unsure.
-
config RCU_BOOST
bool "Enable RCU priority boosting"
depends on (RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT) || PREEMPT_RT
@@ -213,6 +217,20 @@ config RCU_BOOST_DELAY
Accept the default if unsure.
+config RCU_EXP_KTHREAD
+ bool "Perform RCU expedited work in a real-time kthread"
+ depends on RCU_BOOST && RCU_EXPERT
+ default !PREEMPT_RT && NR_CPUS <= 32
+ help
+ Use this option to further reduce the latencies of expedited
+ grace periods at the expense of being more disruptive.
+
+ This option is disabled by default on PREEMPT_RT=y kernels which
+ disable expedited grace periods after boot by unconditionally
+ setting rcupdate.rcu_normal_after_boot=1.
+
+ Accept the default if unsure.
+
config RCU_NOCB_CPU
bool "Offload RCU callback processing from boot-selected CPUs"
depends on TREE_RCU
@@ -241,9 +259,38 @@ config RCU_NOCB_CPU
Say Y here if you need reduced OS jitter, despite added overhead.
Say N here if you are unsure.
+config RCU_NOCB_CPU_DEFAULT_ALL
+ bool "Offload RCU callback processing from all CPUs by default"
+ depends on RCU_NOCB_CPU
+ default n
+ help
+ Use this option to offload callback processing from all CPUs
+ by default, in the absence of the rcu_nocbs or nohz_full boot
+ parameter. This also avoids the need to use any boot parameters
+ to achieve the effect of offloading all CPUs on boot.
+
+ Say Y here if you want offload all CPUs by default on boot.
+ Say N here if you are unsure.
+
+config RCU_NOCB_CPU_CB_BOOST
+ bool "Offload RCU callback from real-time kthread"
+ depends on RCU_NOCB_CPU && RCU_BOOST
+ default y if PREEMPT_RT
+ help
+ Use this option to invoke offloaded callbacks as SCHED_FIFO
+ to avoid starvation by heavy SCHED_OTHER background load.
+ Of course, running as SCHED_FIFO during callback floods will
+ cause the rcuo[ps] kthreads to monopolize the CPU for hundreds
+ of milliseconds or more. Therefore, when enabling this option,
+ it is your responsibility to ensure that latency-sensitive
+ tasks either run with higher priority or run on some other CPU.
+
+ Say Y here if you want to set RT priority for offloading kthreads.
+ Say N here if you are building a !PREEMPT_RT kernel and are unsure.
+
config TASKS_TRACE_RCU_READ_MB
bool "Tasks Trace RCU readers use memory barriers in user and idle"
- depends on RCU_EXPERT
+ depends on RCU_EXPERT && TASKS_TRACE_RCU
default PREEMPT_RT || NR_CPUS < 8
help
Use this option to further reduce the number of IPIs sent
@@ -259,4 +306,12 @@ config TASKS_TRACE_RCU_READ_MB
Say N here if you hate read-side memory barriers.
Take the default if you are unsure.
+config RCU_LAZY
+ bool "RCU callback lazy invocation functionality"
+ depends on RCU_NOCB_CPU
+ default n
+ help
+ To save power, batch RCU callbacks and flush after delay, memory
+ pressure, or callback list growing too big.
+
endmenu # "RCU Subsystem"
diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug
index 4fd64999300f..2984de629f74 100644
--- a/kernel/rcu/Kconfig.debug
+++ b/kernel/rcu/Kconfig.debug
@@ -27,10 +27,6 @@ config RCU_SCALE_TEST
tristate "performance tests for RCU"
depends on DEBUG_KERNEL
select TORTURE_TEST
- select SRCU
- select TASKS_RCU
- select TASKS_RUDE_RCU
- select TASKS_TRACE_RCU
default n
help
This option provides a kernel module that runs performance
@@ -46,10 +42,6 @@ config RCU_TORTURE_TEST
tristate "torture tests for RCU"
depends on DEBUG_KERNEL
select TORTURE_TEST
- select SRCU
- select TASKS_RCU
- select TASKS_RUDE_RCU
- select TASKS_TRACE_RCU
default n
help
This option provides a kernel module that runs torture tests
@@ -65,10 +57,6 @@ config RCU_REF_SCALE_TEST
tristate "Scalability tests for read-side synchronization (RCU and others)"
depends on DEBUG_KERNEL
select TORTURE_TEST
- select SRCU
- select TASKS_RCU
- select TASKS_RUDE_RCU
- select TASKS_TRACE_RCU
default n
help
This option provides a kernel module that runs performance tests
@@ -91,6 +79,32 @@ config RCU_CPU_STALL_TIMEOUT
RCU grace period persists, additional CPU stall warnings are
printed at more widely spaced intervals.
+config RCU_EXP_CPU_STALL_TIMEOUT
+ int "Expedited RCU CPU stall timeout in milliseconds"
+ depends on RCU_STALL_COMMON
+ range 0 300000
+ default 0
+ help
+ If a given expedited RCU grace period extends more than the
+ specified number of milliseconds, a CPU stall warning is printed.
+ If the RCU grace period persists, additional CPU stall warnings
+ are printed at more widely spaced intervals. A value of zero
+ says to use the RCU_CPU_STALL_TIMEOUT value converted from
+ seconds to milliseconds.
+
+config RCU_CPU_STALL_CPUTIME
+ bool "Provide additional RCU stall debug information"
+ depends on RCU_STALL_COMMON
+ default n
+ help
+ Collect statistics during the sampling period, such as the number of
+ (hard interrupts, soft interrupts, task switches) and the cputime of
+ (hard interrupts, soft interrupts, kernel tasks) are added to the
+ RCU stall report. For multiple continuous RCU stalls, all sampling
+ periods begin at half of the first RCU stall timeout.
+ The boot option rcupdate.rcu_cpu_stall_cputime has the same function
+ as this one, but will override this if it exists.
+
config RCU_TRACE
bool "Enable tracing for RCU"
depends on DEBUG_KERNEL
@@ -116,7 +130,7 @@ config RCU_EQS_DEBUG
config RCU_STRICT_GRACE_PERIOD
bool "Provide debug RCU implementation with short grace periods"
- depends on DEBUG_KERNEL && RCU_EXPERT && NR_CPUS <= 4
+ depends on DEBUG_KERNEL && RCU_EXPERT && NR_CPUS <= 4 && !TINY_RCU
default n
select PREEMPT_COUNT if PREEMPT=n
help
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 24b5f2c2de87..4a1b9622598b 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -12,17 +12,55 @@
#include <trace/events/rcu.h>
-/* Offset to allow distinguishing irq vs. task-based idle entry/exit. */
-#define DYNTICK_IRQ_NONIDLE ((LONG_MAX / 2) + 1)
-
-
/*
* Grace-period counter management.
+ *
+ * The two least significant bits contain the control flags.
+ * The most significant bits contain the grace-period sequence counter.
+ *
+ * When both control flags are zero, no grace period is in progress.
+ * When either bit is non-zero, a grace period has started and is in
+ * progress. When the grace period completes, the control flags are reset
+ * to 0 and the grace-period sequence counter is incremented.
+ *
+ * However some specific RCU usages make use of custom values.
+ *
+ * SRCU special control values:
+ *
+ * SRCU_SNP_INIT_SEQ : Invalid/init value set when SRCU node
+ * is initialized.
+ *
+ * SRCU_STATE_IDLE : No SRCU gp is in progress
+ *
+ * SRCU_STATE_SCAN1 : State set by rcu_seq_start(). Indicates
+ * we are scanning the readers on the slot
+ * defined as inactive (there might well
+ * be pending readers that will use that
+ * index, but their number is bounded).
+ *
+ * SRCU_STATE_SCAN2 : State set manually via rcu_seq_set_state()
+ * Indicates we are flipping the readers
+ * index and then scanning the readers on the
+ * slot newly designated as inactive (again,
+ * the number of pending readers that will use
+ * this inactive index is bounded).
+ *
+ * RCU polled GP special control value:
+ *
+ * RCU_GET_STATE_COMPLETED : State value indicating an already-completed
+ * polled GP has completed. This value covers
+ * both the state and the counter of the
+ * grace-period sequence number.
*/
#define RCU_SEQ_CTR_SHIFT 2
#define RCU_SEQ_STATE_MASK ((1 << RCU_SEQ_CTR_SHIFT) - 1)
+/* Low-order bit definition for polled grace-period APIs. */
+#define RCU_GET_STATE_COMPLETED 0x1
+
+extern int sysctl_sched_rt_runtime;
+
/*
* Return the counter portion of a sequence number previously returned
* by rcu_seq_snap() or rcu_seq_current().
@@ -118,6 +156,18 @@ static inline bool rcu_seq_done(unsigned long *sp, unsigned long s)
}
/*
+ * Given a snapshot from rcu_seq_snap(), determine whether or not a
+ * full update-side operation has occurred, but do not allow the
+ * (ULONG_MAX / 2) safety-factor/guard-band.
+ */
+static inline bool rcu_seq_done_exact(unsigned long *sp, unsigned long s)
+{
+ unsigned long cur_s = READ_ONCE(*sp);
+
+ return ULONG_CMP_GE(cur_s, s) || ULONG_CMP_LT(cur_s, s - (2 * RCU_SEQ_STATE_MASK + 1));
+}
+
+/*
* Has a grace period completed since the time the old gp_seq was collected?
*/
static inline bool rcu_seq_completed_gp(unsigned long old, unsigned long new)
@@ -210,7 +260,11 @@ static inline bool rcu_stall_is_suppressed_at_boot(void)
extern int rcu_cpu_stall_ftrace_dump;
extern int rcu_cpu_stall_suppress;
extern int rcu_cpu_stall_timeout;
+extern int rcu_exp_cpu_stall_timeout;
+extern int rcu_cpu_stall_cputime;
+extern bool rcu_exp_stall_task_details __read_mostly;
int rcu_jiffies_till_stall_check(void);
+int rcu_exp_jiffies_till_stall_check(void);
static inline bool rcu_stall_is_suppressed(void)
{
@@ -271,7 +325,7 @@ void rcu_test_sync_prims(void);
*/
extern void resched_cpu(int cpu);
-#if defined(CONFIG_SRCU) || !defined(CONFIG_TINY_RCU)
+#if !defined(CONFIG_TINY_RCU)
#include <linux/rcu_node_tree.h>
@@ -324,11 +378,13 @@ extern void rcu_init_geometry(void);
* specified state structure (for SRCU) or the only rcu_state structure
* (for RCU).
*/
-#define srcu_for_each_node_breadth_first(sp, rnp) \
+#define _rcu_for_each_node_breadth_first(sp, rnp) \
for ((rnp) = &(sp)->node[0]; \
(rnp) < &(sp)->node[rcu_num_nodes]; (rnp)++)
#define rcu_for_each_node_breadth_first(rnp) \
- srcu_for_each_node_breadth_first(&rcu_state, rnp)
+ _rcu_for_each_node_breadth_first(&rcu_state, rnp)
+#define srcu_for_each_node_breadth_first(ssp, rnp) \
+ _rcu_for_each_node_breadth_first(ssp->srcu_sup, rnp)
/*
* Scan the leaves of the rcu_node hierarchy for the rcu_state structure.
@@ -360,6 +416,10 @@ extern void rcu_init_geometry(void);
(cpu) <= rnp->grphi; \
(cpu) = rcu_find_next_bit((rnp), (cpu) + 1 - (rnp->grplo), (mask)))
+#endif /* !defined(CONFIG_TINY_RCU) */
+
+#if !defined(CONFIG_TINY_RCU) || defined(CONFIG_TASKS_RCU_GENERIC)
+
/*
* Wrappers for the rcu_node::lock acquire and release.
*
@@ -422,20 +482,26 @@ do { \
#define raw_lockdep_assert_held_rcu_node(p) \
lockdep_assert_held(&ACCESS_PRIVATE(p, lock))
-#endif /* #if defined(CONFIG_SRCU) || !defined(CONFIG_TINY_RCU) */
+#endif // #if !defined(CONFIG_TINY_RCU) || defined(CONFIG_TASKS_RCU_GENERIC)
#ifdef CONFIG_TINY_RCU
/* Tiny RCU doesn't expedite, as its purpose in life is instead to be tiny. */
static inline bool rcu_gp_is_normal(void) { return true; }
static inline bool rcu_gp_is_expedited(void) { return false; }
+static inline bool rcu_async_should_hurry(void) { return false; }
static inline void rcu_expedite_gp(void) { }
static inline void rcu_unexpedite_gp(void) { }
+static inline void rcu_async_hurry(void) { }
+static inline void rcu_async_relax(void) { }
static inline void rcu_request_urgent_qs_task(struct task_struct *t) { }
#else /* #ifdef CONFIG_TINY_RCU */
bool rcu_gp_is_normal(void); /* Internal RCU use. */
bool rcu_gp_is_expedited(void); /* Internal RCU use. */
+bool rcu_async_should_hurry(void); /* Internal RCU use. */
void rcu_expedite_gp(void);
void rcu_unexpedite_gp(void);
+void rcu_async_hurry(void);
+void rcu_async_relax(void);
void rcupdate_announce_bootup_oddness(void);
#ifdef CONFIG_TASKS_RCU_GENERIC
void show_rcu_tasks_gp_kthreads(void);
@@ -459,6 +525,14 @@ enum rcutorture_type {
INVALID_RCU_FLAVOR
};
+#if defined(CONFIG_RCU_LAZY)
+unsigned long rcu_lazy_get_jiffies_till_flush(void);
+void rcu_lazy_set_jiffies_till_flush(unsigned long j);
+#else
+static inline unsigned long rcu_lazy_get_jiffies_till_flush(void) { return 0; }
+static inline void rcu_lazy_set_jiffies_till_flush(unsigned long j) { }
+#endif
+
#if defined(CONFIG_TREE_RCU)
void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
unsigned long *gp_seq);
@@ -523,6 +597,8 @@ static inline bool rcu_check_boost_fail(unsigned long gp_state, int *cpup) { ret
static inline void show_rcu_gp_kthreads(void) { }
static inline int rcu_get_gp_kthreads_prio(void) { return 0; }
static inline void rcu_fwd_progress_check(unsigned long j) { }
+static inline void rcu_gp_slow_register(atomic_t *rgssp) { }
+static inline void rcu_gp_slow_unregister(atomic_t *rgssp) { }
#else /* #ifdef CONFIG_TINY_RCU */
bool rcu_dynticks_zero_in_eqs(int cpu, int *vp);
unsigned long rcu_get_gp_seq(void);
@@ -534,14 +610,19 @@ int rcu_get_gp_kthreads_prio(void);
void rcu_fwd_progress_check(unsigned long j);
void rcu_force_quiescent_state(void);
extern struct workqueue_struct *rcu_gp_wq;
+#ifdef CONFIG_RCU_EXP_KTHREAD
+extern struct kthread_worker *rcu_exp_gp_kworker;
+extern struct kthread_worker *rcu_exp_par_gp_kworker;
+#else /* !CONFIG_RCU_EXP_KTHREAD */
extern struct workqueue_struct *rcu_par_gp_wq;
+#endif /* CONFIG_RCU_EXP_KTHREAD */
+void rcu_gp_slow_register(atomic_t *rgssp);
+void rcu_gp_slow_unregister(atomic_t *rgssp);
#endif /* #else #ifdef CONFIG_TINY_RCU */
#ifdef CONFIG_RCU_NOCB_CPU
-bool rcu_is_nocb_cpu(int cpu);
void rcu_bind_current_to_nocb(void);
#else
-static inline bool rcu_is_nocb_cpu(int cpu) { return false; }
static inline void rcu_bind_current_to_nocb(void) { }
#endif
diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
index aaa111237b60..f71fac422c8f 100644
--- a/kernel/rcu/rcu_segcblist.c
+++ b/kernel/rcu/rcu_segcblist.c
@@ -89,7 +89,7 @@ static void rcu_segcblist_set_len(struct rcu_segcblist *rsclp, long v)
}
/* Get the length of a segment of the rcu_segcblist structure. */
-static long rcu_segcblist_get_seglen(struct rcu_segcblist *rsclp, int seg)
+long rcu_segcblist_get_seglen(struct rcu_segcblist *rsclp, int seg)
{
return READ_ONCE(rsclp->seglen[seg]);
}
@@ -261,16 +261,14 @@ void rcu_segcblist_disable(struct rcu_segcblist *rsclp)
}
/*
- * Mark the specified rcu_segcblist structure as offloaded.
+ * Mark the specified rcu_segcblist structure as offloaded (or not)
*/
void rcu_segcblist_offload(struct rcu_segcblist *rsclp, bool offload)
{
- if (offload) {
- rcu_segcblist_clear_flags(rsclp, SEGCBLIST_SOFTIRQ_ONLY);
- rcu_segcblist_set_flags(rsclp, SEGCBLIST_OFFLOADED);
- } else {
+ if (offload)
+ rcu_segcblist_set_flags(rsclp, SEGCBLIST_LOCKING | SEGCBLIST_OFFLOADED);
+ else
rcu_segcblist_clear_flags(rsclp, SEGCBLIST_OFFLOADED);
- }
}
/*
@@ -507,10 +505,10 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq)
WRITE_ONCE(rsclp->tails[j], rsclp->tails[RCU_DONE_TAIL]);
/*
- * Callbacks moved, so clean up the misordered ->tails[] pointers
- * that now point into the middle of the list of ready-to-invoke
- * callbacks. The overall effect is to copy down the later pointers
- * into the gap that was created by the now-ready segments.
+ * Callbacks moved, so there might be an empty RCU_WAIT_TAIL
+ * and a non-empty RCU_NEXT_READY_TAIL. If so, copy the
+ * RCU_NEXT_READY_TAIL segment to fill the RCU_WAIT_TAIL gap
+ * created by the now-ready-to-invoke segments.
*/
for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) {
if (rsclp->tails[j] == rsclp->tails[RCU_NEXT_TAIL])
diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h
index 9a19328ff251..4fe877f5f654 100644
--- a/kernel/rcu/rcu_segcblist.h
+++ b/kernel/rcu/rcu_segcblist.h
@@ -15,6 +15,8 @@ static inline long rcu_cblist_n_cbs(struct rcu_cblist *rclp)
return READ_ONCE(rclp->len);
}
+long rcu_segcblist_get_seglen(struct rcu_segcblist *rsclp, int seg);
+
/* Return number of callbacks in segmented callback list by summing seglen. */
long rcu_segcblist_n_segment_cbs(struct rcu_segcblist *rsclp);
@@ -56,13 +58,13 @@ static inline long rcu_segcblist_n_cbs(struct rcu_segcblist *rsclp)
static inline void rcu_segcblist_set_flags(struct rcu_segcblist *rsclp,
int flags)
{
- rsclp->flags |= flags;
+ WRITE_ONCE(rsclp->flags, rsclp->flags | flags);
}
static inline void rcu_segcblist_clear_flags(struct rcu_segcblist *rsclp,
int flags)
{
- rsclp->flags &= ~flags;
+ WRITE_ONCE(rsclp->flags, rsclp->flags & ~flags);
}
static inline bool rcu_segcblist_test_flags(struct rcu_segcblist *rsclp,
@@ -80,11 +82,14 @@ static inline bool rcu_segcblist_is_enabled(struct rcu_segcblist *rsclp)
return rcu_segcblist_test_flags(rsclp, SEGCBLIST_ENABLED);
}
-/* Is the specified rcu_segcblist offloaded, or is SEGCBLIST_SOFTIRQ_ONLY set? */
+/*
+ * Is the specified rcu_segcblist NOCB offloaded (or in the middle of the
+ * [de]offloading process)?
+ */
static inline bool rcu_segcblist_is_offloaded(struct rcu_segcblist *rsclp)
{
if (IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
- !rcu_segcblist_test_flags(rsclp, SEGCBLIST_SOFTIRQ_ONLY))
+ rcu_segcblist_test_flags(rsclp, SEGCBLIST_LOCKING))
return true;
return false;
@@ -92,9 +97,8 @@ static inline bool rcu_segcblist_is_offloaded(struct rcu_segcblist *rsclp)
static inline bool rcu_segcblist_completely_offloaded(struct rcu_segcblist *rsclp)
{
- int flags = SEGCBLIST_KTHREAD_CB | SEGCBLIST_KTHREAD_GP | SEGCBLIST_OFFLOADED;
-
- if (IS_ENABLED(CONFIG_RCU_NOCB_CPU) && (rsclp->flags & flags) == flags)
+ if (IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
+ !rcu_segcblist_test_flags(rsclp, SEGCBLIST_RCU_CORE))
return true;
return false;
diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c
index dca51fe9c73f..e82ec9f9a5d8 100644
--- a/kernel/rcu/rcuscale.c
+++ b/kernel/rcu/rcuscale.c
@@ -50,8 +50,8 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>");
pr_alert("%s" SCALE_FLAG " %s\n", scale_type, s)
#define VERBOSE_SCALEOUT_STRING(s) \
do { if (verbose) pr_alert("%s" SCALE_FLAG " %s\n", scale_type, s); } while (0)
-#define VERBOSE_SCALEOUT_ERRSTRING(s) \
- do { if (verbose) pr_alert("%s" SCALE_FLAG "!!! %s\n", scale_type, s); } while (0)
+#define SCALEOUT_ERRSTRING(s) \
+ pr_alert("%s" SCALE_FLAG "!!! %s\n", scale_type, s)
/*
* The intended use cases for the nreaders and nwriters module parameters
@@ -95,6 +95,7 @@ torture_param(int, verbose, 1, "Enable verbose debugging printk()s");
torture_param(int, writer_holdoff, 0, "Holdoff (us) between GPs, zero to disable");
torture_param(int, kfree_rcu_test, 0, "Do we run a kfree_rcu() scale test?");
torture_param(int, kfree_mult, 1, "Multiple of kfree_obj size to allocate.");
+torture_param(int, kfree_by_call_rcu, 0, "Use call_rcu() to emulate kfree_rcu()?");
static char *scale_type = "rcu";
module_param(scale_type, charp, 0444);
@@ -175,7 +176,7 @@ static struct rcu_scale_ops rcu_ops = {
.get_gp_seq = rcu_get_gp_seq,
.gp_diff = rcu_seq_diff,
.exp_completed = rcu_exp_batches_completed,
- .async = call_rcu,
+ .async = call_rcu_hurry,
.gp_barrier = rcu_barrier,
.sync = synchronize_rcu,
.exp_sync = synchronize_rcu_expedited,
@@ -268,6 +269,8 @@ static struct rcu_scale_ops srcud_ops = {
.name = "srcud"
};
+#ifdef CONFIG_TASKS_RCU
+
/*
* Definitions for RCU-tasks scalability testing.
*/
@@ -295,6 +298,16 @@ static struct rcu_scale_ops tasks_ops = {
.name = "tasks"
};
+#define TASKS_OPS &tasks_ops,
+
+#else // #ifdef CONFIG_TASKS_RCU
+
+#define TASKS_OPS
+
+#endif // #else // #ifdef CONFIG_TASKS_RCU
+
+#ifdef CONFIG_TASKS_TRACE_RCU
+
/*
* Definitions for RCU-tasks-trace scalability testing.
*/
@@ -324,6 +337,14 @@ static struct rcu_scale_ops tasks_tracing_ops = {
.name = "tasks-tracing"
};
+#define TASKS_TRACING_OPS &tasks_tracing_ops,
+
+#else // #ifdef CONFIG_TASKS_TRACE_RCU
+
+#define TASKS_TRACING_OPS
+
+#endif // #else // #ifdef CONFIG_TASKS_TRACE_RCU
+
static unsigned long rcuscale_seq_diff(unsigned long new, unsigned long old)
{
if (!cur_ops->gp_diff)
@@ -399,6 +420,7 @@ rcu_scale_writer(void *arg)
VERBOSE_SCALEOUT_STRING("rcu_scale_writer task started");
WARN_ON(!wdpp);
set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids));
+ current->flags |= PF_NO_SETAFFINITY;
sched_set_fifo_low(current);
if (holdoff)
@@ -487,7 +509,7 @@ retry:
if (gp_async) {
cur_ops->gp_barrier();
}
- writer_n_durations[me] = i_max;
+ writer_n_durations[me] = i_max + 1;
torture_kthread_stopping("rcu_scale_writer");
return 0;
}
@@ -514,11 +536,11 @@ rcu_scale_cleanup(void)
* during the mid-boot phase, so have to wait till the end.
*/
if (rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp)
- VERBOSE_SCALEOUT_ERRSTRING("All grace periods expedited, no normal ones to measure!");
+ SCALEOUT_ERRSTRING("All grace periods expedited, no normal ones to measure!");
if (rcu_gp_is_normal() && gp_exp)
- VERBOSE_SCALEOUT_ERRSTRING("All grace periods normal, no expedited ones to measure!");
+ SCALEOUT_ERRSTRING("All grace periods normal, no expedited ones to measure!");
if (gp_exp && gp_async)
- VERBOSE_SCALEOUT_ERRSTRING("No expedited async GPs, so went with async!");
+ SCALEOUT_ERRSTRING("No expedited async GPs, so went with async!");
if (torture_cleanup_begin())
return;
@@ -561,7 +583,7 @@ rcu_scale_cleanup(void)
wdpp = writer_durations[i];
if (!wdpp)
continue;
- for (j = 0; j <= writer_n_durations[i]; j++) {
+ for (j = 0; j < writer_n_durations[i]; j++) {
wdp = &wdpp[j];
pr_alert("%s%s %4d writer-duration: %5d %llu\n",
scale_type, SCALE_FLAG,
@@ -609,8 +631,7 @@ static int compute_real(int n)
static int
rcu_scale_shutdown(void *arg)
{
- wait_event(shutdown_wq,
- atomic_read(&n_rcu_scale_writer_finished) >= nrealwriters);
+ wait_event_idle(shutdown_wq, atomic_read(&n_rcu_scale_writer_finished) >= nrealwriters);
smp_mb(); /* Wake before output. */
rcu_scale_cleanup();
kernel_power_off();
@@ -638,6 +659,14 @@ struct kfree_obj {
struct rcu_head rh;
};
+/* Used if doing RCU-kfree'ing via call_rcu(). */
+static void kfree_call_rcu(struct rcu_head *rh)
+{
+ struct kfree_obj *obj = container_of(rh, struct kfree_obj, rh);
+
+ kfree(obj);
+}
+
static int
kfree_scale_thread(void *arg)
{
@@ -675,13 +704,18 @@ kfree_scale_thread(void *arg)
if (!alloc_ptr)
return -ENOMEM;
+ if (kfree_by_call_rcu) {
+ call_rcu(&(alloc_ptr->rh), kfree_call_rcu);
+ continue;
+ }
+
// By default kfree_rcu_test_single and kfree_rcu_test_double are
// initialized to false. If both have the same value (false or true)
// both are randomly tested, otherwise only the one with value true
// is tested.
if ((kfree_rcu_test_single && !kfree_rcu_test_double) ||
(kfree_rcu_test_both && torture_random(&tr) & 0x800))
- kfree_rcu(alloc_ptr);
+ kfree_rcu_mightsleep(alloc_ptr);
else
kfree_rcu(alloc_ptr, rh);
}
@@ -736,8 +770,8 @@ kfree_scale_cleanup(void)
static int
kfree_scale_shutdown(void *arg)
{
- wait_event(shutdown_wq,
- atomic_read(&n_kfree_scale_thread_ended) >= kfree_nrealthreads);
+ wait_event_idle(shutdown_wq,
+ atomic_read(&n_kfree_scale_thread_ended) >= kfree_nrealthreads);
smp_mb(); /* Wake before output. */
@@ -746,11 +780,58 @@ kfree_scale_shutdown(void *arg)
return -EINVAL;
}
+// Used if doing RCU-kfree'ing via call_rcu().
+static unsigned long jiffies_at_lazy_cb;
+static struct rcu_head lazy_test1_rh;
+static int rcu_lazy_test1_cb_called;
+static void call_rcu_lazy_test1(struct rcu_head *rh)
+{
+ jiffies_at_lazy_cb = jiffies;
+ WRITE_ONCE(rcu_lazy_test1_cb_called, 1);
+}
+
static int __init
kfree_scale_init(void)
{
- long i;
int firsterr = 0;
+ long i;
+ unsigned long jif_start;
+ unsigned long orig_jif;
+
+ // Also, do a quick self-test to ensure laziness is as much as
+ // expected.
+ if (kfree_by_call_rcu && !IS_ENABLED(CONFIG_RCU_LAZY)) {
+ pr_alert("CONFIG_RCU_LAZY is disabled, falling back to kfree_rcu() for delayed RCU kfree'ing\n");
+ kfree_by_call_rcu = 0;
+ }
+
+ if (kfree_by_call_rcu) {
+ /* do a test to check the timeout. */
+ orig_jif = rcu_lazy_get_jiffies_till_flush();
+
+ rcu_lazy_set_jiffies_till_flush(2 * HZ);
+ rcu_barrier();
+
+ jif_start = jiffies;
+ jiffies_at_lazy_cb = 0;
+ call_rcu(&lazy_test1_rh, call_rcu_lazy_test1);
+
+ smp_cond_load_relaxed(&rcu_lazy_test1_cb_called, VAL == 1);
+
+ rcu_lazy_set_jiffies_till_flush(orig_jif);
+
+ if (WARN_ON_ONCE(jiffies_at_lazy_cb - jif_start < 2 * HZ)) {
+ pr_alert("ERROR: call_rcu() CBs are not being lazy as expected!\n");
+ WARN_ON_ONCE(1);
+ return -1;
+ }
+
+ if (WARN_ON_ONCE(jiffies_at_lazy_cb - jif_start > 3 * HZ)) {
+ pr_alert("ERROR: call_rcu() CBs are being too lazy!\n");
+ WARN_ON_ONCE(1);
+ return -1;
+ }
+ }
kfree_nrealthreads = compute_real(kfree_nthreads);
/* Start up the kthreads. */
@@ -758,12 +839,14 @@ kfree_scale_init(void)
init_waitqueue_head(&shutdown_wq);
firsterr = torture_create_kthread(kfree_scale_shutdown, NULL,
shutdown_task);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
schedule_timeout_uninterruptible(1);
}
- pr_alert("kfree object size=%zu\n", kfree_mult * sizeof(struct kfree_obj));
+ pr_alert("kfree object size=%zu, kfree_by_call_rcu=%d\n",
+ kfree_mult * sizeof(struct kfree_obj),
+ kfree_by_call_rcu);
kfree_reader_tasks = kcalloc(kfree_nrealthreads, sizeof(kfree_reader_tasks[0]),
GFP_KERNEL);
@@ -775,7 +858,7 @@ kfree_scale_init(void)
for (i = 0; i < kfree_nrealthreads; i++) {
firsterr = torture_create_kthread(kfree_scale_thread, (void *)i,
kfree_reader_tasks[i]);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
}
@@ -797,7 +880,7 @@ rcu_scale_init(void)
long i;
int firsterr = 0;
static struct rcu_scale_ops *scale_ops[] = {
- &rcu_ops, &srcu_ops, &srcud_ops, &tasks_ops, &tasks_tracing_ops
+ &rcu_ops, &srcu_ops, &srcud_ops, TASKS_OPS TASKS_TRACING_OPS
};
if (!torture_init_begin(scale_type, verbose))
@@ -838,21 +921,21 @@ rcu_scale_init(void)
init_waitqueue_head(&shutdown_wq);
firsterr = torture_create_kthread(rcu_scale_shutdown, NULL,
shutdown_task);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
schedule_timeout_uninterruptible(1);
}
reader_tasks = kcalloc(nrealreaders, sizeof(reader_tasks[0]),
GFP_KERNEL);
if (reader_tasks == NULL) {
- VERBOSE_SCALEOUT_ERRSTRING("out of memory");
+ SCALEOUT_ERRSTRING("out of memory");
firsterr = -ENOMEM;
goto unwind;
}
for (i = 0; i < nrealreaders; i++) {
firsterr = torture_create_kthread(rcu_scale_reader, (void *)i,
reader_tasks[i]);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
}
while (atomic_read(&n_rcu_scale_reader_started) < nrealreaders)
@@ -865,7 +948,7 @@ rcu_scale_init(void)
kcalloc(nrealwriters, sizeof(*writer_n_durations),
GFP_KERNEL);
if (!writer_tasks || !writer_durations || !writer_n_durations) {
- VERBOSE_SCALEOUT_ERRSTRING("out of memory");
+ SCALEOUT_ERRSTRING("out of memory");
firsterr = -ENOMEM;
goto unwind;
}
@@ -879,7 +962,7 @@ rcu_scale_init(void)
}
firsterr = torture_create_kthread(rcu_scale_writer, (void *)i,
writer_tasks[i]);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
}
torture_init_end();
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 40ef5417d954..147551c23baf 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -46,6 +46,7 @@
#include <linux/oom.h>
#include <linux/tick.h>
#include <linux/rcupdate_trace.h>
+#include <linux/nmi.h>
#include "rcu.h"
@@ -53,15 +54,18 @@ MODULE_LICENSE("GPL");
MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com> and Josh Triplett <josh@joshtriplett.org>");
/* Bits for ->extendables field, extendables param, and related definitions. */
-#define RCUTORTURE_RDR_SHIFT 8 /* Put SRCU index in upper bits. */
-#define RCUTORTURE_RDR_MASK ((1 << RCUTORTURE_RDR_SHIFT) - 1)
+#define RCUTORTURE_RDR_SHIFT_1 8 /* Put SRCU index in upper bits. */
+#define RCUTORTURE_RDR_MASK_1 (1 << RCUTORTURE_RDR_SHIFT_1)
+#define RCUTORTURE_RDR_SHIFT_2 9 /* Put SRCU index in upper bits. */
+#define RCUTORTURE_RDR_MASK_2 (1 << RCUTORTURE_RDR_SHIFT_2)
#define RCUTORTURE_RDR_BH 0x01 /* Extend readers by disabling bh. */
#define RCUTORTURE_RDR_IRQ 0x02 /* ... disabling interrupts. */
#define RCUTORTURE_RDR_PREEMPT 0x04 /* ... disabling preemption. */
#define RCUTORTURE_RDR_RBH 0x08 /* ... rcu_read_lock_bh(). */
#define RCUTORTURE_RDR_SCHED 0x10 /* ... rcu_read_lock_sched(). */
-#define RCUTORTURE_RDR_RCU 0x20 /* ... entering another RCU reader. */
-#define RCUTORTURE_RDR_NBITS 6 /* Number of bits defined above. */
+#define RCUTORTURE_RDR_RCU_1 0x20 /* ... entering another RCU reader. */
+#define RCUTORTURE_RDR_RCU_2 0x40 /* ... entering another RCU reader. */
+#define RCUTORTURE_RDR_NBITS 7 /* Number of bits defined above. */
#define RCUTORTURE_MAX_EXTEND \
(RCUTORTURE_RDR_BH | RCUTORTURE_RDR_IRQ | RCUTORTURE_RDR_PREEMPT | \
RCUTORTURE_RDR_RBH | RCUTORTURE_RDR_SCHED)
@@ -71,60 +75,54 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com> and Josh Triplett <josh@
torture_param(int, extendables, RCUTORTURE_MAX_EXTEND,
"Extend readers by disabling bh (1), irqs (2), or preempt (4)");
-torture_param(int, fqs_duration, 0,
- "Duration of fqs bursts (us), 0 to disable");
+torture_param(int, fqs_duration, 0, "Duration of fqs bursts (us), 0 to disable");
torture_param(int, fqs_holdoff, 0, "Holdoff time within fqs bursts (us)");
torture_param(int, fqs_stutter, 3, "Wait time between fqs bursts (s)");
-torture_param(bool, fwd_progress, 1, "Test grace-period forward progress");
+torture_param(int, fwd_progress, 1, "Number of grace-period forward progress tasks (0 to disable)");
torture_param(int, fwd_progress_div, 4, "Fraction of CPU stall to wait");
-torture_param(int, fwd_progress_holdoff, 60,
- "Time between forward-progress tests (s)");
-torture_param(bool, fwd_progress_need_resched, 1,
- "Hide cond_resched() behind need_resched()");
+torture_param(int, fwd_progress_holdoff, 60, "Time between forward-progress tests (s)");
+torture_param(bool, fwd_progress_need_resched, 1, "Hide cond_resched() behind need_resched()");
torture_param(bool, gp_cond, false, "Use conditional/async GP wait primitives");
+torture_param(bool, gp_cond_exp, false, "Use conditional/async expedited GP wait primitives");
+torture_param(bool, gp_cond_full, false, "Use conditional/async full-state GP wait primitives");
+torture_param(bool, gp_cond_exp_full, false,
+ "Use conditional/async full-stateexpedited GP wait primitives");
torture_param(bool, gp_exp, false, "Use expedited GP wait primitives");
-torture_param(bool, gp_normal, false,
- "Use normal (non-expedited) GP wait primitives");
+torture_param(bool, gp_normal, false, "Use normal (non-expedited) GP wait primitives");
torture_param(bool, gp_poll, false, "Use polling GP wait primitives");
+torture_param(bool, gp_poll_exp, false, "Use polling expedited GP wait primitives");
+torture_param(bool, gp_poll_full, false, "Use polling full-state GP wait primitives");
+torture_param(bool, gp_poll_exp_full, false, "Use polling full-state expedited GP wait primitives");
torture_param(bool, gp_sync, false, "Use synchronous GP wait primitives");
torture_param(int, irqreader, 1, "Allow RCU readers from irq handlers");
torture_param(int, leakpointer, 0, "Leak pointer dereferences from readers");
-torture_param(int, n_barrier_cbs, 0,
- "# of callbacks/kthreads for barrier testing");
+torture_param(int, n_barrier_cbs, 0, "# of callbacks/kthreads for barrier testing");
torture_param(int, nfakewriters, 4, "Number of RCU fake writer threads");
torture_param(int, nreaders, -1, "Number of RCU reader threads");
-torture_param(int, object_debug, 0,
- "Enable debug-object double call_rcu() testing");
+torture_param(int, object_debug, 0, "Enable debug-object double call_rcu() testing");
torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)");
-torture_param(int, onoff_interval, 0,
- "Time between CPU hotplugs (jiffies), 0=disable");
+torture_param(int, onoff_interval, 0, "Time between CPU hotplugs (jiffies), 0=disable");
torture_param(int, nocbs_nthreads, 0, "Number of NOCB toggle threads, 0 to disable");
torture_param(int, nocbs_toggle, 1000, "Time between toggling nocb state (ms)");
-torture_param(int, read_exit_delay, 13,
- "Delay between read-then-exit episodes (s)");
-torture_param(int, read_exit_burst, 16,
- "# of read-then-exit bursts per episode, zero to disable");
+torture_param(int, read_exit_delay, 13, "Delay between read-then-exit episodes (s)");
+torture_param(int, read_exit_burst, 16, "# of read-then-exit bursts per episode, zero to disable");
torture_param(int, shuffle_interval, 3, "Number of seconds between shuffles");
torture_param(int, shutdown_secs, 0, "Shutdown time (s), <= zero to disable.");
torture_param(int, stall_cpu, 0, "Stall duration (s), zero to disable.");
-torture_param(int, stall_cpu_holdoff, 10,
- "Time to wait before starting stall (s).");
+torture_param(int, stall_cpu_holdoff, 10, "Time to wait before starting stall (s).");
+torture_param(bool, stall_no_softlockup, false, "Avoid softlockup warning during cpu stall.");
torture_param(int, stall_cpu_irqsoff, 0, "Disable interrupts while stalling.");
torture_param(int, stall_cpu_block, 0, "Sleep while stalling.");
-torture_param(int, stall_gp_kthread, 0,
- "Grace-period kthread stall duration (s).");
-torture_param(int, stat_interval, 60,
- "Number of seconds between stats printk()s");
+torture_param(int, stall_gp_kthread, 0, "Grace-period kthread stall duration (s).");
+torture_param(int, stat_interval, 60, "Number of seconds between stats printk()s");
torture_param(int, stutter, 5, "Number of seconds to run/halt test");
torture_param(int, test_boost, 1, "Test RCU prio boost: 0=no, 1=maybe, 2=yes.");
-torture_param(int, test_boost_duration, 4,
- "Duration of each boost test, seconds.");
-torture_param(int, test_boost_interval, 7,
- "Interval between boost tests, seconds.");
-torture_param(bool, test_no_idle_hz, true,
- "Test support for tickless idle CPUs");
-torture_param(int, verbose, 1,
- "Enable verbose debugging printk()s");
+torture_param(int, test_boost_duration, 4, "Duration of each boost test, seconds.");
+torture_param(int, test_boost_interval, 7, "Interval between boost tests, seconds.");
+torture_param(int, test_nmis, 0, "End-test NMI tests, 0 to disable.");
+torture_param(bool, test_no_idle_hz, true, "Test support for tickless idle CPUs");
+torture_param(int, test_srcu_lockdep, 0, "Test specified SRCU deadlock scenario.");
+torture_param(int, verbose, 1, "Enable verbose debugging printk()s");
static char *torture_type = "rcu";
module_param(torture_type, charp, 0444);
@@ -140,7 +138,7 @@ static struct task_struct *stats_task;
static struct task_struct *fqs_task;
static struct task_struct *boost_tasks[NR_CPUS];
static struct task_struct *stall_task;
-static struct task_struct *fwd_prog_task;
+static struct task_struct **fwd_prog_tasks;
static struct task_struct **barrier_cbs_tasks;
static struct task_struct *barrier_task;
static struct task_struct *read_exit_task;
@@ -183,7 +181,6 @@ static atomic_t n_rcu_torture_mbchk_tries;
static atomic_t n_rcu_torture_error;
static long n_rcu_torture_barrier_error;
static long n_rcu_torture_boost_ktrerror;
-static long n_rcu_torture_boost_rterror;
static long n_rcu_torture_boost_failure;
static long n_rcu_torture_boosts;
static atomic_long_t n_rcu_torture_timers;
@@ -203,12 +200,24 @@ static int rcu_torture_writer_state;
#define RTWS_DEF_FREE 3
#define RTWS_EXP_SYNC 4
#define RTWS_COND_GET 5
-#define RTWS_COND_SYNC 6
-#define RTWS_POLL_GET 7
-#define RTWS_POLL_WAIT 8
-#define RTWS_SYNC 9
-#define RTWS_STUTTER 10
-#define RTWS_STOPPING 11
+#define RTWS_COND_GET_FULL 6
+#define RTWS_COND_GET_EXP 7
+#define RTWS_COND_GET_EXP_FULL 8
+#define RTWS_COND_SYNC 9
+#define RTWS_COND_SYNC_FULL 10
+#define RTWS_COND_SYNC_EXP 11
+#define RTWS_COND_SYNC_EXP_FULL 12
+#define RTWS_POLL_GET 13
+#define RTWS_POLL_GET_FULL 14
+#define RTWS_POLL_GET_EXP 15
+#define RTWS_POLL_GET_EXP_FULL 16
+#define RTWS_POLL_WAIT 17
+#define RTWS_POLL_WAIT_FULL 18
+#define RTWS_POLL_WAIT_EXP 19
+#define RTWS_POLL_WAIT_EXP_FULL 20
+#define RTWS_SYNC 21
+#define RTWS_STUTTER 22
+#define RTWS_STOPPING 23
static const char * const rcu_torture_writer_state_names[] = {
"RTWS_FIXED_DELAY",
"RTWS_DELAY",
@@ -216,9 +225,21 @@ static const char * const rcu_torture_writer_state_names[] = {
"RTWS_DEF_FREE",
"RTWS_EXP_SYNC",
"RTWS_COND_GET",
+ "RTWS_COND_GET_FULL",
+ "RTWS_COND_GET_EXP",
+ "RTWS_COND_GET_EXP_FULL",
"RTWS_COND_SYNC",
+ "RTWS_COND_SYNC_FULL",
+ "RTWS_COND_SYNC_EXP",
+ "RTWS_COND_SYNC_EXP_FULL",
"RTWS_POLL_GET",
+ "RTWS_POLL_GET_FULL",
+ "RTWS_POLL_GET_EXP",
+ "RTWS_POLL_GET_EXP_FULL",
"RTWS_POLL_WAIT",
+ "RTWS_POLL_WAIT_FULL",
+ "RTWS_POLL_WAIT_EXP",
+ "RTWS_POLL_WAIT_EXP_FULL",
"RTWS_SYNC",
"RTWS_STUTTER",
"RTWS_STOPPING",
@@ -278,7 +299,7 @@ static atomic_t barrier_cbs_invoked; /* Barrier callbacks invoked. */
static wait_queue_head_t *barrier_cbs_wq; /* Coordinate barrier testing. */
static DECLARE_WAIT_QUEUE_HEAD(barrier_wq);
-static bool rcu_fwd_cb_nodelay; /* Short rcu_torture_delay() delays. */
+static atomic_t rcu_fwd_cb_nodelay; /* Short rcu_torture_delay() delays. */
/*
* Allocate an element from the rcu_tortures pool.
@@ -331,10 +352,27 @@ struct rcu_torture_ops {
void (*deferred_free)(struct rcu_torture *p);
void (*sync)(void);
void (*exp_sync)(void);
+ unsigned long (*get_gp_state_exp)(void);
+ unsigned long (*start_gp_poll_exp)(void);
+ void (*start_gp_poll_exp_full)(struct rcu_gp_oldstate *rgosp);
+ bool (*poll_gp_state_exp)(unsigned long oldstate);
+ void (*cond_sync_exp)(unsigned long oldstate);
+ void (*cond_sync_exp_full)(struct rcu_gp_oldstate *rgosp);
+ unsigned long (*get_comp_state)(void);
+ void (*get_comp_state_full)(struct rcu_gp_oldstate *rgosp);
+ bool (*same_gp_state)(unsigned long oldstate1, unsigned long oldstate2);
+ bool (*same_gp_state_full)(struct rcu_gp_oldstate *rgosp1, struct rcu_gp_oldstate *rgosp2);
unsigned long (*get_gp_state)(void);
+ void (*get_gp_state_full)(struct rcu_gp_oldstate *rgosp);
+ unsigned long (*get_gp_completed)(void);
+ void (*get_gp_completed_full)(struct rcu_gp_oldstate *rgosp);
unsigned long (*start_gp_poll)(void);
+ void (*start_gp_poll_full)(struct rcu_gp_oldstate *rgosp);
bool (*poll_gp_state)(unsigned long oldstate);
+ bool (*poll_gp_state_full)(struct rcu_gp_oldstate *rgosp);
+ bool (*poll_need_2gp)(bool poll, bool poll_full);
void (*cond_sync)(unsigned long oldstate);
+ void (*cond_sync_full)(struct rcu_gp_oldstate *rgosp);
call_rcu_func_t call;
void (*cb_barrier)(void);
void (*fqs)(void);
@@ -342,10 +380,12 @@ struct rcu_torture_ops {
void (*gp_kthread_dbg)(void);
bool (*check_boost_failed)(unsigned long gp_state, int *cpup);
int (*stall_dur)(void);
+ long cbflood_max;
int irq_capable;
int can_boost;
int extendables;
int slow_gps;
+ int no_pi_lock;
const char *name;
};
@@ -360,7 +400,7 @@ static int torture_readlock_not_held(void)
return rcu_read_lock_bh_held() || rcu_read_lock_sched_held();
}
-static int rcu_torture_read_lock(void) __acquires(RCU)
+static int rcu_torture_read_lock(void)
{
rcu_read_lock();
return 0;
@@ -379,7 +419,7 @@ rcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp)
* period, and we want a long delay occasionally to trigger
* force_quiescent_state. */
- if (!READ_ONCE(rcu_fwd_cb_nodelay) &&
+ if (!atomic_read(&rcu_fwd_cb_nodelay) &&
!(torture_random(rrsp) % (nrealreaders * 2000 * longdelay_ms))) {
started = cur_ops->get_gp_seq();
ts = rcu_trace_clock_local();
@@ -402,7 +442,7 @@ rcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp)
}
}
-static void rcu_torture_read_unlock(int idx) __releases(RCU)
+static void rcu_torture_read_unlock(int idx)
{
rcu_read_unlock();
}
@@ -475,7 +515,7 @@ static unsigned long rcu_no_completed(void)
static void rcu_torture_deferred_free(struct rcu_torture *p)
{
- call_rcu(&p->rtort_rcu, rcu_torture_cb);
+ call_rcu_hurry(&p->rtort_rcu, rcu_torture_cb);
}
static void rcu_sync_torture_init(void)
@@ -483,6 +523,11 @@ static void rcu_sync_torture_init(void)
INIT_LIST_HEAD(&rcu_torture_removed);
}
+static bool rcu_poll_need_2gp(bool poll, bool poll_full)
+{
+ return poll;
+}
+
static struct rcu_torture_ops rcu_ops = {
.ttype = RCU_FLAVOR,
.init = rcu_sync_torture_init,
@@ -495,11 +540,27 @@ static struct rcu_torture_ops rcu_ops = {
.deferred_free = rcu_torture_deferred_free,
.sync = synchronize_rcu,
.exp_sync = synchronize_rcu_expedited,
+ .same_gp_state = same_state_synchronize_rcu,
+ .same_gp_state_full = same_state_synchronize_rcu_full,
+ .get_comp_state = get_completed_synchronize_rcu,
+ .get_comp_state_full = get_completed_synchronize_rcu_full,
.get_gp_state = get_state_synchronize_rcu,
+ .get_gp_state_full = get_state_synchronize_rcu_full,
+ .get_gp_completed = get_completed_synchronize_rcu,
+ .get_gp_completed_full = get_completed_synchronize_rcu_full,
.start_gp_poll = start_poll_synchronize_rcu,
+ .start_gp_poll_full = start_poll_synchronize_rcu_full,
.poll_gp_state = poll_state_synchronize_rcu,
+ .poll_gp_state_full = poll_state_synchronize_rcu_full,
+ .poll_need_2gp = rcu_poll_need_2gp,
.cond_sync = cond_synchronize_rcu,
- .call = call_rcu,
+ .cond_sync_full = cond_synchronize_rcu_full,
+ .get_gp_state_exp = get_state_synchronize_rcu,
+ .start_gp_poll_exp = start_poll_synchronize_rcu_expedited,
+ .start_gp_poll_exp_full = start_poll_synchronize_rcu_expedited_full,
+ .poll_gp_state_exp = poll_state_synchronize_rcu,
+ .cond_sync_exp = cond_synchronize_rcu_expedited,
+ .call = call_rcu_hurry,
.cb_barrier = rcu_barrier,
.fqs = rcu_force_quiescent_state,
.stats = NULL,
@@ -563,10 +624,14 @@ static struct rcu_torture_ops rcu_busted_ops = {
DEFINE_STATIC_SRCU(srcu_ctl);
static struct srcu_struct srcu_ctld;
static struct srcu_struct *srcu_ctlp = &srcu_ctl;
+static struct rcu_torture_ops srcud_ops;
-static int srcu_torture_read_lock(void) __acquires(srcu_ctlp)
+static int srcu_torture_read_lock(void)
{
- return srcu_read_lock(srcu_ctlp);
+ if (cur_ops == &srcud_ops)
+ return srcu_read_lock_nmisafe(srcu_ctlp);
+ else
+ return srcu_read_lock(srcu_ctlp);
}
static void
@@ -588,9 +653,12 @@ srcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp)
}
}
-static void srcu_torture_read_unlock(int idx) __releases(srcu_ctlp)
+static void srcu_torture_read_unlock(int idx)
{
- srcu_read_unlock(srcu_ctlp, idx);
+ if (cur_ops == &srcud_ops)
+ srcu_read_unlock_nmisafe(srcu_ctlp, idx);
+ else
+ srcu_read_unlock(srcu_ctlp, idx);
}
static int torture_srcu_read_lock_held(void)
@@ -666,7 +734,9 @@ static struct rcu_torture_ops srcu_ops = {
.call = srcu_torture_call,
.cb_barrier = srcu_torture_barrier,
.stats = srcu_torture_stats,
+ .cbflood_max = 50000,
.irq_capable = 1,
+ .no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU),
.name = "srcu"
};
@@ -696,10 +766,15 @@ static struct rcu_torture_ops srcud_ops = {
.deferred_free = srcu_torture_deferred_free,
.sync = srcu_torture_synchronize,
.exp_sync = srcu_torture_synchronize_expedited,
+ .get_gp_state = srcu_torture_get_gp_state,
+ .start_gp_poll = srcu_torture_start_gp_poll,
+ .poll_gp_state = srcu_torture_poll_gp_state,
.call = srcu_torture_call,
.cb_barrier = srcu_torture_barrier,
.stats = srcu_torture_stats,
+ .cbflood_max = 50000,
.irq_capable = 1,
+ .no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU),
.name = "srcud"
};
@@ -720,11 +795,56 @@ static struct rcu_torture_ops busted_srcud_ops = {
.cb_barrier = srcu_torture_barrier,
.stats = srcu_torture_stats,
.irq_capable = 1,
+ .no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU),
.extendables = RCUTORTURE_MAX_EXTEND,
.name = "busted_srcud"
};
/*
+ * Definitions for trivial CONFIG_PREEMPT=n-only torture testing.
+ * This implementation does not necessarily work well with CPU hotplug.
+ */
+
+static void synchronize_rcu_trivial(void)
+{
+ int cpu;
+
+ for_each_online_cpu(cpu) {
+ rcutorture_sched_setaffinity(current->pid, cpumask_of(cpu));
+ WARN_ON_ONCE(raw_smp_processor_id() != cpu);
+ }
+}
+
+static int rcu_torture_read_lock_trivial(void)
+{
+ preempt_disable();
+ return 0;
+}
+
+static void rcu_torture_read_unlock_trivial(int idx)
+{
+ preempt_enable();
+}
+
+static struct rcu_torture_ops trivial_ops = {
+ .ttype = RCU_TRIVIAL_FLAVOR,
+ .init = rcu_sync_torture_init,
+ .readlock = rcu_torture_read_lock_trivial,
+ .read_delay = rcu_read_delay, /* just reuse rcu's version. */
+ .readunlock = rcu_torture_read_unlock_trivial,
+ .readlock_held = torture_readlock_not_held,
+ .get_gp_seq = rcu_no_completed,
+ .sync = synchronize_rcu_trivial,
+ .exp_sync = synchronize_rcu_trivial,
+ .fqs = NULL,
+ .stats = NULL,
+ .irq_capable = 1,
+ .name = "trivial"
+};
+
+#ifdef CONFIG_TASKS_RCU
+
+/*
* Definitions for RCU-tasks torture testing.
*/
@@ -744,7 +864,7 @@ static void rcu_tasks_torture_deferred_free(struct rcu_torture *p)
static void synchronize_rcu_mult_test(void)
{
- synchronize_rcu_mult(call_rcu_tasks, call_rcu);
+ synchronize_rcu_mult(call_rcu_tasks, call_rcu_hurry);
}
static struct rcu_torture_ops tasks_ops = {
@@ -767,47 +887,16 @@ static struct rcu_torture_ops tasks_ops = {
.name = "tasks"
};
-/*
- * Definitions for trivial CONFIG_PREEMPT=n-only torture testing.
- * This implementation does not necessarily work well with CPU hotplug.
- */
+#define TASKS_OPS &tasks_ops,
-static void synchronize_rcu_trivial(void)
-{
- int cpu;
+#else // #ifdef CONFIG_TASKS_RCU
- for_each_online_cpu(cpu) {
- rcutorture_sched_setaffinity(current->pid, cpumask_of(cpu));
- WARN_ON_ONCE(raw_smp_processor_id() != cpu);
- }
-}
+#define TASKS_OPS
-static int rcu_torture_read_lock_trivial(void) __acquires(RCU)
-{
- preempt_disable();
- return 0;
-}
+#endif // #else #ifdef CONFIG_TASKS_RCU
-static void rcu_torture_read_unlock_trivial(int idx) __releases(RCU)
-{
- preempt_enable();
-}
-static struct rcu_torture_ops trivial_ops = {
- .ttype = RCU_TRIVIAL_FLAVOR,
- .init = rcu_sync_torture_init,
- .readlock = rcu_torture_read_lock_trivial,
- .read_delay = rcu_read_delay, /* just reuse rcu's version. */
- .readunlock = rcu_torture_read_unlock_trivial,
- .readlock_held = torture_readlock_not_held,
- .get_gp_seq = rcu_no_completed,
- .sync = synchronize_rcu_trivial,
- .exp_sync = synchronize_rcu_trivial,
- .fqs = NULL,
- .stats = NULL,
- .irq_capable = 1,
- .name = "trivial"
-};
+#ifdef CONFIG_TASKS_RUDE_RCU
/*
* Definitions for rude RCU-tasks torture testing.
@@ -831,12 +920,24 @@ static struct rcu_torture_ops tasks_rude_ops = {
.call = call_rcu_tasks_rude,
.cb_barrier = rcu_barrier_tasks_rude,
.gp_kthread_dbg = show_rcu_tasks_rude_gp_kthread,
+ .cbflood_max = 50000,
.fqs = NULL,
.stats = NULL,
.irq_capable = 1,
.name = "tasks-rude"
};
+#define TASKS_RUDE_OPS &tasks_rude_ops,
+
+#else // #ifdef CONFIG_TASKS_RUDE_RCU
+
+#define TASKS_RUDE_OPS
+
+#endif // #else #ifdef CONFIG_TASKS_RUDE_RCU
+
+
+#ifdef CONFIG_TASKS_TRACE_RCU
+
/*
* Definitions for tracing RCU-tasks torture testing.
*/
@@ -871,6 +972,7 @@ static struct rcu_torture_ops tasks_tracing_ops = {
.call = call_rcu_tasks_trace,
.cb_barrier = rcu_barrier_tasks_trace,
.gp_kthread_dbg = show_rcu_tasks_trace_gp_kthread,
+ .cbflood_max = 50000,
.fqs = NULL,
.stats = NULL,
.irq_capable = 1,
@@ -878,6 +980,15 @@ static struct rcu_torture_ops tasks_tracing_ops = {
.name = "tasks-tracing"
};
+#define TASKS_TRACING_OPS &tasks_tracing_ops,
+
+#else // #ifdef CONFIG_TASKS_TRACE_RCU
+
+#define TASKS_TRACING_OPS
+
+#endif // #else #ifdef CONFIG_TASKS_TRACE_RCU
+
+
static unsigned long rcutorture_seq_diff(unsigned long new, unsigned long old)
{
if (!cur_ops->gp_diff)
@@ -984,7 +1095,7 @@ static int rcu_torture_boost(void *arg)
goto checkwait;
/* Wait for the next test interval. */
- oldstarttime = boost_starttime;
+ oldstarttime = READ_ONCE(boost_starttime);
while (time_before(jiffies, oldstarttime)) {
schedule_timeout_interruptible(oldstarttime - jiffies);
if (stutter_wait("rcu_torture_boost"))
@@ -1028,10 +1139,11 @@ static int rcu_torture_boost(void *arg)
* interval. Besides, we are running at RT priority,
* so delays should be relatively rare.
*/
- while (oldstarttime == boost_starttime && !kthread_should_stop()) {
+ while (oldstarttime == READ_ONCE(boost_starttime) && !kthread_should_stop()) {
if (mutex_trylock(&boost_mutex)) {
if (oldstarttime == boost_starttime) {
- boost_starttime = jiffies + test_boost_interval * HZ;
+ WRITE_ONCE(boost_starttime,
+ jiffies + test_boost_interval * HZ);
n_rcu_torture_boosts++;
}
mutex_unlock(&boost_mutex);
@@ -1087,9 +1199,8 @@ rcu_torture_fqs(void *arg)
return 0;
}
-// Used by writers to randomly choose from the available grace-period
-// primitives. The only purpose of the initialization is to size the array.
-static int synctype[] = { RTWS_DEF_FREE, RTWS_EXP_SYNC, RTWS_COND_GET, RTWS_POLL_GET, RTWS_SYNC };
+// Used by writers to randomly choose from the available grace-period primitives.
+static int synctype[ARRAY_SIZE(rcu_torture_writer_state_names)] = { };
static int nsynctypes;
/*
@@ -1097,18 +1208,60 @@ static int nsynctypes;
*/
static void rcu_torture_write_types(void)
{
- bool gp_cond1 = gp_cond, gp_exp1 = gp_exp, gp_normal1 = gp_normal;
- bool gp_poll1 = gp_poll, gp_sync1 = gp_sync;
+ bool gp_cond1 = gp_cond, gp_cond_exp1 = gp_cond_exp, gp_cond_full1 = gp_cond_full;
+ bool gp_cond_exp_full1 = gp_cond_exp_full, gp_exp1 = gp_exp, gp_poll_exp1 = gp_poll_exp;
+ bool gp_poll_exp_full1 = gp_poll_exp_full, gp_normal1 = gp_normal, gp_poll1 = gp_poll;
+ bool gp_poll_full1 = gp_poll_full, gp_sync1 = gp_sync;
/* Initialize synctype[] array. If none set, take default. */
- if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_poll1 && !gp_sync1)
- gp_cond1 = gp_exp1 = gp_normal1 = gp_poll1 = gp_sync1 = true;
+ if (!gp_cond1 &&
+ !gp_cond_exp1 &&
+ !gp_cond_full1 &&
+ !gp_cond_exp_full1 &&
+ !gp_exp1 &&
+ !gp_poll_exp1 &&
+ !gp_poll_exp_full1 &&
+ !gp_normal1 &&
+ !gp_poll1 &&
+ !gp_poll_full1 &&
+ !gp_sync1) {
+ gp_cond1 = true;
+ gp_cond_exp1 = true;
+ gp_cond_full1 = true;
+ gp_cond_exp_full1 = true;
+ gp_exp1 = true;
+ gp_poll_exp1 = true;
+ gp_poll_exp_full1 = true;
+ gp_normal1 = true;
+ gp_poll1 = true;
+ gp_poll_full1 = true;
+ gp_sync1 = true;
+ }
if (gp_cond1 && cur_ops->get_gp_state && cur_ops->cond_sync) {
synctype[nsynctypes++] = RTWS_COND_GET;
pr_info("%s: Testing conditional GPs.\n", __func__);
} else if (gp_cond && (!cur_ops->get_gp_state || !cur_ops->cond_sync)) {
pr_alert("%s: gp_cond without primitives.\n", __func__);
}
+ if (gp_cond_exp1 && cur_ops->get_gp_state_exp && cur_ops->cond_sync_exp) {
+ synctype[nsynctypes++] = RTWS_COND_GET_EXP;
+ pr_info("%s: Testing conditional expedited GPs.\n", __func__);
+ } else if (gp_cond_exp && (!cur_ops->get_gp_state_exp || !cur_ops->cond_sync_exp)) {
+ pr_alert("%s: gp_cond_exp without primitives.\n", __func__);
+ }
+ if (gp_cond_full1 && cur_ops->get_gp_state && cur_ops->cond_sync_full) {
+ synctype[nsynctypes++] = RTWS_COND_GET_FULL;
+ pr_info("%s: Testing conditional full-state GPs.\n", __func__);
+ } else if (gp_cond_full && (!cur_ops->get_gp_state || !cur_ops->cond_sync_full)) {
+ pr_alert("%s: gp_cond_full without primitives.\n", __func__);
+ }
+ if (gp_cond_exp_full1 && cur_ops->get_gp_state_exp && cur_ops->cond_sync_exp_full) {
+ synctype[nsynctypes++] = RTWS_COND_GET_EXP_FULL;
+ pr_info("%s: Testing conditional full-state expedited GPs.\n", __func__);
+ } else if (gp_cond_exp_full &&
+ (!cur_ops->get_gp_state_exp || !cur_ops->cond_sync_exp_full)) {
+ pr_alert("%s: gp_cond_exp_full without primitives.\n", __func__);
+ }
if (gp_exp1 && cur_ops->exp_sync) {
synctype[nsynctypes++] = RTWS_EXP_SYNC;
pr_info("%s: Testing expedited GPs.\n", __func__);
@@ -1121,12 +1274,33 @@ static void rcu_torture_write_types(void)
} else if (gp_normal && !cur_ops->deferred_free) {
pr_alert("%s: gp_normal without primitives.\n", __func__);
}
- if (gp_poll1 && cur_ops->start_gp_poll && cur_ops->poll_gp_state) {
+ if (gp_poll1 && cur_ops->get_comp_state && cur_ops->same_gp_state &&
+ cur_ops->start_gp_poll && cur_ops->poll_gp_state) {
synctype[nsynctypes++] = RTWS_POLL_GET;
pr_info("%s: Testing polling GPs.\n", __func__);
} else if (gp_poll && (!cur_ops->start_gp_poll || !cur_ops->poll_gp_state)) {
pr_alert("%s: gp_poll without primitives.\n", __func__);
}
+ if (gp_poll_full1 && cur_ops->get_comp_state_full && cur_ops->same_gp_state_full
+ && cur_ops->start_gp_poll_full && cur_ops->poll_gp_state_full) {
+ synctype[nsynctypes++] = RTWS_POLL_GET_FULL;
+ pr_info("%s: Testing polling full-state GPs.\n", __func__);
+ } else if (gp_poll_full && (!cur_ops->start_gp_poll_full || !cur_ops->poll_gp_state_full)) {
+ pr_alert("%s: gp_poll_full without primitives.\n", __func__);
+ }
+ if (gp_poll_exp1 && cur_ops->start_gp_poll_exp && cur_ops->poll_gp_state_exp) {
+ synctype[nsynctypes++] = RTWS_POLL_GET_EXP;
+ pr_info("%s: Testing polling expedited GPs.\n", __func__);
+ } else if (gp_poll_exp && (!cur_ops->start_gp_poll_exp || !cur_ops->poll_gp_state_exp)) {
+ pr_alert("%s: gp_poll_exp without primitives.\n", __func__);
+ }
+ if (gp_poll_exp_full1 && cur_ops->start_gp_poll_exp_full && cur_ops->poll_gp_state_full) {
+ synctype[nsynctypes++] = RTWS_POLL_GET_EXP_FULL;
+ pr_info("%s: Testing polling full-state expedited GPs.\n", __func__);
+ } else if (gp_poll_exp_full &&
+ (!cur_ops->start_gp_poll_exp_full || !cur_ops->poll_gp_state_full)) {
+ pr_alert("%s: gp_poll_exp_full without primitives.\n", __func__);
+ }
if (gp_sync1 && cur_ops->sync) {
synctype[nsynctypes++] = RTWS_SYNC;
pr_info("%s: Testing normal GPs.\n", __func__);
@@ -1136,6 +1310,40 @@ static void rcu_torture_write_types(void)
}
/*
+ * Do the specified rcu_torture_writer() synchronous grace period,
+ * while also testing out the polled APIs. Note well that the single-CPU
+ * grace-period optimizations must be accounted for.
+ */
+static void do_rtws_sync(struct torture_random_state *trsp, void (*sync)(void))
+{
+ unsigned long cookie;
+ struct rcu_gp_oldstate cookie_full;
+ bool dopoll;
+ bool dopoll_full;
+ unsigned long r = torture_random(trsp);
+
+ dopoll = cur_ops->get_gp_state && cur_ops->poll_gp_state && !(r & 0x300);
+ dopoll_full = cur_ops->get_gp_state_full && cur_ops->poll_gp_state_full && !(r & 0xc00);
+ if (dopoll || dopoll_full)
+ cpus_read_lock();
+ if (dopoll)
+ cookie = cur_ops->get_gp_state();
+ if (dopoll_full)
+ cur_ops->get_gp_state_full(&cookie_full);
+ if (cur_ops->poll_need_2gp && cur_ops->poll_need_2gp(dopoll, dopoll_full))
+ sync();
+ sync();
+ WARN_ONCE(dopoll && !cur_ops->poll_gp_state(cookie),
+ "%s: Cookie check 3 failed %pS() online %*pbl.",
+ __func__, sync, cpumask_pr_args(cpu_online_mask));
+ WARN_ONCE(dopoll_full && !cur_ops->poll_gp_state_full(&cookie_full),
+ "%s: Cookie check 4 failed %pS() online %*pbl",
+ __func__, sync, cpumask_pr_args(cpu_online_mask));
+ if (dopoll || dopoll_full)
+ cpus_read_unlock();
+}
+
+/*
* RCU torture writer kthread. Repeatedly substitutes a new structure
* for that pointed to by rcu_torture_current, freeing the old structure
* after a series of grace periods (the "pipeline").
@@ -1146,15 +1354,21 @@ rcu_torture_writer(void *arg)
bool boot_ended;
bool can_expedite = !rcu_gp_is_expedited() && !rcu_gp_is_normal();
unsigned long cookie;
+ struct rcu_gp_oldstate cookie_full;
int expediting = 0;
unsigned long gp_snap;
+ unsigned long gp_snap1;
+ struct rcu_gp_oldstate gp_snap_full;
+ struct rcu_gp_oldstate gp_snap1_full;
int i;
int idx;
int oldnice = task_nice(current);
+ struct rcu_gp_oldstate rgo[NUM_ACTIVE_RCU_POLL_FULL_OLDSTATE];
struct rcu_torture *rp;
struct rcu_torture *old_rp;
static DEFINE_TORTURE_RANDOM(rand);
bool stutter_waited;
+ unsigned long ulo[NUM_ACTIVE_RCU_POLL_OLDSTATE];
VERBOSE_TOROUT_STRING("rcu_torture_writer task started");
if (!can_expedite)
@@ -1162,7 +1376,7 @@ rcu_torture_writer(void *arg)
" GP expediting controlled from boot/sysfs for %s.\n",
torture_type, cur_ops->name);
if (WARN_ONCE(nsynctypes == 0,
- "rcu_torture_writer: No update-side primitives.\n")) {
+ "%s: No update-side primitives.\n", __func__)) {
/*
* No updates primitives, so don't try updating.
* The resulting test won't be testing much, hence the
@@ -1170,6 +1384,7 @@ rcu_torture_writer(void *arg)
*/
rcu_torture_writer_state = RTWS_STOPPING;
torture_kthread_stopping("rcu_torture_writer");
+ return 0;
}
do {
@@ -1194,16 +1409,36 @@ rcu_torture_writer(void *arg)
atomic_inc(&rcu_torture_wcount[i]);
WRITE_ONCE(old_rp->rtort_pipe_count,
old_rp->rtort_pipe_count + 1);
+
+ // Make sure readers block polled grace periods.
if (cur_ops->get_gp_state && cur_ops->poll_gp_state) {
idx = cur_ops->readlock();
cookie = cur_ops->get_gp_state();
- WARN_ONCE(rcu_torture_writer_state != RTWS_DEF_FREE &&
- cur_ops->poll_gp_state(cookie),
+ WARN_ONCE(cur_ops->poll_gp_state(cookie),
"%s: Cookie check 1 failed %s(%d) %lu->%lu\n",
__func__,
rcu_torture_writer_state_getname(),
rcu_torture_writer_state,
cookie, cur_ops->get_gp_state());
+ if (cur_ops->get_gp_completed) {
+ cookie = cur_ops->get_gp_completed();
+ WARN_ON_ONCE(!cur_ops->poll_gp_state(cookie));
+ }
+ cur_ops->readunlock(idx);
+ }
+ if (cur_ops->get_gp_state_full && cur_ops->poll_gp_state_full) {
+ idx = cur_ops->readlock();
+ cur_ops->get_gp_state_full(&cookie_full);
+ WARN_ONCE(cur_ops->poll_gp_state_full(&cookie_full),
+ "%s: Cookie check 5 failed %s(%d) online %*pbl\n",
+ __func__,
+ rcu_torture_writer_state_getname(),
+ rcu_torture_writer_state,
+ cpumask_pr_args(cpu_online_mask));
+ if (cur_ops->get_gp_completed_full) {
+ cur_ops->get_gp_completed_full(&cookie_full);
+ WARN_ON_ONCE(!cur_ops->poll_gp_state_full(&cookie_full));
+ }
cur_ops->readunlock(idx);
}
switch (synctype[torture_random(&rand) % nsynctypes]) {
@@ -1213,7 +1448,7 @@ rcu_torture_writer(void *arg)
break;
case RTWS_EXP_SYNC:
rcu_torture_writer_state = RTWS_EXP_SYNC;
- cur_ops->exp_sync();
+ do_rtws_sync(&rand, cur_ops->exp_sync);
rcu_torture_pipe_update(old_rp);
break;
case RTWS_COND_GET:
@@ -1224,18 +1459,92 @@ rcu_torture_writer(void *arg)
cur_ops->cond_sync(gp_snap);
rcu_torture_pipe_update(old_rp);
break;
+ case RTWS_COND_GET_EXP:
+ rcu_torture_writer_state = RTWS_COND_GET_EXP;
+ gp_snap = cur_ops->get_gp_state_exp();
+ torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand);
+ rcu_torture_writer_state = RTWS_COND_SYNC_EXP;
+ cur_ops->cond_sync_exp(gp_snap);
+ rcu_torture_pipe_update(old_rp);
+ break;
+ case RTWS_COND_GET_FULL:
+ rcu_torture_writer_state = RTWS_COND_GET_FULL;
+ cur_ops->get_gp_state_full(&gp_snap_full);
+ torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand);
+ rcu_torture_writer_state = RTWS_COND_SYNC_FULL;
+ cur_ops->cond_sync_full(&gp_snap_full);
+ rcu_torture_pipe_update(old_rp);
+ break;
+ case RTWS_COND_GET_EXP_FULL:
+ rcu_torture_writer_state = RTWS_COND_GET_EXP_FULL;
+ cur_ops->get_gp_state_full(&gp_snap_full);
+ torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand);
+ rcu_torture_writer_state = RTWS_COND_SYNC_EXP_FULL;
+ cur_ops->cond_sync_exp_full(&gp_snap_full);
+ rcu_torture_pipe_update(old_rp);
+ break;
case RTWS_POLL_GET:
rcu_torture_writer_state = RTWS_POLL_GET;
+ for (i = 0; i < ARRAY_SIZE(ulo); i++)
+ ulo[i] = cur_ops->get_comp_state();
gp_snap = cur_ops->start_gp_poll();
rcu_torture_writer_state = RTWS_POLL_WAIT;
- while (!cur_ops->poll_gp_state(gp_snap))
+ while (!cur_ops->poll_gp_state(gp_snap)) {
+ gp_snap1 = cur_ops->get_gp_state();
+ for (i = 0; i < ARRAY_SIZE(ulo); i++)
+ if (cur_ops->poll_gp_state(ulo[i]) ||
+ cur_ops->same_gp_state(ulo[i], gp_snap1)) {
+ ulo[i] = gp_snap1;
+ break;
+ }
+ WARN_ON_ONCE(i >= ARRAY_SIZE(ulo));
+ torture_hrtimeout_jiffies(torture_random(&rand) % 16,
+ &rand);
+ }
+ rcu_torture_pipe_update(old_rp);
+ break;
+ case RTWS_POLL_GET_FULL:
+ rcu_torture_writer_state = RTWS_POLL_GET_FULL;
+ for (i = 0; i < ARRAY_SIZE(rgo); i++)
+ cur_ops->get_comp_state_full(&rgo[i]);
+ cur_ops->start_gp_poll_full(&gp_snap_full);
+ rcu_torture_writer_state = RTWS_POLL_WAIT_FULL;
+ while (!cur_ops->poll_gp_state_full(&gp_snap_full)) {
+ cur_ops->get_gp_state_full(&gp_snap1_full);
+ for (i = 0; i < ARRAY_SIZE(rgo); i++)
+ if (cur_ops->poll_gp_state_full(&rgo[i]) ||
+ cur_ops->same_gp_state_full(&rgo[i],
+ &gp_snap1_full)) {
+ rgo[i] = gp_snap1_full;
+ break;
+ }
+ WARN_ON_ONCE(i >= ARRAY_SIZE(rgo));
+ torture_hrtimeout_jiffies(torture_random(&rand) % 16,
+ &rand);
+ }
+ rcu_torture_pipe_update(old_rp);
+ break;
+ case RTWS_POLL_GET_EXP:
+ rcu_torture_writer_state = RTWS_POLL_GET_EXP;
+ gp_snap = cur_ops->start_gp_poll_exp();
+ rcu_torture_writer_state = RTWS_POLL_WAIT_EXP;
+ while (!cur_ops->poll_gp_state_exp(gp_snap))
+ torture_hrtimeout_jiffies(torture_random(&rand) % 16,
+ &rand);
+ rcu_torture_pipe_update(old_rp);
+ break;
+ case RTWS_POLL_GET_EXP_FULL:
+ rcu_torture_writer_state = RTWS_POLL_GET_EXP_FULL;
+ cur_ops->start_gp_poll_exp_full(&gp_snap_full);
+ rcu_torture_writer_state = RTWS_POLL_WAIT_EXP_FULL;
+ while (!cur_ops->poll_gp_state_full(&gp_snap_full))
torture_hrtimeout_jiffies(torture_random(&rand) % 16,
&rand);
rcu_torture_pipe_update(old_rp);
break;
case RTWS_SYNC:
rcu_torture_writer_state = RTWS_SYNC;
- cur_ops->sync();
+ do_rtws_sync(&rand, cur_ops->sync);
rcu_torture_pipe_update(old_rp);
break;
default:
@@ -1263,7 +1572,7 @@ rcu_torture_writer(void *arg)
boot_ended = rcu_inkernel_boot_has_ended();
stutter_waited = stutter_wait("rcu_torture_writer");
if (stutter_waited &&
- !READ_ONCE(rcu_fwd_cb_nodelay) &&
+ !atomic_read(&rcu_fwd_cb_nodelay) &&
!cur_ops->slow_gps &&
!torture_must_stop() &&
boot_ended)
@@ -1271,8 +1580,9 @@ rcu_torture_writer(void *arg)
if (list_empty(&rcu_tortures[i].rtort_free) &&
rcu_access_pointer(rcu_torture_current) !=
&rcu_tortures[i]) {
- rcu_ftrace_dump(DUMP_ALL);
+ tracing_off();
WARN(1, "%s: rtort_pipe_count: %d\n", __func__, rcu_tortures[i].rtort_pipe_count);
+ rcu_ftrace_dump(DUMP_ALL);
}
if (stutter_waited)
sched_set_normal(current, oldnice);
@@ -1301,11 +1611,23 @@ static int
rcu_torture_fakewriter(void *arg)
{
unsigned long gp_snap;
+ struct rcu_gp_oldstate gp_snap_full;
DEFINE_TORTURE_RANDOM(rand);
VERBOSE_TOROUT_STRING("rcu_torture_fakewriter task started");
set_user_nice(current, MAX_NICE);
+ if (WARN_ONCE(nsynctypes == 0,
+ "%s: No update-side primitives.\n", __func__)) {
+ /*
+ * No updates primitives, so don't try updating.
+ * The resulting test won't be testing much, hence the
+ * above WARN_ONCE().
+ */
+ torture_kthread_stopping("rcu_torture_fakewriter");
+ return 0;
+ }
+
do {
torture_hrtimeout_jiffies(torture_random(&rand) % 10, &rand);
if (cur_ops->cb_barrier != NULL &&
@@ -1323,6 +1645,21 @@ rcu_torture_fakewriter(void *arg)
torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand);
cur_ops->cond_sync(gp_snap);
break;
+ case RTWS_COND_GET_EXP:
+ gp_snap = cur_ops->get_gp_state_exp();
+ torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand);
+ cur_ops->cond_sync_exp(gp_snap);
+ break;
+ case RTWS_COND_GET_FULL:
+ cur_ops->get_gp_state_full(&gp_snap_full);
+ torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand);
+ cur_ops->cond_sync_full(&gp_snap_full);
+ break;
+ case RTWS_COND_GET_EXP_FULL:
+ cur_ops->get_gp_state_full(&gp_snap_full);
+ torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand);
+ cur_ops->cond_sync_exp_full(&gp_snap_full);
+ break;
case RTWS_POLL_GET:
gp_snap = cur_ops->start_gp_poll();
while (!cur_ops->poll_gp_state(gp_snap)) {
@@ -1330,6 +1667,27 @@ rcu_torture_fakewriter(void *arg)
&rand);
}
break;
+ case RTWS_POLL_GET_FULL:
+ cur_ops->start_gp_poll_full(&gp_snap_full);
+ while (!cur_ops->poll_gp_state_full(&gp_snap_full)) {
+ torture_hrtimeout_jiffies(torture_random(&rand) % 16,
+ &rand);
+ }
+ break;
+ case RTWS_POLL_GET_EXP:
+ gp_snap = cur_ops->start_gp_poll_exp();
+ while (!cur_ops->poll_gp_state_exp(gp_snap)) {
+ torture_hrtimeout_jiffies(torture_random(&rand) % 16,
+ &rand);
+ }
+ break;
+ case RTWS_POLL_GET_EXP_FULL:
+ cur_ops->start_gp_poll_exp_full(&gp_snap_full);
+ while (!cur_ops->poll_gp_state_full(&gp_snap_full)) {
+ torture_hrtimeout_jiffies(torture_random(&rand) % 16,
+ &rand);
+ }
+ break;
case RTWS_SYNC:
cur_ops->sync();
break;
@@ -1420,46 +1778,64 @@ static void rcutorture_one_extend(int *readstate, int newstate,
struct rt_read_seg *rtrsp)
{
unsigned long flags;
- int idxnew = -1;
- int idxold = *readstate;
+ int idxnew1 = -1;
+ int idxnew2 = -1;
+ int idxold1 = *readstate;
+ int idxold2 = idxold1;
int statesnew = ~*readstate & newstate;
int statesold = *readstate & ~newstate;
- WARN_ON_ONCE(idxold < 0);
- WARN_ON_ONCE((idxold >> RCUTORTURE_RDR_SHIFT) > 1);
+ WARN_ON_ONCE(idxold2 < 0);
+ WARN_ON_ONCE((idxold2 >> RCUTORTURE_RDR_SHIFT_2) > 1);
rtrsp->rt_readstate = newstate;
/* First, put new protection in place to avoid critical-section gap. */
if (statesnew & RCUTORTURE_RDR_BH)
local_bh_disable();
+ if (statesnew & RCUTORTURE_RDR_RBH)
+ rcu_read_lock_bh();
if (statesnew & RCUTORTURE_RDR_IRQ)
local_irq_disable();
if (statesnew & RCUTORTURE_RDR_PREEMPT)
preempt_disable();
- if (statesnew & RCUTORTURE_RDR_RBH)
- rcu_read_lock_bh();
if (statesnew & RCUTORTURE_RDR_SCHED)
rcu_read_lock_sched();
- if (statesnew & RCUTORTURE_RDR_RCU)
- idxnew = cur_ops->readlock() << RCUTORTURE_RDR_SHIFT;
+ if (statesnew & RCUTORTURE_RDR_RCU_1)
+ idxnew1 = (cur_ops->readlock() & 0x1) << RCUTORTURE_RDR_SHIFT_1;
+ if (statesnew & RCUTORTURE_RDR_RCU_2)
+ idxnew2 = (cur_ops->readlock() & 0x1) << RCUTORTURE_RDR_SHIFT_2;
- /* Next, remove old protection, irq first due to bh conflict. */
+ /*
+ * Next, remove old protection, in decreasing order of strength
+ * to avoid unlock paths that aren't safe in the stronger
+ * context. Namely: BH can not be enabled with disabled interrupts.
+ * Additionally PREEMPT_RT requires that BH is enabled in preemptible
+ * context.
+ */
if (statesold & RCUTORTURE_RDR_IRQ)
local_irq_enable();
- if (statesold & RCUTORTURE_RDR_BH)
- local_bh_enable();
if (statesold & RCUTORTURE_RDR_PREEMPT)
preempt_enable();
- if (statesold & RCUTORTURE_RDR_RBH)
- rcu_read_unlock_bh();
if (statesold & RCUTORTURE_RDR_SCHED)
rcu_read_unlock_sched();
- if (statesold & RCUTORTURE_RDR_RCU) {
- bool lockit = !statesnew && !(torture_random(trsp) & 0xffff);
+ if (statesold & RCUTORTURE_RDR_BH)
+ local_bh_enable();
+ if (statesold & RCUTORTURE_RDR_RBH)
+ rcu_read_unlock_bh();
+ if (statesold & RCUTORTURE_RDR_RCU_2) {
+ cur_ops->readunlock((idxold2 >> RCUTORTURE_RDR_SHIFT_2) & 0x1);
+ WARN_ON_ONCE(idxnew2 != -1);
+ idxold2 = 0;
+ }
+ if (statesold & RCUTORTURE_RDR_RCU_1) {
+ bool lockit;
+ lockit = !cur_ops->no_pi_lock && !statesnew && !(torture_random(trsp) & 0xffff);
if (lockit)
raw_spin_lock_irqsave(&current->pi_lock, flags);
- cur_ops->readunlock(idxold >> RCUTORTURE_RDR_SHIFT);
+ cur_ops->readunlock((idxold1 >> RCUTORTURE_RDR_SHIFT_1) & 0x1);
+ WARN_ON_ONCE(idxnew1 != -1);
+ idxold1 = 0;
if (lockit)
raw_spin_unlock_irqrestore(&current->pi_lock, flags);
}
@@ -1469,13 +1845,19 @@ static void rcutorture_one_extend(int *readstate, int newstate,
cur_ops->read_delay(trsp, rtrsp);
/* Update the reader state. */
- if (idxnew == -1)
- idxnew = idxold & ~RCUTORTURE_RDR_MASK;
- WARN_ON_ONCE(idxnew < 0);
- WARN_ON_ONCE((idxnew >> RCUTORTURE_RDR_SHIFT) > 1);
- *readstate = idxnew | newstate;
- WARN_ON_ONCE((*readstate >> RCUTORTURE_RDR_SHIFT) < 0);
- WARN_ON_ONCE((*readstate >> RCUTORTURE_RDR_SHIFT) > 1);
+ if (idxnew1 == -1)
+ idxnew1 = idxold1 & RCUTORTURE_RDR_MASK_1;
+ WARN_ON_ONCE(idxnew1 < 0);
+ if (WARN_ON_ONCE((idxnew1 >> RCUTORTURE_RDR_SHIFT_1) > 1))
+ pr_info("Unexpected idxnew1 value of %#x\n", idxnew1);
+ if (idxnew2 == -1)
+ idxnew2 = idxold2 & RCUTORTURE_RDR_MASK_2;
+ WARN_ON_ONCE(idxnew2 < 0);
+ WARN_ON_ONCE((idxnew2 >> RCUTORTURE_RDR_SHIFT_2) > 1);
+ *readstate = idxnew1 | idxnew2 | newstate;
+ WARN_ON_ONCE(*readstate < 0);
+ if (WARN_ON_ONCE((*readstate >> RCUTORTURE_RDR_SHIFT_2) > 1))
+ pr_info("Unexpected idxnew2 value of %#x\n", idxnew2);
}
/* Return the biggest extendables mask given current RCU and boot parameters. */
@@ -1485,7 +1867,7 @@ static int rcutorture_extend_mask_max(void)
WARN_ON_ONCE(extendables & ~RCUTORTURE_MAX_EXTEND);
mask = extendables & RCUTORTURE_MAX_EXTEND & cur_ops->extendables;
- mask = mask | RCUTORTURE_RDR_RCU;
+ mask = mask | RCUTORTURE_RDR_RCU_1 | RCUTORTURE_RDR_RCU_2;
return mask;
}
@@ -1496,19 +1878,45 @@ rcutorture_extend_mask(int oldmask, struct torture_random_state *trsp)
int mask = rcutorture_extend_mask_max();
unsigned long randmask1 = torture_random(trsp) >> 8;
unsigned long randmask2 = randmask1 >> 3;
+ unsigned long preempts = RCUTORTURE_RDR_PREEMPT | RCUTORTURE_RDR_SCHED;
+ unsigned long preempts_irq = preempts | RCUTORTURE_RDR_IRQ;
+ unsigned long bhs = RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH;
- WARN_ON_ONCE(mask >> RCUTORTURE_RDR_SHIFT);
+ WARN_ON_ONCE(mask >> RCUTORTURE_RDR_SHIFT_1);
/* Mostly only one bit (need preemption!), sometimes lots of bits. */
if (!(randmask1 & 0x7))
mask = mask & randmask2;
else
mask = mask & (1 << (randmask2 % RCUTORTURE_RDR_NBITS));
- /* Can't enable bh w/irq disabled. */
- if ((mask & RCUTORTURE_RDR_IRQ) &&
- ((!(mask & RCUTORTURE_RDR_BH) && (oldmask & RCUTORTURE_RDR_BH)) ||
- (!(mask & RCUTORTURE_RDR_RBH) && (oldmask & RCUTORTURE_RDR_RBH))))
- mask |= RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH;
- return mask ?: RCUTORTURE_RDR_RCU;
+
+ // Can't have nested RCU reader without outer RCU reader.
+ if (!(mask & RCUTORTURE_RDR_RCU_1) && (mask & RCUTORTURE_RDR_RCU_2)) {
+ if (oldmask & RCUTORTURE_RDR_RCU_1)
+ mask &= ~RCUTORTURE_RDR_RCU_2;
+ else
+ mask |= RCUTORTURE_RDR_RCU_1;
+ }
+
+ /*
+ * Can't enable bh w/irq disabled.
+ */
+ if (mask & RCUTORTURE_RDR_IRQ)
+ mask |= oldmask & bhs;
+
+ /*
+ * Ideally these sequences would be detected in debug builds
+ * (regardless of RT), but until then don't stop testing
+ * them on non-RT.
+ */
+ if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
+ /* Can't modify BH in atomic context */
+ if (oldmask & preempts_irq)
+ mask &= ~bhs;
+ if ((oldmask | mask) & preempts_irq)
+ mask |= oldmask & bhs;
+ }
+
+ return mask ?: RCUTORTURE_RDR_RCU_1;
}
/*
@@ -1543,7 +1951,9 @@ rcutorture_loop_extend(int *readstate, struct torture_random_state *trsp,
*/
static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid)
{
+ bool checkpolling = !(torture_random(trsp) & 0xfff);
unsigned long cookie;
+ struct rcu_gp_oldstate cookie_full;
int i;
unsigned long started;
unsigned long completed;
@@ -1559,8 +1969,12 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid)
WARN_ON_ONCE(!rcu_is_watching());
newstate = rcutorture_extend_mask(readstate, trsp);
rcutorture_one_extend(&readstate, newstate, trsp, rtrsp++);
- if (cur_ops->get_gp_state && cur_ops->poll_gp_state)
- cookie = cur_ops->get_gp_state();
+ if (checkpolling) {
+ if (cur_ops->get_gp_state && cur_ops->poll_gp_state)
+ cookie = cur_ops->get_gp_state();
+ if (cur_ops->get_gp_state_full && cur_ops->poll_gp_state_full)
+ cur_ops->get_gp_state_full(&cookie_full);
+ }
started = cur_ops->get_gp_seq();
ts = rcu_trace_clock_local();
p = rcu_dereference_check(rcu_torture_current,
@@ -1594,15 +2008,24 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid)
}
__this_cpu_inc(rcu_torture_batch[completed]);
preempt_enable();
- if (cur_ops->get_gp_state && cur_ops->poll_gp_state)
- WARN_ONCE(cur_ops->poll_gp_state(cookie),
- "%s: Cookie check 2 failed %s(%d) %lu->%lu\n",
- __func__,
- rcu_torture_writer_state_getname(),
- rcu_torture_writer_state,
- cookie, cur_ops->get_gp_state());
+ if (checkpolling) {
+ if (cur_ops->get_gp_state && cur_ops->poll_gp_state)
+ WARN_ONCE(cur_ops->poll_gp_state(cookie),
+ "%s: Cookie check 2 failed %s(%d) %lu->%lu\n",
+ __func__,
+ rcu_torture_writer_state_getname(),
+ rcu_torture_writer_state,
+ cookie, cur_ops->get_gp_state());
+ if (cur_ops->get_gp_state_full && cur_ops->poll_gp_state_full)
+ WARN_ONCE(cur_ops->poll_gp_state_full(&cookie_full),
+ "%s: Cookie check 6 failed %s(%d) online %*pbl\n",
+ __func__,
+ rcu_torture_writer_state_getname(),
+ rcu_torture_writer_state,
+ cpumask_pr_args(cpu_online_mask));
+ }
rcutorture_one_extend(&readstate, 0, trsp, rtrsp);
- WARN_ON_ONCE(readstate & RCUTORTURE_RDR_MASK);
+ WARN_ON_ONCE(readstate);
// This next splat is expected behavior if leakpointer, especially
// for CONFIG_RCU_STRICT_GRACE_PERIOD=y kernels.
WARN_ON_ONCE(leakpointer && READ_ONCE(p->rtort_pipe_count) > 1);
@@ -1757,7 +2180,7 @@ rcu_torture_stats_print(void)
batchsummary[i] += READ_ONCE(per_cpu(rcu_torture_batch, cpu)[i]);
}
}
- for (i = RCU_TORTURE_PIPE_LEN - 1; i >= 0; i--) {
+ for (i = RCU_TORTURE_PIPE_LEN; i >= 0; i--) {
if (pipesummary[i] != 0)
break;
}
@@ -1772,12 +2195,11 @@ rcu_torture_stats_print(void)
atomic_read(&n_rcu_torture_alloc),
atomic_read(&n_rcu_torture_alloc_fail),
atomic_read(&n_rcu_torture_free));
- pr_cont("rtmbe: %d rtmbkf: %d/%d rtbe: %ld rtbke: %ld rtbre: %ld ",
+ pr_cont("rtmbe: %d rtmbkf: %d/%d rtbe: %ld rtbke: %ld ",
atomic_read(&n_rcu_torture_mberror),
atomic_read(&n_rcu_torture_mbchk_fail), atomic_read(&n_rcu_torture_mbchk_tries),
n_rcu_torture_barrier_error,
- n_rcu_torture_boost_ktrerror,
- n_rcu_torture_boost_rterror);
+ n_rcu_torture_boost_ktrerror);
pr_cont("rtbf: %ld rtb: %ld nt: %ld ",
n_rcu_torture_boost_failure,
n_rcu_torture_boosts,
@@ -1795,15 +2217,13 @@ rcu_torture_stats_print(void)
if (atomic_read(&n_rcu_torture_mberror) ||
atomic_read(&n_rcu_torture_mbchk_fail) ||
n_rcu_torture_barrier_error || n_rcu_torture_boost_ktrerror ||
- n_rcu_torture_boost_rterror || n_rcu_torture_boost_failure ||
- i > 1) {
+ n_rcu_torture_boost_failure || i > 1) {
pr_cont("%s", "!!! ");
atomic_inc(&n_rcu_torture_error);
WARN_ON_ONCE(atomic_read(&n_rcu_torture_mberror));
WARN_ON_ONCE(atomic_read(&n_rcu_torture_mbchk_fail));
WARN_ON_ONCE(n_rcu_torture_barrier_error); // rcu_barrier()
WARN_ON_ONCE(n_rcu_torture_boost_ktrerror); // no boost kthread
- WARN_ON_ONCE(n_rcu_torture_boost_rterror); // can't set RT prio
WARN_ON_ONCE(n_rcu_torture_boost_failure); // boost failed (TIMER_SOFTIRQ RT prio?)
WARN_ON_ONCE(i > 1); // Too-short grace period
}
@@ -1879,7 +2299,13 @@ static void rcu_torture_mem_dump_obj(void)
static int z;
kcp = kmem_cache_create("rcuscale", 136, 8, SLAB_STORE_USER, NULL);
+ if (WARN_ON_ONCE(!kcp))
+ return;
rhp = kmem_cache_alloc(kcp, GFP_KERNEL);
+ if (WARN_ON_ONCE(!rhp)) {
+ kmem_cache_destroy(kcp);
+ return;
+ }
pr_alert("mem_dump_obj() slab test: rcu_torture_stats = %px, &rhp = %px, rhp = %px, &z = %px\n", stats_task, &rhp, rhp, &z);
pr_alert("mem_dump_obj(ZERO_SIZE_PTR):");
mem_dump_obj(ZERO_SIZE_PTR);
@@ -1896,6 +2322,8 @@ static void rcu_torture_mem_dump_obj(void)
kmem_cache_free(kcp, rhp);
kmem_cache_destroy(kcp);
rhp = kmalloc(sizeof(*rhp), GFP_KERNEL);
+ if (WARN_ON_ONCE(!rhp))
+ return;
pr_alert("mem_dump_obj() kmalloc test: rcu_torture_stats = %px, &rhp = %px, rhp = %px\n", stats_task, &rhp, rhp);
pr_alert("mem_dump_obj(kmalloc %px):", rhp);
mem_dump_obj(rhp);
@@ -1903,6 +2331,8 @@ static void rcu_torture_mem_dump_obj(void)
mem_dump_obj(&rhp->func);
kfree(rhp);
rhp = vmalloc(4096);
+ if (WARN_ON_ONCE(!rhp))
+ return;
pr_alert("mem_dump_obj() vmalloc test: rcu_torture_stats = %px, &rhp = %px, rhp = %px\n", stats_task, &rhp, rhp);
pr_alert("mem_dump_obj(vmalloc %px):", rhp);
mem_dump_obj(rhp);
@@ -1926,7 +2356,8 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag)
"n_barrier_cbs=%d "
"onoff_interval=%d onoff_holdoff=%d "
"read_exit_delay=%d read_exit_burst=%d "
- "nocbs_nthreads=%d nocbs_toggle=%d\n",
+ "nocbs_nthreads=%d nocbs_toggle=%d "
+ "test_nmis=%d\n",
torture_type, tag, nrealreaders, nfakewriters,
stat_interval, verbose, test_no_idle_hz, shuffle_interval,
stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter,
@@ -1937,7 +2368,8 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag)
n_barrier_cbs,
onoff_interval, onoff_holdoff,
read_exit_delay, read_exit_burst,
- nocbs_nthreads, nocbs_toggle);
+ nocbs_nthreads, nocbs_toggle,
+ test_nmis);
}
static int rcutorture_booster_cleanup(unsigned int cpu)
@@ -1964,13 +2396,25 @@ static int rcutorture_booster_init(unsigned int cpu)
if (boost_tasks[cpu] != NULL)
return 0; /* Already created, nothing more to do. */
+ // Testing RCU priority boosting requires rcutorture do
+ // some serious abuse. Counter this by running ksoftirqd
+ // at higher priority.
+ if (IS_BUILTIN(CONFIG_RCU_TORTURE_TEST)) {
+ struct sched_param sp;
+ struct task_struct *t;
+
+ t = per_cpu(ksoftirqd, cpu);
+ WARN_ON_ONCE(!t);
+ sp.sched_priority = 2;
+ sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
+ }
+
/* Don't allow time recalculation while creating a new task. */
mutex_lock(&boost_mutex);
rcu_torture_disable_rt_throttle();
VERBOSE_TOROUT_STRING("Creating rcu_torture_boost task");
- boost_tasks[cpu] = kthread_create_on_node(rcu_torture_boost, NULL,
- cpu_to_node(cpu),
- "rcu_torture_boost");
+ boost_tasks[cpu] = kthread_run_on_cpu(rcu_torture_boost, NULL,
+ cpu, "rcu_torture_boost_%u");
if (IS_ERR(boost_tasks[cpu])) {
retval = PTR_ERR(boost_tasks[cpu]);
VERBOSE_TOROUT_STRING("rcu_torture_boost task create failed");
@@ -1979,8 +2423,6 @@ static int rcutorture_booster_init(unsigned int cpu)
mutex_unlock(&boost_mutex);
return retval;
}
- kthread_bind(boost_tasks[cpu], cpu);
- wake_up_process(boost_tasks[cpu]);
mutex_unlock(&boost_mutex);
return 0;
}
@@ -2022,8 +2464,15 @@ static int rcu_torture_stall(void *args)
__func__, raw_smp_processor_id());
while (ULONG_CMP_LT((unsigned long)ktime_get_seconds(),
stop_at))
- if (stall_cpu_block)
+ if (stall_cpu_block) {
+#ifdef CONFIG_PREEMPTION
+ preempt_schedule();
+#else
schedule_timeout_uninterruptible(HZ);
+#endif
+ } else if (stall_no_softlockup) {
+ touch_softlockup_watchdog();
+ }
if (stall_cpu_irqsoff)
local_irq_enable();
else if (!stall_cpu_block)
@@ -2094,10 +2543,13 @@ struct rcu_fwd {
unsigned long rcu_fwd_startat;
struct rcu_launder_hist n_launders_hist[N_LAUNDERS_HIST];
unsigned long rcu_launder_gp_seq_start;
+ int rcu_fwd_id;
};
static DEFINE_MUTEX(rcu_fwd_mutex);
static struct rcu_fwd *rcu_fwds;
+static unsigned long rcu_fwd_seq;
+static atomic_long_t rcu_fwd_max_cbs;
static bool rcu_fwd_emergency_stop;
static void rcu_torture_fwd_cb_hist(struct rcu_fwd *rfp)
@@ -2110,8 +2562,8 @@ static void rcu_torture_fwd_cb_hist(struct rcu_fwd *rfp)
for (i = ARRAY_SIZE(rfp->n_launders_hist) - 1; i > 0; i--)
if (rfp->n_launders_hist[i].n_launders > 0)
break;
- pr_alert("%s: Callback-invocation histogram (duration %lu jiffies):",
- __func__, jiffies - rfp->rcu_fwd_startat);
+ pr_alert("%s: Callback-invocation histogram %d (duration %lu jiffies):",
+ __func__, rfp->rcu_fwd_id, jiffies - rfp->rcu_fwd_startat);
gps_old = rfp->rcu_launder_gp_seq_start;
for (j = 0; j <= i; j++) {
gps = rfp->n_launders_hist[j].launder_gp_seq;
@@ -2209,6 +2661,7 @@ static void rcu_torture_fwd_prog_nr(struct rcu_fwd *rfp,
unsigned long stopat;
static DEFINE_TORTURE_RANDOM(trs);
+ pr_alert("%s: Starting forward-progress test %d\n", __func__, rfp->rcu_fwd_id);
if (!cur_ops->sync)
return; // Cannot do need_resched() forward progress testing without ->sync.
if (cur_ops->call && cur_ops->cb_barrier) {
@@ -2217,7 +2670,7 @@ static void rcu_torture_fwd_prog_nr(struct rcu_fwd *rfp,
}
/* Tight loop containing cond_resched(). */
- WRITE_ONCE(rcu_fwd_cb_nodelay, true);
+ atomic_inc(&rcu_fwd_cb_nodelay);
cur_ops->sync(); /* Later readers see above write. */
if (selfpropcb) {
WRITE_ONCE(fcs.stop, 0);
@@ -2247,11 +2700,13 @@ static void rcu_torture_fwd_prog_nr(struct rcu_fwd *rfp,
cver = READ_ONCE(rcu_torture_current_version) - cver;
gps = rcutorture_seq_diff(cur_ops->get_gp_seq(), gps);
WARN_ON(!cver && gps < 2);
- pr_alert("%s: Duration %ld cver %ld gps %ld\n", __func__, dur, cver, gps);
+ pr_alert("%s: %d Duration %ld cver %ld gps %ld\n", __func__,
+ rfp->rcu_fwd_id, dur, cver, gps);
}
if (selfpropcb) {
WRITE_ONCE(fcs.stop, 1);
cur_ops->sync(); /* Wait for running CB to complete. */
+ pr_alert("%s: Waiting for CBs: %pS() %d\n", __func__, cur_ops->cb_barrier, rfp->rcu_fwd_id);
cur_ops->cb_barrier(); /* Wait for queued callbacks. */
}
@@ -2260,7 +2715,7 @@ static void rcu_torture_fwd_prog_nr(struct rcu_fwd *rfp,
destroy_rcu_head_on_stack(&fcs.rh);
}
schedule_timeout_uninterruptible(HZ / 10); /* Let kthreads recover. */
- WRITE_ONCE(rcu_fwd_cb_nodelay, false);
+ atomic_dec(&rcu_fwd_cb_nodelay);
}
/* Carry out call_rcu() forward-progress testing. */
@@ -2280,13 +2735,14 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp)
unsigned long stopat;
unsigned long stoppedat;
+ pr_alert("%s: Starting forward-progress test %d\n", __func__, rfp->rcu_fwd_id);
if (READ_ONCE(rcu_fwd_emergency_stop))
return; /* Get out of the way quickly, no GP wait! */
if (!cur_ops->call)
return; /* Can't do call_rcu() fwd prog without ->call. */
/* Loop continuously posting RCU callbacks. */
- WRITE_ONCE(rcu_fwd_cb_nodelay, true);
+ atomic_inc(&rcu_fwd_cb_nodelay);
cur_ops->sync(); /* Later readers see above write. */
WRITE_ONCE(rfp->rcu_fwd_startat, jiffies);
stopat = rfp->rcu_fwd_startat + MAX_FWD_CB_JIFFIES;
@@ -2315,7 +2771,7 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp)
rfp->rcu_fwd_cb_head = rfcpn;
n_launders++;
n_launders_sa++;
- } else {
+ } else if (!cur_ops->cbflood_max || cur_ops->cbflood_max > n_max_cbs) {
rfcp = kmalloc(sizeof(*rfcp), GFP_KERNEL);
if (WARN_ON_ONCE(!rfcp)) {
schedule_timeout_interruptible(1);
@@ -2325,8 +2781,11 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp)
n_launders_sa = 0;
rfcp->rfc_gps = 0;
rfcp->rfc_rfp = rfp;
+ } else {
+ rfcp = NULL;
}
- cur_ops->call(&rfcp->rh, rcu_torture_fwd_cb_cr);
+ if (rfcp)
+ cur_ops->call(&rfcp->rh, rcu_torture_fwd_cb_cr);
rcu_torture_fwd_prog_cond_resched(n_launders + n_max_cbs);
if (tick_nohz_full_enabled()) {
local_irq_save(flags);
@@ -2338,6 +2797,7 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp)
n_launders_cb_snap = READ_ONCE(rfp->n_launders_cb);
cver = READ_ONCE(rcu_torture_current_version) - cver;
gps = rcutorture_seq_diff(cur_ops->get_gp_seq(), gps);
+ pr_alert("%s: Waiting for CBs: %pS() %d\n", __func__, cur_ops->cb_barrier, rfp->rcu_fwd_id);
cur_ops->cb_barrier(); /* Wait for callbacks to be invoked. */
(void)rcu_torture_fwd_prog_cbfree(rfp);
@@ -2350,11 +2810,14 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp)
n_launders + n_max_cbs - n_launders_cb_snap,
n_launders, n_launders_sa,
n_max_gps, n_max_cbs, cver, gps);
+ atomic_long_add(n_max_cbs, &rcu_fwd_max_cbs);
+ mutex_lock(&rcu_fwd_mutex); // Serialize histograms.
rcu_torture_fwd_cb_hist(rfp);
+ mutex_unlock(&rcu_fwd_mutex);
}
schedule_timeout_uninterruptible(HZ); /* Let CBs drain. */
tick_dep_clear_task(current, TICK_DEP_BIT_RCU);
- WRITE_ONCE(rcu_fwd_cb_nodelay, false);
+ atomic_dec(&rcu_fwd_cb_nodelay);
}
@@ -2365,6 +2828,8 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp)
static int rcutorture_oom_notify(struct notifier_block *self,
unsigned long notused, void *nfreed)
{
+ int i;
+ long ncbs;
struct rcu_fwd *rfp;
mutex_lock(&rcu_fwd_mutex);
@@ -2375,18 +2840,26 @@ static int rcutorture_oom_notify(struct notifier_block *self,
}
WARN(1, "%s invoked upon OOM during forward-progress testing.\n",
__func__);
- rcu_torture_fwd_cb_hist(rfp);
- rcu_fwd_progress_check(1 + (jiffies - READ_ONCE(rfp->rcu_fwd_startat)) / 2);
+ for (i = 0; i < fwd_progress; i++) {
+ rcu_torture_fwd_cb_hist(&rfp[i]);
+ rcu_fwd_progress_check(1 + (jiffies - READ_ONCE(rfp[i].rcu_fwd_startat)) / 2);
+ }
WRITE_ONCE(rcu_fwd_emergency_stop, true);
smp_mb(); /* Emergency stop before free and wait to avoid hangs. */
- pr_info("%s: Freed %lu RCU callbacks.\n",
- __func__, rcu_torture_fwd_prog_cbfree(rfp));
- rcu_barrier();
- pr_info("%s: Freed %lu RCU callbacks.\n",
- __func__, rcu_torture_fwd_prog_cbfree(rfp));
- rcu_barrier();
- pr_info("%s: Freed %lu RCU callbacks.\n",
- __func__, rcu_torture_fwd_prog_cbfree(rfp));
+ ncbs = 0;
+ for (i = 0; i < fwd_progress; i++)
+ ncbs += rcu_torture_fwd_prog_cbfree(&rfp[i]);
+ pr_info("%s: Freed %lu RCU callbacks.\n", __func__, ncbs);
+ cur_ops->cb_barrier();
+ ncbs = 0;
+ for (i = 0; i < fwd_progress; i++)
+ ncbs += rcu_torture_fwd_prog_cbfree(&rfp[i]);
+ pr_info("%s: Freed %lu RCU callbacks.\n", __func__, ncbs);
+ cur_ops->cb_barrier();
+ ncbs = 0;
+ for (i = 0; i < fwd_progress; i++)
+ ncbs += rcu_torture_fwd_prog_cbfree(&rfp[i]);
+ pr_info("%s: Freed %lu RCU callbacks.\n", __func__, ncbs);
smp_mb(); /* Frees before return to avoid redoing OOM. */
(*(unsigned long *)nfreed)++; /* Forward progress CBs freed! */
pr_info("%s returning after OOM processing.\n", __func__);
@@ -2401,7 +2874,10 @@ static struct notifier_block rcutorture_oom_nb = {
/* Carry out grace-period forward-progress testing. */
static int rcu_torture_fwd_prog(void *args)
{
+ bool firsttime = true;
+ long max_cbs;
int oldnice = task_nice(current);
+ unsigned long oldseq = READ_ONCE(rcu_fwd_seq);
struct rcu_fwd *rfp = args;
int tested = 0;
int tested_tries = 0;
@@ -2411,21 +2887,38 @@ static int rcu_torture_fwd_prog(void *args)
if (!IS_ENABLED(CONFIG_SMP) || !IS_ENABLED(CONFIG_RCU_BOOST))
set_user_nice(current, MAX_NICE);
do {
- schedule_timeout_interruptible(fwd_progress_holdoff * HZ);
- WRITE_ONCE(rcu_fwd_emergency_stop, false);
- if (!IS_ENABLED(CONFIG_TINY_RCU) ||
- rcu_inkernel_boot_has_ended())
- rcu_torture_fwd_prog_nr(rfp, &tested, &tested_tries);
- if (rcu_inkernel_boot_has_ended())
+ if (!rfp->rcu_fwd_id) {
+ schedule_timeout_interruptible(fwd_progress_holdoff * HZ);
+ WRITE_ONCE(rcu_fwd_emergency_stop, false);
+ if (!firsttime) {
+ max_cbs = atomic_long_xchg(&rcu_fwd_max_cbs, 0);
+ pr_alert("%s n_max_cbs: %ld\n", __func__, max_cbs);
+ }
+ firsttime = false;
+ WRITE_ONCE(rcu_fwd_seq, rcu_fwd_seq + 1);
+ } else {
+ while (READ_ONCE(rcu_fwd_seq) == oldseq && !torture_must_stop())
+ schedule_timeout_interruptible(1);
+ oldseq = READ_ONCE(rcu_fwd_seq);
+ }
+ pr_alert("%s: Starting forward-progress test %d\n", __func__, rfp->rcu_fwd_id);
+ if (rcu_inkernel_boot_has_ended() && torture_num_online_cpus() > rfp->rcu_fwd_id)
rcu_torture_fwd_prog_cr(rfp);
+ if ((cur_ops->stall_dur && cur_ops->stall_dur() > 0) &&
+ (!IS_ENABLED(CONFIG_TINY_RCU) ||
+ (rcu_inkernel_boot_has_ended() &&
+ torture_num_online_cpus() > rfp->rcu_fwd_id)))
+ rcu_torture_fwd_prog_nr(rfp, &tested, &tested_tries);
/* Avoid slow periods, better to test when busy. */
if (stutter_wait("rcu_torture_fwd_prog"))
sched_set_normal(current, oldnice);
} while (!torture_must_stop());
/* Short runs might not contain a valid forward-progress attempt. */
- WARN_ON(!tested && tested_tries >= 5);
- pr_alert("%s: tested %d tested_tries %d\n", __func__, tested, tested_tries);
+ if (!rfp->rcu_fwd_id) {
+ WARN_ON(!tested && tested_tries >= 5);
+ pr_alert("%s: tested %d tested_tries %d\n", __func__, tested, tested_tries);
+ }
torture_kthread_stopping("rcu_torture_fwd_prog");
return 0;
}
@@ -2433,18 +2926,29 @@ static int rcu_torture_fwd_prog(void *args)
/* If forward-progress checking is requested and feasible, spawn the thread. */
static int __init rcu_torture_fwd_prog_init(void)
{
+ int i;
+ int ret = 0;
struct rcu_fwd *rfp;
if (!fwd_progress)
return 0; /* Not requested, so don't do it. */
+ if (fwd_progress >= nr_cpu_ids) {
+ VERBOSE_TOROUT_STRING("rcu_torture_fwd_prog_init: Limiting fwd_progress to # CPUs.\n");
+ fwd_progress = nr_cpu_ids;
+ } else if (fwd_progress < 0) {
+ fwd_progress = nr_cpu_ids;
+ }
if ((!cur_ops->sync && !cur_ops->call) ||
- !cur_ops->stall_dur || cur_ops->stall_dur() <= 0 || cur_ops == &rcu_busted_ops) {
+ (!cur_ops->cbflood_max && (!cur_ops->stall_dur || cur_ops->stall_dur() <= 0)) ||
+ cur_ops == &rcu_busted_ops) {
VERBOSE_TOROUT_STRING("rcu_torture_fwd_prog_init: Disabled, unsupported by RCU flavor under test");
+ fwd_progress = 0;
return 0;
}
if (stall_cpu > 0) {
VERBOSE_TOROUT_STRING("rcu_torture_fwd_prog_init: Disabled, conflicts with CPU-stall testing");
- if (IS_MODULE(CONFIG_RCU_TORTURE_TESTS))
+ fwd_progress = 0;
+ if (IS_MODULE(CONFIG_RCU_TORTURE_TEST))
return -EINVAL; /* In module, can fail back to user. */
WARN_ON(1); /* Make sure rcutorture notices conflict. */
return 0;
@@ -2453,29 +2957,51 @@ static int __init rcu_torture_fwd_prog_init(void)
fwd_progress_holdoff = 1;
if (fwd_progress_div <= 0)
fwd_progress_div = 4;
- rfp = kzalloc(sizeof(*rfp), GFP_KERNEL);
- if (!rfp)
+ rfp = kcalloc(fwd_progress, sizeof(*rfp), GFP_KERNEL);
+ fwd_prog_tasks = kcalloc(fwd_progress, sizeof(*fwd_prog_tasks), GFP_KERNEL);
+ if (!rfp || !fwd_prog_tasks) {
+ kfree(rfp);
+ kfree(fwd_prog_tasks);
+ fwd_prog_tasks = NULL;
+ fwd_progress = 0;
return -ENOMEM;
- spin_lock_init(&rfp->rcu_fwd_lock);
- rfp->rcu_fwd_cb_tail = &rfp->rcu_fwd_cb_head;
+ }
+ for (i = 0; i < fwd_progress; i++) {
+ spin_lock_init(&rfp[i].rcu_fwd_lock);
+ rfp[i].rcu_fwd_cb_tail = &rfp[i].rcu_fwd_cb_head;
+ rfp[i].rcu_fwd_id = i;
+ }
mutex_lock(&rcu_fwd_mutex);
rcu_fwds = rfp;
mutex_unlock(&rcu_fwd_mutex);
register_oom_notifier(&rcutorture_oom_nb);
- return torture_create_kthread(rcu_torture_fwd_prog, rfp, fwd_prog_task);
+ for (i = 0; i < fwd_progress; i++) {
+ ret = torture_create_kthread(rcu_torture_fwd_prog, &rcu_fwds[i], fwd_prog_tasks[i]);
+ if (ret) {
+ fwd_progress = i;
+ return ret;
+ }
+ }
+ return 0;
}
static void rcu_torture_fwd_prog_cleanup(void)
{
+ int i;
struct rcu_fwd *rfp;
- torture_stop_kthread(rcu_torture_fwd_prog, fwd_prog_task);
- rfp = rcu_fwds;
+ if (!rcu_fwds || !fwd_prog_tasks)
+ return;
+ for (i = 0; i < fwd_progress; i++)
+ torture_stop_kthread(rcu_torture_fwd_prog, fwd_prog_tasks[i]);
+ unregister_oom_notifier(&rcutorture_oom_nb);
mutex_lock(&rcu_fwd_mutex);
+ rfp = rcu_fwds;
rcu_fwds = NULL;
mutex_unlock(&rcu_fwd_mutex);
- unregister_oom_notifier(&rcutorture_oom_nb);
kfree(rfp);
+ kfree(fwd_prog_tasks);
+ fwd_prog_tasks = NULL;
}
/* Callback function for RCU barrier testing. */
@@ -2681,7 +3207,6 @@ static int rcu_torture_read_exit_child(void *trsp_in)
// Parent kthread which creates and destroys read-exit child kthreads.
static int rcu_torture_read_exit(void *unused)
{
- int count = 0;
bool errexit = false;
int i;
struct task_struct *tsp;
@@ -2693,34 +3218,28 @@ static int rcu_torture_read_exit(void *unused)
// Each pass through this loop does one read-exit episode.
do {
- if (++count > read_exit_burst) {
- VERBOSE_TOROUT_STRING("rcu_torture_read_exit: End of episode");
- rcu_barrier(); // Wait for task_struct free, avoid OOM.
- for (i = 0; i < read_exit_delay; i++) {
- schedule_timeout_uninterruptible(HZ);
- if (READ_ONCE(read_exit_child_stop))
- break;
+ VERBOSE_TOROUT_STRING("rcu_torture_read_exit: Start of episode");
+ for (i = 0; i < read_exit_burst; i++) {
+ if (READ_ONCE(read_exit_child_stop))
+ break;
+ stutter_wait("rcu_torture_read_exit");
+ // Spawn child.
+ tsp = kthread_run(rcu_torture_read_exit_child,
+ &trs, "%s", "rcu_torture_read_exit_child");
+ if (IS_ERR(tsp)) {
+ TOROUT_ERRSTRING("out of memory");
+ errexit = true;
+ break;
}
- if (!READ_ONCE(read_exit_child_stop))
- VERBOSE_TOROUT_STRING("rcu_torture_read_exit: Start of episode");
- count = 0;
- }
- if (READ_ONCE(read_exit_child_stop))
- break;
- // Spawn child.
- tsp = kthread_run(rcu_torture_read_exit_child,
- &trs, "%s",
- "rcu_torture_read_exit_child");
- if (IS_ERR(tsp)) {
- VERBOSE_TOROUT_ERRSTRING("out of memory");
- errexit = true;
- tsp = NULL;
- break;
+ cond_resched();
+ kthread_stop(tsp);
+ n_read_exits++;
}
- cond_resched();
- kthread_stop(tsp);
- n_read_exits ++;
- stutter_wait("rcu_torture_read_exit");
+ VERBOSE_TOROUT_STRING("rcu_torture_read_exit: End of episode");
+ rcu_barrier(); // Wait for task_struct free, avoid OOM.
+ i = 0;
+ for (; !errexit && !READ_ONCE(read_exit_child_stop) && i < read_exit_delay; i++)
+ schedule_timeout_uninterruptible(HZ);
} while (!errexit && !READ_ONCE(read_exit_child_stop));
// Clean up and exit.
@@ -2736,7 +3255,7 @@ static int rcu_torture_read_exit(void *unused)
static int rcu_torture_read_exit_init(void)
{
if (read_exit_burst <= 0)
- return -EINVAL;
+ return 0;
init_waitqueue_head(&read_exit_wq);
read_exit_child_stop = false;
read_exit_child_stopped = false;
@@ -2754,6 +3273,29 @@ static void rcu_torture_read_exit_cleanup(void)
torture_stop_kthread(rcutorture_read_exit, read_exit_task);
}
+static void rcutorture_test_nmis(int n)
+{
+#if IS_BUILTIN(CONFIG_RCU_TORTURE_TEST)
+ int cpu;
+ int dumpcpu;
+ int i;
+
+ for (i = 0; i < n; i++) {
+ preempt_disable();
+ cpu = smp_processor_id();
+ dumpcpu = cpu + 1;
+ if (dumpcpu >= nr_cpu_ids)
+ dumpcpu = 0;
+ pr_alert("%s: CPU %d invoking dump_cpu_task(%d)\n", __func__, cpu, dumpcpu);
+ dump_cpu_task(dumpcpu);
+ preempt_enable();
+ schedule_timeout_uninterruptible(15 * HZ);
+ }
+#else // #if IS_BUILTIN(CONFIG_RCU_TORTURE_TEST)
+ WARN_ONCE(n, "Non-zero rcutorture.test_nmis=%d permitted only when rcutorture is built in.\n", test_nmis);
+#endif // #else // #if IS_BUILTIN(CONFIG_RCU_TORTURE_TEST)
+}
+
static enum cpuhp_state rcutor_hp;
static void
@@ -2765,15 +3307,21 @@ rcu_torture_cleanup(void)
int i;
if (torture_cleanup_begin()) {
- if (cur_ops->cb_barrier != NULL)
+ if (cur_ops->cb_barrier != NULL) {
+ pr_info("%s: Invoking %pS().\n", __func__, cur_ops->cb_barrier);
cur_ops->cb_barrier();
+ }
+ rcu_gp_slow_unregister(NULL);
return;
}
if (!cur_ops) {
torture_cleanup_end();
+ rcu_gp_slow_unregister(NULL);
return;
}
+ rcutorture_test_nmis(test_nmis);
+
if (cur_ops->gp_kthread_dbg)
cur_ops->gp_kthread_dbg();
rcu_torture_read_exit_cleanup();
@@ -2814,15 +3362,17 @@ rcu_torture_cleanup(void)
rcutorture_seq_diff(gp_seq, start_gp_seq));
torture_stop_kthread(rcu_torture_stats, stats_task);
torture_stop_kthread(rcu_torture_fqs, fqs_task);
- if (rcu_torture_can_boost())
+ if (rcu_torture_can_boost() && rcutor_hp >= 0)
cpuhp_remove_state(rcutor_hp);
/*
* Wait for all RCU callbacks to fire, then do torture-type-specific
* cleanup operations.
*/
- if (cur_ops->cb_barrier != NULL)
+ if (cur_ops->cb_barrier != NULL) {
+ pr_info("%s: Invoking %pS().\n", __func__, cur_ops->cb_barrier);
cur_ops->cb_barrier();
+ }
if (cur_ops->cleanup != NULL)
cur_ops->cleanup();
@@ -2865,6 +3415,7 @@ rcu_torture_cleanup(void)
else
rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS");
torture_cleanup_end();
+ rcu_gp_slow_unregister(&rcu_fwd_cb_nodelay);
}
#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
@@ -2906,13 +3457,13 @@ static void rcu_test_debug_objects(void)
/* Try to queue the rh2 pair of callbacks for the same grace period. */
preempt_disable(); /* Prevent preemption from interrupting test. */
rcu_read_lock(); /* Make it impossible to finish a grace period. */
- call_rcu(&rh1, rcu_torture_leak_cb); /* Start grace period. */
+ call_rcu_hurry(&rh1, rcu_torture_leak_cb); /* Start grace period. */
local_irq_disable(); /* Make it harder to start a new grace period. */
- call_rcu(&rh2, rcu_torture_leak_cb);
- call_rcu(&rh2, rcu_torture_err_cb); /* Duplicate callback. */
+ call_rcu_hurry(&rh2, rcu_torture_leak_cb);
+ call_rcu_hurry(&rh2, rcu_torture_err_cb); /* Duplicate callback. */
if (rhp) {
- call_rcu(rhp, rcu_torture_leak_cb);
- call_rcu(rhp, rcu_torture_err_cb); /* Another duplicate callback. */
+ call_rcu_hurry(rhp, rcu_torture_leak_cb);
+ call_rcu_hurry(rhp, rcu_torture_err_cb); /* Another duplicate callback. */
}
local_irq_enable();
rcu_read_unlock();
@@ -2923,6 +3474,7 @@ static void rcu_test_debug_objects(void)
pr_alert("%s: WARN: Duplicate call_rcu() test complete.\n", KBUILD_MODNAME);
destroy_rcu_head_on_stack(&rh1);
destroy_rcu_head_on_stack(&rh2);
+ kfree(rhp);
#else /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
pr_alert("%s: !CONFIG_DEBUG_OBJECTS_RCU_HEAD, not testing duplicate call_rcu()\n", KBUILD_MODNAME);
#endif /* #else #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
@@ -2936,6 +3488,188 @@ static void rcutorture_sync(void)
cur_ops->sync();
}
+static DEFINE_MUTEX(mut0);
+static DEFINE_MUTEX(mut1);
+static DEFINE_MUTEX(mut2);
+static DEFINE_MUTEX(mut3);
+static DEFINE_MUTEX(mut4);
+static DEFINE_MUTEX(mut5);
+static DEFINE_MUTEX(mut6);
+static DEFINE_MUTEX(mut7);
+static DEFINE_MUTEX(mut8);
+static DEFINE_MUTEX(mut9);
+
+static DECLARE_RWSEM(rwsem0);
+static DECLARE_RWSEM(rwsem1);
+static DECLARE_RWSEM(rwsem2);
+static DECLARE_RWSEM(rwsem3);
+static DECLARE_RWSEM(rwsem4);
+static DECLARE_RWSEM(rwsem5);
+static DECLARE_RWSEM(rwsem6);
+static DECLARE_RWSEM(rwsem7);
+static DECLARE_RWSEM(rwsem8);
+static DECLARE_RWSEM(rwsem9);
+
+DEFINE_STATIC_SRCU(srcu0);
+DEFINE_STATIC_SRCU(srcu1);
+DEFINE_STATIC_SRCU(srcu2);
+DEFINE_STATIC_SRCU(srcu3);
+DEFINE_STATIC_SRCU(srcu4);
+DEFINE_STATIC_SRCU(srcu5);
+DEFINE_STATIC_SRCU(srcu6);
+DEFINE_STATIC_SRCU(srcu7);
+DEFINE_STATIC_SRCU(srcu8);
+DEFINE_STATIC_SRCU(srcu9);
+
+static int srcu_lockdep_next(const char *f, const char *fl, const char *fs, const char *fu, int i,
+ int cyclelen, int deadlock)
+{
+ int j = i + 1;
+
+ if (j >= cyclelen)
+ j = deadlock ? 0 : -1;
+ if (j >= 0)
+ pr_info("%s: %s(%d), %s(%d), %s(%d)\n", f, fl, i, fs, j, fu, i);
+ else
+ pr_info("%s: %s(%d), %s(%d)\n", f, fl, i, fu, i);
+ return j;
+}
+
+// Test lockdep on SRCU-based deadlock scenarios.
+static void rcu_torture_init_srcu_lockdep(void)
+{
+ int cyclelen;
+ int deadlock;
+ bool err = false;
+ int i;
+ int j;
+ int idx;
+ struct mutex *muts[] = { &mut0, &mut1, &mut2, &mut3, &mut4,
+ &mut5, &mut6, &mut7, &mut8, &mut9 };
+ struct rw_semaphore *rwsems[] = { &rwsem0, &rwsem1, &rwsem2, &rwsem3, &rwsem4,
+ &rwsem5, &rwsem6, &rwsem7, &rwsem8, &rwsem9 };
+ struct srcu_struct *srcus[] = { &srcu0, &srcu1, &srcu2, &srcu3, &srcu4,
+ &srcu5, &srcu6, &srcu7, &srcu8, &srcu9 };
+ int testtype;
+
+ if (!test_srcu_lockdep)
+ return;
+
+ deadlock = test_srcu_lockdep / 1000;
+ testtype = (test_srcu_lockdep / 10) % 100;
+ cyclelen = test_srcu_lockdep % 10;
+ WARN_ON_ONCE(ARRAY_SIZE(muts) != ARRAY_SIZE(srcus));
+ if (WARN_ONCE(deadlock != !!deadlock,
+ "%s: test_srcu_lockdep=%d and deadlock digit %d must be zero or one.\n",
+ __func__, test_srcu_lockdep, deadlock))
+ err = true;
+ if (WARN_ONCE(cyclelen <= 0,
+ "%s: test_srcu_lockdep=%d and cycle-length digit %d must be greater than zero.\n",
+ __func__, test_srcu_lockdep, cyclelen))
+ err = true;
+ if (err)
+ goto err_out;
+
+ if (testtype == 0) {
+ pr_info("%s: test_srcu_lockdep = %05d: SRCU %d-way %sdeadlock.\n",
+ __func__, test_srcu_lockdep, cyclelen, deadlock ? "" : "non-");
+ if (deadlock && cyclelen == 1)
+ pr_info("%s: Expect hang.\n", __func__);
+ for (i = 0; i < cyclelen; i++) {
+ j = srcu_lockdep_next(__func__, "srcu_read_lock", "synchronize_srcu",
+ "srcu_read_unlock", i, cyclelen, deadlock);
+ idx = srcu_read_lock(srcus[i]);
+ if (j >= 0)
+ synchronize_srcu(srcus[j]);
+ srcu_read_unlock(srcus[i], idx);
+ }
+ return;
+ }
+
+ if (testtype == 1) {
+ pr_info("%s: test_srcu_lockdep = %05d: SRCU/mutex %d-way %sdeadlock.\n",
+ __func__, test_srcu_lockdep, cyclelen, deadlock ? "" : "non-");
+ for (i = 0; i < cyclelen; i++) {
+ pr_info("%s: srcu_read_lock(%d), mutex_lock(%d), mutex_unlock(%d), srcu_read_unlock(%d)\n",
+ __func__, i, i, i, i);
+ idx = srcu_read_lock(srcus[i]);
+ mutex_lock(muts[i]);
+ mutex_unlock(muts[i]);
+ srcu_read_unlock(srcus[i], idx);
+
+ j = srcu_lockdep_next(__func__, "mutex_lock", "synchronize_srcu",
+ "mutex_unlock", i, cyclelen, deadlock);
+ mutex_lock(muts[i]);
+ if (j >= 0)
+ synchronize_srcu(srcus[j]);
+ mutex_unlock(muts[i]);
+ }
+ return;
+ }
+
+ if (testtype == 2) {
+ pr_info("%s: test_srcu_lockdep = %05d: SRCU/rwsem %d-way %sdeadlock.\n",
+ __func__, test_srcu_lockdep, cyclelen, deadlock ? "" : "non-");
+ for (i = 0; i < cyclelen; i++) {
+ pr_info("%s: srcu_read_lock(%d), down_read(%d), up_read(%d), srcu_read_unlock(%d)\n",
+ __func__, i, i, i, i);
+ idx = srcu_read_lock(srcus[i]);
+ down_read(rwsems[i]);
+ up_read(rwsems[i]);
+ srcu_read_unlock(srcus[i], idx);
+
+ j = srcu_lockdep_next(__func__, "down_write", "synchronize_srcu",
+ "up_write", i, cyclelen, deadlock);
+ down_write(rwsems[i]);
+ if (j >= 0)
+ synchronize_srcu(srcus[j]);
+ up_write(rwsems[i]);
+ }
+ return;
+ }
+
+#ifdef CONFIG_TASKS_TRACE_RCU
+ if (testtype == 3) {
+ pr_info("%s: test_srcu_lockdep = %05d: SRCU and Tasks Trace RCU %d-way %sdeadlock.\n",
+ __func__, test_srcu_lockdep, cyclelen, deadlock ? "" : "non-");
+ if (deadlock && cyclelen == 1)
+ pr_info("%s: Expect hang.\n", __func__);
+ for (i = 0; i < cyclelen; i++) {
+ char *fl = i == 0 ? "rcu_read_lock_trace" : "srcu_read_lock";
+ char *fs = i == cyclelen - 1 ? "synchronize_rcu_tasks_trace"
+ : "synchronize_srcu";
+ char *fu = i == 0 ? "rcu_read_unlock_trace" : "srcu_read_unlock";
+
+ j = srcu_lockdep_next(__func__, fl, fs, fu, i, cyclelen, deadlock);
+ if (i == 0)
+ rcu_read_lock_trace();
+ else
+ idx = srcu_read_lock(srcus[i]);
+ if (j >= 0) {
+ if (i == cyclelen - 1)
+ synchronize_rcu_tasks_trace();
+ else
+ synchronize_srcu(srcus[j]);
+ }
+ if (i == 0)
+ rcu_read_unlock_trace();
+ else
+ srcu_read_unlock(srcus[i], idx);
+ }
+ return;
+ }
+#endif // #ifdef CONFIG_TASKS_TRACE_RCU
+
+err_out:
+ pr_info("%s: test_srcu_lockdep = %05d does nothing.\n", __func__, test_srcu_lockdep);
+ pr_info("%s: test_srcu_lockdep = DNNL.\n", __func__);
+ pr_info("%s: D: Deadlock if nonzero.\n", __func__);
+ pr_info("%s: NN: Test number, 0=SRCU, 1=SRCU/mutex, 2=SRCU/rwsem, 3=SRCU/Tasks Trace RCU.\n", __func__);
+ pr_info("%s: L: Cycle length.\n", __func__);
+ if (!IS_ENABLED(CONFIG_TASKS_TRACE_RCU))
+ pr_info("%s: NN=3 disallowed because kernel is built with CONFIG_TASKS_TRACE_RCU=n\n", __func__);
+}
+
static int __init
rcu_torture_init(void)
{
@@ -2945,9 +3679,9 @@ rcu_torture_init(void)
int flags = 0;
unsigned long gp_seq = 0;
static struct rcu_torture_ops *torture_ops[] = {
- &rcu_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops,
- &busted_srcud_ops, &tasks_ops, &tasks_rude_ops,
- &tasks_tracing_ops, &trivial_ops,
+ &rcu_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops, &busted_srcud_ops,
+ TASKS_OPS TASKS_RUDE_OPS TASKS_TRACING_OPS
+ &trivial_ops,
};
if (!torture_init_begin(torture_type, verbose))
@@ -2974,9 +3708,17 @@ rcu_torture_init(void)
pr_alert("rcu-torture: ->fqs NULL and non-zero fqs_duration, fqs disabled.\n");
fqs_duration = 0;
}
+ if (nocbs_nthreads != 0 && (cur_ops != &rcu_ops ||
+ !IS_ENABLED(CONFIG_RCU_NOCB_CPU))) {
+ pr_alert("rcu-torture types: %s and CONFIG_RCU_NOCB_CPU=%d, nocb toggle disabled.\n",
+ cur_ops->name, IS_ENABLED(CONFIG_RCU_NOCB_CPU));
+ nocbs_nthreads = 0;
+ }
if (cur_ops->init)
cur_ops->init();
+ rcu_torture_init_srcu_lockdep();
+
if (nreaders >= 0) {
nrealreaders = nreaders;
} else {
@@ -3013,7 +3755,6 @@ rcu_torture_init(void)
atomic_set(&n_rcu_torture_error, 0);
n_rcu_torture_barrier_error = 0;
n_rcu_torture_boost_ktrerror = 0;
- n_rcu_torture_boost_rterror = 0;
n_rcu_torture_boost_failure = 0;
n_rcu_torture_boosts = 0;
for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
@@ -3032,14 +3773,14 @@ rcu_torture_init(void)
rcu_torture_write_types();
firsterr = torture_create_kthread(rcu_torture_writer, NULL,
writer_task);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
if (nfakewriters > 0) {
fakewriter_tasks = kcalloc(nfakewriters,
sizeof(fakewriter_tasks[0]),
GFP_KERNEL);
if (fakewriter_tasks == NULL) {
- VERBOSE_TOROUT_ERRSTRING("out of memory");
+ TOROUT_ERRSTRING("out of memory");
firsterr = -ENOMEM;
goto unwind;
}
@@ -3047,7 +3788,7 @@ rcu_torture_init(void)
for (i = 0; i < nfakewriters; i++) {
firsterr = torture_create_kthread(rcu_torture_fakewriter,
NULL, fakewriter_tasks[i]);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
}
reader_tasks = kcalloc(nrealreaders, sizeof(reader_tasks[0]),
@@ -3055,7 +3796,7 @@ rcu_torture_init(void)
rcu_torture_reader_mbchk = kcalloc(nrealreaders, sizeof(*rcu_torture_reader_mbchk),
GFP_KERNEL);
if (!reader_tasks || !rcu_torture_reader_mbchk) {
- VERBOSE_TOROUT_ERRSTRING("out of memory");
+ TOROUT_ERRSTRING("out of memory");
firsterr = -ENOMEM;
goto unwind;
}
@@ -3063,7 +3804,7 @@ rcu_torture_init(void)
rcu_torture_reader_mbchk[i].rtc_chkrdr = -1;
firsterr = torture_create_kthread(rcu_torture_reader, (void *)i,
reader_tasks[i]);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
}
nrealnocbers = nocbs_nthreads;
@@ -3074,7 +3815,7 @@ rcu_torture_init(void)
if (nrealnocbers > 0) {
nocb_tasks = kcalloc(nrealnocbers, sizeof(nocb_tasks[0]), GFP_KERNEL);
if (nocb_tasks == NULL) {
- VERBOSE_TOROUT_ERRSTRING("out of memory");
+ TOROUT_ERRSTRING("out of memory");
firsterr = -ENOMEM;
goto unwind;
}
@@ -3083,18 +3824,18 @@ rcu_torture_init(void)
}
for (i = 0; i < nrealnocbers; i++) {
firsterr = torture_create_kthread(rcu_nocb_toggle, NULL, nocb_tasks[i]);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
}
if (stat_interval > 0) {
firsterr = torture_create_kthread(rcu_torture_stats, NULL,
stats_task);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
}
if (test_no_idle_hz && shuffle_interval > 0) {
firsterr = torture_shuffle_init(shuffle_interval * HZ);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
}
if (stutter < 0)
@@ -3104,7 +3845,7 @@ rcu_torture_init(void)
t = cur_ops->stall_dur ? cur_ops->stall_dur() : stutter * HZ;
firsterr = torture_stutter_init(stutter * HZ, t);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
}
if (fqs_duration < 0)
@@ -3113,7 +3854,7 @@ rcu_torture_init(void)
/* Create the fqs thread */
firsterr = torture_create_kthread(rcu_torture_fqs, NULL,
fqs_task);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
}
if (test_boost_interval < 1)
@@ -3127,48 +3868,34 @@ rcu_torture_init(void)
firsterr = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "RCU_TORTURE",
rcutorture_booster_init,
rcutorture_booster_cleanup);
- if (firsterr < 0)
- goto unwind;
rcutor_hp = firsterr;
-
- // Testing RCU priority boosting requires rcutorture do
- // some serious abuse. Counter this by running ksoftirqd
- // at higher priority.
- if (IS_BUILTIN(CONFIG_RCU_TORTURE_TEST)) {
- for_each_online_cpu(cpu) {
- struct sched_param sp;
- struct task_struct *t;
-
- t = per_cpu(ksoftirqd, cpu);
- WARN_ON_ONCE(!t);
- sp.sched_priority = 2;
- sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
- }
- }
+ if (torture_init_error(firsterr))
+ goto unwind;
}
shutdown_jiffies = jiffies + shutdown_secs * HZ;
firsterr = torture_shutdown_init(shutdown_secs, rcu_torture_cleanup);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
firsterr = torture_onoff_init(onoff_holdoff * HZ, onoff_interval,
rcutorture_sync);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
firsterr = rcu_torture_stall_init();
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
firsterr = rcu_torture_fwd_prog_init();
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
firsterr = rcu_torture_barrier_init();
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
firsterr = rcu_torture_read_exit_init();
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
if (object_debug)
rcu_test_debug_objects();
torture_init_end();
+ rcu_gp_slow_register(&rcu_fwd_cb_nodelay);
return 0;
unwind:
diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c
index d998a76fb542..1970ce5f22d4 100644
--- a/kernel/rcu/refscale.c
+++ b/kernel/rcu/refscale.c
@@ -44,7 +44,10 @@
pr_alert("%s" SCALE_FLAG s, scale_type, ## x)
#define VERBOSE_SCALEOUT(s, x...) \
- do { if (verbose) pr_alert("%s" SCALE_FLAG s, scale_type, ## x); } while (0)
+ do { \
+ if (verbose) \
+ pr_alert("%s" SCALE_FLAG s "\n", scale_type, ## x); \
+ } while (0)
static atomic_t verbose_batch_ctr;
@@ -54,12 +57,11 @@ do { \
(verbose_batched <= 0 || \
!(atomic_inc_return(&verbose_batch_ctr) % verbose_batched))) { \
schedule_timeout_uninterruptible(1); \
- pr_alert("%s" SCALE_FLAG s, scale_type, ## x); \
+ pr_alert("%s" SCALE_FLAG s "\n", scale_type, ## x); \
} \
} while (0)
-#define VERBOSE_SCALEOUT_ERRSTRING(s, x...) \
- do { if (verbose) pr_alert("%s" SCALE_FLAG "!!! " s, scale_type, ## x); } while (0)
+#define SCALEOUT_ERRSTRING(s, x...) pr_alert("%s" SCALE_FLAG "!!! " s "\n", scale_type, ## x)
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Joel Fernandes (Google) <joel@joelfernandes.org>");
@@ -74,6 +76,8 @@ torture_param(int, verbose_batched, 0, "Batch verbose debugging printk()s");
// Wait until there are multiple CPUs before starting test.
torture_param(int, holdoff, IS_BUILTIN(CONFIG_RCU_REF_SCALE_TEST) ? 10 : 0,
"Holdoff time before test start (s)");
+// Number of typesafe_lookup structures, that is, the degree of concurrency.
+torture_param(long, lookup_instances, 0, "Number of typesafe_lookup structures.");
// Number of loops per experiment, all readers execute operations concurrently.
torture_param(long, loops, 10000, "Number of loops per experiment.");
// Number of readers, with -1 defaulting to about 75% of the CPUs.
@@ -122,7 +126,7 @@ static int exp_idx;
// Operations vector for selecting different types of tests.
struct ref_scale_ops {
- void (*init)(void);
+ bool (*init)(void);
void (*cleanup)(void);
void (*readsection)(const int nloops);
void (*delaysection)(const int nloops, const int udl, const int ndl);
@@ -160,8 +164,9 @@ static void ref_rcu_delay_section(const int nloops, const int udl, const int ndl
}
}
-static void rcu_sync_scale_init(void)
+static bool rcu_sync_scale_init(void)
{
+ return true;
}
static struct ref_scale_ops rcu_ops = {
@@ -205,6 +210,8 @@ static struct ref_scale_ops srcu_ops = {
.name = "srcu"
};
+#ifdef CONFIG_TASKS_RCU
+
// Definitions for RCU Tasks ref scale testing: Empty read markers.
// These definitions also work for RCU Rude readers.
static void rcu_tasks_ref_scale_read_section(const int nloops)
@@ -230,6 +237,16 @@ static struct ref_scale_ops rcu_tasks_ops = {
.name = "rcu-tasks"
};
+#define RCU_TASKS_OPS &rcu_tasks_ops,
+
+#else // #ifdef CONFIG_TASKS_RCU
+
+#define RCU_TASKS_OPS
+
+#endif // #else // #ifdef CONFIG_TASKS_RCU
+
+#ifdef CONFIG_TASKS_TRACE_RCU
+
// Definitions for RCU Tasks Trace ref scale testing.
static void rcu_trace_ref_scale_read_section(const int nloops)
{
@@ -259,6 +276,14 @@ static struct ref_scale_ops rcu_trace_ops = {
.name = "rcu-trace"
};
+#define RCU_TRACE_OPS &rcu_trace_ops,
+
+#else // #ifdef CONFIG_TASKS_TRACE_RCU
+
+#define RCU_TRACE_OPS
+
+#endif // #else // #ifdef CONFIG_TASKS_TRACE_RCU
+
// Definitions for reference count
static atomic_t refcnt;
@@ -293,9 +318,10 @@ static struct ref_scale_ops refcnt_ops = {
// Definitions for rwlock
static rwlock_t test_rwlock;
-static void ref_rwlock_init(void)
+static bool ref_rwlock_init(void)
{
rwlock_init(&test_rwlock);
+ return true;
}
static void ref_rwlock_section(const int nloops)
@@ -329,9 +355,10 @@ static struct ref_scale_ops rwlock_ops = {
// Definitions for rwsem
static struct rw_semaphore test_rwsem;
-static void ref_rwsem_init(void)
+static bool ref_rwsem_init(void)
{
init_rwsem(&test_rwsem);
+ return true;
}
static void ref_rwsem_section(const int nloops)
@@ -363,7 +390,7 @@ static struct ref_scale_ops rwsem_ops = {
};
// Definitions for global spinlock
-static DEFINE_SPINLOCK(test_lock);
+static DEFINE_RAW_SPINLOCK(test_lock);
static void ref_lock_section(const int nloops)
{
@@ -371,8 +398,8 @@ static void ref_lock_section(const int nloops)
preempt_disable();
for (i = nloops; i >= 0; i--) {
- spin_lock(&test_lock);
- spin_unlock(&test_lock);
+ raw_spin_lock(&test_lock);
+ raw_spin_unlock(&test_lock);
}
preempt_enable();
}
@@ -383,9 +410,9 @@ static void ref_lock_delay_section(const int nloops, const int udl, const int nd
preempt_disable();
for (i = nloops; i >= 0; i--) {
- spin_lock(&test_lock);
+ raw_spin_lock(&test_lock);
un_delay(udl, ndl);
- spin_unlock(&test_lock);
+ raw_spin_unlock(&test_lock);
}
preempt_enable();
}
@@ -405,8 +432,8 @@ static void ref_lock_irq_section(const int nloops)
preempt_disable();
for (i = nloops; i >= 0; i--) {
- spin_lock_irqsave(&test_lock, flags);
- spin_unlock_irqrestore(&test_lock, flags);
+ raw_spin_lock_irqsave(&test_lock, flags);
+ raw_spin_unlock_irqrestore(&test_lock, flags);
}
preempt_enable();
}
@@ -418,9 +445,9 @@ static void ref_lock_irq_delay_section(const int nloops, const int udl, const in
preempt_disable();
for (i = nloops; i >= 0; i--) {
- spin_lock_irqsave(&test_lock, flags);
+ raw_spin_lock_irqsave(&test_lock, flags);
un_delay(udl, ndl);
- spin_unlock_irqrestore(&test_lock, flags);
+ raw_spin_unlock_irqrestore(&test_lock, flags);
}
preempt_enable();
}
@@ -467,6 +494,271 @@ static struct ref_scale_ops acqrel_ops = {
.name = "acqrel"
};
+static volatile u64 stopopts;
+
+static void ref_clock_section(const int nloops)
+{
+ u64 x = 0;
+ int i;
+
+ preempt_disable();
+ for (i = nloops; i >= 0; i--)
+ x += ktime_get_real_fast_ns();
+ preempt_enable();
+ stopopts = x;
+}
+
+static void ref_clock_delay_section(const int nloops, const int udl, const int ndl)
+{
+ u64 x = 0;
+ int i;
+
+ preempt_disable();
+ for (i = nloops; i >= 0; i--) {
+ x += ktime_get_real_fast_ns();
+ un_delay(udl, ndl);
+ }
+ preempt_enable();
+ stopopts = x;
+}
+
+static struct ref_scale_ops clock_ops = {
+ .readsection = ref_clock_section,
+ .delaysection = ref_clock_delay_section,
+ .name = "clock"
+};
+
+////////////////////////////////////////////////////////////////////////
+//
+// Methods leveraging SLAB_TYPESAFE_BY_RCU.
+//
+
+// Item to look up in a typesafe manner. Array of pointers to these.
+struct refscale_typesafe {
+ atomic_t rts_refctr; // Used by all flavors
+ spinlock_t rts_lock;
+ seqlock_t rts_seqlock;
+ unsigned int a;
+ unsigned int b;
+};
+
+static struct kmem_cache *typesafe_kmem_cachep;
+static struct refscale_typesafe **rtsarray;
+static long rtsarray_size;
+static DEFINE_TORTURE_RANDOM_PERCPU(refscale_rand);
+static bool (*rts_acquire)(struct refscale_typesafe *rtsp, unsigned int *start);
+static bool (*rts_release)(struct refscale_typesafe *rtsp, unsigned int start);
+
+// Conditionally acquire an explicit in-structure reference count.
+static bool typesafe_ref_acquire(struct refscale_typesafe *rtsp, unsigned int *start)
+{
+ return atomic_inc_not_zero(&rtsp->rts_refctr);
+}
+
+// Unconditionally release an explicit in-structure reference count.
+static bool typesafe_ref_release(struct refscale_typesafe *rtsp, unsigned int start)
+{
+ if (!atomic_dec_return(&rtsp->rts_refctr)) {
+ WRITE_ONCE(rtsp->a, rtsp->a + 1);
+ kmem_cache_free(typesafe_kmem_cachep, rtsp);
+ }
+ return true;
+}
+
+// Unconditionally acquire an explicit in-structure spinlock.
+static bool typesafe_lock_acquire(struct refscale_typesafe *rtsp, unsigned int *start)
+{
+ spin_lock(&rtsp->rts_lock);
+ return true;
+}
+
+// Unconditionally release an explicit in-structure spinlock.
+static bool typesafe_lock_release(struct refscale_typesafe *rtsp, unsigned int start)
+{
+ spin_unlock(&rtsp->rts_lock);
+ return true;
+}
+
+// Unconditionally acquire an explicit in-structure sequence lock.
+static bool typesafe_seqlock_acquire(struct refscale_typesafe *rtsp, unsigned int *start)
+{
+ *start = read_seqbegin(&rtsp->rts_seqlock);
+ return true;
+}
+
+// Conditionally release an explicit in-structure sequence lock. Return
+// true if this release was successful, that is, if no retry is required.
+static bool typesafe_seqlock_release(struct refscale_typesafe *rtsp, unsigned int start)
+{
+ return !read_seqretry(&rtsp->rts_seqlock, start);
+}
+
+// Do a read-side critical section with the specified delay in
+// microseconds and nanoseconds inserted so as to increase probability
+// of failure.
+static void typesafe_delay_section(const int nloops, const int udl, const int ndl)
+{
+ unsigned int a;
+ unsigned int b;
+ int i;
+ long idx;
+ struct refscale_typesafe *rtsp;
+ unsigned int start;
+
+ for (i = nloops; i >= 0; i--) {
+ preempt_disable();
+ idx = torture_random(this_cpu_ptr(&refscale_rand)) % rtsarray_size;
+ preempt_enable();
+retry:
+ rcu_read_lock();
+ rtsp = rcu_dereference(rtsarray[idx]);
+ a = READ_ONCE(rtsp->a);
+ if (!rts_acquire(rtsp, &start)) {
+ rcu_read_unlock();
+ goto retry;
+ }
+ if (a != READ_ONCE(rtsp->a)) {
+ (void)rts_release(rtsp, start);
+ rcu_read_unlock();
+ goto retry;
+ }
+ un_delay(udl, ndl);
+ // Remember, seqlock read-side release can fail.
+ if (!rts_release(rtsp, start)) {
+ rcu_read_unlock();
+ goto retry;
+ }
+ b = READ_ONCE(rtsp->a);
+ WARN_ONCE(a != b, "Re-read of ->a changed from %u to %u.\n", a, b);
+ b = rtsp->b;
+ rcu_read_unlock();
+ WARN_ON_ONCE(a * a != b);
+ }
+}
+
+// Because the acquisition and release methods are expensive, there
+// is no point in optimizing away the un_delay() function's two checks.
+// Thus simply define typesafe_read_section() as a simple wrapper around
+// typesafe_delay_section().
+static void typesafe_read_section(const int nloops)
+{
+ typesafe_delay_section(nloops, 0, 0);
+}
+
+// Allocate and initialize one refscale_typesafe structure.
+static struct refscale_typesafe *typesafe_alloc_one(void)
+{
+ struct refscale_typesafe *rtsp;
+
+ rtsp = kmem_cache_alloc(typesafe_kmem_cachep, GFP_KERNEL);
+ if (!rtsp)
+ return NULL;
+ atomic_set(&rtsp->rts_refctr, 1);
+ WRITE_ONCE(rtsp->a, rtsp->a + 1);
+ WRITE_ONCE(rtsp->b, rtsp->a * rtsp->a);
+ return rtsp;
+}
+
+// Slab-allocator constructor for refscale_typesafe structures created
+// out of a new slab of system memory.
+static void refscale_typesafe_ctor(void *rtsp_in)
+{
+ struct refscale_typesafe *rtsp = rtsp_in;
+
+ spin_lock_init(&rtsp->rts_lock);
+ seqlock_init(&rtsp->rts_seqlock);
+ preempt_disable();
+ rtsp->a = torture_random(this_cpu_ptr(&refscale_rand));
+ preempt_enable();
+}
+
+static struct ref_scale_ops typesafe_ref_ops;
+static struct ref_scale_ops typesafe_lock_ops;
+static struct ref_scale_ops typesafe_seqlock_ops;
+
+// Initialize for a typesafe test.
+static bool typesafe_init(void)
+{
+ long idx;
+ long si = lookup_instances;
+
+ typesafe_kmem_cachep = kmem_cache_create("refscale_typesafe",
+ sizeof(struct refscale_typesafe), sizeof(void *),
+ SLAB_TYPESAFE_BY_RCU, refscale_typesafe_ctor);
+ if (!typesafe_kmem_cachep)
+ return false;
+ if (si < 0)
+ si = -si * nr_cpu_ids;
+ else if (si == 0)
+ si = nr_cpu_ids;
+ rtsarray_size = si;
+ rtsarray = kcalloc(si, sizeof(*rtsarray), GFP_KERNEL);
+ if (!rtsarray)
+ return false;
+ for (idx = 0; idx < rtsarray_size; idx++) {
+ rtsarray[idx] = typesafe_alloc_one();
+ if (!rtsarray[idx])
+ return false;
+ }
+ if (cur_ops == &typesafe_ref_ops) {
+ rts_acquire = typesafe_ref_acquire;
+ rts_release = typesafe_ref_release;
+ } else if (cur_ops == &typesafe_lock_ops) {
+ rts_acquire = typesafe_lock_acquire;
+ rts_release = typesafe_lock_release;
+ } else if (cur_ops == &typesafe_seqlock_ops) {
+ rts_acquire = typesafe_seqlock_acquire;
+ rts_release = typesafe_seqlock_release;
+ } else {
+ WARN_ON_ONCE(1);
+ return false;
+ }
+ return true;
+}
+
+// Clean up after a typesafe test.
+static void typesafe_cleanup(void)
+{
+ long idx;
+
+ if (rtsarray) {
+ for (idx = 0; idx < rtsarray_size; idx++)
+ kmem_cache_free(typesafe_kmem_cachep, rtsarray[idx]);
+ kfree(rtsarray);
+ rtsarray = NULL;
+ rtsarray_size = 0;
+ }
+ kmem_cache_destroy(typesafe_kmem_cachep);
+ typesafe_kmem_cachep = NULL;
+ rts_acquire = NULL;
+ rts_release = NULL;
+}
+
+// The typesafe_init() function distinguishes these structures by address.
+static struct ref_scale_ops typesafe_ref_ops = {
+ .init = typesafe_init,
+ .cleanup = typesafe_cleanup,
+ .readsection = typesafe_read_section,
+ .delaysection = typesafe_delay_section,
+ .name = "typesafe_ref"
+};
+
+static struct ref_scale_ops typesafe_lock_ops = {
+ .init = typesafe_init,
+ .cleanup = typesafe_cleanup,
+ .readsection = typesafe_read_section,
+ .delaysection = typesafe_delay_section,
+ .name = "typesafe_lock"
+};
+
+static struct ref_scale_ops typesafe_seqlock_ops = {
+ .init = typesafe_init,
+ .cleanup = typesafe_cleanup,
+ .readsection = typesafe_read_section,
+ .delaysection = typesafe_delay_section,
+ .name = "typesafe_seqlock"
+};
+
static void rcu_scale_one_reader(void)
{
if (readdelay <= 0)
@@ -570,7 +862,7 @@ static u64 process_durations(int n)
char *buf;
u64 sum = 0;
- buf = kmalloc(128 + nreaders * 32, GFP_KERNEL);
+ buf = kmalloc(800 + 64, GFP_KERNEL);
if (!buf)
return 0;
buf[0] = 0;
@@ -583,13 +875,15 @@ static u64 process_durations(int n)
if (i % 5 == 0)
strcat(buf, "\n");
+ if (strlen(buf) >= 800) {
+ pr_alert("%s", buf);
+ buf[0] = 0;
+ }
strcat(buf, buf1);
sum += rt->last_duration_ns;
}
- strcat(buf, "\n");
-
- SCALEOUT("%s\n", buf);
+ pr_alert("%s\n", buf);
kfree(buf);
return sum;
@@ -603,7 +897,6 @@ static u64 process_durations(int n)
// point all the timestamps are printed.
static int main_func(void *arg)
{
- bool errexit = false;
int exp, r;
char buf1[64];
char *buf;
@@ -614,10 +907,10 @@ static int main_func(void *arg)
VERBOSE_SCALEOUT("main_func task started");
result_avg = kzalloc(nruns * sizeof(*result_avg), GFP_KERNEL);
- buf = kzalloc(64 + nruns * 32, GFP_KERNEL);
+ buf = kzalloc(800 + 64, GFP_KERNEL);
if (!result_avg || !buf) {
- VERBOSE_SCALEOUT_ERRSTRING("out of memory");
- errexit = true;
+ SCALEOUT_ERRSTRING("out of memory");
+ goto oom_exit;
}
if (holdoff)
schedule_timeout_interruptible(holdoff * HZ);
@@ -629,8 +922,6 @@ static int main_func(void *arg)
// Start exp readers up per experiment
for (exp = 0; exp < nruns && !torture_must_stop(); exp++) {
- if (errexit)
- break;
if (torture_must_stop())
goto end;
@@ -664,26 +955,23 @@ static int main_func(void *arg)
// Print the average of all experiments
SCALEOUT("END OF TEST. Calculating average duration per loop (nanoseconds)...\n");
- if (!errexit) {
- buf[0] = 0;
- strcat(buf, "\n");
- strcat(buf, "Runs\tTime(ns)\n");
- }
-
+ pr_alert("Runs\tTime(ns)\n");
for (exp = 0; exp < nruns; exp++) {
u64 avg;
u32 rem;
- if (errexit)
- break;
avg = div_u64_rem(result_avg[exp], 1000, &rem);
sprintf(buf1, "%d\t%llu.%03u\n", exp + 1, avg, rem);
strcat(buf, buf1);
+ if (strlen(buf) >= 800) {
+ pr_alert("%s", buf);
+ buf[0] = 0;
+ }
}
- if (!errexit)
- SCALEOUT("%s", buf);
+ pr_alert("%s", buf);
+oom_exit:
// This will shutdown everything including us.
if (shutdown) {
shutdown_start = 1;
@@ -743,7 +1031,7 @@ ref_scale_cleanup(void)
static int
ref_scale_shutdown(void *arg)
{
- wait_event(shutdown_wq, shutdown_start);
+ wait_event_idle(shutdown_wq, shutdown_start);
smp_mb(); // Wake before output.
ref_scale_cleanup();
@@ -758,8 +1046,9 @@ ref_scale_init(void)
long i;
int firsterr = 0;
static struct ref_scale_ops *scale_ops[] = {
- &rcu_ops, &srcu_ops, &rcu_trace_ops, &rcu_tasks_ops, &refcnt_ops, &rwlock_ops,
- &rwsem_ops, &lock_ops, &lock_irq_ops, &acqrel_ops,
+ &rcu_ops, &srcu_ops, RCU_TRACE_OPS RCU_TASKS_OPS &refcnt_ops, &rwlock_ops,
+ &rwsem_ops, &lock_ops, &lock_irq_ops, &acqrel_ops, &clock_ops,
+ &typesafe_ref_ops, &typesafe_lock_ops, &typesafe_seqlock_ops,
};
if (!torture_init_begin(scale_type, verbose))
@@ -781,7 +1070,10 @@ ref_scale_init(void)
goto unwind;
}
if (cur_ops->init)
- cur_ops->init();
+ if (!cur_ops->init()) {
+ firsterr = -EUCLEAN;
+ goto unwind;
+ }
ref_scale_print_module_parms(cur_ops, "Start of test");
@@ -790,7 +1082,7 @@ ref_scale_init(void)
init_waitqueue_head(&shutdown_wq);
firsterr = torture_create_kthread(ref_scale_shutdown, NULL,
shutdown_task);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
schedule_timeout_uninterruptible(1);
}
@@ -807,17 +1099,17 @@ ref_scale_init(void)
reader_tasks = kcalloc(nreaders, sizeof(reader_tasks[0]),
GFP_KERNEL);
if (!reader_tasks) {
- VERBOSE_SCALEOUT_ERRSTRING("out of memory");
+ SCALEOUT_ERRSTRING("out of memory");
firsterr = -ENOMEM;
goto unwind;
}
- VERBOSE_SCALEOUT("Starting %d reader threads\n", nreaders);
+ VERBOSE_SCALEOUT("Starting %d reader threads", nreaders);
for (i = 0; i < nreaders; i++) {
firsterr = torture_create_kthread(ref_scale_reader, (void *)i,
reader_tasks[i].task);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
init_waitqueue_head(&(reader_tasks[i].wq));
@@ -826,7 +1118,7 @@ ref_scale_init(void)
// Main Task
init_waitqueue_head(&main_wq);
firsterr = torture_create_kthread(main_func, NULL, main_task);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
torture_init_end();
diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
index 26344dc6483b..336af24e0fe3 100644
--- a/kernel/rcu/srcutiny.c
+++ b/kernel/rcu/srcutiny.c
@@ -96,10 +96,10 @@ EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
*/
void __srcu_read_unlock(struct srcu_struct *ssp, int idx)
{
- int newval = ssp->srcu_lock_nesting[idx] - 1;
+ int newval = READ_ONCE(ssp->srcu_lock_nesting[idx]) - 1;
WRITE_ONCE(ssp->srcu_lock_nesting[idx], newval);
- if (!newval && READ_ONCE(ssp->srcu_gp_waiting))
+ if (!newval && READ_ONCE(ssp->srcu_gp_waiting) && in_task())
swake_up_one(&ssp->srcu_wq);
}
EXPORT_SYMBOL_GPL(__srcu_read_unlock);
@@ -117,7 +117,7 @@ void srcu_drive_gp(struct work_struct *wp)
struct srcu_struct *ssp;
ssp = container_of(wp, struct srcu_struct, srcu_work);
- if (ssp->srcu_gp_running || USHORT_CMP_GE(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max)))
+ if (ssp->srcu_gp_running || ULONG_CMP_GE(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max)))
return; /* Already running or nothing to do. */
/* Remove recently arrived callbacks and wait for readers. */
@@ -150,17 +150,17 @@ void srcu_drive_gp(struct work_struct *wp)
* straighten that out.
*/
WRITE_ONCE(ssp->srcu_gp_running, false);
- if (USHORT_CMP_LT(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max)))
+ if (ULONG_CMP_LT(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max)))
schedule_work(&ssp->srcu_work);
}
EXPORT_SYMBOL_GPL(srcu_drive_gp);
static void srcu_gp_start_if_needed(struct srcu_struct *ssp)
{
- unsigned short cookie;
+ unsigned long cookie;
cookie = get_state_synchronize_srcu(ssp);
- if (USHORT_CMP_GE(READ_ONCE(ssp->srcu_idx_max), cookie))
+ if (ULONG_CMP_GE(READ_ONCE(ssp->srcu_idx_max), cookie))
return;
WRITE_ONCE(ssp->srcu_idx_max, cookie);
if (!READ_ONCE(ssp->srcu_gp_running)) {
@@ -197,6 +197,18 @@ void synchronize_srcu(struct srcu_struct *ssp)
{
struct rcu_synchronize rs;
+ srcu_lock_sync(&ssp->dep_map);
+
+ RCU_LOCKDEP_WARN(lockdep_is_held(ssp) ||
+ lock_is_held(&rcu_bh_lock_map) ||
+ lock_is_held(&rcu_lock_map) ||
+ lock_is_held(&rcu_sched_lock_map),
+ "Illegal synchronize_srcu() in same-type SRCU (or in RCU) read-side critical section");
+
+ if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE)
+ return;
+
+ might_sleep();
init_rcu_head_on_stack(&rs.head);
init_completion(&rs.completion);
call_srcu(ssp, &rs.head, wakeme_after_rcu);
@@ -215,7 +227,7 @@ unsigned long get_state_synchronize_srcu(struct srcu_struct *ssp)
barrier();
ret = (READ_ONCE(ssp->srcu_idx) + 3) & ~0x1;
barrier();
- return ret & USHRT_MAX;
+ return ret;
}
EXPORT_SYMBOL_GPL(get_state_synchronize_srcu);
@@ -240,10 +252,10 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_srcu);
*/
bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie)
{
- bool ret = USHORT_CMP_GE(READ_ONCE(ssp->srcu_idx), cookie);
+ unsigned long cur_s = READ_ONCE(ssp->srcu_idx);
barrier();
- return ret;
+ return ULONG_CMP_GE(cur_s, cookie) || ULONG_CMP_LT(cur_s, cookie - 3);
}
EXPORT_SYMBOL_GPL(poll_state_synchronize_srcu);
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 6833d8887181..20d7a238d675 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -24,6 +24,7 @@
#include <linux/smp.h>
#include <linux/delay.h>
#include <linux/module.h>
+#include <linux/slab.h>
#include <linux/srcu.h>
#include "rcu.h"
@@ -38,6 +39,35 @@ module_param(exp_holdoff, ulong, 0444);
static ulong counter_wrap_check = (ULONG_MAX >> 2);
module_param(counter_wrap_check, ulong, 0444);
+/*
+ * Control conversion to SRCU_SIZE_BIG:
+ * 0: Don't convert at all.
+ * 1: Convert at init_srcu_struct() time.
+ * 2: Convert when rcutorture invokes srcu_torture_stats_print().
+ * 3: Decide at boot time based on system shape (default).
+ * 0x1x: Convert when excessive contention encountered.
+ */
+#define SRCU_SIZING_NONE 0
+#define SRCU_SIZING_INIT 1
+#define SRCU_SIZING_TORTURE 2
+#define SRCU_SIZING_AUTO 3
+#define SRCU_SIZING_CONTEND 0x10
+#define SRCU_SIZING_IS(x) ((convert_to_big & ~SRCU_SIZING_CONTEND) == x)
+#define SRCU_SIZING_IS_NONE() (SRCU_SIZING_IS(SRCU_SIZING_NONE))
+#define SRCU_SIZING_IS_INIT() (SRCU_SIZING_IS(SRCU_SIZING_INIT))
+#define SRCU_SIZING_IS_TORTURE() (SRCU_SIZING_IS(SRCU_SIZING_TORTURE))
+#define SRCU_SIZING_IS_CONTEND() (convert_to_big & SRCU_SIZING_CONTEND)
+static int convert_to_big = SRCU_SIZING_AUTO;
+module_param(convert_to_big, int, 0444);
+
+/* Number of CPUs to trigger init_srcu_struct()-time transition to big. */
+static int big_cpu_lim __read_mostly = 128;
+module_param(big_cpu_lim, int, 0444);
+
+/* Contention events per jiffy to initiate transition to big. */
+static int small_contention_lim __read_mostly = 100;
+module_param(small_contention_lim, int, 0444);
+
/* Early-boot callback-management, so early that no lock is required! */
static LIST_HEAD(srcu_boot_list);
static bool __read_mostly srcu_init_done;
@@ -48,39 +78,90 @@ static void process_srcu(struct work_struct *work);
static void srcu_delay_timer(struct timer_list *t);
/* Wrappers for lock acquisition and release, see raw_spin_lock_rcu_node(). */
-#define spin_lock_rcu_node(p) \
-do { \
- spin_lock(&ACCESS_PRIVATE(p, lock)); \
- smp_mb__after_unlock_lock(); \
+#define spin_lock_rcu_node(p) \
+do { \
+ spin_lock(&ACCESS_PRIVATE(p, lock)); \
+ smp_mb__after_unlock_lock(); \
} while (0)
#define spin_unlock_rcu_node(p) spin_unlock(&ACCESS_PRIVATE(p, lock))
-#define spin_lock_irq_rcu_node(p) \
-do { \
- spin_lock_irq(&ACCESS_PRIVATE(p, lock)); \
- smp_mb__after_unlock_lock(); \
+#define spin_lock_irq_rcu_node(p) \
+do { \
+ spin_lock_irq(&ACCESS_PRIVATE(p, lock)); \
+ smp_mb__after_unlock_lock(); \
} while (0)
-#define spin_unlock_irq_rcu_node(p) \
+#define spin_unlock_irq_rcu_node(p) \
spin_unlock_irq(&ACCESS_PRIVATE(p, lock))
-#define spin_lock_irqsave_rcu_node(p, flags) \
-do { \
- spin_lock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \
- smp_mb__after_unlock_lock(); \
+#define spin_lock_irqsave_rcu_node(p, flags) \
+do { \
+ spin_lock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \
+ smp_mb__after_unlock_lock(); \
} while (0)
-#define spin_unlock_irqrestore_rcu_node(p, flags) \
- spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags) \
+#define spin_trylock_irqsave_rcu_node(p, flags) \
+({ \
+ bool ___locked = spin_trylock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \
+ \
+ if (___locked) \
+ smp_mb__after_unlock_lock(); \
+ ___locked; \
+})
+
+#define spin_unlock_irqrestore_rcu_node(p, flags) \
+ spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags) \
/*
- * Initialize SRCU combining tree. Note that statically allocated
+ * Initialize SRCU per-CPU data. Note that statically allocated
* srcu_struct structures might already have srcu_read_lock() and
* srcu_read_unlock() running against them. So if the is_static parameter
* is set, don't initialize ->srcu_lock_count[] and ->srcu_unlock_count[].
*/
-static void init_srcu_struct_nodes(struct srcu_struct *ssp)
+static void init_srcu_struct_data(struct srcu_struct *ssp)
+{
+ int cpu;
+ struct srcu_data *sdp;
+
+ /*
+ * Initialize the per-CPU srcu_data array, which feeds into the
+ * leaves of the srcu_node tree.
+ */
+ WARN_ON_ONCE(ARRAY_SIZE(sdp->srcu_lock_count) !=
+ ARRAY_SIZE(sdp->srcu_unlock_count));
+ for_each_possible_cpu(cpu) {
+ sdp = per_cpu_ptr(ssp->sda, cpu);
+ spin_lock_init(&ACCESS_PRIVATE(sdp, lock));
+ rcu_segcblist_init(&sdp->srcu_cblist);
+ sdp->srcu_cblist_invoking = false;
+ sdp->srcu_gp_seq_needed = ssp->srcu_sup->srcu_gp_seq;
+ sdp->srcu_gp_seq_needed_exp = ssp->srcu_sup->srcu_gp_seq;
+ sdp->mynode = NULL;
+ sdp->cpu = cpu;
+ INIT_WORK(&sdp->work, srcu_invoke_callbacks);
+ timer_setup(&sdp->delay_work, srcu_delay_timer, 0);
+ sdp->ssp = ssp;
+ }
+}
+
+/* Invalid seq state, used during snp node initialization */
+#define SRCU_SNP_INIT_SEQ 0x2
+
+/*
+ * Check whether sequence number corresponding to snp node,
+ * is invalid.
+ */
+static inline bool srcu_invl_snp_seq(unsigned long s)
+{
+ return s == SRCU_SNP_INIT_SEQ;
+}
+
+/*
+ * Allocated and initialize SRCU combining tree. Returns @true if
+ * allocation succeeded and @false otherwise.
+ */
+static bool init_srcu_struct_nodes(struct srcu_struct *ssp, gfp_t gfp_flags)
{
int cpu;
int i;
@@ -92,11 +173,14 @@ static void init_srcu_struct_nodes(struct srcu_struct *ssp)
/* Initialize geometry if it has not already been initialized. */
rcu_init_geometry();
+ ssp->srcu_sup->node = kcalloc(rcu_num_nodes, sizeof(*ssp->srcu_sup->node), gfp_flags);
+ if (!ssp->srcu_sup->node)
+ return false;
/* Work out the overall tree geometry. */
- ssp->level[0] = &ssp->node[0];
+ ssp->srcu_sup->level[0] = &ssp->srcu_sup->node[0];
for (i = 1; i < rcu_num_lvls; i++)
- ssp->level[i] = ssp->level[i - 1] + num_rcu_lvl[i - 1];
+ ssp->srcu_sup->level[i] = ssp->srcu_sup->level[i - 1] + num_rcu_lvl[i - 1];
rcu_init_levelspread(levelspread, num_rcu_lvl);
/* Each pass through this loop initializes one srcu_node structure. */
@@ -105,23 +189,23 @@ static void init_srcu_struct_nodes(struct srcu_struct *ssp)
WARN_ON_ONCE(ARRAY_SIZE(snp->srcu_have_cbs) !=
ARRAY_SIZE(snp->srcu_data_have_cbs));
for (i = 0; i < ARRAY_SIZE(snp->srcu_have_cbs); i++) {
- snp->srcu_have_cbs[i] = 0;
+ snp->srcu_have_cbs[i] = SRCU_SNP_INIT_SEQ;
snp->srcu_data_have_cbs[i] = 0;
}
- snp->srcu_gp_seq_needed_exp = 0;
+ snp->srcu_gp_seq_needed_exp = SRCU_SNP_INIT_SEQ;
snp->grplo = -1;
snp->grphi = -1;
- if (snp == &ssp->node[0]) {
+ if (snp == &ssp->srcu_sup->node[0]) {
/* Root node, special case. */
snp->srcu_parent = NULL;
continue;
}
/* Non-root node. */
- if (snp == ssp->level[level + 1])
+ if (snp == ssp->srcu_sup->level[level + 1])
level++;
- snp->srcu_parent = ssp->level[level - 1] +
- (snp - ssp->level[level]) /
+ snp->srcu_parent = ssp->srcu_sup->level[level - 1] +
+ (snp - ssp->srcu_sup->level[level]) /
levelspread[level - 1];
}
@@ -129,55 +213,70 @@ static void init_srcu_struct_nodes(struct srcu_struct *ssp)
* Initialize the per-CPU srcu_data array, which feeds into the
* leaves of the srcu_node tree.
*/
- WARN_ON_ONCE(ARRAY_SIZE(sdp->srcu_lock_count) !=
- ARRAY_SIZE(sdp->srcu_unlock_count));
level = rcu_num_lvls - 1;
- snp_first = ssp->level[level];
+ snp_first = ssp->srcu_sup->level[level];
for_each_possible_cpu(cpu) {
sdp = per_cpu_ptr(ssp->sda, cpu);
- spin_lock_init(&ACCESS_PRIVATE(sdp, lock));
- rcu_segcblist_init(&sdp->srcu_cblist);
- sdp->srcu_cblist_invoking = false;
- sdp->srcu_gp_seq_needed = ssp->srcu_gp_seq;
- sdp->srcu_gp_seq_needed_exp = ssp->srcu_gp_seq;
sdp->mynode = &snp_first[cpu / levelspread[level]];
for (snp = sdp->mynode; snp != NULL; snp = snp->srcu_parent) {
if (snp->grplo < 0)
snp->grplo = cpu;
snp->grphi = cpu;
}
- sdp->cpu = cpu;
- INIT_WORK(&sdp->work, srcu_invoke_callbacks);
- timer_setup(&sdp->delay_work, srcu_delay_timer, 0);
- sdp->ssp = ssp;
sdp->grpmask = 1 << (cpu - sdp->mynode->grplo);
}
+ smp_store_release(&ssp->srcu_sup->srcu_size_state, SRCU_SIZE_WAIT_BARRIER);
+ return true;
}
/*
* Initialize non-compile-time initialized fields, including the
- * associated srcu_node and srcu_data structures. The is_static
- * parameter is passed through to init_srcu_struct_nodes(), and
- * also tells us that ->sda has already been wired up to srcu_data.
+ * associated srcu_node and srcu_data structures. The is_static parameter
+ * tells us that ->sda has already been wired up to srcu_data.
*/
static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static)
{
- mutex_init(&ssp->srcu_cb_mutex);
- mutex_init(&ssp->srcu_gp_mutex);
+ if (!is_static)
+ ssp->srcu_sup = kzalloc(sizeof(*ssp->srcu_sup), GFP_KERNEL);
+ if (!ssp->srcu_sup)
+ return -ENOMEM;
+ if (!is_static)
+ spin_lock_init(&ACCESS_PRIVATE(ssp->srcu_sup, lock));
+ ssp->srcu_sup->srcu_size_state = SRCU_SIZE_SMALL;
+ ssp->srcu_sup->node = NULL;
+ mutex_init(&ssp->srcu_sup->srcu_cb_mutex);
+ mutex_init(&ssp->srcu_sup->srcu_gp_mutex);
ssp->srcu_idx = 0;
- ssp->srcu_gp_seq = 0;
- ssp->srcu_barrier_seq = 0;
- mutex_init(&ssp->srcu_barrier_mutex);
- atomic_set(&ssp->srcu_barrier_cpu_cnt, 0);
- INIT_DELAYED_WORK(&ssp->work, process_srcu);
+ ssp->srcu_sup->srcu_gp_seq = 0;
+ ssp->srcu_sup->srcu_barrier_seq = 0;
+ mutex_init(&ssp->srcu_sup->srcu_barrier_mutex);
+ atomic_set(&ssp->srcu_sup->srcu_barrier_cpu_cnt, 0);
+ INIT_DELAYED_WORK(&ssp->srcu_sup->work, process_srcu);
+ ssp->srcu_sup->sda_is_static = is_static;
if (!is_static)
ssp->sda = alloc_percpu(struct srcu_data);
- if (!ssp->sda)
+ if (!ssp->sda) {
+ if (!is_static)
+ kfree(ssp->srcu_sup);
return -ENOMEM;
- init_srcu_struct_nodes(ssp);
- ssp->srcu_gp_seq_needed_exp = 0;
- ssp->srcu_last_gp_end = ktime_get_mono_fast_ns();
- smp_store_release(&ssp->srcu_gp_seq_needed, 0); /* Init done. */
+ }
+ init_srcu_struct_data(ssp);
+ ssp->srcu_sup->srcu_gp_seq_needed_exp = 0;
+ ssp->srcu_sup->srcu_last_gp_end = ktime_get_mono_fast_ns();
+ if (READ_ONCE(ssp->srcu_sup->srcu_size_state) == SRCU_SIZE_SMALL && SRCU_SIZING_IS_INIT()) {
+ if (!init_srcu_struct_nodes(ssp, GFP_ATOMIC)) {
+ if (!ssp->srcu_sup->sda_is_static) {
+ free_percpu(ssp->sda);
+ ssp->sda = NULL;
+ kfree(ssp->srcu_sup);
+ return -ENOMEM;
+ }
+ } else {
+ WRITE_ONCE(ssp->srcu_sup->srcu_size_state, SRCU_SIZE_BIG);
+ }
+ }
+ ssp->srcu_sup->srcu_ssp = ssp;
+ smp_store_release(&ssp->srcu_sup->srcu_gp_seq_needed, 0); /* Init done. */
return 0;
}
@@ -189,7 +288,6 @@ int __init_srcu_struct(struct srcu_struct *ssp, const char *name,
/* Don't re-initialize a lock while it is held. */
debug_check_no_locks_freed((void *)ssp, sizeof(*ssp));
lockdep_init_map(&ssp->dep_map, name, key, 0);
- spin_lock_init(&ACCESS_PRIVATE(ssp, lock));
return init_srcu_struct_fields(ssp, false);
}
EXPORT_SYMBOL_GPL(__init_srcu_struct);
@@ -206,7 +304,6 @@ EXPORT_SYMBOL_GPL(__init_srcu_struct);
*/
int init_srcu_struct(struct srcu_struct *ssp)
{
- spin_lock_init(&ACCESS_PRIVATE(ssp, lock));
return init_srcu_struct_fields(ssp, false);
}
EXPORT_SYMBOL_GPL(init_srcu_struct);
@@ -214,6 +311,86 @@ EXPORT_SYMBOL_GPL(init_srcu_struct);
#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
/*
+ * Initiate a transition to SRCU_SIZE_BIG with lock held.
+ */
+static void __srcu_transition_to_big(struct srcu_struct *ssp)
+{
+ lockdep_assert_held(&ACCESS_PRIVATE(ssp->srcu_sup, lock));
+ smp_store_release(&ssp->srcu_sup->srcu_size_state, SRCU_SIZE_ALLOC);
+}
+
+/*
+ * Initiate an idempotent transition to SRCU_SIZE_BIG.
+ */
+static void srcu_transition_to_big(struct srcu_struct *ssp)
+{
+ unsigned long flags;
+
+ /* Double-checked locking on ->srcu_size-state. */
+ if (smp_load_acquire(&ssp->srcu_sup->srcu_size_state) != SRCU_SIZE_SMALL)
+ return;
+ spin_lock_irqsave_rcu_node(ssp->srcu_sup, flags);
+ if (smp_load_acquire(&ssp->srcu_sup->srcu_size_state) != SRCU_SIZE_SMALL) {
+ spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags);
+ return;
+ }
+ __srcu_transition_to_big(ssp);
+ spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags);
+}
+
+/*
+ * Check to see if the just-encountered contention event justifies
+ * a transition to SRCU_SIZE_BIG.
+ */
+static void spin_lock_irqsave_check_contention(struct srcu_struct *ssp)
+{
+ unsigned long j;
+
+ if (!SRCU_SIZING_IS_CONTEND() || ssp->srcu_sup->srcu_size_state)
+ return;
+ j = jiffies;
+ if (ssp->srcu_sup->srcu_size_jiffies != j) {
+ ssp->srcu_sup->srcu_size_jiffies = j;
+ ssp->srcu_sup->srcu_n_lock_retries = 0;
+ }
+ if (++ssp->srcu_sup->srcu_n_lock_retries <= small_contention_lim)
+ return;
+ __srcu_transition_to_big(ssp);
+}
+
+/*
+ * Acquire the specified srcu_data structure's ->lock, but check for
+ * excessive contention, which results in initiation of a transition
+ * to SRCU_SIZE_BIG. But only if the srcutree.convert_to_big module
+ * parameter permits this.
+ */
+static void spin_lock_irqsave_sdp_contention(struct srcu_data *sdp, unsigned long *flags)
+{
+ struct srcu_struct *ssp = sdp->ssp;
+
+ if (spin_trylock_irqsave_rcu_node(sdp, *flags))
+ return;
+ spin_lock_irqsave_rcu_node(ssp->srcu_sup, *flags);
+ spin_lock_irqsave_check_contention(ssp);
+ spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, *flags);
+ spin_lock_irqsave_rcu_node(sdp, *flags);
+}
+
+/*
+ * Acquire the specified srcu_struct structure's ->lock, but check for
+ * excessive contention, which results in initiation of a transition
+ * to SRCU_SIZE_BIG. But only if the srcutree.convert_to_big module
+ * parameter permits this.
+ */
+static void spin_lock_irqsave_ssp_contention(struct srcu_struct *ssp, unsigned long *flags)
+{
+ if (spin_trylock_irqsave_rcu_node(ssp->srcu_sup, *flags))
+ return;
+ spin_lock_irqsave_rcu_node(ssp->srcu_sup, *flags);
+ spin_lock_irqsave_check_contention(ssp);
+}
+
+/*
* First-use initialization of statically allocated srcu_struct
* structure. Wiring up the combining tree is more than can be
* done with compile-time initialization, so this check is added
@@ -226,15 +403,15 @@ static void check_init_srcu_struct(struct srcu_struct *ssp)
unsigned long flags;
/* The smp_load_acquire() pairs with the smp_store_release(). */
- if (!rcu_seq_state(smp_load_acquire(&ssp->srcu_gp_seq_needed))) /*^^^*/
+ if (!rcu_seq_state(smp_load_acquire(&ssp->srcu_sup->srcu_gp_seq_needed))) /*^^^*/
return; /* Already initialized. */
- spin_lock_irqsave_rcu_node(ssp, flags);
- if (!rcu_seq_state(ssp->srcu_gp_seq_needed)) {
- spin_unlock_irqrestore_rcu_node(ssp, flags);
+ spin_lock_irqsave_rcu_node(ssp->srcu_sup, flags);
+ if (!rcu_seq_state(ssp->srcu_sup->srcu_gp_seq_needed)) {
+ spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags);
return;
}
init_srcu_struct_fields(ssp, true);
- spin_unlock_irqrestore_rcu_node(ssp, flags);
+ spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags);
}
/*
@@ -249,7 +426,7 @@ static unsigned long srcu_readers_lock_idx(struct srcu_struct *ssp, int idx)
for_each_possible_cpu(cpu) {
struct srcu_data *cpuc = per_cpu_ptr(ssp->sda, cpu);
- sum += READ_ONCE(cpuc->srcu_lock_count[idx]);
+ sum += atomic_long_read(&cpuc->srcu_lock_count[idx]);
}
return sum;
}
@@ -261,13 +438,18 @@ static unsigned long srcu_readers_lock_idx(struct srcu_struct *ssp, int idx)
static unsigned long srcu_readers_unlock_idx(struct srcu_struct *ssp, int idx)
{
int cpu;
+ unsigned long mask = 0;
unsigned long sum = 0;
for_each_possible_cpu(cpu) {
struct srcu_data *cpuc = per_cpu_ptr(ssp->sda, cpu);
- sum += READ_ONCE(cpuc->srcu_unlock_count[idx]);
+ sum += atomic_long_read(&cpuc->srcu_unlock_count[idx]);
+ if (IS_ENABLED(CONFIG_PROVE_RCU))
+ mask = mask | READ_ONCE(cpuc->srcu_nmi_safety);
}
+ WARN_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && (mask & (mask >> 1)),
+ "Mixed NMI-safe readers for srcu_struct at %ps.\n", ssp);
return sum;
}
@@ -296,24 +478,59 @@ static bool srcu_readers_active_idx_check(struct srcu_struct *ssp, int idx)
/*
* If the locks are the same as the unlocks, then there must have
- * been no readers on this index at some time in between. This does
- * not mean that there are no more readers, as one could have read
- * the current index but not have incremented the lock counter yet.
+ * been no readers on this index at some point in this function.
+ * But there might be more readers, as a task might have read
+ * the current ->srcu_idx but not yet have incremented its CPU's
+ * ->srcu_lock_count[idx] counter. In fact, it is possible
+ * that most of the tasks have been preempted between fetching
+ * ->srcu_idx and incrementing ->srcu_lock_count[idx]. And there
+ * could be almost (ULONG_MAX / sizeof(struct task_struct)) tasks
+ * in a system whose address space was fully populated with memory.
+ * Call this quantity Nt.
+ *
+ * So suppose that the updater is preempted at this point in the
+ * code for a long time. That now-preempted updater has already
+ * flipped ->srcu_idx (possibly during the preceding grace period),
+ * done an smp_mb() (again, possibly during the preceding grace
+ * period), and summed up the ->srcu_unlock_count[idx] counters.
+ * How many times can a given one of the aforementioned Nt tasks
+ * increment the old ->srcu_idx value's ->srcu_lock_count[idx]
+ * counter, in the absence of nesting?
*
- * So suppose that the updater is preempted here for so long
- * that more than ULONG_MAX non-nested readers come and go in
- * the meantime. It turns out that this cannot result in overflow
- * because if a reader modifies its unlock count after we read it
- * above, then that reader's next load of ->srcu_idx is guaranteed
- * to get the new value, which will cause it to operate on the
- * other bank of counters, where it cannot contribute to the
- * overflow of these counters. This means that there is a maximum
- * of 2*NR_CPUS increments, which cannot overflow given current
- * systems, especially not on 64-bit systems.
+ * It can clearly do so once, given that it has already fetched
+ * the old value of ->srcu_idx and is just about to use that value
+ * to index its increment of ->srcu_lock_count[idx]. But as soon as
+ * it leaves that SRCU read-side critical section, it will increment
+ * ->srcu_unlock_count[idx], which must follow the updater's above
+ * read from that same value. Thus, as soon the reading task does
+ * an smp_mb() and a later fetch from ->srcu_idx, that task will be
+ * guaranteed to get the new index. Except that the increment of
+ * ->srcu_unlock_count[idx] in __srcu_read_unlock() is after the
+ * smp_mb(), and the fetch from ->srcu_idx in __srcu_read_lock()
+ * is before the smp_mb(). Thus, that task might not see the new
+ * value of ->srcu_idx until the -second- __srcu_read_lock(),
+ * which in turn means that this task might well increment
+ * ->srcu_lock_count[idx] for the old value of ->srcu_idx twice,
+ * not just once.
*
- * OK, how about nesting? This does impose a limit on nesting
- * of floor(ULONG_MAX/NR_CPUS/2), which should be sufficient,
- * especially on 64-bit systems.
+ * However, it is important to note that a given smp_mb() takes
+ * effect not just for the task executing it, but also for any
+ * later task running on that same CPU.
+ *
+ * That is, there can be almost Nt + Nc further increments of
+ * ->srcu_lock_count[idx] for the old index, where Nc is the number
+ * of CPUs. But this is OK because the size of the task_struct
+ * structure limits the value of Nt and current systems limit Nc
+ * to a few thousand.
+ *
+ * OK, but what about nesting? This does impose a limit on
+ * nesting of half of the size of the task_struct structure
+ * (measured in bytes), which should be sufficient. A late 2022
+ * TREE01 rcutorture run reported this size to be no less than
+ * 9408 bytes, allowing up to 4704 levels of nesting, which is
+ * comfortably beyond excessive. Especially on 64-bit systems,
+ * which are unlikely to be configured with an address space fully
+ * populated with memory, at least not anytime soon.
*/
return srcu_readers_lock_idx(ssp, idx) == unlocks;
}
@@ -335,15 +552,60 @@ static bool srcu_readers_active(struct srcu_struct *ssp)
for_each_possible_cpu(cpu) {
struct srcu_data *cpuc = per_cpu_ptr(ssp->sda, cpu);
- sum += READ_ONCE(cpuc->srcu_lock_count[0]);
- sum += READ_ONCE(cpuc->srcu_lock_count[1]);
- sum -= READ_ONCE(cpuc->srcu_unlock_count[0]);
- sum -= READ_ONCE(cpuc->srcu_unlock_count[1]);
+ sum += atomic_long_read(&cpuc->srcu_lock_count[0]);
+ sum += atomic_long_read(&cpuc->srcu_lock_count[1]);
+ sum -= atomic_long_read(&cpuc->srcu_unlock_count[0]);
+ sum -= atomic_long_read(&cpuc->srcu_unlock_count[1]);
}
return sum;
}
-#define SRCU_INTERVAL 1
+/*
+ * We use an adaptive strategy for synchronize_srcu() and especially for
+ * synchronize_srcu_expedited(). We spin for a fixed time period
+ * (defined below, boot time configurable) to allow SRCU readers to exit
+ * their read-side critical sections. If there are still some readers
+ * after one jiffy, we repeatedly block for one jiffy time periods.
+ * The blocking time is increased as the grace-period age increases,
+ * with max blocking time capped at 10 jiffies.
+ */
+#define SRCU_DEFAULT_RETRY_CHECK_DELAY 5
+
+static ulong srcu_retry_check_delay = SRCU_DEFAULT_RETRY_CHECK_DELAY;
+module_param(srcu_retry_check_delay, ulong, 0444);
+
+#define SRCU_INTERVAL 1 // Base delay if no expedited GPs pending.
+#define SRCU_MAX_INTERVAL 10 // Maximum incremental delay from slow readers.
+
+#define SRCU_DEFAULT_MAX_NODELAY_PHASE_LO 3UL // Lowmark on default per-GP-phase
+ // no-delay instances.
+#define SRCU_DEFAULT_MAX_NODELAY_PHASE_HI 1000UL // Highmark on default per-GP-phase
+ // no-delay instances.
+
+#define SRCU_UL_CLAMP_LO(val, low) ((val) > (low) ? (val) : (low))
+#define SRCU_UL_CLAMP_HI(val, high) ((val) < (high) ? (val) : (high))
+#define SRCU_UL_CLAMP(val, low, high) SRCU_UL_CLAMP_HI(SRCU_UL_CLAMP_LO((val), (low)), (high))
+// per-GP-phase no-delay instances adjusted to allow non-sleeping poll upto
+// one jiffies time duration. Mult by 2 is done to factor in the srcu_get_delay()
+// called from process_srcu().
+#define SRCU_DEFAULT_MAX_NODELAY_PHASE_ADJUSTED \
+ (2UL * USEC_PER_SEC / HZ / SRCU_DEFAULT_RETRY_CHECK_DELAY)
+
+// Maximum per-GP-phase consecutive no-delay instances.
+#define SRCU_DEFAULT_MAX_NODELAY_PHASE \
+ SRCU_UL_CLAMP(SRCU_DEFAULT_MAX_NODELAY_PHASE_ADJUSTED, \
+ SRCU_DEFAULT_MAX_NODELAY_PHASE_LO, \
+ SRCU_DEFAULT_MAX_NODELAY_PHASE_HI)
+
+static ulong srcu_max_nodelay_phase = SRCU_DEFAULT_MAX_NODELAY_PHASE;
+module_param(srcu_max_nodelay_phase, ulong, 0444);
+
+// Maximum consecutive no-delay instances.
+#define SRCU_DEFAULT_MAX_NODELAY (SRCU_DEFAULT_MAX_NODELAY_PHASE > 100 ? \
+ SRCU_DEFAULT_MAX_NODELAY_PHASE : 100)
+
+static ulong srcu_max_nodelay = SRCU_DEFAULT_MAX_NODELAY;
+module_param(srcu_max_nodelay, ulong, 0444);
/*
* Return grace-period delay, zero if there are expedited grace
@@ -351,10 +613,25 @@ static bool srcu_readers_active(struct srcu_struct *ssp)
*/
static unsigned long srcu_get_delay(struct srcu_struct *ssp)
{
- if (ULONG_CMP_LT(READ_ONCE(ssp->srcu_gp_seq),
- READ_ONCE(ssp->srcu_gp_seq_needed_exp)))
- return 0;
- return SRCU_INTERVAL;
+ unsigned long gpstart;
+ unsigned long j;
+ unsigned long jbase = SRCU_INTERVAL;
+ struct srcu_usage *sup = ssp->srcu_sup;
+
+ if (ULONG_CMP_LT(READ_ONCE(sup->srcu_gp_seq), READ_ONCE(sup->srcu_gp_seq_needed_exp)))
+ jbase = 0;
+ if (rcu_seq_state(READ_ONCE(sup->srcu_gp_seq))) {
+ j = jiffies - 1;
+ gpstart = READ_ONCE(sup->srcu_gp_start);
+ if (time_after(j, gpstart))
+ jbase += j - gpstart;
+ if (!jbase) {
+ WRITE_ONCE(sup->srcu_n_exp_nodelay, READ_ONCE(sup->srcu_n_exp_nodelay) + 1);
+ if (READ_ONCE(sup->srcu_n_exp_nodelay) > srcu_max_nodelay_phase)
+ jbase = 1;
+ }
+ }
+ return jbase > SRCU_MAX_INTERVAL ? SRCU_MAX_INTERVAL : jbase;
}
/**
@@ -367,12 +644,13 @@ static unsigned long srcu_get_delay(struct srcu_struct *ssp)
void cleanup_srcu_struct(struct srcu_struct *ssp)
{
int cpu;
+ struct srcu_usage *sup = ssp->srcu_sup;
if (WARN_ON(!srcu_get_delay(ssp)))
return; /* Just leak it! */
if (WARN_ON(srcu_readers_active(ssp)))
return; /* Just leak it! */
- flush_delayed_work(&ssp->work);
+ flush_delayed_work(&sup->work);
for_each_possible_cpu(cpu) {
struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu);
@@ -381,17 +659,49 @@ void cleanup_srcu_struct(struct srcu_struct *ssp)
if (WARN_ON(rcu_segcblist_n_cbs(&sdp->srcu_cblist)))
return; /* Forgot srcu_barrier(), so just leak it! */
}
- if (WARN_ON(rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) != SRCU_STATE_IDLE) ||
+ if (WARN_ON(rcu_seq_state(READ_ONCE(sup->srcu_gp_seq)) != SRCU_STATE_IDLE) ||
+ WARN_ON(rcu_seq_current(&sup->srcu_gp_seq) != sup->srcu_gp_seq_needed) ||
WARN_ON(srcu_readers_active(ssp))) {
- pr_info("%s: Active srcu_struct %p state: %d\n",
- __func__, ssp, rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)));
+ pr_info("%s: Active srcu_struct %p read state: %d gp state: %lu/%lu\n",
+ __func__, ssp, rcu_seq_state(READ_ONCE(sup->srcu_gp_seq)),
+ rcu_seq_current(&sup->srcu_gp_seq), sup->srcu_gp_seq_needed);
return; /* Caller forgot to stop doing call_srcu()? */
}
- free_percpu(ssp->sda);
- ssp->sda = NULL;
+ kfree(sup->node);
+ sup->node = NULL;
+ sup->srcu_size_state = SRCU_SIZE_SMALL;
+ if (!sup->sda_is_static) {
+ free_percpu(ssp->sda);
+ ssp->sda = NULL;
+ kfree(sup);
+ ssp->srcu_sup = NULL;
+ }
}
EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
+#ifdef CONFIG_PROVE_RCU
+/*
+ * Check for consistent NMI safety.
+ */
+void srcu_check_nmi_safety(struct srcu_struct *ssp, bool nmi_safe)
+{
+ int nmi_safe_mask = 1 << nmi_safe;
+ int old_nmi_safe_mask;
+ struct srcu_data *sdp;
+
+ /* NMI-unsafe use in NMI is a bad sign */
+ WARN_ON_ONCE(!nmi_safe && in_nmi());
+ sdp = raw_cpu_ptr(ssp->sda);
+ old_nmi_safe_mask = READ_ONCE(sdp->srcu_nmi_safety);
+ if (!old_nmi_safe_mask) {
+ WRITE_ONCE(sdp->srcu_nmi_safety, nmi_safe_mask);
+ return;
+ }
+ WARN_ONCE(old_nmi_safe_mask != nmi_safe_mask, "CPU %d old state %d new state %d\n", sdp->cpu, old_nmi_safe_mask, nmi_safe_mask);
+}
+EXPORT_SYMBOL_GPL(srcu_check_nmi_safety);
+#endif /* CONFIG_PROVE_RCU */
+
/*
* Counts the new reader in the appropriate per-CPU element of the
* srcu_struct.
@@ -402,7 +712,7 @@ int __srcu_read_lock(struct srcu_struct *ssp)
int idx;
idx = READ_ONCE(ssp->srcu_idx) & 0x1;
- this_cpu_inc(ssp->sda->srcu_lock_count[idx]);
+ this_cpu_inc(ssp->sda->srcu_lock_count[idx].counter);
smp_mb(); /* B */ /* Avoid leaking the critical section. */
return idx;
}
@@ -416,38 +726,70 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock);
void __srcu_read_unlock(struct srcu_struct *ssp, int idx)
{
smp_mb(); /* C */ /* Avoid leaking the critical section. */
- this_cpu_inc(ssp->sda->srcu_unlock_count[idx]);
+ this_cpu_inc(ssp->sda->srcu_unlock_count[idx].counter);
}
EXPORT_SYMBOL_GPL(__srcu_read_unlock);
+#ifdef CONFIG_NEED_SRCU_NMI_SAFE
+
/*
- * We use an adaptive strategy for synchronize_srcu() and especially for
- * synchronize_srcu_expedited(). We spin for a fixed time period
- * (defined below) to allow SRCU readers to exit their read-side critical
- * sections. If there are still some readers after a few microseconds,
- * we repeatedly block for 1-millisecond time periods.
+ * Counts the new reader in the appropriate per-CPU element of the
+ * srcu_struct, but in an NMI-safe manner using RMW atomics.
+ * Returns an index that must be passed to the matching srcu_read_unlock().
+ */
+int __srcu_read_lock_nmisafe(struct srcu_struct *ssp)
+{
+ int idx;
+ struct srcu_data *sdp = raw_cpu_ptr(ssp->sda);
+
+ idx = READ_ONCE(ssp->srcu_idx) & 0x1;
+ atomic_long_inc(&sdp->srcu_lock_count[idx]);
+ smp_mb__after_atomic(); /* B */ /* Avoid leaking the critical section. */
+ return idx;
+}
+EXPORT_SYMBOL_GPL(__srcu_read_lock_nmisafe);
+
+/*
+ * Removes the count for the old reader from the appropriate per-CPU
+ * element of the srcu_struct. Note that this may well be a different
+ * CPU than that which was incremented by the corresponding srcu_read_lock().
*/
-#define SRCU_RETRY_CHECK_DELAY 5
+void __srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx)
+{
+ struct srcu_data *sdp = raw_cpu_ptr(ssp->sda);
+
+ smp_mb__before_atomic(); /* C */ /* Avoid leaking the critical section. */
+ atomic_long_inc(&sdp->srcu_unlock_count[idx]);
+}
+EXPORT_SYMBOL_GPL(__srcu_read_unlock_nmisafe);
+
+#endif // CONFIG_NEED_SRCU_NMI_SAFE
/*
* Start an SRCU grace period.
*/
static void srcu_gp_start(struct srcu_struct *ssp)
{
- struct srcu_data *sdp = this_cpu_ptr(ssp->sda);
+ struct srcu_data *sdp;
int state;
- lockdep_assert_held(&ACCESS_PRIVATE(ssp, lock));
- WARN_ON_ONCE(ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed));
+ if (smp_load_acquire(&ssp->srcu_sup->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER)
+ sdp = per_cpu_ptr(ssp->sda, get_boot_cpu_id());
+ else
+ sdp = this_cpu_ptr(ssp->sda);
+ lockdep_assert_held(&ACCESS_PRIVATE(ssp->srcu_sup, lock));
+ WARN_ON_ONCE(ULONG_CMP_GE(ssp->srcu_sup->srcu_gp_seq, ssp->srcu_sup->srcu_gp_seq_needed));
spin_lock_rcu_node(sdp); /* Interrupts already disabled. */
rcu_segcblist_advance(&sdp->srcu_cblist,
- rcu_seq_current(&ssp->srcu_gp_seq));
+ rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq));
(void)rcu_segcblist_accelerate(&sdp->srcu_cblist,
- rcu_seq_snap(&ssp->srcu_gp_seq));
+ rcu_seq_snap(&ssp->srcu_sup->srcu_gp_seq));
spin_unlock_rcu_node(sdp); /* Interrupts remain disabled. */
+ WRITE_ONCE(ssp->srcu_sup->srcu_gp_start, jiffies);
+ WRITE_ONCE(ssp->srcu_sup->srcu_n_exp_nodelay, 0);
smp_mb(); /* Order prior store to ->srcu_gp_seq_needed vs. GP start. */
- rcu_seq_start(&ssp->srcu_gp_seq);
- state = rcu_seq_state(ssp->srcu_gp_seq);
+ rcu_seq_start(&ssp->srcu_sup->srcu_gp_seq);
+ state = rcu_seq_state(ssp->srcu_sup->srcu_gp_seq);
WARN_ON_ONCE(state != SRCU_STATE_SCAN1);
}
@@ -508,7 +850,7 @@ static void srcu_schedule_cbs_snp(struct srcu_struct *ssp, struct srcu_node *snp
*/
static void srcu_gp_end(struct srcu_struct *ssp)
{
- unsigned long cbdelay;
+ unsigned long cbdelay = 1;
bool cbs;
bool last_lvl;
int cpu;
@@ -517,71 +859,92 @@ static void srcu_gp_end(struct srcu_struct *ssp)
int idx;
unsigned long mask;
struct srcu_data *sdp;
+ unsigned long sgsne;
struct srcu_node *snp;
+ int ss_state;
+ struct srcu_usage *sup = ssp->srcu_sup;
/* Prevent more than one additional grace period. */
- mutex_lock(&ssp->srcu_cb_mutex);
+ mutex_lock(&sup->srcu_cb_mutex);
/* End the current grace period. */
- spin_lock_irq_rcu_node(ssp);
- idx = rcu_seq_state(ssp->srcu_gp_seq);
+ spin_lock_irq_rcu_node(sup);
+ idx = rcu_seq_state(sup->srcu_gp_seq);
WARN_ON_ONCE(idx != SRCU_STATE_SCAN2);
- cbdelay = srcu_get_delay(ssp);
- WRITE_ONCE(ssp->srcu_last_gp_end, ktime_get_mono_fast_ns());
- rcu_seq_end(&ssp->srcu_gp_seq);
- gpseq = rcu_seq_current(&ssp->srcu_gp_seq);
- if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, gpseq))
- WRITE_ONCE(ssp->srcu_gp_seq_needed_exp, gpseq);
- spin_unlock_irq_rcu_node(ssp);
- mutex_unlock(&ssp->srcu_gp_mutex);
+ if (ULONG_CMP_LT(READ_ONCE(sup->srcu_gp_seq), READ_ONCE(sup->srcu_gp_seq_needed_exp)))
+ cbdelay = 0;
+
+ WRITE_ONCE(sup->srcu_last_gp_end, ktime_get_mono_fast_ns());
+ rcu_seq_end(&sup->srcu_gp_seq);
+ gpseq = rcu_seq_current(&sup->srcu_gp_seq);
+ if (ULONG_CMP_LT(sup->srcu_gp_seq_needed_exp, gpseq))
+ WRITE_ONCE(sup->srcu_gp_seq_needed_exp, gpseq);
+ spin_unlock_irq_rcu_node(sup);
+ mutex_unlock(&sup->srcu_gp_mutex);
/* A new grace period can start at this point. But only one. */
/* Initiate callback invocation as needed. */
- idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs);
- srcu_for_each_node_breadth_first(ssp, snp) {
- spin_lock_irq_rcu_node(snp);
- cbs = false;
- last_lvl = snp >= ssp->level[rcu_num_lvls - 1];
- if (last_lvl)
- cbs = snp->srcu_have_cbs[idx] == gpseq;
- snp->srcu_have_cbs[idx] = gpseq;
- rcu_seq_set_state(&snp->srcu_have_cbs[idx], 1);
- if (ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, gpseq))
- WRITE_ONCE(snp->srcu_gp_seq_needed_exp, gpseq);
- mask = snp->srcu_data_have_cbs[idx];
- snp->srcu_data_have_cbs[idx] = 0;
- spin_unlock_irq_rcu_node(snp);
- if (cbs)
- srcu_schedule_cbs_snp(ssp, snp, mask, cbdelay);
-
- /* Occasionally prevent srcu_data counter wrap. */
- if (!(gpseq & counter_wrap_check) && last_lvl)
- for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) {
- sdp = per_cpu_ptr(ssp->sda, cpu);
- spin_lock_irqsave_rcu_node(sdp, flags);
- if (ULONG_CMP_GE(gpseq,
- sdp->srcu_gp_seq_needed + 100))
- sdp->srcu_gp_seq_needed = gpseq;
- if (ULONG_CMP_GE(gpseq,
- sdp->srcu_gp_seq_needed_exp + 100))
- sdp->srcu_gp_seq_needed_exp = gpseq;
- spin_unlock_irqrestore_rcu_node(sdp, flags);
- }
+ ss_state = smp_load_acquire(&sup->srcu_size_state);
+ if (ss_state < SRCU_SIZE_WAIT_BARRIER) {
+ srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda, get_boot_cpu_id()),
+ cbdelay);
+ } else {
+ idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs);
+ srcu_for_each_node_breadth_first(ssp, snp) {
+ spin_lock_irq_rcu_node(snp);
+ cbs = false;
+ last_lvl = snp >= sup->level[rcu_num_lvls - 1];
+ if (last_lvl)
+ cbs = ss_state < SRCU_SIZE_BIG || snp->srcu_have_cbs[idx] == gpseq;
+ snp->srcu_have_cbs[idx] = gpseq;
+ rcu_seq_set_state(&snp->srcu_have_cbs[idx], 1);
+ sgsne = snp->srcu_gp_seq_needed_exp;
+ if (srcu_invl_snp_seq(sgsne) || ULONG_CMP_LT(sgsne, gpseq))
+ WRITE_ONCE(snp->srcu_gp_seq_needed_exp, gpseq);
+ if (ss_state < SRCU_SIZE_BIG)
+ mask = ~0;
+ else
+ mask = snp->srcu_data_have_cbs[idx];
+ snp->srcu_data_have_cbs[idx] = 0;
+ spin_unlock_irq_rcu_node(snp);
+ if (cbs)
+ srcu_schedule_cbs_snp(ssp, snp, mask, cbdelay);
+ }
}
+ /* Occasionally prevent srcu_data counter wrap. */
+ if (!(gpseq & counter_wrap_check))
+ for_each_possible_cpu(cpu) {
+ sdp = per_cpu_ptr(ssp->sda, cpu);
+ spin_lock_irqsave_rcu_node(sdp, flags);
+ if (ULONG_CMP_GE(gpseq, sdp->srcu_gp_seq_needed + 100))
+ sdp->srcu_gp_seq_needed = gpseq;
+ if (ULONG_CMP_GE(gpseq, sdp->srcu_gp_seq_needed_exp + 100))
+ sdp->srcu_gp_seq_needed_exp = gpseq;
+ spin_unlock_irqrestore_rcu_node(sdp, flags);
+ }
+
/* Callback initiation done, allow grace periods after next. */
- mutex_unlock(&ssp->srcu_cb_mutex);
+ mutex_unlock(&sup->srcu_cb_mutex);
/* Start a new grace period if needed. */
- spin_lock_irq_rcu_node(ssp);
- gpseq = rcu_seq_current(&ssp->srcu_gp_seq);
+ spin_lock_irq_rcu_node(sup);
+ gpseq = rcu_seq_current(&sup->srcu_gp_seq);
if (!rcu_seq_state(gpseq) &&
- ULONG_CMP_LT(gpseq, ssp->srcu_gp_seq_needed)) {
+ ULONG_CMP_LT(gpseq, sup->srcu_gp_seq_needed)) {
srcu_gp_start(ssp);
- spin_unlock_irq_rcu_node(ssp);
+ spin_unlock_irq_rcu_node(sup);
srcu_reschedule(ssp, 0);
} else {
- spin_unlock_irq_rcu_node(ssp);
+ spin_unlock_irq_rcu_node(sup);
+ }
+
+ /* Transition to big if needed. */
+ if (ss_state != SRCU_SIZE_SMALL && ss_state != SRCU_SIZE_BIG) {
+ if (ss_state == SRCU_SIZE_ALLOC)
+ init_srcu_struct_nodes(ssp, GFP_KERNEL);
+ else
+ smp_store_release(&sup->srcu_size_state, ss_state + 1);
}
}
@@ -596,23 +959,27 @@ static void srcu_funnel_exp_start(struct srcu_struct *ssp, struct srcu_node *snp
unsigned long s)
{
unsigned long flags;
+ unsigned long sgsne;
- for (; snp != NULL; snp = snp->srcu_parent) {
- if (rcu_seq_done(&ssp->srcu_gp_seq, s) ||
- ULONG_CMP_GE(READ_ONCE(snp->srcu_gp_seq_needed_exp), s))
- return;
- spin_lock_irqsave_rcu_node(snp, flags);
- if (ULONG_CMP_GE(snp->srcu_gp_seq_needed_exp, s)) {
+ if (snp)
+ for (; snp != NULL; snp = snp->srcu_parent) {
+ sgsne = READ_ONCE(snp->srcu_gp_seq_needed_exp);
+ if (WARN_ON_ONCE(rcu_seq_done(&ssp->srcu_sup->srcu_gp_seq, s)) ||
+ (!srcu_invl_snp_seq(sgsne) && ULONG_CMP_GE(sgsne, s)))
+ return;
+ spin_lock_irqsave_rcu_node(snp, flags);
+ sgsne = snp->srcu_gp_seq_needed_exp;
+ if (!srcu_invl_snp_seq(sgsne) && ULONG_CMP_GE(sgsne, s)) {
+ spin_unlock_irqrestore_rcu_node(snp, flags);
+ return;
+ }
+ WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s);
spin_unlock_irqrestore_rcu_node(snp, flags);
- return;
}
- WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s);
- spin_unlock_irqrestore_rcu_node(snp, flags);
- }
- spin_lock_irqsave_rcu_node(ssp, flags);
- if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, s))
- WRITE_ONCE(ssp->srcu_gp_seq_needed_exp, s);
- spin_unlock_irqrestore_rcu_node(ssp, flags);
+ spin_lock_irqsave_ssp_contention(ssp, &flags);
+ if (ULONG_CMP_LT(ssp->srcu_sup->srcu_gp_seq_needed_exp, s))
+ WRITE_ONCE(ssp->srcu_sup->srcu_gp_seq_needed_exp, s);
+ spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags);
}
/*
@@ -624,67 +991,85 @@ static void srcu_funnel_exp_start(struct srcu_struct *ssp, struct srcu_node *snp
*
* Note that this function also does the work of srcu_funnel_exp_start(),
* in some cases by directly invoking it.
+ *
+ * The srcu read lock should be hold around this function. And s is a seq snap
+ * after holding that lock.
*/
static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp,
unsigned long s, bool do_norm)
{
unsigned long flags;
int idx = rcu_seq_ctr(s) % ARRAY_SIZE(sdp->mynode->srcu_have_cbs);
- struct srcu_node *snp = sdp->mynode;
+ unsigned long sgsne;
+ struct srcu_node *snp;
+ struct srcu_node *snp_leaf;
unsigned long snp_seq;
+ struct srcu_usage *sup = ssp->srcu_sup;
- /* Each pass through the loop does one level of the srcu_node tree. */
- for (; snp != NULL; snp = snp->srcu_parent) {
- if (rcu_seq_done(&ssp->srcu_gp_seq, s) && snp != sdp->mynode)
- return; /* GP already done and CBs recorded. */
- spin_lock_irqsave_rcu_node(snp, flags);
- if (ULONG_CMP_GE(snp->srcu_have_cbs[idx], s)) {
+ /* Ensure that snp node tree is fully initialized before traversing it */
+ if (smp_load_acquire(&sup->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER)
+ snp_leaf = NULL;
+ else
+ snp_leaf = sdp->mynode;
+
+ if (snp_leaf)
+ /* Each pass through the loop does one level of the srcu_node tree. */
+ for (snp = snp_leaf; snp != NULL; snp = snp->srcu_parent) {
+ if (WARN_ON_ONCE(rcu_seq_done(&sup->srcu_gp_seq, s)) && snp != snp_leaf)
+ return; /* GP already done and CBs recorded. */
+ spin_lock_irqsave_rcu_node(snp, flags);
snp_seq = snp->srcu_have_cbs[idx];
- if (snp == sdp->mynode && snp_seq == s)
- snp->srcu_data_have_cbs[idx] |= sdp->grpmask;
- spin_unlock_irqrestore_rcu_node(snp, flags);
- if (snp == sdp->mynode && snp_seq != s) {
- srcu_schedule_cbs_sdp(sdp, do_norm
- ? SRCU_INTERVAL
- : 0);
+ if (!srcu_invl_snp_seq(snp_seq) && ULONG_CMP_GE(snp_seq, s)) {
+ if (snp == snp_leaf && snp_seq == s)
+ snp->srcu_data_have_cbs[idx] |= sdp->grpmask;
+ spin_unlock_irqrestore_rcu_node(snp, flags);
+ if (snp == snp_leaf && snp_seq != s) {
+ srcu_schedule_cbs_sdp(sdp, do_norm ? SRCU_INTERVAL : 0);
+ return;
+ }
+ if (!do_norm)
+ srcu_funnel_exp_start(ssp, snp, s);
return;
}
- if (!do_norm)
- srcu_funnel_exp_start(ssp, snp, s);
- return;
+ snp->srcu_have_cbs[idx] = s;
+ if (snp == snp_leaf)
+ snp->srcu_data_have_cbs[idx] |= sdp->grpmask;
+ sgsne = snp->srcu_gp_seq_needed_exp;
+ if (!do_norm && (srcu_invl_snp_seq(sgsne) || ULONG_CMP_LT(sgsne, s)))
+ WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s);
+ spin_unlock_irqrestore_rcu_node(snp, flags);
}
- snp->srcu_have_cbs[idx] = s;
- if (snp == sdp->mynode)
- snp->srcu_data_have_cbs[idx] |= sdp->grpmask;
- if (!do_norm && ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, s))
- WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s);
- spin_unlock_irqrestore_rcu_node(snp, flags);
- }
/* Top of tree, must ensure the grace period will be started. */
- spin_lock_irqsave_rcu_node(ssp, flags);
- if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed, s)) {
+ spin_lock_irqsave_ssp_contention(ssp, &flags);
+ if (ULONG_CMP_LT(sup->srcu_gp_seq_needed, s)) {
/*
* Record need for grace period s. Pair with load
* acquire setting up for initialization.
*/
- smp_store_release(&ssp->srcu_gp_seq_needed, s); /*^^^*/
+ smp_store_release(&sup->srcu_gp_seq_needed, s); /*^^^*/
}
- if (!do_norm && ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, s))
- WRITE_ONCE(ssp->srcu_gp_seq_needed_exp, s);
+ if (!do_norm && ULONG_CMP_LT(sup->srcu_gp_seq_needed_exp, s))
+ WRITE_ONCE(sup->srcu_gp_seq_needed_exp, s);
- /* If grace period not already done and none in progress, start it. */
- if (!rcu_seq_done(&ssp->srcu_gp_seq, s) &&
- rcu_seq_state(ssp->srcu_gp_seq) == SRCU_STATE_IDLE) {
- WARN_ON_ONCE(ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed));
+ /* If grace period not already in progress, start it. */
+ if (!WARN_ON_ONCE(rcu_seq_done(&sup->srcu_gp_seq, s)) &&
+ rcu_seq_state(sup->srcu_gp_seq) == SRCU_STATE_IDLE) {
+ WARN_ON_ONCE(ULONG_CMP_GE(sup->srcu_gp_seq, sup->srcu_gp_seq_needed));
srcu_gp_start(ssp);
+
+ // And how can that list_add() in the "else" clause
+ // possibly be safe for concurrent execution? Well,
+ // it isn't. And it does not have to be. After all, it
+ // can only be executed during early boot when there is only
+ // the one boot CPU running with interrupts still disabled.
if (likely(srcu_init_done))
- queue_delayed_work(rcu_gp_wq, &ssp->work,
- srcu_get_delay(ssp));
- else if (list_empty(&ssp->work.work.entry))
- list_add(&ssp->work.work.entry, &srcu_boot_list);
+ queue_delayed_work(rcu_gp_wq, &sup->work,
+ !!srcu_get_delay(ssp));
+ else if (list_empty(&sup->work.work.entry))
+ list_add(&sup->work.work.entry, &srcu_boot_list);
}
- spin_unlock_irqrestore_rcu_node(ssp, flags);
+ spin_unlock_irqrestore_rcu_node(sup, flags);
}
/*
@@ -694,12 +1079,16 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp,
*/
static bool try_check_zero(struct srcu_struct *ssp, int idx, int trycount)
{
+ unsigned long curdelay;
+
+ curdelay = !srcu_get_delay(ssp);
+
for (;;) {
if (srcu_readers_active_idx_check(ssp, idx))
return true;
- if (--trycount + !srcu_get_delay(ssp) <= 0)
+ if ((--trycount + curdelay) <= 0)
return false;
- udelay(SRCU_RETRY_CHECK_DELAY);
+ udelay(srcu_retry_check_delay);
}
}
@@ -711,23 +1100,44 @@ static bool try_check_zero(struct srcu_struct *ssp, int idx, int trycount)
static void srcu_flip(struct srcu_struct *ssp)
{
/*
- * Ensure that if this updater saw a given reader's increment
- * from __srcu_read_lock(), that reader was using an old value
- * of ->srcu_idx. Also ensure that if a given reader sees the
- * new value of ->srcu_idx, this updater's earlier scans cannot
- * have seen that reader's increments (which is OK, because this
- * grace period need not wait on that reader).
+ * Because the flip of ->srcu_idx is executed only if the
+ * preceding call to srcu_readers_active_idx_check() found that
+ * the ->srcu_unlock_count[] and ->srcu_lock_count[] sums matched
+ * and because that summing uses atomic_long_read(), there is
+ * ordering due to a control dependency between that summing and
+ * the WRITE_ONCE() in this call to srcu_flip(). This ordering
+ * ensures that if this updater saw a given reader's increment from
+ * __srcu_read_lock(), that reader was using a value of ->srcu_idx
+ * from before the previous call to srcu_flip(), which should be
+ * quite rare. This ordering thus helps forward progress because
+ * the grace period could otherwise be delayed by additional
+ * calls to __srcu_read_lock() using that old (soon to be new)
+ * value of ->srcu_idx.
+ *
+ * This sum-equality check and ordering also ensures that if
+ * a given call to __srcu_read_lock() uses the new value of
+ * ->srcu_idx, this updater's earlier scans cannot have seen
+ * that reader's increments, which is all to the good, because
+ * this grace period need not wait on that reader. After all,
+ * if those earlier scans had seen that reader, there would have
+ * been a sum mismatch and this code would not be reached.
+ *
+ * This means that the following smp_mb() is redundant, but
+ * it stays until either (1) Compilers learn about this sort of
+ * control dependency or (2) Some production workload running on
+ * a production system is unduly delayed by this slowpath smp_mb().
*/
smp_mb(); /* E */ /* Pairs with B and C. */
- WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1);
+ WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1); // Flip the counter.
/*
* Ensure that if the updater misses an __srcu_read_unlock()
- * increment, that task's next __srcu_read_lock() will see the
- * above counter update. Note that both this memory barrier
- * and the one in srcu_readers_active_idx_check() provide the
- * guarantee for __srcu_read_lock().
+ * increment, that task's __srcu_read_lock() following its next
+ * __srcu_read_lock() or __srcu_read_unlock() will see the above
+ * counter update. Note that both this memory barrier and the
+ * one in srcu_readers_active_idx_check() provide the guarantee
+ * for __srcu_read_lock().
*/
smp_mb(); /* D */ /* Pairs with C. */
}
@@ -779,18 +1189,18 @@ static bool srcu_might_be_idle(struct srcu_struct *ssp)
/* First, see if enough time has passed since the last GP. */
t = ktime_get_mono_fast_ns();
- tlast = READ_ONCE(ssp->srcu_last_gp_end);
+ tlast = READ_ONCE(ssp->srcu_sup->srcu_last_gp_end);
if (exp_holdoff == 0 ||
time_in_range_open(t, tlast, tlast + exp_holdoff))
return false; /* Too soon after last GP. */
/* Next, check for probable idleness. */
- curseq = rcu_seq_current(&ssp->srcu_gp_seq);
+ curseq = rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq);
smp_mb(); /* Order ->srcu_gp_seq with ->srcu_gp_seq_needed. */
- if (ULONG_CMP_LT(curseq, READ_ONCE(ssp->srcu_gp_seq_needed)))
+ if (ULONG_CMP_LT(curseq, READ_ONCE(ssp->srcu_sup->srcu_gp_seq_needed)))
return false; /* Grace period in progress, so not idle. */
smp_mb(); /* Order ->srcu_gp_seq with prior access. */
- if (curseq != rcu_seq_current(&ssp->srcu_gp_seq))
+ if (curseq != rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq))
return false; /* GP # changed, so not idle. */
return true; /* With reasonable probability, idle! */
}
@@ -814,16 +1224,27 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp,
bool needgp = false;
unsigned long s;
struct srcu_data *sdp;
+ struct srcu_node *sdp_mynode;
+ int ss_state;
check_init_srcu_struct(ssp);
- idx = srcu_read_lock(ssp);
- sdp = raw_cpu_ptr(ssp->sda);
- spin_lock_irqsave_rcu_node(sdp, flags);
+ /*
+ * While starting a new grace period, make sure we are in an
+ * SRCU read-side critical section so that the grace-period
+ * sequence number cannot wrap around in the meantime.
+ */
+ idx = __srcu_read_lock_nmisafe(ssp);
+ ss_state = smp_load_acquire(&ssp->srcu_sup->srcu_size_state);
+ if (ss_state < SRCU_SIZE_WAIT_CALL)
+ sdp = per_cpu_ptr(ssp->sda, get_boot_cpu_id());
+ else
+ sdp = raw_cpu_ptr(ssp->sda);
+ spin_lock_irqsave_sdp_contention(sdp, &flags);
if (rhp)
rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp);
rcu_segcblist_advance(&sdp->srcu_cblist,
- rcu_seq_current(&ssp->srcu_gp_seq));
- s = rcu_seq_snap(&ssp->srcu_gp_seq);
+ rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq));
+ s = rcu_seq_snap(&ssp->srcu_sup->srcu_gp_seq);
(void)rcu_segcblist_accelerate(&sdp->srcu_cblist, s);
if (ULONG_CMP_LT(sdp->srcu_gp_seq_needed, s)) {
sdp->srcu_gp_seq_needed = s;
@@ -834,11 +1255,18 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp,
needexp = true;
}
spin_unlock_irqrestore_rcu_node(sdp, flags);
+
+ /* Ensure that snp node tree is fully initialized before traversing it */
+ if (ss_state < SRCU_SIZE_WAIT_BARRIER)
+ sdp_mynode = NULL;
+ else
+ sdp_mynode = sdp->mynode;
+
if (needgp)
srcu_funnel_gp_start(ssp, sdp, s, do_norm);
else if (needexp)
- srcu_funnel_exp_start(ssp, sdp->mynode, s);
- srcu_read_unlock(ssp, idx);
+ srcu_funnel_exp_start(ssp, sdp_mynode, s);
+ __srcu_read_unlock_nmisafe(ssp, idx);
return s;
}
@@ -914,6 +1342,8 @@ static void __synchronize_srcu(struct srcu_struct *ssp, bool do_norm)
{
struct rcu_synchronize rcu;
+ srcu_lock_sync(&ssp->dep_map);
+
RCU_LOCKDEP_WARN(lockdep_is_held(ssp) ||
lock_is_held(&rcu_bh_lock_map) ||
lock_is_held(&rcu_lock_map) ||
@@ -1027,7 +1457,7 @@ unsigned long get_state_synchronize_srcu(struct srcu_struct *ssp)
// Any prior manipulation of SRCU-protected data must happen
// before the load from ->srcu_gp_seq.
smp_mb();
- return rcu_seq_snap(&ssp->srcu_gp_seq);
+ return rcu_seq_snap(&ssp->srcu_sup->srcu_gp_seq);
}
EXPORT_SYMBOL_GPL(get_state_synchronize_srcu);
@@ -1074,7 +1504,7 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_srcu);
*/
bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie)
{
- if (!rcu_seq_done(&ssp->srcu_gp_seq, cookie))
+ if (!rcu_seq_done(&ssp->srcu_sup->srcu_gp_seq, cookie))
return false;
// Ensure that the end of the SRCU grace period happens before
// any subsequent code that the caller might execute.
@@ -1093,8 +1523,30 @@ static void srcu_barrier_cb(struct rcu_head *rhp)
sdp = container_of(rhp, struct srcu_data, srcu_barrier_head);
ssp = sdp->ssp;
- if (atomic_dec_and_test(&ssp->srcu_barrier_cpu_cnt))
- complete(&ssp->srcu_barrier_completion);
+ if (atomic_dec_and_test(&ssp->srcu_sup->srcu_barrier_cpu_cnt))
+ complete(&ssp->srcu_sup->srcu_barrier_completion);
+}
+
+/*
+ * Enqueue an srcu_barrier() callback on the specified srcu_data
+ * structure's ->cblist. but only if that ->cblist already has at least one
+ * callback enqueued. Note that if a CPU already has callbacks enqueue,
+ * it must have already registered the need for a future grace period,
+ * so all we need do is enqueue a callback that will use the same grace
+ * period as the last callback already in the queue.
+ */
+static void srcu_barrier_one_cpu(struct srcu_struct *ssp, struct srcu_data *sdp)
+{
+ spin_lock_irq_rcu_node(sdp);
+ atomic_inc(&ssp->srcu_sup->srcu_barrier_cpu_cnt);
+ sdp->srcu_barrier_head.func = srcu_barrier_cb;
+ debug_rcu_head_queue(&sdp->srcu_barrier_head);
+ if (!rcu_segcblist_entrain(&sdp->srcu_cblist,
+ &sdp->srcu_barrier_head)) {
+ debug_rcu_head_unqueue(&sdp->srcu_barrier_head);
+ atomic_dec(&ssp->srcu_sup->srcu_barrier_cpu_cnt);
+ }
+ spin_unlock_irq_rcu_node(sdp);
}
/**
@@ -1104,51 +1556,37 @@ static void srcu_barrier_cb(struct rcu_head *rhp)
void srcu_barrier(struct srcu_struct *ssp)
{
int cpu;
- struct srcu_data *sdp;
- unsigned long s = rcu_seq_snap(&ssp->srcu_barrier_seq);
+ int idx;
+ unsigned long s = rcu_seq_snap(&ssp->srcu_sup->srcu_barrier_seq);
check_init_srcu_struct(ssp);
- mutex_lock(&ssp->srcu_barrier_mutex);
- if (rcu_seq_done(&ssp->srcu_barrier_seq, s)) {
+ mutex_lock(&ssp->srcu_sup->srcu_barrier_mutex);
+ if (rcu_seq_done(&ssp->srcu_sup->srcu_barrier_seq, s)) {
smp_mb(); /* Force ordering following return. */
- mutex_unlock(&ssp->srcu_barrier_mutex);
+ mutex_unlock(&ssp->srcu_sup->srcu_barrier_mutex);
return; /* Someone else did our work for us. */
}
- rcu_seq_start(&ssp->srcu_barrier_seq);
- init_completion(&ssp->srcu_barrier_completion);
+ rcu_seq_start(&ssp->srcu_sup->srcu_barrier_seq);
+ init_completion(&ssp->srcu_sup->srcu_barrier_completion);
/* Initial count prevents reaching zero until all CBs are posted. */
- atomic_set(&ssp->srcu_barrier_cpu_cnt, 1);
+ atomic_set(&ssp->srcu_sup->srcu_barrier_cpu_cnt, 1);
- /*
- * Each pass through this loop enqueues a callback, but only
- * on CPUs already having callbacks enqueued. Note that if
- * a CPU already has callbacks enqueue, it must have already
- * registered the need for a future grace period, so all we
- * need do is enqueue a callback that will use the same
- * grace period as the last callback already in the queue.
- */
- for_each_possible_cpu(cpu) {
- sdp = per_cpu_ptr(ssp->sda, cpu);
- spin_lock_irq_rcu_node(sdp);
- atomic_inc(&ssp->srcu_barrier_cpu_cnt);
- sdp->srcu_barrier_head.func = srcu_barrier_cb;
- debug_rcu_head_queue(&sdp->srcu_barrier_head);
- if (!rcu_segcblist_entrain(&sdp->srcu_cblist,
- &sdp->srcu_barrier_head)) {
- debug_rcu_head_unqueue(&sdp->srcu_barrier_head);
- atomic_dec(&ssp->srcu_barrier_cpu_cnt);
- }
- spin_unlock_irq_rcu_node(sdp);
- }
+ idx = __srcu_read_lock_nmisafe(ssp);
+ if (smp_load_acquire(&ssp->srcu_sup->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER)
+ srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda, get_boot_cpu_id()));
+ else
+ for_each_possible_cpu(cpu)
+ srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda, cpu));
+ __srcu_read_unlock_nmisafe(ssp, idx);
/* Remove the initial count, at which point reaching zero can happen. */
- if (atomic_dec_and_test(&ssp->srcu_barrier_cpu_cnt))
- complete(&ssp->srcu_barrier_completion);
- wait_for_completion(&ssp->srcu_barrier_completion);
+ if (atomic_dec_and_test(&ssp->srcu_sup->srcu_barrier_cpu_cnt))
+ complete(&ssp->srcu_sup->srcu_barrier_completion);
+ wait_for_completion(&ssp->srcu_sup->srcu_barrier_completion);
- rcu_seq_end(&ssp->srcu_barrier_seq);
- mutex_unlock(&ssp->srcu_barrier_mutex);
+ rcu_seq_end(&ssp->srcu_sup->srcu_barrier_seq);
+ mutex_unlock(&ssp->srcu_sup->srcu_barrier_mutex);
}
EXPORT_SYMBOL_GPL(srcu_barrier);
@@ -1174,7 +1612,7 @@ static void srcu_advance_state(struct srcu_struct *ssp)
{
int idx;
- mutex_lock(&ssp->srcu_gp_mutex);
+ mutex_lock(&ssp->srcu_sup->srcu_gp_mutex);
/*
* Because readers might be delayed for an extended period after
@@ -1186,38 +1624,39 @@ static void srcu_advance_state(struct srcu_struct *ssp)
* The load-acquire ensures that we see the accesses performed
* by the prior grace period.
*/
- idx = rcu_seq_state(smp_load_acquire(&ssp->srcu_gp_seq)); /* ^^^ */
+ idx = rcu_seq_state(smp_load_acquire(&ssp->srcu_sup->srcu_gp_seq)); /* ^^^ */
if (idx == SRCU_STATE_IDLE) {
- spin_lock_irq_rcu_node(ssp);
- if (ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed)) {
- WARN_ON_ONCE(rcu_seq_state(ssp->srcu_gp_seq));
- spin_unlock_irq_rcu_node(ssp);
- mutex_unlock(&ssp->srcu_gp_mutex);
+ spin_lock_irq_rcu_node(ssp->srcu_sup);
+ if (ULONG_CMP_GE(ssp->srcu_sup->srcu_gp_seq, ssp->srcu_sup->srcu_gp_seq_needed)) {
+ WARN_ON_ONCE(rcu_seq_state(ssp->srcu_sup->srcu_gp_seq));
+ spin_unlock_irq_rcu_node(ssp->srcu_sup);
+ mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex);
return;
}
- idx = rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq));
+ idx = rcu_seq_state(READ_ONCE(ssp->srcu_sup->srcu_gp_seq));
if (idx == SRCU_STATE_IDLE)
srcu_gp_start(ssp);
- spin_unlock_irq_rcu_node(ssp);
+ spin_unlock_irq_rcu_node(ssp->srcu_sup);
if (idx != SRCU_STATE_IDLE) {
- mutex_unlock(&ssp->srcu_gp_mutex);
+ mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex);
return; /* Someone else started the grace period. */
}
}
- if (rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) == SRCU_STATE_SCAN1) {
+ if (rcu_seq_state(READ_ONCE(ssp->srcu_sup->srcu_gp_seq)) == SRCU_STATE_SCAN1) {
idx = 1 ^ (ssp->srcu_idx & 1);
if (!try_check_zero(ssp, idx, 1)) {
- mutex_unlock(&ssp->srcu_gp_mutex);
+ mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex);
return; /* readers present, retry later. */
}
srcu_flip(ssp);
- spin_lock_irq_rcu_node(ssp);
- rcu_seq_set_state(&ssp->srcu_gp_seq, SRCU_STATE_SCAN2);
- spin_unlock_irq_rcu_node(ssp);
+ spin_lock_irq_rcu_node(ssp->srcu_sup);
+ rcu_seq_set_state(&ssp->srcu_sup->srcu_gp_seq, SRCU_STATE_SCAN2);
+ ssp->srcu_sup->srcu_n_exp_nodelay = 0;
+ spin_unlock_irq_rcu_node(ssp->srcu_sup);
}
- if (rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) == SRCU_STATE_SCAN2) {
+ if (rcu_seq_state(READ_ONCE(ssp->srcu_sup->srcu_gp_seq)) == SRCU_STATE_SCAN2) {
/*
* SRCU read-side critical sections are normally short,
@@ -1225,9 +1664,10 @@ static void srcu_advance_state(struct srcu_struct *ssp)
*/
idx = 1 ^ (ssp->srcu_idx & 1);
if (!try_check_zero(ssp, idx, 2)) {
- mutex_unlock(&ssp->srcu_gp_mutex);
+ mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex);
return; /* readers present, retry later. */
}
+ ssp->srcu_sup->srcu_n_exp_nodelay = 0;
srcu_gp_end(ssp); /* Releases ->srcu_gp_mutex. */
}
}
@@ -1253,7 +1693,7 @@ static void srcu_invoke_callbacks(struct work_struct *work)
rcu_cblist_init(&ready_cbs);
spin_lock_irq_rcu_node(sdp);
rcu_segcblist_advance(&sdp->srcu_cblist,
- rcu_seq_current(&ssp->srcu_gp_seq));
+ rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq));
if (sdp->srcu_cblist_invoking ||
!rcu_segcblist_ready_cbs(&sdp->srcu_cblist)) {
spin_unlock_irq_rcu_node(sdp);
@@ -1281,7 +1721,7 @@ static void srcu_invoke_callbacks(struct work_struct *work)
spin_lock_irq_rcu_node(sdp);
rcu_segcblist_add_len(&sdp->srcu_cblist, -len);
(void)rcu_segcblist_accelerate(&sdp->srcu_cblist,
- rcu_seq_snap(&ssp->srcu_gp_seq));
+ rcu_seq_snap(&ssp->srcu_sup->srcu_gp_seq));
sdp->srcu_cblist_invoking = false;
more = rcu_segcblist_ready_cbs(&sdp->srcu_cblist);
spin_unlock_irq_rcu_node(sdp);
@@ -1297,20 +1737,20 @@ static void srcu_reschedule(struct srcu_struct *ssp, unsigned long delay)
{
bool pushgp = true;
- spin_lock_irq_rcu_node(ssp);
- if (ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed)) {
- if (!WARN_ON_ONCE(rcu_seq_state(ssp->srcu_gp_seq))) {
+ spin_lock_irq_rcu_node(ssp->srcu_sup);
+ if (ULONG_CMP_GE(ssp->srcu_sup->srcu_gp_seq, ssp->srcu_sup->srcu_gp_seq_needed)) {
+ if (!WARN_ON_ONCE(rcu_seq_state(ssp->srcu_sup->srcu_gp_seq))) {
/* All requests fulfilled, time to go idle. */
pushgp = false;
}
- } else if (!rcu_seq_state(ssp->srcu_gp_seq)) {
+ } else if (!rcu_seq_state(ssp->srcu_sup->srcu_gp_seq)) {
/* Outstanding request and no GP. Start one. */
srcu_gp_start(ssp);
}
- spin_unlock_irq_rcu_node(ssp);
+ spin_unlock_irq_rcu_node(ssp->srcu_sup);
if (pushgp)
- queue_delayed_work(rcu_gp_wq, &ssp->work, delay);
+ queue_delayed_work(rcu_gp_wq, &ssp->srcu_sup->work, delay);
}
/*
@@ -1318,12 +1758,30 @@ static void srcu_reschedule(struct srcu_struct *ssp, unsigned long delay)
*/
static void process_srcu(struct work_struct *work)
{
+ unsigned long curdelay;
+ unsigned long j;
struct srcu_struct *ssp;
+ struct srcu_usage *sup;
- ssp = container_of(work, struct srcu_struct, work.work);
+ sup = container_of(work, struct srcu_usage, work.work);
+ ssp = sup->srcu_ssp;
srcu_advance_state(ssp);
- srcu_reschedule(ssp, srcu_get_delay(ssp));
+ curdelay = srcu_get_delay(ssp);
+ if (curdelay) {
+ WRITE_ONCE(sup->reschedule_count, 0);
+ } else {
+ j = jiffies;
+ if (READ_ONCE(sup->reschedule_jiffies) == j) {
+ WRITE_ONCE(sup->reschedule_count, READ_ONCE(sup->reschedule_count) + 1);
+ if (READ_ONCE(sup->reschedule_count) > srcu_max_nodelay)
+ curdelay = 1;
+ } else {
+ WRITE_ONCE(sup->reschedule_count, 1);
+ WRITE_ONCE(sup->reschedule_jiffies, j);
+ }
+ }
+ srcu_reschedule(ssp, curdelay);
}
void srcutorture_get_gp_data(enum rcutorture_type test_type,
@@ -1333,47 +1791,73 @@ void srcutorture_get_gp_data(enum rcutorture_type test_type,
if (test_type != SRCU_FLAVOR)
return;
*flags = 0;
- *gp_seq = rcu_seq_current(&ssp->srcu_gp_seq);
+ *gp_seq = rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq);
}
EXPORT_SYMBOL_GPL(srcutorture_get_gp_data);
+static const char * const srcu_size_state_name[] = {
+ "SRCU_SIZE_SMALL",
+ "SRCU_SIZE_ALLOC",
+ "SRCU_SIZE_WAIT_BARRIER",
+ "SRCU_SIZE_WAIT_CALL",
+ "SRCU_SIZE_WAIT_CBS1",
+ "SRCU_SIZE_WAIT_CBS2",
+ "SRCU_SIZE_WAIT_CBS3",
+ "SRCU_SIZE_WAIT_CBS4",
+ "SRCU_SIZE_BIG",
+ "SRCU_SIZE_???",
+};
+
void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf)
{
int cpu;
int idx;
unsigned long s0 = 0, s1 = 0;
+ int ss_state = READ_ONCE(ssp->srcu_sup->srcu_size_state);
+ int ss_state_idx = ss_state;
idx = ssp->srcu_idx & 0x1;
- pr_alert("%s%s Tree SRCU g%ld per-CPU(idx=%d):",
- tt, tf, rcu_seq_current(&ssp->srcu_gp_seq), idx);
- for_each_possible_cpu(cpu) {
- unsigned long l0, l1;
- unsigned long u0, u1;
- long c0, c1;
- struct srcu_data *sdp;
-
- sdp = per_cpu_ptr(ssp->sda, cpu);
- u0 = data_race(sdp->srcu_unlock_count[!idx]);
- u1 = data_race(sdp->srcu_unlock_count[idx]);
-
- /*
- * Make sure that a lock is always counted if the corresponding
- * unlock is counted.
- */
- smp_rmb();
-
- l0 = data_race(sdp->srcu_lock_count[!idx]);
- l1 = data_race(sdp->srcu_lock_count[idx]);
-
- c0 = l0 - u0;
- c1 = l1 - u1;
- pr_cont(" %d(%ld,%ld %c)",
- cpu, c0, c1,
- "C."[rcu_segcblist_empty(&sdp->srcu_cblist)]);
- s0 += c0;
- s1 += c1;
+ if (ss_state < 0 || ss_state >= ARRAY_SIZE(srcu_size_state_name))
+ ss_state_idx = ARRAY_SIZE(srcu_size_state_name) - 1;
+ pr_alert("%s%s Tree SRCU g%ld state %d (%s)",
+ tt, tf, rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq), ss_state,
+ srcu_size_state_name[ss_state_idx]);
+ if (!ssp->sda) {
+ // Called after cleanup_srcu_struct(), perhaps.
+ pr_cont(" No per-CPU srcu_data structures (->sda == NULL).\n");
+ } else {
+ pr_cont(" per-CPU(idx=%d):", idx);
+ for_each_possible_cpu(cpu) {
+ unsigned long l0, l1;
+ unsigned long u0, u1;
+ long c0, c1;
+ struct srcu_data *sdp;
+
+ sdp = per_cpu_ptr(ssp->sda, cpu);
+ u0 = data_race(atomic_long_read(&sdp->srcu_unlock_count[!idx]));
+ u1 = data_race(atomic_long_read(&sdp->srcu_unlock_count[idx]));
+
+ /*
+ * Make sure that a lock is always counted if the corresponding
+ * unlock is counted.
+ */
+ smp_rmb();
+
+ l0 = data_race(atomic_long_read(&sdp->srcu_lock_count[!idx]));
+ l1 = data_race(atomic_long_read(&sdp->srcu_lock_count[idx]));
+
+ c0 = l0 - u0;
+ c1 = l1 - u1;
+ pr_cont(" %d(%ld,%ld %c)",
+ cpu, c0, c1,
+ "C."[rcu_segcblist_empty(&sdp->srcu_cblist)]);
+ s0 += c0;
+ s1 += c1;
+ }
+ pr_cont(" T(%ld,%ld)\n", s0, s1);
}
- pr_cont(" T(%ld,%ld)\n", s0, s1);
+ if (SRCU_SIZING_IS_TORTURE())
+ srcu_transition_to_big(ssp);
}
EXPORT_SYMBOL_GPL(srcu_torture_stats_print);
@@ -1382,13 +1866,29 @@ static int __init srcu_bootup_announce(void)
pr_info("Hierarchical SRCU implementation.\n");
if (exp_holdoff != DEFAULT_SRCU_EXP_HOLDOFF)
pr_info("\tNon-default auto-expedite holdoff of %lu ns.\n", exp_holdoff);
+ if (srcu_retry_check_delay != SRCU_DEFAULT_RETRY_CHECK_DELAY)
+ pr_info("\tNon-default retry check delay of %lu us.\n", srcu_retry_check_delay);
+ if (srcu_max_nodelay != SRCU_DEFAULT_MAX_NODELAY)
+ pr_info("\tNon-default max no-delay of %lu.\n", srcu_max_nodelay);
+ pr_info("\tMax phase no-delay instances is %lu.\n", srcu_max_nodelay_phase);
return 0;
}
early_initcall(srcu_bootup_announce);
void __init srcu_init(void)
{
- struct srcu_struct *ssp;
+ struct srcu_usage *sup;
+
+ /* Decide on srcu_struct-size strategy. */
+ if (SRCU_SIZING_IS(SRCU_SIZING_AUTO)) {
+ if (nr_cpu_ids >= big_cpu_lim) {
+ convert_to_big = SRCU_SIZING_INIT; // Don't bother waiting for contention.
+ pr_info("%s: Setting srcu_struct sizes to big.\n", __func__);
+ } else {
+ convert_to_big = SRCU_SIZING_NONE | SRCU_SIZING_CONTEND;
+ pr_info("%s: Setting srcu_struct sizes based on contention.\n", __func__);
+ }
+ }
/*
* Once that is set, call_srcu() can follow the normal path and
@@ -1397,10 +1897,13 @@ void __init srcu_init(void)
*/
srcu_init_done = true;
while (!list_empty(&srcu_boot_list)) {
- ssp = list_first_entry(&srcu_boot_list, struct srcu_struct,
+ sup = list_first_entry(&srcu_boot_list, struct srcu_usage,
work.work.entry);
- list_del_init(&ssp->work.work.entry);
- queue_work(rcu_gp_wq, &ssp->work.work);
+ list_del_init(&sup->work.work.entry);
+ if (SRCU_SIZING_IS(SRCU_SIZING_INIT) &&
+ sup->srcu_size_state == SRCU_SIZE_SMALL)
+ sup->srcu_size_state = SRCU_SIZE_ALLOC;
+ queue_work(rcu_gp_wq, &sup->work.work);
}
}
@@ -1410,13 +1913,14 @@ void __init srcu_init(void)
static int srcu_module_coming(struct module *mod)
{
int i;
+ struct srcu_struct *ssp;
struct srcu_struct **sspp = mod->srcu_struct_ptrs;
- int ret;
for (i = 0; i < mod->num_srcu_structs; i++) {
- ret = init_srcu_struct(*(sspp++));
- if (WARN_ON_ONCE(ret))
- return ret;
+ ssp = *(sspp++);
+ ssp->sda = alloc_percpu(struct srcu_data);
+ if (WARN_ON_ONCE(!ssp->sda))
+ return -ENOMEM;
}
return 0;
}
@@ -1425,10 +1929,17 @@ static int srcu_module_coming(struct module *mod)
static void srcu_module_going(struct module *mod)
{
int i;
+ struct srcu_struct *ssp;
struct srcu_struct **sspp = mod->srcu_struct_ptrs;
- for (i = 0; i < mod->num_srcu_structs; i++)
- cleanup_srcu_struct(*(sspp++));
+ for (i = 0; i < mod->num_srcu_structs; i++) {
+ ssp = *(sspp++);
+ if (!rcu_seq_state(smp_load_acquire(&ssp->srcu_sup->srcu_gp_seq_needed)) &&
+ !WARN_ON_ONCE(!ssp->srcu_sup->sda_is_static))
+ cleanup_srcu_struct(ssp);
+ if (!WARN_ON(srcu_readers_active(ssp)))
+ free_percpu(ssp->sda);
+ }
}
/* Handle one module, either coming or going. */
diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c
index 33d896d85902..e550f97779b8 100644
--- a/kernel/rcu/sync.c
+++ b/kernel/rcu/sync.c
@@ -44,7 +44,7 @@ static void rcu_sync_func(struct rcu_head *rhp);
static void rcu_sync_call(struct rcu_sync *rsp)
{
- call_rcu(&rsp->cb_head, rcu_sync_func);
+ call_rcu_hurry(&rsp->cb_head, rcu_sync_func);
}
/**
@@ -111,7 +111,7 @@ static void rcu_sync_func(struct rcu_head *rhp)
* a slowpath during the update. After this function returns, all
* subsequent calls to rcu_sync_is_idle() will return false, which
* tells readers to stay off their fastpaths. A later call to
- * rcu_sync_exit() re-enables reader slowpaths.
+ * rcu_sync_exit() re-enables reader fastpaths.
*
* When called in isolation, rcu_sync_enter() must wait for a grace
* period, however, closely spaced calls to rcu_sync_enter() can
diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index 8536c55df514..5f4fc8184dd0 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -6,6 +6,7 @@
*/
#ifdef CONFIG_TASKS_RCU_GENERIC
+#include "rcu_segcblist.h"
////////////////////////////////////////////////////////////////////////
//
@@ -13,18 +14,43 @@
struct rcu_tasks;
typedef void (*rcu_tasks_gp_func_t)(struct rcu_tasks *rtp);
-typedef void (*pregp_func_t)(void);
+typedef void (*pregp_func_t)(struct list_head *hop);
typedef void (*pertask_func_t)(struct task_struct *t, struct list_head *hop);
typedef void (*postscan_func_t)(struct list_head *hop);
typedef void (*holdouts_func_t)(struct list_head *hop, bool ndrpt, bool *frptp);
typedef void (*postgp_func_t)(struct rcu_tasks *rtp);
/**
+ * struct rcu_tasks_percpu - Per-CPU component of definition for a Tasks-RCU-like mechanism.
+ * @cblist: Callback list.
+ * @lock: Lock protecting per-CPU callback list.
+ * @rtp_jiffies: Jiffies counter value for statistics.
+ * @rtp_n_lock_retries: Rough lock-contention statistic.
+ * @rtp_work: Work queue for invoking callbacks.
+ * @rtp_irq_work: IRQ work queue for deferred wakeups.
+ * @barrier_q_head: RCU callback for barrier operation.
+ * @rtp_blkd_tasks: List of tasks blocked as readers.
+ * @cpu: CPU number corresponding to this entry.
+ * @rtpp: Pointer to the rcu_tasks structure.
+ */
+struct rcu_tasks_percpu {
+ struct rcu_segcblist cblist;
+ raw_spinlock_t __private lock;
+ unsigned long rtp_jiffies;
+ unsigned long rtp_n_lock_retries;
+ struct work_struct rtp_work;
+ struct irq_work rtp_irq_work;
+ struct rcu_head barrier_q_head;
+ struct list_head rtp_blkd_tasks;
+ int cpu;
+ struct rcu_tasks *rtpp;
+};
+
+/**
* struct rcu_tasks - Definition for a Tasks-RCU-like mechanism.
- * @cbs_head: Head of callback list.
- * @cbs_tail: Tail pointer for callback list.
- * @cbs_wq: Wait queue allowing new callback to get kthread's attention.
- * @cbs_lock: Lock protecting callback list.
+ * @cbs_wait: RCU wait allowing a new callback to get kthread's attention.
+ * @cbs_gbl_lock: Lock protecting callback list.
+ * @tasks_gp_mutex: Mutex protecting grace period, needed during mid-boot dead zone.
* @kthread_ptr: This flavor's grace-period/callback-invocation kthread.
* @gp_func: This flavor's grace-period-wait function.
* @gp_state: Grace period's most recent state transition (debugging).
@@ -32,7 +58,7 @@ typedef void (*postgp_func_t)(struct rcu_tasks *rtp);
* @init_fract: Initial backoff sleep interval.
* @gp_jiffies: Time of last @gp_state transition.
* @gp_start: Most recent grace-period start in jiffies.
- * @n_gps: Number of grace periods completed since boot.
+ * @tasks_gp_seq: Number of grace periods completed since boot.
* @n_ipis: Number of IPIs sent to encourage grace periods to end.
* @n_ipis_fails: Number of IPI-send failures.
* @pregp_func: This flavor's pre-grace-period function (optional).
@@ -41,20 +67,28 @@ typedef void (*postgp_func_t)(struct rcu_tasks *rtp);
* @holdouts_func: This flavor's holdout-list scan function (optional).
* @postgp_func: This flavor's post-grace-period function (optional).
* @call_func: This flavor's call_rcu()-equivalent function.
+ * @rtpcpu: This flavor's rcu_tasks_percpu structure.
+ * @percpu_enqueue_shift: Shift down CPU ID this much when enqueuing callbacks.
+ * @percpu_enqueue_lim: Number of per-CPU callback queues in use for enqueuing.
+ * @percpu_dequeue_lim: Number of per-CPU callback queues in use for dequeuing.
+ * @percpu_dequeue_gpseq: RCU grace-period number to propagate enqueue limit to dequeuers.
+ * @barrier_q_mutex: Serialize barrier operations.
+ * @barrier_q_count: Number of queues being waited on.
+ * @barrier_q_completion: Barrier wait/wakeup mechanism.
+ * @barrier_q_seq: Sequence number for barrier operations.
* @name: This flavor's textual name.
* @kname: This flavor's kthread name.
*/
struct rcu_tasks {
- struct rcu_head *cbs_head;
- struct rcu_head **cbs_tail;
- struct wait_queue_head cbs_wq;
- raw_spinlock_t cbs_lock;
+ struct rcuwait cbs_wait;
+ raw_spinlock_t cbs_gbl_lock;
+ struct mutex tasks_gp_mutex;
int gp_state;
int gp_sleep;
int init_fract;
unsigned long gp_jiffies;
unsigned long gp_start;
- unsigned long n_gps;
+ unsigned long tasks_gp_seq;
unsigned long n_ipis;
unsigned long n_ipis_fails;
struct task_struct *kthread_ptr;
@@ -65,24 +99,53 @@ struct rcu_tasks {
holdouts_func_t holdouts_func;
postgp_func_t postgp_func;
call_rcu_func_t call_func;
+ struct rcu_tasks_percpu __percpu *rtpcpu;
+ int percpu_enqueue_shift;
+ int percpu_enqueue_lim;
+ int percpu_dequeue_lim;
+ unsigned long percpu_dequeue_gpseq;
+ struct mutex barrier_q_mutex;
+ atomic_t barrier_q_count;
+ struct completion barrier_q_completion;
+ unsigned long barrier_q_seq;
char *name;
char *kname;
};
-#define DEFINE_RCU_TASKS(rt_name, gp, call, n) \
-static struct rcu_tasks rt_name = \
-{ \
- .cbs_tail = &rt_name.cbs_head, \
- .cbs_wq = __WAIT_QUEUE_HEAD_INITIALIZER(rt_name.cbs_wq), \
- .cbs_lock = __RAW_SPIN_LOCK_UNLOCKED(rt_name.cbs_lock), \
- .gp_func = gp, \
- .call_func = call, \
- .name = n, \
- .kname = #rt_name, \
+static void call_rcu_tasks_iw_wakeup(struct irq_work *iwp);
+
+#define DEFINE_RCU_TASKS(rt_name, gp, call, n) \
+static DEFINE_PER_CPU(struct rcu_tasks_percpu, rt_name ## __percpu) = { \
+ .lock = __RAW_SPIN_LOCK_UNLOCKED(rt_name ## __percpu.cbs_pcpu_lock), \
+ .rtp_irq_work = IRQ_WORK_INIT_HARD(call_rcu_tasks_iw_wakeup), \
+}; \
+static struct rcu_tasks rt_name = \
+{ \
+ .cbs_wait = __RCUWAIT_INITIALIZER(rt_name.wait), \
+ .cbs_gbl_lock = __RAW_SPIN_LOCK_UNLOCKED(rt_name.cbs_gbl_lock), \
+ .tasks_gp_mutex = __MUTEX_INITIALIZER(rt_name.tasks_gp_mutex), \
+ .gp_func = gp, \
+ .call_func = call, \
+ .rtpcpu = &rt_name ## __percpu, \
+ .name = n, \
+ .percpu_enqueue_shift = order_base_2(CONFIG_NR_CPUS), \
+ .percpu_enqueue_lim = 1, \
+ .percpu_dequeue_lim = 1, \
+ .barrier_q_mutex = __MUTEX_INITIALIZER(rt_name.barrier_q_mutex), \
+ .barrier_q_seq = (0UL - 50UL) << RCU_SEQ_CTR_SHIFT, \
+ .kname = #rt_name, \
}
+#ifdef CONFIG_TASKS_RCU
/* Track exiting tasks in order to allow them to be waited for. */
DEFINE_STATIC_SRCU(tasks_rcu_exit_srcu);
+#endif
+
+#ifdef CONFIG_TASKS_RCU
+/* Report delay in synchronize_srcu() completion in rcu_tasks_postscan(). */
+static void tasks_rcu_exit_srcu_stall(struct timer_list *unused);
+static DEFINE_TIMER(tasks_rcu_exit_srcu_stall_timer, tasks_rcu_exit_srcu_stall);
+#endif
/* Avoid IPIing CPUs early in the grace period. */
#define RCU_TASK_IPI_DELAY (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) ? HZ / 2 : 0)
@@ -90,9 +153,24 @@ static int rcu_task_ipi_delay __read_mostly = RCU_TASK_IPI_DELAY;
module_param(rcu_task_ipi_delay, int, 0644);
/* Control stall timeouts. Disable with <= 0, otherwise jiffies till stall. */
+#define RCU_TASK_BOOT_STALL_TIMEOUT (HZ * 30)
#define RCU_TASK_STALL_TIMEOUT (HZ * 60 * 10)
static int rcu_task_stall_timeout __read_mostly = RCU_TASK_STALL_TIMEOUT;
module_param(rcu_task_stall_timeout, int, 0644);
+#define RCU_TASK_STALL_INFO (HZ * 10)
+static int rcu_task_stall_info __read_mostly = RCU_TASK_STALL_INFO;
+module_param(rcu_task_stall_info, int, 0644);
+static int rcu_task_stall_info_mult __read_mostly = 3;
+module_param(rcu_task_stall_info_mult, int, 0444);
+
+static int rcu_task_enqueue_lim __read_mostly = -1;
+module_param(rcu_task_enqueue_lim, int, 0444);
+
+static bool rcu_task_cb_adjust;
+static int rcu_task_contend_lim __read_mostly = 100;
+module_param(rcu_task_contend_lim, int, 0444);
+static int rcu_task_collapse_lim __read_mostly = 10;
+module_param(rcu_task_collapse_lim, int, 0444);
/* RCU tasks grace-period state for debugging. */
#define RTGS_INIT 0
@@ -128,6 +206,8 @@ static const char * const rcu_tasks_gp_state_names[] = {
//
// Generic code.
+static void rcu_tasks_invoke_cbs_wq(struct work_struct *wp);
+
/* Record grace-period phase and time. */
static void set_tasks_gp_state(struct rcu_tasks *rtp, int newstate)
{
@@ -148,46 +228,327 @@ static const char *tasks_gp_state_getname(struct rcu_tasks *rtp)
}
#endif /* #ifndef CONFIG_TINY_RCU */
+// Initialize per-CPU callback lists for the specified flavor of
+// Tasks RCU.
+static void cblist_init_generic(struct rcu_tasks *rtp)
+{
+ int cpu;
+ unsigned long flags;
+ int lim;
+ int shift;
+
+ raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags);
+ if (rcu_task_enqueue_lim < 0) {
+ rcu_task_enqueue_lim = 1;
+ rcu_task_cb_adjust = true;
+ pr_info("%s: Setting adjustable number of callback queues.\n", __func__);
+ } else if (rcu_task_enqueue_lim == 0) {
+ rcu_task_enqueue_lim = 1;
+ }
+ lim = rcu_task_enqueue_lim;
+
+ if (lim > nr_cpu_ids)
+ lim = nr_cpu_ids;
+ shift = ilog2(nr_cpu_ids / lim);
+ if (((nr_cpu_ids - 1) >> shift) >= lim)
+ shift++;
+ WRITE_ONCE(rtp->percpu_enqueue_shift, shift);
+ WRITE_ONCE(rtp->percpu_dequeue_lim, lim);
+ smp_store_release(&rtp->percpu_enqueue_lim, lim);
+ for_each_possible_cpu(cpu) {
+ struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu);
+
+ WARN_ON_ONCE(!rtpcp);
+ if (cpu)
+ raw_spin_lock_init(&ACCESS_PRIVATE(rtpcp, lock));
+ raw_spin_lock_rcu_node(rtpcp); // irqs already disabled.
+ if (rcu_segcblist_empty(&rtpcp->cblist))
+ rcu_segcblist_init(&rtpcp->cblist);
+ INIT_WORK(&rtpcp->rtp_work, rcu_tasks_invoke_cbs_wq);
+ rtpcp->cpu = cpu;
+ rtpcp->rtpp = rtp;
+ if (!rtpcp->rtp_blkd_tasks.next)
+ INIT_LIST_HEAD(&rtpcp->rtp_blkd_tasks);
+ raw_spin_unlock_rcu_node(rtpcp); // irqs remain disabled.
+ }
+ raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags);
+ pr_info("%s: Setting shift to %d and lim to %d.\n", __func__, data_race(rtp->percpu_enqueue_shift), data_race(rtp->percpu_enqueue_lim));
+}
+
+// IRQ-work handler that does deferred wakeup for call_rcu_tasks_generic().
+static void call_rcu_tasks_iw_wakeup(struct irq_work *iwp)
+{
+ struct rcu_tasks *rtp;
+ struct rcu_tasks_percpu *rtpcp = container_of(iwp, struct rcu_tasks_percpu, rtp_irq_work);
+
+ rtp = rtpcp->rtpp;
+ rcuwait_wake_up(&rtp->cbs_wait);
+}
+
// Enqueue a callback for the specified flavor of Tasks RCU.
static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func,
struct rcu_tasks *rtp)
{
+ int chosen_cpu;
unsigned long flags;
+ int ideal_cpu;
+ unsigned long j;
+ bool needadjust = false;
bool needwake;
+ struct rcu_tasks_percpu *rtpcp;
rhp->next = NULL;
rhp->func = func;
- raw_spin_lock_irqsave(&rtp->cbs_lock, flags);
- needwake = !rtp->cbs_head;
- WRITE_ONCE(*rtp->cbs_tail, rhp);
- rtp->cbs_tail = &rhp->next;
- raw_spin_unlock_irqrestore(&rtp->cbs_lock, flags);
+ local_irq_save(flags);
+ rcu_read_lock();
+ ideal_cpu = smp_processor_id() >> READ_ONCE(rtp->percpu_enqueue_shift);
+ chosen_cpu = cpumask_next(ideal_cpu - 1, cpu_possible_mask);
+ rtpcp = per_cpu_ptr(rtp->rtpcpu, chosen_cpu);
+ if (!raw_spin_trylock_rcu_node(rtpcp)) { // irqs already disabled.
+ raw_spin_lock_rcu_node(rtpcp); // irqs already disabled.
+ j = jiffies;
+ if (rtpcp->rtp_jiffies != j) {
+ rtpcp->rtp_jiffies = j;
+ rtpcp->rtp_n_lock_retries = 0;
+ }
+ if (rcu_task_cb_adjust && ++rtpcp->rtp_n_lock_retries > rcu_task_contend_lim &&
+ READ_ONCE(rtp->percpu_enqueue_lim) != nr_cpu_ids)
+ needadjust = true; // Defer adjustment to avoid deadlock.
+ }
+ if (!rcu_segcblist_is_enabled(&rtpcp->cblist)) {
+ raw_spin_unlock_rcu_node(rtpcp); // irqs remain disabled.
+ cblist_init_generic(rtp);
+ raw_spin_lock_rcu_node(rtpcp); // irqs already disabled.
+ }
+ needwake = rcu_segcblist_empty(&rtpcp->cblist);
+ rcu_segcblist_enqueue(&rtpcp->cblist, rhp);
+ raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
+ if (unlikely(needadjust)) {
+ raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags);
+ if (rtp->percpu_enqueue_lim != nr_cpu_ids) {
+ WRITE_ONCE(rtp->percpu_enqueue_shift, 0);
+ WRITE_ONCE(rtp->percpu_dequeue_lim, nr_cpu_ids);
+ smp_store_release(&rtp->percpu_enqueue_lim, nr_cpu_ids);
+ pr_info("Switching %s to per-CPU callback queuing.\n", rtp->name);
+ }
+ raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags);
+ }
+ rcu_read_unlock();
/* We can't create the thread unless interrupts are enabled. */
if (needwake && READ_ONCE(rtp->kthread_ptr))
- wake_up(&rtp->cbs_wq);
+ irq_work_queue(&rtpcp->rtp_irq_work);
}
-// Wait for a grace period for the specified flavor of Tasks RCU.
-static void synchronize_rcu_tasks_generic(struct rcu_tasks *rtp)
+// RCU callback function for rcu_barrier_tasks_generic().
+static void rcu_barrier_tasks_generic_cb(struct rcu_head *rhp)
{
- /* Complain if the scheduler has not started. */
- RCU_LOCKDEP_WARN(rcu_scheduler_active == RCU_SCHEDULER_INACTIVE,
- "synchronize_rcu_tasks called too soon");
+ struct rcu_tasks *rtp;
+ struct rcu_tasks_percpu *rtpcp;
- /* Wait for the grace period. */
- wait_rcu_gp(rtp->call_func);
+ rtpcp = container_of(rhp, struct rcu_tasks_percpu, barrier_q_head);
+ rtp = rtpcp->rtpp;
+ if (atomic_dec_and_test(&rtp->barrier_q_count))
+ complete(&rtp->barrier_q_completion);
}
-/* RCU-tasks kthread that detects grace periods and invokes callbacks. */
-static int __noreturn rcu_tasks_kthread(void *arg)
+// Wait for all in-flight callbacks for the specified RCU Tasks flavor.
+// Operates in a manner similar to rcu_barrier().
+static void rcu_barrier_tasks_generic(struct rcu_tasks *rtp)
{
+ int cpu;
+ unsigned long flags;
+ struct rcu_tasks_percpu *rtpcp;
+ unsigned long s = rcu_seq_snap(&rtp->barrier_q_seq);
+
+ mutex_lock(&rtp->barrier_q_mutex);
+ if (rcu_seq_done(&rtp->barrier_q_seq, s)) {
+ smp_mb();
+ mutex_unlock(&rtp->barrier_q_mutex);
+ return;
+ }
+ rcu_seq_start(&rtp->barrier_q_seq);
+ init_completion(&rtp->barrier_q_completion);
+ atomic_set(&rtp->barrier_q_count, 2);
+ for_each_possible_cpu(cpu) {
+ if (cpu >= smp_load_acquire(&rtp->percpu_dequeue_lim))
+ break;
+ rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu);
+ rtpcp->barrier_q_head.func = rcu_barrier_tasks_generic_cb;
+ raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
+ if (rcu_segcblist_entrain(&rtpcp->cblist, &rtpcp->barrier_q_head))
+ atomic_inc(&rtp->barrier_q_count);
+ raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
+ }
+ if (atomic_sub_and_test(2, &rtp->barrier_q_count))
+ complete(&rtp->barrier_q_completion);
+ wait_for_completion(&rtp->barrier_q_completion);
+ rcu_seq_end(&rtp->barrier_q_seq);
+ mutex_unlock(&rtp->barrier_q_mutex);
+}
+
+// Advance callbacks and indicate whether either a grace period or
+// callback invocation is needed.
+static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp)
+{
+ int cpu;
+ unsigned long flags;
+ bool gpdone = poll_state_synchronize_rcu(rtp->percpu_dequeue_gpseq);
+ long n;
+ long ncbs = 0;
+ long ncbsnz = 0;
+ int needgpcb = 0;
+
+ for (cpu = 0; cpu < smp_load_acquire(&rtp->percpu_dequeue_lim); cpu++) {
+ struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu);
+
+ /* Advance and accelerate any new callbacks. */
+ if (!rcu_segcblist_n_cbs(&rtpcp->cblist))
+ continue;
+ raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
+ // Should we shrink down to a single callback queue?
+ n = rcu_segcblist_n_cbs(&rtpcp->cblist);
+ if (n) {
+ ncbs += n;
+ if (cpu > 0)
+ ncbsnz += n;
+ }
+ rcu_segcblist_advance(&rtpcp->cblist, rcu_seq_current(&rtp->tasks_gp_seq));
+ (void)rcu_segcblist_accelerate(&rtpcp->cblist, rcu_seq_snap(&rtp->tasks_gp_seq));
+ if (rcu_segcblist_pend_cbs(&rtpcp->cblist))
+ needgpcb |= 0x3;
+ if (!rcu_segcblist_empty(&rtpcp->cblist))
+ needgpcb |= 0x1;
+ raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
+ }
+
+ // Shrink down to a single callback queue if appropriate.
+ // This is done in two stages: (1) If there are no more than
+ // rcu_task_collapse_lim callbacks on CPU 0 and none on any other
+ // CPU, limit enqueueing to CPU 0. (2) After an RCU grace period,
+ // if there has not been an increase in callbacks, limit dequeuing
+ // to CPU 0. Note the matching RCU read-side critical section in
+ // call_rcu_tasks_generic().
+ if (rcu_task_cb_adjust && ncbs <= rcu_task_collapse_lim) {
+ raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags);
+ if (rtp->percpu_enqueue_lim > 1) {
+ WRITE_ONCE(rtp->percpu_enqueue_shift, order_base_2(nr_cpu_ids));
+ smp_store_release(&rtp->percpu_enqueue_lim, 1);
+ rtp->percpu_dequeue_gpseq = get_state_synchronize_rcu();
+ gpdone = false;
+ pr_info("Starting switch %s to CPU-0 callback queuing.\n", rtp->name);
+ }
+ raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags);
+ }
+ if (rcu_task_cb_adjust && !ncbsnz && gpdone) {
+ raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags);
+ if (rtp->percpu_enqueue_lim < rtp->percpu_dequeue_lim) {
+ WRITE_ONCE(rtp->percpu_dequeue_lim, 1);
+ pr_info("Completing switch %s to CPU-0 callback queuing.\n", rtp->name);
+ }
+ if (rtp->percpu_dequeue_lim == 1) {
+ for (cpu = rtp->percpu_dequeue_lim; cpu < nr_cpu_ids; cpu++) {
+ struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu);
+
+ WARN_ON_ONCE(rcu_segcblist_n_cbs(&rtpcp->cblist));
+ }
+ }
+ raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags);
+ }
+
+ return needgpcb;
+}
+
+// Advance callbacks and invoke any that are ready.
+static void rcu_tasks_invoke_cbs(struct rcu_tasks *rtp, struct rcu_tasks_percpu *rtpcp)
+{
+ int cpu;
+ int cpunext;
unsigned long flags;
- struct rcu_head *list;
- struct rcu_head *next;
+ int len;
+ struct rcu_head *rhp;
+ struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl);
+ struct rcu_tasks_percpu *rtpcp_next;
+
+ cpu = rtpcp->cpu;
+ cpunext = cpu * 2 + 1;
+ if (cpunext < smp_load_acquire(&rtp->percpu_dequeue_lim)) {
+ rtpcp_next = per_cpu_ptr(rtp->rtpcpu, cpunext);
+ queue_work_on(cpunext, system_wq, &rtpcp_next->rtp_work);
+ cpunext++;
+ if (cpunext < smp_load_acquire(&rtp->percpu_dequeue_lim)) {
+ rtpcp_next = per_cpu_ptr(rtp->rtpcpu, cpunext);
+ queue_work_on(cpunext, system_wq, &rtpcp_next->rtp_work);
+ }
+ }
+
+ if (rcu_segcblist_empty(&rtpcp->cblist) || !cpu_possible(cpu))
+ return;
+ raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
+ rcu_segcblist_advance(&rtpcp->cblist, rcu_seq_current(&rtp->tasks_gp_seq));
+ rcu_segcblist_extract_done_cbs(&rtpcp->cblist, &rcl);
+ raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
+ len = rcl.len;
+ for (rhp = rcu_cblist_dequeue(&rcl); rhp; rhp = rcu_cblist_dequeue(&rcl)) {
+ local_bh_disable();
+ rhp->func(rhp);
+ local_bh_enable();
+ cond_resched();
+ }
+ raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
+ rcu_segcblist_add_len(&rtpcp->cblist, -len);
+ (void)rcu_segcblist_accelerate(&rtpcp->cblist, rcu_seq_snap(&rtp->tasks_gp_seq));
+ raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
+}
+
+// Workqueue flood to advance callbacks and invoke any that are ready.
+static void rcu_tasks_invoke_cbs_wq(struct work_struct *wp)
+{
+ struct rcu_tasks *rtp;
+ struct rcu_tasks_percpu *rtpcp = container_of(wp, struct rcu_tasks_percpu, rtp_work);
+
+ rtp = rtpcp->rtpp;
+ rcu_tasks_invoke_cbs(rtp, rtpcp);
+}
+
+// Wait for one grace period.
+static void rcu_tasks_one_gp(struct rcu_tasks *rtp, bool midboot)
+{
+ int needgpcb;
+
+ mutex_lock(&rtp->tasks_gp_mutex);
+
+ // If there were none, wait a bit and start over.
+ if (unlikely(midboot)) {
+ needgpcb = 0x2;
+ } else {
+ set_tasks_gp_state(rtp, RTGS_WAIT_CBS);
+ rcuwait_wait_event(&rtp->cbs_wait,
+ (needgpcb = rcu_tasks_need_gpcb(rtp)),
+ TASK_IDLE);
+ }
+
+ if (needgpcb & 0x2) {
+ // Wait for one grace period.
+ set_tasks_gp_state(rtp, RTGS_WAIT_GP);
+ rtp->gp_start = jiffies;
+ rcu_seq_start(&rtp->tasks_gp_seq);
+ rtp->gp_func(rtp);
+ rcu_seq_end(&rtp->tasks_gp_seq);
+ }
+
+ // Invoke callbacks.
+ set_tasks_gp_state(rtp, RTGS_INVOKE_CBS);
+ rcu_tasks_invoke_cbs(rtp, per_cpu_ptr(rtp->rtpcpu, 0));
+ mutex_unlock(&rtp->tasks_gp_mutex);
+}
+
+// RCU-tasks kthread that detects grace periods and invokes callbacks.
+static int __noreturn rcu_tasks_kthread(void *arg)
+{
struct rcu_tasks *rtp = arg;
/* Run on housekeeping CPUs by default. Sysadm can move if desired. */
- housekeeping_affine(current, HK_FLAG_RCU);
+ housekeeping_affine(current, HK_TYPE_RCU);
WRITE_ONCE(rtp->kthread_ptr, current); // Let GPs start!
/*
@@ -197,48 +558,29 @@ static int __noreturn rcu_tasks_kthread(void *arg)
* This loop is terminated by the system going down. ;-)
*/
for (;;) {
+ // Wait for one grace period and invoke any callbacks
+ // that are ready.
+ rcu_tasks_one_gp(rtp, false);
- /* Pick up any new callbacks. */
- raw_spin_lock_irqsave(&rtp->cbs_lock, flags);
- smp_mb__after_spinlock(); // Order updates vs. GP.
- list = rtp->cbs_head;
- rtp->cbs_head = NULL;
- rtp->cbs_tail = &rtp->cbs_head;
- raw_spin_unlock_irqrestore(&rtp->cbs_lock, flags);
-
- /* If there were none, wait a bit and start over. */
- if (!list) {
- wait_event_interruptible(rtp->cbs_wq,
- READ_ONCE(rtp->cbs_head));
- if (!rtp->cbs_head) {
- WARN_ON(signal_pending(current));
- set_tasks_gp_state(rtp, RTGS_WAIT_WAIT_CBS);
- schedule_timeout_idle(HZ/10);
- }
- continue;
- }
-
- // Wait for one grace period.
- set_tasks_gp_state(rtp, RTGS_WAIT_GP);
- rtp->gp_start = jiffies;
- rtp->gp_func(rtp);
- rtp->n_gps++;
-
- /* Invoke the callbacks. */
- set_tasks_gp_state(rtp, RTGS_INVOKE_CBS);
- while (list) {
- next = list->next;
- local_bh_disable();
- list->func(list);
- local_bh_enable();
- list = next;
- cond_resched();
- }
- /* Paranoid sleep to keep this from entering a tight loop */
+ // Paranoid sleep to keep this from entering a tight loop.
schedule_timeout_idle(rtp->gp_sleep);
+ }
+}
- set_tasks_gp_state(rtp, RTGS_WAIT_CBS);
+// Wait for a grace period for the specified flavor of Tasks RCU.
+static void synchronize_rcu_tasks_generic(struct rcu_tasks *rtp)
+{
+ /* Complain if the scheduler has not started. */
+ if (WARN_ONCE(rcu_scheduler_active == RCU_SCHEDULER_INACTIVE,
+ "synchronize_%s() called too soon", rtp->name))
+ return;
+
+ // If the grace-period kthread is running, use it.
+ if (READ_ONCE(rtp->kthread_ptr)) {
+ wait_rcu_gp(rtp->call_func);
+ return;
}
+ rcu_tasks_one_gp(rtp, true);
}
/* Spawn RCU-tasks grace-period kthread. */
@@ -260,8 +602,15 @@ static void __init rcu_spawn_tasks_kthread_generic(struct rcu_tasks *rtp)
static void __init rcu_tasks_bootup_oddness(void)
{
#if defined(CONFIG_TASKS_RCU) || defined(CONFIG_TASKS_TRACE_RCU)
+ int rtsimc;
+
if (rcu_task_stall_timeout != RCU_TASK_STALL_TIMEOUT)
pr_info("\tTasks-RCU CPU stall warnings timeout set to %d (rcu_task_stall_timeout).\n", rcu_task_stall_timeout);
+ rtsimc = clamp(rcu_task_stall_info_mult, 1, 10);
+ if (rtsimc != rcu_task_stall_info_mult) {
+ pr_info("\tTasks-RCU CPU stall info multiplier clamped to %d (rcu_task_stall_info_mult).\n", rtsimc);
+ rcu_task_stall_info_mult = rtsimc;
+ }
#endif /* #ifdef CONFIG_TASKS_RCU */
#ifdef CONFIG_TASKS_RCU
pr_info("\tTrampoline variant of Tasks RCU enabled.\n");
@@ -280,14 +629,25 @@ static void __init rcu_tasks_bootup_oddness(void)
/* Dump out rcutorture-relevant state common to all RCU-tasks flavors. */
static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s)
{
+ int cpu;
+ bool havecbs = false;
+
+ for_each_possible_cpu(cpu) {
+ struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu);
+
+ if (!data_race(rcu_segcblist_empty(&rtpcp->cblist))) {
+ havecbs = true;
+ break;
+ }
+ }
pr_info("%s: %s(%d) since %lu g:%lu i:%lu/%lu %c%c %s\n",
rtp->kname,
tasks_gp_state_getname(rtp), data_race(rtp->gp_state),
jiffies - data_race(rtp->gp_jiffies),
- data_race(rtp->n_gps),
+ data_race(rcu_seq_current(&rtp->tasks_gp_seq)),
data_race(rtp->n_ipis_fails), data_race(rtp->n_ipis),
".k"[!!data_race(rtp->kthread_ptr)],
- ".C"[!!data_race(rtp->cbs_head)],
+ ".C"[havecbs],
s);
}
#endif // #ifndef CONFIG_TINY_RCU
@@ -303,13 +663,18 @@ static void exit_tasks_rcu_finish_trace(struct task_struct *t);
/* Wait for one RCU-tasks grace period. */
static void rcu_tasks_wait_gp(struct rcu_tasks *rtp)
{
- struct task_struct *g, *t;
- unsigned long lastreport;
- LIST_HEAD(holdouts);
+ struct task_struct *g;
int fract;
+ LIST_HEAD(holdouts);
+ unsigned long j;
+ unsigned long lastinfo;
+ unsigned long lastreport;
+ bool reported = false;
+ int rtsi;
+ struct task_struct *t;
set_tasks_gp_state(rtp, RTGS_PRE_WAIT_GP);
- rtp->pregp_func();
+ rtp->pregp_func(&holdouts);
/*
* There were callbacks, so we need to wait for an RCU-tasks
@@ -318,10 +683,12 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp)
* and make a list of them in holdouts.
*/
set_tasks_gp_state(rtp, RTGS_SCAN_TASKLIST);
- rcu_read_lock();
- for_each_process_thread(g, t)
- rtp->pertask_func(t, &holdouts);
- rcu_read_unlock();
+ if (rtp->pertask_func) {
+ rcu_read_lock();
+ for_each_process_thread(g, t)
+ rtp->pertask_func(t, &holdouts);
+ rcu_read_unlock();
+ }
set_tasks_gp_state(rtp, RTGS_POST_SCAN_TASKLIST);
rtp->postscan_func(&holdouts);
@@ -332,30 +699,50 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp)
* is empty, we are done.
*/
lastreport = jiffies;
+ lastinfo = lastreport;
+ rtsi = READ_ONCE(rcu_task_stall_info);
// Start off with initial wait and slowly back off to 1 HZ wait.
fract = rtp->init_fract;
while (!list_empty(&holdouts)) {
+ ktime_t exp;
bool firstreport;
bool needreport;
int rtst;
- /* Slowly back off waiting for holdouts */
+ // Slowly back off waiting for holdouts
set_tasks_gp_state(rtp, RTGS_WAIT_SCAN_HOLDOUTS);
- schedule_timeout_idle(fract);
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
+ schedule_timeout_idle(fract);
+ } else {
+ exp = jiffies_to_nsecs(fract);
+ __set_current_state(TASK_IDLE);
+ schedule_hrtimeout_range(&exp, jiffies_to_nsecs(HZ / 2), HRTIMER_MODE_REL_HARD);
+ }
if (fract < HZ)
fract++;
rtst = READ_ONCE(rcu_task_stall_timeout);
needreport = rtst > 0 && time_after(jiffies, lastreport + rtst);
- if (needreport)
+ if (needreport) {
lastreport = jiffies;
+ reported = true;
+ }
firstreport = true;
WARN_ON(signal_pending(current));
set_tasks_gp_state(rtp, RTGS_SCAN_HOLDOUTS);
rtp->holdouts_func(&holdouts, needreport, &firstreport);
+
+ // Print pre-stall informational messages if needed.
+ j = jiffies;
+ if (rtsi > 0 && !reported && time_after(j, lastinfo + rtsi)) {
+ lastinfo = j;
+ rtsi = rtsi * rcu_task_stall_info_mult;
+ pr_info("%s: %s grace period number %lu (since boot) is %lu jiffies old.\n",
+ __func__, rtp->kname, rtp->tasks_gp_seq, j - rtp->gp_start);
+ }
}
set_tasks_gp_state(rtp, RTGS_POST_GP);
@@ -369,7 +756,7 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp)
////////////////////////////////////////////////////////////////////////
//
// Simple variant of RCU whose quiescent states are voluntary context
-// switch, cond_resched_rcu_qs(), user-space execution, and idle.
+// switch, cond_resched_tasks_rcu_qs(), user-space execution, and idle.
// As such, grace periods can take one good long time. There are no
// read-side primitives similar to rcu_read_lock() and rcu_read_unlock()
// because this implementation is intended to get the system into a safe
@@ -412,14 +799,14 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp)
// exit_tasks_rcu_finish() functions begin and end, respectively, the SRCU
// read-side critical sections waited for by rcu_tasks_postscan().
//
-// Pre-grace-period update-side code is ordered before the grace via the
-// ->cbs_lock and the smp_mb__after_spinlock(). Pre-grace-period read-side
-// code is ordered before the grace period via synchronize_rcu() call
-// in rcu_tasks_pregp_step() and by the scheduler's locks and interrupt
+// Pre-grace-period update-side code is ordered before the grace
+// via the raw_spin_lock.*rcu_node(). Pre-grace-period read-side code
+// is ordered before the grace period via synchronize_rcu() call in
+// rcu_tasks_pregp_step() and by the scheduler's locks and interrupt
// disabling.
/* Pre-grace-period preparation. */
-static void rcu_tasks_pregp_step(void)
+static void rcu_tasks_pregp_step(struct list_head *hop)
{
/*
* Wait for all pre-existing t->on_rq and t->nvcsw transitions
@@ -451,14 +838,34 @@ static void rcu_tasks_pertask(struct task_struct *t, struct list_head *hop)
/* Processing between scanning taskslist and draining the holdout list. */
static void rcu_tasks_postscan(struct list_head *hop)
{
+ int rtsi = READ_ONCE(rcu_task_stall_info);
+
+ if (!IS_ENABLED(CONFIG_TINY_RCU)) {
+ tasks_rcu_exit_srcu_stall_timer.expires = jiffies + rtsi;
+ add_timer(&tasks_rcu_exit_srcu_stall_timer);
+ }
+
/*
- * Wait for tasks that are in the process of exiting. This
- * does only part of the job, ensuring that all tasks that were
- * previously exiting reach the point where they have disabled
- * preemption, allowing the later synchronize_rcu() to finish
- * the job.
+ * Exiting tasks may escape the tasklist scan. Those are vulnerable
+ * until their final schedule() with TASK_DEAD state. To cope with
+ * this, divide the fragile exit path part in two intersecting
+ * read side critical sections:
+ *
+ * 1) An _SRCU_ read side starting before calling exit_notify(),
+ * which may remove the task from the tasklist, and ending after
+ * the final preempt_disable() call in do_exit().
+ *
+ * 2) An _RCU_ read side starting with the final preempt_disable()
+ * call in do_exit() and ending with the final call to schedule()
+ * with TASK_DEAD state.
+ *
+ * This handles the part 1). And postgp will handle part 2) with a
+ * call to synchronize_rcu().
*/
synchronize_srcu(&tasks_rcu_exit_srcu);
+
+ if (!IS_ENABLED(CONFIG_TINY_RCU))
+ del_timer_sync(&tasks_rcu_exit_srcu_stall_timer);
}
/* See if tasks are still holding out, complain if so. */
@@ -523,7 +930,10 @@ static void rcu_tasks_postgp(struct rcu_tasks *rtp)
*
* In addition, this synchronize_rcu() waits for exiting tasks
* to complete their final preempt_disable() region of execution,
- * cleaning up after the synchronize_srcu() above.
+ * cleaning up after synchronize_srcu(&tasks_rcu_exit_srcu),
+ * enforcing the whole region before tasklist removal until
+ * the final schedule() with TASK_DEAD state to be an RCU TASKS
+ * read side critical section.
*/
synchronize_rcu();
}
@@ -531,6 +941,21 @@ static void rcu_tasks_postgp(struct rcu_tasks *rtp)
void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func);
DEFINE_RCU_TASKS(rcu_tasks, rcu_tasks_wait_gp, call_rcu_tasks, "RCU Tasks");
+static void tasks_rcu_exit_srcu_stall(struct timer_list *unused)
+{
+#ifndef CONFIG_TINY_RCU
+ int rtsi;
+
+ rtsi = READ_ONCE(rcu_task_stall_info);
+ pr_info("%s: %s grace period number %lu (since boot) gp_state: %s is %lu jiffies old.\n",
+ __func__, rcu_tasks.kname, rcu_tasks.tasks_gp_seq,
+ tasks_gp_state_getname(&rcu_tasks), jiffies - rcu_tasks.gp_jiffies);
+ pr_info("Please check any exiting tasks stuck between calls to exit_tasks_rcu_start() and exit_tasks_rcu_finish()\n");
+ tasks_rcu_exit_srcu_stall_timer.expires = jiffies + rtsi;
+ add_timer(&tasks_rcu_exit_srcu_stall_timer);
+#endif // #ifndef CONFIG_TINY_RCU
+}
+
/**
* call_rcu_tasks() - Queue an RCU for invocation task-based grace period
* @rhp: structure to be used for queueing the RCU updates.
@@ -540,7 +965,7 @@ DEFINE_RCU_TASKS(rcu_tasks, rcu_tasks_wait_gp, call_rcu_tasks, "RCU Tasks");
* period elapses, in other words after all currently executing RCU
* read-side critical sections have completed. call_rcu_tasks() assumes
* that the read-side critical sections end at a voluntary context
- * switch (not a preemption!), cond_resched_rcu_qs(), entry into idle,
+ * switch (not a preemption!), cond_resched_tasks_rcu_qs(), entry into idle,
* or transition to usermode execution. As such, there are no read-side
* primitives analogous to rcu_read_lock() and rcu_read_unlock() because
* this primitive is intended to determine that all tasks have passed
@@ -587,13 +1012,13 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_tasks);
*/
void rcu_barrier_tasks(void)
{
- /* There is only one callback queue, so this is easy. ;-) */
- synchronize_rcu_tasks();
+ rcu_barrier_tasks_generic(&rcu_tasks);
}
EXPORT_SYMBOL_GPL(rcu_barrier_tasks);
static int __init rcu_spawn_tasks_kthread(void)
{
+ cblist_init_generic(&rcu_tasks);
rcu_tasks.gp_sleep = HZ / 10;
rcu_tasks.init_fract = HZ / 10;
rcu_tasks.pregp_func = rcu_tasks_pregp_step;
@@ -613,27 +1038,42 @@ void show_rcu_tasks_classic_gp_kthread(void)
EXPORT_SYMBOL_GPL(show_rcu_tasks_classic_gp_kthread);
#endif // !defined(CONFIG_TINY_RCU)
-/* Do the srcu_read_lock() for the above synchronize_srcu(). */
+/*
+ * Contribute to protect against tasklist scan blind spot while the
+ * task is exiting and may be removed from the tasklist. See
+ * corresponding synchronize_srcu() for further details.
+ */
void exit_tasks_rcu_start(void) __acquires(&tasks_rcu_exit_srcu)
{
- preempt_disable();
current->rcu_tasks_idx = __srcu_read_lock(&tasks_rcu_exit_srcu);
- preempt_enable();
}
-/* Do the srcu_read_unlock() for the above synchronize_srcu(). */
-void exit_tasks_rcu_finish(void) __releases(&tasks_rcu_exit_srcu)
+/*
+ * Contribute to protect against tasklist scan blind spot while the
+ * task is exiting and may be removed from the tasklist. See
+ * corresponding synchronize_srcu() for further details.
+ */
+void exit_tasks_rcu_stop(void) __releases(&tasks_rcu_exit_srcu)
{
struct task_struct *t = current;
- preempt_disable();
__srcu_read_unlock(&tasks_rcu_exit_srcu, t->rcu_tasks_idx);
- preempt_enable();
- exit_tasks_rcu_finish_trace(t);
+}
+
+/*
+ * Contribute to protect against tasklist scan blind spot while the
+ * task is exiting and may be removed from the tasklist. See
+ * corresponding synchronize_srcu() for further details.
+ */
+void exit_tasks_rcu_finish(void)
+{
+ exit_tasks_rcu_stop();
+ exit_tasks_rcu_finish_trace(current);
}
#else /* #ifdef CONFIG_TASKS_RCU */
void exit_tasks_rcu_start(void) { }
+void exit_tasks_rcu_stop(void) { }
void exit_tasks_rcu_finish(void) { exit_tasks_rcu_finish_trace(current); }
#endif /* #else #ifdef CONFIG_TASKS_RCU */
@@ -643,8 +1083,8 @@ void exit_tasks_rcu_finish(void) { exit_tasks_rcu_finish_trace(current); }
//
// "Rude" variant of Tasks RCU, inspired by Steve Rostedt's trick of
// passing an empty function to schedule_on_each_cpu(). This approach
-// provides an asynchronous call_rcu_tasks_rude() API and batching
-// of concurrent calls to the synchronous synchronize_rcu_rude() API.
+// provides an asynchronous call_rcu_tasks_rude() API and batching of
+// concurrent calls to the synchronous synchronize_rcu_tasks_rude() API.
// This invokes schedule_on_each_cpu() in order to send IPIs far and wide
// and induces otherwise unnecessary context switches on all online CPUs,
// whether idle or not.
@@ -678,11 +1118,11 @@ DEFINE_RCU_TASKS(rcu_tasks_rude, rcu_tasks_rude_wait_gp, call_rcu_tasks_rude,
* period elapses, in other words after all currently executing RCU
* read-side critical sections have completed. call_rcu_tasks_rude()
* assumes that the read-side critical sections end at context switch,
- * cond_resched_rcu_qs(), or transition to usermode execution. As such,
- * there are no read-side primitives analogous to rcu_read_lock() and
- * rcu_read_unlock() because this primitive is intended to determine
- * that all tasks have passed through a safe state, not so much for
- * data-structure synchronization.
+ * cond_resched_tasks_rcu_qs(), or transition to usermode execution (as
+ * usermode execution is schedulable). As such, there are no read-side
+ * primitives analogous to rcu_read_lock() and rcu_read_unlock() because
+ * this primitive is intended to determine that all tasks have passed
+ * through a safe state, not so much for data-structure synchronization.
*
* See the description of call_rcu() for more detailed information on
* memory ordering guarantees.
@@ -700,8 +1140,8 @@ EXPORT_SYMBOL_GPL(call_rcu_tasks_rude);
* grace period has elapsed, in other words after all currently
* executing rcu-tasks read-side critical sections have elapsed. These
* read-side critical sections are delimited by calls to schedule(),
- * cond_resched_tasks_rcu_qs(), userspace execution, and (in theory,
- * anyway) cond_resched().
+ * cond_resched_tasks_rcu_qs(), userspace execution (which is a schedulable
+ * context), and (in theory, anyway) cond_resched().
*
* This is a very specialized primitive, intended only for a few uses in
* tracing and other situations requiring manipulation of function preambles
@@ -725,13 +1165,13 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_tasks_rude);
*/
void rcu_barrier_tasks_rude(void)
{
- /* There is only one callback queue, so this is easy. ;-) */
- synchronize_rcu_tasks_rude();
+ rcu_barrier_tasks_generic(&rcu_tasks_rude);
}
EXPORT_SYMBOL_GPL(rcu_barrier_tasks_rude);
static int __init rcu_spawn_tasks_rude_kthread(void)
{
+ cblist_init_generic(&rcu_tasks_rude);
rcu_tasks_rude.gp_sleep = HZ / 10;
rcu_spawn_tasks_kthread_generic(&rcu_tasks_rude);
return 0;
@@ -758,14 +1198,13 @@ EXPORT_SYMBOL_GPL(show_rcu_tasks_rude_gp_kthread);
// 2. Protects code in the idle loop, exception entry/exit, and
// CPU-hotplug code paths, similar to the capabilities of SRCU.
//
-// 3. Avoids expensive read-side instruction, having overhead similar
+// 3. Avoids expensive read-side instructions, having overhead similar
// to that of Preemptible RCU.
//
-// There are of course downsides. The grace-period code can send IPIs to
-// CPUs, even when those CPUs are in the idle loop or in nohz_full userspace.
-// It is necessary to scan the full tasklist, much as for Tasks RCU. There
-// is a single callback queue guarded by a single lock, again, much as for
-// Tasks RCU. If needed, these downsides can be at least partially remedied.
+// There are of course downsides. For example, the grace-period code
+// can send IPIs to CPUs, even when those CPUs are in the idle loop or
+// in nohz_full userspace. If needed, these downsides can be at least
+// partially remedied.
//
// Perhaps most important, this variant of RCU does not affect the vanilla
// flavors, rcu_preempt and rcu_sched. The fact that RCU Tasks Trace
@@ -778,35 +1217,30 @@ EXPORT_SYMBOL_GPL(show_rcu_tasks_rude_gp_kthread);
// invokes these functions in this order:
//
// rcu_tasks_trace_pregp_step():
-// Initialize the count of readers and block CPU-hotplug operations.
-// rcu_tasks_trace_pertask(), invoked on every non-idle task:
-// Initialize per-task state and attempt to identify an immediate
-// quiescent state for that task, or, failing that, attempt to
-// set that task's .need_qs flag so that task's next outermost
-// rcu_read_unlock_trace() will report the quiescent state (in which
-// case the count of readers is incremented). If both attempts fail,
-// the task is added to a "holdout" list.
+// Disables CPU hotplug, adds all currently executing tasks to the
+// holdout list, then checks the state of all tasks that blocked
+// or were preempted within their current RCU Tasks Trace read-side
+// critical section, adding them to the holdout list if appropriate.
+// Finally, this function re-enables CPU hotplug.
+// The ->pertask_func() pointer is NULL, so there is no per-task processing.
// rcu_tasks_trace_postscan():
-// Initialize state and attempt to identify an immediate quiescent
-// state as above (but only for idle tasks), unblock CPU-hotplug
-// operations, and wait for an RCU grace period to avoid races with
-// tasks that are in the process of exiting.
+// Invokes synchronize_rcu() to wait for late-stage exiting tasks
+// to finish exiting.
// check_all_holdout_tasks_trace(), repeatedly until holdout list is empty:
// Scans the holdout list, attempting to identify a quiescent state
// for each task on the list. If there is a quiescent state, the
-// corresponding task is removed from the holdout list.
+// corresponding task is removed from the holdout list. Once this
+// list is empty, the grace period has completed.
// rcu_tasks_trace_postgp():
-// Wait for the count of readers do drop to zero, reporting any stalls.
-// Also execute full memory barriers to maintain ordering with code
-// executing after the grace period.
+// Provides the needed full memory barrier and does debug checks.
//
// The exit_tasks_rcu_finish_trace() synchronizes with exiting tasks.
//
-// Pre-grace-period update-side code is ordered before the grace
-// period via the ->cbs_lock and barriers in rcu_tasks_kthread().
-// Pre-grace-period read-side code is ordered before the grace period by
-// atomic_dec_and_test() of the count of readers (for IPIed readers) and by
-// scheduler context-switch ordering (for locked-down non-running readers).
+// Pre-grace-period update-side code is ordered before the grace period
+// via the ->cbs_lock and barriers in rcu_tasks_kthread(). Pre-grace-period
+// read-side code is ordered before the grace period by atomic operations
+// on .b.need_qs flag of each task involved in this process, or by scheduler
+// context-switch ordering (for locked-down non-running readers).
// The lockdep state must be outside of #ifdef to be useful.
#ifdef CONFIG_DEBUG_LOCK_ALLOC
@@ -818,9 +1252,6 @@ EXPORT_SYMBOL_GPL(rcu_trace_lock_map);
#ifdef CONFIG_TASKS_TRACE_RCU
-static atomic_t trc_n_readers_need_end; // Number of waited-for readers.
-static DECLARE_WAIT_QUEUE_HEAD(trc_wait); // List of holdout tasks.
-
// Record outstanding IPIs to each CPU. No point in sending two...
static DEFINE_PER_CPU(bool, trc_ipi_to_cpu);
@@ -829,44 +1260,104 @@ static DEFINE_PER_CPU(bool, trc_ipi_to_cpu);
static unsigned long n_heavy_reader_attempts;
static unsigned long n_heavy_reader_updates;
static unsigned long n_heavy_reader_ofl_updates;
+static unsigned long n_trc_holdouts;
void call_rcu_tasks_trace(struct rcu_head *rhp, rcu_callback_t func);
DEFINE_RCU_TASKS(rcu_tasks_trace, rcu_tasks_wait_gp, call_rcu_tasks_trace,
"RCU Tasks Trace");
+/* Load from ->trc_reader_special.b.need_qs with proper ordering. */
+static u8 rcu_ld_need_qs(struct task_struct *t)
+{
+ smp_mb(); // Enforce full grace-period ordering.
+ return smp_load_acquire(&t->trc_reader_special.b.need_qs);
+}
+
+/* Store to ->trc_reader_special.b.need_qs with proper ordering. */
+static void rcu_st_need_qs(struct task_struct *t, u8 v)
+{
+ smp_store_release(&t->trc_reader_special.b.need_qs, v);
+ smp_mb(); // Enforce full grace-period ordering.
+}
+
/*
- * This irq_work handler allows rcu_read_unlock_trace() to be invoked
- * while the scheduler locks are held.
+ * Do a cmpxchg() on ->trc_reader_special.b.need_qs, allowing for
+ * the four-byte operand-size restriction of some platforms.
+ * Returns the old value, which is often ignored.
*/
-static void rcu_read_unlock_iw(struct irq_work *iwp)
+u8 rcu_trc_cmpxchg_need_qs(struct task_struct *t, u8 old, u8 new)
{
- wake_up(&trc_wait);
+ union rcu_special ret;
+ union rcu_special trs_old = READ_ONCE(t->trc_reader_special);
+ union rcu_special trs_new = trs_old;
+
+ if (trs_old.b.need_qs != old)
+ return trs_old.b.need_qs;
+ trs_new.b.need_qs = new;
+ ret.s = cmpxchg(&t->trc_reader_special.s, trs_old.s, trs_new.s);
+ return ret.b.need_qs;
}
-static DEFINE_IRQ_WORK(rcu_tasks_trace_iw, rcu_read_unlock_iw);
+EXPORT_SYMBOL_GPL(rcu_trc_cmpxchg_need_qs);
-/* If we are the last reader, wake up the grace-period kthread. */
-void rcu_read_unlock_trace_special(struct task_struct *t, int nesting)
+/*
+ * If we are the last reader, signal the grace-period kthread.
+ * Also remove from the per-CPU list of blocked tasks.
+ */
+void rcu_read_unlock_trace_special(struct task_struct *t)
{
- int nq = t->trc_reader_special.b.need_qs;
+ unsigned long flags;
+ struct rcu_tasks_percpu *rtpcp;
+ union rcu_special trs;
- if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) &&
- t->trc_reader_special.b.need_mb)
+ // Open-coded full-word version of rcu_ld_need_qs().
+ smp_mb(); // Enforce full grace-period ordering.
+ trs = smp_load_acquire(&t->trc_reader_special);
+
+ if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) && t->trc_reader_special.b.need_mb)
smp_mb(); // Pairs with update-side barriers.
// Update .need_qs before ->trc_reader_nesting for irq/NMI handlers.
- if (nq)
- WRITE_ONCE(t->trc_reader_special.b.need_qs, false);
- WRITE_ONCE(t->trc_reader_nesting, nesting);
- if (nq && atomic_dec_and_test(&trc_n_readers_need_end))
- irq_work_queue(&rcu_tasks_trace_iw);
+ if (trs.b.need_qs == (TRC_NEED_QS_CHECKED | TRC_NEED_QS)) {
+ u8 result = rcu_trc_cmpxchg_need_qs(t, TRC_NEED_QS_CHECKED | TRC_NEED_QS,
+ TRC_NEED_QS_CHECKED);
+
+ WARN_ONCE(result != trs.b.need_qs, "%s: result = %d", __func__, result);
+ }
+ if (trs.b.blocked) {
+ rtpcp = per_cpu_ptr(rcu_tasks_trace.rtpcpu, t->trc_blkd_cpu);
+ raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
+ list_del_init(&t->trc_blkd_node);
+ WRITE_ONCE(t->trc_reader_special.b.blocked, false);
+ raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
+ }
+ WRITE_ONCE(t->trc_reader_nesting, 0);
}
EXPORT_SYMBOL_GPL(rcu_read_unlock_trace_special);
+/* Add a newly blocked reader task to its CPU's list. */
+void rcu_tasks_trace_qs_blkd(struct task_struct *t)
+{
+ unsigned long flags;
+ struct rcu_tasks_percpu *rtpcp;
+
+ local_irq_save(flags);
+ rtpcp = this_cpu_ptr(rcu_tasks_trace.rtpcpu);
+ raw_spin_lock_rcu_node(rtpcp); // irqs already disabled
+ t->trc_blkd_cpu = smp_processor_id();
+ if (!rtpcp->rtp_blkd_tasks.next)
+ INIT_LIST_HEAD(&rtpcp->rtp_blkd_tasks);
+ list_add(&t->trc_blkd_node, &rtpcp->rtp_blkd_tasks);
+ WRITE_ONCE(t->trc_reader_special.b.blocked, true);
+ raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
+}
+EXPORT_SYMBOL_GPL(rcu_tasks_trace_qs_blkd);
+
/* Add a task to the holdout list, if it is not already on the list. */
static void trc_add_holdout(struct task_struct *t, struct list_head *bhp)
{
if (list_empty(&t->trc_holdout_list)) {
get_task_struct(t);
list_add(&t->trc_holdout_list, bhp);
+ n_trc_holdouts++;
}
}
@@ -876,97 +1367,91 @@ static void trc_del_holdout(struct task_struct *t)
if (!list_empty(&t->trc_holdout_list)) {
list_del_init(&t->trc_holdout_list);
put_task_struct(t);
+ n_trc_holdouts--;
}
}
/* IPI handler to check task state. */
static void trc_read_check_handler(void *t_in)
{
+ int nesting;
struct task_struct *t = current;
struct task_struct *texp = t_in;
// If the task is no longer running on this CPU, leave.
- if (unlikely(texp != t)) {
- if (WARN_ON_ONCE(atomic_dec_and_test(&trc_n_readers_need_end)))
- wake_up(&trc_wait);
+ if (unlikely(texp != t))
goto reset_ipi; // Already on holdout list, so will check later.
- }
// If the task is not in a read-side critical section, and
// if this is the last reader, awaken the grace-period kthread.
- if (likely(!t->trc_reader_nesting)) {
- if (WARN_ON_ONCE(atomic_dec_and_test(&trc_n_readers_need_end)))
- wake_up(&trc_wait);
- // Mark as checked after decrement to avoid false
- // positives on the above WARN_ON_ONCE().
- WRITE_ONCE(t->trc_reader_checked, true);
+ nesting = READ_ONCE(t->trc_reader_nesting);
+ if (likely(!nesting)) {
+ rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS_CHECKED);
goto reset_ipi;
}
// If we are racing with an rcu_read_unlock_trace(), try again later.
- if (unlikely(t->trc_reader_nesting < 0)) {
- if (WARN_ON_ONCE(atomic_dec_and_test(&trc_n_readers_need_end)))
- wake_up(&trc_wait);
+ if (unlikely(nesting < 0))
goto reset_ipi;
- }
- WRITE_ONCE(t->trc_reader_checked, true);
- // Get here if the task is in a read-side critical section. Set
- // its state so that it will awaken the grace-period kthread upon
- // exit from that critical section.
- WARN_ON_ONCE(t->trc_reader_special.b.need_qs);
- WRITE_ONCE(t->trc_reader_special.b.need_qs, true);
+ // Get here if the task is in a read-side critical section.
+ // Set its state so that it will update state for the grace-period
+ // kthread upon exit from that critical section.
+ rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS | TRC_NEED_QS_CHECKED);
reset_ipi:
// Allow future IPIs to be sent on CPU and for task.
// Also order this IPI handler against any later manipulations of
// the intended task.
- smp_store_release(&per_cpu(trc_ipi_to_cpu, smp_processor_id()), false); // ^^^
+ smp_store_release(per_cpu_ptr(&trc_ipi_to_cpu, smp_processor_id()), false); // ^^^
smp_store_release(&texp->trc_ipi_to_cpu, -1); // ^^^
}
/* Callback function for scheduler to check locked-down task. */
-static bool trc_inspect_reader(struct task_struct *t, void *arg)
+static int trc_inspect_reader(struct task_struct *t, void *bhp_in)
{
+ struct list_head *bhp = bhp_in;
int cpu = task_cpu(t);
- bool in_qs = false;
+ int nesting;
bool ofl = cpu_is_offline(cpu);
- if (task_curr(t)) {
- WARN_ON_ONCE(ofl && !is_idle_task(t));
-
+ if (task_curr(t) && !ofl) {
// If no chance of heavyweight readers, do it the hard way.
- if (!ofl && !IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB))
- return false;
+ if (!IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB))
+ return -EINVAL;
// If heavyweight readers are enabled on the remote task,
// we can inspect its state despite its currently running.
// However, we cannot safely change its state.
n_heavy_reader_attempts++;
- if (!ofl && // Check for "running" idle tasks on offline CPUs.
- !rcu_dynticks_zero_in_eqs(cpu, &t->trc_reader_nesting))
- return false; // No quiescent state, do it the hard way.
+ // Check for "running" idle tasks on offline CPUs.
+ if (!rcu_dynticks_zero_in_eqs(cpu, &t->trc_reader_nesting))
+ return -EINVAL; // No quiescent state, do it the hard way.
n_heavy_reader_updates++;
- if (ofl)
- n_heavy_reader_ofl_updates++;
- in_qs = true;
+ nesting = 0;
} else {
- in_qs = likely(!t->trc_reader_nesting);
+ // The task is not running, so C-language access is safe.
+ nesting = t->trc_reader_nesting;
+ WARN_ON_ONCE(ofl && task_curr(t) && !is_idle_task(t));
+ if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) && ofl)
+ n_heavy_reader_ofl_updates++;
}
- // Mark as checked so that the grace-period kthread will
- // remove it from the holdout list.
- t->trc_reader_checked = true;
-
- if (in_qs)
- return true; // Already in quiescent state, done!!!
+ // If not exiting a read-side critical section, mark as checked
+ // so that the grace-period kthread will remove it from the
+ // holdout list.
+ if (!nesting) {
+ rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS_CHECKED);
+ return 0; // In QS, so done.
+ }
+ if (nesting < 0)
+ return -EINVAL; // Reader transitioning, try again later.
// The task is in a read-side critical section, so set up its
- // state so that it will awaken the grace-period kthread upon exit
- // from that critical section.
- atomic_inc(&trc_n_readers_need_end); // One more to wait on.
- WARN_ON_ONCE(t->trc_reader_special.b.need_qs);
- WRITE_ONCE(t->trc_reader_special.b.need_qs, true);
- return true;
+ // state so that it will update state upon exit from that critical
+ // section.
+ if (!rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS | TRC_NEED_QS_CHECKED))
+ trc_add_holdout(t, bhp);
+ return 0;
}
/* Attempt to extract the state for the specified task. */
@@ -981,19 +1466,25 @@ static void trc_wait_for_one_reader(struct task_struct *t,
// The current task had better be in a quiescent state.
if (t == current) {
- t->trc_reader_checked = true;
- WARN_ON_ONCE(t->trc_reader_nesting);
+ rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS_CHECKED);
+ WARN_ON_ONCE(READ_ONCE(t->trc_reader_nesting));
return;
}
// Attempt to nail down the task for inspection.
get_task_struct(t);
- if (try_invoke_on_locked_down_task(t, trc_inspect_reader, NULL)) {
+ if (!task_call_func(t, trc_inspect_reader, bhp)) {
put_task_struct(t);
return;
}
put_task_struct(t);
+ // If this task is not yet on the holdout list, then we are in
+ // an RCU read-side critical section. Otherwise, the invocation of
+ // trc_add_holdout() that added it to the list did the necessary
+ // get_task_struct(). Either way, the task cannot be freed out
+ // from under this code.
+
// If currently running, send an IPI, either way, add to list.
trc_add_holdout(t, bhp);
if (task_curr(t) &&
@@ -1005,96 +1496,161 @@ static void trc_wait_for_one_reader(struct task_struct *t,
if (per_cpu(trc_ipi_to_cpu, cpu) || t->trc_ipi_to_cpu >= 0)
return;
- atomic_inc(&trc_n_readers_need_end);
per_cpu(trc_ipi_to_cpu, cpu) = true;
t->trc_ipi_to_cpu = cpu;
rcu_tasks_trace.n_ipis++;
- if (smp_call_function_single(cpu,
- trc_read_check_handler, t, 0)) {
+ if (smp_call_function_single(cpu, trc_read_check_handler, t, 0)) {
// Just in case there is some other reason for
// failure than the target CPU being offline.
+ WARN_ONCE(1, "%s(): smp_call_function_single() failed for CPU: %d\n",
+ __func__, cpu);
rcu_tasks_trace.n_ipis_fails++;
per_cpu(trc_ipi_to_cpu, cpu) = false;
- t->trc_ipi_to_cpu = cpu;
- if (atomic_dec_and_test(&trc_n_readers_need_end)) {
- WARN_ON_ONCE(1);
- wake_up(&trc_wait);
- }
+ t->trc_ipi_to_cpu = -1;
}
}
}
+/*
+ * Initialize for first-round processing for the specified task.
+ * Return false if task is NULL or already taken care of, true otherwise.
+ */
+static bool rcu_tasks_trace_pertask_prep(struct task_struct *t, bool notself)
+{
+ // During early boot when there is only the one boot CPU, there
+ // is no idle task for the other CPUs. Also, the grace-period
+ // kthread is always in a quiescent state. In addition, just return
+ // if this task is already on the list.
+ if (unlikely(t == NULL) || (t == current && notself) || !list_empty(&t->trc_holdout_list))
+ return false;
+
+ rcu_st_need_qs(t, 0);
+ t->trc_ipi_to_cpu = -1;
+ return true;
+}
+
+/* Do first-round processing for the specified task. */
+static void rcu_tasks_trace_pertask(struct task_struct *t, struct list_head *hop)
+{
+ if (rcu_tasks_trace_pertask_prep(t, true))
+ trc_wait_for_one_reader(t, hop);
+}
+
/* Initialize for a new RCU-tasks-trace grace period. */
-static void rcu_tasks_trace_pregp_step(void)
+static void rcu_tasks_trace_pregp_step(struct list_head *hop)
{
+ LIST_HEAD(blkd_tasks);
int cpu;
-
- // Allow for fast-acting IPIs.
- atomic_set(&trc_n_readers_need_end, 1);
+ unsigned long flags;
+ struct rcu_tasks_percpu *rtpcp;
+ struct task_struct *t;
// There shouldn't be any old IPIs, but...
for_each_possible_cpu(cpu)
WARN_ON_ONCE(per_cpu(trc_ipi_to_cpu, cpu));
- // Disable CPU hotplug across the tasklist scan.
- // This also waits for all readers in CPU-hotplug code paths.
+ // Disable CPU hotplug across the CPU scan for the benefit of
+ // any IPIs that might be needed. This also waits for all readers
+ // in CPU-hotplug code paths.
cpus_read_lock();
-}
-/* Do first-round processing for the specified task. */
-static void rcu_tasks_trace_pertask(struct task_struct *t,
- struct list_head *hop)
-{
- // During early boot when there is only the one boot CPU, there
- // is no idle task for the other CPUs. Just return.
- if (unlikely(t == NULL))
- return;
+ // These rcu_tasks_trace_pertask_prep() calls are serialized to
+ // allow safe access to the hop list.
+ for_each_online_cpu(cpu) {
+ rcu_read_lock();
+ t = cpu_curr_snapshot(cpu);
+ if (rcu_tasks_trace_pertask_prep(t, true))
+ trc_add_holdout(t, hop);
+ rcu_read_unlock();
+ cond_resched_tasks_rcu_qs();
+ }
- WRITE_ONCE(t->trc_reader_special.b.need_qs, false);
- WRITE_ONCE(t->trc_reader_checked, false);
- t->trc_ipi_to_cpu = -1;
- trc_wait_for_one_reader(t, hop);
+ // Only after all running tasks have been accounted for is it
+ // safe to take care of the tasks that have blocked within their
+ // current RCU tasks trace read-side critical section.
+ for_each_possible_cpu(cpu) {
+ rtpcp = per_cpu_ptr(rcu_tasks_trace.rtpcpu, cpu);
+ raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
+ list_splice_init(&rtpcp->rtp_blkd_tasks, &blkd_tasks);
+ while (!list_empty(&blkd_tasks)) {
+ rcu_read_lock();
+ t = list_first_entry(&blkd_tasks, struct task_struct, trc_blkd_node);
+ list_del_init(&t->trc_blkd_node);
+ list_add(&t->trc_blkd_node, &rtpcp->rtp_blkd_tasks);
+ raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
+ rcu_tasks_trace_pertask(t, hop);
+ rcu_read_unlock();
+ raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
+ }
+ raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
+ cond_resched_tasks_rcu_qs();
+ }
+
+ // Re-enable CPU hotplug now that the holdout list is populated.
+ cpus_read_unlock();
}
/*
- * Do intermediate processing between task and holdout scans and
- * pick up the idle tasks.
+ * Do intermediate processing between task and holdout scans.
*/
static void rcu_tasks_trace_postscan(struct list_head *hop)
{
- int cpu;
-
- for_each_possible_cpu(cpu)
- rcu_tasks_trace_pertask(idle_task(cpu), hop);
-
- // Re-enable CPU hotplug now that the tasklist scan has completed.
- cpus_read_unlock();
-
// Wait for late-stage exiting tasks to finish exiting.
// These might have passed the call to exit_tasks_rcu_finish().
+
+ // If you remove the following line, update rcu_trace_implies_rcu_gp()!!!
synchronize_rcu();
- // Any tasks that exit after this point will set ->trc_reader_checked.
+ // Any tasks that exit after this point will set
+ // TRC_NEED_QS_CHECKED in ->trc_reader_special.b.need_qs.
+}
+
+/* Communicate task state back to the RCU tasks trace stall warning request. */
+struct trc_stall_chk_rdr {
+ int nesting;
+ int ipi_to_cpu;
+ u8 needqs;
+};
+
+static int trc_check_slow_task(struct task_struct *t, void *arg)
+{
+ struct trc_stall_chk_rdr *trc_rdrp = arg;
+
+ if (task_curr(t) && cpu_online(task_cpu(t)))
+ return false; // It is running, so decline to inspect it.
+ trc_rdrp->nesting = READ_ONCE(t->trc_reader_nesting);
+ trc_rdrp->ipi_to_cpu = READ_ONCE(t->trc_ipi_to_cpu);
+ trc_rdrp->needqs = rcu_ld_need_qs(t);
+ return true;
}
/* Show the state of a task stalling the current RCU tasks trace GP. */
static void show_stalled_task_trace(struct task_struct *t, bool *firstreport)
{
int cpu;
+ struct trc_stall_chk_rdr trc_rdr;
+ bool is_idle_tsk = is_idle_task(t);
if (*firstreport) {
pr_err("INFO: rcu_tasks_trace detected stalls on tasks:\n");
*firstreport = false;
}
- // FIXME: This should attempt to use try_invoke_on_nonrunning_task().
cpu = task_cpu(t);
- pr_alert("P%d: %c%c%c nesting: %d%c cpu: %d\n",
- t->pid,
- ".I"[READ_ONCE(t->trc_ipi_to_cpu) > 0],
- ".i"[is_idle_task(t)],
- ".N"[cpu > 0 && tick_nohz_full_cpu(cpu)],
- t->trc_reader_nesting,
- " N"[!!t->trc_reader_special.b.need_qs],
- cpu);
+ if (!task_call_func(t, trc_check_slow_task, &trc_rdr))
+ pr_alert("P%d: %c%c\n",
+ t->pid,
+ ".I"[t->trc_ipi_to_cpu >= 0],
+ ".i"[is_idle_tsk]);
+ else
+ pr_alert("P%d: %c%c%c%c nesting: %d%c%c cpu: %d%s\n",
+ t->pid,
+ ".I"[trc_rdr.ipi_to_cpu >= 0],
+ ".i"[is_idle_tsk],
+ ".N"[cpu >= 0 && tick_nohz_full_cpu(cpu)],
+ ".B"[!!data_race(t->trc_reader_special.b.blocked)],
+ trc_rdr.nesting,
+ " !CN"[trc_rdr.needqs & 0x3],
+ " ?"[trc_rdr.needqs > 0x3],
+ cpu, cpu_online(cpu) ? "" : "(offline)");
sched_show_task(t);
}
@@ -1114,71 +1670,52 @@ static void check_all_holdout_tasks_trace(struct list_head *hop,
{
struct task_struct *g, *t;
- // Disable CPU hotplug across the holdout list scan.
+ // Disable CPU hotplug across the holdout list scan for IPIs.
cpus_read_lock();
list_for_each_entry_safe(t, g, hop, trc_holdout_list) {
// If safe and needed, try to check the current task.
if (READ_ONCE(t->trc_ipi_to_cpu) == -1 &&
- !READ_ONCE(t->trc_reader_checked))
+ !(rcu_ld_need_qs(t) & TRC_NEED_QS_CHECKED))
trc_wait_for_one_reader(t, hop);
// If check succeeded, remove this task from the list.
- if (READ_ONCE(t->trc_reader_checked))
+ if (smp_load_acquire(&t->trc_ipi_to_cpu) == -1 &&
+ rcu_ld_need_qs(t) == TRC_NEED_QS_CHECKED)
trc_del_holdout(t);
else if (needreport)
show_stalled_task_trace(t, firstreport);
+ cond_resched_tasks_rcu_qs();
}
// Re-enable CPU hotplug now that the holdout list scan has completed.
cpus_read_unlock();
if (needreport) {
- if (firstreport)
+ if (*firstreport)
pr_err("INFO: rcu_tasks_trace detected stalls? (Late IPI?)\n");
show_stalled_ipi_trace();
}
}
+static void rcu_tasks_trace_empty_fn(void *unused)
+{
+}
+
/* Wait for grace period to complete and provide ordering. */
static void rcu_tasks_trace_postgp(struct rcu_tasks *rtp)
{
- bool firstreport;
- struct task_struct *g, *t;
- LIST_HEAD(holdouts);
- long ret;
+ int cpu;
- // Remove the safety count.
- smp_mb__before_atomic(); // Order vs. earlier atomics
- atomic_dec(&trc_n_readers_need_end);
- smp_mb__after_atomic(); // Order vs. later atomics
+ // Wait for any lingering IPI handlers to complete. Note that
+ // if a CPU has gone offline or transitioned to userspace in the
+ // meantime, all IPI handlers should have been drained beforehand.
+ // Yes, this assumes that CPUs process IPIs in order. If that ever
+ // changes, there will need to be a recheck and/or timed wait.
+ for_each_online_cpu(cpu)
+ if (WARN_ON_ONCE(smp_load_acquire(per_cpu_ptr(&trc_ipi_to_cpu, cpu))))
+ smp_call_function_single(cpu, rcu_tasks_trace_empty_fn, NULL, 1);
- // Wait for readers.
- set_tasks_gp_state(rtp, RTGS_WAIT_READERS);
- for (;;) {
- ret = wait_event_idle_exclusive_timeout(
- trc_wait,
- atomic_read(&trc_n_readers_need_end) == 0,
- READ_ONCE(rcu_task_stall_timeout));
- if (ret)
- break; // Count reached zero.
- // Stall warning time, so make a list of the offenders.
- rcu_read_lock();
- for_each_process_thread(g, t)
- if (READ_ONCE(t->trc_reader_special.b.need_qs))
- trc_add_holdout(t, &holdouts);
- rcu_read_unlock();
- firstreport = true;
- list_for_each_entry_safe(t, g, &holdouts, trc_holdout_list) {
- if (READ_ONCE(t->trc_reader_special.b.need_qs))
- show_stalled_task_trace(t, &firstreport);
- trc_del_holdout(t); // Release task_struct reference.
- }
- if (firstreport)
- pr_err("INFO: rcu_tasks_trace detected stalls? (Counter/taskslist mismatch?)\n");
- show_stalled_ipi_trace();
- pr_err("\t%d holdouts\n", atomic_read(&trc_n_readers_need_end));
- }
smp_mb(); // Caller's code must be ordered after wakeup.
// Pairs with pretty much every ordering primitive.
}
@@ -1186,11 +1723,14 @@ static void rcu_tasks_trace_postgp(struct rcu_tasks *rtp)
/* Report any needed quiescent state for this exiting task. */
static void exit_tasks_rcu_finish_trace(struct task_struct *t)
{
- WRITE_ONCE(t->trc_reader_checked, true);
- WARN_ON_ONCE(t->trc_reader_nesting);
- WRITE_ONCE(t->trc_reader_nesting, 0);
- if (WARN_ON_ONCE(READ_ONCE(t->trc_reader_special.b.need_qs)))
- rcu_read_unlock_trace_special(t, 0);
+ union rcu_special trs = READ_ONCE(t->trc_reader_special);
+
+ rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS_CHECKED);
+ WARN_ON_ONCE(READ_ONCE(t->trc_reader_nesting));
+ if (WARN_ON_ONCE(rcu_ld_need_qs(t) & TRC_NEED_QS || trs.b.blocked))
+ rcu_read_unlock_trace_special(t);
+ else
+ WRITE_ONCE(t->trc_reader_nesting, 0);
}
/**
@@ -1198,15 +1738,11 @@ static void exit_tasks_rcu_finish_trace(struct task_struct *t)
* @rhp: structure to be used for queueing the RCU updates.
* @func: actual callback function to be invoked after the grace period
*
- * The callback function will be invoked some time after a full grace
- * period elapses, in other words after all currently executing RCU
- * read-side critical sections have completed. call_rcu_tasks_trace()
- * assumes that the read-side critical sections end at context switch,
- * cond_resched_rcu_qs(), or transition to usermode execution. As such,
- * there are no read-side primitives analogous to rcu_read_lock() and
- * rcu_read_unlock() because this primitive is intended to determine
- * that all tasks have passed through a safe state, not so much for
- * data-structure synchronization.
+ * The callback function will be invoked some time after a trace rcu-tasks
+ * grace period elapses, in other words after all currently executing
+ * trace rcu-tasks read-side critical sections have completed. These
+ * read-side critical sections are delimited by calls to rcu_read_lock_trace()
+ * and rcu_read_unlock_trace().
*
* See the description of call_rcu() for more detailed information on
* memory ordering guarantees.
@@ -1222,7 +1758,7 @@ EXPORT_SYMBOL_GPL(call_rcu_tasks_trace);
*
* Control will return to the caller some time after a trace rcu-tasks
* grace period has elapsed, in other words after all currently executing
- * rcu-tasks read-side critical sections have elapsed. These read-side
+ * trace rcu-tasks read-side critical sections have elapsed. These read-side
* critical sections are delimited by calls to rcu_read_lock_trace()
* and rcu_read_unlock_trace().
*
@@ -1249,13 +1785,13 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_tasks_trace);
*/
void rcu_barrier_tasks_trace(void)
{
- /* There is only one callback queue, so this is easy. ;-) */
- synchronize_rcu_tasks_trace();
+ rcu_barrier_tasks_generic(&rcu_tasks_trace);
}
EXPORT_SYMBOL_GPL(rcu_barrier_tasks_trace);
static int __init rcu_spawn_tasks_trace_kthread(void)
{
+ cblist_init_generic(&rcu_tasks_trace);
if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) {
rcu_tasks_trace.gp_sleep = HZ / 10;
rcu_tasks_trace.init_fract = HZ / 10;
@@ -1268,7 +1804,6 @@ static int __init rcu_spawn_tasks_trace_kthread(void)
rcu_tasks_trace.init_fract = 1;
}
rcu_tasks_trace.pregp_func = rcu_tasks_trace_pregp_step;
- rcu_tasks_trace.pertask_func = rcu_tasks_trace_pertask;
rcu_tasks_trace.postscan_func = rcu_tasks_trace_postscan;
rcu_tasks_trace.holdouts_func = check_all_holdout_tasks_trace;
rcu_tasks_trace.postgp_func = rcu_tasks_trace_postgp;
@@ -1281,7 +1816,8 @@ void show_rcu_tasks_trace_gp_kthread(void)
{
char buf[64];
- sprintf(buf, "N%d h:%lu/%lu/%lu", atomic_read(&trc_n_readers_need_end),
+ sprintf(buf, "N%lu h:%lu/%lu/%lu",
+ data_race(n_trc_holdouts),
data_race(n_heavy_reader_ofl_updates),
data_race(n_heavy_reader_updates),
data_race(n_heavy_reader_attempts));
@@ -1308,23 +1844,24 @@ struct rcu_tasks_test_desc {
struct rcu_head rh;
const char *name;
bool notrun;
+ unsigned long runstart;
};
static struct rcu_tasks_test_desc tests[] = {
{
.name = "call_rcu_tasks()",
/* If not defined, the test is skipped. */
- .notrun = !IS_ENABLED(CONFIG_TASKS_RCU),
+ .notrun = IS_ENABLED(CONFIG_TASKS_RCU),
},
{
.name = "call_rcu_tasks_rude()",
/* If not defined, the test is skipped. */
- .notrun = !IS_ENABLED(CONFIG_TASKS_RUDE_RCU),
+ .notrun = IS_ENABLED(CONFIG_TASKS_RUDE_RCU),
},
{
.name = "call_rcu_tasks_trace()",
/* If not defined, the test is skipped. */
- .notrun = !IS_ENABLED(CONFIG_TASKS_TRACE_RCU)
+ .notrun = IS_ENABLED(CONFIG_TASKS_TRACE_RCU)
}
};
@@ -1335,46 +1872,83 @@ static void test_rcu_tasks_callback(struct rcu_head *rhp)
pr_info("Callback from %s invoked.\n", rttd->name);
- rttd->notrun = true;
+ rttd->notrun = false;
}
static void rcu_tasks_initiate_self_tests(void)
{
pr_info("Running RCU-tasks wait API self tests\n");
#ifdef CONFIG_TASKS_RCU
+ tests[0].runstart = jiffies;
synchronize_rcu_tasks();
call_rcu_tasks(&tests[0].rh, test_rcu_tasks_callback);
#endif
#ifdef CONFIG_TASKS_RUDE_RCU
+ tests[1].runstart = jiffies;
synchronize_rcu_tasks_rude();
call_rcu_tasks_rude(&tests[1].rh, test_rcu_tasks_callback);
#endif
#ifdef CONFIG_TASKS_TRACE_RCU
+ tests[2].runstart = jiffies;
synchronize_rcu_tasks_trace();
call_rcu_tasks_trace(&tests[2].rh, test_rcu_tasks_callback);
#endif
}
+/*
+ * Return: 0 - test passed
+ * 1 - test failed, but have not timed out yet
+ * -1 - test failed and timed out
+ */
static int rcu_tasks_verify_self_tests(void)
{
int ret = 0;
int i;
+ unsigned long bst = rcu_task_stall_timeout;
+ if (bst <= 0 || bst > RCU_TASK_BOOT_STALL_TIMEOUT)
+ bst = RCU_TASK_BOOT_STALL_TIMEOUT;
for (i = 0; i < ARRAY_SIZE(tests); i++) {
- if (!tests[i].notrun) { // still hanging.
- pr_err("%s has been failed.\n", tests[i].name);
- ret = -1;
+ while (tests[i].notrun) { // still hanging.
+ if (time_after(jiffies, tests[i].runstart + bst)) {
+ pr_err("%s has failed boot-time tests.\n", tests[i].name);
+ ret = -1;
+ break;
+ }
+ ret = 1;
+ break;
}
}
-
- if (ret)
- WARN_ON(1);
+ WARN_ON(ret < 0);
return ret;
}
-late_initcall(rcu_tasks_verify_self_tests);
+
+/*
+ * Repeat the rcu_tasks_verify_self_tests() call once every second until the
+ * test passes or has timed out.
+ */
+static struct delayed_work rcu_tasks_verify_work;
+static void rcu_tasks_verify_work_fn(struct work_struct *work __maybe_unused)
+{
+ int ret = rcu_tasks_verify_self_tests();
+
+ if (ret <= 0)
+ return;
+
+ /* Test fails but not timed out yet, reschedule another check */
+ schedule_delayed_work(&rcu_tasks_verify_work, HZ);
+}
+
+static int rcu_tasks_verify_schedule_work(void)
+{
+ INIT_DELAYED_WORK(&rcu_tasks_verify_work, rcu_tasks_verify_work_fn);
+ rcu_tasks_verify_work_fn(NULL);
+ return 0;
+}
+late_initcall(rcu_tasks_verify_schedule_work);
#else /* #ifdef CONFIG_PROVE_RCU */
static void rcu_tasks_initiate_self_tests(void) { }
#endif /* #else #ifdef CONFIG_PROVE_RCU */
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index 340b3f8b090d..42f7589e51e0 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -44,7 +44,7 @@ static struct rcu_ctrlblk rcu_ctrlblk = {
void rcu_barrier(void)
{
- wait_rcu_gp(call_rcu);
+ wait_rcu_gp(call_rcu_hurry);
}
EXPORT_SYMBOL(rcu_barrier);
@@ -58,7 +58,7 @@ void rcu_qs(void)
rcu_ctrlblk.donetail = rcu_ctrlblk.curtail;
raise_softirq_irqoff(RCU_SOFTIRQ);
}
- WRITE_ONCE(rcu_ctrlblk.gp_seq, rcu_ctrlblk.gp_seq + 1);
+ WRITE_ONCE(rcu_ctrlblk.gp_seq, rcu_ctrlblk.gp_seq + 2);
local_irq_restore(flags);
}
@@ -139,8 +139,10 @@ static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused
/*
* Wait for a grace period to elapse. But it is illegal to invoke
* synchronize_rcu() from within an RCU read-side critical section.
- * Therefore, any legal call to synchronize_rcu() is a quiescent
- * state, and so on a UP system, synchronize_rcu() need do nothing.
+ * Therefore, any legal call to synchronize_rcu() is a quiescent state,
+ * and so on a UP system, synchronize_rcu() need do nothing, other than
+ * let the polled APIs know that another grace period elapsed.
+ *
* (But Lai Jiangshan points out the benefits of doing might_sleep()
* to reduce latency.)
*
@@ -152,9 +154,14 @@ void synchronize_rcu(void)
lock_is_held(&rcu_lock_map) ||
lock_is_held(&rcu_sched_lock_map),
"Illegal synchronize_rcu() in RCU read-side critical section");
+ WRITE_ONCE(rcu_ctrlblk.gp_seq, rcu_ctrlblk.gp_seq + 2);
}
EXPORT_SYMBOL_GPL(synchronize_rcu);
+static void tiny_rcu_leak_callback(struct rcu_head *rhp)
+{
+}
+
/*
* Post an RCU callback to be invoked after the end of an RCU grace
* period. But since we have but one CPU, that would be after any
@@ -162,9 +169,20 @@ EXPORT_SYMBOL_GPL(synchronize_rcu);
*/
void call_rcu(struct rcu_head *head, rcu_callback_t func)
{
+ static atomic_t doublefrees;
unsigned long flags;
- debug_rcu_head_queue(head);
+ if (debug_rcu_head_queue(head)) {
+ if (atomic_inc_return(&doublefrees) < 4) {
+ pr_err("%s(): Double-freed CB %p->%pS()!!! ", __func__, head, head->func);
+ mem_dump_obj(head);
+ }
+
+ if (!__is_kvfree_rcu_offset((unsigned long)head->func))
+ WRITE_ONCE(head->func, tiny_rcu_leak_callback);
+ return;
+ }
+
head->func = func;
head->next = NULL;
@@ -181,6 +199,16 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func)
EXPORT_SYMBOL_GPL(call_rcu);
/*
+ * Store a grace-period-counter "cookie". For more information,
+ * see the Tree RCU header comment.
+ */
+void get_completed_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
+{
+ rgosp->rgos_norm = RCU_GET_STATE_COMPLETED;
+}
+EXPORT_SYMBOL_GPL(get_completed_synchronize_rcu_full);
+
+/*
* Return a grace-period-counter "cookie". For more information,
* see the Tree RCU header comment.
*/
@@ -213,10 +241,21 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu);
*/
bool poll_state_synchronize_rcu(unsigned long oldstate)
{
- return READ_ONCE(rcu_ctrlblk.gp_seq) != oldstate;
+ return oldstate == RCU_GET_STATE_COMPLETED || READ_ONCE(rcu_ctrlblk.gp_seq) != oldstate;
}
EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu);
+#ifdef CONFIG_KASAN_GENERIC
+void kvfree_call_rcu(struct rcu_head *head, void *ptr)
+{
+ if (head)
+ kasan_record_aux_stack_noalloc(ptr);
+
+ __kvfree_call_rcu(head, ptr);
+}
+EXPORT_SYMBOL_GPL(kvfree_call_rcu);
+#endif
+
void __init rcu_init(void)
{
open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 51f24ecd94b2..f52ff7241041 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -62,6 +62,7 @@
#include <linux/vmalloc.h>
#include <linux/mm.h>
#include <linux/kasan.h>
+#include <linux/context_tracking.h>
#include "../time/tick-internal.h"
#include "tree.h"
@@ -74,19 +75,10 @@
/* Data structures. */
-/*
- * Steal a bit from the bottom of ->dynticks for idle entry/exit
- * control. Initially this is for TLB flushing.
- */
-#define RCU_DYNTICK_CTRL_MASK 0x1
-#define RCU_DYNTICK_CTRL_CTR (RCU_DYNTICK_CTRL_MASK + 1)
-
static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = {
- .dynticks_nesting = 1,
- .dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE,
- .dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR),
+ .gpwrap = true,
#ifdef CONFIG_RCU_NOCB_CPU
- .cblist.flags = SEGCBLIST_SOFTIRQ_ONLY,
+ .cblist.flags = SEGCBLIST_RCU_CORE,
#endif
};
static struct rcu_state rcu_state = {
@@ -94,11 +86,12 @@ static struct rcu_state rcu_state = {
.gp_state = RCU_GP_IDLE,
.gp_seq = (0UL - 300UL) << RCU_SEQ_CTR_SHIFT,
.barrier_mutex = __MUTEX_INITIALIZER(rcu_state.barrier_mutex),
+ .barrier_lock = __RAW_SPIN_LOCK_UNLOCKED(rcu_state.barrier_lock),
.name = RCU_NAME,
.abbr = RCU_ABBR,
.exp_mutex = __MUTEX_INITIALIZER(rcu_state.exp_mutex),
.exp_wake_mutex = __MUTEX_INITIALIZER(rcu_state.exp_wake_mutex),
- .ofl_lock = __RAW_SPIN_LOCK_UNLOCKED(rcu_state.ofl_lock),
+ .ofl_lock = __ARCH_SPIN_LOCK_UNLOCKED,
};
/* Dump rcu_node combining tree at boot to verify correct setup. */
@@ -151,16 +144,22 @@ static int rcu_scheduler_fully_active __read_mostly;
static void rcu_report_qs_rnp(unsigned long mask, struct rcu_node *rnp,
unsigned long gps, unsigned long flags);
-static void rcu_init_new_rnp(struct rcu_node *rnp_leaf);
-static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf);
static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
static void invoke_rcu_core(void);
static void rcu_report_exp_rdp(struct rcu_data *rdp);
static void sync_sched_exp_online_cleanup(int cpu);
static void check_cb_ovld_locked(struct rcu_data *rdp, struct rcu_node *rnp);
static bool rcu_rdp_is_offloaded(struct rcu_data *rdp);
+static bool rcu_rdp_cpu_online(struct rcu_data *rdp);
+static bool rcu_init_invoked(void);
+static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf);
+static void rcu_init_new_rnp(struct rcu_node *rnp_leaf);
-/* rcuc/rcub kthread realtime priority */
+/*
+ * rcuc/rcub/rcuop kthread realtime priority. The "rcuop"
+ * real-time priority(enabling/disabling) is controlled by
+ * the extra CONFIG_RCU_NOCB_CPU_CB_BOOST configuration.
+ */
static int kthread_prio = IS_ENABLED(CONFIG_RCU_BOOST) ? 1 : 0;
module_param(kthread_prio, int, 0444);
@@ -218,17 +217,6 @@ EXPORT_SYMBOL_GPL(rcu_get_gp_kthreads_prio);
#define PER_RCU_NODE_PERIOD 3 /* Number of grace periods between delays for debugging. */
/*
- * Compute the mask of online CPUs for the specified rcu_node structure.
- * This will not be stable unless the rcu_node structure's ->lock is
- * held, but the bit corresponding to the current CPU will be stable
- * in most contexts.
- */
-static unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp)
-{
- return READ_ONCE(rnp->qsmaskinitnext);
-}
-
-/*
* Return true if an RCU grace period is in progress. The READ_ONCE()s
* permit this function to be invoked without holding the root rcu_node
* structure's ->lock, but of course results can be subject to change.
@@ -259,58 +247,6 @@ void rcu_softirq_qs(void)
}
/*
- * Record entry into an extended quiescent state. This is only to be
- * called when not already in an extended quiescent state, that is,
- * RCU is watching prior to the call to this function and is no longer
- * watching upon return.
- */
-static noinstr void rcu_dynticks_eqs_enter(void)
-{
- struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
- int seq;
-
- /*
- * CPUs seeing atomic_add_return() must see prior RCU read-side
- * critical sections, and we also must force ordering with the
- * next idle sojourn.
- */
- rcu_dynticks_task_trace_enter(); // Before ->dynticks update!
- seq = arch_atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks);
- // RCU is no longer watching. Better be in extended quiescent state!
- WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
- (seq & RCU_DYNTICK_CTRL_CTR));
- /* Better not have special action (TLB flush) pending! */
- WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
- (seq & RCU_DYNTICK_CTRL_MASK));
-}
-
-/*
- * Record exit from an extended quiescent state. This is only to be
- * called from an extended quiescent state, that is, RCU is not watching
- * prior to the call to this function and is watching upon return.
- */
-static noinstr void rcu_dynticks_eqs_exit(void)
-{
- struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
- int seq;
-
- /*
- * CPUs seeing atomic_add_return() must see prior idle sojourns,
- * and we also must force ordering with the next RCU read-side
- * critical section.
- */
- seq = arch_atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks);
- // RCU is now watching. Better not be in an extended quiescent state!
- rcu_dynticks_task_trace_exit(); // After ->dynticks update!
- WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
- !(seq & RCU_DYNTICK_CTRL_CTR));
- if (seq & RCU_DYNTICK_CTRL_MASK) {
- arch_atomic_andnot(RCU_DYNTICK_CTRL_MASK, &rdp->dynticks);
- smp_mb__after_atomic(); /* _exit after clearing mask. */
- }
-}
-
-/*
* Reset the current CPU's ->dynticks counter to indicate that the
* newly onlined CPU is no longer in an extended quiescent state.
* This will either leave the counter unchanged, or increment it
@@ -322,34 +258,19 @@ static noinstr void rcu_dynticks_eqs_exit(void)
*/
static void rcu_dynticks_eqs_online(void)
{
- struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
-
- if (atomic_read(&rdp->dynticks) & RCU_DYNTICK_CTRL_CTR)
+ if (ct_dynticks() & RCU_DYNTICKS_IDX)
return;
- atomic_add(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks);
-}
-
-/*
- * Is the current CPU in an extended quiescent state?
- *
- * No ordering, as we are sampling CPU-local information.
- */
-static __always_inline bool rcu_dynticks_curr_cpu_in_eqs(void)
-{
- struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
-
- return !(arch_atomic_read(&rdp->dynticks) & RCU_DYNTICK_CTRL_CTR);
+ ct_state_inc(RCU_DYNTICKS_IDX);
}
/*
* Snapshot the ->dynticks counter with full ordering so as to allow
* stable comparison of this counter with past and future snapshots.
*/
-static int rcu_dynticks_snap(struct rcu_data *rdp)
+static int rcu_dynticks_snap(int cpu)
{
- int snap = atomic_add_return(0, &rdp->dynticks);
-
- return snap & ~RCU_DYNTICK_CTRL_MASK;
+ smp_mb(); // Fundamental RCU ordering guarantee.
+ return ct_dynticks_cpu_acquire(cpu);
}
/*
@@ -358,15 +279,7 @@ static int rcu_dynticks_snap(struct rcu_data *rdp)
*/
static bool rcu_dynticks_in_eqs(int snap)
{
- return !(snap & RCU_DYNTICK_CTRL_CTR);
-}
-
-/* Return true if the specified CPU is currently idle from an RCU viewpoint. */
-bool rcu_is_idle_cpu(int cpu)
-{
- struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
-
- return rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp));
+ return !(snap & RCU_DYNTICKS_IDX);
}
/*
@@ -376,7 +289,7 @@ bool rcu_is_idle_cpu(int cpu)
*/
static bool rcu_dynticks_in_eqs_since(struct rcu_data *rdp, int snap)
{
- return snap != rcu_dynticks_snap(rdp);
+ return snap != rcu_dynticks_snap(rdp->cpu);
}
/*
@@ -385,45 +298,17 @@ static bool rcu_dynticks_in_eqs_since(struct rcu_data *rdp, int snap)
*/
bool rcu_dynticks_zero_in_eqs(int cpu, int *vp)
{
- struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
int snap;
// If not quiescent, force back to earlier extended quiescent state.
- snap = atomic_read(&rdp->dynticks) & ~(RCU_DYNTICK_CTRL_MASK |
- RCU_DYNTICK_CTRL_CTR);
-
+ snap = ct_dynticks_cpu(cpu) & ~RCU_DYNTICKS_IDX;
smp_rmb(); // Order ->dynticks and *vp reads.
if (READ_ONCE(*vp))
return false; // Non-zero, so report failure;
smp_rmb(); // Order *vp read and ->dynticks re-read.
// If still in the same extended quiescent state, we are good!
- return snap == (atomic_read(&rdp->dynticks) & ~RCU_DYNTICK_CTRL_MASK);
-}
-
-/*
- * Set the special (bottom) bit of the specified CPU so that it
- * will take special action (such as flushing its TLB) on the
- * next exit from an extended quiescent state. Returns true if
- * the bit was successfully set, or false if the CPU was not in
- * an extended quiescent state.
- */
-bool rcu_eqs_special_set(int cpu)
-{
- int old;
- int new;
- int new_old;
- struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
-
- new_old = atomic_read(&rdp->dynticks);
- do {
- old = new_old;
- if (old & RCU_DYNTICK_CTRL_CTR)
- return false;
- new = old | RCU_DYNTICK_CTRL_MASK;
- new_old = atomic_cmpxchg(&rdp->dynticks, old, new);
- } while (new_old != old);
- return true;
+ return snap == ct_dynticks_cpu(cpu);
}
/*
@@ -439,13 +324,12 @@ bool rcu_eqs_special_set(int cpu)
*/
notrace void rcu_momentary_dyntick_idle(void)
{
- int special;
+ int seq;
raw_cpu_write(rcu_data.rcu_need_heavy_qs, false);
- special = atomic_add_return(2 * RCU_DYNTICK_CTRL_CTR,
- &this_cpu_ptr(&rcu_data)->dynticks);
+ seq = ct_state_inc(2 * RCU_DYNTICKS_IDX);
/* It is illegal to call this from idle state. */
- WARN_ON_ONCE(!(special & RCU_DYNTICK_CTRL_CTR));
+ WARN_ON_ONCE(!(seq & RCU_DYNTICKS_IDX));
rcu_preempt_deferred_qs(current);
}
EXPORT_SYMBOL_GPL(rcu_momentary_dyntick_idle);
@@ -470,13 +354,13 @@ static int rcu_is_cpu_rrupt_from_idle(void)
lockdep_assert_irqs_disabled();
/* Check for counter underflows */
- RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nesting) < 0,
+ RCU_LOCKDEP_WARN(ct_dynticks_nesting() < 0,
"RCU dynticks_nesting counter underflow!");
- RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nmi_nesting) <= 0,
+ RCU_LOCKDEP_WARN(ct_dynticks_nmi_nesting() <= 0,
"RCU dynticks_nmi_nesting counter underflow/zero!");
/* Are we at first interrupt nesting level? */
- nesting = __this_cpu_read(rcu_data.dynticks_nmi_nesting);
+ nesting = ct_dynticks_nmi_nesting();
if (nesting > 1)
return false;
@@ -486,7 +370,7 @@ static int rcu_is_cpu_rrupt_from_idle(void)
WARN_ON_ONCE(!nesting && !is_idle_task(current));
/* Does CPU appear to be idle from an RCU standpoint? */
- return __this_cpu_read(rcu_data.dynticks_nesting) == 0;
+ return ct_dynticks_nesting() == 0;
}
#define DEFAULT_RCU_BLIMIT (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) ? 1000 : 10)
@@ -637,67 +521,7 @@ void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
}
EXPORT_SYMBOL_GPL(rcutorture_get_gp_data);
-/*
- * Enter an RCU extended quiescent state, which can be either the
- * idle loop or adaptive-tickless usermode execution.
- *
- * We crowbar the ->dynticks_nmi_nesting field to zero to allow for
- * the possibility of usermode upcalls having messed up our count
- * of interrupt nesting level during the prior busy period.
- */
-static noinstr void rcu_eqs_enter(bool user)
-{
- struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
-
- WARN_ON_ONCE(rdp->dynticks_nmi_nesting != DYNTICK_IRQ_NONIDLE);
- WRITE_ONCE(rdp->dynticks_nmi_nesting, 0);
- WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
- rdp->dynticks_nesting == 0);
- if (rdp->dynticks_nesting != 1) {
- // RCU will still be watching, so just do accounting and leave.
- rdp->dynticks_nesting--;
- return;
- }
-
- lockdep_assert_irqs_disabled();
- instrumentation_begin();
- trace_rcu_dyntick(TPS("Start"), rdp->dynticks_nesting, 0, atomic_read(&rdp->dynticks));
- WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
- rcu_prepare_for_idle();
- rcu_preempt_deferred_qs(current);
-
- // instrumentation for the noinstr rcu_dynticks_eqs_enter()
- instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks));
-
- instrumentation_end();
- WRITE_ONCE(rdp->dynticks_nesting, 0); /* Avoid irq-access tearing. */
- // RCU is watching here ...
- rcu_dynticks_eqs_enter();
- // ... but is no longer watching here.
- rcu_dynticks_task_enter();
-}
-
-/**
- * rcu_idle_enter - inform RCU that current CPU is entering idle
- *
- * Enter idle mode, in other words, -leave- the mode in which RCU
- * read-side critical sections can occur. (Though RCU read-side
- * critical sections can occur in irq handlers in idle, a possibility
- * handled by irq_enter() and irq_exit().)
- *
- * If you add or remove a call to rcu_idle_enter(), be sure to test with
- * CONFIG_RCU_EQS_DEBUG=y.
- */
-void rcu_idle_enter(void)
-{
- lockdep_assert_irqs_disabled();
- rcu_eqs_enter(false);
-}
-EXPORT_SYMBOL_GPL(rcu_idle_enter);
-
-#ifdef CONFIG_NO_HZ_FULL
-
-#if !defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_KVM_XFER_TO_GUEST_WORK)
+#if defined(CONFIG_NO_HZ_FULL) && (!defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_KVM_XFER_TO_GUEST_WORK))
/*
* An empty function that will trigger a reschedule on
* IRQ tail once IRQs get re-enabled on userspace/guest resume.
@@ -719,7 +543,7 @@ static DEFINE_PER_CPU(struct irq_work, late_wakeup_work) =
* last resort is to fire a local irq_work that will trigger a reschedule once IRQs
* get re-enabled again.
*/
-noinstr static void rcu_irq_work_resched(void)
+noinstr void rcu_irq_work_resched(void)
{
struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
@@ -735,117 +559,7 @@ noinstr static void rcu_irq_work_resched(void)
}
instrumentation_end();
}
-
-#else
-static inline void rcu_irq_work_resched(void) { }
-#endif
-
-/**
- * rcu_user_enter - inform RCU that we are resuming userspace.
- *
- * Enter RCU idle mode right before resuming userspace. No use of RCU
- * is permitted between this call and rcu_user_exit(). This way the
- * CPU doesn't need to maintain the tick for RCU maintenance purposes
- * when the CPU runs in userspace.
- *
- * If you add or remove a call to rcu_user_enter(), be sure to test with
- * CONFIG_RCU_EQS_DEBUG=y.
- */
-noinstr void rcu_user_enter(void)
-{
- lockdep_assert_irqs_disabled();
-
- /*
- * Other than generic entry implementation, we may be past the last
- * rescheduling opportunity in the entry code. Trigger a self IPI
- * that will fire and reschedule once we resume in user/guest mode.
- */
- rcu_irq_work_resched();
- rcu_eqs_enter(true);
-}
-
-#endif /* CONFIG_NO_HZ_FULL */
-
-/**
- * rcu_nmi_exit - inform RCU of exit from NMI context
- *
- * If we are returning from the outermost NMI handler that interrupted an
- * RCU-idle period, update rdp->dynticks and rdp->dynticks_nmi_nesting
- * to let the RCU grace-period handling know that the CPU is back to
- * being RCU-idle.
- *
- * If you add or remove a call to rcu_nmi_exit(), be sure to test
- * with CONFIG_RCU_EQS_DEBUG=y.
- */
-noinstr void rcu_nmi_exit(void)
-{
- struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
-
- instrumentation_begin();
- /*
- * Check for ->dynticks_nmi_nesting underflow and bad ->dynticks.
- * (We are exiting an NMI handler, so RCU better be paying attention
- * to us!)
- */
- WARN_ON_ONCE(rdp->dynticks_nmi_nesting <= 0);
- WARN_ON_ONCE(rcu_dynticks_curr_cpu_in_eqs());
-
- /*
- * If the nesting level is not 1, the CPU wasn't RCU-idle, so
- * leave it in non-RCU-idle state.
- */
- if (rdp->dynticks_nmi_nesting != 1) {
- trace_rcu_dyntick(TPS("--="), rdp->dynticks_nmi_nesting, rdp->dynticks_nmi_nesting - 2,
- atomic_read(&rdp->dynticks));
- WRITE_ONCE(rdp->dynticks_nmi_nesting, /* No store tearing. */
- rdp->dynticks_nmi_nesting - 2);
- instrumentation_end();
- return;
- }
-
- /* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */
- trace_rcu_dyntick(TPS("Startirq"), rdp->dynticks_nmi_nesting, 0, atomic_read(&rdp->dynticks));
- WRITE_ONCE(rdp->dynticks_nmi_nesting, 0); /* Avoid store tearing. */
-
- if (!in_nmi())
- rcu_prepare_for_idle();
-
- // instrumentation for the noinstr rcu_dynticks_eqs_enter()
- instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks));
- instrumentation_end();
-
- // RCU is watching here ...
- rcu_dynticks_eqs_enter();
- // ... but is no longer watching here.
-
- if (!in_nmi())
- rcu_dynticks_task_enter();
-}
-
-/**
- * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle
- *
- * Exit from an interrupt handler, which might possibly result in entering
- * idle mode, in other words, leaving the mode in which read-side critical
- * sections can occur. The caller must have disabled interrupts.
- *
- * This code assumes that the idle loop never does anything that might
- * result in unbalanced calls to irq_enter() and irq_exit(). If your
- * architecture's idle loop violates this assumption, RCU will give you what
- * you deserve, good and hard. But very infrequently and irreproducibly.
- *
- * Use things like work queues to work around this limitation.
- *
- * You have been warned.
- *
- * If you add or remove a call to rcu_irq_exit(), be sure to test with
- * CONFIG_RCU_EQS_DEBUG=y.
- */
-void noinstr rcu_irq_exit(void)
-{
- lockdep_assert_irqs_disabled();
- rcu_nmi_exit();
-}
+#endif /* #if defined(CONFIG_NO_HZ_FULL) && (!defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_KVM_XFER_TO_GUEST_WORK)) */
#ifdef CONFIG_PROVE_RCU
/**
@@ -855,9 +569,9 @@ void rcu_irq_exit_check_preempt(void)
{
lockdep_assert_irqs_disabled();
- RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nesting) <= 0,
+ RCU_LOCKDEP_WARN(ct_dynticks_nesting() <= 0,
"RCU dynticks_nesting counter underflow/zero!");
- RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nmi_nesting) !=
+ RCU_LOCKDEP_WARN(ct_dynticks_nmi_nesting() !=
DYNTICK_IRQ_NONIDLE,
"Bad RCU dynticks_nmi_nesting counter\n");
RCU_LOCKDEP_WARN(rcu_dynticks_curr_cpu_in_eqs(),
@@ -865,96 +579,8 @@ void rcu_irq_exit_check_preempt(void)
}
#endif /* #ifdef CONFIG_PROVE_RCU */
-/*
- * Wrapper for rcu_irq_exit() where interrupts are enabled.
- *
- * If you add or remove a call to rcu_irq_exit_irqson(), be sure to test
- * with CONFIG_RCU_EQS_DEBUG=y.
- */
-void rcu_irq_exit_irqson(void)
-{
- unsigned long flags;
-
- local_irq_save(flags);
- rcu_irq_exit();
- local_irq_restore(flags);
-}
-
-/*
- * Exit an RCU extended quiescent state, which can be either the
- * idle loop or adaptive-tickless usermode execution.
- *
- * We crowbar the ->dynticks_nmi_nesting field to DYNTICK_IRQ_NONIDLE to
- * allow for the possibility of usermode upcalls messing up our count of
- * interrupt nesting level during the busy period that is just now starting.
- */
-static void noinstr rcu_eqs_exit(bool user)
-{
- struct rcu_data *rdp;
- long oldval;
-
- lockdep_assert_irqs_disabled();
- rdp = this_cpu_ptr(&rcu_data);
- oldval = rdp->dynticks_nesting;
- WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && oldval < 0);
- if (oldval) {
- // RCU was already watching, so just do accounting and leave.
- rdp->dynticks_nesting++;
- return;
- }
- rcu_dynticks_task_exit();
- // RCU is not watching here ...
- rcu_dynticks_eqs_exit();
- // ... but is watching here.
- instrumentation_begin();
-
- // instrumentation for the noinstr rcu_dynticks_eqs_exit()
- instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks));
-
- rcu_cleanup_after_idle();
- trace_rcu_dyntick(TPS("End"), rdp->dynticks_nesting, 1, atomic_read(&rdp->dynticks));
- WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
- WRITE_ONCE(rdp->dynticks_nesting, 1);
- WARN_ON_ONCE(rdp->dynticks_nmi_nesting);
- WRITE_ONCE(rdp->dynticks_nmi_nesting, DYNTICK_IRQ_NONIDLE);
- instrumentation_end();
-}
-
-/**
- * rcu_idle_exit - inform RCU that current CPU is leaving idle
- *
- * Exit idle mode, in other words, -enter- the mode in which RCU
- * read-side critical sections can occur.
- *
- * If you add or remove a call to rcu_idle_exit(), be sure to test with
- * CONFIG_RCU_EQS_DEBUG=y.
- */
-void rcu_idle_exit(void)
-{
- unsigned long flags;
-
- local_irq_save(flags);
- rcu_eqs_exit(false);
- local_irq_restore(flags);
-}
-EXPORT_SYMBOL_GPL(rcu_idle_exit);
-
#ifdef CONFIG_NO_HZ_FULL
/**
- * rcu_user_exit - inform RCU that we are exiting userspace.
- *
- * Exit RCU idle mode while entering the kernel because it can
- * run a RCU read side critical section anytime.
- *
- * If you add or remove a call to rcu_user_exit(), be sure to test with
- * CONFIG_RCU_EQS_DEBUG=y.
- */
-void noinstr rcu_user_exit(void)
-{
- rcu_eqs_exit(true);
-}
-
-/**
* __rcu_irq_enter_check_tick - Enable scheduler tick on CPU if RCU needs it.
*
* The scheduler tick is not normally enabled when CPUs enter the kernel
@@ -1014,115 +640,24 @@ void __rcu_irq_enter_check_tick(void)
}
raw_spin_unlock_rcu_node(rdp->mynode);
}
+NOKPROBE_SYMBOL(__rcu_irq_enter_check_tick);
#endif /* CONFIG_NO_HZ_FULL */
-/**
- * rcu_nmi_enter - inform RCU of entry to NMI context
- *
- * If the CPU was idle from RCU's viewpoint, update rdp->dynticks and
- * rdp->dynticks_nmi_nesting to let the RCU grace-period handling know
- * that the CPU is active. This implementation permits nested NMIs, as
- * long as the nesting level does not overflow an int. (You will probably
- * run out of stack space first.)
- *
- * If you add or remove a call to rcu_nmi_enter(), be sure to test
- * with CONFIG_RCU_EQS_DEBUG=y.
- */
-noinstr void rcu_nmi_enter(void)
-{
- long incby = 2;
- struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
-
- /* Complain about underflow. */
- WARN_ON_ONCE(rdp->dynticks_nmi_nesting < 0);
-
- /*
- * If idle from RCU viewpoint, atomically increment ->dynticks
- * to mark non-idle and increment ->dynticks_nmi_nesting by one.
- * Otherwise, increment ->dynticks_nmi_nesting by two. This means
- * if ->dynticks_nmi_nesting is equal to one, we are guaranteed
- * to be in the outermost NMI handler that interrupted an RCU-idle
- * period (observation due to Andy Lutomirski).
- */
- if (rcu_dynticks_curr_cpu_in_eqs()) {
-
- if (!in_nmi())
- rcu_dynticks_task_exit();
-
- // RCU is not watching here ...
- rcu_dynticks_eqs_exit();
- // ... but is watching here.
-
- if (!in_nmi()) {
- instrumentation_begin();
- rcu_cleanup_after_idle();
- instrumentation_end();
- }
-
- instrumentation_begin();
- // instrumentation for the noinstr rcu_dynticks_curr_cpu_in_eqs()
- instrument_atomic_read(&rdp->dynticks, sizeof(rdp->dynticks));
- // instrumentation for the noinstr rcu_dynticks_eqs_exit()
- instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks));
-
- incby = 1;
- } else if (!in_nmi()) {
- instrumentation_begin();
- rcu_irq_enter_check_tick();
- } else {
- instrumentation_begin();
- }
-
- trace_rcu_dyntick(incby == 1 ? TPS("Endirq") : TPS("++="),
- rdp->dynticks_nmi_nesting,
- rdp->dynticks_nmi_nesting + incby, atomic_read(&rdp->dynticks));
- instrumentation_end();
- WRITE_ONCE(rdp->dynticks_nmi_nesting, /* Prevent store tearing. */
- rdp->dynticks_nmi_nesting + incby);
- barrier();
-}
-
-/**
- * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle
- *
- * Enter an interrupt handler, which might possibly result in exiting
- * idle mode, in other words, entering the mode in which read-side critical
- * sections can occur. The caller must have disabled interrupts.
- *
- * Note that the Linux kernel is fully capable of entering an interrupt
- * handler that it never exits, for example when doing upcalls to user mode!
- * This code assumes that the idle loop never does upcalls to user mode.
- * If your architecture's idle loop does do upcalls to user mode (or does
- * anything else that results in unbalanced calls to the irq_enter() and
- * irq_exit() functions), RCU will give you what you deserve, good and hard.
- * But very infrequently and irreproducibly.
- *
- * Use things like work queues to work around this limitation.
- *
- * You have been warned.
- *
- * If you add or remove a call to rcu_irq_enter(), be sure to test with
- * CONFIG_RCU_EQS_DEBUG=y.
- */
-noinstr void rcu_irq_enter(void)
-{
- lockdep_assert_irqs_disabled();
- rcu_nmi_enter();
-}
-
/*
- * Wrapper for rcu_irq_enter() where interrupts are enabled.
+ * Check to see if any future non-offloaded RCU-related work will need
+ * to be done by the current CPU, even if none need be done immediately,
+ * returning 1 if so. This function is part of the RCU implementation;
+ * it is -not- an exported member of the RCU API. This is used by
+ * the idle-entry code to figure out whether it is safe to disable the
+ * scheduler-clock interrupt.
*
- * If you add or remove a call to rcu_irq_enter_irqson(), be sure to test
- * with CONFIG_RCU_EQS_DEBUG=y.
+ * Just check whether or not this CPU has non-offloaded RCU callbacks
+ * queued.
*/
-void rcu_irq_enter_irqson(void)
+int rcu_needs_cpu(void)
{
- unsigned long flags;
-
- local_irq_save(flags);
- rcu_irq_enter();
- local_irq_restore(flags);
+ return !rcu_segcblist_empty(&this_cpu_ptr(&rcu_data)->cblist) &&
+ !rcu_rdp_is_offloaded(this_cpu_ptr(&rcu_data));
}
/*
@@ -1181,41 +716,6 @@ void rcu_request_urgent_qs_task(struct task_struct *t)
smp_store_release(per_cpu_ptr(&rcu_data.rcu_urgent_qs, cpu), true);
}
-#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU)
-
-/*
- * Is the current CPU online as far as RCU is concerned?
- *
- * Disable preemption to avoid false positives that could otherwise
- * happen due to the current CPU number being sampled, this task being
- * preempted, its old CPU being taken offline, resuming on some other CPU,
- * then determining that its old CPU is now offline.
- *
- * Disable checking if in an NMI handler because we cannot safely
- * report errors from NMI handlers anyway. In addition, it is OK to use
- * RCU on an offline processor during initial boot, hence the check for
- * rcu_scheduler_fully_active.
- */
-bool rcu_lockdep_current_cpu_online(void)
-{
- struct rcu_data *rdp;
- struct rcu_node *rnp;
- bool ret = false;
-
- if (in_nmi() || !rcu_scheduler_fully_active)
- return true;
- preempt_disable_notrace();
- rdp = this_cpu_ptr(&rcu_data);
- rnp = rdp->mynode;
- if (rdp->grpmask & rcu_rnp_online_cpus(rnp) || READ_ONCE(rnp->ofl_seq) & 0x1)
- ret = true;
- preempt_enable_notrace();
- return ret;
-}
-EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online);
-
-#endif /* #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) */
-
/*
* When trying to report a quiescent state on behalf of some other CPU,
* it is our responsibility to check for and handle potential overflow
@@ -1240,7 +740,7 @@ static void rcu_gpnum_ovf(struct rcu_node *rnp, struct rcu_data *rdp)
*/
static int dyntick_save_progress_counter(struct rcu_data *rdp)
{
- rdp->dynticks_snap = rcu_dynticks_snap(rdp);
+ rdp->dynticks_snap = rcu_dynticks_snap(rdp->cpu);
if (rcu_dynticks_in_eqs(rdp->dynticks_snap)) {
trace_rcu_fqs(rcu_state.name, rdp->gp_seq, rdp->cpu, TPS("dti"));
rcu_gpnum_ovf(rdp->mynode, rdp);
@@ -1258,8 +758,6 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp)
static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
{
unsigned long jtsq;
- bool *rnhqp;
- bool *ruqp;
struct rcu_node *rnp = rdp->mynode;
/*
@@ -1294,8 +792,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
* For more detail, please refer to the "Hotplug CPU" section
* of RCU's Requirements documentation.
*/
- if (WARN_ON_ONCE(!(rdp->grpmask & rcu_rnp_online_cpus(rnp)))) {
- bool onl;
+ if (WARN_ON_ONCE(!rcu_rdp_cpu_online(rdp))) {
struct rcu_node *rnp1;
pr_info("%s: grp: %d-%d level: %d ->gp_seq %ld ->completedqs %ld\n",
@@ -1304,9 +801,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
for (rnp1 = rnp; rnp1; rnp1 = rnp1->parent)
pr_info("%s: %d:%d ->qsmask %#lx ->qsmaskinit %#lx ->qsmaskinitnext %#lx ->rcu_gp_init_mask %#lx\n",
__func__, rnp1->grplo, rnp1->grphi, rnp1->qsmask, rnp1->qsmaskinit, rnp1->qsmaskinitnext, rnp1->rcu_gp_init_mask);
- onl = !!(rdp->grpmask & rcu_rnp_online_cpus(rnp));
pr_info("%s %d: %c online: %ld(%d) offline: %ld(%d)\n",
- __func__, rdp->cpu, ".o"[onl],
+ __func__, rdp->cpu, ".o"[rcu_rdp_cpu_online(rdp)],
(long)rdp->rcu_onl_gp_seq, rdp->rcu_onl_gp_flags,
(long)rdp->rcu_ofl_gp_seq, rdp->rcu_ofl_gp_flags);
return 1; /* Break things loose after complaining. */
@@ -1324,17 +820,15 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
* is set way high.
*/
jtsq = READ_ONCE(jiffies_to_sched_qs);
- ruqp = per_cpu_ptr(&rcu_data.rcu_urgent_qs, rdp->cpu);
- rnhqp = &per_cpu(rcu_data.rcu_need_heavy_qs, rdp->cpu);
- if (!READ_ONCE(*rnhqp) &&
+ if (!READ_ONCE(rdp->rcu_need_heavy_qs) &&
(time_after(jiffies, rcu_state.gp_start + jtsq * 2) ||
time_after(jiffies, rcu_state.jiffies_resched) ||
rcu_state.cbovld)) {
- WRITE_ONCE(*rnhqp, true);
+ WRITE_ONCE(rdp->rcu_need_heavy_qs, true);
/* Store rcu_need_heavy_qs before rcu_urgent_qs. */
- smp_store_release(ruqp, true);
+ smp_store_release(&rdp->rcu_urgent_qs, true);
} else if (time_after(jiffies, rcu_state.gp_start + jtsq)) {
- WRITE_ONCE(*ruqp, true);
+ WRITE_ONCE(rdp->rcu_urgent_qs, true);
}
/*
@@ -1348,7 +842,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
if (tick_nohz_full_cpu(rdp->cpu) &&
(time_after(jiffies, READ_ONCE(rdp->last_fqs_resched) + jtsq * 3) ||
rcu_state.cbovld)) {
- WRITE_ONCE(*ruqp, true);
+ WRITE_ONCE(rdp->rcu_urgent_qs, true);
resched_cpu(rdp->cpu);
WRITE_ONCE(rdp->last_fqs_resched, jiffies);
}
@@ -1373,6 +867,24 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
rdp->rcu_iw_gp_seq = rnp->gp_seq;
irq_work_queue_on(&rdp->rcu_iw, rdp->cpu);
}
+
+ if (rcu_cpu_stall_cputime && rdp->snap_record.gp_seq != rdp->gp_seq) {
+ int cpu = rdp->cpu;
+ struct rcu_snap_record *rsrp;
+ struct kernel_cpustat *kcsp;
+
+ kcsp = &kcpustat_cpu(cpu);
+
+ rsrp = &rdp->snap_record;
+ rsrp->cputime_irq = kcpustat_field(kcsp, CPUTIME_IRQ, cpu);
+ rsrp->cputime_softirq = kcpustat_field(kcsp, CPUTIME_SOFTIRQ, cpu);
+ rsrp->cputime_system = kcpustat_field(kcsp, CPUTIME_SYSTEM, cpu);
+ rsrp->nr_hardirqs = kstat_cpu_irqs_sum(rdp->cpu);
+ rsrp->nr_softirqs = kstat_cpu_softirqs_sum(rdp->cpu);
+ rsrp->nr_csw = nr_context_switches_cpu(rdp->cpu);
+ rsrp->jiffies = jiffies;
+ rsrp->gp_seq = rdp->gp_seq;
+ }
}
return 0;
@@ -1510,7 +1022,7 @@ static void rcu_gp_kthread_wake(void)
{
struct task_struct *t = READ_ONCE(rcu_state.gp_kthread);
- if ((current == t && !in_irq() && !in_serving_softirq()) ||
+ if ((current == t && !in_hardirq() && !in_serving_softirq()) ||
!READ_ONCE(rcu_state.gp_flags) || !t)
return;
WRITE_ONCE(rcu_state.gp_wake_time, jiffies);
@@ -1633,10 +1145,11 @@ static void __maybe_unused rcu_advance_cbs_nowake(struct rcu_node *rnp,
struct rcu_data *rdp)
{
rcu_lockdep_assert_cblist_protected(rdp);
- if (!rcu_seq_state(rcu_seq_current(&rnp->gp_seq)) ||
- !raw_spin_trylock_rcu_node(rnp))
+ if (!rcu_seq_state(rcu_seq_current(&rnp->gp_seq)) || !raw_spin_trylock_rcu_node(rnp))
return;
- WARN_ON_ONCE(rcu_advance_cbs(rnp, rdp));
+ // The grace period cannot end while we hold the rcu_node lock.
+ if (rcu_seq_state(rcu_seq_current(&rnp->gp_seq)))
+ WARN_ON_ONCE(rcu_advance_cbs(rnp, rdp));
raw_spin_unlock_rcu_node(rnp);
}
@@ -1701,6 +1214,8 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp)
rdp->gp_seq = rnp->gp_seq; /* Remember new grace-period state. */
if (ULONG_CMP_LT(rdp->gp_seq_needed, rnp->gp_seq_needed) || rdp->gpwrap)
WRITE_ONCE(rdp->gp_seq_needed, rnp->gp_seq_needed);
+ if (IS_ENABLED(CONFIG_PROVE_RCU) && READ_ONCE(rdp->gpwrap))
+ WRITE_ONCE(rdp->last_sched_clock, jiffies);
WRITE_ONCE(rdp->gpwrap, false);
rcu_gpnum_ovf(rnp, rdp);
return ret;
@@ -1727,11 +1242,37 @@ static void note_gp_changes(struct rcu_data *rdp)
rcu_gp_kthread_wake();
}
+static atomic_t *rcu_gp_slow_suppress;
+
+/* Register a counter to suppress debugging grace-period delays. */
+void rcu_gp_slow_register(atomic_t *rgssp)
+{
+ WARN_ON_ONCE(rcu_gp_slow_suppress);
+
+ WRITE_ONCE(rcu_gp_slow_suppress, rgssp);
+}
+EXPORT_SYMBOL_GPL(rcu_gp_slow_register);
+
+/* Unregister a counter, with NULL for not caring which. */
+void rcu_gp_slow_unregister(atomic_t *rgssp)
+{
+ WARN_ON_ONCE(rgssp && rgssp != rcu_gp_slow_suppress);
+
+ WRITE_ONCE(rcu_gp_slow_suppress, NULL);
+}
+EXPORT_SYMBOL_GPL(rcu_gp_slow_unregister);
+
+static bool rcu_gp_slow_is_suppressed(void)
+{
+ atomic_t *rgssp = READ_ONCE(rcu_gp_slow_suppress);
+
+ return rgssp && atomic_read(rgssp);
+}
+
static void rcu_gp_slow(int delay)
{
- if (delay > 0 &&
- !(rcu_seq_ctr(rcu_state.gp_seq) %
- (rcu_num_nodes * PER_RCU_NODE_PERIOD * delay)))
+ if (!rcu_gp_slow_is_suppressed() && delay > 0 &&
+ !(rcu_seq_ctr(rcu_state.gp_seq) % (rcu_num_nodes * PER_RCU_NODE_PERIOD * delay)))
schedule_timeout_idle(delay);
}
@@ -1769,12 +1310,81 @@ static void rcu_strict_gp_boundary(void *unused)
invoke_rcu_core();
}
+// Make the polled API aware of the beginning of a grace period.
+static void rcu_poll_gp_seq_start(unsigned long *snap)
+{
+ struct rcu_node *rnp = rcu_get_root();
+
+ if (rcu_scheduler_active != RCU_SCHEDULER_INACTIVE)
+ raw_lockdep_assert_held_rcu_node(rnp);
+
+ // If RCU was idle, note beginning of GP.
+ if (!rcu_seq_state(rcu_state.gp_seq_polled))
+ rcu_seq_start(&rcu_state.gp_seq_polled);
+
+ // Either way, record current state.
+ *snap = rcu_state.gp_seq_polled;
+}
+
+// Make the polled API aware of the end of a grace period.
+static void rcu_poll_gp_seq_end(unsigned long *snap)
+{
+ struct rcu_node *rnp = rcu_get_root();
+
+ if (rcu_scheduler_active != RCU_SCHEDULER_INACTIVE)
+ raw_lockdep_assert_held_rcu_node(rnp);
+
+ // If the previously noted GP is still in effect, record the
+ // end of that GP. Either way, zero counter to avoid counter-wrap
+ // problems.
+ if (*snap && *snap == rcu_state.gp_seq_polled) {
+ rcu_seq_end(&rcu_state.gp_seq_polled);
+ rcu_state.gp_seq_polled_snap = 0;
+ rcu_state.gp_seq_polled_exp_snap = 0;
+ } else {
+ *snap = 0;
+ }
+}
+
+// Make the polled API aware of the beginning of a grace period, but
+// where caller does not hold the root rcu_node structure's lock.
+static void rcu_poll_gp_seq_start_unlocked(unsigned long *snap)
+{
+ unsigned long flags;
+ struct rcu_node *rnp = rcu_get_root();
+
+ if (rcu_init_invoked()) {
+ if (rcu_scheduler_active != RCU_SCHEDULER_INACTIVE)
+ lockdep_assert_irqs_enabled();
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ }
+ rcu_poll_gp_seq_start(snap);
+ if (rcu_init_invoked())
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+}
+
+// Make the polled API aware of the end of a grace period, but where
+// caller does not hold the root rcu_node structure's lock.
+static void rcu_poll_gp_seq_end_unlocked(unsigned long *snap)
+{
+ unsigned long flags;
+ struct rcu_node *rnp = rcu_get_root();
+
+ if (rcu_init_invoked()) {
+ if (rcu_scheduler_active != RCU_SCHEDULER_INACTIVE)
+ lockdep_assert_irqs_enabled();
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ }
+ rcu_poll_gp_seq_end(snap);
+ if (rcu_init_invoked())
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+}
+
/*
* Initialize a new grace period. Return false if no grace period required.
*/
-static bool rcu_gp_init(void)
+static noinline_for_stack bool rcu_gp_init(void)
{
- unsigned long firstseq;
unsigned long flags;
unsigned long oldmask;
unsigned long mask;
@@ -1805,6 +1415,7 @@ static bool rcu_gp_init(void)
rcu_seq_start(&rcu_state.gp_seq);
ASSERT_EXCLUSIVE_WRITER(rcu_state.gp_seq);
trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("start"));
+ rcu_poll_gp_seq_start(&rcu_state.gp_seq_polled_snap);
raw_spin_unlock_irq_rcu_node(rnp);
/*
@@ -1817,20 +1428,17 @@ static bool rcu_gp_init(void)
* of RCU's Requirements documentation.
*/
WRITE_ONCE(rcu_state.gp_state, RCU_GP_ONOFF);
+ /* Exclude CPU hotplug operations. */
rcu_for_each_leaf_node(rnp) {
- smp_mb(); // Pair with barriers used when updating ->ofl_seq to odd values.
- firstseq = READ_ONCE(rnp->ofl_seq);
- if (firstseq & 0x1)
- while (firstseq == READ_ONCE(rnp->ofl_seq))
- schedule_timeout_idle(1); // Can't wake unless RCU is watching.
- smp_mb(); // Pair with barriers used when updating ->ofl_seq to even values.
- raw_spin_lock(&rcu_state.ofl_lock);
- raw_spin_lock_irq_rcu_node(rnp);
+ local_irq_save(flags);
+ arch_spin_lock(&rcu_state.ofl_lock);
+ raw_spin_lock_rcu_node(rnp);
if (rnp->qsmaskinit == rnp->qsmaskinitnext &&
!rnp->wait_blkd_tasks) {
/* Nothing to do on this leaf rcu_node structure. */
- raw_spin_unlock_irq_rcu_node(rnp);
- raw_spin_unlock(&rcu_state.ofl_lock);
+ raw_spin_unlock_rcu_node(rnp);
+ arch_spin_unlock(&rcu_state.ofl_lock);
+ local_irq_restore(flags);
continue;
}
@@ -1865,8 +1473,9 @@ static bool rcu_gp_init(void)
rcu_cleanup_dead_rnp(rnp);
}
- raw_spin_unlock_irq_rcu_node(rnp);
- raw_spin_unlock(&rcu_state.ofl_lock);
+ raw_spin_unlock_rcu_node(rnp);
+ arch_spin_unlock(&rcu_state.ofl_lock);
+ local_irq_restore(flags);
}
rcu_gp_slow(gp_preinit_delay); /* Races with CPU hotplug. */
@@ -1946,7 +1555,7 @@ static void rcu_gp_fqs(bool first_time)
struct rcu_node *rnp = rcu_get_root();
WRITE_ONCE(rcu_state.gp_activity, jiffies);
- rcu_state.n_force_qs++;
+ WRITE_ONCE(rcu_state.n_force_qs, rcu_state.n_force_qs + 1);
if (first_time) {
/* Collect dyntick-idle snapshots. */
force_qs_rnp(dyntick_save_progress_counter);
@@ -1966,21 +1575,25 @@ static void rcu_gp_fqs(bool first_time)
/*
* Loop doing repeated quiescent-state forcing until the grace period ends.
*/
-static void rcu_gp_fqs_loop(void)
+static noinline_for_stack void rcu_gp_fqs_loop(void)
{
- bool first_gp_fqs;
+ bool first_gp_fqs = true;
int gf = 0;
unsigned long j;
int ret;
struct rcu_node *rnp = rcu_get_root();
- first_gp_fqs = true;
j = READ_ONCE(jiffies_till_first_fqs);
if (rcu_state.cbovld)
gf = RCU_GP_FLAG_OVLD;
ret = 0;
for (;;) {
- if (!ret) {
+ if (rcu_state.cbovld) {
+ j = (j + 2) / 3;
+ if (j <= 0)
+ j = 1;
+ }
+ if (!ret || time_before(jiffies + j, rcu_state.jiffies_force_qs)) {
WRITE_ONCE(rcu_state.jiffies_force_qs, jiffies + j);
/*
* jiffies_force_qs before RCU_GP_WAIT_FQS state
@@ -1993,12 +1606,20 @@ static void rcu_gp_fqs_loop(void)
trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq,
TPS("fqswait"));
WRITE_ONCE(rcu_state.gp_state, RCU_GP_WAIT_FQS);
- ret = swait_event_idle_timeout_exclusive(
- rcu_state.gp_wq, rcu_gp_fqs_check_wake(&gf), j);
+ (void)swait_event_idle_timeout_exclusive(rcu_state.gp_wq,
+ rcu_gp_fqs_check_wake(&gf), j);
rcu_gp_torture_wait();
WRITE_ONCE(rcu_state.gp_state, RCU_GP_DOING_FQS);
/* Locking provides needed memory barriers. */
- /* If grace period done, leave loop. */
+ /*
+ * Exit the loop if the root rcu_node structure indicates that the grace period
+ * has ended, leave the loop. The rcu_preempt_blocked_readers_cgp(rnp) check
+ * is required only for single-node rcu_node trees because readers blocking
+ * the current grace period are queued only on leaf rcu_node structures.
+ * For multi-node trees, checking the root node's ->qsmask suffices, because a
+ * given root node's ->qsmask bit is cleared only when all CPUs and tasks from
+ * the corresponding leaf nodes have passed through their quiescent state.
+ */
if (!READ_ONCE(rnp->qsmask) &&
!rcu_preempt_blocked_readers_cgp(rnp))
break;
@@ -2066,6 +1687,7 @@ static noinline void rcu_gp_cleanup(void)
* safe for us to drop the lock in order to mark the grace
* period as completed in all of the rcu_node structures.
*/
+ rcu_poll_gp_seq_end(&rcu_state.gp_seq_polled_snap);
raw_spin_unlock_irq_rcu_node(rnp);
/*
@@ -2085,6 +1707,8 @@ static noinline void rcu_gp_cleanup(void)
dump_blkd_tasks(rnp, 10);
WARN_ON_ONCE(rnp->qsmask);
WRITE_ONCE(rnp->gp_seq, new_gp_seq);
+ if (!rnp->parent)
+ smp_mb(); // Order against failing poll_state_synchronize_rcu_full().
rdp = this_cpu_ptr(&rcu_data);
if (rnp == rdp->mynode)
needgp = __note_gp_changes(rnp, rdp) || needgp;
@@ -2121,14 +1745,29 @@ static noinline void rcu_gp_cleanup(void)
/* Advance CBs to reduce false positives below. */
offloaded = rcu_rdp_is_offloaded(rdp);
if ((offloaded || !rcu_accelerate_cbs(rnp, rdp)) && needgp) {
+
+ // We get here if a grace period was needed (“needgp”)
+ // and the above call to rcu_accelerate_cbs() did not set
+ // the RCU_GP_FLAG_INIT bit in ->gp_state (which records
+ // the need for another grace period).  The purpose
+ // of the “offloaded” check is to avoid invoking
+ // rcu_accelerate_cbs() on an offloaded CPU because we do not
+ // hold the ->nocb_lock needed to safely access an offloaded
+ // ->cblist.  We do not want to acquire that lock because
+ // it can be heavily contended during callback floods.
+
WRITE_ONCE(rcu_state.gp_flags, RCU_GP_FLAG_INIT);
WRITE_ONCE(rcu_state.gp_req_activity, jiffies);
- trace_rcu_grace_period(rcu_state.name,
- rcu_state.gp_seq,
- TPS("newreq"));
+ trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("newreq"));
} else {
- WRITE_ONCE(rcu_state.gp_flags,
- rcu_state.gp_flags & RCU_GP_FLAG_INIT);
+
+ // We get here either if there is no need for an
+ // additional grace period or if rcu_accelerate_cbs() has
+ // already set the RCU_GP_FLAG_INIT bit in ->gp_flags. 
+ // So all we need to do is to clear all of the other
+ // ->gp_flags bits.
+
+ WRITE_ONCE(rcu_state.gp_flags, rcu_state.gp_flags & RCU_GP_FLAG_INIT);
}
raw_spin_unlock_irq_rcu_node(rnp);
@@ -2317,8 +1956,7 @@ rcu_report_qs_rdp(struct rcu_data *rdp)
{
unsigned long flags;
unsigned long mask;
- bool needwake = false;
- const bool offloaded = rcu_rdp_is_offloaded(rdp);
+ bool needacc = false;
struct rcu_node *rnp;
WARN_ON_ONCE(rdp->cpu != smp_processor_id());
@@ -2345,15 +1983,33 @@ rcu_report_qs_rdp(struct rcu_data *rdp)
/*
* This GP can't end until cpu checks in, so all of our
* callbacks can be processed during the next GP.
+ *
+ * NOCB kthreads have their own way to deal with that...
*/
- if (!offloaded)
- needwake = rcu_accelerate_cbs(rnp, rdp);
+ if (!rcu_rdp_is_offloaded(rdp)) {
+ /*
+ * The current GP has not yet ended, so it
+ * should not be possible for rcu_accelerate_cbs()
+ * to return true. So complain, but don't awaken.
+ */
+ WARN_ON_ONCE(rcu_accelerate_cbs(rnp, rdp));
+ } else if (!rcu_segcblist_completely_offloaded(&rdp->cblist)) {
+ /*
+ * ...but NOCB kthreads may miss or delay callbacks acceleration
+ * if in the middle of a (de-)offloading process.
+ */
+ needacc = true;
+ }
rcu_disable_urgency_upon_qs(rdp);
rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags);
/* ^^^ Released rnp->lock */
- if (needwake)
- rcu_gp_kthread_wake();
+
+ if (needacc) {
+ rcu_nocb_lock_irqsave(rdp, flags);
+ rcu_accelerate_cbs_unlocked(rnp, rdp);
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ }
}
}
@@ -2391,95 +2047,6 @@ rcu_check_quiescent_state(struct rcu_data *rdp)
}
/*
- * Near the end of the offline process. Trace the fact that this CPU
- * is going offline.
- */
-int rcutree_dying_cpu(unsigned int cpu)
-{
- bool blkd;
- struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
- struct rcu_node *rnp = rdp->mynode;
-
- if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
- return 0;
-
- blkd = !!(rnp->qsmask & rdp->grpmask);
- trace_rcu_grace_period(rcu_state.name, READ_ONCE(rnp->gp_seq),
- blkd ? TPS("cpuofl-bgp") : TPS("cpuofl"));
- return 0;
-}
-
-/*
- * All CPUs for the specified rcu_node structure have gone offline,
- * and all tasks that were preempted within an RCU read-side critical
- * section while running on one of those CPUs have since exited their RCU
- * read-side critical section. Some other CPU is reporting this fact with
- * the specified rcu_node structure's ->lock held and interrupts disabled.
- * This function therefore goes up the tree of rcu_node structures,
- * clearing the corresponding bits in the ->qsmaskinit fields. Note that
- * the leaf rcu_node structure's ->qsmaskinit field has already been
- * updated.
- *
- * This function does check that the specified rcu_node structure has
- * all CPUs offline and no blocked tasks, so it is OK to invoke it
- * prematurely. That said, invoking it after the fact will cost you
- * a needless lock acquisition. So once it has done its work, don't
- * invoke it again.
- */
-static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
-{
- long mask;
- struct rcu_node *rnp = rnp_leaf;
-
- raw_lockdep_assert_held_rcu_node(rnp_leaf);
- if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) ||
- WARN_ON_ONCE(rnp_leaf->qsmaskinit) ||
- WARN_ON_ONCE(rcu_preempt_has_tasks(rnp_leaf)))
- return;
- for (;;) {
- mask = rnp->grpmask;
- rnp = rnp->parent;
- if (!rnp)
- break;
- raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
- rnp->qsmaskinit &= ~mask;
- /* Between grace periods, so better already be zero! */
- WARN_ON_ONCE(rnp->qsmask);
- if (rnp->qsmaskinit) {
- raw_spin_unlock_rcu_node(rnp);
- /* irqs remain disabled. */
- return;
- }
- raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
- }
-}
-
-/*
- * The CPU has been completely removed, and some other CPU is reporting
- * this fact from process context. Do the remainder of the cleanup.
- * There can only be one CPU hotplug operation at a time, so no need for
- * explicit locking.
- */
-int rcutree_dead_cpu(unsigned int cpu)
-{
- struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
- struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */
-
- if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
- return 0;
-
- WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus - 1);
- /* Adjust any no-longer-needed kthreads. */
- rcu_boost_kthread_setaffinity(rnp, -1);
- /* Do any needed no-CB deferred wakeups from this CPU. */
- do_nocb_deferred_wakeup(per_cpu_ptr(&rcu_data, cpu));
-
- // Stop-machine done, so allow nohz_full to disable tick.
- tick_dep_clear(TICK_DEP_BIT_RCU);
- return 0;
-}
-
-/*
* Invoke any RCU callbacks that have made it to the end of their grace
* period. Throttle as specified by rdp->blimit.
*/
@@ -2488,7 +2055,6 @@ static void rcu_do_batch(struct rcu_data *rdp)
int div;
bool __maybe_unused empty;
unsigned long flags;
- const bool offloaded = rcu_rdp_is_offloaded(rdp);
struct rcu_head *rhp;
struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl);
long bl, count = 0;
@@ -2501,23 +2067,22 @@ static void rcu_do_batch(struct rcu_data *rdp)
trace_rcu_batch_end(rcu_state.name, 0,
!rcu_segcblist_empty(&rdp->cblist),
need_resched(), is_idle_task(current),
- rcu_is_callbacks_kthread());
+ rcu_is_callbacks_kthread(rdp));
return;
}
/*
- * Extract the list of ready callbacks, disabling to prevent
+ * Extract the list of ready callbacks, disabling IRQs to prevent
* races with call_rcu() from interrupt handlers. Leave the
* callback counts, as rcu_barrier() needs to be conservative.
*/
- local_irq_save(flags);
- rcu_nocb_lock(rdp);
+ rcu_nocb_lock_irqsave(rdp, flags);
WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
- pending = rcu_segcblist_n_cbs(&rdp->cblist);
+ pending = rcu_segcblist_get_seglen(&rdp->cblist, RCU_DONE_TAIL);
div = READ_ONCE(rcu_divisor);
div = div < 0 ? 7 : div > sizeof(long) * 8 - 2 ? sizeof(long) * 8 - 2 : div;
bl = max(rdp->blimit, pending >> div);
- if (unlikely(bl > 100)) {
+ if (in_serving_softirq() && unlikely(bl > 100)) {
long rrn = READ_ONCE(rcu_resched_ns);
rrn = rrn < NSEC_PER_MSEC ? NSEC_PER_MSEC : rrn > NSEC_PER_SEC ? NSEC_PER_SEC : rrn;
@@ -2526,7 +2091,7 @@ static void rcu_do_batch(struct rcu_data *rdp)
trace_rcu_batch_start(rcu_state.name,
rcu_segcblist_n_cbs(&rdp->cblist), bl);
rcu_segcblist_extract_done_cbs(&rdp->cblist, &rcl);
- if (offloaded)
+ if (rcu_rdp_is_offloaded(rdp))
rdp->qlen_last_fqs_check = rcu_segcblist_n_cbs(&rdp->cblist);
trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCbDequeued"));
@@ -2554,18 +2119,23 @@ static void rcu_do_batch(struct rcu_data *rdp)
/*
* Stop only if limit reached and CPU has something to do.
*/
- if (count >= bl && !offloaded &&
- (need_resched() ||
- (!is_idle_task(current) && !rcu_is_callbacks_kthread())))
- break;
- if (unlikely(tlimit)) {
- /* only call local_clock() every 32 callbacks */
- if (likely((count & 31) || local_clock() < tlimit))
- continue;
- /* Exceeded the time limit, so leave. */
- break;
- }
- if (!in_serving_softirq()) {
+ if (in_serving_softirq()) {
+ if (count >= bl && (need_resched() || !is_idle_task(current)))
+ break;
+ /*
+ * Make sure we don't spend too much time here and deprive other
+ * softirq vectors of CPU cycles.
+ */
+ if (unlikely(tlimit)) {
+ /* only call local_clock() every 32 callbacks */
+ if (likely((count & 31) || local_clock() < tlimit))
+ continue;
+ /* Exceeded the time limit, so leave. */
+ break;
+ }
+ } else {
+ // In rcuoc context, so no worries about depriving
+ // other softirq vectors of CPU cycles.
local_bh_enable();
lockdep_assert_irqs_enabled();
cond_resched_tasks_rcu_qs();
@@ -2574,11 +2144,10 @@ static void rcu_do_batch(struct rcu_data *rdp)
}
}
- local_irq_save(flags);
- rcu_nocb_lock(rdp);
+ rcu_nocb_lock_irqsave(rdp, flags);
rdp->n_cbs_invoked += count;
trace_rcu_batch_end(rcu_state.name, count, !!rcl.head, need_resched(),
- is_idle_task(current), rcu_is_callbacks_kthread());
+ is_idle_task(current), rcu_is_callbacks_kthread(rdp));
/* Update counts and requeue any remaining callbacks. */
rcu_segcblist_insert_done_cbs(&rdp->cblist, &rcl);
@@ -2592,7 +2161,7 @@ static void rcu_do_batch(struct rcu_data *rdp)
/* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. */
if (count == 0 && rdp->qlen_last_fqs_check != 0) {
rdp->qlen_last_fqs_check = 0;
- rdp->n_force_qs_snap = rcu_state.n_force_qs;
+ rdp->n_force_qs_snap = READ_ONCE(rcu_state.n_force_qs);
} else if (count < rdp->qlen_last_fqs_check - qhimark)
rdp->qlen_last_fqs_check = count;
@@ -2609,9 +2178,6 @@ static void rcu_do_batch(struct rcu_data *rdp)
rcu_nocb_unlock_irqrestore(rdp, flags);
- /* Re-invoke RCU core processing if there are callbacks remaining. */
- if (!offloaded && rcu_segcblist_ready_cbs(&rdp->cblist))
- invoke_rcu_core();
tick_dep_clear_task(current, TICK_DEP_BIT_RCU);
}
@@ -2625,6 +2191,13 @@ static void rcu_do_batch(struct rcu_data *rdp)
*/
void rcu_sched_clock_irq(int user)
{
+ unsigned long j;
+
+ if (IS_ENABLED(CONFIG_PROVE_RCU)) {
+ j = jiffies;
+ WARN_ON_ONCE(time_before(j, __this_cpu_read(rcu_data.last_sched_clock)));
+ __this_cpu_write(rcu_data.last_sched_clock, j);
+ }
trace_rcu_utilization(TPS("Start scheduler-tick"));
lockdep_assert_irqs_disabled();
raw_cpu_inc(rcu_data.ticks_this_gp);
@@ -2640,6 +2213,8 @@ void rcu_sched_clock_irq(int user)
rcu_flavor_sched_clock_irq(user);
if (rcu_pending(user))
invoke_rcu_core();
+ if (user || rcu_is_cpu_rrupt_from_idle())
+ rcu_note_voluntary_context_switch(current);
lockdep_assert_irqs_disabled();
trace_rcu_utilization(TPS("End scheduler-tick"));
@@ -2710,7 +2285,7 @@ void rcu_force_quiescent_state(void)
struct rcu_node *rnp_old = NULL;
/* Funnel through hierarchy to reduce memory contention. */
- rnp = __this_cpu_read(rcu_data.mynode);
+ rnp = raw_cpu_read(rcu_data.mynode);
for (; rnp != NULL; rnp = rnp->parent) {
ret = (READ_ONCE(rcu_state.gp_flags) & RCU_GP_FLAG_FQS) ||
!raw_spin_trylock(&rnp->fqslock);
@@ -2750,6 +2325,23 @@ static __latent_entropy void rcu_core(void)
unsigned long flags;
struct rcu_data *rdp = raw_cpu_ptr(&rcu_data);
struct rcu_node *rnp = rdp->mynode;
+ /*
+ * On RT rcu_core() can be preempted when IRQs aren't disabled.
+ * Therefore this function can race with concurrent NOCB (de-)offloading
+ * on this CPU and the below condition must be considered volatile.
+ * However if we race with:
+ *
+ * _ Offloading: In the worst case we accelerate or process callbacks
+ * concurrently with NOCB kthreads. We are guaranteed to
+ * call rcu_nocb_lock() if that happens.
+ *
+ * _ Deoffloading: In the worst case we miss callbacks acceleration or
+ * processing. This is fine because the early stage
+ * of deoffloading invokes rcu_core() after setting
+ * SEGCBLIST_RCU_CORE. So we guarantee that we'll process
+ * what could have been dismissed without the need to wait
+ * for the next rcu_pending() check in the next jiffy.
+ */
const bool do_batch = !rcu_segcblist_completely_offloaded(&rdp->cblist);
if (cpu_is_offline(smp_processor_id()))
@@ -2758,7 +2350,7 @@ static __latent_entropy void rcu_core(void)
WARN_ON_ONCE(!rdp->beenonline);
/* Report any deferred quiescent states if preemption enabled. */
- if (!(preempt_count() & PREEMPT_MASK)) {
+ if (IS_ENABLED(CONFIG_PREEMPT_COUNT) && (!(preempt_count() & PREEMPT_MASK))) {
rcu_preempt_deferred_qs(current);
} else if (rcu_preempt_need_deferred_qs(current)) {
set_tsk_need_resched(current);
@@ -2781,8 +2373,12 @@ static __latent_entropy void rcu_core(void)
/* If there are callbacks ready, invoke them. */
if (do_batch && rcu_segcblist_ready_cbs(&rdp->cblist) &&
- likely(READ_ONCE(rcu_scheduler_fully_active)))
+ likely(READ_ONCE(rcu_scheduler_fully_active))) {
rcu_do_batch(rdp);
+ /* Re-invoke RCU core processing if there are callbacks remaining. */
+ if (rcu_segcblist_ready_cbs(&rdp->cblist))
+ invoke_rcu_core();
+ }
/* Do any needed deferred wakeups of rcuo kthreads. */
do_nocb_deferred_wakeup(rdp);
@@ -2853,10 +2449,12 @@ static void rcu_cpu_kthread(unsigned int cpu)
{
unsigned int *statusp = this_cpu_ptr(&rcu_data.rcu_cpu_kthread_status);
char work, *workp = this_cpu_ptr(&rcu_data.rcu_cpu_has_work);
+ unsigned long *j = this_cpu_ptr(&rcu_data.rcuc_activity);
int spincnt;
trace_rcu_utilization(TPS("Start CPU kthread@rcu_run"));
for (spincnt = 0; spincnt < 10; spincnt++) {
+ WRITE_ONCE(*j, jiffies);
local_bh_disable();
*statusp = RCU_KTHREAD_RUNNING;
local_irq_disable();
@@ -2877,6 +2475,7 @@ static void rcu_cpu_kthread(unsigned int cpu)
schedule_timeout_idle(2);
trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
*statusp = RCU_KTHREAD_WAITING;
+ WRITE_ONCE(*j, jiffies);
}
static struct smp_hotplug_thread rcu_cpu_thread_spec = {
@@ -2897,7 +2496,7 @@ static int __init rcu_spawn_core_kthreads(void)
for_each_possible_cpu(cpu)
per_cpu(rcu_data.rcu_cpu_has_work, cpu) = 0;
- if (!IS_ENABLED(CONFIG_RCU_BOOST) && use_softirq)
+ if (use_softirq)
return 0;
WARN_ONCE(smpboot_register_percpu_thread(&rcu_cpu_thread_spec),
"%s: Could not start rcuc kthread, OOM is now expected behavior\n", __func__);
@@ -2940,10 +2539,10 @@ static void __call_rcu_core(struct rcu_data *rdp, struct rcu_head *head,
} else {
/* Give the grace period a kick. */
rdp->blimit = DEFAULT_MAX_RCU_BLIMIT;
- if (rcu_state.n_force_qs == rdp->n_force_qs_snap &&
+ if (READ_ONCE(rcu_state.n_force_qs) == rdp->n_force_qs_snap &&
rcu_segcblist_first_pend_cb(&rdp->cblist) != head)
rcu_force_quiescent_state();
- rdp->n_force_qs_snap = rcu_state.n_force_qs;
+ rdp->n_force_qs_snap = READ_ONCE(rcu_state.n_force_qs);
rdp->qlen_last_fqs_check = rcu_segcblist_n_cbs(&rdp->cblist);
}
}
@@ -2998,12 +2597,12 @@ static void check_cb_ovld(struct rcu_data *rdp)
raw_spin_unlock_rcu_node(rnp);
}
-/* Helper function for call_rcu() and friends. */
static void
-__call_rcu(struct rcu_head *head, rcu_callback_t func)
+__call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy_in)
{
static atomic_t doublefrees;
unsigned long flags;
+ bool lazy;
struct rcu_data *rdp;
bool was_alldone;
@@ -3014,7 +2613,7 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func)
/*
* Probable double call_rcu(), so leak the callback.
* Use rcu:rcu_callback trace event to find the previous
- * time callback was passed to __call_rcu().
+ * time callback was passed to call_rcu().
*/
if (atomic_inc_return(&doublefrees) < 4) {
pr_err("%s(): Double-freed CB %p->%pS()!!! ", __func__, head, head->func);
@@ -3025,9 +2624,10 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func)
}
head->func = func;
head->next = NULL;
+ kasan_record_aux_stack_noalloc(head);
local_irq_save(flags);
- kasan_record_aux_stack(head);
rdp = this_cpu_ptr(&rcu_data);
+ lazy = lazy_in && !rcu_async_should_hurry();
/* Add the callback to our list. */
if (unlikely(!rcu_segcblist_is_enabled(&rdp->cblist))) {
@@ -3041,7 +2641,7 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func)
}
check_cb_ovld(rdp);
- if (rcu_nocb_try_bypass(rdp, head, &was_alldone, flags))
+ if (rcu_nocb_try_bypass(rdp, head, &was_alldone, flags, lazy))
return; // Enqueued onto ->nocb_bypass, so just leave.
// If no-CBs CPU gets here, rcu_nocb_try_bypass() acquired ->nocb_lock.
rcu_segcblist_enqueue(&rdp->cblist, head);
@@ -3064,8 +2664,40 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func)
}
}
+#ifdef CONFIG_RCU_LAZY
+/**
+ * call_rcu_hurry() - Queue RCU callback for invocation after grace period, and
+ * flush all lazy callbacks (including the new one) to the main ->cblist while
+ * doing so.
+ *
+ * @head: structure to be used for queueing the RCU updates.
+ * @func: actual callback function to be invoked after the grace period
+ *
+ * The callback function will be invoked some time after a full grace
+ * period elapses, in other words after all pre-existing RCU read-side
+ * critical sections have completed.
+ *
+ * Use this API instead of call_rcu() if you don't want the callback to be
+ * invoked after very long periods of time, which can happen on systems without
+ * memory pressure and on systems which are lightly loaded or mostly idle.
+ * This function will cause callbacks to be invoked sooner than later at the
+ * expense of extra power. Other than that, this function is identical to, and
+ * reuses call_rcu()'s logic. Refer to call_rcu() for more details about memory
+ * ordering and other functionality.
+ */
+void call_rcu_hurry(struct rcu_head *head, rcu_callback_t func)
+{
+ return __call_rcu_common(head, func, false);
+}
+EXPORT_SYMBOL_GPL(call_rcu_hurry);
+#endif
+
/**
* call_rcu() - Queue an RCU callback for invocation after a grace period.
+ * By default the callbacks are 'lazy' and are kept hidden from the main
+ * ->cblist to prevent starting of grace periods too soon.
+ * If you desire grace periods to start very soon, use call_rcu_hurry().
+ *
* @head: structure to be used for queueing the RCU updates.
* @func: actual callback function to be invoked after the grace period
*
@@ -3106,25 +2738,26 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func)
*/
void call_rcu(struct rcu_head *head, rcu_callback_t func)
{
- __call_rcu(head, func);
+ return __call_rcu_common(head, func, IS_ENABLED(CONFIG_RCU_LAZY));
}
EXPORT_SYMBOL_GPL(call_rcu);
-
/* Maximum number of jiffies to wait before draining a batch. */
-#define KFREE_DRAIN_JIFFIES (HZ / 50)
+#define KFREE_DRAIN_JIFFIES (5 * HZ)
#define KFREE_N_BATCHES 2
#define FREE_N_CHANNELS 2
/**
* struct kvfree_rcu_bulk_data - single block to store kvfree_rcu() pointers
+ * @list: List node. All blocks are linked between each other
+ * @gp_snap: Snapshot of RCU state for objects placed to this bulk
* @nr_records: Number of active pointers in the array
- * @next: Next bulk object in the block chain
* @records: Array of the kvfree_rcu() pointers
*/
struct kvfree_rcu_bulk_data {
+ struct list_head list;
+ unsigned long gp_snap;
unsigned long nr_records;
- struct kvfree_rcu_bulk_data *next;
void *records[];
};
@@ -3140,27 +2773,28 @@ struct kvfree_rcu_bulk_data {
* struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests
* @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period
* @head_free: List of kfree_rcu() objects waiting for a grace period
- * @bkvhead_free: Bulk-List of kvfree_rcu() objects waiting for a grace period
+ * @bulk_head_free: Bulk-List of kvfree_rcu() objects waiting for a grace period
* @krcp: Pointer to @kfree_rcu_cpu structure
*/
struct kfree_rcu_cpu_work {
struct rcu_work rcu_work;
struct rcu_head *head_free;
- struct kvfree_rcu_bulk_data *bkvhead_free[FREE_N_CHANNELS];
+ struct list_head bulk_head_free[FREE_N_CHANNELS];
struct kfree_rcu_cpu *krcp;
};
/**
* struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period
* @head: List of kfree_rcu() objects not yet waiting for a grace period
- * @bkvhead: Bulk-List of kvfree_rcu() objects not yet waiting for a grace period
+ * @head_gp_snap: Snapshot of RCU state for objects placed to "@head"
+ * @bulk_head: Bulk-List of kvfree_rcu() objects not yet waiting for a grace period
* @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period
* @lock: Synchronize access to this structure
* @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES
- * @monitor_todo: Tracks whether a @monitor_work delayed work is pending
* @initialized: The @rcu_work fields have been initialized
- * @count: Number of objects for which GP not started
+ * @head_count: Number of objects in rcu_head singular list
+ * @bulk_count: Number of objects in bulk-list
* @bkvcache:
* A simple cache list that contains objects for reuse purpose.
* In order to save some per-cpu space the list is singular.
@@ -3178,14 +2812,20 @@ struct kfree_rcu_cpu_work {
* the interactions with the slab allocators.
*/
struct kfree_rcu_cpu {
+ // Objects queued on a linked list
+ // through their rcu_head structures.
struct rcu_head *head;
- struct kvfree_rcu_bulk_data *bkvhead[FREE_N_CHANNELS];
+ unsigned long head_gp_snap;
+ atomic_t head_count;
+
+ // Objects queued on a bulk-list.
+ struct list_head bulk_head[FREE_N_CHANNELS];
+ atomic_t bulk_count[FREE_N_CHANNELS];
+
struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES];
raw_spinlock_t lock;
struct delayed_work monitor_work;
- bool monitor_todo;
bool initialized;
- int count;
struct delayed_work page_cache_work;
atomic_t backoff_page_cache_fill;
@@ -3273,29 +2913,87 @@ drain_page_cache(struct kfree_rcu_cpu *krcp)
return freed;
}
+static void
+kvfree_rcu_bulk(struct kfree_rcu_cpu *krcp,
+ struct kvfree_rcu_bulk_data *bnode, int idx)
+{
+ unsigned long flags;
+ int i;
+
+ debug_rcu_bhead_unqueue(bnode);
+
+ rcu_lock_acquire(&rcu_callback_map);
+ if (idx == 0) { // kmalloc() / kfree().
+ trace_rcu_invoke_kfree_bulk_callback(
+ rcu_state.name, bnode->nr_records,
+ bnode->records);
+
+ kfree_bulk(bnode->nr_records, bnode->records);
+ } else { // vmalloc() / vfree().
+ for (i = 0; i < bnode->nr_records; i++) {
+ trace_rcu_invoke_kvfree_callback(
+ rcu_state.name, bnode->records[i], 0);
+
+ vfree(bnode->records[i]);
+ }
+ }
+ rcu_lock_release(&rcu_callback_map);
+
+ raw_spin_lock_irqsave(&krcp->lock, flags);
+ if (put_cached_bnode(krcp, bnode))
+ bnode = NULL;
+ raw_spin_unlock_irqrestore(&krcp->lock, flags);
+
+ if (bnode)
+ free_page((unsigned long) bnode);
+
+ cond_resched_tasks_rcu_qs();
+}
+
+static void
+kvfree_rcu_list(struct rcu_head *head)
+{
+ struct rcu_head *next;
+
+ for (; head; head = next) {
+ void *ptr = (void *) head->func;
+ unsigned long offset = (void *) head - ptr;
+
+ next = head->next;
+ debug_rcu_head_unqueue((struct rcu_head *)ptr);
+ rcu_lock_acquire(&rcu_callback_map);
+ trace_rcu_invoke_kvfree_callback(rcu_state.name, head, offset);
+
+ if (!WARN_ON_ONCE(!__is_kvfree_rcu_offset(offset)))
+ kvfree(ptr);
+
+ rcu_lock_release(&rcu_callback_map);
+ cond_resched_tasks_rcu_qs();
+ }
+}
+
/*
* This function is invoked in workqueue context after a grace period.
- * It frees all the objects queued on ->bkvhead_free or ->head_free.
+ * It frees all the objects queued on ->bulk_head_free or ->head_free.
*/
static void kfree_rcu_work(struct work_struct *work)
{
unsigned long flags;
- struct kvfree_rcu_bulk_data *bkvhead[FREE_N_CHANNELS], *bnext;
- struct rcu_head *head, *next;
+ struct kvfree_rcu_bulk_data *bnode, *n;
+ struct list_head bulk_head[FREE_N_CHANNELS];
+ struct rcu_head *head;
struct kfree_rcu_cpu *krcp;
struct kfree_rcu_cpu_work *krwp;
- int i, j;
+ int i;
krwp = container_of(to_rcu_work(work),
- struct kfree_rcu_cpu_work, rcu_work);
+ struct kfree_rcu_cpu_work, rcu_work);
krcp = krwp->krcp;
raw_spin_lock_irqsave(&krcp->lock, flags);
// Channels 1 and 2.
- for (i = 0; i < FREE_N_CHANNELS; i++) {
- bkvhead[i] = krwp->bkvhead_free[i];
- krwp->bkvhead_free[i] = NULL;
- }
+ for (i = 0; i < FREE_N_CHANNELS; i++)
+ list_replace_init(&krwp->bulk_head_free[i], &bulk_head[i]);
// Channel 3.
head = krwp->head_free;
@@ -3304,39 +3002,9 @@ static void kfree_rcu_work(struct work_struct *work)
// Handle the first two channels.
for (i = 0; i < FREE_N_CHANNELS; i++) {
- for (; bkvhead[i]; bkvhead[i] = bnext) {
- bnext = bkvhead[i]->next;
- debug_rcu_bhead_unqueue(bkvhead[i]);
-
- rcu_lock_acquire(&rcu_callback_map);
- if (i == 0) { // kmalloc() / kfree().
- trace_rcu_invoke_kfree_bulk_callback(
- rcu_state.name, bkvhead[i]->nr_records,
- bkvhead[i]->records);
-
- kfree_bulk(bkvhead[i]->nr_records,
- bkvhead[i]->records);
- } else { // vmalloc() / vfree().
- for (j = 0; j < bkvhead[i]->nr_records; j++) {
- trace_rcu_invoke_kvfree_callback(
- rcu_state.name,
- bkvhead[i]->records[j], 0);
-
- vfree(bkvhead[i]->records[j]);
- }
- }
- rcu_lock_release(&rcu_callback_map);
-
- raw_spin_lock_irqsave(&krcp->lock, flags);
- if (put_cached_bnode(krcp, bkvhead[i]))
- bkvhead[i] = NULL;
- raw_spin_unlock_irqrestore(&krcp->lock, flags);
-
- if (bkvhead[i])
- free_page((unsigned long) bkvhead[i]);
-
- cond_resched_tasks_rcu_qs();
- }
+ // Start from the tail page, so a GP is likely passed for it.
+ list_for_each_entry_safe(bnode, n, &bulk_head[i], list)
+ kvfree_rcu_bulk(krcp, bnode, i);
}
/*
@@ -3346,21 +3014,95 @@ static void kfree_rcu_work(struct work_struct *work)
* queued on a linked list through their rcu_head structures.
* This list is named "Channel 3".
*/
- for (; head; head = next) {
- unsigned long offset = (unsigned long)head->func;
- void *ptr = (void *)head - offset;
+ kvfree_rcu_list(head);
+}
- next = head->next;
- debug_rcu_head_unqueue((struct rcu_head *)ptr);
- rcu_lock_acquire(&rcu_callback_map);
- trace_rcu_invoke_kvfree_callback(rcu_state.name, head, offset);
+static bool
+need_offload_krc(struct kfree_rcu_cpu *krcp)
+{
+ int i;
- if (!WARN_ON_ONCE(!__is_kvfree_rcu_offset(offset)))
- kvfree(ptr);
+ for (i = 0; i < FREE_N_CHANNELS; i++)
+ if (!list_empty(&krcp->bulk_head[i]))
+ return true;
- rcu_lock_release(&rcu_callback_map);
- cond_resched_tasks_rcu_qs();
+ return !!READ_ONCE(krcp->head);
+}
+
+static bool
+need_wait_for_krwp_work(struct kfree_rcu_cpu_work *krwp)
+{
+ int i;
+
+ for (i = 0; i < FREE_N_CHANNELS; i++)
+ if (!list_empty(&krwp->bulk_head_free[i]))
+ return true;
+
+ return !!krwp->head_free;
+}
+
+static int krc_count(struct kfree_rcu_cpu *krcp)
+{
+ int sum = atomic_read(&krcp->head_count);
+ int i;
+
+ for (i = 0; i < FREE_N_CHANNELS; i++)
+ sum += atomic_read(&krcp->bulk_count[i]);
+
+ return sum;
+}
+
+static void
+schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp)
+{
+ long delay, delay_left;
+
+ delay = krc_count(krcp) >= KVFREE_BULK_MAX_ENTR ? 1:KFREE_DRAIN_JIFFIES;
+ if (delayed_work_pending(&krcp->monitor_work)) {
+ delay_left = krcp->monitor_work.timer.expires - jiffies;
+ if (delay < delay_left)
+ mod_delayed_work(system_wq, &krcp->monitor_work, delay);
+ return;
}
+ queue_delayed_work(system_wq, &krcp->monitor_work, delay);
+}
+
+static void
+kvfree_rcu_drain_ready(struct kfree_rcu_cpu *krcp)
+{
+ struct list_head bulk_ready[FREE_N_CHANNELS];
+ struct kvfree_rcu_bulk_data *bnode, *n;
+ struct rcu_head *head_ready = NULL;
+ unsigned long flags;
+ int i;
+
+ raw_spin_lock_irqsave(&krcp->lock, flags);
+ for (i = 0; i < FREE_N_CHANNELS; i++) {
+ INIT_LIST_HEAD(&bulk_ready[i]);
+
+ list_for_each_entry_safe_reverse(bnode, n, &krcp->bulk_head[i], list) {
+ if (!poll_state_synchronize_rcu(bnode->gp_snap))
+ break;
+
+ atomic_sub(bnode->nr_records, &krcp->bulk_count[i]);
+ list_move(&bnode->list, &bulk_ready[i]);
+ }
+ }
+
+ if (krcp->head && poll_state_synchronize_rcu(krcp->head_gp_snap)) {
+ head_ready = krcp->head;
+ atomic_set(&krcp->head_count, 0);
+ WRITE_ONCE(krcp->head, NULL);
+ }
+ raw_spin_unlock_irqrestore(&krcp->lock, flags);
+
+ for (i = 0; i < FREE_N_CHANNELS; i++) {
+ list_for_each_entry_safe(bnode, n, &bulk_ready[i], list)
+ kvfree_rcu_bulk(krcp, bnode, i);
+ }
+
+ if (head_ready)
+ kvfree_rcu_list(head_ready);
}
/*
@@ -3373,26 +3115,30 @@ static void kfree_rcu_monitor(struct work_struct *work)
unsigned long flags;
int i, j;
+ // Drain ready for reclaim.
+ kvfree_rcu_drain_ready(krcp);
+
raw_spin_lock_irqsave(&krcp->lock, flags);
// Attempt to start a new batch.
for (i = 0; i < KFREE_N_BATCHES; i++) {
struct kfree_rcu_cpu_work *krwp = &(krcp->krw_arr[i]);
- // Try to detach bkvhead or head and attach it over any
- // available corresponding free channel. It can be that
- // a previous RCU batch is in progress, it means that
- // immediately to queue another one is not possible so
- // in that case the monitor work is rearmed.
- if ((krcp->bkvhead[0] && !krwp->bkvhead_free[0]) ||
- (krcp->bkvhead[1] && !krwp->bkvhead_free[1]) ||
- (krcp->head && !krwp->head_free)) {
+ // Try to detach bulk_head or head and attach it, only when
+ // all channels are free. Any channel is not free means at krwp
+ // there is on-going rcu work to handle krwp's free business.
+ if (need_wait_for_krwp_work(krwp))
+ continue;
+
+ // kvfree_rcu_drain_ready() might handle this krcp, if so give up.
+ if (need_offload_krc(krcp)) {
// Channel 1 corresponds to the SLAB-pointer bulk path.
// Channel 2 corresponds to vmalloc-pointer bulk path.
for (j = 0; j < FREE_N_CHANNELS; j++) {
- if (!krwp->bkvhead_free[j]) {
- krwp->bkvhead_free[j] = krcp->bkvhead[j];
- krcp->bkvhead[j] = NULL;
+ if (list_empty(&krwp->bulk_head_free[j])) {
+ atomic_set(&krcp->bulk_count[j], 0);
+ list_replace_init(&krcp->bulk_head[j],
+ &krwp->bulk_head_free[j]);
}
}
@@ -3400,11 +3146,10 @@ static void kfree_rcu_monitor(struct work_struct *work)
// objects queued on the linked list.
if (!krwp->head_free) {
krwp->head_free = krcp->head;
- krcp->head = NULL;
+ atomic_set(&krcp->head_count, 0);
+ WRITE_ONCE(krcp->head, NULL);
}
- WRITE_ONCE(krcp->count, 0);
-
// One work is per one batch, so there are three
// "free channels", the batch can handle. It can
// be that the work is in the pending state when
@@ -3414,17 +3159,15 @@ static void kfree_rcu_monitor(struct work_struct *work)
}
}
+ raw_spin_unlock_irqrestore(&krcp->lock, flags);
+
// If there is nothing to detach, it means that our job is
// successfully done here. In case of having at least one
// of the channels that is still busy we should rearm the
// work to repeat an attempt. Because previous batches are
// still in progress.
- if (!krcp->bkvhead[0] && !krcp->bkvhead[1] && !krcp->head)
- krcp->monitor_todo = false;
- else
- schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES);
-
- raw_spin_unlock_irqrestore(&krcp->lock, flags);
+ if (need_offload_krc(krcp))
+ schedule_delayed_monitor_work(krcp);
}
static enum hrtimer_restart
@@ -3455,15 +3198,16 @@ static void fill_page_cache_func(struct work_struct *work)
bnode = (struct kvfree_rcu_bulk_data *)
__get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
- if (bnode) {
- raw_spin_lock_irqsave(&krcp->lock, flags);
- pushed = put_cached_bnode(krcp, bnode);
- raw_spin_unlock_irqrestore(&krcp->lock, flags);
+ if (!bnode)
+ break;
- if (!pushed) {
- free_page((unsigned long) bnode);
- break;
- }
+ raw_spin_lock_irqsave(&krcp->lock, flags);
+ pushed = put_cached_bnode(krcp, bnode);
+ raw_spin_unlock_irqrestore(&krcp->lock, flags);
+
+ if (!pushed) {
+ free_page((unsigned long) bnode);
+ break;
}
}
@@ -3506,10 +3250,11 @@ add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp,
return false;
idx = !!is_vmalloc_addr(ptr);
+ bnode = list_first_entry_or_null(&(*krcp)->bulk_head[idx],
+ struct kvfree_rcu_bulk_data, list);
/* Check if a new block is required. */
- if (!(*krcp)->bkvhead[idx] ||
- (*krcp)->bkvhead[idx]->nr_records == KVFREE_BULK_MAX_ENTR) {
+ if (!bnode || bnode->nr_records == KVFREE_BULK_MAX_ENTR) {
bnode = get_cached_bnode(*krcp);
if (!bnode && can_alloc) {
krc_this_cpu_unlock(*krcp, *flags);
@@ -3533,17 +3278,15 @@ add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp,
if (!bnode)
return false;
- /* Initialize the new block. */
+ // Initialize the new block and attach it.
bnode->nr_records = 0;
- bnode->next = (*krcp)->bkvhead[idx];
-
- /* Attach it to the head. */
- (*krcp)->bkvhead[idx] = bnode;
+ list_add(&bnode->list, &(*krcp)->bulk_head[idx]);
}
- /* Finally insert. */
- (*krcp)->bkvhead[idx]->records
- [(*krcp)->bkvhead[idx]->nr_records++] = ptr;
+ // Finally insert and update the GP for this page.
+ bnode->records[bnode->nr_records++] = ptr;
+ bnode->gp_snap = get_state_synchronize_rcu();
+ atomic_inc(&(*krcp)->bulk_count[idx]);
return true;
}
@@ -3560,26 +3303,21 @@ add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp,
* be free'd in workqueue context. This allows us to: batch requests together to
* reduce the number of grace periods during heavy kfree_rcu()/kvfree_rcu() load.
*/
-void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
+void kvfree_call_rcu(struct rcu_head *head, void *ptr)
{
unsigned long flags;
struct kfree_rcu_cpu *krcp;
bool success;
- void *ptr;
- if (head) {
- ptr = (void *) head - (unsigned long) func;
- } else {
- /*
- * Please note there is a limitation for the head-less
- * variant, that is why there is a clear rule for such
- * objects: it can be used from might_sleep() context
- * only. For other places please embed an rcu_head to
- * your data.
- */
+ /*
+ * Please note there is a limitation for the head-less
+ * variant, that is why there is a clear rule for such
+ * objects: it can be used from might_sleep() context
+ * only. For other places please embed an rcu_head to
+ * your data.
+ */
+ if (!head)
might_sleep();
- ptr = (unsigned long *) func;
- }
// Queue the object but don't yet schedule the batch.
if (debug_rcu_head_queue(ptr)) {
@@ -3591,7 +3329,7 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
return;
}
- kasan_record_aux_stack(ptr);
+ kasan_record_aux_stack_noalloc(ptr);
success = add_ptr_to_bulk_krc_lock(&krcp, &flags, ptr, !head);
if (!success) {
run_page_cache_worker(krcp);
@@ -3600,20 +3338,19 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
// Inline if kvfree_rcu(one_arg) call.
goto unlock_return;
- head->func = func;
+ head->func = ptr;
head->next = krcp->head;
- krcp->head = head;
+ WRITE_ONCE(krcp->head, head);
+ atomic_inc(&krcp->head_count);
+
+ // Take a snapshot for this krcp.
+ krcp->head_gp_snap = get_state_synchronize_rcu();
success = true;
}
- WRITE_ONCE(krcp->count, krcp->count + 1);
-
// Set timer to drain after KFREE_DRAIN_JIFFIES.
- if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
- !krcp->monitor_todo) {
- krcp->monitor_todo = true;
- schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES);
- }
+ if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING)
+ schedule_delayed_monitor_work(krcp);
unlock_return:
krc_this_cpu_unlock(krcp, flags);
@@ -3641,12 +3378,12 @@ kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
for_each_possible_cpu(cpu) {
struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
- count += READ_ONCE(krcp->count);
+ count += krc_count(krcp);
count += READ_ONCE(krcp->nr_bkv_objs);
atomic_set(&krcp->backoff_page_cache_fill, 1);
}
- return count;
+ return count == 0 ? SHRINK_EMPTY : count;
}
static unsigned long
@@ -3658,7 +3395,7 @@ kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
int count;
struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
- count = krcp->count;
+ count = krc_count(krcp);
count += drain_page_cache(krcp);
kfree_rcu_monitor(&krcp->monitor_work.work);
@@ -3682,60 +3419,32 @@ static struct shrinker kfree_rcu_shrinker = {
void __init kfree_rcu_scheduler_running(void)
{
int cpu;
- unsigned long flags;
for_each_possible_cpu(cpu) {
struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
- raw_spin_lock_irqsave(&krcp->lock, flags);
- if ((!krcp->bkvhead[0] && !krcp->bkvhead[1] && !krcp->head) ||
- krcp->monitor_todo) {
- raw_spin_unlock_irqrestore(&krcp->lock, flags);
- continue;
- }
- krcp->monitor_todo = true;
- schedule_delayed_work_on(cpu, &krcp->monitor_work,
- KFREE_DRAIN_JIFFIES);
- raw_spin_unlock_irqrestore(&krcp->lock, flags);
+ if (need_offload_krc(krcp))
+ schedule_delayed_monitor_work(krcp);
}
}
/*
* During early boot, any blocking grace-period wait automatically
- * implies a grace period. Later on, this is never the case for PREEMPTION.
+ * implies a grace period.
*
- * However, because a context switch is a grace period for !PREEMPTION, any
- * blocking grace-period wait automatically implies a grace period if
- * there is only one CPU online at any point time during execution of
- * either synchronize_rcu() or synchronize_rcu_expedited(). It is OK to
- * occasionally incorrectly indicate that there are multiple CPUs online
- * when there was in fact only one the whole time, as this just adds some
- * overhead: RCU still operates correctly.
+ * Later on, this could in theory be the case for kernels built with
+ * CONFIG_SMP=y && CONFIG_PREEMPTION=y running on a single CPU, but this
+ * is not a common case. Furthermore, this optimization would cause
+ * the rcu_gp_oldstate structure to expand by 50%, so this potential
+ * grace-period optimization is ignored once the scheduler is running.
*/
static int rcu_blocking_is_gp(void)
{
- int ret;
-
- if (IS_ENABLED(CONFIG_PREEMPTION))
- return rcu_scheduler_active == RCU_SCHEDULER_INACTIVE;
- might_sleep(); /* Check for RCU read-side critical section. */
- preempt_disable();
- /*
- * If the rcu_state.n_online_cpus counter is equal to one,
- * there is only one CPU, and that CPU sees all prior accesses
- * made by any CPU that was online at the time of its access.
- * Furthermore, if this counter is equal to one, its value cannot
- * change until after the preempt_enable() below.
- *
- * Furthermore, if rcu_state.n_online_cpus is equal to one here,
- * all later CPUs (both this one and any that come online later
- * on) are guaranteed to see all accesses prior to this point
- * in the code, without the need for additional memory barriers.
- * Those memory barriers are provided by CPU-hotplug code.
- */
- ret = READ_ONCE(rcu_state.n_online_cpus) <= 1;
- preempt_enable();
- return ret;
+ if (rcu_scheduler_active != RCU_SCHEDULER_INACTIVE) {
+ might_sleep();
+ return false;
+ }
+ return true;
}
/**
@@ -3778,20 +3487,59 @@ static int rcu_blocking_is_gp(void)
*/
void synchronize_rcu(void)
{
+ unsigned long flags;
+ struct rcu_node *rnp;
+
RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
lock_is_held(&rcu_lock_map) ||
lock_is_held(&rcu_sched_lock_map),
"Illegal synchronize_rcu() in RCU read-side critical section");
- if (rcu_blocking_is_gp())
- return; // Context allows vacuous grace periods.
- if (rcu_gp_is_expedited())
- synchronize_rcu_expedited();
- else
- wait_rcu_gp(call_rcu);
+ if (!rcu_blocking_is_gp()) {
+ if (rcu_gp_is_expedited())
+ synchronize_rcu_expedited();
+ else
+ wait_rcu_gp(call_rcu_hurry);
+ return;
+ }
+
+ // Context allows vacuous grace periods.
+ // Note well that this code runs with !PREEMPT && !SMP.
+ // In addition, all code that advances grace periods runs at
+ // process level. Therefore, this normal GP overlaps with other
+ // normal GPs only by being fully nested within them, which allows
+ // reuse of ->gp_seq_polled_snap.
+ rcu_poll_gp_seq_start_unlocked(&rcu_state.gp_seq_polled_snap);
+ rcu_poll_gp_seq_end_unlocked(&rcu_state.gp_seq_polled_snap);
+
+ // Update the normal grace-period counters to record
+ // this grace period, but only those used by the boot CPU.
+ // The rcu_scheduler_starting() will take care of the rest of
+ // these counters.
+ local_irq_save(flags);
+ WARN_ON_ONCE(num_online_cpus() > 1);
+ rcu_state.gp_seq += (1 << RCU_SEQ_CTR_SHIFT);
+ for (rnp = this_cpu_ptr(&rcu_data)->mynode; rnp; rnp = rnp->parent)
+ rnp->gp_seq_needed = rnp->gp_seq = rcu_state.gp_seq;
+ local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(synchronize_rcu);
/**
+ * get_completed_synchronize_rcu_full - Return a full pre-completed polled state cookie
+ * @rgosp: Place to put state cookie
+ *
+ * Stores into @rgosp a value that will always be treated by functions
+ * like poll_state_synchronize_rcu_full() as a cookie whose grace period
+ * has already completed.
+ */
+void get_completed_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
+{
+ rgosp->rgos_norm = RCU_GET_STATE_COMPLETED;
+ rgosp->rgos_exp = RCU_GET_STATE_COMPLETED;
+}
+EXPORT_SYMBOL_GPL(get_completed_synchronize_rcu_full);
+
+/**
* get_state_synchronize_rcu - Snapshot current RCU state
*
* Returns a cookie that is used by a later call to cond_synchronize_rcu()
@@ -3805,26 +3553,47 @@ unsigned long get_state_synchronize_rcu(void)
* before the load from ->gp_seq.
*/
smp_mb(); /* ^^^ */
- return rcu_seq_snap(&rcu_state.gp_seq);
+ return rcu_seq_snap(&rcu_state.gp_seq_polled);
}
EXPORT_SYMBOL_GPL(get_state_synchronize_rcu);
/**
- * start_poll_synchronize_rcu - Snapshot and start RCU grace period
+ * get_state_synchronize_rcu_full - Snapshot RCU state, both normal and expedited
+ * @rgosp: location to place combined normal/expedited grace-period state
*
- * Returns a cookie that is used by a later call to cond_synchronize_rcu()
- * or poll_state_synchronize_rcu() to determine whether or not a full
- * grace period has elapsed in the meantime. If the needed grace period
- * is not already slated to start, notifies RCU core of the need for that
- * grace period.
+ * Places the normal and expedited grace-period states in @rgosp. This
+ * state value can be passed to a later call to cond_synchronize_rcu_full()
+ * or poll_state_synchronize_rcu_full() to determine whether or not a
+ * grace period (whether normal or expedited) has elapsed in the meantime.
+ * The rcu_gp_oldstate structure takes up twice the memory of an unsigned
+ * long, but is guaranteed to see all grace periods. In contrast, the
+ * combined state occupies less memory, but can sometimes fail to take
+ * grace periods into account.
*
- * Interrupts must be enabled for the case where it is necessary to awaken
- * the grace-period kthread.
+ * This does not guarantee that the needed grace period will actually
+ * start.
*/
-unsigned long start_poll_synchronize_rcu(void)
+void get_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
+{
+ struct rcu_node *rnp = rcu_get_root();
+
+ /*
+ * Any prior manipulation of RCU-protected data must happen
+ * before the loads from ->gp_seq and ->expedited_sequence.
+ */
+ smp_mb(); /* ^^^ */
+ rgosp->rgos_norm = rcu_seq_snap(&rnp->gp_seq);
+ rgosp->rgos_exp = rcu_seq_snap(&rcu_state.expedited_sequence);
+}
+EXPORT_SYMBOL_GPL(get_state_synchronize_rcu_full);
+
+/*
+ * Helper function for start_poll_synchronize_rcu() and
+ * start_poll_synchronize_rcu_full().
+ */
+static void start_poll_synchronize_rcu_common(void)
{
unsigned long flags;
- unsigned long gp_seq = get_state_synchronize_rcu();
bool needwake;
struct rcu_data *rdp;
struct rcu_node *rnp;
@@ -3834,33 +3603,88 @@ unsigned long start_poll_synchronize_rcu(void)
rdp = this_cpu_ptr(&rcu_data);
rnp = rdp->mynode;
raw_spin_lock_rcu_node(rnp); // irqs already disabled.
- needwake = rcu_start_this_gp(rnp, rdp, gp_seq);
+ // Note it is possible for a grace period to have elapsed between
+ // the above call to get_state_synchronize_rcu() and the below call
+ // to rcu_seq_snap. This is OK, the worst that happens is that we
+ // get a grace period that no one needed. These accesses are ordered
+ // by smp_mb(), and we are accessing them in the opposite order
+ // from which they are updated at grace-period start, as required.
+ needwake = rcu_start_this_gp(rnp, rdp, rcu_seq_snap(&rcu_state.gp_seq));
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
if (needwake)
rcu_gp_kthread_wake();
+}
+
+/**
+ * start_poll_synchronize_rcu - Snapshot and start RCU grace period
+ *
+ * Returns a cookie that is used by a later call to cond_synchronize_rcu()
+ * or poll_state_synchronize_rcu() to determine whether or not a full
+ * grace period has elapsed in the meantime. If the needed grace period
+ * is not already slated to start, notifies RCU core of the need for that
+ * grace period.
+ *
+ * Interrupts must be enabled for the case where it is necessary to awaken
+ * the grace-period kthread.
+ */
+unsigned long start_poll_synchronize_rcu(void)
+{
+ unsigned long gp_seq = get_state_synchronize_rcu();
+
+ start_poll_synchronize_rcu_common();
return gp_seq;
}
EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu);
/**
- * poll_state_synchronize_rcu - Conditionally wait for an RCU grace period
+ * start_poll_synchronize_rcu_full - Take a full snapshot and start RCU grace period
+ * @rgosp: value from get_state_synchronize_rcu_full() or start_poll_synchronize_rcu_full()
*
+ * Places the normal and expedited grace-period states in *@rgos. This
+ * state value can be passed to a later call to cond_synchronize_rcu_full()
+ * or poll_state_synchronize_rcu_full() to determine whether or not a
+ * grace period (whether normal or expedited) has elapsed in the meantime.
+ * If the needed grace period is not already slated to start, notifies
+ * RCU core of the need for that grace period.
+ *
+ * Interrupts must be enabled for the case where it is necessary to awaken
+ * the grace-period kthread.
+ */
+void start_poll_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
+{
+ get_state_synchronize_rcu_full(rgosp);
+
+ start_poll_synchronize_rcu_common();
+}
+EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu_full);
+
+/**
+ * poll_state_synchronize_rcu - Has the specified RCU grace period completed?
* @oldstate: value from get_state_synchronize_rcu() or start_poll_synchronize_rcu()
*
* If a full RCU grace period has elapsed since the earlier call from
- * which oldstate was obtained, return @true, otherwise return @false.
+ * which @oldstate was obtained, return @true, otherwise return @false.
* If @false is returned, it is the caller's responsibility to invoke this
* function later on until it does return @true. Alternatively, the caller
* can explicitly wait for a grace period, for example, by passing @oldstate
- * to cond_synchronize_rcu() or by directly invoking synchronize_rcu().
+ * to either cond_synchronize_rcu() or cond_synchronize_rcu_expedited()
+ * on the one hand or by directly invoking either synchronize_rcu() or
+ * synchronize_rcu_expedited() on the other.
*
* Yes, this function does not take counter wrap into account.
* But counter wrap is harmless. If the counter wraps, we have waited for
- * more than 2 billion grace periods (and way more on a 64-bit system!).
- * Those needing to keep oldstate values for very long time periods
- * (many hours even on 32-bit systems) should check them occasionally
- * and either refresh them or set a flag indicating that the grace period
- * has completed.
+ * more than a billion grace periods (and way more on a 64-bit system!).
+ * Those needing to keep old state values for very long time periods
+ * (many hours even on 32-bit systems) should check them occasionally and
+ * either refresh them or set a flag indicating that the grace period has
+ * completed. Alternatively, they can use get_completed_synchronize_rcu()
+ * to get a guaranteed-completed grace-period state.
+ *
+ * In addition, because oldstate compresses the grace-period state for
+ * both normal and expedited grace periods into a single unsigned long,
+ * it can miss a grace period when synchronize_rcu() runs concurrently
+ * with synchronize_rcu_expedited(). If this is unacceptable, please
+ * instead use the _full() variant of these polling APIs.
*
* This function provides the same memory-ordering guarantees that
* would be provided by a synchronize_rcu() that was invoked at the call
@@ -3869,7 +3693,8 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu);
*/
bool poll_state_synchronize_rcu(unsigned long oldstate)
{
- if (rcu_seq_done(&rcu_state.gp_seq, oldstate)) {
+ if (oldstate == RCU_GET_STATE_COMPLETED ||
+ rcu_seq_done_exact(&rcu_state.gp_seq_polled, oldstate)) {
smp_mb(); /* Ensure GP ends before subsequent accesses. */
return true;
}
@@ -3878,22 +3703,70 @@ bool poll_state_synchronize_rcu(unsigned long oldstate)
EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu);
/**
- * cond_synchronize_rcu - Conditionally wait for an RCU grace period
+ * poll_state_synchronize_rcu_full - Has the specified RCU grace period completed?
+ * @rgosp: value from get_state_synchronize_rcu_full() or start_poll_synchronize_rcu_full()
*
- * @oldstate: value from get_state_synchronize_rcu() or start_poll_synchronize_rcu()
+ * If a full RCU grace period has elapsed since the earlier call from
+ * which *rgosp was obtained, return @true, otherwise return @false.
+ * If @false is returned, it is the caller's responsibility to invoke this
+ * function later on until it does return @true. Alternatively, the caller
+ * can explicitly wait for a grace period, for example, by passing @rgosp
+ * to cond_synchronize_rcu() or by directly invoking synchronize_rcu().
+ *
+ * Yes, this function does not take counter wrap into account.
+ * But counter wrap is harmless. If the counter wraps, we have waited
+ * for more than a billion grace periods (and way more on a 64-bit
+ * system!). Those needing to keep rcu_gp_oldstate values for very
+ * long time periods (many hours even on 32-bit systems) should check
+ * them occasionally and either refresh them or set a flag indicating
+ * that the grace period has completed. Alternatively, they can use
+ * get_completed_synchronize_rcu_full() to get a guaranteed-completed
+ * grace-period state.
+ *
+ * This function provides the same memory-ordering guarantees that would
+ * be provided by a synchronize_rcu() that was invoked at the call to
+ * the function that provided @rgosp, and that returned at the end of this
+ * function. And this guarantee requires that the root rcu_node structure's
+ * ->gp_seq field be checked instead of that of the rcu_state structure.
+ * The problem is that the just-ending grace-period's callbacks can be
+ * invoked between the time that the root rcu_node structure's ->gp_seq
+ * field is updated and the time that the rcu_state structure's ->gp_seq
+ * field is updated. Therefore, if a single synchronize_rcu() is to
+ * cause a subsequent poll_state_synchronize_rcu_full() to return @true,
+ * then the root rcu_node structure is the one that needs to be polled.
+ */
+bool poll_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
+{
+ struct rcu_node *rnp = rcu_get_root();
+
+ smp_mb(); // Order against root rcu_node structure grace-period cleanup.
+ if (rgosp->rgos_norm == RCU_GET_STATE_COMPLETED ||
+ rcu_seq_done_exact(&rnp->gp_seq, rgosp->rgos_norm) ||
+ rgosp->rgos_exp == RCU_GET_STATE_COMPLETED ||
+ rcu_seq_done_exact(&rcu_state.expedited_sequence, rgosp->rgos_exp)) {
+ smp_mb(); /* Ensure GP ends before subsequent accesses. */
+ return true;
+ }
+ return false;
+}
+EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu_full);
+
+/**
+ * cond_synchronize_rcu - Conditionally wait for an RCU grace period
+ * @oldstate: value from get_state_synchronize_rcu(), start_poll_synchronize_rcu(), or start_poll_synchronize_rcu_expedited()
*
* If a full RCU grace period has elapsed since the earlier call to
* get_state_synchronize_rcu() or start_poll_synchronize_rcu(), just return.
* Otherwise, invoke synchronize_rcu() to wait for a full grace period.
*
- * Yes, this function does not take counter wrap into account. But
- * counter wrap is harmless. If the counter wraps, we have waited for
+ * Yes, this function does not take counter wrap into account.
+ * But counter wrap is harmless. If the counter wraps, we have waited for
* more than 2 billion grace periods (and way more on a 64-bit system!),
- * so waiting for one additional grace period should be just fine.
+ * so waiting for a couple of additional grace periods should be just fine.
*
* This function provides the same memory-ordering guarantees that
* would be provided by a synchronize_rcu() that was invoked at the call
- * to the function that provided @oldstate, and that returned at the end
+ * to the function that provided @oldstate and that returned at the end
* of this function.
*/
void cond_synchronize_rcu(unsigned long oldstate)
@@ -3903,6 +3776,33 @@ void cond_synchronize_rcu(unsigned long oldstate)
}
EXPORT_SYMBOL_GPL(cond_synchronize_rcu);
+/**
+ * cond_synchronize_rcu_full - Conditionally wait for an RCU grace period
+ * @rgosp: value from get_state_synchronize_rcu_full(), start_poll_synchronize_rcu_full(), or start_poll_synchronize_rcu_expedited_full()
+ *
+ * If a full RCU grace period has elapsed since the call to
+ * get_state_synchronize_rcu_full(), start_poll_synchronize_rcu_full(),
+ * or start_poll_synchronize_rcu_expedited_full() from which @rgosp was
+ * obtained, just return. Otherwise, invoke synchronize_rcu() to wait
+ * for a full grace period.
+ *
+ * Yes, this function does not take counter wrap into account.
+ * But counter wrap is harmless. If the counter wraps, we have waited for
+ * more than 2 billion grace periods (and way more on a 64-bit system!),
+ * so waiting for a couple of additional grace periods should be just fine.
+ *
+ * This function provides the same memory-ordering guarantees that
+ * would be provided by a synchronize_rcu() that was invoked at the call
+ * to the function that provided @rgosp and that returned at the end of
+ * this function.
+ */
+void cond_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
+{
+ if (!poll_state_synchronize_rcu_full(rgosp))
+ synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(cond_synchronize_rcu_full);
+
/*
* Check to see if there is any immediate RCU-related work to be done by
* the current CPU, returning 1 if so and zero otherwise. The checks are
@@ -3987,26 +3887,56 @@ static void rcu_barrier_callback(struct rcu_head *rhp)
}
/*
- * Called with preemption disabled, and from cross-cpu IRQ context.
+ * If needed, entrain an rcu_barrier() callback on rdp->cblist.
*/
-static void rcu_barrier_func(void *cpu_in)
+static void rcu_barrier_entrain(struct rcu_data *rdp)
{
- uintptr_t cpu = (uintptr_t)cpu_in;
- struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+ unsigned long gseq = READ_ONCE(rcu_state.barrier_sequence);
+ unsigned long lseq = READ_ONCE(rdp->barrier_seq_snap);
+ bool wake_nocb = false;
+ bool was_alldone = false;
+ lockdep_assert_held(&rcu_state.barrier_lock);
+ if (rcu_seq_state(lseq) || !rcu_seq_state(gseq) || rcu_seq_ctr(lseq) != rcu_seq_ctr(gseq))
+ return;
rcu_barrier_trace(TPS("IRQ"), -1, rcu_state.barrier_sequence);
rdp->barrier_head.func = rcu_barrier_callback;
debug_rcu_head_queue(&rdp->barrier_head);
rcu_nocb_lock(rdp);
- WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies));
+ /*
+ * Flush bypass and wakeup rcuog if we add callbacks to an empty regular
+ * queue. This way we don't wait for bypass timer that can reach seconds
+ * if it's fully lazy.
+ */
+ was_alldone = rcu_rdp_is_offloaded(rdp) && !rcu_segcblist_pend_cbs(&rdp->cblist);
+ WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies, false));
+ wake_nocb = was_alldone && rcu_segcblist_pend_cbs(&rdp->cblist);
if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head)) {
atomic_inc(&rcu_state.barrier_cpu_count);
} else {
debug_rcu_head_unqueue(&rdp->barrier_head);
- rcu_barrier_trace(TPS("IRQNQ"), -1,
- rcu_state.barrier_sequence);
+ rcu_barrier_trace(TPS("IRQNQ"), -1, rcu_state.barrier_sequence);
}
rcu_nocb_unlock(rdp);
+ if (wake_nocb)
+ wake_nocb_gp(rdp, false);
+ smp_store_release(&rdp->barrier_seq_snap, gseq);
+}
+
+/*
+ * Called with preemption disabled, and from cross-cpu IRQ context.
+ */
+static void rcu_barrier_handler(void *cpu_in)
+{
+ uintptr_t cpu = (uintptr_t)cpu_in;
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+
+ lockdep_assert_irqs_disabled();
+ WARN_ON_ONCE(cpu != rdp->cpu);
+ WARN_ON_ONCE(cpu != smp_processor_id());
+ raw_spin_lock(&rcu_state.barrier_lock);
+ rcu_barrier_entrain(rdp);
+ raw_spin_unlock(&rcu_state.barrier_lock);
}
/**
@@ -4020,6 +3950,8 @@ static void rcu_barrier_func(void *cpu_in)
void rcu_barrier(void)
{
uintptr_t cpu;
+ unsigned long flags;
+ unsigned long gseq;
struct rcu_data *rdp;
unsigned long s = rcu_seq_snap(&rcu_state.barrier_sequence);
@@ -4030,15 +3962,16 @@ void rcu_barrier(void)
/* Did someone else do our work for us? */
if (rcu_seq_done(&rcu_state.barrier_sequence, s)) {
- rcu_barrier_trace(TPS("EarlyExit"), -1,
- rcu_state.barrier_sequence);
+ rcu_barrier_trace(TPS("EarlyExit"), -1, rcu_state.barrier_sequence);
smp_mb(); /* caller's subsequent code after above check. */
mutex_unlock(&rcu_state.barrier_mutex);
return;
}
/* Mark the start of the barrier operation. */
+ raw_spin_lock_irqsave(&rcu_state.barrier_lock, flags);
rcu_seq_start(&rcu_state.barrier_sequence);
+ gseq = rcu_state.barrier_sequence;
rcu_barrier_trace(TPS("Inc1"), -1, rcu_state.barrier_sequence);
/*
@@ -4050,7 +3983,7 @@ void rcu_barrier(void)
*/
init_completion(&rcu_state.barrier_completion);
atomic_set(&rcu_state.barrier_cpu_count, 2);
- get_online_cpus();
+ raw_spin_unlock_irqrestore(&rcu_state.barrier_lock, flags);
/*
* Force each CPU with callbacks to register a new callback.
@@ -4059,29 +3992,31 @@ void rcu_barrier(void)
*/
for_each_possible_cpu(cpu) {
rdp = per_cpu_ptr(&rcu_data, cpu);
- if (cpu_is_offline(cpu) &&
- !rcu_rdp_is_offloaded(rdp))
+retry:
+ if (smp_load_acquire(&rdp->barrier_seq_snap) == gseq)
continue;
- if (rcu_segcblist_n_cbs(&rdp->cblist) && cpu_online(cpu)) {
- rcu_barrier_trace(TPS("OnlineQ"), cpu,
- rcu_state.barrier_sequence);
- smp_call_function_single(cpu, rcu_barrier_func, (void *)cpu, 1);
- } else if (rcu_segcblist_n_cbs(&rdp->cblist) &&
- cpu_is_offline(cpu)) {
- rcu_barrier_trace(TPS("OfflineNoCBQ"), cpu,
- rcu_state.barrier_sequence);
- local_irq_disable();
- rcu_barrier_func((void *)cpu);
- local_irq_enable();
- } else if (cpu_is_offline(cpu)) {
- rcu_barrier_trace(TPS("OfflineNoCBNoQ"), cpu,
- rcu_state.barrier_sequence);
- } else {
- rcu_barrier_trace(TPS("OnlineNQ"), cpu,
- rcu_state.barrier_sequence);
+ raw_spin_lock_irqsave(&rcu_state.barrier_lock, flags);
+ if (!rcu_segcblist_n_cbs(&rdp->cblist)) {
+ WRITE_ONCE(rdp->barrier_seq_snap, gseq);
+ raw_spin_unlock_irqrestore(&rcu_state.barrier_lock, flags);
+ rcu_barrier_trace(TPS("NQ"), cpu, rcu_state.barrier_sequence);
+ continue;
+ }
+ if (!rcu_rdp_cpu_online(rdp)) {
+ rcu_barrier_entrain(rdp);
+ WARN_ON_ONCE(READ_ONCE(rdp->barrier_seq_snap) != gseq);
+ raw_spin_unlock_irqrestore(&rcu_state.barrier_lock, flags);
+ rcu_barrier_trace(TPS("OfflineNoCBQ"), cpu, rcu_state.barrier_sequence);
+ continue;
+ }
+ raw_spin_unlock_irqrestore(&rcu_state.barrier_lock, flags);
+ if (smp_call_function_single(cpu, rcu_barrier_handler, (void *)cpu, 1)) {
+ schedule_timeout_uninterruptible(1);
+ goto retry;
}
+ WARN_ON_ONCE(READ_ONCE(rdp->barrier_seq_snap) != gseq);
+ rcu_barrier_trace(TPS("OnlineQ"), cpu, rcu_state.barrier_sequence);
}
- put_online_cpus();
/*
* Now that we have an rcu_barrier_callback() callback on each
@@ -4096,6 +4031,12 @@ void rcu_barrier(void)
/* Mark the end of the barrier operation. */
rcu_barrier_trace(TPS("Inc2"), -1, rcu_state.barrier_sequence);
rcu_seq_end(&rcu_state.barrier_sequence);
+ gseq = rcu_state.barrier_sequence;
+ for_each_possible_cpu(cpu) {
+ rdp = per_cpu_ptr(&rcu_data, cpu);
+
+ WRITE_ONCE(rdp->barrier_seq_snap, gseq);
+ }
/* Other rcu_barrier() invocations can now safely proceed. */
mutex_unlock(&rcu_state.barrier_mutex);
@@ -4103,6 +4044,155 @@ void rcu_barrier(void)
EXPORT_SYMBOL_GPL(rcu_barrier);
/*
+ * Compute the mask of online CPUs for the specified rcu_node structure.
+ * This will not be stable unless the rcu_node structure's ->lock is
+ * held, but the bit corresponding to the current CPU will be stable
+ * in most contexts.
+ */
+static unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp)
+{
+ return READ_ONCE(rnp->qsmaskinitnext);
+}
+
+/*
+ * Is the CPU corresponding to the specified rcu_data structure online
+ * from RCU's perspective? This perspective is given by that structure's
+ * ->qsmaskinitnext field rather than by the global cpu_online_mask.
+ */
+static bool rcu_rdp_cpu_online(struct rcu_data *rdp)
+{
+ return !!(rdp->grpmask & rcu_rnp_online_cpus(rdp->mynode));
+}
+
+#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU)
+
+/*
+ * Is the current CPU online as far as RCU is concerned?
+ *
+ * Disable preemption to avoid false positives that could otherwise
+ * happen due to the current CPU number being sampled, this task being
+ * preempted, its old CPU being taken offline, resuming on some other CPU,
+ * then determining that its old CPU is now offline.
+ *
+ * Disable checking if in an NMI handler because we cannot safely
+ * report errors from NMI handlers anyway. In addition, it is OK to use
+ * RCU on an offline processor during initial boot, hence the check for
+ * rcu_scheduler_fully_active.
+ */
+bool rcu_lockdep_current_cpu_online(void)
+{
+ struct rcu_data *rdp;
+ bool ret = false;
+
+ if (in_nmi() || !rcu_scheduler_fully_active)
+ return true;
+ preempt_disable_notrace();
+ rdp = this_cpu_ptr(&rcu_data);
+ /*
+ * Strictly, we care here about the case where the current CPU is
+ * in rcu_cpu_starting() and thus has an excuse for rdp->grpmask
+ * not being up to date. So arch_spin_is_locked() might have a
+ * false positive if it's held by some *other* CPU, but that's
+ * OK because that just means a false *negative* on the warning.
+ */
+ if (rcu_rdp_cpu_online(rdp) || arch_spin_is_locked(&rcu_state.ofl_lock))
+ ret = true;
+ preempt_enable_notrace();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online);
+
+#endif /* #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) */
+
+// Has rcu_init() been invoked? This is used (for example) to determine
+// whether spinlocks may be acquired safely.
+static bool rcu_init_invoked(void)
+{
+ return !!rcu_state.n_online_cpus;
+}
+
+/*
+ * Near the end of the offline process. Trace the fact that this CPU
+ * is going offline.
+ */
+int rcutree_dying_cpu(unsigned int cpu)
+{
+ bool blkd;
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+ struct rcu_node *rnp = rdp->mynode;
+
+ if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
+ return 0;
+
+ blkd = !!(READ_ONCE(rnp->qsmask) & rdp->grpmask);
+ trace_rcu_grace_period(rcu_state.name, READ_ONCE(rnp->gp_seq),
+ blkd ? TPS("cpuofl-bgp") : TPS("cpuofl"));
+ return 0;
+}
+
+/*
+ * All CPUs for the specified rcu_node structure have gone offline,
+ * and all tasks that were preempted within an RCU read-side critical
+ * section while running on one of those CPUs have since exited their RCU
+ * read-side critical section. Some other CPU is reporting this fact with
+ * the specified rcu_node structure's ->lock held and interrupts disabled.
+ * This function therefore goes up the tree of rcu_node structures,
+ * clearing the corresponding bits in the ->qsmaskinit fields. Note that
+ * the leaf rcu_node structure's ->qsmaskinit field has already been
+ * updated.
+ *
+ * This function does check that the specified rcu_node structure has
+ * all CPUs offline and no blocked tasks, so it is OK to invoke it
+ * prematurely. That said, invoking it after the fact will cost you
+ * a needless lock acquisition. So once it has done its work, don't
+ * invoke it again.
+ */
+static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
+{
+ long mask;
+ struct rcu_node *rnp = rnp_leaf;
+
+ raw_lockdep_assert_held_rcu_node(rnp_leaf);
+ if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) ||
+ WARN_ON_ONCE(rnp_leaf->qsmaskinit) ||
+ WARN_ON_ONCE(rcu_preempt_has_tasks(rnp_leaf)))
+ return;
+ for (;;) {
+ mask = rnp->grpmask;
+ rnp = rnp->parent;
+ if (!rnp)
+ break;
+ raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
+ rnp->qsmaskinit &= ~mask;
+ /* Between grace periods, so better already be zero! */
+ WARN_ON_ONCE(rnp->qsmask);
+ if (rnp->qsmaskinit) {
+ raw_spin_unlock_rcu_node(rnp);
+ /* irqs remain disabled. */
+ return;
+ }
+ raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
+ }
+}
+
+/*
+ * The CPU has been completely removed, and some other CPU is reporting
+ * this fact from process context. Do the remainder of the cleanup.
+ * There can only be one CPU hotplug operation at a time, so no need for
+ * explicit locking.
+ */
+int rcutree_dead_cpu(unsigned int cpu)
+{
+ if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
+ return 0;
+
+ WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus - 1);
+ // Stop-machine done, so allow nohz_full to disable tick.
+ tick_dep_clear(TICK_DEP_BIT_RCU);
+ return 0;
+}
+
+/*
* Propagate ->qsinitmask bits up the rcu_node tree to account for the
* first CPU in a given leaf rcu_node structure coming online. The caller
* must hold the corresponding leaf rcu_node ->lock with interrupts
@@ -4136,17 +4226,20 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf)
static void __init
rcu_boot_init_percpu_data(int cpu)
{
+ struct context_tracking *ct = this_cpu_ptr(&context_tracking);
struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
/* Set up local state, ensuring consistent view of global state. */
rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu);
INIT_WORK(&rdp->strict_work, strict_work_handler);
- WARN_ON_ONCE(rdp->dynticks_nesting != 1);
- WARN_ON_ONCE(rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp)));
+ WARN_ON_ONCE(ct->dynticks_nesting != 1);
+ WARN_ON_ONCE(rcu_dynticks_in_eqs(rcu_dynticks_snap(cpu)));
+ rdp->barrier_seq_snap = rcu_state.barrier_sequence;
rdp->rcu_ofl_gp_seq = rcu_state.gp_seq;
rdp->rcu_ofl_gp_flags = RCU_GP_CLEANED;
rdp->rcu_onl_gp_seq = rcu_state.gp_seq;
rdp->rcu_onl_gp_flags = RCU_GP_CLEANED;
+ rdp->last_sched_clock = jiffies;
rdp->cpu = cpu;
rcu_boot_init_nocb_percpu_data(rdp);
}
@@ -4164,16 +4257,16 @@ rcu_boot_init_percpu_data(int cpu)
int rcutree_prepare_cpu(unsigned int cpu)
{
unsigned long flags;
+ struct context_tracking *ct = per_cpu_ptr(&context_tracking, cpu);
struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
struct rcu_node *rnp = rcu_get_root();
/* Set up local state, ensuring consistent view of global state. */
raw_spin_lock_irqsave_rcu_node(rnp, flags);
rdp->qlen_last_fqs_check = 0;
- rdp->n_force_qs_snap = rcu_state.n_force_qs;
+ rdp->n_force_qs_snap = READ_ONCE(rcu_state.n_force_qs);
rdp->blimit = blimit;
- rdp->dynticks_nesting = 1; /* CPU not up, no tearing. */
- rcu_dynticks_eqs_online();
+ ct->dynticks_nesting = 1; /* CPU not up, no tearing. */
raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
/*
@@ -4291,11 +4384,13 @@ void rcu_cpu_starting(unsigned int cpu)
rnp = rdp->mynode;
mask = rdp->grpmask;
- WRITE_ONCE(rnp->ofl_seq, rnp->ofl_seq + 1);
- WARN_ON_ONCE(!(rnp->ofl_seq & 0x1));
- smp_mb(); // Pair with rcu_gp_cleanup()'s ->ofl_seq barrier().
- raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ local_irq_save(flags);
+ arch_spin_lock(&rcu_state.ofl_lock);
+ rcu_dynticks_eqs_online();
+ raw_spin_lock(&rcu_state.barrier_lock);
+ raw_spin_lock_rcu_node(rnp);
WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext | mask);
+ raw_spin_unlock(&rcu_state.barrier_lock);
newcpu = !(rnp->expmaskinitnext & mask);
rnp->expmaskinitnext |= mask;
/* Allow lockless access for expedited grace periods. */
@@ -4307,15 +4402,18 @@ void rcu_cpu_starting(unsigned int cpu)
/* An incoming CPU should never be blocking a grace period. */
if (WARN_ON_ONCE(rnp->qsmask & mask)) { /* RCU waiting on incoming CPU? */
+ /* rcu_report_qs_rnp() *really* wants some flags to restore */
+ unsigned long flags2;
+
+ local_irq_save(flags2);
rcu_disable_urgency_upon_qs(rdp);
/* Report QS -after- changing ->qsmaskinitnext! */
- rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags);
+ rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags2);
} else {
- raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ raw_spin_unlock_rcu_node(rnp);
}
- smp_mb(); // Pair with rcu_gp_cleanup()'s ->ofl_seq barrier().
- WRITE_ONCE(rnp->ofl_seq, rnp->ofl_seq + 1);
- WARN_ON_ONCE(rnp->ofl_seq & 0x1);
+ arch_spin_unlock(&rcu_state.ofl_lock);
+ local_irq_restore(flags);
smp_mb(); /* Ensure RCU read-side usage follows above initialization. */
}
@@ -4329,7 +4427,7 @@ void rcu_cpu_starting(unsigned int cpu)
*/
void rcu_report_dead(unsigned int cpu)
{
- unsigned long flags;
+ unsigned long flags, seq_flags;
unsigned long mask;
struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */
@@ -4337,32 +4435,25 @@ void rcu_report_dead(unsigned int cpu)
// Do any dangling deferred wakeups.
do_nocb_deferred_wakeup(rdp);
- /* QS for any half-done expedited grace period. */
- preempt_disable();
- rcu_report_exp_rdp(this_cpu_ptr(&rcu_data));
- preempt_enable();
rcu_preempt_deferred_qs(current);
/* Remove outgoing CPU from mask in the leaf rcu_node structure. */
mask = rdp->grpmask;
- WRITE_ONCE(rnp->ofl_seq, rnp->ofl_seq + 1);
- WARN_ON_ONCE(!(rnp->ofl_seq & 0x1));
- smp_mb(); // Pair with rcu_gp_cleanup()'s ->ofl_seq barrier().
- raw_spin_lock(&rcu_state.ofl_lock);
+ local_irq_save(seq_flags);
+ arch_spin_lock(&rcu_state.ofl_lock);
raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */
rdp->rcu_ofl_gp_seq = READ_ONCE(rcu_state.gp_seq);
rdp->rcu_ofl_gp_flags = READ_ONCE(rcu_state.gp_flags);
if (rnp->qsmask & mask) { /* RCU waiting on outgoing CPU? */
/* Report quiescent state -before- changing ->qsmaskinitnext! */
+ rcu_disable_urgency_upon_qs(rdp);
rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags);
raw_spin_lock_irqsave_rcu_node(rnp, flags);
}
WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext & ~mask);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
- raw_spin_unlock(&rcu_state.ofl_lock);
- smp_mb(); // Pair with rcu_gp_cleanup()'s ->ofl_seq barrier().
- WRITE_ONCE(rnp->ofl_seq, rnp->ofl_seq + 1);
- WARN_ON_ONCE(rnp->ofl_seq & 0x1);
+ arch_spin_unlock(&rcu_state.ofl_lock);
+ local_irq_restore(seq_flags);
rdp->cpu_started = false;
}
@@ -4385,20 +4476,23 @@ void rcutree_migrate_callbacks(int cpu)
rcu_segcblist_empty(&rdp->cblist))
return; /* No callbacks to migrate. */
- local_irq_save(flags);
+ raw_spin_lock_irqsave(&rcu_state.barrier_lock, flags);
+ WARN_ON_ONCE(rcu_rdp_cpu_online(rdp));
+ rcu_barrier_entrain(rdp);
my_rdp = this_cpu_ptr(&rcu_data);
my_rnp = my_rdp->mynode;
rcu_nocb_lock(my_rdp); /* irqs already disabled. */
- WARN_ON_ONCE(!rcu_nocb_flush_bypass(my_rdp, NULL, jiffies));
+ WARN_ON_ONCE(!rcu_nocb_flush_bypass(my_rdp, NULL, jiffies, false));
raw_spin_lock_rcu_node(my_rnp); /* irqs already disabled. */
/* Leverage recent GPs and set GP for new callbacks. */
needwake = rcu_advance_cbs(my_rnp, rdp) ||
rcu_advance_cbs(my_rnp, my_rdp);
rcu_segcblist_merge(&my_rdp->cblist, &rdp->cblist);
+ raw_spin_unlock(&rcu_state.barrier_lock); /* irqs remain disabled. */
needwake = needwake || rcu_advance_cbs(my_rnp, my_rdp);
rcu_segcblist_disable(&rdp->cblist);
- WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) !=
- !rcu_segcblist_n_cbs(&my_rdp->cblist));
+ WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) != !rcu_segcblist_n_cbs(&my_rdp->cblist));
+ check_cb_ovld_locked(my_rdp, my_rnp);
if (rcu_rdp_is_offloaded(my_rdp)) {
raw_spin_unlock_rcu_node(my_rnp); /* irqs remain disabled. */
__call_rcu_nocb_wake(my_rdp, true, flags);
@@ -4427,11 +4521,13 @@ static int rcu_pm_notify(struct notifier_block *self,
switch (action) {
case PM_HIBERNATION_PREPARE:
case PM_SUSPEND_PREPARE:
+ rcu_async_hurry();
rcu_expedite_gp();
break;
case PM_POST_HIBERNATION:
case PM_POST_SUSPEND:
rcu_unexpedite_gp();
+ rcu_async_relax();
break;
default:
break;
@@ -4439,31 +4535,61 @@ static int rcu_pm_notify(struct notifier_block *self,
return NOTIFY_OK;
}
+#ifdef CONFIG_RCU_EXP_KTHREAD
+struct kthread_worker *rcu_exp_gp_kworker;
+struct kthread_worker *rcu_exp_par_gp_kworker;
+
+static void __init rcu_start_exp_gp_kworkers(void)
+{
+ const char *par_gp_kworker_name = "rcu_exp_par_gp_kthread_worker";
+ const char *gp_kworker_name = "rcu_exp_gp_kthread_worker";
+ struct sched_param param = { .sched_priority = kthread_prio };
+
+ rcu_exp_gp_kworker = kthread_create_worker(0, gp_kworker_name);
+ if (IS_ERR_OR_NULL(rcu_exp_gp_kworker)) {
+ pr_err("Failed to create %s!\n", gp_kworker_name);
+ return;
+ }
+
+ rcu_exp_par_gp_kworker = kthread_create_worker(0, par_gp_kworker_name);
+ if (IS_ERR_OR_NULL(rcu_exp_par_gp_kworker)) {
+ pr_err("Failed to create %s!\n", par_gp_kworker_name);
+ kthread_destroy_worker(rcu_exp_gp_kworker);
+ return;
+ }
+
+ sched_setscheduler_nocheck(rcu_exp_gp_kworker->task, SCHED_FIFO, &param);
+ sched_setscheduler_nocheck(rcu_exp_par_gp_kworker->task, SCHED_FIFO,
+ &param);
+}
+
+static inline void rcu_alloc_par_gp_wq(void)
+{
+}
+#else /* !CONFIG_RCU_EXP_KTHREAD */
+struct workqueue_struct *rcu_par_gp_wq;
+
+static void __init rcu_start_exp_gp_kworkers(void)
+{
+}
+
+static inline void rcu_alloc_par_gp_wq(void)
+{
+ rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0);
+ WARN_ON(!rcu_par_gp_wq);
+}
+#endif /* CONFIG_RCU_EXP_KTHREAD */
+
/*
* Spawn the kthreads that handle RCU's grace periods.
*/
static int __init rcu_spawn_gp_kthread(void)
{
unsigned long flags;
- int kthread_prio_in = kthread_prio;
struct rcu_node *rnp;
struct sched_param sp;
struct task_struct *t;
-
- /* Force priority into range. */
- if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 2
- && IS_BUILTIN(CONFIG_RCU_TORTURE_TEST))
- kthread_prio = 2;
- else if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 1)
- kthread_prio = 1;
- else if (kthread_prio < 0)
- kthread_prio = 0;
- else if (kthread_prio > 99)
- kthread_prio = 99;
-
- if (kthread_prio != kthread_prio_in)
- pr_alert("rcu_spawn_gp_kthread(): Limited prio to %d from %d\n",
- kthread_prio, kthread_prio_in);
+ struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
rcu_scheduler_fully_active = 1;
t = kthread_create(rcu_gp_kthread, NULL, "%s", rcu_state.name);
@@ -4481,9 +4607,17 @@ static int __init rcu_spawn_gp_kthread(void)
smp_store_release(&rcu_state.gp_kthread, t); /* ^^^ */
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
wake_up_process(t);
- rcu_spawn_nocb_kthreads();
- rcu_spawn_boost_kthreads();
+ /* This is a pre-SMP initcall, we expect a single CPU */
+ WARN_ON(num_online_cpus() > 1);
+ /*
+ * Those kthreads couldn't be created on rcu_init() -> rcutree_prepare_cpu()
+ * due to rcu_scheduler_fully_active.
+ */
+ rcu_spawn_cpu_nocb_kthread(smp_processor_id());
+ rcu_spawn_one_boost_kthread(rdp->mynode);
rcu_spawn_core_kthreads();
+ /* Create kthread worker for expedited GPs */
+ rcu_start_exp_gp_kworkers();
return 0;
}
early_initcall(rcu_spawn_gp_kthread);
@@ -4500,9 +4634,20 @@ early_initcall(rcu_spawn_gp_kthread);
*/
void rcu_scheduler_starting(void)
{
+ unsigned long flags;
+ struct rcu_node *rnp;
+
WARN_ON(num_online_cpus() != 1);
WARN_ON(nr_context_switches() > 0);
rcu_test_sync_prims();
+
+ // Fix up the ->gp_seq counters.
+ local_irq_save(flags);
+ rcu_for_each_node_breadth_first(rnp)
+ rnp->gp_seq_needed = rnp->gp_seq = rcu_state.gp_seq;
+ local_irq_restore(flags);
+
+ // Switch out of early boot mode.
rcu_scheduler_active = RCU_SCHEDULER_INIT;
rcu_test_sync_prims();
}
@@ -4575,6 +4720,10 @@ static void __init rcu_init_one(void)
init_waitqueue_head(&rnp->exp_wq[2]);
init_waitqueue_head(&rnp->exp_wq[3]);
spin_lock_init(&rnp->exp_lock);
+ mutex_init(&rnp->boost_kthread_mutex);
+ raw_spin_lock_init(&rnp->exp_poll_lock);
+ rnp->exp_seq_poll_rq = RCU_GET_STATE_COMPLETED;
+ INIT_WORK(&rnp->exp_poll_wq, sync_rcu_do_polled_gp);
}
}
@@ -4590,6 +4739,28 @@ static void __init rcu_init_one(void)
}
/*
+ * Force priority from the kernel command-line into range.
+ */
+static void __init sanitize_kthread_prio(void)
+{
+ int kthread_prio_in = kthread_prio;
+
+ if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 2
+ && IS_BUILTIN(CONFIG_RCU_TORTURE_TEST))
+ kthread_prio = 2;
+ else if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 1)
+ kthread_prio = 1;
+ else if (kthread_prio < 0)
+ kthread_prio = 0;
+ else if (kthread_prio > 99)
+ kthread_prio = 99;
+
+ if (kthread_prio != kthread_prio_in)
+ pr_alert("%s: Limited prio to %d from %d\n",
+ __func__, kthread_prio, kthread_prio_in);
+}
+
+/*
* Compute the rcu_node tree geometry from kernel parameters. This cannot
* replace the definitions in tree.h because those are needed to size
* the ->node array in the rcu_state structure.
@@ -4706,12 +4877,11 @@ static void __init rcu_dump_rcu_node_tree(void)
}
struct workqueue_struct *rcu_gp_wq;
-struct workqueue_struct *rcu_par_gp_wq;
static void __init kfree_rcu_batch_init(void)
{
int cpu;
- int i;
+ int i, j;
/* Clamp it to [0:100] seconds interval. */
if (rcu_delay_page_cache_fill_msec < 0 ||
@@ -4731,24 +4901,31 @@ static void __init kfree_rcu_batch_init(void)
for (i = 0; i < KFREE_N_BATCHES; i++) {
INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);
krcp->krw_arr[i].krcp = krcp;
+
+ for (j = 0; j < FREE_N_CHANNELS; j++)
+ INIT_LIST_HEAD(&krcp->krw_arr[i].bulk_head_free[j]);
}
+ for (i = 0; i < FREE_N_CHANNELS; i++)
+ INIT_LIST_HEAD(&krcp->bulk_head[i]);
+
INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
INIT_DELAYED_WORK(&krcp->page_cache_work, fill_page_cache_func);
krcp->initialized = true;
}
- if (register_shrinker(&kfree_rcu_shrinker))
+ if (register_shrinker(&kfree_rcu_shrinker, "rcu-kfree"))
pr_err("Failed to register kfree_rcu() shrinker!\n");
}
void __init rcu_init(void)
{
- int cpu;
+ int cpu = smp_processor_id();
rcu_early_boot_tests();
kfree_rcu_batch_init();
rcu_bootup_announce();
+ sanitize_kthread_prio();
rcu_init_geometry();
rcu_init_one();
if (dump_tree)
@@ -4762,17 +4939,15 @@ void __init rcu_init(void)
* or the scheduler are operational.
*/
pm_notifier(rcu_pm_notify, 0);
- for_each_online_cpu(cpu) {
- rcutree_prepare_cpu(cpu);
- rcu_cpu_starting(cpu);
- rcutree_online_cpu(cpu);
- }
+ WARN_ON(num_online_cpus() > 1); // Only one CPU this early in boot.
+ rcutree_prepare_cpu(cpu);
+ rcu_cpu_starting(cpu);
+ rcutree_online_cpu(cpu);
/* Create workqueue for Tree SRCU and for expedited GPs. */
rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0);
WARN_ON(!rcu_gp_wq);
- rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0);
- WARN_ON(!rcu_par_gp_wq);
+ rcu_alloc_par_gp_wq();
/* Fill in default value for rcutree.qovld boot parameter. */
/* -After- the rcu_node ->lock fields are initialized! */
@@ -4780,8 +4955,14 @@ void __init rcu_init(void)
qovld_calc = DEFAULT_RCU_QOVLD_MULT * qhimark;
else
qovld_calc = qovld;
+
+ // Kick-start in case any polled grace periods started early.
+ (void)start_poll_synchronize_rcu_expedited();
+
+ rcu_test_sync_prims();
}
#include "tree_stall.h"
#include "tree_exp.h"
+#include "tree_nocb.h"
#include "tree_plugin.h"
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 305cf6aeb408..192536916f9a 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -10,6 +10,7 @@
*/
#include <linux/cache.h>
+#include <linux/kthread.h>
#include <linux/spinlock.h>
#include <linux/rtmutex.h>
#include <linux/threads.h>
@@ -23,7 +24,11 @@
/* Communicate arguments to a workqueue handler. */
struct rcu_exp_work {
unsigned long rew_s;
+#ifdef CONFIG_RCU_EXP_KTHREAD
+ struct kthread_work rew_work;
+#else
struct work_struct rew_work;
+#endif /* CONFIG_RCU_EXP_KTHREAD */
};
/* RCU's kthread states for tracing. */
@@ -56,8 +61,6 @@ struct rcu_node {
/* Initialized from ->qsmaskinitnext at the */
/* beginning of each grace period. */
unsigned long qsmaskinitnext;
- unsigned long ofl_seq; /* CPU-hotplug operation sequence count. */
- /* Online CPUs for next grace period. */
unsigned long expmask; /* CPUs or groups that need to check in */
/* to allow the current expedited GP */
/* to complete. */
@@ -110,6 +113,9 @@ struct rcu_node {
/* side effect, not as a lock. */
unsigned long boost_time;
/* When to start boosting (jiffies). */
+ struct mutex boost_kthread_mutex;
+ /* Exclusion for thread spawning and affinity */
+ /* manipulation. */
struct task_struct *boost_kthread_task;
/* kthread that takes care of priority */
/* boosting for this rcu_node structure. */
@@ -127,6 +133,10 @@ struct rcu_node {
wait_queue_head_t exp_wq[4];
struct rcu_exp_work rew;
bool exp_need_flush; /* Need to flush workitem? */
+ raw_spinlock_t exp_poll_lock;
+ /* Lock and data for polled expedited grace periods. */
+ unsigned long exp_seq_poll_rq;
+ struct work_struct exp_poll_wq;
} ____cacheline_internodealigned_in_smp;
/*
@@ -148,6 +158,23 @@ union rcu_noqs {
u16 s; /* Set of bits, aggregate OR here. */
};
+/*
+ * Record the snapshot of the core stats at half of the first RCU stall timeout.
+ * The member gp_seq is used to ensure that all members are updated only once
+ * during the sampling period. The snapshot is taken only if this gp_seq is not
+ * equal to rdp->gp_seq.
+ */
+struct rcu_snap_record {
+ unsigned long gp_seq; /* Track rdp->gp_seq counter */
+ u64 cputime_irq; /* Accumulated cputime of hard irqs */
+ u64 cputime_softirq;/* Accumulated cputime of soft irqs */
+ u64 cputime_system; /* Accumulated cputime of kernel tasks */
+ unsigned long nr_hardirqs; /* Accumulated number of hard irqs */
+ unsigned int nr_softirqs; /* Accumulated number of soft irqs */
+ unsigned long long nr_csw; /* Accumulated number of task switches */
+ unsigned long jiffies; /* Track jiffies value */
+};
+
/* Per-CPU data for read-copy update. */
struct rcu_data {
/* 1) quiescent-state and grace-period handling : */
@@ -157,7 +184,6 @@ struct rcu_data {
bool core_needs_qs; /* Core waits for quiescent state. */
bool beenonline; /* CPU online at least once. */
bool gpwrap; /* Possible ->gp_seq wrap. */
- bool exp_deferred_qs; /* This CPU awaiting a deferred QS? */
bool cpu_started; /* RCU watching this onlining CPU. */
struct rcu_node *mynode; /* This CPU's leaf of hierarchy */
unsigned long grpmask; /* Mask to apply to leaf qsmask. */
@@ -182,20 +208,13 @@ struct rcu_data {
/* 3) dynticks interface. */
int dynticks_snap; /* Per-GP tracking for dynticks. */
- long dynticks_nesting; /* Track process nesting level. */
- long dynticks_nmi_nesting; /* Track irq/NMI nesting level. */
- atomic_t dynticks; /* Even value for idle, else odd. */
bool rcu_need_heavy_qs; /* GP old, so heavy quiescent state! */
bool rcu_urgent_qs; /* GP old need light quiescent state. */
bool rcu_forced_tick; /* Forced tick to provide QS. */
bool rcu_forced_tick_exp; /* ... provide QS to expedited GP. */
-#ifdef CONFIG_RCU_FAST_NO_HZ
- unsigned long last_accelerate; /* Last jiffy CBs were accelerated. */
- unsigned long last_advance_all; /* Last jiffy CBs were all advanced. */
- int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */
-#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
/* 4) rcu_barrier(), OOM callbacks, and expediting. */
+ unsigned long barrier_seq_snap; /* Snap of rcu_state.barrier_sequence. */
struct rcu_head barrier_head;
int exp_dynticks_snap; /* Double-check need for IPI. */
@@ -209,6 +228,8 @@ struct rcu_data {
int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */
struct timer_list nocb_timer; /* Enforce finite deferral. */
unsigned long nocb_gp_adv_time; /* Last call_rcu() CB adv (jiffies). */
+ struct mutex nocb_gp_kthread_mutex; /* Exclusion for nocb gp kthread */
+ /* spawning */
/* The following fields are used by call_rcu, hence own cacheline. */
raw_spinlock_t nocb_bypass_lock ____cacheline_internodealigned_in_smp;
@@ -227,8 +248,12 @@ struct rcu_data {
struct swait_queue_head nocb_gp_wq; /* For nocb kthreads to sleep on. */
bool nocb_cb_sleep; /* Is the nocb CB thread asleep? */
struct task_struct *nocb_cb_kthread;
- struct rcu_data *nocb_next_cb_rdp;
- /* Next rcu_data in wakeup chain. */
+ struct list_head nocb_head_rdp; /*
+ * Head of rcu_data list in wakeup chain,
+ * if rdp_gp.
+ */
+ struct list_head nocb_entry_rdp; /* rcu_data node in wakeup chain. */
+ struct rcu_data *nocb_toggling_rdp; /* rdp queued for (de-)offloading */
/* The following fields are used by CB kthread, hence new cacheline. */
struct rcu_data *nocb_gp_rdp ____cacheline_internodealigned_in_smp;
@@ -240,6 +265,7 @@ struct rcu_data {
/* rcuc per-CPU kthread or NULL. */
unsigned int rcu_cpu_kthread_status;
char rcu_cpu_has_work;
+ unsigned long rcuc_activity;
/* 7) Diagnostic data, including RCU CPU stall warnings. */
unsigned int softirq_snap; /* Snapshot of softirq activity. */
@@ -252,15 +278,20 @@ struct rcu_data {
unsigned long rcu_onl_gp_seq; /* ->gp_seq at last online. */
short rcu_onl_gp_flags; /* ->gp_flags at last online. */
unsigned long last_fqs_resched; /* Time of last rcu_resched(). */
+ unsigned long last_sched_clock; /* Jiffies of last rcu_sched_clock_irq(). */
+ struct rcu_snap_record snap_record; /* Snapshot of core stats at half of */
+ /* the first RCU stall timeout */
+ long lazy_len; /* Length of buffered lazy callbacks. */
int cpu;
};
/* Values for nocb_defer_wakeup field in struct rcu_data. */
#define RCU_NOCB_WAKE_NOT 0
#define RCU_NOCB_WAKE_BYPASS 1
-#define RCU_NOCB_WAKE 2
-#define RCU_NOCB_WAKE_FORCE 3
+#define RCU_NOCB_WAKE_LAZY 2
+#define RCU_NOCB_WAKE 3
+#define RCU_NOCB_WAKE_FORCE 4
#define RCU_JIFFIES_TILL_FORCE_QS (1 + (HZ > 250) + (HZ > 500))
/* For jiffies_till_first_fqs and */
@@ -305,9 +336,8 @@ struct rcu_state {
/* The following fields are guarded by the root rcu_node's lock. */
- u8 boost ____cacheline_internodealigned_in_smp;
- /* Subject to priority boost. */
- unsigned long gp_seq; /* Grace-period sequence #. */
+ unsigned long gp_seq ____cacheline_internodealigned_in_smp;
+ /* Grace-period sequence #. */
unsigned long gp_max; /* Maximum GP duration in */
/* jiffies. */
struct task_struct *gp_kthread; /* Task for grace periods. */
@@ -316,6 +346,9 @@ struct rcu_state {
short gp_state; /* GP kthread sleep state. */
unsigned long gp_wake_time; /* Last GP kthread wake. */
unsigned long gp_wake_seq; /* ->gp_seq at ^^^. */
+ unsigned long gp_seq_polled; /* GP seq for polled API. */
+ unsigned long gp_seq_polled_snap; /* ->gp_seq_polled at normal GP start. */
+ unsigned long gp_seq_polled_exp_snap; /* ->gp_seq_polled at expedited GP start. */
/* End of fields guarded by root rcu_node's lock. */
@@ -326,6 +359,8 @@ struct rcu_state {
/* rcu_barrier(). */
/* End of fields guarded by barrier_mutex. */
+ raw_spinlock_t barrier_lock; /* Protects ->barrier_seq_snap. */
+
struct mutex exp_mutex; /* Serialize expedited GP. */
struct mutex exp_wake_mutex; /* Serialize wakeup. */
unsigned long expedited_sequence; /* Take a ticket. */
@@ -358,9 +393,10 @@ struct rcu_state {
const char *name; /* Name of structure. */
char abbr; /* Abbreviated name. */
- raw_spinlock_t ofl_lock ____cacheline_internodealigned_in_smp;
+ arch_spinlock_t ofl_lock ____cacheline_internodealigned_in_smp;
/* Synchronize offline with */
/* GP pre-initialization. */
+ int nocb_is_setup; /* nocb is setup from boot */
};
/* Values for rcu_state structure's gp_flags field. */
@@ -415,30 +451,27 @@ static void rcu_flavor_sched_clock_irq(int user);
static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck);
static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
-static bool rcu_is_callbacks_kthread(void);
+static bool rcu_is_callbacks_kthread(struct rcu_data *rdp);
static void rcu_cpu_kthread_setup(unsigned int cpu);
static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp);
-static void __init rcu_spawn_boost_kthreads(void);
-static void rcu_cleanup_after_idle(void);
-static void rcu_prepare_for_idle(void);
static bool rcu_preempt_has_tasks(struct rcu_node *rnp);
static bool rcu_preempt_need_deferred_qs(struct task_struct *t);
-static void rcu_preempt_deferred_qs(struct task_struct *t);
static void zero_cpu_stall_ticks(struct rcu_data *rdp);
static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp);
static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq);
static void rcu_init_one_nocb(struct rcu_node *rnp);
+static bool wake_nocb_gp(struct rcu_data *rdp, bool force);
static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
- unsigned long j);
+ unsigned long j, bool lazy);
static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
- bool *was_alldone, unsigned long flags);
+ bool *was_alldone, unsigned long flags,
+ bool lazy);
static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty,
unsigned long flags);
static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp, int level);
static bool do_nocb_deferred_wakeup(struct rcu_data *rdp);
static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);
static void rcu_spawn_cpu_nocb_kthread(int cpu);
-static void __init rcu_spawn_nocb_kthreads(void);
static void show_rcu_nocb_state(struct rcu_data *rdp);
static void rcu_nocb_lock(struct rcu_data *rdp);
static void rcu_nocb_unlock(struct rcu_data *rdp);
@@ -447,12 +480,16 @@ static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp,
static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp);
#ifdef CONFIG_RCU_NOCB_CPU
static void __init rcu_organize_nocb_kthreads(void);
-#define rcu_nocb_lock_irqsave(rdp, flags) \
-do { \
- if (!rcu_segcblist_is_offloaded(&(rdp)->cblist)) \
- local_irq_save(flags); \
- else \
- raw_spin_lock_irqsave(&(rdp)->nocb_lock, (flags)); \
+
+/*
+ * Disable IRQs before checking offloaded state so that local
+ * locking is safe against concurrent de-offloading.
+ */
+#define rcu_nocb_lock_irqsave(rdp, flags) \
+do { \
+ local_irq_save(flags); \
+ if (rcu_segcblist_is_offloaded(&(rdp)->cblist)) \
+ raw_spin_lock(&(rdp)->nocb_lock); \
} while (0)
#else /* #ifdef CONFIG_RCU_NOCB_CPU */
#define rcu_nocb_lock_irqsave(rdp, flags) local_irq_save(flags)
@@ -460,10 +497,6 @@ do { \
static void rcu_bind_gp_kthread(void);
static bool rcu_nohz_full_cpu(void);
-static void rcu_dynticks_task_enter(void);
-static void rcu_dynticks_task_exit(void);
-static void rcu_dynticks_task_trace_enter(void);
-static void rcu_dynticks_task_trace_exit(void);
/* Forward declarations for tree_stall.h */
static void record_gp_stall_check_time(void);
@@ -471,3 +504,6 @@ static void rcu_iw_handler(struct irq_work *iwp);
static void check_cpu_stall(struct rcu_data *rdp);
static void rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp,
const unsigned long gpssdelay);
+
+/* Forward declarations for tree_exp.h. */
+static void sync_rcu_do_polled_gp(struct work_struct *wp);
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 2796084ef85a..3b7abb58157d 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -11,6 +11,7 @@
static void rcu_exp_handler(void *unused);
static int rcu_print_task_exp_stall(struct rcu_node *rnp);
+static void rcu_exp_print_detail_task_stall_rnp(struct rcu_node *rnp);
/*
* Record the start of an expedited grace period.
@@ -18,6 +19,7 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp);
static void rcu_exp_gp_seq_start(void)
{
rcu_seq_start(&rcu_state.expedited_sequence);
+ rcu_poll_gp_seq_start_unlocked(&rcu_state.gp_seq_polled_exp_snap);
}
/*
@@ -34,6 +36,7 @@ static __maybe_unused unsigned long rcu_exp_gp_seq_endval(void)
*/
static void rcu_exp_gp_seq_end(void)
{
+ rcu_poll_gp_seq_end_unlocked(&rcu_state.gp_seq_polled_exp_snap);
rcu_seq_end(&rcu_state.expedited_sequence);
smp_mb(); /* Ensure that consecutive grace periods serialize. */
}
@@ -255,7 +258,7 @@ static void rcu_report_exp_cpu_mult(struct rcu_node *rnp,
*/
static void rcu_report_exp_rdp(struct rcu_data *rdp)
{
- WRITE_ONCE(rdp->exp_deferred_qs, false);
+ WRITE_ONCE(rdp->cpu_no_qs.b.exp, false);
rcu_report_exp_cpu_mult(rdp->mynode, rdp->grpmask, true);
}
@@ -334,15 +337,13 @@ fastpath:
* Select the CPUs within the specified rcu_node that the upcoming
* expedited grace period needs to wait for.
*/
-static void sync_rcu_exp_select_node_cpus(struct work_struct *wp)
+static void __sync_rcu_exp_select_node_cpus(struct rcu_exp_work *rewp)
{
int cpu;
unsigned long flags;
unsigned long mask_ofl_test;
unsigned long mask_ofl_ipi;
int ret;
- struct rcu_exp_work *rewp =
- container_of(wp, struct rcu_exp_work, rew_work);
struct rcu_node *rnp = container_of(rewp, struct rcu_node, rew);
raw_spin_lock_irqsave_rcu_node(rnp, flags);
@@ -358,7 +359,7 @@ static void sync_rcu_exp_select_node_cpus(struct work_struct *wp)
!(rnp->qsmaskinitnext & mask)) {
mask_ofl_test |= mask;
} else {
- snap = rcu_dynticks_snap(rdp);
+ snap = rcu_dynticks_snap(cpu);
if (rcu_dynticks_in_eqs(snap))
mask_ofl_test |= mask;
else
@@ -387,6 +388,7 @@ retry_ipi:
continue;
}
if (get_cpu() == cpu) {
+ mask_ofl_test |= mask;
put_cpu();
continue;
}
@@ -416,13 +418,119 @@ retry_ipi:
rcu_report_exp_cpu_mult(rnp, mask_ofl_test, false);
}
+static void rcu_exp_sel_wait_wake(unsigned long s);
+
+#ifdef CONFIG_RCU_EXP_KTHREAD
+static void sync_rcu_exp_select_node_cpus(struct kthread_work *wp)
+{
+ struct rcu_exp_work *rewp =
+ container_of(wp, struct rcu_exp_work, rew_work);
+
+ __sync_rcu_exp_select_node_cpus(rewp);
+}
+
+static inline bool rcu_gp_par_worker_started(void)
+{
+ return !!READ_ONCE(rcu_exp_par_gp_kworker);
+}
+
+static inline void sync_rcu_exp_select_cpus_queue_work(struct rcu_node *rnp)
+{
+ kthread_init_work(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus);
+ /*
+ * Use rcu_exp_par_gp_kworker, because flushing a work item from
+ * another work item on the same kthread worker can result in
+ * deadlock.
+ */
+ kthread_queue_work(rcu_exp_par_gp_kworker, &rnp->rew.rew_work);
+}
+
+static inline void sync_rcu_exp_select_cpus_flush_work(struct rcu_node *rnp)
+{
+ kthread_flush_work(&rnp->rew.rew_work);
+}
+
+/*
+ * Work-queue handler to drive an expedited grace period forward.
+ */
+static void wait_rcu_exp_gp(struct kthread_work *wp)
+{
+ struct rcu_exp_work *rewp;
+
+ rewp = container_of(wp, struct rcu_exp_work, rew_work);
+ rcu_exp_sel_wait_wake(rewp->rew_s);
+}
+
+static inline void synchronize_rcu_expedited_queue_work(struct rcu_exp_work *rew)
+{
+ kthread_init_work(&rew->rew_work, wait_rcu_exp_gp);
+ kthread_queue_work(rcu_exp_gp_kworker, &rew->rew_work);
+}
+
+static inline void synchronize_rcu_expedited_destroy_work(struct rcu_exp_work *rew)
+{
+}
+#else /* !CONFIG_RCU_EXP_KTHREAD */
+static void sync_rcu_exp_select_node_cpus(struct work_struct *wp)
+{
+ struct rcu_exp_work *rewp =
+ container_of(wp, struct rcu_exp_work, rew_work);
+
+ __sync_rcu_exp_select_node_cpus(rewp);
+}
+
+static inline bool rcu_gp_par_worker_started(void)
+{
+ return !!READ_ONCE(rcu_par_gp_wq);
+}
+
+static inline void sync_rcu_exp_select_cpus_queue_work(struct rcu_node *rnp)
+{
+ int cpu = find_next_bit(&rnp->ffmask, BITS_PER_LONG, -1);
+
+ INIT_WORK(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus);
+ /* If all offline, queue the work on an unbound CPU. */
+ if (unlikely(cpu > rnp->grphi - rnp->grplo))
+ cpu = WORK_CPU_UNBOUND;
+ else
+ cpu += rnp->grplo;
+ queue_work_on(cpu, rcu_par_gp_wq, &rnp->rew.rew_work);
+}
+
+static inline void sync_rcu_exp_select_cpus_flush_work(struct rcu_node *rnp)
+{
+ flush_work(&rnp->rew.rew_work);
+}
+
+/*
+ * Work-queue handler to drive an expedited grace period forward.
+ */
+static void wait_rcu_exp_gp(struct work_struct *wp)
+{
+ struct rcu_exp_work *rewp;
+
+ rewp = container_of(wp, struct rcu_exp_work, rew_work);
+ rcu_exp_sel_wait_wake(rewp->rew_s);
+}
+
+static inline void synchronize_rcu_expedited_queue_work(struct rcu_exp_work *rew)
+{
+ INIT_WORK_ONSTACK(&rew->rew_work, wait_rcu_exp_gp);
+ queue_work(rcu_gp_wq, &rew->rew_work);
+}
+
+static inline void synchronize_rcu_expedited_destroy_work(struct rcu_exp_work *rew)
+{
+ destroy_work_on_stack(&rew->rew_work);
+}
+#endif /* CONFIG_RCU_EXP_KTHREAD */
+
/*
* Select the nodes that the upcoming expedited grace period needs
* to wait for.
*/
static void sync_rcu_exp_select_cpus(void)
{
- int cpu;
struct rcu_node *rnp;
trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("reset"));
@@ -434,28 +542,21 @@ static void sync_rcu_exp_select_cpus(void)
rnp->exp_need_flush = false;
if (!READ_ONCE(rnp->expmask))
continue; /* Avoid early boot non-existent wq. */
- if (!READ_ONCE(rcu_par_gp_wq) ||
+ if (!rcu_gp_par_worker_started() ||
rcu_scheduler_active != RCU_SCHEDULER_RUNNING ||
rcu_is_last_leaf_node(rnp)) {
- /* No workqueues yet or last leaf, do direct call. */
+ /* No worker started yet or last leaf, do direct call. */
sync_rcu_exp_select_node_cpus(&rnp->rew.rew_work);
continue;
}
- INIT_WORK(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus);
- cpu = find_next_bit(&rnp->ffmask, BITS_PER_LONG, -1);
- /* If all offline, queue the work on an unbound CPU. */
- if (unlikely(cpu > rnp->grphi - rnp->grplo))
- cpu = WORK_CPU_UNBOUND;
- else
- cpu += rnp->grplo;
- queue_work_on(cpu, rcu_par_gp_wq, &rnp->rew.rew_work);
+ sync_rcu_exp_select_cpus_queue_work(rnp);
rnp->exp_need_flush = true;
}
- /* Wait for workqueue jobs (if any) to complete. */
+ /* Wait for jobs (if any) to complete. */
rcu_for_each_leaf_node(rnp)
if (rnp->exp_need_flush)
- flush_work(&rnp->rew.rew_work);
+ sync_rcu_exp_select_cpus_flush_work(rnp);
}
/*
@@ -493,26 +594,30 @@ static void synchronize_rcu_expedited_wait(void)
struct rcu_data *rdp;
struct rcu_node *rnp;
struct rcu_node *rnp_root = rcu_get_root();
+ unsigned long flags;
trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("startwait"));
- jiffies_stall = rcu_jiffies_till_stall_check();
+ jiffies_stall = rcu_exp_jiffies_till_stall_check();
jiffies_start = jiffies;
if (tick_nohz_full_enabled() && rcu_inkernel_boot_has_ended()) {
if (synchronize_rcu_expedited_wait_once(1))
return;
rcu_for_each_leaf_node(rnp) {
- for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) {
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ mask = READ_ONCE(rnp->expmask);
+ for_each_leaf_node_cpu_mask(rnp, cpu, mask) {
rdp = per_cpu_ptr(&rcu_data, cpu);
if (rdp->rcu_forced_tick_exp)
continue;
rdp->rcu_forced_tick_exp = true;
- tick_dep_set_cpu(cpu, TICK_DEP_BIT_RCU_EXP);
+ if (cpu_online(cpu))
+ tick_dep_set_cpu(cpu, TICK_DEP_BIT_RCU_EXP);
}
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
j = READ_ONCE(jiffies_till_first_fqs);
if (synchronize_rcu_expedited_wait_once(j + HZ))
return;
- WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_RT));
}
for (;;) {
@@ -520,7 +625,6 @@ static void synchronize_rcu_expedited_wait(void)
return;
if (rcu_stall_is_suppressed())
continue;
- panic_on_rcu_stall();
trace_rcu_stall_warning(rcu_state.name, TPS("ExpeditedStall"));
pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {",
rcu_state.name);
@@ -535,10 +639,11 @@ static void synchronize_rcu_expedited_wait(void)
continue;
ndetected++;
rdp = per_cpu_ptr(&rcu_data, cpu);
- pr_cont(" %d-%c%c%c", cpu,
+ pr_cont(" %d-%c%c%c%c", cpu,
"O."[!!cpu_online(cpu)],
"o."[!!(rdp->grpmask & rnp->expmaskinit)],
- "N."[!!(rdp->grpmask & rnp->expmaskinitnext)]);
+ "N."[!!(rdp->grpmask & rnp->expmaskinitnext)],
+ "D."[!!(rdp->cpu_no_qs.b.exp)]);
}
}
pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n",
@@ -564,10 +669,14 @@ static void synchronize_rcu_expedited_wait(void)
mask = leaf_node_cpu_bit(rnp, cpu);
if (!(READ_ONCE(rnp->expmask) & mask))
continue;
+ preempt_disable(); // For smp_processor_id() in dump_cpu_task().
dump_cpu_task(cpu);
+ preempt_enable();
}
+ rcu_exp_print_detail_task_stall_rnp(rnp);
}
- jiffies_stall = 3 * rcu_jiffies_till_stall_check() + 3;
+ jiffies_stall = 3 * rcu_exp_jiffies_till_stall_check() + 3;
+ panic_on_rcu_stall();
}
}
@@ -618,17 +727,6 @@ static void rcu_exp_sel_wait_wake(unsigned long s)
rcu_exp_wait_wake(s);
}
-/*
- * Work-queue handler to drive an expedited grace period forward.
- */
-static void wait_rcu_exp_gp(struct work_struct *wp)
-{
- struct rcu_exp_work *rewp;
-
- rewp = container_of(wp, struct rcu_exp_work, rew_work);
- rcu_exp_sel_wait_wake(rewp->rew_s);
-}
-
#ifdef CONFIG_PREEMPT_RCU
/*
@@ -653,10 +751,10 @@ static void rcu_exp_handler(void *unused)
*/
if (!depth) {
if (!(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK)) ||
- rcu_dynticks_curr_cpu_in_eqs()) {
+ rcu_is_cpu_rrupt_from_idle()) {
rcu_report_exp_rdp(rdp);
} else {
- rdp->exp_deferred_qs = true;
+ WRITE_ONCE(rdp->cpu_no_qs.b.exp, true);
set_tsk_need_resched(t);
set_preempt_need_resched();
}
@@ -678,7 +776,7 @@ static void rcu_exp_handler(void *unused)
if (depth > 0) {
raw_spin_lock_irqsave_rcu_node(rnp, flags);
if (rnp->expmask & rdp->grpmask) {
- rdp->exp_deferred_qs = true;
+ WRITE_ONCE(rdp->cpu_no_qs.b.exp, true);
t->rcu_read_unlock_special.b.exp_hint = true;
}
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
@@ -705,9 +803,11 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp)
int ndetected = 0;
struct task_struct *t;
- if (!READ_ONCE(rnp->exp_tasks))
- return 0;
raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ if (!rnp->exp_tasks) {
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ return 0;
+ }
t = list_entry(rnp->exp_tasks->prev,
struct task_struct, rcu_node_entry);
list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
@@ -718,6 +818,36 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp)
return ndetected;
}
+/*
+ * Scan the current list of tasks blocked within RCU read-side critical
+ * sections, dumping the stack of each that is blocking the current
+ * expedited grace period.
+ */
+static void rcu_exp_print_detail_task_stall_rnp(struct rcu_node *rnp)
+{
+ unsigned long flags;
+ struct task_struct *t;
+
+ if (!rcu_exp_stall_task_details)
+ return;
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ if (!READ_ONCE(rnp->exp_tasks)) {
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ return;
+ }
+ t = list_entry(rnp->exp_tasks->prev,
+ struct task_struct, rcu_node_entry);
+ list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
+ /*
+ * We could be printing a lot while holding a spinlock.
+ * Avoid triggering hard lockup.
+ */
+ touch_nmi_watchdog();
+ sched_show_task(t);
+ }
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+}
+
#else /* #ifdef CONFIG_PREEMPT_RCU */
/* Request an expedited quiescent state. */
@@ -735,11 +865,13 @@ static void rcu_exp_handler(void *unused)
{
struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
struct rcu_node *rnp = rdp->mynode;
+ bool preempt_bh_enabled = !(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK));
if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) ||
__this_cpu_read(rcu_data.cpu_no_qs.b.exp))
return;
- if (rcu_is_cpu_rrupt_from_idle()) {
+ if (rcu_is_cpu_rrupt_from_idle() ||
+ (IS_ENABLED(CONFIG_PREEMPT_COUNT) && preempt_bh_enabled)) {
rcu_report_exp_rdp(this_cpu_ptr(&rcu_data));
return;
}
@@ -760,7 +892,7 @@ static void sync_sched_exp_online_cleanup(int cpu)
my_cpu = get_cpu();
/* Quiescent state either not needed or already requested, leave. */
if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) ||
- __this_cpu_read(rcu_data.cpu_no_qs.b.exp)) {
+ READ_ONCE(rdp->cpu_no_qs.b.exp)) {
put_cpu();
return;
}
@@ -788,6 +920,15 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp)
return 0;
}
+/*
+ * Because preemptible RCU does not exist, we never have to print out
+ * tasks blocked within RCU read-side critical sections that are blocking
+ * the current expedited grace period.
+ */
+static void rcu_exp_print_detail_task_stall_rnp(struct rcu_node *rnp)
+{
+}
+
#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
/**
@@ -813,6 +954,7 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp)
void synchronize_rcu_expedited(void)
{
bool boottime = (rcu_scheduler_active == RCU_SCHEDULER_INIT);
+ unsigned long flags;
struct rcu_exp_work rew;
struct rcu_node *rnp;
unsigned long s;
@@ -823,12 +965,25 @@ void synchronize_rcu_expedited(void)
"Illegal synchronize_rcu_expedited() in RCU read-side critical section");
/* Is the state is such that the call is a grace period? */
- if (rcu_blocking_is_gp())
- return;
+ if (rcu_blocking_is_gp()) {
+ // Note well that this code runs with !PREEMPT && !SMP.
+ // In addition, all code that advances grace periods runs
+ // at process level. Therefore, this expedited GP overlaps
+ // with other expedited GPs only by being fully nested within
+ // them, which allows reuse of ->gp_seq_polled_exp_snap.
+ rcu_poll_gp_seq_start_unlocked(&rcu_state.gp_seq_polled_exp_snap);
+ rcu_poll_gp_seq_end_unlocked(&rcu_state.gp_seq_polled_exp_snap);
+
+ local_irq_save(flags);
+ WARN_ON_ONCE(num_online_cpus() > 1);
+ rcu_state.expedited_sequence += (1 << RCU_SEQ_CTR_SHIFT);
+ local_irq_restore(flags);
+ return; // Context allows vacuous grace periods.
+ }
/* If expedited grace periods are prohibited, fall back to normal. */
if (rcu_gp_is_normal()) {
- wait_rcu_gp(call_rcu);
+ wait_rcu_gp(call_rcu_hurry);
return;
}
@@ -844,20 +999,155 @@ void synchronize_rcu_expedited(void)
} else {
/* Marshall arguments & schedule the expedited grace period. */
rew.rew_s = s;
- INIT_WORK_ONSTACK(&rew.rew_work, wait_rcu_exp_gp);
- queue_work(rcu_gp_wq, &rew.rew_work);
+ synchronize_rcu_expedited_queue_work(&rew);
}
/* Wait for expedited grace period to complete. */
rnp = rcu_get_root();
wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3],
sync_exp_work_done(s));
- smp_mb(); /* Workqueue actions happen before return. */
+ smp_mb(); /* Work actions happen before return. */
/* Let the next expedited grace period start. */
mutex_unlock(&rcu_state.exp_mutex);
if (likely(!boottime))
- destroy_work_on_stack(&rew.rew_work);
+ synchronize_rcu_expedited_destroy_work(&rew);
}
EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
+
+/*
+ * Ensure that start_poll_synchronize_rcu_expedited() has the expedited
+ * RCU grace periods that it needs.
+ */
+static void sync_rcu_do_polled_gp(struct work_struct *wp)
+{
+ unsigned long flags;
+ int i = 0;
+ struct rcu_node *rnp = container_of(wp, struct rcu_node, exp_poll_wq);
+ unsigned long s;
+
+ raw_spin_lock_irqsave(&rnp->exp_poll_lock, flags);
+ s = rnp->exp_seq_poll_rq;
+ rnp->exp_seq_poll_rq = RCU_GET_STATE_COMPLETED;
+ raw_spin_unlock_irqrestore(&rnp->exp_poll_lock, flags);
+ if (s == RCU_GET_STATE_COMPLETED)
+ return;
+ while (!poll_state_synchronize_rcu(s)) {
+ synchronize_rcu_expedited();
+ if (i == 10 || i == 20)
+ pr_info("%s: i = %d s = %lx gp_seq_polled = %lx\n", __func__, i, s, READ_ONCE(rcu_state.gp_seq_polled));
+ i++;
+ }
+ raw_spin_lock_irqsave(&rnp->exp_poll_lock, flags);
+ s = rnp->exp_seq_poll_rq;
+ if (poll_state_synchronize_rcu(s))
+ rnp->exp_seq_poll_rq = RCU_GET_STATE_COMPLETED;
+ raw_spin_unlock_irqrestore(&rnp->exp_poll_lock, flags);
+}
+
+/**
+ * start_poll_synchronize_rcu_expedited - Snapshot current RCU state and start expedited grace period
+ *
+ * Returns a cookie to pass to a call to cond_synchronize_rcu(),
+ * cond_synchronize_rcu_expedited(), or poll_state_synchronize_rcu(),
+ * allowing them to determine whether or not any sort of grace period has
+ * elapsed in the meantime. If the needed expedited grace period is not
+ * already slated to start, initiates that grace period.
+ */
+unsigned long start_poll_synchronize_rcu_expedited(void)
+{
+ unsigned long flags;
+ struct rcu_data *rdp;
+ struct rcu_node *rnp;
+ unsigned long s;
+
+ s = get_state_synchronize_rcu();
+ rdp = per_cpu_ptr(&rcu_data, raw_smp_processor_id());
+ rnp = rdp->mynode;
+ if (rcu_init_invoked())
+ raw_spin_lock_irqsave(&rnp->exp_poll_lock, flags);
+ if (!poll_state_synchronize_rcu(s)) {
+ if (rcu_init_invoked()) {
+ rnp->exp_seq_poll_rq = s;
+ queue_work(rcu_gp_wq, &rnp->exp_poll_wq);
+ }
+ }
+ if (rcu_init_invoked())
+ raw_spin_unlock_irqrestore(&rnp->exp_poll_lock, flags);
+
+ return s;
+}
+EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu_expedited);
+
+/**
+ * start_poll_synchronize_rcu_expedited_full - Take a full snapshot and start expedited grace period
+ * @rgosp: Place to put snapshot of grace-period state
+ *
+ * Places the normal and expedited grace-period states in rgosp. This
+ * state value can be passed to a later call to cond_synchronize_rcu_full()
+ * or poll_state_synchronize_rcu_full() to determine whether or not a
+ * grace period (whether normal or expedited) has elapsed in the meantime.
+ * If the needed expedited grace period is not already slated to start,
+ * initiates that grace period.
+ */
+void start_poll_synchronize_rcu_expedited_full(struct rcu_gp_oldstate *rgosp)
+{
+ get_state_synchronize_rcu_full(rgosp);
+ (void)start_poll_synchronize_rcu_expedited();
+}
+EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu_expedited_full);
+
+/**
+ * cond_synchronize_rcu_expedited - Conditionally wait for an expedited RCU grace period
+ *
+ * @oldstate: value from get_state_synchronize_rcu(), start_poll_synchronize_rcu(), or start_poll_synchronize_rcu_expedited()
+ *
+ * If any type of full RCU grace period has elapsed since the earlier
+ * call to get_state_synchronize_rcu(), start_poll_synchronize_rcu(),
+ * or start_poll_synchronize_rcu_expedited(), just return. Otherwise,
+ * invoke synchronize_rcu_expedited() to wait for a full grace period.
+ *
+ * Yes, this function does not take counter wrap into account.
+ * But counter wrap is harmless. If the counter wraps, we have waited for
+ * more than 2 billion grace periods (and way more on a 64-bit system!),
+ * so waiting for a couple of additional grace periods should be just fine.
+ *
+ * This function provides the same memory-ordering guarantees that
+ * would be provided by a synchronize_rcu() that was invoked at the call
+ * to the function that provided @oldstate and that returned at the end
+ * of this function.
+ */
+void cond_synchronize_rcu_expedited(unsigned long oldstate)
+{
+ if (!poll_state_synchronize_rcu(oldstate))
+ synchronize_rcu_expedited();
+}
+EXPORT_SYMBOL_GPL(cond_synchronize_rcu_expedited);
+
+/**
+ * cond_synchronize_rcu_expedited_full - Conditionally wait for an expedited RCU grace period
+ * @rgosp: value from get_state_synchronize_rcu_full(), start_poll_synchronize_rcu_full(), or start_poll_synchronize_rcu_expedited_full()
+ *
+ * If a full RCU grace period has elapsed since the call to
+ * get_state_synchronize_rcu_full(), start_poll_synchronize_rcu_full(),
+ * or start_poll_synchronize_rcu_expedited_full() from which @rgosp was
+ * obtained, just return. Otherwise, invoke synchronize_rcu_expedited()
+ * to wait for a full grace period.
+ *
+ * Yes, this function does not take counter wrap into account.
+ * But counter wrap is harmless. If the counter wraps, we have waited for
+ * more than 2 billion grace periods (and way more on a 64-bit system!),
+ * so waiting for a couple of additional grace periods should be just fine.
+ *
+ * This function provides the same memory-ordering guarantees that
+ * would be provided by a synchronize_rcu() that was invoked at the call
+ * to the function that provided @rgosp and that returned at the end of
+ * this function.
+ */
+void cond_synchronize_rcu_expedited_full(struct rcu_gp_oldstate *rgosp)
+{
+ if (!poll_state_synchronize_rcu_full(rgosp))
+ synchronize_rcu_expedited();
+}
+EXPORT_SYMBOL_GPL(cond_synchronize_rcu_expedited_full);
diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
new file mode 100644
index 000000000000..f2280616f9d5
--- /dev/null
+++ b/kernel/rcu/tree_nocb.h
@@ -0,0 +1,1762 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * Read-Copy Update mechanism for mutual exclusion (tree-based version)
+ * Internal non-public definitions that provide either classic
+ * or preemptible semantics.
+ *
+ * Copyright Red Hat, 2009
+ * Copyright IBM Corporation, 2009
+ * Copyright SUSE, 2021
+ *
+ * Author: Ingo Molnar <mingo@elte.hu>
+ * Paul E. McKenney <paulmck@linux.ibm.com>
+ * Frederic Weisbecker <frederic@kernel.org>
+ */
+
+#ifdef CONFIG_RCU_NOCB_CPU
+static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
+static bool __read_mostly rcu_nocb_poll; /* Offload kthread are to poll. */
+static inline int rcu_lockdep_is_held_nocb(struct rcu_data *rdp)
+{
+ return lockdep_is_held(&rdp->nocb_lock);
+}
+
+static inline bool rcu_current_is_nocb_kthread(struct rcu_data *rdp)
+{
+ /* Race on early boot between thread creation and assignment */
+ if (!rdp->nocb_cb_kthread || !rdp->nocb_gp_kthread)
+ return true;
+
+ if (current == rdp->nocb_cb_kthread || current == rdp->nocb_gp_kthread)
+ if (in_task())
+ return true;
+ return false;
+}
+
+/*
+ * Offload callback processing from the boot-time-specified set of CPUs
+ * specified by rcu_nocb_mask. For the CPUs in the set, there are kthreads
+ * created that pull the callbacks from the corresponding CPU, wait for
+ * a grace period to elapse, and invoke the callbacks. These kthreads
+ * are organized into GP kthreads, which manage incoming callbacks, wait for
+ * grace periods, and awaken CB kthreads, and the CB kthreads, which only
+ * invoke callbacks. Each GP kthread invokes its own CBs. The no-CBs CPUs
+ * do a wake_up() on their GP kthread when they insert a callback into any
+ * empty list, unless the rcu_nocb_poll boot parameter has been specified,
+ * in which case each kthread actively polls its CPU. (Which isn't so great
+ * for energy efficiency, but which does reduce RCU's overhead on that CPU.)
+ *
+ * This is intended to be used in conjunction with Frederic Weisbecker's
+ * adaptive-idle work, which would seriously reduce OS jitter on CPUs
+ * running CPU-bound user-mode computations.
+ *
+ * Offloading of callbacks can also be used as an energy-efficiency
+ * measure because CPUs with no RCU callbacks queued are more aggressive
+ * about entering dyntick-idle mode.
+ */
+
+
+/*
+ * Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters.
+ * If the list is invalid, a warning is emitted and all CPUs are offloaded.
+ */
+static int __init rcu_nocb_setup(char *str)
+{
+ alloc_bootmem_cpumask_var(&rcu_nocb_mask);
+ if (*str == '=') {
+ if (cpulist_parse(++str, rcu_nocb_mask)) {
+ pr_warn("rcu_nocbs= bad CPU range, all CPUs set\n");
+ cpumask_setall(rcu_nocb_mask);
+ }
+ }
+ rcu_state.nocb_is_setup = true;
+ return 1;
+}
+__setup("rcu_nocbs", rcu_nocb_setup);
+
+static int __init parse_rcu_nocb_poll(char *arg)
+{
+ rcu_nocb_poll = true;
+ return 0;
+}
+early_param("rcu_nocb_poll", parse_rcu_nocb_poll);
+
+/*
+ * Don't bother bypassing ->cblist if the call_rcu() rate is low.
+ * After all, the main point of bypassing is to avoid lock contention
+ * on ->nocb_lock, which only can happen at high call_rcu() rates.
+ */
+static int nocb_nobypass_lim_per_jiffy = 16 * 1000 / HZ;
+module_param(nocb_nobypass_lim_per_jiffy, int, 0);
+
+/*
+ * Acquire the specified rcu_data structure's ->nocb_bypass_lock. If the
+ * lock isn't immediately available, increment ->nocb_lock_contended to
+ * flag the contention.
+ */
+static void rcu_nocb_bypass_lock(struct rcu_data *rdp)
+ __acquires(&rdp->nocb_bypass_lock)
+{
+ lockdep_assert_irqs_disabled();
+ if (raw_spin_trylock(&rdp->nocb_bypass_lock))
+ return;
+ atomic_inc(&rdp->nocb_lock_contended);
+ WARN_ON_ONCE(smp_processor_id() != rdp->cpu);
+ smp_mb__after_atomic(); /* atomic_inc() before lock. */
+ raw_spin_lock(&rdp->nocb_bypass_lock);
+ smp_mb__before_atomic(); /* atomic_dec() after lock. */
+ atomic_dec(&rdp->nocb_lock_contended);
+}
+
+/*
+ * Spinwait until the specified rcu_data structure's ->nocb_lock is
+ * not contended. Please note that this is extremely special-purpose,
+ * relying on the fact that at most two kthreads and one CPU contend for
+ * this lock, and also that the two kthreads are guaranteed to have frequent
+ * grace-period-duration time intervals between successive acquisitions
+ * of the lock. This allows us to use an extremely simple throttling
+ * mechanism, and further to apply it only to the CPU doing floods of
+ * call_rcu() invocations. Don't try this at home!
+ */
+static void rcu_nocb_wait_contended(struct rcu_data *rdp)
+{
+ WARN_ON_ONCE(smp_processor_id() != rdp->cpu);
+ while (WARN_ON_ONCE(atomic_read(&rdp->nocb_lock_contended)))
+ cpu_relax();
+}
+
+/*
+ * Conditionally acquire the specified rcu_data structure's
+ * ->nocb_bypass_lock.
+ */
+static bool rcu_nocb_bypass_trylock(struct rcu_data *rdp)
+{
+ lockdep_assert_irqs_disabled();
+ return raw_spin_trylock(&rdp->nocb_bypass_lock);
+}
+
+/*
+ * Release the specified rcu_data structure's ->nocb_bypass_lock.
+ */
+static void rcu_nocb_bypass_unlock(struct rcu_data *rdp)
+ __releases(&rdp->nocb_bypass_lock)
+{
+ lockdep_assert_irqs_disabled();
+ raw_spin_unlock(&rdp->nocb_bypass_lock);
+}
+
+/*
+ * Acquire the specified rcu_data structure's ->nocb_lock, but only
+ * if it corresponds to a no-CBs CPU.
+ */
+static void rcu_nocb_lock(struct rcu_data *rdp)
+{
+ lockdep_assert_irqs_disabled();
+ if (!rcu_rdp_is_offloaded(rdp))
+ return;
+ raw_spin_lock(&rdp->nocb_lock);
+}
+
+/*
+ * Release the specified rcu_data structure's ->nocb_lock, but only
+ * if it corresponds to a no-CBs CPU.
+ */
+static void rcu_nocb_unlock(struct rcu_data *rdp)
+{
+ if (rcu_rdp_is_offloaded(rdp)) {
+ lockdep_assert_irqs_disabled();
+ raw_spin_unlock(&rdp->nocb_lock);
+ }
+}
+
+/*
+ * Release the specified rcu_data structure's ->nocb_lock and restore
+ * interrupts, but only if it corresponds to a no-CBs CPU.
+ */
+static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp,
+ unsigned long flags)
+{
+ if (rcu_rdp_is_offloaded(rdp)) {
+ lockdep_assert_irqs_disabled();
+ raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
+ } else {
+ local_irq_restore(flags);
+ }
+}
+
+/* Lockdep check that ->cblist may be safely accessed. */
+static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp)
+{
+ lockdep_assert_irqs_disabled();
+ if (rcu_rdp_is_offloaded(rdp))
+ lockdep_assert_held(&rdp->nocb_lock);
+}
+
+/*
+ * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended
+ * grace period.
+ */
+static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq)
+{
+ swake_up_all(sq);
+}
+
+static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp)
+{
+ return &rnp->nocb_gp_wq[rcu_seq_ctr(rnp->gp_seq) & 0x1];
+}
+
+static void rcu_init_one_nocb(struct rcu_node *rnp)
+{
+ init_swait_queue_head(&rnp->nocb_gp_wq[0]);
+ init_swait_queue_head(&rnp->nocb_gp_wq[1]);
+}
+
+static bool __wake_nocb_gp(struct rcu_data *rdp_gp,
+ struct rcu_data *rdp,
+ bool force, unsigned long flags)
+ __releases(rdp_gp->nocb_gp_lock)
+{
+ bool needwake = false;
+
+ if (!READ_ONCE(rdp_gp->nocb_gp_kthread)) {
+ raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+ TPS("AlreadyAwake"));
+ return false;
+ }
+
+ if (rdp_gp->nocb_defer_wakeup > RCU_NOCB_WAKE_NOT) {
+ WRITE_ONCE(rdp_gp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
+ del_timer(&rdp_gp->nocb_timer);
+ }
+
+ if (force || READ_ONCE(rdp_gp->nocb_gp_sleep)) {
+ WRITE_ONCE(rdp_gp->nocb_gp_sleep, false);
+ needwake = true;
+ }
+ raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
+ if (needwake) {
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DoWake"));
+ wake_up_process(rdp_gp->nocb_gp_kthread);
+ }
+
+ return needwake;
+}
+
+/*
+ * Kick the GP kthread for this NOCB group.
+ */
+static bool wake_nocb_gp(struct rcu_data *rdp, bool force)
+{
+ unsigned long flags;
+ struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
+
+ raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
+ return __wake_nocb_gp(rdp_gp, rdp, force, flags);
+}
+
+/*
+ * LAZY_FLUSH_JIFFIES decides the maximum amount of time that
+ * can elapse before lazy callbacks are flushed. Lazy callbacks
+ * could be flushed much earlier for a number of other reasons
+ * however, LAZY_FLUSH_JIFFIES will ensure no lazy callbacks are
+ * left unsubmitted to RCU after those many jiffies.
+ */
+#define LAZY_FLUSH_JIFFIES (10 * HZ)
+static unsigned long jiffies_till_flush = LAZY_FLUSH_JIFFIES;
+
+#ifdef CONFIG_RCU_LAZY
+// To be called only from test code.
+void rcu_lazy_set_jiffies_till_flush(unsigned long jif)
+{
+ jiffies_till_flush = jif;
+}
+EXPORT_SYMBOL(rcu_lazy_set_jiffies_till_flush);
+
+unsigned long rcu_lazy_get_jiffies_till_flush(void)
+{
+ return jiffies_till_flush;
+}
+EXPORT_SYMBOL(rcu_lazy_get_jiffies_till_flush);
+#endif
+
+/*
+ * Arrange to wake the GP kthread for this NOCB group at some future
+ * time when it is safe to do so.
+ */
+static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype,
+ const char *reason)
+{
+ unsigned long flags;
+ struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
+
+ raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
+
+ /*
+ * Bypass wakeup overrides previous deferments. In case of
+ * callback storms, no need to wake up too early.
+ */
+ if (waketype == RCU_NOCB_WAKE_LAZY &&
+ rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT) {
+ mod_timer(&rdp_gp->nocb_timer, jiffies + jiffies_till_flush);
+ WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype);
+ } else if (waketype == RCU_NOCB_WAKE_BYPASS) {
+ mod_timer(&rdp_gp->nocb_timer, jiffies + 2);
+ WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype);
+ } else {
+ if (rdp_gp->nocb_defer_wakeup < RCU_NOCB_WAKE)
+ mod_timer(&rdp_gp->nocb_timer, jiffies + 1);
+ if (rdp_gp->nocb_defer_wakeup < waketype)
+ WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype);
+ }
+
+ raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
+
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, reason);
+}
+
+/*
+ * Flush the ->nocb_bypass queue into ->cblist, enqueuing rhp if non-NULL.
+ * However, if there is a callback to be enqueued and if ->nocb_bypass
+ * proves to be initially empty, just return false because the no-CB GP
+ * kthread may need to be awakened in this case.
+ *
+ * Return true if there was something to be flushed and it succeeded, otherwise
+ * false.
+ *
+ * Note that this function always returns true if rhp is NULL.
+ */
+static bool rcu_nocb_do_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp_in,
+ unsigned long j, bool lazy)
+{
+ struct rcu_cblist rcl;
+ struct rcu_head *rhp = rhp_in;
+
+ WARN_ON_ONCE(!rcu_rdp_is_offloaded(rdp));
+ rcu_lockdep_assert_cblist_protected(rdp);
+ lockdep_assert_held(&rdp->nocb_bypass_lock);
+ if (rhp && !rcu_cblist_n_cbs(&rdp->nocb_bypass)) {
+ raw_spin_unlock(&rdp->nocb_bypass_lock);
+ return false;
+ }
+ /* Note: ->cblist.len already accounts for ->nocb_bypass contents. */
+ if (rhp)
+ rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */
+
+ /*
+ * If the new CB requested was a lazy one, queue it onto the main
+ * ->cblist so that we can take advantage of the grace-period that will
+ * happen regardless. But queue it onto the bypass list first so that
+ * the lazy CB is ordered with the existing CBs in the bypass list.
+ */
+ if (lazy && rhp) {
+ rcu_cblist_enqueue(&rdp->nocb_bypass, rhp);
+ rhp = NULL;
+ }
+ rcu_cblist_flush_enqueue(&rcl, &rdp->nocb_bypass, rhp);
+ WRITE_ONCE(rdp->lazy_len, 0);
+
+ rcu_segcblist_insert_pend_cbs(&rdp->cblist, &rcl);
+ WRITE_ONCE(rdp->nocb_bypass_first, j);
+ rcu_nocb_bypass_unlock(rdp);
+ return true;
+}
+
+/*
+ * Flush the ->nocb_bypass queue into ->cblist, enqueuing rhp if non-NULL.
+ * However, if there is a callback to be enqueued and if ->nocb_bypass
+ * proves to be initially empty, just return false because the no-CB GP
+ * kthread may need to be awakened in this case.
+ *
+ * Note that this function always returns true if rhp is NULL.
+ */
+static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
+ unsigned long j, bool lazy)
+{
+ if (!rcu_rdp_is_offloaded(rdp))
+ return true;
+ rcu_lockdep_assert_cblist_protected(rdp);
+ rcu_nocb_bypass_lock(rdp);
+ return rcu_nocb_do_flush_bypass(rdp, rhp, j, lazy);
+}
+
+/*
+ * If the ->nocb_bypass_lock is immediately available, flush the
+ * ->nocb_bypass queue into ->cblist.
+ */
+static void rcu_nocb_try_flush_bypass(struct rcu_data *rdp, unsigned long j)
+{
+ rcu_lockdep_assert_cblist_protected(rdp);
+ if (!rcu_rdp_is_offloaded(rdp) ||
+ !rcu_nocb_bypass_trylock(rdp))
+ return;
+ WARN_ON_ONCE(!rcu_nocb_do_flush_bypass(rdp, NULL, j, false));
+}
+
+/*
+ * See whether it is appropriate to use the ->nocb_bypass list in order
+ * to control contention on ->nocb_lock. A limited number of direct
+ * enqueues are permitted into ->cblist per jiffy. If ->nocb_bypass
+ * is non-empty, further callbacks must be placed into ->nocb_bypass,
+ * otherwise rcu_barrier() breaks. Use rcu_nocb_flush_bypass() to switch
+ * back to direct use of ->cblist. However, ->nocb_bypass should not be
+ * used if ->cblist is empty, because otherwise callbacks can be stranded
+ * on ->nocb_bypass because we cannot count on the current CPU ever again
+ * invoking call_rcu(). The general rule is that if ->nocb_bypass is
+ * non-empty, the corresponding no-CBs grace-period kthread must not be
+ * in an indefinite sleep state.
+ *
+ * Finally, it is not permitted to use the bypass during early boot,
+ * as doing so would confuse the auto-initialization code. Besides
+ * which, there is no point in worrying about lock contention while
+ * there is only one CPU in operation.
+ */
+static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
+ bool *was_alldone, unsigned long flags,
+ bool lazy)
+{
+ unsigned long c;
+ unsigned long cur_gp_seq;
+ unsigned long j = jiffies;
+ long ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
+ bool bypass_is_lazy = (ncbs == READ_ONCE(rdp->lazy_len));
+
+ lockdep_assert_irqs_disabled();
+
+ // Pure softirq/rcuc based processing: no bypassing, no
+ // locking.
+ if (!rcu_rdp_is_offloaded(rdp)) {
+ *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
+ return false;
+ }
+
+ // In the process of (de-)offloading: no bypassing, but
+ // locking.
+ if (!rcu_segcblist_completely_offloaded(&rdp->cblist)) {
+ rcu_nocb_lock(rdp);
+ *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
+ return false; /* Not offloaded, no bypassing. */
+ }
+
+ // Don't use ->nocb_bypass during early boot.
+ if (rcu_scheduler_active != RCU_SCHEDULER_RUNNING) {
+ rcu_nocb_lock(rdp);
+ WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
+ *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
+ return false;
+ }
+
+ // If we have advanced to a new jiffy, reset counts to allow
+ // moving back from ->nocb_bypass to ->cblist.
+ if (j == rdp->nocb_nobypass_last) {
+ c = rdp->nocb_nobypass_count + 1;
+ } else {
+ WRITE_ONCE(rdp->nocb_nobypass_last, j);
+ c = rdp->nocb_nobypass_count - nocb_nobypass_lim_per_jiffy;
+ if (ULONG_CMP_LT(rdp->nocb_nobypass_count,
+ nocb_nobypass_lim_per_jiffy))
+ c = 0;
+ else if (c > nocb_nobypass_lim_per_jiffy)
+ c = nocb_nobypass_lim_per_jiffy;
+ }
+ WRITE_ONCE(rdp->nocb_nobypass_count, c);
+
+ // If there hasn't yet been all that many ->cblist enqueues
+ // this jiffy, tell the caller to enqueue onto ->cblist. But flush
+ // ->nocb_bypass first.
+ // Lazy CBs throttle this back and do immediate bypass queuing.
+ if (rdp->nocb_nobypass_count < nocb_nobypass_lim_per_jiffy && !lazy) {
+ rcu_nocb_lock(rdp);
+ *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
+ if (*was_alldone)
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+ TPS("FirstQ"));
+
+ WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, j, false));
+ WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
+ return false; // Caller must enqueue the callback.
+ }
+
+ // If ->nocb_bypass has been used too long or is too full,
+ // flush ->nocb_bypass to ->cblist.
+ if ((ncbs && !bypass_is_lazy && j != READ_ONCE(rdp->nocb_bypass_first)) ||
+ (ncbs && bypass_is_lazy &&
+ (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + jiffies_till_flush))) ||
+ ncbs >= qhimark) {
+ rcu_nocb_lock(rdp);
+ *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
+
+ if (!rcu_nocb_flush_bypass(rdp, rhp, j, lazy)) {
+ if (*was_alldone)
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+ TPS("FirstQ"));
+ WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
+ return false; // Caller must enqueue the callback.
+ }
+ if (j != rdp->nocb_gp_adv_time &&
+ rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
+ rcu_seq_done(&rdp->mynode->gp_seq, cur_gp_seq)) {
+ rcu_advance_cbs_nowake(rdp->mynode, rdp);
+ rdp->nocb_gp_adv_time = j;
+ }
+
+ // The flush succeeded and we moved CBs into the regular list.
+ // Don't wait for the wake up timer as it may be too far ahead.
+ // Wake up the GP thread now instead, if the cblist was empty.
+ __call_rcu_nocb_wake(rdp, *was_alldone, flags);
+
+ return true; // Callback already enqueued.
+ }
+
+ // We need to use the bypass.
+ rcu_nocb_wait_contended(rdp);
+ rcu_nocb_bypass_lock(rdp);
+ ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
+ rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */
+ rcu_cblist_enqueue(&rdp->nocb_bypass, rhp);
+
+ if (lazy)
+ WRITE_ONCE(rdp->lazy_len, rdp->lazy_len + 1);
+
+ if (!ncbs) {
+ WRITE_ONCE(rdp->nocb_bypass_first, j);
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("FirstBQ"));
+ }
+ rcu_nocb_bypass_unlock(rdp);
+ smp_mb(); /* Order enqueue before wake. */
+ // A wake up of the grace period kthread or timer adjustment
+ // needs to be done only if:
+ // 1. Bypass list was fully empty before (this is the first
+ // bypass list entry), or:
+ // 2. Both of these conditions are met:
+ // a. The bypass list previously had only lazy CBs, and:
+ // b. The new CB is non-lazy.
+ if (ncbs && (!bypass_is_lazy || lazy)) {
+ local_irq_restore(flags);
+ } else {
+ // No-CBs GP kthread might be indefinitely asleep, if so, wake.
+ rcu_nocb_lock(rdp); // Rare during call_rcu() flood.
+ if (!rcu_segcblist_pend_cbs(&rdp->cblist)) {
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+ TPS("FirstBQwake"));
+ __call_rcu_nocb_wake(rdp, true, flags);
+ } else {
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+ TPS("FirstBQnoWake"));
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ }
+ }
+ return true; // Callback already enqueued.
+}
+
+/*
+ * Awaken the no-CBs grace-period kthread if needed, either due to it
+ * legitimately being asleep or due to overload conditions.
+ *
+ * If warranted, also wake up the kthread servicing this CPUs queues.
+ */
+static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone,
+ unsigned long flags)
+ __releases(rdp->nocb_lock)
+{
+ long bypass_len;
+ unsigned long cur_gp_seq;
+ unsigned long j;
+ long lazy_len;
+ long len;
+ struct task_struct *t;
+
+ // If we are being polled or there is no kthread, just leave.
+ t = READ_ONCE(rdp->nocb_gp_kthread);
+ if (rcu_nocb_poll || !t) {
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+ TPS("WakeNotPoll"));
+ return;
+ }
+ // Need to actually to a wakeup.
+ len = rcu_segcblist_n_cbs(&rdp->cblist);
+ bypass_len = rcu_cblist_n_cbs(&rdp->nocb_bypass);
+ lazy_len = READ_ONCE(rdp->lazy_len);
+ if (was_alldone) {
+ rdp->qlen_last_fqs_check = len;
+ // Only lazy CBs in bypass list
+ if (lazy_len && bypass_len == lazy_len) {
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_LAZY,
+ TPS("WakeLazy"));
+ } else if (!irqs_disabled_flags(flags)) {
+ /* ... if queue was empty ... */
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ wake_nocb_gp(rdp, false);
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+ TPS("WakeEmpty"));
+ } else {
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE,
+ TPS("WakeEmptyIsDeferred"));
+ }
+ } else if (len > rdp->qlen_last_fqs_check + qhimark) {
+ /* ... or if many callbacks queued. */
+ rdp->qlen_last_fqs_check = len;
+ j = jiffies;
+ if (j != rdp->nocb_gp_adv_time &&
+ rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
+ rcu_seq_done(&rdp->mynode->gp_seq, cur_gp_seq)) {
+ rcu_advance_cbs_nowake(rdp->mynode, rdp);
+ rdp->nocb_gp_adv_time = j;
+ }
+ smp_mb(); /* Enqueue before timer_pending(). */
+ if ((rdp->nocb_cb_sleep ||
+ !rcu_segcblist_ready_cbs(&rdp->cblist)) &&
+ !timer_pending(&rdp->nocb_timer)) {
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_FORCE,
+ TPS("WakeOvfIsDeferred"));
+ } else {
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot"));
+ }
+ } else {
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot"));
+ }
+}
+
+static int nocb_gp_toggle_rdp(struct rcu_data *rdp,
+ bool *wake_state)
+{
+ struct rcu_segcblist *cblist = &rdp->cblist;
+ unsigned long flags;
+ int ret;
+
+ rcu_nocb_lock_irqsave(rdp, flags);
+ if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED) &&
+ !rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) {
+ /*
+ * Offloading. Set our flag and notify the offload worker.
+ * We will handle this rdp until it ever gets de-offloaded.
+ */
+ rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_GP);
+ if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB))
+ *wake_state = true;
+ ret = 1;
+ } else if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED) &&
+ rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) {
+ /*
+ * De-offloading. Clear our flag and notify the de-offload worker.
+ * We will ignore this rdp until it ever gets re-offloaded.
+ */
+ rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_GP);
+ if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB))
+ *wake_state = true;
+ ret = 0;
+ } else {
+ WARN_ON_ONCE(1);
+ ret = -1;
+ }
+
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+
+ return ret;
+}
+
+static void nocb_gp_sleep(struct rcu_data *my_rdp, int cpu)
+{
+ trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Sleep"));
+ swait_event_interruptible_exclusive(my_rdp->nocb_gp_wq,
+ !READ_ONCE(my_rdp->nocb_gp_sleep));
+ trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("EndSleep"));
+}
+
+/*
+ * No-CBs GP kthreads come here to wait for additional callbacks to show up
+ * or for grace periods to end.
+ */
+static void nocb_gp_wait(struct rcu_data *my_rdp)
+{
+ bool bypass = false;
+ int __maybe_unused cpu = my_rdp->cpu;
+ unsigned long cur_gp_seq;
+ unsigned long flags;
+ bool gotcbs = false;
+ unsigned long j = jiffies;
+ bool lazy = false;
+ bool needwait_gp = false; // This prevents actual uninitialized use.
+ bool needwake;
+ bool needwake_gp;
+ struct rcu_data *rdp, *rdp_toggling = NULL;
+ struct rcu_node *rnp;
+ unsigned long wait_gp_seq = 0; // Suppress "use uninitialized" warning.
+ bool wasempty = false;
+
+ /*
+ * Each pass through the following loop checks for CBs and for the
+ * nearest grace period (if any) to wait for next. The CB kthreads
+ * and the global grace-period kthread are awakened if needed.
+ */
+ WARN_ON_ONCE(my_rdp->nocb_gp_rdp != my_rdp);
+ /*
+ * An rcu_data structure is removed from the list after its
+ * CPU is de-offloaded and added to the list before that CPU is
+ * (re-)offloaded. If the following loop happens to be referencing
+ * that rcu_data structure during the time that the corresponding
+ * CPU is de-offloaded and then immediately re-offloaded, this
+ * loop's rdp pointer will be carried to the end of the list by
+ * the resulting pair of list operations. This can cause the loop
+ * to skip over some of the rcu_data structures that were supposed
+ * to have been scanned. Fortunately a new iteration through the
+ * entire loop is forced after a given CPU's rcu_data structure
+ * is added to the list, so the skipped-over rcu_data structures
+ * won't be ignored for long.
+ */
+ list_for_each_entry(rdp, &my_rdp->nocb_head_rdp, nocb_entry_rdp) {
+ long bypass_ncbs;
+ bool flush_bypass = false;
+ long lazy_ncbs;
+
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Check"));
+ rcu_nocb_lock_irqsave(rdp, flags);
+ lockdep_assert_held(&rdp->nocb_lock);
+ bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
+ lazy_ncbs = READ_ONCE(rdp->lazy_len);
+
+ if (bypass_ncbs && (lazy_ncbs == bypass_ncbs) &&
+ (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + jiffies_till_flush) ||
+ bypass_ncbs > 2 * qhimark)) {
+ flush_bypass = true;
+ } else if (bypass_ncbs && (lazy_ncbs != bypass_ncbs) &&
+ (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + 1) ||
+ bypass_ncbs > 2 * qhimark)) {
+ flush_bypass = true;
+ } else if (!bypass_ncbs && rcu_segcblist_empty(&rdp->cblist)) {
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ continue; /* No callbacks here, try next. */
+ }
+
+ if (flush_bypass) {
+ // Bypass full or old, so flush it.
+ (void)rcu_nocb_try_flush_bypass(rdp, j);
+ bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
+ lazy_ncbs = READ_ONCE(rdp->lazy_len);
+ }
+
+ if (bypass_ncbs) {
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+ bypass_ncbs == lazy_ncbs ? TPS("Lazy") : TPS("Bypass"));
+ if (bypass_ncbs == lazy_ncbs)
+ lazy = true;
+ else
+ bypass = true;
+ }
+ rnp = rdp->mynode;
+
+ // Advance callbacks if helpful and low contention.
+ needwake_gp = false;
+ if (!rcu_segcblist_restempty(&rdp->cblist,
+ RCU_NEXT_READY_TAIL) ||
+ (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
+ rcu_seq_done(&rnp->gp_seq, cur_gp_seq))) {
+ raw_spin_lock_rcu_node(rnp); /* irqs disabled. */
+ needwake_gp = rcu_advance_cbs(rnp, rdp);
+ wasempty = rcu_segcblist_restempty(&rdp->cblist,
+ RCU_NEXT_READY_TAIL);
+ raw_spin_unlock_rcu_node(rnp); /* irqs disabled. */
+ }
+ // Need to wait on some grace period?
+ WARN_ON_ONCE(wasempty &&
+ !rcu_segcblist_restempty(&rdp->cblist,
+ RCU_NEXT_READY_TAIL));
+ if (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq)) {
+ if (!needwait_gp ||
+ ULONG_CMP_LT(cur_gp_seq, wait_gp_seq))
+ wait_gp_seq = cur_gp_seq;
+ needwait_gp = true;
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+ TPS("NeedWaitGP"));
+ }
+ if (rcu_segcblist_ready_cbs(&rdp->cblist)) {
+ needwake = rdp->nocb_cb_sleep;
+ WRITE_ONCE(rdp->nocb_cb_sleep, false);
+ smp_mb(); /* CB invocation -after- GP end. */
+ } else {
+ needwake = false;
+ }
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ if (needwake) {
+ swake_up_one(&rdp->nocb_cb_wq);
+ gotcbs = true;
+ }
+ if (needwake_gp)
+ rcu_gp_kthread_wake();
+ }
+
+ my_rdp->nocb_gp_bypass = bypass;
+ my_rdp->nocb_gp_gp = needwait_gp;
+ my_rdp->nocb_gp_seq = needwait_gp ? wait_gp_seq : 0;
+
+ // At least one child with non-empty ->nocb_bypass, so set
+ // timer in order to avoid stranding its callbacks.
+ if (!rcu_nocb_poll) {
+ // If bypass list only has lazy CBs. Add a deferred lazy wake up.
+ if (lazy && !bypass) {
+ wake_nocb_gp_defer(my_rdp, RCU_NOCB_WAKE_LAZY,
+ TPS("WakeLazyIsDeferred"));
+ // Otherwise add a deferred bypass wake up.
+ } else if (bypass) {
+ wake_nocb_gp_defer(my_rdp, RCU_NOCB_WAKE_BYPASS,
+ TPS("WakeBypassIsDeferred"));
+ }
+ }
+
+ if (rcu_nocb_poll) {
+ /* Polling, so trace if first poll in the series. */
+ if (gotcbs)
+ trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Poll"));
+ if (list_empty(&my_rdp->nocb_head_rdp)) {
+ raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags);
+ if (!my_rdp->nocb_toggling_rdp)
+ WRITE_ONCE(my_rdp->nocb_gp_sleep, true);
+ raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags);
+ /* Wait for any offloading rdp */
+ nocb_gp_sleep(my_rdp, cpu);
+ } else {
+ schedule_timeout_idle(1);
+ }
+ } else if (!needwait_gp) {
+ /* Wait for callbacks to appear. */
+ nocb_gp_sleep(my_rdp, cpu);
+ } else {
+ rnp = my_rdp->mynode;
+ trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq, TPS("StartWait"));
+ swait_event_interruptible_exclusive(
+ rnp->nocb_gp_wq[rcu_seq_ctr(wait_gp_seq) & 0x1],
+ rcu_seq_done(&rnp->gp_seq, wait_gp_seq) ||
+ !READ_ONCE(my_rdp->nocb_gp_sleep));
+ trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq, TPS("EndWait"));
+ }
+
+ if (!rcu_nocb_poll) {
+ raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags);
+ // (De-)queue an rdp to/from the group if its nocb state is changing
+ rdp_toggling = my_rdp->nocb_toggling_rdp;
+ if (rdp_toggling)
+ my_rdp->nocb_toggling_rdp = NULL;
+
+ if (my_rdp->nocb_defer_wakeup > RCU_NOCB_WAKE_NOT) {
+ WRITE_ONCE(my_rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
+ del_timer(&my_rdp->nocb_timer);
+ }
+ WRITE_ONCE(my_rdp->nocb_gp_sleep, true);
+ raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags);
+ } else {
+ rdp_toggling = READ_ONCE(my_rdp->nocb_toggling_rdp);
+ if (rdp_toggling) {
+ /*
+ * Paranoid locking to make sure nocb_toggling_rdp is well
+ * reset *before* we (re)set SEGCBLIST_KTHREAD_GP or we could
+ * race with another round of nocb toggling for this rdp.
+ * Nocb locking should prevent from that already but we stick
+ * to paranoia, especially in rare path.
+ */
+ raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags);
+ my_rdp->nocb_toggling_rdp = NULL;
+ raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags);
+ }
+ }
+
+ if (rdp_toggling) {
+ bool wake_state = false;
+ int ret;
+
+ ret = nocb_gp_toggle_rdp(rdp_toggling, &wake_state);
+ if (ret == 1)
+ list_add_tail(&rdp_toggling->nocb_entry_rdp, &my_rdp->nocb_head_rdp);
+ else if (ret == 0)
+ list_del(&rdp_toggling->nocb_entry_rdp);
+ if (wake_state)
+ swake_up_one(&rdp_toggling->nocb_state_wq);
+ }
+
+ my_rdp->nocb_gp_seq = -1;
+ WARN_ON(signal_pending(current));
+}
+
+/*
+ * No-CBs grace-period-wait kthread. There is one of these per group
+ * of CPUs, but only once at least one CPU in that group has come online
+ * at least once since boot. This kthread checks for newly posted
+ * callbacks from any of the CPUs it is responsible for, waits for a
+ * grace period, then awakens all of the rcu_nocb_cb_kthread() instances
+ * that then have callback-invocation work to do.
+ */
+static int rcu_nocb_gp_kthread(void *arg)
+{
+ struct rcu_data *rdp = arg;
+
+ for (;;) {
+ WRITE_ONCE(rdp->nocb_gp_loops, rdp->nocb_gp_loops + 1);
+ nocb_gp_wait(rdp);
+ cond_resched_tasks_rcu_qs();
+ }
+ return 0;
+}
+
+static inline bool nocb_cb_can_run(struct rcu_data *rdp)
+{
+ u8 flags = SEGCBLIST_OFFLOADED | SEGCBLIST_KTHREAD_CB;
+
+ return rcu_segcblist_test_flags(&rdp->cblist, flags);
+}
+
+static inline bool nocb_cb_wait_cond(struct rcu_data *rdp)
+{
+ return nocb_cb_can_run(rdp) && !READ_ONCE(rdp->nocb_cb_sleep);
+}
+
+/*
+ * Invoke any ready callbacks from the corresponding no-CBs CPU,
+ * then, if there are no more, wait for more to appear.
+ */
+static void nocb_cb_wait(struct rcu_data *rdp)
+{
+ struct rcu_segcblist *cblist = &rdp->cblist;
+ unsigned long cur_gp_seq;
+ unsigned long flags;
+ bool needwake_state = false;
+ bool needwake_gp = false;
+ bool can_sleep = true;
+ struct rcu_node *rnp = rdp->mynode;
+
+ do {
+ swait_event_interruptible_exclusive(rdp->nocb_cb_wq,
+ nocb_cb_wait_cond(rdp));
+
+ // VVV Ensure CB invocation follows _sleep test.
+ if (smp_load_acquire(&rdp->nocb_cb_sleep)) { // ^^^
+ WARN_ON(signal_pending(current));
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty"));
+ }
+ } while (!nocb_cb_can_run(rdp));
+
+
+ local_irq_save(flags);
+ rcu_momentary_dyntick_idle();
+ local_irq_restore(flags);
+ /*
+ * Disable BH to provide the expected environment. Also, when
+ * transitioning to/from NOCB mode, a self-requeuing callback might
+ * be invoked from softirq. A short grace period could cause both
+ * instances of this callback would execute concurrently.
+ */
+ local_bh_disable();
+ rcu_do_batch(rdp);
+ local_bh_enable();
+ lockdep_assert_irqs_enabled();
+ rcu_nocb_lock_irqsave(rdp, flags);
+ if (rcu_segcblist_nextgp(cblist, &cur_gp_seq) &&
+ rcu_seq_done(&rnp->gp_seq, cur_gp_seq) &&
+ raw_spin_trylock_rcu_node(rnp)) { /* irqs already disabled. */
+ needwake_gp = rcu_advance_cbs(rdp->mynode, rdp);
+ raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
+ }
+
+ if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) {
+ if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) {
+ rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_CB);
+ if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP))
+ needwake_state = true;
+ }
+ if (rcu_segcblist_ready_cbs(cblist))
+ can_sleep = false;
+ } else {
+ /*
+ * De-offloading. Clear our flag and notify the de-offload worker.
+ * We won't touch the callbacks and keep sleeping until we ever
+ * get re-offloaded.
+ */
+ WARN_ON_ONCE(!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB));
+ rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_CB);
+ if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP))
+ needwake_state = true;
+ }
+
+ WRITE_ONCE(rdp->nocb_cb_sleep, can_sleep);
+
+ if (rdp->nocb_cb_sleep)
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("CBSleep"));
+
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ if (needwake_gp)
+ rcu_gp_kthread_wake();
+
+ if (needwake_state)
+ swake_up_one(&rdp->nocb_state_wq);
+}
+
+/*
+ * Per-rcu_data kthread, but only for no-CBs CPUs. Repeatedly invoke
+ * nocb_cb_wait() to do the dirty work.
+ */
+static int rcu_nocb_cb_kthread(void *arg)
+{
+ struct rcu_data *rdp = arg;
+
+ // Each pass through this loop does one callback batch, and,
+ // if there are no more ready callbacks, waits for them.
+ for (;;) {
+ nocb_cb_wait(rdp);
+ cond_resched_tasks_rcu_qs();
+ }
+ return 0;
+}
+
+/* Is a deferred wakeup of rcu_nocb_kthread() required? */
+static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp, int level)
+{
+ return READ_ONCE(rdp->nocb_defer_wakeup) >= level;
+}
+
+/* Do a deferred wakeup of rcu_nocb_kthread(). */
+static bool do_nocb_deferred_wakeup_common(struct rcu_data *rdp_gp,
+ struct rcu_data *rdp, int level,
+ unsigned long flags)
+ __releases(rdp_gp->nocb_gp_lock)
+{
+ int ndw;
+ int ret;
+
+ if (!rcu_nocb_need_deferred_wakeup(rdp_gp, level)) {
+ raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
+ return false;
+ }
+
+ ndw = rdp_gp->nocb_defer_wakeup;
+ ret = __wake_nocb_gp(rdp_gp, rdp, ndw == RCU_NOCB_WAKE_FORCE, flags);
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DeferredWake"));
+
+ return ret;
+}
+
+/* Do a deferred wakeup of rcu_nocb_kthread() from a timer handler. */
+static void do_nocb_deferred_wakeup_timer(struct timer_list *t)
+{
+ unsigned long flags;
+ struct rcu_data *rdp = from_timer(rdp, t, nocb_timer);
+
+ WARN_ON_ONCE(rdp->nocb_gp_rdp != rdp);
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Timer"));
+
+ raw_spin_lock_irqsave(&rdp->nocb_gp_lock, flags);
+ smp_mb__after_spinlock(); /* Timer expire before wakeup. */
+ do_nocb_deferred_wakeup_common(rdp, rdp, RCU_NOCB_WAKE_BYPASS, flags);
+}
+
+/*
+ * Do a deferred wakeup of rcu_nocb_kthread() from fastpath.
+ * This means we do an inexact common-case check. Note that if
+ * we miss, ->nocb_timer will eventually clean things up.
+ */
+static bool do_nocb_deferred_wakeup(struct rcu_data *rdp)
+{
+ unsigned long flags;
+ struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
+
+ if (!rdp_gp || !rcu_nocb_need_deferred_wakeup(rdp_gp, RCU_NOCB_WAKE))
+ return false;
+
+ raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
+ return do_nocb_deferred_wakeup_common(rdp_gp, rdp, RCU_NOCB_WAKE, flags);
+}
+
+void rcu_nocb_flush_deferred_wakeup(void)
+{
+ do_nocb_deferred_wakeup(this_cpu_ptr(&rcu_data));
+}
+EXPORT_SYMBOL_GPL(rcu_nocb_flush_deferred_wakeup);
+
+static int rdp_offload_toggle(struct rcu_data *rdp,
+ bool offload, unsigned long flags)
+ __releases(rdp->nocb_lock)
+{
+ struct rcu_segcblist *cblist = &rdp->cblist;
+ struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
+ bool wake_gp = false;
+
+ rcu_segcblist_offload(cblist, offload);
+
+ if (rdp->nocb_cb_sleep)
+ rdp->nocb_cb_sleep = false;
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+
+ /*
+ * Ignore former value of nocb_cb_sleep and force wake up as it could
+ * have been spuriously set to false already.
+ */
+ swake_up_one(&rdp->nocb_cb_wq);
+
+ raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
+ // Queue this rdp for add/del to/from the list to iterate on rcuog
+ WRITE_ONCE(rdp_gp->nocb_toggling_rdp, rdp);
+ if (rdp_gp->nocb_gp_sleep) {
+ rdp_gp->nocb_gp_sleep = false;
+ wake_gp = true;
+ }
+ raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
+
+ return wake_gp;
+}
+
+static long rcu_nocb_rdp_deoffload(void *arg)
+{
+ struct rcu_data *rdp = arg;
+ struct rcu_segcblist *cblist = &rdp->cblist;
+ unsigned long flags;
+ int wake_gp;
+ struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
+
+ /*
+ * rcu_nocb_rdp_deoffload() may be called directly if
+ * rcuog/o[p] spawn failed, because at this time the rdp->cpu
+ * is not online yet.
+ */
+ WARN_ON_ONCE((rdp->cpu != raw_smp_processor_id()) && cpu_online(rdp->cpu));
+
+ pr_info("De-offloading %d\n", rdp->cpu);
+
+ rcu_nocb_lock_irqsave(rdp, flags);
+ /*
+ * Flush once and for all now. This suffices because we are
+ * running on the target CPU holding ->nocb_lock (thus having
+ * interrupts disabled), and because rdp_offload_toggle()
+ * invokes rcu_segcblist_offload(), which clears SEGCBLIST_OFFLOADED.
+ * Thus future calls to rcu_segcblist_completely_offloaded() will
+ * return false, which means that future calls to rcu_nocb_try_bypass()
+ * will refuse to put anything into the bypass.
+ */
+ WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies, false));
+ /*
+ * Start with invoking rcu_core() early. This way if the current thread
+ * happens to preempt an ongoing call to rcu_core() in the middle,
+ * leaving some work dismissed because rcu_core() still thinks the rdp is
+ * completely offloaded, we are guaranteed a nearby future instance of
+ * rcu_core() to catch up.
+ */
+ rcu_segcblist_set_flags(cblist, SEGCBLIST_RCU_CORE);
+ invoke_rcu_core();
+ wake_gp = rdp_offload_toggle(rdp, false, flags);
+
+ mutex_lock(&rdp_gp->nocb_gp_kthread_mutex);
+ if (rdp_gp->nocb_gp_kthread) {
+ if (wake_gp)
+ wake_up_process(rdp_gp->nocb_gp_kthread);
+
+ /*
+ * If rcuo[p] kthread spawn failed, directly remove SEGCBLIST_KTHREAD_CB.
+ * Just wait SEGCBLIST_KTHREAD_GP to be cleared by rcuog.
+ */
+ if (!rdp->nocb_cb_kthread) {
+ rcu_nocb_lock_irqsave(rdp, flags);
+ rcu_segcblist_clear_flags(&rdp->cblist, SEGCBLIST_KTHREAD_CB);
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ }
+
+ swait_event_exclusive(rdp->nocb_state_wq,
+ !rcu_segcblist_test_flags(cblist,
+ SEGCBLIST_KTHREAD_CB | SEGCBLIST_KTHREAD_GP));
+ } else {
+ /*
+ * No kthread to clear the flags for us or remove the rdp from the nocb list
+ * to iterate. Do it here instead. Locking doesn't look stricly necessary
+ * but we stick to paranoia in this rare path.
+ */
+ rcu_nocb_lock_irqsave(rdp, flags);
+ rcu_segcblist_clear_flags(&rdp->cblist,
+ SEGCBLIST_KTHREAD_CB | SEGCBLIST_KTHREAD_GP);
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+
+ list_del(&rdp->nocb_entry_rdp);
+ }
+ mutex_unlock(&rdp_gp->nocb_gp_kthread_mutex);
+
+ /*
+ * Lock one last time to acquire latest callback updates from kthreads
+ * so we can later handle callbacks locally without locking.
+ */
+ rcu_nocb_lock_irqsave(rdp, flags);
+ /*
+ * Theoretically we could clear SEGCBLIST_LOCKING after the nocb
+ * lock is released but how about being paranoid for once?
+ */
+ rcu_segcblist_clear_flags(cblist, SEGCBLIST_LOCKING);
+ /*
+ * Without SEGCBLIST_LOCKING, we can't use
+ * rcu_nocb_unlock_irqrestore() anymore.
+ */
+ raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
+
+ /* Sanity check */
+ WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
+
+
+ return 0;
+}
+
+int rcu_nocb_cpu_deoffload(int cpu)
+{
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+ int ret = 0;
+
+ cpus_read_lock();
+ mutex_lock(&rcu_state.barrier_mutex);
+ if (rcu_rdp_is_offloaded(rdp)) {
+ if (cpu_online(cpu)) {
+ ret = work_on_cpu(cpu, rcu_nocb_rdp_deoffload, rdp);
+ if (!ret)
+ cpumask_clear_cpu(cpu, rcu_nocb_mask);
+ } else {
+ pr_info("NOCB: Cannot CB-deoffload offline CPU %d\n", rdp->cpu);
+ ret = -EINVAL;
+ }
+ }
+ mutex_unlock(&rcu_state.barrier_mutex);
+ cpus_read_unlock();
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(rcu_nocb_cpu_deoffload);
+
+static long rcu_nocb_rdp_offload(void *arg)
+{
+ struct rcu_data *rdp = arg;
+ struct rcu_segcblist *cblist = &rdp->cblist;
+ unsigned long flags;
+ int wake_gp;
+ struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
+
+ WARN_ON_ONCE(rdp->cpu != raw_smp_processor_id());
+ /*
+ * For now we only support re-offload, ie: the rdp must have been
+ * offloaded on boot first.
+ */
+ if (!rdp->nocb_gp_rdp)
+ return -EINVAL;
+
+ if (WARN_ON_ONCE(!rdp_gp->nocb_gp_kthread))
+ return -EINVAL;
+
+ pr_info("Offloading %d\n", rdp->cpu);
+
+ /*
+ * Can't use rcu_nocb_lock_irqsave() before SEGCBLIST_LOCKING
+ * is set.
+ */
+ raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
+
+ /*
+ * We didn't take the nocb lock while working on the
+ * rdp->cblist with SEGCBLIST_LOCKING cleared (pure softirq/rcuc mode).
+ * Every modifications that have been done previously on
+ * rdp->cblist must be visible remotely by the nocb kthreads
+ * upon wake up after reading the cblist flags.
+ *
+ * The layout against nocb_lock enforces that ordering:
+ *
+ * __rcu_nocb_rdp_offload() nocb_cb_wait()/nocb_gp_wait()
+ * ------------------------- ----------------------------
+ * WRITE callbacks rcu_nocb_lock()
+ * rcu_nocb_lock() READ flags
+ * WRITE flags READ callbacks
+ * rcu_nocb_unlock() rcu_nocb_unlock()
+ */
+ wake_gp = rdp_offload_toggle(rdp, true, flags);
+ if (wake_gp)
+ wake_up_process(rdp_gp->nocb_gp_kthread);
+ swait_event_exclusive(rdp->nocb_state_wq,
+ rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB) &&
+ rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP));
+
+ /*
+ * All kthreads are ready to work, we can finally relieve rcu_core() and
+ * enable nocb bypass.
+ */
+ rcu_nocb_lock_irqsave(rdp, flags);
+ rcu_segcblist_clear_flags(cblist, SEGCBLIST_RCU_CORE);
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+
+ return 0;
+}
+
+int rcu_nocb_cpu_offload(int cpu)
+{
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+ int ret = 0;
+
+ cpus_read_lock();
+ mutex_lock(&rcu_state.barrier_mutex);
+ if (!rcu_rdp_is_offloaded(rdp)) {
+ if (cpu_online(cpu)) {
+ ret = work_on_cpu(cpu, rcu_nocb_rdp_offload, rdp);
+ if (!ret)
+ cpumask_set_cpu(cpu, rcu_nocb_mask);
+ } else {
+ pr_info("NOCB: Cannot CB-offload offline CPU %d\n", rdp->cpu);
+ ret = -EINVAL;
+ }
+ }
+ mutex_unlock(&rcu_state.barrier_mutex);
+ cpus_read_unlock();
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(rcu_nocb_cpu_offload);
+
+#ifdef CONFIG_RCU_LAZY
+static unsigned long
+lazy_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
+{
+ int cpu;
+ unsigned long count = 0;
+
+ /* Snapshot count of all CPUs */
+ for_each_possible_cpu(cpu) {
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+
+ count += READ_ONCE(rdp->lazy_len);
+ }
+
+ return count ? count : SHRINK_EMPTY;
+}
+
+static unsigned long
+lazy_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
+{
+ int cpu;
+ unsigned long flags;
+ unsigned long count = 0;
+
+ /* Snapshot count of all CPUs */
+ for_each_possible_cpu(cpu) {
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+ int _count = READ_ONCE(rdp->lazy_len);
+
+ if (_count == 0)
+ continue;
+ rcu_nocb_lock_irqsave(rdp, flags);
+ WRITE_ONCE(rdp->lazy_len, 0);
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ wake_nocb_gp(rdp, false);
+ sc->nr_to_scan -= _count;
+ count += _count;
+ if (sc->nr_to_scan <= 0)
+ break;
+ }
+ return count ? count : SHRINK_STOP;
+}
+
+static struct shrinker lazy_rcu_shrinker = {
+ .count_objects = lazy_rcu_shrink_count,
+ .scan_objects = lazy_rcu_shrink_scan,
+ .batch = 0,
+ .seeks = DEFAULT_SEEKS,
+};
+#endif // #ifdef CONFIG_RCU_LAZY
+
+void __init rcu_init_nohz(void)
+{
+ int cpu;
+ struct rcu_data *rdp;
+ const struct cpumask *cpumask = NULL;
+
+#if defined(CONFIG_NO_HZ_FULL)
+ if (tick_nohz_full_running && !cpumask_empty(tick_nohz_full_mask))
+ cpumask = tick_nohz_full_mask;
+#endif
+
+ if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_DEFAULT_ALL) &&
+ !rcu_state.nocb_is_setup && !cpumask)
+ cpumask = cpu_possible_mask;
+
+ if (cpumask) {
+ if (!cpumask_available(rcu_nocb_mask)) {
+ if (!zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL)) {
+ pr_info("rcu_nocb_mask allocation failed, callback offloading disabled.\n");
+ return;
+ }
+ }
+
+ cpumask_or(rcu_nocb_mask, rcu_nocb_mask, cpumask);
+ rcu_state.nocb_is_setup = true;
+ }
+
+ if (!rcu_state.nocb_is_setup)
+ return;
+
+#ifdef CONFIG_RCU_LAZY
+ if (register_shrinker(&lazy_rcu_shrinker, "rcu-lazy"))
+ pr_err("Failed to register lazy_rcu shrinker!\n");
+#endif // #ifdef CONFIG_RCU_LAZY
+
+ if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) {
+ pr_info("\tNote: kernel parameter 'rcu_nocbs=', 'nohz_full', or 'isolcpus=' contains nonexistent CPUs.\n");
+ cpumask_and(rcu_nocb_mask, cpu_possible_mask,
+ rcu_nocb_mask);
+ }
+ if (cpumask_empty(rcu_nocb_mask))
+ pr_info("\tOffload RCU callbacks from CPUs: (none).\n");
+ else
+ pr_info("\tOffload RCU callbacks from CPUs: %*pbl.\n",
+ cpumask_pr_args(rcu_nocb_mask));
+ if (rcu_nocb_poll)
+ pr_info("\tPoll for callbacks from no-CBs CPUs.\n");
+
+ for_each_cpu(cpu, rcu_nocb_mask) {
+ rdp = per_cpu_ptr(&rcu_data, cpu);
+ if (rcu_segcblist_empty(&rdp->cblist))
+ rcu_segcblist_init(&rdp->cblist);
+ rcu_segcblist_offload(&rdp->cblist, true);
+ rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_CB | SEGCBLIST_KTHREAD_GP);
+ rcu_segcblist_clear_flags(&rdp->cblist, SEGCBLIST_RCU_CORE);
+ }
+ rcu_organize_nocb_kthreads();
+}
+
+/* Initialize per-rcu_data variables for no-CBs CPUs. */
+static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
+{
+ init_swait_queue_head(&rdp->nocb_cb_wq);
+ init_swait_queue_head(&rdp->nocb_gp_wq);
+ init_swait_queue_head(&rdp->nocb_state_wq);
+ raw_spin_lock_init(&rdp->nocb_lock);
+ raw_spin_lock_init(&rdp->nocb_bypass_lock);
+ raw_spin_lock_init(&rdp->nocb_gp_lock);
+ timer_setup(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer, 0);
+ rcu_cblist_init(&rdp->nocb_bypass);
+ WRITE_ONCE(rdp->lazy_len, 0);
+ mutex_init(&rdp->nocb_gp_kthread_mutex);
+}
+
+/*
+ * If the specified CPU is a no-CBs CPU that does not already have its
+ * rcuo CB kthread, spawn it. Additionally, if the rcuo GP kthread
+ * for this CPU's group has not yet been created, spawn it as well.
+ */
+static void rcu_spawn_cpu_nocb_kthread(int cpu)
+{
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+ struct rcu_data *rdp_gp;
+ struct task_struct *t;
+ struct sched_param sp;
+
+ if (!rcu_scheduler_fully_active || !rcu_state.nocb_is_setup)
+ return;
+
+ /* If there already is an rcuo kthread, then nothing to do. */
+ if (rdp->nocb_cb_kthread)
+ return;
+
+ /* If we didn't spawn the GP kthread first, reorganize! */
+ sp.sched_priority = kthread_prio;
+ rdp_gp = rdp->nocb_gp_rdp;
+ mutex_lock(&rdp_gp->nocb_gp_kthread_mutex);
+ if (!rdp_gp->nocb_gp_kthread) {
+ t = kthread_run(rcu_nocb_gp_kthread, rdp_gp,
+ "rcuog/%d", rdp_gp->cpu);
+ if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo GP kthread, OOM is now expected behavior\n", __func__)) {
+ mutex_unlock(&rdp_gp->nocb_gp_kthread_mutex);
+ goto end;
+ }
+ WRITE_ONCE(rdp_gp->nocb_gp_kthread, t);
+ if (kthread_prio)
+ sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
+ }
+ mutex_unlock(&rdp_gp->nocb_gp_kthread_mutex);
+
+ /* Spawn the kthread for this CPU. */
+ t = kthread_run(rcu_nocb_cb_kthread, rdp,
+ "rcuo%c/%d", rcu_state.abbr, cpu);
+ if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo CB kthread, OOM is now expected behavior\n", __func__))
+ goto end;
+
+ if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_CB_BOOST) && kthread_prio)
+ sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
+
+ WRITE_ONCE(rdp->nocb_cb_kthread, t);
+ WRITE_ONCE(rdp->nocb_gp_kthread, rdp_gp->nocb_gp_kthread);
+ return;
+end:
+ mutex_lock(&rcu_state.barrier_mutex);
+ if (rcu_rdp_is_offloaded(rdp)) {
+ rcu_nocb_rdp_deoffload(rdp);
+ cpumask_clear_cpu(cpu, rcu_nocb_mask);
+ }
+ mutex_unlock(&rcu_state.barrier_mutex);
+}
+
+/* How many CB CPU IDs per GP kthread? Default of -1 for sqrt(nr_cpu_ids). */
+static int rcu_nocb_gp_stride = -1;
+module_param(rcu_nocb_gp_stride, int, 0444);
+
+/*
+ * Initialize GP-CB relationships for all no-CBs CPU.
+ */
+static void __init rcu_organize_nocb_kthreads(void)
+{
+ int cpu;
+ bool firsttime = true;
+ bool gotnocbs = false;
+ bool gotnocbscbs = true;
+ int ls = rcu_nocb_gp_stride;
+ int nl = 0; /* Next GP kthread. */
+ struct rcu_data *rdp;
+ struct rcu_data *rdp_gp = NULL; /* Suppress misguided gcc warn. */
+
+ if (!cpumask_available(rcu_nocb_mask))
+ return;
+ if (ls == -1) {
+ ls = nr_cpu_ids / int_sqrt(nr_cpu_ids);
+ rcu_nocb_gp_stride = ls;
+ }
+
+ /*
+ * Each pass through this loop sets up one rcu_data structure.
+ * Should the corresponding CPU come online in the future, then
+ * we will spawn the needed set of rcu_nocb_kthread() kthreads.
+ */
+ for_each_possible_cpu(cpu) {
+ rdp = per_cpu_ptr(&rcu_data, cpu);
+ if (rdp->cpu >= nl) {
+ /* New GP kthread, set up for CBs & next GP. */
+ gotnocbs = true;
+ nl = DIV_ROUND_UP(rdp->cpu + 1, ls) * ls;
+ rdp_gp = rdp;
+ INIT_LIST_HEAD(&rdp->nocb_head_rdp);
+ if (dump_tree) {
+ if (!firsttime)
+ pr_cont("%s\n", gotnocbscbs
+ ? "" : " (self only)");
+ gotnocbscbs = false;
+ firsttime = false;
+ pr_alert("%s: No-CB GP kthread CPU %d:",
+ __func__, cpu);
+ }
+ } else {
+ /* Another CB kthread, link to previous GP kthread. */
+ gotnocbscbs = true;
+ if (dump_tree)
+ pr_cont(" %d", cpu);
+ }
+ rdp->nocb_gp_rdp = rdp_gp;
+ if (cpumask_test_cpu(cpu, rcu_nocb_mask))
+ list_add_tail(&rdp->nocb_entry_rdp, &rdp_gp->nocb_head_rdp);
+ }
+ if (gotnocbs && dump_tree)
+ pr_cont("%s\n", gotnocbscbs ? "" : " (self only)");
+}
+
+/*
+ * Bind the current task to the offloaded CPUs. If there are no offloaded
+ * CPUs, leave the task unbound. Splat if the bind attempt fails.
+ */
+void rcu_bind_current_to_nocb(void)
+{
+ if (cpumask_available(rcu_nocb_mask) && !cpumask_empty(rcu_nocb_mask))
+ WARN_ON(sched_setaffinity(current->pid, rcu_nocb_mask));
+}
+EXPORT_SYMBOL_GPL(rcu_bind_current_to_nocb);
+
+// The ->on_cpu field is available only in CONFIG_SMP=y, so...
+#ifdef CONFIG_SMP
+static char *show_rcu_should_be_on_cpu(struct task_struct *tsp)
+{
+ return tsp && task_is_running(tsp) && !tsp->on_cpu ? "!" : "";
+}
+#else // #ifdef CONFIG_SMP
+static char *show_rcu_should_be_on_cpu(struct task_struct *tsp)
+{
+ return "";
+}
+#endif // #else #ifdef CONFIG_SMP
+
+/*
+ * Dump out nocb grace-period kthread state for the specified rcu_data
+ * structure.
+ */
+static void show_rcu_nocb_gp_state(struct rcu_data *rdp)
+{
+ struct rcu_node *rnp = rdp->mynode;
+
+ pr_info("nocb GP %d %c%c%c%c%c %c[%c%c] %c%c:%ld rnp %d:%d %lu %c CPU %d%s\n",
+ rdp->cpu,
+ "kK"[!!rdp->nocb_gp_kthread],
+ "lL"[raw_spin_is_locked(&rdp->nocb_gp_lock)],
+ "dD"[!!rdp->nocb_defer_wakeup],
+ "tT"[timer_pending(&rdp->nocb_timer)],
+ "sS"[!!rdp->nocb_gp_sleep],
+ ".W"[swait_active(&rdp->nocb_gp_wq)],
+ ".W"[swait_active(&rnp->nocb_gp_wq[0])],
+ ".W"[swait_active(&rnp->nocb_gp_wq[1])],
+ ".B"[!!rdp->nocb_gp_bypass],
+ ".G"[!!rdp->nocb_gp_gp],
+ (long)rdp->nocb_gp_seq,
+ rnp->grplo, rnp->grphi, READ_ONCE(rdp->nocb_gp_loops),
+ rdp->nocb_gp_kthread ? task_state_to_char(rdp->nocb_gp_kthread) : '.',
+ rdp->nocb_gp_kthread ? (int)task_cpu(rdp->nocb_gp_kthread) : -1,
+ show_rcu_should_be_on_cpu(rdp->nocb_gp_kthread));
+}
+
+/* Dump out nocb kthread state for the specified rcu_data structure. */
+static void show_rcu_nocb_state(struct rcu_data *rdp)
+{
+ char bufw[20];
+ char bufr[20];
+ struct rcu_data *nocb_next_rdp;
+ struct rcu_segcblist *rsclp = &rdp->cblist;
+ bool waslocked;
+ bool wassleep;
+
+ if (rdp->nocb_gp_rdp == rdp)
+ show_rcu_nocb_gp_state(rdp);
+
+ nocb_next_rdp = list_next_or_null_rcu(&rdp->nocb_gp_rdp->nocb_head_rdp,
+ &rdp->nocb_entry_rdp,
+ typeof(*rdp),
+ nocb_entry_rdp);
+
+ sprintf(bufw, "%ld", rsclp->gp_seq[RCU_WAIT_TAIL]);
+ sprintf(bufr, "%ld", rsclp->gp_seq[RCU_NEXT_READY_TAIL]);
+ pr_info(" CB %d^%d->%d %c%c%c%c%c%c F%ld L%ld C%d %c%c%s%c%s%c%c q%ld %c CPU %d%s\n",
+ rdp->cpu, rdp->nocb_gp_rdp->cpu,
+ nocb_next_rdp ? nocb_next_rdp->cpu : -1,
+ "kK"[!!rdp->nocb_cb_kthread],
+ "bB"[raw_spin_is_locked(&rdp->nocb_bypass_lock)],
+ "cC"[!!atomic_read(&rdp->nocb_lock_contended)],
+ "lL"[raw_spin_is_locked(&rdp->nocb_lock)],
+ "sS"[!!rdp->nocb_cb_sleep],
+ ".W"[swait_active(&rdp->nocb_cb_wq)],
+ jiffies - rdp->nocb_bypass_first,
+ jiffies - rdp->nocb_nobypass_last,
+ rdp->nocb_nobypass_count,
+ ".D"[rcu_segcblist_ready_cbs(rsclp)],
+ ".W"[!rcu_segcblist_segempty(rsclp, RCU_WAIT_TAIL)],
+ rcu_segcblist_segempty(rsclp, RCU_WAIT_TAIL) ? "" : bufw,
+ ".R"[!rcu_segcblist_segempty(rsclp, RCU_NEXT_READY_TAIL)],
+ rcu_segcblist_segempty(rsclp, RCU_NEXT_READY_TAIL) ? "" : bufr,
+ ".N"[!rcu_segcblist_segempty(rsclp, RCU_NEXT_TAIL)],
+ ".B"[!!rcu_cblist_n_cbs(&rdp->nocb_bypass)],
+ rcu_segcblist_n_cbs(&rdp->cblist),
+ rdp->nocb_cb_kthread ? task_state_to_char(rdp->nocb_cb_kthread) : '.',
+ rdp->nocb_cb_kthread ? (int)task_cpu(rdp->nocb_cb_kthread) : -1,
+ show_rcu_should_be_on_cpu(rdp->nocb_cb_kthread));
+
+ /* It is OK for GP kthreads to have GP state. */
+ if (rdp->nocb_gp_rdp == rdp)
+ return;
+
+ waslocked = raw_spin_is_locked(&rdp->nocb_gp_lock);
+ wassleep = swait_active(&rdp->nocb_gp_wq);
+ if (!rdp->nocb_gp_sleep && !waslocked && !wassleep)
+ return; /* Nothing untoward. */
+
+ pr_info(" nocb GP activity on CB-only CPU!!! %c%c%c %c\n",
+ "lL"[waslocked],
+ "dD"[!!rdp->nocb_defer_wakeup],
+ "sS"[!!rdp->nocb_gp_sleep],
+ ".W"[wassleep]);
+}
+
+#else /* #ifdef CONFIG_RCU_NOCB_CPU */
+
+static inline int rcu_lockdep_is_held_nocb(struct rcu_data *rdp)
+{
+ return 0;
+}
+
+static inline bool rcu_current_is_nocb_kthread(struct rcu_data *rdp)
+{
+ return false;
+}
+
+/* No ->nocb_lock to acquire. */
+static void rcu_nocb_lock(struct rcu_data *rdp)
+{
+}
+
+/* No ->nocb_lock to release. */
+static void rcu_nocb_unlock(struct rcu_data *rdp)
+{
+}
+
+/* No ->nocb_lock to release. */
+static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp,
+ unsigned long flags)
+{
+ local_irq_restore(flags);
+}
+
+/* Lockdep check that ->cblist may be safely accessed. */
+static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp)
+{
+ lockdep_assert_irqs_disabled();
+}
+
+static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq)
+{
+}
+
+static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp)
+{
+ return NULL;
+}
+
+static void rcu_init_one_nocb(struct rcu_node *rnp)
+{
+}
+
+static bool wake_nocb_gp(struct rcu_data *rdp, bool force)
+{
+ return false;
+}
+
+static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
+ unsigned long j, bool lazy)
+{
+ return true;
+}
+
+static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
+ bool *was_alldone, unsigned long flags, bool lazy)
+{
+ return false;
+}
+
+static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty,
+ unsigned long flags)
+{
+ WARN_ON_ONCE(1); /* Should be dead code! */
+}
+
+static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
+{
+}
+
+static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp, int level)
+{
+ return false;
+}
+
+static bool do_nocb_deferred_wakeup(struct rcu_data *rdp)
+{
+ return false;
+}
+
+static void rcu_spawn_cpu_nocb_kthread(int cpu)
+{
+}
+
+static void show_rcu_nocb_state(struct rcu_data *rdp)
+{
+}
+
+#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index de1dc3bb7f70..7b0fe741a088 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -13,43 +13,10 @@
#include "../locking/rtmutex_common.h"
-#ifdef CONFIG_RCU_NOCB_CPU
-static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
-static bool __read_mostly rcu_nocb_poll; /* Offload kthread are to poll. */
-static inline int rcu_lockdep_is_held_nocb(struct rcu_data *rdp)
-{
- return lockdep_is_held(&rdp->nocb_lock);
-}
-
-static inline bool rcu_current_is_nocb_kthread(struct rcu_data *rdp)
-{
- /* Race on early boot between thread creation and assignment */
- if (!rdp->nocb_cb_kthread || !rdp->nocb_gp_kthread)
- return true;
-
- if (current == rdp->nocb_cb_kthread || current == rdp->nocb_gp_kthread)
- if (in_task())
- return true;
- return false;
-}
-
-#else
-static inline int rcu_lockdep_is_held_nocb(struct rcu_data *rdp)
-{
- return 0;
-}
-
-static inline bool rcu_current_is_nocb_kthread(struct rcu_data *rdp)
-{
- return false;
-}
-
-#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
-
static bool rcu_rdp_is_offloaded(struct rcu_data *rdp)
{
/*
- * In order to read the offloaded state of an rdp is a safe
+ * In order to read the offloaded state of an rdp in a safe
* and stable way and prevent from its value to be changed
* under us, we must either hold the barrier mutex, the cpu
* hotplug lock (read or write) or the nocb lock. Local
@@ -84,12 +51,10 @@ static void __init rcu_bootup_announce_oddness(void)
RCU_FANOUT);
if (rcu_fanout_exact)
pr_info("\tHierarchical RCU autobalancing is disabled.\n");
- if (IS_ENABLED(CONFIG_RCU_FAST_NO_HZ))
- pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n");
if (IS_ENABLED(CONFIG_PROVE_RCU))
pr_info("\tRCU lockdep checking is enabled.\n");
if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD))
- pr_info("\tRCU strict (and thus non-scalable) grace periods enabled.\n");
+ pr_info("\tRCU strict (and thus non-scalable) grace periods are enabled.\n");
if (RCU_NUM_LVLS >= 4)
pr_info("\tFour(or more)-level hierarchy is enabled.\n");
if (RCU_FANOUT_LEAF != 16)
@@ -121,13 +86,13 @@ static void __init rcu_bootup_announce_oddness(void)
if (rcu_kick_kthreads)
pr_info("\tKick kthreads if too-long grace period.\n");
if (IS_ENABLED(CONFIG_DEBUG_OBJECTS_RCU_HEAD))
- pr_info("\tRCU callback double-/use-after-free debug enabled.\n");
+ pr_info("\tRCU callback double-/use-after-free debug is enabled.\n");
if (gp_preinit_delay)
pr_info("\tRCU debug GP pre-init slowdown %d jiffies.\n", gp_preinit_delay);
if (gp_init_delay)
pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_init_delay);
if (gp_cleanup_delay)
- pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_cleanup_delay);
+ pr_info("\tRCU debug GP cleanup slowdown %d jiffies.\n", gp_cleanup_delay);
if (!use_softirq)
pr_info("\tRCU_SOFTIRQ processing moved to rcuc kthreads.\n");
if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG))
@@ -293,10 +258,10 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
* no need to check for a subsequent expedited GP. (Though we are
* still in a quiescent state in any case.)
*/
- if (blkd_state & RCU_EXP_BLKD && rdp->exp_deferred_qs)
+ if (blkd_state & RCU_EXP_BLKD && rdp->cpu_no_qs.b.exp)
rcu_report_exp_rdp(rdp);
else
- WARN_ON_ONCE(rdp->exp_deferred_qs);
+ WARN_ON_ONCE(rdp->cpu_no_qs.b.exp);
}
/*
@@ -310,12 +275,16 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
* current task, there might be any number of other tasks blocked while
* in an RCU read-side critical section.
*
+ * Unlike non-preemptible-RCU, quiescent state reports for expedited
+ * grace periods are handled separately via deferred quiescent states
+ * and context switch events.
+ *
* Callers to this function must disable preemption.
*/
static void rcu_qs(void)
{
RCU_LOCKDEP_WARN(preemptible(), "rcu_qs() invoked with preemption enabled!!!\n");
- if (__this_cpu_read(rcu_data.cpu_no_qs.s)) {
+ if (__this_cpu_read(rcu_data.cpu_no_qs.b.norm)) {
trace_rcu_grace_period(TPS("rcu_preempt"),
__this_cpu_read(rcu_data.gp_seq),
TPS("cpuqs"));
@@ -346,7 +315,7 @@ void rcu_note_context_switch(bool preempt)
trace_rcu_utilization(TPS("Start context switch"));
lockdep_assert_irqs_disabled();
- WARN_ON_ONCE(!preempt && rcu_preempt_depth() > 0);
+ WARN_ONCE(!preempt && rcu_preempt_depth() > 0, "Voluntary context switch within RCU read-side critical section!");
if (rcu_preempt_depth() > 0 &&
!t->rcu_read_unlock_special.b.blocked) {
@@ -361,7 +330,7 @@ void rcu_note_context_switch(bool preempt)
* then queue the task as required based on the states
* of any ongoing and expedited grace periods.
*/
- WARN_ON_ONCE((rdp->grpmask & rcu_rnp_online_cpus(rnp)) == 0);
+ WARN_ON_ONCE(!rcu_rdp_cpu_online(rdp));
WARN_ON_ONCE(!list_empty(&t->rcu_node_entry));
trace_rcu_preempt_task(rcu_state.name,
t->pid,
@@ -383,7 +352,7 @@ void rcu_note_context_switch(bool preempt)
* means that we continue to block the current grace period.
*/
rcu_qs();
- if (rdp->exp_deferred_qs)
+ if (rdp->cpu_no_qs.b.exp)
rcu_report_exp_rdp(rdp);
rcu_tasks_qs(current, preempt);
trace_rcu_utilization(TPS("End context switch"));
@@ -405,17 +374,20 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
static void rcu_preempt_read_enter(void)
{
- current->rcu_read_lock_nesting++;
+ WRITE_ONCE(current->rcu_read_lock_nesting, READ_ONCE(current->rcu_read_lock_nesting) + 1);
}
static int rcu_preempt_read_exit(void)
{
- return --current->rcu_read_lock_nesting;
+ int ret = READ_ONCE(current->rcu_read_lock_nesting) - 1;
+
+ WRITE_ONCE(current->rcu_read_lock_nesting, ret);
+ return ret;
}
static void rcu_preempt_depth_set(int val)
{
- current->rcu_read_lock_nesting = val;
+ WRITE_ONCE(current->rcu_read_lock_nesting, val);
}
/*
@@ -488,7 +460,7 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp)
* be quite short, for example, in the case of the call from
* rcu_read_unlock_special().
*/
-static void
+static notrace void
rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
{
bool empty_exp;
@@ -507,13 +479,14 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
*/
special = t->rcu_read_unlock_special;
rdp = this_cpu_ptr(&rcu_data);
- if (!special.s && !rdp->exp_deferred_qs) {
+ if (!special.s && !rdp->cpu_no_qs.b.exp) {
local_irq_restore(flags);
return;
}
t->rcu_read_unlock_special.s = 0;
if (special.b.need_qs) {
if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) {
+ rdp->cpu_no_qs.b.norm = false;
rcu_report_qs_rdp(rdp);
udelay(rcu_unlock_delay);
} else {
@@ -527,7 +500,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
* tasks are handled when removing the task from the
* blocked-tasks list below.
*/
- if (rdp->exp_deferred_qs)
+ if (rdp->cpu_no_qs.b.exp)
rcu_report_exp_rdp(rdp);
/* Clean up if blocked during RCU read-side critical section. */
@@ -559,7 +532,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
WRITE_ONCE(rnp->exp_tasks, np);
if (IS_ENABLED(CONFIG_RCU_BOOST)) {
/* Snapshot ->boost_mtx ownership w/rnp->lock held. */
- drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx) == t;
+ drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx.rtmutex) == t;
if (&t->rcu_node_entry == rnp->boost_tasks)
WRITE_ONCE(rnp->boost_tasks, np);
}
@@ -584,16 +557,16 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
- /* Unboost if we were boosted. */
- if (IS_ENABLED(CONFIG_RCU_BOOST) && drop_boost_mutex)
- rt_mutex_futex_unlock(&rnp->boost_mtx);
-
/*
* If this was the last task on the expedited lists,
* then we need to report up the rcu_node hierarchy.
*/
if (!empty_exp && empty_exp_now)
rcu_report_exp_rnp(rnp, true);
+
+ /* Unboost if we were boosted. */
+ if (IS_ENABLED(CONFIG_RCU_BOOST) && drop_boost_mutex)
+ rt_mutex_futex_unlock(&rnp->boost_mtx.rtmutex);
} else {
local_irq_restore(flags);
}
@@ -608,9 +581,9 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
* is disabled. This function cannot be expected to understand these
* nuances, so the caller must handle them.
*/
-static bool rcu_preempt_need_deferred_qs(struct task_struct *t)
+static notrace bool rcu_preempt_need_deferred_qs(struct task_struct *t)
{
- return (__this_cpu_read(rcu_data.exp_deferred_qs) ||
+ return (__this_cpu_read(rcu_data.cpu_no_qs.b.exp) ||
READ_ONCE(t->rcu_read_unlock_special.s)) &&
rcu_preempt_depth() == 0;
}
@@ -622,7 +595,7 @@ static bool rcu_preempt_need_deferred_qs(struct task_struct *t)
* evaluate safety in terms of interrupt, softirq, and preemption
* disabling.
*/
-static void rcu_preempt_deferred_qs(struct task_struct *t)
+notrace void rcu_preempt_deferred_qs(struct task_struct *t)
{
unsigned long flags;
@@ -668,11 +641,12 @@ static void rcu_read_unlock_special(struct task_struct *t)
expboost = (t->rcu_blocked_node && READ_ONCE(t->rcu_blocked_node->exp_tasks)) ||
(rdp->grpmask & READ_ONCE(rnp->expmask)) ||
- IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) ||
+ (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) &&
+ ((rdp->grpmask & READ_ONCE(rnp->qsmask)) || t->rcu_blocked_node)) ||
(IS_ENABLED(CONFIG_RCU_BOOST) && irqs_were_disabled &&
t->rcu_blocked_node);
// Need to defer quiescent state until everything is enabled.
- if (use_softirq && (in_irq() || (expboost && !irqs_were_disabled))) {
+ if (use_softirq && (in_hardirq() || (expboost && !irqs_were_disabled))) {
// Using softirq, safe to awaken, and either the
// wakeup is free or there is either an expedited
// GP in flight or a potential need to deboost.
@@ -688,7 +662,13 @@ static void rcu_read_unlock_special(struct task_struct *t)
expboost && !rdp->defer_qs_iw_pending && cpu_online(rdp->cpu)) {
// Get scheduler to re-evaluate and call hooks.
// If !IRQ_WORK, FQS scan will eventually IPI.
- init_irq_work(&rdp->defer_qs_iw, rcu_preempt_deferred_qs_handler);
+ if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) &&
+ IS_ENABLED(CONFIG_PREEMPT_RT))
+ rdp->defer_qs_iw = IRQ_WORK_INIT_HARD(
+ rcu_preempt_deferred_qs_handler);
+ else
+ init_irq_work(&rdp->defer_qs_iw,
+ rcu_preempt_deferred_qs_handler);
rdp->defer_qs_iw_pending = true;
irq_work_queue_on(&rdp->defer_qs_iw, rdp->cpu);
}
@@ -739,9 +719,6 @@ static void rcu_flavor_sched_clock_irq(int user)
struct task_struct *t = current;
lockdep_assert_irqs_disabled();
- if (user || rcu_is_cpu_rrupt_from_idle()) {
- rcu_note_voluntary_context_switch(current);
- }
if (rcu_preempt_depth() > 0 ||
(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK))) {
/* No QS, force context switch if deferred. */
@@ -801,7 +778,6 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck)
int cpu;
int i;
struct list_head *lhp;
- bool onl;
struct rcu_data *rdp;
struct rcu_node *rnp1;
@@ -825,9 +801,8 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck)
pr_cont("\n");
for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) {
rdp = per_cpu_ptr(&rcu_data, cpu);
- onl = !!(rdp->grpmask & rcu_rnp_online_cpus(rnp));
pr_info("\t%d: %c online: %ld(%d) offline: %ld(%d)\n",
- cpu, ".o"[onl],
+ cpu, ".o"[rcu_rdp_cpu_online(rdp)],
(long)rdp->rcu_onl_gp_seq, rdp->rcu_onl_gp_flags,
(long)rdp->rcu_ofl_gp_seq, rdp->rcu_ofl_gp_flags);
}
@@ -844,10 +819,10 @@ void rcu_read_unlock_strict(void)
{
struct rcu_data *rdp;
- if (!IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) ||
- irqs_disabled() || preempt_count() || !rcu_state.gp_kthread)
+ if (irqs_disabled() || preempt_count() || !rcu_state.gp_kthread)
return;
rdp = this_cpu_ptr(&rcu_data);
+ rdp->cpu_no_qs.b.norm = false;
rcu_report_qs_rdp(rdp);
udelay(rcu_unlock_delay);
}
@@ -876,10 +851,8 @@ static void rcu_qs(void)
trace_rcu_grace_period(TPS("rcu_sched"),
__this_cpu_read(rcu_data.gp_seq), TPS("cpuqs"));
__this_cpu_write(rcu_data.cpu_no_qs.b.norm, false);
- if (!__this_cpu_read(rcu_data.cpu_no_qs.b.exp))
- return;
- __this_cpu_write(rcu_data.cpu_no_qs.b.exp, false);
- rcu_report_exp_rdp(this_cpu_ptr(&rcu_data));
+ if (__this_cpu_read(rcu_data.cpu_no_qs.b.exp))
+ rcu_report_exp_rdp(this_cpu_ptr(&rcu_data));
}
/*
@@ -895,7 +868,7 @@ void rcu_all_qs(void)
if (!raw_cpu_read(rcu_data.rcu_urgent_qs))
return;
- preempt_disable();
+ preempt_disable(); // For CONFIG_PREEMPT_COUNT=y kernels
/* Load rcu_urgent_qs before other flags. */
if (!smp_load_acquire(this_cpu_ptr(&rcu_data.rcu_urgent_qs))) {
preempt_enable();
@@ -925,8 +898,8 @@ void rcu_note_context_switch(bool preempt)
this_cpu_write(rcu_data.rcu_urgent_qs, false);
if (unlikely(raw_cpu_read(rcu_data.rcu_need_heavy_qs)))
rcu_momentary_dyntick_idle();
- rcu_tasks_qs(current, preempt);
out:
+ rcu_tasks_qs(current, preempt);
trace_rcu_utilization(TPS("End context switch"));
}
EXPORT_SYMBOL_GPL(rcu_note_context_switch);
@@ -952,11 +925,25 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp)
* Because there is no preemptible RCU, there can be no deferred quiescent
* states.
*/
-static bool rcu_preempt_need_deferred_qs(struct task_struct *t)
+static notrace bool rcu_preempt_need_deferred_qs(struct task_struct *t)
{
return false;
}
-static void rcu_preempt_deferred_qs(struct task_struct *t) { }
+
+// Except that we do need to respond to a request by an expedited
+// grace period for a quiescent state from this CPU. Note that in
+// non-preemptible kernels, there can be no context switches within RCU
+// read-side critical sections, which in turn means that the leaf rcu_node
+// structure's blocked-tasks list is always empty. is therefore no need to
+// actually check it. Instead, a quiescent state from this CPU suffices,
+// and this function is only called from such a quiescent state.
+notrace void rcu_preempt_deferred_qs(struct task_struct *t)
+{
+ struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
+
+ if (rdp->cpu_no_qs.b.exp)
+ rcu_report_exp_rdp(rdp);
+}
/*
* Because there is no preemptible RCU, there can be no readers blocked,
@@ -987,7 +974,6 @@ static void rcu_flavor_sched_clock_irq(int user)
* neither access nor modify, at least not while the
* corresponding CPU is online.
*/
-
rcu_qs();
}
}
@@ -1016,12 +1002,34 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck)
*/
static void rcu_cpu_kthread_setup(unsigned int cpu)
{
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
#ifdef CONFIG_RCU_BOOST
struct sched_param sp;
sp.sched_priority = kthread_prio;
sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
#endif /* #ifdef CONFIG_RCU_BOOST */
+
+ WRITE_ONCE(rdp->rcuc_activity, jiffies);
+}
+
+static bool rcu_is_callbacks_nocb_kthread(struct rcu_data *rdp)
+{
+#ifdef CONFIG_RCU_NOCB_CPU
+ return rdp->nocb_cb_kthread == current;
+#else
+ return false;
+#endif
+}
+
+/*
+ * Is the current CPU running the RCU-callbacks kthread?
+ * Caller must have preemption disabled.
+ */
+static bool rcu_is_callbacks_kthread(struct rcu_data *rdp)
+{
+ return rdp->rcu_cpu_kthread_task == current ||
+ rcu_is_callbacks_nocb_kthread(rdp);
}
#ifdef CONFIG_RCU_BOOST
@@ -1083,7 +1091,7 @@ static int rcu_boost(struct rcu_node *rnp)
* section.
*/
t = container_of(tb, struct task_struct, rcu_node_entry);
- rt_mutex_init_proxy_locked(&rnp->boost_mtx, t);
+ rt_mutex_init_proxy_locked(&rnp->boost_mtx.rtmutex, t);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
/* Lock only for side effect: boosts task t's priority. */
rt_mutex_lock(&rnp->boost_mtx);
@@ -1143,7 +1151,8 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
__releases(rnp->lock)
{
raw_lockdep_assert_held_rcu_node(rnp);
- if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) {
+ if (!rnp->boost_kthread_task ||
+ (!rcu_preempt_blocked_readers_cgp(rnp) && !rnp->exp_tasks)) {
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
return;
}
@@ -1151,7 +1160,8 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
(rnp->gp_tasks != NULL &&
rnp->boost_tasks == NULL &&
rnp->qsmask == 0 &&
- (!time_after(rnp->boost_time, jiffies) || rcu_state.cbovld))) {
+ (!time_after(rnp->boost_time, jiffies) || rcu_state.cbovld ||
+ IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)))) {
if (rnp->exp_tasks == NULL)
WRITE_ONCE(rnp->boost_tasks, rnp->gp_tasks);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
@@ -1162,15 +1172,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
}
}
-/*
- * Is the current CPU running the RCU-callbacks kthread?
- * Caller must have preemption disabled.
- */
-static bool rcu_is_callbacks_kthread(void)
-{
- return __this_cpu_read(rcu_data.rcu_cpu_kthread_task) == current;
-}
-
#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000)
/*
@@ -1184,7 +1185,6 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
/*
* Create an RCU-boost kthread for the specified node if one does not
* already exist. We only create this kthread for preemptible RCU.
- * Returns zero if all is well, a negated errno otherwise.
*/
static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp)
{
@@ -1193,15 +1193,14 @@ static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp)
struct sched_param sp;
struct task_struct *t;
+ mutex_lock(&rnp->boost_kthread_mutex);
if (rnp->boost_kthread_task || !rcu_scheduler_fully_active)
- return;
-
- rcu_state.boost = 1;
+ goto out;
t = kthread_create(rcu_boost_kthread, (void *)rnp,
"rcub/%d", rnp_index);
if (WARN_ON_ONCE(IS_ERR(t)))
- return;
+ goto out;
raw_spin_lock_irqsave_rcu_node(rnp, flags);
rnp->boost_kthread_task = t;
@@ -1209,6 +1208,9 @@ static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp)
sp.sched_priority = kthread_prio;
sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */
+
+ out:
+ mutex_unlock(&rnp->boost_kthread_mutex);
}
/*
@@ -1219,11 +1221,13 @@ static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp)
* We don't include outgoingcpu in the affinity set, use -1 if there is
* no outgoing CPU. If there are no CPUs left in the affinity set,
* this function allows the kthread to execute on any CPU.
+ *
+ * Any future concurrent calls are serialized via ->boost_kthread_mutex.
*/
static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
{
struct task_struct *t = rnp->boost_kthread_task;
- unsigned long mask = rcu_rnp_online_cpus(rnp);
+ unsigned long mask;
cpumask_var_t cm;
int cpu;
@@ -1231,28 +1235,23 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
return;
if (!zalloc_cpumask_var(&cm, GFP_KERNEL))
return;
+ mutex_lock(&rnp->boost_kthread_mutex);
+ mask = rcu_rnp_online_cpus(rnp);
for_each_leaf_node_possible_cpu(rnp, cpu)
if ((mask & leaf_node_cpu_bit(rnp, cpu)) &&
cpu != outgoingcpu)
cpumask_set_cpu(cpu, cm);
- if (cpumask_weight(cm) == 0)
- cpumask_setall(cm);
+ cpumask_and(cm, cm, housekeeping_cpumask(HK_TYPE_RCU));
+ if (cpumask_empty(cm)) {
+ cpumask_copy(cm, housekeeping_cpumask(HK_TYPE_RCU));
+ if (outgoingcpu >= 0)
+ cpumask_clear_cpu(outgoingcpu, cm);
+ }
set_cpus_allowed_ptr(t, cm);
+ mutex_unlock(&rnp->boost_kthread_mutex);
free_cpumask_var(cm);
}
-/*
- * Spawn boost kthreads -- called as soon as the scheduler is running.
- */
-static void __init rcu_spawn_boost_kthreads(void)
-{
- struct rcu_node *rnp;
-
- rcu_for_each_leaf_node(rnp)
- if (rcu_rnp_online_cpus(rnp))
- rcu_spawn_one_boost_kthread(rnp);
-}
-
#else /* #ifdef CONFIG_RCU_BOOST */
static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
@@ -1261,11 +1260,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
-static bool rcu_is_callbacks_kthread(void)
-{
- return false;
-}
-
static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
{
}
@@ -1278,1661 +1272,8 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
{
}
-static void __init rcu_spawn_boost_kthreads(void)
-{
-}
-
#endif /* #else #ifdef CONFIG_RCU_BOOST */
-#if !defined(CONFIG_RCU_FAST_NO_HZ)
-
-/*
- * Check to see if any future non-offloaded RCU-related work will need
- * to be done by the current CPU, even if none need be done immediately,
- * returning 1 if so. This function is part of the RCU implementation;
- * it is -not- an exported member of the RCU API.
- *
- * Because we not have RCU_FAST_NO_HZ, just check whether or not this
- * CPU has RCU callbacks queued.
- */
-int rcu_needs_cpu(u64 basemono, u64 *nextevt)
-{
- *nextevt = KTIME_MAX;
- return !rcu_segcblist_empty(&this_cpu_ptr(&rcu_data)->cblist) &&
- !rcu_rdp_is_offloaded(this_cpu_ptr(&rcu_data));
-}
-
-/*
- * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up
- * after it.
- */
-static void rcu_cleanup_after_idle(void)
-{
-}
-
-/*
- * Do the idle-entry grace-period work, which, because CONFIG_RCU_FAST_NO_HZ=n,
- * is nothing.
- */
-static void rcu_prepare_for_idle(void)
-{
-}
-
-#else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */
-
-/*
- * This code is invoked when a CPU goes idle, at which point we want
- * to have the CPU do everything required for RCU so that it can enter
- * the energy-efficient dyntick-idle mode.
- *
- * The following preprocessor symbol controls this:
- *
- * RCU_IDLE_GP_DELAY gives the number of jiffies that a CPU is permitted
- * to sleep in dyntick-idle mode with RCU callbacks pending. This
- * is sized to be roughly one RCU grace period. Those energy-efficiency
- * benchmarkers who might otherwise be tempted to set this to a large
- * number, be warned: Setting RCU_IDLE_GP_DELAY too high can hang your
- * system. And if you are -that- concerned about energy efficiency,
- * just power the system down and be done with it!
- *
- * The value below works well in practice. If future workloads require
- * adjustment, they can be converted into kernel config parameters, though
- * making the state machine smarter might be a better option.
- */
-#define RCU_IDLE_GP_DELAY 4 /* Roughly one grace period. */
-
-static int rcu_idle_gp_delay = RCU_IDLE_GP_DELAY;
-module_param(rcu_idle_gp_delay, int, 0644);
-
-/*
- * Try to advance callbacks on the current CPU, but only if it has been
- * awhile since the last time we did so. Afterwards, if there are any
- * callbacks ready for immediate invocation, return true.
- */
-static bool __maybe_unused rcu_try_advance_all_cbs(void)
-{
- bool cbs_ready = false;
- struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
- struct rcu_node *rnp;
-
- /* Exit early if we advanced recently. */
- if (jiffies == rdp->last_advance_all)
- return false;
- rdp->last_advance_all = jiffies;
-
- rnp = rdp->mynode;
-
- /*
- * Don't bother checking unless a grace period has
- * completed since we last checked and there are
- * callbacks not yet ready to invoke.
- */
- if ((rcu_seq_completed_gp(rdp->gp_seq,
- rcu_seq_current(&rnp->gp_seq)) ||
- unlikely(READ_ONCE(rdp->gpwrap))) &&
- rcu_segcblist_pend_cbs(&rdp->cblist))
- note_gp_changes(rdp);
-
- if (rcu_segcblist_ready_cbs(&rdp->cblist))
- cbs_ready = true;
- return cbs_ready;
-}
-
-/*
- * Allow the CPU to enter dyntick-idle mode unless it has callbacks ready
- * to invoke. If the CPU has callbacks, try to advance them. Tell the
- * caller about what to set the timeout.
- *
- * The caller must have disabled interrupts.
- */
-int rcu_needs_cpu(u64 basemono, u64 *nextevt)
-{
- struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
- unsigned long dj;
-
- lockdep_assert_irqs_disabled();
-
- /* If no non-offloaded callbacks, RCU doesn't need the CPU. */
- if (rcu_segcblist_empty(&rdp->cblist) ||
- rcu_rdp_is_offloaded(rdp)) {
- *nextevt = KTIME_MAX;
- return 0;
- }
-
- /* Attempt to advance callbacks. */
- if (rcu_try_advance_all_cbs()) {
- /* Some ready to invoke, so initiate later invocation. */
- invoke_rcu_core();
- return 1;
- }
- rdp->last_accelerate = jiffies;
-
- /* Request timer and round. */
- dj = round_up(rcu_idle_gp_delay + jiffies, rcu_idle_gp_delay) - jiffies;
-
- *nextevt = basemono + dj * TICK_NSEC;
- return 0;
-}
-
-/*
- * Prepare a CPU for idle from an RCU perspective. The first major task is to
- * sense whether nohz mode has been enabled or disabled via sysfs. The second
- * major task is to accelerate (that is, assign grace-period numbers to) any
- * recently arrived callbacks.
- *
- * The caller must have disabled interrupts.
- */
-static void rcu_prepare_for_idle(void)
-{
- bool needwake;
- struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
- struct rcu_node *rnp;
- int tne;
-
- lockdep_assert_irqs_disabled();
- if (rcu_rdp_is_offloaded(rdp))
- return;
-
- /* Handle nohz enablement switches conservatively. */
- tne = READ_ONCE(tick_nohz_active);
- if (tne != rdp->tick_nohz_enabled_snap) {
- if (!rcu_segcblist_empty(&rdp->cblist))
- invoke_rcu_core(); /* force nohz to see update. */
- rdp->tick_nohz_enabled_snap = tne;
- return;
- }
- if (!tne)
- return;
-
- /*
- * If we have not yet accelerated this jiffy, accelerate all
- * callbacks on this CPU.
- */
- if (rdp->last_accelerate == jiffies)
- return;
- rdp->last_accelerate = jiffies;
- if (rcu_segcblist_pend_cbs(&rdp->cblist)) {
- rnp = rdp->mynode;
- raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
- needwake = rcu_accelerate_cbs(rnp, rdp);
- raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
- if (needwake)
- rcu_gp_kthread_wake();
- }
-}
-
-/*
- * Clean up for exit from idle. Attempt to advance callbacks based on
- * any grace periods that elapsed while the CPU was idle, and if any
- * callbacks are now ready to invoke, initiate invocation.
- */
-static void rcu_cleanup_after_idle(void)
-{
- struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
-
- lockdep_assert_irqs_disabled();
- if (rcu_rdp_is_offloaded(rdp))
- return;
- if (rcu_try_advance_all_cbs())
- invoke_rcu_core();
-}
-
-#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
-
-#ifdef CONFIG_RCU_NOCB_CPU
-
-/*
- * Offload callback processing from the boot-time-specified set of CPUs
- * specified by rcu_nocb_mask. For the CPUs in the set, there are kthreads
- * created that pull the callbacks from the corresponding CPU, wait for
- * a grace period to elapse, and invoke the callbacks. These kthreads
- * are organized into GP kthreads, which manage incoming callbacks, wait for
- * grace periods, and awaken CB kthreads, and the CB kthreads, which only
- * invoke callbacks. Each GP kthread invokes its own CBs. The no-CBs CPUs
- * do a wake_up() on their GP kthread when they insert a callback into any
- * empty list, unless the rcu_nocb_poll boot parameter has been specified,
- * in which case each kthread actively polls its CPU. (Which isn't so great
- * for energy efficiency, but which does reduce RCU's overhead on that CPU.)
- *
- * This is intended to be used in conjunction with Frederic Weisbecker's
- * adaptive-idle work, which would seriously reduce OS jitter on CPUs
- * running CPU-bound user-mode computations.
- *
- * Offloading of callbacks can also be used as an energy-efficiency
- * measure because CPUs with no RCU callbacks queued are more aggressive
- * about entering dyntick-idle mode.
- */
-
-
-/*
- * Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters.
- * If the list is invalid, a warning is emitted and all CPUs are offloaded.
- */
-static int __init rcu_nocb_setup(char *str)
-{
- alloc_bootmem_cpumask_var(&rcu_nocb_mask);
- if (cpulist_parse(str, rcu_nocb_mask)) {
- pr_warn("rcu_nocbs= bad CPU range, all CPUs set\n");
- cpumask_setall(rcu_nocb_mask);
- }
- return 1;
-}
-__setup("rcu_nocbs=", rcu_nocb_setup);
-
-static int __init parse_rcu_nocb_poll(char *arg)
-{
- rcu_nocb_poll = true;
- return 0;
-}
-early_param("rcu_nocb_poll", parse_rcu_nocb_poll);
-
-/*
- * Don't bother bypassing ->cblist if the call_rcu() rate is low.
- * After all, the main point of bypassing is to avoid lock contention
- * on ->nocb_lock, which only can happen at high call_rcu() rates.
- */
-static int nocb_nobypass_lim_per_jiffy = 16 * 1000 / HZ;
-module_param(nocb_nobypass_lim_per_jiffy, int, 0);
-
-/*
- * Acquire the specified rcu_data structure's ->nocb_bypass_lock. If the
- * lock isn't immediately available, increment ->nocb_lock_contended to
- * flag the contention.
- */
-static void rcu_nocb_bypass_lock(struct rcu_data *rdp)
- __acquires(&rdp->nocb_bypass_lock)
-{
- lockdep_assert_irqs_disabled();
- if (raw_spin_trylock(&rdp->nocb_bypass_lock))
- return;
- atomic_inc(&rdp->nocb_lock_contended);
- WARN_ON_ONCE(smp_processor_id() != rdp->cpu);
- smp_mb__after_atomic(); /* atomic_inc() before lock. */
- raw_spin_lock(&rdp->nocb_bypass_lock);
- smp_mb__before_atomic(); /* atomic_dec() after lock. */
- atomic_dec(&rdp->nocb_lock_contended);
-}
-
-/*
- * Spinwait until the specified rcu_data structure's ->nocb_lock is
- * not contended. Please note that this is extremely special-purpose,
- * relying on the fact that at most two kthreads and one CPU contend for
- * this lock, and also that the two kthreads are guaranteed to have frequent
- * grace-period-duration time intervals between successive acquisitions
- * of the lock. This allows us to use an extremely simple throttling
- * mechanism, and further to apply it only to the CPU doing floods of
- * call_rcu() invocations. Don't try this at home!
- */
-static void rcu_nocb_wait_contended(struct rcu_data *rdp)
-{
- WARN_ON_ONCE(smp_processor_id() != rdp->cpu);
- while (WARN_ON_ONCE(atomic_read(&rdp->nocb_lock_contended)))
- cpu_relax();
-}
-
-/*
- * Conditionally acquire the specified rcu_data structure's
- * ->nocb_bypass_lock.
- */
-static bool rcu_nocb_bypass_trylock(struct rcu_data *rdp)
-{
- lockdep_assert_irqs_disabled();
- return raw_spin_trylock(&rdp->nocb_bypass_lock);
-}
-
-/*
- * Release the specified rcu_data structure's ->nocb_bypass_lock.
- */
-static void rcu_nocb_bypass_unlock(struct rcu_data *rdp)
- __releases(&rdp->nocb_bypass_lock)
-{
- lockdep_assert_irqs_disabled();
- raw_spin_unlock(&rdp->nocb_bypass_lock);
-}
-
-/*
- * Acquire the specified rcu_data structure's ->nocb_lock, but only
- * if it corresponds to a no-CBs CPU.
- */
-static void rcu_nocb_lock(struct rcu_data *rdp)
-{
- lockdep_assert_irqs_disabled();
- if (!rcu_rdp_is_offloaded(rdp))
- return;
- raw_spin_lock(&rdp->nocb_lock);
-}
-
-/*
- * Release the specified rcu_data structure's ->nocb_lock, but only
- * if it corresponds to a no-CBs CPU.
- */
-static void rcu_nocb_unlock(struct rcu_data *rdp)
-{
- if (rcu_rdp_is_offloaded(rdp)) {
- lockdep_assert_irqs_disabled();
- raw_spin_unlock(&rdp->nocb_lock);
- }
-}
-
-/*
- * Release the specified rcu_data structure's ->nocb_lock and restore
- * interrupts, but only if it corresponds to a no-CBs CPU.
- */
-static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp,
- unsigned long flags)
-{
- if (rcu_rdp_is_offloaded(rdp)) {
- lockdep_assert_irqs_disabled();
- raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
- } else {
- local_irq_restore(flags);
- }
-}
-
-/* Lockdep check that ->cblist may be safely accessed. */
-static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp)
-{
- lockdep_assert_irqs_disabled();
- if (rcu_rdp_is_offloaded(rdp))
- lockdep_assert_held(&rdp->nocb_lock);
-}
-
-/*
- * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended
- * grace period.
- */
-static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq)
-{
- swake_up_all(sq);
-}
-
-static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp)
-{
- return &rnp->nocb_gp_wq[rcu_seq_ctr(rnp->gp_seq) & 0x1];
-}
-
-static void rcu_init_one_nocb(struct rcu_node *rnp)
-{
- init_swait_queue_head(&rnp->nocb_gp_wq[0]);
- init_swait_queue_head(&rnp->nocb_gp_wq[1]);
-}
-
-/* Is the specified CPU a no-CBs CPU? */
-bool rcu_is_nocb_cpu(int cpu)
-{
- if (cpumask_available(rcu_nocb_mask))
- return cpumask_test_cpu(cpu, rcu_nocb_mask);
- return false;
-}
-
-static bool __wake_nocb_gp(struct rcu_data *rdp_gp,
- struct rcu_data *rdp,
- bool force, unsigned long flags)
- __releases(rdp_gp->nocb_gp_lock)
-{
- bool needwake = false;
-
- if (!READ_ONCE(rdp_gp->nocb_gp_kthread)) {
- raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
- TPS("AlreadyAwake"));
- return false;
- }
-
- if (rdp_gp->nocb_defer_wakeup > RCU_NOCB_WAKE_NOT) {
- WRITE_ONCE(rdp_gp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
- del_timer(&rdp_gp->nocb_timer);
- }
-
- if (force || READ_ONCE(rdp_gp->nocb_gp_sleep)) {
- WRITE_ONCE(rdp_gp->nocb_gp_sleep, false);
- needwake = true;
- }
- raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
- if (needwake) {
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DoWake"));
- wake_up_process(rdp_gp->nocb_gp_kthread);
- }
-
- return needwake;
-}
-
-/*
- * Kick the GP kthread for this NOCB group.
- */
-static bool wake_nocb_gp(struct rcu_data *rdp, bool force)
-{
- unsigned long flags;
- struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
-
- raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
- return __wake_nocb_gp(rdp_gp, rdp, force, flags);
-}
-
-/*
- * Arrange to wake the GP kthread for this NOCB group at some future
- * time when it is safe to do so.
- */
-static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype,
- const char *reason)
-{
- unsigned long flags;
- struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
-
- raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
-
- /*
- * Bypass wakeup overrides previous deferments. In case
- * of callback storm, no need to wake up too early.
- */
- if (waketype == RCU_NOCB_WAKE_BYPASS) {
- mod_timer(&rdp_gp->nocb_timer, jiffies + 2);
- WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype);
- } else {
- if (rdp_gp->nocb_defer_wakeup < RCU_NOCB_WAKE)
- mod_timer(&rdp_gp->nocb_timer, jiffies + 1);
- if (rdp_gp->nocb_defer_wakeup < waketype)
- WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype);
- }
-
- raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
-
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, reason);
-}
-
-/*
- * Flush the ->nocb_bypass queue into ->cblist, enqueuing rhp if non-NULL.
- * However, if there is a callback to be enqueued and if ->nocb_bypass
- * proves to be initially empty, just return false because the no-CB GP
- * kthread may need to be awakened in this case.
- *
- * Note that this function always returns true if rhp is NULL.
- */
-static bool rcu_nocb_do_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
- unsigned long j)
-{
- struct rcu_cblist rcl;
-
- WARN_ON_ONCE(!rcu_rdp_is_offloaded(rdp));
- rcu_lockdep_assert_cblist_protected(rdp);
- lockdep_assert_held(&rdp->nocb_bypass_lock);
- if (rhp && !rcu_cblist_n_cbs(&rdp->nocb_bypass)) {
- raw_spin_unlock(&rdp->nocb_bypass_lock);
- return false;
- }
- /* Note: ->cblist.len already accounts for ->nocb_bypass contents. */
- if (rhp)
- rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */
- rcu_cblist_flush_enqueue(&rcl, &rdp->nocb_bypass, rhp);
- rcu_segcblist_insert_pend_cbs(&rdp->cblist, &rcl);
- WRITE_ONCE(rdp->nocb_bypass_first, j);
- rcu_nocb_bypass_unlock(rdp);
- return true;
-}
-
-/*
- * Flush the ->nocb_bypass queue into ->cblist, enqueuing rhp if non-NULL.
- * However, if there is a callback to be enqueued and if ->nocb_bypass
- * proves to be initially empty, just return false because the no-CB GP
- * kthread may need to be awakened in this case.
- *
- * Note that this function always returns true if rhp is NULL.
- */
-static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
- unsigned long j)
-{
- if (!rcu_rdp_is_offloaded(rdp))
- return true;
- rcu_lockdep_assert_cblist_protected(rdp);
- rcu_nocb_bypass_lock(rdp);
- return rcu_nocb_do_flush_bypass(rdp, rhp, j);
-}
-
-/*
- * If the ->nocb_bypass_lock is immediately available, flush the
- * ->nocb_bypass queue into ->cblist.
- */
-static void rcu_nocb_try_flush_bypass(struct rcu_data *rdp, unsigned long j)
-{
- rcu_lockdep_assert_cblist_protected(rdp);
- if (!rcu_rdp_is_offloaded(rdp) ||
- !rcu_nocb_bypass_trylock(rdp))
- return;
- WARN_ON_ONCE(!rcu_nocb_do_flush_bypass(rdp, NULL, j));
-}
-
-/*
- * See whether it is appropriate to use the ->nocb_bypass list in order
- * to control contention on ->nocb_lock. A limited number of direct
- * enqueues are permitted into ->cblist per jiffy. If ->nocb_bypass
- * is non-empty, further callbacks must be placed into ->nocb_bypass,
- * otherwise rcu_barrier() breaks. Use rcu_nocb_flush_bypass() to switch
- * back to direct use of ->cblist. However, ->nocb_bypass should not be
- * used if ->cblist is empty, because otherwise callbacks can be stranded
- * on ->nocb_bypass because we cannot count on the current CPU ever again
- * invoking call_rcu(). The general rule is that if ->nocb_bypass is
- * non-empty, the corresponding no-CBs grace-period kthread must not be
- * in an indefinite sleep state.
- *
- * Finally, it is not permitted to use the bypass during early boot,
- * as doing so would confuse the auto-initialization code. Besides
- * which, there is no point in worrying about lock contention while
- * there is only one CPU in operation.
- */
-static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
- bool *was_alldone, unsigned long flags)
-{
- unsigned long c;
- unsigned long cur_gp_seq;
- unsigned long j = jiffies;
- long ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
-
- lockdep_assert_irqs_disabled();
-
- // Pure softirq/rcuc based processing: no bypassing, no
- // locking.
- if (!rcu_rdp_is_offloaded(rdp)) {
- *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
- return false;
- }
-
- // In the process of (de-)offloading: no bypassing, but
- // locking.
- if (!rcu_segcblist_completely_offloaded(&rdp->cblist)) {
- rcu_nocb_lock(rdp);
- *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
- return false; /* Not offloaded, no bypassing. */
- }
-
- // Don't use ->nocb_bypass during early boot.
- if (rcu_scheduler_active != RCU_SCHEDULER_RUNNING) {
- rcu_nocb_lock(rdp);
- WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
- *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
- return false;
- }
-
- // If we have advanced to a new jiffy, reset counts to allow
- // moving back from ->nocb_bypass to ->cblist.
- if (j == rdp->nocb_nobypass_last) {
- c = rdp->nocb_nobypass_count + 1;
- } else {
- WRITE_ONCE(rdp->nocb_nobypass_last, j);
- c = rdp->nocb_nobypass_count - nocb_nobypass_lim_per_jiffy;
- if (ULONG_CMP_LT(rdp->nocb_nobypass_count,
- nocb_nobypass_lim_per_jiffy))
- c = 0;
- else if (c > nocb_nobypass_lim_per_jiffy)
- c = nocb_nobypass_lim_per_jiffy;
- }
- WRITE_ONCE(rdp->nocb_nobypass_count, c);
-
- // If there hasn't yet been all that many ->cblist enqueues
- // this jiffy, tell the caller to enqueue onto ->cblist. But flush
- // ->nocb_bypass first.
- if (rdp->nocb_nobypass_count < nocb_nobypass_lim_per_jiffy) {
- rcu_nocb_lock(rdp);
- *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
- if (*was_alldone)
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
- TPS("FirstQ"));
- WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, j));
- WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
- return false; // Caller must enqueue the callback.
- }
-
- // If ->nocb_bypass has been used too long or is too full,
- // flush ->nocb_bypass to ->cblist.
- if ((ncbs && j != READ_ONCE(rdp->nocb_bypass_first)) ||
- ncbs >= qhimark) {
- rcu_nocb_lock(rdp);
- if (!rcu_nocb_flush_bypass(rdp, rhp, j)) {
- *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
- if (*was_alldone)
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
- TPS("FirstQ"));
- WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
- return false; // Caller must enqueue the callback.
- }
- if (j != rdp->nocb_gp_adv_time &&
- rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
- rcu_seq_done(&rdp->mynode->gp_seq, cur_gp_seq)) {
- rcu_advance_cbs_nowake(rdp->mynode, rdp);
- rdp->nocb_gp_adv_time = j;
- }
- rcu_nocb_unlock_irqrestore(rdp, flags);
- return true; // Callback already enqueued.
- }
-
- // We need to use the bypass.
- rcu_nocb_wait_contended(rdp);
- rcu_nocb_bypass_lock(rdp);
- ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
- rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */
- rcu_cblist_enqueue(&rdp->nocb_bypass, rhp);
- if (!ncbs) {
- WRITE_ONCE(rdp->nocb_bypass_first, j);
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("FirstBQ"));
- }
- rcu_nocb_bypass_unlock(rdp);
- smp_mb(); /* Order enqueue before wake. */
- if (ncbs) {
- local_irq_restore(flags);
- } else {
- // No-CBs GP kthread might be indefinitely asleep, if so, wake.
- rcu_nocb_lock(rdp); // Rare during call_rcu() flood.
- if (!rcu_segcblist_pend_cbs(&rdp->cblist)) {
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
- TPS("FirstBQwake"));
- __call_rcu_nocb_wake(rdp, true, flags);
- } else {
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
- TPS("FirstBQnoWake"));
- rcu_nocb_unlock_irqrestore(rdp, flags);
- }
- }
- return true; // Callback already enqueued.
-}
-
-/*
- * Awaken the no-CBs grace-period kthread if needed, either due to it
- * legitimately being asleep or due to overload conditions.
- *
- * If warranted, also wake up the kthread servicing this CPUs queues.
- */
-static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone,
- unsigned long flags)
- __releases(rdp->nocb_lock)
-{
- unsigned long cur_gp_seq;
- unsigned long j;
- long len;
- struct task_struct *t;
-
- // If we are being polled or there is no kthread, just leave.
- t = READ_ONCE(rdp->nocb_gp_kthread);
- if (rcu_nocb_poll || !t) {
- rcu_nocb_unlock_irqrestore(rdp, flags);
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
- TPS("WakeNotPoll"));
- return;
- }
- // Need to actually to a wakeup.
- len = rcu_segcblist_n_cbs(&rdp->cblist);
- if (was_alldone) {
- rdp->qlen_last_fqs_check = len;
- if (!irqs_disabled_flags(flags)) {
- /* ... if queue was empty ... */
- rcu_nocb_unlock_irqrestore(rdp, flags);
- wake_nocb_gp(rdp, false);
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
- TPS("WakeEmpty"));
- } else {
- rcu_nocb_unlock_irqrestore(rdp, flags);
- wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE,
- TPS("WakeEmptyIsDeferred"));
- }
- } else if (len > rdp->qlen_last_fqs_check + qhimark) {
- /* ... or if many callbacks queued. */
- rdp->qlen_last_fqs_check = len;
- j = jiffies;
- if (j != rdp->nocb_gp_adv_time &&
- rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
- rcu_seq_done(&rdp->mynode->gp_seq, cur_gp_seq)) {
- rcu_advance_cbs_nowake(rdp->mynode, rdp);
- rdp->nocb_gp_adv_time = j;
- }
- smp_mb(); /* Enqueue before timer_pending(). */
- if ((rdp->nocb_cb_sleep ||
- !rcu_segcblist_ready_cbs(&rdp->cblist)) &&
- !timer_pending(&rdp->nocb_timer)) {
- rcu_nocb_unlock_irqrestore(rdp, flags);
- wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_FORCE,
- TPS("WakeOvfIsDeferred"));
- } else {
- rcu_nocb_unlock_irqrestore(rdp, flags);
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot"));
- }
- } else {
- rcu_nocb_unlock_irqrestore(rdp, flags);
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot"));
- }
- return;
-}
-
-/*
- * Check if we ignore this rdp.
- *
- * We check that without holding the nocb lock but
- * we make sure not to miss a freshly offloaded rdp
- * with the current ordering:
- *
- * rdp_offload_toggle() nocb_gp_enabled_cb()
- * ------------------------- ----------------------------
- * WRITE flags LOCK nocb_gp_lock
- * LOCK nocb_gp_lock READ/WRITE nocb_gp_sleep
- * READ/WRITE nocb_gp_sleep UNLOCK nocb_gp_lock
- * UNLOCK nocb_gp_lock READ flags
- */
-static inline bool nocb_gp_enabled_cb(struct rcu_data *rdp)
-{
- u8 flags = SEGCBLIST_OFFLOADED | SEGCBLIST_KTHREAD_GP;
-
- return rcu_segcblist_test_flags(&rdp->cblist, flags);
-}
-
-static inline bool nocb_gp_update_state_deoffloading(struct rcu_data *rdp,
- bool *needwake_state)
-{
- struct rcu_segcblist *cblist = &rdp->cblist;
-
- if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) {
- if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) {
- rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_GP);
- if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB))
- *needwake_state = true;
- }
- return false;
- }
-
- /*
- * De-offloading. Clear our flag and notify the de-offload worker.
- * We will ignore this rdp until it ever gets re-offloaded.
- */
- WARN_ON_ONCE(!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP));
- rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_GP);
- if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB))
- *needwake_state = true;
- return true;
-}
-
-
-/*
- * No-CBs GP kthreads come here to wait for additional callbacks to show up
- * or for grace periods to end.
- */
-static void nocb_gp_wait(struct rcu_data *my_rdp)
-{
- bool bypass = false;
- long bypass_ncbs;
- int __maybe_unused cpu = my_rdp->cpu;
- unsigned long cur_gp_seq;
- unsigned long flags;
- bool gotcbs = false;
- unsigned long j = jiffies;
- bool needwait_gp = false; // This prevents actual uninitialized use.
- bool needwake;
- bool needwake_gp;
- struct rcu_data *rdp;
- struct rcu_node *rnp;
- unsigned long wait_gp_seq = 0; // Suppress "use uninitialized" warning.
- bool wasempty = false;
-
- /*
- * Each pass through the following loop checks for CBs and for the
- * nearest grace period (if any) to wait for next. The CB kthreads
- * and the global grace-period kthread are awakened if needed.
- */
- WARN_ON_ONCE(my_rdp->nocb_gp_rdp != my_rdp);
- for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_cb_rdp) {
- bool needwake_state = false;
-
- if (!nocb_gp_enabled_cb(rdp))
- continue;
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Check"));
- rcu_nocb_lock_irqsave(rdp, flags);
- if (nocb_gp_update_state_deoffloading(rdp, &needwake_state)) {
- rcu_nocb_unlock_irqrestore(rdp, flags);
- if (needwake_state)
- swake_up_one(&rdp->nocb_state_wq);
- continue;
- }
- bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
- if (bypass_ncbs &&
- (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + 1) ||
- bypass_ncbs > 2 * qhimark)) {
- // Bypass full or old, so flush it.
- (void)rcu_nocb_try_flush_bypass(rdp, j);
- bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
- } else if (!bypass_ncbs && rcu_segcblist_empty(&rdp->cblist)) {
- rcu_nocb_unlock_irqrestore(rdp, flags);
- if (needwake_state)
- swake_up_one(&rdp->nocb_state_wq);
- continue; /* No callbacks here, try next. */
- }
- if (bypass_ncbs) {
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
- TPS("Bypass"));
- bypass = true;
- }
- rnp = rdp->mynode;
-
- // Advance callbacks if helpful and low contention.
- needwake_gp = false;
- if (!rcu_segcblist_restempty(&rdp->cblist,
- RCU_NEXT_READY_TAIL) ||
- (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
- rcu_seq_done(&rnp->gp_seq, cur_gp_seq))) {
- raw_spin_lock_rcu_node(rnp); /* irqs disabled. */
- needwake_gp = rcu_advance_cbs(rnp, rdp);
- wasempty = rcu_segcblist_restempty(&rdp->cblist,
- RCU_NEXT_READY_TAIL);
- raw_spin_unlock_rcu_node(rnp); /* irqs disabled. */
- }
- // Need to wait on some grace period?
- WARN_ON_ONCE(wasempty &&
- !rcu_segcblist_restempty(&rdp->cblist,
- RCU_NEXT_READY_TAIL));
- if (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq)) {
- if (!needwait_gp ||
- ULONG_CMP_LT(cur_gp_seq, wait_gp_seq))
- wait_gp_seq = cur_gp_seq;
- needwait_gp = true;
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
- TPS("NeedWaitGP"));
- }
- if (rcu_segcblist_ready_cbs(&rdp->cblist)) {
- needwake = rdp->nocb_cb_sleep;
- WRITE_ONCE(rdp->nocb_cb_sleep, false);
- smp_mb(); /* CB invocation -after- GP end. */
- } else {
- needwake = false;
- }
- rcu_nocb_unlock_irqrestore(rdp, flags);
- if (needwake) {
- swake_up_one(&rdp->nocb_cb_wq);
- gotcbs = true;
- }
- if (needwake_gp)
- rcu_gp_kthread_wake();
- if (needwake_state)
- swake_up_one(&rdp->nocb_state_wq);
- }
-
- my_rdp->nocb_gp_bypass = bypass;
- my_rdp->nocb_gp_gp = needwait_gp;
- my_rdp->nocb_gp_seq = needwait_gp ? wait_gp_seq : 0;
-
- if (bypass && !rcu_nocb_poll) {
- // At least one child with non-empty ->nocb_bypass, so set
- // timer in order to avoid stranding its callbacks.
- wake_nocb_gp_defer(my_rdp, RCU_NOCB_WAKE_BYPASS,
- TPS("WakeBypassIsDeferred"));
- }
- if (rcu_nocb_poll) {
- /* Polling, so trace if first poll in the series. */
- if (gotcbs)
- trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Poll"));
- schedule_timeout_idle(1);
- } else if (!needwait_gp) {
- /* Wait for callbacks to appear. */
- trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Sleep"));
- swait_event_interruptible_exclusive(my_rdp->nocb_gp_wq,
- !READ_ONCE(my_rdp->nocb_gp_sleep));
- trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("EndSleep"));
- } else {
- rnp = my_rdp->mynode;
- trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq, TPS("StartWait"));
- swait_event_interruptible_exclusive(
- rnp->nocb_gp_wq[rcu_seq_ctr(wait_gp_seq) & 0x1],
- rcu_seq_done(&rnp->gp_seq, wait_gp_seq) ||
- !READ_ONCE(my_rdp->nocb_gp_sleep));
- trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq, TPS("EndWait"));
- }
- if (!rcu_nocb_poll) {
- raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags);
- if (my_rdp->nocb_defer_wakeup > RCU_NOCB_WAKE_NOT) {
- WRITE_ONCE(my_rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
- del_timer(&my_rdp->nocb_timer);
- }
- WRITE_ONCE(my_rdp->nocb_gp_sleep, true);
- raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags);
- }
- my_rdp->nocb_gp_seq = -1;
- WARN_ON(signal_pending(current));
-}
-
-/*
- * No-CBs grace-period-wait kthread. There is one of these per group
- * of CPUs, but only once at least one CPU in that group has come online
- * at least once since boot. This kthread checks for newly posted
- * callbacks from any of the CPUs it is responsible for, waits for a
- * grace period, then awakens all of the rcu_nocb_cb_kthread() instances
- * that then have callback-invocation work to do.
- */
-static int rcu_nocb_gp_kthread(void *arg)
-{
- struct rcu_data *rdp = arg;
-
- for (;;) {
- WRITE_ONCE(rdp->nocb_gp_loops, rdp->nocb_gp_loops + 1);
- nocb_gp_wait(rdp);
- cond_resched_tasks_rcu_qs();
- }
- return 0;
-}
-
-static inline bool nocb_cb_can_run(struct rcu_data *rdp)
-{
- u8 flags = SEGCBLIST_OFFLOADED | SEGCBLIST_KTHREAD_CB;
- return rcu_segcblist_test_flags(&rdp->cblist, flags);
-}
-
-static inline bool nocb_cb_wait_cond(struct rcu_data *rdp)
-{
- return nocb_cb_can_run(rdp) && !READ_ONCE(rdp->nocb_cb_sleep);
-}
-
-/*
- * Invoke any ready callbacks from the corresponding no-CBs CPU,
- * then, if there are no more, wait for more to appear.
- */
-static void nocb_cb_wait(struct rcu_data *rdp)
-{
- struct rcu_segcblist *cblist = &rdp->cblist;
- unsigned long cur_gp_seq;
- unsigned long flags;
- bool needwake_state = false;
- bool needwake_gp = false;
- bool can_sleep = true;
- struct rcu_node *rnp = rdp->mynode;
-
- local_irq_save(flags);
- rcu_momentary_dyntick_idle();
- local_irq_restore(flags);
- /*
- * Disable BH to provide the expected environment. Also, when
- * transitioning to/from NOCB mode, a self-requeuing callback might
- * be invoked from softirq. A short grace period could cause both
- * instances of this callback would execute concurrently.
- */
- local_bh_disable();
- rcu_do_batch(rdp);
- local_bh_enable();
- lockdep_assert_irqs_enabled();
- rcu_nocb_lock_irqsave(rdp, flags);
- if (rcu_segcblist_nextgp(cblist, &cur_gp_seq) &&
- rcu_seq_done(&rnp->gp_seq, cur_gp_seq) &&
- raw_spin_trylock_rcu_node(rnp)) { /* irqs already disabled. */
- needwake_gp = rcu_advance_cbs(rdp->mynode, rdp);
- raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
- }
-
- if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) {
- if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) {
- rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_CB);
- if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP))
- needwake_state = true;
- }
- if (rcu_segcblist_ready_cbs(cblist))
- can_sleep = false;
- } else {
- /*
- * De-offloading. Clear our flag and notify the de-offload worker.
- * We won't touch the callbacks and keep sleeping until we ever
- * get re-offloaded.
- */
- WARN_ON_ONCE(!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB));
- rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_CB);
- if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP))
- needwake_state = true;
- }
-
- WRITE_ONCE(rdp->nocb_cb_sleep, can_sleep);
-
- if (rdp->nocb_cb_sleep)
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("CBSleep"));
-
- rcu_nocb_unlock_irqrestore(rdp, flags);
- if (needwake_gp)
- rcu_gp_kthread_wake();
-
- if (needwake_state)
- swake_up_one(&rdp->nocb_state_wq);
-
- do {
- swait_event_interruptible_exclusive(rdp->nocb_cb_wq,
- nocb_cb_wait_cond(rdp));
-
- // VVV Ensure CB invocation follows _sleep test.
- if (smp_load_acquire(&rdp->nocb_cb_sleep)) { // ^^^
- WARN_ON(signal_pending(current));
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty"));
- }
- } while (!nocb_cb_can_run(rdp));
-}
-
-/*
- * Per-rcu_data kthread, but only for no-CBs CPUs. Repeatedly invoke
- * nocb_cb_wait() to do the dirty work.
- */
-static int rcu_nocb_cb_kthread(void *arg)
-{
- struct rcu_data *rdp = arg;
-
- // Each pass through this loop does one callback batch, and,
- // if there are no more ready callbacks, waits for them.
- for (;;) {
- nocb_cb_wait(rdp);
- cond_resched_tasks_rcu_qs();
- }
- return 0;
-}
-
-/* Is a deferred wakeup of rcu_nocb_kthread() required? */
-static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp, int level)
-{
- return READ_ONCE(rdp->nocb_defer_wakeup) >= level;
-}
-
-/* Do a deferred wakeup of rcu_nocb_kthread(). */
-static bool do_nocb_deferred_wakeup_common(struct rcu_data *rdp_gp,
- struct rcu_data *rdp, int level,
- unsigned long flags)
- __releases(rdp_gp->nocb_gp_lock)
-{
- int ndw;
- int ret;
-
- if (!rcu_nocb_need_deferred_wakeup(rdp_gp, level)) {
- raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
- return false;
- }
-
- ndw = rdp_gp->nocb_defer_wakeup;
- ret = __wake_nocb_gp(rdp_gp, rdp, ndw == RCU_NOCB_WAKE_FORCE, flags);
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DeferredWake"));
-
- return ret;
-}
-
-/* Do a deferred wakeup of rcu_nocb_kthread() from a timer handler. */
-static void do_nocb_deferred_wakeup_timer(struct timer_list *t)
-{
- unsigned long flags;
- struct rcu_data *rdp = from_timer(rdp, t, nocb_timer);
-
- WARN_ON_ONCE(rdp->nocb_gp_rdp != rdp);
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Timer"));
-
- raw_spin_lock_irqsave(&rdp->nocb_gp_lock, flags);
- smp_mb__after_spinlock(); /* Timer expire before wakeup. */
- do_nocb_deferred_wakeup_common(rdp, rdp, RCU_NOCB_WAKE_BYPASS, flags);
-}
-
-/*
- * Do a deferred wakeup of rcu_nocb_kthread() from fastpath.
- * This means we do an inexact common-case check. Note that if
- * we miss, ->nocb_timer will eventually clean things up.
- */
-static bool do_nocb_deferred_wakeup(struct rcu_data *rdp)
-{
- unsigned long flags;
- struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
-
- if (!rdp_gp || !rcu_nocb_need_deferred_wakeup(rdp_gp, RCU_NOCB_WAKE))
- return false;
-
- raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
- return do_nocb_deferred_wakeup_common(rdp_gp, rdp, RCU_NOCB_WAKE, flags);
-}
-
-void rcu_nocb_flush_deferred_wakeup(void)
-{
- do_nocb_deferred_wakeup(this_cpu_ptr(&rcu_data));
-}
-EXPORT_SYMBOL_GPL(rcu_nocb_flush_deferred_wakeup);
-
-static int rdp_offload_toggle(struct rcu_data *rdp,
- bool offload, unsigned long flags)
- __releases(rdp->nocb_lock)
-{
- struct rcu_segcblist *cblist = &rdp->cblist;
- struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
- bool wake_gp = false;
-
- rcu_segcblist_offload(cblist, offload);
-
- if (rdp->nocb_cb_sleep)
- rdp->nocb_cb_sleep = false;
- rcu_nocb_unlock_irqrestore(rdp, flags);
-
- /*
- * Ignore former value of nocb_cb_sleep and force wake up as it could
- * have been spuriously set to false already.
- */
- swake_up_one(&rdp->nocb_cb_wq);
-
- raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
- if (rdp_gp->nocb_gp_sleep) {
- rdp_gp->nocb_gp_sleep = false;
- wake_gp = true;
- }
- raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
-
- if (wake_gp)
- wake_up_process(rdp_gp->nocb_gp_kthread);
-
- return 0;
-}
-
-static long rcu_nocb_rdp_deoffload(void *arg)
-{
- struct rcu_data *rdp = arg;
- struct rcu_segcblist *cblist = &rdp->cblist;
- unsigned long flags;
- int ret;
-
- WARN_ON_ONCE(rdp->cpu != raw_smp_processor_id());
-
- pr_info("De-offloading %d\n", rdp->cpu);
-
- rcu_nocb_lock_irqsave(rdp, flags);
- /*
- * Flush once and for all now. This suffices because we are
- * running on the target CPU holding ->nocb_lock (thus having
- * interrupts disabled), and because rdp_offload_toggle()
- * invokes rcu_segcblist_offload(), which clears SEGCBLIST_OFFLOADED.
- * Thus future calls to rcu_segcblist_completely_offloaded() will
- * return false, which means that future calls to rcu_nocb_try_bypass()
- * will refuse to put anything into the bypass.
- */
- WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies));
- ret = rdp_offload_toggle(rdp, false, flags);
- swait_event_exclusive(rdp->nocb_state_wq,
- !rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB |
- SEGCBLIST_KTHREAD_GP));
- /*
- * Lock one last time to acquire latest callback updates from kthreads
- * so we can later handle callbacks locally without locking.
- */
- rcu_nocb_lock_irqsave(rdp, flags);
- /*
- * Theoretically we could set SEGCBLIST_SOFTIRQ_ONLY after the nocb
- * lock is released but how about being paranoid for once?
- */
- rcu_segcblist_set_flags(cblist, SEGCBLIST_SOFTIRQ_ONLY);
- /*
- * With SEGCBLIST_SOFTIRQ_ONLY, we can't use
- * rcu_nocb_unlock_irqrestore() anymore.
- */
- raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
-
- /* Sanity check */
- WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
-
-
- return ret;
-}
-
-int rcu_nocb_cpu_deoffload(int cpu)
-{
- struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
- int ret = 0;
-
- mutex_lock(&rcu_state.barrier_mutex);
- cpus_read_lock();
- if (rcu_rdp_is_offloaded(rdp)) {
- if (cpu_online(cpu)) {
- ret = work_on_cpu(cpu, rcu_nocb_rdp_deoffload, rdp);
- if (!ret)
- cpumask_clear_cpu(cpu, rcu_nocb_mask);
- } else {
- pr_info("NOCB: Can't CB-deoffload an offline CPU\n");
- ret = -EINVAL;
- }
- }
- cpus_read_unlock();
- mutex_unlock(&rcu_state.barrier_mutex);
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(rcu_nocb_cpu_deoffload);
-
-static long rcu_nocb_rdp_offload(void *arg)
-{
- struct rcu_data *rdp = arg;
- struct rcu_segcblist *cblist = &rdp->cblist;
- unsigned long flags;
- int ret;
-
- WARN_ON_ONCE(rdp->cpu != raw_smp_processor_id());
- /*
- * For now we only support re-offload, ie: the rdp must have been
- * offloaded on boot first.
- */
- if (!rdp->nocb_gp_rdp)
- return -EINVAL;
-
- pr_info("Offloading %d\n", rdp->cpu);
- /*
- * Can't use rcu_nocb_lock_irqsave() while we are in
- * SEGCBLIST_SOFTIRQ_ONLY mode.
- */
- raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
-
- /*
- * We didn't take the nocb lock while working on the
- * rdp->cblist in SEGCBLIST_SOFTIRQ_ONLY mode.
- * Every modifications that have been done previously on
- * rdp->cblist must be visible remotely by the nocb kthreads
- * upon wake up after reading the cblist flags.
- *
- * The layout against nocb_lock enforces that ordering:
- *
- * __rcu_nocb_rdp_offload() nocb_cb_wait()/nocb_gp_wait()
- * ------------------------- ----------------------------
- * WRITE callbacks rcu_nocb_lock()
- * rcu_nocb_lock() READ flags
- * WRITE flags READ callbacks
- * rcu_nocb_unlock() rcu_nocb_unlock()
- */
- ret = rdp_offload_toggle(rdp, true, flags);
- swait_event_exclusive(rdp->nocb_state_wq,
- rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB) &&
- rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP));
-
- return ret;
-}
-
-int rcu_nocb_cpu_offload(int cpu)
-{
- struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
- int ret = 0;
-
- mutex_lock(&rcu_state.barrier_mutex);
- cpus_read_lock();
- if (!rcu_rdp_is_offloaded(rdp)) {
- if (cpu_online(cpu)) {
- ret = work_on_cpu(cpu, rcu_nocb_rdp_offload, rdp);
- if (!ret)
- cpumask_set_cpu(cpu, rcu_nocb_mask);
- } else {
- pr_info("NOCB: Can't CB-offload an offline CPU\n");
- ret = -EINVAL;
- }
- }
- cpus_read_unlock();
- mutex_unlock(&rcu_state.barrier_mutex);
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(rcu_nocb_cpu_offload);
-
-void __init rcu_init_nohz(void)
-{
- int cpu;
- bool need_rcu_nocb_mask = false;
- struct rcu_data *rdp;
-
-#if defined(CONFIG_NO_HZ_FULL)
- if (tick_nohz_full_running && cpumask_weight(tick_nohz_full_mask))
- need_rcu_nocb_mask = true;
-#endif /* #if defined(CONFIG_NO_HZ_FULL) */
-
- if (!cpumask_available(rcu_nocb_mask) && need_rcu_nocb_mask) {
- if (!zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL)) {
- pr_info("rcu_nocb_mask allocation failed, callback offloading disabled.\n");
- return;
- }
- }
- if (!cpumask_available(rcu_nocb_mask))
- return;
-
-#if defined(CONFIG_NO_HZ_FULL)
- if (tick_nohz_full_running)
- cpumask_or(rcu_nocb_mask, rcu_nocb_mask, tick_nohz_full_mask);
-#endif /* #if defined(CONFIG_NO_HZ_FULL) */
-
- if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) {
- pr_info("\tNote: kernel parameter 'rcu_nocbs=', 'nohz_full', or 'isolcpus=' contains nonexistent CPUs.\n");
- cpumask_and(rcu_nocb_mask, cpu_possible_mask,
- rcu_nocb_mask);
- }
- if (cpumask_empty(rcu_nocb_mask))
- pr_info("\tOffload RCU callbacks from CPUs: (none).\n");
- else
- pr_info("\tOffload RCU callbacks from CPUs: %*pbl.\n",
- cpumask_pr_args(rcu_nocb_mask));
- if (rcu_nocb_poll)
- pr_info("\tPoll for callbacks from no-CBs CPUs.\n");
-
- for_each_cpu(cpu, rcu_nocb_mask) {
- rdp = per_cpu_ptr(&rcu_data, cpu);
- if (rcu_segcblist_empty(&rdp->cblist))
- rcu_segcblist_init(&rdp->cblist);
- rcu_segcblist_offload(&rdp->cblist, true);
- rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_CB);
- rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_GP);
- }
- rcu_organize_nocb_kthreads();
-}
-
-/* Initialize per-rcu_data variables for no-CBs CPUs. */
-static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
-{
- init_swait_queue_head(&rdp->nocb_cb_wq);
- init_swait_queue_head(&rdp->nocb_gp_wq);
- init_swait_queue_head(&rdp->nocb_state_wq);
- raw_spin_lock_init(&rdp->nocb_lock);
- raw_spin_lock_init(&rdp->nocb_bypass_lock);
- raw_spin_lock_init(&rdp->nocb_gp_lock);
- timer_setup(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer, 0);
- rcu_cblist_init(&rdp->nocb_bypass);
-}
-
-/*
- * If the specified CPU is a no-CBs CPU that does not already have its
- * rcuo CB kthread, spawn it. Additionally, if the rcuo GP kthread
- * for this CPU's group has not yet been created, spawn it as well.
- */
-static void rcu_spawn_one_nocb_kthread(int cpu)
-{
- struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
- struct rcu_data *rdp_gp;
- struct task_struct *t;
-
- /*
- * If this isn't a no-CBs CPU or if it already has an rcuo kthread,
- * then nothing to do.
- */
- if (!rcu_is_nocb_cpu(cpu) || rdp->nocb_cb_kthread)
- return;
-
- /* If we didn't spawn the GP kthread first, reorganize! */
- rdp_gp = rdp->nocb_gp_rdp;
- if (!rdp_gp->nocb_gp_kthread) {
- t = kthread_run(rcu_nocb_gp_kthread, rdp_gp,
- "rcuog/%d", rdp_gp->cpu);
- if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo GP kthread, OOM is now expected behavior\n", __func__))
- return;
- WRITE_ONCE(rdp_gp->nocb_gp_kthread, t);
- }
-
- /* Spawn the kthread for this CPU. */
- t = kthread_run(rcu_nocb_cb_kthread, rdp,
- "rcuo%c/%d", rcu_state.abbr, cpu);
- if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo CB kthread, OOM is now expected behavior\n", __func__))
- return;
- WRITE_ONCE(rdp->nocb_cb_kthread, t);
- WRITE_ONCE(rdp->nocb_gp_kthread, rdp_gp->nocb_gp_kthread);
-}
-
-/*
- * If the specified CPU is a no-CBs CPU that does not already have its
- * rcuo kthread, spawn it.
- */
-static void rcu_spawn_cpu_nocb_kthread(int cpu)
-{
- if (rcu_scheduler_fully_active)
- rcu_spawn_one_nocb_kthread(cpu);
-}
-
-/*
- * Once the scheduler is running, spawn rcuo kthreads for all online
- * no-CBs CPUs. This assumes that the early_initcall()s happen before
- * non-boot CPUs come online -- if this changes, we will need to add
- * some mutual exclusion.
- */
-static void __init rcu_spawn_nocb_kthreads(void)
-{
- int cpu;
-
- for_each_online_cpu(cpu)
- rcu_spawn_cpu_nocb_kthread(cpu);
-}
-
-/* How many CB CPU IDs per GP kthread? Default of -1 for sqrt(nr_cpu_ids). */
-static int rcu_nocb_gp_stride = -1;
-module_param(rcu_nocb_gp_stride, int, 0444);
-
-/*
- * Initialize GP-CB relationships for all no-CBs CPU.
- */
-static void __init rcu_organize_nocb_kthreads(void)
-{
- int cpu;
- bool firsttime = true;
- bool gotnocbs = false;
- bool gotnocbscbs = true;
- int ls = rcu_nocb_gp_stride;
- int nl = 0; /* Next GP kthread. */
- struct rcu_data *rdp;
- struct rcu_data *rdp_gp = NULL; /* Suppress misguided gcc warn. */
- struct rcu_data *rdp_prev = NULL;
-
- if (!cpumask_available(rcu_nocb_mask))
- return;
- if (ls == -1) {
- ls = nr_cpu_ids / int_sqrt(nr_cpu_ids);
- rcu_nocb_gp_stride = ls;
- }
-
- /*
- * Each pass through this loop sets up one rcu_data structure.
- * Should the corresponding CPU come online in the future, then
- * we will spawn the needed set of rcu_nocb_kthread() kthreads.
- */
- for_each_cpu(cpu, rcu_nocb_mask) {
- rdp = per_cpu_ptr(&rcu_data, cpu);
- if (rdp->cpu >= nl) {
- /* New GP kthread, set up for CBs & next GP. */
- gotnocbs = true;
- nl = DIV_ROUND_UP(rdp->cpu + 1, ls) * ls;
- rdp->nocb_gp_rdp = rdp;
- rdp_gp = rdp;
- if (dump_tree) {
- if (!firsttime)
- pr_cont("%s\n", gotnocbscbs
- ? "" : " (self only)");
- gotnocbscbs = false;
- firsttime = false;
- pr_alert("%s: No-CB GP kthread CPU %d:",
- __func__, cpu);
- }
- } else {
- /* Another CB kthread, link to previous GP kthread. */
- gotnocbscbs = true;
- rdp->nocb_gp_rdp = rdp_gp;
- rdp_prev->nocb_next_cb_rdp = rdp;
- if (dump_tree)
- pr_cont(" %d", cpu);
- }
- rdp_prev = rdp;
- }
- if (gotnocbs && dump_tree)
- pr_cont("%s\n", gotnocbscbs ? "" : " (self only)");
-}
-
-/*
- * Bind the current task to the offloaded CPUs. If there are no offloaded
- * CPUs, leave the task unbound. Splat if the bind attempt fails.
- */
-void rcu_bind_current_to_nocb(void)
-{
- if (cpumask_available(rcu_nocb_mask) && cpumask_weight(rcu_nocb_mask))
- WARN_ON(sched_setaffinity(current->pid, rcu_nocb_mask));
-}
-EXPORT_SYMBOL_GPL(rcu_bind_current_to_nocb);
-
-// The ->on_cpu field is available only in CONFIG_SMP=y, so...
-#ifdef CONFIG_SMP
-static char *show_rcu_should_be_on_cpu(struct task_struct *tsp)
-{
- return tsp && task_is_running(tsp) && !tsp->on_cpu ? "!" : "";
-}
-#else // #ifdef CONFIG_SMP
-static char *show_rcu_should_be_on_cpu(struct task_struct *tsp)
-{
- return "";
-}
-#endif // #else #ifdef CONFIG_SMP
-
-/*
- * Dump out nocb grace-period kthread state for the specified rcu_data
- * structure.
- */
-static void show_rcu_nocb_gp_state(struct rcu_data *rdp)
-{
- struct rcu_node *rnp = rdp->mynode;
-
- pr_info("nocb GP %d %c%c%c%c%c %c[%c%c] %c%c:%ld rnp %d:%d %lu %c CPU %d%s\n",
- rdp->cpu,
- "kK"[!!rdp->nocb_gp_kthread],
- "lL"[raw_spin_is_locked(&rdp->nocb_gp_lock)],
- "dD"[!!rdp->nocb_defer_wakeup],
- "tT"[timer_pending(&rdp->nocb_timer)],
- "sS"[!!rdp->nocb_gp_sleep],
- ".W"[swait_active(&rdp->nocb_gp_wq)],
- ".W"[swait_active(&rnp->nocb_gp_wq[0])],
- ".W"[swait_active(&rnp->nocb_gp_wq[1])],
- ".B"[!!rdp->nocb_gp_bypass],
- ".G"[!!rdp->nocb_gp_gp],
- (long)rdp->nocb_gp_seq,
- rnp->grplo, rnp->grphi, READ_ONCE(rdp->nocb_gp_loops),
- rdp->nocb_gp_kthread ? task_state_to_char(rdp->nocb_gp_kthread) : '.',
- rdp->nocb_cb_kthread ? (int)task_cpu(rdp->nocb_gp_kthread) : -1,
- show_rcu_should_be_on_cpu(rdp->nocb_cb_kthread));
-}
-
-/* Dump out nocb kthread state for the specified rcu_data structure. */
-static void show_rcu_nocb_state(struct rcu_data *rdp)
-{
- char bufw[20];
- char bufr[20];
- struct rcu_segcblist *rsclp = &rdp->cblist;
- bool waslocked;
- bool wassleep;
-
- if (rdp->nocb_gp_rdp == rdp)
- show_rcu_nocb_gp_state(rdp);
-
- sprintf(bufw, "%ld", rsclp->gp_seq[RCU_WAIT_TAIL]);
- sprintf(bufr, "%ld", rsclp->gp_seq[RCU_NEXT_READY_TAIL]);
- pr_info(" CB %d^%d->%d %c%c%c%c%c%c F%ld L%ld C%d %c%c%s%c%s%c%c q%ld %c CPU %d%s\n",
- rdp->cpu, rdp->nocb_gp_rdp->cpu,
- rdp->nocb_next_cb_rdp ? rdp->nocb_next_cb_rdp->cpu : -1,
- "kK"[!!rdp->nocb_cb_kthread],
- "bB"[raw_spin_is_locked(&rdp->nocb_bypass_lock)],
- "cC"[!!atomic_read(&rdp->nocb_lock_contended)],
- "lL"[raw_spin_is_locked(&rdp->nocb_lock)],
- "sS"[!!rdp->nocb_cb_sleep],
- ".W"[swait_active(&rdp->nocb_cb_wq)],
- jiffies - rdp->nocb_bypass_first,
- jiffies - rdp->nocb_nobypass_last,
- rdp->nocb_nobypass_count,
- ".D"[rcu_segcblist_ready_cbs(rsclp)],
- ".W"[!rcu_segcblist_segempty(rsclp, RCU_WAIT_TAIL)],
- rcu_segcblist_segempty(rsclp, RCU_WAIT_TAIL) ? "" : bufw,
- ".R"[!rcu_segcblist_segempty(rsclp, RCU_NEXT_READY_TAIL)],
- rcu_segcblist_segempty(rsclp, RCU_NEXT_READY_TAIL) ? "" : bufr,
- ".N"[!rcu_segcblist_segempty(rsclp, RCU_NEXT_TAIL)],
- ".B"[!!rcu_cblist_n_cbs(&rdp->nocb_bypass)],
- rcu_segcblist_n_cbs(&rdp->cblist),
- rdp->nocb_cb_kthread ? task_state_to_char(rdp->nocb_cb_kthread) : '.',
- rdp->nocb_cb_kthread ? (int)task_cpu(rdp->nocb_gp_kthread) : -1,
- show_rcu_should_be_on_cpu(rdp->nocb_cb_kthread));
-
- /* It is OK for GP kthreads to have GP state. */
- if (rdp->nocb_gp_rdp == rdp)
- return;
-
- waslocked = raw_spin_is_locked(&rdp->nocb_gp_lock);
- wassleep = swait_active(&rdp->nocb_gp_wq);
- if (!rdp->nocb_gp_sleep && !waslocked && !wassleep)
- return; /* Nothing untoward. */
-
- pr_info(" nocb GP activity on CB-only CPU!!! %c%c%c %c\n",
- "lL"[waslocked],
- "dD"[!!rdp->nocb_defer_wakeup],
- "sS"[!!rdp->nocb_gp_sleep],
- ".W"[wassleep]);
-}
-
-#else /* #ifdef CONFIG_RCU_NOCB_CPU */
-
-/* No ->nocb_lock to acquire. */
-static void rcu_nocb_lock(struct rcu_data *rdp)
-{
-}
-
-/* No ->nocb_lock to release. */
-static void rcu_nocb_unlock(struct rcu_data *rdp)
-{
-}
-
-/* No ->nocb_lock to release. */
-static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp,
- unsigned long flags)
-{
- local_irq_restore(flags);
-}
-
-/* Lockdep check that ->cblist may be safely accessed. */
-static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp)
-{
- lockdep_assert_irqs_disabled();
-}
-
-static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq)
-{
-}
-
-static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp)
-{
- return NULL;
-}
-
-static void rcu_init_one_nocb(struct rcu_node *rnp)
-{
-}
-
-static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
- unsigned long j)
-{
- return true;
-}
-
-static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
- bool *was_alldone, unsigned long flags)
-{
- return false;
-}
-
-static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty,
- unsigned long flags)
-{
- WARN_ON_ONCE(1); /* Should be dead code! */
-}
-
-static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
-{
-}
-
-static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp, int level)
-{
- return false;
-}
-
-static bool do_nocb_deferred_wakeup(struct rcu_data *rdp)
-{
- return false;
-}
-
-static void rcu_spawn_cpu_nocb_kthread(int cpu)
-{
-}
-
-static void __init rcu_spawn_nocb_kthreads(void)
-{
-}
-
-static void show_rcu_nocb_state(struct rcu_data *rdp)
-{
-}
-
-#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
-
/*
* Is this CPU a NO_HZ_FULL CPU that should ignore RCU so that the
* grace-period kthread will do force_quiescent_state() processing?
@@ -2940,7 +1281,7 @@ static void show_rcu_nocb_state(struct rcu_data *rdp)
* CPU unless the grace period has extended for too long.
*
* This code relies on the fact that all NO_HZ_FULL CPUs are also
- * CONFIG_RCU_NOCB_CPU CPUs.
+ * RCU_NOCB_CPU CPUs.
*/
static bool rcu_nohz_full_cpu(void)
{
@@ -2960,39 +1301,5 @@ static void rcu_bind_gp_kthread(void)
{
if (!tick_nohz_full_enabled())
return;
- housekeeping_affine(current, HK_FLAG_RCU);
-}
-
-/* Record the current task on dyntick-idle entry. */
-static void noinstr rcu_dynticks_task_enter(void)
-{
-#if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL)
- WRITE_ONCE(current->rcu_tasks_idle_cpu, smp_processor_id());
-#endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */
-}
-
-/* Record no current task on dyntick-idle exit. */
-static void noinstr rcu_dynticks_task_exit(void)
-{
-#if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL)
- WRITE_ONCE(current->rcu_tasks_idle_cpu, -1);
-#endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */
-}
-
-/* Turn on heavyweight RCU tasks trace readers on idle/user entry. */
-static void rcu_dynticks_task_trace_enter(void)
-{
-#ifdef CONFIG_TASKS_RCU_TRACE
- if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB))
- current->trc_reader_special.b.need_mb = true;
-#endif /* #ifdef CONFIG_TASKS_RCU_TRACE */
-}
-
-/* Turn off heavyweight RCU tasks trace readers on idle/user exit. */
-static void rcu_dynticks_task_trace_exit(void)
-{
-#ifdef CONFIG_TASKS_RCU_TRACE
- if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB))
- current->trc_reader_special.b.need_mb = false;
-#endif /* #ifdef CONFIG_TASKS_RCU_TRACE */
+ housekeeping_affine(current, HK_TYPE_RCU);
}
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index 6c76988cc019..b10b8349bb2a 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -7,6 +7,8 @@
* Author: Paul E. McKenney <paulmck@linux.ibm.com>
*/
+#include <linux/kvm_para.h>
+
//////////////////////////////////////////////////////////////////////////////
//
// Controlling CPU stall warnings, including delay calculation.
@@ -23,6 +25,34 @@ int sysctl_max_rcu_stall_to_panic __read_mostly;
#define RCU_STALL_MIGHT_DIV 8
#define RCU_STALL_MIGHT_MIN (2 * HZ)
+int rcu_exp_jiffies_till_stall_check(void)
+{
+ int cpu_stall_timeout = READ_ONCE(rcu_exp_cpu_stall_timeout);
+ int exp_stall_delay_delta = 0;
+ int till_stall_check;
+
+ // Zero says to use rcu_cpu_stall_timeout, but in milliseconds.
+ if (!cpu_stall_timeout)
+ cpu_stall_timeout = jiffies_to_msecs(rcu_jiffies_till_stall_check());
+
+ // Limit check must be consistent with the Kconfig limits for
+ // CONFIG_RCU_EXP_CPU_STALL_TIMEOUT, so check the allowed range.
+ // The minimum clamped value is "2UL", because at least one full
+ // tick has to be guaranteed.
+ till_stall_check = clamp(msecs_to_jiffies(cpu_stall_timeout), 2UL, 300UL * HZ);
+
+ if (cpu_stall_timeout && jiffies_to_msecs(till_stall_check) != cpu_stall_timeout)
+ WRITE_ONCE(rcu_exp_cpu_stall_timeout, jiffies_to_msecs(till_stall_check));
+
+#ifdef CONFIG_PROVE_RCU
+ /* Add extra ~25% out of till_stall_check. */
+ exp_stall_delay_delta = ((till_stall_check * 25) / 100) + 1;
+#endif
+
+ return till_stall_check + exp_stall_delay_delta;
+}
+EXPORT_SYMBOL_GPL(rcu_exp_jiffies_till_stall_check);
+
/* Limit-check stall timeouts specified at boottime and runtime. */
int rcu_jiffies_till_stall_check(void)
{
@@ -117,17 +147,14 @@ static void panic_on_rcu_stall(void)
}
/**
- * rcu_cpu_stall_reset - prevent further stall warnings in current grace period
- *
- * Set the stall-warning timeout way off into the future, thus preventing
- * any RCU CPU stall-warning messages from appearing in the current set of
- * RCU grace periods.
+ * rcu_cpu_stall_reset - restart stall-warning timeout for current grace period
*
* The caller must disable hard irqs.
*/
void rcu_cpu_stall_reset(void)
{
- WRITE_ONCE(rcu_state.jiffies_stall, jiffies + ULONG_MAX / 2);
+ WRITE_ONCE(rcu_state.jiffies_stall,
+ jiffies + rcu_jiffies_till_stall_check());
}
//////////////////////////////////////////////////////////////////////////////
@@ -241,16 +268,16 @@ struct rcu_stall_chk_rdr {
* Report out the state of a not-running task that is stalling the
* current RCU grace period.
*/
-static bool check_slow_task(struct task_struct *t, void *arg)
+static int check_slow_task(struct task_struct *t, void *arg)
{
struct rcu_stall_chk_rdr *rscrp = arg;
if (task_curr(t))
- return false; // It is running, so decline to inspect it.
+ return -EBUSY; // It is running, so decline to inspect it.
rscrp->nesting = t->rcu_read_lock_nesting;
rscrp->rs = t->rcu_read_unlock_special;
rscrp->on_blkd_list = !list_empty(&t->rcu_node_entry);
- return true;
+ return 0;
}
/*
@@ -267,8 +294,10 @@ static int rcu_print_task_stall(struct rcu_node *rnp, unsigned long flags)
struct task_struct *ts[8];
lockdep_assert_irqs_disabled();
- if (!rcu_preempt_blocked_readers_cgp(rnp))
+ if (!rcu_preempt_blocked_readers_cgp(rnp)) {
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
return 0;
+ }
pr_err("\tTasks blocked on level-%d rcu_node (CPUs %d-%d):",
rnp->level, rnp->grplo, rnp->grphi);
t = list_entry(rnp->gp_tasks->prev,
@@ -280,9 +309,9 @@ static int rcu_print_task_stall(struct rcu_node *rnp, unsigned long flags)
break;
}
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
- for (i--; i; i--) {
- t = ts[i];
- if (!try_invoke_on_locked_down_task(t, check_slow_task, &rscr))
+ while (i) {
+ t = ts[--i];
+ if (task_call_func(t, check_slow_task, &rscr))
pr_cont(" P%d", t->pid);
else
pr_cont(" P%d/%d:%c%c%c%c",
@@ -339,33 +368,13 @@ static void rcu_dump_cpu_stacks(void)
if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) {
if (cpu_is_offline(cpu))
pr_err("Offline CPU %d blocking current GP.\n", cpu);
- else if (!trigger_single_cpu_backtrace(cpu))
+ else
dump_cpu_task(cpu);
}
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
}
-#ifdef CONFIG_RCU_FAST_NO_HZ
-
-static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
-{
- struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
-
- sprintf(cp, "last_accelerate: %04lx/%04lx dyntick_enabled: %d",
- rdp->last_accelerate & 0xffff, jiffies & 0xffff,
- !!rdp->tick_nohz_enabled_snap);
-}
-
-#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */
-
-static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
-{
- *cp = '\0';
-}
-
-#endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */
-
static const char * const gp_state_names[] = {
[RCU_GP_IDLE] = "RCU_GP_IDLE",
[RCU_GP_WAIT_GPS] = "RCU_GP_WAIT_GPS",
@@ -398,6 +407,56 @@ static bool rcu_is_gp_kthread_starving(unsigned long *jp)
return j > 2 * HZ;
}
+static bool rcu_is_rcuc_kthread_starving(struct rcu_data *rdp, unsigned long *jp)
+{
+ int cpu;
+ struct task_struct *rcuc;
+ unsigned long j;
+
+ rcuc = rdp->rcu_cpu_kthread_task;
+ if (!rcuc)
+ return false;
+
+ cpu = task_cpu(rcuc);
+ if (cpu_is_offline(cpu) || idle_cpu(cpu))
+ return false;
+
+ j = jiffies - READ_ONCE(rdp->rcuc_activity);
+
+ if (jp)
+ *jp = j;
+ return j > 2 * HZ;
+}
+
+static void print_cpu_stat_info(int cpu)
+{
+ struct rcu_snap_record rsr, *rsrp;
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+ struct kernel_cpustat *kcsp = &kcpustat_cpu(cpu);
+
+ if (!rcu_cpu_stall_cputime)
+ return;
+
+ rsrp = &rdp->snap_record;
+ if (rsrp->gp_seq != rdp->gp_seq)
+ return;
+
+ rsr.cputime_irq = kcpustat_field(kcsp, CPUTIME_IRQ, cpu);
+ rsr.cputime_softirq = kcpustat_field(kcsp, CPUTIME_SOFTIRQ, cpu);
+ rsr.cputime_system = kcpustat_field(kcsp, CPUTIME_SYSTEM, cpu);
+
+ pr_err("\t hardirqs softirqs csw/system\n");
+ pr_err("\t number: %8ld %10d %12lld\n",
+ kstat_cpu_irqs_sum(cpu) - rsrp->nr_hardirqs,
+ kstat_cpu_softirqs_sum(cpu) - rsrp->nr_softirqs,
+ nr_context_switches_cpu(cpu) - rsrp->nr_csw);
+ pr_err("\tcputime: %8lld %10lld %12lld ==> %d(ms)\n",
+ div_u64(rsr.cputime_irq - rsrp->cputime_irq, NSEC_PER_MSEC),
+ div_u64(rsr.cputime_softirq - rsrp->cputime_softirq, NSEC_PER_MSEC),
+ div_u64(rsr.cputime_system - rsrp->cputime_system, NSEC_PER_MSEC),
+ jiffies_to_msecs(jiffies - rsrp->jiffies));
+}
+
/*
* Print out diagnostic information for the specified stalled CPU.
*
@@ -407,16 +466,18 @@ static bool rcu_is_gp_kthread_starving(unsigned long *jp)
* of RCU grace periods that this CPU is ignorant of, for example, "1"
* if the CPU was aware of the previous grace period.
*
- * Also print out idle and (if CONFIG_RCU_FAST_NO_HZ) idle-entry info.
+ * Also print out idle info.
*/
static void print_cpu_stall_info(int cpu)
{
unsigned long delta;
bool falsepositive;
- char fast_no_hz[72];
struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
char *ticks_title;
unsigned long ticks_value;
+ bool rcuc_starved;
+ unsigned long j;
+ char buf[32];
/*
* We could be printing a lot while holding a spinlock. Avoid
@@ -431,11 +492,13 @@ static void print_cpu_stall_info(int cpu)
ticks_title = "ticks this GP";
ticks_value = rdp->ticks_this_gp;
}
- print_cpu_stall_fast_no_hz(fast_no_hz, cpu);
delta = rcu_seq_ctr(rdp->mynode->gp_seq - rdp->rcu_iw_gp_seq);
falsepositive = rcu_is_gp_kthread_starving(NULL) &&
- rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp));
- pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%ld/%#lx softirq=%u/%u fqs=%ld %s%s\n",
+ rcu_dynticks_in_eqs(rcu_dynticks_snap(cpu));
+ rcuc_starved = rcu_is_rcuc_kthread_starving(rdp, &j);
+ if (rcuc_starved)
+ sprintf(buf, " rcuc=%ld jiffies(starved)", j);
+ pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%04x/%ld/%#lx softirq=%u/%u fqs=%ld%s%s\n",
cpu,
"O."[!!cpu_online(cpu)],
"o."[!!(rdp->grpmask & rdp->mynode->qsmaskinit)],
@@ -444,12 +507,14 @@ static void print_cpu_stall_info(int cpu)
rdp->rcu_iw_pending ? (int)min(delta, 9UL) + '0' :
"!."[!delta],
ticks_value, ticks_title,
- rcu_dynticks_snap(rdp) & 0xfff,
- rdp->dynticks_nesting, rdp->dynticks_nmi_nesting,
+ rcu_dynticks_snap(cpu) & 0xffff,
+ ct_dynticks_nesting_cpu(cpu), ct_dynticks_nmi_nesting_cpu(cpu),
rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu),
data_race(rcu_state.n_force_qs) - rcu_state.n_force_qs_gpstart,
- fast_no_hz,
+ rcuc_starved ? buf : "",
falsepositive ? " (false positive?)" : "");
+
+ print_cpu_stat_info(cpu);
}
/* Complain about starvation of grace-period kthread. */
@@ -464,9 +529,10 @@ static void rcu_check_gp_kthread_starvation(void)
pr_err("%s kthread starved for %ld jiffies! g%ld f%#x %s(%d) ->state=%#x ->cpu=%d\n",
rcu_state.name, j,
(long)rcu_seq_current(&rcu_state.gp_seq),
- data_race(rcu_state.gp_flags),
- gp_state_getname(rcu_state.gp_state), rcu_state.gp_state,
- gpk ? gpk->__state : ~0, cpu);
+ data_race(READ_ONCE(rcu_state.gp_flags)),
+ gp_state_getname(rcu_state.gp_state),
+ data_race(READ_ONCE(rcu_state.gp_state)),
+ gpk ? data_race(READ_ONCE(gpk->__state)) : ~0, cpu);
if (gpk) {
pr_err("\tUnless %s kthread gets sufficient CPU time, OOM is now expected behavior.\n", rcu_state.name);
pr_err("RCU grace-period kthread stack dump:\n");
@@ -476,8 +542,7 @@ static void rcu_check_gp_kthread_starvation(void)
pr_err("RCU GP kthread last ran on offline CPU %d.\n", cpu);
} else {
pr_err("Stack dump where RCU GP kthread last ran:\n");
- if (!trigger_single_cpu_backtrace(cpu))
- dump_cpu_task(cpu);
+ dump_cpu_task(cpu);
}
}
wake_up_process(gpk);
@@ -509,7 +574,7 @@ static void rcu_check_gp_kthread_expired_fqs_timer(void)
(long)rcu_seq_current(&rcu_state.gp_seq),
data_race(rcu_state.gp_flags),
gp_state_getname(RCU_GP_WAIT_FQS), RCU_GP_WAIT_FQS,
- gpk->__state);
+ data_race(READ_ONCE(gpk->__state)));
pr_err("\tPossible timer handling issue on cpu=%d timer-softirq=%u\n",
cpu, kstat_softirqs_cpu(TIMER_SOFTIRQ, cpu));
}
@@ -554,9 +619,9 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps)
for_each_possible_cpu(cpu)
totqlen += rcu_get_n_cbs_cpu(cpu);
- pr_cont("\t(detected by %d, t=%ld jiffies, g=%ld, q=%lu)\n",
+ pr_err("\t(detected by %d, t=%ld jiffies, g=%ld, q=%lu ncpus=%d)\n",
smp_processor_id(), (long)(jiffies - gps),
- (long)rcu_seq_current(&rcu_state.gp_seq), totqlen);
+ (long)rcu_seq_current(&rcu_state.gp_seq), totqlen, rcu_state.n_online_cpus);
if (ndetected) {
rcu_dump_cpu_stacks();
@@ -568,11 +633,11 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps)
pr_err("INFO: Stall ended before state dump start\n");
} else {
j = jiffies;
- gpa = data_race(rcu_state.gp_activity);
+ gpa = data_race(READ_ONCE(rcu_state.gp_activity));
pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld, root ->qsmask %#lx\n",
rcu_state.name, j - gpa, j, gpa,
- data_race(jiffies_till_next_fqs),
- rcu_get_root()->qsmask);
+ data_race(READ_ONCE(jiffies_till_next_fqs)),
+ data_race(READ_ONCE(rcu_get_root()->qsmask)));
}
}
/* Rewrite if needed in case of slow consoles. */
@@ -615,9 +680,9 @@ static void print_cpu_stall(unsigned long gps)
raw_spin_unlock_irqrestore_rcu_node(rdp->mynode, flags);
for_each_possible_cpu(cpu)
totqlen += rcu_get_n_cbs_cpu(cpu);
- pr_cont("\t(t=%lu jiffies g=%ld q=%lu)\n",
+ pr_err("\t(t=%lu jiffies g=%ld q=%lu ncpus=%d)\n",
jiffies - gps,
- (long)rcu_seq_current(&rcu_state.gp_seq), totqlen);
+ (long)rcu_seq_current(&rcu_state.gp_seq), totqlen, rcu_state.n_online_cpus);
rcu_check_gp_kthread_expired_fqs_timer();
rcu_check_gp_kthread_starvation();
@@ -646,6 +711,7 @@ static void print_cpu_stall(unsigned long gps)
static void check_cpu_stall(struct rcu_data *rdp)
{
+ bool didstall = false;
unsigned long gs1;
unsigned long gs2;
unsigned long gps;
@@ -691,24 +757,46 @@ static void check_cpu_stall(struct rcu_data *rdp)
ULONG_CMP_GE(gps, js))
return; /* No stall or GP completed since entering function. */
rnp = rdp->mynode;
- jn = jiffies + 3 * rcu_jiffies_till_stall_check() + 3;
+ jn = jiffies + ULONG_MAX / 2;
if (rcu_gp_in_progress() &&
(READ_ONCE(rnp->qsmask) & rdp->grpmask) &&
cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) {
+ /*
+ * If a virtual machine is stopped by the host it can look to
+ * the watchdog like an RCU stall. Check to see if the host
+ * stopped the vm.
+ */
+ if (kvm_check_and_clear_guest_paused())
+ return;
+
/* We haven't checked in, so go dump stack. */
print_cpu_stall(gps);
if (READ_ONCE(rcu_cpu_stall_ftrace_dump))
rcu_ftrace_dump(DUMP_ALL);
+ didstall = true;
} else if (rcu_gp_in_progress() &&
ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY) &&
cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) {
+ /*
+ * If a virtual machine is stopped by the host it can look to
+ * the watchdog like an RCU stall. Check to see if the host
+ * stopped the vm.
+ */
+ if (kvm_check_and_clear_guest_paused())
+ return;
+
/* They had a few time units to dump stack, so complain. */
print_other_cpu_stall(gs2, gps);
if (READ_ONCE(rcu_cpu_stall_ftrace_dump))
rcu_ftrace_dump(DUMP_ALL);
+ didstall = true;
+ }
+ if (didstall && READ_ONCE(rcu_state.jiffies_stall) == jn) {
+ jn = jiffies + 3 * rcu_jiffies_till_stall_check() + 3;
+ WRITE_ONCE(rcu_state.jiffies_stall, jn);
}
}
@@ -742,7 +830,7 @@ bool rcu_check_boost_fail(unsigned long gp_state, int *cpup)
rcu_for_each_leaf_node(rnp) {
if (!cpup) {
- if (READ_ONCE(rnp->qsmask)) {
+ if (data_race(READ_ONCE(rnp->qsmask))) {
return false;
} else {
if (READ_ONCE(rnp->gp_tasks))
@@ -791,32 +879,34 @@ void show_rcu_gp_kthreads(void)
struct task_struct *t = READ_ONCE(rcu_state.gp_kthread);
j = jiffies;
- ja = j - data_race(rcu_state.gp_activity);
- jr = j - data_race(rcu_state.gp_req_activity);
- js = j - data_race(rcu_state.gp_start);
- jw = j - data_race(rcu_state.gp_wake_time);
+ ja = j - data_race(READ_ONCE(rcu_state.gp_activity));
+ jr = j - data_race(READ_ONCE(rcu_state.gp_req_activity));
+ js = j - data_race(READ_ONCE(rcu_state.gp_start));
+ jw = j - data_race(READ_ONCE(rcu_state.gp_wake_time));
pr_info("%s: wait state: %s(%d) ->state: %#x ->rt_priority %u delta ->gp_start %lu ->gp_activity %lu ->gp_req_activity %lu ->gp_wake_time %lu ->gp_wake_seq %ld ->gp_seq %ld ->gp_seq_needed %ld ->gp_max %lu ->gp_flags %#x\n",
rcu_state.name, gp_state_getname(rcu_state.gp_state),
- rcu_state.gp_state, t ? t->__state : 0x1ffff, t ? t->rt_priority : 0xffU,
- js, ja, jr, jw, (long)data_race(rcu_state.gp_wake_seq),
- (long)data_race(rcu_state.gp_seq),
- (long)data_race(rcu_get_root()->gp_seq_needed),
- data_race(rcu_state.gp_max),
- data_race(rcu_state.gp_flags));
+ data_race(READ_ONCE(rcu_state.gp_state)),
+ t ? data_race(READ_ONCE(t->__state)) : 0x1ffff, t ? t->rt_priority : 0xffU,
+ js, ja, jr, jw, (long)data_race(READ_ONCE(rcu_state.gp_wake_seq)),
+ (long)data_race(READ_ONCE(rcu_state.gp_seq)),
+ (long)data_race(READ_ONCE(rcu_get_root()->gp_seq_needed)),
+ data_race(READ_ONCE(rcu_state.gp_max)),
+ data_race(READ_ONCE(rcu_state.gp_flags)));
rcu_for_each_node_breadth_first(rnp) {
if (ULONG_CMP_GE(READ_ONCE(rcu_state.gp_seq), READ_ONCE(rnp->gp_seq_needed)) &&
- !data_race(rnp->qsmask) && !data_race(rnp->boost_tasks) &&
- !data_race(rnp->exp_tasks) && !data_race(rnp->gp_tasks))
+ !data_race(READ_ONCE(rnp->qsmask)) && !data_race(READ_ONCE(rnp->boost_tasks)) &&
+ !data_race(READ_ONCE(rnp->exp_tasks)) && !data_race(READ_ONCE(rnp->gp_tasks)))
continue;
pr_info("\trcu_node %d:%d ->gp_seq %ld ->gp_seq_needed %ld ->qsmask %#lx %c%c%c%c ->n_boosts %ld\n",
rnp->grplo, rnp->grphi,
- (long)data_race(rnp->gp_seq), (long)data_race(rnp->gp_seq_needed),
- data_race(rnp->qsmask),
- ".b"[!!data_race(rnp->boost_kthread_task)],
- ".B"[!!data_race(rnp->boost_tasks)],
- ".E"[!!data_race(rnp->exp_tasks)],
- ".G"[!!data_race(rnp->gp_tasks)],
- data_race(rnp->n_boosts));
+ (long)data_race(READ_ONCE(rnp->gp_seq)),
+ (long)data_race(READ_ONCE(rnp->gp_seq_needed)),
+ data_race(READ_ONCE(rnp->qsmask)),
+ ".b"[!!data_race(READ_ONCE(rnp->boost_kthread_task))],
+ ".B"[!!data_race(READ_ONCE(rnp->boost_tasks))],
+ ".E"[!!data_race(READ_ONCE(rnp->exp_tasks))],
+ ".G"[!!data_race(READ_ONCE(rnp->gp_tasks))],
+ data_race(READ_ONCE(rnp->n_boosts)));
if (!rcu_is_leaf_node(rnp))
continue;
for_each_leaf_node_possible_cpu(rnp, cpu) {
@@ -826,12 +916,12 @@ void show_rcu_gp_kthreads(void)
READ_ONCE(rdp->gp_seq_needed)))
continue;
pr_info("\tcpu %d ->gp_seq_needed %ld\n",
- cpu, (long)data_race(rdp->gp_seq_needed));
+ cpu, (long)data_race(READ_ONCE(rdp->gp_seq_needed)));
}
}
for_each_possible_cpu(cpu) {
rdp = per_cpu_ptr(&rcu_data, cpu);
- cbs += data_race(rdp->n_cbs_invoked);
+ cbs += data_race(READ_ONCE(rdp->n_cbs_invoked));
if (rcu_segcblist_is_offloaded(&rdp->cblist))
show_rcu_nocb_state(rdp);
}
@@ -913,11 +1003,11 @@ void rcu_fwd_progress_check(unsigned long j)
if (rcu_gp_in_progress()) {
pr_info("%s: GP age %lu jiffies\n",
- __func__, jiffies - rcu_state.gp_start);
+ __func__, jiffies - data_race(READ_ONCE(rcu_state.gp_start)));
show_rcu_gp_kthreads();
} else {
pr_info("%s: Last GP end %lu jiffies ago\n",
- __func__, jiffies - rcu_state.gp_end);
+ __func__, jiffies - data_race(READ_ONCE(rcu_state.gp_end)));
preempt_disable();
rdp = this_cpu_ptr(&rcu_data);
rcu_check_gp_start_stall(rdp->mynode, rdp, j);
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index c21b38cc25e9..19bf6fa3ee6a 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -54,11 +54,11 @@
#define MODULE_PARAM_PREFIX "rcupdate."
#ifndef CONFIG_TINY_RCU
-module_param(rcu_expedited, int, 0);
-module_param(rcu_normal, int, 0);
+module_param(rcu_expedited, int, 0444);
+module_param(rcu_normal, int, 0444);
static int rcu_normal_after_boot = IS_ENABLED(CONFIG_PREEMPT_RT);
-#ifndef CONFIG_PREEMPT_RT
-module_param(rcu_normal_after_boot, int, 0);
+#if !defined(CONFIG_PREEMPT_RT) || defined(CONFIG_NO_HZ_FULL)
+module_param(rcu_normal_after_boot, int, 0444);
#endif
#endif /* #ifndef CONFIG_TINY_RCU */
@@ -85,7 +85,7 @@ module_param(rcu_normal_after_boot, int, 0);
* and while lockdep is disabled.
*
* Note that if the CPU is in the idle loop from an RCU point of view (ie:
- * that we are in the section between rcu_idle_enter() and rcu_idle_exit())
+ * that we are in the section between ct_idle_enter() and ct_idle_exit())
* then rcu_read_lock_held() sets ``*ret`` to false even if the CPU did an
* rcu_read_lock(). The reason for this is that RCU ignores CPUs that are
* in such a section, considering these as in extended quiescent state,
@@ -144,8 +144,45 @@ bool rcu_gp_is_normal(void)
}
EXPORT_SYMBOL_GPL(rcu_gp_is_normal);
-static atomic_t rcu_expedited_nesting = ATOMIC_INIT(1);
+static atomic_t rcu_async_hurry_nesting = ATOMIC_INIT(1);
+/*
+ * Should call_rcu() callbacks be processed with urgency or are
+ * they OK being executed with arbitrary delays?
+ */
+bool rcu_async_should_hurry(void)
+{
+ return !IS_ENABLED(CONFIG_RCU_LAZY) ||
+ atomic_read(&rcu_async_hurry_nesting);
+}
+EXPORT_SYMBOL_GPL(rcu_async_should_hurry);
+
+/**
+ * rcu_async_hurry - Make future async RCU callbacks not lazy.
+ *
+ * After a call to this function, future calls to call_rcu()
+ * will be processed in a timely fashion.
+ */
+void rcu_async_hurry(void)
+{
+ if (IS_ENABLED(CONFIG_RCU_LAZY))
+ atomic_inc(&rcu_async_hurry_nesting);
+}
+EXPORT_SYMBOL_GPL(rcu_async_hurry);
+
+/**
+ * rcu_async_relax - Make future async RCU callbacks lazy.
+ *
+ * After a call to this function, future calls to call_rcu()
+ * will be processed in a lazy fashion.
+ */
+void rcu_async_relax(void)
+{
+ if (IS_ENABLED(CONFIG_RCU_LAZY))
+ atomic_dec(&rcu_async_hurry_nesting);
+}
+EXPORT_SYMBOL_GPL(rcu_async_relax);
+static atomic_t rcu_expedited_nesting = ATOMIC_INIT(1);
/*
* Should normal grace-period primitives be expedited? Intended for
* use within RCU. Note that this function takes the rcu_expedited
@@ -195,6 +232,7 @@ static bool rcu_boot_ended __read_mostly;
void rcu_end_inkernel_boot(void)
{
rcu_unexpedite_gp();
+ rcu_async_relax();
if (rcu_normal_after_boot)
WRITE_ONCE(rcu_normal, 1);
rcu_boot_ended = true;
@@ -220,11 +258,12 @@ void rcu_test_sync_prims(void)
{
if (!IS_ENABLED(CONFIG_PROVE_RCU))
return;
+ pr_info("Running RCU synchronous self tests\n");
synchronize_rcu();
synchronize_rcu_expedited();
}
-#if !defined(CONFIG_TINY_RCU) || defined(CONFIG_SRCU)
+#if !defined(CONFIG_TINY_RCU)
/*
* Switch to run-time mode once RCU has fully initialized.
@@ -239,7 +278,7 @@ static int __init rcu_set_runtime_mode(void)
}
core_initcall(rcu_set_runtime_mode);
-#endif /* #if !defined(CONFIG_TINY_RCU) || defined(CONFIG_SRCU) */
+#endif /* #if !defined(CONFIG_TINY_RCU) */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
static struct lock_class_key rcu_lock_key;
@@ -247,7 +286,7 @@ struct lockdep_map rcu_lock_map = {
.name = "rcu_read_lock",
.key = &rcu_lock_key,
.wait_type_outer = LD_WAIT_FREE,
- .wait_type_inner = LD_WAIT_CONFIG, /* XXX PREEMPT_RCU ? */
+ .wait_type_inner = LD_WAIT_CONFIG, /* PREEMPT_RT implies PREEMPT_RCU */
};
EXPORT_SYMBOL_GPL(rcu_lock_map);
@@ -256,7 +295,7 @@ struct lockdep_map rcu_bh_lock_map = {
.name = "rcu_read_lock_bh",
.key = &rcu_bh_lock_key,
.wait_type_outer = LD_WAIT_FREE,
- .wait_type_inner = LD_WAIT_CONFIG, /* PREEMPT_LOCK also makes BH preemptible */
+ .wait_type_inner = LD_WAIT_CONFIG, /* PREEMPT_RT makes BH preemptible. */
};
EXPORT_SYMBOL_GPL(rcu_bh_lock_map);
@@ -407,6 +446,13 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array,
}
EXPORT_SYMBOL_GPL(__wait_rcu_gp);
+void finish_rcuwait(struct rcuwait *w)
+{
+ rcu_assign_pointer(w->task, NULL);
+ __set_current_state(TASK_RUNNING);
+}
+EXPORT_SYMBOL_GPL(finish_rcuwait);
+
#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
void init_rcu_head(struct rcu_head *head)
{
@@ -499,6 +545,12 @@ EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress);
module_param(rcu_cpu_stall_suppress, int, 0644);
int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT;
module_param(rcu_cpu_stall_timeout, int, 0644);
+int rcu_exp_cpu_stall_timeout __read_mostly = CONFIG_RCU_EXP_CPU_STALL_TIMEOUT;
+module_param(rcu_exp_cpu_stall_timeout, int, 0644);
+int rcu_cpu_stall_cputime __read_mostly = IS_ENABLED(CONFIG_RCU_CPU_STALL_CPUTIME);
+module_param(rcu_cpu_stall_cputime, int, 0644);
+bool rcu_exp_stall_task_details __read_mostly;
+module_param(rcu_exp_stall_task_details, bool, 0644);
#endif /* #ifdef CONFIG_RCU_STALL_COMMON */
// Suppress boot-time RCU CPU stall warnings and rcutorture writer stall
@@ -507,6 +559,19 @@ int rcu_cpu_stall_suppress_at_boot __read_mostly; // !0 = suppress boot stalls.
EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress_at_boot);
module_param(rcu_cpu_stall_suppress_at_boot, int, 0444);
+/**
+ * get_completed_synchronize_rcu - Return a pre-completed polled state cookie
+ *
+ * Returns a value that will always be treated by functions like
+ * poll_state_synchronize_rcu() as a cookie whose grace period has already
+ * completed.
+ */
+unsigned long get_completed_synchronize_rcu(void)
+{
+ return RCU_GET_STATE_COMPLETED;
+}
+EXPORT_SYMBOL_GPL(get_completed_synchronize_rcu);
+
#ifdef CONFIG_PROVE_RCU
/*
@@ -533,14 +598,15 @@ struct early_boot_kfree_rcu {
static void early_boot_test_call_rcu(void)
{
static struct rcu_head head;
+ int idx;
static struct rcu_head shead;
struct early_boot_kfree_rcu *rhp;
+ idx = srcu_down_read(&early_srcu);
+ srcu_up_read(&early_srcu, idx);
call_rcu(&head, test_callback);
- if (IS_ENABLED(CONFIG_SRCU)) {
- early_srcu_cookie = start_poll_synchronize_srcu(&early_srcu);
- call_srcu(&early_srcu, &shead, test_callback);
- }
+ early_srcu_cookie = start_poll_synchronize_srcu(&early_srcu);
+ call_srcu(&early_srcu, &shead, test_callback);
rhp = kmalloc(sizeof(*rhp), GFP_KERNEL);
if (!WARN_ON_ONCE(!rhp))
kfree_rcu(rhp, rh);
@@ -563,11 +629,10 @@ static int rcu_verify_early_boot_tests(void)
if (rcu_self_test) {
early_boot_test_counter++;
rcu_barrier();
- if (IS_ENABLED(CONFIG_SRCU)) {
- early_boot_test_counter++;
- srcu_barrier(&early_srcu);
- WARN_ON_ONCE(!poll_state_synchronize_srcu(&early_srcu, early_srcu_cookie));
- }
+ early_boot_test_counter++;
+ srcu_barrier(&early_srcu);
+ WARN_ON_ONCE(!poll_state_synchronize_srcu(&early_srcu, early_srcu_cookie));
+ cleanup_srcu_struct(&early_srcu);
}
if (rcu_self_test_counter != early_boot_test_counter) {
WARN_ON(1);
diff --git a/kernel/reboot.c b/kernel/reboot.c
index f7440c0c7e43..3bba88c7ffc6 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -23,7 +23,7 @@
* this indicates whether you can reboot with ctrl-alt-del: the default is yes
*/
-int C_A_D = 1;
+static int C_A_D = 1;
struct pid *cad_pid;
EXPORT_SYMBOL(cad_pid);
@@ -33,6 +33,7 @@ EXPORT_SYMBOL(cad_pid);
#define DEFAULT_REBOOT_MODE
#endif
enum reboot_mode reboot_mode DEFAULT_REBOOT_MODE;
+EXPORT_SYMBOL_GPL(reboot_mode);
enum reboot_mode panic_reboot_mode = REBOOT_UNDEFINED;
/*
@@ -47,12 +48,20 @@ int reboot_cpu;
enum reboot_type reboot_type = BOOT_ACPI;
int reboot_force;
+struct sys_off_handler {
+ struct notifier_block nb;
+ int (*sys_off_cb)(struct sys_off_data *data);
+ void *cb_data;
+ enum sys_off_mode mode;
+ bool blocking;
+ void *list;
+};
+
/*
- * If set, this is used for preparing the system to power off.
+ * Temporary stub that prevents linkage failure while we're in process
+ * of removing all uses of legacy pm_power_off() around the kernel.
*/
-
-void (*pm_power_off_prepare)(void);
-EXPORT_SYMBOL_GPL(pm_power_off_prepare);
+void __weak (*pm_power_off)(void);
/**
* emergency_restart - reboot the system
@@ -234,6 +243,17 @@ void migrate_to_reboot_cpu(void)
set_cpus_allowed_ptr(current, cpumask_of(cpu));
}
+/*
+ * Notifier list for kernel code which wants to be called
+ * to prepare system for restart.
+ */
+static BLOCKING_NOTIFIER_HEAD(restart_prep_handler_list);
+
+static void do_kernel_restart_prepare(void)
+{
+ blocking_notifier_call_chain(&restart_prep_handler_list, 0, NULL);
+}
+
/**
* kernel_restart - reboot the system
* @cmd: pointer to buffer containing command to execute for restart
@@ -245,6 +265,7 @@ void migrate_to_reboot_cpu(void)
void kernel_restart(char *cmd)
{
kernel_restart_prepare(cmd);
+ do_kernel_restart_prepare();
migrate_to_reboot_cpu();
syscore_shutdown();
if (!cmd)
@@ -280,6 +301,375 @@ void kernel_halt(void)
}
EXPORT_SYMBOL_GPL(kernel_halt);
+/*
+ * Notifier list for kernel code which wants to be called
+ * to prepare system for power off.
+ */
+static BLOCKING_NOTIFIER_HEAD(power_off_prep_handler_list);
+
+/*
+ * Notifier list for kernel code which wants to be called
+ * to power off system.
+ */
+static ATOMIC_NOTIFIER_HEAD(power_off_handler_list);
+
+static int sys_off_notify(struct notifier_block *nb,
+ unsigned long mode, void *cmd)
+{
+ struct sys_off_handler *handler;
+ struct sys_off_data data = {};
+
+ handler = container_of(nb, struct sys_off_handler, nb);
+ data.cb_data = handler->cb_data;
+ data.mode = mode;
+ data.cmd = cmd;
+
+ return handler->sys_off_cb(&data);
+}
+
+static struct sys_off_handler platform_sys_off_handler;
+
+static struct sys_off_handler *alloc_sys_off_handler(int priority)
+{
+ struct sys_off_handler *handler;
+ gfp_t flags;
+
+ /*
+ * Platforms like m68k can't allocate sys_off handler dynamically
+ * at the early boot time because memory allocator isn't available yet.
+ */
+ if (priority == SYS_OFF_PRIO_PLATFORM) {
+ handler = &platform_sys_off_handler;
+ if (handler->cb_data)
+ return ERR_PTR(-EBUSY);
+ } else {
+ if (system_state > SYSTEM_RUNNING)
+ flags = GFP_ATOMIC;
+ else
+ flags = GFP_KERNEL;
+
+ handler = kzalloc(sizeof(*handler), flags);
+ if (!handler)
+ return ERR_PTR(-ENOMEM);
+ }
+
+ return handler;
+}
+
+static void free_sys_off_handler(struct sys_off_handler *handler)
+{
+ if (handler == &platform_sys_off_handler)
+ memset(handler, 0, sizeof(*handler));
+ else
+ kfree(handler);
+}
+
+/**
+ * register_sys_off_handler - Register sys-off handler
+ * @mode: Sys-off mode
+ * @priority: Handler priority
+ * @callback: Callback function
+ * @cb_data: Callback argument
+ *
+ * Registers system power-off or restart handler that will be invoked
+ * at the step corresponding to the given sys-off mode. Handler's callback
+ * should return NOTIFY_DONE to permit execution of the next handler in
+ * the call chain or NOTIFY_STOP to break the chain (in error case for
+ * example).
+ *
+ * Multiple handlers can be registered at the default priority level.
+ *
+ * Only one handler can be registered at the non-default priority level,
+ * otherwise ERR_PTR(-EBUSY) is returned.
+ *
+ * Returns a new instance of struct sys_off_handler on success, or
+ * an ERR_PTR()-encoded error code otherwise.
+ */
+struct sys_off_handler *
+register_sys_off_handler(enum sys_off_mode mode,
+ int priority,
+ int (*callback)(struct sys_off_data *data),
+ void *cb_data)
+{
+ struct sys_off_handler *handler;
+ int err;
+
+ handler = alloc_sys_off_handler(priority);
+ if (IS_ERR(handler))
+ return handler;
+
+ switch (mode) {
+ case SYS_OFF_MODE_POWER_OFF_PREPARE:
+ handler->list = &power_off_prep_handler_list;
+ handler->blocking = true;
+ break;
+
+ case SYS_OFF_MODE_POWER_OFF:
+ handler->list = &power_off_handler_list;
+ break;
+
+ case SYS_OFF_MODE_RESTART_PREPARE:
+ handler->list = &restart_prep_handler_list;
+ handler->blocking = true;
+ break;
+
+ case SYS_OFF_MODE_RESTART:
+ handler->list = &restart_handler_list;
+ break;
+
+ default:
+ free_sys_off_handler(handler);
+ return ERR_PTR(-EINVAL);
+ }
+
+ handler->nb.notifier_call = sys_off_notify;
+ handler->nb.priority = priority;
+ handler->sys_off_cb = callback;
+ handler->cb_data = cb_data;
+ handler->mode = mode;
+
+ if (handler->blocking) {
+ if (priority == SYS_OFF_PRIO_DEFAULT)
+ err = blocking_notifier_chain_register(handler->list,
+ &handler->nb);
+ else
+ err = blocking_notifier_chain_register_unique_prio(handler->list,
+ &handler->nb);
+ } else {
+ if (priority == SYS_OFF_PRIO_DEFAULT)
+ err = atomic_notifier_chain_register(handler->list,
+ &handler->nb);
+ else
+ err = atomic_notifier_chain_register_unique_prio(handler->list,
+ &handler->nb);
+ }
+
+ if (err) {
+ free_sys_off_handler(handler);
+ return ERR_PTR(err);
+ }
+
+ return handler;
+}
+EXPORT_SYMBOL_GPL(register_sys_off_handler);
+
+/**
+ * unregister_sys_off_handler - Unregister sys-off handler
+ * @handler: Sys-off handler
+ *
+ * Unregisters given sys-off handler.
+ */
+void unregister_sys_off_handler(struct sys_off_handler *handler)
+{
+ int err;
+
+ if (IS_ERR_OR_NULL(handler))
+ return;
+
+ if (handler->blocking)
+ err = blocking_notifier_chain_unregister(handler->list,
+ &handler->nb);
+ else
+ err = atomic_notifier_chain_unregister(handler->list,
+ &handler->nb);
+
+ /* sanity check, shall never happen */
+ WARN_ON(err);
+
+ free_sys_off_handler(handler);
+}
+EXPORT_SYMBOL_GPL(unregister_sys_off_handler);
+
+static void devm_unregister_sys_off_handler(void *data)
+{
+ struct sys_off_handler *handler = data;
+
+ unregister_sys_off_handler(handler);
+}
+
+/**
+ * devm_register_sys_off_handler - Register sys-off handler
+ * @dev: Device that registers handler
+ * @mode: Sys-off mode
+ * @priority: Handler priority
+ * @callback: Callback function
+ * @cb_data: Callback argument
+ *
+ * Registers resource-managed sys-off handler.
+ *
+ * Returns zero on success, or error code on failure.
+ */
+int devm_register_sys_off_handler(struct device *dev,
+ enum sys_off_mode mode,
+ int priority,
+ int (*callback)(struct sys_off_data *data),
+ void *cb_data)
+{
+ struct sys_off_handler *handler;
+
+ handler = register_sys_off_handler(mode, priority, callback, cb_data);
+ if (IS_ERR(handler))
+ return PTR_ERR(handler);
+
+ return devm_add_action_or_reset(dev, devm_unregister_sys_off_handler,
+ handler);
+}
+EXPORT_SYMBOL_GPL(devm_register_sys_off_handler);
+
+/**
+ * devm_register_power_off_handler - Register power-off handler
+ * @dev: Device that registers callback
+ * @callback: Callback function
+ * @cb_data: Callback's argument
+ *
+ * Registers resource-managed sys-off handler with a default priority
+ * and using power-off mode.
+ *
+ * Returns zero on success, or error code on failure.
+ */
+int devm_register_power_off_handler(struct device *dev,
+ int (*callback)(struct sys_off_data *data),
+ void *cb_data)
+{
+ return devm_register_sys_off_handler(dev,
+ SYS_OFF_MODE_POWER_OFF,
+ SYS_OFF_PRIO_DEFAULT,
+ callback, cb_data);
+}
+EXPORT_SYMBOL_GPL(devm_register_power_off_handler);
+
+/**
+ * devm_register_restart_handler - Register restart handler
+ * @dev: Device that registers callback
+ * @callback: Callback function
+ * @cb_data: Callback's argument
+ *
+ * Registers resource-managed sys-off handler with a default priority
+ * and using restart mode.
+ *
+ * Returns zero on success, or error code on failure.
+ */
+int devm_register_restart_handler(struct device *dev,
+ int (*callback)(struct sys_off_data *data),
+ void *cb_data)
+{
+ return devm_register_sys_off_handler(dev,
+ SYS_OFF_MODE_RESTART,
+ SYS_OFF_PRIO_DEFAULT,
+ callback, cb_data);
+}
+EXPORT_SYMBOL_GPL(devm_register_restart_handler);
+
+static struct sys_off_handler *platform_power_off_handler;
+
+static int platform_power_off_notify(struct sys_off_data *data)
+{
+ void (*platform_power_power_off_cb)(void) = data->cb_data;
+
+ platform_power_power_off_cb();
+
+ return NOTIFY_DONE;
+}
+
+/**
+ * register_platform_power_off - Register platform-level power-off callback
+ * @power_off: Power-off callback
+ *
+ * Registers power-off callback that will be called as last step
+ * of the power-off sequence. This callback is expected to be invoked
+ * for the last resort. Only one platform power-off callback is allowed
+ * to be registered at a time.
+ *
+ * Returns zero on success, or error code on failure.
+ */
+int register_platform_power_off(void (*power_off)(void))
+{
+ struct sys_off_handler *handler;
+
+ handler = register_sys_off_handler(SYS_OFF_MODE_POWER_OFF,
+ SYS_OFF_PRIO_PLATFORM,
+ platform_power_off_notify,
+ power_off);
+ if (IS_ERR(handler))
+ return PTR_ERR(handler);
+
+ platform_power_off_handler = handler;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(register_platform_power_off);
+
+/**
+ * unregister_platform_power_off - Unregister platform-level power-off callback
+ * @power_off: Power-off callback
+ *
+ * Unregisters previously registered platform power-off callback.
+ */
+void unregister_platform_power_off(void (*power_off)(void))
+{
+ if (platform_power_off_handler &&
+ platform_power_off_handler->cb_data == power_off) {
+ unregister_sys_off_handler(platform_power_off_handler);
+ platform_power_off_handler = NULL;
+ }
+}
+EXPORT_SYMBOL_GPL(unregister_platform_power_off);
+
+static int legacy_pm_power_off(struct sys_off_data *data)
+{
+ if (pm_power_off)
+ pm_power_off();
+
+ return NOTIFY_DONE;
+}
+
+static void do_kernel_power_off_prepare(void)
+{
+ blocking_notifier_call_chain(&power_off_prep_handler_list, 0, NULL);
+}
+
+/**
+ * do_kernel_power_off - Execute kernel power-off handler call chain
+ *
+ * Expected to be called as last step of the power-off sequence.
+ *
+ * Powers off the system immediately if a power-off handler function has
+ * been registered. Otherwise does nothing.
+ */
+void do_kernel_power_off(void)
+{
+ struct sys_off_handler *sys_off = NULL;
+
+ /*
+ * Register sys-off handlers for legacy PM callback. This allows
+ * legacy PM callbacks temporary co-exist with the new sys-off API.
+ *
+ * TODO: Remove legacy handlers once all legacy PM users will be
+ * switched to the sys-off based APIs.
+ */
+ if (pm_power_off)
+ sys_off = register_sys_off_handler(SYS_OFF_MODE_POWER_OFF,
+ SYS_OFF_PRIO_DEFAULT,
+ legacy_pm_power_off, NULL);
+
+ atomic_notifier_call_chain(&power_off_handler_list, 0, NULL);
+
+ unregister_sys_off_handler(sys_off);
+}
+
+/**
+ * kernel_can_power_off - check whether system can be powered off
+ *
+ * Returns true if power-off handler is registered and system can be
+ * powered off, false otherwise.
+ */
+bool kernel_can_power_off(void)
+{
+ return !atomic_notifier_call_chain_is_empty(&power_off_handler_list) ||
+ pm_power_off;
+}
+EXPORT_SYMBOL_GPL(kernel_can_power_off);
+
/**
* kernel_power_off - power_off the system
*
@@ -288,8 +678,7 @@ EXPORT_SYMBOL_GPL(kernel_halt);
void kernel_power_off(void)
{
kernel_shutdown_prepare(SYSTEM_POWER_OFF);
- if (pm_power_off_prepare)
- pm_power_off_prepare();
+ do_kernel_power_off_prepare();
migrate_to_reboot_cpu();
syscore_shutdown();
pr_emerg("Power down\n");
@@ -339,7 +728,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
/* Instead of trying to make the power_off code look like
* halt when pm_power_off is not set do it the easy way.
*/
- if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off)
+ if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !kernel_can_power_off())
cmd = LINUX_REBOOT_CMD_HALT;
mutex_lock(&system_transition_mutex);
@@ -359,7 +748,6 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
case LINUX_REBOOT_CMD_HALT:
kernel_halt();
do_exit(0);
- panic("cannot halt");
case LINUX_REBOOT_CMD_POWER_OFF:
kernel_power_off();
@@ -417,7 +805,8 @@ void ctrl_alt_del(void)
kill_cad_pid(SIGINT, 1);
}
-char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff";
+#define POWEROFF_CMD_PATH_LEN 256
+static char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff";
static const char reboot_cmd[] = "/sbin/reboot";
static int run_cmd(const char *cmd)
@@ -867,6 +1256,33 @@ static struct attribute *reboot_attrs[] = {
NULL,
};
+#ifdef CONFIG_SYSCTL
+static struct ctl_table kern_reboot_table[] = {
+ {
+ .procname = "poweroff_cmd",
+ .data = &poweroff_cmd,
+ .maxlen = POWEROFF_CMD_PATH_LEN,
+ .mode = 0644,
+ .proc_handler = proc_dostring,
+ },
+ {
+ .procname = "ctrl-alt-del",
+ .data = &C_A_D,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ { }
+};
+
+static void __init kernel_reboot_sysctls_init(void)
+{
+ register_sysctl_init("kernel", kern_reboot_table);
+}
+#else
+#define kernel_reboot_sysctls_init() do { } while (0)
+#endif /* CONFIG_SYSCTL */
+
static const struct attribute_group reboot_attr_group = {
.attrs = reboot_attrs,
};
@@ -886,6 +1302,8 @@ static int __init reboot_ksysfs_init(void)
return ret;
}
+ kernel_reboot_sysctls_init();
+
return 0;
}
late_initcall(reboot_ksysfs_init);
diff --git a/kernel/relay.c b/kernel/relay.c
index d1a67fbb819d..a80fa01042e9 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -60,10 +60,7 @@ static const struct vm_operations_struct relay_file_mmap_ops = {
*/
static struct page **relay_alloc_page_array(unsigned int n_pages)
{
- const size_t pa_size = n_pages * sizeof(struct page *);
- if (pa_size > PAGE_SIZE)
- return vzalloc(pa_size);
- return kzalloc(pa_size, GFP_KERNEL);
+ return kvcalloc(n_pages, sizeof(struct page *), GFP_KERNEL);
}
/*
@@ -94,7 +91,7 @@ static int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma)
return -EINVAL;
vma->vm_ops = &relay_file_mmap_ops;
- vma->vm_flags |= VM_DONTEXPAND;
+ vm_flags_set(vma, VM_DONTEXPAND);
vma->vm_private_data = buf;
return 0;
@@ -151,13 +148,13 @@ static struct rchan_buf *relay_create_buf(struct rchan *chan)
{
struct rchan_buf *buf;
- if (chan->n_subbufs > KMALLOC_MAX_SIZE / sizeof(size_t *))
+ if (chan->n_subbufs > KMALLOC_MAX_SIZE / sizeof(size_t))
return NULL;
buf = kzalloc(sizeof(struct rchan_buf), GFP_KERNEL);
if (!buf)
return NULL;
- buf->padding = kmalloc_array(chan->n_subbufs, sizeof(size_t *),
+ buf->padding = kmalloc_array(chan->n_subbufs, sizeof(size_t),
GFP_KERNEL);
if (!buf->padding)
goto free_buf;
@@ -440,7 +437,7 @@ int relay_prepare_cpu(unsigned int cpu)
mutex_lock(&relay_channels_mutex);
list_for_each_entry(chan, &relay_channels, list) {
- if ((buf = *per_cpu_ptr(chan->buf, cpu)))
+ if (*per_cpu_ptr(chan->buf, cpu))
continue;
buf = relay_open_buf(chan, cpu);
if (!buf) {
@@ -510,7 +507,7 @@ struct rchan *relay_open(const char *base_filename,
chan->private_data = private_data;
if (base_filename) {
chan->has_base_filename = 1;
- strlcpy(chan->base_filename, base_filename, NAME_MAX);
+ strscpy(chan->base_filename, base_filename, NAME_MAX);
}
chan->cb = cb;
kref_init(&chan->kref);
@@ -581,7 +578,7 @@ int relay_late_setup_files(struct rchan *chan,
if (!chan || !base_filename)
return -EINVAL;
- strlcpy(chan->base_filename, base_filename, NAME_MAX);
+ strscpy(chan->base_filename, base_filename, NAME_MAX);
mutex_lock(&relay_channels_mutex);
/* Is chan already set up? */
@@ -992,7 +989,8 @@ static size_t relay_file_read_start_pos(struct rchan_buf *buf)
size_t subbuf_size = buf->chan->subbuf_size;
size_t n_subbufs = buf->chan->n_subbufs;
size_t consumed = buf->subbufs_consumed % n_subbufs;
- size_t read_pos = consumed * subbuf_size + buf->bytes_consumed;
+ size_t read_pos = (consumed * subbuf_size + buf->bytes_consumed)
+ % (n_subbufs * subbuf_size);
read_subbuf = read_pos / subbuf_size;
padding = buf->padding[read_subbuf];
diff --git a/kernel/resource.c b/kernel/resource.c
index ca9f5198a01f..b1763b2fd7ef 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -56,14 +56,6 @@ struct resource_constraint {
static DEFINE_RWLOCK(resource_lock);
-/*
- * For memory hotplug, there is no way to free resource entries allocated
- * by boot mem after the system is up. So for reusing the resource entry
- * we need to remember the resource.
- */
-static struct resource *bootmem_resource_free;
-static DEFINE_SPINLOCK(bootmem_resource_lock);
-
static struct resource *next_resource(struct resource *p)
{
if (p->child)
@@ -73,6 +65,18 @@ static struct resource *next_resource(struct resource *p)
return p->sibling;
}
+static struct resource *next_resource_skip_children(struct resource *p)
+{
+ while (!p->sibling && p->parent)
+ p = p->parent;
+ return p->sibling;
+}
+
+#define for_each_resource(_root, _p, _skip_children) \
+ for ((_p) = (_root)->child; (_p); \
+ (_p) = (_skip_children) ? next_resource_skip_children(_p) : \
+ next_resource(_p))
+
static void *r_next(struct seq_file *m, void *v, loff_t *pos)
{
struct resource *p = v;
@@ -87,7 +91,7 @@ enum { MAX_IORES_LEVEL = 5 };
static void *r_start(struct seq_file *m, loff_t *pos)
__acquires(resource_lock)
{
- struct resource *p = PDE_DATA(file_inode(m->file));
+ struct resource *p = pde_data(file_inode(m->file));
loff_t l = 0;
read_lock(&resource_lock);
for (p = p->child; p && l < *pos; p = r_next(m, p, &l))
@@ -103,7 +107,7 @@ static void r_stop(struct seq_file *m, void *v)
static int r_show(struct seq_file *m, void *v)
{
- struct resource *root = PDE_DATA(file_inode(m->file));
+ struct resource *root = pde_data(file_inode(m->file));
struct resource *r = v, *p;
unsigned long long start, end;
int width = root->end < 0x10000 ? 4 : 8;
@@ -148,36 +152,19 @@ __initcall(ioresources_init);
static void free_resource(struct resource *res)
{
- if (!res)
- return;
-
- if (!PageSlab(virt_to_head_page(res))) {
- spin_lock(&bootmem_resource_lock);
- res->sibling = bootmem_resource_free;
- bootmem_resource_free = res;
- spin_unlock(&bootmem_resource_lock);
- } else {
+ /**
+ * If the resource was allocated using memblock early during boot
+ * we'll leak it here: we can only return full pages back to the
+ * buddy and trying to be smart and reusing them eventually in
+ * alloc_resource() overcomplicates resource handling.
+ */
+ if (res && PageSlab(virt_to_head_page(res)))
kfree(res);
- }
}
static struct resource *alloc_resource(gfp_t flags)
{
- struct resource *res = NULL;
-
- spin_lock(&bootmem_resource_lock);
- if (bootmem_resource_free) {
- res = bootmem_resource_free;
- bootmem_resource_free = res->sibling;
- }
- spin_unlock(&bootmem_resource_lock);
-
- if (res)
- memset(res, 0, sizeof(struct resource));
- else
- res = kzalloc(sizeof(struct resource), flags);
-
- return res;
+ return kzalloc(sizeof(struct resource), flags);
}
/* Return the conflict entry if you can't request it */
@@ -502,8 +489,9 @@ int __weak page_is_ram(unsigned long pfn)
}
EXPORT_SYMBOL_GPL(page_is_ram);
-static int __region_intersects(resource_size_t start, size_t size,
- unsigned long flags, unsigned long desc)
+static int __region_intersects(struct resource *parent, resource_size_t start,
+ size_t size, unsigned long flags,
+ unsigned long desc)
{
struct resource res;
int type = 0; int other = 0;
@@ -512,7 +500,7 @@ static int __region_intersects(resource_size_t start, size_t size,
res.start = start;
res.end = start + size - 1;
- for (p = iomem_resource.child; p ; p = p->sibling) {
+ for (p = parent->child; p ; p = p->sibling) {
bool is_type = (((p->flags & flags) == flags) &&
((desc == IORES_DESC_NONE) ||
(desc == p->desc)));
@@ -556,7 +544,7 @@ int region_intersects(resource_size_t start, size_t size, unsigned long flags,
int ret;
read_lock(&resource_lock);
- ret = __region_intersects(start, size, flags, desc);
+ ret = __region_intersects(&iomem_resource, start, size, flags, desc);
read_unlock(&resource_lock);
return ret;
@@ -900,10 +888,17 @@ void insert_resource_expand_to_fit(struct resource *root, struct resource *new)
if (conflict->end > new->end)
new->end = conflict->end;
- printk("Expanded resource %s due to conflict with %s\n", new->name, conflict->name);
+ pr_info("Expanded resource %s due to conflict with %s\n", new->name, conflict->name);
}
write_unlock(&resource_lock);
}
+/*
+ * Not for general consumption, only early boot memory map parsing, PCI
+ * resource discovery, and late discovery of CXL resources are expected
+ * to use this interface. The former are built-in and only the latter,
+ * CXL, is a module.
+ */
+EXPORT_SYMBOL_NS_GPL(insert_resource_expand_to_fit, CXL);
/**
* remove_resource - Remove a resource in the resource tree
@@ -1288,9 +1283,7 @@ void __release_region(struct resource *parent, resource_size_t start,
write_unlock(&resource_lock);
- printk(KERN_WARNING "Trying to free nonexistent resource "
- "<%016llx-%016llx>\n", (unsigned long long)start,
- (unsigned long long)end);
+ pr_warn("Trying to free nonexistent resource <%pa-%pa>\n", &start, &end);
}
EXPORT_SYMBOL(__release_region);
@@ -1350,20 +1343,6 @@ retry:
continue;
}
- /*
- * All memory regions added from memory-hotplug path have the
- * flag IORESOURCE_SYSTEM_RAM. If the resource does not have
- * this flag, we know that we are dealing with a resource coming
- * from HMM/devm. HMM/devm use another mechanism to add/release
- * a resource. This goes via devm_request_mem_region and
- * devm_release_mem_region.
- * HMM/devm take care to release their resources when they want,
- * so if we are dealing with them, let us just back off here.
- */
- if (!(res->flags & IORESOURCE_SYSRAM)) {
- break;
- }
-
if (!(res->flags & IORESOURCE_MEM))
break;
@@ -1663,6 +1642,7 @@ __setup("reserve=", reserve_setup);
int iomem_map_sanity_check(resource_size_t addr, unsigned long size)
{
struct resource *p = &iomem_resource;
+ resource_size_t end = addr + size - 1;
int err = 0;
loff_t l;
@@ -1672,12 +1652,12 @@ int iomem_map_sanity_check(resource_size_t addr, unsigned long size)
* We can probably skip the resources without
* IORESOURCE_IO attribute?
*/
- if (p->start >= addr + size)
+ if (p->start > end)
continue;
if (p->end < addr)
continue;
if (PFN_DOWN(p->start) <= PFN_DOWN(addr) &&
- PFN_DOWN(p->end) >= PFN_DOWN(addr + size - 1))
+ PFN_DOWN(p->end) >= PFN_DOWN(end))
continue;
/*
* if a resource is "BUSY", it's not a hardware resource
@@ -1688,10 +1668,8 @@ int iomem_map_sanity_check(resource_size_t addr, unsigned long size)
if (p->flags & IORESOURCE_BUSY)
continue;
- printk(KERN_WARNING "resource sanity check: requesting [mem %#010llx-%#010llx], which spans more than %s %pR\n",
- (unsigned long long)addr,
- (unsigned long long)(addr + size - 1),
- p->name, p);
+ pr_warn("resource sanity check: requesting [mem %pa-%pa], which spans more than %s %pR\n",
+ &addr, &end, p->name, p);
err = -1;
break;
}
@@ -1707,37 +1685,46 @@ static int strict_iomem_checks;
#endif
/*
- * check if an address is reserved in the iomem resource tree
- * returns true if reserved, false if not reserved.
+ * Check if an address is exclusive to the kernel and must not be mapped to
+ * user space, for example, via /dev/mem.
+ *
+ * Returns true if exclusive to the kernel, otherwise returns false.
*/
-bool iomem_is_exclusive(u64 addr)
+bool resource_is_exclusive(struct resource *root, u64 addr, resource_size_t size)
{
- struct resource *p = &iomem_resource;
- bool err = false;
- loff_t l;
- int size = PAGE_SIZE;
-
- if (!strict_iomem_checks)
- return false;
-
- addr = addr & PAGE_MASK;
+ const unsigned int exclusive_system_ram = IORESOURCE_SYSTEM_RAM |
+ IORESOURCE_EXCLUSIVE;
+ bool skip_children = false, err = false;
+ struct resource *p;
read_lock(&resource_lock);
- for (p = p->child; p ; p = r_next(NULL, p, &l)) {
- /*
- * We can probably skip the resources without
- * IORESOURCE_IO attribute?
- */
+ for_each_resource(root, p, skip_children) {
if (p->start >= addr + size)
break;
- if (p->end < addr)
+ if (p->end < addr) {
+ skip_children = true;
continue;
+ }
+ skip_children = false;
+
+ /*
+ * IORESOURCE_SYSTEM_RAM resources are exclusive if
+ * IORESOURCE_EXCLUSIVE is set, even if they
+ * are not busy and even if "iomem=relaxed" is set. The
+ * responsible driver dynamically adds/removes system RAM within
+ * such an area and uncontrolled access is dangerous.
+ */
+ if ((p->flags & exclusive_system_ram) == exclusive_system_ram) {
+ err = true;
+ break;
+ }
+
/*
* A resource is exclusive if IORESOURCE_EXCLUSIVE is set
* or CONFIG_IO_STRICT_DEVMEM is enabled and the
* resource is busy.
*/
- if ((p->flags & IORESOURCE_BUSY) == 0)
+ if (!strict_iomem_checks || !(p->flags & IORESOURCE_BUSY))
continue;
if (IS_ENABLED(CONFIG_IO_STRICT_DEVMEM)
|| p->flags & IORESOURCE_EXCLUSIVE) {
@@ -1750,6 +1737,12 @@ bool iomem_is_exclusive(u64 addr)
return err;
}
+bool iomem_is_exclusive(u64 addr)
+{
+ return resource_is_exclusive(&iomem_resource, addr & PAGE_MASK,
+ PAGE_SIZE);
+}
+
struct resource_entry *resource_list_create_entry(struct resource *res,
size_t extra_size)
{
@@ -1774,62 +1767,139 @@ void resource_list_free(struct list_head *head)
}
EXPORT_SYMBOL(resource_list_free);
-#ifdef CONFIG_DEVICE_PRIVATE
-static struct resource *__request_free_mem_region(struct device *dev,
- struct resource *base, unsigned long size, const char *name)
+#ifdef CONFIG_GET_FREE_REGION
+#define GFR_DESCENDING (1UL << 0)
+#define GFR_REQUEST_REGION (1UL << 1)
+#define GFR_DEFAULT_ALIGN (1UL << PA_SECTION_SHIFT)
+
+static resource_size_t gfr_start(struct resource *base, resource_size_t size,
+ resource_size_t align, unsigned long flags)
+{
+ if (flags & GFR_DESCENDING) {
+ resource_size_t end;
+
+ end = min_t(resource_size_t, base->end,
+ (1ULL << MAX_PHYSMEM_BITS) - 1);
+ return end - size + 1;
+ }
+
+ return ALIGN(base->start, align);
+}
+
+static bool gfr_continue(struct resource *base, resource_size_t addr,
+ resource_size_t size, unsigned long flags)
+{
+ if (flags & GFR_DESCENDING)
+ return addr > size && addr >= base->start;
+ /*
+ * In the ascend case be careful that the last increment by
+ * @size did not wrap 0.
+ */
+ return addr > addr - size &&
+ addr <= min_t(resource_size_t, base->end,
+ (1ULL << MAX_PHYSMEM_BITS) - 1);
+}
+
+static resource_size_t gfr_next(resource_size_t addr, resource_size_t size,
+ unsigned long flags)
{
- resource_size_t end, addr;
+ if (flags & GFR_DESCENDING)
+ return addr - size;
+ return addr + size;
+}
+
+static void remove_free_mem_region(void *_res)
+{
+ struct resource *res = _res;
+
+ if (res->parent)
+ remove_resource(res);
+ free_resource(res);
+}
+
+static struct resource *
+get_free_mem_region(struct device *dev, struct resource *base,
+ resource_size_t size, const unsigned long align,
+ const char *name, const unsigned long desc,
+ const unsigned long flags)
+{
+ resource_size_t addr;
struct resource *res;
struct region_devres *dr = NULL;
- size = ALIGN(size, 1UL << PA_SECTION_SHIFT);
- end = min_t(unsigned long, base->end, (1UL << MAX_PHYSMEM_BITS) - 1);
- addr = end - size + 1UL;
+ size = ALIGN(size, align);
res = alloc_resource(GFP_KERNEL);
if (!res)
return ERR_PTR(-ENOMEM);
- if (dev) {
+ if (dev && (flags & GFR_REQUEST_REGION)) {
dr = devres_alloc(devm_region_release,
sizeof(struct region_devres), GFP_KERNEL);
if (!dr) {
free_resource(res);
return ERR_PTR(-ENOMEM);
}
+ } else if (dev) {
+ if (devm_add_action_or_reset(dev, remove_free_mem_region, res))
+ return ERR_PTR(-ENOMEM);
}
write_lock(&resource_lock);
- for (; addr > size && addr >= base->start; addr -= size) {
- if (__region_intersects(addr, size, 0, IORES_DESC_NONE) !=
- REGION_DISJOINT)
+ for (addr = gfr_start(base, size, align, flags);
+ gfr_continue(base, addr, size, flags);
+ addr = gfr_next(addr, size, flags)) {
+ if (__region_intersects(base, addr, size, 0, IORES_DESC_NONE) !=
+ REGION_DISJOINT)
continue;
- if (__request_region_locked(res, &iomem_resource, addr, size,
- name, 0))
- break;
+ if (flags & GFR_REQUEST_REGION) {
+ if (__request_region_locked(res, &iomem_resource, addr,
+ size, name, 0))
+ break;
- if (dev) {
- dr->parent = &iomem_resource;
- dr->start = addr;
- dr->n = size;
- devres_add(dev, dr);
- }
+ if (dev) {
+ dr->parent = &iomem_resource;
+ dr->start = addr;
+ dr->n = size;
+ devres_add(dev, dr);
+ }
- res->desc = IORES_DESC_DEVICE_PRIVATE_MEMORY;
- write_unlock(&resource_lock);
+ res->desc = desc;
+ write_unlock(&resource_lock);
+
+
+ /*
+ * A driver is claiming this region so revoke any
+ * mappings.
+ */
+ revoke_iomem(res);
+ } else {
+ res->start = addr;
+ res->end = addr + size - 1;
+ res->name = name;
+ res->desc = desc;
+ res->flags = IORESOURCE_MEM;
+
+ /*
+ * Only succeed if the resource hosts an exclusive
+ * range after the insert
+ */
+ if (__insert_resource(base, res) || res->child)
+ break;
+
+ write_unlock(&resource_lock);
+ }
- /*
- * A driver is claiming this region so revoke any mappings.
- */
- revoke_iomem(res);
return res;
}
write_unlock(&resource_lock);
- free_resource(res);
- if (dr)
+ if (flags & GFR_REQUEST_REGION) {
+ free_resource(res);
devres_free(dr);
+ } else if (dev)
+ devm_release_action(dev, remove_free_mem_region, res);
return ERR_PTR(-ERANGE);
}
@@ -1848,18 +1918,48 @@ static struct resource *__request_free_mem_region(struct device *dev,
struct resource *devm_request_free_mem_region(struct device *dev,
struct resource *base, unsigned long size)
{
- return __request_free_mem_region(dev, base, size, dev_name(dev));
+ unsigned long flags = GFR_DESCENDING | GFR_REQUEST_REGION;
+
+ return get_free_mem_region(dev, base, size, GFR_DEFAULT_ALIGN,
+ dev_name(dev),
+ IORES_DESC_DEVICE_PRIVATE_MEMORY, flags);
}
EXPORT_SYMBOL_GPL(devm_request_free_mem_region);
struct resource *request_free_mem_region(struct resource *base,
unsigned long size, const char *name)
{
- return __request_free_mem_region(NULL, base, size, name);
+ unsigned long flags = GFR_DESCENDING | GFR_REQUEST_REGION;
+
+ return get_free_mem_region(NULL, base, size, GFR_DEFAULT_ALIGN, name,
+ IORES_DESC_DEVICE_PRIVATE_MEMORY, flags);
}
EXPORT_SYMBOL_GPL(request_free_mem_region);
-#endif /* CONFIG_DEVICE_PRIVATE */
+/**
+ * alloc_free_mem_region - find a free region relative to @base
+ * @base: resource that will parent the new resource
+ * @size: size in bytes of memory to allocate from @base
+ * @align: alignment requirements for the allocation
+ * @name: resource name
+ *
+ * Buses like CXL, that can dynamically instantiate new memory regions,
+ * need a method to allocate physical address space for those regions.
+ * Allocate and insert a new resource to cover a free, unclaimed by a
+ * descendant of @base, range in the span of @base.
+ */
+struct resource *alloc_free_mem_region(struct resource *base,
+ unsigned long size, unsigned long align,
+ const char *name)
+{
+ /* Default of ascending direction and insert resource */
+ unsigned long flags = 0;
+
+ return get_free_mem_region(NULL, base, size, align, name,
+ IORES_DESC_NONE, flags);
+}
+EXPORT_SYMBOL_NS_GPL(alloc_free_mem_region, CXL);
+#endif /* CONFIG_GET_FREE_REGION */
static int __init strict_iomem(char *str)
{
diff --git a/kernel/rseq.c b/kernel/rseq.c
index 35f7bd0fced0..9de6e35fe679 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -18,8 +18,12 @@
#define CREATE_TRACE_POINTS
#include <trace/events/rseq.h>
-#define RSEQ_CS_PREEMPT_MIGRATE_FLAGS (RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE | \
- RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT)
+/* The original rseq structure size (including padding) is 32 bytes. */
+#define ORIG_RSEQ_SIZE 32
+
+#define RSEQ_CS_NO_RESTART_FLAGS (RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT | \
+ RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL | \
+ RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE)
/*
*
@@ -81,15 +85,25 @@
* F1. <failure>
*/
-static int rseq_update_cpu_id(struct task_struct *t)
+static int rseq_update_cpu_node_id(struct task_struct *t)
{
- u32 cpu_id = raw_smp_processor_id();
struct rseq __user *rseq = t->rseq;
+ u32 cpu_id = raw_smp_processor_id();
+ u32 node_id = cpu_to_node(cpu_id);
+ u32 mm_cid = task_mm_cid(t);
- if (!user_write_access_begin(rseq, sizeof(*rseq)))
+ WARN_ON_ONCE((int) mm_cid < 0);
+ if (!user_write_access_begin(rseq, t->rseq_len))
goto efault;
unsafe_put_user(cpu_id, &rseq->cpu_id_start, efault_end);
unsafe_put_user(cpu_id, &rseq->cpu_id, efault_end);
+ unsafe_put_user(node_id, &rseq->node_id, efault_end);
+ unsafe_put_user(mm_cid, &rseq->mm_cid, efault_end);
+ /*
+ * Additional feature fields added after ORIG_RSEQ_SIZE
+ * need to be conditionally updated only if
+ * t->rseq_len != ORIG_RSEQ_SIZE.
+ */
user_write_access_end();
trace_rseq_update(t);
return 0;
@@ -100,9 +114,10 @@ efault:
return -EFAULT;
}
-static int rseq_reset_rseq_cpu_id(struct task_struct *t)
+static int rseq_reset_rseq_cpu_node_id(struct task_struct *t)
{
- u32 cpu_id_start = 0, cpu_id = RSEQ_CPU_ID_UNINITIALIZED;
+ u32 cpu_id_start = 0, cpu_id = RSEQ_CPU_ID_UNINITIALIZED, node_id = 0,
+ mm_cid = 0;
/*
* Reset cpu_id_start to its initial state (0).
@@ -116,6 +131,21 @@ static int rseq_reset_rseq_cpu_id(struct task_struct *t)
*/
if (put_user(cpu_id, &t->rseq->cpu_id))
return -EFAULT;
+ /*
+ * Reset node_id to its initial state (0).
+ */
+ if (put_user(node_id, &t->rseq->node_id))
+ return -EFAULT;
+ /*
+ * Reset mm_cid to its initial state (0).
+ */
+ if (put_user(mm_cid, &t->rseq->mm_cid))
+ return -EFAULT;
+ /*
+ * Additional feature fields added after ORIG_RSEQ_SIZE
+ * need to be conditionally reset only if
+ * t->rseq_len != ORIG_RSEQ_SIZE.
+ */
return 0;
}
@@ -128,10 +158,10 @@ static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs)
int ret;
#ifdef CONFIG_64BIT
- if (get_user(ptr, &t->rseq->rseq_cs.ptr64))
+ if (get_user(ptr, &t->rseq->rseq_cs))
return -EFAULT;
#else
- if (copy_from_user(&ptr, &t->rseq->rseq_cs.ptr64, sizeof(ptr)))
+ if (copy_from_user(&ptr, &t->rseq->rseq_cs, sizeof(ptr)))
return -EFAULT;
#endif
if (!ptr) {
@@ -170,28 +200,35 @@ static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs)
return 0;
}
+static bool rseq_warn_flags(const char *str, u32 flags)
+{
+ u32 test_flags;
+
+ if (!flags)
+ return false;
+ test_flags = flags & RSEQ_CS_NO_RESTART_FLAGS;
+ if (test_flags)
+ pr_warn_once("Deprecated flags (%u) in %s ABI structure", test_flags, str);
+ test_flags = flags & ~RSEQ_CS_NO_RESTART_FLAGS;
+ if (test_flags)
+ pr_warn_once("Unknown flags (%u) in %s ABI structure", test_flags, str);
+ return true;
+}
+
static int rseq_need_restart(struct task_struct *t, u32 cs_flags)
{
u32 flags, event_mask;
int ret;
+ if (rseq_warn_flags("rseq_cs", cs_flags))
+ return -EINVAL;
+
/* Get thread flags. */
ret = get_user(flags, &t->rseq->flags);
if (ret)
return ret;
- /* Take critical section flags into account. */
- flags |= cs_flags;
-
- /*
- * Restart on signal can only be inhibited when restart on
- * preempt and restart on migrate are inhibited too. Otherwise,
- * a preempted signal handler could fail to restart the prior
- * execution context on sigreturn.
- */
- if (unlikely((flags & RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL) &&
- (flags & RSEQ_CS_PREEMPT_MIGRATE_FLAGS) !=
- RSEQ_CS_PREEMPT_MIGRATE_FLAGS))
+ if (rseq_warn_flags("rseq", flags))
return -EINVAL;
/*
@@ -203,7 +240,7 @@ static int rseq_need_restart(struct task_struct *t, u32 cs_flags)
t->rseq_event_mask = 0;
preempt_enable();
- return !!(event_mask & ~flags);
+ return !!event_mask;
}
static int clear_rseq_cs(struct task_struct *t)
@@ -217,9 +254,9 @@ static int clear_rseq_cs(struct task_struct *t)
* Set rseq_cs to NULL.
*/
#ifdef CONFIG_64BIT
- return put_user(0UL, &t->rseq->rseq_cs.ptr64);
+ return put_user(0UL, &t->rseq->rseq_cs);
#else
- if (clear_user(&t->rseq->rseq_cs.ptr64, sizeof(t->rseq->rseq_cs.ptr64)))
+ if (clear_user(&t->rseq->rseq_cs, sizeof(t->rseq->rseq_cs)))
return -EFAULT;
return 0;
#endif
@@ -282,10 +319,18 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs)
if (unlikely(t->flags & PF_EXITING))
return;
- ret = rseq_ip_fixup(regs);
- if (unlikely(ret < 0))
- goto error;
- if (unlikely(rseq_update_cpu_id(t)))
+
+ /*
+ * regs is NULL if and only if the caller is in a syscall path. Skip
+ * fixup and leave rseq_cs as is so that rseq_sycall() will detect and
+ * kill a misbehaving userspace on debug kernels.
+ */
+ if (regs) {
+ ret = rseq_ip_fixup(regs);
+ if (unlikely(ret < 0))
+ goto error;
+ }
+ if (unlikely(rseq_update_cpu_node_id(t)))
goto error;
return;
@@ -328,15 +373,16 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,
/* Unregister rseq for current thread. */
if (current->rseq != rseq || !current->rseq)
return -EINVAL;
- if (rseq_len != sizeof(*rseq))
+ if (rseq_len != current->rseq_len)
return -EINVAL;
if (current->rseq_sig != sig)
return -EPERM;
- ret = rseq_reset_rseq_cpu_id(current);
+ ret = rseq_reset_rseq_cpu_node_id(current);
if (ret)
return ret;
current->rseq = NULL;
current->rseq_sig = 0;
+ current->rseq_len = 0;
return 0;
}
@@ -349,7 +395,7 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,
* the provided address differs from the prior
* one.
*/
- if (current->rseq != rseq || rseq_len != sizeof(*rseq))
+ if (current->rseq != rseq || rseq_len != current->rseq_len)
return -EINVAL;
if (current->rseq_sig != sig)
return -EPERM;
@@ -358,15 +404,24 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,
}
/*
- * If there was no rseq previously registered,
- * ensure the provided rseq is properly aligned and valid.
+ * If there was no rseq previously registered, ensure the provided rseq
+ * is properly aligned, as communcated to user-space through the ELF
+ * auxiliary vector AT_RSEQ_ALIGN. If rseq_len is the original rseq
+ * size, the required alignment is the original struct rseq alignment.
+ *
+ * In order to be valid, rseq_len is either the original rseq size, or
+ * large enough to contain all supported fields, as communicated to
+ * user-space through the ELF auxiliary vector AT_RSEQ_FEATURE_SIZE.
*/
- if (!IS_ALIGNED((unsigned long)rseq, __alignof__(*rseq)) ||
- rseq_len != sizeof(*rseq))
+ if (rseq_len < ORIG_RSEQ_SIZE ||
+ (rseq_len == ORIG_RSEQ_SIZE && !IS_ALIGNED((unsigned long)rseq, ORIG_RSEQ_SIZE)) ||
+ (rseq_len != ORIG_RSEQ_SIZE && (!IS_ALIGNED((unsigned long)rseq, __alignof__(*rseq)) ||
+ rseq_len < offsetof(struct rseq, end))))
return -EINVAL;
if (!access_ok(rseq, rseq_len))
return -EFAULT;
current->rseq = rseq;
+ current->rseq_len = rseq_len;
current->rseq_sig = sig;
/*
* If rseq was previously inactive, and has just been
diff --git a/kernel/scftorture.c b/kernel/scftorture.c
index 29e8fc5d91a7..5d113aa59e77 100644
--- a/kernel/scftorture.c
+++ b/kernel/scftorture.c
@@ -38,14 +38,10 @@
#define SCFTORT_STRING "scftorture"
#define SCFTORT_FLAG SCFTORT_STRING ": "
-#define SCFTORTOUT(s, x...) \
- pr_alert(SCFTORT_FLAG s, ## x)
-
#define VERBOSE_SCFTORTOUT(s, x...) \
- do { if (verbose) pr_alert(SCFTORT_FLAG s, ## x); } while (0)
+ do { if (verbose) pr_alert(SCFTORT_FLAG s "\n", ## x); } while (0)
-#define VERBOSE_SCFTORTOUT_ERRSTRING(s, x...) \
- do { if (verbose) pr_alert(SCFTORT_FLAG "!!! " s, ## x); } while (0)
+#define SCFTORTOUT_ERRSTRING(s, x...) pr_alert(SCFTORT_FLAG "!!! " s "\n", ## x)
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Paul E. McKenney <paulmck@kernel.org>");
@@ -64,6 +60,7 @@ torture_param(bool, use_cpus_read_lock, 0, "Use cpus_read_lock() to exclude CPU
torture_param(int, verbose, 0, "Enable verbose debugging printk()s");
torture_param(int, weight_resched, -1, "Testing weight for resched_cpu() operations.");
torture_param(int, weight_single, -1, "Testing weight for single-CPU no-wait operations.");
+torture_param(int, weight_single_rpc, -1, "Testing weight for single-CPU RPC operations.");
torture_param(int, weight_single_wait, -1, "Testing weight for single-CPU operations.");
torture_param(int, weight_many, -1, "Testing weight for multi-CPU no-wait operations.");
torture_param(int, weight_many_wait, -1, "Testing weight for multi-CPU operations.");
@@ -86,6 +83,8 @@ struct scf_statistics {
long long n_resched;
long long n_single;
long long n_single_ofl;
+ long long n_single_rpc;
+ long long n_single_rpc_ofl;
long long n_single_wait;
long long n_single_wait_ofl;
long long n_many;
@@ -101,14 +100,17 @@ static DEFINE_PER_CPU(long long, scf_invoked_count);
// Data for random primitive selection
#define SCF_PRIM_RESCHED 0
#define SCF_PRIM_SINGLE 1
-#define SCF_PRIM_MANY 2
-#define SCF_PRIM_ALL 3
-#define SCF_NPRIMS 7 // Need wait and no-wait versions of each,
- // except for SCF_PRIM_RESCHED.
+#define SCF_PRIM_SINGLE_RPC 2
+#define SCF_PRIM_MANY 3
+#define SCF_PRIM_ALL 4
+#define SCF_NPRIMS 8 // Need wait and no-wait versions of each,
+ // except for SCF_PRIM_RESCHED and
+ // SCF_PRIM_SINGLE_RPC.
static char *scf_prim_name[] = {
"resched_cpu",
"smp_call_function_single",
+ "smp_call_function_single_rpc",
"smp_call_function_many",
"smp_call_function",
};
@@ -128,6 +130,8 @@ struct scf_check {
bool scfc_out;
int scfc_cpu; // -1 for not _single().
bool scfc_wait;
+ bool scfc_rpc;
+ struct completion scfc_completion;
};
// Use to wait for all threads to start.
@@ -158,6 +162,7 @@ static void scf_torture_stats_print(void)
scfs.n_resched += scf_stats_p[i].n_resched;
scfs.n_single += scf_stats_p[i].n_single;
scfs.n_single_ofl += scf_stats_p[i].n_single_ofl;
+ scfs.n_single_rpc += scf_stats_p[i].n_single_rpc;
scfs.n_single_wait += scf_stats_p[i].n_single_wait;
scfs.n_single_wait_ofl += scf_stats_p[i].n_single_wait_ofl;
scfs.n_many += scf_stats_p[i].n_many;
@@ -168,9 +173,10 @@ static void scf_torture_stats_print(void)
if (atomic_read(&n_errs) || atomic_read(&n_mb_in_errs) ||
atomic_read(&n_mb_out_errs) || atomic_read(&n_alloc_errs))
bangstr = "!!! ";
- pr_alert("%s %sscf_invoked_count %s: %lld resched: %lld single: %lld/%lld single_ofl: %lld/%lld many: %lld/%lld all: %lld/%lld ",
+ pr_alert("%s %sscf_invoked_count %s: %lld resched: %lld single: %lld/%lld single_ofl: %lld/%lld single_rpc: %lld single_rpc_ofl: %lld many: %lld/%lld all: %lld/%lld ",
SCFTORT_FLAG, bangstr, isdone ? "VER" : "ver", invoked_count, scfs.n_resched,
scfs.n_single, scfs.n_single_wait, scfs.n_single_ofl, scfs.n_single_wait_ofl,
+ scfs.n_single_rpc, scfs.n_single_rpc_ofl,
scfs.n_many, scfs.n_many_wait, scfs.n_all, scfs.n_all_wait);
torture_onoff_stats();
pr_cont("ste: %d stnmie: %d stnmoe: %d staf: %d\n", atomic_read(&n_errs),
@@ -261,9 +267,10 @@ static void scf_handler(void *scfc_in)
}
this_cpu_inc(scf_invoked_count);
if (longwait <= 0) {
- if (!(r & 0xffc0))
+ if (!(r & 0xffc0)) {
udelay(r & 0x3f);
- goto out;
+ goto out;
+ }
}
if (r & 0xfff)
goto out;
@@ -282,10 +289,13 @@ static void scf_handler(void *scfc_in)
out:
if (unlikely(!scfcp))
return;
- if (scfcp->scfc_wait)
+ if (scfcp->scfc_wait) {
WRITE_ONCE(scfcp->scfc_out, true);
- else
+ if (scfcp->scfc_rpc)
+ complete(&scfcp->scfc_completion);
+ } else {
kfree(scfcp);
+ }
}
// As above, but check for correct CPU.
@@ -319,6 +329,7 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra
scfcp->scfc_cpu = -1;
scfcp->scfc_wait = scfsp->scfs_wait;
scfcp->scfc_out = false;
+ scfcp->scfc_rpc = false;
}
}
switch (scfsp->scfs_prim) {
@@ -327,6 +338,7 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra
cpu = torture_random(trsp) % nr_cpu_ids;
scfp->n_resched++;
resched_cpu(cpu);
+ this_cpu_inc(scf_invoked_count);
}
break;
case SCF_PRIM_SINGLE:
@@ -350,6 +362,34 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra
scfcp = NULL;
}
break;
+ case SCF_PRIM_SINGLE_RPC:
+ if (!scfcp)
+ break;
+ cpu = torture_random(trsp) % nr_cpu_ids;
+ scfp->n_single_rpc++;
+ scfcp->scfc_cpu = cpu;
+ scfcp->scfc_wait = true;
+ init_completion(&scfcp->scfc_completion);
+ scfcp->scfc_rpc = true;
+ barrier(); // Prevent race-reduction compiler optimizations.
+ scfcp->scfc_in = true;
+ ret = smp_call_function_single(cpu, scf_handler_1, (void *)scfcp, 0);
+ if (!ret) {
+ if (use_cpus_read_lock)
+ cpus_read_unlock();
+ else
+ preempt_enable();
+ wait_for_completion(&scfcp->scfc_completion);
+ if (use_cpus_read_lock)
+ cpus_read_lock();
+ else
+ preempt_disable();
+ } else {
+ scfp->n_single_rpc_ofl++;
+ kfree(scfcp);
+ scfcp = NULL;
+ }
+ break;
case SCF_PRIM_MANY:
if (scfsp->scfs_wait)
scfp->n_many_wait++;
@@ -379,10 +419,12 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra
}
if (scfcp && scfsp->scfs_wait) {
if (WARN_ON_ONCE((num_online_cpus() > 1 || scfsp->scfs_prim == SCF_PRIM_SINGLE) &&
- !scfcp->scfc_out))
+ !scfcp->scfc_out)) {
+ pr_warn("%s: Memory-ordering failure, scfs_prim: %d.\n", __func__, scfsp->scfs_prim);
atomic_inc(&n_mb_out_errs); // Leak rather than trash!
- else
+ } else {
kfree(scfcp);
+ }
barrier(); // Prevent race-reduction compiler optimizations.
}
if (use_cpus_read_lock)
@@ -453,8 +495,8 @@ static void
scftorture_print_module_parms(const char *tag)
{
pr_alert(SCFTORT_FLAG
- "--- %s: verbose=%d holdoff=%d longwait=%d nthreads=%d onoff_holdoff=%d onoff_interval=%d shutdown_secs=%d stat_interval=%d stutter=%d use_cpus_read_lock=%d, weight_resched=%d, weight_single=%d, weight_single_wait=%d, weight_many=%d, weight_many_wait=%d, weight_all=%d, weight_all_wait=%d\n", tag,
- verbose, holdoff, longwait, nthreads, onoff_holdoff, onoff_interval, shutdown, stat_interval, stutter, use_cpus_read_lock, weight_resched, weight_single, weight_single_wait, weight_many, weight_many_wait, weight_all, weight_all_wait);
+ "--- %s: verbose=%d holdoff=%d longwait=%d nthreads=%d onoff_holdoff=%d onoff_interval=%d shutdown_secs=%d stat_interval=%d stutter=%d use_cpus_read_lock=%d, weight_resched=%d, weight_single=%d, weight_single_rpc=%d, weight_single_wait=%d, weight_many=%d, weight_many_wait=%d, weight_all=%d, weight_all_wait=%d\n", tag,
+ verbose, holdoff, longwait, nthreads, onoff_holdoff, onoff_interval, shutdown, stat_interval, stutter, use_cpus_read_lock, weight_resched, weight_single, weight_single_rpc, weight_single_wait, weight_many, weight_many_wait, weight_all, weight_all_wait);
}
static void scf_cleanup_handler(void *unused)
@@ -469,7 +511,7 @@ static void scf_torture_cleanup(void)
return;
WRITE_ONCE(scfdone, true);
- if (nthreads)
+ if (nthreads && scf_stats_p)
for (i = 0; i < nthreads; i++)
torture_stop_kthread("scftorture_invoker", scf_stats_p[i].task);
else
@@ -497,6 +539,7 @@ static int __init scf_torture_init(void)
int firsterr = 0;
unsigned long weight_resched1 = weight_resched;
unsigned long weight_single1 = weight_single;
+ unsigned long weight_single_rpc1 = weight_single_rpc;
unsigned long weight_single_wait1 = weight_single_wait;
unsigned long weight_many1 = weight_many;
unsigned long weight_many_wait1 = weight_many_wait;
@@ -508,21 +551,25 @@ static int __init scf_torture_init(void)
scftorture_print_module_parms("Start of test");
- if (weight_resched == -1 && weight_single == -1 && weight_single_wait == -1 &&
- weight_many == -1 && weight_many_wait == -1 &&
- weight_all == -1 && weight_all_wait == -1) {
- weight_resched1 = 2 * nr_cpu_ids;
- weight_single1 = 2 * nr_cpu_ids;
- weight_single_wait1 = 2 * nr_cpu_ids;
- weight_many1 = 2;
- weight_many_wait1 = 2;
- weight_all1 = 1;
- weight_all_wait1 = 1;
+ if (weight_resched <= 0 &&
+ weight_single <= 0 && weight_single_rpc <= 0 && weight_single_wait <= 0 &&
+ weight_many <= 0 && weight_many_wait <= 0 &&
+ weight_all <= 0 && weight_all_wait <= 0) {
+ weight_resched1 = weight_resched == 0 ? 0 : 2 * nr_cpu_ids;
+ weight_single1 = weight_single == 0 ? 0 : 2 * nr_cpu_ids;
+ weight_single_rpc1 = weight_single_rpc == 0 ? 0 : 2 * nr_cpu_ids;
+ weight_single_wait1 = weight_single_wait == 0 ? 0 : 2 * nr_cpu_ids;
+ weight_many1 = weight_many == 0 ? 0 : 2;
+ weight_many_wait1 = weight_many_wait == 0 ? 0 : 2;
+ weight_all1 = weight_all == 0 ? 0 : 1;
+ weight_all_wait1 = weight_all_wait == 0 ? 0 : 1;
} else {
if (weight_resched == -1)
weight_resched1 = 0;
if (weight_single == -1)
weight_single1 = 0;
+ if (weight_single_rpc == -1)
+ weight_single_rpc1 = 0;
if (weight_single_wait == -1)
weight_single_wait1 = 0;
if (weight_many == -1)
@@ -534,18 +581,19 @@ static int __init scf_torture_init(void)
if (weight_all_wait == -1)
weight_all_wait1 = 0;
}
- if (weight_single1 == 0 && weight_single_wait1 == 0 &&
- weight_many1 == 0 && weight_many_wait1 == 0 &&
+ if (weight_resched1 == 0 && weight_single1 == 0 && weight_single_rpc1 == 0 &&
+ weight_single_wait1 == 0 && weight_many1 == 0 && weight_many_wait1 == 0 &&
weight_all1 == 0 && weight_all_wait1 == 0) {
- VERBOSE_SCFTORTOUT_ERRSTRING("all zero weights makes no sense");
+ SCFTORTOUT_ERRSTRING("all zero weights makes no sense");
firsterr = -EINVAL;
goto unwind;
}
if (IS_BUILTIN(CONFIG_SCF_TORTURE_TEST))
scf_sel_add(weight_resched1, SCF_PRIM_RESCHED, false);
else if (weight_resched1)
- VERBOSE_SCFTORTOUT_ERRSTRING("built as module, weight_resched ignored");
+ SCFTORTOUT_ERRSTRING("built as module, weight_resched ignored");
scf_sel_add(weight_single1, SCF_PRIM_SINGLE, false);
+ scf_sel_add(weight_single_rpc1, SCF_PRIM_SINGLE_RPC, true);
scf_sel_add(weight_single_wait1, SCF_PRIM_SINGLE, true);
scf_sel_add(weight_many1, SCF_PRIM_MANY, false);
scf_sel_add(weight_many_wait1, SCF_PRIM_MANY, true);
@@ -555,17 +603,17 @@ static int __init scf_torture_init(void)
if (onoff_interval > 0) {
firsterr = torture_onoff_init(onoff_holdoff * HZ, onoff_interval, NULL);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
}
if (shutdown_secs > 0) {
firsterr = torture_shutdown_init(shutdown_secs, scf_torture_cleanup);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
}
if (stutter > 0) {
firsterr = torture_stutter_init(stutter, stutter);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
}
@@ -574,24 +622,24 @@ static int __init scf_torture_init(void)
nthreads = num_online_cpus();
scf_stats_p = kcalloc(nthreads, sizeof(scf_stats_p[0]), GFP_KERNEL);
if (!scf_stats_p) {
- VERBOSE_SCFTORTOUT_ERRSTRING("out of memory");
+ SCFTORTOUT_ERRSTRING("out of memory");
firsterr = -ENOMEM;
goto unwind;
}
- VERBOSE_SCFTORTOUT("Starting %d smp_call_function() threads\n", nthreads);
+ VERBOSE_SCFTORTOUT("Starting %d smp_call_function() threads", nthreads);
atomic_set(&n_started, nthreads);
for (i = 0; i < nthreads; i++) {
scf_stats_p[i].cpu = i;
firsterr = torture_create_kthread(scftorture_invoker, (void *)&scf_stats_p[i],
scf_stats_p[i].task);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
}
if (stat_interval > 0) {
firsterr = torture_create_kthread(scf_torture_stats, NULL, scf_torture_stats_task);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
}
@@ -601,6 +649,10 @@ static int __init scf_torture_init(void)
unwind:
torture_init_end();
scf_torture_cleanup();
+ if (shutdown_secs) {
+ WARN_ON(!IS_MODULE(CONFIG_SCF_TORTURE_TEST));
+ kernel_power_off();
+ }
return firsterr;
}
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 978fcfca5871..976092b7bd45 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -1,17 +1,17 @@
# SPDX-License-Identifier: GPL-2.0
-ifdef CONFIG_FUNCTION_TRACER
-CFLAGS_REMOVE_clock.o = $(CC_FLAGS_FTRACE)
-endif
+
+# The compilers are complaining about unused variables inside an if(0) scope
+# block. This is daft, shut them up.
+ccflags-y += $(call cc-disable-warning, unused-but-set-variable)
# These files are disabled because they produce non-interesting flaky coverage
# that is not a function of syscall inputs. E.g. involuntary context switches.
KCOV_INSTRUMENT := n
-# There are numerous data races here, however, most of them are due to plain accesses.
-# This would make it even harder for syzbot to find reproducers, because these
-# bugs trigger without specific input. Disable by default, but should re-enable
-# eventually.
+# Disable KCSAN to avoid excessive noise and performance degradation. To avoid
+# false positives ensure barriers implied by sched functions are instrumented.
KCSAN_SANITIZE := n
+KCSAN_INSTRUMENT_BARRIERS := y
ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
@@ -22,18 +22,13 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
endif
-obj-y += core.o loadavg.o clock.o cputime.o
-obj-y += idle.o fair.o rt.o deadline.o
-obj-y += wait.o wait_bit.o swait.o completion.o
-
-obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o
-obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o
-obj-$(CONFIG_SCHEDSTATS) += stats.o
-obj-$(CONFIG_SCHED_DEBUG) += debug.o
-obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
-obj-$(CONFIG_CPU_FREQ) += cpufreq.o
-obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o
-obj-$(CONFIG_MEMBARRIER) += membarrier.o
-obj-$(CONFIG_CPU_ISOLATION) += isolation.o
-obj-$(CONFIG_PSI) += psi.o
-obj-$(CONFIG_SCHED_CORE) += core_sched.o
+#
+# Build efficiency:
+#
+# These compilation units have roughly the same size and complexity - so their
+# build parallelizes well and finishes roughly at once:
+#
+obj-y += core.o
+obj-y += fair.o
+obj-y += build_policy.o
+obj-y += build_utility.o
diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c
index 2067080bb235..991fc9002535 100644
--- a/kernel/sched/autogroup.c
+++ b/kernel/sched/autogroup.c
@@ -1,20 +1,42 @@
// SPDX-License-Identifier: GPL-2.0
+
/*
* Auto-group scheduling implementation:
*/
-#include <linux/nospec.h>
-#include "sched.h"
unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1;
static struct autogroup autogroup_default;
static atomic_t autogroup_seq_nr;
+#ifdef CONFIG_SYSCTL
+static struct ctl_table sched_autogroup_sysctls[] = {
+ {
+ .procname = "sched_autogroup_enabled",
+ .data = &sysctl_sched_autogroup_enabled,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ {}
+};
+
+static void __init sched_autogroup_sysctl_init(void)
+{
+ register_sysctl_init("kernel", sched_autogroup_sysctls);
+}
+#else
+#define sched_autogroup_sysctl_init() do { } while (0)
+#endif
+
void __init autogroup_init(struct task_struct *init_task)
{
autogroup_default.tg = &root_task_group;
kref_init(&autogroup_default.kref);
init_rwsem(&autogroup_default.lock);
init_task->signal->autogroup = &autogroup_default;
+ sched_autogroup_sysctl_init();
}
void autogroup_free(struct task_group *tg)
@@ -31,7 +53,7 @@ static inline void autogroup_destroy(struct kref *kref)
ag->tg->rt_se = NULL;
ag->tg->rt_rq = NULL;
#endif
- sched_offline_group(ag->tg);
+ sched_release_group(ag->tg);
sched_destroy_group(ag->tg);
}
@@ -139,7 +161,8 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag)
struct task_struct *t;
unsigned long flags;
- BUG_ON(!lock_task_sighand(p, &flags));
+ if (WARN_ON_ONCE(!lock_task_sighand(p, &flags)))
+ return;
prev = p->signal->autogroup;
if (prev == ag) {
diff --git a/kernel/sched/autogroup.h b/kernel/sched/autogroup.h
index b96419974a1f..90d69f2c5eaf 100644
--- a/kernel/sched/autogroup.h
+++ b/kernel/sched/autogroup.h
@@ -1,4 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _KERNEL_SCHED_AUTOGROUP_H
+#define _KERNEL_SCHED_AUTOGROUP_H
+
#ifdef CONFIG_SCHED_AUTOGROUP
struct autogroup {
@@ -27,6 +30,7 @@ extern bool task_wants_autogroup(struct task_struct *p, struct task_group *tg);
static inline struct task_group *
autogroup_task_group(struct task_struct *p, struct task_group *tg)
{
+ extern unsigned int sysctl_sched_autogroup_enabled;
int enabled = READ_ONCE(sysctl_sched_autogroup_enabled);
if (enabled && task_wants_autogroup(p, tg))
@@ -58,3 +62,5 @@ static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
}
#endif /* CONFIG_SCHED_AUTOGROUP */
+
+#endif /* _KERNEL_SCHED_AUTOGROUP_H */
diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c
new file mode 100644
index 000000000000..d9dc9ab3773f
--- /dev/null
+++ b/kernel/sched/build_policy.c
@@ -0,0 +1,54 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * These are the scheduling policy related scheduler files, built
+ * in a single compilation unit for build efficiency reasons.
+ *
+ * ( Incidentally, the size of the compilation unit is roughly
+ * comparable to core.c and fair.c, the other two big
+ * compilation units. This helps balance build time, while
+ * coalescing source files to amortize header inclusion
+ * cost. )
+ *
+ * core.c and fair.c are built separately.
+ */
+
+/* Headers: */
+#include <linux/sched/clock.h>
+#include <linux/sched/cputime.h>
+#include <linux/sched/hotplug.h>
+#include <linux/sched/posix-timers.h>
+#include <linux/sched/rt.h>
+
+#include <linux/cpuidle.h>
+#include <linux/jiffies.h>
+#include <linux/livepatch.h>
+#include <linux/psi.h>
+#include <linux/seqlock_api.h>
+#include <linux/slab.h>
+#include <linux/suspend.h>
+#include <linux/tsacct_kern.h>
+#include <linux/vtime.h>
+
+#include <uapi/linux/sched/types.h>
+
+#include "sched.h"
+#include "smp.h"
+
+#include "autogroup.h"
+#include "stats.h"
+#include "pelt.h"
+
+/* Source code modules: */
+
+#include "idle.c"
+
+#include "rt.c"
+
+#ifdef CONFIG_SMP
+# include "cpudeadline.c"
+# include "pelt.c"
+#endif
+
+#include "cputime.c"
+#include "deadline.c"
+
diff --git a/kernel/sched/build_utility.c b/kernel/sched/build_utility.c
new file mode 100644
index 000000000000..99bdd96f454f
--- /dev/null
+++ b/kernel/sched/build_utility.c
@@ -0,0 +1,110 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * These are various utility functions of the scheduler,
+ * built in a single compilation unit for build efficiency reasons.
+ *
+ * ( Incidentally, the size of the compilation unit is roughly
+ * comparable to core.c, fair.c, smp.c and policy.c, the other
+ * big compilation units. This helps balance build time, while
+ * coalescing source files to amortize header inclusion
+ * cost. )
+ */
+#include <linux/sched/clock.h>
+#include <linux/sched/cputime.h>
+#include <linux/sched/debug.h>
+#include <linux/sched/isolation.h>
+#include <linux/sched/loadavg.h>
+#include <linux/sched/nohz.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/rseq_api.h>
+#include <linux/sched/task_stack.h>
+
+#include <linux/cpufreq.h>
+#include <linux/cpumask_api.h>
+#include <linux/cpuset.h>
+#include <linux/ctype.h>
+#include <linux/debugfs.h>
+#include <linux/energy_model.h>
+#include <linux/hashtable_api.h>
+#include <linux/irq.h>
+#include <linux/kobject_api.h>
+#include <linux/membarrier.h>
+#include <linux/mempolicy.h>
+#include <linux/nmi.h>
+#include <linux/nospec.h>
+#include <linux/proc_fs.h>
+#include <linux/psi.h>
+#include <linux/psi.h>
+#include <linux/ptrace_api.h>
+#include <linux/sched_clock.h>
+#include <linux/security.h>
+#include <linux/spinlock_api.h>
+#include <linux/swait_api.h>
+#include <linux/timex.h>
+#include <linux/utsname.h>
+#include <linux/wait_api.h>
+#include <linux/workqueue_api.h>
+
+#include <uapi/linux/prctl.h>
+#include <uapi/linux/sched/types.h>
+
+#include <asm/switch_to.h>
+
+#include "sched.h"
+#include "sched-pelt.h"
+#include "stats.h"
+#include "autogroup.h"
+
+#include "clock.c"
+
+#ifdef CONFIG_CGROUP_CPUACCT
+# include "cpuacct.c"
+#endif
+
+#ifdef CONFIG_CPU_FREQ
+# include "cpufreq.c"
+#endif
+
+#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
+# include "cpufreq_schedutil.c"
+#endif
+
+#ifdef CONFIG_SCHED_DEBUG
+# include "debug.c"
+#endif
+
+#ifdef CONFIG_SCHEDSTATS
+# include "stats.c"
+#endif
+
+#include "loadavg.c"
+#include "completion.c"
+#include "swait.c"
+#include "wait_bit.c"
+#include "wait.c"
+
+#ifdef CONFIG_SMP
+# include "cpupri.c"
+# include "stop_task.c"
+# include "topology.c"
+#endif
+
+#ifdef CONFIG_SCHED_CORE
+# include "core_sched.c"
+#endif
+
+#ifdef CONFIG_PSI
+# include "psi.c"
+#endif
+
+#ifdef CONFIG_MEMBARRIER
+# include "membarrier.c"
+#endif
+
+#ifdef CONFIG_CPU_ISOLATION
+# include "isolation.c"
+#endif
+
+#ifdef CONFIG_SCHED_AUTOGROUP
+# include "autogroup.c"
+#endif
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index c2b2859ddd82..b5cc2b53464d 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -53,15 +53,13 @@
* that is otherwise invisible (TSC gets stopped).
*
*/
-#include "sched.h"
-#include <linux/sched_clock.h>
/*
* Scheduler clock - returns current time in nanosec units.
* This is default implementation.
* Architectures and sub-architectures can override this.
*/
-unsigned long long __weak sched_clock(void)
+notrace unsigned long long __weak sched_clock(void)
{
return (unsigned long long)(jiffies - INITIAL_JIFFIES)
* (NSEC_PER_SEC / HZ);
@@ -95,28 +93,28 @@ struct sched_clock_data {
static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data);
-static inline struct sched_clock_data *this_scd(void)
+static __always_inline struct sched_clock_data *this_scd(void)
{
return this_cpu_ptr(&sched_clock_data);
}
-static inline struct sched_clock_data *cpu_sdc(int cpu)
+notrace static inline struct sched_clock_data *cpu_sdc(int cpu)
{
return &per_cpu(sched_clock_data, cpu);
}
-int sched_clock_stable(void)
+notrace int sched_clock_stable(void)
{
return static_branch_likely(&__sched_clock_stable);
}
-static void __scd_stamp(struct sched_clock_data *scd)
+notrace static void __scd_stamp(struct sched_clock_data *scd)
{
scd->tick_gtod = ktime_get_ns();
scd->tick_raw = sched_clock();
}
-static void __set_sched_clock_stable(void)
+notrace static void __set_sched_clock_stable(void)
{
struct sched_clock_data *scd;
@@ -151,7 +149,7 @@ static void __set_sched_clock_stable(void)
* The only way to fully avoid random clock jumps is to boot with:
* "tsc=unstable".
*/
-static void __sched_clock_work(struct work_struct *work)
+notrace static void __sched_clock_work(struct work_struct *work)
{
struct sched_clock_data *scd;
int cpu;
@@ -177,7 +175,7 @@ static void __sched_clock_work(struct work_struct *work)
static DECLARE_WORK(sched_clock_work, __sched_clock_work);
-static void __clear_sched_clock_stable(void)
+notrace static void __clear_sched_clock_stable(void)
{
if (!sched_clock_stable())
return;
@@ -186,7 +184,7 @@ static void __clear_sched_clock_stable(void)
schedule_work(&sched_clock_work);
}
-void clear_sched_clock_stable(void)
+notrace void clear_sched_clock_stable(void)
{
__sched_clock_stable_early = 0;
@@ -196,7 +194,7 @@ void clear_sched_clock_stable(void)
__clear_sched_clock_stable();
}
-static void __sched_clock_gtod_offset(void)
+notrace static void __sched_clock_gtod_offset(void)
{
struct sched_clock_data *scd = this_scd();
@@ -246,12 +244,12 @@ late_initcall(sched_clock_init_late);
* min, max except they take wrapping into account
*/
-static inline u64 wrap_min(u64 x, u64 y)
+static __always_inline u64 wrap_min(u64 x, u64 y)
{
return (s64)(x - y) < 0 ? x : y;
}
-static inline u64 wrap_max(u64 x, u64 y)
+static __always_inline u64 wrap_max(u64 x, u64 y)
{
return (s64)(x - y) > 0 ? x : y;
}
@@ -262,7 +260,7 @@ static inline u64 wrap_max(u64 x, u64 y)
* - filter out backward motion
* - use the GTOD tick value to create a window to filter crazy TSC values
*/
-static u64 sched_clock_local(struct sched_clock_data *scd)
+static __always_inline u64 sched_clock_local(struct sched_clock_data *scd)
{
u64 now, clock, old_clock, min_clock, max_clock, gtod;
s64 delta;
@@ -289,13 +287,31 @@ again:
clock = wrap_max(clock, min_clock);
clock = wrap_min(clock, max_clock);
- if (cmpxchg64(&scd->clock, old_clock, clock) != old_clock)
+ if (!arch_try_cmpxchg64(&scd->clock, &old_clock, clock))
goto again;
return clock;
}
-static u64 sched_clock_remote(struct sched_clock_data *scd)
+noinstr u64 local_clock(void)
+{
+ u64 clock;
+
+ if (static_branch_likely(&__sched_clock_stable))
+ return sched_clock() + __sched_clock_offset;
+
+ if (!static_branch_likely(&sched_clock_running))
+ return sched_clock();
+
+ preempt_disable_notrace();
+ clock = sched_clock_local(this_scd());
+ preempt_enable_notrace();
+
+ return clock;
+}
+EXPORT_SYMBOL_GPL(local_clock);
+
+static notrace u64 sched_clock_remote(struct sched_clock_data *scd)
{
struct sched_clock_data *my_scd = this_scd();
u64 this_clock, remote_clock;
@@ -351,7 +367,7 @@ again:
val = remote_clock;
}
- if (cmpxchg64(ptr, old_val, val) != old_val)
+ if (!try_cmpxchg64(ptr, &old_val, val))
goto again;
return val;
@@ -362,7 +378,7 @@ again:
*
* See cpu_clock().
*/
-u64 sched_clock_cpu(int cpu)
+notrace u64 sched_clock_cpu(int cpu)
{
struct sched_clock_data *scd;
u64 clock;
@@ -386,7 +402,7 @@ u64 sched_clock_cpu(int cpu)
}
EXPORT_SYMBOL_GPL(sched_clock_cpu);
-void sched_clock_tick(void)
+notrace void sched_clock_tick(void)
{
struct sched_clock_data *scd;
@@ -403,7 +419,7 @@ void sched_clock_tick(void)
sched_clock_local(scd);
}
-void sched_clock_tick_stable(void)
+notrace void sched_clock_tick_stable(void)
{
if (!sched_clock_stable())
return;
@@ -423,7 +439,7 @@ void sched_clock_tick_stable(void)
/*
* We are going deep-idle (irqs are disabled):
*/
-void sched_clock_idle_sleep_event(void)
+notrace void sched_clock_idle_sleep_event(void)
{
sched_clock_cpu(smp_processor_id());
}
@@ -432,7 +448,7 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);
/*
* We just idled; resync with ktime.
*/
-void sched_clock_idle_wakeup_event(void)
+notrace void sched_clock_idle_wakeup_event(void)
{
unsigned long flags;
@@ -458,7 +474,7 @@ void __init sched_clock_init(void)
local_irq_enable();
}
-u64 sched_clock_cpu(int cpu)
+notrace u64 sched_clock_cpu(int cpu)
{
if (!static_branch_likely(&sched_clock_running))
return 0;
@@ -476,7 +492,7 @@ u64 sched_clock_cpu(int cpu)
* On bare metal this function should return the same as local_clock.
* Architectures and sub-architectures can override this.
*/
-u64 __weak running_clock(void)
+notrace u64 __weak running_clock(void)
{
return local_clock();
}
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index a778554f9dad..d57a5c1c1cd9 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -1,4 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
+
/*
* Generic wait-for-completion handler;
*
@@ -11,7 +12,6 @@
* typically be used for exclusion which gives rise to priority inversion.
* Waiting for completion is a typically sync point, but not an exclusion point.
*/
-#include "sched.h"
/**
* complete: - signals a single thread waiting on this completion
@@ -204,6 +204,7 @@ EXPORT_SYMBOL(wait_for_completion_io_timeout);
int __sched wait_for_completion_interruptible(struct completion *x)
{
long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
+
if (t == -ERESTARTSYS)
return t;
return 0;
@@ -241,12 +242,23 @@ EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
int __sched wait_for_completion_killable(struct completion *x)
{
long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
+
if (t == -ERESTARTSYS)
return t;
return 0;
}
EXPORT_SYMBOL(wait_for_completion_killable);
+int __sched wait_for_completion_state(struct completion *x, unsigned int state)
+{
+ long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, state);
+
+ if (t == -ERESTARTSYS)
+ return t;
+ return 0;
+}
+EXPORT_SYMBOL(wait_for_completion_state);
+
/**
* wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable))
* @x: holds the state of this particular completion
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2d9ff40f4661..a68d1276bab0 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6,26 +6,98 @@
*
* Copyright (C) 1991-2002 Linus Torvalds
*/
+#include <linux/highmem.h>
+#include <linux/hrtimer_api.h>
+#include <linux/ktime_api.h>
+#include <linux/sched/signal.h>
+#include <linux/syscalls_api.h>
+#include <linux/debug_locks.h>
+#include <linux/prefetch.h>
+#include <linux/capability.h>
+#include <linux/pgtable_api.h>
+#include <linux/wait_bit.h>
+#include <linux/jiffies.h>
+#include <linux/spinlock_api.h>
+#include <linux/cpumask_api.h>
+#include <linux/lockdep_api.h>
+#include <linux/hardirq.h>
+#include <linux/softirq.h>
+#include <linux/refcount_api.h>
+#include <linux/topology.h>
+#include <linux/sched/clock.h>
+#include <linux/sched/cond_resched.h>
+#include <linux/sched/cputime.h>
+#include <linux/sched/debug.h>
+#include <linux/sched/hotplug.h>
+#include <linux/sched/init.h>
+#include <linux/sched/isolation.h>
+#include <linux/sched/loadavg.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/nohz.h>
+#include <linux/sched/rseq_api.h>
+#include <linux/sched/rt.h>
+
+#include <linux/blkdev.h>
+#include <linux/context_tracking.h>
+#include <linux/cpuset.h>
+#include <linux/delayacct.h>
+#include <linux/init_task.h>
+#include <linux/interrupt.h>
+#include <linux/ioprio.h>
+#include <linux/kallsyms.h>
+#include <linux/kcov.h>
+#include <linux/kprobes.h>
+#include <linux/llist_api.h>
+#include <linux/mmu_context.h>
+#include <linux/mmzone.h>
+#include <linux/mutex_api.h>
+#include <linux/nmi.h>
+#include <linux/nospec.h>
+#include <linux/perf_event_api.h>
+#include <linux/profile.h>
+#include <linux/psi.h>
+#include <linux/rcuwait_api.h>
+#include <linux/sched/wake_q.h>
+#include <linux/scs.h>
+#include <linux/slab.h>
+#include <linux/syscalls.h>
+#include <linux/vtime.h>
+#include <linux/wait_api.h>
+#include <linux/workqueue_api.h>
+
+#ifdef CONFIG_PREEMPT_DYNAMIC
+# ifdef CONFIG_GENERIC_ENTRY
+# include <linux/entry-common.h>
+# endif
+#endif
+
+#include <uapi/linux/sched/types.h>
+
+#include <asm/irq_regs.h>
+#include <asm/switch_to.h>
+#include <asm/tlb.h>
+
#define CREATE_TRACE_POINTS
+#include <linux/sched/rseq_api.h>
#include <trace/events/sched.h>
+#include <trace/events/ipi.h>
#undef CREATE_TRACE_POINTS
#include "sched.h"
+#include "stats.h"
+#include "autogroup.h"
-#include <linux/nospec.h>
-
-#include <linux/kcov.h>
-#include <linux/scs.h>
-
-#include <asm/switch_to.h>
-#include <asm/tlb.h>
+#include "autogroup.h"
+#include "pelt.h"
+#include "smp.h"
+#include "stats.h"
#include "../workqueue_internal.h"
-#include "../../fs/io-wq.h"
+#include "../../io_uring/io-wq.h"
#include "../smpboot.h"
-#include "pelt.h"
-#include "smp.h"
+EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_send_cpu);
+EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_send_cpumask);
/*
* Export tracepoints that act as a bare tracehook (ie: have no trace event
@@ -36,6 +108,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_rt_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_thermal_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_cpu_capacity_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp);
@@ -74,13 +147,7 @@ __read_mostly int sysctl_resched_latency_warn_once = 1;
* Number of tasks to iterate in a single balance run.
* Limited because this is done with IRQs disabled.
*/
-const_debug unsigned int sysctl_sched_nr_migrate = 32;
-
-/*
- * period over which we measure -rt task CPU usage in us.
- * default: 1s
- */
-unsigned int sysctl_sched_rt_period = 1000000;
+const_debug unsigned int sysctl_sched_nr_migrate = SCHED_NR_MIGRATE_BREAK;
__read_mostly int scheduler_running;
@@ -89,7 +156,7 @@ __read_mostly int scheduler_running;
DEFINE_STATIC_KEY_FALSE(__sched_core_enabled);
/* kernel prio, less is more */
-static inline int __task_prio(struct task_struct *p)
+static inline int __task_prio(const struct task_struct *p)
{
if (p->sched_class == &stop_sched_class) /* trumps deadline */
return -2;
@@ -111,7 +178,8 @@ static inline int __task_prio(struct task_struct *p)
*/
/* real prio, less is less */
-static inline bool prio_less(struct task_struct *a, struct task_struct *b, bool in_fi)
+static inline bool prio_less(const struct task_struct *a,
+ const struct task_struct *b, bool in_fi)
{
int pa = __task_prio(a), pb = __task_prio(b);
@@ -131,7 +199,8 @@ static inline bool prio_less(struct task_struct *a, struct task_struct *b, bool
return false;
}
-static inline bool __sched_core_less(struct task_struct *a, struct task_struct *b)
+static inline bool __sched_core_less(const struct task_struct *a,
+ const struct task_struct *b)
{
if (a->core_cookie < b->core_cookie)
return true;
@@ -140,7 +209,7 @@ static inline bool __sched_core_less(struct task_struct *a, struct task_struct *
return false;
/* flip prio, so high prio is leftmost */
- if (prio_less(b, a, task_rq(a)->core->core_forceidle))
+ if (prio_less(b, a, !!task_rq(a)->core->core_forceidle_count))
return true;
return false;
@@ -177,47 +246,70 @@ void sched_core_enqueue(struct rq *rq, struct task_struct *p)
rb_add(&p->core_node, &rq->core_tree, rb_sched_core_less);
}
-void sched_core_dequeue(struct rq *rq, struct task_struct *p)
+void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags)
{
rq->core->core_task_seq++;
- if (!sched_core_enqueued(p))
- return;
+ if (sched_core_enqueued(p)) {
+ rb_erase(&p->core_node, &rq->core_tree);
+ RB_CLEAR_NODE(&p->core_node);
+ }
- rb_erase(&p->core_node, &rq->core_tree);
- RB_CLEAR_NODE(&p->core_node);
+ /*
+ * Migrating the last task off the cpu, with the cpu in forced idle
+ * state. Reschedule to create an accounting edge for forced idle,
+ * and re-examine whether the core is still in forced idle state.
+ */
+ if (!(flags & DEQUEUE_SAVE) && rq->nr_running == 1 &&
+ rq->core->core_forceidle_count && rq->curr == rq->idle)
+ resched_curr(rq);
}
-/*
- * Find left-most (aka, highest priority) task matching @cookie.
- */
-static struct task_struct *sched_core_find(struct rq *rq, unsigned long cookie)
+static int sched_task_is_throttled(struct task_struct *p, int cpu)
{
- struct rb_node *node;
-
- node = rb_find_first((void *)cookie, &rq->core_tree, rb_sched_core_cmp);
- /*
- * The idle task always matches any cookie!
- */
- if (!node)
- return idle_sched_class.pick_task(rq);
+ if (p->sched_class->task_is_throttled)
+ return p->sched_class->task_is_throttled(p, cpu);
- return __node_2_sc(node);
+ return 0;
}
static struct task_struct *sched_core_next(struct task_struct *p, unsigned long cookie)
{
struct rb_node *node = &p->core_node;
+ int cpu = task_cpu(p);
+
+ do {
+ node = rb_next(node);
+ if (!node)
+ return NULL;
- node = rb_next(node);
+ p = __node_2_sc(node);
+ if (p->core_cookie != cookie)
+ return NULL;
+
+ } while (sched_task_is_throttled(p, cpu));
+
+ return p;
+}
+
+/*
+ * Find left-most (aka, highest priority) and unthrottled task matching @cookie.
+ * If no suitable task is found, NULL will be returned.
+ */
+static struct task_struct *sched_core_find(struct rq *rq, unsigned long cookie)
+{
+ struct task_struct *p;
+ struct rb_node *node;
+
+ node = rb_find_first((void *)cookie, &rq->core_tree, rb_sched_core_cmp);
if (!node)
return NULL;
- p = container_of(node, struct task_struct, core_node);
- if (p->core_cookie != cookie)
- return NULL;
+ p = __node_2_sc(node);
+ if (!sched_task_is_throttled(p, rq->cpu))
+ return p;
- return p;
+ return sched_core_next(p, cookie);
}
/*
@@ -237,9 +329,30 @@ static DEFINE_MUTEX(sched_core_mutex);
static atomic_t sched_core_count;
static struct cpumask sched_core_mask;
+static void sched_core_lock(int cpu, unsigned long *flags)
+{
+ const struct cpumask *smt_mask = cpu_smt_mask(cpu);
+ int t, i = 0;
+
+ local_irq_save(*flags);
+ for_each_cpu(t, smt_mask)
+ raw_spin_lock_nested(&cpu_rq(t)->__lock, i++);
+}
+
+static void sched_core_unlock(int cpu, unsigned long *flags)
+{
+ const struct cpumask *smt_mask = cpu_smt_mask(cpu);
+ int t;
+
+ for_each_cpu(t, smt_mask)
+ raw_spin_unlock(&cpu_rq(t)->__lock);
+ local_irq_restore(*flags);
+}
+
static void __sched_core_flip(bool enabled)
{
- int cpu, t, i;
+ unsigned long flags;
+ int cpu, t;
cpus_read_lock();
@@ -250,19 +363,14 @@ static void __sched_core_flip(bool enabled)
for_each_cpu(cpu, &sched_core_mask) {
const struct cpumask *smt_mask = cpu_smt_mask(cpu);
- i = 0;
- local_irq_disable();
- for_each_cpu(t, smt_mask) {
- /* supports up to SMT8 */
- raw_spin_lock_nested(&cpu_rq(t)->__lock, i++);
- }
+ sched_core_lock(cpu, &flags);
for_each_cpu(t, smt_mask)
cpu_rq(t)->core_enabled = enabled;
- for_each_cpu(t, smt_mask)
- raw_spin_unlock(&cpu_rq(t)->__lock);
- local_irq_enable();
+ cpu_rq(cpu)->core->core_forceidle_start = 0;
+
+ sched_core_unlock(cpu, &flags);
cpumask_andnot(&sched_core_mask, &sched_core_mask, smt_mask);
}
@@ -270,10 +378,7 @@ static void __sched_core_flip(bool enabled)
/*
* Toggle the offline CPUs.
*/
- cpumask_copy(&sched_core_mask, cpu_possible_mask);
- cpumask_andnot(&sched_core_mask, &sched_core_mask, cpu_online_mask);
-
- for_each_cpu(cpu, &sched_core_mask)
+ for_each_cpu_andnot(cpu, cpu_possible_mask, cpu_online_mask)
cpu_rq(cpu)->core_enabled = enabled;
cpus_read_unlock();
@@ -346,18 +451,12 @@ void sched_core_put(void)
#else /* !CONFIG_SCHED_CORE */
static inline void sched_core_enqueue(struct rq *rq, struct task_struct *p) { }
-static inline void sched_core_dequeue(struct rq *rq, struct task_struct *p) { }
+static inline void
+sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { }
#endif /* CONFIG_SCHED_CORE */
/*
- * part of the period that we allow rt tasks to run in us.
- * default: 0.95s
- */
-int sysctl_sched_rt_runtime = 950000;
-
-
-/*
* Serialization rules:
*
* Lock order:
@@ -397,8 +496,7 @@ int sysctl_sched_rt_runtime = 950000;
* p->se.load, p->rt_priority,
* p->dl.dl_{runtime, deadline, period, flags, bw, density}
* - sched_setnuma(): p->numa_preferred_nid
- * - sched_move_task()/
- * cpu_cgroup_fork(): p->sched_task_group
+ * - sched_move_task(): p->sched_task_group
* - uclamp_update_active() p->uclamp*
*
* p->state <- TASK_*:
@@ -516,10 +614,10 @@ void double_rq_lock(struct rq *rq1, struct rq *rq2)
swap(rq1, rq2);
raw_spin_rq_lock(rq1);
- if (__rq_lockp(rq1) == __rq_lockp(rq2))
- return;
+ if (__rq_lockp(rq1) != __rq_lockp(rq2))
+ raw_spin_rq_lock_nested(rq2, SINGLE_DEPTH_NESTING);
- raw_spin_rq_lock_nested(rq2, SINGLE_DEPTH_NESTING);
+ double_rq_clock_clear_update(rq1, rq2);
}
#endif
@@ -624,6 +722,8 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
rq->prev_irq_time += irq_delta;
delta -= irq_delta;
+ psi_account_irqtime(rq->curr, irq_delta);
+ delayacct_irq(rq->curr, irq_delta);
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
if (static_key_false((&paravirt_steal_rq_enabled))) {
@@ -789,15 +889,11 @@ static inline void hrtick_rq_init(struct rq *rq)
({ \
typeof(ptr) _ptr = (ptr); \
typeof(mask) _mask = (mask); \
- typeof(*_ptr) _old, _val = *_ptr; \
+ typeof(*_ptr) _val = *_ptr; \
\
- for (;;) { \
- _old = cmpxchg(_ptr, _val, _val | _mask); \
- if (_old == _val) \
- break; \
- _val = _old; \
- } \
- _old; \
+ do { \
+ } while (!try_cmpxchg(_ptr, &_val, _val | _mask)); \
+ _val; \
})
#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
@@ -806,7 +902,7 @@ static inline void hrtick_rq_init(struct rq *rq)
* this avoids any races wrt polling state changes and thereby avoids
* spurious IPIs.
*/
-static bool set_nr_and_not_polling(struct task_struct *p)
+static inline bool set_nr_and_not_polling(struct task_struct *p)
{
struct thread_info *ti = task_thread_info(p);
return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
@@ -821,30 +917,28 @@ static bool set_nr_and_not_polling(struct task_struct *p)
static bool set_nr_if_polling(struct task_struct *p)
{
struct thread_info *ti = task_thread_info(p);
- typeof(ti->flags) old, val = READ_ONCE(ti->flags);
+ typeof(ti->flags) val = READ_ONCE(ti->flags);
for (;;) {
if (!(val & _TIF_POLLING_NRFLAG))
return false;
if (val & _TIF_NEED_RESCHED)
return true;
- old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED);
- if (old == val)
+ if (try_cmpxchg(&ti->flags, &val, val | _TIF_NEED_RESCHED))
break;
- val = old;
}
return true;
}
#else
-static bool set_nr_and_not_polling(struct task_struct *p)
+static inline bool set_nr_and_not_polling(struct task_struct *p)
{
set_tsk_need_resched(p);
return true;
}
#ifdef CONFIG_SMP
-static bool set_nr_if_polling(struct task_struct *p)
+static inline bool set_nr_if_polling(struct task_struct *p)
{
return false;
}
@@ -993,17 +1087,19 @@ int get_nohz_timer_target(void)
{
int i, cpu = smp_processor_id(), default_cpu = -1;
struct sched_domain *sd;
+ const struct cpumask *hk_mask;
- if (housekeeping_cpu(cpu, HK_FLAG_TIMER)) {
+ if (housekeeping_cpu(cpu, HK_TYPE_TIMER)) {
if (!idle_cpu(cpu))
return cpu;
default_cpu = cpu;
}
+ hk_mask = housekeeping_cpumask(HK_TYPE_TIMER);
+
rcu_read_lock();
for_each_domain(cpu, sd) {
- for_each_cpu_and(i, sched_domain_span(sd),
- housekeeping_cpumask(HK_FLAG_TIMER)) {
+ for_each_cpu_and(i, sched_domain_span(sd), hk_mask) {
if (cpu == i)
continue;
@@ -1015,7 +1111,7 @@ int get_nohz_timer_target(void)
}
if (default_cpu == -1)
- default_cpu = housekeeping_any_cpu(HK_FLAG_TIMER);
+ default_cpu = housekeeping_any_cpu(HK_TYPE_TIMER);
cpu = default_cpu;
unlock:
rcu_read_unlock();
@@ -1223,10 +1319,10 @@ static void set_load_weight(struct task_struct *p, bool update_load)
static DEFINE_MUTEX(uclamp_mutex);
/* Max allowed minimum utilization */
-unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE;
+static unsigned int __maybe_unused sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE;
/* Max allowed maximum utilization */
-unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE;
+static unsigned int __maybe_unused sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE;
/*
* By default RT tasks run at the maximum performance point/capacity of the
@@ -1243,7 +1339,7 @@ unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE;
* This knob will not override the system default sched_util_clamp_min defined
* above.
*/
-unsigned int sysctl_sched_uclamp_util_min_rt_default = SCHED_CAPACITY_SCALE;
+static unsigned int sysctl_sched_uclamp_util_min_rt_default = SCHED_CAPACITY_SCALE;
/* All clamps are required to be less or equal than these values */
static struct uclamp_se uclamp_default[UCLAMP_CNT];
@@ -1318,7 +1414,7 @@ static inline void uclamp_idle_reset(struct rq *rq, enum uclamp_id clamp_id,
if (!(rq->uclamp_flags & UCLAMP_FLAG_IDLE))
return;
- WRITE_ONCE(rq->uclamp[clamp_id].value, clamp_value);
+ uclamp_rq_set(rq, clamp_id, clamp_value);
}
static inline
@@ -1373,33 +1469,6 @@ static void uclamp_update_util_min_rt_default(struct task_struct *p)
task_rq_unlock(rq, p, &rf);
}
-static void uclamp_sync_util_min_rt_default(void)
-{
- struct task_struct *g, *p;
-
- /*
- * copy_process() sysctl_uclamp
- * uclamp_min_rt = X;
- * write_lock(&tasklist_lock) read_lock(&tasklist_lock)
- * // link thread smp_mb__after_spinlock()
- * write_unlock(&tasklist_lock) read_unlock(&tasklist_lock);
- * sched_post_fork() for_each_process_thread()
- * __uclamp_sync_rt() __uclamp_sync_rt()
- *
- * Ensures that either sched_post_fork() will observe the new
- * uclamp_min_rt or for_each_process_thread() will observe the new
- * task.
- */
- read_lock(&tasklist_lock);
- smp_mb__after_spinlock();
- read_unlock(&tasklist_lock);
-
- rcu_read_lock();
- for_each_process_thread(g, p)
- uclamp_update_util_min_rt_default(p);
- rcu_read_unlock();
-}
-
static inline struct uclamp_se
uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id)
{
@@ -1496,8 +1565,8 @@ static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,
if (bucket->tasks == 1 || uc_se->value > bucket->value)
bucket->value = uc_se->value;
- if (uc_se->value > READ_ONCE(uc_rq->value))
- WRITE_ONCE(uc_rq->value, uc_se->value);
+ if (uc_se->value > uclamp_rq_get(rq, clamp_id))
+ uclamp_rq_set(rq, clamp_id, uc_se->value);
}
/*
@@ -1563,7 +1632,7 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
if (likely(bucket->tasks))
return;
- rq_clamp = READ_ONCE(uc_rq->value);
+ rq_clamp = uclamp_rq_get(rq, clamp_id);
/*
* Defensive programming: this should never happen. If it happens,
* e.g. due to future modification, warn and fixup the expected value.
@@ -1571,7 +1640,7 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
SCHED_WARN_ON(bucket->value > rq_clamp);
if (bucket->value >= rq_clamp) {
bkt_clamp = uclamp_rq_max_value(rq, clamp_id, uc_se->value);
- WRITE_ONCE(uc_rq->value, bkt_clamp);
+ uclamp_rq_set(rq, clamp_id, bkt_clamp);
}
}
@@ -1619,6 +1688,23 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
uclamp_rq_dec_id(rq, p, clamp_id);
}
+static inline void uclamp_rq_reinc_id(struct rq *rq, struct task_struct *p,
+ enum uclamp_id clamp_id)
+{
+ if (!p->uclamp[clamp_id].active)
+ return;
+
+ uclamp_rq_dec_id(rq, p, clamp_id);
+ uclamp_rq_inc_id(rq, p, clamp_id);
+
+ /*
+ * Make sure to clear the idle flag if we've transiently reached 0
+ * active tasks on rq.
+ */
+ if (clamp_id == UCLAMP_MAX && (rq->uclamp_flags & UCLAMP_FLAG_IDLE))
+ rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE;
+}
+
static inline void
uclamp_update_active(struct task_struct *p)
{
@@ -1642,12 +1728,8 @@ uclamp_update_active(struct task_struct *p)
* affecting a valid clamp bucket, the next time it's enqueued,
* it will already see the updated clamp bucket value.
*/
- for_each_clamp_id(clamp_id) {
- if (p->uclamp[clamp_id].active) {
- uclamp_rq_dec_id(rq, p, clamp_id);
- uclamp_rq_inc_id(rq, p, clamp_id);
- }
- }
+ for_each_clamp_id(clamp_id)
+ uclamp_rq_reinc_id(rq, p, clamp_id);
task_rq_unlock(rq, p, &rf);
}
@@ -1666,6 +1748,11 @@ uclamp_update_active_tasks(struct cgroup_subsys_state *css)
}
static void cpu_util_update_eff(struct cgroup_subsys_state *css);
+#endif
+
+#ifdef CONFIG_SYSCTL
+#ifdef CONFIG_UCLAMP_TASK
+#ifdef CONFIG_UCLAMP_TASK_GROUP
static void uclamp_update_root_tg(void)
{
struct task_group *tg = &root_task_group;
@@ -1683,7 +1770,34 @@ static void uclamp_update_root_tg(void)
static void uclamp_update_root_tg(void) { }
#endif
-int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
+static void uclamp_sync_util_min_rt_default(void)
+{
+ struct task_struct *g, *p;
+
+ /*
+ * copy_process() sysctl_uclamp
+ * uclamp_min_rt = X;
+ * write_lock(&tasklist_lock) read_lock(&tasklist_lock)
+ * // link thread smp_mb__after_spinlock()
+ * write_unlock(&tasklist_lock) read_unlock(&tasklist_lock);
+ * sched_post_fork() for_each_process_thread()
+ * __uclamp_sync_rt() __uclamp_sync_rt()
+ *
+ * Ensures that either sched_post_fork() will observe the new
+ * uclamp_min_rt or for_each_process_thread() will observe the new
+ * task.
+ */
+ read_lock(&tasklist_lock);
+ smp_mb__after_spinlock();
+ read_unlock(&tasklist_lock);
+
+ rcu_read_lock();
+ for_each_process_thread(g, p)
+ uclamp_update_util_min_rt_default(p);
+ rcu_read_unlock();
+}
+
+static int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
bool update_root_tg = false;
@@ -1747,6 +1861,8 @@ done:
return result;
}
+#endif
+#endif
static int uclamp_validate(struct task_struct *p,
const struct sched_attr *attr)
@@ -1885,7 +2001,7 @@ static void __init init_uclamp_rq(struct rq *rq)
};
}
- rq->uclamp_flags = 0;
+ rq->uclamp_flags = UCLAMP_FLAG_IDLE;
}
static void __init init_uclamp(void)
@@ -1933,6 +2049,25 @@ bool sched_task_on_rq(struct task_struct *p)
return task_on_rq_queued(p);
}
+unsigned long get_wchan(struct task_struct *p)
+{
+ unsigned long ip = 0;
+ unsigned int state;
+
+ if (!p || p == current)
+ return 0;
+
+ /* Only get wchan if task is blocked and we can keep it that way. */
+ raw_spin_lock_irq(&p->pi_lock);
+ state = READ_ONCE(p->__state);
+ smp_rmb(); /* see try_to_wake_up() */
+ if (state != TASK_RUNNING && state != TASK_WAKING && !p->on_rq)
+ ip = __get_wchan(p);
+ raw_spin_unlock_irq(&p->pi_lock);
+
+ return ip;
+}
+
static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
{
if (!(flags & ENQUEUE_NOCLOCK))
@@ -1940,7 +2075,7 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
if (!(flags & ENQUEUE_RESTORE)) {
sched_info_enqueue(rq, p);
- psi_enqueue(p, flags & ENQUEUE_WAKEUP);
+ psi_enqueue(p, (flags & ENQUEUE_WAKEUP) && !(flags & ENQUEUE_MIGRATED));
}
uclamp_rq_inc(rq, p);
@@ -1953,7 +2088,7 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
{
if (sched_core_enabled(rq))
- sched_core_dequeue(rq, p);
+ sched_core_dequeue(rq, p, flags);
if (!(flags & DEQUEUE_NOCLOCK))
update_rq_clock(rq);
@@ -1969,6 +2104,11 @@ static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
void activate_task(struct rq *rq, struct task_struct *p, int flags)
{
+ if (task_on_rq_migrating(p))
+ flags |= ENQUEUE_MIGRATED;
+ if (flags & ENQUEUE_MIGRATED)
+ sched_mm_cid_migrate_to(rq, p);
+
enqueue_task(rq, p, flags);
p->on_rq = TASK_ON_RQ_QUEUED;
@@ -1981,12 +2121,18 @@ void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
dequeue_task(rq, p, flags);
}
-/*
- * __normal_prio - return the priority that is based on the static prio
- */
-static inline int __normal_prio(struct task_struct *p)
+static inline int __normal_prio(int policy, int rt_prio, int nice)
{
- return p->static_prio;
+ int prio;
+
+ if (dl_policy(policy))
+ prio = MAX_DL_PRIO - 1;
+ else if (rt_policy(policy))
+ prio = MAX_RT_PRIO - 1 - rt_prio;
+ else
+ prio = NICE_TO_PRIO(nice);
+
+ return prio;
}
/*
@@ -1998,15 +2144,7 @@ static inline int __normal_prio(struct task_struct *p)
*/
static inline int normal_prio(struct task_struct *p)
{
- int prio;
-
- if (task_has_dl_policy(p))
- prio = MAX_DL_PRIO-1;
- else if (task_has_rt_policy(p))
- prio = MAX_RT_PRIO-1 - p->rt_priority;
- else
- prio = __normal_prio(p);
- return prio;
+ return __normal_prio(p->policy, p->rt_priority, PRIO_TO_NICE(p->static_prio));
}
/*
@@ -2064,7 +2202,7 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
{
if (p->sched_class == rq->curr->sched_class)
rq->curr->sched_class->check_preempt_curr(rq, p, flags);
- else if (p->sched_class > rq->curr->sched_class)
+ else if (sched_class_above(p->sched_class, rq->curr->sched_class))
resched_curr(rq);
/*
@@ -2078,14 +2216,18 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
#ifdef CONFIG_SMP
static void
-__do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags);
+__do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx);
static int __set_cpus_allowed_ptr(struct task_struct *p,
- const struct cpumask *new_mask,
- u32 flags);
+ struct affinity_context *ctx);
static void migrate_disable_switch(struct rq *rq, struct task_struct *p)
{
+ struct affinity_context ac = {
+ .new_mask = cpumask_of(rq->cpu),
+ .flags = SCA_MIGRATE_DISABLE,
+ };
+
if (likely(!p->migration_disabled))
return;
@@ -2095,7 +2237,7 @@ static void migrate_disable_switch(struct rq *rq, struct task_struct *p)
/*
* Violates locking rules! see comment in __do_set_cpus_allowed().
*/
- __do_set_cpus_allowed(p, cpumask_of(rq->cpu), SCA_MIGRATE_DISABLE);
+ __do_set_cpus_allowed(p, &ac);
}
void migrate_disable(void)
@@ -2117,19 +2259,26 @@ EXPORT_SYMBOL_GPL(migrate_disable);
void migrate_enable(void)
{
struct task_struct *p = current;
+ struct affinity_context ac = {
+ .new_mask = &p->cpus_mask,
+ .flags = SCA_MIGRATE_ENABLE,
+ };
if (p->migration_disabled > 1) {
p->migration_disabled--;
return;
}
+ if (WARN_ON_ONCE(!p->migration_disabled))
+ return;
+
/*
* Ensure stop_task runs either before or after this, and that
* __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule().
*/
preempt_disable();
if (p->cpus_ptr != &p->cpus_mask)
- __set_cpus_allowed_ptr(p, &p->cpus_mask, SCA_MIGRATE_ENABLE);
+ __set_cpus_allowed_ptr(p, &ac);
/*
* Mustn't clear migration_disabled() until cpus_ptr points back at the
* regular cpus_mask, otherwise things that race (eg.
@@ -2163,7 +2312,7 @@ static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
/* Non kernel threads are not allowed during either online or offline. */
if (!(p->flags & PF_KTHREAD))
- return cpu_active(cpu);
+ return cpu_active(cpu) && task_cpu_possible(cpu, p);
/* KTHREAD_IS_PER_CPU is always allowed. */
if (kthread_is_per_cpu(p))
@@ -2208,7 +2357,7 @@ static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
rq = cpu_rq(new_cpu);
rq_lock(rq, rf);
- BUG_ON(task_cpu(p) != new_cpu);
+ WARN_ON_ONCE(task_cpu(p) != new_cpu);
activate_task(rq, p, 0);
check_preempt_curr(rq, p, 0);
@@ -2279,7 +2428,7 @@ static int migration_cpu_stop(void *data)
* __migrate_task() such that we will not miss enforcing cpus_ptr
* during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
*/
- flush_smp_call_function_from_idle();
+ flush_smp_call_function_queue();
raw_spin_lock(&p->pi_lock);
rq_lock(rq, &rf);
@@ -2409,19 +2558,25 @@ out_unlock:
* sched_class::set_cpus_allowed must do the below, but is not required to
* actually call this function.
*/
-void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags)
+void set_cpus_allowed_common(struct task_struct *p, struct affinity_context *ctx)
{
- if (flags & (SCA_MIGRATE_ENABLE | SCA_MIGRATE_DISABLE)) {
- p->cpus_ptr = new_mask;
+ if (ctx->flags & (SCA_MIGRATE_ENABLE | SCA_MIGRATE_DISABLE)) {
+ p->cpus_ptr = ctx->new_mask;
return;
}
- cpumask_copy(&p->cpus_mask, new_mask);
- p->nr_cpus_allowed = cpumask_weight(new_mask);
+ cpumask_copy(&p->cpus_mask, ctx->new_mask);
+ p->nr_cpus_allowed = cpumask_weight(ctx->new_mask);
+
+ /*
+ * Swap in a new user_cpus_ptr if SCA_USER flag set
+ */
+ if (ctx->flags & SCA_USER)
+ swap(p->user_cpus_ptr, ctx->user_mask);
}
static void
-__do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags)
+__do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx)
{
struct rq *rq = task_rq(p);
bool queued, running;
@@ -2438,7 +2593,7 @@ __do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32
*
* XXX do further audits, this smells like something putrid.
*/
- if (flags & SCA_MIGRATE_DISABLE)
+ if (ctx->flags & SCA_MIGRATE_DISABLE)
SCHED_WARN_ON(!p->on_cpu);
else
lockdep_assert_held(&p->pi_lock);
@@ -2457,7 +2612,7 @@ __do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32
if (running)
put_prev_task(rq, p);
- p->sched_class->set_cpus_allowed(p, new_mask, flags);
+ p->sched_class->set_cpus_allowed(p, ctx);
if (queued)
enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
@@ -2465,9 +2620,97 @@ __do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32
set_next_task(rq, p);
}
+/*
+ * Used for kthread_bind() and select_fallback_rq(), in both cases the user
+ * affinity (if any) should be destroyed too.
+ */
void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
{
- __do_set_cpus_allowed(p, new_mask, 0);
+ struct affinity_context ac = {
+ .new_mask = new_mask,
+ .user_mask = NULL,
+ .flags = SCA_USER, /* clear the user requested mask */
+ };
+ union cpumask_rcuhead {
+ cpumask_t cpumask;
+ struct rcu_head rcu;
+ };
+
+ __do_set_cpus_allowed(p, &ac);
+
+ /*
+ * Because this is called with p->pi_lock held, it is not possible
+ * to use kfree() here (when PREEMPT_RT=y), therefore punt to using
+ * kfree_rcu().
+ */
+ kfree_rcu((union cpumask_rcuhead *)ac.user_mask, rcu);
+}
+
+static cpumask_t *alloc_user_cpus_ptr(int node)
+{
+ /*
+ * See do_set_cpus_allowed() above for the rcu_head usage.
+ */
+ int size = max_t(int, cpumask_size(), sizeof(struct rcu_head));
+
+ return kmalloc_node(size, GFP_KERNEL, node);
+}
+
+int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src,
+ int node)
+{
+ cpumask_t *user_mask;
+ unsigned long flags;
+
+ /*
+ * Always clear dst->user_cpus_ptr first as their user_cpus_ptr's
+ * may differ by now due to racing.
+ */
+ dst->user_cpus_ptr = NULL;
+
+ /*
+ * This check is racy and losing the race is a valid situation.
+ * It is not worth the extra overhead of taking the pi_lock on
+ * every fork/clone.
+ */
+ if (data_race(!src->user_cpus_ptr))
+ return 0;
+
+ user_mask = alloc_user_cpus_ptr(node);
+ if (!user_mask)
+ return -ENOMEM;
+
+ /*
+ * Use pi_lock to protect content of user_cpus_ptr
+ *
+ * Though unlikely, user_cpus_ptr can be reset to NULL by a concurrent
+ * do_set_cpus_allowed().
+ */
+ raw_spin_lock_irqsave(&src->pi_lock, flags);
+ if (src->user_cpus_ptr) {
+ swap(dst->user_cpus_ptr, user_mask);
+ cpumask_copy(dst->user_cpus_ptr, src->user_cpus_ptr);
+ }
+ raw_spin_unlock_irqrestore(&src->pi_lock, flags);
+
+ if (unlikely(user_mask))
+ kfree(user_mask);
+
+ return 0;
+}
+
+static inline struct cpumask *clear_user_cpus_ptr(struct task_struct *p)
+{
+ struct cpumask *user_mask = NULL;
+
+ swap(p->user_cpus_ptr, user_mask);
+
+ return user_mask;
+}
+
+void release_user_cpus_ptr(struct task_struct *p)
+{
+ kfree(clear_user_cpus_ptr(p));
}
/*
@@ -2548,6 +2791,8 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
*/
static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flags *rf,
int dest_cpu, unsigned int flags)
+ __releases(rq->lock)
+ __releases(p->pi_lock)
{
struct set_affinity_pending my_pending = { }, *pending = NULL;
bool stop_pending, complete = false;
@@ -2630,7 +2875,7 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
return -EINVAL;
}
- if (task_running(rq, p) || READ_ONCE(p->__state) == TASK_WAKING) {
+ if (task_on_cpu(rq, p) || READ_ONCE(p->__state) == TASK_WAKING) {
/*
* MIGRATE_ENABLE gets here because 'p == current', but for
* anything else we cannot do is_migration_disabled(), punt
@@ -2687,28 +2932,24 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
}
/*
- * Change a given task's CPU affinity. Migrate the thread to a
- * proper CPU and schedule it away if the CPU it's executing on
- * is removed from the allowed bitmask.
- *
- * NOTE: the caller must have a valid reference to the task, the
- * task must not exit() & deallocate itself prematurely. The
- * call is not atomic; no spinlocks may be held.
+ * Called with both p->pi_lock and rq->lock held; drops both before returning.
*/
-static int __set_cpus_allowed_ptr(struct task_struct *p,
- const struct cpumask *new_mask,
- u32 flags)
+static int __set_cpus_allowed_ptr_locked(struct task_struct *p,
+ struct affinity_context *ctx,
+ struct rq *rq,
+ struct rq_flags *rf)
+ __releases(rq->lock)
+ __releases(p->pi_lock)
{
+ const struct cpumask *cpu_allowed_mask = task_cpu_possible_mask(p);
const struct cpumask *cpu_valid_mask = cpu_active_mask;
+ bool kthread = p->flags & PF_KTHREAD;
unsigned int dest_cpu;
- struct rq_flags rf;
- struct rq *rq;
int ret = 0;
- rq = task_rq_lock(p, &rf);
update_rq_clock(rq);
- if (p->flags & PF_KTHREAD || is_migration_disabled(p)) {
+ if (kthread || is_migration_disabled(p)) {
/*
* Kernel threads are allowed on online && !active CPUs,
* however, during cpu-hot-unplug, even these might get pushed
@@ -2722,22 +2963,30 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
cpu_valid_mask = cpu_online_mask;
}
+ if (!kthread && !cpumask_subset(ctx->new_mask, cpu_allowed_mask)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
/*
* Must re-check here, to close a race against __kthread_bind(),
* sched_setaffinity() is not guaranteed to observe the flag.
*/
- if ((flags & SCA_CHECK) && (p->flags & PF_NO_SETAFFINITY)) {
+ if ((ctx->flags & SCA_CHECK) && (p->flags & PF_NO_SETAFFINITY)) {
ret = -EINVAL;
goto out;
}
- if (!(flags & SCA_MIGRATE_ENABLE)) {
- if (cpumask_equal(&p->cpus_mask, new_mask))
+ if (!(ctx->flags & SCA_MIGRATE_ENABLE)) {
+ if (cpumask_equal(&p->cpus_mask, ctx->new_mask)) {
+ if (ctx->flags & SCA_USER)
+ swap(p->user_cpus_ptr, ctx->user_mask);
goto out;
+ }
if (WARN_ON_ONCE(p == current &&
is_migration_disabled(p) &&
- !cpumask_test_cpu(task_cpu(p), new_mask))) {
+ !cpumask_test_cpu(task_cpu(p), ctx->new_mask))) {
ret = -EBUSY;
goto out;
}
@@ -2748,28 +2997,177 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
* for groups of tasks (ie. cpuset), so that load balancing is not
* immediately required to distribute the tasks within their new mask.
*/
- dest_cpu = cpumask_any_and_distribute(cpu_valid_mask, new_mask);
+ dest_cpu = cpumask_any_and_distribute(cpu_valid_mask, ctx->new_mask);
if (dest_cpu >= nr_cpu_ids) {
ret = -EINVAL;
goto out;
}
- __do_set_cpus_allowed(p, new_mask, flags);
+ __do_set_cpus_allowed(p, ctx);
- return affine_move_task(rq, p, &rf, dest_cpu, flags);
+ return affine_move_task(rq, p, rf, dest_cpu, ctx->flags);
out:
- task_rq_unlock(rq, p, &rf);
+ task_rq_unlock(rq, p, rf);
return ret;
}
+/*
+ * Change a given task's CPU affinity. Migrate the thread to a
+ * proper CPU and schedule it away if the CPU it's executing on
+ * is removed from the allowed bitmask.
+ *
+ * NOTE: the caller must have a valid reference to the task, the
+ * task must not exit() & deallocate itself prematurely. The
+ * call is not atomic; no spinlocks may be held.
+ */
+static int __set_cpus_allowed_ptr(struct task_struct *p,
+ struct affinity_context *ctx)
+{
+ struct rq_flags rf;
+ struct rq *rq;
+
+ rq = task_rq_lock(p, &rf);
+ /*
+ * Masking should be skipped if SCA_USER or any of the SCA_MIGRATE_*
+ * flags are set.
+ */
+ if (p->user_cpus_ptr &&
+ !(ctx->flags & (SCA_USER | SCA_MIGRATE_ENABLE | SCA_MIGRATE_DISABLE)) &&
+ cpumask_and(rq->scratch_mask, ctx->new_mask, p->user_cpus_ptr))
+ ctx->new_mask = rq->scratch_mask;
+
+ return __set_cpus_allowed_ptr_locked(p, ctx, rq, &rf);
+}
+
int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
{
- return __set_cpus_allowed_ptr(p, new_mask, 0);
+ struct affinity_context ac = {
+ .new_mask = new_mask,
+ .flags = 0,
+ };
+
+ return __set_cpus_allowed_ptr(p, &ac);
}
EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
+/*
+ * Change a given task's CPU affinity to the intersection of its current
+ * affinity mask and @subset_mask, writing the resulting mask to @new_mask.
+ * If user_cpus_ptr is defined, use it as the basis for restricting CPU
+ * affinity or use cpu_online_mask instead.
+ *
+ * If the resulting mask is empty, leave the affinity unchanged and return
+ * -EINVAL.
+ */
+static int restrict_cpus_allowed_ptr(struct task_struct *p,
+ struct cpumask *new_mask,
+ const struct cpumask *subset_mask)
+{
+ struct affinity_context ac = {
+ .new_mask = new_mask,
+ .flags = 0,
+ };
+ struct rq_flags rf;
+ struct rq *rq;
+ int err;
+
+ rq = task_rq_lock(p, &rf);
+
+ /*
+ * Forcefully restricting the affinity of a deadline task is
+ * likely to cause problems, so fail and noisily override the
+ * mask entirely.
+ */
+ if (task_has_dl_policy(p) && dl_bandwidth_enabled()) {
+ err = -EPERM;
+ goto err_unlock;
+ }
+
+ if (!cpumask_and(new_mask, task_user_cpus(p), subset_mask)) {
+ err = -EINVAL;
+ goto err_unlock;
+ }
+
+ return __set_cpus_allowed_ptr_locked(p, &ac, rq, &rf);
+
+err_unlock:
+ task_rq_unlock(rq, p, &rf);
+ return err;
+}
+
+/*
+ * Restrict the CPU affinity of task @p so that it is a subset of
+ * task_cpu_possible_mask() and point @p->user_cpus_ptr to a copy of the
+ * old affinity mask. If the resulting mask is empty, we warn and walk
+ * up the cpuset hierarchy until we find a suitable mask.
+ */
+void force_compatible_cpus_allowed_ptr(struct task_struct *p)
+{
+ cpumask_var_t new_mask;
+ const struct cpumask *override_mask = task_cpu_possible_mask(p);
+
+ alloc_cpumask_var(&new_mask, GFP_KERNEL);
+
+ /*
+ * __migrate_task() can fail silently in the face of concurrent
+ * offlining of the chosen destination CPU, so take the hotplug
+ * lock to ensure that the migration succeeds.
+ */
+ cpus_read_lock();
+ if (!cpumask_available(new_mask))
+ goto out_set_mask;
+
+ if (!restrict_cpus_allowed_ptr(p, new_mask, override_mask))
+ goto out_free_mask;
+
+ /*
+ * We failed to find a valid subset of the affinity mask for the
+ * task, so override it based on its cpuset hierarchy.
+ */
+ cpuset_cpus_allowed(p, new_mask);
+ override_mask = new_mask;
+
+out_set_mask:
+ if (printk_ratelimit()) {
+ printk_deferred("Overriding affinity for process %d (%s) to CPUs %*pbl\n",
+ task_pid_nr(p), p->comm,
+ cpumask_pr_args(override_mask));
+ }
+
+ WARN_ON(set_cpus_allowed_ptr(p, override_mask));
+out_free_mask:
+ cpus_read_unlock();
+ free_cpumask_var(new_mask);
+}
+
+static int
+__sched_setaffinity(struct task_struct *p, struct affinity_context *ctx);
+
+/*
+ * Restore the affinity of a task @p which was previously restricted by a
+ * call to force_compatible_cpus_allowed_ptr().
+ *
+ * It is the caller's responsibility to serialise this with any calls to
+ * force_compatible_cpus_allowed_ptr(@p).
+ */
+void relax_compatible_cpus_allowed_ptr(struct task_struct *p)
+{
+ struct affinity_context ac = {
+ .new_mask = task_user_cpus(p),
+ .flags = 0,
+ };
+ int ret;
+
+ /*
+ * Try to restore the old affinity mask with __sched_setaffinity().
+ * Cpuset masking will be done there too.
+ */
+ ret = __sched_setaffinity(p, &ac);
+ WARN_ON_ONCE(ret);
+}
+
void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
{
#ifdef CONFIG_SCHED_DEBUG
@@ -2819,6 +3217,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
p->sched_class->migrate_task_rq(p, new_cpu);
p->se.nr_migrations++;
rseq_migrate(p);
+ sched_mm_cid_migrate_from(p);
perf_event_task_migrate(p);
}
@@ -2945,12 +3344,12 @@ out:
/*
* wait_task_inactive - wait for a thread to unschedule.
*
- * If @match_state is nonzero, it's the @p->state value just checked and
- * not expected to change. If it changes, i.e. @p might have woken up,
- * then return zero. When we succeed in waiting for @p to be off its CPU,
- * we return a positive number (its total switch count). If a second call
- * a short while later returns the same number, the caller can be sure that
- * @p has remained unscheduled the whole time.
+ * Wait for the thread to block in any of the states set in @match_state.
+ * If it changes, i.e. @p might have woken up, then return zero. When we
+ * succeed in waiting for @p to be off its CPU, we return a positive number
+ * (its total switch count). If a second call a short while later returns the
+ * same number, the caller can be sure that @p has remained unscheduled the
+ * whole time.
*
* The caller must ensure that the task *will* unschedule sometime soon,
* else this function might spin for a *long* time. This function can't
@@ -2981,12 +3380,12 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state
*
* NOTE! Since we don't hold any locks, it's not
* even sure that "rq" stays as the right runqueue!
- * But we don't care, since "task_running()" will
+ * But we don't care, since "task_on_cpu()" will
* return false if the runqueue has changed and p
* is actually now running somewhere else!
*/
- while (task_running(rq, p)) {
- if (match_state && unlikely(READ_ONCE(p->__state) != match_state))
+ while (task_on_cpu(rq, p)) {
+ if (!(READ_ONCE(p->__state) & match_state))
return 0;
cpu_relax();
}
@@ -2998,10 +3397,10 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state
*/
rq = task_rq_lock(p, &rf);
trace_sched_wait_task(p);
- running = task_running(rq, p);
+ running = task_on_cpu(rq, p);
queued = task_on_rq_queued(p);
ncsw = 0;
- if (!match_state || READ_ONCE(p->__state) == match_state)
+ if (READ_ONCE(p->__state) & match_state)
ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
task_rq_unlock(rq, p, &rf);
@@ -3035,7 +3434,7 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state
ktime_t to = NSEC_PER_SEC / HZ;
set_current_state(TASK_UNINTERRUPTIBLE);
- schedule_hrtimeout(&to, HRTIMER_MODE_REL);
+ schedule_hrtimeout(&to, HRTIMER_MODE_REL_HARD);
continue;
}
@@ -3114,9 +3513,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
/* Look for allowed, online CPU in same node. */
for_each_cpu(dest_cpu, nodemask) {
- if (!cpu_active(dest_cpu))
- continue;
- if (cpumask_test_cpu(dest_cpu, p->cpus_ptr))
+ if (is_cpu_allowed(p, dest_cpu))
return dest_cpu;
}
}
@@ -3133,8 +3530,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
/* No more Mr. Nice Guy. */
switch (state) {
case cpuset:
- if (IS_ENABLED(CONFIG_CPUSETS)) {
- cpuset_cpus_allowed_fallback(p);
+ if (cpuset_cpus_allowed_fallback(p)) {
state = possible;
break;
}
@@ -3146,10 +3542,9 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
*
* More yuck to audit.
*/
- do_set_cpus_allowed(p, cpu_possible_mask);
+ do_set_cpus_allowed(p, task_cpu_possible_mask(p));
state = fail;
break;
-
case fail:
BUG();
break;
@@ -3249,10 +3644,9 @@ void sched_set_stop_task(int cpu, struct task_struct *stop)
#else /* CONFIG_SMP */
static inline int __set_cpus_allowed_ptr(struct task_struct *p,
- const struct cpumask *new_mask,
- u32 flags)
+ struct affinity_context *ctx)
{
- return set_cpus_allowed_ptr(p, new_mask);
+ return set_cpus_allowed_ptr(p, ctx->new_mask);
}
static inline void migrate_disable_switch(struct rq *rq, struct task_struct *p) { }
@@ -3262,6 +3656,11 @@ static inline bool rq_has_pinned_tasks(struct rq *rq)
return false;
}
+static inline cpumask_t *alloc_user_cpus_ptr(int node)
+{
+ return NULL;
+}
+
#endif /* !CONFIG_SMP */
static void
@@ -3277,11 +3676,11 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
#ifdef CONFIG_SMP
if (cpu == rq->cpu) {
__schedstat_inc(rq->ttwu_local);
- __schedstat_inc(p->se.statistics.nr_wakeups_local);
+ __schedstat_inc(p->stats.nr_wakeups_local);
} else {
struct sched_domain *sd;
- __schedstat_inc(p->se.statistics.nr_wakeups_remote);
+ __schedstat_inc(p->stats.nr_wakeups_remote);
rcu_read_lock();
for_each_domain(rq->cpu, sd) {
if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
@@ -3293,25 +3692,50 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
}
if (wake_flags & WF_MIGRATED)
- __schedstat_inc(p->se.statistics.nr_wakeups_migrate);
+ __schedstat_inc(p->stats.nr_wakeups_migrate);
#endif /* CONFIG_SMP */
__schedstat_inc(rq->ttwu_count);
- __schedstat_inc(p->se.statistics.nr_wakeups);
+ __schedstat_inc(p->stats.nr_wakeups);
if (wake_flags & WF_SYNC)
- __schedstat_inc(p->se.statistics.nr_wakeups_sync);
+ __schedstat_inc(p->stats.nr_wakeups_sync);
}
/*
- * Mark the task runnable and perform wakeup-preemption.
+ * Mark the task runnable.
*/
-static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags,
- struct rq_flags *rf)
+static inline void ttwu_do_wakeup(struct task_struct *p)
{
- check_preempt_curr(rq, p, wake_flags);
WRITE_ONCE(p->__state, TASK_RUNNING);
trace_sched_wakeup(p);
+}
+
+static void
+ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
+ struct rq_flags *rf)
+{
+ int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK;
+
+ lockdep_assert_rq_held(rq);
+
+ if (p->sched_contributes_to_load)
+ rq->nr_uninterruptible--;
+
+#ifdef CONFIG_SMP
+ if (wake_flags & WF_MIGRATED)
+ en_flags |= ENQUEUE_MIGRATED;
+ else
+#endif
+ if (p->in_iowait) {
+ delayacct_blkio_end(p);
+ atomic_dec(&task_rq(p)->nr_iowait);
+ }
+
+ activate_task(rq, p, en_flags);
+ check_preempt_curr(rq, p, wake_flags);
+
+ ttwu_do_wakeup(p);
#ifdef CONFIG_SMP
if (p->sched_class->task_woken) {
@@ -3341,31 +3765,6 @@ static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags,
#endif
}
-static void
-ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
- struct rq_flags *rf)
-{
- int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK;
-
- lockdep_assert_rq_held(rq);
-
- if (p->sched_contributes_to_load)
- rq->nr_uninterruptible--;
-
-#ifdef CONFIG_SMP
- if (wake_flags & WF_MIGRATED)
- en_flags |= ENQUEUE_MIGRATED;
- else
-#endif
- if (p->in_iowait) {
- delayacct_blkio_end(p);
- atomic_dec(&task_rq(p)->nr_iowait);
- }
-
- activate_task(rq, p, en_flags);
- ttwu_do_wakeup(rq, p, wake_flags, rf);
-}
-
/*
* Consider @p being inside a wait loop:
*
@@ -3399,9 +3798,15 @@ static int ttwu_runnable(struct task_struct *p, int wake_flags)
rq = __task_rq_lock(p, &rf);
if (task_on_rq_queued(p)) {
- /* check_preempt_curr() may use rq clock */
- update_rq_clock(rq);
- ttwu_do_wakeup(rq, p, wake_flags, &rf);
+ if (!task_on_cpu(rq, p)) {
+ /*
+ * When on_rq && !on_cpu the task is preempted, see if
+ * it should preempt the task that is current now.
+ */
+ update_rq_clock(rq);
+ check_preempt_curr(rq, p, wake_flags);
+ }
+ ttwu_do_wakeup(p);
ret = 1;
}
__task_rq_unlock(rq, &rf);
@@ -3420,13 +3825,6 @@ void sched_ttwu_pending(void *arg)
if (!llist)
return;
- /*
- * rq::ttwu_pending racy indication of out-standing wakeups.
- * Races such that false-negatives are possible, since they
- * are shorter lived that false-positives would be.
- */
- WRITE_ONCE(rq->ttwu_pending, 0);
-
rq_lock_irqsave(rq, &rf);
update_rq_clock(rq);
@@ -3440,17 +3838,34 @@ void sched_ttwu_pending(void *arg)
ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &rf);
}
+ /*
+ * Must be after enqueueing at least once task such that
+ * idle_cpu() does not observe a false-negative -- if it does,
+ * it is possible for select_idle_siblings() to stack a number
+ * of tasks on this CPU during that window.
+ *
+ * It is ok to clear ttwu_pending when another task pending.
+ * We will receive IPI after local irq enabled and then enqueue it.
+ * Since now nr_running > 0, idle_cpu() will always get correct result.
+ */
+ WRITE_ONCE(rq->ttwu_pending, 0);
rq_unlock_irqrestore(rq, &rf);
}
-void send_call_function_single_ipi(int cpu)
+/*
+ * Prepare the scene for sending an IPI for a remote smp_call
+ *
+ * Returns true if the caller can proceed with sending the IPI.
+ * Returns false otherwise.
+ */
+bool call_function_single_prep_ipi(int cpu)
{
- struct rq *rq = cpu_rq(cpu);
-
- if (!set_nr_if_polling(rq->idle))
- arch_send_call_function_single_ipi(cpu);
- else
+ if (set_nr_if_polling(cpu_rq(cpu)->idle)) {
trace_sched_wake_idle_without_ipi(cpu);
+ return false;
+ }
+
+ return true;
}
/*
@@ -3479,15 +3894,11 @@ void wake_up_if_idle(int cpu)
if (!is_idle_task(rcu_dereference(rq->curr)))
goto out;
- if (set_nr_if_polling(rq->idle)) {
- trace_sched_wake_idle_without_ipi(cpu);
- } else {
- rq_lock_irqsave(rq, &rf);
- if (is_idle_task(rq->curr))
- smp_send_reschedule(cpu);
- /* Else CPU is not idle, do nothing here: */
- rq_unlock_irqrestore(rq, &rf);
- }
+ rq_lock_irqsave(rq, &rf);
+ if (is_idle_task(rq->curr))
+ resched_curr(rq);
+ /* Else CPU is not idle, do nothing here: */
+ rq_unlock_irqrestore(rq, &rf);
out:
rcu_read_unlock();
@@ -3495,10 +3906,13 @@ out:
bool cpus_share_cache(int this_cpu, int that_cpu)
{
+ if (this_cpu == that_cpu)
+ return true;
+
return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
}
-static inline bool ttwu_queue_cond(int cpu, int wake_flags)
+static inline bool ttwu_queue_cond(struct task_struct *p, int cpu)
{
/*
* Do not complicate things with the async wake_list while the CPU is
@@ -3507,6 +3921,10 @@ static inline bool ttwu_queue_cond(int cpu, int wake_flags)
if (!cpu_active(cpu))
return false;
+ /* Ensure the task will still be allowed to run on the CPU. */
+ if (!cpumask_test_cpu(cpu, p->cpus_ptr))
+ return false;
+
/*
* If the CPU does not share cache, then queue the task on the
* remote rqs wakelist to avoid accessing remote data.
@@ -3514,13 +3932,21 @@ static inline bool ttwu_queue_cond(int cpu, int wake_flags)
if (!cpus_share_cache(smp_processor_id(), cpu))
return true;
+ if (cpu == smp_processor_id())
+ return false;
+
/*
- * If the task is descheduling and the only running task on the
- * CPU then use the wakelist to offload the task activation to
- * the soon-to-be-idle CPU as the current CPU is likely busy.
- * nr_running is checked to avoid unnecessary task stacking.
+ * If the wakee cpu is idle, or the task is descheduling and the
+ * only running task on the CPU, then use the wakelist to offload
+ * the task activation to the idle (or soon-to-be-idle) CPU as
+ * the current CPU is likely busy. nr_running is checked to
+ * avoid unnecessary task stacking.
+ *
+ * Note that we can only get here with (wakee) p->on_rq=0,
+ * p->on_cpu can be whatever, we've done the dequeue, so
+ * the wakee has been accounted out of ->nr_running.
*/
- if ((wake_flags & WF_ON_CPU) && cpu_rq(cpu)->nr_running <= 1)
+ if (!cpu_rq(cpu)->nr_running)
return true;
return false;
@@ -3528,10 +3954,7 @@ static inline bool ttwu_queue_cond(int cpu, int wake_flags)
static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
{
- if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(cpu, wake_flags)) {
- if (WARN_ON_ONCE(cpu == smp_processor_id()))
- return false;
-
+ if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(p, cpu)) {
sched_clock_cpu(cpu); /* Sync clocks across CPUs */
__ttwu_queue_wakelist(p, cpu, wake_flags);
return true;
@@ -3564,6 +3987,55 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
}
/*
+ * Invoked from try_to_wake_up() to check whether the task can be woken up.
+ *
+ * The caller holds p::pi_lock if p != current or has preemption
+ * disabled when p == current.
+ *
+ * The rules of PREEMPT_RT saved_state:
+ *
+ * The related locking code always holds p::pi_lock when updating
+ * p::saved_state, which means the code is fully serialized in both cases.
+ *
+ * The lock wait and lock wakeups happen via TASK_RTLOCK_WAIT. No other
+ * bits set. This allows to distinguish all wakeup scenarios.
+ */
+static __always_inline
+bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success)
+{
+ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)) {
+ WARN_ON_ONCE((state & TASK_RTLOCK_WAIT) &&
+ state != TASK_RTLOCK_WAIT);
+ }
+
+ if (READ_ONCE(p->__state) & state) {
+ *success = 1;
+ return true;
+ }
+
+#ifdef CONFIG_PREEMPT_RT
+ /*
+ * Saved state preserves the task state across blocking on
+ * an RT lock. If the state matches, set p::saved_state to
+ * TASK_RUNNING, but do not wake the task because it waits
+ * for a lock wakeup. Also indicate success because from
+ * the regular waker's point of view this has succeeded.
+ *
+ * After acquiring the lock the task will restore p::__state
+ * from p::saved_state which ensures that the regular
+ * wakeup is not lost. The restore will also set
+ * p::saved_state to TASK_RUNNING so any further tests will
+ * not result in false positives vs. @success
+ */
+ if (p->saved_state & state) {
+ p->saved_state = TASK_RUNNING;
+ *success = 1;
+ }
+#endif
+ return false;
+}
+
+/*
* Notes on Program-Order guarantees on SMP systems.
*
* MIGRATION
@@ -3702,13 +4174,11 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
* - we're serialized against set_special_state() by virtue of
* it disabling IRQs (this allows not taking ->pi_lock).
*/
- if (!(READ_ONCE(p->__state) & state))
+ if (!ttwu_state_match(p, state, &success))
goto out;
- success = 1;
trace_sched_waking(p);
- WRITE_ONCE(p->__state, TASK_RUNNING);
- trace_sched_wakeup(p);
+ ttwu_do_wakeup(p);
goto out;
}
@@ -3720,14 +4190,11 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
*/
raw_spin_lock_irqsave(&p->pi_lock, flags);
smp_mb__after_spinlock();
- if (!(READ_ONCE(p->__state) & state))
+ if (!ttwu_state_match(p, state, &success))
goto unlock;
trace_sched_waking(p);
- /* We're going to change ->state: */
- success = 1;
-
/*
* Ensure we load p->on_rq _after_ p->state, otherwise it would
* be possible to, falsely, observe p->on_rq == 0 and get stuck
@@ -3808,7 +4275,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
* scheduling.
*/
if (smp_load_acquire(&p->on_cpu) &&
- ttwu_queue_wakelist(p, task_cpu(p), wake_flags | WF_ON_CPU))
+ ttwu_queue_wakelist(p, task_cpu(p), wake_flags))
goto unlock;
/*
@@ -3848,52 +4315,117 @@ out:
return success;
}
+static bool __task_needs_rq_lock(struct task_struct *p)
+{
+ unsigned int state = READ_ONCE(p->__state);
+
+ /*
+ * Since pi->lock blocks try_to_wake_up(), we don't need rq->lock when
+ * the task is blocked. Make sure to check @state since ttwu() can drop
+ * locks at the end, see ttwu_queue_wakelist().
+ */
+ if (state == TASK_RUNNING || state == TASK_WAKING)
+ return true;
+
+ /*
+ * Ensure we load p->on_rq after p->__state, otherwise it would be
+ * possible to, falsely, observe p->on_rq == 0.
+ *
+ * See try_to_wake_up() for a longer comment.
+ */
+ smp_rmb();
+ if (p->on_rq)
+ return true;
+
+#ifdef CONFIG_SMP
+ /*
+ * Ensure the task has finished __schedule() and will not be referenced
+ * anymore. Again, see try_to_wake_up() for a longer comment.
+ */
+ smp_rmb();
+ smp_cond_load_acquire(&p->on_cpu, !VAL);
+#endif
+
+ return false;
+}
+
/**
- * try_invoke_on_locked_down_task - Invoke a function on task in fixed state
+ * task_call_func - Invoke a function on task in fixed state
* @p: Process for which the function is to be invoked, can be @current.
* @func: Function to invoke.
* @arg: Argument to function.
*
- * If the specified task can be quickly locked into a definite state
- * (either sleeping or on a given runqueue), arrange to keep it in that
- * state while invoking @func(@arg). This function can use ->on_rq and
- * task_curr() to work out what the state is, if required. Given that
- * @func can be invoked with a runqueue lock held, it had better be quite
- * lightweight.
+ * Fix the task in it's current state by avoiding wakeups and or rq operations
+ * and call @func(@arg) on it. This function can use ->on_rq and task_curr()
+ * to work out what the state is, if required. Given that @func can be invoked
+ * with a runqueue lock held, it had better be quite lightweight.
*
* Returns:
- * @false if the task slipped out from under the locks.
- * @true if the task was locked onto a runqueue or is sleeping.
- * However, @func can override this by returning @false.
+ * Whatever @func returns
*/
-bool try_invoke_on_locked_down_task(struct task_struct *p, bool (*func)(struct task_struct *t, void *arg), void *arg)
+int task_call_func(struct task_struct *p, task_call_f func, void *arg)
{
+ struct rq *rq = NULL;
struct rq_flags rf;
- bool ret = false;
- struct rq *rq;
+ int ret;
raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
- if (p->on_rq) {
+
+ if (__task_needs_rq_lock(p))
rq = __task_rq_lock(p, &rf);
- if (task_rq(p) == rq)
- ret = func(p, arg);
+
+ /*
+ * At this point the task is pinned; either:
+ * - blocked and we're holding off wakeups (pi->lock)
+ * - woken, and we're holding off enqueue (rq->lock)
+ * - queued, and we're holding off schedule (rq->lock)
+ * - running, and we're holding off de-schedule (rq->lock)
+ *
+ * The called function (@func) can use: task_curr(), p->on_rq and
+ * p->__state to differentiate between these states.
+ */
+ ret = func(p, arg);
+
+ if (rq)
rq_unlock(rq, &rf);
- } else {
- switch (READ_ONCE(p->__state)) {
- case TASK_RUNNING:
- case TASK_WAKING:
- break;
- default:
- smp_rmb(); // See smp_rmb() comment in try_to_wake_up().
- if (!p->on_rq)
- ret = func(p, arg);
- }
- }
+
raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags);
return ret;
}
/**
+ * cpu_curr_snapshot - Return a snapshot of the currently running task
+ * @cpu: The CPU on which to snapshot the task.
+ *
+ * Returns the task_struct pointer of the task "currently" running on
+ * the specified CPU. If the same task is running on that CPU throughout,
+ * the return value will be a pointer to that task's task_struct structure.
+ * If the CPU did any context switches even vaguely concurrently with the
+ * execution of this function, the return value will be a pointer to the
+ * task_struct structure of a randomly chosen task that was running on
+ * that CPU somewhere around the time that this function was executing.
+ *
+ * If the specified CPU was offline, the return value is whatever it
+ * is, perhaps a pointer to the task_struct structure of that CPU's idle
+ * task, but there is no guarantee. Callers wishing a useful return
+ * value must take some action to ensure that the specified CPU remains
+ * online throughout.
+ *
+ * This function executes full memory barriers before and after fetching
+ * the pointer, which permits the caller to confine this function's fetch
+ * with respect to the caller's accesses to other shared variables.
+ */
+struct task_struct *cpu_curr_snapshot(int cpu)
+{
+ struct task_struct *t;
+
+ smp_mb(); /* Pairing determined by caller's synchronization design. */
+ t = rcu_dereference(cpu_curr(cpu));
+ smp_mb(); /* Pairing determined by caller's synchronization design. */
+ return t;
+}
+
+/**
* wake_up_process - Wake up a specific process
* @p: The process to be woken up.
*
@@ -3939,7 +4471,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
#ifdef CONFIG_SCHEDSTATS
/* Even if schedstat is disabled, there should not be garbage */
- memset(&p->se.statistics, 0, sizeof(p->se.statistics));
+ memset(&p->stats, 0, sizeof(p->stats));
#endif
RB_CLEAR_NODE(&p->dl.rb_node);
@@ -3965,13 +4497,16 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
p->wake_entry.u_flags = CSD_TYPE_TTWU;
p->migration_pending = NULL;
#endif
+ init_sched_mm_cid(p);
}
DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
#ifdef CONFIG_NUMA_BALANCING
-void set_numabalancing_state(bool enabled)
+int sysctl_numa_balancing_mode;
+
+static void __set_numabalancing_state(bool enabled)
{
if (enabled)
static_branch_enable(&sched_numa_balancing);
@@ -3979,13 +4514,33 @@ void set_numabalancing_state(bool enabled)
static_branch_disable(&sched_numa_balancing);
}
+void set_numabalancing_state(bool enabled)
+{
+ if (enabled)
+ sysctl_numa_balancing_mode = NUMA_BALANCING_NORMAL;
+ else
+ sysctl_numa_balancing_mode = NUMA_BALANCING_DISABLED;
+ __set_numabalancing_state(enabled);
+}
+
#ifdef CONFIG_PROC_SYSCTL
-int sysctl_numa_balancing(struct ctl_table *table, int write,
+static void reset_memory_tiering(void)
+{
+ struct pglist_data *pgdat;
+
+ for_each_online_pgdat(pgdat) {
+ pgdat->nbp_threshold = 0;
+ pgdat->nbp_th_nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE);
+ pgdat->nbp_th_start = jiffies_to_msecs(jiffies);
+ }
+}
+
+static int sysctl_numa_balancing(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
struct ctl_table t;
int err;
- int state = static_branch_likely(&sched_numa_balancing);
+ int state = sysctl_numa_balancing_mode;
if (write && !capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -3995,8 +4550,13 @@ int sysctl_numa_balancing(struct ctl_table *table, int write,
err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
if (err < 0)
return err;
- if (write)
- set_numabalancing_state(state);
+ if (write) {
+ if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) &&
+ (state & NUMA_BALANCING_MEMORY_TIERING))
+ reset_memory_tiering();
+ sysctl_numa_balancing_mode = state;
+ __set_numabalancing_state(state);
+ }
return err;
}
#endif
@@ -4044,7 +4604,7 @@ out:
__setup("schedstats=", setup_schedstats);
#ifdef CONFIG_PROC_SYSCTL
-int sysctl_schedstats(struct ctl_table *table, int write, void *buffer,
+static int sysctl_schedstats(struct ctl_table *table, int write, void *buffer,
size_t *lenp, loff_t *ppos)
{
struct ctl_table t;
@@ -4066,13 +4626,68 @@ int sysctl_schedstats(struct ctl_table *table, int write, void *buffer,
#endif /* CONFIG_PROC_SYSCTL */
#endif /* CONFIG_SCHEDSTATS */
+#ifdef CONFIG_SYSCTL
+static struct ctl_table sched_core_sysctls[] = {
+#ifdef CONFIG_SCHEDSTATS
+ {
+ .procname = "sched_schedstats",
+ .data = NULL,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sysctl_schedstats,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+#endif /* CONFIG_SCHEDSTATS */
+#ifdef CONFIG_UCLAMP_TASK
+ {
+ .procname = "sched_util_clamp_min",
+ .data = &sysctl_sched_uclamp_util_min,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sysctl_sched_uclamp_handler,
+ },
+ {
+ .procname = "sched_util_clamp_max",
+ .data = &sysctl_sched_uclamp_util_max,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sysctl_sched_uclamp_handler,
+ },
+ {
+ .procname = "sched_util_clamp_min_rt_default",
+ .data = &sysctl_sched_uclamp_util_min_rt_default,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sysctl_sched_uclamp_handler,
+ },
+#endif /* CONFIG_UCLAMP_TASK */
+#ifdef CONFIG_NUMA_BALANCING
+ {
+ .procname = "numa_balancing",
+ .data = NULL, /* filled in by handler */
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sysctl_numa_balancing,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_FOUR,
+ },
+#endif /* CONFIG_NUMA_BALANCING */
+ {}
+};
+static int __init sched_core_sysctl_init(void)
+{
+ register_sysctl_init("kernel", sched_core_sysctls);
+ return 0;
+}
+late_initcall(sched_core_sysctl_init);
+#endif /* CONFIG_SYSCTL */
+
/*
* fork()/clone()-time setup:
*/
int sched_fork(unsigned long clone_flags, struct task_struct *p)
{
- unsigned long flags;
-
__sched_fork(clone_flags, p);
/*
* We mark the process as NEW here. This guarantees that
@@ -4099,7 +4714,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
} else if (PRIO_TO_NICE(p->static_prio) < 0)
p->static_prio = NICE_TO_PRIO(0);
- p->prio = p->normal_prio = __normal_prio(p);
+ p->prio = p->normal_prio = p->static_prio;
set_load_weight(p, false);
/*
@@ -4118,23 +4733,6 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
init_entity_runnable_average(&p->se);
- /*
- * The child is not yet in the pid-hash so no cgroup attach races,
- * and the cgroup is pinned to this child due to cgroup_fork()
- * is ran before sched_fork().
- *
- * Silence PROVE_RCU.
- */
- raw_spin_lock_irqsave(&p->pi_lock, flags);
- rseq_migrate(p);
- /*
- * We're setting the CPU for the first time, we don't migrate,
- * so use __set_task_cpu().
- */
- __set_task_cpu(p, smp_processor_id());
- if (p->sched_class->task_fork)
- p->sched_class->task_fork(p);
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
#ifdef CONFIG_SCHED_INFO
if (likely(sched_info_on()))
@@ -4151,6 +4749,35 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
return 0;
}
+void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs)
+{
+ unsigned long flags;
+
+ /*
+ * Because we're not yet on the pid-hash, p->pi_lock isn't strictly
+ * required yet, but lockdep gets upset if rules are violated.
+ */
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
+#ifdef CONFIG_CGROUP_SCHED
+ if (1) {
+ struct task_group *tg;
+ tg = container_of(kargs->cset->subsys[cpu_cgrp_id],
+ struct task_group, css);
+ tg = autogroup_task_group(p, tg);
+ p->sched_task_group = tg;
+ }
+#endif
+ rseq_migrate(p);
+ /*
+ * We're setting the CPU for the first time, we don't migrate,
+ * so use __set_task_cpu().
+ */
+ __set_task_cpu(p, smp_processor_id());
+ if (p->sched_class->task_fork)
+ p->sched_class->task_fork(p);
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+}
+
void sched_post_fork(struct task_struct *p)
{
uclamp_post_fork(p);
@@ -4314,7 +4941,8 @@ static inline void prepare_task(struct task_struct *next)
* Claim the task as running, we do this before switching to it
* such that any running task will have this set.
*
- * See the ttwu() WF_ON_CPU case and its ordering comment.
+ * See the smp_load_acquire(&p->on_cpu) case in ttwu() and
+ * its ordering comment.
*/
WRITE_ONCE(next->on_cpu, 1);
#endif
@@ -4340,10 +4968,10 @@ static inline void finish_task(struct task_struct *prev)
#ifdef CONFIG_SMP
-static void do_balance_callbacks(struct rq *rq, struct callback_head *head)
+static void do_balance_callbacks(struct rq *rq, struct balance_callback *head)
{
void (*func)(struct rq *rq);
- struct callback_head *next;
+ struct balance_callback *next;
lockdep_assert_rq_held(rq);
@@ -4359,28 +4987,58 @@ static void do_balance_callbacks(struct rq *rq, struct callback_head *head)
static void balance_push(struct rq *rq);
-struct callback_head balance_push_callback = {
+/*
+ * balance_push_callback is a right abuse of the callback interface and plays
+ * by significantly different rules.
+ *
+ * Where the normal balance_callback's purpose is to be ran in the same context
+ * that queued it (only later, when it's safe to drop rq->lock again),
+ * balance_push_callback is specifically targeted at __schedule().
+ *
+ * This abuse is tolerated because it places all the unlikely/odd cases behind
+ * a single test, namely: rq->balance_callback == NULL.
+ */
+struct balance_callback balance_push_callback = {
.next = NULL,
- .func = (void (*)(struct callback_head *))balance_push,
+ .func = balance_push,
};
-static inline struct callback_head *splice_balance_callbacks(struct rq *rq)
+static inline struct balance_callback *
+__splice_balance_callbacks(struct rq *rq, bool split)
{
- struct callback_head *head = rq->balance_callback;
+ struct balance_callback *head = rq->balance_callback;
+
+ if (likely(!head))
+ return NULL;
lockdep_assert_rq_held(rq);
- if (head)
+ /*
+ * Must not take balance_push_callback off the list when
+ * splice_balance_callbacks() and balance_callbacks() are not
+ * in the same rq->lock section.
+ *
+ * In that case it would be possible for __schedule() to interleave
+ * and observe the list empty.
+ */
+ if (split && head == &balance_push_callback)
+ head = NULL;
+ else
rq->balance_callback = NULL;
return head;
}
+static inline struct balance_callback *splice_balance_callbacks(struct rq *rq)
+{
+ return __splice_balance_callbacks(rq, true);
+}
+
static void __balance_callbacks(struct rq *rq)
{
- do_balance_callbacks(rq, splice_balance_callbacks(rq));
+ do_balance_callbacks(rq, __splice_balance_callbacks(rq, false));
}
-static inline void balance_callbacks(struct rq *rq, struct callback_head *head)
+static inline void balance_callbacks(struct rq *rq, struct balance_callback *head)
{
unsigned long flags;
@@ -4397,12 +5055,12 @@ static inline void __balance_callbacks(struct rq *rq)
{
}
-static inline struct callback_head *splice_balance_callbacks(struct rq *rq)
+static inline struct balance_callback *splice_balance_callbacks(struct rq *rq)
{
return NULL;
}
-static inline void balance_callbacks(struct rq *rq, struct callback_head *head)
+static inline void balance_callbacks(struct rq *rq, struct balance_callback *head)
{
}
@@ -4516,7 +5174,7 @@ static struct rq *finish_task_switch(struct task_struct *prev)
{
struct rq *rq = this_rq();
struct mm_struct *mm = rq->prev_mm;
- long prev_state;
+ unsigned int prev_state;
/*
* The previous task will have left us with a preempt_count of 2
@@ -4574,23 +5232,18 @@ static struct rq *finish_task_switch(struct task_struct *prev)
* rq->curr, before returning to userspace, so provide them here:
*
* - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly
- * provided by mmdrop(),
+ * provided by mmdrop_lazy_tlb(),
* - a sync_core for SYNC_CORE.
*/
if (mm) {
membarrier_mm_sync_core_before_usermode(mm);
- mmdrop(mm);
+ mmdrop_lazy_tlb_sched(mm);
}
+
if (unlikely(prev_state == TASK_DEAD)) {
if (prev->sched_class->task_dead)
prev->sched_class->task_dead(prev);
- /*
- * Remove function-return probe instances associated with this
- * task and put them back on the free list.
- */
- kprobe_flush_task(prev);
-
/* Task is done with its stack. */
put_task_stack(prev);
@@ -4643,17 +5296,20 @@ context_switch(struct rq *rq, struct task_struct *prev,
/*
* kernel -> kernel lazy + transfer active
- * user -> kernel lazy + mmgrab() active
+ * user -> kernel lazy + mmgrab_lazy_tlb() active
*
- * kernel -> user switch + mmdrop() active
+ * kernel -> user switch + mmdrop_lazy_tlb() active
* user -> user switch
+ *
+ * switch_mm_cid() needs to be updated if the barriers provided
+ * by context_switch() are modified.
*/
if (!next->mm) { // to kernel
enter_lazy_tlb(prev->active_mm, next);
next->active_mm = prev->active_mm;
if (prev->mm) // from user
- mmgrab(prev->active_mm);
+ mmgrab_lazy_tlb(prev->active_mm);
else
prev->active_mm = NULL;
} else { // to user
@@ -4667,14 +5323,18 @@ context_switch(struct rq *rq, struct task_struct *prev,
* finish_task_switch()'s mmdrop().
*/
switch_mm_irqs_off(prev->active_mm, next->mm, next);
+ lru_gen_use_mm(next->mm);
if (!prev->mm) { // from kernel
- /* will mmdrop() in finish_task_switch(). */
+ /* will mmdrop_lazy_tlb() in finish_task_switch(). */
rq->prev_mm = prev->active_mm;
prev->active_mm = NULL;
}
}
+ /* switch_mm_cid() requires the memory barriers above. */
+ switch_mm_cid(rq, prev, next);
+
rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
prepare_lock_switch(rq, next, rf);
@@ -4721,6 +5381,11 @@ bool single_task_running(void)
}
EXPORT_SYMBOL(single_task_running);
+unsigned long long nr_context_switches_cpu(int cpu)
+{
+ return cpu_rq(cpu)->nr_switches;
+}
+
unsigned long long nr_context_switches(void)
{
int i;
@@ -4943,7 +5608,9 @@ void scheduler_tick(void)
unsigned long thermal_pressure;
u64 resched_latency;
- arch_scale_freq_tick();
+ if (housekeeping_cpu(cpu, HK_TYPE_TICK))
+ arch_scale_freq_tick();
+
sched_clock_tick();
rq_lock(rq, &rf);
@@ -4955,6 +5622,8 @@ void scheduler_tick(void)
if (sched_feat(LATENCY_WARN))
resched_latency = cpu_resched_latency(rq);
calc_global_load_tick(rq);
+ sched_core_tick(rq);
+ task_tick_mm_cid(rq, curr);
rq_unlock(rq, &rf);
@@ -5066,7 +5735,7 @@ static void sched_tick_start(int cpu)
int os;
struct tick_work *twork;
- if (housekeeping_cpu(cpu, HK_FLAG_TICK))
+ if (housekeeping_cpu(cpu, HK_TYPE_TICK))
return;
WARN_ON_ONCE(!tick_work_cpu);
@@ -5087,7 +5756,7 @@ static void sched_tick_stop(int cpu)
struct tick_work *twork;
int os;
- if (housekeeping_cpu(cpu, HK_FLAG_TICK))
+ if (housekeeping_cpu(cpu, HK_TYPE_TICK))
return;
WARN_ON_ONCE(!tick_work_cpu);
@@ -5220,8 +5889,7 @@ static noinline void __schedule_bug(struct task_struct *prev)
pr_err("Preemption disabled at:");
print_ip_sym(KERN_ERR, preempt_disable_ip);
}
- if (panic_on_warn)
- panic("scheduling while atomic\n");
+ check_panic_on_warn("scheduling while atomic");
dump_stack();
add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
@@ -5298,7 +5966,7 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
* higher scheduling class, because otherwise those lose the
* opportunity to pull in more work from other CPUs.
*/
- if (likely(prev->sched_class <= &fair_sched_class &&
+ if (likely(!sched_class_above(prev->sched_class, &fair_sched_class) &&
rq->nr_running == rq->cfs.h_nr_running)) {
p = pick_next_task_fair(rq, prev, rf);
@@ -5323,8 +5991,7 @@ restart:
return p;
}
- /* The idle class should always have a runnable task: */
- BUG();
+ BUG(); /* The idle class should always have a runnable task. */
}
#ifdef CONFIG_SCHED_CORE
@@ -5346,66 +6013,34 @@ static inline bool cookie_match(struct task_struct *a, struct task_struct *b)
return a->core_cookie == b->core_cookie;
}
-// XXX fairness/fwd progress conditions
-/*
- * Returns
- * - NULL if there is no runnable task for this class.
- * - the highest priority task for this runqueue if it matches
- * rq->core->core_cookie or its priority is greater than max.
- * - Else returns idle_task.
- */
-static struct task_struct *
-pick_task(struct rq *rq, const struct sched_class *class, struct task_struct *max, bool in_fi)
+static inline struct task_struct *pick_task(struct rq *rq)
{
- struct task_struct *class_pick, *cookie_pick;
- unsigned long cookie = rq->core->core_cookie;
-
- class_pick = class->pick_task(rq);
- if (!class_pick)
- return NULL;
-
- if (!cookie) {
- /*
- * If class_pick is tagged, return it only if it has
- * higher priority than max.
- */
- if (max && class_pick->core_cookie &&
- prio_less(class_pick, max, in_fi))
- return idle_sched_class.pick_task(rq);
+ const struct sched_class *class;
+ struct task_struct *p;
- return class_pick;
+ for_each_class(class) {
+ p = class->pick_task(rq);
+ if (p)
+ return p;
}
- /*
- * If class_pick is idle or matches cookie, return early.
- */
- if (cookie_equals(class_pick, cookie))
- return class_pick;
-
- cookie_pick = sched_core_find(rq, cookie);
-
- /*
- * If class > max && class > cookie, it is the highest priority task on
- * the core (so far) and it must be selected, otherwise we must go with
- * the cookie pick in order to satisfy the constraint.
- */
- if (prio_less(cookie_pick, class_pick, in_fi) &&
- (!max || prio_less(max, class_pick, in_fi)))
- return class_pick;
-
- return cookie_pick;
+ BUG(); /* The idle class should always have a runnable task. */
}
extern void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi);
+static void queue_core_balance(struct rq *rq);
+
static struct task_struct *
pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
- struct task_struct *next, *max = NULL;
- const struct sched_class *class;
+ struct task_struct *next, *p, *max = NULL;
const struct cpumask *smt_mask;
bool fi_before = false;
- int i, j, cpu, occ = 0;
+ bool core_clock_updated = (rq == rq->core);
+ unsigned long cookie;
+ int i, cpu, occ = 0;
+ struct rq *rq_i;
bool need_sync;
if (!sched_core_enabled(rq))
@@ -5445,7 +6080,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
}
rq->core_pick = NULL;
- return next;
+ goto out;
}
put_prev_task_balance(rq, prev, rf);
@@ -5455,10 +6090,18 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
/* reset state */
rq->core->core_cookie = 0UL;
- if (rq->core->core_forceidle) {
+ if (rq->core->core_forceidle_count) {
+ if (!core_clock_updated) {
+ update_rq_clock(rq->core);
+ core_clock_updated = true;
+ }
+ sched_core_account_forceidle(rq);
+ /* reset after accounting force idle */
+ rq->core->core_forceidle_start = 0;
+ rq->core->core_forceidle_count = 0;
+ rq->core->core_forceidle_occupation = 0;
need_sync = true;
fi_before = true;
- rq->core->core_forceidle = false;
}
/*
@@ -5478,12 +6121,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
* and there are no cookied tasks running on siblings.
*/
if (!need_sync) {
- for_each_class(class) {
- next = class->pick_task(rq);
- if (next)
- break;
- }
-
+ next = pick_task(rq);
if (!next->core_cookie) {
rq->core_pick = NULL;
/*
@@ -5492,83 +6130,68 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
*/
WARN_ON_ONCE(fi_before);
task_vruntime_update(rq, next, false);
- goto done;
+ goto out_set_next;
}
}
- for_each_cpu(i, smt_mask) {
- struct rq *rq_i = cpu_rq(i);
-
- rq_i->core_pick = NULL;
+ /*
+ * For each thread: do the regular task pick and find the max prio task
+ * amongst them.
+ *
+ * Tie-break prio towards the current CPU
+ */
+ for_each_cpu_wrap(i, smt_mask, cpu) {
+ rq_i = cpu_rq(i);
- if (i != cpu)
+ /*
+ * Current cpu always has its clock updated on entrance to
+ * pick_next_task(). If the current cpu is not the core,
+ * the core may also have been updated above.
+ */
+ if (i != cpu && (rq_i != rq->core || !core_clock_updated))
update_rq_clock(rq_i);
+
+ p = rq_i->core_pick = pick_task(rq_i);
+ if (!max || prio_less(max, p, fi_before))
+ max = p;
}
+ cookie = rq->core->core_cookie = max->core_cookie;
+
/*
- * Try and select tasks for each sibling in descending sched_class
- * order.
+ * For each thread: try and find a runnable task that matches @max or
+ * force idle.
*/
- for_each_class(class) {
-again:
- for_each_cpu_wrap(i, smt_mask, cpu) {
- struct rq *rq_i = cpu_rq(i);
- struct task_struct *p;
-
- if (rq_i->core_pick)
- continue;
+ for_each_cpu(i, smt_mask) {
+ rq_i = cpu_rq(i);
+ p = rq_i->core_pick;
- /*
- * If this sibling doesn't yet have a suitable task to
- * run; ask for the most eligible task, given the
- * highest priority task already selected for this
- * core.
- */
- p = pick_task(rq_i, class, max, fi_before);
+ if (!cookie_equals(p, cookie)) {
+ p = NULL;
+ if (cookie)
+ p = sched_core_find(rq_i, cookie);
if (!p)
- continue;
+ p = idle_sched_class.pick_task(rq_i);
+ }
- if (!is_task_rq_idle(p))
- occ++;
+ rq_i->core_pick = p;
- rq_i->core_pick = p;
- if (rq_i->idle == p && rq_i->nr_running) {
- rq->core->core_forceidle = true;
+ if (p == rq_i->idle) {
+ if (rq_i->nr_running) {
+ rq->core->core_forceidle_count++;
if (!fi_before)
rq->core->core_forceidle_seq++;
}
-
- /*
- * If this new candidate is of higher priority than the
- * previous; and they're incompatible; we need to wipe
- * the slate and start over. pick_task makes sure that
- * p's priority is more than max if it doesn't match
- * max's cookie.
- *
- * NOTE: this is a linear max-filter and is thus bounded
- * in execution time.
- */
- if (!max || !cookie_match(max, p)) {
- struct task_struct *old_max = max;
-
- rq->core->core_cookie = p->core_cookie;
- max = p;
-
- if (old_max) {
- rq->core->core_forceidle = false;
- for_each_cpu(j, smt_mask) {
- if (j == i)
- continue;
-
- cpu_rq(j)->core_pick = NULL;
- }
- occ = 1;
- goto again;
- }
- }
+ } else {
+ occ++;
}
}
+ if (schedstat_enabled() && rq->core->core_forceidle_count) {
+ rq->core->core_forceidle_start = rq_clock(rq->core);
+ rq->core->core_forceidle_occupation = occ;
+ }
+
rq->core->core_pick_seq = rq->core->core_task_seq;
next = rq->core_pick;
rq->core_sched_seq = rq->core->core_pick_seq;
@@ -5585,7 +6208,7 @@ again:
* non-matching user state.
*/
for_each_cpu(i, smt_mask) {
- struct rq *rq_i = cpu_rq(i);
+ rq_i = cpu_rq(i);
/*
* An online sibling might have gone offline before a task
@@ -5605,8 +6228,8 @@ again:
* 1 0 1
* 1 1 0
*/
- if (!(fi_before && rq->core->core_forceidle))
- task_vruntime_update(rq_i, rq_i->core_pick, rq->core->core_forceidle);
+ if (!(fi_before && rq->core->core_forceidle_count))
+ task_vruntime_update(rq_i, rq_i->core_pick, !!rq->core->core_forceidle_count);
rq_i->core_pick->core_occupation = occ;
@@ -5626,8 +6249,12 @@ again:
resched_curr(rq_i);
}
-done:
+out_set_next:
set_next_task(rq, next);
+out:
+ if (rq->core->core_forceidle_count && next == rq->idle)
+ queue_core_balance(rq);
+
return next;
}
@@ -5649,24 +6276,29 @@ static bool try_steal_cookie(int this, int that)
goto unlock;
p = sched_core_find(src, cookie);
- if (p == src->idle)
+ if (!p)
goto unlock;
do {
if (p == src->core_pick || p == src->curr)
goto next;
- if (!cpumask_test_cpu(this, &p->cpus_mask))
+ if (!is_cpu_allowed(p, this))
goto next;
if (p->core_occupation > dst->idle->core_occupation)
goto next;
+ /*
+ * sched_core_find() and sched_core_next() will ensure that task @p
+ * is not throttled now, we also need to check whether the runqueue
+ * of the destination CPU is being throttled.
+ */
+ if (sched_task_is_throttled(p, this))
+ goto next;
- p->on_rq = TASK_ON_RQ_MIGRATING;
deactivate_task(src, p, 0);
set_task_cpu(p, this);
activate_task(dst, p, 0);
- p->on_rq = TASK_ON_RQ_QUEUED;
resched_curr(dst);
@@ -5688,7 +6320,7 @@ static bool steal_cookie_task(int cpu, struct sched_domain *sd)
{
int i;
- for_each_cpu_wrap(i, sched_domain_span(sd), cpu) {
+ for_each_cpu_wrap(i, sched_domain_span(sd), cpu + 1) {
if (i == cpu)
continue;
@@ -5722,9 +6354,9 @@ static void sched_core_balance(struct rq *rq)
preempt_enable();
}
-static DEFINE_PER_CPU(struct callback_head, core_balance_head);
+static DEFINE_PER_CPU(struct balance_callback, core_balance_head);
-void queue_core_balance(struct rq *rq)
+static void queue_core_balance(struct rq *rq)
{
if (!sched_core_enabled(rq))
return;
@@ -5738,35 +6370,117 @@ void queue_core_balance(struct rq *rq)
queue_balance_callback(rq, &per_cpu(core_balance_head, rq->cpu), sched_core_balance);
}
-static inline void sched_core_cpu_starting(unsigned int cpu)
+static void sched_core_cpu_starting(unsigned int cpu)
{
const struct cpumask *smt_mask = cpu_smt_mask(cpu);
- struct rq *rq, *core_rq = NULL;
- int i;
+ struct rq *rq = cpu_rq(cpu), *core_rq = NULL;
+ unsigned long flags;
+ int t;
+
+ sched_core_lock(cpu, &flags);
- core_rq = cpu_rq(cpu)->core;
+ WARN_ON_ONCE(rq->core != rq);
- if (!core_rq) {
- for_each_cpu(i, smt_mask) {
- rq = cpu_rq(i);
- if (rq->core && rq->core == rq)
- core_rq = rq;
+ /* if we're the first, we'll be our own leader */
+ if (cpumask_weight(smt_mask) == 1)
+ goto unlock;
+
+ /* find the leader */
+ for_each_cpu(t, smt_mask) {
+ if (t == cpu)
+ continue;
+ rq = cpu_rq(t);
+ if (rq->core == rq) {
+ core_rq = rq;
+ break;
}
+ }
- if (!core_rq)
- core_rq = cpu_rq(cpu);
+ if (WARN_ON_ONCE(!core_rq)) /* whoopsie */
+ goto unlock;
- for_each_cpu(i, smt_mask) {
- rq = cpu_rq(i);
+ /* install and validate core_rq */
+ for_each_cpu(t, smt_mask) {
+ rq = cpu_rq(t);
- WARN_ON_ONCE(rq->core && rq->core != core_rq);
+ if (t == cpu)
rq->core = core_rq;
- }
+
+ WARN_ON_ONCE(rq->core != core_rq);
}
+
+unlock:
+ sched_core_unlock(cpu, &flags);
}
+
+static void sched_core_cpu_deactivate(unsigned int cpu)
+{
+ const struct cpumask *smt_mask = cpu_smt_mask(cpu);
+ struct rq *rq = cpu_rq(cpu), *core_rq = NULL;
+ unsigned long flags;
+ int t;
+
+ sched_core_lock(cpu, &flags);
+
+ /* if we're the last man standing, nothing to do */
+ if (cpumask_weight(smt_mask) == 1) {
+ WARN_ON_ONCE(rq->core != rq);
+ goto unlock;
+ }
+
+ /* if we're not the leader, nothing to do */
+ if (rq->core != rq)
+ goto unlock;
+
+ /* find a new leader */
+ for_each_cpu(t, smt_mask) {
+ if (t == cpu)
+ continue;
+ core_rq = cpu_rq(t);
+ break;
+ }
+
+ if (WARN_ON_ONCE(!core_rq)) /* impossible */
+ goto unlock;
+
+ /* copy the shared state to the new leader */
+ core_rq->core_task_seq = rq->core_task_seq;
+ core_rq->core_pick_seq = rq->core_pick_seq;
+ core_rq->core_cookie = rq->core_cookie;
+ core_rq->core_forceidle_count = rq->core_forceidle_count;
+ core_rq->core_forceidle_seq = rq->core_forceidle_seq;
+ core_rq->core_forceidle_occupation = rq->core_forceidle_occupation;
+
+ /*
+ * Accounting edge for forced idle is handled in pick_next_task().
+ * Don't need another one here, since the hotplug thread shouldn't
+ * have a cookie.
+ */
+ core_rq->core_forceidle_start = 0;
+
+ /* install new leader */
+ for_each_cpu(t, smt_mask) {
+ rq = cpu_rq(t);
+ rq->core = core_rq;
+ }
+
+unlock:
+ sched_core_unlock(cpu, &flags);
+}
+
+static inline void sched_core_cpu_dying(unsigned int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+ if (rq->core != rq)
+ rq->core = rq;
+}
+
#else /* !CONFIG_SCHED_CORE */
static inline void sched_core_cpu_starting(unsigned int cpu) {}
+static inline void sched_core_cpu_deactivate(unsigned int cpu) {}
+static inline void sched_core_cpu_dying(unsigned int cpu) {}
static struct task_struct *
pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
@@ -5777,6 +6491,24 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
#endif /* CONFIG_SCHED_CORE */
/*
+ * Constants for the sched_mode argument of __schedule().
+ *
+ * The mode argument allows RT enabled kernels to differentiate a
+ * preemption from blocking on an 'sleeping' spin/rwlock. Note that
+ * SM_MASK_PREEMPT for !RT has all bits set, which allows the compiler to
+ * optimize the AND operation out and just check for zero.
+ */
+#define SM_NONE 0x0
+#define SM_PREEMPT 0x1
+#define SM_RTLOCK_WAIT 0x2
+
+#ifndef CONFIG_PREEMPT_RT
+# define SM_MASK_PREEMPT (~0U)
+#else
+# define SM_MASK_PREEMPT SM_PREEMPT
+#endif
+
+/*
* __schedule() is the main scheduler function.
*
* The main means of driving the scheduler and thus entering this function are:
@@ -5815,7 +6547,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
*
* WARNING: must be called with preemption disabled!
*/
-static void __sched notrace __schedule(bool preempt)
+static void __sched notrace __schedule(unsigned int sched_mode)
{
struct task_struct *prev, *next;
unsigned long *switch_count;
@@ -5828,13 +6560,13 @@ static void __sched notrace __schedule(bool preempt)
rq = cpu_rq(cpu);
prev = rq->curr;
- schedule_debug(prev, preempt);
+ schedule_debug(prev, !!sched_mode);
if (sched_feat(HRTICK) || sched_feat(HRTICK_DL))
hrtick_clear(rq);
local_irq_disable();
- rcu_note_context_switch(preempt);
+ rcu_note_context_switch(!!sched_mode);
/*
* Make sure that signal_pending_state()->signal_pending() below
@@ -5862,20 +6594,17 @@ static void __sched notrace __schedule(bool preempt)
/*
* We must load prev->state once (task_struct::state is volatile), such
- * that:
- *
- * - we form a control dependency vs deactivate_task() below.
- * - ptrace_{,un}freeze_traced() can change ->state underneath us.
+ * that we form a control dependency vs deactivate_task() below.
*/
prev_state = READ_ONCE(prev->__state);
- if (!preempt && prev_state) {
+ if (!(sched_mode & SM_MASK_PREEMPT) && prev_state) {
if (signal_pending_state(prev_state, prev)) {
WRITE_ONCE(prev->__state, TASK_RUNNING);
} else {
prev->sched_contributes_to_load =
(prev_state & TASK_UNINTERRUPTIBLE) &&
!(prev_state & TASK_NOLOAD) &&
- !(prev->flags & PF_FROZEN);
+ !(prev_state & TASK_FROZEN);
if (prev->sched_contributes_to_load)
rq->nr_uninterruptible++;
@@ -5934,7 +6663,7 @@ static void __sched notrace __schedule(bool preempt)
migrate_disable_switch(rq, prev);
psi_sched_switch(prev, next, !task_on_rq_queued(prev));
- trace_sched_switch(preempt, prev, next);
+ trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next, prev_state);
/* Also unlocks the rq: */
rq = context_switch(rq, prev, next, &rf);
@@ -5955,7 +6684,7 @@ void __noreturn do_task_dead(void)
/* Tell freezer to ignore us: */
current->flags |= PF_NOFREEZE;
- __schedule(false);
+ __schedule(SM_NONE);
BUG();
/* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */
@@ -5972,31 +6701,28 @@ static inline void sched_submit_work(struct task_struct *tsk)
task_flags = tsk->flags;
/*
- * If a worker went to sleep, notify and ask workqueue whether
- * it wants to wake up a task to maintain concurrency.
- * As this function is called inside the schedule() context,
- * we disable preemption to avoid it calling schedule() again
- * in the possible wakeup of a kworker and because wq_worker_sleeping()
- * requires it.
+ * If a worker goes to sleep, notify and ask workqueue whether it
+ * wants to wake up a task to maintain concurrency.
*/
if (task_flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
- preempt_disable();
if (task_flags & PF_WQ_WORKER)
wq_worker_sleeping(tsk);
else
io_wq_worker_sleeping(tsk);
- preempt_enable_no_resched();
}
- if (tsk_is_pi_blocked(tsk))
- return;
+ /*
+ * spinlock and rwlock must not flush block requests. This will
+ * deadlock if the callback attempts to acquire a lock which is
+ * already acquired.
+ */
+ SCHED_WARN_ON(current->__state & TASK_RTLOCK_WAIT);
/*
* If we are going to sleep and we have plugged IO queued,
* make sure to submit it to avoid deadlocks.
*/
- if (blk_needs_flush_plug(tsk))
- blk_schedule_flush_plug(tsk);
+ blk_flush_plug(tsk->plug, true);
}
static void sched_update_worker(struct task_struct *tsk)
@@ -6016,7 +6742,7 @@ asmlinkage __visible void __sched schedule(void)
sched_submit_work(tsk);
do {
preempt_disable();
- __schedule(false);
+ __schedule(SM_NONE);
sched_preempt_enable_no_resched();
} while (need_resched());
sched_update_worker(tsk);
@@ -6044,11 +6770,11 @@ void __sched schedule_idle(void)
*/
WARN_ON_ONCE(current->__state);
do {
- __schedule(false);
+ __schedule(SM_NONE);
} while (need_resched());
}
-#if defined(CONFIG_CONTEXT_TRACKING) && !defined(CONFIG_HAVE_CONTEXT_TRACKING_OFFSTACK)
+#if defined(CONFIG_CONTEXT_TRACKING_USER) && !defined(CONFIG_HAVE_CONTEXT_TRACKING_USER_OFFSTACK)
asmlinkage __visible void __sched schedule_user(void)
{
/*
@@ -6079,6 +6805,18 @@ void __sched schedule_preempt_disabled(void)
preempt_disable();
}
+#ifdef CONFIG_PREEMPT_RT
+void __sched notrace schedule_rtlock(void)
+{
+ do {
+ preempt_disable();
+ __schedule(SM_RTLOCK_WAIT);
+ sched_preempt_enable_no_resched();
+ } while (need_resched());
+}
+NOKPROBE_SYMBOL(schedule_rtlock);
+#endif
+
static void __sched notrace preempt_schedule_common(void)
{
do {
@@ -6097,7 +6835,7 @@ static void __sched notrace preempt_schedule_common(void)
*/
preempt_disable_notrace();
preempt_latency_start(1);
- __schedule(true);
+ __schedule(SM_PREEMPT);
preempt_latency_stop(1);
preempt_enable_no_resched_notrace();
@@ -6121,17 +6859,31 @@ asmlinkage __visible void __sched notrace preempt_schedule(void)
*/
if (likely(!preemptible()))
return;
-
preempt_schedule_common();
}
NOKPROBE_SYMBOL(preempt_schedule);
EXPORT_SYMBOL(preempt_schedule);
#ifdef CONFIG_PREEMPT_DYNAMIC
-DEFINE_STATIC_CALL(preempt_schedule, __preempt_schedule_func);
+#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
+#ifndef preempt_schedule_dynamic_enabled
+#define preempt_schedule_dynamic_enabled preempt_schedule
+#define preempt_schedule_dynamic_disabled NULL
+#endif
+DEFINE_STATIC_CALL(preempt_schedule, preempt_schedule_dynamic_enabled);
EXPORT_STATIC_CALL_TRAMP(preempt_schedule);
+#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
+static DEFINE_STATIC_KEY_TRUE(sk_dynamic_preempt_schedule);
+void __sched notrace dynamic_preempt_schedule(void)
+{
+ if (!static_branch_unlikely(&sk_dynamic_preempt_schedule))
+ return;
+ preempt_schedule();
+}
+NOKPROBE_SYMBOL(dynamic_preempt_schedule);
+EXPORT_SYMBOL(dynamic_preempt_schedule);
+#endif
#endif
-
/**
* preempt_schedule_notrace - preempt_schedule called by tracing
@@ -6176,7 +6928,7 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
* an infinite recursion.
*/
prev_ctx = exception_enter();
- __schedule(true);
+ __schedule(SM_PREEMPT);
exception_exit(prev_ctx);
preempt_latency_stop(1);
@@ -6186,126 +6938,27 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
#ifdef CONFIG_PREEMPT_DYNAMIC
-DEFINE_STATIC_CALL(preempt_schedule_notrace, __preempt_schedule_notrace_func);
-EXPORT_STATIC_CALL_TRAMP(preempt_schedule_notrace);
+#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
+#ifndef preempt_schedule_notrace_dynamic_enabled
+#define preempt_schedule_notrace_dynamic_enabled preempt_schedule_notrace
+#define preempt_schedule_notrace_dynamic_disabled NULL
#endif
-
-#endif /* CONFIG_PREEMPTION */
-
-#ifdef CONFIG_PREEMPT_DYNAMIC
-
-#include <linux/entry-common.h>
-
-/*
- * SC:cond_resched
- * SC:might_resched
- * SC:preempt_schedule
- * SC:preempt_schedule_notrace
- * SC:irqentry_exit_cond_resched
- *
- *
- * NONE:
- * cond_resched <- __cond_resched
- * might_resched <- RET0
- * preempt_schedule <- NOP
- * preempt_schedule_notrace <- NOP
- * irqentry_exit_cond_resched <- NOP
- *
- * VOLUNTARY:
- * cond_resched <- __cond_resched
- * might_resched <- __cond_resched
- * preempt_schedule <- NOP
- * preempt_schedule_notrace <- NOP
- * irqentry_exit_cond_resched <- NOP
- *
- * FULL:
- * cond_resched <- RET0
- * might_resched <- RET0
- * preempt_schedule <- preempt_schedule
- * preempt_schedule_notrace <- preempt_schedule_notrace
- * irqentry_exit_cond_resched <- irqentry_exit_cond_resched
- */
-
-enum {
- preempt_dynamic_none = 0,
- preempt_dynamic_voluntary,
- preempt_dynamic_full,
-};
-
-int preempt_dynamic_mode = preempt_dynamic_full;
-
-int sched_dynamic_mode(const char *str)
-{
- if (!strcmp(str, "none"))
- return preempt_dynamic_none;
-
- if (!strcmp(str, "voluntary"))
- return preempt_dynamic_voluntary;
-
- if (!strcmp(str, "full"))
- return preempt_dynamic_full;
-
- return -EINVAL;
-}
-
-void sched_dynamic_update(int mode)
-{
- /*
- * Avoid {NONE,VOLUNTARY} -> FULL transitions from ever ending up in
- * the ZERO state, which is invalid.
- */
- static_call_update(cond_resched, __cond_resched);
- static_call_update(might_resched, __cond_resched);
- static_call_update(preempt_schedule, __preempt_schedule_func);
- static_call_update(preempt_schedule_notrace, __preempt_schedule_notrace_func);
- static_call_update(irqentry_exit_cond_resched, irqentry_exit_cond_resched);
-
- switch (mode) {
- case preempt_dynamic_none:
- static_call_update(cond_resched, __cond_resched);
- static_call_update(might_resched, (void *)&__static_call_return0);
- static_call_update(preempt_schedule, NULL);
- static_call_update(preempt_schedule_notrace, NULL);
- static_call_update(irqentry_exit_cond_resched, NULL);
- pr_info("Dynamic Preempt: none\n");
- break;
-
- case preempt_dynamic_voluntary:
- static_call_update(cond_resched, __cond_resched);
- static_call_update(might_resched, __cond_resched);
- static_call_update(preempt_schedule, NULL);
- static_call_update(preempt_schedule_notrace, NULL);
- static_call_update(irqentry_exit_cond_resched, NULL);
- pr_info("Dynamic Preempt: voluntary\n");
- break;
-
- case preempt_dynamic_full:
- static_call_update(cond_resched, (void *)&__static_call_return0);
- static_call_update(might_resched, (void *)&__static_call_return0);
- static_call_update(preempt_schedule, __preempt_schedule_func);
- static_call_update(preempt_schedule_notrace, __preempt_schedule_notrace_func);
- static_call_update(irqentry_exit_cond_resched, irqentry_exit_cond_resched);
- pr_info("Dynamic Preempt: full\n");
- break;
- }
-
- preempt_dynamic_mode = mode;
-}
-
-static int __init setup_preempt_mode(char *str)
+DEFINE_STATIC_CALL(preempt_schedule_notrace, preempt_schedule_notrace_dynamic_enabled);
+EXPORT_STATIC_CALL_TRAMP(preempt_schedule_notrace);
+#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
+static DEFINE_STATIC_KEY_TRUE(sk_dynamic_preempt_schedule_notrace);
+void __sched notrace dynamic_preempt_schedule_notrace(void)
{
- int mode = sched_dynamic_mode(str);
- if (mode < 0) {
- pr_warn("Dynamic Preempt: unsupported mode: %s\n", str);
- return 1;
- }
-
- sched_dynamic_update(mode);
- return 0;
+ if (!static_branch_unlikely(&sk_dynamic_preempt_schedule_notrace))
+ return;
+ preempt_schedule_notrace();
}
-__setup("preempt=", setup_preempt_mode);
+NOKPROBE_SYMBOL(dynamic_preempt_schedule_notrace);
+EXPORT_SYMBOL(dynamic_preempt_schedule_notrace);
+#endif
+#endif
-#endif /* CONFIG_PREEMPT_DYNAMIC */
+#endif /* CONFIG_PREEMPTION */
/*
* This is the entry point to schedule() from kernel preemption
@@ -6325,7 +6978,7 @@ asmlinkage __visible void __sched preempt_schedule_irq(void)
do {
preempt_disable();
local_irq_enable();
- __schedule(true);
+ __schedule(SM_PREEMPT);
local_irq_disable();
sched_preempt_enable_no_resched();
} while (need_resched());
@@ -6341,6 +6994,18 @@ int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flag
}
EXPORT_SYMBOL(default_wake_function);
+static void __setscheduler_prio(struct task_struct *p, int prio)
+{
+ if (dl_prio(prio))
+ p->sched_class = &dl_sched_class;
+ else if (rt_prio(prio))
+ p->sched_class = &rt_sched_class;
+ else
+ p->sched_class = &fair_sched_class;
+
+ p->prio = prio;
+}
+
#ifdef CONFIG_RT_MUTEXES
static inline int __rt_effective_prio(struct task_struct *pi_task, int prio)
@@ -6456,22 +7121,19 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
} else {
p->dl.pi_se = &p->dl;
}
- p->sched_class = &dl_sched_class;
} else if (rt_prio(prio)) {
if (dl_prio(oldprio))
p->dl.pi_se = &p->dl;
if (oldprio < prio)
queue_flag |= ENQUEUE_HEAD;
- p->sched_class = &rt_sched_class;
} else {
if (dl_prio(oldprio))
p->dl.pi_se = &p->dl;
if (rt_prio(oldprio))
p->rt.timeout = 0;
- p->sched_class = &fair_sched_class;
}
- p->prio = prio;
+ __setscheduler_prio(p, prio);
if (queued)
enqueue_task(rq, p, queue_flag);
@@ -6551,17 +7213,29 @@ out_unlock:
EXPORT_SYMBOL(set_user_nice);
/*
- * can_nice - check if a task can reduce its nice value
+ * is_nice_reduction - check if nice value is an actual reduction
+ *
+ * Similar to can_nice() but does not perform a capability check.
+ *
* @p: task
* @nice: nice value
*/
-int can_nice(const struct task_struct *p, const int nice)
+static bool is_nice_reduction(const struct task_struct *p, const int nice)
{
/* Convert nice value [19,-20] to rlimit style value [1,40]: */
int nice_rlim = nice_to_rlimit(nice);
- return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
- capable(CAP_SYS_NICE));
+ return (nice_rlim <= task_rlimit(p, RLIMIT_NICE));
+}
+
+/*
+ * can_nice - check if a task can reduce its nice value
+ * @p: task
+ * @nice: nice value
+ */
+int can_nice(const struct task_struct *p, const int nice)
+{
+ return is_nice_reduction(p, nice) || capable(CAP_SYS_NICE);
}
#ifdef __ARCH_WANT_SYS_NICE
@@ -6690,12 +7364,14 @@ struct task_struct *idle_task(int cpu)
* required to meet deadlines.
*/
unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
- unsigned long max, enum cpu_util_type type,
+ enum cpu_util_type type,
struct task_struct *p)
{
- unsigned long dl_util, util, irq;
+ unsigned long dl_util, util, irq, max;
struct rq *rq = cpu_rq(cpu);
+ max = arch_scale_cpu_capacity(cpu);
+
if (!uclamp_is_used() &&
type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) {
return max;
@@ -6775,10 +7451,9 @@ unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
return min(max, util);
}
-unsigned long sched_cpu_util(int cpu, unsigned long max)
+unsigned long sched_cpu_util(int cpu)
{
- return effective_cpu_util(cpu, cpu_util_cfs(cpu_rq(cpu)), max,
- ENERGY_UTIL, NULL);
+ return effective_cpu_util(cpu, cpu_util_cfs(cpu), ENERGY_UTIL, NULL);
}
#endif /* CONFIG_SMP */
@@ -6824,35 +7499,6 @@ static void __setscheduler_params(struct task_struct *p,
set_load_weight(p, true);
}
-/* Actually do priority change: must hold pi & rq lock. */
-static void __setscheduler(struct rq *rq, struct task_struct *p,
- const struct sched_attr *attr, bool keep_boost)
-{
- /*
- * If params can't change scheduling class changes aren't allowed
- * either.
- */
- if (attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)
- return;
-
- __setscheduler_params(p, attr);
-
- /*
- * Keep a potential priority boosting if called from
- * sched_setscheduler().
- */
- p->prio = normal_prio(p);
- if (keep_boost)
- p->prio = rt_effective_prio(p, p->prio);
-
- if (dl_prio(p->prio))
- p->sched_class = &dl_sched_class;
- else if (rt_prio(p->prio))
- p->sched_class = &rt_sched_class;
- else
- p->sched_class = &fair_sched_class;
-}
-
/*
* Check the target process has a UID that matches the current process's:
*/
@@ -6869,16 +7515,77 @@ static bool check_same_owner(struct task_struct *p)
return match;
}
+/*
+ * Allow unprivileged RT tasks to decrease priority.
+ * Only issue a capable test if needed and only once to avoid an audit
+ * event on permitted non-privileged operations:
+ */
+static int user_check_sched_setscheduler(struct task_struct *p,
+ const struct sched_attr *attr,
+ int policy, int reset_on_fork)
+{
+ if (fair_policy(policy)) {
+ if (attr->sched_nice < task_nice(p) &&
+ !is_nice_reduction(p, attr->sched_nice))
+ goto req_priv;
+ }
+
+ if (rt_policy(policy)) {
+ unsigned long rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO);
+
+ /* Can't set/change the rt policy: */
+ if (policy != p->policy && !rlim_rtprio)
+ goto req_priv;
+
+ /* Can't increase priority: */
+ if (attr->sched_priority > p->rt_priority &&
+ attr->sched_priority > rlim_rtprio)
+ goto req_priv;
+ }
+
+ /*
+ * Can't set/change SCHED_DEADLINE policy at all for now
+ * (safest behavior); in the future we would like to allow
+ * unprivileged DL tasks to increase their relative deadline
+ * or reduce their runtime (both ways reducing utilization)
+ */
+ if (dl_policy(policy))
+ goto req_priv;
+
+ /*
+ * Treat SCHED_IDLE as nice 20. Only allow a switch to
+ * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
+ */
+ if (task_has_idle_policy(p) && !idle_policy(policy)) {
+ if (!is_nice_reduction(p, task_nice(p)))
+ goto req_priv;
+ }
+
+ /* Can't change other user's priorities: */
+ if (!check_same_owner(p))
+ goto req_priv;
+
+ /* Normal users shall not reset the sched_reset_on_fork flag: */
+ if (p->sched_reset_on_fork && !reset_on_fork)
+ goto req_priv;
+
+ return 0;
+
+req_priv:
+ if (!capable(CAP_SYS_NICE))
+ return -EPERM;
+
+ return 0;
+}
+
static int __sched_setscheduler(struct task_struct *p,
const struct sched_attr *attr,
bool user, bool pi)
{
- int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
- MAX_RT_PRIO - 1 - attr->sched_priority;
- int retval, oldprio, oldpolicy = -1, queued, running;
- int new_effective_prio, policy = attr->sched_policy;
+ int oldpolicy = -1, policy = attr->sched_policy;
+ int retval, oldprio, newprio, queued, running;
const struct sched_class *prev_class;
- struct callback_head *head;
+ struct balance_callback *head;
struct rq_flags rf;
int reset_on_fork;
int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
@@ -6912,58 +7619,11 @@ recheck:
(rt_policy(policy) != (attr->sched_priority != 0)))
return -EINVAL;
- /*
- * Allow unprivileged RT tasks to decrease priority:
- */
- if (user && !capable(CAP_SYS_NICE)) {
- if (fair_policy(policy)) {
- if (attr->sched_nice < task_nice(p) &&
- !can_nice(p, attr->sched_nice))
- return -EPERM;
- }
-
- if (rt_policy(policy)) {
- unsigned long rlim_rtprio =
- task_rlimit(p, RLIMIT_RTPRIO);
-
- /* Can't set/change the rt policy: */
- if (policy != p->policy && !rlim_rtprio)
- return -EPERM;
-
- /* Can't increase priority: */
- if (attr->sched_priority > p->rt_priority &&
- attr->sched_priority > rlim_rtprio)
- return -EPERM;
- }
-
- /*
- * Can't set/change SCHED_DEADLINE policy at all for now
- * (safest behavior); in the future we would like to allow
- * unprivileged DL tasks to increase their relative deadline
- * or reduce their runtime (both ways reducing utilization)
- */
- if (dl_policy(policy))
- return -EPERM;
-
- /*
- * Treat SCHED_IDLE as nice 20. Only allow a switch to
- * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
- */
- if (task_has_idle_policy(p) && !idle_policy(policy)) {
- if (!can_nice(p, task_nice(p)))
- return -EPERM;
- }
-
- /* Can't change other user's priorities: */
- if (!check_same_owner(p))
- return -EPERM;
-
- /* Normal users shall not reset the sched_reset_on_fork flag: */
- if (p->sched_reset_on_fork && !reset_on_fork)
- return -EPERM;
- }
-
if (user) {
+ retval = user_check_sched_setscheduler(p, attr, policy, reset_on_fork);
+ if (retval)
+ return retval;
+
if (attr->sched_flags & SCHED_FLAG_SUGOV)
return -EINVAL;
@@ -7074,6 +7734,7 @@ change:
p->sched_reset_on_fork = reset_on_fork;
oldprio = p->prio;
+ newprio = __normal_prio(policy, attr->sched_priority, attr->sched_nice);
if (pi) {
/*
* Take priority boosted tasks into account. If the new
@@ -7082,8 +7743,8 @@ change:
* the runqueue. This will be done when the task deboost
* itself.
*/
- new_effective_prio = rt_effective_prio(p, newprio);
- if (new_effective_prio == oldprio)
+ newprio = rt_effective_prio(p, newprio);
+ if (newprio == oldprio)
queue_flags &= ~DEQUEUE_MOVE;
}
@@ -7096,7 +7757,10 @@ change:
prev_class = p->sched_class;
- __setscheduler(rq, p, attr, pi);
+ if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) {
+ __setscheduler_params(p, attr);
+ __setscheduler_prio(p, newprio);
+ }
__setscheduler_uclamp(p, attr);
if (queued) {
@@ -7320,6 +7984,16 @@ err_size:
return -E2BIG;
}
+static void get_params(struct task_struct *p, struct sched_attr *attr)
+{
+ if (task_has_dl_policy(p))
+ __getparam_dl(p, attr);
+ else if (task_has_rt_policy(p))
+ attr->sched_priority = p->rt_priority;
+ else
+ attr->sched_nice = task_nice(p);
+}
+
/**
* sys_sched_setscheduler - set/change the scheduler policy and RT priority
* @pid: the pid in question.
@@ -7381,6 +8055,8 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
rcu_read_unlock();
if (likely(p)) {
+ if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS)
+ get_params(p, &attr);
retval = sched_setattr(p, &attr);
put_task_struct(p);
}
@@ -7529,12 +8205,8 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
kattr.sched_policy = p->policy;
if (p->sched_reset_on_fork)
kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
- if (task_has_dl_policy(p))
- __getparam_dl(p, &kattr);
- else if (task_has_rt_policy(p))
- kattr.sched_priority = p->rt_priority;
- else
- kattr.sched_nice = task_nice(p);
+ get_params(p, &kattr);
+ kattr.sched_flags &= SCHED_FLAG_ALL;
#ifdef CONFIG_UCLAMP_TASK
/*
@@ -7555,9 +8227,98 @@ out_unlock:
return retval;
}
-long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
+#ifdef CONFIG_SMP
+int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask)
{
+ int ret = 0;
+
+ /*
+ * If the task isn't a deadline task or admission control is
+ * disabled then we don't care about affinity changes.
+ */
+ if (!task_has_dl_policy(p) || !dl_bandwidth_enabled())
+ return 0;
+
+ /*
+ * Since bandwidth control happens on root_domain basis,
+ * if admission test is enabled, we only admit -deadline
+ * tasks allowed to run on all the CPUs in the task's
+ * root_domain.
+ */
+ rcu_read_lock();
+ if (!cpumask_subset(task_rq(p)->rd->span, mask))
+ ret = -EBUSY;
+ rcu_read_unlock();
+ return ret;
+}
+#endif
+
+static int
+__sched_setaffinity(struct task_struct *p, struct affinity_context *ctx)
+{
+ int retval;
cpumask_var_t cpus_allowed, new_mask;
+
+ if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL))
+ return -ENOMEM;
+
+ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
+ retval = -ENOMEM;
+ goto out_free_cpus_allowed;
+ }
+
+ cpuset_cpus_allowed(p, cpus_allowed);
+ cpumask_and(new_mask, ctx->new_mask, cpus_allowed);
+
+ ctx->new_mask = new_mask;
+ ctx->flags |= SCA_CHECK;
+
+ retval = dl_task_check_affinity(p, new_mask);
+ if (retval)
+ goto out_free_new_mask;
+
+ retval = __set_cpus_allowed_ptr(p, ctx);
+ if (retval)
+ goto out_free_new_mask;
+
+ cpuset_cpus_allowed(p, cpus_allowed);
+ if (!cpumask_subset(new_mask, cpus_allowed)) {
+ /*
+ * We must have raced with a concurrent cpuset update.
+ * Just reset the cpumask to the cpuset's cpus_allowed.
+ */
+ cpumask_copy(new_mask, cpus_allowed);
+
+ /*
+ * If SCA_USER is set, a 2nd call to __set_cpus_allowed_ptr()
+ * will restore the previous user_cpus_ptr value.
+ *
+ * In the unlikely event a previous user_cpus_ptr exists,
+ * we need to further restrict the mask to what is allowed
+ * by that old user_cpus_ptr.
+ */
+ if (unlikely((ctx->flags & SCA_USER) && ctx->user_mask)) {
+ bool empty = !cpumask_and(new_mask, new_mask,
+ ctx->user_mask);
+
+ if (WARN_ON_ONCE(empty))
+ cpumask_copy(new_mask, cpus_allowed);
+ }
+ __set_cpus_allowed_ptr(p, ctx);
+ retval = -EINVAL;
+ }
+
+out_free_new_mask:
+ free_cpumask_var(new_mask);
+out_free_cpus_allowed:
+ free_cpumask_var(cpus_allowed);
+ return retval;
+}
+
+long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
+{
+ struct affinity_context ac;
+ struct cpumask *user_mask;
struct task_struct *p;
int retval;
@@ -7577,68 +8338,42 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
retval = -EINVAL;
goto out_put_task;
}
- if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
- retval = -ENOMEM;
- goto out_put_task;
- }
- if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
- retval = -ENOMEM;
- goto out_free_cpus_allowed;
- }
- retval = -EPERM;
+
if (!check_same_owner(p)) {
rcu_read_lock();
if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
rcu_read_unlock();
- goto out_free_new_mask;
+ retval = -EPERM;
+ goto out_put_task;
}
rcu_read_unlock();
}
retval = security_task_setscheduler(p);
if (retval)
- goto out_free_new_mask;
-
-
- cpuset_cpus_allowed(p, cpus_allowed);
- cpumask_and(new_mask, in_mask, cpus_allowed);
+ goto out_put_task;
/*
- * Since bandwidth control happens on root_domain basis,
- * if admission test is enabled, we only admit -deadline
- * tasks allowed to run on all the CPUs in the task's
- * root_domain.
+ * With non-SMP configs, user_cpus_ptr/user_mask isn't used and
+ * alloc_user_cpus_ptr() returns NULL.
*/
-#ifdef CONFIG_SMP
- if (task_has_dl_policy(p) && dl_bandwidth_enabled()) {
- rcu_read_lock();
- if (!cpumask_subset(task_rq(p)->rd->span, new_mask)) {
- retval = -EBUSY;
- rcu_read_unlock();
- goto out_free_new_mask;
- }
- rcu_read_unlock();
+ user_mask = alloc_user_cpus_ptr(NUMA_NO_NODE);
+ if (user_mask) {
+ cpumask_copy(user_mask, in_mask);
+ } else if (IS_ENABLED(CONFIG_SMP)) {
+ retval = -ENOMEM;
+ goto out_put_task;
}
-#endif
-again:
- retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK);
- if (!retval) {
- cpuset_cpus_allowed(p, cpus_allowed);
- if (!cpumask_subset(new_mask, cpus_allowed)) {
- /*
- * We must have raced with a concurrent cpuset
- * update. Just reset the cpus_allowed to the
- * cpuset's cpus_allowed
- */
- cpumask_copy(new_mask, cpus_allowed);
- goto again;
- }
- }
-out_free_new_mask:
- free_cpumask_var(new_mask);
-out_free_cpus_allowed:
- free_cpumask_var(cpus_allowed);
+ ac = (struct affinity_context){
+ .new_mask = in_mask,
+ .user_mask = user_mask,
+ .flags = SCA_USER,
+ };
+
+ retval = __sched_setaffinity(p, &ac);
+ kfree(ac.user_mask);
+
out_put_task:
put_task_struct(p);
return retval;
@@ -7726,14 +8461,14 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
if (len & (sizeof(unsigned long)-1))
return -EINVAL;
- if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+ if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
return -ENOMEM;
ret = sched_getaffinity(pid, mask);
if (ret == 0) {
unsigned int retlen = min(len, cpumask_size());
- if (copy_to_user(user_mask_ptr, mask, retlen))
+ if (copy_to_user(user_mask_ptr, cpumask_bits(mask), retlen))
ret = -EFAULT;
else
ret = retlen;
@@ -7781,6 +8516,17 @@ int __sched __cond_resched(void)
preempt_schedule_common();
return 1;
}
+ /*
+ * In preemptible kernels, ->rcu_read_lock_nesting tells the tick
+ * whether the current CPU is in an RCU read-side critical section,
+ * so the tick can report quiescent states even for CPUs looping
+ * in kernel context. In contrast, in non-preemptible kernels,
+ * RCU readers leave no in-memory hints, which means that CPU-bound
+ * processes executing in kernel context might never report an
+ * RCU quiescent state. Therefore, the following code causes
+ * cond_resched() to report a quiescent state, but only when RCU
+ * is in urgent need of one.
+ */
#ifndef CONFIG_PREEMPT_RCU
rcu_all_qs();
#endif
@@ -7790,11 +8536,36 @@ EXPORT_SYMBOL(__cond_resched);
#endif
#ifdef CONFIG_PREEMPT_DYNAMIC
+#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
+#define cond_resched_dynamic_enabled __cond_resched
+#define cond_resched_dynamic_disabled ((void *)&__static_call_return0)
DEFINE_STATIC_CALL_RET0(cond_resched, __cond_resched);
EXPORT_STATIC_CALL_TRAMP(cond_resched);
+#define might_resched_dynamic_enabled __cond_resched
+#define might_resched_dynamic_disabled ((void *)&__static_call_return0)
DEFINE_STATIC_CALL_RET0(might_resched, __cond_resched);
EXPORT_STATIC_CALL_TRAMP(might_resched);
+#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
+static DEFINE_STATIC_KEY_FALSE(sk_dynamic_cond_resched);
+int __sched dynamic_cond_resched(void)
+{
+ klp_sched_try_switch();
+ if (!static_branch_unlikely(&sk_dynamic_cond_resched))
+ return 0;
+ return __cond_resched();
+}
+EXPORT_SYMBOL(dynamic_cond_resched);
+
+static DEFINE_STATIC_KEY_FALSE(sk_dynamic_might_resched);
+int __sched dynamic_might_resched(void)
+{
+ if (!static_branch_unlikely(&sk_dynamic_might_resched))
+ return 0;
+ return __cond_resched();
+}
+EXPORT_SYMBOL(dynamic_might_resched);
+#endif
#endif
/*
@@ -7814,9 +8585,7 @@ int __cond_resched_lock(spinlock_t *lock)
if (spin_needbreak(lock) || resched) {
spin_unlock(lock);
- if (resched)
- preempt_schedule_common();
- else
+ if (!_cond_resched())
cpu_relax();
ret = 1;
spin_lock(lock);
@@ -7834,9 +8603,7 @@ int __cond_resched_rwlock_read(rwlock_t *lock)
if (rwlock_needbreak(lock) || resched) {
read_unlock(lock);
- if (resched)
- preempt_schedule_common();
- else
+ if (!_cond_resched())
cpu_relax();
ret = 1;
read_lock(lock);
@@ -7854,9 +8621,7 @@ int __cond_resched_rwlock_write(rwlock_t *lock)
if (rwlock_needbreak(lock) || resched) {
write_unlock(lock);
- if (resched)
- preempt_schedule_common();
- else
+ if (!_cond_resched())
cpu_relax();
ret = 1;
write_lock(lock);
@@ -7865,6 +8630,213 @@ int __cond_resched_rwlock_write(rwlock_t *lock)
}
EXPORT_SYMBOL(__cond_resched_rwlock_write);
+#ifdef CONFIG_PREEMPT_DYNAMIC
+
+#ifdef CONFIG_GENERIC_ENTRY
+#include <linux/entry-common.h>
+#endif
+
+/*
+ * SC:cond_resched
+ * SC:might_resched
+ * SC:preempt_schedule
+ * SC:preempt_schedule_notrace
+ * SC:irqentry_exit_cond_resched
+ *
+ *
+ * NONE:
+ * cond_resched <- __cond_resched
+ * might_resched <- RET0
+ * preempt_schedule <- NOP
+ * preempt_schedule_notrace <- NOP
+ * irqentry_exit_cond_resched <- NOP
+ *
+ * VOLUNTARY:
+ * cond_resched <- __cond_resched
+ * might_resched <- __cond_resched
+ * preempt_schedule <- NOP
+ * preempt_schedule_notrace <- NOP
+ * irqentry_exit_cond_resched <- NOP
+ *
+ * FULL:
+ * cond_resched <- RET0
+ * might_resched <- RET0
+ * preempt_schedule <- preempt_schedule
+ * preempt_schedule_notrace <- preempt_schedule_notrace
+ * irqentry_exit_cond_resched <- irqentry_exit_cond_resched
+ */
+
+enum {
+ preempt_dynamic_undefined = -1,
+ preempt_dynamic_none,
+ preempt_dynamic_voluntary,
+ preempt_dynamic_full,
+};
+
+int preempt_dynamic_mode = preempt_dynamic_undefined;
+
+int sched_dynamic_mode(const char *str)
+{
+ if (!strcmp(str, "none"))
+ return preempt_dynamic_none;
+
+ if (!strcmp(str, "voluntary"))
+ return preempt_dynamic_voluntary;
+
+ if (!strcmp(str, "full"))
+ return preempt_dynamic_full;
+
+ return -EINVAL;
+}
+
+#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
+#define preempt_dynamic_enable(f) static_call_update(f, f##_dynamic_enabled)
+#define preempt_dynamic_disable(f) static_call_update(f, f##_dynamic_disabled)
+#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
+#define preempt_dynamic_enable(f) static_key_enable(&sk_dynamic_##f.key)
+#define preempt_dynamic_disable(f) static_key_disable(&sk_dynamic_##f.key)
+#else
+#error "Unsupported PREEMPT_DYNAMIC mechanism"
+#endif
+
+static DEFINE_MUTEX(sched_dynamic_mutex);
+static bool klp_override;
+
+static void __sched_dynamic_update(int mode)
+{
+ /*
+ * Avoid {NONE,VOLUNTARY} -> FULL transitions from ever ending up in
+ * the ZERO state, which is invalid.
+ */
+ if (!klp_override)
+ preempt_dynamic_enable(cond_resched);
+ preempt_dynamic_enable(might_resched);
+ preempt_dynamic_enable(preempt_schedule);
+ preempt_dynamic_enable(preempt_schedule_notrace);
+ preempt_dynamic_enable(irqentry_exit_cond_resched);
+
+ switch (mode) {
+ case preempt_dynamic_none:
+ if (!klp_override)
+ preempt_dynamic_enable(cond_resched);
+ preempt_dynamic_disable(might_resched);
+ preempt_dynamic_disable(preempt_schedule);
+ preempt_dynamic_disable(preempt_schedule_notrace);
+ preempt_dynamic_disable(irqentry_exit_cond_resched);
+ if (mode != preempt_dynamic_mode)
+ pr_info("Dynamic Preempt: none\n");
+ break;
+
+ case preempt_dynamic_voluntary:
+ if (!klp_override)
+ preempt_dynamic_enable(cond_resched);
+ preempt_dynamic_enable(might_resched);
+ preempt_dynamic_disable(preempt_schedule);
+ preempt_dynamic_disable(preempt_schedule_notrace);
+ preempt_dynamic_disable(irqentry_exit_cond_resched);
+ if (mode != preempt_dynamic_mode)
+ pr_info("Dynamic Preempt: voluntary\n");
+ break;
+
+ case preempt_dynamic_full:
+ if (!klp_override)
+ preempt_dynamic_disable(cond_resched);
+ preempt_dynamic_disable(might_resched);
+ preempt_dynamic_enable(preempt_schedule);
+ preempt_dynamic_enable(preempt_schedule_notrace);
+ preempt_dynamic_enable(irqentry_exit_cond_resched);
+ if (mode != preempt_dynamic_mode)
+ pr_info("Dynamic Preempt: full\n");
+ break;
+ }
+
+ preempt_dynamic_mode = mode;
+}
+
+void sched_dynamic_update(int mode)
+{
+ mutex_lock(&sched_dynamic_mutex);
+ __sched_dynamic_update(mode);
+ mutex_unlock(&sched_dynamic_mutex);
+}
+
+#ifdef CONFIG_HAVE_PREEMPT_DYNAMIC_CALL
+
+static int klp_cond_resched(void)
+{
+ __klp_sched_try_switch();
+ return __cond_resched();
+}
+
+void sched_dynamic_klp_enable(void)
+{
+ mutex_lock(&sched_dynamic_mutex);
+
+ klp_override = true;
+ static_call_update(cond_resched, klp_cond_resched);
+
+ mutex_unlock(&sched_dynamic_mutex);
+}
+
+void sched_dynamic_klp_disable(void)
+{
+ mutex_lock(&sched_dynamic_mutex);
+
+ klp_override = false;
+ __sched_dynamic_update(preempt_dynamic_mode);
+
+ mutex_unlock(&sched_dynamic_mutex);
+}
+
+#endif /* CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */
+
+static int __init setup_preempt_mode(char *str)
+{
+ int mode = sched_dynamic_mode(str);
+ if (mode < 0) {
+ pr_warn("Dynamic Preempt: unsupported mode: %s\n", str);
+ return 0;
+ }
+
+ sched_dynamic_update(mode);
+ return 1;
+}
+__setup("preempt=", setup_preempt_mode);
+
+static void __init preempt_dynamic_init(void)
+{
+ if (preempt_dynamic_mode == preempt_dynamic_undefined) {
+ if (IS_ENABLED(CONFIG_PREEMPT_NONE)) {
+ sched_dynamic_update(preempt_dynamic_none);
+ } else if (IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY)) {
+ sched_dynamic_update(preempt_dynamic_voluntary);
+ } else {
+ /* Default static call setting, nothing to do */
+ WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT));
+ preempt_dynamic_mode = preempt_dynamic_full;
+ pr_info("Dynamic Preempt: full\n");
+ }
+ }
+}
+
+#define PREEMPT_MODEL_ACCESSOR(mode) \
+ bool preempt_model_##mode(void) \
+ { \
+ WARN_ON_ONCE(preempt_dynamic_mode == preempt_dynamic_undefined); \
+ return preempt_dynamic_mode == preempt_dynamic_##mode; \
+ } \
+ EXPORT_SYMBOL_GPL(preempt_model_##mode)
+
+PREEMPT_MODEL_ACCESSOR(none);
+PREEMPT_MODEL_ACCESSOR(voluntary);
+PREEMPT_MODEL_ACCESSOR(full);
+
+#else /* !CONFIG_PREEMPT_DYNAMIC */
+
+static inline void preempt_dynamic_init(void) { }
+
+#endif /* #ifdef CONFIG_PREEMPT_DYNAMIC */
+
/**
* yield - yield the current processor to other threads.
*
@@ -7942,7 +8914,7 @@ again:
if (curr->sched_class != p->sched_class)
goto out_unlock;
- if (task_running(p_rq, p) || !task_is_running(p))
+ if (task_on_cpu(p_rq, p) || !task_is_running(p))
goto out_unlock;
yielded = curr->sched_class->yield_to_task(rq, p);
@@ -7973,8 +8945,7 @@ int io_schedule_prepare(void)
int old_iowait = current->in_iowait;
current->in_iowait = 1;
- blk_schedule_flush_plug(current);
-
+ blk_flush_plug(current->plug, true);
return old_iowait;
}
@@ -8155,9 +9126,9 @@ void sched_show_task(struct task_struct *p)
if (pid_alive(p))
ppid = task_pid_nr(rcu_dereference(p->real_parent));
rcu_read_unlock();
- pr_cont(" stack:%5lu pid:%5d ppid:%6d flags:0x%08lx\n",
+ pr_cont(" stack:%-5lu pid:%-5d ppid:%-6d flags:0x%08lx\n",
free, task_pid_nr(p), ppid,
- (unsigned long)task_thread_info(p)->flags);
+ read_task_thread_flags(p));
print_worker_info(KERN_INFO, p);
print_stop_info(KERN_INFO, p);
@@ -8183,7 +9154,7 @@ state_filter_match(unsigned long state_filter, struct task_struct *p)
* When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows
* TASK_KILLABLE).
*/
- if (state_filter == TASK_UNINTERRUPTIBLE && state == TASK_IDLE)
+ if (state_filter == TASK_UNINTERRUPTIBLE && (state & TASK_NOLOAD))
return false;
return true;
@@ -8231,19 +9202,17 @@ void show_state_filter(unsigned int state_filter)
*/
void __init init_idle(struct task_struct *idle, int cpu)
{
+#ifdef CONFIG_SMP
+ struct affinity_context ac = (struct affinity_context) {
+ .new_mask = cpumask_of(cpu),
+ .flags = 0,
+ };
+#endif
struct rq *rq = cpu_rq(cpu);
unsigned long flags;
__sched_fork(0, idle);
- /*
- * The idle task doesn't need the kthread struct to function, but it
- * is dressed up as a per-CPU kthread and thus needs to play the part
- * if we want to avoid special-casing it in code that deals with per-CPU
- * kthreads.
- */
- set_kthread_struct(idle);
-
raw_spin_lock_irqsave(&idle->pi_lock, flags);
raw_spin_rq_lock(rq);
@@ -8256,9 +9225,6 @@ void __init init_idle(struct task_struct *idle, int cpu)
idle->flags |= PF_IDLE | PF_KTHREAD | PF_NO_SETAFFINITY;
kthread_set_per_cpu(idle, cpu);
- scs_task_reset(idle);
- kasan_unpoison_task_stack(idle);
-
#ifdef CONFIG_SMP
/*
* It's possible that init_idle() gets called multiple times on a task,
@@ -8266,7 +9232,7 @@ void __init init_idle(struct task_struct *idle, int cpu)
*
* And since this is boot we can forgo the serialization.
*/
- set_cpus_allowed_common(idle, cpumask_of(cpu), 0);
+ set_cpus_allowed_common(idle, &ac);
#endif
/*
* We're having a chicken and egg problem, even though we are
@@ -8312,7 +9278,7 @@ int cpuset_cpumask_can_shrink(const struct cpumask *cur,
{
int ret = 1;
- if (!cpumask_weight(cur))
+ if (cpumask_empty(cur))
return ret;
ret = dl_cpuset_cpumask_can_shrink(cur, trial);
@@ -8321,7 +9287,7 @@ int cpuset_cpumask_can_shrink(const struct cpumask *cur,
}
int task_can_attach(struct task_struct *p,
- const struct cpumask *cs_cpus_allowed)
+ const struct cpumask *cs_effective_cpus)
{
int ret = 0;
@@ -8340,8 +9306,13 @@ int task_can_attach(struct task_struct *p,
}
if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span,
- cs_cpus_allowed))
- ret = dl_task_can_attach(p, cs_cpus_allowed);
+ cs_effective_cpus)) {
+ int cpu = cpumask_any_and(cpu_active_mask, cs_effective_cpus);
+
+ if (unlikely(cpu >= nr_cpu_ids))
+ return -EINVAL;
+ ret = dl_cpu_busy(cpu, p);
+ }
out:
return ret;
@@ -8455,7 +9426,6 @@ static void balance_push(struct rq *rq)
struct task_struct *push_task = rq->curr;
lockdep_assert_rq_held(rq);
- SCHED_WARN_ON(rq->cpu != smp_processor_id());
/*
* Ensure the thing is persistent until balance_push_set(.on = false);
@@ -8463,9 +9433,10 @@ static void balance_push(struct rq *rq)
rq->balance_callback = &balance_push_callback;
/*
- * Only active while going offline.
+ * Only active while going offline and when invoked on the outgoing
+ * CPU.
*/
- if (!cpu_dying(rq->cpu))
+ if (!cpu_dying(rq->cpu) || rq != this_rq())
return;
/*
@@ -8625,8 +9596,10 @@ static void cpuset_cpu_active(void)
static int cpuset_cpu_inactive(unsigned int cpu)
{
if (!cpuhp_tasks_frozen) {
- if (dl_cpu_busy(cpu))
- return -EBUSY;
+ int ret = dl_cpu_busy(cpu, NULL);
+
+ if (ret)
+ return ret;
cpuset_update_active_cpus();
} else {
num_cpus_frozen++;
@@ -8656,6 +9629,7 @@ int sched_cpu_activate(unsigned int cpu)
set_cpu_active(cpu, true);
if (sched_smp_initialized) {
+ sched_update_numa(cpu, true);
sched_domains_numa_masks_set(cpu);
cpuset_cpu_active();
}
@@ -8727,15 +9701,19 @@ int sched_cpu_deactivate(unsigned int cpu)
*/
if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
static_branch_dec_cpuslocked(&sched_smt_present);
+
+ sched_core_cpu_deactivate(cpu);
#endif
if (!sched_smp_initialized)
return 0;
+ sched_update_numa(cpu, false);
ret = cpuset_cpu_inactive(cpu);
if (ret) {
balance_push_set(cpu, false);
set_cpu_active(cpu, true);
+ sched_update_numa(cpu, true);
return ret;
}
sched_domains_numa_masks_clear(cpu);
@@ -8831,13 +9809,14 @@ int sched_cpu_dying(unsigned int cpu)
calc_load_migrate(rq);
update_max_interval();
hrtick_clear(rq);
+ sched_core_cpu_dying(cpu);
return 0;
}
#endif
void __init sched_init_smp(void)
{
- sched_init_numa();
+ sched_init_numa(NUMA_NO_NODE);
/*
* There's no userspace yet to cause hotplug operations; hence all the
@@ -8849,7 +9828,7 @@ void __init sched_init_smp(void)
mutex_unlock(&sched_domains_mutex);
/* Move init over to a non-isolated CPU */
- if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0)
+ if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_TYPE_DOMAIN)) < 0)
BUG();
current->flags &= ~PF_NO_SETAFFINITY;
sched_init_granularity();
@@ -8893,20 +9872,17 @@ LIST_HEAD(task_groups);
static struct kmem_cache *task_group_cache __read_mostly;
#endif
-DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
-DECLARE_PER_CPU(cpumask_var_t, select_idle_mask);
-
void __init sched_init(void)
{
unsigned long ptr = 0;
int i;
/* Make sure the linker didn't screw up */
- BUG_ON(&idle_sched_class + 1 != &fair_sched_class ||
- &fair_sched_class + 1 != &rt_sched_class ||
- &rt_sched_class + 1 != &dl_sched_class);
+ BUG_ON(&idle_sched_class != &fair_sched_class + 1 ||
+ &fair_sched_class != &rt_sched_class + 1 ||
+ &rt_sched_class != &dl_sched_class + 1);
#ifdef CONFIG_SMP
- BUG_ON(&dl_sched_class + 1 != &stop_sched_class);
+ BUG_ON(&dl_sched_class != &stop_sched_class + 1);
#endif
wait_bit_init();
@@ -8939,17 +9915,8 @@ void __init sched_init(void)
#endif /* CONFIG_RT_GROUP_SCHED */
}
-#ifdef CONFIG_CPUMASK_OFFSTACK
- for_each_possible_cpu(i) {
- per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node(
- cpumask_size(), GFP_KERNEL, cpu_to_node(i));
- per_cpu(select_idle_mask, i) = (cpumask_var_t)kzalloc_node(
- cpumask_size(), GFP_KERNEL, cpu_to_node(i));
- }
-#endif /* CONFIG_CPUMASK_OFFSTACK */
init_rt_bandwidth(&def_rt_bandwidth, global_rt_period(), global_rt_runtime());
- init_dl_bandwidth(&def_dl_bandwidth, global_rt_period(), global_rt_runtime());
#ifdef CONFIG_SMP
init_defrootdomain();
@@ -9042,14 +10009,17 @@ void __init sched_init(void)
atomic_set(&rq->nr_iowait, 0);
#ifdef CONFIG_SCHED_CORE
- rq->core = NULL;
+ rq->core = rq;
rq->core_pick = NULL;
rq->core_enabled = 0;
rq->core_tree = RB_ROOT;
- rq->core_forceidle = false;
+ rq->core_forceidle_count = 0;
+ rq->core_forceidle_occupation = 0;
+ rq->core_forceidle_start = 0;
rq->core_cookie = 0UL;
#endif
+ zalloc_cpumask_var_node(&rq->scratch_mask, GFP_KERNEL, cpu_to_node(i));
}
set_load_weight(&init_task, false);
@@ -9057,10 +10027,18 @@ void __init sched_init(void)
/*
* The boot idle thread does lazy MMU switching as well:
*/
- mmgrab(&init_mm);
+ mmgrab_lazy_tlb(&init_mm);
enter_lazy_tlb(&init_mm, current);
/*
+ * The idle task doesn't need the kthread struct to function, but it
+ * is dressed up as a per-CPU kthread and thus needs to play the part
+ * if we want to avoid special-casing it in code that deals with per-CPU
+ * kthreads.
+ */
+ WARN_ON(!set_kthread_struct(current));
+
+ /*
* Make us the idle thread. Technically, schedule() should not be
* called from this thread, however somewhere below it might be,
* but because we are the idle thread, we just pick up running again
@@ -9080,18 +10058,14 @@ void __init sched_init(void)
init_uclamp();
+ preempt_dynamic_init();
+
scheduler_running = 1;
}
#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
-static inline int preempt_count_equals(int preempt_offset)
-{
- int nested = preempt_count() + rcu_preempt_depth();
-
- return (nested == preempt_offset);
-}
-void __might_sleep(const char *file, int line, int preempt_offset)
+void __might_sleep(const char *file, int line)
{
unsigned int state = get_current_state();
/*
@@ -9105,11 +10079,32 @@ void __might_sleep(const char *file, int line, int preempt_offset)
(void *)current->task_state_change,
(void *)current->task_state_change);
- ___might_sleep(file, line, preempt_offset);
+ __might_resched(file, line, 0);
}
EXPORT_SYMBOL(__might_sleep);
-void ___might_sleep(const char *file, int line, int preempt_offset)
+static void print_preempt_disable_ip(int preempt_offset, unsigned long ip)
+{
+ if (!IS_ENABLED(CONFIG_DEBUG_PREEMPT))
+ return;
+
+ if (preempt_count() == preempt_offset)
+ return;
+
+ pr_err("Preemption disabled at:");
+ print_ip_sym(KERN_ERR, ip);
+}
+
+static inline bool resched_offsets_ok(unsigned int offsets)
+{
+ unsigned int nested = preempt_count();
+
+ nested += rcu_preempt_depth() << MIGHT_RESCHED_RCU_SHIFT;
+
+ return nested == offsets;
+}
+
+void __might_resched(const char *file, int line, unsigned int offsets)
{
/* Ratelimiting timestamp: */
static unsigned long prev_jiffy;
@@ -9119,7 +10114,7 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
/* WARN_ON_ONCE() by default, no rate limit required: */
rcu_sleep_check();
- if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
+ if ((resched_offsets_ok(offsets) && !irqs_disabled() &&
!is_idle_task(current) && !current->non_block_count) ||
system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING ||
oops_in_progress)
@@ -9132,29 +10127,33 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
/* Save this before calling printk(), since that will clobber it: */
preempt_disable_ip = get_preempt_disable_ip(current);
- printk(KERN_ERR
- "BUG: sleeping function called from invalid context at %s:%d\n",
- file, line);
- printk(KERN_ERR
- "in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n",
- in_atomic(), irqs_disabled(), current->non_block_count,
- current->pid, current->comm);
+ pr_err("BUG: sleeping function called from invalid context at %s:%d\n",
+ file, line);
+ pr_err("in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n",
+ in_atomic(), irqs_disabled(), current->non_block_count,
+ current->pid, current->comm);
+ pr_err("preempt_count: %x, expected: %x\n", preempt_count(),
+ offsets & MIGHT_RESCHED_PREEMPT_MASK);
+
+ if (IS_ENABLED(CONFIG_PREEMPT_RCU)) {
+ pr_err("RCU nest depth: %d, expected: %u\n",
+ rcu_preempt_depth(), offsets >> MIGHT_RESCHED_RCU_SHIFT);
+ }
if (task_stack_end_corrupted(current))
- printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
+ pr_emerg("Thread overran stack, or stack corrupted\n");
debug_show_held_locks(current);
if (irqs_disabled())
print_irqtrace_events(current);
- if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
- && !preempt_count_equals(preempt_offset)) {
- pr_err("Preemption disabled at:");
- print_ip_sym(KERN_ERR, preempt_disable_ip);
- }
+
+ print_preempt_disable_ip(offsets & MIGHT_RESCHED_PREEMPT_MASK,
+ preempt_disable_ip);
+
dump_stack();
add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
}
-EXPORT_SYMBOL(___might_sleep);
+EXPORT_SYMBOL(__might_resched);
void __cant_sleep(const char *file, int line, int preempt_offset)
{
@@ -9235,9 +10234,9 @@ void normalize_rt_tasks(void)
continue;
p->se.exec_start = 0;
- schedstat_set(p->se.statistics.wait_start, 0);
- schedstat_set(p->se.statistics.sleep_start, 0);
- schedstat_set(p->se.statistics.block_start, 0);
+ schedstat_set(p->stats.wait_start, 0);
+ schedstat_set(p->stats.sleep_start, 0);
+ schedstat_set(p->stats.block_start, 0);
if (!dl_task(p) && !rt_task(p)) {
/*
@@ -9331,6 +10330,22 @@ static void sched_free_group(struct task_group *tg)
kmem_cache_free(task_group_cache, tg);
}
+static void sched_free_group_rcu(struct rcu_head *rcu)
+{
+ sched_free_group(container_of(rcu, struct task_group, rcu));
+}
+
+static void sched_unregister_group(struct task_group *tg)
+{
+ unregister_fair_sched_group(tg);
+ unregister_rt_sched_group(tg);
+ /*
+ * We have to wait for yet another RCU grace period to expire, as
+ * print_cfs_stats() might run concurrently.
+ */
+ call_rcu(&tg->rcu, sched_free_group_rcu);
+}
+
/* allocate runqueue etc for a new task group */
struct task_group *sched_create_group(struct task_group *parent)
{
@@ -9374,32 +10389,42 @@ void sched_online_group(struct task_group *tg, struct task_group *parent)
}
/* rcu callback to free various structures associated with a task group */
-static void sched_free_group_rcu(struct rcu_head *rhp)
+static void sched_unregister_group_rcu(struct rcu_head *rhp)
{
/* Now it should be safe to free those cfs_rqs: */
- sched_free_group(container_of(rhp, struct task_group, rcu));
+ sched_unregister_group(container_of(rhp, struct task_group, rcu));
}
void sched_destroy_group(struct task_group *tg)
{
/* Wait for possible concurrent references to cfs_rqs complete: */
- call_rcu(&tg->rcu, sched_free_group_rcu);
+ call_rcu(&tg->rcu, sched_unregister_group_rcu);
}
-void sched_offline_group(struct task_group *tg)
+void sched_release_group(struct task_group *tg)
{
unsigned long flags;
- /* End participation in shares distribution: */
- unregister_fair_sched_group(tg);
-
+ /*
+ * Unlink first, to avoid walk_tg_tree_from() from finding us (via
+ * sched_cfs_period_timer()).
+ *
+ * For this to be effective, we have to wait for all pending users of
+ * this task group to leave their RCU critical section to ensure no new
+ * user will see our dying task group any more. Specifically ensure
+ * that tg_unthrottle_up() won't add decayed cfs_rq's to it.
+ *
+ * We therefore defer calling unregister_fair_sched_group() to
+ * sched_unregister_group() which is guarantied to get called only after the
+ * current RCU grace period has expired.
+ */
spin_lock_irqsave(&task_group_lock, flags);
list_del_rcu(&tg->list);
list_del_rcu(&tg->siblings);
spin_unlock_irqrestore(&task_group_lock, flags);
}
-static void sched_change_group(struct task_struct *tsk, int type)
+static struct task_group *sched_get_task_group(struct task_struct *tsk)
{
struct task_group *tg;
@@ -9411,11 +10436,17 @@ static void sched_change_group(struct task_struct *tsk, int type)
tg = container_of(task_css_check(tsk, cpu_cgrp_id, true),
struct task_group, css);
tg = autogroup_task_group(tsk, tg);
- tsk->sched_task_group = tg;
+
+ return tg;
+}
+
+static void sched_change_group(struct task_struct *tsk, struct task_group *group)
+{
+ tsk->sched_task_group = group;
#ifdef CONFIG_FAIR_GROUP_SCHED
if (tsk->sched_class->task_change_group)
- tsk->sched_class->task_change_group(tsk, type);
+ tsk->sched_class->task_change_group(tsk);
else
#endif
set_task_rq(tsk, task_cpu(tsk));
@@ -9432,10 +10463,19 @@ void sched_move_task(struct task_struct *tsk)
{
int queued, running, queue_flags =
DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
+ struct task_group *group;
struct rq_flags rf;
struct rq *rq;
rq = task_rq_lock(tsk, &rf);
+ /*
+ * Esp. with SCHED_AUTOGROUP enabled it is possible to get superfluous
+ * group changes.
+ */
+ group = sched_get_task_group(tsk);
+ if (group == tsk->sched_task_group)
+ goto unlock;
+
update_rq_clock(rq);
running = task_current(rq, tsk);
@@ -9446,7 +10486,7 @@ void sched_move_task(struct task_struct *tsk)
if (running)
put_prev_task(rq, tsk);
- sched_change_group(tsk, TASK_MOVE_GROUP);
+ sched_change_group(tsk, group);
if (queued)
enqueue_task(rq, tsk, queue_flags);
@@ -9460,6 +10500,7 @@ void sched_move_task(struct task_struct *tsk)
resched_curr(rq);
}
+unlock:
task_rq_unlock(rq, tsk, &rf);
}
@@ -9511,7 +10552,7 @@ static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
{
struct task_group *tg = css_tg(css);
- sched_offline_group(tg);
+ sched_release_group(tg);
}
static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
@@ -9521,56 +10562,22 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
/*
* Relies on the RCU grace period between css_released() and this.
*/
- sched_free_group(tg);
-}
-
-/*
- * This is called before wake_up_new_task(), therefore we really only
- * have to set its group bits, all the other stuff does not apply.
- */
-static void cpu_cgroup_fork(struct task_struct *task)
-{
- struct rq_flags rf;
- struct rq *rq;
-
- rq = task_rq_lock(task, &rf);
-
- update_rq_clock(rq);
- sched_change_group(task, TASK_SET_GROUP);
-
- task_rq_unlock(rq, task, &rf);
+ sched_unregister_group(tg);
}
+#ifdef CONFIG_RT_GROUP_SCHED
static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
{
struct task_struct *task;
struct cgroup_subsys_state *css;
- int ret = 0;
cgroup_taskset_for_each(task, css, tset) {
-#ifdef CONFIG_RT_GROUP_SCHED
if (!sched_rt_can_attach(css_tg(css), task))
return -EINVAL;
-#endif
- /*
- * Serialize against wake_up_new_task() such that if it's
- * running, we're sure to observe its full state.
- */
- raw_spin_lock_irq(&task->pi_lock);
- /*
- * Avoid calling sched_move_task() before wake_up_new_task()
- * has happened. This would lead to problems with PELT, due to
- * move wanting to detach+attach while we're not attached yet.
- */
- if (READ_ONCE(task->__state) == TASK_NEW)
- ret = -EINVAL;
- raw_spin_unlock_irq(&task->pi_lock);
-
- if (ret)
- break;
}
- return ret;
+ return 0;
}
+#endif
static void cpu_cgroup_attach(struct cgroup_taskset *tset)
{
@@ -9824,7 +10831,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota,
* Prevent race between setting of cfs_rq->runtime_enabled and
* unthrottle_offline_cfs_rqs().
*/
- get_online_cpus();
+ cpus_read_lock();
mutex_lock(&cfs_constraints_mutex);
ret = __cfs_schedulable(tg, period, quota);
if (ret)
@@ -9868,7 +10875,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota,
cfs_bandwidth_usage_dec();
out_unlock:
mutex_unlock(&cfs_constraints_mutex);
- put_online_cpus();
+ cpus_read_unlock();
return ret;
}
@@ -10079,15 +11086,21 @@ static int cpu_cfs_stat_show(struct seq_file *sf, void *v)
seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time);
if (schedstat_enabled() && tg != &root_task_group) {
+ struct sched_statistics *stats;
u64 ws = 0;
int i;
- for_each_possible_cpu(i)
- ws += schedstat_val(tg->se[i]->statistics.wait_sum);
+ for_each_possible_cpu(i) {
+ stats = __schedstats_from_se(tg->se[i]);
+ ws += schedstat_val(stats->wait_sum);
+ }
seq_printf(sf, "wait_sum %llu\n", ws);
}
+ seq_printf(sf, "nr_bursts %d\n", cfs_b->nr_burst);
+ seq_printf(sf, "burst_time %llu\n", cfs_b->burst_time);
+
return 0;
}
#endif /* CONFIG_CFS_BANDWIDTH */
@@ -10119,6 +11132,20 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
}
#endif /* CONFIG_RT_GROUP_SCHED */
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static s64 cpu_idle_read_s64(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ return css_tg(css)->idle;
+}
+
+static int cpu_idle_write_s64(struct cgroup_subsys_state *css,
+ struct cftype *cft, s64 idle)
+{
+ return sched_group_set_idle(css_tg(css), idle);
+}
+#endif
+
static struct cftype cpu_legacy_files[] = {
#ifdef CONFIG_FAIR_GROUP_SCHED
{
@@ -10126,6 +11153,11 @@ static struct cftype cpu_legacy_files[] = {
.read_u64 = cpu_shares_read_u64,
.write_u64 = cpu_shares_write_u64,
},
+ {
+ .name = "idle",
+ .read_s64 = cpu_idle_read_s64,
+ .write_s64 = cpu_idle_write_s64,
+ },
#endif
#ifdef CONFIG_CFS_BANDWIDTH
{
@@ -10184,16 +11216,20 @@ static int cpu_extra_stat_show(struct seq_file *sf,
{
struct task_group *tg = css_tg(css);
struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
- u64 throttled_usec;
+ u64 throttled_usec, burst_usec;
throttled_usec = cfs_b->throttled_time;
do_div(throttled_usec, NSEC_PER_USEC);
+ burst_usec = cfs_b->burst_time;
+ do_div(burst_usec, NSEC_PER_USEC);
seq_printf(sf, "nr_periods %d\n"
"nr_throttled %d\n"
- "throttled_usec %llu\n",
+ "throttled_usec %llu\n"
+ "nr_bursts %d\n"
+ "burst_usec %llu\n",
cfs_b->nr_periods, cfs_b->nr_throttled,
- throttled_usec);
+ throttled_usec, cfs_b->nr_burst, burst_usec);
}
#endif
return 0;
@@ -10333,6 +11369,12 @@ static struct cftype cpu_files[] = {
.read_s64 = cpu_weight_nice_read_s64,
.write_s64 = cpu_weight_nice_write_s64,
},
+ {
+ .name = "idle",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .read_s64 = cpu_idle_read_s64,
+ .write_s64 = cpu_idle_write_s64,
+ },
#endif
#ifdef CONFIG_CFS_BANDWIDTH
{
@@ -10371,8 +11413,9 @@ struct cgroup_subsys cpu_cgrp_subsys = {
.css_released = cpu_cgroup_css_released,
.css_free = cpu_cgroup_css_free,
.css_extra_stat_show = cpu_extra_stat_show,
- .fork = cpu_cgroup_fork,
+#ifdef CONFIG_RT_GROUP_SCHED
.can_attach = cpu_cgroup_can_attach,
+#endif
.attach = cpu_cgroup_attach,
.legacy_cftypes = cpu_legacy_files,
.dfl_cftypes = cpu_files,
@@ -10384,6 +11427,19 @@ struct cgroup_subsys cpu_cgrp_subsys = {
void dump_cpu_task(int cpu)
{
+ if (cpu == smp_processor_id() && in_hardirq()) {
+ struct pt_regs *regs;
+
+ regs = get_irq_regs();
+ if (regs) {
+ show_regs(regs);
+ return;
+ }
+ }
+
+ if (trigger_single_cpu_backtrace(cpu))
+ return;
+
pr_info("Task dump for CPU %d:\n", cpu);
sched_show_task(cpu_curr(cpu));
}
@@ -10433,3 +11489,532 @@ void call_trace_sched_update_nr_running(struct rq *rq, int count)
{
trace_sched_update_nr_running_tp(rq, count);
}
+
+#ifdef CONFIG_SCHED_MM_CID
+
+/*
+ * @cid_lock: Guarantee forward-progress of cid allocation.
+ *
+ * Concurrency ID allocation within a bitmap is mostly lock-free. The cid_lock
+ * is only used when contention is detected by the lock-free allocation so
+ * forward progress can be guaranteed.
+ */
+DEFINE_RAW_SPINLOCK(cid_lock);
+
+/*
+ * @use_cid_lock: Select cid allocation behavior: lock-free vs spinlock.
+ *
+ * When @use_cid_lock is 0, the cid allocation is lock-free. When contention is
+ * detected, it is set to 1 to ensure that all newly coming allocations are
+ * serialized by @cid_lock until the allocation which detected contention
+ * completes and sets @use_cid_lock back to 0. This guarantees forward progress
+ * of a cid allocation.
+ */
+int use_cid_lock;
+
+/*
+ * mm_cid remote-clear implements a lock-free algorithm to clear per-mm/cpu cid
+ * concurrently with respect to the execution of the source runqueue context
+ * switch.
+ *
+ * There is one basic properties we want to guarantee here:
+ *
+ * (1) Remote-clear should _never_ mark a per-cpu cid UNSET when it is actively
+ * used by a task. That would lead to concurrent allocation of the cid and
+ * userspace corruption.
+ *
+ * Provide this guarantee by introducing a Dekker memory ordering to guarantee
+ * that a pair of loads observe at least one of a pair of stores, which can be
+ * shown as:
+ *
+ * X = Y = 0
+ *
+ * w[X]=1 w[Y]=1
+ * MB MB
+ * r[Y]=y r[X]=x
+ *
+ * Which guarantees that x==0 && y==0 is impossible. But rather than using
+ * values 0 and 1, this algorithm cares about specific state transitions of the
+ * runqueue current task (as updated by the scheduler context switch), and the
+ * per-mm/cpu cid value.
+ *
+ * Let's introduce task (Y) which has task->mm == mm and task (N) which has
+ * task->mm != mm for the rest of the discussion. There are two scheduler state
+ * transitions on context switch we care about:
+ *
+ * (TSA) Store to rq->curr with transition from (N) to (Y)
+ *
+ * (TSB) Store to rq->curr with transition from (Y) to (N)
+ *
+ * On the remote-clear side, there is one transition we care about:
+ *
+ * (TMA) cmpxchg to *pcpu_cid to set the LAZY flag
+ *
+ * There is also a transition to UNSET state which can be performed from all
+ * sides (scheduler, remote-clear). It is always performed with a cmpxchg which
+ * guarantees that only a single thread will succeed:
+ *
+ * (TMB) cmpxchg to *pcpu_cid to mark UNSET
+ *
+ * Just to be clear, what we do _not_ want to happen is a transition to UNSET
+ * when a thread is actively using the cid (property (1)).
+ *
+ * Let's looks at the relevant combinations of TSA/TSB, and TMA transitions.
+ *
+ * Scenario A) (TSA)+(TMA) (from next task perspective)
+ *
+ * CPU0 CPU1
+ *
+ * Context switch CS-1 Remote-clear
+ * - store to rq->curr: (N)->(Y) (TSA) - cmpxchg to *pcpu_id to LAZY (TMA)
+ * (implied barrier after cmpxchg)
+ * - switch_mm_cid()
+ * - memory barrier (see switch_mm_cid()
+ * comment explaining how this barrier
+ * is combined with other scheduler
+ * barriers)
+ * - mm_cid_get (next)
+ * - READ_ONCE(*pcpu_cid) - rcu_dereference(src_rq->curr)
+ *
+ * This Dekker ensures that either task (Y) is observed by the
+ * rcu_dereference() or the LAZY flag is observed by READ_ONCE(), or both are
+ * observed.
+ *
+ * If task (Y) store is observed by rcu_dereference(), it means that there is
+ * still an active task on the cpu. Remote-clear will therefore not transition
+ * to UNSET, which fulfills property (1).
+ *
+ * If task (Y) is not observed, but the lazy flag is observed by READ_ONCE(),
+ * it will move its state to UNSET, which clears the percpu cid perhaps
+ * uselessly (which is not an issue for correctness). Because task (Y) is not
+ * observed, CPU1 can move ahead to set the state to UNSET. Because moving
+ * state to UNSET is done with a cmpxchg expecting that the old state has the
+ * LAZY flag set, only one thread will successfully UNSET.
+ *
+ * If both states (LAZY flag and task (Y)) are observed, the thread on CPU0
+ * will observe the LAZY flag and transition to UNSET (perhaps uselessly), and
+ * CPU1 will observe task (Y) and do nothing more, which is fine.
+ *
+ * What we are effectively preventing with this Dekker is a scenario where
+ * neither LAZY flag nor store (Y) are observed, which would fail property (1)
+ * because this would UNSET a cid which is actively used.
+ */
+
+void sched_mm_cid_migrate_from(struct task_struct *t)
+{
+ t->migrate_from_cpu = task_cpu(t);
+}
+
+static
+int __sched_mm_cid_migrate_from_fetch_cid(struct rq *src_rq,
+ struct task_struct *t,
+ struct mm_cid *src_pcpu_cid)
+{
+ struct mm_struct *mm = t->mm;
+ struct task_struct *src_task;
+ int src_cid, last_mm_cid;
+
+ if (!mm)
+ return -1;
+
+ last_mm_cid = t->last_mm_cid;
+ /*
+ * If the migrated task has no last cid, or if the current
+ * task on src rq uses the cid, it means the source cid does not need
+ * to be moved to the destination cpu.
+ */
+ if (last_mm_cid == -1)
+ return -1;
+ src_cid = READ_ONCE(src_pcpu_cid->cid);
+ if (!mm_cid_is_valid(src_cid) || last_mm_cid != src_cid)
+ return -1;
+
+ /*
+ * If we observe an active task using the mm on this rq, it means we
+ * are not the last task to be migrated from this cpu for this mm, so
+ * there is no need to move src_cid to the destination cpu.
+ */
+ rcu_read_lock();
+ src_task = rcu_dereference(src_rq->curr);
+ if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) {
+ rcu_read_unlock();
+ t->last_mm_cid = -1;
+ return -1;
+ }
+ rcu_read_unlock();
+
+ return src_cid;
+}
+
+static
+int __sched_mm_cid_migrate_from_try_steal_cid(struct rq *src_rq,
+ struct task_struct *t,
+ struct mm_cid *src_pcpu_cid,
+ int src_cid)
+{
+ struct task_struct *src_task;
+ struct mm_struct *mm = t->mm;
+ int lazy_cid;
+
+ if (src_cid == -1)
+ return -1;
+
+ /*
+ * Attempt to clear the source cpu cid to move it to the destination
+ * cpu.
+ */
+ lazy_cid = mm_cid_set_lazy_put(src_cid);
+ if (!try_cmpxchg(&src_pcpu_cid->cid, &src_cid, lazy_cid))
+ return -1;
+
+ /*
+ * The implicit barrier after cmpxchg per-mm/cpu cid before loading
+ * rq->curr->mm matches the scheduler barrier in context_switch()
+ * between store to rq->curr and load of prev and next task's
+ * per-mm/cpu cid.
+ *
+ * The implicit barrier after cmpxchg per-mm/cpu cid before loading
+ * rq->curr->mm_cid_active matches the barrier in
+ * sched_mm_cid_exit_signals(), sched_mm_cid_before_execve(), and
+ * sched_mm_cid_after_execve() between store to t->mm_cid_active and
+ * load of per-mm/cpu cid.
+ */
+
+ /*
+ * If we observe an active task using the mm on this rq after setting
+ * the lazy-put flag, this task will be responsible for transitioning
+ * from lazy-put flag set to MM_CID_UNSET.
+ */
+ rcu_read_lock();
+ src_task = rcu_dereference(src_rq->curr);
+ if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) {
+ rcu_read_unlock();
+ /*
+ * We observed an active task for this mm, there is therefore
+ * no point in moving this cid to the destination cpu.
+ */
+ t->last_mm_cid = -1;
+ return -1;
+ }
+ rcu_read_unlock();
+
+ /*
+ * The src_cid is unused, so it can be unset.
+ */
+ if (!try_cmpxchg(&src_pcpu_cid->cid, &lazy_cid, MM_CID_UNSET))
+ return -1;
+ return src_cid;
+}
+
+/*
+ * Migration to dst cpu. Called with dst_rq lock held.
+ * Interrupts are disabled, which keeps the window of cid ownership without the
+ * source rq lock held small.
+ */
+void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t)
+{
+ struct mm_cid *src_pcpu_cid, *dst_pcpu_cid;
+ struct mm_struct *mm = t->mm;
+ int src_cid, dst_cid, src_cpu;
+ struct rq *src_rq;
+
+ lockdep_assert_rq_held(dst_rq);
+
+ if (!mm)
+ return;
+ src_cpu = t->migrate_from_cpu;
+ if (src_cpu == -1) {
+ t->last_mm_cid = -1;
+ return;
+ }
+ /*
+ * Move the src cid if the dst cid is unset. This keeps id
+ * allocation closest to 0 in cases where few threads migrate around
+ * many cpus.
+ *
+ * If destination cid is already set, we may have to just clear
+ * the src cid to ensure compactness in frequent migrations
+ * scenarios.
+ *
+ * It is not useful to clear the src cid when the number of threads is
+ * greater or equal to the number of allowed cpus, because user-space
+ * can expect that the number of allowed cids can reach the number of
+ * allowed cpus.
+ */
+ dst_pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu_of(dst_rq));
+ dst_cid = READ_ONCE(dst_pcpu_cid->cid);
+ if (!mm_cid_is_unset(dst_cid) &&
+ atomic_read(&mm->mm_users) >= t->nr_cpus_allowed)
+ return;
+ src_pcpu_cid = per_cpu_ptr(mm->pcpu_cid, src_cpu);
+ src_rq = cpu_rq(src_cpu);
+ src_cid = __sched_mm_cid_migrate_from_fetch_cid(src_rq, t, src_pcpu_cid);
+ if (src_cid == -1)
+ return;
+ src_cid = __sched_mm_cid_migrate_from_try_steal_cid(src_rq, t, src_pcpu_cid,
+ src_cid);
+ if (src_cid == -1)
+ return;
+ if (!mm_cid_is_unset(dst_cid)) {
+ __mm_cid_put(mm, src_cid);
+ return;
+ }
+ /* Move src_cid to dst cpu. */
+ mm_cid_snapshot_time(dst_rq, mm);
+ WRITE_ONCE(dst_pcpu_cid->cid, src_cid);
+}
+
+static void sched_mm_cid_remote_clear(struct mm_struct *mm, struct mm_cid *pcpu_cid,
+ int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ struct task_struct *t;
+ unsigned long flags;
+ int cid, lazy_cid;
+
+ cid = READ_ONCE(pcpu_cid->cid);
+ if (!mm_cid_is_valid(cid))
+ return;
+
+ /*
+ * Clear the cpu cid if it is set to keep cid allocation compact. If
+ * there happens to be other tasks left on the source cpu using this
+ * mm, the next task using this mm will reallocate its cid on context
+ * switch.
+ */
+ lazy_cid = mm_cid_set_lazy_put(cid);
+ if (!try_cmpxchg(&pcpu_cid->cid, &cid, lazy_cid))
+ return;
+
+ /*
+ * The implicit barrier after cmpxchg per-mm/cpu cid before loading
+ * rq->curr->mm matches the scheduler barrier in context_switch()
+ * between store to rq->curr and load of prev and next task's
+ * per-mm/cpu cid.
+ *
+ * The implicit barrier after cmpxchg per-mm/cpu cid before loading
+ * rq->curr->mm_cid_active matches the barrier in
+ * sched_mm_cid_exit_signals(), sched_mm_cid_before_execve(), and
+ * sched_mm_cid_after_execve() between store to t->mm_cid_active and
+ * load of per-mm/cpu cid.
+ */
+
+ /*
+ * If we observe an active task using the mm on this rq after setting
+ * the lazy-put flag, that task will be responsible for transitioning
+ * from lazy-put flag set to MM_CID_UNSET.
+ */
+ rcu_read_lock();
+ t = rcu_dereference(rq->curr);
+ if (READ_ONCE(t->mm_cid_active) && t->mm == mm) {
+ rcu_read_unlock();
+ return;
+ }
+ rcu_read_unlock();
+
+ /*
+ * The cid is unused, so it can be unset.
+ * Disable interrupts to keep the window of cid ownership without rq
+ * lock small.
+ */
+ local_irq_save(flags);
+ if (try_cmpxchg(&pcpu_cid->cid, &lazy_cid, MM_CID_UNSET))
+ __mm_cid_put(mm, cid);
+ local_irq_restore(flags);
+}
+
+static void sched_mm_cid_remote_clear_old(struct mm_struct *mm, int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ struct mm_cid *pcpu_cid;
+ struct task_struct *curr;
+ u64 rq_clock;
+
+ /*
+ * rq->clock load is racy on 32-bit but one spurious clear once in a
+ * while is irrelevant.
+ */
+ rq_clock = READ_ONCE(rq->clock);
+ pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu);
+
+ /*
+ * In order to take care of infrequently scheduled tasks, bump the time
+ * snapshot associated with this cid if an active task using the mm is
+ * observed on this rq.
+ */
+ rcu_read_lock();
+ curr = rcu_dereference(rq->curr);
+ if (READ_ONCE(curr->mm_cid_active) && curr->mm == mm) {
+ WRITE_ONCE(pcpu_cid->time, rq_clock);
+ rcu_read_unlock();
+ return;
+ }
+ rcu_read_unlock();
+
+ if (rq_clock < pcpu_cid->time + SCHED_MM_CID_PERIOD_NS)
+ return;
+ sched_mm_cid_remote_clear(mm, pcpu_cid, cpu);
+}
+
+static void sched_mm_cid_remote_clear_weight(struct mm_struct *mm, int cpu,
+ int weight)
+{
+ struct mm_cid *pcpu_cid;
+ int cid;
+
+ pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu);
+ cid = READ_ONCE(pcpu_cid->cid);
+ if (!mm_cid_is_valid(cid) || cid < weight)
+ return;
+ sched_mm_cid_remote_clear(mm, pcpu_cid, cpu);
+}
+
+static void task_mm_cid_work(struct callback_head *work)
+{
+ unsigned long now = jiffies, old_scan, next_scan;
+ struct task_struct *t = current;
+ struct cpumask *cidmask;
+ struct mm_struct *mm;
+ int weight, cpu;
+
+ SCHED_WARN_ON(t != container_of(work, struct task_struct, cid_work));
+
+ work->next = work; /* Prevent double-add */
+ if (t->flags & PF_EXITING)
+ return;
+ mm = t->mm;
+ if (!mm)
+ return;
+ old_scan = READ_ONCE(mm->mm_cid_next_scan);
+ next_scan = now + msecs_to_jiffies(MM_CID_SCAN_DELAY);
+ if (!old_scan) {
+ unsigned long res;
+
+ res = cmpxchg(&mm->mm_cid_next_scan, old_scan, next_scan);
+ if (res != old_scan)
+ old_scan = res;
+ else
+ old_scan = next_scan;
+ }
+ if (time_before(now, old_scan))
+ return;
+ if (!try_cmpxchg(&mm->mm_cid_next_scan, &old_scan, next_scan))
+ return;
+ cidmask = mm_cidmask(mm);
+ /* Clear cids that were not recently used. */
+ for_each_possible_cpu(cpu)
+ sched_mm_cid_remote_clear_old(mm, cpu);
+ weight = cpumask_weight(cidmask);
+ /*
+ * Clear cids that are greater or equal to the cidmask weight to
+ * recompact it.
+ */
+ for_each_possible_cpu(cpu)
+ sched_mm_cid_remote_clear_weight(mm, cpu, weight);
+}
+
+void init_sched_mm_cid(struct task_struct *t)
+{
+ struct mm_struct *mm = t->mm;
+ int mm_users = 0;
+
+ if (mm) {
+ mm_users = atomic_read(&mm->mm_users);
+ if (mm_users == 1)
+ mm->mm_cid_next_scan = jiffies + msecs_to_jiffies(MM_CID_SCAN_DELAY);
+ }
+ t->cid_work.next = &t->cid_work; /* Protect against double add */
+ init_task_work(&t->cid_work, task_mm_cid_work);
+}
+
+void task_tick_mm_cid(struct rq *rq, struct task_struct *curr)
+{
+ struct callback_head *work = &curr->cid_work;
+ unsigned long now = jiffies;
+
+ if (!curr->mm || (curr->flags & (PF_EXITING | PF_KTHREAD)) ||
+ work->next != work)
+ return;
+ if (time_before(now, READ_ONCE(curr->mm->mm_cid_next_scan)))
+ return;
+ task_work_add(curr, work, TWA_RESUME);
+}
+
+void sched_mm_cid_exit_signals(struct task_struct *t)
+{
+ struct mm_struct *mm = t->mm;
+ struct rq_flags rf;
+ struct rq *rq;
+
+ if (!mm)
+ return;
+
+ preempt_disable();
+ rq = this_rq();
+ rq_lock_irqsave(rq, &rf);
+ preempt_enable_no_resched(); /* holding spinlock */
+ WRITE_ONCE(t->mm_cid_active, 0);
+ /*
+ * Store t->mm_cid_active before loading per-mm/cpu cid.
+ * Matches barrier in sched_mm_cid_remote_clear_old().
+ */
+ smp_mb();
+ mm_cid_put(mm);
+ t->last_mm_cid = t->mm_cid = -1;
+ rq_unlock_irqrestore(rq, &rf);
+}
+
+void sched_mm_cid_before_execve(struct task_struct *t)
+{
+ struct mm_struct *mm = t->mm;
+ struct rq_flags rf;
+ struct rq *rq;
+
+ if (!mm)
+ return;
+
+ preempt_disable();
+ rq = this_rq();
+ rq_lock_irqsave(rq, &rf);
+ preempt_enable_no_resched(); /* holding spinlock */
+ WRITE_ONCE(t->mm_cid_active, 0);
+ /*
+ * Store t->mm_cid_active before loading per-mm/cpu cid.
+ * Matches barrier in sched_mm_cid_remote_clear_old().
+ */
+ smp_mb();
+ mm_cid_put(mm);
+ t->last_mm_cid = t->mm_cid = -1;
+ rq_unlock_irqrestore(rq, &rf);
+}
+
+void sched_mm_cid_after_execve(struct task_struct *t)
+{
+ struct mm_struct *mm = t->mm;
+ struct rq_flags rf;
+ struct rq *rq;
+
+ if (!mm)
+ return;
+
+ preempt_disable();
+ rq = this_rq();
+ rq_lock_irqsave(rq, &rf);
+ preempt_enable_no_resched(); /* holding spinlock */
+ WRITE_ONCE(t->mm_cid_active, 1);
+ /*
+ * Store t->mm_cid_active before loading per-mm/cpu cid.
+ * Matches barrier in sched_mm_cid_remote_clear_old().
+ */
+ smp_mb();
+ t->last_mm_cid = t->mm_cid = mm_cid_get(rq, mm);
+ rq_unlock_irqrestore(rq, &rf);
+ rseq_set_notify_resume(t);
+}
+
+void sched_mm_cid_fork(struct task_struct *t)
+{
+ WARN_ON_ONCE(!t->mm || t->mm_cid != -1);
+ t->mm_cid_active = 1;
+}
+#endif
diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c
index 9a80e9a474c0..a57fd8f27498 100644
--- a/kernel/sched/core_sched.c
+++ b/kernel/sched/core_sched.c
@@ -1,8 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-only
-#include <linux/prctl.h>
-#include "sched.h"
-
/*
* A simple wrapper around refcount. An allocated sched_core_cookie's
* address is used to compute the cookie of the task.
@@ -11,7 +8,7 @@ struct sched_core_cookie {
refcount_t refcnt;
};
-unsigned long sched_core_alloc_cookie(void)
+static unsigned long sched_core_alloc_cookie(void)
{
struct sched_core_cookie *ck = kmalloc(sizeof(*ck), GFP_KERNEL);
if (!ck)
@@ -23,7 +20,7 @@ unsigned long sched_core_alloc_cookie(void)
return (unsigned long)ck;
}
-void sched_core_put_cookie(unsigned long cookie)
+static void sched_core_put_cookie(unsigned long cookie)
{
struct sched_core_cookie *ptr = (void *)cookie;
@@ -33,7 +30,7 @@ void sched_core_put_cookie(unsigned long cookie)
}
}
-unsigned long sched_core_get_cookie(unsigned long cookie)
+static unsigned long sched_core_get_cookie(unsigned long cookie)
{
struct sched_core_cookie *ptr = (void *)cookie;
@@ -53,12 +50,12 @@ unsigned long sched_core_get_cookie(unsigned long cookie)
*
* Returns: the old cookie
*/
-unsigned long sched_core_update_cookie(struct task_struct *p, unsigned long cookie)
+static unsigned long sched_core_update_cookie(struct task_struct *p,
+ unsigned long cookie)
{
unsigned long old_cookie;
struct rq_flags rf;
struct rq *rq;
- bool enqueued;
rq = task_rq_lock(p, &rf);
@@ -70,22 +67,28 @@ unsigned long sched_core_update_cookie(struct task_struct *p, unsigned long cook
*/
SCHED_WARN_ON((p->core_cookie || cookie) && !sched_core_enabled(rq));
- enqueued = sched_core_enqueued(p);
- if (enqueued)
- sched_core_dequeue(rq, p);
+ if (sched_core_enqueued(p))
+ sched_core_dequeue(rq, p, DEQUEUE_SAVE);
old_cookie = p->core_cookie;
p->core_cookie = cookie;
- if (enqueued)
+ /*
+ * Consider the cases: !prev_cookie and !cookie.
+ */
+ if (cookie && task_on_rq_queued(p))
sched_core_enqueue(rq, p);
/*
* If task is currently running, it may not be compatible anymore after
* the cookie change, so enter the scheduler on its CPU to schedule it
* away.
+ *
+ * Note that it is possible that as a result of this cookie change, the
+ * core has now entered/left forced idle state. Defer accounting to the
+ * next scheduling edge, rather than always forcing a reschedule here.
*/
- if (task_running(rq, p))
+ if (task_on_cpu(rq, p))
resched_curr(rq);
task_rq_unlock(rq, p, &rf);
@@ -134,6 +137,10 @@ int sched_core_share_pid(unsigned int cmd, pid_t pid, enum pid_type type,
if (!static_branch_likely(&sched_smt_present))
return -ENODEV;
+ BUILD_BUG_ON(PR_SCHED_CORE_SCOPE_THREAD != PIDTYPE_PID);
+ BUILD_BUG_ON(PR_SCHED_CORE_SCOPE_THREAD_GROUP != PIDTYPE_TGID);
+ BUILD_BUG_ON(PR_SCHED_CORE_SCOPE_PROCESS_GROUP != PIDTYPE_PGID);
+
if (type > PIDTYPE_PGID || cmd >= PR_SCHED_CORE_MAX || pid < 0 ||
(cmd != PR_SCHED_CORE_GET && uaddr))
return -EINVAL;
@@ -198,7 +205,7 @@ int sched_core_share_pid(unsigned int cmd, pid_t pid, enum pid_type type,
default:
err = -EINVAL;
goto out;
- };
+ }
if (type == PIDTYPE_PID) {
__sched_core_set(task, cookie);
@@ -227,3 +234,67 @@ out:
return err;
}
+#ifdef CONFIG_SCHEDSTATS
+
+/* REQUIRES: rq->core's clock recently updated. */
+void __sched_core_account_forceidle(struct rq *rq)
+{
+ const struct cpumask *smt_mask = cpu_smt_mask(cpu_of(rq));
+ u64 delta, now = rq_clock(rq->core);
+ struct rq *rq_i;
+ struct task_struct *p;
+ int i;
+
+ lockdep_assert_rq_held(rq);
+
+ WARN_ON_ONCE(!rq->core->core_forceidle_count);
+
+ if (rq->core->core_forceidle_start == 0)
+ return;
+
+ delta = now - rq->core->core_forceidle_start;
+ if (unlikely((s64)delta <= 0))
+ return;
+
+ rq->core->core_forceidle_start = now;
+
+ if (WARN_ON_ONCE(!rq->core->core_forceidle_occupation)) {
+ /* can't be forced idle without a running task */
+ } else if (rq->core->core_forceidle_count > 1 ||
+ rq->core->core_forceidle_occupation > 1) {
+ /*
+ * For larger SMT configurations, we need to scale the charged
+ * forced idle amount since there can be more than one forced
+ * idle sibling and more than one running cookied task.
+ */
+ delta *= rq->core->core_forceidle_count;
+ delta = div_u64(delta, rq->core->core_forceidle_occupation);
+ }
+
+ for_each_cpu(i, smt_mask) {
+ rq_i = cpu_rq(i);
+ p = rq_i->core_pick ?: rq_i->curr;
+
+ if (p == rq_i->idle)
+ continue;
+
+ /*
+ * Note: this will account forceidle to the current cpu, even
+ * if it comes from our SMT sibling.
+ */
+ __account_forceidle_time(p, delta);
+ }
+}
+
+void __sched_core_tick(struct rq *rq)
+{
+ if (!rq->core->core_forceidle_count)
+ return;
+
+ if (rq != rq->core)
+ update_rq_clock(rq->core);
+
+ __sched_core_account_forceidle(rq);
+}
+
+#endif /* CONFIG_SCHEDSTATS */
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index 893eece65bfd..0de9dda09949 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -1,12 +1,11 @@
// SPDX-License-Identifier: GPL-2.0
+
/*
* CPU accounting code for task groups.
*
* Based on the work by Paul Menage (menage@google.com) and Balbir Singh
* (balbir@in.ibm.com).
*/
-#include <asm/irq_regs.h>
-#include "sched.h"
/* Time spent by the tasks of the CPU accounting group executing in ... */
enum cpuacct_stat_index {
@@ -21,15 +20,11 @@ static const char * const cpuacct_stat_desc[] = {
[CPUACCT_STAT_SYSTEM] = "system",
};
-struct cpuacct_usage {
- u64 usages[CPUACCT_STAT_NSTATS];
-};
-
/* track CPU usage of a group of tasks and its child groups */
struct cpuacct {
struct cgroup_subsys_state css;
/* cpuusage holds pointer to a u64-type object on every CPU */
- struct cpuacct_usage __percpu *cpuusage;
+ u64 __percpu *cpuusage;
struct kernel_cpustat __percpu *cpustat;
};
@@ -49,7 +44,7 @@ static inline struct cpuacct *parent_ca(struct cpuacct *ca)
return css_ca(ca->css.parent);
}
-static DEFINE_PER_CPU(struct cpuacct_usage, root_cpuacct_cpuusage);
+static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage);
static struct cpuacct root_cpuacct = {
.cpustat = &kernel_cpustat,
.cpuusage = &root_cpuacct_cpuusage,
@@ -68,7 +63,7 @@ cpuacct_css_alloc(struct cgroup_subsys_state *parent_css)
if (!ca)
goto out;
- ca->cpuusage = alloc_percpu(struct cpuacct_usage);
+ ca->cpuusage = alloc_percpu(u64);
if (!ca->cpuusage)
goto out_free_ca;
@@ -99,14 +94,16 @@ static void cpuacct_css_free(struct cgroup_subsys_state *css)
static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu,
enum cpuacct_stat_index index)
{
- struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
+ u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
+ u64 *cpustat = per_cpu_ptr(ca->cpustat, cpu)->cpustat;
u64 data;
/*
* We allow index == CPUACCT_STAT_NSTATS here to read
* the sum of usages.
*/
- BUG_ON(index > CPUACCT_STAT_NSTATS);
+ if (WARN_ON_ONCE(index > CPUACCT_STAT_NSTATS))
+ return 0;
#ifndef CONFIG_64BIT
/*
@@ -115,14 +112,17 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu,
raw_spin_rq_lock_irq(cpu_rq(cpu));
#endif
- if (index == CPUACCT_STAT_NSTATS) {
- int i = 0;
-
- data = 0;
- for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
- data += cpuusage->usages[i];
- } else {
- data = cpuusage->usages[index];
+ switch (index) {
+ case CPUACCT_STAT_USER:
+ data = cpustat[CPUTIME_USER] + cpustat[CPUTIME_NICE];
+ break;
+ case CPUACCT_STAT_SYSTEM:
+ data = cpustat[CPUTIME_SYSTEM] + cpustat[CPUTIME_IRQ] +
+ cpustat[CPUTIME_SOFTIRQ];
+ break;
+ case CPUACCT_STAT_NSTATS:
+ data = *cpuusage;
+ break;
}
#ifndef CONFIG_64BIT
@@ -132,10 +132,14 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu,
return data;
}
-static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
+static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu)
{
- struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
- int i;
+ u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
+ u64 *cpustat = per_cpu_ptr(ca->cpustat, cpu)->cpustat;
+
+ /* Don't allow to reset global kernel_cpustat */
+ if (ca == &root_cpuacct)
+ return;
#ifndef CONFIG_64BIT
/*
@@ -143,9 +147,10 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
*/
raw_spin_rq_lock_irq(cpu_rq(cpu));
#endif
-
- for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
- cpuusage->usages[i] = val;
+ *cpuusage = 0;
+ cpustat[CPUTIME_USER] = cpustat[CPUTIME_NICE] = 0;
+ cpustat[CPUTIME_SYSTEM] = cpustat[CPUTIME_IRQ] = 0;
+ cpustat[CPUTIME_SOFTIRQ] = 0;
#ifndef CONFIG_64BIT
raw_spin_rq_unlock_irq(cpu_rq(cpu));
@@ -196,7 +201,7 @@ static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft,
return -EINVAL;
for_each_possible_cpu(cpu)
- cpuacct_cpuusage_write(ca, cpu, 0);
+ cpuacct_cpuusage_write(ca, cpu);
return 0;
}
@@ -243,25 +248,10 @@ static int cpuacct_all_seq_show(struct seq_file *m, void *V)
seq_puts(m, "\n");
for_each_possible_cpu(cpu) {
- struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
-
seq_printf(m, "%d", cpu);
-
- for (index = 0; index < CPUACCT_STAT_NSTATS; index++) {
-#ifndef CONFIG_64BIT
- /*
- * Take rq->lock to make 64-bit read safe on 32-bit
- * platforms.
- */
- raw_spin_rq_lock_irq(cpu_rq(cpu));
-#endif
-
- seq_printf(m, " %llu", cpuusage->usages[index]);
-
-#ifndef CONFIG_64BIT
- raw_spin_rq_unlock_irq(cpu_rq(cpu));
-#endif
- }
+ for (index = 0; index < CPUACCT_STAT_NSTATS; index++)
+ seq_printf(m, " %llu",
+ cpuacct_cpuusage_read(ca, cpu, index));
seq_puts(m, "\n");
}
return 0;
@@ -270,25 +260,30 @@ static int cpuacct_all_seq_show(struct seq_file *m, void *V)
static int cpuacct_stats_show(struct seq_file *sf, void *v)
{
struct cpuacct *ca = css_ca(seq_css(sf));
- s64 val[CPUACCT_STAT_NSTATS];
+ struct task_cputime cputime;
+ u64 val[CPUACCT_STAT_NSTATS];
int cpu;
int stat;
- memset(val, 0, sizeof(val));
+ memset(&cputime, 0, sizeof(cputime));
for_each_possible_cpu(cpu) {
u64 *cpustat = per_cpu_ptr(ca->cpustat, cpu)->cpustat;
- val[CPUACCT_STAT_USER] += cpustat[CPUTIME_USER];
- val[CPUACCT_STAT_USER] += cpustat[CPUTIME_NICE];
- val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SYSTEM];
- val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_IRQ];
- val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SOFTIRQ];
+ cputime.utime += cpustat[CPUTIME_USER];
+ cputime.utime += cpustat[CPUTIME_NICE];
+ cputime.stime += cpustat[CPUTIME_SYSTEM];
+ cputime.stime += cpustat[CPUTIME_IRQ];
+ cputime.stime += cpustat[CPUTIME_SOFTIRQ];
+
+ cputime.sum_exec_runtime += *per_cpu_ptr(ca->cpuusage, cpu);
}
+ cputime_adjust(&cputime, &seq_css(sf)->cgroup->prev_cputime,
+ &val[CPUACCT_STAT_USER], &val[CPUACCT_STAT_SYSTEM]);
+
for (stat = 0; stat < CPUACCT_STAT_NSTATS; stat++) {
- seq_printf(sf, "%s %lld\n",
- cpuacct_stat_desc[stat],
- (long long)nsec_to_clock_t(val[stat]));
+ seq_printf(sf, "%s %llu\n", cpuacct_stat_desc[stat],
+ nsec_to_clock_t(val[stat]));
}
return 0;
@@ -338,19 +333,13 @@ static struct cftype files[] = {
*/
void cpuacct_charge(struct task_struct *tsk, u64 cputime)
{
+ unsigned int cpu = task_cpu(tsk);
struct cpuacct *ca;
- int index = CPUACCT_STAT_SYSTEM;
- struct pt_regs *regs = get_irq_regs() ? : task_pt_regs(tsk);
- if (regs && user_mode(regs))
- index = CPUACCT_STAT_USER;
-
- rcu_read_lock();
+ lockdep_assert_rq_held(cpu_rq(cpu));
for (ca = task_ca(tsk); ca; ca = parent_ca(ca))
- __this_cpu_add(ca->cpuusage->usages[index], cputime);
-
- rcu_read_unlock();
+ *per_cpu_ptr(ca->cpuusage, cpu) += cputime;
}
/*
@@ -362,10 +351,8 @@ void cpuacct_account_field(struct task_struct *tsk, int index, u64 val)
{
struct cpuacct *ca;
- rcu_read_lock();
for (ca = task_ca(tsk); ca != &root_cpuacct; ca = parent_ca(ca))
__this_cpu_add(ca->cpustat->cpustat[index], val);
- rcu_read_unlock();
}
struct cgroup_subsys cpuacct_cgrp_subsys = {
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index ceb03d76c0cc..57c92d751bcd 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -1,12 +1,11 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
- * kernel/sched/cpudl.c
+ * kernel/sched/cpudeadline.c
*
* Global CPU deadline management
*
* Author: Juri Lelli <j.lelli@sssup.it>
*/
-#include "sched.h"
static inline int parent(int i)
{
@@ -124,7 +123,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
unsigned long cap, max_cap = 0;
int cpu, max_cpu = -1;
- if (!static_branch_unlikely(&sched_asym_cpucapacity))
+ if (!sched_asym_cpucap_active())
return 1;
/* Ensure the capacity of the CPUs fits the task. */
diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c
index 7c2fe50fd76d..5252fb191fae 100644
--- a/kernel/sched/cpufreq.c
+++ b/kernel/sched/cpufreq.c
@@ -5,9 +5,6 @@
* Copyright (C) 2016, Intel Corporation
* Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
*/
-#include <linux/cpufreq.h>
-
-#include "sched.h"
DEFINE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data);
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 57124614363d..e3211455b203 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -6,13 +6,6 @@
* Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
*/
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include "sched.h"
-
-#include <linux/sched/cpufreq.h>
-#include <trace/events/power.h>
-
#define IOWAIT_BOOST_MIN (SCHED_CAPACITY_SCALE / 8)
struct sugov_tunables {
@@ -55,7 +48,6 @@ struct sugov_cpu {
unsigned long util;
unsigned long bw_dl;
- unsigned long max;
/* The field below is for single-CPU policies only: */
#ifdef CONFIG_NO_HZ_COMMON
@@ -164,11 +156,9 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy,
static void sugov_get_util(struct sugov_cpu *sg_cpu)
{
struct rq *rq = cpu_rq(sg_cpu->cpu);
- unsigned long max = arch_scale_cpu_capacity(sg_cpu->cpu);
- sg_cpu->max = max;
sg_cpu->bw_dl = cpu_bw_dl(rq);
- sg_cpu->util = effective_cpu_util(sg_cpu->cpu, cpu_util_cfs(rq), max,
+ sg_cpu->util = effective_cpu_util(sg_cpu->cpu, cpu_util_cfs(sg_cpu->cpu),
FREQUENCY_UTIL, NULL);
}
@@ -246,6 +236,7 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
* sugov_iowait_apply() - Apply the IO boost to a CPU.
* @sg_cpu: the sugov data for the cpu to boost
* @time: the update time from the caller
+ * @max_cap: the max CPU capacity
*
* A CPU running a task which woken up after an IO operation can have its
* utilization boosted to speed up the completion of those IO operations.
@@ -259,7 +250,8 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
* This mechanism is designed to boost high frequently IO waiting tasks, while
* being more conservative on tasks which does sporadic IO operations.
*/
-static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time)
+static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time,
+ unsigned long max_cap)
{
unsigned long boost;
@@ -288,7 +280,8 @@ static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time)
* sg_cpu->util is already in capacity scale; convert iowait_boost
* into the same scale so we can compare.
*/
- boost = (sg_cpu->iowait_boost * sg_cpu->max) >> SCHED_CAPACITY_SHIFT;
+ boost = (sg_cpu->iowait_boost * max_cap) >> SCHED_CAPACITY_SHIFT;
+ boost = uclamp_rq_util_with(cpu_rq(sg_cpu->cpu), boost, NULL);
if (sg_cpu->util < boost)
sg_cpu->util = boost;
}
@@ -317,7 +310,8 @@ static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu)
}
static inline bool sugov_update_single_common(struct sugov_cpu *sg_cpu,
- u64 time, unsigned int flags)
+ u64 time, unsigned long max_cap,
+ unsigned int flags)
{
sugov_iowait_boost(sg_cpu, time, flags);
sg_cpu->last_update = time;
@@ -328,7 +322,7 @@ static inline bool sugov_update_single_common(struct sugov_cpu *sg_cpu,
return false;
sugov_get_util(sg_cpu);
- sugov_iowait_apply(sg_cpu, time);
+ sugov_iowait_apply(sg_cpu, time, max_cap);
return true;
}
@@ -339,17 +333,23 @@ static void sugov_update_single_freq(struct update_util_data *hook, u64 time,
struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
struct sugov_policy *sg_policy = sg_cpu->sg_policy;
unsigned int cached_freq = sg_policy->cached_raw_freq;
+ unsigned long max_cap;
unsigned int next_f;
- if (!sugov_update_single_common(sg_cpu, time, flags))
+ max_cap = arch_scale_cpu_capacity(sg_cpu->cpu);
+
+ if (!sugov_update_single_common(sg_cpu, time, max_cap, flags))
return;
- next_f = get_next_freq(sg_policy, sg_cpu->util, sg_cpu->max);
+ next_f = get_next_freq(sg_policy, sg_cpu->util, max_cap);
/*
* Do not reduce the frequency if the CPU has not been idle
* recently, as the reduction is likely to be premature then.
+ *
+ * Except when the rq is capped by uclamp_max.
*/
- if (sugov_cpu_is_busy(sg_cpu) && next_f < sg_policy->next_freq) {
+ if (!uclamp_rq_is_capped(cpu_rq(sg_cpu->cpu)) &&
+ sugov_cpu_is_busy(sg_cpu) && next_f < sg_policy->next_freq) {
next_f = sg_policy->next_freq;
/* Restore cached freq as next_freq has changed */
@@ -378,6 +378,7 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time,
{
struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
unsigned long prev_util = sg_cpu->util;
+ unsigned long max_cap;
/*
* Fall back to the "frequency" path if frequency invariance is not
@@ -389,18 +390,23 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time,
return;
}
- if (!sugov_update_single_common(sg_cpu, time, flags))
+ max_cap = arch_scale_cpu_capacity(sg_cpu->cpu);
+
+ if (!sugov_update_single_common(sg_cpu, time, max_cap, flags))
return;
/*
* Do not reduce the target performance level if the CPU has not been
* idle recently, as the reduction is likely to be premature then.
+ *
+ * Except when the rq is capped by uclamp_max.
*/
- if (sugov_cpu_is_busy(sg_cpu) && sg_cpu->util < prev_util)
+ if (!uclamp_rq_is_capped(cpu_rq(sg_cpu->cpu)) &&
+ sugov_cpu_is_busy(sg_cpu) && sg_cpu->util < prev_util)
sg_cpu->util = prev_util;
cpufreq_driver_adjust_perf(sg_cpu->cpu, map_util_perf(sg_cpu->bw_dl),
- map_util_perf(sg_cpu->util), sg_cpu->max);
+ map_util_perf(sg_cpu->util), max_cap);
sg_cpu->sg_policy->last_freq_update_time = time;
}
@@ -409,25 +415,21 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
{
struct sugov_policy *sg_policy = sg_cpu->sg_policy;
struct cpufreq_policy *policy = sg_policy->policy;
- unsigned long util = 0, max = 1;
+ unsigned long util = 0, max_cap;
unsigned int j;
+ max_cap = arch_scale_cpu_capacity(sg_cpu->cpu);
+
for_each_cpu(j, policy->cpus) {
struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j);
- unsigned long j_util, j_max;
sugov_get_util(j_sg_cpu);
- sugov_iowait_apply(j_sg_cpu, time);
- j_util = j_sg_cpu->util;
- j_max = j_sg_cpu->max;
+ sugov_iowait_apply(j_sg_cpu, time, max_cap);
- if (j_util * max > j_max * util) {
- util = j_util;
- max = j_max;
- }
+ util = max(j_sg_cpu->util, util);
}
- return get_next_freq(sg_policy, util, max);
+ return get_next_freq(sg_policy, util, max_cap);
}
static void
@@ -537,9 +539,17 @@ static struct attribute *sugov_attrs[] = {
};
ATTRIBUTE_GROUPS(sugov);
-static struct kobj_type sugov_tunables_ktype = {
+static void sugov_tunables_free(struct kobject *kobj)
+{
+ struct gov_attr_set *attr_set = to_gov_attr_set(kobj);
+
+ kfree(to_sugov_tunables(attr_set));
+}
+
+static const struct kobj_type sugov_tunables_ktype = {
.default_groups = sugov_groups,
.sysfs_ops = &governor_sysfs_ops,
+ .release = &sugov_tunables_free,
};
/********************** cpufreq governor interface *********************/
@@ -639,12 +649,10 @@ static struct sugov_tunables *sugov_tunables_alloc(struct sugov_policy *sg_polic
return tunables;
}
-static void sugov_tunables_free(struct sugov_tunables *tunables)
+static void sugov_clear_global_tunables(void)
{
if (!have_governor_per_policy())
global_tunables = NULL;
-
- kfree(tunables);
}
static int sugov_init(struct cpufreq_policy *policy)
@@ -707,7 +715,7 @@ out:
fail:
kobject_put(&tunables->attr_set.kobj);
policy->governor_data = NULL;
- sugov_tunables_free(tunables);
+ sugov_clear_global_tunables();
stop_kthread:
sugov_kthread_stop(sg_policy);
@@ -734,7 +742,7 @@ static void sugov_exit(struct cpufreq_policy *policy)
count = gov_attr_set_put(&tunables->attr_set, &sg_policy->tunables_hook);
policy->governor_data = NULL;
if (!count)
- sugov_tunables_free(tunables);
+ sugov_clear_global_tunables();
mutex_unlock(&global_tunables_lock);
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index d583f2aa744e..a286e726eb4b 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -22,7 +22,6 @@
* worst case complexity of O(min(101, nr_domcpus)), though the scenario that
* yields the worst case search is fairly contrived.
*/
-#include "sched.h"
/*
* p->rt_priority p->prio newpri cpupri
@@ -148,7 +147,7 @@ int cpupri_find_fitness(struct cpupri *cp, struct task_struct *p,
int task_pri = convert_prio(p->prio);
int idx, cpu;
- BUG_ON(task_pri >= CPUPRI_NR_PRIORITIES);
+ WARN_ON_ONCE(task_pri >= CPUPRI_NR_PRIORITIES);
for (idx = 0; idx < task_pri; idx++) {
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 872e481d5098..af7952f12e6c 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -2,7 +2,10 @@
/*
* Simple CPU accounting cgroup controller
*/
-#include "sched.h"
+
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+ #include <asm/cputime.h>
+#endif
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
@@ -148,10 +151,10 @@ void account_guest_time(struct task_struct *p, u64 cputime)
/* Add guest time to cpustat. */
if (task_nice(p) > 0) {
- cpustat[CPUTIME_NICE] += cputime;
+ task_group_account_field(p, CPUTIME_NICE, cputime);
cpustat[CPUTIME_GUEST_NICE] += cputime;
} else {
- cpustat[CPUTIME_USER] += cputime;
+ task_group_account_field(p, CPUTIME_USER, cputime);
cpustat[CPUTIME_GUEST] += cputime;
}
}
@@ -227,6 +230,21 @@ void account_idle_time(u64 cputime)
cpustat[CPUTIME_IDLE] += cputime;
}
+
+#ifdef CONFIG_SCHED_CORE
+/*
+ * Account for forceidle time due to core scheduling.
+ *
+ * REQUIRES: schedstat is enabled.
+ */
+void __account_forceidle_time(struct task_struct *p, u64 delta)
+{
+ __schedstat_add(p->stats.core_forceidle_sum, delta);
+
+ task_group_account_field(p, CPUTIME_FORCEIDLE, delta);
+}
+#endif
+
/*
* When a guest is interrupted for a longer amount of time, missed clock
* ticks are not redelivered later. Due to that, this function may on
@@ -615,7 +633,8 @@ void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
.sum_exec_runtime = p->se.sum_exec_runtime,
};
- task_cputime(p, &cputime.utime, &cputime.stime);
+ if (task_cputime(p, &cputime.utime, &cputime.stime))
+ cputime.sum_exec_runtime = task_sched_runtime(p);
cputime_adjust(&cputime, &p->prev_cputime, ut, st);
}
EXPORT_SYMBOL_GPL(task_cputime_adjusted);
@@ -828,19 +847,21 @@ u64 task_gtime(struct task_struct *t)
* add up the pending nohz execution time since the last
* cputime snapshot.
*/
-void task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
+bool task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
{
struct vtime *vtime = &t->vtime;
unsigned int seq;
u64 delta;
+ int ret;
if (!vtime_accounting_enabled()) {
*utime = t->utime;
*stime = t->stime;
- return;
+ return false;
}
do {
+ ret = false;
seq = read_seqcount_begin(&vtime->seqcount);
*utime = t->utime;
@@ -850,6 +871,7 @@ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
if (vtime->state < VTIME_SYS)
continue;
+ ret = true;
delta = vtime_delta(vtime);
/*
@@ -861,6 +883,8 @@ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
else
*utime += vtime->utime + delta;
} while (read_seqcount_retry(&vtime->seqcount, seq));
+
+ return ret;
}
static int vtime_state_fetch(struct vtime *vtime, int cpu)
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index aaacd6cfd42f..5a9a4b81c972 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -15,10 +15,42 @@
* Michael Trimarchi <michael@amarulasolutions.com>,
* Fabio Checconi <fchecconi@gmail.com>
*/
-#include "sched.h"
-#include "pelt.h"
-struct dl_bandwidth def_dl_bandwidth;
+/*
+ * Default limits for DL period; on the top end we guard against small util
+ * tasks still getting ridiculously long effective runtimes, on the bottom end we
+ * guard against timer DoS.
+ */
+static unsigned int sysctl_sched_dl_period_max = 1 << 22; /* ~4 seconds */
+static unsigned int sysctl_sched_dl_period_min = 100; /* 100 us */
+#ifdef CONFIG_SYSCTL
+static struct ctl_table sched_dl_sysctls[] = {
+ {
+ .procname = "sched_deadline_period_max_us",
+ .data = &sysctl_sched_dl_period_max,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_douintvec_minmax,
+ .extra1 = (void *)&sysctl_sched_dl_period_min,
+ },
+ {
+ .procname = "sched_deadline_period_min_us",
+ .data = &sysctl_sched_dl_period_min,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_douintvec_minmax,
+ .extra2 = (void *)&sysctl_sched_dl_period_max,
+ },
+ {}
+};
+
+static int __init sched_dl_sysctl_init(void)
+{
+ register_sysctl_init("kernel", sched_dl_sysctls);
+ return 0;
+}
+late_initcall(sched_dl_sysctl_init);
+#endif
static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se)
{
@@ -92,15 +124,12 @@ static inline int dl_bw_cpus(int i)
return cpus;
}
-static inline unsigned long __dl_bw_capacity(int i)
+static inline unsigned long __dl_bw_capacity(const struct cpumask *mask)
{
- struct root_domain *rd = cpu_rq(i)->rd;
unsigned long cap = 0;
+ int i;
- RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
- "sched RCU must be held");
-
- for_each_cpu_and(i, rd->span, cpu_active_mask)
+ for_each_cpu_and(i, mask, cpu_active_mask)
cap += capacity_orig_of(i);
return cap;
@@ -112,11 +141,14 @@ static inline unsigned long __dl_bw_capacity(int i)
*/
static inline unsigned long dl_bw_capacity(int i)
{
- if (!static_branch_unlikely(&sched_asym_cpucapacity) &&
+ if (!sched_asym_cpucap_active() &&
capacity_orig_of(i) == SCHED_CAPACITY_SCALE) {
return dl_bw_cpus(i) << SCHED_CAPACITY_SHIFT;
} else {
- return __dl_bw_capacity(i);
+ RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
+ "sched RCU must be held");
+
+ return __dl_bw_capacity(cpu_rq(i)->rd->span);
}
}
@@ -130,6 +162,21 @@ static inline bool dl_bw_visited(int cpu, u64 gen)
rd->visit_gen = gen;
return false;
}
+
+static inline
+void __dl_update(struct dl_bw *dl_b, s64 bw)
+{
+ struct root_domain *rd = container_of(dl_b, struct root_domain, dl_bw);
+ int i;
+
+ RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
+ "sched RCU must be held");
+ for_each_cpu_and(i, rd->span, cpu_active_mask) {
+ struct rq *rq = cpu_rq(i);
+
+ rq->dl.extra_bw += bw;
+ }
+}
#else
static inline struct dl_bw *dl_bw_of(int i)
{
@@ -150,9 +197,38 @@ static inline bool dl_bw_visited(int cpu, u64 gen)
{
return false;
}
+
+static inline
+void __dl_update(struct dl_bw *dl_b, s64 bw)
+{
+ struct dl_rq *dl = container_of(dl_b, struct dl_rq, dl_bw);
+
+ dl->extra_bw += bw;
+}
#endif
static inline
+void __dl_sub(struct dl_bw *dl_b, u64 tsk_bw, int cpus)
+{
+ dl_b->total_bw -= tsk_bw;
+ __dl_update(dl_b, (s32)tsk_bw / cpus);
+}
+
+static inline
+void __dl_add(struct dl_bw *dl_b, u64 tsk_bw, int cpus)
+{
+ dl_b->total_bw += tsk_bw;
+ __dl_update(dl_b, -((s32)tsk_bw / cpus));
+}
+
+static inline bool
+__dl_overflow(struct dl_bw *dl_b, unsigned long cap, u64 old_bw, u64 new_bw)
+{
+ return dl_b->bw != -1 &&
+ cap_scale(dl_b->bw, cap) < dl_b->total_bw - old_bw + new_bw;
+}
+
+static inline
void __add_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
{
u64 old = dl_rq->running_bw;
@@ -234,7 +310,7 @@ static void dl_change_utilization(struct task_struct *p, u64 new_bw)
{
struct rq *rq;
- BUG_ON(p->dl.flags & SCHED_FLAG_SUGOV);
+ WARN_ON_ONCE(p->dl.flags & SCHED_FLAG_SUGOV);
if (task_on_rq_queued(p))
return;
@@ -355,8 +431,8 @@ static void task_non_contending(struct task_struct *p)
sub_rq_bw(&p->dl, &rq->dl);
raw_spin_lock(&dl_b->lock);
__dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
- __dl_clear_params(p);
raw_spin_unlock(&dl_b->lock);
+ __dl_clear_params(p);
}
return;
@@ -408,7 +484,7 @@ static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq)
{
struct sched_dl_entity *dl_se = &p->dl;
- return dl_rq->root.rb_leftmost == &dl_se->rb_node;
+ return rb_first_cached(&dl_rq->root) == &dl_se->rb_node;
}
static void init_dl_rq_bw_ratio(struct dl_rq *dl_rq);
@@ -423,12 +499,10 @@ void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime)
void init_dl_bw(struct dl_bw *dl_b)
{
raw_spin_lock_init(&dl_b->lock);
- raw_spin_lock(&def_dl_bandwidth.dl_runtime_lock);
if (global_rt_runtime() == RUNTIME_INF)
dl_b->bw = -1;
else
dl_b->bw = to_ratio(global_rt_period(), global_rt_runtime());
- raw_spin_unlock(&def_dl_bandwidth.dl_runtime_lock);
dl_b->total_bw = 0;
}
@@ -533,7 +607,7 @@ static void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p)
{
struct rb_node *leftmost;
- BUG_ON(!RB_EMPTY_NODE(&p->pushable_dl_tasks));
+ WARN_ON_ONCE(!RB_EMPTY_NODE(&p->pushable_dl_tasks));
leftmost = rb_add_cached(&p->pushable_dl_tasks,
&rq->dl.pushable_dl_tasks_root,
@@ -570,8 +644,8 @@ static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev)
return rq->online && dl_task(prev);
}
-static DEFINE_PER_CPU(struct callback_head, dl_push_head);
-static DEFINE_PER_CPU(struct callback_head, dl_pull_head);
+static DEFINE_PER_CPU(struct balance_callback, dl_push_head);
+static DEFINE_PER_CPU(struct balance_callback, dl_pull_head);
static void push_dl_tasks(struct rq *);
static void pull_dl_task(struct rq *);
@@ -610,7 +684,7 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p
* Failed to find any suitable CPU.
* The task will never come back!
*/
- BUG_ON(dl_bandwidth_enabled());
+ WARN_ON_ONCE(dl_bandwidth_enabled());
/*
* If admission control is disabled we
@@ -683,15 +757,6 @@ void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
{
}
-static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev)
-{
- return false;
-}
-
-static inline void pull_dl_task(struct rq *rq)
-{
-}
-
static inline void deadline_queue_push_tasks(struct rq *rq)
{
}
@@ -705,6 +770,14 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags);
static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags);
static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, int flags);
+static inline void replenish_dl_new_period(struct sched_dl_entity *dl_se,
+ struct rq *rq)
+{
+ /* for non-boosted task, pi_of(dl_se) == dl_se */
+ dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline;
+ dl_se->runtime = pi_of(dl_se)->dl_runtime;
+}
+
/*
* We are being explicitly informed that a new instance is starting,
* and this means that:
@@ -738,8 +811,7 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se)
* future; in fact, we must consider execution overheads (time
* spent on hardirq context, etc.).
*/
- dl_se->deadline = rq_clock(rq) + dl_se->dl_deadline;
- dl_se->runtime = dl_se->dl_runtime;
+ replenish_dl_new_period(dl_se, rq);
}
/*
@@ -765,16 +837,14 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se)
struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
struct rq *rq = rq_of_dl_rq(dl_rq);
- BUG_ON(pi_of(dl_se)->dl_runtime <= 0);
+ WARN_ON_ONCE(pi_of(dl_se)->dl_runtime <= 0);
/*
* This could be the case for a !-dl task that is boosted.
* Just go with full inherited parameters.
*/
- if (dl_se->dl_deadline == 0) {
- dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline;
- dl_se->runtime = pi_of(dl_se)->dl_runtime;
- }
+ if (dl_se->dl_deadline == 0)
+ replenish_dl_new_period(dl_se, rq);
if (dl_se->dl_yielded && dl_se->runtime > 0)
dl_se->runtime = 0;
@@ -801,8 +871,7 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se)
*/
if (dl_time_before(dl_se->deadline, rq_clock(rq))) {
printk_deferred_once("sched: DL replenish lagged too much\n");
- dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline;
- dl_se->runtime = pi_of(dl_se)->dl_runtime;
+ replenish_dl_new_period(dl_se, rq);
}
if (dl_se->dl_yielded)
@@ -959,8 +1028,7 @@ static void update_dl_entity(struct sched_dl_entity *dl_se)
return;
}
- dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline;
- dl_se->runtime = pi_of(dl_se)->dl_runtime;
+ replenish_dl_new_period(dl_se, rq);
}
}
@@ -1191,8 +1259,6 @@ int dl_runtime_exceeded(struct sched_dl_entity *dl_se)
return (dl_se->runtime <= 0);
}
-extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
-
/*
* This function implements the GRUB accounting rule:
* according to the GRUB reclaiming algorithm, the runtime is
@@ -1265,14 +1331,12 @@ static void update_curr_dl(struct rq *rq)
return;
}
- schedstat_set(curr->se.statistics.exec_max,
- max(curr->se.statistics.exec_max, delta_exec));
+ schedstat_set(curr->stats.exec_max,
+ max(curr->stats.exec_max, delta_exec));
- curr->se.sum_exec_runtime += delta_exec;
- account_group_exec_runtime(curr, delta_exec);
+ trace_sched_stat_runtime(curr, delta_exec, 0);
- curr->se.exec_start = now;
- cgroup_account_cputime(curr, delta_exec);
+ update_current_exec_runtime(curr, now, delta_exec);
if (dl_entity_is_special(dl_se))
return;
@@ -1391,6 +1455,9 @@ void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se)
timer->function = inactive_task_timer;
}
+#define __node_2_dle(node) \
+ rb_entry((node), struct sched_dl_entity, rb_node)
+
#ifdef CONFIG_SMP
static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
@@ -1420,10 +1487,9 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
cpudl_clear(&rq->rd->cpudl, rq->cpu);
cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr);
} else {
- struct rb_node *leftmost = dl_rq->root.rb_leftmost;
- struct sched_dl_entity *entry;
+ struct rb_node *leftmost = rb_first_cached(&dl_rq->root);
+ struct sched_dl_entity *entry = __node_2_dle(leftmost);
- entry = rb_entry(leftmost, struct sched_dl_entity, rb_node);
dl_rq->earliest_dl.curr = entry->deadline;
cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline);
}
@@ -1464,19 +1530,92 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
dec_dl_migration(dl_se, dl_rq);
}
-#define __node_2_dle(node) \
- rb_entry((node), struct sched_dl_entity, rb_node)
-
static inline bool __dl_less(struct rb_node *a, const struct rb_node *b)
{
return dl_time_before(__node_2_dle(a)->deadline, __node_2_dle(b)->deadline);
}
+static inline struct sched_statistics *
+__schedstats_from_dl_se(struct sched_dl_entity *dl_se)
+{
+ return &dl_task_of(dl_se)->stats;
+}
+
+static inline void
+update_stats_wait_start_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se)
+{
+ struct sched_statistics *stats;
+
+ if (!schedstat_enabled())
+ return;
+
+ stats = __schedstats_from_dl_se(dl_se);
+ __update_stats_wait_start(rq_of_dl_rq(dl_rq), dl_task_of(dl_se), stats);
+}
+
+static inline void
+update_stats_wait_end_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se)
+{
+ struct sched_statistics *stats;
+
+ if (!schedstat_enabled())
+ return;
+
+ stats = __schedstats_from_dl_se(dl_se);
+ __update_stats_wait_end(rq_of_dl_rq(dl_rq), dl_task_of(dl_se), stats);
+}
+
+static inline void
+update_stats_enqueue_sleeper_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se)
+{
+ struct sched_statistics *stats;
+
+ if (!schedstat_enabled())
+ return;
+
+ stats = __schedstats_from_dl_se(dl_se);
+ __update_stats_enqueue_sleeper(rq_of_dl_rq(dl_rq), dl_task_of(dl_se), stats);
+}
+
+static inline void
+update_stats_enqueue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se,
+ int flags)
+{
+ if (!schedstat_enabled())
+ return;
+
+ if (flags & ENQUEUE_WAKEUP)
+ update_stats_enqueue_sleeper_dl(dl_rq, dl_se);
+}
+
+static inline void
+update_stats_dequeue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se,
+ int flags)
+{
+ struct task_struct *p = dl_task_of(dl_se);
+
+ if (!schedstat_enabled())
+ return;
+
+ if ((flags & DEQUEUE_SLEEP)) {
+ unsigned int state;
+
+ state = READ_ONCE(p->__state);
+ if (state & TASK_INTERRUPTIBLE)
+ __schedstat_set(p->stats.sleep_start,
+ rq_clock(rq_of_dl_rq(dl_rq)));
+
+ if (state & TASK_UNINTERRUPTIBLE)
+ __schedstat_set(p->stats.block_start,
+ rq_clock(rq_of_dl_rq(dl_rq)));
+ }
+}
+
static void __enqueue_dl_entity(struct sched_dl_entity *dl_se)
{
struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
- BUG_ON(!RB_EMPTY_NODE(&dl_se->rb_node));
+ WARN_ON_ONCE(!RB_EMPTY_NODE(&dl_se->rb_node));
rb_add_cached(&dl_se->rb_node, &dl_rq->root, __dl_less);
@@ -1500,7 +1639,9 @@ static void __dequeue_dl_entity(struct sched_dl_entity *dl_se)
static void
enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags)
{
- BUG_ON(on_dl_rq(dl_se));
+ WARN_ON_ONCE(on_dl_rq(dl_se));
+
+ update_stats_enqueue_dl(dl_rq_of_se(dl_se), dl_se, flags);
/*
* If this is a wakeup or a new instance, the scheduling
@@ -1561,7 +1702,10 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
* the throttle.
*/
p->dl.dl_throttled = 0;
- BUG_ON(!is_dl_boosted(&p->dl) || flags != ENQUEUE_REPLENISH);
+ if (!(flags & ENQUEUE_REPLENISH))
+ printk_deferred_once("sched: DL de-boosted task PID %d: REPLENISH flag missing\n",
+ task_pid_nr(p));
+
return;
}
@@ -1598,6 +1742,9 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
return;
}
+ check_schedstat_required();
+ update_stats_wait_start_dl(dl_rq_of_se(&p->dl), &p->dl);
+
enqueue_dl_entity(&p->dl, flags);
if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
@@ -1606,6 +1753,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
{
+ update_stats_dequeue_dl(&rq->dl, &p->dl, flags);
dequeue_dl_entity(&p->dl);
dequeue_pushable_dl_task(rq, p);
}
@@ -1665,6 +1813,14 @@ static void yield_task_dl(struct rq *rq)
#ifdef CONFIG_SMP
+static inline bool dl_task_is_earliest_deadline(struct task_struct *p,
+ struct rq *rq)
+{
+ return (!rq->dl.dl_nr_running ||
+ dl_time_before(p->dl.deadline,
+ rq->dl.earliest_dl.curr));
+}
+
static int find_later_rq(struct task_struct *task);
static int
@@ -1700,16 +1856,14 @@ select_task_rq_dl(struct task_struct *p, int cpu, int flags)
* Take the capacity of the CPU into account to
* ensure it fits the requirement of the task.
*/
- if (static_branch_unlikely(&sched_asym_cpucapacity))
+ if (sched_asym_cpucap_active())
select_rq |= !dl_task_fits_capacity(p, cpu);
if (select_rq) {
int target = find_later_rq(p);
if (target != -1 &&
- (dl_time_before(p->dl.deadline,
- cpu_rq(target)->dl.earliest_dl.curr) ||
- (cpu_rq(target)->dl.dl_nr_running == 0)))
+ dl_task_is_earliest_deadline(p, cpu_rq(target)))
cpu = target;
}
rcu_read_unlock();
@@ -1720,6 +1874,7 @@ out:
static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused)
{
+ struct rq_flags rf;
struct rq *rq;
if (READ_ONCE(p->__state) != TASK_WAKING)
@@ -1731,8 +1886,9 @@ static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused
* from try_to_wake_up(). Hence, p->pi_lock is locked, but
* rq->lock is not... So, lock it
*/
- raw_spin_rq_lock(rq);
+ rq_lock(rq, &rf);
if (p->dl.dl_non_contending) {
+ update_rq_clock(rq);
sub_running_bw(&p->dl, &rq->dl);
p->dl.dl_non_contending = 0;
/*
@@ -1746,7 +1902,7 @@ static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused
put_task_struct(p);
}
sub_rq_bw(&p->dl, &rq->dl);
- raw_spin_rq_unlock(rq);
+ rq_unlock(rq, &rf);
}
static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
@@ -1824,7 +1980,12 @@ static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
static void set_next_task_dl(struct rq *rq, struct task_struct *p, bool first)
{
+ struct sched_dl_entity *dl_se = &p->dl;
+ struct dl_rq *dl_rq = &rq->dl;
+
p->se.exec_start = rq_clock_task(rq);
+ if (on_dl_rq(&p->dl))
+ update_stats_wait_end_dl(dl_rq, dl_se);
/* You can't push away the running task */
dequeue_pushable_dl_task(rq, p);
@@ -1841,15 +2002,14 @@ static void set_next_task_dl(struct rq *rq, struct task_struct *p, bool first)
deadline_queue_push_tasks(rq);
}
-static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
- struct dl_rq *dl_rq)
+static struct sched_dl_entity *pick_next_dl_entity(struct dl_rq *dl_rq)
{
struct rb_node *left = rb_first_cached(&dl_rq->root);
if (!left)
return NULL;
- return rb_entry(left, struct sched_dl_entity, rb_node);
+ return __node_2_dle(left);
}
static struct task_struct *pick_task_dl(struct rq *rq)
@@ -1861,8 +2021,8 @@ static struct task_struct *pick_task_dl(struct rq *rq)
if (!sched_dl_runnable(rq))
return NULL;
- dl_se = pick_next_dl_entity(rq, dl_rq);
- BUG_ON(!dl_se);
+ dl_se = pick_next_dl_entity(dl_rq);
+ WARN_ON_ONCE(!dl_se);
p = dl_task_of(dl_se);
return p;
@@ -1881,6 +2041,12 @@ static struct task_struct *pick_next_task_dl(struct rq *rq)
static void put_prev_task_dl(struct rq *rq, struct task_struct *p)
{
+ struct sched_dl_entity *dl_se = &p->dl;
+ struct dl_rq *dl_rq = &rq->dl;
+
+ if (on_dl_rq(&p->dl))
+ update_stats_wait_start_dl(dl_rq, dl_se);
+
update_curr_dl(rq);
update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 1);
@@ -1926,7 +2092,7 @@ static void task_fork_dl(struct task_struct *p)
static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu)
{
- if (!task_running(rq, p) &&
+ if (!task_on_cpu(rq, p) &&
cpumask_test_cpu(cpu, &p->cpus_mask))
return 1;
return 0;
@@ -1938,15 +2104,17 @@ static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu)
*/
static struct task_struct *pick_earliest_pushable_dl_task(struct rq *rq, int cpu)
{
- struct rb_node *next_node = rq->dl.pushable_dl_tasks_root.rb_leftmost;
struct task_struct *p = NULL;
+ struct rb_node *next_node;
if (!has_pushable_dl_tasks(rq))
return NULL;
+ next_node = rb_first_cached(&rq->dl.pushable_dl_tasks_root);
+
next_node:
if (next_node) {
- p = rb_entry(next_node, struct task_struct, pushable_dl_tasks);
+ p = __node_2_pdl(next_node);
if (pick_dl_task(rq, p, cpu))
return p;
@@ -2062,9 +2230,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
later_rq = cpu_rq(cpu);
- if (later_rq->dl.dl_nr_running &&
- !dl_time_before(task->dl.deadline,
- later_rq->dl.earliest_dl.curr)) {
+ if (!dl_task_is_earliest_deadline(task, later_rq)) {
/*
* Target rq has tasks of equal or earlier deadline,
* retrying does not release any lock and is unlikely
@@ -2078,8 +2244,9 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
if (double_lock_balance(rq, later_rq)) {
if (unlikely(task_rq(task) != rq ||
!cpumask_test_cpu(later_rq->cpu, &task->cpus_mask) ||
- task_running(rq, task) ||
+ task_on_cpu(rq, task) ||
!dl_task(task) ||
+ is_migration_disabled(task) ||
!task_on_rq_queued(task))) {
double_unlock_balance(rq, later_rq);
later_rq = NULL;
@@ -2092,9 +2259,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
* its earliest one has a later deadline than our
* task, the rq is a good one.
*/
- if (!later_rq->dl.dl_nr_running ||
- dl_time_before(task->dl.deadline,
- later_rq->dl.earliest_dl.curr))
+ if (dl_task_is_earliest_deadline(task, later_rq))
break;
/* Otherwise we try again. */
@@ -2112,15 +2277,14 @@ static struct task_struct *pick_next_pushable_dl_task(struct rq *rq)
if (!has_pushable_dl_tasks(rq))
return NULL;
- p = rb_entry(rq->dl.pushable_dl_tasks_root.rb_leftmost,
- struct task_struct, pushable_dl_tasks);
+ p = __node_2_pdl(rb_first_cached(&rq->dl.pushable_dl_tasks_root));
- BUG_ON(rq->cpu != task_cpu(p));
- BUG_ON(task_current(rq, p));
- BUG_ON(p->nr_cpus_allowed <= 1);
+ WARN_ON_ONCE(rq->cpu != task_cpu(p));
+ WARN_ON_ONCE(task_current(rq, p));
+ WARN_ON_ONCE(p->nr_cpus_allowed <= 1);
- BUG_ON(!task_on_rq_queued(p));
- BUG_ON(!dl_task(p));
+ WARN_ON_ONCE(!task_on_rq_queued(p));
+ WARN_ON_ONCE(!dl_task(p));
return p;
}
@@ -2144,12 +2308,6 @@ static int push_dl_task(struct rq *rq)
return 0;
retry:
- if (is_migration_disabled(next_task))
- return 0;
-
- if (WARN_ON(next_task == rq->curr))
- return 0;
-
/*
* If next_task preempts rq->curr, and rq->curr
* can move away, it makes sense to just reschedule
@@ -2162,6 +2320,12 @@ retry:
return 0;
}
+ if (is_migration_disabled(next_task))
+ return 0;
+
+ if (WARN_ON(next_task == rq->curr))
+ return 0;
+
/* We might release rq lock */
get_task_struct(next_task);
@@ -2195,13 +2359,7 @@ retry:
deactivate_task(rq, next_task, 0);
set_task_cpu(next_task, later_rq->cpu);
-
- /*
- * Update the later_rq clock here, because the clock is used
- * by the cpufreq_update_util() inside __add_running_bw().
- */
- update_rq_clock(later_rq);
- activate_task(later_rq, next_task, ENQUEUE_NOCLOCK);
+ activate_task(later_rq, next_task, 0);
ret = 1;
resched_curr(later_rq);
@@ -2272,9 +2430,7 @@ static void pull_dl_task(struct rq *this_rq)
* - it will preempt the last one we pulled (if any).
*/
if (p && dl_time_before(p->dl.deadline, dmin) &&
- (!this_rq->dl.dl_nr_running ||
- dl_time_before(p->dl.deadline,
- this_rq->dl.earliest_dl.curr))) {
+ dl_task_is_earliest_deadline(p, this_rq)) {
WARN_ON(p == src_rq->curr);
WARN_ON(!task_on_rq_queued(p));
@@ -2319,7 +2475,7 @@ skip:
*/
static void task_woken_dl(struct rq *rq, struct task_struct *p)
{
- if (!task_running(rq, p) &&
+ if (!task_on_cpu(rq, p) &&
!test_tsk_need_resched(rq->curr) &&
p->nr_cpus_allowed > 1 &&
dl_task(rq->curr) &&
@@ -2330,13 +2486,12 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p)
}
static void set_cpus_allowed_dl(struct task_struct *p,
- const struct cpumask *new_mask,
- u32 flags)
+ struct affinity_context *ctx)
{
struct root_domain *src_rd;
struct rq *rq;
- BUG_ON(!dl_task(p));
+ WARN_ON_ONCE(!dl_task(p));
rq = task_rq(p);
src_rd = rq->rd;
@@ -2346,7 +2501,7 @@ static void set_cpus_allowed_dl(struct task_struct *p,
* update. We already made space for us in the destination
* domain (see cpuset_can_attach()).
*/
- if (!cpumask_intersects(src_rd->span, new_mask)) {
+ if (!cpumask_intersects(src_rd->span, ctx->new_mask)) {
struct dl_bw *src_dl_b;
src_dl_b = dl_bw_of(cpu_of(rq));
@@ -2360,7 +2515,7 @@ static void set_cpus_allowed_dl(struct task_struct *p,
raw_spin_unlock(&src_dl_b->lock);
}
- set_cpus_allowed_common(p, new_mask, flags);
+ set_cpus_allowed_common(p, ctx);
}
/* Assumes rq->lock is held */
@@ -2509,17 +2664,20 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
static void prio_changed_dl(struct rq *rq, struct task_struct *p,
int oldprio)
{
- if (task_on_rq_queued(p) || task_current(rq, p)) {
+ if (!task_on_rq_queued(p))
+ return;
+
#ifdef CONFIG_SMP
- /*
- * This might be too much, but unfortunately
- * we don't have the old deadline value, and
- * we can't argue if the task is increasing
- * or lowering its prio, so...
- */
- if (!rq->dl.overloaded)
- deadline_queue_pull_task(rq);
+ /*
+ * This might be too much, but unfortunately
+ * we don't have the old deadline value, and
+ * we can't argue if the task is increasing
+ * or lowering its prio, so...
+ */
+ if (!rq->dl.overloaded)
+ deadline_queue_pull_task(rq);
+ if (task_current(rq, p)) {
/*
* If we now have a earlier deadline task than p,
* then reschedule, provided p is still on this
@@ -2527,17 +2685,33 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p,
*/
if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline))
resched_curr(rq);
-#else
+ } else {
/*
- * Again, we don't know if p has a earlier
- * or later deadline, so let's blindly set a
- * (maybe not needed) rescheduling point.
+ * Current may not be deadline in case p was throttled but we
+ * have just replenished it (e.g. rt_mutex_setprio()).
+ *
+ * Otherwise, if p was given an earlier deadline, reschedule.
*/
- resched_curr(rq);
-#endif /* CONFIG_SMP */
+ if (!dl_task(rq->curr) ||
+ dl_time_before(p->dl.deadline, rq->curr->dl.deadline))
+ resched_curr(rq);
}
+#else
+ /*
+ * We don't know if p has a earlier or later deadline, so let's blindly
+ * set a (maybe not needed) rescheduling point.
+ */
+ resched_curr(rq);
+#endif
}
+#ifdef CONFIG_SCHED_CORE
+static int task_is_throttled_dl(struct task_struct *p, int cpu)
+{
+ return p->dl.dl_throttled;
+}
+#endif
+
DEFINE_SCHED_CLASS(dl) = {
.enqueue_task = enqueue_task_dl,
@@ -2570,6 +2744,9 @@ DEFINE_SCHED_CLASS(dl) = {
.switched_to = switched_to_dl,
.update_curr = update_curr_dl,
+#ifdef CONFIG_SCHED_CORE
+ .task_is_throttled = task_is_throttled_dl,
+#endif
};
/* Used for dl_bw check and update, used under sched_rt_handler()::mutex */
@@ -2635,9 +2812,6 @@ void sched_dl_do_global(void)
int cpu;
unsigned long flags;
- def_dl_bandwidth.dl_period = global_rt_period();
- def_dl_bandwidth.dl_runtime = global_rt_runtime();
-
if (global_rt_runtime() != RUNTIME_INF)
new_bw = to_ratio(global_rt_period(), global_rt_runtime());
@@ -2741,7 +2915,7 @@ void __setparam_dl(struct task_struct *p, const struct sched_attr *attr)
dl_se->dl_runtime = attr->sched_runtime;
dl_se->dl_deadline = attr->sched_deadline;
dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline;
- dl_se->flags = attr->sched_flags;
+ dl_se->flags = attr->sched_flags & SCHED_DL_FLAGS;
dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
dl_se->dl_density = to_ratio(dl_se->dl_deadline, dl_se->dl_runtime);
}
@@ -2754,18 +2928,11 @@ void __getparam_dl(struct task_struct *p, struct sched_attr *attr)
attr->sched_runtime = dl_se->dl_runtime;
attr->sched_deadline = dl_se->dl_deadline;
attr->sched_period = dl_se->dl_period;
- attr->sched_flags = dl_se->flags;
+ attr->sched_flags &= ~SCHED_DL_FLAGS;
+ attr->sched_flags |= dl_se->flags;
}
/*
- * Default limits for DL period; on the top end we guard against small util
- * tasks still getting ridiculously long effective runtimes, on the bottom end we
- * guard against timer DoS.
- */
-unsigned int sysctl_sched_dl_period_max = 1 << 22; /* ~4 seconds */
-unsigned int sysctl_sched_dl_period_min = 100; /* 100 us */
-
-/*
* This function validates the new parameters of a -deadline task.
* We ask for the deadline not being zero, and greater or equal
* than the runtime, as well as the period of being zero or
@@ -2851,62 +3018,25 @@ bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr)
if (dl_se->dl_runtime != attr->sched_runtime ||
dl_se->dl_deadline != attr->sched_deadline ||
dl_se->dl_period != attr->sched_period ||
- dl_se->flags != attr->sched_flags)
+ dl_se->flags != (attr->sched_flags & SCHED_DL_FLAGS))
return true;
return false;
}
#ifdef CONFIG_SMP
-int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed)
-{
- unsigned long flags, cap;
- unsigned int dest_cpu;
- struct dl_bw *dl_b;
- bool overflow;
- int ret;
-
- dest_cpu = cpumask_any_and(cpu_active_mask, cs_cpus_allowed);
-
- rcu_read_lock_sched();
- dl_b = dl_bw_of(dest_cpu);
- raw_spin_lock_irqsave(&dl_b->lock, flags);
- cap = dl_bw_capacity(dest_cpu);
- overflow = __dl_overflow(dl_b, cap, 0, p->dl.dl_bw);
- if (overflow) {
- ret = -EBUSY;
- } else {
- /*
- * We reserve space for this task in the destination
- * root_domain, as we can't fail after this point.
- * We will free resources in the source root_domain
- * later on (see set_cpus_allowed_dl()).
- */
- int cpus = dl_bw_cpus(dest_cpu);
-
- __dl_add(dl_b, p->dl.dl_bw, cpus);
- ret = 0;
- }
- raw_spin_unlock_irqrestore(&dl_b->lock, flags);
- rcu_read_unlock_sched();
-
- return ret;
-}
-
int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur,
const struct cpumask *trial)
{
- int ret = 1, trial_cpus;
+ unsigned long flags, cap;
struct dl_bw *cur_dl_b;
- unsigned long flags;
+ int ret = 1;
rcu_read_lock_sched();
cur_dl_b = dl_bw_of(cpumask_any(cur));
- trial_cpus = cpumask_weight(trial);
-
+ cap = __dl_bw_capacity(trial);
raw_spin_lock_irqsave(&cur_dl_b->lock, flags);
- if (cur_dl_b->bw != -1 &&
- cur_dl_b->bw * trial_cpus < cur_dl_b->total_bw)
+ if (__dl_overflow(cur_dl_b, cap, 0, 0))
ret = 0;
raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags);
rcu_read_unlock_sched();
@@ -2914,7 +3044,7 @@ int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur,
return ret;
}
-bool dl_cpu_busy(unsigned int cpu)
+int dl_cpu_busy(int cpu, struct task_struct *p)
{
unsigned long flags, cap;
struct dl_bw *dl_b;
@@ -2924,11 +3054,22 @@ bool dl_cpu_busy(unsigned int cpu)
dl_b = dl_bw_of(cpu);
raw_spin_lock_irqsave(&dl_b->lock, flags);
cap = dl_bw_capacity(cpu);
- overflow = __dl_overflow(dl_b, cap, 0, 0);
+ overflow = __dl_overflow(dl_b, cap, 0, p ? p->dl.dl_bw : 0);
+
+ if (!overflow && p) {
+ /*
+ * We reserve space for this task in the destination
+ * root_domain, as we can't fail after this point.
+ * We will free resources in the source root_domain
+ * later on (see set_cpus_allowed_dl()).
+ */
+ __dl_add(dl_b, p->dl.dl_bw, dl_bw_cpus(cpu));
+ }
+
raw_spin_unlock_irqrestore(&dl_b->lock, flags);
rcu_read_unlock_sched();
- return overflow;
+ return overflow ? -EBUSY : 0;
}
#endif
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 0c5ec2776ddf..0b2340a79b65 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -6,7 +6,6 @@
*
* Copyright(C) 2007, Red Hat, Inc., Ingo Molnar
*/
-#include "sched.h"
/*
* This allows printing both to /proc/sched_debug and
@@ -173,16 +172,22 @@ static ssize_t sched_scaling_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
char buf[16];
+ unsigned int scaling;
if (cnt > 15)
cnt = 15;
if (copy_from_user(&buf, ubuf, cnt))
return -EFAULT;
+ buf[cnt] = '\0';
- if (kstrtouint(buf, 10, &sysctl_sched_tunable_scaling))
+ if (kstrtouint(buf, 10, &scaling))
return -EINVAL;
+ if (scaling >= SCHED_TUNABLESCALING_END)
+ return -EINVAL;
+
+ sysctl_sched_tunable_scaling = scaling;
if (sched_update_scaling())
return -EINVAL;
@@ -275,6 +280,45 @@ static const struct file_operations sched_dynamic_fops = {
__read_mostly bool sched_debug_verbose;
+#ifdef CONFIG_SMP
+static struct dentry *sd_dentry;
+
+
+static ssize_t sched_verbose_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ ssize_t result;
+ bool orig;
+
+ cpus_read_lock();
+ mutex_lock(&sched_domains_mutex);
+
+ orig = sched_debug_verbose;
+ result = debugfs_write_file_bool(filp, ubuf, cnt, ppos);
+
+ if (sched_debug_verbose && !orig)
+ update_sched_domain_debugfs();
+ else if (!sched_debug_verbose && orig) {
+ debugfs_remove(sd_dentry);
+ sd_dentry = NULL;
+ }
+
+ mutex_unlock(&sched_domains_mutex);
+ cpus_read_unlock();
+
+ return result;
+}
+#else
+#define sched_verbose_write debugfs_write_file_bool
+#endif
+
+static const struct file_operations sched_verbose_fops = {
+ .read = debugfs_read_file_bool,
+ .write = sched_verbose_write,
+ .open = simple_open,
+ .llseek = default_llseek,
+};
+
static const struct seq_operations sched_debug_sops;
static int sched_debug_open(struct inode *inode, struct file *filp)
@@ -298,13 +342,14 @@ static __init int sched_init_debug(void)
debugfs_sched = debugfs_create_dir("sched", NULL);
debugfs_create_file("features", 0644, debugfs_sched, NULL, &sched_feat_fops);
- debugfs_create_bool("verbose", 0644, debugfs_sched, &sched_debug_verbose);
+ debugfs_create_file_unsafe("verbose", 0644, debugfs_sched, &sched_debug_verbose, &sched_verbose_fops);
#ifdef CONFIG_PREEMPT_DYNAMIC
debugfs_create_file("preempt", 0644, debugfs_sched, NULL, &sched_dynamic_fops);
#endif
debugfs_create_u32("latency_ns", 0644, debugfs_sched, &sysctl_sched_latency);
debugfs_create_u32("min_granularity_ns", 0644, debugfs_sched, &sysctl_sched_min_granularity);
+ debugfs_create_u32("idle_min_granularity_ns", 0644, debugfs_sched, &sysctl_sched_idle_min_granularity);
debugfs_create_u32("wakeup_granularity_ns", 0644, debugfs_sched, &sysctl_sched_wakeup_granularity);
debugfs_create_u32("latency_warn_ms", 0644, debugfs_sched, &sysctl_resched_latency_warn_ms);
@@ -327,6 +372,7 @@ static __init int sched_init_debug(void)
debugfs_create_u32("scan_period_min_ms", 0644, numa, &sysctl_numa_balancing_scan_period_min);
debugfs_create_u32("scan_period_max_ms", 0644, numa, &sysctl_numa_balancing_scan_period_max);
debugfs_create_u32("scan_size_mb", 0644, numa, &sysctl_numa_balancing_scan_size);
+ debugfs_create_u32("hot_threshold_ms", 0644, numa, &sysctl_numa_balancing_hot_threshold);
#endif
debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops);
@@ -338,7 +384,6 @@ late_initcall(sched_init_debug);
#ifdef CONFIG_SMP
static cpumask_var_t sd_sysctl_cpus;
-static struct dentry *sd_dentry;
static int sd_flags_show(struct seq_file *m, void *v)
{
@@ -388,22 +433,37 @@ void update_sched_domain_debugfs(void)
{
int cpu, i;
+ /*
+ * This can unfortunately be invoked before sched_debug_init() creates
+ * the debug directory. Don't touch sd_sysctl_cpus until then.
+ */
+ if (!debugfs_sched)
+ return;
+
+ if (!sched_debug_verbose)
+ return;
+
if (!cpumask_available(sd_sysctl_cpus)) {
if (!alloc_cpumask_var(&sd_sysctl_cpus, GFP_KERNEL))
return;
cpumask_copy(sd_sysctl_cpus, cpu_possible_mask);
}
- if (!sd_dentry)
+ if (!sd_dentry) {
sd_dentry = debugfs_create_dir("domains", debugfs_sched);
+ /* rebuild sd_sysctl_cpus if empty since it gets cleared below */
+ if (cpumask_empty(sd_sysctl_cpus))
+ cpumask_copy(sd_sysctl_cpus, cpu_online_mask);
+ }
+
for_each_cpu(cpu, sd_sysctl_cpus) {
struct sched_domain *sd;
struct dentry *d_cpu;
char buf[32];
snprintf(buf, sizeof(buf), "cpu%d", cpu);
- debugfs_remove(debugfs_lookup(buf, sd_dentry));
+ debugfs_lookup_and_remove(buf, sd_dentry);
d_cpu = debugfs_create_dir(buf, sd_dentry);
i = 0;
@@ -435,9 +495,11 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
struct sched_entity *se = tg->se[cpu];
#define P(F) SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F)
-#define P_SCHEDSTAT(F) SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)schedstat_val(F))
+#define P_SCHEDSTAT(F) SEQ_printf(m, " .%-30s: %lld\n", \
+ #F, (long long)schedstat_val(stats->F))
#define PN(F) SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))
-#define PN_SCHEDSTAT(F) SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(F)))
+#define PN_SCHEDSTAT(F) SEQ_printf(m, " .%-30s: %lld.%06ld\n", \
+ #F, SPLIT_NS((long long)schedstat_val(stats->F)))
if (!se)
return;
@@ -447,16 +509,19 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
PN(se->sum_exec_runtime);
if (schedstat_enabled()) {
- PN_SCHEDSTAT(se->statistics.wait_start);
- PN_SCHEDSTAT(se->statistics.sleep_start);
- PN_SCHEDSTAT(se->statistics.block_start);
- PN_SCHEDSTAT(se->statistics.sleep_max);
- PN_SCHEDSTAT(se->statistics.block_max);
- PN_SCHEDSTAT(se->statistics.exec_max);
- PN_SCHEDSTAT(se->statistics.slice_max);
- PN_SCHEDSTAT(se->statistics.wait_max);
- PN_SCHEDSTAT(se->statistics.wait_sum);
- P_SCHEDSTAT(se->statistics.wait_count);
+ struct sched_statistics *stats;
+ stats = __schedstats_from_se(se);
+
+ PN_SCHEDSTAT(wait_start);
+ PN_SCHEDSTAT(sleep_start);
+ PN_SCHEDSTAT(block_start);
+ PN_SCHEDSTAT(sleep_max);
+ PN_SCHEDSTAT(block_max);
+ PN_SCHEDSTAT(exec_max);
+ PN_SCHEDSTAT(slice_max);
+ PN_SCHEDSTAT(wait_max);
+ PN_SCHEDSTAT(wait_sum);
+ P_SCHEDSTAT(wait_count);
}
P(se->load.weight);
@@ -522,10 +587,11 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
(long long)(p->nvcsw + p->nivcsw),
p->prio);
- SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
- SPLIT_NS(schedstat_val_or_zero(p->se.statistics.wait_sum)),
+ SEQ_printf(m, "%9lld.%06ld %9lld.%06ld %9lld.%06ld %9lld.%06ld",
+ SPLIT_NS(schedstat_val_or_zero(p->stats.wait_sum)),
SPLIT_NS(p->se.sum_exec_runtime),
- SPLIT_NS(schedstat_val_or_zero(p->se.statistics.sum_sleep_runtime)));
+ SPLIT_NS(schedstat_val_or_zero(p->stats.sum_sleep_runtime)),
+ SPLIT_NS(schedstat_val_or_zero(p->stats.sum_block_runtime)));
#ifdef CONFIG_NUMA_BALANCING
SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p));
@@ -600,6 +666,11 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over",
cfs_rq->nr_spread_over);
SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
+ SEQ_printf(m, " .%-30s: %d\n", "h_nr_running", cfs_rq->h_nr_running);
+ SEQ_printf(m, " .%-30s: %d\n", "idle_nr_running",
+ cfs_rq->idle_nr_running);
+ SEQ_printf(m, " .%-30s: %d\n", "idle_h_nr_running",
+ cfs_rq->idle_h_nr_running);
SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
#ifdef CONFIG_SMP
SEQ_printf(m, " .%-30s: %lu\n", "load_avg",
@@ -794,6 +865,7 @@ static void sched_debug_header(struct seq_file *m)
SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
PN(sysctl_sched_latency);
PN(sysctl_sched_min_granularity);
+ PN(sysctl_sched_idle_min_granularity);
PN(sysctl_sched_wakeup_granularity);
P(sysctl_sched_child_runs_first);
P(sysctl_sched_features);
@@ -905,25 +977,15 @@ void print_numa_stats(struct seq_file *m, int node, unsigned long tsf,
static void sched_show_numa(struct task_struct *p, struct seq_file *m)
{
#ifdef CONFIG_NUMA_BALANCING
- struct mempolicy *pol;
-
if (p->mm)
P(mm->numa_scan_seq);
- task_lock(p);
- pol = p->mempolicy;
- if (pol && !(pol->flags & MPOL_F_MORON))
- pol = NULL;
- mpol_get(pol);
- task_unlock(p);
-
P(numa_pages_migrated);
P(numa_preferred_nid);
P(total_numa_faults);
SEQ_printf(m, "current_node=%d, numa_group_id=%d\n",
task_node(p), task_numa_group_id(p));
show_numa_stats(p, m);
- mpol_put(pol);
#endif
}
@@ -938,8 +1000,8 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
"---------------------------------------------------------"
"----------\n");
-#define P_SCHEDSTAT(F) __PS(#F, schedstat_val(p->F))
-#define PN_SCHEDSTAT(F) __PSN(#F, schedstat_val(p->F))
+#define P_SCHEDSTAT(F) __PS(#F, schedstat_val(p->stats.F))
+#define PN_SCHEDSTAT(F) __PSN(#F, schedstat_val(p->stats.F))
PN(se.exec_start);
PN(se.vruntime);
@@ -952,33 +1014,34 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
if (schedstat_enabled()) {
u64 avg_atom, avg_per_cpu;
- PN_SCHEDSTAT(se.statistics.sum_sleep_runtime);
- PN_SCHEDSTAT(se.statistics.wait_start);
- PN_SCHEDSTAT(se.statistics.sleep_start);
- PN_SCHEDSTAT(se.statistics.block_start);
- PN_SCHEDSTAT(se.statistics.sleep_max);
- PN_SCHEDSTAT(se.statistics.block_max);
- PN_SCHEDSTAT(se.statistics.exec_max);
- PN_SCHEDSTAT(se.statistics.slice_max);
- PN_SCHEDSTAT(se.statistics.wait_max);
- PN_SCHEDSTAT(se.statistics.wait_sum);
- P_SCHEDSTAT(se.statistics.wait_count);
- PN_SCHEDSTAT(se.statistics.iowait_sum);
- P_SCHEDSTAT(se.statistics.iowait_count);
- P_SCHEDSTAT(se.statistics.nr_migrations_cold);
- P_SCHEDSTAT(se.statistics.nr_failed_migrations_affine);
- P_SCHEDSTAT(se.statistics.nr_failed_migrations_running);
- P_SCHEDSTAT(se.statistics.nr_failed_migrations_hot);
- P_SCHEDSTAT(se.statistics.nr_forced_migrations);
- P_SCHEDSTAT(se.statistics.nr_wakeups);
- P_SCHEDSTAT(se.statistics.nr_wakeups_sync);
- P_SCHEDSTAT(se.statistics.nr_wakeups_migrate);
- P_SCHEDSTAT(se.statistics.nr_wakeups_local);
- P_SCHEDSTAT(se.statistics.nr_wakeups_remote);
- P_SCHEDSTAT(se.statistics.nr_wakeups_affine);
- P_SCHEDSTAT(se.statistics.nr_wakeups_affine_attempts);
- P_SCHEDSTAT(se.statistics.nr_wakeups_passive);
- P_SCHEDSTAT(se.statistics.nr_wakeups_idle);
+ PN_SCHEDSTAT(sum_sleep_runtime);
+ PN_SCHEDSTAT(sum_block_runtime);
+ PN_SCHEDSTAT(wait_start);
+ PN_SCHEDSTAT(sleep_start);
+ PN_SCHEDSTAT(block_start);
+ PN_SCHEDSTAT(sleep_max);
+ PN_SCHEDSTAT(block_max);
+ PN_SCHEDSTAT(exec_max);
+ PN_SCHEDSTAT(slice_max);
+ PN_SCHEDSTAT(wait_max);
+ PN_SCHEDSTAT(wait_sum);
+ P_SCHEDSTAT(wait_count);
+ PN_SCHEDSTAT(iowait_sum);
+ P_SCHEDSTAT(iowait_count);
+ P_SCHEDSTAT(nr_migrations_cold);
+ P_SCHEDSTAT(nr_failed_migrations_affine);
+ P_SCHEDSTAT(nr_failed_migrations_running);
+ P_SCHEDSTAT(nr_failed_migrations_hot);
+ P_SCHEDSTAT(nr_forced_migrations);
+ P_SCHEDSTAT(nr_wakeups);
+ P_SCHEDSTAT(nr_wakeups_sync);
+ P_SCHEDSTAT(nr_wakeups_migrate);
+ P_SCHEDSTAT(nr_wakeups_local);
+ P_SCHEDSTAT(nr_wakeups_remote);
+ P_SCHEDSTAT(nr_wakeups_affine);
+ P_SCHEDSTAT(nr_wakeups_affine_attempts);
+ P_SCHEDSTAT(nr_wakeups_passive);
+ P_SCHEDSTAT(nr_wakeups_idle);
avg_atom = p->se.sum_exec_runtime;
if (nr_switches)
@@ -996,6 +1059,10 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
__PN(avg_atom);
__PN(avg_per_cpu);
+
+#ifdef CONFIG_SCHED_CORE
+ PN_SCHEDSTAT(core_forceidle_sum);
+#endif
}
__P(nr_switches);
@@ -1044,7 +1111,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
void proc_sched_set_task(struct task_struct *p)
{
#ifdef CONFIG_SCHEDSTATS
- memset(&p->se.statistics, 0, sizeof(p->se.statistics));
+ memset(&p->stats, 0, sizeof(p->stats));
#endif
}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 44c452072a1b..373ff5f55884 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c