diff options
author | Alexei Starovoitov <ast@kernel.org> | 2023-07-25 17:04:27 -0700 |
---|---|---|
committer | Alexei Starovoitov <ast@kernel.org> | 2023-07-25 17:06:49 -0700 |
commit | aa89592fcb3af8372bd39ff468fdd65477f57201 (patch) | |
tree | f695504d7bc7d67cacba1c1617b2c263b6f06c5b | |
parent | 284779dbf4e98753458708783af8c35630674a21 (diff) | |
parent | d62cc390c2e99ae267ffe4b8d7e2e08b6c758c32 (diff) |
Merge branch 'bpf-disable-preemption-in-perf_event_output-helpers-code'
Jiri Olsa says:
====================
bpf: Disable preemption in perf_event_output helpers code
hi,
we got report of kernel crash [1][3] within bpf_event_output helper.
The reason is the nesting protection code in bpf_event_output that expects
disabled preemption, which is not guaranteed for programs executed by
bpf_prog_run_array_cg.
I managed to reproduce on tracing side where we have the same problem
in bpf_perf_event_output. The reproducer [2] just creates busy uprobe
and call bpf_perf_event_output helper a lot.
v3 changes:
- added acks and fixed 'Fixes' tag style [Hou Tao]
- added Closes tag to patch 2
v2 changes:
- I changed 'Fixes' commits to where I saw we switched from preempt_disable
to migrate_disable, but I'm not completely sure about the patch 2, because
it was tricky to find, would be nice if somebody could check on that
thanks,
jirka
[1] https://github.com/cilium/cilium/issues/26756
[2] https://git.kernel.org/pub/scm/linux/kernel/git/jolsa/perf.git/commit/?h=bpf_output_fix_reproducer&id=8054dcc634121b884c7c331329d61d93351d03b5
[3] slack:
[66194.378161] BUG: kernel NULL pointer dereference, address: 0000000000000001
[66194.378324] #PF: supervisor instruction fetch in kernel mode
[66194.378447] #PF: error_code(0x0010) - not-present page
...
[66194.378692] Oops: 0010 [#1] PREEMPT SMP NOPTI
...
[66194.380666] <TASK>
[66194.380775] ? perf_output_sample+0x12a/0x9a0
[66194.380902] ? finish_task_switch.isra.0+0x81/0x280
[66194.381024] ? perf_event_output+0x66/0xa0
[66194.381148] ? bpf_event_output+0x13a/0x190
[66194.381270] ? bpf_event_output_data+0x22/0x40
[66194.381391] ? bpf_prog_dfc84bbde731b257_cil_sock4_connect+0x40a/0xacb
[66194.381519] ? xa_load+0x87/0xe0
[66194.381635] ? __cgroup_bpf_run_filter_sock_addr+0xc1/0x1a0
[66194.381759] ? release_sock+0x3e/0x90
[66194.381876] ? sk_setsockopt+0x1a1/0x12f0
[66194.381996] ? udp_pre_connect+0x36/0x50
[66194.382114] ? inet_dgram_connect+0x93/0xa0
[66194.382233] ? __sys_connect+0xb4/0xe0
[66194.382353] ? udp_setsockopt+0x27/0x40
[66194.382470] ? __pfx_udp_push_pending_frames+0x10/0x10
[66194.382593] ? __sys_setsockopt+0xdf/0x1a0
[66194.382713] ? __x64_sys_connect+0xf/0x20
[66194.382832] ? do_syscall_64+0x3a/0x90
[66194.382949] ? entry_SYSCALL_64_after_hwframe+0x72/0xdc
[66194.383077] </TASK>
---
====================
Link: https://lore.kernel.org/r/20230725084206.580930-1-jolsa@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
-rw-r--r-- | kernel/trace/bpf_trace.c | 17 |
1 files changed, 12 insertions, 5 deletions
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 5f2dcabad202..bd1a42b23f3f 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -661,8 +661,7 @@ static DEFINE_PER_CPU(int, bpf_trace_nest_level); BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map, u64, flags, void *, data, u64, size) { - struct bpf_trace_sample_data *sds = this_cpu_ptr(&bpf_trace_sds); - int nest_level = this_cpu_inc_return(bpf_trace_nest_level); + struct bpf_trace_sample_data *sds; struct perf_raw_record raw = { .frag = { .size = size, @@ -670,7 +669,11 @@ BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map, }, }; struct perf_sample_data *sd; - int err; + int nest_level, err; + + preempt_disable(); + sds = this_cpu_ptr(&bpf_trace_sds); + nest_level = this_cpu_inc_return(bpf_trace_nest_level); if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(sds->sds))) { err = -EBUSY; @@ -688,9 +691,9 @@ BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map, perf_sample_save_raw_data(sd, &raw); err = __bpf_perf_event_output(regs, map, flags, sd); - out: this_cpu_dec(bpf_trace_nest_level); + preempt_enable(); return err; } @@ -715,7 +718,6 @@ static DEFINE_PER_CPU(struct bpf_trace_sample_data, bpf_misc_sds); u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy) { - int nest_level = this_cpu_inc_return(bpf_event_output_nest_level); struct perf_raw_frag frag = { .copy = ctx_copy, .size = ctx_size, @@ -732,8 +734,12 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, }; struct perf_sample_data *sd; struct pt_regs *regs; + int nest_level; u64 ret; + preempt_disable(); + nest_level = this_cpu_inc_return(bpf_event_output_nest_level); + if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(bpf_misc_sds.sds))) { ret = -EBUSY; goto out; @@ -748,6 +754,7 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, ret = __bpf_perf_event_output(regs, map, flags, sd); out: this_cpu_dec(bpf_event_output_nest_level); + preempt_enable(); return ret; } |