From eb166e522c77699fc19bfa705652327a1e51a117 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 15 Mar 2024 11:48:54 -0700 Subject: bpf: Allow helper bpf_get_[ns_]current_pid_tgid() for all prog types Currently bpf_get_current_pid_tgid() is allowed in tracing, cgroup and sk_msg progs while bpf_get_ns_current_pid_tgid() is only allowed in tracing progs. We have an internal use case where, for an application running in a container (with a pid namespace), the user wants to get the pid associated with the pid namespace in a cgroup bpf program. Currently, cgroup bpf progs already allow bpf_get_current_pid_tgid(). Let us allow bpf_get_ns_current_pid_tgid() as well. Auditing the code shows that bpf_get_current_pid_tgid() is also used by sk_msg progs. But there is no side effect in exposing these two helpers to all prog types since they do not reveal any kernel specific data. The detailed discussion is in [1]. So with this patch, both bpf_get_current_pid_tgid() and bpf_get_ns_current_pid_tgid() are put in bpf_base_func_proto(), making them available to all program types. 
[1] https://lore.kernel.org/bpf/20240307232659.1115872-1-yonghong.song@linux.dev/ Signed-off-by: Yonghong Song Signed-off-by: Andrii Nakryiko Acked-by: Jiri Olsa Link: https://lore.kernel.org/bpf/20240315184854.2975190-1-yonghong.song@linux.dev --- kernel/trace/bpf_trace.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 0a5c4efc73c3..1b041911b1d8 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1525,8 +1525,6 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_ktime_get_boot_ns_proto; case BPF_FUNC_tail_call: return &bpf_tail_call_proto; - case BPF_FUNC_get_current_pid_tgid: - return &bpf_get_current_pid_tgid_proto; case BPF_FUNC_get_current_task: return &bpf_get_current_task_proto; case BPF_FUNC_get_current_task_btf: @@ -1582,8 +1580,6 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_send_signal_thread_proto; case BPF_FUNC_perf_event_read_value: return &bpf_perf_event_read_value_proto; - case BPF_FUNC_get_ns_current_pid_tgid: - return &bpf_get_ns_current_pid_tgid_proto; case BPF_FUNC_ringbuf_output: return &bpf_ringbuf_output_proto; case BPF_FUNC_ringbuf_reserve: -- cgit From 6b9c2950c912780ce113079c9c52041b1e2a611a Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 19 Mar 2024 16:38:48 -0700 Subject: bpf: flatten bpf_probe_register call chain bpf_probe_register() and __bpf_probe_register() have identical signatures and bpf_probe_register() just redirects to __bpf_probe_register(). So get rid of this extra function call step to simplify following the source code. It makes no difference at runtime due to inlining, of course. 
Acked-by: Stanislav Fomichev Signed-off-by: Andrii Nakryiko Message-ID: <20240319233852.1977493-2-andrii@kernel.org> Signed-off-by: Alexei Starovoitov --- kernel/trace/bpf_trace.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 1b041911b1d8..30ecf62f8a17 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -2425,7 +2425,7 @@ BPF_TRACE_DEFN_x(10); BPF_TRACE_DEFN_x(11); BPF_TRACE_DEFN_x(12); -static int __bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog) +int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog) { struct tracepoint *tp = btp->tp; @@ -2443,11 +2443,6 @@ static int __bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog * prog); } -int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog) -{ - return __bpf_probe_register(btp, prog); -} - int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog) { return tracepoint_probe_unregister(btp->tp, (void *)btp->bpf_func, prog); -- cgit From d4dfc5700e867b22ab94f960f9a9972696a637d5 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 19 Mar 2024 16:38:49 -0700 Subject: bpf: pass whole link instead of prog when triggering raw tracepoint Instead of passing prog as an argument to bpf_trace_runX() helpers, that are called from tracepoint triggering calls, store BPF link itself (struct bpf_raw_tp_link for raw tracepoints). This will allow passing extra information like BPF cookie into raw tracepoint registration. Instead of replacing `struct bpf_prog *prog = __data;` with corresponding `struct bpf_raw_tp_link *link = __data;` assignment in `__bpf_trace_##call` I just passed `__data` through into underlying bpf_trace_runX() call. This works well because we implicitly cast `void *`, and it also avoids naming clashes with arguments coming from tracepoint's "proto" list. 
We could have run into the same problem with "prog", we just happened to not have a tracepoint that has "prog" input argument. We are less lucky with "link", as there are tracepoints using "link" argument name already. So instead of trying to avoid naming conflicts, let's just remove intermediate local variable. It doesn't hurt readability, it's either way a bit of a maze of calls and macros, that requires careful reading. Acked-by: Stanislav Fomichev Signed-off-by: Andrii Nakryiko Message-ID: <20240319233852.1977493-3-andrii@kernel.org> Signed-off-by: Alexei Starovoitov --- kernel/trace/bpf_trace.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 30ecf62f8a17..17de91ad4a1f 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -2366,8 +2366,10 @@ void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp) } static __always_inline -void __bpf_trace_run(struct bpf_prog *prog, u64 *args) +void __bpf_trace_run(struct bpf_raw_tp_link *link, u64 *args) { + struct bpf_prog *prog = link->link.prog; + cant_sleep(); if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) { bpf_prog_inc_misses_counter(prog); @@ -2404,12 +2406,12 @@ out: #define __SEQ_0_11 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 #define BPF_TRACE_DEFN_x(x) \ - void bpf_trace_run##x(struct bpf_prog *prog, \ + void bpf_trace_run##x(struct bpf_raw_tp_link *link, \ REPEAT(x, SARG, __DL_COM, __SEQ_0_11)) \ { \ u64 args[x]; \ REPEAT(x, COPY, __DL_SEM, __SEQ_0_11); \ - __bpf_trace_run(prog, args); \ + __bpf_trace_run(link, args); \ } \ EXPORT_SYMBOL_GPL(bpf_trace_run##x) BPF_TRACE_DEFN_x(1); @@ -2425,9 +2427,10 @@ BPF_TRACE_DEFN_x(10); BPF_TRACE_DEFN_x(11); BPF_TRACE_DEFN_x(12); -int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog) +int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_raw_tp_link *link) { struct tracepoint *tp = btp->tp; + struct 
bpf_prog *prog = link->link.prog; /* * check that program doesn't access arguments beyond what's @@ -2439,13 +2442,12 @@ int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog) if (prog->aux->max_tp_access > btp->writable_size) return -EINVAL; - return tracepoint_probe_register_may_exist(tp, (void *)btp->bpf_func, - prog); + return tracepoint_probe_register_may_exist(tp, (void *)btp->bpf_func, link); } -int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog) +int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_raw_tp_link *link) { - return tracepoint_probe_unregister(btp->tp, (void *)btp->bpf_func, prog); + return tracepoint_probe_unregister(btp->tp, (void *)btp->bpf_func, link); } int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id, -- cgit From 68ca5d4eebb8c4de246ee5f634eee26bc689562d Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 19 Mar 2024 16:38:50 -0700 Subject: bpf: support BPF cookie in raw tracepoint (raw_tp, tp_btf) programs Wire up BPF cookie for raw tracepoint programs (both BTF and non-BTF aware variants). This brings them up to par w.r.t. BPF cookie usage with classic tracepoint and fentry/fexit programs. 
Acked-by: Stanislav Fomichev Acked-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Message-ID: <20240319233852.1977493-4-andrii@kernel.org> Signed-off-by: Alexei Starovoitov --- kernel/trace/bpf_trace.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'kernel/trace') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 17de91ad4a1f..434e3ece6688 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -2004,6 +2004,8 @@ raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_stackid_proto_raw_tp; case BPF_FUNC_get_stack: return &bpf_get_stack_proto_raw_tp; + case BPF_FUNC_get_attach_cookie: + return &bpf_get_attach_cookie_proto_tracing; default: return bpf_tracing_func_proto(func_id, prog); } @@ -2066,6 +2068,9 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_get_func_arg_cnt: return bpf_prog_has_trampoline(prog) ? &bpf_get_func_arg_cnt_proto : NULL; case BPF_FUNC_get_attach_cookie: + if (prog->type == BPF_PROG_TYPE_TRACING && + prog->expected_attach_type == BPF_TRACE_RAW_TP) + return &bpf_get_attach_cookie_proto_tracing; return bpf_prog_has_trampoline(prog) ? 
&bpf_get_attach_cookie_proto_tracing : NULL; default: fn = raw_tp_prog_func_proto(func_id, prog); @@ -2369,15 +2374,23 @@ static __always_inline void __bpf_trace_run(struct bpf_raw_tp_link *link, u64 *args) { struct bpf_prog *prog = link->link.prog; + struct bpf_run_ctx *old_run_ctx; + struct bpf_trace_run_ctx run_ctx; cant_sleep(); if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) { bpf_prog_inc_misses_counter(prog); goto out; } + + run_ctx.bpf_cookie = link->cookie; + old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); + rcu_read_lock(); (void) bpf_prog_run(prog, args); rcu_read_unlock(); + + bpf_reset_run_ctx(old_run_ctx); out: this_cpu_dec(*(prog->active)); } -- cgit From a8497506cd2c0fc90a64f6f5d2744a0ddb2c81eb Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 19 Mar 2024 14:20:13 -0700 Subject: bpf: Avoid get_kernel_nofault() to fetch kprobe entry IP get_kernel_nofault() (or, rather, underlying copy_from_kernel_nofault()) is not free and it does pop up in performance profiles when kprobes are heavily utilized with CONFIG_X86_KERNEL_IBT=y config. Let's avoid using it if we know that fentry_ip - 4 can't cross page boundary. We do that by masking lowest 12 bits and checking if they are >= 4. Another benefit (and actually what caused a closer look at this part of code) is that now LBR record is (typically) not wasted on copy_from_kernel_nofault() call and code, which helps tools like retsnoop that grab LBR records from inside BPF code in kretprobes. 
Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Jiri Olsa Acked-by: Masami Hiramatsu (Google) Link: https://lore.kernel.org/bpf/20240319212013.1046779-1-andrii@kernel.org --- kernel/trace/bpf_trace.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 434e3ece6688..6d0c95638e1b 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1053,9 +1053,15 @@ static unsigned long get_entry_ip(unsigned long fentry_ip) { u32 instr; - /* Being extra safe in here in case entry ip is on the page-edge. */ - if (get_kernel_nofault(instr, (u32 *) fentry_ip - 1)) - return fentry_ip; + /* We want to be extra safe in case entry ip is on the page edge, + * but otherwise we need to avoid get_kernel_nofault()'s overhead. + */ + if ((fentry_ip & ~PAGE_MASK) < ENDBR_INSN_SIZE) { + if (get_kernel_nofault(instr, (u32 *)(fentry_ip - ENDBR_INSN_SIZE))) + return fentry_ip; + } else { + instr = *(u32 *)(fentry_ip - ENDBR_INSN_SIZE); + } if (is_endbr(instr)) fentry_ip -= ENDBR_INSN_SIZE; return fentry_ip; -- cgit