| author    | Alexei Starovoitov <ast@kernel.org> | 2025-11-24 09:45:27 -0800 |
| committer | Alexei Starovoitov <ast@kernel.org> | 2025-11-24 09:47:11 -0800 |
| commit    | acf8726466a6254617ade092bb3eded8c04947f7 (patch) | |
| tree      | bcbde53baf8efb5649e1d71a225c3dca036fc05e /kernel | |
| parent    | fad804002ef3cae8ca0509849d0d9539be069095 (diff) | |
| parent    | 402e44b31e9d8cb082d85870ee0d0ad54f97c311 (diff) | |
Merge branch 'bpf-trampoline-support-jmp-mode'
Menglong Dong says:
====================
bpf trampoline support "jmp" mode
Currently, the bpf trampoline is entered via a "call" instruction. However,
this breaks the RSB (return stack buffer) and introduces extra overhead on
the x86_64 arch.
For example, if we hook the function "foo" with fexit, the call and return
flow looks like this:
call foo -> call trampoline -> call foo-body ->
return foo-body -> return foo
As we can see above, there are 3 calls but only 2 returns, which breaks the
RSB balance. We could insert a fake "return" here, but that is not the best
choice, as it would still cause one RSB miss:
call foo -> call trampoline -> call foo-body ->
return foo-body -> return dummy -> return foo
The "return dummy" doesn't pair the "call trampoline", which can also
cause the RSB miss.
Therefore, we introduce a "jmp" mode for the bpf trampoline, as suggested
by Alexei in [1]. The flow then becomes:
call foo -> jmp trampoline -> call foo-body ->
return foo-body -> return foo
As we can see above, the RSB is fully balanced after this series.
In this series, we introduce FTRACE_OPS_FL_JMP to make ftrace use the
"jmp" instruction instead of "call", and we adjust bpf_arch_text_poke() to
allow callers to specify both the old and the new poke type.
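With the adjusted bpf_arch_text_poke(), the caller describes both the
current and the desired state of the patch site. A minimal sketch of the
call shape (the signature and the BPF_MOD_* values are taken from the diff
below; "ip", "old_image" and "new_image" are illustrative placeholders):

  /* Switch a patched site from "call" to "jmp" mode, pointing it at a
   * new trampoline image:
   */
  err = bpf_arch_text_poke(ip, BPF_MOD_CALL, BPF_MOD_JUMP,
                           old_image, new_image);
  /* Install a fresh "jmp" where the site is currently a nop: */
  err = bpf_arch_text_poke(ip, BPF_MOD_NOP, BPF_MOD_JUMP,
                           NULL, new_image);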
For the BPF_TRAMP_F_SHARE_IPMODIFY case, we fall back to the "call" mode,
as the trampoline then needs to get the function address from the stack,
which is not supported in "jmp" mode.
Before this series, we had the following performance with the bpf
benchmark:
$ cd tools/testing/selftests/bpf
$ ./benchs/run_bench_trigger.sh
usermode-count : 890.171 ± 1.522M/s
kernel-count : 409.184 ± 0.330M/s
syscall-count : 26.792 ± 0.010M/s
fentry : 171.242 ± 0.322M/s
fexit : 80.544 ± 0.045M/s
fmodret : 78.301 ± 0.065M/s
rawtp : 192.906 ± 0.900M/s
tp : 81.883 ± 0.209M/s
kprobe : 52.029 ± 0.113M/s
kprobe-multi : 62.237 ± 0.060M/s
kprobe-multi-all: 4.761 ± 0.014M/s
kretprobe : 23.779 ± 0.046M/s
kretprobe-multi: 29.134 ± 0.012M/s
kretprobe-multi-all: 3.822 ± 0.003M/s
And after this series, we have the following performance:
usermode-count : 890.443 ± 0.307M/s
kernel-count : 416.139 ± 0.055M/s
syscall-count : 31.037 ± 0.813M/s
fentry : 169.549 ± 0.519M/s
fexit : 136.540 ± 0.518M/s
fmodret : 159.248 ± 0.188M/s
rawtp : 194.475 ± 0.144M/s
tp : 84.505 ± 0.041M/s
kprobe : 59.951 ± 0.071M/s
kprobe-multi : 63.153 ± 0.177M/s
kprobe-multi-all: 4.699 ± 0.012M/s
kretprobe : 23.740 ± 0.015M/s
kretprobe-multi: 29.301 ± 0.022M/s
kretprobe-multi-all: 3.869 ± 0.005M/s
As we can see above, the fexit performance increases from 80.544M/s to
136.540M/s, and fmodret increases from 78.301M/s to 159.248M/s.
Link: https://lore.kernel.org/bpf/20251117034906.32036-1-dongml2@chinatelecom.cn/
Changes since v2:
* reject the addr if it is already marked as "jmp" in
register_ftrace_direct() and __modify_ftrace_direct(), in the 1st patch.
* fix a compile error on powerpc in the 5th patch.
* changes in the 6th patch:
- fix the compile error by wrapping the write to tr->fops->flags in
CONFIG_DYNAMIC_FTRACE_WITH_JMP
- reset BPF_TRAMP_F_SKIP_FRAME on the second try of modify_fentry() in
bpf_trampoline_update()
Link: https://lore.kernel.org/bpf/20251114092450.172024-1-dongml2@chinatelecom.cn/
Changes since v1:
* change the bool parameter added to save_args() to a "u32 flags"
* rename bpf_trampoline_need_jmp() to bpf_trampoline_use_jmp()
* add a new function parameter to bpf_arch_text_poke() instead of
introducing bpf_arch_text_poke_type()
* rename bpf_text_poke to bpf_trampoline_update_fentry
* remove BPF_TRAMP_F_JMPED and check the current mode against the
original flags instead.
[1] https://lore.kernel.org/bpf/CAADnVQLX54sVi1oaHrkSiLqjJaJdm3TQjoVrgU-LZimK6iDcSA@mail.gmail.com/
====================
Acked-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Link: https://patch.msgid.link/20251118123639.688444-1-dongml2@chinatelecom.cn
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Diffstat (limited to 'kernel')
| -rw-r--r-- | kernel/bpf/core.c       |  5 |
| -rw-r--r-- | kernel/bpf/trampoline.c | 79 |
| -rw-r--r-- | kernel/trace/Kconfig    | 12 |
| -rw-r--r-- | kernel/trace/ftrace.c   | 17 |
4 files changed, 95 insertions, 18 deletions
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index ef4448f18aad..c8ae6ab31651 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -3150,8 +3150,9 @@ int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to,
 	return -EFAULT;
 }
 
-int __weak bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
-			      void *addr1, void *addr2)
+int __weak bpf_arch_text_poke(void *ip, enum bpf_text_poke_type old_t,
+			      enum bpf_text_poke_type new_t, void *old_addr,
+			      void *new_addr)
 {
 	return -ENOTSUPP;
 }
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index 04104397c432..976d89011b15 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -175,23 +175,42 @@ out:
 	return tr;
 }
 
-static int unregister_fentry(struct bpf_trampoline *tr, void *old_addr)
+static int bpf_trampoline_update_fentry(struct bpf_trampoline *tr, u32 orig_flags,
+					void *old_addr, void *new_addr)
 {
+	enum bpf_text_poke_type new_t = BPF_MOD_CALL, old_t = BPF_MOD_CALL;
 	void *ip = tr->func.addr;
+
+	if (!new_addr)
+		new_t = BPF_MOD_NOP;
+	else if (bpf_trampoline_use_jmp(tr->flags))
+		new_t = BPF_MOD_JUMP;
+
+	if (!old_addr)
+		old_t = BPF_MOD_NOP;
+	else if (bpf_trampoline_use_jmp(orig_flags))
+		old_t = BPF_MOD_JUMP;
+
+	return bpf_arch_text_poke(ip, old_t, new_t, old_addr, new_addr);
+}
+
+static int unregister_fentry(struct bpf_trampoline *tr, u32 orig_flags,
+			     void *old_addr)
+{
 	int ret;
 
 	if (tr->func.ftrace_managed)
 		ret = unregister_ftrace_direct(tr->fops, (long)old_addr, false);
 	else
-		ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, NULL);
+		ret = bpf_trampoline_update_fentry(tr, orig_flags, old_addr, NULL);
 
 	return ret;
 }
 
-static int modify_fentry(struct bpf_trampoline *tr, void *old_addr, void *new_addr,
+static int modify_fentry(struct bpf_trampoline *tr, u32 orig_flags,
+			 void *old_addr, void *new_addr,
 			 bool lock_direct_mutex)
 {
-	void *ip = tr->func.addr;
 	int ret;
 
 	if (tr->func.ftrace_managed) {
@@ -200,7 +219,8 @@ static int modify_fentry(struct bpf_trampoline *tr, void *old_addr, void *new_ad
 		else
 			ret = modify_ftrace_direct_nolock(tr->fops, (long)new_addr);
 	} else {
-		ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, new_addr);
+		ret = bpf_trampoline_update_fentry(tr, orig_flags, old_addr,
+						   new_addr);
 	}
 	return ret;
 }
@@ -225,7 +245,7 @@ static int register_fentry(struct bpf_trampoline *tr, void *new_addr)
 			return ret;
 		ret = register_ftrace_direct(tr->fops, (long)new_addr);
 	} else {
-		ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, NULL, new_addr);
+		ret = bpf_trampoline_update_fentry(tr, 0, NULL, new_addr);
 	}
 
 	return ret;
@@ -336,8 +356,9 @@ static void bpf_tramp_image_put(struct bpf_tramp_image *im)
 	 * call_rcu_tasks() is not necessary.
 	 */
 	if (im->ip_after_call) {
-		int err = bpf_arch_text_poke(im->ip_after_call, BPF_MOD_JUMP,
-					     NULL, im->ip_epilogue);
+		int err = bpf_arch_text_poke(im->ip_after_call, BPF_MOD_NOP,
+					     BPF_MOD_JUMP, NULL,
+					     im->ip_epilogue);
 		WARN_ON(err);
 		if (IS_ENABLED(CONFIG_TASKS_RCU))
 			call_rcu_tasks(&im->rcu, __bpf_tramp_image_put_rcu_tasks);
@@ -410,7 +431,7 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mut
 		return PTR_ERR(tlinks);
 
 	if (total == 0) {
-		err = unregister_fentry(tr, tr->cur_image->image);
+		err = unregister_fentry(tr, orig_flags, tr->cur_image->image);
 		bpf_tramp_image_put(tr->cur_image);
 		tr->cur_image = NULL;
 		goto out;
@@ -434,9 +455,20 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mut
 
 #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
 again:
-	if ((tr->flags & BPF_TRAMP_F_SHARE_IPMODIFY) &&
-	    (tr->flags & BPF_TRAMP_F_CALL_ORIG))
-		tr->flags |= BPF_TRAMP_F_ORIG_STACK;
+	if (tr->flags & BPF_TRAMP_F_CALL_ORIG) {
+		if (tr->flags & BPF_TRAMP_F_SHARE_IPMODIFY) {
+			/* The BPF_TRAMP_F_SKIP_FRAME can be cleared in the
+			 * first try, reset it in the second try.
+			 */
+			tr->flags |= BPF_TRAMP_F_ORIG_STACK | BPF_TRAMP_F_SKIP_FRAME;
+		} else if (IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_JMP)) {
+			/* Use "jmp" instead of "call" for the trampoline
+			 * in the origin call case, and we don't need to
+			 * skip the frame.
+			 */
+			tr->flags &= ~BPF_TRAMP_F_SKIP_FRAME;
+		}
+	}
 #endif
 
 	size = arch_bpf_trampoline_size(&tr->func.model, tr->flags,
@@ -467,10 +499,18 @@ again:
 	if (err)
 		goto out_free;
 
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_JMP
+	if (bpf_trampoline_use_jmp(tr->flags))
+		tr->fops->flags |= FTRACE_OPS_FL_JMP;
+	else
+		tr->fops->flags &= ~FTRACE_OPS_FL_JMP;
+#endif
+
 	WARN_ON(tr->cur_image && total == 0);
 	if (tr->cur_image)
 		/* progs already running at this address */
-		err = modify_fentry(tr, tr->cur_image->image, im->image, lock_direct_mutex);
+		err = modify_fentry(tr, orig_flags, tr->cur_image->image,
+				    im->image, lock_direct_mutex);
 	else
 		/* first time registering */
 		err = register_fentry(tr, im->image);
@@ -493,8 +533,15 @@ again:
 	tr->cur_image = im;
 out:
 	/* If any error happens, restore previous flags */
-	if (err)
+	if (err) {
 		tr->flags = orig_flags;
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_JMP
+		if (bpf_trampoline_use_jmp(tr->flags))
+			tr->fops->flags |= FTRACE_OPS_FL_JMP;
+		else
+			tr->fops->flags &= ~FTRACE_OPS_FL_JMP;
+#endif
+	}
 	kfree(tlinks);
 	return err;
@@ -570,7 +617,8 @@ static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link,
 		if (err)
 			return err;
 		tr->extension_prog = link->link.prog;
-		return bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP, NULL,
+		return bpf_arch_text_poke(tr->func.addr, BPF_MOD_NOP,
+					  BPF_MOD_JUMP, NULL,
 					  link->link.prog->bpf_func);
 	}
 	if (cnt >= BPF_MAX_TRAMP_LINKS)
@@ -618,6 +666,7 @@ static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link,
 	if (kind == BPF_TRAMP_REPLACE) {
 		WARN_ON_ONCE(!tr->extension_prog);
 		err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP,
+					 BPF_MOD_NOP,
 					 tr->extension_prog->bpf_func, NULL);
 		tr->extension_prog = NULL;
 		guard(mutex)(&tgt_prog->aux->ext_mutex);
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index d2c79da81e4f..4661b9e606e0 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -80,6 +80,12 @@ config HAVE_DYNAMIC_FTRACE_NO_PATCHABLE
 	  If the architecture generates __patchable_function_entries sections
 	  but does not want them included in the ftrace locations.
 
+config HAVE_DYNAMIC_FTRACE_WITH_JMP
+	bool
+	help
+	  If the architecture supports to replace the __fentry__ with a
+	  "jmp" instruction.
+
 config HAVE_SYSCALL_TRACEPOINTS
 	bool
 	help
@@ -330,6 +336,12 @@ config DYNAMIC_FTRACE_WITH_ARGS
 	depends on DYNAMIC_FTRACE
 	depends on HAVE_DYNAMIC_FTRACE_WITH_ARGS
 
+config DYNAMIC_FTRACE_WITH_JMP
+	def_bool y
+	depends on DYNAMIC_FTRACE
+	depends on DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+	depends on HAVE_DYNAMIC_FTRACE_WITH_JMP
+
 config FPROBE
 	bool "Kernel Function Probe (fprobe)"
 	depends on HAVE_FUNCTION_GRAPH_FREGS && HAVE_FTRACE_GRAPH_FUNC
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 59cfacb8a5bb..bbb37c0f8c6c 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -5951,7 +5951,8 @@ static void remove_direct_functions_hash(struct ftrace_hash *hash, unsigned long
 	for (i = 0; i < size; i++) {
 		hlist_for_each_entry(entry, &hash->buckets[i], hlist) {
 			del = __ftrace_lookup_ip(direct_functions, entry->ip);
-			if (del && del->direct == addr) {
+			if (del && ftrace_jmp_get(del->direct) ==
+				   ftrace_jmp_get(addr)) {
 				remove_hash_entry(direct_functions, del);
 				kfree(del);
 			}
@@ -6016,8 +6017,15 @@ int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
 	if (ftrace_hash_empty(hash))
 		return -EINVAL;
 
+	/* This is a "raw" address, and this should never happen. */
+	if (WARN_ON_ONCE(ftrace_is_jmp(addr)))
+		return -EINVAL;
+
 	mutex_lock(&direct_mutex);
 
+	if (ops->flags & FTRACE_OPS_FL_JMP)
+		addr = ftrace_jmp_set(addr);
+
 	/* Make sure requested entries are not already registered.. */
 	size = 1 << hash->size_bits;
 	for (i = 0; i < size; i++) {
@@ -6138,6 +6146,13 @@ __modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
 
 	lockdep_assert_held_once(&direct_mutex);
 
+	/* This is a "raw" address, and this should never happen. */
+	if (WARN_ON_ONCE(ftrace_is_jmp(addr)))
+		return -EINVAL;
+
+	if (ops->flags & FTRACE_OPS_FL_JMP)
+		addr = ftrace_jmp_set(addr);
+
 	/* Enable the tmp_ops to have the same functions as the direct ops */
 	ftrace_ops_init(&tmp_ops);
 	tmp_ops.func_hash = ops->func_hash;
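The ftrace_jmp_set()/ftrace_jmp_get()/ftrace_is_jmp() helpers used in the
ftrace.c hunks are defined outside kernel/ and therefore do not appear in
this diffstat-limited view. A plausible shape, assuming the "jmp" marker
is carried in the low bit of the (aligned) direct-call address:

  /* Sketch only, assuming low-bit tagging of direct addresses:
   * trampoline images are at least 2-byte aligned, so bit 0 is free
   * to record that this direct call should be patched as a "jmp".
   */
  static inline unsigned long ftrace_jmp_set(unsigned long addr)
  {
          return addr | 1UL;
  }

  static inline unsigned long ftrace_jmp_get(unsigned long addr)
  {
          return addr & ~1UL;
  }

  static inline bool ftrace_is_jmp(unsigned long addr)
  {
          return addr & 1UL;
  }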
