summaryrefslogtreecommitdiff
path: root/kernel/bpf
AgeCommit message (Collapse)Author
2024-07-12bpf: use check_sub_overflow() to check for subtraction overflowsShung-Hsi Yu
Similar to previous patch that drops signed_add*_overflows() and uses (compiler) builtin-based check_add_overflow(), do the same for signed_sub*_overflows() and replace them with the generic check_sub_overflow() to make future refactoring easier and have the checks implemented more efficiently. Unsigned overflow check for subtraction does not use helpers and are simple enough already, so they're left untouched. After the change GCC 13.3.0 generates cleaner assembly on x86_64: if (check_sub_overflow(*dst_smin, src_reg->smax_value, dst_smin) || 139bf: mov 0x28(%r12),%rax 139c4: mov %edx,0x54(%r12) 139c9: sub %r11,%rax 139cc: mov %rax,0x28(%r12) 139d1: jo 14627 <adjust_reg_min_max_vals+0x1237> check_sub_overflow(*dst_smax, src_reg->smin_value, dst_smax)) { 139d7: mov 0x30(%r12),%rax 139dc: sub %r9,%rax 139df: mov %rax,0x30(%r12) if (check_sub_overflow(*dst_smin, src_reg->smax_value, dst_smin) || 139e4: jo 14627 <adjust_reg_min_max_vals+0x1237> ... *dst_smin = S64_MIN; 14627: movabs $0x8000000000000000,%rax 14631: mov %rax,0x28(%r12) *dst_smax = S64_MAX; 14636: sub $0x1,%rax 1463a: mov %rax,0x30(%r12) Before the change it gives: if (signed_sub_overflows(dst_reg->smin_value, smax_val) || 13a50: mov 0x28(%r12),%rdi 13a55: mov %edx,0x54(%r12) dst_reg->smax_value = S64_MAX; 13a5a: movabs $0x7fffffffffffffff,%rdx 13a64: mov %eax,0x50(%r12) dst_reg->smin_value = S64_MIN; 13a69: movabs $0x8000000000000000,%rax s64 res = (s64)((u64)a - (u64)b); 13a73: mov %rdi,%rsi 13a76: sub %rcx,%rsi if (b < 0) 13a79: test %rcx,%rcx 13a7c: js 145ea <adjust_reg_min_max_vals+0x119a> if (signed_sub_overflows(dst_reg->smin_value, smax_val) || 13a82: cmp %rsi,%rdi 13a85: jl 13ac7 <adjust_reg_min_max_vals+0x677> signed_sub_overflows(dst_reg->smax_value, smin_val)) { 13a87: mov 0x30(%r12),%r8 s64 res = (s64)((u64)a - (u64)b); 13a8c: mov %r8,%rax 13a8f: sub %r9,%rax return res > a; 13a92: cmp %rax,%r8 13a95: setl %sil if (b < 0) 13a99: test %r9,%r9 13a9c: js 147d1 <adjust_reg_min_max_vals+0x1381> dst_reg->smax_value = S64_MAX; 13aa2: movabs $0x7fffffffffffffff,%rdx dst_reg->smin_value = S64_MIN; 13aac: movabs $0x8000000000000000,%rax if (signed_sub_overflows(dst_reg->smin_value, smax_val) || 13ab6: test %sil,%sil 13ab9: jne 13ac7 <adjust_reg_min_max_vals+0x677> dst_reg->smin_value -= smax_val; 13abb: mov %rdi,%rax dst_reg->smax_value -= smin_val; 13abe: mov %r8,%rdx dst_reg->smin_value -= smax_val; 13ac1: sub %rcx,%rax dst_reg->smax_value -= smin_val; 13ac4: sub %r9,%rdx 13ac7: mov %rax,0x28(%r12) ... 13ad1: mov %rdx,0x30(%r12) ... if (signed_sub_overflows(dst_reg->smin_value, smax_val) || 145ea: cmp %rsi,%rdi 145ed: jg 13ac7 <adjust_reg_min_max_vals+0x677> 145f3: jmp 13a87 <adjust_reg_min_max_vals+0x637> Suggested-by: Jiri Olsa <jolsa@kernel.org> Signed-off-by: Shung-Hsi Yu <shung-hsi.yu@suse.com> Acked-by: Jiri Olsa <jolsa@kernel.org> Link: https://lore.kernel.org/r/20240712080127.136608-4-shung-hsi.yu@suse.com Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2024-07-12bpf: use check_add_overflow() to check for addition overflowsShung-Hsi Yu
signed_add*_overflows() was added back when there was no overflow-check helper. With the introduction of such helpers in commit f0907827a8a91 ("compiler.h: enable builtin overflow checkers and add fallback code"), we can drop signed_add*_overflows() in kernel/bpf/verifier.c and use the generic check_add_overflow() instead. This will make future refactoring easier, and takes advantage of compiler-emitted hardware instructions that efficiently implement these checks. After the change GCC 13.3.0 generates cleaner assembly on x86_64: err = adjust_scalar_min_max_vals(env, insn, dst_reg, *src_reg); 13625: mov 0x28(%rbx),%r9 /* r9 = src_reg->smin_value */ 13629: mov 0x30(%rbx),%rcx /* rcx = src_reg->smax_value */ ... if (check_add_overflow(*dst_smin, src_reg->smin_value, dst_smin) || 141c1: mov %r9,%rax 141c4: add 0x28(%r12),%rax 141c9: mov %rax,0x28(%r12) 141ce: jo 146e4 <adjust_reg_min_max_vals+0x1294> check_add_overflow(*dst_smax, src_reg->smax_value, dst_smax)) { 141d4: add 0x30(%r12),%rcx 141d9: mov %rcx,0x30(%r12) if (check_add_overflow(*dst_smin, src_reg->smin_value, dst_smin) || 141de: jo 146e4 <adjust_reg_min_max_vals+0x1294> ... *dst_smin = S64_MIN; 146e4: movabs $0x8000000000000000,%rax 146ee: mov %rax,0x28(%r12) *dst_smax = S64_MAX; 146f3: sub $0x1,%rax 146f7: mov %rax,0x30(%r12) Before the change it gives: s64 smin_val = src_reg->smin_value; 675: mov 0x28(%rsi),%r8 s64 smax_val = src_reg->smax_value; u64 umin_val = src_reg->umin_value; u64 umax_val = src_reg->umax_value; 679: mov %rdi,%rax /* rax = dst_reg */ if (signed_add_overflows(dst_reg->smin_value, smin_val) || 67c: mov 0x28(%rdi),%rdi /* rdi = dst_reg->smin_value */ u64 umin_val = src_reg->umin_value; 680: mov 0x38(%rsi),%rdx u64 umax_val = src_reg->umax_value; 684: mov 0x40(%rsi),%rcx s64 res = (s64)((u64)a + (u64)b); 688: lea (%r8,%rdi,1),%r9 /* r9 = dst_reg->smin_value + src_reg->smin_value */ return res < a; 68c: cmp %r9,%rdi 68f: setg %r10b /* r10b = (dst_reg->smin_value + src_reg->smin_value) > dst_reg->smin_value */ if (b < 0) 693: test %r8,%r8 696: js 72b <scalar_min_max_add+0xbb> signed_add_overflows(dst_reg->smax_value, smax_val)) { dst_reg->smin_value = S64_MIN; dst_reg->smax_value = S64_MAX; 69c: movabs $0x7fffffffffffffff,%rdi s64 smax_val = src_reg->smax_value; 6a6: mov 0x30(%rsi),%r8 dst_reg->smin_value = S64_MIN; 6aa: 00 00 00 movabs $0x8000000000000000,%rsi if (signed_add_overflows(dst_reg->smin_value, smin_val) || 6b4: test %r10b,%r10b /* (dst_reg->smin_value + src_reg->smin_value) > dst_reg->smin_value ? goto 6cb */ 6b7: jne 6cb <scalar_min_max_add+0x5b> signed_add_overflows(dst_reg->smax_value, smax_val)) { 6b9: mov 0x30(%rax),%r10 /* r10 = dst_reg->smax_value */ s64 res = (s64)((u64)a + (u64)b); 6bd: lea (%r10,%r8,1),%r11 /* r11 = dst_reg->smax_value + src_reg->smax_value */ if (b < 0) 6c1: test %r8,%r8 6c4: js 71e <scalar_min_max_add+0xae> if (signed_add_overflows(dst_reg->smin_value, smin_val) || 6c6: cmp %r11,%r10 /* (dst_reg->smax_value + src_reg->smax_value) <= dst_reg->smax_value ? goto 723 */ 6c9: jle 723 <scalar_min_max_add+0xb3> } else { dst_reg->smin_value += smin_val; dst_reg->smax_value += smax_val; } 6cb: mov %rsi,0x28(%rax) ... 6d5: mov %rdi,0x30(%rax) ... if (signed_add_overflows(dst_reg->smin_value, smin_val) || 71e: cmp %r11,%r10 721: jl 6cb <scalar_min_max_add+0x5b> dst_reg->smin_value += smin_val; 723: mov %r9,%rsi dst_reg->smax_value += smax_val; 726: mov %r11,%rdi 729: jmp 6cb <scalar_min_max_add+0x5b> return res > a; 72b: cmp %r9,%rdi 72e: setl %r10b 732: jmp 69c <scalar_min_max_add+0x2c> 737: nopw 0x0(%rax,%rax,1) Note: unlike adjust_ptr_min_max_vals() and scalar*_min_max_add(), it is necessary to introduce intermediate variable in adjust_jmp_off() to keep the functional behavior unchanged. Without an intermediate variable imm/off will be altered even on overflow. Suggested-by: Jiri Olsa <jolsa@kernel.org> Signed-off-by: Shung-Hsi Yu <shung-hsi.yu@suse.com> Link: https://lore.kernel.org/r/20240712080127.136608-3-shung-hsi.yu@suse.com Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2024-07-12bpf: fix overflow check in adjust_jmp_off()Shung-Hsi Yu
adjust_jmp_off() incorrectly used the insn->imm field for all overflow check, which is incorrect as that should only be done or the BPF_JMP32 | BPF_JA case, not the general jump instruction case. Fix it by using insn->off for overflow check in the general case. Fixes: 5337ac4c9b80 ("bpf: Fix the corner case with may_goto and jump to the 1st insn.") Signed-off-by: Shung-Hsi Yu <shung-hsi.yu@suse.com> Link: https://lore.kernel.org/r/20240712080127.136608-2-shung-hsi.yu@suse.com Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2024-07-12bpf: Eliminate remaining "make W=1" warnings in kernel/bpf/btf.oAlan Maguire
As reported by Mirsad [1] we still see format warnings in kernel/bpf/btf.o at W=1 warning level: CC kernel/bpf/btf.o ./kernel/bpf/btf.c: In function ‘btf_type_seq_show_flags’: ./kernel/bpf/btf.c:7553:21: warning: assignment left-hand side might be a candidate for a format attribute [-Wsuggest-attribute=format] 7553 | sseq.showfn = btf_seq_show; | ^ ./kernel/bpf/btf.c: In function ‘btf_type_snprintf_show’: ./kernel/bpf/btf.c:7604:31: warning: assignment left-hand side might be a candidate for a format attribute [-Wsuggest-attribute=format] 7604 | ssnprintf.show.showfn = btf_snprintf_show; | ^ Combined with CONFIG_WERROR=y these can halt the build. The fix (annotating the structure field with __printf()) suggested by Mirsad resolves these. Apologies I missed this last time. No other W=1 warnings were observed in kernel/bpf after this fix. [1] https://lore.kernel.org/bpf/92c9d047-f058-400c-9c7d-81d4dc1ef71b@gmail.com/ Fixes: b3470da314fd ("bpf: annotate BTF show functions with __printf") Reported-by: Mirsad Todorovac <mtodorovac69@gmail.com> Suggested-by: Mirsad Todorovac <mtodorovac69@gmail.com> Signed-off-by: Alan Maguire <alan.maguire@oracle.com> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> Link: https://lore.kernel.org/bpf/20240712092859.1390960-1-alan.maguire@oracle.com
2024-07-11bpf: annotate BTF show functions with __printfAlan Maguire
-Werror=suggest-attribute=format warns about two functions in kernel/bpf/btf.c [1]; add __printf() annotations to silence these warnings since for CONFIG_WERROR=y they will trigger build failures. [1] https://lore.kernel.org/bpf/a8b20c72-6631-4404-9e1f-0410642d7d20@gmail.com/ Fixes: 31d0bc81637d ("bpf: Move to generic BTF show support, apply it to seq files/strings") Reported-by: Mirsad Todorovac <mtodorovac69@gmail.com> Signed-off-by: Alan Maguire <alan.maguire@oracle.com> Tested-by: Mirsad Todorovac <mtodorovac69@yahoo.com> Link: https://lore.kernel.org/r/20240711182321.963667-1-alan.maguire@oracle.com Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2024-07-11Merge git://git.kernel.org/pub/scm/linux/kernel/git/netdev/netJakub Kicinski
Cross-merge networking fixes after downstream PR. Conflicts: net/sched/act_ct.c 26488172b029 ("net/sched: Fix UAF when resolving a clash") 3abbd7ed8b76 ("act_ct: prepare for stolen verdict coming from conntrack and nat engine") No adjacent changes. Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2024-07-10bpf: Defer work in bpf_timer_cancel_and_freeKumar Kartikeya Dwivedi
Currently, the same case as previous patch (two timer callbacks trying to cancel each other) can be invoked through bpf_map_update_elem as well, or more precisely, freeing map elements containing timers. Since this relies on hrtimer_cancel as well, it is prone to the same deadlock situation as the previous patch. It would be sufficient to use hrtimer_try_to_cancel to fix this problem, as the timer cannot be enqueued after async_cancel_and_free. Once async_cancel_and_free has been done, the timer must be reinitialized before it can be armed again. The callback running in parallel trying to arm the timer will fail, and freeing bpf_hrtimer without waiting is sufficient (given kfree_rcu), and bpf_timer_cb will return HRTIMER_NORESTART, preventing the timer from being rearmed again. However, there exists a UAF scenario where the callback arms the timer before entering this function, such that if cancellation fails (due to timer callback invoking this routine, or the target timer callback running concurrently). In such a case, if the timer expiration is significantly far in the future, the RCU grace period expiration happening before it will free the bpf_hrtimer state and along with it the struct hrtimer, that is enqueued. Hence, it is clear cancellation needs to occur after async_cancel_and_free, and yet it cannot be done inline due to deadlock issues. We thus modify bpf_timer_cancel_and_free to defer work to the global workqueue, adding a work_struct alongside rcu_head (both used at _different_ points of time, so can share space). Update existing code comments to reflect the new state of affairs. Fixes: b00628b1c7d5 ("bpf: Introduce bpf timers.") Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com> Link: https://lore.kernel.org/r/20240709185440.1104957-3-memxor@gmail.com Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2024-07-10bpf: Fail bpf_timer_cancel when callback is being cancelledKumar Kartikeya Dwivedi
Given a schedule: timer1 cb timer2 cb bpf_timer_cancel(timer2); bpf_timer_cancel(timer1); Both bpf_timer_cancel calls would wait for the other callback to finish executing, introducing a lockup. Add an atomic_t count named 'cancelling' in bpf_hrtimer. This keeps track of all in-flight cancellation requests for a given BPF timer. Whenever cancelling a BPF timer, we must check if we have outstanding cancellation requests, and if so, we must fail the operation with an error (-EDEADLK) since cancellation is synchronous and waits for the callback to finish executing. This implies that we can enter a deadlock situation involving two or more timer callbacks executing in parallel and attempting to cancel one another. Note that we avoid incrementing the cancelling counter for the target timer (the one being cancelled) if bpf_timer_cancel is not invoked from a callback, to avoid spurious errors. The whole point of detecting cur->cancelling and returning -EDEADLK is to not enter a busy wait loop (which may or may not lead to a lockup). This does not apply in case the caller is in a non-callback context, the other side can continue to cancel as it sees fit without running into errors. Background on prior attempts: Earlier versions of this patch used a bool 'cancelling' bit and used the following pattern under timer->lock to publish cancellation status. lock(t->lock); t->cancelling = true; mb(); if (cur->cancelling) return -EDEADLK; unlock(t->lock); hrtimer_cancel(t->timer); t->cancelling = false; The store outside the critical section could overwrite a parallel requests t->cancelling assignment to true, to ensure the parallely executing callback observes its cancellation status. It would be necessary to clear this cancelling bit once hrtimer_cancel is done, but lack of serialization introduced races. Another option was explored where bpf_timer_start would clear the bit when (re)starting the timer under timer->lock. This would ensure serialized access to the cancelling bit, but may allow it to be cleared before in-flight hrtimer_cancel has finished executing, such that lockups can occur again. Thus, we choose an atomic counter to keep track of all outstanding cancellation requests and use it to prevent lockups in case callbacks attempt to cancel each other while executing in parallel. Reported-by: Dohyun Kim <dohyunkim@google.com> Reported-by: Neel Natu <neelnatu@google.com> Fixes: b00628b1c7d5 ("bpf: Introduce bpf timers.") Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com> Link: https://lore.kernel.org/r/20240709185440.1104957-2-memxor@gmail.com Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2024-07-10bpf: fix order of args in call to bpf_map_kvcallocMohammad Shehar Yaar Tausif
The original function call passed size of smap->bucket before the number of buckets which raises the error 'calloc-transposed-args' on compilation. Vlastimil Babka added: The order of parameters can be traced back all the way to 6ac99e8f23d4 ("bpf: Introduce bpf sk local storage") accross several refactorings, and that's why the commit is used as a Fixes: tag. In v6.10-rc1, a different commit 2c321f3f70bc ("mm: change inlined allocation helpers to account at the call site") however exposed the order of args in a way that gcc-14 has enough visibility to start warning about it, because (in !CONFIG_MEMCG case) bpf_map_kvcalloc is then a macro alias for kvcalloc instead of a static inline wrapper. To sum up the warning happens when the following conditions are all met: - gcc-14 is used (didn't see it with gcc-13) - commit 2c321f3f70bc is present - CONFIG_MEMCG is not enabled in .config - CONFIG_WERROR turns this from a compiler warning to error Fixes: 6ac99e8f23d4 ("bpf: Introduce bpf sk local storage") Reviewed-by: Andrii Nakryiko <andrii@kernel.org> Tested-by: Christian Kujau <lists@nerdbynature.de> Signed-off-by: Mohammad Shehar Yaar Tausif <sheharyaar48@gmail.com> Signed-off-by: Vlastimil Babka <vbabka@suse.cz> Link: https://lore.kernel.org/r/20240710100521.15061-2-vbabka@suse.cz Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2024-07-10mm: remove CONFIG_MEMCG_KMEMJohannes Weiner
CONFIG_MEMCG_KMEM used to be a user-visible option for whether slab tracking is enabled. It has been default-enabled and equivalent to CONFIG_MEMCG for almost a decade. We've only grown more kernel memory accounting sites since, and there is no imaginable cgroup usecase going forward that wants to track user pages but not the multitude of user-drivable kernel allocations. Link: https://lkml.kernel.org/r/20240701153148.452230-1-hannes@cmpxchg.org Signed-off-by: Johannes Weiner <hannes@cmpxchg.org> Acked-by: Roman Gushchin <roman.gushchin@linux.dev> Acked-by: Michal Hocko <mhocko@suse.com> Acked-by: Shakeel Butt <shakeel.butt@linux.dev> Acked-by: David Hildenbrand <david@redhat.com> Cc: Muchun Song <muchun.song@linux.dev> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-07-09bpf: relax zero fixed offset constraint on KF_TRUSTED_ARGS/KF_RCUMatt Bobrowski
Currently, BPF kfuncs which accept trusted pointer arguments i.e. those flagged as KF_TRUSTED_ARGS, KF_RCU, or KF_RELEASE, all require an original/unmodified trusted pointer argument to be supplied to them. By original/unmodified, it means that the backing register holding the trusted pointer argument that is to be supplied to the BPF kfunc must have its fixed offset set to zero, or else the BPF verifier will outright reject the BPF program load. However, this zero fixed offset constraint that is currently enforced by the BPF verifier onto BPF kfuncs specifically flagged to accept KF_TRUSTED_ARGS or KF_RCU trusted pointer arguments is rather unnecessary, and can limit their usability in practice. Specifically, it completely eliminates the possibility of constructing a derived trusted pointer from an original trusted pointer. To put it simply, a derived pointer is a pointer which points to one of the nested member fields of the object being pointed to by the original trusted pointer. This patch relaxes the zero fixed offset constraint that is enforced upon BPF kfuncs which specifically accept KF_TRUSTED_ARGS, or KF_RCU arguments. Although, the zero fixed offset constraint technically also applies to BPF kfuncs accepting KF_RELEASE arguments, relaxing this constraint for such BPF kfuncs has subtle and unwanted side-effects. This was discovered by experimenting a little further with an initial version of this patch series [0]. The primary issue with relaxing the zero fixed offset constraint on BPF kfuncs accepting KF_RELEASE arguments is that it'd would open up the opportunity for BPF programs to supply both trusted pointers and derived trusted pointers to them. For KF_RELEASE BPF kfuncs specifically, this could be problematic as resources associated with the backing pointer could be released by the backing BPF kfunc and cause instabilities for the rest of the kernel. With this new fixed offset semantic in-place for BPF kfuncs accepting KF_TRUSTED_ARGS and KF_RCU arguments, we now have more flexibility when it comes to the BPF kfuncs that we're able to introduce moving forward. Early discussions covering the possibility of relaxing the zero fixed offset constraint can be found using the link below. This will provide more context on where all this has stemmed from [1]. Notably, pre-existing tests have been updated such that they provide coverage for the updated zero fixed offset functionality. Specifically, the nested offset test was converted from a negative to positive test as it was already designed to assert zero fixed offset semantics of a KF_TRUSTED_ARGS BPF kfunc. [0] https://lore.kernel.org/bpf/ZnA9ndnXKtHOuYMe@google.com/ [1] https://lore.kernel.org/bpf/ZhkbrM55MKQ0KeIV@google.com/ Signed-off-by: Matt Bobrowski <mattbobrowski@google.com> Acked-by: Kumar Kartikeya Dwivedi <memxor@gmail.com> Link: https://lore.kernel.org/r/20240709210939.1544011-1-mattbobrowski@google.com Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2024-07-09Merge tag 'for-netdev' of ↵Paolo Abeni
https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next Daniel Borkmann says: ==================== pull-request: bpf-next 2024-07-08 The following pull-request contains BPF updates for your *net-next* tree. We've added 102 non-merge commits during the last 28 day(s) which contain a total of 127 files changed, 4606 insertions(+), 980 deletions(-). The main changes are: 1) Support resilient split BTF which cuts down on duplication and makes BTF as compact as possible wrt BTF from modules, from Alan Maguire & Eduard Zingerman. 2) Add support for dumping kfunc prototypes from BTF which enables both detecting as well as dumping compilable prototypes for kfuncs, from Daniel Xu. 3) Batch of s390x BPF JIT improvements to add support for BPF arena and to implement support for BPF exceptions, from Ilya Leoshkevich. 4) Batch of riscv64 BPF JIT improvements in particular to add 12-argument support for BPF trampolines and to utilize bpf_prog_pack for the latter, from Pu Lehui. 5) Extend BPF test infrastructure to add a CHECKSUM_COMPLETE validation option for skbs and add coverage along with it, from Vadim Fedorenko. 6) Inline bpf_get_current_task/_btf() helpers in the arm64 BPF JIT which gives a small 1% performance improvement in micro-benchmarks, from Puranjay Mohan. 7) Extend the BPF verifier to track the delta between linked registers in order to better deal with recent LLVM code optimizations, from Alexei Starovoitov. 8) Fix bpf_wq_set_callback_impl() kfunc signature where the third argument should have been a pointer to the map value, from Benjamin Tissoires. 9) Extend BPF selftests to add regular expression support for test output matching and adjust some of the selftest when compiled under gcc, from Cupertino Miranda. 10) Simplify task_file_seq_get_next() and remove an unnecessary loop which always iterates exactly once anyway, from Dan Carpenter. 11) Add the capability to offload the netfilter flowtable in XDP layer through kfuncs, from Florian Westphal & Lorenzo Bianconi. 12) Various cleanups in networking helpers in BPF selftests to shave off a few lines of open-coded functions on client/server handling, from Geliang Tang. 13) Properly propagate prog->aux->tail_call_reachable out of BPF verifier, so that x86 JIT does not need to implement detection, from Leon Hwang. 14) Fix BPF verifier to add a missing check_func_arg_reg_off() to prevent an out-of-bounds memory access for dynpointers, from Matt Bobrowski. 15) Fix bpf_session_cookie() kfunc to return __u64 instead of long pointer as it might lead to problems on 32-bit archs, from Jiri Olsa. 16) Enhance traffic validation and dynamic batch size support in xsk selftests, from Tushar Vyavahare. bpf-next-for-netdev * tag 'for-netdev' of https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next: (102 commits) selftests/bpf: DENYLIST.aarch64: Remove fexit_sleep selftests/bpf: amend for wrong bpf_wq_set_callback_impl signature bpf: helpers: fix bpf_wq_set_callback_impl signature libbpf: Add NULL checks to bpf_object__{prev_map,next_map} selftests/bpf: Remove exceptions tests from DENYLIST.s390x s390/bpf: Implement exceptions s390/bpf: Change seen_reg to a mask bpf: Remove unnecessary loop in task_file_seq_get_next() riscv, bpf: Optimize stack usage of trampoline bpf, devmap: Add .map_alloc_check selftests/bpf: Remove arena tests from DENYLIST.s390x selftests/bpf: Add UAF tests for arena atomics selftests/bpf: Introduce __arena_global s390/bpf: Support arena atomics s390/bpf: Enable arena s390/bpf: Support address space cast instruction s390/bpf: Support BPF_PROBE_MEM32 s390/bpf: Land on the next JITed instruction after exception s390/bpf: Introduce pre- and post- probe functions s390/bpf: Get rid of get_probe_mem_regno() ... ==================== Link: https://patch.msgid.link/20240708221438.10974-1-daniel@iogearbox.net Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2024-07-08bpf: helpers: fix bpf_wq_set_callback_impl signatureBenjamin Tissoires
I realized this while having a map containing both a struct bpf_timer and a struct bpf_wq: the third argument provided to the bpf_wq callback is not the struct bpf_wq pointer itself, but the pointer to the value in the map. Which means that the users need to double cast the provided "value" as this is not a struct bpf_wq *. This is a change of API, but there doesn't seem to be much users of bpf_wq right now, so we should be able to go with this right now. Fixes: 81f1d7a583fa ("bpf: wq: add bpf_wq_set_callback_impl") Signed-off-by: Benjamin Tissoires <bentiss@kernel.org> Link: https://lore.kernel.org/r/20240708-fix-wq-v2-1-667e5c9fbd99@kernel.org Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2024-07-08bpf: Remove unnecessary loop in task_file_seq_get_next()Dan Carpenter
After commit 0ede61d8589c ("file: convert to SLAB_TYPESAFE_BY_RCU") this loop always iterates exactly one time. Delete the for statement and pull the code in a tab. Signed-off-by: Dan Carpenter <dan.carpenter@linaro.org> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> Reviewed-by: Christian Brauner <brauner@kernel.org> Acked-by: Jiri Olsa <jolsa@kernel.org> Acked-by: Yonghong Song <yonghong.song@linux.dev> Link: https://lore.kernel.org/bpf/ZoWJF51D4zWb6f5t@stanley.mountain
2024-07-04Merge git://git.kernel.org/pub/scm/linux/kernel/git/netdev/netJakub Kicinski
Cross-merge networking fixes after downstream PR. Conflicts: drivers/net/phy/aquantia/aquantia.h 219343755eae ("net: phy: aquantia: add missing include guards") 61578f679378 ("net: phy: aquantia: add support for PHY LEDs") drivers/net/ethernet/wangxun/libwx/wx_hw.c bd07a9817846 ("net: txgbe: remove separate irq request for MSI and INTx") b501d261a5b3 ("net: txgbe: add FDIR ATR support") https://lore.kernel.org/all/20240703112936.483c1975@canb.auug.org.au/ include/linux/mlx5/mlx5_ifc.h 048a403648fc ("net/mlx5: IFC updates for changing max EQs") 99be56171fa9 ("net/mlx5e: SHAMPO, Re-enable HW-GRO") https://lore.kernel.org/all/20240701133951.6926b2e3@canb.auug.org.au/ Adjacent changes: drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c 4130c67cd123 ("wifi: iwlwifi: mvm: check vif for NULL/ERR_PTR before dereference") 3f3126515fbe ("wifi: iwlwifi: mvm: add mvm-specific guard") include/net/mac80211.h 816c6bec09ed ("wifi: mac80211: fix BSS_CHANGED_UNSOL_BCAST_PROBE_RESP") 5a009b42e041 ("wifi: mac80211: track changes in AP's TPE") Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2024-07-02bpf, devmap: Add .map_alloc_checkFlorian Lehner
Use the .map_allock_check callback to perform allocation checks before allocating memory for the devmap. Signed-off-by: Florian Lehner <dev@der-flo.net> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> Acked-by: Daniel Borkmann <daniel@iogearbox.net> Link: https://lore.kernel.org/bpf/20240615101158.57889-1-dev@der-flo.net
2024-07-02bpf: Fix atomic probe zero-extensionIlya Leoshkevich
Zero-extending results of atomic probe operations fails with: verifier bug. zext_dst is set, but no reg is defined The problem is that insn_def_regno() handles BPF_ATOMICs, but not BPF_PROBE_ATOMICs. Fix by adding the missing condition. Fixes: d503a04f8bc0 ("bpf: Add support for certain atomics in bpf_arena to x86 JIT") Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> Acked-by: Daniel Borkmann <daniel@iogearbox.net> Link: https://lore.kernel.org/bpf/20240701234304.14336-2-iii@linux.ibm.com
2024-07-02net: Move flush list retrieval to where it is used.Sebastian Andrzej Siewior
The bpf_net_ctx_get_.*_flush_list() are used at the top of the function. This means the variable is always assigned even if unused. By moving the function to where it is used, it is possible to delay the initialisation until it is unavoidable. Not sure how much this gains in reality but by looking at bq_enqueue() (in devmap.c) gcc pushes one register less to the stack. \o/. Move flush list retrieval to where it is used. Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> Acked-by: Jesper Dangaard Brouer <hawk@kernel.org> Reviewed-by: Jakub Kicinski <kuba@kernel.org> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2024-07-02net: Optimize xdp_do_flush() with bpf_net_context infos.Sebastian Andrzej Siewior
Every NIC driver utilizing XDP should invoke xdp_do_flush() after processing all packages. With the introduction of the bpf_net_context logic the flush lists (for dev, CPU-map and xsk) are lazy initialized only if used. However xdp_do_flush() tries to flush all three of them so all three lists are always initialized and the likely empty lists are "iterated". Without the usage of XDP but with CONFIG_DEBUG_NET the lists are also initialized due to xdp_do_check_flushed(). Jakub suggest to utilize the hints in bpf_net_context and avoid invoking the flush function. This will also avoiding initializing the lists which are otherwise unused. Introduce bpf_net_ctx_get_all_used_flush_lists() to return the individual list if not-empty. Use the logic in xdp_do_flush() and xdp_do_check_flushed(). Remove the not needed .*_check_flush(). Suggested-by: Jakub Kicinski <kuba@kernel.org> Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> Reviewed-by: Jakub Kicinski <kuba@kernel.org> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2024-07-01bpf: Use precise image size for struct_ops trampolinePu Lehui
For trampoline using bpf_prog_pack, we need to generate a rw_image buffer with size of (image_end - image). For regular trampoline, we use the precise image size generated by arch_bpf_trampoline_size to allocate rw_image. But for struct_ops trampoline, we allocate rw_image directly using close to PAGE_SIZE size. We do not need to allocate for that much, as the patch size is usually much smaller than PAGE_SIZE. Let's use precise image size for it too. Signed-off-by: Pu Lehui <pulehui@huawei.com> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> Tested-by: Björn Töpel <bjorn@rivosinc.com> #riscv Acked-by: Song Liu <song@kernel.org> Link: https://lore.kernel.org/bpf/20240622030437.3973492-2-pulehui@huaweicloud.com
2024-06-27Merge git://git.kernel.org/pub/scm/linux/kernel/git/netdev/netJakub Kicinski
Cross-merge networking fixes after downstream PR. No conflicts. Adjacent changes: e3f02f32a050 ("ionic: fix kernel panic due to multi-buffer handling") d9c04209990b ("ionic: Mark error paths in the data path as unlikely") Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2024-06-27Merge tag 'asm-generic-fixes-6.10' of ↵Linus Torvalds
git://git.kernel.org/pub/scm/linux/kernel/git/arnd/asm-generic Pull asm-generic fixes from Arnd Bergmann: "These are some bugfixes for system call ABI issues I found while working on a cleanup series. None of these are urgent since these bugs have gone unnoticed for many years, but I think we probably want to backport them all to stable kernels, so it makes sense to have the fixes included as early as possible. One more fix addresses a compile-time warning in kallsyms that was uncovered by a patch I did to enable additional warnings in 6.10. I had mistakenly thought that this fix was already merged through the module tree, but as Geert pointed out it was still missing" * tag 'asm-generic-fixes-6.10' of git://git.kernel.org/pub/scm/linux/kernel/git/arnd/asm-generic: kallsyms: rework symbol lookup return codes linux/syscalls.h: add missing __user annotations syscalls: mmap(): use unsigned offset type consistently s390: remove native mmap2() syscall hexagon: fix fadvise64_64 calling conventions csky, hexagon: fix broken sys_sync_file_range sh: rework sync_file_range ABI powerpc: restore some missing spu syscalls parisc: use generic sys_fanotify_mark implementation parisc: use correct compat recv/recvfrom syscalls sparc: fix compat recv/recvfrom syscalls sparc: fix old compat_sys_select() syscalls: fix compat_sys_io_pgetevents_time64 usage ftruncate: pass a signed offset
2024-06-27kallsyms: rework symbol lookup return codesArnd Bergmann
Building with W=1 in some configurations produces a false positive warning for kallsyms: kernel/kallsyms.c: In function '__sprint_symbol.isra': kernel/kallsyms.c:503:17: error: 'strcpy' source argument is the same as destination [-Werror=restrict] 503 | strcpy(buffer, name); | ^~~~~~~~~~~~~~~~~~~~ This originally showed up while building with -O3, but later started happening in other configurations as well, depending on inlining decisions. The underlying issue is that the local 'name' variable is always initialized to the be the same as 'buffer' in the called functions that fill the buffer, which gcc notices while inlining, though it could see that the address check always skips the copy. The calling conventions here are rather unusual, as all of the internal lookup functions (bpf_address_lookup, ftrace_mod_address_lookup, ftrace_func_address_lookup, module_address_lookup and kallsyms_lookup_buildid) already use the provided buffer and either return the address of that buffer to indicate success, or NULL for failure, but the callers are written to also expect an arbitrary other buffer to be returned. Rework the calling conventions to return the length of the filled buffer instead of its address, which is simpler and easier to follow as well as avoiding the warning. Leave only the kallsyms_lookup() calling conventions unchanged, since that is called from 16 different functions and adapting this would be a much bigger change. Link: https://lore.kernel.org/lkml/20200107214042.855757-1-arnd@arndb.de/ Link: https://lore.kernel.org/lkml/20240326130647.7bfb1d92@gandalf.local.home/ Tested-by: Geert Uytterhoeven <geert+renesas@glider.be> Reviewed-by: Luis Chamberlain <mcgrof@kernel.org> Acked-by: Steven Rostedt (Google) <rostedt@goodmis.org> Signed-off-by: Arnd Bergmann <arnd@arndb.de>
2024-06-26bpf: add missing check_func_arg_reg_off() to prevent out-of-bounds memory ↵Matt Bobrowski
accesses Currently, it's possible to pass in a modified CONST_PTR_TO_DYNPTR to a global function as an argument. The adverse effects of this is that BPF helpers can continue to make use of this modified CONST_PTR_TO_DYNPTR from within the context of the global function, which can unintentionally result in out-of-bounds memory accesses and therefore compromise overall system stability i.e. [ 244.157771] BUG: KASAN: slab-out-of-bounds in bpf_dynptr_data+0x137/0x140 [ 244.161345] Read of size 8 at addr ffff88810914be68 by task test_progs/302 [ 244.167151] CPU: 0 PID: 302 Comm: test_progs Tainted: G O E 6.10.0-rc3-00131-g66b586715063 #533 [ 244.174318] Call Trace: [ 244.175787] <TASK> [ 244.177356] dump_stack_lvl+0x66/0xa0 [ 244.179531] print_report+0xce/0x670 [ 244.182314] ? __virt_addr_valid+0x200/0x3e0 [ 244.184908] kasan_report+0xd7/0x110 [ 244.187408] ? bpf_dynptr_data+0x137/0x140 [ 244.189714] ? bpf_dynptr_data+0x137/0x140 [ 244.192020] bpf_dynptr_data+0x137/0x140 [ 244.194264] bpf_prog_b02a02fdd2bdc5fa_global_call_bpf_dynptr_data+0x22/0x26 [ 244.198044] bpf_prog_b0fe7b9d7dc3abde_callback_adjust_bpf_dynptr_reg_off+0x1f/0x23 [ 244.202136] bpf_user_ringbuf_drain+0x2c7/0x570 [ 244.204744] ? 0xffffffffc0009e58 [ 244.206593] ? __pfx_bpf_user_ringbuf_drain+0x10/0x10 [ 244.209795] bpf_prog_33ab33f6a804ba2d_user_ringbuf_callback_const_ptr_to_dynptr_reg_off+0x47/0x4b [ 244.215922] bpf_trampoline_6442502480+0x43/0xe3 [ 244.218691] __x64_sys_prlimit64+0x9/0xf0 [ 244.220912] do_syscall_64+0xc1/0x1d0 [ 244.223043] entry_SYSCALL_64_after_hwframe+0x77/0x7f [ 244.226458] RIP: 0033:0x7ffa3eb8f059 [ 244.228582] Code: 08 89 e8 5b 5d c3 66 2e 0f 1f 84 00 00 00 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 8f 1d 0d 00 f7 d8 64 89 01 48 [ 244.241307] RSP: 002b:00007ffa3e9c6eb8 EFLAGS: 00000206 ORIG_RAX: 000000000000012e [ 244.246474] RAX: ffffffffffffffda RBX: 00007ffa3e9c7cdc RCX: 00007ffa3eb8f059 [ 244.250478] RDX: 00007ffa3eb162b4 RSI: 0000000000000000 RDI: 00007ffa3e9c7fb0 [ 244.255396] RBP: 00007ffa3e9c6ed0 R08: 00007ffa3e9c76c0 R09: 0000000000000000 [ 244.260195] R10: 0000000000000000 R11: 0000000000000206 R12: ffffffffffffff80 [ 244.264201] R13: 000000000000001c R14: 00007ffc5d6b4260 R15: 00007ffa3e1c7000 [ 244.268303] </TASK> Add a check_func_arg_reg_off() to the path in which the BPF verifier verifies the arguments of global function arguments, specifically those which take an argument of type ARG_PTR_TO_DYNPTR | MEM_RDONLY. Also, process_dynptr_func() doesn't appear to perform any explicit and strict type matching on the supplied register type, so let's also enforce that a register either type PTR_TO_STACK or CONST_PTR_TO_DYNPTR is by the caller. Reported-by: Kumar Kartikeya Dwivedi <memxor@gmail.com> Acked-by: Kumar Kartikeya Dwivedi <memxor@gmail.com> Acked-by: Eduard Zingerman <eddyz87@gmail.com> Signed-off-by: Matt Bobrowski <mattbobrowski@google.com> Link: https://lore.kernel.org/r/20240625062857.92760-1-mattbobrowski@google.com Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2024-06-24net: Move per-CPU flush-lists to bpf_net_context on PREEMPT_RT.Sebastian Andrzej Siewior
The per-CPU flush lists, which are accessed from within the NAPI callback (xdp_do_flush() for instance), are per-CPU. There are subject to the same problem as struct bpf_redirect_info. Add the per-CPU lists cpu_map_flush_list, dev_map_flush_list and xskmap_map_flush_list to struct bpf_net_context. Add wrappers for the access. The lists initialized on first usage (similar to bpf_net_ctx_get_ri()). Cc: "Björn Töpel" <bjorn@kernel.org> Cc: Alexei Starovoitov <ast@kernel.org> Cc: Andrii Nakryiko <andrii@kernel.org> Cc: Eduard Zingerman <eddyz87@gmail.com> Cc: Hao Luo <haoluo@google.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: John Fastabend <john.fastabend@gmail.com> Cc: Jonathan Lemon <jonathan.lemon@gmail.com> Cc: KP Singh <kpsingh@kernel.org> Cc: Maciej Fijalkowski <maciej.fijalkowski@intel.com> Cc: Magnus Karlsson <magnus.karlsson@intel.com> Cc: Martin KaFai Lau <martin.lau@linux.dev> Cc: Song Liu <song@kernel.org> Cc: Stanislav Fomichev <sdf@google.com> Cc: Yonghong Song <yonghong.song@linux.dev> Acked-by: Jesper Dangaard Brouer <hawk@kernel.org> Reviewed-by: Toke Høiland-Jørgensen <toke@redhat.com> Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> Link: https://patch.msgid.link/20240620132727.660738-16-bigeasy@linutronix.de Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2024-06-24net: Reference bpf_redirect_info via task_struct on PREEMPT_RT.Sebastian Andrzej Siewior
The XDP redirect process is two staged: - bpf_prog_run_xdp() is invoked to run a eBPF program which inspects the packet and makes decisions. While doing that, the per-CPU variable bpf_redirect_info is used. - Afterwards xdp_do_redirect() is invoked and accesses bpf_redirect_info and it may also access other per-CPU variables like xskmap_flush_list. At the very end of the NAPI callback, xdp_do_flush() is invoked which does not access bpf_redirect_info but will touch the individual per-CPU lists. The per-CPU variables are only used in the NAPI callback hence disabling bottom halves is the only protection mechanism. Users from preemptible context (like cpu_map_kthread_run()) explicitly disable bottom halves for protections reasons. Without locking in local_bh_disable() on PREEMPT_RT this data structure requires explicit locking. PREEMPT_RT has forced-threaded interrupts enabled and every NAPI-callback runs in a thread. If each thread has its own data structure then locking can be avoided. Create a struct bpf_net_context which contains struct bpf_redirect_info. Define the variable on stack, use bpf_net_ctx_set() to save a pointer to it, bpf_net_ctx_clear() removes it again. The bpf_net_ctx_set() may nest. For instance a function can be used from within NET_RX_SOFTIRQ/ net_rx_action which uses bpf_net_ctx_set() and NET_TX_SOFTIRQ which does not. Therefore only the first invocations updates the pointer. Use bpf_net_ctx_get_ri() as a wrapper to retrieve the current struct bpf_redirect_info. The returned data structure is zero initialized to ensure nothing is leaked from stack. This is done on first usage of the struct. bpf_net_ctx_set() sets bpf_redirect_info::kern_flags to 0 to note that initialisation is required. First invocation of bpf_net_ctx_get_ri() will memset() the data structure and update bpf_redirect_info::kern_flags. bpf_redirect_info::nh is excluded from memset because it is only used once BPF_F_NEIGH is set which also sets the nh member. The kern_flags is moved past nh to exclude it from memset. The pointer to bpf_net_context is saved task's task_struct. Using always the bpf_net_context approach has the advantage that there is almost zero differences between PREEMPT_RT and non-PREEMPT_RT builds. Cc: Andrii Nakryiko <andrii@kernel.org> Cc: Eduard Zingerman <eddyz87@gmail.com> Cc: Hao Luo <haoluo@google.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: John Fastabend <john.fastabend@gmail.com> Cc: KP Singh <kpsingh@kernel.org> Cc: Martin KaFai Lau <martin.lau@linux.dev> Cc: Song Liu <song@kernel.org> Cc: Stanislav Fomichev <sdf@google.com> Cc: Yonghong Song <yonghong.song@linux.dev> Acked-by: Alexei Starovoitov <ast@kernel.org> Acked-by: Jesper Dangaard Brouer <hawk@kernel.org> Reviewed-by: Toke Høiland-Jørgensen <toke@redhat.com> Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> Link: https://patch.msgid.link/20240620132727.660738-15-bigeasy@linutronix.de Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2024-06-24bpf: Fix may_goto with negative offset.Alexei Starovoitov
Zac's syzbot crafted a bpf prog that exposed two bugs in may_goto. The 1st bug is the way may_goto is patched. When offset is negative it should be patched differently. The 2nd bug is in the verifier: when current state may_goto_depth is equal to visited state may_goto_depth it means there is an actual infinite loop. It's not correct to prune exploration of the program at this point. Note, that this check doesn't limit the program to only one may_goto insn, since 2nd and any further may_goto will increment may_goto_depth only in the queued state pushed for future exploration. The current state will have may_goto_depth == 0 regardless of number of may_goto insns and the verifier has to explore the program until bpf_exit. Fixes: 011832b97b31 ("bpf: Introduce may_goto instruction") Reported-by: Zac Ecob <zacecob@protonmail.com> Signed-off-by: Alexei Starovoitov <ast@kernel.org> Signed-off-by: Andrii Nakryiko <andrii@kernel.org> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> Acked-by: Eduard Zingerman <eddyz87@gmail.com> Closes: https://lore.kernel.org/bpf/CAADnVQL-15aNp04-cyHRn47Yv61NXfYyhopyZtUyxNojUZUXpA@mail.gmail.com/ Link: https://lore.kernel.org/bpf/20240619235355.85031-1-alexei.starovoitov@gmail.com
2024-06-23bpf: fix build when CONFIG_DEBUG_INFO_BTF[_MODULES] is undefinedAlan Maguire
Kernel test robot reports that kernel build fails with resilient split BTF changes. Examining the associated config and code we see that btf_relocate_id() is defined under CONFIG_DEBUG_INFO_BTF_MODULES. Moving it outside the #ifdef solves the issue. Reported-by: kernel test robot <lkp@intel.com> Closes: https://lore.kernel.org/oe-kbuild-all/202406221742.d2srFLVI-lkp@intel.com/ Signed-off-by: Alan Maguire <alan.maguire@oracle.com> Link: https://lore.kernel.org/r/20240623135224.27981-1-alan.maguire@oracle.com Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2024-06-21libbpf,bpf: Share BTF relocate-related code with kernelAlan Maguire
Share relocation implementation with the kernel. As part of this, we also need the type/string iteration functions so also share btf_iter.c file. Relocation code in kernel and userspace is identical save for the impementation of the reparenting of split BTF to the relocated base BTF and retrieval of the BTF header from "struct btf"; these small functions need separate user-space and kernel implementations for the separate "struct btf"s they operate upon. One other wrinkle on the kernel side is we have to map .BTF.ids in modules as they were generated with the type ids used at BTF encoding time. btf_relocate() optionally returns an array mapping from old BTF ids to relocated ids, so we use that to fix up these references where needed for kfuncs. Signed-off-by: Alan Maguire <alan.maguire@oracle.com> Signed-off-by: Andrii Nakryiko <andrii@kernel.org> Acked-by: Eduard Zingerman <eddyz87@gmail.com> Link: https://lore.kernel.org/bpf/20240620091733.1967885-5-alan.maguire@oracle.com
2024-06-21bpf: Fix overrunning reservations in ringbufDaniel Borkmann
The BPF ring buffer internally is implemented as a power-of-2 sized circular buffer, with two logical and ever-increasing counters: consumer_pos is the consumer counter to show which logical position the consumer consumed the data, and producer_pos which is the producer counter denoting the amount of data reserved by all producers. Each time a record is reserved, the producer that "owns" the record will successfully advance producer counter. In user space each time a record is read, the consumer of the data advanced the consumer counter once it finished processing. Both counters are stored in separate pages so that from user space, the producer counter is read-only and the consumer counter is read-write. One aspect that simplifies and thus speeds up the implementation of both producers and consumers is how the data area is mapped twice contiguously back-to-back in the virtual memory, allowing to not take any special measures for samples that have to wrap around at the end of the circular buffer data area, because the next page after the last data page would be first data page again, and thus the sample will still appear completely contiguous in virtual memory. Each record has a struct bpf_ringbuf_hdr { u32 len; u32 pg_off; } header for book-keeping the length and offset, and is inaccessible to the BPF program. Helpers like bpf_ringbuf_reserve() return `(void *)hdr + BPF_RINGBUF_HDR_SZ` for the BPF program to use. Bing-Jhong and Muhammad reported that it is however possible to make a second allocated memory chunk overlapping with the first chunk and as a result, the BPF program is now able to edit first chunk's header. For example, consider the creation of a BPF_MAP_TYPE_RINGBUF map with size of 0x4000. Next, the consumer_pos is modified to 0x3000 /before/ a call to bpf_ringbuf_reserve() is made. This will allocate a chunk A, which is in [0x0,0x3008], and the BPF program is able to edit [0x8,0x3008]. Now, lets allocate a chunk B with size 0x3000. This will succeed because consumer_pos was edited ahead of time to pass the `new_prod_pos - cons_pos > rb->mask` check. Chunk B will be in range [0x3008,0x6010], and the BPF program is able to edit [0x3010,0x6010]. Due to the ring buffer memory layout mentioned earlier, the ranges [0x0,0x4000] and [0x4000,0x8000] point to the same data pages. This means that chunk B at [0x4000,0x4008] is chunk A's header. bpf_ringbuf_submit() / bpf_ringbuf_discard() use the header's pg_off to then locate the bpf_ringbuf itself via bpf_ringbuf_restore_from_rec(). Once chunk B modified chunk A's header, then bpf_ringbuf_commit() refers to the wrong page and could cause a crash. Fix it by calculating the oldest pending_pos and check whether the range from the oldest outstanding record to the newest would span beyond the ring buffer size. If that is the case, then reject the request. We've tested with the ring buffer benchmark in BPF selftests (./benchs/run_bench_ringbufs.sh) before/after the fix and while it seems a bit slower on some benchmarks, it is still not significantly enough to matter. Fixes: 457f44363a88 ("bpf: Implement BPF ring buffer and verifier support for it") Reported-by: Bing-Jhong Billy Jheng <billy@starlabs.sg> Reported-by: Muhammad Ramdhan <ramdhan@starlabs.sg> Co-developed-by: Bing-Jhong Billy Jheng <billy@starlabs.sg> Co-developed-by: Andrii Nakryiko <andrii@kernel.org> Signed-off-by: Bing-Jhong Billy Jheng <billy@starlabs.sg> Signed-off-by: Andrii Nakryiko <andrii@kernel.org> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> Signed-off-by: Andrii Nakryiko <andrii@kernel.org> Link: https://lore.kernel.org/bpf/20240621140828.18238-1-daniel@iogearbox.net
2024-06-21bpf: Fix the corner case with may_goto and jump to the 1st insn.Alexei Starovoitov
When the following program is processed by the verifier: L1: may_goto L2 goto L1 L2: w0 = 0 exit the may_goto insn is first converted to: L1: r11 = *(u64 *)(r10 -8) if r11 == 0x0 goto L2 r11 -= 1 *(u64 *)(r10 -8) = r11 goto L1 L2: w0 = 0 exit then later as the last step the verifier inserts: *(u64 *)(r10 -8) = BPF_MAX_LOOPS as the first insn of the program to initialize loop count. When the first insn happens to be a branch target of some jmp the bpf_patch_insn_data() logic will produce: L1: *(u64 *)(r10 -8) = BPF_MAX_LOOPS r11 = *(u64 *)(r10 -8) if r11 == 0x0 goto L2 r11 -= 1 *(u64 *)(r10 -8) = r11 goto L1 L2: w0 = 0 exit because instruction patching adjusts all jmps and calls, but for this particular corner case it's incorrect and the L1 label should be one instruction down, like: *(u64 *)(r10 -8) = BPF_MAX_LOOPS L1: r11 = *(u64 *)(r10 -8) if r11 == 0x0 goto L2 r11 -= 1 *(u64 *)(r10 -8) = r11 goto L1 L2: w0 = 0 exit and that's what this patch is fixing. After bpf_patch_insn_data() call adjust_jmp_off() to adjust all jmps that point to newly insert BPF_ST insn to point to insn after. Note that bpf_patch_insn_data() cannot easily be changed to accommodate this logic, since jumps that point before or after a sequence of patched instructions have to be adjusted with the full length of the patch. Conceptually it's somewhat similar to "insert" of instructions between other instructions with weird semantics. Like "insert" before 1st insn would require adjustment of CALL insns to point to newly inserted 1st insn, but not an adjustment JMP insns that point to 1st, yet still adjusting JMP insns that cross over 1st insn (point to insn before or insn after), hence use simple adjust_jmp_off() logic to fix this corner case. Ideally bpf_patch_insn_data() would have an auxiliary info to say where 'the start of newly inserted patch is', but it would be too complex for backport. Fixes: 011832b97b31 ("bpf: Introduce may_goto instruction") Reported-by: Zac Ecob <zacecob@protonmail.com> Signed-off-by: Alexei Starovoitov <ast@kernel.org> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> Acked-by: Eduard Zingerman <eddyz87@gmail.com> Closes: https://lore.kernel.org/bpf/CAADnVQJ_WWx8w4b=6Gc2EpzAjgv+6A0ridnMz2TvS2egj4r3Gw@mail.gmail.com/ Link: https://lore.kernel.org/bpf/20240619011859.79334-1-alexei.starovoitov@gmail.com
2024-06-21bpf: Add security_file_post_open() LSM hook to sleepable_lsm_hooksMatt Bobrowski
The new generic LSM hook security_file_post_open() was recently added to the LSM framework in commit 8f46ff5767b0b ("security: Introduce file_post_open hook"). Let's proactively add this generic LSM hook to the sleepable_lsm_hooks BTF ID set, because I can't see there being any strong reasons not to, and it's only a matter of time before someone else comes around and asks for it to be there. security_file_post_open() is inherently sleepable as it's purposely situated in the kernel that allows LSMs to directly read out the contents of the backing file if need be. Additionally, it's called directly after security_file_open(), and that LSM hook in itself already exists in the sleepable_lsm_hooks BTF ID set. Signed-off-by: Matt Bobrowski <mattbobrowski@google.com> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> Link: https://lore.kernel.org/bpf/20240618192923.379852-1-mattbobrowski@google.com
2024-06-20bpf: remove redeclaration of new_n in bpf_verifier_vlogRafael Passos
This new_n is defined in the start of this function. Its value is overwritten by `new_n = min(n, log->len_total);` a couple lines before my change, rendering the shadow declaration unnecessary. Signed-off-by: Rafael Passos <rafael@rcpassos.me> Link: https://lore.kernel.org/r/20240615022641.210320-4-rafael@rcpassos.me Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2024-06-20bpf: remove unused parameter in __bpf_free_used_btfsRafael Passos
Fixes a compiler warning. The __bpf_free_used_btfs function was taking an extra unused struct bpf_prog_aux *aux param Signed-off-by: Rafael Passos <rafael@rcpassos.me> Link: https://lore.kernel.org/r/20240615022641.210320-3-rafael@rcpassos.me Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2024-06-20bpf: remove unused parameter in bpf_jit_binary_pack_finalizeRafael Passos
Fixes a compiler warning. the bpf_jit_binary_pack_finalize function was taking an extra bpf_prog parameter that went unused. This removves it and updates the callers accordingly. Signed-off-by: Rafael Passos <rafael@rcpassos.me> Link: https://lore.kernel.org/r/20240615022641.210320-2-rafael@rcpassos.me Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2024-06-20bpf, verifier: Correct tail_call_reachable for bpf progLeon Hwang
It's confusing to inspect 'prog->aux->tail_call_reachable' with drgn[0], when bpf prog has tail call but 'tail_call_reachable' is false. This patch corrects 'tail_call_reachable' when bpf prog has tail call. Signed-off-by: Leon Hwang <hffilwlqm@gmail.com> Link: https://lore.kernel.org/r/20240610124224.34673-2-hffilwlqm@gmail.com Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2024-06-20Merge git://git.kernel.org/pub/scm/linux/kernel/git/netdev/netJakub Kicinski
Cross-merge networking fixes after downstream PR. Conflicts: drivers/net/ethernet/broadcom/bnxt/bnxt.c 1e7962114c10 ("bnxt_en: Restore PTP tx_avail count in case of skb_pad() error") 165f87691a89 ("bnxt_en: add timestamping statistics support") No adjacent changes. Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2024-06-18bpf: Fix remap of arena.Alexei Starovoitov
The bpf arena logic didn't account for mremap operation. Add a refcnt for multiple mmap events to prevent use-after-free in arena_vm_close. Fixes: 317460317a02 ("bpf: Introduce bpf_arena.") Reported-by: Pengfei Xu <pengfei.xu@intel.com> Signed-off-by: Alexei Starovoitov <ast@kernel.org> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> Reviewed-by: Barret Rhoden <brho@google.com> Tested-by: Pengfei Xu <pengfei.xu@intel.com> Closes: https://lore.kernel.org/bpf/Zmuw29IhgyPNKnIM@xpf.sh.intel.com Link: https://lore.kernel.org/bpf/20240617171812.76634-1-alexei.starovoitov@gmail.com
2024-06-17bpf: Add missed var_off setting in coerce_subreg_to_size_sx()Yonghong Song
In coerce_subreg_to_size_sx(), for the case where upper sign extension bits are the same for smax32 and smin32 values, we missed to setup properly. This is especially problematic if both smax32 and smin32's sign extension bits are 1. The following is a simple example illustrating the inconsistent verifier states due to missed var_off: 0: (85) call bpf_get_prandom_u32#7 ; R0_w=scalar() 1: (bf) r3 = r0 ; R0_w=scalar(id=1) R3_w=scalar(id=1) 2: (57) r3 &= 15 ; R3_w=scalar(smin=smin32=0,smax=umax=smax32=umax32=15,var_off=(0x0; 0xf)) 3: (47) r3 |= 128 ; R3_w=scalar(smin=umin=smin32=umin32=128,smax=umax=smax32=umax32=143,var_off=(0x80; 0xf)) 4: (bc) w7 = (s8)w3 REG INVARIANTS VIOLATION (alu): range bounds violation u64=[0xffffff80, 0x8f] s64=[0xffffff80, 0x8f] u32=[0xffffff80, 0x8f] s32=[0x80, 0xffffff8f] var_off=(0x80, 0xf) The var_off=(0x80, 0xf) is not correct, and the correct one should be var_off=(0xffffff80; 0xf) since from insn 3, we know that at insn 4, the sign extension bits will be 1. This patch fixed this issue by setting var_off properly. Fixes: 8100928c8814 ("bpf: Support new sign-extension mov insns") Signed-off-by: Yonghong Song <yonghong.song@linux.dev> Link: https://lore.kernel.org/r/20240615174632.3995278-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2024-06-17bpf: Add missed var_off setting in set_sext32_default_val()Yonghong Song
Zac reported a verification failure and Alexei reproduced the issue with a simple reproducer ([1]). The verification failure is due to missed setting for var_off. The following is the reproducer in [1]: 0: R1=ctx() R10=fp0 0: (71) r3 = *(u8 *)(r10 -387) ; R3_w=scalar(smin=smin32=0,smax=umax=smax32=umax32=255,var_off=(0x0; 0xff)) R10=fp0 1: (bc) w7 = (s8)w3 ; R3_w=scalar(smin=smin32=0,smax=umax=smax32=umax32=255,var_off=(0x0; 0xff)) R7_w=scalar(smin=smin32=0,smax=umax=smax32=umax32=127,var_off=(0x0; 0x7f)) 2: (36) if w7 >= 0x2533823b goto pc-3 mark_precise: frame0: last_idx 2 first_idx 0 subseq_idx -1 mark_precise: frame0: regs=r7 stack= before 1: (bc) w7 = (s8)w3 mark_precise: frame0: regs=r3 stack= before 0: (71) r3 = *(u8 *)(r10 -387) 2: R7_w=scalar(smin=smin32=0,smax=umax=smax32=umax32=127,var_off=(0x0; 0x7f)) 3: (b4) w0 = 0 ; R0_w=0 4: (95) exit Note that after insn 1, the var_off for R7 is (0x0; 0x7f). This is not correct since upper 24 bits of w7 could be 0 or 1. So correct var_off should be (0x0; 0xffffffff). Missing var_off setting in set_sext32_default_val() caused later incorrect analysis in zext_32_to_64(dst_reg) and reg_bounds_sync(dst_reg). To fix the issue, set var_off correctly in set_sext32_default_val(). The correct reg state after insn 1 becomes: 1: (bc) w7 = (s8)w3 ; R3_w=scalar(smin=smin32=0,smax=umax=smax32=umax32=255,var_off=(0x0; 0xff)) R7_w=scalar(smin=0,smax=umax=0xffffffff,smin32=-128,smax32=127,var_off=(0x0; 0xffffffff)) and at insn 2, the verifier correctly determines either branch is possible. [1] https://lore.kernel.org/bpf/CAADnVQLPU0Shz7dWV4bn2BgtGdxN3uFHPeobGBA72tpg5Xoykw@mail.gmail.com/ Fixes: 8100928c8814 ("bpf: Support new sign-extension mov insns") Reported-by: Zac Ecob <zacecob@protonmail.com> Signed-off-by: Yonghong Song <yonghong.song@linux.dev> Link: https://lore.kernel.org/r/20240615174626.3994813-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2024-06-14bpf: Track delta between "linked" registers.Alexei Starovoitov
Compilers can generate the code r1 = r2 r1 += 0x1 if r2 < 1000 goto ... use knowledge of r2 range in subsequent r1 operations So remember constant delta between r2 and r1 and update r1 after 'if' condition. Unfortunately LLVM still uses this pattern for loops with 'can_loop' construct: for (i = 0; i < 1000 && can_loop; i++) The "undo" pass was introduced in LLVM https://reviews.llvm.org/D121937 to prevent this optimization, but it cannot cover all cases. Instead of fighting middle end optimizer in BPF backend teach the verifier about this pattern. Signed-off-by: Alexei Starovoitov <ast@kernel.org> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> Acked-by: Eduard Zingerman <eddyz87@gmail.com> Link: https://lore.kernel.org/bpf/20240613013815.953-3-alexei.starovoitov@gmail.com
2024-06-13bpf: crypto: make state and IV dynptr nullableVadim Fedorenko
Some ciphers do not require state and IV buffer, but with current implementation 0-sized dynptr is always needed. With adjustment to verifier we can provide NULL instead of 0-sized dynptr. Make crypto kfuncs ready for this. Reviewed-by: Eduard Zingerman <eddyz87@gmail.com> Signed-off-by: Vadim Fedorenko <vadfed@meta.com> Link: https://lore.kernel.org/r/20240613211817.1551967-3-vadfed@meta.com Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2024-06-13bpf: verifier: make kfuncs args nullalbleVadim Fedorenko
Some arguments to kfuncs might be NULL in some cases. But currently it's not possible to pass NULL to any BTF structures because the check for the suffix is located after all type checks. Move it to earlier place to allow nullable args. Acked-by: Eduard Zingerman <eddyz87@gmail.com> Signed-off-by: Vadim Fedorenko <vadfed@meta.com> Link: https://lore.kernel.org/r/20240613211817.1551967-2-vadfed@meta.com Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2024-06-13bpf: fix UML x86_64 compile failureMaciej Żenczykowski
pcpu_hot (defined in arch/x86) is not available on user mode linux (ARCH=um) Cc: Andrii Nakryiko <andrii@kernel.org> Cc: John Fastabend <john.fastabend@gmail.com> Cc: Alexei Starovoitov <ast@kernel.org> Fixes: 1ae6921009e5 ("bpf: inline bpf_get_smp_processor_id() helper") Signed-off-by: Maciej Żenczykowski <maze@google.com> Link: https://lore.kernel.org/r/20240613173146.2524647-1-maze@google.com Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2024-06-13bpf: Fix bpf_dynptr documentation commentsDaniel Xu
The function argument names were changed but the doc comment was not. Fix htmldocs build warning by updating doc comments. Fixes: cce4c40b9606 ("bpf: treewide: Align kfunc signatures to prog point-of-view") Signed-off-by: Daniel Xu <dxu@dxuuu.xyz> Link: https://lore.kernel.org/r/d0b0eb05f91e12e5795966153b11998d3fc1d433.1718295425.git.dxu@dxuuu.xyz Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2024-06-13bpf: Reduce stack consumption in check_stack_write_fixed_offDaniel Borkmann
The fake_reg moved into env->fake_reg given it consumes a lot of stack space (120 bytes). Migrate the fake_reg in check_stack_write_fixed_off() as well now that we have it. Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> Link: https://lore.kernel.org/r/20240613115310.25383-2-daniel@iogearbox.net Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2024-06-13bpf: Fix reg_set_min_max corruption of fake_regDaniel Borkmann
Juan reported that after doing some changes to buzzer [0] and implementing a new fuzzing strategy guided by coverage, they noticed the following in one of the probes: [...] 13: (79) r6 = *(u64 *)(r0 +0) ; R0=map_value(ks=4,vs=8) R6_w=scalar() 14: (b7) r0 = 0 ; R0_w=0 15: (b4) w0 = -1 ; R0_w=0xffffffff 16: (74) w0 >>= 1 ; R0_w=0x7fffffff 17: (5c) w6 &= w0 ; R0_w=0x7fffffff R6_w=scalar(smin=smin32=0,smax=umax=umax32=0x7fffffff,var_off=(0x0; 0x7fffffff)) 18: (44) w6 |= 2 ; R6_w=scalar(smin=umin=smin32=umin32=2,smax=umax=umax32=0x7fffffff,var_off=(0x2; 0x7ffffffd)) 19: (56) if w6 != 0x7ffffffd goto pc+1 REG INVARIANTS VIOLATION (true_reg2): range bounds violation u64=[0x7fffffff, 0x7ffffffd] s64=[0x7fffffff, 0x7ffffffd] u32=[0x7fffffff, 0x7ffffffd] s32=[0x7fffffff, 0x7ffffffd] var_off=(0x7fffffff, 0x0) REG INVARIANTS VIOLATION (false_reg1): range bounds violation u64=[0x7fffffff, 0x7ffffffd] s64=[0x7fffffff, 0x7ffffffd] u32=[0x7fffffff, 0x7ffffffd] s32=[0x7fffffff, 0x7ffffffd] var_off=(0x7fffffff, 0x0) REG INVARIANTS VIOLATION (false_reg2): const tnum out of sync with range bounds u64=[0x0, 0xffffffffffffffff] s64=[0x8000000000000000, 0x7fffffffffffffff] u32=[0x0, 0xffffffff] s32=[0x80000000, 0x7fffffff] var_off=(0x7fffffff, 0x0) 19: R6_w=0x7fffffff 20: (95) exit from 19 to 21: R0=0x7fffffff R6=scalar(smin=umin=smin32=umin32=2,smax=umax=smax32=umax32=0x7ffffffe,var_off=(0x2; 0x7ffffffd)) R7=map_ptr(ks=4,vs=8) R9=ctx() R10=fp0 fp-24=map_ptr(ks=4,vs=8) fp-40=mmmmmmmm 21: R0=0x7fffffff R6=scalar(smin=umin=smin32=umin32=2,smax=umax=smax32=umax32=0x7ffffffe,var_off=(0x2; 0x7ffffffd)) R7=map_ptr(ks=4,vs=8) R9=ctx() R10=fp0 fp-24=map_ptr(ks=4,vs=8) fp-40=mmmmmmmm 21: (14) w6 -= 2147483632 ; R6_w=scalar(smin=umin=umin32=2,smax=umax=0xffffffff,smin32=0x80000012,smax32=14,var_off=(0x2; 0xfffffffd)) 22: (76) if w6 s>= 0xe goto pc+1 ; R6_w=scalar(smin=umin=umin32=2,smax=umax=0xffffffff,smin32=0x80000012,smax32=13,var_off=(0x2; 0xfffffffd)) 23: (95) exit from 22 to 24: R0=0x7fffffff R6_w=14 R7=map_ptr(ks=4,vs=8) R9=ctx() R10=fp0 fp-24=map_ptr(ks=4,vs=8) fp-40=mmmmmmmm 24: R0=0x7fffffff R6_w=14 R7=map_ptr(ks=4,vs=8) R9=ctx() R10=fp0 fp-24=map_ptr(ks=4,vs=8) fp-40=mmmmmmmm 24: (14) w6 -= 14 ; R6_w=0 [...] What can be seen here is a register invariant violation on line 19. After the binary-or in line 18, the verifier knows that bit 2 is set but knows nothing about the rest of the content which was loaded from a map value, meaning, range is [2,0x7fffffff] with var_off=(0x2; 0x7ffffffd). When in line 19 the verifier analyzes the branch, it splits the register states in reg_set_min_max() into the registers of the true branch (true_reg1, true_reg2) and the registers of the false branch (false_reg1, false_reg2). Since the test is w6 != 0x7ffffffd, the src_reg is a known constant. Internally, the verifier creates a "fake" register initialized as scalar to the value of 0x7ffffffd, and then passes it onto reg_set_min_max(). Now, for line 19, it is mathematically impossible to take the false branch of this program, yet the verifier analyzes it. It is impossible because the second bit of r6 will be set due to the prior or operation and the constant in the condition has that bit unset (hex(fd) == binary(1111 1101). When the verifier first analyzes the false / fall-through branch, it will compute an intersection between the var_off of r6 and of the constant. This is because the verifier creates a "fake" register initialized to the value of the constant. The intersection result later refines both registers in regs_refine_cond_op(): [...] t = tnum_intersect(tnum_subreg(reg1->var_off), tnum_subreg(reg2->var_off)); reg1->var_off = tnum_with_subreg(reg1->var_off, t); reg2->var_off = tnum_with_subreg(reg2->var_off, t); [...] Since the verifier is analyzing the false branch of the conditional jump, reg1 is equal to false_reg1 and reg2 is equal to false_reg2, i.e. the reg2 is the "fake" register that was meant to hold a constant value. The resulting var_off of the intersection says that both registers now hold a known value of var_off=(0x7fffffff, 0x0) or in other words: this operation manages to make the verifier think that the "constant" value that was passed in the jump operation now holds a different value. Normally this would not be an issue since it should not influence the true branch, however, false_reg2 and true_reg2 are pointers to the same "fake" register. Meaning, the false branch can influence the results of the true branch. In line 24, the verifier assumes R6_w=0, but the actual runtime value in this case is 1. The fix is simply not passing in the same "fake" register location as inputs to reg_set_min_max(), but instead making a copy. Moving the fake_reg into the env also reduces stack consumption by 120 bytes. With this, the verifier successfully rejects invalid accesses from the test program. [0] https://github.com/google/buzzer Fixes: 67420501e868 ("bpf: generalize reg_set_min_max() to handle non-const register comparisons") Reported-by: Juan José López Jaimez <jjlopezjaimez@google.com> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> Reviewed-by: John Fastabend <john.fastabend@gmail.com> Link: https://lore.kernel.org/r/20240613115310.25383-1-daniel@iogearbox.net Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2024-06-12bpf: treewide: Align kfunc signatures to prog point-of-viewDaniel Xu
Previously, kfunc declarations in bpf_kfuncs.h (and others) used "user facing" types for kfuncs prototypes while the actual kfunc definitions used "kernel facing" types. More specifically: bpf_dynptr vs bpf_dynptr_kern, __sk_buff vs sk_buff, and xdp_md vs xdp_buff. It wasn't an issue before, as the verifier allows aliased types. However, since we are now generating kfunc prototypes in vmlinux.h (in addition to keeping bpf_kfuncs.h around), this conflict creates compilation errors. Fix this conflict by using "user facing" types in kfunc definitions. This results in more casts, but otherwise has no additional runtime cost. Note, similar to 5b268d1ebcdc ("bpf: Have bpf_rdonly_cast() take a const pointer"), we also make kfuncs take const arguments where appropriate in order to make the kfunc more permissive. Signed-off-by: Daniel Xu <dxu@dxuuu.xyz> Link: https://lore.kernel.org/r/b58346a63a0e66bc9b7504da751b526b0b189a67.1718207789.git.dxu@dxuuu.xyz Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2024-06-12bpf: verifier: Relax caller requirements for kfunc projection type argsDaniel Xu
Currently, if a kfunc accepts a projection type as an argument (eg struct __sk_buff *), the caller must exactly provide exactly the same type with provable provenance. However in practice, kfuncs that accept projection types _must_ cast to the underlying type before use b/c projection type layouts are completely made up. Thus, it is ok to relax the verifier rules around implicit conversions. We will use this functionality in the next commit when we align kfuncs to user-facing types. Signed-off-by: Daniel Xu <dxu@dxuuu.xyz> Link: https://lore.kernel.org/r/e2c025cb09ccfd4af1ec9e18284dc3cecff7514d.1718207789.git.dxu@dxuuu.xyz Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2024-06-10Merge tag 'for-netdev' of ↵Jakub Kicinski
https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next Daniel Borkmann says: ==================== pull-request: bpf-next 2024-06-06 We've added 54 non-merge commits during the last 10 day(s) which contain a total of 50 files changed, 1887 insertions(+), 527 deletions(-). The main changes are: 1) Add a user space notification mechanism via epoll when a struct_ops object is getting detached/unregistered, from Kui-Feng Lee. 2) Big batch of BPF selftest refactoring for sockmap and BPF congctl tests, from Geliang Tang. 3) Add BTF field (type and string fields, right now) iterator support to libbpf instead of using existing callback-based approaches, from Andrii Nakryiko. 4) Extend BPF selftests for the latter with a new btf_field_iter selftest, from Alan Maguire. 5) Add new kfuncs for a generic, open-coded bits iterator, from Yafang Shao. 6) Fix BPF selftests' kallsyms_find() helper under kernels configured with CONFIG_LTO_CLANG_THIN, from Yonghong Song. 7) Remove a bunch of unused structs in BPF selftests, from David Alan Gilbert. 8) Convert test_sockmap section names into names understood by libbpf so it can deduce program type and attach type, from Jakub Sitnicki. 9) Extend libbpf with the ability to configure log verbosity via LIBBPF_LOG_LEVEL environment variable, from Mykyta Yatsenko. 10) Fix BPF selftests with regards to bpf_cookie and find_vma flakiness in nested VMs, from Song Liu. 11) Extend riscv32/64 JITs to introduce shift/add helpers to generate Zba optimization, from Xiao Wang. 12) Enable BPF programs to declare arrays and struct fields with kptr, bpf_rb_root, and bpf_list_head, from Kui-Feng Lee. * tag 'for-netdev' of https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next: (54 commits) selftests/bpf: Drop useless arguments of do_test in bpf_tcp_ca selftests/bpf: Use start_test in test_dctcp in bpf_tcp_ca selftests/bpf: Use start_test in test_dctcp_fallback in bpf_tcp_ca selftests/bpf: Add start_test helper in bpf_tcp_ca selftests/bpf: Use connect_to_fd_opts in do_test in bpf_tcp_ca libbpf: Auto-attach struct_ops BPF maps in BPF skeleton selftests/bpf: Add btf_field_iter selftests selftests/bpf: Fix send_signal test with nested CONFIG_PARAVIRT libbpf: Remove callback-based type/string BTF field visitor helpers bpftool: Use BTF field iterator in btfgen libbpf: Make use of BTF field iterator in BTF handling code libbpf: Make use of BTF field iterator in BPF linker code libbpf: Add BTF field iterator selftests/bpf: Ignore .llvm.<hash> suffix in kallsyms_find() selftests/bpf: Fix bpf_cookie and find_vma in nested VM selftests/bpf: Test global bpf_list_head arrays. selftests/bpf: Test global bpf_rb_root arrays and fields in nested struct types. selftests/bpf: Test kptr arrays and kptrs in nested struct fields. bpf: limit the number of levels of a nested struct type. bpf: look into the types of the fields of a struct type recursively. ... ==================== Link: https://lore.kernel.org/r/20240606223146.23020-1-daniel@iogearbox.net Signed-off-by: Jakub Kicinski <kuba@kernel.org>