Diffstat (limited to 'kernel/bpf')
-rw-r--r--  kernel/bpf/Makefile            |    2
-rw-r--r--  kernel/bpf/arraymap.c          |   38
-rw-r--r--  kernel/bpf/bpf_insn_array.c    |  304
-rw-r--r--  kernel/bpf/bpf_iter.c          |   29
-rw-r--r--  kernel/bpf/bpf_local_storage.c |  235
-rw-r--r--  kernel/bpf/bpf_lsm.c           |    1
-rw-r--r--  kernel/bpf/bpf_struct_ops.c    |    2
-rw-r--r--  kernel/bpf/cgroup.c            |    8
-rw-r--r--  kernel/bpf/core.c              |   26
-rw-r--r--  kernel/bpf/disasm.c            |    3
-rw-r--r--  kernel/bpf/hashtab.c           |   67
-rw-r--r--  kernel/bpf/helpers.c           |  355
-rw-r--r--  kernel/bpf/liveness.c          |   46
-rw-r--r--  kernel/bpf/log.c               |    3
-rw-r--r--  kernel/bpf/range_tree.c        |   21
-rw-r--r--  kernel/bpf/ringbuf.c           |  116
-rw-r--r--  kernel/bpf/rqspinlock.c        |   90
-rw-r--r--  kernel/bpf/stackmap.c          |   66
-rw-r--r--  kernel/bpf/stream.c            |  162
-rw-r--r--  kernel/bpf/syscall.c           |  106
-rw-r--r--  kernel/bpf/token.c             |   47
-rw-r--r--  kernel/bpf/trampoline.c        |   88
-rw-r--r--  kernel/bpf/verifier.c          | 1001
23 files changed, 1918 insertions, 898 deletions
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 7fd0badfacb1..232cbc97434d 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -9,7 +9,7 @@ CFLAGS_core.o += -Wno-override-init $(cflags-nogcse-yy)
obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o token.o liveness.o
obj-$(CONFIG_BPF_SYSCALL) += bpf_iter.o map_iter.o task_iter.o prog_iter.o link_iter.o
obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o bloom_filter.o
-obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o
+obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o bpf_insn_array.o
obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o
obj-${CONFIG_BPF_LSM} += bpf_inode_storage.o
obj-$(CONFIG_BPF_SYSCALL) += disasm.o mprog.o
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 80b1765a3159..1eeb31c5b317 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -335,18 +335,17 @@ int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value)
}
/* Called from syscall */
-static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+int bpf_array_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
- struct bpf_array *array = container_of(map, struct bpf_array, map);
u32 index = key ? *(u32 *)key : U32_MAX;
u32 *next = (u32 *)next_key;
- if (index >= array->map.max_entries) {
+ if (index >= map->max_entries) {
*next = 0;
return 0;
}
- if (index == array->map.max_entries - 1)
+ if (index == map->max_entries - 1)
return -ENOENT;
*next = index + 1;
@@ -448,19 +447,12 @@ static void array_map_free_internal_structs(struct bpf_map *map)
struct bpf_array *array = container_of(map, struct bpf_array, map);
int i;
- /* We don't reset or free fields other than timer and workqueue
- * on uref dropping to zero.
- */
- if (btf_record_has_field(map->record, BPF_TIMER | BPF_WORKQUEUE | BPF_TASK_WORK)) {
- for (i = 0; i < array->map.max_entries; i++) {
- if (btf_record_has_field(map->record, BPF_TIMER))
- bpf_obj_free_timer(map->record, array_map_elem_ptr(array, i));
- if (btf_record_has_field(map->record, BPF_WORKQUEUE))
- bpf_obj_free_workqueue(map->record, array_map_elem_ptr(array, i));
- if (btf_record_has_field(map->record, BPF_TASK_WORK))
- bpf_obj_free_task_work(map->record, array_map_elem_ptr(array, i));
- }
- }
+ /* We only free internal structs on uref dropping to zero */
+ if (!bpf_map_has_internal_structs(map))
+ return;
+
+ for (i = 0; i < array->map.max_entries; i++)
+ bpf_map_free_internal_structs(map, array_map_elem_ptr(array, i));
}
/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
@@ -796,7 +788,7 @@ const struct bpf_map_ops array_map_ops = {
.map_alloc_check = array_map_alloc_check,
.map_alloc = array_map_alloc,
.map_free = array_map_free,
- .map_get_next_key = array_map_get_next_key,
+ .map_get_next_key = bpf_array_get_next_key,
.map_release_uref = array_map_free_internal_structs,
.map_lookup_elem = array_map_lookup_elem,
.map_update_elem = array_map_update_elem,
@@ -822,7 +814,7 @@ const struct bpf_map_ops percpu_array_map_ops = {
.map_alloc_check = array_map_alloc_check,
.map_alloc = array_map_alloc,
.map_free = array_map_free,
- .map_get_next_key = array_map_get_next_key,
+ .map_get_next_key = bpf_array_get_next_key,
.map_lookup_elem = percpu_array_map_lookup_elem,
.map_gen_lookup = percpu_array_map_gen_lookup,
.map_update_elem = array_map_update_elem,
@@ -1211,7 +1203,7 @@ const struct bpf_map_ops prog_array_map_ops = {
.map_poke_track = prog_array_map_poke_track,
.map_poke_untrack = prog_array_map_poke_untrack,
.map_poke_run = prog_array_map_poke_run,
- .map_get_next_key = array_map_get_next_key,
+ .map_get_next_key = bpf_array_get_next_key,
.map_lookup_elem = fd_array_map_lookup_elem,
.map_delete_elem = fd_array_map_delete_elem,
.map_fd_get_ptr = prog_fd_array_get_ptr,
@@ -1315,7 +1307,7 @@ const struct bpf_map_ops perf_event_array_map_ops = {
.map_alloc_check = fd_array_map_alloc_check,
.map_alloc = array_map_alloc,
.map_free = perf_event_fd_array_map_free,
- .map_get_next_key = array_map_get_next_key,
+ .map_get_next_key = bpf_array_get_next_key,
.map_lookup_elem = fd_array_map_lookup_elem,
.map_delete_elem = fd_array_map_delete_elem,
.map_fd_get_ptr = perf_event_fd_array_get_ptr,
@@ -1351,7 +1343,7 @@ const struct bpf_map_ops cgroup_array_map_ops = {
.map_alloc_check = fd_array_map_alloc_check,
.map_alloc = array_map_alloc,
.map_free = cgroup_fd_array_free,
- .map_get_next_key = array_map_get_next_key,
+ .map_get_next_key = bpf_array_get_next_key,
.map_lookup_elem = fd_array_map_lookup_elem,
.map_delete_elem = fd_array_map_delete_elem,
.map_fd_get_ptr = cgroup_fd_array_get_ptr,
@@ -1436,7 +1428,7 @@ const struct bpf_map_ops array_of_maps_map_ops = {
.map_alloc_check = fd_array_map_alloc_check,
.map_alloc = array_of_map_alloc,
.map_free = array_of_map_free,
- .map_get_next_key = array_map_get_next_key,
+ .map_get_next_key = bpf_array_get_next_key,
.map_lookup_elem = array_of_map_lookup_elem,
.map_delete_elem = fd_array_map_delete_elem,
.map_fd_get_ptr = bpf_map_fd_get_ptr,
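
The refactored array_map_free_internal_structs() above only references the new bpf_map_has_internal_structs()/bpf_map_free_internal_structs() helpers; their definitions live elsewhere in this series. A minimal sketch of what they presumably look like, reconstructed from the open-coded logic removed above (only the names come from this diff, the bodies are an assumption):

static inline bool bpf_map_has_internal_structs(struct bpf_map *map)
{
	/* Assumed: mirrors the btf_record check that used to be open-coded */
	return btf_record_has_field(map->record,
				    BPF_TIMER | BPF_WORKQUEUE | BPF_TASK_WORK);
}

static inline void bpf_map_free_internal_structs(struct bpf_map *map, void *obj)
{
	/* Assumed: per-element cleanup formerly duplicated in each map type */
	if (btf_record_has_field(map->record, BPF_TIMER))
		bpf_obj_free_timer(map->record, obj);
	if (btf_record_has_field(map->record, BPF_WORKQUEUE))
		bpf_obj_free_workqueue(map->record, obj);
	if (btf_record_has_field(map->record, BPF_TASK_WORK))
		bpf_obj_free_task_work(map->record, obj);
}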
diff --git a/kernel/bpf/bpf_insn_array.c b/kernel/bpf/bpf_insn_array.c
new file mode 100644
index 000000000000..c96630cb75bf
--- /dev/null
+++ b/kernel/bpf/bpf_insn_array.c
@@ -0,0 +1,304 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2025 Isovalent */
+
+#include <linux/bpf.h>
+
+struct bpf_insn_array {
+ struct bpf_map map;
+ atomic_t used;
+ long *ips;
+ DECLARE_FLEX_ARRAY(struct bpf_insn_array_value, values);
+};
+
+#define cast_insn_array(MAP_PTR) \
+ container_of((MAP_PTR), struct bpf_insn_array, map)
+
+#define INSN_DELETED ((u32)-1)
+
+static inline u64 insn_array_alloc_size(u32 max_entries)
+{
+ const u64 base_size = sizeof(struct bpf_insn_array);
+ const u64 entry_size = sizeof(struct bpf_insn_array_value);
+
+ return base_size + max_entries * (entry_size + sizeof(long));
+}
+
+static int insn_array_alloc_check(union bpf_attr *attr)
+{
+ u32 value_size = sizeof(struct bpf_insn_array_value);
+
+ if (attr->max_entries == 0 || attr->key_size != 4 ||
+ attr->value_size != value_size || attr->map_flags != 0)
+ return -EINVAL;
+
+ return 0;
+}
+
+static void insn_array_free(struct bpf_map *map)
+{
+ struct bpf_insn_array *insn_array = cast_insn_array(map);
+
+ bpf_map_area_free(insn_array);
+}
+
+static struct bpf_map *insn_array_alloc(union bpf_attr *attr)
+{
+ u64 size = insn_array_alloc_size(attr->max_entries);
+ struct bpf_insn_array *insn_array;
+
+ insn_array = bpf_map_area_alloc(size, NUMA_NO_NODE);
+ if (!insn_array)
+ return ERR_PTR(-ENOMEM);
+
+ /* ips are allocated right after the insn_array->values[] array */
+ insn_array->ips = (void *)&insn_array->values[attr->max_entries];
+
+ bpf_map_init_from_attr(&insn_array->map, attr);
+
+ /* BPF programs aren't allowed to write to the map */
+ insn_array->map.map_flags |= BPF_F_RDONLY_PROG;
+
+ return &insn_array->map;
+}
+
+static void *insn_array_lookup_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_insn_array *insn_array = cast_insn_array(map);
+ u32 index = *(u32 *)key;
+
+ if (unlikely(index >= insn_array->map.max_entries))
+ return NULL;
+
+ return &insn_array->values[index];
+}
+
+static long insn_array_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags)
+{
+ struct bpf_insn_array *insn_array = cast_insn_array(map);
+ u32 index = *(u32 *)key;
+ struct bpf_insn_array_value val = {};
+
+ if (unlikely(index >= insn_array->map.max_entries))
+ return -E2BIG;
+
+ if (unlikely(map_flags & BPF_NOEXIST))
+ return -EEXIST;
+
+ copy_map_value(map, &val, value);
+ if (val.jitted_off || val.xlated_off)
+ return -EINVAL;
+
+ insn_array->values[index].orig_off = val.orig_off;
+
+ return 0;
+}
+
+static long insn_array_delete_elem(struct bpf_map *map, void *key)
+{
+ return -EINVAL;
+}
+
+static int insn_array_check_btf(const struct bpf_map *map,
+ const struct btf *btf,
+ const struct btf_type *key_type,
+ const struct btf_type *value_type)
+{
+ if (!btf_type_is_i32(key_type))
+ return -EINVAL;
+
+ if (!btf_type_is_i64(value_type))
+ return -EINVAL;
+
+ return 0;
+}
+
+static u64 insn_array_mem_usage(const struct bpf_map *map)
+{
+ return insn_array_alloc_size(map->max_entries);
+}
+
+static int insn_array_map_direct_value_addr(const struct bpf_map *map, u64 *imm, u32 off)
+{
+ struct bpf_insn_array *insn_array = cast_insn_array(map);
+
+ if ((off % sizeof(long)) != 0 ||
+ (off / sizeof(long)) >= map->max_entries)
+ return -EINVAL;
+
+ /* from BPF's point of view, this map is a jump table */
+ *imm = (unsigned long)insn_array->ips + off;
+
+ return 0;
+}
+
+BTF_ID_LIST_SINGLE(insn_array_btf_ids, struct, bpf_insn_array)
+
+const struct bpf_map_ops insn_array_map_ops = {
+ .map_alloc_check = insn_array_alloc_check,
+ .map_alloc = insn_array_alloc,
+ .map_free = insn_array_free,
+ .map_get_next_key = bpf_array_get_next_key,
+ .map_lookup_elem = insn_array_lookup_elem,
+ .map_update_elem = insn_array_update_elem,
+ .map_delete_elem = insn_array_delete_elem,
+ .map_check_btf = insn_array_check_btf,
+ .map_mem_usage = insn_array_mem_usage,
+ .map_direct_value_addr = insn_array_map_direct_value_addr,
+ .map_btf_id = &insn_array_btf_ids[0],
+};
+
+static inline bool is_frozen(struct bpf_map *map)
+{
+ guard(mutex)(&map->freeze_mutex);
+
+ return map->frozen;
+}
+
+static bool is_insn_array(const struct bpf_map *map)
+{
+ return map->map_type == BPF_MAP_TYPE_INSN_ARRAY;
+}
+
+static inline bool valid_offsets(const struct bpf_insn_array *insn_array,
+ const struct bpf_prog *prog)
+{
+ u32 off;
+ int i;
+
+ for (i = 0; i < insn_array->map.max_entries; i++) {
+ off = insn_array->values[i].orig_off;
+
+ if (off >= prog->len)
+ return false;
+
+ if (off > 0) {
+ if (prog->insnsi[off-1].code == (BPF_LD | BPF_DW | BPF_IMM))
+ return false;
+ }
+ }
+
+ return true;
+}
+
+int bpf_insn_array_init(struct bpf_map *map, const struct bpf_prog *prog)
+{
+ struct bpf_insn_array *insn_array = cast_insn_array(map);
+ struct bpf_insn_array_value *values = insn_array->values;
+ int i;
+
+ if (!is_frozen(map))
+ return -EINVAL;
+
+ if (!valid_offsets(insn_array, prog))
+ return -EINVAL;
+
+ /*
+ * There can be only one program using the map
+ */
+ if (atomic_xchg(&insn_array->used, 1))
+ return -EBUSY;
+
+ /*
+ * Reset all the map indexes to the original values. This is needed,
+ * e.g., when a replay of verification with different log level should
+ * be performed.
+ */
+ for (i = 0; i < map->max_entries; i++)
+ values[i].xlated_off = values[i].orig_off;
+
+ return 0;
+}
+
+int bpf_insn_array_ready(struct bpf_map *map)
+{
+ struct bpf_insn_array *insn_array = cast_insn_array(map);
+ int i;
+
+ for (i = 0; i < map->max_entries; i++) {
+ if (insn_array->values[i].xlated_off == INSN_DELETED)
+ continue;
+ if (!insn_array->ips[i])
+ return -EFAULT;
+ }
+
+ return 0;
+}
+
+void bpf_insn_array_release(struct bpf_map *map)
+{
+ struct bpf_insn_array *insn_array = cast_insn_array(map);
+
+ atomic_set(&insn_array->used, 0);
+}
+
+void bpf_insn_array_adjust(struct bpf_map *map, u32 off, u32 len)
+{
+ struct bpf_insn_array *insn_array = cast_insn_array(map);
+ int i;
+
+ if (len <= 1)
+ return;
+
+ for (i = 0; i < map->max_entries; i++) {
+ if (insn_array->values[i].xlated_off <= off)
+ continue;
+ if (insn_array->values[i].xlated_off == INSN_DELETED)
+ continue;
+ insn_array->values[i].xlated_off += len - 1;
+ }
+}
+
+void bpf_insn_array_adjust_after_remove(struct bpf_map *map, u32 off, u32 len)
+{
+ struct bpf_insn_array *insn_array = cast_insn_array(map);
+ int i;
+
+ for (i = 0; i < map->max_entries; i++) {
+ if (insn_array->values[i].xlated_off < off)
+ continue;
+ if (insn_array->values[i].xlated_off == INSN_DELETED)
+ continue;
+ if (insn_array->values[i].xlated_off < off + len)
+ insn_array->values[i].xlated_off = INSN_DELETED;
+ else
+ insn_array->values[i].xlated_off -= len;
+ }
+}
+
+/*
+ * This function is called by JITs. The image is the real program
+ * image, the offsets array sets up the xlated -> jitted mapping.
+ * The offsets[xlated] offset should point to the beginning of
+ * the jitted instruction.
+ */
+void bpf_prog_update_insn_ptrs(struct bpf_prog *prog, u32 *offsets, void *image)
+{
+ struct bpf_insn_array *insn_array;
+ struct bpf_map *map;
+ u32 xlated_off;
+ int i, j;
+
+ if (!offsets || !image)
+ return;
+
+ for (i = 0; i < prog->aux->used_map_cnt; i++) {
+ map = prog->aux->used_maps[i];
+ if (!is_insn_array(map))
+ continue;
+
+ insn_array = cast_insn_array(map);
+ for (j = 0; j < map->max_entries; j++) {
+ xlated_off = insn_array->values[j].xlated_off;
+ if (xlated_off == INSN_DELETED)
+ continue;
+ if (xlated_off < prog->aux->subprog_start)
+ continue;
+ xlated_off -= prog->aux->subprog_start;
+ if (xlated_off >= prog->len)
+ continue;
+
+ insn_array->values[j].jitted_off = offsets[xlated_off];
+ insn_array->ips[j] = (long)(image + offsets[xlated_off]);
+ }
+ }
+}
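
From user space, an instruction-array map is created like any array map, populated with the offsets of the target instructions in the program to be loaded, and frozen before that program is loaded: insn_array_alloc_check() requires key_size == 4, value_size == sizeof(struct bpf_insn_array_value) and no flags, and bpf_insn_array_init() rejects a map that is not frozen or that is already bound to another program. A hedged libbpf sketch, assuming BPF_MAP_TYPE_INSN_ARRAY and struct bpf_insn_array_value come from this series' UAPI headers and that offsets[] holds hypothetical jump-target offsets:

#include <unistd.h>
#include <linux/bpf.h>
#include <bpf/bpf.h>

static int make_insn_array(const __u32 *offsets, __u32 n)
{
	struct bpf_insn_array_value val = {};
	__u32 i;
	int fd;

	fd = bpf_map_create(BPF_MAP_TYPE_INSN_ARRAY, "jump_table",
			    sizeof(__u32), sizeof(val), n, NULL);
	if (fd < 0)
		return fd;

	for (i = 0; i < n; i++) {
		val.orig_off = offsets[i];	/* xlated_off/jitted_off must stay 0 */
		if (bpf_map_update_elem(fd, &i, &val, BPF_ANY))
			goto err;
	}

	/* The map must be frozen before the program using it is verified */
	if (bpf_map_freeze(fd))
		goto err;
	return fd;
err:
	close(fd);
	return -1;
}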
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index 6ac35430c573..eec60b57bd3d 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -634,37 +634,24 @@ release_prog:
int bpf_iter_new_fd(struct bpf_link *link)
{
struct bpf_iter_link *iter_link;
- struct file *file;
unsigned int flags;
- int err, fd;
+ int err;
if (link->ops != &bpf_iter_link_lops)
return -EINVAL;
flags = O_RDONLY | O_CLOEXEC;
- fd = get_unused_fd_flags(flags);
- if (fd < 0)
- return fd;
-
- file = anon_inode_getfile("bpf_iter", &bpf_iter_fops, NULL, flags);
- if (IS_ERR(file)) {
- err = PTR_ERR(file);
- goto free_fd;
- }
+
+ FD_PREPARE(fdf, flags, anon_inode_getfile("bpf_iter", &bpf_iter_fops, NULL, flags));
+ if (fdf.err)
+ return fdf.err;
iter_link = container_of(link, struct bpf_iter_link, link);
- err = prepare_seq_file(file, iter_link);
+ err = prepare_seq_file(fd_prepare_file(fdf), iter_link);
if (err)
- goto free_file;
+ return err; /* Automatic cleanup handles fput */
- fd_install(fd, file);
- return fd;
-
-free_file:
- fput(file);
-free_fd:
- put_unused_fd(fd);
- return err;
+ return fd_publish(fdf);
}
struct bpf_prog *bpf_iter_get_info(struct bpf_iter_meta *meta, bool in_stop)
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index b931fbceb54d..e2fe6c32822b 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -73,30 +73,24 @@ static bool selem_linked_to_map(const struct bpf_local_storage_elem *selem)
struct bpf_local_storage_elem *
bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
- void *value, bool charge_mem, bool swap_uptrs, gfp_t gfp_flags)
+ void *value, bool swap_uptrs, gfp_t gfp_flags)
{
struct bpf_local_storage_elem *selem;
- if (charge_mem && mem_charge(smap, owner, smap->elem_size))
+ if (mem_charge(smap, owner, smap->elem_size))
return NULL;
- if (smap->bpf_ma) {
- selem = bpf_mem_cache_alloc_flags(&smap->selem_ma, gfp_flags);
- if (selem)
- /* Keep the original bpf_map_kzalloc behavior
- * before started using the bpf_mem_cache_alloc.
- *
- * No need to use zero_map_value. The bpf_selem_free()
- * only does bpf_mem_cache_free when there is
- * no other bpf prog is using the selem.
- */
- memset(SDATA(selem)->data, 0, smap->map.value_size);
+ if (smap->use_kmalloc_nolock) {
+ selem = bpf_map_kmalloc_nolock(&smap->map, smap->elem_size,
+ __GFP_ZERO, NUMA_NO_NODE);
} else {
selem = bpf_map_kzalloc(&smap->map, smap->elem_size,
gfp_flags | __GFP_NOWARN);
}
if (selem) {
+ RCU_INIT_POINTER(SDATA(selem)->smap, smap);
+
if (value) {
/* No need to call check_and_init_map_value as memory is zero init */
copy_map_value(&smap->map, SDATA(selem)->data, value);
@@ -106,13 +100,12 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
return selem;
}
- if (charge_mem)
- mem_uncharge(smap, owner, smap->elem_size);
+ mem_uncharge(smap, owner, smap->elem_size);
return NULL;
}
-/* rcu tasks trace callback for bpf_ma == false */
+/* rcu tasks trace callback for use_kmalloc_nolock == false */
static void __bpf_local_storage_free_trace_rcu(struct rcu_head *rcu)
{
struct bpf_local_storage *local_storage;
@@ -127,12 +120,23 @@ static void __bpf_local_storage_free_trace_rcu(struct rcu_head *rcu)
kfree_rcu(local_storage, rcu);
}
+/* Handle use_kmalloc_nolock == false */
+static void __bpf_local_storage_free(struct bpf_local_storage *local_storage,
+ bool vanilla_rcu)
+{
+ if (vanilla_rcu)
+ kfree_rcu(local_storage, rcu);
+ else
+ call_rcu_tasks_trace(&local_storage->rcu,
+ __bpf_local_storage_free_trace_rcu);
+}
+
static void bpf_local_storage_free_rcu(struct rcu_head *rcu)
{
struct bpf_local_storage *local_storage;
local_storage = container_of(rcu, struct bpf_local_storage, rcu);
- bpf_mem_cache_raw_free(local_storage);
+ kfree_nolock(local_storage);
}
static void bpf_local_storage_free_trace_rcu(struct rcu_head *rcu)
@@ -143,46 +147,27 @@ static void bpf_local_storage_free_trace_rcu(struct rcu_head *rcu)
call_rcu(rcu, bpf_local_storage_free_rcu);
}
-/* Handle bpf_ma == false */
-static void __bpf_local_storage_free(struct bpf_local_storage *local_storage,
- bool vanilla_rcu)
-{
- if (vanilla_rcu)
- kfree_rcu(local_storage, rcu);
- else
- call_rcu_tasks_trace(&local_storage->rcu,
- __bpf_local_storage_free_trace_rcu);
-}
-
static void bpf_local_storage_free(struct bpf_local_storage *local_storage,
- struct bpf_local_storage_map *smap,
- bool bpf_ma, bool reuse_now)
+ bool reuse_now)
{
if (!local_storage)
return;
- if (!bpf_ma) {
+ if (!local_storage->use_kmalloc_nolock) {
__bpf_local_storage_free(local_storage, reuse_now);
return;
}
- if (!reuse_now) {
- call_rcu_tasks_trace(&local_storage->rcu,
- bpf_local_storage_free_trace_rcu);
+ if (reuse_now) {
+ call_rcu(&local_storage->rcu, bpf_local_storage_free_rcu);
return;
}
- if (smap)
- bpf_mem_cache_free(&smap->storage_ma, local_storage);
- else
- /* smap could be NULL if the selem that triggered
- * this 'local_storage' creation had been long gone.
- * In this case, directly do call_rcu().
- */
- call_rcu(&local_storage->rcu, bpf_local_storage_free_rcu);
+ call_rcu_tasks_trace(&local_storage->rcu,
+ bpf_local_storage_free_trace_rcu);
}
-/* rcu tasks trace callback for bpf_ma == false */
+/* rcu tasks trace callback for use_kmalloc_nolock == false */
static void __bpf_selem_free_trace_rcu(struct rcu_head *rcu)
{
struct bpf_local_storage_elem *selem;
@@ -194,7 +179,7 @@ static void __bpf_selem_free_trace_rcu(struct rcu_head *rcu)
kfree_rcu(selem, rcu);
}
-/* Handle bpf_ma == false */
+/* Handle use_kmalloc_nolock == false */
static void __bpf_selem_free(struct bpf_local_storage_elem *selem,
bool vanilla_rcu)
{
@@ -216,7 +201,7 @@ static void bpf_selem_free_rcu(struct rcu_head *rcu)
migrate_disable();
bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
migrate_enable();
- bpf_mem_cache_raw_free(selem);
+ kfree_nolock(selem);
}
static void bpf_selem_free_trace_rcu(struct rcu_head *rcu)
@@ -228,14 +213,17 @@ static void bpf_selem_free_trace_rcu(struct rcu_head *rcu)
}
void bpf_selem_free(struct bpf_local_storage_elem *selem,
- struct bpf_local_storage_map *smap,
bool reuse_now)
{
- if (!smap->bpf_ma) {
- /* Only task storage has uptrs and task storage
- * has moved to bpf_mem_alloc. Meaning smap->bpf_ma == true
- * for task storage, so this bpf_obj_free_fields() won't unpin
- * any uptr.
+ struct bpf_local_storage_map *smap;
+
+ smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
+
+ if (!smap->use_kmalloc_nolock) {
+ /*
+ * No uptr will be unpinned even when reuse_now == false since uptr
+ * is only supported in task local storage, where
+ * smap->use_kmalloc_nolock == true.
*/
bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
__bpf_selem_free(selem, reuse_now);
@@ -243,18 +231,11 @@ void bpf_selem_free(struct bpf_local_storage_elem *selem,
}
if (reuse_now) {
- /* reuse_now == true only happens when the storage owner
- * (e.g. task_struct) is being destructed or the map itself
- * is being destructed (ie map_free). In both cases,
- * no bpf prog can have a hold on the selem. It is
- * safe to unpin the uptrs and free the selem now.
- */
- bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
- /* Instead of using the vanilla call_rcu(),
- * bpf_mem_cache_free will be able to reuse selem
- * immediately.
+ /*
+ * While it is okay to call bpf_obj_free_fields() that unpins uptr when
+ * reuse_now == true, keep it in bpf_selem_free_rcu() for simplicity.
*/
- bpf_mem_cache_free(&smap->selem_ma, selem);
+ call_rcu(&selem->rcu, bpf_selem_free_rcu);
return;
}
@@ -264,7 +245,6 @@ void bpf_selem_free(struct bpf_local_storage_elem *selem,
static void bpf_selem_free_list(struct hlist_head *list, bool reuse_now)
{
struct bpf_local_storage_elem *selem;
- struct bpf_local_storage_map *smap;
struct hlist_node *n;
/* The "_safe" iteration is needed.
@@ -272,10 +252,8 @@ static void bpf_selem_free_list(struct hlist_head *list, bool reuse_now)
* but bpf_selem_free will use the selem->rcu_head
* which is union-ized with the selem->free_node.
*/
- hlist_for_each_entry_safe(selem, n, list, free_node) {
- smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
- bpf_selem_free(selem, smap, reuse_now);
- }
+ hlist_for_each_entry_safe(selem, n, list, free_node)
+ bpf_selem_free(selem, reuse_now);
}
/* local_storage->lock must be held and selem->local_storage == local_storage.
@@ -284,7 +262,7 @@ static void bpf_selem_free_list(struct hlist_head *list, bool reuse_now)
*/
static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
struct bpf_local_storage_elem *selem,
- bool uncharge_mem, struct hlist_head *free_selem_list)
+ struct hlist_head *free_selem_list)
{
struct bpf_local_storage_map *smap;
bool free_local_storage;
@@ -297,8 +275,7 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor
* The owner may be freed once the last selem is unlinked
* from local_storage.
*/
- if (uncharge_mem)
- mem_uncharge(smap, owner, smap->elem_size);
+ mem_uncharge(smap, owner, smap->elem_size);
free_local_storage = hlist_is_singular_node(&selem->snode,
&local_storage->list);
@@ -336,47 +313,11 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor
return free_local_storage;
}
-static bool check_storage_bpf_ma(struct bpf_local_storage *local_storage,
- struct bpf_local_storage_map *storage_smap,
- struct bpf_local_storage_elem *selem)
-{
-
- struct bpf_local_storage_map *selem_smap;
-
- /* local_storage->smap may be NULL. If it is, get the bpf_ma
- * from any selem in the local_storage->list. The bpf_ma of all
- * local_storage and selem should have the same value
- * for the same map type.
- *
- * If the local_storage->list is already empty, the caller will not
- * care about the bpf_ma value also because the caller is not
- * responsible to free the local_storage.
- */
-
- if (storage_smap)
- return storage_smap->bpf_ma;
-
- if (!selem) {
- struct hlist_node *n;
-
- n = rcu_dereference_check(hlist_first_rcu(&local_storage->list),
- bpf_rcu_lock_held());
- if (!n)
- return false;
-
- selem = hlist_entry(n, struct bpf_local_storage_elem, snode);
- }
- selem_smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
-
- return selem_smap->bpf_ma;
-}
-
static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem,
bool reuse_now)
{
- struct bpf_local_storage_map *storage_smap;
struct bpf_local_storage *local_storage;
- bool bpf_ma, free_local_storage = false;
+ bool free_local_storage = false;
HLIST_HEAD(selem_free_list);
unsigned long flags;
@@ -386,20 +327,17 @@ static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem,
local_storage = rcu_dereference_check(selem->local_storage,
bpf_rcu_lock_held());
- storage_smap = rcu_dereference_check(local_storage->smap,
- bpf_rcu_lock_held());
- bpf_ma = check_storage_bpf_ma(local_storage, storage_smap, selem);
raw_spin_lock_irqsave(&local_storage->lock, flags);
if (likely(selem_linked_to_storage(selem)))
free_local_storage = bpf_selem_unlink_storage_nolock(
- local_storage, selem, true, &selem_free_list);
+ local_storage, selem, &selem_free_list);
raw_spin_unlock_irqrestore(&local_storage->lock, flags);
bpf_selem_free_list(&selem_free_list, reuse_now);
if (free_local_storage)
- bpf_local_storage_free(local_storage, storage_smap, bpf_ma, reuse_now);
+ bpf_local_storage_free(local_storage, reuse_now);
}
void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage,
@@ -434,7 +372,6 @@ void bpf_selem_link_map(struct bpf_local_storage_map *smap,
unsigned long flags;
raw_spin_lock_irqsave(&b->lock, flags);
- RCU_INIT_POINTER(SDATA(selem)->smap, smap);
hlist_add_head_rcu(&selem->map_node, &b->list);
raw_spin_unlock_irqrestore(&b->lock, flags);
}
@@ -493,8 +430,9 @@ int bpf_local_storage_alloc(void *owner,
if (err)
return err;
- if (smap->bpf_ma)
- storage = bpf_mem_cache_alloc_flags(&smap->storage_ma, gfp_flags);
+ if (smap->use_kmalloc_nolock)
+ storage = bpf_map_kmalloc_nolock(&smap->map, sizeof(*storage),
+ __GFP_ZERO, NUMA_NO_NODE);
else
storage = bpf_map_kzalloc(&smap->map, sizeof(*storage),
gfp_flags | __GFP_NOWARN);
@@ -507,6 +445,7 @@ int bpf_local_storage_alloc(void *owner,
INIT_HLIST_HEAD(&storage->list);
raw_spin_lock_init(&storage->lock);
storage->owner = owner;
+ storage->use_kmalloc_nolock = smap->use_kmalloc_nolock;
bpf_selem_link_storage_nolock(storage, first_selem);
bpf_selem_link_map(smap, first_selem);
@@ -528,22 +467,12 @@ int bpf_local_storage_alloc(void *owner,
bpf_selem_unlink_map(first_selem);
err = -EAGAIN;
goto uncharge;
-
- /* Note that even first_selem was linked to smap's
- * bucket->list, first_selem can be freed immediately
- * (instead of kfree_rcu) because
- * bpf_local_storage_map_free() does a
- * synchronize_rcu_mult (waiting for both sleepable and
- * normal programs) before walking the bucket->list.
- * Hence, no one is accessing selem from the
- * bucket->list under rcu_read_lock().
- */
}
return 0;
uncharge:
- bpf_local_storage_free(storage, smap, smap->bpf_ma, true);
+ bpf_local_storage_free(storage, true);
mem_uncharge(smap, owner, sizeof(*storage));
return err;
}
@@ -582,13 +511,13 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
if (err)
return ERR_PTR(err);
- selem = bpf_selem_alloc(smap, owner, value, true, swap_uptrs, gfp_flags);
+ selem = bpf_selem_alloc(smap, owner, value, swap_uptrs, gfp_flags);
if (!selem)
return ERR_PTR(-ENOMEM);
err = bpf_local_storage_alloc(owner, smap, selem, gfp_flags);
if (err) {
- bpf_selem_free(selem, smap, true);
+ bpf_selem_free(selem, true);
mem_uncharge(smap, owner, smap->elem_size);
return ERR_PTR(err);
}
@@ -616,7 +545,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
/* A lookup has just been done before and concluded a new selem is
* needed. The chance of an unnecessary alloc is unlikely.
*/
- alloc_selem = selem = bpf_selem_alloc(smap, owner, value, true, swap_uptrs, gfp_flags);
+ alloc_selem = selem = bpf_selem_alloc(smap, owner, value, swap_uptrs, gfp_flags);
if (!alloc_selem)
return ERR_PTR(-ENOMEM);
@@ -656,7 +585,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
if (old_sdata) {
bpf_selem_unlink_map(SELEM(old_sdata));
bpf_selem_unlink_storage_nolock(local_storage, SELEM(old_sdata),
- true, &old_selem_free_list);
+ &old_selem_free_list);
}
unlock:
@@ -664,7 +593,7 @@ unlock:
bpf_selem_free_list(&old_selem_free_list, false);
if (alloc_selem) {
mem_uncharge(smap, owner, smap->elem_size);
- bpf_selem_free(alloc_selem, smap, true);
+ bpf_selem_free(alloc_selem, true);
}
return err ? ERR_PTR(err) : SDATA(selem);
}
@@ -730,16 +659,12 @@ int bpf_local_storage_map_check_btf(const struct bpf_map *map,
void bpf_local_storage_destroy(struct bpf_local_storage *local_storage)
{
- struct bpf_local_storage_map *storage_smap;
struct bpf_local_storage_elem *selem;
- bool bpf_ma, free_storage = false;
+ bool free_storage = false;
HLIST_HEAD(free_selem_list);
struct hlist_node *n;
unsigned long flags;
- storage_smap = rcu_dereference_check(local_storage->smap, bpf_rcu_lock_held());
- bpf_ma = check_storage_bpf_ma(local_storage, storage_smap, NULL);
-
/* Neither the bpf_prog nor the bpf_map's syscall
* could be modifying the local_storage->list now.
* Thus, no elem can be added to or deleted from the
@@ -762,14 +687,14 @@ void bpf_local_storage_destroy(struct bpf_local_storage *local_storage)
* of the loop will set the free_cgroup_storage to true.
*/
free_storage = bpf_selem_unlink_storage_nolock(
- local_storage, selem, true, &free_selem_list);
+ local_storage, selem, &free_selem_list);
}
raw_spin_unlock_irqrestore(&local_storage->lock, flags);
bpf_selem_free_list(&free_selem_list, true);
if (free_storage)
- bpf_local_storage_free(local_storage, storage_smap, bpf_ma, true);
+ bpf_local_storage_free(local_storage, true);
}
u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map)
@@ -782,20 +707,10 @@ u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map)
return usage;
}
-/* When bpf_ma == true, the bpf_mem_alloc is used to allocate and free memory.
- * A deadlock free allocator is useful for storage that the bpf prog can easily
- * get a hold of the owner PTR_TO_BTF_ID in any context. eg. bpf_get_current_task_btf.
- * The task and cgroup storage fall into this case. The bpf_mem_alloc reuses
- * memory immediately. To be reuse-immediate safe, the owner destruction
- * code path needs to go through a rcu grace period before calling
- * bpf_local_storage_destroy().
- *
- * When bpf_ma == false, the kmalloc and kfree are used.
- */
struct bpf_map *
bpf_local_storage_map_alloc(union bpf_attr *attr,
struct bpf_local_storage_cache *cache,
- bool bpf_ma)
+ bool use_kmalloc_nolock)
{
struct bpf_local_storage_map *smap;
unsigned int i;
@@ -829,20 +744,9 @@ bpf_local_storage_map_alloc(union bpf_attr *attr,
/* In PREEMPT_RT, kmalloc(GFP_ATOMIC) is still not safe in non
* preemptible context. Thus, enforce all storages to use
- * bpf_mem_alloc when CONFIG_PREEMPT_RT is enabled.
+ * kmalloc_nolock() when CONFIG_PREEMPT_RT is enabled.
*/
- smap->bpf_ma = IS_ENABLED(CONFIG_PREEMPT_RT) ? true : bpf_ma;
- if (smap->bpf_ma) {
- err = bpf_mem_alloc_init(&smap->selem_ma, smap->elem_size, false);
- if (err)
- goto free_smap;
-
- err = bpf_mem_alloc_init(&smap->storage_ma, sizeof(struct bpf_local_storage), false);
- if (err) {
- bpf_mem_alloc_destroy(&smap->selem_ma);
- goto free_smap;
- }
- }
+ smap->use_kmalloc_nolock = IS_ENABLED(CONFIG_PREEMPT_RT) ? true : use_kmalloc_nolock;
smap->cache_idx = bpf_local_storage_cache_idx_get(cache);
return &smap->map;
@@ -912,12 +816,9 @@ void bpf_local_storage_map_free(struct bpf_map *map,
*/
synchronize_rcu();
- if (smap->bpf_ma) {
+ if (smap->use_kmalloc_nolock) {
rcu_barrier_tasks_trace();
- if (!rcu_trace_implies_rcu_gp())
- rcu_barrier();
- bpf_mem_alloc_destroy(&smap->selem_ma);
- bpf_mem_alloc_destroy(&smap->storage_ma);
+ rcu_barrier();
}
kvfree(smap->buckets);
bpf_map_area_free(smap);
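
With the bpf_mem_alloc caches gone, a storage flavor opts into the deadlock-free path simply by passing true for the renamed third argument of bpf_local_storage_map_alloc(), and the allocation is then served by bpf_map_kmalloc_nolock()/kfree_nolock(). A caller-side illustration (assuming the existing task-storage wiring in bpf_task_storage.c is unchanged apart from the parameter's new meaning):

static struct bpf_map *task_storage_map_alloc(union bpf_attr *attr)
{
	/* use_kmalloc_nolock == true: selems and storages can be allocated
	 * from any context, so they must come from kmalloc_nolock()
	 */
	return bpf_local_storage_map_alloc(attr, &task_cache, true);
}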
diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
index 0a59df1c550a..7cb6e8d4282c 100644
--- a/kernel/bpf/bpf_lsm.c
+++ b/kernel/bpf/bpf_lsm.c
@@ -51,6 +51,7 @@ BTF_ID(func, bpf_lsm_key_getsecurity)
BTF_ID(func, bpf_lsm_audit_rule_match)
#endif
BTF_ID(func, bpf_lsm_ismaclabel)
+BTF_ID(func, bpf_lsm_file_alloc_security)
BTF_SET_END(bpf_lsm_disabled_hooks)
/* List of LSM hooks that should operate on 'current' cgroup regardless
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index a41e6730edcf..278490683d28 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -1162,6 +1162,7 @@ bool bpf_struct_ops_get(const void *kdata)
map = __bpf_map_inc_not_zero(&st_map->map, false);
return !IS_ERR(map);
}
+EXPORT_SYMBOL_GPL(bpf_struct_ops_get);
void bpf_struct_ops_put(const void *kdata)
{
@@ -1173,6 +1174,7 @@ void bpf_struct_ops_put(const void *kdata)
bpf_map_put(&st_map->map);
}
+EXPORT_SYMBOL_GPL(bpf_struct_ops_put);
u32 bpf_struct_ops_id(const void *kdata)
{
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 248f517d66d0..69988af44b37 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -1665,7 +1665,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
* returned value != 1 during execution. In all other cases, 0 is returned.
*/
int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
- struct sockaddr *uaddr,
+ struct sockaddr_unsized *uaddr,
int *uaddrlen,
enum cgroup_bpf_attach_type atype,
void *t_ctx,
@@ -1676,7 +1676,7 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
.uaddr = uaddr,
.t_ctx = t_ctx,
};
- struct sockaddr_storage unspec;
+ struct sockaddr_storage storage;
struct cgroup *cgrp;
int ret;
@@ -1688,8 +1688,8 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
return 0;
if (!ctx.uaddr) {
- memset(&unspec, 0, sizeof(unspec));
- ctx.uaddr = (struct sockaddr *)&unspec;
+ memset(&storage, 0, sizeof(storage));
+ ctx.uaddr = (struct sockaddr_unsized *)&storage;
ctx.uaddrlen = 0;
} else {
ctx.uaddrlen = *uaddrlen;
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index d595fe512498..c8ae6ab31651 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1450,6 +1450,23 @@ void bpf_jit_prog_release_other(struct bpf_prog *fp, struct bpf_prog *fp_other)
bpf_prog_clone_free(fp_other);
}
+static void adjust_insn_arrays(struct bpf_prog *prog, u32 off, u32 len)
+{
+#ifdef CONFIG_BPF_SYSCALL
+ struct bpf_map *map;
+ int i;
+
+ if (len <= 1)
+ return;
+
+ for (i = 0; i < prog->aux->used_map_cnt; i++) {
+ map = prog->aux->used_maps[i];
+ if (map->map_type == BPF_MAP_TYPE_INSN_ARRAY)
+ bpf_insn_array_adjust(map, off, len);
+ }
+#endif
+}
+
struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog)
{
struct bpf_insn insn_buff[16], aux[2];
@@ -1505,6 +1522,9 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog)
clone = tmp;
insn_delta = rewritten - 1;
+ /* Instruction arrays must be updated using absolute xlated offsets */
+ adjust_insn_arrays(clone, prog->aux->subprog_start + i, rewritten);
+
/* Walk new program and skip insns we just inserted. */
insn = clone->insnsi + i + insn_delta;
insn_cnt += insn_delta;
@@ -1688,6 +1708,7 @@ bool bpf_opcode_in_insntable(u8 code)
[BPF_LD | BPF_IND | BPF_B] = true,
[BPF_LD | BPF_IND | BPF_H] = true,
[BPF_LD | BPF_IND | BPF_W] = true,
+ [BPF_JMP | BPF_JA | BPF_X] = true,
[BPF_JMP | BPF_JCOND] = true,
};
#undef BPF_INSN_3_TBL
@@ -3129,8 +3150,9 @@ int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to,
return -EFAULT;
}
-int __weak bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
- void *addr1, void *addr2)
+int __weak bpf_arch_text_poke(void *ip, enum bpf_text_poke_type old_t,
+ enum bpf_text_poke_type new_t, void *old_addr,
+ void *new_addr)
{
return -ENOTSUPP;
}
diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c
index 20883c6b1546..f8a3c7eb451e 100644
--- a/kernel/bpf/disasm.c
+++ b/kernel/bpf/disasm.c
@@ -358,6 +358,9 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs,
} else if (insn->code == (BPF_JMP | BPF_JA)) {
verbose(cbs->private_data, "(%02x) goto pc%+d\n",
insn->code, insn->off);
+ } else if (insn->code == (BPF_JMP | BPF_JA | BPF_X)) {
+ verbose(cbs->private_data, "(%02x) gotox r%d\n",
+ insn->code, insn->dst_reg);
} else if (insn->code == (BPF_JMP | BPF_JCOND) &&
insn->src_reg == BPF_MAY_GOTO) {
verbose(cbs->private_data, "(%02x) may_goto pc%+d\n",
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index c2fcd0cd51e5..c8a9b27f8663 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -215,19 +215,6 @@ static bool htab_has_extra_elems(struct bpf_htab *htab)
return !htab_is_percpu(htab) && !htab_is_lru(htab) && !is_fd_htab(htab);
}
-static void htab_free_internal_structs(struct bpf_htab *htab, struct htab_elem *elem)
-{
- if (btf_record_has_field(htab->map.record, BPF_TIMER))
- bpf_obj_free_timer(htab->map.record,
- htab_elem_value(elem, htab->map.key_size));
- if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE))
- bpf_obj_free_workqueue(htab->map.record,
- htab_elem_value(elem, htab->map.key_size));
- if (btf_record_has_field(htab->map.record, BPF_TASK_WORK))
- bpf_obj_free_task_work(htab->map.record,
- htab_elem_value(elem, htab->map.key_size));
-}
-
static void htab_free_prealloced_internal_structs(struct bpf_htab *htab)
{
u32 num_entries = htab->map.max_entries;
@@ -240,7 +227,8 @@ static void htab_free_prealloced_internal_structs(struct bpf_htab *htab)
struct htab_elem *elem;
elem = get_htab_elem(htab, i);
- htab_free_internal_structs(htab, elem);
+ bpf_map_free_internal_structs(&htab->map,
+ htab_elem_value(elem, htab->map.key_size));
cond_resched();
}
}
@@ -669,8 +657,7 @@ static void *__htab_map_lookup_elem(struct bpf_map *map, void *key)
struct htab_elem *l;
u32 hash, key_size;
- WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
- !rcu_read_lock_bh_held());
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
key_size = map->key_size;
@@ -947,15 +934,21 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr,
void *value, bool onallcpus)
{
+ void *ptr;
+
if (!onallcpus) {
/* copy true value_size bytes */
- copy_map_value(&htab->map, this_cpu_ptr(pptr), value);
+ ptr = this_cpu_ptr(pptr);
+ copy_map_value(&htab->map, ptr, value);
+ bpf_obj_free_fields(htab->map.record, ptr);
} else {
u32 size = round_up(htab->map.value_size, 8);
int off = 0, cpu;
for_each_possible_cpu(cpu) {
- copy_map_value_long(&htab->map, per_cpu_ptr(pptr, cpu), value + off);
+ ptr = per_cpu_ptr(pptr, cpu);
+ copy_map_value_long(&htab->map, ptr, value + off);
+ bpf_obj_free_fields(htab->map.record, ptr);
off += size;
}
}
@@ -1098,8 +1091,7 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value,
/* unknown flags */
return -EINVAL;
- WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
- !rcu_read_lock_bh_held());
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
key_size = map->key_size;
@@ -1206,8 +1198,7 @@ static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value
/* unknown flags */
return -EINVAL;
- WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
- !rcu_read_lock_bh_held());
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
key_size = map->key_size;
@@ -1275,8 +1266,7 @@ static long htab_map_update_elem_in_place(struct bpf_map *map, void *key,
/* unknown flags */
return -EINVAL;
- WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
- !rcu_read_lock_bh_held());
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
key_size = map->key_size;
@@ -1338,8 +1328,7 @@ static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
/* unknown flags */
return -EINVAL;
- WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
- !rcu_read_lock_bh_held());
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
key_size = map->key_size;
@@ -1416,8 +1405,7 @@ static long htab_map_delete_elem(struct bpf_map *map, void *key)
u32 hash, key_size;
int ret;
- WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
- !rcu_read_lock_bh_held());
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
key_size = map->key_size;
@@ -1452,8 +1440,7 @@ static long htab_lru_map_delete_elem(struct bpf_map *map, void *key)
u32 hash, key_size;
int ret;
- WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
- !rcu_read_lock_bh_held());
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
key_size = map->key_size;
@@ -1509,8 +1496,9 @@ static void htab_free_malloced_internal_structs(struct bpf_htab *htab)
struct htab_elem *l;
hlist_nulls_for_each_entry(l, n, head, hash_node) {
- /* We only free timer on uref dropping to zero */
- htab_free_internal_structs(htab, l);
+ /* We only free internal structs on uref dropping to zero */
+ bpf_map_free_internal_structs(&htab->map,
+ htab_elem_value(l, htab->map.key_size));
}
cond_resched_rcu();
}
@@ -1521,13 +1509,14 @@ static void htab_map_free_internal_structs(struct bpf_map *map)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
- /* We only free timer and workqueue on uref dropping to zero */
- if (btf_record_has_field(htab->map.record, BPF_TIMER | BPF_WORKQUEUE | BPF_TASK_WORK)) {
- if (!htab_is_prealloc(htab))
- htab_free_malloced_internal_structs(htab);
- else
- htab_free_prealloced_internal_structs(htab);
- }
+ /* We only free internal structs on uref dropping to zero */
+ if (!bpf_map_has_internal_structs(map))
+ return;
+
+ if (htab_is_prealloc(htab))
+ htab_free_prealloced_internal_structs(htab);
+ else
+ htab_free_malloced_internal_structs(htab);
}
/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index c9fab9a356df..db72b96f9c8c 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -28,6 +28,7 @@
#include <linux/verification.h>
#include <linux/task_work.h>
#include <linux/irq_work.h>
+#include <linux/buildid.h>
#include "../../lib/kstrtox.h"
@@ -42,8 +43,7 @@
*/
BPF_CALL_2(bpf_map_lookup_elem, struct bpf_map *, map, void *, key)
{
- WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
- !rcu_read_lock_bh_held());
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
return (unsigned long) map->ops->map_lookup_elem(map, key);
}
@@ -59,8 +59,7 @@ const struct bpf_func_proto bpf_map_lookup_elem_proto = {
BPF_CALL_4(bpf_map_update_elem, struct bpf_map *, map, void *, key,
void *, value, u64, flags)
{
- WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
- !rcu_read_lock_bh_held());
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
return map->ops->map_update_elem(map, key, value, flags);
}
@@ -77,8 +76,7 @@ const struct bpf_func_proto bpf_map_update_elem_proto = {
BPF_CALL_2(bpf_map_delete_elem, struct bpf_map *, map, void *, key)
{
- WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
- !rcu_read_lock_bh_held());
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
return map->ops->map_delete_elem(map, key);
}
@@ -134,8 +132,7 @@ const struct bpf_func_proto bpf_map_peek_elem_proto = {
BPF_CALL_3(bpf_map_lookup_percpu_elem, struct bpf_map *, map, void *, key, u32, cpu)
{
- WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
- !rcu_read_lock_bh_held());
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
return (unsigned long) map->ops->map_lookup_percpu_elem(map, key, cpu);
}
@@ -777,9 +774,11 @@ int bpf_try_get_buffers(struct bpf_bprintf_buffers **bufs)
{
int nest_level;
+ preempt_disable();
nest_level = this_cpu_inc_return(bpf_bprintf_nest_level);
if (WARN_ON_ONCE(nest_level > MAX_BPRINTF_NEST_LEVEL)) {
this_cpu_dec(bpf_bprintf_nest_level);
+ preempt_enable();
return -EBUSY;
}
*bufs = this_cpu_ptr(&bpf_bprintf_bufs[nest_level - 1]);
@@ -792,6 +791,7 @@ void bpf_put_buffers(void)
if (WARN_ON_ONCE(this_cpu_read(bpf_bprintf_nest_level) == 0))
return;
this_cpu_dec(bpf_bprintf_nest_level);
+ preempt_enable();
}
void bpf_bprintf_cleanup(struct bpf_bprintf_data *data)
@@ -1215,13 +1215,20 @@ static void bpf_wq_work(struct work_struct *work)
rcu_read_unlock_trace();
}
+static void bpf_async_cb_rcu_free(struct rcu_head *rcu)
+{
+ struct bpf_async_cb *cb = container_of(rcu, struct bpf_async_cb, rcu);
+
+ kfree_nolock(cb);
+}
+
static void bpf_wq_delete_work(struct work_struct *work)
{
struct bpf_work *w = container_of(work, struct bpf_work, delete_work);
cancel_work_sync(&w->work);
- kfree_rcu(w, cb.rcu);
+ call_rcu(&w->cb.rcu, bpf_async_cb_rcu_free);
}
static void bpf_timer_delete_work(struct work_struct *work)
@@ -1230,13 +1237,13 @@ static void bpf_timer_delete_work(struct work_struct *work)
/* Cancel the timer and wait for callback to complete if it was running.
* If hrtimer_cancel() can be safely called it's safe to call
- * kfree_rcu(t) right after for both preallocated and non-preallocated
+ * call_rcu() right after for both preallocated and non-preallocated
* maps. The async->cb = NULL was already done and no code path can see
* address 't' anymore. Timer if armed for existing bpf_hrtimer before
* bpf_timer_cancel_and_free will have been cancelled.
*/
hrtimer_cancel(&t->timer);
- kfree_rcu(t, cb.rcu);
+ call_rcu(&t->cb.rcu, bpf_async_cb_rcu_free);
}
static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u64 flags,
@@ -1270,11 +1277,7 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u
goto out;
}
- /* Allocate via bpf_map_kmalloc_node() for memcg accounting. Until
- * kmalloc_nolock() is available, avoid locking issues by using
- * __GFP_HIGH (GFP_ATOMIC & ~__GFP_RECLAIM).
- */
- cb = bpf_map_kmalloc_node(map, size, __GFP_HIGH, map->numa_node);
+ cb = bpf_map_kmalloc_nolock(map, size, 0, map->numa_node);
if (!cb) {
ret = -ENOMEM;
goto out;
@@ -1315,7 +1318,7 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u
* or pinned in bpffs.
*/
WRITE_ONCE(async->cb, NULL);
- kfree(cb);
+ kfree_nolock(cb);
ret = -EPERM;
}
out:
@@ -1580,7 +1583,7 @@ void bpf_timer_cancel_and_free(void *val)
* timer _before_ calling us, such that failing to cancel it here will
* cause it to possibly use struct hrtimer after freeing bpf_hrtimer.
* Therefore, we _need_ to cancel any outstanding timers before we do
- * kfree_rcu, even though no more timers can be armed.
+ * call_rcu, even though no more timers can be armed.
*
* Moreover, we need to schedule work even if timer does not belong to
* the calling callback_fn, as on two different CPUs, we can end up in a
@@ -1607,7 +1610,7 @@ void bpf_timer_cancel_and_free(void *val)
* completion.
*/
if (hrtimer_try_to_cancel(&t->timer) >= 0)
- kfree_rcu(t, cb.rcu);
+ call_rcu(&t->cb.rcu, bpf_async_cb_rcu_free);
else
queue_work(system_dfl_wq, &t->cb.delete_work);
} else {
@@ -1657,6 +1660,13 @@ static const struct bpf_func_proto bpf_kptr_xchg_proto = {
.arg2_btf_id = BPF_PTR_POISON,
};
+struct bpf_dynptr_file_impl {
+ struct freader freader;
+ /* 64 bit offset and size overriding 32 bit ones in bpf_dynptr_kern */
+ u64 offset;
+ u64 size;
+};
+
/* Since the upper 8 bits of dynptr->size is reserved, the
* maximum supported size is 2^24 - 1.
*/
@@ -1685,23 +1695,65 @@ static enum bpf_dynptr_type bpf_dynptr_get_type(const struct bpf_dynptr_kern *pt
return (ptr->size & ~(DYNPTR_RDONLY_BIT)) >> DYNPTR_TYPE_SHIFT;
}
-u32 __bpf_dynptr_size(const struct bpf_dynptr_kern *ptr)
+u64 __bpf_dynptr_size(const struct bpf_dynptr_kern *ptr)
{
+ if (bpf_dynptr_get_type(ptr) == BPF_DYNPTR_TYPE_FILE) {
+ struct bpf_dynptr_file_impl *df = ptr->data;
+
+ return df->size;
+ }
+
return ptr->size & DYNPTR_SIZE_MASK;
}
-static void bpf_dynptr_set_size(struct bpf_dynptr_kern *ptr, u32 new_size)
+static void bpf_dynptr_advance_offset(struct bpf_dynptr_kern *ptr, u64 off)
+{
+ if (bpf_dynptr_get_type(ptr) == BPF_DYNPTR_TYPE_FILE) {
+ struct bpf_dynptr_file_impl *df = ptr->data;
+
+ df->offset += off;
+ return;
+ }
+ ptr->offset += off;
+}
+
+static void bpf_dynptr_set_size(struct bpf_dynptr_kern *ptr, u64 new_size)
{
u32 metadata = ptr->size & ~DYNPTR_SIZE_MASK;
- ptr->size = new_size | metadata;
+ if (bpf_dynptr_get_type(ptr) == BPF_DYNPTR_TYPE_FILE) {
+ struct bpf_dynptr_file_impl *df = ptr->data;
+
+ df->size = new_size;
+ return;
+ }
+ ptr->size = (u32)new_size | metadata;
}
-int bpf_dynptr_check_size(u32 size)
+int bpf_dynptr_check_size(u64 size)
{
return size > DYNPTR_MAX_SIZE ? -E2BIG : 0;
}
+static int bpf_file_fetch_bytes(struct bpf_dynptr_file_impl *df, u64 offset, void *buf, u64 len)
+{
+ const void *ptr;
+
+ if (!buf)
+ return -EINVAL;
+
+ df->freader.buf = buf;
+ df->freader.buf_sz = len;
+ ptr = freader_fetch(&df->freader, offset + df->offset, len);
+ if (!ptr)
+ return df->freader.err;
+
+ if (ptr != buf) /* Force copying into the buffer */
+ memcpy(buf, ptr, len);
+
+ return 0;
+}
+
void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data,
enum bpf_dynptr_type type, u32 offset, u32 size)
{
@@ -1716,7 +1768,7 @@ void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr)
memset(ptr, 0, sizeof(*ptr));
}
-BPF_CALL_4(bpf_dynptr_from_mem, void *, data, u32, size, u64, flags, struct bpf_dynptr_kern *, ptr)
+BPF_CALL_4(bpf_dynptr_from_mem, void *, data, u64, size, u64, flags, struct bpf_dynptr_kern *, ptr)
{
int err;
@@ -1751,8 +1803,8 @@ static const struct bpf_func_proto bpf_dynptr_from_mem_proto = {
.arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL | MEM_UNINIT | MEM_WRITE,
};
-static int __bpf_dynptr_read(void *dst, u32 len, const struct bpf_dynptr_kern *src,
- u32 offset, u64 flags)
+static int __bpf_dynptr_read(void *dst, u64 len, const struct bpf_dynptr_kern *src,
+ u64 offset, u64 flags)
{
enum bpf_dynptr_type type;
int err;
@@ -1782,14 +1834,16 @@ static int __bpf_dynptr_read(void *dst, u32 len, const struct bpf_dynptr_kern *s
case BPF_DYNPTR_TYPE_SKB_META:
memmove(dst, bpf_skb_meta_pointer(src->data, src->offset + offset), len);
return 0;
+ case BPF_DYNPTR_TYPE_FILE:
+ return bpf_file_fetch_bytes(src->data, offset, dst, len);
default:
WARN_ONCE(true, "bpf_dynptr_read: unknown dynptr type %d\n", type);
return -EFAULT;
}
}
-BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern *, src,
- u32, offset, u64, flags)
+BPF_CALL_5(bpf_dynptr_read, void *, dst, u64, len, const struct bpf_dynptr_kern *, src,
+ u64, offset, u64, flags)
{
return __bpf_dynptr_read(dst, len, src, offset, flags);
}
@@ -1805,8 +1859,8 @@ static const struct bpf_func_proto bpf_dynptr_read_proto = {
.arg5_type = ARG_ANYTHING,
};
-int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u32 offset, void *src,
- u32 len, u64 flags)
+int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u64 offset, void *src,
+ u64 len, u64 flags)
{
enum bpf_dynptr_type type;
int err;
@@ -1839,18 +1893,16 @@ int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u32 offset, void *src,
return -EINVAL;
return __bpf_xdp_store_bytes(dst->data, dst->offset + offset, src, len);
case BPF_DYNPTR_TYPE_SKB_META:
- if (flags)
- return -EINVAL;
- memmove(bpf_skb_meta_pointer(dst->data, dst->offset + offset), src, len);
- return 0;
+ return __bpf_skb_meta_store_bytes(dst->data, dst->offset + offset, src,
+ len, flags);
default:
WARN_ONCE(true, "bpf_dynptr_write: unknown dynptr type %d\n", type);
return -EFAULT;
}
}
-BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u32, offset, void *, src,
- u32, len, u64, flags)
+BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u64, offset, void *, src,
+ u64, len, u64, flags)
{
return __bpf_dynptr_write(dst, offset, src, len, flags);
}
@@ -1866,7 +1918,7 @@ static const struct bpf_func_proto bpf_dynptr_write_proto = {
.arg5_type = ARG_ANYTHING,
};
-BPF_CALL_3(bpf_dynptr_data, const struct bpf_dynptr_kern *, ptr, u32, offset, u32, len)
+BPF_CALL_3(bpf_dynptr_data, const struct bpf_dynptr_kern *, ptr, u64, offset, u64, len)
{
enum bpf_dynptr_type type;
int err;
@@ -2681,12 +2733,12 @@ __bpf_kfunc struct task_struct *bpf_task_from_vpid(s32 vpid)
* provided buffer, with its contents containing the data, if unable to obtain
* direct pointer)
*/
-__bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u32 offset,
- void *buffer__opt, u32 buffer__szk)
+__bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u64 offset,
+ void *buffer__opt, u64 buffer__szk)
{
const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
enum bpf_dynptr_type type;
- u32 len = buffer__szk;
+ u64 len = buffer__szk;
int err;
if (!ptr->data)
@@ -2720,6 +2772,9 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u32 offset,
}
case BPF_DYNPTR_TYPE_SKB_META:
return bpf_skb_meta_pointer(ptr->data, ptr->offset + offset);
+ case BPF_DYNPTR_TYPE_FILE:
+ err = bpf_file_fetch_bytes(ptr->data, offset, buffer__opt, buffer__szk);
+ return err ? NULL : buffer__opt;
default:
WARN_ONCE(true, "unknown dynptr type %d\n", type);
return NULL;
@@ -2768,8 +2823,8 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u32 offset,
* provided buffer, with its contents containing the data, if unable to obtain
* direct pointer)
*/
-__bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *p, u32 offset,
- void *buffer__opt, u32 buffer__szk)
+__bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *p, u64 offset,
+ void *buffer__opt, u64 buffer__szk)
{
const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
@@ -2801,10 +2856,10 @@ __bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *p, u32 offset,
return bpf_dynptr_slice(p, offset, buffer__opt, buffer__szk);
}
-__bpf_kfunc int bpf_dynptr_adjust(const struct bpf_dynptr *p, u32 start, u32 end)
+__bpf_kfunc int bpf_dynptr_adjust(const struct bpf_dynptr *p, u64 start, u64 end)
{
struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
- u32 size;
+ u64 size;
if (!ptr->data || start > end)
return -EINVAL;
@@ -2814,7 +2869,7 @@ __bpf_kfunc int bpf_dynptr_adjust(const struct bpf_dynptr *p, u32 start, u32 end
if (start > size || end > size)
return -ERANGE;
- ptr->offset += start;
+ bpf_dynptr_advance_offset(ptr, start);
bpf_dynptr_set_size(ptr, end - start);
return 0;
@@ -2837,7 +2892,7 @@ __bpf_kfunc bool bpf_dynptr_is_rdonly(const struct bpf_dynptr *p)
return __bpf_dynptr_is_rdonly(ptr);
}
-__bpf_kfunc __u32 bpf_dynptr_size(const struct bpf_dynptr *p)
+__bpf_kfunc u64 bpf_dynptr_size(const struct bpf_dynptr *p)
{
struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
@@ -2874,14 +2929,14 @@ __bpf_kfunc int bpf_dynptr_clone(const struct bpf_dynptr *p,
* Copies data from source dynptr to destination dynptr.
* Returns 0 on success; negative error, otherwise.
*/
-__bpf_kfunc int bpf_dynptr_copy(struct bpf_dynptr *dst_ptr, u32 dst_off,
- struct bpf_dynptr *src_ptr, u32 src_off, u32 size)
+__bpf_kfunc int bpf_dynptr_copy(struct bpf_dynptr *dst_ptr, u64 dst_off,
+ struct bpf_dynptr *src_ptr, u64 src_off, u64 size)
{
struct bpf_dynptr_kern *dst = (struct bpf_dynptr_kern *)dst_ptr;
struct bpf_dynptr_kern *src = (struct bpf_dynptr_kern *)src_ptr;
void *src_slice, *dst_slice;
char buf[256];
- u32 off;
+ u64 off;
src_slice = bpf_dynptr_slice(src_ptr, src_off, NULL, size);
dst_slice = bpf_dynptr_slice_rdwr(dst_ptr, dst_off, NULL, size);
@@ -2903,7 +2958,7 @@ __bpf_kfunc int bpf_dynptr_copy(struct bpf_dynptr *dst_ptr, u32 dst_off,
off = 0;
while (off < size) {
- u32 chunk_sz = min_t(u32, sizeof(buf), size - off);
+ u64 chunk_sz = min_t(u64, sizeof(buf), size - off);
int err;
err = __bpf_dynptr_read(buf, chunk_sz, src, src_off + off, 0);
@@ -2929,10 +2984,10 @@ __bpf_kfunc int bpf_dynptr_copy(struct bpf_dynptr *dst_ptr, u32 dst_off,
* at @offset with the constant byte @val.
* Returns 0 on success; negative error, otherwise.
*/
- __bpf_kfunc int bpf_dynptr_memset(struct bpf_dynptr *p, u32 offset, u32 size, u8 val)
- {
+__bpf_kfunc int bpf_dynptr_memset(struct bpf_dynptr *p, u64 offset, u64 size, u8 val)
+{
struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
- u32 chunk_sz, write_off;
+ u64 chunk_sz, write_off;
char buf[256];
void* slice;
int err;
@@ -2951,11 +3006,11 @@ __bpf_kfunc int bpf_dynptr_copy(struct bpf_dynptr *dst_ptr, u32 dst_off,
return err;
/* Non-linear data under the dynptr, write from a local buffer */
- chunk_sz = min_t(u32, sizeof(buf), size);
+ chunk_sz = min_t(u64, sizeof(buf), size);
memset(buf, val, chunk_sz);
for (write_off = 0; write_off < size; write_off += chunk_sz) {
- chunk_sz = min_t(u32, sizeof(buf), size - write_off);
+ chunk_sz = min_t(u64, sizeof(buf), size - write_off);
err = __bpf_dynptr_write(ptr, offset + write_off, buf, chunk_sz, 0);
if (err)
return err;
@@ -3675,34 +3730,21 @@ err_out:
return -EFAULT;
}
-/**
- * bpf_strnstr - Find the first substring in a length-limited string
- * @s1__ign: The string to be searched
- * @s2__ign: The string to search for
- * @len: the maximum number of characters to search
- *
- * Return:
- * * >=0 - Index of the first character of the first occurrence of @s2__ign
- * within the first @len characters of @s1__ign
- * * %-ENOENT - @s2__ign not found in the first @len characters of @s1__ign
- * * %-EFAULT - Cannot read one of the strings
- * * %-E2BIG - One of the strings is too large
- * * %-ERANGE - One of the strings is outside of kernel address space
- */
-__bpf_kfunc int bpf_strnstr(const char *s1__ign, const char *s2__ign, size_t len)
+static int __bpf_strnstr(const char *s1, const char *s2, size_t len,
+ bool ignore_case)
{
char c1, c2;
int i, j;
- if (!copy_from_kernel_nofault_allowed(s1__ign, 1) ||
- !copy_from_kernel_nofault_allowed(s2__ign, 1)) {
+ if (!copy_from_kernel_nofault_allowed(s1, 1) ||
+ !copy_from_kernel_nofault_allowed(s2, 1)) {
return -ERANGE;
}
guard(pagefault)();
for (i = 0; i < XATTR_SIZE_MAX; i++) {
for (j = 0; i + j <= len && j < XATTR_SIZE_MAX; j++) {
- __get_kernel_nofault(&c2, s2__ign + j, char, err_out);
+ __get_kernel_nofault(&c2, s2 + j, char, err_out);
if (c2 == '\0')
return i;
/*
@@ -3712,7 +3754,13 @@ __bpf_kfunc int bpf_strnstr(const char *s1__ign, const char *s2__ign, size_t len
*/
if (i + j == len)
break;
- __get_kernel_nofault(&c1, s1__ign + j, char, err_out);
+ __get_kernel_nofault(&c1, s1 + j, char, err_out);
+
+ if (ignore_case) {
+ c1 = tolower(c1);
+ c2 = tolower(c2);
+ }
+
if (c1 == '\0')
return -ENOENT;
if (c1 != c2)
@@ -3722,7 +3770,7 @@ __bpf_kfunc int bpf_strnstr(const char *s1__ign, const char *s2__ign, size_t len
return -E2BIG;
if (i + j == len)
return -ENOENT;
- s1__ign++;
+ s1++;
}
return -E2BIG;
err_out:
@@ -3744,8 +3792,69 @@ err_out:
*/
__bpf_kfunc int bpf_strstr(const char *s1__ign, const char *s2__ign)
{
- return bpf_strnstr(s1__ign, s2__ign, XATTR_SIZE_MAX);
+ return __bpf_strnstr(s1__ign, s2__ign, XATTR_SIZE_MAX, false);
+}
+
+/**
+ * bpf_strcasestr - Find the first substring in a string, ignoring the case of
+ * the characters
+ * @s1__ign: The string to be searched
+ * @s2__ign: The string to search for
+ *
+ * Return:
+ * * >=0 - Index of the first character of the first occurrence of @s2__ign
+ * within @s1__ign
+ * * %-ENOENT - @s2__ign is not a substring of @s1__ign
+ * * %-EFAULT - Cannot read one of the strings
+ * * %-E2BIG - One of the strings is too large
+ * * %-ERANGE - One of the strings is outside of kernel address space
+ */
+__bpf_kfunc int bpf_strcasestr(const char *s1__ign, const char *s2__ign)
+{
+ return __bpf_strnstr(s1__ign, s2__ign, XATTR_SIZE_MAX, true);
+}
+
+/**
+ * bpf_strnstr - Find the first substring in a length-limited string
+ * @s1__ign: The string to be searched
+ * @s2__ign: The string to search for
+ * @len: the maximum number of characters to search
+ *
+ * Return:
+ * * >=0 - Index of the first character of the first occurrence of @s2__ign
+ * within the first @len characters of @s1__ign
+ * * %-ENOENT - @s2__ign not found in the first @len characters of @s1__ign
+ * * %-EFAULT - Cannot read one of the strings
+ * * %-E2BIG - One of the strings is too large
+ * * %-ERANGE - One of the strings is outside of kernel address space
+ */
+__bpf_kfunc int bpf_strnstr(const char *s1__ign, const char *s2__ign,
+ size_t len)
+{
+ return __bpf_strnstr(s1__ign, s2__ign, len, false);
}
+
+/**
+ * bpf_strncasestr - Find the first substring in a length-limited string,
+ * ignoring the case of the characters
+ * @s1__ign: The string to be searched
+ * @s2__ign: The string to search for
+ * @len: the maximum number of characters to search
+ *
+ * Return:
+ * * >=0 - Index of the first character of the first occurrence of @s2__ign
+ * within the first @len characters of @s1__ign
+ * * %-ENOENT - @s2__ign not found in the first @len characters of @s1__ign
+ * * %-EFAULT - Cannot read one of the strings
+ * * %-E2BIG - One of the strings is too large
+ * * %-ERANGE - One of the strings is outside of kernel address space
+ */
+__bpf_kfunc int bpf_strncasestr(const char *s1__ign, const char *s2__ign,
+ size_t len)
+{
+ return __bpf_strnstr(s1__ign, s2__ign, len, true);
+}
+
#ifdef CONFIG_KEYS
/**
* bpf_lookup_user_key - lookup a key by its serial
@@ -4166,7 +4275,8 @@ release_prog:
}
/**
- * bpf_task_work_schedule_signal - Schedule BPF callback using task_work_add with TWA_SIGNAL mode
+ * bpf_task_work_schedule_signal_impl - Schedule BPF callback using task_work_add with TWA_SIGNAL
+ * mode
* @task: Task struct for which callback should be scheduled
* @tw: Pointer to struct bpf_task_work in BPF map value for internal bookkeeping
* @map__map: bpf_map that embeds struct bpf_task_work in the values
@@ -4175,15 +4285,17 @@ release_prog:
*
* Return: 0 if task work has been scheduled successfully, negative error code otherwise
*/
-__bpf_kfunc int bpf_task_work_schedule_signal(struct task_struct *task, struct bpf_task_work *tw,
- void *map__map, bpf_task_work_callback_t callback,
- void *aux__prog)
+__bpf_kfunc int bpf_task_work_schedule_signal_impl(struct task_struct *task,
+ struct bpf_task_work *tw, void *map__map,
+ bpf_task_work_callback_t callback,
+ void *aux__prog)
{
return bpf_task_work_schedule(task, tw, map__map, callback, aux__prog, TWA_SIGNAL);
}
/**
- * bpf_task_work_schedule_resume - Schedule BPF callback using task_work_add with TWA_RESUME mode
+ * bpf_task_work_schedule_resume_impl - Schedule BPF callback using task_work_add with TWA_RESUME
+ * mode
* @task: Task struct for which callback should be scheduled
* @tw: Pointer to struct bpf_task_work in BPF map value for internal bookkeeping
* @map__map: bpf_map that embeds struct bpf_task_work in the values
@@ -4192,13 +4304,62 @@ __bpf_kfunc int bpf_task_work_schedule_signal(struct task_struct *task, struct b
*
* Return: 0 if task work has been scheduled successfully, negative error code otherwise
*/
-__bpf_kfunc int bpf_task_work_schedule_resume(struct task_struct *task, struct bpf_task_work *tw,
- void *map__map, bpf_task_work_callback_t callback,
- void *aux__prog)
+__bpf_kfunc int bpf_task_work_schedule_resume_impl(struct task_struct *task,
+ struct bpf_task_work *tw, void *map__map,
+ bpf_task_work_callback_t callback,
+ void *aux__prog)
{
return bpf_task_work_schedule(task, tw, map__map, callback, aux__prog, TWA_RESUME);
}
+static int make_file_dynptr(struct file *file, u32 flags, bool may_sleep,
+ struct bpf_dynptr_kern *ptr)
+{
+ struct bpf_dynptr_file_impl *state;
+
+ /* flags is currently unsupported */
+ if (flags) {
+ bpf_dynptr_set_null(ptr);
+ return -EINVAL;
+ }
+
+ state = bpf_mem_alloc(&bpf_global_ma, sizeof(struct bpf_dynptr_file_impl));
+ if (!state) {
+ bpf_dynptr_set_null(ptr);
+ return -ENOMEM;
+ }
+ state->offset = 0;
+	state->size = U64_MAX; /* Don't restrict size, as the file may change anyway */
+ freader_init_from_file(&state->freader, NULL, 0, file, may_sleep);
+ bpf_dynptr_init(ptr, state, BPF_DYNPTR_TYPE_FILE, 0, 0);
+ bpf_dynptr_set_rdonly(ptr);
+ return 0;
+}
+
+__bpf_kfunc int bpf_dynptr_from_file(struct file *file, u32 flags, struct bpf_dynptr *ptr__uninit)
+{
+ return make_file_dynptr(file, flags, false, (struct bpf_dynptr_kern *)ptr__uninit);
+}
+
+int bpf_dynptr_from_file_sleepable(struct file *file, u32 flags, struct bpf_dynptr *ptr__uninit)
+{
+ return make_file_dynptr(file, flags, true, (struct bpf_dynptr_kern *)ptr__uninit);
+}
+
+__bpf_kfunc int bpf_dynptr_file_discard(struct bpf_dynptr *dynptr)
+{
+ struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)dynptr;
+ struct bpf_dynptr_file_impl *df = ptr->data;
+
+ if (!df)
+ return 0;
+
+ freader_cleanup(&df->freader);
+ bpf_mem_free(&bpf_global_ma, df);
+ bpf_dynptr_set_null(ptr);
+ return 0;
+}
+
__bpf_kfunc_end_defs();
static void bpf_task_work_cancel_scheduled(struct irq_work *irq_work)
@@ -4342,6 +4503,7 @@ BTF_ID_FLAGS(func, bpf_iter_kmem_cache_next, KF_ITER_NEXT | KF_RET_NULL | KF_SLE
BTF_ID_FLAGS(func, bpf_iter_kmem_cache_destroy, KF_ITER_DESTROY | KF_SLEEPABLE)
BTF_ID_FLAGS(func, bpf_local_irq_save)
BTF_ID_FLAGS(func, bpf_local_irq_restore)
+#ifdef CONFIG_BPF_EVENTS
BTF_ID_FLAGS(func, bpf_probe_read_user_dynptr)
BTF_ID_FLAGS(func, bpf_probe_read_kernel_dynptr)
BTF_ID_FLAGS(func, bpf_probe_read_user_str_dynptr)
@@ -4350,6 +4512,7 @@ BTF_ID_FLAGS(func, bpf_copy_from_user_dynptr, KF_SLEEPABLE)
BTF_ID_FLAGS(func, bpf_copy_from_user_str_dynptr, KF_SLEEPABLE)
BTF_ID_FLAGS(func, bpf_copy_from_user_task_dynptr, KF_SLEEPABLE | KF_TRUSTED_ARGS)
BTF_ID_FLAGS(func, bpf_copy_from_user_task_str_dynptr, KF_SLEEPABLE | KF_TRUSTED_ARGS)
+#endif
#ifdef CONFIG_DMA_SHARED_BUFFER
BTF_ID_FLAGS(func, bpf_iter_dmabuf_new, KF_ITER_NEW | KF_SLEEPABLE)
BTF_ID_FLAGS(func, bpf_iter_dmabuf_next, KF_ITER_NEXT | KF_RET_NULL | KF_SLEEPABLE)
@@ -4367,13 +4530,17 @@ BTF_ID_FLAGS(func, bpf_strnlen);
BTF_ID_FLAGS(func, bpf_strspn);
BTF_ID_FLAGS(func, bpf_strcspn);
BTF_ID_FLAGS(func, bpf_strstr);
+BTF_ID_FLAGS(func, bpf_strcasestr);
BTF_ID_FLAGS(func, bpf_strnstr);
+BTF_ID_FLAGS(func, bpf_strncasestr);
#if defined(CONFIG_BPF_LSM) && defined(CONFIG_CGROUPS)
BTF_ID_FLAGS(func, bpf_cgroup_read_xattr, KF_RCU)
#endif
-BTF_ID_FLAGS(func, bpf_stream_vprintk, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_task_work_schedule_signal, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_task_work_schedule_resume, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_stream_vprintk_impl, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_task_work_schedule_signal_impl, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_task_work_schedule_resume_impl, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_dynptr_from_file, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_dynptr_file_discard)
BTF_KFUNCS_END(common_btf_ids)
static const struct btf_kfunc_id_set common_kfunc_set = {
@@ -4414,7 +4581,7 @@ late_initcall(kfunc_init);
/* Get a pointer to dynptr data up to len bytes for read only access. If
* the dynptr doesn't have continuous data up to len bytes, return NULL.
*/
-const void *__bpf_dynptr_data(const struct bpf_dynptr_kern *ptr, u32 len)
+const void *__bpf_dynptr_data(const struct bpf_dynptr_kern *ptr, u64 len)
{
const struct bpf_dynptr *p = (struct bpf_dynptr *)ptr;
@@ -4425,9 +4592,19 @@ const void *__bpf_dynptr_data(const struct bpf_dynptr_kern *ptr, u32 len)
* the dynptr doesn't have continuous data up to len bytes, or the dynptr
* is read only, return NULL.
*/
-void *__bpf_dynptr_data_rw(const struct bpf_dynptr_kern *ptr, u32 len)
+void *__bpf_dynptr_data_rw(const struct bpf_dynptr_kern *ptr, u64 len)
{
if (__bpf_dynptr_is_rdonly(ptr))
return NULL;
return (void *)__bpf_dynptr_data(ptr, len);
}
+
+void bpf_map_free_internal_structs(struct bpf_map *map, void *val)
+{
+ if (btf_record_has_field(map->record, BPF_TIMER))
+ bpf_obj_free_timer(map->record, val);
+ if (btf_record_has_field(map->record, BPF_WORKQUEUE))
+ bpf_obj_free_workqueue(map->record, val);
+ if (btf_record_has_field(map->record, BPF_TASK_WORK))
+ bpf_obj_free_task_work(map->record, val);
+}
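
For the new case-insensitive search kfuncs above (bpf_strcasestr(), bpf_strncasestr()), a hedged BPF-side usage sketch; the prototypes are hand-written (size_t spelled here as unsigned long), and the section name and search string are illustrative:

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

extern int bpf_strcasestr(const char *s1__ign, const char *s2__ign) __weak __ksym;
extern int bpf_strncasestr(const char *s1__ign, const char *s2__ign,
			   unsigned long len) __weak __ksym;

SEC("tracepoint/syscalls/sys_enter_getpid")
int match_comm(void *ctx)
{
	char comm[16] = {};
	int idx;

	bpf_get_current_comm(comm, sizeof(comm));

	/* Case-insensitive, length-limited search: matches "bash", "BASH", ... */
	idx = bpf_strncasestr(comm, "bash", sizeof(comm));
	if (idx >= 0)
		bpf_printk("substring at offset %d in %s", idx, comm);
	return 0;
}

char _license[] SEC("license") = "GPL";
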
diff --git a/kernel/bpf/liveness.c b/kernel/bpf/liveness.c
index 3c611aba7f52..60db5d655495 100644
--- a/kernel/bpf/liveness.c
+++ b/kernel/bpf/liveness.c
@@ -34,7 +34,7 @@
* - read and write marks propagation.
* - The propagation phase is a textbook live variable data flow analysis:
*
- * state[cc, i].live_after = U [state[cc, s].live_before for s in insn_successors(i)]
+ * state[cc, i].live_after = U [state[cc, s].live_before for s in bpf_insn_successors(i)]
* state[cc, i].live_before =
* (state[cc, i].live_after / state[cc, i].must_write) U state[i].may_read
*
@@ -54,7 +54,7 @@
* The equation for "must_write_acc" propagation looks as follows:
*
* state[cc, i].must_write_acc =
- * ∩ [state[cc, s].must_write_acc for s in insn_successors(i)]
+ * ∩ [state[cc, s].must_write_acc for s in bpf_insn_successors(i)]
* U state[cc, i].must_write
*
* (An intersection of all "must_write_acc" for instruction successors
@@ -195,8 +195,10 @@ static struct func_instance *__lookup_instance(struct bpf_verifier_env *env,
return ERR_PTR(-ENOMEM);
result->must_write_set = kvcalloc(subprog_sz, sizeof(*result->must_write_set),
GFP_KERNEL_ACCOUNT);
- if (!result->must_write_set)
+ if (!result->must_write_set) {
+ kvfree(result);
return ERR_PTR(-ENOMEM);
+ }
memcpy(&result->callchain, callchain, sizeof(*callchain));
result->insn_cnt = subprog_sz;
hash_add(liveness->func_instances, &result->hl_node, key);
@@ -445,7 +447,12 @@ int bpf_jmp_offset(struct bpf_insn *insn)
__diag_push();
__diag_ignore_all("-Woverride-init", "Allow field initialization overrides for opcode_info_tbl");
-inline int bpf_insn_successors(struct bpf_prog *prog, u32 idx, u32 succ[2])
+/*
+ * Returns an array succ of successor instruction indices:
+ * succ->items[0], ..., succ->items[n-1], where n = succ->cnt.
+ */
+inline struct bpf_iarray *
+bpf_insn_successors(struct bpf_verifier_env *env, u32 idx)
{
static const struct opcode_info {
bool can_jump;
@@ -472,19 +479,29 @@ inline int bpf_insn_successors(struct bpf_prog *prog, u32 idx, u32 succ[2])
_J(BPF_JSET, {.can_jump = true, .can_fallthrough = true}),
#undef _J
};
+ struct bpf_prog *prog = env->prog;
struct bpf_insn *insn = &prog->insnsi[idx];
const struct opcode_info *opcode_info;
- int i = 0, insn_sz;
+ struct bpf_iarray *succ, *jt;
+ int insn_sz;
+
+ jt = env->insn_aux_data[idx].jt;
+ if (unlikely(jt))
+ return jt;
+
+ /* pre-allocated array of size up to 2; reset cnt, as it may have been used already */
+ succ = env->succ;
+ succ->cnt = 0;
opcode_info = &opcode_info_tbl[BPF_CLASS(insn->code) | BPF_OP(insn->code)];
insn_sz = bpf_is_ldimm64(insn) ? 2 : 1;
if (opcode_info->can_fallthrough)
- succ[i++] = idx + insn_sz;
+ succ->items[succ->cnt++] = idx + insn_sz;
if (opcode_info->can_jump)
- succ[i++] = idx + bpf_jmp_offset(insn) + 1;
+ succ->items[succ->cnt++] = idx + bpf_jmp_offset(insn) + 1;
- return i;
+ return succ;
}
__diag_pop();
@@ -522,6 +539,8 @@ static int propagate_to_outer_instance(struct bpf_verifier_env *env,
this_subprog_start = callchain_subprog_start(callchain);
outer_instance = get_outer_instance(env, instance);
+ if (IS_ERR(outer_instance))
+ return PTR_ERR(outer_instance);
callsite = callchain->callsites[callchain->curframe - 1];
reset_stack_write_marks(env, outer_instance, callsite);
@@ -544,11 +563,12 @@ static inline bool update_insn(struct bpf_verifier_env *env,
struct bpf_insn_aux_data *aux = env->insn_aux_data;
u64 new_before, new_after, must_write_acc;
struct per_frame_masks *insn, *succ_insn;
- u32 succ_num, s, succ[2];
+ struct bpf_iarray *succ;
+ u32 s;
bool changed;
- succ_num = bpf_insn_successors(env->prog, insn_idx, succ);
- if (unlikely(succ_num == 0))
+ succ = bpf_insn_successors(env, insn_idx);
+ if (succ->cnt == 0)
return false;
changed = false;
@@ -560,8 +580,8 @@ static inline bool update_insn(struct bpf_verifier_env *env,
* of successors plus all "must_write" slots of instruction itself.
*/
must_write_acc = U64_MAX;
- for (s = 0; s < succ_num; ++s) {
- succ_insn = get_frame_masks(instance, frame, succ[s]);
+ for (s = 0; s < succ->cnt; ++s) {
+ succ_insn = get_frame_masks(instance, frame, succ->items[s]);
new_after |= succ_insn->live_before;
must_write_acc &= succ_insn->must_write_acc;
}
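
The comments in liveness.c above state the dataflow equations. As a minimal, self-contained illustration of one propagation step, written in plain C over u64 bitmasks; the struct and names below are illustrative and are not the verifier's own types:

#include <stdbool.h>
#include <stdint.h>

struct insn_masks {
	uint64_t may_read;
	uint64_t must_write;
	uint64_t must_write_acc;
	uint64_t live_before;
	uint64_t live_after;
};

/* One relaxation step for an instruction; returns true if anything changed. */
static bool propagate_one(struct insn_masks *insn, struct insn_masks **succ, int n_succ)
{
	uint64_t after = 0, must_acc = ~0ULL, before;
	int s;

	for (s = 0; s < n_succ; s++) {
		after |= succ[s]->live_before;        /* union over successors */
		must_acc &= succ[s]->must_write_acc;  /* intersection over successors */
	}
	must_acc |= insn->must_write;
	before = (after & ~insn->must_write) | insn->may_read;

	if (after == insn->live_after && before == insn->live_before &&
	    must_acc == insn->must_write_acc)
		return false;

	insn->live_after = after;
	insn->live_before = before;
	insn->must_write_acc = must_acc;
	return true;
}
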
diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c
index f50533169cc3..a0c3b35de2ce 100644
--- a/kernel/bpf/log.c
+++ b/kernel/bpf/log.c
@@ -461,6 +461,7 @@ const char *reg_type_str(struct bpf_verifier_env *env, enum bpf_reg_type type)
[PTR_TO_ARENA] = "arena",
[PTR_TO_BUF] = "buf",
[PTR_TO_FUNC] = "func",
+ [PTR_TO_INSN] = "insn",
[PTR_TO_MAP_KEY] = "map_key",
[CONST_PTR_TO_DYNPTR] = "dynptr_ptr",
};
@@ -500,6 +501,8 @@ const char *dynptr_type_str(enum bpf_dynptr_type type)
return "xdp";
case BPF_DYNPTR_TYPE_SKB_META:
return "skb_meta";
+ case BPF_DYNPTR_TYPE_FILE:
+ return "file";
case BPF_DYNPTR_TYPE_INVALID:
return "<invalid>";
default:
diff --git a/kernel/bpf/range_tree.c b/kernel/bpf/range_tree.c
index 37b80a23ae1a..99c63d982c5d 100644
--- a/kernel/bpf/range_tree.c
+++ b/kernel/bpf/range_tree.c
@@ -2,7 +2,6 @@
/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
#include <linux/interval_tree_generic.h>
#include <linux/slab.h>
-#include <linux/bpf_mem_alloc.h>
#include <linux/bpf.h>
#include "range_tree.h"
@@ -21,7 +20,7 @@
* in commit 6772fcc8890a ("xfs: convert xbitmap to interval tree").
*
* The implementation relies on external lock to protect rbtree-s.
- * The alloc/free of range_node-s is done via bpf_mem_alloc.
+ * The alloc/free of range_node-s is done via kmalloc_nolock().
*
* bpf arena is using range_tree to represent unallocated slots.
* At init time:
@@ -150,9 +149,7 @@ int range_tree_clear(struct range_tree *rt, u32 start, u32 len)
range_it_insert(rn, rt);
/* Add a range */
- migrate_disable();
- new_rn = bpf_mem_alloc(&bpf_global_ma, sizeof(struct range_node));
- migrate_enable();
+ new_rn = kmalloc_nolock(sizeof(struct range_node), 0, NUMA_NO_NODE);
if (!new_rn)
return -ENOMEM;
new_rn->rn_start = last + 1;
@@ -172,9 +169,7 @@ int range_tree_clear(struct range_tree *rt, u32 start, u32 len)
} else {
/* in the middle of the clearing range */
range_it_remove(rn, rt);
- migrate_disable();
- bpf_mem_free(&bpf_global_ma, rn);
- migrate_enable();
+ kfree_nolock(rn);
}
}
return 0;
@@ -227,9 +222,7 @@ int range_tree_set(struct range_tree *rt, u32 start, u32 len)
range_it_remove(right, rt);
left->rn_last = right->rn_last;
range_it_insert(left, rt);
- migrate_disable();
- bpf_mem_free(&bpf_global_ma, right);
- migrate_enable();
+ kfree_nolock(right);
} else if (left) {
/* Combine with the left range */
range_it_remove(left, rt);
@@ -241,9 +234,7 @@ int range_tree_set(struct range_tree *rt, u32 start, u32 len)
right->rn_start = start;
range_it_insert(right, rt);
} else {
- migrate_disable();
- left = bpf_mem_alloc(&bpf_global_ma, sizeof(struct range_node));
- migrate_enable();
+ left = kmalloc_nolock(sizeof(struct range_node), 0, NUMA_NO_NODE);
if (!left)
return -ENOMEM;
left->rn_start = start;
@@ -259,7 +250,7 @@ void range_tree_destroy(struct range_tree *rt)
while ((rn = range_it_iter_first(rt, 0, -1U))) {
range_it_remove(rn, rt);
- bpf_mem_free(&bpf_global_ma, rn);
+ kfree_nolock(rn);
}
}
diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c
index 719d73299397..f6a075ffac63 100644
--- a/kernel/bpf/ringbuf.c
+++ b/kernel/bpf/ringbuf.c
@@ -13,7 +13,7 @@
#include <linux/btf_ids.h>
#include <asm/rqspinlock.h>
-#define RINGBUF_CREATE_FLAG_MASK (BPF_F_NUMA_NODE)
+#define RINGBUF_CREATE_FLAG_MASK (BPF_F_NUMA_NODE | BPF_F_RB_OVERWRITE)
/* non-mmap()'able part of bpf_ringbuf (everything up to consumer page) */
#define RINGBUF_PGOFF \
@@ -30,6 +30,7 @@ struct bpf_ringbuf {
u64 mask;
struct page **pages;
int nr_pages;
+ bool overwrite_mode;
rqspinlock_t spinlock ____cacheline_aligned_in_smp;
/* For user-space producer ring buffers, an atomic_t busy bit is used
* to synchronize access to the ring buffers in the kernel, rather than
@@ -73,6 +74,7 @@ struct bpf_ringbuf {
unsigned long consumer_pos __aligned(PAGE_SIZE);
unsigned long producer_pos __aligned(PAGE_SIZE);
unsigned long pending_pos;
+ unsigned long overwrite_pos; /* position after the last overwritten record */
char data[] __aligned(PAGE_SIZE);
};
@@ -166,7 +168,7 @@ static void bpf_ringbuf_notify(struct irq_work *work)
* considering that the maximum value of data_sz is (4GB - 1), there
* will be no overflow, so just note the size limit in the comments.
*/
-static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node)
+static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node, bool overwrite_mode)
{
struct bpf_ringbuf *rb;
@@ -183,17 +185,25 @@ static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node)
rb->consumer_pos = 0;
rb->producer_pos = 0;
rb->pending_pos = 0;
+ rb->overwrite_mode = overwrite_mode;
return rb;
}
static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr)
{
+ bool overwrite_mode = false;
struct bpf_ringbuf_map *rb_map;
if (attr->map_flags & ~RINGBUF_CREATE_FLAG_MASK)
return ERR_PTR(-EINVAL);
+ if (attr->map_flags & BPF_F_RB_OVERWRITE) {
+ if (attr->map_type != BPF_MAP_TYPE_RINGBUF)
+ return ERR_PTR(-EINVAL);
+ overwrite_mode = true;
+ }
+
if (attr->key_size || attr->value_size ||
!is_power_of_2(attr->max_entries) ||
!PAGE_ALIGNED(attr->max_entries))
@@ -205,7 +215,7 @@ static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr)
bpf_map_init_from_attr(&rb_map->map, attr);
- rb_map->rb = bpf_ringbuf_alloc(attr->max_entries, rb_map->map.numa_node);
+ rb_map->rb = bpf_ringbuf_alloc(attr->max_entries, rb_map->map.numa_node, overwrite_mode);
if (!rb_map->rb) {
bpf_map_area_free(rb_map);
return ERR_PTR(-ENOMEM);
@@ -216,6 +226,8 @@ static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr)
static void bpf_ringbuf_free(struct bpf_ringbuf *rb)
{
+ irq_work_sync(&rb->work);
+
/* copy pages pointer and nr_pages to local variable, as we are going
* to unmap rb itself with vunmap() below
*/
@@ -293,13 +305,26 @@ static int ringbuf_map_mmap_user(struct bpf_map *map, struct vm_area_struct *vma
return remap_vmalloc_range(vma, rb_map->rb, vma->vm_pgoff + RINGBUF_PGOFF);
}
+/*
+ * Return an estimate of the available data in the ring buffer.
+ * Note: the returned value can exceed the actual ring buffer size because the
+ * function is not synchronized with the producer. The producer acquires the
+ * ring buffer's spinlock, but this function does not.
+ */
static unsigned long ringbuf_avail_data_sz(struct bpf_ringbuf *rb)
{
- unsigned long cons_pos, prod_pos;
+ unsigned long cons_pos, prod_pos, over_pos;
cons_pos = smp_load_acquire(&rb->consumer_pos);
- prod_pos = smp_load_acquire(&rb->producer_pos);
- return prod_pos - cons_pos;
+
+ if (unlikely(rb->overwrite_mode)) {
+ over_pos = smp_load_acquire(&rb->overwrite_pos);
+ prod_pos = smp_load_acquire(&rb->producer_pos);
+ return prod_pos - max(cons_pos, over_pos);
+ } else {
+ prod_pos = smp_load_acquire(&rb->producer_pos);
+ return prod_pos - cons_pos;
+ }
}
static u32 ringbuf_total_data_sz(const struct bpf_ringbuf *rb)
@@ -402,11 +427,43 @@ bpf_ringbuf_restore_from_rec(struct bpf_ringbuf_hdr *hdr)
return (void*)((addr & PAGE_MASK) - off);
}
+static bool bpf_ringbuf_has_space(const struct bpf_ringbuf *rb,
+ unsigned long new_prod_pos,
+ unsigned long cons_pos,
+ unsigned long pend_pos)
+{
+ /*
+	 * No space if the span from the oldest not-yet-committed record
+	 * to the newest record exceeds (ringbuf_size - 1).
+ */
+ if (new_prod_pos - pend_pos > rb->mask)
+ return false;
+
+ /* Ok, we have space in overwrite mode */
+ if (unlikely(rb->overwrite_mode))
+ return true;
+
+ /*
+ * No space if producer position advances more than (ringbuf_size - 1)
+ * ahead of consumer position when not in overwrite mode.
+ */
+ if (new_prod_pos - cons_pos > rb->mask)
+ return false;
+
+ return true;
+}
+
+static u32 bpf_ringbuf_round_up_hdr_len(u32 hdr_len)
+{
+ hdr_len &= ~BPF_RINGBUF_DISCARD_BIT;
+ return round_up(hdr_len + BPF_RINGBUF_HDR_SZ, 8);
+}
+
static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size)
{
- unsigned long cons_pos, prod_pos, new_prod_pos, pend_pos, flags;
+ unsigned long cons_pos, prod_pos, new_prod_pos, pend_pos, over_pos, flags;
struct bpf_ringbuf_hdr *hdr;
- u32 len, pg_off, tmp_size, hdr_len;
+ u32 len, pg_off, hdr_len;
if (unlikely(size > RINGBUF_MAX_RECORD_SZ))
return NULL;
@@ -429,24 +486,43 @@ static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size)
hdr_len = READ_ONCE(hdr->len);
if (hdr_len & BPF_RINGBUF_BUSY_BIT)
break;
- tmp_size = hdr_len & ~BPF_RINGBUF_DISCARD_BIT;
- tmp_size = round_up(tmp_size + BPF_RINGBUF_HDR_SZ, 8);
- pend_pos += tmp_size;
+ pend_pos += bpf_ringbuf_round_up_hdr_len(hdr_len);
}
rb->pending_pos = pend_pos;
- /* check for out of ringbuf space:
- * - by ensuring producer position doesn't advance more than
- * (ringbuf_size - 1) ahead
- * - by ensuring oldest not yet committed record until newest
- * record does not span more than (ringbuf_size - 1)
- */
- if (new_prod_pos - cons_pos > rb->mask ||
- new_prod_pos - pend_pos > rb->mask) {
+ if (!bpf_ringbuf_has_space(rb, new_prod_pos, cons_pos, pend_pos)) {
raw_res_spin_unlock_irqrestore(&rb->spinlock, flags);
return NULL;
}
+ /*
+ * In overwrite mode, advance overwrite_pos when the ring buffer is full.
+ * The key points are to stay on record boundaries and consume enough records
+ * to fit the new one.
+ */
+ if (unlikely(rb->overwrite_mode)) {
+ over_pos = rb->overwrite_pos;
+ while (new_prod_pos - over_pos > rb->mask) {
+ hdr = (void *)rb->data + (over_pos & rb->mask);
+ hdr_len = READ_ONCE(hdr->len);
+ /*
+			 * The bpf_ringbuf_has_space() check above ensures we won't
+ * step over a record currently being worked on by another
+ * producer.
+ */
+ over_pos += bpf_ringbuf_round_up_hdr_len(hdr_len);
+ }
+ /*
+ * smp_store_release(&rb->producer_pos, new_prod_pos) at
+		 * the end of the function ensures that when the consumer sees
+		 * the updated rb->producer_pos, it always sees the updated
+		 * rb->overwrite_pos, so when the consumer reads overwrite_pos
+		 * after smp_load_acquire(&rb->producer_pos), the overwrite_pos
+		 * will always be valid.
+ */
+ WRITE_ONCE(rb->overwrite_pos, over_pos);
+ }
+
hdr = (void *)rb->data + (prod_pos & rb->mask);
pg_off = bpf_ringbuf_rec_pg_off(rb, hdr);
hdr->len = size | BPF_RINGBUF_BUSY_BIT;
@@ -576,6 +652,8 @@ BPF_CALL_2(bpf_ringbuf_query, struct bpf_map *, map, u64, flags)
return smp_load_acquire(&rb->consumer_pos);
case BPF_RB_PROD_POS:
return smp_load_acquire(&rb->producer_pos);
+ case BPF_RB_OVERWRITE_POS:
+ return smp_load_acquire(&rb->overwrite_pos);
default:
return 0;
}
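
A hedged sketch of opting into the new ring buffer overwrite mode from a BPF object. BPF_F_RB_OVERWRITE and BPF_RB_OVERWRITE_POS come from the matching UAPI update (not shown in this section) and are assumed to be visible via the headers; the map sizing, section name and record layout are illustrative:

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_RINGBUF);
	__uint(max_entries, 4096);              /* power-of-2, page-aligned */
	__uint(map_flags, BPF_F_RB_OVERWRITE);  /* new flag: overwrite oldest records */
} events SEC(".maps");

SEC("tracepoint/syscalls/sys_enter_getpid")
int produce(void *ctx)
{
	__u64 *rec;

	rec = bpf_ringbuf_reserve(&events, sizeof(*rec), 0);
	if (!rec)
		return 0;
	*rec = bpf_ktime_get_ns();
	bpf_ringbuf_submit(rec, 0);

	/* New query: position right after the last overwritten record. */
	bpf_printk("overwrite_pos=%llu",
		   bpf_ringbuf_query(&events, BPF_RB_OVERWRITE_POS));
	return 0;
}

char _license[] SEC("license") = "GPL";
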
diff --git a/kernel/bpf/rqspinlock.c b/kernel/bpf/rqspinlock.c
index a00561b1d3e5..f7d0c8d4644e 100644
--- a/kernel/bpf/rqspinlock.c
+++ b/kernel/bpf/rqspinlock.c
@@ -89,15 +89,14 @@ struct rqspinlock_timeout {
DEFINE_PER_CPU_ALIGNED(struct rqspinlock_held, rqspinlock_held_locks);
EXPORT_SYMBOL_GPL(rqspinlock_held_locks);
-static bool is_lock_released(rqspinlock_t *lock, u32 mask, struct rqspinlock_timeout *ts)
+static bool is_lock_released(rqspinlock_t *lock, u32 mask)
{
if (!(atomic_read_acquire(&lock->val) & (mask)))
return true;
return false;
}
-static noinline int check_deadlock_AA(rqspinlock_t *lock, u32 mask,
- struct rqspinlock_timeout *ts)
+static noinline int check_deadlock_AA(rqspinlock_t *lock)
{
struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks);
int cnt = min(RES_NR_HELD, rqh->cnt);
@@ -118,8 +117,7 @@ static noinline int check_deadlock_AA(rqspinlock_t *lock, u32 mask,
* more locks, which reduce to ABBA). This is not exhaustive, and we rely on
* timeouts as the final line of defense.
*/
-static noinline int check_deadlock_ABBA(rqspinlock_t *lock, u32 mask,
- struct rqspinlock_timeout *ts)
+static noinline int check_deadlock_ABBA(rqspinlock_t *lock, u32 mask)
{
struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks);
int rqh_cnt = min(RES_NR_HELD, rqh->cnt);
@@ -142,7 +140,7 @@ static noinline int check_deadlock_ABBA(rqspinlock_t *lock, u32 mask,
* Let's ensure to break out of this loop if the lock is available for
* us to potentially acquire.
*/
- if (is_lock_released(lock, mask, ts))
+ if (is_lock_released(lock, mask))
return 0;
/*
@@ -198,33 +196,21 @@ static noinline int check_deadlock_ABBA(rqspinlock_t *lock, u32 mask,
return 0;
}
-static noinline int check_deadlock(rqspinlock_t *lock, u32 mask,
- struct rqspinlock_timeout *ts)
-{
- int ret;
-
- ret = check_deadlock_AA(lock, mask, ts);
- if (ret)
- return ret;
- ret = check_deadlock_ABBA(lock, mask, ts);
- if (ret)
- return ret;
-
- return 0;
-}
-
static noinline int check_timeout(rqspinlock_t *lock, u32 mask,
struct rqspinlock_timeout *ts)
{
- u64 time = ktime_get_mono_fast_ns();
u64 prev = ts->cur;
+ u64 time;
if (!ts->timeout_end) {
- ts->cur = time;
- ts->timeout_end = time + ts->duration;
+ if (check_deadlock_AA(lock))
+ return -EDEADLK;
+ ts->cur = ktime_get_mono_fast_ns();
+ ts->timeout_end = ts->cur + ts->duration;
return 0;
}
+ time = ktime_get_mono_fast_ns();
if (time > ts->timeout_end)
return -ETIMEDOUT;
@@ -234,7 +220,7 @@ static noinline int check_timeout(rqspinlock_t *lock, u32 mask,
*/
if (prev + NSEC_PER_MSEC < time) {
ts->cur = time;
- return check_deadlock(lock, mask, ts);
+ return check_deadlock_ABBA(lock, mask);
}
return 0;
@@ -278,6 +264,10 @@ int __lockfunc resilient_tas_spin_lock(rqspinlock_t *lock)
int val, ret = 0;
RES_INIT_TIMEOUT(ts);
+ /*
+ * The fast path is not invoked for the TAS fallback, so we must grab
+ * the deadlock detection entry here.
+ */
grab_held_lock_entry(lock);
/*
@@ -400,10 +390,7 @@ int __lockfunc resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val)
goto queue;
}
- /*
- * Grab an entry in the held locks array, to enable deadlock detection.
- */
- grab_held_lock_entry(lock);
+ /* Deadlock detection entry already held after failing fast path. */
/*
* We're pending, wait for the owner to go away.
@@ -450,12 +437,21 @@ int __lockfunc resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val)
* queuing.
*/
queue:
- lockevent_inc(lock_slowpath);
/*
- * Grab deadlock detection entry for the queue path.
+ * Do not queue if we're a waiter and someone is attempting this lock on
+ * the same CPU. In case of NMIs, this prevents long timeouts where we
+	 * interrupt the pending waiter, or the owner that will eventually
+	 * signal the head of our queue; both are logically but not
+	 * physically part of the queue, and hence outside the scope of the
+	 * idx > 0 check below for the trylock fallback.
*/
- grab_held_lock_entry(lock);
+ if (check_deadlock_AA(lock)) {
+ ret = -EDEADLK;
+ goto err_release_entry;
+ }
+ lockevent_inc(lock_slowpath);
+ /* Deadlock detection entry already held after failing fast path. */
node = this_cpu_ptr(&rqnodes[0].mcs);
idx = node->count++;
tail = encode_tail(smp_processor_id(), idx);
@@ -467,19 +463,17 @@ queue:
* not be nested NMIs taking spinlocks. That may not be true in
* some architectures even though the chance of needing more than
* 4 nodes will still be extremely unlikely. When that happens,
- * we fall back to spinning on the lock directly without using
- * any MCS node. This is not the most elegant solution, but is
- * simple enough.
+ * we fall back to attempting a trylock operation without using
+ * any MCS node. Unlike qspinlock which cannot fail, we have the
+ * option of failing the slow path, and under contention, such a
+ * trylock spinning will likely be treated unfairly due to lack of
+ * queueing, hence do not spin.
*/
- if (unlikely(idx >= _Q_MAX_NODES || in_nmi())) {
+ if (unlikely(idx >= _Q_MAX_NODES || (in_nmi() && idx > 0))) {
lockevent_inc(lock_no_node);
- RES_RESET_TIMEOUT(ts, RES_DEF_TIMEOUT);
- while (!queued_spin_trylock(lock)) {
- if (RES_CHECK_TIMEOUT(ts, ret, ~0u)) {
- lockevent_inc(rqspinlock_lock_timeout);
- goto err_release_node;
- }
- cpu_relax();
+ if (!queued_spin_trylock(lock)) {
+ ret = -EDEADLK;
+ goto err_release_node;
}
goto release;
}
@@ -540,7 +534,7 @@ queue:
val = arch_mcs_spin_lock_contended(&node->locked);
if (val == RES_TIMEOUT_VAL) {
- ret = -EDEADLK;
+ ret = -ETIMEDOUT;
goto waitq_timeout;
}
@@ -575,6 +569,14 @@ queue:
val = res_atomic_cond_read_acquire(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK) ||
RES_CHECK_TIMEOUT(ts, ret, _Q_LOCKED_PENDING_MASK));
+ /* Disable queue destruction when we detect deadlocks. */
+ if (ret == -EDEADLK) {
+ if (!next)
+ next = smp_cond_load_relaxed(&node->next, (VAL));
+ arch_mcs_spin_unlock_contended(&next->locked);
+ goto err_release_node;
+ }
+
waitq_timeout:
if (ret) {
/*
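
The rework above splits AA detection (the lock is already held on this CPU) from the ABBA scan, and runs the AA check on the first check_timeout() invocation before arming the timeout. A minimal, self-contained sketch of the AA check's shape, with illustrative names standing in for the kernel's rqspinlock types:

#include <errno.h>

#define NR_HELD 31	/* illustrative bound, standing in for RES_NR_HELD */

struct held_locks {
	int cnt;
	void *locks[NR_HELD];
};

/* Return -EDEADLK if @lock is already in this CPU's held-locks table. */
static int check_aa(const struct held_locks *h, const void *lock)
{
	int i, cnt = h->cnt < NR_HELD ? h->cnt : NR_HELD;

	for (i = 0; i < cnt; i++) {
		if (h->locks[i] == lock)
			return -EDEADLK;
	}
	return 0;
}
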
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 4d53cdd1374c..da3d328f5c15 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -42,6 +42,28 @@ static inline int stack_map_data_size(struct bpf_map *map)
sizeof(struct bpf_stack_build_id) : sizeof(u64);
}
+/**
+ * stack_map_calculate_max_depth - Calculate maximum allowed stack trace depth
+ * @size: Size of the buffer/map value in bytes
+ * @elem_size: Size of each stack trace element
+ * @flags: BPF stack trace flags (BPF_F_USER_STACK, BPF_F_USER_BUILD_ID, ...)
+ *
+ * Return: Maximum number of stack trace entries that can be safely stored
+ */
+static u32 stack_map_calculate_max_depth(u32 size, u32 elem_size, u64 flags)
+{
+ u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
+ u32 max_depth;
+ u32 curr_sysctl_max_stack = READ_ONCE(sysctl_perf_event_max_stack);
+
+ max_depth = size / elem_size;
+ max_depth += skip;
+ if (max_depth > curr_sysctl_max_stack)
+ return curr_sysctl_max_stack;
+
+ return max_depth;
+}
+
static int prealloc_elems_and_freelist(struct bpf_stack_map *smap)
{
u64 elem_size = sizeof(struct stack_map_bucket) +
@@ -229,8 +251,8 @@ static long __bpf_get_stackid(struct bpf_map *map,
{
struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
struct stack_map_bucket *bucket, *new_bucket, *old_bucket;
+ u32 hash, id, trace_nr, trace_len, i, max_depth;
u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
- u32 hash, id, trace_nr, trace_len, i;
bool user = flags & BPF_F_USER_STACK;
u64 *ips;
bool hash_matches;
@@ -239,7 +261,8 @@ static long __bpf_get_stackid(struct bpf_map *map,
/* skipping more than usable stack trace */
return -EFAULT;
- trace_nr = trace->nr - skip;
+ max_depth = stack_map_calculate_max_depth(map->value_size, stack_map_data_size(map), flags);
+ trace_nr = min_t(u32, trace->nr - skip, max_depth - skip);
trace_len = trace_nr * sizeof(u64);
ips = trace->ip + skip;
hash = jhash2((u32 *)ips, trace_len / sizeof(u32), 0);
@@ -300,22 +323,19 @@ static long __bpf_get_stackid(struct bpf_map *map,
BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
u64, flags)
{
- u32 max_depth = map->value_size / stack_map_data_size(map);
- u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
+ u32 elem_size = stack_map_data_size(map);
bool user = flags & BPF_F_USER_STACK;
struct perf_callchain_entry *trace;
bool kernel = !user;
+ u32 max_depth;
if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
return -EINVAL;
- max_depth += skip;
- if (max_depth > sysctl_perf_event_max_stack)
- max_depth = sysctl_perf_event_max_stack;
-
+ max_depth = stack_map_calculate_max_depth(map->value_size, elem_size, flags);
trace = get_perf_callchain(regs, kernel, user, max_depth,
- false, false);
+ false, false, 0);
if (unlikely(!trace))
/* couldn't fetch the stack trace */
@@ -371,15 +391,11 @@ BPF_CALL_3(bpf_get_stackid_pe, struct bpf_perf_event_data_kern *, ctx,
return -EFAULT;
nr_kernel = count_kernel_ip(trace);
+ __u64 nr = trace->nr; /* save original */
if (kernel) {
- __u64 nr = trace->nr;
-
trace->nr = nr_kernel;
ret = __bpf_get_stackid(map, trace, flags);
-
- /* restore nr */
- trace->nr = nr;
} else { /* user */
u64 skip = flags & BPF_F_SKIP_FIELD_MASK;
@@ -390,6 +406,10 @@ BPF_CALL_3(bpf_get_stackid_pe, struct bpf_perf_event_data_kern *, ctx,
flags = (flags & ~BPF_F_SKIP_FIELD_MASK) | skip;
ret = __bpf_get_stackid(map, trace, flags);
}
+
+ /* restore nr */
+ trace->nr = nr;
+
return ret;
}
@@ -406,7 +426,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
struct perf_callchain_entry *trace_in,
void *buf, u32 size, u64 flags, bool may_fault)
{
- u32 trace_nr, copy_len, elem_size, num_elem, max_depth;
+ u32 trace_nr, copy_len, elem_size, max_depth;
bool user_build_id = flags & BPF_F_USER_BUILD_ID;
bool crosstask = task && task != current;
u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
@@ -438,21 +458,20 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
goto clear;
}
- num_elem = size / elem_size;
- max_depth = num_elem + skip;
- if (sysctl_perf_event_max_stack < max_depth)
- max_depth = sysctl_perf_event_max_stack;
+ max_depth = stack_map_calculate_max_depth(size, elem_size, flags);
if (may_fault)
rcu_read_lock(); /* need RCU for perf's callchain below */
- if (trace_in)
+ if (trace_in) {
trace = trace_in;
- else if (kernel && task)
+ trace->nr = min_t(u32, trace->nr, max_depth);
+ } else if (kernel && task) {
trace = get_callchain_entry_for_task(task, max_depth);
- else
+ } else {
trace = get_perf_callchain(regs, kernel, user, max_depth,
- crosstask, false);
+ crosstask, false, 0);
+ }
if (unlikely(!trace) || trace->nr < skip) {
if (may_fault)
@@ -461,7 +480,6 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
}
trace_nr = trace->nr - skip;
- trace_nr = (trace_nr <= num_elem) ? trace_nr : num_elem;
copy_len = trace_nr * elem_size;
ips = trace->ip + skip;
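
stack_map_calculate_max_depth() above folds the skip count into the depth and clamps the result to sysctl_perf_event_max_stack. A quick standalone mirror of the arithmetic, with the sysctl value passed explicitly (127 is only an assumed setting for illustration):

#include <stdio.h>
#include <stdint.h>

/* Mirror of the clamping logic, with the sysctl passed in explicitly. */
static uint32_t calc_max_depth(uint32_t size, uint32_t elem_size,
			       uint32_t skip, uint32_t sysctl_max_stack)
{
	uint32_t max_depth = size / elem_size + skip;

	return max_depth > sysctl_max_stack ? sysctl_max_stack : max_depth;
}

int main(void)
{
	/* 1024-byte value of u64 ips, skip 4 frames: 128 + 4 = 132, clamped to 127 */
	printf("%u\n", calc_max_depth(1024, 8, 4, 127)); /* prints 127 */
	/* smaller buffer: 256 bytes -> 32 entries + 4 skipped = 36 */
	printf("%u\n", calc_max_depth(256, 8, 4, 127));  /* prints 36 */
	return 0;
}
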
diff --git a/kernel/bpf/stream.c b/kernel/bpf/stream.c
index eb6c5a21c2ef..0b6bc3f30335 100644
--- a/kernel/bpf/stream.c
+++ b/kernel/bpf/stream.c
@@ -4,111 +4,10 @@
#include <linux/bpf.h>
#include <linux/filter.h>
#include <linux/bpf_mem_alloc.h>
-#include <linux/percpu.h>
-#include <linux/refcount.h>
#include <linux/gfp.h>
#include <linux/memory.h>
-#include <linux/local_lock.h>
#include <linux/mutex.h>
-/*
- * Simple per-CPU NMI-safe bump allocation mechanism, backed by the NMI-safe
- * try_alloc_pages()/free_pages_nolock() primitives. We allocate a page and
- * stash it in a local per-CPU variable, and bump allocate from the page
- * whenever items need to be printed to a stream. Each page holds a global
- * atomic refcount in its first 4 bytes, and then records of variable length
- * that describe the printed messages. Once the global refcount has dropped to
- * zero, it is a signal to free the page back to the kernel's page allocator,
- * given all the individual records in it have been consumed.
- *
- * It is possible the same page is used to serve allocations across different
- * programs, which may be consumed at different times individually, hence
- * maintaining a reference count per-page is critical for correct lifetime
- * tracking.
- *
- * The bpf_stream_page code will be replaced to use kmalloc_nolock() once it
- * lands.
- */
-struct bpf_stream_page {
- refcount_t ref;
- u32 consumed;
- char buf[];
-};
-
-/* Available room to add data to a refcounted page. */
-#define BPF_STREAM_PAGE_SZ (PAGE_SIZE - offsetofend(struct bpf_stream_page, consumed))
-
-static DEFINE_PER_CPU(local_trylock_t, stream_local_lock) = INIT_LOCAL_TRYLOCK(stream_local_lock);
-static DEFINE_PER_CPU(struct bpf_stream_page *, stream_pcpu_page);
-
-static bool bpf_stream_page_local_lock(unsigned long *flags)
-{
- return local_trylock_irqsave(&stream_local_lock, *flags);
-}
-
-static void bpf_stream_page_local_unlock(unsigned long *flags)
-{
- local_unlock_irqrestore(&stream_local_lock, *flags);
-}
-
-static void bpf_stream_page_free(struct bpf_stream_page *stream_page)
-{
- struct page *p;
-
- if (!stream_page)
- return;
- p = virt_to_page(stream_page);
- free_pages_nolock(p, 0);
-}
-
-static void bpf_stream_page_get(struct bpf_stream_page *stream_page)
-{
- refcount_inc(&stream_page->ref);
-}
-
-static void bpf_stream_page_put(struct bpf_stream_page *stream_page)
-{
- if (refcount_dec_and_test(&stream_page->ref))
- bpf_stream_page_free(stream_page);
-}
-
-static void bpf_stream_page_init(struct bpf_stream_page *stream_page)
-{
- refcount_set(&stream_page->ref, 1);
- stream_page->consumed = 0;
-}
-
-static struct bpf_stream_page *bpf_stream_page_replace(void)
-{
- struct bpf_stream_page *stream_page, *old_stream_page;
- struct page *page;
-
- page = alloc_pages_nolock(/* Don't account */ 0, NUMA_NO_NODE, 0);
- if (!page)
- return NULL;
- stream_page = page_address(page);
- bpf_stream_page_init(stream_page);
-
- old_stream_page = this_cpu_read(stream_pcpu_page);
- if (old_stream_page)
- bpf_stream_page_put(old_stream_page);
- this_cpu_write(stream_pcpu_page, stream_page);
- return stream_page;
-}
-
-static int bpf_stream_page_check_room(struct bpf_stream_page *stream_page, int len)
-{
- int min = offsetof(struct bpf_stream_elem, str[0]);
- int consumed = stream_page->consumed;
- int total = BPF_STREAM_PAGE_SZ;
- int rem = max(0, total - consumed - min);
-
- /* Let's give room of at least 8 bytes. */
- WARN_ON_ONCE(rem % 8 != 0);
- rem = rem < 8 ? 0 : rem;
- return min(len, rem);
-}
-
static void bpf_stream_elem_init(struct bpf_stream_elem *elem, int len)
{
init_llist_node(&elem->node);
@@ -116,54 +15,12 @@ static void bpf_stream_elem_init(struct bpf_stream_elem *elem, int len)
elem->consumed_len = 0;
}
-static struct bpf_stream_page *bpf_stream_page_from_elem(struct bpf_stream_elem *elem)
-{
- unsigned long addr = (unsigned long)elem;
-
- return (struct bpf_stream_page *)PAGE_ALIGN_DOWN(addr);
-}
-
-static struct bpf_stream_elem *bpf_stream_page_push_elem(struct bpf_stream_page *stream_page, int len)
-{
- u32 consumed = stream_page->consumed;
-
- stream_page->consumed += round_up(offsetof(struct bpf_stream_elem, str[len]), 8);
- return (struct bpf_stream_elem *)&stream_page->buf[consumed];
-}
-
-static struct bpf_stream_elem *bpf_stream_page_reserve_elem(int len)
-{
- struct bpf_stream_elem *elem = NULL;
- struct bpf_stream_page *page;
- int room = 0;
-
- page = this_cpu_read(stream_pcpu_page);
- if (!page)
- page = bpf_stream_page_replace();
- if (!page)
- return NULL;
-
- room = bpf_stream_page_check_room(page, len);
- if (room != len)
- page = bpf_stream_page_replace();
- if (!page)
- return NULL;
- bpf_stream_page_get(page);
- room = bpf_stream_page_check_room(page, len);
- WARN_ON_ONCE(room != len);
-
- elem = bpf_stream_page_push_elem(page, room);
- bpf_stream_elem_init(elem, room);
- return elem;
-}
-
static struct bpf_stream_elem *bpf_stream_elem_alloc(int len)
{
const int max_len = ARRAY_SIZE((struct bpf_bprintf_buffers){}.buf);
struct bpf_stream_elem *elem;
- unsigned long flags;
+ size_t alloc_size;
- BUILD_BUG_ON(max_len > BPF_STREAM_PAGE_SZ);
/*
* Length denotes the amount of data to be written as part of stream element,
* thus includes '\0' byte. We're capped by how much bpf_bprintf_buffers can
@@ -172,10 +29,13 @@ static struct bpf_stream_elem *bpf_stream_elem_alloc(int len)
if (len < 0 || len > max_len)
return NULL;
- if (!bpf_stream_page_local_lock(&flags))
+ alloc_size = offsetof(struct bpf_stream_elem, str[len]);
+ elem = kmalloc_nolock(alloc_size, __GFP_ZERO, -1);
+ if (!elem)
return NULL;
- elem = bpf_stream_page_reserve_elem(len);
- bpf_stream_page_local_unlock(&flags);
+
+ bpf_stream_elem_init(elem, len);
+
return elem;
}
@@ -231,10 +91,7 @@ static struct bpf_stream *bpf_stream_get(enum bpf_stream_id stream_id, struct bp
static void bpf_stream_free_elem(struct bpf_stream_elem *elem)
{
- struct bpf_stream_page *p;
-
- p = bpf_stream_page_from_elem(elem);
- bpf_stream_page_put(p);
+ kfree_nolock(elem);
}
static void bpf_stream_free_list(struct llist_node *list)
@@ -355,7 +212,8 @@ __bpf_kfunc_start_defs();
* Avoid using enum bpf_stream_id so that kfunc users don't have to pull in the
* enum in headers.
*/
-__bpf_kfunc int bpf_stream_vprintk(int stream_id, const char *fmt__str, const void *args, u32 len__sz, void *aux__prog)
+__bpf_kfunc int bpf_stream_vprintk_impl(int stream_id, const char *fmt__str, const void *args,
+ u32 len__sz, void *aux__prog)
{
struct bpf_bprintf_data data = {
.get_bin_args = true,
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 2a9456a3e730..6589acc89ef8 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -158,7 +158,7 @@ static void maybe_wait_bpf_programs(struct bpf_map *map)
*/
if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS ||
map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS)
- synchronize_rcu();
+ synchronize_rcu_expedited();
}
static void unpin_uptr_kaddr(void *kaddr)
@@ -520,6 +520,21 @@ void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags,
return ptr;
}
+void *bpf_map_kmalloc_nolock(const struct bpf_map *map, size_t size, gfp_t flags,
+ int node)
+{
+ struct mem_cgroup *memcg, *old_memcg;
+ void *ptr;
+
+ memcg = bpf_map_get_memcg(map);
+ old_memcg = set_active_memcg(memcg);
+ ptr = kmalloc_nolock(size, flags | __GFP_ACCOUNT, node);
+ set_active_memcg(old_memcg);
+ mem_cgroup_put(memcg);
+
+ return ptr;
+}
+
void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags)
{
struct mem_cgroup *memcg, *old_memcg;
@@ -1219,6 +1234,7 @@ int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size)
return src - orig_src;
}
+EXPORT_SYMBOL_GPL(bpf_obj_name_cpy);
int map_check_no_btf(const struct bpf_map *map,
const struct btf *btf,
@@ -1478,6 +1494,7 @@ static int map_create(union bpf_attr *attr, bpfptr_t uattr)
case BPF_MAP_TYPE_STRUCT_OPS:
case BPF_MAP_TYPE_CPUMAP:
case BPF_MAP_TYPE_ARENA:
+ case BPF_MAP_TYPE_INSN_ARRAY:
if (!bpf_token_capable(token, CAP_BPF))
goto put_token;
break;
@@ -1570,7 +1587,8 @@ static int map_create(union bpf_attr *attr, bpfptr_t uattr)
goto free_map;
}
} else if (attr->excl_prog_hash_size) {
- return -EINVAL;
+ err = -EINVAL;
+ goto free_map;
}
err = security_bpf_map_create(map, attr, token, uattr.is_kernel);
@@ -1709,9 +1727,6 @@ static int map_lookup_elem(union bpf_attr *attr)
if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM))
return -EINVAL;
- if (attr->flags & ~BPF_F_LOCK)
- return -EINVAL;
-
CLASS(fd, f)(attr->map_fd);
map = __bpf_map_get(f);
if (IS_ERR(map))
@@ -1719,9 +1734,9 @@ static int map_lookup_elem(union bpf_attr *attr)
if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ))
return -EPERM;
- if ((attr->flags & BPF_F_LOCK) &&
- !btf_record_has_field(map->record, BPF_SPIN_LOCK))
- return -EINVAL;
+ err = bpf_map_check_op_flags(map, attr->flags, BPF_F_LOCK);
+ if (err)
+ return err;
key = __bpf_copy_key(ukey, map->key_size);
if (IS_ERR(key))
@@ -1784,11 +1799,9 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr)
goto err_put;
}
- if ((attr->flags & BPF_F_LOCK) &&
- !btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
- err = -EINVAL;
+ err = bpf_map_check_op_flags(map, attr->flags, ~0);
+ if (err)
goto err_put;
- }
key = ___bpf_copy_key(ukey, map->key_size);
if (IS_ERR(key)) {
@@ -1992,13 +2005,9 @@ int generic_map_update_batch(struct bpf_map *map, struct file *map_file,
void *key, *value;
int err = 0;
- if (attr->batch.elem_flags & ~BPF_F_LOCK)
- return -EINVAL;
-
- if ((attr->batch.elem_flags & BPF_F_LOCK) &&
- !btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
- return -EINVAL;
- }
+ err = bpf_map_check_op_flags(map, attr->batch.elem_flags, BPF_F_LOCK);
+ if (err)
+ return err;
value_size = bpf_map_value_size(map);
@@ -2055,12 +2064,9 @@ int generic_map_lookup_batch(struct bpf_map *map,
u32 value_size, cp, max_count;
int err;
- if (attr->batch.elem_flags & ~BPF_F_LOCK)
- return -EINVAL;
-
- if ((attr->batch.elem_flags & BPF_F_LOCK) &&
- !btf_record_has_field(map->record, BPF_SPIN_LOCK))
- return -EINVAL;
+ err = bpf_map_check_op_flags(map, attr->batch.elem_flags, BPF_F_LOCK);
+ if (err)
+ return err;
value_size = bpf_map_value_size(map);
@@ -2315,7 +2321,7 @@ static void bpf_audit_prog(const struct bpf_prog *prog, unsigned int op)
return;
if (audit_enabled == AUDIT_OFF)
return;
- if (!in_irq() && !irqs_disabled())
+ if (!in_hardirq() && !irqs_disabled())
ctx = audit_context();
ab = audit_log_start(ctx, GFP_ATOMIC, AUDIT_BPF);
if (unlikely(!ab))
@@ -2413,7 +2419,7 @@ static void __bpf_prog_put(struct bpf_prog *prog)
struct bpf_prog_aux *aux = prog->aux;
if (atomic64_dec_and_test(&aux->refcnt)) {
- if (in_irq() || irqs_disabled()) {
+ if (in_hardirq() || irqs_disabled()) {
INIT_WORK(&aux->work, bpf_prog_put_deferred);
schedule_work(&aux->work);
} else {
@@ -2447,6 +2453,9 @@ void notrace bpf_prog_inc_misses_counter(struct bpf_prog *prog)
struct bpf_prog_stats *stats;
unsigned int flags;
+ if (unlikely(!prog->stats))
+ return;
+
stats = this_cpu_ptr(prog->stats);
flags = u64_stats_update_begin_irqsave(&stats->syncp);
u64_stats_inc(&stats->misses);
@@ -2838,6 +2847,23 @@ static int bpf_prog_verify_signature(struct bpf_prog *prog, union bpf_attr *attr
return err;
}
+static int bpf_prog_mark_insn_arrays_ready(struct bpf_prog *prog)
+{
+ int err;
+ int i;
+
+ for (i = 0; i < prog->aux->used_map_cnt; i++) {
+ if (prog->aux->used_maps[i]->map_type != BPF_MAP_TYPE_INSN_ARRAY)
+ continue;
+
+ err = bpf_insn_array_ready(prog->aux->used_maps[i]);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
/* last field in 'union bpf_attr' used by this command */
#define BPF_PROG_LOAD_LAST_FIELD keyring_id
@@ -3067,6 +3093,10 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
if (err < 0)
goto free_used_maps;
+ err = bpf_prog_mark_insn_arrays_ready(prog);
+ if (err < 0)
+ goto free_used_maps;
+
err = bpf_prog_alloc_id(prog);
if (err)
goto free_used_maps;
@@ -5019,19 +5049,19 @@ static int bpf_prog_get_info_by_fd(struct file *file,
struct bpf_insn *insns_sanitized;
bool fault;
- if (prog->blinded && !bpf_dump_raw_ok(file->f_cred)) {
+ if (!prog->blinded || bpf_dump_raw_ok(file->f_cred)) {
+ insns_sanitized = bpf_insn_prepare_dump(prog, file->f_cred);
+ if (!insns_sanitized)
+ return -ENOMEM;
+ uinsns = u64_to_user_ptr(info.xlated_prog_insns);
+ ulen = min_t(u32, info.xlated_prog_len, ulen);
+ fault = copy_to_user(uinsns, insns_sanitized, ulen);
+ kfree(insns_sanitized);
+ if (fault)
+ return -EFAULT;
+ } else {
info.xlated_prog_insns = 0;
- goto done;
}
- insns_sanitized = bpf_insn_prepare_dump(prog, file->f_cred);
- if (!insns_sanitized)
- return -ENOMEM;
- uinsns = u64_to_user_ptr(info.xlated_prog_insns);
- ulen = min_t(u32, info.xlated_prog_len, ulen);
- fault = copy_to_user(uinsns, insns_sanitized, ulen);
- kfree(insns_sanitized);
- if (fault)
- return -EFAULT;
}
if (bpf_prog_is_offloaded(prog->aux)) {
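
The flag validation that bpf_map_check_op_flags() now centralizes is unchanged from user space's point of view: BPF_F_LOCK is only accepted when the map value actually carries a struct bpf_spin_lock. A hedged libbpf sketch; the map fd, key width and value layout are purely illustrative:

#include <bpf/bpf.h>
#include <errno.h>
#include <stdio.h>
#include <string.h>

/* Mirrors the BPF-side value layout; must contain a bpf_spin_lock. */
struct val {
	struct bpf_spin_lock lock;
	long counter;
};

/* Read an element under BPF_F_LOCK; fails with EINVAL if the map has no spin lock field. */
static int locked_read(int map_fd, unsigned int key, struct val *out)
{
	int err = bpf_map_lookup_elem_flags(map_fd, &key, out, BPF_F_LOCK);

	if (err)
		fprintf(stderr, "lookup with BPF_F_LOCK failed: %s\n", strerror(errno));
	return err;
}
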
diff --git a/kernel/bpf/token.c b/kernel/bpf/token.c
index 0bbe412f854e..feecd8f4dbf9 100644
--- a/kernel/bpf/token.c
+++ b/kernel/bpf/token.c
@@ -110,16 +110,15 @@ const struct file_operations bpf_token_fops = {
int bpf_token_create(union bpf_attr *attr)
{
+ struct bpf_token *token __free(kfree) = NULL;
struct bpf_mount_opts *mnt_opts;
- struct bpf_token *token = NULL;
struct user_namespace *userns;
struct inode *inode;
- struct file *file;
CLASS(fd, f)(attr->token_create.bpffs_fd);
struct path path;
struct super_block *sb;
umode_t mode;
- int err, fd;
+ int err;
if (fd_empty(f))
return -EBADF;
@@ -166,23 +165,20 @@ int bpf_token_create(union bpf_attr *attr)
inode->i_fop = &bpf_token_fops;
clear_nlink(inode); /* make sure it is unlinked */
- file = alloc_file_pseudo(inode, path.mnt, BPF_TOKEN_INODE_NAME, O_RDWR, &bpf_token_fops);
- if (IS_ERR(file)) {
- iput(inode);
- return PTR_ERR(file);
- }
+ FD_PREPARE(fdf, O_CLOEXEC,
+ alloc_file_pseudo(inode, path.mnt, BPF_TOKEN_INODE_NAME,
+ O_RDWR, &bpf_token_fops));
+ if (fdf.err)
+ return fdf.err;
token = kzalloc(sizeof(*token), GFP_USER);
- if (!token) {
- err = -ENOMEM;
- goto out_file;
- }
+ if (!token)
+ return -ENOMEM;
atomic64_set(&token->refcnt, 1);
- /* remember bpffs owning userns for future ns_capable() checks */
- token->userns = get_user_ns(userns);
-
+ /* remember bpffs owning userns for future ns_capable() checks. */
+ token->userns = userns;
token->allowed_cmds = mnt_opts->delegate_cmds;
token->allowed_maps = mnt_opts->delegate_maps;
token->allowed_progs = mnt_opts->delegate_progs;
@@ -190,24 +186,11 @@ int bpf_token_create(union bpf_attr *attr)
err = security_bpf_token_create(token, attr, &path);
if (err)
- goto out_token;
-
- fd = get_unused_fd_flags(O_CLOEXEC);
- if (fd < 0) {
- err = fd;
- goto out_token;
- }
-
- file->private_data = token;
- fd_install(fd, file);
-
- return fd;
+ return err;
-out_token:
- bpf_token_free(token);
-out_file:
- fput(file);
- return err;
+ get_user_ns(token->userns);
+ fd_prepare_file(fdf)->private_data = no_free_ptr(token);
+ return fd_publish(fdf);
}
int bpf_token_get_info_by_fd(struct bpf_token *token,
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index 5949095e51c3..976d89011b15 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -175,23 +175,42 @@ out:
return tr;
}
-static int unregister_fentry(struct bpf_trampoline *tr, void *old_addr)
+static int bpf_trampoline_update_fentry(struct bpf_trampoline *tr, u32 orig_flags,
+ void *old_addr, void *new_addr)
{
+ enum bpf_text_poke_type new_t = BPF_MOD_CALL, old_t = BPF_MOD_CALL;
void *ip = tr->func.addr;
+
+ if (!new_addr)
+ new_t = BPF_MOD_NOP;
+ else if (bpf_trampoline_use_jmp(tr->flags))
+ new_t = BPF_MOD_JUMP;
+
+ if (!old_addr)
+ old_t = BPF_MOD_NOP;
+ else if (bpf_trampoline_use_jmp(orig_flags))
+ old_t = BPF_MOD_JUMP;
+
+ return bpf_arch_text_poke(ip, old_t, new_t, old_addr, new_addr);
+}
+
+static int unregister_fentry(struct bpf_trampoline *tr, u32 orig_flags,
+ void *old_addr)
+{
int ret;
if (tr->func.ftrace_managed)
ret = unregister_ftrace_direct(tr->fops, (long)old_addr, false);
else
- ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, NULL);
+ ret = bpf_trampoline_update_fentry(tr, orig_flags, old_addr, NULL);
return ret;
}
-static int modify_fentry(struct bpf_trampoline *tr, void *old_addr, void *new_addr,
+static int modify_fentry(struct bpf_trampoline *tr, u32 orig_flags,
+ void *old_addr, void *new_addr,
bool lock_direct_mutex)
{
- void *ip = tr->func.addr;
int ret;
if (tr->func.ftrace_managed) {
@@ -200,7 +219,8 @@ static int modify_fentry(struct bpf_trampoline *tr, void *old_addr, void *new_ad
else
ret = modify_ftrace_direct_nolock(tr->fops, (long)new_addr);
} else {
- ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, new_addr);
+ ret = bpf_trampoline_update_fentry(tr, orig_flags, old_addr,
+ new_addr);
}
return ret;
}
@@ -220,10 +240,12 @@ static int register_fentry(struct bpf_trampoline *tr, void *new_addr)
}
if (tr->func.ftrace_managed) {
- ftrace_set_filter_ip(tr->fops, (unsigned long)ip, 0, 1);
+ ret = ftrace_set_filter_ip(tr->fops, (unsigned long)ip, 0, 1);
+ if (ret)
+ return ret;
ret = register_ftrace_direct(tr->fops, (long)new_addr);
} else {
- ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, NULL, new_addr);
+ ret = bpf_trampoline_update_fentry(tr, 0, NULL, new_addr);
}
return ret;
@@ -334,8 +356,9 @@ static void bpf_tramp_image_put(struct bpf_tramp_image *im)
* call_rcu_tasks() is not necessary.
*/
if (im->ip_after_call) {
- int err = bpf_arch_text_poke(im->ip_after_call, BPF_MOD_JUMP,
- NULL, im->ip_epilogue);
+ int err = bpf_arch_text_poke(im->ip_after_call, BPF_MOD_NOP,
+ BPF_MOD_JUMP, NULL,
+ im->ip_epilogue);
WARN_ON(err);
if (IS_ENABLED(CONFIG_TASKS_RCU))
call_rcu_tasks(&im->rcu, __bpf_tramp_image_put_rcu_tasks);
@@ -408,7 +431,7 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mut
return PTR_ERR(tlinks);
if (total == 0) {
- err = unregister_fentry(tr, tr->cur_image->image);
+ err = unregister_fentry(tr, orig_flags, tr->cur_image->image);
bpf_tramp_image_put(tr->cur_image);
tr->cur_image = NULL;
goto out;
@@ -432,9 +455,20 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mut
#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
again:
- if ((tr->flags & BPF_TRAMP_F_SHARE_IPMODIFY) &&
- (tr->flags & BPF_TRAMP_F_CALL_ORIG))
- tr->flags |= BPF_TRAMP_F_ORIG_STACK;
+ if (tr->flags & BPF_TRAMP_F_CALL_ORIG) {
+ if (tr->flags & BPF_TRAMP_F_SHARE_IPMODIFY) {
+			/* BPF_TRAMP_F_SKIP_FRAME may have been cleared on the
+			 * first try; reset it for the second try.
+ */
+ tr->flags |= BPF_TRAMP_F_ORIG_STACK | BPF_TRAMP_F_SKIP_FRAME;
+ } else if (IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_JMP)) {
+ /* Use "jmp" instead of "call" for the trampoline
+ * in the origin call case, and we don't need to
+ * skip the frame.
+ */
+ tr->flags &= ~BPF_TRAMP_F_SKIP_FRAME;
+ }
+ }
#endif
size = arch_bpf_trampoline_size(&tr->func.model, tr->flags,
@@ -465,10 +499,18 @@ again:
if (err)
goto out_free;
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_JMP
+ if (bpf_trampoline_use_jmp(tr->flags))
+ tr->fops->flags |= FTRACE_OPS_FL_JMP;
+ else
+ tr->fops->flags &= ~FTRACE_OPS_FL_JMP;
+#endif
+
WARN_ON(tr->cur_image && total == 0);
if (tr->cur_image)
/* progs already running at this address */
- err = modify_fentry(tr, tr->cur_image->image, im->image, lock_direct_mutex);
+ err = modify_fentry(tr, orig_flags, tr->cur_image->image,
+ im->image, lock_direct_mutex);
else
/* first time registering */
err = register_fentry(tr, im->image);
@@ -479,11 +521,6 @@ again:
* BPF_TRAMP_F_SHARE_IPMODIFY is set, we can generate the
* trampoline again, and retry register.
*/
- /* reset fops->func and fops->trampoline for re-register */
- tr->fops->func = NULL;
- tr->fops->trampoline = 0;
-
- /* free im memory and reallocate later */
bpf_tramp_image_free(im);
goto again;
}
@@ -496,8 +533,15 @@ again:
tr->cur_image = im;
out:
/* If any error happens, restore previous flags */
- if (err)
+ if (err) {
tr->flags = orig_flags;
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_JMP
+ if (bpf_trampoline_use_jmp(tr->flags))
+ tr->fops->flags |= FTRACE_OPS_FL_JMP;
+ else
+ tr->fops->flags &= ~FTRACE_OPS_FL_JMP;
+#endif
+ }
kfree(tlinks);
return err;
@@ -573,7 +617,8 @@ static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link,
if (err)
return err;
tr->extension_prog = link->link.prog;
- return bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP, NULL,
+ return bpf_arch_text_poke(tr->func.addr, BPF_MOD_NOP,
+ BPF_MOD_JUMP, NULL,
link->link.prog->bpf_func);
}
if (cnt >= BPF_MAX_TRAMP_LINKS)
@@ -621,6 +666,7 @@ static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link,
if (kind == BPF_TRAMP_REPLACE) {
WARN_ON_ONCE(!tr->extension_prog);
err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP,
+ BPF_MOD_NOP,
tr->extension_prog->bpf_func, NULL);
tr->extension_prog = NULL;
guard(mutex)(&tgt_prog->aux->ext_mutex);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index ff40e5e65c43..f0ca69f888fa 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -209,8 +209,6 @@ static void invalidate_non_owning_refs(struct bpf_verifier_env *env);
static bool in_rbtree_lock_required_cb(struct bpf_verifier_env *env);
static int ref_set_non_owning(struct bpf_verifier_env *env,
struct bpf_reg_state *reg);
-static void specialize_kfunc(struct bpf_verifier_env *env,
- u32 func_id, u16 offset, unsigned long *addr);
static bool is_trusted_reg(const struct bpf_reg_state *reg);
static bool bpf_map_ptr_poisoned(const struct bpf_insn_aux_data *aux)
@@ -515,6 +513,7 @@ static bool is_callback_calling_kfunc(u32 btf_id);
static bool is_bpf_throw_kfunc(struct bpf_insn *insn);
static bool is_bpf_wq_set_callback_impl_kfunc(u32 btf_id);
+static bool is_task_work_add_kfunc(u32 func_id);
static bool is_sync_callback_calling_function(enum bpf_func_id func_id)
{
@@ -547,6 +546,21 @@ static bool is_async_callback_calling_insn(struct bpf_insn *insn)
(bpf_pseudo_kfunc_call(insn) && is_async_callback_calling_kfunc(insn->imm));
}
+static bool is_async_cb_sleepable(struct bpf_verifier_env *env, struct bpf_insn *insn)
+{
+ /* bpf_timer callbacks are never sleepable. */
+ if (bpf_helper_call(insn) && insn->imm == BPF_FUNC_timer_set_callback)
+ return false;
+
+ /* bpf_wq and bpf_task_work callbacks are always sleepable. */
+ if (bpf_pseudo_kfunc_call(insn) && insn->off == 0 &&
+ (is_bpf_wq_set_callback_impl_kfunc(insn->imm) || is_task_work_add_kfunc(insn->imm)))
+ return true;
+
+ verifier_bug(env, "unhandled async callback in is_async_cb_sleepable");
+ return false;
+}
+
static bool is_may_goto_insn(struct bpf_insn *insn)
{
return insn->code == (BPF_JMP | BPF_JCOND) && insn->src_reg == BPF_MAY_GOTO;
@@ -676,6 +690,8 @@ static enum bpf_dynptr_type arg_to_dynptr_type(enum bpf_arg_type arg_type)
return BPF_DYNPTR_TYPE_XDP;
case DYNPTR_TYPE_SKB_META:
return BPF_DYNPTR_TYPE_SKB_META;
+ case DYNPTR_TYPE_FILE:
+ return BPF_DYNPTR_TYPE_FILE;
default:
return BPF_DYNPTR_TYPE_INVALID;
}
@@ -694,6 +710,8 @@ static enum bpf_type_flag get_dynptr_type_flag(enum bpf_dynptr_type type)
return DYNPTR_TYPE_XDP;
case BPF_DYNPTR_TYPE_SKB_META:
return DYNPTR_TYPE_SKB_META;
+ case BPF_DYNPTR_TYPE_FILE:
+ return DYNPTR_TYPE_FILE;
default:
return 0;
}
@@ -701,7 +719,7 @@ static enum bpf_type_flag get_dynptr_type_flag(enum bpf_dynptr_type type)
static bool dynptr_type_refcounted(enum bpf_dynptr_type type)
{
- return type == BPF_DYNPTR_TYPE_RINGBUF;
+ return type == BPF_DYNPTR_TYPE_RINGBUF || type == BPF_DYNPTR_TYPE_FILE;
}
static void __mark_dynptr_reg(struct bpf_reg_state *reg,
@@ -812,6 +830,15 @@ static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_re
struct bpf_func_state *state = func(env, reg);
int spi, ref_obj_id, i;
+ /*
+ * This can only be set for PTR_TO_STACK, as CONST_PTR_TO_DYNPTR cannot
+ * be released by any dynptr helper. Hence, unmark_stack_slots_dynptr
+ * is safe to do directly.
+ */
+ if (reg->type == CONST_PTR_TO_DYNPTR) {
+ verifier_bug(env, "CONST_PTR_TO_DYNPTR cannot be released");
+ return -EFAULT;
+ }
spi = dynptr_get_spi(env, reg);
if (spi < 0)
return spi;
@@ -1410,7 +1437,7 @@ static int copy_reference_state(struct bpf_verifier_state *dst, const struct bpf
dst->acquired_refs = src->acquired_refs;
dst->active_locks = src->active_locks;
dst->active_preempt_locks = src->active_preempt_locks;
- dst->active_rcu_lock = src->active_rcu_lock;
+ dst->active_rcu_locks = src->active_rcu_locks;
dst->active_irq_id = src->active_irq_id;
dst->active_lock_id = src->active_lock_id;
dst->active_lock_ptr = src->active_lock_ptr;
@@ -2093,7 +2120,7 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env,
elem = kzalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL_ACCOUNT);
if (!elem)
- return NULL;
+ return ERR_PTR(-ENOMEM);
elem->insn_idx = insn_idx;
elem->prev_insn_idx = prev_insn_idx;
@@ -2103,12 +2130,12 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env,
env->stack_size++;
err = copy_verifier_state(&elem->st, cur);
if (err)
- return NULL;
+ return ERR_PTR(-ENOMEM);
elem->st.speculative |= speculative;
if (env->stack_size > BPF_COMPLEXITY_LIMIT_JMP_SEQ) {
verbose(env, "The sequence of %d jumps is too complex.\n",
env->stack_size);
- return NULL;
+ return ERR_PTR(-E2BIG);
}
if (elem->st.parent) {
++elem->st.parent->branches;
@@ -2903,7 +2930,7 @@ static struct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env,
elem = kzalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL_ACCOUNT);
if (!elem)
- return NULL;
+ return ERR_PTR(-ENOMEM);
elem->insn_idx = insn_idx;
elem->prev_insn_idx = prev_insn_idx;
@@ -2915,7 +2942,7 @@ static struct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env,
verbose(env,
"The sequence of %d jumps is too complex for async cb.\n",
env->stack_size);
- return NULL;
+ return ERR_PTR(-E2BIG);
}
/* Unlike push_stack() do not copy_verifier_state().
* The caller state doesn't matter.
@@ -2926,7 +2953,7 @@ static struct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env,
elem->st.in_sleepable = is_sleepable;
frame = kzalloc(sizeof(*frame), GFP_KERNEL_ACCOUNT);
if (!frame)
- return NULL;
+ return ERR_PTR(-ENOMEM);
init_func_state(env, frame,
BPF_MAIN_FUNC /* callsite */,
0 /* frameno within this callchain */,
@@ -3097,6 +3124,9 @@ struct bpf_kfunc_btf_tab {
u32 nr_descs;
};
+static int specialize_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_desc *desc,
+ int insn_idx);
+
static int kfunc_desc_cmp_by_id_off(const void *a, const void *b)
{
const struct bpf_kfunc_desc *d0 = a;
@@ -3114,7 +3144,7 @@ static int kfunc_btf_cmp_by_off(const void *a, const void *b)
return d0->offset - d1->offset;
}
-static const struct bpf_kfunc_desc *
+static struct bpf_kfunc_desc *
find_kfunc_desc(const struct bpf_prog *prog, u32 func_id, u16 offset)
{
struct bpf_kfunc_desc desc = {
@@ -3237,12 +3267,12 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset)
{
const struct btf_type *func, *func_proto;
struct bpf_kfunc_btf_tab *btf_tab;
+ struct btf_func_model func_model;
struct bpf_kfunc_desc_tab *tab;
struct bpf_prog_aux *prog_aux;
struct bpf_kfunc_desc *desc;
const char *func_name;
struct btf *desc_btf;
- unsigned long call_imm;
unsigned long addr;
int err;
@@ -3326,19 +3356,6 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset)
func_name);
return -EINVAL;
}
- specialize_kfunc(env, func_id, offset, &addr);
-
- if (bpf_jit_supports_far_kfunc_call()) {
- call_imm = func_id;
- } else {
- call_imm = BPF_CALL_IMM(addr);
- /* Check whether the relative offset overflows desc->imm */
- if ((unsigned long)(s32)call_imm != call_imm) {
- verbose(env, "address of kernel function %s is out of range\n",
- func_name);
- return -EINVAL;
- }
- }
if (bpf_dev_bound_kfunc_id(func_id)) {
err = bpf_dev_bound_kfunc_check(&env->log, prog_aux);
@@ -3346,18 +3363,20 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset)
return err;
}
+ err = btf_distill_func_proto(&env->log, desc_btf,
+ func_proto, func_name,
+ &func_model);
+ if (err)
+ return err;
+
desc = &tab->descs[tab->nr_descs++];
desc->func_id = func_id;
- desc->imm = call_imm;
desc->offset = offset;
desc->addr = addr;
- err = btf_distill_func_proto(&env->log, desc_btf,
- func_proto, func_name,
- &desc->func_model);
- if (!err)
- sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]),
- kfunc_desc_cmp_by_id_off, NULL);
- return err;
+ desc->func_model = func_model;
+ sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]),
+ kfunc_desc_cmp_by_id_off, NULL);
+ return 0;
}
static int kfunc_desc_cmp_by_imm_off(const void *a, const void *b)
@@ -3372,16 +3391,43 @@ static int kfunc_desc_cmp_by_imm_off(const void *a, const void *b)
return 0;
}
-static void sort_kfunc_descs_by_imm_off(struct bpf_prog *prog)
+static int set_kfunc_desc_imm(struct bpf_verifier_env *env, struct bpf_kfunc_desc *desc)
+{
+ unsigned long call_imm;
+
+ if (bpf_jit_supports_far_kfunc_call()) {
+ call_imm = desc->func_id;
+ } else {
+ call_imm = BPF_CALL_IMM(desc->addr);
+ /* Check whether the relative offset overflows desc->imm */
+ if ((unsigned long)(s32)call_imm != call_imm) {
+ verbose(env, "address of kernel func_id %u is out of range\n",
+ desc->func_id);
+ return -EINVAL;
+ }
+ }
+ desc->imm = call_imm;
+ return 0;
+}
+
+static int sort_kfunc_descs_by_imm_off(struct bpf_verifier_env *env)
{
struct bpf_kfunc_desc_tab *tab;
+ int i, err;
- tab = prog->aux->kfunc_tab;
+ tab = env->prog->aux->kfunc_tab;
if (!tab)
- return;
+ return 0;
+
+ for (i = 0; i < tab->nr_descs; i++) {
+ err = set_kfunc_desc_imm(env, &tab->descs[i]);
+ if (err)
+ return err;
+ }
sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]),
kfunc_desc_cmp_by_imm_off, NULL);
+ return 0;
}
bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog)
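The encodability test in set_kfunc_desc_imm() is a sign-extension round trip: the relative offset produced by BPF_CALL_IMM() survives the cast to s32 and back only when it fits the instruction's 32-bit imm field. A minimal userland sketch of just that check (assuming a 64-bit build; the offsets below are made up for illustration and BPF_CALL_IMM itself is a kernel macro, stood in for by a plain value):

#include <stdint.h>
#include <stdio.h>

static int fits_in_imm(unsigned long rel_off)
{
	/* Truncate to s32, sign-extend back, compare: any offset that
	 * does not survive the round trip cannot be stored in insn->imm.
	 */
	return (unsigned long)(int32_t)rel_off == rel_off;
}

int main(void)
{
	printf("%d\n", fits_in_imm(0x7fffffffUL));      /* 1: max positive s32 */
	printf("%d\n", fits_in_imm(0x100000000UL));     /* 0: needs 33 bits    */
	printf("%d\n", fits_in_imm((unsigned long)-8)); /* 1: small negative offset */
	return 0;
}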
@@ -3509,8 +3555,12 @@ static int check_subprogs(struct bpf_verifier_env *env)
subprog[cur_subprog].has_ld_abs = true;
if (BPF_CLASS(code) != BPF_JMP && BPF_CLASS(code) != BPF_JMP32)
goto next;
- if (BPF_OP(code) == BPF_EXIT || BPF_OP(code) == BPF_CALL)
+ if (BPF_OP(code) == BPF_CALL)
goto next;
+ if (BPF_OP(code) == BPF_EXIT) {
+ subprog[cur_subprog].exit_idx = i;
+ goto next;
+ }
off = i + bpf_jmp_offset(&insn[i]) + 1;
if (off < subprog_start || off >= subprog_end) {
verbose(env, "jump out of range from insn %d to %d\n", i, off);
@@ -4392,6 +4442,11 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
bt_reg_mask(bt));
return -EFAULT;
}
+ if (insn->src_reg == BPF_REG_0 && insn->imm == BPF_FUNC_tail_call
+ && subseq_idx - idx != 1) {
+ if (bt_subprog_enter(bt))
+ return -EFAULT;
+ }
} else if (opcode == BPF_EXIT) {
bool r0_precise;
@@ -5826,8 +5881,7 @@ bad_type:
static bool in_sleepable(struct bpf_verifier_env *env)
{
- return env->prog->sleepable ||
- (env->cur_state && env->cur_state->in_sleepable);
+ return env->cur_state->in_sleepable;
}
/* The non-sleepable programs and sleepable programs with explicit bpf_rcu_read_lock()
@@ -5835,7 +5889,7 @@ static bool in_sleepable(struct bpf_verifier_env *env)
*/
static bool in_rcu_cs(struct bpf_verifier_env *env)
{
- return env->cur_state->active_rcu_lock ||
+ return env->cur_state->active_rcu_locks ||
env->cur_state->active_locks ||
!in_sleepable(env);
}
@@ -5988,6 +6042,18 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno,
return 0;
}
+/*
+ * Return the size of the memory region accessible from a pointer to map value.
+ * For INSN_ARRAY maps, the whole bpf_insn_array->ips array is accessible.
+ */
+static u32 map_mem_size(const struct bpf_map *map)
+{
+ if (map->map_type == BPF_MAP_TYPE_INSN_ARRAY)
+ return map->max_entries * sizeof(long);
+
+ return map->value_size;
+}
+
/* check read/write into a map element with possible variable offset */
static int check_map_access(struct bpf_verifier_env *env, u32 regno,
int off, int size, bool zero_size_allowed,
@@ -5997,11 +6063,11 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
struct bpf_func_state *state = vstate->frame[vstate->curframe];
struct bpf_reg_state *reg = &state->regs[regno];
struct bpf_map *map = reg->map_ptr;
+ u32 mem_size = map_mem_size(map);
struct btf_record *rec;
int err, i;
- err = check_mem_region_access(env, regno, off, size, map->value_size,
- zero_size_allowed);
+ err = check_mem_region_access(env, regno, off, size, mem_size, zero_size_allowed);
if (err)
return err;
@@ -6416,6 +6482,8 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
break;
case PTR_TO_MAP_VALUE:
pointer_desc = "value ";
+ if (reg->map_ptr->map_type == BPF_MAP_TYPE_INSN_ARRAY)
+ strict = true;
break;
case PTR_TO_CTX:
pointer_desc = "context ";
@@ -7039,6 +7107,9 @@ BTF_TYPE_SAFE_RCU(struct cgroup_subsys_state) {
/* RCU trusted: these fields are trusted in RCU CS and can be NULL */
BTF_TYPE_SAFE_RCU_OR_NULL(struct mm_struct) {
struct file __rcu *exe_file;
+#ifdef CONFIG_MEMCG
+ struct task_struct __rcu *owner;
+#endif
};
/* skb->sk, req->sk are not RCU protected, but we mark them as such
@@ -7078,6 +7149,11 @@ BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct socket) {
struct sock *sk;
};
+BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct vm_area_struct) {
+ struct mm_struct *vm_mm;
+ struct file *vm_file;
+};
+
static bool type_is_rcu(struct bpf_verifier_env *env,
struct bpf_reg_state *reg,
const char *field_name, u32 btf_id)
@@ -7119,6 +7195,7 @@ static bool type_is_trusted_or_null(struct bpf_verifier_env *env,
{
BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct socket));
BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct dentry));
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct vm_area_struct));
return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id,
"__safe_trusted_or_null");
@@ -7502,10 +7579,14 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
} else if (t == BPF_READ && value_regno >= 0) {
struct bpf_map *map = reg->map_ptr;
- /* if map is read-only, track its contents as scalars */
+ /*
+ * If map is read-only, track its contents as scalars,
+ * unless it is an insn array (see the special case below)
+ */
if (tnum_is_const(reg->var_off) &&
bpf_map_is_rdonly(map) &&
- map->ops->map_direct_value_addr) {
+ map->ops->map_direct_value_addr &&
+ map->map_type != BPF_MAP_TYPE_INSN_ARRAY) {
int map_off = off + reg->var_off.value;
u64 val = 0;
@@ -7516,6 +7597,14 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
regs[value_regno].type = SCALAR_VALUE;
__mark_reg_known(&regs[value_regno], val);
+ } else if (map->map_type == BPF_MAP_TYPE_INSN_ARRAY) {
+ if (bpf_size != BPF_DW) {
+ verbose(env, "Invalid read of %d bytes from insn_array\n",
+ size);
+ return -EACCES;
+ }
+ copy_register_state(&regs[value_regno], reg);
+ regs[value_regno].type = PTR_TO_INSN;
} else {
mark_reg_unknown(env, regs, value_regno);
}
@@ -8464,6 +8553,9 @@ static int check_map_field_pointer(struct bpf_verifier_env *env, u32 regno,
case BPF_TASK_WORK:
field_off = map->record->task_work_off;
break;
+ case BPF_WORKQUEUE:
+ field_off = map->record->wq_off;
+ break;
default:
verifier_bug(env, "unsupported BTF field type: %s\n", struct_name);
return -EINVAL;
@@ -8505,13 +8597,17 @@ static int process_wq_func(struct bpf_verifier_env *env, int regno,
{
struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
struct bpf_map *map = reg->map_ptr;
- u64 val = reg->var_off.value;
+ int err;
- if (map->record->wq_off != val + reg->off) {
- verbose(env, "off %lld doesn't point to 'struct bpf_wq' that is at %d\n",
- val + reg->off, map->record->wq_off);
- return -EINVAL;
+ err = check_map_field_pointer(env, regno, BPF_WORKQUEUE);
+ if (err)
+ return err;
+
+ if (meta->map.ptr) {
+ verifier_bug(env, "Two map pointers in a bpf_wq helper");
+ return -EFAULT;
}
+
meta->map.uid = reg->map_uid;
meta->map.ptr = map;
return 0;
@@ -8866,7 +8962,7 @@ static int widen_imprecise_scalars(struct bpf_verifier_env *env,
struct bpf_verifier_state *cur)
{
struct bpf_func_state *fold, *fcur;
- int i, fr;
+ int i, fr, num_slots;
reset_idmap_scratch(env);
for (fr = old->curframe; fr >= 0; fr--) {
@@ -8879,7 +8975,9 @@ static int widen_imprecise_scalars(struct bpf_verifier_env *env,
&fcur->regs[i],
&env->idmap_scratch);
- for (i = 0; i < fold->allocated_stack / BPF_REG_SIZE; i++) {
+ num_slots = min(fold->allocated_stack / BPF_REG_SIZE,
+ fcur->allocated_stack / BPF_REG_SIZE);
+ for (i = 0; i < num_slots; i++) {
if (!is_spilled_reg(&fold->stack[i]) ||
!is_spilled_reg(&fcur->stack[i]))
continue;
@@ -9014,8 +9112,8 @@ static int process_iter_next_call(struct bpf_verifier_env *env, int insn_idx,
prev_st = find_prev_entry(env, cur_st->parent, insn_idx);
/* branch out active iter state */
queued_st = push_stack(env, insn_idx + 1, insn_idx, false);
- if (!queued_st)
- return -ENOMEM;
+ if (IS_ERR(queued_st))
+ return PTR_ERR(queued_st);
queued_iter = get_iter_from_state(queued_st, meta);
queued_iter->iter.state = BPF_ITER_STATE_ACTIVE;
@@ -10052,6 +10150,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
func_id != BPF_FUNC_map_push_elem)
goto error;
break;
+ case BPF_MAP_TYPE_INSN_ARRAY:
+ goto error;
default:
break;
}
@@ -10366,8 +10466,6 @@ typedef int (*set_callee_state_fn)(struct bpf_verifier_env *env,
struct bpf_func_state *callee,
int insn_idx);
-static bool is_task_work_add_kfunc(u32 func_id);
-
static int set_callee_state(struct bpf_verifier_env *env,
struct bpf_func_state *caller,
struct bpf_func_state *callee, int insn_idx);
@@ -10586,10 +10684,9 @@ static int push_callback_call(struct bpf_verifier_env *env, struct bpf_insn *ins
env->subprog_info[subprog].is_async_cb = true;
async_cb = push_async_cb(env, env->subprog_info[subprog].start,
insn_idx, subprog,
- is_bpf_wq_set_callback_impl_kfunc(insn->imm) ||
- is_task_work_add_kfunc(insn->imm));
- if (!async_cb)
- return -EFAULT;
+ is_async_cb_sleepable(env, insn));
+ if (IS_ERR(async_cb))
+ return PTR_ERR(async_cb);
callee = async_cb->frame[0];
callee->async_entry_cnt = caller->async_entry_cnt + 1;
@@ -10605,8 +10702,8 @@ static int push_callback_call(struct bpf_verifier_env *env, struct bpf_insn *ins
* proceed with next instruction within current frame.
*/
callback_state = push_stack(env, env->subprog_info[subprog].start, insn_idx, false);
- if (!callback_state)
- return -ENOMEM;
+ if (IS_ERR(callback_state))
+ return PTR_ERR(callback_state);
err = setup_func_entry(env, subprog, insn_idx, set_callee_state_cb,
callback_state);
@@ -10646,7 +10743,7 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
}
if (env->subprog_info[subprog].might_sleep &&
- (env->cur_state->active_rcu_lock || env->cur_state->active_preempt_locks ||
+ (env->cur_state->active_rcu_locks || env->cur_state->active_preempt_locks ||
env->cur_state->active_irq_id || !in_sleepable(env))) {
verbose(env, "global functions that may sleep are not allowed in non-sleepable context,\n"
"i.e., in a RCU/IRQ/preempt-disabled section, or in\n"
@@ -10660,8 +10757,9 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
return err;
}
- verbose(env, "Func#%d ('%s') is global and assumed valid.\n",
- subprog, sub_name);
+ if (env->log.level & BPF_LOG_LEVEL)
+ verbose(env, "Func#%d ('%s') is global and assumed valid.\n",
+ subprog, sub_name);
if (env->subprog_info[subprog].changes_pkt_data)
clear_all_pkt_pointers(env);
/* mark global subprog for verifying after main prog */
@@ -10974,6 +11072,10 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
bool in_callback_fn;
int err;
+ err = bpf_update_live_stack(env);
+ if (err)
+ return err;
+
callee = state->frame[state->curframe];
r0 = &callee->regs[BPF_REG_0];
if (r0->type == PTR_TO_STACK) {
@@ -11224,7 +11326,7 @@ static int check_resource_leak(struct bpf_verifier_env *env, bool exception_exit
return -EINVAL;
}
- if (check_lock && env->cur_state->active_rcu_lock) {
+ if (check_lock && env->cur_state->active_rcu_locks) {
verbose(env, "%s cannot be used inside bpf_rcu_read_lock-ed region\n", prefix);
return -EINVAL;
}
@@ -11359,6 +11461,15 @@ static int get_helper_proto(struct bpf_verifier_env *env, int func_id,
return *ptr && (*ptr)->func ? 0 : -EINVAL;
}
+/* Check if we're in a sleepable context. */
+static inline bool in_sleepable_context(struct bpf_verifier_env *env)
+{
+ return !env->cur_state->active_rcu_locks &&
+ !env->cur_state->active_preempt_locks &&
+ !env->cur_state->active_irq_id &&
+ in_sleepable(env);
+}
+
static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
int *insn_idx_p)
{
@@ -11419,15 +11530,12 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
return err;
}
- if (env->cur_state->active_rcu_lock) {
+ if (env->cur_state->active_rcu_locks) {
if (fn->might_sleep) {
verbose(env, "sleepable helper %s#%d in rcu_read_lock region\n",
func_id_name(func_id), func_id);
return -EINVAL;
}
-
- if (in_sleepable(env) && is_storage_get_function(func_id))
- env->insn_aux_data[insn_idx].storage_get_func_atomic = true;
}
if (env->cur_state->active_preempt_locks) {
@@ -11436,9 +11544,6 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
func_id_name(func_id), func_id);
return -EINVAL;
}
-
- if (in_sleepable(env) && is_storage_get_function(func_id))
- env->insn_aux_data[insn_idx].storage_get_func_atomic = true;
}
if (env->cur_state->active_irq_id) {
@@ -11447,11 +11552,12 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
func_id_name(func_id), func_id);
return -EINVAL;
}
-
- if (in_sleepable(env) && is_storage_get_function(func_id))
- env->insn_aux_data[insn_idx].storage_get_func_atomic = true;
}
+ /* Track non-sleepable context for helpers. */
+ if (!in_sleepable_context(env))
+ env->insn_aux_data[insn_idx].non_sleepable = true;
+
meta.func_id = func_id;
/* check args */
for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) {
@@ -11482,15 +11588,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
if (meta.release_regno) {
err = -EINVAL;
- /* This can only be set for PTR_TO_STACK, as CONST_PTR_TO_DYNPTR cannot
- * be released by any dynptr helper. Hence, unmark_stack_slots_dynptr
- * is safe to do directly.
- */
if (arg_type_is_dynptr(fn->arg_type[meta.release_regno - BPF_REG_1])) {
- if (regs[meta.release_regno].type == CONST_PTR_TO_DYNPTR) {
- verifier_bug(env, "CONST_PTR_TO_DYNPTR cannot be released");
- return -EFAULT;
- }
err = unmark_stack_slots_dynptr(env, &regs[meta.release_regno]);
} else if (func_id == BPF_FUNC_kptr_xchg && meta.ref_obj_id) {
u32 ref_obj_id = meta.ref_obj_id;
@@ -11884,6 +11982,25 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
env->prog->call_get_func_ip = true;
}
+ if (func_id == BPF_FUNC_tail_call) {
+ if (env->cur_state->curframe) {
+ struct bpf_verifier_state *branch;
+
+ mark_reg_scratched(env, BPF_REG_0);
+ branch = push_stack(env, env->insn_idx + 1, env->insn_idx, false);
+ if (IS_ERR(branch))
+ return PTR_ERR(branch);
+ clear_all_pkt_pointers(env);
+ mark_reg_unknown(env, regs, BPF_REG_0);
+ err = prepare_func_exit(env, &env->insn_idx);
+ if (err)
+ return err;
+ env->insn_idx--;
+ } else {
+ changes_data = false;
+ }
+ }
+
if (changes_data)
clear_all_pkt_pointers(env);
return 0;
@@ -12258,9 +12375,11 @@ enum special_kfunc_type {
KF_bpf_res_spin_unlock,
KF_bpf_res_spin_lock_irqsave,
KF_bpf_res_spin_unlock_irqrestore,
+ KF_bpf_dynptr_from_file,
+ KF_bpf_dynptr_file_discard,
KF___bpf_trap,
- KF_bpf_task_work_schedule_signal,
- KF_bpf_task_work_schedule_resume,
+ KF_bpf_task_work_schedule_signal_impl,
+ KF_bpf_task_work_schedule_resume_impl,
};
BTF_ID_LIST(special_kfunc_list)
@@ -12330,14 +12449,16 @@ BTF_ID(func, bpf_res_spin_lock)
BTF_ID(func, bpf_res_spin_unlock)
BTF_ID(func, bpf_res_spin_lock_irqsave)
BTF_ID(func, bpf_res_spin_unlock_irqrestore)
+BTF_ID(func, bpf_dynptr_from_file)
+BTF_ID(func, bpf_dynptr_file_discard)
BTF_ID(func, __bpf_trap)
-BTF_ID(func, bpf_task_work_schedule_signal)
-BTF_ID(func, bpf_task_work_schedule_resume)
+BTF_ID(func, bpf_task_work_schedule_signal_impl)
+BTF_ID(func, bpf_task_work_schedule_resume_impl)
static bool is_task_work_add_kfunc(u32 func_id)
{
- return func_id == special_kfunc_list[KF_bpf_task_work_schedule_signal] ||
- func_id == special_kfunc_list[KF_bpf_task_work_schedule_resume];
+ return func_id == special_kfunc_list[KF_bpf_task_work_schedule_signal_impl] ||
+ func_id == special_kfunc_list[KF_bpf_task_work_schedule_resume_impl];
}
static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta)
@@ -13293,6 +13414,11 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
dynptr_arg_type |= DYNPTR_TYPE_XDP;
} else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_skb_meta]) {
dynptr_arg_type |= DYNPTR_TYPE_SKB_META;
+ } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_file]) {
+ dynptr_arg_type |= DYNPTR_TYPE_FILE;
+ } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_file_discard]) {
+ dynptr_arg_type |= DYNPTR_TYPE_FILE;
+ meta->release_regno = regno;
} else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_clone] &&
(dynptr_arg_type & MEM_UNINIT)) {
enum bpf_dynptr_type parent_type = meta->initialized_dynptr.type;
@@ -13827,9 +13953,9 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
struct bpf_reg_state *regs;
branch = push_stack(env, env->insn_idx + 1, env->insn_idx, false);
- if (!branch) {
+ if (IS_ERR(branch)) {
verbose(env, "failed to push state for failed lock acquisition\n");
- return -ENOMEM;
+ return PTR_ERR(branch);
}
regs = branch->frame[branch->curframe]->regs;
@@ -13861,6 +13987,10 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
return -EACCES;
}
+ /* Track non-sleepable context for kfuncs, same as for helpers. */
+ if (!in_sleepable_context(env))
+ insn_aux->non_sleepable = true;
+
/* Check the arguments */
err = check_kfunc_args(env, &meta, insn_idx);
if (err < 0)
@@ -13907,36 +14037,33 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
preempt_disable = is_kfunc_bpf_preempt_disable(&meta);
preempt_enable = is_kfunc_bpf_preempt_enable(&meta);
- if (env->cur_state->active_rcu_lock) {
+ if (rcu_lock) {
+ env->cur_state->active_rcu_locks++;
+ } else if (rcu_unlock) {
struct bpf_func_state *state;
struct bpf_reg_state *reg;
u32 clear_mask = (1 << STACK_SPILL) | (1 << STACK_ITER);
- if (in_rbtree_lock_required_cb(env) && (rcu_lock || rcu_unlock)) {
- verbose(env, "Calling bpf_rcu_read_{lock,unlock} in unnecessary rbtree callback\n");
- return -EACCES;
- }
-
- if (rcu_lock) {
- verbose(env, "nested rcu read lock (kernel function %s)\n", func_name);
+ if (env->cur_state->active_rcu_locks == 0) {
+ verbose(env, "unmatched rcu read unlock (kernel function %s)\n", func_name);
return -EINVAL;
- } else if (rcu_unlock) {
+ }
+ if (--env->cur_state->active_rcu_locks == 0) {
bpf_for_each_reg_in_vstate_mask(env->cur_state, state, reg, clear_mask, ({
if (reg->type & MEM_RCU) {
reg->type &= ~(MEM_RCU | PTR_MAYBE_NULL);
reg->type |= PTR_UNTRUSTED;
}
}));
- env->cur_state->active_rcu_lock = false;
- } else if (sleepable) {
- verbose(env, "kernel func %s is sleepable within rcu_read_lock region\n", func_name);
- return -EACCES;
}
- } else if (rcu_lock) {
- env->cur_state->active_rcu_lock = true;
- } else if (rcu_unlock) {
- verbose(env, "unmatched rcu read unlock (kernel function %s)\n", func_name);
- return -EINVAL;
+ } else if (sleepable && env->cur_state->active_rcu_locks) {
+ verbose(env, "kernel func %s is sleepable within rcu_read_lock region\n", func_name);
+ return -EACCES;
+ }
+
+ if (in_rbtree_lock_required_cb(env) && (rcu_lock || rcu_unlock)) {
+ verbose(env, "Calling bpf_rcu_read_{lock,unlock} in unnecessary rbtree callback\n");
+ return -EACCES;
}
if (env->cur_state->active_preempt_locks) {
@@ -13969,12 +14096,18 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
* PTR_TO_BTF_ID in bpf_kfunc_arg_meta, do the release now.
*/
if (meta.release_regno) {
- err = release_reference(env, regs[meta.release_regno].ref_obj_id);
- if (err) {
- verbose(env, "kfunc %s#%d reference has not been acquired before\n",
- func_name, meta.func_id);
- return err;
+ struct bpf_reg_state *reg = &regs[meta.release_regno];
+
+ if (meta.initialized_dynptr.ref_obj_id) {
+ err = unmark_stack_slots_dynptr(env, reg);
+ } else {
+ err = release_reference(env, reg->ref_obj_id);
+ if (err)
+ verbose(env, "kfunc %s#%d reference has not been acquired before\n",
+ func_name, meta.func_id);
}
+ if (err)
+ return err;
}
if (meta.func_id == special_kfunc_list[KF_bpf_list_push_front_impl] ||
@@ -14280,16 +14413,15 @@ struct bpf_sanitize_info {
bool mask_to_left;
};
-static struct bpf_verifier_state *
-sanitize_speculative_path(struct bpf_verifier_env *env,
- const struct bpf_insn *insn,
- u32 next_idx, u32 curr_idx)
+static int sanitize_speculative_path(struct bpf_verifier_env *env,
+ const struct bpf_insn *insn,
+ u32 next_idx, u32 curr_idx)
{
struct bpf_verifier_state *branch;
struct bpf_reg_state *regs;
branch = push_stack(env, next_idx, curr_idx, true);
- if (branch && insn) {
+ if (!IS_ERR(branch) && insn) {
regs = branch->frame[branch->curframe]->regs;
if (BPF_SRC(insn->code) == BPF_K) {
mark_reg_unknown(env, regs, insn->dst_reg);
@@ -14298,7 +14430,7 @@ sanitize_speculative_path(struct bpf_verifier_env *env,
mark_reg_unknown(env, regs, insn->src_reg);
}
}
- return branch;
+ return PTR_ERR_OR_ZERO(branch);
}
static int sanitize_ptr_alu(struct bpf_verifier_env *env,
@@ -14317,7 +14449,6 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env,
u8 opcode = BPF_OP(insn->code);
u32 alu_state, alu_limit;
struct bpf_reg_state tmp;
- bool ret;
int err;
if (can_skip_alu_sanitation(env, insn))
@@ -14390,11 +14521,12 @@ do_sim:
tmp = *dst_reg;
copy_register_state(dst_reg, ptr_reg);
}
- ret = sanitize_speculative_path(env, NULL, env->insn_idx + 1,
- env->insn_idx);
- if (!ptr_is_dst_reg && ret)
+ err = sanitize_speculative_path(env, NULL, env->insn_idx + 1, env->insn_idx);
+ if (err < 0)
+ return REASON_STACK;
+ if (!ptr_is_dst_reg)
*dst_reg = tmp;
- return !ret ? REASON_STACK : 0;
+ return 0;
}
static void sanitize_mark_insn_seen(struct bpf_verifier_env *env)
@@ -15948,6 +16080,30 @@ static int is_scalar_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_sta
s64 smin2 = is_jmp32 ? (s64)reg2->s32_min_value : reg2->smin_value;
s64 smax2 = is_jmp32 ? (s64)reg2->s32_max_value : reg2->smax_value;
+ if (reg1 == reg2) {
+ switch (opcode) {
+ case BPF_JGE:
+ case BPF_JLE:
+ case BPF_JSGE:
+ case BPF_JSLE:
+ case BPF_JEQ:
+ return 1;
+ case BPF_JGT:
+ case BPF_JLT:
+ case BPF_JSGT:
+ case BPF_JSLT:
+ case BPF_JNE:
+ return 0;
+ case BPF_JSET:
+ if (tnum_is_const(t1))
+ return t1.value != 0;
+ else
+ return (smin1 <= 0 && smax1 >= 0) ? -1 : 1;
+ default:
+ return -1;
+ }
+ }
+
switch (opcode) {
case BPF_JEQ:
/* constants, umin/umax and smin/smax checks would be
@@ -16394,6 +16550,13 @@ static int reg_set_min_max(struct bpf_verifier_env *env,
if (false_reg1->type != SCALAR_VALUE || false_reg2->type != SCALAR_VALUE)
return 0;
+ /* We compute branch direction for same SCALAR_VALUE registers in
+ * is_scalar_branch_taken(). For unknown branch directions (e.g., BPF_JSET)
+ * on the same registers, we don't need to adjust the min/max values.
+ */
+ if (false_reg1 == false_reg2)
+ return 0;
+
/* fallthrough (FALSE) branch */
regs_refine_cond_op(false_reg1, false_reg2, rev_opcode(opcode), is_jmp32);
reg_bounds_sync(false_reg1);
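Both hunks above rely on the fact that comparing a register with itself has a statically known outcome for every opcode except BPF_JSET with a possibly-zero value. A standalone sketch of that decision table, using a local enum rather than the kernel's BPF_* opcodes, purely to illustrate the 1/0/-1 return convention:

/* 1 = branch always taken, 0 = never taken, -1 = unknown */
enum op { JEQ, JNE, JGE, JGT, JLE, JLT, JSGE, JSGT, JSLE, JSLT, JSET };

static int self_branch_taken(enum op op, int is_const, long long val,
			     long long smin, long long smax)
{
	switch (op) {
	case JEQ: case JGE: case JLE: case JSGE: case JSLE:
		return 1;               /* x == x, x >= x, x <= x always hold */
	case JNE: case JGT: case JLT: case JSGT: case JSLT:
		return 0;               /* x != x, x > x, x < x never hold */
	case JSET:
		if (is_const)
			return val != 0;                  /* x & x == x */
		return (smin <= 0 && smax >= 0) ? -1 : 1; /* unknown iff 0 possible */
	}
	return -1;
}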
@@ -16714,8 +16877,8 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
/* branch out 'fallthrough' insn as a new state to explore */
queued_st = push_stack(env, idx + 1, idx, false);
- if (!queued_st)
- return -ENOMEM;
+ if (IS_ERR(queued_st))
+ return PTR_ERR(queued_st);
queued_st->may_goto_depth++;
if (prev_st)
@@ -16793,10 +16956,11 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
* the fall-through branch for simulation under speculative
* execution.
*/
- if (!env->bypass_spec_v1 &&
- !sanitize_speculative_path(env, insn, *insn_idx + 1,
- *insn_idx))
- return -EFAULT;
+ if (!env->bypass_spec_v1) {
+ err = sanitize_speculative_path(env, insn, *insn_idx + 1, *insn_idx);
+ if (err < 0)
+ return err;
+ }
if (env->log.level & BPF_LOG_LEVEL)
print_insn_state(env, this_branch, this_branch->curframe);
*insn_idx += insn->off;
@@ -16806,11 +16970,12 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
* program will go. If needed, push the goto branch for
* simulation under speculative execution.
*/
- if (!env->bypass_spec_v1 &&
- !sanitize_speculative_path(env, insn,
- *insn_idx + insn->off + 1,
- *insn_idx))
- return -EFAULT;
+ if (!env->bypass_spec_v1) {
+ err = sanitize_speculative_path(env, insn, *insn_idx + insn->off + 1,
+ *insn_idx);
+ if (err < 0)
+ return err;
+ }
if (env->log.level & BPF_LOG_LEVEL)
print_insn_state(env, this_branch, this_branch->curframe);
return 0;
@@ -16831,10 +16996,9 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
return err;
}
- other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx,
- false);
- if (!other_branch)
- return -EFAULT;
+ other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx, false);
+ if (IS_ERR(other_branch))
+ return PTR_ERR(other_branch);
other_branch_regs = other_branch->frame[other_branch->curframe]->regs;
if (BPF_SRC(insn->code) == BPF_X) {
@@ -17017,7 +17181,8 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
}
dst_reg->type = PTR_TO_MAP_VALUE;
dst_reg->off = aux->map_off;
- WARN_ON_ONCE(map->max_entries != 1);
+ WARN_ON_ONCE(map->map_type != BPF_MAP_TYPE_INSN_ARRAY &&
+ map->max_entries != 1);
/* We want reg->id to be same (0) as map_value is not distinct */
} else if (insn->src_reg == BPF_PSEUDO_MAP_FD ||
insn->src_reg == BPF_PSEUDO_MAP_IDX) {
@@ -17769,6 +17934,247 @@ static int mark_fastcall_patterns(struct bpf_verifier_env *env)
return 0;
}
+static struct bpf_iarray *iarray_realloc(struct bpf_iarray *old, size_t n_elem)
+{
+ size_t new_size = sizeof(struct bpf_iarray) + n_elem * sizeof(old->items[0]);
+ struct bpf_iarray *new;
+
+ new = kvrealloc(old, new_size, GFP_KERNEL_ACCOUNT);
+ if (!new) {
+		/* callers always want the old buffer freed on failure, so do it here to simplify call sites */
+ kvfree(old);
+ return NULL;
+ }
+
+ new->cnt = n_elem;
+ return new;
+}
+
+static int copy_insn_array(struct bpf_map *map, u32 start, u32 end, u32 *items)
+{
+ struct bpf_insn_array_value *value;
+ u32 i;
+
+ for (i = start; i <= end; i++) {
+ value = map->ops->map_lookup_elem(map, &i);
+ /*
+ * map_lookup_elem of an array map will never return an error,
+		 * but not checking it makes some static analysers worry
+ */
+ if (IS_ERR(value))
+ return PTR_ERR(value);
+ else if (!value)
+ return -EINVAL;
+ items[i - start] = value->xlated_off;
+ }
+ return 0;
+}
+
+static int cmp_ptr_to_u32(const void *a, const void *b)
+{
+ return *(u32 *)a - *(u32 *)b;
+}
+
+static int sort_insn_array_uniq(u32 *items, int cnt)
+{
+ int unique = 1;
+ int i;
+
+ sort(items, cnt, sizeof(items[0]), cmp_ptr_to_u32, NULL);
+
+ for (i = 1; i < cnt; i++)
+ if (items[i] != items[unique - 1])
+ items[unique++] = items[i];
+
+ return unique;
+}
+
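sort_insn_array_uniq() is the usual sort-then-compact idiom. A self-contained userland sketch with a worked input, assuming (as the verifier callers do) that cnt is at least 1:

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

static int cmp_u32(const void *a, const void *b)
{
	uint32_t x = *(const uint32_t *)a, y = *(const uint32_t *)b;

	return (x > y) - (x < y);
}

static int sort_uniq(uint32_t *items, int cnt)   /* cnt >= 1 assumed */
{
	int unique = 1, i;

	qsort(items, cnt, sizeof(items[0]), cmp_u32);
	for (i = 1; i < cnt; i++)
		if (items[i] != items[unique - 1])
			items[unique++] = items[i];
	return unique;
}

int main(void)
{
	uint32_t off[] = { 30, 10, 30, 20, 10 };
	int n = sort_uniq(off, 5);

	for (int i = 0; i < n; i++)
		printf("%u ", off[i]);   /* 10 20 30 */
	printf("-> %d unique\n", n);
	return 0;
}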
+/*
+ * sort_unique({map[start], ..., map[end]}) into off
+ */
+static int copy_insn_array_uniq(struct bpf_map *map, u32 start, u32 end, u32 *off)
+{
+ u32 n = end - start + 1;
+ int err;
+
+ err = copy_insn_array(map, start, end, off);
+ if (err)
+ return err;
+
+ return sort_insn_array_uniq(off, n);
+}
+
+/*
+ * Copy all unique offsets from the map
+ */
+static struct bpf_iarray *jt_from_map(struct bpf_map *map)
+{
+ struct bpf_iarray *jt;
+ int err;
+ int n;
+
+ jt = iarray_realloc(NULL, map->max_entries);
+ if (!jt)
+ return ERR_PTR(-ENOMEM);
+
+ n = copy_insn_array_uniq(map, 0, map->max_entries - 1, jt->items);
+ if (n < 0) {
+ err = n;
+ goto err_free;
+ }
+ if (n == 0) {
+ err = -EINVAL;
+ goto err_free;
+ }
+ jt->cnt = n;
+ return jt;
+
+err_free:
+ kvfree(jt);
+ return ERR_PTR(err);
+}
+
+/*
+ * Find and collect all maps which fit in the subprog. Return the result as one
+ * combined jump table in jt->items (allocated via iarray_realloc())
+ */
+static struct bpf_iarray *jt_from_subprog(struct bpf_verifier_env *env,
+ int subprog_start, int subprog_end)
+{
+ struct bpf_iarray *jt = NULL;
+ struct bpf_map *map;
+ struct bpf_iarray *jt_cur;
+ int i;
+
+ for (i = 0; i < env->insn_array_map_cnt; i++) {
+ /*
+ * TODO (when needed): collect only jump tables, not static keys
+ * or maps for indirect calls
+ */
+ map = env->insn_array_maps[i];
+
+ jt_cur = jt_from_map(map);
+ if (IS_ERR(jt_cur)) {
+ kvfree(jt);
+ return jt_cur;
+ }
+
+ /*
+		 * It is enough to check one element here; the full table is
+		 * checked to fit inside the subprog later in create_jt()
+ */
+ if (jt_cur->items[0] >= subprog_start && jt_cur->items[0] < subprog_end) {
+ u32 old_cnt = jt ? jt->cnt : 0;
+ jt = iarray_realloc(jt, old_cnt + jt_cur->cnt);
+ if (!jt) {
+ kvfree(jt_cur);
+ return ERR_PTR(-ENOMEM);
+ }
+ memcpy(jt->items + old_cnt, jt_cur->items, jt_cur->cnt << 2);
+ }
+
+ kvfree(jt_cur);
+ }
+
+ if (!jt) {
+ verbose(env, "no jump tables found for subprog starting at %u\n", subprog_start);
+ return ERR_PTR(-EINVAL);
+ }
+
+ jt->cnt = sort_insn_array_uniq(jt->items, jt->cnt);
+ return jt;
+}
+
+static struct bpf_iarray *
+create_jt(int t, struct bpf_verifier_env *env)
+{
+	struct bpf_subprog_info *subprog;
+ int subprog_start, subprog_end;
+ struct bpf_iarray *jt;
+ int i;
+
+ subprog = bpf_find_containing_subprog(env, t);
+ subprog_start = subprog->start;
+ subprog_end = (subprog + 1)->start;
+ jt = jt_from_subprog(env, subprog_start, subprog_end);
+ if (IS_ERR(jt))
+ return jt;
+
+	/* Check that every element of the jump table fits within the given subprogram */
+ for (i = 0; i < jt->cnt; i++) {
+ if (jt->items[i] < subprog_start || jt->items[i] >= subprog_end) {
+ verbose(env, "jump table for insn %d points outside of the subprog [%u,%u]\n",
+ t, subprog_start, subprog_end);
+ kvfree(jt);
+ return ERR_PTR(-EINVAL);
+ }
+ }
+
+ return jt;
+}
+
+/* "conditional jump with N edges" */
+static int visit_gotox_insn(int t, struct bpf_verifier_env *env)
+{
+ int *insn_stack = env->cfg.insn_stack;
+ int *insn_state = env->cfg.insn_state;
+ bool keep_exploring = false;
+ struct bpf_iarray *jt;
+ int i, w;
+
+ jt = env->insn_aux_data[t].jt;
+ if (!jt) {
+ jt = create_jt(t, env);
+ if (IS_ERR(jt))
+ return PTR_ERR(jt);
+
+ env->insn_aux_data[t].jt = jt;
+ }
+
+ mark_prune_point(env, t);
+ for (i = 0; i < jt->cnt; i++) {
+ w = jt->items[i];
+ if (w < 0 || w >= env->prog->len) {
+ verbose(env, "indirect jump out of range from insn %d to %d\n", t, w);
+ return -EINVAL;
+ }
+
+ mark_jmp_point(env, w);
+
+ /* EXPLORED || DISCOVERED */
+ if (insn_state[w])
+ continue;
+
+ if (env->cfg.cur_stack >= env->prog->len)
+ return -E2BIG;
+
+ insn_stack[env->cfg.cur_stack++] = w;
+ insn_state[w] |= DISCOVERED;
+ keep_exploring = true;
+ }
+
+ return keep_exploring ? KEEP_EXPLORING : DONE_EXPLORING;
+}
+
+static int visit_tailcall_insn(struct bpf_verifier_env *env, int t)
+{
+	struct bpf_subprog_info *subprog;
+ struct bpf_iarray *jt;
+
+ if (env->insn_aux_data[t].jt)
+ return 0;
+
+ jt = iarray_realloc(NULL, 2);
+ if (!jt)
+ return -ENOMEM;
+
+ subprog = bpf_find_containing_subprog(env, t);
+ jt->items[0] = t + 1;
+ jt->items[1] = subprog->exit_idx;
+ env->insn_aux_data[t].jt = jt;
+ return 0;
+}
+
/* Visits the instruction at index t and returns one of the following:
* < 0 - an error occurred
* DONE_EXPLORING - the instruction was fully explored
@@ -17829,6 +18235,8 @@ static int visit_insn(int t, struct bpf_verifier_env *env)
mark_subprog_might_sleep(env, t);
if (bpf_helper_changes_pkt_data(insn->imm))
mark_subprog_changes_pkt_data(env, t);
+ if (insn->imm == BPF_FUNC_tail_call)
+ visit_tailcall_insn(env, t);
} else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
struct bpf_kfunc_call_arg_meta meta;
@@ -17861,8 +18269,8 @@ static int visit_insn(int t, struct bpf_verifier_env *env)
return visit_func_call_insn(t, insns, env, insn->src_reg == BPF_PSEUDO_CALL);
case BPF_JA:
- if (BPF_SRC(insn->code) != BPF_K)
- return -EINVAL;
+ if (BPF_SRC(insn->code) == BPF_X)
+ return visit_gotox_insn(t, env);
if (BPF_CLASS(insn->code) == BPF_JMP)
off = insn->off;
@@ -17989,8 +18397,9 @@ err_free:
*/
static int compute_postorder(struct bpf_verifier_env *env)
{
- u32 cur_postorder, i, top, stack_sz, s, succ_cnt, succ[2];
+ u32 cur_postorder, i, top, stack_sz, s;
int *stack = NULL, *postorder = NULL, *state = NULL;
+ struct bpf_iarray *succ;
postorder = kvcalloc(env->prog->len, sizeof(int), GFP_KERNEL_ACCOUNT);
state = kvcalloc(env->prog->len, sizeof(int), GFP_KERNEL_ACCOUNT);
@@ -18014,11 +18423,11 @@ static int compute_postorder(struct bpf_verifier_env *env)
stack_sz--;
continue;
}
- succ_cnt = bpf_insn_successors(env->prog, top, succ);
- for (s = 0; s < succ_cnt; ++s) {
- if (!state[succ[s]]) {
- stack[stack_sz++] = succ[s];
- state[succ[s]] |= DISCOVERED;
+ succ = bpf_insn_successors(env, top);
+ for (s = 0; s < succ->cnt; ++s) {
+ if (!state[succ->items[s]]) {
+ stack[stack_sz++] = succ->items[s];
+ state[succ->items[s]] |= DISCOVERED;
}
}
state[top] |= EXPLORED;
@@ -18790,6 +19199,10 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
return regs_exact(rold, rcur, idmap) && rold->frameno == rcur->frameno;
case PTR_TO_ARENA:
return true;
+ case PTR_TO_INSN:
+ return memcmp(rold, rcur, offsetof(struct bpf_reg_state, var_off)) == 0 &&
+ rold->off == rcur->off && range_within(rold, rcur) &&
+ tnum_in(rold->var_off, rcur->var_off);
default:
return regs_exact(rold, rcur, idmap);
}
@@ -18970,7 +19383,7 @@ static bool refsafe(struct bpf_verifier_state *old, struct bpf_verifier_state *c
if (old->active_preempt_locks != cur->active_preempt_locks)
return false;
- if (old->active_rcu_lock != cur->active_rcu_lock)
+ if (old->active_rcu_locks != cur->active_rcu_locks)
return false;
if (!check_ids(old->active_irq_id, cur->active_irq_id, idmap))
@@ -19142,7 +19555,7 @@ static int propagate_precision(struct bpf_verifier_env *env,
bt_set_frame_slot(&env->bt, fr, i);
first = false;
}
- if (!first)
+ if (!first && (env->log.level & BPF_LOG_LEVEL2))
verbose(env, "\n");
}
@@ -19782,9 +20195,6 @@ static int process_bpf_exit_full(struct bpf_verifier_env *env,
return PROCESS_BPF_EXIT;
if (env->cur_state->curframe) {
- err = bpf_update_live_stack(env);
- if (err)
- return err;
/* exit from nested function */
err = prepare_func_exit(env, &env->insn_idx);
if (err)
@@ -19799,6 +20209,99 @@ static int process_bpf_exit_full(struct bpf_verifier_env *env,
return PROCESS_BPF_EXIT;
}
+static int indirect_jump_min_max_index(struct bpf_verifier_env *env,
+ int regno,
+ struct bpf_map *map,
+ u32 *pmin_index, u32 *pmax_index)
+{
+ struct bpf_reg_state *reg = reg_state(env, regno);
+ u64 min_index, max_index;
+ const u32 size = 8;
+
+ if (check_add_overflow(reg->umin_value, reg->off, &min_index) ||
+ (min_index > (u64) U32_MAX * size)) {
+ verbose(env, "the sum of R%u umin_value %llu and off %u is too big\n",
+ regno, reg->umin_value, reg->off);
+ return -ERANGE;
+ }
+ if (check_add_overflow(reg->umax_value, reg->off, &max_index) ||
+ (max_index > (u64) U32_MAX * size)) {
+ verbose(env, "the sum of R%u umax_value %llu and off %u is too big\n",
+ regno, reg->umax_value, reg->off);
+ return -ERANGE;
+ }
+
+ min_index /= size;
+ max_index /= size;
+
+ if (max_index >= map->max_entries) {
+ verbose(env, "R%u points to outside of jump table: [%llu,%llu] max_entries %u\n",
+ regno, min_index, max_index, map->max_entries);
+ return -EINVAL;
+ }
+
+ *pmin_index = min_index;
+ *pmax_index = max_index;
+ return 0;
+}
+
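A worked example of the index derivation above, with the kernel's check_add_overflow() guards omitted: a register whose tracked byte range is [umin=8, umax=40] with off=0 selects 8-byte elements 1 through 5, which are then bounds-checked against max_entries (the concrete values are made up for illustration):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t umin = 8, umax = 40;           /* verifier-tracked byte offsets */
	uint32_t off = 0, max_entries = 6;
	const uint32_t size = 8;                /* one entry = one 8-byte ip     */

	uint64_t min_index = (umin + off) / size;   /* 1 */
	uint64_t max_index = (umax + off) / size;   /* 5 */

	if (max_index >= max_entries)
		printf("rejected: index %llu outside [0,%u)\n",
		       (unsigned long long)max_index, max_entries);
	else
		printf("possible targets: map[%llu..%llu]\n",
		       (unsigned long long)min_index,
		       (unsigned long long)max_index);
	return 0;
}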
+/* gotox *dst_reg */
+static int check_indirect_jump(struct bpf_verifier_env *env, struct bpf_insn *insn)
+{
+ struct bpf_verifier_state *other_branch;
+ struct bpf_reg_state *dst_reg;
+ struct bpf_map *map;
+ u32 min_index, max_index;
+ int err = 0;
+ int n;
+ int i;
+
+ dst_reg = reg_state(env, insn->dst_reg);
+ if (dst_reg->type != PTR_TO_INSN) {
+ verbose(env, "R%d has type %s, expected PTR_TO_INSN\n",
+ insn->dst_reg, reg_type_str(env, dst_reg->type));
+ return -EINVAL;
+ }
+
+ map = dst_reg->map_ptr;
+ if (verifier_bug_if(!map, env, "R%d has an empty map pointer", insn->dst_reg))
+ return -EFAULT;
+
+ if (verifier_bug_if(map->map_type != BPF_MAP_TYPE_INSN_ARRAY, env,
+ "R%d has incorrect map type %d", insn->dst_reg, map->map_type))
+ return -EFAULT;
+
+ err = indirect_jump_min_max_index(env, insn->dst_reg, map, &min_index, &max_index);
+ if (err)
+ return err;
+
+ /* Ensure that the buffer is large enough */
+ if (!env->gotox_tmp_buf || env->gotox_tmp_buf->cnt < max_index - min_index + 1) {
+ env->gotox_tmp_buf = iarray_realloc(env->gotox_tmp_buf,
+ max_index - min_index + 1);
+ if (!env->gotox_tmp_buf)
+ return -ENOMEM;
+ }
+
+ n = copy_insn_array_uniq(map, min_index, max_index, env->gotox_tmp_buf->items);
+ if (n < 0)
+ return n;
+ if (n == 0) {
+ verbose(env, "register R%d doesn't point to any offset in map id=%d\n",
+ insn->dst_reg, map->id);
+ return -EINVAL;
+ }
+
+ for (i = 0; i < n - 1; i++) {
+ other_branch = push_stack(env, env->gotox_tmp_buf->items[i],
+ env->insn_idx, env->cur_state->speculative);
+ if (IS_ERR(other_branch))
+ return PTR_ERR(other_branch);
+ }
+	env->insn_idx = env->gotox_tmp_buf->items[n - 1];
+ return 0;
+}
+
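For intuition only, the closest user-space analogue of a gotox through an instruction array is GNU C's computed goto: a single indirect jump whose possible targets are all known ahead of time, which is exactly what lets the verifier enumerate them and push one state per target. This is an analogy, not how BPF programs are written:

#include <stdio.h>

static const char *describe(int op)
{
	/* table of code addresses, analogous to an insn array map */
	static void *jt[] = { &&op_add, &&op_sub, &&op_mul };

	if (op < 0 || op > 2)   /* mirrors the max_entries bounds check */
		return "bad op";
	goto *jt[op];           /* indirect jump to one of N known targets */
op_add:	return "add";
op_sub:	return "sub";
op_mul:	return "mul";
}

int main(void)
{
	printf("%s %s\n", describe(1), describe(7));   /* sub bad op */
	return 0;
}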
static int do_check_insn(struct bpf_verifier_env *env, bool *do_print_state)
{
int err;
@@ -19901,6 +20404,15 @@ static int do_check_insn(struct bpf_verifier_env *env, bool *do_print_state)
mark_reg_scratched(env, BPF_REG_0);
} else if (opcode == BPF_JA) {
+ if (BPF_SRC(insn->code) == BPF_X) {
+ if (insn->src_reg != BPF_REG_0 ||
+ insn->imm != 0 || insn->off != 0) {
+ verbose(env, "BPF_JA|BPF_X uses reserved fields\n");
+ return -EINVAL;
+ }
+ return check_indirect_jump(env, insn);
+ }
+
if (BPF_SRC(insn->code) != BPF_K ||
insn->src_reg != BPF_REG_0 ||
insn->dst_reg != BPF_REG_0 ||
@@ -20417,6 +20929,7 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
case BPF_MAP_TYPE_QUEUE:
case BPF_MAP_TYPE_STACK:
case BPF_MAP_TYPE_ARENA:
+ case BPF_MAP_TYPE_INSN_ARRAY:
break;
default:
verbose(env,
@@ -20488,6 +21001,15 @@ static int __add_used_map(struct bpf_verifier_env *env, struct bpf_map *map)
env->used_maps[env->used_map_cnt++] = map;
+ if (map->map_type == BPF_MAP_TYPE_INSN_ARRAY) {
+ err = bpf_insn_array_init(map, env->prog);
+ if (err) {
+ verbose(env, "Failed to properly initialize insn array\n");
+ return err;
+ }
+ env->insn_array_maps[env->insn_array_map_cnt++] = map;
+ }
+
return env->used_map_cnt - 1;
}
@@ -20734,6 +21256,33 @@ static void adjust_subprog_starts(struct bpf_verifier_env *env, u32 off, u32 len
}
}
+static void release_insn_arrays(struct bpf_verifier_env *env)
+{
+ int i;
+
+ for (i = 0; i < env->insn_array_map_cnt; i++)
+ bpf_insn_array_release(env->insn_array_maps[i]);
+}
+
+static void adjust_insn_arrays(struct bpf_verifier_env *env, u32 off, u32 len)
+{
+ int i;
+
+ if (len == 1)
+ return;
+
+ for (i = 0; i < env->insn_array_map_cnt; i++)
+ bpf_insn_array_adjust(env->insn_array_maps[i], off, len);
+}
+
+static void adjust_insn_arrays_after_remove(struct bpf_verifier_env *env, u32 off, u32 len)
+{
+ int i;
+
+ for (i = 0; i < env->insn_array_map_cnt; i++)
+ bpf_insn_array_adjust_after_remove(env->insn_array_maps[i], off, len);
+}
+
static void adjust_poke_descs(struct bpf_prog *prog, u32 off, u32 len)
{
struct bpf_jit_poke_descriptor *tab = prog->aux->poke_tab;
@@ -20775,6 +21324,7 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of
}
adjust_insn_aux_data(env, new_prog, off, len);
adjust_subprog_starts(env, off, len);
+ adjust_insn_arrays(env, off, len);
adjust_poke_descs(new_prog, off, len);
return new_prog;
}
@@ -20937,6 +21487,27 @@ static int bpf_adj_linfo_after_remove(struct bpf_verifier_env *env, u32 off,
return 0;
}
+/*
+ * Clean up dynamically allocated fields of aux data for instructions [start, start + len)
+ */
+static void clear_insn_aux_data(struct bpf_verifier_env *env, int start, int len)
+{
+ struct bpf_insn_aux_data *aux_data = env->insn_aux_data;
+ struct bpf_insn *insns = env->prog->insnsi;
+ int end = start + len;
+ int i;
+
+ for (i = start; i < end; i++) {
+ if (aux_data[i].jt) {
+ kvfree(aux_data[i].jt);
+ aux_data[i].jt = NULL;
+ }
+
+ if (bpf_is_ldimm64(&insns[i]))
+ i++;
+ }
+}
+
static int verifier_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt)
{
struct bpf_insn_aux_data *aux_data = env->insn_aux_data;
@@ -20946,6 +21517,9 @@ static int verifier_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt)
if (bpf_prog_is_offloaded(env->prog->aux))
bpf_prog_offload_remove_insns(env, off, cnt);
+ /* Should be called before bpf_remove_insns, as it uses prog->insnsi */
+ clear_insn_aux_data(env, off, cnt);
+
err = bpf_remove_insns(env->prog, off, cnt);
if (err)
return err;
@@ -20958,6 +21532,8 @@ static int verifier_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt)
if (err)
return err;
+ adjust_insn_arrays_after_remove(env, off, cnt);
+
memmove(aux_data + off, aux_data + off + cnt,
sizeof(*aux_data) * (orig_prog_len - off - cnt));
@@ -21497,6 +22073,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
struct bpf_insn *insn;
void *old_bpf_func;
int err, num_exentries;
+ int old_len, subprog_start_adjustment = 0;
if (env->subprog_cnt <= 1)
return 0;
@@ -21571,6 +22148,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
func[i]->aux->func_idx = i;
/* Below members will be freed only at prog->aux */
func[i]->aux->btf = prog->aux->btf;
+ func[i]->aux->subprog_start = subprog_start + subprog_start_adjustment;
func[i]->aux->func_info = prog->aux->func_info;
func[i]->aux->func_info_cnt = prog->aux->func_info_cnt;
func[i]->aux->poke_tab = prog->aux->poke_tab;
@@ -21600,6 +22178,8 @@ static int jit_subprogs(struct bpf_verifier_env *env)
func[i]->aux->jited_linfo = prog->aux->jited_linfo;
func[i]->aux->linfo_idx = env->subprog_info[i].linfo_idx;
func[i]->aux->arena = prog->aux->arena;
+ func[i]->aux->used_maps = env->used_maps;
+ func[i]->aux->used_map_cnt = env->used_map_cnt;
num_exentries = 0;
insn = func[i]->insnsi;
for (j = 0; j < func[i]->len; j++, insn++) {
@@ -21624,7 +22204,15 @@ static int jit_subprogs(struct bpf_verifier_env *env)
func[i]->aux->might_sleep = env->subprog_info[i].might_sleep;
if (!i)
func[i]->aux->exception_boundary = env->seen_exception;
+
+ /*
+		 * To pass the absolute subprog start to the JIT correctly,
+		 * all instruction count adjustments must be accumulated
+ */
+ old_len = func[i]->len;
func[i] = bpf_int_jit_compile(func[i]);
+ subprog_start_adjustment += func[i]->len - old_len;
+
if (!func[i]->jited) {
err = -ENOTSUPP;
goto out_free;
@@ -21677,6 +22265,15 @@ static int jit_subprogs(struct bpf_verifier_env *env)
cond_resched();
}
+ /*
+	 * Clean up func[i]->aux fields which aren't required
+	 * or can become invalid in the future
+ */
+ for (i = 0; i < env->subprog_cnt; i++) {
+ func[i]->aux->used_maps = NULL;
+ func[i]->aux->used_map_cnt = 0;
+ }
+
/* finally lock prog and jit images for all functions and
* populate kallsysm. Begin at the first subprogram, since
* bpf_prog_load will add the kallsyms for the main program.
@@ -21806,46 +22403,47 @@ static int fixup_call_args(struct bpf_verifier_env *env)
}
/* replace a generic kfunc with a specialized version if necessary */
-static void specialize_kfunc(struct bpf_verifier_env *env,
- u32 func_id, u16 offset, unsigned long *addr)
+static int specialize_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_desc *desc, int insn_idx)
{
struct bpf_prog *prog = env->prog;
bool seen_direct_write;
void *xdp_kfunc;
bool is_rdonly;
+ u32 func_id = desc->func_id;
+ u16 offset = desc->offset;
+ unsigned long addr = desc->addr;
+
+ if (offset) /* return if module BTF is used */
+ return 0;
if (bpf_dev_bound_kfunc_id(func_id)) {
xdp_kfunc = bpf_dev_bound_resolve_kfunc(prog, func_id);
- if (xdp_kfunc) {
- *addr = (unsigned long)xdp_kfunc;
- return;
- }
+ if (xdp_kfunc)
+ addr = (unsigned long)xdp_kfunc;
/* fallback to default kfunc when not supported by netdev */
- }
-
- if (offset)
- return;
-
- if (func_id == special_kfunc_list[KF_bpf_dynptr_from_skb]) {
+ } else if (func_id == special_kfunc_list[KF_bpf_dynptr_from_skb]) {
seen_direct_write = env->seen_direct_write;
is_rdonly = !may_access_direct_pkt_data(env, NULL, BPF_WRITE);
if (is_rdonly)
- *addr = (unsigned long)bpf_dynptr_from_skb_rdonly;
+ addr = (unsigned long)bpf_dynptr_from_skb_rdonly;
/* restore env->seen_direct_write to its original value, since
* may_access_direct_pkt_data mutates it
*/
env->seen_direct_write = seen_direct_write;
+ } else if (func_id == special_kfunc_list[KF_bpf_set_dentry_xattr]) {
+ if (bpf_lsm_has_d_inode_locked(prog))
+ addr = (unsigned long)bpf_set_dentry_xattr_locked;
+ } else if (func_id == special_kfunc_list[KF_bpf_remove_dentry_xattr]) {
+ if (bpf_lsm_has_d_inode_locked(prog))
+ addr = (unsigned long)bpf_remove_dentry_xattr_locked;
+ } else if (func_id == special_kfunc_list[KF_bpf_dynptr_from_file]) {
+ if (!env->insn_aux_data[insn_idx].non_sleepable)
+ addr = (unsigned long)bpf_dynptr_from_file_sleepable;
}
-
- if (func_id == special_kfunc_list[KF_bpf_set_dentry_xattr] &&
- bpf_lsm_has_d_inode_locked(prog))
- *addr = (unsigned long)bpf_set_dentry_xattr_locked;
-
- if (func_id == special_kfunc_list[KF_bpf_remove_dentry_xattr] &&
- bpf_lsm_has_d_inode_locked(prog))
- *addr = (unsigned long)bpf_remove_dentry_xattr_locked;
+ desc->addr = addr;
+ return 0;
}
static void __fixup_collection_insert_kfunc(struct bpf_insn_aux_data *insn_aux,
@@ -21868,7 +22466,8 @@ static void __fixup_collection_insert_kfunc(struct bpf_insn_aux_data *insn_aux,
static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
struct bpf_insn *insn_buf, int insn_idx, int *cnt)
{
- const struct bpf_kfunc_desc *desc;
+ struct bpf_kfunc_desc *desc;
+ int err;
if (!insn->imm) {
verbose(env, "invalid kernel function call not eliminated in verifier pass\n");
@@ -21888,6 +22487,10 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
return -EFAULT;
}
+ err = specialize_kfunc(env, desc, insn_idx);
+ if (err)
+ return err;
+
if (!bpf_jit_supports_far_kfunc_call())
insn->imm = BPF_CALL_IMM(desc->addr);
if (insn->off)
@@ -22483,8 +23086,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
}
if (is_storage_get_function(insn->imm)) {
- if (!in_sleepable(env) ||
- env->insn_aux_data[i + delta].storage_get_func_atomic)
+ if (env->insn_aux_data[i + delta].non_sleepable)
insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_ATOMIC);
else
insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_KERNEL);
@@ -22917,7 +23519,9 @@ next_insn:
}
}
- sort_kfunc_descs_by_imm_off(env->prog);
+ ret = sort_kfunc_descs_by_imm_off(env);
+ if (ret)
+ return ret;
return 0;
}
@@ -23154,6 +23758,7 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog)
state->curframe = 0;
state->speculative = false;
state->branches = 1;
+ state->in_sleepable = env->prog->sleepable;
state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL_ACCOUNT);
if (!state->frame[0]) {
kfree(state);
@@ -23173,7 +23778,8 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog)
struct bpf_subprog_arg_info *arg;
struct bpf_reg_state *reg;
- verbose(env, "Validating %s() func#%d...\n", sub_name, subprog);
+ if (env->log.level & BPF_LOG_LEVEL)
+ verbose(env, "Validating %s() func#%d...\n", sub_name, subprog);
ret = btf_prepare_func_args(env, subprog);
if (ret)
goto out;
@@ -24276,14 +24882,13 @@ static int compute_live_registers(struct bpf_verifier_env *env)
for (i = 0; i < env->cfg.cur_postorder; ++i) {
int insn_idx = env->cfg.insn_postorder[i];
struct insn_live_regs *live = &state[insn_idx];
- int succ_num;
- u32 succ[2];
+ struct bpf_iarray *succ;
u16 new_out = 0;
u16 new_in = 0;
- succ_num = bpf_insn_successors(env->prog, insn_idx, succ);
- for (int s = 0; s < succ_num; ++s)
- new_out |= state[succ[s]].in;
+ succ = bpf_insn_successors(env, insn_idx);
+ for (int s = 0; s < succ->cnt; ++s)
+ new_out |= state[succ->items[s]].in;
new_in = (new_out & ~live->def) | live->use;
if (new_out != live->out || new_in != live->in) {
live->in = new_in;
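The update in the loop above is the standard backward liveness transfer, out[n] = union of in[] over successors and in[n] = (out[n] & ~def[n]) | use[n], computed over u16 register masks. A tiny standalone illustration on made-up masks:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* insn: r1 = r2 + r3  ->  def = {r1}, use = {r2, r3} */
	uint16_t def = 1u << 1;
	uint16_t use = (1u << 2) | (1u << 3);

	/* pretend the only successor needs r1 and r4 on entry */
	uint16_t succ_in = (1u << 1) | (1u << 4);

	uint16_t out = succ_in;                 /* union over successors  */
	uint16_t in  = (out & ~def) | use;      /* live before this insn  */

	printf("out=%#x in=%#x\n", out, in);    /* out=0x12 in=0x1c */
	return 0;
}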
@@ -24336,11 +24941,11 @@ static int compute_scc(struct bpf_verifier_env *env)
const u32 insn_cnt = env->prog->len;
int stack_sz, dfs_sz, err = 0;
u32 *stack, *pre, *low, *dfs;
- u32 succ_cnt, i, j, t, w;
+ u32 i, j, t, w;
u32 next_preorder_num;
u32 next_scc_id;
bool assign_scc;
- u32 succ[2];
+ struct bpf_iarray *succ;
next_preorder_num = 1;
next_scc_id = 1;
@@ -24447,12 +25052,12 @@ dfs_continue:
stack[stack_sz++] = w;
}
/* Visit 'w' successors */
- succ_cnt = bpf_insn_successors(env->prog, w, succ);
- for (j = 0; j < succ_cnt; ++j) {
- if (pre[succ[j]]) {
- low[w] = min(low[w], low[succ[j]]);
+ succ = bpf_insn_successors(env, w);
+ for (j = 0; j < succ->cnt; ++j) {
+ if (pre[succ->items[j]]) {
+ low[w] = min(low[w], low[succ->items[j]]);
} else {
- dfs[dfs_sz++] = succ[j];
+ dfs[dfs_sz++] = succ->items[j];
goto dfs_continue;
}
}
@@ -24469,8 +25074,8 @@ dfs_continue:
* or if component has a self reference.
*/
assign_scc = stack[stack_sz - 1] != w;
- for (j = 0; j < succ_cnt; ++j) {
- if (succ[j] == w) {
+ for (j = 0; j < succ->cnt; ++j) {
+ if (succ->items[j] == w) {
assign_scc = true;
break;
}
@@ -24532,6 +25137,9 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
goto err_free_env;
for (i = 0; i < len; i++)
env->insn_aux_data[i].orig_idx = i;
+ env->succ = iarray_realloc(NULL, 2);
+ if (!env->succ)
+ goto err_free_env;
env->prog = *prog;
env->ops = bpf_verifier_ops[env->prog->type];
@@ -24755,6 +25363,8 @@ skip_full_check:
adjust_btf_func(env);
err_release_maps:
+ if (ret)
+ release_insn_arrays(env);
if (!env->prog->aux->used_maps)
/* if we didn't copy map pointers into bpf_prog_info, release
* them now. Otherwise free_used_maps() will release them.
@@ -24775,11 +25385,14 @@ err_release_maps:
err_unlock:
if (!is_priv)
mutex_unlock(&bpf_verifier_lock);
+ clear_insn_aux_data(env, 0, env->prog->len);
vfree(env->insn_aux_data);
err_free_env:
bpf_stack_liveness_free(env);
kvfree(env->cfg.insn_postorder);
kvfree(env->scc_info);
+ kvfree(env->succ);
+ kvfree(env->gotox_tmp_buf);
kvfree(env);
return ret;
}