diff options
Diffstat (limited to 'tools/perf/util/bpf_skel')
-rw-r--r-- | tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c | 11 | ||||
-rw-r--r-- | tools/perf/util/bpf_skel/func_latency.bpf.c | 46 | ||||
-rw-r--r-- | tools/perf/util/bpf_skel/kwork_top.bpf.c | 4 | ||||
-rw-r--r-- | tools/perf/util/bpf_skel/kwork_trace.bpf.c | 2 | ||||
-rw-r--r-- | tools/perf/util/bpf_skel/lock_contention.bpf.c | 447 | ||||
-rw-r--r-- | tools/perf/util/bpf_skel/lock_data.h | 23 | ||||
-rw-r--r-- | tools/perf/util/bpf_skel/off_cpu.bpf.c | 98 | ||||
-rw-r--r-- | tools/perf/util/bpf_skel/syscall_summary.bpf.c | 153 | ||||
-rw-r--r-- | tools/perf/util/bpf_skel/syscall_summary.h | 27 | ||||
-rw-r--r-- | tools/perf/util/bpf_skel/vmlinux/vmlinux.h | 17 |
10 files changed, 798 insertions, 30 deletions
diff --git a/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c b/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c index 4a62ed593e84..e4352881e3fa 100644 --- a/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c +++ b/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c @@ -431,9 +431,9 @@ static bool pid_filter__has(struct pids_filtered *pids, pid_t pid) static int augment_sys_enter(void *ctx, struct syscall_enter_args *args) { bool augmented, do_output = false; - int zero = 0, size, aug_size, index, - value_size = sizeof(struct augmented_arg) - offsetof(struct augmented_arg, value); + int zero = 0, index, value_size = sizeof(struct augmented_arg) - offsetof(struct augmented_arg, value); u64 output = 0; /* has to be u64, otherwise it won't pass the verifier */ + s64 aug_size, size; unsigned int nr, *beauty_map; struct beauty_payload_enter *payload; void *arg, *payload_offset; @@ -484,14 +484,11 @@ static int augment_sys_enter(void *ctx, struct syscall_enter_args *args) } else if (size > 0 && size <= value_size) { /* struct */ if (!bpf_probe_read_user(((struct augmented_arg *)payload_offset)->value, size, arg)) augmented = true; - } else if (size < 0 && size >= -6) { /* buffer */ + } else if ((int)size < 0 && size >= -6) { /* buffer */ index = -(size + 1); barrier_var(index); // Prevent clang (noticed with v18) from removing the &= 7 trick. index &= 7; // Satisfy the bounds checking with the verifier in some kernels. - aug_size = args->args[index]; - - if (aug_size > TRACE_AUG_MAX_BUF) - aug_size = TRACE_AUG_MAX_BUF; + aug_size = args->args[index] > TRACE_AUG_MAX_BUF ? 
TRACE_AUG_MAX_BUF : args->args[index]; if (aug_size > 0) { if (!bpf_probe_read_user(((struct augmented_arg *)payload_offset)->value, aug_size, arg)) diff --git a/tools/perf/util/bpf_skel/func_latency.bpf.c b/tools/perf/util/bpf_skel/func_latency.bpf.c index f613dc9cb123..e731a79a753a 100644 --- a/tools/perf/util/bpf_skel/func_latency.bpf.c +++ b/tools/perf/util/bpf_skel/func_latency.bpf.c @@ -38,9 +38,19 @@ struct { int enabled = 0; +// stats +__s64 total; +__s64 count; +__s64 max; +__s64 min; + const volatile int has_cpu = 0; const volatile int has_task = 0; const volatile int use_nsec = 0; +const volatile unsigned int bucket_range; +const volatile unsigned int min_latency; +const volatile unsigned int max_latency; +const volatile unsigned int bucket_num = NUM_BUCKET; SEC("kprobe/func") int BPF_PROG(func_begin) @@ -92,7 +102,8 @@ int BPF_PROG(func_end) start = bpf_map_lookup_elem(&functime, &tid); if (start) { __s64 delta = bpf_ktime_get_ns() - *start; - __u32 key; + __u64 val = delta; + __u32 key = 0; __u64 *hist; bpf_map_delete_elem(&functime, &tid); @@ -100,17 +111,46 @@ int BPF_PROG(func_end) if (delta < 0) return 0; + if (bucket_range != 0) { + val = delta / cmp_base; + + if (min_latency > 0) { + if (val > min_latency) + val -= min_latency; + else + goto do_lookup; + } + + // Less than 1 unit (ms or ns), or, in the future, + // than the min latency desired. + if (val > 0) { // 1st entry: [ 1 unit .. 
bucket_range units ) + key = val / bucket_range + 1; + if (key >= bucket_num) + key = bucket_num - 1; + } + + goto do_lookup; + } // calculate index using delta - for (key = 0; key < (NUM_BUCKET - 1); key++) { + for (key = 0; key < (bucket_num - 1); key++) { if (delta < (cmp_base << key)) break; } +do_lookup: hist = bpf_map_lookup_elem(&latency, &key); if (!hist) return 0; - *hist += 1; + __sync_fetch_and_add(hist, 1); + + __sync_fetch_and_add(&total, delta); // always in nsec + __sync_fetch_and_add(&count, 1); + + if (delta > max) + max = delta; + if (delta < min) + min = delta; } return 0; diff --git a/tools/perf/util/bpf_skel/kwork_top.bpf.c b/tools/perf/util/bpf_skel/kwork_top.bpf.c index 594da91965a2..73e32e063030 100644 --- a/tools/perf/util/bpf_skel/kwork_top.bpf.c +++ b/tools/perf/util/bpf_skel/kwork_top.bpf.c @@ -18,7 +18,9 @@ enum kwork_class_type { }; #define MAX_ENTRIES 102400 -#define MAX_NR_CPUS 2048 +#ifndef MAX_NR_CPUS +#define MAX_NR_CPUS 4096 +#endif #define PF_KTHREAD 0x00200000 #define MAX_COMMAND_LEN 16 diff --git a/tools/perf/util/bpf_skel/kwork_trace.bpf.c b/tools/perf/util/bpf_skel/kwork_trace.bpf.c index cbd79bc4b330..9ce9c8dddc4b 100644 --- a/tools/perf/util/bpf_skel/kwork_trace.bpf.c +++ b/tools/perf/util/bpf_skel/kwork_trace.bpf.c @@ -80,7 +80,7 @@ static __always_inline int local_strncmp(const char *s1, for (i = 0; i < sz; i++) { ret = (unsigned char)s1[i] - (unsigned char)s2[i]; - if (ret || !s1[i] || !s2[i]) + if (ret || !s1[i]) break; } diff --git a/tools/perf/util/bpf_skel/lock_contention.bpf.c b/tools/perf/util/bpf_skel/lock_contention.bpf.c index 1069bda5d733..96e7d853b9ed 100644 --- a/tools/perf/util/bpf_skel/lock_contention.bpf.c +++ b/tools/perf/util/bpf_skel/lock_contention.bpf.c @@ -11,6 +11,12 @@ /* for collect_lock_syms(). 4096 was rejected by the verifier */ #define MAX_CPUS 1024 +/* for collect_zone_lock(). It should be more than the actual zones. */ +#define MAX_ZONES 10 + +/* for do_lock_delay(). 
Arbitrarily set to 1 million. */ +#define MAX_LOOP (1U << 20) + /* lock contention flags from include/trace/events/lock.h */ #define LCB_F_SPIN (1U << 0) #define LCB_F_READ (1U << 1) @@ -27,6 +33,38 @@ struct { __uint(max_entries, MAX_ENTRIES); } stacks SEC(".maps"); +/* buffer for owner stacktrace */ +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(__u64)); + __uint(max_entries, 1); +} stack_buf SEC(".maps"); + +/* a map for tracing owner stacktrace to owner stack id */ +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(key_size, sizeof(__u64)); // owner stacktrace + __uint(value_size, sizeof(__s32)); // owner stack id + __uint(max_entries, 1); +} owner_stacks SEC(".maps"); + +/* a map for tracing lock address to owner data */ +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(key_size, sizeof(__u64)); // lock address + __uint(value_size, sizeof(struct owner_tracing_data)); + __uint(max_entries, 1); +} owner_data SEC(".maps"); + +/* a map for contention_key (stores owner stack id) to contention data */ +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(key_size, sizeof(struct contention_key)); + __uint(value_size, sizeof(struct contention_data)); + __uint(max_entries, 1); +} owner_stat SEC(".maps"); + /* maintain timestamp at the beginning of contention */ struct { __uint(type, BPF_MAP_TYPE_HASH); @@ -100,6 +138,27 @@ struct { __uint(max_entries, 1); } cgroup_filter SEC(".maps"); +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(key_size, sizeof(long)); + __uint(value_size, sizeof(__u8)); + __uint(max_entries, 1); +} slab_filter SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(key_size, sizeof(long)); + __uint(value_size, sizeof(struct slab_cache_data)); + __uint(max_entries, 1); +} slab_caches SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(key_size, sizeof(__u64)); + __uint(value_size, sizeof(__u64)); + __uint(max_entries, 1); +} lock_delays 
SEC(".maps"); + struct rw_semaphore___old { struct task_struct *owner; } __attribute__((preserve_access_index)); @@ -116,16 +175,21 @@ struct mm_struct___new { struct rw_semaphore mmap_lock; } __attribute__((preserve_access_index)); +extern struct kmem_cache *bpf_get_kmem_cache(u64 addr) __ksym __weak; + /* control flags */ const volatile int has_cpu; const volatile int has_task; const volatile int has_type; const volatile int has_addr; const volatile int has_cgroup; +const volatile int has_slab; const volatile int needs_callstack; const volatile int stack_skip; const volatile int lock_owner; const volatile int use_cgroup_v2; +const volatile int max_stack; +const volatile int lock_delay; /* determine the key of lock stat */ const volatile int aggr_mode; @@ -136,6 +200,8 @@ int perf_subsys_id = -1; __u64 end_ts; +__u32 slab_cache_id; + /* error stat */ int task_fail; int stack_fail; @@ -145,6 +211,9 @@ int data_fail; int task_map_full; int data_map_full; +struct task_struct *bpf_task_from_pid(s32 pid) __ksym __weak; +void bpf_task_release(struct task_struct *p) __ksym __weak; + static inline __u64 get_current_cgroup_id(void) { struct task_struct *task; @@ -202,7 +271,7 @@ static inline int can_record(u64 *ctx) __u64 addr = ctx[0]; ok = bpf_map_lookup_elem(&addr_filter, &addr); - if (!ok) + if (!ok && !has_slab) return 0; } @@ -215,6 +284,17 @@ static inline int can_record(u64 *ctx) return 0; } + if (has_slab && bpf_get_kmem_cache) { + __u8 *ok; + __u64 addr = ctx[0]; + long kmem_cache_addr; + + kmem_cache_addr = (long)bpf_get_kmem_cache(addr); + ok = bpf_map_lookup_elem(&slab_filter, &kmem_cache_addr); + if (!ok) + return 0; + } + return 1; } @@ -318,6 +398,35 @@ static inline __u32 check_lock_type(__u64 lock, __u32 flags) return 0; } +static inline long delay_callback(__u64 idx, void *arg) +{ + __u64 target = *(__u64 *)arg; + + if (target <= bpf_ktime_get_ns()) + return 1; + + /* just to kill time */ + (void)bpf_get_prandom_u32(); + + return 0; +} + +static inline 
void do_lock_delay(__u64 duration) +{ + __u64 target = bpf_ktime_get_ns() + duration; + + bpf_loop(MAX_LOOP, delay_callback, &target, /*flags=*/0); +} + +static inline void check_lock_delay(__u64 lock) +{ + __u64 *delay; + + delay = bpf_map_lookup_elem(&lock_delays, &lock); + if (delay) + do_lock_delay(*delay); +} + static inline struct tstamp_data *get_tstamp_elem(__u32 flags) { __u32 pid; @@ -357,6 +466,61 @@ static inline struct tstamp_data *get_tstamp_elem(__u32 flags) return pelem; } +static inline s32 get_owner_stack_id(u64 *stacktrace) +{ + s32 *id, new_id; + static s64 id_gen = 1; + + id = bpf_map_lookup_elem(&owner_stacks, stacktrace); + if (id) + return *id; + + new_id = (s32)__sync_fetch_and_add(&id_gen, 1); + + bpf_map_update_elem(&owner_stacks, stacktrace, &new_id, BPF_NOEXIST); + + id = bpf_map_lookup_elem(&owner_stacks, stacktrace); + if (id) + return *id; + + return -1; +} + +static inline void update_contention_data(struct contention_data *data, u64 duration, u32 count) +{ + __sync_fetch_and_add(&data->total_time, duration); + __sync_fetch_and_add(&data->count, count); + + /* FIXME: need atomic operations */ + if (data->max_time < duration) + data->max_time = duration; + if (data->min_time > duration) + data->min_time = duration; +} + +static inline void update_owner_stat(u32 id, u64 duration, u32 flags) +{ + struct contention_key key = { + .stack_id = id, + .pid = 0, + .lock_addr_or_cgroup = 0, + }; + struct contention_data *data = bpf_map_lookup_elem(&owner_stat, &key); + + if (!data) { + struct contention_data first = { + .total_time = duration, + .max_time = duration, + .min_time = duration, + .count = 1, + .flags = flags, + }; + bpf_map_update_elem(&owner_stat, &key, &first, BPF_NOEXIST); + } else { + update_contention_data(data, duration, 1); + } +} + SEC("tp_btf/contention_begin") int contention_begin(u64 *ctx) { @@ -374,6 +538,72 @@ int contention_begin(u64 *ctx) pelem->flags = (__u32)ctx[1]; if (needs_callstack) { + u32 i = 0; + u32 id = 
0; + int owner_pid; + u64 *buf; + struct task_struct *task; + struct owner_tracing_data *otdata; + + if (!lock_owner) + goto skip_owner; + + task = get_lock_owner(pelem->lock, pelem->flags); + if (!task) + goto skip_owner; + + owner_pid = BPF_CORE_READ(task, pid); + + buf = bpf_map_lookup_elem(&stack_buf, &i); + if (!buf) + goto skip_owner; + for (i = 0; i < max_stack; i++) + buf[i] = 0x0; + + if (!bpf_task_from_pid) + goto skip_owner; + + task = bpf_task_from_pid(owner_pid); + if (!task) + goto skip_owner; + + bpf_get_task_stack(task, buf, max_stack * sizeof(unsigned long), 0); + bpf_task_release(task); + + otdata = bpf_map_lookup_elem(&owner_data, &pelem->lock); + id = get_owner_stack_id(buf); + + /* + * Contention just happens, or corner case `lock` is owned by process not + * `owner_pid`. For the corner case we treat it as unexpected internal error and + * just ignore the previous tracing record. + */ + if (!otdata || otdata->pid != owner_pid) { + struct owner_tracing_data first = { + .pid = owner_pid, + .timestamp = pelem->timestamp, + .count = 1, + .stack_id = id, + }; + bpf_map_update_elem(&owner_data, &pelem->lock, &first, BPF_ANY); + } + /* Contention is ongoing and new waiter joins */ + else { + __sync_fetch_and_add(&otdata->count, 1); + + /* + * The owner is the same, but stacktrace might be changed. In this case we + * store/update `owner_stat` based on current owner stack id. 
+ */ + if (id != otdata->stack_id) { + update_owner_stat(id, pelem->timestamp - otdata->timestamp, + pelem->flags); + + otdata->timestamp = pelem->timestamp; + otdata->stack_id = id; + } + } +skip_owner: pelem->stack_id = bpf_get_stackid(ctx, &stacks, BPF_F_FAST_STACK_CMP | stack_skip); if (pelem->stack_id < 0) @@ -410,6 +640,7 @@ int contention_end(u64 *ctx) struct tstamp_data *pelem; struct contention_key key = {}; struct contention_data *data; + __u64 timestamp; __u64 duration; bool need_delete = false; @@ -437,12 +668,88 @@ int contention_end(u64 *ctx) need_delete = true; } - duration = bpf_ktime_get_ns() - pelem->timestamp; + timestamp = bpf_ktime_get_ns(); + duration = timestamp - pelem->timestamp; if ((__s64)duration < 0) { __sync_fetch_and_add(&time_fail, 1); goto out; } + if (needs_callstack && lock_owner) { + struct owner_tracing_data *otdata = bpf_map_lookup_elem(&owner_data, &pelem->lock); + + if (!otdata) + goto skip_owner; + + /* Update `owner_stat` */ + update_owner_stat(otdata->stack_id, timestamp - otdata->timestamp, pelem->flags); + + /* No contention is occurring, delete `lock` entry in `owner_data` */ + if (otdata->count <= 1) + bpf_map_delete_elem(&owner_data, &pelem->lock); + /* + * Contention is still ongoing, with a new owner (current task). `owner_data` + * should be updated accordingly. + */ + else { + u32 i = 0; + s32 ret = (s32)ctx[1]; + u64 *buf; + + otdata->timestamp = timestamp; + __sync_fetch_and_add(&otdata->count, -1); + + buf = bpf_map_lookup_elem(&stack_buf, &i); + if (!buf) + goto skip_owner; + for (i = 0; i < (u32)max_stack; i++) + buf[i] = 0x0; + + /* + * `ret` has the return code of the lock function. + * If `ret` is negative, the current task terminates lock waiting without + * acquiring it. Owner is not changed, but we still need to update the owner + * stack. 
+ */ + if (ret < 0) { + s32 id = 0; + struct task_struct *task; + + if (!bpf_task_from_pid) + goto skip_owner; + + task = bpf_task_from_pid(otdata->pid); + if (!task) + goto skip_owner; + + bpf_get_task_stack(task, buf, + max_stack * sizeof(unsigned long), 0); + bpf_task_release(task); + + id = get_owner_stack_id(buf); + + /* + * If owner stack is changed, update owner stack id for this lock. + */ + if (id != otdata->stack_id) + otdata->stack_id = id; + } + /* + * Otherwise, update tracing data with the current task, which is the new + * owner. + */ + else { + otdata->pid = pid; + /* + * We don't want to retrieve callstack here, since it is where the + * current task acquires the lock and provides no additional + * information. We simply assign -1 to invalidate it. + */ + otdata->stack_id = -1; + } + } + } +skip_owner: switch (aggr_mode) { case LOCK_AGGR_CALLER: key.stack_id = pelem->stack_id; @@ -487,8 +794,28 @@ int contention_end(u64 *ctx) }; int err; - if (aggr_mode == LOCK_AGGR_ADDR) - first.flags |= check_lock_type(pelem->lock, pelem->flags); + if (aggr_mode == LOCK_AGGR_ADDR) { + first.flags |= check_lock_type(pelem->lock, + pelem->flags & LCB_F_TYPE_MASK); + + /* Check if it's from a slab object */ + if (bpf_get_kmem_cache) { + struct kmem_cache *s; + struct slab_cache_data *d; + + s = bpf_get_kmem_cache(pelem->lock); + if (s != NULL) { + /* + * Save the ID of the slab cache in the flags + * (instead of full address) to reduce the + * space in the contention_data. 
+ */ + d = bpf_map_lookup_elem(&slab_caches, &s); + if (d != NULL) + first.flags |= d->id; + } + } + } err = bpf_map_update_elem(&lock_stat, &key, &first, BPF_NOEXIST); if (err < 0) { @@ -506,16 +833,12 @@ int contention_end(u64 *ctx) } found: - __sync_fetch_and_add(&data->total_time, duration); - __sync_fetch_and_add(&data->count, 1); - - /* FIXME: need atomic operations */ - if (data->max_time < duration) - data->max_time = duration; - if (data->min_time > duration) - data->min_time = duration; + update_contention_data(data, duration, 1); out: + if (lock_delay) + check_lock_delay(pelem->lock); + pelem->lock = 0; if (need_delete) bpf_map_delete_elem(&tstamp, &pid); @@ -524,6 +847,11 @@ out: extern struct rq runqueues __ksym; +const volatile __u64 contig_page_data_addr; +const volatile __u64 node_data_addr; +const volatile int nr_nodes; +const volatile int sizeof_zone; + struct rq___old { raw_spinlock_t lock; } __attribute__((preserve_access_index)); @@ -532,6 +860,59 @@ struct rq___new { raw_spinlock_t __lock; } __attribute__((preserve_access_index)); +static void collect_zone_lock(void) +{ + __u64 nr_zones, zone_off; + __u64 lock_addr, lock_off; + __u32 lock_flag = LOCK_CLASS_ZONE_LOCK; + + zone_off = offsetof(struct pglist_data, node_zones); + lock_off = offsetof(struct zone, lock); + + if (contig_page_data_addr) { + struct pglist_data *contig_page_data; + + contig_page_data = (void *)(long)contig_page_data_addr; + nr_zones = BPF_CORE_READ(contig_page_data, nr_zones); + + for (int i = 0; i < MAX_ZONES; i++) { + __u64 zone_addr; + + if (i >= nr_zones) + break; + + zone_addr = contig_page_data_addr + (sizeof_zone * i) + zone_off; + lock_addr = zone_addr + lock_off; + + bpf_map_update_elem(&lock_syms, &lock_addr, &lock_flag, BPF_ANY); + } + } else if (nr_nodes > 0) { + struct pglist_data **node_data = (void *)(long)node_data_addr; + + for (int i = 0; i < nr_nodes; i++) { + struct pglist_data *pgdat = NULL; + int err; + + err = bpf_core_read(&pgdat, sizeof(pgdat), 
&node_data[i]); + if (err < 0 || pgdat == NULL) + break; + + nr_zones = BPF_CORE_READ(pgdat, nr_zones); + for (int k = 0; k < MAX_ZONES; k++) { + __u64 zone_addr; + + if (k >= nr_zones) + break; + + zone_addr = (__u64)(void *)pgdat + (sizeof_zone * k) + zone_off; + lock_addr = zone_addr + lock_off; + + bpf_map_update_elem(&lock_syms, &lock_addr, &lock_flag, BPF_ANY); + } + } + } +} + SEC("raw_tp/bpf_test_finish") int BPF_PROG(collect_lock_syms) { @@ -553,6 +934,9 @@ int BPF_PROG(collect_lock_syms) lock_flag = LOCK_CLASS_RQLOCK; bpf_map_update_elem(&lock_syms, &lock_addr, &lock_flag, BPF_ANY); } + + collect_zone_lock(); + return 0; } @@ -563,4 +947,43 @@ int BPF_PROG(end_timestamp) return 0; } +/* + * bpf_iter__kmem_cache added recently so old kernels don't have it in the + * vmlinux.h. But we cannot add it here since it will cause a compiler error + * due to redefinition of the struct on later kernels. + * + * So it uses a CO-RE trick to access the member only if it has the type. + * This will support both old and new kernels without compiler errors. 
+ */ +struct bpf_iter__kmem_cache___new { + struct kmem_cache *s; +} __attribute__((preserve_access_index)); + +SEC("iter/kmem_cache") +int slab_cache_iter(void *ctx) +{ + struct kmem_cache *s = NULL; + struct slab_cache_data d; + const char *nameptr; + + if (bpf_core_type_exists(struct bpf_iter__kmem_cache)) { + struct bpf_iter__kmem_cache___new *iter = ctx; + + s = iter->s; + } + + if (s == NULL) + return 0; + + nameptr = s->name; + bpf_probe_read_kernel_str(d.name, sizeof(d.name), nameptr); + + d.id = ++slab_cache_id << LCB_F_SLAB_ID_SHIFT; + if (d.id >= LCB_F_SLAB_ID_END) + return 0; + + bpf_map_update_elem(&slab_caches, &s, &d, BPF_NOEXIST); + return 0; +} + char LICENSE[] SEC("license") = "Dual BSD/GPL"; diff --git a/tools/perf/util/bpf_skel/lock_data.h b/tools/perf/util/bpf_skel/lock_data.h index de12892f992f..28c5e5aced7f 100644 --- a/tools/perf/util/bpf_skel/lock_data.h +++ b/tools/perf/util/bpf_skel/lock_data.h @@ -3,6 +3,13 @@ #ifndef UTIL_BPF_SKEL_LOCK_DATA_H #define UTIL_BPF_SKEL_LOCK_DATA_H +struct owner_tracing_data { + u32 pid; // Who has the lock. + u32 count; // How many waiters for this lock. + u64 timestamp; // The time while the owner acquires lock and contention is going on. 
+ s32 stack_id; // Identifier for `owner_stat`, which stores as value in `owner_stacks` +}; + struct tstamp_data { u64 timestamp; u64 lock; @@ -32,7 +39,15 @@ struct contention_task_data { #define LCD_F_MMAP_LOCK (1U << 31) #define LCD_F_SIGHAND_LOCK (1U << 30) -#define LCB_F_MAX_FLAGS (1U << 7) +#define LCB_F_SLAB_ID_SHIFT 16 +#define LCB_F_SLAB_ID_START (1U << 16) +#define LCB_F_SLAB_ID_END (1U << 26) +#define LCB_F_SLAB_ID_MASK 0x03FF0000U + +#define LCB_F_TYPE_MAX (1U << 7) +#define LCB_F_TYPE_MASK 0x0000007FU + +#define SLAB_NAME_MAX 28 struct contention_data { u64 total_time; @@ -52,6 +67,12 @@ enum lock_aggr_mode { enum lock_class_sym { LOCK_CLASS_NONE, LOCK_CLASS_RQLOCK, + LOCK_CLASS_ZONE_LOCK, +}; + +struct slab_cache_data { + u32 id; + char name[SLAB_NAME_MAX]; }; #endif /* UTIL_BPF_SKEL_LOCK_DATA_H */ diff --git a/tools/perf/util/bpf_skel/off_cpu.bpf.c b/tools/perf/util/bpf_skel/off_cpu.bpf.c index c152116df72f..72763bb8d1de 100644 --- a/tools/perf/util/bpf_skel/off_cpu.bpf.c +++ b/tools/perf/util/bpf_skel/off_cpu.bpf.c @@ -18,10 +18,19 @@ #define MAX_STACKS 32 #define MAX_ENTRIES 102400 +#define MAX_CPUS 4096 +#define MAX_OFFCPU_LEN 37 + +// We have a 'struct stack' in vmlinux.h when building with GEN_VMLINUX_H=1 +struct __stack { + u64 array[MAX_STACKS]; +}; + struct tstamp_data { __u32 stack_id; __u32 state; __u64 timestamp; + struct __stack stack; }; struct offcpu_key { @@ -39,6 +48,24 @@ struct { __uint(max_entries, MAX_ENTRIES); } stacks SEC(".maps"); +struct offcpu_data { + u64 array[MAX_OFFCPU_LEN]; +}; + +struct { + __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); + __uint(key_size, sizeof(int)); + __uint(value_size, sizeof(int)); + __uint(max_entries, MAX_CPUS); +} offcpu_output SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(struct offcpu_data)); + __uint(max_entries, 1); +} offcpu_payload SEC(".maps"); + struct { __uint(type, BPF_MAP_TYPE_TASK_STORAGE); 
__uint(map_flags, BPF_F_NO_PREALLOC); @@ -97,6 +124,8 @@ const volatile bool uses_cgroup_v1 = false; int perf_subsys_id = -1; +__u64 offcpu_thresh_ns; + /* * Old kernel used to call it task_struct->state and now it's '__state'. * Use BPF CO-RE "ignored suffix rule" to deal with it like below: @@ -183,6 +212,47 @@ static inline int can_record(struct task_struct *t, int state) return 1; } +static inline int copy_stack(struct __stack *from, struct offcpu_data *to, int n) +{ + int len = 0; + + for (int i = 0; i < MAX_STACKS && from->array[i]; ++i, ++len) + to->array[n + 2 + i] = from->array[i]; + + return len; +} + +/** + * off_cpu_dump - dump off-cpu samples to ring buffer + * @data: payload for dumping off-cpu samples + * @key: off-cpu data + * @stack: stack trace of the task before being scheduled out + * + * If the threshold of off-cpu time is reached, acquire tid, period, callchain, and cgroup id + * information of the task, and dump it as a raw sample to perf ring buffer + */ +static int off_cpu_dump(void *ctx, struct offcpu_data *data, struct offcpu_key *key, + struct __stack *stack, __u64 delta) +{ + int n = 0, len = 0; + + data->array[n++] = (u64)key->tgid << 32 | key->pid; + data->array[n++] = delta; + + /* data->array[n] is callchain->nr (updated later) */ + data->array[n + 1] = PERF_CONTEXT_USER; + data->array[n + 2] = 0; + len = copy_stack(stack, data, n); + + /* update length of callchain */ + data->array[n] = len + 1; + n += len + 2; + + data->array[n++] = key->cgroup_id; + + return bpf_perf_event_output(ctx, &offcpu_output, BPF_F_CURRENT_CPU, data, n * sizeof(u64)); +} + static int off_cpu_stat(u64 *ctx, struct task_struct *prev, struct task_struct *next, int state) { @@ -207,6 +277,16 @@ static int off_cpu_stat(u64 *ctx, struct task_struct *prev, pelem->state = state; pelem->stack_id = stack_id; + /* + * If stacks are successfully collected by bpf_get_stackid(), collect them once more + * in task_storage for direct off-cpu sample dumping + */ + if 
(stack_id > 0 && bpf_get_stack(ctx, &pelem->stack, MAX_STACKS * sizeof(u64), BPF_F_USER_STACK)) { + /* + * This empty if block is used to avoid 'result unused warning' from bpf_get_stack(). + * If the collection fails, continue with the logic for the next task. + */ + } next: pelem = bpf_task_storage_get(&tstamp, next, NULL, 0); @@ -221,11 +301,19 @@ next: __u64 delta = ts - pelem->timestamp; __u64 *total; - total = bpf_map_lookup_elem(&off_cpu, &key); - if (total) - *total += delta; - else - bpf_map_update_elem(&off_cpu, &key, &delta, BPF_ANY); + if (delta >= offcpu_thresh_ns) { + int zero = 0; + struct offcpu_data *data = bpf_map_lookup_elem(&offcpu_payload, &zero); + + if (data) + off_cpu_dump(ctx, data, &key, &pelem->stack, delta); + } else { + total = bpf_map_lookup_elem(&off_cpu, &key); + if (total) + *total += delta; + else + bpf_map_update_elem(&off_cpu, &key, &delta, BPF_ANY); + } /* prevent to reuse the timestamp later */ pelem->timestamp = 0; diff --git a/tools/perf/util/bpf_skel/syscall_summary.bpf.c b/tools/perf/util/bpf_skel/syscall_summary.bpf.c new file mode 100644 index 000000000000..1bcd066a5199 --- /dev/null +++ b/tools/perf/util/bpf_skel/syscall_summary.bpf.c @@ -0,0 +1,153 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Trace raw_syscalls tracepoints to collect system call statistics. 
+ */ + +#include "vmlinux.h" +#include "syscall_summary.h" + +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> +#include <bpf/bpf_core_read.h> + +/* This is to calculate a delta between sys-enter and sys-exit for each thread */ +struct syscall_trace { + int nr; /* syscall number is only available at sys-enter */ + int unused; + u64 timestamp; +}; + +#define MAX_ENTRIES (128 * 1024) + +struct syscall_trace_map { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, int); /* tid */ + __type(value, struct syscall_trace); + __uint(max_entries, MAX_ENTRIES); +} syscall_trace_map SEC(".maps"); + +struct syscall_stats_map { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, struct syscall_key); + __type(value, struct syscall_stats); + __uint(max_entries, MAX_ENTRIES); +} syscall_stats_map SEC(".maps"); + +int enabled; /* controlled from userspace */ + +const volatile enum syscall_aggr_mode aggr_mode; +const volatile int use_cgroup_v2; + +int perf_subsys_id = -1; + +static inline __u64 get_current_cgroup_id(void) +{ + struct task_struct *task; + struct cgroup *cgrp; + + if (use_cgroup_v2) + return bpf_get_current_cgroup_id(); + + task = bpf_get_current_task_btf(); + + if (perf_subsys_id == -1) { +#if __has_builtin(__builtin_preserve_enum_value) + perf_subsys_id = bpf_core_enum_value(enum cgroup_subsys_id, + perf_event_cgrp_id); +#else + perf_subsys_id = perf_event_cgrp_id; +#endif + } + + cgrp = BPF_CORE_READ(task, cgroups, subsys[perf_subsys_id], cgroup); + return BPF_CORE_READ(cgrp, kn, id); +} + +static void update_stats(int cpu_or_tid, u64 cgroup_id, int nr, s64 duration, + long ret) +{ + struct syscall_key key = { + .cpu_or_tid = cpu_or_tid, + .cgroup = cgroup_id, + .nr = nr, + }; + struct syscall_stats *stats; + + stats = bpf_map_lookup_elem(&syscall_stats_map, &key); + if (stats == NULL) { + struct syscall_stats zero = {}; + + bpf_map_update_elem(&syscall_stats_map, &key, &zero, BPF_NOEXIST); + stats = bpf_map_lookup_elem(&syscall_stats_map, &key); + if (stats == 
NULL) + return; + } + + __sync_fetch_and_add(&stats->count, 1); + if (ret < 0) + __sync_fetch_and_add(&stats->error, 1); + + if (duration > 0) { + __sync_fetch_and_add(&stats->total_time, duration); + __sync_fetch_and_add(&stats->squared_sum, duration * duration); + if (stats->max_time < duration) + stats->max_time = duration; + if (stats->min_time > duration || stats->min_time == 0) + stats->min_time = duration; + } + + return; +} + +SEC("tp_btf/sys_enter") +int sys_enter(u64 *ctx) +{ + int tid; + struct syscall_trace st; + + if (!enabled) + return 0; + + st.nr = ctx[1]; /* syscall number */ + st.unused = 0; + st.timestamp = bpf_ktime_get_ns(); + + tid = bpf_get_current_pid_tgid(); + bpf_map_update_elem(&syscall_trace_map, &tid, &st, BPF_ANY); + + return 0; +} + +SEC("tp_btf/sys_exit") +int sys_exit(u64 *ctx) +{ + int tid; + int key = 0; + u64 cgroup = 0; + long ret = ctx[1]; /* return value of the syscall */ + struct syscall_trace *st; + s64 delta; + + if (!enabled) + return 0; + + tid = bpf_get_current_pid_tgid(); + st = bpf_map_lookup_elem(&syscall_trace_map, &tid); + if (st == NULL) + return 0; + + if (aggr_mode == SYSCALL_AGGR_THREAD) + key = tid; + else if (aggr_mode == SYSCALL_AGGR_CGROUP) + cgroup = get_current_cgroup_id(); + else + key = bpf_get_smp_processor_id(); + + delta = bpf_ktime_get_ns() - st->timestamp; + update_stats(key, cgroup, st->nr, delta, ret); + + bpf_map_delete_elem(&syscall_trace_map, &tid); + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/perf/util/bpf_skel/syscall_summary.h b/tools/perf/util/bpf_skel/syscall_summary.h new file mode 100644 index 000000000000..72ccccb45925 --- /dev/null +++ b/tools/perf/util/bpf_skel/syscall_summary.h @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +/* Data structures shared between BPF and tools. 
*/ +#ifndef UTIL_BPF_SKEL_SYSCALL_SUMMARY_H +#define UTIL_BPF_SKEL_SYSCALL_SUMMARY_H + +enum syscall_aggr_mode { + SYSCALL_AGGR_THREAD, + SYSCALL_AGGR_CPU, + SYSCALL_AGGR_CGROUP, +}; + +struct syscall_key { + u64 cgroup; + int cpu_or_tid; + int nr; +}; + +struct syscall_stats { + u64 total_time; + u64 squared_sum; + u64 max_time; + u64 min_time; + u32 count; + u32 error; +}; + +#endif /* UTIL_BPF_SKEL_SYSCALL_SUMMARY_H */ diff --git a/tools/perf/util/bpf_skel/vmlinux/vmlinux.h b/tools/perf/util/bpf_skel/vmlinux/vmlinux.h index 4dcad7b682bd..a59ce912be18 100644 --- a/tools/perf/util/bpf_skel/vmlinux/vmlinux.h +++ b/tools/perf/util/bpf_skel/vmlinux/vmlinux.h @@ -195,4 +195,21 @@ struct bpf_perf_event_data_kern { */ struct rq {}; +struct kmem_cache { + const char *name; +} __attribute__((preserve_access_index)); + +struct bpf_iter__kmem_cache { + struct kmem_cache *s; +} __attribute__((preserve_access_index)); + +struct zone { + spinlock_t lock; +} __attribute__((preserve_access_index)); + +struct pglist_data { + struct zone node_zones[6]; /* value for all possible config */ + int nr_zones; +} __attribute__((preserve_access_index)); + #endif // __VMLINUX_H |