Diffstat (limited to 'kernel/events/core.c')
-rw-r--r-- | kernel/events/core.c | 609
1 file changed, 424 insertions, 185 deletions
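The hunks below change perf_pmu_unregister() from void to int: it now returns -EINVAL when the pmu is not (or no longer) registered under its type, and -EBUSY when live events cannot be force-detached because the pmu provides ->event_unmapped(). A minimal sketch of how a modular PMU driver might consume that return value follows; the my_pmu/my_pmu_exit identifiers and the warning message are hypothetical and not part of this patch, only the perf_pmu_unregister() semantics come from the hunks below.

/*
 * Hypothetical module exit path acting on the new return value of
 * perf_pmu_unregister(); everything except that call is invented for
 * illustration.
 */
#include <linux/module.h>
#include <linux/perf_event.h>

static struct pmu my_pmu;	/* assumed registered in the module init path */

static void __exit my_pmu_exit(void)
{
	int ret = perf_pmu_unregister(&my_pmu);

	/*
	 * -EBUSY:  events still exist and the pmu uses ->event_unmapped(),
	 *          so perf cannot force-detach them here.
	 * -EINVAL: the pmu was not (or is no longer) registered.
	 */
	if (ret)
		pr_warn("my_pmu: unregister returned %d\n", ret);
}
module_exit(my_pmu_exit);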
diff --git a/kernel/events/core.c b/kernel/events/core.c index 95e703891b24..f34c99f8ce8f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1270,6 +1270,10 @@ static void put_ctx(struct perf_event_context *ctx) if (ctx->task && ctx->task != TASK_TOMBSTONE) put_task_struct(ctx->task); call_rcu(&ctx->rcu_head, free_ctx); + } else { + smp_mb__after_atomic(); /* pairs with wait_var_event() */ + if (ctx->task == TASK_TOMBSTONE) + wake_up_var(&ctx->refcount); } } @@ -2167,7 +2171,7 @@ static void perf_put_aux_event(struct perf_event *event) * If the event is an aux_event, tear down all links to * it from other events. */ - for_each_sibling_event(iter, event->group_leader) { + for_each_sibling_event(iter, event) { if (iter->aux_event != event) continue; @@ -2325,7 +2329,11 @@ static void perf_child_detach(struct perf_event *event) if (WARN_ON_ONCE(!parent_event)) return; + /* + * Can't check this from an IPI, the holder is likey another CPU. + * lockdep_assert_held(&parent_event->child_mutex); + */ sync_child_event(event); list_del_init(&event->child_list); @@ -2343,6 +2351,11 @@ event_filter_match(struct perf_event *event) perf_cgroup_match(event); } +static inline bool is_event_in_freq_mode(struct perf_event *event) +{ + return event->attr.freq && event->attr.sample_freq; +} + static void event_sched_out(struct perf_event *event, struct perf_event_context *ctx) { @@ -2380,7 +2393,7 @@ event_sched_out(struct perf_event *event, struct perf_event_context *ctx) if (!is_software_event(event)) cpc->active_oncpu--; - if (event->attr.freq && event->attr.sample_freq) { + if (is_event_in_freq_mode(event)) { ctx->nr_freq--; epc->nr_freq--; } @@ -2450,8 +2463,9 @@ ctx_time_update_event(struct perf_event_context *ctx, struct perf_event *event) #define DETACH_GROUP 0x01UL #define DETACH_CHILD 0x02UL -#define DETACH_DEAD 0x04UL -#define DETACH_EXIT 0x08UL +#define DETACH_EXIT 0x04UL +#define DETACH_REVOKE 0x08UL +#define DETACH_DEAD 0x10UL /* * Cross CPU call to remove a performance event @@ -2477,12 +2491,15 @@ __perf_remove_from_context(struct perf_event *event, */ if (flags & DETACH_EXIT) state = PERF_EVENT_STATE_EXIT; + if (flags & DETACH_REVOKE) + state = PERF_EVENT_STATE_REVOKED; if (flags & DETACH_DEAD) { event->pending_disable = 1; state = PERF_EVENT_STATE_DEAD; } event_sched_out(event, ctx); perf_event_set_state(event, min(event->state, state)); + if (flags & DETACH_GROUP) perf_group_detach(event); if (flags & DETACH_CHILD) @@ -2628,6 +2645,41 @@ void perf_event_disable_inatomic(struct perf_event *event) static void perf_log_throttle(struct perf_event *event, int enable); static void perf_log_itrace_start(struct perf_event *event); +static void perf_event_unthrottle(struct perf_event *event, bool start) +{ + event->hw.interrupts = 0; + if (start) + event->pmu->start(event, 0); + if (event == event->group_leader) + perf_log_throttle(event, 1); +} + +static void perf_event_throttle(struct perf_event *event) +{ + event->pmu->stop(event, 0); + event->hw.interrupts = MAX_INTERRUPTS; + if (event == event->group_leader) + perf_log_throttle(event, 0); +} + +static void perf_event_unthrottle_group(struct perf_event *event, bool skip_start_event) +{ + struct perf_event *sibling, *leader = event->group_leader; + + perf_event_unthrottle(leader, skip_start_event ? leader != event : true); + for_each_sibling_event(sibling, leader) + perf_event_unthrottle(sibling, skip_start_event ? 
sibling != event : true); +} + +static void perf_event_throttle_group(struct perf_event *event) +{ + struct perf_event *sibling, *leader = event->group_leader; + + perf_event_throttle(leader); + for_each_sibling_event(sibling, leader) + perf_event_throttle(sibling); +} + static int event_sched_in(struct perf_event *event, struct perf_event_context *ctx) { @@ -2656,10 +2708,8 @@ event_sched_in(struct perf_event *event, struct perf_event_context *ctx) * ticks already, also for a heavily scheduling task there is little * guarantee it'll get a tick in a timely manner. */ - if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) { - perf_log_throttle(event, 1); - event->hw.interrupts = 0; - } + if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) + perf_event_unthrottle(event, false); perf_pmu_disable(event->pmu); @@ -2674,7 +2724,7 @@ event_sched_in(struct perf_event *event, struct perf_event_context *ctx) if (!is_software_event(event)) cpc->active_oncpu++; - if (event->attr.freq && event->attr.sample_freq) { + if (is_event_in_freq_mode(event)) { ctx->nr_freq++; epc->nr_freq++; } @@ -4237,14 +4287,10 @@ static void perf_adjust_freq_unthr_events(struct list_head *event_list) hwc = &event->hw; - if (hwc->interrupts == MAX_INTERRUPTS) { - hwc->interrupts = 0; - perf_log_throttle(event, 1); - if (!event->attr.freq || !event->attr.sample_freq) - event->pmu->start(event, 0); - } + if (hwc->interrupts == MAX_INTERRUPTS) + perf_event_unthrottle_group(event, is_event_in_freq_mode(event)); - if (!event->attr.freq || !event->attr.sample_freq) + if (!is_event_in_freq_mode(event)) continue; /* @@ -4516,7 +4562,8 @@ out: static void perf_remove_from_owner(struct perf_event *event); static void perf_event_exit_event(struct perf_event *event, - struct perf_event_context *ctx); + struct perf_event_context *ctx, + bool revoke); /* * Removes all events from the current task that have been marked @@ -4543,7 +4590,7 @@ static void perf_event_remove_on_exec(struct perf_event_context *ctx) modified = true; - perf_event_exit_event(event, ctx); + perf_event_exit_event(event, ctx, false); } raw_spin_lock_irqsave(&ctx->lock, flags); @@ -5125,6 +5172,7 @@ static bool is_sb_event(struct perf_event *event) attr->context_switch || attr->text_poke || attr->bpf_event) return true; + return false; } @@ -5521,6 +5569,8 @@ static void perf_free_addr_filters(struct perf_event *event); /* vs perf_event_alloc() error */ static void __free_event(struct perf_event *event) { + struct pmu *pmu = event->pmu; + if (event->attach_state & PERF_ATTACH_CALLCHAIN) put_callchain_buffers(); @@ -5550,6 +5600,7 @@ static void __free_event(struct perf_event *event) * put_pmu_ctx() needs an event->ctx reference, because of * epc->ctx. */ + WARN_ON_ONCE(!pmu); WARN_ON_ONCE(!event->ctx); WARN_ON_ONCE(event->pmu_ctx->ctx != event->ctx); put_pmu_ctx(event->pmu_ctx); @@ -5562,8 +5613,13 @@ static void __free_event(struct perf_event *event) if (event->ctx) put_ctx(event->ctx); - if (event->pmu) - module_put(event->pmu->module); + if (pmu) { + module_put(pmu->module); + scoped_guard (spinlock, &pmu->events_lock) { + list_del(&event->pmu_list); + wake_up_var(pmu); + } + } call_rcu(&event->rcu_head, free_event_rcu); } @@ -5600,13 +5656,13 @@ static void _free_event(struct perf_event *event) /* * Used to free events which have a known refcount of 1, such as in error paths - * where the event isn't exposed yet and inherited events. + * of inherited events. 
*/ static void free_event(struct perf_event *event) { if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1, - "unexpected event refcount: %ld; ptr=%p\n", - atomic_long_read(&event->refcount), event)) { + "unexpected event refcount: %ld; ptr=%p\n", + atomic_long_read(&event->refcount), event)) { /* leak to avoid use-after-free */ return; } @@ -5689,7 +5745,6 @@ int perf_event_release_kernel(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; struct perf_event *child, *tmp; - LIST_HEAD(free_list); /* * If we got here through err_alloc: free_event(event); we will not @@ -5718,15 +5773,17 @@ int perf_event_release_kernel(struct perf_event *event) * Thus this guarantees that we will in fact observe and kill _ALL_ * child events. */ - perf_remove_from_context(event, DETACH_GROUP|DETACH_DEAD); + if (event->state > PERF_EVENT_STATE_REVOKED) { + perf_remove_from_context(event, DETACH_GROUP|DETACH_DEAD); + } else { + event->state = PERF_EVENT_STATE_DEAD; + } perf_event_ctx_unlock(event, ctx); again: mutex_lock(&event->child_mutex); list_for_each_entry(child, &event->child_list, child_list) { - void *var = NULL; - /* * Cannot change, child events are not migrated, see the * comment with perf_event_ctx_lock_nested(). @@ -5759,44 +5816,24 @@ again: tmp = list_first_entry_or_null(&event->child_list, struct perf_event, child_list); if (tmp == child) { - perf_remove_from_context(child, DETACH_GROUP); - list_move(&child->child_list, &free_list); + perf_remove_from_context(child, DETACH_GROUP | DETACH_CHILD); } else { - var = &ctx->refcount; + child = NULL; } mutex_unlock(&event->child_mutex); mutex_unlock(&ctx->mutex); - put_ctx(ctx); - if (var) { - /* - * If perf_event_free_task() has deleted all events from the - * ctx while the child_mutex got released above, make sure to - * notify about the preceding put_ctx(). - */ - smp_mb(); /* pairs with wait_var_event() */ - wake_up_var(var); + if (child) { + /* Last reference unless ->pending_task work is pending */ + put_event(child); } + put_ctx(ctx); + goto again; } mutex_unlock(&event->child_mutex); - list_for_each_entry_safe(child, tmp, &free_list, child_list) { - void *var = &child->ctx->refcount; - - list_del(&child->child_list); - /* Last reference unless ->pending_task work is pending */ - put_event(child); - - /* - * Wake any perf_event_free_task() waiting for this event to be - * freed. - */ - smp_mb(); /* pairs with wait_var_event() */ - wake_up_var(var); - } - no_ctx: /* * Last reference unless ->pending_task work is pending on this event @@ -6068,8 +6105,14 @@ static __poll_t perf_poll(struct file *file, poll_table *wait) struct perf_buffer *rb; __poll_t events = EPOLLHUP; + if (event->state <= PERF_EVENT_STATE_REVOKED) + return EPOLLERR; + poll_wait(file, &event->waitq, wait); + if (event->state <= PERF_EVENT_STATE_REVOKED) + return EPOLLERR; + if (is_event_hup(event)) return events; @@ -6167,14 +6210,6 @@ static void __perf_event_period(struct perf_event *event, active = (event->state == PERF_EVENT_STATE_ACTIVE); if (active) { perf_pmu_disable(event->pmu); - /* - * We could be throttled; unthrottle now to avoid the tick - * trying to unthrottle while we already re-started the event. 
- */ - if (event->hw.interrupts == MAX_INTERRUPTS) { - event->hw.interrupts = 0; - perf_log_throttle(event, 1); - } event->pmu->stop(event, PERF_EF_UPDATE); } @@ -6182,6 +6217,14 @@ static void __perf_event_period(struct perf_event *event, if (active) { event->pmu->start(event, PERF_EF_RELOAD); + /* + * Once the period is force-reset, the event starts immediately. + * But the event/group could be throttled. Unthrottle the + * event/group now to avoid the next tick trying to unthrottle + * while we already re-started the event/group. + */ + if (event->hw.interrupts == MAX_INTERRUPTS) + perf_event_unthrottle_group(event, true); perf_pmu_enable(event->pmu); } } @@ -6239,12 +6282,18 @@ static int perf_event_set_output(struct perf_event *event, static int perf_event_set_filter(struct perf_event *event, void __user *arg); static int perf_copy_attr(struct perf_event_attr __user *uattr, struct perf_event_attr *attr); +static int __perf_event_set_bpf_prog(struct perf_event *event, + struct bpf_prog *prog, + u64 bpf_cookie); static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg) { void (*func)(struct perf_event *); u32 flags = arg; + if (event->state <= PERF_EVENT_STATE_REVOKED) + return -ENODEV; + switch (cmd) { case PERF_EVENT_IOC_ENABLE: func = _perf_event_enable; @@ -6301,7 +6350,7 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon if (IS_ERR(prog)) return PTR_ERR(prog); - err = perf_event_set_bpf_prog(event, prog, 0); + err = __perf_event_set_bpf_prog(event, prog, 0); if (err) { bpf_prog_put(prog); return err; @@ -6620,9 +6669,22 @@ void ring_buffer_put(struct perf_buffer *rb) call_rcu(&rb->rcu_head, rb_free_rcu); } +typedef void (*mapped_f)(struct perf_event *event, struct mm_struct *mm); + +#define get_mapped(event, func) \ +({ struct pmu *pmu; \ + mapped_f f = NULL; \ + guard(rcu)(); \ + pmu = READ_ONCE(event->pmu); \ + if (pmu) \ + f = pmu->func; \ + f; \ +}) + static void perf_mmap_open(struct vm_area_struct *vma) { struct perf_event *event = vma->vm_file->private_data; + mapped_f mapped = get_mapped(event, event_mapped); atomic_inc(&event->mmap_count); atomic_inc(&event->rb->mmap_count); @@ -6630,8 +6692,8 @@ static void perf_mmap_open(struct vm_area_struct *vma) if (vma->vm_pgoff) atomic_inc(&event->rb->aux_mmap_count); - if (event->pmu->event_mapped) - event->pmu->event_mapped(event, vma->vm_mm); + if (mapped) + mapped(event, vma->vm_mm); } static void perf_pmu_output_stop(struct perf_event *event); @@ -6647,14 +6709,16 @@ static void perf_pmu_output_stop(struct perf_event *event); static void perf_mmap_close(struct vm_area_struct *vma) { struct perf_event *event = vma->vm_file->private_data; + mapped_f unmapped = get_mapped(event, event_unmapped); struct perf_buffer *rb = ring_buffer_get(event); struct user_struct *mmap_user = rb->mmap_user; int mmap_locked = rb->mmap_locked; unsigned long size = perf_data_size(rb); bool detach_rest = false; - if (event->pmu->event_unmapped) - event->pmu->event_unmapped(event, vma->vm_mm); + /* FIXIES vs perf_pmu_unregister() */ + if (unmapped) + unmapped(event, vma->vm_mm); /* * The AUX buffer is strictly a sub-buffer, serialize using aux_mutex @@ -6847,6 +6911,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) unsigned long nr_pages; long user_extra = 0, extra = 0; int ret, flags = 0; + mapped_f mapped; /* * Don't allow mmap() of inherited per-task counters. 
This would @@ -6877,6 +6942,16 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) mutex_lock(&event->mmap_mutex); ret = -EINVAL; + /* + * This relies on __pmu_detach_event() taking mmap_mutex after marking + * the event REVOKED. Either we observe the state, or __pmu_detach_event() + * will detach the rb created here. + */ + if (event->state <= PERF_EVENT_STATE_REVOKED) { + ret = -ENODEV; + goto unlock; + } + if (vma->vm_pgoff == 0) { nr_pages -= 1; @@ -7055,8 +7130,9 @@ aux_unlock: if (!ret) ret = map_range(rb, vma); - if (!ret && event->pmu->event_mapped) - event->pmu->event_mapped(event, vma->vm_mm); + mapped = get_mapped(event, event_mapped); + if (mapped) + mapped(event, vma->vm_mm); return ret; } @@ -7067,6 +7143,9 @@ static int perf_fasync(int fd, struct file *filp, int on) struct perf_event *event = filp->private_data; int retval; + if (event->state <= PERF_EVENT_STATE_REVOKED) + return -ENODEV; + inode_lock(inode); retval = fasync_helper(fd, filp, on, &event->fasync); inode_unlock(inode); @@ -9946,7 +10025,7 @@ void perf_event_text_poke(const void *addr, const void *old_bytes, void perf_event_itrace_started(struct perf_event *event) { - event->attach_state |= PERF_ATTACH_ITRACE; + WRITE_ONCE(event->attach_state, event->attach_state | PERF_ATTACH_ITRACE); } static void perf_log_itrace_start(struct perf_event *event) @@ -10029,14 +10108,13 @@ __perf_event_account_interrupt(struct perf_event *event, int throttle) hwc->interrupts = 1; } else { hwc->interrupts++; - if (unlikely(throttle && - hwc->interrupts > max_samples_per_tick)) { - __this_cpu_inc(perf_throttled_count); - tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS); - hwc->interrupts = MAX_INTERRUPTS; - perf_log_throttle(event, 0); - ret = 1; - } + } + + if (unlikely(throttle && hwc->interrupts >= max_samples_per_tick)) { + __this_cpu_inc(perf_throttled_count); + tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS); + perf_event_throttle_group(event); + ret = 1; } if (event->attr.freq) { @@ -11069,11 +11147,15 @@ static inline bool perf_event_is_tracing(struct perf_event *event) return false; } -int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog, - u64 bpf_cookie) +static int __perf_event_set_bpf_prog(struct perf_event *event, + struct bpf_prog *prog, + u64 bpf_cookie) { bool is_kprobe, is_uprobe, is_tracepoint, is_syscall_tp; + if (event->state <= PERF_EVENT_STATE_REVOKED) + return -ENODEV; + if (!perf_event_is_tracing(event)) return perf_event_set_bpf_handler(event, prog, bpf_cookie); @@ -11108,6 +11190,20 @@ int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog, return perf_event_attach_bpf_prog(event, prog, bpf_cookie); } +int perf_event_set_bpf_prog(struct perf_event *event, + struct bpf_prog *prog, + u64 bpf_cookie) +{ + struct perf_event_context *ctx; + int ret; + + ctx = perf_event_ctx_lock(event); + ret = __perf_event_set_bpf_prog(event, prog, bpf_cookie); + perf_event_ctx_unlock(event, ctx); + + return ret; +} + void perf_event_free_bpf_prog(struct perf_event *event) { if (!event->prog) @@ -11130,7 +11226,15 @@ static void perf_event_free_filter(struct perf_event *event) { } -int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog, +static int __perf_event_set_bpf_prog(struct perf_event *event, + struct bpf_prog *prog, + u64 bpf_cookie) +{ + return -ENOENT; +} + +int perf_event_set_bpf_prog(struct perf_event *event, + struct bpf_prog *prog, u64 bpf_cookie) { return -ENOENT; @@ -12235,6 +12339,9 @@ int 
perf_pmu_register(struct pmu *_pmu, const char *name, int type) if (!pmu->event_idx) pmu->event_idx = perf_event_idx_default; + INIT_LIST_HEAD(&pmu->events); + spin_lock_init(&pmu->events_lock); + /* * Now that the PMU is complete, make it visible to perf_try_init_event(). */ @@ -12248,21 +12355,143 @@ int perf_pmu_register(struct pmu *_pmu, const char *name, int type) } EXPORT_SYMBOL_GPL(perf_pmu_register); -void perf_pmu_unregister(struct pmu *pmu) +static void __pmu_detach_event(struct pmu *pmu, struct perf_event *event, + struct perf_event_context *ctx) +{ + /* + * De-schedule the event and mark it REVOKED. + */ + perf_event_exit_event(event, ctx, true); + + /* + * All _free_event() bits that rely on event->pmu: + * + * Notably, perf_mmap() relies on the ordering here. + */ + scoped_guard (mutex, &event->mmap_mutex) { + WARN_ON_ONCE(pmu->event_unmapped); + /* + * Mostly an empty lock sequence, such that perf_mmap(), which + * relies on mmap_mutex, is sure to observe the state change. + */ + } + + perf_event_free_bpf_prog(event); + perf_free_addr_filters(event); + + if (event->destroy) { + event->destroy(event); + event->destroy = NULL; + } + + if (event->pmu_ctx) { + put_pmu_ctx(event->pmu_ctx); + event->pmu_ctx = NULL; + } + + exclusive_event_destroy(event); + module_put(pmu->module); + + event->pmu = NULL; /* force fault instead of UAF */ +} + +static void pmu_detach_event(struct pmu *pmu, struct perf_event *event) +{ + struct perf_event_context *ctx; + + ctx = perf_event_ctx_lock(event); + __pmu_detach_event(pmu, event, ctx); + perf_event_ctx_unlock(event, ctx); + + scoped_guard (spinlock, &pmu->events_lock) + list_del(&event->pmu_list); +} + +static struct perf_event *pmu_get_event(struct pmu *pmu) +{ + struct perf_event *event; + + guard(spinlock)(&pmu->events_lock); + list_for_each_entry(event, &pmu->events, pmu_list) { + if (atomic_long_inc_not_zero(&event->refcount)) + return event; + } + + return NULL; +} + +static bool pmu_empty(struct pmu *pmu) +{ + guard(spinlock)(&pmu->events_lock); + return list_empty(&pmu->events); +} + +static void pmu_detach_events(struct pmu *pmu) +{ + struct perf_event *event; + + for (;;) { + event = pmu_get_event(pmu); + if (!event) + break; + + pmu_detach_event(pmu, event); + put_event(event); + } + + /* + * wait for pending _free_event()s + */ + wait_var_event(pmu, pmu_empty(pmu)); +} + +int perf_pmu_unregister(struct pmu *pmu) { scoped_guard (mutex, &pmus_lock) { + if (!idr_cmpxchg(&pmu_idr, pmu->type, pmu, NULL)) + return -EINVAL; + list_del_rcu(&pmu->entry); - idr_remove(&pmu_idr, pmu->type); } /* * We dereference the pmu list under both SRCU and regular RCU, so * synchronize against both of those. + * + * Notably, the entirety of event creation, from perf_init_event() + * (which will now fail, because of the above) until + * perf_install_in_context() should be under SRCU such that + * this synchronizes against event creation. This avoids trying to + * detach events that are not fully formed. */ synchronize_srcu(&pmus_srcu); synchronize_rcu(); + if (pmu->event_unmapped && !pmu_empty(pmu)) { + /* + * Can't force remove events when pmu::event_unmapped() + * is used in perf_mmap_close(). + */ + guard(mutex)(&pmus_lock); + idr_cmpxchg(&pmu_idr, pmu->type, NULL, pmu); + list_add_rcu(&pmu->entry, &pmus); + return -EBUSY; + } + + scoped_guard (mutex, &pmus_lock) + idr_remove(&pmu_idr, pmu->type); + + /* + * PMU is removed from the pmus list, so no new events will + * be created, now take care of the existing ones. 
+ */ + pmu_detach_events(pmu); + + /* + * PMU is unused, make it go away. + */ perf_pmu_free(pmu); + return 0; } EXPORT_SYMBOL_GPL(perf_pmu_unregister); @@ -12356,7 +12585,7 @@ static struct pmu *perf_init_event(struct perf_event *event) struct pmu *pmu; int type, ret; - guard(srcu)(&pmus_srcu); + guard(srcu)(&pmus_srcu); /* pmu idr/list access */ /* * Save original type before calling pmu->event_init() since certain @@ -12580,6 +12809,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, INIT_LIST_HEAD(&event->active_entry); INIT_LIST_HEAD(&event->addr_filters.list); INIT_HLIST_NODE(&event->hlist_entry); + INIT_LIST_HEAD(&event->pmu_list); init_waitqueue_head(&event->waitq); @@ -12651,7 +12881,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, hwc = &event->hw; hwc->sample_period = attr->sample_period; - if (attr->freq && attr->sample_freq) + if (is_event_in_freq_mode(event)) hwc->sample_period = 1; hwc->last_period = hwc->sample_period; @@ -12758,6 +12988,13 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, /* symmetric to unaccount_event() in _free_event() */ account_event(event); + /* + * Event creation should be under SRCU, see perf_pmu_unregister(). + */ + lockdep_assert_held(&pmus_srcu); + scoped_guard (spinlock, &pmu->events_lock) + list_add(&event->pmu_list, &pmu->events); + return_ptr(event); } @@ -12957,6 +13194,9 @@ set: goto unlock; if (output_event) { + if (output_event->state <= PERF_EVENT_STATE_REVOKED) + goto unlock; + /* get the rb we want to redirect to */ rb = ring_buffer_get(output_event); if (!rb) @@ -13138,6 +13378,11 @@ SYSCALL_DEFINE5(perf_event_open, if (event_fd < 0) return event_fd; + /* + * Event creation should be under SRCU, see perf_pmu_unregister(). + */ + guard(srcu)(&pmus_srcu); + CLASS(fd, group)(group_fd); // group_fd == -1 => empty if (group_fd != -1) { if (!is_perf_file(group)) { @@ -13145,6 +13390,10 @@ SYSCALL_DEFINE5(perf_event_open, goto err_fd; } group_leader = fd_file(group)->private_data; + if (group_leader->state <= PERF_EVENT_STATE_REVOKED) { + err = -ENODEV; + goto err_fd; + } if (flags & PERF_FLAG_FD_OUTPUT) output_event = group_leader; if (flags & PERF_FLAG_FD_NO_GROUP) @@ -13441,7 +13690,7 @@ err_cred: if (task) up_read(&task->signal->exec_update_lock); err_alloc: - free_event(event); + put_event(event); err_task: if (task) put_task_struct(task); @@ -13478,6 +13727,11 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, if (attr->aux_output || attr->aux_action) return ERR_PTR(-EINVAL); + /* + * Event creation should be under SRCU, see perf_pmu_unregister(). 
+ */ + guard(srcu)(&pmus_srcu); + event = perf_event_alloc(attr, cpu, task, NULL, NULL, overflow_handler, context, -1); if (IS_ERR(event)) { @@ -13549,7 +13803,7 @@ err_unlock: perf_unpin_context(ctx); put_ctx(ctx); err_alloc: - free_event(event); + put_event(event); err: return ERR_PTR(err); } @@ -13689,10 +13943,12 @@ static void sync_child_event(struct perf_event *child_event) } static void -perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx) +perf_event_exit_event(struct perf_event *event, + struct perf_event_context *ctx, bool revoke) { struct perf_event *parent_event = event->parent; - unsigned long detach_flags = 0; + unsigned long detach_flags = DETACH_EXIT; + unsigned int attach_state; if (parent_event) { /* @@ -13707,22 +13963,38 @@ perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx) * Do destroy all inherited groups, we don't care about those * and being thorough is better. */ - detach_flags = DETACH_GROUP | DETACH_CHILD; + detach_flags |= DETACH_GROUP | DETACH_CHILD; mutex_lock(&parent_event->child_mutex); + /* PERF_ATTACH_ITRACE might be set concurrently */ + attach_state = READ_ONCE(event->attach_state); } - perf_remove_from_context(event, detach_flags | DETACH_EXIT); + if (revoke) + detach_flags |= DETACH_GROUP | DETACH_REVOKE; + perf_remove_from_context(event, detach_flags); /* * Child events can be freed. */ if (parent_event) { mutex_unlock(&parent_event->child_mutex); + /* - * Kick perf_poll() for is_event_hup(); + * Match the refcount initialization. Make sure it doesn't happen + * twice if pmu_detach_event() calls it on an already exited task. */ - perf_event_wakeup(parent_event); - put_event(event); + if (attach_state & PERF_ATTACH_CHILD) { + /* + * Kick perf_poll() for is_event_hup(); + */ + perf_event_wakeup(parent_event); + /* + * pmu_detach_event() will have an extra refcount. + * perf_pending_task() might have one too. + */ + put_event(event); + } + return; } @@ -13732,15 +14004,13 @@ perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx) perf_event_wakeup(event); } -static void perf_event_exit_task_context(struct task_struct *child) +static void perf_event_exit_task_context(struct task_struct *task, bool exit) { - struct perf_event_context *child_ctx, *clone_ctx = NULL; + struct perf_event_context *ctx, *clone_ctx = NULL; struct perf_event *child_event, *next; - WARN_ON_ONCE(child != current); - - child_ctx = perf_pin_task_context(child); - if (!child_ctx) + ctx = perf_pin_task_context(task); + if (!ctx) return; /* @@ -13753,27 +14023,28 @@ static void perf_event_exit_task_context(struct task_struct *child) * without ctx::mutex (it cannot because of the move_group double mutex * lock thing). See the comments in perf_install_in_context(). */ - mutex_lock(&child_ctx->mutex); + mutex_lock(&ctx->mutex); /* * In a single ctx::lock section, de-schedule the events and detach the * context from the task such that we cannot ever get it scheduled back * in. */ - raw_spin_lock_irq(&child_ctx->lock); - task_ctx_sched_out(child_ctx, NULL, EVENT_ALL); + raw_spin_lock_irq(&ctx->lock); + if (exit) + task_ctx_sched_out(ctx, NULL, EVENT_ALL); /* * Now that the context is inactive, destroy the task <-> ctx relation * and mark the context dead. 
*/ - RCU_INIT_POINTER(child->perf_event_ctxp, NULL); - put_ctx(child_ctx); /* cannot be last */ - WRITE_ONCE(child_ctx->task, TASK_TOMBSTONE); - put_task_struct(current); /* cannot be last */ + RCU_INIT_POINTER(task->perf_event_ctxp, NULL); + put_ctx(ctx); /* cannot be last */ + WRITE_ONCE(ctx->task, TASK_TOMBSTONE); + put_task_struct(task); /* cannot be last */ - clone_ctx = unclone_ctx(child_ctx); - raw_spin_unlock_irq(&child_ctx->lock); + clone_ctx = unclone_ctx(ctx); + raw_spin_unlock_irq(&ctx->lock); if (clone_ctx) put_ctx(clone_ctx); @@ -13783,28 +14054,48 @@ static void perf_event_exit_task_context(struct task_struct *child) * won't get any samples after PERF_RECORD_EXIT. We can however still * get a few PERF_RECORD_READ events. */ - perf_event_task(child, child_ctx, 0); + if (exit) + perf_event_task(task, ctx, 0); - list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry) - perf_event_exit_event(child_event, child_ctx); + list_for_each_entry_safe(child_event, next, &ctx->event_list, event_entry) + perf_event_exit_event(child_event, ctx, false); - mutex_unlock(&child_ctx->mutex); + mutex_unlock(&ctx->mutex); - put_ctx(child_ctx); + if (!exit) { + /* + * perf_event_release_kernel() could still have a reference on + * this context. In that case we must wait for these events to + * have been freed (in particular all their references to this + * task must've been dropped). + * + * Without this copy_process() will unconditionally free this + * task (irrespective of its reference count) and + * _free_event()'s put_task_struct(event->hw.target) will be a + * use-after-free. + * + * Wait for all events to drop their context reference. + */ + wait_var_event(&ctx->refcount, + refcount_read(&ctx->refcount) == 1); + } + put_ctx(ctx); } /* - * When a child task exits, feed back event values to parent events. + * When a task exits, feed back event values to parent events. * * Can be called with exec_update_lock held when called from * setup_new_exec(). */ -void perf_event_exit_task(struct task_struct *child) +void perf_event_exit_task(struct task_struct *task) { struct perf_event *event, *tmp; - mutex_lock(&child->perf_event_mutex); - list_for_each_entry_safe(event, tmp, &child->perf_event_list, + WARN_ON_ONCE(task != current); + + mutex_lock(&task->perf_event_mutex); + list_for_each_entry_safe(event, tmp, &task->perf_event_list, owner_entry) { list_del_init(&event->owner_entry); @@ -13815,42 +14106,23 @@ void perf_event_exit_task(struct task_struct *child) */ smp_store_release(&event->owner, NULL); } - mutex_unlock(&child->perf_event_mutex); + mutex_unlock(&task->perf_event_mutex); - perf_event_exit_task_context(child); + perf_event_exit_task_context(task, true); /* * The perf_event_exit_task_context calls perf_event_task - * with child's task_ctx, which generates EXIT events for - * child contexts and sets child->perf_event_ctxp[] to NULL. + * with task's task_ctx, which generates EXIT events for + * task contexts and sets task->perf_event_ctxp[] to NULL. * At this point we need to send EXIT events to cpu contexts. */ - perf_event_task(child, NULL, 0); + perf_event_task(task, NULL, 0); /* * Detach the perf_ctx_data for the system-wide event. 
*/ guard(percpu_read)(&global_ctx_data_rwsem); - detach_task_ctx_data(child); -} - -static void perf_free_event(struct perf_event *event, - struct perf_event_context *ctx) -{ - struct perf_event *parent = event->parent; - - if (WARN_ON_ONCE(!parent)) - return; - - mutex_lock(&parent->child_mutex); - list_del_init(&event->child_list); - mutex_unlock(&parent->child_mutex); - - raw_spin_lock_irq(&ctx->lock); - perf_group_detach(event); - list_del_event(event, ctx); - raw_spin_unlock_irq(&ctx->lock); - put_event(event); + detach_task_ctx_data(task); } /* @@ -13862,48 +14134,7 @@ static void perf_free_event(struct perf_event *event, */ void perf_event_free_task(struct task_struct *task) { - struct perf_event_context *ctx; - struct perf_event *event, *tmp; - - ctx = rcu_access_pointer(task->perf_event_ctxp); - if (!ctx) - return; - - mutex_lock(&ctx->mutex); - raw_spin_lock_irq(&ctx->lock); - /* - * Destroy the task <-> ctx relation and mark the context dead. - * - * This is important because even though the task hasn't been - * exposed yet the context has been (through child_list). - */ - RCU_INIT_POINTER(task->perf_event_ctxp, NULL); - WRITE_ONCE(ctx->task, TASK_TOMBSTONE); - put_task_struct(task); /* cannot be last */ - raw_spin_unlock_irq(&ctx->lock); - - - list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry) - perf_free_event(event, ctx); - - mutex_unlock(&ctx->mutex); - - /* - * perf_event_release_kernel() could've stolen some of our - * child events and still have them on its free_list. In that - * case we must wait for these events to have been freed (in - * particular all their references to this task must've been - * dropped). - * - * Without this copy_process() will unconditionally free this - * task (irrespective of its reference count) and - * _free_event()'s put_task_struct(event->hw.target) will be a - * use-after-free. - * - * Wait for all events to drop their context reference. - */ - wait_var_event(&ctx->refcount, refcount_read(&ctx->refcount) == 1); - put_ctx(ctx); /* must be last */ + perf_event_exit_task_context(task, false); } void perf_event_delayed_put(struct task_struct *task) @@ -13980,6 +14211,14 @@ inherit_event(struct perf_event *parent_event, if (parent_event->parent) parent_event = parent_event->parent; + if (parent_event->state <= PERF_EVENT_STATE_REVOKED) + return NULL; + + /* + * Event creation should be under SRCU, see perf_pmu_unregister(). + */ + guard(srcu)(&pmus_srcu); + child_event = perf_event_alloc(&parent_event->attr, parent_event->cpu, child, |
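As for the user-visible effect of the new PERF_EVENT_STATE_REVOKED state: per the perf_poll(), _perf_ioctl(), perf_mmap() and perf_fasync() hunks above, an event file descriptor survives the unregistration of its PMU, but further operations fail cleanly instead of reaching into a dead pmu. A minimal userspace sketch of that behaviour follows, assuming the backing PMU module gets unregistered after the event is opened; the attr.type value is a placeholder for a dynamically allocated module PMU type and the program does not itself trigger the unregister.

/*
 * Userspace illustration of the error paths added above; not part of the
 * patch. Assumes 4 KiB pages and a placeholder PMU type.
 */
#include <errno.h>
#include <poll.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/perf_event.h>

int main(void)
{
	struct perf_event_attr attr = {
		.type = 42,		/* placeholder: a module PMU's dynamic type */
		.size = sizeof(attr),
	};
	struct pollfd pfd;
	int fd;

	fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
	if (fd < 0) {
		fprintf(stderr, "perf_event_open: %s\n", strerror(errno));
		return 1;
	}

	/* ... the backing PMU module is assumed to be unregistered here ... */

	if (ioctl(fd, PERF_EVENT_IOC_ENABLE, 0) < 0 && errno == ENODEV)
		printf("ioctl: ENODEV, event was revoked\n");

	if (mmap(NULL, 2 * 4096, PROT_READ | PROT_WRITE, MAP_SHARED,
		 fd, 0) == MAP_FAILED && errno == ENODEV)
		printf("mmap: ENODEV, event was revoked\n");

	pfd.fd = fd;
	pfd.events = POLLIN;
	if (poll(&pfd, 1, 0) > 0 && (pfd.revents & POLLERR))
		printf("poll: POLLERR on revoked event\n");

	close(fd);
	return 0;
}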