summary | refs | log | tree | commit | diff
path: root/kernel/events
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/events')
-rw-r--r-- kernel/events/core.c 689
-rw-r--r-- kernel/events/ring_buffer.c 29
-rw-r--r-- kernel/events/uprobes.c 372
3 files changed, 665 insertions, 425 deletions
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 0bb21659e252..f34c99f8ce8f 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1270,6 +1270,10 @@ static void put_ctx(struct perf_event_context *ctx)
if (ctx->task && ctx->task != TASK_TOMBSTONE)
put_task_struct(ctx->task);
call_rcu(&ctx->rcu_head, free_ctx);
+ } else {
+ smp_mb__after_atomic(); /* pairs with wait_var_event() */
+ if (ctx->task == TASK_TOMBSTONE)
+ wake_up_var(&ctx->refcount);
}
}
@@ -2167,7 +2171,7 @@ static void perf_put_aux_event(struct perf_event *event)
* If the event is an aux_event, tear down all links to
* it from other events.
*/
- for_each_sibling_event(iter, event->group_leader) {
+ for_each_sibling_event(iter, event) {
if (iter->aux_event != event)
continue;
@@ -2325,7 +2329,11 @@ static void perf_child_detach(struct perf_event *event)
if (WARN_ON_ONCE(!parent_event))
return;
+ /*
+ * Can't check this from an IPI, the holder is likely another CPU.
+ *
lockdep_assert_held(&parent_event->child_mutex);
+ */
sync_child_event(event);
list_del_init(&event->child_list);
@@ -2343,6 +2351,11 @@ event_filter_match(struct perf_event *event)
perf_cgroup_match(event);
}
+/*
+ * True when the event requested frequency-based sampling (attr.freq) and
+ * supplied a non-zero attr.sample_freq.
+ */
+static inline bool is_event_in_freq_mode(struct perf_event *event)
+{
+	if (!event->attr.freq)
+		return false;
+
+	return event->attr.sample_freq != 0;
+}
+
static void
event_sched_out(struct perf_event *event, struct perf_event_context *ctx)
{
@@ -2380,7 +2393,7 @@ event_sched_out(struct perf_event *event, struct perf_event_context *ctx)
if (!is_software_event(event))
cpc->active_oncpu--;
- if (event->attr.freq && event->attr.sample_freq) {
+ if (is_event_in_freq_mode(event)) {
ctx->nr_freq--;
epc->nr_freq--;
}
@@ -2450,7 +2463,9 @@ ctx_time_update_event(struct perf_event_context *ctx, struct perf_event *event)
#define DETACH_GROUP 0x01UL
#define DETACH_CHILD 0x02UL
-#define DETACH_DEAD 0x04UL
+#define DETACH_EXIT 0x04UL
+#define DETACH_REVOKE 0x08UL
+#define DETACH_DEAD 0x10UL
/*
* Cross CPU call to remove a performance event
@@ -2465,6 +2480,7 @@ __perf_remove_from_context(struct perf_event *event,
void *info)
{
struct perf_event_pmu_context *pmu_ctx = event->pmu_ctx;
+ enum perf_event_state state = PERF_EVENT_STATE_OFF;
unsigned long flags = (unsigned long)info;
ctx_time_update(cpuctx, ctx);
@@ -2473,16 +2489,22 @@ __perf_remove_from_context(struct perf_event *event,
* Ensure event_sched_out() switches to OFF, at the very least
* this avoids raising perf_pending_task() at this time.
*/
- if (flags & DETACH_DEAD)
+ if (flags & DETACH_EXIT)
+ state = PERF_EVENT_STATE_EXIT;
+ if (flags & DETACH_REVOKE)
+ state = PERF_EVENT_STATE_REVOKED;
+ if (flags & DETACH_DEAD) {
event->pending_disable = 1;
+ state = PERF_EVENT_STATE_DEAD;
+ }
event_sched_out(event, ctx);
+ perf_event_set_state(event, min(event->state, state));
+
if (flags & DETACH_GROUP)
perf_group_detach(event);
if (flags & DETACH_CHILD)
perf_child_detach(event);
list_del_event(event, ctx);
- if (flags & DETACH_DEAD)
- event->state = PERF_EVENT_STATE_DEAD;
if (!pmu_ctx->nr_events) {
pmu_ctx->rotate_necessary = 0;
@@ -2623,6 +2645,41 @@ void perf_event_disable_inatomic(struct perf_event *event)
static void perf_log_throttle(struct perf_event *event, int enable);
static void perf_log_itrace_start(struct perf_event *event);
+/*
+ * Clear the throttled marker of @event (hw.interrupts) and, when @start is
+ * true, restart it through its PMU's start() callback.
+ *
+ * The unthrottle record (perf_log_throttle(event, 1)) is only emitted for an
+ * event that is its own group leader.
+ */
+static void perf_event_unthrottle(struct perf_event *event, bool start)
+{
+ event->hw.interrupts = 0;
+ if (start)
+ event->pmu->start(event, 0);
+ if (event == event->group_leader)
+ perf_log_throttle(event, 1);
+}
+
+/*
+ * Stop @event through its PMU's stop() callback and mark it throttled by
+ * setting hw.interrupts to MAX_INTERRUPTS.
+ *
+ * The throttle record (perf_log_throttle(event, 0)) is only emitted for an
+ * event that is its own group leader.
+ */
+static void perf_event_throttle(struct perf_event *event)
+{
+ event->pmu->stop(event, 0);
+ event->hw.interrupts = MAX_INTERRUPTS;
+ if (event == event->group_leader)
+ perf_log_throttle(event, 0);
+}
+
+/*
+ * Unthrottle every member of @event's group: the leader first, then each
+ * sibling. With @skip_start_event set, @event itself is unthrottled without
+ * being restarted; every other member is always restarted.
+ */
+static void perf_event_unthrottle_group(struct perf_event *event, bool skip_start_event)
+{
+	struct perf_event *leader = event->group_leader;
+	struct perf_event *member;
+
+	perf_event_unthrottle(leader, !skip_start_event || leader != event);
+
+	for_each_sibling_event(member, leader) {
+		bool restart = !skip_start_event || member != event;
+
+		perf_event_unthrottle(member, restart);
+	}
+}
+
+/*
+ * Throttle every member of @event's group: the leader first, then each of
+ * its siblings.
+ */
+static void perf_event_throttle_group(struct perf_event *event)
+{
+	struct perf_event *leader = event->group_leader;
+	struct perf_event *member;
+
+	perf_event_throttle(leader);
+
+	for_each_sibling_event(member, leader) {
+		perf_event_throttle(member);
+	}
+}
+
static int
event_sched_in(struct perf_event *event, struct perf_event_context *ctx)
{
@@ -2651,10 +2708,8 @@ event_sched_in(struct perf_event *event, struct perf_event_context *ctx)
* ticks already, also for a heavily scheduling task there is little
* guarantee it'll get a tick in a timely manner.
*/
- if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) {
- perf_log_throttle(event, 1);
- event->hw.interrupts = 0;
- }
+ if (unlikely(event->hw.interrupts == MAX_INTERRUPTS))
+ perf_event_unthrottle(event, false);
perf_pmu_disable(event->pmu);
@@ -2669,7 +2724,7 @@ event_sched_in(struct perf_event *event, struct perf_event_context *ctx)
if (!is_software_event(event))
cpc->active_oncpu++;
- if (event->attr.freq && event->attr.sample_freq) {
+ if (is_event_in_freq_mode(event)) {
ctx->nr_freq++;
epc->nr_freq++;
}
@@ -3938,7 +3993,7 @@ static int merge_sched_in(struct perf_event *event, void *data)
perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
if (*perf_event_fasync(event))
- event->pending_kill = POLL_HUP;
+ event->pending_kill = POLL_ERR;
perf_event_wakeup(event);
} else {
@@ -4232,14 +4287,10 @@ static void perf_adjust_freq_unthr_events(struct list_head *event_list)
hwc = &event->hw;
- if (hwc->interrupts == MAX_INTERRUPTS) {
- hwc->interrupts = 0;
- perf_log_throttle(event, 1);
- if (!event->attr.freq || !event->attr.sample_freq)
- event->pmu->start(event, 0);
- }
+ if (hwc->interrupts == MAX_INTERRUPTS)
+ perf_event_unthrottle_group(event, is_event_in_freq_mode(event));
- if (!event->attr.freq || !event->attr.sample_freq)
+ if (!is_event_in_freq_mode(event))
continue;
/*
@@ -4511,7 +4562,8 @@ out:
static void perf_remove_from_owner(struct perf_event *event);
static void perf_event_exit_event(struct perf_event *event,
- struct perf_event_context *ctx);
+ struct perf_event_context *ctx,
+ bool revoke);
/*
* Removes all events from the current task that have been marked
@@ -4538,7 +4590,7 @@ static void perf_event_remove_on_exec(struct perf_event_context *ctx)
modified = true;
- perf_event_exit_event(event, ctx);
+ perf_event_exit_event(event, ctx, false);
}
raw_spin_lock_irqsave(&ctx->lock, flags);
@@ -5120,6 +5172,7 @@ static bool is_sb_event(struct perf_event *event)
attr->context_switch || attr->text_poke ||
attr->bpf_event)
return true;
+
return false;
}
@@ -5513,33 +5566,11 @@ static bool exclusive_event_installable(struct perf_event *event,
static void perf_free_addr_filters(struct perf_event *event);
-static void perf_pending_task_sync(struct perf_event *event)
-{
- struct callback_head *head = &event->pending_task;
-
- if (!event->pending_work)
- return;
- /*
- * If the task is queued to the current task's queue, we
- * obviously can't wait for it to complete. Simply cancel it.
- */
- if (task_work_cancel(current, head)) {
- event->pending_work = 0;
- local_dec(&event->ctx->nr_no_switch_fast);
- return;
- }
-
- /*
- * All accesses related to the event are within the same RCU section in
- * perf_pending_task(). The RCU grace period before the event is freed
- * will make sure all those accesses are complete by then.
- */
- rcuwait_wait_event(&event->pending_work_wait, !event->pending_work, TASK_UNINTERRUPTIBLE);
-}
-
/* vs perf_event_alloc() error */
static void __free_event(struct perf_event *event)
{
+ struct pmu *pmu = event->pmu;
+
if (event->attach_state & PERF_ATTACH_CALLCHAIN)
put_callchain_buffers();
@@ -5569,6 +5600,7 @@ static void __free_event(struct perf_event *event)
* put_pmu_ctx() needs an event->ctx reference, because of
* epc->ctx.
*/
+ WARN_ON_ONCE(!pmu);
WARN_ON_ONCE(!event->ctx);
WARN_ON_ONCE(event->pmu_ctx->ctx != event->ctx);
put_pmu_ctx(event->pmu_ctx);
@@ -5581,8 +5613,13 @@ static void __free_event(struct perf_event *event)
if (event->ctx)
put_ctx(event->ctx);
- if (event->pmu)
- module_put(event->pmu->module);
+ if (pmu) {
+ module_put(pmu->module);
+ scoped_guard (spinlock, &pmu->events_lock) {
+ list_del(&event->pmu_list);
+ wake_up_var(pmu);
+ }
+ }
call_rcu(&event->rcu_head, free_event_rcu);
}
@@ -5594,7 +5631,6 @@ static void _free_event(struct perf_event *event)
{
irq_work_sync(&event->pending_irq);
irq_work_sync(&event->pending_disable_irq);
- perf_pending_task_sync(event);
unaccount_event(event);
@@ -5620,13 +5656,13 @@ static void _free_event(struct perf_event *event)
/*
* Used to free events which have a known refcount of 1, such as in error paths
- * where the event isn't exposed yet and inherited events.
+ * of inherited events.
*/
static void free_event(struct perf_event *event)
{
if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1,
- "unexpected event refcount: %ld; ptr=%p\n",
- atomic_long_read(&event->refcount), event)) {
+ "unexpected event refcount: %ld; ptr=%p\n",
+ atomic_long_read(&event->refcount), event)) {
/* leak to avoid use-after-free */
return;
}
@@ -5687,10 +5723,17 @@ static void perf_remove_from_owner(struct perf_event *event)
+/*
+ * Drop one reference on @event. When that was the last reference the event
+ * is torn down via _free_event() and, if it had a parent, one reference on
+ * the parent is dropped as well.
+ */
static void put_event(struct perf_event *event)
{
+ struct perf_event *parent;
+
if (!atomic_long_dec_and_test(&event->refcount))
return;
+ /* Sample ->parent first: @event is not touched after _free_event(). */
+ parent = event->parent;
_free_event(event);
+
+ /* Matches the refcount bump in inherit_event() */
+ if (parent)
+ put_event(parent);
}
/*
@@ -5702,7 +5745,6 @@ int perf_event_release_kernel(struct perf_event *event)
{
struct perf_event_context *ctx = event->ctx;
struct perf_event *child, *tmp;
- LIST_HEAD(free_list);
/*
* If we got here through err_alloc: free_event(event); we will not
@@ -5731,15 +5773,17 @@ int perf_event_release_kernel(struct perf_event *event)
* Thus this guarantees that we will in fact observe and kill _ALL_
* child events.
*/
- perf_remove_from_context(event, DETACH_GROUP|DETACH_DEAD);
+ if (event->state > PERF_EVENT_STATE_REVOKED) {
+ perf_remove_from_context(event, DETACH_GROUP|DETACH_DEAD);
+ } else {
+ event->state = PERF_EVENT_STATE_DEAD;
+ }
perf_event_ctx_unlock(event, ctx);
again:
mutex_lock(&event->child_mutex);
list_for_each_entry(child, &event->child_list, child_list) {
- void *var = NULL;
-
/*
* Cannot change, child events are not migrated, see the
* comment with perf_event_ctx_lock_nested().
@@ -5772,50 +5816,30 @@ again:
tmp = list_first_entry_or_null(&event->child_list,
struct perf_event, child_list);
if (tmp == child) {
- perf_remove_from_context(child, DETACH_GROUP);
- list_move(&child->child_list, &free_list);
- /*
- * This matches the refcount bump in inherit_event();
- * this can't be the last reference.
- */
- put_event(event);
+ perf_remove_from_context(child, DETACH_GROUP | DETACH_CHILD);
} else {
- var = &ctx->refcount;
+ child = NULL;
}
mutex_unlock(&event->child_mutex);
mutex_unlock(&ctx->mutex);
- put_ctx(ctx);
- if (var) {
- /*
- * If perf_event_free_task() has deleted all events from the
- * ctx while the child_mutex got released above, make sure to
- * notify about the preceding put_ctx().
- */
- smp_mb(); /* pairs with wait_var_event() */
- wake_up_var(var);
+ if (child) {
+ /* Last reference unless ->pending_task work is pending */
+ put_event(child);
}
+ put_ctx(ctx);
+
goto again;
}
mutex_unlock(&event->child_mutex);
- list_for_each_entry_safe(child, tmp, &free_list, child_list) {
- void *var = &child->ctx->refcount;
-
- list_del(&child->child_list);
- free_event(child);
-
- /*
- * Wake any perf_event_free_task() waiting for this event to be
- * freed.
- */
- smp_mb(); /* pairs with wait_var_event() */
- wake_up_var(var);
- }
-
no_ctx:
- put_event(event); /* Must be the 'last' reference */
+ /*
+ * Last reference unless ->pending_task work is pending on this event
+ * or any of its children.
+ */
+ put_event(event);
return 0;
}
EXPORT_SYMBOL_GPL(perf_event_release_kernel);
@@ -6081,14 +6105,20 @@ static __poll_t perf_poll(struct file *file, poll_table *wait)
struct perf_buffer *rb;
__poll_t events = EPOLLHUP;
+ if (event->state <= PERF_EVENT_STATE_REVOKED)
+ return EPOLLERR;
+
poll_wait(file, &event->waitq, wait);
+ if (event->state <= PERF_EVENT_STATE_REVOKED)
+ return EPOLLERR;
+
if (is_event_hup(event))
return events;
if (unlikely(READ_ONCE(event->state) == PERF_EVENT_STATE_ERROR &&
event->attr.pinned))
- return events;
+ return EPOLLERR;
/*
* Pin the event->rb by taking event->mmap_mutex; otherwise
@@ -6180,14 +6210,6 @@ static void __perf_event_period(struct perf_event *event,
active = (event->state == PERF_EVENT_STATE_ACTIVE);
if (active) {
perf_pmu_disable(event->pmu);
- /*
- * We could be throttled; unthrottle now to avoid the tick
- * trying to unthrottle while we already re-started the event.
- */
- if (event->hw.interrupts == MAX_INTERRUPTS) {
- event->hw.interrupts = 0;
- perf_log_throttle(event, 1);
- }
event->pmu->stop(event, PERF_EF_UPDATE);
}
@@ -6195,6 +6217,14 @@ static void __perf_event_period(struct perf_event *event,
if (active) {
event->pmu->start(event, PERF_EF_RELOAD);
+ /*
+ * Once the period is force-reset, the event starts immediately.
+ * But the event/group could be throttled. Unthrottle the
+ * event/group now to avoid the next tick trying to unthrottle
+ * while we already re-started the event/group.
+ */
+ if (event->hw.interrupts == MAX_INTERRUPTS)
+ perf_event_unthrottle_group(event, true);
perf_pmu_enable(event->pmu);
}
}
@@ -6252,12 +6282,18 @@ static int perf_event_set_output(struct perf_event *event,
static int perf_event_set_filter(struct perf_event *event, void __user *arg);
static int perf_copy_attr(struct perf_event_attr __user *uattr,
struct perf_event_attr *attr);
+static int __perf_event_set_bpf_prog(struct perf_event *event,
+ struct bpf_prog *prog,
+ u64 bpf_cookie);
static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
{
void (*func)(struct perf_event *);
u32 flags = arg;
+ if (event->state <= PERF_EVENT_STATE_REVOKED)
+ return -ENODEV;
+
switch (cmd) {
case PERF_EVENT_IOC_ENABLE:
func = _perf_event_enable;
@@ -6314,7 +6350,7 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon
if (IS_ERR(prog))
return PTR_ERR(prog);
- err = perf_event_set_bpf_prog(event, prog, 0);
+ err = __perf_event_set_bpf_prog(event, prog, 0);
if (err) {
bpf_prog_put(prog);
return err;
@@ -6633,9 +6669,22 @@ void ring_buffer_put(struct perf_buffer *rb)
call_rcu(&rb->rcu_head, rb_free_rcu);
}
+typedef void (*mapped_f)(struct perf_event *event, struct mm_struct *mm);
+
+/*
+ * Fetch the PMU callback named @func (event_mapped or event_unmapped) for
+ * @event. event->pmu is sampled with READ_ONCE() inside an RCU guard; the
+ * result is NULL when event->pmu is NULL or the PMU does not provide the
+ * callback. @event is parenthesized so any pointer expression may be passed.
+ */
+#define get_mapped(event, func)			\
+({	struct pmu *pmu;			\
+	mapped_f f = NULL;			\
+	guard(rcu)();				\
+	pmu = READ_ONCE((event)->pmu);		\
+	if (pmu)				\
+		f = pmu->func;			\
+	f;					\
+})
+
static void perf_mmap_open(struct vm_area_struct *vma)
{
struct perf_event *event = vma->vm_file->private_data;
+ mapped_f mapped = get_mapped(event, event_mapped);
atomic_inc(&event->mmap_count);
atomic_inc(&event->rb->mmap_count);
@@ -6643,8 +6692,8 @@ static void perf_mmap_open(struct vm_area_struct *vma)
if (vma->vm_pgoff)
atomic_inc(&event->rb->aux_mmap_count);
- if (event->pmu->event_mapped)
- event->pmu->event_mapped(event, vma->vm_mm);
+ if (mapped)
+ mapped(event, vma->vm_mm);
}
static void perf_pmu_output_stop(struct perf_event *event);
@@ -6660,14 +6709,16 @@ static void perf_pmu_output_stop(struct perf_event *event);
static void perf_mmap_close(struct vm_area_struct *vma)
{
struct perf_event *event = vma->vm_file->private_data;
+ mapped_f unmapped = get_mapped(event, event_unmapped);
struct perf_buffer *rb = ring_buffer_get(event);
struct user_struct *mmap_user = rb->mmap_user;
int mmap_locked = rb->mmap_locked;
unsigned long size = perf_data_size(rb);
bool detach_rest = false;
- if (event->pmu->event_unmapped)
- event->pmu->event_unmapped(event, vma->vm_mm);
+ /* FIXME: unresolved interaction vs perf_pmu_unregister() */
+ if (unmapped)
+ unmapped(event, vma->vm_mm);
/*
* The AUX buffer is strictly a sub-buffer, serialize using aux_mutex
@@ -6860,6 +6911,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
unsigned long nr_pages;
long user_extra = 0, extra = 0;
int ret, flags = 0;
+ mapped_f mapped;
/*
* Don't allow mmap() of inherited per-task counters. This would
@@ -6890,6 +6942,16 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
mutex_lock(&event->mmap_mutex);
ret = -EINVAL;
+ /*
+ * This relies on __pmu_detach_event() taking mmap_mutex after marking
+ * the event REVOKED. Either we observe the state, or __pmu_detach_event()
+ * will detach the rb created here.
+ */
+ if (event->state <= PERF_EVENT_STATE_REVOKED) {
+ ret = -ENODEV;
+ goto unlock;
+ }
+
if (vma->vm_pgoff == 0) {
nr_pages -= 1;
@@ -7068,8 +7130,9 @@ aux_unlock:
if (!ret)
ret = map_range(rb, vma);
- if (!ret && event->pmu->event_mapped)
- event->pmu->event_mapped(event, vma->vm_mm);
+ mapped = get_mapped(event, event_mapped);
+ /* Only notify the PMU when the mapping was actually established. */
+ if (!ret && mapped)
+ mapped(event, vma->vm_mm);
return ret;
}
@@ -7080,6 +7143,9 @@ static int perf_fasync(int fd, struct file *filp, int on)
struct perf_event *event = filp->private_data;
int retval;
+ if (event->state <= PERF_EVENT_STATE_REVOKED)
+ return -ENODEV;
+
inode_lock(inode);
retval = fasync_helper(fd, filp, on, &event->fasync);
inode_unlock(inode);
@@ -7231,12 +7297,6 @@ static void perf_pending_task(struct callback_head *head)
int rctx;
/*
- * All accesses to the event must belong to the same implicit RCU read-side
- * critical section as the ->pending_work reset. See comment in
- * perf_pending_task_sync().
- */
- rcu_read_lock();
- /*
* If we 'fail' here, that's OK, it means recursion is already disabled
* and we won't recurse 'further'.
*/
@@ -7246,9 +7306,8 @@ static void perf_pending_task(struct callback_head *head)
event->pending_work = 0;
perf_sigtrap(event);
local_dec(&event->ctx->nr_no_switch_fast);
- rcuwait_wake_up(&event->pending_work_wait);
}
- rcu_read_unlock();
+ put_event(event);
if (rctx >= 0)
perf_swevent_put_recursion_context(rctx);
@@ -9966,7 +10025,7 @@ void perf_event_text_poke(const void *addr, const void *old_bytes,
void perf_event_itrace_started(struct perf_event *event)
{
- event->attach_state |= PERF_ATTACH_ITRACE;
+ WRITE_ONCE(event->attach_state, event->attach_state | PERF_ATTACH_ITRACE);
}
static void perf_log_itrace_start(struct perf_event *event)
@@ -10049,14 +10108,13 @@ __perf_event_account_interrupt(struct perf_event *event, int throttle)
hwc->interrupts = 1;
} else {
hwc->interrupts++;
- if (unlikely(throttle &&
- hwc->interrupts > max_samples_per_tick)) {
- __this_cpu_inc(perf_throttled_count);
- tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
- hwc->interrupts = MAX_INTERRUPTS;
- perf_log_throttle(event, 0);
- ret = 1;
- }
+ }
+
+ if (unlikely(throttle && hwc->interrupts >= max_samples_per_tick)) {
+ __this_cpu_inc(perf_throttled_count);
+ tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
+ perf_event_throttle_group(event);
+ ret = 1;
}
if (event->attr.freq) {
@@ -10243,6 +10301,7 @@ static int __perf_event_overflow(struct perf_event *event,
!task_work_add(current, &event->pending_task, notify_mode)) {
event->pending_work = pending_id;
local_inc(&event->ctx->nr_no_switch_fast);
+ WARN_ON_ONCE(!atomic_long_inc_not_zero(&event->refcount));
event->pending_addr = 0;
if (valid_sample && (data->sample_flags & PERF_SAMPLE_ADDR))
@@ -11088,11 +11147,15 @@ static inline bool perf_event_is_tracing(struct perf_event *event)
return false;
}
-int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog,
- u64 bpf_cookie)
+static int __perf_event_set_bpf_prog(struct perf_event *event,
+ struct bpf_prog *prog,
+ u64 bpf_cookie)
{
bool is_kprobe, is_uprobe, is_tracepoint, is_syscall_tp;
+ if (event->state <= PERF_EVENT_STATE_REVOKED)
+ return -ENODEV;
+
if (!perf_event_is_tracing(event))
return perf_event_set_bpf_handler(event, prog, bpf_cookie);
@@ -11127,6 +11190,20 @@ int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog,
return perf_event_attach_bpf_prog(event, prog, bpf_cookie);
}
+/*
+ * Locked entry point: run __perf_event_set_bpf_prog() with the event's
+ * context locked and hand back its result.
+ */
+int perf_event_set_bpf_prog(struct perf_event *event,
+			    struct bpf_prog *prog,
+			    u64 bpf_cookie)
+{
+	struct perf_event_context *locked_ctx = perf_event_ctx_lock(event);
+	int err = __perf_event_set_bpf_prog(event, prog, bpf_cookie);
+
+	perf_event_ctx_unlock(event, locked_ctx);
+	return err;
+}
+
void perf_event_free_bpf_prog(struct perf_event *event)
{
if (!event->prog)
@@ -11149,7 +11226,15 @@ static void perf_event_free_filter(struct perf_event *event)
{
}
-int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog,
+/*
+ * Stub variant: always fails with -ENOENT.
+ * NOTE(review): presumably the fallback for a configuration without event
+ * tracing support; the selecting #ifdef is outside this hunk, confirm there.
+ */
+static int __perf_event_set_bpf_prog(struct perf_event *event,
+ struct bpf_prog *prog,
+ u64 bpf_cookie)
+{
+ return -ENOENT;
+}
+
+int perf_event_set_bpf_prog(struct perf_event *event,
+ struct bpf_prog *prog,
u64 bpf_cookie)
{
return -ENOENT;
@@ -12254,6 +12339,9 @@ int perf_pmu_register(struct pmu *_pmu, const char *name, int type)
if (!pmu->event_idx)
pmu->event_idx = perf_event_idx_default;
+ INIT_LIST_HEAD(&pmu->events);
+ spin_lock_init(&pmu->events_lock);
+
/*
* Now that the PMU is complete, make it visible to perf_try_init_event().
*/
@@ -12267,21 +12355,143 @@ int perf_pmu_register(struct pmu *_pmu, const char *name, int type)
}
EXPORT_SYMBOL_GPL(perf_pmu_register);
-void perf_pmu_unregister(struct pmu *pmu)
+/*
+ * Forcefully detach @event from @pmu.
+ *
+ * The event is first removed from @ctx with the revoke flag set, then the
+ * resources that depend on event->pmu are released (BPF program, address
+ * filters, the ->destroy() callback, the pmu_ctx reference, exclusive-event
+ * accounting and the module reference). Finally event->pmu is cleared.
+ *
+ * The visible caller, pmu_detach_event(), invokes this with the context
+ * obtained from perf_event_ctx_lock().
+ */
+static void __pmu_detach_event(struct pmu *pmu, struct perf_event *event,
+ struct perf_event_context *ctx)
+{
+ /*
+ * De-schedule the event and mark it REVOKED.
+ */
+ perf_event_exit_event(event, ctx, true);
+
+ /*
+ * All _free_event() bits that rely on event->pmu:
+ *
+ * Notably, perf_mmap() relies on the ordering here.
+ */
+ scoped_guard (mutex, &event->mmap_mutex) {
+ WARN_ON_ONCE(pmu->event_unmapped);
+ /*
+ * Mostly an empty lock sequence, such that perf_mmap(), which
+ * relies on mmap_mutex, is sure to observe the state change.
+ */
+ }
+
+ perf_event_free_bpf_prog(event);
+ perf_free_addr_filters(event);
+
+ /* Run the destructor once and clear it so it is not invoked again. */
+ if (event->destroy) {
+ event->destroy(event);
+ event->destroy = NULL;
+ }
+
+ /* Likewise drop the pmu_ctx reference exactly once. */
+ if (event->pmu_ctx) {
+ put_pmu_ctx(event->pmu_ctx);
+ event->pmu_ctx = NULL;
+ }
+
+ exclusive_event_destroy(event);
+ module_put(pmu->module);
+
+ event->pmu = NULL; /* force fault instead of UAF */
+}
+
+/*
+ * Detach @event from @pmu with the event's context locked, then unlink the
+ * event from the PMU's event list under pmu->events_lock.
+ */
+static void pmu_detach_event(struct pmu *pmu, struct perf_event *event)
+{
+ struct perf_event_context *ctx;
+
+ ctx = perf_event_ctx_lock(event);
+ __pmu_detach_event(pmu, event, ctx);
+ perf_event_ctx_unlock(event, ctx);
+
+ /* __pmu_detach_event() cleared event->pmu, so unlink the event here. */
+ scoped_guard (spinlock, &pmu->events_lock)
+ list_del(&event->pmu_list);
+}
+
+/*
+ * Walk pmu->events under pmu->events_lock and return the first event whose
+ * refcount could be raised from a non-zero value; the caller owns that extra
+ * reference. Returns NULL when no such event is on the list.
+ */
+static struct perf_event *pmu_get_event(struct pmu *pmu)
+{
+ struct perf_event *event;
+
+ guard(spinlock)(&pmu->events_lock);
+ list_for_each_entry(event, &pmu->events, pmu_list) {
+ /* Events whose refcount already dropped to zero are skipped. */
+ if (atomic_long_inc_not_zero(&event->refcount))
+ return event;
+ }
+
+ return NULL;
+}
+
+/* Report, under pmu->events_lock, whether @pmu has no events on its list. */
+static bool pmu_empty(struct pmu *pmu)
+{
+ guard(spinlock)(&pmu->events_lock);
+ return list_empty(&pmu->events);
+}
+
+/*
+ * Detach every event still attached to @pmu, then block until the PMU's
+ * event list has drained completely.
+ */
+static void pmu_detach_events(struct pmu *pmu)
+{
+	struct perf_event *victim;
+
+	/* Keep taking referenced events off the PMU until none is returned. */
+	while ((victim = pmu_get_event(pmu)) != NULL) {
+		pmu_detach_event(pmu, victim);
+		put_event(victim);
+	}
+
+	/*
+	 * wait for pending _free_event()s
+	 */
+	wait_var_event(pmu, pmu_empty(pmu));
+}
+
+/*
+ * Unregister @pmu and detach all events still using it.
+ *
+ * Returns 0 on success, -EINVAL when the idr_cmpxchg() that swaps @pmu for
+ * NULL in pmu_idr fails, and -EBUSY when the PMU provides event_unmapped()
+ * and still has events; in the -EBUSY case the idr slot and the list entry
+ * are put back before returning.
+ */
+int perf_pmu_unregister(struct pmu *pmu)
{
scoped_guard (mutex, &pmus_lock) {
+ if (!idr_cmpxchg(&pmu_idr, pmu->type, pmu, NULL))
+ return -EINVAL;
+
list_del_rcu(&pmu->entry);
- idr_remove(&pmu_idr, pmu->type);
}
/*
* We dereference the pmu list under both SRCU and regular RCU, so
* synchronize against both of those.
+ *
+ * Notably, the entirety of event creation, from perf_init_event()
+ * (which will now fail, because of the above) until
+ * perf_install_in_context() should be under SRCU such that
+ * this synchronizes against event creation. This avoids trying to
+ * detach events that are not fully formed.
*/
synchronize_srcu(&pmus_srcu);
synchronize_rcu();
+ if (pmu->event_unmapped && !pmu_empty(pmu)) {
+ /*
+ * Can't force remove events when pmu::event_unmapped()
+ * is used in perf_mmap_close().
+ */
+ guard(mutex)(&pmus_lock);
+ idr_cmpxchg(&pmu_idr, pmu->type, NULL, pmu);
+ list_add_rcu(&pmu->entry, &pmus);
+ return -EBUSY;
+ }
+
+ scoped_guard (mutex, &pmus_lock)
+ idr_remove(&pmu_idr, pmu->type);
+
+ /*
+ * PMU is removed from the pmus list, so no new events will
+ * be created, now take care of the existing ones.
+ */
+ pmu_detach_events(pmu);
+
+ /*
+ * PMU is unused, make it go away.
+ */
perf_pmu_free(pmu);
+ return 0;
}
EXPORT_SYMBOL_GPL(perf_pmu_unregister);
@@ -12375,7 +12585,7 @@ static struct pmu *perf_init_event(struct perf_event *event)
struct pmu *pmu;
int type, ret;
- guard(srcu)(&pmus_srcu);
+ guard(srcu)(&pmus_srcu); /* pmu idr/list access */
/*
* Save original type before calling pmu->event_init() since certain
@@ -12599,13 +12809,13 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
INIT_LIST_HEAD(&event->active_entry);
INIT_LIST_HEAD(&event->addr_filters.list);
INIT_HLIST_NODE(&event->hlist_entry);
+ INIT_LIST_HEAD(&event->pmu_list);
init_waitqueue_head(&event->waitq);
init_irq_work(&event->pending_irq, perf_pending_irq);
event->pending_disable_irq = IRQ_WORK_INIT_HARD(perf_pending_disable);
init_task_work(&event->pending_task, perf_pending_task);
- rcuwait_init(&event->pending_work_wait);
mutex_init(&event->mmap_mutex);
raw_spin_lock_init(&event->addr_filters.lock);
@@ -12671,7 +12881,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
hwc = &event->hw;
hwc->sample_period = attr->sample_period;
- if (attr->freq && attr->sample_freq)
+ if (is_event_in_freq_mode(event))
hwc->sample_period = 1;
hwc->last_period = hwc->sample_period;
@@ -12778,6 +12988,13 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
/* symmetric to unaccount_event() in _free_event() */
account_event(event);
+ /*
+ * Event creation should be under SRCU, see perf_pmu_unregister().
+ */
+ lockdep_assert_held(&pmus_srcu);
+ scoped_guard (spinlock, &pmu->events_lock)
+ list_add(&event->pmu_list, &pmu->events);
+
return_ptr(event);
}
@@ -12977,6 +13194,9 @@ set:
goto unlock;
if (output_event) {
+ if (output_event->state <= PERF_EVENT_STATE_REVOKED)
+ goto unlock;
+
/* get the rb we want to redirect to */
rb = ring_buffer_get(output_event);
if (!rb)
@@ -13158,6 +13378,11 @@ SYSCALL_DEFINE5(perf_event_open,
if (event_fd < 0)
return event_fd;
+ /*
+ * Event creation should be under SRCU, see perf_pmu_unregister().
+ */
+ guard(srcu)(&pmus_srcu);
+
CLASS(fd, group)(group_fd); // group_fd == -1 => empty
if (group_fd != -1) {
if (!is_perf_file(group)) {
@@ -13165,6 +13390,10 @@ SYSCALL_DEFINE5(perf_event_open,
goto err_fd;
}
group_leader = fd_file(group)->private_data;
+ if (group_leader->state <= PERF_EVENT_STATE_REVOKED) {
+ err = -ENODEV;
+ goto err_fd;
+ }
if (flags & PERF_FLAG_FD_OUTPUT)
output_event = group_leader;
if (flags & PERF_FLAG_FD_NO_GROUP)
@@ -13461,7 +13690,7 @@ err_cred:
if (task)
up_read(&task->signal->exec_update_lock);
err_alloc:
- free_event(event);
+ put_event(event);
err_task:
if (task)
put_task_struct(task);
@@ -13498,6 +13727,11 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
if (attr->aux_output || attr->aux_action)
return ERR_PTR(-EINVAL);
+ /*
+ * Event creation should be under SRCU, see perf_pmu_unregister().
+ */
+ guard(srcu)(&pmus_srcu);
+
event = perf_event_alloc(attr, cpu, task, NULL, NULL,
overflow_handler, context, -1);
if (IS_ERR(event)) {
@@ -13569,7 +13803,7 @@ err_unlock:
perf_unpin_context(ctx);
put_ctx(ctx);
err_alloc:
- free_event(event);
+ put_event(event);
err:
return ERR_PTR(err);
}
@@ -13709,10 +13943,12 @@ static void sync_child_event(struct perf_event *child_event)
}
static void
-perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx)
+perf_event_exit_event(struct perf_event *event,
+ struct perf_event_context *ctx, bool revoke)
{
struct perf_event *parent_event = event->parent;
- unsigned long detach_flags = 0;
+ unsigned long detach_flags = DETACH_EXIT;
+ unsigned int attach_state;
if (parent_event) {
/*
@@ -13727,28 +13963,38 @@ perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx)
* Do destroy all inherited groups, we don't care about those
* and being thorough is better.
*/
- detach_flags = DETACH_GROUP | DETACH_CHILD;
+ detach_flags |= DETACH_GROUP | DETACH_CHILD;
mutex_lock(&parent_event->child_mutex);
+ /* PERF_ATTACH_ITRACE might be set concurrently */
+ attach_state = READ_ONCE(event->attach_state);
}
- perf_remove_from_context(event, detach_flags);
-
- raw_spin_lock_irq(&ctx->lock);
- if (event->state > PERF_EVENT_STATE_EXIT)
- perf_event_set_state(event, PERF_EVENT_STATE_EXIT);
- raw_spin_unlock_irq(&ctx->lock);
+ if (revoke)
+ detach_flags |= DETACH_GROUP | DETACH_REVOKE;
+ perf_remove_from_context(event, detach_flags);
/*
* Child events can be freed.
*/
if (parent_event) {
mutex_unlock(&parent_event->child_mutex);
+
/*
- * Kick perf_poll() for is_event_hup();
+ * Match the refcount initialization. Make sure it doesn't happen
+ * twice if pmu_detach_event() calls it on an already exited task.
*/
- perf_event_wakeup(parent_event);
- free_event(event);
- put_event(parent_event);
+ if (attach_state & PERF_ATTACH_CHILD) {
+ /*
+ * Kick perf_poll() for is_event_hup();
+ */
+ perf_event_wakeup(parent_event);
+ /*
+ * pmu_detach_event() will have an extra refcount.
+ * perf_pending_task() might have one too.
+ */
+ put_event(event);
+ }
+
return;
}
@@ -13758,15 +14004,13 @@ perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx)
perf_event_wakeup(event);
}
-static void perf_event_exit_task_context(struct task_struct *child)
+static void perf_event_exit_task_context(struct task_struct *task, bool exit)
{
- struct perf_event_context *child_ctx, *clone_ctx = NULL;
+ struct perf_event_context *ctx, *clone_ctx = NULL;
struct perf_event *child_event, *next;
- WARN_ON_ONCE(child != current);
-
- child_ctx = perf_pin_task_context(child);
- if (!child_ctx)
+ ctx = perf_pin_task_context(task);
+ if (!ctx)
return;
/*
@@ -13779,27 +14023,28 @@ static void perf_event_exit_task_context(struct task_struct *child)
* without ctx::mutex (it cannot because of the move_group double mutex
* lock thing). See the comments in perf_install_in_context().
*/
- mutex_lock(&child_ctx->mutex);
+ mutex_lock(&ctx->mutex);
/*
* In a single ctx::lock section, de-schedule the events and detach the
* context from the task such that we cannot ever get it scheduled back
* in.
*/
- raw_spin_lock_irq(&child_ctx->lock);
- task_ctx_sched_out(child_ctx, NULL, EVENT_ALL);
+ raw_spin_lock_irq(&ctx->lock);
+ if (exit)
+ task_ctx_sched_out(ctx, NULL, EVENT_ALL);
/*
* Now that the context is inactive, destroy the task <-> ctx relation
* and mark the context dead.
*/
- RCU_INIT_POINTER(child->perf_event_ctxp, NULL);
- put_ctx(child_ctx); /* cannot be last */
- WRITE_ONCE(child_ctx->task, TASK_TOMBSTONE);
- put_task_struct(current); /* cannot be last */
+ RCU_INIT_POINTER(task->perf_event_ctxp, NULL);
+ put_ctx(ctx); /* cannot be last */
+ WRITE_ONCE(ctx->task, TASK_TOMBSTONE);
+ put_task_struct(task); /* cannot be last */
- clone_ctx = unclone_ctx(child_ctx);
- raw_spin_unlock_irq(&child_ctx->lock);
+ clone_ctx = unclone_ctx(ctx);
+ raw_spin_unlock_irq(&ctx->lock);
if (clone_ctx)
put_ctx(clone_ctx);
@@ -13809,28 +14054,48 @@ static void perf_event_exit_task_context(struct task_struct *child)
* won't get any samples after PERF_RECORD_EXIT. We can however still
* get a few PERF_RECORD_READ events.
*/
- perf_event_task(child, child_ctx, 0);
+ if (exit)
+ perf_event_task(task, ctx, 0);
- list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry)
- perf_event_exit_event(child_event, child_ctx);
+ list_for_each_entry_safe(child_event, next, &ctx->event_list, event_entry)
+ perf_event_exit_event(child_event, ctx, false);
- mutex_unlock(&child_ctx->mutex);
+ mutex_unlock(&ctx->mutex);
- put_ctx(child_ctx);
+ if (!exit) {
+ /*
+ * perf_event_release_kernel() could still have a reference on
+ * this context. In that case we must wait for these events to
+ * have been freed (in particular all their references to this
+ * task must've been dropped).
+ *
+ * Without this copy_process() will unconditionally free this
+ * task (irrespective of its reference count) and
+ * _free_event()'s put_task_struct(event->hw.target) will be a
+ * use-after-free.
+ *
+ * Wait for all events to drop their context reference.
+ */
+ wait_var_event(&ctx->refcount,
+ refcount_read(&ctx->refcount) == 1);
+ }
+ put_ctx(ctx);
}
/*
- * When a child task exits, feed back event values to parent events.
+ * When a task exits, feed back event values to parent events.
*
* Can be called with exec_update_lock held when called from
* setup_new_exec().
*/
-void perf_event_exit_task(struct task_struct *child)
+void perf_event_exit_task(struct task_struct *task)
{
struct perf_event *event, *tmp;
- mutex_lock(&child->perf_event_mutex);
- list_for_each_entry_safe(event, tmp, &child->perf_event_list,
+ WARN_ON_ONCE(task != current);
+
+ mutex_lock(&task->perf_event_mutex);
+ list_for_each_entry_safe(event, tmp, &task->perf_event_list,
owner_entry) {
list_del_init(&event->owner_entry);
@@ -13841,44 +14106,23 @@ void perf_event_exit_task(struct task_struct *child)
*/
smp_store_release(&event->owner, NULL);
}
- mutex_unlock(&child->perf_event_mutex);
+ mutex_unlock(&task->perf_event_mutex);
- perf_event_exit_task_context(child);
+ perf_event_exit_task_context(task, true);
/*
* The perf_event_exit_task_context calls perf_event_task
- * with child's task_ctx, which generates EXIT events for
- * child contexts and sets child->perf_event_ctxp[] to NULL.
+ * with task's task_ctx, which generates EXIT events for
+ * task contexts and sets task->perf_event_ctxp to NULL.
* At this point we need to send EXIT events to cpu contexts.
*/
- perf_event_task(child, NULL, 0);
+ perf_event_task(task, NULL, 0);
/*
* Detach the perf_ctx_data for the system-wide event.
*/
guard(percpu_read)(&global_ctx_data_rwsem);
- detach_task_ctx_data(child);
-}
-
-static void perf_free_event(struct perf_event *event,
- struct perf_event_context *ctx)
-{
- struct perf_event *parent = event->parent;
-
- if (WARN_ON_ONCE(!parent))
- return;
-
- mutex_lock(&parent->child_mutex);
- list_del_init(&event->child_list);
- mutex_unlock(&parent->child_mutex);
-
- put_event(parent);
-
- raw_spin_lock_irq(&ctx->lock);
- perf_group_detach(event);
- list_del_event(event, ctx);
- raw_spin_unlock_irq(&ctx->lock);
- free_event(event);
+ detach_task_ctx_data(task);
}
/*
@@ -13890,48 +14134,7 @@ static void perf_free_event(struct perf_event *event,
*/
void perf_event_free_task(struct task_struct *task)
{
- struct perf_event_context *ctx;
- struct perf_event *event, *tmp;
-
- ctx = rcu_access_pointer(task->perf_event_ctxp);
- if (!ctx)
- return;
-
- mutex_lock(&ctx->mutex);
- raw_spin_lock_irq(&ctx->lock);
- /*
- * Destroy the task <-> ctx relation and mark the context dead.
- *
- * This is important because even though the task hasn't been
- * exposed yet the context has been (through child_list).
- */
- RCU_INIT_POINTER(task->perf_event_ctxp, NULL);
- WRITE_ONCE(ctx->task, TASK_TOMBSTONE);
- put_task_struct(task); /* cannot be last */
- raw_spin_unlock_irq(&ctx->lock);
-
-
- list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry)
- perf_free_event(event, ctx);
-
- mutex_unlock(&ctx->mutex);
-
- /*
- * perf_event_release_kernel() could've stolen some of our
- * child events and still have them on its free_list. In that
- * case we must wait for these events to have been freed (in
- * particular all their references to this task must've been
- * dropped).
- *
- * Without this copy_process() will unconditionally free this
- * task (irrespective of its reference count) and
- * _free_event()'s put_task_struct(event->hw.target) will be a
- * use-after-free.
- *
- * Wait for all events to drop their context reference.
- */
- wait_var_event(&ctx->refcount, refcount_read(&ctx->refcount) == 1);
- put_ctx(ctx); /* must be last */
+ perf_event_exit_task_context(task, false);
}
void perf_event_delayed_put(struct task_struct *task)
@@ -14008,6 +14211,14 @@ inherit_event(struct perf_event *parent_event,
if (parent_event->parent)
parent_event = parent_event->parent;
+ if (parent_event->state <= PERF_EVENT_STATE_REVOKED)
+ return NULL;
+
+ /*
+ * Event creation should be under SRCU, see perf_pmu_unregister().
+ */
+ guard(srcu)(&pmus_srcu);
+
child_event = perf_event_alloc(&parent_event->attr,
parent_event->cpu,
child,
@@ -14016,6 +14227,9 @@ inherit_event(struct perf_event *parent_event,
if (IS_ERR(child_event))
return child_event;
+ get_ctx(child_ctx);
+ child_event->ctx = child_ctx;
+
pmu_ctx = find_get_pmu_context(child_event->pmu, child_ctx, child_event);
if (IS_ERR(pmu_ctx)) {
free_event(child_event);
@@ -14037,8 +14251,6 @@ inherit_event(struct perf_event *parent_event,
return NULL;
}
- get_ctx(child_ctx);
-
/*
* Make the child state follow the state of the parent event,
* not its attr.disabled bit. We hold the parent's mutex,
@@ -14059,7 +14271,6 @@ inherit_event(struct perf_event *parent_event,
local64_set(&hwc->period_left, sample_period);
}
- child_event->ctx = child_ctx;
child_event->overflow_handler = parent_event->overflow_handler;
child_event->overflow_handler_context
= parent_event->overflow_handler_context;
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 5130b119d0ae..d2aef87c7e9f 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -679,7 +679,15 @@ int rb_alloc_aux(struct perf_buffer *rb, struct perf_event *event,
{
bool overwrite = !(flags & RING_BUFFER_WRITABLE);
int node = (event->cpu == -1) ? -1 : cpu_to_node(event->cpu);
- int ret = -ENOMEM, max_order;
+ bool use_contiguous_pages = event->pmu->capabilities & (
+ PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_AUX_PREFER_LARGE);
+ /*
+ * Initialize max_order to 0 for page allocation. This allocates single
+ * pages to minimize memory fragmentation. This is overridden if the
+ * PMU needs or prefers contiguous pages (use_contiguous_pages = true).
+ */
+ int max_order = 0;
+ int ret = -ENOMEM;
if (!has_aux(event))
return -EOPNOTSUPP;
@@ -689,8 +697,8 @@ int rb_alloc_aux(struct perf_buffer *rb, struct perf_event *event,
if (!overwrite) {
/*
- * Watermark defaults to half the buffer, and so does the
- * max_order, to aid PMU drivers in double buffering.
+ * Watermark defaults to half the buffer, to aid PMU drivers
+ * in double buffering.
*/
if (!watermark)
watermark = min_t(unsigned long,
@@ -698,16 +706,19 @@ int rb_alloc_aux(struct perf_buffer *rb, struct perf_event *event,
(unsigned long)nr_pages << (PAGE_SHIFT - 1));
/*
- * Use aux_watermark as the basis for chunking to
- * help PMU drivers honor the watermark.
+ * If using contiguous pages, use aux_watermark as the basis
+ * for chunking to help PMU drivers honor the watermark.
*/
- max_order = get_order(watermark);
+ if (use_contiguous_pages)
+ max_order = get_order(watermark);
} else {
/*
- * We need to start with the max_order that fits in nr_pages,
- * not the other way around, hence ilog2() and not get_order.
+ * If using contiguous pages, we need to start with the
+ * max_order that fits in nr_pages, not the other way around,
+ * hence ilog2() and not get_order.
*/
- max_order = ilog2(nr_pages);
+ if (use_contiguous_pages)
+ max_order = ilog2(nr_pages);
watermark = 0;
}
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 615b4e6d22c7..4c965ba77f9f 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -29,6 +29,7 @@
#include <linux/workqueue.h>
#include <linux/srcu.h>
#include <linux/oom.h> /* check_stable_address_space */
+#include <linux/pagewalk.h>
#include <linux/uprobes.h>
@@ -152,91 +153,6 @@ static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr)
}
/**
- * __replace_page - replace page in vma by new page.
- * based on replace_page in mm/ksm.c
- *
- * @vma: vma that holds the pte pointing to page
- * @addr: address the old @page is mapped at
- * @old_page: the page we are replacing by new_page
- * @new_page: the modified page we replace page by
- *
- * If @new_page is NULL, only unmap @old_page.
- *
- * Returns 0 on success, negative error code otherwise.
- */
-static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
- struct page *old_page, struct page *new_page)
-{
- struct folio *old_folio = page_folio(old_page);
- struct folio *new_folio;
- struct mm_struct *mm = vma->vm_mm;
- DEFINE_FOLIO_VMA_WALK(pvmw, old_folio, vma, addr, 0);
- int err;
- struct mmu_notifier_range range;
- pte_t pte;
-
- mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, addr,
- addr + PAGE_SIZE);
-
- if (new_page) {
- new_folio = page_folio(new_page);
- err = mem_cgroup_charge(new_folio, vma->vm_mm, GFP_KERNEL);
- if (err)
- return err;
- }
-
- /* For folio_free_swap() below */
- folio_lock(old_folio);
-
- mmu_notifier_invalidate_range_start(&range);
- err = -EAGAIN;
- if (!page_vma_mapped_walk(&pvmw))
- goto unlock;
- VM_BUG_ON_PAGE(addr != pvmw.address, old_page);
- pte = ptep_get(pvmw.pte);
-
- /*
- * Handle PFN swap PTES, such as device-exclusive ones, that actually
- * map pages: simply trigger GUP again to fix it up.
- */
- if (unlikely(!pte_present(pte))) {
- page_vma_mapped_walk_done(&pvmw);
- goto unlock;
- }
-
- if (new_page) {
- folio_get(new_folio);
- folio_add_new_anon_rmap(new_folio, vma, addr, RMAP_EXCLUSIVE);
- folio_add_lru_vma(new_folio, vma);
- } else
- /* no new page, just dec_mm_counter for old_page */
- dec_mm_counter(mm, MM_ANONPAGES);
-
- if (!folio_test_anon(old_folio)) {
- dec_mm_counter(mm, mm_counter_file(old_folio));
- inc_mm_counter(mm, MM_ANONPAGES);
- }
-
- flush_cache_page(vma, addr, pte_pfn(pte));
- ptep_clear_flush(vma, addr, pvmw.pte);
- if (new_page)
- set_pte_at(mm, addr, pvmw.pte,
- mk_pte(new_page, vma->vm_page_prot));
-
- folio_remove_rmap_pte(old_folio, old_page, vma);
- if (!folio_mapped(old_folio))
- folio_free_swap(old_folio);
- page_vma_mapped_walk_done(&pvmw);
- folio_put(old_folio);
-
- err = 0;
- unlock:
- mmu_notifier_invalidate_range_end(&range);
- folio_unlock(old_folio);
- return err;
-}
-
-/**
* is_swbp_insn - check if instruction is breakpoint instruction.
* @insn: instruction to be checked.
* Default implementation of is_swbp_insn
@@ -463,6 +379,95 @@ static int update_ref_ctr(struct uprobe *uprobe, struct mm_struct *mm,
return ret;
}
+static bool orig_page_is_identical(struct vm_area_struct *vma,
+ unsigned long vaddr, struct page *page, bool *pmd_mappable)
+{
+ const pgoff_t index = vaddr_to_offset(vma, vaddr) >> PAGE_SHIFT;
+ struct folio *orig_folio = filemap_get_folio(vma->vm_file->f_mapping,
+ index);
+ struct page *orig_page;
+ bool identical;
+
+ if (IS_ERR(orig_folio))
+ return false;
+ orig_page = folio_file_page(orig_folio, index);
+
+ *pmd_mappable = folio_test_pmd_mappable(orig_folio);
+ identical = folio_test_uptodate(orig_folio) &&
+ pages_identical(page, orig_page);
+ folio_put(orig_folio);
+ return identical;
+}
+
+static int __uprobe_write_opcode(struct vm_area_struct *vma,
+ struct folio_walk *fw, struct folio *folio,
+ unsigned long opcode_vaddr, uprobe_opcode_t opcode)
+{
+ const unsigned long vaddr = opcode_vaddr & PAGE_MASK;
+ const bool is_register = !!is_swbp_insn(&opcode);
+ bool pmd_mappable;
+
+ /* For now, we'll only handle PTE-mapped folios. */
+ if (fw->level != FW_LEVEL_PTE)
+ return -EFAULT;
+
+ /*
+ * See can_follow_write_pte(): we'd actually prefer a writable PTE here,
+ * but the VMA might not be writable.
+ */
+ if (!pte_write(fw->pte)) {
+ if (!PageAnonExclusive(fw->page))
+ return -EFAULT;
+ if (unlikely(userfaultfd_pte_wp(vma, fw->pte)))
+ return -EFAULT;
+ /* SOFTDIRTY is handled via pte_mkdirty() below. */
+ }
+
+ /*
+ * We'll temporarily unmap the page and flush the TLB, such that we can
+ * modify the page atomically.
+ */
+ flush_cache_page(vma, vaddr, pte_pfn(fw->pte));
+ fw->pte = ptep_clear_flush(vma, vaddr, fw->ptep);
+ copy_to_page(fw->page, opcode_vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
+
+ /*
+ * When unregistering, we may only zap a PTE if uffd is disabled and
+ * there are no unexpected folio references ...
+ */
+ if (is_register || userfaultfd_missing(vma) ||
+ (folio_ref_count(folio) != folio_mapcount(folio) + 1 +
+ folio_test_swapcache(folio) * folio_nr_pages(folio)))
+ goto remap;
+
+ /*
+ * ... and the mapped page is identical to the original page that
+ * would get faulted in on next access.
+ */
+ if (!orig_page_is_identical(vma, vaddr, fw->page, &pmd_mappable))
+ goto remap;
+
+ dec_mm_counter(vma->vm_mm, MM_ANONPAGES);
+ folio_remove_rmap_pte(folio, fw->page, vma);
+ if (!folio_mapped(folio) && folio_test_swapcache(folio) &&
+ folio_trylock(folio)) {
+ folio_free_swap(folio);
+ folio_unlock(folio);
+ }
+ folio_put(folio);
+
+ return pmd_mappable;
+remap:
+ /*
+ * Make sure that our copy_to_page() changes become visible before the
+ * set_pte_at() write.
+ */
+ smp_wmb();
+ /* We modified the page. Make sure to mark the PTE dirty. */
+ set_pte_at(vma->vm_mm, vaddr, fw->ptep, pte_mkdirty(fw->pte));
+ return 0;
+}
+
/*
* NOTE:
* Expect the breakpoint instruction to be the smallest size instruction for
@@ -474,146 +479,146 @@ static int update_ref_ctr(struct uprobe *uprobe, struct mm_struct *mm,
*
* uprobe_write_opcode - write the opcode at a given virtual address.
* @auprobe: arch specific probepoint information.
- * @mm: the probed process address space.
- * @vaddr: the virtual address to store the opcode.
- * @opcode: opcode to be written at @vaddr.
+ * @vma: the probed virtual memory area.
+ * @opcode_vaddr: the virtual address to store the opcode.
+ * @opcode: opcode to be written at @opcode_vaddr.
*
* Called with mm->mmap_lock held for read or write.
* Return 0 (success) or a negative errno.
*/
-int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm,
- unsigned long vaddr, uprobe_opcode_t opcode)
+int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
+ const unsigned long opcode_vaddr, uprobe_opcode_t opcode)
{
+ const unsigned long vaddr = opcode_vaddr & PAGE_MASK;
+ struct mm_struct *mm = vma->vm_mm;
struct uprobe *uprobe;
- struct page *old_page, *new_page;
- struct vm_area_struct *vma;
int ret, is_register, ref_ctr_updated = 0;
- bool orig_page_huge = false;
unsigned int gup_flags = FOLL_FORCE;
+ struct mmu_notifier_range range;
+ struct folio_walk fw;
+ struct folio *folio;
+ struct page *page;
is_register = is_swbp_insn(&opcode);
uprobe = container_of(auprobe, struct uprobe, arch);
-retry:
+ if (WARN_ON_ONCE(!is_cow_mapping(vma->vm_flags)))
+ return -EINVAL;
+
+ /*
+ * When registering, we have to break COW to get an exclusive anonymous
+ * page that we can safely modify. Use FOLL_WRITE to trigger a write
+ * fault if required. When unregistering, we might be lucky and the
+ * anon page is already gone. So defer write faults until really
+ * required. Use FOLL_SPLIT_PMD, because __uprobe_write_opcode()
+ * cannot deal with PMDs yet.
+ */
if (is_register)
- gup_flags |= FOLL_SPLIT_PMD;
- /* Read the page with vaddr into memory */
- old_page = get_user_page_vma_remote(mm, vaddr, gup_flags, &vma);
- if (IS_ERR(old_page))
- return PTR_ERR(old_page);
+ gup_flags |= FOLL_WRITE | FOLL_SPLIT_PMD;
- ret = verify_opcode(old_page, vaddr, &opcode);
+retry:
+ ret = get_user_pages_remote(mm, vaddr, 1, gup_flags, &page, NULL);
if (ret <= 0)
- goto put_old;
-
- if (is_zero_page(old_page)) {
- ret = -EINVAL;
- goto put_old;
- }
+ goto out;
+ folio = page_folio(page);
- if (WARN(!is_register && PageCompound(old_page),
- "uprobe unregister should never work on compound page\n")) {
- ret = -EINVAL;
- goto put_old;
+ ret = verify_opcode(page, opcode_vaddr, &opcode);
+ if (ret <= 0) {
+ folio_put(folio);
+ goto out;
}
/* We are going to replace instruction, update ref_ctr. */
if (!ref_ctr_updated && uprobe->ref_ctr_offset) {
ret = update_ref_ctr(uprobe, mm, is_register ? 1 : -1);
- if (ret)
- goto put_old;
+ if (ret) {
+ folio_put(folio);
+ goto out;
+ }
ref_ctr_updated = 1;
}
ret = 0;
- if (!is_register && !PageAnon(old_page))
- goto put_old;
-
- ret = anon_vma_prepare(vma);
- if (ret)
- goto put_old;
-
- ret = -ENOMEM;
- new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr);
- if (!new_page)
- goto put_old;
-
- __SetPageUptodate(new_page);
- copy_highpage(new_page, old_page);
- copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
+ if (unlikely(!folio_test_anon(folio))) {
+ VM_WARN_ON_ONCE(is_register);
+ folio_put(folio);
+ goto out;
+ }
if (!is_register) {
- struct page *orig_page;
- pgoff_t index;
-
- VM_BUG_ON_PAGE(!PageAnon(old_page), old_page);
-
- index = vaddr_to_offset(vma, vaddr & PAGE_MASK) >> PAGE_SHIFT;
- orig_page = find_get_page(vma->vm_file->f_inode->i_mapping,
- index);
-
- if (orig_page) {
- if (PageUptodate(orig_page) &&
- pages_identical(new_page, orig_page)) {
- /* let go new_page */
- put_page(new_page);
- new_page = NULL;
-
- if (PageCompound(orig_page))
- orig_page_huge = true;
- }
- put_page(orig_page);
- }
+ /*
+ * In the common case, we'll be able to zap the page when
+ * unregistering. So trigger MMU notifiers now, as we won't
+ * be able to do it under PTL.
+ */
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
+ vaddr, vaddr + PAGE_SIZE);
+ mmu_notifier_invalidate_range_start(&range);
+ }
+
+ ret = -EAGAIN;
+ /* Walk the page tables again, to perform the actual update. */
+ if (folio_walk_start(&fw, vma, vaddr, 0)) {
+ if (fw.page == page)
+ ret = __uprobe_write_opcode(vma, &fw, folio, opcode_vaddr, opcode);
+ folio_walk_end(&fw, vma);
}
- ret = __replace_page(vma, vaddr & PAGE_MASK, old_page, new_page);
- if (new_page)
- put_page(new_page);
-put_old:
- put_page(old_page);
+ if (!is_register)
+ mmu_notifier_invalidate_range_end(&range);
- if (unlikely(ret == -EAGAIN))
+ folio_put(folio);
+ switch (ret) {
+ case -EFAULT:
+ gup_flags |= FOLL_WRITE | FOLL_SPLIT_PMD;
+ fallthrough;
+ case -EAGAIN:
goto retry;
+ default:
+ break;
+ }
+out:
/* Revert back reference counter if instruction update failed. */
- if (ret && is_register && ref_ctr_updated)
+ if (ret < 0 && is_register && ref_ctr_updated)
update_ref_ctr(uprobe, mm, -1);
/* try collapse pmd for compound page */
- if (!ret && orig_page_huge)
+ if (ret > 0)
collapse_pte_mapped_thp(mm, vaddr, false);
- return ret;
+ return ret < 0 ? ret : 0;
}
/**
* set_swbp - store breakpoint at a given address.
* @auprobe: arch specific probepoint information.
- * @mm: the probed process address space.
+ * @vma: the probed virtual memory area.
* @vaddr: the virtual address to insert the opcode.
*
* For the mm that @vma belongs to, store the breakpoint instruction at @vaddr.
* Return 0 (success) or a negative errno.
*/
-int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
+int __weak set_swbp(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
+ unsigned long vaddr)
{
- return uprobe_write_opcode(auprobe, mm, vaddr, UPROBE_SWBP_INSN);
+ return uprobe_write_opcode(auprobe, vma, vaddr, UPROBE_SWBP_INSN);
}
/**
* set_orig_insn - Restore the original instruction.
- * @mm: the probed process address space.
+ * @vma: the probed virtual memory area.
* @auprobe: arch specific probepoint information.
* @vaddr: the virtual address to insert the opcode.
*
* For the mm that @vma belongs to, restore the original opcode (opcode) at @vaddr.
* Return 0 (success) or a negative errno.
*/
-int __weak
-set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
+int __weak set_orig_insn(struct arch_uprobe *auprobe,
+ struct vm_area_struct *vma, unsigned long vaddr)
{
- return uprobe_write_opcode(auprobe, mm, vaddr,
+ return uprobe_write_opcode(auprobe, vma, vaddr,
*(uprobe_opcode_t *)&auprobe->insn);
}
@@ -1134,10 +1139,10 @@ static bool filter_chain(struct uprobe *uprobe, struct mm_struct *mm)
return ret;
}
-static int
-install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
- struct vm_area_struct *vma, unsigned long vaddr)
+static int install_breakpoint(struct uprobe *uprobe, struct vm_area_struct *vma,
+ unsigned long vaddr)
{
+ struct mm_struct *mm = vma->vm_mm;
bool first_uprobe;
int ret;
@@ -1153,7 +1158,7 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
if (first_uprobe)
set_bit(MMF_HAS_UPROBES, &mm->flags);
- ret = set_swbp(&uprobe->arch, mm, vaddr);
+ ret = set_swbp(&uprobe->arch, vma, vaddr);
if (!ret)
clear_bit(MMF_RECALC_UPROBES, &mm->flags);
else if (first_uprobe)
@@ -1162,11 +1167,13 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
return ret;
}
-static int
-remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr)
+static int remove_breakpoint(struct uprobe *uprobe, struct vm_area_struct *vma,
+ unsigned long vaddr)
{
+ struct mm_struct *mm = vma->vm_mm;
+
set_bit(MMF_RECALC_UPROBES, &mm->flags);
- return set_orig_insn(&uprobe->arch, mm, vaddr);
+ return set_orig_insn(&uprobe->arch, vma, vaddr);
}
struct map_info {
@@ -1296,10 +1303,10 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new)
if (is_register) {
/* consult only the "caller", new consumer. */
if (consumer_filter(new, mm))
- err = install_breakpoint(uprobe, mm, vma, info->vaddr);
+ err = install_breakpoint(uprobe, vma, info->vaddr);
} else if (test_bit(MMF_HAS_UPROBES, &mm->flags)) {
if (!filter_chain(uprobe, mm))
- err |= remove_breakpoint(uprobe, mm, info->vaddr);
+ err |= remove_breakpoint(uprobe, vma, info->vaddr);
}
unlock:
@@ -1472,7 +1479,7 @@ static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm)
continue;
vaddr = offset_to_vaddr(vma, uprobe->offset);
- err |= remove_breakpoint(uprobe, mm, vaddr);
+ err |= remove_breakpoint(uprobe, vma, vaddr);
}
mmap_read_unlock(mm);
@@ -1610,7 +1617,7 @@ int uprobe_mmap(struct vm_area_struct *vma)
if (!fatal_signal_pending(current) &&
filter_chain(uprobe, vma->vm_mm)) {
unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset);
- install_breakpoint(uprobe, vma->vm_mm, vma, vaddr);
+ install_breakpoint(uprobe, vma, vaddr);
}
put_uprobe(uprobe);
}
@@ -1956,6 +1963,9 @@ static void free_ret_instance(struct uprobe_task *utask,
* to-be-reused return instances for future uretprobes. If ri_timer()
* happens to be running right now, though, we fallback to safety and
* just perform RCU-delayed freeing of ri.
+ * Admittedly, this is a rather simple use of seqcount, but it nicely
+ * abstracts away all the necessary memory barriers, so we use
+ * a well-supported kernel primitive here.
*/
if (raw_seqcount_try_begin(&utask->ri_seqcount, seq)) {
/* immediate reuse of ri without RCU GP is OK */
@@ -2016,12 +2026,20 @@ static void ri_timer(struct timer_list *timer)
/* RCU protects return_instance from freeing. */
guard(rcu)();
- write_seqcount_begin(&utask->ri_seqcount);
+ /*
+ * See free_ret_instance() for notes on seqcount use.
+ * We also employ the raw API variants to avoid a false-positive lockdep
+ * warning complaining about enabled preemption. The timer can only be
+ * invoked once for a uprobe_task. Therefore there can only be one
+ * writer. The reader does not require an even sequence count to make
+ * progress, so it is OK to remain preemptible on PREEMPT_RT.
+ */
+ raw_write_seqcount_begin(&utask->ri_seqcount);
for_each_ret_instance_rcu(ri, utask->return_instances)
hprobe_expire(&ri->hprobe, false);
- write_seqcount_end(&utask->ri_seqcount);
+ raw_write_seqcount_end(&utask->ri_seqcount);
}
static struct uprobe_task *alloc_utask(void)