From ad7d6c7a0654a4bbda3e109f56af713267e96274 Mon Sep 17 00:00:00 2001
From: Yinghai Lu
Date: Tue, 4 Aug 2009 09:01:33 -0700
Subject: x86/irq: Fix move_irq_desc() for nodes without ram

Don't move it if target node is -1.

Signed-off-by: Yinghai Lu
LKML-Reference: <4A785B5D.4070702@kernel.org>
Signed-off-by: Ingo Molnar
---
 kernel/irq/numa_migrate.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c
index 2f69bee57bf2..3fd30197da2e 100644
--- a/kernel/irq/numa_migrate.c
+++ b/kernel/irq/numa_migrate.c
@@ -107,8 +107,8 @@ out_unlock:
 
 struct irq_desc *move_irq_desc(struct irq_desc *desc, int node)
 {
-	/* those all static, do move them */
-	if (desc->irq < NR_IRQS_LEGACY)
+	/* those static or target node is -1, do not move them */
+	if (desc->irq < NR_IRQS_LEGACY || node == -1)
 		return desc;
 
 	if (desc->node != node)
-- cgit

From 7b4b6658e152ed4568cfff48175d93645df081d1 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Wed, 22 Jul 2009 09:29:32 +0200
Subject: perf_counter: Fix software counters for fast moving event sources

Reimplement the software counters to deal with fast moving event
sources (such as tracepoints). This means being able to generate
multiple overflows from a single 'event' as well as support throttling.

Signed-off-by: Peter Zijlstra
Cc: Paul Mackerras
Signed-off-by: Ingo Molnar
---
 kernel/perf_counter.c | 164 +++++++++++++++++++++++++++++---------------------
 1 file changed, 94 insertions(+), 70 deletions(-)

(limited to 'kernel')

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 868102172aa4..615440ab9295 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -3344,87 +3344,81 @@ int perf_counter_overflow(struct perf_counter *counter, int nmi,
  * Generic software counter infrastructure
  */
 
-static void perf_swcounter_update(struct perf_counter *counter)
+/*
+ * We directly increment counter->count and keep a second value in
+ * counter->hw.period_left to count intervals. This period counter
+ * is kept in the range [-sample_period, 0] so that we can use the
+ * sign as trigger.
+ */
+
+static u64 perf_swcounter_set_period(struct perf_counter *counter)
 {
 	struct hw_perf_counter *hwc = &counter->hw;
-	u64 prev, now;
-	s64 delta;
+	u64 period = hwc->last_period;
+	u64 nr, offset;
+	s64 old, val;
+
+	hwc->last_period = hwc->sample_period;
 
 again:
-	prev = atomic64_read(&hwc->prev_count);
-	now = atomic64_read(&hwc->count);
-	if (atomic64_cmpxchg(&hwc->prev_count, prev, now) != prev)
-		goto again;
+	old = val = atomic64_read(&hwc->period_left);
+	if (val < 0)
+		return 0;
 
-	delta = now - prev;
+	nr = div64_u64(period + val, period);
+	offset = nr * period;
+	val -= offset;
+	if (atomic64_cmpxchg(&hwc->period_left, old, val) != old)
+		goto again;
 
-	atomic64_add(delta, &counter->count);
-	atomic64_sub(delta, &hwc->period_left);
+	return nr;
 }
 
-static void perf_swcounter_set_period(struct perf_counter *counter)
+static void perf_swcounter_overflow(struct perf_counter *counter,
+				    int nmi, struct perf_sample_data *data)
 {
 	struct hw_perf_counter *hwc = &counter->hw;
-	s64 left = atomic64_read(&hwc->period_left);
-	s64 period = hwc->sample_period;
+	u64 overflow;
 
-	if (unlikely(left <= -period)) {
-		left = period;
-		atomic64_set(&hwc->period_left, left);
-		hwc->last_period = period;
-	}
+	data->period = counter->hw.last_period;
+	overflow = perf_swcounter_set_period(counter);
 
-	if (unlikely(left <= 0)) {
-		left += period;
-		atomic64_add(period, &hwc->period_left);
-		hwc->last_period = period;
-	}
+	if (hwc->interrupts == MAX_INTERRUPTS)
+		return;
 
-	atomic64_set(&hwc->prev_count, -left);
-	atomic64_set(&hwc->count, -left);
+	for (; overflow; overflow--) {
+		if (perf_counter_overflow(counter, nmi, data)) {
+			/*
+			 * We inhibit the overflow from happening when
+			 * hwc->interrupts == MAX_INTERRUPTS.
+			 */
+			break;
+		}
+	}
 }
 
-static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
+static void perf_swcounter_unthrottle(struct perf_counter *counter)
 {
-	enum hrtimer_restart ret = HRTIMER_RESTART;
-	struct perf_sample_data data;
-	struct perf_counter *counter;
-	u64 period;
-
-	counter = container_of(hrtimer, struct perf_counter, hw.hrtimer);
-	counter->pmu->read(counter);
-
-	data.addr = 0;
-	data.regs = get_irq_regs();
 	/*
-	 * In case we exclude kernel IPs or are somehow not in interrupt
-	 * context, provide the next best thing, the user IP.
+	 * Nothing to do, we already reset hwc->interrupts.
 	 */
-	if ((counter->attr.exclude_kernel || !data.regs) &&
-			!counter->attr.exclude_user)
-		data.regs = task_pt_regs(current);
+}
 
-	if (data.regs) {
-		if (perf_counter_overflow(counter, 0, &data))
-			ret = HRTIMER_NORESTART;
-	}
+static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
+			       int nmi, struct perf_sample_data *data)
+{
+	struct hw_perf_counter *hwc = &counter->hw;
 
-	period = max_t(u64, 10000, counter->hw.sample_period);
-	hrtimer_forward_now(hrtimer, ns_to_ktime(period));
+	atomic64_add(nr, &counter->count);
 
-	return ret;
-}
+	if (!hwc->sample_period)
+		return;
 
-static void perf_swcounter_overflow(struct perf_counter *counter,
-				    int nmi, struct perf_sample_data *data)
-{
-	data->period = counter->hw.last_period;
+	if (!data->regs)
+		return;
 
-	perf_swcounter_update(counter);
-	perf_swcounter_set_period(counter);
-	if (perf_counter_overflow(counter, nmi, data))
-		/* soft-disable the counter */
-		;
+	if (!atomic64_add_negative(nr, &hwc->period_left))
+		perf_swcounter_overflow(counter, nmi, data);
 }
 
 static int perf_swcounter_is_counting(struct perf_counter *counter)
@@ -3488,15 +3482,6 @@ static int perf_swcounter_match(struct perf_counter *counter,
 	return 1;
 }
 
-static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
-			       int nmi, struct perf_sample_data *data)
-{
-	int neg = atomic64_add_negative(nr, &counter->hw.count);
-
-	if (counter->hw.sample_period && !neg && data->regs)
-		perf_swcounter_overflow(counter, nmi, data);
-}
-
 static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
 				     enum perf_type_id type,
 				     u32 event, u64 nr, int nmi,
@@ -3575,26 +3560,65 @@ void __perf_swcounter_event(u32 event, u64 nr, int nmi,
 
 static void perf_swcounter_read(struct perf_counter *counter)
 {
-	perf_swcounter_update(counter);
 }
 
 static int perf_swcounter_enable(struct perf_counter *counter)
 {
-	perf_swcounter_set_period(counter);
+	struct hw_perf_counter *hwc = &counter->hw;
+
+	if (hwc->sample_period) {
+		hwc->last_period = hwc->sample_period;
+		perf_swcounter_set_period(counter);
+	}
 	return 0;
 }
 
 static void perf_swcounter_disable(struct perf_counter *counter)
 {
-	perf_swcounter_update(counter);
 }
 
 static const struct pmu perf_ops_generic = {
 	.enable		= perf_swcounter_enable,
 	.disable	= perf_swcounter_disable,
 	.read		= perf_swcounter_read,
+	.unthrottle	= perf_swcounter_unthrottle,
 };
 
+/*
+ * hrtimer based swcounter callback
+ */
+
+static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
+{
+	enum hrtimer_restart ret = HRTIMER_RESTART;
+	struct perf_sample_data data;
+	struct perf_counter *counter;
+	u64 period;
+
+	counter = container_of(hrtimer, struct perf_counter, hw.hrtimer);
+	counter->pmu->read(counter);
+
+	data.addr = 0;
+	data.regs = get_irq_regs();
+	/*
+	 * In case we exclude kernel IPs or are somehow not in interrupt
+	 * context, provide the next best thing, the user IP.
+ */ + if ((counter->attr.exclude_kernel || !data.regs) && + !counter->attr.exclude_user) + data.regs = task_pt_regs(current); + + if (data.regs) { + if (perf_counter_overflow(counter, 0, &data)) + ret = HRTIMER_NORESTART; + } + + period = max_t(u64, 10000, counter->hw.sample_period); + hrtimer_forward_now(hrtimer, ns_to_ktime(period)); + + return ret; +} + /* * Software counter: cpu wall time clock */ -- cgit From 10b8e3066066708f304e0fc5cfe658e05abf943d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 8 Aug 2009 04:26:35 +0200 Subject: perf_counter: Work around gcc warning by initializing tracepoint record unconditionally MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Despite that the tracepoint record is always present when the PERF_SAMPLE_TP_RECORD flag is set, gcc raises a warning, thinking it might not be initialized: kernel/perf_counter.c: In function ‘perf_counter_output’: kernel/perf_counter.c:2650: warning: ‘tp’ may be used uninitialized in this function Then, initialize it to NULL and always check if it's not NULL before dereference it. Reported-by: Ingo Molnar Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Mike Galbraith Cc: Paul Mackerras LKML-Reference: <1249698400-5441-2-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 615440ab9295..117622cb73a3 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2646,7 +2646,7 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, u64 counter; } group_entry; struct perf_callchain_entry *callchain = NULL; - struct perf_tracepoint_record *tp; + struct perf_tracepoint_record *tp = NULL; int callchain_size = 0; u64 time; struct { @@ -2717,7 +2717,8 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, if (sample_type & PERF_SAMPLE_TP_RECORD) { tp = data->private; - header.size += tp->size; + if (tp) + header.size += tp->size; } ret = perf_output_begin(&handle, counter, header.size, nmi, 1); @@ -2783,7 +2784,7 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, } } - if (sample_type & PERF_SAMPLE_TP_RECORD) + if ((sample_type & PERF_SAMPLE_TP_RECORD) && tp) perf_output_copy(&handle, tp->record, tp->size); perf_output_end(&handle); -- cgit From 3a43ce68ae1758fa6a839386025ef45acb6baa22 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 8 Aug 2009 04:26:37 +0200 Subject: perf_counter: Fix tracepoint sampling to be part of generic sampling Based on Peter's comments, make tracepoint sampling generic just like all the other sampling bits are. This is a rename with no code changes: - PERF_SAMPLE_TP_RECORD to PERF_SAMPLE_RAW - struct perf_tracepoint_record to perf_raw_record We want the system in place that transport tracepoints raw samples events into the perf ring buffer to be generalized and usable by any type of counter. 

Reported-by: Peter Zijlstra
Signed-off-by: Frederic Weisbecker
Cc: Peter Zijlstra
Cc: Arnaldo Carvalho de Melo
Cc: Mike Galbraith
Cc: Paul Mackerras
LKML-Reference: <1249698400-5441-4-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar
---
 kernel/perf_counter.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 117622cb73a3..002310540417 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -2646,7 +2646,7 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
 		u64 counter;
 	} group_entry;
 	struct perf_callchain_entry *callchain = NULL;
-	struct perf_tracepoint_record *tp = NULL;
+	struct perf_raw_record *raw = NULL;
 	int callchain_size = 0;
 	u64 time;
 	struct {
@@ -2715,10 +2715,10 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
 		header.size += sizeof(u64);
 	}
 
-	if (sample_type & PERF_SAMPLE_TP_RECORD) {
-		tp = data->private;
-		if (tp)
-			header.size += tp->size;
+	if (sample_type & PERF_SAMPLE_RAW) {
+		raw = data->raw;
+		if (raw)
+			header.size += raw->size;
 	}
 
 	ret = perf_output_begin(&handle, counter, header.size, nmi, 1);
@@ -2784,8 +2784,8 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
 		}
 	}
 
-	if ((sample_type & PERF_SAMPLE_TP_RECORD) && tp)
-		perf_output_copy(&handle, tp->record, tp->size);
+	if ((sample_type & PERF_SAMPLE_RAW) && raw)
+		perf_output_copy(&handle, raw->data, raw->size);
 
 	perf_output_end(&handle);
 }
@@ -3740,15 +3740,15 @@ static const struct pmu perf_ops_task_clock = {
 void perf_tpcounter_event(int event_id, u64 addr, u64 count, void *record,
 			  int entry_size)
 {
-	struct perf_tracepoint_record tp = {
+	struct perf_raw_record raw = {
 		.size = entry_size,
-		.record = record,
+		.data = record,
 	};
 
 	struct perf_sample_data data = {
 		.regs = get_irq_regs(),
 		.addr = addr,
-		.private = &tp,
+		.raw = &raw,
 	};
 
 	if (!data.regs)
-- cgit

From 3a80b4a3539696f4b0574876326860323035a302 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Fri, 7 Aug 2009 19:49:01 +0200
Subject: perf_counter: Fix a race on perf_counter_ctx

While extending perfcounters with BTS hw-tracing, Markus Metzger
managed to trigger this warning:

  [ 995.557128] WARNING: at kernel/perf_counter.c:1191 __perf_counter_task_sched_out+0x48/0x6b()

triggers because commit 9f498cc5be7e013d8d6e4c616980ed0ffc8680d2
(perf_counter: Full task tracing) removed clearing of
tsk->perf_counter_ctxp out from under ctx->lock which introduced a
race (against perf_lock_task_context).

Move it back and deal with the exit notification by explicitly passing
along the former task context.

Reported-by: Markus T Metzger
Signed-off-by: Peter Zijlstra
Cc: Paul Mackerras
LKML-Reference: <1249667341.17467.5.camel@twins>
Signed-off-by: Ingo Molnar
---
 kernel/perf_counter.c | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 002310540417..546e62d62941 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -2850,7 +2850,8 @@ perf_counter_read_event(struct perf_counter *counter,
  */
 
 struct perf_task_event {
-	struct task_struct	*task;
+	struct task_struct		*task;
+	struct perf_counter_context	*task_ctx;
 
 	struct {
 		struct perf_event_header	header;
@@ -2910,24 +2911,23 @@ static void perf_counter_task_ctx(struct perf_counter_context *ctx,
 static void perf_counter_task_event(struct perf_task_event *task_event)
 {
 	struct perf_cpu_context *cpuctx;
-	struct perf_counter_context *ctx;
+	struct perf_counter_context *ctx = task_event->task_ctx;
 
 	cpuctx = &get_cpu_var(perf_cpu_context);
 	perf_counter_task_ctx(&cpuctx->ctx, task_event);
 	put_cpu_var(perf_cpu_context);
 
 	rcu_read_lock();
-	/*
-	 * doesn't really matter which of the child contexts the
-	 * events ends up in.
-	 */
-	ctx = rcu_dereference(current->perf_counter_ctxp);
+	if (!ctx)
+		ctx = rcu_dereference(task_event->task->perf_counter_ctxp);
 	if (ctx)
 		perf_counter_task_ctx(ctx, task_event);
 	rcu_read_unlock();
 }
 
-static void perf_counter_task(struct task_struct *task, int new)
+static void perf_counter_task(struct task_struct *task,
+			      struct perf_counter_context *task_ctx,
+			      int new)
 {
 	struct perf_task_event task_event;
 
@@ -2937,8 +2937,9 @@ static void perf_counter_task(struct task_struct *task, int new)
 		return;
 
 	task_event = (struct perf_task_event){
-		.task	= task,
-		.event  = {
+		.task		= task,
+		.task_ctx	= task_ctx,
+		.event		= {
 			.header = {
 				.type = new ? PERF_EVENT_FORK : PERF_EVENT_EXIT,
 				.misc = 0,
@@ -2956,7 +2957,7 @@ static void perf_counter_task(struct task_struct *task, int new)
 
 void perf_counter_fork(struct task_struct *task)
 {
-	perf_counter_task(task, 1);
+	perf_counter_task(task, NULL, 1);
 }
 
 /*
@@ -4310,7 +4311,7 @@ void perf_counter_exit_task(struct task_struct *child)
 	unsigned long flags;
 
 	if (likely(!child->perf_counter_ctxp)) {
-		perf_counter_task(child, 0);
+		perf_counter_task(child, NULL, 0);
 		return;
 	}
 
@@ -4330,6 +4331,7 @@ void perf_counter_exit_task(struct task_struct *child)
 	 * incremented the context's refcount before we do put_ctx below.
 	 */
 	spin_lock(&child_ctx->lock);
+	child->perf_counter_ctxp = NULL;
 	/*
 	 * If this context is a clone; unclone it so it can't get
 	 * swapped to another process while we're removing all
@@ -4343,9 +4345,7 @@ void perf_counter_exit_task(struct task_struct *child)
 	 * won't get any samples after PERF_EVENT_EXIT. We can however still
 	 * get a few PERF_EVENT_READ events.
 	 */
-	perf_counter_task(child, 0);
-
-	child->perf_counter_ctxp = NULL;
+	perf_counter_task(child, child_ctx, 0);
 
 	/*
 	 * We can recurse on the same lock type through:
-- cgit

From a044560c3a1f0ad75ce685c1ed7604820b9ed319 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Mon, 10 Aug 2009 11:16:52 +0200
Subject: perf_counter: Correct PERF_SAMPLE_RAW output

PERF_SAMPLE_* output switches should unconditionally output the
correct format, as they are the only way to unambiguously parse the
PERF_EVENT_SAMPLE data.

Signed-off-by: Peter Zijlstra
Acked-by: Frederic Weisbecker
Cc: Arnaldo Carvalho de Melo
Cc: Mike Galbraith
Cc: Paul Mackerras
LKML-Reference: <1249896447.17467.74.camel@twins>
Signed-off-by: Ingo Molnar
---
 kernel/perf_counter.c | 30 ++++++++++++++++++++++++------
 1 file changed, 24 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 546e62d62941..5229d1666fa5 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -2646,7 +2646,6 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
 		u64 counter;
 	} group_entry;
 	struct perf_callchain_entry *callchain = NULL;
-	struct perf_raw_record *raw = NULL;
 	int callchain_size = 0;
 	u64 time;
 	struct {
@@ -2716,9 +2715,15 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
 	}
 
 	if (sample_type & PERF_SAMPLE_RAW) {
-		raw = data->raw;
-		if (raw)
-			header.size += raw->size;
+		int size = sizeof(u32);
+
+		if (data->raw)
+			size += data->raw->size;
+		else
+			size += sizeof(u32);
+
+		WARN_ON_ONCE(size & (sizeof(u64)-1));
+		header.size += size;
 	}
 
 	ret = perf_output_begin(&handle, counter, header.size, nmi, 1);
@@ -2784,8 +2789,21 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
 		}
 	}
 
-	if ((sample_type & PERF_SAMPLE_RAW) && raw)
-		perf_output_copy(&handle, raw->data, raw->size);
+	if (sample_type & PERF_SAMPLE_RAW) {
+		if (data->raw) {
+			perf_output_put(&handle, data->raw->size);
+			perf_output_copy(&handle, data->raw->data, data->raw->size);
+		} else {
+			struct {
+				u32	size;
+				u32	data;
+			} raw = {
+				.size = sizeof(u32),
+				.data = 0,
+			};
+			perf_output_put(&handle, raw);
+		}
+	}
 
 	perf_output_end(&handle);
 }
-- cgit

From a4e95fc2cbb31d70a65beffeaf8773f881328c34 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Mon, 10 Aug 2009 11:20:12 +0200
Subject: perf_counter: Require CAP_SYS_ADMIN for raw tracepoint data

Raw tracepoint data contains various kernel internals and data from
other users, so restrict this to CAP_SYS_ADMIN.

Signed-off-by: Peter Zijlstra
Acked-by: Frederic Weisbecker
Cc: Arnaldo Carvalho de Melo
Cc: Mike Galbraith
Cc: Paul Mackerras
LKML-Reference: <1249896452.17467.75.camel@twins>
Signed-off-by: Ingo Molnar
---
 kernel/perf_counter.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'kernel')

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 5229d1666fa5..b0b20a07f394 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -3787,6 +3787,14 @@ static void tp_perf_counter_destroy(struct perf_counter *counter)
 
 static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
 {
+	/*
+	 * Raw tracepoint data is a severe data leak, only allow root to
+	 * have these.
+	 */
+	if ((counter->attr.sample_type & PERF_SAMPLE_RAW) &&
+			!capable(CAP_SYS_ADMIN))
+		return ERR_PTR(-EPERM);
+
 	if (ftrace_profile_enable(counter->attr.config))
 		return NULL;
-- cgit

From 39cbb602b543e477df71dca84b5b2e36f8bd29fc Mon Sep 17 00:00:00 2001
From: "Alan D. Brunelle"
Date: Fri, 7 Aug 2009 12:01:08 -0400
Subject: Remove double removal of blktrace directory

commit fd51d251e4cdb21f68e9dbc4336514d64a105a79
Author: Stefan Raspl
Date:   Tue May 19 09:59:08 2009 +0200

    blktrace: remove debugfs entries on bad path

added an explicit invocation of debugfs_remove for bt->dir, but in
blk_remove_buf_file_callback we are also getting the directory removed.
On occasion I am seeing memory corruption that I have bisected down to
this commit.

[The testing involves a (long) series of I/O benchmarks with blktrace
invoked around the actual runs.]

I believe that this committed patch is correct, but the problem actually
lies in the code in blk_remove_buf_file_callback. With this patch I am
able to consistently get complete runs whereas previously I could not
get a single run to complete.

The first part of the patch simply moves the debugfs_remove below the
relay_close: the relay_close call will remove files under bt->dir, and
so we should not remove the directory until all the files we created
have been removed.

(Note: This is not sufficient to fix the problem - the file system code
has ref counts on the directory, so our invocation does not cause the
directory to actually be removed. Nonetheless, we should not rely upon
that feature.)

Signed-off-by: Alan D. Brunelle
Signed-off-by: Jens Axboe
---
 kernel/trace/blktrace.c | 12 +-----------
 1 file changed, 1 insertion(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 1090b0aed9ba..7a34cb563fec 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -267,8 +267,8 @@ static void blk_trace_free(struct blk_trace *bt)
 {
 	debugfs_remove(bt->msg_file);
 	debugfs_remove(bt->dropped_file);
-	debugfs_remove(bt->dir);
 	relay_close(bt->rchan);
+	debugfs_remove(bt->dir);
 	free_percpu(bt->sequence);
 	free_percpu(bt->msg_data);
 	kfree(bt);
@@ -378,18 +378,8 @@ static int blk_subbuf_start_callback(struct rchan_buf *buf, void *subbuf,
 
 static int blk_remove_buf_file_callback(struct dentry *dentry)
 {
-	struct dentry *parent = dentry->d_parent;
 	debugfs_remove(dentry);
 
-	/*
-	 * this will fail for all but the last file, but that is ok. what we
-	 * care about is the top level buts->name directory going away, when
-	 * the last trace file is gone. Then we don't have to rmdir() that
-	 * manually on trace stop, so it nicely solves the issue with
-	 * force killing of running traces.
-	 */
-
-	debugfs_remove(parent);
 	return 0;
 }
-- cgit
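
A minimal user-space sketch of how a ring-buffer consumer might walk the
PERF_SAMPLE_RAW field whose layout is fixed by commit a044560c above
("perf_counter: Correct PERF_SAMPLE_RAW output"): a u32 size followed by the
payload, with the kernel emitting a zeroed u32 when no raw data is attached so
the field is always present and u64-aligned. The struct and function names
below are illustrative only and are not part of any patch in this series.

#include <stdint.h>
#include <stddef.h>
#include <string.h>

struct raw_view {
	uint32_t size;		/* payload size in bytes, read from the sample */
	const void *data;	/* pointer into the sample buffer */
};

/* Returns bytes consumed from buf, or 0 if the buffer is too short. */
static size_t parse_sample_raw(const unsigned char *buf, size_t len,
			       struct raw_view *out)
{
	uint32_t size;

	if (len < sizeof(size))
		return 0;
	memcpy(&size, buf, sizeof(size));	/* u32 size header */

	if (len < sizeof(size) + size)
		return 0;
	out->size = size;
	out->data = buf + sizeof(size);		/* raw payload follows the size */

	/* sizeof(u32) + size is u64-aligned by construction on the kernel side */
	return sizeof(size) + size;
}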