diff options
-rw-r--r-- | Documentation/ABI/obsolete/automount-tracefs-debugfs | 20 | ||||
-rw-r--r-- | Documentation/trace/eprobetrace.rst | 269 | ||||
-rw-r--r-- | Documentation/trace/index.rst | 1 | ||||
-rw-r--r-- | include/linux/trace_events.h | 3 | ||||
-rw-r--r-- | include/trace/events/sched.h | 2 | ||||
-rw-r--r-- | kernel/trace/Kconfig | 27 | ||||
-rw-r--r-- | kernel/trace/Makefile | 2 | ||||
-rw-r--r-- | kernel/trace/preemptirq_delay_test.c | 13 | ||||
-rw-r--r-- | kernel/trace/rv/rv.c | 6 | ||||
-rw-r--r-- | kernel/trace/trace.c | 49 | ||||
-rw-r--r-- | kernel/trace/trace.h | 4 | ||||
-rw-r--r-- | kernel/trace/trace_events.c | 154 | ||||
-rw-r--r-- | kernel/trace/trace_events_filter.c | 28 | ||||
-rw-r--r-- | kernel/trace/trace_hwlat.c | 5 |
14 files changed, 498 insertions, 85 deletions
diff --git a/Documentation/ABI/obsolete/automount-tracefs-debugfs b/Documentation/ABI/obsolete/automount-tracefs-debugfs new file mode 100644 index 000000000000..a5196ec78cb5 --- /dev/null +++ b/Documentation/ABI/obsolete/automount-tracefs-debugfs @@ -0,0 +1,20 @@ +What: /sys/kernel/debug/tracing +Date: May 2008 +KernelVersion: 2.6.27 +Contact: linux-trace-kernel@vger.kernel.org +Description: + + The ftrace was first added to the kernel, its interface was placed + into the debugfs file system under the "tracing" directory. Access + to the files were in /sys/kernel/debug/tracing. As systems wanted + access to the tracing interface without having to enable debugfs, a + new interface was created called "tracefs". This was a stand alone + file system and was usually mounted in /sys/kernel/tracing. + + To allow older tooling to continue to operate, when mounting + debugfs, the tracefs file system would automatically get mounted in + the "tracing" directory of debugfs. The tracefs interface was added + in January 2015 in the v4.1 kernel. + + All tooling should now be using tracefs directly and the "tracing" + directory in debugfs should be removed by January 2030. diff --git a/Documentation/trace/eprobetrace.rst b/Documentation/trace/eprobetrace.rst new file mode 100644 index 000000000000..89b5157cfab8 --- /dev/null +++ b/Documentation/trace/eprobetrace.rst @@ -0,0 +1,269 @@ +.. SPDX-License-Identifier: GPL-2.0 + +================================== +Eprobe - Event-based Probe Tracing +================================== + +:Author: Steven Rostedt <rostedt@goodmis.org> + +- Written for v6.17 + +Overview +======== + +Eprobes are dynamic events that are placed on existing events to either +dereference a field that is a pointer, or simply to limit what fields are +recorded in the trace event. + +Eprobes depend on kprobe events so to enable this feature, build your kernel +with CONFIG_EPROBE_EVENTS=y. + +Eprobes are created via the /sys/kernel/tracing/dynamic_events file. + +Synopsis of eprobe_events +------------------------- +:: + + e[:[EGRP/][EEVENT]] GRP.EVENT [FETCHARGS] : Set a probe + -:[EGRP/][EEVENT] : Clear a probe + + EGRP : Group name of the new event. If omitted, use "eprobes" for it. + EEVENT : Event name. If omitted, the event name is generated and will + be the same event name as the event it attached to. + GRP : Group name of the event to attach to. + EVENT : Event name of the event to attach to. + + FETCHARGS : Arguments. Each probe can have up to 128 args. + $FIELD : Fetch the value of the event field called FIELD. + @ADDR : Fetch memory at ADDR (ADDR should be in kernel) + @SYM[+|-offs] : Fetch memory at SYM +|- offs (SYM should be a data symbol) + $comm : Fetch current task comm. + +|-[u]OFFS(FETCHARG) : Fetch memory at FETCHARG +|- OFFS address.(\*3)(\*4) + \IMM : Store an immediate value to the argument. + NAME=FETCHARG : Set NAME as the argument name of FETCHARG. + FETCHARG:TYPE : Set TYPE as the type of FETCHARG. Currently, basic types + (u8/u16/u32/u64/s8/s16/s32/s64), hexadecimal types + (x8/x16/x32/x64), VFS layer common type(%pd/%pD), "char", + "string", "ustring", "symbol", "symstr" and "bitfield" are + supported. + +Types +----- +The FETCHARGS above is very similar to the kprobe events as described in +Documentation/trace/kprobetrace.rst. + +The difference between eprobes and kprobes FETCHARGS is that eprobes has a +$FIELD command that returns the content of the event field of the event +that is attached. Eprobes do not have access to registers, stacks and function +arguments that kprobes has. + +If a field argument is a pointer, it may be dereferenced just like a memory +address using the FETCHARGS syntax. + + +Attaching to dynamic events +--------------------------- + +Eprobes may attach to dynamic events as well as to normal events. It may +attach to a kprobe event, a synthetic event or a fprobe event. This is useful +if the type of a field needs to be changed. See Example 2 below. + +Usage examples +============== + +Example 1 +--------- + +The basic usage of eprobes is to limit the data that is being recorded into +the tracing buffer. For example, a common event to trace is the sched_switch +trace event. That has a format of:: + + field:unsigned short common_type; offset:0; size:2; signed:0; + field:unsigned char common_flags; offset:2; size:1; signed:0; + field:unsigned char common_preempt_count; offset:3; size:1; signed:0; + field:int common_pid; offset:4; size:4; signed:1; + + field:char prev_comm[16]; offset:8; size:16; signed:0; + field:pid_t prev_pid; offset:24; size:4; signed:1; + field:int prev_prio; offset:28; size:4; signed:1; + field:long prev_state; offset:32; size:8; signed:1; + field:char next_comm[16]; offset:40; size:16; signed:0; + field:pid_t next_pid; offset:56; size:4; signed:1; + field:int next_prio; offset:60; size:4; signed:1; + +The first four fields are common to all events and can not be limited. But the +rest of the event has 60 bytes of information. It records the names of the +previous and next tasks being scheduled out and in, as well as their pids and +priorities. It also records the state of the previous task. If only the pids +of the tasks are of interest, why waste the ring buffer with all the other +fields? + +An eprobe can limit what gets recorded. Note, it does not help in performance, +as all the fields are recorded in a temporary buffer to process the eprobe. +:: + + # echo 'e:sched/switch sched.sched_switch prev=$prev_pid:u32 next=$next_pid:u32' >> /sys/kernel/tracing/dynamic_events + # echo 1 > /sys/kernel/tracing/events/sched/switch/enable + # cat /sys/kernel/tracing/trace + + # tracer: nop + # + # entries-in-buffer/entries-written: 2721/2721 #P:8 + # + # _-----=> irqs-off/BH-disabled + # / _----=> need-resched + # | / _---=> hardirq/softirq + # || / _--=> preempt-depth + # ||| / _-=> migrate-disable + # |||| / delay + # TASK-PID CPU# ||||| TIMESTAMP FUNCTION + # | | | ||||| | | + sshd-session-1082 [004] d..4. 5041.239906: switch: (sched.sched_switch) prev=1082 next=0 + bash-1085 [001] d..4. 5041.240198: switch: (sched.sched_switch) prev=1085 next=141 + kworker/u34:5-141 [001] d..4. 5041.240259: switch: (sched.sched_switch) prev=141 next=1085 + <idle>-0 [004] d..4. 5041.240354: switch: (sched.sched_switch) prev=0 next=1082 + bash-1085 [001] d..4. 5041.240385: switch: (sched.sched_switch) prev=1085 next=141 + kworker/u34:5-141 [001] d..4. 5041.240410: switch: (sched.sched_switch) prev=141 next=1085 + bash-1085 [001] d..4. 5041.240478: switch: (sched.sched_switch) prev=1085 next=0 + sshd-session-1082 [004] d..4. 5041.240526: switch: (sched.sched_switch) prev=1082 next=0 + <idle>-0 [001] d..4. 5041.247524: switch: (sched.sched_switch) prev=0 next=90 + <idle>-0 [002] d..4. 5041.247545: switch: (sched.sched_switch) prev=0 next=16 + kworker/1:1-90 [001] d..4. 5041.247580: switch: (sched.sched_switch) prev=90 next=0 + rcu_sched-16 [002] d..4. 5041.247591: switch: (sched.sched_switch) prev=16 next=0 + <idle>-0 [002] d..4. 5041.257536: switch: (sched.sched_switch) prev=0 next=16 + rcu_sched-16 [002] d..4. 5041.257573: switch: (sched.sched_switch) prev=16 next=0 + +Note, without adding the "u32" after the prev_pid and next_pid, the values +would default showing in hexadecimal. + +Example 2 +--------- + +If a specific system call is to be recorded but the syscalls events are not +enabled, the raw_syscalls can still be used (syscalls are system call +events are not normal events, but are created from the raw_syscalls events +within the kernel). In order to trace the openat system call, one can create +an event probe on top of the raw_syscalls event: +:: + + # cd /sys/kernel/tracing + # cat events/raw_syscalls/sys_enter/format + name: sys_enter + ID: 395 + format: + field:unsigned short common_type; offset:0; size:2; signed:0; + field:unsigned char common_flags; offset:2; size:1; signed:0; + field:unsigned char common_preempt_count; offset:3; size:1; signed:0; + field:int common_pid; offset:4; size:4; signed:1; + + field:long id; offset:8; size:8; signed:1; + field:unsigned long args[6]; offset:16; size:48; signed:0; + + print fmt: "NR %ld (%lx, %lx, %lx, %lx, %lx, %lx)", REC->id, REC->args[0], REC->args[1], REC->args[2], REC->args[3], REC->args[4], REC->args[5] + +From the source code, the sys_openat() has: +:: + + int sys_openat(int dirfd, const char *path, int flags, mode_t mode) + { + return my_syscall4(__NR_openat, dirfd, path, flags, mode); + } + +The path is the second parameter, and that is what is wanted. +:: + + # echo 'e:openat raw_syscalls.sys_enter nr=$id filename=+8($args):ustring' >> dynamic_events + +This is being run on x86_64 where the word size is 8 bytes and the openat +system call __NR_openat is set at 257. +:: + + # echo 'nr == 257' > events/eprobes/openat/filter + +Now enable the event and look at the trace. +:: + + # echo 1 > events/eprobes/openat/enable + # cat trace + + # tracer: nop + # + # entries-in-buffer/entries-written: 4/4 #P:8 + # + # _-----=> irqs-off/BH-disabled + # / _----=> need-resched + # | / _---=> hardirq/softirq + # || / _--=> preempt-depth + # ||| / _-=> migrate-disable + # |||| / delay + # TASK-PID CPU# ||||| TIMESTAMP FUNCTION + # | | | ||||| | | + cat-1298 [003] ...2. 2060.875970: openat: (raw_syscalls.sys_enter) nr=0x101 filename=(fault) + cat-1298 [003] ...2. 2060.876197: openat: (raw_syscalls.sys_enter) nr=0x101 filename=(fault) + cat-1298 [003] ...2. 2060.879126: openat: (raw_syscalls.sys_enter) nr=0x101 filename=(fault) + cat-1298 [003] ...2. 2060.879639: openat: (raw_syscalls.sys_enter) nr=0x101 filename=(fault) + +The filename shows "(fault)". This is likely because the filename has not been +pulled into memory yet and currently trace events cannot fault in memory that +is not present. When an eprobe tries to read memory that has not been faulted +in yet, it will show the "(fault)" text. + +To get around this, as the kernel will likely pull in this filename and make +it present, attaching it to a synthetic event that can pass the address of the +filename from the entry of the event to the end of the event, this can be used +to show the filename when the system call returns. + +Remove the old eprobe:: + + # echo 1 > events/eprobes/openat/enable + # echo '-:openat' >> dynamic_events + +This time make an eprobe where the address of the filename is saved:: + + # echo 'e:openat_start raw_syscalls.sys_enter nr=$id filename=+8($args):x64' >> dynamic_events + +Create a synthetic event that passes the address of the filename to the +end of the event:: + + # echo 's:filename u64 file' >> dynamic_events + # echo 'hist:keys=common_pid:f=filename if nr == 257' > events/eprobes/openat_start/trigger + # echo 'hist:keys=common_pid:file=$f:onmatch(eprobes.openat_start).trace(filename,$file) if id == 257' > events/raw_syscalls/sys_exit/trigger + +Now that the address of the filename has been passed to the end of the +system call, create another eprobe to attach to the exit event to show the +string:: + + # echo 'e:openat synthetic.filename filename=+0($file):ustring' >> dynamic_events + # echo 1 > events/eprobes/openat/enable + # cat trace + + # tracer: nop + # + # entries-in-buffer/entries-written: 4/4 #P:8 + # + # _-----=> irqs-off/BH-disabled + # / _----=> need-resched + # | / _---=> hardirq/softirq + # || / _--=> preempt-depth + # ||| / _-=> migrate-disable + # |||| / delay + # TASK-PID CPU# ||||| TIMESTAMP FUNCTION + # | | | ||||| | | + cat-1331 [001] ...5. 2944.787977: openat: (synthetic.filename) filename="/etc/ld.so.cache" + cat-1331 [001] ...5. 2944.788480: openat: (synthetic.filename) filename="/lib/x86_64-linux-gnu/libc.so.6" + cat-1331 [001] ...5. 2944.793426: openat: (synthetic.filename) filename="/usr/lib/locale/locale-archive" + cat-1331 [001] ...5. 2944.831362: openat: (synthetic.filename) filename="trace" + +Example 3 +--------- + +If syscall trace events are available, the above would not need the first +eprobe, but it would still need the last one:: + + # echo 's:filename u64 file' >> dynamic_events + # echo 'hist:keys=common_pid:f=filename' > events/syscalls/sys_enter_openat/trigger + # echo 'hist:keys=common_pid:file=$f:onmatch(syscalls.sys_enter_openat).trace(filename,$file)' > events/syscalls/sys_exit_openat/trigger + # echo 'e:openat synthetic.filename filename=+0($file):ustring' >> dynamic_events + # echo 1 > events/eprobes/openat/enable + +And this would produce the same result as Example 2. diff --git a/Documentation/trace/index.rst b/Documentation/trace/index.rst index cc1dc5a087e8..b4a429dc4f7a 100644 --- a/Documentation/trace/index.rst +++ b/Documentation/trace/index.rst @@ -36,6 +36,7 @@ the Linux kernel. kprobes kprobetrace fprobetrace + eprobetrace fprobe ring-buffer-design diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index fa9cf4292dff..04307a19cde3 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -480,7 +480,6 @@ enum { EVENT_FILE_FL_RECORDED_TGID_BIT, EVENT_FILE_FL_FILTERED_BIT, EVENT_FILE_FL_NO_SET_FILTER_BIT, - EVENT_FILE_FL_SOFT_MODE_BIT, EVENT_FILE_FL_SOFT_DISABLED_BIT, EVENT_FILE_FL_TRIGGER_MODE_BIT, EVENT_FILE_FL_TRIGGER_COND_BIT, @@ -618,7 +617,6 @@ extern int __kprobe_event_add_fields(struct dynevent_cmd *cmd, ...); * RECORDED_TGID - The tgids should be recorded at sched_switch * FILTERED - The event has a filter attached * NO_SET_FILTER - Set when filter has error and is to be ignored - * SOFT_MODE - The event is enabled/disabled by SOFT_DISABLED * SOFT_DISABLED - When set, do not trace the event (even though its * tracepoint may be enabled) * TRIGGER_MODE - When set, invoke the triggers associated with the event @@ -633,7 +631,6 @@ enum { EVENT_FILE_FL_RECORDED_TGID = (1 << EVENT_FILE_FL_RECORDED_TGID_BIT), EVENT_FILE_FL_FILTERED = (1 << EVENT_FILE_FL_FILTERED_BIT), EVENT_FILE_FL_NO_SET_FILTER = (1 << EVENT_FILE_FL_NO_SET_FILTER_BIT), - EVENT_FILE_FL_SOFT_MODE = (1 << EVENT_FILE_FL_SOFT_MODE_BIT), EVENT_FILE_FL_SOFT_DISABLED = (1 << EVENT_FILE_FL_SOFT_DISABLED_BIT), EVENT_FILE_FL_TRIGGER_MODE = (1 << EVENT_FILE_FL_TRIGGER_MODE_BIT), EVENT_FILE_FL_TRIGGER_COND = (1 << EVENT_FILE_FL_TRIGGER_COND_BIT), diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 9f16ee2603ba..7b2645b50e78 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -829,8 +829,6 @@ TRACE_EVENT(sched_wake_idle_without_ipi, /* * Following tracepoints are not exported in tracefs and provide hooking * mechanisms only for testing and debugging purposes. - * - * Postfixed with _tp to make them easily identifiable in the code. */ DECLARE_TRACE(pelt_cfs, TP_PROTO(struct cfs_rq *cfs_rq), diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index f135d04e1860..d2c79da81e4f 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -200,6 +200,19 @@ menuconfig FTRACE if FTRACE +config TRACEFS_AUTOMOUNT_DEPRECATED + bool "Automount tracefs on debugfs [DEPRECATED]" + depends on TRACING + default y + help + The tracing interface was moved from /sys/kernel/debug/tracing + to /sys/kernel/tracing in 2015, but the tracing file system + was still automounted in /sys/kernel/debug for backward + compatibility with tooling. + + The new interface has been around for more than 10 years and + the old debug mount will soon be removed. + config BOOTTIME_TRACING bool "Boot-time Tracing support" depends on TRACING @@ -780,6 +793,20 @@ config UPROBE_EVENTS This option is required if you plan to use perf-probe subcommand of perf tools on user space applications. +config EPROBE_EVENTS + bool "Enable event-based dynamic events" + depends on TRACING + depends on HAVE_REGS_AND_STACK_ACCESS_API + select PROBE_EVENTS + select DYNAMIC_EVENTS + default y + help + Eprobes are dynamic events that can be placed on other existing + events. It can be used to limit what fields are recorded in + an event or even dereference a field of an event. It can + convert the type of an event field. For example, turn an + address into a string. + config BPF_EVENTS depends on BPF_SYSCALL depends on (KPROBE_EVENTS || UPROBE_EVENTS) && PERF_EVENTS diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 057cd975d014..dcb4e02afc5f 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -82,7 +82,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o endif obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o -obj-$(CONFIG_PROBE_EVENTS) += trace_eprobe.o +obj-$(CONFIG_EPROBE_EVENTS) += trace_eprobe.o obj-$(CONFIG_TRACE_EVENT_INJECT) += trace_events_inject.o obj-$(CONFIG_SYNTH_EVENTS) += trace_events_synth.o obj-$(CONFIG_HIST_TRIGGERS) += trace_events_hist.o diff --git a/kernel/trace/preemptirq_delay_test.c b/kernel/trace/preemptirq_delay_test.c index 314ffc143039..acb0c971a408 100644 --- a/kernel/trace/preemptirq_delay_test.c +++ b/kernel/trace/preemptirq_delay_test.c @@ -117,12 +117,15 @@ static int preemptirq_delay_run(void *data) { int i; int s = MIN(burst_size, NR_TEST_FUNCS); - struct cpumask cpu_mask; + cpumask_var_t cpu_mask; + + if (!alloc_cpumask_var(&cpu_mask, GFP_KERNEL)) + return -ENOMEM; if (cpu_affinity > -1) { - cpumask_clear(&cpu_mask); - cpumask_set_cpu(cpu_affinity, &cpu_mask); - if (set_cpus_allowed_ptr(current, &cpu_mask)) + cpumask_clear(cpu_mask); + cpumask_set_cpu(cpu_affinity, cpu_mask); + if (set_cpus_allowed_ptr(current, cpu_mask)) pr_err("cpu_affinity:%d, failed\n", cpu_affinity); } @@ -139,6 +142,8 @@ static int preemptirq_delay_run(void *data) __set_current_state(TASK_RUNNING); + free_cpumask_var(cpu_mask); + return 0; } diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c index bd7d56dbf6c2..1482e91c39f4 100644 --- a/kernel/trace/rv/rv.c +++ b/kernel/trace/rv/rv.c @@ -674,8 +674,6 @@ static bool __read_mostly monitoring_on; */ bool rv_monitoring_on(void) { - /* Ensures that concurrent monitors read consistent monitoring_on */ - smp_rmb(); return READ_ONCE(monitoring_on); } @@ -695,8 +693,6 @@ static ssize_t monitoring_on_read_data(struct file *filp, char __user *user_buf, static void turn_monitoring_off(void) { WRITE_ONCE(monitoring_on, false); - /* Ensures that concurrent monitors read consistent monitoring_on */ - smp_wmb(); } static void reset_all_monitors(void) @@ -712,8 +708,6 @@ static void reset_all_monitors(void) static void turn_monitoring_on(void) { WRITE_ONCE(monitoring_on, true); - /* Ensures that concurrent monitors read consistent monitoring_on */ - smp_wmb(); } static void turn_monitoring_on_with_reset(void) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 7996f26c3f46..b9716178f728 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -936,7 +936,6 @@ int tracing_is_enabled(void) * return the mirror variable of the state of the ring buffer. * It's a little racy, but we don't really care. */ - smp_rmb(); return !global_trace.buffer_disabled; } @@ -1107,8 +1106,6 @@ void tracer_tracing_on(struct trace_array *tr) * important to be fast than accurate. */ tr->buffer_disabled = 0; - /* Make the flag seen by readers */ - smp_wmb(); } /** @@ -1640,8 +1637,6 @@ void tracer_tracing_off(struct trace_array *tr) * important to be fast than accurate. */ tr->buffer_disabled = 1; - /* Make the flag seen by readers */ - smp_wmb(); } /** @@ -2710,8 +2705,6 @@ void trace_buffered_event_enable(void) static void enable_trace_buffered_event(void *data) { - /* Probably not needed, but do it anyway */ - smp_rmb(); this_cpu_dec(trace_buffered_event_cnt); } @@ -5931,17 +5924,27 @@ static inline void trace_insert_eval_map_file(struct module *mod, struct trace_eval_map **start, int len) { } #endif /* !CONFIG_TRACE_EVAL_MAP_FILE */ -static void trace_insert_eval_map(struct module *mod, - struct trace_eval_map **start, int len) +static void +trace_event_update_with_eval_map(struct module *mod, + struct trace_eval_map **start, + int len) { struct trace_eval_map **map; - if (len <= 0) - return; + /* Always run sanitizer only if btf_type_tag attr exists. */ + if (len <= 0) { + if (!(IS_ENABLED(CONFIG_DEBUG_INFO_BTF) && + IS_ENABLED(CONFIG_PAHOLE_HAS_BTF_TAG) && + __has_attribute(btf_type_tag))) + return; + } map = start; - trace_event_eval_update(map, len); + trace_event_update_all(map, len); + + if (len <= 0) + return; trace_insert_eval_map_file(mod, start, len); } @@ -6297,7 +6300,7 @@ static bool tracer_options_updated; static void add_tracer_options(struct trace_array *tr, struct tracer *t) { /* Only enable if the directory has been created already. */ - if (!tr->dir) + if (!tr->dir && !(tr->flags & TRACE_ARRAY_FL_GLOBAL)) return; /* Only create trace option files after update_tracer_options finish */ @@ -8978,13 +8981,13 @@ static inline __init int register_snapshot_cmd(void) { return 0; } static struct dentry *tracing_get_dentry(struct trace_array *tr) { - if (WARN_ON(!tr->dir)) - return ERR_PTR(-ENODEV); - /* Top directory uses NULL as the parent */ if (tr->flags & TRACE_ARRAY_FL_GLOBAL) return NULL; + if (WARN_ON(!tr->dir)) + return ERR_PTR(-ENODEV); + /* All sub buffers have a descriptor */ return tr->dir; } @@ -10250,6 +10253,7 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) ftrace_init_tracefs(tr, d_tracer); } +#ifdef CONFIG_TRACEFS_AUTOMOUNT_DEPRECATED static struct vfsmount *trace_automount(struct dentry *mntpt, void *ingore) { struct vfsmount *mnt; @@ -10271,6 +10275,8 @@ static struct vfsmount *trace_automount(struct dentry *mntpt, void *ingore) if (IS_ERR(fc)) return ERR_CAST(fc); + pr_warn("NOTICE: Automounting of tracing to debugfs is deprecated and will be removed in 2030\n"); + ret = vfs_parse_fs_string(fc, "source", "tracefs", strlen("tracefs")); if (!ret) @@ -10281,6 +10287,7 @@ static struct vfsmount *trace_automount(struct dentry *mntpt, void *ingore) put_fs_context(fc); return mnt; } +#endif /** * tracing_init_dentry - initialize top level trace array @@ -10305,6 +10312,7 @@ int tracing_init_dentry(void) if (WARN_ON(!tracefs_initialized())) return -ENODEV; +#ifdef CONFIG_TRACEFS_AUTOMOUNT_DEPRECATED /* * As there may still be users that expect the tracing * files to exist in debugfs/tracing, we must automount @@ -10313,6 +10321,7 @@ int tracing_init_dentry(void) */ tr->dir = debugfs_create_automount("tracing", NULL, trace_automount, NULL); +#endif return 0; } @@ -10329,7 +10338,7 @@ static void __init eval_map_work_func(struct work_struct *work) int len; len = __stop_ftrace_eval_maps - __start_ftrace_eval_maps; - trace_insert_eval_map(NULL, __start_ftrace_eval_maps, len); + trace_event_update_with_eval_map(NULL, __start_ftrace_eval_maps, len); } static int __init trace_eval_init(void) @@ -10382,9 +10391,6 @@ bool module_exists(const char *module) static void trace_module_add_evals(struct module *mod) { - if (!mod->num_trace_evals) - return; - /* * Modules with bad taint do not have events created, do * not bother with enums either. @@ -10392,7 +10398,8 @@ static void trace_module_add_evals(struct module *mod) if (trace_module_has_bad_taint(mod)) return; - trace_insert_eval_map(mod, mod->trace_evals, mod->num_trace_evals); + /* Even if no trace_evals, this need to sanitize field types. */ + trace_event_update_with_eval_map(mod, mod->trace_evals, mod->num_trace_evals); } #ifdef CONFIG_TRACE_EVAL_MAP_FILE diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index bd084953a98b..1dbf1d3cf2f1 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -2125,13 +2125,13 @@ static inline const char *get_syscall_name(int syscall) #ifdef CONFIG_EVENT_TRACING void trace_event_init(void); -void trace_event_eval_update(struct trace_eval_map **map, int len); +void trace_event_update_all(struct trace_eval_map **map, int len); /* Used from boot time tracer */ extern int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set); extern int trigger_process_regex(struct trace_event_file *file, char *buff); #else static inline void __init trace_event_init(void) { } -static inline void trace_event_eval_update(struct trace_eval_map **map, int len) { } +static inline void trace_event_update_all(struct trace_eval_map **map, int len) { } #endif #ifdef CONFIG_TRACER_SNAPSHOT diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index d01e5c910ce1..9f3e9537417d 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -768,6 +768,7 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file, { struct trace_event_call *call = file->event_call; struct trace_array *tr = file->tr; + bool soft_mode = atomic_read(&file->sm_ref) != 0; int ret = 0; int disable; @@ -782,7 +783,7 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file, * is set we do not want the event to be enabled before we * clear the bit. * - * When soft_disable is not set but the SOFT_MODE flag is, + * When soft_disable is not set but the soft_mode is, * we do nothing. Do not disable the tracepoint, otherwise * "soft enable"s (clearing the SOFT_DISABLED bit) wont work. */ @@ -790,11 +791,11 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file, if (atomic_dec_return(&file->sm_ref) > 0) break; disable = file->flags & EVENT_FILE_FL_SOFT_DISABLED; - clear_bit(EVENT_FILE_FL_SOFT_MODE_BIT, &file->flags); + soft_mode = false; /* Disable use of trace_buffered_event */ trace_buffered_event_disable(); } else - disable = !(file->flags & EVENT_FILE_FL_SOFT_MODE); + disable = !soft_mode; if (disable && (file->flags & EVENT_FILE_FL_ENABLED)) { clear_bit(EVENT_FILE_FL_ENABLED_BIT, &file->flags); @@ -812,8 +813,8 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file, WARN_ON_ONCE(ret); } - /* If in SOFT_MODE, just set the SOFT_DISABLE_BIT, else clear it */ - if (file->flags & EVENT_FILE_FL_SOFT_MODE) + /* If in soft mode, just set the SOFT_DISABLE_BIT, else clear it */ + if (soft_mode) set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags); else clear_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags); @@ -823,7 +824,7 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file, * When soft_disable is set and enable is set, we want to * register the tracepoint for the event, but leave the event * as is. That means, if the event was already enabled, we do - * nothing (but set SOFT_MODE). If the event is disabled, we + * nothing (but set soft_mode). If the event is disabled, we * set SOFT_DISABLED before enabling the event tracepoint, so * it still seems to be disabled. */ @@ -832,7 +833,7 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file, else { if (atomic_inc_return(&file->sm_ref) > 1) break; - set_bit(EVENT_FILE_FL_SOFT_MODE_BIT, &file->flags); + soft_mode = true; /* Enable use of trace_buffered_event */ trace_buffered_event_enable(); } @@ -840,7 +841,7 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file, if (!(file->flags & EVENT_FILE_FL_ENABLED)) { bool cmd = false, tgid = false; - /* Keep the event disabled, when going to SOFT_MODE. */ + /* Keep the event disabled, when going to soft mode. */ if (soft_disable) set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags); @@ -1792,8 +1793,7 @@ event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, !(flags & EVENT_FILE_FL_SOFT_DISABLED)) strcpy(buf, "1"); - if (flags & EVENT_FILE_FL_SOFT_DISABLED || - flags & EVENT_FILE_FL_SOFT_MODE) + if (atomic_read(&file->sm_ref) != 0) strcat(buf, "*"); strcat(buf, "\n"); @@ -3267,43 +3267,120 @@ static void add_str_to_module(struct module *module, char *str) list_add(&modstr->next, &module_strings); } +#define ATTRIBUTE_STR "__attribute__(" +#define ATTRIBUTE_STR_LEN (sizeof(ATTRIBUTE_STR) - 1) + +/* Remove all __attribute__() from @type. Return allocated string or @type. */ +static char *sanitize_field_type(const char *type) +{ + char *attr, *tmp, *next, *ret = (char *)type; + int depth; + + next = (char *)type; + while ((attr = strstr(next, ATTRIBUTE_STR))) { + /* Retry if "__attribute__(" is a part of another word. */ + if (attr != next && !isspace(attr[-1])) { + next = attr + ATTRIBUTE_STR_LEN; + continue; + } + + if (ret == type) { + ret = kstrdup(type, GFP_KERNEL); + if (WARN_ON_ONCE(!ret)) + return NULL; + attr = ret + (attr - type); + } + + /* the ATTRIBUTE_STR already has the first '(' */ + depth = 1; + next = attr + ATTRIBUTE_STR_LEN; + do { + tmp = strpbrk(next, "()"); + /* There is unbalanced parentheses */ + if (WARN_ON_ONCE(!tmp)) { + kfree(ret); + return (char *)type; + } + + if (*tmp == '(') + depth++; + else + depth--; + next = tmp + 1; + } while (depth > 0); + next = skip_spaces(next); + strcpy(attr, next); + next = attr; + } + return ret; +} + +static char *find_replacable_eval(const char *type, const char *eval_string, + int len) +{ + char *ptr; + + if (!eval_string) + return NULL; + + ptr = strchr(type, '['); + if (!ptr) + return NULL; + ptr++; + + if (!isalpha(*ptr) && *ptr != '_') + return NULL; + + if (strncmp(eval_string, ptr, len) != 0) + return NULL; + + return ptr; +} + static void update_event_fields(struct trace_event_call *call, struct trace_eval_map *map) { struct ftrace_event_field *field; + const char *eval_string = NULL; struct list_head *head; + int len = 0; char *ptr; char *str; - int len = strlen(map->eval_string); /* Dynamic events should never have field maps */ - if (WARN_ON_ONCE(call->flags & TRACE_EVENT_FL_DYNAMIC)) + if (call->flags & TRACE_EVENT_FL_DYNAMIC) return; + if (map) { + eval_string = map->eval_string; + len = strlen(map->eval_string); + } + head = trace_get_fields(call); list_for_each_entry(field, head, link) { - ptr = strchr(field->type, '['); - if (!ptr) - continue; - ptr++; - - if (!isalpha(*ptr) && *ptr != '_') - continue; + str = sanitize_field_type(field->type); + if (!str) + return; - if (strncmp(map->eval_string, ptr, len) != 0) - continue; + ptr = find_replacable_eval(str, eval_string, len); + if (ptr) { + if (str == field->type) { + str = kstrdup(field->type, GFP_KERNEL); + if (WARN_ON_ONCE(!str)) + return; + ptr = str + (ptr - field->type); + } - str = kstrdup(field->type, GFP_KERNEL); - if (WARN_ON_ONCE(!str)) - return; - ptr = str + (ptr - field->type); - ptr = eval_replace(ptr, map, len); - /* enum/sizeof string smaller than value */ - if (WARN_ON_ONCE(!ptr)) { - kfree(str); - continue; + ptr = eval_replace(ptr, map, len); + /* enum/sizeof string smaller than value */ + if (WARN_ON_ONCE(!ptr)) { + kfree(str); + continue; + } } + if (str == field->type) + continue; /* * If the event is part of a module, then we need to free the string * when the module is removed. Otherwise, it will stay allocated @@ -3313,14 +3390,18 @@ static void update_event_fields(struct trace_event_call *call, add_str_to_module(call->module, str); field->type = str; + if (field->filter_type == FILTER_OTHER) + field->filter_type = filter_assign_type(field->type); } } -void trace_event_eval_update(struct trace_eval_map **map, int len) +/* Update all events for replacing eval and sanitizing */ +void trace_event_update_all(struct trace_eval_map **map, int len) { struct trace_event_call *call, *p; const char *last_system = NULL; bool first = false; + bool updated; int last_i; int i; @@ -3333,6 +3414,7 @@ void trace_event_eval_update(struct trace_eval_map **map, int len) last_system = call->class->system; } + updated = false; /* * Since calls are grouped by systems, the likelihood that the * next call in the iteration belongs to the same system as the @@ -3352,8 +3434,12 @@ void trace_event_eval_update(struct trace_eval_map **map, int len) } update_event_printk(call, map[i]); update_event_fields(call, map[i]); + updated = true; } } + /* If not updated yet, update field for sanitizing. */ + if (!updated) + update_event_fields(call, NULL); cond_resched(); } up_write(&trace_event_sem); @@ -3587,7 +3673,7 @@ static int probe_remove_event_call(struct trace_event_call *call) continue; /* * We can't rely on ftrace_event_enable_disable(enable => 0) - * we are going to do, EVENT_FILE_FL_SOFT_MODE can suppress + * we are going to do, soft mode can suppress * TRACE_REG_UNREGISTER. */ if (file->flags & EVENT_FILE_FL_ENABLED) @@ -3698,7 +3784,7 @@ static void trace_module_remove_events(struct module *mod) if (call->module == mod) __trace_remove_event_call(call); } - /* Check for any strings allocade for this module */ + /* Check for any strings allocated for this module */ list_for_each_entry_safe(modstr, m, &module_strings, next) { if (modstr->module != mod) continue; @@ -4002,7 +4088,7 @@ static int free_probe_data(void *data) edata->ref--; if (!edata->ref) { - /* Remove the SOFT_MODE flag */ + /* Remove soft mode */ __ftrace_event_enable_disable(edata->file, 0, 1); trace_event_put_ref(edata->file->event_call); kfree(edata); diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index e4581e10782b..54226b48b2d1 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1344,13 +1344,14 @@ struct filter_list { struct filter_head { struct list_head list; - struct rcu_head rcu; + union { + struct rcu_head rcu; + struct rcu_work rwork; + }; }; - -static void free_filter_list(struct rcu_head *rhp) +static void free_filter_list(struct filter_head *filter_list) { - struct filter_head *filter_list = container_of(rhp, struct filter_head, rcu); struct filter_list *filter_item, *tmp; list_for_each_entry_safe(filter_item, tmp, &filter_list->list, list) { @@ -1361,9 +1362,20 @@ static void free_filter_list(struct rcu_head *rhp) kfree(filter_list); } +static void free_filter_list_work(struct work_struct *work) +{ + struct filter_head *filter_list; + + filter_list = container_of(to_rcu_work(work), struct filter_head, rwork); + free_filter_list(filter_list); +} + static void free_filter_list_tasks(struct rcu_head *rhp) { - call_rcu(rhp, free_filter_list); + struct filter_head *filter_list = container_of(rhp, struct filter_head, rcu); + + INIT_RCU_WORK(&filter_list->rwork, free_filter_list_work); + queue_rcu_work(system_wq, &filter_list->rwork); } /* @@ -1460,7 +1472,7 @@ static void filter_free_subsystem_filters(struct trace_subsystem_dir *dir, tracepoint_synchronize_unregister(); if (head) - free_filter_list(&head->rcu); + free_filter_list(head); list_for_each_entry(file, &tr->events, list) { if (file->system != dir || !file->filter) @@ -2305,7 +2317,7 @@ static int process_system_preds(struct trace_subsystem_dir *dir, return 0; fail: /* No call succeeded */ - free_filter_list(&filter_list->rcu); + free_filter_list(filter_list); parse_error(pe, FILT_ERR_BAD_SUBSYS_FILTER, 0); return -EINVAL; fail_mem: @@ -2315,7 +2327,7 @@ static int process_system_preds(struct trace_subsystem_dir *dir, if (!fail) delay_free_filter(filter_list); else - free_filter_list(&filter_list->rcu); + free_filter_list(filter_list); return -ENOMEM; } diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index b65353ec2837..2f7b94e98317 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -325,12 +325,9 @@ static void move_to_next_cpu(void) cpus_read_lock(); cpumask_and(current_mask, cpu_online_mask, tr->tracing_cpumask); - next_cpu = cpumask_next(raw_smp_processor_id(), current_mask); + next_cpu = cpumask_next_wrap(raw_smp_processor_id(), current_mask); cpus_read_unlock(); - if (next_cpu >= nr_cpu_ids) - next_cpu = cpumask_first(current_mask); - if (next_cpu >= nr_cpu_ids) /* Shouldn't happen! */ goto change_mode; |