summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2025-10-05 09:43:36 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2025-10-05 09:43:36 -0700
commit21fbefc5886cc38cf7b57b28c35bd619efbce914 (patch)
tree8784f5113b7b9ed8bc9dc049f03a1bc17c37c4bd
parentd9f24f8e60798c066ead61f77e67ee6a5a204514 (diff)
parent61e19cd2e5c5235326a13a68df1a2f8ec4eeed7b (diff)
Merge tag 'trace-v6.18' of git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace
Pull tracing updates from Steven Rostedt: - Use READ_ONCE() and WRITE_ONCE() instead of RCU for syscall tracepoints Individual system call trace events are pseudo events attached to the raw_syscall trace events that just trace the entry and exit of all system calls. When any of these individual system call trace events get enabled, an element in an array indexed by the system call number is assigned to the trace file that defines how to trace it. When the trace event triggers, it reads this array and if the array has an element, it uses that trace file to know what to write it (the trace file defines the output format of the corresponding system call). The issue is that it uses rcu_dereference_ptr() and marks the elements of the array as using RCU. This is incorrect. There is no RCU synchronization here. The event file that is pointed to has a completely different way to make sure its freed properly. The reading of the array during the system call trace event is only to know if there is a value or not. If not, it does nothing (it means this system call isn't being traced). If it does, it uses the information to store the system call data. The RCU usage here can simply be replaced by READ_ONCE() and WRITE_ONCE() macros. - Have the system call trace events use "0x" for hex values Some system call trace events display hex values but do not have "0x" in front of it. Seeing "count: 44" can be assumed that it is 44 decimal when in actuality it is 44 hex (68 decimal). Display "0x44" instead. - Use vmalloc_array() in tracing_map_sort_entries() The function tracing_map_sort_entries() used array_size() and vmalloc() when it could have simply used vmalloc_array(). - Use for_each_online_cpu() in trace_osnoise.c() Instead of open coding for_each_cpu(cpu, cpu_online_mask), use for_each_online_cpu(). - Move the buffer field in struct trace_seq to the end The buffer field in struct trace_seq is architecture dependent in size, and caused padding for the fields after it. By moving the buffer to the end of the structure, it compacts the trace_seq structure better. - Remove redundant zeroing of cmdline_idx field in saved_cmdlines_buffer() The structure that contains cmdline_idx is zeroed by memset(), no need to explicitly zero any of its fields after that. - Use system_percpu_wq instead of system_wq in user_event_mm_remove() As system_wq is being deprecated, use the new wq. - Add cond_resched() is ftrace_module_enable() Some modules have a lot of functions (thousands of them), and the enabling of those functions can take some time. On non preemtable kernels, it was triggering a watchdog timeout. Add a cond_resched() to prevent that. - Add a BUILD_BUG_ON() to make sure PID_MAX_DEFAULT is always a power of 2 There's code that depends on PID_MAX_DEFAULT being a power of 2 or it will break. If in the future that changes, make sure the build fails to ensure that the code is fixed that depends on this. - Grab mutex_lock() before ever exiting s_start() The s_start() function is a seq_file start routine. As s_stop() is always called even if s_start() fails, and s_stop() expects the event_mutex to be held as it will always release it. That mutex must always be taken in s_start() even if that function fails. * tag 'trace-v6.18' of git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace: tracing: Fix lock imbalance in s_start() memory allocation failure path tracing: Ensure optimized hashing works ftrace: Fix softlockup in ftrace_module_enable tracing: replace use of system_wq with system_percpu_wq tracing: Remove redundant 0 value initialization tracing: Move buffer in trace_seq to end of struct tracing/osnoise: Use for_each_online_cpu() instead of for_each_cpu() tracing: Use vmalloc_array() to improve code tracing: Have syscall trace events show "0x" for values greater than 10 tracing: Replace syscall RCU pointer assignment with READ/WRITE_ONCE()
-rw-r--r--include/linux/trace_seq.h2
-rw-r--r--kernel/trace/ftrace.c2
-rw-r--r--kernel/trace/trace.h4
-rw-r--r--kernel/trace/trace_events.c3
-rw-r--r--kernel/trace/trace_events_user.c2
-rw-r--r--kernel/trace/trace_osnoise.c4
-rw-r--r--kernel/trace/trace_sched_switch.c3
-rw-r--r--kernel/trace/trace_syscalls.c26
-rw-r--r--kernel/trace/tracing_map.c2
9 files changed, 27 insertions, 21 deletions
diff --git a/include/linux/trace_seq.h b/include/linux/trace_seq.h
index a93ed5ac3226..557780fe1c77 100644
--- a/include/linux/trace_seq.h
+++ b/include/linux/trace_seq.h
@@ -21,10 +21,10 @@
(sizeof(struct seq_buf) + sizeof(size_t) + sizeof(int)))
struct trace_seq {
- char buffer[TRACE_SEQ_BUFFER_SIZE];
struct seq_buf seq;
size_t readpos;
int full;
+ char buffer[TRACE_SEQ_BUFFER_SIZE];
};
static inline void
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index a69067367c29..42bd2ba68a82 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -7535,6 +7535,8 @@ void ftrace_module_enable(struct module *mod)
if (!within_module(rec->ip, mod))
break;
+ cond_resched();
+
/* Weak functions should still be ignored */
if (!test_for_valid_rec(rec)) {
/* Clear all other flags. Should not be enabled anyway */
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 5f4bed5842f9..85eabb454bee 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -380,8 +380,8 @@ struct trace_array {
#ifdef CONFIG_FTRACE_SYSCALLS
int sys_refcount_enter;
int sys_refcount_exit;
- struct trace_event_file __rcu *enter_syscall_files[NR_syscalls];
- struct trace_event_file __rcu *exit_syscall_files[NR_syscalls];
+ struct trace_event_file *enter_syscall_files[NR_syscalls];
+ struct trace_event_file *exit_syscall_files[NR_syscalls];
#endif
int stop_count;
int clock_id;
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 9f3e9537417d..e00da4182deb 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1629,11 +1629,10 @@ static void *s_start(struct seq_file *m, loff_t *pos)
loff_t l;
iter = kzalloc(sizeof(*iter), GFP_KERNEL);
+ mutex_lock(&event_mutex);
if (!iter)
return NULL;
- mutex_lock(&event_mutex);
-
iter->type = SET_EVENT_FILE;
iter->file = list_entry(&tr->events, struct trace_event_file, list);
diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
index 2ab283fd3032..c428dafe7496 100644
--- a/kernel/trace/trace_events_user.c
+++ b/kernel/trace/trace_events_user.c
@@ -835,7 +835,7 @@ void user_event_mm_remove(struct task_struct *t)
* so we use a work queue after call_rcu() to run within.
*/
INIT_RCU_WORK(&mm->put_rwork, delayed_user_event_mm_put);
- queue_rcu_work(system_wq, &mm->put_rwork);
+ queue_rcu_work(system_percpu_wq, &mm->put_rwork);
}
void user_event_mm_dup(struct task_struct *t, struct user_event_mm *old_mm)
diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
index dc734867f0fc..12ee346820da 100644
--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c
@@ -271,7 +271,7 @@ static inline void tlat_var_reset(void)
* So far, all the values are initialized as 0, so
* zeroing the structure is perfect.
*/
- for_each_cpu(cpu, cpu_online_mask) {
+ for_each_online_cpu(cpu) {
tlat_var = per_cpu_ptr(&per_cpu_timerlat_var, cpu);
if (tlat_var->kthread)
hrtimer_cancel(&tlat_var->timer);
@@ -295,7 +295,7 @@ static inline void osn_var_reset(void)
* So far, all the values are initialized as 0, so
* zeroing the structure is perfect.
*/
- for_each_cpu(cpu, cpu_online_mask) {
+ for_each_online_cpu(cpu) {
osn_var = per_cpu_ptr(&per_cpu_osnoise_var, cpu);
memset(osn_var, 0, sizeof(*osn_var));
}
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index cb49f7279dc8..c46d584ded3b 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -224,7 +224,6 @@ static struct saved_cmdlines_buffer *allocate_cmdlines_buffer(unsigned int val)
/* Place map_cmdline_to_pid array right after saved_cmdlines */
s->map_cmdline_to_pid = (unsigned *)&s->saved_cmdlines[val * TASK_COMM_LEN];
- s->cmdline_idx = 0;
memset(&s->map_pid_to_cmdline, NO_CMDLINE_MAP,
sizeof(s->map_pid_to_cmdline));
memset(s->map_cmdline_to_pid, NO_CMDLINE_MAP,
@@ -248,6 +247,8 @@ int trace_save_cmdline(struct task_struct *tsk)
if (!tsk->pid)
return 1;
+ BUILD_BUG_ON(!is_power_of_2(PID_MAX_DEFAULT));
+
tpid = tsk->pid & (PID_MAX_DEFAULT - 1);
/*
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 46aab0ab9350..0f932b22f9ec 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -153,14 +153,20 @@ print_syscall_enter(struct trace_iterator *iter, int flags,
if (trace_seq_has_overflowed(s))
goto end;
+ if (i)
+ trace_seq_puts(s, ", ");
+
/* parameter types */
if (tr && tr->trace_flags & TRACE_ITER_VERBOSE)
trace_seq_printf(s, "%s ", entry->types[i]);
/* parameter values */
- trace_seq_printf(s, "%s: %lx%s", entry->args[i],
- trace->args[i],
- i == entry->nb_args - 1 ? "" : ", ");
+ if (trace->args[i] < 10)
+ trace_seq_printf(s, "%s: %lu", entry->args[i],
+ trace->args[i]);
+ else
+ trace_seq_printf(s, "%s: 0x%lx", entry->args[i],
+ trace->args[i]);
}
trace_seq_putc(s, ')');
@@ -310,8 +316,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
return;
- /* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE) */
- trace_file = rcu_dereference_sched(tr->enter_syscall_files[syscall_nr]);
+ trace_file = READ_ONCE(tr->enter_syscall_files[syscall_nr]);
if (!trace_file)
return;
@@ -356,8 +361,7 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
return;
- /* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE()) */
- trace_file = rcu_dereference_sched(tr->exit_syscall_files[syscall_nr]);
+ trace_file = READ_ONCE(tr->exit_syscall_files[syscall_nr]);
if (!trace_file)
return;
@@ -393,7 +397,7 @@ static int reg_event_syscall_enter(struct trace_event_file *file,
if (!tr->sys_refcount_enter)
ret = register_trace_sys_enter(ftrace_syscall_enter, tr);
if (!ret) {
- rcu_assign_pointer(tr->enter_syscall_files[num], file);
+ WRITE_ONCE(tr->enter_syscall_files[num], file);
tr->sys_refcount_enter++;
}
mutex_unlock(&syscall_trace_lock);
@@ -411,7 +415,7 @@ static void unreg_event_syscall_enter(struct trace_event_file *file,
return;
mutex_lock(&syscall_trace_lock);
tr->sys_refcount_enter--;
- RCU_INIT_POINTER(tr->enter_syscall_files[num], NULL);
+ WRITE_ONCE(tr->enter_syscall_files[num], NULL);
if (!tr->sys_refcount_enter)
unregister_trace_sys_enter(ftrace_syscall_enter, tr);
mutex_unlock(&syscall_trace_lock);
@@ -431,7 +435,7 @@ static int reg_event_syscall_exit(struct trace_event_file *file,
if (!tr->sys_refcount_exit)
ret = register_trace_sys_exit(ftrace_syscall_exit, tr);
if (!ret) {
- rcu_assign_pointer(tr->exit_syscall_files[num], file);
+ WRITE_ONCE(tr->exit_syscall_files[num], file);
tr->sys_refcount_exit++;
}
mutex_unlock(&syscall_trace_lock);
@@ -449,7 +453,7 @@ static void unreg_event_syscall_exit(struct trace_event_file *file,
return;
mutex_lock(&syscall_trace_lock);
tr->sys_refcount_exit--;
- RCU_INIT_POINTER(tr->exit_syscall_files[num], NULL);
+ WRITE_ONCE(tr->exit_syscall_files[num], NULL);
if (!tr->sys_refcount_exit)
unregister_trace_sys_exit(ftrace_syscall_exit, tr);
mutex_unlock(&syscall_trace_lock);
diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c
index 1921ade45be3..7f8da4dab69d 100644
--- a/kernel/trace/tracing_map.c
+++ b/kernel/trace/tracing_map.c
@@ -1076,7 +1076,7 @@ int tracing_map_sort_entries(struct tracing_map *map,
struct tracing_map_sort_entry *sort_entry, **entries;
int i, n_entries, ret;
- entries = vmalloc(array_size(sizeof(sort_entry), map->max_elts));
+ entries = vmalloc_array(map->max_elts, sizeof(sort_entry));
if (!entries)
return -ENOMEM;