diff options
35 files changed, 452 insertions, 328 deletions
diff --git a/Documentation/trace/ftrace.rst b/Documentation/trace/ftrace.rst index 4073ca48af4a..74d5bd801b1a 100644 --- a/Documentation/trace/ftrace.rst +++ b/Documentation/trace/ftrace.rst @@ -1031,9 +1031,6 @@ explains which is which. CPU#: The CPU which the process was running on. irqs-off: 'd' interrupts are disabled. '.' otherwise. - .. caution:: If the architecture does not support a way to - read the irq flags variable, an 'X' will always - be printed here. need-resched: - 'N' both TIF_NEED_RESCHED and PREEMPT_NEED_RESCHED is set, diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 3ace0d6227e3..eaee2a819f4c 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1645,6 +1645,11 @@ struct bpf_link { enum bpf_link_type type; const struct bpf_link_ops *ops; struct bpf_prog *prog; + /* whether BPF link itself has "sleepable" semantics, which can differ + * from underlying BPF program having a "sleepable" semantics, as BPF + * link's semantics is determined by target attach hook + */ + bool sleepable; /* rcu is used before freeing, work can be used to schedule that * RCU-based freeing before that, so they never overlap */ @@ -1661,8 +1666,10 @@ struct bpf_link_ops { */ void (*dealloc)(struct bpf_link *link); /* deallocate link resources callback, called after RCU grace period; - * if underlying BPF program is sleepable we go through tasks trace - * RCU GP and then "classic" RCU GP + * if either the underlying BPF program is sleepable or BPF link's + * target hook is sleepable, we'll go through tasks trace RCU GP and + * then "classic" RCU GP; this need for chaining tasks trace and + * classic RCU GPs is designated by setting bpf_link->sleepable flag */ void (*dealloc_deferred)(struct bpf_link *link); int (*detach)(struct bpf_link *link); @@ -2409,6 +2416,9 @@ int bpf_prog_new_fd(struct bpf_prog *prog); void bpf_link_init(struct bpf_link *link, enum bpf_link_type type, const struct bpf_link_ops *ops, struct bpf_prog *prog); +void bpf_link_init_sleepable(struct bpf_link *link, enum bpf_link_type type, + const struct bpf_link_ops *ops, struct bpf_prog *prog, + bool sleepable); int bpf_link_prime(struct bpf_link *link, struct bpf_link_primer *primer); int bpf_link_settle(struct bpf_link_primer *primer); void bpf_link_cleanup(struct bpf_link_primer *primer); @@ -2764,6 +2774,12 @@ static inline void bpf_link_init(struct bpf_link *link, enum bpf_link_type type, { } +static inline void bpf_link_init_sleepable(struct bpf_link *link, enum bpf_link_type type, + const struct bpf_link_ops *ops, struct bpf_prog *prog, + bool sleepable) +{ +} + static inline int bpf_link_prime(struct bpf_link *link, struct bpf_link_primer *primer) { diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 42bedcddd511..016b29a56c87 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -184,7 +184,6 @@ unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status); enum trace_flag_type { TRACE_FLAG_IRQS_OFF = 0x01, - TRACE_FLAG_IRQS_NOSUPPORT = 0x02, TRACE_FLAG_NEED_RESCHED = 0x04, TRACE_FLAG_HARDIRQ = 0x08, TRACE_FLAG_SOFTIRQ = 0x10, @@ -193,7 +192,6 @@ enum trace_flag_type { TRACE_FLAG_BH_OFF = 0x80, }; -#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT static inline unsigned int tracing_gen_ctx_flags(unsigned long irqflags) { unsigned int irq_status = irqs_disabled_flags(irqflags) ? @@ -207,17 +205,6 @@ static inline unsigned int tracing_gen_ctx(void) local_save_flags(irqflags); return tracing_gen_ctx_flags(irqflags); } -#else - -static inline unsigned int tracing_gen_ctx_flags(unsigned long irqflags) -{ - return tracing_gen_ctx_irq_test(TRACE_FLAG_IRQS_NOSUPPORT); -} -static inline unsigned int tracing_gen_ctx(void) -{ - return tracing_gen_ctx_irq_test(TRACE_FLAG_IRQS_NOSUPPORT); -} -#endif static inline unsigned int tracing_gen_ctx_dec(void) { @@ -326,7 +313,6 @@ void *trace_event_buffer_reserve(struct trace_event_buffer *fbuffer, void trace_event_buffer_commit(struct trace_event_buffer *fbuffer); enum { - TRACE_EVENT_FL_FILTERED_BIT, TRACE_EVENT_FL_CAP_ANY_BIT, TRACE_EVENT_FL_NO_SET_FILTER_BIT, TRACE_EVENT_FL_IGNORE_ENABLE_BIT, @@ -341,7 +327,6 @@ enum { /* * Event flags: - * FILTERED - The event has a filter attached * CAP_ANY - Any user can enable for perf * NO_SET_FILTER - Set when filter has error and is to be ignored * IGNORE_ENABLE - For trace internal events, do not enable with debugfs file @@ -356,7 +341,6 @@ enum { * to a tracepoint yet, then it is cleared when it is. */ enum { - TRACE_EVENT_FL_FILTERED = (1 << TRACE_EVENT_FL_FILTERED_BIT), TRACE_EVENT_FL_CAP_ANY = (1 << TRACE_EVENT_FL_CAP_ANY_BIT), TRACE_EVENT_FL_NO_SET_FILTER = (1 << TRACE_EVENT_FL_NO_SET_FILTER_BIT), TRACE_EVENT_FL_IGNORE_ENABLE = (1 << TRACE_EVENT_FL_IGNORE_ENABLE_BIT), @@ -381,7 +365,6 @@ struct trace_event_call { }; struct trace_event event; char *print_fmt; - struct event_filter *filter; /* * Static events can disappear with modules, * where as dynamic ones need their own ref count. diff --git a/include/linux/tracepoint-defs.h b/include/linux/tracepoint-defs.h index 4dc4955f0fbf..aebf0571c736 100644 --- a/include/linux/tracepoint-defs.h +++ b/include/linux/tracepoint-defs.h @@ -29,16 +29,22 @@ struct tracepoint_func { int prio; }; +struct tracepoint_ext { + int (*regfunc)(void); + void (*unregfunc)(void); + /* Flags. */ + unsigned int faultable:1; +}; + struct tracepoint { const char *name; /* Tracepoint name */ - struct static_key key; + struct static_key_false key; struct static_call_key *static_call_key; void *static_call_tramp; void *iterator; void *probestub; - int (*regfunc)(void); - void (*unregfunc)(void); struct tracepoint_func __rcu *funcs; + struct tracepoint_ext *ext; }; #ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS @@ -83,7 +89,7 @@ struct bpf_raw_event_map { #ifdef CONFIG_TRACEPOINTS # define tracepoint_enabled(tp) \ - static_key_false(&(__tracepoint_##tp).key) + static_branch_unlikely(&(__tracepoint_##tp).key) #else # define tracepoint_enabled(tracepoint) false #endif diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 93a9f3070b48..425123e921ac 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -17,6 +17,7 @@ #include <linux/errno.h> #include <linux/types.h> #include <linux/rcupdate.h> +#include <linux/rcupdate_trace.h> #include <linux/tracepoint-defs.h> #include <linux/static_call.h> @@ -32,8 +33,6 @@ struct trace_eval_map { #define TRACEPOINT_DEFAULT_PRIO 10 -extern struct srcu_struct tracepoint_srcu; - extern int tracepoint_probe_register(struct tracepoint *tp, void *probe, void *data); extern int @@ -105,16 +104,30 @@ void for_each_tracepoint_in_module(struct module *mod, * tracepoint_synchronize_unregister must be called between the last tracepoint * probe unregistration and the end of module exit to make sure there is no * caller executing a probe when it is freed. + * + * An alternative is to use the following for batch reclaim associated + * with a given tracepoint: + * + * - tracepoint_is_faultable() == false: call_rcu() + * - tracepoint_is_faultable() == true: call_rcu_tasks_trace() */ #ifdef CONFIG_TRACEPOINTS static inline void tracepoint_synchronize_unregister(void) { - synchronize_srcu(&tracepoint_srcu); + synchronize_rcu_tasks_trace(); synchronize_rcu(); } +static inline bool tracepoint_is_faultable(struct tracepoint *tp) +{ + return tp->ext && tp->ext->faultable; +} #else static inline void tracepoint_synchronize_unregister(void) { } +static inline bool tracepoint_is_faultable(struct tracepoint *tp) +{ + return false; +} #endif #ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS @@ -197,66 +210,35 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) #endif /* CONFIG_HAVE_STATIC_CALL */ /* - * ARCH_WANTS_NO_INSTR archs are expected to have sanitized entry and idle - * code that disallow any/all tracing/instrumentation when RCU isn't watching. - */ -#ifdef CONFIG_ARCH_WANTS_NO_INSTR -#define RCUIDLE_COND(rcuidle) (rcuidle) -#else -/* srcu can't be used from NMI */ -#define RCUIDLE_COND(rcuidle) (rcuidle && in_nmi()) -#endif - -/* * it_func[0] is never NULL because there is at least one element in the array * when the array itself is non NULL. + * + * With @syscall=0, the tracepoint callback array dereference is + * protected by disabling preemption. + * With @syscall=1, the tracepoint callback array dereference is + * protected by Tasks Trace RCU, which allows probes to handle page + * faults. */ -#define __DO_TRACE(name, args, cond, rcuidle) \ +#define __DO_TRACE(name, args, cond, syscall) \ do { \ int __maybe_unused __idx = 0; \ \ if (!(cond)) \ return; \ \ - if (WARN_ONCE(RCUIDLE_COND(rcuidle), \ - "Bad RCU usage for tracepoint")) \ - return; \ - \ - /* keep srcu and sched-rcu usage consistent */ \ - preempt_disable_notrace(); \ - \ - /* \ - * For rcuidle callers, use srcu since sched-rcu \ - * doesn't work from the idle path. \ - */ \ - if (rcuidle) { \ - __idx = srcu_read_lock_notrace(&tracepoint_srcu);\ - ct_irq_enter_irqson(); \ - } \ + if (syscall) \ + rcu_read_lock_trace(); \ + else \ + preempt_disable_notrace(); \ \ __DO_TRACE_CALL(name, TP_ARGS(args)); \ \ - if (rcuidle) { \ - ct_irq_exit_irqson(); \ - srcu_read_unlock_notrace(&tracepoint_srcu, __idx);\ - } \ - \ - preempt_enable_notrace(); \ + if (syscall) \ + rcu_read_unlock_trace(); \ + else \ + preempt_enable_notrace(); \ } while (0) -#ifndef MODULE -#define __DECLARE_TRACE_RCU(name, proto, args, cond) \ - static inline void trace_##name##_rcuidle(proto) \ - { \ - if (static_key_false(&__tracepoint_##name.key)) \ - __DO_TRACE(name, \ - TP_ARGS(args), \ - TP_CONDITION(cond), 1); \ - } -#else -#define __DECLARE_TRACE_RCU(name, proto, args, cond) -#endif - /* * Make sure the alignment of the structure in the __tracepoints section will * not add unwanted padding between the beginning of the section and the @@ -268,23 +250,10 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) * site if it is not watching, as it will need to be active when the * tracepoint is enabled. */ -#define __DECLARE_TRACE(name, proto, args, cond, data_proto) \ +#define __DECLARE_TRACE_COMMON(name, proto, args, cond, data_proto) \ extern int __traceiter_##name(data_proto); \ DECLARE_STATIC_CALL(tp_func_##name, __traceiter_##name); \ extern struct tracepoint __tracepoint_##name; \ - static inline void trace_##name(proto) \ - { \ - if (static_key_false(&__tracepoint_##name.key)) \ - __DO_TRACE(name, \ - TP_ARGS(args), \ - TP_CONDITION(cond), 0); \ - if (IS_ENABLED(CONFIG_LOCKDEP) && (cond)) { \ - WARN_ONCE(!rcu_is_watching(), \ - "RCU not watching for tracepoint"); \ - } \ - } \ - __DECLARE_TRACE_RCU(name, PARAMS(proto), PARAMS(args), \ - PARAMS(cond)) \ static inline int \ register_trace_##name(void (*probe)(data_proto), void *data) \ { \ @@ -311,7 +280,36 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) static inline bool \ trace_##name##_enabled(void) \ { \ - return static_key_false(&__tracepoint_##name.key); \ + return static_branch_unlikely(&__tracepoint_##name.key);\ + } + +#define __DECLARE_TRACE(name, proto, args, cond, data_proto) \ + __DECLARE_TRACE_COMMON(name, PARAMS(proto), PARAMS(args), cond, PARAMS(data_proto)) \ + static inline void trace_##name(proto) \ + { \ + if (static_branch_unlikely(&__tracepoint_##name.key)) \ + __DO_TRACE(name, \ + TP_ARGS(args), \ + TP_CONDITION(cond), 0); \ + if (IS_ENABLED(CONFIG_LOCKDEP) && (cond)) { \ + WARN_ONCE(!rcu_is_watching(), \ + "RCU not watching for tracepoint"); \ + } \ + } + +#define __DECLARE_TRACE_SYSCALL(name, proto, args, cond, data_proto) \ + __DECLARE_TRACE_COMMON(name, PARAMS(proto), PARAMS(args), cond, PARAMS(data_proto)) \ + static inline void trace_##name(proto) \ + { \ + might_fault(); \ + if (static_branch_unlikely(&__tracepoint_##name.key)) \ + __DO_TRACE(name, \ + TP_ARGS(args), \ + TP_CONDITION(cond), 1); \ + if (IS_ENABLED(CONFIG_LOCKDEP) && (cond)) { \ + WARN_ONCE(!rcu_is_watching(), \ + "RCU not watching for tracepoint"); \ + } \ } /* @@ -319,7 +317,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) * structures, so we create an array of pointers that will be used for iteration * on the tracepoints. */ -#define DEFINE_TRACE_FN(_name, _reg, _unreg, proto, args) \ +#define __DEFINE_TRACE_EXT(_name, _ext, proto, args) \ static const char __tpstrtab_##_name[] \ __section("__tracepoints_strings") = #_name; \ extern struct static_call_key STATIC_CALL_KEY(tp_func_##_name); \ @@ -328,14 +326,14 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) struct tracepoint __tracepoint_##_name __used \ __section("__tracepoints") = { \ .name = __tpstrtab_##_name, \ - .key = STATIC_KEY_INIT_FALSE, \ + .key = STATIC_KEY_FALSE_INIT, \ .static_call_key = &STATIC_CALL_KEY(tp_func_##_name), \ .static_call_tramp = STATIC_CALL_TRAMP_ADDR(tp_func_##_name), \ .iterator = &__traceiter_##_name, \ .probestub = &__probestub_##_name, \ - .regfunc = _reg, \ - .unregfunc = _unreg, \ - .funcs = NULL }; \ + .funcs = NULL, \ + .ext = _ext, \ + }; \ __TRACEPOINT_ENTRY(_name); \ int __traceiter_##_name(void *__data, proto) \ { \ @@ -358,8 +356,24 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) } \ DEFINE_STATIC_CALL(tp_func_##_name, __traceiter_##_name); -#define DEFINE_TRACE(name, proto, args) \ - DEFINE_TRACE_FN(name, NULL, NULL, PARAMS(proto), PARAMS(args)); +#define DEFINE_TRACE_FN(_name, _reg, _unreg, _proto, _args) \ + static struct tracepoint_ext __tracepoint_ext_##_name = { \ + .regfunc = _reg, \ + .unregfunc = _unreg, \ + .faultable = false, \ + }; \ + __DEFINE_TRACE_EXT(_name, &__tracepoint_ext_##_name, PARAMS(_proto), PARAMS(_args)); + +#define DEFINE_TRACE_SYSCALL(_name, _reg, _unreg, _proto, _args) \ + static struct tracepoint_ext __tracepoint_ext_##_name = { \ + .regfunc = _reg, \ + .unregfunc = _unreg, \ + .faultable = true, \ + }; \ + __DEFINE_TRACE_EXT(_name, &__tracepoint_ext_##_name, PARAMS(_proto), PARAMS(_args)); + +#define DEFINE_TRACE(_name, _proto, _args) \ + __DEFINE_TRACE_EXT(_name, NULL, PARAMS(_proto), PARAMS(_args)); #define EXPORT_TRACEPOINT_SYMBOL_GPL(name) \ EXPORT_SYMBOL_GPL(__tracepoint_##name); \ @@ -375,8 +389,6 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) #define __DECLARE_TRACE(name, proto, args, cond, data_proto) \ static inline void trace_##name(proto) \ { } \ - static inline void trace_##name##_rcuidle(proto) \ - { } \ static inline int \ register_trace_##name(void (*probe)(data_proto), \ void *data) \ @@ -398,7 +410,10 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) return false; \ } +#define __DECLARE_TRACE_SYSCALL __DECLARE_TRACE + #define DEFINE_TRACE_FN(name, reg, unreg, proto, args) +#define DEFINE_TRACE_SYSCALL(name, reg, unreg, proto, args) #define DEFINE_TRACE(name, proto, args) #define EXPORT_TRACEPOINT_SYMBOL_GPL(name) #define EXPORT_TRACEPOINT_SYMBOL(name) @@ -459,6 +474,11 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) cpu_online(raw_smp_processor_id()) && (PARAMS(cond)), \ PARAMS(void *__data, proto)) +#define DECLARE_TRACE_SYSCALL(name, proto, args) \ + __DECLARE_TRACE_SYSCALL(name, PARAMS(proto), PARAMS(args), \ + cpu_online(raw_smp_processor_id()), \ + PARAMS(void *__data, proto)) + #define TRACE_EVENT_FLAGS(event, flag) #define TRACE_EVENT_PERF_PERM(event, expr...) @@ -596,6 +616,9 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) struct, assign, print) \ DECLARE_TRACE_CONDITION(name, PARAMS(proto), \ PARAMS(args), PARAMS(cond)) +#define TRACE_EVENT_SYSCALL(name, proto, args, struct, assign, \ + print, reg, unreg) \ + DECLARE_TRACE_SYSCALL(name, PARAMS(proto), PARAMS(args)) #define TRACE_EVENT_FLAGS(event, flag) diff --git a/include/trace/bpf_probe.h b/include/trace/bpf_probe.h index a2ea11cc912e..183fa2aa2935 100644 --- a/include/trace/bpf_probe.h +++ b/include/trace/bpf_probe.h @@ -53,6 +53,20 @@ __bpf_trace_##call(void *__data, proto) \ #define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \ __BPF_DECLARE_TRACE(call, PARAMS(proto), PARAMS(args)) +#define __BPF_DECLARE_TRACE_SYSCALL(call, proto, args) \ +static notrace void \ +__bpf_trace_##call(void *__data, proto) \ +{ \ + might_fault(); \ + preempt_disable_notrace(); \ + CONCATENATE(bpf_trace_run, COUNT_ARGS(args))(__data, CAST_TO_U64(args)); \ + preempt_enable_notrace(); \ +} + +#undef DECLARE_EVENT_SYSCALL_CLASS +#define DECLARE_EVENT_SYSCALL_CLASS(call, proto, args, tstruct, assign, print) \ + __BPF_DECLARE_TRACE_SYSCALL(call, PARAMS(proto), PARAMS(args)) + /* * This part is compiled out, it is only here as a build time check * to make sure that if the tracepoint handling changes, the diff --git a/include/trace/define_trace.h b/include/trace/define_trace.h index 00723935dcc7..63fea2218afa 100644 --- a/include/trace/define_trace.h +++ b/include/trace/define_trace.h @@ -46,6 +46,10 @@ assign, print, reg, unreg) \ DEFINE_TRACE_FN(name, reg, unreg, PARAMS(proto), PARAMS(args)) +#undef TRACE_EVENT_SYSCALL +#define TRACE_EVENT_SYSCALL(name, proto, args, struct, assign, print, reg, unreg) \ + DEFINE_TRACE_SYSCALL(name, reg, unreg, PARAMS(proto), PARAMS(args)) + #undef TRACE_EVENT_NOP #define TRACE_EVENT_NOP(name, proto, args, struct, assign, print) @@ -107,6 +111,7 @@ #undef TRACE_EVENT #undef TRACE_EVENT_FN #undef TRACE_EVENT_FN_COND +#undef TRACE_EVENT_SYSCALL #undef TRACE_EVENT_CONDITION #undef TRACE_EVENT_NOP #undef DEFINE_EVENT_NOP diff --git a/include/trace/events/preemptirq.h b/include/trace/events/preemptirq.h index 3f249e150c0c..f99562d2b496 100644 --- a/include/trace/events/preemptirq.h +++ b/include/trace/events/preemptirq.h @@ -43,8 +43,6 @@ DEFINE_EVENT(preemptirq_template, irq_enable, #else #define trace_irq_enable(...) #define trace_irq_disable(...) -#define trace_irq_enable_rcuidle(...) -#define trace_irq_disable_rcuidle(...) #endif #ifdef CONFIG_TRACE_PREEMPT_TOGGLE @@ -58,8 +56,6 @@ DEFINE_EVENT(preemptirq_template, preempt_enable, #else #define trace_preempt_enable(...) #define trace_preempt_disable(...) -#define trace_preempt_enable_rcuidle(...) -#define trace_preempt_disable_rcuidle(...) #endif #endif /* _TRACE_PREEMPTIRQ_H */ @@ -69,10 +65,6 @@ DEFINE_EVENT(preemptirq_template, preempt_enable, #else /* !CONFIG_PREEMPTIRQ_TRACEPOINTS */ #define trace_irq_enable(...) #define trace_irq_disable(...) -#define trace_irq_enable_rcuidle(...) -#define trace_irq_disable_rcuidle(...) #define trace_preempt_enable(...) #define trace_preempt_disable(...) -#define trace_preempt_enable_rcuidle(...) -#define trace_preempt_disable_rcuidle(...) #endif diff --git a/include/trace/events/syscalls.h b/include/trace/events/syscalls.h index b6e0cbc2c71f..f31ff446b468 100644 --- a/include/trace/events/syscalls.h +++ b/include/trace/events/syscalls.h @@ -15,7 +15,7 @@ #ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS -TRACE_EVENT_FN(sys_enter, +TRACE_EVENT_SYSCALL(sys_enter, TP_PROTO(struct pt_regs *regs, long id), @@ -41,7 +41,7 @@ TRACE_EVENT_FN(sys_enter, TRACE_EVENT_FLAGS(sys_enter, TRACE_EVENT_FL_CAP_ANY) -TRACE_EVENT_FN(sys_exit, +TRACE_EVENT_SYSCALL(sys_exit, TP_PROTO(struct pt_regs *regs, long ret), diff --git a/include/trace/perf.h b/include/trace/perf.h index 2c11181c82e0..a1754b73a8f5 100644 --- a/include/trace/perf.h +++ b/include/trace/perf.h @@ -12,10 +12,10 @@ #undef __perf_task #define __perf_task(t) (__task = (t)) -#undef DECLARE_EVENT_CLASS -#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \ +#undef __DECLARE_EVENT_CLASS +#define __DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \ static notrace void \ -perf_trace_##call(void *__data, proto) \ +do_perf_trace_##call(void *__data, proto) \ { \ struct trace_event_call *event_call = __data; \ struct trace_event_data_offsets_##call __maybe_unused __data_offsets;\ @@ -56,6 +56,41 @@ perf_trace_##call(void *__data, proto) \ } /* + * Define unused __count and __task variables to use @args to pass + * arguments to do_perf_trace_##call. This is needed because the + * macros __perf_count and __perf_task introduce the side-effect to + * store copies into those local variables. + */ +#undef DECLARE_EVENT_CLASS +#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \ +__DECLARE_EVENT_CLASS(call, PARAMS(proto), PARAMS(args), PARAMS(tstruct), \ + PARAMS(assign), PARAMS(print)) \ +static notrace void \ +perf_trace_##call(void *__data, proto) \ +{ \ + u64 __count __attribute__((unused)); \ + struct task_struct *__task __attribute__((unused)); \ + \ + do_perf_trace_##call(__data, args); \ +} + +#undef DECLARE_EVENT_SYSCALL_CLASS +#define DECLARE_EVENT_SYSCALL_CLASS(call, proto, args, tstruct, assign, print) \ +__DECLARE_EVENT_CLASS(call, PARAMS(proto), PARAMS(args), PARAMS(tstruct), \ + PARAMS(assign), PARAMS(print)) \ +static notrace void \ +perf_trace_##call(void *__data, proto) \ +{ \ + u64 __count __attribute__((unused)); \ + struct task_struct *__task __attribute__((unused)); \ + \ + might_fault(); \ + preempt_disable_notrace(); \ + do_perf_trace_##call(__data, args); \ + preempt_enable_notrace(); \ +} + +/* * This part is compiled out, it is only here as a build time check * to make sure that if the tracepoint handling changes, the * perf probe will fail to compile unless it too is updated. @@ -73,4 +108,7 @@ static inline void perf_test_probe_##call(void) \ DEFINE_EVENT(template, name, PARAMS(proto), PARAMS(args)) #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) + +#undef __DECLARE_EVENT_CLASS + #endif /* CONFIG_PERF_EVENTS */ diff --git a/include/trace/trace_events.h b/include/trace/trace_events.h index c2f9cabf154d..4f22136fd465 100644 --- a/include/trace/trace_events.h +++ b/include/trace/trace_events.h @@ -45,6 +45,16 @@ PARAMS(print)); \ DEFINE_EVENT(name, name, PARAMS(proto), PARAMS(args)); +#undef TRACE_EVENT_SYSCALL +#define TRACE_EVENT_SYSCALL(name, proto, args, tstruct, assign, print, reg, unreg) \ + DECLARE_EVENT_SYSCALL_CLASS(name, \ + PARAMS(proto), \ + PARAMS(args), \ + PARAMS(tstruct), \ + PARAMS(assign), \ + PARAMS(print)); \ + DEFINE_EVENT(name, name, PARAMS(proto), PARAMS(args)); + #include "stages/stage1_struct_define.h" #undef DECLARE_EVENT_CLASS @@ -57,6 +67,9 @@ \ static struct trace_event_class event_class_##name; +#undef DECLARE_EVENT_SYSCALL_CLASS +#define DECLARE_EVENT_SYSCALL_CLASS DECLARE_EVENT_CLASS + #undef DEFINE_EVENT #define DEFINE_EVENT(template, name, proto, args) \ static struct trace_event_call __used \ @@ -117,6 +130,9 @@ tstruct; \ }; +#undef DECLARE_EVENT_SYSCALL_CLASS +#define DECLARE_EVENT_SYSCALL_CLASS DECLARE_EVENT_CLASS + #undef DEFINE_EVENT #define DEFINE_EVENT(template, name, proto, args) @@ -208,6 +224,9 @@ static struct trace_event_functions trace_event_type_funcs_##call = { \ .trace = trace_raw_output_##call, \ }; +#undef DECLARE_EVENT_SYSCALL_CLASS +#define DECLARE_EVENT_SYSCALL_CLASS DECLARE_EVENT_CLASS + #undef DEFINE_EVENT_PRINT #define DEFINE_EVENT_PRINT(template, call, proto, args, print) \ static notrace enum print_line_t \ @@ -244,6 +263,9 @@ static struct trace_event_fields trace_event_fields_##call[] = { \ tstruct \ {} }; +#undef DECLARE_EVENT_SYSCALL_CLASS +#define DECLARE_EVENT_SYSCALL_CLASS DECLARE_EVENT_CLASS + #undef DEFINE_EVENT_PRINT #define DEFINE_EVENT_PRINT(template, name, proto, args, print) @@ -265,6 +287,9 @@ static inline notrace int trace_event_get_offsets_##call( \ return __data_size; \ } +#undef DECLARE_EVENT_SYSCALL_CLASS +#define DECLARE_EVENT_SYSCALL_CLASS DECLARE_EVENT_CLASS + #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) /* @@ -374,11 +399,11 @@ static inline notrace int trace_event_get_offsets_##call( \ #include "stages/stage6_event_callback.h" -#undef DECLARE_EVENT_CLASS -#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \ - \ + +#undef __DECLARE_EVENT_CLASS +#define __DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \ static notrace void \ -trace_event_raw_event_##call(void *__data, proto) \ +do_trace_event_raw_event_##call(void *__data, proto) \ { \ struct trace_event_file *trace_file = __data; \ struct trace_event_data_offsets_##call __maybe_unused __data_offsets;\ @@ -403,6 +428,30 @@ trace_event_raw_event_##call(void *__data, proto) \ \ trace_event_buffer_commit(&fbuffer); \ } + +#undef DECLARE_EVENT_CLASS +#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \ +__DECLARE_EVENT_CLASS(call, PARAMS(proto), PARAMS(args), PARAMS(tstruct), \ + PARAMS(assign), PARAMS(print)) \ +static notrace void \ +trace_event_raw_event_##call(void *__data, proto) \ +{ \ + do_trace_event_raw_event_##call(__data, args); \ +} + +#undef DECLARE_EVENT_SYSCALL_CLASS +#define DECLARE_EVENT_SYSCALL_CLASS(call, proto, args, tstruct, assign, print) \ +__DECLARE_EVENT_CLASS(call, PARAMS(proto), PARAMS(args), PARAMS(tstruct), \ + PARAMS(assign), PARAMS(print)) \ +static notrace void \ +trace_event_raw_event_##call(void *__data, proto) \ +{ \ + might_fault(); \ + preempt_disable_notrace(); \ + do_trace_event_raw_event_##call(__data, args); \ + preempt_enable_notrace(); \ +} + /* * The ftrace_test_probe is compiled out, it is only here as a build time check * to make sure that if the tracepoint handling changes, the ftrace probe will @@ -418,6 +467,8 @@ static inline void ftrace_test_probe_##call(void) \ #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) +#undef __DECLARE_EVENT_CLASS + #include "stages/stage7_class_define.h" #undef DECLARE_EVENT_CLASS @@ -434,6 +485,9 @@ static struct trace_event_class __used __refdata event_class_##call = { \ _TRACE_PERF_INIT(call) \ }; +#undef DECLARE_EVENT_SYSCALL_CLASS +#define DECLARE_EVENT_SYSCALL_CLASS DECLARE_EVENT_CLASS + #undef DEFINE_EVENT #define DEFINE_EVENT(template, call, proto, args) \ \ diff --git a/init/Kconfig b/init/Kconfig index c521e1421ad4..b07f238f3bad 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1989,6 +1989,7 @@ config BINDGEN_VERSION_TEXT # config TRACEPOINTS bool + select TASKS_TRACE_RCU source "kernel/Kconfig.kexec" diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 58190ca724a2..5684e8ce132d 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -35,6 +35,7 @@ #include <linux/rcupdate_trace.h> #include <linux/memcontrol.h> #include <linux/trace_events.h> +#include <linux/tracepoint.h> #include <net/netfilter/nf_bpf_link.h> #include <net/netkit.h> @@ -3033,17 +3034,33 @@ static int bpf_obj_get(const union bpf_attr *attr) attr->file_flags); } -void bpf_link_init(struct bpf_link *link, enum bpf_link_type type, - const struct bpf_link_ops *ops, struct bpf_prog *prog) +/* bpf_link_init_sleepable() allows to specify whether BPF link itself has + * "sleepable" semantics, which normally would mean that BPF link's attach + * hook can dereference link or link's underlying program for some time after + * detachment due to RCU Tasks Trace-based lifetime protection scheme. + * BPF program itself can be non-sleepable, yet, because it's transitively + * reachable through BPF link, its freeing has to be delayed until after RCU + * Tasks Trace GP. + */ +void bpf_link_init_sleepable(struct bpf_link *link, enum bpf_link_type type, + const struct bpf_link_ops *ops, struct bpf_prog *prog, + bool sleepable) { WARN_ON(ops->dealloc && ops->dealloc_deferred); atomic64_set(&link->refcnt, 1); link->type = type; + link->sleepable = sleepable; link->id = 0; link->ops = ops; link->prog = prog; } +void bpf_link_init(struct bpf_link *link, enum bpf_link_type type, + const struct bpf_link_ops *ops, struct bpf_prog *prog) +{ + bpf_link_init_sleepable(link, type, ops, prog, false); +} + static void bpf_link_free_id(int id) { if (!id) @@ -3076,12 +3093,24 @@ void bpf_link_inc(struct bpf_link *link) atomic64_inc(&link->refcnt); } +static void bpf_link_dealloc(struct bpf_link *link) +{ + /* now that we know that bpf_link itself can't be reached, put underlying BPF program */ + if (link->prog) + bpf_prog_put(link->prog); + + /* free bpf_link and its containing memory */ + if (link->ops->dealloc_deferred) + link->ops->dealloc_deferred(link); + else + link->ops->dealloc(link); +} + static void bpf_link_defer_dealloc_rcu_gp(struct rcu_head *rcu) { struct bpf_link *link = container_of(rcu, struct bpf_link, rcu); - /* free bpf_link and its containing memory */ - link->ops->dealloc_deferred(link); + bpf_link_dealloc(link); } static void bpf_link_defer_dealloc_mult_rcu_gp(struct rcu_head *rcu) @@ -3096,26 +3125,27 @@ static void bpf_link_defer_dealloc_mult_rcu_gp(struct rcu_head *rcu) static void bpf_link_free(struct bpf_link *link) { const struct bpf_link_ops *ops = link->ops; - bool sleepable = false; bpf_link_free_id(link->id); - if (link->prog) { - sleepable = link->prog->sleepable; - /* detach BPF program, clean up used resources */ + /* detach BPF program, clean up used resources */ + if (link->prog) ops->release(link); - bpf_prog_put(link->prog); - } if (ops->dealloc_deferred) { - /* schedule BPF link deallocation; if underlying BPF program - * is sleepable, we need to first wait for RCU tasks trace - * sync, then go through "classic" RCU grace period + /* Schedule BPF link deallocation, which will only then + * trigger putting BPF program refcount. + * If underlying BPF program is sleepable or BPF link's target + * attach hookpoint is sleepable or otherwise requires RCU GPs + * to ensure link and its underlying BPF program is not + * reachable anymore, we need to first wait for RCU tasks + * trace sync, and then go through "classic" RCU grace period */ - if (sleepable) + if (link->sleepable || (link->prog && link->prog->sleepable)) call_rcu_tasks_trace(&link->rcu, bpf_link_defer_dealloc_mult_rcu_gp); else call_rcu(&link->rcu, bpf_link_defer_dealloc_rcu_gp); - } else if (ops->dealloc) - ops->dealloc(link); + } else if (ops->dealloc) { + bpf_link_dealloc(link); + } } static void bpf_link_put_deferred(struct work_struct *work) @@ -3936,8 +3966,9 @@ static int bpf_raw_tp_link_attach(struct bpf_prog *prog, err = -ENOMEM; goto out_put_btp; } - bpf_link_init(&link->link, BPF_LINK_TYPE_RAW_TRACEPOINT, - &bpf_raw_tp_link_lops, prog); + bpf_link_init_sleepable(&link->link, BPF_LINK_TYPE_RAW_TRACEPOINT, + &bpf_raw_tp_link_lops, prog, + tracepoint_is_faultable(btp->tp)); link->btp = btp; link->cookie = cookie; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 71e53eaba8bc..9b17efb1a87d 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5093,6 +5093,9 @@ ftrace_mod_callback(struct trace_array *tr, struct ftrace_hash *hash, char *func; int ret; + if (!tr) + return -ENODEV; + /* match_records() modifies func, and we need the original */ func = kstrdup(func_orig, GFP_KERNEL); if (!func) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 103b30b1c73b..3ef047ed9705 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -593,19 +593,6 @@ int tracing_check_open_get_tr(struct trace_array *tr) return 0; } -int call_filter_check_discard(struct trace_event_call *call, void *rec, - struct trace_buffer *buffer, - struct ring_buffer_event *event) -{ - if (unlikely(call->flags & TRACE_EVENT_FL_FILTERED) && - !filter_match_preds(call->filter, rec)) { - __trace_event_discard_commit(buffer, event); - return 1; - } - - return 0; -} - /** * trace_find_filtered_pid - check if a pid exists in a filtered_pid list * @filtered_pids: The list of pids to check @@ -988,7 +975,8 @@ static inline void trace_access_lock_init(void) #endif #ifdef CONFIG_STACKTRACE -static void __ftrace_trace_stack(struct trace_buffer *buffer, +static void __ftrace_trace_stack(struct trace_array *tr, + struct trace_buffer *buffer, unsigned int trace_ctx, int skip, struct pt_regs *regs); static inline void ftrace_trace_stack(struct trace_array *tr, @@ -997,7 +985,8 @@ static inline void ftrace_trace_stack(struct trace_array *tr, int skip, struct pt_regs *regs); #else -static inline void __ftrace_trace_stack(struct trace_buffer *buffer, +static inline void __ftrace_trace_stack(struct trace_array *tr, + struct trace_buffer *buffer, unsigned int trace_ctx, int skip, struct pt_regs *regs) { @@ -1934,7 +1923,7 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) max_data->critical_start = data->critical_start; max_data->critical_end = data->critical_end; - strncpy(max_data->comm, tsk->comm, TASK_COMM_LEN); + strscpy(max_data->comm, tsk->comm); max_data->pid = tsk->pid; /* * If tsk == current, then use current_uid(), as that does not use @@ -2908,7 +2897,6 @@ void trace_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, unsigned int trace_ctx) { - struct trace_event_call *call = &event_function; struct trace_buffer *buffer = tr->array_buffer.buffer; struct ring_buffer_event *event; struct ftrace_entry *entry; @@ -2921,11 +2909,9 @@ trace_function(struct trace_array *tr, unsigned long ip, unsigned long entry->ip = ip; entry->parent_ip = parent_ip; - if (!call_filter_check_discard(call, entry, buffer, event)) { - if (static_branch_unlikely(&trace_function_exports_enabled)) - ftrace_exports(event, TRACE_EXPORT_FUNCTION); - __buffer_unlock_commit(buffer, event); - } + if (static_branch_unlikely(&trace_function_exports_enabled)) + ftrace_exports(event, TRACE_EXPORT_FUNCTION); + __buffer_unlock_commit(buffer, event); } #ifdef CONFIG_STACKTRACE @@ -2933,7 +2919,7 @@ trace_function(struct trace_array *tr, unsigned long ip, unsigned long /* Allow 4 levels of nesting: normal, softirq, irq, NMI */ #define FTRACE_KSTACK_NESTING 4 -#define FTRACE_KSTACK_ENTRIES (PAGE_SIZE / FTRACE_KSTACK_NESTING) +#define FTRACE_KSTACK_ENTRIES (SZ_4K / FTRACE_KSTACK_NESTING) struct ftrace_stack { unsigned long calls[FTRACE_KSTACK_ENTRIES]; @@ -2947,11 +2933,11 @@ struct ftrace_stacks { static DEFINE_PER_CPU(struct ftrace_stacks, ftrace_stacks); static DEFINE_PER_CPU(int, ftrace_stack_reserve); -static void __ftrace_trace_stack(struct trace_buffer *buffer, +static void __ftrace_trace_stack(struct trace_array *tr, + struct trace_buffer *buffer, unsigned int trace_ctx, int skip, struct pt_regs *regs) { - struct trace_event_call *call = &event_kernel_stack; struct ring_buffer_event *event; unsigned int size, nr_entries; struct ftrace_stack *fstack; @@ -2994,6 +2980,20 @@ static void __ftrace_trace_stack(struct trace_buffer *buffer, nr_entries = stack_trace_save(fstack->calls, size, skip); } +#ifdef CONFIG_DYNAMIC_FTRACE + /* Mark entry of stack trace as trampoline code */ + if (tr->ops && tr->ops->trampoline) { + unsigned long tramp_start = tr->ops->trampoline; + unsigned long tramp_end = tramp_start + tr->ops->trampoline_size; + unsigned long *calls = fstack->calls; + + for (int i = 0; i < nr_entries; i++) { + if (calls[i] >= tramp_start && calls[i] < tramp_end) + calls[i] = FTRACE_TRAMPOLINE_MARKER; + } + } +#endif + event = __trace_buffer_lock_reserve(buffer, TRACE_STACK, struct_size(entry, caller, nr_entries), trace_ctx); @@ -3005,8 +3005,7 @@ static void __ftrace_trace_stack(struct trace_buffer *buffer, memcpy(&entry->caller, fstack->calls, flex_array_size(entry, caller, nr_entries)); - if (!call_filter_check_discard(call, entry, buffer, event)) - __buffer_unlock_commit(buffer, event); + __buffer_unlock_commit(buffer, event); out: /* Again, don't let gcc optimize things here */ @@ -3024,7 +3023,7 @@ static inline void ftrace_trace_stack(struct trace_array *tr, if (!(tr->trace_flags & TRACE_ITER_STACKTRACE)) return; - __ftrace_trace_stack(buffer, trace_ctx, skip, regs); + __ftrace_trace_stack(tr, buffer, trace_ctx, skip, regs); } void __trace_stack(struct trace_array *tr, unsigned int trace_ctx, @@ -3033,7 +3032,7 @@ void __trace_stack(struct trace_array *tr, unsigned int trace_ctx, struct trace_buffer *buffer = tr->array_buffer.buffer; if (rcu_is_watching()) { - __ftrace_trace_stack(buffer, trace_ctx, skip, NULL); + __ftrace_trace_stack(tr, buffer, trace_ctx, skip, NULL); return; } @@ -3050,7 +3049,7 @@ void __trace_stack(struct trace_array *tr, unsigned int trace_ctx, return; ct_irq_enter_irqson(); - __ftrace_trace_stack(buffer, trace_ctx, skip, NULL); + __ftrace_trace_stack(tr, buffer, trace_ctx, skip, NULL); ct_irq_exit_irqson(); } @@ -3067,8 +3066,8 @@ void trace_dump_stack(int skip) /* Skip 1 to skip this function. */ skip++; #endif - __ftrace_trace_stack(printk_trace->array_buffer.buffer, - tracing_gen_ctx(), skip, NULL); + __ftrace_trace_stack(printk_trace, printk_trace->array_buffer.buffer, + tracing_gen_ctx(), skip, NULL); } EXPORT_SYMBOL_GPL(trace_dump_stack); @@ -3079,7 +3078,6 @@ static void ftrace_trace_userstack(struct trace_array *tr, struct trace_buffer *buffer, unsigned int trace_ctx) { - struct trace_event_call *call = &event_user_stack; struct ring_buffer_event *event; struct userstack_entry *entry; @@ -3113,8 +3111,7 @@ ftrace_trace_userstack(struct trace_array *tr, memset(&entry->caller, 0, sizeof(entry->caller)); stack_trace_save_user(entry->caller, FTRACE_STACK_ENTRIES); - if (!call_filter_check_discard(call, entry, buffer, event)) - __buffer_unlock_commit(buffer, event); + __buffer_unlock_commit(buffer, event); out_drop_count: __this_cpu_dec(user_stack_count); @@ -3283,7 +3280,6 @@ static void trace_printk_start_stop_comm(int enabled) */ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) { - struct trace_event_call *call = &event_bprint; struct ring_buffer_event *event; struct trace_buffer *buffer; struct trace_array *tr = READ_ONCE(printk_trace); @@ -3327,10 +3323,8 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) entry->fmt = fmt; memcpy(entry->buf, tbuffer, sizeof(u32) * len); - if (!call_filter_check_discard(call, entry, buffer, event)) { - __buffer_unlock_commit(buffer, event); - ftrace_trace_stack(tr, buffer, trace_ctx, 6, NULL); - } + __buffer_unlock_commit(buffer, event); + ftrace_trace_stack(tr, buffer, trace_ctx, 6, NULL); out: ring_buffer_nest_end(buffer); @@ -3350,7 +3344,6 @@ static int __trace_array_vprintk(struct trace_buffer *buffer, unsigned long ip, const char *fmt, va_list args) { - struct trace_event_call *call = &event_print; struct ring_buffer_event *event; int len = 0, size; struct print_entry *entry; @@ -3385,10 +3378,8 @@ __trace_array_vprintk(struct trace_buffer *buffer, entry->ip = ip; memcpy(&entry->buf, tbuffer, len + 1); - if (!call_filter_check_discard(call, entry, buffer, event)) { - __buffer_unlock_commit(buffer, event); - ftrace_trace_stack(printk_trace, buffer, trace_ctx, 6, NULL); - } + __buffer_unlock_commit(buffer, event); + ftrace_trace_stack(printk_trace, buffer, trace_ctx, 6, NULL); out: ring_buffer_nest_end(buffer); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 3307dad4d917..266740b4e121 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1440,10 +1440,6 @@ struct trace_subsystem_dir { int nr_events; }; -extern int call_filter_check_discard(struct trace_event_call *call, void *rec, - struct trace_buffer *buffer, - struct ring_buffer_event *event); - void trace_buffer_unlock_commit_regs(struct trace_array *tr, struct trace_buffer *buffer, struct ring_buffer_event *event, @@ -2187,4 +2183,11 @@ static inline int rv_init_interface(void) } #endif +/* + * This is used only to distinguish + * function address from trampoline code. + * So this value has no meaning. + */ +#define FTRACE_TRAMPOLINE_MARKER ((unsigned long) INT_MAX) + #endif /* _LINUX_KERNEL_TRACE_H */ diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index e47fdb4c92fb..6d08a5523ce0 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -30,7 +30,6 @@ static struct trace_array *branch_tracer; static void probe_likely_condition(struct ftrace_likely_data *f, int val, int expect) { - struct trace_event_call *call = &event_branch; struct trace_array *tr = branch_tracer; struct trace_buffer *buffer; struct trace_array_cpu *data; @@ -74,16 +73,13 @@ probe_likely_condition(struct ftrace_likely_data *f, int val, int expect) p--; p++; - strncpy(entry->func, f->data.func, TRACE_FUNC_SIZE); - strncpy(entry->file, p, TRACE_FILE_SIZE); - entry->func[TRACE_FUNC_SIZE] = 0; - entry->file[TRACE_FILE_SIZE] = 0; + strscpy(entry->func, f->data.func); + strscpy(entry->file, p); entry->constant = f->constant; entry->line = f->data.line; entry->correct = val == expect; - if (!call_filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit_nostack(buffer, event); + trace_buffer_unlock_commit_nostack(buffer, event); out: current->trace_recursion &= ~TRACE_BRANCH_BIT; diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index 4702efb00ff2..4cb2ebc439be 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c @@ -154,5 +154,5 @@ static atomic64_t trace_counter; */ u64 notrace trace_clock_counter(void) { - return atomic64_add_return(1, &trace_counter); + return atomic64_inc_return(&trace_counter); } diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 05e791241812..3ff9caa4a71b 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -352,10 +352,16 @@ void perf_uprobe_destroy(struct perf_event *p_event) int perf_trace_add(struct perf_event *p_event, int flags) { struct trace_event_call *tp_event = p_event->tp_event; + struct hw_perf_event *hwc = &p_event->hw; if (!(flags & PERF_EF_START)) p_event->hw.state = PERF_HES_STOPPED; + if (is_sampling_event(p_event)) { + hwc->last_period = hwc->sample_period; + perf_swevent_set_period(p_event); + } + /* * If TRACE_REG_PERF_ADD returns false; no custom action was performed * and we need to take the default action of enqueueing our event on diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 7266ec2a4eea..77e68efbd43e 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -3149,8 +3149,6 @@ static void __trace_remove_event_call(struct trace_event_call *call) { event_remove(call); trace_destroy_fields(call); - free_event_filter(call->filter); - call->filter = NULL; } static int probe_remove_event_call(struct trace_event_call *call) diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 0c611b281a5b..78051de581e7 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1616,7 +1616,7 @@ static int parse_pred(const char *str, void *data, goto err_free; } - strncpy(num_buf, str + s, len); + memcpy(num_buf, str + s, len); num_buf[len] = 0; ret = kstrtoul(num_buf, 0, &ip); @@ -1694,7 +1694,7 @@ static int parse_pred(const char *str, void *data, if (!pred->regex) goto err_mem; pred->regex->len = len; - strncpy(pred->regex->pattern, str + s, len); + memcpy(pred->regex->pattern, str + s, len); pred->regex->pattern[len] = 0; } else if (!strncmp(str + i, "CPUS", 4)) { @@ -1859,7 +1859,7 @@ static int parse_pred(const char *str, void *data, if (!pred->regex) goto err_mem; pred->regex->len = len; - strncpy(pred->regex->pattern, str + s, len); + memcpy(pred->regex->pattern, str + s, len); pred->regex->pattern[len] = 0; filter_build_regex(pred); @@ -1919,7 +1919,7 @@ static int parse_pred(const char *str, void *data, goto err_free; } - strncpy(num_buf, str + s, len); + memcpy(num_buf, str + s, len); num_buf[len] = 0; /* Make sure it is a value */ diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 5f9119eb7c67..9c058aa8baf3 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -822,7 +822,7 @@ static inline void trace_synth(struct synth_event *event, u64 *var_ref_vals, { struct tracepoint *tp = event->tp; - if (unlikely(atomic_read(&tp->key.enabled) > 0)) { + if (unlikely(static_key_enabled(&tp->key))) { struct tracepoint_func *probe_func_ptr; synth_probe_func_t probe_func; void *__data; @@ -1354,10 +1354,7 @@ static const char *hist_field_name(struct hist_field *field, } else if (field->flags & HIST_FIELD_FL_TIMESTAMP) field_name = "common_timestamp"; else if (field->flags & HIST_FIELD_FL_STACKTRACE) { - if (field->field) - field_name = field->field->name; - else - field_name = "common_stacktrace"; + field_name = "common_stacktrace"; } else if (field->flags & HIST_FIELD_FL_HITCOUNT) field_name = "hitcount"; @@ -1599,7 +1596,7 @@ static inline void save_comm(char *comm, struct task_struct *task) return; } - strncpy(comm, task->comm, TASK_COMM_LEN); + strscpy(comm, task->comm, TASK_COMM_LEN); } static void hist_elt_data_free(struct hist_elt_data *elt_data) @@ -3405,7 +3402,7 @@ static bool cond_snapshot_update(struct trace_array *tr, void *cond_data) elt_data = context->elt->private_data; track_elt_data = track_data->elt.private_data; if (elt_data->comm) - strncpy(track_elt_data->comm, elt_data->comm, TASK_COMM_LEN); + strscpy(track_elt_data->comm, elt_data->comm, TASK_COMM_LEN); track_data->updated = true; diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index 42b0d998d103..17bcad8f79de 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -1676,7 +1676,7 @@ static void update_enable_bit_for(struct user_event *user) struct tracepoint *tp = &user->tracepoint; char status = 0; - if (atomic_read(&tp->key.enabled) > 0) { + if (static_key_enabled(&tp->key)) { struct tracepoint_func *probe_func_ptr; user_event_func_t probe_func; @@ -2280,7 +2280,7 @@ static ssize_t user_events_write_core(struct file *file, struct iov_iter *i) * It's possible key.enabled disables after this check, however * we don't mind if a few events are included in this condition. */ - if (likely(atomic_read(&tp->key.enabled) > 0)) { + if (likely(static_key_enabled(&tp->key))) { struct tracepoint_func *probe_func_ptr; user_event_func_t probe_func; struct iov_iter copy; diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 3b0cea37e029..74c353164ca1 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -176,6 +176,27 @@ static void function_trace_start(struct trace_array *tr) tracing_reset_online_cpus(&tr->array_buffer); } +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +static __always_inline unsigned long +function_get_true_parent_ip(unsigned long parent_ip, struct ftrace_regs *fregs) +{ + unsigned long true_parent_ip; + int idx = 0; + + true_parent_ip = parent_ip; + if (unlikely(parent_ip == (unsigned long)&return_to_handler) && fregs) + true_parent_ip = ftrace_graph_ret_addr(current, &idx, parent_ip, + (unsigned long *)ftrace_regs_get_stack_pointer(fregs)); + return true_parent_ip; +} +#else +static __always_inline unsigned long +function_get_true_parent_ip(unsigned long parent_ip, struct ftrace_regs *fregs) +{ + return parent_ip; +} +#endif + static void function_trace_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct ftrace_regs *fregs) @@ -184,7 +205,6 @@ function_trace_call(unsigned long ip, unsigned long parent_ip, struct trace_array_cpu *data; unsigned int trace_ctx; int bit; - int cpu; if (unlikely(!tr->function_enabled)) return; @@ -193,10 +213,11 @@ function_trace_call(unsigned long ip, unsigned long parent_ip, if (bit < 0) return; + parent_ip = function_get_true_parent_ip(parent_ip, fregs); + trace_ctx = tracing_gen_ctx(); - cpu = smp_processor_id(); - data = per_cpu_ptr(tr->array_buffer.data, cpu); + data = this_cpu_ptr(tr->array_buffer.data); if (!atomic_read(&data->disabled)) trace_function(tr, ip, parent_ip, trace_ctx); @@ -241,6 +262,7 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip, * recursive protection is performed. */ local_irq_save(flags); + parent_ip = function_get_true_parent_ip(parent_ip, fregs); cpu = raw_smp_processor_id(); data = per_cpu_ptr(tr->array_buffer.data, cpu); disabled = atomic_inc_return(&data->disabled); @@ -300,7 +322,6 @@ function_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip, unsigned int trace_ctx; unsigned long flags; int bit; - int cpu; if (unlikely(!tr->function_enabled)) return; @@ -309,8 +330,8 @@ function_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip, if (bit < 0) return; - cpu = smp_processor_id(); - data = per_cpu_ptr(tr->array_buffer.data, cpu); + parent_ip = function_get_true_parent_ip(parent_ip, fregs); + data = this_cpu_ptr(tr->array_buffer.data); if (atomic_read(&data->disabled)) goto out; @@ -321,7 +342,7 @@ function_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip, * TODO: think about a solution that is better than just hoping to be * lucky. */ - last_info = per_cpu_ptr(tr->last_func_repeats, cpu); + last_info = this_cpu_ptr(tr->last_func_repeats); if (is_repeat_check(tr, last_info, ip, parent_ip)) goto out; @@ -356,6 +377,7 @@ function_stack_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip, * recursive protection is performed. */ local_irq_save(flags); + parent_ip = function_get_true_parent_ip(parent_ip, fregs); cpu = raw_smp_processor_id(); data = per_cpu_ptr(tr->array_buffer.data, cpu); disabled = atomic_inc_return(&data->disabled); diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 03c5a0d300a5..5504b5e4e7b4 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -114,7 +114,6 @@ int __trace_graph_entry(struct trace_array *tr, struct ftrace_graph_ent *trace, unsigned int trace_ctx) { - struct trace_event_call *call = &event_funcgraph_entry; struct ring_buffer_event *event; struct trace_buffer *buffer = tr->array_buffer.buffer; struct ftrace_graph_ent_entry *entry; @@ -125,8 +124,7 @@ int __trace_graph_entry(struct trace_array *tr, return 0; entry = ring_buffer_event_data(event); entry->graph_ent = *trace; - if (!call_filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit_nostack(buffer, event); + trace_buffer_unlock_commit_nostack(buffer, event); return 1; } @@ -292,7 +290,6 @@ void __trace_graph_return(struct trace_array *tr, struct ftrace_graph_ret *trace, unsigned int trace_ctx) { - struct trace_event_call *call = &event_funcgraph_exit; struct ring_buffer_event *event; struct trace_buffer *buffer = tr->array_buffer.buffer; struct ftrace_graph_ret_entry *entry; @@ -303,8 +300,7 @@ void __trace_graph_return(struct trace_array *tr, return; entry = ring_buffer_event_data(event); entry->ret = *trace; - if (!call_filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit_nostack(buffer, event); + trace_buffer_unlock_commit_nostack(buffer, event); } static void handle_nosleeptime(struct ftrace_graph_ret *trace, diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index 3bd6071441ad..b65353ec2837 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -130,7 +130,6 @@ static bool hwlat_busy; static void trace_hwlat_sample(struct hwlat_sample *sample) { struct trace_array *tr = hwlat_trace; - struct trace_event_call *call = &event_hwlat; struct trace_buffer *buffer = tr->array_buffer.buffer; struct ring_buffer_event *event; struct hwlat_entry *entry; @@ -148,8 +147,7 @@ static void trace_hwlat_sample(struct hwlat_sample *sample) entry->nmi_count = sample->nmi_count; entry->count = sample->count; - if (!call_filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit_nostack(buffer, event); + trace_buffer_unlock_commit_nostack(buffer, event); } /* Macros to encapsulate the time capturing infrastructure */ diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index 64e77b513697..ba5858866b2f 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -294,7 +294,6 @@ static void __trace_mmiotrace_rw(struct trace_array *tr, struct trace_array_cpu *data, struct mmiotrace_rw *rw) { - struct trace_event_call *call = &event_mmiotrace_rw; struct trace_buffer *buffer = tr->array_buffer.buffer; struct ring_buffer_event *event; struct trace_mmiotrace_rw *entry; @@ -310,8 +309,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr, entry = ring_buffer_event_data(event); entry->rw = *rw; - if (!call_filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit(tr, buffer, event, trace_ctx); + trace_buffer_unlock_commit(tr, buffer, event, trace_ctx); } void mmio_trace_rw(struct mmiotrace_rw *rw) @@ -325,7 +323,6 @@ static void __trace_mmiotrace_map(struct trace_array *tr, struct trace_array_cpu *data, struct mmiotrace_map *map) { - struct trace_event_call *call = &event_mmiotrace_map; struct trace_buffer *buffer = tr->array_buffer.buffer; struct ring_buffer_event *event; struct trace_mmiotrace_map *entry; @@ -341,8 +338,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr, entry = ring_buffer_event_data(event); entry->map = *map; - if (!call_filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit(tr, buffer, event, trace_ctx); + trace_buffer_unlock_commit(tr, buffer, event, trace_ctx); } void mmio_trace_mapping(struct mmiotrace_map *map) diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index a50ed23bee77..b9f96c77527d 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -499,7 +499,6 @@ static void print_osnoise_headers(struct seq_file *s) static void __trace_osnoise_sample(struct osnoise_sample *sample, struct trace_buffer *buffer) { - struct trace_event_call *call = &event_osnoise; struct ring_buffer_event *event; struct osnoise_entry *entry; @@ -517,8 +516,7 @@ __trace_osnoise_sample(struct osnoise_sample *sample, struct trace_buffer *buffe entry->softirq_count = sample->softirq_count; entry->thread_count = sample->thread_count; - if (!call_filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit_nostack(buffer, event); + trace_buffer_unlock_commit_nostack(buffer, event); } /* @@ -578,7 +576,6 @@ static void print_timerlat_headers(struct seq_file *s) static void __trace_timerlat_sample(struct timerlat_sample *sample, struct trace_buffer *buffer) { - struct trace_event_call *call = &event_osnoise; struct ring_buffer_event *event; struct timerlat_entry *entry; @@ -591,8 +588,7 @@ __trace_timerlat_sample(struct timerlat_sample *sample, struct trace_buffer *buf entry->context = sample->context; entry->timer_latency = sample->timer_latency; - if (!call_filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit_nostack(buffer, event); + trace_buffer_unlock_commit_nostack(buffer, event); } /* @@ -654,7 +650,6 @@ static void timerlat_save_stack(int skip) static void __timerlat_dump_stack(struct trace_buffer *buffer, struct trace_stack *fstack, unsigned int size) { - struct trace_event_call *call = &event_osnoise; struct ring_buffer_event *event; struct stack_entry *entry; @@ -668,8 +663,7 @@ __timerlat_dump_stack(struct trace_buffer *buffer, struct trace_stack *fstack, u memcpy(&entry->caller, fstack->calls, size); entry->size = fstack->nr_entries; - if (!call_filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit_nostack(buffer, event); + trace_buffer_unlock_commit_nostack(buffer, event); } /* diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 868f2f912f28..e08aee34ef63 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -460,7 +460,6 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) (entry->flags & TRACE_FLAG_IRQS_OFF && bh_off) ? 'D' : (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : bh_off ? 'b' : - (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' : '.'; switch (entry->flags & (TRACE_FLAG_NEED_RESCHED | @@ -1246,6 +1245,10 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter, break; trace_seq_puts(s, " => "); + if ((*p) == FTRACE_TRAMPOLINE_MARKER) { + trace_seq_puts(s, "[FTRACE TRAMPOLINE]\n"); + continue; + } seq_print_ip_sym(s, (*p) + delta, flags); trace_seq_putc(s, '\n'); } diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c index e37446f7916e..5c03633316a6 100644 --- a/kernel/trace/trace_preemptirq.c +++ b/kernel/trace/trace_preemptirq.c @@ -15,20 +15,6 @@ #define CREATE_TRACE_POINTS #include <trace/events/preemptirq.h> -/* - * Use regular trace points on architectures that implement noinstr - * tooling: these calls will only happen with RCU enabled, which can - * use a regular tracepoint. - * - * On older architectures, use the rcuidle tracing methods (which - * aren't NMI-safe - so exclude NMI contexts): - */ -#ifdef CONFIG_ARCH_WANTS_NO_INSTR -#define trace(point) trace_##point -#else -#define trace(point) if (!in_nmi()) trace_##point##_rcuidle -#endif - #ifdef CONFIG_TRACE_IRQFLAGS /* Per-cpu variable to prevent redundant calls when IRQs already off */ static DEFINE_PER_CPU(int, tracing_irq_cpu); @@ -42,7 +28,7 @@ static DEFINE_PER_CPU(int, tracing_irq_cpu); void trace_hardirqs_on_prepare(void) { if (this_cpu_read(tracing_irq_cpu)) { - trace(irq_enable)(CALLER_ADDR0, CALLER_ADDR1); + trace_irq_enable(CALLER_ADDR0, CALLER_ADDR1); tracer_hardirqs_on(CALLER_ADDR0, CALLER_ADDR1); this_cpu_write(tracing_irq_cpu, 0); } @@ -53,7 +39,7 @@ NOKPROBE_SYMBOL(trace_hardirqs_on_prepare); void trace_hardirqs_on(void) { if (this_cpu_read(tracing_irq_cpu)) { - trace(irq_enable)(CALLER_ADDR0, CALLER_ADDR1); + trace_irq_enable(CALLER_ADDR0, CALLER_ADDR1); tracer_hardirqs_on(CALLER_ADDR0, CALLER_ADDR1); this_cpu_write(tracing_irq_cpu, 0); } @@ -75,7 +61,7 @@ void trace_hardirqs_off_finish(void) if (!this_cpu_read(tracing_irq_cpu)) { this_cpu_write(tracing_irq_cpu, 1); tracer_hardirqs_off(CALLER_ADDR0, CALLER_ADDR1); - trace(irq_disable)(CALLER_ADDR0, CALLER_ADDR1); + trace_irq_disable(CALLER_ADDR0, CALLER_ADDR1); } } @@ -89,7 +75,7 @@ void trace_hardirqs_off(void) if (!this_cpu_read(tracing_irq_cpu)) { this_cpu_write(tracing_irq_cpu, 1); tracer_hardirqs_off(CALLER_ADDR0, CALLER_ADDR1); - trace(irq_disable)(CALLER_ADDR0, CALLER_ADDR1); + trace_irq_disable(CALLER_ADDR0, CALLER_ADDR1); } } EXPORT_SYMBOL(trace_hardirqs_off); @@ -100,13 +86,13 @@ NOKPROBE_SYMBOL(trace_hardirqs_off); void trace_preempt_on(unsigned long a0, unsigned long a1) { - trace(preempt_enable)(a0, a1); + trace_preempt_enable(a0, a1); tracer_preempt_on(a0, a1); } void trace_preempt_off(unsigned long a0, unsigned long a1) { - trace(preempt_disable)(a0, a1); + trace_preempt_disable(a0, a1); tracer_preempt_off(a0, a1); } #endif diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 8a407adb0e1c..573b5d8e8a28 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -187,7 +187,7 @@ static inline char *get_saved_cmdlines(int idx) static inline void set_cmdline(int idx, const char *cmdline) { - strncpy(get_saved_cmdlines(idx), cmdline, TASK_COMM_LEN); + strscpy(get_saved_cmdlines(idx), cmdline, TASK_COMM_LEN); } static void free_saved_cmdlines_buffer(struct saved_cmdlines_buffer *s) diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index ae2ace5e515a..d6c7f18daa15 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -378,7 +378,6 @@ tracing_sched_switch_trace(struct trace_array *tr, struct task_struct *next, unsigned int trace_ctx) { - struct trace_event_call *call = &event_context_switch; struct trace_buffer *buffer = tr->array_buffer.buffer; struct ring_buffer_event *event; struct ctx_switch_entry *entry; @@ -396,8 +395,7 @@ tracing_sched_switch_trace(struct trace_array *tr, entry->next_state = task_state_index(next); entry->next_cpu = task_cpu(next); - if (!call_filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit(tr, buffer, event, trace_ctx); + trace_buffer_unlock_commit(tr, buffer, event, trace_ctx); } static void @@ -406,7 +404,6 @@ tracing_sched_wakeup_trace(struct trace_array *tr, struct task_struct *curr, unsigned int trace_ctx) { - struct trace_event_call *call = &event_wakeup; struct ring_buffer_event *event; struct ctx_switch_entry *entry; struct trace_buffer *buffer = tr->array_buffer.buffer; @@ -424,8 +421,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr, entry->next_state = task_state_index(wakee); entry->next_cpu = task_cpu(wakee); - if (!call_filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit(tr, buffer, event, trace_ctx); + trace_buffer_unlock_commit(tr, buffer, event, trace_ctx); } static void notrace diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 785733245ead..46aab0ab9350 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -299,6 +299,13 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) int syscall_nr; int size; + /* + * Syscall probe called with preemption enabled, but the ring + * buffer and per-cpu data require preemption to be disabled. + */ + might_fault(); + guard(preempt_notrace)(); + syscall_nr = trace_get_syscall_nr(current, regs); if (syscall_nr < 0 || syscall_nr >= NR_syscalls) return; @@ -338,6 +345,13 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) struct trace_event_buffer fbuffer; int syscall_nr; + /* + * Syscall probe called with preemption enabled, but the ring + * buffer and per-cpu data require preemption to be disabled. + */ + might_fault(); + guard(preempt_notrace)(); + syscall_nr = trace_get_syscall_nr(current, regs); if (syscall_nr < 0 || syscall_nr >= NR_syscalls) return; @@ -584,6 +598,13 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) int rctx; int size; + /* + * Syscall probe called with preemption enabled, but the ring + * buffer and per-cpu data require preemption to be disabled. + */ + might_fault(); + guard(preempt_notrace)(); + syscall_nr = trace_get_syscall_nr(current, regs); if (syscall_nr < 0 || syscall_nr >= NR_syscalls) return; @@ -686,6 +707,13 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) int rctx; int size; + /* + * Syscall probe called with preemption enabled, but the ring + * buffer and per-cpu data require preemption to be disabled. + */ + might_fault(); + guard(preempt_notrace)(); + syscall_nr = trace_get_syscall_nr(current, regs); if (syscall_nr < 0 || syscall_nr >= NR_syscalls) return; diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 8879da16ef4d..1848ce7e2976 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -25,9 +25,6 @@ enum tp_func_state { extern tracepoint_ptr_t __start___tracepoints_ptrs[]; extern tracepoint_ptr_t __stop___tracepoints_ptrs[]; -DEFINE_SRCU(tracepoint_srcu); -EXPORT_SYMBOL_GPL(tracepoint_srcu); - enum tp_transition_sync { TP_TRANSITION_SYNC_1_0_1, TP_TRANSITION_SYNC_N_2_1, @@ -37,7 +34,6 @@ enum tp_transition_sync { struct tp_transition_snapshot { unsigned long rcu; - unsigned long srcu; bool ongoing; }; @@ -50,7 +46,6 @@ static void tp_rcu_get_state(enum tp_transition_sync sync) /* Keep the latest get_state snapshot. */ snapshot->rcu = get_state_synchronize_rcu(); - snapshot->srcu = start_poll_synchronize_srcu(&tracepoint_srcu); snapshot->ongoing = true; } @@ -61,8 +56,6 @@ static void tp_rcu_cond_sync(enum tp_transition_sync sync) if (!snapshot->ongoing) return; cond_synchronize_rcu(snapshot->rcu); - if (!poll_state_synchronize_srcu(&tracepoint_srcu, snapshot->srcu)) - synchronize_srcu(&tracepoint_srcu); snapshot->ongoing = false; } @@ -85,9 +78,6 @@ static LIST_HEAD(tracepoint_module_list); */ static DEFINE_MUTEX(tracepoints_mutex); -static struct rcu_head *early_probes; -static bool ok_to_free_tracepoints; - /* * Note about RCU : * It is used to delay the free of multiple probes array until a quiescent @@ -111,57 +101,21 @@ static inline void *allocate_probes(int count) return p == NULL ? NULL : p->probes; } -static void srcu_free_old_probes(struct rcu_head *head) -{ - kfree(container_of(head, struct tp_probes, rcu)); -} - static void rcu_free_old_probes(struct rcu_head *head) { - call_srcu(&tracepoint_srcu, head, srcu_free_old_probes); -} - -static __init int release_early_probes(void) -{ - struct rcu_head *tmp; - - ok_to_free_tracepoints = true; - - while (early_probes) { - tmp = early_probes; - early_probes = tmp->next; - call_rcu(tmp, rcu_free_old_probes); - } - - return 0; + kfree(container_of(head, struct tp_probes, rcu)); } -/* SRCU is initialized at core_initcall */ -postcore_initcall(release_early_probes); - -static inline void release_probes(struct tracepoint_func *old) +static inline void release_probes(struct tracepoint *tp, struct tracepoint_func *old) { if (old) { struct tp_probes *tp_probes = container_of(old, struct tp_probes, probes[0]); - /* - * We can't free probes if SRCU is not initialized yet. - * Postpone the freeing till after SRCU is initialized. - */ - if (unlikely(!ok_to_free_tracepoints)) { - tp_probes->rcu.next = early_probes; - early_probes = &tp_probes->rcu; - return; - } - - /* - * Tracepoint probes are protected by both sched RCU and SRCU, - * by calling the SRCU callback in the sched RCU callback we - * cover both cases. So let us chain the SRCU and sched RCU - * callbacks to wait for both grace periods. - */ - call_rcu(&tp_probes->rcu, rcu_free_old_probes); + if (tracepoint_is_faultable(tp)) + call_rcu_tasks_trace(&tp_probes->rcu, rcu_free_old_probes); + else + call_rcu(&tp_probes->rcu, rcu_free_old_probes); } } @@ -327,8 +281,8 @@ static int tracepoint_add_func(struct tracepoint *tp, struct tracepoint_func *old, *tp_funcs; int ret; - if (tp->regfunc && !static_key_enabled(&tp->key)) { - ret = tp->regfunc(); + if (tp->ext && tp->ext->regfunc && !static_key_enabled(&tp->key)) { + ret = tp->ext->regfunc(); if (ret < 0) return ret; } @@ -358,7 +312,7 @@ static int tracepoint_add_func(struct tracepoint *tp, tracepoint_update_call(tp, tp_funcs); /* Both iterator and static call handle NULL tp->funcs */ rcu_assign_pointer(tp->funcs, tp_funcs); - static_key_enable(&tp->key); + static_branch_enable(&tp->key); break; case TP_FUNC_2: /* 1->2 */ /* Set iterator static call */ @@ -383,7 +337,7 @@ static int tracepoint_add_func(struct tracepoint *tp, break; } - release_probes(old); + release_probes(tp, old); return 0; } @@ -411,10 +365,9 @@ static int tracepoint_remove_func(struct tracepoint *tp, switch (nr_func_state(tp_funcs)) { case TP_FUNC_0: /* 1->0 */ /* Removed last function */ - if (tp->unregfunc && static_key_enabled(&tp->key)) - tp->unregfunc(); - - static_key_disable(&tp->key); + if (tp->ext && tp->ext->unregfunc && static_key_enabled(&tp->key)) + tp->ext->unregfunc(); + static_branch_disable(&tp->key); /* Set iterator static call */ tracepoint_update_call(tp, tp_funcs); /* Both iterator and static call handle NULL tp->funcs */ @@ -455,7 +408,7 @@ static int tracepoint_remove_func(struct tracepoint *tp, WARN_ON_ONCE(1); break; } - release_probes(old); + release_probes(tp, old); return 0; } diff --git a/scripts/tags.sh b/scripts/tags.sh index 191e0461d6d5..0d01c1cafb70 100755 --- a/scripts/tags.sh +++ b/scripts/tags.sh @@ -152,9 +152,7 @@ regex_c=( '/^BPF_CALL_[0-9]([[:space:]]*\([[:alnum:]_]*\).*/\1/' '/^COMPAT_SYSCALL_DEFINE[0-9]([[:space:]]*\([[:alnum:]_]*\).*/compat_sys_\1/' '/^TRACE_EVENT([[:space:]]*\([[:alnum:]_]*\).*/trace_\1/' - '/^TRACE_EVENT([[:space:]]*\([[:alnum:]_]*\).*/trace_\1_rcuidle/' '/^DEFINE_EVENT([^,)]*,[[:space:]]*\([[:alnum:]_]*\).*/trace_\1/' - '/^DEFINE_EVENT([^,)]*,[[:space:]]*\([[:alnum:]_]*\).*/trace_\1_rcuidle/' '/^DEFINE_INSN_CACHE_OPS([[:space:]]*\([[:alnum:]_]*\).*/get_\1_slot/' '/^DEFINE_INSN_CACHE_OPS([[:space:]]*\([[:alnum:]_]*\).*/free_\1_slot/' '/^PAGEFLAG([[:space:]]*\([[:alnum:]_]*\).*/Page\1/' |