Diffstat (limited to 'kernel')
59 files changed, 1372 insertions, 554 deletions
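The first hunk below converts kernel/acct.c to scope-based cleanup via <linux/cleanup.h> (__free(), DEFINE_FREE(), no_free_ptr()). As an illustration of that pattern only — not part of the patch, with demo_free() and struct demo as hypothetical names — the shape is roughly:

	#include <linux/cleanup.h>
	#include <linux/err.h>
	#include <linux/slab.h>

	struct demo { int busy; };

	/* Teach the cleanup machinery how to dispose of a struct demo. */
	DEFINE_FREE(demo_free, struct demo *, if (!IS_ERR_OR_NULL(_T)) kfree(_T))

	static int demo_user(void)
	{
		/* Freed automatically on every return path from here on. */
		struct demo *d __free(demo_free) = kzalloc(sizeof(*d), GFP_KERNEL);

		if (!d)
			return -ENOMEM;
		if (d->busy)		/* early error returns need no goto unwinding */
			return -EBUSY;

		/*
		 * On success, ownership can be handed off with no_free_ptr(d),
		 * which is how acct_on() below keeps its file alive past the
		 * scope: acct->file = no_free_ptr(file);
		 */
		return 0;
	}

This is why the acct.c hunk can delete every kfree()/filp_close() error branch: each early return disposes of the pending resources in declaration order.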
diff --git a/kernel/acct.c b/kernel/acct.c index 6520baa13669..61630110e29d 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -44,19 +44,14 @@ * a struct file opened for write. Fixed. 2/6/2000, AV. */ -#include <linux/mm.h> #include <linux/slab.h> #include <linux/acct.h> #include <linux/capability.h> -#include <linux/file.h> #include <linux/tty.h> -#include <linux/security.h> -#include <linux/vfs.h> +#include <linux/statfs.h> #include <linux/jiffies.h> -#include <linux/times.h> #include <linux/syscalls.h> -#include <linux/mount.h> -#include <linux/uaccess.h> +#include <linux/namei.h> #include <linux/sched/cputime.h> #include <asm/div64.h> @@ -217,84 +212,70 @@ static void close_work(struct work_struct *work) complete(&acct->done); } -static int acct_on(struct filename *pathname) +DEFINE_FREE(fput_sync, struct file *, if (!IS_ERR_OR_NULL(_T)) __fput_sync(_T)) +static int acct_on(const char __user *name) { - struct file *file; - struct vfsmount *mnt, *internal; + /* Difference from BSD - they don't do O_APPEND */ + const int open_flags = O_WRONLY|O_APPEND|O_LARGEFILE; struct pid_namespace *ns = task_active_pid_ns(current); + struct filename *pathname __free(putname) = getname(name); + struct file *original_file __free(fput) = NULL; // in that order + struct path internal __free(path_put) = {}; // in that order + struct file *file __free(fput_sync) = NULL; // in that order struct bsd_acct_struct *acct; + struct vfsmount *mnt; struct fs_pin *old; - int err; - acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL); - if (!acct) - return -ENOMEM; + if (IS_ERR(pathname)) + return PTR_ERR(pathname); + original_file = file_open_name(pathname, open_flags, 0); + if (IS_ERR(original_file)) + return PTR_ERR(original_file); - /* Difference from BSD - they don't do O_APPEND */ - file = file_open_name(pathname, O_WRONLY|O_APPEND|O_LARGEFILE, 0); - if (IS_ERR(file)) { - kfree(acct); + mnt = mnt_clone_internal(&original_file->f_path); + if (IS_ERR(mnt)) + return PTR_ERR(mnt); + + internal.mnt = mnt; + internal.dentry = dget(mnt->mnt_root); + + file = dentry_open(&internal, open_flags, current_cred()); + if (IS_ERR(file)) return PTR_ERR(file); - } - if (!S_ISREG(file_inode(file)->i_mode)) { - kfree(acct); - filp_close(file, NULL); + if (!S_ISREG(file_inode(file)->i_mode)) return -EACCES; - } /* Exclude kernel kernel internal filesystems. */ - if (file_inode(file)->i_sb->s_flags & (SB_NOUSER | SB_KERNMOUNT)) { - kfree(acct); - filp_close(file, NULL); + if (file_inode(file)->i_sb->s_flags & (SB_NOUSER | SB_KERNMOUNT)) return -EINVAL; - } /* Exclude procfs and sysfs. 
*/ - if (file_inode(file)->i_sb->s_iflags & SB_I_USERNS_VISIBLE) { - kfree(acct); - filp_close(file, NULL); + if (file_inode(file)->i_sb->s_iflags & SB_I_USERNS_VISIBLE) return -EINVAL; - } - if (!(file->f_mode & FMODE_CAN_WRITE)) { - kfree(acct); - filp_close(file, NULL); + if (!(file->f_mode & FMODE_CAN_WRITE)) return -EIO; - } - internal = mnt_clone_internal(&file->f_path); - if (IS_ERR(internal)) { - kfree(acct); - filp_close(file, NULL); - return PTR_ERR(internal); - } - err = mnt_get_write_access(internal); - if (err) { - mntput(internal); - kfree(acct); - filp_close(file, NULL); - return err; - } - mnt = file->f_path.mnt; - file->f_path.mnt = internal; + + acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL); + if (!acct) + return -ENOMEM; atomic_long_set(&acct->count, 1); init_fs_pin(&acct->pin, acct_pin_kill); - acct->file = file; + acct->file = no_free_ptr(file); acct->needcheck = jiffies; acct->ns = ns; mutex_init(&acct->lock); INIT_WORK(&acct->work, close_work); init_completion(&acct->done); mutex_lock_nested(&acct->lock, 1); /* nobody has seen it yet */ - pin_insert(&acct->pin, mnt); + pin_insert(&acct->pin, original_file->f_path.mnt); rcu_read_lock(); old = xchg(&ns->bacct, &acct->pin); mutex_unlock(&acct->lock); pin_kill(old); - mnt_put_write_access(mnt); - mntput(mnt); return 0; } @@ -319,14 +300,9 @@ SYSCALL_DEFINE1(acct, const char __user *, name) return -EPERM; if (name) { - struct filename *tmp = getname(name); - - if (IS_ERR(tmp)) - return PTR_ERR(tmp); mutex_lock(&acct_on_mutex); - error = acct_on(tmp); + error = acct_on(name); mutex_unlock(&acct_on_mutex); - putname(tmp); } else { rcu_read_lock(); pin_kill(task_active_pid_ns(current)->bacct); diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 1605df0a171e..fda6beb041e0 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -680,7 +680,7 @@ void audit_trim_trees(void) struct audit_tree *tree; struct path path; struct audit_node *node; - struct path *paths; + const struct path *paths; struct path array[16]; int err; @@ -703,7 +703,7 @@ void audit_trim_trees(void) struct audit_chunk *chunk = find_chunk(node); /* this could be NULL if the watch is dying else where... 
*/ node->index |= 1U<<31; - for (struct path *p = paths; p->dentry; p++) { + for (const struct path *p = paths; p->dentry; p++) { struct inode *inode = p->dentry->d_inode; if (inode_to_key(inode) == chunk->key) { node->index &= ~(1U<<31); @@ -742,9 +742,9 @@ void audit_put_tree(struct audit_tree *tree) put_tree(tree); } -static int tag_mounts(struct path *paths, struct audit_tree *tree) +static int tag_mounts(const struct path *paths, struct audit_tree *tree) { - for (struct path *p = paths; p->dentry; p++) { + for (const struct path *p = paths; p->dentry; p++) { int err = tag_chunk(p->dentry->d_inode, tree); if (err) return err; @@ -807,7 +807,7 @@ int audit_add_tree_rule(struct audit_krule *rule) struct audit_tree *seed = rule->tree, *tree; struct path path; struct path array[16]; - struct path *paths; + const struct path *paths; int err; rule->tree = NULL; @@ -879,7 +879,7 @@ int audit_tag_tree(char *old, char *new) int failed = 0; struct path path1, path2; struct path array[16]; - struct path *paths; + const struct path *paths; int err; err = kern_path(new, 0, &path2); diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index c9fab9a356df..8eb117c52817 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1215,13 +1215,20 @@ static void bpf_wq_work(struct work_struct *work) rcu_read_unlock_trace(); } +static void bpf_async_cb_rcu_free(struct rcu_head *rcu) +{ + struct bpf_async_cb *cb = container_of(rcu, struct bpf_async_cb, rcu); + + kfree_nolock(cb); +} + static void bpf_wq_delete_work(struct work_struct *work) { struct bpf_work *w = container_of(work, struct bpf_work, delete_work); cancel_work_sync(&w->work); - kfree_rcu(w, cb.rcu); + call_rcu(&w->cb.rcu, bpf_async_cb_rcu_free); } static void bpf_timer_delete_work(struct work_struct *work) @@ -1230,13 +1237,13 @@ static void bpf_timer_delete_work(struct work_struct *work) /* Cancel the timer and wait for callback to complete if it was running. * If hrtimer_cancel() can be safely called it's safe to call - * kfree_rcu(t) right after for both preallocated and non-preallocated + * call_rcu() right after for both preallocated and non-preallocated * maps. The async->cb = NULL was already done and no code path can see * address 't' anymore. Timer if armed for existing bpf_hrtimer before * bpf_timer_cancel_and_free will have been cancelled. */ hrtimer_cancel(&t->timer); - kfree_rcu(t, cb.rcu); + call_rcu(&t->cb.rcu, bpf_async_cb_rcu_free); } static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u64 flags, @@ -1270,11 +1277,7 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u goto out; } - /* Allocate via bpf_map_kmalloc_node() for memcg accounting. Until - * kmalloc_nolock() is available, avoid locking issues by using - * __GFP_HIGH (GFP_ATOMIC & ~__GFP_RECLAIM). - */ - cb = bpf_map_kmalloc_node(map, size, __GFP_HIGH, map->numa_node); + cb = bpf_map_kmalloc_nolock(map, size, 0, map->numa_node); if (!cb) { ret = -ENOMEM; goto out; @@ -1315,7 +1318,7 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u * or pinned in bpffs. */ WRITE_ONCE(async->cb, NULL); - kfree(cb); + kfree_nolock(cb); ret = -EPERM; } out: @@ -1580,7 +1583,7 @@ void bpf_timer_cancel_and_free(void *val) * timer _before_ calling us, such that failing to cancel it here will * cause it to possibly use struct hrtimer after freeing bpf_hrtimer. * Therefore, we _need_ to cancel any outstanding timers before we do - * kfree_rcu, even though no more timers can be armed. 
+ * call_rcu, even though no more timers can be armed. * * Moreover, we need to schedule work even if timer does not belong to * the calling callback_fn, as on two different CPUs, we can end up in a @@ -1607,7 +1610,7 @@ void bpf_timer_cancel_and_free(void *val) * completion. */ if (hrtimer_try_to_cancel(&t->timer) >= 0) - kfree_rcu(t, cb.rcu); + call_rcu(&t->cb.rcu, bpf_async_cb_rcu_free); else queue_work(system_dfl_wq, &t->cb.delete_work); } else { diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index f90bdcc0a047..81780bcf8d25 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -775,7 +775,7 @@ static int bpf_show_options(struct seq_file *m, struct dentry *root) return 0; } -static void bpf_free_inode(struct inode *inode) +static void bpf_destroy_inode(struct inode *inode) { enum bpf_type type; @@ -790,7 +790,7 @@ const struct super_operations bpf_super_ops = { .statfs = simple_statfs, .drop_inode = inode_just_drop, .show_options = bpf_show_options, - .free_inode = bpf_free_inode, + .destroy_inode = bpf_destroy_inode, }; enum { diff --git a/kernel/bpf/liveness.c b/kernel/bpf/liveness.c index 3c611aba7f52..1e6538f59a78 100644 --- a/kernel/bpf/liveness.c +++ b/kernel/bpf/liveness.c @@ -195,8 +195,10 @@ static struct func_instance *__lookup_instance(struct bpf_verifier_env *env, return ERR_PTR(-ENOMEM); result->must_write_set = kvcalloc(subprog_sz, sizeof(*result->must_write_set), GFP_KERNEL_ACCOUNT); - if (!result->must_write_set) + if (!result->must_write_set) { + kvfree(result); return ERR_PTR(-ENOMEM); + } memcpy(&result->callchain, callchain, sizeof(*callchain)); result->insn_cnt = subprog_sz; hash_add(liveness->func_instances, &result->hl_node, key); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 2a9456a3e730..8a129746bd6c 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -520,6 +520,21 @@ void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags, return ptr; } +void *bpf_map_kmalloc_nolock(const struct bpf_map *map, size_t size, gfp_t flags, + int node) +{ + struct mem_cgroup *memcg, *old_memcg; + void *ptr; + + memcg = bpf_map_get_memcg(map); + old_memcg = set_active_memcg(memcg); + ptr = kmalloc_nolock(size, flags | __GFP_ACCOUNT, node); + set_active_memcg(old_memcg); + mem_cgroup_put(memcg); + + return ptr; +} + void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags) { struct mem_cgroup *memcg, *old_memcg; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 73bba397672a..ff40e5e65c43 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -15645,7 +15645,8 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) } /* check dest operand */ - if (opcode == BPF_NEG) { + if (opcode == BPF_NEG && + regs[insn->dst_reg].type == SCALAR_VALUE) { err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK); err = err ?: adjust_scalar_min_max_vals(env, insn, ®s[insn->dst_reg], @@ -15803,7 +15804,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) } else { /* all other ALU ops: and, sub, xor, add, ... 
*/ if (BPF_SRC(insn->code) == BPF_X) { - if (insn->imm != 0 || insn->off > 1 || + if (insn->imm != 0 || (insn->off != 0 && insn->off != 1) || (insn->off == 1 && opcode != BPF_MOD && opcode != BPF_DIV)) { verbose(env, "BPF_ALU uses reserved fields\n"); return -EINVAL; @@ -15813,7 +15814,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) if (err) return err; } else { - if (insn->src_reg != BPF_REG_0 || insn->off > 1 || + if (insn->src_reg != BPF_REG_0 || (insn->off != 0 && insn->off != 1) || (insn->off == 1 && opcode != BPF_MOD && opcode != BPF_DIV)) { verbose(env, "BPF_ALU uses reserved fields\n"); return -EINVAL; diff --git a/kernel/cgroup/dmem.c b/kernel/cgroup/dmem.c index 10b63433f057..e12b946278b6 100644 --- a/kernel/cgroup/dmem.c +++ b/kernel/cgroup/dmem.c @@ -14,6 +14,7 @@ #include <linux/mutex.h> #include <linux/page_counter.h> #include <linux/parser.h> +#include <linux/rculist.h> #include <linux/slab.h> struct dmem_cgroup_region { diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c index f625172d4b67..22fe969c5d2e 100644 --- a/kernel/debug/gdbstub.c +++ b/kernel/debug/gdbstub.c @@ -30,6 +30,7 @@ #include <linux/kgdb.h> #include <linux/kdb.h> #include <linux/serial_core.h> +#include <linux/string.h> #include <linux/reboot.h> #include <linux/uaccess.h> #include <asm/cacheflush.h> @@ -547,7 +548,7 @@ static void gdb_cmd_setregs(struct kgdb_state *ks) error_packet(remcom_out_buffer, -EINVAL); } else { gdb_regs_to_pt_regs(gdb_regs, ks->linux_regs); - strcpy(remcom_out_buffer, "OK"); + strscpy(remcom_out_buffer, "OK"); } } @@ -577,7 +578,7 @@ static void gdb_cmd_memwrite(struct kgdb_state *ks) if (err) error_packet(remcom_out_buffer, err); else - strcpy(remcom_out_buffer, "OK"); + strscpy(remcom_out_buffer, "OK"); } #if DBG_MAX_REG_NUM > 0 @@ -630,7 +631,7 @@ static void gdb_cmd_reg_set(struct kgdb_state *ks) i = i / 2; kgdb_hex2mem(ptr, (char *)gdb_regs, i); dbg_set_reg(regnum, gdb_regs, ks->linux_regs); - strcpy(remcom_out_buffer, "OK"); + strscpy(remcom_out_buffer, "OK"); } #endif /* DBG_MAX_REG_NUM > 0 */ @@ -642,7 +643,7 @@ static void gdb_cmd_binwrite(struct kgdb_state *ks) if (err) error_packet(remcom_out_buffer, err); else - strcpy(remcom_out_buffer, "OK"); + strscpy(remcom_out_buffer, "OK"); } /* Handle the 'D' or 'k', detach or kill packets */ @@ -656,7 +657,7 @@ static void gdb_cmd_detachkill(struct kgdb_state *ks) if (error < 0) { error_packet(remcom_out_buffer, error); } else { - strcpy(remcom_out_buffer, "OK"); + strscpy(remcom_out_buffer, "OK"); kgdb_connected = 0; } put_packet(remcom_out_buffer); @@ -676,7 +677,7 @@ static int gdb_cmd_reboot(struct kgdb_state *ks) /* For now, only honor R0 */ if (strcmp(remcom_in_buffer, "R0") == 0) { printk(KERN_CRIT "Executing emergency reboot\n"); - strcpy(remcom_out_buffer, "OK"); + strscpy(remcom_out_buffer, "OK"); put_packet(remcom_out_buffer); /* @@ -739,7 +740,7 @@ static void gdb_cmd_query(struct kgdb_state *ks) case 'C': /* Current thread id */ - strcpy(remcom_out_buffer, "QC"); + strscpy(remcom_out_buffer, "QC"); ks->threadid = shadow_pid(current->pid); int_to_threadref(thref, ks->threadid); pack_threadid(remcom_out_buffer + 2, thref); @@ -773,7 +774,7 @@ static void gdb_cmd_query(struct kgdb_state *ks) int len = strlen(remcom_in_buffer + 6); if ((len % 2) != 0) { - strcpy(remcom_out_buffer, "E01"); + strscpy(remcom_out_buffer, "E01"); break; } kgdb_hex2mem(remcom_in_buffer + 6, @@ -785,14 +786,14 @@ static void gdb_cmd_query(struct kgdb_state *ks) kdb_parse(remcom_out_buffer); 
kdb_common_deinit_state(); - strcpy(remcom_out_buffer, "OK"); + strscpy(remcom_out_buffer, "OK"); } break; #endif #ifdef CONFIG_HAVE_ARCH_KGDB_QXFER_PKT case 'S': if (!strncmp(remcom_in_buffer, "qSupported:", 11)) - strcpy(remcom_out_buffer, kgdb_arch_gdb_stub_feature); + strscpy(remcom_out_buffer, kgdb_arch_gdb_stub_feature); break; case 'X': if (!strncmp(remcom_in_buffer, "qXfer:", 6)) @@ -822,7 +823,7 @@ static void gdb_cmd_task(struct kgdb_state *ks) } kgdb_usethread = thread; ks->kgdb_usethreadid = ks->threadid; - strcpy(remcom_out_buffer, "OK"); + strscpy(remcom_out_buffer, "OK"); break; case 'c': ptr = &remcom_in_buffer[2]; @@ -837,7 +838,7 @@ static void gdb_cmd_task(struct kgdb_state *ks) } kgdb_contthread = thread; } - strcpy(remcom_out_buffer, "OK"); + strscpy(remcom_out_buffer, "OK"); break; } } @@ -851,7 +852,7 @@ static void gdb_cmd_thread(struct kgdb_state *ks) kgdb_hex2long(&ptr, &ks->threadid); thread = getthread(ks->linux_regs, ks->threadid); if (thread) - strcpy(remcom_out_buffer, "OK"); + strscpy(remcom_out_buffer, "OK"); else error_packet(remcom_out_buffer, -EINVAL); } @@ -913,7 +914,7 @@ static void gdb_cmd_break(struct kgdb_state *ks) (int) length, *bpt_type - '0'); if (error == 0) - strcpy(remcom_out_buffer, "OK"); + strscpy(remcom_out_buffer, "OK"); else error_packet(remcom_out_buffer, error); } diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 9b11b10b120c..b12b9db75c1d 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -714,8 +714,8 @@ int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap) * it, depending on the results of the search. */ cp++; /* to byte after the newline */ - replaced_byte = *cp; /* remember what/where it was */ - cphold = cp; + replaced_byte = *cp; /* remember what it was */ + cphold = cp; /* remember where it was */ *cp = '\0'; /* end the string for our search */ /* @@ -732,8 +732,9 @@ int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap) * Shift the buffer left. */ *cphold = replaced_byte; - strcpy(kdb_buffer, cphold); - len = strlen(kdb_buffer); + len = strlen(cphold); + /* Use memmove() because the buffers overlap */ + memmove(kdb_buffer, cphold, len + 1); next_avail = kdb_buffer + len; size_avail = sizeof(kdb_buffer) - len; goto kdb_print_out; @@ -872,8 +873,9 @@ kdb_printit: */ if (kdb_grepping_flag && !suspend_grep) { *cphold = replaced_byte; - strcpy(kdb_buffer, cphold); - len = strlen(kdb_buffer); + len = strlen(cphold); + /* Use memmove() because the buffers overlap */ + memmove(kdb_buffer, cphold, len + 1); next_avail = kdb_buffer + len; size_avail = sizeof(kdb_buffer) - len; } diff --git a/kernel/debug/kdb/kdb_keyboard.c b/kernel/debug/kdb/kdb_keyboard.c index 3a74604fdb8a..386d30e530b7 100644 --- a/kernel/debug/kdb/kdb_keyboard.c +++ b/kernel/debug/kdb/kdb_keyboard.c @@ -145,9 +145,6 @@ int kdb_get_kbd_char(void) return CTRL('F'); } - if (scancode == 0xe0) - return -1; - /* * For Japanese 86/106 keyboards * See comment in drivers/char/pc_keyb.c. 
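The gdbstub and kdb hunks in this series replace unbounded strcpy() with strscpy() and overlapping strcpy() with memmove(). A rough sketch of both idioms (illustrative only, not part of the patch; demo_reply() is a hypothetical helper):

	#include <linux/string.h>

	static void demo_reply(const char *src)
	{
		char buf[32];
		ssize_t n;

		/*
		 * The two-argument strscpy() derives the bound from
		 * sizeof(buf) and always NUL-terminates, unlike strcpy().
		 */
		n = strscpy(buf, src);
		if (n == -E2BIG)
			return;		/* source was truncated */

		/*
		 * Shifting data left inside the same buffer overlaps, hence
		 * memmove() in the kdb_io.c hunks above; strcpy() with
		 * overlapping arguments is undefined behaviour.
		 */
		memmove(buf, buf + 1, strlen(buf + 1) + 1);
	}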
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 7a4d2d4689a5..dddf2b5aad57 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -721,20 +721,12 @@ static int kdb_defcmd(int argc, const char **argv) mp->name = kdb_strdup(argv[1], GFP_KDB); if (!mp->name) goto fail_name; - mp->usage = kdb_strdup(argv[2], GFP_KDB); + mp->usage = kdb_strdup_dequote(argv[2], GFP_KDB); if (!mp->usage) goto fail_usage; - mp->help = kdb_strdup(argv[3], GFP_KDB); + mp->help = kdb_strdup_dequote(argv[3], GFP_KDB); if (!mp->help) goto fail_help; - if (mp->usage[0] == '"') { - strcpy(mp->usage, argv[2]+1); - mp->usage[strlen(mp->usage)-1] = '\0'; - } - if (mp->help[0] == '"') { - strcpy(mp->help, argv[3]+1); - mp->help[strlen(mp->help)-1] = '\0'; - } INIT_LIST_HEAD(&kdb_macro->statements); defcmd_in_progress = true; @@ -860,7 +852,7 @@ static void parse_grep(const char *str) kdb_printf("search string too long\n"); return; } - strcpy(kdb_grep_string, cp); + memcpy(kdb_grep_string, cp, len + 1); kdb_grepping_flag++; return; } diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h index d2520d72b1f5..a2fc7d2bc9fc 100644 --- a/kernel/debug/kdb/kdb_private.h +++ b/kernel/debug/kdb/kdb_private.h @@ -110,6 +110,7 @@ extern int kdbgetaddrarg(int, const char **, int*, unsigned long *, extern int kdbgetsymval(const char *, kdb_symtab_t *); extern int kdbnearsym(unsigned long, kdb_symtab_t *); extern char *kdb_strdup(const char *str, gfp_t type); +extern char *kdb_strdup_dequote(const char *str, gfp_t type); extern void kdb_symbol_print(unsigned long, const kdb_symtab_t *, unsigned int); /* Routine for debugging the debugger state. */ diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c index 05b137e7dcb9..56f7b906e7cc 100644 --- a/kernel/debug/kdb/kdb_support.c +++ b/kernel/debug/kdb/kdb_support.c @@ -23,6 +23,7 @@ #include <linux/uaccess.h> #include <linux/kdb.h> #include <linux/slab.h> +#include <linux/string.h> #include <linux/ctype.h> #include "kdb_private.h" @@ -246,11 +247,41 @@ void kdb_symbol_print(unsigned long addr, const kdb_symtab_t *symtab_p, */ char *kdb_strdup(const char *str, gfp_t type) { - int n = strlen(str)+1; + size_t n = strlen(str) + 1; char *s = kmalloc(n, type); if (!s) return NULL; - return strcpy(s, str); + memcpy(s, str, n); + return s; +} + +/* + * kdb_strdup_dequote - same as kdb_strdup(), but trims surrounding quotes from + * the input string if present. + * Remarks: + * Quotes are only removed if there is both a leading and a trailing quote. 
+ */ +char *kdb_strdup_dequote(const char *str, gfp_t type) +{ + size_t len = strlen(str); + char *s; + + if (str[0] == '"' && len > 1 && str[len - 1] == '"') { + /* trim both leading and trailing quotes */ + str++; + len -= 2; + } + + len++; /* add space for NUL terminator */ + + s = kmalloc(len, type); + if (!s) + return NULL; + + memcpy(s, str, len - 1); + s[len - 1] = '\0'; + + return s; } /* diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index b82399437db0..1e5c64cb6a42 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -38,8 +38,8 @@ enum { dma_debug_single, dma_debug_sg, dma_debug_coherent, - dma_debug_resource, dma_debug_noncoherent, + dma_debug_phy, }; enum map_err_types { @@ -141,8 +141,8 @@ static const char *type2name[] = { [dma_debug_single] = "single", [dma_debug_sg] = "scatter-gather", [dma_debug_coherent] = "coherent", - [dma_debug_resource] = "resource", [dma_debug_noncoherent] = "noncoherent", + [dma_debug_phy] = "phy", }; static const char *dir2name[] = { @@ -1054,17 +1054,16 @@ static void check_unmap(struct dma_debug_entry *ref) dma_entry_free(entry); } -static void check_for_stack(struct device *dev, - struct page *page, size_t offset) +static void check_for_stack(struct device *dev, phys_addr_t phys) { void *addr; struct vm_struct *stack_vm_area = task_stack_vm_area(current); if (!stack_vm_area) { /* Stack is direct-mapped. */ - if (PageHighMem(page)) + if (PhysHighMem(phys)) return; - addr = page_address(page) + offset; + addr = phys_to_virt(phys); if (object_is_on_stack(addr)) err_printk(dev, NULL, "device driver maps memory from stack [addr=%p]\n", addr); } else { @@ -1072,10 +1071,12 @@ static void check_for_stack(struct device *dev, int i; for (i = 0; i < stack_vm_area->nr_pages; i++) { - if (page != stack_vm_area->pages[i]) + if (__phys_to_pfn(phys) != + page_to_pfn(stack_vm_area->pages[i])) continue; - addr = (u8 *)current->stack + i * PAGE_SIZE + offset; + addr = (u8 *)current->stack + i * PAGE_SIZE + + (phys % PAGE_SIZE); err_printk(dev, NULL, "device driver maps memory from stack [probable addr=%p]\n", addr); break; } @@ -1204,9 +1205,8 @@ void debug_dma_map_single(struct device *dev, const void *addr, } EXPORT_SYMBOL(debug_dma_map_single); -void debug_dma_map_page(struct device *dev, struct page *page, size_t offset, - size_t size, int direction, dma_addr_t dma_addr, - unsigned long attrs) +void debug_dma_map_phys(struct device *dev, phys_addr_t phys, size_t size, + int direction, dma_addr_t dma_addr, unsigned long attrs) { struct dma_debug_entry *entry; @@ -1221,19 +1221,18 @@ void debug_dma_map_page(struct device *dev, struct page *page, size_t offset, return; entry->dev = dev; - entry->type = dma_debug_single; - entry->paddr = page_to_phys(page) + offset; + entry->type = dma_debug_phy; + entry->paddr = phys; entry->dev_addr = dma_addr; entry->size = size; entry->direction = direction; entry->map_err_type = MAP_ERR_NOT_CHECKED; - check_for_stack(dev, page, offset); + if (!(attrs & DMA_ATTR_MMIO)) { + check_for_stack(dev, phys); - if (!PageHighMem(page)) { - void *addr = page_address(page) + offset; - - check_for_illegal_area(dev, addr, size); + if (!PhysHighMem(phys)) + check_for_illegal_area(dev, phys_to_virt(phys), size); } add_dma_entry(entry, attrs); @@ -1277,11 +1276,11 @@ void debug_dma_mapping_error(struct device *dev, dma_addr_t dma_addr) } EXPORT_SYMBOL(debug_dma_mapping_error); -void debug_dma_unmap_page(struct device *dev, dma_addr_t dma_addr, +void debug_dma_unmap_phys(struct device *dev, dma_addr_t dma_addr, size_t size, int 
direction) { struct dma_debug_entry ref = { - .type = dma_debug_single, + .type = dma_debug_phy, .dev = dev, .dev_addr = dma_addr, .size = size, @@ -1305,7 +1304,7 @@ void debug_dma_map_sg(struct device *dev, struct scatterlist *sg, return; for_each_sg(sg, s, nents, i) { - check_for_stack(dev, sg_page(s), s->offset); + check_for_stack(dev, sg_phys(s)); if (!PageHighMem(sg_page(s))) check_for_illegal_area(dev, sg_virt(s), s->length); } @@ -1445,47 +1444,6 @@ void debug_dma_free_coherent(struct device *dev, size_t size, check_unmap(&ref); } -void debug_dma_map_resource(struct device *dev, phys_addr_t addr, size_t size, - int direction, dma_addr_t dma_addr, - unsigned long attrs) -{ - struct dma_debug_entry *entry; - - if (unlikely(dma_debug_disabled())) - return; - - entry = dma_entry_alloc(); - if (!entry) - return; - - entry->type = dma_debug_resource; - entry->dev = dev; - entry->paddr = addr; - entry->size = size; - entry->dev_addr = dma_addr; - entry->direction = direction; - entry->map_err_type = MAP_ERR_NOT_CHECKED; - - add_dma_entry(entry, attrs); -} - -void debug_dma_unmap_resource(struct device *dev, dma_addr_t dma_addr, - size_t size, int direction) -{ - struct dma_debug_entry ref = { - .type = dma_debug_resource, - .dev = dev, - .dev_addr = dma_addr, - .size = size, - .direction = direction, - }; - - if (unlikely(dma_debug_disabled())) - return; - - check_unmap(&ref); -} - void debug_dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size, int direction) { diff --git a/kernel/dma/debug.h b/kernel/dma/debug.h index 48757ca13f31..da7be0bddcf6 100644 --- a/kernel/dma/debug.h +++ b/kernel/dma/debug.h @@ -9,12 +9,11 @@ #define _KERNEL_DMA_DEBUG_H #ifdef CONFIG_DMA_API_DEBUG -extern void debug_dma_map_page(struct device *dev, struct page *page, - size_t offset, size_t size, - int direction, dma_addr_t dma_addr, +extern void debug_dma_map_phys(struct device *dev, phys_addr_t phys, + size_t size, int direction, dma_addr_t dma_addr, unsigned long attrs); -extern void debug_dma_unmap_page(struct device *dev, dma_addr_t addr, +extern void debug_dma_unmap_phys(struct device *dev, dma_addr_t addr, size_t size, int direction); extern void debug_dma_map_sg(struct device *dev, struct scatterlist *sg, @@ -31,14 +30,6 @@ extern void debug_dma_alloc_coherent(struct device *dev, size_t size, extern void debug_dma_free_coherent(struct device *dev, size_t size, void *virt, dma_addr_t addr); -extern void debug_dma_map_resource(struct device *dev, phys_addr_t addr, - size_t size, int direction, - dma_addr_t dma_addr, - unsigned long attrs); - -extern void debug_dma_unmap_resource(struct device *dev, dma_addr_t dma_addr, - size_t size, int direction); - extern void debug_dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size, int direction); @@ -62,14 +53,13 @@ extern void debug_dma_free_pages(struct device *dev, struct page *page, size_t size, int direction, dma_addr_t dma_addr); #else /* CONFIG_DMA_API_DEBUG */ -static inline void debug_dma_map_page(struct device *dev, struct page *page, - size_t offset, size_t size, - int direction, dma_addr_t dma_addr, - unsigned long attrs) +static inline void debug_dma_map_phys(struct device *dev, phys_addr_t phys, + size_t size, int direction, + dma_addr_t dma_addr, unsigned long attrs) { } -static inline void debug_dma_unmap_page(struct device *dev, dma_addr_t addr, +static inline void debug_dma_unmap_phys(struct device *dev, dma_addr_t addr, size_t size, int direction) { } @@ -97,19 +87,6 @@ static inline void 
debug_dma_free_coherent(struct device *dev, size_t size, { } -static inline void debug_dma_map_resource(struct device *dev, phys_addr_t addr, - size_t size, int direction, - dma_addr_t dma_addr, - unsigned long attrs) -{ -} - -static inline void debug_dma_unmap_resource(struct device *dev, - dma_addr_t dma_addr, size_t size, - int direction) -{ -} - static inline void debug_dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size, int direction) diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 24c359d9c879..1f9ee9759426 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -120,7 +120,7 @@ static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, gfp_t gfp, bool allow_highmem) { int node = dev_to_node(dev); - struct page *page = NULL; + struct page *page; u64 phys_limit; WARN_ON_ONCE(!PAGE_ALIGNED(size)); @@ -131,30 +131,25 @@ static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, gfp |= dma_direct_optimal_gfp_mask(dev, &phys_limit); page = dma_alloc_contiguous(dev, size, gfp); if (page) { - if (!dma_coherent_ok(dev, page_to_phys(page), size) || - (!allow_highmem && PageHighMem(page))) { - dma_free_contiguous(dev, page, size); - page = NULL; - } + if (dma_coherent_ok(dev, page_to_phys(page), size) && + (allow_highmem || !PageHighMem(page))) + return page; + + dma_free_contiguous(dev, page, size); } -again: - if (!page) - page = alloc_pages_node(node, gfp, get_order(size)); - if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) { + + while ((page = alloc_pages_node(node, gfp, get_order(size))) + && !dma_coherent_ok(dev, page_to_phys(page), size)) { __free_pages(page, get_order(size)); - page = NULL; if (IS_ENABLED(CONFIG_ZONE_DMA32) && phys_limit < DMA_BIT_MASK(64) && - !(gfp & (GFP_DMA32 | GFP_DMA))) { + !(gfp & (GFP_DMA32 | GFP_DMA))) gfp |= GFP_DMA32; - goto again; - } - - if (IS_ENABLED(CONFIG_ZONE_DMA) && !(gfp & GFP_DMA)) { + else if (IS_ENABLED(CONFIG_ZONE_DMA) && !(gfp & GFP_DMA)) gfp = (gfp & ~GFP_DMA32) | GFP_DMA; - goto again; - } + else + return NULL; } return page; @@ -453,7 +448,7 @@ void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl, if (sg_dma_is_bus_address(sg)) sg_dma_unmark_bus_address(sg); else - dma_direct_unmap_page(dev, sg->dma_address, + dma_direct_unmap_phys(dev, sg->dma_address, sg_dma_len(sg), dir, attrs); } } @@ -476,8 +471,8 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents, */ break; case PCI_P2PDMA_MAP_NONE: - sg->dma_address = dma_direct_map_page(dev, sg_page(sg), - sg->offset, sg->length, dir, attrs); + sg->dma_address = dma_direct_map_phys(dev, sg_phys(sg), + sg->length, dir, attrs); if (sg->dma_address == DMA_MAPPING_ERROR) { ret = -EIO; goto out_unmap; @@ -502,22 +497,6 @@ out_unmap: return ret; } -dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir, unsigned long attrs) -{ - dma_addr_t dma_addr = paddr; - - if (unlikely(!dma_capable(dev, dma_addr, size, false))) { - dev_err_once(dev, - "DMA addr %pad+%zu overflow (mask %llx, bus limit %llx).\n", - &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit); - WARN_ON_ONCE(1); - return DMA_MAPPING_ERROR; - } - - return dma_addr; -} - int dma_direct_get_sgtable(struct device *dev, struct sg_table *sgt, void *cpu_addr, dma_addr_t dma_addr, size_t size, unsigned long attrs) diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h index d2c0b7e632fc..da2fadf45bcd 100644 --- a/kernel/dma/direct.h +++ 
b/kernel/dma/direct.h @@ -80,42 +80,57 @@ static inline void dma_direct_sync_single_for_cpu(struct device *dev, arch_dma_mark_clean(paddr, size); } -static inline dma_addr_t dma_direct_map_page(struct device *dev, - struct page *page, unsigned long offset, size_t size, - enum dma_data_direction dir, unsigned long attrs) +static inline dma_addr_t dma_direct_map_phys(struct device *dev, + phys_addr_t phys, size_t size, enum dma_data_direction dir, + unsigned long attrs) { - phys_addr_t phys = page_to_phys(page) + offset; - dma_addr_t dma_addr = phys_to_dma(dev, phys); + dma_addr_t dma_addr; if (is_swiotlb_force_bounce(dev)) { - if (is_pci_p2pdma_page(page)) - return DMA_MAPPING_ERROR; + if (attrs & DMA_ATTR_MMIO) + goto err_overflow; + return swiotlb_map(dev, phys, size, dir, attrs); } - if (unlikely(!dma_capable(dev, dma_addr, size, true)) || - dma_kmalloc_needs_bounce(dev, size, dir)) { - if (is_pci_p2pdma_page(page)) - return DMA_MAPPING_ERROR; - if (is_swiotlb_active(dev)) - return swiotlb_map(dev, phys, size, dir, attrs); - - dev_WARN_ONCE(dev, 1, - "DMA addr %pad+%zu overflow (mask %llx, bus limit %llx).\n", - &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit); - return DMA_MAPPING_ERROR; + if (attrs & DMA_ATTR_MMIO) { + dma_addr = phys; + if (unlikely(!dma_capable(dev, dma_addr, size, false))) + goto err_overflow; + } else { + dma_addr = phys_to_dma(dev, phys); + if (unlikely(!dma_capable(dev, dma_addr, size, true)) || + dma_kmalloc_needs_bounce(dev, size, dir)) { + if (is_swiotlb_active(dev)) + return swiotlb_map(dev, phys, size, dir, attrs); + + goto err_overflow; + } } - if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) + if (!dev_is_dma_coherent(dev) && + !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO))) arch_sync_dma_for_device(phys, size, dir); return dma_addr; + +err_overflow: + dev_WARN_ONCE( + dev, 1, + "DMA addr %pad+%zu overflow (mask %llx, bus limit %llx).\n", + &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit); + return DMA_MAPPING_ERROR; } -static inline void dma_direct_unmap_page(struct device *dev, dma_addr_t addr, +static inline void dma_direct_unmap_phys(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir, unsigned long attrs) { - phys_addr_t phys = dma_to_phys(dev, addr); + phys_addr_t phys; + + if (attrs & DMA_ATTR_MMIO) + /* nothing to do: uncached and no swiotlb */ + return; + phys = dma_to_phys(dev, addr); if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) dma_direct_sync_single_for_cpu(dev, addr, size, dir); diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 56de28a3b179..fe7472f13b10 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -152,11 +152,11 @@ static inline bool dma_map_direct(struct device *dev, return dma_go_direct(dev, *dev->dma_mask, ops); } -dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page, - size_t offset, size_t size, enum dma_data_direction dir, - unsigned long attrs) +dma_addr_t dma_map_phys(struct device *dev, phys_addr_t phys, size_t size, + enum dma_data_direction dir, unsigned long attrs) { const struct dma_map_ops *ops = get_dma_ops(dev); + bool is_mmio = attrs & DMA_ATTR_MMIO; dma_addr_t addr; BUG_ON(!valid_dma_direction(dir)); @@ -165,36 +165,81 @@ dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page, return DMA_MAPPING_ERROR; if (dma_map_direct(dev, ops) || - arch_dma_map_page_direct(dev, page_to_phys(page) + offset + size)) - addr = dma_direct_map_page(dev, page, offset, size, dir, attrs); + (!is_mmio && arch_dma_map_phys_direct(dev, 
phys + size))) + addr = dma_direct_map_phys(dev, phys, size, dir, attrs); else if (use_dma_iommu(dev)) - addr = iommu_dma_map_page(dev, page, offset, size, dir, attrs); - else + addr = iommu_dma_map_phys(dev, phys, size, dir, attrs); + else if (is_mmio) { + if (!ops->map_resource) + return DMA_MAPPING_ERROR; + + addr = ops->map_resource(dev, phys, size, dir, attrs); + } else { + struct page *page = phys_to_page(phys); + size_t offset = offset_in_page(phys); + + /* + * The dma_ops API contract for ops->map_page() requires + * kmappable memory, while ops->map_resource() does not. + */ addr = ops->map_page(dev, page, offset, size, dir, attrs); - kmsan_handle_dma(page, offset, size, dir); - trace_dma_map_page(dev, page_to_phys(page) + offset, addr, size, dir, - attrs); - debug_dma_map_page(dev, page, offset, size, dir, addr, attrs); + } + + if (!is_mmio) + kmsan_handle_dma(phys, size, dir); + trace_dma_map_phys(dev, phys, addr, size, dir, attrs); + debug_dma_map_phys(dev, phys, size, dir, addr, attrs); return addr; } +EXPORT_SYMBOL_GPL(dma_map_phys); + +dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page, + size_t offset, size_t size, enum dma_data_direction dir, + unsigned long attrs) +{ + phys_addr_t phys = page_to_phys(page) + offset; + + if (unlikely(attrs & DMA_ATTR_MMIO)) + return DMA_MAPPING_ERROR; + + if (IS_ENABLED(CONFIG_DMA_API_DEBUG) && + WARN_ON_ONCE(is_zone_device_page(page))) + return DMA_MAPPING_ERROR; + + return dma_map_phys(dev, phys, size, dir, attrs); +} EXPORT_SYMBOL(dma_map_page_attrs); -void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size, +void dma_unmap_phys(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir, unsigned long attrs) { const struct dma_map_ops *ops = get_dma_ops(dev); + bool is_mmio = attrs & DMA_ATTR_MMIO; BUG_ON(!valid_dma_direction(dir)); if (dma_map_direct(dev, ops) || - arch_dma_unmap_page_direct(dev, addr + size)) - dma_direct_unmap_page(dev, addr, size, dir, attrs); + (!is_mmio && arch_dma_unmap_phys_direct(dev, addr + size))) + dma_direct_unmap_phys(dev, addr, size, dir, attrs); else if (use_dma_iommu(dev)) - iommu_dma_unmap_page(dev, addr, size, dir, attrs); - else + iommu_dma_unmap_phys(dev, addr, size, dir, attrs); + else if (is_mmio) { + if (ops->unmap_resource) + ops->unmap_resource(dev, addr, size, dir, attrs); + } else ops->unmap_page(dev, addr, size, dir, attrs); - trace_dma_unmap_page(dev, addr, size, dir, attrs); - debug_dma_unmap_page(dev, addr, size, dir); + trace_dma_unmap_phys(dev, addr, size, dir, attrs); + debug_dma_unmap_phys(dev, addr, size, dir); +} +EXPORT_SYMBOL_GPL(dma_unmap_phys); + +void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size, + enum dma_data_direction dir, unsigned long attrs) +{ + if (unlikely(attrs & DMA_ATTR_MMIO)) + return; + + dma_unmap_phys(dev, addr, size, dir, attrs); } EXPORT_SYMBOL(dma_unmap_page_attrs); @@ -321,41 +366,18 @@ EXPORT_SYMBOL(dma_unmap_sg_attrs); dma_addr_t dma_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size, enum dma_data_direction dir, unsigned long attrs) { - const struct dma_map_ops *ops = get_dma_ops(dev); - dma_addr_t addr = DMA_MAPPING_ERROR; - - BUG_ON(!valid_dma_direction(dir)); - - if (WARN_ON_ONCE(!dev->dma_mask)) + if (IS_ENABLED(CONFIG_DMA_API_DEBUG) && + WARN_ON_ONCE(pfn_valid(PHYS_PFN(phys_addr)))) return DMA_MAPPING_ERROR; - if (dma_map_direct(dev, ops)) - addr = dma_direct_map_resource(dev, phys_addr, size, dir, attrs); - else if (use_dma_iommu(dev)) - addr = 
iommu_dma_map_resource(dev, phys_addr, size, dir, attrs); - else if (ops->map_resource) - addr = ops->map_resource(dev, phys_addr, size, dir, attrs); - - trace_dma_map_resource(dev, phys_addr, addr, size, dir, attrs); - debug_dma_map_resource(dev, phys_addr, size, dir, addr, attrs); - return addr; + return dma_map_phys(dev, phys_addr, size, dir, attrs | DMA_ATTR_MMIO); } EXPORT_SYMBOL(dma_map_resource); void dma_unmap_resource(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir, unsigned long attrs) { - const struct dma_map_ops *ops = get_dma_ops(dev); - - BUG_ON(!valid_dma_direction(dir)); - if (dma_map_direct(dev, ops)) - ; /* nothing to do: uncached and no swiotlb */ - else if (use_dma_iommu(dev)) - iommu_dma_unmap_resource(dev, addr, size, dir, attrs); - else if (ops->unmap_resource) - ops->unmap_resource(dev, addr, size, dir, attrs); - trace_dma_unmap_resource(dev, addr, size, dir, attrs); - debug_dma_unmap_resource(dev, addr, size, dir); + dma_unmap_phys(dev, addr, size, dir, attrs | DMA_ATTR_MMIO); } EXPORT_SYMBOL(dma_unmap_resource); diff --git a/kernel/dma/ops_helpers.c b/kernel/dma/ops_helpers.c index 9afd569eadb9..6f9d604d9d40 100644 --- a/kernel/dma/ops_helpers.c +++ b/kernel/dma/ops_helpers.c @@ -72,8 +72,8 @@ struct page *dma_common_alloc_pages(struct device *dev, size_t size, return NULL; if (use_dma_iommu(dev)) - *dma_handle = iommu_dma_map_page(dev, page, 0, size, dir, - DMA_ATTR_SKIP_CPU_SYNC); + *dma_handle = iommu_dma_map_phys(dev, page_to_phys(page), size, + dir, DMA_ATTR_SKIP_CPU_SYNC); else *dma_handle = ops->map_page(dev, page, 0, size, dir, DMA_ATTR_SKIP_CPU_SYNC); @@ -92,7 +92,7 @@ void dma_common_free_pages(struct device *dev, size_t size, struct page *page, const struct dma_map_ops *ops = get_dma_ops(dev); if (use_dma_iommu(dev)) - iommu_dma_unmap_page(dev, dma_handle, size, dir, + iommu_dma_unmap_phys(dev, dma_handle, size, dir, DMA_ATTR_SKIP_CPU_SYNC); else if (ops->unmap_page) ops->unmap_page(dev, dma_handle, size, dir, diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index abcf3fa63a56..0d37da3d95b6 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -1209,7 +1209,7 @@ static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr, nslabs = nr_slots(alloc_size); phys_limit = min_not_zero(*dev->dma_mask, dev->bus_dma_limit); pool = swiotlb_alloc_pool(dev, nslabs, nslabs, 1, phys_limit, - GFP_NOWAIT | __GFP_NOWARN); + GFP_NOWAIT); if (!pool) return -1; diff --git a/kernel/entry/Makefile b/kernel/entry/Makefile index 77fcd83dd663..2333d70802e4 100644 --- a/kernel/entry/Makefile +++ b/kernel/entry/Makefile @@ -14,4 +14,4 @@ CFLAGS_common.o += -fno-stack-protector obj-$(CONFIG_GENERIC_IRQ_ENTRY) += common.o obj-$(CONFIG_GENERIC_SYSCALL) += syscall-common.o syscall_user_dispatch.o -obj-$(CONFIG_KVM_XFER_TO_GUEST_WORK) += kvm.o +obj-$(CONFIG_VIRT_XFER_TO_GUEST_WORK) += virt.o diff --git a/kernel/entry/kvm.c b/kernel/entry/virt.c index 8485f63863af..c52f99249763 100644 --- a/kernel/entry/kvm.c +++ b/kernel/entry/virt.c @@ -1,17 +1,14 @@ // SPDX-License-Identifier: GPL-2.0 -#include <linux/entry-kvm.h> -#include <linux/kvm_host.h> +#include <linux/entry-virt.h> -static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work) +static int xfer_to_guest_mode_work(unsigned long ti_work) { do { int ret; - if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) { - kvm_handle_signal_exit(vcpu); + if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) return -EINTR; - } if (ti_work & (_TIF_NEED_RESCHED | 
_TIF_NEED_RESCHED_LAZY)) schedule(); @@ -19,7 +16,7 @@ static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work) if (ti_work & _TIF_NOTIFY_RESUME) resume_user_mode_work(NULL); - ret = arch_xfer_to_guest_mode_handle_work(vcpu, ti_work); + ret = arch_xfer_to_guest_mode_handle_work(ti_work); if (ret) return ret; @@ -28,7 +25,7 @@ static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work) return 0; } -int xfer_to_guest_mode_handle_work(struct kvm_vcpu *vcpu) +int xfer_to_guest_mode_handle_work(void) { unsigned long ti_work; @@ -44,6 +41,6 @@ int xfer_to_guest_mode_handle_work(struct kvm_vcpu *vcpu) if (!(ti_work & XFER_TO_GUEST_MODE_WORK)) return 0; - return xfer_to_guest_mode_work(vcpu, ti_work); + return xfer_to_guest_mode_work(ti_work); } EXPORT_SYMBOL_GPL(xfer_to_guest_mode_handle_work); diff --git a/kernel/events/core.c b/kernel/events/core.c index 7541f6f85fcb..177e57c1a362 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -9403,7 +9403,7 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) flags |= MAP_HUGETLB; if (file) { - struct inode *inode; + const struct inode *inode; dev_t dev; buf = kmalloc(PATH_MAX, GFP_KERNEL); @@ -9416,12 +9416,12 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) * need to add enough zero bytes after the string to handle * the 64bit alignment we do later. */ - name = file_path(file, buf, PATH_MAX - sizeof(u64)); + name = d_path(file_user_path(file), buf, PATH_MAX - sizeof(u64)); if (IS_ERR(name)) { name = "//toolong"; goto cpy_name; } - inode = file_inode(vma->vm_file); + inode = file_user_inode(vma->vm_file); dev = inode->i_sb->s_dev; ino = inode->i_ino; gen = inode->i_generation; @@ -9492,7 +9492,7 @@ static bool perf_addr_filter_match(struct perf_addr_filter *filter, if (!filter->path.dentry) return false; - if (d_inode(filter->path.dentry) != file_inode(file)) + if (d_inode(filter->path.dentry) != file_user_inode(file)) return false; if (filter->offset > offset + size) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 8709c69118b5..f11ceb8be8c4 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -2765,6 +2765,9 @@ static void handle_swbp(struct pt_regs *regs) handler_chain(uprobe, regs); + /* Try to optimize after first hit. */ + arch_uprobe_optimize(&uprobe->arch, bp_vaddr); + /* * If user decided to take execution elsewhere, it makes little sense * to execute the original instruction, so let's skip it. @@ -2772,9 +2775,6 @@ static void handle_swbp(struct pt_regs *regs) if (instruction_pointer(regs) != bp_vaddr) goto out; - /* Try to optimize after first hit. 
*/ - arch_uprobe_optimize(&uprobe->arch, bp_vaddr); - if (arch_uprobe_skip_sstep(&uprobe->arch, regs)) goto out; diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c index 5083c68c3a4e..76f0940fb485 100644 --- a/kernel/kexec_handover.c +++ b/kernel/kexec_handover.c @@ -18,6 +18,7 @@ #include <linux/memblock.h> #include <linux/notifier.h> #include <linux/page-isolation.h> +#include <linux/vmalloc.h> #include <asm/early_ioremap.h> @@ -107,6 +108,29 @@ struct kho_serialization { struct khoser_mem_chunk *preserved_mem_map; }; +struct kho_out { + struct blocking_notifier_head chain_head; + + struct dentry *dir; + + struct mutex lock; /* protects KHO FDT finalization */ + + struct kho_serialization ser; + bool finalized; +}; + +static struct kho_out kho_out = { + .chain_head = BLOCKING_NOTIFIER_INIT(kho_out.chain_head), + .lock = __MUTEX_INITIALIZER(kho_out.lock), + .ser = { + .fdt_list = LIST_HEAD_INIT(kho_out.ser.fdt_list), + .track = { + .orders = XARRAY_INIT(kho_out.ser.track.orders, 0), + }, + }, + .finalized = false, +}; + static void *xa_load_or_alloc(struct xarray *xa, unsigned long index, size_t sz) { void *elm, *res; @@ -165,6 +189,9 @@ static int __kho_preserve_order(struct kho_mem_track *track, unsigned long pfn, might_sleep(); + if (kho_out.finalized) + return -EBUSY; + physxa = xa_load(&track->orders, order); if (!physxa) { int err; @@ -248,6 +275,37 @@ struct folio *kho_restore_folio(phys_addr_t phys) } EXPORT_SYMBOL_GPL(kho_restore_folio); +/** + * kho_restore_pages - restore list of contiguous order 0 pages. + * @phys: physical address of the first page. + * @nr_pages: number of pages. + * + * Restore a contiguous list of order 0 pages that was preserved with + * kho_preserve_pages(). + * + * Return: 0 on success, error code on failure + */ +struct page *kho_restore_pages(phys_addr_t phys, unsigned int nr_pages) +{ + const unsigned long start_pfn = PHYS_PFN(phys); + const unsigned long end_pfn = start_pfn + nr_pages; + unsigned long pfn = start_pfn; + + while (pfn < end_pfn) { + const unsigned int order = + min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn)); + struct page *page = kho_restore_page(PFN_PHYS(pfn)); + + if (!page) + return NULL; + split_page(page, order); + pfn += 1 << order; + } + + return pfn_to_page(start_pfn); +} +EXPORT_SYMBOL_GPL(kho_restore_pages); + /* Serialize and deserialize struct kho_mem_phys across kexec * * Record all the bitmaps in a linked list of pages for the next kernel to @@ -667,29 +725,6 @@ int kho_add_subtree(struct kho_serialization *ser, const char *name, void *fdt) } EXPORT_SYMBOL_GPL(kho_add_subtree); -struct kho_out { - struct blocking_notifier_head chain_head; - - struct dentry *dir; - - struct mutex lock; /* protects KHO FDT finalization */ - - struct kho_serialization ser; - bool finalized; -}; - -static struct kho_out kho_out = { - .chain_head = BLOCKING_NOTIFIER_INIT(kho_out.chain_head), - .lock = __MUTEX_INITIALIZER(kho_out.lock), - .ser = { - .fdt_list = LIST_HEAD_INIT(kho_out.ser.fdt_list), - .track = { - .orders = XARRAY_INIT(kho_out.ser.track.orders, 0), - }, - }, - .finalized = false, -}; - int register_kho_notifier(struct notifier_block *nb) { return blocking_notifier_chain_register(&kho_out.chain_head, nb); @@ -717,37 +752,28 @@ int kho_preserve_folio(struct folio *folio) const unsigned int order = folio_order(folio); struct kho_mem_track *track = &kho_out.ser.track; - if (kho_out.finalized) - return -EBUSY; - return __kho_preserve_order(track, pfn, order); } EXPORT_SYMBOL_GPL(kho_preserve_folio); /** - * 
kho_preserve_phys - preserve a physically contiguous range across kexec. - * @phys: physical address of the range. - * @size: size of the range. + * kho_preserve_pages - preserve contiguous pages across kexec + * @page: first page in the list. + * @nr_pages: number of pages. * - * Instructs KHO to preserve the memory range from @phys to @phys + @size - * across kexec. + * Preserve a contiguous list of order 0 pages. Must be restored using + * kho_restore_pages() to ensure the pages are restored properly as order 0. * * Return: 0 on success, error code on failure */ -int kho_preserve_phys(phys_addr_t phys, size_t size) +int kho_preserve_pages(struct page *page, unsigned int nr_pages) { - unsigned long pfn = PHYS_PFN(phys); + struct kho_mem_track *track = &kho_out.ser.track; + const unsigned long start_pfn = page_to_pfn(page); + const unsigned long end_pfn = start_pfn + nr_pages; + unsigned long pfn = start_pfn; unsigned long failed_pfn = 0; - const unsigned long start_pfn = pfn; - const unsigned long end_pfn = PHYS_PFN(phys + size); int err = 0; - struct kho_mem_track *track = &kho_out.ser.track; - - if (kho_out.finalized) - return -EBUSY; - - if (!PAGE_ALIGNED(phys) || !PAGE_ALIGNED(size)) - return -EINVAL; while (pfn < end_pfn) { const unsigned int order = @@ -767,7 +793,256 @@ int kho_preserve_phys(phys_addr_t phys, size_t size) return err; } -EXPORT_SYMBOL_GPL(kho_preserve_phys); +EXPORT_SYMBOL_GPL(kho_preserve_pages); + +struct kho_vmalloc_hdr { + DECLARE_KHOSER_PTR(next, struct kho_vmalloc_chunk *); +}; + +#define KHO_VMALLOC_SIZE \ + ((PAGE_SIZE - sizeof(struct kho_vmalloc_hdr)) / \ + sizeof(phys_addr_t)) + +struct kho_vmalloc_chunk { + struct kho_vmalloc_hdr hdr; + phys_addr_t phys[KHO_VMALLOC_SIZE]; +}; + +static_assert(sizeof(struct kho_vmalloc_chunk) == PAGE_SIZE); + +/* vmalloc flags KHO supports */ +#define KHO_VMALLOC_SUPPORTED_FLAGS (VM_ALLOC | VM_ALLOW_HUGE_VMAP) + +/* KHO internal flags for vmalloc preservations */ +#define KHO_VMALLOC_ALLOC 0x0001 +#define KHO_VMALLOC_HUGE_VMAP 0x0002 + +static unsigned short vmalloc_flags_to_kho(unsigned int vm_flags) +{ + unsigned short kho_flags = 0; + + if (vm_flags & VM_ALLOC) + kho_flags |= KHO_VMALLOC_ALLOC; + if (vm_flags & VM_ALLOW_HUGE_VMAP) + kho_flags |= KHO_VMALLOC_HUGE_VMAP; + + return kho_flags; +} + +static unsigned int kho_flags_to_vmalloc(unsigned short kho_flags) +{ + unsigned int vm_flags = 0; + + if (kho_flags & KHO_VMALLOC_ALLOC) + vm_flags |= VM_ALLOC; + if (kho_flags & KHO_VMALLOC_HUGE_VMAP) + vm_flags |= VM_ALLOW_HUGE_VMAP; + + return vm_flags; +} + +static struct kho_vmalloc_chunk *new_vmalloc_chunk(struct kho_vmalloc_chunk *cur) +{ + struct kho_vmalloc_chunk *chunk; + int err; + + chunk = (struct kho_vmalloc_chunk *)get_zeroed_page(GFP_KERNEL); + if (!chunk) + return NULL; + + err = kho_preserve_pages(virt_to_page(chunk), 1); + if (err) + goto err_free; + if (cur) + KHOSER_STORE_PTR(cur->hdr.next, chunk); + return chunk; + +err_free: + free_page((unsigned long)chunk); + return NULL; +} + +static void kho_vmalloc_unpreserve_chunk(struct kho_vmalloc_chunk *chunk) +{ + struct kho_mem_track *track = &kho_out.ser.track; + unsigned long pfn = PHYS_PFN(virt_to_phys(chunk)); + + __kho_unpreserve(track, pfn, pfn + 1); + + for (int i = 0; chunk->phys[i]; i++) { + pfn = PHYS_PFN(chunk->phys[i]); + __kho_unpreserve(track, pfn, pfn + 1); + } +} + +static void kho_vmalloc_free_chunks(struct kho_vmalloc *kho_vmalloc) +{ + struct kho_vmalloc_chunk *chunk = KHOSER_LOAD_PTR(kho_vmalloc->first); + + while (chunk) { + struct 
kho_vmalloc_chunk *tmp = chunk; + + kho_vmalloc_unpreserve_chunk(chunk); + + chunk = KHOSER_LOAD_PTR(chunk->hdr.next); + free_page((unsigned long)tmp); + } +} + +/** + * kho_preserve_vmalloc - preserve memory allocated with vmalloc() across kexec + * @ptr: pointer to the area in vmalloc address space + * @preservation: placeholder for preservation metadata + * + * Instructs KHO to preserve the area in vmalloc address space at @ptr. The + * physical pages mapped at @ptr will be preserved and on successful return + * @preservation will hold the physical address of a structure that describes + * the preservation. + * + * NOTE: The memory allocated with vmalloc_node() variants cannot be reliably + * restored on the same node + * + * Return: 0 on success, error code on failure + */ +int kho_preserve_vmalloc(void *ptr, struct kho_vmalloc *preservation) +{ + struct kho_vmalloc_chunk *chunk; + struct vm_struct *vm = find_vm_area(ptr); + unsigned int order, flags, nr_contig_pages; + unsigned int idx = 0; + int err; + + if (!vm) + return -EINVAL; + + if (vm->flags & ~KHO_VMALLOC_SUPPORTED_FLAGS) + return -EOPNOTSUPP; + + flags = vmalloc_flags_to_kho(vm->flags); + order = get_vm_area_page_order(vm); + + chunk = new_vmalloc_chunk(NULL); + if (!chunk) + return -ENOMEM; + KHOSER_STORE_PTR(preservation->first, chunk); + + nr_contig_pages = (1 << order); + for (int i = 0; i < vm->nr_pages; i += nr_contig_pages) { + phys_addr_t phys = page_to_phys(vm->pages[i]); + + err = kho_preserve_pages(vm->pages[i], nr_contig_pages); + if (err) + goto err_free; + + chunk->phys[idx++] = phys; + if (idx == ARRAY_SIZE(chunk->phys)) { + chunk = new_vmalloc_chunk(chunk); + if (!chunk) + goto err_free; + idx = 0; + } + } + + preservation->total_pages = vm->nr_pages; + preservation->flags = flags; + preservation->order = order; + + return 0; + +err_free: + kho_vmalloc_free_chunks(preservation); + return err; +} +EXPORT_SYMBOL_GPL(kho_preserve_vmalloc); + +/** + * kho_restore_vmalloc - recreates and populates an area in vmalloc address + * space from the preserved memory. + * @preservation: preservation metadata. + * + * Recreates an area in vmalloc address space and populates it with memory that + * was preserved using kho_preserve_vmalloc(). + * + * Return: pointer to the area in the vmalloc address space, NULL on failure. 
+ */ +void *kho_restore_vmalloc(const struct kho_vmalloc *preservation) +{ + struct kho_vmalloc_chunk *chunk = KHOSER_LOAD_PTR(preservation->first); + unsigned int align, order, shift, vm_flags; + unsigned long total_pages, contig_pages; + unsigned long addr, size; + struct vm_struct *area; + struct page **pages; + unsigned int idx = 0; + int err; + + vm_flags = kho_flags_to_vmalloc(preservation->flags); + if (vm_flags & ~KHO_VMALLOC_SUPPORTED_FLAGS) + return NULL; + + total_pages = preservation->total_pages; + pages = kvmalloc_array(total_pages, sizeof(*pages), GFP_KERNEL); + if (!pages) + return NULL; + order = preservation->order; + contig_pages = (1 << order); + shift = PAGE_SHIFT + order; + align = 1 << shift; + + while (chunk) { + struct page *page; + + for (int i = 0; chunk->phys[i]; i++) { + phys_addr_t phys = chunk->phys[i]; + + if (idx + contig_pages > total_pages) + goto err_free_pages_array; + + page = kho_restore_pages(phys, contig_pages); + if (!page) + goto err_free_pages_array; + + for (int j = 0; j < contig_pages; j++) + pages[idx++] = page; + + phys += contig_pages * PAGE_SIZE; + } + + page = kho_restore_pages(virt_to_phys(chunk), 1); + if (!page) + goto err_free_pages_array; + chunk = KHOSER_LOAD_PTR(chunk->hdr.next); + __free_page(page); + } + + if (idx != total_pages) + goto err_free_pages_array; + + area = __get_vm_area_node(total_pages * PAGE_SIZE, align, shift, + vm_flags, VMALLOC_START, VMALLOC_END, + NUMA_NO_NODE, GFP_KERNEL, + __builtin_return_address(0)); + if (!area) + goto err_free_pages_array; + + addr = (unsigned long)area->addr; + size = get_vm_area_size(area); + err = vmap_pages_range(addr, addr + size, PAGE_KERNEL, pages, shift); + if (err) + goto err_free_vm_area; + + area->nr_pages = total_pages; + area->pages = pages; + + return area->addr; + +err_free_vm_area: + free_vm_area(area); +err_free_pages_array: + kvfree(pages); + return NULL; +} +EXPORT_SYMBOL_GPL(kho_restore_vmalloc); /* Handling for debug/kho/out */ diff --git a/kernel/padata.c b/kernel/padata.c index f85f8bd788d0..f4def028c48c 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -291,8 +291,12 @@ static void padata_reorder(struct padata_priv *padata) struct padata_serial_queue *squeue; int cb_cpu; - cpu = cpumask_next_wrap(cpu, pd->cpumask.pcpu); processed++; + /* When sequence wraps around, reset to the first CPU. */ + if (unlikely(processed == 0)) + cpu = cpumask_first(pd->cpumask.pcpu); + else + cpu = cpumask_next_wrap(cpu, pd->cpumask.pcpu); cb_cpu = padata->cb_cpu; squeue = per_cpu_ptr(pd->squeue, cb_cpu); @@ -486,9 +490,9 @@ void __init padata_do_multithreaded(struct padata_mt_job *job) do { nid = next_node_in(old_node, node_states[N_CPU]); } while (!atomic_try_cmpxchg(&last_used_nid, &old_node, nid)); - queue_work_node(nid, system_unbound_wq, &pw->pw_work); + queue_work_node(nid, system_dfl_wq, &pw->pw_work); } else { - queue_work(system_unbound_wq, &pw->pw_work); + queue_work(system_dfl_wq, &pw->pw_work); } /* Use the current thread, which saves starting a workqueue worker. 
*/ @@ -963,8 +967,9 @@ struct padata_instance *padata_alloc(const char *name) cpus_read_lock(); - pinst->serial_wq = alloc_workqueue("%s_serial", WQ_MEM_RECLAIM | - WQ_CPU_INTENSIVE, 1, name); + pinst->serial_wq = alloc_workqueue("%s_serial", + WQ_MEM_RECLAIM | WQ_CPU_INTENSIVE | WQ_PERCPU, + 1, name); if (!pinst->serial_wq) goto err_put_cpus; diff --git a/kernel/printk/.kunitconfig b/kernel/printk/.kunitconfig new file mode 100644 index 000000000000..f31458fd1a92 --- /dev/null +++ b/kernel/printk/.kunitconfig @@ -0,0 +1,3 @@ +CONFIG_KUNIT=y +CONFIG_PRINTK=y +CONFIG_PRINTK_RINGBUFFER_KUNIT_TEST=y diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile index 39a2b61c7232..f8004ac3983d 100644 --- a/kernel/printk/Makefile +++ b/kernel/printk/Makefile @@ -7,3 +7,5 @@ obj-$(CONFIG_PRINTK_INDEX) += index.o obj-$(CONFIG_PRINTK) += printk_support.o printk_support-y := printk_ringbuffer.o printk_support-$(CONFIG_SYSCTL) += sysctl.o + +obj-$(CONFIG_PRINTK_RINGBUFFER_KUNIT_TEST) += printk_ringbuffer_kunit_test.o diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c index e2a1b2d34d2b..40198bffb7d0 100644 --- a/kernel/printk/printk_ringbuffer.c +++ b/kernel/printk/printk_ringbuffer.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 +#include <kunit/visibility.h> #include <linux/kernel.h> #include <linux/irqflags.h> #include <linux/string.h> @@ -393,25 +394,21 @@ static unsigned int to_blk_size(unsigned int size) * Sanity checker for reserve size. The ringbuffer code assumes that a data * block does not exceed the maximum possible size that could fit within the * ringbuffer. This function provides that basic size check so that the - * assumption is safe. + * assumption is safe. In particular, it guarantees that data_push_tail() will + * never attempt to push the tail beyond the head. */ static bool data_check_size(struct prb_data_ring *data_ring, unsigned int size) { - struct prb_data_block *db = NULL; - + /* Data-less blocks take no space. */ if (size == 0) return true; /* - * Ensure the alignment padded size could possibly fit in the data - * array. The largest possible data block must still leave room for - * at least the ID of the next block. + * If data blocks were allowed to be larger than half the data ring + * size, a wrapping data block could require more space than the full + * ringbuffer. */ - size = to_blk_size(size); - if (size > DATA_SIZE(data_ring) - sizeof(db->id)) - return false; - - return true; + return to_blk_size(size) <= DATA_SIZE(data_ring) / 2; } /* Query the state of a descriptor. */ @@ -1051,8 +1048,17 @@ static char *data_alloc(struct printk_ringbuffer *rb, unsigned int size, do { next_lpos = get_next_lpos(data_ring, begin_lpos, size); - if (!data_push_tail(rb, next_lpos - DATA_SIZE(data_ring))) { - /* Failed to allocate, specify a data-less block. */ + /* + * data_check_size() prevents data block allocation that could + * cause illegal ringbuffer states. But double check that the + * used space will not be bigger than the ring buffer. Wrapped + * messages need to reserve more space, see get_next_lpos(). + * + * Specify a data-less block when the check or the allocation + * fails. 
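+	 *
+	 * A worked example with assumed sizes: if DATA_SIZE() is 1024,
+	 * data_check_size() caps a block at 512 bytes. A 400-byte block
+	 * reserved at lpos 900 does not fit in the 124 bytes before the
+	 * boundary, so those 124 bytes become padding and the data starts
+	 * at the wrap, consuming 524 bytes in total. The skipped remainder
+	 * is always smaller than the block itself, so the reserved space
+	 * never exceeds DATA_SIZE().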
+ */ + if (WARN_ON_ONCE(next_lpos - begin_lpos > DATA_SIZE(data_ring)) || + !data_push_tail(rb, next_lpos - DATA_SIZE(data_ring))) { blk_lpos->begin = FAILED_LPOS; blk_lpos->next = FAILED_LPOS; return NULL; @@ -1140,8 +1146,18 @@ static char *data_realloc(struct printk_ringbuffer *rb, unsigned int size, return &blk->data[0]; } - if (!data_push_tail(rb, next_lpos - DATA_SIZE(data_ring))) + /* + * data_check_size() prevents data block reallocation that could + * cause illegal ringbuffer states. But double check that the + * new used space will not be bigger than the ring buffer. Wrapped + * messages need to reserve more space, see get_next_lpos(). + * + * Specify failure when the check or the allocation fails. + */ + if (WARN_ON_ONCE(next_lpos - blk_lpos->begin > DATA_SIZE(data_ring)) || + !data_push_tail(rb, next_lpos - DATA_SIZE(data_ring))) { return NULL; + } /* The memory barrier involvement is the same as data_alloc:A. */ if (!atomic_long_try_cmpxchg(&data_ring->head_lpos, &head_lpos, @@ -1685,6 +1701,7 @@ fail: memset(r, 0, sizeof(*r)); return false; } +EXPORT_SYMBOL_IF_KUNIT(prb_reserve); /* Commit the data (possibly finalizing it) and restore interrupts. */ static void _prb_commit(struct prb_reserved_entry *e, unsigned long state_val) @@ -1759,6 +1776,7 @@ void prb_commit(struct prb_reserved_entry *e) if (head_id != e->id) desc_make_final(e->rb, e->id); } +EXPORT_SYMBOL_IF_KUNIT(prb_commit); /** * prb_final_commit() - Commit and finalize (previously reserved) data to @@ -2184,6 +2202,7 @@ bool prb_read_valid(struct printk_ringbuffer *rb, u64 seq, { return _prb_read_valid(rb, &seq, r, NULL); } +EXPORT_SYMBOL_IF_KUNIT(prb_read_valid); /** * prb_read_valid_info() - Non-blocking read of meta data for a requested @@ -2333,6 +2352,7 @@ void prb_init(struct printk_ringbuffer *rb, infos[0].seq = -(u64)_DESCS_COUNT(descbits); infos[_DESCS_COUNT(descbits) - 1].seq = 0; } +EXPORT_SYMBOL_IF_KUNIT(prb_init); /** * prb_record_text_space() - Query the full actual used ringbuffer space for diff --git a/kernel/printk/printk_ringbuffer_kunit_test.c b/kernel/printk/printk_ringbuffer_kunit_test.c new file mode 100644 index 000000000000..2282348e869a --- /dev/null +++ b/kernel/printk/printk_ringbuffer_kunit_test.c @@ -0,0 +1,327 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/cpuhplock.h> +#include <linux/cpumask.h> +#include <linux/init.h> +#include <linux/kthread.h> +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/random.h> +#include <linux/slab.h> +#include <linux/timer.h> +#include <linux/wait.h> + +#include <kunit/resource.h> +#include <kunit/test.h> + +#include "printk_ringbuffer.h" + +/* + * This KUnit test checks the data integrity of the lockless printk_ringbuffer. + * From multiple CPUs it writes messages of varying length and content while + * a reader validates the correctness of the messages. + * + * IMPORTANT: The more CPUs you can use for this KUnit test, the better! + * + * The test works by starting "num_online_cpus() - 1" writer threads, each + * pinned to their own CPU. Each writer thread loops, writing data of varying + * length into a printk_ringbuffer as fast as possible. The data content is + * an embedded data struct followed by string content repeating the byte: + * + * 'A' + CPUID + * + * The reader runs on the remaining online CPU or, if there is only one + * CPU, on the same CPU as the writer.
+ * It ensures that the embedded struct content is consistent with the string + * and that the string is terminated and is composed of the same repeating + * byte as its first byte. + * + * Because the threads are running in such tight loops, they will call + * cond_resched() from time to time so the system stays functional. + * + * If the reader encounters an error, the test is aborted and some + * information about the error is reported. + * The runtime of the test can be configured with the runtime_ms module parameter. + * + * Note that the test is performed on a separate printk_ringbuffer instance + * and not the instance used by printk(). + */ + +static unsigned long runtime_ms = 10 * MSEC_PER_SEC; +module_param(runtime_ms, ulong, 0400); + +/* test data structure */ +struct prbtest_rbdata { + unsigned int size; + char text[] __counted_by(size); +}; + +#define MAX_RBDATA_TEXT_SIZE 0x80 +#define MAX_PRB_RECORD_SIZE (sizeof(struct prbtest_rbdata) + MAX_RBDATA_TEXT_SIZE) + +struct prbtest_data { + struct kunit *test; + struct printk_ringbuffer *ringbuffer; + /* used by writers to signal reader of new records */ + wait_queue_head_t new_record_wait; +}; + +struct prbtest_thread_data { + unsigned long num; + struct prbtest_data *test_data; +}; + +static void prbtest_fail_record(struct kunit *test, const struct prbtest_rbdata *dat, u64 seq) +{ + unsigned int len; + + len = dat->size - 1; + + KUNIT_FAIL(test, "BAD RECORD: seq=%llu size=%u text=%.*s\n", + seq, dat->size, + len < MAX_RBDATA_TEXT_SIZE ? len : -1, + len < MAX_RBDATA_TEXT_SIZE ? dat->text : "<invalid>"); +} + +static bool prbtest_check_data(const struct prbtest_rbdata *dat) +{ + unsigned int len; + + /* Sane size? At least one character + trailing '\0' */ + if (dat->size < 2 || dat->size > MAX_RBDATA_TEXT_SIZE) + return false; + + len = dat->size - 1; + if (dat->text[len] != '\0') + return false; + + /* Does the string repeat the same character throughout? */ + while (len--) { + if (dat->text[len] != dat->text[0]) + return false; + } + + return true; +} + +static int prbtest_writer(void *data) +{ + struct prbtest_thread_data *tr = data; + char text_id = 'A' + tr->num; + struct prb_reserved_entry e; + struct prbtest_rbdata *dat; + u32 record_size, text_size; + unsigned long count = 0; + struct printk_record r; + + kunit_info(tr->test_data->test, "start thread %03lu (writer)\n", tr->num); + + for (;;) { + /* ensure at least 1 character + trailing '\0' */ + text_size = get_random_u32_inclusive(2, MAX_RBDATA_TEXT_SIZE); + if (WARN_ON_ONCE(text_size < 2)) + text_size = 2; + if (WARN_ON_ONCE(text_size > MAX_RBDATA_TEXT_SIZE)) + text_size = MAX_RBDATA_TEXT_SIZE; + + record_size = sizeof(struct prbtest_rbdata) + text_size; + WARN_ON_ONCE(record_size > MAX_PRB_RECORD_SIZE); + + /* specify the text sizes for reservation */ + prb_rec_init_wr(&r, record_size); + + /* + * Reservation can fail if: + * + * - No free descriptor is available. + * - The buffer is full, and the oldest record is reserved + * but not yet committed. + * + * It actually happens in this test because all CPUs are trying + * to write an unbounded number of messages in a tight loop. + * These failures are intentionally ignored because this test + * focuses on races, ringbuffer consistency, and pushing system + * usability limits.
+ */ + if (prb_reserve(&e, tr->test_data->ringbuffer, &r)) { + r.info->text_len = record_size; + + dat = (struct prbtest_rbdata *)r.text_buf; + dat->size = text_size; + memset(dat->text, text_id, text_size - 1); + dat->text[text_size - 1] = '\0'; + + prb_commit(&e); + + wake_up_interruptible(&tr->test_data->new_record_wait); + } + + if ((count++ & 0x3fff) == 0) + cond_resched(); + + if (kthread_should_stop()) + break; + } + + kunit_info(tr->test_data->test, "end thread %03lu: wrote=%lu\n", tr->num, count); + + return 0; +} + +struct prbtest_wakeup_timer { + struct timer_list timer; + struct task_struct *task; +}; + +static void prbtest_wakeup_callback(struct timer_list *timer) +{ + struct prbtest_wakeup_timer *wakeup = timer_container_of(wakeup, timer, timer); + + set_tsk_thread_flag(wakeup->task, TIF_NOTIFY_SIGNAL); + wake_up_process(wakeup->task); +} + +static int prbtest_reader(struct prbtest_data *test_data, unsigned long timeout_ms) +{ + struct prbtest_wakeup_timer wakeup; + char text_buf[MAX_PRB_RECORD_SIZE]; + unsigned long count = 0; + struct printk_info info; + struct printk_record r; + u64 seq = 0; + + wakeup.task = current; + timer_setup_on_stack(&wakeup.timer, prbtest_wakeup_callback, 0); + mod_timer(&wakeup.timer, jiffies + msecs_to_jiffies(timeout_ms)); + + prb_rec_init_rd(&r, &info, text_buf, sizeof(text_buf)); + + kunit_info(test_data->test, "start reader\n"); + + while (!wait_event_interruptible(test_data->new_record_wait, + prb_read_valid(test_data->ringbuffer, seq, &r))) { + /* check/track the sequence */ + if (info.seq < seq) + KUNIT_FAIL(test_data->test, "BAD SEQ READ: request=%llu read=%llu\n", + seq, info.seq); + + if (!prbtest_check_data((struct prbtest_rbdata *)r.text_buf)) + prbtest_fail_record(test_data->test, + (struct prbtest_rbdata *)r.text_buf, info.seq); + + if ((count++ & 0x3fff) == 0) + cond_resched(); + + seq = info.seq + 1; + } + + timer_delete_sync(&wakeup.timer); + timer_destroy_on_stack(&wakeup.timer); + + kunit_info(test_data->test, "end reader: read=%lu seq=%llu\n", count, info.seq); + + return 0; +} + +KUNIT_DEFINE_ACTION_WRAPPER(prbtest_cpumask_cleanup, free_cpumask_var, struct cpumask *); +KUNIT_DEFINE_ACTION_WRAPPER(prbtest_kthread_cleanup, kthread_stop, struct task_struct *); + +static void prbtest_add_cpumask_cleanup(struct kunit *test, cpumask_var_t mask) +{ + int err; + + err = kunit_add_action_or_reset(test, prbtest_cpumask_cleanup, mask); + KUNIT_ASSERT_EQ(test, err, 0); +} + +static void prbtest_add_kthread_cleanup(struct kunit *test, struct task_struct *kthread) +{ + int err; + + err = kunit_add_action_or_reset(test, prbtest_kthread_cleanup, kthread); + KUNIT_ASSERT_EQ(test, err, 0); +} + +static inline void prbtest_prb_reinit(struct printk_ringbuffer *rb) +{ + prb_init(rb, rb->text_data_ring.data, rb->text_data_ring.size_bits, rb->desc_ring.descs, + rb->desc_ring.count_bits, rb->desc_ring.infos); +} + +static void test_readerwriter(struct kunit *test) +{ + /* Equivalent to CONFIG_LOG_BUF_SHIFT=13 */ + DEFINE_PRINTKRB(test_rb, 8, 5); + + struct prbtest_thread_data *thread_data; + struct prbtest_data *test_data; + struct task_struct *thread; + cpumask_var_t test_cpus; + int cpu, reader_cpu; + + KUNIT_ASSERT_TRUE(test, alloc_cpumask_var(&test_cpus, GFP_KERNEL)); + prbtest_add_cpumask_cleanup(test, test_cpus); + + cpus_read_lock(); + /* + * Failure of KUNIT_ASSERT() kills the current task + * so it can not be called while the CPU hotplug lock is held. + * Instead use a snapshot of the online CPUs. 
* If they change during test execution, it is unfortunate but not a grave error. + */ + cpumask_copy(test_cpus, cpu_online_mask); + cpus_read_unlock(); + + /* One CPU is for the reader, all others are writers */ + reader_cpu = cpumask_first(test_cpus); + if (cpumask_weight(test_cpus) == 1) + kunit_warn(test, "more than one CPU is recommended"); + else + cpumask_clear_cpu(reader_cpu, test_cpus); + + /* The KUnit test can be restarted multiple times. */ + prbtest_prb_reinit(&test_rb); + + test_data = kunit_kmalloc(test, sizeof(*test_data), GFP_KERNEL); + KUNIT_ASSERT_NOT_NULL(test, test_data); + test_data->test = test; + test_data->ringbuffer = &test_rb; + init_waitqueue_head(&test_data->new_record_wait); + + kunit_info(test, "running for %lu ms\n", runtime_ms); + + for_each_cpu(cpu, test_cpus) { + thread_data = kunit_kmalloc(test, sizeof(*thread_data), GFP_KERNEL); + KUNIT_ASSERT_NOT_NULL(test, thread_data); + thread_data->test_data = test_data; + thread_data->num = cpu; + + thread = kthread_run_on_cpu(prbtest_writer, thread_data, cpu, + "prbtest writer %u"); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, thread); + prbtest_add_kthread_cleanup(test, thread); + } + + kunit_info(test, "starting test\n"); + + set_cpus_allowed_ptr(current, cpumask_of(reader_cpu)); + prbtest_reader(test_data, runtime_ms); + + kunit_info(test, "completed test\n"); +} + +static struct kunit_case prb_test_cases[] = { + KUNIT_CASE_SLOW(test_readerwriter), + {} +}; + +static struct kunit_suite prb_test_suite = { + .name = "printk-ringbuffer", + .test_cases = prb_test_cases, +}; +kunit_test_suite(prb_test_suite); + +MODULE_IMPORT_NS("EXPORTED_FOR_KUNIT_TESTING"); +MODULE_AUTHOR("John Ogness <john.ogness@linutronix.de>"); +MODULE_DESCRIPTION("printk_ringbuffer KUnit test"); +MODULE_LICENSE("GPL"); diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 7a893d51d02b..29fe3c01312f 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1528,7 +1528,7 @@ static void do_rtws_sync(struct torture_random_state *trsp, void (*sync)(void)) static int rcu_torture_writer(void *arg) { - bool boot_ended; + bool booting_still = false; bool can_expedite = !rcu_gp_is_expedited() && !rcu_gp_is_normal(); unsigned long cookie; struct rcu_gp_oldstate cookie_full; @@ -1539,6 +1539,7 @@ rcu_torture_writer(void *arg) struct rcu_gp_oldstate gp_snap1_full; int i; int idx; + unsigned long j; int oldnice = task_nice(current); struct rcu_gp_oldstate *rgo = NULL; int rgo_size = 0; @@ -1571,16 +1572,26 @@ rcu_torture_writer(void *arg) return 0; } if (cur_ops->poll_active > 0) { - ulo = kzalloc(cur_ops->poll_active * sizeof(ulo[0]), GFP_KERNEL); + ulo = kcalloc(cur_ops->poll_active, sizeof(*ulo), GFP_KERNEL); if (!WARN_ON(!ulo)) ulo_size = cur_ops->poll_active; } if (cur_ops->poll_active_full > 0) { - rgo = kzalloc(cur_ops->poll_active_full * sizeof(rgo[0]), GFP_KERNEL); + rgo = kcalloc(cur_ops->poll_active_full, sizeof(*rgo), GFP_KERNEL); if (!WARN_ON(!rgo)) rgo_size = cur_ops->poll_active_full; } + // If the system is still booting, let it finish.
+ j = jiffies; + while (!torture_must_stop() && !rcu_inkernel_boot_has_ended()) { + booting_still = true; + schedule_timeout_interruptible(HZ); + } + if (booting_still) + pr_alert("%s" TORTURE_FLAG " Waited %lu jiffies for boot to complete.\n", + torture_type, jiffies - j); + do { rcu_torture_writer_state = RTWS_FIXED_DELAY; torture_hrtimeout_us(500, 1000, &rand); @@ -1769,13 +1780,11 @@ rcu_torture_writer(void *arg) !rcu_gp_is_normal(); } rcu_torture_writer_state = RTWS_STUTTER; - boot_ended = rcu_inkernel_boot_has_ended(); stutter_waited = stutter_wait("rcu_torture_writer"); if (stutter_waited && !atomic_read(&rcu_fwd_cb_nodelay) && !cur_ops->slow_gps && !torture_must_stop() && - boot_ended && time_after(jiffies, stallsdone)) for (i = 0; i < ARRAY_SIZE(rcu_tortures); i++) if (list_empty(&rcu_tortures[i].rtort_free) && @@ -2437,7 +2446,8 @@ rcu_torture_reader(void *arg) torture_hrtimeout_us(500, 1000, &rand); lastsleep = jiffies + 10; } - while (torture_num_online_cpus() < mynumonline && !torture_must_stop()) + while (!torture_must_stop() && + (torture_num_online_cpus() < mynumonline || !rcu_inkernel_boot_has_ended())) schedule_timeout_interruptible(HZ / 5); stutter_wait("rcu_torture_reader"); } while (!torture_must_stop()); @@ -2756,7 +2766,8 @@ rcu_torture_stats_print(void) cur_ops->stats(); if (rtcv_snap == rcu_torture_current_version && rcu_access_pointer(rcu_torture_current) && - !rcu_stall_is_suppressed()) { + !rcu_stall_is_suppressed() && + rcu_inkernel_boot_has_ended()) { int __maybe_unused flags = 0; unsigned long __maybe_unused gp_seq = 0; @@ -3446,6 +3457,8 @@ static int rcu_torture_fwd_prog(void *args) int tested_tries = 0; VERBOSE_TOROUT_STRING("rcu_torture_fwd_progress task started"); + while (!rcu_inkernel_boot_has_ended()) + schedule_timeout_interruptible(HZ / 10); rcu_bind_current_to_nocb(); if (!IS_ENABLED(CONFIG_SMP) || !IS_ENABLED(CONFIG_RCU_BOOST)) set_user_nice(current, MAX_NICE); diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c index df646e0694a8..19841704d8f5 100644 --- a/kernel/rcu/refscale.c +++ b/kernel/rcu/refscale.c @@ -1021,7 +1021,7 @@ static int main_func(void *arg) set_user_nice(current, MAX_NICE); VERBOSE_SCALEOUT("main_func task started"); - result_avg = kzalloc(nruns * sizeof(*result_avg), GFP_KERNEL); + result_avg = kcalloc(nruns, sizeof(*result_avg), GFP_KERNEL); buf = kzalloc(800 + 64, GFP_KERNEL); if (!result_avg || !buf) { SCALEOUT_ERRSTRING("out of memory"); @@ -1133,9 +1133,9 @@ ref_scale_cleanup(void) reader_tasks[i].task); } kfree(reader_tasks); + reader_tasks = NULL; torture_stop_kthread("main_task", main_task); - kfree(main_task); // Do scale-type-specific cleanup operations. 
if (cur_ops->cleanup != NULL) diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 6e9fe2ce1075..e3b64a5e0ec7 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -176,10 +176,9 @@ static void srcu_gp_start_if_needed(struct srcu_struct *ssp) { unsigned long cookie; - preempt_disable(); // Needed for PREEMPT_LAZY + lockdep_assert_preemption_disabled(); // Needed for PREEMPT_LAZY cookie = get_state_synchronize_srcu(ssp); if (ULONG_CMP_GE(READ_ONCE(ssp->srcu_idx_max), cookie)) { - preempt_enable(); return; } WRITE_ONCE(ssp->srcu_idx_max, cookie); @@ -189,7 +188,6 @@ static void srcu_gp_start_if_needed(struct srcu_struct *ssp) else if (list_empty(&ssp->srcu_work.entry)) list_add(&ssp->srcu_work.entry, &srcu_boot_list); } - preempt_enable(); } /* diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index c5e8ebc493d5..1ff94b76d91f 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -1168,6 +1168,16 @@ static void srcu_flip(struct srcu_struct *ssp) * counter update. Note that both this memory barrier and the * one in srcu_readers_active_idx_check() provide the guarantee * for __srcu_read_lock(). + * + * Note that this is a performance optimization, in which we spend + * an otherwise unnecessary smp_mb() in order to reduce the number + * of full per-CPU-variable scans in srcu_readers_lock_idx() and + * srcu_readers_unlock_idx(). But this performance optimization + * is not so optimal for SRCU-fast, where we would be spending + * not smp_mb(), but rather synchronize_rcu(). At the same time, + * the overhead of the smp_mb() is in the noise, so there is no + * point in omitting it in the SRCU-fast case. So the same code + * is executed either way. */ smp_mb(); /* D */ /* Pairs with C. */ } diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index f92443561d36..2dc044fd126e 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -553,13 +553,13 @@ static void rcu_tasks_invoke_cbs(struct rcu_tasks *rtp, struct rcu_tasks_percpu rtpcp_next = rtp->rtpcp_array[index]; if (rtpcp_next->cpu < smp_load_acquire(&rtp->percpu_dequeue_lim)) { cpuwq = rcu_cpu_beenfullyonline(rtpcp_next->cpu) ? rtpcp_next->cpu : WORK_CPU_UNBOUND; - queue_work_on(cpuwq, system_wq, &rtpcp_next->rtp_work); + queue_work_on(cpuwq, system_percpu_wq, &rtpcp_next->rtp_work); index++; if (index < num_possible_cpus()) { rtpcp_next = rtp->rtpcp_array[index]; if (rtpcp_next->cpu < smp_load_acquire(&rtp->percpu_dequeue_lim)) { cpuwq = rcu_cpu_beenfullyonline(rtpcp_next->cpu) ? rtpcp_next->cpu : WORK_CPU_UNBOUND; - queue_work_on(cpuwq, system_wq, &rtpcp_next->rtp_work); + queue_work_on(cpuwq, system_percpu_wq, &rtpcp_next->rtp_work); } } } diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 8eff357b0436..8293bae1dec1 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -573,7 +573,7 @@ void rcutorture_format_gp_seqs(unsigned long long seqs, char *cp, size_t len) } EXPORT_SYMBOL_GPL(rcutorture_format_gp_seqs); -#if defined(CONFIG_NO_HZ_FULL) && (!defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_KVM_XFER_TO_GUEST_WORK)) +#if defined(CONFIG_NO_HZ_FULL) && (!defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_VIRT_XFER_TO_GUEST_WORK)) /* * An empty function that will trigger a reschedule on * IRQ tail once IRQs get re-enabled on userspace/guest resume. 
@@ -602,7 +602,7 @@ noinstr void rcu_irq_work_resched(void) if (IS_ENABLED(CONFIG_GENERIC_ENTRY) && !(current->flags & PF_VCPU)) return; - if (IS_ENABLED(CONFIG_KVM_XFER_TO_GUEST_WORK) && (current->flags & PF_VCPU)) + if (IS_ENABLED(CONFIG_VIRT_XFER_TO_GUEST_WORK) && (current->flags & PF_VCPU)) return; instrumentation_begin(); @@ -611,7 +611,7 @@ noinstr void rcu_irq_work_resched(void) } instrumentation_end(); } -#endif /* #if defined(CONFIG_NO_HZ_FULL) && (!defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_KVM_XFER_TO_GUEST_WORK)) */ +#endif /* #if defined(CONFIG_NO_HZ_FULL) && (!defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_VIRT_XFER_TO_GUEST_WORK)) */ #ifdef CONFIG_PROVE_RCU /** @@ -3800,6 +3800,11 @@ static void rcu_barrier_handler(void *cpu_in) * to complete. For example, if there are no RCU callbacks queued anywhere * in the system, then rcu_barrier() is within its rights to return * immediately, without waiting for anything, much less an RCU grace period. + * In fact, rcu_barrier() will normally not result in any RCU grace periods + * beyond those that were already destined to be executed. + * + * In kernels built with CONFIG_RCU_LAZY=y, this function also hurries all + * pending lazy RCU callbacks. */ void rcu_barrier(void) { @@ -4885,10 +4890,10 @@ void __init rcu_init(void) rcutree_online_cpu(cpu); /* Create workqueue for Tree SRCU and for expedited GPs. */ - rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0); + rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM | WQ_PERCPU, 0); WARN_ON(!rcu_gp_wq); - sync_wq = alloc_workqueue("sync_wq", WQ_MEM_RECLAIM, 0); + sync_wq = alloc_workqueue("sync_wq", WQ_MEM_RECLAIM | WQ_UNBOUND, 0); WARN_ON(!sync_wq); /* Respect if explicitly disabled via a boot parameter. */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 4cd170b2d655..d85763336b3c 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -626,11 +626,10 @@ notrace void rcu_preempt_deferred_qs(struct task_struct *t) */ static void rcu_preempt_deferred_qs_handler(struct irq_work *iwp) { - unsigned long flags; struct rcu_data *rdp; + lockdep_assert_irqs_disabled(); rdp = container_of(iwp, struct rcu_data, defer_qs_iw); - local_irq_save(flags); /* * If the IRQ work handler happens to run in the middle of RCU read-side @@ -647,8 +646,6 @@ static void rcu_preempt_deferred_qs_handler(struct irq_work *iwp) */ if (rcu_preempt_depth() > 0) WRITE_ONCE(rdp->defer_qs_iw_pending, DEFER_QS_IDLE); - - local_irq_restore(flags); } /* diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 198d2dd45f59..f1ebf67b48e2 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8571,10 +8571,12 @@ int sched_cpu_dying(unsigned int cpu) sched_tick_stop(cpu); rq_lock_irqsave(rq, &rf); + update_rq_clock(rq); if (rq->nr_running != 1 || rq_has_pinned_tasks(rq)) { WARN(true, "Dying CPU not properly vacated!"); dump_rq_tasks(rq, KERN_WARNING); } + dl_server_stop(&rq->fair_server); rq_unlock_irqrestore(rq, &rf); calc_load_migrate(rq); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 615411a0a881..7b7671060bf9 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1582,6 +1582,9 @@ void dl_server_start(struct sched_dl_entity *dl_se) if (!dl_server(dl_se) || dl_se->dl_server_active) return; + if (WARN_ON_ONCE(!cpu_online(cpu_of(rq)))) + return; + dl_se->dl_server_active = 1; enqueue_dl_entity(dl_se, ENQUEUE_WAKEUP); if (!dl_task(dl_se->rq->curr) || dl_entity_preempt(dl_se, &rq->curr->dl)) diff --git a/kernel/sched/fair.c 
b/kernel/sched/fair.c index bc0b7ce8a65d..cee1793e8277 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8920,21 +8920,21 @@ simple: return p; idle: - if (!rf) - return NULL; - - new_tasks = sched_balance_newidle(rq, rf); + if (rf) { + new_tasks = sched_balance_newidle(rq, rf); - /* - * Because sched_balance_newidle() releases (and re-acquires) rq->lock, it is - * possible for any higher priority task to appear. In that case we - * must re-start the pick_next_entity() loop. - */ - if (new_tasks < 0) - return RETRY_TASK; + /* + * Because sched_balance_newidle() releases (and re-acquires) + * rq->lock, it is possible for any higher priority task to + * appear. In that case we must re-start the pick_next_entity() + * loop. + */ + if (new_tasks < 0) + return RETRY_TASK; - if (new_tasks > 0) - goto again; + if (new_tasks > 0) + goto again; + } /* * rq is about to be idle, check if we need to update the diff --git a/kernel/torture.c b/kernel/torture.c index 3a0a8cc60401..1ea9f67953a7 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -359,6 +359,8 @@ torture_onoff(void *arg) torture_hrtimeout_jiffies(onoff_holdoff, &rand); VERBOSE_TOROUT_STRING("torture_onoff end holdoff"); } + while (!rcu_inkernel_boot_has_ended()) + schedule_timeout_interruptible(HZ / 10); while (!torture_must_stop()) { if (disable_onoff_at_boot && !rcu_inkernel_boot_has_ended()) { torture_hrtimeout_jiffies(HZ / 10, &rand); @@ -797,8 +799,9 @@ static unsigned long torture_init_jiffies; static void torture_print_module_parms(void) { - pr_alert("torture module --- %s: disable_onoff_at_boot=%d ftrace_dump_at_shutdown=%d verbose_sleep_frequency=%d verbose_sleep_duration=%d random_shuffle=%d\n", - torture_type, disable_onoff_at_boot, ftrace_dump_at_shutdown, verbose_sleep_frequency, verbose_sleep_duration, random_shuffle); + pr_alert("torture module --- %s: disable_onoff_at_boot=%d ftrace_dump_at_shutdown=%d verbose_sleep_frequency=%d verbose_sleep_duration=%d random_shuffle=%d%s\n", + torture_type, disable_onoff_at_boot, ftrace_dump_at_shutdown, verbose_sleep_frequency, verbose_sleep_duration, random_shuffle, + rcu_inkernel_boot_has_ended() ? "" : " still booting"); } /* diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 8f23f5273bab..4f87c16d915a 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -899,7 +899,7 @@ const struct bpf_func_proto bpf_send_signal_thread_proto = { .arg1_type = ARG_ANYTHING, }; -BPF_CALL_3(bpf_d_path, struct path *, path, char *, buf, u32, sz) +BPF_CALL_3(bpf_d_path, const struct path *, path, char *, buf, u32, sz) { struct path copy; long len; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index a69067367c29..42bd2ba68a82 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -7535,6 +7535,8 @@ void ftrace_module_enable(struct module *mod) if (!within_module(rec->ip, mod)) break; + cond_resched(); + /* Weak functions should still be ignored */ if (!test_for_valid_rec(rec)) { /* Clear all other flags. 
Should not be enabled anyway */ diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 43460949ad3f..1244d2c5c384 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -7273,7 +7273,7 @@ int ring_buffer_map(struct trace_buffer *buffer, int cpu, atomic_dec(&cpu_buffer->resize_disabled); } - return 0; + return err; } int ring_buffer_unmap(struct trace_buffer *buffer, int cpu) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index b3c94fbaf002..d1e527cf2aae 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4791,12 +4791,6 @@ int tracing_single_release_file_tr(struct inode *inode, struct file *filp) return single_release(inode, filp); } -static int tracing_mark_open(struct inode *inode, struct file *filp) -{ - stream_open(inode, filp); - return tracing_open_generic_tr(inode, filp); -} - static int tracing_release(struct inode *inode, struct file *file) { struct trace_array *tr = inode->i_private; @@ -7163,7 +7157,7 @@ tracing_free_buffer_release(struct inode *inode, struct file *filp) #define TRACE_MARKER_MAX_SIZE 4096 -static ssize_t write_marker_to_buffer(struct trace_array *tr, const char __user *ubuf, +static ssize_t write_marker_to_buffer(struct trace_array *tr, const char *buf, size_t cnt, unsigned long ip) { struct ring_buffer_event *event; @@ -7173,20 +7167,11 @@ static ssize_t write_marker_to_buffer(struct trace_array *tr, const char __user int meta_size; ssize_t written; size_t size; - int len; - -/* Used in tracing_mark_raw_write() as well */ -#define FAULTED_STR "<faulted>" -#define FAULTED_SIZE (sizeof(FAULTED_STR) - 1) /* '\0' is already accounted for */ meta_size = sizeof(*entry) + 2; /* add '\0' and possible '\n' */ again: size = cnt + meta_size; - /* If less than "<faulted>", then make sure we can still add that */ - if (cnt < FAULTED_SIZE) - size += FAULTED_SIZE - cnt; - buffer = tr->array_buffer.buffer; event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, tracing_gen_ctx()); @@ -7196,9 +7181,6 @@ static ssize_t write_marker_to_buffer(struct trace_array *tr, const char __user * make it smaller and try again. 
*/ if (size > ring_buffer_max_event_size(buffer)) { - /* cnt < FAULTED size should never be bigger than max */ - if (WARN_ON_ONCE(cnt < FAULTED_SIZE)) - return -EBADF; cnt = ring_buffer_max_event_size(buffer) - meta_size; /* The above should only happen once */ if (WARN_ON_ONCE(cnt + meta_size == size)) @@ -7212,14 +7194,8 @@ static ssize_t write_marker_to_buffer(struct trace_array *tr, const char __user entry = ring_buffer_event_data(event); entry->ip = ip; - - len = copy_from_user_nofault(&entry->buf, ubuf, cnt); - if (len) { - memcpy(&entry->buf, FAULTED_STR, FAULTED_SIZE); - cnt = FAULTED_SIZE; - written = -EFAULT; - } else - written = cnt; + memcpy(&entry->buf, buf, cnt); + written = cnt; if (tr->trace_marker_file && !list_empty(&tr->trace_marker_file->triggers)) { /* do not add \n before testing triggers, but add \0 */ @@ -7243,6 +7219,169 @@ static ssize_t write_marker_to_buffer(struct trace_array *tr, const char __user return written; } +struct trace_user_buf { + char *buf; +}; + +struct trace_user_buf_info { + struct trace_user_buf __percpu *tbuf; + int ref; +}; + + +static DEFINE_MUTEX(trace_user_buffer_mutex); +static struct trace_user_buf_info *trace_user_buffer; + +static void trace_user_fault_buffer_free(struct trace_user_buf_info *tinfo) +{ + char *buf; + int cpu; + + for_each_possible_cpu(cpu) { + buf = per_cpu_ptr(tinfo->tbuf, cpu)->buf; + kfree(buf); + } + free_percpu(tinfo->tbuf); + kfree(tinfo); +} + +static int trace_user_fault_buffer_enable(void) +{ + struct trace_user_buf_info *tinfo; + char *buf; + int cpu; + + guard(mutex)(&trace_user_buffer_mutex); + + if (trace_user_buffer) { + trace_user_buffer->ref++; + return 0; + } + + tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL); + if (!tinfo) + return -ENOMEM; + + tinfo->tbuf = alloc_percpu(struct trace_user_buf); + if (!tinfo->tbuf) { + kfree(tinfo); + return -ENOMEM; + } + + tinfo->ref = 1; + + /* Clear each buffer in case of error */ + for_each_possible_cpu(cpu) { + per_cpu_ptr(tinfo->tbuf, cpu)->buf = NULL; + } + + for_each_possible_cpu(cpu) { + buf = kmalloc_node(TRACE_MARKER_MAX_SIZE, GFP_KERNEL, + cpu_to_node(cpu)); + if (!buf) { + trace_user_fault_buffer_free(tinfo); + return -ENOMEM; + } + per_cpu_ptr(tinfo->tbuf, cpu)->buf = buf; + } + + trace_user_buffer = tinfo; + + return 0; +} + +static void trace_user_fault_buffer_disable(void) +{ + struct trace_user_buf_info *tinfo; + + guard(mutex)(&trace_user_buffer_mutex); + + tinfo = trace_user_buffer; + + if (WARN_ON_ONCE(!tinfo)) + return; + + if (--tinfo->ref) + return; + + trace_user_fault_buffer_free(tinfo); + trace_user_buffer = NULL; +} + +/* Must be called with preemption disabled */ +static char *trace_user_fault_read(struct trace_user_buf_info *tinfo, + const char __user *ptr, size_t size, + size_t *read_size) +{ + int cpu = smp_processor_id(); + char *buffer = per_cpu_ptr(tinfo->tbuf, cpu)->buf; + unsigned int cnt; + int trys = 0; + int ret; + + if (size > TRACE_MARKER_MAX_SIZE) + size = TRACE_MARKER_MAX_SIZE; + *read_size = 0; + + /* + * This acts similar to a seqcount. The per CPU context switches are + * recorded, migration is disabled and preemption is enabled. The + * read of the user space memory is copied into the per CPU buffer. + * Preemption is disabled again, and if the per CPU context switches count + * is still the same, it means the buffer has not been corrupted. + * If the count is different, it is assumed the buffer is corrupted + * and reading must be tried again. 
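+	 *
+	 * A hypothetical example of the race this catches: task A begins
+	 * copying into the CPU-0 buffer and is preempted by task B, which
+	 * also writes to trace_marker on CPU 0 and reuses the same per CPU
+	 * buffer. When task A runs again, the context switch count no
+	 * longer matches, so task A throws away the possibly corrupted
+	 * copy and tries again.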
+ */ + + do { + /* + * If, for some reason, copy_from_user() always causes a context + * switch, this would then cause an infinite loop. + * If this task is preempted by another user space task, it + * will cause this task to try again. But just in case something + * changes where the copying from user space causes another task + * to run, prevent this from going into an infinite loop. + * 100 tries should be plenty. + */ + if (WARN_ONCE(trys++ > 100, "Error: Too many tries to read user space")) + return NULL; + + /* Read the current CPU context switch counter */ + cnt = nr_context_switches_cpu(cpu); + + /* + * Preemption is going to be enabled, but this task must + * remain on this CPU. + */ + migrate_disable(); + + /* + * Now preemption is being enabled and another task can come in + * and use the same buffer and corrupt our data. + */ + preempt_enable_notrace(); + + ret = __copy_from_user(buffer, ptr, size); + + preempt_disable_notrace(); + migrate_enable(); + + /* if it faulted, no need to test if the buffer was corrupted */ + if (ret) + return NULL; + + /* + * Preemption is disabled again, now check the per CPU context + * switch counter. If it doesn't match, then another user space + * process may have scheduled in and corrupted our buffer. In that + * case the copying must be retried. + */ + } while (nr_context_switches_cpu(cpu) != cnt); + + *read_size = size; + return buffer; +} + static ssize_t tracing_mark_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *fpos) @@ -7250,6 +7389,8 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, struct trace_array *tr = filp->private_data; ssize_t written = -ENODEV; unsigned long ip; + size_t size; + char *buf; if (tracing_disabled) return -EINVAL; @@ -7263,6 +7404,16 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, if (cnt > TRACE_MARKER_MAX_SIZE) cnt = TRACE_MARKER_MAX_SIZE; + /* Must have preemption disabled while having access to the buffer */ + guard(preempt_notrace)(); + + buf = trace_user_fault_read(trace_user_buffer, ubuf, cnt, &size); + if (!buf) + return -EFAULT; + + if (cnt > size) + cnt = size; + /* The selftests expect this function to be the IP address */ ip = _THIS_IP_; @@ -7270,32 +7421,28 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, if (tr == &global_trace) { guard(rcu)(); list_for_each_entry_rcu(tr, &marker_copies, marker_list) { - written = write_marker_to_buffer(tr, ubuf, cnt, ip); + written = write_marker_to_buffer(tr, buf, cnt, ip); if (written < 0) break; } } else { - written = write_marker_to_buffer(tr, ubuf, cnt, ip); + written = write_marker_to_buffer(tr, buf, cnt, ip); } return written; } static ssize_t write_raw_marker_to_buffer(struct trace_array *tr, - const char __user *ubuf, size_t cnt) + const char *buf, size_t cnt) { struct ring_buffer_event *event; struct trace_buffer *buffer; struct raw_data_entry *entry; ssize_t written; - int size; - int len; - -#define FAULT_SIZE_ID (FAULTED_SIZE + sizeof(int)) + size_t size; - size = sizeof(*entry) + cnt; - if (cnt < FAULT_SIZE_ID) - size += FAULT_SIZE_ID - cnt; + /* cnt includes both the entry->id and the data behind it.
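+	 * struct_size() keeps that arithmetic overflow-safe: it expands to
+	 * the size of *entry plus (cnt - sizeof(entry->id)) trailing buf[]
+	 * bytes, e.g. room for 8 payload bytes when cnt is 12 and the id
+	 * takes 4, saturating instead of wrapping on a bogus cnt.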
*/ + size = struct_size(entry, buf, cnt - sizeof(entry->id)); buffer = tr->array_buffer.buffer; @@ -7309,14 +7456,11 @@ static ssize_t write_raw_marker_to_buffer(struct trace_array *tr, return -EBADF; entry = ring_buffer_event_data(event); - - len = copy_from_user_nofault(&entry->id, ubuf, cnt); - if (len) { - entry->id = -1; - memcpy(&entry->buf, FAULTED_STR, FAULTED_SIZE); - written = -EFAULT; - } else - written = cnt; + unsafe_memcpy(&entry->id, buf, cnt, + "id and content already reserved on ring buffer" + "'buf' includes the 'id' and the data." + "'entry' was allocated with cnt from 'id'."); + written = cnt; __buffer_unlock_commit(buffer, event); @@ -7329,8 +7473,8 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf, { struct trace_array *tr = filp->private_data; ssize_t written = -ENODEV; - -#define FAULT_SIZE_ID (FAULTED_SIZE + sizeof(int)) + size_t size; + char *buf; if (tracing_disabled) return -EINVAL; @@ -7342,21 +7486,53 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf, if (cnt < sizeof(unsigned int)) return -EINVAL; + /* Must have preemption disabled while having access to the buffer */ + guard(preempt_notrace)(); + + buf = trace_user_fault_read(trace_user_buffer, ubuf, cnt, &size); + if (!buf) + return -EFAULT; + + /* raw write is all or nothing */ + if (cnt > size) + return -EINVAL; + /* The global trace_marker_raw can go to multiple instances */ if (tr == &global_trace) { guard(rcu)(); list_for_each_entry_rcu(tr, &marker_copies, marker_list) { - written = write_raw_marker_to_buffer(tr, ubuf, cnt); + written = write_raw_marker_to_buffer(tr, buf, cnt); if (written < 0) break; } } else { - written = write_raw_marker_to_buffer(tr, ubuf, cnt); + written = write_raw_marker_to_buffer(tr, buf, cnt); } return written; } +static int tracing_mark_open(struct inode *inode, struct file *filp) +{ + int ret; + + ret = trace_user_fault_buffer_enable(); + if (ret < 0) + return ret; + + stream_open(inode, filp); + ret = tracing_open_generic_tr(inode, filp); + if (ret < 0) + trace_user_fault_buffer_disable(); + return ret; +} + +static int tracing_mark_release(struct inode *inode, struct file *file) +{ + trace_user_fault_buffer_disable(); + return tracing_release_generic_tr(inode, file); +} + static int tracing_clock_show(struct seq_file *m, void *v) { struct trace_array *tr = m->private; @@ -7764,13 +7940,13 @@ static const struct file_operations tracing_free_buffer_fops = { static const struct file_operations tracing_mark_fops = { .open = tracing_mark_open, .write = tracing_mark_write, - .release = tracing_release_generic_tr, + .release = tracing_mark_release, }; static const struct file_operations tracing_mark_raw_fops = { .open = tracing_mark_open, .write = tracing_mark_raw_write, - .release = tracing_release_generic_tr, + .release = tracing_mark_release, }; static const struct file_operations trace_clock_fops = { @@ -10211,8 +10387,7 @@ static struct vfsmount *trace_automount(struct dentry *mntpt, void *ingore) pr_warn("NOTICE: Automounting of tracing to debugfs is deprecated and will be removed in 2030\n"); - ret = vfs_parse_fs_string(fc, "source", - "tracefs", strlen("tracefs")); + ret = vfs_parse_fs_string(fc, "source", "tracefs"); if (!ret) mnt = fc_mount(fc); else diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 5f4bed5842f9..85eabb454bee 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -380,8 +380,8 @@ struct trace_array { #ifdef CONFIG_FTRACE_SYSCALLS int sys_refcount_enter; int sys_refcount_exit; - struct 
trace_event_file __rcu *enter_syscall_files[NR_syscalls]; - struct trace_event_file __rcu *exit_syscall_files[NR_syscalls]; + struct trace_event_file *enter_syscall_files[NR_syscalls]; + struct trace_event_file *exit_syscall_files[NR_syscalls]; #endif int stop_count; int clock_id; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 9f3e9537417d..e00da4182deb 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1629,11 +1629,10 @@ static void *s_start(struct seq_file *m, loff_t *pos) loff_t l; iter = kzalloc(sizeof(*iter), GFP_KERNEL); + mutex_lock(&event_mutex); if (!iter) return NULL; - mutex_lock(&event_mutex); - iter->type = SET_EVENT_FILE; iter->file = list_entry(&tr->events, struct trace_event_file, list); diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index 2ab283fd3032..c428dafe7496 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -835,7 +835,7 @@ void user_event_mm_remove(struct task_struct *t) * so we use a work queue after call_rcu() to run within. */ INIT_RCU_WORK(&mm->put_rwork, delayed_user_event_mm_put); - queue_rcu_work(system_wq, &mm->put_rwork); + queue_rcu_work(system_percpu_wq, &mm->put_rwork); } void user_event_mm_dup(struct task_struct *t, struct user_event_mm *old_mm) diff --git a/kernel/trace/trace_fprobe.c b/kernel/trace/trace_fprobe.c index b36ade43d4b3..ad9d6347b5fa 100644 --- a/kernel/trace/trace_fprobe.c +++ b/kernel/trace/trace_fprobe.c @@ -522,13 +522,14 @@ static int fentry_dispatcher(struct fprobe *fp, unsigned long entry_ip, void *entry_data) { struct trace_fprobe *tf = container_of(fp, struct trace_fprobe, fp); + unsigned int flags = trace_probe_load_flag(&tf->tp); int ret = 0; - if (trace_probe_test_flag(&tf->tp, TP_FLAG_TRACE)) + if (flags & TP_FLAG_TRACE) fentry_trace_func(tf, entry_ip, fregs); #ifdef CONFIG_PERF_EVENTS - if (trace_probe_test_flag(&tf->tp, TP_FLAG_PROFILE)) + if (flags & TP_FLAG_PROFILE) ret = fentry_perf_func(tf, entry_ip, fregs); #endif return ret; @@ -540,11 +541,12 @@ static void fexit_dispatcher(struct fprobe *fp, unsigned long entry_ip, void *entry_data) { struct trace_fprobe *tf = container_of(fp, struct trace_fprobe, fp); + unsigned int flags = trace_probe_load_flag(&tf->tp); - if (trace_probe_test_flag(&tf->tp, TP_FLAG_TRACE)) + if (flags & TP_FLAG_TRACE) fexit_trace_func(tf, entry_ip, ret_ip, fregs, entry_data); #ifdef CONFIG_PERF_EVENTS - if (trace_probe_test_flag(&tf->tp, TP_FLAG_PROFILE)) + if (flags & TP_FLAG_PROFILE) fexit_perf_func(tf, entry_ip, ret_ip, fregs, entry_data); #endif } diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 5496758b6c76..4c45c49b06c8 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -184,7 +184,7 @@ static int irqsoff_graph_entry(struct ftrace_graph_ent *trace, unsigned long flags; unsigned int trace_ctx; u64 *calltime; - int ret; + int ret = 0; if (ftrace_graph_ignore_func(gops, trace)) return 0; @@ -202,13 +202,11 @@ static int irqsoff_graph_entry(struct ftrace_graph_ent *trace, return 0; calltime = fgraph_reserve_data(gops->idx, sizeof(*calltime)); - if (!calltime) - return 0; - - *calltime = trace_clock_local(); - - trace_ctx = tracing_gen_ctx_flags(flags); - ret = __trace_graph_entry(tr, trace, trace_ctx); + if (calltime) { + *calltime = trace_clock_local(); + trace_ctx = tracing_gen_ctx_flags(flags); + ret = __trace_graph_entry(tr, trace, trace_ctx); + } local_dec(&data->disabled); return ret; @@ -233,11 +231,10 
@@ static void irqsoff_graph_return(struct ftrace_graph_ret *trace, rettime = trace_clock_local(); calltime = fgraph_retrieve_data(gops->idx, &size); - if (!calltime) - return; - - trace_ctx = tracing_gen_ctx_flags(flags); - __trace_graph_return(tr, trace, trace_ctx, *calltime, rettime); + if (calltime) { + trace_ctx = tracing_gen_ctx_flags(flags); + __trace_graph_return(tr, trace, trace_ctx, *calltime, rettime); + } local_dec(&data->disabled); } diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index fa60362a3f31..ee8171b19bee 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1815,14 +1815,15 @@ static int kprobe_register(struct trace_event_call *event, static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) { struct trace_kprobe *tk = container_of(kp, struct trace_kprobe, rp.kp); + unsigned int flags = trace_probe_load_flag(&tk->tp); int ret = 0; raw_cpu_inc(*tk->nhit); - if (trace_probe_test_flag(&tk->tp, TP_FLAG_TRACE)) + if (flags & TP_FLAG_TRACE) kprobe_trace_func(tk, regs); #ifdef CONFIG_PERF_EVENTS - if (trace_probe_test_flag(&tk->tp, TP_FLAG_PROFILE)) + if (flags & TP_FLAG_PROFILE) ret = kprobe_perf_func(tk, regs); #endif return ret; @@ -1834,6 +1835,7 @@ kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs) { struct kretprobe *rp = get_kretprobe(ri); struct trace_kprobe *tk; + unsigned int flags; /* * There is a small chance that get_kretprobe(ri) returns NULL when @@ -1846,10 +1848,11 @@ kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs) tk = container_of(rp, struct trace_kprobe, rp); raw_cpu_inc(*tk->nhit); - if (trace_probe_test_flag(&tk->tp, TP_FLAG_TRACE)) + flags = trace_probe_load_flag(&tk->tp); + if (flags & TP_FLAG_TRACE) kretprobe_trace_func(tk, ri, regs); #ifdef CONFIG_PERF_EVENTS - if (trace_probe_test_flag(&tk->tp, TP_FLAG_PROFILE)) + if (flags & TP_FLAG_PROFILE) kretprobe_perf_func(tk, ri, regs); #endif return 0; /* We don't tweak kernel, so just return 0 */ diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index dc734867f0fc..a9962d4497e8 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -24,6 +24,7 @@ #include <linux/sched/clock.h> #include <uapi/linux/sched/types.h> #include <linux/sched.h> +#include <linux/string.h> #include "trace.h" #ifdef CONFIG_X86_LOCAL_APIC @@ -271,7 +272,7 @@ static inline void tlat_var_reset(void) * So far, all the values are initialized as 0, so * zeroing the structure is perfect. */ - for_each_cpu(cpu, cpu_online_mask) { + for_each_online_cpu(cpu) { tlat_var = per_cpu_ptr(&per_cpu_timerlat_var, cpu); if (tlat_var->kthread) hrtimer_cancel(&tlat_var->timer); @@ -295,7 +296,7 @@ static inline void osn_var_reset(void) * So far, all the values are initialized as 0, so * zeroing the structure is perfect. 
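 * (The for_each_online_cpu(cpu) form used below is the canonical
 * shorthand for for_each_cpu(cpu, cpu_online_mask); the iteration
 * itself is unchanged.)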
*/ - for_each_cpu(cpu, cpu_online_mask) { + for_each_online_cpu(cpu) { osn_var = per_cpu_ptr(&per_cpu_osnoise_var, cpu); memset(osn_var, 0, sizeof(*osn_var)); } @@ -2325,13 +2326,9 @@ osnoise_cpus_write(struct file *filp, const char __user *ubuf, size_t count, if (count < 1) return 0; - buf = kmalloc(count + 1, GFP_KERNEL); - if (!buf) - return -ENOMEM; - - if (copy_from_user(buf, ubuf, count)) - return -EFAULT; - buf[count] = '\0'; + buf = memdup_user_nul(ubuf, count); + if (IS_ERR(buf)) + return PTR_ERR(buf); if (!zalloc_cpumask_var(&osnoise_cpumask_new, GFP_KERNEL)) return -ENOMEM; diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 842383fbc03b..08b5bda24da2 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -271,16 +271,21 @@ struct event_file_link { struct list_head list; }; +static inline unsigned int trace_probe_load_flag(struct trace_probe *tp) +{ + return smp_load_acquire(&tp->event->flags); +} + static inline bool trace_probe_test_flag(struct trace_probe *tp, unsigned int flag) { - return !!(tp->event->flags & flag); + return !!(trace_probe_load_flag(tp) & flag); } static inline void trace_probe_set_flag(struct trace_probe *tp, unsigned int flag) { - tp->event->flags |= flag; + smp_store_release(&tp->event->flags, tp->event->flags | flag); } static inline void trace_probe_clear_flag(struct trace_probe *tp, diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index cb49f7279dc8..c46d584ded3b 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -224,7 +224,6 @@ static struct saved_cmdlines_buffer *allocate_cmdlines_buffer(unsigned int val) /* Place map_cmdline_to_pid array right after saved_cmdlines */ s->map_cmdline_to_pid = (unsigned *)&s->saved_cmdlines[val * TASK_COMM_LEN]; - s->cmdline_idx = 0; memset(&s->map_pid_to_cmdline, NO_CMDLINE_MAP, sizeof(s->map_pid_to_cmdline)); memset(s->map_cmdline_to_pid, NO_CMDLINE_MAP, @@ -248,6 +247,8 @@ int trace_save_cmdline(struct task_struct *tsk) if (!tsk->pid) return 1; + BUILD_BUG_ON(!is_power_of_2(PID_MAX_DEFAULT)); + tpid = tsk->pid & (PID_MAX_DEFAULT - 1); /* diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index bf1cb80742ae..e3f2e4f56faa 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -138,12 +138,10 @@ static int wakeup_graph_entry(struct ftrace_graph_ent *trace, return 0; calltime = fgraph_reserve_data(gops->idx, sizeof(*calltime)); - if (!calltime) - return 0; - - *calltime = trace_clock_local(); - - ret = __trace_graph_entry(tr, trace, trace_ctx); + if (calltime) { + *calltime = trace_clock_local(); + ret = __trace_graph_entry(tr, trace, trace_ctx); + } local_dec(&data->disabled); preempt_enable_notrace(); @@ -169,12 +167,10 @@ static void wakeup_graph_return(struct ftrace_graph_ret *trace, rettime = trace_clock_local(); calltime = fgraph_retrieve_data(gops->idx, &size); - if (!calltime) - return; + if (calltime) + __trace_graph_return(tr, trace, trace_ctx, *calltime, rettime); - __trace_graph_return(tr, trace, trace_ctx, *calltime, rettime); local_dec(&data->disabled); - preempt_enable_notrace(); return; } diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 46aab0ab9350..0f932b22f9ec 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -153,14 +153,20 @@ print_syscall_enter(struct trace_iterator *iter, int flags, if (trace_seq_has_overflowed(s)) goto end; + if (i) + 
trace_seq_puts(s, ", "); + /* parameter types */ if (tr && tr->trace_flags & TRACE_ITER_VERBOSE) trace_seq_printf(s, "%s ", entry->types[i]); /* parameter values */ - trace_seq_printf(s, "%s: %lx%s", entry->args[i], - trace->args[i], - i == entry->nb_args - 1 ? "" : ", "); + if (trace->args[i] < 10) + trace_seq_printf(s, "%s: %lu", entry->args[i], + trace->args[i]); + else + trace_seq_printf(s, "%s: 0x%lx", entry->args[i], + trace->args[i]); } trace_seq_putc(s, ')'); @@ -310,8 +316,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) if (syscall_nr < 0 || syscall_nr >= NR_syscalls) return; - /* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE) */ - trace_file = rcu_dereference_sched(tr->enter_syscall_files[syscall_nr]); + trace_file = READ_ONCE(tr->enter_syscall_files[syscall_nr]); if (!trace_file) return; @@ -356,8 +361,7 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) if (syscall_nr < 0 || syscall_nr >= NR_syscalls) return; - /* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE()) */ - trace_file = rcu_dereference_sched(tr->exit_syscall_files[syscall_nr]); + trace_file = READ_ONCE(tr->exit_syscall_files[syscall_nr]); if (!trace_file) return; @@ -393,7 +397,7 @@ static int reg_event_syscall_enter(struct trace_event_file *file, if (!tr->sys_refcount_enter) ret = register_trace_sys_enter(ftrace_syscall_enter, tr); if (!ret) { - rcu_assign_pointer(tr->enter_syscall_files[num], file); + WRITE_ONCE(tr->enter_syscall_files[num], file); tr->sys_refcount_enter++; } mutex_unlock(&syscall_trace_lock); @@ -411,7 +415,7 @@ static void unreg_event_syscall_enter(struct trace_event_file *file, return; mutex_lock(&syscall_trace_lock); tr->sys_refcount_enter--; - RCU_INIT_POINTER(tr->enter_syscall_files[num], NULL); + WRITE_ONCE(tr->enter_syscall_files[num], NULL); if (!tr->sys_refcount_enter) unregister_trace_sys_enter(ftrace_syscall_enter, tr); mutex_unlock(&syscall_trace_lock); @@ -431,7 +435,7 @@ static int reg_event_syscall_exit(struct trace_event_file *file, if (!tr->sys_refcount_exit) ret = register_trace_sys_exit(ftrace_syscall_exit, tr); if (!ret) { - rcu_assign_pointer(tr->exit_syscall_files[num], file); + WRITE_ONCE(tr->exit_syscall_files[num], file); tr->sys_refcount_exit++; } mutex_unlock(&syscall_trace_lock); @@ -449,7 +453,7 @@ static void unreg_event_syscall_exit(struct trace_event_file *file, return; mutex_lock(&syscall_trace_lock); tr->sys_refcount_exit--; - RCU_INIT_POINTER(tr->exit_syscall_files[num], NULL); + WRITE_ONCE(tr->exit_syscall_files[num], NULL); if (!tr->sys_refcount_exit) unregister_trace_sys_exit(ftrace_syscall_exit, tr); mutex_unlock(&syscall_trace_lock); diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 8b0bcc0d8f41..430d09c49462 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1547,6 +1547,7 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs, struct trace_uprobe *tu; struct uprobe_dispatch_data udd; struct uprobe_cpu_buffer *ucb = NULL; + unsigned int flags; int ret = 0; tu = container_of(con, struct trace_uprobe, consumer); @@ -1561,11 +1562,12 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs, if (WARN_ON_ONCE(!uprobe_cpu_buffer)) return 0; - if (trace_probe_test_flag(&tu->tp, TP_FLAG_TRACE)) + flags = trace_probe_load_flag(&tu->tp); + if (flags & TP_FLAG_TRACE) ret |= uprobe_trace_func(tu, regs, &ucb); #ifdef CONFIG_PERF_EVENTS - if (trace_probe_test_flag(&tu->tp, 
TP_FLAG_PROFILE)) + if (flags & TP_FLAG_PROFILE) ret |= uprobe_perf_func(tu, regs, &ucb); #endif uprobe_buffer_put(ucb); @@ -1579,6 +1581,7 @@ static int uretprobe_dispatcher(struct uprobe_consumer *con, struct trace_uprobe *tu; struct uprobe_dispatch_data udd; struct uprobe_cpu_buffer *ucb = NULL; + unsigned int flags; tu = container_of(con, struct trace_uprobe, consumer); @@ -1590,11 +1593,12 @@ static int uretprobe_dispatcher(struct uprobe_consumer *con, if (WARN_ON_ONCE(!uprobe_cpu_buffer)) return 0; - if (trace_probe_test_flag(&tu->tp, TP_FLAG_TRACE)) + flags = trace_probe_load_flag(&tu->tp); + if (flags & TP_FLAG_TRACE) uretprobe_trace_func(tu, func, regs, &ucb); #ifdef CONFIG_PERF_EVENTS - if (trace_probe_test_flag(&tu->tp, TP_FLAG_PROFILE)) + if (flags & TP_FLAG_PROFILE) uretprobe_perf_func(tu, func, regs, &ucb); #endif uprobe_buffer_put(ucb); diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c index 1921ade45be3..7f8da4dab69d 100644 --- a/kernel/trace/tracing_map.c +++ b/kernel/trace/tracing_map.c @@ -1076,7 +1076,7 @@ int tracing_map_sort_entries(struct tracing_map *map, struct tracing_map_sort_entry *sort_entry, **entries; int i, n_entries, ret; - entries = vmalloc(array_size(sizeof(sort_entry), map->max_elts)); + entries = vmalloc_array(map->max_elts, sizeof(sort_entry)); if (!entries) return -ENOMEM;
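For reference, the two spellings in the final tracing_map hunk are equivalent in safety: array_size() saturates to SIZE_MAX on multiplication overflow, so the saturated request simply fails, while vmalloc_array() folds the same checked multiplication into the allocator call. A minimal sketch of the two forms, reusing the names from the hunk above:

	#include <linux/overflow.h>
	#include <linux/vmalloc.h>

	/* Open-coded: saturating multiply, then plain vmalloc(). */
	entries = vmalloc(array_size(sizeof(sort_entry), map->max_elts));

	/* Helper: the multiply-and-check happens inside the allocation. */
	entries = vmalloc_array(map->max_elts, sizeof(sort_entry));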