Diffstat (limited to 'kernel')
-rw-r--r--  kernel/bpf/bpf_inode_storage.c  |  14
-rw-r--r--  kernel/bpf/btf.c                |   6
-rw-r--r--  kernel/bpf/syscall.c            |  42
-rw-r--r--  kernel/bpf/token.c              |  10
-rw-r--r--  kernel/cgroup/cgroup.c          |   4
-rw-r--r--  kernel/events/core.c            |  14
-rw-r--r--  kernel/module/main.c            |   2
-rw-r--r--  kernel/nsproxy.c                |  12
-rw-r--r--  kernel/pid.c                    |  10
-rw-r--r--  kernel/signal.c                 |   6
-rw-r--r--  kernel/sys.c                    |  10
-rw-r--r--  kernel/taskstats.c              |   4
-rw-r--r--  kernel/time/timer_migration.c   | 393
-rw-r--r--  kernel/time/timer_migration.h   |  27
-rw-r--r--  kernel/watch_queue.c            |   4
15 files changed, 284 insertions, 274 deletions
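
Most of the hunks below replace direct access to the file member of struct fd (f.file) with the fd_file() accessor, and the open-coded {NULL, 0} initializer with EMPTY_FD, so that the layout of struct fd can later change without touching every call site again. What follows is a minimal sketch of that pattern, not the patch itself: the definitions shown are simplified assumptions (the real ones live in include/linux/file.h), and the helper example_use_fd() is purely hypothetical.

	/*
	 * Sketch only, assuming kernel headers: the struct layout and macro
	 * bodies are simplified for illustration and may differ from the
	 * real include/linux/file.h definitions.
	 */
	struct fd {
		struct file *file;	/* layout private to the fd helpers */
		unsigned int flags;
	};

	#define fd_file(f)	((f).file)
	#define EMPTY_FD	(struct fd){ .file = NULL, .flags = 0 }

	/* Hypothetical caller following the same pattern as the hunks below. */
	static int example_use_fd(int ufd)
	{
		struct fd f = fdget(ufd);

		if (!fd_file(f))	/* was: if (!f.file) */
			return -EBADF;
		/* ... operate on fd_file(f) ... */
		fdput(f);
		return 0;
	}

Keeping callers behind fd_file() is what allows the conversions in bpf, cgroup, perf, pidfd and friends below to be purely mechanical.
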
diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c index b0ef45db207c..0a79aee6523d 100644 --- a/kernel/bpf/bpf_inode_storage.c +++ b/kernel/bpf/bpf_inode_storage.c @@ -80,10 +80,10 @@ static void *bpf_fd_inode_storage_lookup_elem(struct bpf_map *map, void *key) struct bpf_local_storage_data *sdata; struct fd f = fdget_raw(*(int *)key); - if (!f.file) + if (!fd_file(f)) return ERR_PTR(-EBADF); - sdata = inode_storage_lookup(file_inode(f.file), map, true); + sdata = inode_storage_lookup(file_inode(fd_file(f)), map, true); fdput(f); return sdata ? sdata->data : NULL; } @@ -94,14 +94,14 @@ static long bpf_fd_inode_storage_update_elem(struct bpf_map *map, void *key, struct bpf_local_storage_data *sdata; struct fd f = fdget_raw(*(int *)key); - if (!f.file) + if (!fd_file(f)) return -EBADF; - if (!inode_storage_ptr(file_inode(f.file))) { + if (!inode_storage_ptr(file_inode(fd_file(f)))) { fdput(f); return -EBADF; } - sdata = bpf_local_storage_update(file_inode(f.file), + sdata = bpf_local_storage_update(file_inode(fd_file(f)), (struct bpf_local_storage_map *)map, value, map_flags, GFP_ATOMIC); fdput(f); @@ -126,10 +126,10 @@ static long bpf_fd_inode_storage_delete_elem(struct bpf_map *map, void *key) struct fd f = fdget_raw(*(int *)key); int err; - if (!f.file) + if (!fd_file(f)) return -EBADF; - err = inode_storage_delete(file_inode(f.file), map); + err = inode_storage_delete(file_inode(fd_file(f)), map); fdput(f); return err; } diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 95426d5b634e..e34f58abc7e2 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -7680,15 +7680,15 @@ struct btf *btf_get_by_fd(int fd) f = fdget(fd); - if (!f.file) + if (!fd_file(f)) return ERR_PTR(-EBADF); - if (f.file->f_op != &btf_fops) { + if (fd_file(f)->f_op != &btf_fops) { fdput(f); return ERR_PTR(-EINVAL); } - btf = f.file->private_data; + btf = fd_file(f)->private_data; refcount_inc(&btf->refcnt); fdput(f); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index bf6c5f685ea2..3093bf2cc266 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -829,7 +829,7 @@ static int bpf_map_release(struct inode *inode, struct file *filp) static fmode_t map_get_sys_perms(struct bpf_map *map, struct fd f) { - fmode_t mode = f.file->f_mode; + fmode_t mode = fd_file(f)->f_mode; /* Our file permissions may have been overridden by global * map permissions facing syscall side. 
@@ -1423,14 +1423,14 @@ put_token: */ struct bpf_map *__bpf_map_get(struct fd f) { - if (!f.file) + if (!fd_file(f)) return ERR_PTR(-EBADF); - if (f.file->f_op != &bpf_map_fops) { + if (fd_file(f)->f_op != &bpf_map_fops) { fdput(f); return ERR_PTR(-EINVAL); } - return f.file->private_data; + return fd_file(f)->private_data; } void bpf_map_inc(struct bpf_map *map) @@ -1651,7 +1651,7 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr) goto free_key; } - err = bpf_map_update_value(map, f.file, key, value, attr->flags); + err = bpf_map_update_value(map, fd_file(f), key, value, attr->flags); if (!err) maybe_wait_bpf_programs(map); @@ -2409,14 +2409,14 @@ int bpf_prog_new_fd(struct bpf_prog *prog) static struct bpf_prog *____bpf_prog_get(struct fd f) { - if (!f.file) + if (!fd_file(f)) return ERR_PTR(-EBADF); - if (f.file->f_op != &bpf_prog_fops) { + if (fd_file(f)->f_op != &bpf_prog_fops) { fdput(f); return ERR_PTR(-EINVAL); } - return f.file->private_data; + return fd_file(f)->private_data; } void bpf_prog_add(struct bpf_prog *prog, int i) @@ -3259,14 +3259,14 @@ struct bpf_link *bpf_link_get_from_fd(u32 ufd) struct fd f = fdget(ufd); struct bpf_link *link; - if (!f.file) + if (!fd_file(f)) return ERR_PTR(-EBADF); - if (f.file->f_op != &bpf_link_fops && f.file->f_op != &bpf_link_fops_poll) { + if (fd_file(f)->f_op != &bpf_link_fops && fd_file(f)->f_op != &bpf_link_fops_poll) { fdput(f); return ERR_PTR(-EINVAL); } - link = f.file->private_data; + link = fd_file(f)->private_data; bpf_link_inc(link); fdput(f); @@ -4982,19 +4982,19 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr, return -EINVAL; f = fdget(ufd); - if (!f.file) + if (!fd_file(f)) return -EBADFD; - if (f.file->f_op == &bpf_prog_fops) - err = bpf_prog_get_info_by_fd(f.file, f.file->private_data, attr, + if (fd_file(f)->f_op == &bpf_prog_fops) + err = bpf_prog_get_info_by_fd(fd_file(f), fd_file(f)->private_data, attr, uattr); - else if (f.file->f_op == &bpf_map_fops) - err = bpf_map_get_info_by_fd(f.file, f.file->private_data, attr, + else if (fd_file(f)->f_op == &bpf_map_fops) + err = bpf_map_get_info_by_fd(fd_file(f), fd_file(f)->private_data, attr, uattr); - else if (f.file->f_op == &btf_fops) - err = bpf_btf_get_info_by_fd(f.file, f.file->private_data, attr, uattr); - else if (f.file->f_op == &bpf_link_fops || f.file->f_op == &bpf_link_fops_poll) - err = bpf_link_get_info_by_fd(f.file, f.file->private_data, + else if (fd_file(f)->f_op == &btf_fops) + err = bpf_btf_get_info_by_fd(fd_file(f), fd_file(f)->private_data, attr, uattr); + else if (fd_file(f)->f_op == &bpf_link_fops || fd_file(f)->f_op == &bpf_link_fops_poll) + err = bpf_link_get_info_by_fd(fd_file(f), fd_file(f)->private_data, attr, uattr); else err = -EINVAL; @@ -5215,7 +5215,7 @@ static int bpf_map_do_batch(const union bpf_attr *attr, else if (cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH) BPF_DO_BATCH(map->ops->map_lookup_and_delete_batch, map, attr, uattr); else if (cmd == BPF_MAP_UPDATE_BATCH) - BPF_DO_BATCH(map->ops->map_update_batch, map, f.file, attr, uattr); + BPF_DO_BATCH(map->ops->map_update_batch, map, fd_file(f), attr, uattr); else BPF_DO_BATCH(map->ops->map_delete_batch, map, attr, uattr); err_put: diff --git a/kernel/bpf/token.c b/kernel/bpf/token.c index d6ccf8d00eab..9a1d356e79ed 100644 --- a/kernel/bpf/token.c +++ b/kernel/bpf/token.c @@ -122,10 +122,10 @@ int bpf_token_create(union bpf_attr *attr) int err, fd; f = fdget(attr->token_create.bpffs_fd); - if (!f.file) + if (!fd_file(f)) return -EBADF; - path = f.file->f_path; + path 
= fd_file(f)->f_path; path_get(&path); fdput(f); @@ -235,14 +235,14 @@ struct bpf_token *bpf_token_get_from_fd(u32 ufd) struct fd f = fdget(ufd); struct bpf_token *token; - if (!f.file) + if (!fd_file(f)) return ERR_PTR(-EBADF); - if (f.file->f_op != &bpf_token_fops) { + if (fd_file(f)->f_op != &bpf_token_fops) { fdput(f); return ERR_PTR(-EINVAL); } - token = f.file->private_data; + token = fd_file(f)->private_data; bpf_token_inc(token); fdput(f); diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index c8e4b62b436a..b96489277f70 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -6901,10 +6901,10 @@ struct cgroup *cgroup_v1v2_get_from_fd(int fd) { struct cgroup *cgrp; struct fd f = fdget_raw(fd); - if (!f.file) + if (!fd_file(f)) return ERR_PTR(-EBADF); - cgrp = cgroup_v1v2_get_from_file(f.file); + cgrp = cgroup_v1v2_get_from_file(fd_file(f)); fdput(f); return cgrp; } diff --git a/kernel/events/core.c b/kernel/events/core.c index aa3450bdc227..fd2ac9c7fd77 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -933,10 +933,10 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, struct fd f = fdget(fd); int ret = 0; - if (!f.file) + if (!fd_file(f)) return -EBADF; - css = css_tryget_online_from_dir(f.file->f_path.dentry, + css = css_tryget_online_from_dir(fd_file(f)->f_path.dentry, &perf_event_cgrp_subsys); if (IS_ERR(css)) { ret = PTR_ERR(css); @@ -5898,10 +5898,10 @@ static const struct file_operations perf_fops; static inline int perf_fget_light(int fd, struct fd *p) { struct fd f = fdget(fd); - if (!f.file) + if (!fd_file(f)) return -EBADF; - if (f.file->f_op != &perf_fops) { + if (fd_file(f)->f_op != &perf_fops) { fdput(f); return -EBADF; } @@ -5961,7 +5961,7 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon ret = perf_fget_light(arg, &output); if (ret) return ret; - output_event = output.file->private_data; + output_event = fd_file(output)->private_data; ret = perf_event_set_output(event, output_event); fdput(output); } else { @@ -12474,7 +12474,7 @@ SYSCALL_DEFINE5(perf_event_open, struct perf_event_attr attr; struct perf_event_context *ctx; struct file *event_file = NULL; - struct fd group = {NULL, 0}; + struct fd group = EMPTY_FD; struct task_struct *task = NULL; struct pmu *pmu; int event_fd; @@ -12549,7 +12549,7 @@ SYSCALL_DEFINE5(perf_event_open, err = perf_fget_light(group_fd, &group); if (err) goto err_fd; - group_leader = group.file->private_data; + group_leader = fd_file(group)->private_data; if (flags & PERF_FLAG_FD_OUTPUT) output_event = group_leader; if (flags & PERF_FLAG_FD_NO_GROUP) diff --git a/kernel/module/main.c b/kernel/module/main.c index d9592195c5bb..6ed334eecc14 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -3211,7 +3211,7 @@ SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags) return -EINVAL; f = fdget(fd); - err = idempotent_init_module(f.file, uargs, flags); + err = idempotent_init_module(fd_file(f), uargs, flags); fdput(f); return err; } diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 6ec3deec68c2..dc952c3b05af 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -550,15 +550,15 @@ SYSCALL_DEFINE2(setns, int, fd, int, flags) struct nsset nsset = {}; int err = 0; - if (!f.file) + if (!fd_file(f)) return -EBADF; - if (proc_ns_file(f.file)) { - ns = get_proc_ns(file_inode(f.file)); + if (proc_ns_file(fd_file(f))) { + ns = get_proc_ns(file_inode(fd_file(f))); if (flags && (ns->ops->type != flags)) err = -EINVAL; 
flags = ns->ops->type; - } else if (!IS_ERR(pidfd_pid(f.file))) { + } else if (!IS_ERR(pidfd_pid(fd_file(f)))) { err = check_setns_flags(flags); } else { err = -EINVAL; @@ -570,10 +570,10 @@ SYSCALL_DEFINE2(setns, int, fd, int, flags) if (err) goto out; - if (proc_ns_file(f.file)) + if (proc_ns_file(fd_file(f))) err = validate_ns(&nsset, ns); else - err = validate_nsset(&nsset, pidfd_pid(f.file)); + err = validate_nsset(&nsset, pidfd_pid(fd_file(f))); if (!err) { commit_nsset(&nsset); perf_event_namespaces(current); diff --git a/kernel/pid.c b/kernel/pid.c index da76ed1873f7..2715afb77eab 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -540,13 +540,13 @@ struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags) struct pid *pid; f = fdget(fd); - if (!f.file) + if (!fd_file(f)) return ERR_PTR(-EBADF); - pid = pidfd_pid(f.file); + pid = pidfd_pid(fd_file(f)); if (!IS_ERR(pid)) { get_pid(pid); - *flags = f.file->f_flags; + *flags = fd_file(f)->f_flags; } fdput(f); @@ -755,10 +755,10 @@ SYSCALL_DEFINE3(pidfd_getfd, int, pidfd, int, fd, return -EINVAL; f = fdget(pidfd); - if (!f.file) + if (!fd_file(f)) return -EBADF; - pid = pidfd_pid(f.file); + pid = pidfd_pid(fd_file(f)); if (IS_ERR(pid)) ret = PTR_ERR(pid); else diff --git a/kernel/signal.c b/kernel/signal.c index 60c737e423a1..cc5d87cfa7c0 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3922,11 +3922,11 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig, return -EINVAL; f = fdget(pidfd); - if (!f.file) + if (!fd_file(f)) return -EBADF; /* Is this a pidfd? */ - pid = pidfd_to_pid(f.file); + pid = pidfd_to_pid(fd_file(f)); if (IS_ERR(pid)) { ret = PTR_ERR(pid); goto err; @@ -3939,7 +3939,7 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig, switch (flags) { case 0: /* Infer scope from the type of pidfd. */ - if (f.file->f_flags & PIDFD_THREAD) + if (fd_file(f)->f_flags & PIDFD_THREAD) type = PIDTYPE_PID; else type = PIDTYPE_TGID; diff --git a/kernel/sys.c b/kernel/sys.c index 3a2df1bd9f64..a4be1e568ff5 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1916,10 +1916,10 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) int err; exe = fdget(fd); - if (!exe.file) + if (!fd_file(exe)) return -EBADF; - inode = file_inode(exe.file); + inode = file_inode(fd_file(exe)); /* * Because the original mm->exe_file points to executable file, make @@ -1927,14 +1927,14 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) * overall picture. 
*/ err = -EACCES; - if (!S_ISREG(inode->i_mode) || path_noexec(&exe.file->f_path)) + if (!S_ISREG(inode->i_mode) || path_noexec(&fd_file(exe)->f_path)) goto exit; - err = file_permission(exe.file, MAY_EXEC); + err = file_permission(fd_file(exe), MAY_EXEC); if (err) goto exit; - err = replace_mm_exe_file(mm, exe.file); + err = replace_mm_exe_file(mm, fd_file(exe)); exit: fdput(exe); return err; diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 4354ea231fab..0700f40c53ac 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -419,7 +419,7 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info) fd = nla_get_u32(info->attrs[CGROUPSTATS_CMD_ATTR_FD]); f = fdget(fd); - if (!f.file) + if (!fd_file(f)) return 0; size = nla_total_size(sizeof(struct cgroupstats)); @@ -440,7 +440,7 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info) stats = nla_data(na); memset(stats, 0, sizeof(*stats)); - rc = cgroupstats_build(stats, f.file->f_path.dentry); + rc = cgroupstats_build(stats, fd_file(f)->f_path.dentry); if (rc < 0) { nlmsg_free(rep_skb); goto err; diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c index 84413114db5c..8d57f7686bb0 100644 --- a/kernel/time/timer_migration.c +++ b/kernel/time/timer_migration.c @@ -475,9 +475,54 @@ static bool tmigr_check_lonely(struct tmigr_group *group) return bitmap_weight(&active, BIT_CNT) <= 1; } -typedef bool (*up_f)(struct tmigr_group *, struct tmigr_group *, void *); +/** + * struct tmigr_walk - data required for walking the hierarchy + * @nextexp: Next CPU event expiry information which is handed into + * the timer migration code by the timer code + * (get_next_timer_interrupt()) + * @firstexp: Contains the first event expiry information when + * hierarchy is completely idle. When CPU itself was the + * last going idle, information makes sure, that CPU will + * be back in time. When using this value in the remote + * expiry case, firstexp is stored in the per CPU tmigr_cpu + * struct of CPU which expires remote timers. It is updated + * in top level group only. Be aware, there could occur a + * new top level of the hierarchy between the 'top level + * call' in tmigr_update_events() and the check for the + * parent group in walk_groups(). Then @firstexp might + * contain a value != KTIME_MAX even if it was not the + * final top level. This is not a problem, as the worst + * outcome is a CPU which might wake up a little early. + * @evt: Pointer to tmigr_event which needs to be queued (of idle + * child group) + * @childmask: groupmask of child group + * @remote: Is set, when the new timer path is executed in + * tmigr_handle_remote_cpu() + * @basej: timer base in jiffies + * @now: timer base monotonic + * @check: is set if there is the need to handle remote timers; + * required in tmigr_requires_handle_remote() only + * @tmc_active: this flag indicates, whether the CPU which triggers + * the hierarchy walk is !idle in the timer migration + * hierarchy. When the CPU is idle and the whole hierarchy is + * idle, only the first event of the top level has to be + * considered. 
+ */ +struct tmigr_walk { + u64 nextexp; + u64 firstexp; + struct tmigr_event *evt; + u8 childmask; + bool remote; + unsigned long basej; + u64 now; + bool check; + bool tmc_active; +}; + +typedef bool (*up_f)(struct tmigr_group *, struct tmigr_group *, struct tmigr_walk *); -static void __walk_groups(up_f up, void *data, +static void __walk_groups(up_f up, struct tmigr_walk *data, struct tmigr_cpu *tmc) { struct tmigr_group *child = NULL, *group = tmc->tmgroup; @@ -490,64 +535,17 @@ static void __walk_groups(up_f up, void *data, child = group; group = group->parent; + data->childmask = child->groupmask; } while (group); } -static void walk_groups(up_f up, void *data, struct tmigr_cpu *tmc) +static void walk_groups(up_f up, struct tmigr_walk *data, struct tmigr_cpu *tmc) { lockdep_assert_held(&tmc->lock); __walk_groups(up, data, tmc); } -/** - * struct tmigr_walk - data required for walking the hierarchy - * @nextexp: Next CPU event expiry information which is handed into - * the timer migration code by the timer code - * (get_next_timer_interrupt()) - * @firstexp: Contains the first event expiry information when last - * active CPU of hierarchy is on the way to idle to make - * sure CPU will be back in time. - * @evt: Pointer to tmigr_event which needs to be queued (of idle - * child group) - * @childmask: childmask of child group - * @remote: Is set, when the new timer path is executed in - * tmigr_handle_remote_cpu() - */ -struct tmigr_walk { - u64 nextexp; - u64 firstexp; - struct tmigr_event *evt; - u8 childmask; - bool remote; -}; - -/** - * struct tmigr_remote_data - data required for remote expiry hierarchy walk - * @basej: timer base in jiffies - * @now: timer base monotonic - * @firstexp: returns expiry of the first timer in the idle timer - * migration hierarchy to make sure the timer is handled in - * time; it is stored in the per CPU tmigr_cpu struct of - * CPU which expires remote timers - * @childmask: childmask of child group - * @check: is set if there is the need to handle remote timers; - * required in tmigr_requires_handle_remote() only - * @tmc_active: this flag indicates, whether the CPU which triggers - * the hierarchy walk is !idle in the timer migration - * hierarchy. When the CPU is idle and the whole hierarchy is - * idle, only the first event of the top level has to be - * considered. - */ -struct tmigr_remote_data { - unsigned long basej; - u64 now; - u64 firstexp; - u8 childmask; - bool check; - bool tmc_active; -}; - /* * Returns the next event of the timerqueue @group->events * @@ -618,10 +616,9 @@ static u64 tmigr_next_groupevt_expires(struct tmigr_group *group) static bool tmigr_active_up(struct tmigr_group *group, struct tmigr_group *child, - void *ptr) + struct tmigr_walk *data) { union tmigr_state curstate, newstate; - struct tmigr_walk *data = ptr; bool walk_done; u8 childmask; @@ -649,8 +646,7 @@ static bool tmigr_active_up(struct tmigr_group *group, } while (!atomic_try_cmpxchg(&group->migr_state, &curstate.state, newstate.state)); - if ((walk_done == false) && group->parent) - data->childmask = group->childmask; + trace_tmigr_group_set_cpu_active(group, newstate, childmask); /* * The group is active (again). 
The group event might be still queued @@ -666,8 +662,6 @@ static bool tmigr_active_up(struct tmigr_group *group, */ group->groupevt.ignore = true; - trace_tmigr_group_set_cpu_active(group, newstate, childmask); - return walk_done; } @@ -675,7 +669,7 @@ static void __tmigr_cpu_activate(struct tmigr_cpu *tmc) { struct tmigr_walk data; - data.childmask = tmc->childmask; + data.childmask = tmc->groupmask; trace_tmigr_cpu_active(tmc); @@ -860,10 +854,8 @@ unlock: static bool tmigr_new_timer_up(struct tmigr_group *group, struct tmigr_group *child, - void *ptr) + struct tmigr_walk *data) { - struct tmigr_walk *data = ptr; - return tmigr_update_events(group, child, data); } @@ -995,9 +987,8 @@ unlock: static bool tmigr_handle_remote_up(struct tmigr_group *group, struct tmigr_group *child, - void *ptr) + struct tmigr_walk *data) { - struct tmigr_remote_data *data = ptr; struct tmigr_event *evt; unsigned long jif; u8 childmask; @@ -1034,12 +1025,10 @@ again: } /* - * Update of childmask for the next level and keep track of the expiry - * of the first event that needs to be handled (group->next_expiry was - * updated by tmigr_next_expired_groupevt(), next was set by - * tmigr_handle_remote_cpu()). + * Keep track of the expiry of the first event that needs to be handled + * (group->next_expiry was updated by tmigr_next_expired_groupevt(), + * next was set by tmigr_handle_remote_cpu()). */ - data->childmask = group->childmask; data->firstexp = group->next_expiry; raw_spin_unlock_irq(&group->lock); @@ -1055,12 +1044,12 @@ again: void tmigr_handle_remote(void) { struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); - struct tmigr_remote_data data; + struct tmigr_walk data; if (tmigr_is_not_available(tmc)) return; - data.childmask = tmc->childmask; + data.childmask = tmc->groupmask; data.firstexp = KTIME_MAX; /* @@ -1068,7 +1057,7 @@ void tmigr_handle_remote(void) * in tmigr_handle_remote_up() anyway. Keep this check to speed up the * return when nothing has to be done. */ - if (!tmigr_check_migrator(tmc->tmgroup, tmc->childmask)) { + if (!tmigr_check_migrator(tmc->tmgroup, tmc->groupmask)) { /* * If this CPU was an idle migrator, make sure to clear its wakeup * value so it won't chase timers that have already expired elsewhere. @@ -1097,9 +1086,8 @@ void tmigr_handle_remote(void) static bool tmigr_requires_handle_remote_up(struct tmigr_group *group, struct tmigr_group *child, - void *ptr) + struct tmigr_walk *data) { - struct tmigr_remote_data *data = ptr; u8 childmask; childmask = data->childmask; @@ -1118,7 +1106,7 @@ static bool tmigr_requires_handle_remote_up(struct tmigr_group *group, * group before reading the next_expiry value. 
*/ if (group->parent && !data->tmc_active) - goto out; + return false; /* * The lock is required on 32bit architectures to read the variable @@ -1143,9 +1131,6 @@ static bool tmigr_requires_handle_remote_up(struct tmigr_group *group, raw_spin_unlock(&group->lock); } -out: - /* Update of childmask for the next level */ - data->childmask = group->childmask; return false; } @@ -1157,7 +1142,7 @@ out: bool tmigr_requires_handle_remote(void) { struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); - struct tmigr_remote_data data; + struct tmigr_walk data; unsigned long jif; bool ret = false; @@ -1165,7 +1150,7 @@ bool tmigr_requires_handle_remote(void) return ret; data.now = get_jiffies_update(&jif); - data.childmask = tmc->childmask; + data.childmask = tmc->groupmask; data.firstexp = KTIME_MAX; data.tmc_active = !tmc->idle; data.check = false; @@ -1230,14 +1215,13 @@ u64 tmigr_cpu_new_timer(u64 nextexp) if (nextexp != tmc->cpuevt.nextevt.expires || tmc->cpuevt.ignore) { ret = tmigr_new_timer(tmc, nextexp); + /* + * Make sure the reevaluation of timers in idle path + * will not miss an event. + */ + WRITE_ONCE(tmc->wakeup, ret); } } - /* - * Make sure the reevaluation of timers in idle path will not miss an - * event. - */ - WRITE_ONCE(tmc->wakeup, ret); - trace_tmigr_cpu_new_timer_idle(tmc, nextexp); raw_spin_unlock(&tmc->lock); return ret; @@ -1245,10 +1229,9 @@ u64 tmigr_cpu_new_timer(u64 nextexp) static bool tmigr_inactive_up(struct tmigr_group *group, struct tmigr_group *child, - void *ptr) + struct tmigr_walk *data) { union tmigr_state curstate, newstate, childstate; - struct tmigr_walk *data = ptr; bool walk_done; u8 childmask; @@ -1299,9 +1282,10 @@ static bool tmigr_inactive_up(struct tmigr_group *group, WARN_ON_ONCE((newstate.migrator != TMIGR_NONE) && !(newstate.active)); - if (atomic_try_cmpxchg(&group->migr_state, &curstate.state, - newstate.state)) + if (atomic_try_cmpxchg(&group->migr_state, &curstate.state, newstate.state)) { + trace_tmigr_group_set_cpu_inactive(group, newstate, childmask); break; + } /* * The memory barrier is paired with the cmpxchg() in @@ -1317,22 +1301,6 @@ static bool tmigr_inactive_up(struct tmigr_group *group, /* Event Handling */ tmigr_update_events(group, child, data); - if (group->parent && (walk_done == false)) - data->childmask = group->childmask; - - /* - * data->firstexp was set by tmigr_update_events() and contains the - * expiry of the first global event which needs to be handled. 
It - * differs from KTIME_MAX if: - * - group is the top level group and - * - group is idle (which means CPU was the last active CPU in the - * hierarchy) and - * - there is a pending event in the hierarchy - */ - WARN_ON_ONCE(data->firstexp != KTIME_MAX && group->parent); - - trace_tmigr_group_set_cpu_inactive(group, newstate, childmask); - return walk_done; } @@ -1341,7 +1309,7 @@ static u64 __tmigr_cpu_deactivate(struct tmigr_cpu *tmc, u64 nextexp) struct tmigr_walk data = { .nextexp = nextexp, .firstexp = KTIME_MAX, .evt = &tmc->cpuevt, - .childmask = tmc->childmask }; + .childmask = tmc->groupmask }; /* * If nextexp is KTIME_MAX, the CPU event will be ignored because the @@ -1400,7 +1368,7 @@ u64 tmigr_cpu_deactivate(u64 nextexp) * the only one in the level 0 group; and if it is the * only one in level 0 group, but there are more than a * single group active on the way to top level) - * * nextevt - when CPU is offline and has to handle timer on his own + * * nextevt - when CPU is offline and has to handle timer on its own * or when on the way to top in every group only a single * child is active but @nextevt is before the lowest * next_expiry encountered while walking up to top level. @@ -1419,7 +1387,7 @@ u64 tmigr_quick_check(u64 nextevt) if (WARN_ON_ONCE(tmc->idle)) return nextevt; - if (!tmigr_check_migrator_and_lonely(tmc->tmgroup, tmc->childmask)) + if (!tmigr_check_migrator_and_lonely(tmc->tmgroup, tmc->groupmask)) return KTIME_MAX; do { @@ -1442,6 +1410,66 @@ u64 tmigr_quick_check(u64 nextevt) return KTIME_MAX; } +/* + * tmigr_trigger_active() - trigger a CPU to become active again + * + * This function is executed on a CPU which is part of cpu_online_mask, when the + * last active CPU in the hierarchy is offlining. With this, it is ensured that + * the other CPU is active and takes over the migrator duty. 
+ */ +static long tmigr_trigger_active(void *unused) +{ + struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); + + WARN_ON_ONCE(!tmc->online || tmc->idle); + + return 0; +} + +static int tmigr_cpu_offline(unsigned int cpu) +{ + struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); + int migrator; + u64 firstexp; + + raw_spin_lock_irq(&tmc->lock); + tmc->online = false; + WRITE_ONCE(tmc->wakeup, KTIME_MAX); + + /* + * CPU has to handle the local events on his own, when on the way to + * offline; Therefore nextevt value is set to KTIME_MAX + */ + firstexp = __tmigr_cpu_deactivate(tmc, KTIME_MAX); + trace_tmigr_cpu_offline(tmc); + raw_spin_unlock_irq(&tmc->lock); + + if (firstexp != KTIME_MAX) { + migrator = cpumask_any_but(cpu_online_mask, cpu); + work_on_cpu(migrator, tmigr_trigger_active, NULL); + } + + return 0; +} + +static int tmigr_cpu_online(unsigned int cpu) +{ + struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); + + /* Check whether CPU data was successfully initialized */ + if (WARN_ON_ONCE(!tmc->tmgroup)) + return -EINVAL; + + raw_spin_lock_irq(&tmc->lock); + trace_tmigr_cpu_online(tmc); + tmc->idle = timer_base_is_idle(); + if (!tmc->idle) + __tmigr_cpu_activate(tmc); + tmc->online = true; + raw_spin_unlock_irq(&tmc->lock); + return 0; +} + static void tmigr_init_group(struct tmigr_group *group, unsigned int lvl, int node) { @@ -1514,21 +1542,25 @@ static struct tmigr_group *tmigr_get_group(unsigned int cpu, int node, } static void tmigr_connect_child_parent(struct tmigr_group *child, - struct tmigr_group *parent) + struct tmigr_group *parent, + bool activate) { - union tmigr_state childstate; + struct tmigr_walk data; raw_spin_lock_irq(&child->lock); raw_spin_lock_nested(&parent->lock, SINGLE_DEPTH_NESTING); child->parent = parent; - child->childmask = BIT(parent->num_children++); + child->groupmask = BIT(parent->num_children++); raw_spin_unlock(&parent->lock); raw_spin_unlock_irq(&child->lock); trace_tmigr_connect_child_parent(child); + if (!activate) + return; + /* * To prevent inconsistent states, active children need to be active in * the new parent as well. Inactive children are already marked inactive @@ -1544,21 +1576,24 @@ static void tmigr_connect_child_parent(struct tmigr_group *child, * child to the new parent. So tmigr_connect_child_parent() is * executed with the formerly top level group (child) and the newly * created group (parent). + * + * * It is ensured that the child is active, as this setup path is + * executed in hotplug prepare callback. This is exectued by an + * already connected and !idle CPU. Even if all other CPUs go idle, + * the CPU executing the setup will be responsible up to current top + * level group. And the next time it goes inactive, it will release + * the new childmask and parent to subsequent walkers through this + * @child. Therefore propagate active state unconditionally. */ - childstate.state = atomic_read(&child->migr_state); - if (childstate.migrator != TMIGR_NONE) { - struct tmigr_walk data; - - data.childmask = child->childmask; + data.childmask = child->groupmask; - /* - * There is only one new level per time. When connecting the - * child and the parent and set the child active when the parent - * is inactive, the parent needs to be the uppermost - * level. Otherwise there went something wrong! - */ - WARN_ON(!tmigr_active_up(parent, child, &data) && parent->parent); - } + /* + * There is only one new level per time (which is protected by + * tmigr_mutex). 
When connecting the child and the parent and set the + * child active when the parent is inactive, the parent needs to be the + * uppermost level. Otherwise there went something wrong! + */ + WARN_ON(!tmigr_active_up(parent, child, &data) && parent->parent); } static int tmigr_setup_groups(unsigned int cpu, unsigned int node) @@ -1611,12 +1646,12 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node) * Update tmc -> group / child -> group connection */ if (i == 0) { - struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); + struct tmigr_cpu *tmc = per_cpu_ptr(&tmigr_cpu, cpu); raw_spin_lock_irq(&group->lock); tmc->tmgroup = group; - tmc->childmask = BIT(group->num_children++); + tmc->groupmask = BIT(group->num_children++); raw_spin_unlock_irq(&group->lock); @@ -1626,7 +1661,8 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node) continue; } else { child = stack[i - 1]; - tmigr_connect_child_parent(child, group); + /* Will be activated at online time */ + tmigr_connect_child_parent(child, group, false); } /* check if uppermost level was newly created */ @@ -1637,12 +1673,21 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node) lvllist = &tmigr_level_list[top]; if (group->num_children == 1 && list_is_singular(lvllist)) { + /* + * The target CPU must never do the prepare work, except + * on early boot when the boot CPU is the target. Otherwise + * it may spuriously activate the old top level group inside + * the new one (nevertheless whether old top level group is + * active or not) and/or release an uninitialized childmask. + */ + WARN_ON_ONCE(cpu == raw_smp_processor_id()); + lvllist = &tmigr_level_list[top - 1]; list_for_each_entry(child, lvllist, list) { if (child->parent) continue; - tmigr_connect_child_parent(child, group); + tmigr_connect_child_parent(child, group, true); } } } @@ -1664,80 +1709,31 @@ static int tmigr_add_cpu(unsigned int cpu) return ret; } -static int tmigr_cpu_online(unsigned int cpu) -{ - struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); - int ret; - - /* First online attempt? Initialize CPU data */ - if (!tmc->tmgroup) { - raw_spin_lock_init(&tmc->lock); - - ret = tmigr_add_cpu(cpu); - if (ret < 0) - return ret; - - if (tmc->childmask == 0) - return -EINVAL; - - timerqueue_init(&tmc->cpuevt.nextevt); - tmc->cpuevt.nextevt.expires = KTIME_MAX; - tmc->cpuevt.ignore = true; - tmc->cpuevt.cpu = cpu; - - tmc->remote = false; - WRITE_ONCE(tmc->wakeup, KTIME_MAX); - } - raw_spin_lock_irq(&tmc->lock); - trace_tmigr_cpu_online(tmc); - tmc->idle = timer_base_is_idle(); - if (!tmc->idle) - __tmigr_cpu_activate(tmc); - tmc->online = true; - raw_spin_unlock_irq(&tmc->lock); - return 0; -} - -/* - * tmigr_trigger_active() - trigger a CPU to become active again - * - * This function is executed on a CPU which is part of cpu_online_mask, when the - * last active CPU in the hierarchy is offlining. With this, it is ensured that - * the other CPU is active and takes over the migrator duty. - */ -static long tmigr_trigger_active(void *unused) +static int tmigr_cpu_prepare(unsigned int cpu) { - struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); + struct tmigr_cpu *tmc = per_cpu_ptr(&tmigr_cpu, cpu); + int ret = 0; - WARN_ON_ONCE(!tmc->online || tmc->idle); - - return 0; -} - -static int tmigr_cpu_offline(unsigned int cpu) -{ - struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); - int migrator; - u64 firstexp; + /* Not first online attempt? 
*/ + if (tmc->tmgroup) + return ret; - raw_spin_lock_irq(&tmc->lock); - tmc->online = false; + raw_spin_lock_init(&tmc->lock); + timerqueue_init(&tmc->cpuevt.nextevt); + tmc->cpuevt.nextevt.expires = KTIME_MAX; + tmc->cpuevt.ignore = true; + tmc->cpuevt.cpu = cpu; + tmc->remote = false; WRITE_ONCE(tmc->wakeup, KTIME_MAX); - /* - * CPU has to handle the local events on his own, when on the way to - * offline; Therefore nextevt value is set to KTIME_MAX - */ - firstexp = __tmigr_cpu_deactivate(tmc, KTIME_MAX); - trace_tmigr_cpu_offline(tmc); - raw_spin_unlock_irq(&tmc->lock); + ret = tmigr_add_cpu(cpu); + if (ret < 0) + return ret; - if (firstexp != KTIME_MAX) { - migrator = cpumask_any_but(cpu_online_mask, cpu); - work_on_cpu(migrator, tmigr_trigger_active, NULL); - } + if (tmc->groupmask == 0) + return -EINVAL; - return 0; + return ret; } static int __init tmigr_init(void) @@ -1796,6 +1792,11 @@ static int __init tmigr_init(void) tmigr_hierarchy_levels, TMIGR_CHILDREN_PER_GROUP, tmigr_crossnode_level); + ret = cpuhp_setup_state(CPUHP_TMIGR_PREPARE, "tmigr:prepare", + tmigr_cpu_prepare, NULL); + if (ret) + goto err; + ret = cpuhp_setup_state(CPUHP_AP_TMIGR_ONLINE, "tmigr:online", tmigr_cpu_online, tmigr_cpu_offline); if (ret) @@ -1807,4 +1808,4 @@ err: pr_err("Timer migration setup failed\n"); return ret; } -late_initcall(tmigr_init); +early_initcall(tmigr_init); diff --git a/kernel/time/timer_migration.h b/kernel/time/timer_migration.h index 6c37d94a37d9..154accc7a543 100644 --- a/kernel/time/timer_migration.h +++ b/kernel/time/timer_migration.h @@ -22,7 +22,17 @@ struct tmigr_event { * struct tmigr_group - timer migration hierarchy group * @lock: Lock protecting the event information and group hierarchy * information during setup - * @parent: Pointer to the parent group + * @parent: Pointer to the parent group. Pointer is updated when a + * new hierarchy level is added because of a CPU coming + * online the first time. Once it is set, the pointer will + * not be removed or updated. When accessing parent pointer + * lock less to decide whether to abort a propagation or + * not, it is not a problem. The worst outcome is an + * unnecessary/early CPU wake up. But do not access parent + * pointer several times in the same 'action' (like + * activation, deactivation, check for remote expiry,...) + * without holding the lock as it is not ensured that value + * will not change. * @groupevt: Next event of the group which is only used when the * group is !active. The group event is then queued into * the parent timer queue. 
@@ -41,9 +51,8 @@ struct tmigr_event { * @num_children: Counter of group children to make sure the group is only * filled with TMIGR_CHILDREN_PER_GROUP; Required for setup * only - * @childmask: childmask of the group in the parent group; is set - * during setup and will never change; can be read - * lockless + * @groupmask: mask of the group in the parent group; is set during + * setup and will never change; can be read lockless * @list: List head that is added to the per level * tmigr_level_list; is required during setup when a * new group needs to be connected to the existing @@ -59,7 +68,7 @@ struct tmigr_group { unsigned int level; int numa_node; unsigned int num_children; - u8 childmask; + u8 groupmask; struct list_head list; }; @@ -79,7 +88,7 @@ struct tmigr_group { * hierarchy * @remote: Is set when timers of the CPU are expired remotely * @tmgroup: Pointer to the parent group - * @childmask: childmask of tmigr_cpu in the parent group + * @groupmask: mask of tmigr_cpu in the parent group * @wakeup: Stores the first timer when the timer migration * hierarchy is completely idle and remote expiry was done; * is returned to timer code in the idle path and is only @@ -92,7 +101,7 @@ struct tmigr_cpu { bool idle; bool remote; struct tmigr_group *tmgroup; - u8 childmask; + u8 groupmask; u64 wakeup; struct tmigr_event cpuevt; }; @@ -108,8 +117,8 @@ union tmigr_state { u32 state; /** * struct - split state of tmigr_group - * @active: Contains each childmask bit of the active children - * @migrator: Contains childmask of the child which is migrator + * @active: Contains each mask bit of the active children + * @migrator: Contains mask of the child which is migrator * @seq: Sequence counter needs to be increased when an update * to the tmigr_state is done. It prevents a race when * updates in the child groups are propagated in changed diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c index 03b90d7d2175..d36242fd4936 100644 --- a/kernel/watch_queue.c +++ b/kernel/watch_queue.c @@ -666,8 +666,8 @@ struct watch_queue *get_watch_queue(int fd) struct fd f; f = fdget(fd); - if (f.file) { - pipe = get_pipe_info(f.file, false); + if (fd_file(f)) { + pipe = get_pipe_info(fd_file(f), false); if (pipe && pipe->watch_queue) { wqueue = pipe->watch_queue; kref_get(&wqueue->usage); |
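
The timer_migration.c changes above fold struct tmigr_remote_data into struct tmigr_walk, change the up_f callback signature from void * to struct tmigr_walk *, and move the per-level childmask update into __walk_groups() so the individual *_up() handlers no longer repeat it; the childmask fields are renamed to groupmask, per-CPU initialization moves from the online callback into the new CPUHP_TMIGR_PREPARE step, and tmigr_init() becomes an early_initcall. A condensed sketch of the resulting walk loop, assembled from the hunks above with the elided context lines assumed, looks like this:

	/*
	 * Condensed sketch of the hierarchy walk after this change; the
	 * loop body between the hunk context lines is assumed.
	 */
	typedef bool (*up_f)(struct tmigr_group *, struct tmigr_group *,
			     struct tmigr_walk *);

	static void __walk_groups(up_f up, struct tmigr_walk *data,
				  struct tmigr_cpu *tmc)
	{
		struct tmigr_group *child = NULL, *group = tmc->tmgroup;

		do {
			/* Stop when the handler reports the walk is done. */
			if (up(group, child, data))
				break;

			child = group;
			group = group->parent;

			/* groupmask for the next level, set in one place */
			data->childmask = child->groupmask;
		} while (group);
	}

Advancing data->childmask centrally is what lets tmigr_active_up(), tmigr_inactive_up(), tmigr_handle_remote_up() and tmigr_requires_handle_remote_up() drop their per-handler childmask bookkeeping in the hunks above.
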