Diffstat (limited to 'kernel')
-rw-r--r--  kernel/bpf/memalloc.c            | 44
-rw-r--r--  kernel/bpf/mprog.c               | 13
-rw-r--r--  kernel/bpf/syscall.c             | 21
-rw-r--r--  kernel/bpf/tcx.c                 |  8
-rw-r--r--  kernel/bpf/verifier.c            | 14
-rw-r--r--  kernel/cgroup/cgroup-v1.c        |  5
-rw-r--r--  kernel/power/snapshot.c          |  4
-rw-r--r--  kernel/printk/printk.c           |  8
-rw-r--r--  kernel/sched/cpufreq_schedutil.c |  3
-rw-r--r--  kernel/sched/fair.c              | 89
-rw-r--r--  kernel/workqueue.c               | 24
11 files changed, 154 insertions(+), 79 deletions(-)
diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index cf1941516643..d93ddac283d4 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -965,37 +965,31 @@ void notrace *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags)
 	return !ret ? NULL : ret + LLIST_NODE_SZ;
 }
 
-/* Most of the logic is taken from setup_kmalloc_cache_index_table() */
 static __init int bpf_mem_cache_adjust_size(void)
 {
-	unsigned int size, index;
+	unsigned int size;
 
-	/* Normally KMALLOC_MIN_SIZE is 8-bytes, but it can be
-	 * up-to 256-bytes.
+	/* Adjusting the indexes in size_index() according to the object_size
+	 * of underlying slab cache, so bpf_mem_alloc() will select a
+	 * bpf_mem_cache with unit_size equal to the object_size of
+	 * the underlying slab cache.
+	 *
+	 * The maximal value of KMALLOC_MIN_SIZE and __kmalloc_minalign() is
+	 * 256-bytes, so only do adjustment for [8-bytes, 192-bytes].
 	 */
-	size = KMALLOC_MIN_SIZE;
-	if (size <= 192)
-		index = size_index[(size - 1) / 8];
-	else
-		index = fls(size - 1) - 1;
-	for (size = 8; size < KMALLOC_MIN_SIZE && size <= 192; size += 8)
-		size_index[(size - 1) / 8] = index;
+	for (size = 192; size >= 8; size -= 8) {
+		unsigned int kmalloc_size, index;
 
-	/* The minimal alignment is 64-bytes, so disable 96-bytes cache and
-	 * use 128-bytes cache instead.
-	 */
-	if (KMALLOC_MIN_SIZE >= 64) {
-		index = size_index[(128 - 1) / 8];
-		for (size = 64 + 8; size <= 96; size += 8)
-			size_index[(size - 1) / 8] = index;
-	}
+		kmalloc_size = kmalloc_size_roundup(size);
+		if (kmalloc_size == size)
+			continue;
 
-	/* The minimal alignment is 128-bytes, so disable 192-bytes cache and
-	 * use 256-bytes cache instead.
-	 */
-	if (KMALLOC_MIN_SIZE >= 128) {
-		index = fls(256 - 1) - 1;
-		for (size = 128 + 8; size <= 192; size += 8)
+		if (kmalloc_size <= 192)
+			index = size_index[(kmalloc_size - 1) / 8];
+		else
+			index = fls(kmalloc_size - 1) - 1;
+		/* Only overwrite if necessary */
+		if (size_index[(size - 1) / 8] != index)
 			size_index[(size - 1) / 8] = index;
 	}
diff --git a/kernel/bpf/mprog.c b/kernel/bpf/mprog.c
index 32d2c4829eb8..1394168062e8 100644
--- a/kernel/bpf/mprog.c
+++ b/kernel/bpf/mprog.c
@@ -253,6 +253,9 @@ int bpf_mprog_attach(struct bpf_mprog_entry *entry,
 			goto out;
 		}
 		idx = tidx;
+	} else if (bpf_mprog_total(entry) == bpf_mprog_max()) {
+		ret = -ERANGE;
+		goto out;
 	}
 	if (flags & BPF_F_BEFORE) {
 		tidx = bpf_mprog_pos_before(entry, &rtuple);
@@ -398,14 +401,16 @@ int bpf_mprog_query(const union bpf_attr *attr, union bpf_attr __user *uattr,
 	struct bpf_mprog_cp *cp;
 	struct bpf_prog *prog;
 	const u32 flags = 0;
+	u32 id, count = 0;
+	u64 revision = 1;
 	int i, ret = 0;
-	u32 id, count;
-	u64 revision;
 
 	if (attr->query.query_flags || attr->query.attach_flags)
 		return -EINVAL;
-	revision = bpf_mprog_revision(entry);
-	count = bpf_mprog_total(entry);
+	if (entry) {
+		revision = bpf_mprog_revision(entry);
+		count = bpf_mprog_total(entry);
+	}
 	if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags)))
 		return -EFAULT;
 	if (copy_to_user(&uattr->query.revision, &revision, sizeof(revision)))
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index eb01c31ed591..d77b2f8b9364 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -3796,7 +3796,6 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 {
 	enum bpf_prog_type ptype;
 	struct bpf_prog *prog;
-	u32 mask;
 	int ret;
 
 	if (CHECK_ATTR(BPF_PROG_ATTACH))
@@ -3805,10 +3804,16 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 	ptype = attach_type_to_prog_type(attr->attach_type);
 	if (ptype == BPF_PROG_TYPE_UNSPEC)
 		return -EINVAL;
-	mask = bpf_mprog_supported(ptype) ?
-	       BPF_F_ATTACH_MASK_MPROG : BPF_F_ATTACH_MASK_BASE;
-	if (attr->attach_flags & ~mask)
-		return -EINVAL;
+	if (bpf_mprog_supported(ptype)) {
+		if (attr->attach_flags & ~BPF_F_ATTACH_MASK_MPROG)
+			return -EINVAL;
+	} else {
+		if (attr->attach_flags & ~BPF_F_ATTACH_MASK_BASE)
+			return -EINVAL;
+		if (attr->relative_fd ||
+		    attr->expected_revision)
+			return -EINVAL;
+	}
 
 	prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
 	if (IS_ERR(prog))
@@ -3878,6 +3883,10 @@ static int bpf_prog_detach(const union bpf_attr *attr)
 			if (IS_ERR(prog))
 				return PTR_ERR(prog);
 		}
+	} else if (attr->attach_flags ||
+		   attr->relative_fd ||
+		   attr->expected_revision) {
+		return -EINVAL;
 	}
 
 	switch (ptype) {
@@ -3913,7 +3922,7 @@ static int bpf_prog_detach(const union bpf_attr *attr)
 	return ret;
 }
 
-#define BPF_PROG_QUERY_LAST_FIELD query.link_attach_flags
+#define BPF_PROG_QUERY_LAST_FIELD query.revision
 
 static int bpf_prog_query(const union bpf_attr *attr,
 			  union bpf_attr __user *uattr)
diff --git a/kernel/bpf/tcx.c b/kernel/bpf/tcx.c
index 13f0b5dc8262..1338a13a8b64 100644
--- a/kernel/bpf/tcx.c
+++ b/kernel/bpf/tcx.c
@@ -123,7 +123,6 @@ int tcx_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr)
 {
 	bool ingress = attr->query.attach_type == BPF_TCX_INGRESS;
 	struct net *net = current->nsproxy->net_ns;
-	struct bpf_mprog_entry *entry;
 	struct net_device *dev;
 	int ret;
 
@@ -133,12 +132,7 @@ int tcx_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr)
 		ret = -ENODEV;
 		goto out;
 	}
-	entry = tcx_entry_fetch(dev, ingress);
-	if (!entry) {
-		ret = -ENOENT;
-		goto out;
-	}
-	ret = bpf_mprog_query(attr, uattr, entry);
+	ret = bpf_mprog_query(attr, uattr, tcx_entry_fetch(dev, ingress));
 out:
 	rtnl_unlock();
 	return ret;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index bb78212fa5b2..873ade146f3d 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4047,11 +4047,9 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno)
 		bitmap_from_u64(mask, bt_reg_mask(bt));
 		for_each_set_bit(i, mask, 32) {
 			reg = &st->frame[0]->regs[i];
-			if (reg->type != SCALAR_VALUE) {
-				bt_clear_reg(bt, i);
-				continue;
-			}
-			reg->precise = true;
+			bt_clear_reg(bt, i);
+			if (reg->type == SCALAR_VALUE)
+				reg->precise = true;
 		}
 		return 0;
 	}
@@ -14481,7 +14479,7 @@ static int check_return_code(struct bpf_verifier_env *env)
 	struct tnum enforce_attach_type_range = tnum_unknown;
 	const struct bpf_prog *prog = env->prog;
 	struct bpf_reg_state *reg;
-	struct tnum range = tnum_range(0, 1);
+	struct tnum range = tnum_range(0, 1), const_0 = tnum_const(0);
 	enum bpf_prog_type prog_type = resolve_prog_type(env->prog);
 	int err;
 	struct bpf_func_state *frame = env->cur_state->frame[0];
@@ -14529,8 +14527,8 @@ static int check_return_code(struct bpf_verifier_env *env)
 			return -EINVAL;
 		}
-		if (!tnum_in(tnum_const(0), reg->var_off)) {
-			verbose_invalid_scalar(env, reg, &range, "async callback", "R0");
+		if (!tnum_in(const_0, reg->var_off)) {
+			verbose_invalid_scalar(env, reg, &const_0, "async callback", "R0");
 			return -EINVAL;
 		}
 		return 0;
 	}
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index c487ffef6652..76db6c67e39a 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -360,10 +360,9 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
 	}
 	css_task_iter_end(&it);
 	length = n;
-	/* now sort & (if procs) strip out duplicates */
+	/* now sort & strip out duplicates (tgids or recycled thread PIDs) */
 	sort(array, length, sizeof(pid_t), cmppid, NULL);
-	if (type == CGROUP_FILE_PROCS)
-		length = pidlist_uniq(array, length);
+	length = pidlist_uniq(array, length);
 
 	l = cgroup_pidlist_find_create(cgrp, type);
 	if (!l) {
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 87e9f7e2bdc0..0f12e0a97e43 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -2647,7 +2647,7 @@ static int prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm,
 	memory_bm_free(bm, PG_UNSAFE_KEEP);
 
 	/* Make a copy of zero_bm so it can be created in safe pages */
-	error = memory_bm_create(&tmp, GFP_ATOMIC, PG_ANY);
+	error = memory_bm_create(&tmp, GFP_ATOMIC, PG_SAFE);
 	if (error)
 		goto Free;
 
@@ -2660,7 +2660,7 @@ static int prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm,
 		goto Free;
 
 	duplicate_memory_bitmap(zero_bm, &tmp);
-	memory_bm_free(&tmp, PG_UNSAFE_KEEP);
+	memory_bm_free(&tmp, PG_UNSAFE_CLEAR);
 
 	/* At this point zero_bm is in safe pages and it can be used for restoring. */
 	if (nr_highmem > 0) {
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 7e0b4dd02398..0b3af1529778 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -3740,12 +3740,18 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre
 
 	seq = prb_next_seq(prb);
 
+	/* Flush the consoles so that records up to @seq are printed. */
+	console_lock();
+	console_unlock();
+
 	for (;;) {
 		diff = 0;
 
 		/*
 		 * Hold the console_lock to guarantee safe access to
-		 * console->seq.
+		 * console->seq. Releasing console_lock flushes more
+		 * records in case @seq is still not printed on all
+		 * usable consoles.
 		 */
 		console_lock();
 
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 4492608b7d7f..458d359f5991 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -350,7 +350,8 @@ static void sugov_update_single_freq(struct update_util_data *hook, u64 time,
 	 * Except when the rq is capped by uclamp_max.
 	 */
 	if (!uclamp_rq_is_capped(cpu_rq(sg_cpu->cpu)) &&
-	    sugov_cpu_is_busy(sg_cpu) && next_f < sg_policy->next_freq) {
+	    sugov_cpu_is_busy(sg_cpu) && next_f < sg_policy->next_freq &&
+	    !sg_policy->need_freq_update) {
 		next_f = sg_policy->next_freq;
 
 		/* Restore cached freq as next_freq has changed */
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index cb225921bbca..061a30a8925a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -664,6 +664,10 @@ void avg_vruntime_update(struct cfs_rq *cfs_rq, s64 delta)
 	cfs_rq->avg_vruntime -= cfs_rq->avg_load * delta;
 }
 
+/*
+ * Specifically: avg_runtime() + 0 must result in entity_eligible() := true
+ * For this to be so, the result of this function must have a left bias.
+ */
 u64 avg_vruntime(struct cfs_rq *cfs_rq)
 {
 	struct sched_entity *curr = cfs_rq->curr;
@@ -677,8 +681,12 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq)
 		load += weight;
 	}
 
-	if (load)
+	if (load) {
+		/* sign flips effective floor / ceil */
+		if (avg < 0)
+			avg -= (load - 1);
 		avg = div_s64(avg, load);
+	}
 
 	return cfs_rq->min_vruntime + avg;
 }
@@ -864,14 +872,16 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
  *
  * Which allows an EDF like search on (sub)trees.
  */
-static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
+static struct sched_entity *__pick_eevdf(struct cfs_rq *cfs_rq)
 {
 	struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node;
 	struct sched_entity *curr = cfs_rq->curr;
 	struct sched_entity *best = NULL;
+	struct sched_entity *best_left = NULL;
 
 	if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr)))
 		curr = NULL;
+	best = curr;
 
 	/*
 	 * Once selected, run a task until it either becomes non-eligible or
@@ -892,33 +902,75 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
 		}
 
 		/*
-		 * If this entity has an earlier deadline than the previous
-		 * best, take this one. If it also has the earliest deadline
-		 * of its subtree, we're done.
+		 * Now we heap search eligible trees for the best (min_)deadline
 		 */
-		if (!best || deadline_gt(deadline, best, se)) {
+		if (!best || deadline_gt(deadline, best, se))
 			best = se;
-			if (best->deadline == best->min_deadline)
-				break;
-		}
 
 		/*
-		 * If the earlest deadline in this subtree is in the fully
-		 * eligible left half of our space, go there.
+		 * Every se in a left branch is eligible, keep track of the
+		 * branch with the best min_deadline
 		 */
+		if (node->rb_left) {
+			struct sched_entity *left = __node_2_se(node->rb_left);
+
+			if (!best_left || deadline_gt(min_deadline, best_left, left))
+				best_left = left;
+
+			/*
+			 * min_deadline is in the left branch. rb_left and all
+			 * descendants are eligible, so immediately switch to the second
+			 * loop.
+			 */
+			if (left->min_deadline == se->min_deadline)
+				break;
+		}
+
+		/* min_deadline is at this node, no need to look right */
+		if (se->deadline == se->min_deadline)
+			break;
+
+		/* else min_deadline is in the right branch. */
+		node = node->rb_right;
+	}
+
+	/*
+	 * We ran into an eligible node which is itself the best.
+	 * (Or nr_running == 0 and both are NULL)
+	 */
+	if (!best_left || (s64)(best_left->min_deadline - best->deadline) > 0)
+		return best;
+
+	/*
+	 * Now best_left and all of its children are eligible, and we are just
+	 * looking for deadline == min_deadline
+	 */
+	node = &best_left->run_node;
+	while (node) {
+		struct sched_entity *se = __node_2_se(node);
+
+		/* min_deadline is the current node */
+		if (se->deadline == se->min_deadline)
+			return se;
+
+		/* min_deadline is in the left branch */
 		if (node->rb_left &&
 		    __node_2_se(node->rb_left)->min_deadline == se->min_deadline) {
 			node = node->rb_left;
 			continue;
 		}
 
+		/* else min_deadline is in the right branch */
 		node = node->rb_right;
 	}
+	return NULL;
+}
 
-	if (!best || (curr && deadline_gt(deadline, best, curr)))
-		best = curr;
+static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
+{
+	struct sched_entity *se = __pick_eevdf(cfs_rq);
 
-	if (unlikely(!best)) {
+	if (!se) {
 		struct sched_entity *left = __pick_first_entity(cfs_rq);
 		if (left) {
 			pr_err("EEVDF scheduling fail, picking leftmost\n");
@@ -926,7 +978,7 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
 		}
 	}
 
-	return best;
+	return se;
 }
 
 #ifdef CONFIG_SCHED_DEBUG
@@ -3605,6 +3657,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
 		 */
 		deadline = div_s64(deadline * old_weight, weight);
 		se->deadline = se->vruntime + deadline;
+		min_deadline_cb_propagate(&se->run_node, NULL);
 	}
 
 #ifdef CONFIG_SMP
@@ -4919,10 +4972,12 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {}
 static void
 place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 {
-	u64 vslice = calc_delta_fair(se->slice, se);
-	u64 vruntime = avg_vruntime(cfs_rq);
+	u64 vslice, vruntime = avg_vruntime(cfs_rq);
 	s64 lag = 0;
 
+	se->slice = sysctl_sched_base_slice;
+	vslice = calc_delta_fair(se->slice, se);
+
 	/*
 	 * Due to how V is constructed as the weighted average of entities,
 	 * adding tasks with positive lag, or removing tasks with negative lag
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index b9f053a5a5f0..a3522b70218d 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2166,7 +2166,7 @@ static struct worker *create_worker(struct worker_pool *pool)
 {
 	struct worker *worker;
 	int id;
-	char id_buf[16];
+	char id_buf[23];
 
 	/* ID is needed to determine kthread name */
 	id = ida_alloc(&pool->worker_ida, GFP_KERNEL);
@@ -4600,12 +4600,22 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq)
 	}
 	cpus_read_unlock();
 
+	/* for unbound pwq, flush the pwq_release_worker ensures that the
+	 * pwq_release_workfn() completes before calling kfree(wq).
+	 */
+	if (ret)
+		kthread_flush_worker(pwq_release_worker);
+
 	return ret;
 
 enomem:
 	if (wq->cpu_pwq) {
-		for_each_possible_cpu(cpu)
-			kfree(*per_cpu_ptr(wq->cpu_pwq, cpu));
+		for_each_possible_cpu(cpu) {
+			struct pool_workqueue *pwq = *per_cpu_ptr(wq->cpu_pwq, cpu);
+
+			if (pwq)
+				kmem_cache_free(pwq_cache, pwq);
+		}
 		free_percpu(wq->cpu_pwq);
 		wq->cpu_pwq = NULL;
 	}
@@ -5782,9 +5792,13 @@ static int workqueue_apply_unbound_cpumask(const cpumask_var_t unbound_cpumask)
 	list_for_each_entry(wq, &workqueues, list) {
 		if (!(wq->flags & WQ_UNBOUND))
 			continue;
+
 		/* creating multiple pwqs breaks ordering guarantee */
-		if (wq->flags & __WQ_ORDERED)
-			continue;
+		if (!list_empty(&wq->pwqs)) {
+			if (wq->flags & __WQ_ORDERED_EXPLICIT)
+				continue;
+			wq->flags &= ~__WQ_ORDERED;
+		}
 
 		ctx = apply_wqattrs_prepare(wq, wq->unbound_attrs, unbound_cpumask);
 		if (IS_ERR(ctx)) {
