summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/bpf/core.c16
-rw-r--r--kernel/bpf/dmabuf_iter.c56
-rw-r--r--kernel/cgroup/rstat.c13
-rw-r--r--kernel/power/em_netlink_autogen.c1
-rw-r--r--kernel/power/em_netlink_autogen.h1
-rw-r--r--kernel/sched/ext.c72
-rw-r--r--kernel/trace/bpf_trace.c2
7 files changed, 124 insertions, 37 deletions
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index c8ae6ab31651..1b9b18e5b03c 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -760,6 +760,22 @@ struct bpf_prog *bpf_prog_ksym_find(unsigned long addr)
NULL;
}
+bool bpf_has_frame_pointer(unsigned long ip)
+{
+ struct bpf_ksym *ksym;
+ unsigned long offset;
+
+ guard(rcu)();
+
+ ksym = bpf_ksym_find(ip);
+ if (!ksym || !ksym->fp_start || !ksym->fp_end)
+ return false;
+
+ offset = ip - ksym->start;
+
+ return offset >= ksym->fp_start && offset < ksym->fp_end;
+}
+
const struct exception_table_entry *search_bpf_extables(unsigned long addr)
{
const struct exception_table_entry *e = NULL;
diff --git a/kernel/bpf/dmabuf_iter.c b/kernel/bpf/dmabuf_iter.c
index 4dd7ef7c145c..cd500248abd9 100644
--- a/kernel/bpf/dmabuf_iter.c
+++ b/kernel/bpf/dmabuf_iter.c
@@ -6,10 +6,33 @@
#include <linux/kernel.h>
#include <linux/seq_file.h>
+struct dmabuf_iter_priv {
+ /*
+ * If this pointer is non-NULL, the buffer's refcount is elevated to
+ * prevent destruction between stop/start. If reading is not resumed and
+ * start is never called again, then dmabuf_iter_seq_fini drops the
+ * reference when the iterator is released.
+ */
+ struct dma_buf *dmabuf;
+};
+
static void *dmabuf_iter_seq_start(struct seq_file *seq, loff_t *pos)
{
- if (*pos)
- return NULL;
+ struct dmabuf_iter_priv *p = seq->private;
+
+ if (*pos) {
+ struct dma_buf *dmabuf = p->dmabuf;
+
+ if (!dmabuf)
+ return NULL;
+
+ /*
+ * Always resume from where we stopped, regardless of the value
+ * of pos.
+ */
+ p->dmabuf = NULL;
+ return dmabuf;
+ }
return dma_buf_iter_begin();
}
@@ -54,8 +77,11 @@ static void dmabuf_iter_seq_stop(struct seq_file *seq, void *v)
{
struct dma_buf *dmabuf = v;
- if (dmabuf)
- dma_buf_put(dmabuf);
+ if (dmabuf) {
+ struct dmabuf_iter_priv *p = seq->private;
+
+ p->dmabuf = dmabuf;
+ }
}
static const struct seq_operations dmabuf_iter_seq_ops = {
@@ -71,11 +97,27 @@ static void bpf_iter_dmabuf_show_fdinfo(const struct bpf_iter_aux_info *aux,
seq_puts(seq, "dmabuf iter\n");
}
+static int dmabuf_iter_seq_init(void *priv, struct bpf_iter_aux_info *aux)
+{
+ struct dmabuf_iter_priv *p = (struct dmabuf_iter_priv *)priv;
+
+ p->dmabuf = NULL;
+ return 0;
+}
+
+static void dmabuf_iter_seq_fini(void *priv)
+{
+ struct dmabuf_iter_priv *p = (struct dmabuf_iter_priv *)priv;
+
+ if (p->dmabuf)
+ dma_buf_put(p->dmabuf);
+}
+
static const struct bpf_iter_seq_info dmabuf_iter_seq_info = {
.seq_ops = &dmabuf_iter_seq_ops,
- .init_seq_private = NULL,
- .fini_seq_private = NULL,
- .seq_priv_size = 0,
+ .init_seq_private = dmabuf_iter_seq_init,
+ .fini_seq_private = dmabuf_iter_seq_fini,
+ .seq_priv_size = sizeof(struct dmabuf_iter_priv),
};
static struct bpf_iter_reg bpf_dmabuf_reg_info = {
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index a198e40c799b..150e5871e66f 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -71,7 +71,6 @@ __bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu)
{
struct llist_head *lhead;
struct css_rstat_cpu *rstatc;
- struct css_rstat_cpu __percpu *rstatc_pcpu;
struct llist_node *self;
/*
@@ -104,18 +103,22 @@ __bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu)
/*
* This function can be renentered by irqs and nmis for the same cgroup
* and may try to insert the same per-cpu lnode into the llist. Note
- * that llist_add() does not protect against such scenarios.
+ * that llist_add() does not protect against such scenarios. In addition
+ * this same per-cpu lnode can be modified through init_llist_node()
+ * from css_rstat_flush() running on a different CPU.
*
* To protect against such stacked contexts of irqs/nmis, we use the
* fact that lnode points to itself when not on a list and then use
- * this_cpu_cmpxchg() to atomically set to NULL to select the winner
+ * try_cmpxchg() to atomically set to NULL to select the winner
* which will call llist_add(). The losers can assume the insertion is
* successful and the winner will eventually add the per-cpu lnode to
* the llist.
+ *
+ * Please note that we can not use this_cpu_cmpxchg() here as on some
+ * archs it is not safe against modifications from multiple CPUs.
*/
self = &rstatc->lnode;
- rstatc_pcpu = css->rstat_cpu;
- if (this_cpu_cmpxchg(rstatc_pcpu->lnode.next, self, NULL) != self)
+ if (!try_cmpxchg(&rstatc->lnode.next, &self, NULL))
return;
lhead = ss_lhead_cpu(css->ss, cpu);
diff --git a/kernel/power/em_netlink_autogen.c b/kernel/power/em_netlink_autogen.c
index a7a09ab1d1c2..ceb3b2bb6ebe 100644
--- a/kernel/power/em_netlink_autogen.c
+++ b/kernel/power/em_netlink_autogen.c
@@ -2,6 +2,7 @@
/* Do not edit directly, auto-generated from: */
/* Documentation/netlink/specs/em.yaml */
/* YNL-GEN kernel source */
+/* To regenerate run: tools/net/ynl/ynl-regen.sh */
#include <net/netlink.h>
#include <net/genetlink.h>
diff --git a/kernel/power/em_netlink_autogen.h b/kernel/power/em_netlink_autogen.h
index 78ce609641f1..140ab548103c 100644
--- a/kernel/power/em_netlink_autogen.h
+++ b/kernel/power/em_netlink_autogen.h
@@ -2,6 +2,7 @@
/* Do not edit directly, auto-generated from: */
/* Documentation/netlink/specs/em.yaml */
/* YNL-GEN kernel header */
+/* To regenerate run: tools/net/ynl/ynl-regen.sh */
#ifndef _LINUX_EM_GEN_H
#define _LINUX_EM_GEN_H
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 05f5a49e9649..94164f2dec6d 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -41,6 +41,13 @@ static bool scx_init_task_enabled;
static bool scx_switching_all;
DEFINE_STATIC_KEY_FALSE(__scx_switched_all);
+/*
+ * Tracks whether scx_enable() called scx_bypass(true). Used to balance bypass
+ * depth on enable failure. Will be removed when bypass depth is moved into the
+ * sched instance.
+ */
+static bool scx_bypassed_for_enable;
+
static atomic_long_t scx_nr_rejected = ATOMIC_LONG_INIT(0);
static atomic_long_t scx_hotplug_seq = ATOMIC_LONG_INIT(0);
@@ -975,6 +982,30 @@ static void refill_task_slice_dfl(struct scx_sched *sch, struct task_struct *p)
__scx_add_event(sch, SCX_EV_REFILL_SLICE_DFL, 1);
}
+static void local_dsq_post_enq(struct scx_dispatch_q *dsq, struct task_struct *p,
+ u64 enq_flags)
+{
+ struct rq *rq = container_of(dsq, struct rq, scx.local_dsq);
+ bool preempt = false;
+
+ /*
+ * If @rq is in balance, the CPU is already vacant and looking for the
+ * next task to run. No need to preempt or trigger resched after moving
+ * @p into its local DSQ.
+ */
+ if (rq->scx.flags & SCX_RQ_IN_BALANCE)
+ return;
+
+ if ((enq_flags & SCX_ENQ_PREEMPT) && p != rq->curr &&
+ rq->curr->sched_class == &ext_sched_class) {
+ rq->curr->scx.slice = 0;
+ preempt = true;
+ }
+
+ if (preempt || sched_class_above(&ext_sched_class, rq->curr->sched_class))
+ resched_curr(rq);
+}
+
static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq,
struct task_struct *p, u64 enq_flags)
{
@@ -1086,22 +1117,10 @@ static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq,
if (enq_flags & SCX_ENQ_CLEAR_OPSS)
atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE);
- if (is_local) {
- struct rq *rq = container_of(dsq, struct rq, scx.local_dsq);
- bool preempt = false;
-
- if ((enq_flags & SCX_ENQ_PREEMPT) && p != rq->curr &&
- rq->curr->sched_class == &ext_sched_class) {
- rq->curr->scx.slice = 0;
- preempt = true;
- }
-
- if (preempt || sched_class_above(&ext_sched_class,
- rq->curr->sched_class))
- resched_curr(rq);
- } else {
+ if (is_local)
+ local_dsq_post_enq(dsq, p, enq_flags);
+ else
raw_spin_unlock(&dsq->lock);
- }
}
static void task_unlink_from_dsq(struct task_struct *p,
@@ -1625,6 +1644,8 @@ static void move_local_task_to_local_dsq(struct task_struct *p, u64 enq_flags,
dsq_mod_nr(dst_dsq, 1);
p->scx.dsq = dst_dsq;
+
+ local_dsq_post_enq(dst_dsq, p, enq_flags);
}
/**
@@ -2402,7 +2423,7 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p,
* ops.enqueue() that @p is the only one available for this cpu,
* which should trigger an explicit follow-up scheduling event.
*/
- if (sched_class_above(&ext_sched_class, next->sched_class)) {
+ if (next && sched_class_above(&ext_sched_class, next->sched_class)) {
WARN_ON_ONCE(!(sch->ops.flags & SCX_OPS_ENQ_LAST));
do_enqueue_task(rq, p, SCX_ENQ_LAST, -1);
} else {
@@ -2425,7 +2446,7 @@ static struct task_struct *
do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx)
{
struct task_struct *prev = rq->curr;
- bool keep_prev, kick_idle = false;
+ bool keep_prev;
struct task_struct *p;
/* see kick_cpus_irq_workfn() */
@@ -2467,12 +2488,8 @@ do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx)
refill_task_slice_dfl(rcu_dereference_sched(scx_root), p);
} else {
p = first_local_task(rq);
- if (!p) {
- if (kick_idle)
- scx_kick_cpu(rcu_dereference_sched(scx_root),
- cpu_of(rq), SCX_KICK_IDLE);
+ if (!p)
return NULL;
- }
if (unlikely(!p->scx.slice)) {
struct scx_sched *sch = rcu_dereference_sched(scx_root);
@@ -3575,7 +3592,7 @@ static void scx_sched_free_rcu_work(struct work_struct *work)
int node;
irq_work_sync(&sch->error_irq_work);
- kthread_stop(sch->helper->task);
+ kthread_destroy_worker(sch->helper);
free_percpu(sch->pcpu);
@@ -4318,6 +4335,11 @@ static void scx_disable_workfn(struct kthread_work *work)
scx_dsp_max_batch = 0;
free_kick_syncs();
+ if (scx_bypassed_for_enable) {
+ scx_bypassed_for_enable = false;
+ scx_bypass(false);
+ }
+
mutex_unlock(&scx_enable_mutex);
WARN_ON_ONCE(scx_set_enable_state(SCX_DISABLED) != SCX_DISABLING);
@@ -4786,7 +4808,7 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops)
return sch;
err_stop_helper:
- kthread_stop(sch->helper->task);
+ kthread_destroy_worker(sch->helper);
err_free_pcpu:
free_percpu(sch->pcpu);
err_free_gdsqs:
@@ -4970,6 +4992,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
* Init in bypass mode to guarantee forward progress.
*/
scx_bypass(true);
+ scx_bypassed_for_enable = true;
for (i = SCX_OPI_NORMAL_BEGIN; i < SCX_OPI_NORMAL_END; i++)
if (((void (**)(void))ops)[i])
@@ -5067,6 +5090,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
scx_task_iter_stop(&sti);
percpu_up_write(&scx_fork_rwsem);
+ scx_bypassed_for_enable = false;
scx_bypass(false);
if (!scx_tryset_enable_state(SCX_ENABLED, SCX_ENABLING)) {
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index d57727abaade..fe28d86f7c35 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -965,7 +965,7 @@ static const struct bpf_func_proto bpf_d_path_proto = {
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_BTF_ID,
.arg1_btf_id = &bpf_d_path_btf_ids[0],
- .arg2_type = ARG_PTR_TO_MEM,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_WRITE,
.arg3_type = ARG_CONST_SIZE_OR_ZERO,
.allowed = bpf_d_path_allowed,
};