path: root/kernel/cgroup/cpuset.c
Diffstat (limited to 'kernel/cgroup/cpuset.c')
-rw-r--r--  kernel/cgroup/cpuset.c  136
1 file changed, 105 insertions, 31 deletions
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 306b60430091..3bc4301466f3 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -192,6 +192,20 @@ static inline void notify_partition_change(struct cpuset *cs, int old_prs)
WRITE_ONCE(cs->prs_err, PERR_NONE);
}
+/*
+ * The top_cpuset is always synchronized to cpu_active_mask and we should avoid
+ * using cpu_online_mask as much as possible. An active CPU is always an online
+ * CPU, but not vice versa. cpu_active_mask and cpu_online_mask can differ
+ * during hotplug operations. A CPU is marked active at the last stage of CPU
+ * bringup (CPUHP_AP_ACTIVE). It is also the stage where cpuset hotplug code
+ * will be called to update the sched domains so that the scheduler can move
+ * a normal task to a newly active CPU or remove tasks away from a newly
+ * inactivated CPU. The online bit is set much earlier in the CPU bringup
+ * process and cleared much later in CPU teardown.
+ *
+ * If cpu_online_mask is used while a hotunplug operation is happening in
+ * parallel, we may leave an offline CPU in cpus_allowed or some other masks.
+ */
static struct cpuset top_cpuset = {
.flags = BIT(CS_ONLINE) | BIT(CS_CPU_EXCLUSIVE) |
BIT(CS_MEM_EXCLUSIVE) | BIT(CS_SCHED_LOAD_BALANCE),
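The ordering described in the new comment can be illustrated with a small userspace model. The 8-CPU machine and the choice of CPU 3 as the one being unplugged are hypothetical; the plain bitmasks merely stand in for the kernel's cpumasks.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/*
	 * Hypothetical 8-CPU machine part-way through unplugging CPU 3:
	 * the active bit is cleared early in teardown, the online bit
	 * only much later.
	 */
	uint8_t possible = 0xff;
	uint8_t online   = 0xff;		/* CPU 3 still looks online */
	uint8_t active   = 0xff & ~(1u << 3);	/* CPU 3 already inactive   */

	/* Intersecting with the online mask can still hand out CPU 3 ... */
	printf("possible & online = 0x%02x\n", possible & online);
	/* ... while the active mask already excludes it. */
	printf("possible & active = 0x%02x\n", possible & active);
	return 0;
}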
@@ -355,18 +369,18 @@ static inline bool partition_is_populated(struct cpuset *cs,
* appropriate cpus.
*
* One way or another, we guarantee to return some non-empty subset
- * of cpu_online_mask.
+ * of cpu_active_mask.
*
* Call with callback_lock or cpuset_mutex held.
*/
-static void guarantee_online_cpus(struct task_struct *tsk,
+static void guarantee_active_cpus(struct task_struct *tsk,
struct cpumask *pmask)
{
const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);
struct cpuset *cs;
- if (WARN_ON(!cpumask_and(pmask, possible_mask, cpu_online_mask)))
- cpumask_copy(pmask, cpu_online_mask);
+ if (WARN_ON(!cpumask_and(pmask, possible_mask, cpu_active_mask)))
+ cpumask_copy(pmask, cpu_active_mask);
rcu_read_lock();
cs = task_cs(tsk);
@@ -1116,9 +1130,11 @@ void cpuset_update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus)
if (top_cs) {
/*
- * Percpu kthreads in top_cpuset are ignored
+ * PF_NO_SETAFFINITY tasks are ignored.
+ * All per cpu kthreads should have PF_NO_SETAFFINITY
+ * flag set, see kthread_set_per_cpu().
*/
- if (kthread_is_per_cpu(task))
+ if (task->flags & PF_NO_SETAFFINITY)
continue;
cpumask_andnot(new_cpus, possible_mask, subpartitions_cpus);
} else {
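A minimal sketch of the relationship the new test relies on; skip_old() and skip_new() are made-up names for illustration, not helpers from the patch. Since kthread_set_per_cpu() expects PF_NO_SETAFFINITY to already be set on the task, every per-cpu kthread satisfies the new test, which additionally covers other tasks whose affinity must not be changed.

/*
 * Illustration only: the old and new filters side by side.
 */
static inline bool skip_old(struct task_struct *t)
{
	return kthread_is_per_cpu(t);		/* per-cpu kthreads only     */
}

static inline bool skip_new(struct task_struct *t)
{
	return t->flags & PF_NO_SETAFFINITY;	/* superset of the old check */
}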
@@ -1388,14 +1404,12 @@ static int compute_effective_exclusive_cpumask(struct cpuset *cs,
if (sibling == cs)
continue;
- if (!cpumask_empty(sibling->exclusive_cpus) &&
- cpumask_intersects(xcpus, sibling->exclusive_cpus)) {
+ if (cpumask_intersects(xcpus, sibling->exclusive_cpus)) {
cpumask_andnot(xcpus, xcpus, sibling->exclusive_cpus);
retval++;
continue;
}
- if (!cpumask_empty(sibling->effective_xcpus) &&
- cpumask_intersects(xcpus, sibling->effective_xcpus)) {
+ if (cpumask_intersects(xcpus, sibling->effective_xcpus)) {
cpumask_andnot(xcpus, xcpus, sibling->effective_xcpus);
retval++;
}
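Dropping the cpumask_empty() pre-checks is safe because an intersection test against an empty mask is already false, so the subsequent andnot would be a no-op anyway. A tiny userspace model of that reasoning, with made-up mask values:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint64_t xcpus   = 0xf0;	/* hypothetical candidate exclusive CPUs */
	uint64_t sibling = 0;		/* sibling that owns no exclusive CPUs   */

	/* No intersection with an empty mask ... */
	assert((xcpus & sibling) == 0);
	/* ... and the andnot that would follow changes nothing, exactly as
	 * if a separate empty check had short-circuited first. */
	assert((xcpus & ~sibling) == xcpus);
	return 0;
}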
@@ -1439,13 +1453,15 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs,
* The requested exclusive_cpus must not be allocated to other
* partitions and it can't use up all the root's effective_cpus.
*
- * Note that if there is any local partition root above it or
- * remote partition root underneath it, its exclusive_cpus must
- * have overlapped with subpartitions_cpus.
+ * The effective_xcpus mask can contain offline CPUs, but there must
+ * be at least one online CPU present before it can be enabled.
+ *
+ * Note that creating a remote partition with any local partition root
+ * above it or remote partition root underneath it is not allowed.
*/
compute_effective_exclusive_cpumask(cs, tmp->new_cpus, NULL);
- if (cpumask_empty(tmp->new_cpus) ||
- cpumask_intersects(tmp->new_cpus, subpartitions_cpus) ||
+ WARN_ON_ONCE(cpumask_intersects(tmp->new_cpus, subpartitions_cpus));
+ if (!cpumask_intersects(tmp->new_cpus, cpu_active_mask) ||
cpumask_subset(top_cpuset.effective_cpus, tmp->new_cpus))
return PERR_INVCPUS;
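Replacing the emptiness test with an intersects test against cpu_active_mask tightens the requirement stated in the comment: a non-empty exclusive set whose CPUs are all inactive no longer qualifies. A small userspace model with hypothetical masks:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint8_t new_cpus = 1u << 5;	/* only CPU 5, assumed offline here */
	uint8_t active   = 0x0f;	/* CPUs 0-3 are the active ones     */

	/* Old test: any CPU at all in the mask. */
	printf("old (non-empty)        : %s\n", new_cpus ? "pass" : "fail");
	/* New test: at least one active CPU in the mask. */
	printf("new (intersects active): %s\n",
	       (new_cpus & active) ? "pass" : "fail");
	return 0;
}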
@@ -1541,6 +1557,7 @@ static void remote_cpus_update(struct cpuset *cs, struct cpumask *xcpus,
* left in the top cpuset.
*/
if (adding) {
+ WARN_ON_ONCE(cpumask_intersects(tmp->addmask, subpartitions_cpus));
if (!capable(CAP_SYS_ADMIN))
cs->prs_err = PERR_ACCESS;
else if (cpumask_intersects(tmp->addmask, subpartitions_cpus) ||
@@ -1650,7 +1667,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
bool nocpu;
lockdep_assert_held(&cpuset_mutex);
- WARN_ON_ONCE(is_remote_partition(cs));
+ WARN_ON_ONCE(is_remote_partition(cs)); /* For local partition only */
/*
* new_prs will only be changed for the partcmd_update and
@@ -1696,7 +1713,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
* exclusive_cpus not set. Sibling conflict should only happen
* if exclusive_cpus isn't set.
*/
- xcpus = tmp->new_cpus;
+ xcpus = tmp->delmask;
if (compute_effective_exclusive_cpumask(cs, xcpus, NULL))
WARN_ON_ONCE(!cpumask_empty(cs->exclusive_cpus));
@@ -1717,9 +1734,20 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
if (nocpu)
return PERR_NOCPUS;
- deleting = cpumask_and(tmp->delmask, xcpus, parent->effective_xcpus);
- if (deleting)
- subparts_delta++;
+ /*
+ * This function will only be called when all the preliminary
+ * checks have passed. At this point, the following condition
+ * should hold.
+ *
+ * (cs->effective_xcpus & cpu_active_mask) ⊆ parent->effective_cpus
+ *
+ * Warn if it is not the case.
+ */
+ cpumask_and(tmp->new_cpus, xcpus, cpu_active_mask);
+ WARN_ON_ONCE(!cpumask_subset(tmp->new_cpus, parent->effective_cpus));
+
+ deleting = true;
+ subparts_delta++;
new_prs = (cmd == partcmd_enable) ? PRS_ROOT : PRS_ISOLATED;
} else if (cmd == partcmd_disable) {
/*
@@ -1774,6 +1802,15 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
parent->effective_xcpus);
}
/*
+ * Any active CPUs being removed must already be present in the
+ * parent's effective CPUs; warn if that is not the case.
+ */
+ if (deleting) {
+ cpumask_and(tmp->new_cpus, tmp->delmask, cpu_active_mask);
+ WARN_ON_ONCE(!cpumask_subset(tmp->new_cpus, parent->effective_cpus));
+ }
+
+ /*
* Make partition invalid if parent's effective_cpus could
* become empty and there are tasks in the parent.
*/
@@ -2263,7 +2300,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
bool force = false;
int old_prs = cs->partition_root_state;
- /* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
+ /* top_cpuset.cpus_allowed tracks cpu_active_mask; it's read-only */
if (cs == &top_cpuset)
return -EACCES;
@@ -3082,7 +3119,7 @@ static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task)
lockdep_assert_held(&cpuset_mutex);
if (cs != &top_cpuset)
- guarantee_online_cpus(task, cpus_attach);
+ guarantee_active_cpus(task, cpus_attach);
else
cpumask_andnot(cpus_attach, task_cpu_possible_mask(task),
subpartitions_cpus);
@@ -3524,11 +3561,7 @@ out_unlock:
* will call rebuild_sched_domains_locked(). That is not needed
* in the default hierarchy where only changes in partition
* will cause repartitioning.
- *
- * If the cpuset has the 'sched.partition' flag enabled, simulate
- * turning 'sched.partition" off.
*/
-
static void cpuset_css_offline(struct cgroup_subsys_state *css)
{
struct cpuset *cs = css_cs(css);
@@ -3546,6 +3579,11 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
cpus_read_unlock();
}
+/*
+ * If a dying cpuset has the 'cpus.partition' enabled, turn it off by
+ * changing it back to member to free its exclusive CPUs back to the pool to
+ * be used by other online cpusets.
+ */
static void cpuset_css_killed(struct cgroup_subsys_state *css)
{
struct cpuset *cs = css_cs(css);
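The rest of the callback's body falls outside this hunk. A minimal sketch of the idea the comment describes, assuming the existing is_partition_valid()/update_prstate() helpers and the usual cpus_read_lock()/cpuset_mutex locking; the actual flow may differ.

	/*
	 * Sketch only: demote a dying partition root back to an ordinary
	 * member so its exclusive CPUs are returned to the parent's pool.
	 */
	cpus_read_lock();
	mutex_lock(&cpuset_mutex);
	if (is_partition_valid(cs))
		update_prstate(cs, PRS_MEMBER);
	mutex_unlock(&cpuset_mutex);
	cpus_read_unlock();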
@@ -4026,7 +4064,7 @@ void __init cpuset_init_smp(void)
*
* Description: Returns the cpumask_var_t cpus_allowed of the cpuset
* attached to the specified @tsk. Guaranteed to return some non-empty
- * subset of cpu_online_mask, even if this means going outside the
+ * subset of cpu_active_mask, even if this means going outside the
* tasks cpuset, except when the task is in the top cpuset.
**/
@@ -4040,7 +4078,7 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
cs = task_cs(tsk);
if (cs != &top_cpuset)
- guarantee_online_cpus(tsk, pmask);
+ guarantee_active_cpus(tsk, pmask);
/*
* Tasks in the top cpuset won't get updates to their cpumasks
* when a hotplug online/offline event happens. So we include all
@@ -4054,7 +4092,7 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
* allowable online cpu left, we fall back to all possible cpus.
*/
cpumask_andnot(pmask, possible_mask, subpartitions_cpus);
- if (!cpumask_intersects(pmask, cpu_online_mask))
+ if (!cpumask_intersects(pmask, cpu_active_mask))
cpumask_copy(pmask, possible_mask);
}
@@ -4164,7 +4202,7 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
}
/*
- * cpuset_node_allowed - Can we allocate on a memory node?
+ * cpuset_current_node_allowed - Can current task allocate on a memory node?
* @node: is this an allowed node?
* @gfp_mask: memory allocation flags
*
@@ -4203,7 +4241,7 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
* GFP_KERNEL - any node in enclosing hardwalled cpuset ok
* GFP_USER - only nodes in current tasks mems allowed ok.
*/
-bool cpuset_node_allowed(int node, gfp_t gfp_mask)
+bool cpuset_current_node_allowed(int node, gfp_t gfp_mask)
{
struct cpuset *cs; /* current cpuset ancestors */
bool allowed; /* is allocation in zone z allowed? */
@@ -4237,6 +4275,42 @@ bool cpuset_node_allowed(int node, gfp_t gfp_mask)
return allowed;
}
+bool cpuset_node_allowed(struct cgroup *cgroup, int nid)
+{
+ struct cgroup_subsys_state *css;
+ struct cpuset *cs;
+ bool allowed;
+
+ /*
+ * In v1, mem_cgroup and cpuset are unlikely to be in the same hierarchy
+ * and mems_allowed is likely to be empty even if we could get to it,
+ * so return true to avoid taking a global lock on the empty check.
+ */
+ if (!cpuset_v2())
+ return true;
+
+ css = cgroup_get_e_css(cgroup, &cpuset_cgrp_subsys);
+ if (!css)
+ return true;
+
+ /*
+ * Normally, accessing effective_mems would require the cpuset_mutex
+ * or callback_lock - but node_isset is atomic and the reference
+ * taken via cgroup_get_e_css is sufficient to protect css.
+ *
+ * Since this interface is intended for use by migration paths, we
+ * relax locking here to avoid taking global locks, while accepting
+ * that the result may be inaccurate in rare scenarios.
+ *
+ * Reclaim and migration are subject to these same race conditions, and
+ * cannot make strong isolation guarantees, so this is acceptable.
+ */
+ cs = container_of(css, struct cpuset, css);
+ allowed = node_isset(nid, cs->effective_mems);
+ css_put(css);
+ return allowed;
+}
+
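A hedged sketch of how a memcg-aware migration path might consume the new interface. The caller name and the way it obtains the cgroup are hypothetical; only the cpuset_node_allowed(cgroup, nid) signature and its fall-back-to-allowed behaviour come from the patch.

/*
 * Hypothetical call site (not part of this patch): skip a migration
 * target node that the owning cgroup's effective cpuset does not permit.
 */
static bool migration_target_allowed(struct cgroup *cgrp, int target_nid)
{
	/*
	 * cpuset_node_allowed() already returns true on cgroup v1 or when
	 * no cpuset css can be derived, so callers need no extra fallback.
	 */
	return cpuset_node_allowed(cgrp, target_nid);
}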
/**
* cpuset_spread_node() - On which node to begin search for a page
* @rotor: round robin rotor