From 013d3868143387be99bb0373d49452eaa3c55d41 Mon Sep 17 00:00:00 2001 From: Martin Andersson Date: Mon, 27 Mar 2006 01:15:18 -0800 Subject: [PATCH] sched: fix task interactivity calculation Is a truncation error in kernel/sched.c triggered when the nice value is negative. The affected code is used in the TASK_INTERACTIVE macro. The code is: #define SCALE(v1,v1_max,v2_max) \ (v1) * (v2_max) / (v1_max) which is used in this way: SCALE(TASK_NICE(p), 40, MAX_BONUS) Comments in the code says: * This part scales the interactivity limit depending on niceness. * * We scale it linearly, offset by the INTERACTIVE_DELTA delta. * Here are a few examples of different nice levels: * * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0] * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0] * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0] * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0] * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0] * * (the X axis represents the possible -5 ... 0 ... +5 dynamic * priority range a task can explore, a value of '1' means the * task is rated interactive.) However, the current code does not scale it linearly and the result differs from the given examples. If the mathematical function "floor" is used when the nice value is negative instead of the truncation one gets when using integer division, the result conforms to the documentation. Output of TASK_INTERACTIVE when using the kernel code: nice dynamic priorities -20 1 1 1 1 1 1 1 1 1 0 0 -19 1 1 1 1 1 1 1 1 0 0 0 -18 1 1 1 1 1 1 1 1 0 0 0 -17 1 1 1 1 1 1 1 1 0 0 0 -16 1 1 1 1 1 1 1 1 0 0 0 -15 1 1 1 1 1 1 1 0 0 0 0 -14 1 1 1 1 1 1 1 0 0 0 0 -13 1 1 1 1 1 1 1 0 0 0 0 -12 1 1 1 1 1 1 1 0 0 0 0 -11 1 1 1 1 1 1 0 0 0 0 0 -10 1 1 1 1 1 1 0 0 0 0 0 -9 1 1 1 1 1 1 0 0 0 0 0 -8 1 1 1 1 1 1 0 0 0 0 0 -7 1 1 1 1 1 0 0 0 0 0 0 -6 1 1 1 1 1 0 0 0 0 0 0 -5 1 1 1 1 1 0 0 0 0 0 0 -4 1 1 1 1 1 0 0 0 0 0 0 -3 1 1 1 1 0 0 0 0 0 0 0 -2 1 1 1 1 0 0 0 0 0 0 0 -1 1 1 1 1 0 0 0 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0 0 1 1 1 1 1 0 0 0 0 0 0 0 2 1 1 1 1 0 0 0 0 0 0 0 3 1 1 1 1 0 0 0 0 0 0 0 4 1 1 1 0 0 0 0 0 0 0 0 5 1 1 1 0 0 0 0 0 0 0 0 6 1 1 1 0 0 0 0 0 0 0 0 7 1 1 1 0 0 0 0 0 0 0 0 8 1 1 0 0 0 0 0 0 0 0 0 9 1 1 0 0 0 0 0 0 0 0 0 10 1 1 0 0 0 0 0 0 0 0 0 11 1 1 0 0 0 0 0 0 0 0 0 12 1 0 0 0 0 0 0 0 0 0 0 13 1 0 0 0 0 0 0 0 0 0 0 14 1 0 0 0 0 0 0 0 0 0 0 15 1 0 0 0 0 0 0 0 0 0 0 16 0 0 0 0 0 0 0 0 0 0 0 17 0 0 0 0 0 0 0 0 0 0 0 18 0 0 0 0 0 0 0 0 0 0 0 19 0 0 0 0 0 0 0 0 0 0 0 Output of TASK_INTERACTIVE when using "floor" nice dynamic priorities -20 1 1 1 1 1 1 1 1 1 0 0 -19 1 1 1 1 1 1 1 1 1 0 0 -18 1 1 1 1 1 1 1 1 1 0 0 -17 1 1 1 1 1 1 1 1 1 0 0 -16 1 1 1 1 1 1 1 1 0 0 0 -15 1 1 1 1 1 1 1 1 0 0 0 -14 1 1 1 1 1 1 1 1 0 0 0 -13 1 1 1 1 1 1 1 1 0 0 0 -12 1 1 1 1 1 1 1 0 0 0 0 -11 1 1 1 1 1 1 1 0 0 0 0 -10 1 1 1 1 1 1 1 0 0 0 0 -9 1 1 1 1 1 1 1 0 0 0 0 -8 1 1 1 1 1 1 0 0 0 0 0 -7 1 1 1 1 1 1 0 0 0 0 0 -6 1 1 1 1 1 1 0 0 0 0 0 -5 1 1 1 1 1 1 0 0 0 0 0 -4 1 1 1 1 1 0 0 0 0 0 0 -3 1 1 1 1 1 0 0 0 0 0 0 -2 1 1 1 1 1 0 0 0 0 0 0 -1 1 1 1 1 1 0 0 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0 0 1 1 1 1 1 0 0 0 0 0 0 0 2 1 1 1 1 0 0 0 0 0 0 0 3 1 1 1 1 0 0 0 0 0 0 0 4 1 1 1 0 0 0 0 0 0 0 0 5 1 1 1 0 0 0 0 0 0 0 0 6 1 1 1 0 0 0 0 0 0 0 0 7 1 1 1 0 0 0 0 0 0 0 0 8 1 1 0 0 0 0 0 0 0 0 0 9 1 1 0 0 0 0 0 0 0 0 0 10 1 1 0 0 0 0 0 0 0 0 0 11 1 1 0 0 0 0 0 0 0 0 0 12 1 0 0 0 0 0 0 0 0 0 0 13 1 0 0 0 0 0 0 0 0 0 0 14 1 0 0 0 0 0 0 0 0 0 0 15 1 0 0 0 0 0 0 0 0 0 0 16 0 0 0 0 0 0 0 0 0 0 0 17 0 0 0 0 0 0 0 0 0 0 0 18 0 0 0 0 0 0 0 0 0 0 0 19 0 0 0 0 0 0 0 0 0 0 0 Signed-off-by: Martin Andersson Acked-by: Ingo Molnar Cc: Nick Piggin Cc: Mike Galbraith Cc: Peter Williams Cc: Con Kolivas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 78acdefeccca..dc599c85a88d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -145,7 +145,8 @@ (v1) * (v2_max) / (v1_max) #define DELTA(p) \ - (SCALE(TASK_NICE(p), 40, MAX_BONUS) + INTERACTIVE_DELTA) + (SCALE(TASK_NICE(p) + 20, 40, MAX_BONUS) - 20 * MAX_BONUS / 40 + \ + INTERACTIVE_DELTA) #define TASK_INTERACTIVE(p) \ ((p)->prio <= (p)->static_prio - DELTA(p)) -- cgit From 77e4bfbcf071f795b54862455dce8902b3fc29c2 Mon Sep 17 00:00:00 2001 From: Andreas Mohr Date: Mon, 27 Mar 2006 01:15:20 -0800 Subject: [PATCH] Small schedule() optimization small schedule() microoptimization. Signed-off-by: Andreas Mohr Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index dc599c85a88d..a96a05d23262 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2879,13 +2879,11 @@ asmlinkage void __sched schedule(void) * schedule() atomically, we ignore that path for now. * Otherwise, whine if we are scheduling when we should not be. */ - if (likely(!current->exit_state)) { - if (unlikely(in_atomic())) { - printk(KERN_ERR "BUG: scheduling while atomic: " - "%s/0x%08x/%d\n", - current->comm, preempt_count(), current->pid); - dump_stack(); - } + if (unlikely(in_atomic() && !current->exit_state)) { + printk(KERN_ERR "BUG: scheduling while atomic: " + "%s/0x%08x/%d\n", + current->comm, preempt_count(), current->pid); + dump_stack(); } profile_hit(SCHED_PROFILING, __builtin_return_address(0)); -- cgit From 1e9f28fa1eb9773bf65bae08288c6a0a38eef4a7 Mon Sep 17 00:00:00 2001 From: "Siddha, Suresh B" Date: Mon, 27 Mar 2006 01:15:22 -0800 Subject: [PATCH] sched: new sched domain for representing multi-core Add a new sched domain for representing multi-core with shared caches between cores. Consider a dual package system, each package containing two cores and with last level cache shared between cores with in a package. If there are two runnable processes, with this appended patch those two processes will be scheduled on different packages. On such systems, with this patch we have observed 8% perf improvement with specJBB(2 warehouse) benchmark and 35% improvement with CFP2000 rate(with 2 users). This new domain will come into play only on multi-core systems with shared caches. On other systems, this sched domain will be removed by domain degeneration code. This new domain can be also used for implementing power savings policy (see OLS 2005 CMP kernel scheduler paper for more details.. I will post another patch for power savings policy soon) Most of the arch/* file changes are for cpu_coregroup_map() implementation. Signed-off-by: Suresh Siddha Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 68 insertions(+), 5 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index a96a05d23262..8a8b71b5751b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5574,11 +5574,31 @@ static int cpu_to_cpu_group(int cpu) } #endif +#ifdef CONFIG_SCHED_MC +static DEFINE_PER_CPU(struct sched_domain, core_domains); +static struct sched_group sched_group_core[NR_CPUS]; +#endif + +#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) +static int cpu_to_core_group(int cpu) +{ + return first_cpu(cpu_sibling_map[cpu]); +} +#elif defined(CONFIG_SCHED_MC) +static int cpu_to_core_group(int cpu) +{ + return cpu; +} +#endif + static DEFINE_PER_CPU(struct sched_domain, phys_domains); static struct sched_group sched_group_phys[NR_CPUS]; static int cpu_to_phys_group(int cpu) { -#ifdef CONFIG_SCHED_SMT +#if defined(CONFIG_SCHED_MC) + cpumask_t mask = cpu_coregroup_map(cpu); + return first_cpu(mask); +#elif defined(CONFIG_SCHED_SMT) return first_cpu(cpu_sibling_map[cpu]); #else return cpu; @@ -5676,6 +5696,17 @@ void build_sched_domains(const cpumask_t *cpu_map) sd->parent = p; sd->groups = &sched_group_phys[group]; +#ifdef CONFIG_SCHED_MC + p = sd; + sd = &per_cpu(core_domains, i); + group = cpu_to_core_group(i); + *sd = SD_MC_INIT; + sd->span = cpu_coregroup_map(i); + cpus_and(sd->span, sd->span, *cpu_map); + sd->parent = p; + sd->groups = &sched_group_core[group]; +#endif + #ifdef CONFIG_SCHED_SMT p = sd; sd = &per_cpu(cpu_domains, i); @@ -5701,6 +5732,19 @@ void build_sched_domains(const cpumask_t *cpu_map) } #endif +#ifdef CONFIG_SCHED_MC + /* Set up multi-core groups */ + for_each_cpu_mask(i, *cpu_map) { + cpumask_t this_core_map = cpu_coregroup_map(i); + cpus_and(this_core_map, this_core_map, *cpu_map); + if (i != first_cpu(this_core_map)) + continue; + init_sched_build_groups(sched_group_core, this_core_map, + &cpu_to_core_group); + } +#endif + + /* Set up physical groups */ for (i = 0; i < MAX_NUMNODES; i++) { cpumask_t nodemask = node_to_cpumask(i); @@ -5797,11 +5841,31 @@ void build_sched_domains(const cpumask_t *cpu_map) power = SCHED_LOAD_SCALE; sd->groups->cpu_power = power; #endif +#ifdef CONFIG_SCHED_MC + sd = &per_cpu(core_domains, i); + power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1) + * SCHED_LOAD_SCALE / 10; + sd->groups->cpu_power = power; + + sd = &per_cpu(phys_domains, i); + /* + * This has to be < 2 * SCHED_LOAD_SCALE + * Lets keep it SCHED_LOAD_SCALE, so that + * while calculating NUMA group's cpu_power + * we can simply do + * numa_group->cpu_power += phys_group->cpu_power; + * + * See "only add power once for each physical pkg" + * comment below + */ + sd->groups->cpu_power = SCHED_LOAD_SCALE; +#else sd = &per_cpu(phys_domains, i); power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * (cpus_weight(sd->groups->cpumask)-1) / 10; sd->groups->cpu_power = power; +#endif #ifdef CONFIG_NUMA sd = &per_cpu(allnodes_domains, i); @@ -5823,7 +5887,6 @@ void build_sched_domains(const cpumask_t *cpu_map) next_sg: for_each_cpu_mask(j, sg->cpumask) { struct sched_domain *sd; - int power; sd = &per_cpu(phys_domains, j); if (j != first_cpu(sd->groups->cpumask)) { @@ -5833,10 +5896,8 @@ next_sg: */ continue; } - power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * - (cpus_weight(sd->groups->cpumask)-1) / 10; - sg->cpu_power += power; + sg->cpu_power += sd->groups->cpu_power; } sg = sg->next; if (sg != sched_group_nodes[i]) @@ -5849,6 +5910,8 @@ next_sg: struct sched_domain *sd; #ifdef CONFIG_SCHED_SMT sd = &per_cpu(cpu_domains, i); +#elif defined(CONFIG_SCHED_MC) + sd = &per_cpu(core_domains, i); #else sd = &per_cpu(phys_domains, i); #endif -- cgit From 0806903316d516a3b3851c51cea5c71724d9051d Mon Sep 17 00:00:00 2001 From: "Siddha, Suresh B" Date: Mon, 27 Mar 2006 01:15:23 -0800 Subject: [PATCH] sched: fix group power for allnodes_domains Current sched groups power calculation for allnodes_domains is wrong. We should really be using cumulative power of the physical packages in that group (similar to the calculation in node_domains) Signed-off-by: Suresh Siddha Cc: Ingo Molnar Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 62 +++++++++++++++++++++++++++------------------------------- 1 file changed, 29 insertions(+), 33 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 8a8b71b5751b..7854ee516b92 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5621,6 +5621,32 @@ static int cpu_to_allnodes_group(int cpu) { return cpu_to_node(cpu); } +static void init_numa_sched_groups_power(struct sched_group *group_head) +{ + struct sched_group *sg = group_head; + int j; + + if (!sg) + return; +next_sg: + for_each_cpu_mask(j, sg->cpumask) { + struct sched_domain *sd; + + sd = &per_cpu(phys_domains, j); + if (j != first_cpu(sd->groups->cpumask)) { + /* + * Only add "power" once for each + * physical package. + */ + continue; + } + + sg->cpu_power += sd->groups->cpu_power; + } + sg = sg->next; + if (sg != group_head) + goto next_sg; +} #endif /* @@ -5866,43 +5892,13 @@ void build_sched_domains(const cpumask_t *cpu_map) (cpus_weight(sd->groups->cpumask)-1) / 10; sd->groups->cpu_power = power; #endif - -#ifdef CONFIG_NUMA - sd = &per_cpu(allnodes_domains, i); - if (sd->groups) { - power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * - (cpus_weight(sd->groups->cpumask)-1) / 10; - sd->groups->cpu_power = power; - } -#endif } #ifdef CONFIG_NUMA - for (i = 0; i < MAX_NUMNODES; i++) { - struct sched_group *sg = sched_group_nodes[i]; - int j; - - if (sg == NULL) - continue; -next_sg: - for_each_cpu_mask(j, sg->cpumask) { - struct sched_domain *sd; + for (i = 0; i < MAX_NUMNODES; i++) + init_numa_sched_groups_power(sched_group_nodes[i]); - sd = &per_cpu(phys_domains, j); - if (j != first_cpu(sd->groups->cpumask)) { - /* - * Only add "power" once for each - * physical package. - */ - continue; - } - - sg->cpu_power += sd->groups->cpu_power; - } - sg = sg->next; - if (sg != sched_group_nodes[i]) - goto next_sg; - } + init_numa_sched_groups_power(sched_group_allnodes); #endif /* Attach the domains */ -- cgit From 0a945022778f100115d0cb6234eb28fc1b15ccaf Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Tue, 28 Mar 2006 01:56:37 -0800 Subject: [PATCH] for_each_possible_cpu: fixes for generic part replaces for_each_cpu with for_each_possible_cpu(). Signed-off-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 7854ee516b92..a9ecac398bb9 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1625,7 +1625,7 @@ unsigned long nr_uninterruptible(void) { unsigned long i, sum = 0; - for_each_cpu(i) + for_each_possible_cpu(i) sum += cpu_rq(i)->nr_uninterruptible; /* @@ -1642,7 +1642,7 @@ unsigned long long nr_context_switches(void) { unsigned long long i, sum = 0; - for_each_cpu(i) + for_each_possible_cpu(i) sum += cpu_rq(i)->nr_switches; return sum; @@ -1652,7 +1652,7 @@ unsigned long nr_iowait(void) { unsigned long i, sum = 0; - for_each_cpu(i) + for_each_possible_cpu(i) sum += atomic_read(&cpu_rq(i)->nr_iowait); return sum; @@ -6080,7 +6080,7 @@ void __init sched_init(void) runqueue_t *rq; int i, j, k; - for_each_cpu(i) { + for_each_possible_cpu(i) { prio_array_t *array; rq = cpu_rq(i); -- cgit