From 4f167fb491725ca0be9df0d76b4b2dd862cdfe0b Mon Sep 17 00:00:00 2001
From: "Zhang, Yanmin" <yanmin.zhang@intel.com>
Date: Mon, 16 May 2005 21:53:43 -0700
Subject: [PATCH] spurious interrupt fix

On my IA64 machine, after kernel 2.6.12-rc3 boots, an edge-triggered
interrupt (IRQ 46) keeps triggered over and over again.  There is no IRQ 46
interrupt action handler.  It has lots of impact on performance.

Kernel 2.6.10 and its prior versions have no the problem.  Basically,
kernel 2.6.10 will mask the spurious edge interrupt if the interrupt is
triggered for the second time and its status includes
IRQ_DISABLE|IRQ_PENDING.

Originally, IA64 kernel has its own specific _irq_desc definitions in file
arch/ia64/kernel/irq.c.  The definition initiates _irq_desc[irq].status to
IRQ_DISABLE.  Since kernel 2.6.11, it was moved to architecture independent
codes, i.e.  kernel/irq/handle.c, but kernel/irq/handle.c initiates
_irq_desc[irq].status to 0 instead of IRQ_DISABLE.

Signed-off-by: Zhang Yanmin <yanmin.zhang@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/irq/handle.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 2fb0e46e11f3..06b5a6323998 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -30,6 +30,7 @@
  */
 irq_desc_t irq_desc[NR_IRQS] __cacheline_aligned = {
 	[0 ... NR_IRQS-1] = {
+		.status = IRQ_DISABLED,
 		.handler = &no_irq_type,
 		.lock = SPIN_LOCK_UNLOCKED
 	}
-- 
cgit 


From 3c0547ba8b3bbd8b26ae35e33ac17ff51f67f78c Mon Sep 17 00:00:00 2001
From: Matt Mackall <mpm@selenic.com>
Date: Mon, 16 May 2005 21:53:47 -0700
Subject: [PATCH] add_preferred_console() build fix

Move add_preferred_console out of CONFIG_PRINTK so serial console does the
right thing.

Signed-off-by: Matt Mackall <mpm@selenic.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/printk.c | 72 ++++++++++++++++++++++++++++-----------------------------
 1 file changed, 36 insertions(+), 36 deletions(-)

(limited to 'kernel')

diff --git a/kernel/printk.c b/kernel/printk.c
index 290a07ce2c8a..01b58d7d17ff 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -160,42 +160,6 @@ static int __init console_setup(char *str)
 
 __setup("console=", console_setup);
 
-/**
- * add_preferred_console - add a device to the list of preferred consoles.
- *
- * The last preferred console added will be used for kernel messages
- * and stdin/out/err for init.  Normally this is used by console_setup
- * above to handle user-supplied console arguments; however it can also
- * be used by arch-specific code either to override the user or more
- * commonly to provide a default console (ie from PROM variables) when
- * the user has not supplied one.
- */
-int __init add_preferred_console(char *name, int idx, char *options)
-{
-	struct console_cmdline *c;
-	int i;
-
-	/*
-	 *	See if this tty is not yet registered, and
-	 *	if we have a slot free.
-	 */
-	for(i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++)
-		if (strcmp(console_cmdline[i].name, name) == 0 &&
-			  console_cmdline[i].index == idx) {
-				selected_console = i;
-				return 0;
-		}
-	if (i == MAX_CMDLINECONSOLES)
-		return -E2BIG;
-	selected_console = i;
-	c = &console_cmdline[i];
-	memcpy(c->name, name, sizeof(c->name));
-	c->name[sizeof(c->name) - 1] = 0;
-	c->options = options;
-	c->index = idx;
-	return 0;
-}
-
 static int __init log_buf_len_setup(char *str)
 {
 	unsigned long size = memparse(str, &str);
@@ -670,6 +634,42 @@ static void call_console_drivers(unsigned long start, unsigned long end) {}
 
 #endif
 
+/**
+ * add_preferred_console - add a device to the list of preferred consoles.
+ *
+ * The last preferred console added will be used for kernel messages
+ * and stdin/out/err for init.  Normally this is used by console_setup
+ * above to handle user-supplied console arguments; however it can also
+ * be used by arch-specific code either to override the user or more
+ * commonly to provide a default console (ie from PROM variables) when
+ * the user has not supplied one.
+ */
+int __init add_preferred_console(char *name, int idx, char *options)
+{
+	struct console_cmdline *c;
+	int i;
+
+	/*
+	 *	See if this tty is not yet registered, and
+	 *	if we have a slot free.
+	 */
+	for(i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++)
+		if (strcmp(console_cmdline[i].name, name) == 0 &&
+			  console_cmdline[i].index == idx) {
+				selected_console = i;
+				return 0;
+		}
+	if (i == MAX_CMDLINECONSOLES)
+		return -E2BIG;
+	selected_console = i;
+	c = &console_cmdline[i];
+	memcpy(c->name, name, sizeof(c->name));
+	c->name[sizeof(c->name) - 1] = 0;
+	c->options = options;
+	c->index = idx;
+	return 0;
+}
+
 /**
  * acquire_console_sem - lock the console system for exclusive use.
  *
-- 
cgit 


From dfaa9c94b13071c9b5f8578d0ae99acc76c60139 Mon Sep 17 00:00:00 2001
From: William Lee Irwin III <wli@holomorphy.com>
Date: Mon, 16 May 2005 21:53:58 -0700
Subject: [PATCH] profile.c: `schedule' parsing fix

profile=schedule parsing is not quite what it should be.  First, str[7] is
'e', not ',', but then even if it did fall through, prof_on =
SCHED_PROFILING would be clobbered inside if (get_option(...)) So a small
amount of rearrangement is done in this patch to correct it.

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/profile.c | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/profile.c b/kernel/profile.c
index 0221a50ca867..ad8cbb75ffa2 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -49,15 +49,19 @@ static DECLARE_MUTEX(profile_flip_mutex);
 
 static int __init profile_setup(char * str)
 {
+	static char __initdata schedstr[] = "schedule";
 	int par;
 
-	if (!strncmp(str, "schedule", 8)) {
+	if (!strncmp(str, schedstr, strlen(schedstr))) {
 		prof_on = SCHED_PROFILING;
-		printk(KERN_INFO "kernel schedule profiling enabled\n");
-		if (str[7] == ',')
-			str += 8;
-	}
-	if (get_option(&str,&par)) {
+		if (str[strlen(schedstr)] == ',')
+			str += strlen(schedstr) + 1;
+		if (get_option(&str, &par))
+			prof_shift = par;
+		printk(KERN_INFO
+			"kernel schedule profiling enabled (shift: %ld)\n",
+			prof_shift);
+	} else if (get_option(&str, &par)) {
 		prof_shift = par;
 		prof_on = CPU_PROFILING;
 		printk(KERN_INFO "kernel profiling enabled (shift: %ld)\n",
-- 
cgit 


From 82428b62aa6294ea640c7e920a9224ecaf46db65 Mon Sep 17 00:00:00 2001
From: David Brownell <david-b@pacbell.net>
Date: Mon, 9 May 2005 08:07:00 -0700
Subject: [PATCH] Driver Core: pm diagnostics update, check for errors

This patch includes various tweaks in the messaging that appears during
system pm state transitions:

  * Warn about certain illegal calls in the device tree, like resuming
    child before parent or suspending parent before child.  This could
    happen easily enough through sysfs, or in some cases when drivers
    use device_pm_set_parent().

  * Be more consistent about dev_dbg() tracing ... do it for resume() and
    shutdown() too, and never if the driver doesn't have that method.

  * Say which type of system sleep state is being entered.

Except for the warnings, these only affect debug messaging.

Signed-off-by: David Brownell <dbrownell@users.sourceforge.net>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 kernel/power/main.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/main.c b/kernel/power/main.c
index 7960ddf04a57..4cdebc972ff2 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -156,14 +156,14 @@ static int enter_state(suspend_state_t state)
 		goto Unlock;
 	}
 
-	pr_debug("PM: Preparing system for suspend\n");
+	pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]);
 	if ((error = suspend_prepare(state)))
 		goto Unlock;
 
-	pr_debug("PM: Entering state.\n");
+	pr_debug("PM: Entering %s sleep\n", pm_states[state]);
 	error = suspend_enter(state);
 
-	pr_debug("PM: Finishing up.\n");
+	pr_debug("PM: Finishing wakeup.\n");
 	suspend_finish(state);
  Unlock:
 	up(&pm_sem);
-- 
cgit 


From b39c4fab259b216148e705344a892c96efe1946d Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Fri, 20 May 2005 13:59:15 -0700
Subject: [PATCH] cpusets+hotplug+preepmt broken

This patch removes the entwining of cpusets and hotplug code in the "No
more Mr.  Nice Guy" case of sched.c move_task_off_dead_cpu().

Since the hotplug code is holding a spinlock at this point, we cannot take
the cpuset semaphore, cpuset_sem, as would seem to be required either to
update the tasks cpuset, or to scan up the nested cpuset chain, looking for
the nearest cpuset ancestor that still has some CPUs that are online.  So
we just punt and blast the tasks cpus_allowed with all bits allowed.

This reverts these lines of code to what they were before the cpuset patch.
 And it updates the cpuset Doc file, to match.

The one known alternative to this that seems to work came from Dinakar
Guniguntala, and required the hotplug code to take the cpuset_sem semaphore
much earlier in its processing.  So far as we know, the increased locking
entanglement between cpusets and hot plug of this alternative approach is
not worth doing in this case.

Signed-off-by: Paul Jackson <pj@sgi.com>
Acked-by: Nathan Lynch <ntl@pobox.com>
Acked-by: Dinakar Guniguntala <dino@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 0dc3158667a2..66b2ed784822 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4243,7 +4243,7 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk)
 
 	/* No more Mr. Nice Guy. */
 	if (dest_cpu == NR_CPUS) {
-		tsk->cpus_allowed = cpuset_cpus_allowed(tsk);
+		cpus_setall(tsk->cpus_allowed);
 		dest_cpu = any_online_cpu(tsk->cpus_allowed);
 
 		/*
-- 
cgit 


From 10f02d1c59e55f529140dda3a92f0099d748451c Mon Sep 17 00:00:00 2001
From: Samuel Thibault <samuel.thibault@labri.fr>
Date: Sat, 21 May 2005 17:50:15 +0200
Subject: [PATCH] spin_unlock_bh() and preempt_check_resched()

In _spin_unlock_bh(lock):
	do { \
		_raw_spin_unlock(lock); \
		preempt_enable(); \
		local_bh_enable(); \
		__release(lock); \
	} while (0)

there is no reason for using preempt_enable() instead of a simple
preempt_enable_no_resched()

Since we know bottom halves are disabled, preempt_schedule() will always
return at once (preempt_count!=0), and hence preempt_check_resched() is
useless here...

This fixes it by using "preempt_enable_no_resched()" instead of the
"preempt_enable()", and thus avoids the useless preempt_check_resched()
just before re-enabling bottom halves.

Signed-off-by: Samuel Thibault <samuel.thibault@ens-lyon.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/spinlock.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index e15ed17863f1..0c3f9d8bbe17 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -294,7 +294,7 @@ EXPORT_SYMBOL(_spin_unlock_irq);
 void __lockfunc _spin_unlock_bh(spinlock_t *lock)
 {
 	_raw_spin_unlock(lock);
-	preempt_enable();
+	preempt_enable_no_resched();
 	local_bh_enable();
 }
 EXPORT_SYMBOL(_spin_unlock_bh);
@@ -318,7 +318,7 @@ EXPORT_SYMBOL(_read_unlock_irq);
 void __lockfunc _read_unlock_bh(rwlock_t *lock)
 {
 	_raw_read_unlock(lock);
-	preempt_enable();
+	preempt_enable_no_resched();
 	local_bh_enable();
 }
 EXPORT_SYMBOL(_read_unlock_bh);
@@ -342,7 +342,7 @@ EXPORT_SYMBOL(_write_unlock_irq);
 void __lockfunc _write_unlock_bh(rwlock_t *lock)
 {
 	_raw_write_unlock(lock);
-	preempt_enable();
+	preempt_enable_no_resched();
 	local_bh_enable();
 }
 EXPORT_SYMBOL(_write_unlock_bh);
@@ -354,7 +354,7 @@ int __lockfunc _spin_trylock_bh(spinlock_t *lock)
 	if (_raw_spin_trylock(lock))
 		return 1;
 
-	preempt_enable();
+	preempt_enable_no_resched();
 	local_bh_enable();
 	return 0;
 }
-- 
cgit 


From c33880aaddbbab1ccf36f4457ed1090621f2e39a Mon Sep 17 00:00:00 2001
From: Kirill Korotaev <dev@sw.ru>
Date: Tue, 24 May 2005 19:29:47 -0700
Subject: [PATCH] sigkill priority fix

If SIGKILL does not have priority, we cannot instantly kill task before it
makes some unexpected job.  It can be critical, but we were unable to
reproduce this easily until Heiko Carstens <Heiko.Carstens@de.ibm.com>
reported this problem on LKML.

Signed-Off-By: Kirill Korotaev <dev@sw.ru>
Signed-Off-By: Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/signal.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index 8f3debc77c5b..b3c24c732c5a 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -522,7 +522,16 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
 {
 	int sig = 0;
 
-	sig = next_signal(pending, mask);
+	/* SIGKILL must have priority, otherwise it is quite easy
+	 * to create an unkillable process, sending sig < SIGKILL
+	 * to self */
+	if (unlikely(sigismember(&pending->signal, SIGKILL))) {
+		if (!sigismember(mask, SIGKILL))
+			sig = SIGKILL;
+	}
+
+	if (likely(!sig))
+		sig = next_signal(pending, mask);
 	if (sig) {
 		if (current->notifier) {
 			if (sigismember(current->notifier_mask, sig)) {
-- 
cgit 


From 2efe86b809d97debaaf9fcc13b041aedf15bd3d2 Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Fri, 27 May 2005 02:02:43 -0700
Subject: [PATCH] cpuset exit NULL dereference fix

There is a race in the kernel cpuset code, between the code
to handle notify_on_release, and the code to remove a cpuset.
The notify_on_release code can end up trying to access a
cpuset that has been removed.  In the most common case, this
causes a NULL pointer dereference from the routine cpuset_path.
However all manner of bad things are possible, in theory at least.

The existing code decrements the cpuset use count, and if the
count goes to zero, processes the notify_on_release request,
if appropriate.  However, once the count goes to zero, unless we
are holding the global cpuset_sem semaphore, there is nothing to
stop another task from immediately removing the cpuset entirely,
and recycling its memory.

The obvious fix would be to always hold the cpuset_sem
semaphore while decrementing the use count and dealing with
notify_on_release.  However we don't want to force a global
semaphore into the mainline task exit path, as that might create
a scaling problem.

The actual fix is almost as easy - since this is only an issue
for cpusets using notify_on_release, which the top level big
cpusets don't normally need to use, only take the cpuset_sem
for cpusets using notify_on_release.

This code has been run for hours without a hiccup, while running
a cpuset create/destroy stress test that could crash the existing
kernel in seconds.  This patch applies to the current -linus
git kernel.

Signed-off-by: Paul Jackson <pj@sgi.com>
Acked-by: Simon Derr <simon.derr@bull.net>
Acked-by: Dinakar Guniguntala <dino@in.ibm.com>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/cpuset.c | 24 +++++++++++++++++++-----
 1 file changed, 19 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 961d74044deb..00e8f2575512 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -166,9 +166,8 @@ static struct super_block *cpuset_sb = NULL;
  * The hooks from fork and exit, cpuset_fork() and cpuset_exit(), don't
  * (usually) grab cpuset_sem.  These are the two most performance
  * critical pieces of code here.  The exception occurs on exit(),
- * if the last task using a cpuset exits, and the cpuset was marked
- * notify_on_release.  In that case, the cpuset_sem is taken, the
- * path to the released cpuset calculated, and a usermode call made
+ * when a task in a notify_on_release cpuset exits.  Then cpuset_sem
+ * is taken, and if the cpuset count is zero, a usermode call made
  * to /sbin/cpuset_release_agent with the name of the cpuset (path
  * relative to the root of cpuset file system) as the argument.
  *
@@ -1404,6 +1403,18 @@ void cpuset_fork(struct task_struct *tsk)
  *
  * Description: Detach cpuset from @tsk and release it.
  *
+ * Note that cpusets marked notify_on_release force every task
+ * in them to take the global cpuset_sem semaphore when exiting.
+ * This could impact scaling on very large systems.  Be reluctant
+ * to use notify_on_release cpusets where very high task exit
+ * scaling is required on large systems.
+ *
+ * Don't even think about derefencing 'cs' after the cpuset use
+ * count goes to zero, except inside a critical section guarded
+ * by the cpuset_sem semaphore.  If you don't hold cpuset_sem,
+ * then a zero cpuset use count is a license to any other task to
+ * nuke the cpuset immediately.
+ *
  **/
 
 void cpuset_exit(struct task_struct *tsk)
@@ -1415,10 +1426,13 @@ void cpuset_exit(struct task_struct *tsk)
 	tsk->cpuset = NULL;
 	task_unlock(tsk);
 
-	if (atomic_dec_and_test(&cs->count)) {
+	if (notify_on_release(cs)) {
 		down(&cpuset_sem);
-		check_for_release(cs);
+		if (atomic_dec_and_test(&cs->count))
+			check_for_release(cs);
 		up(&cpuset_sem);
+	} else {
+		atomic_dec(&cs->count);
 	}
 }
 
-- 
cgit