[RFC PATCH] Bug during kexec...not all cpus are stopped

Alok Kataria akataria at vmware.com
Fri Oct 8 16:34:47 EDT 2010


Before starting the new kernel kexec calls machine_shutdown to stop all
the cpus, which internally calls native_smp_send_stop. AFAIU, kexec
expects that all the cpus are now halted after that call returns.
Now, looking at the code for native_smp_send_stop, it assumes that all
the processors have processed the REBOOT ipi in 1 second after the IPI
was sent.
native_smp_send_stop()
---------------------------------------------------------
	apic->send_IPI_allbutself(REBOOT_VECTOR);

        /* Don't wait longer than a second */
        wait = USEC_PER_SEC;
        while (num_online_cpus() > 1 && wait--)
        	udelay(1);
---------------------------------------------------------

It just returns after that 1 second irrespective of whether all cpus
were halted or not. This brings up a issue in the kexec case, since we
can have the BSP starting the new kernel and AP's still processing the
REBOOT IPI simultaneously.

Many distribution kernels use kexec to load the newly installed kernel
during the installation phase, in virtualized environment with the host
heavily overcommitted, we have seen some instances when vcpu fails to
process the IPI in the allotted 1 sec and as a result the AP's end up
accessing uninitialized state (the BSP has already gone ahead with
setting up the new state) and causing GPF's.

IMO, kexec expects machine_shutdown to return only after all cpus are
stopped.

The patch below should fix the issue, comments ??

--
machine_shutdown now takes a parameter "wait", if it is true, it waits
until all the cpus are halted. All the callers except kexec still
fallback to the earlier version of the shutdown call, where it just
waited for max 1 sec before returning.

Signed-off-by: Alok N Kataria <akataria at vmware.com>

Index: linux-2.6/arch/x86/include/asm/reboot.h
===================================================================
--- linux-2.6.orig/arch/x86/include/asm/reboot.h	2010-03-26 16:57:18.000000000 -0700
+++ linux-2.6/arch/x86/include/asm/reboot.h	2010-10-07 16:41:58.000000000 -0700
@@ -9,7 +9,7 @@ struct machine_ops {
 	void (*restart)(char *cmd);
 	void (*halt)(void);
 	void (*power_off)(void);
-	void (*shutdown)(void);
+	void (*shutdown)(int wait);
 	void (*crash_shutdown)(struct pt_regs *);
 	void (*emergency_restart)(void);
 };
@@ -17,7 +17,7 @@ struct machine_ops {
 extern struct machine_ops machine_ops;
 
 void native_machine_crash_shutdown(struct pt_regs *regs);
-void native_machine_shutdown(void);
+void native_machine_shutdown(int wait);
 void machine_real_restart(const unsigned char *code, int length);
 
 typedef void (*nmi_shootdown_cb)(int, struct die_args*);
Index: linux-2.6/arch/x86/include/asm/smp.h
===================================================================
--- linux-2.6.orig/arch/x86/include/asm/smp.h	2010-08-03 12:43:47.000000000 -0700
+++ linux-2.6/arch/x86/include/asm/smp.h	2010-10-07 16:37:41.000000000 -0700
@@ -50,7 +50,7 @@ struct smp_ops {
 	void (*smp_prepare_cpus)(unsigned max_cpus);
 	void (*smp_cpus_done)(unsigned max_cpus);
 
-	void (*smp_send_stop)(void);
+	void (*smp_send_stop)(int wait);
 	void (*smp_send_reschedule)(int cpu);
 
 	int (*cpu_up)(unsigned cpu);
@@ -71,9 +71,9 @@ extern void set_cpu_sibling_map(int cpu)
 #endif
 extern struct smp_ops smp_ops;
 
-static inline void smp_send_stop(void)
+static inline void smp_send_stop(int wait)
 {
-	smp_ops.smp_send_stop();
+	smp_ops.smp_send_stop(wait);
 }
 
 static inline void smp_prepare_boot_cpu(void)
Index: linux-2.6/arch/x86/kernel/kvmclock.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/kvmclock.c	2010-08-03 12:43:47.000000000 -0700
+++ linux-2.6/arch/x86/kernel/kvmclock.c	2010-10-07 16:43:28.000000000 -0700
@@ -174,10 +174,10 @@ static void kvm_crash_shutdown(struct pt
 }
 #endif
 
-static void kvm_shutdown(void)
+static void kvm_shutdown(int wait)
 {
 	native_write_msr(msr_kvm_system_time, 0, 0);
-	native_machine_shutdown();
+	native_machine_shutdown(wait);
 }
 
 void __init kvmclock_init(void)
Index: linux-2.6/arch/x86/kernel/reboot.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/reboot.c	2010-08-03 12:43:47.000000000 -0700
+++ linux-2.6/arch/x86/kernel/reboot.c	2010-10-07 16:45:24.000000000 -0700
@@ -616,7 +616,7 @@ static void native_machine_emergency_res
 	}
 }
 
-void native_machine_shutdown(void)
+void native_machine_shutdown(int wait)
 {
 	/* Stop the cpus and apics */
 #ifdef CONFIG_SMP
@@ -641,7 +641,7 @@ void native_machine_shutdown(void)
 	/* O.K Now that I'm on the appropriate processor,
 	 * stop all of the others.
 	 */
-	smp_send_stop();
+	smp_send_stop(wait);
 #endif
 
 	lapic_shutdown();
@@ -670,14 +670,14 @@ static void native_machine_restart(char 
 	printk("machine restart\n");
 
 	if (!reboot_force)
-		machine_shutdown();
+		machine_shutdown(0);
 	__machine_emergency_restart(0);
 }
 
 static void native_machine_halt(void)
 {
 	/* stop other cpus and apics */
-	machine_shutdown();
+	machine_shutdown(0);
 
 	tboot_shutdown(TB_SHUTDOWN_HALT);
 
@@ -689,7 +689,7 @@ static void native_machine_power_off(voi
 {
 	if (pm_power_off) {
 		if (!reboot_force)
-			machine_shutdown();
+			machine_shutdown(0);
 		pm_power_off();
 	}
 	/* a fallback in case there is no PM info available */
@@ -712,9 +712,9 @@ void machine_power_off(void)
 	machine_ops.power_off();
 }
 
-void machine_shutdown(void)
+void machine_shutdown(int wait)
 {
-	machine_ops.shutdown();
+	machine_ops.shutdown(wait);
 }
 
 void machine_emergency_restart(void)
Index: linux-2.6/arch/x86/kernel/smp.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/smp.c	2010-10-07 16:30:34.000000000 -0700
+++ linux-2.6/arch/x86/kernel/smp.c	2010-10-07 16:34:16.000000000 -0700
@@ -159,10 +159,10 @@ asmlinkage void smp_reboot_interrupt(voi
 	irq_exit();
 }
 
-static void native_smp_send_stop(void)
+static void native_smp_send_stop(int wait)
 {
 	unsigned long flags;
-	unsigned long wait;
+	unsigned long timeout;
 
 	if (reboot_force)
 		return;
@@ -179,9 +179,9 @@ static void native_smp_send_stop(void)
 	if (num_online_cpus() > 1) {
 		apic->send_IPI_allbutself(REBOOT_VECTOR);
 
-		/* Don't wait longer than a second */
-		wait = USEC_PER_SEC;
-		while (num_online_cpus() > 1 && wait--)
+		/* Don't wait longer than a second if this not a synchronous call */
+		timeout = USEC_PER_SEC;
+		while (num_online_cpus() > 1 && (wait || timeout--))
 			udelay(1);
 	}
 
Index: linux-2.6/arch/x86/xen/enlighten.c
===================================================================
--- linux-2.6.orig/arch/x86/xen/enlighten.c	2010-08-30 16:20:50.000000000 -0700
+++ linux-2.6/arch/x86/xen/enlighten.c	2010-10-07 16:49:00.000000000 -0700
@@ -1018,7 +1018,7 @@ static void xen_reboot(int reason)
 	struct sched_shutdown r = { .reason = reason };
 
 #ifdef CONFIG_SMP
-	smp_send_stop();
+	smp_send_stop(0);
 #endif
 
 	if (HYPERVISOR_sched_op(SCHEDOP_shutdown, &r))
Index: linux-2.6/arch/x86/xen/smp.c
===================================================================
--- linux-2.6.orig/arch/x86/xen/smp.c	2010-08-30 16:20:50.000000000 -0700
+++ linux-2.6/arch/x86/xen/smp.c	2010-10-07 16:46:27.000000000 -0700
@@ -400,7 +400,7 @@ static void stop_self(void *v)
 	BUG();
 }
 
-static void xen_smp_send_stop(void)
+static void xen_smp_send_stop(int wait)
 {
 	smp_call_function(stop_self, NULL, 0);
 }
Index: linux-2.6/drivers/s390/char/sclp_quiesce.c
===================================================================
--- linux-2.6.orig/drivers/s390/char/sclp_quiesce.c	2010-08-03 12:45:04.000000000 -0700
+++ linux-2.6/drivers/s390/char/sclp_quiesce.c	2010-10-07 16:49:12.000000000 -0700
@@ -29,7 +29,7 @@ static void do_machine_quiesce(void)
 {
 	psw_t quiesce_psw;
 
-	smp_send_stop();
+	smp_send_stop(0);
 	quiesce_psw.mask = PSW_BASE_BITS | PSW_MASK_WAIT;
 	quiesce_psw.addr = 0xfff;
 	__load_psw(quiesce_psw);
Index: linux-2.6/include/linux/reboot.h
===================================================================
--- linux-2.6.orig/include/linux/reboot.h	2010-08-03 12:46:08.000000000 -0700
+++ linux-2.6/include/linux/reboot.h	2010-10-07 16:44:21.000000000 -0700
@@ -51,7 +51,7 @@ extern void machine_restart(char *cmd);
 extern void machine_halt(void);
 extern void machine_power_off(void);
 
-extern void machine_shutdown(void);
+extern void machine_shutdown(int wait);
 struct pt_regs;
 extern void machine_crash_shutdown(struct pt_regs *);
 
Index: linux-2.6/include/linux/smp.h
===================================================================
--- linux-2.6.orig/include/linux/smp.h	2010-08-03 12:46:08.000000000 -0700
+++ linux-2.6/include/linux/smp.h	2010-10-07 16:49:41.000000000 -0700
@@ -43,7 +43,7 @@ int smp_call_function_single(int cpuid, 
 /*
  * stops all CPUs but the current one:
  */
-extern void smp_send_stop(void);
+extern void smp_send_stop(int wait);
 
 /*
  * sends a 'reschedule' event to another CPU:
@@ -116,7 +116,7 @@ extern unsigned int setup_max_cpus;
 
 #else /* !SMP */
 
-static inline void smp_send_stop(void) { }
+static inline void smp_send_stop(int wait) { }
 
 /*
  *	These macros fold the SMP functionality into a single CPU system
Index: linux-2.6/kernel/kexec.c
===================================================================
--- linux-2.6.orig/kernel/kexec.c	2010-10-07 14:33:57.000000000 -0700
+++ linux-2.6/kernel/kexec.c	2010-10-07 16:45:38.000000000 -0700
@@ -1538,7 +1538,7 @@ int kernel_kexec(void)
 	{
 		kernel_restart_prepare(NULL);
 		printk(KERN_EMERG "Starting new kernel\n");
-		machine_shutdown();
+		machine_shutdown(1);
 	}
 
 	machine_kexec(kexec_image);
Index: linux-2.6/kernel/panic.c
===================================================================
--- linux-2.6.orig/kernel/panic.c	2010-08-30 16:22:03.000000000 -0700
+++ linux-2.6/kernel/panic.c	2010-10-07 16:50:05.000000000 -0700
@@ -94,7 +94,7 @@ NORET_TYPE void panic(const char * fmt, 
 	 * unfortunately means it may not be hardened to work in a panic
 	 * situation.
 	 */
-	smp_send_stop();
+	smp_send_stop(0);
 
 	atomic_notifier_call_chain(&panic_notifier_list, 0, buf);
 





More information about the kexec mailing list