From 23026bf6245baa32efd1f15d356954b4f702371d Mon Sep 17 00:00:00 2001 From: Andrei Warkentin Date: Mon, 13 Dec 2010 18:25:02 -0600 Subject: [PATCH] ARM: Make SMP init more resilient to failures. Ensure the secondary CPU is brought up in a fashion that ensures consistent kernel state on late entry to secondary_start_kernel or timeout. Change-Id: I26cc32678916ffdd628e4f2fb16bbc507df02eac Signed-off-by: Andrei Warkentin --- arch/arm/include/asm/cpu.h | 2 + arch/arm/kernel/smp.c | 76 ++++++++++++++++++++++++++++++++++++++----- 2 files changed, 69 insertions(+), 9 deletions(-) diff --git a/arch/arm/include/asm/cpu.h b/arch/arm/include/asm/cpu.h index 7939681..64ecd79 100644 --- a/arch/arm/include/asm/cpu.h +++ b/arch/arm/include/asm/cpu.h @@ -16,6 +16,8 @@ struct cpuinfo_arm { struct cpu cpu; #ifdef CONFIG_SMP + arch_spinlock_t up_lock; + wait_queue_head_t up; struct task_struct *idle; unsigned int loops_per_jiffy; #endif diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c index 40dc74f..57aae4a 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c @@ -67,6 +67,19 @@ enum ipi_msg_type { IPI_CPU_STOP, }; +static DECLARE_BITMAP(cpu_booting_bits, CONFIG_NR_CPUS); +const struct cpumask *const cpu_booting_mask = to_cpumask(cpu_booting_bits); + +void set_cpu_booting(unsigned int cpu, bool booting) +{ + if (booting) + cpumask_set_cpu(cpu, to_cpumask(cpu_booting_bits)); + else + cpumask_clear_cpu(cpu, to_cpumask(cpu_booting_bits)); +} + +#define cpu_booting(cpu) cpumask_test_cpu((cpu), cpu_booting_mask) + int __cpuinit __cpu_up(unsigned int cpu) { struct cpuinfo_arm *ci = &per_cpu(cpu_data, cpu); @@ -119,25 +132,57 @@ int __cpuinit __cpu_up(unsigned int cpu) /* * Now bring the CPU into our world. */ + set_cpu_booting(cpu, true); + init_waitqueue_head(&ci->up); + + /* + * If a previous attempt resulted in a timeout, but + * the secondary did enter secondary_start_kernel, we don't + * want to clear the lock, as that would unwedge the secondary + * and make it start initializing, just to be reset again, + * possibly resulting in inconsistent state. + */ + arch_spin_trylock(&ci->up_lock); ret = boot_secondary(cpu, idle); if (ret == 0) { unsigned long timeout; /* - * CPU was successfully started, wait for it - * to come online or time out. + * CPU was successfully started, wait for it for 1s + * to hit the beginning of secondary_start_kernel + * or time out. Waiting for cpu_online with a timeout + * is dangerous - if we time out because the CPU takes + * longer than expected, we're in an inconsistent state, + * and future attempts to bring the CPU will fail. Once we + * reach the idle loop on the brought up core, clean up is + * also impossible without risking races. So we wait until + * the kernel enters secondary_start_kernel, and signal + * the secondary that it can continue booting. */ - timeout = jiffies + HZ; - while (time_before(jiffies, timeout)) { - if (cpu_online(cpu)) + + for(timeout = 0; timeout < 10000; timeout++) { + if (!cpu_booting(cpu)) break; - udelay(10); - barrier(); + udelay(100); + + /* + * Let something else do useful work while we wait. + */ + schedule(); } - if (!cpu_online(cpu)) + /* + * Stuck? Never made it? + */ + if (cpu_booting(cpu)) { + set_cpu_booting(cpu, false); + platform_cpu_kill(cpu); ret = -EIO; + } else { + arch_spin_unlock(&ci->up_lock); + wait_event(ci->up, cpu_online(cpu)); + } } secondary_data.stack = NULL; @@ -256,8 +301,20 @@ asmlinkage void __cpuinit secondary_start_kernel(void) { struct mm_struct *mm = &init_mm; unsigned int cpu = smp_processor_id(); + struct cpuinfo_arm *ci = &per_cpu(cpu_data, cpu); + printk("CPU%u: secondary processor up\n", cpu); + + /* + * Once we clear this bit, __cpu_up knows the core is up, + * in a consistent state, and ready to finish initialization, + * however long that takes. However, until it drops the lock, + * we're not going anywhere. This prevents a late entry + * into secondary_start_kernel to cause unexpected initialization. + */ + set_cpu_booting(cpu, false); + arch_spin_lock(&ci->up_lock); - printk("CPU%u: Booted secondary processor\n", cpu); + printk("CPU%u: Booting secondary processor\n", cpu); /* * All kernel threads share the same mm context; grab a @@ -299,6 +356,7 @@ asmlinkage void __cpuinit secondary_start_kernel(void) * OK, now it's safe to let the boot CPU continue */ set_cpu_online(cpu, true); + wake_up(&ci->up); /* * OK, it's off to the idle thread for us -- 1.7.0.4