[RFC PATCH 2/3] arm64: Introduce IRQ stack
Jungseok Lee
jungseoklee85 at gmail.com
Tue Sep 8 07:28:18 PDT 2015
On Sep 7, 2015, at 11:48 PM, James Morse wrote:
Hi James,
> On 04/09/15 15:23, Jungseok Lee wrote:
>> Currently, kernel context and interrupts are handled using a single
>> kernel stack navigated by sp_el1. This forces many systems to use
>> 16KB stack, not 8KB one. Low memory platforms naturally suffer from
>> both memory pressure and performance degradation simultaneously as
>> VM page allocator falls into slowpath frequently.
>>
>> This patch, thus, solves the problem as introducing a separate percpu
>> IRQ stack to handle both hard and soft interrupts with two ground rules:
>>
>> - Utilize sp_el0 in EL1 context, which is not used currently
>> - Do *not* complicate current_thread_info calculation
>>
>> struct thread_info can be tracked easily using sp_el0, not sp_el1 when
>> this feature is enabled.
>>
>> Signed-off-by: Jungseok Lee <jungseoklee85 at gmail.com>
>> ---
>> arch/arm64/Kconfig.debug | 10 ++
>> arch/arm64/include/asm/irq.h | 8 ++
>> arch/arm64/include/asm/thread_info.h | 11 ++
>> arch/arm64/kernel/asm-offsets.c | 8 ++
>> arch/arm64/kernel/entry.S | 83 +++++++++++++++-
>> arch/arm64/kernel/head.S | 7 ++
>> arch/arm64/kernel/irq.c | 18 ++++
>> 7 files changed, 142 insertions(+), 3 deletions(-)
>>
>> diff --git a/arch/arm64/Kconfig.debug b/arch/arm64/Kconfig.debug
>> index d6285ef..e16d91f 100644
>> --- a/arch/arm64/Kconfig.debug
>> +++ b/arch/arm64/Kconfig.debug
>> @@ -18,6 +18,16 @@ config ARM64_PTDUMP
>> kernel.
>> If in doubt, say "N"
>>
>> +config IRQ_STACK
>> + bool "Use separate kernel stack when handling interrupts"
>> + depends on ARM64_4K_PAGES
>> + help
>> + Say Y here if you want to use separate kernel stack to handle both
>> + hard and soft interrupts. As reducing memory footprint regarding
>> + kernel stack, it benefits low memory platforms.
>> +
>> + If in doubt, say N.
>> +
>
> I don't think it is necessary to have a debug-only Kconfig option for this.
> Reducing memory use is good for everyone!
>
> This would let you get rid of all the #ifdefs
Hmm... My only concern is stability. However, I agree that this should not be an
optional feature, especially since it definitely benefits low memory platforms.
I will remove this snippet in the next version.
>> config STRICT_DEVMEM
>> bool "Filter access to /dev/mem"
>> depends on MMU
>> diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h
>> index dcd06d1..5345a67 100644
>> --- a/arch/arm64/include/asm/thread_info.h
>> +++ b/arch/arm64/include/asm/thread_info.h
>> @@ -71,11 +71,22 @@ register unsigned long current_stack_pointer asm ("sp");
>> */
>> static inline struct thread_info *current_thread_info(void) __attribute_const__;
>>
>> +#ifndef CONFIG_IRQ_STACK
>> static inline struct thread_info *current_thread_info(void)
>> {
>> return (struct thread_info *)
>> (current_stack_pointer & ~(THREAD_SIZE - 1));
>> }
>> +#else
>> +static inline struct thread_info *current_thread_info(void)
>> +{
>> + unsigned long sp_el0;
>> +
>> + asm volatile("mrs %0, sp_el0" : "=r" (sp_el0));
>> +
>> + return (struct thread_info *)(sp_el0 & ~(THREAD_SIZE - 1));
>> +}
>> +#endif
>
> Because sp_el0 is only used as a stack value to find struct thread_info,
> you could just store the struct thread_info pointer in sp_el0, and save the
> masking on each read of the value.
IMHO, this logic, masking SP with ~(THREAD_SIZE - 1), is a well-known idiom,
so I don't want to change the expression. In addition, the same masking would
still be needed before storing the address of struct thread_info.
>>
>> #define thread_saved_pc(tsk) \
>> ((unsigned long)(tsk->thread.cpu_context.pc))
>> diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
>> index d23ca0d..f1fdfa9 100644
>> --- a/arch/arm64/kernel/entry.S
>> +++ b/arch/arm64/kernel/entry.S
>> @@ -88,7 +88,11 @@
>>
>> .if \el == 0
>> mrs x21, sp_el0
>> +#ifndef CONFIG_IRQ_STACK
>> get_thread_info tsk // Ensure MDSCR_EL1.SS is clear,
>> +#else
>> + get_thread_info \el, tsk
>> +#endif
>> ldr x19, [tsk, #TI_FLAGS] // since we can unmask debug
>> disable_step_tsk x19, x20 // exceptions when scheduling.
>> .endif
>> @@ -168,11 +172,56 @@
>> eret // return to kernel
>> .endm
>>
>> +#ifndef CONFIG_IRQ_STACK
>> .macro get_thread_info, rd
>> mov \rd, sp
>> - and \rd, \rd, #~(THREAD_SIZE - 1) // top of stack
>> + and \rd, \rd, #~(THREAD_SIZE - 1) // bottom of stack
>> + .endm
>> +#else
>> + .macro get_thread_info, el, rd
>> + .if \el == 0
>> + mov \rd, sp
>> + .else
>> + mrs \rd, sp_el0
>> + .endif
>> + and \rd, \rd, #~(THREAD_SIZE - 1) // bottom of thread stack
>> + .endm
>> +
>> + .macro get_irq_stack
>> + get_thread_info 1, tsk
>> + ldr w22, [tsk, #TI_CPU]
>> + adr_l x21, irq_stacks
>> + mov x23, #IRQ_STACK_SIZE
>> + madd x21, x22, x23, x21
>> .endm
>
> Using per_cpu variables would save the multiply here.
> You then wouldn't need IRQ_STACK_SIZE.
Agree.
>>
>> + .macro irq_stack_entry
>> + get_irq_stack
>> + ldr w23, [x21, #IRQ_COUNT]
>> + cbnz w23, 1f
>> + mov x23, sp
>> + str x23, [x21, #IRQ_THREAD_SP]
>> + ldr x23, [x21, #IRQ_STACK]
>> + mov sp, x23
>> + mov x23, xzr
>> +1: add w23, w23, #1
>> + str w23, [x21, #IRQ_COUNT]
>
> Why do you need to count the number of irqs?
> struct thread_info:preempt_count already does something similar (but it's
> not usable this early...)
>
> An alternative way would be to compare the top bits of the two stack
> pointers - all we care about is irq recursion right?
Exactly. The point is a recursion check, not an actual count, in this context.
I'd like to address the recursion issue with minimal load and store operations.
The suggestion sounds good. I will figure out a better and simpler way and
update it with comments in the code.
> This would save storing 'int count' for each cpu, and the logic to
> inc/decrement it.
>
>
>> + .endm
>> +
>> + .macro irq_stack_exit
>> + get_irq_stack
>> + ldr w23, [x21, #IRQ_COUNT]
>> + sub w23, w23, #1
>> + cbnz w23, 1f
>> + mov x23, sp
>> + str x23, [x21, #IRQ_STACK]
>> + ldr x23, [x21, #IRQ_THREAD_SP]
>> + mov sp, x23
>> + mov x23, xzr
>> +1: str w23, [x21, #IRQ_COUNT]
>> + .endm
>> +#endif
>> +
>> /*
>> * These are the registers used in the syscall handler, and allow us to
>> * have in theory up to 7 arguments to a function - x0 to x6.
>> @@ -188,10 +237,15 @@ tsk .req x28 // current thread_info
>> * Interrupt handling.
>> */
>> .macro irq_handler
>> - adrp x1, handle_arch_irq
>> - ldr x1, [x1, #:lo12:handle_arch_irq]
>> + ldr_l x1, handle_arch_irq
>> mov x0, sp
>> +#ifdef CONFIG_IRQ_STACK
>> + irq_stack_entry
>> +#endif
>> blr x1
>> +#ifdef CONFIG_IRQ_STACK
>> + irq_stack_exit
>> +#endif
>> .endm
>>
>> .text
>> @@ -366,7 +420,11 @@ el1_irq:
>> irq_handler
>>
>> #ifdef CONFIG_PREEMPT
>> +#ifndef CONFIG_IRQ_STACK
>> get_thread_info tsk
>> +#else
>> + get_thread_info 1, tsk
>> +#endif
>> ldr w24, [tsk, #TI_PREEMPT] // get preempt count
>> cbnz w24, 1f // preempt count != 0
>> ldr x0, [tsk, #TI_FLAGS] // get flags
>> @@ -395,6 +453,10 @@ el1_preempt:
>> .align 6
>> el0_sync:
>> kernel_entry 0
>> +#ifdef CONFIG_IRQ_STACK
>> + mov x25, sp
>> + msr sp_el0, x25
>> +#endif
>
> Could you do this in kernel_entry?
Yes, I could. It would make the code neater.
>> mrs x25, esr_el1 // read the syndrome register
>> lsr x24, x25, #ESR_ELx_EC_SHIFT // exception class
>> cmp x24, #ESR_ELx_EC_SVC64 // SVC in 64-bit state
>> @@ -423,6 +485,10 @@ el0_sync:
>> .align 6
>> el0_sync_compat:
>> kernel_entry 0, 32
>> +#ifdef CONFIG_IRQ_STACK
>> + mov x25, sp
>> + msr sp_el0, x25
>> +#endif
>> mrs x25, esr_el1 // read the syndrome register
>> lsr x24, x25, #ESR_ELx_EC_SHIFT // exception class
>> cmp x24, #ESR_ELx_EC_SVC32 // SVC in 32-bit state
>> @@ -560,6 +626,10 @@ ENDPROC(el0_sync)
>> el0_irq:
>> kernel_entry 0
>> el0_irq_naked:
>> +#ifdef CONFIG_IRQ_STACK
>> + mov x22, sp
>> + msr sp_el0, x22
>> +#endif
>> enable_dbg
>> #ifdef CONFIG_TRACE_IRQFLAGS
>> bl trace_hardirqs_off
>> @@ -602,6 +672,9 @@ ENTRY(cpu_switch_to)
>> ldp x29, x9, [x8], #16
>> ldr lr, [x8]
>> mov sp, x9
>> +#ifdef CONFIG_IRQ_STACK
>> + msr sp_el0, x9
>> +#endif
>> ret
>> ENDPROC(cpu_switch_to)
>>
>> @@ -661,7 +734,11 @@ ENTRY(ret_from_fork)
>> cbz x19, 1f // not a kernel thread
>> mov x0, x20
>> blr x19
>> +#ifndef CONFIG_IRQ_STACK
>> 1: get_thread_info tsk
>> +#else
>> +1: get_thread_info 1, tsk
>> +#endif
>> b ret_to_user
>> ENDPROC(ret_from_fork)
>>
>> diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S
>> index c0ff3ce..3142766 100644
>> --- a/arch/arm64/kernel/head.S
>> +++ b/arch/arm64/kernel/head.S
>> @@ -446,6 +446,10 @@ __mmap_switched:
>> b 1b
>> 2:
>> adr_l sp, initial_sp, x4
>> +#ifdef CONFIG_IRQ_STACK
>> + mov x4, sp
>> + msr sp_el0, x4
>> +#endif
>> str_l x21, __fdt_pointer, x5 // Save FDT pointer
>> str_l x24, memstart_addr, x6 // Save PHYS_OFFSET
>> mov x29, #0
>> @@ -619,6 +623,9 @@ ENDPROC(secondary_startup)
>> ENTRY(__secondary_switched)
>> ldr x0, [x21] // get secondary_data.stack
>> mov sp, x0
>> +#ifdef CONFIG_IRQ_STACK
>> + msr sp_el0, x0
>> +#endif
>> mov x29, #0
>> b secondary_start_kernel
>> ENDPROC(__secondary_switched)
>> diff --git a/arch/arm64/kernel/irq.c b/arch/arm64/kernel/irq.c
>> index 463fa2e..fb0f522 100644
>> --- a/arch/arm64/kernel/irq.c
>> +++ b/arch/arm64/kernel/irq.c
>> @@ -31,6 +31,10 @@
>>
>> unsigned long irq_err_count;
>>
>> +#ifdef CONFIG_IRQ_STACK
>> +struct irq_stack irq_stacks[NR_CPUS];
>> +#endif
>> +
>
> DEFINE_PER_CPU()?
Yeah, I will update it with per_cpu in the next version.
>> int arch_show_interrupts(struct seq_file *p, int prec)
>> {
>> #ifdef CONFIG_SMP
>> @@ -52,6 +56,20 @@ void __init set_handle_irq(void (*handle_irq)(struct pt_regs *))
>>
>> void __init init_IRQ(void)
>> {
>> +#ifdef CONFIG_IRQ_STACK
>> + unsigned int cpu;
>> +
>> + for_each_possible_cpu(cpu) {
>> + void *stack = (void *)__get_free_pages(THREADINFO_GFP,
>> + THREAD_SIZE_ORDER);
>> + if (!stack)
>> + panic("CPU%u: No IRQ stack", cpu);
>> +
>> + irq_stacks[cpu].stack = stack + THREAD_START_SP;
>> + }
>> + pr_info("IRQ: Allocated IRQ stack successfully\n");
>> +#endif
>> +
>
> Wouldn't it be better to only allocate the stack when the CPU is about to
> be brought up? (put the alloc code in __cpu_up()).
__cpu_up could be called very frequently if a power management scheme based
on CPU hotplug is actively used. So, I'd like to avoid even simple logic
that checks for a previously allocated IRQ stack.
Thanks for the valuable feedback.
Best Regards
Jungseok Lee
More information about the linux-arm-kernel
mailing list