[PATCH 5/5] ARM: implement support for vmap'ed stacks

Ard Biesheuvel ardb at kernel.org
Fri Oct 8 00:41:27 PDT 2021


Wire up the generic support for managing task stack allocations via vmalloc,
and implement the entry code that detects whether we faulted because of a
stack overrun (or an imminent stack overrun caused by pushing the pt_regs
array).
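
For reference, the generic code that this enables allocates each task
stack from the vmalloc area. The sketch below paraphrases the
kernel/fork.c logic; the helper name is made up for illustration and
the details may differ between kernel versions:

  /* Illustrative sketch only -- not part of this patch */
  static void *alloc_vmap_stack(int node)
  {
          return __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN,
                                      VMALLOC_START, VMALLOC_END,
                                      THREADINFO_GFP, PAGE_KERNEL, 0,
                                      node, __builtin_return_address(0));
  }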

While this adds a fair amount of tricky entry asm code, it should be
noted that it only adds a TST + branch to the svc_entry path (although
Thumb-2 needs a little tweak to get the stack pointer into a GPR). The
code implementing the non-trivial manipulation of the overflow stack
pointer is emitted out-of-line into the .text section, with only a small
trampoline residing in .entry.text (but not on the entry path itself)
that branches out to C code if a kernel stack overflow actually occurs.
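
To illustrate why a single TST suffices: task and IRQ stacks are
allocated with THREAD_ALIGN (2 * THREAD_SIZE) alignment, so the
THREAD_SIZE bit of SP is zero whenever SP still points into a valid
stack, and becomes one as soon as SP drops off the bottom. In C, the
test amounts to something like this (illustration only, not part of
the patch):

  static inline bool stack_overflowed(unsigned long sp)
  {
          /* the bit with weight THREAD_SIZE, i.e. bit 13 for 8k stacks */
          return sp & (1UL << (THREAD_SIZE_ORDER + PAGE_SHIFT));
  }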

Since on ARM, we rely on do_translation_fault() to keep PMD level page
table entries that cover the vmalloc region up to date, we need to
ensure that we don't hit such a stale PMD entry when accessing the
stack. So we do a dummy read from the new stack while still running from
the old one on the context switch path, and bump the vmalloc_seq counter
when PMD level entries in the vmalloc range are modified, so that the MM
switch fetches the latest version of the entries.
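
For context, the existing MM switch logic referred to here is
__check_vmalloc_seq() in arch/arm/mm/ioremap.c, which copies the
kernel's page table entries covering the vmalloc range whenever the
sequence counters disagree. Approximately (consult the tree for the
authoritative version):

  void __check_vmalloc_seq(struct mm_struct *mm)
  {
          unsigned int seq;

          do {
                  seq = init_mm.context.vmalloc_seq;
                  memcpy(pgd_offset(mm, VMALLOC_START),
                         pgd_offset_k(VMALLOC_START),
                         sizeof(pgd_t) * (pgd_index(VMALLOC_END) -
                                          pgd_index(VMALLOC_START)));
                  mm->context.vmalloc_seq = seq;
          } while (seq != init_mm.context.vmalloc_seq);
  }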

Note that this also adds unwind information to the entry code dealing
with the stack overflow condition, but this is only used when an
additional fault is triggered while trying to mitigate the original
overflow.

Signed-off-by: Ard Biesheuvel <ardb at kernel.org>
---
 arch/arm/Kconfig                   |  1 +
 arch/arm/include/asm/page.h        |  4 ++
 arch/arm/include/asm/thread_info.h |  8 +++
 arch/arm/kernel/entry-armv.S       | 75 ++++++++++++++++++--
 arch/arm/kernel/entry-header.S     | 74 +++++++++++++++++++
 arch/arm/kernel/irq.c              |  9 ++-
 arch/arm/kernel/traps.c            | 65 ++++++++++++++++-
 arch/arm/kernel/unwind.c           |  3 +-
 arch/arm/kernel/vmlinux.lds.S      |  4 +-
 9 files changed, 230 insertions(+), 13 deletions(-)

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index d46b243e1b26..964cc41a0fed 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -126,6 +126,7 @@ config ARM
 	select RTC_LIB
 	select SYS_SUPPORTS_APM_EMULATION
 	select THREAD_INFO_IN_TASK if CURRENT_POINTER_IN_TPIDRURO
+	select HAVE_ARCH_VMAP_STACK if THREAD_INFO_IN_TASK
 	select TRACE_IRQFLAGS_SUPPORT if !CPU_V7M
 	# Above selects are sorted alphabetically; please add new ones
 	# according to that.  Thanks.
diff --git a/arch/arm/include/asm/page.h b/arch/arm/include/asm/page.h
index 11b058a72a5b..7b871ed99ccf 100644
--- a/arch/arm/include/asm/page.h
+++ b/arch/arm/include/asm/page.h
@@ -149,6 +149,10 @@ extern void copy_page(void *to, const void *from);
 #include <asm/pgtable-2level-types.h>
 #endif
 
+#ifdef CONFIG_VMAP_STACK
+#define ARCH_PAGE_TABLE_SYNC_MASK	PGTBL_PMD_MODIFIED
+#endif
+
 #endif /* CONFIG_MMU */
 
 typedef struct page *pgtable_t;
diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h
index 787511396f3f..900c8ae484c9 100644
--- a/arch/arm/include/asm/thread_info.h
+++ b/arch/arm/include/asm/thread_info.h
@@ -25,6 +25,14 @@
 #define THREAD_SIZE		(PAGE_SIZE << THREAD_SIZE_ORDER)
 #define THREAD_START_SP		(THREAD_SIZE - 8)
 
+#ifdef CONFIG_VMAP_STACK
+#define THREAD_ALIGN		(2 * THREAD_SIZE)
+#else
+#define THREAD_ALIGN		THREAD_SIZE
+#endif
+
+#define OVERFLOW_STACK_SIZE	SZ_4K
+
 #ifndef __ASSEMBLY__
 
 struct task_struct;
diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S
index 47ba80d7a84a..ca1b01425a1a 100644
--- a/arch/arm/kernel/entry-armv.S
+++ b/arch/arm/kernel/entry-armv.S
@@ -53,6 +53,14 @@ ARM(	mov	r8, fp		)	@ Preserve original FP
 	@
 	subs	r2, sp, r7		@ SP above bottom of IRQ stack?
 	rsbscs	r2, r2, #THREAD_SIZE	@ ... and below the top?
+#ifdef CONFIG_VMAP_STACK
+	bcs	.L\@
+	mov_l	r2, overflow_stack	@ Take base address
+	add	r2, r2, r3		@ This CPU's overflow stack base
+	subs	r2, r7, r2		@ Compare with incoming SP
+	rsbscs	r2, r2, #OVERFLOW_STACK_SIZE
+.L\@:
+#endif
 	movcs	sp, r7			@ If so, revert to incoming SP
 
 	@
@@ -177,10 +185,15 @@ ENDPROC(__und_invalid)
 #define SPFIX(code...)
 #endif
 
-	.macro	svc_entry, stack_hole=0, trace=1, uaccess=1
+	.macro	svc_entry, stack_hole=0, trace=1, uaccess=1, overflow_check=1
+	sub	sp, sp, #(SVC_REGS_SIZE + \stack_hole - 4)
+	.if	\overflow_check
+	do_overflow_check (SVC_REGS_SIZE + \stack_hole - 4)
  UNWIND(.fnstart		)
  UNWIND(.save {r0 - pc}		)
-	sub	sp, sp, #(SVC_REGS_SIZE + \stack_hole - 4)
+	.else
+ UNWIND(.fnstart		)
+	.endif
 #ifdef CONFIG_THUMB2_KERNEL
  SPFIX(	str	r0, [sp]	)	@ temporarily saved
  SPFIX(	mov	r0, sp		)
@@ -812,16 +825,64 @@ ENTRY(__switch_to)
 #if defined(CONFIG_STACKPROTECTOR) && !defined(CONFIG_SMP)
 	str	r7, [r8]
 #endif
- THUMB(	mov	ip, r4			   )
 	mov	r0, r5
+#if !defined(CONFIG_THUMB2_KERNEL) && !defined(CONFIG_VMAP_STACK)
 	set_current r7
- ARM(	ldmia	r4, {r4 - sl, fp, sp, pc}  )	@ Load all regs saved previously
- THUMB(	ldmia	ip!, {r4 - sl, fp}	   )	@ Load all regs saved previously
- THUMB(	ldr	sp, [ip], #4		   )
- THUMB(	ldr	pc, [ip]		   )
+	ldmia	r4, {r4 - sl, fp, sp, pc}	@ Load all regs saved previously
+#else
+	mov	r1, r7
+	ldmia	r4, {r4 - sl, fp, ip, lr}	@ Load all regs saved previously
+#ifdef CONFIG_VMAP_STACK
+	@
+	@ Do a dummy read from the new stack while running from the old one so
+	@ that we can rely on do_translation_fault() to fix up any stale PMD
+	@ entries covering the vmalloc region.
+	@
+	ldr	r2, [ip]
+#endif
+	set_current r1
+	mov	sp, ip
+	bx	lr
+#endif
  UNWIND(.fnend		)
 ENDPROC(__switch_to)
 
+#ifdef CONFIG_VMAP_STACK
+__bad_stack:
+	@
+	@ We detected an overflow in svc_entry, which switched to the
+	@ overflow stack. Stash the exception regs, and head to our overflow
+	@ handler.
+	@
+
+	mov	ip, sp
+THUMB(	sub	sp, #4		)
+	push	{fpreg, ip, lr ARM(, pc)}
+
+UNWIND( ldr	lr, [ip, #4]	)		@ restore exception LR
+	mrc	p15, 0, ip, c13, c0, 2		@ reload IP
+
+	@ Store the original GPRs to the new stack.
+	svc_entry uaccess=0, overflow_check=0
+
+UNWIND( .save   {fpreg, sp, lr}	)
+UNWIND( .setfp  fpreg, sp, #12	)
+	ldr	fpreg, [sp, #S_SP]		@ Add our frame record
+	add	fpreg, fpreg, #12		@ to the linked list
+
+	ldr	r1, [fpreg, #4]			@ reload SP at entry
+	str	r1, [sp, #S_SP]			@ store in pt_regs
+
+	@ Stash the regs for handle_bad_stack
+	mov	r0, sp
+
+	@ Time to die
+	bl	handle_bad_stack
+	nop
+UNWIND( .fnend			)
+ENDPROC(__bad_stack)
+#endif
+
 	__INIT
 
 /*
diff --git a/arch/arm/kernel/entry-header.S b/arch/arm/kernel/entry-header.S
index ae24dd54e9ef..083967065688 100644
--- a/arch/arm/kernel/entry-header.S
+++ b/arch/arm/kernel/entry-header.S
@@ -423,3 +423,77 @@ scno	.req	r7		@ syscall number
 tbl	.req	r8		@ syscall table pointer
 why	.req	r8		@ Linux syscall (!= 0)
 tsk	.req	r9		@ current thread_info
+
+	.macro	do_overflow_check, frame_size:req
+#ifdef CONFIG_VMAP_STACK
+	@
+	@ Test whether the SP has overflowed, without corrupting a GPR.
+	@ Task and IRQ stacks are aligned so that
+	@ SP & BIT(THREAD_SIZE_ORDER + PAGE_SHIFT) should always be zero.
+	@
+ARM(	tst	sp, #1 << (THREAD_SIZE_ORDER + PAGE_SHIFT)	)
+
+	@
+	@ Note that Thumb-2 needs this little dance to get the value of SP
+	@ into a register that may be used with TST, and needs the explicit
+	@ IT NE to ensure that the unconditional branch opcode is chosen,
+	@ which guarantees that the cross-section branch will be in range.
+	@
+THUMB(	add	sp, sp, r0					)
+THUMB(	sub	r0, sp, r0					)
+THUMB(	tst	r0, #1 << (THREAD_SIZE_ORDER + PAGE_SHIFT)	)
+THUMB(	sub	r0, sp, r0					)
+THUMB(	sub	sp, sp, r0					)
+THUMB(	it	ne						)
+	bne	stack_overflow_check\@
+
+	.pushsection	.text
+stack_overflow_check\@:
+UNWIND( .fnstart		)
+	@
+	@ Either we've just detected an overflow, or we've taken an exception
+	@ while on the overflow stack. We cannot use the stack until we have
+	@ decided which is the case. However, as we won't return to userspace,
+	@ we can clobber some USR/SYS mode registers to free up GPRs.
+	@
+
+	mcr	p15, 0, ip, c13, c0, 2		@ Stash IP in TPIDRURW
+	mrs	ip, cpsr
+	eor	ip, ip, #(SVC_MODE ^ SYSTEM_MODE)
+	msr	cpsr_c, ip			@ switch to SYS mode
+	eor	lr, ip, #(SVC_MODE ^ SYSTEM_MODE)
+
+	@ Load the overflow stack into IP using SP_usr as a scratch register
+	ldr	sp, .Lit\@			@ Get base address
+	mrc	p15, 0, ip, c13, c0, 4		@ Get CPU offset
+	add	ip, sp, ip			@ IP := this CPU's overflow stack
+	msr	cpsr_c, lr			@ Switch back using mode in LR_usr
+
+	@
+	@ Check whether we are already on the overflow stack. This may happen
+	@ after panic() re-enables interrupts or when performing accesses that
+	@ may fault when dumping the stack.
+	@ The overflow stack is not in the vmalloc space so we only need to
+	@ check whether the incoming SP is below the top of the overflow stack.
+	@
+	subs	ip, sp, ip			@ Delta with top of overflow stack
+	mrclo	p15, 0, ip, c13, c0, 2		@ Restore IP
+	blo	.Lout\@				@ Carry on
+
+	sub	sp, sp, ip			@ Switch to overflow stack
+	add	ip, sp, ip			@ Keep incoming SP value in IP
+	add	ip, ip, #\frame_size		@ Undo svc_entry's SP change
+	push	{ip, lr}			@ Preserve on the stack
+
+UNWIND( .save	{sp, lr}	)		@ Create an unwind frame here to
+UNWIND( mov	lr, pc		)		@ chain the two stacks together
+	b	__bad_stack
+UNWIND( .fnend			)
+ENDPROC(stack_overflow_check\@)
+
+	.align	2
+.Lit\@:	.long		overflow_stack + OVERFLOW_STACK_SIZE
+	.popsection
+.Lout\@:
+#endif
+	.endm
diff --git a/arch/arm/kernel/irq.c b/arch/arm/kernel/irq.c
index feb07f703a98..8718aaf47891 100644
--- a/arch/arm/kernel/irq.c
+++ b/arch/arm/kernel/irq.c
@@ -55,7 +55,14 @@ static void __init init_irq_stacks(void)
 	int cpu;
 
 	for_each_possible_cpu(cpu) {
-		stack = (u8 *)__get_free_pages(GFP_KERNEL, THREAD_SIZE_ORDER);
+		if (!IS_ENABLED(CONFIG_VMAP_STACK))
+			stack = (u8 *)__get_free_pages(GFP_KERNEL,
+						       THREAD_SIZE_ORDER);
+		else
+			stack = __vmalloc_node(THREAD_SIZE, THREAD_ALIGN,
+					       THREADINFO_GFP, NUMA_NO_NODE,
+					       __builtin_return_address(0));
+
 		if (WARN_ON(!stack))
 			break;
 		per_cpu(irq_stack_ptr, cpu) = &stack[THREAD_SIZE];
diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c
index b42c446cec9a..eb8c73be7c81 100644
--- a/arch/arm/kernel/traps.c
+++ b/arch/arm/kernel/traps.c
@@ -121,7 +121,8 @@ void dump_backtrace_stm(u32 *stack, u32 instruction, const char *loglvl)
 static int verify_stack(unsigned long sp)
 {
 	if (sp < PAGE_OFFSET ||
-	    (sp > (unsigned long)high_memory && high_memory != NULL))
+	    (!IS_ENABLED(CONFIG_VMAP_STACK) &&
+	     sp > (unsigned long)high_memory && high_memory != NULL))
 		return -EFAULT;
 
 	return 0;
@@ -291,7 +292,8 @@ static int __die(const char *str, int err, struct pt_regs *regs)
 
 	if (!user_mode(regs) || in_interrupt()) {
 		dump_mem(KERN_EMERG, "Stack: ", regs->ARM_sp,
-			 ALIGN(regs->ARM_sp, THREAD_SIZE));
+			 ALIGN(regs->ARM_sp - THREAD_SIZE, THREAD_ALIGN)
+			 + THREAD_SIZE);
 		dump_backtrace(regs, tsk, KERN_EMERG);
 		dump_instr(KERN_EMERG, regs);
 	}
@@ -838,3 +840,62 @@ void __init early_trap_init(void *vectors_base)
 	 */
 #endif
 }
+
+#ifdef CONFIG_VMAP_STACK
+
+DECLARE_PER_CPU(u8 *, irq_stack_ptr);
+
+asmlinkage DEFINE_PER_CPU_ALIGNED(u8[OVERFLOW_STACK_SIZE], overflow_stack);
+
+asmlinkage void handle_bad_stack(struct pt_regs *regs)
+{
+	unsigned long tsk_stk = (unsigned long)current->stack;
+	unsigned long irq_stk = (unsigned long)this_cpu_read(irq_stack_ptr);
+	unsigned long ovf_stk = (unsigned long)this_cpu_ptr(overflow_stack);
+
+	console_verbose();
+	pr_emerg("Insufficient stack space to handle exception!");
+
+	pr_emerg("Task stack:     [0x%08lx..0x%08lx]\n",
+		 tsk_stk, tsk_stk + THREAD_SIZE);
+	pr_emerg("IRQ stack:      [0x%08lx..0x%08lx]\n",
+		 irq_stk, irq_stk + THREAD_SIZE);
+	pr_emerg("Overflow stack: [0x%08lx..0x%08lx]\n",
+		 ovf_stk, ovf_stk + OVERFLOW_STACK_SIZE);
+
+	die("kernel stack overflow", regs, 0);
+}
+
+/*
+ * Normally, we rely on the logic in do_translation_fault() to update stale PMD
+ * entries covering the vmalloc space in a task's page tables when it first
+ * accesses the region in question. Unfortunately, this is not sufficient when
+ * the task stack resides in the vmalloc region, as do_translation_fault() is a
+ * C function that needs a stack to run.
+ *
+ * So we need to ensure that these PMD entries are up to date *before* the MM
+ * switch. As we already have some logic in the MM switch path that takes care
+ * of this, let's trigger it by bumping the counter every time the core vmalloc
+ * code modifies a PMD entry in the vmalloc region.
+ */
+void arch_sync_kernel_mappings(unsigned long start, unsigned long end)
+{
+	if (start > VMALLOC_END || end < VMALLOC_START)
+		return;
+
+	/*
+	 * This hooks into the core vmalloc code to receive notifications of
+	 * any PMD level changes that have been made to the kernel page tables.
+	 * This means it should only be triggered once for every MiB worth of
+	 * vmalloc space, given that we don't support huge vmalloc/vmap on ARM,
+	 * and that kernel PMD level table entries are rarely (if ever)
+	 * updated.
+	 *
+	 * This means that the counter is going to max out at ~250 for the
+	 * typical case. If it overflows, something entirely unexpected has
+	 * occurred so let's throw a warning if that happens.
+	 */
+	WARN_ON(++init_mm.context.vmalloc_seq == UINT_MAX);
+}
+
+#endif
diff --git a/arch/arm/kernel/unwind.c b/arch/arm/kernel/unwind.c
index de0d26dc73fd..f67d548d2b28 100644
--- a/arch/arm/kernel/unwind.c
+++ b/arch/arm/kernel/unwind.c
@@ -389,7 +389,8 @@ int unwind_frame(struct stackframe *frame)
 
 	/* store the highest address on the stack to avoid crossing it*/
 	ctrl.sp_low = frame->sp;
-	ctrl.sp_high = ALIGN(ctrl.sp_low, THREAD_SIZE);
+	ctrl.sp_high = ALIGN(ctrl.sp_low - THREAD_SIZE, THREAD_ALIGN)
+		       + THREAD_SIZE;
 
 	pr_debug("%s(pc = %08lx lr = %08lx sp = %08lx)\n", __func__,
 		 frame->pc, frame->lr, frame->sp);
diff --git a/arch/arm/kernel/vmlinux.lds.S b/arch/arm/kernel/vmlinux.lds.S
index 20c4f6d20c7a..924cde084293 100644
--- a/arch/arm/kernel/vmlinux.lds.S
+++ b/arch/arm/kernel/vmlinux.lds.S
@@ -138,12 +138,12 @@ SECTIONS
 #ifdef CONFIG_STRICT_KERNEL_RWX
 	. = ALIGN(1<<SECTION_SHIFT);
 #else
-	. = ALIGN(THREAD_SIZE);
+	. = ALIGN(THREAD_ALIGN);
 #endif
 	__init_end = .;
 
 	_sdata = .;
-	RW_DATA(L1_CACHE_BYTES, PAGE_SIZE, THREAD_SIZE)
+	RW_DATA(L1_CACHE_BYTES, PAGE_SIZE, THREAD_ALIGN)
 	_edata = .;
 
 	BSS_SECTION(0, 0, 0)
-- 
2.30.2