arm64: Support dynamic kernel stacks

Turning on dynamic kernel stacks can save a lot of kernel
RAM on ARM64.

Example on minimal OpenWrt system:
$ cat /proc/vmstat | grep stack
nr_kernel_stack 320
nr_dynamic_stacks_faults 8

Each stack initially uses just 4KB; each fault then extends
one of the kernel stacks by one more 4KB page.

We see that in this case we consume 320 + 8 = 328 pages, i.e.

  328 * 4KB = 1.28 MB

of RAM for stacks.

Compare this with 16KB pre-allocated stacks: all 320
processes would use 16KB of physical memory each, i.e.

  320 * 16KB = 5 MB

So in this minimal system we save almost 4 MB of runtime
memory.

The approach taken here is to special-case the sync
exceptions in the vector table. If we are handling a
sync exception (and using dynamic stacks), we stash
(x16, x17) into (TPIDR_EL0, TPIDRRO_EL0) temporarily so
that we can execute some code without using any stack
at all.

We define a special sync stack that is only used
when handling sync exceptions, and we switch to this stack
immediately in the exception handler, without saving
a single value onto the task stack.

We then check whether this sync exception was a data abort
on the ordinary task stack. If it was not, we copy our
current sync stack over to the task stack and continue
as if nothing special happened.

If this was indeed a data abort on the task stack,
we call do_stack_abort() which in turn calls
dynamic_stack_fault() to latch a new physical page
into the stack, all while running on the temporary
sync stack. We then return from the exception, restoring
SP to what it was before the sync exception.
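
Schematically, the C side of this decision looks roughly as
follows; the authoritative version is in the entry-common.c
hunk of the diff below:

  asmlinkage int noinstr el1h_64_sync_handler(struct pt_regs *regs)
  {
          unsigned long esr = read_sysreg(esr_el1);

          if (IS_ENABLED(CONFIG_DYNAMIC_STACK)) {
                  unsigned long far = read_sysreg(far_el1);
                  unsigned int fault;

                  /* Un-stash the TLS registers clobbered in the vector */
                  tls_thread_restore_current();
                  fault = el1_page_fault_on_stack(esr, far);
                  if (fault == EL1_STACK_OVERFLOW)
                          handle_bad_stack(regs);
                  if (fault == EL1_FAULT_ON_STACK) {
                          /* Latch in a new stack page, stay on sync stack */
                          do_stack_abort(far, regs);
                          return 1;
                  }
                  /* Not a stack fault: copy the frame back, carry on */
                  switch_sync_stack_to_task_stack();
          }
          /* ... ordinary EL1h sync dispatch, now on the task stack ... */
          return 0;
  }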

Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 55fc331..9bea55b 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -175,6 +175,7 @@
 	select HAVE_ARCH_AUDITSYSCALL
 	select HAVE_ARCH_BITREVERSE
 	select HAVE_ARCH_COMPILER_H
+	select HAVE_ARCH_DYNAMIC_STACK
 	select HAVE_ARCH_HUGE_VMALLOC
 	select HAVE_ARCH_HUGE_VMAP
 	select HAVE_ARCH_JUMP_LABEL
diff --git a/arch/arm64/include/asm/exception.h b/arch/arm64/include/asm/exception.h
index d48fc16..c5410f0 100644
--- a/arch/arm64/include/asm/exception.h
+++ b/arch/arm64/include/asm/exception.h
@@ -33,7 +33,7 @@ asmlinkage void el1t_64_irq_handler(struct pt_regs *regs);
 asmlinkage void el1t_64_fiq_handler(struct pt_regs *regs);
 asmlinkage void el1t_64_error_handler(struct pt_regs *regs);
 
-asmlinkage void el1h_64_sync_handler(struct pt_regs *regs);
+asmlinkage int el1h_64_sync_handler(struct pt_regs *regs);
 asmlinkage void el1h_64_irq_handler(struct pt_regs *regs);
 asmlinkage void el1h_64_fiq_handler(struct pt_regs *regs);
 asmlinkage void el1h_64_error_handler(struct pt_regs *regs);
@@ -48,10 +48,12 @@ asmlinkage void el0t_32_irq_handler(struct pt_regs *regs);
 asmlinkage void el0t_32_fiq_handler(struct pt_regs *regs);
 asmlinkage void el0t_32_error_handler(struct pt_regs *regs);
 
+asmlinkage void switch_sync_stack_to_task_stack(void);
 asmlinkage void call_on_irq_stack(struct pt_regs *regs,
 				  void (*func)(struct pt_regs *));
 asmlinkage void asm_exit_to_user_mode(struct pt_regs *regs);
 
+void do_stack_abort(unsigned long far, struct pt_regs *regs);
 void do_mem_abort(unsigned long far, unsigned long esr, struct pt_regs *regs);
 void do_el0_undef(struct pt_regs *regs, unsigned long esr);
 void do_el1_undef(struct pt_regs *regs, unsigned long esr);
diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h
index 717829d..ba7969e 100644
--- a/arch/arm64/include/asm/memory.h
+++ b/arch/arm64/include/asm/memory.h
@@ -143,6 +143,13 @@
 
 #define IRQ_STACK_SIZE		THREAD_SIZE
 
+/*
+ * This size was determined by trial and error. The sync aborts do not
+ * nest very deep: they map a new page into the stack and return; no calls
+ * into deep memory management code etc. are made from this context.
+ */
+#define SYNC_STACK_SIZE		SZ_8K
+
 #define OVERFLOW_STACK_SIZE	SZ_4K
 
 #define NVHE_STACK_SHIFT       PAGE_SHIFT
diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c
index 7c1970b..9d52b0f 100644
--- a/arch/arm64/kernel/entry-common.c
+++ b/arch/arm64/kernel/entry-common.c
@@ -523,10 +523,65 @@ static void noinstr el1_fpac(struct pt_regs *regs, unsigned long esr)
 	exit_to_kernel_mode(regs);
 }
 
-asmlinkage void noinstr el1h_64_sync_handler(struct pt_regs *regs)
+#define EL1_FAULT_ON_STACK 1
+#define EL1_STACK_OVERFLOW 2
+
+static unsigned int noinstr el1_page_fault_on_stack(unsigned long esr,
+						     unsigned long far)
+{
+	unsigned long stack = (unsigned long)current->stack;
+	unsigned long addr = untagged_addr(far);
+
+	/*
+	 * Is this even a page fault?
+	 * NB: we only check for data aborts; we have no business
+	 * executing code on the stack, so no instruction aborts.
+	 */
+	if (ESR_ELx_EC(esr) != ESR_ELx_EC_DABT_CUR)
+		return 0;
+
+	if (addr < stack || addr >= stack + THREAD_SIZE)
+		return 0;
+
+	/* We hit the bottom of the stack: overflow! */
+	if (addr == stack)
+		return EL1_STACK_OVERFLOW;
+
+	/* Actually a page fault on the stack! */
+	return EL1_FAULT_ON_STACK;
+}
+
+/* Returns 1 if we are still on the sync stack, else 0 */
+asmlinkage int noinstr el1h_64_sync_handler(struct pt_regs *regs)
 {
 	unsigned long esr = read_sysreg(esr_el1);
 
+	if (IS_ENABLED(CONFIG_DYNAMIC_STACK)) {
+		unsigned long far = read_sysreg(far_el1);
+		unsigned int fault;
+
+		/* First restore tpidr_el0 and tpidrro_el0 for dynamic stack */
+		tls_thread_restore_current();
+		/*
+		 * Are we faulting on the thread stack? If not, just switch
+		 * back to the thread stack and continue as if nothing happened.
+		 * Code running in the abort handlers can itself trigger further
+		 * aborts, so we most definitely need to switch back to the
+		 * task stack unless we are handling a page fault on the
+		 * stack itself.
+		 */
+		fault = el1_page_fault_on_stack(esr, far);
+		if (fault == EL1_STACK_OVERFLOW)
+			handle_bad_stack(regs);
+		if (fault == EL1_FAULT_ON_STACK) {
+			pr_info("PAGE FAULT ON STACK!!\n");
+			do_stack_abort(far, regs);
+			return 1;
+		} else {
+			switch_sync_stack_to_task_stack();
+		}
+	}
+
 	switch (ESR_ELx_EC(esr)) {
 	case ESR_ELx_EC_DABT_CUR:
 	case ESR_ELx_EC_IABT_CUR:
@@ -564,6 +619,8 @@ asmlinkage void noinstr el1h_64_sync_handler(struct pt_regs *regs)
 	default:
 		__panic_unhandled(regs, "64-bit el1h sync", esr);
 	}
+
+	return 0;
 }
 
 static __always_inline void __el1_pnmi(struct pt_regs *regs,
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index 5ae2a34..389cb47 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -55,11 +55,15 @@
 	.endif
 
 	sub	sp, sp, #PT_REGS_SIZE
-#ifdef CONFIG_VMAP_STACK
+#if defined(CONFIG_VMAP_STACK) && !defined(CONFIG_DYNAMIC_STACK)
 	/*
 	 * Test whether the SP has overflowed, without corrupting a GPR.
 	 * Task and IRQ stacks are aligned so that SP & (1 << THREAD_SHIFT)
 	 * should always be zero.
+	 *
+	 * This trick is not needed when using dynamic stacks; the dynamic
+	 * stack handling code has its own overflow handling using the sync
+	 * stack.
 	 */
 	add	sp, sp, x0			// sp' = sp + x0
 	sub	x0, sp, x0			// x0' = sp' - x0 = (sp + x0) - x0 = sp
@@ -97,11 +101,45 @@
 	/* We were already on the overflow stack. Restore sp/x0 and carry on. */
 	sub	sp, sp, x0
 	mrs	x0, tpidrro_el0
-#endif
+#endif /* CONFIG_VMAP_STACK */
 	b	el\el\ht\()_\regsize\()_\label
 .org .Lventry_start\@ + 128	// Did we overflow the ventry slot?
 	.endm
 
+#ifdef CONFIG_DYNAMIC_STACK
+	/* Only used for EL1 sync exceptions */
+	.macro kernel_ventry_el1_sync, el:req, ht:req, regsize:req, label:req
+	.align 7
+.Lventry_el1_sync_start\@:
+	/*
+	 * Call the sync handler on the special sync stack since we could
+	 * have just encountered a data abort on the task stack, and in
+	 * that case we cannot store anything on the current task stack.
+	 */
+	// Save x16, x17 in tpidr_el0, tpidrro_el0
+	msr	tpidr_el0, x16
+	msr	tpidrro_el0, x17
+	ldr_this_cpu x16, sync_stack_ptr, x17
+	mov	x17, sp		// Save old SP in x17
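+	// Switch SP to the top of this CPU's sync stack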
+	add	sp, x16, #SYNC_STACK_SIZE
+	sub	sp, sp, #8
+	str	x17, [sp]	// Save old SP on the sync stack
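+	// Reserve the pt_regs area that kernel_entry will fill in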
+	sub	sp, sp, #PT_REGS_SIZE
+	// Restore x16, x17 from tpidr_el0, tpidrro_el0
+	mrs	x16, tpidr_el0
+	mrs	x17, tpidrro_el0
+	/*
+	 * Now tpidr_el0, tpidrro_el0 are clobbered and need to be restored
+	 * in the el1 sync handlers in C.
+	 */
+	b	el\el\ht\()_\regsize\()_\label
+.org .Lventry_el1_sync_start\@ + 128	// Did we overflow the ventry slot?
+	.endm
+#else
+	/* Just an alias to the kernel_ventry macro */
+#define kernel_ventry_el1_sync kernel_ventry
+#endif /* CONFIG_DYNAMIC_STACK */
+
 	.macro	tramp_alias, dst, sym
 	.set	.Lalias\@, TRAMP_VALIAS + \sym - .entry.tramp.text
 	movz	\dst, :abs_g2_s:.Lalias\@
@@ -334,7 +372,7 @@
 	*/
 	.endm
 
-	.macro	kernel_exit, el
+	.macro	kernel_exit, el, sync
 	.if	\el != 0
 	disable_daif
 	.endif
@@ -413,6 +451,20 @@
 
 	msr	elr_el1, x21			// set up the return data
 	msr	spsr_el1, x22
+#ifdef CONFIG_DYNAMIC_STACK
+	/*
+	 * Admittedly convoluted way of getting the old SP out: a new stack
+	 * page has been faulted in by now, so we can touch the task stack
+	 * without faulting. Stash the saved x0 just below the old SP on the
+	 * task stack so it can be reloaded after SP is switched back below.
+	 */
+	.if	\el == 1 && \sync == 1
+	ldr	x2, [sp, #PT_REGS_SIZE] // Get old SP
+	sub	x2, x2, #8
+	str	x2, [sp, #PT_REGS_SIZE] // Save SP
+	ldp	x0, x1, [sp, #16 * 0] // Get stored x0, x1 from the sync stack
+	str	x0, [x2] // Put a copy of x0 on the top of the ordinary stack
+	.endif
+#endif
 	ldp	x0, x1, [sp, #16 * 0]
 	ldp	x2, x3, [sp, #16 * 1]
 	ldp	x4, x5, [sp, #16 * 2]
@@ -447,7 +499,19 @@
 	.endif
 
 	ldr	lr, [sp, #S_LR]
+#ifdef CONFIG_DYNAMIC_STACK
+	.if	\el == 1 && \sync == 1
+	// Restore the original task SP (we have been running on the sync stack)
+	ldr	x0, [sp, #PT_REGS_SIZE]
+	mov	sp, x0
+	ldr	x0, [sp] // We have put a copy of x0 here (see above)
+	add	sp, sp, #8
+	.else
 	add	sp, sp, #PT_REGS_SIZE		// restore sp
+	.endif
+#else
+	add	sp, sp, #PT_REGS_SIZE		// restore sp
+#endif
 
 	.if \el == 0
 	/* This must be after the last explicit memory access */
@@ -519,12 +583,12 @@
 
 	.align	11
 SYM_CODE_START(vectors)
-	kernel_ventry	1, t, 64, sync		// Synchronous EL1t
+	kernel_ventry_el1_sync	1, t, 64, sync		// Synchronous EL1t
 	kernel_ventry	1, t, 64, irq		// IRQ EL1t
 	kernel_ventry	1, t, 64, fiq		// FIQ EL1t
 	kernel_ventry	1, t, 64, error		// Error EL1t
 
-	kernel_ventry	1, h, 64, sync		// Synchronous EL1h
+	kernel_ventry_el1_sync	1, h, 64, sync		// Synchronous EL1h
 	kernel_ventry	1, h, 64, irq		// IRQ EL1h
 	kernel_ventry	1, h, 64, fiq		// FIQ EL1h
 	kernel_ventry	1, h, 64, error		// Error EL1h
@@ -578,9 +642,42 @@
 	bl	el\el\ht\()_\regsize\()_\label\()_handler
 	.if \el == 0
 	b	ret_to_user
+	.endif
+#ifdef CONFIG_DYNAMIC_STACK
+	/*
+	 * This .ifeqs with an .if inside it looks a bit insane, and that
+	 * is because .ifeqs doesn't support nesting whatsoever: you
+	 * can put .if:s inside an .ifeqs but you simply can't put an
+	 * .ifeqs inside an .if and expect it to work. The result would
+	 * be that the code inside an .ifeqs nested inside an .if gets
+	 * assembled whenever the .ifeqs evaluates to true, no matter
+	 * what the outer .if evaluates to.
+	 *
+	 * This is probably a bug in the GNU Assembler that I work
+	 * around like this.
+	 */
+	.ifeqs	"\label", "sync"
+	.if \el != 0
+	/*
+	 * The handler returns 1 (in x0) if we are still on the
+	 * sync stack.
+	 */
+	tst	x0, #1
+	b.ne	0f
+	b	ret_to_kernel
+0:
+	b	ret_to_kernel_el1_sync
+	.endif
 	.else
+	.if \el != 0
 	b	ret_to_kernel
 	.endif
+	.endif
+#else
+	.if \el != 0
+	b	ret_to_kernel
+	.endif
+#endif
 SYM_CODE_END(el\el\ht\()_\regsize\()_\label)
 	.endm
 
@@ -608,16 +705,49 @@
 	entry_handler	0, t, 32, error
 
 SYM_CODE_START_LOCAL(ret_to_kernel)
-	kernel_exit 1
+	kernel_exit el=1, sync=0
 SYM_CODE_END(ret_to_kernel)
 
+#ifdef CONFIG_DYNAMIC_STACK
+	/*
+	 * Special version that returns us to the kernel from a page
+	 * fault that happened on the task stack itself.
+	 */
+SYM_CODE_START_LOCAL(ret_to_kernel_el1_sync)
+	kernel_exit el=1, sync=1
+SYM_CODE_END(ret_to_kernel_el1_sync)
+
+SYM_FUNC_START(switch_sync_stack_to_task_stack)
+	// Obtain the task stack SP, which was helpfully stashed
+	// at the base (highest address) of the special sync stack.
+	ldr_this_cpu x0, sync_stack_ptr, x1
+	add	x0, x0, #SYNC_STACK_SIZE
+	sub	x0, x0, #8
+	ldr	x1, [x0] // x1 = task stack SP
+	mov	x3, sp // copy of current SP in x3
+	// Copy the contents of the sync stack to the task stack
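+	// (walking both downwards, 8 bytes at a time, until we hit the live SP)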
+0:
+	sub	x0, x0, #8
+	sub	x1, x1, #8
+	ldr	x2, [x0]
+	str	x2, [x1]
+	cmp	x0, x3
+	b.ne	0b
+	// Switch to the task stack, ditch the old sync stack and
+	// pretend like nothing happened
+	mov	sp, x1
+	ret
+SYM_FUNC_END(switch_sync_stack_to_task_stack)
+NOKPROBE(switch_sync_stack_to_task_stack)
+#endif
+
 SYM_CODE_START_LOCAL(ret_to_user)
 	ldr	x19, [tsk, #TSK_TI_FLAGS]	// re-check for single-step
 	enable_step_tsk x19, x2
 #ifdef CONFIG_GCC_PLUGIN_STACKLEAK
 	bl	stackleak_erase_on_task_stack
 #endif
-	kernel_exit 0
+	kernel_exit el=0, sync=0
 SYM_CODE_END(ret_to_user)
 
 	.popsection				// .entry.text
@@ -785,12 +915,12 @@
  */
 	.macro generate_el1_vector, bhb
 .Lvector_start\@:
-	kernel_ventry	1, t, 64, sync		// Synchronous EL1t
+	kernel_ventry_el1_sync	1, t, 64, sync		// Synchronous EL1t
 	kernel_ventry	1, t, 64, irq		// IRQ EL1t
 	kernel_ventry	1, t, 64, fiq		// FIQ EL1h
 	kernel_ventry	1, t, 64, error		// Error EL1t
 
-	kernel_ventry	1, h, 64, sync		// Synchronous EL1h
+	kernel_ventry_el1_sync	1, h, 64, sync		// Synchronous EL1h
 	kernel_ventry	1, h, 64, irq		// IRQ EL1h
 	kernel_ventry	1, h, 64, fiq		// FIQ EL1h
 	kernel_ventry	1, h, 64, error		// Error EL1h
diff --git a/arch/arm64/kernel/irq.c b/arch/arm64/kernel/irq.c
index 85087e2..f47717c 100644
--- a/arch/arm64/kernel/irq.c
+++ b/arch/arm64/kernel/irq.c
@@ -32,6 +32,9 @@ DEFINE_PER_CPU(struct nmi_ctx, nmi_contexts);
 
 DEFINE_PER_CPU(unsigned long *, irq_stack_ptr);
 
+#ifdef CONFIG_DYNAMIC_STACK
+DEFINE_PER_CPU(unsigned long *, sync_stack_ptr);
+#endif
 
 DECLARE_PER_CPU(unsigned long *, irq_shadow_call_stack_ptr);
 
@@ -60,6 +63,10 @@ static void __init init_irq_stacks(void)
 	for_each_possible_cpu(cpu) {
 		p = arch_alloc_vmap_stack(IRQ_STACK_SIZE, early_cpu_to_node(cpu));
 		per_cpu(irq_stack_ptr, cpu) = p;
+#ifdef CONFIG_DYNAMIC_STACK
+		p = arch_alloc_vmap_stack(SYNC_STACK_SIZE, early_cpu_to_node(cpu));
+		per_cpu(sync_stack_ptr, cpu) = p;
+#endif
 	}
 }
 #else
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index ec0a337..dd60376 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -911,6 +911,19 @@ static const struct fault_info fault_info[] = {
 	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 63"			},
 };
 
+void do_stack_abort(unsigned long far, struct pt_regs *regs)
+{
+	unsigned long addr = untagged_addr(far);
+
+	/* Deal with dynamic paging in of new physical stack pages */
+	if (!dynamic_stack_fault(current, addr)) {
+		/* Not good */
+		unsigned long esr = read_sysreg(esr_el1);
+		die_kernel_fault("dynamic_stack_fault", addr, esr, regs);
+	}
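+	/* Drop any stale TLB entry so the newly mapped stack page is used */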
+	flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
+}
+
 void do_mem_abort(unsigned long far, unsigned long esr, struct pt_regs *regs)
 {
 	const struct fault_info *inf = esr_to_fault_info(esr);