arm64: Support dynamic kernel stacks
Turning on dynamic kernel stacks can save a lot of kernel
RAM on ARM64.
Example from a minimal OpenWrt system:
$ cat /proc/vmstat | grep stack
nr_kernel_stack 320
nr_dynamic_stacks_faults 8
Each stack initially uses just 4KB; each fault then extends
one of the kernel stacks by one more 4KB page.
We see that in this case we consume
(320 + 8) * 4KB = 328 * 4KB = 1.28 MB
of RAM for stacks.
Compare this to using 16KB pre-allocated stacks: all 320
processes would use 16KB of physical memory each, i.e.
320 * 16KB = 5 MB
So in this minimal system we save almost 4 MB of runtime
memory.
The approach taken here is to special-case the sync
exceptions in the vector table. If we are handling a
sync exception (and using dynamic stacks), we temporarily
stash (x16, x17) in (TPIDR_EL0, TPIDRRO_EL0) so that we
can execute some code without using any stack at all.
We define a special sync stack that is only used
when handling sync calls, and we switch to this stack
immediately in the exception handler, without saving
a single value onto the task stack.
We then check whether this sync exception was a data abort
on the ordinary task stack. If it was not, we copy our
current sync stack over to the task stack and continue
as if nothing special happened.
If this was indeed a data abort on the task stack,
we call do_stack_abort(), which in turn calls
dynamic_stack_fault() to latch a new physical
page into the stack, all while running on the temporary
sync stack. We then return from the exception, restoring
SP to what it was before the sync exception.
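Condensed, the resulting handler flow looks like this (a
sketch only; the names are as in the diff below, with the
ordinary ESR dispatch and error paths elided):

  int el1h_64_sync_handler(struct pt_regs *regs)
  {
          /* x16/x17 were stashed in tpidr_el0/tpidrro_el0 on entry */
          tls_thread_restore_current();

          fault = el1_page_fault_on_stack(esr, far);
          if (fault == EL1_STACK_OVERFLOW)
                  handle_bad_stack(regs);     /* does not return */
          if (fault == EL1_FAULT_ON_STACK) {
                  do_stack_abort(far, regs);  /* latch in a new page */
                  return 1;                   /* still on the sync stack */
          }
          switch_sync_stack_to_task_stack();
          /* ... ordinary sync exception dispatch ... */
          return 0;
  }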
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 55fc331..9bea55b 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -175,6 +175,7 @@
select HAVE_ARCH_AUDITSYSCALL
select HAVE_ARCH_BITREVERSE
select HAVE_ARCH_COMPILER_H
+ select HAVE_ARCH_DYNAMIC_STACK
select HAVE_ARCH_HUGE_VMALLOC
select HAVE_ARCH_HUGE_VMAP
select HAVE_ARCH_JUMP_LABEL
diff --git a/arch/arm64/include/asm/exception.h b/arch/arm64/include/asm/exception.h
index d48fc16..c5410f0 100644
--- a/arch/arm64/include/asm/exception.h
+++ b/arch/arm64/include/asm/exception.h
@@ -33,7 +33,7 @@ asmlinkage void el1t_64_irq_handler(struct pt_regs *regs);
asmlinkage void el1t_64_fiq_handler(struct pt_regs *regs);
asmlinkage void el1t_64_error_handler(struct pt_regs *regs);
-asmlinkage void el1h_64_sync_handler(struct pt_regs *regs);
+asmlinkage int el1h_64_sync_handler(struct pt_regs *regs);
asmlinkage void el1h_64_irq_handler(struct pt_regs *regs);
asmlinkage void el1h_64_fiq_handler(struct pt_regs *regs);
asmlinkage void el1h_64_error_handler(struct pt_regs *regs);
@@ -48,10 +48,12 @@ asmlinkage void el0t_32_irq_handler(struct pt_regs *regs);
asmlinkage void el0t_32_fiq_handler(struct pt_regs *regs);
asmlinkage void el0t_32_error_handler(struct pt_regs *regs);
+asmlinkage void switch_sync_stack_to_task_stack(void);
asmlinkage void call_on_irq_stack(struct pt_regs *regs,
void (*func)(struct pt_regs *));
asmlinkage void asm_exit_to_user_mode(struct pt_regs *regs);
+void do_stack_abort(unsigned long far, struct pt_regs *regs);
void do_mem_abort(unsigned long far, unsigned long esr, struct pt_regs *regs);
void do_el0_undef(struct pt_regs *regs, unsigned long esr);
void do_el1_undef(struct pt_regs *regs, unsigned long esr);
diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h
index 717829d..ba7969e 100644
--- a/arch/arm64/include/asm/memory.h
+++ b/arch/arm64/include/asm/memory.h
@@ -143,6 +143,13 @@
#define IRQ_STACK_SIZE THREAD_SIZE
+/*
+ * This size is determined by trial-and-error. The sync aborts do not
+ * nest very deeply: they put a new page into the stack and return; no
+ * calls into deep memory management code etc. are made from this context.
+ */
+#define SYNC_STACK_SIZE SZ_8K
+
#define OVERFLOW_STACK_SIZE SZ_4K
#define NVHE_STACK_SHIFT PAGE_SHIFT
diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c
index 7c1970b..9d52b0f 100644
--- a/arch/arm64/kernel/entry-common.c
+++ b/arch/arm64/kernel/entry-common.c
@@ -523,10 +523,65 @@ static void noinstr el1_fpac(struct pt_regs *regs, unsigned long esr)
exit_to_kernel_mode(regs);
}
-asmlinkage void noinstr el1h_64_sync_handler(struct pt_regs *regs)
+#define EL1_FAULT_ON_STACK 1
+#define EL1_STACK_OVERFLOW 2
+
+static unsigned int noinstr el1_page_fault_on_stack(unsigned long esr,
+ unsigned long far)
+{
+ unsigned long stack = (unsigned long)current->stack;
+ unsigned long addr = untagged_addr(far);
+
+ /*
+ * Is this even a page fault?
+ * NB: only check for data aborts; we have no business
+ * executing code on the stack, so no instruction aborts.
+ */
+ if (ESR_ELx_EC(esr) != ESR_ELx_EC_DABT_CUR)
+ return 0;
+
+ if (addr < stack || addr >= stack + THREAD_SIZE)
+ return 0;
+
+ /* We hit the bottom of the stack: overflow! */
+ if (addr == stack)
+ return EL1_STACK_OVERFLOW;
+
+ /* Actually a page fault on the stack! */
+ return EL1_FAULT_ON_STACK;
+}
+
+/* Returns 1 if we are still on the sync stack, else 0 */
+asmlinkage int noinstr el1h_64_sync_handler(struct pt_regs *regs)
{
unsigned long esr = read_sysreg(esr_el1);
+ if (IS_ENABLED(CONFIG_DYNAMIC_STACK)) {
+ unsigned long far = read_sysreg(far_el1);
+ unsigned int fault;
+
+ /* First restore tpidr_el0 and tpidrro_el0 for dynamic stack */
+ tls_thread_restore_current();
+ /*
+ * Are we faulting on the task stack? If not, just switch
+ * back to the task stack and continue as if nothing happened.
+ * Code running in the abort handlers will allow further
+ * aborts to happen, so we most definitely need to switch
+ * back to the task stack unless we are handling a page fault
+ * on the stack itself.
+ */
+ fault = el1_page_fault_on_stack(esr, far);
+ if (fault == EL1_STACK_OVERFLOW)
+ handle_bad_stack(regs);
+ if (fault == EL1_FAULT_ON_STACK) {
+ do_stack_abort(far, regs);
+ return 1;
+ } else {
+ switch_sync_stack_to_task_stack();
+ }
+ }
+
switch (ESR_ELx_EC(esr)) {
case ESR_ELx_EC_DABT_CUR:
case ESR_ELx_EC_IABT_CUR:
@@ -564,6 +619,8 @@ asmlinkage void noinstr el1h_64_sync_handler(struct pt_regs *regs)
default:
__panic_unhandled(regs, "64-bit el1h sync", esr);
}
+
+ return 0;
}
static __always_inline void __el1_pnmi(struct pt_regs *regs,
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index 5ae2a34..389cb47 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -55,11 +55,15 @@
.endif
sub sp, sp, #PT_REGS_SIZE
-#ifdef CONFIG_VMAP_STACK
+#if defined(CONFIG_VMAP_STACK) && !defined(CONFIG_DYNAMIC_STACK)
/*
* Test whether the SP has overflowed, without corrupting a GPR.
* Task and IRQ stacks are aligned so that SP & (1 << THREAD_SHIFT)
* should always be zero.
+ *
+ * This trick is not needed when using dynamic stacks: the dynamic
+ * stack handling code has its own overflow handling using the sync
+ * stack.
*/
add sp, sp, x0 // sp' = sp + x0
sub x0, sp, x0 // x0' = sp' - x0 = (sp + x0) - x0 = sp
@@ -97,11 +101,45 @@
/* We were already on the overflow stack. Restore sp/x0 and carry on. */
sub sp, sp, x0
mrs x0, tpidrro_el0
-#endif
+#endif /* CONFIG_VMAP_STACK */
b el\el\ht\()_\regsize\()_\label
.org .Lventry_start\@ + 128 // Did we overflow the ventry slot?
.endm
+#ifdef CONFIG_DYNAMIC_STACK
+ /* Only used for EL1 sync exceptions */
+ .macro kernel_ventry_el1_sync, el:req, ht:req, regsize:req, label:req
+ .align 7
+.Lventry_el1_sync_start\@:
+ /*
+ * Call the sync handler on the special sync stack since we could
+ * have just encountered a data abort on the task stack, and in
+ * that case we cannot store anything on the current task stack.
+ */
+ // Save x16, x17 in tpidr_el0, tpidrro_el0
+ msr tpidr_el0, x16
+ msr tpidrro_el0, x17
+ ldr_this_cpu x16, sync_stack_ptr, x17
+ mov x17, sp // Save old SP in x17
+ add sp, x16, #SYNC_STACK_SIZE
+ sub sp, sp, #8
+ str x17, [sp] // Save old SP on the sync stack
+ sub sp, sp, #PT_REGS_SIZE
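+ /*
+ * Sync stack layout at this point (the stack grows downwards):
+ *
+ * sync_stack + SYNC_STACK_SIZE - 8: saved task SP
+ * sp .. sp + PT_REGS_SIZE: room for this exception's pt_regs
+ */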
+ // Restore x16, x17 from tpidr_el0, tpidrro_el0
+ mrs x16, tpidr_el0
+ mrs x17, tpidrro_el0
+ /*
+ * Now tpidr_el0, tpidrro_el0 are clobbered and need to be restored
+ * in the el1 sync handlers in C.
+ */
+ b el\el\ht\()_\regsize\()_\label
+.org .Lventry_el1_sync_start\@ + 128 // Did we overflow the ventry slot?
+ .endm
+#else
+ /* Just an alias to the kernel_ventry macro */
+#define kernel_ventry_el1_sync kernel_ventry
+#endif /* CONFIG_DYNAMIC_STACK */
+
.macro tramp_alias, dst, sym
.set .Lalias\@, TRAMP_VALIAS + \sym - .entry.tramp.text
movz \dst, :abs_g2_s:.Lalias\@
@@ -334,7 +372,7 @@
*/
.endm
- .macro kernel_exit, el
+ .macro kernel_exit, el, sync
.if \el != 0
disable_daif
.endif
@@ -413,6 +451,20 @@
msr elr_el1, x21 // set up the return data
msr spsr_el1, x22
+#ifdef CONFIG_DYNAMIC_STACK
+ /*
+ * Get the old SP out of the sync stack. The new stack page has
+ * been swapped in by now, so we can use the task stack without
+ * any further faults. Stage a copy of x0 just below the old SP:
+ * once SP is switched back we have no scratch register left to
+ * restore x0 from pt_regs.
+ */
+ .if \el == 1 && \sync == 1
+ ldr x2, [sp, #PT_REGS_SIZE] // Get old SP
+ sub x2, x2, #8 // Make room for the staged x0
+ str x2, [sp, #PT_REGS_SIZE] // Save the adjusted SP back
+ ldp x0, x1, [sp, #16 * 0] // Get stored x0, x1 from the sync stack
+ str x0, [x2] // Stage a copy of x0 on top of the task stack
+ .endif
+#endif
ldp x0, x1, [sp, #16 * 0]
ldp x2, x3, [sp, #16 * 1]
ldp x4, x5, [sp, #16 * 2]
@@ -447,7 +499,19 @@
.endif
ldr lr, [sp, #S_LR]
+#ifdef CONFIG_DYNAMIC_STACK
+ .if \el == 1 && \sync == 1
+ // We were on the sync stack: restore the original task SP
+ ldr x0, [sp, #PT_REGS_SIZE]
+ mov sp, x0
+ ldr x0, [sp] // We have put a copy of x0 here (see above)
+ add sp, sp, #8
+ .else
add sp, sp, #PT_REGS_SIZE // restore sp
+ .endif
+#else
+ add sp, sp, #PT_REGS_SIZE // restore sp
+#endif
.if \el == 0
/* This must be after the last explicit memory access */
@@ -519,12 +583,12 @@
.align 11
SYM_CODE_START(vectors)
- kernel_ventry 1, t, 64, sync // Synchronous EL1t
+ kernel_ventry_el1_sync 1, t, 64, sync // Synchronous EL1t
kernel_ventry 1, t, 64, irq // IRQ EL1t
kernel_ventry 1, t, 64, fiq // FIQ EL1t
kernel_ventry 1, t, 64, error // Error EL1t
- kernel_ventry 1, h, 64, sync // Synchronous EL1h
+ kernel_ventry_el1_sync 1, h, 64, sync // Synchronous EL1h
kernel_ventry 1, h, 64, irq // IRQ EL1h
kernel_ventry 1, h, 64, fiq // FIQ EL1h
kernel_ventry 1, h, 64, error // Error EL1h
@@ -578,9 +642,42 @@
bl el\el\ht\()_\regsize\()_\label\()_handler
.if \el == 0
b ret_to_user
+ .endif
+#ifdef CONFIG_DYNAMIC_STACK
+ /*
+ * This .ifeqs with an .if inside it looks a bit insane, and that
+ * is because .ifeqs doesn't support nesting whatsoever: you
+ * can put .if:s inside an .ifeqs, but you simply can't put an
+ * .ifeqs inside an .if and expect it to work. The result will
+ * be that the assembly inside an .ifeqs nested inside an .if
+ * is evaluated whenever the .ifeqs evaluates to true, no matter
+ * what the outer .if evaluates to.
+ *
+ * This is probably a bug in the GNU Assembler that I work
+ * around like this.
+ */
+ .ifeqs "\label", "sync"
+ .if \el != 0
+ /*
+ * The C handler returns 1 (in x0) if we are still on the
+ * sync stack.
+ */
+ tst x0, #1
+ b.ne 0f
+ b ret_to_kernel
+0:
+ b ret_to_kernel_el1_sync
+ .endif
.else
+ .if \el != 0
b ret_to_kernel
.endif
+ .endif
+#else
+ .if \el != 0
+ b ret_to_kernel
+ .endif
+#endif
SYM_CODE_END(el\el\ht\()_\regsize\()_\label)
.endm
@@ -608,16 +705,49 @@
entry_handler 0, t, 32, error
SYM_CODE_START_LOCAL(ret_to_kernel)
- kernel_exit 1
+ kernel_exit el=1, sync=0
SYM_CODE_END(ret_to_kernel)
+#ifdef CONFIG_DYNAMIC_STACK
+ /*
+ * Special version that returns us to the kernel from a page
+ * fault that happened while we were using the task stack.
+ */
+SYM_CODE_START_LOCAL(ret_to_kernel_el1_sync)
+ kernel_exit el=1, sync=1
+SYM_CODE_END(ret_to_kernel_el1_sync)
+
+SYM_FUNC_START(switch_sync_stack_to_task_stack)
+ // Obtain the task stack SP, which is helpfully stored
+ // at the base (highest address) of the special sync stack.
+ ldr_this_cpu x0, sync_stack_ptr, x1
+ add x0, x0, #SYNC_STACK_SIZE
+ sub x0, x0, #8
+ ldr x1, [x0] // x1 = task stack SP
+ mov x3, sp // copy of current SP in x3
+ // Copy the contents of the sync stack to the task stack
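+ // (one doubleword at a time, walking downwards from just below
+ // the saved-SP slot to the current SP, so everything ends up at
+ // the same offsets below the old task SP)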
+0:
+ sub x0, x0, #8
+ sub x1, x1, #8
+ ldr x2, [x0]
+ str x2, [x1]
+ cmp x0, x3
+ b.ne 0b
+ // Switch to the task stack, ditch the old sync stack and
+ // pretend like nothing happened
+ mov sp, x1
+ ret
+SYM_FUNC_END(switch_sync_stack_to_task_stack)
+NOKPROBE(switch_sync_stack_to_task_stack)
+#endif
+
SYM_CODE_START_LOCAL(ret_to_user)
ldr x19, [tsk, #TSK_TI_FLAGS] // re-check for single-step
enable_step_tsk x19, x2
#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
bl stackleak_erase_on_task_stack
#endif
- kernel_exit 0
+ kernel_exit el=0, sync=0
SYM_CODE_END(ret_to_user)
.popsection // .entry.text
@@ -785,12 +915,12 @@
*/
.macro generate_el1_vector, bhb
.Lvector_start\@:
- kernel_ventry 1, t, 64, sync // Synchronous EL1t
+ kernel_ventry_el1_sync 1, t, 64, sync // Synchronous EL1t
kernel_ventry 1, t, 64, irq // IRQ EL1t
kernel_ventry 1, t, 64, fiq // FIQ EL1h
kernel_ventry 1, t, 64, error // Error EL1t
- kernel_ventry 1, h, 64, sync // Synchronous EL1h
+ kernel_ventry_el1_sync 1, h, 64, sync // Synchronous EL1h
kernel_ventry 1, h, 64, irq // IRQ EL1h
kernel_ventry 1, h, 64, fiq // FIQ EL1h
kernel_ventry 1, h, 64, error // Error EL1h
diff --git a/arch/arm64/kernel/irq.c b/arch/arm64/kernel/irq.c
index 85087e2..f47717c 100644
--- a/arch/arm64/kernel/irq.c
+++ b/arch/arm64/kernel/irq.c
@@ -32,6 +32,9 @@ DEFINE_PER_CPU(struct nmi_ctx, nmi_contexts);
DEFINE_PER_CPU(unsigned long *, irq_stack_ptr);
+#ifdef CONFIG_DYNAMIC_STACK
+DEFINE_PER_CPU(unsigned long *, sync_stack_ptr);
+#endif
DECLARE_PER_CPU(unsigned long *, irq_shadow_call_stack_ptr);
@@ -60,6 +63,10 @@ static void __init init_irq_stacks(void)
for_each_possible_cpu(cpu) {
p = arch_alloc_vmap_stack(IRQ_STACK_SIZE, early_cpu_to_node(cpu));
per_cpu(irq_stack_ptr, cpu) = p;
+#ifdef CONFIG_DYNAMIC_STACK
+ p = arch_alloc_vmap_stack(SYNC_STACK_SIZE, early_cpu_to_node(cpu));
+ per_cpu(sync_stack_ptr, cpu) = p;
+#endif
}
}
#else
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index ec0a337..dd60376 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -911,6 +911,19 @@ static const struct fault_info fault_info[] = {
{ do_bad, SIGKILL, SI_KERNEL, "unknown 63" },
};
+void do_stack_abort(unsigned long far, struct pt_regs *regs)
+{
+ unsigned long addr = untagged_addr(far);
+
+ /* Deal with dynamic paging in of new physical stack pages */
+ if (!dynamic_stack_fault(current, addr)) {
+ /* Not good */
+ unsigned long esr = read_sysreg(esr_el1);
+ die_kernel_fault("dynamic_stack_fault", addr, esr, regs);
+ }
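+ /* Drop any stale TLB entry now that the new page is mapped */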
+ flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
+}
+
void do_mem_abort(unsigned long far, unsigned long esr, struct pt_regs *regs)
{
const struct fault_info *inf = esr_to_fault_info(esr);