| From e37e43a497d5a8b7c0cc1736d56986f432c394c9 Mon Sep 17 00:00:00 2001 |
| From: Andy Lutomirski <luto@kernel.org> |
| Date: Thu, 11 Aug 2016 02:35:23 -0700 |
| Subject: [PATCH] x86/mm/64: Enable vmapped stacks |
| (CONFIG_HAVE_ARCH_VMAP_STACK=y) |
| |
| commit e37e43a497d5a8b7c0cc1736d56986f432c394c9 upstream. |
| |
| This allows x86_64 kernels to enable vmapped stacks by setting |
| HAVE_ARCH_VMAP_STACK=y - which enables the CONFIG_VMAP_STACK=y |
| high-level Kconfig option.
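| 
| For reference, the high-level option lives in arch/Kconfig and is
| gated on the per-architecture symbol; roughly (a sketch based on the
| companion "fork: Add generic vmalloced stack support" patch, which is
| not part of this diff and may carry extra dependencies):
| 
|     config VMAP_STACK
|             default y
|             bool "Use a virtually-mapped stack"
|             depends on HAVE_ARCH_VMAP_STACK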
| |
| There are a couple of interesting bits: |
| |
| First, x86 lazily faults in top-level paging entries for the vmalloc |
| area. This won't work if we get a page fault while trying to access |
| the stack: the CPU will promote it to a double-fault and we'll die. |
| To avoid this problem, probe the new stack when switching stacks and |
| forcibly populate the pgd entry for the stack when switching mms. |
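| 
| Condensed, those two fixes amount to the following (a sketch stitched
| together from the prepare_switch_to() and switch_mm_irqs_off() hunks
| below, not additional code):
| 
|     /* On task switch: touch the new stack so that a missing vmalloc
|      * pgd entry is faulted in now, while taking #PF is still safe.
|      */
|     READ_ONCE(*(unsigned char *)next->thread.sp);
| 
|     /* On mm switch: if the incoming mm lacks the top-level entry
|      * covering the current stack, copy it from init_mm.
|      */
|     unsigned int stack_pgd_index = pgd_index(current_stack_pointer());
|     pgd_t *pgd = next->pgd + stack_pgd_index;
| 
|     if (unlikely(pgd_none(*pgd)))
|             set_pgd(pgd, init_mm.pgd[stack_pgd_index]);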
| |
| Second, once we have guard pages around the stack, we'll want to |
| detect and handle stack overflow. |
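| 
| Detection is possible because the SDM guarantees (see the quote in
| the do_double_fault() comment below) that CR2 is updated even when a
| #PF escalates to #DF, so the double-fault handler can check whether
| the faulting address lies in the guard page just below the task's
| stack; condensed from the hunk below:
| 
|     cr2 = read_cr2();
|     if ((unsigned long)task_stack_page(tsk) - 1 - cr2 < PAGE_SIZE)
|             handle_stack_overflow("kernel stack overflow (double-fault)",
|                                   regs, cr2);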
| |
| I didn't enable it on x86_32. We'd need to rework the double-fault |
| code a bit and I'm concerned about running out of vmalloc virtual |
| addresses under some workloads. |
| |
| This patch, by itself, will behave somewhat erratically when the |
| stack overflows while RSP is still more than a few tens of bytes |
| above the bottom of the stack. Specifically, we'll get #PF and make |
| it to no_context and then oops without reliably triggering a
| double-fault, and no_context doesn't know about stack overflows. |
| The next patch will improve that case. |
| |
| Thank you to Nadav and Brian for helping me pay enough attention to |
| the SDM to hopefully get this right. |
| |
| Signed-off-by: Andy Lutomirski <luto@kernel.org> |
| Cc: Borislav Petkov <bp@alien8.de> |
| Cc: Brian Gerst <brgerst@gmail.com> |
| Cc: Denys Vlasenko <dvlasenk@redhat.com> |
| Cc: H. Peter Anvin <hpa@zytor.com> |
| Cc: Josh Poimboeuf <jpoimboe@redhat.com> |
| Cc: Linus Torvalds <torvalds@linux-foundation.org> |
| Cc: Nadav Amit <nadav.amit@gmail.com> |
| Cc: Peter Zijlstra <peterz@infradead.org> |
| Cc: Thomas Gleixner <tglx@linutronix.de> |
| Link: http://lkml.kernel.org/r/c88f3e2920b18e6cc621d772a04a62c06869037e.1470907718.git.luto@kernel.org |
| [ Minor edits. ] |
| Signed-off-by: Ingo Molnar <mingo@kernel.org> |
| |
| diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig |
| index c580d8c33562..21a6d0ec5983 100644 |
| --- a/arch/x86/Kconfig |
| +++ b/arch/x86/Kconfig |
| @@ -94,6 +94,7 @@ config X86 |
| select HAVE_ARCH_TRANSPARENT_HUGEPAGE |
| select HAVE_ARCH_WITHIN_STACK_FRAMES |
| select HAVE_EBPF_JIT if X86_64 |
| + select HAVE_ARCH_VMAP_STACK if X86_64 |
| select HAVE_CC_STACKPROTECTOR |
| select HAVE_CMPXCHG_DOUBLE |
| select HAVE_CMPXCHG_LOCAL |
| diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h |
| index 8f321a1b03a1..14e4b20f0aaf 100644 |
| --- a/arch/x86/include/asm/switch_to.h |
| +++ b/arch/x86/include/asm/switch_to.h |
| @@ -8,6 +8,28 @@ struct tss_struct; |
| void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, |
| struct tss_struct *tss); |
| |
| +/* This runs on the previous thread's stack. */
| +static inline void prepare_switch_to(struct task_struct *prev, |
| + struct task_struct *next) |
| +{ |
| +#ifdef CONFIG_VMAP_STACK |
| + /* |
| + * If we switch to a stack that has a top-level paging entry |
| + * that is not present in the current mm, the resulting #PF will
| + * be promoted to a double-fault and we'll panic. Probe
| + * the new stack now so that vmalloc_fault can fix up the page |
| + * tables if needed. This can only happen if we use a stack |
| + * in vmap space. |
| + * |
| + * We assume that the stack is aligned so that it never spans |
| + * more than one top-level paging entry. |
| + * |
| + * To minimize cache pollution, just follow the stack pointer. |
| + */ |
| + READ_ONCE(*(unsigned char *)next->thread.sp); |
| +#endif |
| +} |
| + |
| #ifdef CONFIG_X86_32 |
| |
| #ifdef CONFIG_CC_STACKPROTECTOR |
| @@ -39,6 +61,8 @@ do { \ |
| */ \ |
| unsigned long ebx, ecx, edx, esi, edi; \ |
| \ |
| + prepare_switch_to(prev, next); \ |
| + \ |
| asm volatile("pushl %%ebp\n\t" /* save EBP */ \ |
| "movl %%esp,%[prev_sp]\n\t" /* save ESP */ \ |
| "movl %[next_sp],%%esp\n\t" /* restore ESP */ \ |
| @@ -103,7 +127,9 @@ do { \ |
| * clean in kernel mode, with the possible exception of IOPL. Kernel IOPL |
| * has no effect. |
| */ |
| -#define switch_to(prev, next, last) \ |
| +#define switch_to(prev, next, last) \ |
| + prepare_switch_to(prev, next); \ |
| + \ |
| asm volatile(SAVE_CONTEXT \ |
| "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \ |
| "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \ |
| diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c |
| index b70ca12dd389..907b4e4aeb5e 100644 |
| --- a/arch/x86/kernel/traps.c |
| +++ b/arch/x86/kernel/traps.c |
| @@ -292,12 +292,30 @@ DO_ERROR(X86_TRAP_NP, SIGBUS, "segment not present", segment_not_present) |
| DO_ERROR(X86_TRAP_SS, SIGBUS, "stack segment", stack_segment) |
| DO_ERROR(X86_TRAP_AC, SIGBUS, "alignment check", alignment_check) |
| |
| +#ifdef CONFIG_VMAP_STACK |
| +static void __noreturn handle_stack_overflow(const char *message, |
| + struct pt_regs *regs, |
| + unsigned long fault_address) |
| +{ |
| + printk(KERN_EMERG "BUG: stack guard page was hit at %p (stack is %p..%p)\n", |
| + (void *)fault_address, current->stack, |
| + (char *)current->stack + THREAD_SIZE - 1); |
| + die(message, regs, 0); |
| + |
| + /* Be absolutely certain we don't return. */ |
| + panic(message); |
| +} |
| +#endif |
| + |
| #ifdef CONFIG_X86_64 |
| /* Runs on IST stack */ |
| dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) |
| { |
| static const char str[] = "double fault"; |
| struct task_struct *tsk = current; |
| +#ifdef CONFIG_VMAP_STACK |
| + unsigned long cr2; |
| +#endif |
| |
| #ifdef CONFIG_X86_ESPFIX64 |
| extern unsigned char native_irq_return_iret[]; |
| @@ -332,6 +350,49 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) |
| tsk->thread.error_code = error_code; |
| tsk->thread.trap_nr = X86_TRAP_DF; |
| |
| +#ifdef CONFIG_VMAP_STACK |
| + /* |
| + * If we overflow the stack into a guard page, the CPU will fail |
| + * to deliver #PF and will send #DF instead. Similarly, if we |
| + * take any non-IST exception while too close to the bottom of |
| + * the stack, the processor will get a page fault while |
| + * delivering the exception and will generate a double fault. |
| + * |
| + * According to the SDM (footnote in 6.15 under "Interrupt 14 - |
| + * Page-Fault Exception (#PF)"):
| + * |
| + * Processors update CR2 whenever a page fault is detected. If a |
| + * second page fault occurs while an earlier page fault is being |
| + * delivered, the faulting linear address of the second fault will
| + * overwrite the contents of CR2 (replacing the previous |
| + * address). These updates to CR2 occur even if the page fault |
| + * results in a double fault or occurs during the delivery of a |
| + * double fault. |
| + * |
| + * The logic below has a small possibility of incorrectly diagnosing |
| + * some errors as stack overflows. For example, if the IDT or GDT |
| + * gets corrupted such that #GP delivery fails due to a bad descriptor
| + * (causing a second #GP) and we hit this condition while CR2
| + * coincidentally points to the stack guard page, we'll think we
| + * overflowed the stack. Given that we're going to panic one way
| + * or another if this happens, this isn't necessarily worth fixing.
| + * |
| + * If necessary, we could improve the test by only diagnosing |
| + * a stack overflow if the saved RSP points within 47 bytes of |
| + * the bottom of the stack: if RSP == tsk_stack + 48 and we |
| + * take an exception, the stack is already aligned and there |
| + * will be enough room for SS, RSP, RFLAGS, CS, RIP, and a
| + * possible error code, so a stack overflow would *not* double |
| + * fault. With any less space left, exception delivery could |
| + * fail, and, as a practical matter, we've overflowed the |
| + * stack even if the actual trigger for the double fault was |
| + * something else. |
| + */ |
| + cr2 = read_cr2(); |
| + if ((unsigned long)task_stack_page(tsk) - 1 - cr2 < PAGE_SIZE) |
| + handle_stack_overflow("kernel stack overflow (double-fault)", regs, cr2); |
| +#endif |
| + |
| #ifdef CONFIG_DOUBLEFAULT |
| df_debug(regs, error_code); |
| #endif |
| diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c |
| index 4dbe65622810..a7655f6caf7d 100644 |
| --- a/arch/x86/mm/tlb.c |
| +++ b/arch/x86/mm/tlb.c |
| @@ -77,10 +77,25 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, |
| unsigned cpu = smp_processor_id(); |
| |
| if (likely(prev != next)) { |
| + if (IS_ENABLED(CONFIG_VMAP_STACK)) { |
| + /* |
| + * If our current stack is in vmalloc space and isn't |
| + * mapped in the new pgd, we'll double-fault. Forcibly |
| + * map it. |
| + */ |
| + unsigned int stack_pgd_index = pgd_index(current_stack_pointer()); |
| + |
| + pgd_t *pgd = next->pgd + stack_pgd_index; |
| + |
| + if (unlikely(pgd_none(*pgd))) |
| + set_pgd(pgd, init_mm.pgd[stack_pgd_index]); |
| + } |
| + |
| #ifdef CONFIG_SMP |
| this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK); |
| this_cpu_write(cpu_tlbstate.active_mm, next); |
| #endif |
| + |
| cpumask_set_cpu(cpu, mm_cpumask(next)); |
| |
| /* |
| -- |
| 2.15.0 |
| |