| From 3386bc8aed825e9f1f65ce38df4b109b2019b71a Mon Sep 17 00:00:00 2001 |
| From: Andy Lutomirski <luto@kernel.org> |
| Date: Mon, 4 Dec 2017 15:07:25 +0100 |
| Subject: x86/entry/64: Create a per-CPU SYSCALL entry trampoline |
| |
| From: Andy Lutomirski <luto@kernel.org> |
| |
| commit 3386bc8aed825e9f1f65ce38df4b109b2019b71a upstream. |
| |
| Handling SYSCALL is tricky: the SYSCALL handler is entered with every |
| single register (except FLAGS), including RSP, live. It somehow needs |
| to set RSP to point to a valid stack, which means it needs to save the |
| user RSP somewhere and find its own stack pointer. The canonical way |
| to do this is with SWAPGS, which lets us access percpu data using the |
| %gs prefix. |
| |
| With PAGE_TABLE_ISOLATION-like pagetable switching, this is |
| problematic. Without a scratch register, switching CR3 is impossible, so |
| %gs-based percpu memory would need to be mapped in the user pagetables. |
| Doing that without information leaks is difficult or impossible. |
| |
| Instead, use a different sneaky trick. Map a copy of the first part |
| of the SYSCALL asm at a different address for each CPU. Now RIP |
| varies depending on the CPU, so we can use RIP-relative memory access |
| to access percpu memory. By putting the relevant information (one |
| scratch slot and the stack address) at a constant offset relative to |
| RIP, we can make SYSCALL work without relying on %gs. |
| |
| A nice thing about this approach is that we can easily switch it on |
| and off if we want pagetable switching to be configurable. |
| |
| The compat variant of SYSCALL doesn't have this problem in the first |
| place -- there are plenty of scratch registers, since we don't care |
| about preserving r8-r15. This patch therefore doesn't touch SYSCALL32 |
| at all. |
| |
| This patch actually seems to be a small speedup. With this patch, |
| SYSCALL touches an extra cache line and an extra virtual page, but |
| the pipeline no longer stalls waiting for SWAPGS. It seems that, at |
| least in a tight loop, the latter outweighs the former. |
| |
| Thanks to David Laight for an optimization tip. |
| |
| Signed-off-by: Andy Lutomirski <luto@kernel.org> |
| Signed-off-by: Thomas Gleixner <tglx@linutronix.de> |
| Reviewed-by: Thomas Gleixner <tglx@linutronix.de> |
| Reviewed-by: Borislav Petkov <bpetkov@suse.de> |
| Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com> |
| Cc: Borislav Petkov <bp@alien8.de> |
| Cc: Brian Gerst <brgerst@gmail.com> |
| Cc: Dave Hansen <dave.hansen@intel.com> |
| Cc: Dave Hansen <dave.hansen@linux.intel.com> |
| Cc: David Laight <David.Laight@aculab.com> |
| Cc: Denys Vlasenko <dvlasenk@redhat.com> |
| Cc: Eduardo Valentin <eduval@amazon.com> |
| Cc: Greg KH <gregkh@linuxfoundation.org> |
| Cc: H. Peter Anvin <hpa@zytor.com> |
| Cc: Josh Poimboeuf <jpoimboe@redhat.com> |
| Cc: Juergen Gross <jgross@suse.com> |
| Cc: Linus Torvalds <torvalds@linux-foundation.org> |
| Cc: Peter Zijlstra <peterz@infradead.org> |
| Cc: Rik van Riel <riel@redhat.com> |
| Cc: Will Deacon <will.deacon@arm.com> |
| Cc: aliguori@amazon.com |
| Cc: daniel.gruss@iaik.tugraz.at |
| Cc: hughd@google.com |
| Cc: keescook@google.com |
| Link: https://lkml.kernel.org/r/20171204150606.403607157@linutronix.de |
| Signed-off-by: Ingo Molnar <mingo@kernel.org> |
| Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> |
| |
| --- |
| arch/x86/entry/entry_64.S | 58 ++++++++++++++++++++++++++++++++++++++++++ |
| arch/x86/include/asm/fixmap.h | 2 + |
| arch/x86/kernel/asm-offsets.c | 1 |
| arch/x86/kernel/cpu/common.c | 15 ++++++++++ |
| arch/x86/kernel/vmlinux.lds.S | 9 ++++++ |
| 5 files changed, 84 insertions(+), 1 deletion(-) |
| |
| --- a/arch/x86/entry/entry_64.S |
| +++ b/arch/x86/entry/entry_64.S |
| @@ -136,6 +136,64 @@ END(native_usergs_sysret64) |
| * with them due to bugs in both AMD and Intel CPUs. |
| */ |
| |
| + .pushsection .entry_trampoline, "ax" |
| + |
| +/* |
| + * The code in here gets remapped into cpu_entry_area's trampoline. This means |
| + * that the assembler and linker have the wrong idea as to where this code |
| + * lives (and, in fact, it's mapped more than once, so it's not even at a |
| + * fixed address). So we can't reference any symbols outside the entry |
| + * trampoline and expect it to work. |
| + * |
| + * Instead, we carefully abuse %rip-relative addressing. |
| + * _entry_trampoline(%rip) refers to the start of the (remapped) entry |
| + * trampoline. We can thus find cpu_entry_area with this macro: |
| + */ |
| + |
| +#define CPU_ENTRY_AREA \ |
| + _entry_trampoline - CPU_ENTRY_AREA_entry_trampoline(%rip) |
| + |
| +/* The top word of the SYSENTER stack is hot and is usable as scratch space. */ |
| +#define RSP_SCRATCH CPU_ENTRY_AREA_tss + CPU_TSS_SYSENTER_stack + \ |
| + SIZEOF_SYSENTER_stack - 8 + CPU_ENTRY_AREA |
| + |
| +ENTRY(entry_SYSCALL_64_trampoline) |
| + UNWIND_HINT_EMPTY |
| + swapgs |
| + |
| + /* Stash the user RSP. */ |
| + movq %rsp, RSP_SCRATCH |
| + |
| + /* Load the top of the task stack into RSP */ |
| + movq CPU_ENTRY_AREA_tss + TSS_sp1 + CPU_ENTRY_AREA, %rsp |
| + |
| + /* Start building the simulated IRET frame. */ |
| + pushq $__USER_DS /* pt_regs->ss */ |
| + pushq RSP_SCRATCH /* pt_regs->sp */ |
| + pushq %r11 /* pt_regs->flags */ |
| + pushq $__USER_CS /* pt_regs->cs */ |
| + pushq %rcx /* pt_regs->ip */ |
| + |
| + /* |
| + * x86 lacks a near absolute jump, and we can't jump to the real |
| + * entry text with a relative jump. We could push the target |
| + * address and then use retq, but this destroys the pipeline on |
| + * many CPUs (wasting over 20 cycles on Sandy Bridge). Instead, |
| + * spill RDI and restore it in a second-stage trampoline. |
| + */ |
| + pushq %rdi |
| + movq $entry_SYSCALL_64_stage2, %rdi |
| + jmp *%rdi |
| +END(entry_SYSCALL_64_trampoline) |
| + |
| + .popsection |
| + |
| +ENTRY(entry_SYSCALL_64_stage2) |
| + UNWIND_HINT_EMPTY |
| + popq %rdi |
| + jmp entry_SYSCALL_64_after_hwframe |
| +END(entry_SYSCALL_64_stage2) |
| + |
| ENTRY(entry_SYSCALL_64) |
| UNWIND_HINT_EMPTY |
| /* |
| --- a/arch/x86/include/asm/fixmap.h |
| +++ b/arch/x86/include/asm/fixmap.h |
| @@ -61,6 +61,8 @@ struct cpu_entry_area { |
| * of the TSS region. |
| */ |
| struct tss_struct tss; |
| + |
| + char entry_trampoline[PAGE_SIZE]; |
| }; |
| |
| #define CPU_ENTRY_AREA_PAGES (sizeof(struct cpu_entry_area) / PAGE_SIZE) |
| --- a/arch/x86/kernel/asm-offsets.c |
| +++ b/arch/x86/kernel/asm-offsets.c |
| @@ -101,4 +101,5 @@ void common(void) { |
| |
| /* Layout info for cpu_entry_area */ |
| OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss); |
| + OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline); |
| } |
| --- a/arch/x86/kernel/cpu/common.c |
| +++ b/arch/x86/kernel/cpu/common.c |
| @@ -486,6 +486,8 @@ DEFINE_PER_CPU(struct cpu_entry_area *, |
| static inline void setup_cpu_entry_area(int cpu) |
| { |
| #ifdef CONFIG_X86_64 |
| + extern char _entry_trampoline[]; |
| + |
| /* On 64-bit systems, we use a read-only fixmap GDT. */ |
| pgprot_t gdt_prot = PAGE_KERNEL_RO; |
| #else |
| @@ -532,6 +534,11 @@ static inline void setup_cpu_entry_area( |
| #ifdef CONFIG_X86_32 |
| this_cpu_write(cpu_entry_area, get_cpu_entry_area(cpu)); |
| #endif |
| + |
| +#ifdef CONFIG_X86_64 |
| + __set_fixmap(get_cpu_entry_area_index(cpu, entry_trampoline), |
| + __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX); |
| +#endif |
| } |
| |
| /* Load the original GDT from the per-cpu structure */ |
| @@ -1395,10 +1402,16 @@ static DEFINE_PER_CPU_PAGE_ALIGNED(char, |
| /* May not be marked __init: used by software suspend */ |
| void syscall_init(void) |
| { |
| + extern char _entry_trampoline[]; |
| + extern char entry_SYSCALL_64_trampoline[]; |
| + |
| int cpu = smp_processor_id(); |
| + unsigned long SYSCALL64_entry_trampoline = |
| + (unsigned long)get_cpu_entry_area(cpu)->entry_trampoline + |
| + (entry_SYSCALL_64_trampoline - _entry_trampoline); |
| |
| wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS); |
| - wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64); |
| + wrmsrl(MSR_LSTAR, SYSCALL64_entry_trampoline); |
| |
| #ifdef CONFIG_IA32_EMULATION |
| wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat); |
| --- a/arch/x86/kernel/vmlinux.lds.S |
| +++ b/arch/x86/kernel/vmlinux.lds.S |
| @@ -107,6 +107,15 @@ SECTIONS |
| SOFTIRQENTRY_TEXT |
| *(.fixup) |
| *(.gnu.warning) |
| + |
| +#ifdef CONFIG_X86_64 |
| + . = ALIGN(PAGE_SIZE); |
| + _entry_trampoline = .; |
| + *(.entry_trampoline) |
| + . = ALIGN(PAGE_SIZE); |
| + ASSERT(. - _entry_trampoline == PAGE_SIZE, "entry trampoline is too big"); |
| +#endif |
| + |
| /* End of text section */ |
| _etext = .; |
| } :text = 0x9090 |