| From b645af2d5905c4e32399005b867987919cbfc3ae Mon Sep 17 00:00:00 2001 |
| From: Andy Lutomirski <luto@amacapital.net> |
| Date: Sat, 22 Nov 2014 18:00:33 -0800 |
| Subject: x86_64, traps: Rework bad_iret |
| |
| From: Andy Lutomirski <luto@amacapital.net> |
| |
| commit b645af2d5905c4e32399005b867987919cbfc3ae upstream. |
| |
| It's possible for iretq to userspace to fail. This can happen because |
| of a bad CS, SS, or RIP. |
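|
| For illustration, a minimal userspace sketch that can provoke such a
| failure (not part of this patch; it assumes glibc's REG_CSGSFS layout
| on x86_64, and selector 0x7 is an arbitrary invalid value):
|
|   #define _GNU_SOURCE
|   #include <signal.h>
|   #include <stdio.h>
|   #include <ucontext.h>
|
|   static void handler(int sig, siginfo_t *info, void *ctx_void)
|   {
|       ucontext_t *ctx = ctx_void;
|
|       /* CS lives in the low 16 bits of the packed CS/GS/FS word. */
|       ctx->uc_mcontext.gregs[REG_CSGSFS] &= ~0xffffL;
|       ctx->uc_mcontext.gregs[REG_CSGSFS] |= 0x7; /* bogus selector */
|   }
|
|   int main(void)
|   {
|       struct sigaction sa = { .sa_sigaction = handler,
|                               .sa_flags = SA_SIGINFO };
|
|       sigaction(SIGUSR1, &sa, NULL);
|       raise(SIGUSR1); /* sigreturn's iretq now faults */
|       printf("not reached: the kernel delivers SIGSEGV instead\n");
|       return 0;
|   }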
| |
| Historically, we've handled it by fixing up an exception from iretq to |
| land at bad_iret, which pretends that the failed iret frame was really |
| the hardware part of #GP(0) from userspace. To make this work, there's |
| an extra fixup to fudge the gs base into a usable state. |
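|
| (For reference, a simplified model of that fixup machinery, assuming
| the 32-bit-relative exception_table_entry layout of this era; the
| real code lives in arch/x86/mm/extable.c:)
|
|   struct exception_table_entry {
|       int insn, fixup;    /* offsets relative to each field */
|   };
|
|   /* Fault-path helper: returns nonzero if a fixup entry matched. */
|   int fixup_exception(struct pt_regs *regs)
|   {
|       const struct exception_table_entry *e;
|
|       e = search_exception_tables(regs->ip);
|       if (!e)
|           return 0;
|
|       /* Resume at the fixup target -- bad_iret for a failed iretq. */
|       regs->ip = (unsigned long)&e->fixup + e->fixup;
|       return 1;
|   }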
| |
| This is suboptimal because it loses the original exception. It's also |
| buggy because there's no guarantee that we were on the kernel stack to |
| begin with. For example, if the failing iret happened on return from an |
| NMI, then we'll end up executing general_protection on the NMI stack. |
| This is bad for several reasons, the most immediate of which is that |
| general_protection, as a non-paranoid idtentry, will try to deliver |
| signals and/or schedule from the wrong stack. |
| |
| This patch throws out bad_iret entirely. As a replacement, it augments |
| the existing swapgs fudge into a full-blown iret fixup, mostly written |
| in C. It should be clearer and more correct.
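|
| For reference, the hardware frame that a failed iretq leaves behind
| is five 8-byte words; this is the "5*8" that fixup_bad_iret copies to
| task_pt_regs below (struct shown for illustration only):
|
|   struct iret_frame {
|       unsigned long ip;       /* RIP */
|       unsigned long cs;       /* CS, zero-extended to 8 bytes */
|       unsigned long flags;    /* RFLAGS */
|       unsigned long sp;       /* RSP */
|       unsigned long ss;       /* SS, zero-extended to 8 bytes */
|   };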
| |
| Signed-off-by: Andy Lutomirski <luto@amacapital.net> |
| Reviewed-by: Thomas Gleixner <tglx@linutronix.de> |
| Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> |
| Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> |
| |
| --- |
| arch/x86/kernel/entry_64.S | 45 +++++++++++++++++++-------------------------- |
| arch/x86/kernel/traps.c | 29 +++++++++++++++++++++++++++++ |
| 2 files changed, 48 insertions(+), 26 deletions(-) |
| |
| --- a/arch/x86/kernel/entry_64.S |
| +++ b/arch/x86/kernel/entry_64.S |
| @@ -1055,8 +1055,13 @@ ENTRY(native_iret) |
| |
| .global native_irq_return_iret |
| native_irq_return_iret: |
| + /* |
| + * This may fault. Non-paranoid faults on return to userspace are |
| + * handled by fixup_bad_iret. These include #SS, #GP, and #NP. |
| + * Double-faults due to espfix64 are handled in do_double_fault. |
| + * Other faults here are fatal. |
| + */ |
| iretq |
| - _ASM_EXTABLE(native_irq_return_iret, bad_iret) |
| |
| #ifdef CONFIG_X86_ESPFIX64 |
| native_irq_return_ldt: |
| @@ -1084,25 +1089,6 @@ native_irq_return_ldt: |
| jmp native_irq_return_iret |
| #endif |
| |
| - .section .fixup,"ax" |
| -bad_iret: |
| - /* |
| - * The iret traps when the %cs or %ss being restored is bogus. |
| - * We've lost the original trap vector and error code. |
| - * #GPF is the most likely one to get for an invalid selector. |
| - * So pretend we completed the iret and took the #GPF in user mode. |
| - * |
| - * We are now running with the kernel GS after exception recovery. |
| - * But error_entry expects us to have user GS to match the user %cs, |
| - * so swap back. |
| - */ |
| - pushq $0 |
| - |
| - SWAPGS |
| - jmp general_protection |
| - |
| - .previous |
| - |
| /* edi: workmask, edx: work */ |
| retint_careful: |
| CFI_RESTORE_STATE |
| @@ -1629,16 +1615,15 @@ error_sti: |
| |
| /* |
| * There are two places in the kernel that can potentially fault with |
| - * usergs. Handle them here. The exception handlers after iret run with |
| - * kernel gs again, so don't set the user space flag. B stepping K8s |
| - * sometimes report an truncated RIP for IRET exceptions returning to |
| - * compat mode. Check for these here too. |
| + * usergs. Handle them here. B stepping K8s sometimes report a |
| + * truncated RIP for IRET exceptions returning to compat mode. Check |
| + * for these here too. |
| */ |
| error_kernelspace: |
| incl %ebx |
| leaq native_irq_return_iret(%rip),%rcx |
| cmpq %rcx,RIP+8(%rsp) |
| - je error_swapgs |
| + je error_bad_iret |
| movl %ecx,%eax /* zero extend */ |
| cmpq %rax,RIP+8(%rsp) |
| je bstep_iret |
| @@ -1649,7 +1634,15 @@ error_kernelspace: |
| bstep_iret: |
| /* Fix truncated RIP */ |
| movq %rcx,RIP+8(%rsp) |
| - jmp error_swapgs |
| + /* fall through */ |
| + |
| +error_bad_iret: |
| + SWAPGS |
| + mov %rsp,%rdi |
| + call fixup_bad_iret |
| + mov %rax,%rsp |
| + decl %ebx /* Return to usergs */ |
| + jmp error_sti |
| CFI_ENDPROC |
| END(error_entry) |
| |
| --- a/arch/x86/kernel/traps.c |
| +++ b/arch/x86/kernel/traps.c |
| @@ -384,6 +384,35 @@ asmlinkage __kprobes struct pt_regs *syn |
| *regs = *eregs; |
| return regs; |
| } |
| + |
| +struct bad_iret_stack { |
| + void *error_entry_ret; |
| + struct pt_regs regs; |
| +}; |
| + |
| +asmlinkage __visible |
| +struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s) |
| +{ |
| + /* |
| + * This is called from entry_64.S early in handling a fault |
| + * caused by a bad iret to user mode. To handle the fault |
| + * correctly, we want to move our stack frame to task_pt_regs
| + * and we want to pretend that the exception came from the |
| + * iret target. |
| + */ |
| + struct bad_iret_stack *new_stack = |
| + container_of(task_pt_regs(current), |
| + struct bad_iret_stack, regs); |
| + |
| + /* Copy the IRET target to the new stack. */ |
| + memmove(&new_stack->regs.ip, (void *)s->regs.sp, 5*8); |
| + |
| + /* Copy the remainder of the stack from the current stack. */ |
| + memmove(new_stack, s, offsetof(struct bad_iret_stack, regs.ip)); |
| + |
| + BUG_ON(!user_mode_vm(&new_stack->regs)); |
| + return new_stack; |
| +} |
| #endif |
| |
| /* |