| From: Ben Hutchings <ben@decadent.org.uk> |
| Date: Fri, 09 Mar 2018 00:11:14 +0000 |
| Subject: x86/syscall: Sanitize syscall table de-references under speculation |
| |
| commit 2fbd7af5af8665d18bcefae3e9700be07e22b681 upstream. |
| |
| The upstream version of this, touching C code, was written by Dan Williams, |
| with the following description: |
| |
| > The syscall table base is a user-controlled function pointer in kernel |
| > space. Use array_index_nospec() to prevent any out-of-bounds speculation. |
| > |
| > While retpoline prevents speculating into a userspace-directed target, |
| > it does not stop the pointer de-reference; the concern is leaking |
| > memory relative to the syscall table base, by observing instruction |
| > cache behavior. |
| |
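| For context, the C part of the upstream commit clamps the syscall |
| number before the table load, roughly as follows (paraphrased from |
| the upstream hunk; 3.16 has no equivalent C dispatch path, which is |
| why this backport is entirely in assembly): |
| |
|         if (likely(nr < NR_syscalls)) { |
|                 /* Clamp nr so speculation cannot index past the table. */ |
|                 nr = array_index_nospec(nr, NR_syscalls); |
|                 regs->ax = sys_call_table[nr](regs->di, regs->si, regs->dx, |
|                                               regs->r10, regs->r8, regs->r9); |
|         } |
| |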
| The x86_64 assembly version for 4.4 was written by Jiri Slaby, with |
| the following description: |
| |
| > In 4.4.118, we have commit c8961332d6da (x86/syscall: Sanitize syscall |
| > table de-references under speculation), which is a backport of upstream |
| > commit 2fbd7af5af86. But it fixed only the C part of the upstream patch |
| > (the IA32 syscall entry) and completely omitted the assembly part (the |
| > 64-bit syscall entry). |
| > |
| > Fix that in this patch with an explicit array_index_mask_nospec written |
| > in assembly. The same technique is used in lib/getuser.S. |
| > |
| > However, for "sbb" to work properly, we have to switch the "cmp" from |
| > (NR_syscalls-1) to (NR_syscalls); otherwise the last valid syscall |
| > number would be "and"ed with 0. This is because the original "ja" |
| > relies on "CF" or "ZF", whereas "sbb" relies on "CF" alone. That means |
| > the conditional jump has to switch to "jae" too. |
| > |
| > Final note: use rcx for the mask, as it is overwritten by the 4th |
| > syscall argument (r10) right afterwards anyway. |
| |
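| To model the flag arithmetic in plain C (illustration only, not |
| kernel code): after "cmp $NR_syscalls, %rax", CF is set exactly |
| when rax < NR_syscalls, and "sbb %rcx, %rcx" yields 0 - CF: |
| |
|         /* All-ones mask for an in-bounds index, 0 otherwise. */ |
|         unsigned long mask_model(unsigned long nr, unsigned long nr_syscalls) |
|         { |
|                 unsigned long cf   = nr < nr_syscalls; /* CF after "cmp" */ |
|                 unsigned long mask = 0UL - cf;         /* "sbb %rcx,%rcx" */ |
|                 return nr & mask;                      /* "and %rcx,%rax" */ |
|         } |
| |
| Had the "cmp" stayed at (NR_syscalls-1), CF would already be clear |
| for the last valid syscall number and the mask would wrongly zero |
| it, hence the switch of both the "cmp" operand and the jump. |
| |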
| In 3.16 the x86_32 syscall table lookup is also written in assembly, |
| so I've taken Jiri's version and added similar masking in entry_32.S, |
| using edx as the temporary; edx is clobbered by SAVE_REGS and seems |
| to be free at this point. |
| |
| The ia32 compat syscall table lookup on x86_64 is also written in |
| assembly, so I've added the same masking in ia32entry.S, using r8 as |
| the temporary since it is always clobbered by the following |
| instructions. |
| |
| Cc: Dan Williams <dan.j.williams@intel.com> |
| Cc: Jiri Slaby <jslaby@suse.cz> |
| Cc: Jan Beulich <JBeulich@suse.com> |
| Cc: Linus Torvalds <torvalds@linux-foundation.org> |
| Cc: Thomas Gleixner <tglx@linutronix.de> |
| Cc: linux-arch@vger.kernel.org |
| Cc: kernel-hardening@lists.openwall.com |
| Cc: gregkh@linuxfoundation.org |
| Cc: Andy Lutomirski <luto@kernel.org> |
| Cc: alan@linux.intel.com |
| Cc: Jinpu Wang <jinpu.wang@profitbricks.com> |
| Signed-off-by: Ben Hutchings <ben@decadent.org.uk> |
| --- |
| --- a/arch/x86/ia32/ia32entry.S |
| +++ b/arch/x86/ia32/ia32entry.S |
| @@ -169,9 +169,11 @@ sysenter_flags_fixed: |
| testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) |
| CFI_REMEMBER_STATE |
| jnz sysenter_tracesys |
| - cmpq $(IA32_NR_syscalls-1),%rax |
| - ja ia32_badsys |
| + cmpq $(IA32_NR_syscalls),%rax |
| + jae ia32_badsys |
| sysenter_do_call: |
| + sbb %r8,%r8 /* array_index_mask_nospec() */ |
| + and %r8,%rax |
| IA32_ARG_FIXUP |
| sysenter_dispatch: |
| #ifdef CONFIG_RETPOLINE |
| @@ -216,8 +218,10 @@ sysexit_from_sys_call: |
| movl $AUDIT_ARCH_I386,%edi /* 1st arg: audit arch */ |
| call __audit_syscall_entry |
| movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall number */ |
| - cmpq $(IA32_NR_syscalls-1),%rax |
| - ja ia32_badsys |
| + cmpq $(IA32_NR_syscalls),%rax |
| + jae ia32_badsys |
| + sbb %r8,%r8 /* array_index_mask_nospec() */ |
| + and %r8,%rax |
| movl %ebx,%edi /* reload 1st syscall arg */ |
| movl RCX-ARGOFFSET(%rsp),%esi /* reload 2nd syscall arg */ |
| movl RDX-ARGOFFSET(%rsp),%edx /* reload 3rd syscall arg */ |
| @@ -273,8 +277,8 @@ sysenter_tracesys: |
| call syscall_trace_enter |
| LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */ |
| RESTORE_REST |
| - cmpq $(IA32_NR_syscalls-1),%rax |
| - ja int_ret_from_sys_call /* sysenter_tracesys has set RAX(%rsp) */ |
| + cmpq $(IA32_NR_syscalls),%rax |
| + jae int_ret_from_sys_call /* sysenter_tracesys has set RAX(%rsp) */ |
| jmp sysenter_do_call |
| CFI_ENDPROC |
| ENDPROC(ia32_sysenter_target) |
| @@ -339,9 +343,11 @@ ENTRY(ia32_cstar_target) |
| testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) |
| CFI_REMEMBER_STATE |
| jnz cstar_tracesys |
| - cmpq $IA32_NR_syscalls-1,%rax |
| - ja ia32_badsys |
| + cmpq $IA32_NR_syscalls,%rax |
| + jae ia32_badsys |
| cstar_do_call: |
| + sbb %r8,%r8 /* array_index_mask_nospec() */ |
| + and %r8,%rax |
| IA32_ARG_FIXUP 1 |
| cstar_dispatch: |
| #ifdef CONFIG_RETPOLINE |
| @@ -397,8 +403,8 @@ cstar_tracesys: |
| LOAD_ARGS32 ARGOFFSET, 1 /* reload args from stack in case ptrace changed it */ |
| RESTORE_REST |
| xchgl %ebp,%r9d |
| - cmpq $(IA32_NR_syscalls-1),%rax |
| - ja int_ret_from_sys_call /* cstar_tracesys has set RAX(%rsp) */ |
| + cmpq $(IA32_NR_syscalls),%rax |
| + jae int_ret_from_sys_call /* cstar_tracesys has set RAX(%rsp) */ |
| jmp cstar_do_call |
| END(ia32_cstar_target) |
| |
| @@ -456,9 +462,11 @@ ENTRY(ia32_syscall) |
| orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET) |
| testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) |
| jnz ia32_tracesys |
| - cmpq $(IA32_NR_syscalls-1),%rax |
| - ja ia32_badsys |
| + cmpq $(IA32_NR_syscalls),%rax |
| + jae ia32_badsys |
| ia32_do_call: |
| + sbb %r8,%r8 /* array_index_mask_nospec() */ |
| + and %r8,%rax |
| IA32_ARG_FIXUP |
| #ifdef CONFIG_RETPOLINE |
| movq ia32_sys_call_table(,%rax,8),%rax |
| @@ -480,8 +488,8 @@ ia32_tracesys: |
| call syscall_trace_enter |
| LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */ |
| RESTORE_REST |
| - cmpq $(IA32_NR_syscalls-1),%rax |
| - ja int_ret_from_sys_call /* ia32_tracesys has set RAX(%rsp) */ |
| + cmpq $(IA32_NR_syscalls),%rax |
| + jae int_ret_from_sys_call /* ia32_tracesys has set RAX(%rsp) */ |
| jmp ia32_do_call |
| END(ia32_syscall) |
| |
| --- a/arch/x86/kernel/entry_32.S |
| +++ b/arch/x86/kernel/entry_32.S |
| @@ -426,6 +426,8 @@ sysenter_past_esp: |
| sysenter_do_call: |
| cmpl $(NR_syscalls), %eax |
| jae sysenter_badsys |
| + sbb %edx, %edx /* array_index_mask_nospec() */ |
| + and %edx, %eax |
| #ifdef CONFIG_RETPOLINE |
| movl sys_call_table(,%eax,4),%eax |
| call __x86_indirect_thunk_eax |
| @@ -508,6 +510,8 @@ ENTRY(system_call) |
| cmpl $(NR_syscalls), %eax |
| jae syscall_badsys |
| syscall_call: |
| + sbb %edx, %edx /* array_index_mask_nospec() */ |
| + and %edx, %eax |
| #ifdef CONFIG_RETPOLINE |
| movl sys_call_table(,%eax,4),%eax |
| call __x86_indirect_thunk_eax |
| --- a/arch/x86/kernel/entry_64.S |
| +++ b/arch/x86/kernel/entry_64.S |
| @@ -445,12 +445,14 @@ GLOBAL(system_call_after_swapgs) |
| jnz tracesys |
| system_call_fastpath: |
| #if __SYSCALL_MASK == ~0 |
| - cmpq $__NR_syscall_max,%rax |
| + cmpq $NR_syscalls, %rax |
| #else |
| andl $__SYSCALL_MASK,%eax |
| - cmpl $__NR_syscall_max,%eax |
| + cmpl $NR_syscalls, %eax |
| #endif |
| - ja badsys |
| + jae badsys |
| + sbb %rcx, %rcx /* array_index_mask_nospec() */ |
| + and %rcx, %rax |
| movq %r10,%rcx |
| #ifdef CONFIG_RETPOLINE |
| movq sys_call_table(, %rax, 8), %rax |
| @@ -577,12 +579,14 @@ tracesys: |
| LOAD_ARGS ARGOFFSET, 1 |
| RESTORE_REST |
| #if __SYSCALL_MASK == ~0 |
| - cmpq $__NR_syscall_max,%rax |
| + cmpq $NR_syscalls, %rax |
| #else |
| andl $__SYSCALL_MASK,%eax |
| - cmpl $__NR_syscall_max,%eax |
| + cmpl $NR_syscalls, %eax |
| #endif |
| - ja int_ret_from_sys_call /* RAX(%rsp) set to -ENOSYS above */ |
| + jae int_ret_from_sys_call /* RAX(%rsp) set to -ENOSYS above */ |
| + sbb %rcx, %rcx /* array_index_mask_nospec() */ |
| + and %rcx, %rax |
| movq %r10,%rcx /* fixup for C */ |
| #ifdef CONFIG_RETPOLINE |
| movq sys_call_table(, %rax, 8), %rax |