From foo@baz Tue Aug 14 16:14:56 CEST 2018
From: Andi Kleen <ak@linux.intel.com>
Date: Wed, 13 Jun 2018 15:48:27 -0700
Subject: x86/speculation/l1tf: Disallow non privileged high MMIO PROT_NONE mappings

From: Andi Kleen <ak@linux.intel.com>

commit 42e4089c7890725fcd329999252dc489b72f2921 upstream

For L1TF, PROT_NONE mappings are protected by inverting the PFN in the
page table entry. This sets the high bits in the CPU's address space,
thus making sure not to point an unmapped entry to valid cached memory.

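As a rough sketch (the helper names follow the upstream L1TF series;
treat the exact expressions as illustrative rather than the verbatim
4.9 code):

	/* A PTE needs inversion when it is not present (PROT_NONE). */
	static inline bool __pte_needs_invert(u64 val)
	{
		return !(val & _PAGE_PRESENT);
	}

	/* Mask to XOR with the entry to invert (or recover) the PFN bits. */
	static inline u64 protnone_mask(u64 val)
	{
		return __pte_needs_invert(val) ? ~0ull : 0;
	}
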
Some server system BIOSes put the MMIO mappings high up in the physical
address space. If such a high mapping were exposed to unprivileged users,
they could attack low memory by setting it to PROT_NONE. This could
happen through a special device driver which is not access protected.
Normal /dev/mem is of course access protected.

To avoid this, forbid PROT_NONE mappings and mprotect(PROT_NONE) for
high MMIO mappings.

Valid page mappings are allowed because the system is then unsafe anyway.

It's not expected that users commonly use PROT_NONE on MMIO. But to
minimize any impact, this is only enforced if the mapping actually
refers to a high MMIO address (defined as the MAX_PA-1 bit being set),
and the check is also skipped for root.

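Concretely, "high" here means a PFN above l1tf_pfn_limit(). A sketch of
that limit, assuming x86_phys_bits is MAX_PA (the exact expression has
varied between kernel versions):

	/* Last PFN below the MAX_PA-1 boundary; anything above is "high". */
	static inline unsigned long l1tf_pfn_limit(void)
	{
		return BIT(boot_cpu_data.x86_phys_bits - 1 - PAGE_SHIFT) - 1;
	}
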
For mmaps this is straightforward and can be handled in vm_insert_pfn()
and in remap_pfn_range().

For mprotect it's a bit trickier. At the point where the actual PTEs are
accessed a lot of state has already been changed and would be difficult
to undo on an error. Since this is an uncommon case, use a separate
early page walk pass for MMIO PROT_NONE mappings that checks for this
condition early. For non-MMIO and non-PROT_NONE mappings there are no
changes.

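From userspace the change is visible only as an error code. A
hypothetical unprivileged test (the device and its mapping are made up
for illustration) might look like:

	#include <sys/mman.h>
	#include <errno.h>
	#include <stdio.h>
	#include <stddef.h>

	/* 'fd' is assumed to map high MMIO PFNs (hypothetical driver). */
	static void check_prot_none(int fd, size_t len)
	{
		void *p = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);

		if (p == MAP_FAILED)
			return;
		/* With this patch the kernel refuses to turn a high MMIO
		 * mapping into PROT_NONE without CAP_SYS_ADMIN.
		 */
		if (mprotect(p, len, PROT_NONE) < 0 && errno == EACCES)
			printf("high MMIO PROT_NONE rejected as expected\n");
		munmap(p, len);
	}
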
[dwmw2: Backport to 4.9]

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Josh Poimboeuf <jpoimboe@redhat.com>
Acked-by: Dave Hansen <dave.hansen@intel.com>
Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/x86/include/asm/pgtable.h |    9 +++++++
 arch/x86/mm/mmap.c             |   21 +++++++++++++++++
 include/asm-generic/pgtable.h  |   12 ++++++++++
 mm/memory.c                    |   29 ++++++++++++++++++------
 mm/mprotect.c                  |   49 +++++++++++++++++++++++++++++++++++++++++
 5 files changed, 113 insertions(+), 7 deletions(-)

--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -1026,6 +1026,15 @@ static inline u16 pte_flags_pkey(unsigne
 #endif
 }
 
+
+#define __HAVE_ARCH_PFN_MODIFY_ALLOWED 1
+extern bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot);
+
+static inline bool arch_has_pfn_modify_check(void)
+{
+	return boot_cpu_has_bug(X86_BUG_L1TF);
+}
+
 #include <asm-generic/pgtable.h>
 #endif /* __ASSEMBLY__ */
 
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -121,3 +121,24 @@ const char *arch_vma_name(struct vm_area
 		return "[mpx]";
 	return NULL;
 }
+
+/*
+ * Only allow root to set high MMIO mappings to PROT_NONE.
+ * This prevents an unprivileged user from setting them to PROT_NONE
+ * and inverting them, thus pointing to valid memory for L1TF speculation.
+ *
+ * Note: locked-down kernels may want to disable the root override.
+ */
+bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot)
+{
+	if (!boot_cpu_has_bug(X86_BUG_L1TF))
+		return true;
+	if (!__pte_needs_invert(pgprot_val(prot)))
+		return true;
+	/* If it's real memory always allow */
+	if (pfn_valid(pfn))
+		return true;
+	if (pfn > l1tf_pfn_limit() && !capable(CAP_SYS_ADMIN))
+		return false;
+	return true;
+}
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -842,4 +842,16 @@ int phys_mem_access_prot_allowed(struct
 #endif
 #endif
 
+#ifndef __HAVE_ARCH_PFN_MODIFY_ALLOWED
+static inline bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot)
+{
+	return true;
+}
+
+static inline bool arch_has_pfn_modify_check(void)
+{
+	return false;
+}
+#endif
+
 #endif /* _ASM_GENERIC_PGTABLE_H */
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1641,6 +1641,9 @@ int vm_insert_pfn_prot(struct vm_area_st
 	if (track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV)))
 		return -EINVAL;
 
+	if (!pfn_modify_allowed(pfn, pgprot))
+		return -EACCES;
+
 	ret = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot);
 
 	return ret;
@@ -1659,6 +1662,9 @@ int vm_insert_mixed(struct vm_area_struc
 	if (track_pfn_insert(vma, &pgprot, pfn))
 		return -EINVAL;
 
+	if (!pfn_modify_allowed(pfn_t_to_pfn(pfn), pgprot))
+		return -EACCES;
+
 	/*
 	 * If we don't have pte special, then we have to use the pfn_valid()
 	 * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must*
@@ -1692,6 +1698,7 @@ static int remap_pte_range(struct mm_str
 {
 	pte_t *pte;
 	spinlock_t *ptl;
+	int err = 0;
 
 	pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
 	if (!pte)
@@ -1699,12 +1706,16 @@ static int remap_pte_range(struct mm_str
 	arch_enter_lazy_mmu_mode();
 	do {
 		BUG_ON(!pte_none(*pte));
+		if (!pfn_modify_allowed(pfn, prot)) {
+			err = -EACCES;
+			break;
+		}
 		set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
 		pfn++;
 	} while (pte++, addr += PAGE_SIZE, addr != end);
 	arch_leave_lazy_mmu_mode();
 	pte_unmap_unlock(pte - 1, ptl);
-	return 0;
+	return err;
 }
 
 static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
@@ -1713,6 +1724,7 @@ static inline int remap_pmd_range(struct
 {
 	pmd_t *pmd;
 	unsigned long next;
+	int err;
 
 	pfn -= addr >> PAGE_SHIFT;
 	pmd = pmd_alloc(mm, pud, addr);
@@ -1721,9 +1733,10 @@ static inline int remap_pmd_range(struct
 	VM_BUG_ON(pmd_trans_huge(*pmd));
 	do {
 		next = pmd_addr_end(addr, end);
-		if (remap_pte_range(mm, pmd, addr, next,
-				pfn + (addr >> PAGE_SHIFT), prot))
-			return -ENOMEM;
+		err = remap_pte_range(mm, pmd, addr, next,
+				pfn + (addr >> PAGE_SHIFT), prot);
+		if (err)
+			return err;
 	} while (pmd++, addr = next, addr != end);
 	return 0;
 }
@@ -1734,6 +1747,7 @@ static inline int remap_pud_range(struct
 {
 	pud_t *pud;
 	unsigned long next;
+	int err;
 
 	pfn -= addr >> PAGE_SHIFT;
 	pud = pud_alloc(mm, pgd, addr);
@@ -1741,9 +1755,10 @@ static inline int remap_pud_range(struct
 		return -ENOMEM;
 	do {
 		next = pud_addr_end(addr, end);
-		if (remap_pmd_range(mm, pud, addr, next,
-				pfn + (addr >> PAGE_SHIFT), prot))
-			return -ENOMEM;
+		err = remap_pmd_range(mm, pud, addr, next,
+				pfn + (addr >> PAGE_SHIFT), prot);
+		if (err)
+			return err;
 	} while (pud++, addr = next, addr != end);
 	return 0;
 }
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -260,6 +260,42 @@ unsigned long change_protection(struct v
 	return pages;
 }
 
+static int prot_none_pte_entry(pte_t *pte, unsigned long addr,
+			       unsigned long next, struct mm_walk *walk)
+{
+	return pfn_modify_allowed(pte_pfn(*pte), *(pgprot_t *)(walk->private)) ?
+		0 : -EACCES;
+}
+
+static int prot_none_hugetlb_entry(pte_t *pte, unsigned long hmask,
+				   unsigned long addr, unsigned long next,
+				   struct mm_walk *walk)
+{
+	return pfn_modify_allowed(pte_pfn(*pte), *(pgprot_t *)(walk->private)) ?
+		0 : -EACCES;
+}
+
+static int prot_none_test(unsigned long addr, unsigned long next,
+			  struct mm_walk *walk)
+{
+	return 0;
+}
+
+static int prot_none_walk(struct vm_area_struct *vma, unsigned long start,
+			  unsigned long end, unsigned long newflags)
+{
+	pgprot_t new_pgprot = vm_get_page_prot(newflags);
+	struct mm_walk prot_none_walk = {
+		.pte_entry = prot_none_pte_entry,
+		.hugetlb_entry = prot_none_hugetlb_entry,
+		.test_walk = prot_none_test,
+		.mm = current->mm,
+		.private = &new_pgprot,
+	};
+
+	return walk_page_range(start, end, &prot_none_walk);
+}
+
 int
 mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
 	       unsigned long start, unsigned long end, unsigned long newflags)
@@ -278,6 +314,19 @@ mprotect_fixup(struct vm_area_struct *vm
 	}
 
 	/*
+	 * Do PROT_NONE PFN permission checks here when we can still
+	 * bail out without undoing a lot of state. This is a rather
+	 * uncommon case, so doesn't need to be very optimized.
+	 */
+	if (arch_has_pfn_modify_check() &&
+	    (vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
+	    (newflags & (VM_READ|VM_WRITE|VM_EXEC)) == 0) {
+		error = prot_none_walk(vma, start, end, newflags);
+		if (error)
+			return error;
+	}
+
+	/*
 	 * If we make a private mapping writable we increase our commit;
 	 * but (without finer accounting) cannot reduce our commit if we
 	 * make it unwritable again. hugetlb mapping were accounted for