From 3ea277194daaeaa84ce75180ec7c7a2075027a68 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Wed, 2 Aug 2017 13:31:52 -0700
Subject: mm, mprotect: flush TLB if potentially racing with a parallel reclaim leaving stale TLB entries

From: Mel Gorman <mgorman@suse.de>

commit 3ea277194daaeaa84ce75180ec7c7a2075027a68 upstream.

Stable note for 4.4: the upstream patch also modifies the madvise(MADV_FREE)
path, but 4.4 does not support that feature. The changelog is left as-is and
the madvise hunk is omitted from this backport.

Nadav Amit identified a theoretical race between page reclaim and mprotect
due to TLB flushes being batched and only issued after the PTL has been
released.

He described the race as follows:

CPU0                            CPU1
----                            ----
                                user accesses memory using RW PTE
                                [PTE now cached in TLB]
try_to_unmap_one()
==> ptep_get_and_clear()
==> set_tlb_ubc_flush_pending()
                                mprotect(addr, PROT_READ)
                                ==> change_pte_range()
                                ==> [ PTE non-present - no flush ]

                                user writes using cached RW PTE
...

try_to_unmap_flush()

The same type of race exists for reads when protecting for PROT_NONE and
also exists for operations that can leave an old TLB entry behind such
as munmap, mremap and madvise.

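For reference, the userspace-visible rule that such a stale TLB entry breaks
is easy to state: once mprotect() has downgraded a mapping to PROT_READ, a
write through that mapping must fault. The sketch below is illustrative only,
uses nothing beyond standard POSIX calls, and cannot trigger the reclaim race
itself; it merely demonstrates the behaviour that the cached RW PTE on CPU1
in the diagram above would bypass:

	#include <signal.h>
	#include <stdio.h>
	#include <sys/mman.h>
	#include <unistd.h>

	static void on_segv(int sig)
	{
		(void)sig;
		/* expected once the mapping has been downgraded to read-only */
		write(STDOUT_FILENO, "write faulted as expected\n", 26);
		_exit(0);
	}

	int main(void)
	{
		long page = sysconf(_SC_PAGESIZE);
		char *p = mmap(NULL, page, PROT_READ | PROT_WRITE,
			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

		if (p == MAP_FAILED) {
			perror("mmap");
			return 1;
		}
		p[0] = 1;	/* populate a writable PTE; it is now cached in the TLB */

		signal(SIGSEGV, on_segv);
		if (mprotect(p, page, PROT_READ)) {
			perror("mprotect");
			return 1;
		}

		p[0] = 2;	/* must fault; a stale RW TLB entry would let it succeed */
		printf("write succeeded after PROT_READ - should not happen\n");
		return 1;
	}
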
For some operations like mprotect, it's not necessarily a data integrity
issue but it is a correctness issue as there is a window where an
mprotect that limits access still allows access. For munmap, it's
potentially a data integrity issue, although the race is hard to hit as
an munmap, mmap and return to userspace must all complete within the
window between reclaim dropping the PTL and flushing the TLB. However,
it's theoretically possible, so handle this issue by flushing the mm if
reclaim is potentially currently batching TLB flushes.

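As a rough model of that scheme (hypothetical userspace names, with a pthread
mutex standing in for the PTL; the real kernel changes are in the mm/rmap.c
hunk below): reclaim marks the mm when it clears PTEs with the flush still
pending, and any later operation that takes the PTL to work on PTEs flushes
first if the mark is set:

	#include <pthread.h>
	#include <stdbool.h>
	#include <stdio.h>

	/* Hypothetical model, not kernel code: the mutex stands in for the PTL. */
	struct mm_model {
		pthread_mutex_t ptl;
		bool tlb_flush_batched;		/* mirrors mm->tlb_flush_batched */
	};

	/* Reclaim side: PTE cleared, flush deferred, mm marked. */
	static void reclaim_unmap_one(struct mm_model *mm)
	{
		pthread_mutex_lock(&mm->ptl);
		/* ptep_get_and_clear() would happen here; the TLB flush is batched */
		mm->tlb_flush_batched = true;
		pthread_mutex_unlock(&mm->ptl);
	}

	/* mprotect/munmap/mremap side: flush first if reclaim left one pending. */
	static void flush_pending_model(struct mm_model *mm)
	{
		/* caller holds the PTL, so this cannot race with reclaim_unmap_one() */
		if (mm->tlb_flush_batched) {
			puts("flush: stale TLB entries gone before PTEs are touched");
			mm->tlb_flush_batched = false;
		}
	}

	static void mprotect_like_operation(struct mm_model *mm)
	{
		pthread_mutex_lock(&mm->ptl);
		flush_pending_model(mm);
		/* ... safe to examine and rewrite PTEs now ... */
		pthread_mutex_unlock(&mm->ptl);
	}

	int main(void)
	{
		struct mm_model mm = { PTHREAD_MUTEX_INITIALIZER, false };

		reclaim_unmap_one(&mm);		/* reclaim batches a flush */
		mprotect_like_operation(&mm);	/* flushes before touching PTEs */
		return 0;
	}
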
Other instances where a flush is required for a present pte should be ok
as either the page lock is held preventing parallel reclaim or a page
reference count is elevated preventing a parallel free leading to
corruption. In the case of page_mkclean there isn't an obvious path
that userspace could take advantage of without using the operations that
are guarded by this patch. Other users such as gup only look at the PTEs
when racing with reclaim. Huge page variants should be ok as they don't
race with reclaim. mincore only looks at PTEs. userfault should also be
ok as, if a parallel reclaim takes place, it will either fault the page
back in or read some of the data before the flush occurs, triggering a
fault.

Note that a variant of this patch was acked by Andy Lutomirski but this
was for the x86 parts on top of his PCID work which didn't make the 4.13
merge window as expected. His ack is dropped from this version and
there will be a follow-on patch on top of PCID that will include his
ack.

[akpm@linux-foundation.org: tweak comments]
[akpm@linux-foundation.org: fix spello]
Link: http://lkml.kernel.org/r/20170717155523.emckq2esjro6hf3z@suse.de
Reported-by: Nadav Amit <nadav.amit@gmail.com>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Cc: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/mm_types.h |    4 ++++
 mm/internal.h            |    5 ++++-
 mm/memory.c              |    1 +
 mm/mprotect.c            |    1 +
 mm/mremap.c              |    1 +
 mm/rmap.c                |   36 ++++++++++++++++++++++++++++++++++++
 6 files changed, 47 insertions(+), 1 deletion(-)

--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -504,6 +504,10 @@ struct mm_struct {
 	 */
 	bool tlb_flush_pending;
 #endif
+#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+	/* See flush_tlb_batched_pending() */
+	bool tlb_flush_batched;
+#endif
 	struct uprobes_state uprobes_state;
 #ifdef CONFIG_X86_INTEL_MPX
 	/* address of the bounds directory */
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -453,6 +453,7 @@ struct tlbflush_unmap_batch;
 #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
 void try_to_unmap_flush(void);
 void try_to_unmap_flush_dirty(void);
+void flush_tlb_batched_pending(struct mm_struct *mm);
 #else
 static inline void try_to_unmap_flush(void)
 {
@@ -460,6 +461,8 @@ static inline void try_to_unmap_flush(vo
 static inline void try_to_unmap_flush_dirty(void)
 {
 }
-
+static inline void flush_tlb_batched_pending(struct mm_struct *mm)
+{
+}
 #endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
 #endif /* __MM_INTERNAL_H */
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1127,6 +1127,7 @@ again:
 	init_rss_vec(rss);
 	start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
 	pte = start_pte;
+	flush_tlb_batched_pending(mm);
 	arch_enter_lazy_mmu_mode();
 	do {
 		pte_t ptent = *pte;
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -72,6 +72,7 @@ static unsigned long change_pte_range(st
 	if (!pte)
 		return 0;
 
+	flush_tlb_batched_pending(vma->vm_mm);
 	arch_enter_lazy_mmu_mode();
 	do {
 		oldpte = *pte;
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -135,6 +135,7 @@ static void move_ptes(struct vm_area_str
 	new_ptl = pte_lockptr(mm, new_pmd);
 	if (new_ptl != old_ptl)
 		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
+	flush_tlb_batched_pending(vma->vm_mm);
 	arch_enter_lazy_mmu_mode();
 
 	for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -649,6 +649,13 @@ static void set_tlb_ubc_flush_pending(st
 	tlb_ubc->flush_required = true;
 
 	/*
+	 * Ensure compiler does not re-order the setting of tlb_flush_batched
+	 * before the PTE is cleared.
+	 */
+	barrier();
+	mm->tlb_flush_batched = true;
+
+	/*
 	 * If the PTE was dirty then it's best to assume it's writable. The
 	 * caller must use try_to_unmap_flush_dirty() or try_to_unmap_flush()
 	 * before the page is queued for IO.
@@ -675,6 +682,35 @@ static bool should_defer_flush(struct mm
 
 	return should_defer;
 }
+
+/*
+ * Reclaim unmaps pages under the PTL but does not flush the TLB prior to
+ * releasing the PTL if TLB flushes are batched. It's possible for a parallel
+ * operation such as mprotect or munmap to race between reclaim unmapping
+ * the page and flushing the page. If this race occurs, it potentially allows
+ * access to data via a stale TLB entry. Tracking all mm's that have TLB
+ * batching in flight would be expensive during reclaim so instead track
+ * whether TLB batching occurred in the past and if so then do a flush here
+ * if required. This will cost one additional flush per reclaim cycle paid
+ * by the first operation at risk such as mprotect and munmap.
+ *
+ * This must be called under the PTL so that an access to tlb_flush_batched
+ * that is potentially a "reclaim vs mprotect/munmap/etc" race will synchronise
+ * via the PTL.
+ */
+void flush_tlb_batched_pending(struct mm_struct *mm)
+{
+	if (mm->tlb_flush_batched) {
+		flush_tlb_mm(mm);
+
+		/*
+		 * Do not allow the compiler to re-order the clearing of
+		 * tlb_flush_batched before the tlb is flushed.
+		 */
+		barrier();
+		mm->tlb_flush_batched = false;
+	}
+}
 #else
 static void set_tlb_ubc_flush_pending(struct mm_struct *mm,
 		struct page *page, bool writable)