From 3ea277194daaeaa84ce75180ec7c7a2075027a68 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Wed, 2 Aug 2017 13:31:52 -0700
Subject: mm, mprotect: flush TLB if potentially racing with a parallel reclaim leaving stale TLB entries

From: Mel Gorman <mgorman@suse.de>

commit 3ea277194daaeaa84ce75180ec7c7a2075027a68 upstream.

Stable note for 4.4: the upstream patch also modifies the madvise(MADV_FREE)
path, but 4.4 does not support that feature. The changelog is left as-is and
the madvise hunk is omitted from this backport.

Nadav Amit identified a theoretical race between page reclaim and mprotect
due to TLB flushes being batched and only issued after the PTL has been
released.

He described the race as follows:

CPU0                            CPU1
----                            ----
                                user accesses memory using RW PTE
                                [PTE now cached in TLB]
try_to_unmap_one()
==> ptep_get_and_clear()
==> set_tlb_ubc_flush_pending()
                                mprotect(addr, PROT_READ)
                                ==> change_pte_range()
                                ==> [ PTE non-present - no flush ]

                                user writes using cached RW PTE
...

try_to_unmap_flush()

The same type of race exists for reads when protecting for PROT_NONE and
also exists for operations that can leave an old TLB entry behind such
as munmap, mremap and madvise.

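For reference, the userspace-visible rule that such a stale TLB entry breaks
is easy to state: once mprotect() has downgraded a mapping to PROT_READ, a
write through that mapping must fault. The sketch below is illustrative only,
uses nothing beyond standard POSIX calls, and cannot trigger the reclaim race
itself; it merely demonstrates the behaviour that the cached RW PTE on CPU1
in the diagram above would bypass:

	#include <signal.h>
	#include <stdio.h>
	#include <sys/mman.h>
	#include <unistd.h>

	static void on_segv(int sig)
	{
		(void)sig;
		/* expected once the mapping has been downgraded to read-only */
		write(STDOUT_FILENO, "write faulted as expected\n", 26);
		_exit(0);
	}

	int main(void)
	{
		long page = sysconf(_SC_PAGESIZE);
		char *p = mmap(NULL, page, PROT_READ | PROT_WRITE,
			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

		if (p == MAP_FAILED) {
			perror("mmap");
			return 1;
		}
		p[0] = 1;	/* populate a writable PTE; it is now cached in the TLB */

		signal(SIGSEGV, on_segv);
		if (mprotect(p, page, PROT_READ)) {
			perror("mprotect");
			return 1;
		}

		p[0] = 2;	/* must fault; a stale RW TLB entry would let it succeed */
		printf("write succeeded after PROT_READ - should not happen\n");
		return 1;
	}
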
For some operations like mprotect, it's not necessarily a data integrity
issue but it is a correctness issue as there is a window where an
mprotect that limits access still allows access. For munmap, it's
potentially a data integrity issue, although the race is hard to hit as
an munmap, mmap and return to userspace must all complete within the
window between reclaim dropping the PTL and flushing the TLB. However,
it's theoretically possible, so handle this issue by flushing the mm if
reclaim is potentially currently batching TLB flushes.

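As a rough model of that scheme (hypothetical userspace names, with a pthread
mutex standing in for the PTL; the real kernel changes are in the mm/rmap.c
hunk below): reclaim marks the mm when it clears PTEs with the flush still
pending, and any later operation that takes the PTL to work on PTEs flushes
first if the mark is set:

	#include <pthread.h>
	#include <stdbool.h>
	#include <stdio.h>

	/* Hypothetical model, not kernel code: the mutex stands in for the PTL. */
	struct mm_model {
		pthread_mutex_t ptl;
		bool tlb_flush_batched;		/* mirrors mm->tlb_flush_batched */
	};

	/* Reclaim side: PTE cleared, flush deferred, mm marked. */
	static void reclaim_unmap_one(struct mm_model *mm)
	{
		pthread_mutex_lock(&mm->ptl);
		/* ptep_get_and_clear() would happen here; the TLB flush is batched */
		mm->tlb_flush_batched = true;
		pthread_mutex_unlock(&mm->ptl);
	}

	/* mprotect/munmap/mremap side: flush first if reclaim left one pending. */
	static void flush_pending_model(struct mm_model *mm)
	{
		/* caller holds the PTL, so this cannot race with reclaim_unmap_one() */
		if (mm->tlb_flush_batched) {
			puts("flush: stale TLB entries gone before PTEs are touched");
			mm->tlb_flush_batched = false;
		}
	}

	static void mprotect_like_operation(struct mm_model *mm)
	{
		pthread_mutex_lock(&mm->ptl);
		flush_pending_model(mm);
		/* ... safe to examine and rewrite PTEs now ... */
		pthread_mutex_unlock(&mm->ptl);
	}

	int main(void)
	{
		struct mm_model mm = { PTHREAD_MUTEX_INITIALIZER, false };

		reclaim_unmap_one(&mm);		/* reclaim batches a flush */
		mprotect_like_operation(&mm);	/* flushes before touching PTEs */
		return 0;
	}
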
Other instances where a flush is required for a present pte should be ok
as either the page lock is held preventing parallel reclaim or a page
reference count is elevated preventing a parallel free leading to
corruption. In the case of page_mkclean there isn't an obvious path
that userspace could take advantage of without using the operations that
are guarded by this patch. Other users such as gup only look at the PTEs
when racing with reclaim. Huge page variants should be ok as they don't
race with reclaim. mincore only looks at PTEs. userfault should also be
ok as, if a parallel reclaim takes place, it will either fault the page
back in or read some of the data before the flush occurs, triggering a
fault.

Note that a variant of this patch was acked by Andy Lutomirski but this
was for the x86 parts on top of his PCID work which didn't make the 4.13
merge window as expected. His ack is dropped from this version and
there will be a follow-on patch on top of PCID that will include his
ack.

[akpm@linux-foundation.org: tweak comments]
[akpm@linux-foundation.org: fix spello]
Link: http://lkml.kernel.org/r/20170717155523.emckq2esjro6hf3z@suse.de
Reported-by: Nadav Amit <nadav.amit@gmail.com>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Cc: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/mm_types.h |    4 ++++
 mm/internal.h            |    5 ++++-
 mm/memory.c              |    1 +
 mm/mprotect.c            |    1 +
 mm/mremap.c              |    1 +
 mm/rmap.c                |   36 ++++++++++++++++++++++++++++++++++++
 6 files changed, 47 insertions(+), 1 deletion(-)

--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -504,6 +504,10 @@ struct mm_struct {
 	 */
 	bool tlb_flush_pending;
 #endif
+#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+	/* See flush_tlb_batched_pending() */
+	bool tlb_flush_batched;
+#endif
 	struct uprobes_state uprobes_state;
 #ifdef CONFIG_X86_INTEL_MPX
 	/* address of the bounds directory */
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -453,6 +453,7 @@ struct tlbflush_unmap_batch;
 #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
 void try_to_unmap_flush(void);
 void try_to_unmap_flush_dirty(void);
+void flush_tlb_batched_pending(struct mm_struct *mm);
 #else
 static inline void try_to_unmap_flush(void)
 {
@@ -460,6 +461,8 @@ static inline void try_to_unmap_flush(vo
 static inline void try_to_unmap_flush_dirty(void)
 {
 }
-
+static inline void flush_tlb_batched_pending(struct mm_struct *mm)
+{
+}
 #endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
 #endif /* __MM_INTERNAL_H */
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1127,6 +1127,7 @@ again:
 	init_rss_vec(rss);
 	start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
 	pte = start_pte;
+	flush_tlb_batched_pending(mm);
 	arch_enter_lazy_mmu_mode();
 	do {
 		pte_t ptent = *pte;
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -72,6 +72,7 @@ static unsigned long change_pte_range(st
 	if (!pte)
 		return 0;
 
+	flush_tlb_batched_pending(vma->vm_mm);
 	arch_enter_lazy_mmu_mode();
 	do {
 		oldpte = *pte;
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -135,6 +135,7 @@ static void move_ptes(struct vm_area_str
 	new_ptl = pte_lockptr(mm, new_pmd);
 	if (new_ptl != old_ptl)
 		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
+	flush_tlb_batched_pending(vma->vm_mm);
 	arch_enter_lazy_mmu_mode();
 
 	for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -649,6 +649,13 @@ static void set_tlb_ubc_flush_pending(st
 	tlb_ubc->flush_required = true;
 
 	/*
+	 * Ensure compiler does not re-order the setting of tlb_flush_batched
+	 * before the PTE is cleared.
+	 */
+	barrier();
+	mm->tlb_flush_batched = true;
+
+	/*
 	 * If the PTE was dirty then it's best to assume it's writable. The
 	 * caller must use try_to_unmap_flush_dirty() or try_to_unmap_flush()
 	 * before the page is queued for IO.
@@ -675,6 +682,35 @@ static bool should_defer_flush(struct mm
 
 	return should_defer;
 }
+
+/*
+ * Reclaim unmaps pages under the PTL but does not flush the TLB prior to
+ * releasing the PTL if TLB flushes are batched. It's possible for a parallel
+ * operation such as mprotect or munmap to race between reclaim unmapping
+ * the page and flushing the page. If this race occurs, it potentially allows
+ * access to data via a stale TLB entry. Tracking all mm's that have TLB
+ * batching in flight would be expensive during reclaim so instead track
+ * whether TLB batching occurred in the past and if so then do a flush here
+ * if required. This will cost one additional flush per reclaim cycle paid
+ * by the first operation at risk such as mprotect and munmap.
+ *
+ * This must be called under the PTL so that an access to tlb_flush_batched
+ * that is potentially a "reclaim vs mprotect/munmap/etc" race will synchronise
+ * via the PTL.
+ */
+void flush_tlb_batched_pending(struct mm_struct *mm)
+{
+	if (mm->tlb_flush_batched) {
+		flush_tlb_mm(mm);
+
+		/*
+		 * Do not allow the compiler to re-order the clearing of
+		 * tlb_flush_batched before the tlb is flushed.
+		 */
+		barrier();
+		mm->tlb_flush_batched = false;
+	}
+}
 #else
 static void set_tlb_ubc_flush_pending(struct mm_struct *mm,
 		struct page *page, bool writable)