| From 71b3c126e61177eb693423f2e18a1914205b165e Mon Sep 17 00:00:00 2001 |
| From: Andy Lutomirski <luto@kernel.org> |
| Date: Wed, 6 Jan 2016 12:21:01 -0800 |
| Subject: x86/mm: Add barriers and document switch_mm()-vs-flush synchronization |
| |
| From: Andy Lutomirski <luto@kernel.org> |
| |
| commit 71b3c126e61177eb693423f2e18a1914205b165e upstream. |
| |
| When switch_mm() activates a new PGD, it also sets a bit that |
| tells other CPUs that the PGD is in use so that TLB flush IPIs |
| will be sent. In order for that to work correctly, the bit |
| needs to be visible prior to loading the PGD and therefore |
| starting to fill the local TLB. |
| |
| Document all the barriers that make this work correctly and add |
| a couple that were missing. |
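| |
| This is the classic "store buffering" pattern: each CPU performs a |
| store and then a load, and unless both CPUs execute a full barrier |
| between the two, both loads can observe the pre-store values. A |
| rough sketch of the two sides, with CPU numbers hardcoded for |
| clarity (send_flush_ipi() is a hypothetical stand-in for the |
| flush_tlb_others() IPI machinery; the other helpers are the real |
| kernel ones): |
| |
|         /* CPU 0: remote flusher, e.g. flush_tlb_mm_range() */ |
|         set_pte(ptep, new_pte);                /* store */ |
|         smp_mb();  /* explicit, or implicit via INVLPG / CR3 write */ |
|         if (cpumask_test_cpu(1, mm_cpumask(mm))) |
|                 send_flush_ipi(1);             /* flush CPU 1's TLB */ |
| |
|         /* CPU 1: switch_mm() */ |
|         cpumask_set_cpu(1, mm_cpumask(next));  /* store */ |
|         load_cr3(next->pgd);  /* serializing, hence a full barrier; TLB |
|                                  fills after this may load the PTE */ |
| |
| If either barrier is missing, CPU 0 can observe bit 1 clear while |
| CPU 1's TLB fill observed the old PTE: no IPI is sent and the stale |
| entry survives. |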
| |
| Signed-off-by: Andy Lutomirski <luto@kernel.org> |
| Cc: Andrew Morton <akpm@linux-foundation.org> |
| Cc: Andy Lutomirski <luto@amacapital.net> |
| Cc: Borislav Petkov <bp@alien8.de> |
| Cc: Brian Gerst <brgerst@gmail.com> |
| Cc: Dave Hansen <dave.hansen@linux.intel.com> |
| Cc: Denys Vlasenko <dvlasenk@redhat.com> |
| Cc: H. Peter Anvin <hpa@zytor.com> |
| Cc: Linus Torvalds <torvalds@linux-foundation.org> |
| Cc: Peter Zijlstra <peterz@infradead.org> |
| Cc: Rik van Riel <riel@redhat.com> |
| Cc: Thomas Gleixner <tglx@linutronix.de> |
| Cc: linux-mm@kvack.org |
| Signed-off-by: Ingo Molnar <mingo@kernel.org> |
| Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> |
| |
| --- |
| arch/x86/include/asm/mmu_context.h | 33 ++++++++++++++++++++++++++++++++- |
| arch/x86/mm/tlb.c | 29 ++++++++++++++++++++++++++--- |
| 2 files changed, 58 insertions(+), 4 deletions(-) |
| |
| --- a/arch/x86/include/asm/mmu_context.h |
| +++ b/arch/x86/include/asm/mmu_context.h |
| @@ -116,8 +116,34 @@ static inline void switch_mm(struct mm_s |
| #endif |
| cpumask_set_cpu(cpu, mm_cpumask(next)); |
| |
| - /* Re-load page tables */ |
| + /* |
| + * Re-load page tables. |
| + * |
| + * This logic has an ordering constraint: |
| + * |
| + * CPU 0: Write to a PTE for 'next' |
| + * CPU 0: Load bit 1 in mm_cpumask; if set, send IPI |
| + * CPU 1: Set bit 1 in next's mm_cpumask |
| + * CPU 1: Load from the PTE that CPU 0 wrote (implicit) |
| + * |
| + * We need to prevent an outcome in which CPU 1 observes |
| + * the old (stale) PTE value and CPU 0 observes bit 1 clear |
| + * in mm_cpumask. (If that occurs, then the IPI will never |
| + * be sent, and CPU 1's TLB will retain the stale entry.) |
| + * |
| + * The bad outcome can occur if either CPU's load is |
| + * reordered before that CPU's store, so both CPUs must |
| + * execute full barriers to prevent this from happening. |
| + * |
| + * Thus, switch_mm needs a full barrier between the |
| + * store to mm_cpumask and any operation that could load |
| + * from next->pgd. This barrier synchronizes with |
| + * remote TLB flushers. Fortunately, load_cr3 is |
| + * serializing and thus acts as a full barrier. |
| + * |
| + */ |
| load_cr3(next->pgd); |
| + |
| trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); |
| |
| /* Stop flush ipis for the previous mm */ |
| @@ -156,10 +182,15 @@ static inline void switch_mm(struct mm_s |
| * schedule, protecting us from simultaneous changes. |
| */ |
| cpumask_set_cpu(cpu, mm_cpumask(next)); |
| + |
| /* |
| * We were in lazy tlb mode and leave_mm disabled |
| * tlb flush IPI delivery. We must reload CR3 |
| * to make sure to use no freed page tables. |
| + * |
| + * As above, this is a barrier that forces |
| + * TLB repopulation to be ordered after the |
| + * store to mm_cpumask. |
| */ |
| load_cr3(next->pgd); |
| trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); |
| --- a/arch/x86/mm/tlb.c |
| +++ b/arch/x86/mm/tlb.c |
| @@ -161,7 +161,10 @@ void flush_tlb_current_task(void) |
| preempt_disable(); |
| |
| count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); |
| + |
| + /* This is an implicit full barrier that synchronizes with switch_mm. */ |
| local_flush_tlb(); |
| + |
| trace_tlb_flush(TLB_LOCAL_SHOOTDOWN, TLB_FLUSH_ALL); |
| if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) |
| flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL); |
| @@ -188,17 +191,29 @@ void flush_tlb_mm_range(struct mm_struct |
| unsigned long base_pages_to_flush = TLB_FLUSH_ALL; |
| |
| preempt_disable(); |
| - if (current->active_mm != mm) |
| + if (current->active_mm != mm) { |
| + /* Synchronize with switch_mm. */ |
| + smp_mb(); |
| + |
| goto out; |
| + } |
| |
| if (!current->mm) { |
| leave_mm(smp_processor_id()); |
| + |
| + /* Synchronize with switch_mm. */ |
| + smp_mb(); |
| + |
| goto out; |
| } |
| |
| if ((end != TLB_FLUSH_ALL) && !(vmflag & VM_HUGETLB)) |
| base_pages_to_flush = (end - start) >> PAGE_SHIFT; |
| |
| + /* |
| + * Both branches below are implicit full barriers (MOV to CR3 or |
| + * INVLPG) that synchronize with switch_mm. |
| + */ |
| if (base_pages_to_flush > tlb_single_page_flush_ceiling) { |
| base_pages_to_flush = TLB_FLUSH_ALL; |
| count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); |
| @@ -228,10 +243,18 @@ void flush_tlb_page(struct vm_area_struc |
| preempt_disable(); |
| |
| if (current->active_mm == mm) { |
| - if (current->mm) |
| + if (current->mm) { |
| + /* |
| + * Implicit full barrier (INVLPG) that synchronizes |
| + * with switch_mm. |
| + */ |
| __flush_tlb_one(start); |
| - else |
| + } else { |
| leave_mm(smp_processor_id()); |
| + |
| + /* Synchronize with switch_mm. */ |
| + smp_mb(); |
| + } |
| } |
| |
| if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) |