releases/5.9.5/mm-fix-exec-activate_mm-vs-tlb-shootdown-and-lazy-tl.patch - pub/scm/linux/kernel/git/stable/stable-queue - Git at Google

 From 8469091364ccc3e9cba51aeee3bff547a03b4c86 Mon Sep 17 00:00:00 2001
 From: Sasha Levin <sashal@kernel.org>
 Date: Mon, 14 Sep 2020 14:52:16 +1000
 Subject: mm: fix exec activate_mm vs TLB shootdown and lazy tlb switching race

 From: Nicholas Piggin <npiggin@gmail.com>

 [ Upstream commit d53c3dfb23c45f7d4f910c3a3ca84bf0a99c6143 ]

 Reading and modifying current->mm and current->active_mm and switching
 mm should be done with irqs off, to prevent races seeing an intermediate
 state.

 This is similar to commit 38cf307c1f20 ("mm: fix kthread_use_mm() vs TLB
 invalidate"). At exec-time when the new mm is activated, the old one
 should usually be single-threaded and no longer used, unless something
 else is holding an mm_users reference (which may be possible).

 Absent other mm_users, there is also a race with preemption and lazy tlb
 switching. Consider the kernel_execve case where the current thread is
 using a lazy tlb active mm:

   call_usermodehelper()
     kernel_execve()
       old_mm = current->mm;
       active_mm = current->active_mm;
       *** preempt *** -------------------->  schedule()
                                                prev->active_mm = NULL;
                                                mmdrop(prev active_mm);
                                              ...
                       <--------------------  schedule()
       current->mm = mm;
       current->active_mm = mm;
       if (!old_mm)
           mmdrop(active_mm);

 If we switch back to the kernel thread from a different mm, there is a
 double free of the old active_mm, and a missing free of the new one.

 Closing this race only requires interrupts to be disabled while ->mm
 and ->active_mm are being switched, but the TLB problem requires also
 holding interrupts off over activate_mm. Unfortunately not all archs
 can do that yet, e.g., arm defers the switch if irqs are disabled and
 expects finish_arch_post_lock_switch() to be called to complete the
 flush; um takes a blocking lock in activate_mm().

 So as a first step, disable interrupts across the mm/active_mm updates
 to close the lazy tlb preempt race, and provide an arch option to
 extend that to activate_mm which allows architectures doing IPI based
 TLB shootdowns to close the second race.

 This is a bit ugly, but in the interest of fixing the bug and backporting
 before all architectures are converted this is a compromise.

 Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
 Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
 Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
 Link: https://lore.kernel.org/r/20200914045219.3736466-2-npiggin@gmail.com
 Signed-off-by: Sasha Levin <sashal@kernel.org>
 ---
  arch/Kconfig |  7 +++++++
  fs/exec.c    | 17 +++++++++++++++--
  2 files changed, 22 insertions(+), 2 deletions(-)

 diff --git a/arch/Kconfig b/arch/Kconfig
 index af14a567b493f..94821e3f94d16 100644
 --- a/arch/Kconfig
 +++ b/arch/Kconfig
 @@ -414,6 +414,13 @@ config MMU_GATHER_NO_GATHER
  	bool
  	depends on MMU_GATHER_TABLE_FREE

 +config ARCH_WANT_IRQS_OFF_ACTIVATE_MM
 +	bool
 +	help
 +	  Temporary select until all architectures can be converted to have
 +	  irqs disabled over activate_mm. Architectures that do IPI based TLB
 +	  shootdowns should enable this.
 +
  config ARCH_HAVE_NMI_SAFE_CMPXCHG
  	bool

 diff --git a/fs/exec.c b/fs/exec.c
 index 07910f5032e74..3622681489864 100644
 --- a/fs/exec.c
 +++ b/fs/exec.c
 @@ -1131,11 +1131,24 @@ static int exec_mmap(struct mm_struct *mm)
  	}

  	task_lock(tsk);
 -	active_mm = tsk->active_mm;
  	membarrier_exec_mmap(mm);
 -	tsk->mm = mm;
 +
 +	local_irq_disable();
 +	active_mm = tsk->active_mm;
  	tsk->active_mm = mm;
 +	tsk->mm = mm;
 +	/*
 +	 * This prevents preemption while active_mm is being loaded and
 +	 * it and mm are being updated, which could cause problems for
 +	 * lazy tlb mm refcounting when these are updated by context
 +	 * switches. Not all architectures can handle irqs off over
 +	 * activate_mm yet.
 +	 */
 +	if (!IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
 +		local_irq_enable();
  	activate_mm(active_mm, mm);
 +	if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
 +		local_irq_enable();
  	tsk->mm->vmacache_seqnum = 0;
  	vmacache_flush(tsk);
  	task_unlock(tsk);
 --
 2.27.0
	From 8469091364ccc3e9cba51aeee3bff547a03b4c86 Mon Sep 17 00:00:00 2001
	From: Sasha Levin <sashal@kernel.org>
	Date: Mon, 14 Sep 2020 14:52:16 +1000
	Subject: mm: fix exec activate_mm vs TLB shootdown and lazy tlb switching race

	From: Nicholas Piggin <npiggin@gmail.com>

	[ Upstream commit d53c3dfb23c45f7d4f910c3a3ca84bf0a99c6143 ]

	Reading and modifying current->mm and current->active_mm and switching
	mm should be done with irqs off, to prevent races seeing an intermediate
	state.

	This is similar to commit 38cf307c1f20 ("mm: fix kthread_use_mm() vs TLB
	invalidate"). At exec-time when the new mm is activated, the old one
	should usually be single-threaded and no longer used, unless something
	else is holding an mm_users reference (which may be possible).

	Absent other mm_users, there is also a race with preemption and lazy tlb
	switching. Consider the kernel_execve case where the current thread is
	using a lazy tlb active mm:

	  call_usermodehelper()
	    kernel_execve()
	      old_mm = current->mm;
	      active_mm = current->active_mm;
	      *** preempt *** -------------------->  schedule()
	                                                prev->active_mm = NULL;
	                                                mmdrop(prev active_mm);
	                                              ...
	                       <--------------------  schedule()
	      current->mm = mm;
	      current->active_mm = mm;
	      if (!old_mm)
	          mmdrop(active_mm);

	If we switch back to the kernel thread from a different mm, there is a
	double free of the old active_mm, and a missing free of the new one.

	Closing this race only requires interrupts to be disabled while ->mm
	and ->active_mm are being switched, but the TLB problem requires also
	holding interrupts off over activate_mm. Unfortunately not all archs
	can do that yet, e.g., arm defers the switch if irqs are disabled and
	expects finish_arch_post_lock_switch() to be called to complete the
	flush; um takes a blocking lock in activate_mm().

	So as a first step, disable interrupts across the mm/active_mm updates
	to close the lazy tlb preempt race, and provide an arch option to
	extend that to activate_mm which allows architectures doing IPI based
	TLB shootdowns to close the second race.

	This is a bit ugly, but in the interest of fixing the bug and backporting
	before all architectures are converted this is a compromise.

	Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
	Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
	Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
	Link: https://lore.kernel.org/r/20200914045219.3736466-2-npiggin@gmail.com
	Signed-off-by: Sasha Levin <sashal@kernel.org>
	---
	 arch/Kconfig |  7 +++++++
	 fs/exec.c    | 17 +++++++++++++++--
	 2 files changed, 22 insertions(+), 2 deletions(-)

	diff --git a/arch/Kconfig b/arch/Kconfig
	index af14a567b493f..94821e3f94d16 100644
	--- a/arch/Kconfig
	+++ b/arch/Kconfig
	@@ -414,6 +414,13 @@ config MMU_GATHER_NO_GATHER
	 	bool
	 	depends on MMU_GATHER_TABLE_FREE

	+config ARCH_WANT_IRQS_OFF_ACTIVATE_MM
	+	bool
	+	help
	+	  Temporary select until all architectures can be converted to have
	+	  irqs disabled over activate_mm. Architectures that do IPI based TLB
	+	  shootdowns should enable this.
	+
	 config ARCH_HAVE_NMI_SAFE_CMPXCHG
	 	bool

	diff --git a/fs/exec.c b/fs/exec.c
	index 07910f5032e74..3622681489864 100644
	--- a/fs/exec.c
	+++ b/fs/exec.c
	@@ -1131,11 +1131,24 @@ static int exec_mmap(struct mm_struct *mm)
	 	}

	 	task_lock(tsk);
	-	active_mm = tsk->active_mm;
	 	membarrier_exec_mmap(mm);
	-	tsk->mm = mm;
	+
	+	local_irq_disable();
	+	active_mm = tsk->active_mm;
	 	tsk->active_mm = mm;
	+	tsk->mm = mm;
	+	/*
	+	 * This prevents preemption while active_mm is being loaded and
	+	 * it and mm are being updated, which could cause problems for
	+	 * lazy tlb mm refcounting when these are updated by context
	+	 * switches. Not all architectures can handle irqs off over
	+	 * activate_mm yet.
	+	 */
	+	if (!IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
	+		local_irq_enable();
	 	activate_mm(active_mm, mm);
	+	if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
	+		local_irq_enable();
	 	tsk->mm->vmacache_seqnum = 0;
	 	vmacache_flush(tsk);
	 	task_unlock(tsk);
	--
	2.27.0