| From 8469091364ccc3e9cba51aeee3bff547a03b4c86 Mon Sep 17 00:00:00 2001 |
| From: Sasha Levin <sashal@kernel.org> |
| Date: Mon, 14 Sep 2020 14:52:16 +1000 |
| Subject: mm: fix exec activate_mm vs TLB shootdown and lazy tlb switching race |
| |
| From: Nicholas Piggin <npiggin@gmail.com> |
| |
| [ Upstream commit d53c3dfb23c45f7d4f910c3a3ca84bf0a99c6143 ] |
| |
| Reading and modifying current->mm and current->active_mm and switching |
| mm should be done with irqs off, to prevent races seeing an intermediate |
| state. |
| |
| This is similar to commit 38cf307c1f20 ("mm: fix kthread_use_mm() vs TLB |
| invalidate"). At exec-time when the new mm is activated, the old one |
| should usually be single-threaded and no longer used, unless something |
| else is holding an mm_users reference (which may be possible). |
| |
| Absent other mm_users, there is also a race with preemption and lazy tlb |
| switching. Consider the kernel_execve case where the current thread is |
| using a lazy tlb active mm: |
| |
|   call_usermodehelper() |
|     kernel_execve() |
|       old_mm = current->mm; |
|       active_mm = current->active_mm; |
|       *** preempt *** -------------------->  schedule() |
|                                                prev->active_mm = NULL; |
|                                                mmdrop(prev active_mm); |
|                                              ... |
|                       <--------------------  schedule() |
|       current->mm = mm; |
|       current->active_mm = mm; |
|       if (!old_mm) |
|           mmdrop(active_mm); |
| |
| If we switch back to the kernel thread from a different mm, there is a |
| double free of the old active_mm, and a missing free of the new one. |
| |
| Closing this race only requires interrupts to be disabled while ->mm |
| and ->active_mm are being switched, but the TLB problem requires also |
| holding interrupts off over activate_mm. Unfortunately not all archs |
| can do that yet, e.g., arm defers the switch if irqs are disabled and |
| expects finish_arch_post_lock_switch() to be called to complete the |
| flush; um takes a blocking lock in activate_mm(). |
| |
| So as a first step, disable interrupts across the mm/active_mm updates |
| to close the lazy tlb preempt race, and provide an arch option to |
| extend that to activate_mm which allows architectures doing IPI based |
| TLB shootdowns to close the second race. |
| |
| This is a bit ugly, but in the interest of fixing the bug and backporting |
| before all architectures are converted this is a compromise. |
| |
| Signed-off-by: Nicholas Piggin <npiggin@gmail.com> |
| Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org> |
| Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> |
| Link: https://lore.kernel.org/r/20200914045219.3736466-2-npiggin@gmail.com |
| Signed-off-by: Sasha Levin <sashal@kernel.org> |
| --- |
| arch/Kconfig | 7 +++++++ |
| fs/exec.c | 17 +++++++++++++++-- |
| 2 files changed, 22 insertions(+), 2 deletions(-) |
| |
| diff --git a/arch/Kconfig b/arch/Kconfig |
| index af14a567b493f..94821e3f94d16 100644 |
| --- a/arch/Kconfig |
| +++ b/arch/Kconfig |
| @@ -414,6 +414,13 @@ config MMU_GATHER_NO_GATHER |
| bool |
| depends on MMU_GATHER_TABLE_FREE |
| |
| +config ARCH_WANT_IRQS_OFF_ACTIVATE_MM |
| + bool |
| + help |
| + Temporary select until all architectures can be converted to have |
| + irqs disabled over activate_mm. Architectures that do IPI based TLB |
| + shootdowns should enable this. |
| + |
| config ARCH_HAVE_NMI_SAFE_CMPXCHG |
| bool |
| |
| diff --git a/fs/exec.c b/fs/exec.c |
| index 07910f5032e74..3622681489864 100644 |
| --- a/fs/exec.c |
| +++ b/fs/exec.c |
| @@ -1131,11 +1131,24 @@ static int exec_mmap(struct mm_struct *mm) |
| } |
| |
| task_lock(tsk); |
| - active_mm = tsk->active_mm; |
| membarrier_exec_mmap(mm); |
| - tsk->mm = mm; |
| + |
| + local_irq_disable(); |
| + active_mm = tsk->active_mm; |
| tsk->active_mm = mm; |
| + tsk->mm = mm; |
| + /* |
| + * This prevents preemption while active_mm is being loaded and |
| + * it and mm are being updated, which could cause problems for |
| + * lazy tlb mm refcounting when these are updated by context |
| + * switches. Not all architectures can handle irqs off over |
| + * activate_mm yet. |
| + */ |
| + if (!IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM)) |
| + local_irq_enable(); |
| activate_mm(active_mm, mm); |
| + if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM)) |
| + local_irq_enable(); |
| tsk->mm->vmacache_seqnum = 0; |
| vmacache_flush(tsk); |
| task_unlock(tsk); |
| -- |
| 2.27.0 |
| |