| From d39268ad24c0fd0665d0c5cf55a7c1a0ebf94766 Mon Sep 17 00:00:00 2001 |
| From: Dave Hansen <dave.hansen@linux.intel.com> |
| Date: Fri, 18 Mar 2022 06:52:59 -0700 |
| Subject: x86/mm/tlb: Revert retpoline avoidance approach |
| |
| From: Dave Hansen <dave.hansen@linux.intel.com> |
| |
| commit d39268ad24c0fd0665d0c5cf55a7c1a0ebf94766 upstream. |
| |
| 0day reported a regression on a microbenchmark which is intended to |
| stress the TLB flushing path: |
| |
| https://lore.kernel.org/all/20220317090415.GE735@xsang-OptiPlex-9020/ |
| |
| It pointed at a commit from Nadav which intended to remove retpoline |
| overhead in the TLB flushing path by taking the 'cond'-ition in |
| on_each_cpu_cond_mask(), pre-calculating it, and incorporating it into |
| 'cpumask'. That allowed the code to use a bunch of earlier direct |
| calls instead of later indirect calls that need a retpoline. |
| |
| But, in practice, threads can go idle (and into lazy TLB mode where |
| they don't need to flush their TLB) between the early and late calls. |
| It works in this direction and not in the other because TLB-flushing |
| threads tend to hold mmap_lock for write. Contention on that lock |
| causes threads to _go_ idle right in this early/late window. |
| |
| There was not any performance data in the original commit specific |
| to the retpoline overhead. I did a few tests on a system with |
| retpolines: |
| |
| https://lore.kernel.org/all/dd8be93c-ded6-b962-50d4-96b1c3afb2b7@intel.com/ |
| |
| which showed a possible small win. But, that small win pales in |
| comparison with the bigger loss induced on non-retpoline systems. |
| |
| Revert the patch that removed the retpolines. This was not a |
| clean revert, but it was self-contained enough not to be too painful. |
| |
| Fixes: 6035152d8eeb ("x86/mm/tlb: Open-code on_each_cpu_cond_mask() for tlb_is_not_lazy()") |
| Reported-by: kernel test robot <oliver.sang@intel.com> |
| Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com> |
| Signed-off-by: Borislav Petkov <bp@suse.de> |
| Acked-by: Nadav Amit <namit@vmware.com> |
| Cc: <stable@vger.kernel.org> |
| Link: https://lkml.kernel.org/r/164874672286.389.7021457716635788197.tip-bot2@tip-bot2 |
| Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> |
| --- |
| arch/x86/mm/tlb.c | 37 +++++-------------------------------- |
| 1 file changed, 5 insertions(+), 32 deletions(-) |
| |
| --- a/arch/x86/mm/tlb.c |
| +++ b/arch/x86/mm/tlb.c |
| @@ -854,13 +854,11 @@ done: |
| nr_invalidate); |
| } |
| |
| -static bool tlb_is_not_lazy(int cpu) |
| +static bool tlb_is_not_lazy(int cpu, void *data) |
| { |
| return !per_cpu(cpu_tlbstate_shared.is_lazy, cpu); |
| } |
| |
| -static DEFINE_PER_CPU(cpumask_t, flush_tlb_mask); |
| - |
| DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state_shared, cpu_tlbstate_shared); |
| EXPORT_PER_CPU_SYMBOL(cpu_tlbstate_shared); |
| |
| @@ -889,36 +887,11 @@ STATIC_NOPV void native_flush_tlb_multi( |
| * up on the new contents of what used to be page tables, while |
| * doing a speculative memory access. |
| */ |
| - if (info->freed_tables) { |
| + if (info->freed_tables) |
| on_each_cpu_mask(cpumask, flush_tlb_func, (void *)info, true); |
| - } else { |
| - /* |
| - * Although we could have used on_each_cpu_cond_mask(), |
| - * open-coding it has performance advantages, as it eliminates |
| - * the need for indirect calls or retpolines. In addition, it |
| - * allows to use a designated cpumask for evaluating the |
| - * condition, instead of allocating one. |
| - * |
| - * This code works under the assumption that there are no nested |
| - * TLB flushes, an assumption that is already made in |
| - * flush_tlb_mm_range(). |
| - * |
| - * cond_cpumask is logically a stack-local variable, but it is |
| - * more efficient to have it off the stack and not to allocate |
| - * it on demand. Preemption is disabled and this code is |
| - * non-reentrant. |
| - */ |
| - struct cpumask *cond_cpumask = this_cpu_ptr(&flush_tlb_mask); |
| - int cpu; |
| - |
| - cpumask_clear(cond_cpumask); |
| - |
| - for_each_cpu(cpu, cpumask) { |
| - if (tlb_is_not_lazy(cpu)) |
| - __cpumask_set_cpu(cpu, cond_cpumask); |
| - } |
| - on_each_cpu_mask(cond_cpumask, flush_tlb_func, (void *)info, true); |
| - } |
| + else |
| + on_each_cpu_cond_mask(tlb_is_not_lazy, flush_tlb_func, |
| + (void *)info, 1, cpumask); |
| } |
| |
| void flush_tlb_multi(const struct cpumask *cpumask, |