| From e6a72b7daeeb521753803550f0ed711152bb2555 Mon Sep 17 00:00:00 2001 |
| From: Eugeniy Paltsev <Eugeniy.Paltsev@synopsys.com> |
| Date: Mon, 14 Jan 2019 18:16:48 +0300 |
| Subject: ARCv2: lib: memset: fix doing prefetchw outside of buffer
| |
| From: Eugeniy Paltsev <Eugeniy.Paltsev@synopsys.com> |
| |
| commit e6a72b7daeeb521753803550f0ed711152bb2555 upstream. |
| |
| ARCv2 optimized memset uses the PREFETCHW instruction to prefetch the
| next cache line, but doesn't ensure that the line is not past the end
| of the buffer. PREFETCHW changes the line ownership and marks it dirty,
| which can cause issues in SMP configs when the next line is already
| owned by another core. Fix the issue by avoiding the out-of-bounds
| PREFETCHW.
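| 
| For illustration, the safety condition the old code was missing can be
| written as a small C check (a sketch only, not kernel code;
| prefetch_past_end is a made-up helper, assuming a 64-byte L1 line):
| 
|   #include <stdbool.h>
|   #include <stddef.h>
|   #include <stdint.h>
| 
|   #define L1_LINE 64UL
| 
|   /* true if the cache line containing buf+off starts at or past the
|    * end of the buffer, i.e. memset would never write a byte of it and
|    * a PREFETCHW of it needlessly dirties a line another core may own */
|   static bool prefetch_past_end(uintptr_t buf, size_t len, size_t off)
|   {
|           uintptr_t line = (buf + off) & ~(L1_LINE - 1);
| 
|           return line >= buf + len;
|   }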
| |
| Some more details: |
| |
| The current code has 3 logical loops (ignoring the unaligned part):
| (a) Big loop for doing aligned 64 bytes per iteration with PREALLOC |
| (b) Loop for 32 x 2 bytes with PREFETCHW |
| (c) any left over bytes |
| |
| Loop (a) was already eliding the last 64 bytes, so PREALLOC was safe.
| The fix is removing PREFETCHW from (b).
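| 
| A rough C model of that structure (an illustrative sketch, not the
| actual assembly; memset_model is a made-up name):
| 
|   #include <stddef.h>
|   #include <string.h>
| 
|   void memset_model(unsigned char *p, int c, size_t n)
|   {
|           size_t i = 0;
| 
|           /* (a) 64B chunks, stopping 64B short of the end: the
|            * PREALLOC of p+i+64 always targets bytes that a later
|            * loop will still write, so it stays inside the buffer */
|           for (; i + 128 <= n; i += 64)
|                   memset(p + i, c, 64);   /* + PREALLOC of p+i+64 */
| 
|           /* (b) 32B chunks: a PREFETCHW of p+i+32 here can point
|            * past the end of the buffer (e.g. n = 96, i = 64),
|            * hence it was removed */
|           for (; i + 32 <= n; i += 32)
|                   memset(p + i, c, 32);
| 
|           /* (c) leftover tail bytes */
|           memset(p + i, c, n - i);
|   }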
| |
| Another potential issue (applicable to configs with a 32 or 128 byte L1
| cache line) is that PREALLOC assumes a 64 byte cache line and may not
| do the right thing, especially for 32B. While it would be easy to
| adapt, there are no known configs with those line sizes, so for now,
| just compile out PREALLOC in such cases.
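| 
| To see why the line size matters: PREALLOC claims a whole L1 line
| without fetching its old contents, which can be modelled in C roughly
| as below (a sketch; prealloc_model is a made-up name, and the
| zero-fill is a simplification of the real semantics):
| 
|   #include <stddef.h>
|   #include <stdint.h>
|   #include <string.h>
| 
|   /* crude model of PREALLOC: take ownership of the whole L1 line
|    * containing addr and initialize it without reading memory */
|   static void prealloc_model(uintptr_t addr, size_t line_size)
|   {
|           uintptr_t line = addr & ~(uintptr_t)(line_size - 1);
| 
|           memset((void *)line, 0, line_size);
|   }
| 
| With line_size == 128, a PREALLOC aimed 64 bytes ahead of the current
| chunk can claim (and clobber) up to 64 bytes past the end of the
| buffer; hence the macros below are compiled out entirely unless
| L1_CACHE_SHIFT == 6.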
| |
| Signed-off-by: Eugeniy Paltsev <Eugeniy.Paltsev@synopsys.com> |
| Cc: stable@vger.kernel.org #4.4+ |
| Signed-off-by: Vineet Gupta <vgupta@synopsys.com> |
| [vgupta: rewrote changelog, used asm .macro vs. "C" macro] |
| Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> |
| |
| --- |
| arch/arc/lib/memset-archs.S | 40 ++++++++++++++++++++++++++++++++-------- |
| 1 file changed, 32 insertions(+), 8 deletions(-) |
| |
| --- a/arch/arc/lib/memset-archs.S |
| +++ b/arch/arc/lib/memset-archs.S |
| @@ -7,11 +7,39 @@ |
| */ |
| |
| #include <linux/linkage.h> |
| +#include <asm/cache.h> |
| |
| -#undef PREALLOC_NOT_AVAIL |
| +/*
| + * The memset implementation below is optimized to use prefetchw and
| + * prealloc instructions in case of a CPU with a 64B L1 data cache line
| + * (L1_CACHE_SHIFT == 6). If you want to implement an optimized memset
| + * for other possible L1 data cache line lengths (32B and 128B), rewrite
| + * the code carefully, checking that no prefetchw/prealloc instruction
| + * is issued for L1 cache lines which don't belong to the memset area.
| + */
| + |
| +#if L1_CACHE_SHIFT == 6 |
| + |
| +.macro PREALLOC_INSTR reg, off |
| + prealloc [\reg, \off] |
| +.endm |
| + |
| +.macro PREFETCHW_INSTR reg, off |
| + prefetchw [\reg, \off] |
| +.endm |
| + |
| +#else |
| + |
| +.macro PREALLOC_INSTR |
| +.endm |
| + |
| +.macro PREFETCHW_INSTR |
| +.endm |
| + |
| +#endif |
| |
| ENTRY_CFI(memset) |
| - prefetchw [r0] ; Prefetch the write location |
| + PREFETCHW_INSTR r0, 0 ; Prefetch the first write location |
| mov.f 0, r2 |
| ;;; if size is zero |
| jz.d [blink] |
| @@ -48,11 +76,8 @@ ENTRY_CFI(memset) |
| |
| lpnz @.Lset64bytes |
| ;; LOOP START |
| -#ifdef PREALLOC_NOT_AVAIL |
| - prefetchw [r3, 64] ;Prefetch the next write location |
| -#else |
| - prealloc [r3, 64] |
| -#endif |
| + PREALLOC_INSTR r3, 64 ; alloc next line w/o fetching |
| + |
| #ifdef CONFIG_ARC_HAS_LL64 |
| std.ab r4, [r3, 8] |
| std.ab r4, [r3, 8] |
| @@ -85,7 +110,6 @@ ENTRY_CFI(memset) |
| lsr.f lp_count, r2, 5 ;Last remaining max 124 bytes |
| lpnz .Lset32bytes |
| ;; LOOP START |
| - prefetchw [r3, 32] ;Prefetch the next write location |
| #ifdef CONFIG_ARC_HAS_LL64 |
| std.ab r4, [r3, 8] |
| std.ab r4, [r3, 8] |