| From foo@baz Wed Jan 3 18:58:12 CET 2018 |
| From: Dave Hansen <dave.hansen@linux.intel.com> |
| Date: Wed, 30 Aug 2017 16:23:00 -0700 |
| Subject: kaiser: enhanced by kernel and user PCIDs |
| |
| From: Dave Hansen <dave.hansen@linux.intel.com> |
| |
| |
| Merged performance improvements to Kaiser, using distinct kernel |
| and user Process Context Identifiers to minimize the TLB flushing. |
| |
| Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com> |
| Signed-off-by: Hugh Dickins <hughd@google.com> |
| Acked-by: Jiri Kosina <jkosina@suse.cz> |
| Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> |
| --- |
| arch/x86/entry/entry_64.S | 10 ++++- |
| arch/x86/entry/entry_64_compat.S | 1 |
| arch/x86/include/asm/cpufeature.h | 1 |
| arch/x86/include/asm/kaiser.h | 15 ++++++- |
| arch/x86/include/asm/pgtable_types.h | 26 +++++++++++++ |
| arch/x86/include/asm/tlbflush.h | 54 +++++++++++++++++++++++----- |
| arch/x86/include/uapi/asm/processor-flags.h | 3 + |
| arch/x86/kernel/cpu/common.c | 34 +++++++++++++++++ |
| arch/x86/kvm/x86.c | 3 + |
| arch/x86/mm/kaiser.c | 7 +++ |
| arch/x86/mm/tlb.c | 46 ++++++++++++++++++++++- |
| 11 files changed, 182 insertions(+), 18 deletions(-) |
| |
| --- a/arch/x86/entry/entry_64.S |
| +++ b/arch/x86/entry/entry_64.S |
| @@ -1291,7 +1291,10 @@ ENTRY(nmi) |
| /* %rax is saved above, so OK to clobber here */ |
| movq %cr3, %rax |
| pushq %rax |
| - andq $(~KAISER_SHADOW_PGD_OFFSET), %rax |
| + /* mask off "user" bit of pgd address and 12 PCID bits: */ |
| + andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax |
| + /* Add back kernel PCID and "no flush" bit */ |
| + orq X86_CR3_PCID_KERN_VAR, %rax |
| movq %rax, %cr3 |
| #endif |
| call do_nmi |
| @@ -1532,7 +1535,10 @@ end_repeat_nmi: |
| /* %rax is saved above, so OK to clobber here */ |
| movq %cr3, %rax |
| pushq %rax |
| - andq $(~KAISER_SHADOW_PGD_OFFSET), %rax |
| + /* mask off "user" bit of pgd address and 12 PCID bits: */ |
| + andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax |
| + /* Add back kernel PCID and "no flush" bit */ |
| + orq X86_CR3_PCID_KERN_VAR, %rax |
| movq %rax, %cr3 |
| #endif |
| |
| --- a/arch/x86/entry/entry_64_compat.S |
| +++ b/arch/x86/entry/entry_64_compat.S |
| @@ -13,6 +13,7 @@ |
| #include <asm/irqflags.h> |
| #include <asm/asm.h> |
| #include <asm/smap.h> |
| +#include <asm/pgtable_types.h> |
| #include <asm/kaiser.h> |
| #include <linux/linkage.h> |
| #include <linux/err.h> |
| --- a/arch/x86/include/asm/cpufeature.h |
| +++ b/arch/x86/include/asm/cpufeature.h |
| @@ -187,6 +187,7 @@ |
| #define X86_FEATURE_ARAT ( 7*32+ 1) /* Always Running APIC Timer */ |
| #define X86_FEATURE_CPB ( 7*32+ 2) /* AMD Core Performance Boost */ |
| #define X86_FEATURE_EPB ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */ |
| +#define X86_FEATURE_INVPCID_SINGLE ( 7*32+ 4) /* Effectively INVPCID && CR4.PCIDE=1 */ |
| #define X86_FEATURE_PLN ( 7*32+ 5) /* Intel Power Limit Notification */ |
| #define X86_FEATURE_PTS ( 7*32+ 6) /* Intel Package Thermal Status */ |
| #define X86_FEATURE_DTHERM ( 7*32+ 7) /* Digital Thermal Sensor */ |
| --- a/arch/x86/include/asm/kaiser.h |
| +++ b/arch/x86/include/asm/kaiser.h |
| @@ -1,5 +1,8 @@ |
| #ifndef _ASM_X86_KAISER_H |
| #define _ASM_X86_KAISER_H |
| + |
| +#include <uapi/asm/processor-flags.h> /* For PCID constants */ |
| + |
| /* |
| * This file includes the definitions for the KAISER feature. |
| * KAISER is a counter measure against x86_64 side channel attacks on |
| @@ -21,13 +24,21 @@ |
| |
| .macro _SWITCH_TO_KERNEL_CR3 reg |
| movq %cr3, \reg |
| -andq $(~KAISER_SHADOW_PGD_OFFSET), \reg |
| +andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), \reg |
| +orq X86_CR3_PCID_KERN_VAR, \reg |
| movq \reg, %cr3 |
| .endm |
| |
| .macro _SWITCH_TO_USER_CR3 reg |
| movq %cr3, \reg |
| -orq $(KAISER_SHADOW_PGD_OFFSET), \reg |
| +andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), \reg |
| +/* |
| + * This can obviously be one instruction by putting the |
| + * KAISER_SHADOW_PGD_OFFSET bit in the X86_CR3_PCID_USER_VAR. |
| + * But, just leave it now for simplicity. |
| + */ |
| +orq X86_CR3_PCID_USER_VAR, \reg |
| +orq $(KAISER_SHADOW_PGD_OFFSET), \reg |
| movq \reg, %cr3 |
| .endm |
| |
| --- a/arch/x86/include/asm/pgtable_types.h |
| +++ b/arch/x86/include/asm/pgtable_types.h |
| @@ -106,6 +106,32 @@ |
| _PAGE_SOFT_DIRTY) |
| #define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE) |
| |
| +/* The ASID is the lower 12 bits of CR3 */ |
| +#define X86_CR3_PCID_ASID_MASK (_AC((1<<12)-1,UL)) |
| + |
| +/* Mask for all the PCID-related bits in CR3: */ |
| +#define X86_CR3_PCID_MASK (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_MASK) |
| +#if defined(CONFIG_KAISER) && defined(CONFIG_X86_64) |
| +#define X86_CR3_PCID_ASID_KERN (_AC(0x4,UL)) |
| +#define X86_CR3_PCID_ASID_USER (_AC(0x6,UL)) |
| + |
| +#define X86_CR3_PCID_KERN_FLUSH (X86_CR3_PCID_ASID_KERN) |
| +#define X86_CR3_PCID_USER_FLUSH (X86_CR3_PCID_ASID_USER) |
| +#define X86_CR3_PCID_KERN_NOFLUSH (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_KERN) |
| +#define X86_CR3_PCID_USER_NOFLUSH (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_USER) |
| +#else |
| +#define X86_CR3_PCID_ASID_KERN (_AC(0x0,UL)) |
| +#define X86_CR3_PCID_ASID_USER (_AC(0x0,UL)) |
| +/* |
| + * PCIDs are unsupported on 32-bit and none of these bits can be |
| + * set in CR3: |
| + */ |
| +#define X86_CR3_PCID_KERN_FLUSH (0) |
| +#define X86_CR3_PCID_USER_FLUSH (0) |
| +#define X86_CR3_PCID_KERN_NOFLUSH (0) |
| +#define X86_CR3_PCID_USER_NOFLUSH (0) |
| +#endif |
| + |
| /* |
| * The cache modes defined here are used to translate between pure SW usage |
| * and the HW defined cache mode bits and/or PAT entries. |
| --- a/arch/x86/include/asm/tlbflush.h |
| +++ b/arch/x86/include/asm/tlbflush.h |
| @@ -12,7 +12,6 @@ static inline void __invpcid(unsigned lo |
| unsigned long type) |
| { |
| struct { u64 d[2]; } desc = { { pcid, addr } }; |
| - |
| /* |
| * The memory clobber is because the whole point is to invalidate |
| * stale TLB entries and, especially if we're flushing global |
| @@ -133,14 +132,25 @@ static inline void cr4_set_bits_and_upda |
| |
| static inline void __native_flush_tlb(void) |
| { |
| + if (!cpu_feature_enabled(X86_FEATURE_INVPCID)) { |
| + /* |
| + * If current->mm == NULL then we borrow a mm which may change during a |
| + * task switch and therefore we must not be preempted while we write CR3 |
| + * back: |
| + */ |
| + preempt_disable(); |
| + native_write_cr3(native_read_cr3()); |
| + preempt_enable(); |
| + return; |
| + } |
| /* |
| - * If current->mm == NULL then we borrow a mm which may change during a |
| - * task switch and therefore we must not be preempted while we write CR3 |
| - * back: |
| - */ |
| - preempt_disable(); |
| - native_write_cr3(native_read_cr3()); |
| - preempt_enable(); |
| + * We are no longer using globals with KAISER, so a |
| + * "nonglobals" flush would work too. But, this is more |
| + * conservative. |
| + * |
| + * Note, this works with CR4.PCIDE=0 or 1. |
| + */ |
| + invpcid_flush_all(); |
| } |
| |
| static inline void __native_flush_tlb_global_irq_disabled(void) |
| @@ -162,6 +172,8 @@ static inline void __native_flush_tlb_gl |
| /* |
| * Using INVPCID is considerably faster than a pair of writes |
| * to CR4 sandwiched inside an IRQ flag save/restore. |
| + * |
| + * Note, this works with CR4.PCIDE=0 or 1. |
| */ |
| invpcid_flush_all(); |
| return; |
| @@ -181,7 +193,31 @@ static inline void __native_flush_tlb_gl |
| |
| static inline void __native_flush_tlb_single(unsigned long addr) |
| { |
| - asm volatile("invlpg (%0)" ::"r" (addr) : "memory"); |
| + /* |
| + * SIMICS #GP's if you run INVPCID with type 2/3 |
| + * and X86_CR4_PCIDE clear. Shame! |
| + * |
| + * The ASIDs used below are hard-coded. But, we must not |
| + * call invpcid(type=1/2) before CR4.PCIDE=1. Just call |
| + * invlpg in the case we are called early. |
| + */ |
| + if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE)) { |
| + asm volatile("invlpg (%0)" ::"r" (addr) : "memory"); |
| + return; |
| + } |
| + /* Flush the address out of both PCIDs. */ |
| + /* |
| + * An optimization here might be to determine addresses |
| + * that are only kernel-mapped and only flush the kernel |
| + * ASID. But, userspace flushes are probably much more |
| + * important performance-wise. |
| + * |
| + * Make sure to do only a single invpcid when KAISER is |
| + * disabled and we have only a single ASID. |
| + */ |
| + if (X86_CR3_PCID_ASID_KERN != X86_CR3_PCID_ASID_USER) |
| + invpcid_flush_one(X86_CR3_PCID_ASID_KERN, addr); |
| + invpcid_flush_one(X86_CR3_PCID_ASID_USER, addr); |
| } |
| |
| static inline void __flush_tlb_all(void) |
| --- a/arch/x86/include/uapi/asm/processor-flags.h |
| +++ b/arch/x86/include/uapi/asm/processor-flags.h |
| @@ -77,7 +77,8 @@ |
| #define X86_CR3_PWT _BITUL(X86_CR3_PWT_BIT) |
| #define X86_CR3_PCD_BIT 4 /* Page Cache Disable */ |
| #define X86_CR3_PCD _BITUL(X86_CR3_PCD_BIT) |
| -#define X86_CR3_PCID_MASK _AC(0x00000fff,UL) /* PCID Mask */ |
| +#define X86_CR3_PCID_NOFLUSH_BIT 63 /* Preserve old PCID */ |
| +#define X86_CR3_PCID_NOFLUSH _BITULL(X86_CR3_PCID_NOFLUSH_BIT) |
| |
| /* |
| * Intel CPU features in CR4 |
| --- a/arch/x86/kernel/cpu/common.c |
| +++ b/arch/x86/kernel/cpu/common.c |
| @@ -321,11 +321,45 @@ static __always_inline void setup_smap(s |
| } |
| } |
| |
| +/* |
| + * These can have bit 63 set, so we can not just use a plain "or" |
| + * instruction to get their value or'd into CR3. It would take |
| + * another register. So, we use a memory reference to these |
| + * instead. |
| + * |
| + * This is also handy because systems that do not support |
| + * PCIDs just end up or'ing a 0 into their CR3, which does |
| + * no harm. |
| + */ |
| +__aligned(PAGE_SIZE) unsigned long X86_CR3_PCID_KERN_VAR = 0; |
| +__aligned(PAGE_SIZE) unsigned long X86_CR3_PCID_USER_VAR = 0; |
| + |
| static void setup_pcid(struct cpuinfo_x86 *c) |
| { |
| if (cpu_has(c, X86_FEATURE_PCID)) { |
| if (cpu_has(c, X86_FEATURE_PGE)) { |
| cr4_set_bits(X86_CR4_PCIDE); |
| + /* |
| + * These variables are used by the entry/exit |
| + * code to change PCIDs. |
| + */ |
| +#ifdef CONFIG_KAISER |
| + X86_CR3_PCID_KERN_VAR = X86_CR3_PCID_KERN_NOFLUSH; |
| + X86_CR3_PCID_USER_VAR = X86_CR3_PCID_USER_NOFLUSH; |
| +#endif |
| + /* |
| + * INVPCID has two "groups" of types: |
| + * 1/2: Invalidate an individual address |
| + * 3/4: Invalidate all contexts |
| + * |
| + * 1/2 take a PCID, but 3/4 do not. So, 3/4 |
| + * ignore the PCID argument in the descriptor. |
| + * But, we have to be careful not to call 1/2 |
| + * with an actual non-zero PCID in them before |
| + * we do the above cr4_set_bits(). |
| + */ |
| + if (cpu_has(c, X86_FEATURE_INVPCID)) |
| + set_cpu_cap(c, X86_FEATURE_INVPCID_SINGLE); |
| } else { |
| /* |
| * flush_tlb_all(), as currently implemented, won't |
| --- a/arch/x86/kvm/x86.c |
| +++ b/arch/x86/kvm/x86.c |
| @@ -759,7 +759,8 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, u |
| return 1; |
| |
| /* PCID can not be enabled when cr3[11:0]!=000H or EFER.LMA=0 */ |
| - if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu)) |
| + if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_ASID_MASK) || |
| + !is_long_mode(vcpu)) |
| return 1; |
| } |
| |
| --- a/arch/x86/mm/kaiser.c |
| +++ b/arch/x86/mm/kaiser.c |
| @@ -240,6 +240,8 @@ static void __init kaiser_init_all_pgds( |
| } while (0) |
| |
| extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[]; |
| +extern unsigned long X86_CR3_PCID_KERN_VAR; |
| +extern unsigned long X86_CR3_PCID_USER_VAR; |
| /* |
| * If anything in here fails, we will likely die on one of the |
| * first kernel->user transitions and init will die. But, we |
| @@ -290,6 +292,11 @@ void __init kaiser_init(void) |
| kaiser_add_user_map_early(&debug_idt_table, |
| sizeof(gate_desc) * NR_VECTORS, |
| __PAGE_KERNEL); |
| + |
| + kaiser_add_user_map_early(&X86_CR3_PCID_KERN_VAR, PAGE_SIZE, |
| + __PAGE_KERNEL); |
| + kaiser_add_user_map_early(&X86_CR3_PCID_USER_VAR, PAGE_SIZE, |
| + __PAGE_KERNEL); |
| } |
| |
| /* Add a mapping to the shadow mapping, and synchronize the mappings */ |
| --- a/arch/x86/mm/tlb.c |
| +++ b/arch/x86/mm/tlb.c |
| @@ -34,6 +34,46 @@ struct flush_tlb_info { |
| unsigned long flush_end; |
| }; |
| |
| +static void load_new_mm_cr3(pgd_t *pgdir) |
| +{ |
| + unsigned long new_mm_cr3 = __pa(pgdir); |
| + |
| + /* |
| + * KAISER, plus PCIDs needs some extra work here. But, |
| + * if either feature is not present, we need no |
| + * PCIDs here and just do a normal, full TLB flush with |
| + * the write_cr3() |
| + */ |
| + if (!IS_ENABLED(CONFIG_KAISER) || |
| + !cpu_feature_enabled(X86_FEATURE_PCID)) |
| + goto out_set_cr3; |
| + /* |
| + * We reuse the same PCID for different tasks, so we must |
| + * flush all the entries for the PCID out when we change |
| + * tasks. |
| + */ |
| + new_mm_cr3 = X86_CR3_PCID_KERN_FLUSH | __pa(pgdir); |
| + |
| + /* |
| + * The flush from load_cr3() may leave old TLB entries |
| + * for userspace in place. We must flush that context |
| + * separately. We can theoretically delay doing this |
| + * until we actually load up the userspace CR3, but |
| + * that's a bit tricky. We have to have the "need to |
| + * flush userspace PCID" bit per-cpu and check it in the |
| + * exit-to-userspace paths. |
| + */ |
| + invpcid_flush_single_context(X86_CR3_PCID_ASID_USER); |
| + |
| +out_set_cr3: |
| + /* |
| + * Caution: many callers of this function expect |
| + * that load_cr3() is serializing and orders TLB |
| + * fills with respect to the mm_cpumask writes. |
| + */ |
| + write_cr3(new_mm_cr3); |
| +} |
| + |
| /* |
| * We cannot call mmdrop() because we are in interrupt context, |
| * instead update mm->cpu_vm_mask. |
| @@ -45,7 +85,7 @@ void leave_mm(int cpu) |
| BUG(); |
| if (cpumask_test_cpu(cpu, mm_cpumask(active_mm))) { |
| cpumask_clear_cpu(cpu, mm_cpumask(active_mm)); |
| - load_cr3(swapper_pg_dir); |
| + load_new_mm_cr3(swapper_pg_dir); |
| /* |
| * This gets called in the idle path where RCU |
| * functions differently. Tracing normally |
| @@ -105,7 +145,7 @@ void switch_mm_irqs_off(struct mm_struct |
| * ordering guarantee we need. |
| * |
| */ |
| - load_cr3(next->pgd); |
| + load_new_mm_cr3(next->pgd); |
| |
| trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); |
| |
| @@ -152,7 +192,7 @@ void switch_mm_irqs_off(struct mm_struct |
| * As above, load_cr3() is serializing and orders TLB |
| * fills with respect to the mm_cpumask write. |
| */ |
| - load_cr3(next->pgd); |
| + load_new_mm_cr3(next->pgd); |
| trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); |
| load_mm_cr4(next); |
| load_mm_ldt(next); |