| From: Hugh Dickins <hughd@google.com> |
| Date: Mon, 11 Dec 2017 17:59:50 -0800 |
| Subject: KAISER: Kernel Address Isolation |
| |
| This patch introduces our implementation of KAISER (Kernel Address |
| Isolation to have Side-channels Efficiently Removed), a kernel isolation |
| technique to close hardware side channels on kernel address information. |
| |
| More information about the original patch can be found at: |
| https://github.com/IAIK/KAISER |
| http://marc.info/?l=linux-kernel&m=149390087310405&w=2 |
| |
| Daniel Gruss <daniel.gruss@iaik.tugraz.at> |
| Richard Fellner <richard.fellner@student.tugraz.at> |
| Michael Schwarz <michael.schwarz@iaik.tugraz.at> |
| <clementine.maurice@iaik.tugraz.at> |
| <moritz.lipp@iaik.tugraz.at> |
| |
| That original was then developed further by |
| Dave Hansen <dave.hansen@intel.com> |
| Hugh Dickins <hughd@google.com> |
| then others after this snapshot. |
| |
| This combined patch for 3.2.96 was derived from hughd's patches below |
| for 3.18.72, in 2017-12-04's kaiser-3.18.72.tar; except for the last, |
| which was sent in 2017-12-09's nokaiser-3.18.72.tar. They have been |
| combined in order to minimize the effort of rebasing: most of the |
| patches in the 3.18.72 series were small fixes and cleanups and |
| enhancements to three large patches. About the only new work in this |
| backport is a simple reimplementation of kaiser_remove_mapping(), |
| since mm/pageattr.c changed a lot between 3.2 and 3.18, and the |
| mods there for Kaiser never seemed necessary anyway. |
| |
| KAISER: Kernel Address Isolation |
| kaiser: merged update |
| kaiser: do not set _PAGE_NX on pgd_none |
| kaiser: stack map PAGE_SIZE at THREAD_SIZE-PAGE_SIZE |
| kaiser: fix build and FIXME in alloc_ldt_struct() |
| kaiser: KAISER depends on SMP |
| kaiser: fix regs to do_nmi() ifndef CONFIG_KAISER |
| kaiser: fix perf crashes |
| kaiser: ENOMEM if kaiser_pagetable_walk() NULL |
| kaiser: tidied up asm/kaiser.h somewhat |
| kaiser: tidied up kaiser_add/remove_mapping slightly |
| kaiser: kaiser_remove_mapping() move along the pgd |
| kaiser: align addition to x86/mm/Makefile |
| kaiser: cleanups while trying for gold link |
| kaiser: name that 0x1000 KAISER_SHADOW_PGD_OFFSET |
| kaiser: delete KAISER_REAL_SWITCH option |
| kaiser: vmstat show NR_KAISERTABLE as nr_overhead |
| kaiser: enhanced by kernel and user PCIDs |
| kaiser: load_new_mm_cr3() let SWITCH_USER_CR3 flush user |
| kaiser: PCID 0 for kernel and 128 for user |
| kaiser: x86_cr3_pcid_noflush and x86_cr3_pcid_user |
| kaiser: paranoid_entry pass cr3 need to paranoid_exit |
| kaiser: _pgd_alloc() without __GFP_REPEAT to avoid stalls |
| kaiser: fix unlikely error in alloc_ldt_struct() |
| kaiser: drop is_atomic arg to kaiser_pagetable_walk() |
| |
| Signed-off-by: Hugh Dickins <hughd@google.com> |
| [bwh: |
| - Fixed the #undef in arch/x86/boot/compressed/misc.h |
| - Added missing #include in arch/x86/mm/kaiser.c] |
| Signed-off-by: Ben Hutchings <ben@decadent.org.uk> |
| --- |
| arch/x86/boot/compressed/misc.h | 1 + |
| arch/x86/ia32/ia32entry.S | 7 + |
| arch/x86/include/asm/cpufeature.h | 1 + |
| arch/x86/include/asm/desc.h | 2 +- |
| arch/x86/include/asm/hw_irq.h | 2 +- |
| arch/x86/include/asm/kaiser.h | 126 ++++++++++ |
| arch/x86/include/asm/pgtable.h | 18 +- |
| arch/x86/include/asm/pgtable_64.h | 29 ++- |
| arch/x86/include/asm/pgtable_types.h | 33 ++- |
| arch/x86/include/asm/processor-flags.h | 2 + |
| arch/x86/include/asm/processor.h | 2 +- |
| arch/x86/include/asm/tlbflush.h | 64 ++++- |
| arch/x86/kernel/cpu/common.c | 18 +- |
| arch/x86/kernel/cpu/perf_event_intel_ds.c | 54 ++++- |
| arch/x86/kernel/entry_64.S | 117 +++++++-- |
| arch/x86/kernel/espfix_64.c | 9 + |
| arch/x86/kernel/head_64.S | 25 +- |
| arch/x86/kernel/init_task.c | 2 +- |
| arch/x86/kernel/irqinit.c | 2 +- |
| arch/x86/kernel/ldt.c | 25 +- |
| arch/x86/kernel/process_64.c | 2 +- |
| arch/x86/mm/Makefile | 1 + |
| arch/x86/mm/kaiser.c | 382 ++++++++++++++++++++++++++++++ |
| arch/x86/mm/pgtable.c | 31 ++- |
| arch/x86/mm/tlb.c | 48 +++- |
| include/asm-generic/vmlinux.lds.h | 7 + |
| include/linux/kaiser.h | 52 ++++ |
| include/linux/mmzone.h | 3 +- |
| include/linux/percpu-defs.h | 32 ++- |
| init/main.c | 2 + |
| kernel/fork.c | 6 + |
| mm/vmstat.c | 1 + |
| security/Kconfig | 10 + |
| 33 files changed, 1049 insertions(+), 67 deletions(-) |
| create mode 100644 arch/x86/include/asm/kaiser.h |
| create mode 100644 arch/x86/mm/kaiser.c |
| create mode 100644 include/linux/kaiser.h |
| |
| diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h |
| index 3f19c81a6203..2fa2635ee539 100644 |
| --- a/arch/x86/boot/compressed/misc.h |
| +++ b/arch/x86/boot/compressed/misc.h |
| @@ -7,6 +7,7 @@ |
| * we just keep it from happening |
| */ |
| #undef CONFIG_PARAVIRT |
| +#undef CONFIG_KAISER |
| #ifdef CONFIG_X86_32 |
| #define _ASM_X86_DESC_H 1 |
| #endif |
| diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S |
| index 2b5527726ae1..7eb0d4792800 100644 |
| --- a/arch/x86/ia32/ia32entry.S |
| +++ b/arch/x86/ia32/ia32entry.S |
| @@ -12,6 +12,8 @@ |
| #include <asm/ia32_unistd.h> |
| #include <asm/thread_info.h> |
| #include <asm/segment.h> |
| +#include <asm/pgtable_types.h> |
| +#include <asm/kaiser.h> |
| #include <asm/irqflags.h> |
| #include <linux/linkage.h> |
| |
| @@ -120,6 +122,7 @@ ENTRY(ia32_sysenter_target) |
| CFI_DEF_CFA rsp,0 |
| CFI_REGISTER rsp,rbp |
| SWAPGS_UNSAFE_STACK |
| + SWITCH_KERNEL_CR3_NO_STACK |
| movq PER_CPU_VAR(kernel_stack), %rsp |
| addq $(KERNEL_STACK_OFFSET),%rsp |
| /* |
| @@ -183,6 +186,7 @@ ENTRY(ia32_sysenter_target) |
| popq_cfi %rcx /* User %esp */ |
| CFI_REGISTER rsp,rcx |
| TRACE_IRQS_ON |
| + SWITCH_USER_CR3 |
| ENABLE_INTERRUPTS_SYSEXIT32 |
| |
| #ifdef CONFIG_AUDITSYSCALL |
| @@ -281,6 +285,7 @@ ENTRY(ia32_cstar_target) |
| CFI_REGISTER rip,rcx |
| /*CFI_REGISTER rflags,r11*/ |
| SWAPGS_UNSAFE_STACK |
| + SWITCH_KERNEL_CR3_NO_STACK |
| movl %esp,%r8d |
| CFI_REGISTER rsp,r8 |
| movq PER_CPU_VAR(kernel_stack),%rsp |
| @@ -337,6 +342,7 @@ ENTRY(ia32_cstar_target) |
| xorq %r9,%r9 |
| xorq %r8,%r8 |
| TRACE_IRQS_ON |
| + SWITCH_USER_CR3 |
| movl RSP-ARGOFFSET(%rsp),%esp |
| CFI_RESTORE rsp |
| USERGS_SYSRET32 |
| @@ -409,6 +415,7 @@ ENTRY(ia32_syscall) |
| CFI_REL_OFFSET rip,RIP-RIP |
| PARAVIRT_ADJUST_EXCEPTION_FRAME |
| SWAPGS |
| + SWITCH_KERNEL_CR3_NO_STACK |
| /* |
| * No need to follow this irqs on/off section: the syscall |
| * disabled irqs and here we enable it straight after entry: |
| diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h |
| index 6f254f2fcd40..736272670870 100644 |
| --- a/arch/x86/include/asm/cpufeature.h |
| +++ b/arch/x86/include/asm/cpufeature.h |
| @@ -176,6 +176,7 @@ |
| #define X86_FEATURE_PLN (7*32+ 5) /* Intel Power Limit Notification */ |
| #define X86_FEATURE_PTS (7*32+ 6) /* Intel Package Thermal Status */ |
| #define X86_FEATURE_DTHERM (7*32+ 7) /* Digital Thermal Sensor */ |
| +#define X86_FEATURE_INVPCID_SINGLE (7*32+ 8) /* Effectively INVPCID && CR4.PCIDE=1 */ |
| |
| /* Virtualization flags: Linux defined, word 8 */ |
| #define X86_FEATURE_TPR_SHADOW (8*32+ 0) /* Intel TPR Shadow */ |
| diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h |
| index 382ce8a9fd62..7f1ead938ec1 100644 |
| --- a/arch/x86/include/asm/desc.h |
| +++ b/arch/x86/include/asm/desc.h |
| @@ -40,7 +40,7 @@ struct gdt_page { |
| struct desc_struct gdt[GDT_ENTRIES]; |
| } __attribute__((aligned(PAGE_SIZE))); |
| |
| -DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page); |
| +DECLARE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(struct gdt_page, gdt_page); |
| |
| static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu) |
| { |
| diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h |
| index eb92a6ed2be7..3354a390cc71 100644 |
| --- a/arch/x86/include/asm/hw_irq.h |
| +++ b/arch/x86/include/asm/hw_irq.h |
| @@ -164,7 +164,7 @@ extern asmlinkage void smp_invalidate_interrupt(struct pt_regs *); |
| extern void (*__initconst interrupt[NR_VECTORS-FIRST_EXTERNAL_VECTOR])(void); |
| |
| typedef int vector_irq_t[NR_VECTORS]; |
| -DECLARE_PER_CPU(vector_irq_t, vector_irq); |
| +DECLARE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq); |
| extern void setup_vector_irq(int cpu); |
| |
| #ifdef CONFIG_X86_IO_APIC |
| diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h |
| new file mode 100644 |
| index 000000000000..6f4c8ef46881 |
| --- /dev/null |
| +++ b/arch/x86/include/asm/kaiser.h |
| @@ -0,0 +1,126 @@ |
| +#ifndef _ASM_X86_KAISER_H |
| +#define _ASM_X86_KAISER_H |
| + |
| +#include <asm/processor-flags.h> /* For PCID constants */ |
| + |
| +/* |
| + * This file includes the definitions for the KAISER feature. |
| + * KAISER is a countermeasure against x86_64 side-channel attacks on |
| + * kernel virtual memory. It keeps a shadow pgd for every process: the |
| + * shadow pgd maps only a minimal set of kernel code and data, but |
| + * includes the whole of user memory. On entry to the kernel (context |
| + * switch, interrupt, exception), the pgd is switched to the normal |
| + * kernel one; on return to user mode, the shadow pgd is used instead. |
| + * Kernel addresses thus stay out of user-visible TLB and cache state. |
| + * |
| + * The minimal kernel mapping holds only the parts that must be mapped |
| + * in user mode, such as the entry/exit code and the kernel stacks. |
| + */ |
| + |
| +#define KAISER_SHADOW_PGD_OFFSET 0x1000 |
| + |
| +#ifdef __ASSEMBLY__ |
| +#ifdef CONFIG_KAISER |
| + |
| +.macro _SWITCH_TO_KERNEL_CR3 reg |
| +movq %cr3, \reg |
| +andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), \reg |
| +orq x86_cr3_pcid_noflush, \reg |
| +movq \reg, %cr3 |
| +.endm |
| + |
| +.macro _SWITCH_TO_USER_CR3 reg regb |
| +/* |
| + * regb must be the low byte portion of reg: because we have arranged |
| + * for the low byte of the user PCID to serve as the high byte of NOFLUSH |
| + * (0x80 for each when PCID is enabled, or 0x00 when PCID and NOFLUSH are |
| + * not enabled): so that the one register can update both memory and cr3. |
| + */ |
| +movq %cr3, \reg |
| +orq PER_CPU_VAR(x86_cr3_pcid_user), \reg |
| +js 9f |
| +/* FLUSH this time, reset to NOFLUSH for next time (if PCID enabled) */ |
| +movb \regb, PER_CPU_VAR(x86_cr3_pcid_user+7) |
| +9: |
| +movq \reg, %cr3 |
| +.endm |
| + |
| +.macro SWITCH_KERNEL_CR3 |
| +pushq %rax |
| +_SWITCH_TO_KERNEL_CR3 %rax |
| +popq %rax |
| +.endm |
| + |
| +.macro SWITCH_USER_CR3 |
| +pushq %rax |
| +_SWITCH_TO_USER_CR3 %rax %al |
| +popq %rax |
| +.endm |
| + |
| +.macro SWITCH_KERNEL_CR3_NO_STACK |
| +movq %rax, PER_CPU_VAR(unsafe_stack_register_backup) |
| +_SWITCH_TO_KERNEL_CR3 %rax |
| +movq PER_CPU_VAR(unsafe_stack_register_backup), %rax |
| +.endm |
| + |
| +#else /* CONFIG_KAISER */ |
| + |
| +.macro SWITCH_KERNEL_CR3 reg |
| +.endm |
| +.macro SWITCH_USER_CR3 reg regb |
| +.endm |
| +.macro SWITCH_KERNEL_CR3_NO_STACK |
| +.endm |
| + |
| +#endif /* CONFIG_KAISER */ |
| + |
| +#else /* __ASSEMBLY__ */ |
| + |
| +#ifdef CONFIG_KAISER |
| +/* |
| + * On a kernel/user mode switch, the address space may have to be |
| + * switched before the registers have been saved on the stack. |
| + * Changing the address space needs a scratch register, so one |
| + * register is saved to and restored from per-cpu memory here. |
| + */ |
| +DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); |
| + |
| +extern unsigned long x86_cr3_pcid_noflush; |
| +DECLARE_PER_CPU(unsigned long, x86_cr3_pcid_user); |
| + |
| +extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[]; |
| + |
| +/** |
| + * kaiser_add_mapping - map a virtual memory part to the shadow (user) mapping |
| + * @addr: the start address of the range |
| + * @size: the size of the range |
| + * @flags: the mapping flags of the pages |
| + * |
| + * The mapping is global, shared by all processes, so no further |
| + * synchronization is needed. The pages must be unmapped again |
| + * manually once they are no longer needed. |
| + */ |
| +extern int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags); |
| + |
| +/** |
| + * kaiser_remove_mapping - unmap a virtual memory part of the shadow mapping |
| + * @start: the start address of the range |
| + * @size: the size of the range |
| + */ |
| +extern void kaiser_remove_mapping(unsigned long start, unsigned long size); |
| + |
| +/** |
| + * kaiser_init - Initialize the shadow mapping |
| + * |
| + * Most parts of the shadow mapping can be set up at boot |
| + * time. Only per-process things like the thread stacks |
| + * or a new LDT have to be mapped at runtime. These boot- |
| + * time mappings are permanent and never unmapped. |
| + */ |
| +extern void kaiser_init(void); |
| + |
| +#endif /* CONFIG_KAISER */ |
| + |
| +#endif /* __ASSEMBLY__ */ |
| + |
| +#endif /* _ASM_X86_KAISER_H */ |
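| |
| For reference, the effect of _SWITCH_TO_KERNEL_CR3 can be sketched in |
| C. This is illustrative only: the real switch must run in assembly, |
| before any stack or per-cpu state can be trusted, and the |
| read_cr3()/write_cr3() helpers stand in for the raw CR3 moves: |
| |
| static inline void sketch_switch_to_kernel_cr3(void) |
| { |
| 	unsigned long cr3 = read_cr3(); |
| |
| 	/* Clear the PCID bits and the shadow-pgd bit (bit 12)... */ |
| 	cr3 &= ~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET); |
| 	/* ...then set kernel PCID plus NOFLUSH; 0 if PCIDs are off. */ |
| 	cr3 |= x86_cr3_pcid_noflush; |
| 	write_cr3(cr3); |
| } |
| |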
| diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h |
| index 6be990922d4b..b1c8b8d3b02a 100644 |
| --- a/arch/x86/include/asm/pgtable.h |
| +++ b/arch/x86/include/asm/pgtable.h |
| @@ -570,7 +570,17 @@ static inline pud_t *pud_offset(pgd_t *pgd, unsigned long address) |
| |
| static inline int pgd_bad(pgd_t pgd) |
| { |
| - return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE; |
| + pgdval_t ignore_flags = _PAGE_USER; |
| + /* |
| + * We set NX on KAISER pgds that map userspace memory so |
| + * that userspace can not meaningfully use the kernel |
| + * page table by accident; it will fault on the first |
| + * instruction it tries to run. See native_set_pgd(). |
| + */ |
| + if (IS_ENABLED(CONFIG_KAISER)) |
| + ignore_flags |= _PAGE_NX; |
| + |
| + return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE; |
| } |
| |
| static inline int pgd_none(pgd_t pgd) |
| @@ -771,6 +781,12 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, |
| static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count) |
| { |
| memcpy(dst, src, count * sizeof(pgd_t)); |
| +#ifdef CONFIG_KAISER |
| + /* Clone the shadow pgd part as well */ |
| + memcpy(native_get_shadow_pgd(dst), |
| + native_get_shadow_pgd(src), |
| + count * sizeof(pgd_t)); |
| +#endif |
| } |
| |
| |
| diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h |
| index 975f709e09ae..a3bf3de9893b 100644 |
| --- a/arch/x86/include/asm/pgtable_64.h |
| +++ b/arch/x86/include/asm/pgtable_64.h |
| @@ -105,9 +105,36 @@ static inline void native_pud_clear(pud_t *pud) |
| native_set_pud(pud, native_make_pud(0)); |
| } |
| |
| +#ifdef CONFIG_KAISER |
| +extern pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd); |
| + |
| +static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp) |
| +{ |
| + return (pgd_t *)((unsigned long)pgdp | (unsigned long)PAGE_SIZE); |
| +} |
| + |
| +static inline pgd_t *native_get_normal_pgd(pgd_t *pgdp) |
| +{ |
| + return (pgd_t *)((unsigned long)pgdp & ~(unsigned long)PAGE_SIZE); |
| +} |
| +#else |
| +static inline pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd) |
| +{ |
| + return pgd; |
| +} |
| +static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp) |
| +{ |
| + return NULL; |
| +} |
| +static inline pgd_t *native_get_normal_pgd(pgd_t *pgdp) |
| +{ |
| + return pgdp; |
| +} |
| +#endif /* CONFIG_KAISER */ |
| + |
| static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd) |
| { |
| - *pgdp = pgd; |
| + *pgdp = kaiser_set_shadow_pgd(pgdp, pgd); |
| } |
| |
| static inline void native_pgd_clear(pgd_t *pgd) |
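| |
| Because the pgd allocation is order-1 and hence 8k-aligned (see |
| _pgd_alloc() in arch/x86/mm/pgtable.c below), the two accessors above |
| are plain bit-12 flips; a minimal sanity sketch of the invariant: |
| |
| 	pgd_t *kpgd = mm->pgd;	/* 8k-aligned, so bit 12 clear */ |
| 	pgd_t *spgd = native_get_shadow_pgd(kpgd);	/* kpgd + 4k */ |
| |
| 	BUG_ON(native_get_normal_pgd(spgd) != kpgd); |
| |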
| diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h |
| index 013286a10c2c..6e1315068a62 100644 |
| --- a/arch/x86/include/asm/pgtable_types.h |
| +++ b/arch/x86/include/asm/pgtable_types.h |
| @@ -39,7 +39,11 @@ |
| #define _PAGE_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED) |
| #define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY) |
| #define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE) |
| +#ifdef CONFIG_KAISER |
| +#define _PAGE_GLOBAL (_AT(pteval_t, 0)) |
| +#else |
| #define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) |
| +#endif |
| #define _PAGE_UNUSED1 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED1) |
| #define _PAGE_IOMAP (_AT(pteval_t, 1) << _PAGE_BIT_IOMAP) |
| #define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT) |
| @@ -62,7 +66,7 @@ |
| #endif |
| |
| #define _PAGE_FILE (_AT(pteval_t, 1) << _PAGE_BIT_FILE) |
| -#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) |
| +#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) |
| |
| #define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ |
| _PAGE_ACCESSED | _PAGE_DIRTY) |
| @@ -74,6 +78,33 @@ |
| _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY) |
| #define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE) |
| |
| +/* The ASID is the lower 12 bits of CR3 */ |
| +#define X86_CR3_PCID_ASID_MASK (_AC((1<<12)-1,UL)) |
| + |
| +/* Mask for all the PCID-related bits in CR3: */ |
| +#define X86_CR3_PCID_MASK (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_MASK) |
| +#define X86_CR3_PCID_ASID_KERN (_AC(0x0,UL)) |
| + |
| +#if defined(CONFIG_KAISER) && defined(CONFIG_X86_64) |
| +/* Let X86_CR3_PCID_ASID_USER be usable for the X86_CR3_PCID_NOFLUSH bit */ |
| +#define X86_CR3_PCID_ASID_USER (_AC(0x80,UL)) |
| + |
| +#define X86_CR3_PCID_KERN_FLUSH (X86_CR3_PCID_ASID_KERN) |
| +#define X86_CR3_PCID_USER_FLUSH (X86_CR3_PCID_ASID_USER) |
| +#define X86_CR3_PCID_KERN_NOFLUSH (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_KERN) |
| +#define X86_CR3_PCID_USER_NOFLUSH (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_USER) |
| +#else |
| +#define X86_CR3_PCID_ASID_USER (_AC(0x0,UL)) |
| +/* |
| + * PCIDs are unsupported on 32-bit and none of these bits can be |
| + * set in CR3: |
| + */ |
| +#define X86_CR3_PCID_KERN_FLUSH (0) |
| +#define X86_CR3_PCID_USER_FLUSH (0) |
| +#define X86_CR3_PCID_KERN_NOFLUSH (0) |
| +#define X86_CR3_PCID_USER_NOFLUSH (0) |
| +#endif |
| + |
| #define _PAGE_CACHE_MASK (_PAGE_PCD | _PAGE_PWT) |
| #define _PAGE_CACHE_WB (0) |
| #define _PAGE_CACHE_WC (_PAGE_PWT) |
| diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h |
| index a9e14a52385f..360e80d0d217 100644 |
| --- a/arch/x86/include/asm/processor-flags.h |
| +++ b/arch/x86/include/asm/processor-flags.h |
| @@ -43,6 +43,8 @@ |
| */ |
| #define X86_CR3_PWT 0x00000008 /* Page Write Through */ |
| #define X86_CR3_PCD 0x00000010 /* Page Cache Disable */ |
| +#define X86_CR3_PCID_NOFLUSH_BIT 63 /* Preserve old PCID */ |
| +#define X86_CR3_PCID_NOFLUSH (_AC(1,ULL) << X86_CR3_PCID_NOFLUSH_BIT) |
| |
| /* |
| * Intel CPU features in CR4 |
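| |
| Putting these constants together: for a process whose 8k pgd pair is |
| at physical address P, the two runtime CR3 values work out as below (a |
| worked example, assuming the CPU supports PCIDs): |
| |
| 	kernel CR3, NOFLUSH:  P            + PCID 0x00 + bit 63 |
| 	user   CR3, NOFLUSH:  (P + 0x1000) + PCID 0x80 + bit 63 |
| |
| i.e. the pgd address with bit 12 selecting the shadow half, or'd with |
| X86_CR3_PCID_KERN_NOFLUSH or X86_CR3_PCID_USER_NOFLUSH. |
| |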
| diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h |
| index f7c89e231c6c..048249e983ca 100644 |
| --- a/arch/x86/include/asm/processor.h |
| +++ b/arch/x86/include/asm/processor.h |
| @@ -266,7 +266,7 @@ struct tss_struct { |
| |
| } ____cacheline_aligned; |
| |
| -DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss); |
| +DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, init_tss); |
| |
| /* |
| * Save the original ist values for checking stack pointers during debugging |
| diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h |
| index e04cbc550424..288195901c8a 100644 |
| --- a/arch/x86/include/asm/tlbflush.h |
| +++ b/arch/x86/include/asm/tlbflush.h |
| @@ -64,27 +64,59 @@ static inline void invpcid_flush_all_nonglobals(void) |
| #define __flush_tlb_single(addr) __native_flush_tlb_single(addr) |
| #endif |
| |
| +/* |
| + * Declare a couple of kaiser interfaces here for convenience, |
| + * to avoid the need for asm/kaiser.h in unexpected places. |
| + */ |
| +#ifdef CONFIG_KAISER |
| +extern void kaiser_setup_pcid(void); |
| +extern void kaiser_flush_tlb_on_return_to_user(void); |
| +#else |
| +static inline void kaiser_setup_pcid(void) |
| +{ |
| +} |
| +static inline void kaiser_flush_tlb_on_return_to_user(void) |
| +{ |
| +} |
| +#endif |
| + |
| static inline void __native_flush_tlb(void) |
| { |
| + if (this_cpu_has(X86_FEATURE_INVPCID)) { |
| + /* |
| + * Note, this works with CR4.PCIDE=0 or 1. |
| + */ |
| + invpcid_flush_all_nonglobals(); |
| + return; |
| + } |
| + |
| /* |
| * If current->mm == NULL then we borrow a mm which may change during a |
| * task switch and therefore we must not be preempted while we write CR3 |
| * back: |
| */ |
| preempt_disable(); |
| + if (this_cpu_has(X86_FEATURE_PCID)) |
| + kaiser_flush_tlb_on_return_to_user(); |
| native_write_cr3(native_read_cr3()); |
| preempt_enable(); |
| } |
| |
| static inline void __native_flush_tlb_global(void) |
| { |
| +#ifdef CONFIG_KAISER |
| + /* Globals are not used at all */ |
| + __native_flush_tlb(); |
| +#else |
| unsigned long flags; |
| unsigned long cr4; |
| |
| - if (static_cpu_has(X86_FEATURE_INVPCID)) { |
| + if (this_cpu_has(X86_FEATURE_INVPCID)) { |
| /* |
| * Using INVPCID is considerably faster than a pair of writes |
| * to CR4 sandwiched inside an IRQ flag save/restore. |
| + * |
| + * Note, this works with CR4.PCIDE=0 or 1. |
| */ |
| invpcid_flush_all(); |
| return; |
| @@ -104,11 +136,39 @@ static inline void __native_flush_tlb_global(void) |
| native_write_cr4(cr4); |
| |
| raw_local_irq_restore(flags); |
| +#endif |
| } |
| |
| static inline void __native_flush_tlb_single(unsigned long addr) |
| { |
| - asm volatile("invlpg (%0)" ::"r" (addr) : "memory"); |
| + /* |
| + * SIMICS #GP's if you run INVPCID with type 2/3 |
| + * and X86_CR4_PCIDE clear. Shame! |
| + * |
| + * The ASIDs used below are hard-coded. But, we must not |
| + * call invpcid(type=1/2) before CR4.PCIDE=1. Just call |
| + * invlpg in the case we are called early. |
| + */ |
| + |
| + if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE)) { |
| + if (this_cpu_has(X86_FEATURE_PCID)) |
| + kaiser_flush_tlb_on_return_to_user(); |
| + asm volatile("invlpg (%0)" ::"r" (addr) : "memory"); |
| + return; |
| + } |
| + /* Flush the address out of both PCIDs. */ |
| + /* |
| + * An optimization here might be to determine addresses |
| + * that are only kernel-mapped and only flush the kernel |
| + * ASID. But, userspace flushes are probably much more |
| + * important performance-wise. |
| + * |
| + * Make sure to do only a single invpcid when KAISER is |
| + * disabled and we have only a single ASID. |
| + */ |
| + if (X86_CR3_PCID_ASID_KERN != X86_CR3_PCID_ASID_USER) |
| + invpcid_flush_one(X86_CR3_PCID_ASID_KERN, addr); |
| + invpcid_flush_one(X86_CR3_PCID_ASID_USER, addr); |
| } |
| |
| static inline void __flush_tlb_all(void) |
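| |
| The invpcid_flush_one()/invpcid_flush_all() helpers used above are |
| defined earlier in this header, outside the hunk; they all reduce to |
| the INVPCID instruction on a 16-byte in-memory descriptor, roughly as |
| below (raw opcode bytes, since binutils of this era lack an invpcid |
| mnemonic): |
| |
| static inline void __invpcid(unsigned long pcid, unsigned long addr, |
| 			     unsigned long type) |
| { |
| 	struct { u64 d[2]; } desc = { { pcid, addr } }; |
| |
| 	/* .byte encoding of: invpcid (%rcx), %rax */ |
| 	asm volatile(".byte 0x66, 0x0f, 0x38, 0x82, 0x01" |
| 		     : : "m" (desc), "a" (type), "c" (&desc) : "memory"); |
| } |
| |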
| diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c |
| index 895e4b88469c..b567c89fc628 100644 |
| --- a/arch/x86/kernel/cpu/common.c |
| +++ b/arch/x86/kernel/cpu/common.c |
| @@ -84,7 +84,7 @@ static const struct cpu_dev __cpuinitconst default_cpu = { |
| |
| static const struct cpu_dev *this_cpu __cpuinitdata = &default_cpu; |
| |
| -DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { |
| +DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(struct gdt_page, gdt_page) = { .gdt = { |
| #ifdef CONFIG_X86_64 |
| /* |
| * We need valid kernel segments for data and code in long mode too |
| @@ -319,6 +319,19 @@ static void setup_pcid(struct cpuinfo_x86 *c) |
| * SDM says that it can't be enabled in 32-bit mode. |
| */ |
| set_in_cr4(X86_CR4_PCIDE); |
| + /* |
| + * INVPCID has two "groups" of types: |
| + * 1/2: Invalidate an individual address |
| + * 3/4: Invalidate all contexts |
| + * |
| + * 1/2 take a PCID, but 3/4 do not. So, 3/4 |
| + * ignore the PCID argument in the descriptor. |
| + * But, we have to be careful not to call 1/2 |
| + * with an actual non-zero PCID in them before |
| + * we do the above set_in_cr4(). |
| + */ |
| + if (cpu_has(c, X86_FEATURE_INVPCID)) |
| + set_cpu_cap(c, X86_FEATURE_INVPCID_SINGLE); |
| } else { |
| /* |
| * flush_tlb_all(), as currently implemented, won't |
| @@ -331,6 +344,7 @@ static void setup_pcid(struct cpuinfo_x86 *c) |
| clear_cpu_cap(c, X86_FEATURE_PCID); |
| } |
| } |
| + kaiser_setup_pcid(); |
| } |
| |
| /* |
| @@ -1115,7 +1129,7 @@ static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = { |
| [DEBUG_STACK - 1] = DEBUG_STKSZ |
| }; |
| |
| -static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks |
| +DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(char, exception_stacks |
| [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]); |
| |
| /* May not be marked __init: used by software suspend */ |
| diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c |
| index 2d4e76ba2b5c..fb933cdca184 100644 |
| --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c |
| +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c |
| @@ -2,10 +2,14 @@ |
| #include <linux/types.h> |
| #include <linux/slab.h> |
| |
| +#include <asm/kaiser.h> |
| #include <asm/perf_event.h> |
| |
| #include "perf_event.h" |
| |
| +static |
| +DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct debug_store, cpu_debug_store); |
| + |
| /* The size of a BTS record in bytes: */ |
| #define BTS_RECORD_SIZE 24 |
| |
| @@ -60,6 +64,39 @@ void fini_debug_store_on_cpu(int cpu) |
| wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0); |
| } |
| |
| +static void *dsalloc(size_t size, gfp_t flags, int node) |
| +{ |
| +#ifdef CONFIG_KAISER |
| + unsigned int order = get_order(size); |
| + struct page *page; |
| + unsigned long addr; |
| + |
| + page = alloc_pages_node(node, flags | __GFP_ZERO, order); |
| + if (!page) |
| + return NULL; |
| + addr = (unsigned long)page_address(page); |
| + if (kaiser_add_mapping(addr, size, __PAGE_KERNEL) < 0) { |
| + __free_pages(page, order); |
| + addr = 0; |
| + } |
| + return (void *)addr; |
| +#else |
| + return kmalloc_node(size, flags | __GFP_ZERO, node); |
| +#endif |
| +} |
| + |
| +static void dsfree(const void *buffer, size_t size) |
| +{ |
| +#ifdef CONFIG_KAISER |
| + if (!buffer) |
| + return; |
| + kaiser_remove_mapping((unsigned long)buffer, size); |
| + free_pages((unsigned long)buffer, get_order(size)); |
| +#else |
| + kfree(buffer); |
| +#endif |
| +} |
| + |
| static int alloc_pebs_buffer(int cpu) |
| { |
| struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; |
| @@ -70,7 +107,7 @@ static int alloc_pebs_buffer(int cpu) |
| if (!x86_pmu.pebs) |
| return 0; |
| |
| - buffer = kmalloc_node(PEBS_BUFFER_SIZE, GFP_KERNEL | __GFP_ZERO, node); |
| + buffer = dsalloc(PEBS_BUFFER_SIZE, GFP_KERNEL, node); |
| if (unlikely(!buffer)) |
| return -ENOMEM; |
| |
| @@ -94,7 +131,7 @@ static void release_pebs_buffer(int cpu) |
| if (!ds || !x86_pmu.pebs) |
| return; |
| |
| - kfree((void *)(unsigned long)ds->pebs_buffer_base); |
| + dsfree((void *)(unsigned long)ds->pebs_buffer_base, PEBS_BUFFER_SIZE); |
| ds->pebs_buffer_base = 0; |
| } |
| |
| @@ -108,7 +145,7 @@ static int alloc_bts_buffer(int cpu) |
| if (!x86_pmu.bts) |
| return 0; |
| |
| - buffer = kmalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_ZERO, node); |
| + buffer = dsalloc(BTS_BUFFER_SIZE, GFP_KERNEL, node); |
| if (unlikely(!buffer)) |
| return -ENOMEM; |
| |
| @@ -132,19 +169,15 @@ static void release_bts_buffer(int cpu) |
| if (!ds || !x86_pmu.bts) |
| return; |
| |
| - kfree((void *)(unsigned long)ds->bts_buffer_base); |
| + dsfree((void *)(unsigned long)ds->bts_buffer_base, BTS_BUFFER_SIZE); |
| ds->bts_buffer_base = 0; |
| } |
| |
| static int alloc_ds_buffer(int cpu) |
| { |
| - int node = cpu_to_node(cpu); |
| - struct debug_store *ds; |
| - |
| - ds = kmalloc_node(sizeof(*ds), GFP_KERNEL | __GFP_ZERO, node); |
| - if (unlikely(!ds)) |
| - return -ENOMEM; |
| + struct debug_store *ds = per_cpu_ptr(&cpu_debug_store, cpu); |
| |
| + memset(ds, 0, sizeof(*ds)); |
| per_cpu(cpu_hw_events, cpu).ds = ds; |
| |
| return 0; |
| @@ -158,7 +191,6 @@ static void release_ds_buffer(int cpu) |
| return; |
| |
| per_cpu(cpu_hw_events, cpu).ds = NULL; |
| - kfree(ds); |
| } |
| |
| void release_ds_buffers(void) |
| diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S |
| index f6daf3cdb878..3a4356a2f156 100644 |
| --- a/arch/x86/kernel/entry_64.S |
| +++ b/arch/x86/kernel/entry_64.S |
| @@ -56,6 +56,7 @@ |
| #include <asm/ftrace.h> |
| #include <asm/percpu.h> |
| #include <asm/pgtable_types.h> |
| +#include <asm/kaiser.h> |
| |
| /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */ |
| #include <linux/elf-em.h> |
| @@ -323,6 +324,7 @@ ENDPROC(native_usergs_sysret64) |
| testl $3, CS(%rdi) |
| je 1f |
| SWAPGS |
| + SWITCH_KERNEL_CR3 |
| /* |
| * irq_count is used to check if a CPU is already on an interrupt stack |
| * or not. While this is essentially redundant with preempt_count it is |
| @@ -362,6 +364,12 @@ END(save_rest) |
| |
| /* save complete stack frame */ |
| .pushsection .kprobes.text, "ax" |
| +/* |
| + * Return: ebx=0: needs swapgs but not SWITCH_USER_CR3 in paranoid_exit |
| + * ebx=1: needs neither swapgs nor SWITCH_USER_CR3 in paranoid_exit |
| + * ebx=2: needs both swapgs and SWITCH_USER_CR3 in paranoid_exit |
| + * ebx=3: needs SWITCH_USER_CR3 but not swapgs in paranoid_exit |
| + */ |
| ENTRY(save_paranoid) |
| XCPT_FRAME 1 RDI+8 |
| cld |
| @@ -387,7 +395,25 @@ ENTRY(save_paranoid) |
| js 1f /* negative -> in kernel */ |
| SWAPGS |
| xorl %ebx,%ebx |
| -1: ret |
| +1: |
| +#ifdef CONFIG_KAISER |
| + /* |
| + * We might have come in between a swapgs and a SWITCH_KERNEL_CR3 |
| + * on entry, or between a SWITCH_USER_CR3 and a swapgs on exit. |
| + * Do a conditional SWITCH_KERNEL_CR3: this could safely be done |
| + * unconditionally, but we need to find out whether the reverse |
| + * should be done on return (conveyed to paranoid_exit in %ebx). |
| + */ |
| + movq %cr3, %rax |
| + testl $KAISER_SHADOW_PGD_OFFSET, %eax |
| + jz 2f |
| + orl $2, %ebx |
| + andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax |
| + orq x86_cr3_pcid_noflush, %rax |
| + movq %rax, %cr3 |
| +2: |
| +#endif |
| + ret |
| CFI_ENDPROC |
| END(save_paranoid) |
| .popsection |
| @@ -464,6 +490,7 @@ ENTRY(system_call) |
| CFI_REGISTER rip,rcx |
| /*CFI_REGISTER rflags,r11*/ |
| SWAPGS_UNSAFE_STACK |
| + SWITCH_KERNEL_CR3_NO_STACK |
| /* |
| * A hypervisor implementation might want to use a label |
| * after the swapgs, so that it can do the swapgs |
| @@ -515,6 +542,14 @@ ENTRY(system_call_after_swapgs) |
| CFI_REGISTER rip,rcx |
| RESTORE_ARGS 1,-ARG_SKIP,0 |
| /*CFI_REGISTER rflags,r11*/ |
| + /* |
| + * This opens a window where we have a user CR3, but are |
| + * running in the kernel. This makes using the CS |
| + * register useless for telling whether or not we need to |
| + * switch CR3 in NMIs. Normal interrupts are OK because |
| + * they are off here. |
| + */ |
| + SWITCH_USER_CR3 |
| movq PER_CPU_VAR(old_rsp), %rsp |
| USERGS_SYSRET64 |
| |
| @@ -851,6 +886,14 @@ retint_swapgs: /* return to user-space */ |
| */ |
| DISABLE_INTERRUPTS(CLBR_ANY) |
| TRACE_IRQS_IRETQ |
| + /* |
| + * This opens a window where we have a user CR3, but are |
| + * running in the kernel. This makes using the CS |
| + * register useless for telling whether or not we need to |
| + * switch CR3 in NMIs. Normal interrupts are OK because |
| + * they are off here. |
| + */ |
| + SWITCH_USER_CR3 |
| SWAPGS |
| jmp restore_args |
| |
| @@ -891,6 +934,7 @@ ENTRY(native_iret) |
| pushq_cfi %rax |
| pushq_cfi %rdi |
| SWAPGS |
| + SWITCH_KERNEL_CR3 |
| movq PER_CPU_VAR(espfix_waddr),%rdi |
| movq %rax,(0*8)(%rdi) /* RAX */ |
| movq (2*8)(%rsp),%rax /* RIP */ |
| @@ -906,6 +950,7 @@ ENTRY(native_iret) |
| andl $0xffff0000,%eax |
| popq_cfi %rdi |
| orq PER_CPU_VAR(espfix_stack),%rax |
| + SWITCH_USER_CR3 |
| SWAPGS |
| movq %rax,%rsp |
| popq_cfi %rax |
| @@ -1366,30 +1411,40 @@ paranoidzeroentry machine_check *machine_check_vector(%rip) |
| * is fundamentally NMI-unsafe. (we cannot change the soft and |
| * hard flags at once, atomically) |
| */ |
| - |
| - /* ebx: no swapgs flag */ |
| +/* |
| + * On entry: ebx=0: needs swapgs but not SWITCH_USER_CR3 |
| + * ebx=1: needs neither swapgs nor SWITCH_USER_CR3 |
| + * ebx=2: needs both swapgs and SWITCH_USER_CR3 |
| + * ebx=3: needs SWITCH_USER_CR3 but not swapgs |
| + */ |
| ENTRY(paranoid_exit) |
| DEFAULT_FRAME |
| DISABLE_INTERRUPTS(CLBR_NONE) |
| TRACE_IRQS_OFF |
| - testl %ebx,%ebx /* swapgs needed? */ |
| - jnz paranoid_restore |
| - testl $3,CS(%rsp) |
| - jnz paranoid_userspace |
| -paranoid_swapgs: |
| + movq %rbx, %r12 /* paranoid_userspace uses %ebx */ |
| + testl $3, CS(%rsp) |
| + jnz paranoid_userspace |
| +paranoid_kernel: |
| + movq %r12, %rbx /* restore after paranoid_userspace */ |
| TRACE_IRQS_IRETQ 0 |
| +#ifdef CONFIG_KAISER |
| + testl $2, %ebx /* SWITCH_USER_CR3 needed? */ |
| + jz paranoid_exit_no_switch |
| + SWITCH_USER_CR3 |
| +paranoid_exit_no_switch: |
| +#endif |
| + testl $1, %ebx /* swapgs needed? */ |
| + jnz paranoid_exit_no_swapgs |
| SWAPGS_UNSAFE_STACK |
| +paranoid_exit_no_swapgs: |
| RESTORE_ALL 8 |
| - jmp irq_return |
| -paranoid_restore: |
| - TRACE_IRQS_IRETQ 0 |
| - RESTORE_ALL 8 |
| - jmp irq_return |
| + jmp irq_return |
| + |
| paranoid_userspace: |
| GET_THREAD_INFO(%rcx) |
| movl TI_flags(%rcx),%ebx |
| andl $_TIF_WORK_MASK,%ebx |
| - jz paranoid_swapgs |
| + jz paranoid_kernel |
| movq %rsp,%rdi /* &pt_regs */ |
| call sync_regs |
| movq %rax,%rsp /* switch stack for scheduling */ |
| @@ -1438,6 +1493,13 @@ ENTRY(error_entry) |
| movq_cfi r13, R13+8 |
| movq_cfi r14, R14+8 |
| movq_cfi r15, R15+8 |
| + /* |
| + * error_entry() always returns with a kernel gsbase and |
| + * CR3. We must also have a kernel CR3/gsbase before |
| + * calling TRACE_IRQS_*. Just unconditionally switch to |
| + * the kernel CR3 here. |
| + */ |
| + SWITCH_KERNEL_CR3 |
| xorl %ebx,%ebx |
| testl $3,CS+8(%rsp) |
| je error_kernelspace |
| @@ -1527,22 +1589,31 @@ ENTRY(nmi) |
| call do_nmi |
| #ifdef CONFIG_TRACE_IRQFLAGS |
| /* paranoidexit; without TRACE_IRQS_OFF */ |
| - /* ebx: no swapgs flag */ |
| + /* ebx: no-swapgs and kaiser-switch-cr3 flag */ |
| DISABLE_INTERRUPTS(CLBR_NONE) |
| - testl %ebx,%ebx /* swapgs needed? */ |
| - jnz nmi_restore |
| - testl $3,CS(%rsp) |
| - jnz nmi_userspace |
| -nmi_swapgs: |
| + movq %rbx, %r12 /* nmi_userspace uses %ebx */ |
| + testl $3, CS(%rsp) |
| + jnz nmi_userspace |
| +nmi_kernel: |
| + movq %r12, %rbx /* restore after nmi_userspace */ |
| +#ifdef CONFIG_KAISER |
| + testl $2, %ebx /* SWITCH_USER_CR3 needed? */ |
| + jz nmi_exit_no_switch |
| + SWITCH_USER_CR3 |
| +nmi_exit_no_switch: |
| +#endif |
| + testl $1, %ebx /* swapgs needed? */ |
| + jnz nmi_exit_no_swapgs |
| SWAPGS_UNSAFE_STACK |
| -nmi_restore: |
| +nmi_exit_no_swapgs: |
| RESTORE_ALL 8 |
| - jmp irq_return |
| + jmp irq_return |
| + |
| nmi_userspace: |
| GET_THREAD_INFO(%rcx) |
| movl TI_flags(%rcx),%ebx |
| andl $_TIF_WORK_MASK,%ebx |
| - jz nmi_swapgs |
| + jz nmi_kernel |
| movq %rsp,%rdi /* &pt_regs */ |
| call sync_regs |
| movq %rax,%rsp /* switch stack for scheduling */ |
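| |
| In both exit paths above, %ebx therefore carries two independent flag |
| bits rather than the old boolean; decoded in C terms: |
| |
| 	bool skip_swapgs     = ebx & 1;	/* GS was already kernel GS */ |
| 	bool switch_user_cr3 = ebx & 2;	/* CR3 was user CR3 on entry */ |
| |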
| diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c |
| index 94d857fb1033..14cd73b0e634 100644 |
| --- a/arch/x86/kernel/espfix_64.c |
| +++ b/arch/x86/kernel/espfix_64.c |
| @@ -41,6 +41,7 @@ |
| #include <asm/pgalloc.h> |
| #include <asm/setup.h> |
| #include <asm/espfix.h> |
| +#include <asm/kaiser.h> |
| |
| /* |
| * Note: we only need 6*8 = 48 bytes for the espfix stack, but round |
| @@ -129,6 +130,14 @@ void __init init_espfix_bsp(void) |
| /* Install the espfix pud into the kernel page directory */ |
| pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)]; |
| pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page); |
| + /* |
| + * Just copy the top-level PGD that is mapping the espfix |
| + * area to ensure it is mapped into the shadow user page |
| + * tables. |
| + */ |
| + if (IS_ENABLED(CONFIG_KAISER)) |
| + set_pgd(native_get_shadow_pgd(pgd_p), |
| + __pgd(_KERNPG_TABLE | __pa((pud_t *)espfix_pud_page))); |
| |
| /* Randomize the locations */ |
| init_espfix_random(); |
| diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S |
| index 0f8ebf78253a..6e697ac3fb54 100644 |
| --- a/arch/x86/kernel/head_64.S |
| +++ b/arch/x86/kernel/head_64.S |
| @@ -338,6 +338,27 @@ ENTRY(early_idt_handler) |
| .balign PAGE_SIZE; \ |
| ENTRY(name) |
| |
| +#ifdef CONFIG_KAISER |
| +/* |
| + * Each PGD needs to be 8k long and 8k aligned. We do not |
| + * ever go out to userspace with these, so we do not |
| + * strictly *need* the second page, but this allows us to |
| + * have a single set_pgd() implementation that does not |
| + * need to worry about whether it has 4k or 8k to work |
| + * with. |
| + * |
| + * This ensures PGDs are 8k long: |
| + */ |
| +#define KAISER_USER_PGD_FILL 512 |
| +/* This ensures they are 8k-aligned: */ |
| +#define NEXT_PGD_PAGE(name) \ |
| + .balign 2 * PAGE_SIZE; \ |
| +GLOBAL(name) |
| +#else |
| +#define NEXT_PGD_PAGE(name) NEXT_PAGE(name) |
| +#define KAISER_USER_PGD_FILL 0 |
| +#endif |
| + |
| /* Automate the creation of 1 to 1 mapping pmd entries */ |
| #define PMDS(START, PERM, COUNT) \ |
| i = 0 ; \ |
| @@ -353,13 +374,14 @@ ENTRY(name) |
| * 0xffffffff80000000 to physical address 0x000000. (always using |
| * 2Mbyte large pages provided by PAE mode) |
| */ |
| -NEXT_PAGE(init_level4_pgt) |
| +NEXT_PGD_PAGE(init_level4_pgt) |
| .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE |
| .org init_level4_pgt + L4_PAGE_OFFSET*8, 0 |
| .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE |
| .org init_level4_pgt + L4_START_KERNEL*8, 0 |
| /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ |
| .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE |
| + .fill KAISER_USER_PGD_FILL,8,0 |
| |
| NEXT_PAGE(level3_ident_pgt) |
| .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE |
| @@ -385,6 +407,7 @@ NEXT_PAGE(level2_ident_pgt) |
| * Don't set NX because code runs from these pages. |
| */ |
| PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD) |
| + .fill KAISER_USER_PGD_FILL,8,0 |
| |
| NEXT_PAGE(level2_kernel_pgt) |
| /* |
| diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c |
| index 43e9ccf44947..f00e6e734fbd 100644 |
| --- a/arch/x86/kernel/init_task.c |
| +++ b/arch/x86/kernel/init_task.c |
| @@ -38,5 +38,5 @@ EXPORT_SYMBOL(init_task); |
| * section. Since TSS's are completely CPU-local, we want them |
| * on exact cacheline boundaries, to eliminate cacheline ping-pong. |
| */ |
| -DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS; |
| +DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, init_tss) = INIT_TSS; |
| |
| diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c |
| index e328f691eeef..990f743e21b8 100644 |
| --- a/arch/x86/kernel/irqinit.c |
| +++ b/arch/x86/kernel/irqinit.c |
| @@ -85,7 +85,7 @@ static struct irqaction irq2 = { |
| .flags = IRQF_NO_THREAD, |
| }; |
| |
| -DEFINE_PER_CPU(vector_irq_t, vector_irq) = { |
| +DEFINE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq) = { |
| [0 ... NR_VECTORS - 1] = -1, |
| }; |
| |
| diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c |
| index 1dd32307a494..836a4c2d5ceb 100644 |
| --- a/arch/x86/kernel/ldt.c |
| +++ b/arch/x86/kernel/ldt.c |
| @@ -15,6 +15,7 @@ |
| #include <linux/slab.h> |
| #include <linux/vmalloc.h> |
| #include <linux/uaccess.h> |
| +#include <linux/kaiser.h> |
| |
| #include <asm/system.h> |
| #include <asm/ldt.h> |
| @@ -34,11 +35,21 @@ static void flush_ldt(void *current_mm) |
| set_ldt(pc->ldt->entries, pc->ldt->size); |
| } |
| |
| +static void __free_ldt_struct(struct ldt_struct *ldt) |
| +{ |
| + if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE) |
| + vfree(ldt->entries); |
| + else |
| + free_page((unsigned long)ldt->entries); |
| + kfree(ldt); |
| +} |
| + |
| /* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */ |
| static struct ldt_struct *alloc_ldt_struct(int size) |
| { |
| struct ldt_struct *new_ldt; |
| int alloc_size; |
| + int ret; |
| |
| if (size > LDT_ENTRIES) |
| return NULL; |
| @@ -66,7 +77,13 @@ static struct ldt_struct *alloc_ldt_struct(int size) |
| return NULL; |
| } |
| |
| + ret = kaiser_add_mapping((unsigned long)new_ldt->entries, alloc_size, |
| + __PAGE_KERNEL); |
| new_ldt->size = size; |
| + if (ret) { |
| + __free_ldt_struct(new_ldt); |
| + return NULL; |
| + } |
| return new_ldt; |
| } |
| |
| @@ -97,12 +114,10 @@ static void free_ldt_struct(struct ldt_struct *ldt) |
| if (likely(!ldt)) |
| return; |
| |
| + kaiser_remove_mapping((unsigned long)ldt->entries, |
| + ldt->size * LDT_ENTRY_SIZE); |
| paravirt_free_ldt(ldt->entries, ldt->size); |
| - if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE) |
| - vfree(ldt->entries); |
| - else |
| - kfree(ldt->entries); |
| - kfree(ldt); |
| + __free_ldt_struct(ldt); |
| } |
| |
| /* |
| diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c |
| index 557eb3757edb..d2ce2a33d15b 100644 |
| --- a/arch/x86/kernel/process_64.c |
| +++ b/arch/x86/kernel/process_64.c |
| @@ -57,7 +57,7 @@ |
| |
| asmlinkage extern void ret_from_fork(void); |
| |
| -DEFINE_PER_CPU(unsigned long, old_rsp); |
| +DEFINE_PER_CPU_USER_MAPPED(unsigned long, old_rsp); |
| static DEFINE_PER_CPU(unsigned char, is_idle); |
| |
| static ATOMIC_NOTIFIER_HEAD(idle_notifier); |
| diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile |
| index cf2a84031dfd..c9a00a5e0b87 100644 |
| --- a/arch/x86/mm/Makefile |
| +++ b/arch/x86/mm/Makefile |
| @@ -29,3 +29,4 @@ obj-$(CONFIG_NUMA_EMU) += numa_emulation.o |
| obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o |
| |
| obj-$(CONFIG_MEMTEST) += memtest.o |
| +obj-$(CONFIG_KAISER) += kaiser.o |
| diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c |
| new file mode 100644 |
| index 000000000000..79b0222ffa74 |
| --- /dev/null |
| +++ b/arch/x86/mm/kaiser.c |
| @@ -0,0 +1,382 @@ |
| +#include <linux/bug.h> |
| +#include <linux/kernel.h> |
| +#include <linux/errno.h> |
| +#include <linux/string.h> |
| +#include <linux/types.h> |
| +#include <linux/bug.h> |
| +#include <linux/init.h> |
| +#include <linux/interrupt.h> |
| +#include <linux/spinlock.h> |
| +#include <linux/mm.h> |
| +#include <linux/module.h> |
| +#include <linux/uaccess.h> |
| +#include <linux/ftrace.h> |
| + |
| +extern struct mm_struct init_mm; |
| + |
| +#include <asm/kaiser.h> |
| +#include <asm/tlbflush.h> /* to verify its kaiser declarations */ |
| +#include <asm/pgtable.h> |
| +#include <asm/pgalloc.h> |
| +#include <asm/desc.h> |
| + |
| +#ifdef CONFIG_KAISER |
| +DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); |
| + |
| +/* |
| + * These can have bit 63 set, so we can not just use a plain "or" |
| + * instruction to get their value or'd into CR3. It would take |
| + * another register. So, we use a memory reference to these instead. |
| + * |
| + * This is also handy because systems that do not support PCIDs |
| + * just end up or'ing a 0 into their CR3, which does no harm. |
| + */ |
| +unsigned long x86_cr3_pcid_noflush __read_mostly; |
| +DEFINE_PER_CPU(unsigned long, x86_cr3_pcid_user); |
| + |
| +/* |
| + * At runtime, the only things we map are some things for CPU |
| + * hotplug, and stacks for new processes. No two CPUs will ever |
| + * be populating the same addresses, so we only need to ensure |
| + * that we protect between two CPUs trying to allocate and |
| + * populate the same page table page. |
| + * |
| + * Only take this lock when doing a set_p[4um]d(), but it is not |
| + * needed for doing a set_pte(). We assume that only the *owner* |
| + * of a given allocation will be doing this for _their_ |
| + * allocation. |
| + * |
| + * This ensures that once a system has been running for a while |
| + * and there have been stacks all over and these page tables |
| + * are fully populated, there will be no further acquisitions of |
| + * this lock. |
| + */ |
| +static DEFINE_SPINLOCK(shadow_table_allocation_lock); |
| + |
| +/* |
| + * Returns -1 on error. |
| + */ |
| +static inline unsigned long get_pa_from_mapping(unsigned long vaddr) |
| +{ |
| + pgd_t *pgd; |
| + pud_t *pud; |
| + pmd_t *pmd; |
| + pte_t *pte; |
| + |
| + pgd = pgd_offset_k(vaddr); |
| + /* |
| + * We made all the kernel PGDs present in kaiser_init(). |
| + * We expect them to stay that way. |
| + */ |
| + BUG_ON(pgd_none(*pgd)); |
| + /* |
| + * PGDs are either 512GB or 128TB on all x86_64 |
| + * configurations. We don't handle these. |
| + */ |
| + BUG_ON(pgd_large(*pgd)); |
| + |
| + pud = pud_offset(pgd, vaddr); |
| + if (pud_none(*pud)) { |
| + WARN_ON_ONCE(1); |
| + return -1; |
| + } |
| + |
| + if (pud_large(*pud)) |
| + return (pud_pfn(*pud) << PAGE_SHIFT) | (vaddr & ~PUD_PAGE_MASK); |
| + |
| + pmd = pmd_offset(pud, vaddr); |
| + if (pmd_none(*pmd)) { |
| + WARN_ON_ONCE(1); |
| + return -1; |
| + } |
| + |
| + if (pmd_large(*pmd)) |
| + return (pmd_pfn(*pmd) << PAGE_SHIFT) | (vaddr & ~PMD_PAGE_MASK); |
| + |
| + pte = pte_offset_kernel(pmd, vaddr); |
| + if (pte_none(*pte)) { |
| + WARN_ON_ONCE(1); |
| + return -1; |
| + } |
| + |
| + return (pte_pfn(*pte) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK); |
| +} |
| + |
| +/* |
| + * This is a relatively normal page table walk, except that it |
| + * also tries to allocate page table pages along the way. |
| + * |
| + * Returns a pointer to a PTE on success, or NULL on failure. |
| + */ |
| +static pte_t *kaiser_pagetable_walk(unsigned long address) |
| +{ |
| + pmd_t *pmd; |
| + pud_t *pud; |
| + pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(address)); |
| + gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); |
| + |
| + if (pgd_none(*pgd)) { |
| + WARN_ONCE(1, "All shadow pgds should have been populated"); |
| + return NULL; |
| + } |
| + BUILD_BUG_ON(pgd_large(*pgd) != 0); |
| + |
| + pud = pud_offset(pgd, address); |
| + /* The shadow page tables do not use large mappings: */ |
| + if (pud_large(*pud)) { |
| + WARN_ON(1); |
| + return NULL; |
| + } |
| + if (pud_none(*pud)) { |
| + unsigned long new_pmd_page = __get_free_page(gfp); |
| + if (!new_pmd_page) |
| + return NULL; |
| + spin_lock(&shadow_table_allocation_lock); |
| + if (pud_none(*pud)) { |
| + set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page))); |
| + __inc_zone_page_state(virt_to_page((void *) |
| + new_pmd_page), NR_KAISERTABLE); |
| + } else |
| + free_page(new_pmd_page); |
| + spin_unlock(&shadow_table_allocation_lock); |
| + } |
| + |
| + pmd = pmd_offset(pud, address); |
| + /* The shadow page tables do not use large mappings: */ |
| + if (pmd_large(*pmd)) { |
| + WARN_ON(1); |
| + return NULL; |
| + } |
| + if (pmd_none(*pmd)) { |
| + unsigned long new_pte_page = __get_free_page(gfp); |
| + if (!new_pte_page) |
| + return NULL; |
| + spin_lock(&shadow_table_allocation_lock); |
| + if (pmd_none(*pmd)) { |
| + set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page))); |
| + __inc_zone_page_state(virt_to_page((void *) |
| + new_pte_page), NR_KAISERTABLE); |
| + } else |
| + free_page(new_pte_page); |
| + spin_unlock(&shadow_table_allocation_lock); |
| + } |
| + |
| + return pte_offset_kernel(pmd, address); |
| +} |
| + |
| +int kaiser_add_user_map(const void *__start_addr, unsigned long size, |
| + unsigned long flags) |
| +{ |
| + int ret = 0; |
| + pte_t *pte; |
| + unsigned long start_addr = (unsigned long)__start_addr; |
| + unsigned long address = start_addr & PAGE_MASK; |
| + unsigned long end_addr = PAGE_ALIGN(start_addr + size); |
| + unsigned long target_address; |
| + |
| + for (; address < end_addr; address += PAGE_SIZE) { |
| + target_address = get_pa_from_mapping(address); |
| + if (target_address == -1) { |
| + ret = -EIO; |
| + break; |
| + } |
| + pte = kaiser_pagetable_walk(address); |
| + if (!pte) { |
| + ret = -ENOMEM; |
| + break; |
| + } |
| + if (pte_none(*pte)) { |
| + set_pte(pte, __pte(flags | target_address)); |
| + } else { |
| + pte_t tmp; |
| + set_pte(&tmp, __pte(flags | target_address)); |
| + WARN_ON_ONCE(!pte_same(*pte, tmp)); |
| + } |
| + } |
| + return ret; |
| +} |
| + |
| +static int kaiser_add_user_map_ptrs(const void *start, const void *end, unsigned long flags) |
| +{ |
| + unsigned long size = end - start; |
| + |
| + return kaiser_add_user_map(start, size, flags); |
| +} |
| + |
| +/* |
| + * Ensure that the top level of the (shadow) page tables are |
| + * entirely populated. This ensures that all processes that get |
| + * forked have the same entries. This way, we do not have to |
| + * ever go set up new entries in older processes. |
| + * |
| + * Note: we never free these, so there are no updates to them |
| + * after this. |
| + */ |
| +static void __init kaiser_init_all_pgds(void) |
| +{ |
| + pgd_t *pgd; |
| + int i = 0; |
| + |
| + pgd = native_get_shadow_pgd(pgd_offset_k((unsigned long)0)); |
| + for (i = PTRS_PER_PGD / 2; i < PTRS_PER_PGD; i++) { |
| + pgd_t new_pgd; |
| + pud_t *pud = pud_alloc_one(&init_mm, |
| + PAGE_OFFSET + i * PGDIR_SIZE); |
| + if (!pud) { |
| + WARN_ON(1); |
| + break; |
| + } |
| + inc_zone_page_state(virt_to_page(pud), NR_KAISERTABLE); |
| + new_pgd = __pgd(_KERNPG_TABLE |__pa(pud)); |
| + /* |
| + * Make sure not to stomp on some other pgd entry. |
| + */ |
| + if (!pgd_none(pgd[i])) { |
| + WARN_ON(1); |
| + continue; |
| + } |
| + set_pgd(pgd + i, new_pgd); |
| + } |
| +} |
| + |
| +#define kaiser_add_user_map_early(start, size, flags) do { \ |
| + int __ret = kaiser_add_user_map(start, size, flags); \ |
| + WARN_ON(__ret); \ |
| +} while (0) |
| + |
| +#define kaiser_add_user_map_ptrs_early(start, end, flags) do { \ |
| + int __ret = kaiser_add_user_map_ptrs(start, end, flags); \ |
| + WARN_ON(__ret); \ |
| +} while (0) |
| + |
| +/* |
| + * If anything in here fails, we will likely die on one of the |
| + * first kernel->user transitions and init will die. But, we |
| + * will have most of the kernel up by then and should be able to |
| + * get a clean warning out of it. If we BUG_ON() here, we risk |
| + * dying before we have good console output. |
| + */ |
| +void __init kaiser_init(void) |
| +{ |
| + int cpu; |
| + |
| + kaiser_init_all_pgds(); |
| + |
| + for_each_possible_cpu(cpu) { |
| + void *percpu_vaddr = __per_cpu_user_mapped_start + |
| + per_cpu_offset(cpu); |
| + unsigned long percpu_sz = __per_cpu_user_mapped_end - |
| + __per_cpu_user_mapped_start; |
| + kaiser_add_user_map_early(percpu_vaddr, percpu_sz, |
| + __PAGE_KERNEL); |
| + } |
| + |
| + /* |
| + * Map the entry/exit text section, which is needed at |
| + * switches from user to and from kernel. |
| + */ |
| + kaiser_add_user_map_ptrs_early(__entry_text_start, __entry_text_end, |
| + __PAGE_KERNEL_RX); |
| +#ifdef CONFIG_FUNCTION_GRAPH_TRACER |
| + kaiser_add_user_map_ptrs_early(__irqentry_text_start, |
| + __irqentry_text_end, |
| + __PAGE_KERNEL_RX); |
| +#endif |
| + kaiser_add_user_map_early((void *)idt_descr.address, |
| + sizeof(gate_desc) * NR_VECTORS, |
| + __PAGE_KERNEL_RO); |
| + kaiser_add_user_map_early(&x86_cr3_pcid_noflush, |
| + sizeof(x86_cr3_pcid_noflush), |
| + __PAGE_KERNEL); |
| +} |
| + |
| +/* Add a mapping to the shadow mapping, and synchronize the mappings */ |
| +int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags) |
| +{ |
| + return kaiser_add_user_map((const void *)addr, size, flags); |
| +} |
| + |
| +void kaiser_remove_mapping(unsigned long start, unsigned long size) |
| +{ |
| + unsigned long end = start + size; |
| + unsigned long addr; |
| + pte_t *pte; |
| + |
| + for (addr = start; addr < end; addr += PAGE_SIZE) { |
| + pte = kaiser_pagetable_walk(addr); |
| + if (pte) |
| + set_pte(pte, __pte(0)); |
| + } |
| +} |
| + |
| +/* |
| + * Page table pages are page-aligned. The lower half of the top |
| + * level is used for userspace and the top half for the kernel. |
| + * This returns true for user pages that need to get copied into |
| + * both the user and kernel copies of the page tables, and false |
| + * for kernel pages that should only be in the kernel copy. |
| + */ |
| +static inline bool is_userspace_pgd(pgd_t *pgdp) |
| +{ |
| + return ((unsigned long)pgdp % PAGE_SIZE) < (PAGE_SIZE / 2); |
| +} |
| + |
| +pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd) |
| +{ |
| + /* |
| + * Do we need to also populate the shadow pgd? Check _PAGE_USER to |
| + * skip cases like kexec and EFI which make temporary low mappings. |
| + */ |
| + if (pgd.pgd & _PAGE_USER) { |
| + if (is_userspace_pgd(pgdp)) { |
| + native_get_shadow_pgd(pgdp)->pgd = pgd.pgd; |
| + /* |
| + * Even if the entry is *mapping* userspace, ensure |
| + * that userspace can not use it. This way, if we |
| + * get out to userspace running on the kernel CR3, |
| + * userspace will crash instead of running. |
| + */ |
| + pgd.pgd |= _PAGE_NX; |
| + } |
| + } else if (!pgd.pgd) { |
| + /* |
| + * pgd_clear() cannot check _PAGE_USER, and is even used to |
| + * clear corrupted pgd entries: so just rely on cases like |
| + * kexec and EFI never to be using pgd_clear(). |
| + */ |
| + if (!WARN_ON_ONCE((unsigned long)pgdp & PAGE_SIZE) && |
| + is_userspace_pgd(pgdp)) |
| + native_get_shadow_pgd(pgdp)->pgd = pgd.pgd; |
| + } |
| + return pgd; |
| +} |
| + |
| +void kaiser_setup_pcid(void) |
| +{ |
| + unsigned long kern_cr3 = 0; |
| + unsigned long user_cr3 = KAISER_SHADOW_PGD_OFFSET; |
| + |
| + if (this_cpu_has(X86_FEATURE_PCID)) { |
| + kern_cr3 |= X86_CR3_PCID_KERN_NOFLUSH; |
| + user_cr3 |= X86_CR3_PCID_USER_NOFLUSH; |
| + } |
| + /* |
| + * These variables are used by the entry/exit |
| + * code to change PCID and pgd and TLB flushing. |
| + */ |
| + x86_cr3_pcid_noflush = kern_cr3; |
| + this_cpu_write(x86_cr3_pcid_user, user_cr3); |
| +} |
| + |
| +/* |
| + * Make a note that this cpu will need to flush USER tlb on return to user. |
| + * Caller checks whether this_cpu_has(X86_FEATURE_PCID) before calling: |
| + * if cpu does not, then the NOFLUSH bit will never have been set. |
| + */ |
| +void kaiser_flush_tlb_on_return_to_user(void) |
| +{ |
| + this_cpu_write(x86_cr3_pcid_user, |
| + X86_CR3_PCID_USER_FLUSH | KAISER_SHADOW_PGD_OFFSET); |
| +} |
| +EXPORT_SYMBOL(kaiser_flush_tlb_on_return_to_user); |
| +#endif /* CONFIG_KAISER */ |
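| |
| Typical runtime usage pairs kaiser_add_mapping() with a matching |
| kaiser_remove_mapping() over an allocation's lifetime, as the LDT and |
| perf changes above do. A minimal sketch for a hypothetical buffer: |
| |
| 	buf = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, order); |
| 	if (buf && kaiser_add_mapping((unsigned long)buf, |
| 			PAGE_SIZE << order, __PAGE_KERNEL) < 0) { |
| 		/* shadow mapping failed: back out, treat as -ENOMEM */ |
| 		free_pages((unsigned long)buf, order); |
| 		buf = NULL; |
| 	} |
| 	... |
| 	kaiser_remove_mapping((unsigned long)buf, PAGE_SIZE << order); |
| 	free_pages((unsigned long)buf, order); |
| |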
| diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c |
| index 8573b83a63d0..73285602c93f 100644 |
| --- a/arch/x86/mm/pgtable.c |
| +++ b/arch/x86/mm/pgtable.c |
| @@ -5,7 +5,7 @@ |
| #include <asm/tlb.h> |
| #include <asm/fixmap.h> |
| |
| -#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO |
| +#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO) |
| |
| #ifdef CONFIG_HIGHPTE |
| #define PGALLOC_USER_GFP __GFP_HIGHMEM |
| @@ -253,12 +253,35 @@ static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[]) |
| } |
| } |
| |
| +#ifdef CONFIG_KAISER |
| +/* |
| + * Instead of one pgd, we acquire two pgds. Being order-1, it is |
| + * both 8k in size and 8k-aligned. That lets us just flip bit 12 |
| + * in a pointer to swap between the two 4k halves. |
| + */ |
| +#define PGD_ALLOCATION_ORDER 1 |
| +#else |
| +#define PGD_ALLOCATION_ORDER 0 |
| +#endif |
| + |
| +static inline pgd_t *_pgd_alloc(void) |
| +{ |
| + /* No __GFP_REPEAT: to avoid page allocation stalls in order-1 case */ |
| + return (pgd_t *)__get_free_pages(PGALLOC_GFP & ~__GFP_REPEAT, |
| + PGD_ALLOCATION_ORDER); |
| +} |
| + |
| +static inline void _pgd_free(pgd_t *pgd) |
| +{ |
| + free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER); |
| +} |
| + |
| pgd_t *pgd_alloc(struct mm_struct *mm) |
| { |
| pgd_t *pgd; |
| pmd_t *pmds[PREALLOCATED_PMDS]; |
| |
| - pgd = (pgd_t *)__get_free_page(PGALLOC_GFP); |
| + pgd = _pgd_alloc(); |
| |
| if (pgd == NULL) |
| goto out; |
| @@ -288,7 +311,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm) |
| out_free_pmds: |
| free_pmds(pmds); |
| out_free_pgd: |
| - free_page((unsigned long)pgd); |
| + _pgd_free(pgd); |
| out: |
| return NULL; |
| } |
| @@ -298,7 +321,7 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd) |
| pgd_mop_up_pmds(mm, pgd); |
| pgd_dtor(pgd); |
| paravirt_pgd_free(mm, pgd); |
| - free_page((unsigned long)pgd); |
| + _pgd_free(pgd); |
| } |
| |
| int ptep_set_access_flags(struct vm_area_struct *vma, |
| diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c |
| index 4e4e6bc09e98..bd0ff4eb74a0 100644 |
| --- a/arch/x86/mm/tlb.c |
| +++ b/arch/x86/mm/tlb.c |
| @@ -12,10 +12,43 @@ |
| #include <asm/cache.h> |
| #include <asm/apic.h> |
| #include <asm/uv/uv.h> |
| +#include <asm/kaiser.h> |
| |
| DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) |
| = { &init_mm, 0, }; |
| |
| +static void load_new_mm_cr3(pgd_t *pgdir) |
| +{ |
| + unsigned long new_mm_cr3 = __pa(pgdir); |
| + |
| +#ifdef CONFIG_KAISER |
| + if (this_cpu_has(X86_FEATURE_PCID)) { |
| + /* |
| + * We reuse the same PCID for different tasks, so we must |
| + * flush all the entries for the PCID out when we change tasks. |
| + * Flush KERN below, flush USER when returning to userspace in |
| + * kaiser's SWITCH_USER_CR3 (_SWITCH_TO_USER_CR3) macro. |
| + * |
| + * invpcid_flush_single_context(X86_CR3_PCID_ASID_USER) could |
| + * do it here, but can only be used if X86_FEATURE_INVPCID is |
| + * available - and many machines support pcid without invpcid. |
| + * |
| + * The line below is a no-op: X86_CR3_PCID_KERN_FLUSH is now 0; |
| +		 * but keep the line in case that ever changes. |
| + */ |
| + new_mm_cr3 |= X86_CR3_PCID_KERN_FLUSH; |
| + kaiser_flush_tlb_on_return_to_user(); |
| + } |
| +#endif /* CONFIG_KAISER */ |
| + |
| + /* |
| + * Caution: many callers of this function expect |
| + * that load_new_mm_cr3() is serializing and orders TLB |
| + * fills with respect to the mm_cpumask writes. |
| + */ |
| + write_cr3(new_mm_cr3); |
| +} |
| + |
| /* |
| * TLB flushing, formerly SMP-only |
| * c/o Linus Torvalds. |
| @@ -65,7 +98,7 @@ void leave_mm(int cpu) |
| BUG(); |
| cpumask_clear_cpu(cpu, |
| mm_cpumask(percpu_read(cpu_tlbstate.active_mm))); |
| - load_cr3(swapper_pg_dir); |
| + load_new_mm_cr3(swapper_pg_dir); |
| } |
| EXPORT_SYMBOL_GPL(leave_mm); |
| |
| @@ -113,11 +146,10 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, |
| * from next->pgd. TLB fills are special and can happen |
| * due to instruction fetches or for no reason at all, |
| * and neither LOCK nor MFENCE orders them. |
| - * Fortunately, load_cr3() is serializing and gives the |
| - * ordering guarantee we need. |
| - * |
| + * Fortunately, load_new_mm_cr3() is serializing |
| + * and gives the ordering guarantee we need. |
| */ |
| - load_cr3(next->pgd); |
| + load_new_mm_cr3(next->pgd); |
| |
| /* stop flush ipis for the previous mm */ |
| cpumask_clear_cpu(cpu, mm_cpumask(prev)); |
| @@ -136,10 +168,10 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, |
| * tlb flush IPI delivery. We must reload CR3 |
| * to make sure to use no freed page tables. |
| * |
| - * As above, load_cr3() is serializing and orders TLB |
| - * fills with respect to the mm_cpumask write. |
| + * As above, load_new_mm_cr3() is serializing and orders |
| + * TLB fills with respect to the mm_cpumask write. |
| */ |
| - load_cr3(next->pgd); |
| + load_new_mm_cr3(next->pgd); |
| load_mm_ldt(next); |
| } |
| } |
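| |
| To restate the comment in load_new_mm_cr3() as code: on a PCID-capable |
| cpu the flush of a reused PCID is split in two, KERN entries at switch |
| time, USER entries at the next return to userspace.  A schematic |
| sketch, not patch code: |
| |
| 	static void sketch_split_flush(pgd_t *pgd) |
| 	{ |
| 		/* No NOFLUSH bit set here, so this CR3 write flushes the |
| 		 * KERN PCID's entries immediately. */ |
| 		write_cr3(__pa(pgd) | X86_CR3_PCID_KERN_FLUSH); |
| |
| 		/* USER entries are untouched for now: arm the exit path |
| 		 * to use the FLUSH variant of the user CR3 value. */ |
| 		kaiser_flush_tlb_on_return_to_user(); |
| 	} |
| |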
| diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h |
| index b5e2e4c6b017..01c8155dd613 100644 |
| --- a/include/asm-generic/vmlinux.lds.h |
| +++ b/include/asm-generic/vmlinux.lds.h |
| @@ -692,7 +692,14 @@ |
| */ |
| #define PERCPU_INPUT(cacheline) \ |
| VMLINUX_SYMBOL(__per_cpu_start) = .; \ |
| + VMLINUX_SYMBOL(__per_cpu_user_mapped_start) = .; \ |
| *(.data..percpu..first) \ |
| + . = ALIGN(cacheline); \ |
| + *(.data..percpu..user_mapped) \ |
| + *(.data..percpu..user_mapped..shared_aligned) \ |
| + . = ALIGN(PAGE_SIZE); \ |
| + *(.data..percpu..user_mapped..page_aligned) \ |
| + VMLINUX_SYMBOL(__per_cpu_user_mapped_end) = .; \ |
| . = ALIGN(PAGE_SIZE); \ |
| *(.data..percpu..page_aligned) \ |
| . = ALIGN(cacheline); \ |
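| |
| The new __per_cpu_user_mapped_start/end symbols bracket exactly the |
| per-cpu data that must remain visible on the user side.  A sketch of |
| how that span can be mirrored into the shadow tables for each cpu |
| (illustrative; kaiser_init() does the real walk earlier in this patch): |
| |
| 	extern char __per_cpu_user_mapped_start[], |
| 		    __per_cpu_user_mapped_end[]; |
| |
| 	static void sketch_map_user_mapped_span(void) |
| 	{ |
| 		unsigned long size = __per_cpu_user_mapped_end - |
| 				     __per_cpu_user_mapped_start; |
| 		int cpu; |
| |
| 		for_each_possible_cpu(cpu) |
| 			kaiser_add_mapping((unsigned long) |
| 				__per_cpu_user_mapped_start + |
| 				per_cpu_offset(cpu), size, __PAGE_KERNEL); |
| 	} |
| |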
| diff --git a/include/linux/kaiser.h b/include/linux/kaiser.h |
| new file mode 100644 |
| index 000000000000..4a4d6d911a14 |
| --- /dev/null |
| +++ b/include/linux/kaiser.h |
| @@ -0,0 +1,52 @@ |
| +#ifndef _LINUX_KAISER_H |
| +#define _LINUX_KAISER_H |
| + |
| +#ifdef CONFIG_KAISER |
| +#include <asm/kaiser.h> |
| + |
| +static inline int kaiser_map_thread_stack(void *stack) |
| +{ |
| + /* |
| +	 * Map the page of the kernel stack on which we enter from user context. |
| + */ |
| + return kaiser_add_mapping((unsigned long)stack + |
| + THREAD_SIZE - PAGE_SIZE, PAGE_SIZE, __PAGE_KERNEL); |
| +} |
| + |
| +static inline void kaiser_unmap_thread_stack(void *stack) |
| +{ |
| + /* |
| + * Note: may be called even when kaiser_map_thread_stack() failed. |
| + */ |
| + kaiser_remove_mapping((unsigned long)stack + |
| + THREAD_SIZE - PAGE_SIZE, PAGE_SIZE); |
| +} |
| +#else |
| + |
| +/* |
| + * These stubs are used whenever CONFIG_KAISER is off, which |
| + * includes architectures that support KAISER but have it disabled. |
| + */ |
| + |
| +static inline void kaiser_init(void) |
| +{ |
| +} |
| +static inline int kaiser_add_mapping(unsigned long addr, |
| + unsigned long size, unsigned long flags) |
| +{ |
| + return 0; |
| +} |
| +static inline void kaiser_remove_mapping(unsigned long start, |
| + unsigned long size) |
| +{ |
| +} |
| +static inline int kaiser_map_thread_stack(void *stack) |
| +{ |
| + return 0; |
| +} |
| +static inline void kaiser_unmap_thread_stack(void *stack) |
| +{ |
| +} |
| + |
| +#endif /* !CONFIG_KAISER */ |
| +#endif /* _LINUX_KAISER_H */ |
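| |
| The pair is meant to bracket a thread stack's lifetime, and the unmap |
| side must cope with a map that failed.  A minimal caller sketch (the |
| real callers are the kernel/fork.c changes below): |
| |
| 	static int sketch_stack_lifetime(void *stack) |
| 	{ |
| 		int err = kaiser_map_thread_stack(stack); |
| |
| 		if (err) |
| 			return err; |
| 		/* ... in use: entry from user lands on the top page ... */ |
| 		kaiser_unmap_thread_stack(stack); |
| 		return 0; |
| 	} |
| |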
| diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h |
| index 25842b6e72e1..a0b4422a116a 100644 |
| --- a/include/linux/mmzone.h |
| +++ b/include/linux/mmzone.h |
| @@ -95,8 +95,9 @@ enum zone_stat_item { |
| NR_SLAB_RECLAIMABLE, |
| NR_SLAB_UNRECLAIMABLE, |
| NR_PAGETABLE, /* used for pagetables */ |
| - NR_KERNEL_STACK, |
| /* Second 128 byte cacheline */ |
| + NR_KERNEL_STACK, |
| + NR_KAISERTABLE, |
| NR_UNSTABLE_NFS, /* NFS unstable pages */ |
| NR_BOUNCE, |
| NR_VMSCAN_WRITE, |
| diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h |
| index 27ef6b190ea6..56f5eeb78d1d 100644 |
| --- a/include/linux/percpu-defs.h |
| +++ b/include/linux/percpu-defs.h |
| @@ -28,6 +28,12 @@ |
| (void)__vpp_verify; \ |
| } while (0) |
| |
| +#ifdef CONFIG_KAISER |
| +#define USER_MAPPED_SECTION "..user_mapped" |
| +#else |
| +#define USER_MAPPED_SECTION "" |
| +#endif |
| + |
| /* |
| * s390 and alpha modules require percpu variables to be defined as |
| * weak to force the compiler to generate GOT based external |
| @@ -90,6 +96,12 @@ |
| #define DEFINE_PER_CPU(type, name) \ |
| DEFINE_PER_CPU_SECTION(type, name, "") |
| |
| +#define DECLARE_PER_CPU_USER_MAPPED(type, name) \ |
| + DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION) |
| + |
| +#define DEFINE_PER_CPU_USER_MAPPED(type, name) \ |
| + DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION) |
| + |
| /* |
| * Declaration/definition used for per-CPU variables that must come first in |
| * the set of variables. |
| @@ -119,6 +131,14 @@ |
| DEFINE_PER_CPU_SECTION(type, name, PER_CPU_SHARED_ALIGNED_SECTION) \ |
| ____cacheline_aligned_in_smp |
| |
| +#define DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name) \ |
| + DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION PER_CPU_SHARED_ALIGNED_SECTION) \ |
| + ____cacheline_aligned_in_smp |
| + |
| +#define DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name) \ |
| + DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION PER_CPU_SHARED_ALIGNED_SECTION) \ |
| + ____cacheline_aligned_in_smp |
| + |
| #define DECLARE_PER_CPU_ALIGNED(type, name) \ |
| DECLARE_PER_CPU_SECTION(type, name, PER_CPU_ALIGNED_SECTION) \ |
| ____cacheline_aligned |
| @@ -137,11 +157,21 @@ |
| #define DEFINE_PER_CPU_PAGE_ALIGNED(type, name) \ |
| DEFINE_PER_CPU_SECTION(type, name, "..page_aligned") \ |
| __aligned(PAGE_SIZE) |
| +/* |
| + * Declaration/definition used for per-CPU variables that must be |
| + * page aligned and need to be mapped in user mode. |
| + */ |
| +#define DECLARE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name) \ |
| + DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION"..page_aligned") \ |
| + __aligned(PAGE_SIZE) |
| + |
| +#define DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name) \ |
| + DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION"..page_aligned") \ |
| + __aligned(PAGE_SIZE) |
| |
| /* |
| * Declaration/definition used for per-CPU variables that must be read mostly. |
| */ |
| -#define DECLARE_PER_CPU_READ_MOSTLY(type, name) \ |
| +#define DECLARE_PER_CPU_READ_MOSTLY(type, name) \ |
| DECLARE_PER_CPU_SECTION(type, name, "..readmostly") |
| |
| #define DEFINE_PER_CPU_READ_MOSTLY(type, name) \ |
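| |
| The *_USER_MAPPED variants are used exactly like their plain |
| counterparts; for example (the variable here is hypothetical, named |
| only for illustration): |
| |
| 	/* Placed in .data..percpu..user_mapped, so kaiser can mirror |
| 	 * it into the shadow page tables. */ |
| 	DEFINE_PER_CPU_USER_MAPPED(unsigned long, example_exit_scratch); |
| |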
| diff --git a/init/main.c b/init/main.c |
| index e937d9bda0f8..558a9fdd566d 100644 |
| --- a/init/main.c |
| +++ b/init/main.c |
| @@ -69,6 +69,7 @@ |
| #include <linux/slab.h> |
| #include <linux/perf_event.h> |
| #include <linux/random.h> |
| +#include <linux/kaiser.h> |
| |
| #include <asm/io.h> |
| #include <asm/bugs.h> |
| @@ -463,6 +464,7 @@ static void __init mm_init(void) |
| percpu_init_late(); |
| pgtable_cache_init(); |
| vmalloc_init(); |
| + kaiser_init(); |
| } |
| |
| asmlinkage void __init start_kernel(void) |
| diff --git a/kernel/fork.c b/kernel/fork.c |
| index 29b460431c12..511131a15a75 100644 |
| --- a/kernel/fork.c |
| +++ b/kernel/fork.c |
| @@ -55,6 +55,7 @@ |
| #include <linux/tsacct_kern.h> |
| #include <linux/cn_proc.h> |
| #include <linux/freezer.h> |
| +#include <linux/kaiser.h> |
| #include <linux/delayacct.h> |
| #include <linux/taskstats_kern.h> |
| #include <linux/random.h> |
| @@ -133,6 +134,7 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, |
| |
| static inline void free_thread_info(struct thread_info *ti) |
| { |
| + kaiser_unmap_thread_stack(ti); |
| free_pages((unsigned long)ti, THREAD_SIZE_ORDER); |
| } |
| #endif |
| @@ -275,6 +277,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) |
| |
| tsk->stack = ti; |
| |
| + err = kaiser_map_thread_stack(tsk->stack); |
| + if (err) |
| + goto out; |
| + |
| setup_thread_stack(tsk, orig); |
| clear_user_return_notifier(tsk); |
| clear_tsk_need_resched(tsk); |
| diff --git a/mm/vmstat.c b/mm/vmstat.c |
| index ff9060919c4b..eaf3db038652 100644 |
| --- a/mm/vmstat.c |
| +++ b/mm/vmstat.c |
| @@ -699,6 +699,7 @@ const char * const vmstat_text[] = { |
| "nr_slab_unreclaimable", |
| "nr_page_table_pages", |
| "nr_kernel_stack", |
| + "nr_overhead", |
| "nr_unstable", |
| "nr_bounce", |
| "nr_vmscan_write", |
| diff --git a/security/Kconfig b/security/Kconfig |
| index 51bd5a0b69ae..19f83193e7ab 100644 |
| --- a/security/Kconfig |
| +++ b/security/Kconfig |
| @@ -96,6 +96,16 @@ config SECURITY |
| |
| If you are unsure how to answer this question, answer N. |
| |
| +config KAISER |
| + bool "Remove the kernel mapping in user mode" |
| + default y |
| + depends on X86_64 && SMP && !PARAVIRT |
| + help |
| +	  This enforces strict kernel and user space isolation, in order |
| +	  to close hardware side channels on kernel address information. |
| + |
| + If you are unsure how to answer this question, answer Y. |
| + |
| config SECURITYFS |
| bool "Enable the securityfs filesystem" |
| help |
| |