releases/3.2.98/kaiser-kernel-address-isolation.patch - pub/scm/linux/kernel/git/bwh/linux-stable-queue - Git at Google

 From: Hugh Dickins <hughd@google.com>
 Date: Mon, 11 Dec 2017 17:59:50 -0800
 Subject: KAISER: Kernel Address Isolation

 This patch introduces our implementation of KAISER (Kernel Address
 Isolation to have Side-channels Efficiently Removed), a kernel isolation
 technique to close hardware side channels on kernel address information.

 More information about the original patch can be found at:
 https://github.com/IAIK/KAISER
 http://marc.info/?l=linux-kernel&m=149390087310405&w=2

 Daniel Gruss <daniel.gruss@iaik.tugraz.at>
 Richard Fellner <richard.fellner@student.tugraz.at>
 Michael Schwarz <michael.schwarz@iaik.tugraz.at>
 <clementine.maurice@iaik.tugraz.at>
 <moritz.lipp@iaik.tugraz.at>

 That original was then developed further by
 Dave Hansen <dave.hansen@intel.com>
 Hugh Dickins <hughd@google.com>
 then others after this snapshot.

 This combined patch for 3.2.96 was derived from hughd's patches below
 for 3.18.72, in 2017-12-04's kaiser-3.18.72.tar; except for the last,
 which was sent in 2017-12-09's nokaiser-3.18.72.tar.  They have been
 combined in order to minimize the effort of rebasing: most of the
 patches in the 3.18.72 series were small fixes and cleanups and
 enhancements to three large patches.  About the only new work in this
 backport is a simple reimplementation of kaiser_remove_mapping():
 since mm/pageattr.c changed a lot between 3.2 and 3.18, and the
 mods there for Kaiser never seemed necessary.

 KAISER: Kernel Address Isolation
 kaiser: merged update
 kaiser: do not set _PAGE_NX on pgd_none
 kaiser: stack map PAGE_SIZE at THREAD_SIZE-PAGE_SIZE
 kaiser: fix build and FIXME in alloc_ldt_struct()
 kaiser: KAISER depends on SMP
 kaiser: fix regs to do_nmi() ifndef CONFIG_KAISER
 kaiser: fix perf crashes
 kaiser: ENOMEM if kaiser_pagetable_walk() NULL
 kaiser: tidied up asm/kaiser.h somewhat
 kaiser: tidied up kaiser_add/remove_mapping slightly
 kaiser: kaiser_remove_mapping() move along the pgd
 kaiser: align addition to x86/mm/Makefile
 kaiser: cleanups while trying for gold link
 kaiser: name that 0x1000 KAISER_SHADOW_PGD_OFFSET
 kaiser: delete KAISER_REAL_SWITCH option
 kaiser: vmstat show NR_KAISERTABLE as nr_overhead
 kaiser: enhanced by kernel and user PCIDs
 kaiser: load_new_mm_cr3() let SWITCH_USER_CR3 flush user
 kaiser: PCID 0 for kernel and 128 for user
 kaiser: x86_cr3_pcid_noflush and x86_cr3_pcid_user
 kaiser: paranoid_entry pass cr3 need to paranoid_exit
 kaiser: _pgd_alloc() without __GFP_REPEAT to avoid stalls
 kaiser: fix unlikely error in alloc_ldt_struct()
 kaiser: drop is_atomic arg to kaiser_pagetable_walk()

 Signed-off-by: Hugh Dickins <hughd@google.com>
 [bwh:
  - Fixed the #undef in arch/x86/boot/compressed/misc.h
  - Add missing #include in arch/x86/mm/kaiser.c]
 Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
 ---
  arch/x86/boot/compressed/misc.h           |   1 +
  arch/x86/ia32/ia32entry.S                 |   7 +
  arch/x86/include/asm/cpufeature.h         |   1 +
  arch/x86/include/asm/desc.h               |   2 +-
  arch/x86/include/asm/hw_irq.h             |   2 +-
  arch/x86/include/asm/kaiser.h             | 126 ++++++++++
  arch/x86/include/asm/pgtable.h            |  18 +-
  arch/x86/include/asm/pgtable_64.h         |  29 ++-
  arch/x86/include/asm/pgtable_types.h      |  33 ++-
  arch/x86/include/asm/processor-flags.h    |   2 +
  arch/x86/include/asm/processor.h          |   2 +-
  arch/x86/include/asm/tlbflush.h           |  64 ++++-
  arch/x86/kernel/cpu/common.c              |  18 +-
  arch/x86/kernel/cpu/perf_event_intel_ds.c |  54 ++++-
  arch/x86/kernel/entry_64.S                | 117 +++++++--
  arch/x86/kernel/espfix_64.c               |   9 +
  arch/x86/kernel/head_64.S                 |  25 +-
  arch/x86/kernel/init_task.c               |   2 +-
  arch/x86/kernel/irqinit.c                 |   2 +-
  arch/x86/kernel/ldt.c                     |  25 +-
  arch/x86/kernel/process_64.c              |   2 +-
  arch/x86/mm/Makefile                      |   1 +
  arch/x86/mm/kaiser.c                      | 382 ++++++++++++++++++++++++++++++
  arch/x86/mm/pgtable.c                     |  31 ++-
  arch/x86/mm/tlb.c                         |  48 +++-
  include/asm-generic/vmlinux.lds.h         |   7 +
  include/linux/kaiser.h                    |  52 ++++
  include/linux/mmzone.h                    |   3 +-
  include/linux/percpu-defs.h               |  32 ++-
  init/main.c                               |   2 +
  kernel/fork.c                             |   6 +
  mm/vmstat.c                               |   1 +
  security/Kconfig                          |  10 +
  33 files changed, 1049 insertions(+), 67 deletions(-)
  create mode 100644 arch/x86/include/asm/kaiser.h
  create mode 100644 arch/x86/mm/kaiser.c
  create mode 100644 include/linux/kaiser.h

 diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
 index 3f19c81a6203..2fa2635ee539 100644
 --- a/arch/x86/boot/compressed/misc.h
 +++ b/arch/x86/boot/compressed/misc.h
 @@ -7,6 +7,7 @@
   * we just keep it from happening
   */
  #undef CONFIG_PARAVIRT
 +#undef CONFIG_KAISER
  #ifdef CONFIG_X86_32
  #define _ASM_X86_DESC_H 1
  #endif
 diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
 index 2b5527726ae1..7eb0d4792800 100644
 --- a/arch/x86/ia32/ia32entry.S
 +++ b/arch/x86/ia32/ia32entry.S
 @@ -12,6 +12,8 @@
  #include <asm/ia32_unistd.h>
  #include <asm/thread_info.h>
  #include <asm/segment.h>
 +#include <asm/pgtable_types.h>
 +#include <asm/kaiser.h>
  #include <asm/irqflags.h>
  #include <linux/linkage.h>

 @@ -120,6 +122,7 @@ ENTRY(ia32_sysenter_target)
  	CFI_DEF_CFA	rsp,0
  	CFI_REGISTER	rsp,rbp
  	SWAPGS_UNSAFE_STACK
 +	SWITCH_KERNEL_CR3_NO_STACK
  	movq	PER_CPU_VAR(kernel_stack), %rsp
  	addq	$(KERNEL_STACK_OFFSET),%rsp
  	/*
 @@ -183,6 +186,7 @@ ENTRY(ia32_sysenter_target)
  	popq_cfi %rcx				/* User %esp */
  	CFI_REGISTER rsp,rcx
  	TRACE_IRQS_ON
 +	SWITCH_USER_CR3
  	ENABLE_INTERRUPTS_SYSEXIT32

  #ifdef CONFIG_AUDITSYSCALL
 @@ -281,6 +285,7 @@ ENTRY(ia32_cstar_target)
  	CFI_REGISTER	rip,rcx
  	/*CFI_REGISTER	rflags,r11*/
  	SWAPGS_UNSAFE_STACK
 +	SWITCH_KERNEL_CR3_NO_STACK
  	movl	%esp,%r8d
  	CFI_REGISTER	rsp,r8
  	movq	PER_CPU_VAR(kernel_stack),%rsp
 @@ -337,6 +342,7 @@ ENTRY(ia32_cstar_target)
  	xorq	%r9,%r9
  	xorq	%r8,%r8
  	TRACE_IRQS_ON
 +	SWITCH_USER_CR3
  	movl RSP-ARGOFFSET(%rsp),%esp
  	CFI_RESTORE rsp
  	USERGS_SYSRET32
 @@ -409,6 +415,7 @@ ENTRY(ia32_syscall)
  	CFI_REL_OFFSET	rip,RIP-RIP
  	PARAVIRT_ADJUST_EXCEPTION_FRAME
  	SWAPGS
 +	SWITCH_KERNEL_CR3_NO_STACK
  	/*
  	 * No need to follow this irqs on/off section: the syscall
  	 * disabled irqs and here we enable it straight after entry:
 diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
 index 6f254f2fcd40..736272670870 100644
 --- a/arch/x86/include/asm/cpufeature.h
 +++ b/arch/x86/include/asm/cpufeature.h
 @@ -176,6 +176,7 @@
  #define X86_FEATURE_PLN		(7*32+ 5) /* Intel Power Limit Notification */
  #define X86_FEATURE_PTS		(7*32+ 6) /* Intel Package Thermal Status */
  #define X86_FEATURE_DTHERM	(7*32+ 7) /* Digital Thermal Sensor */
 +#define X86_FEATURE_INVPCID_SINGLE (7*32+ 8) /* Effectively INVPCID && CR4.PCIDE=1 */

  /* Virtualization flags: Linux defined, word 8 */
  #define X86_FEATURE_TPR_SHADOW  (8*32+ 0) /* Intel TPR Shadow */
 diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
 index 382ce8a9fd62..7f1ead938ec1 100644
 --- a/arch/x86/include/asm/desc.h
 +++ b/arch/x86/include/asm/desc.h
 @@ -40,7 +40,7 @@ struct gdt_page {
  	struct desc_struct gdt[GDT_ENTRIES];
  } __attribute__((aligned(PAGE_SIZE)));

 -DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page);
 +DECLARE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(struct gdt_page, gdt_page);

  static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
  {
 diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
 index eb92a6ed2be7..3354a390cc71 100644
 --- a/arch/x86/include/asm/hw_irq.h
 +++ b/arch/x86/include/asm/hw_irq.h
 @@ -164,7 +164,7 @@ extern asmlinkage void smp_invalidate_interrupt(struct pt_regs *);
  extern void (*__initconst interrupt[NR_VECTORS-FIRST_EXTERNAL_VECTOR])(void);

  typedef int vector_irq_t[NR_VECTORS];
 -DECLARE_PER_CPU(vector_irq_t, vector_irq);
 +DECLARE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq);
  extern void setup_vector_irq(int cpu);

  #ifdef CONFIG_X86_IO_APIC
 diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h
 new file mode 100644
 index 000000000000..6f4c8ef46881
 --- /dev/null
 +++ b/arch/x86/include/asm/kaiser.h
 @@ -0,0 +1,126 @@
 +#ifndef _ASM_X86_KAISER_H
 +#define _ASM_X86_KAISER_H
 +
 +#include <asm/processor-flags.h> /* For PCID constants */
 +
 +/*
 + * This file includes the definitions for the KAISER feature.
 + * KAISER is a countermeasure against x86_64 side channel attacks on
 + * the kernel virtual memory.  It has a shadow pgd for every process: the
 + * shadow pgd has a minimalistic kernel-set mapped, but includes the whole
 + * user memory. Within a kernel context switch, or when an interrupt is handled,
 + * the pgd is switched to the normal one. When the system switches to user mode,
 + * the shadow pgd is enabled. By this, the virtual memory caches are freed,
 + * and the user may not attack the whole kernel memory.
 + *
 + * A minimalistic kernel mapping holds the parts needed to be mapped in user
 + * mode, such as the entry/exit functions of the user space, or the stacks.
 + */
 +
 +#define KAISER_SHADOW_PGD_OFFSET 0x1000
 +
 +#ifdef __ASSEMBLY__
 +#ifdef CONFIG_KAISER
 +
 +.macro _SWITCH_TO_KERNEL_CR3 reg
 +movq %cr3, \reg
 +andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), \reg
 +orq  x86_cr3_pcid_noflush, \reg
 +movq \reg, %cr3
 +.endm
 +
 +.macro _SWITCH_TO_USER_CR3 reg regb
 +/*
 + * regb must be the low byte portion of reg: because we have arranged
 + * for the low byte of the user PCID to serve as the high byte of NOFLUSH
 + * (0x80 for each when PCID is enabled, or 0x00 when PCID and NOFLUSH are
 + * not enabled): so that the one register can update both memory and cr3.
 + */
 +movq %cr3, \reg
 +orq  PER_CPU_VAR(x86_cr3_pcid_user), \reg
 +js   9f
 +/* FLUSH this time, reset to NOFLUSH for next time (if PCID enabled) */
 +movb \regb, PER_CPU_VAR(x86_cr3_pcid_user+7)
 +9:
 +movq \reg, %cr3
 +.endm
 +
 +.macro SWITCH_KERNEL_CR3
 +pushq %rax
 +_SWITCH_TO_KERNEL_CR3 %rax
 +popq %rax
 +.endm
 +
 +.macro SWITCH_USER_CR3
 +pushq %rax
 +_SWITCH_TO_USER_CR3 %rax %al
 +popq %rax
 +.endm
 +
 +.macro SWITCH_KERNEL_CR3_NO_STACK
 +movq %rax, PER_CPU_VAR(unsafe_stack_register_backup)
 +_SWITCH_TO_KERNEL_CR3 %rax
 +movq PER_CPU_VAR(unsafe_stack_register_backup), %rax
 +.endm
 +
 +#else /* CONFIG_KAISER */
 +
 +.macro SWITCH_KERNEL_CR3 reg
 +.endm
 +.macro SWITCH_USER_CR3 reg regb
 +.endm
 +.macro SWITCH_KERNEL_CR3_NO_STACK
 +.endm
 +
 +#endif /* CONFIG_KAISER */
 +
 +#else /* __ASSEMBLY__ */
 +
 +#ifdef CONFIG_KAISER
 +/*
 + * Upon kernel/user mode switch, it may happen that the address
 + * space has to be switched before the registers have been
 + * stored.  To change the address space, another register is
 + * needed.  A register therefore has to be stored/restored.
 + */
 +DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
 +
 +extern unsigned long x86_cr3_pcid_noflush;
 +DECLARE_PER_CPU(unsigned long, x86_cr3_pcid_user);
 +
 +extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[];
 +
 +/**
 + *  kaiser_add_mapping - map a virtual memory part to the shadow (user) mapping
 + *  @addr: the start address of the range
 + *  @size: the size of the range
 + *  @flags: The mapping flags of the pages
 + *
 + *  The mapping is done on a global scope, so no bigger
 + *  synchronization has to be done.  The pages have to be
 + *  manually unmapped again when they are not needed any longer.
 + */
 +extern int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags);
 +
 +/**
 + *  kaiser_remove_mapping - unmap a virtual memory part of the shadow mapping
 + *  @start: the start address of the range
 + *  @size: the size of the range
 + */
 +extern void kaiser_remove_mapping(unsigned long start, unsigned long size);
 +
 +/**
 + *  kaiser_init - Initialize the shadow mapping
 + *
 + *  Most parts of the shadow mapping can be mapped upon boot
 + *  time.  Only per-process things like the thread stacks
 + *  or a new LDT have to be mapped at runtime.  These boot-
 + *  time mappings are permanent and never unmapped.
 + */
 +extern void kaiser_init(void);
 +
 +#endif /* CONFIG_KAISER */
 +
 +#endif /* __ASSEMBLY__ */
 +
 +#endif /* _ASM_X86_KAISER_H */
 diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
 index 6be990922d4b..b1c8b8d3b02a 100644
 --- a/arch/x86/include/asm/pgtable.h
 +++ b/arch/x86/include/asm/pgtable.h
 @@ -570,7 +570,17 @@ static inline pud_t *pud_offset(pgd_t *pgd, unsigned long address)

  static inline int pgd_bad(pgd_t pgd)
  {
 -	return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE;
 +	pgdval_t ignore_flags = _PAGE_USER;
 +	/*
 +	 * We set NX on KAISER pgds that map userspace memory so
 +	 * that userspace can not meaningfully use the kernel
 +	 * page table by accident; it will fault on the first
 +	 * instruction it tries to run.  See native_set_pgd().
 +	 */
 +	if (IS_ENABLED(CONFIG_KAISER))
 +		ignore_flags |= _PAGE_NX;
 +
 +	return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE;
  }

  static inline int pgd_none(pgd_t pgd)
 @@ -771,6 +781,12 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm,
  static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
  {
         memcpy(dst, src, count * sizeof(pgd_t));
 +#ifdef CONFIG_KAISER
 +	/* Clone the shadow pgd part as well */
 +	memcpy(native_get_shadow_pgd(dst),
 +	       native_get_shadow_pgd(src),
 +	       count * sizeof(pgd_t));
 +#endif
  }


 diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
 index 975f709e09ae..a3bf3de9893b 100644
 --- a/arch/x86/include/asm/pgtable_64.h
 +++ b/arch/x86/include/asm/pgtable_64.h
 @@ -105,9 +105,36 @@ static inline void native_pud_clear(pud_t *pud)
  	native_set_pud(pud, native_make_pud(0));
  }

 +#ifdef CONFIG_KAISER
 +extern pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd);
 +
 +static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp)
 +{
 +	return (pgd_t *)((unsigned long)pgdp | (unsigned long)PAGE_SIZE);
 +}
 +
 +static inline pgd_t *native_get_normal_pgd(pgd_t *pgdp)
 +{
 +	return (pgd_t *)((unsigned long)pgdp & ~(unsigned long)PAGE_SIZE);
 +}
 +#else
 +static inline pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd)
 +{
 +	return pgd;
 +}
 +static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp)
 +{
 +	return NULL;
 +}
 +static inline pgd_t *native_get_normal_pgd(pgd_t *pgdp)
 +{
 +	return pgdp;
 +}
 +#endif /* CONFIG_KAISER */
 +
  static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
  {
 -	*pgdp = pgd;
 +	*pgdp = kaiser_set_shadow_pgd(pgdp, pgd);
  }

  static inline void native_pgd_clear(pgd_t *pgd)
 diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
 index 013286a10c2c..6e1315068a62 100644
 --- a/arch/x86/include/asm/pgtable_types.h
 +++ b/arch/x86/include/asm/pgtable_types.h
 @@ -39,7 +39,11 @@
  #define _PAGE_ACCESSED	(_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED)
  #define _PAGE_DIRTY	(_AT(pteval_t, 1) << _PAGE_BIT_DIRTY)
  #define _PAGE_PSE	(_AT(pteval_t, 1) << _PAGE_BIT_PSE)
 +#ifdef CONFIG_KAISER
 +#define _PAGE_GLOBAL	(_AT(pteval_t, 0))
 +#else
  #define _PAGE_GLOBAL	(_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
 +#endif
  #define _PAGE_UNUSED1	(_AT(pteval_t, 1) << _PAGE_BIT_UNUSED1)
  #define _PAGE_IOMAP	(_AT(pteval_t, 1) << _PAGE_BIT_IOMAP)
  #define _PAGE_PAT	(_AT(pteval_t, 1) << _PAGE_BIT_PAT)
 @@ -62,7 +66,7 @@
  #endif

  #define _PAGE_FILE	(_AT(pteval_t, 1) << _PAGE_BIT_FILE)
 -#define _PAGE_PROTNONE	(_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
 +#define _PAGE_PROTNONE  (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)

  #define _PAGE_TABLE	(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |	\
  			 _PAGE_ACCESSED | _PAGE_DIRTY)
 @@ -74,6 +78,33 @@
  			 _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY)
  #define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE)

 +/* The ASID is the lower 12 bits of CR3 */
 +#define X86_CR3_PCID_ASID_MASK  (_AC((1<<12)-1,UL))
 +
 +/* Mask for all the PCID-related bits in CR3: */
 +#define X86_CR3_PCID_MASK       (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_MASK)
 +#define X86_CR3_PCID_ASID_KERN  (_AC(0x0,UL))
 +
 +#if defined(CONFIG_KAISER) && defined(CONFIG_X86_64)
 +/* Let X86_CR3_PCID_ASID_USER be usable for the X86_CR3_PCID_NOFLUSH bit */
 +#define X86_CR3_PCID_ASID_USER	(_AC(0x80,UL))
 +
 +#define X86_CR3_PCID_KERN_FLUSH		(X86_CR3_PCID_ASID_KERN)
 +#define X86_CR3_PCID_USER_FLUSH		(X86_CR3_PCID_ASID_USER)
 +#define X86_CR3_PCID_KERN_NOFLUSH	(X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_KERN)
 +#define X86_CR3_PCID_USER_NOFLUSH	(X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_USER)
 +#else
 +#define X86_CR3_PCID_ASID_USER  (_AC(0x0,UL))
 +/*
 + * PCIDs are unsupported on 32-bit and none of these bits can be
 + * set in CR3:
 + */
 +#define X86_CR3_PCID_KERN_FLUSH		(0)
 +#define X86_CR3_PCID_USER_FLUSH		(0)
 +#define X86_CR3_PCID_KERN_NOFLUSH	(0)
 +#define X86_CR3_PCID_USER_NOFLUSH	(0)
 +#endif
 +
  #define _PAGE_CACHE_MASK	(_PAGE_PCD | _PAGE_PWT)
  #define _PAGE_CACHE_WB		(0)
  #define _PAGE_CACHE_WC		(_PAGE_PWT)
 diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h
 index a9e14a52385f..360e80d0d217 100644
 --- a/arch/x86/include/asm/processor-flags.h
 +++ b/arch/x86/include/asm/processor-flags.h
 @@ -43,6 +43,8 @@
   */
  #define X86_CR3_PWT	0x00000008 /* Page Write Through */
  #define X86_CR3_PCD	0x00000010 /* Page Cache Disable */
 +#define X86_CR3_PCID_NOFLUSH_BIT 63 /* Preserve old PCID */
 +#define X86_CR3_PCID_NOFLUSH (_AC(1,ULL) << X86_CR3_PCID_NOFLUSH_BIT)

  /*
   * Intel CPU features in CR4
 diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
 index f7c89e231c6c..048249e983ca 100644
 --- a/arch/x86/include/asm/processor.h
 +++ b/arch/x86/include/asm/processor.h
 @@ -266,7 +266,7 @@ struct tss_struct {

  } ____cacheline_aligned;

 -DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss);
 +DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, init_tss);

  /*
   * Save the original ist values for checking stack pointers during debugging
 diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
 index e04cbc550424..288195901c8a 100644
 --- a/arch/x86/include/asm/tlbflush.h
 +++ b/arch/x86/include/asm/tlbflush.h
 @@ -64,27 +64,59 @@ static inline void invpcid_flush_all_nonglobals(void)
  #define __flush_tlb_single(addr) __native_flush_tlb_single(addr)
  #endif

 +/*
 + * Declare a couple of kaiser interfaces here for convenience,
 + * to avoid the need for asm/kaiser.h in unexpected places.
 + */
 +#ifdef CONFIG_KAISER
 +extern void kaiser_setup_pcid(void);
 +extern void kaiser_flush_tlb_on_return_to_user(void);
 +#else
 +static inline void kaiser_setup_pcid(void)
 +{
 +}
 +static inline void kaiser_flush_tlb_on_return_to_user(void)
 +{
 +}
 +#endif
 +
  static inline void __native_flush_tlb(void)
  {
 +	if (this_cpu_has(X86_FEATURE_INVPCID)) {
 +		/*
 +		 * Note, this works with CR4.PCIDE=0 or 1.
 +		 */
 +		invpcid_flush_all_nonglobals();
 +		return;
 +	}
 +
  	/*
  	 * If current->mm == NULL then we borrow a mm which may change during a
  	 * task switch and therefore we must not be preempted while we write CR3
  	 * back:
  	 */
  	preempt_disable();
 +	if (this_cpu_has(X86_FEATURE_PCID))
 +		kaiser_flush_tlb_on_return_to_user();
  	native_write_cr3(native_read_cr3());
  	preempt_enable();
  }

  static inline void __native_flush_tlb_global(void)
  {
 +#ifdef CONFIG_KAISER
 +	/* Globals are not used at all */
 +	__native_flush_tlb();
 +#else
  	unsigned long flags;
  	unsigned long cr4;

 -	if (static_cpu_has(X86_FEATURE_INVPCID)) {
 +	if (this_cpu_has(X86_FEATURE_INVPCID)) {
  		/*
  		 * Using INVPCID is considerably faster than a pair of writes
  		 * to CR4 sandwiched inside an IRQ flag save/restore.
 +		 *
 +		 * Note, this works with CR4.PCIDE=0 or 1.
  		 */
  		invpcid_flush_all();
  		return;
 @@ -104,11 +136,39 @@ static inline void __native_flush_tlb_global(void)
  	native_write_cr4(cr4);

  	raw_local_irq_restore(flags);
 +#endif
  }

  static inline void __native_flush_tlb_single(unsigned long addr)
  {
 -	asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
 +	/*
 +	 * SIMICS #GP's if you run INVPCID with type 2/3
 +	 * and X86_CR4_PCIDE clear.  Shame!
 +	 *
 +	 * The ASIDs used below are hard-coded.  But, we must not
 +	 * call invpcid(type=1/2) before CR4.PCIDE=1.  Just call
 +	 * invlpg in the case we are called early.
 +	 */
 +
 +	if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE)) {
 +		if (this_cpu_has(X86_FEATURE_PCID))
 +			kaiser_flush_tlb_on_return_to_user();
 +		asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
 +		return;
 +	}
 +	/* Flush the address out of both PCIDs. */
 +	/*
 +	 * An optimization here might be to determine addresses
 +	 * that are only kernel-mapped and only flush the kernel
 +	 * ASID.  But, userspace flushes are probably much more
 +	 * important performance-wise.
 +	 *
 +	 * Make sure to do only a single invpcid when KAISER is
 +	 * disabled and we have only a single ASID.
 +	 */
 +	if (X86_CR3_PCID_ASID_KERN != X86_CR3_PCID_ASID_USER)
 +		invpcid_flush_one(X86_CR3_PCID_ASID_KERN, addr);
 +	invpcid_flush_one(X86_CR3_PCID_ASID_USER, addr);
  }

  static inline void __flush_tlb_all(void)
 diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
 index 895e4b88469c..b567c89fc628 100644
 --- a/arch/x86/kernel/cpu/common.c
 +++ b/arch/x86/kernel/cpu/common.c
 @@ -84,7 +84,7 @@ static const struct cpu_dev __cpuinitconst default_cpu = {

  static const struct cpu_dev *this_cpu __cpuinitdata = &default_cpu;

 -DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
 +DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(struct gdt_page, gdt_page) = { .gdt = {
  #ifdef CONFIG_X86_64
  	/*
  	 * We need valid kernel segments for data and code in long mode too
 @@ -319,6 +319,19 @@ static void setup_pcid(struct cpuinfo_x86 *c)
  			 * SDM says that it can't be enabled in 32-bit mode.
  			 */
  			set_in_cr4(X86_CR4_PCIDE);
 +			/*
 +			 * INVPCID has two "groups" of types:
 +			 * 0/1: Invalidate one address or one context
 +			 * 2/3: Invalidate all contexts
 +			 *
 +			 * 0/1 take a PCID, but 2/3 do not.  So, 2/3
 +			 * ignore the PCID argument in the descriptor.
 +			 * But, we have to be careful not to call 0/1
 +			 * with an actual non-zero PCID in them before
 +			 * we do the above set_in_cr4().
 +			 */
 +			if (cpu_has(c, X86_FEATURE_INVPCID))
 +				set_cpu_cap(c, X86_FEATURE_INVPCID_SINGLE);
  		} else {
  			/*
  			 * flush_tlb_all(), as currently implemented, won't
 @@ -331,6 +344,7 @@ static void setup_pcid(struct cpuinfo_x86 *c)
  			clear_cpu_cap(c, X86_FEATURE_PCID);
  		}
  	}
 +	kaiser_setup_pcid();
  }

  /*
 @@ -1115,7 +1129,7 @@ static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
  	  [DEBUG_STACK - 1]			= DEBUG_STKSZ
  };

 -static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
 +DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(char, exception_stacks
  	[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);

  /* May not be marked __init: used by software suspend */
 diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
 index 2d4e76ba2b5c..fb933cdca184 100644
 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
 +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
 @@ -2,10 +2,14 @@
  #include <linux/types.h>
  #include <linux/slab.h>

 +#include <asm/kaiser.h>
  #include <asm/perf_event.h>

  #include "perf_event.h"

 +static
 +DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct debug_store, cpu_debug_store);
 +
  /* The size of a BTS record in bytes: */
  #define BTS_RECORD_SIZE		24

 @@ -60,6 +64,39 @@ void fini_debug_store_on_cpu(int cpu)
  	wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0);
  }

 +static void *dsalloc(size_t size, gfp_t flags, int node)
 +{
 +#ifdef CONFIG_KAISER
 +	unsigned int order = get_order(size);
 +	struct page *page;
 +	unsigned long addr;
 +
 +	page = alloc_pages_node(node, flags | __GFP_ZERO, order);
 +	if (!page)
 +		return NULL;
 +	addr = (unsigned long)page_address(page);
 +	if (kaiser_add_mapping(addr, size, __PAGE_KERNEL) < 0) {
 +		__free_pages(page, order);
 +		addr = 0;
 +	}
 +	return (void *)addr;
 +#else
 +	return kmalloc_node(size, flags | __GFP_ZERO, node);
 +#endif
 +}
 +
 +static void dsfree(const void *buffer, size_t size)
 +{
 +#ifdef CONFIG_KAISER
 +	if (!buffer)
 +		return;
 +	kaiser_remove_mapping((unsigned long)buffer, size);
 +	free_pages((unsigned long)buffer, get_order(size));
 +#else
 +	kfree(buffer);
 +#endif
 +}
 +
  static int alloc_pebs_buffer(int cpu)
  {
  	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
 @@ -70,7 +107,7 @@ static int alloc_pebs_buffer(int cpu)
  	if (!x86_pmu.pebs)
  		return 0;

 -	buffer = kmalloc_node(PEBS_BUFFER_SIZE, GFP_KERNEL | __GFP_ZERO, node);
 +	buffer = dsalloc(PEBS_BUFFER_SIZE, GFP_KERNEL, node);
  	if (unlikely(!buffer))
  		return -ENOMEM;

 @@ -94,7 +131,7 @@ static void release_pebs_buffer(int cpu)
  	if (!ds || !x86_pmu.pebs)
  		return;

 -	kfree((void *)(unsigned long)ds->pebs_buffer_base);
 +	dsfree((void *)(unsigned long)ds->pebs_buffer_base, PEBS_BUFFER_SIZE);
  	ds->pebs_buffer_base = 0;
  }

 @@ -108,7 +145,7 @@ static int alloc_bts_buffer(int cpu)
  	if (!x86_pmu.bts)
  		return 0;

 -	buffer = kmalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_ZERO, node);
 +	buffer = dsalloc(BTS_BUFFER_SIZE, GFP_KERNEL, node);
  	if (unlikely(!buffer))
  		return -ENOMEM;

 @@ -132,19 +169,15 @@ static void release_bts_buffer(int cpu)
  	if (!ds || !x86_pmu.bts)
  		return;

 -	kfree((void *)(unsigned long)ds->bts_buffer_base);
 +	dsfree((void *)(unsigned long)ds->bts_buffer_base, BTS_BUFFER_SIZE);
  	ds->bts_buffer_base = 0;
  }

  static int alloc_ds_buffer(int cpu)
  {
 -	int node = cpu_to_node(cpu);
 -	struct debug_store *ds;
 -
 -	ds = kmalloc_node(sizeof(*ds), GFP_KERNEL | __GFP_ZERO, node);
 -	if (unlikely(!ds))
 -		return -ENOMEM;
 +	struct debug_store *ds = per_cpu_ptr(&cpu_debug_store, cpu);

 +	memset(ds, 0, sizeof(*ds));
  	per_cpu(cpu_hw_events, cpu).ds = ds;

  	return 0;
 @@ -158,7 +191,6 @@ static void release_ds_buffer(int cpu)
  		return;

  	per_cpu(cpu_hw_events, cpu).ds = NULL;
 -	kfree(ds);
  }

  void release_ds_buffers(void)
 diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
 index f6daf3cdb878..3a4356a2f156 100644
 --- a/arch/x86/kernel/entry_64.S
 +++ b/arch/x86/kernel/entry_64.S
 @@ -56,6 +56,7 @@
  #include <asm/ftrace.h>
  #include <asm/percpu.h>
  #include <asm/pgtable_types.h>
 +#include <asm/kaiser.h>

  /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this.  */
  #include <linux/elf-em.h>
 @@ -323,6 +324,7 @@ ENDPROC(native_usergs_sysret64)
  	testl $3, CS(%rdi)
  	je 1f
  	SWAPGS
 +	SWITCH_KERNEL_CR3
  	/*
  	 * irq_count is used to check if a CPU is already on an interrupt stack
  	 * or not. While this is essentially redundant with preempt_count it is
 @@ -362,6 +364,12 @@ END(save_rest)

  /* save complete stack frame */
  	.pushsection .kprobes.text, "ax"
 +/*
 + * Return: ebx=0: needs swapgs but not SWITCH_USER_CR3 in paranoid_exit
 + *         ebx=1: needs neither swapgs nor SWITCH_USER_CR3 in paranoid_exit
 + *         ebx=2: needs both swapgs and SWITCH_USER_CR3 in paranoid_exit
 + *         ebx=3: needs SWITCH_USER_CR3 but not swapgs in paranoid_exit
 + */
  ENTRY(save_paranoid)
  	XCPT_FRAME 1 RDI+8
  	cld
 @@ -387,7 +395,25 @@ ENTRY(save_paranoid)
  	js 1f	/* negative -> in kernel */
  	SWAPGS
  	xorl %ebx,%ebx
 -1:	ret
 +1:
 +#ifdef CONFIG_KAISER
 +	/*
 +	 * We might have come in between a swapgs and a SWITCH_KERNEL_CR3
 +	 * on entry, or between a SWITCH_USER_CR3 and a swapgs on exit.
 +	 * Do a conditional SWITCH_KERNEL_CR3: this could safely be done
 +	 * unconditionally, but we need to find out whether the reverse
 +	 * should be done on return (conveyed to paranoid_exit in %ebx).
 +	 */
 +	movq	%cr3, %rax
 +	testl	$KAISER_SHADOW_PGD_OFFSET, %eax
 +	jz	2f
 +	orl	$2, %ebx
 +	andq	$(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
 +	orq	x86_cr3_pcid_noflush, %rax
 +	movq	%rax, %cr3
 +2:
 +#endif
 +	ret
  	CFI_ENDPROC
  END(save_paranoid)
  	.popsection
 @@ -464,6 +490,7 @@ ENTRY(system_call)
  	CFI_REGISTER	rip,rcx
  	/*CFI_REGISTER	rflags,r11*/
  	SWAPGS_UNSAFE_STACK
 +	SWITCH_KERNEL_CR3_NO_STACK
  	/*
  	 * A hypervisor implementation might want to use a label
  	 * after the swapgs, so that it can do the swapgs
 @@ -515,6 +542,14 @@ ENTRY(system_call_after_swapgs)
  	CFI_REGISTER	rip,rcx
  	RESTORE_ARGS 1,-ARG_SKIP,0
  	/*CFI_REGISTER	rflags,r11*/
 +	/*
 +	 * This opens a window where we have a user CR3, but are
 +	 * running in the kernel.  This makes using the CS
 +	 * register useless for telling whether or not we need to
 +	 * switch CR3 in NMIs.  Normal interrupts are OK because
 +	 * they are off here.
 +	 */
 +	SWITCH_USER_CR3
  	movq	PER_CPU_VAR(old_rsp), %rsp
  	USERGS_SYSRET64

 @@ -851,6 +886,14 @@ retint_swapgs:		/* return to user-space */
  	 */
  	DISABLE_INTERRUPTS(CLBR_ANY)
  	TRACE_IRQS_IRETQ
 +	/*
 +	 * This opens a window where we have a user CR3, but are
 +	 * running in the kernel.  This makes using the CS
 +	 * register useless for telling whether or not we need to
 +	 * switch CR3 in NMIs.  Normal interrupts are OK because
 +	 * they are off here.
 +	 */
 +	SWITCH_USER_CR3
  	SWAPGS
  	jmp restore_args

 @@ -891,6 +934,7 @@ ENTRY(native_iret)
  	pushq_cfi %rax
  	pushq_cfi %rdi
  	SWAPGS
 +	SWITCH_KERNEL_CR3
  	movq PER_CPU_VAR(espfix_waddr),%rdi
  	movq %rax,(0*8)(%rdi)	/* RAX */
  	movq (2*8)(%rsp),%rax	/* RIP */
 @@ -906,6 +950,7 @@ ENTRY(native_iret)
  	andl $0xffff0000,%eax
  	popq_cfi %rdi
  	orq PER_CPU_VAR(espfix_stack),%rax
 +	SWITCH_USER_CR3
  	SWAPGS
  	movq %rax,%rsp
  	popq_cfi %rax
 @@ -1366,30 +1411,40 @@ paranoidzeroentry machine_check *machine_check_vector(%rip)
  	 * is fundamentally NMI-unsafe. (we cannot change the soft and
  	 * hard flags at once, atomically)
  	 */
 -
 -	/* ebx:	no swapgs flag */
 +/*
 + * On entry: ebx=0: needs swapgs but not SWITCH_USER_CR3
 + *           ebx=1: needs neither swapgs nor SWITCH_USER_CR3
 + *           ebx=2: needs both swapgs and SWITCH_USER_CR3
 + *           ebx=3: needs SWITCH_USER_CR3 but not swapgs
 + */
  ENTRY(paranoid_exit)
  	DEFAULT_FRAME
  	DISABLE_INTERRUPTS(CLBR_NONE)
  	TRACE_IRQS_OFF
 -	testl %ebx,%ebx				/* swapgs needed? */
 -	jnz paranoid_restore
 -	testl $3,CS(%rsp)
 -	jnz   paranoid_userspace
 -paranoid_swapgs:
 +	movq	%rbx, %r12		/* paranoid_userspace uses %ebx */
 +	testl	$3, CS(%rsp)
 +	jnz	paranoid_userspace
 +paranoid_kernel:
 +	movq	%r12, %rbx		/* restore after paranoid_userspace */
  	TRACE_IRQS_IRETQ 0
 +#ifdef CONFIG_KAISER
 +	testl	$2, %ebx		/* SWITCH_USER_CR3 needed? */
 +	jz	paranoid_exit_no_switch
 +	SWITCH_USER_CR3
 +paranoid_exit_no_switch:
 +#endif
 +	testl	$1, %ebx		/* swapgs needed? */
 +	jnz	paranoid_exit_no_swapgs
  	SWAPGS_UNSAFE_STACK
 +paranoid_exit_no_swapgs:
  	RESTORE_ALL 8
 -	jmp irq_return
 -paranoid_restore:
 -	TRACE_IRQS_IRETQ 0
 -	RESTORE_ALL 8
 -	jmp irq_return
 +	jmp	irq_return
 +
  paranoid_userspace:
  	GET_THREAD_INFO(%rcx)
  	movl TI_flags(%rcx),%ebx
  	andl $_TIF_WORK_MASK,%ebx
 -	jz paranoid_swapgs
 +	jz paranoid_kernel
  	movq %rsp,%rdi			/* &pt_regs */
  	call sync_regs
  	movq %rax,%rsp			/* switch stack for scheduling */
 @@ -1438,6 +1493,13 @@ ENTRY(error_entry)
  	movq_cfi r13, R13+8
  	movq_cfi r14, R14+8
  	movq_cfi r15, R15+8
 +	/*
 +	 * error_entry() always returns with a kernel gsbase and
 +	 * CR3.  We must also have a kernel CR3/gsbase before
 +	 * calling TRACE_IRQS_*.  Just unconditionally switch to
 +	 * the kernel CR3 here.
 +	 */
 +	SWITCH_KERNEL_CR3
  	xorl %ebx,%ebx
  	testl $3,CS+8(%rsp)
  	je error_kernelspace
 @@ -1527,22 +1589,31 @@ ENTRY(nmi)
  	call do_nmi
  #ifdef CONFIG_TRACE_IRQFLAGS
  	/* paranoidexit; without TRACE_IRQS_OFF */
 -	/* ebx:	no swapgs flag */
 +	/* ebx:	no-swapgs and kaiser-switch-cr3 flag */
  	DISABLE_INTERRUPTS(CLBR_NONE)
 -	testl %ebx,%ebx				/* swapgs needed? */
 -	jnz nmi_restore
 -	testl $3,CS(%rsp)
 -	jnz nmi_userspace
 -nmi_swapgs:
 +	movq	%rbx, %r12		/* nmi_userspace uses %ebx */
 +	testl	$3, CS(%rsp)
 +	jnz	nmi_userspace
 +nmi_kernel:
 +	movq	%r12, %rbx		/* restore after nmi_userspace */
 +#ifdef CONFIG_KAISER
 +	testl	$2, %ebx		/* SWITCH_USER_CR3 needed? */
 +	jz	nmi_exit_no_switch
 +	SWITCH_USER_CR3
 +nmi_exit_no_switch:
 +#endif
 +	testl	$1, %ebx		/* swapgs needed? */
 +	jnz	nmi_exit_no_swapgs
  	SWAPGS_UNSAFE_STACK
 -nmi_restore:
 +nmi_exit_no_swapgs:
  	RESTORE_ALL 8
 -	jmp irq_return
 +	jmp	irq_return
 +
  nmi_userspace:
  	GET_THREAD_INFO(%rcx)
  	movl TI_flags(%rcx),%ebx
  	andl $_TIF_WORK_MASK,%ebx
 -	jz nmi_swapgs
 +	jz nmi_kernel
  	movq %rsp,%rdi			/* &pt_regs */
  	call sync_regs
  	movq %rax,%rsp			/* switch stack for scheduling */
 diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
 index 94d857fb1033..14cd73b0e634 100644
 --- a/arch/x86/kernel/espfix_64.c
 +++ b/arch/x86/kernel/espfix_64.c
 @@ -41,6 +41,7 @@
  #include <asm/pgalloc.h>
  #include <asm/setup.h>
  #include <asm/espfix.h>
 +#include <asm/kaiser.h>

  /*
   * Note: we only need 6*8 = 48 bytes for the espfix stack, but round
 @@ -129,6 +130,14 @@ void __init init_espfix_bsp(void)
  	/* Install the espfix pud into the kernel page directory */
  	pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)];
  	pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page);
 +	/*
 +	 * Just copy the top-level PGD that is mapping the espfix
 +	 * area to ensure it is mapped into the shadow user page
 +	 * tables.
 +	 */
 +	if (IS_ENABLED(CONFIG_KAISER))
 +		set_pgd(native_get_shadow_pgd(pgd_p),
 +			__pgd(_KERNPG_TABLE | __pa((pud_t *)espfix_pud_page)));

  	/* Randomize the locations */
  	init_espfix_random();
 diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
 index 0f8ebf78253a..6e697ac3fb54 100644
 --- a/arch/x86/kernel/head_64.S
 +++ b/arch/x86/kernel/head_64.S
 @@ -338,6 +338,27 @@ ENTRY(early_idt_handler)
  	.balign	PAGE_SIZE; \
  ENTRY(name)

 +#ifdef CONFIG_KAISER
 +/*
 + * Each PGD needs to be 8k long and 8k aligned.  We do not
 + * ever go out to userspace with these, so we do not
 + * strictly *need* the second page, but this allows us to
 + * have a single set_pgd() implementation that does not
 + * need to worry about whether it has 4k or 8k to work
 + * with.
 + *
 + * This ensures PGDs are 8k long:
 + */
 +#define KAISER_USER_PGD_FILL	512
 +/* This ensures they are 8k-aligned: */
 +#define NEXT_PGD_PAGE(name) \
 +	.balign 2 * PAGE_SIZE; \
 +GLOBAL(name)
 +#else
 +#define NEXT_PGD_PAGE(name) NEXT_PAGE(name)
 +#define KAISER_USER_PGD_FILL	0
 +#endif
 +
  /* Automate the creation of 1 to 1 mapping pmd entries */
  #define PMDS(START, PERM, COUNT)			\
  	i = 0 ;						\
 @@ -353,13 +374,14 @@ ENTRY(name)
  	 * 0xffffffff80000000 to physical address 0x000000. (always using
  	 * 2Mbyte large pages provided by PAE mode)
  	 */
 -NEXT_PAGE(init_level4_pgt)
 +NEXT_PGD_PAGE(init_level4_pgt)
  	.quad	level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
  	.org	init_level4_pgt + L4_PAGE_OFFSET*8, 0
  	.quad	level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
  	.org	init_level4_pgt + L4_START_KERNEL*8, 0
  	/* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
  	.quad	level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
 +	.fill	KAISER_USER_PGD_FILL,8,0

  NEXT_PAGE(level3_ident_pgt)
  	.quad	level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
 @@ -385,6 +407,7 @@ NEXT_PAGE(level2_ident_pgt)
  	 * Don't set NX because code runs from these pages.
  	 */
  	PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
 +	.fill	KAISER_USER_PGD_FILL,8,0

  NEXT_PAGE(level2_kernel_pgt)
  	/*
 diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c
 index 43e9ccf44947..f00e6e734fbd 100644
 --- a/arch/x86/kernel/init_task.c
 +++ b/arch/x86/kernel/init_task.c
 @@ -38,5 +38,5 @@ EXPORT_SYMBOL(init_task);
   * section. Since TSS's are completely CPU-local, we want them
   * on exact cacheline boundaries, to eliminate cacheline ping-pong.
   */
 -DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS;
 +DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, init_tss) = INIT_TSS;

 diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
 index e328f691eeef..990f743e21b8 100644
 --- a/arch/x86/kernel/irqinit.c
 +++ b/arch/x86/kernel/irqinit.c
 @@ -85,7 +85,7 @@ static struct irqaction irq2 = {
  	.flags = IRQF_NO_THREAD,
  };

 -DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
 +DEFINE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq) = {
  	[0 ... NR_VECTORS - 1] = -1,
  };

 diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
 index 1dd32307a494..836a4c2d5ceb 100644
 --- a/arch/x86/kernel/ldt.c
 +++ b/arch/x86/kernel/ldt.c
 @@ -15,6 +15,7 @@
  #include <linux/slab.h>
  #include <linux/vmalloc.h>
  #include <linux/uaccess.h>
 +#include <linux/kaiser.h>

  #include <asm/system.h>
  #include <asm/ldt.h>
 @@ -34,11 +35,21 @@ static void flush_ldt(void *current_mm)
  	set_ldt(pc->ldt->entries, pc->ldt->size);
  }

 +static void __free_ldt_struct(struct ldt_struct *ldt)
 +{
 +	if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE)
 +		vfree(ldt->entries);
 +	else
 +		free_page((unsigned long)ldt->entries);
 +	kfree(ldt);
 +}
 +
  /* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */
  static struct ldt_struct *alloc_ldt_struct(int size)
  {
  	struct ldt_struct *new_ldt;
  	int alloc_size;
 +	int ret;

  	if (size > LDT_ENTRIES)
  		return NULL;
 @@ -66,7 +77,13 @@ static struct ldt_struct *alloc_ldt_struct(int size)
  		return NULL;
  	}

 +	ret = kaiser_add_mapping((unsigned long)new_ldt->entries, alloc_size,
 +				 __PAGE_KERNEL);
  	new_ldt->size = size;
 +	if (ret) {
 +		__free_ldt_struct(new_ldt);
 +		return NULL;
 +	}
  	return new_ldt;
  }

 @@ -97,12 +114,10 @@ static void free_ldt_struct(struct ldt_struct *ldt)
  	if (likely(!ldt))
  		return;

 +	kaiser_remove_mapping((unsigned long)ldt->entries,
 +			      ldt->size * LDT_ENTRY_SIZE);
  	paravirt_free_ldt(ldt->entries, ldt->size);
 -	if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE)
 -		vfree(ldt->entries);
 -	else
 -		kfree(ldt->entries);
 -	kfree(ldt);
 +	__free_ldt_struct(ldt);
  }

  /*
 diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
 index 557eb3757edb..d2ce2a33d15b 100644
 --- a/arch/x86/kernel/process_64.c
 +++ b/arch/x86/kernel/process_64.c
 @@ -57,7 +57,7 @@

  asmlinkage extern void ret_from_fork(void);

 -DEFINE_PER_CPU(unsigned long, old_rsp);
 +DEFINE_PER_CPU_USER_MAPPED(unsigned long, old_rsp);
  static DEFINE_PER_CPU(unsigned char, is_idle);

  static ATOMIC_NOTIFIER_HEAD(idle_notifier);
 diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
 index cf2a84031dfd..c9a00a5e0b87 100644
 --- a/arch/x86/mm/Makefile
 +++ b/arch/x86/mm/Makefile
 @@ -29,3 +29,4 @@ obj-$(CONFIG_NUMA_EMU)		+= numa_emulation.o
  obj-$(CONFIG_HAVE_MEMBLOCK)		+= memblock.o

  obj-$(CONFIG_MEMTEST)		+= memtest.o
 +obj-$(CONFIG_KAISER)		+= kaiser.o
 diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c
 new file mode 100644
 index 000000000000..79b0222ffa74
 --- /dev/null
 +++ b/arch/x86/mm/kaiser.c
 @@ -0,0 +1,382 @@
 +#include <linux/bug.h>
 +#include <linux/kernel.h>
 +#include <linux/errno.h>
 +#include <linux/string.h>
 +#include <linux/types.h>
 +#include <linux/bug.h>
 +#include <linux/init.h>
 +#include <linux/interrupt.h>
 +#include <linux/spinlock.h>
 +#include <linux/mm.h>
 +#include <linux/module.h>
 +#include <linux/uaccess.h>
 +#include <linux/ftrace.h>
 +
 +extern struct mm_struct init_mm;
 +
 +#include <asm/kaiser.h>
 +#include <asm/tlbflush.h>	/* to verify its kaiser declarations */
 +#include <asm/pgtable.h>
 +#include <asm/pgalloc.h>
 +#include <asm/desc.h>
 +
 +#ifdef CONFIG_KAISER
 +DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
 +
 +/*
 + * These can have bit 63 set, so we can not just use a plain "or"
 + * instruction to get their value or'd into CR3.  It would take
 + * another register.  So, we use a memory reference to these instead.
 + *
 + * This is also handy because systems that do not support PCIDs
 + * just end up or'ing a 0 into their CR3, which does no harm.
 + */
 +unsigned long x86_cr3_pcid_noflush __read_mostly;
 +DEFINE_PER_CPU(unsigned long, x86_cr3_pcid_user);
 +
 +/*
 + * At runtime, the only things we map are some things for CPU
 + * hotplug, and stacks for new processes.  No two CPUs will ever
 + * be populating the same addresses, so we only need to ensure
 + * that we protect between two CPUs trying to allocate and
 + * populate the same page table page.
 + *
 + * Only take this lock when doing a set_p[4um]d(), but it is not
 + * needed for doing a set_pte().  We assume that only the *owner*
 + * of a given allocation will be doing this for _their_
 + * allocation.
 + *
 + * This ensures that once a system has been running for a while
 + * and there have been stacks all over and these page tables
 + * are fully populated, there will be no further acquisitions of
 + * this lock.
 + */
 +static DEFINE_SPINLOCK(shadow_table_allocation_lock);
 +
 +/*
 + * Walk the kernel page tables for vaddr: returns its PA, or -1 on error.
 + */
 +static inline unsigned long get_pa_from_mapping(unsigned long vaddr)
 +{
 +	pgd_t *pgd;
 +	pud_t *pud;
 +	pmd_t *pmd;
 +	pte_t *pte;
 +
 +	pgd = pgd_offset_k(vaddr);
 +	/*
 +	 * We made all the kernel PGDs present in kaiser_init().
 +	 * We expect them to stay that way.
 +	 */
 +	BUG_ON(pgd_none(*pgd));
 +	/*
 +	 * A large mapping at PGD level would be 512GB or 128TB on all
 +	 * x86_64 configurations.  We don't handle these.
 +	 */
 +	BUG_ON(pgd_large(*pgd));
 +
 +	pud = pud_offset(pgd, vaddr);
 +	if (pud_none(*pud)) {
 +		WARN_ON_ONCE(1);
 +		return -1;
 +	}
 +
 +	if (pud_large(*pud))
 +		return (pud_pfn(*pud) << PAGE_SHIFT) | (vaddr & ~PUD_PAGE_MASK);
 +
 +	pmd = pmd_offset(pud, vaddr);
 +	if (pmd_none(*pmd)) {
 +		WARN_ON_ONCE(1);
 +		return -1;
 +	}
 +
 +	if (pmd_large(*pmd))
 +		return (pmd_pfn(*pmd) << PAGE_SHIFT) | (vaddr & ~PMD_PAGE_MASK);
 +
 +	pte = pte_offset_kernel(pmd, vaddr);
 +	if (pte_none(*pte)) {
 +		WARN_ON_ONCE(1);
 +		return -1;
 +	}
 +
 +	return (pte_pfn(*pte) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK);
 +}
 +
 +/*
 + * This is a relatively normal walk of the shadow page tables, except
 + * that it also tries to allocate page table pages along the way.
 + *
 + * Returns a pointer to a PTE on success, or NULL on failure.
 + */
 +static pte_t *kaiser_pagetable_walk(unsigned long address)
 +{
 +	pmd_t *pmd;
 +	pud_t *pud;
 +	pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(address));
 +	gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
 +
 +	if (pgd_none(*pgd)) {
 +		WARN_ONCE(1, "All shadow pgds should have been populated");
 +		return NULL;
 +	}
 +	BUILD_BUG_ON(pgd_large(*pgd) != 0);
 +
 +	pud = pud_offset(pgd, address);
 +	/* The shadow page tables do not use large mappings: */
 +	if (pud_large(*pud)) {
 +		WARN_ON(1);
 +		return NULL;
 +	}
 +	if (pud_none(*pud)) {
 +		unsigned long new_pmd_page = __get_free_page(gfp);
 +		if (!new_pmd_page)
 +			return NULL;
 +		spin_lock(&shadow_table_allocation_lock);
 +		if (pud_none(*pud)) {	/* recheck now that we hold the lock */
 +			set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page)));
 +			__inc_zone_page_state(virt_to_page((void *)
 +						new_pmd_page), NR_KAISERTABLE);
 +		} else	/* populated meanwhile: our page is not needed */
 +			free_page(new_pmd_page);
 +		spin_unlock(&shadow_table_allocation_lock);
 +	}
 +
 +	pmd = pmd_offset(pud, address);
 +	/* The shadow page tables do not use large mappings: */
 +	if (pmd_large(*pmd)) {
 +		WARN_ON(1);
 +		return NULL;
 +	}
 +	if (pmd_none(*pmd)) {
 +		unsigned long new_pte_page = __get_free_page(gfp);
 +		if (!new_pte_page)
 +			return NULL;
 +		spin_lock(&shadow_table_allocation_lock);
 +		if (pmd_none(*pmd)) {	/* recheck now that we hold the lock */
 +			set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page)));
 +			__inc_zone_page_state(virt_to_page((void *)
 +						new_pte_page), NR_KAISERTABLE);
 +		} else	/* populated meanwhile: our page is not needed */
 +			free_page(new_pte_page);
 +		spin_unlock(&shadow_table_allocation_lock);
 +	}
 +
 +	return pte_offset_kernel(pmd, address);
 +}
 +
 +int kaiser_add_user_map(const void *__start_addr, unsigned long size,
 +			unsigned long flags)
 +{
 +	int ret = 0;
 +	pte_t *pte;
 +	unsigned long start_addr = (unsigned long )__start_addr;
 +	unsigned long address = start_addr & PAGE_MASK;	/* round down */
 +	unsigned long end_addr = PAGE_ALIGN(start_addr + size);	/* round up */
 +	unsigned long target_address;
 +
 +	for (; address < end_addr; address += PAGE_SIZE) {
 +		target_address = get_pa_from_mapping(address);
 +		if (target_address == -1) {
 +			ret = -EIO;
 +			break;
 +		}
 +		pte = kaiser_pagetable_walk(address);
 +		if (!pte) {
 +			ret = -ENOMEM;
 +			break;
 +		}
 +		if (pte_none(*pte)) {
 +			set_pte(pte, __pte(flags | target_address));
 +		} else {	/* already mapped: only warn if it differs */
 +			pte_t tmp;
 +			set_pte(&tmp, __pte(flags | target_address));
 +			WARN_ON_ONCE(!pte_same(*pte, tmp));
 +		}
 +	}
 +	return ret;	/* 0, -EIO or -ENOMEM */
 +}
 +
 +static int kaiser_add_user_map_ptrs(const void *start, const void *end, unsigned long flags)
 +{
 +	unsigned long size = end - start;
 +
 +	return kaiser_add_user_map(start, size, flags);
 +}
 +
 +/*
 + * Ensure that the top level of the (shadow) page tables are
 + * entirely populated.  This ensures that all processes that get
 + * forked have the same entries.  This way, we do not have to
 + * ever go set up new entries in older processes.
 + *
 + * Note: we never free these, so there are no updates to them
 + * after this.
 + */
 +static void __init kaiser_init_all_pgds(void)
 +{
 +	pgd_t *pgd;
 +	int i = 0;
 +
 +	pgd = native_get_shadow_pgd(pgd_offset_k((unsigned long )0));
 +	for (i = PTRS_PER_PGD / 2; i < PTRS_PER_PGD; i++) {
 +		pud_t *pud;
 +
 +		/*
 +		 * Make sure not to stomp on some other pgd entry.  Check
 +		 * before allocating, so a skipped slot does not leak a pud.
 +		 */
 +		if (!pgd_none(pgd[i])) {
 +			WARN_ON(1);
 +			continue;
 +		}
 +		pud = pud_alloc_one(&init_mm, PAGE_OFFSET + i * PGDIR_SIZE);
 +		if (!pud) {
 +			WARN_ON(1);
 +			break;
 +		}
 +		inc_zone_page_state(virt_to_page(pud), NR_KAISERTABLE);
 +		set_pgd(pgd + i, __pgd(_KERNPG_TABLE | __pa(pud)));
 +	}
 +}
 +
 +#define kaiser_add_user_map_early(start, size, flags) do {	\
 +	int __ret = kaiser_add_user_map(start, size, flags);	\
 +	WARN_ON(__ret);						\
 +} while (0)
 +
 +#define kaiser_add_user_map_ptrs_early(start, end, flags) do {		\
 +	int __ret = kaiser_add_user_map_ptrs(start, end, flags);	\
 +	WARN_ON(__ret);							\
 +} while (0)
 +
 +/*
 + * If anything in here fails, we will likely die on one of the
 + * first kernel->user transitions and init will die.  But, we
 + * will have most of the kernel up by then and should be able to
 + * get a clean warning out of it.  If we BUG_ON() here, we run
 + * the risk of being before we have good console output.
 + */
 +void __init kaiser_init(void)
 +{
 +	int cpu;
 +
 +	kaiser_init_all_pgds();
 +
 +	for_each_possible_cpu(cpu) {
 +		void *percpu_vaddr = __per_cpu_user_mapped_start +
 +				     per_cpu_offset(cpu);
 +		unsigned long percpu_sz = __per_cpu_user_mapped_end -
 +					  __per_cpu_user_mapped_start;
 +		kaiser_add_user_map_early(percpu_vaddr, percpu_sz,
 +					  __PAGE_KERNEL);
 +	}
 +
 +	/*
 +	 * Map the entry/exit text section, which is needed when
 +	 * switching between user and kernel mode.
 +	 */
 +	kaiser_add_user_map_ptrs_early(__entry_text_start, __entry_text_end,
 +				       __PAGE_KERNEL_RX);
 +#ifdef CONFIG_FUNCTION_GRAPH_TRACER
 +	kaiser_add_user_map_ptrs_early(__irqentry_text_start,
 +				       __irqentry_text_end,
 +				       __PAGE_KERNEL_RX);
 +#endif
 +	kaiser_add_user_map_early((void *)idt_descr.address,
 +				  sizeof(gate_desc) * NR_VECTORS,
 +				  __PAGE_KERNEL_RO);
 +	kaiser_add_user_map_early(&x86_cr3_pcid_noflush,
 +				  sizeof(x86_cr3_pcid_noflush),
 +				  __PAGE_KERNEL);
 +}
 +
 +/* Map [addr, addr + size) into the shadow page tables with the given flags */
 +int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags)
 +{
 +	return kaiser_add_user_map((const void *)addr, size, flags);
 +}
 +
 +/* Clear the shadow PTE of every page touched by [start, start + size) */
 +void kaiser_remove_mapping(unsigned long start, unsigned long size)
 +{
 +	unsigned long end = PAGE_ALIGN(start + size);
 +	unsigned long addr;
 +
 +	for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) {
 +		pte_t *pte = kaiser_pagetable_walk(addr);
 +		if (pte)
 +			set_pte(pte, __pte(0));
 +	}
 +}
 +
 +/*
 + * Page table pages are page-aligned.  The lower half of the top
 + * level is used for userspace and the top half for the kernel.
 + * This returns true for user pages that need to get copied into
 + * both the user and kernel copies of the page tables, and false
 + * for kernel pages that should only be in the kernel copy.
 + */
 +static inline bool is_userspace_pgd(pgd_t *pgdp)
 +{
 +	return ((unsigned long)pgdp % PAGE_SIZE) < (PAGE_SIZE / 2);
 +}
 +
 +pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd)
 +{
 +	/*
 +	 * Do we need to also populate the shadow pgd?  Check _PAGE_USER to
 +	 * skip cases like kexec and EFI which make temporary low mappings.
 +	 */
 +	if (pgd.pgd & _PAGE_USER) {
 +		if (is_userspace_pgd(pgdp)) {
 +			native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
 +			/*
 +			 * Even if the entry is *mapping* userspace, ensure
 +			 * that userspace can not use it.  This way, if we
 +			 * get out to userspace running on the kernel CR3,
 +			 * userspace will crash instead of running.
 +			 */
 +			pgd.pgd |= _PAGE_NX;
 +		}
 +	} else if (!pgd.pgd) {
 +		/*
 +		 * pgd_clear() cannot check _PAGE_USER, and is even used to
 +		 * clear corrupted pgd entries: so just rely on cases like
 +		 * kexec and EFI never to be using pgd_clear().
 +		 */
 +		if (!WARN_ON_ONCE((unsigned long)pgdp & PAGE_SIZE) &&
 +		    is_userspace_pgd(pgdp))
 +			native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
 +	}
 +	return pgd;
 +}
 +
 +void kaiser_setup_pcid(void)
 +{
 +	unsigned long kern_cr3 = 0;
 +	unsigned long user_cr3 = KAISER_SHADOW_PGD_OFFSET;
 +
 +	if (this_cpu_has(X86_FEATURE_PCID)) {
 +		kern_cr3 |= X86_CR3_PCID_KERN_NOFLUSH;
 +		user_cr3 |= X86_CR3_PCID_USER_NOFLUSH;
 +	}
 +	/*
 +	 * These variables are used by the entry/exit
 +	 * code to change PCID and pgd and TLB flushing.
 +	 */
 +	x86_cr3_pcid_noflush = kern_cr3;
 +	this_cpu_write(x86_cr3_pcid_user, user_cr3);
 +}
 +
 +/*
 + * Make a note that this cpu will need to flush USER tlb on return to user.
 + * Caller checks whether this_cpu_has(X86_FEATURE_PCID) before calling:
 + * if cpu does not, then the NOFLUSH bit will never have been set.
 + */
 +void kaiser_flush_tlb_on_return_to_user(void)
 +{
 +	this_cpu_write(x86_cr3_pcid_user,
 +			X86_CR3_PCID_USER_FLUSH | KAISER_SHADOW_PGD_OFFSET);
 +}
 +EXPORT_SYMBOL(kaiser_flush_tlb_on_return_to_user);
 +#endif /* CONFIG_KAISER */
 diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
 index 8573b83a63d0..73285602c93f 100644
 --- a/arch/x86/mm/pgtable.c
 +++ b/arch/x86/mm/pgtable.c
 @@ -5,7 +5,7 @@
  #include <asm/tlb.h>
  #include <asm/fixmap.h>

 -#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO
 +#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO)

  #ifdef CONFIG_HIGHPTE
  #define PGALLOC_USER_GFP __GFP_HIGHMEM
 @@ -253,12 +253,35 @@ static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
  	}
  }

 +#ifdef CONFIG_KAISER
 +/*
 + * Instead of one pgd page, we acquire two.  Being order-1, it is
 + * both 8k in size and 8k-aligned.  That lets us just flip bit 12
 + * in a pointer to swap between the two 4k halves.
 + */
 +#define PGD_ALLOCATION_ORDER 1
 +#else
 +#define PGD_ALLOCATION_ORDER 0
 +#endif
 +
 +static inline pgd_t *_pgd_alloc(void)
 +{
 +	/* No __GFP_REPEAT: to avoid page allocation stalls in order-1 case */
 +	return (pgd_t *)__get_free_pages(PGALLOC_GFP & ~__GFP_REPEAT,
 +					 PGD_ALLOCATION_ORDER);
 +}
 +
 +static inline void _pgd_free(pgd_t *pgd)
 +{
 +	free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER);
 +}
 +
  pgd_t *pgd_alloc(struct mm_struct *mm)
  {
  	pgd_t *pgd;
  	pmd_t *pmds[PREALLOCATED_PMDS];

 -	pgd = (pgd_t *)__get_free_page(PGALLOC_GFP);
 +	pgd = _pgd_alloc();

  	if (pgd == NULL)
  		goto out;
 @@ -288,7 +311,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
  out_free_pmds:
  	free_pmds(pmds);
  out_free_pgd:
 -	free_page((unsigned long)pgd);
 +	_pgd_free(pgd);
  out:
  	return NULL;
  }
 @@ -298,7 +321,7 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd)
  	pgd_mop_up_pmds(mm, pgd);
  	pgd_dtor(pgd);
  	paravirt_pgd_free(mm, pgd);
 -	free_page((unsigned long)pgd);
 +	_pgd_free(pgd);
  }

  int ptep_set_access_flags(struct vm_area_struct *vma,
 diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
 index 4e4e6bc09e98..bd0ff4eb74a0 100644
 --- a/arch/x86/mm/tlb.c
 +++ b/arch/x86/mm/tlb.c
 @@ -12,10 +12,43 @@
  #include <asm/cache.h>
  #include <asm/apic.h>
  #include <asm/uv/uv.h>
 +#include <asm/kaiser.h>

  DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate)
  			= { &init_mm, 0, };

 +static void load_new_mm_cr3(pgd_t *pgdir)
 +{
 +	unsigned long new_mm_cr3 = __pa(pgdir);
 +
 +#ifdef CONFIG_KAISER
 +	if (this_cpu_has(X86_FEATURE_PCID)) {
 +		/*
 +		 * We reuse the same PCID for different tasks, so we must
 +		 * flush all the entries for the PCID out when we change tasks.
 +		 * Flush KERN below, flush USER when returning to userspace in
 +		 * kaiser's SWITCH_USER_CR3 (_SWITCH_TO_USER_CR3) macro.
 +		 *
 +		 * invpcid_flush_single_context(X86_CR3_PCID_ASID_USER) could
 +		 * do it here, but can only be used if X86_FEATURE_INVPCID is
 +		 * available - and many machines support pcid without invpcid.
 +		 *
 +		 * The line below is a no-op: X86_CR3_PCID_KERN_FLUSH is now 0;
 +		 * but keep that line in there in case something changes.
 +		 */
 +		new_mm_cr3 |= X86_CR3_PCID_KERN_FLUSH;
 +		kaiser_flush_tlb_on_return_to_user();
 +	}
 +#endif /* CONFIG_KAISER */
 +
 +	/*
 +	 * Caution: many callers of this function expect
 +	 * that load_new_mm_cr3() is serializing and orders TLB
 +	 * fills with respect to the mm_cpumask writes.
 +	 */
 +	write_cr3(new_mm_cr3);
 +}
 +
  /*
   *	TLB flushing, formerly SMP-only
   *		c/o Linus Torvalds.
 @@ -65,7 +98,7 @@ void leave_mm(int cpu)
  		BUG();
  	cpumask_clear_cpu(cpu,
  			  mm_cpumask(percpu_read(cpu_tlbstate.active_mm)));
 -	load_cr3(swapper_pg_dir);
 +	load_new_mm_cr3(swapper_pg_dir);
  }
  EXPORT_SYMBOL_GPL(leave_mm);

 @@ -113,11 +146,10 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
  		 * from next->pgd.  TLB fills are special and can happen
  		 * due to instruction fetches or for no reason at all,
  		 * and neither LOCK nor MFENCE orders them.
 -		 * Fortunately, load_cr3() is serializing and gives the
 -		 * ordering guarantee we need.
 -		 *
 +		 * Fortunately, load_new_mm_cr3() is serializing
 +		 * and gives the ordering guarantee we need.
  		 */
 -		load_cr3(next->pgd);
 +		load_new_mm_cr3(next->pgd);

  		/* stop flush ipis for the previous mm */
  		cpumask_clear_cpu(cpu, mm_cpumask(prev));
 @@ -136,10 +168,10 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
  			 * tlb flush IPI delivery. We must reload CR3
  			 * to make sure to use no freed page tables.
  			 *
 -			 * As above, load_cr3() is serializing and orders TLB
 -			 * fills with respect to the mm_cpumask write.
 +			 * As above, load_new_mm_cr3() is serializing and orders
 +			 * TLB fills with respect to the mm_cpumask write.
  			 */
 -			load_cr3(next->pgd);
 +			load_new_mm_cr3(next->pgd);
  			load_mm_ldt(next);
  		}
  	}
 diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
 index b5e2e4c6b017..01c8155dd613 100644
 --- a/include/asm-generic/vmlinux.lds.h
 +++ b/include/asm-generic/vmlinux.lds.h
 @@ -692,7 +692,14 @@
   */
  #define PERCPU_INPUT(cacheline)						\
  	VMLINUX_SYMBOL(__per_cpu_start) = .;				\
 +	VMLINUX_SYMBOL(__per_cpu_user_mapped_start) = .;		\
  	*(.data..percpu..first)						\
 +	. = ALIGN(cacheline);						\
 +	*(.data..percpu..user_mapped)					\
 +	*(.data..percpu..user_mapped..shared_aligned)			\
 +	. = ALIGN(PAGE_SIZE);						\
 +	*(.data..percpu..user_mapped..page_aligned)			\
 +	VMLINUX_SYMBOL(__per_cpu_user_mapped_end) = .;			\
  	. = ALIGN(PAGE_SIZE);						\
  	*(.data..percpu..page_aligned)					\
  	. = ALIGN(cacheline);						\
 diff --git a/include/linux/kaiser.h b/include/linux/kaiser.h
 new file mode 100644
 index 000000000000..4a4d6d911a14
 --- /dev/null
 +++ b/include/linux/kaiser.h
 @@ -0,0 +1,52 @@
 +#ifndef _LINUX_KAISER_H
 +#define _LINUX_KAISER_H
 +
 +#ifdef CONFIG_KAISER
 +#include <asm/kaiser.h>
 +
 +static inline int kaiser_map_thread_stack(void *stack)
 +{
 +	/*
 +	 * Map that page of kernel stack on which we enter from user context.
 +	 */
 +	return kaiser_add_mapping((unsigned long)stack +
 +			THREAD_SIZE - PAGE_SIZE, PAGE_SIZE, __PAGE_KERNEL);
 +}
 +
 +static inline void kaiser_unmap_thread_stack(void *stack)
 +{
 +	/*
 +	 * Note: may be called even when kaiser_map_thread_stack() failed.
 +	 */
 +	kaiser_remove_mapping((unsigned long)stack +
 +			THREAD_SIZE - PAGE_SIZE, PAGE_SIZE);
 +}
 +#else
 +
 +/*
 + * These stubs are used whenever CONFIG_KAISER is off, which
 + * includes architectures that support KAISER, but have it disabled.
 + */
 +
 +static inline void kaiser_init(void)
 +{
 +}
 +static inline int kaiser_add_mapping(unsigned long addr,
 +				     unsigned long size, unsigned long flags)
 +{
 +	return 0;
 +}
 +static inline void kaiser_remove_mapping(unsigned long start,
 +					 unsigned long size)
 +{
 +}
 +static inline int kaiser_map_thread_stack(void *stack)
 +{
 +	return 0;
 +}
 +static inline void kaiser_unmap_thread_stack(void *stack)
 +{
 +}
 +
 +#endif /* !CONFIG_KAISER */
 +#endif /* _LINUX_KAISER_H */
 diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
 index 25842b6e72e1..a0b4422a116a 100644
 --- a/include/linux/mmzone.h
 +++ b/include/linux/mmzone.h
 @@ -95,8 +95,9 @@ enum zone_stat_item {
  	NR_SLAB_RECLAIMABLE,
  	NR_SLAB_UNRECLAIMABLE,
  	NR_PAGETABLE,		/* used for pagetables */
 -	NR_KERNEL_STACK,
  	/* Second 128 byte cacheline */
 +	NR_KERNEL_STACK,
 +	NR_KAISERTABLE,
  	NR_UNSTABLE_NFS,	/* NFS unstable pages */
  	NR_BOUNCE,
  	NR_VMSCAN_WRITE,
 diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h
 index 27ef6b190ea6..56f5eeb78d1d 100644
 --- a/include/linux/percpu-defs.h
 +++ b/include/linux/percpu-defs.h
 @@ -28,6 +28,12 @@
  	(void)__vpp_verify;						\
  } while (0)

 +#ifdef CONFIG_KAISER
 +#define USER_MAPPED_SECTION "..user_mapped"
 +#else
 +#define USER_MAPPED_SECTION ""
 +#endif
 +
  /*
   * s390 and alpha modules require percpu variables to be defined as
   * weak to force the compiler to generate GOT based external
 @@ -90,6 +96,12 @@
  #define DEFINE_PER_CPU(type, name)					\
  	DEFINE_PER_CPU_SECTION(type, name, "")

 +#define DECLARE_PER_CPU_USER_MAPPED(type, name)				\
 +	DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION)
 +
 +#define DEFINE_PER_CPU_USER_MAPPED(type, name)				\
 +	DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION)
 +
  /*
   * Declaration/definition used for per-CPU variables that must come first in
   * the set of variables.
 @@ -119,6 +131,14 @@
  	DEFINE_PER_CPU_SECTION(type, name, PER_CPU_SHARED_ALIGNED_SECTION) \
  	____cacheline_aligned_in_smp

 +#define DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name)		\
 +	DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION PER_CPU_SHARED_ALIGNED_SECTION) \
 +	____cacheline_aligned_in_smp
 +
 +#define DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name)		\
 +	DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION PER_CPU_SHARED_ALIGNED_SECTION) \
 +	____cacheline_aligned_in_smp
 +
  #define DECLARE_PER_CPU_ALIGNED(type, name)				\
  	DECLARE_PER_CPU_SECTION(type, name, PER_CPU_ALIGNED_SECTION)	\
  	____cacheline_aligned
 @@ -137,11 +157,21 @@
  #define DEFINE_PER_CPU_PAGE_ALIGNED(type, name)				\
  	DEFINE_PER_CPU_SECTION(type, name, "..page_aligned")		\
  	__aligned(PAGE_SIZE)
 +/*
 + * Declaration/definition of page-aligned per-CPU variables mapped in user mode.
 + */
 +#define DECLARE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name)		\
 +	DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION"..page_aligned") \
 +	__aligned(PAGE_SIZE)
 +
 +#define DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name)		\
 +	DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION"..page_aligned") \
 +	__aligned(PAGE_SIZE)

  /*
   * Declaration/definition used for per-CPU variables that must be read mostly.
   */
 -#define DECLARE_PER_CPU_READ_MOSTLY(type, name)			\
 +#define DECLARE_PER_CPU_READ_MOSTLY(type, name)				\
  	DECLARE_PER_CPU_SECTION(type, name, "..readmostly")

  #define DEFINE_PER_CPU_READ_MOSTLY(type, name)				\
 diff --git a/init/main.c b/init/main.c
 index e937d9bda0f8..558a9fdd566d 100644
 --- a/init/main.c
 +++ b/init/main.c
 @@ -69,6 +69,7 @@
  #include <linux/slab.h>
  #include <linux/perf_event.h>
  #include <linux/random.h>
 +#include <linux/kaiser.h>

  #include <asm/io.h>
  #include <asm/bugs.h>
 @@ -463,6 +464,7 @@ static void __init mm_init(void)
  	percpu_init_late();
  	pgtable_cache_init();
  	vmalloc_init();
 +	kaiser_init();
  }

  asmlinkage void __init start_kernel(void)
 diff --git a/kernel/fork.c b/kernel/fork.c
 index 29b460431c12..511131a15a75 100644
 --- a/kernel/fork.c
 +++ b/kernel/fork.c
 @@ -55,6 +55,7 @@
  #include <linux/tsacct_kern.h>
  #include <linux/cn_proc.h>
  #include <linux/freezer.h>
 +#include <linux/kaiser.h>
  #include <linux/delayacct.h>
  #include <linux/taskstats_kern.h>
  #include <linux/random.h>
 @@ -133,6 +134,7 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,

  static inline void free_thread_info(struct thread_info *ti)
  {
 +	kaiser_unmap_thread_stack(ti);
  	free_pages((unsigned long)ti, THREAD_SIZE_ORDER);
  }
  #endif
 @@ -275,6 +277,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)

  	tsk->stack = ti;

 +	err = kaiser_map_thread_stack(tsk->stack);
 +	if (err)
 +		goto out;
 +
  	setup_thread_stack(tsk, orig);
  	clear_user_return_notifier(tsk);
  	clear_tsk_need_resched(tsk);
 diff --git a/mm/vmstat.c b/mm/vmstat.c
 index ff9060919c4b..eaf3db038652 100644
 --- a/mm/vmstat.c
 +++ b/mm/vmstat.c
 @@ -699,6 +699,7 @@ const char * const vmstat_text[] = {
  	"nr_slab_unreclaimable",
  	"nr_page_table_pages",
  	"nr_kernel_stack",
 +	"nr_overhead",
  	"nr_unstable",
  	"nr_bounce",
  	"nr_vmscan_write",
 diff --git a/security/Kconfig b/security/Kconfig
 index 51bd5a0b69ae..19f83193e7ab 100644
 --- a/security/Kconfig
 +++ b/security/Kconfig
 @@ -96,6 +96,16 @@ config SECURITY

  	  If you are unsure how to answer this question, answer N.

 +config KAISER
 +	bool "Remove the kernel mapping in user mode"
 +	default y
 +	depends on X86_64 && SMP && !PARAVIRT
 +	help
 +	  This enforces a strict kernel and user space isolation, in order
 +	  to close hardware side channels on kernel address information.
 +
 +	  If you are unsure how to answer this question, answer Y.
 +
  config SECURITYFS
  	bool "Enable the securityfs filesystem"
  	help