kaiser-20171025004
diff --git a/Documentation/x86/kaiser.txt b/Documentation/x86/kaiser.txt
new file mode 100644
index 0000000..9554fb2
--- /dev/null
+++ b/Documentation/x86/kaiser.txt
@@ -0,0 +1,85 @@
+KAISER is a countermeasure against side-channel attacks on
+kernel virtual memory.  It leaves alone the page tables that
+exist for each process and refers to them as the "kernel page
+tables".  But, it adds a second set of "shadow" (aka. user) page
+tables which are active when running userspace.
+
+Whenever we enter the kernel (syscalls, interrupts, exceptions),
+the page tables are switched to the "kernel" copy, but when the
+system switches back to user mode, the user/shadow copy is used.
+
+The minimalistic kernel portion of the user page tables tries to
+map only what is needed to enter/exit the kernel, such as the
+entry/exit functions themselves and the interrupt descriptor
+table (IDT).
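+
+As a rough sketch (the real implementation is the assembly
+added to arch/x86/entry/calling.h by this patch, which also
+toggles the user PCID bit when PCIDs are in use), the switch
+boils down to flipping a single bit of CR3 to select either
+the kernel or the user/shadow copy of the top-level page table:
+
+	/* sketch only -- enter the kernel: */
+	unsigned long cr3 = __read_cr3();
+	write_cr3(cr3 & ~(1UL << PAGE_SHIFT));	/* kernel copy */
+	/* ... kernel work ... */
+	write_cr3(cr3 | (1UL << PAGE_SHIFT));	/* user/shadow copy */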
+
+This helps ensure that side-channel attacks that leverage the
+paging structures do not function when KAISER is enabled.  It
+can be enabled by setting CONFIG_KAISER=y.
+
+Protection against side-channel attacks is important.  But,
+this protection comes at a cost:
+
+1. Increased Memory Use
+  a. Statically-allocated structures must be padded out to 4k
+     (or 8k for PGDs) so they can be mapped into the user page
+     tables.
+  b. Each task now needs an order-1 PGD instead of order-0.
+  c. The shadow page tables eventually grow to map all of the used
+     vmalloc() space.  They can have roughly the same memory
+     consumption as the vmalloc() page tables.
+
+2. Runtime Cost
+  a. CR3 manipulation to switch between the page table copies
+     must be done at interrupt, syscall, and exception entry
+     and exit (it can be skipped when the kernel is interrupted,
+     though.)  Moves to CR3 are on the order of a hundred
+     cycles, and we need one at entry and another at exit.
+  b. Task stacks must be mapped/unmapped.  We need to walk
+     and modify the shadow page tables at fork() and exit().
+  c. Global pages are disabled.  This feature of the MMU
+     allows different processes to share TLB entries mapping
+     the kernel.  Losing the feature means potentially more
+     TLB misses after a context switch.
+  d. Process Context IDentifiers (PCID) is a CPU feature that
+     allows us to skip flushing the entire TLB when we switch
+     the page tables.  This makes switching the page tables
+     (at context switch, or kernel entry/exit) cheaper.  But,
+     on systems with PCID support, the context switch code
+     must flush both the user and kernel entries out of the
+     TLB, with an INVPCID in addition to the CR3 write.  This
+     INVPCID is generally slower than a CR3 write, but still
+     on the order of a hundred cycles.
+  e. The shadow page tables must be populated for each new
+     process.  Even without KAISER, since we share all of the
+     kernel mappings in all processes, we can do all this
+     population for kernel addresses at the top level of the
+     page tables (the PGD level).  But, with KAISER, we now
+     have *two* kernel mappings: one in the kernel page tables
+     that maps everything and one in the user/shadow page
+     tables mapping the "minimal" kernel.  At fork(), we
+     copy the portion of the shadow PGD that maps the minimal
+     kernel structures in addition to the normal kernel one.
+  f. In addition to the fork()-time copying, we must also
+     update the shadow PGD any time a set_pgd() is done on a
+     PGD used to map userspace.  This ensures that the kernel
+     and user/shadow copies always map the same userspace
+     memory.  (See the sketch after this list.)
+  g. On systems without PCID support, each CR3 write flushes
+     the entire TLB.  That means that each syscall, interrupt
+     or exception flushes the TLB.
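+
+In code terms, the set_pgd() side of items e and f above looks
+roughly like the sketch below; the real logic is
+kaiser_set_shadow_pgd() in arch/x86/include/asm/pgtable_64.h as
+added by this patch:
+
+	if (pgd_userspace_access(pgd) && pgdp_maps_userspace(pgdp)) {
+		/* The user/shadow half gets the real, user-visible entry: */
+		native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
+		/* Poison the kernel copy so a wrong CR3 crashes userspace: */
+		pgd.pgd |= _PAGE_NX;
+	}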
+
+Future work:
+1. We can be more careful about not actually writing to CR3
+   unless we actually switch it (sketched at the end of this file).
+2. Try to have dedicated entry/exit kernel stacks so we do
+   not have to map/unmap the task/thread stacks.
+3. Compress the user/shadow-mapped data to be mapped together
+   underneath a single PGD entry.
+4. Re-enable global pages, but use them for mappings in the
+   user/shadow page tables.  This would allow the kernel to
+   take advantage of TLB entries that were established from
+   the user page tables.  This might speed up the entry/exit
+   code or userspace since it will not have to reload all of
+   its TLB entries.  However, the upside is limited when PCID
+   is in use.
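+
+For item 1, the test would look roughly like what
+SAVE_AND_SWITCH_TO_KERNEL_CR3 already does (a sketch, not part
+of this patch):
+
+	unsigned long cr3 = __read_cr3();
+
+	/* Already using the kernel copy (and kernel PCID)?  Skip the write. */
+	if (!(cr3 & KAISER_SWITCH_MASK))
+		return;
+	write_cr3(cr3 & ~KAISER_SWITCH_MASK);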
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 971feac..66ec43d 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -327,6 +327,10 @@
 config FIX_EARLYCON_MEM
 	def_bool y
 
+config X86_GLOBAL_PAGES
+	def_bool y
+	depends on !KAISER
+
 config PGTABLE_LEVELS
 	int
 	default 5 if X86_5LEVEL
diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 640aafe..d1b97dd 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -1,5 +1,8 @@
 #include <linux/jump_label.h>
 #include <asm/unwind_hints.h>
+#include <asm/cpufeatures.h>
+#include <asm/page_types.h>
+#include <asm/pgtable_types.h>
 
 /*
 
@@ -217,6 +220,73 @@
 #endif
 .endm
 
+#ifdef CONFIG_KAISER
+
+/* KAISER PGDs are 8k.  We flip bit 12 to switch between the two halves: */
+#define KAISER_SWITCH_PGTABLES_MASK (1<<PAGE_SHIFT)
+#define KAISER_SWITCH_MASK     (KAISER_SWITCH_PGTABLES_MASK|\
+				(1<<X86_CR3_KAISER_SWITCH_BIT))
+
+.macro ADJUST_KERNEL_CR3 reg:req
+	ALTERNATIVE "", "bts $63, \reg", X86_FEATURE_PCID
+	/* Clear PCID and "KAISER bit", point CR3 at kernel pagetables: */
+	andq    $(~KAISER_SWITCH_MASK), \reg
+.endm
+
+.macro ADJUST_USER_CR3 reg:req
+	ALTERNATIVE "", "bts $63, \reg", X86_FEATURE_PCID
+	/* Set user PCID bit, and move CR3 up a page to the user page tables: */
+	orq     $(KAISER_SWITCH_MASK), \reg
+.endm
+
+.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
+	mov	%cr3, \scratch_reg
+	ADJUST_KERNEL_CR3 \scratch_reg
+	mov	\scratch_reg, %cr3
+.endm
+
+.macro SWITCH_TO_USER_CR3 scratch_reg:req
+	mov	%cr3, \scratch_reg
+	ADJUST_USER_CR3 \scratch_reg
+	mov	\scratch_reg, %cr3
+.endm
+
+.macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req
+	movq	%cr3, %r\scratch_reg
+	movq	%r\scratch_reg, \save_reg
+	/*
+	 * Is the "switch mask" all zero?  That means that both
+	 * the user/kernel PCID bit, and the user/kernel "bit"
+	 * that points CR3 to the top or bottom of the 8k PGD
+	 * are 0.  That's a kernel CR3 value, not user/shadow.
+	 */
+	testl	$(KAISER_SWITCH_MASK), %e\scratch_reg
+	jz	.Ldone_\@
+
+	ADJUST_KERNEL_CR3 %r\scratch_reg
+	movq	%r\scratch_reg, %cr3
+
+.Ldone_\@:
+.endm
+
+.macro RESTORE_CR3 save_reg:req
+	/* optimize this */
+	movq	\save_reg, %cr3
+.endm
+
+#else /* CONFIG_KAISER=n: */
+
+.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
+.endm
+.macro SWITCH_TO_USER_CR3 scratch_reg:req
+.endm
+.macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req
+.endm
+.macro RESTORE_CR3 save_reg:req
+.endm
+
+#endif
+
 #endif /* CONFIG_X86_64 */
 
 /*
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 4916725..e3e0a59 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -147,8 +147,6 @@
 	movq	%rsp, PER_CPU_VAR(rsp_scratch)
 	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
 
-	TRACE_IRQS_OFF
-
 	/* Construct struct pt_regs on stack */
 	pushq	$__USER_DS			/* pt_regs->ss */
 	pushq	PER_CPU_VAR(rsp_scratch)	/* pt_regs->sp */
@@ -169,6 +167,13 @@
 	sub	$(6*8), %rsp			/* pt_regs->bp, bx, r12-15 not saved */
 	UNWIND_HINT_REGS extra=0
 
+	/* NB: right here, all regs except r11 are live. */
+
+	SWITCH_TO_KERNEL_CR3 scratch_reg=%r11
+
+	/* Must wait until we have the kernel CR3 to call C functions: */
+	TRACE_IRQS_OFF
+
 	/*
 	 * If we need to do entry work or if we guess we'll need to do
 	 * exit work, go straight to the slow path.
@@ -220,6 +225,7 @@
 	TRACE_IRQS_ON		/* user mode is traced as IRQs on */
 	movq	RIP(%rsp), %rcx
 	movq	EFLAGS(%rsp), %r11
+	SWITCH_TO_USER_CR3 scratch_reg=%rdi
 	RESTORE_C_REGS_EXCEPT_RCX_R11
 	movq	RSP(%rsp), %rsp
 	UNWIND_HINT_EMPTY
@@ -313,6 +319,7 @@
 	 * perf profiles. Nothing jumps here.
 	 */
 syscall_return_via_sysret:
+	SWITCH_TO_USER_CR3 scratch_reg=%rdi
 	/* rcx and r11 are already restored (see code above) */
 	RESTORE_C_REGS_EXCEPT_RCX_R11
 	movq	RSP(%rsp), %rsp
@@ -320,6 +327,7 @@
 	USERGS_SYSRET64
 
 opportunistic_sysret_failed:
+	SWITCH_TO_USER_CR3 scratch_reg=%rdi
 	SWAPGS
 	jmp	restore_c_regs_and_iret
 END(entry_SYSCALL_64)
@@ -422,6 +430,7 @@
 	movq	%rsp, %rdi
 	call	syscall_return_slowpath	/* returns with IRQs disabled */
 	TRACE_IRQS_ON			/* user mode is traced as IRQS on */
+	SWITCH_TO_USER_CR3 scratch_reg=%rdi
 	SWAPGS
 	jmp	restore_regs_and_iret
 
@@ -566,6 +575,7 @@
 	 * tracking that we're in kernel mode.
 	 */
 	SWAPGS
+	SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
 
 	/*
 	 * We need to tell lockdep that IRQs are off.  We can't do this until
@@ -611,6 +621,7 @@
 	mov	%rsp,%rdi
 	call	prepare_exit_to_usermode
 	TRACE_IRQS_IRETQ
+	SWITCH_TO_USER_CR3 scratch_reg=%rdi
 	SWAPGS
 	jmp	restore_regs_and_iret
 
@@ -1091,7 +1102,11 @@
 	js	1f				/* negative -> in kernel */
 	SWAPGS
 	xorl	%ebx, %ebx
-1:	ret
+
+1:
+	SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=ax save_reg=%r14
+
+	ret
 END(paranoid_entry)
 
 /*
@@ -1118,6 +1133,7 @@
 paranoid_exit_no_swapgs:
 	TRACE_IRQS_IRETQ_DEBUG
 paranoid_exit_restore:
+	RESTORE_CR3	%r14
 	RESTORE_EXTRA_REGS
 	RESTORE_C_REGS
 	REMOVE_PT_GPREGS_FROM_STACK 8
@@ -1144,6 +1160,9 @@
 	 */
 	SWAPGS
 
+	/* We have user CR3.  Change to kernel CR3. */
+	SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
+
 .Lerror_entry_from_usermode_after_swapgs:
 	/*
 	 * We need to tell lockdep that IRQs are off.  We can't do this until
@@ -1190,9 +1209,10 @@
 
 .Lerror_bad_iret:
 	/*
-	 * We came from an IRET to user mode, so we have user gsbase.
-	 * Switch to kernel gsbase:
+	 * We came from an IRET to user mode, so we have user
+	 * gsbase and CR3.  Switch to kernel gsbase and CR3:
 	 */
+	SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
 	SWAPGS
 
 	/*
@@ -1313,6 +1333,7 @@
 	UNWIND_HINT_REGS
 	ENCODE_FRAME_POINTER
 
+	SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi
 	/*
 	 * At this point we no longer need to worry about stack damage
 	 * due to nesting -- we're on the normal thread stack and we're
@@ -1328,6 +1349,7 @@
 	 * work, because we don't want to enable interrupts.
 	 */
 	SWAPGS
+	SWITCH_TO_USER_CR3 scratch_reg=%rdi
 	jmp	restore_regs_and_iret
 
 .Lnmi_from_kernel:
@@ -1538,6 +1560,8 @@
 	movq	$-1, %rsi
 	call	do_nmi
 
+	RESTORE_CR3 save_reg=%r14
+
 	testl	%ebx, %ebx			/* swapgs needed? */
 	jnz	nmi_restore
 nmi_swapgs:
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index e26c25c..9558206 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -48,8 +48,13 @@
 ENTRY(entry_SYSENTER_compat)
 	/* Interrupts are off on entry. */
 	SWAPGS_UNSAFE_STACK
+
 	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
 
+	pushq	%rdi
+	SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi
+	popq	%rdi
+
 	/*
 	 * User tracing code (ptrace or signal handlers) might assume that
 	 * the saved RAX contains a 32-bit number when we're invoking a 32-bit
@@ -91,6 +96,9 @@
 	pushq   $0			/* pt_regs->r15 = 0 */
 	cld
 
+	pushq	%rdi
+	SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi
+	popq	%rdi
 	/*
 	 * SYSENTER doesn't filter flags, so we need to clear NT and AC
 	 * ourselves.  To save a few cycles, we can check whether
@@ -214,6 +222,8 @@
 	pushq   $0			/* pt_regs->r14 = 0 */
 	pushq   $0			/* pt_regs->r15 = 0 */
 
+	SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi
+
 	/*
 	 * User mode is traced as though IRQs are on, and SYSENTER
 	 * turned them off.
@@ -240,6 +250,7 @@
 	popq	%rsi			/* pt_regs->si */
 	popq	%rdi			/* pt_regs->di */
 
+	SWITCH_TO_USER_CR3 scratch_reg=%r8
         /*
          * USERGS_SYSRET32 does:
          *  GSBASE = user's GS base
@@ -324,6 +335,7 @@
 	pushq   %r15                    /* pt_regs->r15 */
 	cld
 
+	SWITCH_TO_KERNEL_CR3 scratch_reg=%r11
 	/*
 	 * User mode is traced as though IRQs are on, and the interrupt
 	 * gate turned them off.
@@ -337,6 +349,7 @@
 	/* Go back to user mode. */
 	TRACE_IRQS_ON
 	SWAPGS
+	SWITCH_TO_USER_CR3 scratch_reg=%r11
 	jmp	restore_regs_and_iret
 END(entry_INT80_compat)
 
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index e1965e5..1449595 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -2,11 +2,15 @@
 #include <linux/types.h>
 #include <linux/slab.h>
 
+#include <asm/kaiser.h>
 #include <asm/perf_event.h>
 #include <asm/insn.h>
 
 #include "../perf_event.h"
 
+static
+DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct debug_store, cpu_debug_store);
+
 /* The size of a BTS record in bytes: */
 #define BTS_RECORD_SIZE		24
 
@@ -278,6 +282,39 @@
 
 static DEFINE_PER_CPU(void *, insn_buffer);
 
+static void *dsalloc(size_t size, gfp_t flags, int node)
+{
+#ifdef CONFIG_KAISER
+	unsigned int order = get_order(size);
+	struct page *page;
+	unsigned long addr;
+
+	page = __alloc_pages_node(node, flags | __GFP_ZERO, order);
+	if (!page)
+		return NULL;
+	addr = (unsigned long)page_address(page);
+	if (kaiser_add_mapping(addr, size, __PAGE_KERNEL) < 0) {
+		__free_pages(page, order);
+		addr = 0;
+	}
+	return (void *)addr;
+#else
+	return kmalloc_node(size, flags | __GFP_ZERO, node);
+#endif
+}
+
+static void dsfree(const void *buffer, size_t size)
+{
+#ifdef CONFIG_KAISER
+	if (!buffer)
+		return;
+	kaiser_remove_mapping((unsigned long)buffer, size);
+	free_pages((unsigned long)buffer, get_order(size));
+#else
+	kfree(buffer);
+#endif
+}
+
 static int alloc_pebs_buffer(int cpu)
 {
 	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
@@ -288,7 +325,7 @@
 	if (!x86_pmu.pebs)
 		return 0;
 
-	buffer = kzalloc_node(x86_pmu.pebs_buffer_size, GFP_KERNEL, node);
+	buffer = dsalloc(x86_pmu.pebs_buffer_size, GFP_KERNEL, node);
 	if (unlikely(!buffer))
 		return -ENOMEM;
 
@@ -299,7 +336,7 @@
 	if (x86_pmu.intel_cap.pebs_format < 2) {
 		ibuffer = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node);
 		if (!ibuffer) {
-			kfree(buffer);
+			dsfree(buffer, x86_pmu.pebs_buffer_size);
 			return -ENOMEM;
 		}
 		per_cpu(insn_buffer, cpu) = ibuffer;
@@ -325,7 +362,8 @@
 	kfree(per_cpu(insn_buffer, cpu));
 	per_cpu(insn_buffer, cpu) = NULL;
 
-	kfree((void *)(unsigned long)ds->pebs_buffer_base);
+	dsfree((void *)(unsigned long)ds->pebs_buffer_base,
+			x86_pmu.pebs_buffer_size);
 	ds->pebs_buffer_base = 0;
 }
 
@@ -339,7 +377,7 @@
 	if (!x86_pmu.bts)
 		return 0;
 
-	buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node);
+	buffer = dsalloc(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node);
 	if (unlikely(!buffer)) {
 		WARN_ONCE(1, "%s: BTS buffer allocation failure\n", __func__);
 		return -ENOMEM;
@@ -365,19 +403,15 @@
 	if (!ds || !x86_pmu.bts)
 		return;
 
-	kfree((void *)(unsigned long)ds->bts_buffer_base);
+	dsfree((void *)(unsigned long)ds->bts_buffer_base, BTS_BUFFER_SIZE);
 	ds->bts_buffer_base = 0;
 }
 
 static int alloc_ds_buffer(int cpu)
 {
-	int node = cpu_to_node(cpu);
-	struct debug_store *ds;
+	struct debug_store *ds = per_cpu_ptr(&cpu_debug_store, cpu);
 
-	ds = kzalloc_node(sizeof(*ds), GFP_KERNEL, node);
-	if (unlikely(!ds))
-		return -ENOMEM;
-
+	memset(ds, 0, sizeof(*ds));
 	per_cpu(cpu_hw_events, cpu).ds = ds;
 
 	return 0;
@@ -391,7 +425,6 @@
 		return;
 
 	per_cpu(cpu_hw_events, cpu).ds = NULL;
-	kfree(ds);
 }
 
 void release_ds_buffers(void)
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 2519c6c..35d4593 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -193,6 +193,7 @@
 #define X86_FEATURE_CAT_L3	( 7*32+ 4) /* Cache Allocation Technology L3 */
 #define X86_FEATURE_CAT_L2	( 7*32+ 5) /* Cache Allocation Technology L2 */
 #define X86_FEATURE_CDP_L3	( 7*32+ 6) /* Code and Data Prioritization L3 */
+#define X86_FEATURE_INVPCID_SINGLE ( 7*32+ 7) /* Effectively INVPCID && CR4.PCIDE=1 */
 
 #define X86_FEATURE_HW_PSTATE	( 7*32+ 8) /* AMD HW-PState */
 #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index 6dfe366..5134d08 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -160,7 +160,7 @@
 #define VECTOR_RETRIGGERED	((void *)~0UL)
 
 typedef struct irq_desc* vector_irq_t[NR_VECTORS];
-DECLARE_PER_CPU(vector_irq_t, vector_irq);
+DECLARE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq);
 
 #endif /* !ASSEMBLY_ */
 
diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h
new file mode 100644
index 0000000..d17f553
--- /dev/null
+++ b/arch/x86/include/asm/kaiser.h
@@ -0,0 +1,57 @@
+#ifndef _ASM_X86_KAISER_H
+#define _ASM_X86_KAISER_H
+/*
+ * Copyright(c) 2017 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * Based on work published here: https://github.com/IAIK/KAISER
+ * Modified by Dave Hansen <dave.hansen@intel.com> to actually work.
+ */
+#ifndef __ASSEMBLY__
+
+#ifdef CONFIG_KAISER
+/**
+ *  kaiser_add_mapping - map a virtual memory part to the shadow (user) mapping
+ *  @addr: the start address of the range
+ *  @size: the size of the range
+ *  @flags: The mapping flags of the pages
+ *
+ *  The mapping is done at a global scope, so no additional
+ *  synchronization has to be done.  The pages have to be
+ *  manually unmapped again when they are no longer needed.
+ */
+extern int kaiser_add_mapping(unsigned long addr, unsigned long size,
+			      unsigned long flags);
+
+/**
+ *  kaiser_remove_mapping - unmap a virtual memory part of the shadow mapping
+ *  @start: the start address of the range
+ *  @size: the size of the range
+ */
+extern void kaiser_remove_mapping(unsigned long start, unsigned long size);
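+
+/*
+ * Typical usage (a sketch mirroring the dsalloc()/dsfree() helpers that
+ * this patch adds to arch/x86/events/intel/ds.c): map a kernel buffer
+ * that must be visible on kernel entry, and unmap it before freeing:
+ *
+ *	if (kaiser_add_mapping((unsigned long)addr, size, __PAGE_KERNEL) < 0)
+ *		goto out_free;
+ *	...
+ *	kaiser_remove_mapping((unsigned long)addr, size);
+ */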
+
+/**
+ *  kaiser_init - Initialize the shadow mapping
+ *
+ *  Most parts of the shadow mapping can be mapped upon boot
+ *  time.  Only per-process things like the thread stacks
+ *  or a new LDT have to be mapped at runtime.  These boot-
+ *  time mappings are permanent and never unmapped.
+ */
+extern void kaiser_init(void);
+
+void kaiser_check_user_mapped(const void *addr);
+
+#endif
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* _ASM_X86_KAISER_H */
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 3c856a1..bec7cb9 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -281,33 +281,6 @@
 }
 
 /*
- * If PCID is on, ASID-aware code paths put the ASID+1 into the PCID
- * bits.  This serves two purposes.  It prevents a nasty situation in
- * which PCID-unaware code saves CR3, loads some other value (with PCID
- * == 0), and then restores CR3, thus corrupting the TLB for ASID 0 if
- * the saved ASID was nonzero.  It also means that any bugs involving
- * loading a PCID-enabled CR3 with CR4.PCIDE off will trigger
- * deterministically.
- */
-
-static inline unsigned long build_cr3(struct mm_struct *mm, u16 asid)
-{
-	if (static_cpu_has(X86_FEATURE_PCID)) {
-		VM_WARN_ON_ONCE(asid > 4094);
-		return __sme_pa(mm->pgd) | (asid + 1);
-	} else {
-		VM_WARN_ON_ONCE(asid != 0);
-		return __sme_pa(mm->pgd);
-	}
-}
-
-static inline unsigned long build_cr3_noflush(struct mm_struct *mm, u16 asid)
-{
-	VM_WARN_ON_ONCE(asid > 4094);
-	return __sme_pa(mm->pgd) | (asid + 1) | CR3_NOFLUSH;
-}
-
-/*
  * This can be used from process context to figure out what the value of
  * CR3 is without needing to do a (slow) __read_cr3().
  *
@@ -316,7 +289,7 @@
  */
 static inline unsigned long __get_current_cr3_fast(void)
 {
-	unsigned long cr3 = build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm),
+	unsigned long cr3 = build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd,
 		this_cpu_read(cpu_tlbstate.loaded_mm_asid));
 
 	/* For now, be very restrictive about when this can be called. */
diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h
index b2d0cd8..9d5f54b 100644
--- a/arch/x86/include/asm/pgalloc.h
+++ b/arch/x86/include/asm/pgalloc.h
@@ -61,20 +61,36 @@
 	___pte_free_tlb(tlb, pte);
 }
 
+/*
+ * _KERNPG_TABLE has _PAGE_USER clear which tells the KAISER code
+ * that this mapping is for kernel use only.  That makes sure that
+ * we leave the mapping usable by the kernel and do not try to
+ * sabotage it by doing stuff like setting _PAGE_NX on it.
+ */
+static inline pteval_t mm_pgtable_flags(struct mm_struct *mm)
+{
+	if (!mm || (mm == &init_mm))
+		return _KERNPG_TABLE;
+	return _PAGE_TABLE;
+}
+
 static inline void pmd_populate_kernel(struct mm_struct *mm,
 				       pmd_t *pmd, pte_t *pte)
 {
+	pteval_t pgtable_flags = mm_pgtable_flags(mm);
+
 	paravirt_alloc_pte(mm, __pa(pte) >> PAGE_SHIFT);
-	set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE));
+	set_pmd(pmd, __pmd(__pa(pte) | pgtable_flags));
 }
 
 static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
 				struct page *pte)
 {
+	pteval_t pgtable_flags = mm_pgtable_flags(mm);
 	unsigned long pfn = page_to_pfn(pte);
 
 	paravirt_alloc_pte(mm, pfn);
-	set_pmd(pmd, __pmd(((pteval_t)pfn << PAGE_SHIFT) | _PAGE_TABLE));
+	set_pmd(pmd, __pmd(((pteval_t)pfn << PAGE_SHIFT) | pgtable_flags));
 }
 
 #define pmd_pgtable(pmd) pmd_page(pmd)
@@ -117,16 +133,20 @@
 #else	/* !CONFIG_X86_PAE */
 static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
 {
+	pteval_t pgtable_flags = mm_pgtable_flags(mm);
+
 	paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);
-	set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd)));
+	set_pud(pud, __pud(__pa(pmd) | pgtable_flags));
 }
 #endif	/* CONFIG_X86_PAE */
 
 #if CONFIG_PGTABLE_LEVELS > 3
 static inline void p4d_populate(struct mm_struct *mm, p4d_t *p4d, pud_t *pud)
 {
+	pteval_t pgtable_flags = mm_pgtable_flags(mm);
+
 	paravirt_alloc_pud(mm, __pa(pud) >> PAGE_SHIFT);
-	set_p4d(p4d, __p4d(_PAGE_TABLE | __pa(pud)));
+	set_p4d(p4d, __p4d(__pa(pud) | pgtable_flags));
 }
 
 static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
@@ -155,8 +175,10 @@
 #if CONFIG_PGTABLE_LEVELS > 4
 static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, p4d_t *p4d)
 {
+	pteval_t pgtable_flags = mm_pgtable_flags(mm);
+
 	paravirt_alloc_p4d(mm, __pa(p4d) >> PAGE_SHIFT);
-	set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(p4d)));
+	set_pgd(pgd, __pgd(__pa(p4d) | pgtable_flags));
 }
 
 static inline p4d_t *p4d_alloc_one(struct mm_struct *mm, unsigned long addr)
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index b714934..c149fdf 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -845,7 +845,12 @@
 
 static inline int p4d_bad(p4d_t p4d)
 {
-	return (p4d_flags(p4d) & ~(_KERNPG_TABLE | _PAGE_USER)) != 0;
+	unsigned long ignore_flags = _KERNPG_TABLE | _PAGE_USER;
+
+	if (IS_ENABLED(CONFIG_KAISER))
+		ignore_flags |= _PAGE_NX;
+
+	return (p4d_flags(p4d) & ~ignore_flags) != 0;
 }
 #endif  /* CONFIG_PGTABLE_LEVELS > 3 */
 
@@ -879,7 +884,12 @@
 
 static inline int pgd_bad(pgd_t pgd)
 {
-	return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE;
+	unsigned long ignore_flags = _PAGE_USER;
+
+	if (IS_ENABLED(CONFIG_KAISER))
+		ignore_flags |= _PAGE_NX;
+
+	return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE;
 }
 
 static inline int pgd_none(pgd_t pgd)
@@ -1105,6 +1115,12 @@
 static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
 {
        memcpy(dst, src, count * sizeof(pgd_t));
+#ifdef CONFIG_KAISER
+	/* Clone the shadow pgd part as well */
+	memcpy(native_get_shadow_pgd(dst),
+	       native_get_shadow_pgd(src),
+	       count * sizeof(pgd_t));
+#endif
 }
 
 #define PTE_SHIFT ilog2(PTRS_PER_PTE)
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 972a469..4c1fe00 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -130,9 +130,134 @@
 #endif
 }
 
+#ifdef CONFIG_KAISER
+/*
+ * All top-level KAISER page tables are order-1 pages (8k-aligned
+ * and 8k in size).  The kernel one occupies the first 4k and
+ * the user (shadow) one the last 4k.  To switch between them,
+ * you just need to flip bit 12 (PAGE_SHIFT) in their addresses.
+ */
+static inline void *ptr_set_bit(void *ptr, int bit)
+{
+	unsigned long __ptr = (unsigned long)ptr;
+	__ptr |= (1<<bit);
+	return (void *)__ptr;
+}
+static inline void *ptr_clear_bit(void *ptr, int bit)
+{
+	unsigned long __ptr = (unsigned long)ptr;
+	__ptr &= ~(1<<bit);
+	return (void *)__ptr;
+}
+static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp)
+{
+	return ptr_set_bit(pgdp, PAGE_SHIFT);
+}
+static inline pgd_t *native_get_normal_pgd(pgd_t *pgdp)
+{
+	return ptr_clear_bit(pgdp, PAGE_SHIFT);
+}
+static inline p4d_t *native_get_shadow_p4d(p4d_t *p4dp)
+{
+	return ptr_set_bit(p4dp, PAGE_SHIFT);
+}
+static inline p4d_t *native_get_normal_p4d(p4d_t *p4dp)
+{
+	return ptr_clear_bit(p4dp, PAGE_SHIFT);
+}
+#endif /* CONFIG_KAISER */
+
+/*
+ * Page table pages are page-aligned.  The lower half of the top
+ * level is used for userspace and the top half for the kernel.
+ *
+ * Returns true for parts of the PGD that map userspace and
+ * false for the parts that map the kernel.
+ */
+static inline bool pgdp_maps_userspace(void *__ptr)
+{
+	unsigned long ptr = (unsigned long)__ptr;
+
+	return ((ptr % PAGE_SIZE) < (PAGE_SIZE / 2));
+}
+
+/*
+ * Does this PGD allow access via userspace?
+ */
+static inline bool pgd_userspace_access(pgd_t pgd)
+{
+	return (pgd.pgd & _PAGE_USER);
+}
+
+
+void pgd_check_tag(pgd_t *pgd);
+void tag_pgd_with_size(pgd_t *pgd, unsigned long size);
+void clear_pgd_tag(pgd_t *pgd);
+
+/*
+ * Returns the pgd_t that the kernel should use in its page tables.
+ */
+static inline pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd)
+{
+#ifdef CONFIG_KAISER
+	/* ensure pgdp points to an actual correctly-allocated PGD */
+	pgd_check_tag(pgdp);
+
+	if (pgd_userspace_access(pgd)) {
+		if (pgdp_maps_userspace(pgdp)) {
+			/*
+			 * The user/shadow page tables get the full
+			 * PGD, accessible to userspace:
+			 */
+			native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
+			/*
+			 * For the copy of the pgd that the kernel
+			 * uses, make it unusable to userspace.  This
+			 * ensures if we get out to userspace with the
+			 * wrong CR3 value, userspace will crash
+			 * instead of running.
+			 */
+			pgd.pgd |= _PAGE_NX;
+		}
+	} else if (!pgd.pgd) {
+		/*
+		 * We are clearing the PGD and can not check _PAGE_USER
+		 * in the zeroed PGD.  We never do this on the
+		 * pre-populated kernel PGDs, except for pgd_bad().
+		 */
+		if (pgdp_maps_userspace(pgdp)) {
+			native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
+		} else {
+			/*
+			 * Uh, we are very confused.  We have been
+			 * asked to clear a PGD that is in the kernel
+			 * part of the address space.  We preallocated
+			 * all the KAISER PGDs, so this should never
+			 * happen.
+			 */
+			WARN_ON_ONCE(1);
+		}
+	}
+#endif
+	/* return the copy of the PGD we want the kernel to use: */
+	return pgd;
+}
+
+
 static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d)
 {
+#if defined(CONFIG_KAISER) && !defined(CONFIG_X86_5LEVEL)
+	/*
+	 * set_pgd() does not get called when we are running
+	 * CONFIG_X86_5LEVEL=y.  So, just hack around it.  We
+	 * know here that we have a p4d but that it is really at
+	 * the top level of the page tables; it is really just a
+	 * pgd.
+	 */
+	p4dp->pgd = kaiser_set_shadow_pgd(&p4dp->pgd, p4d.pgd);
+#else /* CONFIG_KAISER */
 	*p4dp = p4d;
+#endif
 }
 
 static inline void native_p4d_clear(p4d_t *p4d)
@@ -146,7 +271,11 @@
 
 static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
 {
+#ifdef CONFIG_KAISER
+	*pgdp = kaiser_set_shadow_pgd(pgdp, pgd);
+#else /* CONFIG_KAISER */
 	*pgdp = pgd;
+#endif
 }
 
 static inline void native_pgd_clear(pgd_t *pgd)
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index f149247..e9330a4 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -47,7 +47,12 @@
 #define _PAGE_ACCESSED	(_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED)
 #define _PAGE_DIRTY	(_AT(pteval_t, 1) << _PAGE_BIT_DIRTY)
 #define _PAGE_PSE	(_AT(pteval_t, 1) << _PAGE_BIT_PSE)
+#ifdef CONFIG_X86_GLOBAL_PAGES
 #define _PAGE_GLOBAL	(_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
+#else
+/* We must ensure that kernel TLB entries are unusable while in userspace */
+#define _PAGE_GLOBAL	(_AT(pteval_t, 0))
+#endif
 #define _PAGE_SOFTW1	(_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1)
 #define _PAGE_SOFTW2	(_AT(pteval_t, 1) << _PAGE_BIT_SOFTW2)
 #define _PAGE_PAT	(_AT(pteval_t, 1) << _PAGE_BIT_PAT)
@@ -121,7 +126,7 @@
 #define _PAGE_DEVMAP	(_AT(pteval_t, 0))
 #endif
 
-#define _PAGE_PROTNONE	(_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
+#define _PAGE_PROTNONE  (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
 
 #define _PAGE_TABLE_NOENC	(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |\
 				 _PAGE_ACCESSED | _PAGE_DIRTY)
@@ -139,6 +144,17 @@
 			 _PAGE_SOFT_DIRTY)
 #define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE)
 
+/* The ASID is the lower 12 bits of CR3 */
+#define X86_CR3_PCID_ASID_MASK  (_AC((1<<12)-1, UL))
+
+/* Mask for all the PCID-related bits in CR3: */
+#define X86_CR3_PCID_MASK       (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_MASK)
+
+/* Make sure this is only usable in KAISER #ifdef'd code: */
+#ifdef CONFIG_KAISER
+#define X86_CR3_KAISER_SWITCH_BIT 11
+#endif
+
 /*
  * The cache modes defined here are used to translate between pure SW usage
  * and the HW defined cache mode bits and/or PAT entries.
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index b390ff7..c1dd583 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -348,7 +348,7 @@
 
 } ____cacheline_aligned;
 
-DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss);
+DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, cpu_tss);
 
 /*
  * sizeof(unsigned long) coming from an extra "long" at the end
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index c4aed0d..32f981d 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -13,7 +13,7 @@
 			     unsigned long type)
 {
 	struct { u64 d[2]; } desc = { { pcid, addr } };
-
+	void *ptr = &desc;
 	/*
 	 * The memory clobber is because the whole point is to invalidate
 	 * stale TLB entries and, especially if we're flushing global
@@ -24,7 +24,7 @@
 	 * invpcid (%rcx), %rax in long mode.
 	 */
 	asm volatile (".byte 0x66, 0x0f, 0x38, 0x82, 0x01"
-		      : : "m" (desc), "a" (type), "c" (&desc) : "memory");
+		      : : "m" (desc), "a" (type), "c" (ptr) : "memory");
 }
 
 #define INVPCID_TYPE_INDIV_ADDR		0
@@ -48,13 +48,13 @@
 /* Flush all mappings, including globals, for all PCIDs. */
 static inline void invpcid_flush_all(void)
 {
-	__invpcid(0, 0, INVPCID_TYPE_ALL_INCL_GLOBAL);
+	__invpcid(0x123, 0x456, INVPCID_TYPE_ALL_INCL_GLOBAL);
 }
 
 /* Flush all mappings for all PCIDs except globals. */
 static inline void invpcid_flush_all_nonglobals(void)
 {
-	__invpcid(0, 0, INVPCID_TYPE_ALL_NON_GLOBAL);
+	__invpcid(0x123, 0x456, INVPCID_TYPE_ALL_NON_GLOBAL);
 }
 
 static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
@@ -74,6 +74,97 @@
 	return new_tlb_gen;
 }
 
+/* There are 12 bits of space for ASIDS in CR3 */
+#define CR3_HW_ASID_BITS 12
+/* When enabled, KAISER consumes a single bit for user/kernel switches */
+#ifdef CONFIG_KAISER
+#define X86_CR3_KAISER_SWITCH_BIT 11
+#define KAISER_CONSUMED_ASID_BITS 1
+#else
+#define KAISER_CONSUMED_ASID_BITS 0
+#endif
+
+#define CR3_AVAIL_ASID_BITS (CR3_HW_ASID_BITS-KAISER_CONSUMED_ASID_BITS)
+/*
+ * We lose a single extra ASID because 0 is reserved for use
+ * by non-PCID-aware users.
+ */
+#define NR_AVAIL_ASIDS ((1<<CR3_AVAIL_ASID_BITS) - 1)
+
+/*
+ * 6 because 6 should be plenty and struct tlb_state will fit in
+ * two cache lines.
+ */
+#define TLB_NR_DYN_ASIDS 6
+
+static inline u16 kern_asid(u16 asid)
+{
+	VM_WARN_ON_ONCE(asid >= NR_AVAIL_ASIDS);
+
+#ifdef CONFIG_KAISER
+	/*
+	 * Make sure that the dynamic ASID space does not conflict
+	 * with the bit we are using to switch between user and
+	 * kernel ASIDs.
+	 */
+	BUILD_BUG_ON(TLB_NR_DYN_ASIDS >= (1<<X86_CR3_KAISER_SWITCH_BIT));
+
+	/*
+	 * The ASID being passed in here should have respected
+	 * the NR_AVAIL_ASIDS and thus never have the switch
+	 * bit set.
+	 */
+	VM_WARN_ON_ONCE(asid & (1<<X86_CR3_KAISER_SWITCH_BIT));
+#endif
+	/*
+	 * The dynamically-assigned ASIDs that get passed in  are
+	 * small (<TLB_NR_DYN_ASIDS).  They never have the high
+	 * switch bit set, so do not bother to clear it.
+	 */
+
+	/*
+	 * If PCID is on, ASID-aware code paths put the ASID+1
+	 * into the PCID bits.  This serves two purposes.  It
+	 * prevents a nasty situation in which PCID-unaware code
+	 * saves CR3, loads some other value (with PCID == 0),
+	 * and then restores CR3, thus corrupting the TLB for
+	 * ASID 0 if the saved ASID was nonzero.  It also means
+	 * that any bugs involving loading a PCID-enabled CR3
+	 * with CR4.PCIDE off will trigger deterministically.
+	 */
+	return asid + 1;
+}
+
+/*
+ * The user ASID is just the kernel one, plus the "switch bit".
+ */
+static inline u16 user_asid(u16 asid)
+{
+	u16 ret = kern_asid(asid);
+#ifdef CONFIG_KAISER
+	ret |= 1<<X86_CR3_KAISER_SWITCH_BIT;
+#endif
+	return ret;
+}
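+
+/*
+ * Worked example (CONFIG_KAISER=y and PCID in use): dynamic ASID 0
+ * becomes hardware PCID 1 in the kernel CR3 (kern_asid(0) == 1) and
+ * PCID 0x801 in the user CR3 (user_asid(0) == 1 | 1<<11), which is
+ * why the entry/exit code can switch ASIDs by toggling bit 11
+ * (X86_CR3_KAISER_SWITCH_BIT) of CR3.
+ */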
+
+struct pgd_t;
+static inline unsigned long build_cr3(pgd_t *pgd, u16 asid)
+{
+	if (static_cpu_has(X86_FEATURE_PCID)) {
+		return __sme_pa(pgd) | kern_asid(asid);
+	} else {
+		VM_WARN_ON_ONCE(asid != 0);
+		return __sme_pa(pgd);
+	}
+}
+
+static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid)
+{
+	VM_WARN_ON_ONCE(asid > NR_AVAIL_ASIDS);
+	VM_WARN_ON_ONCE(!this_cpu_has(X86_FEATURE_PCID));
+	return __sme_pa(pgd) | kern_asid(asid) | CR3_NOFLUSH;
+}
+
 #ifdef CONFIG_PARAVIRT
 #include <asm/paravirt.h>
 #else
@@ -98,12 +189,6 @@
 	return !static_cpu_has(X86_FEATURE_PCID);
 }
 
-/*
- * 6 because 6 should be plenty and struct tlb_state will fit in
- * two cache lines.
- */
-#define TLB_NR_DYN_ASIDS 6
-
 struct tlb_context {
 	u64 ctx_id;
 	u64 tlb_gen;
@@ -138,6 +223,17 @@
 	bool is_lazy;
 
 	/*
+	 * If set we changed the page tables in such a way that we
+	 * needed an invalidation of all contexts (aka. PCIDs / ASIDs).
+	 * This tells us to go invalidate all the non-loaded ctxs[]
+	 * on the next context switch.
+	 *
+	 * The current ctx was kept up-to-date as it ran and does not
+	 * need to be invalidated.
+	 */
+	bool all_other_ctxs_invalid;
+
+	/*
 	 * Access to this CR4 shadow and to H/W CR4 is protected by
 	 * disabling interrupts when modifying either one.
 	 */
@@ -214,6 +310,19 @@
 	return this_cpu_read(cpu_tlbstate.cr4);
 }
 
+static inline void tlb_flush_shared_nonglobals(void)
+{
+	/*
+	 * With global pages, all of the shared kernel page tables
+	 * are set as _PAGE_GLOBAL.  We have no shared nonglobals
+	 * and nothing to do here.
+	 */
+	if (IS_ENABLED(CONFIG_X86_GLOBAL_PAGES))
+		return;
+
+	this_cpu_write(cpu_tlbstate.all_other_ctxs_invalid, true);
+}
+
 /*
  * Save some of cr4 feature set we're using (e.g.  Pentium 4MB
  * enable and PPro Global page enable), so that any CPU's that boot
@@ -235,14 +344,38 @@
 
 static inline void __native_flush_tlb(void)
 {
+	if (!cpu_feature_enabled(X86_FEATURE_INVPCID)) {
+		/*
+		 * native_write_cr3() only clears the current PCID if
+		 * CR4 has X86_CR4_PCIDE set.  In other words, this does
+		 * not fully flush the TLB if PCIDs are in use.  Warn if
+		 * it gets called.
+		 */
+		WARN_ON_ONCE(this_cpu_read(cpu_tlbstate.cr4) & X86_CR4_PCIDE);
+		/*
+		 * If current->mm == NULL then we borrow a mm
+		 * which may change during a task switch and
+		 * therefore we must not be preempted while we
+		 * write CR3 back:
+		 */
+		preempt_disable();
+		native_write_cr3(__native_read_cr3());
+		preempt_enable();
+		/*
+		 * Does not need tlb_flush_shared_nonglobals()
+		 * since the CR3 write without PCIDs flushes all
+		 * non-globals.
+		 */
+		return;
+	}
 	/*
-	 * If current->mm == NULL then we borrow a mm which may change during a
-	 * task switch and therefore we must not be preempted while we write CR3
-	 * back:
+	 * We are no longer using globals with KAISER, so a
+	 * "nonglobals" flush would work too. But, this is more
+	 * conservative.
+	 *
+	 * Note, this works with CR4.PCIDE=0 or 1.
 	 */
-	preempt_disable();
-	native_write_cr3(__native_read_cr3());
-	preempt_enable();
+	invpcid_flush_all();
 }
 
 static inline void __native_flush_tlb_global_irq_disabled(void)
@@ -250,7 +383,18 @@
 	unsigned long cr4;
 
 	cr4 = this_cpu_read(cpu_tlbstate.cr4);
-	/* clear PGE */
+	/*
+	 * According to the SDM, a MOV to CR4 invalidates all TLB
+	 * entries (including global entries) and all entries in
+	 * all paging-structure caches (for all PCIDs) if it
+	 * changes the value of CR4.PGE or changes CR4.PCIDE
+	 * from 1->0.
+	 *
+	 * We can not change CR4.PCIDE 1->0 if we have a non-zero
+	 * PCID in CR3, so just toggle X86_CR4_PGE.  But,
+	 * this will clear all PCIDs, just as if X86_CR4_PCIDE
+	 * had also been toggled.
+	 */
 	native_write_cr4(cr4 & ~X86_CR4_PGE);
 	/* write old PGE again and flush TLBs */
 	native_write_cr4(cr4);
@@ -264,6 +408,8 @@
 		/*
 		 * Using INVPCID is considerably faster than a pair of writes
 		 * to CR4 sandwiched inside an IRQ flag save/restore.
+		 *
+		 * Note, this works with CR4.PCIDE=0 or 1.
 		 */
 		invpcid_flush_all();
 		return;
@@ -283,29 +429,51 @@
 
 static inline void __native_flush_tlb_single(unsigned long addr)
 {
-	asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
+	u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
+
+	/*
+	 * Some platforms #GP if we call invpcid(type=1/2) before
+	 * CR4.PCIDE=1.  Just call invpcid in the case we are called
+	 * early.
+	 */
+	if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE)) {
+		asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
+		return;
+	}
+	/* Flush the address out of both PCIDs. */
+	/*
+	 * An optimization here might be to determine addresses
+	 * that are only kernel-mapped and only flush the kernel
+	 * ASID.  But, userspace flushes are probably much more
+	 * important performance-wise.
+	 *
+	 * Make sure to do only a single invpcid when KAISER is
+	 * disabled and we have only a single ASID.
+	 */
+	if (kern_asid(loaded_mm_asid) != user_asid(loaded_mm_asid))
+		invpcid_flush_one(user_asid(loaded_mm_asid), addr);
+	invpcid_flush_one(kern_asid(loaded_mm_asid), addr);
 }
 
 static inline void __flush_tlb_all(void)
 {
-	if (boot_cpu_has(X86_FEATURE_PGE))
+	if (boot_cpu_has(X86_FEATURE_PGE)) {
 		__flush_tlb_global();
-	else
+	} else {
 		__flush_tlb();
-
-	/*
-	 * Note: if we somehow had PCID but not PGE, then this wouldn't work --
-	 * we'd end up flushing kernel translations for the current ASID but
-	 * we might fail to flush kernel translations for other cached ASIDs.
-	 *
-	 * To avoid this issue, we force PCID off if PGE is off.
-	 */
+		tlb_flush_shared_nonglobals();
+	}
 }
 
 static inline void __flush_tlb_one(unsigned long addr)
 {
 	count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);
 	__flush_tlb_single(addr);
+	/*
+	 * Invalidate other address spaces inaccessible to single-page
+	 * invalidation:
+	 */
+	tlb_flush_shared_nonglobals();
 }
 
 #define TLB_FLUSH_ALL	-1UL
diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h
index 185f3d1..6616457 100644
--- a/arch/x86/include/uapi/asm/processor-flags.h
+++ b/arch/x86/include/uapi/asm/processor-flags.h
@@ -77,7 +77,8 @@
 #define X86_CR3_PWT		_BITUL(X86_CR3_PWT_BIT)
 #define X86_CR3_PCD_BIT		4 /* Page Cache Disable */
 #define X86_CR3_PCD		_BITUL(X86_CR3_PCD_BIT)
-#define X86_CR3_PCID_MASK	_AC(0x00000fff,UL) /* PCID Mask */
+#define X86_CR3_PCID_NOFLUSH_BIT 63 /* Preserve old PCID */
+#define X86_CR3_PCID_NOFLUSH    _BITULL(X86_CR3_PCID_NOFLUSH_BIT)
 
 /*
  * Intel CPU features in CR4
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 3344d33..dcd6ea7 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -722,7 +722,8 @@
 	clear_fixmap(FIX_TEXT_POKE0);
 	if (pages[1])
 		clear_fixmap(FIX_TEXT_POKE1);
-	local_flush_tlb();
+	/* Make sure to flush Global pages: */
+	__flush_tlb_all();
 	sync_core();
 	/* Could also do a CLFLUSH here to speed up CPU recovery; but
 	   that causes hangs on some VIA CPUs. */
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index c9176ba..f4985b4 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -5,6 +5,7 @@
 #include <linux/export.h>
 #include <linux/percpu.h>
 #include <linux/string.h>
+#include <linux/kaiser.h>
 #include <linux/ctype.h>
 #include <linux/delay.h>
 #include <linux/sched/mm.h>
@@ -98,7 +99,7 @@
 
 static const struct cpu_dev *this_cpu = &default_cpu;
 
-DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
+DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(struct gdt_page, gdt_page) = { .gdt = {
 #ifdef CONFIG_X86_64
 	/*
 	 * We need valid kernel segments for data and code in long mode too
@@ -487,6 +488,20 @@
 #endif
 
 	__set_fixmap(get_cpu_gdt_ro_index(cpu), get_cpu_gdt_paddr(cpu), prot);
+
+	/* CPU 0's mapping is done in kaiser_init() */
+	if (cpu) {
+		int ret;
+
+		ret = kaiser_add_mapping((unsigned long) get_cpu_gdt_ro(cpu),
+					 PAGE_SIZE, __PAGE_KERNEL_RO);
+		/*
+		 * We do not have a good way to fail CPU bringup.
+		 * Just WARN about it and hope we boot far enough
+		 * to get a good log out.
+		 */
+		WARN_ON(ret);
+	}
 }
 
 /* Load the original GDT from the per-cpu structure */
@@ -1345,7 +1360,7 @@
 	  [DEBUG_STACK - 1]			= DEBUG_STKSZ
 };
 
-static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
+DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(char, exception_stacks
 	[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
 
 /* May not be marked __init: used by software suspend */
@@ -1425,7 +1440,7 @@
  * the top of the kernel stack.  Use an extra percpu variable to track the
  * top of the kernel stack directly.
  */
-DEFINE_PER_CPU(unsigned long, cpu_current_top_of_stack) =
+DEFINE_PER_CPU_USER_MAPPED(unsigned long, cpu_current_top_of_stack) =
 	(unsigned long)&init_thread_union + THREAD_SIZE;
 EXPORT_PER_CPU_SYMBOL(cpu_current_top_of_stack);
 
diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
index 9c4e7ba..673137a 100644
--- a/arch/x86/kernel/espfix_64.c
+++ b/arch/x86/kernel/espfix_64.c
@@ -33,6 +33,7 @@
 
 #include <linux/init.h>
 #include <linux/init_task.h>
+#include <linux/kaiser.h>
 #include <linux/kernel.h>
 #include <linux/percpu.h>
 #include <linux/gfp.h>
@@ -60,8 +61,8 @@
 #define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO)
 
 /* This contains the *bottom* address of the espfix stack */
-DEFINE_PER_CPU_READ_MOSTLY(unsigned long, espfix_stack);
-DEFINE_PER_CPU_READ_MOSTLY(unsigned long, espfix_waddr);
+DEFINE_PER_CPU_USER_MAPPED(unsigned long, espfix_stack);
+DEFINE_PER_CPU_USER_MAPPED(unsigned long, espfix_waddr);
 
 /* Initialization mutex - should this be a spinlock? */
 static DEFINE_MUTEX(espfix_init_mutex);
@@ -128,6 +129,22 @@
 	pgd = &init_top_pgt[pgd_index(ESPFIX_BASE_ADDR)];
 	p4d = p4d_alloc(&init_mm, pgd, ESPFIX_BASE_ADDR);
 	p4d_populate(&init_mm, p4d, espfix_pud_page);
+	/*
+	 * Just copy the top-level PGD that is mapping the espfix
+	 * area to ensure it is mapped into the shadow user page
+	 * tables.
+	 *
+	 * For 5-level paging, we should have already populated
+	 * the espfix pgd when kaiser_init() pre-populated all
+	 * the pgd entries.  The above p4d_alloc() would never do
+	 * anything and the p4d_populate() would be done to a p4d
+	 * already mapped in the userspace pgd.
+	 */
+#ifdef CONFIG_KAISER
+	if (CONFIG_PGTABLE_LEVELS <= 4)
+		*native_get_shadow_pgd(pgd) =
+			__pgd(_KERNPG_TABLE | (p4d_pfn(*p4d) << PAGE_SHIFT));
+#endif
 
 	/* Randomize the locations */
 	init_espfix_random();
@@ -208,4 +225,5 @@
 	per_cpu(espfix_stack, cpu) = addr;
 	per_cpu(espfix_waddr, cpu) = (unsigned long)stack_page
 				      + (addr & ~PAGE_MASK);
+	kaiser_add_mapping((unsigned long)stack_page, PAGE_SIZE, __PAGE_KERNEL);
 }
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 513cbb0..35eab60 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -339,6 +339,27 @@
 	.balign	PAGE_SIZE; \
 GLOBAL(name)
 
+#ifdef CONFIG_KAISER
+/*
+ * Each PGD needs to be 8k long and 8k aligned.  We do not
+ * ever go out to userspace with these, so we do not
+ * strictly *need* the second page, but this allows us to
+ * have a single set_pgd() implementation that does not
+ * need to worry about whether it has 4k or 8k to work
+ * with.
+ *
+ * This ensures PGDs are 8k long:
+ */
+#define KAISER_USER_PGD_FILL	512
+/* This ensures they are 8k-aligned: */
+#define NEXT_PGD_PAGE(name) \
+	.balign 2 * PAGE_SIZE; \
+GLOBAL(name)
+#else
+#define NEXT_PGD_PAGE(name) NEXT_PAGE(name)
+#define KAISER_USER_PGD_FILL	0
+#endif
+
 /* Automate the creation of 1 to 1 mapping pmd entries */
 #define PMDS(START, PERM, COUNT)			\
 	i = 0 ;						\
@@ -348,13 +369,14 @@
 	.endr
 
 	__INITDATA
-NEXT_PAGE(early_top_pgt)
+NEXT_PGD_PAGE(early_top_pgt)
 	.fill	511,8,0
 #ifdef CONFIG_X86_5LEVEL
 	.quad	level4_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
 #else
 	.quad	level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
 #endif
+	.fill	KAISER_USER_PGD_FILL,8,0
 
 NEXT_PAGE(early_dynamic_pgts)
 	.fill	512*EARLY_DYNAMIC_PAGE_TABLES,8,0
@@ -362,16 +384,18 @@
 	.data
 
 #ifndef CONFIG_XEN
-NEXT_PAGE(init_top_pgt)
+NEXT_PGD_PAGE(init_top_pgt)
 	.fill	512,8,0
+	.fill	KAISER_USER_PGD_FILL,8,0
 #else
-NEXT_PAGE(init_top_pgt)
+NEXT_PGD_PAGE(init_top_pgt)
 	.quad   level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
 	.org    init_top_pgt + PGD_PAGE_OFFSET*8, 0
 	.quad   level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
 	.org    init_top_pgt + PGD_START_KERNEL*8, 0
 	/* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
 	.quad   level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
+	.fill	KAISER_USER_PGD_FILL,8,0
 
 NEXT_PAGE(level3_ident_pgt)
 	.quad	level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
index 6107ee1..fa63b0b 100644
--- a/arch/x86/kernel/idt.c
+++ b/arch/x86/kernel/idt.c
@@ -4,6 +4,7 @@
  * This file is licensed under the GPL V2
  */
 #include <linux/interrupt.h>
+#include <linux/kaiser.h>
 
 #include <asm/traps.h>
 #include <asm/proto.h>
@@ -221,6 +222,8 @@
 {
 	gate_desc desc;
 
+	kaiser_check_user_mapped(t->addr);
+
 	for (; size > 0; t++, size--) {
 		idt_init_desc(&desc, t);
 		write_idt_entry(idt, t->vector, &desc);
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 1add9e0..8ae1196 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -51,7 +51,7 @@
 	.flags = IRQF_NO_THREAD,
 };
 
-DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
+DEFINE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq) = {
 	[0 ... NR_VECTORS - 1] = VECTOR_UNUSED,
 };
 
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index f0e64db..3f53930 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -10,6 +10,7 @@
 #include <linux/gfp.h>
 #include <linux/sched.h>
 #include <linux/string.h>
+#include <linux/kaiser.h>
 #include <linux/mm.h>
 #include <linux/smp.h>
 #include <linux/slab.h>
@@ -55,11 +56,21 @@
 	refresh_ldt_segments();
 }
 
+static void __free_ldt_struct(struct ldt_struct *ldt)
+{
+	if (ldt->nr_entries * LDT_ENTRY_SIZE > PAGE_SIZE)
+		vfree_atomic(ldt->entries);
+	else
+		free_page((unsigned long)ldt->entries);
+	kfree(ldt);
+}
+
 /* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */
 static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries)
 {
 	struct ldt_struct *new_ldt;
 	unsigned int alloc_size;
+	int ret = 0;
 
 	if (num_entries > LDT_ENTRIES)
 		return NULL;
@@ -87,6 +98,12 @@
 		return NULL;
 	}
 
+	ret = kaiser_add_mapping((unsigned long)new_ldt->entries, alloc_size,
+				 __PAGE_KERNEL);
+	if (ret) {
+		__free_ldt_struct(new_ldt);
+		return NULL;
+	}
 	new_ldt->nr_entries = num_entries;
 	return new_ldt;
 }
@@ -113,12 +130,10 @@
 	if (likely(!ldt))
 		return;
 
+	kaiser_remove_mapping((unsigned long)ldt->entries,
+			      ldt->nr_entries * LDT_ENTRY_SIZE);
 	paravirt_free_ldt(ldt->entries, ldt->nr_entries);
-	if (ldt->nr_entries * LDT_ENTRY_SIZE > PAGE_SIZE)
-		vfree_atomic(ldt->entries);
-	else
-		free_page((unsigned long)ldt->entries);
-	kfree(ldt);
+	__free_ldt_struct(ldt);
 }
 
 /*
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index bd6b85f..a1f65ea 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -46,7 +46,7 @@
  * section. Since TSS's are completely CPU-local, we want them
  * on exact cacheline boundaries, to eliminate cacheline ping-pong.
  */
-__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
+__visible DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, cpu_tss) = {
 	.x86_tss = {
 		.sp0 = TOP_OF_INIT_STACK,
 #ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 302e7b2..3839f06 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -59,7 +59,7 @@
 #include <asm/unistd_32_ia32.h>
 #endif
 
-__visible DEFINE_PER_CPU(unsigned long, rsp_scratch);
+__visible DEFINE_PER_CPU_USER_MAPPED(unsigned long, rsp_scratch);
 
 /* Prints also some state that isn't saved in the pt_regs */
 void __show_regs(struct pt_regs *regs, int all)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 03869eb..cd7ed7a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -805,7 +805,8 @@
 			return 1;
 
 		/* PCID can not be enabled when cr3[11:0]!=000H or EFER.LMA=0 */
-		if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu))
+		if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_ASID_MASK) ||
+		    !is_long_mode(vcpu))
 			return 1;
 	}
 
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index e1f0958..9639303 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -45,6 +45,7 @@
 obj-$(CONFIG_X86_INTEL_MPX)	+= mpx.o
 obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o
 obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o
+obj-$(CONFIG_KAISER) += kaiser.o
 
 obj-$(CONFIG_AMD_MEM_ENCRYPT)	+= mem_encrypt.o
 obj-$(CONFIG_AMD_MEM_ENCRYPT)	+= mem_encrypt_boot.o
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index af5c1ed..ca03303 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -196,34 +196,59 @@
 
 static void setup_pcid(void)
 {
-#ifdef CONFIG_X86_64
-	if (boot_cpu_has(X86_FEATURE_PCID)) {
-		if (boot_cpu_has(X86_FEATURE_PGE)) {
-			/*
-			 * This can't be cr4_set_bits_and_update_boot() --
-			 * the trampoline code can't handle CR4.PCIDE and
-			 * it wouldn't do any good anyway.  Despite the name,
-			 * cr4_set_bits_and_update_boot() doesn't actually
-			 * cause the bits in question to remain set all the
-			 * way through the secondary boot asm.
-			 *
-			 * Instead, we brute-force it and set CR4.PCIDE
-			 * manually in start_secondary().
-			 */
-			cr4_set_bits(X86_CR4_PCIDE);
-		} else {
-			/*
-			 * flush_tlb_all(), as currently implemented, won't
-			 * work if PCID is on but PGE is not.  Since that
-			 * combination doesn't exist on real hardware, there's
-			 * no reason to try to fully support it, but it's
-			 * polite to avoid corrupting data if we're on
-			 * an improperly configured VM.
-			 */
+	if (!IS_ENABLED(CONFIG_X86_64))
+		return;
+
+	if (!boot_cpu_has(X86_FEATURE_PCID))
+		return;
+
+	if (boot_cpu_has(X86_FEATURE_PGE)) {
+		/*
+		 * KAISER uses a PCID for the kernel and another
+		 * for userspace.  Both PCIDs need to be flushed
+		 * when the TLB flush functions are called.  But,
+		 * flushing *another* PCID is insane without
+		 * INVPCID.  Just avoid using PCIDs at all if we
+		 * have KAISER and do not have INVPCID.
+		 */
+		if (!IS_ENABLED(CONFIG_X86_GLOBAL_PAGES) &&
+		    !boot_cpu_has(X86_FEATURE_INVPCID)) {
 			setup_clear_cpu_cap(X86_FEATURE_PCID);
+			return;
 		}
+		/*
+		 * This can't be cr4_set_bits_and_update_boot() --
+		 * the trampoline code can't handle CR4.PCIDE and
+		 * it wouldn't do any good anyway.  Despite the name,
+		 * cr4_set_bits_and_update_boot() doesn't actually
+		 * cause the bits in question to remain set all the
+		 * way through the secondary boot asm.
+		 *
+		 * Instead, we brute-force it and set CR4.PCIDE
+		 * manually in start_secondary().
+		 */
+		cr4_set_bits(X86_CR4_PCIDE);
+
+		/*
+		 * INVPCID's single-context modes (2/3) only work
+		 * if we set X86_CR4_PCIDE, *and* we have INVPCID
+		 * support.  It's unusable on systems that have
+		 * X86_CR4_PCIDE clear, or that have no INVPCID
+		 * support at all.
+		 */
+		if (boot_cpu_has(X86_FEATURE_INVPCID))
+			setup_force_cpu_cap(X86_FEATURE_INVPCID_SINGLE);
+	} else {
+		/*
+		 * flush_tlb_all(), as currently implemented, won't
+		 * work if PCID is on but PGE is not.  Since that
+		 * combination doesn't exist on real hardware, there's
+		 * no reason to try to fully support it, but it's
+		 * polite to avoid corrupting data if we're on
+		 * an improperly configured VM.
+		 */
+		setup_clear_cpu_cap(X86_FEATURE_PCID);
 	}
-#endif
 }
 
 #ifdef CONFIG_X86_32
diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c
new file mode 100644
index 0000000..3366d20
--- /dev/null
+++ b/arch/x86/mm/kaiser.c
@@ -0,0 +1,469 @@
+/*
+ * Copyright(c) 2017 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * Based on work published here: https://github.com/IAIK/KAISER
+ * Modified by Dave Hansen <dave.hansen@intel.com> to actually work.
+ */
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/bug.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/spinlock.h>
+#include <linux/mm.h>
+#include <linux/uaccess.h>
+
+#include <asm/kaiser.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/desc.h>
+
+/*
+ * At runtime, the only things we map are some things for CPU
+ * hotplug, and stacks for new processes.  No two CPUs will ever
+ * be populating the same addresses, so we only need to ensure
+ * that we protect between two CPUs trying to allocate and
+ * populate the same page table page.
+ *
+ * Only take this lock when doing a set_p[4um]d(), but it is not
+ * needed for doing a set_pte().  We assume that only the *owner*
+ * of a given allocation will be doing this for _their_
+ * allocation.
+ *
+ * This ensures that once a system has been running for a while
+ * and there have been stacks all over and these page tables
+ * are fully populated, there will be no further acquisitions of
+ * this lock.
+ */
+static DEFINE_SPINLOCK(shadow_table_allocation_lock);
+
+/*
+ * Returns -1 on error.
+ */
+static inline unsigned long get_pa_from_mapping(unsigned long vaddr)
+{
+	pgd_t *pgd;
+	p4d_t *p4d;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte;
+
+	pgd = pgd_offset_k(vaddr);
+	/*
+	 * We made all the kernel PGDs present in kaiser_init().
+	 * We expect them to stay that way.
+	 */
+	if (pgd_none(*pgd)) {
+		WARN_ON_ONCE(1);
+		return -1;
+	}
+	/*
+	 * PGDs are either 512GB or 128TB on all x86_64
+	 * configurations.  We don't handle these.
+	 */
+	if (pgd_large(*pgd)) {
+		WARN_ON_ONCE(1);
+		return -1;
+	}
+
+	p4d = p4d_offset(pgd, vaddr);
+	if (p4d_none(*p4d)) {
+		WARN_ON_ONCE(1);
+		return -1;
+	}
+
+	pud = pud_offset(p4d, vaddr);
+	if (pud_none(*pud)) {
+		WARN_ON_ONCE(1);
+		return -1;
+	}
+
+	if (pud_large(*pud))
+		return (pud_pfn(*pud) << PAGE_SHIFT) | (vaddr & ~PUD_PAGE_MASK);
+
+	pmd = pmd_offset(pud, vaddr);
+	if (pmd_none(*pmd)) {
+		WARN_ON_ONCE(1);
+		return -1;
+	}
+
+	if (pmd_large(*pmd))
+		return (pmd_pfn(*pmd) << PAGE_SHIFT) | (vaddr & ~PMD_PAGE_MASK);
+
+	pte = pte_offset_kernel(pmd, vaddr);
+	if (pte_none(*pte)) {
+		WARN_ON_ONCE(1);
+		return -1;
+	}
+
+	return (pte_pfn(*pte) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK);
+}
+
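+/*
+ * Flags for kaiser_pagetable_walk():
+ *
+ * KAISER_WALK_ATOMIC: the caller cannot sleep, so any page table
+ * page allocated during the walk must use an atomic allocation.
+ * KAISER_WALK_NO_ALLOC: only inspect the shadow tables; fail the
+ * walk instead of allocating missing intermediate levels.
+ */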
+#define KAISER_NO_FLAGS		0
+#define KAISER_WALK_ATOMIC	0x1
+#define KAISER_WALK_NO_ALLOC	0x2
+
+/*
+ * This is a relatively normal page table walk, except that it
+ * only walks the shadow page tables and can try to allocate
+ * new page table pages along the way.
+ *
+ * Returns a pointer to a PTE on success, or NULL on failure.
+ */
+static pte_t *kaiser_pagetable_walk(unsigned long address, unsigned long flags)
+{
+	pmd_t *pmd;
+	pud_t *pud;
+	p4d_t *p4d;
+	pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(address));
+	gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
+
+	if (flags & KAISER_WALK_ATOMIC) {
+		gfp &= ~GFP_KERNEL;
+		gfp |= __GFP_HIGH | __GFP_ATOMIC;
+	} else {
+		might_sleep();
+	}
+
+	if (pgd_none(*pgd)) {
+		WARN_ONCE(1, "All shadow pgds should have been populated");
+		return NULL;
+	}
+	BUILD_BUG_ON(pgd_large(*pgd) != 0);
+
+	p4d = p4d_offset(pgd, address);
+	BUILD_BUG_ON(p4d_large(*p4d) != 0);
+	if (p4d_none(*p4d)) {
+		unsigned long new_pud_page;
+		if (flags & KAISER_WALK_NO_ALLOC)
+			return NULL;
+		new_pud_page = __get_free_page(gfp);
+		if (!new_pud_page)
+			return NULL;
+
+		spin_lock(&shadow_table_allocation_lock);
+		if (p4d_none(*p4d))
+			set_p4d(p4d, __p4d(_KERNPG_TABLE | __pa(new_pud_page)));
+		else
+			free_page(new_pud_page);
+		spin_unlock(&shadow_table_allocation_lock);
+	}
+
+	pud = pud_offset(p4d, address);
+	/* The shadow page tables do not use large mappings: */
+	if (pud_large(*pud)) {
+		WARN_ON(1);
+		return NULL;
+	}
+	if (pud_none(*pud)) {
+		unsigned long new_pmd_page;
+
+		if (flags & KAISER_WALK_NO_ALLOC)
+			return NULL;
+
+		new_pmd_page = __get_free_page(gfp);
+		if (!new_pmd_page)
+			return NULL;
+
+		spin_lock(&shadow_table_allocation_lock);
+		if (pud_none(*pud))
+			set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page)));
+		else
+			free_page(new_pmd_page);
+		spin_unlock(&shadow_table_allocation_lock);
+	}
+
+	pmd = pmd_offset(pud, address);
+	/* The shadow page tables do not use large mappings: */
+	if (pmd_large(*pmd)) {
+		WARN_ON(1);
+		return NULL;
+	}
+	if (pmd_none(*pmd)) {
+		unsigned long new_pte_page;
+
+		if (flags & KAISER_WALK_NO_ALLOC)
+			return NULL;
+
+		new_pte_page = __get_free_page(gfp);
+		if (!new_pte_page)
+			return NULL;
+
+		spin_lock(&shadow_table_allocation_lock);
+		if (pmd_none(*pmd))
+			set_pmd(pmd, __pmd(_KERNPG_TABLE  | __pa(new_pte_page)));
+		else
+			free_page(new_pte_page);
+		spin_unlock(&shadow_table_allocation_lock);
+	}
+
+	return pte_offset_kernel(pmd, address);
+}
+
+/*
+ * Copy the kernel mappings covering @size bytes starting at
+ * @__start_addr into the user (shadow) page tables.  This may need
+ * to allocate page table pages.
+ */
+int kaiser_add_user_map(const void *__start_addr, unsigned long size,
+			unsigned long flags)
+{
+	pte_t *pte;
+	unsigned long start_addr = (unsigned long)__start_addr;
+	unsigned long address = start_addr & PAGE_MASK;
+	unsigned long end_addr = PAGE_ALIGN(start_addr + size);
+	unsigned long target_address;
+
+	for (; address < end_addr; address += PAGE_SIZE) {
+		target_address = get_pa_from_mapping(address);
+		if (target_address == -1)
+			return -EIO;
+
+		pte = kaiser_pagetable_walk(address, KAISER_NO_FLAGS);
+		/*
+		 * Errors come from either -ENOMEM for a page
+		 * table page, or something screwy that did a
+		 * WARN_ON().  Just return -ENOMEM.
+		 */
+		if (!pte)
+			return -ENOMEM;
+		if (pte_none(*pte)) {
+			set_pte(pte, __pte(flags | target_address));
+		} else {
+			pte_t tmp;
+			set_pte(&tmp, __pte(flags | target_address));
+			WARN_ON_ONCE(!pte_same(*pte, tmp));
+		}
+	}
+	return 0;
+}
+
+int kaiser_add_user_map_ptrs(const void *__start_addr,
+			     const void *__end_addr,
+			     unsigned long flags)
+{
+	return kaiser_add_user_map(__start_addr,
+				   __end_addr - __start_addr,
+				   flags);
+}
+
+static int kaiser_user_map_ptr_early(const void *start_addr, unsigned long size,
+				 unsigned long flags)
+{
+	int ret = kaiser_add_user_map(start_addr, size, flags);
+	WARN_ON(ret);
+	return ret;
+}
+
+/*
+ * Ensure that the top level of the (shadow) page tables is
+ * entirely populated.  This ensures that all processes that get
+ * forked have the same entries.  This way, we do not have to
+ * ever go set up new entries in older processes.
+ *
+ * Note: we never free these, so there are no updates to them
+ * after this.
+ */
+static void __init kaiser_init_all_pgds(void)
+{
+	pgd_t *pgd;
+	int i = 0;
+
+	pgd = native_get_shadow_pgd(pgd_offset_k(0UL));
+	for (i = PTRS_PER_PGD / 2; i < PTRS_PER_PGD; i++) {
+		unsigned long addr = PAGE_OFFSET + i * PGDIR_SIZE;
+#if CONFIG_PGTABLE_LEVELS > 4
+		p4d_t *p4d = p4d_alloc_one(&init_mm, addr);
+		if (!p4d) {
+			WARN_ON(1);
+			break;
+		}
+		pgd[i] = __pgd(_KERNPG_TABLE | __pa(p4d));
+#else /* CONFIG_PGTABLE_LEVELS <= 4 */
+		pud_t *pud = pud_alloc_one(&init_mm, addr);
+		if (!pud) {
+			WARN_ON(1);
+			break;
+		}
+		pgd[i] = __pgd(_KERNPG_TABLE | __pa(pud));
+#endif /* CONFIG_PGTABLE_LEVELS */
+	}
+}
+
+/*
+ * The page table allocations in here can theoretically fail, but
+ * we can not do much about it in early boot.  Do the checking
+ * and warning in a macro to make it more readable.
+ */
+#define kaiser_add_user_map_early(start, size, flags) do {	\
+	int __ret = kaiser_add_user_map(start, size, flags);	\
+	WARN_ON(__ret);						\
+} while (0)
+
+#define kaiser_add_user_map_ptrs_early(start, end, flags) do {		\
+	int __ret = kaiser_add_user_map_ptrs(start, end, flags);	\
+	WARN_ON(__ret);							\
+} while (0)
+
+extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[];
+/*
+ * If anything in here fails, we will likely die on one of the
+ * first kernel->user transitions and init will die.  But, we
+ * will have most of the kernel up by then and should be able to
+ * get a clean warning out of it.  If we BUG_ON() here, we run
+ * the risk of dying before we have good console output.
+ */
+static int kaiser_init_finished = 0;
+void kaiser_check_idt_table(const struct desc_ptr *idt_desc);
+void __init kaiser_init(void)
+{
+	int cpu;
+
+	kaiser_init_all_pgds();
+
+	for_each_possible_cpu(cpu) {
+		void *percpu_vaddr = __per_cpu_user_mapped_start +
+				     per_cpu_offset(cpu);
+		unsigned long percpu_sz = __per_cpu_user_mapped_end -
+					  __per_cpu_user_mapped_start;
+		kaiser_add_user_map_early(percpu_vaddr, percpu_sz,
+					  __PAGE_KERNEL);
+	}
+
+	kaiser_add_user_map_ptrs_early(__entry_text_start, __entry_text_end,
+				       __PAGE_KERNEL_RX);
+
+	/* the fixed map address of the idt_table */
+	kaiser_add_user_map_early((void *)idt_descr.address,
+				  sizeof(gate_desc) * NR_VECTORS,
+				  __PAGE_KERNEL_RO);
+
+	kaiser_user_map_ptr_early(&debug_idt_table,
+				  sizeof(gate_desc) * NR_VECTORS,
+				  __PAGE_KERNEL);
+
+	/*
+	 * We could theoretically do this in setup_fixmap_gdt().
+	 * But, we would need to rewrite the above page table
+	 * allocation code to use the bootmem allocator.  The
+	 * buddy allocator is not available at the time that we
+	 * call setup_fixmap_gdt() for CPU 0.
+	 */
+	kaiser_add_user_map_early(get_cpu_gdt_ro(0), PAGE_SIZE,
+				  __PAGE_KERNEL_RO);
+
+	/*
+	 * .irqentry.text helps us identify code that runs before
+	 * we get a chance to call entering_irq().  This includes
+	 * the interrupt entry assembly plus the first C function
+	 * that gets called.  KAISER does not need the C code
+	 * mapped.  We just use the .irqentry.text section as-is
+	 * to avoid having to carve out a new section for the
+	 * assembly only.
+	 */
+	kaiser_add_user_map_ptrs_early(__irqentry_text_start,
+				       __irqentry_text_end,
+				       __PAGE_KERNEL_RX);
+
+	kaiser_init_finished = 1;
+	kaiser_check_idt_table(&idt_descr);
+	kaiser_check_idt_table(&debug_idt_descr);
+}
+
+extern void unmap_pud_range_nofree(p4d_t *p4d, unsigned long start,
+				   unsigned long end);
+int kaiser_add_mapping(unsigned long addr, unsigned long size,
+		       unsigned long flags)
+{
+	return kaiser_add_user_map((const void *)addr, size, flags);
+}
+
+void kaiser_remove_mapping(unsigned long start, unsigned long size)
+{
+	unsigned long end = start + size;
+	unsigned long addr;
+
+	for (addr = start; addr < end; addr += PGDIR_SIZE) {
+		pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(addr));
+		p4d_t *p4d = p4d_offset(pgd, addr);
+		/*
+		 * unmap_pud_range_nofree() handles unmaps larger than
+		 * P4D_SIZE, so there is no need to trim 'end'.
+		 */
+		unmap_pud_range_nofree(p4d, addr, end);
+	}
+}
+
+/*
+ * If we fail to map something, it is not fun to debug: the failure
+ * usually shows up as a crash in the entry code, and handling that
+ * crash jumps right back into the same entry code.  To catch these
+ * problems, allow the kaiser mappings to be checked before we
+ * actually need to use them.
+ */
+void kaiser_check_user_mapped(const void *addr)
+{
+	pte_t *pte;
+
+	/*
+	 * idt_setup_early_handler() might call in here before
+	 * kaiser_init() runs.  Ignore those calls.
+	 */
+	if (!kaiser_init_finished)
+		return;
+
+	pte = kaiser_pagetable_walk((unsigned long)addr, KAISER_WALK_NO_ALLOC);
+
+	if (pte && pte_present(*pte))
+		return;
+
+	WARN(1, "address is not mapped into user page tables: %pK\n", addr);
+}
+
+void kaiser_check_idt_table(const struct desc_ptr *idt_desc)
+{
+	struct gate_struct *hw_table_ptr;
+	int table_nr_entries;
+	int i;
+
+	/*
+	 * Note: "idt_desc" itself is not user-mapped.  The
+	 * descriptor's contents are loaded into a register and it
+	 * is not referenced after the 'lidt' instruction used to
+	 * load its contents.
+	 */
+
+	hw_table_ptr = (struct gate_struct *)idt_desc->address;
+	/*
+	 * The hardware 'size' field is the limit: the table size in
+	 * bytes, minus one.
+	 */
+	table_nr_entries = (idt_desc->size + 1) / sizeof(*hw_table_ptr);
+
+	for (i = 0; i < table_nr_entries; i++) {
+		struct gate_struct *hw_table_entry = &hw_table_ptr[i];
+		void *idt_handler_addr = (void *)gate_offset(hw_table_entry);
+
+		/* Make sure the table itself is mapped: */
+		kaiser_check_user_mapped(hw_table_entry);
+
+		/* Skip empty entries */
+		if (!idt_handler_addr)
+			continue;
+		/*
+		 * Entries for the early IDT handlers are left in
+		 * the IDT even after boot.  They are not used
+		 * and are not user-mapped, so skip the checks
+		 * for them.
+		 */
+		if (!test_bit(i, used_vectors))
+			continue;
+
+		kaiser_check_user_mapped(idt_handler_addr);
+	}
+}
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index dfb7d65..416642e 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -53,6 +53,7 @@
 #define CPA_FLUSHTLB 1
 #define CPA_ARRAY 2
 #define CPA_PAGES_ARRAY 4
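+/*
+ * When CPA_FREE_PAGETABLES is clear, unmap_*_range() will clear
+ * page table entries but leave the now-empty page table pages in
+ * place (see unmap_pud_range_nofree(), used for the KAISER shadow
+ * page tables, whose intermediate pages are never freed).
+ */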
+#define CPA_FREE_PAGETABLES 8
 
 #ifdef CONFIG_PROC_FS
 static unsigned long direct_pages_count[PG_LEVEL_NUM];
@@ -765,10 +766,13 @@
 	return 0;
 }
 
-static bool try_to_free_pte_page(pte_t *pte)
+static bool try_to_free_pte_page(struct cpa_data *cpa, pte_t *pte)
 {
 	int i;
 
+	if (!(cpa->flags & CPA_FREE_PAGETABLES))
+		return false;
+
 	for (i = 0; i < PTRS_PER_PTE; i++)
 		if (!pte_none(pte[i]))
 			return false;
@@ -777,10 +781,13 @@
 	return true;
 }
 
-static bool try_to_free_pmd_page(pmd_t *pmd)
+static bool try_to_free_pmd_page(struct cpa_data *cpa, pmd_t *pmd)
 {
 	int i;
 
+	if (!(cpa->flags & CPA_FREE_PAGETABLES))
+		return false;
+
 	for (i = 0; i < PTRS_PER_PMD; i++)
 		if (!pmd_none(pmd[i]))
 			return false;
@@ -789,7 +796,9 @@
 	return true;
 }
 
-static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end)
+static bool unmap_pte_range(struct cpa_data *cpa, pmd_t *pmd,
+			    unsigned long start,
+			    unsigned long end)
 {
 	pte_t *pte = pte_offset_kernel(pmd, start);
 
@@ -800,22 +809,23 @@
 		pte++;
 	}
 
-	if (try_to_free_pte_page((pte_t *)pmd_page_vaddr(*pmd))) {
+	if (try_to_free_pte_page(cpa, (pte_t *)pmd_page_vaddr(*pmd))) {
 		pmd_clear(pmd);
 		return true;
 	}
 	return false;
 }
 
-static void __unmap_pmd_range(pud_t *pud, pmd_t *pmd,
+static void __unmap_pmd_range(struct cpa_data *cpa, pud_t *pud, pmd_t *pmd,
 			      unsigned long start, unsigned long end)
 {
-	if (unmap_pte_range(pmd, start, end))
-		if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
+	if (unmap_pte_range(cpa, pmd, start, end))
+		if (try_to_free_pmd_page(cpa, (pmd_t *)pud_page_vaddr(*pud)))
 			pud_clear(pud);
 }
 
-static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
+static void unmap_pmd_range(struct cpa_data *cpa, pud_t *pud,
+			    unsigned long start, unsigned long end)
 {
 	pmd_t *pmd = pmd_offset(pud, start);
 
@@ -826,7 +836,7 @@
 		unsigned long next_page = (start + PMD_SIZE) & PMD_MASK;
 		unsigned long pre_end = min_t(unsigned long, end, next_page);
 
-		__unmap_pmd_range(pud, pmd, start, pre_end);
+		__unmap_pmd_range(cpa, pud, pmd, start, pre_end);
 
 		start = pre_end;
 		pmd++;
@@ -839,7 +849,8 @@
 		if (pmd_large(*pmd))
 			pmd_clear(pmd);
 		else
-			__unmap_pmd_range(pud, pmd, start, start + PMD_SIZE);
+			__unmap_pmd_range(cpa, pud, pmd,
+					  start, start + PMD_SIZE);
 
 		start += PMD_SIZE;
 		pmd++;
@@ -849,17 +860,19 @@
 	 * 4K leftovers?
 	 */
 	if (start < end)
-		return __unmap_pmd_range(pud, pmd, start, end);
+		return __unmap_pmd_range(cpa, pud, pmd, start, end);
 
 	/*
 	 * Try again to free the PMD page if haven't succeeded above.
 	 */
 	if (!pud_none(*pud))
-		if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
+		if (try_to_free_pmd_page(cpa, (pmd_t *)pud_page_vaddr(*pud)))
 			pud_clear(pud);
 }
 
-static void unmap_pud_range(p4d_t *p4d, unsigned long start, unsigned long end)
+static void __unmap_pud_range(struct cpa_data *cpa, p4d_t *p4d,
+			      unsigned long start,
+			      unsigned long end)
 {
 	pud_t *pud = pud_offset(p4d, start);
 
@@ -870,7 +883,7 @@
 		unsigned long next_page = (start + PUD_SIZE) & PUD_MASK;
 		unsigned long pre_end	= min_t(unsigned long, end, next_page);
 
-		unmap_pmd_range(pud, start, pre_end);
+		unmap_pmd_range(cpa, pud, start, pre_end);
 
 		start = pre_end;
 		pud++;
@@ -884,7 +897,7 @@
 		if (pud_large(*pud))
 			pud_clear(pud);
 		else
-			unmap_pmd_range(pud, start, start + PUD_SIZE);
+			unmap_pmd_range(cpa, pud, start, start + PUD_SIZE);
 
 		start += PUD_SIZE;
 		pud++;
@@ -894,7 +907,7 @@
 	 * 2M leftovers?
 	 */
 	if (start < end)
-		unmap_pmd_range(pud, start, end);
+		unmap_pmd_range(cpa, pud, start, end);
 
 	/*
 	 * No need to try to free the PUD page because we'll free it in
@@ -902,6 +915,24 @@
 	 */
 }
 
+void unmap_pud_range(p4d_t *p4d, unsigned long start, unsigned long end)
+{
+	struct cpa_data cpa = {
+		.flags = CPA_FREE_PAGETABLES,
+	};
+
+	__unmap_pud_range(&cpa, p4d, start, end);
+}
+
+void unmap_pud_range_nofree(p4d_t *p4d, unsigned long start, unsigned long end)
+{
+	struct cpa_data cpa = {
+		.flags = 0,
+	};
+
+	__unmap_pud_range(&cpa, p4d, start, end);
+}
+
 static int alloc_pte_page(pmd_t *pmd)
 {
 	pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
@@ -1149,8 +1180,7 @@
 		 * already found it, but remove any useless entries we just
 		 * added to it.
 		 */
-		unmap_pud_range(p4d, addr,
-				addr + (cpa->numpages << PAGE_SHIFT));
+		unmap_pud_range(p4d, addr, addr + (cpa->numpages << PAGE_SHIFT));
 		return ret;
 	}
 
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index b372f34..9bc6fc8 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -354,14 +354,69 @@
 		kmem_cache_free(pgd_cache, pgd);
 }
 #else
+
+#ifdef CONFIG_KAISER
+/*
+ * Instead of one pgd, we acquire two pgds.  Being order-1, it is
+ * both 8k in size and 8k-aligned.  That lets us just flip bit 12
+ * in a pointer to swap between the two 4k halves.
+ */
+#define PGD_ALLOCATION_ORDER 1
+#else
+#define PGD_ALLOCATION_ORDER 0
+#endif
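+
+/*
+ * Rough picture of the resulting layout (an 8k, 8k-aligned
+ * allocation when KAISER is enabled):
+ *
+ *	pgd + 0x0000: kernel-mode PGD
+ *	pgd + 0x1000: user-mode (shadow) PGD
+ *
+ * native_get_shadow_pgd() gets from one half to the other by
+ * flipping bit 12, as described above.
+ */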
+
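+/*
+ * "pgd" in ASCII with the allocation order in the low byte.  This
+ * is stashed in page->private by tag_pgd_with_size() so that
+ * pgd_check_tag() can spot PGD pages that did not come from
+ * _pgd_alloc().
+ */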
+const unsigned long pgd_magic = 0x70676400 | PGD_ALLOCATION_ORDER;
+
+void tag_pgd_with_size(pgd_t *pgd, unsigned long size)
+{
+	struct page *pgd_page = virt_to_page(pgd);
+
+	/*
+	 * Make sure the size the caller is tagging with matches
+	 * the size that we need the PGD to be.  This check is a
+	 * bit superfluous for the callers within this file but
+	 * is quite necessary for the callers from outside.
+	 */
+	WARN_ON_ONCE(size != PAGE_SIZE << PGD_ALLOCATION_ORDER);
+
+	pgd_page->private = pgd_magic;
+}
+
+void clear_pgd_tag(pgd_t *pgd)
+{
+	struct page *pgd_page = virt_to_page(pgd);
+	pgd_page->private = 0;
+}
+
 static inline pgd_t *_pgd_alloc(void)
 {
-	return (pgd_t *)__get_free_page(PGALLOC_GFP);
+	pgd_t *pgd = (pgd_t *)__get_free_pages(PGALLOC_GFP,
+					       PGD_ALLOCATION_ORDER);
+	if (pgd)
+		tag_pgd_with_size(pgd, PAGE_SIZE << PGD_ALLOCATION_ORDER);
+
+	return pgd;
 }
 
 static inline void _pgd_free(pgd_t *pgd)
 {
-	free_page((unsigned long)pgd);
+	clear_pgd_tag(pgd);
+	free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER);
+}
+
+void pgd_check_tag(pgd_t *pgd)
+{
+	struct page *pgd_page = virt_to_page(pgd);
+
+	if (pgd_page == virt_to_page(init_top_pgt))
+		return;
+
+	if (pgd_page->private == pgd_magic)
+		return;
+
+	if (WARN_ON_ONCE(1))
+		printk(KERN_WARNING "bad pgd page @ 0x%pK, magic: 0x%lx\n",
+				pgd, pgd_page->private);
 }
 #endif /* CONFIG_X86_PAE */
 
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 0f3d0ce..00ae052 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -28,6 +28,38 @@
  *	Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
  */
 
+/*
+ * We get here when we do something requiring a TLB invalidation
+ * but could not invalidate all of the contexts right away.  We do
+ * the necessary invalidation lazily by clearing each context's
+ * 'ctx_id', which forces a TLB flush the next time that context is
+ * loaded.
+ */
+void clear_non_loaded_ctxs(void)
+{
+	u16 asid;
+
+	/*
+	 * This is only expected to be set if we have disabled
+	 * kernel _PAGE_GLOBAL pages.
+	 */
+	if (IS_ENABLED(CONFIG_X86_GLOBAL_PAGES)) {
+		WARN_ON_ONCE(1);
+		return;
+	}
+
+	for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) {
+		/* Do not need to flush the current asid */
+		if (asid == this_cpu_read(cpu_tlbstate.loaded_mm_asid))
+			continue;
+		/*
+		 * Make sure the next time we go to switch to
+		 * this asid, we do a flush:
+		 */
+		this_cpu_write(cpu_tlbstate.ctxs[asid].ctx_id, 0);
+	}
+	this_cpu_write(cpu_tlbstate.all_other_ctxs_invalid, false);
+}
+
 atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1);
 
 
@@ -42,6 +74,9 @@
 		return;
 	}
 
+	if (this_cpu_read(cpu_tlbstate.all_other_ctxs_invalid))
+		clear_non_loaded_ctxs();
+
 	for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) {
 		if (this_cpu_read(cpu_tlbstate.ctxs[asid].ctx_id) !=
 		    next->context.ctx_id)
@@ -65,6 +100,68 @@
 	*need_flush = true;
 }
 
+/*
+ * Given a kernel asid, flush the corresponding KAISER
+ * user ASID.
+ */
+static void flush_user_asid(pgd_t *pgd, u16 kern_asid)
+{
+	/* There is no user ASID if KAISER is off */
+	if (!IS_ENABLED(CONFIG_KAISER))
+		return;
+	/*
+	 * We only have a single ASID if PCID is off and the CR3
+	 * write will have flushed it.
+	 */
+	if (!cpu_feature_enabled(X86_FEATURE_PCID))
+		return;
+	/*
+	 * With PCIDs enabled, write_cr3() only flushes TLB
+	 * entries for the current (kernel) ASID.  This leaves
+	 * old TLB entries for the user ASID in place and we must
+	 * flush that context separately.  We can theoretically
+	 * delay doing this until we actually load up the
+	 * userspace CR3, but do it here for simplicity.
+	 */
+	if (cpu_feature_enabled(X86_FEATURE_INVPCID)) {
+		invpcid_flush_single_context(user_asid(kern_asid));
+	} else {
+		/*
+		 * On systems with PCIDs, but no INVPCID, the only
+		 * way to flush a PCID is a CR3 write.  Note that
+		 * we use the kernel page tables with the *user*
+		 * ASID here.
+		 */
+		unsigned long user_asid_flush_cr3;
+		user_asid_flush_cr3 = build_cr3(pgd, user_asid(kern_asid));
+		write_cr3(user_asid_flush_cr3);
+		/*
+		 * We do not use PCIDs with KAISER unless we also
+		 * have INVPCID.  Getting here is unexpected.
+		 */
+		WARN_ON_ONCE(1);
+	}
+}
+
+static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, bool need_flush)
+{
+	unsigned long new_mm_cr3;
+
+	if (need_flush) {
+		flush_user_asid(pgdir, new_asid);
+		new_mm_cr3 = build_cr3(pgdir, new_asid);
+	} else {
+		new_mm_cr3 = build_cr3_noflush(pgdir, new_asid);
+	}
+
+	/*
+	 * Caution: many callers of this function expect
+	 * that load_cr3() is serializing and orders TLB
+	 * fills with respect to the mm_cpumask writes.
+	 */
+	write_cr3(new_mm_cr3);
+}
+
 void leave_mm(int cpu)
 {
 	struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
@@ -127,7 +224,7 @@
 	 * isn't free.
 	 */
 #ifdef CONFIG_DEBUG_VM
-	if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev, prev_asid))) {
+	if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev->pgd, prev_asid))) {
 		/*
 		 * If we were to BUG here, we'd be very likely to kill
 		 * the system so hard that we don't see the call trace.
@@ -194,12 +291,12 @@
 		if (need_flush) {
 			this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
 			this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
-			write_cr3(build_cr3(next, new_asid));
+			load_new_mm_cr3(next->pgd, new_asid, true);
 			trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH,
 					TLB_FLUSH_ALL);
 		} else {
 			/* The new ASID is already up to date. */
-			write_cr3(build_cr3_noflush(next, new_asid));
+			load_new_mm_cr3(next->pgd, new_asid, false);
 			trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, 0);
 		}
 
@@ -277,7 +374,7 @@
 		!(cr4_read_shadow() & X86_CR4_PCIDE));
 
 	/* Force ASID 0 and force a TLB flush. */
-	write_cr3(build_cr3(mm, 0));
+	write_cr3(build_cr3(mm->pgd, 0));
 
 	/* Reinitialize tlbstate. */
 	this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0);
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index 12e8388..aab0ec4 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -69,6 +69,35 @@
 	}
 }
 
+/*
+ * This is an unapologetic hack.  KAISER PGDs have the NX bit set
+ * on the entries mapping userspace.  See native_set_pgd() for an
+ * explanation.  Unfortunately, EFI runs in what are traditionally
+ * userspace addresses and setting NX on the PGDs will keep EFI
+ * calls from being able to run.
+ *
+ * We do not have a nice pgd_clear_flags() like we have for ptes,
+ * so just hack this in here rather than complicate set_pgd().
+ */
+void pgd_clear_nx(void *__pgd)
+{
+#ifdef CONFIG_KAISER
+	unsigned long *pgd = __pgd;
+	/*
+	 * The EFI code always passes init_mm to the pagetable
+	 * allocation functions, which should, via mm_pgtable_flags(),
+	 * set _PAGE_USER and in turn keep the KAISER code from
+	 * setting _PAGE_NX.
+	 *
+	 * This is probably superfluous, but make it safe and
+	 * warn if the mm_pgtable_flags() stuff did not work
+	 * out somehow.
+	 */
+	WARN_ON_ONCE(*pgd & _PAGE_NX);
+	*pgd &= ~_PAGE_NX;
+#endif
+}
+
 pgd_t * __init efi_call_phys_prolog(void)
 {
 	unsigned long vaddr, addr_pgd, addr_p4d, addr_pud;
@@ -105,6 +134,7 @@
 		save_pgd[pgd] = *pgd_efi;
 
 		p4d = p4d_alloc(&init_mm, pgd_efi, addr_pgd);
+		pgd_clear_nx(pgd_efi);
 		if (!p4d) {
 			pr_err("Failed to allocate p4d table!\n");
 			goto out;
@@ -115,6 +145,7 @@
 			p4d_efi = p4d + p4d_index(addr_p4d);
 
 			pud = pud_alloc(&init_mm, p4d_efi, addr_p4d);
+			pgd_clear_nx(p4d_efi);
 			if (!pud) {
 				pr_err("Failed to allocate pud table!\n");
 				goto out;
@@ -201,27 +232,45 @@
 	p4d_t *p4d;
 	pud_t *pud;
 	gfp_t gfp_mask;
+	int pgd_order = 0;
+
+	/*
+	 * set_pgd() assumes that there is a shadow PGD for all
+	 * pgds.  Without one, it goes off writing on whatever
+	 * page happened to be after the PGD.  We can either use
+	 * something other than set_pgd(), or just add a shadow
+	 * page.  Just do the shadow page.  We should never
+	 * actually use this shadow PGD, though.
+	 */
+	if (IS_ENABLED(CONFIG_KAISER))
+		pgd_order = 1;
 
 	if (efi_enabled(EFI_OLD_MEMMAP))
 		return 0;
 
 	gfp_mask = GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO;
-	efi_pgd = (pgd_t *)__get_free_page(gfp_mask);
+	efi_pgd = (pgd_t *)__get_free_pages(gfp_mask, pgd_order);
 	if (!efi_pgd)
 		return -ENOMEM;
+	tag_pgd_with_size(efi_pgd, PAGE_SIZE << pgd_order);
 
 	pgd = efi_pgd + pgd_index(EFI_VA_END);
 	p4d = p4d_alloc(&init_mm, pgd, EFI_VA_END);
+	/* PGDs "appear" at the P4D level when PGTABLE_LEVELS=4 */
+	pgd_clear_nx(pgd);
 	if (!p4d) {
-		free_page((unsigned long)efi_pgd);
+		clear_pgd_tag(efi_pgd);
+		free_pages((unsigned long)efi_pgd, pgd_order);
 		return -ENOMEM;
 	}
 
 	pud = pud_alloc(&init_mm, p4d, EFI_VA_END);
+	pgd_clear_nx(pud);
 	if (!pud) {
 		if (CONFIG_PGTABLE_LEVELS > 4)
 			free_page((unsigned long) pgd_page_vaddr(*pgd));
-		free_page((unsigned long)efi_pgd);
+		clear_pgd_tag(efi_pgd);
+		free_pages((unsigned long)efi_pgd, pgd_order);
 		return -ENOMEM;
 	}
 
@@ -233,6 +282,7 @@
  */
 void efi_sync_low_kernel_mappings(void)
 {
+	int i;
 	unsigned num_entries;
 	pgd_t *pgd_k, *pgd_efi;
 	p4d_t *p4d_k, *p4d_efi;
@@ -257,7 +307,12 @@
 	pgd_k = pgd_offset_k(PAGE_OFFSET);
 
 	num_entries = pgd_index(EFI_VA_END) - pgd_index(PAGE_OFFSET);
-	memcpy(pgd_efi, pgd_k, sizeof(pgd_t) * num_entries);
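+	/*
+	 * With KAISER, the user/shadow PGD occupies the second 4k of
+	 * the order-1 PGD allocation, PTRS_PER_PGD entries past the
+	 * kernel PGD, so keep those entries in sync as well.
+	 */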
+	for (i = 0; i < num_entries; i++) {
+		pgd_efi[i] = pgd_k[i];
+		pgd_clear_nx(&pgd_efi[i]);
+		if (IS_ENABLED(CONFIG_KAISER))
+			pgd_efi[i + PTRS_PER_PGD] = pgd_k[i];
+	}
 
 	/*
 	 * As with PGDs, we share all P4D entries apart from the one entry
@@ -272,7 +327,12 @@
 	p4d_k = p4d_offset(pgd_k, 0);
 
 	num_entries = p4d_index(EFI_VA_END);
-	memcpy(p4d_efi, p4d_k, sizeof(p4d_t) * num_entries);
+	for (i = 0; i < num_entries; i++) {
+		p4d_efi[i] = p4d_k[i];
+		pgd_clear_nx(&p4d_efi[i]);
+		if (IS_ENABLED(CONFIG_KAISER))
+			p4d_efi[i + PTRS_PER_PGD] = p4d_k[i];
+	}
 
 	/*
 	 * We share all the PUD entries apart from those that map the
@@ -405,6 +465,9 @@
 
 static void __init __map_region(efi_memory_desc_t *md, u64 va)
 {
+	int pgd_nr;
+	int start_pgd_nr;
+	int end_pgd_nr;
 	unsigned long flags = _PAGE_RW;
 	unsigned long pfn;
 	pgd_t *pgd = efi_pgd;
@@ -416,6 +479,12 @@
 	if (kernel_map_pages_in_pgd(pgd, pfn, va, md->num_pages, flags))
 		pr_warn("Error mapping PA 0x%llx -> VA 0x%llx!\n",
 			   md->phys_addr, va);
+
+	start_pgd_nr = pgd_index(va);
+	end_pgd_nr = pgd_index(va + md->num_pages * PAGE_SIZE);
+	for (pgd_nr = start_pgd_nr; pgd_nr <= end_pgd_nr; pgd_nr++)
+		pgd_clear_nx(&efi_pgd[pgd_nr]);
+
 }
 
 void __init efi_map_region(efi_memory_desc_t *md)
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 8acfc1e..d7c6acb 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -59,6 +59,12 @@
 /* Align . to a 8 byte boundary equals to maximum function alignment. */
 #define ALIGN_FUNCTION()  . = ALIGN(8)
 
+#ifdef CONFIG_KAISER
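+/*
+ * KAISER maps the entry/irqentry text into the user page tables at
+ * page granularity.  Padding those sections out to page boundaries
+ * keeps neighboring kernel text from being pulled into the user
+ * mapping along with them.
+ */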
+#define ALIGN_KAISER()	. = ALIGN(PAGE_SIZE);
+#else
+#define ALIGN_KAISER()
+#endif
+
 /*
  * LD_DEAD_CODE_DATA_ELIMINATION option enables -fdata-sections, which
  * generates .data.identifier sections, which need to be pulled in with
@@ -493,15 +499,19 @@
 		VMLINUX_SYMBOL(__kprobes_text_end) = .;
 
 #define ENTRY_TEXT							\
+		ALIGN_KAISER();						\
 		ALIGN_FUNCTION();					\
 		VMLINUX_SYMBOL(__entry_text_start) = .;			\
 		*(.entry.text)						\
+		ALIGN_KAISER();						\
 		VMLINUX_SYMBOL(__entry_text_end) = .;
 
 #define IRQENTRY_TEXT							\
+		ALIGN_KAISER();						\
 		ALIGN_FUNCTION();					\
 		VMLINUX_SYMBOL(__irqentry_text_start) = .;		\
 		*(.irqentry.text)					\
+		ALIGN_KAISER();						\
 		VMLINUX_SYMBOL(__irqentry_text_end) = .;
 
 #define SOFTIRQENTRY_TEXT						\
@@ -807,7 +817,16 @@
  */
 #define PERCPU_INPUT(cacheline)						\
 	VMLINUX_SYMBOL(__per_cpu_start) = .;				\
-	*(.data..percpu..first)						\
+	\
+	VMLINUX_SYMBOL(__per_cpu_user_mapped_start) = .;		\
+	*(.data..percpu..first)						\
+	. = ALIGN(cacheline);						\
+	*(.data..percpu..user_mapped)					\
+	*(.data..percpu..user_mapped..shared_aligned)			\
+	. = ALIGN(PAGE_SIZE);						\
+	*(.data..percpu..user_mapped..page_aligned)			\
+	VMLINUX_SYMBOL(__per_cpu_user_mapped_end) = .;			\
+	\
 	. = ALIGN(PAGE_SIZE);						\
 	*(.data..percpu..page_aligned)					\
 	. = ALIGN(cacheline);						\
diff --git a/include/linux/kaiser.h b/include/linux/kaiser.h
new file mode 100644
index 0000000..1c3a7d5
--- /dev/null
+++ b/include/linux/kaiser.h
@@ -0,0 +1,31 @@
+#ifndef _INCLUDE_KAISER_H
+#define _INCLUDE_KAISER_H
+
+#ifdef CONFIG_KAISER
+#include <asm/kaiser.h>
+#else
+
+/*
+ * These stubs are used whenever CONFIG_KAISER is off, which
+ * includes architectures that support KAISER, but have it
+ * disabled.
+ */
+
+static inline void kaiser_init(void)
+{
+}
+static inline void kaiser_remove_mapping(unsigned long start, unsigned long size)
+{
+}
+
+static inline int kaiser_add_mapping(unsigned long addr, unsigned long size,
+				     unsigned long flags)
+{
+	return 0;
+}
+
+static inline void kaiser_check_user_mapped(const void *addr)
+{
+}
+#endif /* !CONFIG_KAISER */
+#endif /* _INCLUDE_KAISER_H */
diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h
index 8f16299..8ea945f 100644
--- a/include/linux/percpu-defs.h
+++ b/include/linux/percpu-defs.h
@@ -35,6 +35,12 @@
 
 #endif
 
+#ifdef CONFIG_KAISER
+#define USER_MAPPED_SECTION "..user_mapped"
+#else
+#define USER_MAPPED_SECTION ""
+#endif
+
 /*
  * Base implementations of per-CPU variable declarations and definitions, where
  * the section in which the variable is to be placed is provided by the
@@ -115,6 +121,12 @@
 #define DEFINE_PER_CPU(type, name)					\
 	DEFINE_PER_CPU_SECTION(type, name, "")
 
+#define DECLARE_PER_CPU_USER_MAPPED(type, name)         \
+	DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION)
+
+#define DEFINE_PER_CPU_USER_MAPPED(type, name)          \
+	DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION)
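+
+/*
+ * Hypothetical usage:
+ *
+ *	DEFINE_PER_CPU_USER_MAPPED(unsigned long, kaiser_scratch);
+ *
+ * places the variable in .data..percpu..user_mapped so that it falls
+ * inside the __per_cpu_user_mapped_start/_end region that
+ * kaiser_init() maps into the user (shadow) page tables.
+ */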
+
 /*
  * Declaration/definition used for per-CPU variables that must come first in
  * the set of variables.
@@ -144,6 +156,14 @@
 	DEFINE_PER_CPU_SECTION(type, name, PER_CPU_SHARED_ALIGNED_SECTION) \
 	____cacheline_aligned_in_smp
 
+#define DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name)			\
+	DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION PER_CPU_SHARED_ALIGNED_SECTION) \
+	____cacheline_aligned_in_smp
+
+#define DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name)			\
+	DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION PER_CPU_SHARED_ALIGNED_SECTION) \
+	____cacheline_aligned_in_smp
+
 #define DECLARE_PER_CPU_ALIGNED(type, name)				\
 	DECLARE_PER_CPU_SECTION(type, name, PER_CPU_ALIGNED_SECTION)	\
 	____cacheline_aligned
@@ -162,6 +182,16 @@
 #define DEFINE_PER_CPU_PAGE_ALIGNED(type, name)				\
 	DEFINE_PER_CPU_SECTION(type, name, "..page_aligned")		\
 	__aligned(PAGE_SIZE)
+
+/*
+ * Declaration/definition used for per-CPU variables that must be page
+ * aligned and need to be mapped in user mode.
+ */
+#define DECLARE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name)		\
+	DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION "..page_aligned") \
+	__aligned(PAGE_SIZE)
+
+#define DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name)		\
+	DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION "..page_aligned") \
+	__aligned(PAGE_SIZE)
 
 /*
  * Declaration/definition used for per-CPU variables that must be read mostly.
diff --git a/init/main.c b/init/main.c
index 0ee9c686..83cfd69 100644
--- a/init/main.c
+++ b/init/main.c
@@ -75,6 +75,7 @@
 #include <linux/slab.h>
 #include <linux/perf_event.h>
 #include <linux/ptrace.h>
+#include <linux/kaiser.h>
 #include <linux/blkdev.h>
 #include <linux/elevator.h>
 #include <linux/sched_clock.h>
@@ -504,6 +505,7 @@
 	pgtable_init();
 	vmalloc_init();
 	ioremap_huge_init();
+	kaiser_init();
 }
 
 asmlinkage __visible void __init start_kernel(void)
diff --git a/kernel/fork.c b/kernel/fork.c
index 07cc743..8a17e62 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -70,6 +70,7 @@
 #include <linux/tsacct_kern.h>
 #include <linux/cn_proc.h>
 #include <linux/freezer.h>
+#include <linux/kaiser.h>
 #include <linux/delayacct.h>
 #include <linux/taskstats_kern.h>
 #include <linux/random.h>
@@ -247,6 +248,8 @@
 
 static inline void free_thread_stack(struct task_struct *tsk)
 {
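+	/*
+	 * Tear down the user (shadow) page table mapping of this
+	 * stack before the pages are freed or cached for reuse.
+	 */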
+	kaiser_remove_mapping((unsigned long)tsk->stack, THREAD_SIZE);
+
 #ifdef CONFIG_VMAP_STACK
 	if (task_stack_vm_area(tsk)) {
 		int i;
@@ -536,6 +539,10 @@
 	 * functions again.
 	 */
 	tsk->stack = stack;
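+	/*
+	 * Map the new stack into the user (shadow) page tables.  The
+	 * hardware can push an exception frame onto this stack before
+	 * the entry code has switched to the kernel page tables, so
+	 * the stack must be visible there.
+	 */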
+	err = kaiser_add_mapping((unsigned long)tsk->stack, THREAD_SIZE,
+				 __PAGE_KERNEL);
+	if (err)
+		goto free_stack;
 #ifdef CONFIG_VMAP_STACK
 	tsk->stack_vm_area = stack_vm_area;
 #endif
diff --git a/mm/gup.c b/mm/gup.c
index b2b4d42..1a148a6 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -36,6 +36,20 @@
 	return NULL;
 }
 
+enum pgtable_levels {
+	PGD_LEVEL = 5,
+	P4D_LEVEL = 4,
+	PUD_LEVEL = 3,
+	PMD_LEVEL = 2,
+	PTE_LEVEL = 1
+};
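+
+/*
+ * A "bad" page table entry indicates corruption rather than a merely
+ * absent mapping, so warn loudly instead of failing silently the way
+ * no_page_table() does.
+ */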
+/* noinline so the call site is visible in backtrace */
+static noinline struct page *bad_page_table(enum pgtable_levels level)
+{
+	WARN_ONCE(1, "bad page table at level %d\n", level);
+	return NULL;
+}
+
 static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address,
 		pte_t *pte, unsigned int flags)
 {
@@ -81,7 +95,7 @@
 
 retry:
 	if (unlikely(pmd_bad(*pmd)))
-		return no_page_table(vma, flags);
+		return bad_page_table(PMD_LEVEL);
 
 	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
 	pte = *ptep;
@@ -334,7 +348,7 @@
 			return page;
 	}
 	if (unlikely(pud_bad(*pud)))
-		return no_page_table(vma, flags);
+		return bad_page_table(PUD_LEVEL);
 
 	return follow_pmd_mask(vma, address, pud, flags, page_mask);
 }
@@ -352,7 +366,7 @@
 		return no_page_table(vma, flags);
 	BUILD_BUG_ON(p4d_huge(*p4d));
 	if (unlikely(p4d_bad(*p4d)))
-		return no_page_table(vma, flags);
+		return bad_page_table(P4D_LEVEL);
 
 	if (is_hugepd(__hugepd(p4d_val(*p4d)))) {
 		page = follow_huge_pd(vma, address,
@@ -397,8 +411,10 @@
 
 	pgd = pgd_offset(mm, address);
 
-	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
+	if (pgd_none(*pgd))
 		return no_page_table(vma, flags);
+	if (unlikely(pgd_bad(*pgd)))
+		return bad_page_table(PGD_LEVEL);
 
 	if (pgd_huge(*pgd)) {
 		page = follow_huge_pgd(mm, address, pgd, flags);
diff --git a/security/Kconfig b/security/Kconfig
index e8e4494..0033d81 100644
--- a/security/Kconfig
+++ b/security/Kconfig
@@ -54,6 +54,16 @@
 	  implement socket and networking access controls.
 	  If you are unsure how to answer this question, answer N.
 
+config KAISER
+	bool "Remove the kernel mapping in user mode"
+	depends on X86_64 && SMP && !PARAVIRT
+	help
+	  This feature reduces the number of hardware side channels by
+	  ensuring that the majority of kernel addresses are not mapped
+	  into userspace.
+
+	  See Documentation/x86/kaiser.txt for more details.
+
 config SECURITY_INFINIBAND
 	bool "Infiniband Security Hooks"
 	depends on SECURITY && INFINIBAND
@@ -75,7 +85,6 @@
 	  to communicate unlabelled data can send without using
 	  IPSec.
 	  If you are unsure how to answer this question, answer N.
-
 config SECURITY_PATH
 	bool "Security hooks for pathname based access control"
 	depends on SECURITY