| From foo@baz Wed Jan 3 18:58:12 CET 2018 |
| From: Dave Hansen <dave.hansen@linux.intel.com> |
| Date: Wed, 30 Aug 2017 16:23:00 -0700 |
| Subject: kaiser: merged update |
| |
| From: Dave Hansen <dave.hansen@linux.intel.com> |
| |
| |
| Merged fixes and cleanups, rebased to 4.4.89 tree (no 5-level paging). |
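| |
| The central trick throughout this series: the kernel keeps every PGD 8k |
| long and 8k-aligned, with the low 4k holding the kernel copy and the high |
| 4k the shadow (user) copy, so moving between the two halves is just |
| toggling bit 12 of the PGD pointer (and, with CONFIG_KAISER_REAL_SWITCH, |
| of CR3). A minimal, userspace-compilable C sketch of that pointer |
| arithmetic; illustrative only, the helper names here are not from the |
| kernel tree, the real helpers are native_get_shadow_pgd() and |
| native_get_normal_pgd() in the diff below: |
| |
| #include <assert.h> |
| #include <stdint.h> |
| |
| #define PAGE_SIZE 4096UL |
| |
| /* Illustrative sketch only; mirrors the bit-12 flip described above. */ |
| static inline uintptr_t user_pgd(uintptr_t pgd) { return pgd | PAGE_SIZE; } |
| static inline uintptr_t kern_pgd(uintptr_t pgd) { return pgd & ~PAGE_SIZE; } |
| |
| int main(void) |
| { |
| uintptr_t pgd = 0x12344000UL; /* a pretend 8k-aligned kernel PGD */ |
| |
| assert(user_pgd(pgd) == pgd + PAGE_SIZE); |
| assert(kern_pgd(user_pgd(pgd)) == pgd); |
| return 0; |
| } |
| |
| This is also why _pgd_alloc() below moves to an order-1 allocation and |
| why the shadow page-table walk in kaiser.c starts from |
| native_get_shadow_pgd(). |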
| |
| Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com> |
| Signed-off-by: Hugh Dickins <hughd@google.com> |
| Acked-by: Jiri Kosina <jkosina@suse.cz> |
| Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> |
| --- |
| arch/x86/entry/entry_64.S | 106 ++++++++++- |
| arch/x86/include/asm/kaiser.h | 43 ++-- |
| arch/x86/include/asm/pgtable.h | 18 + |
| arch/x86/include/asm/pgtable_64.h | 48 ++++- |
| arch/x86/include/asm/pgtable_types.h | 6 |
| arch/x86/kernel/espfix_64.c | 13 - |
| arch/x86/kernel/head_64.S | 19 +- |
| arch/x86/kernel/ldt.c | 27 ++ |
| arch/x86/kernel/tracepoint.c | 2 |
| arch/x86/mm/kaiser.c | 318 +++++++++++++++++++++++++---------- |
| arch/x86/mm/pageattr.c | 63 +++++- |
| arch/x86/mm/pgtable.c | 40 +--- |
| include/linux/kaiser.h | 26 ++ |
| kernel/fork.c | 9 |
| security/Kconfig | 5 |
| 15 files changed, 553 insertions(+), 190 deletions(-) |
| create mode 100644 include/linux/kaiser.h |
| |
| --- a/arch/x86/entry/entry_64.S |
| +++ b/arch/x86/entry/entry_64.S |
| @@ -212,6 +212,13 @@ entry_SYSCALL_64_fastpath: |
| movq RIP(%rsp), %rcx |
| movq EFLAGS(%rsp), %r11 |
| RESTORE_C_REGS_EXCEPT_RCX_R11 |
| + /* |
| + * This opens a window where we have a user CR3, but are |
| + * running in the kernel. This makes using the CS |
| + * register useless for telling whether or not we need to |
| + * switch CR3 in NMIs. Normal interrupts are OK because |
| + * they are off here. |
| + */ |
| SWITCH_USER_CR3 |
| movq RSP(%rsp), %rsp |
| /* |
| @@ -350,11 +357,25 @@ GLOBAL(int_ret_from_sys_call) |
| syscall_return_via_sysret: |
| /* rcx and r11 are already restored (see code above) */ |
| RESTORE_C_REGS_EXCEPT_RCX_R11 |
| + /* |
| + * This opens a window where we have a user CR3, but are |
| + * running in the kernel. This makes using the CS |
| + * register useless for telling whether or not we need to |
| + * switch CR3 in NMIs. Normal interrupts are OK because |
| + * they are off here. |
| + */ |
| SWITCH_USER_CR3 |
| movq RSP(%rsp), %rsp |
| USERGS_SYSRET64 |
| |
| opportunistic_sysret_failed: |
| + /* |
| + * This opens a window where we have a user CR3, but are |
| + * running in the kernel. This makes using the CS |
| + * register useless for telling whether or not we need to |
| + * switch CR3 in NMIs. Normal interrupts are OK because |
| + * they are off here. |
| + */ |
| SWITCH_USER_CR3 |
| SWAPGS |
| jmp restore_c_regs_and_iret |
| @@ -1059,6 +1080,13 @@ ENTRY(error_entry) |
| cld |
| SAVE_C_REGS 8 |
| SAVE_EXTRA_REGS 8 |
| + /* |
| + * error_entry() always returns with a kernel gsbase and |
| + * CR3. We must also have a kernel CR3/gsbase before |
| + * calling TRACE_IRQS_*. Just unconditionally switch to |
| + * the kernel CR3 here. |
| + */ |
| + SWITCH_KERNEL_CR3 |
| xorl %ebx, %ebx |
| testb $3, CS+8(%rsp) |
| jz .Lerror_kernelspace |
| @@ -1069,7 +1097,6 @@ ENTRY(error_entry) |
| * from user mode due to an IRET fault. |
| */ |
| SWAPGS |
| - SWITCH_KERNEL_CR3 |
| |
| .Lerror_entry_from_usermode_after_swapgs: |
| /* |
| @@ -1122,7 +1149,7 @@ ENTRY(error_entry) |
| * Switch to kernel gsbase: |
| */ |
| SWAPGS |
| - SWITCH_KERNEL_CR3 |
| + |
| /* |
| * Pretend that the exception came from user mode: set up pt_regs |
| * as if we faulted immediately after IRET and clear EBX so that |
| @@ -1222,7 +1249,10 @@ ENTRY(nmi) |
| */ |
| |
| SWAPGS_UNSAFE_STACK |
| - SWITCH_KERNEL_CR3_NO_STACK |
| + /* |
| + * percpu variables are mapped with user CR3, so no need |
| + * to switch CR3 here. |
| + */ |
| cld |
| movq %rsp, %rdx |
| movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp |
| @@ -1256,14 +1286,33 @@ ENTRY(nmi) |
| |
| movq %rsp, %rdi |
| movq $-1, %rsi |
| +#ifdef CONFIG_KAISER |
| + /* Unconditionally use kernel CR3 for do_nmi() */ |
| + /* %rax is saved above, so OK to clobber here */ |
| + movq %cr3, %rax |
| + pushq %rax |
| +#ifdef CONFIG_KAISER_REAL_SWITCH |
| + andq $(~0x1000), %rax |
| +#endif |
| + movq %rax, %cr3 |
| +#endif |
| call do_nmi |
| + /* |
| + * Unconditionally restore CR3. I know we return to |
| + * kernel code that needs user CR3, but do we ever return |
| + * to "user mode" where we need the kernel CR3? |
| + */ |
| +#ifdef CONFIG_KAISER |
| + popq %rax |
| + mov %rax, %cr3 |
| +#endif |
| |
| /* |
| * Return back to user mode. We must *not* do the normal exit |
| - * work, because we don't want to enable interrupts. Fortunately, |
| - * do_nmi doesn't modify pt_regs. |
| + * work, because we don't want to enable interrupts. Do not |
| + * switch to user CR3: we might be going back to kernel code |
| + * that had a user CR3 set. |
| */ |
| - SWITCH_USER_CR3 |
| SWAPGS |
| jmp restore_c_regs_and_iret |
| |
| @@ -1459,23 +1508,54 @@ end_repeat_nmi: |
| ALLOC_PT_GPREGS_ON_STACK |
| |
| /* |
| - * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit |
| - * as we should not be calling schedule in NMI context. |
| - * Even with normal interrupts enabled. An NMI should not be |
| - * setting NEED_RESCHED or anything that normal interrupts and |
| - * exceptions might do. |
| + * Use the same approach as paranoid_entry to handle SWAPGS, but |
| + * without CR3 handling since we do that differently in NMIs. No |
| + * need to use paranoid_exit as we should not be calling schedule |
| + * in NMI context. Even with normal interrupts enabled. An NMI |
| + * should not be setting NEED_RESCHED or anything that normal |
| + * interrupts and exceptions might do. |
| */ |
| - call paranoid_entry |
| + cld |
| + SAVE_C_REGS |
| + SAVE_EXTRA_REGS |
| + movl $1, %ebx |
| + movl $MSR_GS_BASE, %ecx |
| + rdmsr |
| + testl %edx, %edx |
| + js 1f /* negative -> in kernel */ |
| + SWAPGS |
| + xorl %ebx, %ebx |
| +1: |
| +#ifdef CONFIG_KAISER |
| + /* Unconditionally use kernel CR3 for do_nmi() */ |
| + /* %rax is saved above, so OK to clobber here */ |
| + movq %cr3, %rax |
| + pushq %rax |
| +#ifdef CONFIG_KAISER_REAL_SWITCH |
| + andq $(~0x1000), %rax |
| +#endif |
| + movq %rax, %cr3 |
| +#endif |
| |
| /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ |
| movq %rsp, %rdi |
| + addq $8, %rdi /* point %rdi at ptregs, fixed up for CR3 */ |
| movq $-1, %rsi |
| call do_nmi |
| + /* |
| + * Unconditionally restore CR3. We might be returning to |
| + * kernel code that needs user CR3, like just before |
| + * a sysret. |
| + */ |
| +#ifdef CONFIG_KAISER |
| + popq %rax |
| + mov %rax, %cr3 |
| +#endif |
| |
| testl %ebx, %ebx /* swapgs needed? */ |
| jnz nmi_restore |
| nmi_swapgs: |
| - SWITCH_USER_CR3_NO_STACK |
| + /* We fixed up CR3 above, so no need to switch it here */ |
| SWAPGS_UNSAFE_STACK |
| nmi_restore: |
| RESTORE_EXTRA_REGS |
| --- a/arch/x86/include/asm/kaiser.h |
| +++ b/arch/x86/include/asm/kaiser.h |
| @@ -16,13 +16,17 @@ |
| |
| .macro _SWITCH_TO_KERNEL_CR3 reg |
| movq %cr3, \reg |
| +#ifdef CONFIG_KAISER_REAL_SWITCH |
| andq $(~0x1000), \reg |
| +#endif |
| movq \reg, %cr3 |
| .endm |
| |
| .macro _SWITCH_TO_USER_CR3 reg |
| movq %cr3, \reg |
| +#ifdef CONFIG_KAISER_REAL_SWITCH |
| orq $(0x1000), \reg |
| +#endif |
| movq \reg, %cr3 |
| .endm |
| |
| @@ -65,48 +69,53 @@ movq PER_CPU_VAR(unsafe_stack_register_b |
| .endm |
| |
| #endif /* CONFIG_KAISER */ |
| + |
| #else /* __ASSEMBLY__ */ |
| |
| |
| #ifdef CONFIG_KAISER |
| -// Upon kernel/user mode switch, it may happen that |
| -// the address space has to be switched before the registers have been stored. |
| -// To change the address space, another register is needed. |
| -// A register therefore has to be stored/restored. |
| -// |
| -DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); |
| +/* |
| + * Upon kernel/user mode switch, it may happen that the address |
| + * space has to be switched before the registers have been |
| + * stored. To change the address space, another register is |
| + * needed. A register therefore has to be stored/restored. |
| + */ |
| |
| -#endif /* CONFIG_KAISER */ |
| +DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); |
| |
| /** |
| - * shadowmem_add_mapping - map a virtual memory part to the shadow mapping |
| + * kaiser_add_mapping - map a virtual memory part to the shadow (user) mapping |
| * @addr: the start address of the range |
| * @size: the size of the range |
| * @flags: The mapping flags of the pages |
| * |
| - * the mapping is done on a global scope, so no bigger synchronization has to be done. |
| - * the pages have to be manually unmapped again when they are not needed any longer. |
| + * The mapping is done at a global scope, so no further |
| + * synchronization is needed. The pages have to be |
| + * manually unmapped again when they are no longer needed. |
| */ |
| -extern void kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags); |
| +extern int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags); |
| |
| |
| /** |
| - * shadowmem_remove_mapping - unmap a virtual memory part of the shadow mapping |
| + * kaiser_remove_mapping - unmap a virtual memory part of the shadow mapping |
| * @addr: the start address of the range |
| * @size: the size of the range |
| */ |
| extern void kaiser_remove_mapping(unsigned long start, unsigned long size); |
| |
| /** |
| - * shadowmem_initialize_mapping - Initalize the shadow mapping |
| + * kaiser_init - Initialize the shadow mapping |
| * |
| - * most parts of the shadow mapping can be mapped upon boot time. |
| - * only the thread stacks have to be mapped on runtime. |
| - * the mapped regions are not unmapped at all. |
| + * Most parts of the shadow mapping can be mapped at boot |
| + * time. Only per-process things like the thread stacks |
| + * or a new LDT have to be mapped at runtime. These boot- |
| + * time mappings are permanent and never unmapped. |
| */ |
| extern void kaiser_init(void); |
| |
| -#endif |
| +#endif /* CONFIG_KAISER */ |
| + |
| +#endif /* __ASSEMBLY__ */ |
| |
| |
| |
| --- a/arch/x86/include/asm/pgtable.h |
| +++ b/arch/x86/include/asm/pgtable.h |
| @@ -653,7 +653,17 @@ static inline pud_t *pud_offset(pgd_t *p |
| |
| static inline int pgd_bad(pgd_t pgd) |
| { |
| - return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE; |
| + pgdval_t ignore_flags = _PAGE_USER; |
| + /* |
| + * We set NX on KAISER pgds that map userspace memory so |
| + * that userspace can not meaningfully use the kernel |
| + * page table by accident; it will fault on the first |
| + * instruction it tries to run. See native_set_pgd(). |
| + */ |
| + if (IS_ENABLED(CONFIG_KAISER)) |
| + ignore_flags |= _PAGE_NX; |
| + |
| + return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE; |
| } |
| |
| static inline int pgd_none(pgd_t pgd) |
| @@ -857,8 +867,10 @@ static inline void clone_pgd_range(pgd_t |
| { |
| memcpy(dst, src, count * sizeof(pgd_t)); |
| #ifdef CONFIG_KAISER |
| - // clone the shadow pgd part as well |
| - memcpy(native_get_shadow_pgd(dst), native_get_shadow_pgd(src), count * sizeof(pgd_t)); |
| + /* Clone the shadow pgd part as well */ |
| + memcpy(native_get_shadow_pgd(dst), |
| + native_get_shadow_pgd(src), |
| + count * sizeof(pgd_t)); |
| #endif |
| } |
| |
| --- a/arch/x86/include/asm/pgtable_64.h |
| +++ b/arch/x86/include/asm/pgtable_64.h |
| @@ -107,26 +107,58 @@ static inline void native_pud_clear(pud_ |
| } |
| |
| #ifdef CONFIG_KAISER |
| -static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp) { |
| +static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp) |
| +{ |
| return (pgd_t *)(void*)((unsigned long)(void*)pgdp | (unsigned long)PAGE_SIZE); |
| } |
| |
| -static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp) { |
| +static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp) |
| +{ |
| return (pgd_t *)(void*)((unsigned long)(void*)pgdp & ~(unsigned long)PAGE_SIZE); |
| } |
| +#else |
| +static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp) |
| +{ |
| + BUILD_BUG_ON(1); |
| + return NULL; |
| +} |
| +static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp) |
| +{ |
| + return pgdp; |
| +} |
| #endif /* CONFIG_KAISER */ |
| |
| +/* |
| + * Page table pages are page-aligned. The lower half of the top |
| + * level is used for userspace and the top half for the kernel. |
| + * This returns true for user pages that need to get copied into |
| + * both the user and kernel copies of the page tables, and false |
| + * for kernel pages that should only be in the kernel copy. |
| + */ |
| +static inline bool is_userspace_pgd(void *__ptr) |
| +{ |
| + unsigned long ptr = (unsigned long)__ptr; |
| + |
| + return ((ptr % PAGE_SIZE) < (PAGE_SIZE / 2)); |
| +} |
| + |
| static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd) |
| { |
| #ifdef CONFIG_KAISER |
| - // We know that a pgd is page aligned. |
| - // Therefore the lower indices have to be mapped to user space. |
| - // These pages are mapped to the shadow mapping. |
| - if ((((unsigned long)pgdp) % PAGE_SIZE) < (PAGE_SIZE / 2)) { |
| + pteval_t extra_kern_pgd_flags = 0; |
| + /* Do we need to also populate the shadow pgd? */ |
| + if (is_userspace_pgd(pgdp)) { |
| native_get_shadow_pgd(pgdp)->pgd = pgd.pgd; |
| + /* |
| + * Even if the entry is *mapping* userspace, ensure |
| + * that userspace can not use it. This way, if we |
| + * get out to userspace running on the kernel CR3, |
| + * userspace will crash instead of running. |
| + */ |
| + extra_kern_pgd_flags = _PAGE_NX; |
| } |
| - |
| - pgdp->pgd = pgd.pgd & ~_PAGE_USER; |
| + pgdp->pgd = pgd.pgd; |
| + pgdp->pgd |= extra_kern_pgd_flags; |
| #else /* CONFIG_KAISER */ |
| *pgdp = pgd; |
| #endif |
| --- a/arch/x86/include/asm/pgtable_types.h |
| +++ b/arch/x86/include/asm/pgtable_types.h |
| @@ -42,7 +42,7 @@ |
| #ifdef CONFIG_KAISER |
| #define _PAGE_GLOBAL (_AT(pteval_t, 0)) |
| #else |
| -#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) |
| +#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) |
| #endif |
| #define _PAGE_SOFTW1 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1) |
| #define _PAGE_SOFTW2 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW2) |
| @@ -93,11 +93,7 @@ |
| #define _PAGE_NX (_AT(pteval_t, 0)) |
| #endif |
| |
| -#ifdef CONFIG_KAISER |
| -#define _PAGE_PROTNONE (_AT(pteval_t, 0)) |
| -#else |
| #define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) |
| -#endif |
| |
| #define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ |
| _PAGE_ACCESSED | _PAGE_DIRTY) |
| --- a/arch/x86/kernel/espfix_64.c |
| +++ b/arch/x86/kernel/espfix_64.c |
| @@ -127,11 +127,14 @@ void __init init_espfix_bsp(void) |
| /* Install the espfix pud into the kernel page directory */ |
| pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)]; |
| pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page); |
| -#ifdef CONFIG_KAISER |
| - // add the esp stack pud to the shadow mapping here. |
| - // This can be done directly, because the fixup stack has its own pud |
| - set_pgd(native_get_shadow_pgd(pgd_p), __pgd(_PAGE_TABLE | __pa((pud_t *)espfix_pud_page))); |
| -#endif |
| + /* |
| + * Just copy the top-level PGD that is mapping the espfix |
| + * area to ensure it is mapped into the shadow user page |
| + * tables. |
| + */ |
| + if (IS_ENABLED(CONFIG_KAISER)) |
| + set_pgd(native_get_shadow_pgd(pgd_p), |
| + __pgd(_KERNPG_TABLE | __pa((pud_t *)espfix_pud_page))); |
| |
| /* Randomize the locations */ |
| init_espfix_random(); |
| --- a/arch/x86/kernel/head_64.S |
| +++ b/arch/x86/kernel/head_64.S |
| @@ -442,11 +442,24 @@ early_idt_ripmsg: |
| GLOBAL(name) |
| |
| #ifdef CONFIG_KAISER |
| +/* |
| + * Each PGD needs to be 8k long and 8k aligned. We do not |
| + * ever go out to userspace with these, so we do not |
| + * strictly *need* the second page, but this allows us to |
| + * have a single set_pgd() implementation that does not |
| + * need to worry about whether it has 4k or 8k to work |
| + * with. |
| + * |
| + * This ensures PGDs are 8k long: |
| + */ |
| +#define KAISER_USER_PGD_FILL 512 |
| +/* This ensures they are 8k-aligned: */ |
| #define NEXT_PGD_PAGE(name) \ |
| .balign 2 * PAGE_SIZE; \ |
| GLOBAL(name) |
| #else |
| #define NEXT_PGD_PAGE(name) NEXT_PAGE(name) |
| +#define KAISER_USER_PGD_FILL 0 |
| #endif |
| |
| /* Automate the creation of 1 to 1 mapping pmd entries */ |
| @@ -461,6 +474,7 @@ GLOBAL(name) |
| NEXT_PGD_PAGE(early_level4_pgt) |
| .fill 511,8,0 |
| .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE |
| + .fill KAISER_USER_PGD_FILL,8,0 |
| |
| NEXT_PAGE(early_dynamic_pgts) |
| .fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0 |
| @@ -469,7 +483,8 @@ NEXT_PAGE(early_dynamic_pgts) |
| |
| #ifndef CONFIG_XEN |
| NEXT_PGD_PAGE(init_level4_pgt) |
| - .fill 2*512,8,0 |
| + .fill 512,8,0 |
| + .fill KAISER_USER_PGD_FILL,8,0 |
| #else |
| NEXT_PGD_PAGE(init_level4_pgt) |
| .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE |
| @@ -478,6 +493,7 @@ NEXT_PGD_PAGE(init_level4_pgt) |
| .org init_level4_pgt + L4_START_KERNEL*8, 0 |
| /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ |
| .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE |
| + .fill KAISER_USER_PGD_FILL,8,0 |
| |
| NEXT_PAGE(level3_ident_pgt) |
| .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE |
| @@ -488,6 +504,7 @@ NEXT_PAGE(level2_ident_pgt) |
| */ |
| PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD) |
| #endif |
| + .fill KAISER_USER_PGD_FILL,8,0 |
| |
| NEXT_PAGE(level3_kernel_pgt) |
| .fill L3_START_KERNEL,8,0 |
| --- a/arch/x86/kernel/ldt.c |
| +++ b/arch/x86/kernel/ldt.c |
| @@ -18,6 +18,7 @@ |
| #include <linux/uaccess.h> |
| |
| #include <asm/ldt.h> |
| +#include <asm/kaiser.h> |
| #include <asm/desc.h> |
| #include <asm/mmu_context.h> |
| #include <asm/syscalls.h> |
| @@ -34,11 +35,21 @@ static void flush_ldt(void *current_mm) |
| set_ldt(pc->ldt->entries, pc->ldt->size); |
| } |
| |
| +static void __free_ldt_struct(struct ldt_struct *ldt) |
| +{ |
| + if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE) |
| + vfree(ldt->entries); |
| + else |
| + free_page((unsigned long)ldt->entries); |
| + kfree(ldt); |
| +} |
| + |
| /* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */ |
| static struct ldt_struct *alloc_ldt_struct(int size) |
| { |
| struct ldt_struct *new_ldt; |
| int alloc_size; |
| + int ret = 0; |
| |
| if (size > LDT_ENTRIES) |
| return NULL; |
| @@ -66,6 +77,14 @@ static struct ldt_struct *alloc_ldt_stru |
| return NULL; |
| } |
| |
| + /* Map the new LDT into the shadow (user) page tables as well; */ |
| + /* kaiser_add_mapping() returns an error code when it fails. */ |
| + ret = kaiser_add_mapping((unsigned long)new_ldt->entries, alloc_size, |
| + __PAGE_KERNEL); |
| + if (ret) { |
| + __free_ldt_struct(new_ldt); |
| + return NULL; |
| + } |
| new_ldt->size = size; |
| return new_ldt; |
| } |
| @@ -92,12 +111,10 @@ static void free_ldt_struct(struct ldt_s |
| if (likely(!ldt)) |
| return; |
| |
| + kaiser_remove_mapping((unsigned long)ldt->entries, |
| + ldt->size * LDT_ENTRY_SIZE); |
| paravirt_free_ldt(ldt->entries, ldt->size); |
| - if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE) |
| - vfree(ldt->entries); |
| - else |
| - free_page((unsigned long)ldt->entries); |
| - kfree(ldt); |
| + __free_ldt_struct(ldt); |
| } |
| |
| /* |
| --- a/arch/x86/kernel/tracepoint.c |
| +++ b/arch/x86/kernel/tracepoint.c |
| @@ -9,10 +9,12 @@ |
| #include <linux/atomic.h> |
| |
| atomic_t trace_idt_ctr = ATOMIC_INIT(0); |
| +__aligned(PAGE_SIZE) |
| struct desc_ptr trace_idt_descr = { NR_VECTORS * 16 - 1, |
| (unsigned long) trace_idt_table }; |
| |
| /* No need to be aligned, but done to keep all IDTs defined the same way. */ |
| +__aligned(PAGE_SIZE) |
| gate_desc trace_idt_table[NR_VECTORS] __page_aligned_bss; |
| |
| static int trace_irq_vector_refcount; |
| --- a/arch/x86/mm/kaiser.c |
| +++ b/arch/x86/mm/kaiser.c |
| @@ -1,160 +1,306 @@ |
| - |
| - |
| +#include <linux/bug.h> |
| #include <linux/kernel.h> |
| #include <linux/errno.h> |
| #include <linux/string.h> |
| #include <linux/types.h> |
| #include <linux/bug.h> |
| #include <linux/init.h> |
| +#include <linux/interrupt.h> |
| #include <linux/spinlock.h> |
| #include <linux/mm.h> |
| - |
| #include <linux/uaccess.h> |
| +#include <linux/ftrace.h> |
| + |
| +#include <asm/kaiser.h> |
| #include <asm/pgtable.h> |
| #include <asm/pgalloc.h> |
| #include <asm/desc.h> |
| #ifdef CONFIG_KAISER |
| |
| __visible DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); |
| +/* |
| + * At runtime, the only things we map are some things for CPU |
| + * hotplug, and stacks for new processes. No two CPUs will ever |
| + * be populating the same addresses, so we only need to ensure |
| + * that we protect between two CPUs trying to allocate and |
| + * populate the same page table page. |
| + * |
| + * Only take this lock when doing a set_p[4um]d(), but it is not |
| + * needed for doing a set_pte(). We assume that only the *owner* |
| + * of a given allocation will be doing this for _their_ |
| + * allocation. |
| + * |
| + * This ensures that once a system has been running for a while |
| + * and there have been stacks all over and these page tables |
| + * are fully populated, there will be no further acquisitions of |
| + * this lock. |
| + */ |
| +static DEFINE_SPINLOCK(shadow_table_allocation_lock); |
| |
| -/** |
| - * Get the real ppn from a address in kernel mapping. |
| - * @param address The virtual adrress |
| - * @return the physical address |
| +/* |
| + * Returns -1 on error. |
| */ |
| -static inline unsigned long get_pa_from_mapping (unsigned long address) |
| +static inline unsigned long get_pa_from_mapping(unsigned long vaddr) |
| { |
| pgd_t *pgd; |
| pud_t *pud; |
| pmd_t *pmd; |
| pte_t *pte; |
| |
| - pgd = pgd_offset_k(address); |
| - BUG_ON(pgd_none(*pgd) || pgd_large(*pgd)); |
| + pgd = pgd_offset_k(vaddr); |
| + /* |
| + * We made all the kernel PGDs present in kaiser_init(). |
| + * We expect them to stay that way. |
| + */ |
| + BUG_ON(pgd_none(*pgd)); |
| + /* |
| + * PGDs are either 512GB or 128TB on all x86_64 |
| + * configurations. We don't handle these. |
| + */ |
| + BUG_ON(pgd_large(*pgd)); |
| + |
| + pud = pud_offset(pgd, vaddr); |
| + if (pud_none(*pud)) { |
| + WARN_ON_ONCE(1); |
| + return -1; |
| + } |
| |
| - pud = pud_offset(pgd, address); |
| - BUG_ON(pud_none(*pud)); |
| + if (pud_large(*pud)) |
| + return (pud_pfn(*pud) << PAGE_SHIFT) | (vaddr & ~PUD_PAGE_MASK); |
| |
| - if (pud_large(*pud)) { |
| - return (pud_pfn(*pud) << PAGE_SHIFT) | (address & ~PUD_PAGE_MASK); |
| + pmd = pmd_offset(pud, vaddr); |
| + if (pmd_none(*pmd)) { |
| + WARN_ON_ONCE(1); |
| + return -1; |
| } |
| |
| - pmd = pmd_offset(pud, address); |
| - BUG_ON(pmd_none(*pmd)); |
| + if (pmd_large(*pmd)) |
| + return (pmd_pfn(*pmd) << PAGE_SHIFT) | (vaddr & ~PMD_PAGE_MASK); |
| |
| - if (pmd_large(*pmd)) { |
| - return (pmd_pfn(*pmd) << PAGE_SHIFT) | (address & ~PMD_PAGE_MASK); |
| + pte = pte_offset_kernel(pmd, vaddr); |
| + if (pte_none(*pte)) { |
| + WARN_ON_ONCE(1); |
| + return -1; |
| } |
| |
| - pte = pte_offset_kernel(pmd, address); |
| - BUG_ON(pte_none(*pte)); |
| - |
| - return (pte_pfn(*pte) << PAGE_SHIFT) | (address & ~PAGE_MASK); |
| + return (pte_pfn(*pte) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK); |
| } |
| |
| -void _kaiser_copy (unsigned long start_addr, unsigned long size, |
| - unsigned long flags) |
| +/* |
| + * This is a relatively normal page table walk, except that it |
| + * also tries to allocate page table pages along the way. |
| + * |
| + * Returns a pointer to a PTE on success, or NULL on failure. |
| + */ |
| +static pte_t *kaiser_pagetable_walk(unsigned long address, bool is_atomic) |
| { |
| - pgd_t *pgd; |
| - pud_t *pud; |
| pmd_t *pmd; |
| - pte_t *pte; |
| - unsigned long address; |
| - unsigned long end_addr = start_addr + size; |
| - unsigned long target_address; |
| + pud_t *pud; |
| + pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(address)); |
| + gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); |
| |
| - for (address = PAGE_ALIGN(start_addr - (PAGE_SIZE - 1)); |
| - address < PAGE_ALIGN(end_addr); address += PAGE_SIZE) { |
| - target_address = get_pa_from_mapping(address); |
| + might_sleep(); |
| + if (is_atomic) { |
| + gfp &= ~GFP_KERNEL; |
| + gfp |= __GFP_HIGH | __GFP_ATOMIC; |
| + } |
| |
| - pgd = native_get_shadow_pgd(pgd_offset_k(address)); |
| + if (pgd_none(*pgd)) { |
| + WARN_ONCE(1, "All shadow pgds should have been populated"); |
| + return NULL; |
| + } |
| + BUILD_BUG_ON(pgd_large(*pgd) != 0); |
| |
| - BUG_ON(pgd_none(*pgd) && "All shadow pgds should be mapped at this time\n"); |
| - BUG_ON(pgd_large(*pgd)); |
| + pud = pud_offset(pgd, address); |
| + /* The shadow page tables do not use large mappings: */ |
| + if (pud_large(*pud)) { |
| + WARN_ON(1); |
| + return NULL; |
| + } |
| + if (pud_none(*pud)) { |
| + unsigned long new_pmd_page = __get_free_page(gfp); |
| + if (!new_pmd_page) |
| + return NULL; |
| + spin_lock(&shadow_table_allocation_lock); |
| + if (pud_none(*pud)) |
| + set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page))); |
| + else |
| + free_page(new_pmd_page); |
| + spin_unlock(&shadow_table_allocation_lock); |
| + } |
| |
| - pud = pud_offset(pgd, address); |
| - if (pud_none(*pud)) { |
| - set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd_alloc_one(0, address)))); |
| - } |
| - BUG_ON(pud_large(*pud)); |
| + pmd = pmd_offset(pud, address); |
| + /* The shadow page tables do not use large mappings: */ |
| + if (pmd_large(*pmd)) { |
| + WARN_ON(1); |
| + return NULL; |
| + } |
| + if (pmd_none(*pmd)) { |
| + unsigned long new_pte_page = __get_free_page(gfp); |
| + if (!new_pte_page) |
| + return NULL; |
| + spin_lock(&shadow_table_allocation_lock); |
| + if (pmd_none(*pmd)) |
| + set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page))); |
| + else |
| + free_page(new_pte_page); |
| + spin_unlock(&shadow_table_allocation_lock); |
| + } |
| |
| - pmd = pmd_offset(pud, address); |
| - if (pmd_none(*pmd)) { |
| - set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(pte_alloc_one_kernel(0, address)))); |
| - } |
| - BUG_ON(pmd_large(*pmd)); |
| + return pte_offset_kernel(pmd, address); |
| +} |
| |
| - pte = pte_offset_kernel(pmd, address); |
| +int kaiser_add_user_map(const void *__start_addr, unsigned long size, |
| + unsigned long flags) |
| +{ |
| + int ret = 0; |
| + pte_t *pte; |
| + unsigned long start_addr = (unsigned long)__start_addr; |
| + unsigned long address = start_addr & PAGE_MASK; |
| + unsigned long end_addr = PAGE_ALIGN(start_addr + size); |
| + unsigned long target_address; |
| + |
| + for (; address < end_addr; address += PAGE_SIZE) { |
| + target_address = get_pa_from_mapping(address); |
| + if (target_address == -1) { |
| + ret = -EIO; |
| + break; |
| + } |
| + pte = kaiser_pagetable_walk(address, false); |
| if (pte_none(*pte)) { |
| set_pte(pte, __pte(flags | target_address)); |
| } else { |
| - BUG_ON(__pa(pte_page(*pte)) != target_address); |
| + pte_t tmp; |
| + set_pte(&tmp, __pte(flags | target_address)); |
| + WARN_ON_ONCE(!pte_same(*pte, tmp)); |
| } |
| } |
| + return ret; |
| } |
| |
| -// at first, add a pmd for every pgd entry in the shadowmem-kernel-part of the kernel mapping |
| -static inline void __init _kaiser_init(void) |
| +static int kaiser_add_user_map_ptrs(const void *start, const void *end, unsigned long flags) |
| +{ |
| + unsigned long size = end - start; |
| + |
| + return kaiser_add_user_map(start, size, flags); |
| +} |
| + |
| +/* |
| + * Ensure that the top level of the (shadow) page tables are |
| + * entirely populated. This ensures that all processes that get |
| + * forked have the same entries. This way, we do not have to |
| + * ever go set up new entries in older processes. |
| + * |
| + * Note: we never free these, so there are no updates to them |
| + * after this. |
| + */ |
| +static void __init kaiser_init_all_pgds(void) |
| { |
| pgd_t *pgd; |
| int i = 0; |
| |
| pgd = native_get_shadow_pgd(pgd_offset_k((unsigned long )0)); |
| for (i = PTRS_PER_PGD / 2; i < PTRS_PER_PGD; i++) { |
| - set_pgd(pgd + i, __pgd(_PAGE_TABLE |__pa(pud_alloc_one(0, 0)))); |
| + pgd_t new_pgd; |
| + pud_t *pud = pud_alloc_one(&init_mm, PAGE_OFFSET + i * PGDIR_SIZE); |
| + if (!pud) { |
| + WARN_ON(1); |
| + break; |
| + } |
| + new_pgd = __pgd(_KERNPG_TABLE |__pa(pud)); |
| + /* |
| + * Make sure not to stomp on some other pgd entry. |
| + */ |
| + if (!pgd_none(pgd[i])) { |
| + WARN_ON(1); |
| + continue; |
| + } |
| + set_pgd(pgd + i, new_pgd); |
| } |
| } |
| |
| +#define kaiser_add_user_map_early(start, size, flags) do { \ |
| + int __ret = kaiser_add_user_map(start, size, flags); \ |
| + WARN_ON(__ret); \ |
| +} while (0) |
| + |
| +#define kaiser_add_user_map_ptrs_early(start, end, flags) do { \ |
| + int __ret = kaiser_add_user_map_ptrs(start, end, flags); \ |
| + WARN_ON(__ret); \ |
| +} while (0) |
| + |
| extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[]; |
| -spinlock_t shadow_table_lock; |
| +/* |
| + * If anything in here fails, we will likely die on one of the |
| + * first kernel->user transitions and init will die. But, we |
| + * will have most of the kernel up by then and should be able to |
| + * get a clean warning out of it. If we BUG_ON() here, we run |
| + * the risk of it triggering before we have good console output. |
| + */ |
| void __init kaiser_init(void) |
| { |
| int cpu; |
| - spin_lock_init(&shadow_table_lock); |
| - |
| - spin_lock(&shadow_table_lock); |
| |
| - _kaiser_init(); |
| + kaiser_init_all_pgds(); |
| |
| for_each_possible_cpu(cpu) { |
| - // map the per cpu user variables |
| - _kaiser_copy( |
| - (unsigned long) (__per_cpu_user_mapped_start + per_cpu_offset(cpu)), |
| - (unsigned long) __per_cpu_user_mapped_end - (unsigned long) __per_cpu_user_mapped_start, |
| - __PAGE_KERNEL); |
| - } |
| - |
| - // map the entry/exit text section, which is responsible to switch between user- and kernel mode |
| - _kaiser_copy( |
| - (unsigned long) __entry_text_start, |
| - (unsigned long) __entry_text_end - (unsigned long) __entry_text_start, |
| - __PAGE_KERNEL_RX); |
| - |
| - // the fixed map address of the idt_table |
| - _kaiser_copy( |
| - (unsigned long) idt_descr.address, |
| - sizeof(gate_desc) * NR_VECTORS, |
| - __PAGE_KERNEL_RO); |
| + void *percpu_vaddr = __per_cpu_user_mapped_start + |
| + per_cpu_offset(cpu); |
| + unsigned long percpu_sz = __per_cpu_user_mapped_end - |
| + __per_cpu_user_mapped_start; |
| + kaiser_add_user_map_early(percpu_vaddr, percpu_sz, |
| + __PAGE_KERNEL); |
| + } |
| |
| - spin_unlock(&shadow_table_lock); |
| + /* |
| + * Map the entry/exit text section, which is needed on |
| + * switches between user and kernel mode. |
| + */ |
| + kaiser_add_user_map_ptrs_early(__entry_text_start, __entry_text_end, |
| + __PAGE_KERNEL_RX); |
| + |
| +#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN) |
| + kaiser_add_user_map_ptrs_early(__irqentry_text_start, |
| + __irqentry_text_end, |
| + __PAGE_KERNEL_RX); |
| +#endif |
| + kaiser_add_user_map_early((void *)idt_descr.address, |
| + sizeof(gate_desc) * NR_VECTORS, |
| + __PAGE_KERNEL_RO); |
| +#ifdef CONFIG_TRACING |
| + kaiser_add_user_map_early(&trace_idt_descr, |
| + sizeof(trace_idt_descr), |
| + __PAGE_KERNEL); |
| + kaiser_add_user_map_early(&trace_idt_table, |
| + sizeof(gate_desc) * NR_VECTORS, |
| + __PAGE_KERNEL); |
| +#endif |
| + kaiser_add_user_map_early(&debug_idt_descr, sizeof(debug_idt_descr), |
| + __PAGE_KERNEL); |
| + kaiser_add_user_map_early(&debug_idt_table, |
| + sizeof(gate_desc) * NR_VECTORS, |
| + __PAGE_KERNEL); |
| } |
| |
| +extern void unmap_pud_range_nofree(pgd_t *pgd, unsigned long start, unsigned long end); |
| // add a mapping to the shadow-mapping, and synchronize the mappings |
| -void kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags) |
| +int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags) |
| { |
| - spin_lock(&shadow_table_lock); |
| - _kaiser_copy(addr, size, flags); |
| - spin_unlock(&shadow_table_lock); |
| + return kaiser_add_user_map((const void *)addr, size, flags); |
| } |
| |
| -extern void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end); |
| void kaiser_remove_mapping(unsigned long start, unsigned long size) |
| { |
| - pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(start)); |
| - spin_lock(&shadow_table_lock); |
| - do { |
| - unmap_pud_range(pgd, start, start + size); |
| - } while (pgd++ != native_get_shadow_pgd(pgd_offset_k(start + size))); |
| - spin_unlock(&shadow_table_lock); |
| + unsigned long end = start + size; |
| + unsigned long addr; |
| + |
| + for (addr = start; addr < end; addr += PGDIR_SIZE) { |
| + pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(addr)); |
| + /* |
| + * unmap_pud_range_nofree() handles > PGDIR_SIZE unmaps, |
| + * so no need to trim 'end'. |
| + */ |
| + unmap_pud_range_nofree(pgd, addr, end); |
| + } |
| } |
| #endif /* CONFIG_KAISER */ |
| --- a/arch/x86/mm/pageattr.c |
| +++ b/arch/x86/mm/pageattr.c |
| @@ -52,6 +52,7 @@ static DEFINE_SPINLOCK(cpa_lock); |
| #define CPA_FLUSHTLB 1 |
| #define CPA_ARRAY 2 |
| #define CPA_PAGES_ARRAY 4 |
| +#define CPA_FREE_PAGETABLES 8 |
| |
| #ifdef CONFIG_PROC_FS |
| static unsigned long direct_pages_count[PG_LEVEL_NUM]; |
| @@ -723,10 +724,13 @@ static int split_large_page(struct cpa_d |
| return 0; |
| } |
| |
| -static bool try_to_free_pte_page(pte_t *pte) |
| +static bool try_to_free_pte_page(struct cpa_data *cpa, pte_t *pte) |
| { |
| int i; |
| |
| + if (!(cpa->flags & CPA_FREE_PAGETABLES)) |
| + return false; |
| + |
| for (i = 0; i < PTRS_PER_PTE; i++) |
| if (!pte_none(pte[i])) |
| return false; |
| @@ -735,10 +739,13 @@ static bool try_to_free_pte_page(pte_t * |
| return true; |
| } |
| |
| -static bool try_to_free_pmd_page(pmd_t *pmd) |
| +static bool try_to_free_pmd_page(struct cpa_data *cpa, pmd_t *pmd) |
| { |
| int i; |
| |
| + if (!(cpa->flags & CPA_FREE_PAGETABLES)) |
| + return false; |
| + |
| for (i = 0; i < PTRS_PER_PMD; i++) |
| if (!pmd_none(pmd[i])) |
| return false; |
| @@ -759,7 +766,9 @@ static bool try_to_free_pud_page(pud_t * |
| return true; |
| } |
| |
| -static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end) |
| +static bool unmap_pte_range(struct cpa_data *cpa, pmd_t *pmd, |
| + unsigned long start, |
| + unsigned long end) |
| { |
| pte_t *pte = pte_offset_kernel(pmd, start); |
| |
| @@ -770,22 +779,23 @@ static bool unmap_pte_range(pmd_t *pmd, |
| pte++; |
| } |
| |
| - if (try_to_free_pte_page((pte_t *)pmd_page_vaddr(*pmd))) { |
| + if (try_to_free_pte_page(cpa, (pte_t *)pmd_page_vaddr(*pmd))) { |
| pmd_clear(pmd); |
| return true; |
| } |
| return false; |
| } |
| |
| -static void __unmap_pmd_range(pud_t *pud, pmd_t *pmd, |
| +static void __unmap_pmd_range(struct cpa_data *cpa, pud_t *pud, pmd_t *pmd, |
| unsigned long start, unsigned long end) |
| { |
| - if (unmap_pte_range(pmd, start, end)) |
| - if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud))) |
| + if (unmap_pte_range(cpa, pmd, start, end)) |
| + if (try_to_free_pmd_page(cpa, (pmd_t *)pud_page_vaddr(*pud))) |
| pud_clear(pud); |
| } |
| |
| -static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end) |
| +static void unmap_pmd_range(struct cpa_data *cpa, pud_t *pud, |
| + unsigned long start, unsigned long end) |
| { |
| pmd_t *pmd = pmd_offset(pud, start); |
| |
| @@ -796,7 +806,7 @@ static void unmap_pmd_range(pud_t *pud, |
| unsigned long next_page = (start + PMD_SIZE) & PMD_MASK; |
| unsigned long pre_end = min_t(unsigned long, end, next_page); |
| |
| - __unmap_pmd_range(pud, pmd, start, pre_end); |
| + __unmap_pmd_range(cpa, pud, pmd, start, pre_end); |
| |
| start = pre_end; |
| pmd++; |
| @@ -809,7 +819,8 @@ static void unmap_pmd_range(pud_t *pud, |
| if (pmd_large(*pmd)) |
| pmd_clear(pmd); |
| else |
| - __unmap_pmd_range(pud, pmd, start, start + PMD_SIZE); |
| + __unmap_pmd_range(cpa, pud, pmd, |
| + start, start + PMD_SIZE); |
| |
| start += PMD_SIZE; |
| pmd++; |
| @@ -819,17 +830,19 @@ static void unmap_pmd_range(pud_t *pud, |
| * 4K leftovers? |
| */ |
| if (start < end) |
| - return __unmap_pmd_range(pud, pmd, start, end); |
| + return __unmap_pmd_range(cpa, pud, pmd, start, end); |
| |
| /* |
| * Try again to free the PMD page if haven't succeeded above. |
| */ |
| if (!pud_none(*pud)) |
| - if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud))) |
| + if (try_to_free_pmd_page(cpa, (pmd_t *)pud_page_vaddr(*pud))) |
| pud_clear(pud); |
| } |
| |
| -void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) |
| +static void __unmap_pud_range(struct cpa_data *cpa, pgd_t *pgd, |
| + unsigned long start, |
| + unsigned long end) |
| { |
| pud_t *pud = pud_offset(pgd, start); |
| |
| @@ -840,7 +853,7 @@ void unmap_pud_range(pgd_t *pgd, unsigne |
| unsigned long next_page = (start + PUD_SIZE) & PUD_MASK; |
| unsigned long pre_end = min_t(unsigned long, end, next_page); |
| |
| - unmap_pmd_range(pud, start, pre_end); |
| + unmap_pmd_range(cpa, pud, start, pre_end); |
| |
| start = pre_end; |
| pud++; |
| @@ -854,7 +867,7 @@ void unmap_pud_range(pgd_t *pgd, unsigne |
| if (pud_large(*pud)) |
| pud_clear(pud); |
| else |
| - unmap_pmd_range(pud, start, start + PUD_SIZE); |
| + unmap_pmd_range(cpa, pud, start, start + PUD_SIZE); |
| |
| start += PUD_SIZE; |
| pud++; |
| @@ -864,7 +877,7 @@ void unmap_pud_range(pgd_t *pgd, unsigne |
| * 2M leftovers? |
| */ |
| if (start < end) |
| - unmap_pmd_range(pud, start, end); |
| + unmap_pmd_range(cpa, pud, start, end); |
| |
| /* |
| * No need to try to free the PUD page because we'll free it in |
| @@ -872,6 +885,24 @@ void unmap_pud_range(pgd_t *pgd, unsigne |
| */ |
| } |
| |
| +static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) |
| +{ |
| + struct cpa_data cpa = { |
| + .flags = CPA_FREE_PAGETABLES, |
| + }; |
| + |
| + __unmap_pud_range(&cpa, pgd, start, end); |
| +} |
| + |
| +void unmap_pud_range_nofree(pgd_t *pgd, unsigned long start, unsigned long end) |
| +{ |
| + struct cpa_data cpa = { |
| + .flags = 0, |
| + }; |
| + |
| + __unmap_pud_range(&cpa, pgd, start, end); |
| +} |
| + |
| static void unmap_pgd_range(pgd_t *root, unsigned long addr, unsigned long end) |
| { |
| pgd_t *pgd_entry = root + pgd_index(addr); |
| --- a/arch/x86/mm/pgtable.c |
| +++ b/arch/x86/mm/pgtable.c |
| @@ -340,40 +340,26 @@ static inline void _pgd_free(pgd_t *pgd) |
| kmem_cache_free(pgd_cache, pgd); |
| } |
| #else |
| -static inline pgd_t *_pgd_alloc(void) |
| -{ |
| -#ifdef CONFIG_KAISER |
| - // Instead of one PML4, we aquire two PML4s and, thus, an 8kb-aligned memory |
| - // block. Therefore, we have to allocate at least 3 pages. However, the |
| - // __get_free_pages returns us 4 pages. Hence, we store the base pointer at |
| - // the beginning of the page of our 8kb-aligned memory block in order to |
| - // correctly free it afterwars. |
| |
| - unsigned long pages = __get_free_pages(PGALLOC_GFP, get_order(4*PAGE_SIZE)); |
| - |
| - if(native_get_normal_pgd((pgd_t*) pages) == (pgd_t*) pages) |
| - { |
| - *((unsigned long*)(pages + 2 * PAGE_SIZE)) = pages; |
| - return (pgd_t *) pages; |
| - } |
| - else |
| - { |
| - *((unsigned long*)(pages + 3 * PAGE_SIZE)) = pages; |
| - return (pgd_t *) (pages + PAGE_SIZE); |
| - } |
| +#ifdef CONFIG_KAISER |
| +/* |
| + * Instead of one pgd, we acquire two pgds. Being order-1, it is |
| + * both 8k in size and 8k-aligned. That lets us just flip bit 12 |
| + * in a pointer to swap between the two 4k halves. |
| + */ |
| +#define PGD_ALLOCATION_ORDER 1 |
| #else |
| - return (pgd_t *)__get_free_page(PGALLOC_GFP); |
| +#define PGD_ALLOCATION_ORDER 0 |
| #endif |
| + |
| +static inline pgd_t *_pgd_alloc(void) |
| +{ |
| + return (pgd_t *)__get_free_pages(PGALLOC_GFP, PGD_ALLOCATION_ORDER); |
| } |
| |
| static inline void _pgd_free(pgd_t *pgd) |
| { |
| -#ifdef CONFIG_KAISER |
| - unsigned long pages = *((unsigned long*) ((char*) pgd + 2 * PAGE_SIZE)); |
| - free_pages(pages, get_order(4*PAGE_SIZE)); |
| -#else |
| - free_page((unsigned long)pgd); |
| -#endif |
| + free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER); |
| } |
| #endif /* CONFIG_X86_PAE */ |
| |
| --- /dev/null |
| +++ b/include/linux/kaiser.h |
| @@ -0,0 +1,26 @@ |
| +#ifndef _INCLUDE_KAISER_H |
| +#define _INCLUDE_KAISER_H |
| + |
| +#ifdef CONFIG_KAISER |
| +#include <asm/kaiser.h> |
| +#else |
| + |
| +/* |
| + * These stubs are used whenever CONFIG_KAISER is off, which |
| + * includes architectures that support KAISER, but have it |
| + * disabled. |
| + */ |
| + |
| +static inline void kaiser_init(void) |
| +{ |
| +} |
| +static inline void kaiser_remove_mapping(unsigned long start, unsigned long size) |
| +{ |
| +} |
| +static inline int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags) |
| +{ |
| + return 0; |
| +} |
| + |
| +#endif /* !CONFIG_KAISER */ |
| +#endif /* _INCLUDE_KAISER_H */ |
| --- a/kernel/fork.c |
| +++ b/kernel/fork.c |
| @@ -58,6 +58,7 @@ |
| #include <linux/tsacct_kern.h> |
| #include <linux/cn_proc.h> |
| #include <linux/freezer.h> |
| +#include <linux/kaiser.h> |
| #include <linux/delayacct.h> |
| #include <linux/taskstats_kern.h> |
| #include <linux/random.h> |
| @@ -335,7 +336,6 @@ void set_task_stack_end_magic(struct tas |
| *stackend = STACK_END_MAGIC; /* for overflow detection */ |
| } |
| |
| -extern void kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags); |
| static struct task_struct *dup_task_struct(struct task_struct *orig, int node) |
| { |
| struct task_struct *tsk; |
| @@ -357,9 +357,10 @@ static struct task_struct *dup_task_stru |
| goto free_ti; |
| |
| tsk->stack = ti; |
| -#ifdef CONFIG_KAISER |
| - kaiser_add_mapping((unsigned long)tsk->stack, THREAD_SIZE, __PAGE_KERNEL); |
| -#endif |
| + |
| + err = kaiser_add_mapping((unsigned long)tsk->stack, THREAD_SIZE, __PAGE_KERNEL); |
| + if (err) |
| + goto free_ti; |
| #ifdef CONFIG_SECCOMP |
| /* |
| * We must handle setting up seccomp filters once we're under |
| --- a/security/Kconfig |
| +++ b/security/Kconfig |
| @@ -32,12 +32,17 @@ config SECURITY |
| If you are unsure how to answer this question, answer N. |
| config KAISER |
| bool "Remove the kernel mapping in user mode" |
| + default y |
| depends on X86_64 |
| depends on !PARAVIRT |
| help |
| This enforces a strict kernel and user space isolation in order to close |
| hardware side channels on kernel address information. |
| |
| +config KAISER_REAL_SWITCH |
| + bool "KAISER: actually switch page tables" |
| + default y |
| + |
| config SECURITYFS |
| bool "Enable the securityfs filesystem" |
| help |