/*
* Copyright(c) 2017 Intel Corporation. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of version 2 of the GNU General Public License as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* Based on work published here: https://github.com/IAIK/KAISER
* Modified by Dave Hansen <dave.hansen@intel.com> to actually work.
*/
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/bug.h>
#include <linux/debugfs.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/spinlock.h>
#include <linux/mm.h>
#include <linux/uaccess.h>
#include <asm/kaiser.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/desc.h>
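/*
* Page-aligned flag page that gets mapped into the user (shadow)
* page tables. The entry/exit assembly reads element 0 to decide
* whether to switch CR3; kaiser_do_enable()/kaiser_do_disable()
* write it at runtime.
*/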
__aligned(PAGE_SIZE)
unsigned long kaiser_asm_do_switch[PAGE_SIZE/sizeof(unsigned long)] = { 1 };
/*
* At runtime, the only things we map are some things for CPU
* hotplug, and stacks for new processes. No two CPUs will ever
* be populating the same addresses, so we only need to ensure
* that we protect between two CPUs trying to allocate and
* populate the same page table page.
*
* We only take this lock when doing a set_p[4um]d(); it is not
* needed for a set_pte(). We assume that only the *owner*
* of a given allocation will be doing this for _their_
* allocation.
*
* This ensures that once a system has been running for a while
* and there have been stacks all over and these page tables
* are fully populated, there will be no further acquisitions of
* this lock.
*/
static DEFINE_SPINLOCK(shadow_table_allocation_lock);
/*
* This is a generic page table walker used only for walking kernel
* addresses. We use it to help recreate the "shadow" page tables
* which are used while we are in userspace.
*
* This can be called on any kernel memory address and works with
* all page sizes and mapping types: normal linear map memory,
* vmalloc(), even kmap().
*
* Note: this is only used when mapping new *kernel* entries into
* the user/shadow page tables. It is never used for userspace
* addresses.
*
* Returns -1 on error.
*/
static inline unsigned long get_pa_from_kernel_map(unsigned long vaddr)
{
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
WARN_ON_ONCE(vaddr < PAGE_OFFSET);
pgd = pgd_offset_k(vaddr);
/*
* We made all the kernel PGDs present in kaiser_init().
* We expect them to stay that way.
*/
if (pgd_none(*pgd)) {
WARN_ON_ONCE(1);
return -1;
}
/*
* A PGD entry covers either 512GB or 128TB on all x86_64
* configurations. We don't handle huge mappings at this level.
*/
if (pgd_large(*pgd)) {
WARN_ON_ONCE(1);
return -1;
}
p4d = p4d_offset(pgd, vaddr);
if (p4d_none(*p4d)) {
WARN_ON_ONCE(1);
return -1;
}
pud = pud_offset(p4d, vaddr);
if (pud_none(*pud)) {
WARN_ON_ONCE(1);
return -1;
}
if (pud_large(*pud))
return (pud_pfn(*pud) << PAGE_SHIFT) | (vaddr & ~PUD_PAGE_MASK);
pmd = pmd_offset(pud, vaddr);
if (pmd_none(*pmd)) {
WARN_ON_ONCE(1);
return -1;
}
if (pmd_large(*pmd))
return (pmd_pfn(*pmd) << PAGE_SHIFT) | (vaddr & ~PMD_PAGE_MASK);
pte = pte_offset_kernel(pmd, vaddr);
if (pte_none(*pte)) {
WARN_ON_ONCE(1);
return -1;
}
return (pte_pfn(*pte) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK);
}
/*
* Walk the shadow copy of the page tables, allocating any missing
* page table pages on the way down (atomically if the caller
* passes KAISER_WALK_ATOMIC). Does not support
* large pages since the data we are mapping is (generally) not
* large enough or aligned to 2MB.
*
* Note: this is only used when mapping *new* kernel data into the
* user/shadow page tables. It is never used for userspace data.
*
* Returns a pointer to a PTE on success, or NULL on failure.
*/
#define KAISER_WALK_ATOMIC 0x1
static pte_t *kaiser_shadow_pagetable_walk(unsigned long address,
unsigned long flags)
{
pmd_t *pmd;
pud_t *pud;
p4d_t *p4d;
pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(address));
gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
if (flags & KAISER_WALK_ATOMIC) {
gfp &= ~GFP_KERNEL;
gfp |= __GFP_HIGH | __GFP_ATOMIC;
}
if (pgd_none(*pgd)) {
WARN_ONCE(1, "All shadow pgds should have been populated");
return NULL;
}
BUILD_BUG_ON(pgd_large(*pgd) != 0);
p4d = p4d_offset(pgd, address);
BUILD_BUG_ON(p4d_large(*p4d) != 0);
if (p4d_none(*p4d)) {
unsigned long new_pud_page = __get_free_page(gfp);
if (!new_pud_page)
return NULL;
spin_lock(&shadow_table_allocation_lock);
if (p4d_none(*p4d))
set_p4d(p4d, __p4d(_KERNPG_TABLE | __pa(new_pud_page)));
else
free_page(new_pud_page);
spin_unlock(&shadow_table_allocation_lock);
}
pud = pud_offset(p4d, address);
/* The shadow page tables do not use large mappings: */
if (pud_large(*pud)) {
WARN_ON(1);
return NULL;
}
if (pud_none(*pud)) {
unsigned long new_pmd_page = __get_free_page(gfp);
if (!new_pmd_page)
return NULL;
spin_lock(&shadow_table_allocation_lock);
if (pud_none(*pud))
set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page)));
else
free_page(new_pmd_page);
spin_unlock(&shadow_table_allocation_lock);
}
pmd = pmd_offset(pud, address);
/* The shadow page tables do not use large mappings: */
if (pmd_large(*pmd)) {
WARN_ON(1);
return NULL;
}
if (pmd_none(*pmd)) {
unsigned long new_pte_page = __get_free_page(gfp);
if (!new_pte_page)
return NULL;
spin_lock(&shadow_table_allocation_lock);
if (pmd_none(*pmd))
set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page)));
else
free_page(new_pte_page);
spin_unlock(&shadow_table_allocation_lock);
}
return pte_offset_kernel(pmd, address);
}
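/*
* Map the kernel range [__start_addr, __start_addr + size) into the
* shadow page tables one page at a time. If a PTE is already
* present, warn if it does not match what we would have installed.
*/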
static int __kaiser_add_user_map(const void *__start_addr, unsigned long size,
unsigned long flags)
{
pte_t *pte;
unsigned long start_addr = (unsigned long)__start_addr;
unsigned long address = start_addr & PAGE_MASK;
unsigned long end_addr = PAGE_ALIGN(start_addr + size);
unsigned long target_address;
for (; address < end_addr; address += PAGE_SIZE) {
target_address = get_pa_from_kernel_map(address);
if (target_address == -1)
return -EIO;
pte = kaiser_shadow_pagetable_walk(address, 0);
/*
* Errors come from either -ENOMEM for a page
* table page, or something screwy that did a
* WARN_ON(). Just return -ENOMEM.
*/
if (!pte)
return -ENOMEM;
if (pte_none(*pte)) {
set_pte(pte, __pte(flags | target_address));
} else {
pte_t tmp;
set_pte(&tmp, __pte(flags | target_address));
WARN_ON_ONCE(!pte_same(*pte, tmp));
}
}
return 0;
}
/*
* Given a kernel address, @__start_addr, copy that mapping into
* the user (shadow) page tables. This may need to allocate page
* table pages.
*/
int kaiser_add_user_map(const void *__start_addr, unsigned long size,
unsigned long flags)
{
/*
* Since this mapping is the same in both the user and kernel
* copies *and* is mapped to userspace anyway (and thus already
* exposed to side channels), there is no danger in marking it
* _PAGE_GLOBAL.
*
* This has a potential performance benefit: global mappings
* survive the CR3 switch at entry/exit, so the TLB entries for
* this code do not have to be reloaded each time.
*/
flags |= _PAGE_GLOBAL;
return __kaiser_add_user_map(__start_addr, size, flags);
}
/*
* The stack mapping is requested from generic code and can not go
* through kaiser_add_user_map(), which would set _PAGE_GLOBAL.
*/
int kaiser_map_stack(struct task_struct *tsk)
{
/*
* Note: This intentionally avoids the _PAGE_GLOBAL bit being
* set via kaiser_add_user_map(). We do not want it set for
* stacks.
*/
return __kaiser_add_user_map(tsk->stack, THREAD_SIZE,
__PAGE_KERNEL);
}
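/*
* Same as kaiser_add_user_map(), but takes start/end pointers
* instead of a start pointer and a size.
*/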
int kaiser_add_user_map_ptrs(const void *__start_addr,
const void *__end_addr,
unsigned long flags)
{
return kaiser_add_user_map(__start_addr,
__end_addr - __start_addr,
flags);
}
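/*
* Used during early boot: warn if the mapping fails, since the
* callers do not do anything useful with the error themselves.
*/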
static int kaiser_user_map_ptr_early(const void *start_addr, unsigned long size,
unsigned long flags)
{
int ret = kaiser_add_user_map(start_addr, size, flags);
WARN_ON(ret);
return ret;
}
/*
* Ensure that the top level of the (shadow) page tables are
* entirely populated. This ensures that all processes that get
* forked have the same entries. This way, we never have to go
* back and set up new entries in older processes.
*
* Note: we never free these, so there are no updates to them
* after this.
*/
static void __init kaiser_init_all_pgds(void)
{
pgd_t *pgd;
int i = 0;
pgd = native_get_shadow_pgd(pgd_offset_k(0UL));
for (i = PTRS_PER_PGD / 2; i < PTRS_PER_PGD; i++) {
unsigned long addr = PAGE_OFFSET + i * PGDIR_SIZE;
#if CONFIG_PGTABLE_LEVELS > 4
p4d_t *p4d = p4d_alloc_one(&init_mm, addr);
if (!p4d) {
WARN_ON(1);
break;
}
set_pgd(pgd + i, __pgd(_KERNPG_TABLE | __pa(p4d)));
#else /* CONFIG_PGTABLE_LEVELS <= 4 */
pud_t *pud = pud_alloc_one(&init_mm, addr);
if (!pud) {
WARN_ON(1);
break;
}
set_pgd(pgd + i, __pgd(_KERNPG_TABLE | __pa(pud)));
#endif /* CONFIG_PGTABLE_LEVELS */
}
}
/*
* The page table allocations in here can theoretically fail, but
* we can not do much about it in early boot. Do the checking
* and warning in a macro to make it more readable.
*/
#define kaiser_add_user_map_early(start, size, flags) do { \
int __ret = kaiser_add_user_map(start, size, flags); \
WARN_ON(__ret); \
} while (0)
#define kaiser_add_user_map_ptrs_early(start, end, flags) do { \
int __ret = kaiser_add_user_map_ptrs(start, end, flags); \
WARN_ON(__ret); \
} while (0)
extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[];
/*
* If anything in here fails, we will likely die on one of the
* first kernel->user transitions and init will die. But, we
* will have most of the kernel up by then and should be able to
* get a clean warning out of it. If we BUG_ON() here, we run
* the risk of doing so before we have good console output.
*/
void __init kaiser_init(void)
{
int cpu;
kaiser_init_all_pgds();
kaiser_add_user_map_early(&kaiser_asm_do_switch, PAGE_SIZE,
__PAGE_KERNEL);
for_each_possible_cpu(cpu) {
void *percpu_vaddr = __per_cpu_user_mapped_start +
per_cpu_offset(cpu);
unsigned long percpu_sz = __per_cpu_user_mapped_end -
__per_cpu_user_mapped_start;
kaiser_add_user_map_early(percpu_vaddr, percpu_sz,
__PAGE_KERNEL);
}
kaiser_add_user_map_ptrs_early(__entry_text_start, __entry_text_end,
__PAGE_KERNEL_RX);
/* The fixmap address of the idt_table: */
kaiser_add_user_map_early((void *)idt_descr.address,
sizeof(gate_desc) * NR_VECTORS,
__PAGE_KERNEL_RO);
kaiser_user_map_ptr_early(&debug_idt_table,
sizeof(gate_desc) * NR_VECTORS,
__PAGE_KERNEL);
/*
* We could theoretically do this in setup_fixmap_gdt().
* But, we would need to rewrite the above page table
* allocation code to use the bootmem allocator. The
* buddy allocator is not available at the time that we
* call setup_fixmap_gdt() for CPU 0.
*/
kaiser_add_user_map_early(get_cpu_gdt_ro(0), PAGE_SIZE,
__PAGE_KERNEL_RO);
/*
* .irqentry.text helps us identify code that runs before
* we get a chance to call entering_irq(). This includes
* the interrupt entry assembly plus the first C function
* that gets called. KAISER does not need the C code
* mapped. We just use the .irqentry.text section as-is
* to avoid having to carve out a new section for the
* assembly only.
*/
kaiser_add_user_map_ptrs_early(__irqentry_text_start,
__irqentry_text_end,
__PAGE_KERNEL_RX);
}
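/*
* kaiser_add_mapping()/kaiser_remove_mapping() are the interfaces
* used from outside this file. They take a plain unsigned long
* address rather than a pointer.
*
* A sketch of how a (hypothetical) caller might use them:
*
*	ret = kaiser_add_mapping((unsigned long)ptr, size, __PAGE_KERNEL);
*	if (ret)
*		return ret;
*	...
*	kaiser_remove_mapping((unsigned long)ptr, size);
*/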
int kaiser_add_mapping(unsigned long addr, unsigned long size,
unsigned long flags)
{
return kaiser_add_user_map((const void *)addr, size, flags);
}
void kaiser_remove_mapping(unsigned long start, unsigned long size)
{
unsigned long addr;
/* The shadow page tables always use small pages: */
for (addr = start; addr < start + size; addr += PAGE_SIZE) {
/*
* Do an "atomic" walk in case this got called from an atomic
* context. This should not do any allocations because we
* should only be walking things that are known to be mapped.
*/
pte_t *pte = kaiser_shadow_pagetable_walk(addr, KAISER_WALK_ATOMIC);
/*
* We are removing a mapping that should
* exist. WARN if it was not there:
*/
if (!pte) {
WARN_ON_ONCE(1);
continue;
}
pte_clear(&init_mm, addr, pte);
}
/*
* This ensures that the TLB entries used to map this data are
* no longer usable on *this* CPU. We theoretically want to
* flush the entries on all CPUs here, but that's too
* expensive right now: this is called to unmap process
* stacks in the exit() path.
*
* This can change if we get to the point where this is not
* in a remotely hot path, like only called via write_ldt().
*
* Note: we could probably also just invalidate the individual
* addresses to take care of *this* PCID and then do a
* tlb_flush_shared_nonglobals() to ensure that all other
* PCIDs get flushed before being used again.
*/
__native_flush_tlb_global();
}
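/*
* Runtime control for KAISER, exposed below via debugfs as
* x86/kaiser-enabled. Defaults to on.
*/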
int kaiser_enabled = 1;
static ssize_t kaiser_enabled_read_file(struct file *file, char __user *user_buf,
size_t count, loff_t *ppos)
{
char buf[32];
unsigned int len;
len = sprintf(buf, "%d\n", kaiser_enabled);
return simple_read_from_buffer(user_buf, count, ppos, buf, len);
}
enum poison {
KAISER_POISON,
KAISER_UNPOISON
};
void kaiser_poison_pgds(enum poison do_poison);
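/*
* Transition a live system from KAISER enabled to disabled:
* unpoison the kernel PGDs so userspace can use them, then tell
* the entry assembly to stop switching CR3.
*/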
void kaiser_do_disable(void)
{
/* Make sure the kernel PGDs are usable by userspace: */
kaiser_poison_pgds(KAISER_UNPOISON);
/*
* Make sure all the CPUs have the poison clear in their TLBs.
* This also functions as a barrier to ensure that everyone
* sees the unpoisoned PGDs.
*/
flush_tlb_all();
/* Tell the assembly code to stop switching CR3. */
kaiser_asm_do_switch[0] = 0;
/*
* Make sure every CPU takes an interrupt. This means that
* they have gone through a SWITCH_TO_KERNEL_CR3 and are no
* longer running on the userspace CR3. If we did not do
* this, we might have CPUs running on the shadow page tables
* that then enter the kernel and think they do *not* need to
* switch.
*/
flush_tlb_all();
}
void kaiser_do_enable(void)
{
/* Tell the assembly code to start switching CR3: */
kaiser_asm_do_switch[0] = 1;
/* Make sure everyone can see the kaiser_asm_do_switch update: */
synchronize_rcu();
/*
* Now that userspace is no longer using the kernel copy of
* the page tables, we can poison it:
*/
kaiser_poison_pgds(KAISER_POISON);
/* Make sure all the CPUs see the poison: */
flush_tlb_all();
}
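/*
* debugfs write handler: accepts 0 or 1, updates kaiser_enabled and
* then transitions the system with kaiser_do_enable()/
* kaiser_do_disable() as needed.
*/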
static ssize_t kaiser_enabled_write_file(struct file *file,
const char __user *user_buf, size_t count, loff_t *ppos)
{
char buf[32];
ssize_t len;
unsigned int enable;
len = min(count, sizeof(buf) - 1);
if (copy_from_user(buf, user_buf, len))
return -EFAULT;
buf[len] = '\0';
if (kstrtouint(buf, 0, &enable))
return -EINVAL;
if (enable > 1)
return -EINVAL;
if (kaiser_enabled == enable)
return count;
/*
* This tells the page table code whether or not to poison PGDs:
*/
WRITE_ONCE(kaiser_enabled, enable);
synchronize_rcu();
if (enable)
kaiser_do_enable();
else
kaiser_do_disable();
return count;
}
static const struct file_operations fops_kaiser_enabled = {
.read = kaiser_enabled_read_file,
.write = kaiser_enabled_write_file,
.llseek = default_llseek,
};
static int __init create_kaiser_enabled(void)
{
debugfs_create_file("kaiser-enabled", S_IRUSR | S_IWUSR,
arch_debugfs_dir, NULL, &fops_kaiser_enabled);
return 0;
}
late_initcall(create_kaiser_enabled);
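/*
* Poison or unpoison the userspace entries of a single PGD page.
* Userspace entries come first, so stop at the first kernel entry.
*/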
void kaiser_poison_pgd_page(pgd_t *pgd_page, enum poison do_poison)
{
int i = 0;
for (i = 0; i < PTRS_PER_PGD; i++) {
pgd_t *pgd = &pgd_page[i];
/* Stop once we hit kernel addresses: */
if (!pgdp_maps_userspace(pgd))
break;
if (do_poison == KAISER_POISON)
kaiser_poison_pgd(pgd);
else
kaiser_unpoison_pgd(pgd);
}
}
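/*
* Apply the poisoning to every PGD in the system by walking
* pgd_list under pgd_lock.
*/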
void kaiser_poison_pgds(enum poison do_poison)
{
struct page *page;
spin_lock(&pgd_lock);
list_for_each_entry(page, &pgd_list, lru) {
pgd_t *pgd = (pgd_t *)page_address(page);
kaiser_poison_pgd_page(pgd, do_poison);
}
spin_unlock(&pgd_lock);
}