| /* |
| * Copyright(c) 2017 Intel Corporation. All rights reserved. |
| * |
| * This program is free software; you can redistribute it and/or modify |
| * it under the terms of version 2 of the GNU General Public License as |
| * published by the Free Software Foundation. |
| * |
| * This program is distributed in the hope that it will be useful, but |
| * WITHOUT ANY WARRANTY; without even the implied warranty of |
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| * General Public License for more details. |
| * |
| * Based on work published here: https://github.com/IAIK/KAISER |
| * Modified by Dave Hansen <dave.hansen@intel.com> to actually work. |
| */ |
| #include <linux/kernel.h> |
| #include <linux/errno.h> |
| #include <linux/string.h> |
| #include <linux/types.h> |
| #include <linux/bug.h> |
| #include <linux/debugfs.h> |
| #include <linux/init.h> |
| #include <linux/interrupt.h> |
| #include <linux/spinlock.h> |
| #include <linux/mm.h> |
| #include <linux/uaccess.h> |
| |
| #include <asm/kaiser.h> |
| #include <asm/pgtable.h> |
| #include <asm/pgalloc.h> |
| #include <asm/tlbflush.h> |
| #include <asm/desc.h> |
| |
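| /* |
| * Read by the entry/exit assembly: while the first word is non-zero, |
| * the assembly switches CR3 to/from the shadow page tables on every |
| * kernel entry/exit.  This is a full, page-aligned page so that it |
| * can be mapped into the user/shadow page tables by itself (see |
| * kaiser_init()). |
| */ |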
| __aligned(PAGE_SIZE) |
| unsigned long kaiser_asm_do_switch[PAGE_SIZE/sizeof(unsigned long)] = { 1 }; |
| |
| /* |
| * At runtime, the only things we map are a few structures needed for |
| * CPU hotplug and the kernel stacks of new processes. No two CPUs will ever |
| * be populating the same addresses, so we only need to ensure |
| * that we protect between two CPUs trying to allocate and |
| * populate the same page table page. |
| * |
| * This lock is only taken when doing a set_p[4um]d(); it is not |
| * needed for a set_pte(). We assume that only the *owner* |
| * of a given allocation will be doing this for _their_ |
| * allocation. |
| * |
| * This ensures that once a system has been running for a while |
| * and there have been stacks all over and these page tables |
| * are fully populated, there will be no further acquisitions of |
| * this lock. |
| */ |
| static DEFINE_SPINLOCK(shadow_table_allocation_lock); |
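| /* |
| * The walker below takes this lock with a check/lock/recheck pattern |
| * so that the loser of an allocation race frees its extra page. |
| * Roughly, for each level: |
| * |
| *	if (p4d_none(*p4d)) { |
| *		unsigned long new_page = __get_free_page(gfp); |
| *		spin_lock(&shadow_table_allocation_lock); |
| *		if (p4d_none(*p4d)) |
| *			set_p4d(p4d, __p4d(_KERNPG_TABLE | __pa(new_page))); |
| *		else |
| *			free_page(new_page); |
| *		spin_unlock(&shadow_table_allocation_lock); |
| *	} |
| */ |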
| |
| /* |
| * This is a generic page table walker used only for walking kernel |
| * addresses. We use it to help recreate the "shadow" page tables |
| * which are used while we are in userspace. |
| * |
| * This can be called on any kernel memory address and will work |
| * with any page size and mapping type: normal linear map memory, |
| * vmalloc(), even kmap(). |
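| * |
| * For a direct-map address this returns the same thing as __pa(); |
| * the page-table walk is what makes it also work for vmalloc() |
| * addresses, where __pa() is not valid. |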
| * |
| * Note: this is only used when mapping new *kernel* entries into |
| * the user/shadow page tables. It is never used for userspace |
| * addresses. |
| * |
| * Returns -1 on error. |
| */ |
| static inline unsigned long get_pa_from_kernel_map(unsigned long vaddr) |
| { |
| pgd_t *pgd; |
| p4d_t *p4d; |
| pud_t *pud; |
| pmd_t *pmd; |
| pte_t *pte; |
| |
| WARN_ON_ONCE(vaddr < PAGE_OFFSET); |
| |
| pgd = pgd_offset_k(vaddr); |
| /* |
| * We made all the kernel PGDs present in kaiser_init(). |
| * We expect them to stay that way. |
| */ |
| if (pgd_none(*pgd)) { |
| WARN_ON_ONCE(1); |
| return -1; |
| } |
| /* |
| * A PGD entry covers either 512GB or 128TB on all x86_64 |
| * configurations. Huge mappings at the PGD level do not exist, |
| * so we do not handle them. |
| */ |
| if (pgd_large(*pgd)) { |
| WARN_ON_ONCE(1); |
| return -1; |
| } |
| |
| p4d = p4d_offset(pgd, vaddr); |
| if (p4d_none(*p4d)) { |
| WARN_ON_ONCE(1); |
| return -1; |
| } |
| |
| pud = pud_offset(p4d, vaddr); |
| if (pud_none(*pud)) { |
| WARN_ON_ONCE(1); |
| return -1; |
| } |
| |
| if (pud_large(*pud)) |
| return (pud_pfn(*pud) << PAGE_SHIFT) | (vaddr & ~PUD_PAGE_MASK); |
| |
| pmd = pmd_offset(pud, vaddr); |
| if (pmd_none(*pmd)) { |
| WARN_ON_ONCE(1); |
| return -1; |
| } |
| |
| if (pmd_large(*pmd)) |
| return (pmd_pfn(*pmd) << PAGE_SHIFT) | (vaddr & ~PMD_PAGE_MASK); |
| |
| pte = pte_offset_kernel(pmd, vaddr); |
| if (pte_none(*pte)) { |
| WARN_ON_ONCE(1); |
| return -1; |
| } |
| |
| return (pte_pfn(*pte) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK); |
| } |
| |
| /* |
| * Walk the shadow copy of the page tables for @address, allocating |
| * any missing intermediate page table pages on the way down (with |
| * atomic GFP flags if KAISER_WALK_ATOMIC is set). Does not support |
| * large pages since the data we are mapping is (generally) not |
| * large enough or aligned to 2MB. |
| * |
| * Note: this is only used when mapping *new* kernel data into the |
| * user/shadow page tables. It is never used for userspace data. |
| * |
| * Returns a pointer to a PTE on success, or NULL on failure. |
| */ |
| #define KAISER_WALK_ATOMIC 0x1 |
| static pte_t *kaiser_shadow_pagetable_walk(unsigned long address, |
| unsigned long flags) |
| { |
| pmd_t *pmd; |
| pud_t *pud; |
| p4d_t *p4d; |
| pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(address)); |
| gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); |
| |
| if (flags & KAISER_WALK_ATOMIC) { |
| gfp &= ~GFP_KERNEL; |
| gfp |= __GFP_HIGH | __GFP_ATOMIC; |
| } |
| |
| if (pgd_none(*pgd)) { |
| WARN_ONCE(1, "All shadow pgds should have been populated"); |
| return NULL; |
| } |
| BUILD_BUG_ON(pgd_large(*pgd) != 0); |
| |
| p4d = p4d_offset(pgd, address); |
| BUILD_BUG_ON(p4d_large(*p4d) != 0); |
| if (p4d_none(*p4d)) { |
| unsigned long new_pud_page = __get_free_page(gfp); |
| if (!new_pud_page) |
| return NULL; |
| |
| spin_lock(&shadow_table_allocation_lock); |
| if (p4d_none(*p4d)) |
| set_p4d(p4d, __p4d(_KERNPG_TABLE | __pa(new_pud_page))); |
| else |
| free_page(new_pud_page); |
| spin_unlock(&shadow_table_allocation_lock); |
| } |
| |
| pud = pud_offset(p4d, address); |
| /* The shadow page tables do not use large mappings: */ |
| if (pud_large(*pud)) { |
| WARN_ON(1); |
| return NULL; |
| } |
| if (pud_none(*pud)) { |
| unsigned long new_pmd_page = __get_free_page(gfp); |
| if (!new_pmd_page) |
| return NULL; |
| |
| spin_lock(&shadow_table_allocation_lock); |
| if (pud_none(*pud)) |
| set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page))); |
| else |
| free_page(new_pmd_page); |
| spin_unlock(&shadow_table_allocation_lock); |
| } |
| |
| pmd = pmd_offset(pud, address); |
| /* The shadow page tables do not use large mappings: */ |
| if (pmd_large(*pmd)) { |
| WARN_ON(1); |
| return NULL; |
| } |
| if (pmd_none(*pmd)) { |
| unsigned long new_pte_page = __get_free_page(gfp); |
| if (!new_pte_page) |
| return NULL; |
| |
| spin_lock(&shadow_table_allocation_lock); |
| if (pmd_none(*pmd)) |
| set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page))); |
| else |
| free_page(new_pte_page); |
| spin_unlock(&shadow_table_allocation_lock); |
| } |
| |
| return pte_offset_kernel(pmd, address); |
| } |
| |
| static int __kaiser_add_user_map(const void *__start_addr, unsigned long size, |
| unsigned long flags) |
| { |
| pte_t *pte; |
| unsigned long start_addr = (unsigned long)__start_addr; |
| unsigned long address = start_addr & PAGE_MASK; |
| unsigned long end_addr = PAGE_ALIGN(start_addr + size); |
| unsigned long target_address; |
| |
| for (; address < end_addr; address += PAGE_SIZE) { |
| target_address = get_pa_from_kernel_map(address); |
| if (target_address == -1) |
| return -EIO; |
| |
| pte = kaiser_shadow_pagetable_walk(address, 0); |
| /* |
| * Errors come from either -ENOMEM for a page |
| * table page, or something screwy that did a |
| * WARN_ON(). Just return -ENOMEM. |
| */ |
| if (!pte) |
| return -ENOMEM; |
| if (pte_none(*pte)) { |
| set_pte(pte, __pte(flags | target_address)); |
| } else { |
| pte_t tmp; |
| set_pte(&tmp, __pte(flags | target_address)); |
| WARN_ON_ONCE(!pte_same(*pte, tmp)); |
| } |
| } |
| return 0; |
| } |
| |
| /* |
| * Given a kernel address, @__start_addr, copy that mapping into |
| * the user (shadow) page tables. This may need to allocate page |
| * table pages. |
| */ |
| int kaiser_add_user_map(const void *__start_addr, unsigned long size, |
| unsigned long flags) |
| { |
| /* |
| * Since this mapping is the same between the user and kernel |
| * copies *and* is mapped to userspace anyway (and thus |
| * exposed to side-channels anyway), there is no danger in |
| * marking it _PAGE_GLOBAL. |
| * |
| * This is a potential performance win because it reduces TLB |
| * refills at kernel entry/exit: the entry/exit code itself is |
| * one of the things mapped this way. |
| */ |
| flags |= _PAGE_GLOBAL; |
| |
| return __kaiser_add_user_map(__start_addr, size, flags); |
| } |
| |
| /* |
| * The stack mapping is called in generic code and can't use |
| * __PAGE_KERNEL |
| */ |
| int kaiser_map_stack(struct task_struct *tsk) |
| { |
| /* |
| * Note: This intentionally avoids the _PAGE_GLOBAL bit being |
| * set via kaiser_add_user_map(). We do not want it set for |
| * stacks. |
| */ |
| return __kaiser_add_user_map(tsk->stack, THREAD_SIZE, |
| __PAGE_KERNEL); |
| } |
| |
| int kaiser_add_user_map_ptrs(const void *__start_addr, |
| const void *__end_addr, |
| unsigned long flags) |
| { |
| return kaiser_add_user_map(__start_addr, |
| __end_addr - __start_addr, |
| flags); |
| } |
| |
| static int kaiser_user_map_ptr_early(const void *start_addr, unsigned long size, |
| unsigned long flags) |
| { |
| int ret = kaiser_add_user_map(start_addr, size, flags); |
| WARN_ON(ret); |
| return ret; |
| } |
| |
| /* |
| * Ensure that the top level of the (shadow) page tables are |
| * entirely populated. This ensures that all processes that get |
| * forked have the same entries. This way, we do not have to |
| * ever go set up new entries in older processes. |
| * |
| * Note: we never free these, so there are no updates to them |
| * after this. |
| */ |
| static void __init kaiser_init_all_pgds(void) |
| { |
| pgd_t *pgd; |
| int i = 0; |
| |
| pgd = native_get_shadow_pgd(pgd_offset_k(0UL)); |
| for (i = PTRS_PER_PGD / 2; i < PTRS_PER_PGD; i++) { |
| unsigned long addr = PAGE_OFFSET + i * PGDIR_SIZE; |
| #if CONFIG_PGTABLE_LEVELS > 4 |
| p4d_t *p4d = p4d_alloc_one(&init_mm, addr); |
| if (!p4d) { |
| WARN_ON(1); |
| break; |
| } |
| set_pgd(pgd + i, __pgd(_KERNPG_TABLE | __pa(p4d))); |
| #else /* CONFIG_PGTABLE_LEVELS <= 4 */ |
| pud_t *pud = pud_alloc_one(&init_mm, addr); |
| if (!pud) { |
| WARN_ON(1); |
| break; |
| } |
| set_pgd(pgd + i, __pgd(_KERNPG_TABLE | __pa(pud))); |
| #endif /* CONFIG_PGTABLE_LEVELS */ |
| } |
| } |
| |
| /* |
| * The page table allocations in here can theoretically fail, but |
| * we can not do much about it in early boot. Do the checking |
| * and warning in a macro to make it more readable. |
| */ |
| #define kaiser_add_user_map_early(start, size, flags) do { \ |
| int __ret = kaiser_add_user_map(start, size, flags); \ |
| WARN_ON(__ret); \ |
| } while (0) |
| |
| #define kaiser_add_user_map_ptrs_early(start, end, flags) do { \ |
| int __ret = kaiser_add_user_map_ptrs(start, end, flags); \ |
| WARN_ON(__ret); \ |
| } while (0) |
| |
| extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[]; |
| /* |
| * If anything in here fails, we will likely die on one of the |
| * first kernel->user transitions and init will die. But, we |
| * will have most of the kernel up by then and should be able to |
| * get a clean warning out of it. If we BUG_ON() here, we run |
| * the risk of dying before we have good console output. |
| */ |
| void __init kaiser_init(void) |
| { |
| int cpu; |
| |
| kaiser_init_all_pgds(); |
| |
| kaiser_add_user_map_early(&kaiser_asm_do_switch, PAGE_SIZE, |
| __PAGE_KERNEL); |
| |
| for_each_possible_cpu(cpu) { |
| void *percpu_vaddr = __per_cpu_user_mapped_start + |
| per_cpu_offset(cpu); |
| unsigned long percpu_sz = __per_cpu_user_mapped_end - |
| __per_cpu_user_mapped_start; |
| kaiser_add_user_map_early(percpu_vaddr, percpu_sz, |
| __PAGE_KERNEL); |
| } |
| |
| kaiser_add_user_map_ptrs_early(__entry_text_start, __entry_text_end, |
| __PAGE_KERNEL_RX); |
| |
| /* the fixed map address of the idt_table */ |
| kaiser_add_user_map_early((void *)idt_descr.address, |
| sizeof(gate_desc) * NR_VECTORS, |
| __PAGE_KERNEL_RO); |
| |
| kaiser_user_map_ptr_early(&debug_idt_table, |
| sizeof(gate_desc) * NR_VECTORS, |
| __PAGE_KERNEL); |
| |
| /* |
| * We could theoretically do this in setup_fixmap_gdt(). |
| * But, we would need to rewrite the above page table |
| * allocation code to use the bootmem allocator. The |
| * buddy allocator is not available at the time that we |
| * call setup_fixmap_gdt() for CPU 0. |
| */ |
| kaiser_add_user_map_early(get_cpu_gdt_ro(0), PAGE_SIZE, |
| __PAGE_KERNEL_RO); |
| |
| /* |
| * .irqentry.text helps us identify code that runs before |
| * we get a chance to call entering_irq(). This includes |
| * the interrupt entry assembly plus the first C function |
| * that gets called. KAISER does not need the C code |
| * mapped. We just use the .irqentry.text section as-is |
| * to avoid having to carve out a new section for the |
| * assembly only. |
| */ |
| kaiser_add_user_map_ptrs_early(__irqentry_text_start, |
| __irqentry_text_end, |
| __PAGE_KERNEL_RX); |
| } |
| |
| int kaiser_add_mapping(unsigned long addr, unsigned long size, |
| unsigned long flags) |
| { |
| return kaiser_add_user_map((const void *)addr, size, flags); |
| } |
| |
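| /* |
| * kaiser_add_mapping()/kaiser_remove_mapping() are the generic |
| * runtime interface for other kernel code.  Mappings that do not |
| * live for the whole life of the system are typically paired, |
| * roughly (illustrative only; 'obj' and 'size' are placeholders): |
| * |
| *	ret = kaiser_add_mapping((unsigned long)obj, size, __PAGE_KERNEL); |
| *	if (ret) |
| *		return ret; |
| *	... |
| *	kaiser_remove_mapping((unsigned long)obj, size); |
| */ |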
| void kaiser_remove_mapping(unsigned long start, unsigned long size) |
| { |
| unsigned long addr; |
| |
| /* The shadow page tables always use small pages: */ |
| for (addr = start; addr < start + size; addr += PAGE_SIZE) { |
| /* |
| * Do an "atomic" walk in case this got called from an atomic |
| * context. This should not do any allocations because we |
| * should only be walking things that are known to be mapped. |
| */ |
| pte_t *pte = kaiser_shadow_pagetable_walk(addr, KAISER_WALK_ATOMIC); |
| |
| /* |
| * We are removing a mapping that should |
| * exist. WARN if it was not there: |
| */ |
| if (!pte) { |
| WARN_ON_ONCE(1); |
| continue; |
| } |
| |
| pte_clear(&init_mm, addr, pte); |
| } |
| /* |
| * This ensures that the TLB entries used to map this data are |
| * no longer usable on *this* CPU. We theoretically want to |
| * flush the entries on all CPUs here, but that's too |
| * expensive right now: this is called to unmap process |
| * stacks in the exit() path. |
| * |
| * This can change if we get to the point where this is not |
| * in a remotely hot path, like only called via write_ldt(). |
| * |
| * Note: we could probably also just invalidate the individual |
| * addresses to take care of *this* PCID and then do a |
| * tlb_flush_shared_nonglobals() to ensure that all other |
| * PCIDs get flushed before being used again. |
| */ |
| __native_flush_tlb_global(); |
| } |
| |
| int kaiser_enabled = 1; |
| static ssize_t kaiser_enabled_read_file(struct file *file, char __user *user_buf, |
| size_t count, loff_t *ppos) |
| { |
| char buf[32]; |
| unsigned int len; |
| |
| len = sprintf(buf, "%d\n", kaiser_enabled); |
| return simple_read_from_buffer(user_buf, count, ppos, buf, len); |
| } |
| |
| enum poison { |
| KAISER_POISON, |
| KAISER_UNPOISON |
| }; |
| void kaiser_poison_pgds(enum poison do_poison); |
| |
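| /* |
| * Ordering matters in kaiser_do_disable()/kaiser_do_enable() below. |
| * On disable, the kernel PGDs are unpoisoned and flushed *before* the |
| * assembly is told to stop switching CR3, so userspace never runs on |
| * a still-poisoned kernel CR3.  On enable, the assembly is told to |
| * start switching first, and the kernel PGDs are only poisoned once |
| * every CPU is guaranteed to have seen that change. |
| */ |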
| void kaiser_do_disable(void) |
| { |
| /* Make sure the kernel PGDs are usable by userspace: */ |
| kaiser_poison_pgds(KAISER_UNPOISON); |
| |
| /* |
| * Make sure all the CPUs have the poison clear in their TLBs. |
| * This also functions as a barrier to ensure that everyone |
| * sees the unpoisoned PGDs. |
| */ |
| flush_tlb_all(); |
| |
| /* Tell the assembly code to stop switching CR3. */ |
| kaiser_asm_do_switch[0] = 0; |
| |
| /* |
| * Make sure everybody does an interrupt. This means that |
| * they have gone through a SWITCH_TO_KERNEL_CR3 and are no |
| * longer running on the userspace CR3. If we did not do |
| * this, we might have CPUs running on the shadow page tables |
| * that then enter the kernel and think they do *not* need to |
| * switch. |
| */ |
| flush_tlb_all(); |
| } |
| |
| void kaiser_do_enable(void) |
| { |
| /* Tell the assembly code to start switching CR3: */ |
| kaiser_asm_do_switch[0] = 1; |
| |
| /* Make sure everyone can see the kaiser_asm_do_switch update: */ |
| synchronize_rcu(); |
| |
| /* |
| * Now that userspace is no longer using the kernel copy of |
| * the page tables, we can poison it: |
| */ |
| kaiser_poison_pgds(KAISER_POISON); |
| |
| /* Make sure all the CPUs see the poison: */ |
| flush_tlb_all(); |
| } |
| |
| static ssize_t kaiser_enabled_write_file(struct file *file, |
| const char __user *user_buf, size_t count, loff_t *ppos) |
| { |
| char buf[32]; |
| ssize_t len; |
| unsigned int enable; |
| |
| len = min(count, sizeof(buf) - 1); |
| if (copy_from_user(buf, user_buf, len)) |
| return -EFAULT; |
| |
| buf[len] = '\0'; |
| if (kstrtouint(buf, 0, &enable)) |
| return -EINVAL; |
| |
| if (enable > 1) |
| return -EINVAL; |
| |
| if (kaiser_enabled == enable) |
| return count; |
| |
| /* |
| * This tells the page table code whether it should be |
| * poisoning PGDs from here on: |
| */ |
| WRITE_ONCE(kaiser_enabled, enable); |
| synchronize_rcu(); |
| |
| if (enable) |
| kaiser_do_enable(); |
| else |
| kaiser_do_disable(); |
| |
| return count; |
| } |
| |
| static const struct file_operations fops_kaiser_enabled = { |
| .read = kaiser_enabled_read_file, |
| .write = kaiser_enabled_write_file, |
| .llseek = default_llseek, |
| }; |
| |
| static int __init create_kaiser_enabled(void) |
| { |
| debugfs_create_file("kaiser-enabled", S_IRUSR | S_IWUSR, |
| arch_debugfs_dir, NULL, &fops_kaiser_enabled); |
| return 0; |
| } |
| late_initcall(create_kaiser_enabled); |
| |
| void kaiser_poison_pgd_page(pgd_t *pgd_page, enum poison do_poison) |
| { |
| int i = 0; |
| |
| for (i = 0; i < PTRS_PER_PGD; i++) { |
| pgd_t *pgd = &pgd_page[i]; |
| |
| /* Stop once we hit kernel addresses: */ |
| if (!pgdp_maps_userspace(pgd)) |
| break; |
| |
| if (do_poison == KAISER_POISON) |
| kaiser_poison_pgd(pgd); |
| else |
| kaiser_unpoison_pgd(pgd); |
| } |
| } |
| |
| void kaiser_poison_pgds(enum poison do_poison) |
| { |
| struct page *page; |
| |
| spin_lock(&pgd_lock); |
| list_for_each_entry(page, &pgd_list, lru) { |
| pgd_t *pgd = (pgd_t *)page_address(page); |
| kaiser_poison_pgd_page(pgd, do_poison); |
| } |
| spin_unlock(&pgd_lock); |
| } |