/*
* Copyright(c) 2017 Intel Corporation. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of version 2 of the GNU General Public License as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* Based on work published here: https://github.com/IAIK/KAISER
* Modified by Dave Hansen <dave.hansen@intel.com> to actually work.
*/
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/bug.h>
#include <linux/debugfs.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/spinlock.h>
#include <linux/mm.h>
#include <linux/uaccess.h>
#include <asm/kaiser.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/desc.h>
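/*
* Page-aligned flag page that gets mapped into the user (shadow)
* page tables. The entry/exit assembly reads element 0 to decide
* whether to switch CR3; kaiser_do_enable()/kaiser_do_disable()
* write it at runtime.
*/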
__aligned(PAGE_SIZE)
unsigned long kaiser_asm_do_switch[PAGE_SIZE/sizeof(unsigned long)] = { 1 };
/*
* At runtime, the only things we map are some things for CPU
* hotplug, and stacks for new processes. No two CPUs will ever
* be populating the same addresses, so we only need to ensure
* that we protect between two CPUs trying to allocate and
* populate the same page table page.
*
* We only take this lock when doing a set_p[4um]d(); it is not
* needed for a set_pte(). We assume that only the *owner*
* of a given allocation will be doing this for _their_
* allocation.
*
* This ensures that once a system has been running for a while
* and there have been stacks all over and these page tables
* are fully populated, there will be no further acquisitions of
* this lock.
*/
static DEFINE_SPINLOCK(shadow_table_allocation_lock);
/*
* This is a generic page table walker used only for walking kernel
* addresses. We use it to help recreate the "shadow" page tables
* which are used while we are in userspace.
*
* This can be called on any kernel memory address and works with
* all page sizes and mapping types: normal linear map memory,
* vmalloc(), even kmap().
*
* Note: this is only used when mapping new *kernel* entries into
* the user/shadow page tables. It is never used for userspace
* addresses.
*
* Returns -1 on error.
*/
static inline unsigned long get_pa_from_kernel_map(unsigned long vaddr)
{
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
WARN_ON_ONCE(vaddr < PAGE_OFFSET);
pgd = pgd_offset_k(vaddr);
/*
* We made all the kernel PGDs present in kaiser_init().
* We expect them to stay that way.
*/
if (pgd_none(*pgd)) {
WARN_ON_ONCE(1);
return -1;
}
/*
* A PGD entry covers either 512GB or 128TB on all x86_64
* configurations. We don't handle huge mappings at this level.
*/
if (pgd_large(*pgd)) {
WARN_ON_ONCE(1);
return -1;
}
p4d = p4d_offset(pgd, vaddr);
if (p4d_none(*p4d)) {
WARN_ON_ONCE(1);
return -1;
}
pud = pud_offset(p4d, vaddr);
if (pud_none(*pud)) {
WARN_ON_ONCE(1);
return -1;
}
if (pud_large(*pud))
return (pud_pfn(*pud) << PAGE_SHIFT) | (vaddr & ~PUD_PAGE_MASK);
pmd = pmd_offset(pud, vaddr);
if (pmd_none(*pmd)) {
WARN_ON_ONCE(1);
return -1;
}
if (pmd_large(*pmd))
return (pmd_pfn(*pmd) << PAGE_SHIFT) | (vaddr & ~PMD_PAGE_MASK);
pte = pte_offset_kernel(pmd, vaddr);
if (pte_none(*pte)) {
WARN_ON_ONCE(1);
return -1;
}
return (pte_pfn(*pte) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK);
}
/*
* Walk the shadow copy of the page tables, allocating any missing
* page table pages on the way down (atomically if the caller
* passes KAISER_WALK_ATOMIC). Does not support
* large pages since the data we are mapping is (generally) not
* large enough or aligned to 2MB.
*
* Note: this is only used when mapping *new* kernel data into the
* user/shadow page tables. It is never used for userspace data.
*
* Returns a pointer to a PTE on success, or NULL on failure.
*/
#define KAISER_WALK_ATOMIC 0x1
static pte_t *kaiser_shadow_pagetable_walk(unsigned long address,
unsigned long flags)
{
pmd_t *pmd;
pud_t *pud;
p4d_t *p4d;
pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(address));
gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
if (flags & KAISER_WALK_ATOMIC) {
gfp &= ~GFP_KERNEL;
gfp |= __GFP_HIGH | __GFP_ATOMIC;
}
if (pgd_none(*pgd)) {
WARN_ONCE(1, "All shadow pgds should have been populated");
return NULL;
}
BUILD_BUG_ON(pgd_large(*pgd) != 0);
p4d = p4d_offset(pgd, address);
BUILD_BUG_ON(p4d_large(*p4d) != 0);
if (p4d_none(*p4d)) {
unsigned long new_pud_page = __get_free_page(gfp);
if (!new_pud_page)
return NULL;
spin_lock(&shadow_table_allocation_lock);
if (p4d_none(*p4d))
set_p4d(p4d, __p4d(_KERNPG_TABLE | __pa(new_pud_page)));
else
free_page(new_pud_page);
spin_unlock(&shadow_table_allocation_lock);
}
pud = pud_offset(p4d, address);
/* The shadow page tables do not use large mappings: */
if (pud_large(*pud)) {
WARN_ON(1);
return NULL;
}
if (pud_none(*pud)) {
unsigned long new_pmd_page = __get_free_page(gfp);
if (!new_pmd_page)
return NULL;
spin_lock(&shadow_table_allocation_lock);
if (pud_none(*pud))
set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page)));
else
free_page(new_pmd_page);
spin_unlock(&shadow_table_allocation_lock);
}
pmd = pmd_offset(pud, address);
/* The shadow page tables do not use large mappings: */
if (pmd_large(*pmd)) {
WARN_ON(1);
return NULL;
}
if (pmd_none(*pmd)) {
unsigned long new_pte_page = __get_free_page(gfp);
if (!new_pte_page)
return NULL;
spin_lock(&shadow_table_allocation_lock);
if (pmd_none(*pmd))
set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page)));
else
free_page(new_pte_page);
spin_unlock(&shadow_table_allocation_lock);
}
return pte_offset_kernel(pmd, address);
}
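/*
* Map the kernel range [__start_addr, __start_addr + size) into the
* shadow page tables one page at a time. If a PTE is already
* present, warn if it does not match what we would have installed.
*/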
static int __kaiser_add_user_map(const void *__start_addr, unsigned long size,
unsigned long flags)
{
pte_t *pte;
unsigned long start_addr = (unsigned long)__start_addr;
unsigned long address = start_addr & PAGE_MASK;
unsigned long end_addr = PAGE_ALIGN(start_addr + size);
unsigned long target_address;
for (; address < end_addr; address += PAGE_SIZE) {
target_address = get_pa_from_kernel_map(address);
if (target_address == -1)
return -EIO;
pte = kaiser_shadow_pagetable_walk(address, 0);
/*
* Errors come from either -ENOMEM for a page
* table page, or something screwy that did a
* WARN_ON(). Just return -ENOMEM.
*/
if (!pte)
return -ENOMEM;
if (pte_none(*pte)) {
set_pte(pte, __pte(flags | target_address));
} else {
pte_t tmp;
set_pte(&tmp, __pte(flags | target_address));
WARN_ON_ONCE(!pte_same(*pte, tmp));
}
}
return 0;
}
/*
* Given a kernel address, @__start_addr, copy that mapping into
* the user (shadow) page tables. This may need to allocate page
* table pages.
*/
int kaiser_add_user_map(const void *__start_addr, unsigned long size,
unsigned long flags)
{
/*
* Since this mapping is the same in both the user and kernel
* copies *and* is mapped to userspace anyway (and thus already
* exposed to side channels), there is no danger in marking it
* _PAGE_GLOBAL.
*
* This has a potential performance benefit: global mappings
* survive the CR3 switch at entry/exit, so the TLB entries for
* this code do not have to be reloaded each time.
*/
flags |= _PAGE_GLOBAL;
return __kaiser_add_user_map(__start_addr, size, flags);
}
/*
* The stack mapping is requested from generic code and can not go
* through kaiser_add_user_map(), which would set _PAGE_GLOBAL.
*/
int kaiser_map_stack(struct task_struct *tsk)
{
/*
* Note: This intentionally avoids the _PAGE_GLOBAL bit being
* set via kaiser_add_user_map(). We do not want it set for
* stacks.
*/
return __kaiser_add_user_map(tsk->stack, THREAD_SIZE,
__PAGE_KERNEL);
}
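/*
* Same as kaiser_add_user_map(), but takes start/end pointers
* instead of a start pointer and a size.
*/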
int kaiser_add_user_map_ptrs(const void *__start_addr,
const void *__end_addr,
unsigned long flags)
{
return kaiser_add_user_map(__start_addr,
__end_addr - __start_addr,
flags);
}
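/*
* Used during early boot: warn if the mapping fails, since the
* callers do not do anything useful with the error themselves.
*/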
static int kaiser_user_map_ptr_early(const void *start_addr, unsigned long size,
unsigned long flags)
{
int ret = kaiser_add_user_map(start_addr, size, flags);
WARN_ON(ret);
return ret;
}
/*
* Ensure that the top level of the (shadow) page tables are
* entirely populated. This ensures that all processes that get
* forked have the same entries. This way, we never have to go
* back and set up new entries in older processes.
*
* Note: we never free these, so there are no updates to them
* after this.
*/
static void __init kaiser_init_all_pgds(void)
{
pgd_t *pgd;
int i = 0;
pgd = native_get_shadow_pgd(pgd_offset_k(0UL));
for (i = PTRS_PER_PGD / 2; i < PTRS_PER_PGD; i++) {
unsigned long addr = PAGE_OFFSET + i * PGDIR_SIZE;
#if CONFIG_PGTABLE_LEVELS > 4
p4d_t *p4d = p4d_alloc_one(&init_mm, addr);
if (!p4d) {
WARN_ON(1);
break;
}
set_pgd(pgd + i, __pgd(_KERNPG_TABLE | __pa(p4d)));
#else /* CONFIG_PGTABLE_LEVELS <= 4 */
pud_t *pud = pud_alloc_one(&init_mm, addr);
if (!pud) {
WARN_ON(1);
break;
}
set_pgd(pgd + i, __pgd(_KERNPG_TABLE | __pa(pud)));
#endif /* CONFIG_PGTABLE_LEVELS */
}
}
/*
* The page table allocations in here can theoretically fail, but
* we can not do much about it in early boot. Do the checking
* and warning in a macro to make it more readable.
*/
#define kaiser_add_user_map_early(start, size, flags) do { \
int __ret = kaiser_add_user_map(start, size, flags); \
WARN_ON(__ret); \
} while (0)
#define kaiser_add_user_map_ptrs_early(start, end, flags) do { \
int __ret = kaiser_add_user_map_ptrs(start, end, flags); \
WARN_ON(__ret); \
} while (0)
extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[];
/*
* If anything in here fails, we will likely die on one of the
* first kernel->user transitions and init will die. But, we
* will have most of the kernel up by then and should be able to
* get a clean warning out of it. If we BUG_ON() here, we run
* the risk of doing so before we have good console output.
*/
void __init kaiser_init(void)
{
int cpu;
kaiser_init_all_pgds();
kaiser_add_user_map_early(&kaiser_asm_do_switch, PAGE_SIZE,
__PAGE_KERNEL);
for_each_possible_cpu(cpu) {
void *percpu_vaddr = __per_cpu_user_mapped_start +
per_cpu_offset(cpu);
unsigned long percpu_sz = __per_cpu_user_mapped_end -
__per_cpu_user_mapped_start;
kaiser_add_user_map_early(percpu_vaddr, percpu_sz,
__PAGE_KERNEL);
}
kaiser_add_user_map_ptrs_early(__entry_text_start, __entry_text_end,
__PAGE_KERNEL_RX);
/* The fixmap address of the idt_table: */
kaiser_add_user_map_early((void *)idt_descr.address,
sizeof(gate_desc) * NR_VECTORS,
__PAGE_KERNEL_RO);
kaiser_user_map_ptr_early(&debug_idt_table,
sizeof(gate_desc) * NR_VECTORS,
__PAGE_KERNEL);
/*
* We could theoretically do this in setup_fixmap_gdt().
* But, we would need to rewrite the above page table
* allocation code to use the bootmem allocator. The
* buddy allocator is not available at the time that we
* call setup_fixmap_gdt() for CPU 0.
*/
kaiser_add_user_map_early(get_cpu_gdt_ro(0), PAGE_SIZE,
__PAGE_KERNEL_RO);
/*
* .irqentry.text helps us identify code that runs before
* we get a chance to call entering_irq(). This includes
* the interrupt entry assembly plus the first C function
* that gets called. KAISER does not need the C code
* mapped. We just use the .irqentry.text section as-is
* to avoid having to carve out a new section for the
* assembly only.
*/
kaiser_add_user_map_ptrs_early(__irqentry_text_start,
__irqentry_text_end,
__PAGE_KERNEL_RX);
}
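/*
* kaiser_add_mapping()/kaiser_remove_mapping() are the interfaces
* used from outside this file. They take a plain unsigned long
* address rather than a pointer.
*
* A sketch of how a (hypothetical) caller might use them:
*
*	ret = kaiser_add_mapping((unsigned long)ptr, size, __PAGE_KERNEL);
*	if (ret)
*		return ret;
*	...
*	kaiser_remove_mapping((unsigned long)ptr, size);
*/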
int kaiser_add_mapping(unsigned long addr, unsigned long size,
unsigned long flags)
{
return kaiser_add_user_map((const void *)addr, size, flags);
}
void kaiser_remove_mapping(unsigned long start, unsigned long size)
{
unsigned long addr;
/* The shadow page tables always use small pages: */
for (addr = start; addr < start + size; addr += PAGE_SIZE) {
/*
* Do an "atomic" walk in case this got called from an atomic
* context. This should not do any allocations because we
* should only be walking things that are known to be mapped.
*/
pte_t *pte = kaiser_shadow_pagetable_walk(addr, KAISER_WALK_ATOMIC);
/*
* We are removing a mapping that should
* exist. WARN if it was not there:
*/
if (!pte) {
WARN_ON_ONCE(1);
continue;
}
pte_clear(&init_mm, addr, pte);
}
/*
* This ensures that the TLB entries used to map this data are
* no longer usable on *this* CPU. We theoretically want to
* flush the entries on all CPUs here, but that's too
* expensive right now: this is called to unmap process
* stacks in the exit() path.
*
* This can change if we get to the point where this is not
* in a remotely hot path, like only called via write_ldt().
*
* Note: we could probably also just invalidate the individual
* addresses to take care of *this* PCID and then do a
* tlb_flush_shared_nonglobals() to ensure that all other
* PCIDs get flushed before being used again.
*/
__native_flush_tlb_global();
}
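/*
* Runtime control for KAISER, exposed below via debugfs as
* x86/kaiser-enabled. Defaults to on.
*/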
int kaiser_enabled = 1;
static ssize_t kaiser_enabled_read_file(struct file *file, char __user *user_buf,
size_t count, loff_t *ppos)
{
char buf[32];
unsigned int len;
len = sprintf(buf, "%d\n", kaiser_enabled);
return simple_read_from_buffer(user_buf, count, ppos, buf, len);
}
enum poison {
KAISER_POISON,
KAISER_UNPOISON
};
void kaiser_poison_pgds(enum poison do_poison);
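/*
* Transition a live system from KAISER enabled to disabled:
* unpoison the kernel PGDs so userspace can use them, then tell
* the entry assembly to stop switching CR3.
*/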
void kaiser_do_disable(void)
{
/* Make sure the kernel PGDs are usable by userspace: */
kaiser_poison_pgds(KAISER_UNPOISON);
/*
* Make sure all the CPUs have the poison clear in their TLBs.
* This also functions as a barrier to ensure that everyone
* sees the unpoisoned PGDs.
*/
flush_tlb_all();
/* Tell the assembly code to stop switching CR3. */
kaiser_asm_do_switch[0] = 0;
/*
* Make sure every CPU takes an interrupt. This means that
* they have gone through a SWITCH_TO_KERNEL_CR3 and are no
* longer running on the userspace CR3. If we did not do
* this, we might have CPUs running on the shadow page tables
* that then enter the kernel and think they do *not* need to
* switch.
*/
flush_tlb_all();
}
void kaiser_do_enable(void)
{
/* Tell the assembly code to start switching CR3: */
kaiser_asm_do_switch[0] = 1;
/* Make sure everyone can see the kaiser_asm_do_switch update: */
synchronize_rcu();
/*
* Now that userspace is no longer using the kernel copy of
* the page tables, we can poison it:
*/
kaiser_poison_pgds(KAISER_POISON);
/* Make sure all the CPUs see the poison: */
flush_tlb_all();
}
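/*
* debugfs write handler: accepts 0 or 1, updates kaiser_enabled and
* then transitions the system with kaiser_do_enable()/
* kaiser_do_disable() as needed.
*/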
static ssize_t kaiser_enabled_write_file(struct file *file,
const char __user *user_buf, size_t count, loff_t *ppos)
{
char buf[32];
ssize_t len;
unsigned int enable;
len = min(count, sizeof(buf) - 1);
if (copy_from_user(buf, user_buf, len))
return -EFAULT;
buf[len] = '\0';
if (kstrtouint(buf, 0, &enable))
return -EINVAL;
if (enable > 1)
return -EINVAL;
if (kaiser_enabled == enable)
return count;
/*
* This tells the page table code whether or not to poison PGDs:
*/
WRITE_ONCE(kaiser_enabled, enable);
synchronize_rcu();
if (enable)
kaiser_do_enable();
else
kaiser_do_disable();
return count;
}
static const struct file_operations fops_kaiser_enabled = {
.read = kaiser_enabled_read_file,
.write = kaiser_enabled_write_file,
.llseek = default_llseek,
};
static int __init create_kaiser_enabled(void)
{
debugfs_create_file("kaiser-enabled", S_IRUSR | S_IWUSR,
arch_debugfs_dir, NULL, &fops_kaiser_enabled);
return 0;
}
late_initcall(create_kaiser_enabled);
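/*
* Poison or unpoison the userspace entries of a single PGD page.
* Userspace entries come first, so stop at the first kernel entry.
*/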
void kaiser_poison_pgd_page(pgd_t *pgd_page, enum poison do_poison)
{
int i = 0;
for (i = 0; i < PTRS_PER_PGD; i++) {
pgd_t *pgd = &pgd_page[i];
/* Stop once we hit kernel addresses: */
if (!pgdp_maps_userspace(pgd))
break;
if (do_poison == KAISER_POISON)
kaiser_poison_pgd(pgd);
else
kaiser_unpoison_pgd(pgd);
}
}
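/*
* Apply the poisoning to every PGD in the system by walking
* pgd_list under pgd_lock.
*/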
void kaiser_poison_pgds(enum poison do_poison)
{
struct page *page;
spin_lock(&pgd_lock);
list_for_each_entry(page, &pgd_list, lru) {
pgd_t *pgd = (pgd_t *)page_address(page);
kaiser_poison_pgd_page(pgd, do_poison);
}
spin_unlock(&pgd_lock);
}