| From foo@baz Wed Jan 3 18:58:12 CET 2018 |
| From: Dave Hansen <dave.hansen@linux.intel.com> |
| Date: Wed, 30 Aug 2017 16:23:00 -0700 |
| Subject: kaiser: merged update |
| |
| From: Dave Hansen <dave.hansen@linux.intel.com> |
| |
| |
| Merged fixes and cleanups, rebased to 4.4.89 tree (no 5-level paging). |
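| |
| The central trick throughout this series: the kernel keeps every PGD 8k |
| long and 8k-aligned, with the low 4k holding the kernel copy and the high |
| 4k the shadow (user) copy, so moving between the two halves is just |
| toggling bit 12 of the PGD pointer (and, with CONFIG_KAISER_REAL_SWITCH, |
| of CR3). A minimal, userspace-compilable C sketch of that pointer |
| arithmetic; illustrative only, the helper names here are not from the |
| kernel tree, the real helpers are native_get_shadow_pgd() and |
| native_get_normal_pgd() in the diff below: |
| |
| #include <assert.h> |
| #include <stdint.h> |
| |
| #define PAGE_SIZE 4096UL |
| |
| /* Illustrative sketch only; mirrors the bit-12 flip described above. */ |
| static inline uintptr_t user_pgd(uintptr_t pgd) { return pgd | PAGE_SIZE; } |
| static inline uintptr_t kern_pgd(uintptr_t pgd) { return pgd & ~PAGE_SIZE; } |
| |
| int main(void) |
| { |
| uintptr_t pgd = 0x12344000UL; /* a pretend 8k-aligned kernel PGD */ |
| |
| assert(user_pgd(pgd) == pgd + PAGE_SIZE); |
| assert(kern_pgd(user_pgd(pgd)) == pgd); |
| return 0; |
| } |
| |
| This is also why _pgd_alloc() below moves to an order-1 allocation and |
| why the shadow page-table walk in kaiser.c starts from |
| native_get_shadow_pgd(). |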
| |
| Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com> |
| Signed-off-by: Hugh Dickins <hughd@google.com> |
| Acked-by: Jiri Kosina <jkosina@suse.cz> |
| Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> |
| --- |
| arch/x86/entry/entry_64.S | 106 ++++++++++- |
| arch/x86/include/asm/kaiser.h | 43 ++-- |
| arch/x86/include/asm/pgtable.h | 18 + |
| arch/x86/include/asm/pgtable_64.h | 48 ++++- |
| arch/x86/include/asm/pgtable_types.h | 6 |
| arch/x86/kernel/espfix_64.c | 13 - |
| arch/x86/kernel/head_64.S | 19 +- |
| arch/x86/kernel/ldt.c | 27 ++ |
| arch/x86/kernel/tracepoint.c | 2 |
| arch/x86/mm/kaiser.c | 318 +++++++++++++++++++++++++---------- |
| arch/x86/mm/pageattr.c | 63 +++++- |
| arch/x86/mm/pgtable.c | 40 +--- |
| include/linux/kaiser.h | 26 ++ |
| kernel/fork.c | 9 |
| security/Kconfig | 5 |
| 15 files changed, 553 insertions(+), 190 deletions(-) |
| create mode 100644 include/linux/kaiser.h |
| |
| --- a/arch/x86/entry/entry_64.S |
| +++ b/arch/x86/entry/entry_64.S |
| @@ -212,6 +212,13 @@ entry_SYSCALL_64_fastpath: |
| movq RIP(%rsp), %rcx |
| movq EFLAGS(%rsp), %r11 |
| RESTORE_C_REGS_EXCEPT_RCX_R11 |
| + /* |
| + * This opens a window where we have a user CR3, but are |
| + * running in the kernel. This makes using the CS |
| + * register useless for telling whether or not we need to |
| + * switch CR3 in NMIs. Normal interrupts are OK because |
| + * they are off here. |
| + */ |
| SWITCH_USER_CR3 |
| movq RSP(%rsp), %rsp |
| /* |
| @@ -350,11 +357,25 @@ GLOBAL(int_ret_from_sys_call) |
| syscall_return_via_sysret: |
| /* rcx and r11 are already restored (see code above) */ |
| RESTORE_C_REGS_EXCEPT_RCX_R11 |
| + /* |
| + * This opens a window where we have a user CR3, but are |
| + * running in the kernel. This makes using the CS |
| + * register useless for telling whether or not we need to |
| + * switch CR3 in NMIs. Normal interrupts are OK because |
| + * they are off here. |
| + */ |
| SWITCH_USER_CR3 |
| movq RSP(%rsp), %rsp |
| USERGS_SYSRET64 |
| |
| opportunistic_sysret_failed: |
| + /* |
| + * This opens a window where we have a user CR3, but are |
| + * running in the kernel. This makes using the CS |
| + * register useless for telling whether or not we need to |
| + * switch CR3 in NMIs. Normal interrupts are OK because |
| + * they are off here. |
| + */ |
| SWITCH_USER_CR3 |
| SWAPGS |
| jmp restore_c_regs_and_iret |
| @@ -1059,6 +1080,13 @@ ENTRY(error_entry) |
| cld |
| SAVE_C_REGS 8 |
| SAVE_EXTRA_REGS 8 |
| + /* |
| + * error_entry() always returns with a kernel gsbase and |
| + * CR3. We must also have a kernel CR3/gsbase before |
| + * calling TRACE_IRQS_*. Just unconditionally switch to |
| + * the kernel CR3 here. |
| + */ |
| + SWITCH_KERNEL_CR3 |
| xorl %ebx, %ebx |
| testb $3, CS+8(%rsp) |
| jz .Lerror_kernelspace |
| @@ -1069,7 +1097,6 @@ ENTRY(error_entry) |
| * from user mode due to an IRET fault. |
| */ |
| SWAPGS |
| - SWITCH_KERNEL_CR3 |
| |
| .Lerror_entry_from_usermode_after_swapgs: |
| /* |
| @@ -1122,7 +1149,7 @@ ENTRY(error_entry) |
| * Switch to kernel gsbase: |
| */ |
| SWAPGS |
| - SWITCH_KERNEL_CR3 |
| + |
| /* |
| * Pretend that the exception came from user mode: set up pt_regs |
| * as if we faulted immediately after IRET and clear EBX so that |
| @@ -1222,7 +1249,10 @@ ENTRY(nmi) |
| */ |
| |
| SWAPGS_UNSAFE_STACK |
| - SWITCH_KERNEL_CR3_NO_STACK |
| + /* |
| + * percpu variables are mapped with user CR3, so no need |
| + * to switch CR3 here. |
| + */ |
| cld |
| movq %rsp, %rdx |
| movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp |
| @@ -1256,14 +1286,33 @@ ENTRY(nmi) |
| |
| movq %rsp, %rdi |
| movq $-1, %rsi |
| +#ifdef CONFIG_KAISER |
| + /* Unconditionally use kernel CR3 for do_nmi() */ |
| + /* %rax is saved above, so OK to clobber here */ |
| + movq %cr3, %rax |
| + pushq %rax |
| +#ifdef CONFIG_KAISER_REAL_SWITCH |
| + andq $(~0x1000), %rax |
| +#endif |
| + movq %rax, %cr3 |
| +#endif |
| call do_nmi |
| + /* |
| + * Unconditionally restore CR3. I know we return to |
| + * kernel code that needs user CR3, but do we ever return |
| + * to "user mode" where we need the kernel CR3? |
| + */ |
| +#ifdef CONFIG_KAISER |
| + popq %rax |
| + mov %rax, %cr3 |
| +#endif |
| |
| /* |
| * Return back to user mode. We must *not* do the normal exit |
| - * work, because we don't want to enable interrupts. Fortunately, |
| - * do_nmi doesn't modify pt_regs. |
| + * work, because we don't want to enable interrupts. Do not |
| + * switch to user CR3: we might be going back to kernel code |
| + * that had a user CR3 set. |
| */ |
| - SWITCH_USER_CR3 |
| SWAPGS |
| jmp restore_c_regs_and_iret |
| |
| @@ -1459,23 +1508,54 @@ end_repeat_nmi: |
| ALLOC_PT_GPREGS_ON_STACK |
| |
| /* |
| - * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit |
| - * as we should not be calling schedule in NMI context. |
| - * Even with normal interrupts enabled. An NMI should not be |
| - * setting NEED_RESCHED or anything that normal interrupts and |
| - * exceptions might do. |
| + * Use the same approach as paranoid_entry to handle SWAPGS, but |
| + * without CR3 handling since we do that differently in NMIs. No |
| + * need to use paranoid_exit as we should not be calling schedule |
| + * in NMI context. Even with normal interrupts enabled. An NMI |
| + * should not be setting NEED_RESCHED or anything that normal |
| + * interrupts and exceptions might do. |
| */ |
| - call paranoid_entry |
| + cld |
| + SAVE_C_REGS |
| + SAVE_EXTRA_REGS |
| + movl $1, %ebx |
| + movl $MSR_GS_BASE, %ecx |
| + rdmsr |
| + testl %edx, %edx |
| + js 1f /* negative -> in kernel */ |
| + SWAPGS |
| + xorl %ebx, %ebx |
| +1: |
| +#ifdef CONFIG_KAISER |
| + /* Unconditionally use kernel CR3 for do_nmi() */ |
| + /* %rax is saved above, so OK to clobber here */ |
| + movq %cr3, %rax |
| + pushq %rax |
| +#ifdef CONFIG_KAISER_REAL_SWITCH |
| + andq $(~0x1000), %rax |
| +#endif |
| + movq %rax, %cr3 |
| +#endif |
| |
| /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ |
| movq %rsp, %rdi |
| + addq $8, %rdi /* point %rdi at ptregs, fixed up for CR3 */ |
| movq $-1, %rsi |
| call do_nmi |
| + /* |
| + * Unconditionally restore CR3. We might be returning to |
| + * kernel code that needs user CR3, like just before |
| + * a sysret. |
| + */ |
| +#ifdef CONFIG_KAISER |
| + popq %rax |
| + mov %rax, %cr3 |
| +#endif |
| |
| testl %ebx, %ebx /* swapgs needed? */ |
| jnz nmi_restore |
| nmi_swapgs: |
| - SWITCH_USER_CR3_NO_STACK |
| + /* We fixed up CR3 above, so no need to switch it here */ |
| SWAPGS_UNSAFE_STACK |
| nmi_restore: |
| RESTORE_EXTRA_REGS |
| --- a/arch/x86/include/asm/kaiser.h |
| +++ b/arch/x86/include/asm/kaiser.h |
| @@ -16,13 +16,17 @@ |
| |
| .macro _SWITCH_TO_KERNEL_CR3 reg |
| movq %cr3, \reg |
| +#ifdef CONFIG_KAISER_REAL_SWITCH |
| andq $(~0x1000), \reg |
| +#endif |
| movq \reg, %cr3 |
| .endm |
| |
| .macro _SWITCH_TO_USER_CR3 reg |
| movq %cr3, \reg |
| +#ifdef CONFIG_KAISER_REAL_SWITCH |
| orq $(0x1000), \reg |
| +#endif |
| movq \reg, %cr3 |
| .endm |
| |
| @@ -65,48 +69,53 @@ movq PER_CPU_VAR(unsafe_stack_register_b |
| .endm |
| |
| #endif /* CONFIG_KAISER */ |
| + |
| #else /* __ASSEMBLY__ */ |
| |
| |
| #ifdef CONFIG_KAISER |
| -// Upon kernel/user mode switch, it may happen that |
| -// the address space has to be switched before the registers have been stored. |
| -// To change the address space, another register is needed. |
| -// A register therefore has to be stored/restored. |
| -// |
| -DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); |
| +/* |
| + * Upon kernel/user mode switch, it may happen that the address |
| + * space has to be switched before the registers have been |
| + * stored. To change the address space, another register is |
| + * needed. A register therefore has to be stored/restored. |
| + */ |
| |
| -#endif /* CONFIG_KAISER */ |
| +DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); |
| |
| /** |
| - * shadowmem_add_mapping - map a virtual memory part to the shadow mapping |
| + * kaiser_add_mapping - map a virtual memory part to the shadow (user) mapping |
| * @addr: the start address of the range |
| * @size: the size of the range |
| * @flags: The mapping flags of the pages |
| * |
| - * the mapping is done on a global scope, so no bigger synchronization has to be done. |
| - * the pages have to be manually unmapped again when they are not needed any longer. |
| + * The mapping is done at a global scope, so no further |
| + * synchronization is needed. The pages have to be |
| + * manually unmapped again when they are no longer needed. |
| */ |
| -extern void kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags); |
| +extern int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags); |
| |
| |
| /** |
| - * shadowmem_remove_mapping - unmap a virtual memory part of the shadow mapping |
| + * kaiser_remove_mapping - unmap a virtual memory part of the shadow mapping |
| * @addr: the start address of the range |
| * @size: the size of the range |
| */ |
| extern void kaiser_remove_mapping(unsigned long start, unsigned long size); |
| |
| /** |
| - * shadowmem_initialize_mapping - Initalize the shadow mapping |
| + * kaiser_init - Initialize the shadow mapping |
| * |
| - * most parts of the shadow mapping can be mapped upon boot time. |
| - * only the thread stacks have to be mapped on runtime. |
| - * the mapped regions are not unmapped at all. |
| + * Most parts of the shadow mapping can be mapped at boot |
| + * time. Only per-process things like the thread stacks |
| + * or a new LDT have to be mapped at runtime. These boot- |
| + * time mappings are permanent and never unmapped. |
| */ |
| extern void kaiser_init(void); |
| |
| -#endif |
| +#endif /* CONFIG_KAISER */ |
| + |
| +#endif /* __ASSEMBLY__ */ |
| |
| |
| |
| --- a/arch/x86/include/asm/pgtable.h |
| +++ b/arch/x86/include/asm/pgtable.h |
| @@ -653,7 +653,17 @@ static inline pud_t *pud_offset(pgd_t *p |
| |
| static inline int pgd_bad(pgd_t pgd) |
| { |
| - return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE; |
| + pgdval_t ignore_flags = _PAGE_USER; |
| + /* |
| + * We set NX on KAISER pgds that map userspace memory so |
| + * that userspace can not meaningfully use the kernel |
| + * page table by accident; it will fault on the first |
| + * instruction it tries to run. See native_set_pgd(). |
| + */ |
| + if (IS_ENABLED(CONFIG_KAISER)) |
| + ignore_flags |= _PAGE_NX; |
| + |
| + return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE; |
| } |
| |
| static inline int pgd_none(pgd_t pgd) |
| @@ -857,8 +867,10 @@ static inline void clone_pgd_range(pgd_t |
| { |
| memcpy(dst, src, count * sizeof(pgd_t)); |
| #ifdef CONFIG_KAISER |
| - // clone the shadow pgd part as well |
| - memcpy(native_get_shadow_pgd(dst), native_get_shadow_pgd(src), count * sizeof(pgd_t)); |
| + /* Clone the shadow pgd part as well */ |
| + memcpy(native_get_shadow_pgd(dst), |
| + native_get_shadow_pgd(src), |
| + count * sizeof(pgd_t)); |
| #endif |
| } |
| |
| --- a/arch/x86/include/asm/pgtable_64.h |
| +++ b/arch/x86/include/asm/pgtable_64.h |
| @@ -107,26 +107,58 @@ static inline void native_pud_clear(pud_ |
| } |
| |
| #ifdef CONFIG_KAISER |
| -static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp) { |
| +static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp) |
| +{ |
| return (pgd_t *)(void*)((unsigned long)(void*)pgdp | (unsigned long)PAGE_SIZE); |
| } |
| |
| -static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp) { |
| +static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp) |
| +{ |
| return (pgd_t *)(void*)((unsigned long)(void*)pgdp & ~(unsigned long)PAGE_SIZE); |
| } |
| +#else |
| +static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp) |
| +{ |
| + BUILD_BUG_ON(1); |
| + return NULL; |
| +} |
| +static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp) |
| +{ |
| + return pgdp; |
| +} |
| #endif /* CONFIG_KAISER */ |
| |
| +/* |
| + * Page table pages are page-aligned. The lower half of the top |
| + * level is used for userspace and the top half for the kernel. |
| + * This returns true for user pages that need to get copied into |
| + * both the user and kernel copies of the page tables, and false |
| + * for kernel pages that should only be in the kernel copy. |
| + */ |
| +static inline bool is_userspace_pgd(void *__ptr) |
| +{ |
| + unsigned long ptr = (unsigned long)__ptr; |
| + |
| + return ((ptr % PAGE_SIZE) < (PAGE_SIZE / 2)); |
| +} |
| + |
| static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd) |
| { |
| #ifdef CONFIG_KAISER |
| - // We know that a pgd is page aligned. |
| - // Therefore the lower indices have to be mapped to user space. |
| - // These pages are mapped to the shadow mapping. |
| - if ((((unsigned long)pgdp) % PAGE_SIZE) < (PAGE_SIZE / 2)) { |
| + pteval_t extra_kern_pgd_flags = 0; |
| + /* Do we need to also populate the shadow pgd? */ |
| + if (is_userspace_pgd(pgdp)) { |
| native_get_shadow_pgd(pgdp)->pgd = pgd.pgd; |
| + /* |
| + * Even if the entry is *mapping* userspace, ensure |
| + * that userspace can not use it. This way, if we |
| + * get out to userspace running on the kernel CR3, |
| + * userspace will crash instead of running. |
| + */ |
| + extra_kern_pgd_flags = _PAGE_NX; |
| } |
| - |
| - pgdp->pgd = pgd.pgd & ~_PAGE_USER; |
| + pgdp->pgd = pgd.pgd; |
| + pgdp->pgd |= extra_kern_pgd_flags; |
| #else /* CONFIG_KAISER */ |
| *pgdp = pgd; |
| #endif |
| --- a/arch/x86/include/asm/pgtable_types.h |
| +++ b/arch/x86/include/asm/pgtable_types.h |
| @@ -42,7 +42,7 @@ |
| #ifdef CONFIG_KAISER |
| #define _PAGE_GLOBAL (_AT(pteval_t, 0)) |
| #else |
| -#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) |
| +#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) |
| #endif |
| #define _PAGE_SOFTW1 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1) |
| #define _PAGE_SOFTW2 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW2) |
| @@ -93,11 +93,7 @@ |
| #define _PAGE_NX (_AT(pteval_t, 0)) |
| #endif |
| |
| -#ifdef CONFIG_KAISER |
| -#define _PAGE_PROTNONE (_AT(pteval_t, 0)) |
| -#else |
| #define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) |
| -#endif |
| |
| #define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ |
| _PAGE_ACCESSED | _PAGE_DIRTY) |
| --- a/arch/x86/kernel/espfix_64.c |
| +++ b/arch/x86/kernel/espfix_64.c |
| @@ -127,11 +127,14 @@ void __init init_espfix_bsp(void) |
| /* Install the espfix pud into the kernel page directory */ |
| pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)]; |
| pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page); |
| -#ifdef CONFIG_KAISER |
| - // add the esp stack pud to the shadow mapping here. |
| - // This can be done directly, because the fixup stack has its own pud |
| - set_pgd(native_get_shadow_pgd(pgd_p), __pgd(_PAGE_TABLE | __pa((pud_t *)espfix_pud_page))); |
| -#endif |
| + /* |
| + * Just copy the top-level PGD that is mapping the espfix |
| + * area to ensure it is mapped into the shadow user page |
| + * tables. |
| + */ |
| + if (IS_ENABLED(CONFIG_KAISER)) |
| + set_pgd(native_get_shadow_pgd(pgd_p), |
| + __pgd(_KERNPG_TABLE | __pa((pud_t *)espfix_pud_page))); |
| |
| /* Randomize the locations */ |
| init_espfix_random(); |
| --- a/arch/x86/kernel/head_64.S |
| +++ b/arch/x86/kernel/head_64.S |
| @@ -442,11 +442,24 @@ early_idt_ripmsg: |
| GLOBAL(name) |
| |
| #ifdef CONFIG_KAISER |
| +/* |
| + * Each PGD needs to be 8k long and 8k aligned. We do not |
| + * ever go out to userspace with these, so we do not |
| + * strictly *need* the second page, but this allows us to |
| + * have a single set_pgd() implementation that does not |
| + * need to worry about whether it has 4k or 8k to work |
| + * with. |
| + * |
| + * This ensures PGDs are 8k long: |
| + */ |
| +#define KAISER_USER_PGD_FILL 512 |
| +/* This ensures they are 8k-aligned: */ |
| #define NEXT_PGD_PAGE(name) \ |
| .balign 2 * PAGE_SIZE; \ |
| GLOBAL(name) |
| #else |
| #define NEXT_PGD_PAGE(name) NEXT_PAGE(name) |
| +#define KAISER_USER_PGD_FILL 0 |
| #endif |
| |
| /* Automate the creation of 1 to 1 mapping pmd entries */ |
| @@ -461,6 +474,7 @@ GLOBAL(name) |
| NEXT_PGD_PAGE(early_level4_pgt) |
| .fill 511,8,0 |
| .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE |
| + .fill KAISER_USER_PGD_FILL,8,0 |
| |
| NEXT_PAGE(early_dynamic_pgts) |
| .fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0 |
| @@ -469,7 +483,8 @@ NEXT_PAGE(early_dynamic_pgts) |
| |
| #ifndef CONFIG_XEN |
| NEXT_PGD_PAGE(init_level4_pgt) |
| - .fill 2*512,8,0 |
| + .fill 512,8,0 |
| + .fill KAISER_USER_PGD_FILL,8,0 |
| #else |
| NEXT_PGD_PAGE(init_level4_pgt) |
| .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE |
| @@ -478,6 +493,7 @@ NEXT_PGD_PAGE(init_level4_pgt) |
| .org init_level4_pgt + L4_START_KERNEL*8, 0 |
| /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ |
| .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE |
| + .fill KAISER_USER_PGD_FILL,8,0 |
| |
| NEXT_PAGE(level3_ident_pgt) |
| .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE |
| @@ -488,6 +504,7 @@ NEXT_PAGE(level2_ident_pgt) |
| */ |
| PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD) |
| #endif |
| + .fill KAISER_USER_PGD_FILL,8,0 |
| |
| NEXT_PAGE(level3_kernel_pgt) |
| .fill L3_START_KERNEL,8,0 |
| --- a/arch/x86/kernel/ldt.c |
| +++ b/arch/x86/kernel/ldt.c |
| @@ -18,6 +18,7 @@ |
| #include <linux/uaccess.h> |
| |
| #include <asm/ldt.h> |
| +#include <asm/kaiser.h> |
| #include <asm/desc.h> |
| #include <asm/mmu_context.h> |
| #include <asm/syscalls.h> |
| @@ -34,11 +35,21 @@ static void flush_ldt(void *current_mm) |
| set_ldt(pc->ldt->entries, pc->ldt->size); |
| } |
| |
| +static void __free_ldt_struct(struct ldt_struct *ldt) |
| +{ |
| + if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE) |
| + vfree(ldt->entries); |
| + else |
| + free_page((unsigned long)ldt->entries); |
| + kfree(ldt); |
| +} |
| + |
| /* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */ |
| static struct ldt_struct *alloc_ldt_struct(int size) |
| { |
| struct ldt_struct *new_ldt; |
| int alloc_size; |
| + int ret = 0; |
| |
| if (size > LDT_ENTRIES) |
| return NULL; |
| @@ -66,6 +77,14 @@ static struct ldt_struct *alloc_ldt_stru |
| return NULL; |
| } |
| |
| + /* Map the new LDT into the shadow (user) page tables as well; */ |
| + /* kaiser_add_mapping() returns an error code when it fails. */ |
| + ret = kaiser_add_mapping((unsigned long)new_ldt->entries, alloc_size, |
| + __PAGE_KERNEL); |
| + if (ret) { |
| + __free_ldt_struct(new_ldt); |
| + return NULL; |
| + } |
| new_ldt->size = size; |
| return new_ldt; |
| } |
| @@ -92,12 +111,10 @@ static void free_ldt_struct(struct ldt_s |
| if (likely(!ldt)) |
| return; |
| |
| + kaiser_remove_mapping((unsigned long)ldt->entries, |
| + ldt->size * LDT_ENTRY_SIZE); |
| paravirt_free_ldt(ldt->entries, ldt->size); |
| - if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE) |
| - vfree(ldt->entries); |
| - else |
| - free_page((unsigned long)ldt->entries); |
| - kfree(ldt); |
| + __free_ldt_struct(ldt); |
| } |
| |
| /* |
| --- a/arch/x86/kernel/tracepoint.c |
| +++ b/arch/x86/kernel/tracepoint.c |
| @@ -9,10 +9,12 @@ |
| #include <linux/atomic.h> |
| |
| atomic_t trace_idt_ctr = ATOMIC_INIT(0); |
| +__aligned(PAGE_SIZE) |
| struct desc_ptr trace_idt_descr = { NR_VECTORS * 16 - 1, |
| (unsigned long) trace_idt_table }; |
| |
| /* No need to be aligned, but done to keep all IDTs defined the same way. */ |
| +__aligned(PAGE_SIZE) |
| gate_desc trace_idt_table[NR_VECTORS] __page_aligned_bss; |
| |
| static int trace_irq_vector_refcount; |
| --- a/arch/x86/mm/kaiser.c |
| +++ b/arch/x86/mm/kaiser.c |
| @@ -1,160 +1,306 @@ |
| - |
| - |
| +#include <linux/bug.h> |
| #include <linux/kernel.h> |
| #include <linux/errno.h> |
| #include <linux/string.h> |
| #include <linux/types.h> |
| #include <linux/bug.h> |
| #include <linux/init.h> |
| +#include <linux/interrupt.h> |
| #include <linux/spinlock.h> |
| #include <linux/mm.h> |
| - |
| #include <linux/uaccess.h> |
| +#include <linux/ftrace.h> |
| + |
| +#include <asm/kaiser.h> |
| #include <asm/pgtable.h> |
| #include <asm/pgalloc.h> |
| #include <asm/desc.h> |
| #ifdef CONFIG_KAISER |
| |
| __visible DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); |
| +/* |
| + * At runtime, the only things we map are some things for CPU |
| + * hotplug, and stacks for new processes. No two CPUs will ever |
| + * be populating the same addresses, so we only need to ensure |
| + * that we protect between two CPUs trying to allocate and |
| + * populate the same page table page. |
| + * |
| + * Only take this lock when doing a set_p[4um]d(), but it is not |
| + * needed for doing a set_pte(). We assume that only the *owner* |
| + * of a given allocation will be doing this for _their_ |
| + * allocation. |
| + * |
| + * This ensures that once a system has been running for a while |
| + * and there have been stacks all over and these page tables |
| + * are fully populated, there will be no further acquisitions of |
| + * this lock. |
| + */ |
| +static DEFINE_SPINLOCK(shadow_table_allocation_lock); |
| |
| -/** |
| - * Get the real ppn from a address in kernel mapping. |
| - * @param address The virtual adrress |
| - * @return the physical address |
| +/* |
| + * Returns -1 on error. |
| */ |
| -static inline unsigned long get_pa_from_mapping (unsigned long address) |
| +static inline unsigned long get_pa_from_mapping(unsigned long vaddr) |
| { |
| pgd_t *pgd; |
| pud_t *pud; |
| pmd_t *pmd; |
| pte_t *pte; |
| |
| - pgd = pgd_offset_k(address); |
| - BUG_ON(pgd_none(*pgd) || pgd_large(*pgd)); |
| + pgd = pgd_offset_k(vaddr); |
| + /* |
| + * We made all the kernel PGDs present in kaiser_init(). |
| + * We expect them to stay that way. |
| + */ |
| + BUG_ON(pgd_none(*pgd)); |
| + /* |
| + * PGDs are either 512GB or 128TB on all x86_64 |
| + * configurations. We don't handle these. |
| + */ |
| + BUG_ON(pgd_large(*pgd)); |
| + |
| + pud = pud_offset(pgd, vaddr); |
| + if (pud_none(*pud)) { |
| + WARN_ON_ONCE(1); |
| + return -1; |
| + } |
| |
| - pud = pud_offset(pgd, address); |
| - BUG_ON(pud_none(*pud)); |
| + if (pud_large(*pud)) |
| + return (pud_pfn(*pud) << PAGE_SHIFT) | (vaddr & ~PUD_PAGE_MASK); |
| |
| - if (pud_large(*pud)) { |
| - return (pud_pfn(*pud) << PAGE_SHIFT) | (address & ~PUD_PAGE_MASK); |
| + pmd = pmd_offset(pud, vaddr); |
| + if (pmd_none(*pmd)) { |
| + WARN_ON_ONCE(1); |
| + return -1; |
| } |
| |
| - pmd = pmd_offset(pud, address); |
| - BUG_ON(pmd_none(*pmd)); |
| + if (pmd_large(*pmd)) |
| + return (pmd_pfn(*pmd) << PAGE_SHIFT) | (vaddr & ~PMD_PAGE_MASK); |
| |
| - if (pmd_large(*pmd)) { |
| - return (pmd_pfn(*pmd) << PAGE_SHIFT) | (address & ~PMD_PAGE_MASK); |
| + pte = pte_offset_kernel(pmd, vaddr); |
| + if (pte_none(*pte)) { |
| + WARN_ON_ONCE(1); |
| + return -1; |
| } |
| |
| - pte = pte_offset_kernel(pmd, address); |
| - BUG_ON(pte_none(*pte)); |
| - |
| - return (pte_pfn(*pte) << PAGE_SHIFT) | (address & ~PAGE_MASK); |
| + return (pte_pfn(*pte) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK); |
| } |
| |
| -void _kaiser_copy (unsigned long start_addr, unsigned long size, |
| - unsigned long flags) |
| +/* |
| + * This is a relatively normal page table walk, except that it |
| + * also tries to allocate page table pages along the way. |
| + * |
| + * Returns a pointer to a PTE on success, or NULL on failure. |
| + */ |
| +static pte_t *kaiser_pagetable_walk(unsigned long address, bool is_atomic) |
| { |
| - pgd_t *pgd; |
| - pud_t *pud; |
| pmd_t *pmd; |
| - pte_t *pte; |
| - unsigned long address; |
| - unsigned long end_addr = start_addr + size; |
| - unsigned long target_address; |
| + pud_t *pud; |
| + pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(address)); |
| + gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); |
| |
| - for (address = PAGE_ALIGN(start_addr - (PAGE_SIZE - 1)); |
| - address < PAGE_ALIGN(end_addr); address += PAGE_SIZE) { |
| - target_address = get_pa_from_mapping(address); |
| + might_sleep(); |
| + if (is_atomic) { |
| + gfp &= ~GFP_KERNEL; |
| + gfp |= __GFP_HIGH | __GFP_ATOMIC; |
| + } |
| |
| - pgd = native_get_shadow_pgd(pgd_offset_k(address)); |
| + if (pgd_none(*pgd)) { |
| + WARN_ONCE(1, "All shadow pgds should have been populated"); |
| + return NULL; |
| + } |
| + BUILD_BUG_ON(pgd_large(*pgd) != 0); |
| |
| - BUG_ON(pgd_none(*pgd) && "All shadow pgds should be mapped at this time\n"); |
| - BUG_ON(pgd_large(*pgd)); |
| + pud = pud_offset(pgd, address); |
| + /* The shadow page tables do not use large mappings: */ |
| + if (pud_large(*pud)) { |
| + WARN_ON(1); |
| + return NULL; |
| + } |
| + if (pud_none(*pud)) { |
| + unsigned long new_pmd_page = __get_free_page(gfp); |
| + if (!new_pmd_page) |
| + return NULL; |
| + spin_lock(&shadow_table_allocation_lock); |
| + if (pud_none(*pud)) |
| + set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page))); |
| + else |
| + free_page(new_pmd_page); |
| + spin_unlock(&shadow_table_allocation_lock); |
| + } |
| |
| - pud = pud_offset(pgd, address); |
| - if (pud_none(*pud)) { |
| - set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd_alloc_one(0, address)))); |
| - } |
| - BUG_ON(pud_large(*pud)); |
| + pmd = pmd_offset(pud, address); |
| + /* The shadow page tables do not use large mappings: */ |
| + if (pmd_large(*pmd)) { |
| + WARN_ON(1); |
| + return NULL; |
| + } |
| + if (pmd_none(*pmd)) { |
| + unsigned long new_pte_page = __get_free_page(gfp); |
| + if (!new_pte_page) |
| + return NULL; |
| + spin_lock(&shadow_table_allocation_lock); |
| + if (pmd_none(*pmd)) |
| + set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page))); |
| + else |
| + free_page(new_pte_page); |
| + spin_unlock(&shadow_table_allocation_lock); |
| + } |
| |
| - pmd = pmd_offset(pud, address); |
| - if (pmd_none(*pmd)) { |
| - set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(pte_alloc_one_kernel(0, address)))); |
| - } |
| - BUG_ON(pmd_large(*pmd)); |
| + return pte_offset_kernel(pmd, address); |
| +} |
| |
| - pte = pte_offset_kernel(pmd, address); |
| +int kaiser_add_user_map(const void *__start_addr, unsigned long size, |
| + unsigned long flags) |
| +{ |
| + int ret = 0; |
| + pte_t *pte; |
| + unsigned long start_addr = (unsigned long)__start_addr; |
| + unsigned long address = start_addr & PAGE_MASK; |
| + unsigned long end_addr = PAGE_ALIGN(start_addr + size); |
| + unsigned long target_address; |
| + |
| + for (; address < end_addr; address += PAGE_SIZE) { |
| + target_address = get_pa_from_mapping(address); |
| + if (target_address == -1) { |
| + ret = -EIO; |
| + break; |
| + } |
| + pte = kaiser_pagetable_walk(address, false); |
| if (pte_none(*pte)) { |
| set_pte(pte, __pte(flags | target_address)); |
| } else { |
| - BUG_ON(__pa(pte_page(*pte)) != target_address); |
| + pte_t tmp; |
| + set_pte(&tmp, __pte(flags | target_address)); |
| + WARN_ON_ONCE(!pte_same(*pte, tmp)); |
| } |
| } |
| + return ret; |
| } |
| |
| -// at first, add a pmd for every pgd entry in the shadowmem-kernel-part of the kernel mapping |
| -static inline void __init _kaiser_init(void) |
| +static int kaiser_add_user_map_ptrs(const void *start, const void *end, unsigned long flags) |
| +{ |
| + unsigned long size = end - start; |
| + |
| + return kaiser_add_user_map(start, size, flags); |
| +} |
| + |
| +/* |
| + * Ensure that the top level of the (shadow) page tables are |
| + * entirely populated. This ensures that all processes that get |
| + * forked have the same entries. This way, we do not have to |
| + * ever go set up new entries in older processes. |
| + * |
| + * Note: we never free these, so there are no updates to them |
| + * after this. |
| + */ |
| +static void __init kaiser_init_all_pgds(void) |
| { |
| pgd_t *pgd; |
| int i = 0; |
| |
| pgd = native_get_shadow_pgd(pgd_offset_k((unsigned long )0)); |
| for (i = PTRS_PER_PGD / 2; i < PTRS_PER_PGD; i++) { |
| - set_pgd(pgd + i, __pgd(_PAGE_TABLE |__pa(pud_alloc_one(0, 0)))); |
| + pgd_t new_pgd; |
| + pud_t *pud = pud_alloc_one(&init_mm, PAGE_OFFSET + i * PGDIR_SIZE); |
| + if (!pud) { |
| + WARN_ON(1); |
| + break; |
| + } |
| + new_pgd = __pgd(_KERNPG_TABLE |__pa(pud)); |
| + /* |
| + * Make sure not to stomp on some other pgd entry. |
| + */ |
| + if (!pgd_none(pgd[i])) { |
| + WARN_ON(1); |
| + continue; |
| + } |
| + set_pgd(pgd + i, new_pgd); |
| } |
| } |
| |
| +#define kaiser_add_user_map_early(start, size, flags) do { \ |
| + int __ret = kaiser_add_user_map(start, size, flags); \ |
| + WARN_ON(__ret); \ |
| +} while (0) |
| + |
| +#define kaiser_add_user_map_ptrs_early(start, end, flags) do { \ |
| + int __ret = kaiser_add_user_map_ptrs(start, end, flags); \ |
| + WARN_ON(__ret); \ |
| +} while (0) |
| + |
| extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[]; |
| -spinlock_t shadow_table_lock; |
| +/* |
| + * If anything in here fails, we will likely die on one of the |
| + * first kernel->user transitions and init will die. But, we |
| + * will have most of the kernel up by then and should be able to |
| + * get a clean warning out of it. If we BUG_ON() here, we run |
| + * the risk of it triggering before we have good console output. |
| + */ |
| void __init kaiser_init(void) |
| { |
| int cpu; |
| - spin_lock_init(&shadow_table_lock); |
| - |
| - spin_lock(&shadow_table_lock); |
| |
| - _kaiser_init(); |
| + kaiser_init_all_pgds(); |
| |
| for_each_possible_cpu(cpu) { |
| - // map the per cpu user variables |
| - _kaiser_copy( |
| - (unsigned long) (__per_cpu_user_mapped_start + per_cpu_offset(cpu)), |
| - (unsigned long) __per_cpu_user_mapped_end - (unsigned long) __per_cpu_user_mapped_start, |
| - __PAGE_KERNEL); |
| - } |
| - |
| - // map the entry/exit text section, which is responsible to switch between user- and kernel mode |
| - _kaiser_copy( |
| - (unsigned long) __entry_text_start, |
| - (unsigned long) __entry_text_end - (unsigned long) __entry_text_start, |
| - __PAGE_KERNEL_RX); |
| - |
| - // the fixed map address of the idt_table |
| - _kaiser_copy( |
| - (unsigned long) idt_descr.address, |
| - sizeof(gate_desc) * NR_VECTORS, |
| - __PAGE_KERNEL_RO); |
| + void *percpu_vaddr = __per_cpu_user_mapped_start + |
| + per_cpu_offset(cpu); |
| + unsigned long percpu_sz = __per_cpu_user_mapped_end - |
| + __per_cpu_user_mapped_start; |
| + kaiser_add_user_map_early(percpu_vaddr, percpu_sz, |
| + __PAGE_KERNEL); |
| + } |
| |
| - spin_unlock(&shadow_table_lock); |
| + /* |
| + * Map the entry/exit text section, which is needed on |
| + * switches between user and kernel mode. |
| + */ |
| + kaiser_add_user_map_ptrs_early(__entry_text_start, __entry_text_end, |
| + __PAGE_KERNEL_RX); |
| + |
| +#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN) |
| + kaiser_add_user_map_ptrs_early(__irqentry_text_start, |
| + __irqentry_text_end, |
| + __PAGE_KERNEL_RX); |
| +#endif |
| + kaiser_add_user_map_early((void *)idt_descr.address, |
| + sizeof(gate_desc) * NR_VECTORS, |
| + __PAGE_KERNEL_RO); |
| +#ifdef CONFIG_TRACING |
| + kaiser_add_user_map_early(&trace_idt_descr, |
| + sizeof(trace_idt_descr), |
| + __PAGE_KERNEL); |
| + kaiser_add_user_map_early(&trace_idt_table, |
| + sizeof(gate_desc) * NR_VECTORS, |
| + __PAGE_KERNEL); |
| +#endif |
| + kaiser_add_user_map_early(&debug_idt_descr, sizeof(debug_idt_descr), |
| + __PAGE_KERNEL); |
| + kaiser_add_user_map_early(&debug_idt_table, |
| + sizeof(gate_desc) * NR_VECTORS, |
| + __PAGE_KERNEL); |
| } |
| |
| +extern void unmap_pud_range_nofree(pgd_t *pgd, unsigned long start, unsigned long end); |
| // add a mapping to the shadow-mapping, and synchronize the mappings |
| -void kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags) |
| +int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags) |
| { |
| - spin_lock(&shadow_table_lock); |
| - _kaiser_copy(addr, size, flags); |
| - spin_unlock(&shadow_table_lock); |
| + return kaiser_add_user_map((const void *)addr, size, flags); |
| } |
| |
| -extern void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end); |
| void kaiser_remove_mapping(unsigned long start, unsigned long size) |
| { |
| - pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(start)); |
| - spin_lock(&shadow_table_lock); |
| - do { |
| - unmap_pud_range(pgd, start, start + size); |
| - } while (pgd++ != native_get_shadow_pgd(pgd_offset_k(start + size))); |
| - spin_unlock(&shadow_table_lock); |
| + unsigned long end = start + size; |
| + unsigned long addr; |
| + |
| + for (addr = start; addr < end; addr += PGDIR_SIZE) { |
| + pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(addr)); |
| + /* |
| + * unmap_pud_range_nofree() handles > PGDIR_SIZE unmaps, |
| + * so no need to trim 'end'. |
| + */ |
| + unmap_pud_range_nofree(pgd, addr, end); |
| + } |
| } |
| #endif /* CONFIG_KAISER */ |
| --- a/arch/x86/mm/pageattr.c |
| +++ b/arch/x86/mm/pageattr.c |
| @@ -52,6 +52,7 @@ static DEFINE_SPINLOCK(cpa_lock); |
| #define CPA_FLUSHTLB 1 |
| #define CPA_ARRAY 2 |
| #define CPA_PAGES_ARRAY 4 |
| +#define CPA_FREE_PAGETABLES 8 |
| |
| #ifdef CONFIG_PROC_FS |
| static unsigned long direct_pages_count[PG_LEVEL_NUM]; |
| @@ -723,10 +724,13 @@ static int split_large_page(struct cpa_d |
| return 0; |
| } |
| |
| -static bool try_to_free_pte_page(pte_t *pte) |
| +static bool try_to_free_pte_page(struct cpa_data *cpa, pte_t *pte) |
| { |
| int i; |
| |
| + if (!(cpa->flags & CPA_FREE_PAGETABLES)) |
| + return false; |
| + |
| for (i = 0; i < PTRS_PER_PTE; i++) |
| if (!pte_none(pte[i])) |
| return false; |
| @@ -735,10 +739,13 @@ static bool try_to_free_pte_page(pte_t * |
| return true; |
| } |
| |
| -static bool try_to_free_pmd_page(pmd_t *pmd) |
| +static bool try_to_free_pmd_page(struct cpa_data *cpa, pmd_t *pmd) |
| { |
| int i; |
| |
| + if (!(cpa->flags & CPA_FREE_PAGETABLES)) |
| + return false; |
| + |
| for (i = 0; i < PTRS_PER_PMD; i++) |
| if (!pmd_none(pmd[i])) |
| return false; |
| @@ -759,7 +766,9 @@ static bool try_to_free_pud_page(pud_t * |
| return true; |
| } |
| |
| -static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end) |
| +static bool unmap_pte_range(struct cpa_data *cpa, pmd_t *pmd, |
| + unsigned long start, |
| + unsigned long end) |
| { |
| pte_t *pte = pte_offset_kernel(pmd, start); |
| |
| @@ -770,22 +779,23 @@ static bool unmap_pte_range(pmd_t *pmd, |
| pte++; |
| } |
| |
| - if (try_to_free_pte_page((pte_t *)pmd_page_vaddr(*pmd))) { |
| + if (try_to_free_pte_page(cpa, (pte_t *)pmd_page_vaddr(*pmd))) { |
| pmd_clear(pmd); |
| return true; |
| } |
| return false; |
| } |
| |
| -static void __unmap_pmd_range(pud_t *pud, pmd_t *pmd, |
| +static void __unmap_pmd_range(struct cpa_data *cpa, pud_t *pud, pmd_t *pmd, |
| unsigned long start, unsigned long end) |
| { |
| - if (unmap_pte_range(pmd, start, end)) |
| - if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud))) |
| + if (unmap_pte_range(cpa, pmd, start, end)) |
| + if (try_to_free_pmd_page(cpa, (pmd_t *)pud_page_vaddr(*pud))) |
| pud_clear(pud); |
| } |
| |
| -static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end) |
| +static void unmap_pmd_range(struct cpa_data *cpa, pud_t *pud, |
| + unsigned long start, unsigned long end) |
| { |
| pmd_t *pmd = pmd_offset(pud, start); |
| |
| @@ -796,7 +806,7 @@ static void unmap_pmd_range(pud_t *pud, |
| unsigned long next_page = (start + PMD_SIZE) & PMD_MASK; |
| unsigned long pre_end = min_t(unsigned long, end, next_page); |
| |
| - __unmap_pmd_range(pud, pmd, start, pre_end); |
| + __unmap_pmd_range(cpa, pud, pmd, start, pre_end); |
| |
| start = pre_end; |
| pmd++; |
| @@ -809,7 +819,8 @@ static void unmap_pmd_range(pud_t *pud, |
| if (pmd_large(*pmd)) |
| pmd_clear(pmd); |
| else |
| - __unmap_pmd_range(pud, pmd, start, start + PMD_SIZE); |
| + __unmap_pmd_range(cpa, pud, pmd, |
| + start, start + PMD_SIZE); |
| |
| start += PMD_SIZE; |
| pmd++; |
| @@ -819,17 +830,19 @@ static void unmap_pmd_range(pud_t *pud, |
| * 4K leftovers? |
| */ |
| if (start < end) |
| - return __unmap_pmd_range(pud, pmd, start, end); |
| + return __unmap_pmd_range(cpa, pud, pmd, start, end); |
| |
| /* |
| * Try again to free the PMD page if haven't succeeded above. |
| */ |
| if (!pud_none(*pud)) |
| - if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud))) |
| + if (try_to_free_pmd_page(cpa, (pmd_t *)pud_page_vaddr(*pud))) |
| pud_clear(pud); |
| } |
| |
| -void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) |
| +static void __unmap_pud_range(struct cpa_data *cpa, pgd_t *pgd, |
| + unsigned long start, |
| + unsigned long end) |
| { |
| pud_t *pud = pud_offset(pgd, start); |
| |
| @@ -840,7 +853,7 @@ void unmap_pud_range(pgd_t *pgd, unsigne |
| unsigned long next_page = (start + PUD_SIZE) & PUD_MASK; |
| unsigned long pre_end = min_t(unsigned long, end, next_page); |
| |
| - unmap_pmd_range(pud, start, pre_end); |
| + unmap_pmd_range(cpa, pud, start, pre_end); |
| |
| start = pre_end; |
| pud++; |
| @@ -854,7 +867,7 @@ void unmap_pud_range(pgd_t *pgd, unsigne |
| if (pud_large(*pud)) |
| pud_clear(pud); |
| else |
| - unmap_pmd_range(pud, start, start + PUD_SIZE); |
| + unmap_pmd_range(cpa, pud, start, start + PUD_SIZE); |
| |
| start += PUD_SIZE; |
| pud++; |
| @@ -864,7 +877,7 @@ void unmap_pud_range(pgd_t *pgd, unsigne |
| * 2M leftovers? |
| */ |
| if (start < end) |
| - unmap_pmd_range(pud, start, end); |
| + unmap_pmd_range(cpa, pud, start, end); |
| |
| /* |
| * No need to try to free the PUD page because we'll free it in |
| @@ -872,6 +885,24 @@ void unmap_pud_range(pgd_t *pgd, unsigne |
| */ |
| } |
| |
| +static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) |
| +{ |
| + struct cpa_data cpa = { |
| + .flags = CPA_FREE_PAGETABLES, |
| + }; |
| + |
| + __unmap_pud_range(&cpa, pgd, start, end); |
| +} |
| + |
| +void unmap_pud_range_nofree(pgd_t *pgd, unsigned long start, unsigned long end) |
| +{ |
| + struct cpa_data cpa = { |
| + .flags = 0, |
| + }; |
| + |
| + __unmap_pud_range(&cpa, pgd, start, end); |
| +} |
| + |
| static void unmap_pgd_range(pgd_t *root, unsigned long addr, unsigned long end) |
| { |
| pgd_t *pgd_entry = root + pgd_index(addr); |
| --- a/arch/x86/mm/pgtable.c |
| +++ b/arch/x86/mm/pgtable.c |
| @@ -340,40 +340,26 @@ static inline void _pgd_free(pgd_t *pgd) |
| kmem_cache_free(pgd_cache, pgd); |
| } |
| #else |
| -static inline pgd_t *_pgd_alloc(void) |
| -{ |
| -#ifdef CONFIG_KAISER |
| - // Instead of one PML4, we aquire two PML4s and, thus, an 8kb-aligned memory |
| - // block. Therefore, we have to allocate at least 3 pages. However, the |
| - // __get_free_pages returns us 4 pages. Hence, we store the base pointer at |
| - // the beginning of the page of our 8kb-aligned memory block in order to |
| - // correctly free it afterwars. |
| |
| - unsigned long pages = __get_free_pages(PGALLOC_GFP, get_order(4*PAGE_SIZE)); |
| - |
| - if(native_get_normal_pgd((pgd_t*) pages) == (pgd_t*) pages) |
| - { |
| - *((unsigned long*)(pages + 2 * PAGE_SIZE)) = pages; |
| - return (pgd_t *) pages; |
| - } |
| - else |
| - { |
| - *((unsigned long*)(pages + 3 * PAGE_SIZE)) = pages; |
| - return (pgd_t *) (pages + PAGE_SIZE); |
| - } |
| +#ifdef CONFIG_KAISER |
| +/* |
| + * Instead of one pgd, we acquire two pgds. Being order-1, it is |
| + * both 8k in size and 8k-aligned. That lets us just flip bit 12 |
| + * in a pointer to swap between the two 4k halves. |
| + */ |
| +#define PGD_ALLOCATION_ORDER 1 |
| #else |
| - return (pgd_t *)__get_free_page(PGALLOC_GFP); |
| +#define PGD_ALLOCATION_ORDER 0 |
| #endif |
| + |
| +static inline pgd_t *_pgd_alloc(void) |
| +{ |
| + return (pgd_t *)__get_free_pages(PGALLOC_GFP, PGD_ALLOCATION_ORDER); |
| } |
| |
| static inline void _pgd_free(pgd_t *pgd) |
| { |
| -#ifdef CONFIG_KAISER |
| - unsigned long pages = *((unsigned long*) ((char*) pgd + 2 * PAGE_SIZE)); |
| - free_pages(pages, get_order(4*PAGE_SIZE)); |
| -#else |
| - free_page((unsigned long)pgd); |
| -#endif |
| + free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER); |
| } |
| #endif /* CONFIG_X86_PAE */ |
| |
| --- /dev/null |
| +++ b/include/linux/kaiser.h |
| @@ -0,0 +1,26 @@ |
| +#ifndef _INCLUDE_KAISER_H |
| +#define _INCLUDE_KAISER_H |
| + |
| +#ifdef CONFIG_KAISER |
| +#include <asm/kaiser.h> |
| +#else |
| + |
| +/* |
| + * These stubs are used whenever CONFIG_KAISER is off, which |
| + * includes architectures that support KAISER, but have it |
| + * disabled. |
| + */ |
| + |
| +static inline void kaiser_init(void) |
| +{ |
| +} |
| +static inline void kaiser_remove_mapping(unsigned long start, unsigned long size) |
| +{ |
| +} |
| +static inline int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags) |
| +{ |
| + return 0; |
| +} |
| + |
| +#endif /* !CONFIG_KAISER */ |
| +#endif /* _INCLUDE_KAISER_H */ |
| --- a/kernel/fork.c |
| +++ b/kernel/fork.c |
| @@ -58,6 +58,7 @@ |
| #include <linux/tsacct_kern.h> |
| #include <linux/cn_proc.h> |
| #include <linux/freezer.h> |
| +#include <linux/kaiser.h> |
| #include <linux/delayacct.h> |
| #include <linux/taskstats_kern.h> |
| #include <linux/random.h> |
| @@ -335,7 +336,6 @@ void set_task_stack_end_magic(struct tas |
| *stackend = STACK_END_MAGIC; /* for overflow detection */ |
| } |
| |
| -extern void kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags); |
| static struct task_struct *dup_task_struct(struct task_struct *orig, int node) |
| { |
| struct task_struct *tsk; |
| @@ -357,9 +357,10 @@ static struct task_struct *dup_task_stru |
| goto free_ti; |
| |
| tsk->stack = ti; |
| -#ifdef CONFIG_KAISER |
| - kaiser_add_mapping((unsigned long)tsk->stack, THREAD_SIZE, __PAGE_KERNEL); |
| -#endif |
| + |
| + err = kaiser_add_mapping((unsigned long)tsk->stack, THREAD_SIZE, __PAGE_KERNEL); |
| + if (err) |
| + goto free_ti; |
| #ifdef CONFIG_SECCOMP |
| /* |
| * We must handle setting up seccomp filters once we're under |
| --- a/security/Kconfig |
| +++ b/security/Kconfig |
| @@ -32,12 +32,17 @@ config SECURITY |
| If you are unsure how to answer this question, answer N. |
| config KAISER |
| bool "Remove the kernel mapping in user mode" |
| + default y |
| depends on X86_64 |
| depends on !PARAVIRT |
| help |
| This enforces a strict kernel and user space isolation in order to close |
| hardware side channels on kernel address information. |
| |
| +config KAISER_REAL_SWITCH |
| + bool "KAISER: actually switch page tables" |
| + default y |
| + |
| config SECURITYFS |
| bool "Enable the securityfs filesystem" |
| help |