kaiser-20171025004
diff --git a/Documentation/x86/kaiser.txt b/Documentation/x86/kaiser.txt
new file mode 100644
index 0000000..9554fb2
--- /dev/null
+++ b/Documentation/x86/kaiser.txt
@@ -0,0 +1,85 @@
+KAISER is a countermeasure against side channel attacks against
+kernel virtual memory. It leaves alone the page tables that
+exist for each process and refers to them as the "kernel page
+tables". But, it adds a second set of "shadow" (aka. user) page
+tables which are active when running userspace.
+
+Whenever we enter the kernel (syscalls, interrupts, exceptions),
+the page tables are switched to the "kernel" copy, but when the
+system switches back to user mode, the user/shadow copy is used.
+
+The minimalistic kernel portion of the user page tables tries to
+map only what is needed to enter/exit the kernel such as the
+entry/exit functions themselves and the interrupt descriptor
+table (IDT).
+
+This helps ensure that side-channel attacks that leverage the
+paging structures do not function when KAISER is enabled. It
+can be enabled by setting CONFIG_KAISER=y
+
+Protection against side-channel attacks is important. But,
+this protection comes at a cost:
+
+1. Increased Memory Use
+ a. Statically-allocated structures must be padded out to 4k
+ (or 8k for PGDs) so they can be mapped into the user page
+ tables.
+ b. Each task now needs an order-1 PGD instead of order-0
+ c. The shadow page tables eventually grow to map all of used
+ vmalloc() space. They can have roughly the same memory
+ consumption as the vmalloc() page tables.
+
+2. Runtime Cost
+ a. CR3 manipulation to switch between the page table copies
+ must be done at interrupt, syscall, and exception entry
+ and exit (it can be skipped when the kernel is interrupted,
+ though.) Moves to CR3 are on the order of a hundred
+ cycles, and we need one at entry and another at exit.
+ b. Task stacks must be mapped/unmapped. We need to walk
+ and modify the shadow page tables at fork() and exit().
+ c. Global pages are disabled. This feature of the MMU
+ allows different processes to share TLB entries mapping
+ the kernel. Losing the feature means potentially more
+ TLB misses after a context switch.
+ d. Process Context IDentifiers (PCID) is a CPU feature that
+ allows us to skip flushing the entire TLB when we switch
+ the page tables. This makes switching the page tables
+ (at context switch, or kernel entry/exit) cheaper. But,
+ on systems with PCID support, the context switch code
+ must flush both the user and kernel entries out of the
+ TLB, with an INVPCID in addition to the CR3 write. This
+ INVPCID is generally slower than a CR3 write, but still
+ on the order of a hundred cycles.
+ e. The shadow page tables must be populated for each new
+ process. Even without KAISER, since we share all of the
+ kernel mappings in all processes, we can do all this
+ population for kernel addresses at the top level of the
+ page tables (the PGD level). But, with KAISER, we now
+ have *two* kernel mappings: one in the kernel page tables
+ that maps everything and one in the user/shadow page
+ tables mapping the "minimal" kernel. At fork(), we
+ copy the portion of the shadow PGD that maps the minimal
+ kernel structures in addition to the normal kernel one.
+ f. In addition to the fork()-time copying, we must also
+ update the shadow PGD any time a set_pgd() is done on a
+ PGD used to map userspace. This ensures that the kernel
+ and user/shadow copies always map the same userspace
+ memory.
+ g. On systems without PCID support, each CR3 write flushes
+ the entire TLB. That means that each syscall, interrupt
+ or exception flushes the TLB.
+
+Future work:
+1. We can be more careful about not actually writing to CR3
+ unless we actually switch it.
+2. Try to have dedicated entry/exit kernel stacks so we do
+ not have to map/unmap the task/thread stacks.
+3. Compress the user/shadow-mapped data to be mapped together
+ underneath a single PGD entry.
+4. Re-enable global pages, but use them for mappings in the
+ user/shadow page tables. This would allow the kernel to
+ take advantage of TLB entries that were established from
+ the user page tables. This might speed up the entry/exit
+ code or userspace since it will not have to reload all of
+ its TLB entries. However, its upside is limited by PCID
+ being used.
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 971feac..66ec43d 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -327,6 +327,10 @@
config FIX_EARLYCON_MEM
def_bool y
+config X86_GLOBAL_PAGES
+ def_bool y
+ depends on ! KAISER
+
config PGTABLE_LEVELS
int
default 5 if X86_5LEVEL
diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 640aafe..d1b97dd 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -1,5 +1,8 @@
#include <linux/jump_label.h>
#include <asm/unwind_hints.h>
+#include <asm/cpufeatures.h>
+#include <asm/page_types.h>
+#include <asm/pgtable_types.h>
/*
@@ -217,6 +220,73 @@
#endif
.endm
+#ifdef CONFIG_KAISER
+
+/* KAISER PGDs are 8k. We flip bit 12 to switch between the two halves: */
+#define KASIER_SWITCH_PGTABLES_MASK (1<<PAGE_SHIFT)
+#define KAISER_SWITCH_MASK (KASIER_SWITCH_PGTABLES_MASK|\
+ (1<<X86_CR3_KAISER_SWITCH_BIT))
+
+.macro ADJUST_KERNEL_CR3 reg:req
+ ALTERNATIVE "", "bts $63, \reg", X86_FEATURE_PCID
+ /* Clear PCID and "KAISER bit", point CR3 at kernel pagetables: */
+ andq $(~KAISER_SWITCH_MASK), \reg
+.endm
+
+.macro ADJUST_USER_CR3 reg:req
+ ALTERNATIVE "", "bts $63, \reg", X86_FEATURE_PCID
+ /* Set user PCID bit, and move CR3 up a page to the user page tables: */
+ orq $(KAISER_SWITCH_MASK), \reg
+.endm
+
+.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
+ mov %cr3, \scratch_reg
+ ADJUST_KERNEL_CR3 \scratch_reg
+ mov \scratch_reg, %cr3
+.endm
+
+.macro SWITCH_TO_USER_CR3 scratch_reg:req
+ mov %cr3, \scratch_reg
+ ADJUST_USER_CR3 \scratch_reg
+ mov \scratch_reg, %cr3
+.endm
+
+.macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req
+ movq %cr3, %r\scratch_reg
+ movq %r\scratch_reg, \save_reg
+ /*
+ * Is the "switch mask" all zero? That means that both
+ * the user/kernel PCID bit, and the user/kernel "bit"
+ * that points CR3 to the top or bottom of the 8k PGD
+ * are 0. That's a kernel CR3 value, not user/shadow.
+ */
+ testl $(KAISER_SWITCH_MASK), %e\scratch_reg
+ jz .Ldone_\@
+
+ ADJUST_KERNEL_CR3 %r\scratch_reg
+ movq %r\scratch_reg, %cr3
+
+.Ldone_\@:
+.endm
+
+.macro RESTORE_CR3 save_reg:req
+ /* optimize this */
+ movq \save_reg, %cr3
+.endm
+
+#else /* CONFIG_KAISER=n: */
+
+.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
+.endm
+.macro SWITCH_TO_USER_CR3 scratch_reg:req
+.endm
+.macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req
+.endm
+.macro RESTORE_CR3 save_reg:req
+.endm
+
+#endif
+
#endif /* CONFIG_X86_64 */
/*
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 4916725..e3e0a59 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -147,8 +147,6 @@
movq %rsp, PER_CPU_VAR(rsp_scratch)
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
- TRACE_IRQS_OFF
-
/* Construct struct pt_regs on stack */
pushq $__USER_DS /* pt_regs->ss */
pushq PER_CPU_VAR(rsp_scratch) /* pt_regs->sp */
@@ -169,6 +167,13 @@
sub $(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */
UNWIND_HINT_REGS extra=0
+ /* NB: right here, all regs except r11 are live. */
+
+ SWITCH_TO_KERNEL_CR3 scratch_reg=%r11
+
+ /* Must wait until we have the kernel CR3 to call C functions: */
+ TRACE_IRQS_OFF
+
/*
* If we need to do entry work or if we guess we'll need to do
* exit work, go straight to the slow path.
@@ -220,6 +225,7 @@
TRACE_IRQS_ON /* user mode is traced as IRQs on */
movq RIP(%rsp), %rcx
movq EFLAGS(%rsp), %r11
+ SWITCH_TO_USER_CR3 scratch_reg=%rdi
RESTORE_C_REGS_EXCEPT_RCX_R11
movq RSP(%rsp), %rsp
UNWIND_HINT_EMPTY
@@ -313,6 +319,7 @@
* perf profiles. Nothing jumps here.
*/
syscall_return_via_sysret:
+ SWITCH_TO_USER_CR3 scratch_reg=%rdi
/* rcx and r11 are already restored (see code above) */
RESTORE_C_REGS_EXCEPT_RCX_R11
movq RSP(%rsp), %rsp
@@ -320,6 +327,7 @@
USERGS_SYSRET64
opportunistic_sysret_failed:
+ SWITCH_TO_USER_CR3 scratch_reg=%rdi
SWAPGS
jmp restore_c_regs_and_iret
END(entry_SYSCALL_64)
@@ -422,6 +430,7 @@
movq %rsp, %rdi
call syscall_return_slowpath /* returns with IRQs disabled */
TRACE_IRQS_ON /* user mode is traced as IRQS on */
+ SWITCH_TO_USER_CR3 scratch_reg=%rdi
SWAPGS
jmp restore_regs_and_iret
@@ -566,6 +575,7 @@
* tracking that we're in kernel mode.
*/
SWAPGS
+ SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
/*
* We need to tell lockdep that IRQs are off. We can't do this until
@@ -611,6 +621,7 @@
mov %rsp,%rdi
call prepare_exit_to_usermode
TRACE_IRQS_IRETQ
+ SWITCH_TO_USER_CR3 scratch_reg=%rdi
SWAPGS
jmp restore_regs_and_iret
@@ -1091,7 +1102,11 @@
js 1f /* negative -> in kernel */
SWAPGS
xorl %ebx, %ebx
-1: ret
+
+1:
+ SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=ax save_reg=%r14
+
+ ret
END(paranoid_entry)
/*
@@ -1118,6 +1133,7 @@
paranoid_exit_no_swapgs:
TRACE_IRQS_IRETQ_DEBUG
paranoid_exit_restore:
+ RESTORE_CR3 %r14
RESTORE_EXTRA_REGS
RESTORE_C_REGS
REMOVE_PT_GPREGS_FROM_STACK 8
@@ -1144,6 +1160,9 @@
*/
SWAPGS
+ /* We have user CR3. Change to kernel CR3. */
+ SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
+
.Lerror_entry_from_usermode_after_swapgs:
/*
* We need to tell lockdep that IRQs are off. We can't do this until
@@ -1190,9 +1209,10 @@
.Lerror_bad_iret:
/*
- * We came from an IRET to user mode, so we have user gsbase.
- * Switch to kernel gsbase:
+ * We came from an IRET to user mode, so we have user
+ * gsbase and CR3. Switch to kernel gsbase and CR3:
*/
+ SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
SWAPGS
/*
@@ -1313,6 +1333,7 @@
UNWIND_HINT_REGS
ENCODE_FRAME_POINTER
+ SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi
/*
* At this point we no longer need to worry about stack damage
* due to nesting -- we're on the normal thread stack and we're
@@ -1328,6 +1349,7 @@
* work, because we don't want to enable interrupts.
*/
SWAPGS
+ SWITCH_TO_USER_CR3 scratch_reg=%rdi
jmp restore_regs_and_iret
.Lnmi_from_kernel:
@@ -1538,6 +1560,8 @@
movq $-1, %rsi
call do_nmi
+ RESTORE_CR3 save_reg=%r14
+
testl %ebx, %ebx /* swapgs needed? */
jnz nmi_restore
nmi_swapgs:
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index e26c25c..9558206 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -48,8 +48,13 @@
ENTRY(entry_SYSENTER_compat)
/* Interrupts are off on entry. */
SWAPGS_UNSAFE_STACK
+
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
+ pushq %rdi
+ SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi
+ popq %rdi
+
/*
* User tracing code (ptrace or signal handlers) might assume that
* the saved RAX contains a 32-bit number when we're invoking a 32-bit
@@ -91,6 +96,9 @@
pushq $0 /* pt_regs->r15 = 0 */
cld
+ pushq %rdi
+ SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi
+ popq %rdi
/*
* SYSENTER doesn't filter flags, so we need to clear NT and AC
* ourselves. To save a few cycles, we can check whether
@@ -214,6 +222,8 @@
pushq $0 /* pt_regs->r14 = 0 */
pushq $0 /* pt_regs->r15 = 0 */
+ SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi
+
/*
* User mode is traced as though IRQs are on, and SYSENTER
* turned them off.
@@ -240,6 +250,7 @@
popq %rsi /* pt_regs->si */
popq %rdi /* pt_regs->di */
+ SWITCH_TO_USER_CR3 scratch_reg=%r8
/*
* USERGS_SYSRET32 does:
* GSBASE = user's GS base
@@ -324,6 +335,7 @@
pushq %r15 /* pt_regs->r15 */
cld
+ SWITCH_TO_KERNEL_CR3 scratch_reg=%r11
/*
* User mode is traced as though IRQs are on, and the interrupt
* gate turned them off.
@@ -337,6 +349,7 @@
/* Go back to user mode. */
TRACE_IRQS_ON
SWAPGS
+ SWITCH_TO_USER_CR3 scratch_reg=%r11
jmp restore_regs_and_iret
END(entry_INT80_compat)
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index e1965e5..1449595 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -2,11 +2,15 @@
#include <linux/types.h>
#include <linux/slab.h>
+#include <asm/kaiser.h>
#include <asm/perf_event.h>
#include <asm/insn.h>
#include "../perf_event.h"
+static
+DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct debug_store, cpu_debug_store);
+
/* The size of a BTS record in bytes: */
#define BTS_RECORD_SIZE 24
@@ -278,6 +282,39 @@
static DEFINE_PER_CPU(void *, insn_buffer);
+static void *dsalloc(size_t size, gfp_t flags, int node)
+{
+#ifdef CONFIG_KAISER
+ unsigned int order = get_order(size);
+ struct page *page;
+ unsigned long addr;
+
+ page = __alloc_pages_node(node, flags | __GFP_ZERO, order);
+ if (!page)
+ return NULL;
+ addr = (unsigned long)page_address(page);
+ if (kaiser_add_mapping(addr, size, __PAGE_KERNEL) < 0) {
+ __free_pages(page, order);
+ addr = 0;
+ }
+ return (void *)addr;
+#else
+ return kmalloc_node(size, flags | __GFP_ZERO, node);
+#endif
+}
+
+static void dsfree(const void *buffer, size_t size)
+{
+#ifdef CONFIG_KAISER
+ if (!buffer)
+ return;
+ kaiser_remove_mapping((unsigned long)buffer, size);
+ free_pages((unsigned long)buffer, get_order(size));
+#else
+ kfree(buffer);
+#endif
+}
+
static int alloc_pebs_buffer(int cpu)
{
struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
@@ -288,7 +325,7 @@
if (!x86_pmu.pebs)
return 0;
- buffer = kzalloc_node(x86_pmu.pebs_buffer_size, GFP_KERNEL, node);
+ buffer = dsalloc(x86_pmu.pebs_buffer_size, GFP_KERNEL, node);
if (unlikely(!buffer))
return -ENOMEM;
@@ -299,7 +336,7 @@
if (x86_pmu.intel_cap.pebs_format < 2) {
ibuffer = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node);
if (!ibuffer) {
- kfree(buffer);
+ dsfree(buffer, x86_pmu.pebs_buffer_size);
return -ENOMEM;
}
per_cpu(insn_buffer, cpu) = ibuffer;
@@ -325,7 +362,8 @@
kfree(per_cpu(insn_buffer, cpu));
per_cpu(insn_buffer, cpu) = NULL;
- kfree((void *)(unsigned long)ds->pebs_buffer_base);
+ dsfree((void *)(unsigned long)ds->pebs_buffer_base,
+ x86_pmu.pebs_buffer_size);
ds->pebs_buffer_base = 0;
}
@@ -339,7 +377,7 @@
if (!x86_pmu.bts)
return 0;
- buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node);
+ buffer = dsalloc(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node);
if (unlikely(!buffer)) {
WARN_ONCE(1, "%s: BTS buffer allocation failure\n", __func__);
return -ENOMEM;
@@ -365,19 +403,15 @@
if (!ds || !x86_pmu.bts)
return;
- kfree((void *)(unsigned long)ds->bts_buffer_base);
+ dsfree((void *)(unsigned long)ds->bts_buffer_base, BTS_BUFFER_SIZE);
ds->bts_buffer_base = 0;
}
static int alloc_ds_buffer(int cpu)
{
- int node = cpu_to_node(cpu);
- struct debug_store *ds;
+ struct debug_store *ds = per_cpu_ptr(&cpu_debug_store, cpu);
- ds = kzalloc_node(sizeof(*ds), GFP_KERNEL, node);
- if (unlikely(!ds))
- return -ENOMEM;
-
+ memset(ds, 0, sizeof(*ds));
per_cpu(cpu_hw_events, cpu).ds = ds;
return 0;
@@ -391,7 +425,6 @@
return;
per_cpu(cpu_hw_events, cpu).ds = NULL;
- kfree(ds);
}
void release_ds_buffers(void)
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 2519c6c..35d4593 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -193,6 +193,7 @@
#define X86_FEATURE_CAT_L3 ( 7*32+ 4) /* Cache Allocation Technology L3 */
#define X86_FEATURE_CAT_L2 ( 7*32+ 5) /* Cache Allocation Technology L2 */
#define X86_FEATURE_CDP_L3 ( 7*32+ 6) /* Code and Data Prioritization L3 */
+#define X86_FEATURE_INVPCID_SINGLE ( 7*32+ 7) /* Effectively INVPCID && CR4.PCIDE=1 */
#define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */
#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index 6dfe366..5134d08 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -160,7 +160,7 @@
#define VECTOR_RETRIGGERED ((void *)~0UL)
typedef struct irq_desc* vector_irq_t[NR_VECTORS];
-DECLARE_PER_CPU(vector_irq_t, vector_irq);
+DECLARE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq);
#endif /* !ASSEMBLY_ */
diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h
new file mode 100644
index 0000000..d17f553
--- /dev/null
+++ b/arch/x86/include/asm/kaiser.h
@@ -0,0 +1,57 @@
+#ifndef _ASM_X86_KAISER_H
+#define _ASM_X86_KAISER_H
+/*
+ * Copyright(c) 2017 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Based on work published here: https://github.com/IAIK/KAISER
+ * Modified by Dave Hansen <dave.hansen@intel.com> to actually work.
+ */
+#ifndef __ASSEMBLY__
+
+#ifdef CONFIG_KAISER
+/**
+ * kaiser_add_mapping - map a virtual memory part to the shadow (user) mapping
+ * @addr: the start address of the range
+ * @size: the size of the range
+ * @flags: The mapping flags of the pages
+ *
+ * The mapping is done on a global scope, so no bigger
+ * synchronization has to be done. the pages have to be
+ * manually unmapped again when they are not needed any longer.
+ */
+extern int kaiser_add_mapping(unsigned long addr, unsigned long size,
+ unsigned long flags);
+
+/**
+ * kaiser_remove_mapping - unmap a virtual memory part of the shadow mapping
+ * @start: the start address of the range
+ * @size: the size of the range
+ */
+extern void kaiser_remove_mapping(unsigned long start, unsigned long size);
+
+/**
+ * kaiser_init - Initialize the shadow mapping
+ *
+ * Most parts of the shadow mapping can be mapped upon boot
+ * time. Only per-process things like the thread stacks
+ * or a new LDT have to be mapped at runtime. These boot-
+ * time mappings are permanent and never unmapped.
+ */
+extern void kaiser_init(void);
+
+void kaiser_check_user_mapped(const void *addr);
+
+#endif
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* _ASM_X86_KAISER_H */
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 3c856a1..bec7cb9 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -281,33 +281,6 @@
}
/*
- * If PCID is on, ASID-aware code paths put the ASID+1 into the PCID
- * bits. This serves two purposes. It prevents a nasty situation in
- * which PCID-unaware code saves CR3, loads some other value (with PCID
- * == 0), and then restores CR3, thus corrupting the TLB for ASID 0 if
- * the saved ASID was nonzero. It also means that any bugs involving
- * loading a PCID-enabled CR3 with CR4.PCIDE off will trigger
- * deterministically.
- */
-
-static inline unsigned long build_cr3(struct mm_struct *mm, u16 asid)
-{
- if (static_cpu_has(X86_FEATURE_PCID)) {
- VM_WARN_ON_ONCE(asid > 4094);
- return __sme_pa(mm->pgd) | (asid + 1);
- } else {
- VM_WARN_ON_ONCE(asid != 0);
- return __sme_pa(mm->pgd);
- }
-}
-
-static inline unsigned long build_cr3_noflush(struct mm_struct *mm, u16 asid)
-{
- VM_WARN_ON_ONCE(asid > 4094);
- return __sme_pa(mm->pgd) | (asid + 1) | CR3_NOFLUSH;
-}
-
-/*
* This can be used from process context to figure out what the value of
* CR3 is without needing to do a (slow) __read_cr3().
*
@@ -316,7 +289,7 @@
*/
static inline unsigned long __get_current_cr3_fast(void)
{
- unsigned long cr3 = build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm),
+ unsigned long cr3 = build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd,
this_cpu_read(cpu_tlbstate.loaded_mm_asid));
/* For now, be very restrictive about when this can be called. */
diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h
index b2d0cd8..9d5f54b 100644
--- a/arch/x86/include/asm/pgalloc.h
+++ b/arch/x86/include/asm/pgalloc.h
@@ -61,20 +61,36 @@
___pte_free_tlb(tlb, pte);
}
+/*
+ * _KERNPG_TABLE has _PAGE_USER clear which tells the KAISER code
+ * that this mapping is for kernel use only. That makes sure that
+ * we leave the mapping usable by the kernel and do not try to
+ * sabotage it by doing stuff like setting _PAGE_NX on it.
+ */
+static inline pteval_t mm_pgtable_flags(struct mm_struct *mm)
+{
+ if (!mm || (mm == &init_mm))
+ return _KERNPG_TABLE;
+ return _PAGE_TABLE;
+}
+
static inline void pmd_populate_kernel(struct mm_struct *mm,
pmd_t *pmd, pte_t *pte)
{
+ pteval_t pgtable_flags = mm_pgtable_flags(mm);
+
paravirt_alloc_pte(mm, __pa(pte) >> PAGE_SHIFT);
- set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE));
+ set_pmd(pmd, __pmd(__pa(pte) | pgtable_flags));
}
static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
struct page *pte)
{
+ pteval_t pgtable_flags = mm_pgtable_flags(mm);
unsigned long pfn = page_to_pfn(pte);
paravirt_alloc_pte(mm, pfn);
- set_pmd(pmd, __pmd(((pteval_t)pfn << PAGE_SHIFT) | _PAGE_TABLE));
+ set_pmd(pmd, __pmd(((pteval_t)pfn << PAGE_SHIFT) | pgtable_flags));
}
#define pmd_pgtable(pmd) pmd_page(pmd)
@@ -117,16 +133,20 @@
#else /* !CONFIG_X86_PAE */
static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
{
+ pteval_t pgtable_flags = mm_pgtable_flags(mm);
+
paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);
- set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd)));
+ set_pud(pud, __pud(__pa(pmd) | pgtable_flags));
}
#endif /* CONFIG_X86_PAE */
#if CONFIG_PGTABLE_LEVELS > 3
static inline void p4d_populate(struct mm_struct *mm, p4d_t *p4d, pud_t *pud)
{
+ pteval_t pgtable_flags = mm_pgtable_flags(mm);
+
paravirt_alloc_pud(mm, __pa(pud) >> PAGE_SHIFT);
- set_p4d(p4d, __p4d(_PAGE_TABLE | __pa(pud)));
+ set_p4d(p4d, __p4d(__pa(pud) | pgtable_flags));
}
static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
@@ -155,8 +175,10 @@
#if CONFIG_PGTABLE_LEVELS > 4
static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, p4d_t *p4d)
{
+ pteval_t pgtable_flags = mm_pgtable_flags(mm);
+
paravirt_alloc_p4d(mm, __pa(p4d) >> PAGE_SHIFT);
- set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(p4d)));
+ set_pgd(pgd, __pgd(__pa(p4d) | pgtable_flags));
}
static inline p4d_t *p4d_alloc_one(struct mm_struct *mm, unsigned long addr)
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index b714934..c149fdf 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -845,7 +845,12 @@
static inline int p4d_bad(p4d_t p4d)
{
- return (p4d_flags(p4d) & ~(_KERNPG_TABLE | _PAGE_USER)) != 0;
+ unsigned long ignore_flags = _KERNPG_TABLE | _PAGE_USER;
+
+ if (IS_ENABLED(CONFIG_KAISER))
+ ignore_flags |= _PAGE_NX;
+
+ return (p4d_flags(p4d) & ~ignore_flags) != 0;
}
#endif /* CONFIG_PGTABLE_LEVELS > 3 */
@@ -879,7 +884,12 @@
static inline int pgd_bad(pgd_t pgd)
{
- return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE;
+ unsigned long ignore_flags = _PAGE_USER;
+
+ if (IS_ENABLED(CONFIG_KAISER))
+ ignore_flags |= _PAGE_NX;
+
+ return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE;
}
static inline int pgd_none(pgd_t pgd)
@@ -1105,6 +1115,12 @@
static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
{
memcpy(dst, src, count * sizeof(pgd_t));
+#ifdef CONFIG_KAISER
+ /* Clone the shadow pgd part as well */
+ memcpy(native_get_shadow_pgd(dst),
+ native_get_shadow_pgd(src),
+ count * sizeof(pgd_t));
+#endif
}
#define PTE_SHIFT ilog2(PTRS_PER_PTE)
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 972a469..4c1fe00 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -130,9 +130,134 @@
#endif
}
+#ifdef CONFIG_KAISER
+/*
+ * All top-level KAISER page tables are order-1 pages (8k-aligned
+ * and 8k in size). The kernel one is at the beginning 4k and
+ * the user (shadow) one is in the last 4k. To switch between
+ * them, you just need to flip the 12th bit in their addresses.
+ */
+static inline void *ptr_set_bit(void *ptr, int bit)
+{
+ unsigned long __ptr = (unsigned long)ptr;
+ __ptr |= (1<<bit);
+ return (void *)__ptr;
+}
+static inline void *ptr_clear_bit(void *ptr, int bit)
+{
+ unsigned long __ptr = (unsigned long)ptr;
+ __ptr &= ~(1<<bit);
+ return (void *)__ptr;
+}
+static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp)
+{
+ return ptr_set_bit(pgdp, PAGE_SHIFT);
+}
+static inline pgd_t *native_get_normal_pgd(pgd_t *pgdp)
+{
+ return ptr_clear_bit(pgdp, PAGE_SHIFT);
+}
+static inline p4d_t *native_get_shadow_p4d(p4d_t *p4dp)
+{
+ return ptr_set_bit(p4dp, PAGE_SHIFT);
+}
+static inline p4d_t *native_get_normal_p4d(p4d_t *p4dp)
+{
+ return ptr_clear_bit(p4dp, PAGE_SHIFT);
+}
+#endif /* CONFIG_KAISER */
+
+/*
+ * Page table pages are page-aligned. The lower half of the top
+ * level is used for userspace and the top half for the kernel.
+ *
+ * Returns true for parts of the PGD that map userspace and
+ * false for the parts that map the kernel.
+ */
+static inline bool pgdp_maps_userspace(void *__ptr)
+{
+ unsigned long ptr = (unsigned long)__ptr;
+
+ return ((ptr % PAGE_SIZE) < (PAGE_SIZE / 2));
+}
+
+/*
+ * Does this PGD allow access via userspace?
+ */
+static inline bool pgd_userspace_access(pgd_t pgd)
+{
+ return (pgd.pgd & _PAGE_USER);
+}
+
+
+void pgd_check_tag(pgd_t *pgd);
+void tag_pgd_with_size(pgd_t *pgd, unsigned long size);
+void clear_pgd_tag(pgd_t *pgd);
+
+/*
+ * Returns the pgd_t that the kernel should use in its page tables.
+ */
+static inline pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd)
+{
+#ifdef CONFIG_KAISER
+ /* ensure pgdp points to an actual correctly-allocated PGD */
+ pgd_check_tag(pgdp);
+
+ if (pgd_userspace_access(pgd)) {
+ if (pgdp_maps_userspace(pgdp)) {
+ /*
+ * The user/shadow page tables get the full
+ * PGD, accessible to userspace:
+ */
+ native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
+ /*
+ * For the copy of the pgd that the kernel
+ * uses, make it unusable to userspace. This
+ * ensures if we get out to userspace with the
+ * wrong CR3 value, userspace will crash
+ * instead of running.
+ */
+ pgd.pgd |= _PAGE_NX;
+ }
+ } else if (!pgd.pgd) {
+ /*
+ * We are clearing the PGD and can not check _PAGE_USER
+ * in the zero'd PGD. We never do this on the
+ * pre-populated kernel PGDs, except for pgd_bad().
+ */
+ if (pgdp_maps_userspace(pgdp)) {
+ native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
+ } else {
+ /*
+ * Uh, we are very confused. We have been
+ * asked to clear a PGD that is in the kernel
+ * part of the address space. We preallocated
+ * all the KAISER PGDs, so this should never
+ * happen.
+ */
+ WARN_ON_ONCE(1);
+ }
+ }
+#endif
+ /* return the copy of the PGD we want the kernel to use: */
+ return pgd;
+}
+
+
static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d)
{
+#if defined(CONFIG_KAISER) && !defined(CONFIG_X86_5LEVEL)
+ /*
+ * set_pgd() does not get called when we are running
+ * CONFIG_X86_5LEVEL=y. So, just hack around it. We
+ * know here that we have a p4d but that it is really at
+ * the top level of the page tables; it is really just a
+ * pgd.
+ */
+ p4dp->pgd = kaiser_set_shadow_pgd(&p4dp->pgd, p4d.pgd);
+#else /* CONFIG_KAISER */
*p4dp = p4d;
+#endif
}
static inline void native_p4d_clear(p4d_t *p4d)
@@ -146,7 +271,11 @@
static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
{
+#ifdef CONFIG_KAISER
+ *pgdp = kaiser_set_shadow_pgd(pgdp, pgd);
+#else /* CONFIG_KAISER */
*pgdp = pgd;
+#endif
}
static inline void native_pgd_clear(pgd_t *pgd)
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index f149247..e9330a4 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -47,7 +47,12 @@
#define _PAGE_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED)
#define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY)
#define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE)
+#ifdef CONFIG_X86_GLOBAL_PAGES
#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
+#else
+/* We must ensure that kernel TLBs are unusable while in userspace */
+#define _PAGE_GLOBAL (_AT(pteval_t, 0))
+#endif
#define _PAGE_SOFTW1 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1)
#define _PAGE_SOFTW2 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW2)
#define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT)
@@ -121,7 +126,7 @@
#define _PAGE_DEVMAP (_AT(pteval_t, 0))
#endif
-#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
+#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
#define _PAGE_TABLE_NOENC (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |\
_PAGE_ACCESSED | _PAGE_DIRTY)
@@ -139,6 +144,17 @@
_PAGE_SOFT_DIRTY)
#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE)
+/* The ASID is the lower 12 bits of CR3 */
+#define X86_CR3_PCID_ASID_MASK (_AC((1<<12)-1, UL))
+
+/* Mask for all the PCID-related bits in CR3: */
+#define X86_CR3_PCID_MASK (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_MASK)
+
+/* Make sure this is only usable in KAISER #ifdef'd code: */
+#ifdef CONFIG_KAISER
+#define X86_CR3_KAISER_SWITCH_BIT 11
+#endif
+
/*
* The cache modes defined here are used to translate between pure SW usage
* and the HW defined cache mode bits and/or PAT entries.
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index b390ff7..c1dd583 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -348,7 +348,7 @@
} ____cacheline_aligned;
-DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss);
+DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, cpu_tss);
/*
* sizeof(unsigned long) coming from an extra "long" at the end
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index c4aed0d..32f981d 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -13,7 +13,7 @@
unsigned long type)
{
struct { u64 d[2]; } desc = { { pcid, addr } };
-
+ void *ptr = &desc;
/*
* The memory clobber is because the whole point is to invalidate
* stale TLB entries and, especially if we're flushing global
@@ -24,7 +24,7 @@
* invpcid (%rcx), %rax in long mode.
*/
asm volatile (".byte 0x66, 0x0f, 0x38, 0x82, 0x01"
- : : "m" (desc), "a" (type), "c" (&desc) : "memory");
+ : : "m" (desc), "a" (type), "c" (ptr) : "memory");
}
#define INVPCID_TYPE_INDIV_ADDR 0
@@ -48,13 +48,13 @@
/* Flush all mappings, including globals, for all PCIDs. */
static inline void invpcid_flush_all(void)
{
- __invpcid(0, 0, INVPCID_TYPE_ALL_INCL_GLOBAL);
+ __invpcid(0x123, 0x456, INVPCID_TYPE_ALL_INCL_GLOBAL);
}
/* Flush all mappings for all PCIDs except globals. */
static inline void invpcid_flush_all_nonglobals(void)
{
- __invpcid(0, 0, INVPCID_TYPE_ALL_NON_GLOBAL);
+ __invpcid(0x123, 0x456, INVPCID_TYPE_ALL_NON_GLOBAL);
}
static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
@@ -74,6 +74,97 @@
return new_tlb_gen;
}
+/* There are 12 bits of space for ASIDS in CR3 */
+#define CR3_HW_ASID_BITS 12
+/* When enabled, KAISER consumes a single bit for user/kernel switches */
+#ifdef CONFIG_KAISER
+#define X86_CR3_KAISER_SWITCH_BIT 11
+#define KAISER_CONSUMED_ASID_BITS 1
+#else
+#define KAISER_CONSUMED_ASID_BITS 0
+#endif
+
+#define CR3_AVAIL_ASID_BITS (CR3_HW_ASID_BITS-KAISER_CONSUMED_ASID_BITS)
+/*
+ * We lose a single extra ASID because 0 is reserved for use
+ * by non-PCID-aware users.
+ */
+#define NR_AVAIL_ASIDS ((1<<CR3_AVAIL_ASID_BITS) - 1)
+
+/*
+ * 6 because 6 should be plenty and struct tlb_state will fit in
+ * two cache lines.
+ */
+#define TLB_NR_DYN_ASIDS 6
+
+static inline u16 kern_asid(u16 asid)
+{
+ VM_WARN_ON_ONCE(asid >= NR_AVAIL_ASIDS);
+
+#ifdef CONFIG_KAISER
+ /*
+	 * Make sure that the dynamic ASID space does not conflict
+ * with the bit we are using to switch between user and
+ * kernel ASIDs.
+ */
+ BUILD_BUG_ON(TLB_NR_DYN_ASIDS >= (1<<X86_CR3_KAISER_SWITCH_BIT));
+
+ /*
+ * The ASID being passed in here should have respected
+ * the NR_AVAIL_ASIDS and thus never have the switch
+ * bit set.
+ */
+ VM_WARN_ON_ONCE(asid & (1<<X86_CR3_KAISER_SWITCH_BIT));
+#endif
+ /*
+ * The dynamically-assigned ASIDs that get passed in are
+ * small (<TLB_NR_DYN_ASIDS). They never have the high
+ * switch bit set, so do not bother to clear it.
+ */
+
+ /*
+ * If PCID is on, ASID-aware code paths put the ASID+1
+ * into the PCID bits. This serves two purposes. It
+ * prevents a nasty situation in which PCID-unaware code
+ * saves CR3, loads some other value (with PCID == 0),
+ * and then restores CR3, thus corrupting the TLB for
+ * ASID 0 if the saved ASID was nonzero. It also means
+ * that any bugs involving loading a PCID-enabled CR3
+ * with CR4.PCIDE off will trigger deterministically.
+ */
+ return asid + 1;
+}
+
+/*
+ * The user ASID is just the kernel one, plus the "switch bit".
+ */
+static inline u16 user_asid(u16 asid)
+{
+ u16 ret = kern_asid(asid);
+#ifdef CONFIG_KAISER
+ ret |= 1<<X86_CR3_KAISER_SWITCH_BIT;
+#endif
+ return ret;
+}
+
+struct pgd_t;
+static inline unsigned long build_cr3(pgd_t *pgd, u16 asid)
+{
+ if (static_cpu_has(X86_FEATURE_PCID)) {
+ return __sme_pa(pgd) | kern_asid(asid);
+ } else {
+ VM_WARN_ON_ONCE(asid != 0);
+ return __sme_pa(pgd);
+ }
+}
+
+static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid)
+{
+ VM_WARN_ON_ONCE(asid > NR_AVAIL_ASIDS);
+ VM_WARN_ON_ONCE(!this_cpu_has(X86_FEATURE_PCID));
+ return __sme_pa(pgd) | kern_asid(asid) | CR3_NOFLUSH;
+}
+
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
#else
@@ -98,12 +189,6 @@
return !static_cpu_has(X86_FEATURE_PCID);
}
-/*
- * 6 because 6 should be plenty and struct tlb_state will fit in
- * two cache lines.
- */
-#define TLB_NR_DYN_ASIDS 6
-
struct tlb_context {
u64 ctx_id;
u64 tlb_gen;
@@ -138,6 +223,17 @@
bool is_lazy;
/*
+ * If set we changed the page tables in such a way that we
+ * needed an invalidation of all contexts (aka. PCIDs / ASIDs).
+ * This tells us to go invalidate all the non-loaded ctxs[]
+ * on the next context switch.
+ *
+ * The current ctx was kept up-to-date as it ran and does not
+ * need to be invalidated.
+ */
+ bool all_other_ctxs_invalid;
+
+ /*
* Access to this CR4 shadow and to H/W CR4 is protected by
* disabling interrupts when modifying either one.
*/
@@ -214,6 +310,19 @@
return this_cpu_read(cpu_tlbstate.cr4);
}
+static inline void tlb_flush_shared_nonglobals(void)
+{
+ /*
+ * With global pages, all of the shared kernel page tables
+ * are set as _PAGE_GLOBAL. We have no shared nonglobals
+ * and nothing to do here.
+ */
+ if (IS_ENABLED(CONFIG_X86_GLOBAL_PAGES))
+ return;
+
+ this_cpu_write(cpu_tlbstate.all_other_ctxs_invalid, true);
+}
+
/*
* Save some of cr4 feature set we're using (e.g. Pentium 4MB
* enable and PPro Global page enable), so that any CPU's that boot
@@ -235,14 +344,38 @@
static inline void __native_flush_tlb(void)
{
+ if (!cpu_feature_enabled(X86_FEATURE_INVPCID)) {
+ /*
+ * native_write_cr3() only clears the current PCID if
+ * CR4 has X86_CR4_PCIDE set. In other words, this does
+ * not fully flush the TLB if PCIDs are in use. Warn if
+ * it gets called.
+ */
+ WARN_ON_ONCE(this_cpu_read(cpu_tlbstate.cr4) & X86_CR4_PCIDE);
+ /*
+ * If current->mm == NULL then we borrow a mm
+ * which may change during a task switch and
+ * therefore we must not be preempted while we
+ * write CR3 back:
+ */
+ preempt_disable();
+ native_write_cr3(__native_read_cr3());
+ preempt_enable();
+ /*
+ * Does not need tlb_flush_shared_nonglobals()
+ * since the CR3 write without PCIDs flushes all
+ * non-globals.
+ */
+ return;
+ }
/*
- * If current->mm == NULL then we borrow a mm which may change during a
- * task switch and therefore we must not be preempted while we write CR3
- * back:
+ * We are no longer using globals with KAISER, so a
+ * "nonglobals" flush would work too. But, this is more
+ * conservative.
+ *
+ * Note, this works with CR4.PCIDE=0 or 1.
*/
- preempt_disable();
- native_write_cr3(__native_read_cr3());
- preempt_enable();
+ invpcid_flush_all();
}
static inline void __native_flush_tlb_global_irq_disabled(void)
@@ -250,7 +383,18 @@
unsigned long cr4;
cr4 = this_cpu_read(cpu_tlbstate.cr4);
- /* clear PGE */
+ /*
+ * According to the SDM, a MOV to CR4 invalidates all TLB
+ * entries (including global entries) and all entries in
+ * all paging-structure caches (for all PCIDs) if it
+ * changes the value of CR4.PGE or changes CR4.PCIDE
+ * from 1->0.
+ *
+ * We can not change CR4.PCIDE 1->0 if we have a non-zero
+ * PCID in CR3, so just toggle X86_CR4_PGE. But,
+ * this will clear all PCIDs, just as if X86_CR4_PCIDE
+ * had also been toggled.
+ */
native_write_cr4(cr4 & ~X86_CR4_PGE);
/* write old PGE again and flush TLBs */
native_write_cr4(cr4);
@@ -264,6 +408,8 @@
/*
* Using INVPCID is considerably faster than a pair of writes
* to CR4 sandwiched inside an IRQ flag save/restore.
+ *
+ * Note, this works with CR4.PCIDE=0 or 1.
*/
invpcid_flush_all();
return;
@@ -283,29 +429,51 @@
static inline void __native_flush_tlb_single(unsigned long addr)
{
- asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
+ u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
+
+ /*
+ * Some platforms #GP if we call invpcid(type=1/2) before
+ * CR4.PCIDE=1. Just use invlpg in the case we are called
+ * early.
+ */
+ if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE)) {
+ asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
+ return;
+ }
+ /* Flush the address out of both PCIDs. */
+ /*
+ * An optimization here might be to determine addresses
+ * that are only kernel-mapped and only flush the kernel
+ * ASID. But, userspace flushes are probably much more
+ * important performance-wise.
+ *
+ * Make sure to do only a single invpcid when KAISER is
+ * disabled and we have only a single ASID.
+ */
+ if (kern_asid(loaded_mm_asid) != user_asid(loaded_mm_asid))
+ invpcid_flush_one(user_asid(loaded_mm_asid), addr);
+ invpcid_flush_one(kern_asid(loaded_mm_asid), addr);
}
static inline void __flush_tlb_all(void)
{
- if (boot_cpu_has(X86_FEATURE_PGE))
+ if (boot_cpu_has(X86_FEATURE_PGE)) {
__flush_tlb_global();
- else
+ } else {
__flush_tlb();
-
- /*
- * Note: if we somehow had PCID but not PGE, then this wouldn't work --
- * we'd end up flushing kernel translations for the current ASID but
- * we might fail to flush kernel translations for other cached ASIDs.
- *
- * To avoid this issue, we force PCID off if PGE is off.
- */
+ tlb_flush_shared_nonglobals();
+ }
}
static inline void __flush_tlb_one(unsigned long addr)
{
count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);
__flush_tlb_single(addr);
+ /*
+ * Invalidate other address spaces inaccessible to single-page
+ * invalidation:
+ */
+ tlb_flush_shared_nonglobals();
}
#define TLB_FLUSH_ALL -1UL
diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h
index 185f3d1..6616457 100644
--- a/arch/x86/include/uapi/asm/processor-flags.h
+++ b/arch/x86/include/uapi/asm/processor-flags.h
@@ -77,7 +77,8 @@
#define X86_CR3_PWT _BITUL(X86_CR3_PWT_BIT)
#define X86_CR3_PCD_BIT 4 /* Page Cache Disable */
#define X86_CR3_PCD _BITUL(X86_CR3_PCD_BIT)
-#define X86_CR3_PCID_MASK _AC(0x00000fff,UL) /* PCID Mask */
+#define X86_CR3_PCID_NOFLUSH_BIT 63 /* Preserve old PCID */
+#define X86_CR3_PCID_NOFLUSH _BITULL(X86_CR3_PCID_NOFLUSH_BIT)
/*
* Intel CPU features in CR4
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 3344d33..dcd6ea7 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -722,7 +722,8 @@
clear_fixmap(FIX_TEXT_POKE0);
if (pages[1])
clear_fixmap(FIX_TEXT_POKE1);
- local_flush_tlb();
+ /* Make sure to flush Global pages: */
+ __flush_tlb_all();
sync_core();
/* Could also do a CLFLUSH here to speed up CPU recovery; but
that causes hangs on some VIA CPUs. */
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index c9176ba..f4985b4 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -5,6 +5,7 @@
#include <linux/export.h>
#include <linux/percpu.h>
#include <linux/string.h>
+#include <linux/kaiser.h>
#include <linux/ctype.h>
#include <linux/delay.h>
#include <linux/sched/mm.h>
@@ -98,7 +99,7 @@
static const struct cpu_dev *this_cpu = &default_cpu;
-DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
+DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(struct gdt_page, gdt_page) = { .gdt = {
#ifdef CONFIG_X86_64
/*
* We need valid kernel segments for data and code in long mode too
@@ -487,6 +488,20 @@
#endif
__set_fixmap(get_cpu_gdt_ro_index(cpu), get_cpu_gdt_paddr(cpu), prot);
+
+ /* CPU 0's mapping is done in kaiser_init() */
+ if (cpu) {
+ int ret;
+
+ ret = kaiser_add_mapping((unsigned long) get_cpu_gdt_ro(cpu),
+ PAGE_SIZE, __PAGE_KERNEL_RO);
+ /*
+ * We do not have a good way to fail CPU bringup.
+ * Just WARN about it and hope we boot far enough
+ * to get a good log out.
+ */
+ WARN_ON(ret);
+ }
}
/* Load the original GDT from the per-cpu structure */
@@ -1345,7 +1360,7 @@
[DEBUG_STACK - 1] = DEBUG_STKSZ
};
-static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
+DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(char, exception_stacks
[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
/* May not be marked __init: used by software suspend */
@@ -1425,7 +1440,7 @@
* the top of the kernel stack. Use an extra percpu variable to track the
* top of the kernel stack directly.
*/
-DEFINE_PER_CPU(unsigned long, cpu_current_top_of_stack) =
+DEFINE_PER_CPU_USER_MAPPED(unsigned long, cpu_current_top_of_stack) =
(unsigned long)&init_thread_union + THREAD_SIZE;
EXPORT_PER_CPU_SYMBOL(cpu_current_top_of_stack);
diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
index 9c4e7ba..673137a 100644
--- a/arch/x86/kernel/espfix_64.c
+++ b/arch/x86/kernel/espfix_64.c
@@ -33,6 +33,7 @@
#include <linux/init.h>
#include <linux/init_task.h>
+#include <linux/kaiser.h>
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/gfp.h>
@@ -60,8 +61,8 @@
#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO)
/* This contains the *bottom* address of the espfix stack */
-DEFINE_PER_CPU_READ_MOSTLY(unsigned long, espfix_stack);
-DEFINE_PER_CPU_READ_MOSTLY(unsigned long, espfix_waddr);
+DEFINE_PER_CPU_USER_MAPPED(unsigned long, espfix_stack);
+DEFINE_PER_CPU_USER_MAPPED(unsigned long, espfix_waddr);
/* Initialization mutex - should this be a spinlock? */
static DEFINE_MUTEX(espfix_init_mutex);
@@ -128,6 +129,22 @@
pgd = &init_top_pgt[pgd_index(ESPFIX_BASE_ADDR)];
p4d = p4d_alloc(&init_mm, pgd, ESPFIX_BASE_ADDR);
p4d_populate(&init_mm, p4d, espfix_pud_page);
+ /*
+ * Just copy the top-level PGD that is mapping the espfix
+ * area to ensure it is mapped into the shadow user page
+ * tables.
+ *
+ * For 5-level paging, we should have already populated
+ * the espfix pgd when kaiser_init() pre-populated all
+ * the pgd entries. The above p4d_alloc() would never do
+ * anything and the p4d_populate() would be done to a p4d
+ * already mapped in the userspace pgd.
+ */
+#ifdef CONFIG_KAISER
+ if (CONFIG_PGTABLE_LEVELS <= 4)
+ *native_get_shadow_pgd(pgd) =
+ __pgd(_KERNPG_TABLE | (p4d_pfn(*p4d) << PAGE_SHIFT));
+#endif
/* Randomize the locations */
init_espfix_random();
@@ -208,4 +225,5 @@
per_cpu(espfix_stack, cpu) = addr;
per_cpu(espfix_waddr, cpu) = (unsigned long)stack_page
+ (addr & ~PAGE_MASK);
+ kaiser_add_mapping((unsigned long)stack_page, PAGE_SIZE, __PAGE_KERNEL);
}
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 513cbb0..35eab60 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -339,6 +339,27 @@
.balign PAGE_SIZE; \
GLOBAL(name)
+#ifdef CONFIG_KAISER
+/*
+ * Each PGD needs to be 8k long and 8k aligned. We do not
+ * ever go out to userspace with these, so we do not
+ * strictly *need* the second page, but this allows us to
+ * have a single set_pgd() implementation that does not
+ * need to worry about whether it has 4k or 8k to work
+ * with.
+ *
+ * This ensures PGDs are 8k long:
+ */
+#define KAISER_USER_PGD_FILL 512
+/* This ensures they are 8k-aligned: */
+#define NEXT_PGD_PAGE(name) \
+ .balign 2 * PAGE_SIZE; \
+GLOBAL(name)
+#else
+#define NEXT_PGD_PAGE(name) NEXT_PAGE(name)
+#define KAISER_USER_PGD_FILL 0
+#endif
+
/* Automate the creation of 1 to 1 mapping pmd entries */
#define PMDS(START, PERM, COUNT) \
i = 0 ; \
@@ -348,13 +369,14 @@
.endr
__INITDATA
-NEXT_PAGE(early_top_pgt)
+NEXT_PGD_PAGE(early_top_pgt)
.fill 511,8,0
#ifdef CONFIG_X86_5LEVEL
.quad level4_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
#else
.quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
#endif
+ .fill KAISER_USER_PGD_FILL,8,0
NEXT_PAGE(early_dynamic_pgts)
.fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0
@@ -362,16 +384,18 @@
.data
#ifndef CONFIG_XEN
-NEXT_PAGE(init_top_pgt)
+NEXT_PGD_PAGE(init_top_pgt)
.fill 512,8,0
+ .fill KAISER_USER_PGD_FILL,8,0
#else
-NEXT_PAGE(init_top_pgt)
+NEXT_PGD_PAGE(init_top_pgt)
.quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
.org init_top_pgt + PGD_PAGE_OFFSET*8, 0
.quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
.org init_top_pgt + PGD_START_KERNEL*8, 0
/* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
.quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
+ .fill KAISER_USER_PGD_FILL,8,0
NEXT_PAGE(level3_ident_pgt)
.quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
index 6107ee1..fa63b0b 100644
--- a/arch/x86/kernel/idt.c
+++ b/arch/x86/kernel/idt.c
@@ -4,6 +4,7 @@
* This file is licensed under the GPL V2
*/
#include <linux/interrupt.h>
+#include <linux/kaiser.h>
#include <asm/traps.h>
#include <asm/proto.h>
@@ -221,6 +222,8 @@
{
gate_desc desc;
+ kaiser_check_user_mapped(t->addr);
+
for (; size > 0; t++, size--) {
idt_init_desc(&desc, t);
write_idt_entry(idt, t->vector, &desc);
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 1add9e0..8ae1196 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -51,7 +51,7 @@
.flags = IRQF_NO_THREAD,
};
-DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
+DEFINE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq) = {
[0 ... NR_VECTORS - 1] = VECTOR_UNUSED,
};
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index f0e64db..3f53930 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -10,6 +10,7 @@
#include <linux/gfp.h>
#include <linux/sched.h>
#include <linux/string.h>
+#include <linux/kaiser.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/slab.h>
@@ -55,11 +56,21 @@
refresh_ldt_segments();
}
+static void __free_ldt_struct(struct ldt_struct *ldt)
+{
+ if (ldt->nr_entries * LDT_ENTRY_SIZE > PAGE_SIZE)
+ vfree_atomic(ldt->entries);
+ else
+ free_page((unsigned long)ldt->entries);
+ kfree(ldt);
+}
+
/* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */
static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries)
{
struct ldt_struct *new_ldt;
unsigned int alloc_size;
+ int ret = 0;
if (num_entries > LDT_ENTRIES)
return NULL;
@@ -87,6 +98,12 @@
return NULL;
}
+ ret = kaiser_add_mapping((unsigned long)new_ldt->entries, alloc_size,
+ __PAGE_KERNEL);
+ if (ret) {
+ __free_ldt_struct(new_ldt);
+ return NULL;
+ }
new_ldt->nr_entries = num_entries;
return new_ldt;
}
@@ -113,12 +130,10 @@
if (likely(!ldt))
return;
+ kaiser_remove_mapping((unsigned long)ldt->entries,
+ ldt->nr_entries * LDT_ENTRY_SIZE);
paravirt_free_ldt(ldt->entries, ldt->nr_entries);
- if (ldt->nr_entries * LDT_ENTRY_SIZE > PAGE_SIZE)
- vfree_atomic(ldt->entries);
- else
- free_page((unsigned long)ldt->entries);
- kfree(ldt);
+ __free_ldt_struct(ldt);
}
/*
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index bd6b85f..a1f65ea 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -46,7 +46,7 @@
* section. Since TSS's are completely CPU-local, we want them
* on exact cacheline boundaries, to eliminate cacheline ping-pong.
*/
-__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
+__visible DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, cpu_tss) = {
.x86_tss = {
.sp0 = TOP_OF_INIT_STACK,
#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 302e7b2..3839f06 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -59,7 +59,7 @@
#include <asm/unistd_32_ia32.h>
#endif
-__visible DEFINE_PER_CPU(unsigned long, rsp_scratch);
+__visible DEFINE_PER_CPU_USER_MAPPED(unsigned long, rsp_scratch);
/* Prints also some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs *regs, int all)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 03869eb..cd7ed7a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -805,7 +805,8 @@
return 1;
/* PCID can not be enabled when cr3[11:0]!=000H or EFER.LMA=0 */
- if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu))
+ if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_ASID_MASK) ||
+ !is_long_mode(vcpu))
return 1;
}
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index e1f0958..9639303 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -45,6 +45,7 @@
obj-$(CONFIG_X86_INTEL_MPX) += mpx.o
obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o
obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o
+obj-$(CONFIG_KAISER) += kaiser.o
obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt.o
obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_boot.o
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index af5c1ed..ca03303 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -196,34 +196,59 @@
static void setup_pcid(void)
{
-#ifdef CONFIG_X86_64
- if (boot_cpu_has(X86_FEATURE_PCID)) {
- if (boot_cpu_has(X86_FEATURE_PGE)) {
- /*
- * This can't be cr4_set_bits_and_update_boot() --
- * the trampoline code can't handle CR4.PCIDE and
- * it wouldn't do any good anyway. Despite the name,
- * cr4_set_bits_and_update_boot() doesn't actually
- * cause the bits in question to remain set all the
- * way through the secondary boot asm.
- *
- * Instead, we brute-force it and set CR4.PCIDE
- * manually in start_secondary().
- */
- cr4_set_bits(X86_CR4_PCIDE);
- } else {
- /*
- * flush_tlb_all(), as currently implemented, won't
- * work if PCID is on but PGE is not. Since that
- * combination doesn't exist on real hardware, there's
- * no reason to try to fully support it, but it's
- * polite to avoid corrupting data if we're on
- * an improperly configured VM.
- */
+ if (!IS_ENABLED(CONFIG_X86_64))
+ return;
+
+ if (!boot_cpu_has(X86_FEATURE_PCID))
+ return;
+
+ if (boot_cpu_has(X86_FEATURE_PGE)) {
+ /*
+ * KAISER uses a PCID for the kernel and another
+ * for userspace. Both PCIDs need to be flushed
+ * when the TLB flush functions are called. But,
+ * flushing *another* PCID is insane without
+ * INVPCID. Just avoid using PCIDs at all if we
+ * have KAISER and do not have INVPCID.
+ */
+ if (!IS_ENABLED(CONFIG_X86_GLOBAL_PAGES) &&
+ !boot_cpu_has(X86_FEATURE_INVPCID)) {
setup_clear_cpu_cap(X86_FEATURE_PCID);
+ return;
}
+ /*
+ * This can't be cr4_set_bits_and_update_boot() --
+ * the trampoline code can't handle CR4.PCIDE and
+ * it wouldn't do any good anyway. Despite the name,
+ * cr4_set_bits_and_update_boot() doesn't actually
+ * cause the bits in question to remain set all the
+ * way through the secondary boot asm.
+ *
+ * Instead, we brute-force it and set CR4.PCIDE
+ * manually in start_secondary().
+ */
+ cr4_set_bits(X86_CR4_PCIDE);
+
+ /*
+ * INVPCID's single-context modes (2/3) only work
+ * if we set X86_CR4_PCIDE, *and* we have INVPCID
+ * support. It's unusable on systems that have
+ * X86_CR4_PCIDE clear, or that have no INVPCID
+ * support at all.
+ */
+ if (boot_cpu_has(X86_FEATURE_INVPCID))
+ setup_force_cpu_cap(X86_FEATURE_INVPCID_SINGLE);
+ } else {
+ /*
+ * flush_tlb_all(), as currently implemented, won't
+ * work if PCID is on but PGE is not. Since that
+ * combination doesn't exist on real hardware, there's
+ * no reason to try to fully support it, but it's
+ * polite to avoid corrupting data if we're on
+ * an improperly configured VM.
+ */
+ setup_clear_cpu_cap(X86_FEATURE_PCID);
}
-#endif
}
#ifdef CONFIG_X86_32
diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c
new file mode 100644
index 0000000..3366d20
--- /dev/null
+++ b/arch/x86/mm/kaiser.c
@@ -0,0 +1,469 @@
+/*
+ * Copyright(c) 2017 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Based on work published here: https://github.com/IAIK/KAISER
+ * Modified by Dave Hansen <dave.hansen@intel.com> to actually work.
+ */
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/bug.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/spinlock.h>
+#include <linux/mm.h>
+#include <linux/uaccess.h>
+
+#include <asm/kaiser.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/desc.h>
+
+/*
+ * At runtime, the only things we map are some things for CPU
+ * hotplug, and stacks for new processes. No two CPUs will ever
+ * be populating the same addresses, so we only need to ensure
+ * that we protect between two CPUs trying to allocate and
+ * populate the same page table page.
+ *
+ * Only take this lock when doing a set_p[4um]d(), but it is not
+ * needed for doing a set_pte(). We assume that only the *owner*
+ * of a given allocation will be doing this for _their_
+ * allocation.
+ *
+ * This ensures that once a system has been running for a while
+ * and there have been stacks all over and these page tables
+ * are fully populated, there will be no further acquisitions of
+ * this lock.
+ */
+static DEFINE_SPINLOCK(shadow_table_allocation_lock);
+
+/*
+ * Returns -1 on error.
+ */
+static inline unsigned long get_pa_from_mapping(unsigned long vaddr)
+{
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ pgd = pgd_offset_k(vaddr);
+ /*
+ * We made all the kernel PGDs present in kaiser_init().
+ * We expect them to stay that way.
+ */
+ if (pgd_none(*pgd)) {
+ WARN_ON_ONCE(1);
+ return -1;
+ }
+ /*
+ * PGDs are either 512GB or 128TB on all x86_64
+ * configurations. We don't handle these.
+ */
+ if (pgd_large(*pgd)) {
+ WARN_ON_ONCE(1);
+ return -1;
+ }
+
+ p4d = p4d_offset(pgd, vaddr);
+ if (p4d_none(*p4d)) {
+ WARN_ON_ONCE(1);
+ return -1;
+ }
+
+ pud = pud_offset(p4d, vaddr);
+ if (pud_none(*pud)) {
+ WARN_ON_ONCE(1);
+ return -1;
+ }
+
+ if (pud_large(*pud))
+ return (pud_pfn(*pud) << PAGE_SHIFT) | (vaddr & ~PUD_PAGE_MASK);
+
+ pmd = pmd_offset(pud, vaddr);
+ if (pmd_none(*pmd)) {
+ WARN_ON_ONCE(1);
+ return -1;
+ }
+
+ if (pmd_large(*pmd))
+ return (pmd_pfn(*pmd) << PAGE_SHIFT) | (vaddr & ~PMD_PAGE_MASK);
+
+ pte = pte_offset_kernel(pmd, vaddr);
+ if (pte_none(*pte)) {
+ WARN_ON_ONCE(1);
+ return -1;
+ }
+
+ return (pte_pfn(*pte) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK);
+}
+
+#define KAISER_NO_FLAGS 0
+#define KAISER_WALK_ATOMIC 0x1
+#define KAISER_WALK_NO_ALLOC 0x2
+
+/*
+ * This is a relatively normal page table walk, except that it
+ * only walks the shadow page tables and can try to allocate
+ * new page tables pages along the way.
+ *
+ * Returns a pointer to a PTE on success, or NULL on failure.
+ */
+static pte_t *kaiser_pagetable_walk(unsigned long address, unsigned long flags)
+{
+ pmd_t *pmd;
+ pud_t *pud;
+ p4d_t *p4d;
+ pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(address));
+ gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
+
+ might_sleep();
+ if (flags & KAISER_WALK_ATOMIC) {
+ gfp &= ~GFP_KERNEL;
+ gfp |= __GFP_HIGH | __GFP_ATOMIC;
+ }
+
+ if (pgd_none(*pgd)) {
+ WARN_ONCE(1, "All shadow pgds should have been populated");
+ return NULL;
+ }
+ BUILD_BUG_ON(pgd_large(*pgd) != 0);
+
+ p4d = p4d_offset(pgd, address);
+ BUILD_BUG_ON(p4d_large(*p4d) != 0);
+ if (p4d_none(*p4d)) {
+ unsigned long new_pud_page;
+ if (flags & KAISER_WALK_NO_ALLOC)
+ return NULL;
+ new_pud_page = __get_free_page(gfp);
+ if (!new_pud_page)
+ return NULL;
+
+ spin_lock(&shadow_table_allocation_lock);
+ if (p4d_none(*p4d))
+ set_p4d(p4d, __p4d(_KERNPG_TABLE | __pa(new_pud_page)));
+ else
+ free_page(new_pud_page);
+ spin_unlock(&shadow_table_allocation_lock);
+ }
+
+ pud = pud_offset(p4d, address);
+ /* The shadow page tables do not use large mappings: */
+ if (pud_large(*pud)) {
+ WARN_ON(1);
+ return NULL;
+ }
+ if (pud_none(*pud)) {
+ unsigned long new_pmd_page;
+
+ if (flags & KAISER_WALK_NO_ALLOC)
+ return NULL;
+
+ new_pmd_page = __get_free_page(gfp);
+ if (!new_pmd_page)
+ return NULL;
+
+ spin_lock(&shadow_table_allocation_lock);
+ if (pud_none(*pud))
+ set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page)));
+ else
+ free_page(new_pmd_page);
+ spin_unlock(&shadow_table_allocation_lock);
+ }
+
+ pmd = pmd_offset(pud, address);
+ /* The shadow page tables do not use large mappings: */
+ if (pmd_large(*pmd)) {
+ WARN_ON(1);
+ return NULL;
+ }
+ if (pmd_none(*pmd)) {
+ unsigned long new_pte_page;
+
+ if (flags & KAISER_WALK_NO_ALLOC)
+ return NULL;
+
+ new_pte_page = __get_free_page(gfp);
+ if (!new_pte_page)
+ return NULL;
+
+ spin_lock(&shadow_table_allocation_lock);
+ if (pmd_none(*pmd))
+ set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page)));
+ else
+ free_page(new_pte_page);
+ spin_unlock(&shadow_table_allocation_lock);
+ }
+
+ return pte_offset_kernel(pmd, address);
+}
+
+/*
+ * Given a kernel address, @__start_addr, copy that mapping into
+ * the user (shadow) page tables. This may need to allocate page
+ * table pages.
+ */
+int kaiser_add_user_map(const void *__start_addr, unsigned long size,
+ unsigned long flags)
+{
+ pte_t *pte;
+ unsigned long start_addr = (unsigned long)__start_addr;
+ unsigned long address = start_addr & PAGE_MASK;
+ unsigned long end_addr = PAGE_ALIGN(start_addr + size);
+ unsigned long target_address;
+
+ for (; address < end_addr; address += PAGE_SIZE) {
+ target_address = get_pa_from_mapping(address);
+ if (target_address == -1)
+ return -EIO;
+
+ pte = kaiser_pagetable_walk(address, KAISER_NO_FLAGS);
+ /*
+ * Errors come from either -ENOMEM for a page
+ * table page, or something screwy that did a
+ * WARN_ON(). Just return -ENOMEM.
+ */
+ if (!pte)
+ return -ENOMEM;
+ if (pte_none(*pte)) {
+ set_pte(pte, __pte(flags | target_address));
+ } else {
+ pte_t tmp;
+ set_pte(&tmp, __pte(flags | target_address));
+ WARN_ON_ONCE(!pte_same(*pte, tmp));
+ }
+ }
+ return 0;
+}
+
+int kaiser_add_user_map_ptrs(const void *__start_addr,
+ const void *__end_addr,
+ unsigned long flags)
+{
+ return kaiser_add_user_map(__start_addr,
+ __end_addr - __start_addr,
+ flags);
+}
+
+static int kaiser_user_map_ptr_early(const void *start_addr, unsigned long size,
+ unsigned long flags)
+{
+ int ret = kaiser_add_user_map(start_addr, size, flags);
+ WARN_ON(ret);
+ return ret;
+}
+
+/*
+ * Ensure that the top level of the (shadow) page tables are
+ * entirely populated. This ensures that all processes that get
+ * forked have the same entries. This way, we do not have to
+ * ever go set up new entries in older processes.
+ *
+ * Note: we never free these, so there are no updates to them
+ * after this.
+ */
+static void __init kaiser_init_all_pgds(void)
+{
+ pgd_t *pgd;
+ int i = 0;
+
+ pgd = native_get_shadow_pgd(pgd_offset_k(0UL));
+ for (i = PTRS_PER_PGD / 2; i < PTRS_PER_PGD; i++) {
+ unsigned long addr = PAGE_OFFSET + i * PGDIR_SIZE;
+#if CONFIG_PGTABLE_LEVELS > 4
+ p4d_t *p4d = p4d_alloc_one(&init_mm, addr);
+ if (!p4d) {
+ WARN_ON(1);
+ break;
+ }
+ pgd[i] = __pgd(_KERNPG_TABLE | __pa(p4d));
+#else /* CONFIG_PGTABLE_LEVELS <= 4 */
+ pud_t *pud = pud_alloc_one(&init_mm, addr);
+ if (!pud) {
+ WARN_ON(1);
+ break;
+ }
+ pgd[i] = __pgd(_KERNPG_TABLE | __pa(pud));
+#endif /* CONFIG_PGTABLE_LEVELS */
+ }
+}
+
+/*
+ * The page table allocations in here can theoretically fail, but
+ * we can not do much about it in early boot. Do the checking
+ * and warning in a macro to make it more readable.
+ */
+#define kaiser_add_user_map_early(start, size, flags) do { \
+ int __ret = kaiser_add_user_map(start, size, flags); \
+ WARN_ON(__ret); \
+} while (0)
+
+#define kaiser_add_user_map_ptrs_early(start, end, flags) do { \
+ int __ret = kaiser_add_user_map_ptrs(start, end, flags); \
+ WARN_ON(__ret); \
+} while (0)
+
+extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[];
+/*
+ * If anything in here fails, we will likely die on one of the
+ * first kernel->user transitions and init will die. But, we
+ * will have most of the kernel up by then and should be able to
+ * get a clean warning out of it. If we BUG_ON() here, we run
+ * the risk of being before we have good console output.
+ */
+static int kaiser_init_finished = 0;
+void kaiser_check_idt_table(const struct desc_ptr *idt_desc);
+void __init kaiser_init(void)
+{
+ int cpu;
+
+ kaiser_init_all_pgds();
+
+ for_each_possible_cpu(cpu) {
+ void *percpu_vaddr = __per_cpu_user_mapped_start +
+ per_cpu_offset(cpu);
+ unsigned long percpu_sz = __per_cpu_user_mapped_end -
+ __per_cpu_user_mapped_start;
+ kaiser_add_user_map_early(percpu_vaddr, percpu_sz,
+ __PAGE_KERNEL);
+ }
+
+ kaiser_add_user_map_ptrs_early(__entry_text_start, __entry_text_end,
+ __PAGE_KERNEL_RX);
+
+ /* the fixed map address of the idt_table */
+ kaiser_add_user_map_early((void *)idt_descr.address,
+ sizeof(gate_desc) * NR_VECTORS,
+ __PAGE_KERNEL_RO);
+
+ kaiser_user_map_ptr_early(&debug_idt_table,
+ sizeof(gate_desc) * NR_VECTORS,
+ __PAGE_KERNEL);
+
+ /*
+ * We could theoretically do this in setup_fixmap_gdt().
+ * But, we would need to rewrite the above page table
+ * allocation code to use the bootmem allocator. The
+ * buddy allocator is not available at the time that we
+ * call setup_fixmap_gdt() for CPU 0.
+ */
+ kaiser_add_user_map_early(get_cpu_gdt_ro(0), PAGE_SIZE,
+ __PAGE_KERNEL_RO);
+
+ /*
+ * .irqentry.text helps us identify code that runs before
+ * we get a chance to call entering_irq(). This includes
+ * the interrupt entry assembly plus the first C function
+ * that gets called. KAISER does not need the C code
+ * mapped. We just use the .irqentry.text section as-is
+ * to avoid having to carve out a new section for the
+ * assembly only.
+ */
+ kaiser_add_user_map_ptrs_early(__irqentry_text_start,
+ __irqentry_text_end,
+ __PAGE_KERNEL_RX);
+
+ kaiser_init_finished = 1;
+ kaiser_check_idt_table(&idt_descr);
+ kaiser_check_idt_table(&debug_idt_descr);
+}
+
+extern void unmap_pud_range_nofree(pgd_t *pgd, unsigned long start,
+ unsigned long end);
+int kaiser_add_mapping(unsigned long addr, unsigned long size,
+ unsigned long flags)
+{
+ return kaiser_add_user_map((const void *)addr, size, flags);
+}
+
+void kaiser_remove_mapping(unsigned long start, unsigned long size)
+{
+ unsigned long end = start + size;
+ unsigned long addr;
+
+ for (addr = start; addr < end; addr += PGDIR_SIZE) {
+ pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(addr));
+ /*
+ * unmap_pud_range_nofree() handles an 'end' past this
+ * PGD's range, so no need to trim 'end'.
+ */
+ unmap_pud_range_nofree(pgd, addr, end);
+ }
+}
+
+/*
+ * If we fail to map something, it is not fun to debug because it
+ * will often crash in the entry code, far away from the
+ * actual cause. To catch these problems, allow the kaiser mappings to
+ * be checked before we actually need to use them.
+ */
+void kaiser_check_user_mapped(const void *addr)
+{
+ pte_t *pte;
+
+ /*
+ * idt_setup_early_handler() might call in here before
+ * kaiser_init() runs. Ignore those calls.
+ */
+ if (!kaiser_init_finished)
+ return;
+
+ pte = kaiser_pagetable_walk((unsigned long)addr, KAISER_WALK_NO_ALLOC);
+
+ if (pte && pte_present(*pte))
+ return;
+
+ WARN(1, "address is not mapped into user page tables: %pK\n", addr);
+}
+
+void kaiser_check_idt_table(const struct desc_ptr *idt_desc)
+{
+ struct gate_struct *hw_table_ptr;
+ int table_nr_entries;
+ int i;
+
+ /*
+ * Note: "idt_desc" itself is not user-mapped. The
+ * descriptor's contents are loaded into a register and it
+ * is not referenced after the 'lidt' instruction used to
+ * load its contents.
+ */
+
+ hw_table_ptr = (struct gate_struct *)idt_desc->address;
+ /* The hardware description in 'idt_desc->size' is in bytes. */
+ table_nr_entries = idt_desc->size / sizeof(*hw_table_ptr);
+
+ for (i = 0; i < table_nr_entries; i++) {
+ struct gate_struct *hw_table_entry = &hw_table_ptr[i];
+ void *idt_handler_addr = (void *)gate_offset(hw_table_entry);
+
+ /* Make sure the table itself is mapped: */
+ kaiser_check_user_mapped(hw_table_entry);
+
+ /* Skip empty entries */
+ if (!idt_handler_addr)
+ continue;
+ /*
+ * Entries for the early IDT handlers are left in
+ * the IDT even after boot. They are not used
+ * and are not user-mapped, so skip the checks
+ * for them.
+ */
+ if (!test_bit(i, used_vectors))
+ continue;
+
+ kaiser_check_user_mapped(idt_handler_addr);
+ }
+}
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index dfb7d65..416642e 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -53,6 +53,7 @@
#define CPA_FLUSHTLB 1
#define CPA_ARRAY 2
#define CPA_PAGES_ARRAY 4
+#define CPA_FREE_PAGETABLES 8
#ifdef CONFIG_PROC_FS
static unsigned long direct_pages_count[PG_LEVEL_NUM];
@@ -765,10 +766,13 @@
return 0;
}
-static bool try_to_free_pte_page(pte_t *pte)
+static bool try_to_free_pte_page(struct cpa_data *cpa, pte_t *pte)
{
int i;
+ if (!(cpa->flags & CPA_FREE_PAGETABLES))
+ return false;
+
for (i = 0; i < PTRS_PER_PTE; i++)
if (!pte_none(pte[i]))
return false;
@@ -777,10 +781,13 @@
return true;
}
-static bool try_to_free_pmd_page(pmd_t *pmd)
+static bool try_to_free_pmd_page(struct cpa_data *cpa, pmd_t *pmd)
{
int i;
+ if (!(cpa->flags & CPA_FREE_PAGETABLES))
+ return false;
+
for (i = 0; i < PTRS_PER_PMD; i++)
if (!pmd_none(pmd[i]))
return false;
@@ -789,7 +796,9 @@
return true;
}
-static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end)
+static bool unmap_pte_range(struct cpa_data *cpa, pmd_t *pmd,
+ unsigned long start,
+ unsigned long end)
{
pte_t *pte = pte_offset_kernel(pmd, start);
@@ -800,22 +809,23 @@
pte++;
}
- if (try_to_free_pte_page((pte_t *)pmd_page_vaddr(*pmd))) {
+ if (try_to_free_pte_page(cpa, (pte_t *)pmd_page_vaddr(*pmd))) {
pmd_clear(pmd);
return true;
}
return false;
}
-static void __unmap_pmd_range(pud_t *pud, pmd_t *pmd,
+static void __unmap_pmd_range(struct cpa_data *cpa, pud_t *pud, pmd_t *pmd,
unsigned long start, unsigned long end)
{
- if (unmap_pte_range(pmd, start, end))
- if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
+ if (unmap_pte_range(cpa, pmd, start, end))
+ if (try_to_free_pmd_page(cpa, (pmd_t *)pud_page_vaddr(*pud)))
pud_clear(pud);
}
-static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
+static void unmap_pmd_range(struct cpa_data *cpa, pud_t *pud,
+ unsigned long start, unsigned long end)
{
pmd_t *pmd = pmd_offset(pud, start);
@@ -826,7 +836,7 @@
unsigned long next_page = (start + PMD_SIZE) & PMD_MASK;
unsigned long pre_end = min_t(unsigned long, end, next_page);
- __unmap_pmd_range(pud, pmd, start, pre_end);
+ __unmap_pmd_range(cpa, pud, pmd, start, pre_end);
start = pre_end;
pmd++;
@@ -839,7 +849,8 @@
if (pmd_large(*pmd))
pmd_clear(pmd);
else
- __unmap_pmd_range(pud, pmd, start, start + PMD_SIZE);
+ __unmap_pmd_range(cpa, pud, pmd,
+ start, start + PMD_SIZE);
start += PMD_SIZE;
pmd++;
@@ -849,17 +860,19 @@
* 4K leftovers?
*/
if (start < end)
- return __unmap_pmd_range(pud, pmd, start, end);
+ return __unmap_pmd_range(cpa, pud, pmd, start, end);
/*
* Try again to free the PMD page if haven't succeeded above.
*/
if (!pud_none(*pud))
- if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
+ if (try_to_free_pmd_page(cpa, (pmd_t *)pud_page_vaddr(*pud)))
pud_clear(pud);
}
-static void unmap_pud_range(p4d_t *p4d, unsigned long start, unsigned long end)
+static void __unmap_pud_range(struct cpa_data *cpa, p4d_t *p4d,
+ unsigned long start,
+ unsigned long end)
{
pud_t *pud = pud_offset(p4d, start);
@@ -870,7 +883,7 @@
unsigned long next_page = (start + PUD_SIZE) & PUD_MASK;
unsigned long pre_end = min_t(unsigned long, end, next_page);
- unmap_pmd_range(pud, start, pre_end);
+ unmap_pmd_range(cpa, pud, start, pre_end);
start = pre_end;
pud++;
@@ -884,7 +897,7 @@
if (pud_large(*pud))
pud_clear(pud);
else
- unmap_pmd_range(pud, start, start + PUD_SIZE);
+ unmap_pmd_range(cpa, pud, start, start + PUD_SIZE);
start += PUD_SIZE;
pud++;
@@ -894,7 +907,7 @@
* 2M leftovers?
*/
if (start < end)
- unmap_pmd_range(pud, start, end);
+ unmap_pmd_range(cpa, pud, start, end);
/*
* No need to try to free the PUD page because we'll free it in
@@ -902,6 +915,24 @@
*/
}
+void unmap_pud_range(p4d_t *p4d, unsigned long start, unsigned long end)
+{
+ struct cpa_data cpa = {
+ .flags = CPA_FREE_PAGETABLES,
+ };
+
+ __unmap_pud_range(&cpa, p4d, start, end);
+}
+
+void unmap_pud_range_nofree(p4d_t *p4d, unsigned long start, unsigned long end)
+{
+ struct cpa_data cpa = {
+ .flags = 0,
+ };
+
+ __unmap_pud_range(&cpa, p4d, start, end);
+}
+
static int alloc_pte_page(pmd_t *pmd)
{
pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
@@ -1149,8 +1180,7 @@
* already found it, but remove any useless entries we just
* added to it.
*/
- unmap_pud_range(p4d, addr,
- addr + (cpa->numpages << PAGE_SHIFT));
+ unmap_pud_range(p4d, addr, addr + (cpa->numpages << PAGE_SHIFT));
return ret;
}
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index b372f34..9bc6fc8 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -354,14 +354,69 @@
kmem_cache_free(pgd_cache, pgd);
}
#else
+
+#ifdef CONFIG_KAISER
+/*
+ * Instead of one pgd, we acquire two pgds. Being order-1, it is
+ * both 8k in size and 8k-aligned. That lets us just flip bit 12
+ * in a pointer to swap between the two 4k halves.
+ */
+#define PGD_ALLOCATION_ORDER 1
+#else
+#define PGD_ALLOCATION_ORDER 0
+#endif
+
+const unsigned long pgd_magic = 0x70676400 | PGD_ALLOCATION_ORDER;
+
+void tag_pgd_with_size(pgd_t *pgd, unsigned long size)
+{
+ struct page *pgd_page = virt_to_page(pgd);
+
+ /*
+ * Make sure the size the caller is tagging with matches
+ * the size that we need the PGD to be. This check is a
+ * bit superfluous for the callers within this file but
+ * is quite necessary for the callers from outside.
+ */
+ WARN_ON_ONCE(size != PAGE_SIZE << PGD_ALLOCATION_ORDER);
+
+ pgd_page->private = pgd_magic;
+}
+
+void clear_pgd_tag(pgd_t *pgd)
+{
+ struct page *pgd_page = virt_to_page(pgd);
+ pgd_page->private = 0;
+}
+
static inline pgd_t *_pgd_alloc(void)
{
- return (pgd_t *)__get_free_page(PGALLOC_GFP);
+ pgd_t *pgd = (pgd_t *)__get_free_pages(PGALLOC_GFP,
+ PGD_ALLOCATION_ORDER);
+ tag_pgd_with_size(pgd, PAGE_SIZE << PGD_ALLOCATION_ORDER);
+
+ return pgd;
}
static inline void _pgd_free(pgd_t *pgd)
{
- free_page((unsigned long)pgd);
+ clear_pgd_tag(pgd);
+ free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER);
+}
+
+void pgd_check_tag(pgd_t *pgd)
+{
+ struct page *pgd_page = virt_to_page(pgd);
+
+ if (pgd_page == virt_to_page(init_top_pgt))
+ return;
+
+ if (pgd_page->private == pgd_magic)
+ return;
+
+ if (WARN_ON_ONCE(1))
+ printk(KERN_WARNING "bad pgd page @ 0x%pK, magic: 0x%lx\n",
+ pgd, pgd_page->private);
}
#endif /* CONFIG_X86_PAE */
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 0f3d0ce..00ae052 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -28,6 +28,38 @@
* Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
*/
+/*
+ * We get here when we do something requiring a TLB invalidation
+ * but could not go invalidate all of the contexts. We do the
+ * necessary invalidation by clearing out the 'ctx_id' which
+ * forces a TLB flush when the context is loaded.
+ */
+void clear_non_loaded_ctxs(void)
+{
+ u16 asid;
+
+ /*
+ * This is only expected to be set if we have disabled
+ * kernel _PAGE_GLOBAL pages.
+ */
+ if (IS_ENABLED(CONFIG_X86_GLOBAL_PAGES)) {
+ WARN_ON_ONCE(1);
+ return;
+ }
+
+ for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) {
+ /* Do not need to flush the current asid */
+ if (asid == this_cpu_read(cpu_tlbstate.loaded_mm_asid))
+ continue;
+ /*
+ * Make sure the next time we go to switch to
+ * this asid, we do a flush:
+ */
+ this_cpu_write(cpu_tlbstate.ctxs[asid].ctx_id, 0);
+ }
+ this_cpu_write(cpu_tlbstate.all_other_ctxs_invalid, false);
+}
+
atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1);
@@ -42,6 +74,9 @@
return;
}
+ if (this_cpu_read(cpu_tlbstate.all_other_ctxs_invalid))
+ clear_non_loaded_ctxs();
+
for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) {
if (this_cpu_read(cpu_tlbstate.ctxs[asid].ctx_id) !=
next->context.ctx_id)
@@ -65,6 +100,68 @@
*need_flush = true;
}
+/*
+ * Given a kernel asid, flush the corresponding KAISER
+ * user ASID.
+ */
+static void flush_user_asid(pgd_t *pgd, u16 kern_asid)
+{
+ /* There is no user ASID if KAISER is off */
+ if (!IS_ENABLED(CONFIG_KAISER))
+ return;
+ /*
+ * We only have a single ASID if PCID is off and the CR3
+ * write will have flushed it.
+ */
+ if (!cpu_feature_enabled(X86_FEATURE_PCID))
+ return;
+ /*
+ * With PCIDs enabled, write_cr3() only flushes TLB
+ * entries for the current (kernel) ASID. This leaves
+ * old TLB entries for the user ASID in place and we must
+ * flush that context separately. We can theoretically
+ * delay doing this until we actually load up the
+ * userspace CR3, but do it here for simplicity.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_INVPCID)) {
+ invpcid_flush_single_context(user_asid(kern_asid));
+ } else {
+ /*
+ * On systems with PCIDs, but no INVPCID, the only
+ * way to flush a PCID is a CR3 write. Note that
+ * we use the kernel page tables with the *user*
+ * ASID here.
+ */
+ unsigned long user_asid_flush_cr3;
+ user_asid_flush_cr3 = build_cr3(pgd, user_asid(kern_asid));
+ write_cr3(user_asid_flush_cr3);
+ /*
+ * We do not use PCIDs with KAISER unless we also
+ * have INVPCID. Getting here is unexpected.
+ */
+ WARN_ON_ONCE(1);
+ }
+}
+
+static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, bool need_flush)
+{
+ unsigned long new_mm_cr3;
+
+ if (need_flush) {
+ flush_user_asid(pgdir, new_asid);
+ new_mm_cr3 = build_cr3(pgdir, new_asid);
+ } else {
+ new_mm_cr3 = build_cr3_noflush(pgdir, new_asid);
+ }
+
+ /*
+ * Caution: many callers of this function expect
+ * that load_cr3() is serializing and orders TLB
+ * fills with respect to the mm_cpumask writes.
+ */
+ write_cr3(new_mm_cr3);
+}
+
void leave_mm(int cpu)
{
struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
@@ -127,7 +224,7 @@
* isn't free.
*/
#ifdef CONFIG_DEBUG_VM
- if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev, prev_asid))) {
+ if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev->pgd, prev_asid))) {
/*
* If we were to BUG here, we'd be very likely to kill
* the system so hard that we don't see the call trace.
@@ -194,12 +291,12 @@
if (need_flush) {
this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
- write_cr3(build_cr3(next, new_asid));
+ load_new_mm_cr3(next->pgd, new_asid, true);
trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH,
TLB_FLUSH_ALL);
} else {
/* The new ASID is already up to date. */
- write_cr3(build_cr3_noflush(next, new_asid));
+ load_new_mm_cr3(next->pgd, new_asid, false);
trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, 0);
}
@@ -277,7 +374,7 @@
!(cr4_read_shadow() & X86_CR4_PCIDE));
/* Force ASID 0 and force a TLB flush. */
- write_cr3(build_cr3(mm, 0));
+ write_cr3(build_cr3(mm->pgd, 0));
/* Reinitialize tlbstate. */
this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0);
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index 12e8388..aab0ec4 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -69,6 +69,35 @@
}
}
+/*
+ * This is an unapologetic hack. KAISER PGDs have the NX bit set
+ * on the entries mapping userspace. See native_set_pgd() for an
+ * explanation. Unfortunately, EFI runs in what are traditionally
+ * userspace addresses and setting NX on the PGDs will keep EFI
+ * calls from being able to run.
+ *
+ * We do not have a nice pgd_clear_flags() like we have for ptes,
+ * so just hack this in here rather than complicate set_pgd().
+ */
+void pgd_clear_nx(void *__pgd)
+{
+#ifdef CONFIG_KAISER
+ unsigned long *pgd = __pgd;
+ /*
+ * The EFI code always passes init_mm to the pagetable
+ * allocation functions, which should set _PAGE_USER via
+ * mm_pgtable_flags(), which will then trigger the KAISER
+ * code to not set _PAGE_NX.
+ *
+ * This is probably superfluous, but make it safe and
+ * warn if the mm_pgtable_flags() stuff did not work
+ * out somehow.
+ */
+ WARN_ON_ONCE(*pgd & _PAGE_NX);
+ *pgd &= ~_PAGE_NX;
+#endif
+}
+
pgd_t * __init efi_call_phys_prolog(void)
{
unsigned long vaddr, addr_pgd, addr_p4d, addr_pud;
@@ -105,6 +134,7 @@
save_pgd[pgd] = *pgd_efi;
p4d = p4d_alloc(&init_mm, pgd_efi, addr_pgd);
+ pgd_clear_nx(pgd_efi);
if (!p4d) {
pr_err("Failed to allocate p4d table!\n");
goto out;
@@ -115,6 +145,7 @@
p4d_efi = p4d + p4d_index(addr_p4d);
pud = pud_alloc(&init_mm, p4d_efi, addr_p4d);
+ pgd_clear_nx(p4d_efi);
if (!pud) {
pr_err("Failed to allocate pud table!\n");
goto out;
@@ -201,27 +232,45 @@
p4d_t *p4d;
pud_t *pud;
gfp_t gfp_mask;
+ int pgd_order = 0;
+
+ /*
+ * set_pgd() assumes that there is a shadow PGD for all
+ * pgds. Without one, it goes off writing on whatever
+ * page happens to be after the PGD. We can either use
+ * something other than set_pgd(), or just add a shadow
+ * page. Just do the shadow page. We should never
+ * actually use this shadow PGD, though.
+ */
+ if (IS_ENABLED(CONFIG_KAISER))
+ pgd_order = 1;
if (efi_enabled(EFI_OLD_MEMMAP))
return 0;
gfp_mask = GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO;
- efi_pgd = (pgd_t *)__get_free_page(gfp_mask);
+ efi_pgd = (pgd_t *)__get_free_pages(gfp_mask, pgd_order);
+ tag_pgd_with_size(efi_pgd, PAGE_SIZE << pgd_order);
if (!efi_pgd)
return -ENOMEM;
pgd = efi_pgd + pgd_index(EFI_VA_END);
p4d = p4d_alloc(&init_mm, pgd, EFI_VA_END);
+ /* PGDs "appear" at the P4D level when PGTABLE_LEVELS=4 */
+ pgd_clear_nx(pgd);
if (!p4d) {
- free_page((unsigned long)efi_pgd);
+ clear_pgd_tag(efi_pgd);
+ free_pages((unsigned long)efi_pgd, pgd_order);
return -ENOMEM;
}
pud = pud_alloc(&init_mm, p4d, EFI_VA_END);
+ pgd_clear_nx(pud);
if (!pud) {
if (CONFIG_PGTABLE_LEVELS > 4)
free_page((unsigned long) pgd_page_vaddr(*pgd));
- free_page((unsigned long)efi_pgd);
+ clear_pgd_tag(efi_pgd);
+ free_pages((unsigned long)efi_pgd, pgd_order);
return -ENOMEM;
}
@@ -233,6 +282,7 @@
*/
void efi_sync_low_kernel_mappings(void)
{
+ int i;
unsigned num_entries;
pgd_t *pgd_k, *pgd_efi;
p4d_t *p4d_k, *p4d_efi;
@@ -257,7 +307,12 @@
pgd_k = pgd_offset_k(PAGE_OFFSET);
num_entries = pgd_index(EFI_VA_END) - pgd_index(PAGE_OFFSET);
- memcpy(pgd_efi, pgd_k, sizeof(pgd_t) * num_entries);
+ for (i = 0; i < num_entries; i++) {
+ pgd_efi[i] = pgd_k[i];
+ pgd_clear_nx(&pgd_efi[i]);
+ if (IS_ENABLED(CONFIG_KAISER))
+ pgd_efi[i + PTRS_PER_PGD] = pgd_k[i];
+ }
/*
* As with PGDs, we share all P4D entries apart from the one entry
@@ -272,7 +327,12 @@
p4d_k = p4d_offset(pgd_k, 0);
num_entries = p4d_index(EFI_VA_END);
- memcpy(p4d_efi, p4d_k, sizeof(p4d_t) * num_entries);
+ for (i = 0; i < num_entries; i++) {
+ p4d_efi[i] = p4d_k[i];
+ pgd_clear_nx(&p4d_efi[i]);
+ if (IS_ENABLED(CONFIG_KAISER))
+ p4d_efi[i + PTRS_PER_PGD] = p4d_k[i];
+ }
/*
* We share all the PUD entries apart from those that map the
@@ -405,6 +465,9 @@
static void __init __map_region(efi_memory_desc_t *md, u64 va)
{
+ int pgd_nr;
+ int start_pgd_nr;
+ int end_pgd_nr;
unsigned long flags = _PAGE_RW;
unsigned long pfn;
pgd_t *pgd = efi_pgd;
@@ -416,6 +479,12 @@
if (kernel_map_pages_in_pgd(pgd, pfn, va, md->num_pages, flags))
pr_warn("Error mapping PA 0x%llx -> VA 0x%llx!\n",
md->phys_addr, va);
+
+ start_pgd_nr = pgd_index(va);
+ end_pgd_nr = pgd_index(va + md->num_pages * PAGE_SIZE);
+ for (pgd_nr = start_pgd_nr; pgd_nr <= end_pgd_nr; pgd_nr++)
+ pgd_clear_nx(&efi_pgd[pgd_nr]);
+
}
void __init efi_map_region(efi_memory_desc_t *md)
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 8acfc1e..d7c6acb 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -59,6 +59,12 @@
/* Align . to a 8 byte boundary equals to maximum function alignment. */
#define ALIGN_FUNCTION() . = ALIGN(8)
+#ifdef CONFIG_KAISER
+#define ALIGN_KAISER() . = ALIGN(PAGE_SIZE);
+#else
+#define ALIGN_KAISER()
+#endif
+
/*
* LD_DEAD_CODE_DATA_ELIMINATION option enables -fdata-sections, which
* generates .data.identifier sections, which need to be pulled in with
@@ -493,15 +499,19 @@
VMLINUX_SYMBOL(__kprobes_text_end) = .;
#define ENTRY_TEXT \
+ ALIGN_KAISER(); \
ALIGN_FUNCTION(); \
VMLINUX_SYMBOL(__entry_text_start) = .; \
*(.entry.text) \
+ ALIGN_KAISER(); \
VMLINUX_SYMBOL(__entry_text_end) = .;
#define IRQENTRY_TEXT \
+ ALIGN_KAISER(); \
ALIGN_FUNCTION(); \
VMLINUX_SYMBOL(__irqentry_text_start) = .; \
*(.irqentry.text) \
+ ALIGN_KAISER(); \
VMLINUX_SYMBOL(__irqentry_text_end) = .;
#define SOFTIRQENTRY_TEXT \
@@ -807,7 +817,16 @@
*/
#define PERCPU_INPUT(cacheline) \
VMLINUX_SYMBOL(__per_cpu_start) = .; \
- *(.data..percpu..first) \
+ \
+ VMLINUX_SYMBOL(__per_cpu_user_mapped_start) = .; \
+ *(.data..percpu..first) \
+ . = ALIGN(cacheline); \
+ *(.data..percpu..user_mapped) \
+ *(.data..percpu..user_mapped..shared_aligned) \
+ . = ALIGN(PAGE_SIZE); \
+ *(.data..percpu..user_mapped..page_aligned) \
+ VMLINUX_SYMBOL(__per_cpu_user_mapped_end) = .; \
+ \
. = ALIGN(PAGE_SIZE); \
*(.data..percpu..page_aligned) \
. = ALIGN(cacheline); \
diff --git a/include/linux/kaiser.h b/include/linux/kaiser.h
new file mode 100644
index 0000000..1c3a7d5
--- /dev/null
+++ b/include/linux/kaiser.h
@@ -0,0 +1,31 @@
+#ifndef _INCLUDE_KAISER_H
+#define _INCLUDE_KAISER_H
+
+#ifdef CONFIG_KAISER
+#include <asm/kaiser.h>
+#else
+
+/*
+ * These stubs are used whenever CONFIG_KAISER is off, which
+ * includes architectures that support KAISER, but have it
+ * disabled.
+ */
+
+static inline void kaiser_init(void)
+{
+}
+static inline void kaiser_remove_mapping(unsigned long start, unsigned long size)
+{
+}
+
+static inline int kaiser_add_mapping(unsigned long addr, unsigned long size,
+ unsigned long flags)
+{
+ return 0;
+}
+
+static inline void kaiser_check_user_mapped(const void *addr)
+{
+}
+#endif /* !CONFIG_KAISER */
+#endif /* _INCLUDE_KAISER_H */
diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h
index 8f16299..8ea945f 100644
--- a/include/linux/percpu-defs.h
+++ b/include/linux/percpu-defs.h
@@ -35,6 +35,12 @@
#endif
+#ifdef CONFIG_KAISER
+#define USER_MAPPED_SECTION "..user_mapped"
+#else
+#define USER_MAPPED_SECTION ""
+#endif
+
/*
* Base implementations of per-CPU variable declarations and definitions, where
* the section in which the variable is to be placed is provided by the
@@ -115,6 +121,12 @@
#define DEFINE_PER_CPU(type, name) \
DEFINE_PER_CPU_SECTION(type, name, "")
+#define DECLARE_PER_CPU_USER_MAPPED(type, name) \
+ DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION)
+
+#define DEFINE_PER_CPU_USER_MAPPED(type, name) \
+ DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION)
+
/*
* Declaration/definition used for per-CPU variables that must come first in
* the set of variables.
@@ -144,6 +156,14 @@
DEFINE_PER_CPU_SECTION(type, name, PER_CPU_SHARED_ALIGNED_SECTION) \
____cacheline_aligned_in_smp
+#define DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name) \
+ DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION PER_CPU_SHARED_ALIGNED_SECTION) \
+ ____cacheline_aligned_in_smp
+
+#define DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name) \
+ DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION PER_CPU_SHARED_ALIGNED_SECTION) \
+ ____cacheline_aligned_in_smp
+
#define DECLARE_PER_CPU_ALIGNED(type, name) \
DECLARE_PER_CPU_SECTION(type, name, PER_CPU_ALIGNED_SECTION) \
____cacheline_aligned
@@ -162,6 +182,16 @@
#define DEFINE_PER_CPU_PAGE_ALIGNED(type, name) \
DEFINE_PER_CPU_SECTION(type, name, "..page_aligned") \
__aligned(PAGE_SIZE)
+/*
+ * Declaration/definition used for per-CPU variables that must be page aligned and need to be mapped in user mode.
+ */
+#define DECLARE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name) \
+ DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION"..page_aligned") \
+ __aligned(PAGE_SIZE)
+
+#define DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name) \
+ DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION"..page_aligned") \
+ __aligned(PAGE_SIZE)
/*
* Declaration/definition used for per-CPU variables that must be read mostly.
diff --git a/init/main.c b/init/main.c
index 0ee9c686..83cfd69 100644
--- a/init/main.c
+++ b/init/main.c
@@ -75,6 +75,7 @@
#include <linux/slab.h>
#include <linux/perf_event.h>
#include <linux/ptrace.h>
+#include <linux/kaiser.h>
#include <linux/blkdev.h>
#include <linux/elevator.h>
#include <linux/sched_clock.h>
@@ -504,6 +505,7 @@
pgtable_init();
vmalloc_init();
ioremap_huge_init();
+ kaiser_init();
}
asmlinkage __visible void __init start_kernel(void)
diff --git a/kernel/fork.c b/kernel/fork.c
index 07cc743..8a17e62 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -70,6 +70,7 @@
#include <linux/tsacct_kern.h>
#include <linux/cn_proc.h>
#include <linux/freezer.h>
+#include <linux/kaiser.h>
#include <linux/delayacct.h>
#include <linux/taskstats_kern.h>
#include <linux/random.h>
@@ -247,6 +248,8 @@
static inline void free_thread_stack(struct task_struct *tsk)
{
+ kaiser_remove_mapping((unsigned long)tsk->stack, THREAD_SIZE);
+
#ifdef CONFIG_VMAP_STACK
if (task_stack_vm_area(tsk)) {
int i;
@@ -536,6 +539,10 @@
* functions again.
*/
tsk->stack = stack;
+ err = kaiser_add_mapping((unsigned long)tsk->stack, THREAD_SIZE,
+ __PAGE_KERNEL);
+ if (err)
+ goto free_stack;
#ifdef CONFIG_VMAP_STACK
tsk->stack_vm_area = stack_vm_area;
#endif
diff --git a/mm/gup.c b/mm/gup.c
index b2b4d42..1a148a6 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -36,6 +36,20 @@
return NULL;
}
+enum pgtable_levels {
+ PGD_LEVEL = 5,
+ P4D_LEVEL = 4,
+ PUD_LEVEL = 3,
+ PMD_LEVEL = 2,
+ PTE_LEVEL = 1
+};
+/* noinline so the call site is visible in backtrace */
+noinline static struct page *bad_page_table(enum pgtable_levels level)
+{
+ WARN_ONCE(1, "bad page table at level %d\n", level);
+ return NULL;
+}
+
static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address,
pte_t *pte, unsigned int flags)
{
@@ -81,7 +95,7 @@
retry:
if (unlikely(pmd_bad(*pmd)))
- return no_page_table(vma, flags);
+ return bad_page_table(PMD_LEVEL);
ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
pte = *ptep;
@@ -334,7 +348,7 @@
return page;
}
if (unlikely(pud_bad(*pud)))
- return no_page_table(vma, flags);
+ return bad_page_table(PUD_LEVEL);
return follow_pmd_mask(vma, address, pud, flags, page_mask);
}
@@ -352,7 +366,7 @@
return no_page_table(vma, flags);
BUILD_BUG_ON(p4d_huge(*p4d));
if (unlikely(p4d_bad(*p4d)))
- return no_page_table(vma, flags);
+ return bad_page_table(P4D_LEVEL);
if (is_hugepd(__hugepd(p4d_val(*p4d)))) {
page = follow_huge_pd(vma, address,
@@ -397,8 +411,10 @@
pgd = pgd_offset(mm, address);
- if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
+ if (pgd_none(*pgd))
return no_page_table(vma, flags);
+ if (unlikely(pgd_bad(*pgd)))
+ return bad_page_table(PGD_LEVEL);
if (pgd_huge(*pgd)) {
page = follow_huge_pgd(mm, address, pgd, flags);
diff --git a/security/Kconfig b/security/Kconfig
index e8e4494..0033d81 100644
--- a/security/Kconfig
+++ b/security/Kconfig
@@ -54,6 +54,16 @@
implement socket and networking access controls.
If you are unsure how to answer this question, answer N.
+config KAISER
+ bool "Remove the kernel mapping in user mode"
+ depends on X86_64 && SMP && !PARAVIRT
+ help
+ This feature reduces the number of hardware side channels by
+ ensuring that the majority of kernel addresses are not mapped
+ into userspace.
+
+ See Documentation/x86/kaiser.txt for more details.
+
config SECURITY_INFINIBAND
bool "Infiniband Security Hooks"
depends on SECURITY && INFINIBAND
@@ -75,7 +85,6 @@
to communicate unlabelled data can send without using
IPSec.
If you are unsure how to answer this question, answer N.
-
config SECURITY_PATH
bool "Security hooks for pathname based access control"
depends on SECURITY