blob: e1e3b3c0159b95a353f14c5c8be86d97ddee915b [file] [log] [blame]
From: Hugh Dickins <hughd@google.com>
Date: Mon, 11 Dec 2017 17:59:50 -0800
Subject: KAISER: Kernel Address Isolation
This patch introduces our implementation of KAISER (Kernel Address
Isolation to have Side-channels Efficiently Removed), a kernel isolation
technique to close hardware side channels on kernel address information.
More information about the original patch can be found at:
https://github.com/IAIK/KAISER
http://marc.info/?l=linux-kernel&m=149390087310405&w=2
Daniel Gruss <daniel.gruss@iaik.tugraz.at>
Richard Fellner <richard.fellner@student.tugraz.at>
Michael Schwarz <michael.schwarz@iaik.tugraz.at>
<clementine.maurice@iaik.tugraz.at>
<moritz.lipp@iaik.tugraz.at>
That original was then developed further by
Dave Hansen <dave.hansen@intel.com>
Hugh Dickins <hughd@google.com>
then others after this snapshot.
This combined patch for 3.2.96 was derived from hughd's patches below
for 3.18.72, in 2017-12-04's kaiser-3.18.72.tar; except for the last,
which was sent in 2017-12-09's nokaiser-3.18.72.tar. They have been
combined in order to minimize the effort of rebasing: most of the
patches in the 3.18.72 series were small fixes and cleanups and
enhancements to three large patches. About the only new work in this
backport is a simple reimplementation of kaiser_remove_mapping():
since mm/pageattr.c changed a lot between 3.2 and 3.18, and the
mods there for Kaiser never seemed necessary.
KAISER: Kernel Address Isolation
kaiser: merged update
kaiser: do not set _PAGE_NX on pgd_none
kaiser: stack map PAGE_SIZE at THREAD_SIZE-PAGE_SIZE
kaiser: fix build and FIXME in alloc_ldt_struct()
kaiser: KAISER depends on SMP
kaiser: fix regs to do_nmi() ifndef CONFIG_KAISER
kaiser: fix perf crashes
kaiser: ENOMEM if kaiser_pagetable_walk() NULL
kaiser: tidied up asm/kaiser.h somewhat
kaiser: tidied up kaiser_add/remove_mapping slightly
kaiser: kaiser_remove_mapping() move along the pgd
kaiser: align addition to x86/mm/Makefile
kaiser: cleanups while trying for gold link
kaiser: name that 0x1000 KAISER_SHADOW_PGD_OFFSET
kaiser: delete KAISER_REAL_SWITCH option
kaiser: vmstat show NR_KAISERTABLE as nr_overhead
kaiser: enhanced by kernel and user PCIDs
kaiser: load_new_mm_cr3() let SWITCH_USER_CR3 flush user
kaiser: PCID 0 for kernel and 128 for user
kaiser: x86_cr3_pcid_noflush and x86_cr3_pcid_user
kaiser: paranoid_entry pass cr3 need to paranoid_exit
kaiser: _pgd_alloc() without __GFP_REPEAT to avoid stalls
kaiser: fix unlikely error in alloc_ldt_struct()
kaiser: drop is_atomic arg to kaiser_pagetable_walk()
Signed-off-by: Hugh Dickins <hughd@google.com>
[bwh:
- Fixed the #undef in arch/x86/boot/compressed/misc.h
- Add missing #include in arch/x86/mm/kaiser.c]
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
arch/x86/boot/compressed/misc.h | 1 +
arch/x86/ia32/ia32entry.S | 7 +
arch/x86/include/asm/cpufeature.h | 1 +
arch/x86/include/asm/desc.h | 2 +-
arch/x86/include/asm/hw_irq.h | 2 +-
arch/x86/include/asm/kaiser.h | 126 ++++++++++
arch/x86/include/asm/pgtable.h | 18 +-
arch/x86/include/asm/pgtable_64.h | 29 ++-
arch/x86/include/asm/pgtable_types.h | 33 ++-
arch/x86/include/asm/processor-flags.h | 2 +
arch/x86/include/asm/processor.h | 2 +-
arch/x86/include/asm/tlbflush.h | 64 ++++-
arch/x86/kernel/cpu/common.c | 18 +-
arch/x86/kernel/cpu/perf_event_intel_ds.c | 54 ++++-
arch/x86/kernel/entry_64.S | 117 +++++++--
arch/x86/kernel/espfix_64.c | 9 +
arch/x86/kernel/head_64.S | 25 +-
arch/x86/kernel/init_task.c | 2 +-
arch/x86/kernel/irqinit.c | 2 +-
arch/x86/kernel/ldt.c | 25 +-
arch/x86/kernel/process_64.c | 2 +-
arch/x86/mm/Makefile | 1 +
arch/x86/mm/kaiser.c | 382 ++++++++++++++++++++++++++++++
arch/x86/mm/pgtable.c | 31 ++-
arch/x86/mm/tlb.c | 48 +++-
include/asm-generic/vmlinux.lds.h | 7 +
include/linux/kaiser.h | 52 ++++
include/linux/mmzone.h | 3 +-
include/linux/percpu-defs.h | 32 ++-
init/main.c | 2 +
kernel/fork.c | 6 +
mm/vmstat.c | 1 +
security/Kconfig | 10 +
33 files changed, 1049 insertions(+), 67 deletions(-)
create mode 100644 arch/x86/include/asm/kaiser.h
create mode 100644 arch/x86/mm/kaiser.c
create mode 100644 include/linux/kaiser.h
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index 3f19c81a6203..2fa2635ee539 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -7,6 +7,7 @@
* we just keep it from happening
*/
#undef CONFIG_PARAVIRT
+#undef CONFIG_KAISER
#ifdef CONFIG_X86_32
#define _ASM_X86_DESC_H 1
#endif
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 2b5527726ae1..7eb0d4792800 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -12,6 +12,8 @@
#include <asm/ia32_unistd.h>
#include <asm/thread_info.h>
#include <asm/segment.h>
+#include <asm/pgtable_types.h>
+#include <asm/kaiser.h>
#include <asm/irqflags.h>
#include <linux/linkage.h>
@@ -120,6 +122,7 @@ ENTRY(ia32_sysenter_target)
CFI_DEF_CFA rsp,0
CFI_REGISTER rsp,rbp
SWAPGS_UNSAFE_STACK
+ SWITCH_KERNEL_CR3_NO_STACK
movq PER_CPU_VAR(kernel_stack), %rsp
addq $(KERNEL_STACK_OFFSET),%rsp
/*
@@ -183,6 +186,7 @@ ENTRY(ia32_sysenter_target)
popq_cfi %rcx /* User %esp */
CFI_REGISTER rsp,rcx
TRACE_IRQS_ON
+ SWITCH_USER_CR3
ENABLE_INTERRUPTS_SYSEXIT32
#ifdef CONFIG_AUDITSYSCALL
@@ -281,6 +285,7 @@ ENTRY(ia32_cstar_target)
CFI_REGISTER rip,rcx
/*CFI_REGISTER rflags,r11*/
SWAPGS_UNSAFE_STACK
+ SWITCH_KERNEL_CR3_NO_STACK
movl %esp,%r8d
CFI_REGISTER rsp,r8
movq PER_CPU_VAR(kernel_stack),%rsp
@@ -337,6 +342,7 @@ ENTRY(ia32_cstar_target)
xorq %r9,%r9
xorq %r8,%r8
TRACE_IRQS_ON
+ SWITCH_USER_CR3
movl RSP-ARGOFFSET(%rsp),%esp
CFI_RESTORE rsp
USERGS_SYSRET32
@@ -409,6 +415,7 @@ ENTRY(ia32_syscall)
CFI_REL_OFFSET rip,RIP-RIP
PARAVIRT_ADJUST_EXCEPTION_FRAME
SWAPGS
+ SWITCH_KERNEL_CR3_NO_STACK
/*
* No need to follow this irqs on/off section: the syscall
* disabled irqs and here we enable it straight after entry:
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 6f254f2fcd40..736272670870 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -176,6 +176,7 @@
#define X86_FEATURE_PLN (7*32+ 5) /* Intel Power Limit Notification */
#define X86_FEATURE_PTS (7*32+ 6) /* Intel Package Thermal Status */
#define X86_FEATURE_DTHERM (7*32+ 7) /* Digital Thermal Sensor */
+#define X86_FEATURE_INVPCID_SINGLE (7*32+ 8) /* Effectively INVPCID && CR4.PCIDE=1 */
/* Virtualization flags: Linux defined, word 8 */
#define X86_FEATURE_TPR_SHADOW (8*32+ 0) /* Intel TPR Shadow */
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index 382ce8a9fd62..7f1ead938ec1 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -40,7 +40,7 @@ struct gdt_page {
struct desc_struct gdt[GDT_ENTRIES];
} __attribute__((aligned(PAGE_SIZE)));
-DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page);
+DECLARE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(struct gdt_page, gdt_page);
static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
{
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index eb92a6ed2be7..3354a390cc71 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -164,7 +164,7 @@ extern asmlinkage void smp_invalidate_interrupt(struct pt_regs *);
extern void (*__initconst interrupt[NR_VECTORS-FIRST_EXTERNAL_VECTOR])(void);
typedef int vector_irq_t[NR_VECTORS];
-DECLARE_PER_CPU(vector_irq_t, vector_irq);
+DECLARE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq);
extern void setup_vector_irq(int cpu);
#ifdef CONFIG_X86_IO_APIC
diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h
new file mode 100644
index 000000000000..6f4c8ef46881
--- /dev/null
+++ b/arch/x86/include/asm/kaiser.h
@@ -0,0 +1,126 @@
+#ifndef _ASM_X86_KAISER_H
+#define _ASM_X86_KAISER_H
+
+#include <asm/processor-flags.h> /* For PCID constants */
+
+/*
+ * This file includes the definitions for the KAISER feature.
+ * KAISER is a counter measure against x86_64 side channel attacks on
+ * the kernel virtual memory. It has a shadow pgd for every process: the
+ * shadow pgd has a minimalistic kernel-set mapped, but includes the whole
+ * user memory. Within a kernel context switch, or when an interrupt is handled,
+ * the pgd is switched to the normal one. When the system switches to user mode,
+ * the shadow pgd is enabled. By this, the virtual memory caches are freed,
+ * and the user may not attack the whole kernel memory.
+ *
+ * A minimalistic kernel mapping holds the parts needed to be mapped in user
+ * mode, such as the entry/exit functions of the user space, or the stacks.
+ */
+
+#define KAISER_SHADOW_PGD_OFFSET 0x1000
+
+#ifdef __ASSEMBLY__
+#ifdef CONFIG_KAISER
+
+.macro _SWITCH_TO_KERNEL_CR3 reg
+movq %cr3, \reg
+andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), \reg
+orq x86_cr3_pcid_noflush, \reg
+movq \reg, %cr3
+.endm
+
+.macro _SWITCH_TO_USER_CR3 reg regb
+/*
+ * regb must be the low byte portion of reg: because we have arranged
+ * for the low byte of the user PCID to serve as the high byte of NOFLUSH
+ * (0x80 for each when PCID is enabled, or 0x00 when PCID and NOFLUSH are
+ * not enabled): so that the one register can update both memory and cr3.
+ */
+movq %cr3, \reg
+orq PER_CPU_VAR(x86_cr3_pcid_user), \reg
+js 9f
+/* FLUSH this time, reset to NOFLUSH for next time (if PCID enabled) */
+movb \regb, PER_CPU_VAR(x86_cr3_pcid_user+7)
+9:
+movq \reg, %cr3
+.endm
+
+.macro SWITCH_KERNEL_CR3
+pushq %rax
+_SWITCH_TO_KERNEL_CR3 %rax
+popq %rax
+.endm
+
+.macro SWITCH_USER_CR3
+pushq %rax
+_SWITCH_TO_USER_CR3 %rax %al
+popq %rax
+.endm
+
+.macro SWITCH_KERNEL_CR3_NO_STACK
+movq %rax, PER_CPU_VAR(unsafe_stack_register_backup)
+_SWITCH_TO_KERNEL_CR3 %rax
+movq PER_CPU_VAR(unsafe_stack_register_backup), %rax
+.endm
+
+#else /* CONFIG_KAISER */
+
+.macro SWITCH_KERNEL_CR3 reg
+.endm
+.macro SWITCH_USER_CR3 reg regb
+.endm
+.macro SWITCH_KERNEL_CR3_NO_STACK
+.endm
+
+#endif /* CONFIG_KAISER */
+
+#else /* __ASSEMBLY__ */
+
+#ifdef CONFIG_KAISER
+/*
+ * Upon kernel/user mode switch, it may happen that the address
+ * space has to be switched before the registers have been
+ * stored. To change the address space, another register is
+ * needed. A register therefore has to be stored/restored.
+*/
+DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
+
+extern unsigned long x86_cr3_pcid_noflush;
+DECLARE_PER_CPU(unsigned long, x86_cr3_pcid_user);
+
+extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[];
+
+/**
+ * kaiser_add_mapping - map a virtual memory part to the shadow (user) mapping
+ * @addr: the start address of the range
+ * @size: the size of the range
+ * @flags: The mapping flags of the pages
+ *
+ * The mapping is done on a global scope, so no bigger
+ * synchronization has to be done. the pages have to be
+ * manually unmapped again when they are not needed any longer.
+ */
+extern int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags);
+
+/**
+ * kaiser_remove_mapping - unmap a virtual memory part of the shadow mapping
+ * @addr: the start address of the range
+ * @size: the size of the range
+ */
+extern void kaiser_remove_mapping(unsigned long start, unsigned long size);
+
+/**
+ * kaiser_init - Initialize the shadow mapping
+ *
+ * Most parts of the shadow mapping can be mapped upon boot
+ * time. Only per-process things like the thread stacks
+ * or a new LDT have to be mapped at runtime. These boot-
+ * time mappings are permanent and never unmapped.
+ */
+extern void kaiser_init(void);
+
+#endif /* CONFIG_KAISER */
+
+#endif /* __ASSEMBLY */
+
+#endif /* _ASM_X86_KAISER_H */
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 6be990922d4b..b1c8b8d3b02a 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -570,7 +570,17 @@ static inline pud_t *pud_offset(pgd_t *pgd, unsigned long address)
static inline int pgd_bad(pgd_t pgd)
{
- return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE;
+ pgdval_t ignore_flags = _PAGE_USER;
+ /*
+ * We set NX on KAISER pgds that map userspace memory so
+ * that userspace can not meaningfully use the kernel
+ * page table by accident; it will fault on the first
+ * instruction it tries to run. See native_set_pgd().
+ */
+ if (IS_ENABLED(CONFIG_KAISER))
+ ignore_flags |= _PAGE_NX;
+
+ return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE;
}
static inline int pgd_none(pgd_t pgd)
@@ -771,6 +781,12 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm,
static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
{
memcpy(dst, src, count * sizeof(pgd_t));
+#ifdef CONFIG_KAISER
+ /* Clone the shadow pgd part as well */
+ memcpy(native_get_shadow_pgd(dst),
+ native_get_shadow_pgd(src),
+ count * sizeof(pgd_t));
+#endif
}
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 975f709e09ae..a3bf3de9893b 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -105,9 +105,36 @@ static inline void native_pud_clear(pud_t *pud)
native_set_pud(pud, native_make_pud(0));
}
+#ifdef CONFIG_KAISER
+extern pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd);
+
+static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp)
+{
+ return (pgd_t *)((unsigned long)pgdp | (unsigned long)PAGE_SIZE);
+}
+
+static inline pgd_t *native_get_normal_pgd(pgd_t *pgdp)
+{
+ return (pgd_t *)((unsigned long)pgdp & ~(unsigned long)PAGE_SIZE);
+}
+#else
+static inline pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd)
+{
+ return pgd;
+}
+static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp)
+{
+ return NULL;
+}
+static inline pgd_t *native_get_normal_pgd(pgd_t *pgdp)
+{
+ return pgdp;
+}
+#endif /* CONFIG_KAISER */
+
static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
{
- *pgdp = pgd;
+ *pgdp = kaiser_set_shadow_pgd(pgdp, pgd);
}
static inline void native_pgd_clear(pgd_t *pgd)
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 013286a10c2c..6e1315068a62 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -39,7 +39,11 @@
#define _PAGE_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED)
#define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY)
#define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE)
+#ifdef CONFIG_KAISER
+#define _PAGE_GLOBAL (_AT(pteval_t, 0))
+#else
#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
+#endif
#define _PAGE_UNUSED1 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED1)
#define _PAGE_IOMAP (_AT(pteval_t, 1) << _PAGE_BIT_IOMAP)
#define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT)
@@ -62,7 +66,7 @@
#endif
#define _PAGE_FILE (_AT(pteval_t, 1) << _PAGE_BIT_FILE)
-#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
+#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
_PAGE_ACCESSED | _PAGE_DIRTY)
@@ -74,6 +78,33 @@
_PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY)
#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE)
+/* The ASID is the lower 12 bits of CR3 */
+#define X86_CR3_PCID_ASID_MASK (_AC((1<<12)-1,UL))
+
+/* Mask for all the PCID-related bits in CR3: */
+#define X86_CR3_PCID_MASK (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_MASK)
+#define X86_CR3_PCID_ASID_KERN (_AC(0x0,UL))
+
+#if defined(CONFIG_KAISER) && defined(CONFIG_X86_64)
+/* Let X86_CR3_PCID_ASID_USER be usable for the X86_CR3_PCID_NOFLUSH bit */
+#define X86_CR3_PCID_ASID_USER (_AC(0x80,UL))
+
+#define X86_CR3_PCID_KERN_FLUSH (X86_CR3_PCID_ASID_KERN)
+#define X86_CR3_PCID_USER_FLUSH (X86_CR3_PCID_ASID_USER)
+#define X86_CR3_PCID_KERN_NOFLUSH (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_KERN)
+#define X86_CR3_PCID_USER_NOFLUSH (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_USER)
+#else
+#define X86_CR3_PCID_ASID_USER (_AC(0x0,UL))
+/*
+ * PCIDs are unsupported on 32-bit and none of these bits can be
+ * set in CR3:
+ */
+#define X86_CR3_PCID_KERN_FLUSH (0)
+#define X86_CR3_PCID_USER_FLUSH (0)
+#define X86_CR3_PCID_KERN_NOFLUSH (0)
+#define X86_CR3_PCID_USER_NOFLUSH (0)
+#endif
+
#define _PAGE_CACHE_MASK (_PAGE_PCD | _PAGE_PWT)
#define _PAGE_CACHE_WB (0)
#define _PAGE_CACHE_WC (_PAGE_PWT)
diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h
index a9e14a52385f..360e80d0d217 100644
--- a/arch/x86/include/asm/processor-flags.h
+++ b/arch/x86/include/asm/processor-flags.h
@@ -43,6 +43,8 @@
*/
#define X86_CR3_PWT 0x00000008 /* Page Write Through */
#define X86_CR3_PCD 0x00000010 /* Page Cache Disable */
+#define X86_CR3_PCID_NOFLUSH_BIT 63 /* Preserve old PCID */
+#define X86_CR3_PCID_NOFLUSH (_AC(1,ULL) << X86_CR3_PCID_NOFLUSH_BIT)
/*
* Intel CPU features in CR4
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index f7c89e231c6c..048249e983ca 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -266,7 +266,7 @@ struct tss_struct {
} ____cacheline_aligned;
-DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss);
+DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, init_tss);
/*
* Save the original ist values for checking stack pointers during debugging
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index e04cbc550424..288195901c8a 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -64,27 +64,59 @@ static inline void invpcid_flush_all_nonglobals(void)
#define __flush_tlb_single(addr) __native_flush_tlb_single(addr)
#endif
+/*
+ * Declare a couple of kaiser interfaces here for convenience,
+ * to avoid the need for asm/kaiser.h in unexpected places.
+ */
+#ifdef CONFIG_KAISER
+extern void kaiser_setup_pcid(void);
+extern void kaiser_flush_tlb_on_return_to_user(void);
+#else
+static inline void kaiser_setup_pcid(void)
+{
+}
+static inline void kaiser_flush_tlb_on_return_to_user(void)
+{
+}
+#endif
+
static inline void __native_flush_tlb(void)
{
+ if (this_cpu_has(X86_FEATURE_INVPCID)) {
+ /*
+ * Note, this works with CR4.PCIDE=0 or 1.
+ */
+ invpcid_flush_all_nonglobals();
+ return;
+ }
+
/*
* If current->mm == NULL then we borrow a mm which may change during a
* task switch and therefore we must not be preempted while we write CR3
* back:
*/
preempt_disable();
+ if (this_cpu_has(X86_FEATURE_PCID))
+ kaiser_flush_tlb_on_return_to_user();
native_write_cr3(native_read_cr3());
preempt_enable();
}
static inline void __native_flush_tlb_global(void)
{
+#ifdef CONFIG_KAISER
+ /* Globals are not used at all */
+ __native_flush_tlb();
+#else
unsigned long flags;
unsigned long cr4;
- if (static_cpu_has(X86_FEATURE_INVPCID)) {
+ if (this_cpu_has(X86_FEATURE_INVPCID)) {
/*
* Using INVPCID is considerably faster than a pair of writes
* to CR4 sandwiched inside an IRQ flag save/restore.
+ *
+ * Note, this works with CR4.PCIDE=0 or 1.
*/
invpcid_flush_all();
return;
@@ -104,11 +136,39 @@ static inline void __native_flush_tlb_global(void)
native_write_cr4(cr4);
raw_local_irq_restore(flags);
+#endif
}
static inline void __native_flush_tlb_single(unsigned long addr)
{
- asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
+ /*
+ * SIMICS #GP's if you run INVPCID with type 2/3
+ * and X86_CR4_PCIDE clear. Shame!
+ *
+ * The ASIDs used below are hard-coded. But, we must not
+ * call invpcid(type=1/2) before CR4.PCIDE=1. Just call
+ * invlpg in the case we are called early.
+ */
+
+ if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE)) {
+ if (this_cpu_has(X86_FEATURE_PCID))
+ kaiser_flush_tlb_on_return_to_user();
+ asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
+ return;
+ }
+ /* Flush the address out of both PCIDs. */
+ /*
+ * An optimization here might be to determine addresses
+ * that are only kernel-mapped and only flush the kernel
+ * ASID. But, userspace flushes are probably much more
+ * important performance-wise.
+ *
+ * Make sure to do only a single invpcid when KAISER is
+ * disabled and we have only a single ASID.
+ */
+ if (X86_CR3_PCID_ASID_KERN != X86_CR3_PCID_ASID_USER)
+ invpcid_flush_one(X86_CR3_PCID_ASID_KERN, addr);
+ invpcid_flush_one(X86_CR3_PCID_ASID_USER, addr);
}
static inline void __flush_tlb_all(void)
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 895e4b88469c..b567c89fc628 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -84,7 +84,7 @@ static const struct cpu_dev __cpuinitconst default_cpu = {
static const struct cpu_dev *this_cpu __cpuinitdata = &default_cpu;
-DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
+DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(struct gdt_page, gdt_page) = { .gdt = {
#ifdef CONFIG_X86_64
/*
* We need valid kernel segments for data and code in long mode too
@@ -319,6 +319,19 @@ static void setup_pcid(struct cpuinfo_x86 *c)
* SDM says that it can't be enabled in 32-bit mode.
*/
set_in_cr4(X86_CR4_PCIDE);
+ /*
+ * INVPCID has two "groups" of types:
+ * 1/2: Invalidate an individual address
+ * 3/4: Invalidate all contexts
+ *
+ * 1/2 take a PCID, but 3/4 do not. So, 3/4
+ * ignore the PCID argument in the descriptor.
+ * But, we have to be careful not to call 1/2
+ * with an actual non-zero PCID in them before
+ * we do the above set_in_cr4().
+ */
+ if (cpu_has(c, X86_FEATURE_INVPCID))
+ set_cpu_cap(c, X86_FEATURE_INVPCID_SINGLE);
} else {
/*
* flush_tlb_all(), as currently implemented, won't
@@ -331,6 +344,7 @@ static void setup_pcid(struct cpuinfo_x86 *c)
clear_cpu_cap(c, X86_FEATURE_PCID);
}
}
+ kaiser_setup_pcid();
}
/*
@@ -1115,7 +1129,7 @@ static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
[DEBUG_STACK - 1] = DEBUG_STKSZ
};
-static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
+DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(char, exception_stacks
[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
/* May not be marked __init: used by software suspend */
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 2d4e76ba2b5c..fb933cdca184 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -2,10 +2,14 @@
#include <linux/types.h>
#include <linux/slab.h>
+#include <asm/kaiser.h>
#include <asm/perf_event.h>
#include "perf_event.h"
+static
+DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct debug_store, cpu_debug_store);
+
/* The size of a BTS record in bytes: */
#define BTS_RECORD_SIZE 24
@@ -60,6 +64,39 @@ void fini_debug_store_on_cpu(int cpu)
wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0);
}
+static void *dsalloc(size_t size, gfp_t flags, int node)
+{
+#ifdef CONFIG_KAISER
+ unsigned int order = get_order(size);
+ struct page *page;
+ unsigned long addr;
+
+ page = alloc_pages_node(node, flags | __GFP_ZERO, order);
+ if (!page)
+ return NULL;
+ addr = (unsigned long)page_address(page);
+ if (kaiser_add_mapping(addr, size, __PAGE_KERNEL) < 0) {
+ __free_pages(page, order);
+ addr = 0;
+ }
+ return (void *)addr;
+#else
+ return kmalloc_node(size, flags | __GFP_ZERO, node);
+#endif
+}
+
+static void dsfree(const void *buffer, size_t size)
+{
+#ifdef CONFIG_KAISER
+ if (!buffer)
+ return;
+ kaiser_remove_mapping((unsigned long)buffer, size);
+ free_pages((unsigned long)buffer, get_order(size));
+#else
+ kfree(buffer);
+#endif
+}
+
static int alloc_pebs_buffer(int cpu)
{
struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
@@ -70,7 +107,7 @@ static int alloc_pebs_buffer(int cpu)
if (!x86_pmu.pebs)
return 0;
- buffer = kmalloc_node(PEBS_BUFFER_SIZE, GFP_KERNEL | __GFP_ZERO, node);
+ buffer = dsalloc(PEBS_BUFFER_SIZE, GFP_KERNEL, node);
if (unlikely(!buffer))
return -ENOMEM;
@@ -94,7 +131,7 @@ static void release_pebs_buffer(int cpu)
if (!ds || !x86_pmu.pebs)
return;
- kfree((void *)(unsigned long)ds->pebs_buffer_base);
+ dsfree((void *)(unsigned long)ds->pebs_buffer_base, PEBS_BUFFER_SIZE);
ds->pebs_buffer_base = 0;
}
@@ -108,7 +145,7 @@ static int alloc_bts_buffer(int cpu)
if (!x86_pmu.bts)
return 0;
- buffer = kmalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_ZERO, node);
+ buffer = dsalloc(BTS_BUFFER_SIZE, GFP_KERNEL, node);
if (unlikely(!buffer))
return -ENOMEM;
@@ -132,19 +169,15 @@ static void release_bts_buffer(int cpu)
if (!ds || !x86_pmu.bts)
return;
- kfree((void *)(unsigned long)ds->bts_buffer_base);
+ dsfree((void *)(unsigned long)ds->bts_buffer_base, BTS_BUFFER_SIZE);
ds->bts_buffer_base = 0;
}
static int alloc_ds_buffer(int cpu)
{
- int node = cpu_to_node(cpu);
- struct debug_store *ds;
-
- ds = kmalloc_node(sizeof(*ds), GFP_KERNEL | __GFP_ZERO, node);
- if (unlikely(!ds))
- return -ENOMEM;
+ struct debug_store *ds = per_cpu_ptr(&cpu_debug_store, cpu);
+ memset(ds, 0, sizeof(*ds));
per_cpu(cpu_hw_events, cpu).ds = ds;
return 0;
@@ -158,7 +191,6 @@ static void release_ds_buffer(int cpu)
return;
per_cpu(cpu_hw_events, cpu).ds = NULL;
- kfree(ds);
}
void release_ds_buffers(void)
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index f6daf3cdb878..3a4356a2f156 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -56,6 +56,7 @@
#include <asm/ftrace.h>
#include <asm/percpu.h>
#include <asm/pgtable_types.h>
+#include <asm/kaiser.h>
/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
#include <linux/elf-em.h>
@@ -323,6 +324,7 @@ ENDPROC(native_usergs_sysret64)
testl $3, CS(%rdi)
je 1f
SWAPGS
+ SWITCH_KERNEL_CR3
/*
* irq_count is used to check if a CPU is already on an interrupt stack
* or not. While this is essentially redundant with preempt_count it is
@@ -362,6 +364,12 @@ END(save_rest)
/* save complete stack frame */
.pushsection .kprobes.text, "ax"
+/*
+ * Return: ebx=0: needs swapgs but not SWITCH_USER_CR3 in paranoid_exit
+ * ebx=1: needs neither swapgs nor SWITCH_USER_CR3 in paranoid_exit
+ * ebx=2: needs both swapgs and SWITCH_USER_CR3 in paranoid_exit
+ * ebx=3: needs SWITCH_USER_CR3 but not swapgs in paranoid_exit
+ */
ENTRY(save_paranoid)
XCPT_FRAME 1 RDI+8
cld
@@ -387,7 +395,25 @@ ENTRY(save_paranoid)
js 1f /* negative -> in kernel */
SWAPGS
xorl %ebx,%ebx
-1: ret
+1:
+#ifdef CONFIG_KAISER
+ /*
+ * We might have come in between a swapgs and a SWITCH_KERNEL_CR3
+ * on entry, or between a SWITCH_USER_CR3 and a swapgs on exit.
+ * Do a conditional SWITCH_KERNEL_CR3: this could safely be done
+ * unconditionally, but we need to find out whether the reverse
+ * should be done on return (conveyed to paranoid_exit in %ebx).
+ */
+ movq %cr3, %rax
+ testl $KAISER_SHADOW_PGD_OFFSET, %eax
+ jz 2f
+ orl $2, %ebx
+ andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
+ orq x86_cr3_pcid_noflush, %rax
+ movq %rax, %cr3
+2:
+#endif
+ ret
CFI_ENDPROC
END(save_paranoid)
.popsection
@@ -464,6 +490,7 @@ ENTRY(system_call)
CFI_REGISTER rip,rcx
/*CFI_REGISTER rflags,r11*/
SWAPGS_UNSAFE_STACK
+ SWITCH_KERNEL_CR3_NO_STACK
/*
* A hypervisor implementation might want to use a label
* after the swapgs, so that it can do the swapgs
@@ -515,6 +542,14 @@ ENTRY(system_call_after_swapgs)
CFI_REGISTER rip,rcx
RESTORE_ARGS 1,-ARG_SKIP,0
/*CFI_REGISTER rflags,r11*/
+ /*
+ * This opens a window where we have a user CR3, but are
+ * running in the kernel. This makes using the CS
+ * register useless for telling whether or not we need to
+ * switch CR3 in NMIs. Normal interrupts are OK because
+ * they are off here.
+ */
+ SWITCH_USER_CR3
movq PER_CPU_VAR(old_rsp), %rsp
USERGS_SYSRET64
@@ -851,6 +886,14 @@ retint_swapgs: /* return to user-space */
*/
DISABLE_INTERRUPTS(CLBR_ANY)
TRACE_IRQS_IRETQ
+ /*
+ * This opens a window where we have a user CR3, but are
+ * running in the kernel. This makes using the CS
+ * register useless for telling whether or not we need to
+ * switch CR3 in NMIs. Normal interrupts are OK because
+ * they are off here.
+ */
+ SWITCH_USER_CR3
SWAPGS
jmp restore_args
@@ -891,6 +934,7 @@ ENTRY(native_iret)
pushq_cfi %rax
pushq_cfi %rdi
SWAPGS
+ SWITCH_KERNEL_CR3
movq PER_CPU_VAR(espfix_waddr),%rdi
movq %rax,(0*8)(%rdi) /* RAX */
movq (2*8)(%rsp),%rax /* RIP */
@@ -906,6 +950,7 @@ ENTRY(native_iret)
andl $0xffff0000,%eax
popq_cfi %rdi
orq PER_CPU_VAR(espfix_stack),%rax
+ SWITCH_USER_CR3
SWAPGS
movq %rax,%rsp
popq_cfi %rax
@@ -1366,30 +1411,40 @@ paranoidzeroentry machine_check *machine_check_vector(%rip)
* is fundamentally NMI-unsafe. (we cannot change the soft and
* hard flags at once, atomically)
*/
-
- /* ebx: no swapgs flag */
+/*
+ * On entry: ebx=0: needs swapgs but not SWITCH_USER_CR3
+ * ebx=1: needs neither swapgs nor SWITCH_USER_CR3
+ * ebx=2: needs both swapgs and SWITCH_USER_CR3
+ * ebx=3: needs SWITCH_USER_CR3 but not swapgs
+ */
ENTRY(paranoid_exit)
DEFAULT_FRAME
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
- testl %ebx,%ebx /* swapgs needed? */
- jnz paranoid_restore
- testl $3,CS(%rsp)
- jnz paranoid_userspace
-paranoid_swapgs:
+ movq %rbx, %r12 /* paranoid_userspace uses %ebx */
+ testl $3, CS(%rsp)
+ jnz paranoid_userspace
+paranoid_kernel:
+ movq %r12, %rbx /* restore after paranoid_userspace */
TRACE_IRQS_IRETQ 0
+#ifdef CONFIG_KAISER
+ testl $2, %ebx /* SWITCH_USER_CR3 needed? */
+ jz paranoid_exit_no_switch
+ SWITCH_USER_CR3
+paranoid_exit_no_switch:
+#endif
+ testl $1, %ebx /* swapgs needed? */
+ jnz paranoid_exit_no_swapgs
SWAPGS_UNSAFE_STACK
+paranoid_exit_no_swapgs:
RESTORE_ALL 8
- jmp irq_return
-paranoid_restore:
- TRACE_IRQS_IRETQ 0
- RESTORE_ALL 8
- jmp irq_return
+ jmp irq_return
+
paranoid_userspace:
GET_THREAD_INFO(%rcx)
movl TI_flags(%rcx),%ebx
andl $_TIF_WORK_MASK,%ebx
- jz paranoid_swapgs
+ jz paranoid_kernel
movq %rsp,%rdi /* &pt_regs */
call sync_regs
movq %rax,%rsp /* switch stack for scheduling */
@@ -1438,6 +1493,13 @@ ENTRY(error_entry)
movq_cfi r13, R13+8
movq_cfi r14, R14+8
movq_cfi r15, R15+8
+ /*
+ * error_entry() always returns with a kernel gsbase and
+ * CR3. We must also have a kernel CR3/gsbase before
+ * calling TRACE_IRQS_*. Just unconditionally switch to
+ * the kernel CR3 here.
+ */
+ SWITCH_KERNEL_CR3
xorl %ebx,%ebx
testl $3,CS+8(%rsp)
je error_kernelspace
@@ -1527,22 +1589,31 @@ ENTRY(nmi)
call do_nmi
#ifdef CONFIG_TRACE_IRQFLAGS
/* paranoidexit; without TRACE_IRQS_OFF */
- /* ebx: no swapgs flag */
+ /* ebx: no-swapgs and kaiser-switch-cr3 flag */
DISABLE_INTERRUPTS(CLBR_NONE)
- testl %ebx,%ebx /* swapgs needed? */
- jnz nmi_restore
- testl $3,CS(%rsp)
- jnz nmi_userspace
-nmi_swapgs:
+ movq %rbx, %r12 /* nmi_userspace uses %ebx */
+ testl $3, CS(%rsp)
+ jnz nmi_userspace
+nmi_kernel:
+ movq %r12, %rbx /* restore after nmi_userspace */
+#ifdef CONFIG_KAISER
+ testl $2, %ebx /* SWITCH_USER_CR3 needed? */
+ jz nmi_exit_no_switch
+ SWITCH_USER_CR3
+nmi_exit_no_switch:
+#endif
+ testl $1, %ebx /* swapgs needed? */
+ jnz nmi_exit_no_swapgs
SWAPGS_UNSAFE_STACK
-nmi_restore:
+nmi_exit_no_swapgs:
RESTORE_ALL 8
- jmp irq_return
+ jmp irq_return
+
nmi_userspace:
GET_THREAD_INFO(%rcx)
movl TI_flags(%rcx),%ebx
andl $_TIF_WORK_MASK,%ebx
- jz nmi_swapgs
+ jz nmi_kernel
movq %rsp,%rdi /* &pt_regs */
call sync_regs
movq %rax,%rsp /* switch stack for scheduling */
diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
index 94d857fb1033..14cd73b0e634 100644
--- a/arch/x86/kernel/espfix_64.c
+++ b/arch/x86/kernel/espfix_64.c
@@ -41,6 +41,7 @@
#include <asm/pgalloc.h>
#include <asm/setup.h>
#include <asm/espfix.h>
+#include <asm/kaiser.h>
/*
* Note: we only need 6*8 = 48 bytes for the espfix stack, but round
@@ -129,6 +130,14 @@ void __init init_espfix_bsp(void)
/* Install the espfix pud into the kernel page directory */
pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)];
pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page);
+ /*
+ * Just copy the top-level PGD that is mapping the espfix
+ * area to ensure it is mapped into the shadow user page
+ * tables.
+ */
+ if (IS_ENABLED(CONFIG_KAISER))
+ set_pgd(native_get_shadow_pgd(pgd_p),
+ __pgd(_KERNPG_TABLE | __pa((pud_t *)espfix_pud_page)));
/* Randomize the locations */
init_espfix_random();
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 0f8ebf78253a..6e697ac3fb54 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -338,6 +338,27 @@ ENTRY(early_idt_handler)
.balign PAGE_SIZE; \
ENTRY(name)
+#ifdef CONFIG_KAISER
+/*
+ * Each PGD needs to be 8k long and 8k aligned. We do not
+ * ever go out to userspace with these, so we do not
+ * strictly *need* the second page, but this allows us to
+ * have a single set_pgd() implementation that does not
+ * need to worry about whether it has 4k or 8k to work
+ * with.
+ *
+ * This ensures PGDs are 8k long:
+ */
+#define KAISER_USER_PGD_FILL 512
+/* This ensures they are 8k-aligned: */
+#define NEXT_PGD_PAGE(name) \
+ .balign 2 * PAGE_SIZE; \
+GLOBAL(name)
+#else
+#define NEXT_PGD_PAGE(name) NEXT_PAGE(name)
+#define KAISER_USER_PGD_FILL 0
+#endif
+
/* Automate the creation of 1 to 1 mapping pmd entries */
#define PMDS(START, PERM, COUNT) \
i = 0 ; \
@@ -353,13 +374,14 @@ ENTRY(name)
* 0xffffffff80000000 to physical address 0x000000. (always using
* 2Mbyte large pages provided by PAE mode)
*/
-NEXT_PAGE(init_level4_pgt)
+NEXT_PGD_PAGE(init_level4_pgt)
.quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
.org init_level4_pgt + L4_PAGE_OFFSET*8, 0
.quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
.org init_level4_pgt + L4_START_KERNEL*8, 0
/* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
.quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
+ .fill KAISER_USER_PGD_FILL,8,0
NEXT_PAGE(level3_ident_pgt)
.quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
@@ -385,6 +407,7 @@ NEXT_PAGE(level2_ident_pgt)
* Don't set NX because code runs from these pages.
*/
PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
+ .fill KAISER_USER_PGD_FILL,8,0
NEXT_PAGE(level2_kernel_pgt)
/*
diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c
index 43e9ccf44947..f00e6e734fbd 100644
--- a/arch/x86/kernel/init_task.c
+++ b/arch/x86/kernel/init_task.c
@@ -38,5 +38,5 @@ EXPORT_SYMBOL(init_task);
* section. Since TSS's are completely CPU-local, we want them
* on exact cacheline boundaries, to eliminate cacheline ping-pong.
*/
-DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS;
+DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, init_tss) = INIT_TSS;
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index e328f691eeef..990f743e21b8 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -85,7 +85,7 @@ static struct irqaction irq2 = {
.flags = IRQF_NO_THREAD,
};
-DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
+DEFINE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq) = {
[0 ... NR_VECTORS - 1] = -1,
};
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index 1dd32307a494..836a4c2d5ceb 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -15,6 +15,7 @@
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/uaccess.h>
+#include <linux/kaiser.h>
#include <asm/system.h>
#include <asm/ldt.h>
@@ -34,11 +35,21 @@ static void flush_ldt(void *current_mm)
set_ldt(pc->ldt->entries, pc->ldt->size);
}
+static void __free_ldt_struct(struct ldt_struct *ldt)
+{
+ if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE)
+ vfree(ldt->entries);
+ else
+ free_page((unsigned long)ldt->entries);
+ kfree(ldt);
+}
+
/* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */
static struct ldt_struct *alloc_ldt_struct(int size)
{
struct ldt_struct *new_ldt;
int alloc_size;
+ int ret;
if (size > LDT_ENTRIES)
return NULL;
@@ -66,7 +77,13 @@ static struct ldt_struct *alloc_ldt_struct(int size)
return NULL;
}
+ ret = kaiser_add_mapping((unsigned long)new_ldt->entries, alloc_size,
+ __PAGE_KERNEL);
new_ldt->size = size;
+ if (ret) {
+ __free_ldt_struct(new_ldt);
+ return NULL;
+ }
return new_ldt;
}
@@ -97,12 +114,10 @@ static void free_ldt_struct(struct ldt_struct *ldt)
if (likely(!ldt))
return;
+ kaiser_remove_mapping((unsigned long)ldt->entries,
+ ldt->size * LDT_ENTRY_SIZE);
paravirt_free_ldt(ldt->entries, ldt->size);
- if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE)
- vfree(ldt->entries);
- else
- kfree(ldt->entries);
- kfree(ldt);
+ __free_ldt_struct(ldt);
}
/*
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 557eb3757edb..d2ce2a33d15b 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -57,7 +57,7 @@
asmlinkage extern void ret_from_fork(void);
-DEFINE_PER_CPU(unsigned long, old_rsp);
+DEFINE_PER_CPU_USER_MAPPED(unsigned long, old_rsp);
static DEFINE_PER_CPU(unsigned char, is_idle);
static ATOMIC_NOTIFIER_HEAD(idle_notifier);
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index cf2a84031dfd..c9a00a5e0b87 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -29,3 +29,4 @@ obj-$(CONFIG_NUMA_EMU) += numa_emulation.o
obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o
obj-$(CONFIG_MEMTEST) += memtest.o
+obj-$(CONFIG_KAISER) += kaiser.o
diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c
new file mode 100644
index 000000000000..79b0222ffa74
--- /dev/null
+++ b/arch/x86/mm/kaiser.c
@@ -0,0 +1,382 @@
+#include <linux/bug.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/bug.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/spinlock.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/uaccess.h>
+#include <linux/ftrace.h>
+
+extern struct mm_struct init_mm;
+
+#include <asm/kaiser.h>
+#include <asm/tlbflush.h> /* to verify its kaiser declarations */
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/desc.h>
+
+#ifdef CONFIG_KAISER
+DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
+
+/*
+ * These can have bit 63 set, so we can not just use a plain "or"
+ * instruction to get their value or'd into CR3. It would take
+ * another register. So, we use a memory reference to these instead.
+ *
+ * This is also handy because systems that do not support PCIDs
+ * just end up or'ing a 0 into their CR3, which does no harm.
+ */
+unsigned long x86_cr3_pcid_noflush __read_mostly;
+DEFINE_PER_CPU(unsigned long, x86_cr3_pcid_user);
+
+/*
+ * At runtime, the only things we map are some things for CPU
+ * hotplug, and stacks for new processes. No two CPUs will ever
+ * be populating the same addresses, so we only need to ensure
+ * that we protect between two CPUs trying to allocate and
+ * populate the same page table page.
+ *
+ * Only take this lock when doing a set_p[4um]d(), but it is not
+ * needed for doing a set_pte(). We assume that only the *owner*
+ * of a given allocation will be doing this for _their_
+ * allocation.
+ *
+ * This ensures that once a system has been running for a while
+ * and there have been stacks all over and these page tables
+ * are fully populated, there will be no further acquisitions of
+ * this lock.
+ */
+static DEFINE_SPINLOCK(shadow_table_allocation_lock);
+
+/*
+ * Returns -1 on error.
+ */
+static inline unsigned long get_pa_from_mapping(unsigned long vaddr)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ pgd = pgd_offset_k(vaddr);
+ /*
+ * We made all the kernel PGDs present in kaiser_init().
+ * We expect them to stay that way.
+ */
+ BUG_ON(pgd_none(*pgd));
+ /*
+ * PGDs are either 512GB or 128TB on all x86_64
+ * configurations. We don't handle these.
+ */
+ BUG_ON(pgd_large(*pgd));
+
+ pud = pud_offset(pgd, vaddr);
+ if (pud_none(*pud)) {
+ WARN_ON_ONCE(1);
+ return -1;
+ }
+
+ if (pud_large(*pud))
+ return (pud_pfn(*pud) << PAGE_SHIFT) | (vaddr & ~PUD_PAGE_MASK);
+
+ pmd = pmd_offset(pud, vaddr);
+ if (pmd_none(*pmd)) {
+ WARN_ON_ONCE(1);
+ return -1;
+ }
+
+ if (pmd_large(*pmd))
+ return (pmd_pfn(*pmd) << PAGE_SHIFT) | (vaddr & ~PMD_PAGE_MASK);
+
+ pte = pte_offset_kernel(pmd, vaddr);
+ if (pte_none(*pte)) {
+ WARN_ON_ONCE(1);
+ return -1;
+ }
+
+ return (pte_pfn(*pte) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK);
+}
+
+/*
+ * This is a relatively normal page table walk, except that it
+ * also tries to allocate page tables pages along the way.
+ *
+ * Returns a pointer to a PTE on success, or NULL on failure.
+ */
+static pte_t *kaiser_pagetable_walk(unsigned long address)
+{
+ pmd_t *pmd;
+ pud_t *pud;
+ pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(address));
+ gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
+
+ if (pgd_none(*pgd)) {
+ WARN_ONCE(1, "All shadow pgds should have been populated");
+ return NULL;
+ }
+ BUILD_BUG_ON(pgd_large(*pgd) != 0);
+
+ pud = pud_offset(pgd, address);
+ /* The shadow page tables do not use large mappings: */
+ if (pud_large(*pud)) {
+ WARN_ON(1);
+ return NULL;
+ }
+ if (pud_none(*pud)) {
+ unsigned long new_pmd_page = __get_free_page(gfp);
+ if (!new_pmd_page)
+ return NULL;
+ spin_lock(&shadow_table_allocation_lock);
+ if (pud_none(*pud)) {
+ set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page)));
+ __inc_zone_page_state(virt_to_page((void *)
+ new_pmd_page), NR_KAISERTABLE);
+ } else
+ free_page(new_pmd_page);
+ spin_unlock(&shadow_table_allocation_lock);
+ }
+
+ pmd = pmd_offset(pud, address);
+ /* The shadow page tables do not use large mappings: */
+ if (pmd_large(*pmd)) {
+ WARN_ON(1);
+ return NULL;
+ }
+ if (pmd_none(*pmd)) {
+ unsigned long new_pte_page = __get_free_page(gfp);
+ if (!new_pte_page)
+ return NULL;
+ spin_lock(&shadow_table_allocation_lock);
+ if (pmd_none(*pmd)) {
+ set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page)));
+ __inc_zone_page_state(virt_to_page((void *)
+ new_pte_page), NR_KAISERTABLE);
+ } else
+ free_page(new_pte_page);
+ spin_unlock(&shadow_table_allocation_lock);
+ }
+
+ return pte_offset_kernel(pmd, address);
+}
+
+int kaiser_add_user_map(const void *__start_addr, unsigned long size,
+ unsigned long flags)
+{
+ int ret = 0;
+ pte_t *pte;
+ unsigned long start_addr = (unsigned long )__start_addr;
+ unsigned long address = start_addr & PAGE_MASK;
+ unsigned long end_addr = PAGE_ALIGN(start_addr + size);
+ unsigned long target_address;
+
+ for (; address < end_addr; address += PAGE_SIZE) {
+ target_address = get_pa_from_mapping(address);
+ if (target_address == -1) {
+ ret = -EIO;
+ break;
+ }
+ pte = kaiser_pagetable_walk(address);
+ if (!pte) {
+ ret = -ENOMEM;
+ break;
+ }
+ if (pte_none(*pte)) {
+ set_pte(pte, __pte(flags | target_address));
+ } else {
+ pte_t tmp;
+ set_pte(&tmp, __pte(flags | target_address));
+ WARN_ON_ONCE(!pte_same(*pte, tmp));
+ }
+ }
+ return ret;
+}
+
+static int kaiser_add_user_map_ptrs(const void *start, const void *end, unsigned long flags)
+{
+ unsigned long size = end - start;
+
+ return kaiser_add_user_map(start, size, flags);
+}
+
+/*
+ * Ensure that the top level of the (shadow) page tables are
+ * entirely populated. This ensures that all processes that get
+ * forked have the same entries. This way, we do not have to
+ * ever go set up new entries in older processes.
+ *
+ * Note: we never free these, so there are no updates to them
+ * after this.
+ */
+static void __init kaiser_init_all_pgds(void)
+{
+ pgd_t *pgd;
+ int i = 0;
+
+ pgd = native_get_shadow_pgd(pgd_offset_k((unsigned long )0));
+ for (i = PTRS_PER_PGD / 2; i < PTRS_PER_PGD; i++) {
+ pgd_t new_pgd;
+ pud_t *pud = pud_alloc_one(&init_mm,
+ PAGE_OFFSET + i * PGDIR_SIZE);
+ if (!pud) {
+ WARN_ON(1);
+ break;
+ }
+ inc_zone_page_state(virt_to_page(pud), NR_KAISERTABLE);
+ new_pgd = __pgd(_KERNPG_TABLE |__pa(pud));
+ /*
+ * Make sure not to stomp on some other pgd entry.
+ */
+ if (!pgd_none(pgd[i])) {
+ WARN_ON(1);
+ continue;
+ }
+ set_pgd(pgd + i, new_pgd);
+ }
+}
+
+#define kaiser_add_user_map_early(start, size, flags) do { \
+ int __ret = kaiser_add_user_map(start, size, flags); \
+ WARN_ON(__ret); \
+} while (0)
+
+#define kaiser_add_user_map_ptrs_early(start, end, flags) do { \
+ int __ret = kaiser_add_user_map_ptrs(start, end, flags); \
+ WARN_ON(__ret); \
+} while (0)
+
+/*
+ * If anything in here fails, we will likely die on one of the
+ * first kernel->user transitions and init will die. But, we
+ * will have most of the kernel up by then and should be able to
+ * get a clean warning out of it. If we BUG_ON() here, we run
+ * the risk of being before we have good console output.
+ */
+void __init kaiser_init(void)
+{
+ int cpu;
+
+ kaiser_init_all_pgds();
+
+ for_each_possible_cpu(cpu) {
+ void *percpu_vaddr = __per_cpu_user_mapped_start +
+ per_cpu_offset(cpu);
+ unsigned long percpu_sz = __per_cpu_user_mapped_end -
+ __per_cpu_user_mapped_start;
+ kaiser_add_user_map_early(percpu_vaddr, percpu_sz,
+ __PAGE_KERNEL);
+ }
+
+ /*
+ * Map the entry/exit text section, which is needed at
+ * switches from user to and from kernel.
+ */
+ kaiser_add_user_map_ptrs_early(__entry_text_start, __entry_text_end,
+ __PAGE_KERNEL_RX);
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ kaiser_add_user_map_ptrs_early(__irqentry_text_start,
+ __irqentry_text_end,
+ __PAGE_KERNEL_RX);
+#endif
+ kaiser_add_user_map_early((void *)idt_descr.address,
+ sizeof(gate_desc) * NR_VECTORS,
+ __PAGE_KERNEL_RO);
+ kaiser_add_user_map_early(&x86_cr3_pcid_noflush,
+ sizeof(x86_cr3_pcid_noflush),
+ __PAGE_KERNEL);
+}
+
+/* Add a mapping to the shadow mapping, and synchronize the mappings */
+int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags)
+{
+ return kaiser_add_user_map((const void *)addr, size, flags);
+}
+
+void kaiser_remove_mapping(unsigned long start, unsigned long size)
+{
+ unsigned long end = start + size;
+ unsigned long addr;
+ pte_t *pte;
+
+ for (addr = start; addr < end; addr += PAGE_SIZE) {
+ pte = kaiser_pagetable_walk(addr);
+ if (pte)
+ set_pte(pte, __pte(0));
+ }
+}
+
+/*
+ * Page table pages are page-aligned. The lower half of the top
+ * level is used for userspace and the top half for the kernel.
+ * This returns true for user pages that need to get copied into
+ * both the user and kernel copies of the page tables, and false
+ * for kernel pages that should only be in the kernel copy.
+ */
+static inline bool is_userspace_pgd(pgd_t *pgdp)
+{
+ return ((unsigned long)pgdp % PAGE_SIZE) < (PAGE_SIZE / 2);
+}
+
+pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd)
+{
+ /*
+ * Do we need to also populate the shadow pgd? Check _PAGE_USER to
+ * skip cases like kexec and EFI which make temporary low mappings.
+ */
+ if (pgd.pgd & _PAGE_USER) {
+ if (is_userspace_pgd(pgdp)) {
+ native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
+ /*
+ * Even if the entry is *mapping* userspace, ensure
+ * that userspace can not use it. This way, if we
+ * get out to userspace running on the kernel CR3,
+ * userspace will crash instead of running.
+ */
+ pgd.pgd |= _PAGE_NX;
+ }
+ } else if (!pgd.pgd) {
+ /*
+ * pgd_clear() cannot check _PAGE_USER, and is even used to
+ * clear corrupted pgd entries: so just rely on cases like
+ * kexec and EFI never to be using pgd_clear().
+ */
+ if (!WARN_ON_ONCE((unsigned long)pgdp & PAGE_SIZE) &&
+ is_userspace_pgd(pgdp))
+ native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
+ }
+ return pgd;
+}
+
+void kaiser_setup_pcid(void)
+{
+ unsigned long kern_cr3 = 0;
+ unsigned long user_cr3 = KAISER_SHADOW_PGD_OFFSET;
+
+ if (this_cpu_has(X86_FEATURE_PCID)) {
+ kern_cr3 |= X86_CR3_PCID_KERN_NOFLUSH;
+ user_cr3 |= X86_CR3_PCID_USER_NOFLUSH;
+ }
+ /*
+ * These variables are used by the entry/exit
+ * code to change PCID and pgd and TLB flushing.
+ */
+ x86_cr3_pcid_noflush = kern_cr3;
+ this_cpu_write(x86_cr3_pcid_user, user_cr3);
+}
+
+/*
+ * Make a note that this cpu will need to flush USER tlb on return to user.
+ * Caller checks whether this_cpu_has(X86_FEATURE_PCID) before calling:
+ * if cpu does not, then the NOFLUSH bit will never have been set.
+ */
+void kaiser_flush_tlb_on_return_to_user(void)
+{
+ this_cpu_write(x86_cr3_pcid_user,
+ X86_CR3_PCID_USER_FLUSH | KAISER_SHADOW_PGD_OFFSET);
+}
+EXPORT_SYMBOL(kaiser_flush_tlb_on_return_to_user);
+#endif /* CONFIG_KAISER */
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 8573b83a63d0..73285602c93f 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -5,7 +5,7 @@
#include <asm/tlb.h>
#include <asm/fixmap.h>
-#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO
+#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO)
#ifdef CONFIG_HIGHPTE
#define PGALLOC_USER_GFP __GFP_HIGHMEM
@@ -253,12 +253,35 @@ static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
}
}
+#ifdef CONFIG_KAISER
+/*
+ * Instead of one pmd, we aquire two pmds. Being order-1, it is
+ * both 8k in size and 8k-aligned. That lets us just flip bit 12
+ * in a pointer to swap between the two 4k halves.
+ */
+#define PGD_ALLOCATION_ORDER 1
+#else
+#define PGD_ALLOCATION_ORDER 0
+#endif
+
+static inline pgd_t *_pgd_alloc(void)
+{
+ /* No __GFP_REPEAT: to avoid page allocation stalls in order-1 case */
+ return (pgd_t *)__get_free_pages(PGALLOC_GFP & ~__GFP_REPEAT,
+ PGD_ALLOCATION_ORDER);
+}
+
+static inline void _pgd_free(pgd_t *pgd)
+{
+ free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER);
+}
+
pgd_t *pgd_alloc(struct mm_struct *mm)
{
pgd_t *pgd;
pmd_t *pmds[PREALLOCATED_PMDS];
- pgd = (pgd_t *)__get_free_page(PGALLOC_GFP);
+ pgd = _pgd_alloc();
if (pgd == NULL)
goto out;
@@ -288,7 +311,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
out_free_pmds:
free_pmds(pmds);
out_free_pgd:
- free_page((unsigned long)pgd);
+ _pgd_free(pgd);
out:
return NULL;
}
@@ -298,7 +321,7 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd)
pgd_mop_up_pmds(mm, pgd);
pgd_dtor(pgd);
paravirt_pgd_free(mm, pgd);
- free_page((unsigned long)pgd);
+ _pgd_free(pgd);
}
int ptep_set_access_flags(struct vm_area_struct *vma,
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 4e4e6bc09e98..bd0ff4eb74a0 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -12,10 +12,43 @@
#include <asm/cache.h>
#include <asm/apic.h>
#include <asm/uv/uv.h>
+#include <asm/kaiser.h>
DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate)
= { &init_mm, 0, };
+static void load_new_mm_cr3(pgd_t *pgdir)
+{
+ unsigned long new_mm_cr3 = __pa(pgdir);
+
+#ifdef CONFIG_KAISER
+ if (this_cpu_has(X86_FEATURE_PCID)) {
+ /*
+ * We reuse the same PCID for different tasks, so we must
+ * flush all the entries for the PCID out when we change tasks.
+ * Flush KERN below, flush USER when returning to userspace in
+ * kaiser's SWITCH_USER_CR3 (_SWITCH_TO_USER_CR3) macro.
+ *
+ * invpcid_flush_single_context(X86_CR3_PCID_ASID_USER) could
+ * do it here, but can only be used if X86_FEATURE_INVPCID is
+ * available - and many machines support pcid without invpcid.
+ *
+ * The line below is a no-op: X86_CR3_PCID_KERN_FLUSH is now 0;
+ * but keep that line in there in case something changes.
+ */
+ new_mm_cr3 |= X86_CR3_PCID_KERN_FLUSH;
+ kaiser_flush_tlb_on_return_to_user();
+ }
+#endif /* CONFIG_KAISER */
+
+ /*
+ * Caution: many callers of this function expect
+ * that load_new_mm_cr3() is serializing and orders TLB
+ * fills with respect to the mm_cpumask writes.
+ */
+ write_cr3(new_mm_cr3);
+}
+
/*
* TLB flushing, formerly SMP-only
* c/o Linus Torvalds.
@@ -65,7 +98,7 @@ void leave_mm(int cpu)
BUG();
cpumask_clear_cpu(cpu,
mm_cpumask(percpu_read(cpu_tlbstate.active_mm)));
- load_cr3(swapper_pg_dir);
+ load_new_mm_cr3(swapper_pg_dir);
}
EXPORT_SYMBOL_GPL(leave_mm);
@@ -113,11 +146,10 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
* from next->pgd. TLB fills are special and can happen
* due to instruction fetches or for no reason at all,
* and neither LOCK nor MFENCE orders them.
- * Fortunately, load_cr3() is serializing and gives the
- * ordering guarantee we need.
- *
+ * Fortunately, load_new_mm_cr3() is serializing
+ * and gives the ordering guarantee we need.
*/
- load_cr3(next->pgd);
+ load_new_mm_cr3(next->pgd);
/* stop flush ipis for the previous mm */
cpumask_clear_cpu(cpu, mm_cpumask(prev));
@@ -136,10 +168,10 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
* tlb flush IPI delivery. We must reload CR3
* to make sure to use no freed page tables.
*
- * As above, load_cr3() is serializing and orders TLB
- * fills with respect to the mm_cpumask write.
+ * As above, load_new_mm_cr3() is serializing and orders
+ * TLB fills with respect to the mm_cpumask write.
*/
- load_cr3(next->pgd);
+ load_new_mm_cr3(next->pgd);
load_mm_ldt(next);
}
}
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index b5e2e4c6b017..01c8155dd613 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -692,7 +692,14 @@
*/
#define PERCPU_INPUT(cacheline) \
VMLINUX_SYMBOL(__per_cpu_start) = .; \
+ VMLINUX_SYMBOL(__per_cpu_user_mapped_start) = .; \
*(.data..percpu..first) \
+ . = ALIGN(cacheline); \
+ *(.data..percpu..user_mapped) \
+ *(.data..percpu..user_mapped..shared_aligned) \
+ . = ALIGN(PAGE_SIZE); \
+ *(.data..percpu..user_mapped..page_aligned) \
+ VMLINUX_SYMBOL(__per_cpu_user_mapped_end) = .; \
. = ALIGN(PAGE_SIZE); \
*(.data..percpu..page_aligned) \
. = ALIGN(cacheline); \
diff --git a/include/linux/kaiser.h b/include/linux/kaiser.h
new file mode 100644
index 000000000000..4a4d6d911a14
--- /dev/null
+++ b/include/linux/kaiser.h
@@ -0,0 +1,52 @@
+#ifndef _LINUX_KAISER_H
+#define _LINUX_KAISER_H
+
+#ifdef CONFIG_KAISER
+#include <asm/kaiser.h>
+
+static inline int kaiser_map_thread_stack(void *stack)
+{
+ /*
+ * Map that page of kernel stack on which we enter from user context.
+ */
+ return kaiser_add_mapping((unsigned long)stack +
+ THREAD_SIZE - PAGE_SIZE, PAGE_SIZE, __PAGE_KERNEL);
+}
+
+static inline void kaiser_unmap_thread_stack(void *stack)
+{
+ /*
+ * Note: may be called even when kaiser_map_thread_stack() failed.
+ */
+ kaiser_remove_mapping((unsigned long)stack +
+ THREAD_SIZE - PAGE_SIZE, PAGE_SIZE);
+}
+#else
+
+/*
+ * These stubs are used whenever CONFIG_KAISER is off, which
+ * includes architectures that support KAISER, but have it disabled.
+ */
+
+static inline void kaiser_init(void)
+{
+}
+static inline int kaiser_add_mapping(unsigned long addr,
+ unsigned long size, unsigned long flags)
+{
+ return 0;
+}
+static inline void kaiser_remove_mapping(unsigned long start,
+ unsigned long size)
+{
+}
+static inline int kaiser_map_thread_stack(void *stack)
+{
+ return 0;
+}
+static inline void kaiser_unmap_thread_stack(void *stack)
+{
+}
+
+#endif /* !CONFIG_KAISER */
+#endif /* _LINUX_KAISER_H */
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 25842b6e72e1..a0b4422a116a 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -95,8 +95,9 @@ enum zone_stat_item {
NR_SLAB_RECLAIMABLE,
NR_SLAB_UNRECLAIMABLE,
NR_PAGETABLE, /* used for pagetables */
- NR_KERNEL_STACK,
/* Second 128 byte cacheline */
+ NR_KERNEL_STACK,
+ NR_KAISERTABLE,
NR_UNSTABLE_NFS, /* NFS unstable pages */
NR_BOUNCE,
NR_VMSCAN_WRITE,
diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h
index 27ef6b190ea6..56f5eeb78d1d 100644
--- a/include/linux/percpu-defs.h
+++ b/include/linux/percpu-defs.h
@@ -28,6 +28,12 @@
(void)__vpp_verify; \
} while (0)
+#ifdef CONFIG_KAISER
+#define USER_MAPPED_SECTION "..user_mapped"
+#else
+#define USER_MAPPED_SECTION ""
+#endif
+
/*
* s390 and alpha modules require percpu variables to be defined as
* weak to force the compiler to generate GOT based external
@@ -90,6 +96,12 @@
#define DEFINE_PER_CPU(type, name) \
DEFINE_PER_CPU_SECTION(type, name, "")
+#define DECLARE_PER_CPU_USER_MAPPED(type, name) \
+ DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION)
+
+#define DEFINE_PER_CPU_USER_MAPPED(type, name) \
+ DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION)
+
/*
* Declaration/definition used for per-CPU variables that must come first in
* the set of variables.
@@ -119,6 +131,14 @@
DEFINE_PER_CPU_SECTION(type, name, PER_CPU_SHARED_ALIGNED_SECTION) \
____cacheline_aligned_in_smp
+#define DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name) \
+ DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION PER_CPU_SHARED_ALIGNED_SECTION) \
+ ____cacheline_aligned_in_smp
+
+#define DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name) \
+ DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION PER_CPU_SHARED_ALIGNED_SECTION) \
+ ____cacheline_aligned_in_smp
+
#define DECLARE_PER_CPU_ALIGNED(type, name) \
DECLARE_PER_CPU_SECTION(type, name, PER_CPU_ALIGNED_SECTION) \
____cacheline_aligned
@@ -137,11 +157,21 @@
#define DEFINE_PER_CPU_PAGE_ALIGNED(type, name) \
DEFINE_PER_CPU_SECTION(type, name, "..page_aligned") \
__aligned(PAGE_SIZE)
+/*
+ * Declaration/definition used for per-CPU variables that must be page aligned and need to be mapped in user mode.
+ */
+#define DECLARE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name) \
+ DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION"..page_aligned") \
+ __aligned(PAGE_SIZE)
+
+#define DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name) \
+ DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION"..page_aligned") \
+ __aligned(PAGE_SIZE)
/*
* Declaration/definition used for per-CPU variables that must be read mostly.
*/
-#define DECLARE_PER_CPU_READ_MOSTLY(type, name) \
+#define DECLARE_PER_CPU_READ_MOSTLY(type, name) \
DECLARE_PER_CPU_SECTION(type, name, "..readmostly")
#define DEFINE_PER_CPU_READ_MOSTLY(type, name) \
diff --git a/init/main.c b/init/main.c
index e937d9bda0f8..558a9fdd566d 100644
--- a/init/main.c
+++ b/init/main.c
@@ -69,6 +69,7 @@
#include <linux/slab.h>
#include <linux/perf_event.h>
#include <linux/random.h>
+#include <linux/kaiser.h>
#include <asm/io.h>
#include <asm/bugs.h>
@@ -463,6 +464,7 @@ static void __init mm_init(void)
percpu_init_late();
pgtable_cache_init();
vmalloc_init();
+ kaiser_init();
}
asmlinkage void __init start_kernel(void)
diff --git a/kernel/fork.c b/kernel/fork.c
index 29b460431c12..511131a15a75 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -55,6 +55,7 @@
#include <linux/tsacct_kern.h>
#include <linux/cn_proc.h>
#include <linux/freezer.h>
+#include <linux/kaiser.h>
#include <linux/delayacct.h>
#include <linux/taskstats_kern.h>
#include <linux/random.h>
@@ -133,6 +134,7 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
static inline void free_thread_info(struct thread_info *ti)
{
+ kaiser_unmap_thread_stack(ti);
free_pages((unsigned long)ti, THREAD_SIZE_ORDER);
}
#endif
@@ -275,6 +277,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
tsk->stack = ti;
+ err = kaiser_map_thread_stack(tsk->stack);
+ if (err)
+ goto out;
+
setup_thread_stack(tsk, orig);
clear_user_return_notifier(tsk);
clear_tsk_need_resched(tsk);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index ff9060919c4b..eaf3db038652 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -699,6 +699,7 @@ const char * const vmstat_text[] = {
"nr_slab_unreclaimable",
"nr_page_table_pages",
"nr_kernel_stack",
+ "nr_overhead",
"nr_unstable",
"nr_bounce",
"nr_vmscan_write",
diff --git a/security/Kconfig b/security/Kconfig
index 51bd5a0b69ae..19f83193e7ab 100644
--- a/security/Kconfig
+++ b/security/Kconfig
@@ -96,6 +96,16 @@ config SECURITY
If you are unsure how to answer this question, answer N.
+config KAISER
+ bool "Remove the kernel mapping in user mode"
+ default y
+ depends on X86_64 && SMP && !PARAVIRT
+ help
+ This enforces a strict kernel and user space isolation, in order
+ to close hardware side channels on kernel address information.
+
+ If you are unsure how to answer this question, answer Y.
+
config SECURITYFS
bool "Enable the securityfs filesystem"
help