| From: Hugh Dickins <hughd@google.com> |
| Date: Sun, 24 Sep 2017 16:59:49 -0700 |
| Subject: kaiser: add "nokaiser" boot option, using ALTERNATIVE |
| |
| Added "nokaiser" boot option: an early param like "noinvpcid". |
| Most places now check int kaiser_enabled (#defined 0 when not |
| CONFIG_KAISER) instead of #ifdef CONFIG_KAISER; but entry_64.S |
| and entry_64_compat.S are using the ALTERNATIVE technique, which |
| patches in the preferred instructions at runtime. That technique |
| is tied to x86 cpu features, so X86_FEATURE_KAISER is fabricated |
| ("" in its comment so "kaiser" is not magicked into /proc/cpuinfo). |
| |
| Prior to "nokaiser", Kaiser #defined _PAGE_GLOBAL 0: revert that, |
| but be careful with both _PAGE_GLOBAL and CR4.PGE: setting them when |
| nokaiser like when !CONFIG_KAISER, but not setting either when kaiser - |
| neither matters on its own, but it's hard to be sure that _PAGE_GLOBAL |
| won't get set in some obscure corner, or something adds PGE into CR4. |
| By omitting _PAGE_GLOBAL from __supported_pte_mask when kaiser_enabled, |
| all page table setup which uses pfn_pte() masks it out of the ptes. |
| |
| It's slightly shameful that the same declaration versus definition of |
| kaiser_enabled appears in not one, not two, but in three header files |
| (asm/kaiser.h, asm/pgtable.h, asm/tlbflush.h). I felt safer that way, |
| than with #including any of those in any of the others; and did not |
| feel it worth an asm/kaiser_enabled.h - kernel/cpu/common.c includes |
| them all, so we shall hear about it if they get out of synch. |
| |
| Cleanups while in the area: removed the silly #ifdef CONFIG_KAISER |
| from kaiser.c; removed the unused native_get_normal_pgd(); removed |
| the spurious reg clutter from SWITCH_*_CR3 macro stubs; corrected some |
| comments. But more interestingly, set CR4.PSE in secondary_startup_64: |
| the manual is clear that it does not matter whether it's 0 or 1 when |
| 4-level-pts are enabled, but I was distracted to find cr4 different on |
| BSP and auxiliaries - BSP alone was adding PSE, in init_memory_mapping(). |
| |
| (cherry picked from Change-Id: I8e5bec716944444359cbd19f6729311eff943e9a) |
| |
| Signed-off-by: Hugh Dickins <hughd@google.com> |
| Signed-off-by: Ben Hutchings <ben@decadent.org.uk> |
| --- |
| Documentation/kernel-parameters.txt | 2 ++ |
| arch/x86/ia32/ia32entry.S | 2 ++ |
| arch/x86/include/asm/cpufeature.h | 3 +++ |
| arch/x86/include/asm/kaiser.h | 27 ++++++++++++++++++++------- |
| arch/x86/include/asm/pgtable.h | 19 +++++++++++++------ |
| arch/x86/include/asm/pgtable_64.h | 13 ++++--------- |
| arch/x86/include/asm/pgtable_types.h | 4 ---- |
| arch/x86/include/asm/tlbflush.h | 35 +++++++++++++++++++++-------------- |
| arch/x86/kernel/cpu/common.c | 30 +++++++++++++++++++++++++++++- |
| arch/x86/kernel/entry_64.S | 6 +++++- |
| arch/x86/kernel/espfix_64.c | 3 ++- |
| arch/x86/kernel/head_64.S | 4 ++-- |
| arch/x86/mm/init.c | 2 +- |
| arch/x86/mm/init_64.c | 10 ++++++++++ |
| arch/x86/mm/kaiser.c | 26 ++++++++++++++++++++++---- |
| arch/x86/mm/pgtable.c | 8 ++------ |
| arch/x86/mm/tlb.c | 4 +--- |
| 17 files changed, 139 insertions(+), 59 deletions(-) |
| |
| diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt |
| index 4f43fd315ecd..28c23bdc1af4 100644 |
| --- a/Documentation/kernel-parameters.txt |
| +++ b/Documentation/kernel-parameters.txt |
| @@ -1803,6 +1803,8 @@ bytes respectively. Such letter suffixes can also be entirely omitted. |
| |
| nojitter [IA-64] Disables jitter checking for ITC timers. |
| |
| + nokaiser [X86-64] Disable KAISER isolation of kernel from user. |
| + |
| no-kvmclock [X86,KVM] Disable paravirtualized KVM clock driver |
| |
| no-kvmapf [X86,KVM] Disable paravirtualized asynchronous page |
| diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S |
| index 7eb0d4792800..130db0702d9f 100644 |
| --- a/arch/x86/ia32/ia32entry.S |
| +++ b/arch/x86/ia32/ia32entry.S |
| @@ -13,6 +13,8 @@ |
| #include <asm/thread_info.h> |
| #include <asm/segment.h> |
| #include <asm/pgtable_types.h> |
| +#include <asm/alternative-asm.h> |
| +#include <asm/cpufeature.h> |
| #include <asm/kaiser.h> |
| #include <asm/irqflags.h> |
| #include <linux/linkage.h> |
| diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h |
| index e65fb1220573..cb967c9db459 100644 |
| --- a/arch/x86/include/asm/cpufeature.h |
| +++ b/arch/x86/include/asm/cpufeature.h |
| @@ -178,6 +178,9 @@ |
| #define X86_FEATURE_DTHERM (7*32+ 7) /* Digital Thermal Sensor */ |
| #define X86_FEATURE_INVPCID_SINGLE (7*32+ 8) /* Effectively INVPCID && CR4.PCIDE=1 */ |
| |
| +/* Because the ALTERNATIVE scheme is for members of the X86_FEATURE club... */ |
| +#define X86_FEATURE_KAISER ( 7*32+31) /* "" CONFIG_KAISER w/o nokaiser */ |
| + |
| /* Virtualization flags: Linux defined, word 8 */ |
| #define X86_FEATURE_TPR_SHADOW (8*32+ 0) /* Intel TPR Shadow */ |
| #define X86_FEATURE_VNMI (8*32+ 1) /* Intel Virtual NMI */ |
| diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h |
| index 6f4c8ef46881..2fe06d06241c 100644 |
| --- a/arch/x86/include/asm/kaiser.h |
| +++ b/arch/x86/include/asm/kaiser.h |
| @@ -46,28 +46,33 @@ movq \reg, %cr3 |
| .endm |
| |
| .macro SWITCH_KERNEL_CR3 |
| -pushq %rax |
| +ALTERNATIVE "jmp 8f", "pushq %rax", X86_FEATURE_KAISER |
| _SWITCH_TO_KERNEL_CR3 %rax |
| popq %rax |
| +8: |
| .endm |
| |
| .macro SWITCH_USER_CR3 |
| -pushq %rax |
| +ALTERNATIVE "jmp 8f", "pushq %rax", X86_FEATURE_KAISER |
| _SWITCH_TO_USER_CR3 %rax %al |
| popq %rax |
| +8: |
| .endm |
| |
| .macro SWITCH_KERNEL_CR3_NO_STACK |
| -movq %rax, PER_CPU_VAR(unsafe_stack_register_backup) |
| +ALTERNATIVE "jmp 8f", \ |
| + __stringify(movq %rax, PER_CPU_VAR(unsafe_stack_register_backup)), \ |
| + X86_FEATURE_KAISER |
| _SWITCH_TO_KERNEL_CR3 %rax |
| movq PER_CPU_VAR(unsafe_stack_register_backup), %rax |
| +8: |
| .endm |
| |
| #else /* CONFIG_KAISER */ |
| |
| -.macro SWITCH_KERNEL_CR3 reg |
| +.macro SWITCH_KERNEL_CR3 |
| .endm |
| -.macro SWITCH_USER_CR3 reg regb |
| +.macro SWITCH_USER_CR3 |
| .endm |
| .macro SWITCH_KERNEL_CR3_NO_STACK |
| .endm |
| @@ -90,6 +95,16 @@ DECLARE_PER_CPU(unsigned long, x86_cr3_pcid_user); |
| |
| extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[]; |
| |
| +extern int kaiser_enabled; |
| +#else |
| +#define kaiser_enabled 0 |
| +#endif /* CONFIG_KAISER */ |
| + |
| +/* |
| + * Kaiser function prototypes are needed even when CONFIG_KAISER is not set, |
| + * so as to build with tests on kaiser_enabled instead of #ifdefs. |
| + */ |
| + |
| /** |
| * kaiser_add_mapping - map a virtual memory part to the shadow (user) mapping |
| * @addr: the start address of the range |
| @@ -119,8 +134,6 @@ extern void kaiser_remove_mapping(unsigned long start, unsigned long size); |
| */ |
| extern void kaiser_init(void); |
| |
| -#endif /* CONFIG_KAISER */ |
| - |
| #endif /* __ASSEMBLY */ |
| |
| #endif /* _ASM_X86_KAISER_H */ |
| diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h |
| index b1c8b8d3b02a..a467c19e64b5 100644 |
| --- a/arch/x86/include/asm/pgtable.h |
| +++ b/arch/x86/include/asm/pgtable.h |
| @@ -17,6 +17,11 @@ |
| #ifndef __ASSEMBLY__ |
| |
| #include <asm/x86_init.h> |
| +#ifdef CONFIG_KAISER |
| +extern int kaiser_enabled; |
| +#else |
| +#define kaiser_enabled 0 |
| +#endif |
| |
| /* |
| * ZERO_PAGE is a global shared page that is always zero: used |
| @@ -577,7 +582,7 @@ static inline int pgd_bad(pgd_t pgd) |
| * page table by accident; it will fault on the first |
| * instruction it tries to run. See native_set_pgd(). |
| */ |
| - if (IS_ENABLED(CONFIG_KAISER)) |
| + if (kaiser_enabled) |
| ignore_flags |= _PAGE_NX; |
| |
| return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE; |
| @@ -780,12 +785,14 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, |
| */ |
| static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count) |
| { |
| - memcpy(dst, src, count * sizeof(pgd_t)); |
| + memcpy(dst, src, count * sizeof(pgd_t)); |
| #ifdef CONFIG_KAISER |
| - /* Clone the shadow pgd part as well */ |
| - memcpy(native_get_shadow_pgd(dst), |
| - native_get_shadow_pgd(src), |
| - count * sizeof(pgd_t)); |
| + if (kaiser_enabled) { |
| + /* Clone the shadow pgd part as well */ |
| + memcpy(native_get_shadow_pgd(dst), |
| + native_get_shadow_pgd(src), |
| + count * sizeof(pgd_t)); |
| + } |
| #endif |
| } |
| |
| diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h |
| index a3bf3de9893b..a4a2b700a4e6 100644 |
| --- a/arch/x86/include/asm/pgtable_64.h |
| +++ b/arch/x86/include/asm/pgtable_64.h |
| @@ -110,13 +110,12 @@ extern pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd); |
| |
| static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp) |
| { |
| +#ifdef CONFIG_DEBUG_VM |
| + /* linux/mmdebug.h may not have been included at this point */ |
| + BUG_ON(!kaiser_enabled); |
| +#endif |
| return (pgd_t *)((unsigned long)pgdp | (unsigned long)PAGE_SIZE); |
| } |
| - |
| -static inline pgd_t *native_get_normal_pgd(pgd_t *pgdp) |
| -{ |
| - return (pgd_t *)((unsigned long)pgdp & ~(unsigned long)PAGE_SIZE); |
| -} |
| #else |
| static inline pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd) |
| { |
| @@ -126,10 +125,6 @@ static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp) |
| { |
| return NULL; |
| } |
| -static inline pgd_t *native_get_normal_pgd(pgd_t *pgdp) |
| -{ |
| - return pgdp; |
| -} |
| #endif /* CONFIG_KAISER */ |
| |
| static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd) |
| diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h |
| index 6e1315068a62..3a3c6d014696 100644 |
| --- a/arch/x86/include/asm/pgtable_types.h |
| +++ b/arch/x86/include/asm/pgtable_types.h |
| @@ -39,11 +39,7 @@ |
| #define _PAGE_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED) |
| #define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY) |
| #define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE) |
| -#ifdef CONFIG_KAISER |
| -#define _PAGE_GLOBAL (_AT(pteval_t, 0)) |
| -#else |
| #define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) |
| -#endif |
| #define _PAGE_UNUSED1 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED1) |
| #define _PAGE_IOMAP (_AT(pteval_t, 1) << _PAGE_BIT_IOMAP) |
| #define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT) |
| diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h |
| index 288195901c8a..616eca18ed31 100644 |
| --- a/arch/x86/include/asm/tlbflush.h |
| +++ b/arch/x86/include/asm/tlbflush.h |
| @@ -69,9 +69,11 @@ static inline void invpcid_flush_all_nonglobals(void) |
| * to avoid the need for asm/kaiser.h in unexpected places. |
| */ |
| #ifdef CONFIG_KAISER |
| +extern int kaiser_enabled; |
| extern void kaiser_setup_pcid(void); |
| extern void kaiser_flush_tlb_on_return_to_user(void); |
| #else |
| +#define kaiser_enabled 0 |
| static inline void kaiser_setup_pcid(void) |
| { |
| } |
| @@ -96,7 +98,7 @@ static inline void __native_flush_tlb(void) |
| * back: |
| */ |
| preempt_disable(); |
| - if (this_cpu_has(X86_FEATURE_PCID)) |
| + if (kaiser_enabled && this_cpu_has(X86_FEATURE_PCID)) |
| kaiser_flush_tlb_on_return_to_user(); |
| native_write_cr3(native_read_cr3()); |
| preempt_enable(); |
| @@ -104,13 +106,15 @@ static inline void __native_flush_tlb(void) |
| |
| static inline void __native_flush_tlb_global(void) |
| { |
| -#ifdef CONFIG_KAISER |
| - /* Globals are not used at all */ |
| - __native_flush_tlb(); |
| -#else |
| unsigned long flags; |
| unsigned long cr4; |
| |
| + if (kaiser_enabled) { |
| + /* Globals are not used at all */ |
| + __native_flush_tlb(); |
| + return; |
| + } |
| + |
| if (this_cpu_has(X86_FEATURE_INVPCID)) { |
| /* |
| * Using INVPCID is considerably faster than a pair of writes |
| @@ -130,13 +134,16 @@ static inline void __native_flush_tlb_global(void) |
| raw_local_irq_save(flags); |
| |
| cr4 = native_read_cr4(); |
| - /* clear PGE */ |
| - native_write_cr4(cr4 & ~X86_CR4_PGE); |
| - /* write old PGE again and flush TLBs */ |
| - native_write_cr4(cr4); |
| + if (cr4 & X86_CR4_PGE) { |
| + /* clear PGE and flush TLB of all entries */ |
| + native_write_cr4(cr4 & ~X86_CR4_PGE); |
| + /* restore PGE as it was before */ |
| + native_write_cr4(cr4); |
| + } else { |
| + native_write_cr3(native_read_cr3()); |
| + } |
| |
| raw_local_irq_restore(flags); |
| -#endif |
| } |
| |
| static inline void __native_flush_tlb_single(unsigned long addr) |
| @@ -151,7 +158,7 @@ static inline void __native_flush_tlb_single(unsigned long addr) |
| */ |
| |
| if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE)) { |
| - if (this_cpu_has(X86_FEATURE_PCID)) |
| + if (kaiser_enabled && this_cpu_has(X86_FEATURE_PCID)) |
| kaiser_flush_tlb_on_return_to_user(); |
| asm volatile("invlpg (%0)" ::"r" (addr) : "memory"); |
| return; |
| @@ -166,9 +173,9 @@ static inline void __native_flush_tlb_single(unsigned long addr) |
| * Make sure to do only a single invpcid when KAISER is |
| * disabled and we have only a single ASID. |
| */ |
| - if (X86_CR3_PCID_ASID_KERN != X86_CR3_PCID_ASID_USER) |
| - invpcid_flush_one(X86_CR3_PCID_ASID_KERN, addr); |
| - invpcid_flush_one(X86_CR3_PCID_ASID_USER, addr); |
| + if (kaiser_enabled) |
| + invpcid_flush_one(X86_CR3_PCID_ASID_USER, addr); |
| + invpcid_flush_one(X86_CR3_PCID_ASID_KERN, addr); |
| } |
| |
| static inline void __flush_tlb_all(void) |
| diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c |
| index b567c89fc628..dbad6b1ee60e 100644 |
| --- a/arch/x86/kernel/cpu/common.c |
| +++ b/arch/x86/kernel/cpu/common.c |
| @@ -171,6 +171,20 @@ static int __init x86_pcid_setup(char *s) |
| return 1; |
| } |
| __setup("nopcid", x86_pcid_setup); |
| + |
| +static int __init x86_nokaiser_setup(char *s) |
| +{ |
| + /* nokaiser doesn't accept parameters */ |
| + if (s) |
| + return -EINVAL; |
| +#ifdef CONFIG_KAISER |
| + kaiser_enabled = 0; |
| + setup_clear_cpu_cap(X86_FEATURE_KAISER); |
| + pr_info("nokaiser: KAISER feature disabled\n"); |
| +#endif |
| + return 0; |
| +} |
| +early_param("nokaiser", x86_nokaiser_setup); |
| #endif |
| |
| static int __init x86_noinvpcid_setup(char *s) |
| @@ -313,7 +327,8 @@ static __cpuinit void setup_smep(struct cpuinfo_x86 *c) |
| static void setup_pcid(struct cpuinfo_x86 *c) |
| { |
| if (cpu_has(c, X86_FEATURE_PCID)) { |
| - if (cpu_has(c, X86_FEATURE_PGE) && IS_ENABLED(CONFIG_X86_64)) { |
| + if (IS_ENABLED(CONFIG_X86_64) && |
| + (cpu_has(c, X86_FEATURE_PGE) || kaiser_enabled)) { |
| /* |
| * Regardless of whether PCID is enumerated, the |
| * SDM says that it can't be enabled in 32-bit mode. |
| @@ -680,6 +695,10 @@ void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c) |
| c->x86_power = cpuid_edx(0x80000007); |
| |
| init_scattered_cpuid_features(c); |
| +#ifdef CONFIG_KAISER |
| + if (kaiser_enabled) |
| + set_cpu_cap(c, X86_FEATURE_KAISER); |
| +#endif |
| } |
| |
| static void __cpuinit identify_cpu_without_cpuid(struct cpuinfo_x86 *c) |
| @@ -1229,6 +1248,15 @@ void __cpuinit cpu_init(void) |
| int cpu; |
| int i; |
| |
| + if (!kaiser_enabled) { |
| + /* |
| + * secondary_startup_64() deferred setting PGE in cr4: |
| + * init_memory_mapping() sets it on the boot cpu, |
| + * but it needs to be set on each secondary cpu. |
| + */ |
| + set_in_cr4(X86_CR4_PGE); |
| + } |
| + |
| cpu = stack_smp_processor_id(); |
| t = &per_cpu(init_tss, cpu); |
| oist = &per_cpu(orig_ist, cpu); |
| diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S |
| index 3a4356a2f156..eb6c31b6069c 100644 |
| --- a/arch/x86/kernel/entry_64.S |
| +++ b/arch/x86/kernel/entry_64.S |
| @@ -56,6 +56,8 @@ |
| #include <asm/ftrace.h> |
| #include <asm/percpu.h> |
| #include <asm/pgtable_types.h> |
| +#include <asm/alternative-asm.h> |
| +#include <asm/cpufeature.h> |
| #include <asm/kaiser.h> |
| |
| /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */ |
| @@ -404,7 +406,7 @@ ENTRY(save_paranoid) |
| * unconditionally, but we need to find out whether the reverse |
| * should be done on return (conveyed to paranoid_exit in %ebx). |
| */ |
| - movq %cr3, %rax |
| + ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER |
| testl $KAISER_SHADOW_PGD_OFFSET, %eax |
| jz 2f |
| orl $2, %ebx |
| @@ -1428,6 +1430,7 @@ ENTRY(paranoid_exit) |
| movq %r12, %rbx /* restore after paranoid_userspace */ |
| TRACE_IRQS_IRETQ 0 |
| #ifdef CONFIG_KAISER |
| + /* No ALTERNATIVE for X86_FEATURE_KAISER: save_paranoid sets %ebx */ |
| testl $2, %ebx /* SWITCH_USER_CR3 needed? */ |
| jz paranoid_exit_no_switch |
| SWITCH_USER_CR3 |
| @@ -1597,6 +1600,7 @@ ENTRY(nmi) |
| nmi_kernel: |
| movq %r12, %rbx /* restore after nmi_userspace */ |
| #ifdef CONFIG_KAISER |
| + /* No ALTERNATIVE for X86_FEATURE_KAISER: save_paranoid sets %ebx */ |
| testl $2, %ebx /* SWITCH_USER_CR3 needed? */ |
| jz nmi_exit_no_switch |
| SWITCH_USER_CR3 |
| diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c |
| index 14cd73b0e634..a1944faa739c 100644 |
| --- a/arch/x86/kernel/espfix_64.c |
| +++ b/arch/x86/kernel/espfix_64.c |
| @@ -135,9 +135,10 @@ void __init init_espfix_bsp(void) |
| * area to ensure it is mapped into the shadow user page |
| * tables. |
| */ |
| - if (IS_ENABLED(CONFIG_KAISER)) |
| + if (kaiser_enabled) { |
| set_pgd(native_get_shadow_pgd(pgd_p), |
| __pgd(_KERNPG_TABLE | __pa((pud_t *)espfix_pud_page))); |
| + } |
| |
| /* Randomize the locations */ |
| init_espfix_random(); |
| diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S |
| index 6e697ac3fb54..28aef29c42de 100644 |
| --- a/arch/x86/kernel/head_64.S |
| +++ b/arch/x86/kernel/head_64.S |
| @@ -166,8 +166,8 @@ ENTRY(secondary_startup_64) |
| /* Sanitize CPU configuration */ |
| call verify_cpu |
| |
| - /* Enable PAE mode and PGE */ |
| - movl $(X86_CR4_PAE | X86_CR4_PGE), %eax |
| + /* Enable PAE and PSE, but defer PGE until kaiser_enabled is decided */ |
| + movl $(X86_CR4_PAE | X86_CR4_PSE), %eax |
| movq %rax, %cr4 |
| |
| /* Setup early boot stage 4 level pagetables. */ |
| diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c |
| index dba71b25a546..f897b97ab897 100644 |
| --- a/arch/x86/mm/init.c |
| +++ b/arch/x86/mm/init.c |
| @@ -161,7 +161,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, |
| set_in_cr4(X86_CR4_PSE); |
| |
| /* Enable PGE if available */ |
| - if (cpu_has_pge) { |
| + if (cpu_has_pge && !kaiser_enabled) { |
| set_in_cr4(X86_CR4_PGE); |
| __supported_pte_mask |= _PAGE_GLOBAL; |
| } |
| diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c |
| index 44b93da18401..1f1b8ed9b06f 100644 |
| --- a/arch/x86/mm/init_64.c |
| +++ b/arch/x86/mm/init_64.c |
| @@ -312,6 +312,16 @@ void __init cleanup_highmap(void) |
| continue; |
| if (vaddr < (unsigned long) _text || vaddr > end) |
| set_pmd(pmd, __pmd(0)); |
| + else if (kaiser_enabled) { |
| + /* |
| + * level2_kernel_pgt is initialized with _PAGE_GLOBAL: |
| + * clear that now. This is not important, so long as |
| + * CR4.PGE remains clear, but it removes an anomaly. |
| + * Physical mapping setup below avoids _PAGE_GLOBAL |
| + * by use of massage_pgprot() inside pfn_pte() etc. |
| + */ |
| + set_pmd(pmd, pmd_clear_flags(*pmd, _PAGE_GLOBAL)); |
| + } |
| } |
| } |
| |
| diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c |
| index fb16d79fc07a..bef8a415048b 100644 |
| --- a/arch/x86/mm/kaiser.c |
| +++ b/arch/x86/mm/kaiser.c |
| @@ -21,7 +21,9 @@ extern struct mm_struct init_mm; |
| #include <asm/pgalloc.h> |
| #include <asm/desc.h> |
| |
| -#ifdef CONFIG_KAISER |
| +int kaiser_enabled __read_mostly = 1; |
| +EXPORT_SYMBOL(kaiser_enabled); /* for inlined TLB flush functions */ |
| + |
| DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); |
| |
| /* |
| @@ -165,8 +167,8 @@ static pte_t *kaiser_pagetable_walk(unsigned long address) |
| return pte_offset_kernel(pmd, address); |
| } |
| |
| -int kaiser_add_user_map(const void *__start_addr, unsigned long size, |
| - unsigned long flags) |
| +static int kaiser_add_user_map(const void *__start_addr, unsigned long size, |
| + unsigned long flags) |
| { |
| int ret = 0; |
| pte_t *pte; |
| @@ -175,6 +177,15 @@ int kaiser_add_user_map(const void *__start_addr, unsigned long size, |
| unsigned long end_addr = PAGE_ALIGN(start_addr + size); |
| unsigned long target_address; |
| |
| + /* |
| + * It is convenient for callers to pass in __PAGE_KERNEL etc, |
| + * and there is no actual harm from setting _PAGE_GLOBAL, so |
| + * long as CR4.PGE is not set. But it is nonetheless troubling |
| + * to see Kaiser itself setting _PAGE_GLOBAL (now that "nokaiser" |
| + * requires that not to be #defined to 0): so mask it off here. |
| + */ |
| + flags &= ~_PAGE_GLOBAL; |
| + |
| if (flags & _PAGE_USER) |
| BUG_ON(address < FIXADDR_START || end_addr >= FIXADDR_TOP); |
| |
| @@ -264,6 +275,8 @@ void __init kaiser_init(void) |
| { |
| int cpu; |
| |
| + if (!kaiser_enabled) |
| + return; |
| kaiser_init_all_pgds(); |
| |
| for_each_possible_cpu(cpu) { |
| @@ -303,6 +316,8 @@ void __init kaiser_init(void) |
| /* Add a mapping to the shadow mapping, and synchronize the mappings */ |
| int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags) |
| { |
| + if (!kaiser_enabled) |
| + return 0; |
| return kaiser_add_user_map((const void *)addr, size, flags); |
| } |
| |
| @@ -312,6 +327,8 @@ void kaiser_remove_mapping(unsigned long start, unsigned long size) |
| unsigned long addr; |
| pte_t *pte; |
| |
| + if (!kaiser_enabled) |
| + return; |
| for (addr = start; addr < end; addr += PAGE_SIZE) { |
| pte = kaiser_pagetable_walk(addr); |
| if (pte) |
| @@ -333,6 +350,8 @@ static inline bool is_userspace_pgd(pgd_t *pgdp) |
| |
| pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd) |
| { |
| + if (!kaiser_enabled) |
| + return pgd; |
| /* |
| * Do we need to also populate the shadow pgd? Check _PAGE_USER to |
| * skip cases like kexec and EFI which make temporary low mappings. |
| @@ -389,4 +408,3 @@ void kaiser_flush_tlb_on_return_to_user(void) |
| X86_CR3_PCID_USER_FLUSH | KAISER_SHADOW_PGD_OFFSET); |
| } |
| EXPORT_SYMBOL(kaiser_flush_tlb_on_return_to_user); |
| -#endif /* CONFIG_KAISER */ |
| diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c |
| index 73285602c93f..0b246ef0fa48 100644 |
| --- a/arch/x86/mm/pgtable.c |
| +++ b/arch/x86/mm/pgtable.c |
| @@ -253,16 +253,12 @@ static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[]) |
| } |
| } |
| |
| -#ifdef CONFIG_KAISER |
| /* |
| - * Instead of one pmd, we aquire two pmds. Being order-1, it is |
| + * Instead of one pgd, Kaiser acquires two pgds. Being order-1, it is |
| * both 8k in size and 8k-aligned. That lets us just flip bit 12 |
| * in a pointer to swap between the two 4k halves. |
| */ |
| -#define PGD_ALLOCATION_ORDER 1 |
| -#else |
| -#define PGD_ALLOCATION_ORDER 0 |
| -#endif |
| +#define PGD_ALLOCATION_ORDER kaiser_enabled |
| |
| static inline pgd_t *_pgd_alloc(void) |
| { |
| diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c |
| index bd0ff4eb74a0..34515ec300b8 100644 |
| --- a/arch/x86/mm/tlb.c |
| +++ b/arch/x86/mm/tlb.c |
| @@ -21,8 +21,7 @@ static void load_new_mm_cr3(pgd_t *pgdir) |
| { |
| unsigned long new_mm_cr3 = __pa(pgdir); |
| |
| -#ifdef CONFIG_KAISER |
| - if (this_cpu_has(X86_FEATURE_PCID)) { |
| + if (kaiser_enabled && this_cpu_has(X86_FEATURE_PCID)) { |
| /* |
| * We reuse the same PCID for different tasks, so we must |
| * flush all the entries for the PCID out when we change tasks. |
| @@ -39,7 +38,6 @@ static void load_new_mm_cr3(pgd_t *pgdir) |
| new_mm_cr3 |= X86_CR3_PCID_KERN_FLUSH; |
| kaiser_flush_tlb_on_return_to_user(); |
| } |
| -#endif /* CONFIG_KAISER */ |
| |
| /* |
| * Caution: many callers of this function expect |
| |