| From foo@baz Wed Jan 3 20:37:21 CET 2018 |
| From: Hugh Dickins <hughd@google.com> |
| Date: Tue, 5 Sep 2017 12:05:01 -0700 |
| Subject: kaiser: do not set _PAGE_NX on pgd_none |
| |
| From: Hugh Dickins <hughd@google.com> |
| |
| |
| native_pgd_clear() uses native_set_pgd(), so native_set_pgd() must |
| avoid setting the _PAGE_NX bit on an otherwise pgd_none() entry: |
| usually that just generated a warning on exit, but sometimes it caused |
| more mysterious and damaging failures (our production machines |
| could not complete booting). |
| |
| The original fix to this just avoided adding _PAGE_NX to |
| an empty entry; but eventually more problems surfaced with kexec, |
| and EFI mapping is expected to be a problem too. So now instead |
| change native_set_pgd() to update the shadow pgd only if _PAGE_USER is set: |
| |
| A few places (kernel/machine_kexec_64.c, platform/efi/efi_64.c for sure) |
| use set_pgd() to set up a temporary internal virtual address space, with |
| physical pages remapped at what Kaiser regards as userspace addresses: |
| Kaiser then assumes a shadow pgd follows, which it will try to corrupt. |
| |
| This appears to be responsible for the recent kexec and kdump failures; |
| though it's unclear how those did not manifest as a problem before. |
| Ah, the shadow pgd will only be assumed to "follow" if the requested |
| pgd is on an even-numbered page: so I suppose it was going wrong 50% |
| of the time all along. |
| |
| What we need is a flag to set_pgd(), to tell it we're dealing with |
| userspace. Er, isn't that what the pgd's _PAGE_USER bit is saying? |
| Add a test for that. But we cannot do the same for pgd_clear() |
| (which may be called to clear corrupted entries - set aside the |
| question of "corrupt in which pgd?" until later), so there just |
| rely on pgd_clear() not being called in the problematic cases - |
| with a WARN_ON_ONCE() which should fire half the time if it is. |
| |
| But this is getting too big for an inline function: move it into |
| arch/x86/mm/kaiser.c (which then demands a boot/compressed mod); |
| and de-void and de-space native_get_shadow/normal_pgd() while here. |
| |
| Also make an unnecessary change to KASLR's init_trampoline(): it was |
| using set_pgd() to assign a pgd-value to a global variable (not in a |
| pg directory page), which was rather scary given Kaiser's previous |
| set_pgd() implementation: not a problem now, but too scary to leave |
| as was, it could easily blow up if we have to change set_pgd() again. |
| |
| Signed-off-by: Hugh Dickins <hughd@google.com> |
| Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> |
| --- |
| arch/x86/boot/compressed/misc.h | 1 |
| arch/x86/include/asm/pgtable_64.h | 51 +++++++++----------------------------- |
| arch/x86/mm/kaiser.c | 42 +++++++++++++++++++++++++++++++ |
| arch/x86/mm/kaslr.c | 4 +- |
| 4 files changed, 58 insertions(+), 40 deletions(-) |
| |
| --- a/arch/x86/boot/compressed/misc.h |
| +++ b/arch/x86/boot/compressed/misc.h |
| @@ -9,6 +9,7 @@ |
| */ |
| #undef CONFIG_PARAVIRT |
| #undef CONFIG_PARAVIRT_SPINLOCKS |
| +#undef CONFIG_KAISER |
| #undef CONFIG_KASAN |
| |
| #include <linux/linkage.h> |
| --- a/arch/x86/include/asm/pgtable_64.h |
| +++ b/arch/x86/include/asm/pgtable_64.h |
| @@ -107,61 +107,36 @@ static inline void native_pud_clear(pud_ |
| } |
| |
| #ifdef CONFIG_KAISER |
| -static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp) |
| +extern pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd); |
| + |
| +static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp) |
| { |
| - return (pgd_t *)(void*)((unsigned long)(void*)pgdp | (unsigned long)PAGE_SIZE); |
| + return (pgd_t *)((unsigned long)pgdp | (unsigned long)PAGE_SIZE); |
| } |
| |
| -static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp) |
| +static inline pgd_t *native_get_normal_pgd(pgd_t *pgdp) |
| { |
| - return (pgd_t *)(void*)((unsigned long)(void*)pgdp & ~(unsigned long)PAGE_SIZE); |
| + return (pgd_t *)((unsigned long)pgdp & ~(unsigned long)PAGE_SIZE); |
| } |
| #else |
| -static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp) |
| +static inline pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd) |
| +{ |
| + return pgd; |
| +} |
| +static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp) |
| { |
| BUILD_BUG_ON(1); |
| return NULL; |
| } |
| -static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp) |
| +static inline pgd_t *native_get_normal_pgd(pgd_t *pgdp) |
| { |
| return pgdp; |
| } |
| #endif /* CONFIG_KAISER */ |
| |
| -/* |
| - * Page table pages are page-aligned. The lower half of the top |
| - * level is used for userspace and the top half for the kernel. |
| - * This returns true for user pages that need to get copied into |
| - * both the user and kernel copies of the page tables, and false |
| - * for kernel pages that should only be in the kernel copy. |
| - */ |
| -static inline bool is_userspace_pgd(void *__ptr) |
| -{ |
| - unsigned long ptr = (unsigned long)__ptr; |
| - |
| - return ((ptr % PAGE_SIZE) < (PAGE_SIZE / 2)); |
| -} |
| - |
| static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd) |
| { |
| -#ifdef CONFIG_KAISER |
| - pteval_t extra_kern_pgd_flags = 0; |
| - /* Do we need to also populate the shadow pgd? */ |
| - if (is_userspace_pgd(pgdp)) { |
| - native_get_shadow_pgd(pgdp)->pgd = pgd.pgd; |
| - /* |
| - * Even if the entry is *mapping* userspace, ensure |
| - * that userspace can not use it. This way, if we |
| - * get out to userspace running on the kernel CR3, |
| - * userspace will crash instead of running. |
| - */ |
| - extra_kern_pgd_flags = _PAGE_NX; |
| - } |
| - pgdp->pgd = pgd.pgd; |
| - pgdp->pgd |= extra_kern_pgd_flags; |
| -#else /* CONFIG_KAISER */ |
| - *pgdp = pgd; |
| -#endif |
| + *pgdp = kaiser_set_shadow_pgd(pgdp, pgd); |
| } |
| |
| static inline void native_pgd_clear(pgd_t *pgd) |
| --- a/arch/x86/mm/kaiser.c |
| +++ b/arch/x86/mm/kaiser.c |
| @@ -302,4 +302,46 @@ void kaiser_remove_mapping(unsigned long |
| unmap_pud_range_nofree(pgd, addr, end); |
| } |
| } |
| + |
| +/* |
| + * Page table pages are page-aligned. The lower half of the top |
| + * level is used for userspace and the top half for the kernel. |
| + * This returns true for user pages that need to get copied into |
| + * both the user and kernel copies of the page tables, and false |
| + * for kernel pages that should only be in the kernel copy. |
| + */ |
| +static inline bool is_userspace_pgd(pgd_t *pgdp) |
| +{ |
| + return ((unsigned long)pgdp % PAGE_SIZE) < (PAGE_SIZE / 2); |
| +} |
| + |
| +pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd) |
| +{ |
| + /* |
| + * Do we need to also populate the shadow pgd? Check _PAGE_USER to |
| + * skip cases like kexec and EFI which make temporary low mappings. |
| + */ |
| + if (pgd.pgd & _PAGE_USER) { |
| + if (is_userspace_pgd(pgdp)) { |
| + native_get_shadow_pgd(pgdp)->pgd = pgd.pgd; |
| + /* |
| + * Even if the entry is *mapping* userspace, ensure |
| + * that userspace can not use it. This way, if we |
| + * get out to userspace running on the kernel CR3, |
| + * userspace will crash instead of running. |
| + */ |
| + pgd.pgd |= _PAGE_NX; |
| + } |
| + } else if (!pgd.pgd) { |
| + /* |
| + * pgd_clear() cannot check _PAGE_USER, and is even used to |
| + * clear corrupted pgd entries: so just rely on cases like |
| + * kexec and EFI never to be using pgd_clear(). |
| + */ |
| + if (!WARN_ON_ONCE((unsigned long)pgdp & PAGE_SIZE) && |
| + is_userspace_pgd(pgdp)) |
| + native_get_shadow_pgd(pgdp)->pgd = pgd.pgd; |
| + } |
| + return pgd; |
| +} |
| #endif /* CONFIG_KAISER */ |
| --- a/arch/x86/mm/kaslr.c |
| +++ b/arch/x86/mm/kaslr.c |
| @@ -189,6 +189,6 @@ void __meminit init_trampoline(void) |
| *pud_tramp = *pud; |
| } |
| |
| - set_pgd(&trampoline_pgd_entry, |
| - __pgd(_KERNPG_TABLE | __pa(pud_page_tramp))); |
| + /* Avoid set_pgd(), in case it's complicated by CONFIG_KAISER */ |
| + trampoline_pgd_entry = __pgd(_KERNPG_TABLE | __pa(pud_page_tramp)); |
| } |