| From: Jeff Xu <jeffxu@chromium.org> |
| Subject: mseal: add mseal syscall |
| Date: Mon, 15 Apr 2024 16:35:21 +0000 |
| |
| mseal() is a new syscall, available only on 64-bit CPUs, with the following signature: |
| |
| int mseal(void *addr, size_t len, unsigned long flags) |
| addr/len: memory range. |
| flags: reserved. |
| |
| mseal() blocks the following operations for the given memory range |
| (a short usage sketch follows the list): |
| |
| 1> Unmapping, moving to another location, and shrinking the size, |
| via munmap() and mremap(): these can leave an empty space, which |
| can then be replaced with a VMA carrying a new set of attributes. |
| |
| 2> Moving or expanding a different VMA into the current location, |
| via mremap(). |
| |
| 3> Modifying a VMA via mmap(MAP_FIXED). |
| |
| 4> Size expansion, via mremap(), does not appear to pose any specific |
| risks to sealed VMAs. It is included anyway because the use case is |
| unclear. In any case, users can rely on merging to expand a sealed VMA. |
| |
| 5> mprotect() and pkey_mprotect(). |
| |
| 6> Some destructive madvise() behaviors (e.g. MADV_DONTNEED) for |
| anonymous memory, when users don't have write permission to the |
| memory. Those behaviors can alter region contents by discarding pages, |
| effectively a memset(0) for anonymous memory. |
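| |
| As a quick illustration of these semantics, a minimal userspace |
| sketch (not part of the patch; it assumes __NR_mseal is wired up |
| for the target 64-bit architecture and visible via sys/syscall.h): |
| |
|   #define _GNU_SOURCE |
|   #include <stdio.h> |
|   #include <sys/mman.h> |
|   #include <sys/syscall.h> |
|   #include <unistd.h> |
| |
|   int main(void) |
|   { |
|       size_t len = sysconf(_SC_PAGESIZE); |
|       char *p = mmap(NULL, len, PROT_READ | PROT_WRITE, |
|                      MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); |
| |
|       if (p == MAP_FAILED) |
|           return 1; |
| |
|       /* Seal the mapping; flags is reserved and must be 0. */ |
|       if (syscall(__NR_mseal, p, len, 0)) |
|           perror("mseal"); |
| |
|       /* Each of these now fails with EPERM. */ |
|       if (mprotect(p, len, PROT_READ)) |
|           perror("mprotect"); |
|       if (munmap(p, len)) |
|           perror("munmap"); |
| |
|       return 0; |
|   } |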
| |
| The following input from the RFC is incorporated into this patch: |
| |
| Jann Horn: raising awareness and providing valuable insights on the |
| destructive madvise operations. |
| Linus Torvalds: assisting in defining system call signature and scope. |
| Liam R. Howlett: perf optimization. |
| Theo de Raadt: sharing the experience and insights gained from |
| implementing mimmutable() in OpenBSD. |
| |
| Finally, the idea that inspired this patch comes from Stephen Röttger's |
| work in Chrome V8 CFI. |
| |
| [jeffxu@chromium.org: add branch prediction hint, per Pedro] |
| Link: https://lkml.kernel.org/r/20240423192825.1273679-2-jeffxu@chromium.org |
| Link: https://lkml.kernel.org/r/20240415163527.626541-3-jeffxu@chromium.org |
| Signed-off-by: Jeff Xu <jeffxu@chromium.org> |
| Reviewed-by: Kees Cook <keescook@chromium.org> |
| Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com> |
| Cc: Pedro Falcato <pedro.falcato@gmail.com> |
| Cc: Dave Hansen <dave.hansen@intel.com> |
| Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> |
| Cc: Guenter Roeck <groeck@chromium.org> |
| Cc: Jann Horn <jannh@google.com> |
| Cc: Jeff Xu <jeffxu@google.com> |
| Cc: Jonathan Corbet <corbet@lwn.net> |
| Cc: Jorge Lucangeli Obes <jorgelo@chromium.org> |
| Cc: Linus Torvalds <torvalds@linux-foundation.org> |
| Cc: Matthew Wilcox (Oracle) <willy@infradead.org> |
| Cc: Muhammad Usama Anjum <usama.anjum@collabora.com> |
| Cc: Stephen Röttger <sroettger@google.com> |
| Cc: Suren Baghdasaryan <surenb@google.com> |
| Cc: Amer Al Shanawany <amer.shanawany@gmail.com> |
| Cc: Javier Carrasco <javier.carrasco.cruz@gmail.com> |
| Cc: Shuah Khan <shuah@kernel.org> |
| Signed-off-by: Andrew Morton <akpm@linux-foundation.org> |
| --- |
| |
| include/linux/syscalls.h | 1 |
| mm/Makefile | 4 |
| mm/internal.h | 37 ++++ |
| mm/madvise.c | 12 + |
| mm/mmap.c | 31 +++ |
| mm/mprotect.c | 10 + |
| mm/mremap.c | 31 +++ |
| mm/mseal.c | 307 +++++++++++++++++++++++++++++++++++++ |
| 8 files changed, 432 insertions(+), 1 deletion(-) |
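| |
| A companion sketch of the madvise() rule above (same assumptions as |
| the earlier sketch): discard is refused only when the sealed mapping |
| is anonymous and not writable, because discarding would amount to a |
| memset(0) that the caller could not perform with a plain write: |
| |
|   #define _GNU_SOURCE |
|   #include <stdio.h> |
|   #include <sys/mman.h> |
|   #include <sys/syscall.h> |
|   #include <unistd.h> |
| |
|   int main(void) |
|   { |
|       long psz = sysconf(_SC_PAGESIZE); |
|       void *ro = mmap(NULL, psz, PROT_READ, |
|                       MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); |
| |
|       if (ro == MAP_FAILED || syscall(__NR_mseal, ro, psz, 0)) |
|           return 1; |
| |
|       /* Read-only + anonymous + sealed: discard is refused. */ |
|       if (madvise(ro, psz, MADV_DONTNEED)) |
|           perror("madvise");  /* EPERM */ |
| |
|       return 0; |
|   } |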
| |
| --- a/include/linux/syscalls.h~mseal-add-mseal-syscall |
| +++ a/include/linux/syscalls.h |
| @@ -821,6 +821,7 @@ asmlinkage long sys_process_mrelease(int |
| asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size, |
| unsigned long prot, unsigned long pgoff, |
| unsigned long flags); |
| +asmlinkage long sys_mseal(unsigned long start, size_t len, unsigned long flags); |
| asmlinkage long sys_mbind(unsigned long start, unsigned long len, |
| unsigned long mode, |
| const unsigned long __user *nmask, |
| --- a/mm/internal.h~mseal-add-mseal-syscall |
| +++ a/mm/internal.h |
| @@ -1435,6 +1435,43 @@ void __meminit __init_single_page(struct |
| unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg, |
| int priority); |
| |
| +#ifdef CONFIG_64BIT |
| +/* VM is sealed, in vm_flags */ |
| +#define VM_SEALED _BITUL(63) |
| +#endif |
| + |
| +#ifdef CONFIG_64BIT |
| +static inline int can_do_mseal(unsigned long flags) |
| +{ |
| + if (flags) |
| + return -EINVAL; |
| + |
| + return 0; |
| +} |
| + |
| +bool can_modify_mm(struct mm_struct *mm, unsigned long start, |
| + unsigned long end); |
| +bool can_modify_mm_madv(struct mm_struct *mm, unsigned long start, |
| + unsigned long end, int behavior); |
| +#else |
| +static inline int can_do_mseal(unsigned long flags) |
| +{ |
| + return -EPERM; |
| +} |
| + |
| +static inline bool can_modify_mm(struct mm_struct *mm, unsigned long start, |
| + unsigned long end) |
| +{ |
| + return true; |
| +} |
| + |
| +static inline bool can_modify_mm_madv(struct mm_struct *mm, unsigned long start, |
| + unsigned long end, int behavior) |
| +{ |
| + return true; |
| +} |
| +#endif |
| + |
| #ifdef CONFIG_SHRINKER_DEBUG |
| static inline __printf(2, 0) int shrinker_debugfs_name_alloc( |
| struct shrinker *shrinker, const char *fmt, va_list ap) |
| --- a/mm/madvise.c~mseal-add-mseal-syscall |
| +++ a/mm/madvise.c |
| @@ -1401,6 +1401,7 @@ int madvise_set_anon_name(struct mm_stru |
| * -EIO - an I/O error occurred while paging in data. |
| * -EBADF - map exists, but area maps something that isn't a file. |
| * -EAGAIN - a kernel resource was temporarily unavailable. |
| + * -EPERM - memory is sealed. |
| */ |
| int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior) |
| { |
| @@ -1444,6 +1445,15 @@ int do_madvise(struct mm_struct *mm, uns |
| start = untagged_addr_remote(mm, start); |
| end = start + len; |
| |
| + /* |
| + * Check if the address range is sealed for do_madvise(). |
| + * can_modify_mm_madv assumes we have acquired the lock on MM. |
| + */ |
| + if (unlikely(!can_modify_mm_madv(mm, start, end, behavior))) { |
| + error = -EPERM; |
| + goto out; |
| + } |
| + |
| blk_start_plug(&plug); |
| switch (behavior) { |
| case MADV_POPULATE_READ: |
| @@ -1456,6 +1466,8 @@ int do_madvise(struct mm_struct *mm, uns |
| break; |
| } |
| blk_finish_plug(&plug); |
| + |
| +out: |
| if (write) |
| mmap_write_unlock(mm); |
| else |
| --- a/mm/Makefile~mseal-add-mseal-syscall |
| +++ a/mm/Makefile |
| @@ -43,6 +43,10 @@ ifdef CONFIG_CROSS_MEMORY_ATTACH |
| mmu-$(CONFIG_MMU) += process_vm_access.o |
| endif |
| |
| +ifdef CONFIG_64BIT |
| +mmu-$(CONFIG_MMU) += mseal.o |
| +endif |
| + |
| obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ |
| maccess.o page-writeback.o folio-compat.o \ |
| readahead.o swap.o truncate.o vmscan.o shrinker.o \ |
| --- a/mm/mmap.c~mseal-add-mseal-syscall |
| +++ a/mm/mmap.c |
| @@ -1255,6 +1255,16 @@ unsigned long do_mmap(struct file *file, |
| if (mm->map_count > sysctl_max_map_count) |
| return -ENOMEM; |
| |
| + /* |
| + * addr is returned from get_unmapped_area. |
| + * There are two cases: |
| + * 1> MAP_FIXED == false |
| + * unallocated memory, no need to check sealing. |
| + * 2> MAP_FIXED == true |
| + * sealing is checked inside mmap_region when |
| + * do_vmi_munmap is called. |
| + */ |
| + |
| if (prot == PROT_EXEC) { |
| pkey = execute_only_pkey(mm); |
| if (pkey < 0) |
| @@ -2727,6 +2737,14 @@ int do_vmi_munmap(struct vma_iterator *v |
| if (end == start) |
| return -EINVAL; |
| |
| + /* |
| + * Check if memory is sealed before arch_unmap. |
| + * Prevent unmapping a sealed VMA. |
| + * can_modify_mm assumes we have acquired the lock on MM. |
| + */ |
| + if (unlikely(!can_modify_mm(mm, start, end))) |
| + return -EPERM; |
| + |
| /* arch_unmap() might do unmaps itself. */ |
| arch_unmap(mm, start, end); |
| |
| @@ -2789,7 +2807,10 @@ unsigned long mmap_region(struct file *f |
| } |
| |
| /* Unmap any existing mapping in the area */ |
| - if (do_vmi_munmap(&vmi, mm, addr, len, uf, false)) |
| + error = do_vmi_munmap(&vmi, mm, addr, len, uf, false); |
| + if (error == -EPERM) |
| + return error; |
| + else if (error) |
| return -ENOMEM; |
| |
| /* |
| @@ -3139,6 +3160,14 @@ int do_vma_munmap(struct vma_iterator *v |
| { |
| struct mm_struct *mm = vma->vm_mm; |
| |
| + /* |
| + * Check if memory is sealed before arch_unmap. |
| + * Prevent unmapping a sealed VMA. |
| + * can_modify_mm assumes we have acquired the lock on MM. |
| + */ |
| + if (unlikely(!can_modify_mm(mm, start, end))) |
| + return -EPERM; |
| + |
| arch_unmap(mm, start, end); |
| return do_vmi_align_munmap(vmi, vma, mm, start, end, uf, unlock); |
| } |
| --- a/mm/mprotect.c~mseal-add-mseal-syscall |
| +++ a/mm/mprotect.c |
| @@ -32,6 +32,7 @@ |
| #include <linux/sched/sysctl.h> |
| #include <linux/userfaultfd_k.h> |
| #include <linux/memory-tiers.h> |
| +#include <uapi/linux/mman.h> |
| #include <asm/cacheflush.h> |
| #include <asm/mmu_context.h> |
| #include <asm/tlbflush.h> |
| @@ -744,6 +745,15 @@ static int do_mprotect_pkey(unsigned lon |
| } |
| } |
| |
| + /* |
| + * Check if memory is sealed. |
| + * can_modify_mm assumes we have acquired the lock on MM. |
| + */ |
| + if (unlikely(!can_modify_mm(current->mm, start, end))) { |
| + error = -EPERM; |
| + goto out; |
| + } |
| + |
| prev = vma_prev(&vmi); |
| if (start > vma->vm_start) |
| prev = vma; |
| --- a/mm/mremap.c~mseal-add-mseal-syscall |
| +++ a/mm/mremap.c |
| @@ -902,7 +902,25 @@ static unsigned long mremap_to(unsigned |
| if ((mm->map_count + 2) >= sysctl_max_map_count - 3) |
| return -ENOMEM; |
| |
| + /* |
| + * In mremap_to(). |
| + * Move a VMA to another location, check if src addr is sealed. |
| + * |
| + * Place can_modify_mm here because mremap_to() |
| + * does its own checking for address range, and we only |
| + * check the sealing after passing those checks. |
| + * |
| + * can_modify_mm assumes we have acquired the lock on MM. |
| + */ |
| + if (unlikely(!can_modify_mm(mm, addr, addr + old_len))) |
| + return -EPERM; |
| + |
| if (flags & MREMAP_FIXED) { |
| + /* |
| + * In mremap_to(). |
| + * VMA is moved to dst address, and munmap dst first. |
| + * do_munmap will check if dst is sealed. |
| + */ |
| ret = do_munmap(mm, new_addr, new_len, uf_unmap_early); |
| if (ret) |
| goto out; |
| @@ -1061,6 +1079,19 @@ SYSCALL_DEFINE5(mremap, unsigned long, a |
| goto out; |
| } |
| |
| + /* |
| + * Below is shrink/expand case (not mremap_to()) |
| + * Check if src address is sealed, if so, reject. |
| + * In other words, prevent shrinking or expanding a sealed VMA. |
| + * |
| + * Place can_modify_mm here so we can keep the logic related to |
| + * shrink/expand together. |
| + */ |
| + if (unlikely(!can_modify_mm(mm, addr, addr + old_len))) { |
| + ret = -EPERM; |
| + goto out; |
| + } |
| + |
| /* |
| * Always allow a shrinking remap: that just unmaps |
| * the unnecessary pages.. |
| --- /dev/null |
| +++ a/mm/mseal.c |
| @@ -0,0 +1,307 @@ |
| +// SPDX-License-Identifier: GPL-2.0 |
| +/* |
| + * Implement mseal() syscall. |
| + * |
| + * Copyright (c) 2023,2024 Google, Inc. |
| + * |
| + * Author: Jeff Xu <jeffxu@chromium.org> |
| + */ |
| + |
| +#include <linux/mempolicy.h> |
| +#include <linux/mman.h> |
| +#include <linux/mm.h> |
| +#include <linux/mm_inline.h> |
| +#include <linux/mmu_context.h> |
| +#include <linux/syscalls.h> |
| +#include <linux/sched.h> |
| +#include "internal.h" |
| + |
| +static inline bool vma_is_sealed(struct vm_area_struct *vma) |
| +{ |
| + return (vma->vm_flags & VM_SEALED); |
| +} |
| + |
| +static inline void set_vma_sealed(struct vm_area_struct *vma) |
| +{ |
| + vm_flags_set(vma, VM_SEALED); |
| +} |
| + |
| +/* |
| + * Check if a vma is sealed for modification. |
| + * Return true if modification is allowed. |
| + */ |
| +static bool can_modify_vma(struct vm_area_struct *vma) |
| +{ |
| + if (unlikely(vma_is_sealed(vma))) |
| + return false; |
| + |
| + return true; |
| +} |
| + |
| +static bool is_madv_discard(int behavior) |
| +{ |
| + return behavior == MADV_FREE || behavior == MADV_DONTNEED || |
| + behavior == MADV_DONTNEED_LOCKED || behavior == MADV_REMOVE || |
| + behavior == MADV_DONTFORK || behavior == MADV_WIPEONFORK; |
| +} |
| + |
| +static bool is_ro_anon(struct vm_area_struct *vma) |
| +{ |
| + /* check anonymous mapping. */ |
| + if (vma->vm_file || vma->vm_flags & VM_SHARED) |
| + return false; |
| + |
| + /* |
| + * check for non-writable: |
| + * PROT=RO or PKRU is not writeable. |
| + */ |
| + if (!(vma->vm_flags & VM_WRITE) || |
| + !arch_vma_access_permitted(vma, true, false, false)) |
| + return true; |
| + |
| + return false; |
| +} |
| + |
| +/* |
| + * Check if the vmas of a memory range are allowed to be modified. |
| + * The memory range can have a gap (unallocated memory). |
| + * Return true if it is allowed. |
| + */ |
| +bool can_modify_mm(struct mm_struct *mm, unsigned long start, unsigned long end) |
| +{ |
| + struct vm_area_struct *vma; |
| + |
| + VMA_ITERATOR(vmi, mm, start); |
| + |
| + /* going through each vma to check. */ |
| + for_each_vma_range(vmi, vma, end) { |
| + if (unlikely(!can_modify_vma(vma))) |
| + return false; |
| + } |
| + |
| + /* Allow by default. */ |
| + return true; |
| +} |
| + |
| +/* |
| + * Check if the vmas of a memory range are allowed to be modified by madvise. |
| + * The memory range can have a gap (unallocated memory). |
| + * Return true if it is allowed. |
| + */ |
| +bool can_modify_mm_madv(struct mm_struct *mm, unsigned long start, unsigned long end, |
| + int behavior) |
| +{ |
| + struct vm_area_struct *vma; |
| + |
| + VMA_ITERATOR(vmi, mm, start); |
| + |
| + if (!is_madv_discard(behavior)) |
| + return true; |
| + |
| + /* going through each vma to check. */ |
| + for_each_vma_range(vmi, vma, end) |
| + if (unlikely(is_ro_anon(vma) && !can_modify_vma(vma))) |
| + return false; |
| + |
| + /* Allow by default. */ |
| + return true; |
| +} |
| + |
| +static int mseal_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma, |
| + struct vm_area_struct **prev, unsigned long start, |
| + unsigned long end, vm_flags_t newflags) |
| +{ |
| + int ret = 0; |
| + vm_flags_t oldflags = vma->vm_flags; |
| + |
| + if (newflags == oldflags) |
| + goto out; |
| + |
| + vma = vma_modify_flags(vmi, *prev, vma, start, end, newflags); |
| + if (IS_ERR(vma)) { |
| + ret = PTR_ERR(vma); |
| + goto out; |
| + } |
| + |
| + set_vma_sealed(vma); |
| +out: |
| + *prev = vma; |
| + return ret; |
| +} |
| + |
| +/* |
| + * Check for do_mseal: |
| + * 1> start is part of a valid vma. |
| + * 2> end is part of a valid vma. |
| + * 3> No gap (unallocated address) between start and end. |
| + * 4> map is sealable. |
| + */ |
| +static int check_mm_seal(unsigned long start, unsigned long end) |
| +{ |
| + struct vm_area_struct *vma; |
| + unsigned long nstart = start; |
| + |
| + VMA_ITERATOR(vmi, current->mm, start); |
| + |
| + /* going through each vma to check. */ |
| + for_each_vma_range(vmi, vma, end) { |
| + if (vma->vm_start > nstart) |
| + /* unallocated memory found. */ |
| + return -ENOMEM; |
| + |
| + if (vma->vm_end >= end) |
| + return 0; |
| + |
| + nstart = vma->vm_end; |
| + } |
| + |
| + return -ENOMEM; |
| +} |
| + |
| +/* |
| + * Apply sealing. |
| + */ |
| +static int apply_mm_seal(unsigned long start, unsigned long end) |
| +{ |
| + unsigned long nstart; |
| + struct vm_area_struct *vma, *prev; |
| + |
| + VMA_ITERATOR(vmi, current->mm, start); |
| + |
| + vma = vma_iter_load(&vmi); |
| + /* |
| + * Note: check_mm_seal should have already checked the ENOMEM case, |
| + * so vma should not be NULL; same for the other ENOMEM cases. |
| + */ |
| + prev = vma_prev(&vmi); |
| + if (start > vma->vm_start) |
| + prev = vma; |
| + |
| + nstart = start; |
| + for_each_vma_range(vmi, vma, end) { |
| + int error; |
| + unsigned long tmp; |
| + vm_flags_t newflags; |
| + |
| + newflags = vma->vm_flags | VM_SEALED; |
| + tmp = vma->vm_end; |
| + if (tmp > end) |
| + tmp = end; |
| + error = mseal_fixup(&vmi, vma, &prev, nstart, tmp, newflags); |
| + if (error) |
| + return error; |
| + nstart = vma_iter_end(&vmi); |
| + } |
| + |
| + return 0; |
| +} |
| + |
| +/* |
| + * mseal(2) seals the VM's metadata from |
| + * selected syscalls. |
| + * |
| + * addr/len: VM address range. |
| + * |
| + * The address range by addr/len must meet: |
| + * start (addr) must be in a valid VMA. |
| + * end (addr + len) must be in a valid VMA. |
| + * no gap (unallocated memory) between start and end. |
| + * start (addr) must be page aligned. |
| + * |
| + * len: len will be page aligned implicitly. |
| + * |
| + * Below VMA operations are blocked after sealing. |
| + * 1> Unmapping, moving to another location, and shrinking |
| + * the size, via munmap() and mremap(), can leave an empty |
| + * space, therefore can be replaced with a VMA with a new |
| + * set of attributes. |
| + * 2> Moving or expanding a different vma into the current location, |
| + * via mremap(). |
| + * 3> Modifying a VMA via mmap(MAP_FIXED). |
| + * 4> Size expansion, via mremap(), does not appear to pose any |
| + * specific risks to sealed VMAs. It is included anyway because |
| + * the use case is unclear. In any case, users can rely on |
| + * merging to expand a sealed VMA. |
| + * 5> mprotect and pkey_mprotect. |
| + * 6> Some destructive madvise() behaviors (e.g. MADV_DONTNEED) |
| + * for anonymous memory, when users don't have write permission to the |
| + * memory. Those behaviors can alter region contents by discarding pages, |
| + * effectively a memset(0) for anonymous memory. |
| + * |
| + * flags: reserved. |
| + * |
| + * return values: |
| + * zero: success. |
| + * -EINVAL: |
| + * invalid input flags. |
| + * start address is not page aligned. |
| + * address range (start + len) overflow. |
| + * -ENOMEM: |
| + * addr is not a valid address (not allocated). |
| + * end (start + len) is not a valid address. |
| + * a gap (unallocated memory) between start and end. |
| + * -EPERM: |
| + * - On 32-bit architectures, sealing is not supported. |
| + * Note: |
| + * user can call mseal(2) multiple times; sealing an already |
| + * sealed memory range is a no-op (no error). |
| + * |
| + * unseal() is not supported. |
| + */ |
| +static int do_mseal(unsigned long start, size_t len_in, unsigned long flags) |
| +{ |
| + size_t len; |
| + int ret = 0; |
| + unsigned long end; |
| + struct mm_struct *mm = current->mm; |
| + |
| + ret = can_do_mseal(flags); |
| + if (ret) |
| + return ret; |
| + |
| + start = untagged_addr(start); |
| + if (!PAGE_ALIGNED(start)) |
| + return -EINVAL; |
| + |
| + len = PAGE_ALIGN(len_in); |
| + /* Check to see whether len was rounded up from small -ve to zero. */ |
| + if (len_in && !len) |
| + return -EINVAL; |
| + |
| + end = start + len; |
| + if (end < start) |
| + return -EINVAL; |
| + |
| + if (end == start) |
| + return 0; |
| + |
| + if (mmap_write_lock_killable(mm)) |
| + return -EINTR; |
| + |
| + /* |
| + * First pass: this helps avoid partial sealing in case of an |
| + * error in the input address range, e.g. an ENOMEM error |
| + * from an unallocated gap. |
| + */ |
| + ret = check_mm_seal(start, end); |
| + if (ret) |
| + goto out; |
| + |
| + /* |
| + * Second pass: this should succeed, unless there are errors |
| + * from vma_modify_flags, e.g. a merge/split error, or the |
| + * process reaching the max number of supported VMAs; |
| + * however, those cases should be rare. |
| + */ |
| + ret = apply_mm_seal(start, end); |
| + |
| +out: |
| + mmap_write_unlock(mm); |
| + return ret; |
| +} |
| + |
| +SYSCALL_DEFINE3(mseal, unsigned long, start, size_t, len, unsigned long, |
| + flags) |
| +{ |
| + return do_mseal(start, len, flags); |
| +} |
| _ |