From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Subject: mm: abstract initial stack setup to mm subsystem
Date: Mon, 28 Apr 2025 16:28:15 +0100

There are peculiarities within the kernel where code that very clearly
belongs to the mm subsystem is, arbitrarily, performed elsewhere.

This violates separation of concerns and makes it harder to refactor the
fundamental initialisation and operation of mm logic.

One such case is the creation of the VMA containing the initial stack upon
execve()'ing a new process. This is currently performed in
__bprm_mm_init() in fs/exec.c.

Abstract this operation to create_init_stack_vma(). This allows us to
limit use of vma allocation and free code to fork and mm only.

We previously did the same for the step at which we relocate the initial
stack VMA downwards via relocate_vma_down(); now we move the initial VMA
establishment too.

Take the opportunity to also move insert_vm_struct() to mm/vma.c as it's
no longer needed anywhere outside of mm.
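
To illustrate the new interface: on an MMU system a caller is expected to
use the helper roughly as the updated bprm_mm_init() does. A minimal
sketch (error handling abbreviated; 'mm' stands in here for the new
process's mm_struct, bprm->mm at the real call site):

	struct vm_area_struct *vma;
	unsigned long p;
	int err;

	err = create_init_stack_vma(mm, &vma, &p);
	if (err)
		return err;
	/*
	 * vma is now the one-page stack VMA ending at STACK_TOP_MAX, and p
	 * is the topmost usable address, vma->vm_end - sizeof(void *).
	 */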

Link: https://lkml.kernel.org/r/118c950ef7a8dd19ab20a23a68c3603751acd30e.1745853549.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Suren Baghdasaryan <surenb@google.com>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Reviewed-by: Pedro Falcato <pfalcato@suse.de>
Reviewed-by: Kees Cook <kees@kernel.org>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---

 fs/exec.c                        |   66 ++-------------------------
 mm/mmap.c                        |   42 -----------------
 mm/vma.c                         |   43 ++++++++++++++++++
 mm/vma.h                         |    4 +
 mm/vma_exec.c                    |   69 +++++++++++++++++++++++++++++
 tools/testing/vma/vma_internal.h |   32 +++++++++++++
 6 files changed, 153 insertions(+), 103 deletions(-)

--- a/fs/exec.c~mm-abstract-initial-stack-setup-to-mm-subsystem
+++ a/fs/exec.c
@@ -245,60 +245,6 @@ static void flush_arg_page(struct linux_
 	flush_cache_page(bprm->vma, pos, page_to_pfn(page));
 }
 
-static int __bprm_mm_init(struct linux_binprm *bprm)
-{
-	int err;
-	struct vm_area_struct *vma = NULL;
-	struct mm_struct *mm = bprm->mm;
-
-	bprm->vma = vma = vm_area_alloc(mm);
-	if (!vma)
-		return -ENOMEM;
-	vma_set_anonymous(vma);
-
-	if (mmap_write_lock_killable(mm)) {
-		err = -EINTR;
-		goto err_free;
-	}
-
-	/*
-	 * Need to be called with mmap write lock
-	 * held, to avoid race with ksmd.
-	 */
-	err = ksm_execve(mm);
-	if (err)
-		goto err_ksm;
-
-	/*
-	 * Place the stack at the largest stack address the architecture
-	 * supports. Later, we'll move this to an appropriate place. We don't
-	 * use STACK_TOP because that can depend on attributes which aren't
-	 * configured yet.
-	 */
-	BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP);
-	vma->vm_end = STACK_TOP_MAX;
-	vma->vm_start = vma->vm_end - PAGE_SIZE;
-	vm_flags_init(vma, VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP);
-	vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
-
-	err = insert_vm_struct(mm, vma);
-	if (err)
-		goto err;
-
-	mm->stack_vm = mm->total_vm = 1;
-	mmap_write_unlock(mm);
-	bprm->p = vma->vm_end - sizeof(void *);
-	return 0;
-err:
-	ksm_exit(mm);
-err_ksm:
-	mmap_write_unlock(mm);
-err_free:
-	bprm->vma = NULL;
-	vm_area_free(vma);
-	return err;
-}
-
 static bool valid_arg_len(struct linux_binprm *bprm, long len)
 {
 	return len <= MAX_ARG_STRLEN;
@@ -351,12 +297,6 @@ static void flush_arg_page(struct linux_
 {
 }
 
-static int __bprm_mm_init(struct linux_binprm *bprm)
-{
-	bprm->p = PAGE_SIZE * MAX_ARG_PAGES - sizeof(void *);
-	return 0;
-}
-
 static bool valid_arg_len(struct linux_binprm *bprm, long len)
 {
 	return len <= bprm->p;
@@ -385,9 +325,13 @@ static int bprm_mm_init(struct linux_bin
 	bprm->rlim_stack = current->signal->rlim[RLIMIT_STACK];
 	task_unlock(current->group_leader);
 
-	err = __bprm_mm_init(bprm);
+#ifndef CONFIG_MMU
+	bprm->p = PAGE_SIZE * MAX_ARG_PAGES - sizeof(void *);
+#else
+	err = create_init_stack_vma(bprm->mm, &bprm->vma, &bprm->p);
 	if (err)
 		goto err;
+#endif
 
 	return 0;
 
--- a/mm/mmap.c~mm-abstract-initial-stack-setup-to-mm-subsystem
+++ a/mm/mmap.c
@@ -1321,48 +1321,6 @@ destroy:
 	vm_unacct_memory(nr_accounted);
 }
 
-/* Insert vm structure into process list sorted by address
- * and into the inode's i_mmap tree. If vm_file is non-NULL
- * then i_mmap_rwsem is taken here.
- */
-int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
-{
-	unsigned long charged = vma_pages(vma);
-
-
-	if (find_vma_intersection(mm, vma->vm_start, vma->vm_end))
-		return -ENOMEM;
-
-	if ((vma->vm_flags & VM_ACCOUNT) &&
-	     security_vm_enough_memory_mm(mm, charged))
-		return -ENOMEM;
-
-	/*
-	 * The vm_pgoff of a purely anonymous vma should be irrelevant
-	 * until its first write fault, when page's anon_vma and index
-	 * are set. But now set the vm_pgoff it will almost certainly
-	 * end up with (unless mremap moves it elsewhere before that
-	 * first wfault), so /proc/pid/maps tells a consistent story.
-	 *
-	 * By setting it to reflect the virtual start address of the
-	 * vma, merges and splits can happen in a seamless way, just
-	 * using the existing file pgoff checks and manipulations.
-	 * Similarly in do_mmap and in do_brk_flags.
-	 */
-	if (vma_is_anonymous(vma)) {
-		BUG_ON(vma->anon_vma);
-		vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
-	}
-
-	if (vma_link(mm, vma)) {
-		if (vma->vm_flags & VM_ACCOUNT)
-			vm_unacct_memory(charged);
-		return -ENOMEM;
-	}
-
-	return 0;
-}
-
 /*
  * Return true if the calling process may expand its vm space by the passed
  * number of pages
--- a/mm/vma.c~mm-abstract-initial-stack-setup-to-mm-subsystem
+++ a/mm/vma.c
@@ -3052,3 +3052,46 @@ int __vm_munmap(unsigned long start, siz
 	userfaultfd_unmap_complete(mm, &uf);
 	return ret;
 }
+
+
+/* Insert vm structure into process list sorted by address
+ * and into the inode's i_mmap tree. If vm_file is non-NULL
+ * then i_mmap_rwsem is taken here.
+ */
+int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
+{
+	unsigned long charged = vma_pages(vma);
+
+
+	if (find_vma_intersection(mm, vma->vm_start, vma->vm_end))
+		return -ENOMEM;
+
+	if ((vma->vm_flags & VM_ACCOUNT) &&
+	     security_vm_enough_memory_mm(mm, charged))
+		return -ENOMEM;
+
+	/*
+	 * The vm_pgoff of a purely anonymous vma should be irrelevant
+	 * until its first write fault, when page's anon_vma and index
+	 * are set. But now set the vm_pgoff it will almost certainly
+	 * end up with (unless mremap moves it elsewhere before that
+	 * first wfault), so /proc/pid/maps tells a consistent story.
+	 *
+	 * By setting it to reflect the virtual start address of the
+	 * vma, merges and splits can happen in a seamless way, just
+	 * using the existing file pgoff checks and manipulations.
+	 * Similarly in do_mmap and in do_brk_flags.
+	 */
+	if (vma_is_anonymous(vma)) {
+		BUG_ON(vma->anon_vma);
+		vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
+	}
+
+	if (vma_link(mm, vma)) {
+		if (vma->vm_flags & VM_ACCOUNT)
+			vm_unacct_memory(charged);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
--- a/mm/vma_exec.c~mm-abstract-initial-stack-setup-to-mm-subsystem
+++ a/mm/vma_exec.c
@@ -90,3 +90,72 @@ int relocate_vma_down(struct vm_area_str
 	/* Shrink the vma to just the new range */
 	return vma_shrink(&vmi, vma, new_start, new_end, vma->vm_pgoff);
 }
+
+/*
+ * Establish the stack VMA in an execve'd process, located temporarily at the
+ * maximum stack address provided by the architecture.
+ *
+ * We later relocate this downwards in relocate_vma_down().
+ *
+ * This function is almost certainly NOT what you want for anything other than
+ * early executable initialisation.
+ *
+ * On success, returns 0 and sets *vmap to the stack VMA and *top_mem_p to the
+ * maximum addressable location in the stack (that is capable of storing a
+ * system word of data).
+ */
+int create_init_stack_vma(struct mm_struct *mm, struct vm_area_struct **vmap,
+			  unsigned long *top_mem_p)
+{
+	int err;
+	struct vm_area_struct *vma = vm_area_alloc(mm);
+
+	if (!vma)
+		return -ENOMEM;
+
+	vma_set_anonymous(vma);
+
+	if (mmap_write_lock_killable(mm)) {
+		err = -EINTR;
+		goto err_free;
+	}
+
+	/*
+	 * Need to be called with mmap write lock
+	 * held, to avoid race with ksmd.
+	 */
+	err = ksm_execve(mm);
+	if (err)
+		goto err_ksm;
+
+	/*
+	 * Place the stack at the largest stack address the architecture
+	 * supports. Later, we'll move this to an appropriate place. We don't
+	 * use STACK_TOP because that can depend on attributes which aren't
+	 * configured yet.
+	 */
+	BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP);
+	vma->vm_end = STACK_TOP_MAX;
+	vma->vm_start = vma->vm_end - PAGE_SIZE;
+	vm_flags_init(vma, VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP);
+	vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
+
+	err = insert_vm_struct(mm, vma);
+	if (err)
+		goto err;
+
+	mm->stack_vm = mm->total_vm = 1;
+	mmap_write_unlock(mm);
+	*vmap = vma;
+	*top_mem_p = vma->vm_end - sizeof(void *);
+	return 0;
+
+err:
+	ksm_exit(mm);
+err_ksm:
+	mmap_write_unlock(mm);
+err_free:
+	*vmap = NULL;
+	vm_area_free(vma);
+	return err;
+}
--- a/mm/vma.h~mm-abstract-initial-stack-setup-to-mm-subsystem
+++ a/mm/vma.h
@@ -548,8 +548,12 @@ int expand_downwards(struct vm_area_stru
 
 int __vm_munmap(unsigned long start, size_t len, bool unlock);
 
+int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma);
+
 /* vma_exec.c */
 #ifdef CONFIG_MMU
+int create_init_stack_vma(struct mm_struct *mm, struct vm_area_struct **vmap,
+			  unsigned long *top_mem_p);
 int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift);
 #endif
 
--- a/tools/testing/vma/vma_internal.h~mm-abstract-initial-stack-setup-to-mm-subsystem
+++ a/tools/testing/vma/vma_internal.h
@@ -56,6 +56,8 @@ extern unsigned long dac_mmap_min_addr;
 #define VM_PFNMAP	0x00000400
 #define VM_LOCKED	0x00002000
 #define VM_IO		0x00004000
+#define VM_SEQ_READ	0x00008000	/* App will access data sequentially */
+#define VM_RAND_READ	0x00010000	/* App will not benefit from clustered reads */
 #define VM_DONTEXPAND	0x00040000
 #define VM_LOCKONFAULT	0x00080000
 #define VM_ACCOUNT	0x00100000
@@ -70,6 +72,20 @@ extern unsigned long dac_mmap_min_addr;
 #define VM_ACCESS_FLAGS (VM_READ | VM_WRITE | VM_EXEC)
 #define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP)
 
+#ifdef CONFIG_STACK_GROWSUP
+#define VM_STACK	VM_GROWSUP
+#define VM_STACK_EARLY	VM_GROWSDOWN
+#else
+#define VM_STACK	VM_GROWSDOWN
+#define VM_STACK_EARLY	0
+#endif
+
+#define DEFAULT_MAP_WINDOW	((1UL << 47) - PAGE_SIZE)
+#define TASK_SIZE_LOW		DEFAULT_MAP_WINDOW
+#define TASK_SIZE_MAX		DEFAULT_MAP_WINDOW
+#define STACK_TOP		TASK_SIZE_LOW
+#define STACK_TOP_MAX		TASK_SIZE_MAX
+
 /* This mask represents all the VMA flag bits used by mlock */
 #define VM_LOCKED_MASK	(VM_LOCKED | VM_LOCKONFAULT)
 
@@ -82,6 +98,10 @@ extern unsigned long dac_mmap_min_addr;
 
 #define VM_STARTGAP_FLAGS (VM_GROWSDOWN | VM_SHADOW_STACK)
 
+#define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS
+#define VM_STACK_FLAGS	(VM_STACK | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT)
+#define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ | VM_STACK_EARLY)
+
 #define RLIMIT_STACK		3	/* max stack size */
 #define RLIMIT_MEMLOCK		8	/* max locked-in-memory address space */
 
@@ -1280,4 +1300,16 @@ static inline void free_pgd_range(struct
 	(void)ceiling;
 }
 
+static inline int ksm_execve(struct mm_struct *mm)
+{
+	(void)mm;
+
+	return 0;
+}
+
+static inline void ksm_exit(struct mm_struct *mm)
+{
+	(void)mm;
+}
+
 #endif /* __MM_VMA_INTERNAL_H */
_