| From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> |
| Subject: mm: move dup_mmap() to mm |
| Date: Mon, 28 Apr 2025 16:28:16 +0100 |
| |
| This is a key step towards being able to abstract and isolate the VMA |
| allocation and destruction logic. |
| |
| This function is the last place where vm_area_free() and vm_area_dup() |
| are directly referenced outside of mm, so moving it into mm allows us to |
| isolate these helpers there. |
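| |
| (For context, the VMA lifecycle helpers in question are declared in |
| include/linux/mm.h; the signatures below are a sketch for orientation |
| only, not part of this patch -- see that header for the authoritative |
| declarations:) |
| |
| 	struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig); |
| 	void vm_area_free(struct vm_area_struct *vma); |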
| |
| We do the same for the nommu version, which is substantially simpler. |
| |
| We place the declaration for dup_mmap() in mm/internal.h and have |
| kernel/fork.c include it, in order to prevent improper use of this |
| functionality elsewhere in the kernel. |
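| |
| For illustration, the resulting arrangement (mirroring the hunks below) |
| is roughly: |
| |
| 	/* mm/internal.h */ |
| 	void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm); |
| 	int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm); |
| |
| 	/* kernel/fork.c */ |
| 	/* For dup_mmap(). */ |
| 	#include "../mm/internal.h" |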
| |
| While we're here, we remove the redundant #ifdef CONFIG_MMU check around |
| mmap_read_lock_maybe_expand() in mmap.c, as mmap.c is compiled only when |
| CONFIG_MMU is set. |
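| |
| (mm/Makefile builds mmap.o only for MMU configurations, approximately |
| along these lines -- an elided sketch, not quoted from the tree: |
| |
| 	# mm/Makefile |
| 	mmu-y			:= nommu.o |
| 	mmu-$(CONFIG_MMU)	:= ... mmap.o ... |
| |
| so the nommu stub belongs in mm/nommu.c instead.) |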
| |
| Link: https://lkml.kernel.org/r/e49aad3d00212f5539d9fa5769bfda4ce451db3e.1745853549.git.lorenzo.stoakes@oracle.com |
| Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> |
| Suggested-by: Pedro Falcato <pfalcato@suse.de> |
| Reviewed-by: Pedro Falcato <pfalcato@suse.de> |
| Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com> |
| Reviewed-by: Suren Baghdasaryan <surenb@google.com> |
| Reviewed-by: David Hildenbrand <david@redhat.com> |
| Reviewed-by: Kees Cook <kees@kernel.org> |
| Reviewed-by: Vlastimil Babka <vbabka@suse.cz> |
| Cc: Al Viro <viro@zeniv.linux.org.uk> |
| Cc: Christian Brauner <brauner@kernel.org> |
| Cc: Jan Kara <jack@suse.cz> |
| Cc: Jann Horn <jannh@google.com> |
| Signed-off-by: Andrew Morton <akpm@linux-foundation.org> |
| --- |
| |
| kernel/fork.c | 189 +----------------------------------------------- |
| mm/internal.h | 2 |
| mm/mmap.c | 181 ++++++++++++++++++++++++++++++++++++++++++++- |
| mm/nommu.c | 8 ++ |
| 4 files changed, 189 insertions(+), 191 deletions(-) |
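| |
| The caller is unchanged: dup_mm() in kernel/fork.c still invokes |
| dup_mmap(), which now resolves to the mm/mmap.c (MMU) or mm/nommu.c |
| (nommu) definition via the mm/internal.h declaration. An elided sketch |
| of the unchanged call path, for orientation only: |
| |
| 	static struct mm_struct *dup_mm(struct task_struct *tsk, |
| 			struct mm_struct *oldmm) |
| 	{ |
| 		... |
| 		err = dup_mmap(mm, oldmm); /* mm/mmap.c or mm/nommu.c */ |
| 		... |
| 	} |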
| |
| --- a/kernel/fork.c~mm-move-dup_mmap-to-mm |
| +++ a/kernel/fork.c |
| @@ -112,6 +112,9 @@ |
| #include <asm/cacheflush.h> |
| #include <asm/tlbflush.h> |
| |
| +/* For dup_mmap(). */ |
| +#include "../mm/internal.h" |
| + |
| #include <trace/events/sched.h> |
| |
| #define CREATE_TRACE_POINTS |
| @@ -589,7 +592,7 @@ void free_task(struct task_struct *tsk) |
| } |
| EXPORT_SYMBOL(free_task); |
| |
| -static void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm) |
| +void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm) |
| { |
| struct file *exe_file; |
| |
| @@ -604,183 +607,6 @@ static void dup_mm_exe_file(struct mm_st |
| } |
| |
| #ifdef CONFIG_MMU |
| -static __latent_entropy int dup_mmap(struct mm_struct *mm, |
| - struct mm_struct *oldmm) |
| -{ |
| - struct vm_area_struct *mpnt, *tmp; |
| - int retval; |
| - unsigned long charge = 0; |
| - LIST_HEAD(uf); |
| - VMA_ITERATOR(vmi, mm, 0); |
| - |
| - if (mmap_write_lock_killable(oldmm)) |
| - return -EINTR; |
| - flush_cache_dup_mm(oldmm); |
| - uprobe_dup_mmap(oldmm, mm); |
| - /* |
| - * Not linked in yet - no deadlock potential: |
| - */ |
| - mmap_write_lock_nested(mm, SINGLE_DEPTH_NESTING); |
| - |
| - /* No ordering required: file already has been exposed. */ |
| - dup_mm_exe_file(mm, oldmm); |
| - |
| - mm->total_vm = oldmm->total_vm; |
| - mm->data_vm = oldmm->data_vm; |
| - mm->exec_vm = oldmm->exec_vm; |
| - mm->stack_vm = oldmm->stack_vm; |
| - |
| - /* Use __mt_dup() to efficiently build an identical maple tree. */ |
| - retval = __mt_dup(&oldmm->mm_mt, &mm->mm_mt, GFP_KERNEL); |
| - if (unlikely(retval)) |
| - goto out; |
| - |
| - mt_clear_in_rcu(vmi.mas.tree); |
| - for_each_vma(vmi, mpnt) { |
| - struct file *file; |
| - |
| - vma_start_write(mpnt); |
| - if (mpnt->vm_flags & VM_DONTCOPY) { |
| - retval = vma_iter_clear_gfp(&vmi, mpnt->vm_start, |
| - mpnt->vm_end, GFP_KERNEL); |
| - if (retval) |
| - goto loop_out; |
| - |
| - vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt)); |
| - continue; |
| - } |
| - charge = 0; |
| - /* |
| - * Don't duplicate many vmas if we've been oom-killed (for |
| - * example) |
| - */ |
| - if (fatal_signal_pending(current)) { |
| - retval = -EINTR; |
| - goto loop_out; |
| - } |
| - if (mpnt->vm_flags & VM_ACCOUNT) { |
| - unsigned long len = vma_pages(mpnt); |
| - |
| - if (security_vm_enough_memory_mm(oldmm, len)) /* sic */ |
| - goto fail_nomem; |
| - charge = len; |
| - } |
| - tmp = vm_area_dup(mpnt); |
| - if (!tmp) |
| - goto fail_nomem; |
| - |
| - /* track_pfn_copy() will later take care of copying internal state. */ |
| - if (unlikely(tmp->vm_flags & VM_PFNMAP)) |
| - untrack_pfn_clear(tmp); |
| - |
| - retval = vma_dup_policy(mpnt, tmp); |
| - if (retval) |
| - goto fail_nomem_policy; |
| - tmp->vm_mm = mm; |
| - retval = dup_userfaultfd(tmp, &uf); |
| - if (retval) |
| - goto fail_nomem_anon_vma_fork; |
| - if (tmp->vm_flags & VM_WIPEONFORK) { |
| - /* |
| - * VM_WIPEONFORK gets a clean slate in the child. |
| - * Don't prepare anon_vma until fault since we don't |
| - * copy page for current vma. |
| - */ |
| - tmp->anon_vma = NULL; |
| - } else if (anon_vma_fork(tmp, mpnt)) |
| - goto fail_nomem_anon_vma_fork; |
| - vm_flags_clear(tmp, VM_LOCKED_MASK); |
| - /* |
| - * Copy/update hugetlb private vma information. |
| - */ |
| - if (is_vm_hugetlb_page(tmp)) |
| - hugetlb_dup_vma_private(tmp); |
| - |
| - /* |
| - * Link the vma into the MT. After using __mt_dup(), memory |
| - * allocation is not necessary here, so it cannot fail. |
| - */ |
| - vma_iter_bulk_store(&vmi, tmp); |
| - |
| - mm->map_count++; |
| - |
| - if (tmp->vm_ops && tmp->vm_ops->open) |
| - tmp->vm_ops->open(tmp); |
| - |
| - file = tmp->vm_file; |
| - if (file) { |
| - struct address_space *mapping = file->f_mapping; |
| - |
| - get_file(file); |
| - i_mmap_lock_write(mapping); |
| - if (vma_is_shared_maywrite(tmp)) |
| - mapping_allow_writable(mapping); |
| - flush_dcache_mmap_lock(mapping); |
| - /* insert tmp into the share list, just after mpnt */ |
| - vma_interval_tree_insert_after(tmp, mpnt, |
| - &mapping->i_mmap); |
| - flush_dcache_mmap_unlock(mapping); |
| - i_mmap_unlock_write(mapping); |
| - } |
| - |
| - if (!(tmp->vm_flags & VM_WIPEONFORK)) |
| - retval = copy_page_range(tmp, mpnt); |
| - |
| - if (retval) { |
| - mpnt = vma_next(&vmi); |
| - goto loop_out; |
| - } |
| - } |
| - /* a new mm has just been created */ |
| - retval = arch_dup_mmap(oldmm, mm); |
| -loop_out: |
| - vma_iter_free(&vmi); |
| - if (!retval) { |
| - mt_set_in_rcu(vmi.mas.tree); |
| - ksm_fork(mm, oldmm); |
| - khugepaged_fork(mm, oldmm); |
| - } else { |
| - |
| - /* |
| - * The entire maple tree has already been duplicated. If the |
| - * mmap duplication fails, mark the failure point with |
| - * XA_ZERO_ENTRY. In exit_mmap(), if this marker is encountered, |
| - * stop releasing VMAs that have not been duplicated after this |
| - * point. |
| - */ |
| - if (mpnt) { |
| - mas_set_range(&vmi.mas, mpnt->vm_start, mpnt->vm_end - 1); |
| - mas_store(&vmi.mas, XA_ZERO_ENTRY); |
| - /* Avoid OOM iterating a broken tree */ |
| - set_bit(MMF_OOM_SKIP, &mm->flags); |
| - } |
| - /* |
| - * The mm_struct is going to exit, but the locks will be dropped |
| - * first. Set the mm_struct as unstable is advisable as it is |
| - * not fully initialised. |
| - */ |
| - set_bit(MMF_UNSTABLE, &mm->flags); |
| - } |
| -out: |
| - mmap_write_unlock(mm); |
| - flush_tlb_mm(oldmm); |
| - mmap_write_unlock(oldmm); |
| - if (!retval) |
| - dup_userfaultfd_complete(&uf); |
| - else |
| - dup_userfaultfd_fail(&uf); |
| - return retval; |
| - |
| -fail_nomem_anon_vma_fork: |
| - mpol_put(vma_policy(tmp)); |
| -fail_nomem_policy: |
| - vm_area_free(tmp); |
| -fail_nomem: |
| - retval = -ENOMEM; |
| - vm_unacct_memory(charge); |
| - goto loop_out; |
| -} |
| - |
| static inline int mm_alloc_pgd(struct mm_struct *mm) |
| { |
| mm->pgd = pgd_alloc(mm); |
| @@ -794,13 +620,6 @@ static inline void mm_free_pgd(struct mm |
| pgd_free(mm, mm->pgd); |
| } |
| #else |
| -static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) |
| -{ |
| - mmap_write_lock(oldmm); |
| - dup_mm_exe_file(mm, oldmm); |
| - mmap_write_unlock(oldmm); |
| - return 0; |
| -} |
| #define mm_alloc_pgd(mm) (0) |
| #define mm_free_pgd(mm) |
| #endif /* CONFIG_MMU */ |
| --- a/mm/internal.h~mm-move-dup_mmap-to-mm |
| +++ a/mm/internal.h |
| @@ -1624,5 +1624,7 @@ static inline bool reclaim_pt_is_enabled |
| } |
| #endif /* CONFIG_PT_RECLAIM */ |
| |
| +void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm); |
| +int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm); |
| |
| #endif /* __MM_INTERNAL_H */ |
| --- a/mm/mmap.c~mm-move-dup_mmap-to-mm |
| +++ a/mm/mmap.c |
| @@ -1675,7 +1675,6 @@ static int __meminit init_reserve_notifi |
| } |
| subsys_initcall(init_reserve_notifier); |
| |
| -#ifdef CONFIG_MMU |
| /* |
| * Obtain a read lock on mm->mmap_lock, if the specified address is below the |
| * start of the VMA, the intent is to perform a write, and it is a |
| @@ -1719,10 +1718,180 @@ bool mmap_read_lock_maybe_expand(struct |
| mmap_write_downgrade(mm); |
| return true; |
| } |
| -#else |
| -bool mmap_read_lock_maybe_expand(struct mm_struct *mm, struct vm_area_struct *vma, |
| - unsigned long addr, bool write) |
| + |
| +__latent_entropy int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) |
| { |
| - return false; |
| + struct vm_area_struct *mpnt, *tmp; |
| + int retval; |
| + unsigned long charge = 0; |
| + LIST_HEAD(uf); |
| + VMA_ITERATOR(vmi, mm, 0); |
| + |
| + if (mmap_write_lock_killable(oldmm)) |
| + return -EINTR; |
| + flush_cache_dup_mm(oldmm); |
| + uprobe_dup_mmap(oldmm, mm); |
| + /* |
| + * Not linked in yet - no deadlock potential: |
| + */ |
| + mmap_write_lock_nested(mm, SINGLE_DEPTH_NESTING); |
| + |
| + /* No ordering required: file already has been exposed. */ |
| + dup_mm_exe_file(mm, oldmm); |
| + |
| + mm->total_vm = oldmm->total_vm; |
| + mm->data_vm = oldmm->data_vm; |
| + mm->exec_vm = oldmm->exec_vm; |
| + mm->stack_vm = oldmm->stack_vm; |
| + |
| + /* Use __mt_dup() to efficiently build an identical maple tree. */ |
| + retval = __mt_dup(&oldmm->mm_mt, &mm->mm_mt, GFP_KERNEL); |
| + if (unlikely(retval)) |
| + goto out; |
| + |
| + mt_clear_in_rcu(vmi.mas.tree); |
| + for_each_vma(vmi, mpnt) { |
| + struct file *file; |
| + |
| + vma_start_write(mpnt); |
| + if (mpnt->vm_flags & VM_DONTCOPY) { |
| + retval = vma_iter_clear_gfp(&vmi, mpnt->vm_start, |
| + mpnt->vm_end, GFP_KERNEL); |
| + if (retval) |
| + goto loop_out; |
| + |
| + vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt)); |
| + continue; |
| + } |
| + charge = 0; |
| + /* |
| + * Don't duplicate many vmas if we've been oom-killed (for |
| + * example) |
| + */ |
| + if (fatal_signal_pending(current)) { |
| + retval = -EINTR; |
| + goto loop_out; |
| + } |
| + if (mpnt->vm_flags & VM_ACCOUNT) { |
| + unsigned long len = vma_pages(mpnt); |
| + |
| + if (security_vm_enough_memory_mm(oldmm, len)) /* sic */ |
| + goto fail_nomem; |
| + charge = len; |
| + } |
| + |
| + tmp = vm_area_dup(mpnt); |
| + if (!tmp) |
| + goto fail_nomem; |
| + |
| + /* track_pfn_copy() will later take care of copying internal state. */ |
| + if (unlikely(tmp->vm_flags & VM_PFNMAP)) |
| + untrack_pfn_clear(tmp); |
| + |
| + retval = vma_dup_policy(mpnt, tmp); |
| + if (retval) |
| + goto fail_nomem_policy; |
| + tmp->vm_mm = mm; |
| + retval = dup_userfaultfd(tmp, &uf); |
| + if (retval) |
| + goto fail_nomem_anon_vma_fork; |
| + if (tmp->vm_flags & VM_WIPEONFORK) { |
| + /* |
| + * VM_WIPEONFORK gets a clean slate in the child. |
| + * Don't prepare anon_vma until fault since we don't |
| + * copy page for current vma. |
| + */ |
| + tmp->anon_vma = NULL; |
| + } else if (anon_vma_fork(tmp, mpnt)) |
| + goto fail_nomem_anon_vma_fork; |
| + vm_flags_clear(tmp, VM_LOCKED_MASK); |
| + /* |
| + * Copy/update hugetlb private vma information. |
| + */ |
| + if (is_vm_hugetlb_page(tmp)) |
| + hugetlb_dup_vma_private(tmp); |
| + |
| + /* |
| + * Link the vma into the MT. After using __mt_dup(), memory |
| + * allocation is not necessary here, so it cannot fail. |
| + */ |
| + vma_iter_bulk_store(&vmi, tmp); |
| + |
| + mm->map_count++; |
| + |
| + if (tmp->vm_ops && tmp->vm_ops->open) |
| + tmp->vm_ops->open(tmp); |
| + |
| + file = tmp->vm_file; |
| + if (file) { |
| + struct address_space *mapping = file->f_mapping; |
| + |
| + get_file(file); |
| + i_mmap_lock_write(mapping); |
| + if (vma_is_shared_maywrite(tmp)) |
| + mapping_allow_writable(mapping); |
| + flush_dcache_mmap_lock(mapping); |
| + /* insert tmp into the share list, just after mpnt */ |
| + vma_interval_tree_insert_after(tmp, mpnt, |
| + &mapping->i_mmap); |
| + flush_dcache_mmap_unlock(mapping); |
| + i_mmap_unlock_write(mapping); |
| + } |
| + |
| + if (!(tmp->vm_flags & VM_WIPEONFORK)) |
| + retval = copy_page_range(tmp, mpnt); |
| + |
| + if (retval) { |
| + mpnt = vma_next(&vmi); |
| + goto loop_out; |
| + } |
| + } |
| + /* a new mm has just been created */ |
| + retval = arch_dup_mmap(oldmm, mm); |
| +loop_out: |
| + vma_iter_free(&vmi); |
| + if (!retval) { |
| + mt_set_in_rcu(vmi.mas.tree); |
| + ksm_fork(mm, oldmm); |
| + khugepaged_fork(mm, oldmm); |
| + } else { |
| + |
| + /* |
| + * The entire maple tree has already been duplicated. If the |
| + * mmap duplication fails, mark the failure point with |
| + * XA_ZERO_ENTRY. In exit_mmap(), if this marker is encountered, |
| + * stop releasing VMAs that have not been duplicated after this |
| + * point. |
| + */ |
| + if (mpnt) { |
| + mas_set_range(&vmi.mas, mpnt->vm_start, mpnt->vm_end - 1); |
| + mas_store(&vmi.mas, XA_ZERO_ENTRY); |
| + /* Avoid OOM iterating a broken tree */ |
| + set_bit(MMF_OOM_SKIP, &mm->flags); |
| + } |
| + /* |
| + * The mm_struct is going to exit, but the locks will be dropped |
| +		 * first. Setting the mm_struct as unstable is advisable as it is |
| + * not fully initialised. |
| + */ |
| + set_bit(MMF_UNSTABLE, &mm->flags); |
| + } |
| +out: |
| + mmap_write_unlock(mm); |
| + flush_tlb_mm(oldmm); |
| + mmap_write_unlock(oldmm); |
| + if (!retval) |
| + dup_userfaultfd_complete(&uf); |
| + else |
| + dup_userfaultfd_fail(&uf); |
| + return retval; |
| + |
| +fail_nomem_anon_vma_fork: |
| + mpol_put(vma_policy(tmp)); |
| +fail_nomem_policy: |
| + vm_area_free(tmp); |
| +fail_nomem: |
| + retval = -ENOMEM; |
| + vm_unacct_memory(charge); |
| + goto loop_out; |
| } |
| -#endif |
| --- a/mm/nommu.c~mm-move-dup_mmap-to-mm |
| +++ a/mm/nommu.c |
| @@ -1874,3 +1874,11 @@ static int __meminit init_admin_reserve( |
| return 0; |
| } |
| subsys_initcall(init_admin_reserve); |
| + |
| +int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) |
| +{ |
| + mmap_write_lock(oldmm); |
| + dup_mm_exe_file(mm, oldmm); |
| + mmap_write_unlock(oldmm); |
| + return 0; |
| +} |
| _ |