| From: "Liam R. Howlett" <Liam.Howlett@Oracle.com> |
| Subject: mm/mmap: avoid zeroing vma tree in mmap_region() |
| Date: Fri, 30 Aug 2024 00:00:54 -0400 |
| |
| Instead of zeroing the vma tree and then overwriting the area, let the |
| area be overwritten and then clean up the gathered vmas using |
| vms_complete_munmap_vmas(). |
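|
| The resulting flow in mmap_region() is roughly the following outline
| (illustrative only, not the exact code; it just names the helpers this
| patch relies on):
|
| 	error = vms_gather_munmap_vmas(&vms, &mas_detach);
| 	...				/* set up and map the new vma over the area */
| 	vms_complete_munmap_vmas(&vms, &mas_detach);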
| |
| To ensure locking is downgraded correctly, the mm is now set in the
| vma_munmap_struct unconditionally, regardless of whether this is a
| MAP_FIXED call over an existing mapping or not (NULL vma).
| |
| If a driver is mapping over an existing vma, then clear the ptes before
| the call_mmap() invocation. This is done using the vms_clean_up_area()
| helper. If the vma has a vm_ops->close(), that must also be called to
| ensure any cleanup is done before mapping over the area. This also
| means that, for now, a matching vm_ops->open() call has been added to
| the abort of an unmap operation.
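|
| For the file-backed case, the sequence around call_mmap() therefore
| becomes roughly the following (an outline only; the real change is in
| the mm/mmap.c hunk below):
|
| 	vma->vm_file = get_file(file);
| 	/* tear down old PTEs and run ->close() on the covered vmas */
| 	vms_clean_up_area(&vms, &mas_detach, true);
| 	error = call_mmap(file, vma);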
| |
| Since vm_ops->open() and vm_ops->close() do not always undo each other
| (state cleanup may exist in ->close() that is lost forever), the code
| cannot be left in this way, but that change has been isolated to
| another commit to make this point very obvious for traceability.
| |
| Temporarily keep track of the number of pages that will be removed, and
| reduce the amount charged against the new mapping accordingly.
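|
| As a hypothetical worked example (the numbers are made up purely for
| illustration; the real change is in the security_vm_enough_memory_mm()
| hunk below):
|
| 	charged = len >> PAGE_SHIFT;	/* e.g. 16 pages for the new mapping */
| 	charged -= nr_accounted;	/* e.g. 10 accounted pages being replaced */
| 	/* only 6 pages are newly charged; vms.nr_accounted is zeroed so the
| 	 * replaced pages are not unaccounted a second time at completion */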
| |
| This also drops the validate_mm() call in the vma_expand() function.
| Dropping the validation is necessary because it would fail: the mm
| map_count is temporarily incorrect during a vma expansion, prior to the
| cleanup from vms_complete_munmap_vmas().
| |
| Clean up the error handling of vms_gather_munmap_vmas() by performing
| the verification within the function itself.
| |
| Link: https://lkml.kernel.org/r/20240830040101.822209-15-Liam.Howlett@oracle.com |
| Signed-off-by: Liam R. Howlett <Liam.Howlett@Oracle.com> |
| Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> |
| Cc: Bert Karwatzki <spasswolf@web.de> |
| Cc: Jeff Xu <jeffxu@chromium.org> |
| Cc: Jiri Olsa <olsajiri@gmail.com> |
| Cc: Kees Cook <kees@kernel.org> |
| Cc: Lorenzo Stoakes <lstoakes@gmail.com> |
| Cc: Mark Brown <broonie@kernel.org> |
| Cc: Matthew Wilcox <willy@infradead.org> |
| Cc: "Paul E. McKenney" <paulmck@kernel.org> |
| Cc: Paul Moore <paul@paul-moore.com> |
| Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> |
| Cc: Suren Baghdasaryan <surenb@google.com> |
| Cc: Vlastimil Babka <vbabka@suse.cz> |
| Signed-off-by: Andrew Morton <akpm@linux-foundation.org> |
| --- |
| |
| mm/mmap.c | 57 ++++++++++++++++++++++++---------------------------- |
| mm/vma.c | 54 ++++++++++++++++++++++++++++++++++++++----------- |
| mm/vma.h | 22 ++++++++++++++------ |
| 3 files changed, 85 insertions(+), 48 deletions(-) |
| |
| --- a/mm/mmap.c~mm-mmap-avoid-zeroing-vma-tree-in-mmap_region |
| +++ a/mm/mmap.c |
| @@ -1373,23 +1373,19 @@ unsigned long mmap_region(struct file *f |
| unsigned long merge_start = addr, merge_end = end; |
| bool writable_file_mapping = false; |
| pgoff_t vm_pgoff; |
| - int error; |
| + int error = -ENOMEM; |
| VMA_ITERATOR(vmi, mm, addr); |
| + unsigned long nr_pages, nr_accounted; |
| |
| - /* Check against address space limit. */ |
| - if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) { |
| - unsigned long nr_pages; |
| + nr_pages = count_vma_pages_range(mm, addr, end, &nr_accounted); |
| |
| - /* |
| - * MAP_FIXED may remove pages of mappings that intersects with |
| - * requested mapping. Account for the pages it would unmap. |
| - */ |
| - nr_pages = count_vma_pages_range(mm, addr, end); |
| - |
| - if (!may_expand_vm(mm, vm_flags, |
| - (len >> PAGE_SHIFT) - nr_pages)) |
| - return -ENOMEM; |
| - } |
| + /* |
| + * Check against address space limit. |
| + * MAP_FIXED may remove pages of mappings that intersects with requested |
| + * mapping. Account for the pages it would unmap. |
| + */ |
| + if (!may_expand_vm(mm, vm_flags, (len >> PAGE_SHIFT) - nr_pages)) |
| + return -ENOMEM; |
| |
| /* Find the first overlapping VMA */ |
| vma = vma_find(&vmi, end); |
| @@ -1403,13 +1399,6 @@ unsigned long mmap_region(struct file *f |
| if (error) |
| goto gather_failed; |
| |
| - /* Remove any existing mappings from the vma tree */ |
| - error = vma_iter_clear_gfp(&vmi, addr, end, GFP_KERNEL); |
| - if (error) |
| - goto clear_tree_failed; |
| - |
| - /* Unmap any existing mapping in the area */ |
| - vms_complete_munmap_vmas(&vms, &mas_detach); |
| next = vms.next; |
| prev = vms.prev; |
| vma = NULL; |
| @@ -1425,8 +1414,10 @@ unsigned long mmap_region(struct file *f |
| */ |
| if (accountable_mapping(file, vm_flags)) { |
| charged = len >> PAGE_SHIFT; |
| + charged -= nr_accounted; |
| if (security_vm_enough_memory_mm(mm, charged)) |
| - return -ENOMEM; |
| + goto abort_munmap; |
| + vms.nr_accounted = 0; |
| vm_flags |= VM_ACCOUNT; |
| } |
| |
| @@ -1475,10 +1466,8 @@ cannot_expand: |
| * not unmapped, but the maps are removed from the list. |
| */ |
| vma = vm_area_alloc(mm); |
| - if (!vma) { |
| - error = -ENOMEM; |
| + if (!vma) |
| goto unacct_error; |
| - } |
| |
| vma_iter_config(&vmi, addr, end); |
| vma_set_range(vma, addr, end, pgoff); |
| @@ -1487,6 +1476,11 @@ cannot_expand: |
| |
| if (file) { |
| vma->vm_file = get_file(file); |
| + /* |
| + * call_mmap() may map PTE, so ensure there are no existing PTEs |
| +		 * and call the vm_ops close function if one exists.
| + */ |
| + vms_clean_up_area(&vms, &mas_detach, true); |
| error = call_mmap(file, vma); |
| if (error) |
| goto unmap_and_free_vma; |
| @@ -1577,6 +1571,9 @@ unmap_writable: |
| expanded: |
| perf_event_mmap(vma); |
| |
| + /* Unmap any existing mapping in the area */ |
| + vms_complete_munmap_vmas(&vms, &mas_detach); |
| + |
| vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT); |
| if (vm_flags & VM_LOCKED) { |
| if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) || |
| @@ -1605,7 +1602,7 @@ expanded: |
| return addr; |
| |
| close_and_free_vma: |
| - if (file && vma->vm_ops && vma->vm_ops->close) |
| + if (file && !vms.closed_vm_ops && vma->vm_ops && vma->vm_ops->close) |
| vma->vm_ops->close(vma); |
| |
| if (file || vma->vm_file) { |
| @@ -1625,9 +1622,9 @@ unacct_error: |
| if (charged) |
| vm_unacct_memory(charged); |
| |
| -clear_tree_failed: |
| - if (vms.vma_count) |
| - abort_munmap_vmas(&mas_detach); |
| +abort_munmap: |
| + if (vms.nr_pages) |
| + abort_munmap_vmas(&mas_detach, vms.closed_vm_ops); |
| gather_failed: |
| validate_mm(mm); |
| return error; |
| @@ -1960,7 +1957,7 @@ void exit_mmap(struct mm_struct *mm) |
| do { |
| if (vma->vm_flags & VM_ACCOUNT) |
| nr_accounted += vma_pages(vma); |
| - remove_vma(vma, true); |
| + remove_vma(vma, /* unreachable = */ true, /* closed = */ false); |
| count++; |
| cond_resched(); |
| vma = vma_next(&vmi); |
| --- a/mm/vma.c~mm-mmap-avoid-zeroing-vma-tree-in-mmap_region |
| +++ a/mm/vma.c |
| @@ -136,10 +136,10 @@ can_vma_merge_after(struct vm_area_struc |
| /* |
| * Close a vm structure and free it. |
| */ |
| -void remove_vma(struct vm_area_struct *vma, bool unreachable) |
| +void remove_vma(struct vm_area_struct *vma, bool unreachable, bool closed) |
| { |
| might_sleep(); |
| - if (vma->vm_ops && vma->vm_ops->close) |
| + if (!closed && vma->vm_ops && vma->vm_ops->close) |
| vma->vm_ops->close(vma); |
| if (vma->vm_file) |
| fput(vma->vm_file); |
| @@ -521,7 +521,6 @@ int vma_expand(struct vma_iterator *vmi, |
| vma_iter_store(vmi, vma); |
| |
| vma_complete(&vp, vmi, vma->vm_mm); |
| - validate_mm(vma->vm_mm); |
| return 0; |
| |
| nomem: |
| @@ -645,11 +644,14 @@ again: |
| uprobe_mmap(vp->insert); |
| } |
| |
| -static void vms_complete_pte_clear(struct vma_munmap_struct *vms, |
| - struct ma_state *mas_detach, bool mm_wr_locked) |
| +static inline void vms_clear_ptes(struct vma_munmap_struct *vms, |
| + struct ma_state *mas_detach, bool mm_wr_locked) |
| { |
| struct mmu_gather tlb; |
| |
| + if (!vms->clear_ptes) /* Nothing to do */ |
| + return; |
| + |
| /* |
| * We can free page tables without write-locking mmap_lock because VMAs |
| * were isolated before we downgraded mmap_lock. |
| @@ -658,11 +660,31 @@ static void vms_complete_pte_clear(struc |
| lru_add_drain(); |
| tlb_gather_mmu(&tlb, vms->mm); |
| update_hiwater_rss(vms->mm); |
| - unmap_vmas(&tlb, mas_detach, vms->vma, vms->start, vms->end, vms->vma_count, mm_wr_locked); |
| + unmap_vmas(&tlb, mas_detach, vms->vma, vms->start, vms->end, |
| + vms->vma_count, mm_wr_locked); |
| + |
| mas_set(mas_detach, 1); |
| /* start and end may be different if there is no prev or next vma. */ |
| - free_pgtables(&tlb, mas_detach, vms->vma, vms->unmap_start, vms->unmap_end, mm_wr_locked); |
| + free_pgtables(&tlb, mas_detach, vms->vma, vms->unmap_start, |
| + vms->unmap_end, mm_wr_locked); |
| tlb_finish_mmu(&tlb); |
| + vms->clear_ptes = false; |
| +} |
| + |
| +void vms_clean_up_area(struct vma_munmap_struct *vms, |
| + struct ma_state *mas_detach, bool mm_wr_locked) |
| +{ |
| + struct vm_area_struct *vma; |
| + |
| + if (!vms->nr_pages) |
| + return; |
| + |
| + vms_clear_ptes(vms, mas_detach, mm_wr_locked); |
| + mas_set(mas_detach, 0); |
| + mas_for_each(mas_detach, vma, ULONG_MAX) |
| + if (vma->vm_ops && vma->vm_ops->close) |
| + vma->vm_ops->close(vma); |
| + vms->closed_vm_ops = true; |
| } |
| |
| /* |
| @@ -686,7 +708,10 @@ void vms_complete_munmap_vmas(struct vma |
| if (vms->unlock) |
| mmap_write_downgrade(mm); |
| |
| - vms_complete_pte_clear(vms, mas_detach, !vms->unlock); |
| + if (!vms->nr_pages) |
| + return; |
| + |
| + vms_clear_ptes(vms, mas_detach, !vms->unlock); |
| /* Update high watermark before we lower total_vm */ |
| update_hiwater_vm(mm); |
| /* Stat accounting */ |
| @@ -702,7 +727,7 @@ void vms_complete_munmap_vmas(struct vma |
| /* Remove and clean up vmas */ |
| mas_set(mas_detach, 0); |
| mas_for_each(mas_detach, vma, ULONG_MAX) |
| - remove_vma(vma, false); |
| +		remove_vma(vma, /* unreachable = */ false, vms->closed_vm_ops);
| |
| vm_unacct_memory(vms->nr_accounted); |
| validate_mm(mm); |
| @@ -846,13 +871,14 @@ int vms_gather_munmap_vmas(struct vma_mu |
| while (vma_iter_addr(vms->vmi) > vms->start) |
| vma_iter_prev_range(vms->vmi); |
| |
| + vms->clear_ptes = true; |
| return 0; |
| |
| userfaultfd_error: |
| munmap_gather_failed: |
| end_split_failed: |
| modify_vma_failed: |
| - abort_munmap_vmas(mas_detach); |
| + abort_munmap_vmas(mas_detach, /* closed = */ false); |
| start_split_failed: |
| map_count_exceeded: |
| return error; |
| @@ -897,7 +923,7 @@ int do_vmi_align_munmap(struct vma_itera |
| return 0; |
| |
| clear_tree_failed: |
| - abort_munmap_vmas(&mas_detach); |
| + abort_munmap_vmas(&mas_detach, /* closed = */ false); |
| gather_failed: |
| validate_mm(mm); |
| return error; |
| @@ -1615,17 +1641,21 @@ bool vma_wants_writenotify(struct vm_are |
| } |
| |
| unsigned long count_vma_pages_range(struct mm_struct *mm, |
| - unsigned long addr, unsigned long end) |
| + unsigned long addr, unsigned long end, |
| + unsigned long *nr_accounted) |
| { |
| VMA_ITERATOR(vmi, mm, addr); |
| struct vm_area_struct *vma; |
| unsigned long nr_pages = 0; |
| |
| + *nr_accounted = 0; |
| for_each_vma_range(vmi, vma, end) { |
| unsigned long vm_start = max(addr, vma->vm_start); |
| unsigned long vm_end = min(end, vma->vm_end); |
| |
| nr_pages += PHYS_PFN(vm_end - vm_start); |
| + if (vma->vm_flags & VM_ACCOUNT) |
| + *nr_accounted += PHYS_PFN(vm_end - vm_start); |
| } |
| |
| return nr_pages; |
| --- a/mm/vma.h~mm-mmap-avoid-zeroing-vma-tree-in-mmap_region |
| +++ a/mm/vma.h |
| @@ -48,6 +48,8 @@ struct vma_munmap_struct { |
| unsigned long stack_vm; |
| unsigned long data_vm; |
| bool unlock; /* Unlock after the munmap */ |
| + bool clear_ptes; /* If there are outstanding PTE to be cleared */ |
| + bool closed_vm_ops; /* call_mmap() was encountered, so vmas may be closed */ |
| }; |
| |
| #ifdef CONFIG_DEBUG_VM_MAPLE_TREE |
| @@ -96,14 +98,13 @@ static inline void init_vma_munmap(struc |
| unsigned long start, unsigned long end, struct list_head *uf, |
| bool unlock) |
| { |
| + vms->mm = current->mm; |
| vms->vmi = vmi; |
| vms->vma = vma; |
| if (vma) { |
| - vms->mm = vma->vm_mm; |
| vms->start = start; |
| vms->end = end; |
| } else { |
| - vms->mm = NULL; |
| vms->start = vms->end = 0; |
| } |
| vms->unlock = unlock; |
| @@ -113,6 +114,8 @@ static inline void init_vma_munmap(struc |
| vms->exec_vm = vms->stack_vm = vms->data_vm = 0; |
| vms->unmap_start = FIRST_USER_ADDRESS; |
| vms->unmap_end = USER_PGTABLES_CEILING; |
| + vms->clear_ptes = false; |
| + vms->closed_vm_ops = false; |
| } |
| #endif |
| |
| @@ -122,18 +125,24 @@ int vms_gather_munmap_vmas(struct vma_mu |
| void vms_complete_munmap_vmas(struct vma_munmap_struct *vms, |
| struct ma_state *mas_detach); |
| |
| +void vms_clean_up_area(struct vma_munmap_struct *vms, |
| + struct ma_state *mas_detach, bool mm_wr_locked); |
| + |
| /* |
| * abort_munmap_vmas - Undo any munmap work and free resources |
| * |
| * Reattach any detached vmas and free up the maple tree used to track the vmas. |
| */ |
| -static inline void abort_munmap_vmas(struct ma_state *mas_detach) |
| +static inline void abort_munmap_vmas(struct ma_state *mas_detach, bool closed) |
| { |
| struct vm_area_struct *vma; |
| |
| mas_set(mas_detach, 0); |
| - mas_for_each(mas_detach, vma, ULONG_MAX) |
| + mas_for_each(mas_detach, vma, ULONG_MAX) { |
| vma_mark_detached(vma, false); |
| + if (closed && vma->vm_ops && vma->vm_ops->open) |
| + vma->vm_ops->open(vma); |
| + } |
| |
| __mt_destroy(mas_detach->tree); |
| } |
| @@ -147,7 +156,7 @@ int do_vmi_munmap(struct vma_iterator *v |
| unsigned long start, size_t len, struct list_head *uf, |
| bool unlock); |
| |
| -void remove_vma(struct vm_area_struct *vma, bool unreachable); |
| +void remove_vma(struct vm_area_struct *vma, bool unreachable, bool closed); |
| |
| void unmap_region(struct ma_state *mas, struct vm_area_struct *vma, |
| struct vm_area_struct *prev, struct vm_area_struct *next); |
| @@ -261,7 +270,8 @@ bool vma_wants_writenotify(struct vm_are |
| int mm_take_all_locks(struct mm_struct *mm); |
| void mm_drop_all_locks(struct mm_struct *mm); |
| unsigned long count_vma_pages_range(struct mm_struct *mm, |
| - unsigned long addr, unsigned long end); |
| + unsigned long addr, unsigned long end, |
| + unsigned long *nr_accounted); |
| |
| static inline bool vma_wants_manual_pte_write_upgrade(struct vm_area_struct *vma) |
| { |
| _ |