| From: Hugh Dickins <hughd@google.com> |
| Subject: mempolicy: mmap_lock is not needed while migrating folios |
| Date: Tue, 3 Oct 2023 02:27:47 -0700 (PDT) |
| |
| mbind(2) holds down_write of current task's mmap_lock throughout |
| (exclusive because it needs to set the new mempolicy on the vmas); |
| migrate_pages(2) holds down_read of pid's mmap_lock throughout. |
| |
| They both hold mmap_lock across the internal migrate_pages(), under which |
| all new page allocations (huge or small) are made. I'm nervous about it; |
| and migrate_pages() certainly does not need mmap_lock itself. It's done |
| this way for mbind(2), because its page allocator is vma_alloc_folio() or |
| alloc_hugetlb_folio_vma(), both of which depend on vma and address. |
| |
| Now that we have alloc_pages_mpol(), depending on (refcounted) memory |
| policy and interleave index, mbind(2) can be modified to use that or |
| alloc_hugetlb_folio_nodemask(), and then not need mmap_lock across the |
| internal migrate_pages() at all: add alloc_migration_target_by_mpol() to |
| replace mbind's new_folio(). |
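| |
| For orientation, here is the new callback in slightly condensed form (the |
| same code as the mm/mempolicy.c hunk below, just without the diff markers): |
| the target folio is chosen from the passed-in mempolicy and interleave |
| index alone, so no vma, and hence no mmap_lock, is involved. |
| |
| static struct folio *alloc_migration_target_by_mpol(struct folio *src, |
| 		unsigned long private) |
| { |
| 	struct mempolicy *pol = (struct mempolicy *)private; |
| 	pgoff_t ilx = src->index >> folio_order(src); /* improve on this later */ |
| 	int nid = numa_node_id(); |
| 	gfp_t gfp; |
| |
| 	if (folio_test_hugetlb(src)) { |
| 		struct hstate *h = folio_hstate(src); |
| 		nodemask_t *nodemask; |
| |
| 		gfp = htlb_alloc_mask(h); |
| 		nodemask = policy_nodemask(gfp, pol, ilx, &nid); |
| 		return alloc_hugetlb_folio_nodemask(h, nid, nodemask, gfp); |
| 	} |
| |
| 	gfp = folio_test_large(src) ? GFP_TRANSHUGE : |
| 	      GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL | __GFP_COMP; |
| 	return page_rmappable_folio(alloc_pages_mpol(gfp, folio_order(src), |
| 						     pol, ilx, nid)); |
| } |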
| |
| (After that change, alloc_hugetlb_folio_vma() is used by nothing but a |
| userfaultfd function: move it out of hugetlb.h, make it static, and place |
| it in mm/hugetlb.c's #ifdef CONFIG_USERFAULTFD block.) |
| |
| migrate_pages(2) has chosen its target node before migrating, so it can |
| continue to use the standard alloc_migration_target(); but let it take and |
| drop mmap_lock just around migrate_to_node()'s queue_pages_range(): |
| neither the node-to-node calculations nor the page migrations need it. |
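| |
| In sketch form (condensed from the mm/mempolicy.c hunks below, with the |
| unchanged tail of the migrate_pages() call elided), migrate_to_node() |
| becomes: |
| |
| 	mmap_read_lock(mm); |
| 	vma = find_vma(mm, 0); |
| 	nr_failed = queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask, |
| 			flags | MPOL_MF_DISCONTIG_OK, &pagelist); |
| 	mmap_read_unlock(mm); |
| |
| 	if (!list_empty(&pagelist)) { |
| 		/* no mmap_lock is held across the actual migration */ |
| 		err = migrate_pages(&pagelist, alloc_migration_target, NULL, |
| 				...); |
| 	} |
| |
| and do_migrate_pages() itself no longer takes mmap_lock at all. |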
| |
| It seems unlikely, but it is conceivable that some userspace depends on |
| the kernel's mmap_lock exclusion here, instead of doing its own locking: |
| more likely in a testsuite than in real life. It is also possible, of |
| course, that some pages on the list will be munmapped by another thread |
| before they are migrated, or a newer memory policy applied to the range by |
| that time: but such races could already happen before this change, as soon |
| as mmap_lock was dropped, so they do not appear to be a concern. |
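| |
| As a purely hypothetical illustration (no such program is known, and none |
| of the code below is part of this patch), a userspace that wants that |
| exclusion would now serialize its own policy changes against unmapping, |
| for example: |
| |
| #include <numaif.h>	/* mbind(); link with -lnuma */ |
| #include <pthread.h> |
| #include <sys/mman.h> |
| |
| static pthread_mutex_t range_lock = PTHREAD_MUTEX_INITIALIZER; |
| |
| /* Hypothetical helpers: threads call these instead of relying on any |
|  * kernel-internal mmap_lock exclusion between mbind() and munmap(). |
|  */ |
| static long rebind_range(void *addr, size_t len, |
| 		const unsigned long *nodemask, unsigned long maxnode) |
| { |
| 	long ret; |
| |
| 	pthread_mutex_lock(&range_lock); |
| 	ret = mbind(addr, len, MPOL_BIND, nodemask, maxnode, |
| 			MPOL_MF_MOVE | MPOL_MF_STRICT); |
| 	pthread_mutex_unlock(&range_lock); |
| 	return ret; |
| } |
| |
| static int unmap_range(void *addr, size_t len) |
| { |
| 	int ret; |
| |
| 	pthread_mutex_lock(&range_lock); |
| 	ret = munmap(addr, len); |
| 	pthread_mutex_unlock(&range_lock); |
| 	return ret; |
| } |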
| |
| Link: https://lkml.kernel.org/r/21e564e8-269f-6a89-7ee2-fd612831c289@google.com |
| Signed-off-by: Hugh Dickins <hughd@google.com> |
| Cc: Andi Kleen <ak@linux.intel.com> |
| Cc: Christoph Lameter <cl@linux.com> |
| Cc: David Hildenbrand <david@redhat.com> |
| Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> |
| Cc: "Huang, Ying" <ying.huang@intel.com> |
| Cc: Kefeng Wang <wangkefeng.wang@huawei.com> |
| Cc: Matthew Wilcox (Oracle) <willy@infradead.org> |
| Cc: Mel Gorman <mgorman@techsingularity.net> |
| Cc: Michal Hocko <mhocko@suse.com> |
| Cc: Mike Kravetz <mike.kravetz@oracle.com> |
| Cc: Nhat Pham <nphamcs@gmail.com> |
| Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> |
| Cc: Suren Baghdasaryan <surenb@google.com> |
| Cc: Tejun Heo <tj@kernel.org> |
| Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> |
| Cc: Yang Shi <shy828301@gmail.com> |
| Cc: Yosry Ahmed <yosryahmed@google.com> |
| Signed-off-by: Andrew Morton <akpm@linux-foundation.org> |
| --- |
| |
| include/linux/hugetlb.h | 9 ---- |
| mm/hugetlb.c | 38 +++++++++-------- |
| mm/mempolicy.c | 83 +++++++++++++++++++------------------- |
| 3 files changed, 63 insertions(+), 67 deletions(-) |
| |
| --- a/include/linux/hugetlb.h~mempolicy-mmap_lock-is-not-needed-while-migrating-folios |
| +++ a/include/linux/hugetlb.h |
| @@ -748,8 +748,6 @@ struct folio *alloc_hugetlb_folio(struct |
| unsigned long addr, int avoid_reserve); |
| struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, |
| nodemask_t *nmask, gfp_t gfp_mask); |
| -struct folio *alloc_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *vma, |
| - unsigned long address); |
| int hugetlb_add_to_page_cache(struct folio *folio, struct address_space *mapping, |
| pgoff_t idx); |
| void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, |
| @@ -1071,13 +1069,6 @@ alloc_hugetlb_folio_nodemask(struct hsta |
| { |
| return NULL; |
| } |
| - |
| -static inline struct folio *alloc_hugetlb_folio_vma(struct hstate *h, |
| - struct vm_area_struct *vma, |
| - unsigned long address) |
| -{ |
| - return NULL; |
| -} |
| |
| static inline int __alloc_bootmem_huge_page(struct hstate *h) |
| { |
| --- a/mm/hugetlb.c~mempolicy-mmap_lock-is-not-needed-while-migrating-folios |
| +++ a/mm/hugetlb.c |
| @@ -2630,24 +2630,6 @@ struct folio *alloc_hugetlb_folio_nodema |
| return alloc_migrate_hugetlb_folio(h, gfp_mask, preferred_nid, nmask); |
| } |
| |
| -/* mempolicy aware migration callback */ |
| -struct folio *alloc_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *vma, |
| - unsigned long address) |
| -{ |
| - struct mempolicy *mpol; |
| - nodemask_t *nodemask; |
| - struct folio *folio; |
| - gfp_t gfp_mask; |
| - int node; |
| - |
| - gfp_mask = htlb_alloc_mask(h); |
| - node = huge_node(vma, address, gfp_mask, &mpol, &nodemask); |
| - folio = alloc_hugetlb_folio_nodemask(h, node, nodemask, gfp_mask); |
| - mpol_cond_put(mpol); |
| - |
| - return folio; |
| -} |
| - |
| /* |
| * Increase the hugetlb pool such that it can accommodate a reservation |
| * of size 'delta'. |
| @@ -6560,6 +6542,26 @@ out_mutex: |
| |
| #ifdef CONFIG_USERFAULTFD |
| /* |
| + * Can probably be eliminated, but still used by hugetlb_mfill_atomic_pte(). |
| + */ |
| +static struct folio *alloc_hugetlb_folio_vma(struct hstate *h, |
| + struct vm_area_struct *vma, unsigned long address) |
| +{ |
| + struct mempolicy *mpol; |
| + nodemask_t *nodemask; |
| + struct folio *folio; |
| + gfp_t gfp_mask; |
| + int node; |
| + |
| + gfp_mask = htlb_alloc_mask(h); |
| + node = huge_node(vma, address, gfp_mask, &mpol, &nodemask); |
| + folio = alloc_hugetlb_folio_nodemask(h, node, nodemask, gfp_mask); |
| + mpol_cond_put(mpol); |
| + |
| + return folio; |
| +} |
| + |
| +/* |
| * Used by userfaultfd UFFDIO_* ioctls. Based on userfaultfd's mfill_atomic_pte |
| * with modifications for hugetlb pages. |
| */ |
| --- a/mm/mempolicy.c~mempolicy-mmap_lock-is-not-needed-while-migrating-folios |
| +++ a/mm/mempolicy.c |
| @@ -415,6 +415,8 @@ static const struct mempolicy_operations |
| |
| static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist, |
| unsigned long flags); |
| +static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol, |
| + pgoff_t ilx, int *nid); |
| |
| static bool strictly_unmovable(unsigned long flags) |
| { |
| @@ -1021,6 +1023,8 @@ static long migrate_to_node(struct mm_st |
| node_set(source, nmask); |
| |
| VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))); |
| + |
| + mmap_read_lock(mm); |
| vma = find_vma(mm, 0); |
| |
| /* |
| @@ -1031,6 +1035,7 @@ static long migrate_to_node(struct mm_st |
| */ |
| nr_failed = queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask, |
| flags | MPOL_MF_DISCONTIG_OK, &pagelist); |
| + mmap_read_unlock(mm); |
| |
| if (!list_empty(&pagelist)) { |
| err = migrate_pages(&pagelist, alloc_migration_target, NULL, |
| @@ -1059,8 +1064,6 @@ int do_migrate_pages(struct mm_struct *m |
| |
| lru_cache_disable(); |
| |
| - mmap_read_lock(mm); |
| - |
| /* |
| * Find a 'source' bit set in 'tmp' whose corresponding 'dest' |
| * bit in 'to' is not also set in 'tmp'. Clear the found 'source' |
| @@ -1140,7 +1143,6 @@ int do_migrate_pages(struct mm_struct *m |
| if (err < 0) |
| break; |
| } |
| - mmap_read_unlock(mm); |
| |
| lru_cache_enable(); |
| if (err < 0) |
| @@ -1149,44 +1151,38 @@ int do_migrate_pages(struct mm_struct *m |
| } |
| |
| /* |
| - * Allocate a new page for page migration based on vma policy. |
| - * Start by assuming the page is mapped by the same vma as contains @start. |
| - * Search forward from there, if not. N.B., this assumes that the |
| - * list of pages handed to migrate_pages()--which is how we get here-- |
| - * is in virtual address order. |
| + * Allocate a new folio for page migration, according to NUMA mempolicy. |
| */ |
| -static struct folio *new_folio(struct folio *src, unsigned long start) |
| +static struct folio *alloc_migration_target_by_mpol(struct folio *src, |
| + unsigned long private) |
| { |
| - struct vm_area_struct *vma; |
| - unsigned long address; |
| - VMA_ITERATOR(vmi, current->mm, start); |
| - gfp_t gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL; |
| - |
| - for_each_vma(vmi, vma) { |
| - address = page_address_in_vma(&src->page, vma); |
| - if (address != -EFAULT) |
| - break; |
| - } |
| + struct mempolicy *pol = (struct mempolicy *)private; |
| + pgoff_t ilx = 0; /* improve on this later */ |
| + struct page *page; |
| + unsigned int order; |
| + int nid = numa_node_id(); |
| + gfp_t gfp; |
| |
| - /* |
| - * __get_vma_policy() now expects a genuine non-NULL vma. Return NULL |
| - * when the page can no longer be located in a vma: that is not ideal |
| - * (migrate_pages() will give up early, presuming ENOMEM), but good |
| - * enough to avoid a crash by syzkaller or concurrent holepunch. |
| - */ |
| - if (!vma) |
| - return NULL; |
| + order = folio_order(src); |
| + ilx += src->index >> order; |
| |
| if (folio_test_hugetlb(src)) { |
| - return alloc_hugetlb_folio_vma(folio_hstate(src), |
| - vma, address); |
| + nodemask_t *nodemask; |
| + struct hstate *h; |
| + |
| + h = folio_hstate(src); |
| + gfp = htlb_alloc_mask(h); |
| + nodemask = policy_nodemask(gfp, pol, ilx, &nid); |
| + return alloc_hugetlb_folio_nodemask(h, nid, nodemask, gfp); |
| } |
| |
| if (folio_test_large(src)) |
| gfp = GFP_TRANSHUGE; |
| + else |
| + gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL | __GFP_COMP; |
| |
| - return vma_alloc_folio(gfp, folio_order(src), vma, address, |
| - folio_test_large(src)); |
| + page = alloc_pages_mpol(gfp, order, pol, ilx, nid); |
| + return page_rmappable_folio(page); |
| } |
| #else |
| |
| @@ -1202,7 +1198,8 @@ int do_migrate_pages(struct mm_struct *m |
| return -ENOSYS; |
| } |
| |
| -static struct folio *new_folio(struct folio *src, unsigned long start) |
| +static struct folio *alloc_migration_target_by_mpol(struct folio *src, |
| + unsigned long private) |
| { |
| return NULL; |
| } |
| @@ -1276,6 +1273,7 @@ static long do_mbind(unsigned long start |
| |
| if (nr_failed < 0) { |
| err = nr_failed; |
| + nr_failed = 0; |
| } else { |
| vma_iter_init(&vmi, mm, start); |
| prev = vma_prev(&vmi); |
| @@ -1286,19 +1284,24 @@ static long do_mbind(unsigned long start |
| } |
| } |
| |
| - if (!err) { |
| - if (!list_empty(&pagelist)) { |
| - nr_failed |= migrate_pages(&pagelist, new_folio, NULL, |
| - start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND, NULL); |
| + mmap_write_unlock(mm); |
| + |
| + if (!err && !list_empty(&pagelist)) { |
| + /* Convert MPOL_DEFAULT's NULL to task or default policy */ |
| + if (!new) { |
| + new = get_task_policy(current); |
| + mpol_get(new); |
| } |
| - if (nr_failed && (flags & MPOL_MF_STRICT)) |
| - err = -EIO; |
| + nr_failed |= migrate_pages(&pagelist, |
| + alloc_migration_target_by_mpol, NULL, |
| + (unsigned long)new, MIGRATE_SYNC, |
| + MR_MEMPOLICY_MBIND, NULL); |
| } |
| |
| + if (nr_failed && (flags & MPOL_MF_STRICT)) |
| + err = -EIO; |
| if (!list_empty(&pagelist)) |
| putback_movable_pages(&pagelist); |
| - |
| - mmap_write_unlock(mm); |
| mpol_out: |
| mpol_put(new); |
| if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) |
| _ |