| From: Mike Kravetz <mike.kravetz@oracle.com> |
| Subject: hugetlb: skip to end of PT page mapping when pte not present |
| Date: Tue, 21 Jun 2022 16:56:17 -0700 |
| |
| Patch series "hugetlb: speed up linear address scanning", v2. |
| |
| At unmap, fork and remap time, hugetlb address ranges are linearly scanned. |
| We can optimize these scans if the ranges are sparsely populated. |
| |
| Also, enable page table "lazy copy" for hugetlb at fork. |
| |
| NOTE: Architectures not defining CONFIG_ARCH_WANT_GENERAL_HUGETLB need to |
| add an arch-specific version of hugetlb_mask_last_page() to take advantage |
| of the sparse address scanning improvements.  Baolin Wang added the routine |
| for arm64.  Other architectures which could be optimized are: ia64, mips, |
| parisc, powerpc, s390, sh and sparc. |
| |
| |
| This patch (of 4): |
| |
| HugeTLB address ranges are linearly scanned during fork, unmap and remap |
| operations.  If a non-present entry is encountered, the code currently |
| continues to the next huge page aligned address.  However, a non-present |
| entry implies that the page table page for that entry is not present. |
| Therefore, the linear scan can skip to the end of the range mapped by that |
| page table page.  This can speed up operations on large, sparsely populated |
| hugetlb mappings. |
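| |
| As a rough sketch, the scan pattern this series introduces looks like the |
| following (simplified from the copy_hugetlb_page_range() hunk below; the |
| surrounding variables are abbreviated): |
| |
|         last_addr_mask = hugetlb_mask_last_page(h); |
|         for (addr = start; addr < end; addr += sz) { |
|                 pte = huge_pte_offset(mm, addr, sz); |
|                 if (!pte) { |
|                         /* |
|                          * No page table page maps this range: jump to the |
|                          * last huge page it would map; the loop increment |
|                          * then moves past the entire page table page. |
|                          */ |
|                         addr |= last_addr_mask; |
|                         continue; |
|                 } |
|                 /* ... process the present entry as before ... */ |
|         } |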
| |
| Create a new routine hugetlb_mask_last_page() that returns an address mask. |
| When the mask is ORed with an address, the result is the address of the last |
| huge page mapped by the associated page table page.  Use this mask to update |
| addresses in routines which linearly scan hugetlb address ranges when a |
| non-present pte is encountered. |
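| |
| For example (illustration only, assuming x86-64's 2MB PMD_SIZE and 1GB |
| PUD_SIZE), the generic implementation added below returns |
| PUD_SIZE - PMD_SIZE == 0x3fe00000 for a PMD-sized huge page, so a scan that |
| finds no pte skips ahead like this: |
| |
|         addr  = 0x7f0000400000;   /* huge_pte_offset() returned NULL */ |
|         addr |= 0x3fe00000;       /* 0x7f003fe00000: last 2MB slot in this PUD */ |
|         addr += 0x200000;         /* loop step -> 0x7f0040000000, next PUD region */ |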
| |
| hugetlb_mask_last_page() is tied to the implementation of huge_pte_offset(), |
| as it is called when huge_pte_offset() returns NULL.  This patch provides a |
| complete hugetlb_mask_last_page() implementation only when |
| CONFIG_ARCH_WANT_GENERAL_HUGETLB is defined.  Architectures which provide |
| their own versions of huge_pte_offset() can also provide their own version |
| of hugetlb_mask_last_page(). |
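| |
| As a hypothetical sketch (not part of this patch; the actual arm64 routine |
| is added later in the series), an architecture whose huge_pte_offset() also |
| handles a contiguous-PTE huge page size could provide something along these |
| lines, where CONT_PTE_SIZE stands in for that arch-specific size: |
| |
|         unsigned long hugetlb_mask_last_page(struct hstate *h) |
|         { |
|                 unsigned long hp_size = huge_page_size(h); |
| |
|                 if (hp_size == PUD_SIZE) |
|                         return P4D_SIZE - PUD_SIZE; |
|                 else if (hp_size == PMD_SIZE) |
|                         return PUD_SIZE - PMD_SIZE; |
|                 else if (hp_size == CONT_PTE_SIZE) |
|                         /* contiguous PTEs live within one PMD-mapped range */ |
|                         return PMD_SIZE - CONT_PTE_SIZE; |
| |
|                 return 0UL; |
|         } |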
| |
| Link: https://lkml.kernel.org/r/20220621235620.291305-1-mike.kravetz@oracle.com |
| Link: https://lkml.kernel.org/r/20220621235620.291305-2-mike.kravetz@oracle.com |
| Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com> |
| Tested-by: Baolin Wang <baolin.wang@linux.alibaba.com> |
| Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com> |
| Acked-by: Muchun Song <songmuchun@bytedance.com> |
| Reported-by: kernel test robot <lkp@intel.com> |
| Cc: Michal Hocko <mhocko@suse.com> |
| Cc: Peter Xu <peterx@redhat.com> |
| Cc: Naoya Horiguchi <naoya.horiguchi@linux.dev> |
| Cc: James Houghton <jthoughton@google.com> |
| Cc: Mina Almasry <almasrymina@google.com> |
| Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com> |
| Cc: Anshuman Khandual <anshuman.khandual@arm.com> |
| Cc: Paul Walmsley <paul.walmsley@sifive.com> |
| Cc: Christian Borntraeger <borntraeger@linux.ibm.com> |
| Cc: Catalin Marinas <catalin.marinas@arm.com> |
| Cc: Will Deacon <will@kernel.org> |
| Cc: Rolf Eike Beer <eike-kernel@sf-tec.de> |
| Cc: David Hildenbrand <david@redhat.com> |
| Signed-off-by: Andrew Morton <akpm@linux-foundation.org> |
| --- |
| |
| include/linux/hugetlb.h | 1 |
| mm/hugetlb.c | 56 ++++++++++++++++++++++++++++++++++---- |
| 2 files changed, 52 insertions(+), 5 deletions(-) |
| |
| --- a/include/linux/hugetlb.h~hugetlb-skip-to-end-of-pt-page-mapping-when-pte-not-present |
| +++ a/include/linux/hugetlb.h |
| @@ -194,6 +194,7 @@ pte_t *huge_pte_alloc(struct mm_struct * |
| unsigned long addr, unsigned long sz); |
| pte_t *huge_pte_offset(struct mm_struct *mm, |
| unsigned long addr, unsigned long sz); |
| +unsigned long hugetlb_mask_last_page(struct hstate *h); |
| int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, |
| unsigned long *addr, pte_t *ptep); |
| void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, |
| --- a/mm/hugetlb.c~hugetlb-skip-to-end-of-pt-page-mapping-when-pte-not-present |
| +++ a/mm/hugetlb.c |
| @@ -4727,6 +4727,7 @@ int copy_hugetlb_page_range(struct mm_st |
| unsigned long npages = pages_per_huge_page(h); |
| struct address_space *mapping = src_vma->vm_file->f_mapping; |
| struct mmu_notifier_range range; |
| + unsigned long last_addr_mask; |
| int ret = 0; |
| |
| if (cow) { |
| @@ -4746,11 +4747,14 @@ int copy_hugetlb_page_range(struct mm_st |
| i_mmap_lock_read(mapping); |
| } |
| |
| + last_addr_mask = hugetlb_mask_last_page(h); |
| for (addr = src_vma->vm_start; addr < src_vma->vm_end; addr += sz) { |
| spinlock_t *src_ptl, *dst_ptl; |
| src_pte = huge_pte_offset(src, addr, sz); |
| - if (!src_pte) |
| + if (!src_pte) { |
| + addr |= last_addr_mask; |
| continue; |
| + } |
| dst_pte = huge_pte_alloc(dst, dst_vma, addr, sz); |
| if (!dst_pte) { |
| ret = -ENOMEM; |
| @@ -4767,8 +4771,10 @@ int copy_hugetlb_page_range(struct mm_st |
| * after taking the lock below. |
| */ |
| dst_entry = huge_ptep_get(dst_pte); |
| - if ((dst_pte == src_pte) || !huge_pte_none(dst_entry)) |
| + if ((dst_pte == src_pte) || !huge_pte_none(dst_entry)) { |
| + addr |= last_addr_mask; |
| continue; |
| + } |
| |
| dst_ptl = huge_pte_lock(h, dst, dst_pte); |
| src_ptl = huge_pte_lockptr(h, src, src_pte); |
| @@ -4928,6 +4934,7 @@ int move_hugetlb_page_tables(struct vm_a |
| unsigned long sz = huge_page_size(h); |
| struct mm_struct *mm = vma->vm_mm; |
| unsigned long old_end = old_addr + len; |
| + unsigned long last_addr_mask; |
| unsigned long old_addr_copy; |
| pte_t *src_pte, *dst_pte; |
| struct mmu_notifier_range range; |
| @@ -4943,12 +4950,16 @@ int move_hugetlb_page_tables(struct vm_a |
| flush_cache_range(vma, range.start, range.end); |
| |
| mmu_notifier_invalidate_range_start(&range); |
| + last_addr_mask = hugetlb_mask_last_page(h); |
| /* Prevent race with file truncation */ |
| i_mmap_lock_write(mapping); |
| for (; old_addr < old_end; old_addr += sz, new_addr += sz) { |
| src_pte = huge_pte_offset(mm, old_addr, sz); |
| - if (!src_pte) |
| + if (!src_pte) { |
| + old_addr |= last_addr_mask; |
| + new_addr |= last_addr_mask; |
| continue; |
| + } |
| if (huge_pte_none(huge_ptep_get(src_pte))) |
| continue; |
| |
| @@ -4993,6 +5004,7 @@ static void __unmap_hugepage_range(struc |
| struct hstate *h = hstate_vma(vma); |
| unsigned long sz = huge_page_size(h); |
| struct mmu_notifier_range range; |
| + unsigned long last_addr_mask; |
| bool force_flush = false; |
| |
| WARN_ON(!is_vm_hugetlb_page(vma)); |
| @@ -5013,11 +5025,14 @@ static void __unmap_hugepage_range(struc |
| end); |
| adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); |
| mmu_notifier_invalidate_range_start(&range); |
| + last_addr_mask = hugetlb_mask_last_page(h); |
| address = start; |
| for (; address < end; address += sz) { |
| ptep = huge_pte_offset(mm, address, sz); |
| - if (!ptep) |
| + if (!ptep) { |
| + address |= last_addr_mask; |
| continue; |
| + } |
| |
| ptl = huge_pte_lock(h, mm, ptep); |
| if (huge_pmd_unshare(mm, vma, &address, ptep)) { |
| @@ -6285,6 +6300,7 @@ unsigned long hugetlb_change_protection( |
| unsigned long pages = 0, psize = huge_page_size(h); |
| bool shared_pmd = false; |
| struct mmu_notifier_range range; |
| + unsigned long last_addr_mask; |
| bool uffd_wp = cp_flags & MM_CP_UFFD_WP; |
| bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE; |
| |
| @@ -6301,12 +6317,15 @@ unsigned long hugetlb_change_protection( |
| flush_cache_range(vma, range.start, range.end); |
| |
| mmu_notifier_invalidate_range_start(&range); |
| + last_addr_mask = hugetlb_mask_last_page(h); |
| i_mmap_lock_write(vma->vm_file->f_mapping); |
| for (; address < end; address += psize) { |
| spinlock_t *ptl; |
| ptep = huge_pte_offset(mm, address, psize); |
| - if (!ptep) |
| + if (!ptep) { |
| + address |= last_addr_mask; |
| continue; |
| + } |
| ptl = huge_pte_lock(h, mm, ptep); |
| if (huge_pmd_unshare(mm, vma, &address, ptep)) { |
| /* |
| @@ -6856,6 +6875,33 @@ pte_t *huge_pte_offset(struct mm_struct |
| return (pte_t *)pmd; |
| } |
| |
| +/* |
| + * Return a mask that can be used to update an address to the last huge |
| + * page in a page table page mapping size. Used to skip non-present |
| + * page table entries when linearly scanning address ranges. Architectures |
| + * with unique huge page to page table relationships can define their own |
| + * version of this routine. |
| + */ |
| +unsigned long hugetlb_mask_last_page(struct hstate *h) |
| +{ |
| + unsigned long hp_size = huge_page_size(h); |
| + |
| + if (hp_size == PUD_SIZE) |
| + return P4D_SIZE - PUD_SIZE; |
| + else if (hp_size == PMD_SIZE) |
| + return PUD_SIZE - PMD_SIZE; |
| + else |
| + return 0UL; |
| +} |
| + |
| +#else |
| + |
| +/* See description above. Architectures can provide their own version. */ |
| +__weak unsigned long hugetlb_mask_last_page(struct hstate *h) |
| +{ |
| + return 0UL; |
| +} |
| + |
| #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */ |
| |
| /* |
| _ |