| From: Dev Jain <dev.jain@arm.com> |
| Subject: mm: abstract THP allocation |
| Date: Tue, 8 Oct 2024 11:47:45 +0530 |
| |
| Patch series "Do not shatter hugezeropage on wp-fault", v7. |
| |
| It was observed at [1] and [2] that the current kernel behaviour of
| shattering a hugezeropage is inconsistent and suboptimal. For a VMA with
| a THP allowable order, when we write-fault on it, the kernel installs a
| PMD-mapped THP. On the other hand, if we first get a read fault, we get
| a PMD pointing to the hugezeropage; a subsequent write then triggers a
| write-protection fault, shattering the hugezeropage into one writable
| page with all the other PTEs write-protected. The conclusion is that,
| compared to the single write-fault case, applications have to suffer
| 512 extra page faults if they use the VMA this way, plus we get the
| overhead of khugepaged trying to replace that area with a THP anyway.
| |
| Instead, replace the hugezeropage with a THP on wp-fault. |
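| 
| For illustration only (not part of this series), a minimal userspace
| sketch of the scenario described above, assuming 4K base pages, a 2M
| PMD size and a THP-eligible, PMD-aligned mapping:
| 
| 	#include <sys/mman.h>
| 
| 	int main(void)
| 	{
| 		size_t sz = 2UL << 20;	/* one PMD-sized region */
| 		char *p = mmap(NULL, sz, PROT_READ | PROT_WRITE,
| 			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
| 
| 		if (p == MAP_FAILED)
| 			return 1;
| 		madvise(p, sz, MADV_HUGEPAGE);	/* make the VMA THP-eligible */
| 
| 		/* read fault: PMD now points at the hugezeropage */
| 		volatile char c = p[0];
| 		(void)c;
| 
| 		/*
| 		 * write fault: previously shattered into 512 PTEs;
| 		 * with this series, replaced by a PMD-mapped THP.
| 		 */
| 		p[0] = 1;
| 
| 		munmap(p, sz);
| 		return 0;
| 	}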
| |
| [1]: https://lore.kernel.org/all/3743d7e1-0b79-4eaf-82d5-d1ca29fe347d@arm.com/ |
| [2]: https://lore.kernel.org/all/1cfae0c0-96a2-4308-9c62-f7a640520242@arm.com/ |
| |
| |
| This patch (of 2): |
| |
| In preparation for the second patch, abstract away the THP allocation |
| logic present in the create_huge_pmd() path, which corresponds to the |
| faulting case when no page is present. |
| |
| There should be no functional change as a result of applying this patch,
| except that, as David notes at [1], a PMD-aligned address is now passed
| to update_mmu_cache_pmd().
| |
| [1]: https://lore.kernel.org/all/ddd3fcd2-48b3-4170-bcaa-2fe66e093f43@redhat.com/ |
| |
| Link: https://lkml.kernel.org/r/20241008061746.285961-1-dev.jain@arm.com |
| Link: https://lkml.kernel.org/r/20241008061746.285961-2-dev.jain@arm.com |
| Signed-off-by: Dev Jain <dev.jain@arm.com> |
| Acked-by: David Hildenbrand <david@redhat.com> |
| Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com> |
| Cc: Alistair Popple <apopple@nvidia.com> |
| Cc: Aneesh Kumar K.V <aneesh.kumar@kernel.org> |
| Cc: Anshuman Khandual <anshuman.khandual@arm.com> |
| Cc: Barry Song <baohua@kernel.org> |
| Cc: Catalin Marinas <catalin.marinas@arm.com> |
| Cc: Christoph Lameter <cl@gentwo.org> |
| Cc: Dave Hansen <dave.hansen@linux.intel.com> |
| Cc: Hugh Dickins <hughd@google.com> |
| Cc: Jan Kara <jack@suse.cz> |
| Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> |
| Cc: Lance Yang <ioworker0@gmail.com> |
| Cc: Mark Rutland <mark.rutland@arm.com> |
| Cc: Matthew Wilcox <willy@infradead.org> |
| Cc: Michal Hocko <mhocko@suse.com> |
| Cc: Peter Xu <peterx@redhat.com> |
| Cc: Ryan Roberts <ryan.roberts@arm.com> |
| Cc: Vlastimil Babka <vbabka@suse.cz> |
| Cc: Will Deacon <will@kernel.org> |
| Cc: Yang Shi <yang@os.amperecomputing.com> |
| Cc: Zi Yan <ziy@nvidia.com> |
| Signed-off-by: Andrew Morton <akpm@linux-foundation.org> |
| --- |
| |
| mm/huge_memory.c | 98 ++++++++++++++++++++++++++------------------- |
| 1 file changed, 57 insertions(+), 41 deletions(-) |
| |
| --- a/mm/huge_memory.c~mm-abstract-thp-allocation |
| +++ a/mm/huge_memory.c |
| @@ -1139,47 +1139,81 @@ unsigned long thp_get_unmapped_area(stru |
| } |
| EXPORT_SYMBOL_GPL(thp_get_unmapped_area); |
| |
| -static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, |
| - struct page *page, gfp_t gfp) |
| +static struct folio *vma_alloc_anon_folio_pmd(struct vm_area_struct *vma, |
| + unsigned long addr) |
| { |
| - struct vm_area_struct *vma = vmf->vma; |
| - struct folio *folio = page_folio(page); |
| - pgtable_t pgtable; |
| - unsigned long haddr = vmf->address & HPAGE_PMD_MASK; |
| - vm_fault_t ret = 0; |
| + gfp_t gfp = vma_thp_gfp_mask(vma); |
| + const int order = HPAGE_PMD_ORDER; |
| + struct folio *folio; |
| |
| - VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); |
| + folio = vma_alloc_folio(gfp, order, vma, addr & HPAGE_PMD_MASK, true); |
| |
| + if (unlikely(!folio)) { |
| + count_vm_event(THP_FAULT_FALLBACK); |
| + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK); |
| + return NULL; |
| + } |
| + |
| + VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); |
| if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) { |
| folio_put(folio); |
| count_vm_event(THP_FAULT_FALLBACK); |
| count_vm_event(THP_FAULT_FALLBACK_CHARGE); |
| - count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK); |
| - count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); |
| - return VM_FAULT_FALLBACK; |
| + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK); |
| + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); |
| + return NULL; |
| } |
| folio_throttle_swaprate(folio, gfp); |
| |
| - pgtable = pte_alloc_one(vma->vm_mm); |
| - if (unlikely(!pgtable)) { |
| - ret = VM_FAULT_OOM; |
| - goto release; |
| - } |
| - |
| - folio_zero_user(folio, vmf->address); |
| + folio_zero_user(folio, addr); |
| /* |
| * The memory barrier inside __folio_mark_uptodate makes sure that |
| * folio_zero_user writes become visible before the set_pmd_at() |
| * write. |
| */ |
| __folio_mark_uptodate(folio); |
| + return folio; |
| +} |
| + |
| +static void map_anon_folio_pmd(struct folio *folio, pmd_t *pmd, |
| + struct vm_area_struct *vma, unsigned long haddr) |
| +{ |
| + pmd_t entry; |
| + |
| + entry = mk_huge_pmd(&folio->page, vma->vm_page_prot); |
| + entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); |
| + folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE); |
| + folio_add_lru_vma(folio, vma); |
| + set_pmd_at(vma->vm_mm, haddr, pmd, entry); |
| + update_mmu_cache_pmd(vma, haddr, pmd); |
| + add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); |
| + count_vm_event(THP_FAULT_ALLOC); |
| + count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC); |
| + count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC); |
| +} |
| + |
| +static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf) |
| +{ |
| + unsigned long haddr = vmf->address & HPAGE_PMD_MASK; |
| + struct vm_area_struct *vma = vmf->vma; |
| + struct folio *folio; |
| + pgtable_t pgtable; |
| + vm_fault_t ret = 0; |
| + |
| + folio = vma_alloc_anon_folio_pmd(vma, vmf->address); |
| + if (unlikely(!folio)) |
| + return VM_FAULT_FALLBACK; |
| + |
| + pgtable = pte_alloc_one(vma->vm_mm); |
| + if (unlikely(!pgtable)) { |
| + ret = VM_FAULT_OOM; |
| + goto release; |
| + } |
| |
| vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); |
| if (unlikely(!pmd_none(*vmf->pmd))) { |
| goto unlock_release; |
| } else { |
| - pmd_t entry; |
| - |
| ret = check_stable_address_space(vma->vm_mm); |
| if (ret) |
| goto unlock_release; |
| @@ -1193,21 +1227,11 @@ static vm_fault_t __do_huge_pmd_anonymou |
| VM_BUG_ON(ret & VM_FAULT_FALLBACK); |
| return ret; |
| } |
| - |
| - entry = mk_huge_pmd(page, vma->vm_page_prot); |
| - entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); |
| - folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE); |
| - folio_add_lru_vma(folio, vma); |
| pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); |
| - set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); |
| - update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); |
| - add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); |
| + map_anon_folio_pmd(folio, vmf->pmd, vma, haddr); |
| mm_inc_nr_ptes(vma->vm_mm); |
| deferred_split_folio(folio, false); |
| spin_unlock(vmf->ptl); |
| - count_vm_event(THP_FAULT_ALLOC); |
| - count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC); |
| - count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC); |
| } |
| |
| return 0; |
| @@ -1274,8 +1298,6 @@ static void set_huge_zero_folio(pgtable_ |
| vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) |
| { |
| struct vm_area_struct *vma = vmf->vma; |
| - gfp_t gfp; |
| - struct folio *folio; |
| unsigned long haddr = vmf->address & HPAGE_PMD_MASK; |
| vm_fault_t ret; |
| |
| @@ -1326,14 +1348,8 @@ vm_fault_t do_huge_pmd_anonymous_page(st |
| } |
| return ret; |
| } |
| - gfp = vma_thp_gfp_mask(vma); |
| - folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, vma, haddr, true); |
| - if (unlikely(!folio)) { |
| - count_vm_event(THP_FAULT_FALLBACK); |
| - count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK); |
| - return VM_FAULT_FALLBACK; |
| - } |
| - return __do_huge_pmd_anonymous_page(vmf, &folio->page, gfp); |
| + |
| + return __do_huge_pmd_anonymous_page(vmf); |
| } |
| |
| static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, |
| _ |