From: Usama Arif <usamaarif642@gmail.com>
Subject: mm: split underutilized THPs
Date: Tue, 13 Aug 2024 13:02:48 +0100

This is an attempt to mitigate the issue of running out of memory when
THP is always enabled. At runtime, whenever a THP is faulted in
(__do_huge_pmd_anonymous_page) or collapsed by khugepaged
(collapse_huge_page), the THP is added to _deferred_list. Whenever
memory reclaim happens, the kernel runs the deferred_split shrinker,
which walks the _deferred_list.

If a folio on the list is partially mapped, the shrinker attempts to
split it. If the folio is fully mapped, the shrinker instead checks
whether the THP is underutilized, i.e. whether the number of zero-filled
base 4K pages in the THP exceeds a threshold (set via
/sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_none). If it
does, the shrinker attempts to split the THP. At remap time, the pages
that were zero-filled are then mapped to the shared zeropage, hence
saving memory.
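
As an illustrative example (the numbers here are mine, not from the
patch): on x86-64 with 4K base pages and 2M THPs (HPAGE_PMD_NR == 512),
setting max_ptes_none to 255 marks a THP as underutilized once more
than 255 of its 512 subpages are zero-filled; splitting it and
remapping those subpages to the shared zeropage then frees at least
256 * 4K = 1M per THP. With max_ptes_none at its default of 511
(HPAGE_PMD_NR - 1), thp_underutilized() below returns false
unconditionally, so fully-mapped THPs are never split on this path.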
Link: https://lkml.kernel.org/r/20240813120328.1275952-6-usamaarif642@gmail.com
Signed-off-by: Usama Arif <usamaarif642@gmail.com>
Suggested-by: Rik van Riel <riel@surriel.com>
Co-authored-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Alexander Zhu <alexlzhu@fb.com>
Cc: Barry Song <baohua@kernel.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Shuang Zhai <zhais@google.com>
Cc: Yu Zhao <yuzhao@google.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Huan Yang <link@vivo.com>
Cc: Kairui Song <ryncsn@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---

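Not part of the patch: a minimal userspace sketch of the scenario this
series targets, assuming x86-64 (2M PMD-sized THPs) and THP enabled in
"always" or "madvise" mode. The program faults in one THP but stores
data in only its first 4K; without the shrinker change, the remaining
511 zero-filled subpages stay resident until the mapping goes away:

/* thp-waste.c: fault in one PMD-sized THP, then use only its first 4K */
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#define THP_SIZE (2UL << 20)	/* PMD-sized THP on x86-64 */

int main(void)
{
	/* Over-map so an aligned 2M window is guaranteed to exist. */
	char *map = mmap(NULL, 2 * THP_SIZE, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	char *buf;

	if (map == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	buf = (char *)(((uintptr_t)map + THP_SIZE - 1) & ~(THP_SIZE - 1));
	madvise(buf, THP_SIZE, MADV_HUGEPAGE);
	memset(buf, 0xab, 4096);	/* faults in the whole 2M as one THP */
	/* Park here; watch AnonHugePages in /proc/meminfo under reclaim. */
	getchar();
	munmap(map, 2 * THP_SIZE);
	return 0;
}

With the patch applied, memory pressure lets the shrinker split such a
THP, and the zeropage remap step frees the zero-filled subpages.
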
 Documentation/admin-guide/mm/transhuge.rst |    6 +
 include/linux/khugepaged.h                 |    1 
 include/linux/vm_event_item.h              |    1 
 mm/huge_memory.c                           |   76 +++++++++++++++++--
 mm/khugepaged.c                            |    3 
 mm/vmstat.c                                |    1 
 6 files changed, 80 insertions(+), 8 deletions(-)

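Also not part of the patch: a sketch of the remap step mentioned in the
changelog. The real helper is added by a separate patch in this series;
the function name and calling convention here are illustrative. When an
isolated THP is split, a subpage found to be zero-filled can have its
PTE pointed at the shared zeropage instead of keeping its own physical
page:

static bool remap_zero_filled_subpage(struct vm_area_struct *vma,
				      unsigned long addr, pte_t *ptep,
				      struct page *page)
{
	void *kaddr = kmap_local_page(page);
	bool has_data = memchr_inv(kaddr, 0, PAGE_SIZE);
	pte_t newpte;

	kunmap_local(kaddr);
	if (has_data)
		return false;	/* the subpage holds data, keep it */

	/* Point the PTE at the shared zeropage, read-only and special. */
	newpte = pte_mkspecial(pfn_pte(my_zero_pfn(addr), vma->vm_page_prot));
	set_pte_at(vma->vm_mm, addr, ptep, newpte);
	return true;
}
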
--- a/Documentation/admin-guide/mm/transhuge.rst~mm-split-underutilized-thps
+++ a/Documentation/admin-guide/mm/transhuge.rst
@@ -447,6 +447,12 @@ thp_deferred_split_page
 	splitting it would free up some memory. Pages on split queue are
 	going to be split under memory pressure.
 
+thp_underutilized_split_page
+	is incremented when a huge page on the split queue was split
+	because it was underutilized. A THP is underutilized if the
+	number of zero pages in the THP is above a certain threshold
+	(/sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_none).
+
 thp_split_pmd
 	is incremented every time a PMD split into table of PTEs.
 	This can happen, for instance, when application calls mprotect() or
--- a/include/linux/khugepaged.h~mm-split-underutilized-thps
+++ a/include/linux/khugepaged.h
@@ -4,6 +4,7 @@
 
 #include <linux/sched/coredump.h> /* MMF_VM_HUGEPAGE */
 
+extern unsigned int khugepaged_max_ptes_none __read_mostly;
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 extern struct attribute_group khugepaged_attr_group;
 
--- a/include/linux/vm_event_item.h~mm-split-underutilized-thps
+++ a/include/linux/vm_event_item.h
@@ -105,6 +105,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PS
 		THP_SPLIT_PAGE,
 		THP_SPLIT_PAGE_FAILED,
 		THP_DEFERRED_SPLIT_PAGE,
+		THP_UNDERUTILIZED_SPLIT_PAGE,
 		THP_SPLIT_PMD,
 		THP_SCAN_EXCEED_NONE_PTE,
 		THP_SCAN_EXCEED_SWAP_PTE,
--- a/mm/huge_memory.c~mm-split-underutilized-thps
+++ a/mm/huge_memory.c
@@ -1087,6 +1087,7 @@ static vm_fault_t __do_huge_pmd_anonymou
 		update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
 		add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
 		mm_inc_nr_ptes(vma->vm_mm);
+		deferred_split_folio(folio, false);
 		spin_unlock(vmf->ptl);
 		count_vm_event(THP_FAULT_ALLOC);
 		count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC);
@@ -3517,6 +3518,39 @@ static unsigned long deferred_split_coun
 	return READ_ONCE(ds_queue->split_queue_len);
 }
 
+static bool thp_underutilized(struct folio *folio)
+{
+	int num_zero_pages = 0, num_filled_pages = 0;
+	void *kaddr;
+	int i;
+
+	if (khugepaged_max_ptes_none == HPAGE_PMD_NR - 1)
+		return false;
+
+	for (i = 0; i < folio_nr_pages(folio); i++) {
+		kaddr = kmap_local_folio(folio, i * PAGE_SIZE);
+		if (!memchr_inv(kaddr, 0, PAGE_SIZE)) {
+			num_zero_pages++;
+			if (num_zero_pages > khugepaged_max_ptes_none) {
+				kunmap_local(kaddr);
+				return true;
+			}
+		} else {
| + /* |
| + * Another path for early exit once the number |
| + * of non-zero filled pages exceeds threshold. |
| + */ |
+			num_filled_pages++;
+			if (num_filled_pages >= HPAGE_PMD_NR - khugepaged_max_ptes_none) {
+				kunmap_local(kaddr);
+				return false;
+			}
+		}
+		kunmap_local(kaddr);
+	}
+	return false;
+}
+
 static unsigned long deferred_split_scan(struct shrinker *shrink,
 					 struct shrink_control *sc)
 {
@@ -3550,17 +3584,45 @@ static unsigned long deferred_split_scan
 	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
 
 	list_for_each_entry_safe(folio, next, &list, _deferred_list) {
+		bool did_split = false;
+		bool underutilized = false;
+
+		if (folio_test_partially_mapped(folio))
+			goto split;
+		underutilized = thp_underutilized(folio);
+		if (underutilized)
+			goto split;
+		continue;
+split:
 		if (!folio_trylock(folio))
-			goto next;
-		/* split_huge_page() removes page from list on success */
-		if (!split_folio(folio))
-			split++;
+			continue;
+		did_split = !split_folio(folio);
 		folio_unlock(folio);
-next:
-		folio_put(folio);
+		if (did_split) {
+			/* Splitting removed folio from the list, drop reference here */
+			folio_put(folio);
+			if (underutilized)
+				count_vm_event(THP_UNDERUTILIZED_SPLIT_PAGE);
+			split++;
+		}
 	}
+
 	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
-	list_splice_tail(&list, &ds_queue->split_queue);
+	/*
+	 * Only add a folio back to the queue if it is partially mapped.
+	 * If thp_underutilized() returned false, or if split_folio() failed
+	 * on an underutilized folio, consider the folio used and don't
+	 * add it back to split_queue.
+	 */
+	list_for_each_entry_safe(folio, next, &list, _deferred_list) {
+		if (folio_test_partially_mapped(folio)) {
+			list_move(&folio->_deferred_list, &ds_queue->split_queue);
+		} else {
+			list_del_init(&folio->_deferred_list);
+			ds_queue->split_queue_len--;
+		}
+		folio_put(folio);
+	}
 	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
 
 	/*
--- a/mm/khugepaged.c~mm-split-underutilized-thps
+++ a/mm/khugepaged.c
@@ -85,7 +85,7 @@ static DECLARE_WAIT_QUEUE_HEAD(khugepage
  *
  * Note that these are only respected if collapse was initiated by khugepaged.
  */
-static unsigned int khugepaged_max_ptes_none __read_mostly;
+unsigned int khugepaged_max_ptes_none __read_mostly;
 static unsigned int khugepaged_max_ptes_swap __read_mostly;
 static unsigned int khugepaged_max_ptes_shared __read_mostly;
 
@@ -1235,6 +1235,7 @@ static int collapse_huge_page(struct mm_
 	pgtable_trans_huge_deposit(mm, pmd, pgtable);
 	set_pmd_at(mm, address, pmd, _pmd);
 	update_mmu_cache_pmd(vma, address, pmd);
+	deferred_split_folio(folio, false);
 	spin_unlock(pmd_ptl);
 
 	folio = NULL;
--- a/mm/vmstat.c~mm-split-underutilized-thps
+++ a/mm/vmstat.c
@@ -1385,6 +1385,7 @@ const char * const vmstat_text[] = {
 	"thp_split_page",
 	"thp_split_page_failed",
 	"thp_deferred_split_page",
+	"thp_underutilized_split_page",
 	"thp_split_pmd",
 	"thp_scan_exceed_none_pte",
 	"thp_scan_exceed_swap_pte",
_