| From: Yin Fengwei <fengwei.yin@intel.com> |
| Subject: mm: handle large folio when large folio in VM_LOCKED VMA range |
| Date: Mon, 18 Sep 2023 15:33:17 +0800 |
| |
| If a large folio is in the range of a VM_LOCKED VMA, it should be mlocked |
| to avoid being picked by page reclaim, which may split the large folio |
| and then mlock each page again. |
| |
| Mlock this kind of large folio to prevent it from being picked by page |
| reclaim. |
| |
| A large folio which crosses the boundary of a VM_LOCKED VMA, or which is |
| not fully mapped to a VM_LOCKED VMA, had better not be mlocked.  Then, if |
| the system is under memory pressure, this kind of large folio will be |
| split and the pages outside the VM_LOCKED VMA can be reclaimed. |
| |
| Ideally, a large folio should be mlocked when it is fully mapped to the |
| VMA and munlocked if any page is unmapped from the VMA.  But it's not |
| easy to detect whether a large folio is fully mapped to the VMA in some |
| cases (like rmap add/remove).  So update mlock_vma_folio() and |
| munlock_vma_folio() to mlock/munlock the folio purely according to |
| vma->vm_flags, and let the callers decide whether they should call these |
| two functions. |
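| |
| A condensed sketch (not part of the patch) of the simplified helpers as |
| they appear in the mm/internal.h hunk below; the decision is now based |
| only on vma->vm_flags: |
| |
|     static inline void mlock_vma_folio(struct folio *folio, |
|                                        struct vm_area_struct *vma) |
|     { |
|         /* mlock only when VM_LOCKED is set without any VM_SPECIAL bit */ |
|         if (unlikely((vma->vm_flags & (VM_LOCKED|VM_SPECIAL)) == VM_LOCKED)) |
|             mlock_folio(folio); |
|     } |
| |
|     static inline void munlock_vma_folio(struct folio *folio, |
|                                          struct vm_area_struct *vma) |
|     { |
|         /* always munlock; page reclaim re-mlocks a still fully mapped folio */ |
|         if (unlikely(vma->vm_flags & VM_LOCKED)) |
|             munlock_folio(folio); |
|     } |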
| |
| For rmap add, only mlock a normal 4K folio and postpone large folio |
| handling to the page reclaim phase, where the page table iterator can be |
| reused to detect whether the folio is fully mapped.  For rmap remove, |
| invoke munlock_vma_folio() to munlock the folio unconditionally, because |
| removing an rmap leaves the folio not fully mapped to the VMA. |
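| |
| A condensed sketch (not part of the patch) of the fully-mapped check that |
| the mm/rmap.c hunk below adds to folio_referenced_one(), run after the |
| rmap walk; ptes is the number of the folio's PTE mappings found by the |
| walk and start is the address at which the walk began: |
| |
|     if ((vma->vm_flags & VM_LOCKED) && folio_test_large(folio) && |
|         folio_within_vma(folio, vma)) { |
|         unsigned long s_align = ALIGN_DOWN(start, PMD_SIZE); |
|         unsigned long e_align = ALIGN_DOWN(start + folio_size(folio) - 1, |
|                                            PMD_SIZE); |
| |
|         /* |
|          * The folio sits within a single page table (one PMD-sized |
|          * block) and every one of its pages was found mapped, so it |
|          * is fully mapped to this VM_LOCKED VMA: restore the mlock |
|          * which got missed. |
|          */ |
|         if (s_align == e_align && ptes == folio_nr_pages(folio)) |
|             mlock_vma_folio(folio, vma); |
|     } |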
| |
| Link: https://lkml.kernel.org/r/20230918073318.1181104-3-fengwei.yin@intel.com |
| Signed-off-by: Yin Fengwei <fengwei.yin@intel.com> |
| Cc: David Hildenbrand <david@redhat.com> |
| Cc: Hugh Dickins <hughd@google.com> |
| Cc: Matthew Wilcox (Oracle) <willy@infradead.org> |
| Cc: Ryan Roberts <ryan.roberts@arm.com> |
| Cc: Yang Shi <shy828301@gmail.com> |
| Cc: Yosry Ahmed <yosryahmed@google.com> |
| Cc: Yu Zhao <yuzhao@google.com> |
| Signed-off-by: Andrew Morton <akpm@linux-foundation.org> |
| --- |
| |
| mm/internal.h | 23 +++++++++------- |
| mm/rmap.c | 66 ++++++++++++++++++++++++++++++++++++++++-------- |
| 2 files changed, 68 insertions(+), 21 deletions(-) |
| |
| --- a/mm/internal.h~mm-handle-large-folio-when-large-folio-in-vm_locked-vma-range |
| +++ a/mm/internal.h |
| @@ -644,14 +644,10 @@ folio_within_vma(struct folio *folio, st |
| * mlock is usually called at the end of page_add_*_rmap(), munlock at |
| * the end of page_remove_rmap(); but new anon folios are managed by |
| * folio_add_lru_vma() calling mlock_new_folio(). |
| - * |
| - * @compound is used to include pmd mappings of THPs, but filter out |
| - * pte mappings of THPs, which cannot be consistently counted: a pte |
| - * mapping of the THP head cannot be distinguished by the page alone. |
| */ |
| void mlock_folio(struct folio *folio); |
| static inline void mlock_vma_folio(struct folio *folio, |
| - struct vm_area_struct *vma, bool compound) |
| + struct vm_area_struct *vma) |
| { |
| /* |
| * The VM_SPECIAL check here serves two purposes. |
| @@ -661,17 +657,24 @@ static inline void mlock_vma_folio(struc |
| * file->f_op->mmap() is using vm_insert_page(s), when VM_LOCKED may |
| * still be set while VM_SPECIAL bits are added: so ignore it then. |
| */ |
| - if (unlikely((vma->vm_flags & (VM_LOCKED|VM_SPECIAL)) == VM_LOCKED) && |
| - (compound || !folio_test_large(folio))) |
| + if (unlikely((vma->vm_flags & (VM_LOCKED|VM_SPECIAL)) == VM_LOCKED)) |
| mlock_folio(folio); |
| } |
| |
| void munlock_folio(struct folio *folio); |
| static inline void munlock_vma_folio(struct folio *folio, |
| - struct vm_area_struct *vma, bool compound) |
| + struct vm_area_struct *vma) |
| { |
| - if (unlikely(vma->vm_flags & VM_LOCKED) && |
| - (compound || !folio_test_large(folio))) |
| + /* |
| + * Always munlock when this function is called.  Ideally, we |
| + * should only munlock if some page of the folio was unmapped |
| + * from the VMA, leaving the folio not fully mapped to it. |
| + * |
| + * But it's not easy to confirm that's the situation.  So we |
| + * always munlock the folio here and let page reclaim correct |
| + * it if that turns out to be wrong. |
| + */ |
| + if (unlikely(vma->vm_flags & VM_LOCKED)) |
| munlock_folio(folio); |
| } |
| |
| --- a/mm/rmap.c~mm-handle-large-folio-when-large-folio-in-vm_locked-vma-range |
| +++ a/mm/rmap.c |
| @@ -798,6 +798,7 @@ struct folio_referenced_arg { |
| unsigned long vm_flags; |
| struct mem_cgroup *memcg; |
| }; |
| + |
| /* |
| * arg: folio_referenced_arg will be passed |
| */ |
| @@ -807,17 +808,33 @@ static bool folio_referenced_one(struct |
| struct folio_referenced_arg *pra = arg; |
| DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0); |
| int referenced = 0; |
| + unsigned long start = address, ptes = 0; |
| |
| while (page_vma_mapped_walk(&pvmw)) { |
| address = pvmw.address; |
| |
| - if ((vma->vm_flags & VM_LOCKED) && |
| - (!folio_test_large(folio) || !pvmw.pte)) { |
| - /* Restore the mlock which got missed */ |
| - mlock_vma_folio(folio, vma, !pvmw.pte); |
| - page_vma_mapped_walk_done(&pvmw); |
| - pra->vm_flags |= VM_LOCKED; |
| - return false; /* To break the loop */ |
| + if (vma->vm_flags & VM_LOCKED) { |
| + if (!folio_test_large(folio) || !pvmw.pte) { |
| + /* Restore the mlock which got missed */ |
| + mlock_vma_folio(folio, vma); |
| + page_vma_mapped_walk_done(&pvmw); |
| + pra->vm_flags |= VM_LOCKED; |
| + return false; /* To break the loop */ |
| + } |
| + /* |
| + * A large folio fully mapped to the VMA is |
| + * handled after the pvmw loop. |
| + * |
| + * A large folio crossing VMA boundaries is |
| + * expected to be picked by page reclaim, but |
| + * the references of its pages inside the |
| + * VM_LOCKED VMA range should be skipped, so |
| + * that page reclaim only counts references |
| + * of the pages outside that range. |
| + */ |
| + ptes++; |
| + pra->mapcount--; |
| + continue; |
| } |
| |
| if (pvmw.pte) { |
| @@ -842,6 +859,23 @@ static bool folio_referenced_one(struct |
| pra->mapcount--; |
| } |
| |
| + if ((vma->vm_flags & VM_LOCKED) && |
| + folio_test_large(folio) && |
| + folio_within_vma(folio, vma)) { |
| + unsigned long s_align, e_align; |
| + |
| + s_align = ALIGN_DOWN(start, PMD_SIZE); |
| + e_align = ALIGN_DOWN(start + folio_size(folio) - 1, PMD_SIZE); |
| + |
| + /* folio doesn't cross page table boundary and is fully mapped */ |
| + if ((s_align == e_align) && (ptes == folio_nr_pages(folio))) { |
| + /* Restore the mlock which got missed */ |
| + mlock_vma_folio(folio, vma); |
| + pra->vm_flags |= VM_LOCKED; |
| + return false; /* To break the loop */ |
| + } |
| + } |
| + |
| if (referenced) |
| folio_clear_idle(folio); |
| if (folio_test_clear_young(folio)) |
| @@ -1254,7 +1288,14 @@ void page_add_anon_rmap(struct page *pag |
| (folio_test_large(folio) && folio_entire_mapcount(folio) > 1)) && |
| PageAnonExclusive(page), folio); |
| |
| - mlock_vma_folio(folio, vma, compound); |
| + /* |
| + * For a large folio, only mlock it if it's fully mapped to the VMA. |
| + * It's not easy to check here whether the large folio is fully |
| + * mapped to the VMA, so only mlock a normal 4K folio and leave the |
| + * large folio to be handled by page reclaim. |
| + */ |
| + if (!folio_test_large(folio)) |
| + mlock_vma_folio(folio, vma); |
| } |
| |
| /** |
| @@ -1354,7 +1395,9 @@ void folio_add_file_rmap_range(struct fo |
| if (nr) |
| __lruvec_stat_mod_folio(folio, NR_FILE_MAPPED, nr); |
| |
| - mlock_vma_folio(folio, vma, compound); |
| + /* See comments in page_add_anon_rmap() */ |
| + if (!folio_test_large(folio)) |
| + mlock_vma_folio(folio, vma); |
| } |
| |
| /** |
| @@ -1465,7 +1508,7 @@ void page_remove_rmap(struct page *page, |
| * it's only reliable while mapped. |
| */ |
| |
| - munlock_vma_folio(folio, vma, compound); |
| + munlock_vma_folio(folio, vma); |
| } |
| |
| /* |
| @@ -1530,7 +1573,8 @@ static bool try_to_unmap_one(struct foli |
| if (!(flags & TTU_IGNORE_MLOCK) && |
| (vma->vm_flags & VM_LOCKED)) { |
| /* Restore the mlock which got missed */ |
| - mlock_vma_folio(folio, vma, false); |
| + if (!folio_test_large(folio)) |
| + mlock_vma_folio(folio, vma); |
| page_vma_mapped_walk_done(&pvmw); |
| ret = false; |
| break; |
| _ |