| From: David Hildenbrand <david@redhat.com> |
| Subject: mm: support GUP-triggered unsharing of anonymous pages |
| |
| Whenever GUP currently ends up taking a R/O pin on an anonymous page that |
| might be shared -- mapped R/O and !PageAnonExclusive() -- any write fault |
| on the page table entry will end up replacing the mapped anonymous page |
| due to COW, resulting in the GUP pin no longer being consistent with the |
| page actually mapped into the page table. |
| |
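| As a rough illustration of the problem, consider an in-kernel GUP user (a |
| minimal sketch, assuming the pin_user_pages() signature at the time of |
| this patch; error handling simplified and the later write access only |
| hinted at in comments): |
| |
|   /* Hedged sketch: how a R/O pin can become inconsistent after COW. */ |
|   static void ro_pin_sketch(unsigned long addr) |
|   { |
|           struct page *page; |
|           long ret; |
| |
|           /* |
|            * 1) Take a R/O pin on an anonymous page that is mapped R/O |
|            *    and !PageAnonExclusive(), e.g., right after fork(). |
|            */ |
|           mmap_read_lock(current->mm); |
|           ret = pin_user_pages(addr, 1, 0 /* no FOLL_WRITE */, &page, NULL); |
|           mmap_read_unlock(current->mm); |
|           if (ret != 1) |
|                   return; |
| |
|           /* |
|            * 2) A later write fault on addr breaks COW and maps a fresh |
|            *    page, while "page" still references the old one: the pin |
|            *    and the page table no longer agree. |
|            */ |
|           unpin_user_page(page); |
|   } |
| |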
| The possible ways to deal with this situation are: |
| (1) Ignore and pin -- what we do right now. |
| (2) Fail to pin -- which would be rather surprising to callers and |
| could break user space. |
| (3) Trigger unsharing and pin the now exclusive page -- reliable R/O |
| pins. |
| |
| We want to implement 3) because it provides the clearest semantics and |
| allows for checking in unpin_user_pages() and friends for possible BUGs: |
| when trying to unpin a page that's no longer exclusive, clearly something |
| went very wrong and could result in memory corruption that is hard to |
| debug. So we'd better have a nice way to spot such issues. |
| |
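| For example, such a check in the unpin path could look like the following |
| minimal sketch (not part of this patch; compound-page details ignored): |
| |
|   /* |
|    * Hypothetical debug check: a pinned anonymous page is expected to |
|    * still be exclusive by the time it gets unpinned. |
|    */ |
|   static void sanity_check_pinned_page(struct page *page) |
|   { |
|           if (PageAnon(page)) |
|                   VM_BUG_ON_PAGE(!PageAnonExclusive(page), page); |
|   } |
| |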
| To implement 3), we need a way for GUP to trigger unsharing: |
| FAULT_FLAG_UNSHARE. FAULT_FLAG_UNSHARE is only applicable to R/O mapped |
| anonymous pages and resembles COW logic during a write fault. However, in |
| contrast to a write fault, GUP-triggered unsharing will, for example, |
| still maintain the write protection. |
| |
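| The GUP side is wired up separately; conceptually, deciding whether a R/O |
| pin has to trigger an unshare fault boils down to something like the |
| following (illustrative helper name, not introduced by this patch): |
| |
|   /* Would taking a R/O pin on this page require unsharing first? */ |
|   static inline bool ro_pin_must_unshare(unsigned int gup_flags, |
|                                          struct page *page) |
|   { |
|           if (!(gup_flags & FOLL_PIN)) |
|                   return false;   /* only pins get the reliability guarantee */ |
|           if (gup_flags & FOLL_WRITE) |
|                   return false;   /* write pins go through the COW path */ |
|           if (!PageAnon(page)) |
|                   return false;   /* only anonymous pages are COW-shared */ |
|           return !PageAnonExclusive(page); |
|   } |
| |
| A caller seeing "true" would drop its locks, call handle_mm_fault() with |
| FAULT_FLAG_UNSHARE and then retry the page table lookup. |
| |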
| Let's implement FAULT_FLAG_UNSHARE by hooking into the existing write |
| fault handlers for all applicable anonymous page types: ordinary pages, |
| THP and hugetlb. |
| |
| * If FAULT_FLAG_UNSHARE finds a R/O-mapped anonymous page that has been |
| marked exclusive in the meantime by someone else, there is nothing to do. |
| * If FAULT_FLAG_UNSHARE finds a R/O-mapped anonymous page that's not |
| marked exclusive, it will try detecting if the process is the exclusive |
| owner. If exclusive, it can be set exclusive similar to reuse logic |
| during write faults via page_move_anon_rmap() and there is nothing |
| else to do; otherwise, we either have to copy and map a fresh, |
| anonymous exclusive page R/O (ordinary pages, hugetlb), or split the |
| THP; a condensed sketch of this decision flow follows. |
| |
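| Condensed, the PTE-level decision looks roughly like the hypothetical |
| helper below; the real code in do_wp_page() additionally drains the LRU |
| pagevecs, takes the page lock and uses a more careful reference-count |
| check, and the THP case splits the huge page instead of copying: |
| |
|   /* Hypothetical, condensed sketch of the FAULT_FLAG_UNSHARE logic. */ |
|   static vm_fault_t unshare_anon_page_sketch(struct vm_fault *vmf, |
|                                              struct page *page) |
|   { |
|           if (PageAnonExclusive(page)) |
|                   return 0;       /* already exclusive: nothing to do */ |
| |
|           if (page_count(page) == 1) { |
|                   /* Exclusive owner: mark it exclusive, keep it R/O. */ |
|                   page_move_anon_rmap(page, vmf->vma); |
|                   return 0; |
|           } |
| |
|           /* Possibly shared: copy and map a fresh exclusive page R/O. */ |
|           return wp_page_copy(vmf); |
|   } |
| |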
| This commit is heavily based on patches by Andrea. |
| |
| Link: https://lkml.kernel.org/r/20220428083441.37290-16-david@redhat.com |
| Signed-off-by: Andrea Arcangeli <aarcange@redhat.com> |
| Signed-off-by: David Hildenbrand <david@redhat.com> |
| Acked-by: Vlastimil Babka <vbabka@suse.cz> |
| Co-developed-by: Andrea Arcangeli <aarcange@redhat.com> |
| Cc: Christoph Hellwig <hch@lst.de> |
| Cc: David Rientjes <rientjes@google.com> |
| Cc: Don Dutile <ddutile@redhat.com> |
| Cc: Hugh Dickins <hughd@google.com> |
| Cc: Jan Kara <jack@suse.cz> |
| Cc: Jann Horn <jannh@google.com> |
| Cc: Jason Gunthorpe <jgg@nvidia.com> |
| Cc: John Hubbard <jhubbard@nvidia.com> |
| Cc: Khalid Aziz <khalid.aziz@oracle.com> |
| Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com> |
| Cc: Liang Zhang <zhangliang5@huawei.com> |
| Cc: "Matthew Wilcox (Oracle)" <willy@infradead.org> |
| Cc: Michal Hocko <mhocko@kernel.org> |
| Cc: Mike Kravetz <mike.kravetz@oracle.com> |
| Cc: Mike Rapoport <rppt@linux.ibm.com> |
| Cc: Nadav Amit <namit@vmware.com> |
| Cc: Oded Gabbay <oded.gabbay@gmail.com> |
| Cc: Oleg Nesterov <oleg@redhat.com> |
| Cc: Pedro Demarchi Gomes <pedrodemargomes@gmail.com> |
| Cc: Peter Xu <peterx@redhat.com> |
| Cc: Rik van Riel <riel@surriel.com> |
| Cc: Roman Gushchin <guro@fb.com> |
| Cc: Shakeel Butt <shakeelb@google.com> |
| Cc: Yang Shi <shy828301@gmail.com> |
| Signed-off-by: Andrew Morton <akpm@linux-foundation.org> |
| --- |
| |
| include/linux/mm_types.h | 8 ++ |
| mm/huge_memory.c | 10 +++ |
| mm/hugetlb.c | 56 ++++++++++++------- |
| mm/memory.c | 107 +++++++++++++++++++++++++------------ |
| 4 files changed, 126 insertions(+), 55 deletions(-) |
| |
| --- a/include/linux/mm_types.h~mm-support-gup-triggered-unsharing-of-anonymous-pages |
| +++ a/include/linux/mm_types.h |
| @@ -819,6 +819,9 @@ typedef struct { |
| * @FAULT_FLAG_REMOTE: The fault is not for current task/mm. |
| * @FAULT_FLAG_INSTRUCTION: The fault was during an instruction fetch. |
| * @FAULT_FLAG_INTERRUPTIBLE: The fault can be interrupted by non-fatal signals. |
| + * @FAULT_FLAG_UNSHARE: The fault is an unsharing request to unshare (and mark |
| + * exclusive) a possibly shared anonymous page that is |
| + * mapped R/O. |
| * |
| * About @FAULT_FLAG_ALLOW_RETRY and @FAULT_FLAG_TRIED: we can specify |
| * whether we would allow page faults to retry by specifying these two |
| @@ -838,6 +841,10 @@ typedef struct { |
| * continuous faults with flags (b). We should always try to detect pending |
| * signals before a retry to make sure the continuous page faults can still be |
| * interrupted if necessary. |
| + * |
| + * The combination FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE is illegal. |
| + * FAULT_FLAG_UNSHARE is ignored and treated like an ordinary read fault when |
| + * no existing R/O-mapped anonymous page is encountered. |
| */ |
| enum fault_flag { |
| FAULT_FLAG_WRITE = 1 << 0, |
| @@ -850,6 +857,7 @@ enum fault_flag { |
| FAULT_FLAG_REMOTE = 1 << 7, |
| FAULT_FLAG_INSTRUCTION = 1 << 8, |
| FAULT_FLAG_INTERRUPTIBLE = 1 << 9, |
| + FAULT_FLAG_UNSHARE = 1 << 10, |
| }; |
| |
| #endif /* _LINUX_MM_TYPES_H */ |
| --- a/mm/huge_memory.c~mm-support-gup-triggered-unsharing-of-anonymous-pages |
| +++ a/mm/huge_memory.c |
| @@ -1271,6 +1271,7 @@ unlock: |
| |
| vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf) |
| { |
| + const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE; |
| struct vm_area_struct *vma = vmf->vma; |
| struct page *page; |
| unsigned long haddr = vmf->address & HPAGE_PMD_MASK; |
| @@ -1279,6 +1280,9 @@ vm_fault_t do_huge_pmd_wp_page(struct vm |
| vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd); |
| VM_BUG_ON_VMA(!vma->anon_vma, vma); |
| |
| + VM_BUG_ON(unshare && (vmf->flags & FAULT_FLAG_WRITE)); |
| + VM_BUG_ON(!unshare && !(vmf->flags & FAULT_FLAG_WRITE)); |
| + |
| if (is_huge_zero_pmd(orig_pmd)) |
| goto fallback; |
| |
| @@ -1317,7 +1321,7 @@ vm_fault_t do_huge_pmd_wp_page(struct vm |
| } |
| |
| /* |
| - * See do_wp_page(): we can only map the page writable if there are |
| + * See do_wp_page(): we can only reuse the page exclusively if there are |
| * no additional references. Note that we always drain the LRU |
| * pagevecs immediately after adding a THP. |
| */ |
| @@ -1331,6 +1335,10 @@ vm_fault_t do_huge_pmd_wp_page(struct vm |
| page_move_anon_rmap(page, vma); |
| unlock_page(page); |
| reuse: |
| + if (unlikely(unshare)) { |
| + spin_unlock(vmf->ptl); |
| + return 0; |
| + } |
| entry = pmd_mkyoung(orig_pmd); |
| entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); |
| if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1)) |
| --- a/mm/hugetlb.c~mm-support-gup-triggered-unsharing-of-anonymous-pages |
| +++ a/mm/hugetlb.c |
| @@ -5162,15 +5162,16 @@ static void unmap_ref_private(struct mm_ |
| } |
| |
| /* |
| - * Hugetlb_cow() should be called with page lock of the original hugepage held. |
| + * hugetlb_wp() should be called with page lock of the original hugepage held. |
| * Called with hugetlb_fault_mutex_table held and pte_page locked so we |
| * cannot race with other handlers or page migration. |
| * Keep the pte_same checks anyway to make transition from the mutex easier. |
| */ |
| -static vm_fault_t hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, |
| - unsigned long address, pte_t *ptep, |
| +static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, |
| + unsigned long address, pte_t *ptep, unsigned int flags, |
| struct page *pagecache_page, spinlock_t *ptl) |
| { |
| + const bool unshare = flags & FAULT_FLAG_UNSHARE; |
| pte_t pte; |
| struct hstate *h = hstate_vma(vma); |
| struct page *old_page, *new_page; |
| @@ -5179,15 +5180,22 @@ static vm_fault_t hugetlb_cow(struct mm_ |
| unsigned long haddr = address & huge_page_mask(h); |
| struct mmu_notifier_range range; |
| |
| + VM_BUG_ON(unshare && (flags & FAULT_FLAG_WRITE)); |
| + VM_BUG_ON(!unshare && !(flags & FAULT_FLAG_WRITE)); |
| + |
| pte = huge_ptep_get(ptep); |
| old_page = pte_page(pte); |
| |
| retry_avoidcopy: |
| - /* If no-one else is actually using this page, avoid the copy |
| - * and just make the page writable */ |
| + /* |
| + * If no-one else is actually using this page, we're the exclusive |
| + * owner and can reuse this page. |
| + */ |
| if (page_mapcount(old_page) == 1 && PageAnon(old_page)) { |
| - page_move_anon_rmap(old_page, vma); |
| - set_huge_ptep_writable(vma, haddr, ptep); |
| + if (!PageAnonExclusive(old_page)) |
| + page_move_anon_rmap(old_page, vma); |
| + if (likely(!unshare)) |
| + set_huge_ptep_writable(vma, haddr, ptep); |
| return 0; |
| } |
| VM_BUG_ON_PAGE(PageAnon(old_page) && PageAnonExclusive(old_page), |
| @@ -5290,13 +5298,13 @@ retry_avoidcopy: |
| if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) { |
| ClearHPageRestoreReserve(new_page); |
| |
| - /* Break COW */ |
| + /* Break COW or unshare */ |
| huge_ptep_clear_flush(vma, haddr, ptep); |
| mmu_notifier_invalidate_range(mm, range.start, range.end); |
| page_remove_rmap(old_page, vma, true); |
| hugepage_add_new_anon_rmap(new_page, vma, haddr); |
| set_huge_pte_at(mm, haddr, ptep, |
| - make_huge_pte(vma, new_page, 1)); |
| + make_huge_pte(vma, new_page, !unshare)); |
| SetHPageMigratable(new_page); |
| /* Make the old page be freed below */ |
| new_page = old_page; |
| @@ -5304,7 +5312,10 @@ retry_avoidcopy: |
| spin_unlock(ptl); |
| mmu_notifier_invalidate_range_end(&range); |
| out_release_all: |
| - /* No restore in case of successful pagetable update (Break COW) */ |
| + /* |
| + * No restore in case of successful pagetable update (Break COW or |
| + * unshare) |
| + */ |
| if (new_page != old_page) |
| restore_reserve_on_error(h, vma, haddr, new_page); |
| put_page(new_page); |
| @@ -5429,7 +5440,8 @@ static vm_fault_t hugetlb_no_page(struct |
| /* |
| * Currently, we are forced to kill the process in the event the |
| * original mapper has unmapped pages from the child due to a failed |
| - * COW. Warn that such a situation has occurred as it may not be obvious |
| + * COW/unsharing. Warn that such a situation has occurred as it may not |
| + * be obvious. |
| */ |
| if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) { |
| pr_warn_ratelimited("PID %d killed due to inadequate hugepage pool\n", |
| @@ -5555,7 +5567,7 @@ retry: |
| hugetlb_count_add(pages_per_huge_page(h), mm); |
| if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { |
| /* Optimization, do the COW without a second fault */ |
| - ret = hugetlb_cow(mm, vma, address, ptep, page, ptl); |
| + ret = hugetlb_wp(mm, vma, address, ptep, flags, page, ptl); |
| } |
| |
| spin_unlock(ptl); |
| @@ -5685,14 +5697,15 @@ vm_fault_t hugetlb_fault(struct mm_struc |
| goto out_mutex; |
| |
| /* |
| - * If we are going to COW the mapping later, we examine the pending |
| - * reservations for this page now. This will ensure that any |
| + * If we are going to COW/unshare the mapping later, we examine the |
| + * pending reservations for this page now. This will ensure that any |
| * allocations necessary to record that reservation occur outside the |
| * spinlock. For private mappings, we also lookup the pagecache |
| * page now as it is used to determine if a reservation has been |
| * consumed. |
| */ |
| - if ((flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) { |
| + if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) && |
| + !huge_pte_write(entry)) { |
| if (vma_needs_reservation(h, vma, haddr) < 0) { |
| ret = VM_FAULT_OOM; |
| goto out_mutex; |
| @@ -5707,12 +5720,12 @@ vm_fault_t hugetlb_fault(struct mm_struc |
| |
| ptl = huge_pte_lock(h, mm, ptep); |
| |
| - /* Check for a racing update before calling hugetlb_cow */ |
| + /* Check for a racing update before calling hugetlb_wp() */ |
| if (unlikely(!pte_same(entry, huge_ptep_get(ptep)))) |
| goto out_ptl; |
| |
| /* |
| - * hugetlb_cow() requires page locks of pte_page(entry) and |
| + * hugetlb_wp() requires page locks of pte_page(entry) and |
| * pagecache_page, so here we need take the former one |
| * when page != pagecache_page or !pagecache_page. |
| */ |
| @@ -5725,13 +5738,14 @@ vm_fault_t hugetlb_fault(struct mm_struc |
| |
| get_page(page); |
| |
| - if (flags & FAULT_FLAG_WRITE) { |
| + if (flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) { |
| if (!huge_pte_write(entry)) { |
| - ret = hugetlb_cow(mm, vma, address, ptep, |
| - pagecache_page, ptl); |
| + ret = hugetlb_wp(mm, vma, address, ptep, flags, |
| + pagecache_page, ptl); |
| goto out_put_page; |
| + } else if (likely(flags & FAULT_FLAG_WRITE)) { |
| + entry = huge_pte_mkdirty(entry); |
| } |
| - entry = huge_pte_mkdirty(entry); |
| } |
| entry = pte_mkyoung(entry); |
| if (huge_ptep_set_access_flags(vma, haddr, ptep, entry, |
| --- a/mm/memory.c~mm-support-gup-triggered-unsharing-of-anonymous-pages |
| +++ a/mm/memory.c |
| @@ -2745,8 +2745,8 @@ static inline int pte_unmap_same(struct |
| return same; |
| } |
| |
| -static inline bool cow_user_page(struct page *dst, struct page *src, |
| - struct vm_fault *vmf) |
| +static inline bool __wp_page_copy_user(struct page *dst, struct page *src, |
| + struct vm_fault *vmf) |
| { |
| bool ret; |
| void *kaddr; |
| @@ -2954,6 +2954,7 @@ static inline void wp_page_reuse(struct |
| struct page *page = vmf->page; |
| pte_t entry; |
| |
| + VM_BUG_ON(!(vmf->flags & FAULT_FLAG_WRITE)); |
| VM_BUG_ON(PageAnon(page) && !PageAnonExclusive(page)); |
| |
| /* |
| @@ -2974,7 +2975,8 @@ static inline void wp_page_reuse(struct |
| } |
| |
| /* |
| - * Handle the case of a page which we actually need to copy to a new page. |
| + * Handle the case of a page which we actually need to copy to a new page, |
| + * either due to COW or unsharing. |
| * |
| * Called with mmap_lock locked and the old page referenced, but |
| * without the ptl held. |
| @@ -2991,6 +2993,7 @@ static inline void wp_page_reuse(struct |
| */ |
| static vm_fault_t wp_page_copy(struct vm_fault *vmf) |
| { |
| + const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE; |
| struct vm_area_struct *vma = vmf->vma; |
| struct mm_struct *mm = vma->vm_mm; |
| struct page *old_page = vmf->page; |
| @@ -3013,7 +3016,7 @@ static vm_fault_t wp_page_copy(struct vm |
| if (!new_page) |
| goto oom; |
| |
| - if (!cow_user_page(new_page, old_page, vmf)) { |
| + if (!__wp_page_copy_user(new_page, old_page, vmf)) { |
| /* |
| * COW failed, if the fault was solved by other, |
| * it's fine. If not, userspace would re-fault on |
| @@ -3055,7 +3058,14 @@ static vm_fault_t wp_page_copy(struct vm |
| flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte)); |
| entry = mk_pte(new_page, vma->vm_page_prot); |
| entry = pte_sw_mkyoung(entry); |
| - entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
| + if (unlikely(unshare)) { |
| + if (pte_soft_dirty(vmf->orig_pte)) |
| + entry = pte_mksoft_dirty(entry); |
| + if (pte_uffd_wp(vmf->orig_pte)) |
| + entry = pte_mkuffd_wp(entry); |
| + } else { |
| + entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
| + } |
| |
| /* |
| * Clear the pte entry and flush it first, before updating the |
| @@ -3072,6 +3082,7 @@ static vm_fault_t wp_page_copy(struct vm |
| * mmu page tables (such as kvm shadow page tables), we want the |
| * new page to be mapped directly into the secondary page table. |
| */ |
| + BUG_ON(unshare && pte_write(entry)); |
| set_pte_at_notify(mm, vmf->address, vmf->pte, entry); |
| update_mmu_cache(vma, vmf->address, vmf->pte); |
| if (old_page) { |
| @@ -3121,7 +3132,7 @@ static vm_fault_t wp_page_copy(struct vm |
| free_swap_cache(old_page); |
| put_page(old_page); |
| } |
| - return page_copied ? VM_FAULT_WRITE : 0; |
| + return (page_copied && !unshare) ? VM_FAULT_WRITE : 0; |
| oom_free_new: |
| put_page(new_page); |
| oom: |
| @@ -3221,18 +3232,22 @@ static vm_fault_t wp_page_shared(struct |
| } |
| |
| /* |
| - * This routine handles present pages, when users try to write |
| - * to a shared page. It is done by copying the page to a new address |
| - * and decrementing the shared-page counter for the old page. |
| + * This routine handles present pages, when |
| + * * users try to write to a shared page (FAULT_FLAG_WRITE) |
| + * * GUP wants to take a R/O pin on a possibly shared anonymous page |
| + * (FAULT_FLAG_UNSHARE) |
| + * |
| + * It is done by copying the page to a new address and decrementing the |
| + * shared-page counter for the old page. |
| * |
| * Note that this routine assumes that the protection checks have been |
| * done by the caller (the low-level page fault routine in most cases). |
| - * Thus we can safely just mark it writable once we've done any necessary |
| - * COW. |
| + * Thus, with FAULT_FLAG_WRITE, we can safely just mark it writable once we've |
| + * done any necessary COW. |
| * |
| - * We also mark the page dirty at this point even though the page will |
| - * change only once the write actually happens. This avoids a few races, |
| - * and potentially makes it more efficient. |
| + * In case of FAULT_FLAG_WRITE, we also mark the page dirty at this point even |
| + * though the page will change only once the write actually happens. This |
| + * avoids a few races, and potentially makes it more efficient. |
| * |
| * We enter with non-exclusive mmap_lock (to exclude vma changes, |
| * but allow concurrent faults), with pte both mapped and locked. |
| @@ -3241,23 +3256,35 @@ static vm_fault_t wp_page_shared(struct |
| static vm_fault_t do_wp_page(struct vm_fault *vmf) |
| __releases(vmf->ptl) |
| { |
| + const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE; |
| struct vm_area_struct *vma = vmf->vma; |
| |
| - if (userfaultfd_pte_wp(vma, *vmf->pte)) { |
| - pte_unmap_unlock(vmf->pte, vmf->ptl); |
| - return handle_userfault(vmf, VM_UFFD_WP); |
| - } |
| + VM_BUG_ON(unshare && (vmf->flags & FAULT_FLAG_WRITE)); |
| + VM_BUG_ON(!unshare && !(vmf->flags & FAULT_FLAG_WRITE)); |
| |
| - /* |
| - * Userfaultfd write-protect can defer flushes. Ensure the TLB |
| - * is flushed in this case before copying. |
| - */ |
| - if (unlikely(userfaultfd_wp(vmf->vma) && |
| - mm_tlb_flush_pending(vmf->vma->vm_mm))) |
| - flush_tlb_page(vmf->vma, vmf->address); |
| + if (likely(!unshare)) { |
| + if (userfaultfd_pte_wp(vma, *vmf->pte)) { |
| + pte_unmap_unlock(vmf->pte, vmf->ptl); |
| + return handle_userfault(vmf, VM_UFFD_WP); |
| + } |
| + |
| + /* |
| + * Userfaultfd write-protect can defer flushes. Ensure the TLB |
| + * is flushed in this case before copying. |
| + */ |
| + if (unlikely(userfaultfd_wp(vmf->vma) && |
| + mm_tlb_flush_pending(vmf->vma->vm_mm))) |
| + flush_tlb_page(vmf->vma, vmf->address); |
| + } |
| |
| vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte); |
| if (!vmf->page) { |
| + if (unlikely(unshare)) { |
| + /* No anonymous page -> nothing to do. */ |
| + pte_unmap_unlock(vmf->pte, vmf->ptl); |
| + return 0; |
| + } |
| + |
| /* |
| * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a |
| * VM_PFNMAP VMA. |
| @@ -3320,8 +3347,16 @@ static vm_fault_t do_wp_page(struct vm_f |
| page_move_anon_rmap(page, vma); |
| unlock_page(page); |
| reuse: |
| + if (unlikely(unshare)) { |
| + pte_unmap_unlock(vmf->pte, vmf->ptl); |
| + return 0; |
| + } |
| wp_page_reuse(vmf); |
| return VM_FAULT_WRITE; |
| + } else if (unshare) { |
| + /* No anonymous page -> nothing to do. */ |
| + pte_unmap_unlock(vmf->pte, vmf->ptl); |
| + return 0; |
| } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == |
| (VM_WRITE|VM_SHARED))) { |
| return wp_page_shared(vmf); |
| @@ -4523,8 +4558,11 @@ static inline vm_fault_t create_huge_pmd |
| /* `inline' is required to avoid gcc 4.1.2 build error */ |
| static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf) |
| { |
| + const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE; |
| + |
| if (vma_is_anonymous(vmf->vma)) { |
| - if (userfaultfd_huge_pmd_wp(vmf->vma, vmf->orig_pmd)) |
| + if (likely(!unshare) && |
| + userfaultfd_huge_pmd_wp(vmf->vma, vmf->orig_pmd)) |
| return handle_userfault(vmf, VM_UFFD_WP); |
| return do_huge_pmd_wp_page(vmf); |
| } |
| @@ -4659,10 +4697,11 @@ static vm_fault_t handle_pte_fault(struc |
| update_mmu_tlb(vmf->vma, vmf->address, vmf->pte); |
| goto unlock; |
| } |
| - if (vmf->flags & FAULT_FLAG_WRITE) { |
| + if (vmf->flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) { |
| if (!pte_write(entry)) |
| return do_wp_page(vmf); |
| - entry = pte_mkdirty(entry); |
| + else if (likely(vmf->flags & FAULT_FLAG_WRITE)) |
| + entry = pte_mkdirty(entry); |
| } |
| entry = pte_mkyoung(entry); |
| if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry, |
| @@ -4703,7 +4742,6 @@ static vm_fault_t __handle_mm_fault(stru |
| .pgoff = linear_page_index(vma, address), |
| .gfp_mask = __get_fault_gfp_mask(vma), |
| }; |
| - unsigned int dirty = flags & FAULT_FLAG_WRITE; |
| struct mm_struct *mm = vma->vm_mm; |
| pgd_t *pgd; |
| p4d_t *p4d; |
| @@ -4728,9 +4766,11 @@ retry_pud: |
| barrier(); |
| if (pud_trans_huge(orig_pud) || pud_devmap(orig_pud)) { |
| |
| - /* NUMA case for anonymous PUDs would go here */ |
| - |
| - if (dirty && !pud_write(orig_pud)) { |
| + /* |
| + * TODO once we support anonymous PUDs: NUMA case and |
| + * FAULT_FLAG_UNSHARE handling. |
| + */ |
| + if ((flags & FAULT_FLAG_WRITE) && !pud_write(orig_pud)) { |
| ret = wp_huge_pud(&vmf, orig_pud); |
| if (!(ret & VM_FAULT_FALLBACK)) |
| return ret; |
| @@ -4768,7 +4808,8 @@ retry_pud: |
| if (pmd_protnone(vmf.orig_pmd) && vma_is_accessible(vma)) |
| return do_huge_pmd_numa_page(&vmf); |
| |
| - if (dirty && !pmd_write(vmf.orig_pmd)) { |
| + if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) && |
| + !pmd_write(vmf.orig_pmd)) { |
| ret = wp_huge_pmd(&vmf); |
| if (!(ret & VM_FAULT_FALLBACK)) |
| return ret; |
| _ |