| From c3e5ea6ee574ae5e845a40ac8198de1fb63bb3ab Mon Sep 17 00:00:00 2001 |
| From: "Kirill A. Shutemov" <kirill@shutemov.name> |
| Date: Thu, 5 Mar 2020 22:28:32 -0800 |
| Subject: mm: avoid data corruption on CoW fault into PFN-mapped VMA |
| |
| From: Kirill A. Shutemov <kirill@shutemov.name> |
| |
| commit c3e5ea6ee574ae5e845a40ac8198de1fb63bb3ab upstream. |
| |
| Jeff Moyer has reported that one of xfstests triggers a warning when run |
| on DAX-enabled filesystem: |
| |
| WARNING: CPU: 76 PID: 51024 at mm/memory.c:2317 wp_page_copy+0xc40/0xd50 |
| ... |
| wp_page_copy+0x98c/0xd50 (unreliable) |
| do_wp_page+0xd8/0xad0 |
| __handle_mm_fault+0x748/0x1b90 |
| handle_mm_fault+0x120/0x1f0 |
| __do_page_fault+0x240/0xd70 |
| do_page_fault+0x38/0xd0 |
| handle_page_fault+0x10/0x30 |
| |
| The warning happens on failed __copy_from_user_inatomic() which tries to |
| copy data into a CoW page. |
| |
| This happens because of race between MADV_DONTNEED and CoW page fault: |
| |
| CPU0 CPU1 |
| handle_mm_fault() |
| do_wp_page() |
| wp_page_copy() |
| do_wp_page() |
| madvise(MADV_DONTNEED) |
| zap_page_range() |
| zap_pte_range() |
| ptep_get_and_clear_full() |
| <TLB flush> |
| __copy_from_user_inatomic() |
| sees empty PTE and fails |
| WARN_ON_ONCE(1) |
| clear_page() |
| |
| The solution is to re-try __copy_from_user_inatomic() under PTL after |
| checking that PTE is matches the orig_pte. |
| |
| The second copy attempt can still fail, like due to non-readable PTE, but |
| there's nothing reasonable we can do about, except clearing the CoW page. |
| |
| Reported-by: Jeff Moyer <jmoyer@redhat.com> |
| Signed-off-by: Andrew Morton <akpm@linux-foundation.org> |
| Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> |
| Tested-by: Jeff Moyer <jmoyer@redhat.com> |
| Cc: <stable@vger.kernel.org> |
| Cc: Justin He <Justin.He@arm.com> |
| Cc: Dan Williams <dan.j.williams@intel.com> |
| Link: http://lkml.kernel.org/r/20200218154151.13349-1-kirill.shutemov@linux.intel.com |
| Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> |
| Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> |
| |
| --- |
| mm/memory.c | 35 +++++++++++++++++++++++++++-------- |
| 1 file changed, 27 insertions(+), 8 deletions(-) |
| |
| --- a/mm/memory.c |
| +++ b/mm/memory.c |
| @@ -2221,7 +2221,7 @@ static inline bool cow_user_page(struct |
| bool ret; |
| void *kaddr; |
| void __user *uaddr; |
| - bool force_mkyoung; |
| + bool locked = false; |
| struct vm_area_struct *vma = vmf->vma; |
| struct mm_struct *mm = vma->vm_mm; |
| unsigned long addr = vmf->address; |
| @@ -2246,11 +2246,11 @@ static inline bool cow_user_page(struct |
| * On architectures with software "accessed" bits, we would |
| * take a double page fault, so mark it accessed here. |
| */ |
| - force_mkyoung = arch_faults_on_old_pte() && !pte_young(vmf->orig_pte); |
| - if (force_mkyoung) { |
| + if (arch_faults_on_old_pte() && !pte_young(vmf->orig_pte)) { |
| pte_t entry; |
| |
| vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl); |
| + locked = true; |
| if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) { |
| /* |
| * Other thread has already handled the fault |
| @@ -2274,18 +2274,37 @@ static inline bool cow_user_page(struct |
| * zeroes. |
| */ |
| if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) { |
| + if (locked) |
| + goto warn; |
| + |
| + /* Re-validate under PTL if the page is still mapped */ |
| + vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl); |
| + locked = true; |
| + if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) { |
| + /* The PTE changed under us. Retry page fault. */ |
| + ret = false; |
| + goto pte_unlock; |
| + } |
| + |
| /* |
| - * Give a warn in case there can be some obscure |
| - * use-case |
| + * The same page can be mapped back since last copy attampt. |
| + * Try to copy again under PTL. |
| */ |
| - WARN_ON_ONCE(1); |
| - clear_page(kaddr); |
| + if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) { |
| + /* |
| + * Give a warn in case there can be some obscure |
| + * use-case |
| + */ |
| +warn: |
| + WARN_ON_ONCE(1); |
| + clear_page(kaddr); |
| + } |
| } |
| |
| ret = true; |
| |
| pte_unlock: |
| - if (force_mkyoung) |
| + if (locked) |
| pte_unmap_unlock(vmf->pte, vmf->ptl); |
| kunmap_atomic(kaddr); |
| flush_dcache_page(dst); |