| From 04f5866e41fb70690e28397487d8bd8eea7d712a Mon Sep 17 00:00:00 2001 |
| From: Andrea Arcangeli <aarcange@redhat.com> |
| Date: Thu, 18 Apr 2019 17:50:52 -0700 |
| Subject: coredump: fix race condition between mmget_not_zero()/get_task_mm() and core dumping |
| |
| From: Andrea Arcangeli <aarcange@redhat.com> |
| |
| commit 04f5866e41fb70690e28397487d8bd8eea7d712a upstream. |
| |
| The core dumping code has always run without holding the mmap_sem for |
| writing, even though that is the only way to ensure that the entire vma |
| layout will not change from under it. Only using some signal |
| serialization on the processes belonging to the mm is not nearly enough. |
| This was pointed out earlier. For example in Hugh's post from Jul 2017: |
| |
| https://lkml.kernel.org/r/alpine.LSU.2.11.1707191716030.2055@eggly.anvils |
| |
| "Not strictly relevant here, but a related note: I was very surprised |
| to discover, only quite recently, how handle_mm_fault() may be called |
| without down_read(mmap_sem) - when core dumping. That seems a |
| misguided optimization to me, which would also be nice to correct" |
| |
| In particular, because growsdown and growsup can move the |
| vm_start/vm_end, the various loops the core dump does around the vma will |
| not be consistent if page faults can happen concurrently. |
| |
| Pretty much all users calling mmget_not_zero()/get_task_mm() and then |
| taking the mmap_sem had the potential to introduce unexpected side |
| effects in the core dumping code. |
| |
| Adding mmap_sem for writing around the ->core_dump invocation is a |
| viable long term fix, but it requires removing all copy user and page |
| faults and replacing them with get_dump_page() for all binary formats, |
| which is not suitable as a short term fix. |
| |
| For the time being this solution manually covers the places that can |
| confuse the core dump either by altering the vma layout or the vma flags |
| while it runs. Once ->core_dump runs under mmap_sem for writing the |
| function mmget_still_valid() can be dropped. |
| |
| Allowing mmap_sem protected sections to run in parallel with the |
| coredump provides some minor parallelism advantage to the swapoff code |
| (which seems to be safe enough by never mangling any vma field and can |
| keep doing swapins in parallel to the core dumping) and to some other |
| corner case. |
| |
| In order to facilitate the backporting I added "Fixes: 86039bd3b4e6"; |
| however, the side effect of this same race condition in /proc/pid/mem |
| should be reproducible since before 2.6.12-rc2, so I couldn't add any |
| other "Fixes:" because there's no hash beyond the git genesis commit. |
| |
| Because find_extend_vma() is the only location outside of the process |
| context that could modify the "mm" structures under mmap_sem for |
| reading, by adding the mmget_still_valid() check to it, all other cases |
| that take the mmap_sem for reading don't need the new check after |
| mmget_not_zero()/get_task_mm(). The expand_stack() in page fault |
| context also doesn't need the new check, because all tasks under core |
| dumping are frozen. |
| |
| Link: http://lkml.kernel.org/r/20190325224949.11068-1-aarcange@redhat.com |
| Fixes: 86039bd3b4e6 ("userfaultfd: add new syscall to provide memory externalization") |
| Signed-off-by: Andrea Arcangeli <aarcange@redhat.com> |
| Reported-by: Jann Horn <jannh@google.com> |
| Suggested-by: Oleg Nesterov <oleg@redhat.com> |
| Acked-by: Peter Xu <peterx@redhat.com> |
| Reviewed-by: Mike Rapoport <rppt@linux.ibm.com> |
| Reviewed-by: Oleg Nesterov <oleg@redhat.com> |
| Reviewed-by: Jann Horn <jannh@google.com> |
| Acked-by: Jason Gunthorpe <jgg@mellanox.com> |
| Acked-by: Michal Hocko <mhocko@suse.com> |
| Cc: <stable@vger.kernel.org> |
| Signed-off-by: Andrew Morton <akpm@linux-foundation.org> |
| Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> |
| [akaher@vmware.com: stable 4.9 backport |
| - handle binder_update_page_range - mhocko@suse.com] |
| Signed-off-by: Ajay Kaher <akaher@vmware.com> |
| Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> |
| --- |
| drivers/android/binder.c | 6 ++++++ |
| fs/proc/task_mmu.c | 18 ++++++++++++++++++ |
| fs/userfaultfd.c | 9 +++++++++ |
| include/linux/mm.h | 20 ++++++++++++++++++++ |
| mm/mmap.c | 6 +++++- |
| 5 files changed, 58 insertions(+), 1 deletion(-) |
| |
| --- a/drivers/android/binder.c |
| +++ b/drivers/android/binder.c |
| @@ -581,6 +581,12 @@ static int binder_update_page_range(stru |
| |
| if (mm) { |
| down_write(&mm->mmap_sem); |
| + if (!mmget_still_valid(mm)) { |
| + if (allocate == 0) |
| + goto free_range; |
| + goto err_no_vma; |
| + } |
| + |
| vma = proc->vma; |
| if (vma && mm != proc->vma_vm_mm) { |
| pr_err("%d: vma mm and task mm mismatch\n", |
| --- a/fs/proc/task_mmu.c |
| +++ b/fs/proc/task_mmu.c |
| @@ -1057,6 +1057,24 @@ static ssize_t clear_refs_write(struct f |
| count = -EINTR; |
| goto out_mm; |
| } |
| + /* |
+ * Avoid modifying vma->vm_flags
| + * without locked ops while the |
| + * coredump reads the vm_flags. |
| + */ |
| + if (!mmget_still_valid(mm)) { |
| + /* |
| + * Silently return "count" |
| + * like if get_task_mm() |
| + * failed. FIXME: should this |
| + * function have returned |
| + * -ESRCH if get_task_mm() |
| + * failed like if |
| + * get_proc_task() fails? |
| + */ |
| + up_write(&mm->mmap_sem); |
| + goto out_mm; |
| + } |
| for (vma = mm->mmap; vma; vma = vma->vm_next) { |
| vma->vm_flags &= ~VM_SOFTDIRTY; |
| vma_set_page_prot(vma); |
| --- a/fs/userfaultfd.c |
| +++ b/fs/userfaultfd.c |
| @@ -479,6 +479,8 @@ static int userfaultfd_release(struct in |
| * taking the mmap_sem for writing. |
| */ |
| down_write(&mm->mmap_sem); |
| + if (!mmget_still_valid(mm)) |
| + goto skip_mm; |
| prev = NULL; |
| for (vma = mm->mmap; vma; vma = vma->vm_next) { |
| cond_resched(); |
| @@ -501,6 +503,7 @@ static int userfaultfd_release(struct in |
| vma->vm_flags = new_flags; |
| vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; |
| } |
| +skip_mm: |
| up_write(&mm->mmap_sem); |
| mmput(mm); |
| wakeup: |
| @@ -802,6 +805,9 @@ static int userfaultfd_register(struct u |
| goto out; |
| |
| down_write(&mm->mmap_sem); |
| + if (!mmget_still_valid(mm)) |
| + goto out_unlock; |
| + |
| vma = find_vma_prev(mm, start, &prev); |
| if (!vma) |
| goto out_unlock; |
| @@ -947,6 +953,9 @@ static int userfaultfd_unregister(struct |
| goto out; |
| |
| down_write(&mm->mmap_sem); |
| + if (!mmget_still_valid(mm)) |
| + goto out_unlock; |
| + |
| vma = find_vma_prev(mm, start, &prev); |
| if (!vma) |
| goto out_unlock; |
| --- a/include/linux/mm.h |
| +++ b/include/linux/mm.h |
| @@ -1192,6 +1192,26 @@ void zap_page_range(struct vm_area_struc |
| unsigned long size, struct zap_details *); |
| void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma, |
| unsigned long start, unsigned long end); |
| +/* |
| + * This has to be called after a get_task_mm()/mmget_not_zero() |
| + * followed by taking the mmap_sem for writing before modifying the |
| + * vmas or anything the coredump pretends not to change from under it. |
| + * |
| + * NOTE: find_extend_vma() called from GUP context is the only place |
| + * that can modify the "mm" (notably the vm_start/end) under mmap_sem |
| + * for reading and outside the context of the process, so it is also |
| + * the only case that holds the mmap_sem for reading that must call |
| + * this function. Generally if the mmap_sem is hold for reading |
| + * there's no need of this check after get_task_mm()/mmget_not_zero(). |
| + * |
| + * This function can be obsoleted and the check can be removed, after |
| + * the coredump code will hold the mmap_sem for writing before |
| + * invoking the ->core_dump methods. |
| + */ |
| +static inline bool mmget_still_valid(struct mm_struct *mm) |
| +{ |
| + return likely(!mm->core_state); |
| +} |
| |
| /** |
| * mm_walk - callbacks for walk_page_range |
| --- a/mm/mmap.c |
| +++ b/mm/mmap.c |
| @@ -2448,7 +2448,8 @@ find_extend_vma(struct mm_struct *mm, un |
| vma = find_vma_prev(mm, addr, &prev); |
| if (vma && (vma->vm_start <= addr)) |
| return vma; |
| - if (!prev || expand_stack(prev, addr)) |
| + /* don't alter vm_end if the coredump is running */ |
| + if (!prev || !mmget_still_valid(mm) || expand_stack(prev, addr)) |
| return NULL; |
| if (prev->vm_flags & VM_LOCKED) |
| populate_vma_page_range(prev, addr, prev->vm_end, NULL); |
| @@ -2474,6 +2475,9 @@ find_extend_vma(struct mm_struct *mm, un |
| return vma; |
| if (!(vma->vm_flags & VM_GROWSDOWN)) |
| return NULL; |
| + /* don't alter vm_start if the coredump is running */ |
| + if (!mmget_still_valid(mm)) |
| + return NULL; |
| start = vma->vm_start; |
| if (expand_stack(vma, addr)) |
| return NULL; |