| From: Alistair Popple <apopple@nvidia.com> |
| Subject: mm/memory.c: fix race when faulting a device private page |
| Date: Wed, 28 Sep 2022 22:01:15 +1000 |
| |
| Patch series "Fix several device private page reference counting issues", |
| v2 |
| |
| This series aims to fix a number of page reference counting issues in |
| drivers dealing with device private ZONE_DEVICE pages. These result in |
| use-after-free type bugs, either from accessing a struct page which no |
| longer exists because it has been removed, or from accessing fields within the |
| struct page which are no longer valid because the page has been freed. |
| |
| During normal usage it is unlikely these will cause any problems. However, |
| without these fixes it is possible to crash the kernel from userspace. |
| These crashes can be triggered either by unloading the kernel module or |
| unbinding the device from the driver prior to a userspace task exiting. |
| In modules such as Nouveau it is also possible to trigger some of these |
| issues by explicitly closing the device file-descriptor prior to the task |
| exiting and then accessing device private memory. |
| |
| This involves some minor changes to both PowerPC and AMD GPU code. |
| Unfortunately I lack hardware to test either of those, so any help there |
| would be appreciated. The changes mimic what is done for both Nouveau |
| and hmm-tests, though, so I doubt they will cause problems. |
| |
| |
| This patch (of 8): |
| |
| When the CPU tries to access a device private page, the migrate_to_ram() |
| callback associated with the pgmap for the page is called. However, no |
| reference is taken on the faulting page. Therefore a concurrent migration |
| of the device private page can free the page and possibly the underlying |
| pgmap. This results in a race which can crash the kernel due to the |
| migrate_to_ram() function pointer becoming invalid. It also means drivers |
| can't reliably read the zone_device_data field because the page may have |
| been freed with memunmap_pages(). |
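| |
| For illustration only, a rough sketch of the window (simplified, not taken |
| from an actual trace): |
| |
|   CPU 0 (CPU fault)                        CPU 1 (concurrent migration) |
|   do_swap_page() |
|     vmf->page = pfn_swap_entry_to_page(entry) |
|                                            migrate the device private page |
|                                            free page / memunmap_pages() |
|     vmf->page->pgmap->ops->migrate_to_ram(vmf) |
|       -> pgmap/ops may already be freed: use-after-free |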
| |
| Close the race by getting a reference on the page while holding the ptl to |
| ensure it has not been freed. Unfortunately, the elevated reference count |
| will cause the migration required to handle the fault to fail. To avoid |
| this failure, pass the faulting page into the migrate_vma functions so |
| that if an elevated reference count is found it can be checked to see |
| whether it is expected or not. |
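| |
| As a minimal sketch of the driver side (mirroring the lib/test_hmm.c change |
| below; my_devmem_fault() and my_pgmap_owner are hypothetical names): |
| |
|   static vm_fault_t my_devmem_fault(struct vm_fault *vmf) |
|   { |
|           struct migrate_vma args = { 0 }; |
|           unsigned long src_pfn = 0, dst_pfn = 0; |
| |
|           args.vma = vmf->vma; |
|           args.start = vmf->address; |
|           args.end = vmf->address + PAGE_SIZE; |
|           args.src = &src_pfn; |
|           args.dst = &dst_pfn; |
|           args.pgmap_owner = my_pgmap_owner; |
|           args.flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE; |
|           /* do_swap_page() now holds a reference on vmf->page; tell |
|            * migrate_vma_setup() that the extra count is expected. */ |
|           args.fault_page = vmf->page; |
| |
|           if (migrate_vma_setup(&args)) |
|                   return VM_FAULT_SIGBUS; |
|           /* ... copy data back, migrate_vma_pages()/finalize() ... */ |
|           return 0; |
|   } |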
| |
| [mpe@ellerman.id.au: fix build] |
| Link: https://lkml.kernel.org/r/87fsgbf3gh.fsf@mpe.ellerman.id.au |
| Link: https://lkml.kernel.org/r/cover.60659b549d8509ddecafad4f498ee7f03bb23c69.1664366292.git-series.apopple@nvidia.com |
| Link: https://lkml.kernel.org/r/d3e813178a59e565e8d78d9b9a4e2562f6494f90.1664366292.git-series.apopple@nvidia.com |
| Signed-off-by: Alistair Popple <apopple@nvidia.com> |
| Acked-by: Felix Kuehling <Felix.Kuehling@amd.com> |
| Cc: Jason Gunthorpe <jgg@nvidia.com> |
| Cc: John Hubbard <jhubbard@nvidia.com> |
| Cc: Ralph Campbell <rcampbell@nvidia.com> |
| Cc: Michael Ellerman <mpe@ellerman.id.au> |
| Cc: Lyude Paul <lyude@redhat.com> |
| Cc: Alex Deucher <alexander.deucher@amd.com> |
| Cc: Alex Sierra <alex.sierra@amd.com> |
| Cc: Ben Skeggs <bskeggs@redhat.com> |
| Cc: Christian König <christian.koenig@amd.com> |
| Cc: Dan Williams <dan.j.williams@intel.com> |
| Cc: David Hildenbrand <david@redhat.com> |
| Cc: "Huang, Ying" <ying.huang@intel.com> |
| Cc: Matthew Wilcox <willy@infradead.org> |
| Cc: Yang Shi <shy828301@gmail.com> |
| Cc: Zi Yan <ziy@nvidia.com> |
| Signed-off-by: Andrew Morton <akpm@linux-foundation.org> |
| --- |
| |
| arch/powerpc/kvm/book3s_hv_uvmem.c | 19 ++++++----- |
| drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 17 ++++++---- |
| drivers/gpu/drm/amd/amdkfd/kfd_migrate.h | 2 - |
| drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 11 ++++-- |
| include/linux/migrate.h | 8 ++++ |
| lib/test_hmm.c | 7 ++-- |
| mm/memory.c | 16 +++++++++ |
| mm/migrate.c | 34 ++++++++++++--------- |
| mm/migrate_device.c | 18 ++++++++--- |
| 9 files changed, 89 insertions(+), 43 deletions(-) |
| |
| --- a/arch/powerpc/kvm/book3s_hv_uvmem.c~mm-memoryc-fix-race-when-faulting-a-device-private-page |
| +++ a/arch/powerpc/kvm/book3s_hv_uvmem.c |
| @@ -508,10 +508,10 @@ unsigned long kvmppc_h_svm_init_start(st |
| static int __kvmppc_svm_page_out(struct vm_area_struct *vma, |
| unsigned long start, |
| unsigned long end, unsigned long page_shift, |
| - struct kvm *kvm, unsigned long gpa) |
| + struct kvm *kvm, unsigned long gpa, struct page *fault_page) |
| { |
| unsigned long src_pfn, dst_pfn = 0; |
| - struct migrate_vma mig; |
| + struct migrate_vma mig = { 0 }; |
| struct page *dpage, *spage; |
| struct kvmppc_uvmem_page_pvt *pvt; |
| unsigned long pfn; |
| @@ -525,6 +525,7 @@ static int __kvmppc_svm_page_out(struct |
| mig.dst = &dst_pfn; |
| mig.pgmap_owner = &kvmppc_uvmem_pgmap; |
| mig.flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE; |
| + mig.fault_page = fault_page; |
| |
| /* The requested page is already paged-out, nothing to do */ |
| if (!kvmppc_gfn_is_uvmem_pfn(gpa >> page_shift, kvm, NULL)) |
| @@ -580,12 +581,14 @@ out_finalize: |
| static inline int kvmppc_svm_page_out(struct vm_area_struct *vma, |
| unsigned long start, unsigned long end, |
| unsigned long page_shift, |
| - struct kvm *kvm, unsigned long gpa) |
| + struct kvm *kvm, unsigned long gpa, |
| + struct page *fault_page) |
| { |
| int ret; |
| |
| mutex_lock(&kvm->arch.uvmem_lock); |
| - ret = __kvmppc_svm_page_out(vma, start, end, page_shift, kvm, gpa); |
| + ret = __kvmppc_svm_page_out(vma, start, end, page_shift, kvm, gpa, |
| + fault_page); |
| mutex_unlock(&kvm->arch.uvmem_lock); |
| |
| return ret; |
| @@ -634,7 +637,7 @@ void kvmppc_uvmem_drop_pages(const struc |
| pvt->remove_gfn = true; |
| |
| if (__kvmppc_svm_page_out(vma, addr, addr + PAGE_SIZE, |
| - PAGE_SHIFT, kvm, pvt->gpa)) |
| + PAGE_SHIFT, kvm, pvt->gpa, NULL)) |
| pr_err("Can't page out gpa:0x%lx addr:0x%lx\n", |
| pvt->gpa, addr); |
| } else { |
| @@ -736,7 +739,7 @@ static int kvmppc_svm_page_in(struct vm_ |
| bool pagein) |
| { |
| unsigned long src_pfn, dst_pfn = 0; |
| - struct migrate_vma mig; |
| + struct migrate_vma mig = { 0 }; |
| struct page *spage; |
| unsigned long pfn; |
| struct page *dpage; |
| @@ -994,7 +997,7 @@ static vm_fault_t kvmppc_uvmem_migrate_t |
| |
| if (kvmppc_svm_page_out(vmf->vma, vmf->address, |
| vmf->address + PAGE_SIZE, PAGE_SHIFT, |
| - pvt->kvm, pvt->gpa)) |
| + pvt->kvm, pvt->gpa, vmf->page)) |
| return VM_FAULT_SIGBUS; |
| else |
| return 0; |
| @@ -1065,7 +1068,7 @@ kvmppc_h_svm_page_out(struct kvm *kvm, u |
| if (!vma || vma->vm_start > start || vma->vm_end < end) |
| goto out; |
| |
| - if (!kvmppc_svm_page_out(vma, start, end, page_shift, kvm, gpa)) |
| + if (!kvmppc_svm_page_out(vma, start, end, page_shift, kvm, gpa, NULL)) |
| ret = H_SUCCESS; |
| out: |
| mmap_read_unlock(kvm->mm); |
| --- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c~mm-memoryc-fix-race-when-faulting-a-device-private-page |
| +++ a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c |
| @@ -409,7 +409,7 @@ svm_migrate_vma_to_vram(struct amdgpu_de |
| uint64_t npages = (end - start) >> PAGE_SHIFT; |
| struct kfd_process_device *pdd; |
| struct dma_fence *mfence = NULL; |
| - struct migrate_vma migrate; |
| + struct migrate_vma migrate = { 0 }; |
| unsigned long cpages = 0; |
| dma_addr_t *scratch; |
| void *buf; |
| @@ -668,7 +668,7 @@ out_oom: |
| static long |
| svm_migrate_vma_to_ram(struct amdgpu_device *adev, struct svm_range *prange, |
| struct vm_area_struct *vma, uint64_t start, uint64_t end, |
| - uint32_t trigger) |
| + uint32_t trigger, struct page *fault_page) |
| { |
| struct kfd_process *p = container_of(prange->svms, struct kfd_process, svms); |
| uint64_t npages = (end - start) >> PAGE_SHIFT; |
| @@ -676,7 +676,7 @@ svm_migrate_vma_to_ram(struct amdgpu_dev |
| unsigned long cpages = 0; |
| struct kfd_process_device *pdd; |
| struct dma_fence *mfence = NULL; |
| - struct migrate_vma migrate; |
| + struct migrate_vma migrate = { 0 }; |
| dma_addr_t *scratch; |
| void *buf; |
| int r = -ENOMEM; |
| @@ -699,6 +699,7 @@ svm_migrate_vma_to_ram(struct amdgpu_dev |
| |
| migrate.src = buf; |
| migrate.dst = migrate.src + npages; |
| + migrate.fault_page = fault_page; |
| scratch = (dma_addr_t *)(migrate.dst + npages); |
| |
| kfd_smi_event_migration_start(adev->kfd.dev, p->lead_thread->pid, |
| @@ -766,7 +767,7 @@ out: |
| * 0 - OK, otherwise error code |
| */ |
| int svm_migrate_vram_to_ram(struct svm_range *prange, struct mm_struct *mm, |
| - uint32_t trigger) |
| + uint32_t trigger, struct page *fault_page) |
| { |
| struct amdgpu_device *adev; |
| struct vm_area_struct *vma; |
| @@ -807,7 +808,8 @@ int svm_migrate_vram_to_ram(struct svm_r |
| } |
| |
| next = min(vma->vm_end, end); |
| - r = svm_migrate_vma_to_ram(adev, prange, vma, addr, next, trigger); |
| + r = svm_migrate_vma_to_ram(adev, prange, vma, addr, next, trigger, |
| + fault_page); |
| if (r < 0) { |
| pr_debug("failed %ld to migrate prange %p\n", r, prange); |
| break; |
| @@ -851,7 +853,7 @@ svm_migrate_vram_to_vram(struct svm_rang |
| pr_debug("from gpu 0x%x to gpu 0x%x\n", prange->actual_loc, best_loc); |
| |
| do { |
| - r = svm_migrate_vram_to_ram(prange, mm, trigger); |
| + r = svm_migrate_vram_to_ram(prange, mm, trigger, NULL); |
| if (r) |
| return r; |
| } while (prange->actual_loc && --retries); |
| @@ -938,7 +940,8 @@ static vm_fault_t svm_migrate_to_ram(str |
| goto out_unlock_prange; |
| } |
| |
| - r = svm_migrate_vram_to_ram(prange, mm, KFD_MIGRATE_TRIGGER_PAGEFAULT_CPU); |
| + r = svm_migrate_vram_to_ram(prange, mm, KFD_MIGRATE_TRIGGER_PAGEFAULT_CPU, |
| + vmf->page); |
| if (r) |
| pr_debug("failed %d migrate 0x%p [0x%lx 0x%lx] to ram\n", r, |
| prange, prange->start, prange->last); |
| --- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.h~mm-memoryc-fix-race-when-faulting-a-device-private-page |
| +++ a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.h |
| @@ -43,7 +43,7 @@ enum MIGRATION_COPY_DIR { |
| int svm_migrate_to_vram(struct svm_range *prange, uint32_t best_loc, |
| struct mm_struct *mm, uint32_t trigger); |
| int svm_migrate_vram_to_ram(struct svm_range *prange, struct mm_struct *mm, |
| - uint32_t trigger); |
| + uint32_t trigger, struct page *fault_page); |
| unsigned long |
| svm_migrate_addr_to_pfn(struct amdgpu_device *adev, unsigned long addr); |
| |
| --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c~mm-memoryc-fix-race-when-faulting-a-device-private-page |
| +++ a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c |
| @@ -2913,13 +2913,15 @@ retry_write_locked: |
| */ |
| if (prange->actual_loc) |
| r = svm_migrate_vram_to_ram(prange, mm, |
| - KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU); |
| + KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU, |
| + NULL); |
| else |
| r = 0; |
| } |
| } else { |
| r = svm_migrate_vram_to_ram(prange, mm, |
| - KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU); |
| + KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU, |
| + NULL); |
| } |
| if (r) { |
| pr_debug("failed %d to migrate svms %p [0x%lx 0x%lx]\n", |
| @@ -3242,7 +3244,8 @@ svm_range_trigger_migration(struct mm_st |
| return 0; |
| |
| if (!best_loc) { |
| - r = svm_migrate_vram_to_ram(prange, mm, KFD_MIGRATE_TRIGGER_PREFETCH); |
| + r = svm_migrate_vram_to_ram(prange, mm, |
| + KFD_MIGRATE_TRIGGER_PREFETCH, NULL); |
| *migrated = !r; |
| return r; |
| } |
| @@ -3303,7 +3306,7 @@ static void svm_range_evict_svm_bo_worke |
| mutex_lock(&prange->migrate_mutex); |
| do { |
| r = svm_migrate_vram_to_ram(prange, mm, |
| - KFD_MIGRATE_TRIGGER_TTM_EVICTION); |
| + KFD_MIGRATE_TRIGGER_TTM_EVICTION, NULL); |
| } while (!r && prange->actual_loc && --retries); |
| |
| if (!r && prange->actual_loc) |
| --- a/include/linux/migrate.h~mm-memoryc-fix-race-when-faulting-a-device-private-page |
| +++ a/include/linux/migrate.h |
| @@ -62,6 +62,8 @@ extern const char *migrate_reason_names[ |
| #ifdef CONFIG_MIGRATION |
| |
| extern void putback_movable_pages(struct list_head *l); |
| +int migrate_folio_extra(struct address_space *mapping, struct folio *dst, |
| + struct folio *src, enum migrate_mode mode, int extra_count); |
| int migrate_folio(struct address_space *mapping, struct folio *dst, |
| struct folio *src, enum migrate_mode mode); |
| extern int migrate_pages(struct list_head *l, new_page_t new, free_page_t free, |
| @@ -197,6 +199,12 @@ struct migrate_vma { |
| */ |
| void *pgmap_owner; |
| unsigned long flags; |
| + |
| + /* |
| + * Set to vmf->page if this is being called to migrate a page as part of |
| + * a migrate_to_ram() callback. |
| + */ |
| + struct page *fault_page; |
| }; |
| |
| int migrate_vma_setup(struct migrate_vma *args); |
| --- a/lib/test_hmm.c~mm-memoryc-fix-race-when-faulting-a-device-private-page |
| +++ a/lib/test_hmm.c |
| @@ -907,7 +907,7 @@ static int dmirror_migrate_to_system(str |
| struct vm_area_struct *vma; |
| unsigned long src_pfns[64] = { 0 }; |
| unsigned long dst_pfns[64] = { 0 }; |
| - struct migrate_vma args; |
| + struct migrate_vma args = { 0 }; |
| unsigned long next; |
| int ret; |
| |
| @@ -968,7 +968,7 @@ static int dmirror_migrate_to_device(str |
| unsigned long src_pfns[64] = { 0 }; |
| unsigned long dst_pfns[64] = { 0 }; |
| struct dmirror_bounce bounce; |
| - struct migrate_vma args; |
| + struct migrate_vma args = { 0 }; |
| unsigned long next; |
| int ret; |
| |
| @@ -1334,7 +1334,7 @@ static void dmirror_devmem_free(struct p |
| |
| static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf) |
| { |
| - struct migrate_vma args; |
| + struct migrate_vma args = { 0 }; |
| unsigned long src_pfns = 0; |
| unsigned long dst_pfns = 0; |
| struct page *rpage; |
| @@ -1357,6 +1357,7 @@ static vm_fault_t dmirror_devmem_fault(s |
| args.dst = &dst_pfns; |
| args.pgmap_owner = dmirror->mdevice; |
| args.flags = dmirror_select_device(dmirror); |
| + args.fault_page = vmf->page; |
| |
| if (migrate_vma_setup(&args)) |
| return VM_FAULT_SIGBUS; |
| --- a/mm/memory.c~mm-memoryc-fix-race-when-faulting-a-device-private-page |
| +++ a/mm/memory.c |
| @@ -3750,7 +3750,21 @@ vm_fault_t do_swap_page(struct vm_fault |
| ret = remove_device_exclusive_entry(vmf); |
| } else if (is_device_private_entry(entry)) { |
| vmf->page = pfn_swap_entry_to_page(entry); |
| - ret = vmf->page->pgmap->ops->migrate_to_ram(vmf); |
| + vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, |
| + vmf->address, &vmf->ptl); |
| + if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) { |
| + spin_unlock(vmf->ptl); |
| + goto out; |
| + } |
| + |
| + /* |
| + * Get a page reference while we know the page can't be |
| + * freed. |
| + */ |
| + get_page(vmf->page); |
| + pte_unmap_unlock(vmf->pte, vmf->ptl); |
| + vmf->page->pgmap->ops->migrate_to_ram(vmf); |
| + put_page(vmf->page); |
| } else if (is_hwpoison_entry(entry)) { |
| ret = VM_FAULT_HWPOISON; |
| } else if (is_swapin_error_entry(entry)) { |
| --- a/mm/migrate.c~mm-memoryc-fix-race-when-faulting-a-device-private-page |
| +++ a/mm/migrate.c |
| @@ -625,6 +625,25 @@ EXPORT_SYMBOL(folio_migrate_copy); |
| * Migration functions |
| ***********************************************************/ |
| |
| +int migrate_folio_extra(struct address_space *mapping, struct folio *dst, |
| + struct folio *src, enum migrate_mode mode, int extra_count) |
| +{ |
| + int rc; |
| + |
| + BUG_ON(folio_test_writeback(src)); /* Writeback must be complete */ |
| + |
| + rc = folio_migrate_mapping(mapping, dst, src, extra_count); |
| + |
| + if (rc != MIGRATEPAGE_SUCCESS) |
| + return rc; |
| + |
| + if (mode != MIGRATE_SYNC_NO_COPY) |
| + folio_migrate_copy(dst, src); |
| + else |
| + folio_migrate_flags(dst, src); |
| + return MIGRATEPAGE_SUCCESS; |
| +} |
| + |
| /** |
| * migrate_folio() - Simple folio migration. |
| * @mapping: The address_space containing the folio. |
| @@ -640,20 +659,7 @@ EXPORT_SYMBOL(folio_migrate_copy); |
| int migrate_folio(struct address_space *mapping, struct folio *dst, |
| struct folio *src, enum migrate_mode mode) |
| { |
| - int rc; |
| - |
| - BUG_ON(folio_test_writeback(src)); /* Writeback must be complete */ |
| - |
| - rc = folio_migrate_mapping(mapping, dst, src, 0); |
| - |
| - if (rc != MIGRATEPAGE_SUCCESS) |
| - return rc; |
| - |
| - if (mode != MIGRATE_SYNC_NO_COPY) |
| - folio_migrate_copy(dst, src); |
| - else |
| - folio_migrate_flags(dst, src); |
| - return MIGRATEPAGE_SUCCESS; |
| + return migrate_folio_extra(mapping, dst, src, mode, 0); |
| } |
| EXPORT_SYMBOL(migrate_folio); |
| |
| --- a/mm/migrate_device.c~mm-memoryc-fix-race-when-faulting-a-device-private-page |
| +++ a/mm/migrate_device.c |
| @@ -325,14 +325,14 @@ static void migrate_vma_collect(struct m |
| * folio_migrate_mapping(), except that here we allow migration of a |
| * ZONE_DEVICE page. |
| */ |
| -static bool migrate_vma_check_page(struct page *page) |
| +static bool migrate_vma_check_page(struct page *page, struct page *fault_page) |
| { |
| /* |
| * One extra ref because caller holds an extra reference, either from |
| * isolate_lru_page() for a regular page, or migrate_vma_collect() for |
| * a device page. |
| */ |
| - int extra = 1; |
| + int extra = 1 + (page == fault_page); |
| |
| /* |
| * FIXME support THP (transparent huge page), it is bit more complex to |
| @@ -405,7 +405,8 @@ static void migrate_vma_unmap(struct mig |
| if (folio_mapped(folio)) |
| try_to_migrate(folio, 0); |
| |
| - if (page_mapped(page) || !migrate_vma_check_page(page)) { |
| + if (page_mapped(page) || |
| + !migrate_vma_check_page(page, migrate->fault_page)) { |
| if (!is_zone_device_page(page)) { |
| get_page(page); |
| putback_lru_page(page); |
| @@ -517,6 +518,8 @@ int migrate_vma_setup(struct migrate_vma |
| return -EINVAL; |
| if (!args->src || !args->dst) |
| return -EINVAL; |
| + if (args->fault_page && !is_device_private_page(args->fault_page)) |
| + return -EINVAL; |
| |
| memset(args->src, 0, sizeof(*args->src) * nr_pages); |
| args->cpages = 0; |
| @@ -747,8 +750,13 @@ void migrate_vma_pages(struct migrate_vm |
| continue; |
| } |
| |
| - r = migrate_folio(mapping, page_folio(newpage), |
| - page_folio(page), MIGRATE_SYNC_NO_COPY); |
| + if (migrate->fault_page == page) |
| + r = migrate_folio_extra(mapping, page_folio(newpage), |
| + page_folio(page), |
| + MIGRATE_SYNC_NO_COPY, 1); |
| + else |
| + r = migrate_folio(mapping, page_folio(newpage), |
| + page_folio(page), MIGRATE_SYNC_NO_COPY); |
| if (r != MIGRATEPAGE_SUCCESS) |
| migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; |
| } |
| _ |