| From af8dd6357eccf64de71ca150887e57f496cf2f74 Mon Sep 17 00:00:00 2001 |
| From: Sasha Levin <sashal@kernel.org> |
| Date: Wed, 12 Jan 2022 11:50:50 -0500 |
| Subject: drm/amdkfd: svm range restore work deadlock when process exit |
| |
| From: Philip Yang <Philip.Yang@amd.com> |
| |
| [ Upstream commit 6225bb3a88d22594aacea2485dc28ca12d596721 ] |
| |
| kfd_process_notifier_release flushes svm_range_restore_work, which |
| calls svm_range_list_lock_and_flush_work to flush the deferred_list |
| work. But if the deferred_list work's mmput releases the last user, it |
| calls exit_mmap -> notifier_release, which deadlocks with the backtrace |
| below. |
| |
| Move the flush of svm_range_restore_work to kfd_process_wq_release to |
| avoid the deadlock. svm_range_restore_work then takes a task->mm |
| reference so that the mm cannot go away while validating and mapping |
| ranges to the GPU. |
| |
| Workqueue: events svm_range_deferred_list_work [amdgpu] |
| Call Trace: |
| wait_for_completion+0x94/0x100 |
| __flush_work+0x12a/0x1e0 |
| __cancel_work_timer+0x10e/0x190 |
| cancel_delayed_work_sync+0x13/0x20 |
| kfd_process_notifier_release+0x98/0x2a0 [amdgpu] |
| __mmu_notifier_release+0x74/0x1f0 |
| exit_mmap+0x170/0x200 |
| mmput+0x5d/0x130 |
| svm_range_deferred_list_work+0x104/0x230 [amdgpu] |
| process_one_work+0x220/0x3c0 |
| |
| Signed-off-by: Philip Yang <Philip.Yang@amd.com> |
| Reported-by: Ruili Ji <ruili.ji@amd.com> |
| Tested-by: Ruili Ji <ruili.ji@amd.com> |
| Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com> |
| Signed-off-by: Alex Deucher <alexander.deucher@amd.com> |
| Signed-off-by: Sasha Levin <sashal@kernel.org> |
| --- |
| drivers/gpu/drm/amd/amdkfd/kfd_process.c | 1 - |
| drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 15 +++++++++------ |
| 2 files changed, 9 insertions(+), 7 deletions(-) |
| |
| diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c |
| index b993011cfa64..990228711108 100644 |
| --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c |
| +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c |
| @@ -1150,7 +1150,6 @@ static void kfd_process_notifier_release(struct mmu_notifier *mn, |
| |
| cancel_delayed_work_sync(&p->eviction_work); |
| cancel_delayed_work_sync(&p->restore_work); |
| - cancel_delayed_work_sync(&p->svms.restore_work); |
| |
| mutex_lock(&p->mutex); |
| |
| diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c |
| index ea1c5aaf659a..a1b0c6bda803 100644 |
| --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c |
| +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c |
| @@ -1589,13 +1589,14 @@ static void svm_range_restore_work(struct work_struct *work) |
| |
| pr_debug("restore svm ranges\n"); |
| |
| - /* kfd_process_notifier_release destroys this worker thread. So during |
| - * the lifetime of this thread, kfd_process and mm will be valid. |
| - */ |
| p = container_of(svms, struct kfd_process, svms); |
| - mm = p->mm; |
| - if (!mm) |
| + |
| + /* Keep mm reference when svm_range_validate_and_map ranges */ |
| + mm = get_task_mm(p->lead_thread); |
| + if (!mm) { |
| + pr_debug("svms 0x%p process mm gone\n", svms); |
| return; |
| + } |
| |
| svm_range_list_lock_and_flush_work(svms, mm); |
| mutex_lock(&svms->lock); |
| @@ -1649,6 +1650,7 @@ static void svm_range_restore_work(struct work_struct *work) |
| out_reschedule: |
| mutex_unlock(&svms->lock); |
| mmap_write_unlock(mm); |
| + mmput(mm); |
| |
| /* If validation failed, reschedule another attempt */ |
| if (evicted_ranges) { |
| @@ -2779,6 +2781,8 @@ void svm_range_list_fini(struct kfd_process *p) |
| |
| pr_debug("pasid 0x%x svms 0x%p\n", p->pasid, &p->svms); |
| |
| + cancel_delayed_work_sync(&p->svms.restore_work); |
| + |
| /* Ensure list work is finished before process is destroyed */ |
| flush_work(&p->svms.deferred_list_work); |
| |
| @@ -2789,7 +2793,6 @@ void svm_range_list_fini(struct kfd_process *p) |
| atomic_inc(&p->svms.drain_pagefaults); |
| svm_range_drain_retry_fault(&p->svms); |
| |
| - |
| list_for_each_entry_safe(prange, next, &p->svms.list, list) { |
| svm_range_unlink(prange); |
| svm_range_remove_notifier(prange); |
| -- |
| 2.35.1 |
| |