| From 284cf3e4c61802631b233807b4b11229545d5bb7 Mon Sep 17 00:00:00 2001 |
| From: Oded Gabbay <oded.gabbay@gmail.com> |
| Date: Sat, 6 Apr 2019 13:23:54 +0300 |
| Subject: habanalabs: all FD must be closed before removing device |
| |
| [ Upstream commit caa3c8e52582fc4d2ed82afd5e7ea164c18ef4fe ] |
| |
| This patch fixes a bug in the implementation of the function that removes |
| the device. |
| |
| The bug can happen when the device is removed but not the driver itself |
| (e.g. remove by the OS due to PCI freeze in Power architecture). |
| |
| In that case, there may be open users that are calling IOCTLs while the |
| device is removed. This is a possible race condition that the driver must |
| handle. Otherwise, a kernel panic may occur. |
| |
| This race is prevented in the hard-reset flow, because the driver makes |
| sure the users are closed before continuing with the hard-reset. This |
| race cannot occur when the driver itself is removed because the OS makes |
| sure all the file descriptors are closed. |
| |
| The fix is to make sure the open users close their file descriptors and if |
| they don't (after a certain amount of time), the driver sends them a |
| SIGKILL, because the removal of the device can't be stopped. |
| |
| The patch re-uses the same code that is called from the hard-reset flow. |
| |
| Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com> |
| Signed-off-by: Sasha Levin <sashal@kernel.org> |
| --- |
| drivers/misc/habanalabs/device.c | 32 +++++++++++++++++++++++++++----- |
| 1 file changed, 27 insertions(+), 5 deletions(-) |
| |
| diff --git a/drivers/misc/habanalabs/device.c b/drivers/misc/habanalabs/device.c |
| index 77d51be66c7e8..652c8edb2164c 100644 |
| --- a/drivers/misc/habanalabs/device.c |
| +++ b/drivers/misc/habanalabs/device.c |
| @@ -498,11 +498,8 @@ int hl_device_resume(struct hl_device *hdev) |
| return rc; |
| } |
| |
| -static void hl_device_hard_reset_pending(struct work_struct *work) |
| +static void device_kill_open_processes(struct hl_device *hdev) |
| { |
| - struct hl_device_reset_work *device_reset_work = |
| - container_of(work, struct hl_device_reset_work, reset_work); |
| - struct hl_device *hdev = device_reset_work->hdev; |
| u16 pending_total, pending_cnt; |
| struct task_struct *task = NULL; |
| |
| @@ -537,6 +534,12 @@ static void hl_device_hard_reset_pending(struct work_struct *work) |
| } |
| } |
| |
| + /* We killed the open users, but because the driver cleans up after the |
| + * user contexts are closed (e.g. mmu mappings), we need to wait again |
| + * to make sure the cleaning phase is finished before continuing with |
| + * the reset |
| + */ |
| + |
| pending_cnt = pending_total; |
| |
| while ((atomic_read(&hdev->fd_open_cnt)) && (pending_cnt)) { |
| @@ -552,6 +555,16 @@ static void hl_device_hard_reset_pending(struct work_struct *work) |
| |
| mutex_unlock(&hdev->fd_open_cnt_lock); |
| |
| +} |
| + |
| +static void device_hard_reset_pending(struct work_struct *work) |
| +{ |
| + struct hl_device_reset_work *device_reset_work = |
| + container_of(work, struct hl_device_reset_work, reset_work); |
| + struct hl_device *hdev = device_reset_work->hdev; |
| + |
| + device_kill_open_processes(hdev); |
| + |
| hl_device_reset(hdev, true, true); |
| |
| kfree(device_reset_work); |
| @@ -635,7 +648,7 @@ int hl_device_reset(struct hl_device *hdev, bool hard_reset, |
| * from a dedicated work |
| */ |
| INIT_WORK(&device_reset_work->reset_work, |
| - hl_device_hard_reset_pending); |
| + device_hard_reset_pending); |
| device_reset_work->hdev = hdev; |
| schedule_work(&device_reset_work->reset_work); |
| |
| @@ -1035,6 +1048,15 @@ void hl_device_fini(struct hl_device *hdev) |
| /* Mark device as disabled */ |
| hdev->disabled = true; |
| |
| + /* |
| + * Flush anyone that is inside the critical section of enqueue |
| + * jobs to the H/W |
| + */ |
| + hdev->asic_funcs->hw_queues_lock(hdev); |
| + hdev->asic_funcs->hw_queues_unlock(hdev); |
| + |
| + device_kill_open_processes(hdev); |
| + |
| hl_hwmon_fini(hdev); |
| |
| device_late_fini(hdev); |
| -- |
| 2.20.1 |
| |