| From foo@baz Wed May 31 09:13:34 JST 2017 |
| From: Mohamad Haj Yahia <mohamad@mellanox.com> |
| Date: Thu, 23 Feb 2017 11:19:36 +0200 |
| Subject: net/mlx5: Avoid using pending command interface slots |
| |
| From: Mohamad Haj Yahia <mohamad@mellanox.com> |
| |
| |
| [ Upstream commit 73dd3a4839c1d27c36d4dcc92e1ff44225ecbeb7 ] |
| |
| Currently when firmware command gets stuck or it takes long time to |
| complete, the driver command will get timeout and the command slot is |
| freed and can be used for new commands, and if the firmware receive new |
| command on the old busy slot its behavior is unexpected and this could |
| be harmful. |
| To fix this when the driver command gets timeout we return failure, |
| but we don't free the command slot and we wait for the firmware to |
| explicitly respond to that command. |
| Once all the entries are busy we will stop processing new firmware |
| commands. |
| |
| Fixes: 9cba4ebcf374 ('net/mlx5: Fix potential deadlock in command mode change') |
| Signed-off-by: Mohamad Haj Yahia <mohamad@mellanox.com> |
| Cc: kernel-team@fb.com |
| Signed-off-by: Saeed Mahameed <saeedm@mellanox.com> |
| Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> |
| --- |
| drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 41 ++++++++++++++++++++--- |
| drivers/net/ethernet/mellanox/mlx5/core/eq.c | 2 - |
| drivers/net/ethernet/mellanox/mlx5/core/health.c | 2 - |
| include/linux/mlx5/driver.h | 7 +++ |
| 4 files changed, 44 insertions(+), 8 deletions(-) |
| |
| --- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c |
| +++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c |
| @@ -767,7 +767,7 @@ static void cb_timeout_handler(struct wo |
| mlx5_core_warn(dev, "%s(0x%x) timeout. Will cause a leak of a command resource\n", |
| mlx5_command_str(msg_to_opcode(ent->in)), |
| msg_to_opcode(ent->in)); |
| - mlx5_cmd_comp_handler(dev, 1UL << ent->idx); |
| + mlx5_cmd_comp_handler(dev, 1UL << ent->idx, true); |
| } |
| |
| static void cmd_work_handler(struct work_struct *work) |
| @@ -797,6 +797,7 @@ static void cmd_work_handler(struct work |
| } |
| |
| cmd->ent_arr[ent->idx] = ent; |
| + set_bit(MLX5_CMD_ENT_STATE_PENDING_COMP, &ent->state); |
| lay = get_inst(cmd, ent->idx); |
| ent->lay = lay; |
| memset(lay, 0, sizeof(*lay)); |
| @@ -818,6 +819,20 @@ static void cmd_work_handler(struct work |
| if (ent->callback) |
| schedule_delayed_work(&ent->cb_timeout_work, cb_timeout); |
| |
| + /* Skip sending command to fw if internal error */ |
| + if (pci_channel_offline(dev->pdev) || |
| + dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) { |
| + u8 status = 0; |
| + u32 drv_synd; |
| + |
| + ent->ret = mlx5_internal_err_ret_value(dev, msg_to_opcode(ent->in), &drv_synd, &status); |
| + MLX5_SET(mbox_out, ent->out, status, status); |
| + MLX5_SET(mbox_out, ent->out, syndrome, drv_synd); |
| + |
| + mlx5_cmd_comp_handler(dev, 1UL << ent->idx, true); |
| + return; |
| + } |
| + |
| /* ring doorbell after the descriptor is valid */ |
| mlx5_core_dbg(dev, "writing 0x%x to command doorbell\n", 1 << ent->idx); |
| wmb(); |
| @@ -828,7 +843,7 @@ static void cmd_work_handler(struct work |
| poll_timeout(ent); |
| /* make sure we read the descriptor after ownership is SW */ |
| rmb(); |
| - mlx5_cmd_comp_handler(dev, 1UL << ent->idx); |
| + mlx5_cmd_comp_handler(dev, 1UL << ent->idx, (ent->ret == -ETIMEDOUT)); |
| } |
| } |
| |
| @@ -872,7 +887,7 @@ static int wait_func(struct mlx5_core_de |
| wait_for_completion(&ent->done); |
| } else if (!wait_for_completion_timeout(&ent->done, timeout)) { |
| ent->ret = -ETIMEDOUT; |
| - mlx5_cmd_comp_handler(dev, 1UL << ent->idx); |
| + mlx5_cmd_comp_handler(dev, 1UL << ent->idx, true); |
| } |
| |
| err = ent->ret; |
| @@ -1369,7 +1384,7 @@ static void free_msg(struct mlx5_core_de |
| } |
| } |
| |
| -void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, u64 vec) |
| +void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, u64 vec, bool forced) |
| { |
| struct mlx5_cmd *cmd = &dev->cmd; |
| struct mlx5_cmd_work_ent *ent; |
| @@ -1389,6 +1404,19 @@ void mlx5_cmd_comp_handler(struct mlx5_c |
| struct semaphore *sem; |
| |
| ent = cmd->ent_arr[i]; |
| + |
| + /* if we already completed the command, ignore it */ |
| + if (!test_and_clear_bit(MLX5_CMD_ENT_STATE_PENDING_COMP, |
| + &ent->state)) { |
| + /* only real completion can free the cmd slot */ |
| + if (!forced) { |
| + mlx5_core_err(dev, "Command completion arrived after timeout (entry idx = %d).\n", |
| + ent->idx); |
| + free_ent(cmd, ent->idx); |
| + } |
| + continue; |
| + } |
| + |
| if (ent->callback) |
| cancel_delayed_work(&ent->cb_timeout_work); |
| if (ent->page_queue) |
| @@ -1411,7 +1439,10 @@ void mlx5_cmd_comp_handler(struct mlx5_c |
| mlx5_core_dbg(dev, "command completed. ret 0x%x, delivery status %s(0x%x)\n", |
| ent->ret, deliv_status_to_str(ent->status), ent->status); |
| } |
| - free_ent(cmd, ent->idx); |
| + |
| + /* only real completion will free the entry slot */ |
| + if (!forced) |
| + free_ent(cmd, ent->idx); |
| |
| if (ent->callback) { |
| ds = ent->ts2 - ent->ts1; |
| --- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c |
| +++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c |
| @@ -234,7 +234,7 @@ static int mlx5_eq_int(struct mlx5_core_ |
| break; |
| |
| case MLX5_EVENT_TYPE_CMD: |
| - mlx5_cmd_comp_handler(dev, be32_to_cpu(eqe->data.cmd.vector)); |
| + mlx5_cmd_comp_handler(dev, be32_to_cpu(eqe->data.cmd.vector), false); |
| break; |
| |
| case MLX5_EVENT_TYPE_PORT_CHANGE: |
| --- a/drivers/net/ethernet/mellanox/mlx5/core/health.c |
| +++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c |
| @@ -90,7 +90,7 @@ static void trigger_cmd_completions(stru |
| spin_unlock_irqrestore(&dev->cmd.alloc_lock, flags); |
| |
| mlx5_core_dbg(dev, "vector 0x%llx\n", vector); |
| - mlx5_cmd_comp_handler(dev, vector); |
| + mlx5_cmd_comp_handler(dev, vector, true); |
| return; |
| |
| no_trig: |
| --- a/include/linux/mlx5/driver.h |
| +++ b/include/linux/mlx5/driver.h |
| @@ -640,7 +640,12 @@ enum { |
| |
| typedef void (*mlx5_cmd_cbk_t)(int status, void *context); |
| |
| +enum { |
| + MLX5_CMD_ENT_STATE_PENDING_COMP, |
| +}; |
| + |
| struct mlx5_cmd_work_ent { |
| + unsigned long state; |
| struct mlx5_cmd_msg *in; |
| struct mlx5_cmd_msg *out; |
| void *uout; |
| @@ -838,7 +843,7 @@ void mlx5_eq_pagefault(struct mlx5_core_ |
| #endif |
| void mlx5_srq_event(struct mlx5_core_dev *dev, u32 srqn, int event_type); |
| struct mlx5_core_srq *mlx5_core_get_srq(struct mlx5_core_dev *dev, u32 srqn); |
| -void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, u64 vec); |
| +void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, u64 vec, bool forced); |
| void mlx5_cq_event(struct mlx5_core_dev *dev, u32 cqn, int event_type); |
| int mlx5_create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, u8 vecidx, |
| int nent, u64 mask, const char *name, struct mlx5_uar *uar); |