| From foo@baz Wed Feb 28 16:16:23 CET 2018 |
| From: "Guilherme G. Piccoli" <gpiccoli@linux.vnet.ibm.com> |
| Date: Fri, 22 Dec 2017 13:01:39 -0200 |
| Subject: bnx2x: Improve reliability in case of nested PCI errors |
| |
| From: "Guilherme G. Piccoli" <gpiccoli@linux.vnet.ibm.com> |
| |
| |
| [ Upstream commit f7084059a9cb9e56a186e1677b1dcffd76c2cd24 ] |
| |
| While in the recovery process of a PCI error (called EEH on the PowerPC arch), |
| another PCI transaction could be corrupted, causing a situation of |
| nested PCI errors. This scenario can also be reproduced with |
| error injection mechanisms (for debug purposes). |
| |
| We observe that in case of nested PCI errors, bnx2x might attempt to |
| initialize its shmem and cause a kernel crash due to bad addresses |
| read from MCP. Multiple different stack traces were observed depending |
| on the point at which the second PCI error happens. |
| |
| This patch avoids the crashes by: |
| |
| * failing PCI recovery in case of nested errors (since multiple |
| PCI errors in a row are not expected to lead to a functional |
| adapter anyway), and by |
| |
| * preventing access to the adapter FW when the MCP has failed (we mark it as |
| failed when shmem cannot be initialized properly). |
| |
| Reported-by: Abdul Haleem <abdhalee@linux.vnet.ibm.com> |
| Signed-off-by: Guilherme G. Piccoli <gpiccoli@linux.vnet.ibm.com> |
| Acked-by: Shahed Shaikh <Shahed.Shaikh@cavium.com> |
| Signed-off-by: David S. Miller <davem@davemloft.net> |
| Signed-off-by: Sasha Levin <alexander.levin@microsoft.com> |
| Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> |
| --- |
| drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c | 4 ++-- |
| drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c | 14 +++++++++++++- |
| 2 files changed, 15 insertions(+), 3 deletions(-) |
| |
| --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c |
| +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c |
| @@ -2994,7 +2994,7 @@ int bnx2x_nic_unload(struct bnx2x *bp, i |
| |
| del_timer_sync(&bp->timer); |
| |
| - if (IS_PF(bp)) { |
| + if (IS_PF(bp) && !BP_NOMCP(bp)) { |
| /* Set ALWAYS_ALIVE bit in shmem */ |
| bp->fw_drv_pulse_wr_seq |= DRV_PULSE_ALWAYS_ALIVE; |
| bnx2x_drv_pulse(bp); |
| @@ -3076,7 +3076,7 @@ int bnx2x_nic_unload(struct bnx2x *bp, i |
| bp->cnic_loaded = false; |
| |
| /* Clear driver version indication in shmem */ |
| - if (IS_PF(bp)) |
| + if (IS_PF(bp) && !BP_NOMCP(bp)) |
| bnx2x_update_mng_version(bp); |
| |
| /* Check if there are pending parity attentions. If there are - set |
| --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c |
| +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c |
| @@ -9472,6 +9472,15 @@ static int bnx2x_init_shmem(struct bnx2x |
| |
| do { |
| bp->common.shmem_base = REG_RD(bp, MISC_REG_SHARED_MEM_ADDR); |
| + |
| + /* If we read all 0xFFs, means we are in PCI error state and |
| + * should bail out to avoid crashes on adapter's FW reads. |
| + */ |
| + if (bp->common.shmem_base == 0xFFFFFFFF) { |
| + bp->flags |= NO_MCP_FLAG; |
| + return -ENODEV; |
| + } |
| + |
| if (bp->common.shmem_base) { |
| val = SHMEM_RD(bp, validity_map[BP_PORT(bp)]); |
| if (val & SHR_MEM_VALIDITY_MB) |
| @@ -13743,7 +13752,10 @@ static pci_ers_result_t bnx2x_io_slot_re |
| BNX2X_ERR("IO slot reset --> driver unload\n"); |
| |
| /* MCP should have been reset; Need to wait for validity */ |
| - bnx2x_init_shmem(bp); |
| + if (bnx2x_init_shmem(bp)) { |
| + rtnl_unlock(); |
| + return PCI_ERS_RESULT_DISCONNECT; |
| + } |
| |
| if (IS_PF(bp) && SHMEM2_HAS(bp, drv_capabilities_flag)) { |
| u32 v; |