| From feadf7c0a1a7c08c74bebb4a13b755f8c40e3bbc Mon Sep 17 00:00:00 2001 |
| From: Gavin Shan <shangw@linux.vnet.ibm.com> |
| Date: Mon, 17 Sep 2012 04:34:27 +0000 |
| Subject: powerpc/eeh: Lock module while handling EEH event |
| |
| From: Gavin Shan <shangw@linux.vnet.ibm.com> |
| |
| commit feadf7c0a1a7c08c74bebb4a13b755f8c40e3bbc upstream. |
| |
| The EEH core is talking with the PCI device driver to determine the |
| action (purely reset, or PCI device removal). During the period, the |
| driver might be unloaded and in turn causes kernel crash as follows: |
| |
| EEH: Detected PCI bus error on PHB#4-PE#10000 |
| EEH: This PCI device has failed 3 times in the last hour |
| lpfc 0004:01:00.0: 0:2710 PCI channel disable preparing for reset |
| Unable to handle kernel paging request for data at address 0x00000490 |
| Faulting instruction address: 0xd00000000e682c90 |
| cpu 0x1: Vector: 300 (Data Access) at [c000000fc75ffa20] |
| pc: d00000000e682c90: .lpfc_io_error_detected+0x30/0x240 [lpfc] |
| lr: d00000000e682c8c: .lpfc_io_error_detected+0x2c/0x240 [lpfc] |
| sp: c000000fc75ffca0 |
| msr: 8000000000009032 |
| dar: 490 |
| dsisr: 40000000 |
| current = 0xc000000fc79b88b0 |
| paca = 0xc00000000edb0380 softe: 0 irq_happened: 0x00 |
| pid = 3386, comm = eehd |
| enter ? for help |
| [c000000fc75ffca0] c000000fc75ffd30 (unreliable) |
| [c000000fc75ffd30] c00000000004fd3c .eeh_report_error+0x7c/0xf0 |
| [c000000fc75ffdc0] c00000000004ee00 .eeh_pe_dev_traverse+0xa0/0x180 |
| [c000000fc75ffe70] c00000000004ffd8 .eeh_handle_event+0x68/0x300 |
| [c000000fc75fff00] c0000000000503a0 .eeh_event_handler+0x130/0x1a0 |
| [c000000fc75fff90] c000000000020138 .kernel_thread+0x54/0x70 |
| 1:mon> |
| |
| The patch increases the reference of the corresponding driver modules |
| while EEH core does the negotiation with PCI device driver so that the |
| corresponding driver modules can't be unloaded during the period and |
| we're safe to refer the callbacks. |
| |
| Reported-by: Alexey Kardashevskiy <aik@ozlabs.ru> |
| Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com> |
| Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org> |
| [ herton: backported for 3.5, adjusted driver assignments, return 0 |
| instead of NULL, assume dev is not NULL ] |
| Signed-off-by: Herton Ronaldo Krzesinski <herton.krzesinski@canonical.com> |
| Cc: Ben Hutchings <ben@decadent.org.uk> |
| Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> |
| --- |
| arch/powerpc/platforms/pseries/eeh_driver.c | 95 +++++++++++++++++++++------- |
| 1 file changed, 74 insertions(+), 21 deletions(-) |
| |
| --- a/arch/powerpc/platforms/pseries/eeh_driver.c |
| +++ b/arch/powerpc/platforms/pseries/eeh_driver.c |
| @@ -25,6 +25,7 @@ |
| #include <linux/delay.h> |
| #include <linux/interrupt.h> |
| #include <linux/irq.h> |
| +#include <linux/module.h> |
| #include <linux/pci.h> |
| #include <asm/eeh.h> |
| #include <asm/eeh_event.h> |
| @@ -47,6 +48,41 @@ static inline const char *eeh_pcid_name( |
| return ""; |
| } |
| |
| +/** |
| + * eeh_pcid_get - Get the PCI device driver |
| + * @pdev: PCI device |
| + * |
| + * The function is used to retrieve the PCI device driver for |
| + * the indicated PCI device. Besides, we will increase the reference |
| + * of the PCI device driver to prevent that being unloaded on |
| + * the fly. Otherwise, kernel crash would be seen. |
| + */ |
| +static inline struct pci_driver *eeh_pcid_get(struct pci_dev *pdev) |
| +{ |
| + if (!pdev || !pdev->driver) |
| + return NULL; |
| + |
| + if (!try_module_get(pdev->driver->driver.owner)) |
| + return NULL; |
| + |
| + return pdev->driver; |
| +} |
| + |
| +/** |
| + * eeh_pcid_put - Dereference on the PCI device driver |
| + * @pdev: PCI device |
| + * |
| + * The function is called to do dereference on the PCI device |
| + * driver of the indicated PCI device. |
| + */ |
| +static inline void eeh_pcid_put(struct pci_dev *pdev) |
| +{ |
| + if (!pdev || !pdev->driver) |
| + return; |
| + |
| + module_put(pdev->driver->driver.owner); |
| +} |
| + |
| #if 0 |
| static void print_device_node_tree(struct pci_dn *pdn, int dent) |
| { |
| @@ -126,18 +162,20 @@ static void eeh_enable_irq(struct pci_de |
| static int eeh_report_error(struct pci_dev *dev, void *userdata) |
| { |
| enum pci_ers_result rc, *res = userdata; |
| - struct pci_driver *driver = dev->driver; |
| + struct pci_driver *driver; |
| |
| dev->error_state = pci_channel_io_frozen; |
| |
| - if (!driver) |
| - return 0; |
| + driver = eeh_pcid_get(dev); |
| + if (!driver) return 0; |
| |
| eeh_disable_irq(dev); |
| |
| if (!driver->err_handler || |
| - !driver->err_handler->error_detected) |
| + !driver->err_handler->error_detected) { |
| + eeh_pcid_put(dev); |
| return 0; |
| + } |
| |
| rc = driver->err_handler->error_detected(dev, pci_channel_io_frozen); |
| |
| @@ -145,6 +183,7 @@ static int eeh_report_error(struct pci_d |
| if (rc == PCI_ERS_RESULT_NEED_RESET) *res = rc; |
| if (*res == PCI_ERS_RESULT_NONE) *res = rc; |
| |
| + eeh_pcid_put(dev); |
| return 0; |
| } |
| |
| @@ -160,12 +199,16 @@ static int eeh_report_error(struct pci_d |
| static int eeh_report_mmio_enabled(struct pci_dev *dev, void *userdata) |
| { |
| enum pci_ers_result rc, *res = userdata; |
| - struct pci_driver *driver = dev->driver; |
| + struct pci_driver *driver; |
| |
| - if (!driver || |
| - !driver->err_handler || |
| - !driver->err_handler->mmio_enabled) |
| + driver = eeh_pcid_get(dev); |
| + if (!driver) return 0; |
| + |
| + if (!driver->err_handler || |
| + !driver->err_handler->mmio_enabled) { |
| + eeh_pcid_put(dev); |
| return 0; |
| + } |
| |
| rc = driver->err_handler->mmio_enabled(dev); |
| |
| @@ -173,6 +216,7 @@ static int eeh_report_mmio_enabled(struc |
| if (rc == PCI_ERS_RESULT_NEED_RESET) *res = rc; |
| if (*res == PCI_ERS_RESULT_NONE) *res = rc; |
| |
| + eeh_pcid_put(dev); |
| return 0; |
| } |
| |
| @@ -189,18 +233,20 @@ static int eeh_report_mmio_enabled(struc |
| static int eeh_report_reset(struct pci_dev *dev, void *userdata) |
| { |
| enum pci_ers_result rc, *res = userdata; |
| - struct pci_driver *driver = dev->driver; |
| - |
| - if (!driver) |
| - return 0; |
| + struct pci_driver *driver; |
| |
| dev->error_state = pci_channel_io_normal; |
| |
| + driver = eeh_pcid_get(dev); |
| + if (!driver) return 0; |
| + |
| eeh_enable_irq(dev); |
| |
| if (!driver->err_handler || |
| - !driver->err_handler->slot_reset) |
| + !driver->err_handler->slot_reset) { |
| + eeh_pcid_put(dev); |
| return 0; |
| + } |
| |
| rc = driver->err_handler->slot_reset(dev); |
| if ((*res == PCI_ERS_RESULT_NONE) || |
| @@ -208,6 +254,7 @@ static int eeh_report_reset(struct pci_d |
| if (*res == PCI_ERS_RESULT_DISCONNECT && |
| rc == PCI_ERS_RESULT_NEED_RESET) *res = rc; |
| |
| + eeh_pcid_put(dev); |
| return 0; |
| } |
| |
| @@ -222,21 +269,24 @@ static int eeh_report_reset(struct pci_d |
| */ |
| static int eeh_report_resume(struct pci_dev *dev, void *userdata) |
| { |
| - struct pci_driver *driver = dev->driver; |
| + struct pci_driver *driver; |
| |
| dev->error_state = pci_channel_io_normal; |
| |
| - if (!driver) |
| - return 0; |
| + driver = eeh_pcid_get(dev); |
| + if (!driver) return 0; |
| |
| eeh_enable_irq(dev); |
| |
| if (!driver->err_handler || |
| - !driver->err_handler->resume) |
| + !driver->err_handler->resume) { |
| + eeh_pcid_put(dev); |
| return 0; |
| + } |
| |
| driver->err_handler->resume(dev); |
| |
| + eeh_pcid_put(dev); |
| return 0; |
| } |
| |
| @@ -250,21 +300,24 @@ static int eeh_report_resume(struct pci_ |
| */ |
| static int eeh_report_failure(struct pci_dev *dev, void *userdata) |
| { |
| - struct pci_driver *driver = dev->driver; |
| + struct pci_driver *driver; |
| |
| dev->error_state = pci_channel_io_perm_failure; |
| |
| - if (!driver) |
| - return 0; |
| + driver = eeh_pcid_get(dev); |
| + if (!driver) return 0; |
| |
| eeh_disable_irq(dev); |
| |
| if (!driver->err_handler || |
| - !driver->err_handler->error_detected) |
| + !driver->err_handler->error_detected) { |
| + eeh_pcid_put(dev); |
| return 0; |
| + } |
| |
| driver->err_handler->error_detected(dev, pci_channel_io_perm_failure); |
| |
| + eeh_pcid_put(dev); |
| return 0; |
| } |
| |