| From 09c51bb692bbab20a244b69a028cc60b63e1f94e Mon Sep 17 00:00:00 2001 |
| From: Steffen Maier <maier@linux.ibm.com> |
| Date: Tue, 1 Oct 2019 12:49:49 +0200 |
| Subject: [PATCH] scsi: zfcp: fix reaction on bit error threshold notification |
| |
| commit 2190168aaea42c31bff7b9a967e7b045f07df095 upstream. |
| |
| On excessive bit errors for the FCP channel ingress fibre path, the channel |
| notifies us. Previously, we only emitted a kernel message and a trace |
| record. Since performance can become suboptimal with I/O timeouts due to |
| bit errors, we now stop using an FCP device by default on channel |
| notification so multipath on top can fail over to other paths in a timely |
| manner. Disabling the new module parameter zfcp.ber_stop restores the old |
| zfcp behavior. |
| |
| User explanation of new kernel message: |
| |
| * Description: |
| * The FCP channel reported that its bit error threshold has been exceeded. |
| * These errors might result from a problem with the physical components |
| * of the local fibre link into the FCP channel. |
| * The problem might be damage or malfunction of the cable or |
| * cable connection between the FCP channel and |
| * the adjacent fabric switch port or the point-to-point peer. |
| * Find details about the errors in the HBA trace for the FCP device. |
| * The zfcp device driver closed down the FCP device |
| * to limit the performance impact from possible I/O command timeouts. |
| * User action: |
| * Check for problems on the local fibre link, ensure that fibre optics are |
| * clean and functional, and all cables are properly plugged. |
| * After the repair action, you can manually recover the FCP device by |
| * writing "0" into its "failed" sysfs attribute. |
| * If recovery through sysfs is not possible, set the CHPID of the device |
| * offline and back online on the service element. |
| |
| Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") |
| Cc: <stable@vger.kernel.org> #2.6.30+ |
| Link: https://lore.kernel.org/r/20191001104949.42810-1-maier@linux.ibm.com |
| Reviewed-by: Jens Remus <jremus@linux.ibm.com> |
| Reviewed-by: Benjamin Block <bblock@linux.ibm.com> |
| Signed-off-by: Steffen Maier <maier@linux.ibm.com> |
| Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com> |
| Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com> |
| |
| diff --git a/drivers/s390/scsi/zfcp_fsf.c b/drivers/s390/scsi/zfcp_fsf.c |
| index 296bbc3c4606..cf63916814cc 100644 |
| --- a/drivers/s390/scsi/zfcp_fsf.c |
| +++ b/drivers/s390/scsi/zfcp_fsf.c |
| @@ -27,6 +27,11 @@ |
| |
| struct kmem_cache *zfcp_fsf_qtcb_cache; |
| |
| +static bool ber_stop = true; |
| +module_param(ber_stop, bool, 0600); |
| +MODULE_PARM_DESC(ber_stop, |
| + "Shuts down FCP devices for FCP channels that report a bit-error count in excess of its threshold (default on)"); |
| + |
| static void zfcp_fsf_request_timeout_handler(struct timer_list *t) |
| { |
| struct zfcp_fsf_req *fsf_req = from_timer(fsf_req, t, timer); |
| @@ -236,10 +241,15 @@ static void zfcp_fsf_status_read_handler(struct zfcp_fsf_req *req) |
| case FSF_STATUS_READ_SENSE_DATA_AVAIL: |
| break; |
| case FSF_STATUS_READ_BIT_ERROR_THRESHOLD: |
| - dev_warn(&adapter->ccw_device->dev, |
| - "The error threshold for checksum statistics " |
| - "has been exceeded\n"); |
| zfcp_dbf_hba_bit_err("fssrh_3", req); |
| + if (ber_stop) { |
| + dev_warn(&adapter->ccw_device->dev, |
| + "All paths over this FCP device are disused because of excessive bit errors\n"); |
| + zfcp_erp_adapter_shutdown(adapter, 0, "fssrh_b"); |
| + } else { |
| + dev_warn(&adapter->ccw_device->dev, |
| + "The error threshold for checksum statistics has been exceeded\n"); |
| + } |
| break; |
| case FSF_STATUS_READ_LINK_DOWN: |
| zfcp_fsf_status_read_link_down(req); |
| -- |
| 2.7.4 |
| |