| From 4f73a1afbc22b47f1ad1fc1138af484b28a160e2 Mon Sep 17 00:00:00 2001 |
| From: Sasha Levin <sashal@kernel.org> |
| Date: Fri, 11 Jun 2021 20:37:07 +0800 |
| Subject: ACPI: APEI: fix synchronous external aborts in user-mode |
| |
| From: Xiaofei Tan <tanxiaofei@huawei.com> |
| |
| [ Upstream commit ccb5ecdc2ddeaff744ee075b54cdff8a689e8fa7 ] |
| |
| Before commit 8fcc4ae6faf8 ("arm64: acpi: Make apei_claim_sea() |
| synchronise with APEI's irq work"), do_sea() would unconditionally |
| signal the affected task from the arch code. Since that change, |
| the GHES driver sends the signals. |
| |
| This exposes a problem, as errors the GHES driver doesn't understand |
| or doesn't handle effectively are silently ignored. This causes the |
| errors to be taken again and to circulate endlessly, so the user-space |
| task gets stuck in this loop. |
| |
| Existing firmware on Kunpeng9xx systems reports cache errors with the |
| 'ARM Processor Error' CPER records. |
| |
| Do memory failure handling for ARM Processor Error Section just like |
| for Memory Error Section. |
| |
| Fixes: 8fcc4ae6faf8 ("arm64: acpi: Make apei_claim_sea() synchronise with APEI's irq work") |
| Signed-off-by: Xiaofei Tan <tanxiaofei@huawei.com> |
| Reviewed-by: James Morse <james.morse@arm.com> |
| [ rjw: Subject edit ] |
| Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com> |
| Signed-off-by: Sasha Levin <sashal@kernel.org> |
| --- |
| drivers/acpi/apei/ghes.c | 81 +++++++++++++++++++++++++++++++--------- |
| 1 file changed, 64 insertions(+), 17 deletions(-) |
| |
| diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c |
| index fce7ade2aba9..0c8330ed1ffd 100644 |
| --- a/drivers/acpi/apei/ghes.c |
| +++ b/drivers/acpi/apei/ghes.c |
| @@ -441,28 +441,35 @@ static void ghes_kick_task_work(struct callback_head *head) |
| gen_pool_free(ghes_estatus_pool, (unsigned long)estatus_node, node_len); |
| } |
| |
| -static bool ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata, |
| - int sev) |
| +static bool ghes_do_memory_failure(u64 physical_addr, int flags) |
| { |
| unsigned long pfn; |
| - int flags = -1; |
| - int sec_sev = ghes_severity(gdata->error_severity); |
| - struct cper_sec_mem_err *mem_err = acpi_hest_get_payload(gdata); |
| |
| if (!IS_ENABLED(CONFIG_ACPI_APEI_MEMORY_FAILURE)) |
| return false; |
| |
| - if (!(mem_err->validation_bits & CPER_MEM_VALID_PA)) |
| - return false; |
| - |
| - pfn = mem_err->physical_addr >> PAGE_SHIFT; |
| + pfn = PHYS_PFN(physical_addr); |
| if (!pfn_valid(pfn)) { |
| pr_warn_ratelimited(FW_WARN GHES_PFX |
| "Invalid address in generic error data: %#llx\n", |
| - mem_err->physical_addr); |
| + physical_addr); |
| return false; |
| } |
| |
| + memory_failure_queue(pfn, flags); |
| + return true; |
| +} |
| + |
| +static bool ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata, |
| + int sev) |
| +{ |
| + int flags = -1; |
| + int sec_sev = ghes_severity(gdata->error_severity); |
| + struct cper_sec_mem_err *mem_err = acpi_hest_get_payload(gdata); |
| + |
| + if (!(mem_err->validation_bits & CPER_MEM_VALID_PA)) |
| + return false; |
| + |
| /* iff following two events can be handled properly by now */ |
| if (sec_sev == GHES_SEV_CORRECTED && |
| (gdata->flags & CPER_SEC_ERROR_THRESHOLD_EXCEEDED)) |
| @@ -470,14 +477,56 @@ static bool ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata, |
| if (sev == GHES_SEV_RECOVERABLE && sec_sev == GHES_SEV_RECOVERABLE) |
| flags = 0; |
| |
| - if (flags != -1) { |
| - memory_failure_queue(pfn, flags); |
| - return true; |
| - } |
| + if (flags != -1) |
| + return ghes_do_memory_failure(mem_err->physical_addr, flags); |
| |
| return false; |
| } |
| |
| +static bool ghes_handle_arm_hw_error(struct acpi_hest_generic_data *gdata, int sev) |
| +{ |
| + struct cper_sec_proc_arm *err = acpi_hest_get_payload(gdata); |
| + bool queued = false; |
| + int sec_sev, i; |
| + char *p; |
| + |
| + log_arm_hw_error(err); |
| + |
| + sec_sev = ghes_severity(gdata->error_severity); |
| + if (sev != GHES_SEV_RECOVERABLE || sec_sev != GHES_SEV_RECOVERABLE) |
| + return false; |
| + |
| + p = (char *)(err + 1); |
| + for (i = 0; i < err->err_info_num; i++) { |
| + struct cper_arm_err_info *err_info = (struct cper_arm_err_info *)p; |
| + bool is_cache = (err_info->type == CPER_ARM_CACHE_ERROR); |
| + bool has_pa = (err_info->validation_bits & CPER_ARM_INFO_VALID_PHYSICAL_ADDR); |
| + const char *error_type = "unknown error"; |
| + |
| + /* |
| + * The field (err_info->error_info & BIT(26)) is fixed to set to |
| + * 1 in some old firmware of HiSilicon Kunpeng920. We assume that |
| + * firmware won't mix corrected errors in an uncorrected section, |
| + * and don't filter out 'corrected' error here. |
| + */ |
| + if (is_cache && has_pa) { |
| + queued = ghes_do_memory_failure(err_info->physical_fault_addr, 0); |
| + p += err_info->length; |
| + continue; |
| + } |
| + |
| + if (err_info->type < ARRAY_SIZE(cper_proc_error_type_strs)) |
| + error_type = cper_proc_error_type_strs[err_info->type]; |
| + |
| + pr_warn_ratelimited(FW_WARN GHES_PFX |
| + "Unhandled processor error type: %s\n", |
| + error_type); |
| + p += err_info->length; |
| + } |
| + |
| + return queued; |
| +} |
| + |
| /* |
| * PCIe AER errors need to be sent to the AER driver for reporting and |
| * recovery. The GHES severities map to the following AER severities and |
| @@ -605,9 +654,7 @@ static bool ghes_do_proc(struct ghes *ghes, |
| ghes_handle_aer(gdata); |
| } |
| else if (guid_equal(sec_type, &CPER_SEC_PROC_ARM)) { |
| - struct cper_sec_proc_arm *err = acpi_hest_get_payload(gdata); |
| - |
| - log_arm_hw_error(err); |
| + queued = ghes_handle_arm_hw_error(gdata, sev); |
| } else { |
| void *err = acpi_hest_get_payload(gdata); |
| |
| -- |
| 2.30.2 |
| |