linux-loongson/include/acpi/ghes.h
Shuai Xue c1f1fda141 ACPI: APEI: handle synchronous exceptions in task work
The memory uncorrected error could be signaled by asynchronous interrupt
(specifically, SPI in arm64 platform), e.g. when an error is detected by
a background scrubber, or signaled by synchronous exception
(specifically, data abort exception in arm64 platform), e.g. when a CPU
tries to access a poisoned cache line. Currently, both synchronous and
asynchronous errors use memory_failure_queue() to schedule
memory_failure() to exectute in a kworker context.

As a result, when a user-space process is accessing a poisoned data, a
data abort is taken and the memory_failure() is executed in the kworker
context, which:

  - will send wrong si_code by SIGBUS signal in early_kill mode, and
  - can not kill the user-space in some cases resulting a synchronous
    error infinite loop

Issue 1: send wrong si_code in early_kill mode

Since commit a70297d221 ("ACPI: APEI: set memory failure flags as
MF_ACTION_REQUIRED on synchronous events")', the flag MF_ACTION_REQUIRED
could be used to determine whether a synchronous exception occurs on
ARM64 platform.  When a synchronous exception is detected, the kernel is
expected to terminate the current process which has accessed a poisoned
page. This is done by sending a SIGBUS signal with error code
BUS_MCEERR_AR, indicating an action-required machine check error on
read.

However, when kill_proc() is called to terminate the processes who has
the poisoned page mapped, it sends the incorrect SIGBUS error code
BUS_MCEERR_AO because the context in which it operates is not the one
where the error was triggered.

To reproduce this problem:

  #sysctl -w vm.memory_failure_early_kill=1
  vm.memory_failure_early_kill = 1

  # STEP2: inject an UCE error and consume it to trigger a synchronous error
  #einj_mem_uc single
  0: single   vaddr = 0xffffb0d75400 paddr = 4092d55b400
  injecting ...
  triggering ...
  signal 7 code 5 addr 0xffffb0d75000
  page not present
  Test passed

The si_code (code 5) from einj_mem_uc indicates that it is BUS_MCEERR_AO
error and it is not factually correct.

After this change:

  # STEP1: enable early kill mode
  #sysctl -w vm.memory_failure_early_kill=1
  vm.memory_failure_early_kill = 1
  # STEP2: inject an UCE error and consume it to trigger a synchronous error
  #einj_mem_uc single
  0: single   vaddr = 0xffffb0d75400 paddr = 4092d55b400
  injecting ...
  triggering ...
  signal 7 code 4 addr 0xffffb0d75000
  page not present
  Test passed

The si_code (code 4) from einj_mem_uc indicates that it is a BUS_MCEERR_AR
error as expected.

Issue 2: a synchronous error infinite loop

If a user-space process, e.g. devmem, accesses a poisoned page for which
the HWPoison flag is set, kill_accessing_process() is called to send
SIGBUS to current processs with error info. Since the memory_failure()
is executed in the kworker context, it will just do nothing but return
EFAULT. So, devmem will access the posioned page and trigger an
exception again, resulting in a synchronous error infinite loop. Such
exception loop may cause platform firmware to exceed some threshold and
reboot when Linux could have recovered from this error.

To reproduce this problem:

  # STEP 1: inject an UCE error, and kernel will set HWPosion flag for related page
  #einj_mem_uc single
  0: single   vaddr = 0xffffb0d75400 paddr = 4092d55b400
  injecting ...
  triggering ...
  signal 7 code 4 addr 0xffffb0d75000
  page not present
  Test passed

  # STEP 2: access the same page and it will trigger a synchronous error infinite loop
  devmem 0x4092d55b400

To fix above two issues, queue memory_failure() as a task_work so that
it runs in the context of the process that is actually consuming the
poisoned data.

Signed-off-by: Shuai Xue <xueshuai@linux.alibaba.com>
Tested-by: Ma Wupeng <mawupeng1@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Reviewed-by: Xiaofei Tan <tanxiaofei@huawei.com>
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Reviewed-by: Jane Chu <jane.chu@oracle.com>
Reviewed-by: Yazen Ghannam <yazen.ghannam@amd.com>
Reviewed-by: Hanjun Guo <guohanjun@huawei.com>
Link: https://patch.msgid.link/20250714114212.31660-3-xueshuai@linux.alibaba.com
[ rjw: Changelog edits ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2025-07-16 21:08:04 +02:00

134 lines
3.5 KiB
C

/* SPDX-License-Identifier: GPL-2.0 */
#ifndef GHES_H
#define GHES_H
#include <acpi/apei.h>
#include <acpi/hed.h>
/*
* One struct ghes is created for each generic hardware error source.
* It provides the context for APEI hardware error timer/IRQ/SCI/NMI
* handler.
*
* estatus: memory buffer for error status block, allocated during
* HEST parsing.
*/
#define GHES_EXITING 0x0002
struct ghes {
union {
struct acpi_hest_generic *generic;
struct acpi_hest_generic_v2 *generic_v2;
};
struct acpi_hest_generic_status *estatus;
unsigned long flags;
union {
struct list_head list;
struct timer_list timer;
unsigned int irq;
};
struct device *dev;
struct list_head elist;
};
struct ghes_estatus_node {
struct llist_node llnode;
struct acpi_hest_generic *generic;
struct ghes *ghes;
};
struct ghes_estatus_cache {
u32 estatus_len;
atomic_t count;
struct acpi_hest_generic *generic;
unsigned long long time_in;
struct rcu_head rcu;
};
enum {
GHES_SEV_NO = 0x0,
GHES_SEV_CORRECTED = 0x1,
GHES_SEV_RECOVERABLE = 0x2,
GHES_SEV_PANIC = 0x3,
};
#ifdef CONFIG_ACPI_APEI_GHES
/**
* ghes_register_vendor_record_notifier - register a notifier for vendor
* records that the kernel would otherwise ignore.
* @nb: pointer to the notifier_block structure of the event handler.
*
* return 0 : SUCCESS, non-zero : FAIL
*/
int ghes_register_vendor_record_notifier(struct notifier_block *nb);
/**
* ghes_unregister_vendor_record_notifier - unregister the previously
* registered vendor record notifier.
* @nb: pointer to the notifier_block structure of the vendor record handler.
*/
void ghes_unregister_vendor_record_notifier(struct notifier_block *nb);
struct list_head *ghes_get_devices(void);
void ghes_estatus_pool_region_free(unsigned long addr, u32 size);
#else
static inline struct list_head *ghes_get_devices(void) { return NULL; }
static inline void ghes_estatus_pool_region_free(unsigned long addr, u32 size) { return; }
#endif
int ghes_estatus_pool_init(unsigned int num_ghes);
static inline int acpi_hest_get_version(struct acpi_hest_generic_data *gdata)
{
return gdata->revision >> 8;
}
static inline void *acpi_hest_get_payload(struct acpi_hest_generic_data *gdata)
{
if (acpi_hest_get_version(gdata) >= 3)
return (void *)(((struct acpi_hest_generic_data_v300 *)(gdata)) + 1);
return gdata + 1;
}
static inline int acpi_hest_get_error_length(struct acpi_hest_generic_data *gdata)
{
return ((struct acpi_hest_generic_data *)(gdata))->error_data_length;
}
static inline int acpi_hest_get_size(struct acpi_hest_generic_data *gdata)
{
if (acpi_hest_get_version(gdata) >= 3)
return sizeof(struct acpi_hest_generic_data_v300);
return sizeof(struct acpi_hest_generic_data);
}
static inline int acpi_hest_get_record_size(struct acpi_hest_generic_data *gdata)
{
return (acpi_hest_get_size(gdata) + acpi_hest_get_error_length(gdata));
}
static inline void *acpi_hest_get_next(struct acpi_hest_generic_data *gdata)
{
return (void *)(gdata) + acpi_hest_get_record_size(gdata);
}
#define apei_estatus_for_each_section(estatus, section) \
for (section = (struct acpi_hest_generic_data *)(estatus + 1); \
(void *)section - (void *)(estatus + 1) < estatus->data_length; \
section = acpi_hest_get_next(section))
#ifdef CONFIG_ACPI_APEI_SEA
int ghes_notify_sea(void);
#else
static inline int ghes_notify_sea(void) { return -ENOENT; }
#endif
struct notifier_block;
extern void ghes_register_report_chain(struct notifier_block *nb);
extern void ghes_unregister_report_chain(struct notifier_block *nb);
#endif /* GHES_H */