diff --git a/drivers/gpu/drm/xe/Makefile b/drivers/gpu/drm/xe/Makefile index 9122e4318773..ef900b9f0457 100644 --- a/drivers/gpu/drm/xe/Makefile +++ b/drivers/gpu/drm/xe/Makefile @@ -96,6 +96,7 @@ xe-y += xe_bb.o \ xe_sa.o \ xe_sched_job.o \ xe_step.o \ + xe_survivability_mode.o \ xe_sync.o \ xe_tile.o \ xe_tile_sysfs.o \ diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h index 58e79e19deaa..89f532b67bc4 100644 --- a/drivers/gpu/drm/xe/xe_device_types.h +++ b/drivers/gpu/drm/xe/xe_device_types.h @@ -22,6 +22,7 @@ #include "xe_pt_types.h" #include "xe_sriov_types.h" #include "xe_step_types.h" +#include "xe_survivability_mode_types.h" #if IS_ENABLED(CONFIG_DRM_XE_DEBUG) #define TEST_VM_OPS_ERROR @@ -331,6 +332,9 @@ struct xe_device { u8 skip_pcode:1; } info; + /** @survivability: survivability information for device */ + struct xe_survivability survivability; + /** @irq: device interrupt state */ struct { /** @irq.lock: lock for processing irq's on this device */ diff --git a/drivers/gpu/drm/xe/xe_pcode_api.h b/drivers/gpu/drm/xe/xe_pcode_api.h index f153ce96f69a..2bae9afdbd35 100644 --- a/drivers/gpu/drm/xe/xe_pcode_api.h +++ b/drivers/gpu/drm/xe/xe_pcode_api.h @@ -49,6 +49,20 @@ /* Domain IDs (param2) */ #define PCODE_MBOX_DOMAIN_HBM 0x2 +#define PCODE_SCRATCH(x) XE_REG(0x138320 + ((x) * 4)) +/* PCODE_SCRATCH0 */ +#define AUXINFO_REG_OFFSET REG_GENMASK(17, 15) +#define OVERFLOW_REG_OFFSET REG_GENMASK(14, 12) +#define HISTORY_TRACKING REG_BIT(11) +#define OVERFLOW_SUPPORT REG_BIT(10) +#define AUXINFO_SUPPORT REG_BIT(9) +#define BOOT_STATUS REG_GENMASK(3, 1) +#define CRITICAL_FAILURE 4 +#define NON_CRITICAL_FAILURE 7 + +/* Auxiliary info bits */ +#define AUXINFO_HISTORY_OFFSET REG_GENMASK(31, 29) + struct pcode_err_decode { int errno; const char *str; diff --git a/drivers/gpu/drm/xe/xe_survivability_mode.c b/drivers/gpu/drm/xe/xe_survivability_mode.c new file mode 100644 index 000000000000..9911e9f6b99b --- /dev/null +++ b/drivers/gpu/drm/xe/xe_survivability_mode.c @@ -0,0 +1,215 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2025 Intel Corporation + */ + +#include "xe_survivability_mode.h" +#include "xe_survivability_mode_types.h" + +#include +#include +#include + +#include "xe_device.h" +#include "xe_gt.h" +#include "xe_mmio.h" +#include "xe_pcode_api.h" + +#define MAX_SCRATCH_MMIO 8 + +/** + * DOC: Xe Boot Survivability + * + * Boot Survivability is a software based workflow for recovering a system in a failed boot state + * Here system recoverability is concerned with recovering the firmware responsible for boot. + * + * This is implemented by loading the driver with bare minimum (no drm card) to allow the firmware + * to be flashed through mei and collect telemetry. The driver's probe flow is modified + * such that it enters survivability mode when pcode initialization is incomplete and boot status + * denotes a failure. The driver then populates the survivability_mode PCI sysfs indicating + * survivability mode and provides additional information required for debug + * + * KMD exposes below admin-only readable sysfs in survivability mode + * + * device/survivability_mode: The presence of this file indicates that the card is in survivability + * mode. Also, provides additional information on why the driver entered + * survivability mode. + * + * Capability Information - Provides boot status + * Postcode Information - Provides information about the failure + * Overflow Information - Provides history of previous failures + * Auxiliary Information - Certain failures may have information in + * addition to postcode information + */ + +static u32 aux_history_offset(u32 reg_value) +{ + return REG_FIELD_GET(AUXINFO_HISTORY_OFFSET, reg_value); +} + +static void set_survivability_info(struct xe_mmio *mmio, struct xe_survivability_info *info, + int id, char *name) +{ + strscpy(info[id].name, name, sizeof(info[id].name)); + info[id].reg = PCODE_SCRATCH(id).raw; + info[id].value = xe_mmio_read32(mmio, PCODE_SCRATCH(id)); +} + +static void populate_survivability_info(struct xe_device *xe) +{ + struct xe_survivability *survivability = &xe->survivability; + struct xe_survivability_info *info = survivability->info; + struct xe_mmio *mmio; + u32 id = 0, reg_value; + char name[NAME_MAX]; + int index; + + mmio = xe_root_tile_mmio(xe); + set_survivability_info(mmio, info, id, "Capability Info"); + reg_value = info[id].value; + + if (reg_value & HISTORY_TRACKING) { + id++; + set_survivability_info(mmio, info, id, "Postcode Info"); + + if (reg_value & OVERFLOW_SUPPORT) { + id = REG_FIELD_GET(OVERFLOW_REG_OFFSET, reg_value); + set_survivability_info(mmio, info, id, "Overflow Info"); + } + } + + if (reg_value & AUXINFO_SUPPORT) { + id = REG_FIELD_GET(AUXINFO_REG_OFFSET, reg_value); + + for (index = 0; id && reg_value; index++, reg_value = info[id].value, + id = aux_history_offset(reg_value)) { + snprintf(name, NAME_MAX, "Auxiliary Info %d", index); + set_survivability_info(mmio, info, id, name); + } + } +} + +static void log_survivability_info(struct pci_dev *pdev) +{ + struct xe_device *xe = pdev_to_xe_device(pdev); + struct xe_survivability *survivability = &xe->survivability; + struct xe_survivability_info *info = survivability->info; + int id; + + dev_info(&pdev->dev, "Survivability Boot Status : Critical Failure (%d)\n", + survivability->boot_status); + for (id = 0; id < MAX_SCRATCH_MMIO; id++) { + if (info[id].reg) + dev_info(&pdev->dev, "%s: 0x%x - 0x%x\n", info[id].name, + info[id].reg, info[id].value); + } +} + +static ssize_t survivability_mode_show(struct device *dev, + struct device_attribute *attr, char *buff) +{ + struct pci_dev *pdev = to_pci_dev(dev); + struct xe_device *xe = pdev_to_xe_device(pdev); + struct xe_survivability *survivability = &xe->survivability; + struct xe_survivability_info *info = survivability->info; + int index = 0, count = 0; + + for (index = 0; index < MAX_SCRATCH_MMIO; index++) { + if (info[index].reg) + count += sysfs_emit_at(buff, count, "%s: 0x%x - 0x%x\n", info[index].name, + info[index].reg, info[index].value); + } + + return count; +} + +static DEVICE_ATTR_ADMIN_RO(survivability_mode); + +static void enable_survivability_mode(struct pci_dev *pdev) +{ + struct device *dev = &pdev->dev; + struct xe_device *xe = pdev_to_xe_device(pdev); + struct xe_survivability *survivability = &xe->survivability; + int ret = 0; + + /* set survivability mode */ + survivability->mode = true; + dev_info(dev, "In Survivability Mode\n"); + + /* create survivability mode sysfs */ + ret = sysfs_create_file(&dev->kobj, &dev_attr_survivability_mode.attr); + if (ret) { + dev_warn(dev, "Failed to create survivability sysfs files\n"); + return; + } +} + +/** + * xe_survivability_mode_required - checks if survivability mode is required + * @xe: xe device instance + * + * This function reads the boot status from Pcode + * + * Return: true if boot status indicates failure, false otherwise + */ +bool xe_survivability_mode_required(struct xe_device *xe) +{ + struct xe_survivability *survivability = &xe->survivability; + struct xe_mmio *mmio = xe_root_tile_mmio(xe); + u32 data; + + data = xe_mmio_read32(mmio, PCODE_SCRATCH(0)); + survivability->boot_status = REG_FIELD_GET(BOOT_STATUS, data); + + return (survivability->boot_status == NON_CRITICAL_FAILURE || + survivability->boot_status == CRITICAL_FAILURE); +} + +/** + * xe_survivability_mode_remove - remove survivability mode + * @xe: xe device instance + * + * clean up sysfs entries of survivability mode + */ +void xe_survivability_mode_remove(struct xe_device *xe) +{ + struct xe_survivability *survivability = &xe->survivability; + struct pci_dev *pdev = to_pci_dev(xe->drm.dev); + struct device *dev = &pdev->dev; + + sysfs_remove_file(&dev->kobj, &dev_attr_survivability_mode.attr); + kfree(survivability->info); + pci_set_drvdata(pdev, NULL); +} + +/** + * xe_survivability_mode_init - Initialize the survivability mode + * @xe: xe device instance + * + * Initializes survivability information and enables survivability mode + */ +void xe_survivability_mode_init(struct xe_device *xe) +{ + struct xe_survivability *survivability = &xe->survivability; + struct xe_survivability_info *info; + struct pci_dev *pdev = to_pci_dev(xe->drm.dev); + + survivability->size = MAX_SCRATCH_MMIO; + + info = kcalloc(survivability->size, sizeof(*info), GFP_KERNEL); + if (!info) + return; + + survivability->info = info; + + populate_survivability_info(xe); + + /* Only log debug information and exit if it is a critical failure */ + if (survivability->boot_status == CRITICAL_FAILURE) { + log_survivability_info(pdev); + kfree(survivability->info); + return; + } + + enable_survivability_mode(pdev); +} diff --git a/drivers/gpu/drm/xe/xe_survivability_mode.h b/drivers/gpu/drm/xe/xe_survivability_mode.h new file mode 100644 index 000000000000..410e3ee5f5d1 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_survivability_mode.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2025 Intel Corporation + */ + +#ifndef _XE_SURVIVABILITY_MODE_H_ +#define _XE_SURVIVABILITY_MODE_H_ + +#include + +struct xe_device; + +void xe_survivability_mode_init(struct xe_device *xe); +void xe_survivability_mode_remove(struct xe_device *xe); +bool xe_survivability_mode_required(struct xe_device *xe); + +#endif /* _XE_SURVIVABILITY_MODE_H_ */ diff --git a/drivers/gpu/drm/xe/xe_survivability_mode_types.h b/drivers/gpu/drm/xe/xe_survivability_mode_types.h new file mode 100644 index 000000000000..19d433e253df --- /dev/null +++ b/drivers/gpu/drm/xe/xe_survivability_mode_types.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2025 Intel Corporation + */ + +#ifndef _XE_SURVIVABILITY_MODE_TYPES_H_ +#define _XE_SURVIVABILITY_MODE_TYPES_H_ + +#include +#include + +struct xe_survivability_info { + char name[NAME_MAX]; + u32 reg; + u32 value; +}; + +/** + * struct xe_survivability: Contains survivability mode information + */ +struct xe_survivability { + /** @info: struct that holds survivability info from scratch registers */ + struct xe_survivability_info *info; + + /** @size: number of scratch registers */ + u32 size; + + /** @boot_status: indicates critical/non critical boot failure */ + u8 boot_status; + + /** @mode: boolean to indicate survivability mode */ + bool mode; +}; + +#endif /* _XE_SURVIVABILITY_MODE_TYPES_H_ */