drm/amdgpu: Decode deferred error type in gfx aca bank parser

When injecting an uncorrected error with a background workload,
the deferred errors among the uncorrected errors need to be identified
by checking the deferred and poison bits of the status register.

v2: refine checking for deferred error
v2: log possible DEs among CEs
v2: generate CPER records for DEs among UEs

Signed-off-by: Xiang Liu <xiang.liu@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
Xiang Liu
2025-03-19 17:02:49 +08:00
committed by Alex Deucher
parent 2ec0a7c337
commit 338f7412c7
3 changed files with 36 additions and 10 deletions

View File

@@ -391,6 +391,7 @@ static void aca_banks_generate_cper(struct amdgpu_device *adev,
{
struct aca_bank_node *node;
struct aca_bank *bank;
int r;
if (!adev->cper.enabled)
return;
@@ -402,11 +403,27 @@ static void aca_banks_generate_cper(struct amdgpu_device *adev,
/* UEs must be encoded into separate CPER entries */
if (type == ACA_SMU_TYPE_UE) {
struct aca_banks de_banks;
aca_banks_init(&de_banks);
list_for_each_entry(node, &banks->list, node) {
bank = &node->bank;
if (amdgpu_cper_generate_ue_record(adev, bank))
dev_warn(adev->dev, "fail to generate ue cper records\n");
if (bank->aca_err_type == ACA_ERROR_TYPE_DEFERRED) {
r = aca_banks_add_bank(&de_banks, bank);
if (r)
dev_warn(adev->dev, "fail to add de banks, ret = %d\n", r);
} else {
if (amdgpu_cper_generate_ue_record(adev, bank))
dev_warn(adev->dev, "fail to generate ue cper records\n");
}
}
if (!list_empty(&de_banks.list)) {
if (amdgpu_cper_generate_ce_records(adev, &de_banks, de_banks.nr_banks))
dev_warn(adev->dev, "fail to generate de cper records\n");
}
aca_banks_release(&de_banks);
} else {
/*
* SMU_TYPE_CE banks are combined into 1 CPER entries,
@@ -541,6 +558,10 @@ static int __aca_get_error_data(struct amdgpu_device *adev, struct aca_handle *h
if (ret)
return ret;
/* DEs may contain in CEs or UEs */
if (type != ACA_ERROR_TYPE_DEFERRED)
aca_log_aca_error(handle, ACA_ERROR_TYPE_DEFERRED, err_data);
return aca_log_aca_error(handle, type, err_data);
}

View File

@@ -76,11 +76,17 @@ struct ras_query_context;
#define mmSMNAID_XCD1_MCA_SMU 0x38430400 /* SMN AID XCD1 */
#define mmSMNXCD_XCD0_MCA_SMU 0x40430400 /* SMN XCD XCD0 */
#define ACA_BANK_ERR_CE_DE_DECODE(bank) \
((ACA_REG__STATUS__POISON((bank)->regs[ACA_REG_IDX_STATUS]) || \
ACA_REG__STATUS__DEFERRED((bank)->regs[ACA_REG_IDX_STATUS])) ? \
ACA_ERROR_TYPE_DEFERRED : \
ACA_ERROR_TYPE_CE)
#define ACA_BANK_ERR_IS_DEFFERED(bank) \
(ACA_REG__STATUS__POISON((bank)->regs[ACA_REG_IDX_STATUS]) || \
ACA_REG__STATUS__DEFERRED((bank)->regs[ACA_REG_IDX_STATUS]))
#define ACA_BANK_ERR_CE_DE_DECODE(bank) \
(ACA_BANK_ERR_IS_DEFFERED(bank) ? ACA_ERROR_TYPE_DEFERRED : \
ACA_ERROR_TYPE_CE)
#define ACA_BANK_ERR_UE_DE_DECODE(bank) \
(ACA_BANK_ERR_IS_DEFFERED(bank) ? ACA_ERROR_TYPE_DEFERRED : \
ACA_ERROR_TYPE_UE)
enum aca_reg_idx {
ACA_REG_IDX_CTL = 0,

View File

@@ -867,9 +867,8 @@ static int gfx_v9_4_3_aca_bank_parser(struct aca_handle *handle,
switch (type) {
case ACA_SMU_TYPE_UE:
bank->aca_err_type = ACA_ERROR_TYPE_UE;
ret = aca_error_cache_log_bank_error(handle, &info,
ACA_ERROR_TYPE_UE, 1ULL);
bank->aca_err_type = ACA_BANK_ERR_UE_DE_DECODE(bank);
ret = aca_error_cache_log_bank_error(handle, &info, bank->aca_err_type, 1ULL);
break;
case ACA_SMU_TYPE_CE:
bank->aca_err_type = ACA_BANK_ERR_CE_DE_DECODE(bank);