mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2026-05-09 21:04:42 -04:00
drm/amdgpu: query umc error info from ecc_table v2
If the SMU supports ECCTABLE, the driver can message the SMU to get the ecc_table
and then query the UMC error info from ECCTABLE.
v2:
Optimize the source code to make the logic more reasonable.
Signed-off-by: Stanley.Yang <Stanley.Yang@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
committed by
Alex Deucher
parent
edd7942085
commit
fdcb279d5b
@@ -892,6 +892,38 @@ void amdgpu_ras_mca_query_error_status(struct amdgpu_device *adev,
|
||||
}
|
||||
}
|
||||
|
||||
static void amdgpu_ras_get_ecc_info(struct amdgpu_device *adev, struct ras_err_data *err_data)
|
||||
{
|
||||
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
|
||||
int ret = 0;
|
||||
|
||||
/*
|
||||
* choosing right query method according to
|
||||
* whether smu support query error information
|
||||
*/
|
||||
ret = smu_get_ecc_info(&adev->smu, (void *)&(ras->umc_ecc));
|
||||
if (ret == -EOPNOTSUPP) {
|
||||
if (adev->umc.ras_funcs &&
|
||||
adev->umc.ras_funcs->query_ras_error_count)
|
||||
adev->umc.ras_funcs->query_ras_error_count(adev, err_data);
|
||||
|
||||
/* umc query_ras_error_address is also responsible for clearing
|
||||
* error status
|
||||
*/
|
||||
if (adev->umc.ras_funcs &&
|
||||
adev->umc.ras_funcs->query_ras_error_address)
|
||||
adev->umc.ras_funcs->query_ras_error_address(adev, err_data);
|
||||
} else if (!ret) {
|
||||
if (adev->umc.ras_funcs &&
|
||||
adev->umc.ras_funcs->ecc_info_query_ras_error_count)
|
||||
adev->umc.ras_funcs->ecc_info_query_ras_error_count(adev, err_data);
|
||||
|
||||
if (adev->umc.ras_funcs &&
|
||||
adev->umc.ras_funcs->ecc_info_query_ras_error_address)
|
||||
adev->umc.ras_funcs->ecc_info_query_ras_error_address(adev, err_data);
|
||||
}
|
||||
}
|
||||
|
||||
/* query/inject/cure begin */
|
||||
int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
|
||||
struct ras_query_if *info)
|
||||
@@ -905,15 +937,7 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
|
||||
|
||||
switch (info->head.block) {
|
||||
case AMDGPU_RAS_BLOCK__UMC:
|
||||
if (adev->umc.ras_funcs &&
|
||||
adev->umc.ras_funcs->query_ras_error_count)
|
||||
adev->umc.ras_funcs->query_ras_error_count(adev, &err_data);
|
||||
/* umc query_ras_error_address is also responsible for clearing
|
||||
* error status
|
||||
*/
|
||||
if (adev->umc.ras_funcs &&
|
||||
adev->umc.ras_funcs->query_ras_error_address)
|
||||
adev->umc.ras_funcs->query_ras_error_address(adev, &err_data);
|
||||
amdgpu_ras_get_ecc_info(adev, &err_data);
|
||||
break;
|
||||
case AMDGPU_RAS_BLOCK__SDMA:
|
||||
if (adev->sdma.funcs->query_ras_error_count) {
|
||||
|
||||
@@ -94,30 +94,58 @@ int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev,
|
||||
{
|
||||
struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
|
||||
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
|
||||
int ret = 0;
|
||||
|
||||
kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
|
||||
if (adev->umc.ras_funcs &&
|
||||
adev->umc.ras_funcs->query_ras_error_count)
|
||||
adev->umc.ras_funcs->query_ras_error_count(adev, ras_error_status);
|
||||
ret = smu_get_ecc_info(&adev->smu, (void *)&(con->umc_ecc));
|
||||
if (ret == -EOPNOTSUPP) {
|
||||
if (adev->umc.ras_funcs &&
|
||||
adev->umc.ras_funcs->query_ras_error_count)
|
||||
adev->umc.ras_funcs->query_ras_error_count(adev, ras_error_status);
|
||||
|
||||
if (adev->umc.ras_funcs &&
|
||||
adev->umc.ras_funcs->query_ras_error_address &&
|
||||
adev->umc.max_ras_err_cnt_per_query) {
|
||||
err_data->err_addr =
|
||||
kcalloc(adev->umc.max_ras_err_cnt_per_query,
|
||||
sizeof(struct eeprom_table_record), GFP_KERNEL);
|
||||
if (adev->umc.ras_funcs &&
|
||||
adev->umc.ras_funcs->query_ras_error_address &&
|
||||
adev->umc.max_ras_err_cnt_per_query) {
|
||||
err_data->err_addr =
|
||||
kcalloc(adev->umc.max_ras_err_cnt_per_query,
|
||||
sizeof(struct eeprom_table_record), GFP_KERNEL);
|
||||
|
||||
/* still call query_ras_error_address to clear error status
|
||||
* even NOMEM error is encountered
|
||||
*/
|
||||
if(!err_data->err_addr)
|
||||
dev_warn(adev->dev, "Failed to alloc memory for "
|
||||
"umc error address record!\n");
|
||||
/* still call query_ras_error_address to clear error status
|
||||
* even NOMEM error is encountered
|
||||
*/
|
||||
if(!err_data->err_addr)
|
||||
dev_warn(adev->dev, "Failed to alloc memory for "
|
||||
"umc error address record!\n");
|
||||
|
||||
/* umc query_ras_error_address is also responsible for clearing
|
||||
* error status
|
||||
*/
|
||||
adev->umc.ras_funcs->query_ras_error_address(adev, ras_error_status);
|
||||
/* umc query_ras_error_address is also responsible for clearing
|
||||
* error status
|
||||
*/
|
||||
adev->umc.ras_funcs->query_ras_error_address(adev, ras_error_status);
|
||||
}
|
||||
} else if (!ret) {
|
||||
if (adev->umc.ras_funcs &&
|
||||
adev->umc.ras_funcs->ecc_info_query_ras_error_count)
|
||||
adev->umc.ras_funcs->ecc_info_query_ras_error_count(adev, ras_error_status);
|
||||
|
||||
if (adev->umc.ras_funcs &&
|
||||
adev->umc.ras_funcs->ecc_info_query_ras_error_address &&
|
||||
adev->umc.max_ras_err_cnt_per_query) {
|
||||
err_data->err_addr =
|
||||
kcalloc(adev->umc.max_ras_err_cnt_per_query,
|
||||
sizeof(struct eeprom_table_record), GFP_KERNEL);
|
||||
|
||||
/* still call query_ras_error_address to clear error status
|
||||
* even NOMEM error is encountered
|
||||
*/
|
||||
if(!err_data->err_addr)
|
||||
dev_warn(adev->dev, "Failed to alloc memory for "
|
||||
"umc error address record!\n");
|
||||
|
||||
/* umc query_ras_error_address is also responsible for clearing
|
||||
* error status
|
||||
*/
|
||||
adev->umc.ras_funcs->ecc_info_query_ras_error_address(adev, ras_error_status);
|
||||
}
|
||||
}
|
||||
|
||||
/* only uncorrectable error needs gpu reset */
|
||||
|
||||
Reference in New Issue
Block a user