mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2026-05-03 22:57:21 -04:00
drm/amdgpu: Add ras helper to query boot errors v2
Add ras helper function to query boot time gpu errors.

v2: use aqua_vanjaram smn addressing pattern

Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com>
Reviewed-by: Yang Wang <kevinyang.wang@amd.com>
Reviewed-by: Le Ma <le.ma@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
committed by
Alex Deucher
parent
f5e4cc8461
commit
cce4febb27
@@ -1333,6 +1333,7 @@ int emu_soc_asic_init(struct amdgpu_device *adev);
|
||||
/*
 * Read-modify-write of a single field in the register at mm<reg> + offset:
 * the current value is read, the bits of REG_FIELD_MASK(reg, field) are
 * cleared, and val shifted by REG_FIELD_SHIFT(reg, field) is OR-ed in before
 * the result is written back.
 *
 * offset is parenthesized so that an expression argument (e.g. "a ? b : c")
 * cannot re-associate with the "mm##reg +" in front of it. Note that offset
 * is evaluated twice and that val is not masked to the field width.
 */
#define WREG32_FIELD_OFFSET(reg, offset, field, val)	\
	WREG32(mm##reg + (offset),	\
	       (RREG32(mm##reg + (offset)) & ~REG_FIELD_MASK(reg, field)) |	\
	       (val) << REG_FIELD_SHIFT(reg, field))
|
||||
|
||||
/*
 * Mask val with GENMASK_ULL(hi, lo) and shift the result right by lo, so the
 * selected field ends up starting at bit 0.
 */
#define AMDGPU_GET_REG_FIELD(val, hi, lo) \
	(((val) & GENMASK_ULL((hi), (lo))) >> (lo))
|
||||
/*
|
||||
* BIOS helpers.
|
||||
*/
|
||||
|
||||
@@ -3767,3 +3767,98 @@ int amdgpu_ras_error_statistic_ce_count(struct ras_err_data *err_data,
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * MP0 SMN C2PMSG register offsets. The helpers that follow shift these left
 * by 2 and add the per-instance extended SMN address before reading them.
 */
#define mmMP0_SMN_C2PMSG_92	0x1609C
#define mmMP0_SMN_C2PMSG_126	0x160BE
|
||||
/*
 * Log the boot-time failure details of one instance.
 *
 * @adev:       device whose indirect register interface and log are used
 * @instance:   instance index handed to the aqua_vanjaram extended SMN
 *              address encoder
 * @boot_error: boot error word to decode
 *
 * Reads MP0_SMN_C2PMSG_92 and prints it as the firmware status, then prints
 * one message for every failure flag that is set in @boot_error.
 */
static void amdgpu_ras_boot_time_error_reporting(struct amdgpu_device *adev,
						 u32 instance, u32 boot_error)
{
	u32 sock, aid, hbm;
	u32 fw_status;
	u64 smn_addr;

	/*
	 * Only the aqua_vanjaram extended SMN layout is handled here. Another
	 * SOC may use a different addressing pattern; if so, revisit this and
	 * swap the aqua_vanjaram encoder for a more common helper.
	 */
	smn_addr = (mmMP0_SMN_C2PMSG_92 << 2) +
		   aqua_vanjaram_encode_ext_smn_addressing(instance);
	fw_status = amdgpu_device_indirect_rreg_ext(adev, smn_addr);

	/* Location fields shared by all of the messages below. */
	sock = AMDGPU_RAS_GPU_ERR_SOCKET_ID(boot_error);
	aid = AMDGPU_RAS_GPU_ERR_AID_ID(boot_error);
	hbm = AMDGPU_RAS_GPU_ERR_HBM_ID(boot_error);

	dev_err(adev->dev, "socket: %d, aid: %d, firmware boot failed, fw status is 0x%x\n",
		sock, aid, fw_status);

	/* The flags are independent: more than one may be reported. */
	if (AMDGPU_RAS_GPU_ERR_MEM_TRAINING(boot_error)) {
		dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, memory training failed\n",
			 sock, aid, hbm);
	}

	if (AMDGPU_RAS_GPU_ERR_FW_LOAD(boot_error)) {
		dev_info(adev->dev, "socket: %d, aid: %d, firmware load failed at boot time\n",
			 sock, aid);
	}

	if (AMDGPU_RAS_GPU_ERR_WAFL_LINK_TRAINING(boot_error)) {
		dev_info(adev->dev, "socket: %d, aid: %d, wafl link training failed\n",
			 sock, aid);
	}

	if (AMDGPU_RAS_GPU_ERR_XGMI_LINK_TRAINING(boot_error)) {
		dev_info(adev->dev, "socket: %d, aid: %d, xgmi link training failed\n",
			 sock, aid);
	}

	if (AMDGPU_RAS_GPU_ERR_USR_CP_LINK_TRAINING(boot_error)) {
		dev_info(adev->dev, "socket: %d, aid: %d, usr cp link training failed\n",
			 sock, aid);
	}

	if (AMDGPU_RAS_GPU_ERR_USR_DP_LINK_TRAINING(boot_error)) {
		dev_info(adev->dev, "socket: %d, aid: %d, usr dp link training failed\n",
			 sock, aid);
	}

	if (AMDGPU_RAS_GPU_ERR_HBM_MEM_TEST(boot_error)) {
		dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, hbm memory test failed\n",
			 sock, aid, hbm);
	}

	if (AMDGPU_RAS_GPU_ERR_HBM_BIST_TEST(boot_error)) {
		dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, hbm bist test failed\n",
			 sock, aid, hbm);
	}
}
|
||||
|
||||
/*
 * Poll one instance's boot status until its boot-status bit is set.
 *
 * @adev:       device whose indirect register interface is used
 * @instance:   instance index handed to the aqua_vanjaram extended SMN
 *              address encoder
 * @boot_error: output; always receives the last value read from
 *              MP0_SMN_C2PMSG_126, on success and on timeout alike
 *
 * The register is read up to 1000 times with an msleep(1) after every
 * unsuccessful read, so the caller must be allowed to sleep.
 *
 * Return: 0 as soon as AMDGPU_RAS_GPU_ERR_BOOT_STATUS() of the value read is
 * non-zero, -ETIME if that never happens within the retry budget.
 */
static int amdgpu_ras_wait_for_boot_complete(struct amdgpu_device *adev,
					     u32 instance, u32 *boot_error)
{
	/*
	 * Keep the address 64 bits wide, exactly like the error reporting
	 * helper does for the same expression. With a u32 any bits the
	 * extended SMN encoder contributes above bit 31 are silently dropped
	 * and every instance ends up polling the same register.
	 * NOTE(review): the encoder is expected to return a u64 carrying the
	 * instance id in the upper half - confirm against its definition.
	 */
	u64 reg_addr;
	u32 reg_data;
	int retry_loop;

	/* The pattern for smn addressing in other SOC could be different from
	 * the one for aqua_vanjaram. We should revisit the code if the pattern
	 * is changed. In such case, replace the aqua_vanjaram implementation
	 * with more common helper */
	reg_addr = (mmMP0_SMN_C2PMSG_126 << 2) +
		   aqua_vanjaram_encode_ext_smn_addressing(instance);

	for (retry_loop = 0; retry_loop < 1000; retry_loop++) {
		reg_data = amdgpu_device_indirect_rreg_ext(adev, reg_addr);
		if (AMDGPU_RAS_GPU_ERR_BOOT_STATUS(reg_data)) {
			*boot_error = reg_data;
			return 0;
		}
		msleep(1);
	}

	/* Timed out: still hand back the last value so the caller can log it. */
	*boot_error = reg_data;
	return -ETIME;
}
|
||||
|
||||
/*
 * Check the boot status of instances 0 .. num_instances - 1.
 *
 * @adev:          device to query
 * @num_instances: number of instances to walk
 *
 * Each instance is polled with amdgpu_ras_wait_for_boot_complete(); when that
 * returns non-zero, the value it last read is decoded and logged through
 * amdgpu_ras_boot_time_error_reporting().
 */
void amdgpu_ras_query_boot_status(struct amdgpu_device *adev, u32 num_instances)
{
	u32 status = 0;
	u32 inst;

	for (inst = 0; inst < num_instances; inst++) {
		/* A zero return means the boot-status bit was seen: nothing to log. */
		if (!amdgpu_ras_wait_for_boot_complete(adev, inst, &status))
			continue;

		amdgpu_ras_boot_time_error_reporting(adev, inst, status);
	}
}
|
||||
|
||||
@@ -33,6 +33,19 @@
|
||||
|
||||
struct amdgpu_iv_entry;
|
||||
|
||||
/*
 * Field decoders for the boot error word that
 * amdgpu_ras_wait_for_boot_complete() reads back. Each one extracts the
 * inclusive bit range [high, low] given as the last two arguments.
 */

/* Bits 0-7: one failure flag per boot stage. */
#define AMDGPU_RAS_GPU_ERR_MEM_TRAINING(x)		AMDGPU_GET_REG_FIELD(x, 0, 0)
#define AMDGPU_RAS_GPU_ERR_FW_LOAD(x)			AMDGPU_GET_REG_FIELD(x, 1, 1)
#define AMDGPU_RAS_GPU_ERR_WAFL_LINK_TRAINING(x)	AMDGPU_GET_REG_FIELD(x, 2, 2)
#define AMDGPU_RAS_GPU_ERR_XGMI_LINK_TRAINING(x)	AMDGPU_GET_REG_FIELD(x, 3, 3)
#define AMDGPU_RAS_GPU_ERR_USR_CP_LINK_TRAINING(x)	AMDGPU_GET_REG_FIELD(x, 4, 4)
#define AMDGPU_RAS_GPU_ERR_USR_DP_LINK_TRAINING(x)	AMDGPU_GET_REG_FIELD(x, 5, 5)
#define AMDGPU_RAS_GPU_ERR_HBM_MEM_TEST(x)		AMDGPU_GET_REG_FIELD(x, 6, 6)
#define AMDGPU_RAS_GPU_ERR_HBM_BIST_TEST(x)		AMDGPU_GET_REG_FIELD(x, 7, 7)

/* Bits 8-13: socket, AID and HBM identifiers printed with each message. */
#define AMDGPU_RAS_GPU_ERR_SOCKET_ID(x)			AMDGPU_GET_REG_FIELD(x, 10, 8)
#define AMDGPU_RAS_GPU_ERR_AID_ID(x)			AMDGPU_GET_REG_FIELD(x, 12, 11)
#define AMDGPU_RAS_GPU_ERR_HBM_ID(x)			AMDGPU_GET_REG_FIELD(x, 13, 13)

/* Bit 31: boot status; the poll loop stops as soon as this reads non-zero. */
#define AMDGPU_RAS_GPU_ERR_BOOT_STATUS(x)		AMDGPU_GET_REG_FIELD(x, 31, 31)
|
||||
|
||||
/*
 * RAS flag occupying bit 0.
 * NOTE(review): the name suggests "RAS was already initialised by the VBIOS";
 * the code that tests this flag is outside this excerpt - confirm.
 */
#define AMDGPU_RAS_FLAG_INIT_BY_VBIOS (0x1 << 0)
|
||||
/* position of instance value in sub_block_index of
|
||||
* ta_ras_trigger_error_input, the sub block uses lower 12 bits
|
||||
@@ -819,5 +832,5 @@ int amdgpu_ras_error_statistic_ce_count(struct ras_err_data *err_data,
|
||||
int amdgpu_ras_error_statistic_ue_count(struct ras_err_data *err_data,
|
||||
struct amdgpu_smuio_mcm_config_info *mcm_info,
|
||||
struct ras_err_addr *err_addr, u64 count);
|
||||
|
||||
/*
 * Poll the boot status of instances 0 .. num_instances - 1 and log the
 * decoded boot error for every instance whose poll does not succeed.
 * The poll uses msleep() between reads, so callers need a sleepable context.
 */
void amdgpu_ras_query_boot_status(struct amdgpu_device *adev, u32 num_instances);
|
||||
#endif
|
||||
|
||||
Reference in New Issue
Block a user