mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2026-04-30 21:40:37 -04:00
drm/amdgpu: added xgmi ras error reset sequence
added mechanism to clear xgmi ras status inbetween error queries Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com> Signed-off-by: John Clements <john.clements@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
committed by
Alex Deucher
parent
3aa0115d23
commit
66399248fe
@@ -604,6 +604,8 @@ int amdgpu_xgmi_ras_late_init(struct amdgpu_device *adev)
|
||||
adev->gmc.xgmi.num_physical_nodes == 0)
|
||||
return 0;
|
||||
|
||||
amdgpu_xgmi_reset_ras_error_count(adev);
|
||||
|
||||
if (!adev->gmc.xgmi.ras_if) {
|
||||
adev->gmc.xgmi.ras_if = kmalloc(sizeof(struct ras_common_if), GFP_KERNEL);
|
||||
if (!adev->gmc.xgmi.ras_if)
|
||||
@@ -668,6 +670,32 @@ uint64_t amdgpu_xgmi_get_relative_phy_addr(struct amdgpu_device *adev,
|
||||
return addr + dram_base_addr;
|
||||
}
|
||||
|
||||
static void pcs_clear_status(struct amdgpu_device *adev, uint32_t pcs_status_reg)
|
||||
{
|
||||
WREG32_PCIE(pcs_status_reg, 0xFFFFFFFF);
|
||||
WREG32_PCIE(pcs_status_reg, 0);
|
||||
}
|
||||
|
||||
void amdgpu_xgmi_reset_ras_error_count(struct amdgpu_device *adev)
|
||||
{
|
||||
uint32_t i;
|
||||
|
||||
switch (adev->asic_type) {
|
||||
case CHIP_ARCTURUS:
|
||||
for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_arct); i++)
|
||||
pcs_clear_status(adev,
|
||||
xgmi_pcs_err_status_reg_arct[i]);
|
||||
break;
|
||||
case CHIP_VEGA20:
|
||||
for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_vg20); i++)
|
||||
pcs_clear_status(adev,
|
||||
xgmi_pcs_err_status_reg_vg20[i]);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
static int amdgpu_xgmi_query_pcs_error_status(struct amdgpu_device *adev,
|
||||
uint32_t value,
|
||||
uint32_t *ue_count,
|
||||
@@ -758,6 +786,8 @@ int amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev,
|
||||
break;
|
||||
}
|
||||
|
||||
amdgpu_xgmi_reset_ras_error_count(adev);
|
||||
|
||||
err_data->ue_count += ue_cnt;
|
||||
err_data->ce_count += ce_cnt;
|
||||
|
||||
|
||||
@@ -56,6 +56,7 @@ uint64_t amdgpu_xgmi_get_relative_phy_addr(struct amdgpu_device *adev,
|
||||
uint64_t addr);
|
||||
int amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev,
|
||||
void *ras_error_status);
|
||||
void amdgpu_xgmi_reset_ras_error_count(struct amdgpu_device *adev);
|
||||
|
||||
static inline bool amdgpu_xgmi_same_hive(struct amdgpu_device *adev,
|
||||
struct amdgpu_device *bo_adev)
|
||||
|
||||
Reference in New Issue
Block a user