drm/amdgpu: format old RAS eeprom data into V3 version

Clear old data and save it in V3 format.

v2: only format eeprom data for new ASICs.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
Tao Zhou
2025-03-04 15:05:53 +08:00
committed by Alex Deucher
parent 9deacd6c55
commit 05d50ea3ea
3 changed files with 22 additions and 12 deletions

View File

@@ -3473,6 +3473,13 @@ int amdgpu_ras_init_badpage_info(struct amdgpu_device *adev)
adev, control->bad_channel_bitmap);
con->update_channel_flag = false;
}
/* The format action is only applied to new ASICs */
if (IP_VERSION_MAJ(amdgpu_ip_version(adev, UMC_HWIP, 0)) >= 12 &&
control->tbl_hdr.version < RAS_TABLE_VER_V3)
if (!amdgpu_ras_eeprom_reset_table(control))
if (amdgpu_ras_save_bad_pages(adev, NULL))
dev_warn(adev->dev, "Failed to format RAS EEPROM data in V3 version!\n");
}
return ret;

View File

@@ -413,9 +413,11 @@ static void amdgpu_ras_set_eeprom_table_version(struct amdgpu_ras_eeprom_control
switch (amdgpu_ip_version(adev, UMC_HWIP, 0)) {
case IP_VERSION(8, 10, 0):
case IP_VERSION(12, 0, 0):
hdr->version = RAS_TABLE_VER_V2_1;
return;
case IP_VERSION(12, 0, 0):
hdr->version = RAS_TABLE_VER_V3;
return;
default:
hdr->version = RAS_TABLE_VER_V1;
return;
@@ -443,7 +445,7 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control)
hdr->header = RAS_TABLE_HDR_VAL;
amdgpu_ras_set_eeprom_table_version(control);
if (hdr->version == RAS_TABLE_VER_V2_1) {
if (hdr->version >= RAS_TABLE_VER_V2_1) {
hdr->first_rec_offset = RAS_RECORD_START_V2_1;
hdr->tbl_size = RAS_TABLE_HEADER_SIZE +
RAS_TABLE_V2_1_INFO_SIZE;
@@ -461,7 +463,7 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control)
}
csum = __calc_hdr_byte_sum(control);
if (hdr->version == RAS_TABLE_VER_V2_1)
if (hdr->version >= RAS_TABLE_VER_V2_1)
csum += __calc_ras_info_byte_sum(control);
csum = -csum;
hdr->checksum = csum;
@@ -757,7 +759,7 @@ amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control)
"Saved bad pages %d reaches threshold value %d\n",
control->ras_num_bad_pages, ras->bad_page_cnt_threshold);
control->tbl_hdr.header = RAS_TABLE_HDR_BAD;
if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1) {
if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1) {
control->tbl_rai.rma_status = GPU_RETIRED__ECC_REACH_THRESHOLD;
control->tbl_rai.health_percent = 0;
}
@@ -770,7 +772,7 @@ amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control)
amdgpu_dpm_send_rma_reason(adev);
}
if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1)
if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1)
control->tbl_hdr.tbl_size = RAS_TABLE_HEADER_SIZE +
RAS_TABLE_V2_1_INFO_SIZE +
control->ras_num_recs * RAS_TABLE_RECORD_SIZE;
@@ -810,7 +812,7 @@ amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control)
* now calculate gpu health percent
*/
if (amdgpu_bad_page_threshold != 0 &&
control->tbl_hdr.version == RAS_TABLE_VER_V2_1 &&
control->tbl_hdr.version >= RAS_TABLE_VER_V2_1 &&
control->ras_num_bad_pages <= ras->bad_page_cnt_threshold)
control->tbl_rai.health_percent = ((ras->bad_page_cnt_threshold -
control->ras_num_bad_pages) * 100) /
@@ -823,7 +825,7 @@ amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control)
csum += *pp;
csum += __calc_hdr_byte_sum(control);
if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1)
if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1)
csum += __calc_ras_info_byte_sum(control);
/* avoid sign extension when assigning to "checksum" */
csum = -csum;
@@ -1040,7 +1042,7 @@ uint32_t amdgpu_ras_eeprom_max_record_count(struct amdgpu_ras_eeprom_control *co
/* get available eeprom table version first before eeprom table init */
amdgpu_ras_set_eeprom_table_version(control);
if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1)
if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1)
return RAS_MAX_RECORD_COUNT_V2_1;
else
return RAS_MAX_RECORD_COUNT;
@@ -1285,7 +1287,7 @@ static int __verify_ras_table_checksum(struct amdgpu_ras_eeprom_control *control
int buf_size, res;
u8 csum, *buf, *pp;
if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1)
if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1)
buf_size = RAS_TABLE_HEADER_SIZE +
RAS_TABLE_V2_1_INFO_SIZE +
control->ras_num_recs * RAS_TABLE_RECORD_SIZE;
@@ -1388,7 +1390,7 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control)
__decode_table_header_from_buf(hdr, buf);
if (hdr->version == RAS_TABLE_VER_V2_1) {
if (hdr->version >= RAS_TABLE_VER_V2_1) {
control->ras_num_recs = RAS_NUM_RECS_V2_1(hdr);
control->ras_record_offset = RAS_RECORD_START_V2_1;
control->ras_max_record_count = RAS_MAX_RECORD_COUNT_V2_1;
@@ -1428,7 +1430,7 @@ int amdgpu_ras_eeprom_check(struct amdgpu_ras_eeprom_control *control)
DRM_DEBUG_DRIVER("Found existing EEPROM table with %d records",
control->ras_num_bad_pages);
if (hdr->version == RAS_TABLE_VER_V2_1) {
if (hdr->version >= RAS_TABLE_VER_V2_1) {
res = __read_table_ras_info(control);
if (res)
return res;
@@ -1448,7 +1450,7 @@ int amdgpu_ras_eeprom_check(struct amdgpu_ras_eeprom_control *control)
ras->bad_page_cnt_threshold);
} else if (hdr->header == RAS_TABLE_HDR_BAD &&
amdgpu_bad_page_threshold != 0) {
if (hdr->version == RAS_TABLE_VER_V2_1) {
if (hdr->version >= RAS_TABLE_VER_V2_1) {
res = __read_table_ras_info(control);
if (res)
return res;

View File

@@ -28,6 +28,7 @@
#define RAS_TABLE_VER_V1 0x00010000
#define RAS_TABLE_VER_V2_1 0x00021000
#define RAS_TABLE_VER_V3 0x00030000
struct amdgpu_device;