mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2026-02-15 10:53:52 -05:00
drm/amdgpu: format old RAS eeprom data into V3 version
Clear old data and save it in V3 format. v2: only format eeprom data for new ASICs. Signed-off-by: Tao Zhou <tao.zhou1@amd.com> Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
@@ -3473,6 +3473,13 @@ int amdgpu_ras_init_badpage_info(struct amdgpu_device *adev)
|
||||
adev, control->bad_channel_bitmap);
|
||||
con->update_channel_flag = false;
|
||||
}
|
||||
|
||||
/* The format action is only applied to new ASICs */
|
||||
if (IP_VERSION_MAJ(amdgpu_ip_version(adev, UMC_HWIP, 0)) >= 12 &&
|
||||
control->tbl_hdr.version < RAS_TABLE_VER_V3)
|
||||
if (!amdgpu_ras_eeprom_reset_table(control))
|
||||
if (amdgpu_ras_save_bad_pages(adev, NULL))
|
||||
dev_warn(adev->dev, "Failed to format RAS EEPROM data in V3 version!\n");
|
||||
}
|
||||
|
||||
return ret;
|
||||
|
||||
@@ -413,9 +413,11 @@ static void amdgpu_ras_set_eeprom_table_version(struct amdgpu_ras_eeprom_control
|
||||
|
||||
switch (amdgpu_ip_version(adev, UMC_HWIP, 0)) {
|
||||
case IP_VERSION(8, 10, 0):
|
||||
case IP_VERSION(12, 0, 0):
|
||||
hdr->version = RAS_TABLE_VER_V2_1;
|
||||
return;
|
||||
case IP_VERSION(12, 0, 0):
|
||||
hdr->version = RAS_TABLE_VER_V3;
|
||||
return;
|
||||
default:
|
||||
hdr->version = RAS_TABLE_VER_V1;
|
||||
return;
|
||||
@@ -443,7 +445,7 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control)
|
||||
hdr->header = RAS_TABLE_HDR_VAL;
|
||||
amdgpu_ras_set_eeprom_table_version(control);
|
||||
|
||||
if (hdr->version == RAS_TABLE_VER_V2_1) {
|
||||
if (hdr->version >= RAS_TABLE_VER_V2_1) {
|
||||
hdr->first_rec_offset = RAS_RECORD_START_V2_1;
|
||||
hdr->tbl_size = RAS_TABLE_HEADER_SIZE +
|
||||
RAS_TABLE_V2_1_INFO_SIZE;
|
||||
@@ -461,7 +463,7 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control)
|
||||
}
|
||||
|
||||
csum = __calc_hdr_byte_sum(control);
|
||||
if (hdr->version == RAS_TABLE_VER_V2_1)
|
||||
if (hdr->version >= RAS_TABLE_VER_V2_1)
|
||||
csum += __calc_ras_info_byte_sum(control);
|
||||
csum = -csum;
|
||||
hdr->checksum = csum;
|
||||
@@ -757,7 +759,7 @@ amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control)
|
||||
"Saved bad pages %d reaches threshold value %d\n",
|
||||
control->ras_num_bad_pages, ras->bad_page_cnt_threshold);
|
||||
control->tbl_hdr.header = RAS_TABLE_HDR_BAD;
|
||||
if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1) {
|
||||
if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1) {
|
||||
control->tbl_rai.rma_status = GPU_RETIRED__ECC_REACH_THRESHOLD;
|
||||
control->tbl_rai.health_percent = 0;
|
||||
}
|
||||
@@ -770,7 +772,7 @@ amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control)
|
||||
amdgpu_dpm_send_rma_reason(adev);
|
||||
}
|
||||
|
||||
if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1)
|
||||
if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1)
|
||||
control->tbl_hdr.tbl_size = RAS_TABLE_HEADER_SIZE +
|
||||
RAS_TABLE_V2_1_INFO_SIZE +
|
||||
control->ras_num_recs * RAS_TABLE_RECORD_SIZE;
|
||||
@@ -810,7 +812,7 @@ amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control)
|
||||
* now calculate gpu health percent
|
||||
*/
|
||||
if (amdgpu_bad_page_threshold != 0 &&
|
||||
control->tbl_hdr.version == RAS_TABLE_VER_V2_1 &&
|
||||
control->tbl_hdr.version >= RAS_TABLE_VER_V2_1 &&
|
||||
control->ras_num_bad_pages <= ras->bad_page_cnt_threshold)
|
||||
control->tbl_rai.health_percent = ((ras->bad_page_cnt_threshold -
|
||||
control->ras_num_bad_pages) * 100) /
|
||||
@@ -823,7 +825,7 @@ amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control)
|
||||
csum += *pp;
|
||||
|
||||
csum += __calc_hdr_byte_sum(control);
|
||||
if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1)
|
||||
if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1)
|
||||
csum += __calc_ras_info_byte_sum(control);
|
||||
/* avoid sign extension when assigning to "checksum" */
|
||||
csum = -csum;
|
||||
@@ -1040,7 +1042,7 @@ uint32_t amdgpu_ras_eeprom_max_record_count(struct amdgpu_ras_eeprom_control *co
|
||||
/* get available eeprom table version first before eeprom table init */
|
||||
amdgpu_ras_set_eeprom_table_version(control);
|
||||
|
||||
if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1)
|
||||
if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1)
|
||||
return RAS_MAX_RECORD_COUNT_V2_1;
|
||||
else
|
||||
return RAS_MAX_RECORD_COUNT;
|
||||
@@ -1285,7 +1287,7 @@ static int __verify_ras_table_checksum(struct amdgpu_ras_eeprom_control *control
|
||||
int buf_size, res;
|
||||
u8 csum, *buf, *pp;
|
||||
|
||||
if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1)
|
||||
if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1)
|
||||
buf_size = RAS_TABLE_HEADER_SIZE +
|
||||
RAS_TABLE_V2_1_INFO_SIZE +
|
||||
control->ras_num_recs * RAS_TABLE_RECORD_SIZE;
|
||||
@@ -1388,7 +1390,7 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control)
|
||||
|
||||
__decode_table_header_from_buf(hdr, buf);
|
||||
|
||||
if (hdr->version == RAS_TABLE_VER_V2_1) {
|
||||
if (hdr->version >= RAS_TABLE_VER_V2_1) {
|
||||
control->ras_num_recs = RAS_NUM_RECS_V2_1(hdr);
|
||||
control->ras_record_offset = RAS_RECORD_START_V2_1;
|
||||
control->ras_max_record_count = RAS_MAX_RECORD_COUNT_V2_1;
|
||||
@@ -1428,7 +1430,7 @@ int amdgpu_ras_eeprom_check(struct amdgpu_ras_eeprom_control *control)
|
||||
DRM_DEBUG_DRIVER("Found existing EEPROM table with %d records",
|
||||
control->ras_num_bad_pages);
|
||||
|
||||
if (hdr->version == RAS_TABLE_VER_V2_1) {
|
||||
if (hdr->version >= RAS_TABLE_VER_V2_1) {
|
||||
res = __read_table_ras_info(control);
|
||||
if (res)
|
||||
return res;
|
||||
@@ -1448,7 +1450,7 @@ int amdgpu_ras_eeprom_check(struct amdgpu_ras_eeprom_control *control)
|
||||
ras->bad_page_cnt_threshold);
|
||||
} else if (hdr->header == RAS_TABLE_HDR_BAD &&
|
||||
amdgpu_bad_page_threshold != 0) {
|
||||
if (hdr->version == RAS_TABLE_VER_V2_1) {
|
||||
if (hdr->version >= RAS_TABLE_VER_V2_1) {
|
||||
res = __read_table_ras_info(control);
|
||||
if (res)
|
||||
return res;
|
||||
|
||||
@@ -28,6 +28,7 @@
|
||||
|
||||
#define RAS_TABLE_VER_V1 0x00010000
|
||||
#define RAS_TABLE_VER_V2_1 0x00021000
|
||||
#define RAS_TABLE_VER_V3 0x00030000
|
||||
|
||||
struct amdgpu_device;
|
||||
|
||||
|
||||
Reference in New Issue
Block a user