mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2026-05-16 12:31:52 -04:00
drm/amd/ras: Add umc common ras functions
Add umc common ras functions. Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com> Reviewed-by: Tao Zhou <tao.zhou1@amd.com> Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
committed by
Alex Deucher
parent
8bd7fe95a4
commit
7a3f9c0992
706
drivers/gpu/drm/amd/ras/rascore/ras_umc.c
Normal file
706
drivers/gpu/drm/amd/ras/rascore/ras_umc.c
Normal file
@@ -0,0 +1,706 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
/*
|
||||
* Copyright 2025 Advanced Micro Devices, Inc.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
*/
|
||||
#include "ras.h"
|
||||
#include "ras_umc.h"
|
||||
#include "ras_umc_v12_0.h"
|
||||
|
||||
#define MAX_ECC_NUM_PER_RETIREMENT 16
|
||||
|
||||
/* bad page timestamp format
|
||||
* yy[31:27] mm[26:23] day[22:17] hh[16:12] mm[11:6] ss[5:0]
|
||||
*/
|
||||
#define EEPROM_TIMESTAMP_MINUTE 6
|
||||
#define EEPROM_TIMESTAMP_HOUR 12
|
||||
#define EEPROM_TIMESTAMP_DAY 17
|
||||
#define EEPROM_TIMESTAMP_MONTH 23
|
||||
#define EEPROM_TIMESTAMP_YEAR 27
|
||||
|
||||
static uint64_t ras_umc_get_eeprom_timestamp(struct ras_core_context *ras_core)
|
||||
{
|
||||
struct ras_time tm = {0};
|
||||
uint64_t utc_timestamp = 0;
|
||||
uint64_t eeprom_timestamp = 0;
|
||||
|
||||
utc_timestamp = ras_core_get_utc_second_timestamp(ras_core);
|
||||
if (!utc_timestamp)
|
||||
return utc_timestamp;
|
||||
|
||||
ras_core_convert_timestamp_to_time(ras_core, utc_timestamp, &tm);
|
||||
|
||||
/* the year range is 2000 ~ 2031, set the year if not in the range */
|
||||
if (tm.tm_year < 2000)
|
||||
tm.tm_year = 2000;
|
||||
if (tm.tm_year > 2031)
|
||||
tm.tm_year = 2031;
|
||||
|
||||
tm.tm_year -= 2000;
|
||||
|
||||
eeprom_timestamp = tm.tm_sec + (tm.tm_min << EEPROM_TIMESTAMP_MINUTE)
|
||||
+ (tm.tm_hour << EEPROM_TIMESTAMP_HOUR)
|
||||
+ (tm.tm_mday << EEPROM_TIMESTAMP_DAY)
|
||||
+ (tm.tm_mon << EEPROM_TIMESTAMP_MONTH)
|
||||
+ (tm.tm_year << EEPROM_TIMESTAMP_YEAR);
|
||||
eeprom_timestamp &= 0xffffffff;
|
||||
|
||||
return eeprom_timestamp;
|
||||
}
|
||||
|
||||
static const struct ras_umc_ip_func *ras_umc_get_ip_func(
|
||||
struct ras_core_context *ras_core, uint32_t ip_version)
|
||||
{
|
||||
switch (ip_version) {
|
||||
case IP_VERSION(12, 0, 0):
|
||||
return &ras_umc_func_v12_0;
|
||||
default:
|
||||
RAS_DEV_ERR(ras_core->dev,
|
||||
"UMC ip version(0x%x) is not supported!\n", ip_version);
|
||||
break;
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
int ras_umc_psp_convert_ma_to_pa(struct ras_core_context *ras_core,
|
||||
struct umc_mca_addr *in, struct umc_phy_addr *out,
|
||||
uint32_t nps)
|
||||
{
|
||||
struct ras_ta_query_address_input addr_in;
|
||||
struct ras_ta_query_address_output addr_out;
|
||||
int ret;
|
||||
|
||||
if (!in)
|
||||
return -EINVAL;
|
||||
|
||||
memset(&addr_in, 0, sizeof(addr_in));
|
||||
memset(&addr_out, 0, sizeof(addr_out));
|
||||
|
||||
addr_in.ma.err_addr = in->err_addr;
|
||||
addr_in.ma.ch_inst = in->ch_inst;
|
||||
addr_in.ma.umc_inst = in->umc_inst;
|
||||
addr_in.ma.node_inst = in->node_inst;
|
||||
addr_in.ma.socket_id = in->socket_id;
|
||||
|
||||
addr_in.addr_type = RAS_TA_MCA_TO_PA;
|
||||
|
||||
ret = ras_psp_query_address(ras_core, &addr_in, &addr_out);
|
||||
if (ret) {
|
||||
RAS_DEV_WARN(ras_core->dev,
|
||||
"Failed to query RAS physical address for 0x%llx, ret:%d",
|
||||
in->err_addr, ret);
|
||||
return -EREMOTEIO;
|
||||
}
|
||||
|
||||
if (out) {
|
||||
out->pa = addr_out.pa.pa;
|
||||
out->bank = addr_out.pa.bank;
|
||||
out->channel_idx = addr_out.pa.channel_idx;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int ras_umc_log_ecc(struct ras_core_context *ras_core,
|
||||
unsigned long idx, void *data)
|
||||
{
|
||||
struct ras_umc *ras_umc = &ras_core->ras_umc;
|
||||
int ret;
|
||||
|
||||
mutex_lock(&ras_umc->tree_lock);
|
||||
ret = radix_tree_insert(&ras_umc->root, idx, data);
|
||||
if (!ret)
|
||||
radix_tree_tag_set(&ras_umc->root, idx, UMC_ECC_NEW_DETECTED_TAG);
|
||||
mutex_unlock(&ras_umc->tree_lock);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
int ras_umc_clear_logged_ecc(struct ras_core_context *ras_core)
|
||||
{
|
||||
struct ras_umc *ras_umc = &ras_core->ras_umc;
|
||||
uint64_t buf[8] = {0};
|
||||
void **slot;
|
||||
void *data;
|
||||
void *iter = buf;
|
||||
|
||||
mutex_lock(&ras_umc->tree_lock);
|
||||
radix_tree_for_each_slot(slot, &ras_umc->root, iter, 0) {
|
||||
data = ras_radix_tree_delete_iter(&ras_umc->root, iter);
|
||||
kfree(data);
|
||||
}
|
||||
mutex_unlock(&ras_umc->tree_lock);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void ras_umc_reserve_eeprom_record(struct ras_core_context *ras_core,
|
||||
struct eeprom_umc_record *record)
|
||||
{
|
||||
struct ras_umc *ras_umc = &ras_core->ras_umc;
|
||||
uint64_t page_pfn[16];
|
||||
int count = 0, i;
|
||||
|
||||
memset(page_pfn, 0, sizeof(page_pfn));
|
||||
if (ras_umc->ip_func && ras_umc->ip_func->eeprom_record_to_nps_pages) {
|
||||
count = ras_umc->ip_func->eeprom_record_to_nps_pages(ras_core,
|
||||
record, record->cur_nps, page_pfn, ARRAY_SIZE(page_pfn));
|
||||
if (count <= 0) {
|
||||
RAS_DEV_ERR(ras_core->dev,
|
||||
"Fail to convert error address! count:%d\n", count);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
/* Reserve memory */
|
||||
for (i = 0; i < count; i++)
|
||||
ras_core_event_notify(ras_core,
|
||||
RAS_EVENT_ID__RESERVE_BAD_PAGE, &page_pfn[i]);
|
||||
}
|
||||
|
||||
/* When gpu reset is ongoing, ecc logging operations will be pended.
|
||||
*/
|
||||
int ras_umc_log_bad_bank_pending(struct ras_core_context *ras_core, struct ras_bank_ecc *bank)
|
||||
{
|
||||
struct ras_umc *ras_umc = &ras_core->ras_umc;
|
||||
struct ras_bank_ecc_node *ecc_node;
|
||||
|
||||
ecc_node = kzalloc(sizeof(*ecc_node), GFP_KERNEL);
|
||||
if (!ecc_node)
|
||||
return -ENOMEM;
|
||||
|
||||
memcpy(&ecc_node->ecc, bank, sizeof(ecc_node->ecc));
|
||||
|
||||
mutex_lock(&ras_umc->pending_ecc_lock);
|
||||
list_add_tail(&ecc_node->node, &ras_umc->pending_ecc_list);
|
||||
mutex_unlock(&ras_umc->pending_ecc_lock);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* After gpu reset is complete, re-log the pending error banks.
|
||||
*/
|
||||
int ras_umc_log_pending_bad_bank(struct ras_core_context *ras_core)
|
||||
{
|
||||
struct ras_umc *ras_umc = &ras_core->ras_umc;
|
||||
struct ras_bank_ecc_node *ecc_node, *tmp;
|
||||
|
||||
mutex_lock(&ras_umc->pending_ecc_lock);
|
||||
list_for_each_entry_safe(ecc_node,
|
||||
tmp, &ras_umc->pending_ecc_list, node){
|
||||
if (ecc_node && !ras_umc_log_bad_bank(ras_core, &ecc_node->ecc)) {
|
||||
list_del(&ecc_node->node);
|
||||
kfree(ecc_node);
|
||||
}
|
||||
}
|
||||
mutex_unlock(&ras_umc->pending_ecc_lock);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int ras_umc_log_bad_bank(struct ras_core_context *ras_core, struct ras_bank_ecc *bank)
|
||||
{
|
||||
struct ras_umc *ras_umc = &ras_core->ras_umc;
|
||||
struct eeprom_umc_record umc_rec;
|
||||
struct eeprom_umc_record *err_rec;
|
||||
int ret;
|
||||
|
||||
memset(&umc_rec, 0, sizeof(umc_rec));
|
||||
|
||||
mutex_lock(&ras_umc->bank_log_lock);
|
||||
ret = ras_umc->ip_func->bank_to_eeprom_record(ras_core, bank, &umc_rec);
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
err_rec = kzalloc(sizeof(*err_rec), GFP_KERNEL);
|
||||
if (!err_rec) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
memcpy(err_rec, &umc_rec, sizeof(umc_rec));
|
||||
ret = ras_umc_log_ecc(ras_core, err_rec->cur_nps_retired_row_pfn, err_rec);
|
||||
if (ret) {
|
||||
if (ret == -EEXIST) {
|
||||
RAS_DEV_INFO(ras_core->dev, "The bad pages have been logged before.\n");
|
||||
ret = 0;
|
||||
}
|
||||
|
||||
kfree(err_rec);
|
||||
goto out;
|
||||
}
|
||||
|
||||
ras_umc_reserve_eeprom_record(ras_core, err_rec);
|
||||
|
||||
ret = ras_core_event_notify(ras_core,
|
||||
RAS_EVENT_ID__BAD_PAGE_DETECTED, NULL);
|
||||
|
||||
out:
|
||||
mutex_unlock(&ras_umc->bank_log_lock);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int ras_umc_get_new_records(struct ras_core_context *ras_core,
|
||||
struct eeprom_umc_record *records, u32 num)
|
||||
{
|
||||
struct ras_umc *ras_umc = &ras_core->ras_umc;
|
||||
struct eeprom_umc_record *entries[MAX_ECC_NUM_PER_RETIREMENT];
|
||||
u32 entry_num = num < MAX_ECC_NUM_PER_RETIREMENT ? num : MAX_ECC_NUM_PER_RETIREMENT;
|
||||
int count = 0;
|
||||
int new_detected, i;
|
||||
|
||||
mutex_lock(&ras_umc->tree_lock);
|
||||
new_detected = radix_tree_gang_lookup_tag(&ras_umc->root, (void **)entries,
|
||||
0, entry_num, UMC_ECC_NEW_DETECTED_TAG);
|
||||
for (i = 0; i < new_detected; i++) {
|
||||
if (!entries[i])
|
||||
continue;
|
||||
|
||||
memcpy(&records[i], entries[i], sizeof(struct eeprom_umc_record));
|
||||
count++;
|
||||
radix_tree_tag_clear(&ras_umc->root,
|
||||
entries[i]->cur_nps_retired_row_pfn, UMC_ECC_NEW_DETECTED_TAG);
|
||||
}
|
||||
mutex_unlock(&ras_umc->tree_lock);
|
||||
|
||||
return count;
|
||||
}
|
||||
|
||||
static bool ras_umc_check_retired_record(struct ras_core_context *ras_core,
|
||||
struct eeprom_umc_record *record, bool from_eeprom)
|
||||
{
|
||||
struct ras_umc *ras_umc = &ras_core->ras_umc;
|
||||
struct eeprom_store_record *data = &ras_umc->umc_err_data.rom_data;
|
||||
uint32_t nps = 0;
|
||||
int i, ret;
|
||||
|
||||
if (from_eeprom) {
|
||||
nps = ras_umc->umc_err_data.umc_nps_mode;
|
||||
if (ras_umc->ip_func && ras_umc->ip_func->eeprom_record_to_nps_record) {
|
||||
ret = ras_umc->ip_func->eeprom_record_to_nps_record(ras_core, record, nps);
|
||||
if (ret)
|
||||
RAS_DEV_WARN(ras_core->dev,
|
||||
"Failed to adjust eeprom record, ret:%d", ret);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
for (i = 0; i < data->count; i++) {
|
||||
if ((data->bps[i].retired_row_pfn == record->retired_row_pfn) &&
|
||||
(data->bps[i].cur_nps_retired_row_pfn == record->cur_nps_retired_row_pfn))
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/* alloc/realloc bps array */
|
||||
static int ras_umc_realloc_err_data_space(struct ras_core_context *ras_core,
|
||||
struct eeprom_store_record *data, int pages)
|
||||
{
|
||||
unsigned int old_space = data->count + data->space_left;
|
||||
unsigned int new_space = old_space + pages;
|
||||
unsigned int align_space = ALIGN(new_space, 512);
|
||||
void *bps = kzalloc(align_space * sizeof(*data->bps), GFP_KERNEL);
|
||||
|
||||
if (!bps)
|
||||
return -ENOMEM;
|
||||
|
||||
if (data->bps) {
|
||||
memcpy(bps, data->bps,
|
||||
data->count * sizeof(*data->bps));
|
||||
kfree(data->bps);
|
||||
}
|
||||
|
||||
data->bps = bps;
|
||||
data->space_left += align_space - old_space;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int ras_umc_update_eeprom_rom_data(struct ras_core_context *ras_core,
|
||||
struct eeprom_umc_record *bps)
|
||||
{
|
||||
struct eeprom_store_record *data = &ras_core->ras_umc.umc_err_data.rom_data;
|
||||
|
||||
if (!data->space_left &&
|
||||
ras_umc_realloc_err_data_space(ras_core, data, 256)) {
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
memcpy(&data->bps[data->count], bps, sizeof(*data->bps));
|
||||
data->count++;
|
||||
data->space_left--;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int ras_umc_update_eeprom_ram_data(struct ras_core_context *ras_core,
|
||||
struct eeprom_umc_record *bps)
|
||||
{
|
||||
struct ras_umc *ras_umc = &ras_core->ras_umc;
|
||||
struct eeprom_store_record *data = &ras_umc->umc_err_data.ram_data;
|
||||
uint64_t page_pfn[16];
|
||||
int count = 0, j;
|
||||
|
||||
if (!data->space_left &&
|
||||
ras_umc_realloc_err_data_space(ras_core, data, 256)) {
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
memset(page_pfn, 0, sizeof(page_pfn));
|
||||
if (ras_umc->ip_func && ras_umc->ip_func->eeprom_record_to_nps_pages)
|
||||
count = ras_umc->ip_func->eeprom_record_to_nps_pages(ras_core,
|
||||
bps, bps->cur_nps, page_pfn, ARRAY_SIZE(page_pfn));
|
||||
|
||||
if (count > 0) {
|
||||
for (j = 0; j < count; j++) {
|
||||
bps->cur_nps_retired_row_pfn = page_pfn[j];
|
||||
memcpy(&data->bps[data->count], bps, sizeof(*data->bps));
|
||||
data->count++;
|
||||
data->space_left--;
|
||||
}
|
||||
} else {
|
||||
memcpy(&data->bps[data->count], bps, sizeof(*data->bps));
|
||||
data->count++;
|
||||
data->space_left--;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* it deal with vram only. */
|
||||
static int ras_umc_add_bad_pages(struct ras_core_context *ras_core,
|
||||
struct eeprom_umc_record *bps,
|
||||
int pages, bool from_eeprom)
|
||||
{
|
||||
struct ras_umc *ras_umc = &ras_core->ras_umc;
|
||||
struct ras_umc_err_data *data = &ras_umc->umc_err_data;
|
||||
int i, ret = 0;
|
||||
|
||||
if (!bps || pages <= 0)
|
||||
return 0;
|
||||
|
||||
mutex_lock(&ras_umc->umc_lock);
|
||||
for (i = 0; i < pages; i++) {
|
||||
if (ras_umc_check_retired_record(ras_core, &bps[i], from_eeprom))
|
||||
continue;
|
||||
|
||||
ret = ras_umc_update_eeprom_rom_data(ras_core, &bps[i]);
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
if (data->last_retired_pfn == bps[i].cur_nps_retired_row_pfn)
|
||||
continue;
|
||||
|
||||
data->last_retired_pfn = bps[i].cur_nps_retired_row_pfn;
|
||||
|
||||
if (from_eeprom)
|
||||
ras_umc_reserve_eeprom_record(ras_core, &bps[i]);
|
||||
|
||||
ret = ras_umc_update_eeprom_ram_data(ras_core, &bps[i]);
|
||||
if (ret)
|
||||
goto out;
|
||||
}
|
||||
out:
|
||||
mutex_unlock(&ras_umc->umc_lock);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* read error record array in eeprom and reserve enough space for
|
||||
* storing new bad pages
|
||||
*/
|
||||
int ras_umc_load_bad_pages(struct ras_core_context *ras_core)
|
||||
{
|
||||
struct eeprom_umc_record *bps;
|
||||
uint32_t ras_num_recs;
|
||||
int ret;
|
||||
|
||||
ras_num_recs = ras_eeprom_get_record_count(ras_core);
|
||||
/* no bad page record, skip eeprom access */
|
||||
if (!ras_num_recs ||
|
||||
ras_core->ras_eeprom.record_threshold_config == DISABLE_RETIRE_PAGE)
|
||||
return 0;
|
||||
|
||||
bps = kcalloc(ras_num_recs, sizeof(*bps), GFP_KERNEL);
|
||||
if (!bps)
|
||||
return -ENOMEM;
|
||||
|
||||
ret = ras_eeprom_read(ras_core, bps, ras_num_recs);
|
||||
if (ret) {
|
||||
RAS_DEV_ERR(ras_core->dev, "Failed to load EEPROM table records!");
|
||||
} else {
|
||||
ras_core->ras_umc.umc_err_data.last_retired_pfn = UMC_INV_MEM_PFN;
|
||||
ret = ras_umc_add_bad_pages(ras_core, bps, ras_num_recs, true);
|
||||
}
|
||||
|
||||
kfree(bps);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* write error record array to eeprom, the function should be
|
||||
* protected by recovery_lock
|
||||
* new_cnt: new added UE count, excluding reserved bad pages, can be NULL
|
||||
*/
|
||||
static int ras_umc_save_bad_pages(struct ras_core_context *ras_core)
|
||||
{
|
||||
struct ras_umc *ras_umc = &ras_core->ras_umc;
|
||||
struct eeprom_store_record *data = &ras_umc->umc_err_data.rom_data;
|
||||
uint32_t eeprom_record_num;
|
||||
int save_count;
|
||||
int ret = 0;
|
||||
|
||||
if (!data->bps)
|
||||
return 0;
|
||||
|
||||
eeprom_record_num = ras_eeprom_get_record_count(ras_core);
|
||||
mutex_lock(&ras_umc->umc_lock);
|
||||
save_count = data->count - eeprom_record_num;
|
||||
/* only new entries are saved */
|
||||
if (save_count > 0) {
|
||||
if (ras_eeprom_append(ras_core,
|
||||
&data->bps[eeprom_record_num],
|
||||
save_count)) {
|
||||
RAS_DEV_ERR(ras_core->dev, "Failed to save EEPROM table data!");
|
||||
ret = -EIO;
|
||||
goto exit;
|
||||
}
|
||||
|
||||
RAS_DEV_INFO(ras_core->dev, "Saved %d pages to EEPROM table.\n", save_count);
|
||||
}
|
||||
|
||||
exit:
|
||||
mutex_unlock(&ras_umc->umc_lock);
|
||||
return ret;
|
||||
}
|
||||
|
||||
int ras_umc_handle_bad_pages(struct ras_core_context *ras_core, void *data)
|
||||
{
|
||||
struct eeprom_umc_record records[MAX_ECC_NUM_PER_RETIREMENT];
|
||||
int count, ret;
|
||||
|
||||
memset(records, 0, sizeof(records));
|
||||
count = ras_umc_get_new_records(ras_core, records, ARRAY_SIZE(records));
|
||||
if (count <= 0)
|
||||
return -ENODATA;
|
||||
|
||||
ret = ras_umc_add_bad_pages(ras_core, records, count, false);
|
||||
if (ret) {
|
||||
RAS_DEV_ERR(ras_core->dev, "Failed to add ras bad page!\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
ret = ras_umc_save_bad_pages(ras_core);
|
||||
if (ret) {
|
||||
RAS_DEV_ERR(ras_core->dev, "Failed to save ras bad page\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int ras_umc_sw_init(struct ras_core_context *ras_core)
|
||||
{
|
||||
struct ras_umc *ras_umc = &ras_core->ras_umc;
|
||||
|
||||
memset(ras_umc, 0, sizeof(*ras_umc));
|
||||
|
||||
INIT_LIST_HEAD(&ras_umc->pending_ecc_list);
|
||||
|
||||
INIT_RADIX_TREE(&ras_umc->root, GFP_KERNEL);
|
||||
|
||||
mutex_init(&ras_umc->tree_lock);
|
||||
mutex_init(&ras_umc->pending_ecc_lock);
|
||||
mutex_init(&ras_umc->umc_lock);
|
||||
mutex_init(&ras_umc->bank_log_lock);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int ras_umc_sw_fini(struct ras_core_context *ras_core)
|
||||
{
|
||||
struct ras_umc *ras_umc = &ras_core->ras_umc;
|
||||
struct ras_umc_err_data *umc_err_data = &ras_umc->umc_err_data;
|
||||
struct ras_bank_ecc_node *ecc_node, *tmp;
|
||||
|
||||
mutex_destroy(&ras_umc->umc_lock);
|
||||
mutex_destroy(&ras_umc->bank_log_lock);
|
||||
|
||||
if (umc_err_data->rom_data.bps) {
|
||||
umc_err_data->rom_data.count = 0;
|
||||
kfree(umc_err_data->rom_data.bps);
|
||||
umc_err_data->rom_data.bps = NULL;
|
||||
umc_err_data->rom_data.space_left = 0;
|
||||
}
|
||||
|
||||
if (umc_err_data->ram_data.bps) {
|
||||
umc_err_data->ram_data.count = 0;
|
||||
kfree(umc_err_data->ram_data.bps);
|
||||
umc_err_data->ram_data.bps = NULL;
|
||||
umc_err_data->ram_data.space_left = 0;
|
||||
}
|
||||
|
||||
ras_umc_clear_logged_ecc(ras_core);
|
||||
|
||||
mutex_lock(&ras_umc->pending_ecc_lock);
|
||||
list_for_each_entry_safe(ecc_node,
|
||||
tmp, &ras_umc->pending_ecc_list, node){
|
||||
list_del(&ecc_node->node);
|
||||
kfree(ecc_node);
|
||||
}
|
||||
mutex_unlock(&ras_umc->pending_ecc_lock);
|
||||
|
||||
mutex_destroy(&ras_umc->tree_lock);
|
||||
mutex_destroy(&ras_umc->pending_ecc_lock);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int ras_umc_hw_init(struct ras_core_context *ras_core)
|
||||
{
|
||||
struct ras_umc *ras_umc = &ras_core->ras_umc;
|
||||
uint32_t nps;
|
||||
|
||||
nps = ras_core_get_curr_nps_mode(ras_core);
|
||||
|
||||
if (!nps || (nps >= UMC_MEMORY_PARTITION_MODE_UNKNOWN)) {
|
||||
RAS_DEV_ERR(ras_core->dev, "Invalid memory NPS mode: %u!\n", nps);
|
||||
return -ENODATA;
|
||||
}
|
||||
|
||||
ras_umc->umc_err_data.umc_nps_mode = nps;
|
||||
|
||||
ras_umc->umc_vram_type = ras_core->config->umc_cfg.umc_vram_type;
|
||||
if (!ras_umc->umc_vram_type) {
|
||||
RAS_DEV_ERR(ras_core->dev, "Invalid UMC VRAM Type: %u!\n",
|
||||
ras_umc->umc_vram_type);
|
||||
return -ENODATA;
|
||||
}
|
||||
|
||||
ras_umc->umc_ip_version = ras_core->config->umc_ip_version;
|
||||
ras_umc->ip_func = ras_umc_get_ip_func(ras_core, ras_umc->umc_ip_version);
|
||||
if (!ras_umc->ip_func)
|
||||
return -EINVAL;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int ras_umc_hw_fini(struct ras_core_context *ras_core)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
int ras_umc_clean_badpage_data(struct ras_core_context *ras_core)
|
||||
{
|
||||
struct ras_umc_err_data *data = &ras_core->ras_umc.umc_err_data;
|
||||
|
||||
mutex_lock(&ras_core->ras_umc.umc_lock);
|
||||
|
||||
kfree(data->rom_data.bps);
|
||||
kfree(data->ram_data.bps);
|
||||
|
||||
memset(data, 0, sizeof(*data));
|
||||
mutex_unlock(&ras_core->ras_umc.umc_lock);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int ras_umc_fill_eeprom_record(struct ras_core_context *ras_core,
|
||||
uint64_t err_addr, uint32_t umc_inst, struct umc_phy_addr *cur_nps_addr,
|
||||
enum umc_memory_partition_mode cur_nps, struct eeprom_umc_record *record)
|
||||
{
|
||||
struct eeprom_umc_record *err_rec = record;
|
||||
|
||||
/* Set bad page pfn and nps mode */
|
||||
EEPROM_RECORD_SETUP_UMC_ADDR_AND_NPS(err_rec,
|
||||
RAS_ADDR_TO_PFN(cur_nps_addr->pa), cur_nps);
|
||||
|
||||
err_rec->address = err_addr;
|
||||
err_rec->ts = ras_umc_get_eeprom_timestamp(ras_core);
|
||||
err_rec->err_type = RAS_EEPROM_ERR_NON_RECOVERABLE;
|
||||
err_rec->cu = 0;
|
||||
err_rec->mem_channel = cur_nps_addr->channel_idx;
|
||||
err_rec->mcumc_id = umc_inst;
|
||||
err_rec->cur_nps_retired_row_pfn = RAS_ADDR_TO_PFN(cur_nps_addr->pa);
|
||||
err_rec->cur_nps_bank = cur_nps_addr->bank;
|
||||
err_rec->cur_nps = cur_nps;
|
||||
return 0;
|
||||
}
|
||||
|
||||
int ras_umc_get_saved_eeprom_count(struct ras_core_context *ras_core)
|
||||
{
|
||||
struct ras_umc_err_data *err_data = &ras_core->ras_umc.umc_err_data;
|
||||
|
||||
return err_data->rom_data.count;
|
||||
}
|
||||
|
||||
int ras_umc_get_badpage_count(struct ras_core_context *ras_core)
|
||||
{
|
||||
struct eeprom_store_record *data = &ras_core->ras_umc.umc_err_data.ram_data;
|
||||
|
||||
return data->count;
|
||||
}
|
||||
|
||||
int ras_umc_get_badpage_record(struct ras_core_context *ras_core, uint32_t index, void *record)
|
||||
{
|
||||
struct eeprom_store_record *data = &ras_core->ras_umc.umc_err_data.ram_data;
|
||||
|
||||
if (index >= data->count)
|
||||
return -EINVAL;
|
||||
|
||||
memcpy(record, &data->bps[index], sizeof(struct eeprom_umc_record));
|
||||
return 0;
|
||||
}
|
||||
|
||||
bool ras_umc_check_retired_addr(struct ras_core_context *ras_core, uint64_t addr)
|
||||
{
|
||||
struct ras_umc *ras_umc = &ras_core->ras_umc;
|
||||
struct eeprom_store_record *data = &ras_umc->umc_err_data.ram_data;
|
||||
uint64_t page_pfn = RAS_ADDR_TO_PFN(addr);
|
||||
int i, ret = false;
|
||||
|
||||
mutex_lock(&ras_umc->umc_lock);
|
||||
for (i = 0; i < data->count; i++) {
|
||||
if (data->bps[i].cur_nps_retired_row_pfn == page_pfn) {
|
||||
ret = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
mutex_unlock(&ras_umc->umc_lock);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
int ras_umc_translate_soc_pa_and_bank(struct ras_core_context *ras_core,
|
||||
uint64_t *soc_pa, struct umc_bank_addr *bank_addr, bool bank_to_pa)
|
||||
{
|
||||
struct ras_umc *ras_umc = &ras_core->ras_umc;
|
||||
int ret = 0;
|
||||
|
||||
if (bank_to_pa)
|
||||
ret = ras_umc->ip_func->bank_to_soc_pa(ras_core, *bank_addr, soc_pa);
|
||||
else
|
||||
ret = ras_umc->ip_func->soc_pa_to_bank(ras_core, *soc_pa, bank_addr);
|
||||
|
||||
return ret;
|
||||
}
|
||||
166
drivers/gpu/drm/amd/ras/rascore/ras_umc.h
Normal file
166
drivers/gpu/drm/amd/ras/rascore/ras_umc.h
Normal file
@@ -0,0 +1,166 @@
|
||||
/* SPDX-License-Identifier: MIT */
|
||||
/*
|
||||
* Copyright 2025 Advanced Micro Devices, Inc.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef __RAS_UMC_H__
|
||||
#define __RAS_UMC_H__
|
||||
#include "ras.h"
|
||||
#include "ras_eeprom.h"
|
||||
#include "ras_cmd.h"
|
||||
|
||||
#define UMC_VRAM_TYPE_UNKNOWN 0
|
||||
#define UMC_VRAM_TYPE_GDDR1 1
|
||||
#define UMC_VRAM_TYPE_DDR2 2
|
||||
#define UMC_VRAM_TYPE_GDDR3 3
|
||||
#define UMC_VRAM_TYPE_GDDR4 4
|
||||
#define UMC_VRAM_TYPE_GDDR5 5
|
||||
#define UMC_VRAM_TYPE_HBM 6
|
||||
#define UMC_VRAM_TYPE_DDR3 7
|
||||
#define UMC_VRAM_TYPE_DDR4 8
|
||||
#define UMC_VRAM_TYPE_GDDR6 9
|
||||
#define UMC_VRAM_TYPE_DDR5 10
|
||||
#define UMC_VRAM_TYPE_LPDDR4 11
|
||||
#define UMC_VRAM_TYPE_LPDDR5 12
|
||||
#define UMC_VRAM_TYPE_HBM3E 13
|
||||
|
||||
#define UMC_ECC_NEW_DETECTED_TAG 0x1
|
||||
#define UMC_INV_MEM_PFN (0xFFFFFFFFFFFFFFFF)
|
||||
|
||||
/* three column bits and one row bit in MCA address flip
|
||||
* in bad page retirement
|
||||
*/
|
||||
#define UMC_PA_FLIP_BITS_NUM 4
|
||||
|
||||
enum umc_memory_partition_mode {
|
||||
UMC_MEMORY_PARTITION_MODE_NONE = 0,
|
||||
UMC_MEMORY_PARTITION_MODE_NPS1 = 1,
|
||||
UMC_MEMORY_PARTITION_MODE_NPS2 = 2,
|
||||
UMC_MEMORY_PARTITION_MODE_NPS3 = 3,
|
||||
UMC_MEMORY_PARTITION_MODE_NPS4 = 4,
|
||||
UMC_MEMORY_PARTITION_MODE_NPS6 = 6,
|
||||
UMC_MEMORY_PARTITION_MODE_NPS8 = 8,
|
||||
UMC_MEMORY_PARTITION_MODE_UNKNOWN
|
||||
};
|
||||
|
||||
struct ras_core_context;
|
||||
struct ras_bank_ecc;
|
||||
|
||||
struct umc_flip_bits {
|
||||
uint32_t flip_bits_in_pa[UMC_PA_FLIP_BITS_NUM];
|
||||
uint32_t flip_row_bit;
|
||||
uint32_t r13_in_pa;
|
||||
uint32_t bit_num;
|
||||
};
|
||||
|
||||
struct umc_mca_addr {
|
||||
uint64_t err_addr;
|
||||
uint32_t ch_inst;
|
||||
uint32_t umc_inst;
|
||||
uint32_t node_inst;
|
||||
uint32_t socket_id;
|
||||
};
|
||||
|
||||
struct umc_phy_addr {
|
||||
uint64_t pa;
|
||||
uint32_t bank;
|
||||
uint32_t channel_idx;
|
||||
};
|
||||
|
||||
struct umc_bank_addr {
|
||||
uint32_t stack_id; /* SID */
|
||||
uint32_t bank_group;
|
||||
uint32_t bank;
|
||||
uint32_t row;
|
||||
uint32_t column;
|
||||
uint32_t channel;
|
||||
uint32_t subchannel; /* Also called Pseudochannel (PC) */
|
||||
};
|
||||
|
||||
struct ras_umc_ip_func {
|
||||
int (*bank_to_eeprom_record)(struct ras_core_context *ras_core,
|
||||
struct ras_bank_ecc *bank, struct eeprom_umc_record *record);
|
||||
int (*eeprom_record_to_nps_record)(struct ras_core_context *ras_core,
|
||||
struct eeprom_umc_record *record, uint32_t nps);
|
||||
int (*eeprom_record_to_nps_pages)(struct ras_core_context *ras_core,
|
||||
struct eeprom_umc_record *record, uint32_t nps,
|
||||
uint64_t *pfns, uint32_t num);
|
||||
int (*bank_to_soc_pa)(struct ras_core_context *ras_core,
|
||||
struct umc_bank_addr bank_addr, uint64_t *soc_pa);
|
||||
int (*soc_pa_to_bank)(struct ras_core_context *ras_core,
|
||||
uint64_t soc_pa, struct umc_bank_addr *bank_addr);
|
||||
};
|
||||
|
||||
struct eeprom_store_record {
|
||||
/* point to data records array */
|
||||
struct eeprom_umc_record *bps;
|
||||
/* the count of entries */
|
||||
int count;
|
||||
/* the space can place new entries */
|
||||
int space_left;
|
||||
};
|
||||
|
||||
struct ras_umc_err_data {
|
||||
struct eeprom_store_record rom_data;
|
||||
struct eeprom_store_record ram_data;
|
||||
enum umc_memory_partition_mode umc_nps_mode;
|
||||
uint64_t last_retired_pfn;
|
||||
};
|
||||
|
||||
struct ras_umc {
|
||||
u32 umc_ip_version;
|
||||
u32 umc_vram_type;
|
||||
const struct ras_umc_ip_func *ip_func;
|
||||
struct radix_tree_root root;
|
||||
struct mutex tree_lock;
|
||||
struct mutex umc_lock;
|
||||
struct mutex bank_log_lock;
|
||||
struct mutex pending_ecc_lock;
|
||||
struct ras_umc_err_data umc_err_data;
|
||||
struct list_head pending_ecc_list;
|
||||
};
|
||||
|
||||
int ras_umc_sw_init(struct ras_core_context *ras);
|
||||
int ras_umc_sw_fini(struct ras_core_context *ras);
|
||||
int ras_umc_hw_init(struct ras_core_context *ras);
|
||||
int ras_umc_hw_fini(struct ras_core_context *ras);
|
||||
int ras_umc_psp_convert_ma_to_pa(struct ras_core_context *ras_core,
|
||||
struct umc_mca_addr *in, struct umc_phy_addr *out,
|
||||
uint32_t nps);
|
||||
int ras_umc_handle_bad_pages(struct ras_core_context *ras_core, void *data);
|
||||
int ras_umc_log_bad_bank(struct ras_core_context *ras, struct ras_bank_ecc *bank);
|
||||
int ras_umc_log_bad_bank_pending(struct ras_core_context *ras_core, struct ras_bank_ecc *bank);
|
||||
int ras_umc_log_pending_bad_bank(struct ras_core_context *ras_core);
|
||||
int ras_umc_clear_logged_ecc(struct ras_core_context *ras_core);
|
||||
int ras_umc_load_bad_pages(struct ras_core_context *ras_core);
|
||||
int ras_umc_get_saved_eeprom_count(struct ras_core_context *ras_core);
|
||||
int ras_umc_clean_badpage_data(struct ras_core_context *ras_core);
|
||||
int ras_umc_fill_eeprom_record(struct ras_core_context *ras_core,
|
||||
uint64_t err_addr, uint32_t umc_inst, struct umc_phy_addr *cur_nps_addr,
|
||||
enum umc_memory_partition_mode cur_nps, struct eeprom_umc_record *record);
|
||||
|
||||
int ras_umc_get_badpage_count(struct ras_core_context *ras_core);
|
||||
int ras_umc_get_badpage_record(struct ras_core_context *ras_core, uint32_t index, void *record);
|
||||
bool ras_umc_check_retired_addr(struct ras_core_context *ras_core, uint64_t addr);
|
||||
int ras_umc_translate_soc_pa_and_bank(struct ras_core_context *ras_core,
|
||||
uint64_t *soc_pa, struct umc_bank_addr *bank_addr, bool bank_to_pa);
|
||||
#endif
|
||||
Reference in New Issue
Block a user