drm/amd/ras: Add umc common ras functions

Add umc common ras functions.

Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
YiPeng Chai
2025-03-17 17:26:49 +08:00
committed by Alex Deucher
parent 8bd7fe95a4
commit 7a3f9c0992
2 changed files with 872 additions and 0 deletions

View File

@@ -0,0 +1,706 @@
// SPDX-License-Identifier: MIT
/*
* Copyright 2025 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*
*/
#include "ras.h"
#include "ras_umc.h"
#include "ras_umc_v12_0.h"
#define MAX_ECC_NUM_PER_RETIREMENT 16
/* bad page timestamp format
* yy[31:27] mm[26:23] day[22:17] hh[16:12] mm[11:6] ss[5:0]
*/
#define EEPROM_TIMESTAMP_MINUTE 6
#define EEPROM_TIMESTAMP_HOUR 12
#define EEPROM_TIMESTAMP_DAY 17
#define EEPROM_TIMESTAMP_MONTH 23
#define EEPROM_TIMESTAMP_YEAR 27
static uint64_t ras_umc_get_eeprom_timestamp(struct ras_core_context *ras_core)
{
struct ras_time tm = {0};
uint64_t utc_timestamp = 0;
uint64_t eeprom_timestamp = 0;
utc_timestamp = ras_core_get_utc_second_timestamp(ras_core);
if (!utc_timestamp)
return utc_timestamp;
ras_core_convert_timestamp_to_time(ras_core, utc_timestamp, &tm);
/* the year range is 2000 ~ 2031, set the year if not in the range */
if (tm.tm_year < 2000)
tm.tm_year = 2000;
if (tm.tm_year > 2031)
tm.tm_year = 2031;
tm.tm_year -= 2000;
eeprom_timestamp = tm.tm_sec + (tm.tm_min << EEPROM_TIMESTAMP_MINUTE)
+ (tm.tm_hour << EEPROM_TIMESTAMP_HOUR)
+ (tm.tm_mday << EEPROM_TIMESTAMP_DAY)
+ (tm.tm_mon << EEPROM_TIMESTAMP_MONTH)
+ (tm.tm_year << EEPROM_TIMESTAMP_YEAR);
eeprom_timestamp &= 0xffffffff;
return eeprom_timestamp;
}
static const struct ras_umc_ip_func *ras_umc_get_ip_func(
struct ras_core_context *ras_core, uint32_t ip_version)
{
switch (ip_version) {
case IP_VERSION(12, 0, 0):
return &ras_umc_func_v12_0;
default:
RAS_DEV_ERR(ras_core->dev,
"UMC ip version(0x%x) is not supported!\n", ip_version);
break;
}
return NULL;
}
int ras_umc_psp_convert_ma_to_pa(struct ras_core_context *ras_core,
struct umc_mca_addr *in, struct umc_phy_addr *out,
uint32_t nps)
{
struct ras_ta_query_address_input addr_in;
struct ras_ta_query_address_output addr_out;
int ret;
if (!in)
return -EINVAL;
memset(&addr_in, 0, sizeof(addr_in));
memset(&addr_out, 0, sizeof(addr_out));
addr_in.ma.err_addr = in->err_addr;
addr_in.ma.ch_inst = in->ch_inst;
addr_in.ma.umc_inst = in->umc_inst;
addr_in.ma.node_inst = in->node_inst;
addr_in.ma.socket_id = in->socket_id;
addr_in.addr_type = RAS_TA_MCA_TO_PA;
ret = ras_psp_query_address(ras_core, &addr_in, &addr_out);
if (ret) {
RAS_DEV_WARN(ras_core->dev,
"Failed to query RAS physical address for 0x%llx, ret:%d",
in->err_addr, ret);
return -EREMOTEIO;
}
if (out) {
out->pa = addr_out.pa.pa;
out->bank = addr_out.pa.bank;
out->channel_idx = addr_out.pa.channel_idx;
}
return 0;
}
static int ras_umc_log_ecc(struct ras_core_context *ras_core,
unsigned long idx, void *data)
{
struct ras_umc *ras_umc = &ras_core->ras_umc;
int ret;
mutex_lock(&ras_umc->tree_lock);
ret = radix_tree_insert(&ras_umc->root, idx, data);
if (!ret)
radix_tree_tag_set(&ras_umc->root, idx, UMC_ECC_NEW_DETECTED_TAG);
mutex_unlock(&ras_umc->tree_lock);
return ret;
}
int ras_umc_clear_logged_ecc(struct ras_core_context *ras_core)
{
struct ras_umc *ras_umc = &ras_core->ras_umc;
uint64_t buf[8] = {0};
void **slot;
void *data;
void *iter = buf;
mutex_lock(&ras_umc->tree_lock);
radix_tree_for_each_slot(slot, &ras_umc->root, iter, 0) {
data = ras_radix_tree_delete_iter(&ras_umc->root, iter);
kfree(data);
}
mutex_unlock(&ras_umc->tree_lock);
return 0;
}
static void ras_umc_reserve_eeprom_record(struct ras_core_context *ras_core,
struct eeprom_umc_record *record)
{
struct ras_umc *ras_umc = &ras_core->ras_umc;
uint64_t page_pfn[16];
int count = 0, i;
memset(page_pfn, 0, sizeof(page_pfn));
if (ras_umc->ip_func && ras_umc->ip_func->eeprom_record_to_nps_pages) {
count = ras_umc->ip_func->eeprom_record_to_nps_pages(ras_core,
record, record->cur_nps, page_pfn, ARRAY_SIZE(page_pfn));
if (count <= 0) {
RAS_DEV_ERR(ras_core->dev,
"Fail to convert error address! count:%d\n", count);
return;
}
}
/* Reserve memory */
for (i = 0; i < count; i++)
ras_core_event_notify(ras_core,
RAS_EVENT_ID__RESERVE_BAD_PAGE, &page_pfn[i]);
}
/* When gpu reset is ongoing, ecc logging operations will be pended.
*/
int ras_umc_log_bad_bank_pending(struct ras_core_context *ras_core, struct ras_bank_ecc *bank)
{
struct ras_umc *ras_umc = &ras_core->ras_umc;
struct ras_bank_ecc_node *ecc_node;
ecc_node = kzalloc(sizeof(*ecc_node), GFP_KERNEL);
if (!ecc_node)
return -ENOMEM;
memcpy(&ecc_node->ecc, bank, sizeof(ecc_node->ecc));
mutex_lock(&ras_umc->pending_ecc_lock);
list_add_tail(&ecc_node->node, &ras_umc->pending_ecc_list);
mutex_unlock(&ras_umc->pending_ecc_lock);
return 0;
}
/* After gpu reset is complete, re-log the pending error banks.
*/
int ras_umc_log_pending_bad_bank(struct ras_core_context *ras_core)
{
struct ras_umc *ras_umc = &ras_core->ras_umc;
struct ras_bank_ecc_node *ecc_node, *tmp;
mutex_lock(&ras_umc->pending_ecc_lock);
list_for_each_entry_safe(ecc_node,
tmp, &ras_umc->pending_ecc_list, node){
if (ecc_node && !ras_umc_log_bad_bank(ras_core, &ecc_node->ecc)) {
list_del(&ecc_node->node);
kfree(ecc_node);
}
}
mutex_unlock(&ras_umc->pending_ecc_lock);
return 0;
}
int ras_umc_log_bad_bank(struct ras_core_context *ras_core, struct ras_bank_ecc *bank)
{
struct ras_umc *ras_umc = &ras_core->ras_umc;
struct eeprom_umc_record umc_rec;
struct eeprom_umc_record *err_rec;
int ret;
memset(&umc_rec, 0, sizeof(umc_rec));
mutex_lock(&ras_umc->bank_log_lock);
ret = ras_umc->ip_func->bank_to_eeprom_record(ras_core, bank, &umc_rec);
if (ret)
goto out;
err_rec = kzalloc(sizeof(*err_rec), GFP_KERNEL);
if (!err_rec) {
ret = -ENOMEM;
goto out;
}
memcpy(err_rec, &umc_rec, sizeof(umc_rec));
ret = ras_umc_log_ecc(ras_core, err_rec->cur_nps_retired_row_pfn, err_rec);
if (ret) {
if (ret == -EEXIST) {
RAS_DEV_INFO(ras_core->dev, "The bad pages have been logged before.\n");
ret = 0;
}
kfree(err_rec);
goto out;
}
ras_umc_reserve_eeprom_record(ras_core, err_rec);
ret = ras_core_event_notify(ras_core,
RAS_EVENT_ID__BAD_PAGE_DETECTED, NULL);
out:
mutex_unlock(&ras_umc->bank_log_lock);
return ret;
}
static int ras_umc_get_new_records(struct ras_core_context *ras_core,
struct eeprom_umc_record *records, u32 num)
{
struct ras_umc *ras_umc = &ras_core->ras_umc;
struct eeprom_umc_record *entries[MAX_ECC_NUM_PER_RETIREMENT];
u32 entry_num = num < MAX_ECC_NUM_PER_RETIREMENT ? num : MAX_ECC_NUM_PER_RETIREMENT;
int count = 0;
int new_detected, i;
mutex_lock(&ras_umc->tree_lock);
new_detected = radix_tree_gang_lookup_tag(&ras_umc->root, (void **)entries,
0, entry_num, UMC_ECC_NEW_DETECTED_TAG);
for (i = 0; i < new_detected; i++) {
if (!entries[i])
continue;
memcpy(&records[i], entries[i], sizeof(struct eeprom_umc_record));
count++;
radix_tree_tag_clear(&ras_umc->root,
entries[i]->cur_nps_retired_row_pfn, UMC_ECC_NEW_DETECTED_TAG);
}
mutex_unlock(&ras_umc->tree_lock);
return count;
}
static bool ras_umc_check_retired_record(struct ras_core_context *ras_core,
struct eeprom_umc_record *record, bool from_eeprom)
{
struct ras_umc *ras_umc = &ras_core->ras_umc;
struct eeprom_store_record *data = &ras_umc->umc_err_data.rom_data;
uint32_t nps = 0;
int i, ret;
if (from_eeprom) {
nps = ras_umc->umc_err_data.umc_nps_mode;
if (ras_umc->ip_func && ras_umc->ip_func->eeprom_record_to_nps_record) {
ret = ras_umc->ip_func->eeprom_record_to_nps_record(ras_core, record, nps);
if (ret)
RAS_DEV_WARN(ras_core->dev,
"Failed to adjust eeprom record, ret:%d", ret);
}
return false;
}
for (i = 0; i < data->count; i++) {
if ((data->bps[i].retired_row_pfn == record->retired_row_pfn) &&
(data->bps[i].cur_nps_retired_row_pfn == record->cur_nps_retired_row_pfn))
return true;
}
return false;
}
/* alloc/realloc bps array */
static int ras_umc_realloc_err_data_space(struct ras_core_context *ras_core,
struct eeprom_store_record *data, int pages)
{
unsigned int old_space = data->count + data->space_left;
unsigned int new_space = old_space + pages;
unsigned int align_space = ALIGN(new_space, 512);
void *bps = kzalloc(align_space * sizeof(*data->bps), GFP_KERNEL);
if (!bps)
return -ENOMEM;
if (data->bps) {
memcpy(bps, data->bps,
data->count * sizeof(*data->bps));
kfree(data->bps);
}
data->bps = bps;
data->space_left += align_space - old_space;
return 0;
}
static int ras_umc_update_eeprom_rom_data(struct ras_core_context *ras_core,
struct eeprom_umc_record *bps)
{
struct eeprom_store_record *data = &ras_core->ras_umc.umc_err_data.rom_data;
if (!data->space_left &&
ras_umc_realloc_err_data_space(ras_core, data, 256)) {
return -ENOMEM;
}
memcpy(&data->bps[data->count], bps, sizeof(*data->bps));
data->count++;
data->space_left--;
return 0;
}
static int ras_umc_update_eeprom_ram_data(struct ras_core_context *ras_core,
struct eeprom_umc_record *bps)
{
struct ras_umc *ras_umc = &ras_core->ras_umc;
struct eeprom_store_record *data = &ras_umc->umc_err_data.ram_data;
uint64_t page_pfn[16];
int count = 0, j;
if (!data->space_left &&
ras_umc_realloc_err_data_space(ras_core, data, 256)) {
return -ENOMEM;
}
memset(page_pfn, 0, sizeof(page_pfn));
if (ras_umc->ip_func && ras_umc->ip_func->eeprom_record_to_nps_pages)
count = ras_umc->ip_func->eeprom_record_to_nps_pages(ras_core,
bps, bps->cur_nps, page_pfn, ARRAY_SIZE(page_pfn));
if (count > 0) {
for (j = 0; j < count; j++) {
bps->cur_nps_retired_row_pfn = page_pfn[j];
memcpy(&data->bps[data->count], bps, sizeof(*data->bps));
data->count++;
data->space_left--;
}
} else {
memcpy(&data->bps[data->count], bps, sizeof(*data->bps));
data->count++;
data->space_left--;
}
return 0;
}
/* it deal with vram only. */
static int ras_umc_add_bad_pages(struct ras_core_context *ras_core,
struct eeprom_umc_record *bps,
int pages, bool from_eeprom)
{
struct ras_umc *ras_umc = &ras_core->ras_umc;
struct ras_umc_err_data *data = &ras_umc->umc_err_data;
int i, ret = 0;
if (!bps || pages <= 0)
return 0;
mutex_lock(&ras_umc->umc_lock);
for (i = 0; i < pages; i++) {
if (ras_umc_check_retired_record(ras_core, &bps[i], from_eeprom))
continue;
ret = ras_umc_update_eeprom_rom_data(ras_core, &bps[i]);
if (ret)
goto out;
if (data->last_retired_pfn == bps[i].cur_nps_retired_row_pfn)
continue;
data->last_retired_pfn = bps[i].cur_nps_retired_row_pfn;
if (from_eeprom)
ras_umc_reserve_eeprom_record(ras_core, &bps[i]);
ret = ras_umc_update_eeprom_ram_data(ras_core, &bps[i]);
if (ret)
goto out;
}
out:
mutex_unlock(&ras_umc->umc_lock);
return ret;
}
/*
* read error record array in eeprom and reserve enough space for
* storing new bad pages
*/
int ras_umc_load_bad_pages(struct ras_core_context *ras_core)
{
struct eeprom_umc_record *bps;
uint32_t ras_num_recs;
int ret;
ras_num_recs = ras_eeprom_get_record_count(ras_core);
/* no bad page record, skip eeprom access */
if (!ras_num_recs ||
ras_core->ras_eeprom.record_threshold_config == DISABLE_RETIRE_PAGE)
return 0;
bps = kcalloc(ras_num_recs, sizeof(*bps), GFP_KERNEL);
if (!bps)
return -ENOMEM;
ret = ras_eeprom_read(ras_core, bps, ras_num_recs);
if (ret) {
RAS_DEV_ERR(ras_core->dev, "Failed to load EEPROM table records!");
} else {
ras_core->ras_umc.umc_err_data.last_retired_pfn = UMC_INV_MEM_PFN;
ret = ras_umc_add_bad_pages(ras_core, bps, ras_num_recs, true);
}
kfree(bps);
return ret;
}
/*
* write error record array to eeprom, the function should be
* protected by recovery_lock
* new_cnt: new added UE count, excluding reserved bad pages, can be NULL
*/
static int ras_umc_save_bad_pages(struct ras_core_context *ras_core)
{
struct ras_umc *ras_umc = &ras_core->ras_umc;
struct eeprom_store_record *data = &ras_umc->umc_err_data.rom_data;
uint32_t eeprom_record_num;
int save_count;
int ret = 0;
if (!data->bps)
return 0;
eeprom_record_num = ras_eeprom_get_record_count(ras_core);
mutex_lock(&ras_umc->umc_lock);
save_count = data->count - eeprom_record_num;
/* only new entries are saved */
if (save_count > 0) {
if (ras_eeprom_append(ras_core,
&data->bps[eeprom_record_num],
save_count)) {
RAS_DEV_ERR(ras_core->dev, "Failed to save EEPROM table data!");
ret = -EIO;
goto exit;
}
RAS_DEV_INFO(ras_core->dev, "Saved %d pages to EEPROM table.\n", save_count);
}
exit:
mutex_unlock(&ras_umc->umc_lock);
return ret;
}
int ras_umc_handle_bad_pages(struct ras_core_context *ras_core, void *data)
{
struct eeprom_umc_record records[MAX_ECC_NUM_PER_RETIREMENT];
int count, ret;
memset(records, 0, sizeof(records));
count = ras_umc_get_new_records(ras_core, records, ARRAY_SIZE(records));
if (count <= 0)
return -ENODATA;
ret = ras_umc_add_bad_pages(ras_core, records, count, false);
if (ret) {
RAS_DEV_ERR(ras_core->dev, "Failed to add ras bad page!\n");
return -EINVAL;
}
ret = ras_umc_save_bad_pages(ras_core);
if (ret) {
RAS_DEV_ERR(ras_core->dev, "Failed to save ras bad page\n");
return -EINVAL;
}
return 0;
}
int ras_umc_sw_init(struct ras_core_context *ras_core)
{
struct ras_umc *ras_umc = &ras_core->ras_umc;
memset(ras_umc, 0, sizeof(*ras_umc));
INIT_LIST_HEAD(&ras_umc->pending_ecc_list);
INIT_RADIX_TREE(&ras_umc->root, GFP_KERNEL);
mutex_init(&ras_umc->tree_lock);
mutex_init(&ras_umc->pending_ecc_lock);
mutex_init(&ras_umc->umc_lock);
mutex_init(&ras_umc->bank_log_lock);
return 0;
}
int ras_umc_sw_fini(struct ras_core_context *ras_core)
{
struct ras_umc *ras_umc = &ras_core->ras_umc;
struct ras_umc_err_data *umc_err_data = &ras_umc->umc_err_data;
struct ras_bank_ecc_node *ecc_node, *tmp;
mutex_destroy(&ras_umc->umc_lock);
mutex_destroy(&ras_umc->bank_log_lock);
if (umc_err_data->rom_data.bps) {
umc_err_data->rom_data.count = 0;
kfree(umc_err_data->rom_data.bps);
umc_err_data->rom_data.bps = NULL;
umc_err_data->rom_data.space_left = 0;
}
if (umc_err_data->ram_data.bps) {
umc_err_data->ram_data.count = 0;
kfree(umc_err_data->ram_data.bps);
umc_err_data->ram_data.bps = NULL;
umc_err_data->ram_data.space_left = 0;
}
ras_umc_clear_logged_ecc(ras_core);
mutex_lock(&ras_umc->pending_ecc_lock);
list_for_each_entry_safe(ecc_node,
tmp, &ras_umc->pending_ecc_list, node){
list_del(&ecc_node->node);
kfree(ecc_node);
}
mutex_unlock(&ras_umc->pending_ecc_lock);
mutex_destroy(&ras_umc->tree_lock);
mutex_destroy(&ras_umc->pending_ecc_lock);
return 0;
}
int ras_umc_hw_init(struct ras_core_context *ras_core)
{
struct ras_umc *ras_umc = &ras_core->ras_umc;
uint32_t nps;
nps = ras_core_get_curr_nps_mode(ras_core);
if (!nps || (nps >= UMC_MEMORY_PARTITION_MODE_UNKNOWN)) {
RAS_DEV_ERR(ras_core->dev, "Invalid memory NPS mode: %u!\n", nps);
return -ENODATA;
}
ras_umc->umc_err_data.umc_nps_mode = nps;
ras_umc->umc_vram_type = ras_core->config->umc_cfg.umc_vram_type;
if (!ras_umc->umc_vram_type) {
RAS_DEV_ERR(ras_core->dev, "Invalid UMC VRAM Type: %u!\n",
ras_umc->umc_vram_type);
return -ENODATA;
}
ras_umc->umc_ip_version = ras_core->config->umc_ip_version;
ras_umc->ip_func = ras_umc_get_ip_func(ras_core, ras_umc->umc_ip_version);
if (!ras_umc->ip_func)
return -EINVAL;
return 0;
}
int ras_umc_hw_fini(struct ras_core_context *ras_core)
{
return 0;
}
int ras_umc_clean_badpage_data(struct ras_core_context *ras_core)
{
struct ras_umc_err_data *data = &ras_core->ras_umc.umc_err_data;
mutex_lock(&ras_core->ras_umc.umc_lock);
kfree(data->rom_data.bps);
kfree(data->ram_data.bps);
memset(data, 0, sizeof(*data));
mutex_unlock(&ras_core->ras_umc.umc_lock);
return 0;
}
int ras_umc_fill_eeprom_record(struct ras_core_context *ras_core,
uint64_t err_addr, uint32_t umc_inst, struct umc_phy_addr *cur_nps_addr,
enum umc_memory_partition_mode cur_nps, struct eeprom_umc_record *record)
{
struct eeprom_umc_record *err_rec = record;
/* Set bad page pfn and nps mode */
EEPROM_RECORD_SETUP_UMC_ADDR_AND_NPS(err_rec,
RAS_ADDR_TO_PFN(cur_nps_addr->pa), cur_nps);
err_rec->address = err_addr;
err_rec->ts = ras_umc_get_eeprom_timestamp(ras_core);
err_rec->err_type = RAS_EEPROM_ERR_NON_RECOVERABLE;
err_rec->cu = 0;
err_rec->mem_channel = cur_nps_addr->channel_idx;
err_rec->mcumc_id = umc_inst;
err_rec->cur_nps_retired_row_pfn = RAS_ADDR_TO_PFN(cur_nps_addr->pa);
err_rec->cur_nps_bank = cur_nps_addr->bank;
err_rec->cur_nps = cur_nps;
return 0;
}
int ras_umc_get_saved_eeprom_count(struct ras_core_context *ras_core)
{
struct ras_umc_err_data *err_data = &ras_core->ras_umc.umc_err_data;
return err_data->rom_data.count;
}
int ras_umc_get_badpage_count(struct ras_core_context *ras_core)
{
struct eeprom_store_record *data = &ras_core->ras_umc.umc_err_data.ram_data;
return data->count;
}
int ras_umc_get_badpage_record(struct ras_core_context *ras_core, uint32_t index, void *record)
{
struct eeprom_store_record *data = &ras_core->ras_umc.umc_err_data.ram_data;
if (index >= data->count)
return -EINVAL;
memcpy(record, &data->bps[index], sizeof(struct eeprom_umc_record));
return 0;
}
bool ras_umc_check_retired_addr(struct ras_core_context *ras_core, uint64_t addr)
{
struct ras_umc *ras_umc = &ras_core->ras_umc;
struct eeprom_store_record *data = &ras_umc->umc_err_data.ram_data;
uint64_t page_pfn = RAS_ADDR_TO_PFN(addr);
int i, ret = false;
mutex_lock(&ras_umc->umc_lock);
for (i = 0; i < data->count; i++) {
if (data->bps[i].cur_nps_retired_row_pfn == page_pfn) {
ret = true;
break;
}
}
mutex_unlock(&ras_umc->umc_lock);
return ret;
}
int ras_umc_translate_soc_pa_and_bank(struct ras_core_context *ras_core,
uint64_t *soc_pa, struct umc_bank_addr *bank_addr, bool bank_to_pa)
{
struct ras_umc *ras_umc = &ras_core->ras_umc;
int ret = 0;
if (bank_to_pa)
ret = ras_umc->ip_func->bank_to_soc_pa(ras_core, *bank_addr, soc_pa);
else
ret = ras_umc->ip_func->soc_pa_to_bank(ras_core, *soc_pa, bank_addr);
return ret;
}

View File

@@ -0,0 +1,166 @@
/* SPDX-License-Identifier: MIT */
/*
* Copyright 2025 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*
*/
#ifndef __RAS_UMC_H__
#define __RAS_UMC_H__
#include "ras.h"
#include "ras_eeprom.h"
#include "ras_cmd.h"
#define UMC_VRAM_TYPE_UNKNOWN 0
#define UMC_VRAM_TYPE_GDDR1 1
#define UMC_VRAM_TYPE_DDR2 2
#define UMC_VRAM_TYPE_GDDR3 3
#define UMC_VRAM_TYPE_GDDR4 4
#define UMC_VRAM_TYPE_GDDR5 5
#define UMC_VRAM_TYPE_HBM 6
#define UMC_VRAM_TYPE_DDR3 7
#define UMC_VRAM_TYPE_DDR4 8
#define UMC_VRAM_TYPE_GDDR6 9
#define UMC_VRAM_TYPE_DDR5 10
#define UMC_VRAM_TYPE_LPDDR4 11
#define UMC_VRAM_TYPE_LPDDR5 12
#define UMC_VRAM_TYPE_HBM3E 13
#define UMC_ECC_NEW_DETECTED_TAG 0x1
#define UMC_INV_MEM_PFN (0xFFFFFFFFFFFFFFFF)
/* three column bits and one row bit in MCA address flip
* in bad page retirement
*/
#define UMC_PA_FLIP_BITS_NUM 4
enum umc_memory_partition_mode {
UMC_MEMORY_PARTITION_MODE_NONE = 0,
UMC_MEMORY_PARTITION_MODE_NPS1 = 1,
UMC_MEMORY_PARTITION_MODE_NPS2 = 2,
UMC_MEMORY_PARTITION_MODE_NPS3 = 3,
UMC_MEMORY_PARTITION_MODE_NPS4 = 4,
UMC_MEMORY_PARTITION_MODE_NPS6 = 6,
UMC_MEMORY_PARTITION_MODE_NPS8 = 8,
UMC_MEMORY_PARTITION_MODE_UNKNOWN
};
struct ras_core_context;
struct ras_bank_ecc;
struct umc_flip_bits {
uint32_t flip_bits_in_pa[UMC_PA_FLIP_BITS_NUM];
uint32_t flip_row_bit;
uint32_t r13_in_pa;
uint32_t bit_num;
};
struct umc_mca_addr {
uint64_t err_addr;
uint32_t ch_inst;
uint32_t umc_inst;
uint32_t node_inst;
uint32_t socket_id;
};
struct umc_phy_addr {
uint64_t pa;
uint32_t bank;
uint32_t channel_idx;
};
struct umc_bank_addr {
uint32_t stack_id; /* SID */
uint32_t bank_group;
uint32_t bank;
uint32_t row;
uint32_t column;
uint32_t channel;
uint32_t subchannel; /* Also called Pseudochannel (PC) */
};
struct ras_umc_ip_func {
int (*bank_to_eeprom_record)(struct ras_core_context *ras_core,
struct ras_bank_ecc *bank, struct eeprom_umc_record *record);
int (*eeprom_record_to_nps_record)(struct ras_core_context *ras_core,
struct eeprom_umc_record *record, uint32_t nps);
int (*eeprom_record_to_nps_pages)(struct ras_core_context *ras_core,
struct eeprom_umc_record *record, uint32_t nps,
uint64_t *pfns, uint32_t num);
int (*bank_to_soc_pa)(struct ras_core_context *ras_core,
struct umc_bank_addr bank_addr, uint64_t *soc_pa);
int (*soc_pa_to_bank)(struct ras_core_context *ras_core,
uint64_t soc_pa, struct umc_bank_addr *bank_addr);
};
struct eeprom_store_record {
/* point to data records array */
struct eeprom_umc_record *bps;
/* the count of entries */
int count;
/* the space can place new entries */
int space_left;
};
struct ras_umc_err_data {
struct eeprom_store_record rom_data;
struct eeprom_store_record ram_data;
enum umc_memory_partition_mode umc_nps_mode;
uint64_t last_retired_pfn;
};
struct ras_umc {
u32 umc_ip_version;
u32 umc_vram_type;
const struct ras_umc_ip_func *ip_func;
struct radix_tree_root root;
struct mutex tree_lock;
struct mutex umc_lock;
struct mutex bank_log_lock;
struct mutex pending_ecc_lock;
struct ras_umc_err_data umc_err_data;
struct list_head pending_ecc_list;
};
int ras_umc_sw_init(struct ras_core_context *ras);
int ras_umc_sw_fini(struct ras_core_context *ras);
int ras_umc_hw_init(struct ras_core_context *ras);
int ras_umc_hw_fini(struct ras_core_context *ras);
int ras_umc_psp_convert_ma_to_pa(struct ras_core_context *ras_core,
struct umc_mca_addr *in, struct umc_phy_addr *out,
uint32_t nps);
int ras_umc_handle_bad_pages(struct ras_core_context *ras_core, void *data);
int ras_umc_log_bad_bank(struct ras_core_context *ras, struct ras_bank_ecc *bank);
int ras_umc_log_bad_bank_pending(struct ras_core_context *ras_core, struct ras_bank_ecc *bank);
int ras_umc_log_pending_bad_bank(struct ras_core_context *ras_core);
int ras_umc_clear_logged_ecc(struct ras_core_context *ras_core);
int ras_umc_load_bad_pages(struct ras_core_context *ras_core);
int ras_umc_get_saved_eeprom_count(struct ras_core_context *ras_core);
int ras_umc_clean_badpage_data(struct ras_core_context *ras_core);
int ras_umc_fill_eeprom_record(struct ras_core_context *ras_core,
uint64_t err_addr, uint32_t umc_inst, struct umc_phy_addr *cur_nps_addr,
enum umc_memory_partition_mode cur_nps, struct eeprom_umc_record *record);
int ras_umc_get_badpage_count(struct ras_core_context *ras_core);
int ras_umc_get_badpage_record(struct ras_core_context *ras_core, uint32_t index, void *record);
bool ras_umc_check_retired_addr(struct ras_core_context *ras_core, uint64_t addr);
int ras_umc_translate_soc_pa_and_bank(struct ras_core_context *ras_core,
uint64_t *soc_pa, struct umc_bank_addr *bank_addr, bool bank_to_pa);
#endif