mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2026-05-16 12:31:52 -04:00
drm/amd/ras: use dedicated memory as vf ras command buffer
Use dedicated memory as vf ras command buffer. V2: Add lock to ensure serialization of sending vf ras commands. Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com> Reviewed-by: Jinzhou Su <jinzhou.su@amd.com> Tested-by: Jinzhou Su <jinzhou.su@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
committed by
Alex Deucher
parent
bfe60e539c
commit
cb48a6b2b6
@@ -470,14 +470,22 @@ struct amd_sriov_ras_chk_criti {
|
||||
uint32_t hit;
|
||||
};
|
||||
|
||||
union amd_sriov_ras_host_push {
|
||||
struct amd_sriov_ras_telemetry_error_count error_count;
|
||||
struct amd_sriov_ras_cper_dump cper_dump;
|
||||
struct amd_sriov_ras_chk_criti chk_criti;
|
||||
};
|
||||
|
||||
#define AMD_SRIOV_UNIRAS_CMD_MAX_SIZE (PAGE_SIZE * 13)
|
||||
struct amd_sriov_uniras_shared_mem {
|
||||
uint8_t blocks_ecc_buf[PAGE_SIZE];
|
||||
uint8_t cmd_buf[AMD_SRIOV_UNIRAS_CMD_MAX_SIZE];
|
||||
};
|
||||
|
||||
struct amdsriov_ras_telemetry {
|
||||
struct amd_sriov_ras_telemetry_header header;
|
||||
|
||||
union {
|
||||
struct amd_sriov_ras_telemetry_error_count error_count;
|
||||
struct amd_sriov_ras_cper_dump cper_dump;
|
||||
struct amd_sriov_ras_chk_criti chk_criti;
|
||||
} body;
|
||||
union amd_sriov_ras_host_push body;
|
||||
struct amd_sriov_uniras_shared_mem uniras_shared_mem;
|
||||
};
|
||||
|
||||
/* version data stored in MAILBOX_MSGBUF_RCV_DW1 for future expansion */
|
||||
@@ -510,6 +518,10 @@ _Static_assert(AMD_SRIOV_MSG_RESERVE_UCODE % 4 == 0,
|
||||
_Static_assert(AMD_SRIOV_MSG_RESERVE_UCODE > AMD_SRIOV_UCODE_ID__MAX,
|
||||
"AMD_SRIOV_MSG_RESERVE_UCODE must be bigger than AMD_SRIOV_UCODE_ID__MAX");
|
||||
|
||||
_Static_assert(
|
||||
sizeof(struct amdsriov_ras_telemetry) <= AMD_SRIOV_MSG_RAS_TELEMETRY_SIZE_KB_V1 << 10,
|
||||
"amdsriov_ras_telemetry must be " stringification(AMD_SRIOV_MSG_RAS_TELEMETRY_SIZE_KB_V1) " KB");
|
||||
|
||||
#undef _stringification
|
||||
#undef stringification
|
||||
#endif
|
||||
|
||||
@@ -30,28 +30,83 @@
|
||||
#include "amdgpu_virt_ras_cmd.h"
|
||||
#include "amdgpu_ras_mgr.h"
|
||||
|
||||
static int amdgpu_virt_ras_get_cmd_shared_mem(struct ras_core_context *ras_core,
|
||||
uint32_t cmd, uint32_t mem_size, struct amdgpu_virt_shared_mem *shared_mem)
|
||||
{
|
||||
struct amdgpu_device *adev = ras_core->dev;
|
||||
struct amdsriov_ras_telemetry *ras_telemetry_cpu;
|
||||
struct amdsriov_ras_telemetry *ras_telemetry_gpu;
|
||||
uint64_t fw_vram_usage_start_offset = 0;
|
||||
uint64_t ras_telemetry_offset = 0;
|
||||
|
||||
if (!adev->virt.fw_reserve.ras_telemetry)
|
||||
return -EINVAL;
|
||||
|
||||
if (adev->mman.fw_vram_usage_va &&
|
||||
adev->mman.fw_vram_usage_va <= adev->virt.fw_reserve.ras_telemetry) {
|
||||
fw_vram_usage_start_offset = adev->mman.fw_vram_usage_start_offset;
|
||||
ras_telemetry_offset = (uintptr_t)adev->virt.fw_reserve.ras_telemetry -
|
||||
(uintptr_t)adev->mman.fw_vram_usage_va;
|
||||
} else if (adev->mman.drv_vram_usage_va &&
|
||||
adev->mman.drv_vram_usage_va <= adev->virt.fw_reserve.ras_telemetry) {
|
||||
fw_vram_usage_start_offset = adev->mman.drv_vram_usage_start_offset;
|
||||
ras_telemetry_offset = (uintptr_t)adev->virt.fw_reserve.ras_telemetry -
|
||||
(uintptr_t)adev->mman.drv_vram_usage_va;
|
||||
} else {
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
ras_telemetry_cpu =
|
||||
(struct amdsriov_ras_telemetry *)adev->virt.fw_reserve.ras_telemetry;
|
||||
ras_telemetry_gpu =
|
||||
(struct amdsriov_ras_telemetry *)(uintptr_t)(fw_vram_usage_start_offset +
|
||||
ras_telemetry_offset);
|
||||
|
||||
if (cmd == RAS_CMD__GET_ALL_BLOCK_ECC_STATUS) {
|
||||
if (mem_size > PAGE_SIZE)
|
||||
return -ENOMEM;
|
||||
|
||||
shared_mem->cpu_addr = ras_telemetry_cpu->uniras_shared_mem.blocks_ecc_buf;
|
||||
shared_mem->gpa =
|
||||
(uintptr_t)ras_telemetry_gpu->uniras_shared_mem.blocks_ecc_buf -
|
||||
adev->gmc.vram_start;
|
||||
shared_mem->size = mem_size;
|
||||
} else {
|
||||
if (mem_size > AMD_SRIOV_UNIRAS_CMD_MAX_SIZE)
|
||||
return -ENOMEM;
|
||||
|
||||
shared_mem->cpu_addr = ras_telemetry_cpu->uniras_shared_mem.cmd_buf;
|
||||
shared_mem->gpa =
|
||||
(uintptr_t)ras_telemetry_gpu->uniras_shared_mem.cmd_buf -
|
||||
adev->gmc.vram_start;
|
||||
shared_mem->size = mem_size;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int amdgpu_virt_ras_remote_ioctl_cmd(struct ras_core_context *ras_core,
|
||||
struct ras_cmd_ctx *cmd, void *output_data, uint32_t output_size)
|
||||
{
|
||||
struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev;
|
||||
struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(ras_core->dev);
|
||||
struct amdgpu_virt_ras_cmd *virt_ras = ras_mgr->virt_ras_cmd;
|
||||
uint32_t mem_len = ALIGN(sizeof(*cmd) + output_size, AMDGPU_GPU_PAGE_SIZE);
|
||||
struct ras_cmd_ctx *rcmd;
|
||||
struct amdgpu_bo *rcmd_bo = NULL;
|
||||
uint64_t mc_addr = 0;
|
||||
void *cpu_addr = NULL;
|
||||
struct amdgpu_virt_shared_mem shared_mem = {0};
|
||||
int ret = 0;
|
||||
|
||||
ret = amdgpu_bo_create_kernel(adev, mem_len, PAGE_SIZE,
|
||||
AMDGPU_GEM_DOMAIN_VRAM, &rcmd_bo, &mc_addr, (void **)&cpu_addr);
|
||||
if (ret)
|
||||
return ret;
|
||||
mutex_lock(&virt_ras->remote_access_lock);
|
||||
|
||||
rcmd = (struct ras_cmd_ctx *)cpu_addr;
|
||||
ret = amdgpu_virt_ras_get_cmd_shared_mem(ras_core, cmd->cmd_id, mem_len, &shared_mem);
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
rcmd = (struct ras_cmd_ctx *)shared_mem.cpu_addr;
|
||||
memset(rcmd, 0, mem_len);
|
||||
memcpy(rcmd, cmd, sizeof(*cmd));
|
||||
|
||||
ret = amdgpu_virt_send_remote_ras_cmd(ras_core->dev,
|
||||
mc_addr - adev->gmc.vram_start, mem_len);
|
||||
shared_mem.gpa, mem_len);
|
||||
if (!ret) {
|
||||
if (rcmd->cmd_res) {
|
||||
ret = rcmd->cmd_res;
|
||||
@@ -65,8 +120,7 @@ static int amdgpu_virt_ras_remote_ioctl_cmd(struct ras_core_context *ras_core,
|
||||
}
|
||||
|
||||
out:
|
||||
amdgpu_bo_free_kernel(&rcmd_bo, &mc_addr, &cpu_addr);
|
||||
|
||||
mutex_unlock(&virt_ras->remote_access_lock);
|
||||
return ret;
|
||||
}
|
||||
|
||||
@@ -77,6 +131,9 @@ static int amdgpu_virt_ras_send_remote_cmd(struct ras_core_context *ras_core,
|
||||
struct ras_cmd_ctx rcmd = {0};
|
||||
int ret;
|
||||
|
||||
if (input_size > RAS_CMD_MAX_IN_SIZE)
|
||||
return RAS_CMD__ERROR_INVALID_INPUT_SIZE;
|
||||
|
||||
rcmd.cmd_id = cmd_id;
|
||||
rcmd.input_size = input_size;
|
||||
memcpy(rcmd.input_buff_raw, input_data, input_size);
|
||||
@@ -146,7 +203,7 @@ static int amdgpu_virt_ras_get_batch_records(struct ras_core_context *ras_core,
|
||||
struct ras_cmd_batch_trace_record_rsp *rsp = rsp_cache;
|
||||
struct batch_ras_trace_info *batch;
|
||||
int ret = 0;
|
||||
uint8_t i;
|
||||
uint32_t i;
|
||||
|
||||
if (!rsp->real_batch_num || (batch_id < rsp->start_batch_id) ||
|
||||
(batch_id >= (rsp->start_batch_id + rsp->real_batch_num))) {
|
||||
@@ -249,14 +306,14 @@ static int __fill_get_blocks_ecc_cmd(struct amdgpu_device *adev,
|
||||
{
|
||||
struct ras_cmd_ctx *rcmd;
|
||||
|
||||
if (!blks_ecc || !blks_ecc->bo || !blks_ecc->cpu_addr)
|
||||
if (!blks_ecc || !blks_ecc->shared_mem.cpu_addr)
|
||||
return -EINVAL;
|
||||
|
||||
rcmd = (struct ras_cmd_ctx *)blks_ecc->cpu_addr;
|
||||
rcmd = (struct ras_cmd_ctx *)blks_ecc->shared_mem.cpu_addr;
|
||||
|
||||
rcmd->cmd_id = RAS_CMD__GET_ALL_BLOCK_ECC_STATUS;
|
||||
rcmd->input_size = sizeof(struct ras_cmd_blocks_ecc_req);
|
||||
rcmd->output_buf_size = blks_ecc->size - sizeof(*rcmd);
|
||||
rcmd->output_buf_size = blks_ecc->shared_mem.size - sizeof(*rcmd);
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -305,15 +362,15 @@ static int amdgpu_virt_ras_get_block_ecc(struct ras_core_context *ras_core,
|
||||
|
||||
if (!virt_ras->blocks_ecc.auto_update_actived) {
|
||||
ret = __set_cmd_auto_update(adev, RAS_CMD__GET_ALL_BLOCK_ECC_STATUS,
|
||||
blks_ecc->mc_addr - adev->gmc.vram_start,
|
||||
blks_ecc->size, true);
|
||||
blks_ecc->shared_mem.gpa,
|
||||
blks_ecc->shared_mem.size, true);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
blks_ecc->auto_update_actived = true;
|
||||
}
|
||||
|
||||
blks_ecc_cmd_ctx = blks_ecc->cpu_addr;
|
||||
blks_ecc_cmd_ctx = blks_ecc->shared_mem.cpu_addr;
|
||||
blks_ecc_rsp = (struct ras_cmd_blocks_ecc_rsp *)blks_ecc_cmd_ctx->output_buff_raw;
|
||||
|
||||
output_data->ce_count = blks_ecc_rsp->blocks[input_data->block_id].ce_count;
|
||||
@@ -364,18 +421,24 @@ int amdgpu_virt_ras_handle_cmd(struct ras_core_context *ras_core,
|
||||
int amdgpu_virt_ras_sw_init(struct amdgpu_device *adev)
|
||||
{
|
||||
struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
|
||||
struct amdgpu_virt_ras_cmd *virt_ras_cmd;
|
||||
|
||||
ras_mgr->virt_ras_cmd = kzalloc_obj(struct amdgpu_virt_ras_cmd);
|
||||
if (!ras_mgr->virt_ras_cmd)
|
||||
return -ENOMEM;
|
||||
|
||||
virt_ras_cmd = ras_mgr->virt_ras_cmd;
|
||||
mutex_init(&virt_ras_cmd->remote_access_lock);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int amdgpu_virt_ras_sw_fini(struct amdgpu_device *adev)
|
||||
{
|
||||
struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
|
||||
struct amdgpu_virt_ras_cmd *virt_ras_cmd = ras_mgr->virt_ras_cmd;
|
||||
|
||||
mutex_destroy(&virt_ras_cmd->remote_access_lock);
|
||||
kfree(ras_mgr->virt_ras_cmd);
|
||||
ras_mgr->virt_ras_cmd = NULL;
|
||||
|
||||
@@ -392,11 +455,9 @@ int amdgpu_virt_ras_hw_init(struct amdgpu_device *adev)
|
||||
amdgpu_virt_get_ras_capability(adev);
|
||||
|
||||
memset(blks_ecc, 0, sizeof(*blks_ecc));
|
||||
blks_ecc->size = PAGE_SIZE;
|
||||
if (amdgpu_bo_create_kernel(adev, blks_ecc->size,
|
||||
PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
|
||||
&blks_ecc->bo, &blks_ecc->mc_addr,
|
||||
(void **)&blks_ecc->cpu_addr))
|
||||
if (amdgpu_virt_ras_get_cmd_shared_mem(ras_mgr->ras_core,
|
||||
RAS_CMD__GET_ALL_BLOCK_ECC_STATUS,
|
||||
PAGE_SIZE, &blks_ecc->shared_mem))
|
||||
return -ENOMEM;
|
||||
|
||||
return 0;
|
||||
@@ -409,18 +470,15 @@ int amdgpu_virt_ras_hw_fini(struct amdgpu_device *adev)
|
||||
(struct amdgpu_virt_ras_cmd *)ras_mgr->virt_ras_cmd;
|
||||
struct vram_blocks_ecc *blks_ecc = &virt_ras->blocks_ecc;
|
||||
|
||||
if (blks_ecc->bo) {
|
||||
if (blks_ecc->shared_mem.cpu_addr) {
|
||||
__set_cmd_auto_update(adev,
|
||||
RAS_CMD__GET_ALL_BLOCK_ECC_STATUS,
|
||||
blks_ecc->mc_addr - adev->gmc.vram_start,
|
||||
blks_ecc->size, false);
|
||||
blks_ecc->shared_mem.gpa,
|
||||
blks_ecc->shared_mem.size, false);
|
||||
|
||||
memset(blks_ecc->cpu_addr, 0, blks_ecc->size);
|
||||
amdgpu_bo_free_kernel(&blks_ecc->bo,
|
||||
&blks_ecc->mc_addr, &blks_ecc->cpu_addr);
|
||||
|
||||
memset(blks_ecc, 0, sizeof(*blks_ecc));
|
||||
memset(blks_ecc->shared_mem.cpu_addr, 0, blks_ecc->shared_mem.size);
|
||||
}
|
||||
memset(blks_ecc, 0, sizeof(*blks_ecc));
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -30,11 +30,14 @@ struct remote_batch_trace_mgr {
|
||||
struct ras_cmd_batch_trace_record_rsp batch_trace;
|
||||
};
|
||||
|
||||
struct vram_blocks_ecc {
|
||||
struct amdgpu_bo *bo;
|
||||
uint64_t mc_addr;
|
||||
struct amdgpu_virt_shared_mem {
|
||||
uint64_t gpa;
|
||||
void *cpu_addr;
|
||||
uint32_t size;
|
||||
};
|
||||
|
||||
struct vram_blocks_ecc {
|
||||
struct amdgpu_virt_shared_mem shared_mem;
|
||||
bool auto_update_actived;
|
||||
};
|
||||
|
||||
@@ -42,6 +45,7 @@ struct amdgpu_virt_ras_cmd {
|
||||
bool remote_uniras_supported;
|
||||
struct remote_batch_trace_mgr batch_mgr;
|
||||
struct vram_blocks_ecc blocks_ecc;
|
||||
struct mutex remote_access_lock;
|
||||
};
|
||||
|
||||
int amdgpu_virt_ras_sw_init(struct amdgpu_device *adev);
|
||||
|
||||
Reference in New Issue
Block a user