|
|
|
|
@@ -279,8 +279,68 @@ int hl_fw_send_heartbeat(struct hl_device *hdev)
|
|
|
|
|
return rc;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static int fw_read_errors(struct hl_device *hdev, u32 boot_err0_reg,
|
|
|
|
|
u32 cpu_security_boot_status_reg)
|
|
|
|
|
{
|
|
|
|
|
u32 err_val, security_val;
|
|
|
|
|
|
|
|
|
|
/* Some of the firmware status codes are deprecated in newer f/w
|
|
|
|
|
* versions. In those versions, the errors are reported
|
|
|
|
|
* in different registers. Therefore, we need to check those
|
|
|
|
|
* registers and print the exact errors. Moreover, there
|
|
|
|
|
* may be multiple errors, so we need to report on each error
|
|
|
|
|
* separately. Some of the error codes might indicate a state
|
|
|
|
|
* that is not an error per-se, but it is an error in production
|
|
|
|
|
* environment
|
|
|
|
|
*/
|
|
|
|
|
err_val = RREG32(boot_err0_reg);
|
|
|
|
|
if (!(err_val & CPU_BOOT_ERR0_ENABLED))
|
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
|
|
if (err_val & CPU_BOOT_ERR0_DRAM_INIT_FAIL)
|
|
|
|
|
dev_err(hdev->dev,
|
|
|
|
|
"Device boot error - DRAM initialization failed\n");
|
|
|
|
|
if (err_val & CPU_BOOT_ERR0_FIT_CORRUPTED)
|
|
|
|
|
dev_err(hdev->dev, "Device boot error - FIT image corrupted\n");
|
|
|
|
|
if (err_val & CPU_BOOT_ERR0_TS_INIT_FAIL)
|
|
|
|
|
dev_err(hdev->dev,
|
|
|
|
|
"Device boot error - Thermal Sensor initialization failed\n");
|
|
|
|
|
if (err_val & CPU_BOOT_ERR0_DRAM_SKIPPED)
|
|
|
|
|
dev_warn(hdev->dev,
|
|
|
|
|
"Device boot warning - Skipped DRAM initialization\n");
|
|
|
|
|
if (err_val & CPU_BOOT_ERR0_BMC_WAIT_SKIPPED)
|
|
|
|
|
dev_warn(hdev->dev,
|
|
|
|
|
"Device boot error - Skipped waiting for BMC\n");
|
|
|
|
|
if (err_val & CPU_BOOT_ERR0_NIC_DATA_NOT_RDY)
|
|
|
|
|
dev_err(hdev->dev,
|
|
|
|
|
"Device boot error - Serdes data from BMC not available\n");
|
|
|
|
|
if (err_val & CPU_BOOT_ERR0_NIC_FW_FAIL)
|
|
|
|
|
dev_err(hdev->dev,
|
|
|
|
|
"Device boot error - NIC F/W initialization failed\n");
|
|
|
|
|
if (err_val & CPU_BOOT_ERR0_SECURITY_NOT_RDY)
|
|
|
|
|
dev_warn(hdev->dev,
|
|
|
|
|
"Device boot warning - security not ready\n");
|
|
|
|
|
if (err_val & CPU_BOOT_ERR0_SECURITY_FAIL)
|
|
|
|
|
dev_err(hdev->dev, "Device boot error - security failure\n");
|
|
|
|
|
if (err_val & CPU_BOOT_ERR0_EFUSE_FAIL)
|
|
|
|
|
dev_err(hdev->dev, "Device boot error - eFuse failure\n");
|
|
|
|
|
if (err_val & CPU_BOOT_ERR0_PLL_FAIL)
|
|
|
|
|
dev_err(hdev->dev, "Device boot error - PLL failure\n");
|
|
|
|
|
|
|
|
|
|
security_val = RREG32(cpu_security_boot_status_reg);
|
|
|
|
|
if (security_val & CPU_BOOT_DEV_STS0_ENABLED)
|
|
|
|
|
dev_dbg(hdev->dev, "Device security status %#x\n",
|
|
|
|
|
security_val);
|
|
|
|
|
|
|
|
|
|
if (err_val & ~CPU_BOOT_ERR0_ENABLED)
|
|
|
|
|
return -EIO;
|
|
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
int hl_fw_cpucp_info_get(struct hl_device *hdev,
|
|
|
|
|
u32 cpu_security_boot_status_reg)
|
|
|
|
|
u32 cpu_security_boot_status_reg,
|
|
|
|
|
u32 boot_err0_reg)
|
|
|
|
|
{
|
|
|
|
|
struct asic_fixed_properties *prop = &hdev->asic_prop;
|
|
|
|
|
struct cpucp_packet pkt = {};
|
|
|
|
|
@@ -314,6 +374,12 @@ int hl_fw_cpucp_info_get(struct hl_device *hdev,
|
|
|
|
|
goto out;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
rc = fw_read_errors(hdev, boot_err0_reg, cpu_security_boot_status_reg);
|
|
|
|
|
if (rc) {
|
|
|
|
|
dev_err(hdev->dev, "Errors in device boot\n");
|
|
|
|
|
goto out;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
memcpy(&prop->cpucp_info, cpucp_info_cpu_addr,
|
|
|
|
|
sizeof(prop->cpucp_info));
|
|
|
|
|
|
|
|
|
|
@@ -483,58 +549,6 @@ int hl_fw_cpucp_pll_info_get(struct hl_device *hdev, u16 pll_index,
|
|
|
|
|
return rc;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static void fw_read_errors(struct hl_device *hdev, u32 boot_err0_reg,
|
|
|
|
|
u32 cpu_security_boot_status_reg)
|
|
|
|
|
{
|
|
|
|
|
u32 err_val, security_val;
|
|
|
|
|
|
|
|
|
|
/* Some of the firmware status codes are deprecated in newer f/w
|
|
|
|
|
* versions. In those versions, the errors are reported
|
|
|
|
|
* in different registers. Therefore, we need to check those
|
|
|
|
|
* registers and print the exact errors. Moreover, there
|
|
|
|
|
* may be multiple errors, so we need to report on each error
|
|
|
|
|
* separately. Some of the error codes might indicate a state
|
|
|
|
|
* that is not an error per-se, but it is an error in production
|
|
|
|
|
* environment
|
|
|
|
|
*/
|
|
|
|
|
err_val = RREG32(boot_err0_reg);
|
|
|
|
|
if (!(err_val & CPU_BOOT_ERR0_ENABLED))
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
if (err_val & CPU_BOOT_ERR0_DRAM_INIT_FAIL)
|
|
|
|
|
dev_err(hdev->dev,
|
|
|
|
|
"Device boot error - DRAM initialization failed\n");
|
|
|
|
|
if (err_val & CPU_BOOT_ERR0_FIT_CORRUPTED)
|
|
|
|
|
dev_err(hdev->dev, "Device boot error - FIT image corrupted\n");
|
|
|
|
|
if (err_val & CPU_BOOT_ERR0_TS_INIT_FAIL)
|
|
|
|
|
dev_err(hdev->dev,
|
|
|
|
|
"Device boot error - Thermal Sensor initialization failed\n");
|
|
|
|
|
if (err_val & CPU_BOOT_ERR0_DRAM_SKIPPED)
|
|
|
|
|
dev_warn(hdev->dev,
|
|
|
|
|
"Device boot warning - Skipped DRAM initialization\n");
|
|
|
|
|
if (err_val & CPU_BOOT_ERR0_BMC_WAIT_SKIPPED)
|
|
|
|
|
dev_warn(hdev->dev,
|
|
|
|
|
"Device boot error - Skipped waiting for BMC\n");
|
|
|
|
|
if (err_val & CPU_BOOT_ERR0_NIC_DATA_NOT_RDY)
|
|
|
|
|
dev_err(hdev->dev,
|
|
|
|
|
"Device boot error - Serdes data from BMC not available\n");
|
|
|
|
|
if (err_val & CPU_BOOT_ERR0_NIC_FW_FAIL)
|
|
|
|
|
dev_err(hdev->dev,
|
|
|
|
|
"Device boot error - NIC F/W initialization failed\n");
|
|
|
|
|
if (err_val & CPU_BOOT_ERR0_SECURITY_NOT_RDY)
|
|
|
|
|
dev_warn(hdev->dev,
|
|
|
|
|
"Device boot warning - security not ready\n");
|
|
|
|
|
if (err_val & CPU_BOOT_ERR0_SECURITY_FAIL)
|
|
|
|
|
dev_err(hdev->dev, "Device boot error - security failure\n");
|
|
|
|
|
if (err_val & CPU_BOOT_ERR0_EFUSE_FAIL)
|
|
|
|
|
dev_err(hdev->dev, "Device boot error - eFuse failure\n");
|
|
|
|
|
|
|
|
|
|
security_val = RREG32(cpu_security_boot_status_reg);
|
|
|
|
|
if (security_val & CPU_BOOT_DEV_STS0_ENABLED)
|
|
|
|
|
dev_dbg(hdev->dev, "Device security status %#x\n",
|
|
|
|
|
security_val);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static void detect_cpu_boot_status(struct hl_device *hdev, u32 status)
|
|
|
|
|
{
|
|
|
|
|
/* Some of the status codes below are deprecated in newer f/w
|
|
|
|
|
@@ -833,6 +847,10 @@ int hl_fw_init_cpu(struct hl_device *hdev, u32 cpu_boot_status_reg,
|
|
|
|
|
goto out;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
rc = fw_read_errors(hdev, boot_err0_reg, cpu_security_boot_status_reg);
|
|
|
|
|
if (rc)
|
|
|
|
|
return rc;
|
|
|
|
|
|
|
|
|
|
/* Clear reset status since we need to read again from app */
|
|
|
|
|
prop->hard_reset_done_by_fw = false;
|
|
|
|
|
|
|
|
|
|
@@ -855,6 +873,8 @@ int hl_fw_init_cpu(struct hl_device *hdev, u32 cpu_boot_status_reg,
|
|
|
|
|
|
|
|
|
|
dev_info(hdev->dev, "Successfully loaded firmware to device\n");
|
|
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
|
|
out:
|
|
|
|
|
fw_read_errors(hdev, boot_err0_reg, cpu_security_boot_status_reg);
|
|
|
|
|
|
|
|
|
|
|