Commit a474161e authored by Hawking Zhang, committed by Alex Deucher

drm/amdgpu: Update programming for boot error reporting

AMDGPU_RAS_GPU_ERR_BOOT_STATUS field is no longer valid.
The polling sequence is also simplified according to
the latest firmware change.
Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 2bac0844
...@@ -4400,64 +4400,74 @@ int amdgpu_ras_error_statistic_de_count(struct ras_err_data *err_data, ...@@ -4400,64 +4400,74 @@ int amdgpu_ras_error_statistic_de_count(struct ras_err_data *err_data,
#define mmMP0_SMN_C2PMSG_92 0x1609C #define mmMP0_SMN_C2PMSG_92 0x1609C
#define mmMP0_SMN_C2PMSG_126 0x160BE #define mmMP0_SMN_C2PMSG_126 0x160BE
static void amdgpu_ras_boot_time_error_reporting(struct amdgpu_device *adev, static void amdgpu_ras_boot_time_error_reporting(struct amdgpu_device *adev,
u32 instance, u32 boot_error) u32 instance)
{ {
u32 socket_id, aid_id, hbm_id; u32 socket_id, aid_id, hbm_id;
u32 reg_data; u32 fw_status;
u32 boot_error;
u64 reg_addr; u64 reg_addr;
socket_id = AMDGPU_RAS_GPU_ERR_SOCKET_ID(boot_error);
aid_id = AMDGPU_RAS_GPU_ERR_AID_ID(boot_error);
hbm_id = AMDGPU_RAS_GPU_ERR_HBM_ID(boot_error);
/* The pattern for smn addressing in other SOC could be different from /* The pattern for smn addressing in other SOC could be different from
* the one for aqua_vanjaram. We should revisit the code if the pattern * the one for aqua_vanjaram. We should revisit the code if the pattern
* is changed. In such case, replace the aqua_vanjaram implementation * is changed. In such case, replace the aqua_vanjaram implementation
* with more common helper */ * with more common helper */
reg_addr = (mmMP0_SMN_C2PMSG_92 << 2) + reg_addr = (mmMP0_SMN_C2PMSG_92 << 2) +
aqua_vanjaram_encode_ext_smn_addressing(instance); aqua_vanjaram_encode_ext_smn_addressing(instance);
fw_status = amdgpu_device_indirect_rreg_ext(adev, reg_addr);
reg_addr = (mmMP0_SMN_C2PMSG_126 << 2) +
aqua_vanjaram_encode_ext_smn_addressing(instance);
boot_error = amdgpu_device_indirect_rreg_ext(adev, reg_addr);
reg_data = amdgpu_device_indirect_rreg_ext(adev, reg_addr); socket_id = AMDGPU_RAS_GPU_ERR_SOCKET_ID(boot_error);
dev_err(adev->dev, "socket: %d, aid: %d, firmware boot failed, fw status is 0x%x\n", aid_id = AMDGPU_RAS_GPU_ERR_AID_ID(boot_error);
socket_id, aid_id, reg_data); hbm_id = AMDGPU_RAS_GPU_ERR_HBM_ID(boot_error);
if (AMDGPU_RAS_GPU_ERR_MEM_TRAINING(boot_error)) if (AMDGPU_RAS_GPU_ERR_MEM_TRAINING(boot_error))
dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, memory training failed\n", dev_info(adev->dev,
socket_id, aid_id, hbm_id); "socket: %d, aid: %d, hbm: %d, fw_status: 0x%x, memory training failed\n",
socket_id, aid_id, hbm_id, fw_status);
if (AMDGPU_RAS_GPU_ERR_FW_LOAD(boot_error)) if (AMDGPU_RAS_GPU_ERR_FW_LOAD(boot_error))
dev_info(adev->dev, "socket: %d, aid: %d, firmware load failed at boot time\n", dev_info(adev->dev,
socket_id, aid_id); "socket: %d, aid: %d, fw_status: 0x%x, firmware load failed at boot time\n",
socket_id, aid_id, fw_status);
if (AMDGPU_RAS_GPU_ERR_WAFL_LINK_TRAINING(boot_error)) if (AMDGPU_RAS_GPU_ERR_WAFL_LINK_TRAINING(boot_error))
dev_info(adev->dev, "socket: %d, aid: %d, wafl link training failed\n", dev_info(adev->dev,
socket_id, aid_id); "socket: %d, aid: %d, fw_status: 0x%x, wafl link training failed\n",
socket_id, aid_id, fw_status);
if (AMDGPU_RAS_GPU_ERR_XGMI_LINK_TRAINING(boot_error)) if (AMDGPU_RAS_GPU_ERR_XGMI_LINK_TRAINING(boot_error))
dev_info(adev->dev, "socket: %d, aid: %d, xgmi link training failed\n", dev_info(adev->dev,
socket_id, aid_id); "socket: %d, aid: %d, fw_status: 0x%x, xgmi link training failed\n",
socket_id, aid_id, fw_status);
if (AMDGPU_RAS_GPU_ERR_USR_CP_LINK_TRAINING(boot_error)) if (AMDGPU_RAS_GPU_ERR_USR_CP_LINK_TRAINING(boot_error))
dev_info(adev->dev, "socket: %d, aid: %d, usr cp link training failed\n", dev_info(adev->dev,
socket_id, aid_id); "socket: %d, aid: %d, fw_status: 0x%x, usr cp link training failed\n",
socket_id, aid_id, fw_status);
if (AMDGPU_RAS_GPU_ERR_USR_DP_LINK_TRAINING(boot_error)) if (AMDGPU_RAS_GPU_ERR_USR_DP_LINK_TRAINING(boot_error))
dev_info(adev->dev, "socket: %d, aid: %d, usr dp link training failed\n", dev_info(adev->dev,
socket_id, aid_id); "socket: %d, aid: %d, fw_status: 0x%x, usr dp link training failed\n",
socket_id, aid_id, fw_status);
if (AMDGPU_RAS_GPU_ERR_HBM_MEM_TEST(boot_error)) if (AMDGPU_RAS_GPU_ERR_HBM_MEM_TEST(boot_error))
dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, hbm memory test failed\n", dev_info(adev->dev,
socket_id, aid_id, hbm_id); "socket: %d, aid: %d, hbm: %d, fw_status: 0x%x, hbm memory test failed\n",
socket_id, aid_id, hbm_id, fw_status);
if (AMDGPU_RAS_GPU_ERR_HBM_BIST_TEST(boot_error)) if (AMDGPU_RAS_GPU_ERR_HBM_BIST_TEST(boot_error))
dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, hbm bist test failed\n", dev_info(adev->dev,
socket_id, aid_id, hbm_id); "socket: %d, aid: %d, hbm: %d, fw_status: 0x%x, hbm bist test failed\n",
socket_id, aid_id, hbm_id, fw_status);
} }
static int amdgpu_ras_wait_for_boot_complete(struct amdgpu_device *adev, static bool amdgpu_ras_boot_error_detected(struct amdgpu_device *adev,
u32 instance, u32 *boot_error) u32 instance)
{ {
u32 reg_addr; u64 reg_addr;
u32 reg_data; u32 reg_data;
int retry_loop; int retry_loop;
...@@ -4466,41 +4476,22 @@ static int amdgpu_ras_wait_for_boot_complete(struct amdgpu_device *adev, ...@@ -4466,41 +4476,22 @@ static int amdgpu_ras_wait_for_boot_complete(struct amdgpu_device *adev,
for (retry_loop = 0; retry_loop < AMDGPU_RAS_BOOT_STATUS_POLLING_LIMIT; retry_loop++) { for (retry_loop = 0; retry_loop < AMDGPU_RAS_BOOT_STATUS_POLLING_LIMIT; retry_loop++) {
reg_data = amdgpu_device_indirect_rreg_ext(adev, reg_addr); reg_data = amdgpu_device_indirect_rreg_ext(adev, reg_addr);
if ((reg_data & AMDGPU_RAS_BOOT_STATUS_MASK) == AMDGPU_RAS_BOOT_STEADY_STATUS) { if ((reg_data & AMDGPU_RAS_BOOT_STATUS_MASK) == AMDGPU_RAS_BOOT_STEADY_STATUS)
*boot_error = AMDGPU_RAS_BOOT_SUCEESS; return false;
return 0; else
} msleep(1);
msleep(1);
}
/* The pattern for smn addressing in other SOC could be different from
* the one for aqua_vanjaram. We should revisit the code if the pattern
* is changed. In such case, replace the aqua_vanjaram implementation
* with more common helper */
reg_addr = (mmMP0_SMN_C2PMSG_126 << 2) +
aqua_vanjaram_encode_ext_smn_addressing(instance);
for (retry_loop = 0; retry_loop < AMDGPU_RAS_BOOT_STATUS_POLLING_LIMIT; retry_loop++) {
reg_data = amdgpu_device_indirect_rreg_ext(adev, reg_addr);
if (AMDGPU_RAS_GPU_ERR_BOOT_STATUS(reg_data)) {
*boot_error = reg_data;
return 0;
}
msleep(1);
} }
*boot_error = reg_data; return true;
return -ETIME;
} }
void amdgpu_ras_query_boot_status(struct amdgpu_device *adev, u32 num_instances) void amdgpu_ras_query_boot_status(struct amdgpu_device *adev, u32 num_instances)
{ {
u32 boot_error = 0;
u32 i; u32 i;
for (i = 0; i < num_instances; i++) { for (i = 0; i < num_instances; i++) {
if (amdgpu_ras_wait_for_boot_complete(adev, i, &boot_error)) if (amdgpu_ras_boot_error_detected(adev, i))
amdgpu_ras_boot_time_error_reporting(adev, i, boot_error); amdgpu_ras_boot_time_error_reporting(adev, i);
} }
} }
......
...@@ -47,12 +47,10 @@ struct amdgpu_iv_entry; ...@@ -47,12 +47,10 @@ struct amdgpu_iv_entry;
#define AMDGPU_RAS_GPU_ERR_SOCKET_ID(x) AMDGPU_GET_REG_FIELD(x, 10, 8) #define AMDGPU_RAS_GPU_ERR_SOCKET_ID(x) AMDGPU_GET_REG_FIELD(x, 10, 8)
#define AMDGPU_RAS_GPU_ERR_AID_ID(x) AMDGPU_GET_REG_FIELD(x, 12, 11) #define AMDGPU_RAS_GPU_ERR_AID_ID(x) AMDGPU_GET_REG_FIELD(x, 12, 11)
#define AMDGPU_RAS_GPU_ERR_HBM_ID(x) AMDGPU_GET_REG_FIELD(x, 14, 13) #define AMDGPU_RAS_GPU_ERR_HBM_ID(x) AMDGPU_GET_REG_FIELD(x, 14, 13)
#define AMDGPU_RAS_GPU_ERR_BOOT_STATUS(x) AMDGPU_GET_REG_FIELD(x, 31, 31)
#define AMDGPU_RAS_BOOT_STATUS_POLLING_LIMIT 1000 #define AMDGPU_RAS_BOOT_STATUS_POLLING_LIMIT 100
#define AMDGPU_RAS_BOOT_STEADY_STATUS 0xBA #define AMDGPU_RAS_BOOT_STEADY_STATUS 0xBA
#define AMDGPU_RAS_BOOT_STATUS_MASK 0xFF #define AMDGPU_RAS_BOOT_STATUS_MASK 0xFF
#define AMDGPU_RAS_BOOT_SUCEESS 0x80000000
#define AMDGPU_RAS_FLAG_INIT_BY_VBIOS (0x1 << 0) #define AMDGPU_RAS_FLAG_INIT_BY_VBIOS (0x1 << 0)
/* position of instance value in sub_block_index of /* position of instance value in sub_block_index of
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment