Commit 9f91e983 authored by YiPeng Chai, committed by Alex Deucher

drm/amdgpu: MCA supports recording umc address information

MCA supports recording umc address information.

V2:
  Move err_addr variable from struct ras_err_node to
struct ras_err_info.
Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 731b2f6e
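For context, here is a minimal sketch (not part of the commit) of how the extended helpers are meant to be called after this change: a producer that has MCA bank registers for a UMC error fills a struct ras_err_addr and passes it to the statistic helpers, while callers with no address information pass NULL. The struct layout, register indices, and helper names are taken from the diff below; the wrapper function itself is hypothetical.

/* Hypothetical caller, for illustration only: record one UMC UE together
 * with its address registers, and one CE from a block without address info. */
static void example_log_errors(struct ras_err_data *err_data,
			       struct amdgpu_smuio_mcm_config_info *mcm_info,
			       struct mca_bank_entry *entry)
{
	struct ras_err_addr err_addr = {0};

	/* UMC path: capture STATUS/IPID/ADDR from the MCA bank registers */
	err_addr.err_status = entry->regs[MCA_REG_IDX_STATUS];
	err_addr.err_ipid = entry->regs[MCA_REG_IDX_IPID];
	err_addr.err_addr = entry->regs[MCA_REG_IDX_ADDR];
	amdgpu_ras_error_statistic_ue_count(err_data, mcm_info, &err_addr, 1ULL);

	/* Non-UMC path: no address information available, so pass NULL */
	amdgpu_ras_error_statistic_ce_count(err_data, mcm_info, NULL, 1ULL);
}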
@@ -218,6 +218,7 @@ static void amdgpu_mca_smu_mca_bank_dump(struct amdgpu_device *adev, int idx, st
 int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev, enum amdgpu_ras_block blk, enum amdgpu_mca_error_type type, struct ras_err_data *err_data)
 {
 	struct amdgpu_smuio_mcm_config_info mcm_info;
+	struct ras_err_addr err_addr = {0};
 	struct mca_bank_set mca_set;
 	struct mca_bank_node *node;
 	struct mca_bank_entry *entry;
@@ -246,10 +247,18 @@ int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev, enum amdgpu_ras_blo
 		mcm_info.socket_id = entry->info.socket_id;
 		mcm_info.die_id = entry->info.aid;
+		if (blk == AMDGPU_RAS_BLOCK__UMC) {
+			err_addr.err_status = entry->regs[MCA_REG_IDX_STATUS];
+			err_addr.err_ipid = entry->regs[MCA_REG_IDX_IPID];
+			err_addr.err_addr = entry->regs[MCA_REG_IDX_ADDR];
+		}
 		if (type == AMDGPU_MCA_ERROR_TYPE_UE)
-			amdgpu_ras_error_statistic_ue_count(err_data, &mcm_info, (uint64_t)count);
+			amdgpu_ras_error_statistic_ue_count(err_data,
+				&mcm_info, &err_addr, (uint64_t)count);
 		else
-			amdgpu_ras_error_statistic_ce_count(err_data, &mcm_info, (uint64_t)count);
+			amdgpu_ras_error_statistic_ce_count(err_data,
+				&mcm_info, &err_addr, (uint64_t)count);
 	}
 out_mca_release:
......
@@ -1156,8 +1156,10 @@ static void amdgpu_rasmgr_error_data_statistic_update(struct ras_manager *obj, s
 		for_each_ras_error(err_node, err_data) {
 			err_info = &err_node->err_info;
-			amdgpu_ras_error_statistic_ce_count(&obj->err_data, &err_info->mcm_info, err_info->ce_count);
-			amdgpu_ras_error_statistic_ue_count(&obj->err_data, &err_info->mcm_info, err_info->ue_count);
+			amdgpu_ras_error_statistic_ce_count(&obj->err_data,
+					&err_info->mcm_info, NULL, err_info->ce_count);
+			amdgpu_ras_error_statistic_ue_count(&obj->err_data,
+					&err_info->mcm_info, NULL, err_info->ue_count);
 		}
 	} else {
 		/* for legacy asic path which doesn't has error source info */
@@ -3691,7 +3693,8 @@ static int ras_err_info_cmp(void *priv, const struct list_head *a, const struct
 }
 static struct ras_err_info *amdgpu_ras_error_get_info(struct ras_err_data *err_data,
-				struct amdgpu_smuio_mcm_config_info *mcm_info)
+				struct amdgpu_smuio_mcm_config_info *mcm_info,
+				struct ras_err_addr *err_addr)
 {
 	struct ras_err_node *err_node;
@@ -3705,6 +3708,9 @@ static struct ras_err_info *amdgpu_ras_error_get_info(struct ras_err_data *err_d
 	memcpy(&err_node->err_info.mcm_info, mcm_info, sizeof(*mcm_info));
+	if (err_addr)
+		memcpy(&err_node->err_info.err_addr, err_addr, sizeof(*err_addr));
 	err_data->err_list_count++;
 	list_add_tail(&err_node->node, &err_data->err_node_list);
 	list_sort(NULL, &err_data->err_node_list, ras_err_info_cmp);
@@ -3713,7 +3719,8 @@ static struct ras_err_info *amdgpu_ras_error_get_info(struct ras_err_data *err_d
 }
 int amdgpu_ras_error_statistic_ue_count(struct ras_err_data *err_data,
-		struct amdgpu_smuio_mcm_config_info *mcm_info, u64 count)
+		struct amdgpu_smuio_mcm_config_info *mcm_info,
+		struct ras_err_addr *err_addr, u64 count)
 {
 	struct ras_err_info *err_info;
@@ -3723,7 +3730,7 @@ int amdgpu_ras_error_statistic_ue_count(struct ras_err_data *err_data,
 	if (!count)
 		return 0;
-	err_info = amdgpu_ras_error_get_info(err_data, mcm_info);
+	err_info = amdgpu_ras_error_get_info(err_data, mcm_info, err_addr);
 	if (!err_info)
 		return -EINVAL;
@@ -3734,7 +3741,8 @@ int amdgpu_ras_error_statistic_ue_count(struct ras_err_data *err_data,
 }
 int amdgpu_ras_error_statistic_ce_count(struct ras_err_data *err_data,
-		struct amdgpu_smuio_mcm_config_info *mcm_info, u64 count)
+		struct amdgpu_smuio_mcm_config_info *mcm_info,
+		struct ras_err_addr *err_addr, u64 count)
 {
 	struct ras_err_info *err_info;
@@ -3744,7 +3752,7 @@ int amdgpu_ras_error_statistic_ce_count(struct ras_err_data *err_data,
 	if (!count)
 		return 0;
-	err_info = amdgpu_ras_error_get_info(err_data, mcm_info);
+	err_info = amdgpu_ras_error_get_info(err_data, mcm_info, err_addr);
 	if (!err_info)
 		return -EINVAL;
......
@@ -452,10 +452,17 @@ struct ras_fs_data {
 	char debugfs_name[32];
 };
+struct ras_err_addr {
+	uint64_t err_status;
+	uint64_t err_ipid;
+	uint64_t err_addr;
+};
 struct ras_err_info {
 	struct amdgpu_smuio_mcm_config_info mcm_info;
 	u64 ce_count;
 	u64 ue_count;
+	struct ras_err_addr err_addr;
 };
 struct ras_err_node {
@@ -806,8 +813,10 @@ void amdgpu_ras_inst_reset_ras_error_count(struct amdgpu_device *adev,
 int amdgpu_ras_error_data_init(struct ras_err_data *err_data);
 void amdgpu_ras_error_data_fini(struct ras_err_data *err_data);
 int amdgpu_ras_error_statistic_ce_count(struct ras_err_data *err_data,
-		struct amdgpu_smuio_mcm_config_info *mcm_info, u64 count);
+		struct amdgpu_smuio_mcm_config_info *mcm_info,
+		struct ras_err_addr *err_addr, u64 count);
 int amdgpu_ras_error_statistic_ue_count(struct ras_err_data *err_data,
-		struct amdgpu_smuio_mcm_config_info *mcm_info, u64 count);
+		struct amdgpu_smuio_mcm_config_info *mcm_info,
+		struct ras_err_addr *err_addr, u64 count);
 #endif
@@ -1313,10 +1313,10 @@ static void __xgmi_v6_4_0_query_error_count(struct amdgpu_device *adev, struct a
 	switch (xgmi_v6_4_0_pcs_mca_get_error_type(adev, status)) {
 	case AMDGPU_MCA_ERROR_TYPE_UE:
-		amdgpu_ras_error_statistic_ue_count(err_data, mcm_info, 1ULL);
+		amdgpu_ras_error_statistic_ue_count(err_data, mcm_info, NULL, 1ULL);
 		break;
 	case AMDGPU_MCA_ERROR_TYPE_CE:
-		amdgpu_ras_error_statistic_ce_count(err_data, mcm_info, 1ULL);
+		amdgpu_ras_error_statistic_ce_count(err_data, mcm_info, NULL, 1ULL);
 		break;
 	default:
 		break;
......
@@ -3828,8 +3828,8 @@ static void gfx_v9_4_3_inst_query_ras_err_count(struct amdgpu_device *adev,
 	/* the caller should make sure initialize value of
 	 * err_data->ue_count and err_data->ce_count
 	 */
-	amdgpu_ras_error_statistic_ue_count(err_data, &mcm_info, ue_count);
-	amdgpu_ras_error_statistic_ce_count(err_data, &mcm_info, ce_count);
+	amdgpu_ras_error_statistic_ue_count(err_data, &mcm_info, NULL, ue_count);
+	amdgpu_ras_error_statistic_ce_count(err_data, &mcm_info, NULL, ce_count);
 }
 static void gfx_v9_4_3_inst_reset_ras_err_count(struct amdgpu_device *adev,
......
@@ -652,8 +652,8 @@ static void mmhub_v1_8_inst_query_ras_error_count(struct amdgpu_device *adev,
 					AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE,
 					&ue_count);
-	amdgpu_ras_error_statistic_ce_count(err_data, &mcm_info, ce_count);
-	amdgpu_ras_error_statistic_ue_count(err_data, &mcm_info, ue_count);
+	amdgpu_ras_error_statistic_ce_count(err_data, &mcm_info, NULL, ce_count);
+	amdgpu_ras_error_statistic_ue_count(err_data, &mcm_info, NULL, ue_count);
 }
 static void mmhub_v1_8_query_ras_error_count(struct amdgpu_device *adev,
......
@@ -2156,7 +2156,7 @@ static void sdma_v4_4_2_inst_query_ras_error_count(struct amdgpu_device *adev,
 					AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE,
 					&ue_count);
-	amdgpu_ras_error_statistic_ue_count(err_data, &mcm_info, ue_count);
+	amdgpu_ras_error_statistic_ue_count(err_data, &mcm_info, NULL, ue_count);
 }
 static void sdma_v4_4_2_query_ras_error_count(struct amdgpu_device *adev,
......
@@ -166,8 +166,8 @@ static int umc_v12_0_query_error_count(struct amdgpu_device *adev,
 	umc_v12_0_query_correctable_error_count(adev, umc_reg_offset, &ce_count);
 	umc_v12_0_query_uncorrectable_error_count(adev, umc_reg_offset, &ue_count);
-	amdgpu_ras_error_statistic_ue_count(err_data, &mcm_info, ue_count);
-	amdgpu_ras_error_statistic_ce_count(err_data, &mcm_info, ce_count);
+	amdgpu_ras_error_statistic_ue_count(err_data, &mcm_info, NULL, ue_count);
+	amdgpu_ras_error_statistic_ce_count(err_data, &mcm_info, NULL, ce_count);
 	return 0;
 }
......
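Once recorded, the address registers travel with the per-MCM error statistics and can be read back from the new ras_err_info::err_addr member when walking the error list. A rough sketch of such a consumer, assuming a hypothetical dump helper (the iteration macro, struct members, and field names come from the diff above):

/* Hypothetical consumer, for illustration only: walk the aggregated error
 * list and print the UMC address registers recorded for each entry. */
static void example_dump_err_addr(struct ras_err_data *err_data)
{
	struct ras_err_node *err_node;
	struct ras_err_info *err_info;

	for_each_ras_error(err_node, err_data) {
		err_info = &err_node->err_info;
		/* socket/die come from mcm_info, the registers from err_addr */
		pr_info("socket %d die %d: status 0x%llx ipid 0x%llx addr 0x%llx\n",
			err_info->mcm_info.socket_id, err_info->mcm_info.die_id,
			err_info->err_addr.err_status, err_info->err_addr.err_ipid,
			err_info->err_addr.err_addr);
	}
}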