Commit fd90456c authored by Guchun Chen's avatar Guchun Chen Committed by Alex Deucher

drm/amdgpu: decouple EccErrCnt query and clear operation

Due to hardware bug that when RSMU UMC index is disabled,
clear EccErrCnt at the first UMC instance will clean up all other
EccErrCnt registes from other instances at the same time. This
will break the correctable error count log in EccErrCnt register
once querying it. So decouple both to make error count query workable.
Signed-off-by: default avatarGuchun Chen <guchun.chen@amd.com>
Reviewed-by: default avatarTao Zhou <tao.zhou1@amd.com>
Signed-off-by: default avatarAlex Deucher <alexander.deucher@amd.com>
parent 40e73314
...@@ -104,6 +104,81 @@ static inline uint32_t get_umc_6_reg_offset(struct amdgpu_device *adev, ...@@ -104,6 +104,81 @@ static inline uint32_t get_umc_6_reg_offset(struct amdgpu_device *adev,
return adev->umc.channel_offs*ch_inst + UMC_6_INST_DIST*umc_inst; return adev->umc.channel_offs*ch_inst + UMC_6_INST_DIST*umc_inst;
} }
static void umc_v6_1_clear_error_count_per_channel(struct amdgpu_device *adev,
uint32_t umc_reg_offset)
{
uint32_t ecc_err_cnt_addr;
uint32_t ecc_err_cnt_sel, ecc_err_cnt_sel_addr;
if (adev->asic_type == CHIP_ARCTURUS) {
/* UMC 6_1_2 registers */
ecc_err_cnt_sel_addr =
SOC15_REG_OFFSET(UMC, 0,
mmUMCCH0_0_EccErrCntSel_ARCT);
ecc_err_cnt_addr =
SOC15_REG_OFFSET(UMC, 0,
mmUMCCH0_0_EccErrCnt_ARCT);
} else {
/* UMC 6_1_1 registers */
ecc_err_cnt_sel_addr =
SOC15_REG_OFFSET(UMC, 0,
mmUMCCH0_0_EccErrCntSel);
ecc_err_cnt_addr =
SOC15_REG_OFFSET(UMC, 0,
mmUMCCH0_0_EccErrCnt);
}
/* select the lower chip */
ecc_err_cnt_sel = RREG32_PCIE((ecc_err_cnt_sel_addr +
umc_reg_offset) * 4);
ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel,
UMCCH0_0_EccErrCntSel,
EccErrCntCsSel, 0);
WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4,
ecc_err_cnt_sel);
/* clear lower chip error count */
WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4,
UMC_V6_1_CE_CNT_INIT);
/* select the higher chip */
ecc_err_cnt_sel = RREG32_PCIE((ecc_err_cnt_sel_addr +
umc_reg_offset) * 4);
ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel,
UMCCH0_0_EccErrCntSel,
EccErrCntCsSel, 1);
WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4,
ecc_err_cnt_sel);
/* clear higher chip error count */
WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4,
UMC_V6_1_CE_CNT_INIT);
}
static void umc_v6_1_clear_error_count(struct amdgpu_device *adev)
{
uint32_t umc_inst = 0;
uint32_t ch_inst = 0;
uint32_t umc_reg_offset = 0;
uint32_t rsmu_umc_index_state =
umc_v6_1_get_umc_index_mode_state(adev);
if (rsmu_umc_index_state)
umc_v6_1_disable_umc_index_mode(adev);
LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
umc_reg_offset = get_umc_6_reg_offset(adev,
umc_inst,
ch_inst);
umc_v6_1_clear_error_count_per_channel(adev,
umc_reg_offset);
}
if (rsmu_umc_index_state)
umc_v6_1_enable_umc_index_mode(adev);
}
static void umc_v6_1_query_correctable_error_count(struct amdgpu_device *adev, static void umc_v6_1_query_correctable_error_count(struct amdgpu_device *adev,
uint32_t umc_reg_offset, uint32_t umc_reg_offset,
unsigned long *error_count) unsigned long *error_count)
...@@ -136,23 +211,21 @@ static void umc_v6_1_query_correctable_error_count(struct amdgpu_device *adev, ...@@ -136,23 +211,21 @@ static void umc_v6_1_query_correctable_error_count(struct amdgpu_device *adev,
ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_EccErrCntSel, ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_EccErrCntSel,
EccErrCntCsSel, 0); EccErrCntCsSel, 0);
WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4, ecc_err_cnt_sel); WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4, ecc_err_cnt_sel);
ecc_err_cnt = RREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4); ecc_err_cnt = RREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4);
*error_count += *error_count +=
(REG_GET_FIELD(ecc_err_cnt, UMCCH0_0_EccErrCnt, EccErrCnt) - (REG_GET_FIELD(ecc_err_cnt, UMCCH0_0_EccErrCnt, EccErrCnt) -
UMC_V6_1_CE_CNT_INIT); UMC_V6_1_CE_CNT_INIT);
/* clear the lower chip err count */
WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4, UMC_V6_1_CE_CNT_INIT);
/* select the higher chip and check the err counter */ /* select the higher chip and check the err counter */
ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_EccErrCntSel, ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_EccErrCntSel,
EccErrCntCsSel, 1); EccErrCntCsSel, 1);
WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4, ecc_err_cnt_sel); WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4, ecc_err_cnt_sel);
ecc_err_cnt = RREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4); ecc_err_cnt = RREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4);
*error_count += *error_count +=
(REG_GET_FIELD(ecc_err_cnt, UMCCH0_0_EccErrCnt, EccErrCnt) - (REG_GET_FIELD(ecc_err_cnt, UMCCH0_0_EccErrCnt, EccErrCnt) -
UMC_V6_1_CE_CNT_INIT); UMC_V6_1_CE_CNT_INIT);
/* clear the higher chip err count */
WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4, UMC_V6_1_CE_CNT_INIT);
/* check for SRAM correctable error /* check for SRAM correctable error
MCUMC_STATUS is a 64 bit register */ MCUMC_STATUS is a 64 bit register */
...@@ -228,6 +301,8 @@ static void umc_v6_1_query_ras_error_count(struct amdgpu_device *adev, ...@@ -228,6 +301,8 @@ static void umc_v6_1_query_ras_error_count(struct amdgpu_device *adev,
if (rsmu_umc_index_state) if (rsmu_umc_index_state)
umc_v6_1_enable_umc_index_mode(adev); umc_v6_1_enable_umc_index_mode(adev);
umc_v6_1_clear_error_count(adev);
} }
static void umc_v6_1_query_error_address(struct amdgpu_device *adev, static void umc_v6_1_query_error_address(struct amdgpu_device *adev,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment