Commit 4cf781c2 authored by John Clements, committed by Alex Deucher

drm/amdgpu: Added RAS UMC error query support for Arcturus

Updated UMC 6.1 function set to support UMC 6.1.1 and 6.1.2 devices
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: John Clements <john.clements@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent a0250689
...@@ -708,11 +708,18 @@ static void gmc_v9_0_set_umc_funcs(struct amdgpu_device *adev) ...@@ -708,11 +708,18 @@ static void gmc_v9_0_set_umc_funcs(struct amdgpu_device *adev)
adev->umc.funcs = &umc_v6_0_funcs; adev->umc.funcs = &umc_v6_0_funcs;
break; break;
case CHIP_VEGA20: case CHIP_VEGA20:
adev->umc.max_ras_err_cnt_per_query = UMC_V6_1_TOTAL_CHANNEL_NUM;
adev->umc.channel_inst_num = UMC_V6_1_CHANNEL_INSTANCE_NUM;
adev->umc.umc_inst_num = UMC_V6_1_UMC_INSTANCE_NUM;
adev->umc.channel_offs = UMC_V6_1_PER_CHANNEL_OFFSET_VG20;
adev->umc.channel_idx_tbl = &umc_v6_1_channel_idx_tbl[0][0];
adev->umc.funcs = &umc_v6_1_funcs;
break;
case CHIP_ARCTURUS: case CHIP_ARCTURUS:
adev->umc.max_ras_err_cnt_per_query = UMC_V6_1_TOTAL_CHANNEL_NUM; adev->umc.max_ras_err_cnt_per_query = UMC_V6_1_TOTAL_CHANNEL_NUM;
adev->umc.channel_inst_num = UMC_V6_1_CHANNEL_INSTANCE_NUM; adev->umc.channel_inst_num = UMC_V6_1_CHANNEL_INSTANCE_NUM;
adev->umc.umc_inst_num = UMC_V6_1_UMC_INSTANCE_NUM; adev->umc.umc_inst_num = UMC_V6_1_UMC_INSTANCE_NUM;
adev->umc.channel_offs = UMC_V6_1_PER_CHANNEL_OFFSET; adev->umc.channel_offs = UMC_V6_1_PER_CHANNEL_OFFSET_ARCT;
adev->umc.channel_idx_tbl = &umc_v6_1_channel_idx_tbl[0][0]; adev->umc.channel_idx_tbl = &umc_v6_1_channel_idx_tbl[0][0];
adev->umc.funcs = &umc_v6_1_funcs; adev->umc.funcs = &umc_v6_1_funcs;
break; break;
......
...@@ -31,6 +31,14 @@ ...@@ -31,6 +31,14 @@
#define smnMCA_UMC0_MCUMC_ADDRT0 0x50f10 #define smnMCA_UMC0_MCUMC_ADDRT0 0x50f10
/* UMC 6_1_2 register offsets */
#define mmUMCCH0_0_EccErrCntSel_ARCT 0x0360
#define mmUMCCH0_0_EccErrCntSel_ARCT_BASE_IDX 1
#define mmUMCCH0_0_EccErrCnt_ARCT 0x0361
#define mmUMCCH0_0_EccErrCnt_ARCT_BASE_IDX 1
#define mmMCA_UMC_UMC0_MCUMC_STATUST0_ARCT 0x03c2
#define mmMCA_UMC_UMC0_MCUMC_STATUST0_ARCT_BASE_IDX 1
/* /*
* (addr / 256) * 8192, the higher 26 bits in ErrorAddr * (addr / 256) * 8192, the higher 26 bits in ErrorAddr
* is the index of 8KB block * is the index of 8KB block
...@@ -95,12 +103,25 @@ static void umc_v6_1_query_correctable_error_count(struct amdgpu_device *adev, ...@@ -95,12 +103,25 @@ static void umc_v6_1_query_correctable_error_count(struct amdgpu_device *adev,
uint64_t mc_umc_status; uint64_t mc_umc_status;
uint32_t mc_umc_status_addr; uint32_t mc_umc_status_addr;
ecc_err_cnt_sel_addr = if (adev->asic_type == CHIP_ARCTURUS) {
SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCntSel); /* UMC 6_1_2 registers */
ecc_err_cnt_addr =
SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCnt); ecc_err_cnt_sel_addr =
mc_umc_status_addr = SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCntSel_ARCT);
SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0); ecc_err_cnt_addr =
SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCnt_ARCT);
mc_umc_status_addr =
SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0_ARCT);
} else {
/* UMC 6_1_1 registers */
ecc_err_cnt_sel_addr =
SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCntSel);
ecc_err_cnt_addr =
SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCnt);
mc_umc_status_addr =
SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0);
}
/* select the lower chip and check the error count */ /* select the lower chip and check the error count */
ecc_err_cnt_sel = RREG32(ecc_err_cnt_sel_addr + umc_reg_offset); ecc_err_cnt_sel = RREG32(ecc_err_cnt_sel_addr + umc_reg_offset);
...@@ -141,8 +162,17 @@ static void umc_v6_1_querry_uncorrectable_error_count(struct amdgpu_device *adev ...@@ -141,8 +162,17 @@ static void umc_v6_1_querry_uncorrectable_error_count(struct amdgpu_device *adev
uint64_t mc_umc_status; uint64_t mc_umc_status;
uint32_t mc_umc_status_addr; uint32_t mc_umc_status_addr;
mc_umc_status_addr = if (adev->asic_type == CHIP_ARCTURUS) {
SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0); /* UMC 6_1_2 registers */
mc_umc_status_addr =
SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0_ARCT);
} else {
/* UMC 6_1_1 registers */
mc_umc_status_addr =
SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0);
}
/* check the MCUMC_STATUS */ /* check the MCUMC_STATUS */
mc_umc_status = RREG64_UMC(mc_umc_status_addr + umc_reg_offset); mc_umc_status = RREG64_UMC(mc_umc_status_addr + umc_reg_offset);
...@@ -179,8 +209,17 @@ static void umc_v6_1_query_error_address(struct amdgpu_device *adev, ...@@ -179,8 +209,17 @@ static void umc_v6_1_query_error_address(struct amdgpu_device *adev,
uint64_t mc_umc_status, err_addr, retired_page; uint64_t mc_umc_status, err_addr, retired_page;
struct eeprom_table_record *err_rec; struct eeprom_table_record *err_rec;
mc_umc_status_addr = if (adev->asic_type == CHIP_ARCTURUS) {
SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0); /* UMC 6_1_2 registers */
mc_umc_status_addr =
SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0_ARCT);
} else {
/* UMC 6_1_1 registers */
mc_umc_status_addr =
SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0);
}
/* skip error address process if -ENOMEM */ /* skip error address process if -ENOMEM */
if (!err_data->err_addr) { if (!err_data->err_addr) {
...@@ -241,10 +280,21 @@ static void umc_v6_1_err_cnt_init_per_channel(struct amdgpu_device *adev, ...@@ -241,10 +280,21 @@ static void umc_v6_1_err_cnt_init_per_channel(struct amdgpu_device *adev,
uint32_t ecc_err_cnt_sel, ecc_err_cnt_sel_addr; uint32_t ecc_err_cnt_sel, ecc_err_cnt_sel_addr;
uint32_t ecc_err_cnt_addr; uint32_t ecc_err_cnt_addr;
ecc_err_cnt_sel_addr = if (adev->asic_type == CHIP_ARCTURUS) {
SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCntSel); /* UMC 6_1_2 registers */
ecc_err_cnt_addr =
SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCnt); ecc_err_cnt_sel_addr =
SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCntSel_ARCT);
ecc_err_cnt_addr =
SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCnt_ARCT);
} else {
/* UMC 6_1_1 registers */
ecc_err_cnt_sel_addr =
SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCntSel);
ecc_err_cnt_addr =
SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCnt);
}
/* select the lower chip and check the error count */ /* select the lower chip and check the error count */
ecc_err_cnt_sel = RREG32(ecc_err_cnt_sel_addr + umc_reg_offset); ecc_err_cnt_sel = RREG32(ecc_err_cnt_sel_addr + umc_reg_offset);
......
...@@ -35,7 +35,8 @@ ...@@ -35,7 +35,8 @@
/* total channel instances in one umc block */ /* total channel instances in one umc block */
#define UMC_V6_1_TOTAL_CHANNEL_NUM (UMC_V6_1_CHANNEL_INSTANCE_NUM * UMC_V6_1_UMC_INSTANCE_NUM) #define UMC_V6_1_TOTAL_CHANNEL_NUM (UMC_V6_1_CHANNEL_INSTANCE_NUM * UMC_V6_1_UMC_INSTANCE_NUM)
/* UMC register per channel offset */ /* UMC register per channel offset */
#define UMC_V6_1_PER_CHANNEL_OFFSET 0x800 #define UMC_V6_1_PER_CHANNEL_OFFSET_VG20 0x800
#define UMC_V6_1_PER_CHANNEL_OFFSET_ARCT 0x400
/* EccErrCnt max value */ /* EccErrCnt max value */
#define UMC_V6_1_CE_CNT_MAX 0xffff #define UMC_V6_1_CE_CNT_MAX 0xffff
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment