Commit 88f8575b authored by Dennis Li, committed by Alex Deucher

drm/amdgpu: enable watchdog feature for SQ of aldebaran

SQ's watchdog timer monitors forward progress; a mask of the waves that
caused the watchdog timeout is recorded into the RAS status registers,
and a system fatal error event is then triggered.

v2:
1. Rename *query_timeout_status to *query_sq_timeout_status.
2. Move query_sq_timeout_status into amdgpu_ras_do_recovery.
3. Add module parameters to enable/disable the fatal error event and to
modify the watchdog timer period.

v3:
1. Remove the unused parameters of *enable_watchdog_timer.

Signed-off-by: Dennis Li <Dennis.Li@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 4abc2567
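To make the reported wave mask concrete: each set bit in a CU's SQ_TIMEOUT_STATUS identifies one hung wave, and the driver decodes the bit index into a SIMD/wave pair. Below is a minimal user-space sketch of that decode (illustrative only, not driver code); the waves-per-SIMD count is an assumption for the demo, whereas the real driver reads adev->gfx.cu_info.max_waves_per_simd.

```c
#include <stdint.h>
#include <stdio.h>

/* Demo assumption; the driver uses adev->gfx.cu_info.max_waves_per_simd. */
#define MAX_WAVES_PER_SIMD 8u

static void decode_wave_mask(uint32_t status)
{
	for (uint32_t i = 0; i < 32; i++) {
		if (!((1u << i) & status))
			continue;
		/* Bit i maps to (SIMD, wave) exactly as in
		 * gfx_v9_4_2_log_cu_timeout_status() in the diff below. */
		printf("SIMD %u, wave %u hit the watchdog\n",
		       i / MAX_WAVES_PER_SIMD, i % MAX_WAVES_PER_SIMD);
	}
}

int main(void)
{
	decode_wave_mask(0x00010003); /* waves 0, 1 and 16 stuck */
	return 0;
}
```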
@@ -126,6 +126,12 @@ struct amdgpu_mgpu_info
	uint32_t num_apu;
};

struct amdgpu_watchdog_timer
{
	bool timeout_fatal_disable;
	uint32_t period; /* maxCycles = (1 << period), the number of cycles before a timeout */
};

#define AMDGPU_MAX_TIMEOUT_PARAM_LENGTH 256

/*
@@ -187,6 +193,7 @@ extern struct amdgpu_mgpu_info mgpu_info;
extern int amdgpu_ras_enable;
extern uint amdgpu_ras_mask;
extern int amdgpu_bad_page_threshold;
extern struct amdgpu_watchdog_timer amdgpu_watchdog_timer;
extern int amdgpu_async_gfx_ring;
extern int amdgpu_mcbp;
extern int amdgpu_discovery;
...
@@ -175,6 +175,10 @@ struct amdgpu_mgpu_info mgpu_info = {
int amdgpu_ras_enable = -1;
uint amdgpu_ras_mask = 0xffffffff;
int amdgpu_bad_page_threshold = 100;
struct amdgpu_watchdog_timer amdgpu_watchdog_timer = {
	.timeout_fatal_disable = false,
	.period = 0x3f, /* about 8s */
};

/**
 * DOC: vramlimit (int)
@@ -530,6 +534,20 @@ module_param_named(ras_enable, amdgpu_ras_enable, int, 0444);
MODULE_PARM_DESC(ras_mask, "Mask of RAS features to enable (default 0xffffffff), only valid when ras_enable == 1");
module_param_named(ras_mask, amdgpu_ras_mask, uint, 0444);

/**
* DOC: timeout_fatal_disable (bool)
* Disable Watchdog timeout fatal error event
*/
MODULE_PARM_DESC(timeout_fatal_disable, "disable watchdog timeout fatal error (false = default)");
module_param_named(timeout_fatal_disable, amdgpu_watchdog_timer.timeout_fatal_disable, bool, 0644);
/**
* DOC: timeout_period (uint)
* Modify the watchdog timeout max_cycles as (1 << period)
*/
MODULE_PARM_DESC(timeout_period, "watchdog timeout period (0x3f = default), timeout maxCycles = (1 << period)");
module_param_named(timeout_period, amdgpu_watchdog_timer.period, uint, 0644);
/**
 * DOC: si_support (int)
 * Set SI support driver. This parameter works after set config CONFIG_DRM_AMDGPU_SI. For SI asic, when radeon driver is enabled,
...
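Since both parameters are registered with 0644 permissions, root can also change them at runtime through /sys/module/amdgpu/parameters/, though the SQ watchdog is only programmed while gfx hw_init runs; setting amdgpu.timeout_period=... on the kernel command line takes effect before that. As a stand-alone sketch of the module_param_named() pattern used here (module and variable names are placeholders, not amdgpu's):

```c
#include <linux/module.h>

/* Placeholder mirror of struct amdgpu_watchdog_timer. */
struct demo_watchdog_timer {
	bool timeout_fatal_disable;
	uint32_t period;
};

static struct demo_watchdog_timer demo_wdt = {
	.timeout_fatal_disable = false,
	.period = 0x10, /* arbitrary demo default */
};

/* module_param_named() binds a variable (here a struct member) to
 * /sys/module/<module>/parameters/<name>; 0644 lets root write it. */
MODULE_PARM_DESC(timeout_fatal_disable, "disable watchdog fatal error (false = default)");
module_param_named(timeout_fatal_disable, demo_wdt.timeout_fatal_disable, bool, 0644);

MODULE_PARM_DESC(timeout_period, "watchdog timeout period");
module_param_named(timeout_period, demo_wdt.period, uint, 0644);

MODULE_LICENSE("GPL");
```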
@@ -226,6 +226,8 @@ struct amdgpu_gfx_funcs {
	void (*init_spm_golden)(struct amdgpu_device *adev);
	void (*query_ras_error_status) (struct amdgpu_device *adev);
	void (*update_perfmon_mgcg)(struct amdgpu_device *adev, bool enable);
	void (*enable_watchdog_timer)(struct amdgpu_device *adev);
	void (*query_sq_timeout_status)(struct amdgpu_device *adev);
};

struct sq_work {
...
@@ -1467,6 +1467,9 @@ static void amdgpu_ras_error_status_query(struct amdgpu_device *adev,
	case AMDGPU_RAS_BLOCK__GFX:
		if (adev->gfx.funcs->query_ras_error_status)
			adev->gfx.funcs->query_ras_error_status(adev);
		if (adev->gfx.funcs->query_sq_timeout_status)
			adev->gfx.funcs->query_sq_timeout_status(adev);
		break;
	case AMDGPU_RAS_BLOCK__MMHUB:
		if (adev->mmhub.funcs->query_ras_error_status)
...
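Both new call sites follow the driver's usual optional-callback convention: the pointer is populated only for ASICs that implement the feature, and every caller NULL-checks it first. A trivial self-contained sketch of the pattern (all names invented for the demo):

```c
#include <stdio.h>

struct demo_gfx_funcs {
	void (*query_sq_timeout_status)(void *ctx); /* NULL when unsupported */
};

static void aldebaran_query(void *ctx)
{
	(void)ctx;
	printf("querying SQ timeout status\n");
}

static void demo_ras_query(const struct demo_gfx_funcs *funcs, void *ctx)
{
	if (funcs->query_sq_timeout_status) /* silently skip if unsupported */
		funcs->query_sq_timeout_status(ctx);
}

int main(void)
{
	struct demo_gfx_funcs aldebaran = { .query_sq_timeout_status = aldebaran_query };
	struct demo_gfx_funcs legacy = { 0 };

	demo_ras_query(&aldebaran, NULL); /* prints */
	demo_ras_query(&legacy, NULL);    /* no-op */
	return 0;
}
```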
@@ -2124,6 +2124,8 @@ static const struct amdgpu_gfx_funcs gfx_v9_4_2_gfx_funcs = {
	.query_ras_error_count = &gfx_v9_4_2_query_ras_error_count,
	.reset_ras_error_count = &gfx_v9_4_2_reset_ras_error_count,
	.query_ras_error_status = &gfx_v9_4_2_query_ras_error_status,
	.enable_watchdog_timer = &gfx_v9_4_2_enable_watchdog_timer,
	.query_sq_timeout_status = &gfx_v9_4_2_query_sq_timeout_status,
};

static int gfx_v9_0_gpu_early_init(struct amdgpu_device *adev)
@@ -3968,6 +3970,9 @@ static int gfx_v9_0_hw_init(void *handle)
	if (adev->asic_type == CHIP_ALDEBARAN)
		gfx_v9_4_2_set_power_brake_sequence(adev);

	if (adev->gfx.funcs->enable_watchdog_timer)
		adev->gfx.funcs->enable_watchdog_timer(adev);

	return r;
}
...
@@ -1129,3 +1129,109 @@ void gfx_v9_4_2_query_ras_error_status(struct amdgpu_device *adev)
	gfx_v9_4_2_query_ea_err_status(adev);
	gfx_v9_4_2_query_utc_err_status(adev);
}

void gfx_v9_4_2_enable_watchdog_timer(struct amdgpu_device *adev)
{
	uint32_t i;
	uint32_t data;

	data = REG_SET_FIELD(0, SQ_TIMEOUT_CONFIG, TIMEOUT_FATAL_DISABLE,
			     amdgpu_watchdog_timer.timeout_fatal_disable ? 1 : 0);
	data = REG_SET_FIELD(data, SQ_TIMEOUT_CONFIG, PERIOD_SEL,
			     amdgpu_watchdog_timer.period);

	mutex_lock(&adev->grbm_idx_mutex);
	for (i = 0; i < adev->gfx.config.max_shader_engines; i++) {
		gfx_v9_4_2_select_se_sh(adev, i, 0xffffffff, 0xffffffff);
		WREG32_SOC15(GC, 0, regSQ_TIMEOUT_CONFIG, data);
	}
	gfx_v9_4_2_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff);
	mutex_unlock(&adev->grbm_idx_mutex);
}

static uint32_t wave_read_ind(struct amdgpu_device *adev, uint32_t simd,
			      uint32_t wave, uint32_t address)
{
	WREG32_SOC15_RLC_EX(reg, GC, 0, regSQ_IND_INDEX,
		(wave << SQ_IND_INDEX__WAVE_ID__SHIFT) |
		(simd << SQ_IND_INDEX__SIMD_ID__SHIFT) |
		(address << SQ_IND_INDEX__INDEX__SHIFT) |
		(SQ_IND_INDEX__FORCE_READ_MASK));
	return RREG32_SOC15(GC, 0, regSQ_IND_DATA);
}

static void gfx_v9_4_2_log_cu_timeout_status(struct amdgpu_device *adev,
					     uint32_t status)
{
	struct amdgpu_cu_info *cu_info = &adev->gfx.cu_info;
	uint32_t i, simd, wave;
	uint32_t wave_status;
	uint32_t wave_pc_lo, wave_pc_hi;
	uint32_t wave_exec_lo, wave_exec_hi;
	uint32_t wave_inst_dw0, wave_inst_dw1;
	uint32_t wave_ib_sts;

	for (i = 0; i < 32; i++) {
		if (!((1 << i) & status))
			continue;

		simd = i / cu_info->max_waves_per_simd;
		wave = i % cu_info->max_waves_per_simd;

		wave_status = wave_read_ind(adev, simd, wave, ixSQ_WAVE_STATUS);
		wave_pc_lo = wave_read_ind(adev, simd, wave, ixSQ_WAVE_PC_LO);
		wave_pc_hi = wave_read_ind(adev, simd, wave, ixSQ_WAVE_PC_HI);
		wave_exec_lo = wave_read_ind(adev, simd, wave, ixSQ_WAVE_EXEC_LO);
		wave_exec_hi = wave_read_ind(adev, simd, wave, ixSQ_WAVE_EXEC_HI);
		wave_inst_dw0 = wave_read_ind(adev, simd, wave, ixSQ_WAVE_INST_DW0);
		wave_inst_dw1 = wave_read_ind(adev, simd, wave, ixSQ_WAVE_INST_DW1);
		wave_ib_sts = wave_read_ind(adev, simd, wave, ixSQ_WAVE_IB_STS);

		dev_info(adev->dev,
			 "\t SIMD %d, Wave %d: status 0x%x, pc 0x%llx, exec 0x%llx, inst 0x%llx, ib_sts 0x%x\n",
			 simd, wave, wave_status,
			 ((uint64_t)wave_pc_hi << 32 | wave_pc_lo),
			 ((uint64_t)wave_exec_hi << 32 | wave_exec_lo),
			 ((uint64_t)wave_inst_dw1 << 32 | wave_inst_dw0),
			 wave_ib_sts);
	}
}

void gfx_v9_4_2_query_sq_timeout_status(struct amdgpu_device *adev)
{
	uint32_t se_idx, sh_idx, cu_idx;
	uint32_t status;

	mutex_lock(&adev->grbm_idx_mutex);
	for (se_idx = 0; se_idx < adev->gfx.config.max_shader_engines; se_idx++) {
		for (sh_idx = 0; sh_idx < adev->gfx.config.max_sh_per_se; sh_idx++) {
			for (cu_idx = 0; cu_idx < adev->gfx.config.max_cu_per_sh; cu_idx++) {
				gfx_v9_4_2_select_se_sh(adev, se_idx, sh_idx, cu_idx);
				status = RREG32_SOC15(GC, 0, regSQ_TIMEOUT_STATUS);
				if (status != 0) {
					dev_info(adev->dev,
						 "GFX Watchdog Timeout: SE %d, SH %d, CU %d\n",
						 se_idx, sh_idx, cu_idx);
					gfx_v9_4_2_log_cu_timeout_status(adev, status);
				}
				/* clear old status */
				WREG32_SOC15(GC, 0, regSQ_TIMEOUT_STATUS, 0);
			}
		}
	}
	gfx_v9_4_2_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff);
	mutex_unlock(&adev->grbm_idx_mutex);
}
\ No newline at end of file
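For readers unfamiliar with REG_SET_FIELD(): it clears a named field and ORs in a new value using the mask/shift constants from the generated register headers. A stand-alone sketch with an invented field layout (the real SQ_TIMEOUT_CONFIG encoding lives in those headers):

```c
#include <stdint.h>
#include <stdio.h>

/* Invented layout for the demo only. */
#define TIMEOUT_FATAL_DISABLE__SHIFT 0
#define TIMEOUT_FATAL_DISABLE__MASK  0x00000001u
#define PERIOD_SEL__SHIFT            1
#define PERIOD_SEL__MASK             0x0000007eu

/* Equivalent of REG_SET_FIELD: clear the field, then OR in the value. */
static uint32_t set_field(uint32_t reg, uint32_t mask, uint32_t shift,
			  uint32_t val)
{
	return (reg & ~mask) | ((val << shift) & mask);
}

int main(void)
{
	uint32_t data = 0;

	data = set_field(data, TIMEOUT_FATAL_DISABLE__MASK,
			 TIMEOUT_FATAL_DISABLE__SHIFT, 0);
	data = set_field(data, PERIOD_SEL__MASK, PERIOD_SEL__SHIFT, 0x3f);
	printf("SQ_TIMEOUT_CONFIG = 0x%08x\n", data); /* prints 0x0000007e */
	return 0;
}
```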
@@ -35,4 +35,7 @@ int gfx_v9_4_2_ras_error_inject(struct amdgpu_device *adev, void *inject_if);
void gfx_v9_4_2_query_ras_error_status(struct amdgpu_device *adev);
int gfx_v9_4_2_query_ras_error_count(struct amdgpu_device *adev,
				     void *ras_error_status);
void gfx_v9_4_2_enable_watchdog_timer(struct amdgpu_device *adev);
void gfx_v9_4_2_query_sq_timeout_status(struct amdgpu_device *adev);

#endif /* __GFX_V9_4_2_H__ */
@@ -100,6 +100,30 @@
	} \
} while (0)

#define WREG32_RLC_EX(prefix, reg, value) \
do { \
	if (amdgpu_sriov_fullaccess(adev)) { \
		uint32_t i = 0; \
		uint32_t retries = 50000; \
		uint32_t r0 = adev->reg_offset[GC_HWIP][0][prefix##SCRATCH_REG0_BASE_IDX] + prefix##SCRATCH_REG0; \
		uint32_t r1 = adev->reg_offset[GC_HWIP][0][prefix##SCRATCH_REG1_BASE_IDX] + prefix##SCRATCH_REG1; \
		uint32_t spare_int = adev->reg_offset[GC_HWIP][0][prefix##RLC_SPARE_INT_BASE_IDX] + prefix##RLC_SPARE_INT; \
		WREG32(r0, value); \
		WREG32(r1, (reg | 0x80000000)); \
		WREG32(spare_int, 0x1); \
		for (i = 0; i < retries; i++) { \
			u32 tmp = RREG32(r1); \
			if (!(tmp & 0x80000000)) \
				break; \
			udelay(10); \
		} \
		if (i >= retries) \
			pr_err("timeout: rlcg program reg:0x%05x failed !\n", reg); \
	} else { \
		WREG32(reg, value); \
	} \
} while (0)

#define WREG32_SOC15_RLC_SHADOW(ip, inst, reg, value) \
do { \
	uint32_t target_reg = adev->reg_offset[ip##_HWIP][inst][reg##_BASE_IDX] + reg;\
@@ -142,6 +166,12 @@
	WREG32_RLC(target_reg, value); \
} while (0)

#define WREG32_SOC15_RLC_EX(prefix, ip, inst, reg, value) \
do { \
	uint32_t target_reg = adev->reg_offset[GC_HWIP][0][reg##_BASE_IDX] + reg;\
	WREG32_RLC_EX(prefix, target_reg, value); \
} while (0)

#define WREG32_FIELD15_RLC(ip, idx, reg, field, val) \
	WREG32_RLC((adev->reg_offset[ip##_HWIP][idx][mm##reg##_BASE_IDX] + mm##reg), \
		   (RREG32(adev->reg_offset[ip##_HWIP][idx][mm##reg##_BASE_IDX] + mm##reg) \
...
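WREG32_RLC_EX above boils down to a post-and-poll handshake: the value and the target register offset go into two scratch registers, RLC_SPARE_INT kicks the RLC firmware, and the caller polls until the RLC clears bit 31 of the request word or the retry budget runs out. A stripped-down user-space sketch of that control flow with the hardware stubbed out (all names invented):

```c
#include <stdint.h>
#include <stdio.h>

/* Stubbed "hardware": the ack bit clears after a few polls. */
static uint32_t fake_r1;
static int polls_until_ack = 3;

static uint32_t fake_rreg(void)
{
	if (polls_until_ack-- <= 0)
		fake_r1 &= ~0x80000000u; /* RLC acked the request */
	return fake_r1;
}

static int rlc_write(uint32_t reg, uint32_t value)
{
	const uint32_t retries = 50000;
	uint32_t i;

	(void)value;                 /* would go into scratch reg 0 */
	fake_r1 = reg | 0x80000000u; /* post request with busy bit 31 set */

	for (i = 0; i < retries; i++) {
		if (!(fake_rreg() & 0x80000000u))
			return 0; /* acked */
		/* the real macro udelay(10)s here */
	}
	fprintf(stderr, "timeout: rlcg program reg:0x%05x failed\n", reg);
	return -1;
}

int main(void)
{
	return rlc_write(0x01234, 0xdeadbeef) ? 1 : 0;
}
```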