Commit 7e437167 authored by Tao Zhou, committed by Alex Deucher

drm/amdgpu: create amdgpu_ras_in_recovery to simplify code

Reduce redundant code; callers no longer need to pay attention to RAS
recovery details.
Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 5f7697bb
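
The refactor is mechanical: each call site below previously open-coded the same check twice over, once against the device's own ras->in_recovery flag and once against the XGMI hive's ras_recovery counter (which requires taking and dropping a hive reference). The new helper folds both into a single boolean query. Condensed from the hunks that follow (error-path details omitted), the shape of the change at a typical call site:

	/* before: repeated at every call site */
	hive = amdgpu_get_xgmi_hive(adev);
	if (hive) {
		hive_ras_recovery = atomic_read(&hive->ras_recovery);
		amdgpu_put_xgmi_hive(hive);
	}
	ras = amdgpu_ras_get_context(adev);
	if (ras && (atomic_read(&ras->in_recovery) || hive_ras_recovery))
		/* ... in recovery ... */

	/* after: one call, no local hive/ras state */
	if (amdgpu_ras_in_recovery(adev))
		/* ... in recovery ... */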
@@ -6276,20 +6276,11 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
 	struct amdgpu_reset_context reset_context;
 	u32 memsize;
 	struct list_head device_list;
-	struct amdgpu_hive_info *hive;
-	int hive_ras_recovery = 0;
-	struct amdgpu_ras *ras;
 
 	/* PCI error slot reset should be skipped During RAS recovery */
-	hive = amdgpu_get_xgmi_hive(adev);
-	if (hive) {
-		hive_ras_recovery = atomic_read(&hive->ras_recovery);
-		amdgpu_put_xgmi_hive(hive);
-	}
-	ras = amdgpu_ras_get_context(adev);
 	if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
 	     amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) &&
-	    ras && (atomic_read(&ras->in_recovery) || hive_ras_recovery))
+	    amdgpu_ras_in_recovery(adev))
 		return PCI_ERS_RESULT_RECOVERED;
 
 	DRM_INFO("PCI error: slot reset callback!!\n");
...
@@ -506,9 +506,6 @@ int amdgpu_gfx_disable_kcq(struct amdgpu_device *adev, int xcc_id)
 {
 	struct amdgpu_kiq *kiq = &adev->gfx.kiq[xcc_id];
 	struct amdgpu_ring *kiq_ring = &kiq->ring;
-	struct amdgpu_hive_info *hive;
-	struct amdgpu_ras *ras;
-	int hive_ras_recovery = 0;
 	int i, r = 0;
 	int j;
 
@@ -533,16 +530,9 @@ int amdgpu_gfx_disable_kcq(struct amdgpu_device *adev, int xcc_id)
 	 * This is workaround: only skip kiq_ring test
 	 * during ras recovery in suspend stage for gfx9.4.3
 	 */
-	hive = amdgpu_get_xgmi_hive(adev);
-	if (hive) {
-		hive_ras_recovery = atomic_read(&hive->ras_recovery);
-		amdgpu_put_xgmi_hive(hive);
-	}
-
-	ras = amdgpu_ras_get_context(adev);
 	if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
 	     amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) &&
-	    ras && (atomic_read(&ras->in_recovery) || hive_ras_recovery)) {
+	    amdgpu_ras_in_recovery(adev)) {
 		spin_unlock(&kiq->ring_lock);
 		return 0;
 	}
...
@@ -1409,11 +1409,8 @@ int amdgpu_ras_reset_error_count(struct amdgpu_device *adev,
 		enum amdgpu_ras_block block)
 {
 	struct amdgpu_ras_block_object *block_obj = amdgpu_ras_get_ras_block(adev, block, 0);
-	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
 	const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs;
 	const struct aca_smu_funcs *smu_funcs = adev->aca.smu_funcs;
-	struct amdgpu_hive_info *hive;
-	int hive_ras_recovery = 0;
 
 	if (!block_obj || !block_obj->hw_ops) {
 		dev_dbg_once(adev->dev, "%s doesn't config RAS function\n",
@@ -1425,15 +1422,8 @@ int amdgpu_ras_reset_error_count(struct amdgpu_device *adev,
 	    !amdgpu_ras_get_aca_debug_mode(adev))
 		return -EOPNOTSUPP;
 
-	hive = amdgpu_get_xgmi_hive(adev);
-	if (hive) {
-		hive_ras_recovery = atomic_read(&hive->ras_recovery);
-		amdgpu_put_xgmi_hive(hive);
-	}
-
 	/* skip ras error reset in gpu reset */
-	if ((amdgpu_in_reset(adev) || atomic_read(&ras->in_recovery) ||
-	     hive_ras_recovery) &&
+	if ((amdgpu_in_reset(adev) || amdgpu_ras_in_recovery(adev)) &&
 	    ((smu_funcs && smu_funcs->set_debug_mode) ||
 	     (mca_funcs && mca_funcs->mca_set_debug_mode)))
 		return -EOPNOTSUPP;
@@ -2461,6 +2451,23 @@ static void amdgpu_ras_set_fed_all(struct amdgpu_device *adev,
 	}
 }
 
+bool amdgpu_ras_in_recovery(struct amdgpu_device *adev)
+{
+	struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
+	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+	int hive_ras_recovery = 0;
+
+	if (hive) {
+		hive_ras_recovery = atomic_read(&hive->ras_recovery);
+		amdgpu_put_xgmi_hive(hive);
+	}
+
+	if (ras && (atomic_read(&ras->in_recovery) || hive_ras_recovery))
+		return true;
+
+	return false;
+}
+
 static void amdgpu_ras_do_recovery(struct work_struct *work)
 {
 	struct amdgpu_ras *ras =
@@ -2821,7 +2828,7 @@ static void amdgpu_ras_do_page_retirement(struct work_struct *work)
 	struct ras_err_data err_data;
 	unsigned long err_cnt;
 
-	if (amdgpu_in_reset(adev) || atomic_read(&con->in_recovery))
+	if (amdgpu_in_reset(adev) || amdgpu_ras_in_recovery(adev))
 		return;
 
 	amdgpu_ras_error_data_init(&err_data);
...
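Two properties of the helper added above explain the simplification at the call sites: it treats a NULL return from amdgpu_ras_get_context() as "not in recovery", so the `ras &&` guards drop out of the converted conditions, and it takes and releases the XGMI hive reference internally, so callers no longer pair amdgpu_get_xgmi_hive() with amdgpu_put_xgmi_hive() themselves.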
@@ -954,6 +954,8 @@ int amdgpu_ras_put_poison_req(struct amdgpu_device *adev,
 		enum amdgpu_ras_block block, uint16_t pasid,
 		pasid_notify pasid_fn, void *data, uint32_t reset);
 
+bool amdgpu_ras_in_recovery(struct amdgpu_device *adev);
+
 __printf(3, 4)
 void amdgpu_ras_event_log_print(struct amdgpu_device *adev, u64 event_id,
 				const char *fmt, ...);
...
@@ -1863,7 +1863,6 @@ static int aldebaran_mode1_reset(struct smu_context *smu)
 	u32 fatal_err, param;
 	int ret = 0;
 	struct amdgpu_device *adev = smu->adev;
-	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
 
 	fatal_err = 0;
 	param = SMU_RESET_MODE_1;
@@ -1876,8 +1875,8 @@ static int aldebaran_mode1_reset(struct smu_context *smu)
 	} else {
 		/* fatal error triggered by ras, PMFW supports the flag
 		   from 68.44.0 */
-		if ((smu->smc_fw_version >= 0x00442c00) && ras &&
-		    atomic_read(&ras->in_recovery))
+		if ((smu->smc_fw_version >= 0x00442c00) &&
+		    amdgpu_ras_in_recovery(adev))
 			fatal_err = 1;
 
 		param |= (fatal_err << 16);
...
@@ -2786,10 +2786,9 @@ static void smu_v13_0_0_set_mode1_reset_param(struct smu_context *smu,
 						uint32_t *param)
 {
 	struct amdgpu_device *adev = smu->adev;
-	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
 
 	if ((smu->smc_fw_version >= supported_version) &&
-	    ras && atomic_read(&ras->in_recovery))
+	    amdgpu_ras_in_recovery(adev))
 		/* Set RAS fatal error reset flag */
 		*param = 1 << 16;
 	else
...
@@ -2574,24 +2574,14 @@ static int smu_v13_0_6_get_thermal_temperature_range(struct smu_context *smu,
 static int smu_v13_0_6_mode1_reset(struct smu_context *smu)
 {
 	struct amdgpu_device *adev = smu->adev;
-	struct amdgpu_hive_info *hive = NULL;
-	u32 hive_ras_recovery = 0;
-	struct amdgpu_ras *ras;
 	u32 fatal_err, param;
 	int ret = 0;
 
-	hive = amdgpu_get_xgmi_hive(adev);
-	ras = amdgpu_ras_get_context(adev);
 	fatal_err = 0;
 	param = SMU_RESET_MODE_1;
 
-	if (hive) {
-		hive_ras_recovery = atomic_read(&hive->ras_recovery);
-		amdgpu_put_xgmi_hive(hive);
-	}
-
 	/* fatal error triggered by ras, PMFW supports the flag */
-	if (ras && (atomic_read(&ras->in_recovery) || hive_ras_recovery))
+	if (amdgpu_ras_in_recovery(adev))
 		fatal_err = 1;
 
 	param |= (fatal_err << 16);
...