Commit ae45a18b authored by Tao Zhou, committed by Alex Deucher

drm/amdgpu: add RAS poison handling for MCA

For MCA poison, if unmapping the queue fails, only a GPU reset should be
triggered, without page retirement handling; the MCA notifier will handle
page retirement.

v2: handle MCA poison consumption in umc_poison_handler directly.
Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 24b82292
...@@ -169,7 +169,9 @@ int amdgpu_umc_poison_handler(struct amdgpu_device *adev, ...@@ -169,7 +169,9 @@ int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
void *ras_error_status, void *ras_error_status,
bool reset) bool reset)
{ {
int ret; int ret = AMDGPU_RAS_SUCCESS;
if (!adev->gmc.xgmi.connected_to_cpu) {
struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status; struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
struct ras_common_if head = { struct ras_common_if head = {
.block = AMDGPU_RAS_BLOCK__UMC, .block = AMDGPU_RAS_BLOCK__UMC,
...@@ -183,6 +185,13 @@ int amdgpu_umc_poison_handler(struct amdgpu_device *adev, ...@@ -183,6 +185,13 @@ int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
obj->err_data.ue_count += err_data->ue_count; obj->err_data.ue_count += err_data->ue_count;
obj->err_data.ce_count += err_data->ce_count; obj->err_data.ce_count += err_data->ce_count;
} }
} else if (reset) {
/* MCA poison handler is only responsible for GPU reset,
* let MCA notifier do page retirement.
*/
kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
amdgpu_ras_reset_gpu(adev);
}
return ret; return ret;
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment