Commit f4409ee8 authored by Tao Zhou, committed by Alex Deucher

drm/amdgpu: add gpu reset control for umc page retirement

Add a reset parameter to UMC page retirement so that the caller can decide
whether a GPU reset is triggered as part of UMC page retirement.
Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Acked-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent d764fb2a
...@@ -23,6 +23,13 @@ ...@@ -23,6 +23,13 @@
#include "amdgpu_ras.h" #include "amdgpu_ras.h"
/*
 * amdgpu_umc_process_ras_data_cb - three-argument wrapper around
 * amdgpu_umc_do_page_retirement().
 *
 * @adev:             device the RAS error data belongs to
 * @ras_error_status: opaque error data, forwarded unchanged
 * @entry:            interrupt vector entry, forwarded unchanged
 *
 * Forwards all three arguments and always passes true for the "reset"
 * parameter of amdgpu_umc_do_page_retirement().
 * NOTE(review): the "_cb" suffix suggests this is registered as a RAS
 * callback elsewhere in the file - confirm against the registration site.
 *
 * Returns whatever amdgpu_umc_do_page_retirement() returns.
 */
static int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev,
		void *ras_error_status,
		struct amdgpu_iv_entry *entry)
{
	/* This entry point never opts out of the reset step. */
	const bool do_reset = true;

	return amdgpu_umc_do_page_retirement(adev, ras_error_status, entry,
					     do_reset);
}
int amdgpu_umc_ras_late_init(struct amdgpu_device *adev) int amdgpu_umc_ras_late_init(struct amdgpu_device *adev)
{ {
int r; int r;
...@@ -88,9 +95,10 @@ void amdgpu_umc_ras_fini(struct amdgpu_device *adev) ...@@ -88,9 +95,10 @@ void amdgpu_umc_ras_fini(struct amdgpu_device *adev)
} }
} }
int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev, int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
void *ras_error_status, void *ras_error_status,
struct amdgpu_iv_entry *entry) struct amdgpu_iv_entry *entry,
bool reset)
{ {
struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status; struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
...@@ -164,7 +172,8 @@ int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev, ...@@ -164,7 +172,8 @@ int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev,
adev->smu.ppt_funcs->send_hbm_bad_pages_num(&adev->smu, con->eeprom_control.ras_num_recs); adev->smu.ppt_funcs->send_hbm_bad_pages_num(&adev->smu, con->eeprom_control.ras_num_recs);
} }
amdgpu_ras_reset_gpu(adev); if (reset)
amdgpu_ras_reset_gpu(adev);
} }
kfree(err_data->err_addr); kfree(err_data->err_addr);
......
...@@ -78,9 +78,10 @@ struct amdgpu_umc { ...@@ -78,9 +78,10 @@ struct amdgpu_umc {
int amdgpu_umc_ras_late_init(struct amdgpu_device *adev); int amdgpu_umc_ras_late_init(struct amdgpu_device *adev);
void amdgpu_umc_ras_fini(struct amdgpu_device *adev); void amdgpu_umc_ras_fini(struct amdgpu_device *adev);
int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev, int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
void *ras_error_status, void *ras_error_status,
struct amdgpu_iv_entry *entry); struct amdgpu_iv_entry *entry,
bool reset);
int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev, int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev,
struct amdgpu_irq_src *source, struct amdgpu_irq_src *source,
struct amdgpu_iv_entry *entry); struct amdgpu_iv_entry *entry);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment