Commit 4d33e0f1 authored by Tao Zhou, committed by Alex Deucher

drm/amdgpu: exclude duplicate pages from UMC RAS UE count

If a UMC bad page is reserved but not freed by an application, the
application may trigger uncorrectable errors repeatedly by accessing the page.

v2: add specific function to do the check.
v3: remove duplicate pages, calculate the number of newly added bad pages.
v4: reuse save_bad_pages to calculate the number of newly added bad pages.
Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Stanley.Yang <Stanley.Yang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent e69c7857
...@@ -176,7 +176,7 @@ static int amdgpu_reserve_page_direct(struct amdgpu_device *adev, uint64_t addre ...@@ -176,7 +176,7 @@ static int amdgpu_reserve_page_direct(struct amdgpu_device *adev, uint64_t addre
if (amdgpu_bad_page_threshold != 0) { if (amdgpu_bad_page_threshold != 0) {
amdgpu_ras_add_bad_pages(adev, err_data.err_addr, amdgpu_ras_add_bad_pages(adev, err_data.err_addr,
err_data.err_addr_cnt); err_data.err_addr_cnt);
amdgpu_ras_save_bad_pages(adev); amdgpu_ras_save_bad_pages(adev, NULL);
} }
dev_warn(adev->dev, "WARNING: THIS IS ONLY FOR TEST PURPOSES AND WILL CORRUPT RAS EEPROM\n"); dev_warn(adev->dev, "WARNING: THIS IS ONLY FOR TEST PURPOSES AND WILL CORRUPT RAS EEPROM\n");
...@@ -2084,22 +2084,32 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev, ...@@ -2084,22 +2084,32 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
/* /*
* write error record array to eeprom, the function should be * write error record array to eeprom, the function should be
* protected by recovery_lock * protected by recovery_lock
* new_cnt: new added UE count, excluding reserved bad pages, can be NULL
*/ */
int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev) int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev,
unsigned long *new_cnt)
{ {
struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct ras_err_handler_data *data; struct ras_err_handler_data *data;
struct amdgpu_ras_eeprom_control *control; struct amdgpu_ras_eeprom_control *control;
int save_count; int save_count;
if (!con || !con->eh_data) if (!con || !con->eh_data) {
if (new_cnt)
*new_cnt = 0;
return 0; return 0;
}
mutex_lock(&con->recovery_lock); mutex_lock(&con->recovery_lock);
control = &con->eeprom_control; control = &con->eeprom_control;
data = con->eh_data; data = con->eh_data;
save_count = data->count - control->ras_num_recs; save_count = data->count - control->ras_num_recs;
mutex_unlock(&con->recovery_lock); mutex_unlock(&con->recovery_lock);
if (new_cnt)
*new_cnt = save_count / adev->umc.retire_unit;
/* only new entries are saved */ /* only new entries are saved */
if (save_count > 0) { if (save_count > 0) {
if (amdgpu_ras_eeprom_append(control, if (amdgpu_ras_eeprom_append(control,
......
...@@ -547,7 +547,8 @@ int amdgpu_ras_query_error_count(struct amdgpu_device *adev, ...@@ -547,7 +547,8 @@ int amdgpu_ras_query_error_count(struct amdgpu_device *adev,
int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev, int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
struct eeprom_table_record *bps, int pages); struct eeprom_table_record *bps, int pages);
int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev); int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev,
unsigned long *new_cnt);
static inline enum ta_ras_block static inline enum ta_ras_block
amdgpu_ras_block_to_ta(enum amdgpu_ras_block block) { amdgpu_ras_block_to_ta(enum amdgpu_ras_block block) {
......
...@@ -68,7 +68,7 @@ int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev, ...@@ -68,7 +68,7 @@ int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev,
if (amdgpu_bad_page_threshold != 0) { if (amdgpu_bad_page_threshold != 0) {
amdgpu_ras_add_bad_pages(adev, err_data.err_addr, amdgpu_ras_add_bad_pages(adev, err_data.err_addr,
err_data.err_addr_cnt); err_data.err_addr_cnt);
amdgpu_ras_save_bad_pages(adev); amdgpu_ras_save_bad_pages(adev, NULL);
} }
out: out:
...@@ -147,7 +147,7 @@ static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev, ...@@ -147,7 +147,7 @@ static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
err_data->err_addr_cnt) { err_data->err_addr_cnt) {
amdgpu_ras_add_bad_pages(adev, err_data->err_addr, amdgpu_ras_add_bad_pages(adev, err_data->err_addr,
err_data->err_addr_cnt); err_data->err_addr_cnt);
amdgpu_ras_save_bad_pages(adev); amdgpu_ras_save_bad_pages(adev, &(err_data->ue_count));
amdgpu_dpm_send_hbm_bad_pages_num(adev, con->eeprom_control.ras_num_recs); amdgpu_dpm_send_hbm_bad_pages_num(adev, con->eeprom_control.ras_num_recs);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment