Commit 78146c1d authored by YiPeng Chai, committed by Alex Deucher

drm/amdgpu: add variable to record the deferred error number read by driver

Add variable to record the deferred error
number read by driver.
Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 29b6985d
...@@ -120,7 +120,7 @@ const char *get_ras_block_str(struct ras_common_if *ras_block) ...@@ -120,7 +120,7 @@ const char *get_ras_block_str(struct ras_common_if *ras_block)
/* typical ECC bad page rate is 1 bad page per 100MB VRAM */ /* typical ECC bad page rate is 1 bad page per 100MB VRAM */
#define RAS_BAD_PAGE_COVER (100 * 1024 * 1024ULL) #define RAS_BAD_PAGE_COVER (100 * 1024 * 1024ULL)
#define MAX_UMC_POISON_POLLING_TIME_ASYNC 100 //ms #define MAX_UMC_POISON_POLLING_TIME_ASYNC 300 //ms
#define AMDGPU_RAS_RETIRE_PAGE_INTERVAL 100 //ms #define AMDGPU_RAS_RETIRE_PAGE_INTERVAL 100 //ms
...@@ -2799,7 +2799,8 @@ static void amdgpu_ras_ecc_log_init(struct ras_ecc_log_info *ecc_log) ...@@ -2799,7 +2799,8 @@ static void amdgpu_ras_ecc_log_init(struct ras_ecc_log_info *ecc_log)
memset(&ecc_log->ecc_key, 0xad, sizeof(ecc_log->ecc_key)); memset(&ecc_log->ecc_key, 0xad, sizeof(ecc_log->ecc_key));
INIT_RADIX_TREE(&ecc_log->de_page_tree, GFP_KERNEL); INIT_RADIX_TREE(&ecc_log->de_page_tree, GFP_KERNEL);
ecc_log->de_updated = false; ecc_log->de_queried_count = 0;
ecc_log->prev_de_queried_count = 0;
} }
static void amdgpu_ras_ecc_log_fini(struct ras_ecc_log_info *ecc_log) static void amdgpu_ras_ecc_log_fini(struct ras_ecc_log_info *ecc_log)
...@@ -2818,7 +2819,8 @@ static void amdgpu_ras_ecc_log_fini(struct ras_ecc_log_info *ecc_log) ...@@ -2818,7 +2819,8 @@ static void amdgpu_ras_ecc_log_fini(struct ras_ecc_log_info *ecc_log)
mutex_unlock(&ecc_log->lock); mutex_unlock(&ecc_log->lock);
mutex_destroy(&ecc_log->lock); mutex_destroy(&ecc_log->lock);
ecc_log->de_updated = false; ecc_log->de_queried_count = 0;
ecc_log->prev_de_queried_count = 0;
} }
static void amdgpu_ras_do_page_retirement(struct work_struct *work) static void amdgpu_ras_do_page_retirement(struct work_struct *work)
...@@ -2850,40 +2852,64 @@ static void amdgpu_ras_do_page_retirement(struct work_struct *work) ...@@ -2850,40 +2852,64 @@ static void amdgpu_ras_do_page_retirement(struct work_struct *work)
mutex_unlock(&con->umc_ecc_log.lock); mutex_unlock(&con->umc_ecc_log.lock);
} }
static void amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev, static int amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev,
uint32_t timeout_ms) uint32_t poison_creation_count)
{ {
int ret = 0; int ret = 0;
struct ras_ecc_log_info *ecc_log; struct ras_ecc_log_info *ecc_log;
struct ras_query_if info; struct ras_query_if info;
uint32_t timeout = timeout_ms; uint32_t timeout = 0;
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
uint64_t de_queried_count;
uint32_t new_detect_count, total_detect_count;
uint32_t need_query_count = poison_creation_count;
bool query_data_timeout = false;
memset(&info, 0, sizeof(info)); memset(&info, 0, sizeof(info));
info.head.block = AMDGPU_RAS_BLOCK__UMC; info.head.block = AMDGPU_RAS_BLOCK__UMC;
ecc_log = &ras->umc_ecc_log; ecc_log = &ras->umc_ecc_log;
ecc_log->de_updated = false; total_detect_count = 0;
do { do {
ret = amdgpu_ras_query_error_status(adev, &info); ret = amdgpu_ras_query_error_status(adev, &info);
if (ret) { if (ret)
dev_err(adev->dev, "Failed to query ras error! ret:%d\n", ret); return ret;
return;
de_queried_count = ecc_log->de_queried_count;
if (de_queried_count > ecc_log->prev_de_queried_count) {
new_detect_count = de_queried_count - ecc_log->prev_de_queried_count;
ecc_log->prev_de_queried_count = de_queried_count;
timeout = 0;
} else {
new_detect_count = 0;
} }
if (timeout && !ecc_log->de_updated) { if (new_detect_count) {
total_detect_count += new_detect_count;
} else {
if (!timeout && need_query_count)
timeout = MAX_UMC_POISON_POLLING_TIME_ASYNC;
if (timeout) {
if (!--timeout) {
query_data_timeout = true;
break;
}
msleep(1); msleep(1);
timeout--;
} }
} while (timeout && !ecc_log->de_updated); }
} while (total_detect_count < need_query_count);
if (timeout_ms && !timeout) { if (query_data_timeout) {
dev_warn(adev->dev, "Can't find deferred error\n"); dev_warn(adev->dev, "Can't find deferred error! count: %u\n",
return; (need_query_count - total_detect_count));
return -ENOENT;
} }
if (!ret) if (total_detect_count)
schedule_delayed_work(&ras->page_retirement_dwork, 0); schedule_delayed_work(&ras->page_retirement_dwork, 0);
return 0;
} }
static int amdgpu_ras_poison_consumption_handler(struct amdgpu_device *adev, static int amdgpu_ras_poison_consumption_handler(struct amdgpu_device *adev,
......
...@@ -469,7 +469,8 @@ struct ras_ecc_log_info { ...@@ -469,7 +469,8 @@ struct ras_ecc_log_info {
struct mutex lock; struct mutex lock;
siphash_key_t ecc_key; siphash_key_t ecc_key;
struct radix_tree_root de_page_tree; struct radix_tree_root de_page_tree;
bool de_updated; uint64_t de_queried_count;
uint64_t prev_de_queried_count;
}; };
struct amdgpu_ras { struct amdgpu_ras {
......
...@@ -557,7 +557,7 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev, ...@@ -557,7 +557,7 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev,
ret = amdgpu_umc_logs_ecc_err(adev, &con->umc_ecc_log.de_page_tree, ecc_err); ret = amdgpu_umc_logs_ecc_err(adev, &con->umc_ecc_log.de_page_tree, ecc_err);
if (ret) { if (ret) {
if (ret == -EEXIST) if (ret == -EEXIST)
con->umc_ecc_log.de_updated = true; con->umc_ecc_log.de_queried_count++;
else else
dev_err(adev->dev, "Fail to log ecc error! ret:%d\n", ret); dev_err(adev->dev, "Fail to log ecc error! ret:%d\n", ret);
...@@ -566,7 +566,7 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev, ...@@ -566,7 +566,7 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev,
return ret; return ret;
} }
con->umc_ecc_log.de_updated = true; con->umc_ecc_log.de_queried_count++;
return 0; return 0;
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment