Commit 5f08275c authored by YiPeng Chai, committed by Alex Deucher

drm/amdgpu: refine poison creation interrupt handler

To cope with cases where a large number of RAS
poison interrupts arrive:
1. Record poison creation requests in a counter
   variable instead of queueing them, to avoid
   filling up the poison fifo.
2. Handle poison creation requests with priority,
   rather than in the order in which the driver
   received the requests (a sketch of this pattern
   follows the commit metadata below).
Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent cbda2758
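The diff below replaces the per-interrupt fifo entry for poison creation with an atomic counter, and makes the page-retirement thread drain that counter with priority before it touches any queued consumption message. The following is a minimal user-space sketch of that counter-plus-priority-drain pattern, assuming C11 atomics; only the counter names mirror the patch, while the handler, the interrupt stub and main() are hypothetical stand-ins, not the actual amdgpu implementation:

/*
 * Minimal user-space sketch (NOT the amdgpu driver code) of the pattern
 * this patch adopts: the interrupt path only bumps atomic counters, and
 * the worker drains the creation counter with priority before anything
 * else.  Only the counter names mirror the patch; the handler, the
 * interrupt stub and main() are hypothetical stand-ins.
 */
#include <stdatomic.h>
#include <stdio.h>

static atomic_uint page_retirement_req_cnt;	/* all pending requests */
static atomic_uint poison_creation_count;	/* creation requests only */

/* Interrupt-side path: record a creation request without queueing a fifo entry. */
static void on_poison_creation_irq(void)
{
	atomic_fetch_add(&page_retirement_req_cnt, 1);
	atomic_fetch_add(&poison_creation_count, 1);
	/* the driver then wakes the retirement thread via its wait queue */
}

/* Stand-in for amdgpu_ras_poison_creation_handler(); returns 0 on success. */
static int handle_poison_creation(unsigned int count)
{
	printf("handling %u poison creation request(s)\n", count);
	return 0;
}

/* Worker-side drain loop, mirroring the do/while added to the retirement thread. */
static void drain_poison_creation(void)
{
	unsigned int count;
	int ret;

	do {
		count = atomic_load(&poison_creation_count);
		ret = handle_poison_creation(count);
		if (ret)	/* -EIO in the driver: stop retrying */
			break;

		if (count) {
			atomic_fetch_sub(&poison_creation_count, count);
			atomic_fetch_sub(&page_retirement_req_cnt, count);
		}
	/* keep looping while new creation interrupts arrived during handling */
	} while (atomic_load(&poison_creation_count));
}

int main(void)
{
	for (int i = 0; i < 3; i++)
		on_poison_creation_irq();

	drain_poison_creation();
	printf("remaining requests: %u\n",
	       (unsigned int)atomic_load(&page_retirement_req_cnt));
	return 0;
}

The re-check in the while condition is what gives creation requests priority: the worker keeps looping as long as new creation interrupts arrive, and only afterwards falls through to the consumption path.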
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2105,10 +2105,8 @@ static void amdgpu_ras_interrupt_poison_creation_handler(struct ras_manager *obj
 	if (amdgpu_ip_version(obj->adev, UMC_HWIP, 0) >= IP_VERSION(12, 0, 0)) {
 		struct amdgpu_ras *con = amdgpu_ras_get_context(obj->adev);
 
-		amdgpu_ras_put_poison_req(obj->adev,
-			AMDGPU_RAS_BLOCK__UMC, 0, NULL, NULL, false);
-
 		atomic_inc(&con->page_retirement_req_cnt);
+		atomic_inc(&con->poison_creation_count);
 
 		wake_up(&con->page_retirement_wq);
 	}
@@ -2939,9 +2937,10 @@ static int amdgpu_ras_page_retirement_thread(void *param)
 {
 	struct amdgpu_device *adev = (struct amdgpu_device *)param;
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+	uint32_t poison_creation_count;
+	int ret;
 	struct ras_poison_msg poison_msg;
 	enum amdgpu_ras_block ras_block;
-	bool poison_creation_is_handled = false;
 
 	while (!kthread_should_stop()) {
@@ -2952,7 +2951,18 @@ static int amdgpu_ras_page_retirement_thread(void *param)
 		if (kthread_should_stop())
 			break;
 
-		atomic_dec(&con->page_retirement_req_cnt);
+		do {
+			poison_creation_count = atomic_read(&con->poison_creation_count);
+			ret = amdgpu_ras_poison_creation_handler(adev, poison_creation_count);
+			if (ret == -EIO)
+				break;
+
+			if (poison_creation_count) {
+				atomic_sub(poison_creation_count, &con->poison_creation_count);
+				atomic_sub(poison_creation_count, &con->page_retirement_req_cnt);
+			}
+		} while (atomic_read(&con->poison_creation_count));
 
 		if (!amdgpu_ras_get_poison_req(adev, &poison_msg))
 			continue;
@@ -2962,24 +2972,7 @@ static int amdgpu_ras_page_retirement_thread(void *param)
 		dev_dbg(adev->dev, "Start processing ras block %s(%d)\n",
 				ras_block_str(ras_block), ras_block);
 
-		if (ras_block == AMDGPU_RAS_BLOCK__UMC) {
-			amdgpu_ras_poison_creation_handler(adev,
-				MAX_UMC_POISON_POLLING_TIME_ASYNC);
-			poison_creation_is_handled = true;
-		} else {
-			/* poison_creation_is_handled:
-			 * false: no poison creation interrupt, but it has poison
-			 * consumption interrupt.
-			 * true: It has poison creation interrupt at the beginning,
-			 * but it has no poison creation interrupt later.
-			 */
-			amdgpu_ras_poison_creation_handler(adev,
-				poison_creation_is_handled ?
-				0 : MAX_UMC_POISON_POLLING_TIME_ASYNC);
-
-			amdgpu_ras_poison_consumption_handler(adev, &poison_msg);
-			poison_creation_is_handled = false;
-		}
+		amdgpu_ras_poison_consumption_handler(adev, &poison_msg);
 	}
 
 	return 0;
@@ -3052,6 +3045,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
 	mutex_init(&con->page_retirement_lock);
 	init_waitqueue_head(&con->page_retirement_wq);
 	atomic_set(&con->page_retirement_req_cnt, 0);
+	atomic_set(&con->poison_creation_count, 0);
 	con->page_retirement_thread =
 		kthread_run(amdgpu_ras_page_retirement_thread, adev, "umc_page_retirement");
 	if (IS_ERR(con->page_retirement_thread)) {
@@ -3100,6 +3094,7 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
 		kthread_stop(con->page_retirement_thread);
 
 	atomic_set(&con->page_retirement_req_cnt, 0);
+	atomic_set(&con->poison_creation_count, 0);
 
 	mutex_destroy(&con->page_rsv_lock);
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -532,6 +532,7 @@ struct amdgpu_ras {
 	wait_queue_head_t page_retirement_wq;
 	struct mutex page_retirement_lock;
 	atomic_t page_retirement_req_cnt;
+	atomic_t poison_creation_count;
 	struct mutex page_rsv_lock;
 	DECLARE_KFIFO(poison_fifo, struct ras_poison_msg, 128);
 	struct ras_ecc_log_info umc_ecc_log;