Commit e278849c authored by YiPeng Chai, committed by Alex Deucher

drm/amdgpu: refine poison consumption interrupt handler

1. The poison fifo is only used for poison consumption
   requests.
2. Merge reset requests when the poison fifo caches multiple
   poison consumption messages.
Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 5f08275c
...@@ -2911,23 +2911,41 @@ static int amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev, ...@@ -2911,23 +2911,41 @@ static int amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev,
} }
static int amdgpu_ras_poison_consumption_handler(struct amdgpu_device *adev, static int amdgpu_ras_poison_consumption_handler(struct amdgpu_device *adev,
struct ras_poison_msg *poison_msg) uint32_t msg_count, uint32_t *gpu_reset)
{ {
struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
uint32_t reset = poison_msg->reset; uint32_t reset_flags = 0, reset = 0;
uint16_t pasid = poison_msg->pasid; struct ras_poison_msg msg;
int ret, i;
kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
if (poison_msg->pasid_fn) for (i = 0; i < msg_count; i++) {
poison_msg->pasid_fn(adev, pasid, poison_msg->data); ret = amdgpu_ras_get_poison_req(adev, &msg);
if (!ret)
continue;
if (msg.pasid_fn)
msg.pasid_fn(adev, msg.pasid, msg.data);
reset_flags |= msg.reset;
}
/* for RMA, amdgpu_ras_poison_creation_handler will trigger gpu reset */ /* for RMA, amdgpu_ras_poison_creation_handler will trigger gpu reset */
if (reset && !con->is_rma) { if (reset_flags && !con->is_rma) {
if (reset_flags & AMDGPU_RAS_GPU_RESET_MODE1_RESET)
reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;
else if (reset_flags & AMDGPU_RAS_GPU_RESET_MODE2_RESET)
reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
else
reset = reset_flags;
flush_delayed_work(&con->page_retirement_dwork); flush_delayed_work(&con->page_retirement_dwork);
con->gpu_reset_flags |= reset; con->gpu_reset_flags |= reset;
amdgpu_ras_reset_gpu(adev); amdgpu_ras_reset_gpu(adev);
*gpu_reset = reset;
} }
return 0; return 0;
...@@ -2937,10 +2955,9 @@ static int amdgpu_ras_page_retirement_thread(void *param) ...@@ -2937,10 +2955,9 @@ static int amdgpu_ras_page_retirement_thread(void *param)
{ {
struct amdgpu_device *adev = (struct amdgpu_device *)param; struct amdgpu_device *adev = (struct amdgpu_device *)param;
struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
uint32_t poison_creation_count; uint32_t poison_creation_count, msg_count;
uint32_t gpu_reset;
int ret; int ret;
struct ras_poison_msg poison_msg;
enum amdgpu_ras_block ras_block;
while (!kthread_should_stop()) { while (!kthread_should_stop()) {
...@@ -2951,6 +2968,7 @@ static int amdgpu_ras_page_retirement_thread(void *param) ...@@ -2951,6 +2968,7 @@ static int amdgpu_ras_page_retirement_thread(void *param)
if (kthread_should_stop()) if (kthread_should_stop())
break; break;
gpu_reset = 0;
do { do {
poison_creation_count = atomic_read(&con->poison_creation_count); poison_creation_count = atomic_read(&con->poison_creation_count);
...@@ -2964,15 +2982,16 @@ static int amdgpu_ras_page_retirement_thread(void *param) ...@@ -2964,15 +2982,16 @@ static int amdgpu_ras_page_retirement_thread(void *param)
} }
} while (atomic_read(&con->poison_creation_count)); } while (atomic_read(&con->poison_creation_count));
if (!amdgpu_ras_get_poison_req(adev, &poison_msg)) if (ret != -EIO) {
continue; msg_count = kfifo_len(&con->poison_fifo);
if (msg_count) {
ras_block = poison_msg.block; ret = amdgpu_ras_poison_consumption_handler(adev,
msg_count, &gpu_reset);
dev_dbg(adev->dev, "Start processing ras block %s(%d)\n", if ((ret != -EIO) &&
ras_block_str(ras_block), ras_block); (gpu_reset != AMDGPU_RAS_GPU_RESET_MODE1_RESET))
atomic_sub(msg_count, &con->page_retirement_req_cnt);
amdgpu_ras_poison_consumption_handler(adev, &poison_msg); }
}
} }
return 0; return 0;
......
...@@ -293,14 +293,15 @@ int amdgpu_umc_pasid_poison_handler(struct amdgpu_device *adev, ...@@ -293,14 +293,15 @@ int amdgpu_umc_pasid_poison_handler(struct amdgpu_device *adev,
amdgpu_ras_error_data_fini(&err_data); amdgpu_ras_error_data_fini(&err_data);
} else { } else {
struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
int ret;
amdgpu_ras_put_poison_req(adev,
block, pasid, pasid_fn, data, reset);
ret = amdgpu_ras_put_poison_req(adev,
block, pasid, pasid_fn, data, reset);
if (!ret) {
atomic_inc(&con->page_retirement_req_cnt); atomic_inc(&con->page_retirement_req_cnt);
wake_up(&con->page_retirement_wq); wake_up(&con->page_retirement_wq);
}
} }
} else { } else {
if (adev->virt.ops && adev->virt.ops->ras_poison_handler) if (adev->virt.ops && adev->virt.ops->ras_poison_handler)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment