Commit b6485bed authored by Tao Zhou, committed by Alex Deucher

drm/amdkfd: reset queue which consumes RAS poison (v2)

CP supports unmapping a queue in reset mode, which destroys only the specific queue without affecting others.
Replacing the whole GPU reset with the reset-queue mode for RAS poison consumption
saves much time, and we can still fall back to a full GPU reset if the queue
reset fails.

v2: Return directly if process is NULL;
    Reset queue solution is not applicable to SDMA, fallback to legacy way;
    Call kfd_unref_process after lookup process.
Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Acked-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent dec63443
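
For orientation, the change wires the per-queue reset result into the existing poison-consumption path: try the lightweight queue reset first, and escalate to a full GPU reset only if it fails. The following is a minimal, self-contained C sketch of that fallback pattern, not the kernel code itself; try_reset_queues() and poison_consumption_handler() are hypothetical stand-ins for dev->dqm->ops.reset_queues() and amdgpu_amdkfd_ras_poison_consumption_handler().

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-in for dqm->ops.reset_queues(): returns 0 on success. */
static int try_reset_queues(unsigned int pasid)
{
	printf("resetting queues of pasid 0x%x\n", pasid);
	return 0;	/* pretend the per-queue reset succeeded */
}

/* Hypothetical stand-in for amdgpu_amdkfd_ras_poison_consumption_handler(). */
static void poison_consumption_handler(bool reset)
{
	if (reset)
		printf("queue reset failed: fall back to full GPU reset\n");
	else
		printf("queue reset succeeded: page retirement only\n");
}

int main(void)
{
	unsigned int pasid = 0x8001;	/* hypothetical process address space ID */
	int ret = try_reset_queues(pasid);

	/* Escalate to a whole-GPU reset only when the queue reset failed. */
	poison_consumption_handler(ret != 0);
	return 0;
}
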
@@ -721,13 +721,13 @@ bool amdgpu_amdkfd_have_atomics_support(struct amdgpu_device *adev)
 	return adev->have_atomics_support;
 }
 
-void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev)
+void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev, bool reset)
 {
 	struct ras_err_data err_data = {0, 0, 0, NULL};
 
 	/* CPU MCA will handle page retirement if connected_to_cpu is 1 */
 	if (!adev->gmc.xgmi.connected_to_cpu)
-		amdgpu_umc_process_ras_data_cb(adev, &err_data, NULL);
-	else
+		amdgpu_umc_do_page_retirement(adev, &err_data, NULL, reset);
+	else if (reset)
 		amdgpu_amdkfd_gpu_reset(adev);
 }
@@ -296,7 +296,8 @@ int amdgpu_amdkfd_gpuvm_import_dmabuf(struct amdgpu_device *adev,
 				      uint64_t *mmap_offset);
 int amdgpu_amdkfd_get_tile_config(struct amdgpu_device *adev,
 				struct tile_config *config);
-void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev);
+void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
+				bool reset);
 #if IS_ENABLED(CONFIG_HSA_AMD)
 void amdgpu_amdkfd_gpuvm_init_mem_limits(void);
 void amdgpu_amdkfd_gpuvm_destroy_cb(struct amdgpu_device *adev,
...
@@ -89,6 +89,44 @@ enum SQ_INTERRUPT_ERROR_TYPE {
 #define KFD_SQ_INT_DATA__ERR_TYPE_MASK		0xF00000
 #define KFD_SQ_INT_DATA__ERR_TYPE__SHIFT	20
 
+static void event_interrupt_poison_consumption(struct kfd_dev *dev,
+				uint16_t pasid, uint16_t source_id)
+{
+	int ret = -EINVAL;
+	struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
+
+	if (!p)
+		return;
+
+	/* all queues of a process will be unmapped in one time */
+	if (atomic_read(&p->poison)) {
+		kfd_unref_process(p);
+		return;
+	}
+
+	atomic_set(&p->poison, 1);
+	kfd_unref_process(p);
+
+	switch (source_id) {
+	case SOC15_INTSRC_SQ_INTERRUPT_MSG:
+		if (dev->dqm->ops.reset_queues)
+			ret = dev->dqm->ops.reset_queues(dev->dqm, pasid);
+		break;
+	case SOC15_INTSRC_SDMA_ECC:
+	default:
+		break;
+	}
+
+	kfd_signal_poison_consumed_event(dev, pasid);
+
+	/* resetting queue passes, do page retirement without gpu reset
+	   resetting queue fails, fallback to gpu reset solution */
+	if (!ret)
+		amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, false);
+	else
+		amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, true);
+}
+
 static bool event_interrupt_isr_v9(struct kfd_dev *dev,
 					const uint32_t *ih_ring_entry,
 					uint32_t *patched_ihre,
@@ -230,8 +268,7 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
 					sq_intr_err);
 				if (sq_intr_err != SQ_INTERRUPT_ERROR_TYPE_ILLEGAL_INST &&
 					sq_intr_err != SQ_INTERRUPT_ERROR_TYPE_MEMVIOL) {
-					kfd_signal_poison_consumed_event(dev, pasid);
-					amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev);
+					event_interrupt_poison_consumption(dev, pasid, source_id);
 					return;
 				}
 				break;
@@ -252,8 +289,7 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
 		if (source_id == SOC15_INTSRC_SDMA_TRAP) {
 			kfd_signal_event_interrupt(pasid, context_id0 & 0xfffffff, 28);
 		} else if (source_id == SOC15_INTSRC_SDMA_ECC) {
-			kfd_signal_poison_consumed_event(dev, pasid);
-			amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev);
+			event_interrupt_poison_consumption(dev, pasid, source_id);
 			return;
 		}
 	} else if (client_id == SOC15_IH_CLIENTID_VMC ||
...
@@ -856,6 +856,8 @@ struct kfd_process {
 	struct svm_range_list svms;
 
 	bool xnack_enabled;
+
+	atomic_t poison;
 };
 
 #define KFD_PROCESS_TABLE_SIZE 5 /* bits: 32 entries */
...
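
The atomic_t poison flag added to struct kfd_process makes the handling one-shot per process: the first poison interrupt resets the queues and retires pages, and later interrupts for the same PASID are ignored. Below is a rough userspace analogy using C11 atomics; struct process and mark_poisoned() are illustrative stand-ins, and atomic_exchange() replaces the kernel's atomic_read()/atomic_set() pair, which runs under the driver's own process reference counting.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct process {
	atomic_int poison;	/* 0 = clean, 1 = poison already handled */
};

/* Returns true only for the first caller; later callers see the flag set. */
static bool mark_poisoned(struct process *p)
{
	return atomic_exchange(&p->poison, 1) == 0;
}

int main(void)
{
	struct process p = { .poison = 0 };

	if (mark_poisoned(&p))
		printf("first poison event: reset queues and retire pages\n");

	if (!mark_poisoned(&p))
		printf("repeat event: already handled, skip\n");

	return 0;
}
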