Commit 12b435a4 authored by Yang Wang, committed by Alex Deucher

drm/amdgpu: add ras POISON_CONSUMPTION event id support

Add amdgpu RAS POISON_CONSUMPTION event id support.
Signed-off-by: Yang Wang <kevinyang.wang@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 91ba536e
...@@ -2076,10 +2076,17 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager * ...@@ -2076,10 +2076,17 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *
struct amdgpu_ras_block_object *block_obj = struct amdgpu_ras_block_object *block_obj =
amdgpu_ras_get_ras_block(adev, obj->head.block, 0); amdgpu_ras_get_ras_block(adev, obj->head.block, 0);
struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
enum ras_event_type type = RAS_EVENT_TYPE_POISON_CONSUMPTION;
u64 event_id;
int ret;
if (!block_obj || !con) if (!block_obj || !con)
return; return;
ret = amdgpu_ras_mark_ras_event(adev, type);
if (ret)
return;
/* both query_poison_status and handle_poison_consumption are optional, /* both query_poison_status and handle_poison_consumption are optional,
* but at least one of them should be implemented if we need poison * but at least one of them should be implemented if we need poison
* consumption handler * consumption handler
...@@ -2104,7 +2111,9 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager * ...@@ -2104,7 +2111,9 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *
* For RMA case, amdgpu_umc_poison_handler will handle gpu reset. * For RMA case, amdgpu_umc_poison_handler will handle gpu reset.
*/ */
if (poison_stat && !con->is_rma) { if (poison_stat && !con->is_rma) {
dev_info(adev->dev, "GPU reset for %s RAS poison consumption is issued!\n", event_id = amdgpu_ras_acquire_event_id(adev, type);
RAS_EVENT_LOG(adev, event_id,
"GPU reset for %s RAS poison consumption is issued!\n",
block_obj->ras_comm.name); block_obj->ras_comm.name);
amdgpu_ras_reset_gpu(adev); amdgpu_ras_reset_gpu(adev);
} }
...@@ -2498,7 +2507,7 @@ static enum ras_event_type amdgpu_ras_get_fatal_error_event(struct amdgpu_device ...@@ -2498,7 +2507,7 @@ static enum ras_event_type amdgpu_ras_get_fatal_error_event(struct amdgpu_device
if (amdgpu_ras_intr_triggered()) if (amdgpu_ras_intr_triggered())
return RAS_EVENT_TYPE_FATAL; return RAS_EVENT_TYPE_FATAL;
else else
return RAS_EVENT_TYPE_INVALID; return RAS_EVENT_TYPE_POISON_CONSUMPTION;
} }
static void amdgpu_ras_do_recovery(struct work_struct *work) static void amdgpu_ras_do_recovery(struct work_struct *work)
...@@ -3986,6 +3995,7 @@ u64 amdgpu_ras_acquire_event_id(struct amdgpu_device *adev, enum ras_event_type ...@@ -3986,6 +3995,7 @@ u64 amdgpu_ras_acquire_event_id(struct amdgpu_device *adev, enum ras_event_type
switch (type) { switch (type) {
case RAS_EVENT_TYPE_FATAL: case RAS_EVENT_TYPE_FATAL:
case RAS_EVENT_TYPE_POISON_CREATION: case RAS_EVENT_TYPE_POISON_CREATION:
case RAS_EVENT_TYPE_POISON_CONSUMPTION:
event_mgr = __get_ras_event_mgr(adev); event_mgr = __get_ras_event_mgr(adev);
if (!event_mgr) if (!event_mgr)
return RAS_EVENT_INVALID_ID; return RAS_EVENT_INVALID_ID;
......
...@@ -436,6 +436,7 @@ enum ras_event_type { ...@@ -436,6 +436,7 @@ enum ras_event_type {
RAS_EVENT_TYPE_INVALID = 0, RAS_EVENT_TYPE_INVALID = 0,
RAS_EVENT_TYPE_FATAL, RAS_EVENT_TYPE_FATAL,
RAS_EVENT_TYPE_POISON_CREATION, RAS_EVENT_TYPE_POISON_CREATION,
RAS_EVENT_TYPE_POISON_CONSUMPTION,
RAS_EVENT_TYPE_COUNT, RAS_EVENT_TYPE_COUNT,
}; };
......
...@@ -27,6 +27,7 @@ ...@@ -27,6 +27,7 @@
#include "soc15_int.h" #include "soc15_int.h"
#include "kfd_device_queue_manager.h" #include "kfd_device_queue_manager.h"
#include "kfd_smi_events.h" #include "kfd_smi_events.h"
#include "amdgpu_ras.h"
/* /*
* GFX9 SQ Interrupts * GFX9 SQ Interrupts
...@@ -144,9 +145,11 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev, ...@@ -144,9 +145,11 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
uint16_t pasid, uint16_t client_id) uint16_t pasid, uint16_t client_id)
{ {
enum amdgpu_ras_block block = 0; enum amdgpu_ras_block block = 0;
int old_poison;
uint32_t reset = 0; uint32_t reset = 0;
struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
enum ras_event_type type = RAS_EVENT_TYPE_POISON_CONSUMPTION;
u64 event_id;
int old_poison, ret;
if (!p) if (!p)
return; return;
...@@ -193,9 +196,15 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev, ...@@ -193,9 +196,15 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
return; return;
} }
ret = amdgpu_ras_mark_ras_event(dev->adev, type);
if (ret)
return;
kfd_signal_poison_consumed_event(dev, pasid); kfd_signal_poison_consumed_event(dev, pasid);
dev_warn(dev->adev->dev, event_id = amdgpu_ras_acquire_event_id(dev->adev, type);
RAS_EVENT_LOG(dev->adev, event_id,
"poison is consumed by client %d, kick off gpu reset flow\n", client_id); "poison is consumed by client %d, kick off gpu reset flow\n", client_id);
amdgpu_amdkfd_ras_pasid_poison_consumption_handler(dev->adev, amdgpu_amdkfd_ras_pasid_poison_consumption_handler(dev->adev,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment