Commit 2640c3fa authored by shaoyunl's avatar shaoyunl Committed by Oded Gabbay

drm/amdkfd: Handle VM faults in KFD

1. Pre-GFX9 the amdgpu ISR saves the vm-fault status and address
   per-vmid. amdkfd needs to get the information from amdgpu through the
   new get_vm_fault_info interface. On GFX9 and later, all the required
   information is in the IH ring
2. amdkfd unmaps all queues from the faulting process and creates a new
   run-list without the guilty process
3. amdkfd notifies the runtime of the vm fault trap via EVENT_TYPE_MEMORY
Signed-off-by: shaoyun liu <shaoyun.liu@amd.com>
Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
Acked-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
parent b97dfa27
...@@ -48,18 +48,19 @@ static bool cik_event_interrupt_isr(struct kfd_dev *dev, ...@@ -48,18 +48,19 @@ static bool cik_event_interrupt_isr(struct kfd_dev *dev,
return ihre->source_id == CIK_INTSRC_CP_END_OF_PIPE || return ihre->source_id == CIK_INTSRC_CP_END_OF_PIPE ||
ihre->source_id == CIK_INTSRC_SDMA_TRAP || ihre->source_id == CIK_INTSRC_SDMA_TRAP ||
ihre->source_id == CIK_INTSRC_SQ_INTERRUPT_MSG || ihre->source_id == CIK_INTSRC_SQ_INTERRUPT_MSG ||
ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE; ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE ||
ihre->source_id == CIK_INTSRC_GFX_PAGE_INV_FAULT ||
ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT;
} }
static void cik_event_interrupt_wq(struct kfd_dev *dev, static void cik_event_interrupt_wq(struct kfd_dev *dev,
const uint32_t *ih_ring_entry) const uint32_t *ih_ring_entry)
{ {
unsigned int pasid;
const struct cik_ih_ring_entry *ihre = const struct cik_ih_ring_entry *ihre =
(const struct cik_ih_ring_entry *)ih_ring_entry; (const struct cik_ih_ring_entry *)ih_ring_entry;
uint32_t context_id = ihre->data & 0xfffffff; uint32_t context_id = ihre->data & 0xfffffff;
unsigned int vmid = (ihre->ring_id & 0x0000ff00) >> 8;
pasid = (ihre->ring_id & 0xffff0000) >> 16; unsigned int pasid = (ihre->ring_id & 0xffff0000) >> 16;
if (pasid == 0) if (pasid == 0)
return; return;
...@@ -72,6 +73,22 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev, ...@@ -72,6 +73,22 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev,
kfd_signal_event_interrupt(pasid, context_id & 0xff, 8); kfd_signal_event_interrupt(pasid, context_id & 0xff, 8);
else if (ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE) else if (ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE)
kfd_signal_hw_exception_event(pasid); kfd_signal_hw_exception_event(pasid);
else if (ihre->source_id == CIK_INTSRC_GFX_PAGE_INV_FAULT ||
ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) {
struct kfd_vm_fault_info info;
kfd_process_vm_fault(dev->dqm, pasid);
memset(&info, 0, sizeof(info));
dev->kfd2kgd->get_vm_fault_info(dev->kgd, &info);
if (!info.page_addr && !info.status)
return;
if (info.vmid == vmid)
kfd_signal_vm_fault_event(dev, pasid, &info);
else
kfd_signal_vm_fault_event(dev, pasid, NULL);
}
} }
const struct kfd_event_interrupt_class event_interrupt_class_cik = { const struct kfd_event_interrupt_class event_interrupt_class_cik = {
......
...@@ -37,6 +37,8 @@ struct cik_ih_ring_entry { ...@@ -37,6 +37,8 @@ struct cik_ih_ring_entry {
#define CIK_INTSRC_DEQUEUE_COMPLETE 0xC6 #define CIK_INTSRC_DEQUEUE_COMPLETE 0xC6
#define CIK_INTSRC_SDMA_TRAP 0xE0 #define CIK_INTSRC_SDMA_TRAP 0xE0
#define CIK_INTSRC_SQ_INTERRUPT_MSG 0xEF #define CIK_INTSRC_SQ_INTERRUPT_MSG 0xEF
#define CIK_INTSRC_GFX_PAGE_INV_FAULT 0x92
#define CIK_INTSRC_GFX_MEM_PROT_FAULT 0x93
#endif #endif
...@@ -1684,6 +1684,23 @@ void device_queue_manager_uninit(struct device_queue_manager *dqm) ...@@ -1684,6 +1684,23 @@ void device_queue_manager_uninit(struct device_queue_manager *dqm)
kfree(dqm); kfree(dqm);
} }
/*
 * Evict all queues belonging to the process that owns @pasid, in response
 * to a VM fault raised by that process.
 *
 * Returns 0 on success (or if the process has no device data on this
 * device), -EINVAL if no process matches @pasid, or the error code from
 * evict_process_queues.
 */
int kfd_process_vm_fault(struct device_queue_manager *dqm,
		unsigned int pasid)
{
	struct kfd_process *fault_proc;
	struct kfd_process_device *dev_data;
	int ret;

	fault_proc = kfd_lookup_process_by_pasid(pasid);
	if (!fault_proc)
		return -EINVAL;

	/* Only evict if the process actually has queues on this device. */
	ret = 0;
	dev_data = kfd_get_process_device_data(dqm->dev, fault_proc);
	if (dev_data)
		ret = dqm->ops.evict_process_queues(dqm, &dev_data->qpd);

	/* Drop the reference taken by kfd_lookup_process_by_pasid(). */
	kfd_unref_process(fault_proc);

	return ret;
}
#if defined(CONFIG_DEBUG_FS) #if defined(CONFIG_DEBUG_FS)
static void seq_reg_dump(struct seq_file *m, static void seq_reg_dump(struct seq_file *m,
......
...@@ -963,3 +963,40 @@ void kfd_signal_hw_exception_event(unsigned int pasid) ...@@ -963,3 +963,40 @@ void kfd_signal_hw_exception_event(unsigned int pasid)
mutex_unlock(&p->event_mutex); mutex_unlock(&p->event_mutex);
kfd_unref_process(p); kfd_unref_process(p);
} }
void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid,
struct kfd_vm_fault_info *info)
{
struct kfd_event *ev;
uint32_t id;
struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
struct kfd_hsa_memory_exception_data memory_exception_data;
if (!p)
return; /* Presumably process exited. */
memset(&memory_exception_data, 0, sizeof(memory_exception_data));
memory_exception_data.gpu_id = dev->id;
memory_exception_data.failure.imprecise = 1;
/* Set failure reason */
if (info) {
memory_exception_data.va = (info->page_addr) << PAGE_SHIFT;
memory_exception_data.failure.NotPresent =
info->prot_valid ? 1 : 0;
memory_exception_data.failure.NoExecute =
info->prot_exec ? 1 : 0;
memory_exception_data.failure.ReadOnly =
info->prot_write ? 1 : 0;
memory_exception_data.failure.imprecise = 0;
}
mutex_lock(&p->event_mutex);
id = KFD_FIRST_NONSIGNAL_EVENT_ID;
idr_for_each_entry_continue(&p->event_idr, ev, id)
if (ev->type == KFD_EVENT_TYPE_MEMORY) {
ev->memory_exception_data = memory_exception_data;
set_event(ev);
}
mutex_unlock(&p->event_mutex);
kfd_unref_process(p);
}
...@@ -57,7 +57,9 @@ static bool event_interrupt_isr_v9(struct kfd_dev *dev, ...@@ -57,7 +57,9 @@ static bool event_interrupt_isr_v9(struct kfd_dev *dev,
return source_id == SOC15_INTSRC_CP_END_OF_PIPE || return source_id == SOC15_INTSRC_CP_END_OF_PIPE ||
source_id == SOC15_INTSRC_SDMA_TRAP || source_id == SOC15_INTSRC_SDMA_TRAP ||
source_id == SOC15_INTSRC_SQ_INTERRUPT_MSG || source_id == SOC15_INTSRC_SQ_INTERRUPT_MSG ||
source_id == SOC15_INTSRC_CP_BAD_OPCODE; source_id == SOC15_INTSRC_CP_BAD_OPCODE ||
client_id == SOC15_IH_CLIENTID_VMC ||
client_id == SOC15_IH_CLIENTID_UTCL2;
} }
static void event_interrupt_wq_v9(struct kfd_dev *dev, static void event_interrupt_wq_v9(struct kfd_dev *dev,
...@@ -82,7 +84,19 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev, ...@@ -82,7 +84,19 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
kfd_signal_hw_exception_event(pasid); kfd_signal_hw_exception_event(pasid);
else if (client_id == SOC15_IH_CLIENTID_VMC || else if (client_id == SOC15_IH_CLIENTID_VMC ||
client_id == SOC15_IH_CLIENTID_UTCL2) { client_id == SOC15_IH_CLIENTID_UTCL2) {
/* TODO */ struct kfd_vm_fault_info info = {0};
uint16_t ring_id = SOC15_RING_ID_FROM_IH_ENTRY(ih_ring_entry);
info.vmid = vmid;
info.mc_id = client_id;
info.page_addr = ih_ring_entry[4] |
(uint64_t)(ih_ring_entry[5] & 0xf) << 32;
info.prot_valid = ring_id & 0x08;
info.prot_read = ring_id & 0x10;
info.prot_write = ring_id & 0x20;
kfd_process_vm_fault(dev->dqm, pasid);
kfd_signal_vm_fault_event(dev, pasid, &info);
} }
} }
......
...@@ -838,6 +838,7 @@ void device_queue_manager_uninit(struct device_queue_manager *dqm); ...@@ -838,6 +838,7 @@ void device_queue_manager_uninit(struct device_queue_manager *dqm);
struct kernel_queue *kernel_queue_init(struct kfd_dev *dev, struct kernel_queue *kernel_queue_init(struct kfd_dev *dev,
enum kfd_queue_type type); enum kfd_queue_type type);
void kernel_queue_uninit(struct kernel_queue *kq); void kernel_queue_uninit(struct kernel_queue *kq);
int kfd_process_vm_fault(struct device_queue_manager *dqm, unsigned int pasid);
/* Process Queue Manager */ /* Process Queue Manager */
struct process_queue_node { struct process_queue_node {
...@@ -964,6 +965,9 @@ int kfd_event_create(struct file *devkfd, struct kfd_process *p, ...@@ -964,6 +965,9 @@ int kfd_event_create(struct file *devkfd, struct kfd_process *p,
uint64_t *event_page_offset, uint32_t *event_slot_index); uint64_t *event_page_offset, uint32_t *event_slot_index);
int kfd_event_destroy(struct kfd_process *p, uint32_t event_id); int kfd_event_destroy(struct kfd_process *p, uint32_t event_id);
void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid,
struct kfd_vm_fault_info *info);
void kfd_flush_tlb(struct kfd_process_device *pdd); void kfd_flush_tlb(struct kfd_process_device *pdd);
int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p); int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p);
......
...@@ -219,7 +219,7 @@ struct kfd_memory_exception_failure { ...@@ -219,7 +219,7 @@ struct kfd_memory_exception_failure {
__u32 NotPresent; /* Page not present or supervisor privilege */ __u32 NotPresent; /* Page not present or supervisor privilege */
__u32 ReadOnly; /* Write access to a read-only page */ __u32 ReadOnly; /* Write access to a read-only page */
__u32 NoExecute; /* Execute access to a page marked NX */ __u32 NoExecute; /* Execute access to a page marked NX */
__u32 pad; __u32 imprecise; /* Can't determine the exact fault address */
}; };
/* memory exception data*/ /* memory exception data*/
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment