Commit bef153b7 authored by David Yat Sin's avatar David Yat Sin Committed by Alex Deucher

drm/amdkfd: CRIU implement gpu_id remapping

When doing a restore on a different node, the gpu_ids on the restore
node may differ from those on the checkpoint node. But the user-space
application will still use the original gpu_ids in its ioctl calls.
Add code to create a gpu_id mapping so that KFD can determine the
actual gpu_id during user ioctls.
Reviewed-by: default avatarFelix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: default avatarDavid Yat Sin <david.yatsin@amd.com>
Signed-off-by: default avatarRajneesh Bhardwaj <rajneesh.bhardwaj@amd.com>
Signed-off-by: default avatarAlex Deucher <alexander.deucher@amd.com>
parent 40e8a766
This diff is collapsed.
...@@ -342,11 +342,12 @@ int kfd_kmap_event_page(struct kfd_process *p, uint64_t event_page_offset) ...@@ -342,11 +342,12 @@ int kfd_kmap_event_page(struct kfd_process *p, uint64_t event_page_offset)
return -EINVAL; return -EINVAL;
} }
kfd = kfd_device_by_id(GET_GPU_ID(event_page_offset)); pdd = kfd_process_device_data_by_id(p, GET_GPU_ID(event_page_offset));
if (!kfd) { if (!pdd) {
pr_err("Getting device by id failed in %s\n", __func__); pr_err("Getting device by id failed in %s\n", __func__);
return -EINVAL; return -EINVAL;
} }
kfd = pdd->dev;
pdd = kfd_bind_process_to_device(kfd, p); pdd = kfd_bind_process_to_device(kfd, p);
if (IS_ERR(pdd)) if (IS_ERR(pdd))
...@@ -1094,6 +1095,7 @@ void kfd_signal_iommu_event(struct kfd_dev *dev, u32 pasid, ...@@ -1094,6 +1095,7 @@ void kfd_signal_iommu_event(struct kfd_dev *dev, u32 pasid,
{ {
struct kfd_hsa_memory_exception_data memory_exception_data; struct kfd_hsa_memory_exception_data memory_exception_data;
struct vm_area_struct *vma; struct vm_area_struct *vma;
int user_gpu_id;
/* /*
* Because we are called from arbitrary context (workqueue) as opposed * Because we are called from arbitrary context (workqueue) as opposed
...@@ -1115,12 +1117,17 @@ void kfd_signal_iommu_event(struct kfd_dev *dev, u32 pasid, ...@@ -1115,12 +1117,17 @@ void kfd_signal_iommu_event(struct kfd_dev *dev, u32 pasid,
return; /* Process is exiting */ return; /* Process is exiting */
} }
user_gpu_id = kfd_process_get_user_gpu_id(p, dev->id);
if (unlikely(user_gpu_id == -EINVAL)) {
WARN_ONCE(1, "Could not get user_gpu_id from dev->id:%x\n", dev->id);
return;
}
memset(&memory_exception_data, 0, sizeof(memory_exception_data)); memset(&memory_exception_data, 0, sizeof(memory_exception_data));
mmap_read_lock(mm); mmap_read_lock(mm);
vma = find_vma(mm, address); vma = find_vma(mm, address);
memory_exception_data.gpu_id = dev->id; memory_exception_data.gpu_id = user_gpu_id;
memory_exception_data.va = address; memory_exception_data.va = address;
/* Set failure reason */ /* Set failure reason */
memory_exception_data.failure.NotPresent = 1; memory_exception_data.failure.NotPresent = 1;
...@@ -1196,11 +1203,19 @@ void kfd_signal_vm_fault_event(struct kfd_dev *dev, u32 pasid, ...@@ -1196,11 +1203,19 @@ void kfd_signal_vm_fault_event(struct kfd_dev *dev, u32 pasid,
uint32_t id; uint32_t id;
struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
struct kfd_hsa_memory_exception_data memory_exception_data; struct kfd_hsa_memory_exception_data memory_exception_data;
int user_gpu_id;
if (!p) if (!p)
return; /* Presumably process exited. */ return; /* Presumably process exited. */
user_gpu_id = kfd_process_get_user_gpu_id(p, dev->id);
if (unlikely(user_gpu_id == -EINVAL)) {
WARN_ONCE(1, "Could not get user_gpu_id from dev->id:%x\n", dev->id);
return;
}
memset(&memory_exception_data, 0, sizeof(memory_exception_data)); memset(&memory_exception_data, 0, sizeof(memory_exception_data));
memory_exception_data.gpu_id = dev->id; memory_exception_data.gpu_id = user_gpu_id;
memory_exception_data.failure.imprecise = true; memory_exception_data.failure.imprecise = true;
/* Set failure reason */ /* Set failure reason */
if (info) { if (info) {
...@@ -1240,27 +1255,34 @@ void kfd_signal_reset_event(struct kfd_dev *dev) ...@@ -1240,27 +1255,34 @@ void kfd_signal_reset_event(struct kfd_dev *dev)
/* Whole gpu reset caused by GPU hang and memory is lost */ /* Whole gpu reset caused by GPU hang and memory is lost */
memset(&hw_exception_data, 0, sizeof(hw_exception_data)); memset(&hw_exception_data, 0, sizeof(hw_exception_data));
hw_exception_data.gpu_id = dev->id;
hw_exception_data.memory_lost = 1; hw_exception_data.memory_lost = 1;
hw_exception_data.reset_cause = reset_cause; hw_exception_data.reset_cause = reset_cause;
memset(&memory_exception_data, 0, sizeof(memory_exception_data)); memset(&memory_exception_data, 0, sizeof(memory_exception_data));
memory_exception_data.ErrorType = KFD_MEM_ERR_SRAM_ECC; memory_exception_data.ErrorType = KFD_MEM_ERR_SRAM_ECC;
memory_exception_data.gpu_id = dev->id;
memory_exception_data.failure.imprecise = true; memory_exception_data.failure.imprecise = true;
idx = srcu_read_lock(&kfd_processes_srcu); idx = srcu_read_lock(&kfd_processes_srcu);
hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
int user_gpu_id = kfd_process_get_user_gpu_id(p, dev->id);
if (unlikely(user_gpu_id == -EINVAL)) {
WARN_ONCE(1, "Could not get user_gpu_id from dev->id:%x\n", dev->id);
continue;
}
mutex_lock(&p->event_mutex); mutex_lock(&p->event_mutex);
id = KFD_FIRST_NONSIGNAL_EVENT_ID; id = KFD_FIRST_NONSIGNAL_EVENT_ID;
idr_for_each_entry_continue(&p->event_idr, ev, id) { idr_for_each_entry_continue(&p->event_idr, ev, id) {
if (ev->type == KFD_EVENT_TYPE_HW_EXCEPTION) { if (ev->type == KFD_EVENT_TYPE_HW_EXCEPTION) {
ev->hw_exception_data = hw_exception_data; ev->hw_exception_data = hw_exception_data;
ev->hw_exception_data.gpu_id = user_gpu_id;
set_event(ev); set_event(ev);
} }
if (ev->type == KFD_EVENT_TYPE_MEMORY && if (ev->type == KFD_EVENT_TYPE_MEMORY &&
reset_cause == KFD_HW_EXCEPTION_ECC) { reset_cause == KFD_HW_EXCEPTION_ECC) {
ev->memory_exception_data = memory_exception_data; ev->memory_exception_data = memory_exception_data;
ev->memory_exception_data.gpu_id = user_gpu_id;
set_event(ev); set_event(ev);
} }
} }
...@@ -1276,18 +1298,25 @@ void kfd_signal_poison_consumed_event(struct kfd_dev *dev, u32 pasid) ...@@ -1276,18 +1298,25 @@ void kfd_signal_poison_consumed_event(struct kfd_dev *dev, u32 pasid)
struct kfd_hsa_hw_exception_data hw_exception_data; struct kfd_hsa_hw_exception_data hw_exception_data;
struct kfd_event *ev; struct kfd_event *ev;
uint32_t id = KFD_FIRST_NONSIGNAL_EVENT_ID; uint32_t id = KFD_FIRST_NONSIGNAL_EVENT_ID;
int user_gpu_id;
if (!p) if (!p)
return; /* Presumably process exited. */ return; /* Presumably process exited. */
user_gpu_id = kfd_process_get_user_gpu_id(p, dev->id);
if (unlikely(user_gpu_id == -EINVAL)) {
WARN_ONCE(1, "Could not get user_gpu_id from dev->id:%x\n", dev->id);
return;
}
memset(&hw_exception_data, 0, sizeof(hw_exception_data)); memset(&hw_exception_data, 0, sizeof(hw_exception_data));
hw_exception_data.gpu_id = dev->id; hw_exception_data.gpu_id = user_gpu_id;
hw_exception_data.memory_lost = 1; hw_exception_data.memory_lost = 1;
hw_exception_data.reset_cause = KFD_HW_EXCEPTION_ECC; hw_exception_data.reset_cause = KFD_HW_EXCEPTION_ECC;
memset(&memory_exception_data, 0, sizeof(memory_exception_data)); memset(&memory_exception_data, 0, sizeof(memory_exception_data));
memory_exception_data.ErrorType = KFD_MEM_ERR_POISON_CONSUMED; memory_exception_data.ErrorType = KFD_MEM_ERR_POISON_CONSUMED;
memory_exception_data.gpu_id = dev->id; memory_exception_data.gpu_id = user_gpu_id;
memory_exception_data.failure.imprecise = true; memory_exception_data.failure.imprecise = true;
mutex_lock(&p->event_mutex); mutex_lock(&p->event_mutex);
......
...@@ -774,6 +774,12 @@ struct kfd_process_device { ...@@ -774,6 +774,12 @@ struct kfd_process_device {
uint64_t faults; uint64_t faults;
uint64_t page_in; uint64_t page_in;
uint64_t page_out; uint64_t page_out;
/*
* If this process has been checkpointed before, then the user
* application will use the original gpu_id on the
* checkpointed node to refer to this device.
*/
uint32_t user_gpu_id;
}; };
#define qpd_to_pdd(x) container_of(x, struct kfd_process_device, qpd) #define qpd_to_pdd(x) container_of(x, struct kfd_process_device, qpd)
...@@ -933,6 +939,11 @@ int kfd_process_restore_queues(struct kfd_process *p); ...@@ -933,6 +939,11 @@ int kfd_process_restore_queues(struct kfd_process *p);
void kfd_suspend_all_processes(void); void kfd_suspend_all_processes(void);
int kfd_resume_all_processes(void); int kfd_resume_all_processes(void);
struct kfd_process_device *kfd_process_device_data_by_id(struct kfd_process *process,
uint32_t gpu_id);
int kfd_process_get_user_gpu_id(struct kfd_process *p, uint32_t actual_gpu_id);
int kfd_process_device_init_vm(struct kfd_process_device *pdd, int kfd_process_device_init_vm(struct kfd_process_device *pdd,
struct file *drm_file); struct file *drm_file);
struct kfd_process_device *kfd_bind_process_to_device(struct kfd_dev *dev, struct kfd_process_device *kfd_bind_process_to_device(struct kfd_dev *dev,
......
...@@ -1526,6 +1526,7 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev, ...@@ -1526,6 +1526,7 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev,
pdd->runtime_inuse = false; pdd->runtime_inuse = false;
pdd->vram_usage = 0; pdd->vram_usage = 0;
pdd->sdma_past_activity_counter = 0; pdd->sdma_past_activity_counter = 0;
pdd->user_gpu_id = dev->id;
atomic64_set(&pdd->evict_duration_counter, 0); atomic64_set(&pdd->evict_duration_counter, 0);
p->pdds[p->n_pdds++] = pdd; p->pdds[p->n_pdds++] = pdd;
...@@ -1981,6 +1982,37 @@ void kfd_flush_tlb(struct kfd_process_device *pdd, enum TLB_FLUSH_TYPE type) ...@@ -1981,6 +1982,37 @@ void kfd_flush_tlb(struct kfd_process_device *pdd, enum TLB_FLUSH_TYPE type)
} }
} }
/*
 * Look up the per-process device data (pdd) whose user-visible gpu_id
 * matches @gpu_id. The user_gpu_id is the id the application saw at
 * checkpoint time, which may differ from the current device id after a
 * CRIU restore on another node.
 *
 * Returns the matching pdd, or NULL if @gpu_id is 0 (invalid sentinel)
 * or no device in the process uses that user gpu_id.
 */
struct kfd_process_device *kfd_process_device_data_by_id(struct kfd_process *p, uint32_t gpu_id)
{
	int idx;

	if (!gpu_id)
		return NULL;

	for (idx = 0; idx < p->n_pdds; idx++) {
		if (p->pdds[idx]->user_gpu_id == gpu_id)
			return p->pdds[idx];
	}

	return NULL;
}
/*
 * Translate an actual (current hardware) gpu_id into the user-visible
 * gpu_id that the application refers to in ioctls. These differ after a
 * CRIU restore on a node with different gpu_ids.
 *
 * An @actual_gpu_id of 0 maps to 0 (no device / invalid sentinel).
 * Returns the user gpu_id on success, or -EINVAL if no device owned by
 * this process has the given actual id.
 */
int kfd_process_get_user_gpu_id(struct kfd_process *p, uint32_t actual_gpu_id)
{
	int idx;

	if (actual_gpu_id == 0)
		return 0;

	for (idx = 0; idx < p->n_pdds; idx++) {
		struct kfd_process_device *pdd = p->pdds[idx];

		if (pdd->dev->id == actual_gpu_id)
			return pdd->user_gpu_id;
	}

	return -EINVAL;
}
#if defined(CONFIG_DEBUG_FS) #if defined(CONFIG_DEBUG_FS)
int kfd_debugfs_mqds_by_process(struct seq_file *m, void *data) int kfd_debugfs_mqds_by_process(struct seq_file *m, void *data)
......
...@@ -610,7 +610,7 @@ static int criu_checkpoint_queue(struct kfd_process_device *pdd, ...@@ -610,7 +610,7 @@ static int criu_checkpoint_queue(struct kfd_process_device *pdd,
mqd = (void *)(q_data + 1); mqd = (void *)(q_data + 1);
ctl_stack = mqd + q_data->mqd_size; ctl_stack = mqd + q_data->mqd_size;
q_data->gpu_id = pdd->dev->id; q_data->gpu_id = pdd->user_gpu_id;
q_data->type = q->properties.type; q_data->type = q->properties.type;
q_data->format = q->properties.format; q_data->format = q->properties.format;
q_data->q_id = q->properties.queue_id; q_data->q_id = q->properties.queue_id;
...@@ -769,7 +769,6 @@ int kfd_criu_restore_queue(struct kfd_process *p, ...@@ -769,7 +769,6 @@ int kfd_criu_restore_queue(struct kfd_process *p,
uint64_t q_extra_data_size; uint64_t q_extra_data_size;
struct queue_properties qp; struct queue_properties qp;
unsigned int queue_id; unsigned int queue_id;
struct kfd_dev *dev;
int ret = 0; int ret = 0;
if (*priv_data_offset + sizeof(*q_data) > max_priv_data_size) if (*priv_data_offset + sizeof(*q_data) > max_priv_data_size)
...@@ -807,20 +806,11 @@ int kfd_criu_restore_queue(struct kfd_process *p, ...@@ -807,20 +806,11 @@ int kfd_criu_restore_queue(struct kfd_process *p,
*priv_data_offset += q_extra_data_size; *priv_data_offset += q_extra_data_size;
dev = kfd_device_by_id(q_data->gpu_id); pdd = kfd_process_device_data_by_id(p, q_data->gpu_id);
if (!dev) {
pr_err("Could not get kfd_dev from gpu_id = 0x%x\n",
q_data->gpu_id);
ret = -EINVAL;
goto exit;
}
pdd = kfd_get_process_device_data(dev, p);
if (!pdd) { if (!pdd) {
pr_err("Failed to get pdd\n"); pr_err("Failed to get pdd\n");
ret = -EFAULT; ret = -EINVAL;
return ret; goto exit;
} }
/* data stored in this order: mqd, ctl_stack */ /* data stored in this order: mqd, ctl_stack */
mqd = q_extra_data; mqd = q_extra_data;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment