Commit 740a451b, authored by Felix Kuehling, committed by Alex Deucher

drm/amdkfd: Handle incomplete migration to system memory

If some pages fail to migrate to system memory, don't set
prange->actual_loc to 0. This prevents endless CPU page faults after
partial migration failures due to contested page locks.

Migration to RAM must be complete during migrations from VRAM to VRAM and
during evictions. Implement retry and fail if the migration to RAM fails.

Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
Reviewed-by: Philip Yang <Philip.Yang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 12fcf0a7
......@@ -281,6 +281,19 @@ static unsigned long svm_migrate_successful_pages(struct migrate_vma *migrate)
return cpages;
}
/**
 * svm_migrate_unsuccessful_pages - count source entries left unmigrated
 * @migrate: migrate_vma descriptor whose src[] array is scanned
 *
 * Scans the first @migrate->npages entries of @migrate->src and counts
 * every entry that has MIGRATE_PFN_VALID set while MIGRATE_PFN_MIGRATE is
 * clear. Entries without MIGRATE_PFN_VALID are not counted.
 *
 * Return: number of entries matching that condition.
 */
static unsigned long svm_migrate_unsuccessful_pages(struct migrate_vma *migrate)
{
	unsigned long failed = 0;
	unsigned long idx;

	for (idx = 0; idx < migrate->npages; idx++) {
		/* Only entries flagged as valid are candidates. */
		if (!(migrate->src[idx] & MIGRATE_PFN_VALID))
			continue;

		/* Valid entries still flagged for migration are not failures. */
		if (migrate->src[idx] & MIGRATE_PFN_MIGRATE)
			continue;

		failed++;
	}

	return failed;
}
static int
svm_migrate_copy_to_vram(struct amdgpu_device *adev, struct svm_range *prange,
struct migrate_vma *migrate, struct dma_fence **mfence,
......@@ -634,10 +647,11 @@ svm_migrate_vma_to_ram(struct amdgpu_device *adev, struct svm_range *prange,
struct vm_area_struct *vma, uint64_t start, uint64_t end)
{
uint64_t npages = (end - start) >> PAGE_SHIFT;
unsigned long upages = npages;
unsigned long cpages = 0;
struct kfd_process_device *pdd;
struct dma_fence *mfence = NULL;
struct migrate_vma migrate;
unsigned long cpages = 0;
dma_addr_t *scratch;
size_t size;
void *buf;
......@@ -671,6 +685,7 @@ svm_migrate_vma_to_ram(struct amdgpu_device *adev, struct svm_range *prange,
if (!cpages) {
pr_debug("failed collect migrate device pages [0x%lx 0x%lx]\n",
prange->start, prange->last);
upages = svm_migrate_unsuccessful_pages(&migrate);
goto out_free;
}
if (cpages != npages)
......@@ -683,8 +698,9 @@ svm_migrate_vma_to_ram(struct amdgpu_device *adev, struct svm_range *prange,
scratch, npages);
migrate_vma_pages(&migrate);
pr_debug("successful/cpages/npages 0x%lx/0x%lx/0x%lx\n",
svm_migrate_successful_pages(&migrate), cpages, migrate.npages);
upages = svm_migrate_unsuccessful_pages(&migrate);
pr_debug("unsuccessful/cpages/npages 0x%lx/0x%lx/0x%lx\n",
upages, cpages, migrate.npages);
svm_migrate_copy_done(adev, mfence);
migrate_vma_finalize(&migrate);
......@@ -698,9 +714,9 @@ svm_migrate_vma_to_ram(struct amdgpu_device *adev, struct svm_range *prange,
if (pdd)
WRITE_ONCE(pdd->page_out, pdd->page_out + cpages);
return cpages;
return upages;
}
return r;
return r ? r : upages;
}
/**
......@@ -720,7 +736,7 @@ int svm_migrate_vram_to_ram(struct svm_range *prange, struct mm_struct *mm)
unsigned long addr;
unsigned long start;
unsigned long end;
unsigned long cpages = 0;
unsigned long upages = 0;
long r = 0;
if (!prange->actual_loc) {
......@@ -756,12 +772,12 @@ int svm_migrate_vram_to_ram(struct svm_range *prange, struct mm_struct *mm)
pr_debug("failed %ld to migrate\n", r);
break;
} else {
cpages += r;
upages += r;
}
addr = next;
}
if (cpages) {
if (!upages) {
svm_range_vram_node_free(prange);
prange->actual_loc = 0;
}
......@@ -784,7 +800,7 @@ static int
svm_migrate_vram_to_vram(struct svm_range *prange, uint32_t best_loc,
struct mm_struct *mm)
{
int r;
int r, retries = 3;
/*
* TODO: for both devices with PCIe large bar or on same xgmi hive, skip
......@@ -793,9 +809,14 @@ svm_migrate_vram_to_vram(struct svm_range *prange, uint32_t best_loc,
pr_debug("from gpu 0x%x to gpu 0x%x\n", prange->actual_loc, best_loc);
r = svm_migrate_vram_to_ram(prange, mm);
if (r)
return r;
do {
r = svm_migrate_vram_to_ram(prange, mm);
if (r)
return r;
} while (prange->actual_loc && --retries);
if (prange->actual_loc)
return -EDEADLK;
return svm_migrate_ram_to_vram(prange, best_loc, mm);
}
......
......@@ -3096,6 +3096,8 @@ static void svm_range_evict_svm_bo_worker(struct work_struct *work)
struct svm_range *prange =
list_first_entry(&svm_bo->range_list,
struct svm_range, svm_bo_list);
int retries = 3;
list_del_init(&prange->svm_bo_list);
spin_unlock(&svm_bo->list_lock);
......@@ -3103,7 +3105,11 @@ static void svm_range_evict_svm_bo_worker(struct work_struct *work)
prange->start, prange->last);
mutex_lock(&prange->migrate_mutex);
svm_migrate_vram_to_ram(prange, svm_bo->eviction_fence->mm);
do {
svm_migrate_vram_to_ram(prange,
svm_bo->eviction_fence->mm);
} while (prange->actual_loc && --retries);
WARN(prange->actual_loc, "Migration failed during eviction");
mutex_lock(&prange->lock);
prange->svm_bo = NULL;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment