Commit f89b881c authored by Dennis Li, committed by Alex Deucher

drm/amdgpu: reserve backup pages for bad page retirement

To ensure the user sees a constant amount of usable VRAM at run time, the driver
reserves a limited number of backup pages at initialization and releases them as
bad pages are retired, so the amount of unused memory does not change.

v2: refine the code that calculates the bad page threshold
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Dennis Li <Dennis.Li@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 6c65a582
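For context, the default-threshold policy described in the commit message can be summarized by the sketch below. This is a simplification, not the driver code: calc_bad_page_threshold, BAD_PAGE_RATE_BYTES and MAX_EEPROM_RECORDS are made-up names with assumed values; in the patch the real inputs are RAS_BAD_PAGE_RATE, the VRAM size and amdgpu_ras_eeprom_get_record_max_length().

/*
 * Hedged sketch of the default bad-page-threshold policy: with the module
 * parameter left at -1, allow one retired page per fixed chunk of VRAM,
 * capped by the number of records the RAS EEPROM table can hold.
 * BAD_PAGE_RATE_BYTES and MAX_EEPROM_RECORDS are placeholder constants,
 * not the driver's values.
 */
#include <stdint.h>

#define BAD_PAGE_RATE_BYTES  (16ULL << 20)  /* assumed: 1 retired page per 16 MiB */
#define MAX_EEPROM_RECORDS   1000u          /* assumed EEPROM record capacity */

static uint32_t calc_bad_page_threshold(uint64_t vram_size_bytes, int param)
{
        uint64_t val;

        if (param == -1) {  /* auto: scale the limit with VRAM size */
                val = vram_size_bytes / BAD_PAGE_RATE_BYTES;
                return (uint32_t)(val < MAX_EEPROM_RECORDS ? val : MAX_EEPROM_RECORDS);
        }

        /* 0 disables retirement; larger requests are clamped to EEPROM capacity */
        if (param < 0 || (uint32_t)param > MAX_EEPROM_RECORDS)
                return MAX_EEPROM_RECORDS;
        return (uint32_t)param;
}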
@@ -171,7 +171,7 @@ struct amdgpu_mgpu_info mgpu_info = {
 };
 int amdgpu_ras_enable = -1;
 uint amdgpu_ras_mask = 0xffffffff;
-int amdgpu_bad_page_threshold = -1;
+int amdgpu_bad_page_threshold = 100;
 
 /**
  * DOC: vramlimit (int)
@@ -805,7 +805,7 @@ module_param_named(reset_method, amdgpu_reset_method, int, 0444);
  * faulty pages by ECC exceed threshold value and leave it for user's further
  * check.
  */
-MODULE_PARM_DESC(bad_page_threshold, "Bad page threshold(-1 = auto(default value), 0 = disable bad page retirement)");
+MODULE_PARM_DESC(bad_page_threshold, "Bad page threshold(-1 = auto, 0 = disable bad page retirement, 100 = default value)");
 module_param_named(bad_page_threshold, amdgpu_bad_page_threshold, int, 0444);
 
 MODULE_PARM_DESC(num_kcq, "number of kernel compute queue user want to setup (8 if set to greater than 8 or less than 0, only affect gfx 8+)");
...
@@ -1747,13 +1747,14 @@ static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
         return ret;
 }
 
-static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev,
-                                          uint32_t max_length)
+static uint32_t
+amdgpu_ras_calculate_badpags_threshold(struct amdgpu_device *adev)
 {
-        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
         int tmp_threshold = amdgpu_bad_page_threshold;
         u64 val;
+        uint32_t max_length = 0;
+
+        max_length = amdgpu_ras_eeprom_get_record_max_length();
 
         /*
          * Justification of value bad_page_cnt_threshold in ras structure
          *
@@ -1779,20 +1780,18 @@ static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev,
                 tmp_threshold = max_length;
 
         if (tmp_threshold == -1) {
-                val = adev->gmc.mc_vram_size;
+                val = adev->gmc.real_vram_size;
                 do_div(val, RAS_BAD_PAGE_RATE);
-                con->bad_page_cnt_threshold = min(lower_32_bits(val),
-                                                  max_length);
-        } else {
-                con->bad_page_cnt_threshold = tmp_threshold;
+                tmp_threshold = min(lower_32_bits(val), max_length);
         }
+
+        return tmp_threshold;
 }
 
 int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
 {
         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
         struct ras_err_handler_data **data;
-        uint32_t max_eeprom_records_len = 0;
         bool exc_err_limit = false;
         int ret;
@@ -1812,8 +1811,16 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
         atomic_set(&con->in_recovery, 0);
         con->adev = adev;
 
-        max_eeprom_records_len = amdgpu_ras_eeprom_get_record_max_length();
-        amdgpu_ras_validate_threshold(adev, max_eeprom_records_len);
+        if (!con->bad_page_cnt_threshold) {
+                con->bad_page_cnt_threshold =
+                        amdgpu_ras_calculate_badpags_threshold(adev);
+
+                ret = amdgpu_vram_mgr_reserve_backup_pages(
+                        ttm_manager_type(&adev->mman.bdev, TTM_PL_VRAM),
+                        con->bad_page_cnt_threshold);
+                if (ret)
+                        goto out;
+        }
 
         ret = amdgpu_ras_eeprom_init(&con->eeprom_control, &exc_err_limit);
         /*
...
@@ -48,6 +48,8 @@ struct amdgpu_vram_mgr {
         spinlock_t lock;
         struct list_head reservations_pending;
         struct list_head reserved_pages;
+        struct list_head backup_pages;
+        uint32_t num_backup_pages;
         atomic64_t usage;
         atomic64_t vis_usage;
 };
@@ -123,6 +125,8 @@ uint64_t amdgpu_vram_mgr_usage(struct ttm_resource_manager *man);
 uint64_t amdgpu_vram_mgr_vis_usage(struct ttm_resource_manager *man);
 int amdgpu_vram_mgr_reserve_range(struct ttm_resource_manager *man,
                                   uint64_t start, uint64_t size);
+int amdgpu_vram_mgr_reserve_backup_pages(struct ttm_resource_manager *man,
+                                         uint32_t num_pages);
 int amdgpu_vram_mgr_query_page_status(struct ttm_resource_manager *man,
                                       uint64_t start);
...
@@ -28,6 +28,9 @@
 #include "amdgpu_atomfirmware.h"
 #include "atom.h"
 
+static int amdgpu_vram_mgr_free_backup_pages(struct amdgpu_vram_mgr *mgr,
+                                             uint32_t num_pages);
+
 static inline struct amdgpu_vram_mgr *to_vram_mgr(struct ttm_resource_manager *man)
 {
         return container_of(man, struct amdgpu_vram_mgr, manager);
@@ -186,6 +189,7 @@ int amdgpu_vram_mgr_init(struct amdgpu_device *adev)
         spin_lock_init(&mgr->lock);
         INIT_LIST_HEAD(&mgr->reservations_pending);
         INIT_LIST_HEAD(&mgr->reserved_pages);
+        INIT_LIST_HEAD(&mgr->backup_pages);
 
         /* Add the two VRAM-related sysfs files */
         ret = sysfs_create_files(&adev->dev->kobj, amdgpu_vram_mgr_attributes);
@@ -226,6 +230,11 @@ void amdgpu_vram_mgr_fini(struct amdgpu_device *adev)
                 drm_mm_remove_node(&rsv->mm_node);
                 kfree(rsv);
         }
+
+        list_for_each_entry_safe(rsv, temp, &mgr->backup_pages, node) {
+                drm_mm_remove_node(&rsv->mm_node);
+                kfree(rsv);
+        }
         drm_mm_takedown(&mgr->mm);
         spin_unlock(&mgr->lock);
@@ -297,12 +306,14 @@ static void amdgpu_vram_mgr_do_reserve(struct ttm_resource_manager *man)
                         continue;
 
                 dev_dbg(adev->dev, "Reservation 0x%llx - %lld, Succeeded\n",
-                        rsv->mm_node.start, rsv->mm_node.size);
+                        rsv->mm_node.start << PAGE_SHIFT, rsv->mm_node.size);
 
                 vis_usage = amdgpu_vram_mgr_vis_size(adev, &rsv->mm_node);
                 atomic64_add(vis_usage, &mgr->vis_usage);
                 atomic64_add(rsv->mm_node.size << PAGE_SHIFT, &mgr->usage);
                 list_move(&rsv->node, &mgr->reserved_pages);
+
+                amdgpu_vram_mgr_free_backup_pages(mgr, rsv->mm_node.size);
         }
 }
@@ -319,6 +330,7 @@ int amdgpu_vram_mgr_reserve_range(struct ttm_resource_manager *man,
                                   uint64_t start, uint64_t size)
 {
         struct amdgpu_vram_mgr *mgr = to_vram_mgr(man);
+        struct amdgpu_device *adev = to_amdgpu_device(mgr);
         struct amdgpu_vram_reservation *rsv;
 
         rsv = kzalloc(sizeof(*rsv), GFP_KERNEL);
@@ -329,14 +341,94 @@ int amdgpu_vram_mgr_reserve_range(struct ttm_resource_manager *man,
         rsv->mm_node.start = start >> PAGE_SHIFT;
         rsv->mm_node.size = size >> PAGE_SHIFT;
 
+        dev_dbg(adev->dev, "Pending Reservation: 0x%llx\n", start);
+
         spin_lock(&mgr->lock);
-        list_add_tail(&mgr->reservations_pending, &rsv->node);
+        list_add_tail(&rsv->node, &mgr->reservations_pending);
         amdgpu_vram_mgr_do_reserve(man);
         spin_unlock(&mgr->lock);
 
         return 0;
 }
 
+static int amdgpu_vram_mgr_free_backup_pages(struct amdgpu_vram_mgr *mgr,
+                                             uint32_t num_pages)
+{
+        struct amdgpu_device *adev = to_amdgpu_device(mgr);
+        struct amdgpu_vram_reservation *rsv;
+        uint32_t i;
+        uint64_t vis_usage = 0, total_usage = 0;
+
+        if (num_pages > mgr->num_backup_pages) {
+                dev_warn(adev->dev, "No enough backup pages\n");
+                return -EINVAL;
+        }
+
+        for (i = 0; i < num_pages; i++) {
+                rsv = list_first_entry(&mgr->backup_pages,
+                                       struct amdgpu_vram_reservation, node);
+                vis_usage += amdgpu_vram_mgr_vis_size(adev, &rsv->mm_node);
+                total_usage += (rsv->mm_node.size << PAGE_SHIFT);
+                drm_mm_remove_node(&rsv->mm_node);
+                list_del(&rsv->node);
+                kfree(rsv);
+                mgr->num_backup_pages--;
+        }
+        atomic64_sub(total_usage, &mgr->usage);
+        atomic64_sub(vis_usage, &mgr->vis_usage);
+
+        return 0;
+}
+
+int amdgpu_vram_mgr_reserve_backup_pages(struct ttm_resource_manager *man,
+                                         uint32_t num_pages)
+{
+        struct amdgpu_vram_mgr *mgr = to_vram_mgr(man);
+        struct amdgpu_device *adev = to_amdgpu_device(mgr);
+        struct amdgpu_vram_reservation *rsv;
+        struct drm_mm *mm = &mgr->mm;
+        uint32_t i;
+        int ret = 0;
+        uint64_t vis_usage, total_usage;
+
+        for (i = 0; i < num_pages; i++) {
+                rsv = kzalloc(sizeof(*rsv), GFP_KERNEL);
+                if (!rsv) {
+                        ret = -ENOMEM;
+                        goto pro_end;
+                }
+
+                INIT_LIST_HEAD(&rsv->node);
+
+                ret = drm_mm_insert_node(mm, &rsv->mm_node, 1);
+                if (ret) {
+                        dev_err(adev->dev, "failed to reserve backup page %d, ret 0x%x\n", i, ret);
+                        kfree(rsv);
+                        goto pro_end;
+                }
+
+                vis_usage = amdgpu_vram_mgr_vis_size(adev, &rsv->mm_node);
+                total_usage = (rsv->mm_node.size << PAGE_SHIFT);
+
+                spin_lock(&mgr->lock);
+                atomic64_add(vis_usage, &mgr->vis_usage);
+                atomic64_add(total_usage, &mgr->usage);
+                list_add_tail(&rsv->node, &mgr->backup_pages);
+                mgr->num_backup_pages++;
+                spin_unlock(&mgr->lock);
+        }
+
+pro_end:
+        if (ret) {
+                spin_lock(&mgr->lock);
+                amdgpu_vram_mgr_free_backup_pages(mgr, mgr->num_backup_pages);
+                spin_unlock(&mgr->lock);
+        }
+
+        return ret;
+}
+
 /**
  * amdgpu_vram_mgr_query_page_status - query the reservation status
  *
...