Commit e5c04edf authored by Christian König's avatar Christian König Committed by Alex Deucher

drm/amdgpu: revert "reserve backup pages for bad page retirment"

As noted during the review this approach doesn't make sense at all.

We should not apply any limitation on the VRAM applications can use inside the kernel.

If an application or end user wants to reserve a certain amount of VRAM for bad pages handling we should do this in the upper layer.

This reverts commit f89b881c.
Signed-off-by: default avatarChristian König <christian.koenig@amd.com>
Reviewed-by: default avatarHawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: default avatarAlex Deucher <alexander.deucher@amd.com>
parent 6b44b667
......@@ -180,7 +180,7 @@ struct amdgpu_mgpu_info mgpu_info = {
};
int amdgpu_ras_enable = -1;
uint amdgpu_ras_mask = 0xffffffff;
int amdgpu_bad_page_threshold = 100;
int amdgpu_bad_page_threshold = -1;
struct amdgpu_watchdog_timer amdgpu_watchdog_timer = {
.timeout_fatal_disable = false,
.period = 0x23, /* default to max. timeout = 1 << 0x23 cycles */
......@@ -854,7 +854,7 @@ module_param_named(reset_method, amdgpu_reset_method, int, 0444);
* faulty pages by ECC exceed threshold value and leave it for user's further
* check.
*/
MODULE_PARM_DESC(bad_page_threshold, "Bad page threshold(-1 = auto, 0 = disable bad page retirement, 100 = default value");
MODULE_PARM_DESC(bad_page_threshold, "Bad page threshold(-1 = auto(default value), 0 = disable bad page retirement)");
module_param_named(bad_page_threshold, amdgpu_bad_page_threshold, int, 0444);
MODULE_PARM_DESC(num_kcq, "number of kernel compute queue user want to setup (8 if set to greater than 8 or less than 0, only affect gfx 8+)");
......
......@@ -1790,14 +1790,13 @@ static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
return ret;
}
static uint32_t
amdgpu_ras_calculate_badpags_threshold(struct amdgpu_device *adev)
static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev,
uint32_t max_length)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
int tmp_threshold = amdgpu_bad_page_threshold;
u64 val;
uint32_t max_length = 0;
max_length = amdgpu_ras_eeprom_get_record_max_length();
/*
* Justification of value bad_page_cnt_threshold in ras structure
*
......@@ -1823,18 +1822,20 @@ amdgpu_ras_calculate_badpags_threshold(struct amdgpu_device *adev)
tmp_threshold = max_length;
if (tmp_threshold == -1) {
val = adev->gmc.real_vram_size;
val = adev->gmc.mc_vram_size;
do_div(val, RAS_BAD_PAGE_RATE);
tmp_threshold = min(lower_32_bits(val), max_length);
con->bad_page_cnt_threshold = min(lower_32_bits(val),
max_length);
} else {
con->bad_page_cnt_threshold = tmp_threshold;
}
return tmp_threshold;
}
int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct ras_err_handler_data **data;
uint32_t max_eeprom_records_len = 0;
bool exc_err_limit = false;
int ret;
......@@ -1854,16 +1855,8 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
atomic_set(&con->in_recovery, 0);
con->adev = adev;
if (!con->bad_page_cnt_threshold) {
con->bad_page_cnt_threshold =
amdgpu_ras_calculate_badpags_threshold(adev);
ret = amdgpu_vram_mgr_reserve_backup_pages(
ttm_manager_type(&adev->mman.bdev, TTM_PL_VRAM),
con->bad_page_cnt_threshold);
if (ret)
goto out;
}
max_eeprom_records_len = amdgpu_ras_eeprom_get_record_max_length();
amdgpu_ras_validate_threshold(adev, max_eeprom_records_len);
/* Todo: During test the SMU might fail to read the eeprom through I2C
* when the GPU is pending on XGMI reset during probe time
......
......@@ -48,8 +48,6 @@ struct amdgpu_vram_mgr {
spinlock_t lock;
struct list_head reservations_pending;
struct list_head reserved_pages;
struct list_head backup_pages;
uint32_t num_backup_pages;
atomic64_t usage;
atomic64_t vis_usage;
};
......@@ -124,8 +122,6 @@ uint64_t amdgpu_vram_mgr_usage(struct ttm_resource_manager *man);
uint64_t amdgpu_vram_mgr_vis_usage(struct ttm_resource_manager *man);
int amdgpu_vram_mgr_reserve_range(struct ttm_resource_manager *man,
uint64_t start, uint64_t size);
int amdgpu_vram_mgr_reserve_backup_pages(struct ttm_resource_manager *man,
uint32_t num_pages);
int amdgpu_vram_mgr_query_page_status(struct ttm_resource_manager *man,
uint64_t start);
......
......@@ -28,9 +28,6 @@
#include "amdgpu_atomfirmware.h"
#include "atom.h"
static int amdgpu_vram_mgr_free_backup_pages(struct amdgpu_vram_mgr *mgr,
uint32_t num_pages);
static inline struct amdgpu_vram_mgr *to_vram_mgr(struct ttm_resource_manager *man)
{
return container_of(man, struct amdgpu_vram_mgr, manager);
......@@ -189,7 +186,6 @@ int amdgpu_vram_mgr_init(struct amdgpu_device *adev)
spin_lock_init(&mgr->lock);
INIT_LIST_HEAD(&mgr->reservations_pending);
INIT_LIST_HEAD(&mgr->reserved_pages);
INIT_LIST_HEAD(&mgr->backup_pages);
/* Add the two VRAM-related sysfs files */
ret = sysfs_create_files(&adev->dev->kobj, amdgpu_vram_mgr_attributes);
......@@ -230,11 +226,6 @@ void amdgpu_vram_mgr_fini(struct amdgpu_device *adev)
drm_mm_remove_node(&rsv->mm_node);
kfree(rsv);
}
list_for_each_entry_safe(rsv, temp, &mgr->backup_pages, node) {
drm_mm_remove_node(&rsv->mm_node);
kfree(rsv);
}
drm_mm_takedown(&mgr->mm);
spin_unlock(&mgr->lock);
......@@ -306,14 +297,12 @@ static void amdgpu_vram_mgr_do_reserve(struct ttm_resource_manager *man)
continue;
dev_dbg(adev->dev, "Reservation 0x%llx - %lld, Succeeded\n",
rsv->mm_node.start << PAGE_SHIFT, rsv->mm_node.size);
rsv->mm_node.start, rsv->mm_node.size);
vis_usage = amdgpu_vram_mgr_vis_size(adev, &rsv->mm_node);
atomic64_add(vis_usage, &mgr->vis_usage);
atomic64_add(rsv->mm_node.size << PAGE_SHIFT, &mgr->usage);
list_move(&rsv->node, &mgr->reserved_pages);
amdgpu_vram_mgr_free_backup_pages(mgr, rsv->mm_node.size);
}
}
......@@ -330,7 +319,6 @@ int amdgpu_vram_mgr_reserve_range(struct ttm_resource_manager *man,
uint64_t start, uint64_t size)
{
struct amdgpu_vram_mgr *mgr = to_vram_mgr(man);
struct amdgpu_device *adev = to_amdgpu_device(mgr);
struct amdgpu_vram_reservation *rsv;
rsv = kzalloc(sizeof(*rsv), GFP_KERNEL);
......@@ -341,94 +329,14 @@ int amdgpu_vram_mgr_reserve_range(struct ttm_resource_manager *man,
rsv->mm_node.start = start >> PAGE_SHIFT;
rsv->mm_node.size = size >> PAGE_SHIFT;
dev_dbg(adev->dev, "Pending Reservation: 0x%llx\n", start);
spin_lock(&mgr->lock);
list_add_tail(&rsv->node, &mgr->reservations_pending);
list_add_tail(&mgr->reservations_pending, &rsv->node);
amdgpu_vram_mgr_do_reserve(man);
spin_unlock(&mgr->lock);
return 0;
}
static int amdgpu_vram_mgr_free_backup_pages(struct amdgpu_vram_mgr *mgr,
uint32_t num_pages)
{
struct amdgpu_device *adev = to_amdgpu_device(mgr);
struct amdgpu_vram_reservation *rsv;
uint32_t i;
uint64_t vis_usage = 0, total_usage = 0;
if (num_pages > mgr->num_backup_pages) {
dev_warn(adev->dev, "No enough backup pages\n");
return -EINVAL;
}
for (i = 0; i < num_pages; i++) {
rsv = list_first_entry(&mgr->backup_pages,
struct amdgpu_vram_reservation, node);
vis_usage += amdgpu_vram_mgr_vis_size(adev, &rsv->mm_node);
total_usage += (rsv->mm_node.size << PAGE_SHIFT);
drm_mm_remove_node(&rsv->mm_node);
list_del(&rsv->node);
kfree(rsv);
mgr->num_backup_pages--;
}
atomic64_sub(total_usage, &mgr->usage);
atomic64_sub(vis_usage, &mgr->vis_usage);
return 0;
}
int amdgpu_vram_mgr_reserve_backup_pages(struct ttm_resource_manager *man,
uint32_t num_pages)
{
struct amdgpu_vram_mgr *mgr = to_vram_mgr(man);
struct amdgpu_device *adev = to_amdgpu_device(mgr);
struct amdgpu_vram_reservation *rsv;
struct drm_mm *mm = &mgr->mm;
uint32_t i;
int ret = 0;
uint64_t vis_usage, total_usage;
for (i = 0; i < num_pages; i++) {
rsv = kzalloc(sizeof(*rsv), GFP_KERNEL);
if (!rsv) {
ret = -ENOMEM;
goto pro_end;
}
INIT_LIST_HEAD(&rsv->node);
ret = drm_mm_insert_node(mm, &rsv->mm_node, 1);
if (ret) {
dev_err(adev->dev, "failed to reserve backup page %d, ret 0x%x\n", i, ret);
kfree(rsv);
goto pro_end;
}
vis_usage = amdgpu_vram_mgr_vis_size(adev, &rsv->mm_node);
total_usage = (rsv->mm_node.size << PAGE_SHIFT);
spin_lock(&mgr->lock);
atomic64_add(vis_usage, &mgr->vis_usage);
atomic64_add(total_usage, &mgr->usage);
list_add_tail(&rsv->node, &mgr->backup_pages);
mgr->num_backup_pages++;
spin_unlock(&mgr->lock);
}
pro_end:
if (ret) {
spin_lock(&mgr->lock);
amdgpu_vram_mgr_free_backup_pages(mgr, mgr->num_backup_pages);
spin_unlock(&mgr->lock);
}
return ret;
}
/**
* amdgpu_vram_mgr_query_page_status - query the reservation status
*
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment