Commit 473af28d authored by Hawking Zhang's avatar Hawking Zhang Committed by Alex Deucher

drm/amdgpu: Estimate RAS reservation when report capacity v2

Add estimate of how much vram we need to reserve for RAS
when caculating the total available vram.

v2: apply the change to MP0 v13_0_2 and v13_0_14
Signed-off-by: default avatarHawking Zhang <Hawking.Zhang@amd.com>
Reviewed-by: default avatarTao Zhou <tao.zhou1@amd.com>
Signed-off-by: default avatarAlex Deucher <alexander.deucher@amd.com>
parent 76bec2a0
......@@ -172,6 +172,8 @@ int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev,
{
uint64_t reserved_for_pt =
ESTIMATE_PT_SIZE(amdgpu_amdkfd_total_mem_size);
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
uint64_t reserved_for_ras = (con ? con->reserved_pages_in_bytes : 0);
size_t system_mem_needed, ttm_mem_needed, vram_needed;
int ret = 0;
uint64_t vram_size = 0;
......@@ -220,7 +222,7 @@ int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev,
(kfd_mem_limit.ttm_mem_used + ttm_mem_needed >
kfd_mem_limit.max_ttm_mem_limit) ||
(adev && xcp_id >= 0 && adev->kfd.vram_used[xcp_id] + vram_needed >
vram_size - reserved_for_pt - atomic64_read(&adev->vram_pin_size))) {
vram_size - reserved_for_pt - reserved_for_ras - atomic64_read(&adev->vram_pin_size))) {
ret = -ENOMEM;
goto release;
}
......@@ -1673,6 +1675,8 @@ size_t amdgpu_amdkfd_get_available_memory(struct amdgpu_device *adev,
{
uint64_t reserved_for_pt =
ESTIMATE_PT_SIZE(amdgpu_amdkfd_total_mem_size);
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
uint64_t reserved_for_ras = (con ? con->reserved_pages_in_bytes : 0);
ssize_t available;
uint64_t vram_available, system_mem_available, ttm_mem_available;
......@@ -1680,7 +1684,8 @@ size_t amdgpu_amdkfd_get_available_memory(struct amdgpu_device *adev,
vram_available = KFD_XCP_MEMORY_SIZE(adev, xcp_id)
- adev->kfd.vram_used_aligned[xcp_id]
- atomic64_read(&adev->vram_pin_size)
- reserved_for_pt;
- reserved_for_pt
- reserved_for_ras;
if (adev->flags & AMD_IS_APU) {
system_mem_available = no_system_mem_limit ?
......
......@@ -3298,6 +3298,24 @@ static void amdgpu_ras_event_mgr_init(struct amdgpu_device *adev)
amdgpu_put_xgmi_hive(hive);
}
static void amdgpu_ras_init_reserved_vram_size(struct amdgpu_device *adev)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
if (!con || (adev->flags & AMD_IS_APU))
return;
switch (amdgpu_ip_version(adev, MP0_HWIP, 0)) {
case IP_VERSION(13, 0, 2):
case IP_VERSION(13, 0, 6):
case IP_VERSION(13, 0, 14):
con->reserved_pages_in_bytes = AMDGPU_RAS_RESERVED_VRAM_SIZE;
break;
default:
break;
}
}
int amdgpu_ras_init(struct amdgpu_device *adev)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
......@@ -3403,6 +3421,8 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
/* Get RAS schema for particular SOC */
con->schema = amdgpu_get_ras_schema(adev);
amdgpu_ras_init_reserved_vram_size(adev);
if (amdgpu_ras_fs_init(adev)) {
r = -EINVAL;
goto release_con;
......
......@@ -64,6 +64,9 @@ struct amdgpu_iv_entry;
#define AMDGPU_RAS_FEATURES_SOCKETID_SHIFT 29
#define AMDGPU_RAS_FEATURES_SOCKETID_MASK 0xe0000000
/* Reserve 8 physical dram row for possible retirement.
* In worst cases, it will lose 8 * 2MB memory in vram domain */
#define AMDGPU_RAS_RESERVED_VRAM_SIZE (16ULL << 20)
/* The high three bits indicates socketid */
#define AMDGPU_RAS_GET_FEATURES(val) ((val) & ~AMDGPU_RAS_FEATURES_SOCKETID_MASK)
......@@ -541,6 +544,7 @@ struct amdgpu_ras {
struct ras_event_manager __event_mgr;
struct ras_event_manager *event_mgr;
uint64_t reserved_pages_in_bytes;
};
struct ras_fs_data {
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment