Commit b2057956 authored by Felix Kuehling, committed by Alex Deucher

drm/amdkfd: Add eviction debug messages

Use WARN to print messages with backtrace when evictions are triggered.
This can help determine the root cause of evictions and help spot driver
bugs triggering evictions unintentionally, or help with performance tuning
by avoiding conditions that cause evictions in a specific workload.

The messages are controlled by a new module parameter that can be changed
at runtime:

  echo Y > /sys/module/amdgpu/parameters/debug_evictions
  echo N > /sys/module/amdgpu/parameters/debug_evictions
Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
Reviewed-by: Philip Yang <Philip.Yang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent e3569fab
...@@ -186,8 +186,10 @@ extern int amdgpu_noretry; ...@@ -186,8 +186,10 @@ extern int amdgpu_noretry;
extern int amdgpu_force_asic_type; extern int amdgpu_force_asic_type;
#ifdef CONFIG_HSA_AMD #ifdef CONFIG_HSA_AMD
extern int sched_policy; extern int sched_policy;
extern bool debug_evictions;
#else #else
static const int sched_policy = KFD_SCHED_POLICY_HWS; static const int sched_policy = KFD_SCHED_POLICY_HWS;
static const bool debug_evictions; /* = false */
#endif #endif
extern int amdgpu_tmz; extern int amdgpu_tmz;
......
...@@ -705,6 +705,14 @@ MODULE_PARM_DESC(hws_gws_support, "Assume MEC2 FW supports GWS barriers (false = ...@@ -705,6 +705,14 @@ MODULE_PARM_DESC(hws_gws_support, "Assume MEC2 FW supports GWS barriers (false =
int queue_preemption_timeout_ms = 9000; int queue_preemption_timeout_ms = 9000;
module_param(queue_preemption_timeout_ms, int, 0644); module_param(queue_preemption_timeout_ms, int, 0644);
MODULE_PARM_DESC(queue_preemption_timeout_ms, "queue preemption timeout in ms (1 = Minimum, 9000 = default)"); MODULE_PARM_DESC(queue_preemption_timeout_ms, "queue preemption timeout in ms (1 = Minimum, 9000 = default)");
/**
* DOC: debug_evictions(bool)
* Enable extra debug messages to help determine the cause of evictions
*/
bool debug_evictions;
module_param(debug_evictions, bool, 0644);
MODULE_PARM_DESC(debug_evictions, "enable eviction debug messages (false = default)");
#endif #endif
/** /**
......
...@@ -275,6 +275,8 @@ int amdgpu_sync_resv(struct amdgpu_device *adev, struct amdgpu_sync *sync, ...@@ -275,6 +275,8 @@ int amdgpu_sync_resv(struct amdgpu_device *adev, struct amdgpu_sync *sync,
continue; continue;
} }
WARN(debug_evictions && fence_owner == AMDGPU_FENCE_OWNER_KFD,
"Adding eviction fence to sync obj");
r = amdgpu_sync_fence(sync, f, false); r = amdgpu_sync_fence(sync, f, false);
if (r) if (r)
break; break;
......
...@@ -935,6 +935,7 @@ int kgd2kfd_quiesce_mm(struct mm_struct *mm) ...@@ -935,6 +935,7 @@ int kgd2kfd_quiesce_mm(struct mm_struct *mm)
if (!p) if (!p)
return -ESRCH; return -ESRCH;
WARN(debug_evictions, "Evicting pid %d", p->lead_thread->pid);
r = kfd_process_evict_queues(p); r = kfd_process_evict_queues(p);
kfd_unref_process(p); kfd_unref_process(p);
...@@ -1002,6 +1003,8 @@ int kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm, ...@@ -1002,6 +1003,8 @@ int kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm,
/* During process initialization eviction_work.dwork is initialized /* During process initialization eviction_work.dwork is initialized
* to kfd_evict_bo_worker * to kfd_evict_bo_worker
*/ */
WARN(debug_evictions, "Scheduling eviction of pid %d in %ld jiffies",
p->lead_thread->pid, delay_jiffies);
schedule_delayed_work(&p->eviction_work, delay_jiffies); schedule_delayed_work(&p->eviction_work, delay_jiffies);
out: out:
kfd_unref_process(p); kfd_unref_process(p);
......
...@@ -177,6 +177,11 @@ extern bool hws_gws_support; ...@@ -177,6 +177,11 @@ extern bool hws_gws_support;
*/ */
extern int queue_preemption_timeout_ms; extern int queue_preemption_timeout_ms;
/*
* Enable eviction debug messages
*/
extern bool debug_evictions;
enum cache_policy { enum cache_policy {
cache_policy_coherent, cache_policy_coherent,
cache_policy_noncoherent cache_policy_noncoherent
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment