Commit dcebf026 authored by Andrey Grodzovsky, committed by Alex Deucher

drm/amdgpu: Add gpu_recovery parameter

Add a new parameter to control the GPU recovery procedure.

v2:
Add auto logic where reset is disabled for bare metal and enabled
for SR-IOV.
Allow forced reset from debugfs.
Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 3e98d829
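A minimal usage sketch for the new option, assuming amdgpu is built as a module; since the parameter is registered with permissions 0444 (see the diff below), it can only be set at load time, not toggled at runtime:

    modprobe amdgpu gpu_recovery=0      # explicitly disable recovery
    amdgpu.gpu_recovery=1               # or force-enable it on the kernel command line

With the default of -1 (auto), recovery stays disabled on bare metal and enabled for SR-IOV virtual functions.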
@@ -126,6 +126,7 @@ extern int amdgpu_param_buf_per_se;
 extern int amdgpu_job_hang_limit;
 extern int amdgpu_lbpw;
 extern int amdgpu_compute_multipipe;
+extern int amdgpu_gpu_recovery;
 #ifdef CONFIG_DRM_AMDGPU_SI
 extern int amdgpu_si_support;
@@ -1910,7 +1911,7 @@ amdgpu_get_sdma_instance(struct amdgpu_ring *ring)
 #define amdgpu_psp_check_fw_loading_status(adev, i) (adev)->firmware.funcs->check_fw_loading_status((adev), (i))

 /* Common functions */
-int amdgpu_gpu_recover(struct amdgpu_device *adev, struct amdgpu_job* job);
+int amdgpu_gpu_recover(struct amdgpu_device *adev, struct amdgpu_job* job, bool force);
 bool amdgpu_need_backup(struct amdgpu_device *adev);
 void amdgpu_pci_config_reset(struct amdgpu_device *adev);
 bool amdgpu_need_post(struct amdgpu_device *adev);
...
@@ -3009,11 +3009,12 @@ static int amdgpu_reset_sriov(struct amdgpu_device *adev, uint64_t *reset_flags,
  *
  * @adev: amdgpu device pointer
  * @job: which job trigger hang
+ * @force: forces reset regardless of amdgpu_gpu_recovery
  *
  * Attempt to reset the GPU if it has hung (all asics).
  * Returns 0 for success or an error on failure.
  */
-int amdgpu_gpu_recover(struct amdgpu_device *adev, struct amdgpu_job *job)
+int amdgpu_gpu_recover(struct amdgpu_device *adev, struct amdgpu_job *job, bool force)
 {
         struct drm_atomic_state *state = NULL;
         uint64_t reset_flags = 0;
@@ -3024,6 +3025,12 @@ int amdgpu_gpu_recover(struct amdgpu_device *adev, struct amdgpu_job *job)
                 return 0;
         }

+        if (!force && (amdgpu_gpu_recovery == 0 ||
+                        (amdgpu_gpu_recovery == -1 && !amdgpu_sriov_vf(adev)))) {
+                DRM_INFO("GPU recovery disabled.\n");
+                return 0;
+        }
+
         dev_info(adev->dev, "GPU reset begin!\n");

         mutex_lock(&adev->lock_reset);
...
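The gate above is a three-way policy switch with a force override. A standalone C sketch of the same decision, using a hypothetical helper name for illustration only:

/* Hypothetical helper mirroring the gate above; not part of the patch.
 * Returns true when a non-forced reset request should be dropped. */
static bool recovery_disabled(int gpu_recovery, bool is_sriov_vf, bool force)
{
        if (force)
                return false;   /* debugfs-forced reset bypasses the policy */
        if (gpu_recovery == 0)
                return true;    /* explicitly disabled */
        if (gpu_recovery == -1 && !is_sriov_vf)
                return true;    /* auto: disabled on bare metal */
        return false;           /* enabled (1), or auto under SR-IOV */
}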
@@ -128,6 +128,7 @@ int amdgpu_param_buf_per_se = 0;
 int amdgpu_job_hang_limit = 0;
 int amdgpu_lbpw = -1;
 int amdgpu_compute_multipipe = -1;
+int amdgpu_gpu_recovery = -1; /* auto */

 MODULE_PARM_DESC(vramlimit, "Restrict VRAM for testing, in megabytes");
 module_param_named(vramlimit, amdgpu_vram_limit, int, 0600);
@@ -280,6 +281,9 @@ module_param_named(lbpw, amdgpu_lbpw, int, 0444);
 MODULE_PARM_DESC(compute_multipipe, "Force compute queues to be spread across pipes (1 = enable, 0 = disable, -1 = auto)");
 module_param_named(compute_multipipe, amdgpu_compute_multipipe, int, 0444);

+MODULE_PARM_DESC(gpu_recovery, "Enable GPU recovery mechanism (1 = enable, 0 = disable, -1 = auto)");
+module_param_named(gpu_recovery, amdgpu_gpu_recovery, int, 0444);
+
 #ifdef CONFIG_DRM_AMDGPU_SI
 #if defined(CONFIG_DRM_RADEON) || defined(CONFIG_DRM_RADEON_MODULE)
...
@@ -705,7 +705,7 @@ static int amdgpu_debugfs_gpu_recover(struct seq_file *m, void *data)
         struct amdgpu_device *adev = dev->dev_private;

         seq_printf(m, "gpu recover\n");
-        amdgpu_gpu_recover(adev, NULL);
+        amdgpu_gpu_recover(adev, NULL, true);

         return 0;
 }
...
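This debugfs hook is the one caller that passes force = true. Assuming the entry is registered under the device's DRM debugfs directory as amdgpu_gpu_recover (the registration is not shown in this diff), reading it triggers a reset even when the module parameter disables automatic recovery:

    cat /sys/kernel/debug/dri/0/amdgpu_gpu_recover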
@@ -88,7 +88,7 @@ static void amdgpu_irq_reset_work_func(struct work_struct *work)
                                                   reset_work);

         if (!amdgpu_sriov_vf(adev))
-                amdgpu_gpu_recover(adev, NULL);
+                amdgpu_gpu_recover(adev, NULL, false);
 }

 /* Disable *all* interrupts */
...
@@ -37,7 +37,7 @@ static void amdgpu_job_timedout(struct drm_sched_job *s_job)
                   atomic_read(&job->ring->fence_drv.last_seq),
                   job->ring->fence_drv.sync_seq);

-        amdgpu_gpu_recover(job->adev, job);
+        amdgpu_gpu_recover(job->adev, job, false);
 }

 int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs,
...
@@ -253,7 +253,7 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
         }

         /* Trigger recovery due to world switch failure */
-        amdgpu_gpu_recover(adev, NULL);
+        amdgpu_gpu_recover(adev, NULL, false);
 }

 static int xgpu_ai_set_mailbox_rcv_irq(struct amdgpu_device *adev,
...
@@ -521,7 +521,7 @@ static void xgpu_vi_mailbox_flr_work(struct work_struct *work)
         }

         /* Trigger recovery due to world switch failure */
-        amdgpu_gpu_recover(adev, NULL);
+        amdgpu_gpu_recover(adev, NULL, false);
 }

 static int xgpu_vi_set_mailbox_rcv_irq(struct amdgpu_device *adev,
...