Commit 142bc7d9 authored by Michel Thierry, committed by Chris Wilson

drm/i915: Modify error handler for per engine hang recovery

This is a preparatory patch which modifies error handler to do per engine
hang recovery. The actual patch which implements this sequence follows
later in the series. The aim is to prepare existing recovery function to
adapt to this new function where applicable (which fails at this point
because core implementation is lacking) and continue recovery using legacy
full gpu reset.

A helper function is also added to query the availability of engine
reset. A subsequent patch will add the capability to query which type
of reset is present (engine -> full -> no-reset) via the get-param
ioctl.

It has been decided that the error events that are used to notify user of
reset will only be sent in case of full chip reset. In case of just
single (or multiple) engine resets, userspace won't be notified by these
events.

Note that this implementation of engine reset is for i915 directly
submitting to the ELSP, where the driver manages the hang detection,
recovery and resubmission. With GuC submission these tasks are shared
between driver and firmware; i915 will still be responsible for detecting a
hang, and when it does it will have to request GuC to reset that Engine and
remind the firmware about the outstanding submissions. This will be
added in different patch.

v2: rebase, advertise engine reset availability in platform definition,
add note about GuC submission.
v3: s/*engine_reset*/*reset_engine*/. (Chris)
Handle reset as 2 level resets, by first going to engine only and falling
back to full/chip reset as needed, i.e. reset_engine will need the
struct_mutex.
v4: Pass the engine mask to i915_reset. (Chris)
v5: Rebase, update selftests.
v6: Rebase, prepare for mutex-less reset engine.
v7: Pass reset_engine mask as a function parameter, and iterate over the
engine mask for reset_engine. (Chris)
v8: Use i915.reset >=2 in has_reset_engine; remove redundant reset
logging; add a reset-engine-in-progress flag to prevent concurrent
resets, and avoid dual purposing of reset-backoff. (Chris)
v9: Support reset of different engines in parallel (Chris)
v10: Handle reset-engine flag locking better (Chris)
v11: Squash in reporting of per-engine-reset availability.

Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Signed-off-by: Ian Lister <ian.lister@intel.com>
Signed-off-by: Tomas Elf <tomas.elf@intel.com>
Signed-off-by: Arun Siluvery <arun.siluvery@linux.intel.com>
Signed-off-by: Michel Thierry <michel.thierry@intel.com>
Link: http://patchwork.freedesktop.org/patch/msgid/20170615201828.23144-4-michel.thierry@intel.com
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: http://patchwork.freedesktop.org/patch/msgid/20170620095751.13127-5-chris@chris-wilson.co.uk
parent ed35dd7b
...@@ -331,6 +331,8 @@ static int i915_getparam(struct drm_device *dev, void *data, ...@@ -331,6 +331,8 @@ static int i915_getparam(struct drm_device *dev, void *data,
break; break;
case I915_PARAM_HAS_GPU_RESET: case I915_PARAM_HAS_GPU_RESET:
value = i915.enable_hangcheck && intel_has_gpu_reset(dev_priv); value = i915.enable_hangcheck && intel_has_gpu_reset(dev_priv);
if (value && intel_has_reset_engine(dev_priv))
value = 2;
break; break;
case I915_PARAM_HAS_RESOURCE_STREAMER: case I915_PARAM_HAS_RESOURCE_STREAMER:
value = HAS_RESOURCE_STREAMER(dev_priv); value = HAS_RESOURCE_STREAMER(dev_priv);
...@@ -1915,6 +1917,19 @@ void i915_reset(struct drm_i915_private *dev_priv) ...@@ -1915,6 +1917,19 @@ void i915_reset(struct drm_i915_private *dev_priv)
goto finish; goto finish;
} }
/**
 * i915_reset_engine - attempt to recover a single hung GPU engine
 * @engine: the engine that should be reset
 *
 * Entry point for resetting one specific engine after a hang has been
 * detected. The engine reset sequence is not implemented here yet, so the
 * function is a placeholder that always reports failure.
 *
 * Returns: zero if the engine was reset successfully, otherwise a negative
 * error code. This placeholder unconditionally returns -ENODEV.
 */
int i915_reset_engine(struct intel_engine_cs *engine)
{
	/* FIXME: replace me with engine reset sequence */
	const int err = -ENODEV;

	return err;
}
static int i915_pm_suspend(struct device *kdev) static int i915_pm_suspend(struct device *kdev)
{ {
struct pci_dev *pdev = to_pci_dev(kdev); struct pci_dev *pdev = to_pci_dev(kdev);
......
...@@ -752,6 +752,7 @@ struct intel_csr { ...@@ -752,6 +752,7 @@ struct intel_csr {
func(has_csr); \ func(has_csr); \
func(has_ddi); \ func(has_ddi); \
func(has_dp_mst); \ func(has_dp_mst); \
func(has_reset_engine); \
func(has_fbc); \ func(has_fbc); \
func(has_fpga_dbg); \ func(has_fpga_dbg); \
func(has_full_ppgtt); \ func(has_full_ppgtt); \
...@@ -1549,6 +1550,12 @@ struct i915_gpu_error { ...@@ -1549,6 +1550,12 @@ struct i915_gpu_error {
* inspect the bit and do the reset directly, otherwise the worker * inspect the bit and do the reset directly, otherwise the worker
* waits for the struct_mutex. * waits for the struct_mutex.
* *
* #I915_RESET_ENGINE[num_engines] - Since the driver doesn't need to
* acquire the struct_mutex to reset an engine, we need an explicit
* flag to prevent two concurrent reset attempts in the same engine.
* As the number of engines continues to grow, allocate the flags from
* the most significant bits.
*
* #I915_WEDGED - If reset fails and we can no longer use the GPU, * #I915_WEDGED - If reset fails and we can no longer use the GPU,
* we set the #I915_WEDGED bit. Prior to command submission, e.g. * we set the #I915_WEDGED bit. Prior to command submission, e.g.
* i915_gem_request_alloc(), this bit is checked and the sequence * i915_gem_request_alloc(), this bit is checked and the sequence
...@@ -1558,6 +1565,7 @@ struct i915_gpu_error { ...@@ -1558,6 +1565,7 @@ struct i915_gpu_error {
#define I915_RESET_BACKOFF 0 #define I915_RESET_BACKOFF 0
#define I915_RESET_HANDOFF 1 #define I915_RESET_HANDOFF 1
#define I915_WEDGED (BITS_PER_LONG - 1) #define I915_WEDGED (BITS_PER_LONG - 1)
#define I915_RESET_ENGINE (I915_WEDGED - I915_NUM_ENGINES)
/** /**
* Waitqueue to signal when a hang is detected. Used to for waiters * Waitqueue to signal when a hang is detected. Used to for waiters
...@@ -3092,6 +3100,8 @@ extern void i915_driver_unload(struct drm_device *dev); ...@@ -3092,6 +3100,8 @@ extern void i915_driver_unload(struct drm_device *dev);
extern int intel_gpu_reset(struct drm_i915_private *dev_priv, u32 engine_mask); extern int intel_gpu_reset(struct drm_i915_private *dev_priv, u32 engine_mask);
extern bool intel_has_gpu_reset(struct drm_i915_private *dev_priv); extern bool intel_has_gpu_reset(struct drm_i915_private *dev_priv);
extern void i915_reset(struct drm_i915_private *dev_priv); extern void i915_reset(struct drm_i915_private *dev_priv);
extern int i915_reset_engine(struct intel_engine_cs *engine);
extern bool intel_has_reset_engine(struct drm_i915_private *dev_priv);
extern int intel_guc_reset(struct drm_i915_private *dev_priv); extern int intel_guc_reset(struct drm_i915_private *dev_priv);
extern void intel_engine_init_hangcheck(struct intel_engine_cs *engine); extern void intel_engine_init_hangcheck(struct intel_engine_cs *engine);
extern void intel_hangcheck_init(struct drm_i915_private *dev_priv); extern void intel_hangcheck_init(struct drm_i915_private *dev_priv);
......
...@@ -2715,6 +2715,8 @@ void i915_handle_error(struct drm_i915_private *dev_priv, ...@@ -2715,6 +2715,8 @@ void i915_handle_error(struct drm_i915_private *dev_priv,
u32 engine_mask, u32 engine_mask,
const char *fmt, ...) const char *fmt, ...)
{ {
struct intel_engine_cs *engine;
unsigned int tmp;
va_list args; va_list args;
char error_msg[80]; char error_msg[80];
...@@ -2734,9 +2736,31 @@ void i915_handle_error(struct drm_i915_private *dev_priv, ...@@ -2734,9 +2736,31 @@ void i915_handle_error(struct drm_i915_private *dev_priv,
i915_capture_error_state(dev_priv, engine_mask, error_msg); i915_capture_error_state(dev_priv, engine_mask, error_msg);
i915_clear_error_registers(dev_priv); i915_clear_error_registers(dev_priv);
/*
* Try engine reset when available. We fall back to full reset if
* single reset fails.
*/
if (intel_has_reset_engine(dev_priv)) {
for_each_engine_masked(engine, dev_priv, engine_mask, tmp) {
BUILD_BUG_ON(I915_RESET_HANDOFF >= I915_RESET_ENGINE);
if (test_and_set_bit(I915_RESET_ENGINE + engine->id,
&dev_priv->gpu_error.flags))
continue;
if (i915_reset_engine(engine) == 0)
engine_mask &= ~intel_engine_flag(engine);
clear_bit(I915_RESET_ENGINE + engine->id,
&dev_priv->gpu_error.flags);
wake_up_bit(&dev_priv->gpu_error.flags,
I915_RESET_ENGINE + engine->id);
}
}
if (!engine_mask) if (!engine_mask)
goto out; goto out;
/* Full reset needs the mutex, stop any other user trying to do so. */
if (test_and_set_bit(I915_RESET_BACKOFF, &dev_priv->gpu_error.flags)) { if (test_and_set_bit(I915_RESET_BACKOFF, &dev_priv->gpu_error.flags)) {
wait_event(dev_priv->gpu_error.reset_queue, wait_event(dev_priv->gpu_error.reset_queue,
!test_bit(I915_RESET_BACKOFF, !test_bit(I915_RESET_BACKOFF,
...@@ -2744,8 +2768,22 @@ void i915_handle_error(struct drm_i915_private *dev_priv, ...@@ -2744,8 +2768,22 @@ void i915_handle_error(struct drm_i915_private *dev_priv,
goto out; goto out;
} }
/* Prevent any other reset-engine attempt. */
for_each_engine(engine, dev_priv, tmp) {
while (test_and_set_bit(I915_RESET_ENGINE + engine->id,
&dev_priv->gpu_error.flags))
wait_on_bit(&dev_priv->gpu_error.flags,
I915_RESET_ENGINE + engine->id,
TASK_UNINTERRUPTIBLE);
}
i915_reset_device(dev_priv); i915_reset_device(dev_priv);
for_each_engine(engine, dev_priv, tmp) {
clear_bit(I915_RESET_ENGINE + engine->id,
&dev_priv->gpu_error.flags);
}
clear_bit(I915_RESET_BACKOFF, &dev_priv->gpu_error.flags); clear_bit(I915_RESET_BACKOFF, &dev_priv->gpu_error.flags);
wake_up_all(&dev_priv->gpu_error.reset_queue); wake_up_all(&dev_priv->gpu_error.reset_queue);
......
...@@ -310,7 +310,8 @@ static const struct intel_device_info intel_haswell_info = { ...@@ -310,7 +310,8 @@ static const struct intel_device_info intel_haswell_info = {
BDW_COLORS, \ BDW_COLORS, \
.has_logical_ring_contexts = 1, \ .has_logical_ring_contexts = 1, \
.has_full_48bit_ppgtt = 1, \ .has_full_48bit_ppgtt = 1, \
.has_64bit_reloc = 1 .has_64bit_reloc = 1, \
.has_reset_engine = 1
#define BDW_PLATFORM \ #define BDW_PLATFORM \
BDW_FEATURES, \ BDW_FEATURES, \
...@@ -342,6 +343,7 @@ static const struct intel_device_info intel_cherryview_info = { ...@@ -342,6 +343,7 @@ static const struct intel_device_info intel_cherryview_info = {
.has_gmch_display = 1, .has_gmch_display = 1,
.has_aliasing_ppgtt = 1, .has_aliasing_ppgtt = 1,
.has_full_ppgtt = 1, .has_full_ppgtt = 1,
.has_reset_engine = 1,
.display_mmio_offset = VLV_DISPLAY_BASE, .display_mmio_offset = VLV_DISPLAY_BASE,
GEN_CHV_PIPEOFFSETS, GEN_CHV_PIPEOFFSETS,
CURSOR_OFFSETS, CURSOR_OFFSETS,
...@@ -387,6 +389,7 @@ static const struct intel_device_info intel_skylake_gt3_info = { ...@@ -387,6 +389,7 @@ static const struct intel_device_info intel_skylake_gt3_info = {
.has_aliasing_ppgtt = 1, \ .has_aliasing_ppgtt = 1, \
.has_full_ppgtt = 1, \ .has_full_ppgtt = 1, \
.has_full_48bit_ppgtt = 1, \ .has_full_48bit_ppgtt = 1, \
.has_reset_engine = 1, \
GEN_DEFAULT_PIPEOFFSETS, \ GEN_DEFAULT_PIPEOFFSETS, \
IVB_CURSOR_OFFSETS, \ IVB_CURSOR_OFFSETS, \
BDW_COLORS BDW_COLORS
......
...@@ -1719,6 +1719,17 @@ bool intel_has_gpu_reset(struct drm_i915_private *dev_priv) ...@@ -1719,6 +1719,17 @@ bool intel_has_gpu_reset(struct drm_i915_private *dev_priv)
return intel_get_gpu_reset(dev_priv) != NULL; return intel_get_gpu_reset(dev_priv) != NULL;
} }
/*
 * Report whether per-engine reset may be used on this device.
 *
 * All three conditions must hold, checked in this order:
 *  - the device info advertises has_reset_engine;
 *  - no GuC execbuf client exists (when GuC submission is enabled, GuC
 *    manages ELSP and can initiate the engine reset too, so for now we
 *    fall back to full GPU reset in that case);
 *  - the i915.reset parameter is 2 or greater.
 */
bool intel_has_reset_engine(struct drm_i915_private *dev_priv)
{
	if (!dev_priv->info.has_reset_engine)
		return false;

	if (dev_priv->guc.execbuf_client)
		return false;

	return i915.reset >= 2;
}
int intel_guc_reset(struct drm_i915_private *dev_priv) int intel_guc_reset(struct drm_i915_private *dev_priv)
{ {
int ret; int ret;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment