Commit 84102171 authored by Mika Kuoppala's avatar Mika Kuoppala Committed by Mika Kuoppala

drm/i915: Add bannable context parameter

Now when driver has per context scoring of 'hanging badness'
and also subsequent hangs during short windows are allowed,
if there is progress made in between, it does not make sense
to expose a ban timing window as a context parameter anymore.

Let the scoring be the sole indicator for ban policy and substitute
ban period context parameter as a boolean to get/set context
bannable property.

v2: allow non root to opt into being banned (Chris)

Cc: Chris Wilson <chris@chris-wilson.co.uk>
Suggested-by: default avatarChris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: default avatarChris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: default avatarMika Kuoppala <mika.kuoppala@intel.com>
parent e5e1fc47
...@@ -850,6 +850,7 @@ struct drm_i915_error_state { ...@@ -850,6 +850,7 @@ struct drm_i915_error_state {
long jiffies; long jiffies;
pid_t pid; pid_t pid;
u32 context; u32 context;
int ban_score;
u32 seqno; u32 seqno;
u32 head; u32 head;
u32 tail; u32 tail;
...@@ -909,16 +910,10 @@ struct i915_ctx_hang_stats { ...@@ -909,16 +910,10 @@ struct i915_ctx_hang_stats {
/* This context had batch active when hang was declared */ /* This context had batch active when hang was declared */
unsigned batch_active; unsigned batch_active;
/* Time when this context was last blamed for a GPU reset */ bool bannable:1;
unsigned long guilty_ts;
/* If the contexts causes a second GPU hang within this time,
* it is permanently banned from submitting any more work.
*/
unsigned long ban_period_seconds;
/* This context is banned to submit more work */ /* This context is banned to submit more work */
bool banned; bool banned:1;
#define CONTEXT_SCORE_GUILTY 10 #define CONTEXT_SCORE_GUILTY 10
#define CONTEXT_SCORE_BAN_THRESHOLD 40 #define CONTEXT_SCORE_BAN_THRESHOLD 40
...@@ -1459,8 +1454,6 @@ struct i915_gpu_error { ...@@ -1459,8 +1454,6 @@ struct i915_gpu_error {
/* For hangcheck timer */ /* For hangcheck timer */
#define DRM_I915_HANGCHECK_PERIOD 1500 /* in ms */ #define DRM_I915_HANGCHECK_PERIOD 1500 /* in ms */
#define DRM_I915_HANGCHECK_JIFFIES msecs_to_jiffies(DRM_I915_HANGCHECK_PERIOD) #define DRM_I915_HANGCHECK_JIFFIES msecs_to_jiffies(DRM_I915_HANGCHECK_PERIOD)
/* Hang gpu twice in this window and your context gets banned */
#define DRM_I915_CTX_BAN_PERIOD DIV_ROUND_UP(8*DRM_I915_HANGCHECK_PERIOD, 1000)
struct delayed_work hangcheck_work; struct delayed_work hangcheck_work;
......
...@@ -2623,20 +2623,13 @@ void *i915_gem_object_pin_map(struct drm_i915_gem_object *obj, ...@@ -2623,20 +2623,13 @@ void *i915_gem_object_pin_map(struct drm_i915_gem_object *obj,
static bool i915_context_is_banned(const struct i915_gem_context *ctx) static bool i915_context_is_banned(const struct i915_gem_context *ctx)
{ {
const struct i915_ctx_hang_stats *hs = &ctx->hang_stats; const struct i915_ctx_hang_stats *hs = &ctx->hang_stats;
unsigned long elapsed;
if (hs->banned) if (hs->banned)
return true; return true;
if (!hs->ban_period_seconds) if (!hs->bannable)
return false; return false;
elapsed = get_seconds() - hs->guilty_ts;
if (elapsed <= hs->ban_period_seconds) {
DRM_DEBUG("context hanging too fast, banning!\n");
return true;
}
if (hs->ban_score >= CONTEXT_SCORE_BAN_THRESHOLD) { if (hs->ban_score >= CONTEXT_SCORE_BAN_THRESHOLD) {
DRM_DEBUG("context hanging too often, banning!\n"); DRM_DEBUG("context hanging too often, banning!\n");
return true; return true;
...@@ -2653,7 +2646,6 @@ static void i915_gem_context_mark_guilty(struct i915_gem_context *ctx) ...@@ -2653,7 +2646,6 @@ static void i915_gem_context_mark_guilty(struct i915_gem_context *ctx)
hs->banned = i915_context_is_banned(ctx); hs->banned = i915_context_is_banned(ctx);
hs->batch_active++; hs->batch_active++;
hs->guilty_ts = get_seconds();
} }
static void i915_gem_context_mark_innocent(struct i915_gem_context *ctx) static void i915_gem_context_mark_innocent(struct i915_gem_context *ctx)
......
...@@ -331,7 +331,7 @@ __create_hw_context(struct drm_device *dev, ...@@ -331,7 +331,7 @@ __create_hw_context(struct drm_device *dev,
* is no remap info, it will be a NOP. */ * is no remap info, it will be a NOP. */
ctx->remap_slice = ALL_L3_SLICES(dev_priv); ctx->remap_slice = ALL_L3_SLICES(dev_priv);
ctx->hang_stats.ban_period_seconds = DRM_I915_CTX_BAN_PERIOD; ctx->hang_stats.bannable = true;
ctx->ring_size = 4 * PAGE_SIZE; ctx->ring_size = 4 * PAGE_SIZE;
ctx->desc_template = GEN8_CTX_ADDRESSING_MODE(dev_priv) << ctx->desc_template = GEN8_CTX_ADDRESSING_MODE(dev_priv) <<
GEN8_CTX_ADDRESSING_MODE_SHIFT; GEN8_CTX_ADDRESSING_MODE_SHIFT;
...@@ -1085,7 +1085,7 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data, ...@@ -1085,7 +1085,7 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
args->size = 0; args->size = 0;
switch (args->param) { switch (args->param) {
case I915_CONTEXT_PARAM_BAN_PERIOD: case I915_CONTEXT_PARAM_BAN_PERIOD:
args->value = ctx->hang_stats.ban_period_seconds; ret = -EINVAL;
break; break;
case I915_CONTEXT_PARAM_NO_ZEROMAP: case I915_CONTEXT_PARAM_NO_ZEROMAP:
args->value = ctx->flags & CONTEXT_NO_ZEROMAP; args->value = ctx->flags & CONTEXT_NO_ZEROMAP;
...@@ -1101,6 +1101,9 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data, ...@@ -1101,6 +1101,9 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
case I915_CONTEXT_PARAM_NO_ERROR_CAPTURE: case I915_CONTEXT_PARAM_NO_ERROR_CAPTURE:
args->value = !!(ctx->flags & CONTEXT_NO_ERROR_CAPTURE); args->value = !!(ctx->flags & CONTEXT_NO_ERROR_CAPTURE);
break; break;
case I915_CONTEXT_PARAM_BANNABLE:
args->value = ctx->hang_stats.bannable;
break;
default: default:
ret = -EINVAL; ret = -EINVAL;
break; break;
...@@ -1130,13 +1133,7 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data, ...@@ -1130,13 +1133,7 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data,
switch (args->param) { switch (args->param) {
case I915_CONTEXT_PARAM_BAN_PERIOD: case I915_CONTEXT_PARAM_BAN_PERIOD:
if (args->size) ret = -EINVAL;
ret = -EINVAL;
else if (args->value < ctx->hang_stats.ban_period_seconds &&
!capable(CAP_SYS_ADMIN))
ret = -EPERM;
else
ctx->hang_stats.ban_period_seconds = args->value;
break; break;
case I915_CONTEXT_PARAM_NO_ZEROMAP: case I915_CONTEXT_PARAM_NO_ZEROMAP:
if (args->size) { if (args->size) {
...@@ -1156,6 +1153,14 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data, ...@@ -1156,6 +1153,14 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data,
ctx->flags &= ~CONTEXT_NO_ERROR_CAPTURE; ctx->flags &= ~CONTEXT_NO_ERROR_CAPTURE;
} }
break; break;
case I915_CONTEXT_PARAM_BANNABLE:
if (args->size)
ret = -EINVAL;
else if (!capable(CAP_SYS_ADMIN) && !args->value)
ret = -EPERM;
else
ctx->hang_stats.bannable = args->value;
break;
default: default:
ret = -EINVAL; ret = -EINVAL;
break; break;
......
...@@ -352,8 +352,8 @@ static void error_print_request(struct drm_i915_error_state_buf *m, ...@@ -352,8 +352,8 @@ static void error_print_request(struct drm_i915_error_state_buf *m,
if (!erq->seqno) if (!erq->seqno)
return; return;
err_printf(m, "%s pid %d, seqno %8x:%08x, emitted %dms ago, head %08x, tail %08x\n", err_printf(m, "%s pid %d, ban score %d, seqno %8x:%08x, emitted %dms ago, head %08x, tail %08x\n",
prefix, erq->pid, prefix, erq->pid, erq->ban_score,
erq->context, erq->seqno, erq->context, erq->seqno,
jiffies_to_msecs(jiffies - erq->jiffies), jiffies_to_msecs(jiffies - erq->jiffies),
erq->head, erq->tail); erq->head, erq->tail);
...@@ -1170,6 +1170,7 @@ static void record_request(struct drm_i915_gem_request *request, ...@@ -1170,6 +1170,7 @@ static void record_request(struct drm_i915_gem_request *request,
struct drm_i915_error_request *erq) struct drm_i915_error_request *erq)
{ {
erq->context = request->ctx->hw_id; erq->context = request->ctx->hw_id;
erq->ban_score = request->ctx->hang_stats.ban_score;
erq->seqno = request->global_seqno; erq->seqno = request->global_seqno;
erq->jiffies = request->emitted_jiffies; erq->jiffies = request->emitted_jiffies;
erq->head = request->head; erq->head = request->head;
......
...@@ -1224,6 +1224,7 @@ struct drm_i915_gem_context_param { ...@@ -1224,6 +1224,7 @@ struct drm_i915_gem_context_param {
#define I915_CONTEXT_PARAM_NO_ZEROMAP 0x2 #define I915_CONTEXT_PARAM_NO_ZEROMAP 0x2
#define I915_CONTEXT_PARAM_GTT_SIZE 0x3 #define I915_CONTEXT_PARAM_GTT_SIZE 0x3
#define I915_CONTEXT_PARAM_NO_ERROR_CAPTURE 0x4 #define I915_CONTEXT_PARAM_NO_ERROR_CAPTURE 0x4
#define I915_CONTEXT_PARAM_BANNABLE 0x5
__u64 value; __u64 value;
}; };
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment