Commit a840d9e6 authored by Chris Wilson's avatar Chris Wilson Committed by Jiri Slaby

drm/i915: Evict CS TLBs between batches

commit c4d69da1 upstream.

Running igt, I was encountering the invalid TLB bug on my 845g, despite
that it was using the CS workaround. Examining the w/a buffer in the
error state, showed that the copy from the user batch into the
workaround itself was suffering from the invalid TLB bug (the first
cacheline was broken with the first two words reversed). Time to try a
fresh approach. This extends the workaround to write into each page of
our scratch buffer in order to overflow the TLB and evict the invalid
entries. This could be refined to only do so after we update the GTT,
but for simplicity, we do it before each batch.

I suspect this supersedes our current workaround, but for safety keep
doing both.

V2: The magic number shall be 2.

This doesn't conclusively prove that it is the mythical TLB bug we've
been trying to workaround for so long, that it requires touching a number
of pages to prevent the corruption indicates to me that it is TLB
related, but the corruption (the reversed cacheline) is more subtle than
a TLB bug, where we would expect it to read the wrong page entirely.

Oh well, it prevents a reliable hang for me and so probably for others
as well.
Signed-off-by: default avatarChris Wilson <chris@chris-wilson.co.uk>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: Ville Syrjälä <ville.syrjala@linux.intel.com>
Cc: stable@vger.kernel.org
Reviewed-by: default avatarDaniel Vetter <daniel.vetter@ffwll.ch>
Signed-off-by: default avatarJani Nikula <jani.nikula@intel.com>
Signed-off-by: default avatarTakashi Iwai <tiwai@suse.de>
Signed-off-by: default avatarJiri Slaby <jslaby@suse.cz>
parent 0bf81403
...@@ -290,16 +290,20 @@ ...@@ -290,16 +290,20 @@
#define GFX_OP_DESTBUFFER_INFO ((0x3<<29)|(0x1d<<24)|(0x8e<<16)|1) #define GFX_OP_DESTBUFFER_INFO ((0x3<<29)|(0x1d<<24)|(0x8e<<16)|1)
#define GFX_OP_DRAWRECT_INFO ((0x3<<29)|(0x1d<<24)|(0x80<<16)|(0x3)) #define GFX_OP_DRAWRECT_INFO ((0x3<<29)|(0x1d<<24)|(0x80<<16)|(0x3))
#define GFX_OP_DRAWRECT_INFO_I965 ((0x7900<<16)|0x2) #define GFX_OP_DRAWRECT_INFO_I965 ((0x7900<<16)|0x2)
#define COLOR_BLT_CMD (2<<29 | 0x40<<22 | (5-2))
#define SRC_COPY_BLT_CMD ((2<<29)|(0x43<<22)|4) #define SRC_COPY_BLT_CMD ((2<<29)|(0x43<<22)|4)
#define XY_SRC_COPY_BLT_CMD ((2<<29)|(0x53<<22)|6) #define XY_SRC_COPY_BLT_CMD ((2<<29)|(0x53<<22)|6)
#define XY_MONO_SRC_COPY_IMM_BLT ((2<<29)|(0x71<<22)|5) #define XY_MONO_SRC_COPY_IMM_BLT ((2<<29)|(0x71<<22)|5)
#define XY_SRC_COPY_BLT_WRITE_ALPHA (1<<21) #define BLT_WRITE_A (2<<20)
#define XY_SRC_COPY_BLT_WRITE_RGB (1<<20) #define BLT_WRITE_RGB (1<<20)
#define BLT_WRITE_RGBA (BLT_WRITE_RGB | BLT_WRITE_A)
#define BLT_DEPTH_8 (0<<24) #define BLT_DEPTH_8 (0<<24)
#define BLT_DEPTH_16_565 (1<<24) #define BLT_DEPTH_16_565 (1<<24)
#define BLT_DEPTH_16_1555 (2<<24) #define BLT_DEPTH_16_1555 (2<<24)
#define BLT_DEPTH_32 (3<<24) #define BLT_DEPTH_32 (3<<24)
#define BLT_ROP_GXCOPY (0xcc<<16) #define BLT_ROP_SRC_COPY (0xcc<<16)
#define BLT_ROP_COLOR_COPY (0xf0<<16)
#define XY_SRC_COPY_BLT_SRC_TILED (1<<15) /* 965+ only */ #define XY_SRC_COPY_BLT_SRC_TILED (1<<15) /* 965+ only */
#define XY_SRC_COPY_BLT_DST_TILED (1<<11) /* 965+ only */ #define XY_SRC_COPY_BLT_DST_TILED (1<<11) /* 965+ only */
#define CMD_OP_DISPLAYBUFFER_INFO ((0x0<<29)|(0x14<<23)|2) #define CMD_OP_DISPLAYBUFFER_INFO ((0x0<<29)|(0x14<<23)|2)
......
...@@ -1088,53 +1088,65 @@ i965_dispatch_execbuffer(struct intel_ring_buffer *ring, ...@@ -1088,53 +1088,65 @@ i965_dispatch_execbuffer(struct intel_ring_buffer *ring,
/* Just userspace ABI convention to limit the wa batch bo to a resonable size */ /* Just userspace ABI convention to limit the wa batch bo to a resonable size */
#define I830_BATCH_LIMIT (256*1024) #define I830_BATCH_LIMIT (256*1024)
#define I830_TLB_ENTRIES (2)
#define I830_WA_SIZE max(I830_TLB_ENTRIES*4096, I830_BATCH_LIMIT)
static int static int
i830_dispatch_execbuffer(struct intel_ring_buffer *ring, i830_dispatch_execbuffer(struct intel_ring_buffer *ring,
u32 offset, u32 len, u32 offset, u32 len,
unsigned flags) unsigned flags)
{ {
u32 cs_offset = ring->scratch.gtt_offset;
int ret; int ret;
if (flags & I915_DISPATCH_PINNED) { ret = intel_ring_begin(ring, 6);
ret = intel_ring_begin(ring, 4);
if (ret) if (ret)
return ret; return ret;
intel_ring_emit(ring, MI_BATCH_BUFFER); /* Evict the invalid PTE TLBs */
intel_ring_emit(ring, offset | (flags & I915_DISPATCH_SECURE ? 0 : MI_BATCH_NON_SECURE)); intel_ring_emit(ring, COLOR_BLT_CMD | BLT_WRITE_RGBA);
intel_ring_emit(ring, offset + len - 8); intel_ring_emit(ring, BLT_DEPTH_32 | BLT_ROP_COLOR_COPY | 4096);
intel_ring_emit(ring, I830_TLB_ENTRIES << 16 | 4); /* load each page */
intel_ring_emit(ring, cs_offset);
intel_ring_emit(ring, 0xdeadbeef);
intel_ring_emit(ring, MI_NOOP); intel_ring_emit(ring, MI_NOOP);
intel_ring_advance(ring); intel_ring_advance(ring);
} else {
u32 cs_offset = ring->scratch.gtt_offset;
if ((flags & I915_DISPATCH_PINNED) == 0) {
if (len > I830_BATCH_LIMIT) if (len > I830_BATCH_LIMIT)
return -ENOSPC; return -ENOSPC;
ret = intel_ring_begin(ring, 9+3); ret = intel_ring_begin(ring, 6 + 2);
if (ret) if (ret)
return ret; return ret;
/* Blit the batch (which has now all relocs applied) to the stable batch
* scratch bo area (so that the CS never stumbles over its tlb /* Blit the batch (which has now all relocs applied) to the
* invalidation bug) ... */ * stable batch scratch bo area (so that the CS never
intel_ring_emit(ring, XY_SRC_COPY_BLT_CMD | * stumbles over its tlb invalidation bug) ...
XY_SRC_COPY_BLT_WRITE_ALPHA | */
XY_SRC_COPY_BLT_WRITE_RGB); intel_ring_emit(ring, SRC_COPY_BLT_CMD | BLT_WRITE_RGBA);
intel_ring_emit(ring, BLT_DEPTH_32 | BLT_ROP_GXCOPY | 4096); intel_ring_emit(ring, BLT_DEPTH_32 | BLT_ROP_SRC_COPY | 4096);
intel_ring_emit(ring, 0); intel_ring_emit(ring, DIV_ROUND_UP(len, 4096) << 16 | 1024);
intel_ring_emit(ring, (DIV_ROUND_UP(len, 4096) << 16) | 1024);
intel_ring_emit(ring, cs_offset); intel_ring_emit(ring, cs_offset);
intel_ring_emit(ring, 0);
intel_ring_emit(ring, 4096); intel_ring_emit(ring, 4096);
intel_ring_emit(ring, offset); intel_ring_emit(ring, offset);
intel_ring_emit(ring, MI_FLUSH); intel_ring_emit(ring, MI_FLUSH);
intel_ring_emit(ring, MI_NOOP);
intel_ring_advance(ring);
/* ... and execute it. */ /* ... and execute it. */
offset = cs_offset;
}
ret = intel_ring_begin(ring, 4);
if (ret)
return ret;
intel_ring_emit(ring, MI_BATCH_BUFFER); intel_ring_emit(ring, MI_BATCH_BUFFER);
intel_ring_emit(ring, cs_offset | (flags & I915_DISPATCH_SECURE ? 0 : MI_BATCH_NON_SECURE)); intel_ring_emit(ring, offset | (flags & I915_DISPATCH_SECURE ? 0 : MI_BATCH_NON_SECURE));
intel_ring_emit(ring, cs_offset + len - 8); intel_ring_emit(ring, offset + len - 8);
intel_ring_emit(ring, MI_NOOP);
intel_ring_advance(ring); intel_ring_advance(ring);
}
return 0; return 0;
} }
...@@ -1811,7 +1823,7 @@ int intel_init_render_ring_buffer(struct drm_device *dev) ...@@ -1811,7 +1823,7 @@ int intel_init_render_ring_buffer(struct drm_device *dev)
struct drm_i915_gem_object *obj; struct drm_i915_gem_object *obj;
int ret; int ret;
obj = i915_gem_alloc_object(dev, I830_BATCH_LIMIT); obj = i915_gem_alloc_object(dev, I830_WA_SIZE);
if (obj == NULL) { if (obj == NULL) {
DRM_ERROR("Failed to allocate batch bo\n"); DRM_ERROR("Failed to allocate batch bo\n");
return -ENOMEM; return -ENOMEM;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment