| From: Chris Wilson <chris@chris-wilson.co.uk> |
| Date: Mon, 5 Nov 2018 09:43:05 +0000 |
| Subject: drm/i915/ringbuffer: Delay after EMIT_INVALIDATE for gen4/gen5 |
| MIME-Version: 1.0 |
| Content-Type: text/plain; charset=UTF-8 |
| Content-Transfer-Encoding: 8bit |
| |
| commit 55f99bf2a9c331838c981694bc872cd1ec4070b2 upstream. |
| |
| Exercising the gpu reloc path strenuously revealed an issue where the |
| updated relocations (from MI_STORE_DWORD_IMM) were not being observed |
| upon execution. After some experiments with adding pipecontrols (a lot |
| of pipecontrols (32) as gen4/5 do not have a bit to wait on earlier pipe |
| controls or even the current on), it was discovered that we merely |
| needed to delay the EMIT_INVALIDATE by several flushes. It is important |
| to note that it is the EMIT_INVALIDATE as opposed to the EMIT_FLUSH that |
| needs the delay as opposed to what one might first expect -- that the |
| delay is required for the TLB invalidation to take effect (one presumes |
| to purge any CS buffers) as opposed to a delay after flushing to ensure |
| the writes have landed before triggering invalidation. |
| |
| Testcase: igt/gem_tiled_fence_blits |
| Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> |
| Reviewed-by: Ville Syrjälä <ville.syrjala@linux.intel.com> |
| Link: https://patchwork.freedesktop.org/patch/msgid/20181105094305.5767-1-chris@chris-wilson.co.uk |
| [bwh: Backported to 3.16: |
| - Use intel_ring_emit() instead of assignments |
| - Use ring->scratch.gtt_offset instead of i915_ggtt_offset() |
| - Use (invalidate_domains & I915_GEM_DOMAIN_INSTRUCTION) instead of |
| (mode & EMIT_INVALIDATE) |
| - Adjust context] |
| Signed-off-by: Ben Hutchings <ben@decadent.org.uk> |
| --- |
| drivers/gpu/drm/i915/intel_ringbuffer.c | 38 +++++++++++++++++++++++-- |
| 1 file changed, 36 insertions(+), 2 deletions(-) |
| |
| --- a/drivers/gpu/drm/i915/intel_ringbuffer.c |
| +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c |
| @@ -103,6 +103,7 @@ gen4_render_ring_flush(struct intel_engi |
| struct drm_device *dev = ring->dev; |
| u32 cmd; |
| int ret; |
| + int i; |
| |
| /* |
| * read/write caches: |
| @@ -142,12 +143,47 @@ gen4_render_ring_flush(struct intel_engi |
| (IS_G4X(dev) || IS_GEN5(dev))) |
| cmd |= MI_INVALIDATE_ISP; |
| |
| - ret = intel_ring_begin(ring, 2); |
| + i = 2; |
| + if (invalidate_domains & I915_GEM_DOMAIN_INSTRUCTION) |
| + i += 20; |
| + |
| + ret = intel_ring_begin(ring, i); |
| if (ret) |
| return ret; |
| |
| intel_ring_emit(ring, cmd); |
| - intel_ring_emit(ring, MI_NOOP); |
| + |
| + /* |
| + * A random delay to let the CS invalidate take effect? Without this |
| + * delay, the GPU relocation path fails as the CS does not see |
| + * the updated contents. Just as important, if we apply the flushes |
| + * to the EMIT_FLUSH branch (i.e. immediately after the relocation |
| + * write and before the invalidate on the next batch), the relocations |
| + * still fail. This implies that is a delay following invalidation |
| + * that is required to reset the caches as opposed to a delay to |
| + * ensure the memory is written. |
| + */ |
| + if (invalidate_domains & I915_GEM_DOMAIN_INSTRUCTION) { |
| + intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(4) | |
| + PIPE_CONTROL_QW_WRITE); |
| + intel_ring_emit(ring, ring->scratch.gtt_offset | |
| + PIPE_CONTROL_GLOBAL_GTT); |
| + intel_ring_emit(ring, 0); |
| + intel_ring_emit(ring, 0); |
| + |
| + for (i = 0; i < 12; i++) |
| + intel_ring_emit(ring, MI_FLUSH); |
| + |
| + intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(4) | |
| + PIPE_CONTROL_QW_WRITE); |
| + intel_ring_emit(ring, ring->scratch.gtt_offset | |
| + PIPE_CONTROL_GLOBAL_GTT); |
| + intel_ring_emit(ring, 0); |
| + intel_ring_emit(ring, 0); |
| + } |
| + |
| + intel_ring_emit(ring, cmd); |
| + |
| intel_ring_advance(ring); |
| |
| return 0; |