releases/3.16.65/drm-i915-ringbuffer-delay-after-emit_invalidate-for-gen4-gen5.patch - pub/scm/linux/kernel/git/bwh/linux-stable-queue - Git at Google

 From: Chris Wilson <chris@chris-wilson.co.uk>
 Date: Mon, 5 Nov 2018 09:43:05 +0000
 Subject: drm/i915/ringbuffer: Delay after EMIT_INVALIDATE for gen4/gen5
 MIME-Version: 1.0
 Content-Type: text/plain; charset=UTF-8
 Content-Transfer-Encoding: 8bit

 commit 55f99bf2a9c331838c981694bc872cd1ec4070b2 upstream.

 Exercising the gpu reloc path strenuously revealed an issue where the
 updated relocations (from MI_STORE_DWORD_IMM) were not being observed
 upon execution. After some experiments with adding pipecontrols (a lot
 of pipecontrols (32) as gen4/5 do not have a bit to wait on earlier pipe
 controls or even the current on), it was discovered that we merely
 needed to delay the EMIT_INVALIDATE by several flushes. It is important
 to note that it is the EMIT_INVALIDATE as opposed to the EMIT_FLUSH that
 needs the delay as opposed to what one might first expect -- that the
 delay is required for the TLB invalidation to take effect (one presumes
 to purge any CS buffers) as opposed to a delay after flushing to ensure
 the writes have landed before triggering invalidation.

 Testcase: igt/gem_tiled_fence_blits
 Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
 Reviewed-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
 Link: https://patchwork.freedesktop.org/patch/msgid/20181105094305.5767-1-chris@chris-wilson.co.uk
 [bwh: Backported to 3.16:
  - Use intel_ring_emit() instead of assignments
  - Use ring->scratch.gtt_offset instead of i915_ggtt_offset()
  - Use (invalidate_domains & I915_GEM_DOMAIN_INSTRUCTION) instead of
    (mode & EMIT_INVALIDATE)
  - Adjust context]
 Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
 ---
  drivers/gpu/drm/i915/intel_ringbuffer.c | 38 +++++++++++++++++++++++--
  1 file changed, 36 insertions(+), 2 deletions(-)

 --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
 +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
 @@ -103,6 +103,7 @@ gen4_render_ring_flush(struct intel_engi
  	struct drm_device *dev = ring->dev;
  	u32 cmd;
  	int ret;
 +	int i;

  	/*
  	 * read/write caches:
 @@ -142,12 +143,47 @@ gen4_render_ring_flush(struct intel_engi
  	    (IS_G4X(dev) || IS_GEN5(dev)))
  		cmd |= MI_INVALIDATE_ISP;

 -	ret = intel_ring_begin(ring, 2);
 +	i = 2;
 +	if (invalidate_domains & I915_GEM_DOMAIN_INSTRUCTION)
 +		i += 20;
 +
 +	ret = intel_ring_begin(ring, i);
  	if (ret)
  		return ret;

  	intel_ring_emit(ring, cmd);
 -	intel_ring_emit(ring, MI_NOOP);
 +
 +	/*
 +	 * A random delay to let the CS invalidate take effect? Without this
 +	 * delay, the GPU relocation path fails as the CS does not see
 +	 * the updated contents. Just as important, if we apply the flushes
 +	 * to the EMIT_FLUSH branch (i.e. immediately after the relocation
 +	 * write and before the invalidate on the next batch), the relocations
 +	 * still fail. This implies that is a delay following invalidation
 +	 * that is required to reset the caches as opposed to a delay to
 +	 * ensure the memory is written.
 +	 */
 +	if (invalidate_domains & I915_GEM_DOMAIN_INSTRUCTION) {
 +		intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(4) |
 +				PIPE_CONTROL_QW_WRITE);
 +		intel_ring_emit(ring, ring->scratch.gtt_offset |
 +				PIPE_CONTROL_GLOBAL_GTT);
 +		intel_ring_emit(ring, 0);
 +		intel_ring_emit(ring, 0);
 +
 +		for (i = 0; i < 12; i++)
 +			intel_ring_emit(ring, MI_FLUSH);
 +
 +		intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(4) |
 +				PIPE_CONTROL_QW_WRITE);
 +		intel_ring_emit(ring, ring->scratch.gtt_offset |
 +				PIPE_CONTROL_GLOBAL_GTT);
 +		intel_ring_emit(ring, 0);
 +		intel_ring_emit(ring, 0);
 +	}
 +
 +	intel_ring_emit(ring, cmd);
 +
  	intel_ring_advance(ring);

  	return 0;
	From: Chris Wilson <chris@chris-wilson.co.uk>
	Date: Mon, 5 Nov 2018 09:43:05 +0000
	Subject: drm/i915/ringbuffer: Delay after EMIT_INVALIDATE for gen4/gen5
	MIME-Version: 1.0
	Content-Type: text/plain; charset=UTF-8
	Content-Transfer-Encoding: 8bit

	commit 55f99bf2a9c331838c981694bc872cd1ec4070b2 upstream.

	Exercising the gpu reloc path strenuously revealed an issue where the
	updated relocations (from MI_STORE_DWORD_IMM) were not being observed
	upon execution. After some experiments with adding pipecontrols (a lot
	of pipecontrols (32) as gen4/5 do not have a bit to wait on earlier pipe
	controls or even the current on), it was discovered that we merely
	needed to delay the EMIT_INVALIDATE by several flushes. It is important
	to note that it is the EMIT_INVALIDATE as opposed to the EMIT_FLUSH that
	needs the delay as opposed to what one might first expect -- that the
	delay is required for the TLB invalidation to take effect (one presumes
	to purge any CS buffers) as opposed to a delay after flushing to ensure
	the writes have landed before triggering invalidation.

	Testcase: igt/gem_tiled_fence_blits
	Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
	Reviewed-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
	Link: https://patchwork.freedesktop.org/patch/msgid/20181105094305.5767-1-chris@chris-wilson.co.uk
	[bwh: Backported to 3.16:
	- Use intel_ring_emit() instead of assignments
	- Use ring->scratch.gtt_offset instead of i915_ggtt_offset()
	- Use (invalidate_domains & I915_GEM_DOMAIN_INSTRUCTION) instead of
	(mode & EMIT_INVALIDATE)
	- Adjust context]
	Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
	---
	drivers/gpu/drm/i915/intel_ringbuffer.c \| 38 +++++++++++++++++++++++--
	1 file changed, 36 insertions(+), 2 deletions(-)

	--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
	+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
	@@ -103,6 +103,7 @@ gen4_render_ring_flush(struct intel_engi
	struct drm_device *dev = ring->dev;
	u32 cmd;
	int ret;
	+ int i;

	/*
	* read/write caches:
	@@ -142,12 +143,47 @@ gen4_render_ring_flush(struct intel_engi
	(IS_G4X(dev) \|\| IS_GEN5(dev)))
	cmd \|= MI_INVALIDATE_ISP;

	- ret = intel_ring_begin(ring, 2);
	+ i = 2;
	+ if (invalidate_domains & I915_GEM_DOMAIN_INSTRUCTION)
	+ i += 20;
	+
	+ ret = intel_ring_begin(ring, i);
	if (ret)
	return ret;

	intel_ring_emit(ring, cmd);
	- intel_ring_emit(ring, MI_NOOP);
	+
	+ /*
	+ * A random delay to let the CS invalidate take effect? Without this
	+ * delay, the GPU relocation path fails as the CS does not see
	+ * the updated contents. Just as important, if we apply the flushes
	+ * to the EMIT_FLUSH branch (i.e. immediately after the relocation
	+ * write and before the invalidate on the next batch), the relocations
	+ * still fail. This implies that is a delay following invalidation
	+ * that is required to reset the caches as opposed to a delay to
	+ * ensure the memory is written.
	+ */
	+ if (invalidate_domains & I915_GEM_DOMAIN_INSTRUCTION) {
	+ intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(4) \|
	+ PIPE_CONTROL_QW_WRITE);
	+ intel_ring_emit(ring, ring->scratch.gtt_offset \|
	+ PIPE_CONTROL_GLOBAL_GTT);
	+ intel_ring_emit(ring, 0);
	+ intel_ring_emit(ring, 0);
	+
	+ for (i = 0; i < 12; i++)
	+ intel_ring_emit(ring, MI_FLUSH);
	+
	+ intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(4) \|
	+ PIPE_CONTROL_QW_WRITE);
	+ intel_ring_emit(ring, ring->scratch.gtt_offset \|
	+ PIPE_CONTROL_GLOBAL_GTT);
	+ intel_ring_emit(ring, 0);
	+ intel_ring_emit(ring, 0);
	+ }
	+
	+ intel_ring_emit(ring, cmd);
	+
	intel_ring_advance(ring);

	return 0;