| From f31ffe1f1f0f76084f88564bcd0b8cd97068eca0 Mon Sep 17 00:00:00 2001 |
| From: Mika Kuoppala <mika.kuoppala@linux.intel.com> |
| Date: Thu, 30 May 2013 09:04:29 +0300 |
| Subject: drm/i915: detect hang using per ring hangcheck_score |
| |
| Keep track of ring seqno progress and if there are no |
| progress detected, declare hang. Use actual head (acthd) |
| to distinguish between ring stuck and batchbuffer looping |
| situation. Stuck ring will be kicked to trigger progress. |
| |
| This commit adds a hard limit for batchbuffer completion time. |
| If batchbuffer completion time is more than 4.5 seconds, |
| the gpu will be declared hung. |
| |
| Review comment from Ben which nicely clarifies the semantic change: |
| |
| "Maybe I'm just stating the functional changes of the patch, but in case |
| they were unintended here is what I see as potential issues: |
| |
| 1. "If ring B is waiting on ring A via semaphore, and ring A is making |
| progress, albeit slowly - the hangcheck will fire. The check will |
| determine that A is moving, however ring B will appear hung because |
| the ACTHD doesn't move. I honestly can't say if that's actually a |
| realistic problem to hit it probably implies the timeout value is too |
| low. |
| |
| 2. "There's also another corner case on the kick. If the seqno = 2 |
| (though not stuck), and on the 3rd hangcheck, the ring is stuck, and |
| we try to kick it... we don't actually try to find out if the kick |
| helped" |
| |
| v2: use atchd to detect stuck ring from loop (Ben Widawsky) |
| |
| v3: Use acthd to check when ring needs kicking. |
| Declare hang on third time in order to give time for |
| kick_ring to take effect. |
| |
| v4: Update commit msg |
| |
| Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com> |
| Reviewed-by: Ben Widawsky <ben@bwidawsk.net> |
| [danvet: Paste in Ben's review comment.] |
| Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch> |
| |
| (cherry picked from commit 05407ff889ceebe383aa5907219f86582ef96b72) |
| Signed-off-by: Darren Hart <dvhart@linux.intel.com> |
| --- |
| drivers/gpu/drm/i915/i915_irq.c | 80 +++++++++++++++++++-------------- |
| drivers/gpu/drm/i915/intel_ringbuffer.h | 2 + |
| 2 files changed, 49 insertions(+), 33 deletions(-) |
| |
| diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c |
| index 2ecf66cb26f3..e213bd71524e 100644 |
| --- a/drivers/gpu/drm/i915/i915_irq.c |
| +++ b/drivers/gpu/drm/i915/i915_irq.c |
| @@ -674,7 +674,6 @@ static void notify_ring(struct drm_device *dev, |
| |
| wake_up_all(&ring->irq_queue); |
| if (i915_enable_hangcheck) { |
| - dev_priv->gpu_error.hangcheck_count = 0; |
| mod_timer(&dev_priv->gpu_error.hangcheck_timer, |
| round_jiffies_up(jiffies + DRM_I915_HANGCHECK_JIFFIES)); |
| } |
| @@ -2459,61 +2458,76 @@ static bool i915_hangcheck_hung(struct drm_device *dev) |
| |
| /** |
| * This is called when the chip hasn't reported back with completed |
| - * batchbuffers in a long time. The first time this is called we simply record |
| - * ACTHD. If ACTHD hasn't changed by the time the hangcheck timer elapses |
| - * again, we assume the chip is wedged and try to fix it. |
| + * batchbuffers in a long time. We keep track per ring seqno progress and |
| + * if there are no progress, hangcheck score for that ring is increased. |
| + * Further, acthd is inspected to see if the ring is stuck. On stuck case |
| + * we kick the ring. If we see no progress on three subsequent calls |
| + * we assume chip is wedged and try to fix it by resetting the chip. |
| */ |
| void i915_hangcheck_elapsed(unsigned long data) |
| { |
| struct drm_device *dev = (struct drm_device *)data; |
| drm_i915_private_t *dev_priv = dev->dev_private; |
| struct intel_ring_buffer *ring; |
| - bool err = false, idle; |
| int i; |
| - u32 seqno[I915_NUM_RINGS]; |
| - bool work_done; |
| + int busy_count = 0, rings_hung = 0; |
| + bool stuck[I915_NUM_RINGS]; |
| |
| if (!i915_enable_hangcheck) |
| return; |
| |
| - idle = true; |
| for_each_ring(ring, dev_priv, i) { |
| - seqno[i] = ring->get_seqno(ring, false); |
| - idle &= i915_hangcheck_ring_idle(ring, seqno[i], &err); |
| - } |
| + u32 seqno, acthd; |
| + bool idle, err = false; |
| + |
| + seqno = ring->get_seqno(ring, false); |
| + acthd = intel_ring_get_active_head(ring); |
| + idle = i915_hangcheck_ring_idle(ring, seqno, &err); |
| + stuck[i] = ring->hangcheck.acthd == acthd; |
| + |
| + if (idle) { |
| + if (err) |
| + ring->hangcheck.score += 2; |
| + else |
| + ring->hangcheck.score = 0; |
| + } else { |
| + busy_count++; |
| |
| - /* If all work is done then ACTHD clearly hasn't advanced. */ |
| - if (idle) { |
| - if (err) { |
| - if (i915_hangcheck_hung(dev)) |
| - return; |
| + if (ring->hangcheck.seqno == seqno) { |
| + ring->hangcheck.score++; |
| |
| - goto repeat; |
| + /* Kick ring if stuck*/ |
| + if (stuck[i]) |
| + i915_hangcheck_ring_hung(ring); |
| + } else { |
| + ring->hangcheck.score = 0; |
| + } |
| } |
| |
| - dev_priv->gpu_error.hangcheck_count = 0; |
| - return; |
| + ring->hangcheck.seqno = seqno; |
| + ring->hangcheck.acthd = acthd; |
| } |
| |
| - work_done = false; |
| for_each_ring(ring, dev_priv, i) { |
| - if (ring->hangcheck.seqno != seqno[i]) { |
| - work_done = true; |
| - ring->hangcheck.seqno = seqno[i]; |
| + if (ring->hangcheck.score > 2) { |
| + rings_hung++; |
| + DRM_ERROR("%s: %s on %s 0x%x\n", ring->name, |
| + stuck[i] ? "stuck" : "no progress", |
| + stuck[i] ? "addr" : "seqno", |
| + stuck[i] ? ring->hangcheck.acthd & HEAD_ADDR : |
| + ring->hangcheck.seqno); |
| } |
| } |
| |
| - if (!work_done) { |
| - if (i915_hangcheck_hung(dev)) |
| - return; |
| - } else { |
| - dev_priv->gpu_error.hangcheck_count = 0; |
| - } |
| + if (rings_hung) |
| + return i915_handle_error(dev, true); |
| |
| -repeat: |
| - /* Reset timer case chip hangs without another request being added */ |
| - mod_timer(&dev_priv->gpu_error.hangcheck_timer, |
| - round_jiffies_up(jiffies + DRM_I915_HANGCHECK_JIFFIES)); |
| + if (busy_count) |
| + /* Reset timer case chip hangs without another request |
| + * being added */ |
| + mod_timer(&dev_priv->gpu_error.hangcheck_timer, |
| + round_jiffies_up(jiffies + |
| + DRM_I915_HANGCHECK_JIFFIES)); |
| } |
| |
| /* drm_dma.h hooks |
| diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h |
| index 022d07e43d12..4c7e103e6fa4 100644 |
| --- a/drivers/gpu/drm/i915/intel_ringbuffer.h |
| +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h |
| @@ -39,6 +39,8 @@ struct intel_hw_status_page { |
| |
| struct intel_ring_hangcheck { |
| u32 seqno; |
| + u32 acthd; |
| + int score; |
| }; |
| |
| struct intel_ring_buffer { |
| -- |
| 1.8.5.rc3 |
| |