| From 9cc0f84ee9314a65625ee0d2f3640a199e54c833 Mon Sep 17 00:00:00 2001 |
| From: Zhu Yanjun <yanjunz@mellanox.com> |
| Date: Wed, 12 Feb 2020 09:26:33 +0200 |
| Subject: [PATCH] RDMA/rxe: Fix soft lockup problem due to using tasklets in |
| softirq |
| |
| commit 8ac0e6641c7ca14833a2a8c6f13d8e0a435e535c upstream. |
| |
| When running stress tests with RXE, the following call traces often occur: |
| |
| watchdog: BUG: soft lockup - CPU#2 stuck for 22s! [swapper/2:0] |
| ... |
| Call Trace: |
| <IRQ> |
| create_object+0x3f/0x3b0 |
| kmem_cache_alloc_node_trace+0x129/0x2d0 |
| __kmalloc_reserve.isra.52+0x2e/0x80 |
| __alloc_skb+0x83/0x270 |
| rxe_init_packet+0x99/0x150 [rdma_rxe] |
| rxe_requester+0x34e/0x11a0 [rdma_rxe] |
| rxe_do_task+0x85/0xf0 [rdma_rxe] |
| tasklet_action_common.isra.21+0xeb/0x100 |
| __do_softirq+0xd0/0x298 |
| irq_exit+0xc5/0xd0 |
| smp_apic_timer_interrupt+0x68/0x120 |
| apic_timer_interrupt+0xf/0x20 |
| </IRQ> |
| ... |
| |
| The root cause is that a tasklet is actually a softirq. Within a tasklet |
| handler, another softirq handler is triggered. These softirq handlers |
| usually run on the same CPU core, which causes the "soft lockup" bug. |
| |
| Fixes: 8700e3e7c485 ("Soft RoCE driver") |
| Link: https://lore.kernel.org/r/20200212072635.682689-8-leon@kernel.org |
| Signed-off-by: Zhu Yanjun <yanjunz@mellanox.com> |
| Signed-off-by: Leon Romanovsky <leonro@mellanox.com> |
| Signed-off-by: Jason Gunthorpe <jgg@mellanox.com> |
| Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com> |
| |
| diff --git a/drivers/infiniband/sw/rxe/rxe_comp.c b/drivers/infiniband/sw/rxe/rxe_comp.c |
| index 00eb99d3df86..7c5999ada61d 100644 |
| --- a/drivers/infiniband/sw/rxe/rxe_comp.c |
| +++ b/drivers/infiniband/sw/rxe/rxe_comp.c |
| @@ -329,7 +329,7 @@ static inline enum comp_state check_ack(struct rxe_qp *qp, |
| qp->comp.psn = pkt->psn; |
| if (qp->req.wait_psn) { |
| qp->req.wait_psn = 0; |
| - rxe_run_task(&qp->req.task, 1); |
| + rxe_run_task(&qp->req.task, 0); |
| } |
| } |
| return COMPST_ERROR_RETRY; |
| @@ -463,7 +463,7 @@ static void do_complete(struct rxe_qp *qp, struct rxe_send_wqe *wqe) |
| */ |
| if (qp->req.wait_fence) { |
| qp->req.wait_fence = 0; |
| - rxe_run_task(&qp->req.task, 1); |
| + rxe_run_task(&qp->req.task, 0); |
| } |
| } |
| |
| @@ -479,7 +479,7 @@ static inline enum comp_state complete_ack(struct rxe_qp *qp, |
| if (qp->req.need_rd_atomic) { |
| qp->comp.timeout_retry = 0; |
| qp->req.need_rd_atomic = 0; |
| - rxe_run_task(&qp->req.task, 1); |
| + rxe_run_task(&qp->req.task, 0); |
| } |
| } |
| |
| @@ -725,7 +725,7 @@ int rxe_completer(void *arg) |
| RXE_CNT_COMP_RETRY); |
| qp->req.need_retry = 1; |
| qp->comp.started_retry = 1; |
| - rxe_run_task(&qp->req.task, 1); |
| + rxe_run_task(&qp->req.task, 0); |
| } |
| |
| if (pkt) { |
| -- |
| 2.7.4 |
| |