Commit c06f8c21 authored by Erez Shitrit, committed by Greg Kroah-Hartman

IB/mlx5: Fetch soft WQE's on fatal error state

commit 7b74a83c upstream.

On fatal error the driver simulates CQE's for ULPs that rely on
completion of all their posted work requests.

For the GSI traffic, the mlx5 has its own mechanism that sends the
completions via software CQE's directly to the relevant CQ.

This should be kept in fatal error too, so the driver should simulate
such CQE's with the specified error state in order to complete GSI QP
work requests.

Without the fix, the following deadlock might appear:
        schedule_timeout+0x274/0x350
        wait_for_common+0xec/0x240
        mcast_remove_one+0xd0/0x120 [ib_core]
        ib_unregister_device+0x12c/0x230 [ib_core]
        mlx5_ib_remove+0xc4/0x270 [mlx5_ib]
        mlx5_detach_device+0x184/0x1a0 [mlx5_core]
        mlx5_unload_one+0x308/0x340 [mlx5_core]
        mlx5_pci_err_detected+0x74/0xe0 [mlx5_core]

Cc: <stable@vger.kernel.org> # 4.7
Fixes: 89ea94a7 ("IB/mlx5: Reset flow support for IB kernel ULPs")
Signed-off-by: Erez Shitrit <erezsh@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
parent 96fb9b88
...@@ -646,7 +646,7 @@ static int mlx5_poll_one(struct mlx5_ib_cq *cq, ...@@ -646,7 +646,7 @@ static int mlx5_poll_one(struct mlx5_ib_cq *cq,
} }
static int poll_soft_wc(struct mlx5_ib_cq *cq, int num_entries, static int poll_soft_wc(struct mlx5_ib_cq *cq, int num_entries,
struct ib_wc *wc) struct ib_wc *wc, bool is_fatal_err)
{ {
struct mlx5_ib_dev *dev = to_mdev(cq->ibcq.device); struct mlx5_ib_dev *dev = to_mdev(cq->ibcq.device);
struct mlx5_ib_wc *soft_wc, *next; struct mlx5_ib_wc *soft_wc, *next;
...@@ -659,6 +659,10 @@ static int poll_soft_wc(struct mlx5_ib_cq *cq, int num_entries, ...@@ -659,6 +659,10 @@ static int poll_soft_wc(struct mlx5_ib_cq *cq, int num_entries,
mlx5_ib_dbg(dev, "polled software generated completion on CQ 0x%x\n", mlx5_ib_dbg(dev, "polled software generated completion on CQ 0x%x\n",
cq->mcq.cqn); cq->mcq.cqn);
if (unlikely(is_fatal_err)) {
soft_wc->wc.status = IB_WC_WR_FLUSH_ERR;
soft_wc->wc.vendor_err = MLX5_CQE_SYNDROME_WR_FLUSH_ERR;
}
wc[npolled++] = soft_wc->wc; wc[npolled++] = soft_wc->wc;
list_del(&soft_wc->list); list_del(&soft_wc->list);
kfree(soft_wc); kfree(soft_wc);
...@@ -679,12 +683,17 @@ int mlx5_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) ...@@ -679,12 +683,17 @@ int mlx5_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc)
spin_lock_irqsave(&cq->lock, flags); spin_lock_irqsave(&cq->lock, flags);
if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) { if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) {
mlx5_ib_poll_sw_comp(cq, num_entries, wc, &npolled); /* make sure no soft wqe's are waiting */
if (unlikely(!list_empty(&cq->wc_list)))
soft_polled = poll_soft_wc(cq, num_entries, wc, true);
mlx5_ib_poll_sw_comp(cq, num_entries - soft_polled,
wc + soft_polled, &npolled);
goto out; goto out;
} }
if (unlikely(!list_empty(&cq->wc_list))) if (unlikely(!list_empty(&cq->wc_list)))
soft_polled = poll_soft_wc(cq, num_entries, wc); soft_polled = poll_soft_wc(cq, num_entries, wc, false);
for (npolled = 0; npolled < num_entries - soft_polled; npolled++) { for (npolled = 0; npolled < num_entries - soft_polled; npolled++) {
if (mlx5_poll_one(cq, &cur_qp, wc + soft_polled + npolled)) if (mlx5_poll_one(cq, &cur_qp, wc + soft_polled + npolled))
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment