Commit 73dd3a48 authored by Mohamad Haj Yahia's avatar Mohamad Haj Yahia Committed by Saeed Mahameed

net/mlx5: Avoid using pending command interface slots

Currently when firmware command gets stuck or it takes long time to
complete, the driver command will get timeout and the command slot is
freed and can be used for new commands, and if the firmware receive new
command on the old busy slot its behavior is unexpected and this could
be harmful.
To fix this when the driver command gets timeout we return failure,
but we don't free the command slot and we wait for the firmware to
explicitly respond to that command.
Once all the entries are busy we will stop processing new firmware
commands.

Fixes: 9cba4ebc ('net/mlx5: Fix potential deadlock in command mode change')
Signed-off-by: default avatarMohamad Haj Yahia <mohamad@mellanox.com>
Cc: kernel-team@fb.com
Signed-off-by: default avatarSaeed Mahameed <saeedm@mellanox.com>
parent b57fe691
......@@ -774,7 +774,7 @@ static void cb_timeout_handler(struct work_struct *work)
mlx5_core_warn(dev, "%s(0x%x) timeout. Will cause a leak of a command resource\n",
mlx5_command_str(msg_to_opcode(ent->in)),
msg_to_opcode(ent->in));
mlx5_cmd_comp_handler(dev, 1UL << ent->idx);
mlx5_cmd_comp_handler(dev, 1UL << ent->idx, true);
}
static void cmd_work_handler(struct work_struct *work)
......@@ -804,6 +804,7 @@ static void cmd_work_handler(struct work_struct *work)
}
cmd->ent_arr[ent->idx] = ent;
set_bit(MLX5_CMD_ENT_STATE_PENDING_COMP, &ent->state);
lay = get_inst(cmd, ent->idx);
ent->lay = lay;
memset(lay, 0, sizeof(*lay));
......@@ -825,6 +826,20 @@ static void cmd_work_handler(struct work_struct *work)
if (ent->callback)
schedule_delayed_work(&ent->cb_timeout_work, cb_timeout);
/* Skip sending command to fw if internal error */
if (pci_channel_offline(dev->pdev) ||
dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) {
u8 status = 0;
u32 drv_synd;
ent->ret = mlx5_internal_err_ret_value(dev, msg_to_opcode(ent->in), &drv_synd, &status);
MLX5_SET(mbox_out, ent->out, status, status);
MLX5_SET(mbox_out, ent->out, syndrome, drv_synd);
mlx5_cmd_comp_handler(dev, 1UL << ent->idx, true);
return;
}
/* ring doorbell after the descriptor is valid */
mlx5_core_dbg(dev, "writing 0x%x to command doorbell\n", 1 << ent->idx);
wmb();
......@@ -835,7 +850,7 @@ static void cmd_work_handler(struct work_struct *work)
poll_timeout(ent);
/* make sure we read the descriptor after ownership is SW */
rmb();
mlx5_cmd_comp_handler(dev, 1UL << ent->idx);
mlx5_cmd_comp_handler(dev, 1UL << ent->idx, (ent->ret == -ETIMEDOUT));
}
}
......@@ -879,7 +894,7 @@ static int wait_func(struct mlx5_core_dev *dev, struct mlx5_cmd_work_ent *ent)
wait_for_completion(&ent->done);
} else if (!wait_for_completion_timeout(&ent->done, timeout)) {
ent->ret = -ETIMEDOUT;
mlx5_cmd_comp_handler(dev, 1UL << ent->idx);
mlx5_cmd_comp_handler(dev, 1UL << ent->idx, true);
}
err = ent->ret;
......@@ -1375,7 +1390,7 @@ static void free_msg(struct mlx5_core_dev *dev, struct mlx5_cmd_msg *msg)
}
}
void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, u64 vec)
void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, u64 vec, bool forced)
{
struct mlx5_cmd *cmd = &dev->cmd;
struct mlx5_cmd_work_ent *ent;
......@@ -1395,6 +1410,19 @@ void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, u64 vec)
struct semaphore *sem;
ent = cmd->ent_arr[i];
/* if we already completed the command, ignore it */
if (!test_and_clear_bit(MLX5_CMD_ENT_STATE_PENDING_COMP,
&ent->state)) {
/* only real completion can free the cmd slot */
if (!forced) {
mlx5_core_err(dev, "Command completion arrived after timeout (entry idx = %d).\n",
ent->idx);
free_ent(cmd, ent->idx);
}
continue;
}
if (ent->callback)
cancel_delayed_work(&ent->cb_timeout_work);
if (ent->page_queue)
......@@ -1417,7 +1445,10 @@ void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, u64 vec)
mlx5_core_dbg(dev, "command completed. ret 0x%x, delivery status %s(0x%x)\n",
ent->ret, deliv_status_to_str(ent->status), ent->status);
}
free_ent(cmd, ent->idx);
/* only real completion will free the entry slot */
if (!forced)
free_ent(cmd, ent->idx);
if (ent->callback) {
ds = ent->ts2 - ent->ts1;
......
......@@ -422,7 +422,7 @@ static irqreturn_t mlx5_eq_int(int irq, void *eq_ptr)
break;
case MLX5_EVENT_TYPE_CMD:
mlx5_cmd_comp_handler(dev, be32_to_cpu(eqe->data.cmd.vector));
mlx5_cmd_comp_handler(dev, be32_to_cpu(eqe->data.cmd.vector), false);
break;
case MLX5_EVENT_TYPE_PORT_CHANGE:
......
......@@ -90,7 +90,7 @@ static void trigger_cmd_completions(struct mlx5_core_dev *dev)
spin_unlock_irqrestore(&dev->cmd.alloc_lock, flags);
mlx5_core_dbg(dev, "vector 0x%llx\n", vector);
mlx5_cmd_comp_handler(dev, vector);
mlx5_cmd_comp_handler(dev, vector, true);
return;
no_trig:
......
......@@ -787,7 +787,12 @@ enum {
typedef void (*mlx5_cmd_cbk_t)(int status, void *context);
enum {
MLX5_CMD_ENT_STATE_PENDING_COMP,
};
struct mlx5_cmd_work_ent {
unsigned long state;
struct mlx5_cmd_msg *in;
struct mlx5_cmd_msg *out;
void *uout;
......@@ -976,7 +981,7 @@ void mlx5_cq_completion(struct mlx5_core_dev *dev, u32 cqn);
void mlx5_rsc_event(struct mlx5_core_dev *dev, u32 rsn, int event_type);
void mlx5_srq_event(struct mlx5_core_dev *dev, u32 srqn, int event_type);
struct mlx5_core_srq *mlx5_core_get_srq(struct mlx5_core_dev *dev, u32 srqn);
void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, u64 vec);
void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, u64 vec, bool forced);
void mlx5_cq_event(struct mlx5_core_dev *dev, u32 cqn, int event_type);
int mlx5_create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, u8 vecidx,
int nent, u64 mask, const char *name,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment