Commit 485d65e1 authored by Akiva Goldberger, committed by Jakub Kicinski

net/mlx5: Add a timeout to acquire the command queue semaphore

Prevent forced completion handling on an entry that has not yet been
assigned an index, which causes an out-of-bounds access on idx = -22.
Instead of waiting indefinitely for the semaphore, the blocking flow now
waits for an index to be allocated, or for semaphore acquisition to time
out, before starting the timer for FW completion.

Kernel log example:
mlx5_core 0000:06:00.0: wait_func_handle_exec_timeout:1128:(pid 185911): cmd[-22]: CREATE_UCTX(0xa04) No done completion

Fixes: 8e715cd6 ("net/mlx5: Set command entry semaphore up once got index free")
Signed-off-by: Akiva Goldberger <agoldberger@nvidia.com>
Reviewed-by: Moshe Shemesh <moshe@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://lore.kernel.org/r/20240509112951.590184-5-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parent 0f06228d
...@@ -969,19 +969,32 @@ static void cmd_work_handler(struct work_struct *work) ...@@ -969,19 +969,32 @@ static void cmd_work_handler(struct work_struct *work)
bool poll_cmd = ent->polling; bool poll_cmd = ent->polling;
struct mlx5_cmd_layout *lay; struct mlx5_cmd_layout *lay;
struct mlx5_core_dev *dev; struct mlx5_core_dev *dev;
unsigned long cb_timeout; unsigned long timeout;
struct semaphore *sem;
unsigned long flags; unsigned long flags;
int alloc_ret; int alloc_ret;
int cmd_mode; int cmd_mode;
complete(&ent->handling);
dev = container_of(cmd, struct mlx5_core_dev, cmd); dev = container_of(cmd, struct mlx5_core_dev, cmd);
cb_timeout = msecs_to_jiffies(mlx5_tout_ms(dev, CMD)); timeout = msecs_to_jiffies(mlx5_tout_ms(dev, CMD));
complete(&ent->handling);
sem = ent->page_queue ? &cmd->vars.pages_sem : &cmd->vars.sem;
down(sem);
if (!ent->page_queue) { if (!ent->page_queue) {
if (down_timeout(&cmd->vars.sem, timeout)) {
mlx5_core_warn(dev, "%s(0x%x) timed out while waiting for a slot.\n",
mlx5_command_str(ent->op), ent->op);
if (ent->callback) {
ent->callback(-EBUSY, ent->context);
mlx5_free_cmd_msg(dev, ent->out);
free_msg(dev, ent->in);
cmd_ent_put(ent);
} else {
ent->ret = -EBUSY;
complete(&ent->done);
}
complete(&ent->slotted);
return;
}
alloc_ret = cmd_alloc_index(cmd, ent); alloc_ret = cmd_alloc_index(cmd, ent);
if (alloc_ret < 0) { if (alloc_ret < 0) {
mlx5_core_err_rl(dev, "failed to allocate command entry\n"); mlx5_core_err_rl(dev, "failed to allocate command entry\n");
...@@ -994,10 +1007,11 @@ static void cmd_work_handler(struct work_struct *work) ...@@ -994,10 +1007,11 @@ static void cmd_work_handler(struct work_struct *work)
ent->ret = -EAGAIN; ent->ret = -EAGAIN;
complete(&ent->done); complete(&ent->done);
} }
up(sem); up(&cmd->vars.sem);
return; return;
} }
} else { } else {
down(&cmd->vars.pages_sem);
ent->idx = cmd->vars.max_reg_cmds; ent->idx = cmd->vars.max_reg_cmds;
spin_lock_irqsave(&cmd->alloc_lock, flags); spin_lock_irqsave(&cmd->alloc_lock, flags);
clear_bit(ent->idx, &cmd->vars.bitmask); clear_bit(ent->idx, &cmd->vars.bitmask);
...@@ -1005,6 +1019,8 @@ static void cmd_work_handler(struct work_struct *work) ...@@ -1005,6 +1019,8 @@ static void cmd_work_handler(struct work_struct *work)
spin_unlock_irqrestore(&cmd->alloc_lock, flags); spin_unlock_irqrestore(&cmd->alloc_lock, flags);
} }
complete(&ent->slotted);
lay = get_inst(cmd, ent->idx); lay = get_inst(cmd, ent->idx);
ent->lay = lay; ent->lay = lay;
memset(lay, 0, sizeof(*lay)); memset(lay, 0, sizeof(*lay));
...@@ -1023,7 +1039,7 @@ static void cmd_work_handler(struct work_struct *work) ...@@ -1023,7 +1039,7 @@ static void cmd_work_handler(struct work_struct *work)
ent->ts1 = ktime_get_ns(); ent->ts1 = ktime_get_ns();
cmd_mode = cmd->mode; cmd_mode = cmd->mode;
if (ent->callback && schedule_delayed_work(&ent->cb_timeout_work, cb_timeout)) if (ent->callback && schedule_delayed_work(&ent->cb_timeout_work, timeout))
cmd_ent_get(ent); cmd_ent_get(ent);
set_bit(MLX5_CMD_ENT_STATE_PENDING_COMP, &ent->state); set_bit(MLX5_CMD_ENT_STATE_PENDING_COMP, &ent->state);
...@@ -1143,6 +1159,9 @@ static int wait_func(struct mlx5_core_dev *dev, struct mlx5_cmd_work_ent *ent) ...@@ -1143,6 +1159,9 @@ static int wait_func(struct mlx5_core_dev *dev, struct mlx5_cmd_work_ent *ent)
ent->ret = -ECANCELED; ent->ret = -ECANCELED;
goto out_err; goto out_err;
} }
wait_for_completion(&ent->slotted);
if (cmd->mode == CMD_MODE_POLLING || ent->polling) if (cmd->mode == CMD_MODE_POLLING || ent->polling)
wait_for_completion(&ent->done); wait_for_completion(&ent->done);
else if (!wait_for_completion_timeout(&ent->done, timeout)) else if (!wait_for_completion_timeout(&ent->done, timeout))
...@@ -1157,6 +1176,9 @@ static int wait_func(struct mlx5_core_dev *dev, struct mlx5_cmd_work_ent *ent) ...@@ -1157,6 +1176,9 @@ static int wait_func(struct mlx5_core_dev *dev, struct mlx5_cmd_work_ent *ent)
} else if (err == -ECANCELED) { } else if (err == -ECANCELED) {
mlx5_core_warn(dev, "%s(0x%x) canceled on out of queue timeout.\n", mlx5_core_warn(dev, "%s(0x%x) canceled on out of queue timeout.\n",
mlx5_command_str(ent->op), ent->op); mlx5_command_str(ent->op), ent->op);
} else if (err == -EBUSY) {
mlx5_core_warn(dev, "%s(0x%x) timeout while waiting for command semaphore.\n",
mlx5_command_str(ent->op), ent->op);
} }
mlx5_core_dbg(dev, "err %d, delivery status %s(%d)\n", mlx5_core_dbg(dev, "err %d, delivery status %s(%d)\n",
err, deliv_status_to_str(ent->status), ent->status); err, deliv_status_to_str(ent->status), ent->status);
...@@ -1208,6 +1230,7 @@ static int mlx5_cmd_invoke(struct mlx5_core_dev *dev, struct mlx5_cmd_msg *in, ...@@ -1208,6 +1230,7 @@ static int mlx5_cmd_invoke(struct mlx5_core_dev *dev, struct mlx5_cmd_msg *in,
ent->polling = force_polling; ent->polling = force_polling;
init_completion(&ent->handling); init_completion(&ent->handling);
init_completion(&ent->slotted);
if (!callback) if (!callback)
init_completion(&ent->done); init_completion(&ent->done);
...@@ -1225,7 +1248,7 @@ static int mlx5_cmd_invoke(struct mlx5_core_dev *dev, struct mlx5_cmd_msg *in, ...@@ -1225,7 +1248,7 @@ static int mlx5_cmd_invoke(struct mlx5_core_dev *dev, struct mlx5_cmd_msg *in,
return 0; /* mlx5_cmd_comp_handler() will put(ent) */ return 0; /* mlx5_cmd_comp_handler() will put(ent) */
err = wait_func(dev, ent); err = wait_func(dev, ent);
if (err == -ETIMEDOUT || err == -ECANCELED) if (err == -ETIMEDOUT || err == -ECANCELED || err == -EBUSY)
goto out_free; goto out_free;
ds = ent->ts2 - ent->ts1; ds = ent->ts2 - ent->ts1;
......
...@@ -862,6 +862,7 @@ struct mlx5_cmd_work_ent { ...@@ -862,6 +862,7 @@ struct mlx5_cmd_work_ent {
void *context; void *context;
int idx; int idx;
struct completion handling; struct completion handling;
struct completion slotted;
struct completion done; struct completion done;
struct mlx5_cmd *cmd; struct mlx5_cmd *cmd;
struct work_struct work; struct work_struct work;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment