Commit 9b98d395 authored by Moshe Shemesh, committed by Jakub Kicinski

net/mlx5: Start health poll at earlier stage of driver load

Start the health poll at an earlier stage, so that if a fatal FW issue
occurs before or during initialization commands such as init_hca or
set_hca_cap, the health poll can detect it and indicate that the driver
is already in an error state.
Signed-off-by: Moshe Shemesh <moshe@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parent 16ab85e7
...@@ -843,9 +843,6 @@ void mlx5_start_health_poll(struct mlx5_core_dev *dev) ...@@ -843,9 +843,6 @@ void mlx5_start_health_poll(struct mlx5_core_dev *dev)
health->timer.expires = jiffies + msecs_to_jiffies(poll_interval_ms); health->timer.expires = jiffies + msecs_to_jiffies(poll_interval_ms);
add_timer(&health->timer); add_timer(&health->timer);
if (mlx5_core_is_pf(dev) && MLX5_CAP_MCAM_REG(dev, mrtc))
queue_delayed_work(health->wq, &health->update_fw_log_ts_work, 0);
} }
void mlx5_stop_health_poll(struct mlx5_core_dev *dev, bool disable_health) void mlx5_stop_health_poll(struct mlx5_core_dev *dev, bool disable_health)
...@@ -862,6 +859,14 @@ void mlx5_stop_health_poll(struct mlx5_core_dev *dev, bool disable_health) ...@@ -862,6 +859,14 @@ void mlx5_stop_health_poll(struct mlx5_core_dev *dev, bool disable_health)
del_timer_sync(&health->timer); del_timer_sync(&health->timer);
} }
/*
 * Kick off the update_fw_log_ts_work delayed work on the health workqueue.
 *
 * The work is queued with zero delay, and only when @dev is a PF and
 * MLX5_CAP_MCAM_REG(dev, mrtc) is set; otherwise this is a no-op.
 */
void mlx5_start_health_fw_log_up(struct mlx5_core_dev *dev)
{
	struct mlx5_core_health *health = &dev->priv.health;

	/* Non-PF functions never queue the work; the cap is not even read. */
	if (!mlx5_core_is_pf(dev))
		return;

	if (!MLX5_CAP_MCAM_REG(dev, mrtc))
		return;

	queue_delayed_work(health->wq, &health->update_fw_log_ts_work, 0);
}
void mlx5_drain_health_wq(struct mlx5_core_dev *dev) void mlx5_drain_health_wq(struct mlx5_core_dev *dev)
{ {
struct mlx5_core_health *health = &dev->priv.health; struct mlx5_core_health *health = &dev->priv.health;
......
...@@ -1092,7 +1092,7 @@ static void mlx5_cleanup_once(struct mlx5_core_dev *dev) ...@@ -1092,7 +1092,7 @@ static void mlx5_cleanup_once(struct mlx5_core_dev *dev)
mlx5_devcom_unregister_device(dev->priv.devcom); mlx5_devcom_unregister_device(dev->priv.devcom);
} }
static int mlx5_function_setup(struct mlx5_core_dev *dev, u64 timeout) static int mlx5_function_setup(struct mlx5_core_dev *dev, bool boot, u64 timeout)
{ {
int err; int err;
...@@ -1130,10 +1130,12 @@ static int mlx5_function_setup(struct mlx5_core_dev *dev, u64 timeout) ...@@ -1130,10 +1130,12 @@ static int mlx5_function_setup(struct mlx5_core_dev *dev, u64 timeout)
mlx5_cmd_set_state(dev, MLX5_CMDIF_STATE_UP); mlx5_cmd_set_state(dev, MLX5_CMDIF_STATE_UP);
mlx5_start_health_poll(dev);
err = mlx5_core_enable_hca(dev, 0); err = mlx5_core_enable_hca(dev, 0);
if (err) { if (err) {
mlx5_core_err(dev, "enable hca failed\n"); mlx5_core_err(dev, "enable hca failed\n");
goto err_cmd_cleanup; goto stop_health_poll;
} }
err = mlx5_core_set_issi(dev); err = mlx5_core_set_issi(dev);
...@@ -1185,8 +1187,7 @@ static int mlx5_function_setup(struct mlx5_core_dev *dev, u64 timeout) ...@@ -1185,8 +1187,7 @@ static int mlx5_function_setup(struct mlx5_core_dev *dev, u64 timeout)
mlx5_core_err(dev, "query hca failed\n"); mlx5_core_err(dev, "query hca failed\n");
goto reclaim_boot_pages; goto reclaim_boot_pages;
} }
mlx5_start_health_fw_log_up(dev);
mlx5_start_health_poll(dev);
return 0; return 0;
...@@ -1194,6 +1195,8 @@ static int mlx5_function_setup(struct mlx5_core_dev *dev, u64 timeout) ...@@ -1194,6 +1195,8 @@ static int mlx5_function_setup(struct mlx5_core_dev *dev, u64 timeout)
mlx5_reclaim_startup_pages(dev); mlx5_reclaim_startup_pages(dev);
err_disable_hca: err_disable_hca:
mlx5_core_disable_hca(dev, 0); mlx5_core_disable_hca(dev, 0);
stop_health_poll:
mlx5_stop_health_poll(dev, boot);
err_cmd_cleanup: err_cmd_cleanup:
mlx5_cmd_set_state(dev, MLX5_CMDIF_STATE_DOWN); mlx5_cmd_set_state(dev, MLX5_CMDIF_STATE_DOWN);
mlx5_cmd_cleanup(dev); mlx5_cmd_cleanup(dev);
...@@ -1205,7 +1208,6 @@ static int mlx5_function_teardown(struct mlx5_core_dev *dev, bool boot) ...@@ -1205,7 +1208,6 @@ static int mlx5_function_teardown(struct mlx5_core_dev *dev, bool boot)
{ {
int err; int err;
mlx5_stop_health_poll(dev, boot);
err = mlx5_cmd_teardown_hca(dev); err = mlx5_cmd_teardown_hca(dev);
if (err) { if (err) {
mlx5_core_err(dev, "tear_down_hca failed, skip cleanup\n"); mlx5_core_err(dev, "tear_down_hca failed, skip cleanup\n");
...@@ -1213,6 +1215,7 @@ static int mlx5_function_teardown(struct mlx5_core_dev *dev, bool boot) ...@@ -1213,6 +1215,7 @@ static int mlx5_function_teardown(struct mlx5_core_dev *dev, bool boot)
} }
mlx5_reclaim_startup_pages(dev); mlx5_reclaim_startup_pages(dev);
mlx5_core_disable_hca(dev, 0); mlx5_core_disable_hca(dev, 0);
mlx5_stop_health_poll(dev, boot);
mlx5_cmd_set_state(dev, MLX5_CMDIF_STATE_DOWN); mlx5_cmd_set_state(dev, MLX5_CMDIF_STATE_DOWN);
mlx5_cmd_cleanup(dev); mlx5_cmd_cleanup(dev);
...@@ -1362,7 +1365,7 @@ int mlx5_init_one(struct mlx5_core_dev *dev) ...@@ -1362,7 +1365,7 @@ int mlx5_init_one(struct mlx5_core_dev *dev)
mutex_lock(&dev->intf_state_mutex); mutex_lock(&dev->intf_state_mutex);
dev->state = MLX5_DEVICE_STATE_UP; dev->state = MLX5_DEVICE_STATE_UP;
err = mlx5_function_setup(dev, mlx5_tout_ms(dev, FW_PRE_INIT_TIMEOUT)); err = mlx5_function_setup(dev, true, mlx5_tout_ms(dev, FW_PRE_INIT_TIMEOUT));
if (err) if (err)
goto err_function; goto err_function;
...@@ -1450,7 +1453,7 @@ int mlx5_load_one_devl_locked(struct mlx5_core_dev *dev, bool recovery) ...@@ -1450,7 +1453,7 @@ int mlx5_load_one_devl_locked(struct mlx5_core_dev *dev, bool recovery)
timeout = mlx5_tout_ms(dev, FW_PRE_INIT_ON_RECOVERY_TIMEOUT); timeout = mlx5_tout_ms(dev, FW_PRE_INIT_ON_RECOVERY_TIMEOUT);
else else
timeout = mlx5_tout_ms(dev, FW_PRE_INIT_TIMEOUT); timeout = mlx5_tout_ms(dev, FW_PRE_INIT_TIMEOUT);
err = mlx5_function_setup(dev, timeout); err = mlx5_function_setup(dev, false, timeout);
if (err) if (err)
goto err_function; goto err_function;
......
...@@ -1017,6 +1017,7 @@ void mlx5_health_cleanup(struct mlx5_core_dev *dev); ...@@ -1017,6 +1017,7 @@ void mlx5_health_cleanup(struct mlx5_core_dev *dev);
int mlx5_health_init(struct mlx5_core_dev *dev); int mlx5_health_init(struct mlx5_core_dev *dev);
void mlx5_start_health_poll(struct mlx5_core_dev *dev); void mlx5_start_health_poll(struct mlx5_core_dev *dev);
void mlx5_stop_health_poll(struct mlx5_core_dev *dev, bool disable_health); void mlx5_stop_health_poll(struct mlx5_core_dev *dev, bool disable_health);
void mlx5_start_health_fw_log_up(struct mlx5_core_dev *dev);
void mlx5_drain_health_wq(struct mlx5_core_dev *dev); void mlx5_drain_health_wq(struct mlx5_core_dev *dev);
void mlx5_trigger_health_work(struct mlx5_core_dev *dev); void mlx5_trigger_health_work(struct mlx5_core_dev *dev);
int mlx5_frag_buf_alloc_node(struct mlx5_core_dev *dev, int size, int mlx5_frag_buf_alloc_node(struct mlx5_core_dev *dev, int size,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment