Merge tag 'mlx5-updates-2023-03-20' of git://git.kernel.org/pub/scm/linux/kernel/git/saeed/linux

Saeed Mahameed says: ==================== mlx5-updates-2023-03-20 mlx5 dynamic msix This patch series adds support for dynamic msix vectors allocation in mlx5. Eli Cohen Says: ================ The following series of patches modifies mlx5_core to work with the dynamic MSIX API. Currently, mlx5_core allocates all the interrupt vectors it needs and distributes them amongst the consumers. With the introduction of dynamic MSIX support, which allows for allocation of interrupts more than once, we now allocate vectors as we need them. This allows other drivers running on top of mlx5_core to allocate interrupt vectors for their own use. An example for this is mlx5_vdpa, which uses these vectors to propagate interrupts directly from the hardware to the vCPU [1]. As a preparation for using this series, a use after free issue is fixed in lib/cpu_rmap.c and the allocator for rmap entries has been modified. A complementary API for irq_cpu_rmap_add() has also been introduced. [1] https://git.kernel.org/pub/scm/linux/kernel/git/saeed/linux.git/patch/?id=0f2bf1fcae96a83b8c5581854713c9fc3407556e ================ * tag 'mlx5-updates-2023-03-20' of git://git.kernel.org/pub/scm/linux/kernel/git/saeed/linux: net/mlx5: Provide external API for allocating vectors net/mlx5: Use one completion vector if eth is disabled net/mlx5: Refactor calculation of required completion vectors net/mlx5: Move devlink registration before mlx5_load net/mlx5: Use dynamic msix vectors allocation net/mlx5: Refactor completion irq request/release code net/mlx5: Improve naming of pci function vectors net/mlx5: Use newer affinity descriptor net/mlx5: Modify struct mlx5_irq to use struct msi_map net/mlx5: Fix wrong comment net/mlx5e: Coding style fix, add empty line lib: cpu_rmap: Add irq_cpu_rmap_remove to complement irq_cpu_rmap_add lib: cpu_rmap: Use allocator for rmap entries lib: cpu_rmap: Avoid use after free on rmap->obj array entries ==================== Link: https://lore.kernel.org/r/20230324231341.29808-1-saeed@kernel.orgSigned-off-by: Jakub Kicinski <kuba@kernel.org>

Merge tag 'mlx5-updates-2023-03-20' of git://git.kernel.org/pub/scm/linux/kernel/git/saeed/linux
Saeed Mahameed says: ==================== mlx5-updates-2023-03-20 mlx5 dynamic msix This patch series adds support for dynamic msix vectors allocation in mlx5. Eli Cohen Says: ================ The following series of patches modifies mlx5_core to work with the dynamic MSIX API. Currently, mlx5_core allocates all the interrupt vectors it needs and distributes them amongst the consumers. With the introduction of dynamic MSIX support, which allows for allocation of interrupts more than once, we now allocate vectors as we need them. This allows other drivers running on top of mlx5_core to allocate interrupt vectors for their own use. An example for this is mlx5_vdpa, which uses these vectors to propagate interrupts directly from the hardware to the vCPU [1]. As a preparation for using this series, a use after free issue is fixed in lib/cpu_rmap.c and the allocator for rmap entries has been modified. A complementary API for irq_cpu_rmap_add() has also been introduced. [1] https://git.kernel.org/pub/scm/linux/kernel/git/saeed/linux.git/patch/?id=0f2bf1fcae96a83b8c5581854713c9fc3407556e ================ * tag 'mlx5-updates-2023-03-20' of git://git.kernel.org/pub/scm/linux/kernel/git/saeed/linux: net/mlx5: Provide external API for allocating vectors net/mlx5: Use one completion vector if eth is disabled net/mlx5: Refactor calculation of required completion vectors net/mlx5: Move devlink registration before mlx5_load net/mlx5: Use dynamic msix vectors allocation net/mlx5: Refactor completion irq request/release code net/mlx5: Improve naming of pci function vectors net/mlx5: Use newer affinity descriptor net/mlx5: Modify struct mlx5_irq to use struct msi_map net/mlx5: Fix wrong comment net/mlx5e: Coding style fix, add empty line lib: cpu_rmap: Add irq_cpu_rmap_remove to complement irq_cpu_rmap_add lib: cpu_rmap: Use allocator for rmap entries lib: cpu_rmap: Avoid use after free on rmap->obj array entries ==================== Link: https://lore.kernel.org/r/20230324231341.29808-1-saeed@kernel.orgSigned-off-by: Jakub Kicinski <kuba@kernel.org>
de749452 · Jakub Kicinski · e70f94c6 · fb0a6a26 · de749452 · de749452
Commit de749452 authored Mar 28, 2023 by Jakub Kicinski
11 changed files
--- a/drivers/net/ethernet/mellanox/mlx5/core/dev.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/dev.c
@@ -35,6 +35,7 @@
 #include <linux/mlx5/mlx5_ifc_vdpa.h>
 #include <linux/mlx5/vport.h>
 #include "mlx5_core.h"
+#include "devlink.h"

 /* intf dev list mutex */
 static DEFINE_MUTEX(mlx5_intf_mutex);
@@ -109,17 +110,6 @@ bool mlx5_eth_supported(struct mlx5_core_dev *dev)
 	return true;
 }

-static bool is_eth_enabled(struct mlx5_core_dev *dev)
-{
-	union devlink_param_value val;
-	int err;
-
-	err = devl_param_driverinit_value_get(priv_to_devlink(dev),
-					      DEVLINK_PARAM_GENERIC_ID_ENABLE_ETH,
-					      &val);
-	return err ? false : val.vbool;
-}
-
 bool mlx5_vnet_supported(struct mlx5_core_dev *dev)
 {
 	if (!IS_ENABLED(CONFIG_MLX5_VDPA_NET))
@@ -251,7 +241,7 @@ static const struct mlx5_adev_device {
 					 .is_enabled = &is_ib_enabled },
 	[MLX5_INTERFACE_PROTOCOL_ETH] = { .suffix = "eth",
 					  .is_supported = &mlx5_eth_supported,
-					  .is_enabled = &is_eth_enabled },
+					  .is_enabled = &mlx5_core_is_eth_enabled },
 	[MLX5_INTERFACE_PROTOCOL_ETH_REP] = { .suffix = "eth-rep",
 					   .is_supported = &is_eth_rep_supported },
 	[MLX5_INTERFACE_PROTOCOL_IB_REP] = { .suffix = "rdma-rep",

--- a/drivers/net/ethernet/mellanox/mlx5/core/devlink.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/devlink.h
@@ -46,4 +46,15 @@ void mlx5_devlink_free(struct devlink *devlink);
 int mlx5_devlink_params_register(struct devlink *devlink);
 void mlx5_devlink_params_unregister(struct devlink *devlink);

+static inline bool mlx5_core_is_eth_enabled(struct mlx5_core_dev *dev)
+{
+	union devlink_param_value val;
+	int err;
+
+	err = devl_param_driverinit_value_get(priv_to_devlink(dev),
+					      DEVLINK_PARAM_GENERIC_ID_ENABLE_ETH,
+					      &val);
+	return err ? false : val.vbool;
+}
+
 #endif /* __MLX5_DEVLINK_H__ */
--- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
@@ -18,6 +18,7 @@
 #include "lib/clock.h"
 #include "diag/fw_tracer.h"
 #include "mlx5_irq.h"
+#include "pci_irq.h"
 #include "devlink.h"
 #include "en_accel/ipsec.h"

@@ -61,9 +62,7 @@ struct mlx5_eq_table {
 	struct mlx5_irq_table	*irq_table;
 	struct mlx5_irq         **comp_irqs;
 	struct mlx5_irq         *ctrl_irq;
-#ifdef CONFIG_RFS_ACCEL
 	struct cpu_rmap		*rmap;
-#endif
 };

 #define MLX5_ASYNC_EVENT_MASK ((1ull << MLX5_EVENT_TYPE_PATH_MIG)	    | \
@@ -637,6 +636,7 @@ static u16 async_eq_depth_devlink_param_get(struct mlx5_core_dev *dev)
 	mlx5_core_dbg(dev, "Failed to get param. using default. err = %d\n", err);
 	return MLX5_NUM_ASYNC_EQE;
 }
+
 static int create_async_eqs(struct mlx5_core_dev *dev)
 {
 	struct mlx5_eq_table *table = dev->priv.eq_table;
@@ -803,44 +803,28 @@ void mlx5_eq_update_ci(struct mlx5_eq *eq, u32 cc, bool arm)
 }
 EXPORT_SYMBOL(mlx5_eq_update_ci);

-static void comp_irqs_release(struct mlx5_core_dev *dev)
+static void comp_irqs_release_pci(struct mlx5_core_dev *dev)
 {
 	struct mlx5_eq_table *table = dev->priv.eq_table;

-	if (mlx5_core_is_sf(dev))
-		mlx5_irq_affinity_irqs_release(dev, table->comp_irqs, table->num_comp_eqs);
-	else
-		mlx5_irqs_release_vectors(table->comp_irqs, table->num_comp_eqs);
-	kfree(table->comp_irqs);
+	mlx5_irqs_release_vectors(table->comp_irqs, table->num_comp_eqs);
 }

-static int comp_irqs_request(struct mlx5_core_dev *dev)
+static int comp_irqs_request_pci(struct mlx5_core_dev *dev)
 {
 	struct mlx5_eq_table *table = dev->priv.eq_table;
 	const struct cpumask *prev = cpu_none_mask;
 	const struct cpumask *mask;
-	int ncomp_eqs = table->num_comp_eqs;
+	int ncomp_eqs;
 	u16 *cpus;
 	int ret;
 	int cpu;
 	int i;

 	ncomp_eqs = table->num_comp_eqs;
-	table->comp_irqs = kcalloc(ncomp_eqs, sizeof(*table->comp_irqs), GFP_KERNEL);
-	if (!table->comp_irqs)
-		return -ENOMEM;
-	if (mlx5_core_is_sf(dev)) {
-		ret = mlx5_irq_affinity_irqs_request_auto(dev, ncomp_eqs, table->comp_irqs);
-		if (ret < 0)
-			goto free_irqs;
-		return ret;
-	}
-
 	cpus = kcalloc(ncomp_eqs, sizeof(*cpus), GFP_KERNEL);
-	if (!cpus) {
+	if (!cpus)
 		ret = -ENOMEM;
-		goto free_irqs;
-	}

 	i = 0;
 	rcu_read_lock();
@@ -854,17 +838,89 @@ static int comp_irqs_request(struct mlx5_core_dev *dev)
 	}
 spread_done:
 	rcu_read_unlock();
-	ret = mlx5_irqs_request_vectors(dev, cpus, ncomp_eqs, table->comp_irqs);
+	ret = mlx5_irqs_request_vectors(dev, cpus, ncomp_eqs, table->comp_irqs, &table->rmap);
 	kfree(cpus);
-	if (ret < 0)
-		goto free_irqs;
 	return ret;
+}
+
+static void comp_irqs_release_sf(struct mlx5_core_dev *dev)
+{
+	struct mlx5_eq_table *table = dev->priv.eq_table;
+
+	mlx5_irq_affinity_irqs_release(dev, table->comp_irqs, table->num_comp_eqs);
+}
+
+static int comp_irqs_request_sf(struct mlx5_core_dev *dev)
+{
+	struct mlx5_eq_table *table = dev->priv.eq_table;
+	int ncomp_eqs = table->num_comp_eqs;
+
+	return mlx5_irq_affinity_irqs_request_auto(dev, ncomp_eqs, table->comp_irqs);
+}
+
+static void comp_irqs_release(struct mlx5_core_dev *dev)
+{
+	struct mlx5_eq_table *table = dev->priv.eq_table;
+
+	mlx5_core_is_sf(dev) ? comp_irqs_release_sf(dev) :
+			       comp_irqs_release_pci(dev);

-free_irqs:
 	kfree(table->comp_irqs);
+}
+
+static int comp_irqs_request(struct mlx5_core_dev *dev)
+{
+	struct mlx5_eq_table *table = dev->priv.eq_table;
+	int ncomp_eqs;
+	int ret;
+
+	ncomp_eqs = table->num_comp_eqs;
+	table->comp_irqs = kcalloc(ncomp_eqs, sizeof(*table->comp_irqs), GFP_KERNEL);
+	if (!table->comp_irqs)
+		return -ENOMEM;
+
+	ret = mlx5_core_is_sf(dev) ? comp_irqs_request_sf(dev) :
+				     comp_irqs_request_pci(dev);
+	if (ret < 0)
+		kfree(table->comp_irqs);
+
 	return ret;
 }

+#ifdef CONFIG_RFS_ACCEL
+static int alloc_rmap(struct mlx5_core_dev *mdev)
+{
+	struct mlx5_eq_table *eq_table = mdev->priv.eq_table;
+
+	/* rmap is a mapping between irq number and queue number.
+	 * Each irq can be assigned only to a single rmap.
+	 * Since SFs share IRQs, rmap mapping cannot function correctly
+	 * for irqs that are shared between different core/netdev RX rings.
+	 * Hence we don't allow netdev rmap for SFs.
+	 */
+	if (mlx5_core_is_sf(mdev))
+		return 0;
+
+	eq_table->rmap = alloc_irq_cpu_rmap(eq_table->num_comp_eqs);
+	if (!eq_table->rmap)
+		return -ENOMEM;
+	return 0;
+}
+
+static void free_rmap(struct mlx5_core_dev *mdev)
+{
+	struct mlx5_eq_table *eq_table = mdev->priv.eq_table;
+
+	if (eq_table->rmap) {
+		free_irq_cpu_rmap(eq_table->rmap);
+		eq_table->rmap = NULL;
+	}
+}
+#else
+static int alloc_rmap(struct mlx5_core_dev *mdev) { return 0; }
+static void free_rmap(struct mlx5_core_dev *mdev) {}
+#endif
+
 static void destroy_comp_eqs(struct mlx5_core_dev *dev)
 {
 	struct mlx5_eq_table *table = dev->priv.eq_table;
@@ -880,6 +936,7 @@ static void destroy_comp_eqs(struct mlx5_core_dev *dev)
 		kfree(eq);
 	}
 	comp_irqs_release(dev);
+	free_rmap(dev);
 }

 static u16 comp_eq_depth_devlink_param_get(struct mlx5_core_dev *dev)
@@ -906,9 +963,16 @@ static int create_comp_eqs(struct mlx5_core_dev *dev)
 	int err;
 	int i;

+	err = alloc_rmap(dev);
+	if (err)
+		return err;
+
 	ncomp_eqs = comp_irqs_request(dev);
-	if (ncomp_eqs < 0)
-		return ncomp_eqs;
+	if (ncomp_eqs < 0) {
+		err = ncomp_eqs;
+		goto err_irqs_req;
+	}
+
 	INIT_LIST_HEAD(&table->comp_eqs_list);
 	nent = comp_eq_depth_devlink_param_get(dev);

@@ -953,6 +1017,8 @@ static int create_comp_eqs(struct mlx5_core_dev *dev)
 	kfree(eq);
 clean:
 	destroy_comp_eqs(dev);
+err_irqs_req:
+	free_rmap(dev);
 	return err;
 }

@@ -1031,55 +1097,12 @@ struct mlx5_eq_comp *mlx5_eqn2comp_eq(struct mlx5_core_dev *dev, int eqn)
 	return ERR_PTR(-ENOENT);
 }

-static void clear_rmap(struct mlx5_core_dev *dev)
-{
-#ifdef CONFIG_RFS_ACCEL
-	struct mlx5_eq_table *eq_table = dev->priv.eq_table;
-
-	free_irq_cpu_rmap(eq_table->rmap);
-#endif
-}
-
-static int set_rmap(struct mlx5_core_dev *mdev)
-{
-	int err = 0;
-#ifdef CONFIG_RFS_ACCEL
-	struct mlx5_eq_table *eq_table = mdev->priv.eq_table;
-	int vecidx;
-
-	eq_table->rmap = alloc_irq_cpu_rmap(eq_table->num_comp_eqs);
-	if (!eq_table->rmap) {
-		err = -ENOMEM;
-		mlx5_core_err(mdev, "Failed to allocate cpu_rmap. err %d", err);
-		goto err_out;
-	}
-
-	for (vecidx = 0; vecidx < eq_table->num_comp_eqs; vecidx++) {
-		err = irq_cpu_rmap_add(eq_table->rmap,
-				       pci_irq_vector(mdev->pdev, vecidx));
-		if (err) {
-			mlx5_core_err(mdev, "irq_cpu_rmap_add failed. err %d",
-				      err);
-			goto err_irq_cpu_rmap_add;
-		}
-	}
-	return 0;
-
-err_irq_cpu_rmap_add:
-	clear_rmap(mdev);
-err_out:
-#endif
-	return err;
-}
-
 /* This function should only be called after mlx5_cmd_force_teardown_hca */
 void mlx5_core_eq_free_irqs(struct mlx5_core_dev *dev)
 {
 	struct mlx5_eq_table *table = dev->priv.eq_table;

 	mutex_lock(&table->lock); /* sync with create/destroy_async_eq */
-	if (!mlx5_core_is_sf(dev))
-		clear_rmap(dev);
 	mlx5_irq_table_destroy(dev);
 	mutex_unlock(&table->lock);
 }
@@ -1090,44 +1113,47 @@ void mlx5_core_eq_free_irqs(struct mlx5_core_dev *dev)
 #define MLX5_MAX_ASYNC_EQS 3
 #endif

-int mlx5_eq_table_create(struct mlx5_core_dev *dev)
+static int get_num_eqs(struct mlx5_core_dev *dev)
 {
 	struct mlx5_eq_table *eq_table = dev->priv.eq_table;
-	int num_eqs = MLX5_CAP_GEN(dev, max_num_eqs) ?
+	int max_dev_eqs;
+	int max_eqs_sf;
+	int num_eqs;
+
+	/* If ethernet is disabled we use just a single completion vector to
+	 * have the other vectors available for other drivers using mlx5_core. For
+	 * example, mlx5_vdpa
+	 */
+	if (!mlx5_core_is_eth_enabled(dev) && mlx5_eth_supported(dev))
+		return 1;
+
+	max_dev_eqs = MLX5_CAP_GEN(dev, max_num_eqs) ?
 		      MLX5_CAP_GEN(dev, max_num_eqs) :
 		      1 << MLX5_CAP_GEN(dev, log_max_eq);
-	int max_eqs_sf;
-	int err;

-	eq_table->num_comp_eqs =
-		min_t(int,
-		      mlx5_irq_table_get_num_comp(eq_table->irq_table),
-		      num_eqs - MLX5_MAX_ASYNC_EQS);
+	num_eqs = min_t(int, mlx5_irq_table_get_num_comp(eq_table->irq_table),
+			max_dev_eqs - MLX5_MAX_ASYNC_EQS);
 	if (mlx5_core_is_sf(dev)) {
 		max_eqs_sf = min_t(int, MLX5_COMP_EQS_PER_SF,
 				   mlx5_irq_table_get_sfs_vec(eq_table->irq_table));
-		eq_table->num_comp_eqs = min_t(int, eq_table->num_comp_eqs,
-					       max_eqs_sf);
+		num_eqs = min_t(int, num_eqs, max_eqs_sf);
 	}

+	return num_eqs;
+}
+
+int mlx5_eq_table_create(struct mlx5_core_dev *dev)
+{
+	struct mlx5_eq_table *eq_table = dev->priv.eq_table;
+	int err;
+
+	eq_table->num_comp_eqs = get_num_eqs(dev);
 	err = create_async_eqs(dev);
 	if (err) {
 		mlx5_core_err(dev, "Failed to create async EQs\n");
 		goto err_async_eqs;
 	}

-	if (!mlx5_core_is_sf(dev)) {
-		/* rmap is a mapping between irq number and queue number.
-		 * each irq can be assign only to a single rmap.
-		 * since SFs share IRQs, rmap mapping cannot function correctly
-		 * for irqs that are shared for different core/netdev RX rings.
-		 * Hence we don't allow netdev rmap for SFs
-		 */
-		err = set_rmap(dev);
-		if (err)
-			goto err_rmap;
-	}
-
 	err = create_comp_eqs(dev);
 	if (err) {
 		mlx5_core_err(dev, "Failed to create completion EQs\n");
@@ -1135,10 +1161,8 @@ int mlx5_eq_table_create(struct mlx5_core_dev *dev)
 	}

 	return 0;
+
 err_comp_eqs:
-	if (!mlx5_core_is_sf(dev))
-		clear_rmap(dev);
-err_rmap:
 	destroy_async_eqs(dev);
 err_async_eqs:
 	return err;
@@ -1146,8 +1170,6 @@ int mlx5_eq_table_create(struct mlx5_core_dev *dev)

 void mlx5_eq_table_destroy(struct mlx5_core_dev *dev)
 {
-	if (!mlx5_core_is_sf(dev))
-		clear_rmap(dev);
 	destroy_comp_eqs(dev);
 	destroy_async_eqs(dev);
 }

--- a/drivers/net/ethernet/mellanox/mlx5/core/irq_affinity.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/irq_affinity.c
@@ -45,30 +45,28 @@ static int cpu_get_least_loaded(struct mlx5_irq_pool *pool,

 /* Creating an IRQ from irq_pool */
 static struct mlx5_irq *
-irq_pool_request_irq(struct mlx5_irq_pool *pool, const struct cpumask *req_mask)
+irq_pool_request_irq(struct mlx5_irq_pool *pool, struct irq_affinity_desc *af_desc)
 {
-	cpumask_var_t auto_mask;
-	struct mlx5_irq *irq;
+	struct irq_affinity_desc auto_desc = {};
 	u32 irq_index;
 	int err;

-	if (!zalloc_cpumask_var(&auto_mask, GFP_KERNEL))
-		return ERR_PTR(-ENOMEM);
 	err = xa_alloc(&pool->irqs, &irq_index, NULL, pool->xa_num_irqs, GFP_KERNEL);
 	if (err)
 		return ERR_PTR(err);
 	if (pool->irqs_per_cpu) {
-		if (cpumask_weight(req_mask) > 1)
+		if (cpumask_weight(&af_desc->mask) > 1)
 			/* if req_mask contain more then one CPU, set the least loadad CPU
 			 * of req_mask
 			 */
-			cpumask_set_cpu(cpu_get_least_loaded(pool, req_mask), auto_mask);
+			cpumask_set_cpu(cpu_get_least_loaded(pool, &af_desc->mask),
+					&auto_desc.mask);
 		else
-			cpu_get(pool, cpumask_first(req_mask));
+			cpu_get(pool, cpumask_first(&af_desc->mask));
 	}
-	irq = mlx5_irq_alloc(pool, irq_index, cpumask_empty(auto_mask) ? req_mask : auto_mask);
-	free_cpumask_var(auto_mask);
-	return irq;
+	return mlx5_irq_alloc(pool, irq_index,
+			      cpumask_empty(&auto_desc.mask) ? af_desc : &auto_desc,
+			      NULL);
 }

 /* Looking for the IRQ with the smallest refcount that fits req_mask.
@@ -115,22 +113,22 @@ irq_pool_find_least_loaded(struct mlx5_irq_pool *pool, const struct cpumask *req
 /**
 * mlx5_irq_affinity_request - request an IRQ according to the given mask.
 * @pool: IRQ pool to request from.
- * @req_mask: cpumask requested for this IRQ.
+ * @af_desc: affinity descriptor for this IRQ.
 *
 * This function returns a pointer to IRQ, or ERR_PTR in case of error.
 */
 struct mlx5_irq *
-mlx5_irq_affinity_request(struct mlx5_irq_pool *pool, const struct cpumask *req_mask)
+mlx5_irq_affinity_request(struct mlx5_irq_pool *pool, struct irq_affinity_desc *af_desc)
 {
 	struct mlx5_irq *least_loaded_irq, *new_irq;

 	mutex_lock(&pool->lock);
-	least_loaded_irq = irq_pool_find_least_loaded(pool, req_mask);
+	least_loaded_irq = irq_pool_find_least_loaded(pool, &af_desc->mask);
 	if (least_loaded_irq &&
 	    mlx5_irq_read_locked(least_loaded_irq) < pool->min_threshold)
 		goto out;
 	/* We didn't find an IRQ with less than min_thres, try to allocate a new IRQ */
-	new_irq = irq_pool_request_irq(pool, req_mask);
+	new_irq = irq_pool_request_irq(pool, af_desc);
 	if (IS_ERR(new_irq)) {
 		if (!least_loaded_irq) {
 			/* We failed to create an IRQ and we didn't find an IRQ */
@@ -194,32 +192,30 @@ int mlx5_irq_affinity_irqs_request_auto(struct mlx5_core_dev *dev, int nirqs,
 					struct mlx5_irq **irqs)
 {
 	struct mlx5_irq_pool *pool = mlx5_irq_pool_get(dev);
-	cpumask_var_t req_mask;
+	struct irq_affinity_desc af_desc = {};
 	struct mlx5_irq *irq;
 	int i = 0;

-	if (!zalloc_cpumask_var(&req_mask, GFP_KERNEL))
-		return -ENOMEM;
-	cpumask_copy(req_mask, cpu_online_mask);
+	af_desc.is_managed = 1;
+	cpumask_copy(&af_desc.mask, cpu_online_mask);
 	for (i = 0; i < nirqs; i++) {
 		if (mlx5_irq_pool_is_sf_pool(pool))
-			irq = mlx5_irq_affinity_request(pool, req_mask);
+			irq = mlx5_irq_affinity_request(pool, &af_desc);
 		else
 			/* In case SF pool doesn't exists, fallback to the PF IRQs.
 			 * The PF IRQs are already allocated and binded to CPU
 			 * at this point. Hence, only an index is needed.
 			 */
-			irq = mlx5_irq_request(dev, i, NULL);
+			irq = mlx5_irq_request(dev, i, NULL, NULL);
 		if (IS_ERR(irq))
 			break;
 		irqs[i] = irq;
-		cpumask_clear_cpu(cpumask_first(mlx5_irq_get_affinity_mask(irq)), req_mask);
+		cpumask_clear_cpu(cpumask_first(mlx5_irq_get_affinity_mask(irq)), &af_desc.mask);
 		mlx5_core_dbg(pool->dev, "IRQ %u mapped to cpu %*pbl, %u EQs on this irq\n",
 			      pci_irq_vector(dev->pdev, mlx5_irq_get_index(irq)),
 			      cpumask_pr_args(mlx5_irq_get_affinity_mask(irq)),
 			      mlx5_irq_read_locked(irq) / MLX5_EQ_REFS_PER_IRQ);
 	}
-	free_cpumask_var(req_mask);
 	if (!i)
 		return PTR_ERR(irq);
 	return i;

--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -46,9 +46,6 @@
 #include <linux/kmod.h>
 #include <linux/mlx5/mlx5_ifc.h>
 #include <linux/mlx5/vport.h>
-#ifdef CONFIG_RFS_ACCEL
-#include <linux/cpu_rmap.h>
-#endif
 #include <linux/version.h>
 #include <net/devlink.h>
 #include "mlx5_core.h"
@@ -1401,16 +1398,16 @@ int mlx5_init_one(struct mlx5_core_dev *dev)
 		goto function_teardown;
 	}

+	err = mlx5_devlink_params_register(priv_to_devlink(dev));
+	if (err)
+		goto err_devlink_params_reg;
+
 	err = mlx5_load(dev);
 	if (err)
 		goto err_load;

 	set_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state);

-	err = mlx5_devlink_params_register(priv_to_devlink(dev));
-	if (err)
-		goto err_devlink_params_reg;
-
 	err = mlx5_register_device(dev);
 	if (err)
 		goto err_register;
@@ -1420,11 +1417,11 @@ int mlx5_init_one(struct mlx5_core_dev *dev)
 	return 0;

 err_register:
-	mlx5_devlink_params_unregister(priv_to_devlink(dev));
-err_devlink_params_reg:
 	clear_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state);
 	mlx5_unload(dev);
 err_load:
+	mlx5_devlink_params_unregister(priv_to_devlink(dev));
+err_devlink_params_reg:
 	mlx5_cleanup_once(dev);
 function_teardown:
 	mlx5_function_teardown(dev, true);
@@ -1443,7 +1440,6 @@ void mlx5_uninit_one(struct mlx5_core_dev *dev)
 	mutex_lock(&dev->intf_state_mutex);

 	mlx5_unregister_device(dev);
-	mlx5_devlink_params_unregister(priv_to_devlink(dev));

 	if (!test_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state)) {
 		mlx5_core_warn(dev, "%s: interface is down, NOP\n",
@@ -1454,6 +1450,7 @@ void mlx5_uninit_one(struct mlx5_core_dev *dev)

 	clear_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state);
 	mlx5_unload(dev);
+	mlx5_devlink_params_unregister(priv_to_devlink(dev));
 	mlx5_cleanup_once(dev);
 	mlx5_function_teardown(dev, true);
 out:

--- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_irq.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_irq.h
@@ -9,6 +9,7 @@
 #define MLX5_COMP_EQS_PER_SF 8

 struct mlx5_irq;
+struct cpu_rmap;

 int mlx5_irq_table_init(struct mlx5_core_dev *dev);
 void mlx5_irq_table_cleanup(struct mlx5_core_dev *dev);
@@ -25,9 +26,10 @@ int mlx5_get_default_msix_vec_count(struct mlx5_core_dev *dev, int num_vfs);
 struct mlx5_irq *mlx5_ctrl_irq_request(struct mlx5_core_dev *dev);
 void mlx5_ctrl_irq_release(struct mlx5_irq *ctrl_irq);
 struct mlx5_irq *mlx5_irq_request(struct mlx5_core_dev *dev, u16 vecidx,
-				  struct cpumask *affinity);
+				  struct irq_affinity_desc *af_desc,
+				  struct cpu_rmap **rmap);
 int mlx5_irqs_request_vectors(struct mlx5_core_dev *dev, u16 *cpus, int nirqs,
-			      struct mlx5_irq **irqs);
+			      struct mlx5_irq **irqs, struct cpu_rmap **rmap);
 void mlx5_irqs_release_vectors(struct mlx5_irq **irqs, int nirqs);
 int mlx5_irq_attach_nb(struct mlx5_irq *irq, struct notifier_block *nb);
 int mlx5_irq_detach_nb(struct mlx5_irq *irq, struct notifier_block *nb);
@@ -39,7 +41,7 @@ struct mlx5_irq_pool;
 int mlx5_irq_affinity_irqs_request_auto(struct mlx5_core_dev *dev, int nirqs,
 					struct mlx5_irq **irqs);
 struct mlx5_irq *mlx5_irq_affinity_request(struct mlx5_irq_pool *pool,
-					   const struct cpumask *req_mask);
+					   struct irq_affinity_desc *af_desc);
 void mlx5_irq_affinity_irqs_release(struct mlx5_core_dev *dev, struct mlx5_irq **irqs,
 				    int num_irqs);
 #else
@@ -50,7 +52,7 @@ static inline int mlx5_irq_affinity_irqs_request_auto(struct mlx5_core_dev *dev,
 }

 static inline struct mlx5_irq *
-mlx5_irq_affinity_request(struct mlx5_irq_pool *pool, const struct cpumask *req_mask)
+mlx5_irq_affinity_request(struct mlx5_irq_pool *pool, struct irq_affinity_desc *af_desc)
 {
 	return ERR_PTR(-EOPNOTSUPP);
 }

--- a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c
@@ -9,6 +9,7 @@
 #include "mlx5_irq.h"
 #include "pci_irq.h"
 #include "lib/sf.h"
+#include "lib/eq.h"
 #ifdef CONFIG_RFS_ACCEL
 #include <linux/cpu_rmap.h>
 #endif
@@ -29,12 +30,11 @@ struct mlx5_irq {
 	char name[MLX5_MAX_IRQ_NAME];
 	struct mlx5_irq_pool *pool;
 	int refcount;
-	u32 index;
-	int irqn;
+	struct msi_map map;
 };

 struct mlx5_irq_table {
-	struct mlx5_irq_pool *pf_pool;
+	struct mlx5_irq_pool *pcif_pool;
 	struct mlx5_irq_pool *sf_ctrl_pool;
 	struct mlx5_irq_pool *sf_comp_pool;
 };
@@ -127,15 +127,26 @@ int mlx5_set_msix_vec_count(struct mlx5_core_dev *dev, int function_id,
 static void irq_release(struct mlx5_irq *irq)
 {
 	struct mlx5_irq_pool *pool = irq->pool;
+#ifdef CONFIG_RFS_ACCEL
+	struct cpu_rmap *rmap;
+#endif

-	xa_erase(&pool->irqs, irq->index);
-	/* free_irq requires that affinity_hint and rmap will be cleared
-	 * before calling it. This is why there is asymmetry with set_rmap
-	 * which should be called after alloc_irq but before request_irq.
+	xa_erase(&pool->irqs, irq->map.index);
+	/* free_irq requires that affinity_hint and rmap will be cleared before
+	 * calling it. To satisfy this requirement, we call
+	 * irq_cpu_rmap_remove() to remove the notifier
 	 */
-	irq_update_affinity_hint(irq->irqn, NULL);
+	irq_update_affinity_hint(irq->map.virq, NULL);
+#ifdef CONFIG_RFS_ACCEL
+	rmap = mlx5_eq_table_get_rmap(pool->dev);
+	if (rmap && irq->map.index)
+		irq_cpu_rmap_remove(rmap, irq->map.virq);
+#endif
+
 	free_cpumask_var(irq->mask);
-	free_irq(irq->irqn, &irq->nh);
+	free_irq(irq->map.virq, &irq->nh);
+	if (irq->map.index && pci_msix_can_alloc_dyn(pool->dev->pdev))
+		pci_msix_free_irq(pool->dev->pdev, irq->map);
 	kfree(irq);
 }

@@ -198,7 +209,7 @@ static void irq_set_name(struct mlx5_irq_pool *pool, char *name, int vecidx)
 		return;
 	}

-	if (vecidx == pool->xa_num_irqs.max) {
+	if (!vecidx) {
 		snprintf(name, MLX5_MAX_IRQ_NAME, "mlx5_async%d", vecidx);
 		return;
 	}
@@ -207,7 +218,8 @@ static void irq_set_name(struct mlx5_irq_pool *pool, char *name, int vecidx)
 }

 struct mlx5_irq *mlx5_irq_alloc(struct mlx5_irq_pool *pool, int i,
-				const struct cpumask *affinity)
+				struct irq_affinity_desc *af_desc,
+				struct cpu_rmap **rmap)
 {
 	struct mlx5_core_dev *dev = pool->dev;
 	char name[MLX5_MAX_IRQ_NAME];
@@ -217,7 +229,28 @@ struct mlx5_irq *mlx5_irq_alloc(struct mlx5_irq_pool *pool, int i,
 	irq = kzalloc(sizeof(*irq), GFP_KERNEL);
 	if (!irq)
 		return ERR_PTR(-ENOMEM);
-	irq->irqn = pci_irq_vector(dev->pdev, i);
+	if (!i || !pci_msix_can_alloc_dyn(dev->pdev)) {
+		/* The vector at index 0 was already allocated.
+		 * Just get the irq number. If dynamic irq is not supported
+		 * vectors have also been allocated.
+		 */
+		irq->map.virq = pci_irq_vector(dev->pdev, i);
+		irq->map.index = 0;
+	} else {
+		irq->map = pci_msix_alloc_irq_at(dev->pdev, MSI_ANY_INDEX, af_desc);
+		if (!irq->map.virq) {
+			err = irq->map.index;
+			goto err_alloc_irq;
+		}
+	}
+
+	if (i && rmap && *rmap) {
+#ifdef CONFIG_RFS_ACCEL
+		err = irq_cpu_rmap_add(*rmap, irq->map.virq);
+		if (err)
+			goto err_irq_rmap;
+#endif
+	}
 	if (!mlx5_irq_pool_is_sf_pool(pool))
 		irq_set_name(pool, name, i);
 	else
@@ -225,7 +258,7 @@ struct mlx5_irq *mlx5_irq_alloc(struct mlx5_irq_pool *pool, int i,
 	ATOMIC_INIT_NOTIFIER_HEAD(&irq->nh);
 	snprintf(irq->name, MLX5_MAX_IRQ_NAME,
 		 "%s@pci:%s", name, pci_name(dev->pdev));
-	err = request_irq(irq->irqn, irq_int_handler, 0, irq->name,
+	err = request_irq(irq->map.virq, irq_int_handler, 0, irq->name,
 			  &irq->nh);
 	if (err) {
 		mlx5_core_err(dev, "Failed to request irq. err = %d\n", err);
@@ -236,26 +269,37 @@ struct mlx5_irq *mlx5_irq_alloc(struct mlx5_irq_pool *pool, int i,
 		err = -ENOMEM;
 		goto err_cpumask;
 	}
-	if (affinity) {
-		cpumask_copy(irq->mask, affinity);
-		irq_set_affinity_and_hint(irq->irqn, irq->mask);
+	if (af_desc) {
+		cpumask_copy(irq->mask, &af_desc->mask);
+		irq_set_affinity_and_hint(irq->map.virq, irq->mask);
 	}
 	irq->pool = pool;
 	irq->refcount = 1;
-	irq->index = i;
-	err = xa_err(xa_store(&pool->irqs, irq->index, irq, GFP_KERNEL));
+	irq->map.index = i;
+	err = xa_err(xa_store(&pool->irqs, irq->map.index, irq, GFP_KERNEL));
 	if (err) {
 		mlx5_core_err(dev, "Failed to alloc xa entry for irq(%u). err = %d\n",
-			      irq->index, err);
+			      irq->map.index, err);
 		goto err_xa;
 	}
 	return irq;
 err_xa:
-	irq_update_affinity_hint(irq->irqn, NULL);
+	if (af_desc)
+		irq_update_affinity_hint(irq->map.virq, NULL);
 	free_cpumask_var(irq->mask);
 err_cpumask:
-	free_irq(irq->irqn, &irq->nh);
+	free_irq(irq->map.virq, &irq->nh);
 err_req_irq:
+#ifdef CONFIG_RFS_ACCEL
+	if (i && rmap && *rmap) {
+		free_irq_cpu_rmap(*rmap);
+		*rmap = NULL;
+	}
+err_irq_rmap:
+#endif
+	if (i && pci_msix_can_alloc_dyn(dev->pdev))
+		pci_msix_free_irq(dev->pdev, irq->map);
+err_alloc_irq:
 	kfree(irq);
 	return ERR_PTR(err);
 }
@@ -292,7 +336,7 @@ struct cpumask *mlx5_irq_get_affinity_mask(struct mlx5_irq *irq)

 int mlx5_irq_get_index(struct mlx5_irq *irq)
 {
-	return irq->index;
+	return irq->map.index;
 }

 /* irq_pool API */
@@ -300,7 +344,8 @@ int mlx5_irq_get_index(struct mlx5_irq *irq)
 /* requesting an irq from a given pool according to given index */
 static struct mlx5_irq *
 irq_pool_request_vector(struct mlx5_irq_pool *pool, int vecidx,
-			struct cpumask *affinity)
+			struct irq_affinity_desc *af_desc,
+			struct cpu_rmap **rmap)
 {
 	struct mlx5_irq *irq;

@@ -310,7 +355,7 @@ irq_pool_request_vector(struct mlx5_irq_pool *pool, int vecidx,
 		mlx5_irq_get_locked(irq);
 		goto unlock;
 	}
-	irq = mlx5_irq_alloc(pool, vecidx, affinity);
+	irq = mlx5_irq_alloc(pool, vecidx, af_desc, rmap);
 unlock:
 	mutex_unlock(&pool->lock);
 	return irq;
@@ -337,7 +382,7 @@ struct mlx5_irq_pool *mlx5_irq_pool_get(struct mlx5_core_dev *dev)
 	/* In some configs, there won't be a pool of SFs IRQs. Hence, returning
 	 * the PF IRQs pool in case the SF pool doesn't exist.
 	 */
-	return pool ? pool : irq_table->pf_pool;
+	return pool ? pool : irq_table->pcif_pool;
 }

 static struct mlx5_irq_pool *ctrl_irq_pool_get(struct mlx5_core_dev *dev)
@@ -351,7 +396,7 @@ static struct mlx5_irq_pool *ctrl_irq_pool_get(struct mlx5_core_dev *dev)
 	/* In some configs, there won't be a pool of SFs IRQs. Hence, returning
 	 * the PF IRQs pool in case the SF pool doesn't exist.
 	 */
-	return pool ? pool : irq_table->pf_pool;
+	return pool ? pool : irq_table->pcif_pool;
 }

 /**
@@ -364,7 +409,7 @@ static void mlx5_irqs_release(struct mlx5_irq **irqs, int nirqs)
 	int i;

 	for (i = 0; i < nirqs; i++) {
-		synchronize_irq(irqs[i]->irqn);
+		synchronize_irq(irqs[i]->map.virq);
 		mlx5_irq_put(irqs[i]);
 	}
 }
@@ -387,26 +432,26 @@ void mlx5_ctrl_irq_release(struct mlx5_irq *ctrl_irq)
 struct mlx5_irq *mlx5_ctrl_irq_request(struct mlx5_core_dev *dev)
 {
 	struct mlx5_irq_pool *pool = ctrl_irq_pool_get(dev);
-	cpumask_var_t req_mask;
+	struct irq_affinity_desc af_desc;
 	struct mlx5_irq *irq;

-	if (!zalloc_cpumask_var(&req_mask, GFP_KERNEL))
-		return ERR_PTR(-ENOMEM);
-	cpumask_copy(req_mask, cpu_online_mask);
+	cpumask_copy(&af_desc.mask, cpu_online_mask);
+	af_desc.is_managed = false;
 	if (!mlx5_irq_pool_is_sf_pool(pool)) {
-		/* In case we are allocating a control IRQ for PF/VF */
+		/* In case we are allocating a control IRQ from a pci device's pool.
+		 * This can happen also for a SF if the SFs pool is empty.
+		 */
 		if (!pool->xa_num_irqs.max) {
-			cpumask_clear(req_mask);
+			cpumask_clear(&af_desc.mask);
 			/* In case we only have a single IRQ for PF/VF */
-			cpumask_set_cpu(cpumask_first(cpu_online_mask), req_mask);
+			cpumask_set_cpu(cpumask_first(cpu_online_mask), &af_desc.mask);
 		}
-		/* Allocate the IRQ in the last index of the pool */
-		irq = irq_pool_request_vector(pool, pool->xa_num_irqs.max, req_mask);
+		/* Allocate the IRQ in index 0. The vector was already allocated */
+		irq = irq_pool_request_vector(pool, 0, &af_desc, NULL);
 	} else {
-		irq = mlx5_irq_affinity_request(pool, req_mask);
+		irq = mlx5_irq_affinity_request(pool, &af_desc);
 	}

-	free_cpumask_var(req_mask);
 	return irq;
 }

@@ -415,27 +460,81 @@ struct mlx5_irq *mlx5_ctrl_irq_request(struct mlx5_core_dev *dev)
 * @dev: mlx5 device that requesting the IRQ.
 * @vecidx: vector index of the IRQ. This argument is ignore if affinity is
 * provided.
- * @affinity: cpumask requested for this IRQ.
+ * @af_desc: affinity descriptor for this IRQ.
+ * @rmap: pointer to reverse map pointer for completion interrupts
 *
 * This function returns a pointer to IRQ, or ERR_PTR in case of error.
 */
 struct mlx5_irq *mlx5_irq_request(struct mlx5_core_dev *dev, u16 vecidx,
-				  struct cpumask *affinity)
+				  struct irq_affinity_desc *af_desc,
+				  struct cpu_rmap **rmap)
 {
 	struct mlx5_irq_table *irq_table = mlx5_irq_table_get(dev);
 	struct mlx5_irq_pool *pool;
 	struct mlx5_irq *irq;

-	pool = irq_table->pf_pool;
-	irq = irq_pool_request_vector(pool, vecidx, affinity);
+	pool = irq_table->pcif_pool;
+	irq = irq_pool_request_vector(pool, vecidx, af_desc, rmap);
 	if (IS_ERR(irq))
 		return irq;
 	mlx5_core_dbg(dev, "irq %u mapped to cpu %*pbl, %u EQs on this irq\n",
-		      irq->irqn, cpumask_pr_args(affinity),
+		      irq->map.virq, cpumask_pr_args(&af_desc->mask),
 		      irq->refcount / MLX5_EQ_REFS_PER_IRQ);
 	return irq;
 }

+/**
+ * mlx5_msix_alloc - allocate msix interrupt
+ * @dev: mlx5 device from which to request
+ * @handler: interrupt handler
+ * @affdesc: affinity descriptor
+ * @name: interrupt name
+ *
+ * Returns: struct msi_map with result encoded.
+ * Note: the caller must make sure to release the irq by calling
+ *       mlx5_msix_free() if shutdown was initiated.
+ */
+struct msi_map mlx5_msix_alloc(struct mlx5_core_dev *dev,
+			       irqreturn_t (*handler)(int, void *),
+			       const struct irq_affinity_desc *affdesc,
+			       const char *name)
+{
+	struct msi_map map;
+	int err;
+
+	if (!dev->pdev) {
+		map.virq = 0;
+		map.index = -EINVAL;
+		return map;
+	}
+
+	map = pci_msix_alloc_irq_at(dev->pdev, MSI_ANY_INDEX, affdesc);
+	if (!map.virq)
+		return map;
+
+	err = request_irq(map.virq, handler, 0, name, NULL);
+	if (err) {
+		mlx5_core_warn(dev, "err %d\n", err);
+		pci_msix_free_irq(dev->pdev, map);
+		map.virq = 0;
+		map.index = -ENOMEM;
+	}
+	return map;
+}
+EXPORT_SYMBOL(mlx5_msix_alloc);
+
+/**
+ * mlx5_msix_free - free a previously allocated msix interrupt
+ * @dev: mlx5 device associated with interrupt
+ * @map: map previously returned by mlx5_msix_alloc()
+ */
+void mlx5_msix_free(struct mlx5_core_dev *dev, struct msi_map map)
+{
+	free_irq(map.virq, NULL);
+	pci_msix_free_irq(dev->pdev, map);
+}
+EXPORT_SYMBOL(mlx5_msix_free);
+
 /**
 * mlx5_irqs_release_vectors - release one or more IRQs back to the system.
 * @irqs: IRQs to be released.
@@ -452,6 +551,7 @@ void mlx5_irqs_release_vectors(struct mlx5_irq **irqs, int nirqs)
 * @cpus: CPUs array for binding the IRQs
 * @nirqs: number of IRQs to request.
 * @irqs: an output array of IRQs pointers.
+ * @rmap: pointer to reverse map pointer for completion interrupts
 *
 * Each IRQ is bound to at most 1 CPU.
 * This function is requests nirqs IRQs, starting from @vecidx.
@@ -460,24 +560,22 @@ void mlx5_irqs_release_vectors(struct mlx5_irq **irqs, int nirqs)
 * @nirqs), if successful, or a negative error code in case of an error.
 */
 int mlx5_irqs_request_vectors(struct mlx5_core_dev *dev, u16 *cpus, int nirqs,
-			      struct mlx5_irq **irqs)
+			      struct mlx5_irq **irqs, struct cpu_rmap **rmap)
 {
-	cpumask_var_t req_mask;
+	struct irq_affinity_desc af_desc;
 	struct mlx5_irq *irq;
 	int i;

-	if (!zalloc_cpumask_var(&req_mask, GFP_KERNEL))
-		return -ENOMEM;
+	af_desc.is_managed = 1;
 	for (i = 0; i < nirqs; i++) {
-		cpumask_set_cpu(cpus[i], req_mask);
-		irq = mlx5_irq_request(dev, i, req_mask);
+		cpumask_set_cpu(cpus[i], &af_desc.mask);
+		irq = mlx5_irq_request(dev, i + 1, &af_desc, rmap);
 		if (IS_ERR(irq))
 			break;
-		cpumask_clear(req_mask);
+		cpumask_clear(&af_desc.mask);
 		irqs[i] = irq;
 	}

-	free_cpumask_var(req_mask);
 	return i ? i : PTR_ERR(irq);
 }

@@ -521,7 +619,7 @@ static void irq_pool_free(struct mlx5_irq_pool *pool)
 	kvfree(pool);
 }

-static int irq_pools_init(struct mlx5_core_dev *dev, int sf_vec, int pf_vec)
+static int irq_pools_init(struct mlx5_core_dev *dev, int sf_vec, int pcif_vec)
 {
 	struct mlx5_irq_table *table = dev->priv.irq_table;
 	int num_sf_ctrl_by_msix;
@@ -529,12 +627,12 @@ static int irq_pools_init(struct mlx5_core_dev *dev, int sf_vec, int pf_vec)
 	int num_sf_ctrl;
 	int err;

-	/* init pf_pool */
-	table->pf_pool = irq_pool_alloc(dev, 0, pf_vec, NULL,
-					MLX5_EQ_SHARE_IRQ_MIN_COMP,
-					MLX5_EQ_SHARE_IRQ_MAX_COMP);
-	if (IS_ERR(table->pf_pool))
-		return PTR_ERR(table->pf_pool);
+	/* init pcif_pool */
+	table->pcif_pool = irq_pool_alloc(dev, 0, pcif_vec, NULL,
+					  MLX5_EQ_SHARE_IRQ_MIN_COMP,
+					  MLX5_EQ_SHARE_IRQ_MAX_COMP);
+	if (IS_ERR(table->pcif_pool))
+		return PTR_ERR(table->pcif_pool);
 	if (!mlx5_sf_max_functions(dev))
 		return 0;
 	if (sf_vec < MLX5_IRQ_VEC_COMP_BASE_SF) {
@@ -548,7 +646,7 @@ static int irq_pools_init(struct mlx5_core_dev *dev, int sf_vec, int pf_vec)
 					  MLX5_SFS_PER_CTRL_IRQ);
 	num_sf_ctrl = min_t(int, num_sf_ctrl_by_msix, num_sf_ctrl_by_sfs);
 	num_sf_ctrl = min_t(int, MLX5_IRQ_CTRL_SF_MAX, num_sf_ctrl);
-	table->sf_ctrl_pool = irq_pool_alloc(dev, pf_vec, num_sf_ctrl,
+	table->sf_ctrl_pool = irq_pool_alloc(dev, pcif_vec, num_sf_ctrl,
 					     "mlx5_sf_ctrl",
 					     MLX5_EQ_SHARE_IRQ_MIN_CTRL,
 					     MLX5_EQ_SHARE_IRQ_MAX_CTRL);
@@ -557,7 +655,7 @@ static int irq_pools_init(struct mlx5_core_dev *dev, int sf_vec, int pf_vec)
 		goto err_pf;
 	}
 	/* init sf_comp_pool */
-	table->sf_comp_pool = irq_pool_alloc(dev, pf_vec + num_sf_ctrl,
+	table->sf_comp_pool = irq_pool_alloc(dev, pcif_vec + num_sf_ctrl,
 					     sf_vec - num_sf_ctrl, "mlx5_sf_comp",
 					     MLX5_EQ_SHARE_IRQ_MIN_COMP,
 					     MLX5_EQ_SHARE_IRQ_MAX_COMP);
@@ -579,7 +677,7 @@ static int irq_pools_init(struct mlx5_core_dev *dev, int sf_vec, int pf_vec)
 err_sf_ctrl:
 	irq_pool_free(table->sf_ctrl_pool);
 err_pf:
-	irq_pool_free(table->pf_pool);
+	irq_pool_free(table->pcif_pool);
 	return err;
 }

@@ -589,7 +687,7 @@ static void irq_pools_destroy(struct mlx5_irq_table *table)
 		irq_pool_free(table->sf_comp_pool);
 		irq_pool_free(table->sf_ctrl_pool);
 	}
-	irq_pool_free(table->pf_pool);
+	irq_pool_free(table->pcif_pool);
 }

 /* irq_table API */
@@ -620,9 +718,9 @@ void mlx5_irq_table_cleanup(struct mlx5_core_dev *dev)

 int mlx5_irq_table_get_num_comp(struct mlx5_irq_table *table)
 {
-	if (!table->pf_pool->xa_num_irqs.max)
+	if (!table->pcif_pool->xa_num_irqs.max)
 		return 1;
-	return table->pf_pool->xa_num_irqs.max - table->pf_pool->xa_num_irqs.min;
+	return table->pcif_pool->xa_num_irqs.max - table->pcif_pool->xa_num_irqs.min;
 }

 int mlx5_irq_table_create(struct mlx5_core_dev *dev)
@@ -631,26 +729,30 @@ int mlx5_irq_table_create(struct mlx5_core_dev *dev)
 		      MLX5_CAP_GEN(dev, max_num_eqs) :
 		      1 << MLX5_CAP_GEN(dev, log_max_eq);
 	int total_vec;
-	int pf_vec;
+	int pcif_vec;
+	int req_vec;
 	int err;
+	int n;

 	if (mlx5_core_is_sf(dev))
 		return 0;

-	pf_vec = MLX5_CAP_GEN(dev, num_ports) * num_online_cpus() + 1;
-	pf_vec = min_t(int, pf_vec, num_eqs);
+	pcif_vec = MLX5_CAP_GEN(dev, num_ports) * num_online_cpus() + 1;
+	pcif_vec = min_t(int, pcif_vec, num_eqs);

-	total_vec = pf_vec;
+	total_vec = pcif_vec;
 	if (mlx5_sf_max_functions(dev))
 		total_vec += MLX5_IRQ_CTRL_SF_MAX +
 			MLX5_COMP_EQS_PER_SF * mlx5_sf_max_functions(dev);
+	total_vec = min_t(int, total_vec, pci_msix_vec_count(dev->pdev));
+	pcif_vec = min_t(int, pcif_vec, pci_msix_vec_count(dev->pdev));

-	total_vec = pci_alloc_irq_vectors(dev->pdev, 1, total_vec, PCI_IRQ_MSIX);
-	if (total_vec < 0)
-		return total_vec;
-	pf_vec = min(pf_vec, total_vec);
+	req_vec = pci_msix_can_alloc_dyn(dev->pdev) ? 1 : total_vec;
+	n = pci_alloc_irq_vectors(dev->pdev, 1, req_vec, PCI_IRQ_MSIX);
+	if (n < 0)
+		return n;

-	err = irq_pools_init(dev, total_vec - pf_vec, pf_vec);
+	err = irq_pools_init(dev, total_vec - pcif_vec, pcif_vec);
 	if (err)
 		pci_free_irq_vectors(dev->pdev);


--- a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.h
@@ -12,6 +12,7 @@
 #define MLX5_EQ_REFS_PER_IRQ (2)

 struct mlx5_irq;
+struct cpu_rmap;

 struct mlx5_irq_pool {
 	char name[MLX5_MAX_IRQ_NAME - MLX5_MAX_IRQ_IDX_CHARS];
@@ -31,7 +32,8 @@ static inline bool mlx5_irq_pool_is_sf_pool(struct mlx5_irq_pool *pool)
 }

 struct mlx5_irq *mlx5_irq_alloc(struct mlx5_irq_pool *pool, int i,
-				const struct cpumask *affinity);
+				struct irq_affinity_desc *af_desc,
+				struct cpu_rmap **rmap);
 int mlx5_irq_get_locked(struct mlx5_irq *irq);
 int mlx5_irq_read_locked(struct mlx5_irq *irq);
 int mlx5_irq_put(struct mlx5_irq *irq);

--- a/include/linux/cpu_rmap.h
+++ b/include/linux/cpu_rmap.h
@@ -16,14 +16,13 @@
 * struct cpu_rmap - CPU affinity reverse-map
 * @refcount: kref for object
 * @size: Number of objects to be reverse-mapped
- * @used: Number of objects added
 * @obj: Pointer to array of object pointers
 * @near: For each CPU, the index and distance to the nearest object,
 *      based on affinity masks
 */
 struct cpu_rmap {
 	struct kref	refcount;
-	u16		size, used;
+	u16		size;
 	void		**obj;
 	struct {
 		u16	index;
@@ -61,6 +60,7 @@ static inline struct cpu_rmap *alloc_irq_cpu_rmap(unsigned int size)
 }
 extern void free_irq_cpu_rmap(struct cpu_rmap *rmap);

+int irq_cpu_rmap_remove(struct cpu_rmap *rmap, int irq);
 extern int irq_cpu_rmap_add(struct cpu_rmap *rmap, int irq);

 #endif /* __LINUX_CPU_RMAP_H */
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -1311,4 +1311,10 @@ enum {
 	MLX5_OCTWORD = 16,
 };

+struct msi_map mlx5_msix_alloc(struct mlx5_core_dev *dev,
+			       irqreturn_t (*handler)(int, void *),
+			       const struct irq_affinity_desc *affdesc,
+			       const char *name);
+void mlx5_msix_free(struct mlx5_core_dev *dev, struct msi_map map);
+
 #endif /* MLX5_DRIVER_H */
--- a/lib/cpu_rmap.c
+++ b/lib/cpu_rmap.c
@@ -128,19 +128,31 @@ debug_print_rmap(const struct cpu_rmap *rmap, const char *prefix)
 }
 #endif

+static int get_free_index(struct cpu_rmap *rmap)
+{
+	int i;
+
+	for (i = 0; i < rmap->size; i++)
+		if (!rmap->obj[i])
+			return i;
+
+	return -ENOSPC;
+}
+
 /**
 * cpu_rmap_add - add object to a rmap
 * @rmap: CPU rmap allocated with alloc_cpu_rmap()
 * @obj: Object to add to rmap
 *
- * Return index of object.
+ * Return index of object or -ENOSPC if no free entry was found
 */
 int cpu_rmap_add(struct cpu_rmap *rmap, void *obj)
 {
-	u16 index;
+	int index = get_free_index(rmap);
+
+	if (index < 0)
+		return index;

-	BUG_ON(rmap->used >= rmap->size);
-	index = rmap->used++;
 	rmap->obj[index] = obj;
 	return index;
 }
@@ -230,9 +242,10 @@ void free_irq_cpu_rmap(struct cpu_rmap *rmap)
 	if (!rmap)
 		return;

-	for (index = 0; index < rmap->used; index++) {
+	for (index = 0; index < rmap->size; index++) {
 		glue = rmap->obj[index];
-		irq_set_affinity_notifier(glue->notify.irq, NULL);
+		if (glue)
+			irq_set_affinity_notifier(glue->notify.irq, NULL);
 	}

 	cpu_rmap_put(rmap);
@@ -268,9 +281,21 @@ static void irq_cpu_rmap_release(struct kref *ref)
 		container_of(ref, struct irq_glue, notify.kref);

 	cpu_rmap_put(glue->rmap);
+	glue->rmap->obj[glue->index] = NULL;
 	kfree(glue);
 }

+/**
+ * irq_cpu_rmap_remove - remove an IRQ from a CPU affinity reverse-map
+ * @rmap: The reverse-map
+ * @irq: The IRQ number
+ */
+int irq_cpu_rmap_remove(struct cpu_rmap *rmap, int irq)
+{
+	return irq_set_affinity_notifier(irq, NULL);
+}
+EXPORT_SYMBOL(irq_cpu_rmap_remove);
+
 /**
 * irq_cpu_rmap_add - add an IRQ to a CPU affinity reverse-map
 * @rmap: The reverse-map
@@ -293,12 +318,22 @@ int irq_cpu_rmap_add(struct cpu_rmap *rmap, int irq)
 	glue->notify.release = irq_cpu_rmap_release;
 	glue->rmap = rmap;
 	cpu_rmap_get(rmap);
-	glue->index = cpu_rmap_add(rmap, glue);
+	rc = cpu_rmap_add(rmap, glue);
+	if (rc < 0)
+		goto err_add;
+
+	glue->index = rc;
 	rc = irq_set_affinity_notifier(irq, &glue->notify);
-	if (rc) {
-		cpu_rmap_put(glue->rmap);
-		kfree(glue);
-	}
+	if (rc)
+		goto err_set;
+
+	return rc;
+
+err_set:
+	rmap->obj[glue->index] = NULL;
+err_add:
+	cpu_rmap_put(glue->rmap);
+	kfree(glue);
 	return rc;
 }
 EXPORT_SYMBOL(irq_cpu_rmap_add);