Commit cdf1f1f1 authored by Jacob Keller's avatar Jacob Keller Committed by Tony Nguyen

ice: replace custom AIM algorithm with kernel's DIM library

The ice driver has support for adaptive interrupt moderation, an
algorithm for tuning the interrupt rate dynamically. This algorithm
is based on various assumptions about ring size, socket buffer size,
link speed, SKB overhead, ethernet frame overhead and more.

The Linux kernel has support for a dynamic interrupt moderation
algorithm known as "dimlib". Replace the custom driver-specific
implementation of dynamic interrupt moderation with the kernel's
algorithm.

The Intel hardware has a different hardware implementation than the
originators of the dimlib code had to work with, which requires the
driver to use a slightly different set of inputs for the actual
moderation values, while getting all the advice from dimlib of
better/worse, shift left or right.

The change made for this implementation is to use a pair of values
for each of the 5 "slots" that the dimlib moderation expects, and
the driver will program those pairs when dimlib recommends a slot to
use. The currently implementation uses two tables, one for receive
and one for transmit, and the pairs of values in each slot set the
maximum delay of an interrupt and a maximum number of interrupts per
second (both expressed in microseconds).

There are two separate kinds of bugs fixed by using DIMLIB, one is
UDP single stream send was too slow, and the other is that 8K
ping-pong was going to the most aggressive moderation and has much
too high latency.

The overall result of using DIMLIB is that we meet or exceed our
performance expectations set based on the old algorithm.
Co-developed-by: default avatarJesse Brandeburg <jesse.brandeburg@intel.com>
Signed-off-by: default avatarJesse Brandeburg <jesse.brandeburg@intel.com>
Signed-off-by: default avatarJacob Keller <jacob.e.keller@intel.com>
Tested-by: default avatarTony Brelinski <tonyx.brelinski@intel.com>
Signed-off-by: default avatarTony Nguyen <anthony.l.nguyen@intel.com>
parent b8b47723
......@@ -294,6 +294,7 @@ config ICE
tristate "Intel(R) Ethernet Connection E800 Series Support"
default n
depends on PCI_MSI
select DIMLIB
select NET_DEVLINK
select PLDMFW
help
......
......@@ -36,6 +36,7 @@
#include <linux/bpf.h>
#include <linux/avf/virtchnl.h>
#include <linux/cpu_rmap.h>
#include <linux/dim.h>
#include <net/devlink.h>
#include <net/ipv6.h>
#include <net/xdp_sock.h>
......@@ -351,7 +352,7 @@ struct ice_q_vector {
u16 reg_idx;
u8 num_ring_rx; /* total number of Rx rings in vector */
u8 num_ring_tx; /* total number of Tx rings in vector */
u8 itr_countdown; /* when 0 should adjust adaptive ITR */
u8 wb_on_itr:1; /* if true, WB on ITR is enabled */
/* in usecs, need to use ice_intrl_to_usecs_reg() before writing this
* value to the device
*/
......@@ -366,6 +367,8 @@ struct ice_q_vector {
struct irq_affinity_notify affinity_notify;
char name[ICE_INT_NAME_STR_LEN];
u16 total_events; /* net_dim(): number of interrupts processed */
} ____cacheline_internodealigned_in_smp;
enum ice_pf_flags {
......
......@@ -3634,6 +3634,12 @@ ice_set_rc_coalesce(enum ice_container_type c_type, struct ethtool_coalesce *ec,
ICE_MAX_INTRL);
return -EINVAL;
}
if (ec->rx_coalesce_usecs_high != rc->ring->q_vector->intrl &&
(ec->use_adaptive_rx_coalesce || ec->use_adaptive_tx_coalesce)) {
netdev_info(vsi->netdev, "Invalid value, %s-usecs-high cannot be changed if adaptive-tx or adaptive-rx is enabled\n",
c_type_str);
return -EINVAL;
}
if (ec->rx_coalesce_usecs_high != rc->ring->q_vector->intrl) {
rc->ring->q_vector->intrl = ec->rx_coalesce_usecs_high;
ice_write_intrl(rc->ring->q_vector,
......
......@@ -385,6 +385,8 @@ static irqreturn_t ice_msix_clean_rings(int __always_unused irq, void *data)
if (!q_vector->tx.ring && !q_vector->rx.ring)
return IRQ_HANDLED;
q_vector->total_events++;
napi_schedule(&q_vector->napi);
return IRQ_HANDLED;
......@@ -3270,20 +3272,15 @@ int ice_vsi_cfg_tc(struct ice_vsi *vsi, u8 ena_tc)
/**
* ice_update_ring_stats - Update ring statistics
* @ring: ring to update
* @cont: used to increment per-vector counters
* @pkts: number of processed packets
* @bytes: number of processed bytes
*
* This function assumes that caller has acquired a u64_stats_sync lock.
*/
static void
ice_update_ring_stats(struct ice_ring *ring, struct ice_ring_container *cont,
u64 pkts, u64 bytes)
static void ice_update_ring_stats(struct ice_ring *ring, u64 pkts, u64 bytes)
{
ring->stats.bytes += bytes;
ring->stats.pkts += pkts;
cont->total_bytes += bytes;
cont->total_pkts += pkts;
}
/**
......@@ -3295,7 +3292,7 @@ ice_update_ring_stats(struct ice_ring *ring, struct ice_ring_container *cont,
void ice_update_tx_ring_stats(struct ice_ring *tx_ring, u64 pkts, u64 bytes)
{
u64_stats_update_begin(&tx_ring->syncp);
ice_update_ring_stats(tx_ring, &tx_ring->q_vector->tx, pkts, bytes);
ice_update_ring_stats(tx_ring, pkts, bytes);
u64_stats_update_end(&tx_ring->syncp);
}
......@@ -3308,7 +3305,7 @@ void ice_update_tx_ring_stats(struct ice_ring *tx_ring, u64 pkts, u64 bytes)
void ice_update_rx_ring_stats(struct ice_ring *rx_ring, u64 pkts, u64 bytes)
{
u64_stats_update_begin(&rx_ring->syncp);
ice_update_ring_stats(rx_ring, &rx_ring->q_vector->rx, pkts, bytes);
ice_update_ring_stats(rx_ring, pkts, bytes);
u64_stats_update_end(&rx_ring->syncp);
}
......
......@@ -5196,6 +5196,105 @@ int ice_vsi_cfg(struct ice_vsi *vsi)
return err;
}
/* THEORY OF MODERATION:
* The below code creates custom DIM profiles for use by this driver, because
* the ice driver hardware works differently than the hardware that DIMLIB was
* originally made for. ice hardware doesn't have packet count limits that
* can trigger an interrupt, but it *does* have interrupt rate limit support,
* and this code adds that capability to be used by the driver when it's using
* DIMLIB. The DIMLIB code was always designed to be a suggestion to the driver
* for how to "respond" to traffic and interrupts, so this driver uses a
* slightly different set of moderation parameters to get best performance.
*/
struct ice_dim {
/* the throttle rate for interrupts, basically worst case delay before
* an initial interrupt fires, value is stored in microseconds.
*/
u16 itr;
/* the rate limit for interrupts, which can cap a delay from a small
* ITR at a certain amount of interrupts per second. f.e. a 2us ITR
* could yield as much as 500,000 interrupts per second, but with a
* 10us rate limit, it limits to 100,000 interrupts per second. Value
* is stored in microseconds.
*/
u16 intrl;
};
/* Make a different profile for Rx that doesn't allow quite so aggressive
* moderation at the high end (it maxes out at 128us or about 8k interrupts a
* second. The INTRL/rate parameters here are only useful to cap small ITR
* values, which is why for larger ITR's - like 128, which can only generate
* 8k interrupts per second, there is no point to rate limit and the values
* are set to zero. The rate limit values do affect latency, and so must
* be reasonably small so to not impact latency sensitive tests.
*/
static const struct ice_dim rx_profile[] = {
{2, 10},
{8, 16},
{32, 0},
{96, 0},
{128, 0}
};
/* The transmit profile, which has the same sorts of values
* as the previous struct
*/
static const struct ice_dim tx_profile[] = {
{2, 10},
{8, 16},
{64, 0},
{128, 0},
{256, 0}
};
static void ice_tx_dim_work(struct work_struct *work)
{
struct ice_ring_container *rc;
struct ice_q_vector *q_vector;
struct dim *dim;
u16 itr, intrl;
dim = container_of(work, struct dim, work);
rc = container_of(dim, struct ice_ring_container, dim);
q_vector = container_of(rc, struct ice_q_vector, tx);
if (dim->profile_ix >= ARRAY_SIZE(tx_profile))
dim->profile_ix = ARRAY_SIZE(tx_profile) - 1;
/* look up the values in our local table */
itr = tx_profile[dim->profile_ix].itr;
intrl = tx_profile[dim->profile_ix].intrl;
ice_write_itr(rc, itr);
ice_write_intrl(q_vector, intrl);
dim->state = DIM_START_MEASURE;
}
static void ice_rx_dim_work(struct work_struct *work)
{
struct ice_ring_container *rc;
struct ice_q_vector *q_vector;
struct dim *dim;
u16 itr, intrl;
dim = container_of(work, struct dim, work);
rc = container_of(dim, struct ice_ring_container, dim);
q_vector = container_of(rc, struct ice_q_vector, rx);
if (dim->profile_ix >= ARRAY_SIZE(rx_profile))
dim->profile_ix = ARRAY_SIZE(rx_profile) - 1;
/* look up the values in our local table */
itr = rx_profile[dim->profile_ix].itr;
intrl = rx_profile[dim->profile_ix].intrl;
ice_write_itr(rc, itr);
ice_write_intrl(q_vector, intrl);
dim->state = DIM_START_MEASURE;
}
/**
* ice_napi_enable_all - Enable NAPI for all q_vectors in the VSI
* @vsi: the VSI being configured
......@@ -5210,6 +5309,12 @@ static void ice_napi_enable_all(struct ice_vsi *vsi)
ice_for_each_q_vector(vsi, q_idx) {
struct ice_q_vector *q_vector = vsi->q_vectors[q_idx];
INIT_WORK(&q_vector->tx.dim.work, ice_tx_dim_work);
q_vector->tx.dim.mode = DIM_CQ_PERIOD_MODE_START_FROM_EQE;
INIT_WORK(&q_vector->rx.dim.work, ice_rx_dim_work);
q_vector->rx.dim.mode = DIM_CQ_PERIOD_MODE_START_FROM_EQE;
if (q_vector->rx.ring || q_vector->tx.ring)
napi_enable(&q_vector->napi);
}
......@@ -5618,6 +5723,9 @@ static void ice_napi_disable_all(struct ice_vsi *vsi)
if (q_vector->rx.ring || q_vector->tx.ring)
napi_disable(&q_vector->napi);
cancel_work_sync(&q_vector->tx.dim.work);
cancel_work_sync(&q_vector->rx.dim.work);
}
}
......
This diff is collapsed.
......@@ -339,12 +339,8 @@ static inline bool ice_ring_is_xdp(struct ice_ring *ring)
struct ice_ring_container {
/* head of linked-list of rings */
struct ice_ring *ring;
unsigned long next_update; /* jiffies value of next queue update */
unsigned int total_bytes; /* total bytes processed this int */
unsigned int total_pkts; /* total packets processed this int */
struct dim dim; /* data for net_dim algorithm */
u16 itr_idx; /* index in the interrupt vector */
u16 target_itr; /* value in usecs divided by the hw->itr_gran */
u16 current_itr; /* value in usecs divided by the hw->itr_gran */
/* high bit set means dynamic ITR, rest is used to store user
* readable ITR value in usecs and must be converted before programming
* to a register.
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment