Commit 6025b913 authored by Jakub Kicinski's avatar Jakub Kicinski Committed by David S. Miller

net: dqs: add NIC stall detector based on BQL

softnet_data->time_squeeze is sometimes used as a proxy for
host overload or indication of scheduling problems. In practice
this statistic is very noisy and has hard to grasp units -
e.g. is 10 squeezes a second to be expected, or high?

Delaying network (NAPI) processing leads to drops on NIC queues
but also RTT bloat, impacting pacing and CA decisions.
Stalls are a little hard to detect on the Rx side, because
there may simply have not been any packets received in given
period of time. Packet timestamps help a little bit, but
again we don't know if packets are stale because we're
not keeping up or because someone (*cough* cgroups)
disabled IRQs for a long time.

We can, however, use Tx as a proxy for Rx stalls. Most drivers
use combined Rx+Tx NAPIs so if Tx gets starved so will Rx.
On the Tx side we know exactly when packets get queued,
and completed, so there is no uncertainty.

This patch adds stall checks to BQL. Why BQL? Because
it's a convenient place to add such checks, already
called by most drivers, and it has copious free space
in its structures (this patch adds no extra cache
references or dirtying to the fast path).

The algorithm takes one parameter - max delay AKA stall
threshold and increments a counter whenever NAPI got delayed
for at least that amount of time. It also records the length
of the longest stall.

To be precise every time NAPI has not polled for at least
stall thrs we check if there were any Tx packets queued
between last NAPI run and now - stall_thrs/2.

Unlike the classic Tx watchdog this mechanism does not
ignore stalls caused by Tx being disabled, or loss of link.
I don't think the check is worth the complexity, and
stall is a stall, whether due to host overload, flow
control, link down... doesn't matter much to the application.

We have been running this detector in production at Meta
for 2 years, with the threshold of 8ms. It's the lowest
value where false positives become rare. There's still
a constant stream of reported stalls (especially without
the ksoftirqd deferral patches reverted), those who like
their stall metrics to be 0 may prefer higher value.
Signed-off-by: default avatarJakub Kicinski <kuba@kernel.org>
Signed-off-by: default avatarBreno Leitao <leitao@debian.org>
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parent 9b78bbef
...@@ -96,3 +96,26 @@ Description: ...@@ -96,3 +96,26 @@ Description:
Indicates the absolute minimum limit of bytes allowed to be Indicates the absolute minimum limit of bytes allowed to be
queued on this network device transmit queue. Default value is queued on this network device transmit queue. Default value is
0. 0.
What: /sys/class/net/<iface>/queues/tx-<queue>/byte_queue_limits/stall_thrs
Date: Jan 2024
KernelVersion: 6.9
Contact: netdev@vger.kernel.org
Description:
Tx completion stall detection threshold in ms. Kernel will
guarantee to detect all stalls longer than this threshold but
may also detect stalls longer than half of the threshold.
What: /sys/class/net/<iface>/queues/tx-<queue>/byte_queue_limits/stall_cnt
Date: Jan 2024
KernelVersion: 6.9
Contact: netdev@vger.kernel.org
Description:
Number of detected Tx completion stalls.
What: /sys/class/net/<iface>/queues/tx-<queue>/byte_queue_limits/stall_max
Date: Jan 2024
KernelVersion: 6.9
Contact: netdev@vger.kernel.org
Description:
Longest detected Tx completion stall. Write 0 to clear.
...@@ -38,14 +38,22 @@ ...@@ -38,14 +38,22 @@
#ifdef __KERNEL__ #ifdef __KERNEL__
#include <linux/bitops.h>
#include <asm/bug.h> #include <asm/bug.h>
#define DQL_HIST_LEN 4
#define DQL_HIST_ENT(dql, idx) ((dql)->history[(idx) % DQL_HIST_LEN])
struct dql { struct dql {
/* Fields accessed in enqueue path (dql_queued) */ /* Fields accessed in enqueue path (dql_queued) */
unsigned int num_queued; /* Total ever queued */ unsigned int num_queued; /* Total ever queued */
unsigned int adj_limit; /* limit + num_completed */ unsigned int adj_limit; /* limit + num_completed */
unsigned int last_obj_cnt; /* Count at last queuing */ unsigned int last_obj_cnt; /* Count at last queuing */
unsigned long history_head; /* top 58 bits of jiffies */
/* stall entries, a bit per entry */
unsigned long history[DQL_HIST_LEN];
/* Fields accessed only by completion path (dql_completed) */ /* Fields accessed only by completion path (dql_completed) */
unsigned int limit ____cacheline_aligned_in_smp; /* Current limit */ unsigned int limit ____cacheline_aligned_in_smp; /* Current limit */
...@@ -62,6 +70,13 @@ struct dql { ...@@ -62,6 +70,13 @@ struct dql {
unsigned int max_limit; /* Max limit */ unsigned int max_limit; /* Max limit */
unsigned int min_limit; /* Minimum limit */ unsigned int min_limit; /* Minimum limit */
unsigned int slack_hold_time; /* Time to measure slack */ unsigned int slack_hold_time; /* Time to measure slack */
/* Stall threshold (in jiffies), defined by user */
unsigned short stall_thrs;
/* Longest stall detected, reported to user */
unsigned short stall_max;
unsigned long last_reap; /* Last reap (in jiffies) */
unsigned long stall_cnt; /* Number of stalls */
}; };
/* Set some static maximums */ /* Set some static maximums */
...@@ -74,6 +89,8 @@ struct dql { ...@@ -74,6 +89,8 @@ struct dql {
*/ */
static inline void dql_queued(struct dql *dql, unsigned int count) static inline void dql_queued(struct dql *dql, unsigned int count)
{ {
unsigned long map, now, now_hi, i;
BUG_ON(count > DQL_MAX_OBJECT); BUG_ON(count > DQL_MAX_OBJECT);
dql->last_obj_cnt = count; dql->last_obj_cnt = count;
...@@ -86,6 +103,34 @@ static inline void dql_queued(struct dql *dql, unsigned int count) ...@@ -86,6 +103,34 @@ static inline void dql_queued(struct dql *dql, unsigned int count)
barrier(); barrier();
dql->num_queued += count; dql->num_queued += count;
now = jiffies;
now_hi = now / BITS_PER_LONG;
/* The following code set a bit in the ring buffer, where each
* bit trackes time the packet was queued. The dql->history buffer
* tracks DQL_HIST_LEN * BITS_PER_LONG time (jiffies) slot
*/
if (unlikely(now_hi != dql->history_head)) {
/* About to reuse slots, clear them */
for (i = 0; i < DQL_HIST_LEN; i++) {
/* Multiplication masks high bits */
if (now_hi * BITS_PER_LONG ==
(dql->history_head + i) * BITS_PER_LONG)
break;
DQL_HIST_ENT(dql, dql->history_head + i + 1) = 0;
}
/* pairs with smp_rmb() in dql_check_stall() */
smp_wmb();
WRITE_ONCE(dql->history_head, now_hi);
}
/* __set_bit() does not guarantee WRITE_ONCE() semantics */
map = DQL_HIST_ENT(dql, now_hi);
/* Populate the history with an entry (bit) per queued */
if (!(map & BIT_MASK(now)))
WRITE_ONCE(DQL_HIST_ENT(dql, now_hi), map | BIT_MASK(now));
} }
/* Returns how many objects can be queued, < 0 indicates over limit. */ /* Returns how many objects can be queued, < 0 indicates over limit. */
......
...@@ -36,6 +36,39 @@ TRACE_EVENT(napi_poll, ...@@ -36,6 +36,39 @@ TRACE_EVENT(napi_poll,
__entry->work, __entry->budget) __entry->work, __entry->budget)
); );
TRACE_EVENT(dql_stall_detected,
TP_PROTO(unsigned short thrs, unsigned int len,
unsigned long last_reap, unsigned long hist_head,
unsigned long now, unsigned long *hist),
TP_ARGS(thrs, len, last_reap, hist_head, now, hist),
TP_STRUCT__entry(
__field( unsigned short, thrs)
__field( unsigned int, len)
__field( unsigned long, last_reap)
__field( unsigned long, hist_head)
__field( unsigned long, now)
__array( unsigned long, hist, 4)
),
TP_fast_assign(
__entry->thrs = thrs;
__entry->len = len;
__entry->last_reap = last_reap;
__entry->hist_head = hist_head * BITS_PER_LONG;
__entry->now = now;
memcpy(__entry->hist, hist, sizeof(entry->hist));
),
TP_printk("thrs %u len %u last_reap %lu hist_head %lu now %lu hist %016lx %016lx %016lx %016lx",
__entry->thrs, __entry->len,
__entry->last_reap, __entry->hist_head, __entry->now,
__entry->hist[0], __entry->hist[1],
__entry->hist[2], __entry->hist[3])
);
#undef NO_DEV #undef NO_DEV
#endif /* _TRACE_NAPI_H */ #endif /* _TRACE_NAPI_H */
......
...@@ -10,10 +10,77 @@ ...@@ -10,10 +10,77 @@
#include <linux/dynamic_queue_limits.h> #include <linux/dynamic_queue_limits.h>
#include <linux/compiler.h> #include <linux/compiler.h>
#include <linux/export.h> #include <linux/export.h>
#include <trace/events/napi.h>
#define POSDIFF(A, B) ((int)((A) - (B)) > 0 ? (A) - (B) : 0) #define POSDIFF(A, B) ((int)((A) - (B)) > 0 ? (A) - (B) : 0)
#define AFTER_EQ(A, B) ((int)((A) - (B)) >= 0) #define AFTER_EQ(A, B) ((int)((A) - (B)) >= 0)
static void dql_check_stall(struct dql *dql)
{
unsigned short stall_thrs;
unsigned long now;
stall_thrs = READ_ONCE(dql->stall_thrs);
if (!stall_thrs)
return;
now = jiffies;
/* Check for a potential stall */
if (time_after_eq(now, dql->last_reap + stall_thrs)) {
unsigned long hist_head, t, start, end;
/* We are trying to detect a period of at least @stall_thrs
* jiffies without any Tx completions, but during first half
* of which some Tx was posted.
*/
dqs_again:
hist_head = READ_ONCE(dql->history_head);
/* pairs with smp_wmb() in dql_queued() */
smp_rmb();
/* Get the previous entry in the ring buffer, which is the
* oldest sample.
*/
start = (hist_head - DQL_HIST_LEN + 1) * BITS_PER_LONG;
/* Advance start to continue from the last reap time */
if (time_before(start, dql->last_reap + 1))
start = dql->last_reap + 1;
/* Newest sample we should have already seen a completion for */
end = hist_head * BITS_PER_LONG + (BITS_PER_LONG - 1);
/* Shrink the search space to [start, (now - start_thrs/2)] if
* `end` is beyond the stall zone
*/
if (time_before(now, end + stall_thrs / 2))
end = now - stall_thrs / 2;
/* Search for the queued time in [t, end] */
for (t = start; time_before_eq(t, end); t++)
if (test_bit(t % (DQL_HIST_LEN * BITS_PER_LONG),
dql->history))
break;
/* Variable t contains the time of the queue */
if (!time_before_eq(t, end))
goto no_stall;
/* The ring buffer was modified in the meantime, retry */
if (hist_head != READ_ONCE(dql->history_head))
goto dqs_again;
dql->stall_cnt++;
dql->stall_max = max_t(unsigned short, dql->stall_max, now - t);
trace_dql_stall_detected(dql->stall_thrs, now - t,
dql->last_reap, dql->history_head,
now, dql->history);
}
no_stall:
dql->last_reap = now;
}
/* Records completed count and recalculates the queue limit */ /* Records completed count and recalculates the queue limit */
void dql_completed(struct dql *dql, unsigned int count) void dql_completed(struct dql *dql, unsigned int count)
{ {
...@@ -110,6 +177,8 @@ void dql_completed(struct dql *dql, unsigned int count) ...@@ -110,6 +177,8 @@ void dql_completed(struct dql *dql, unsigned int count)
dql->prev_last_obj_cnt = dql->last_obj_cnt; dql->prev_last_obj_cnt = dql->last_obj_cnt;
dql->num_completed = completed; dql->num_completed = completed;
dql->prev_num_queued = num_queued; dql->prev_num_queued = num_queued;
dql_check_stall(dql);
} }
EXPORT_SYMBOL(dql_completed); EXPORT_SYMBOL(dql_completed);
...@@ -125,6 +194,10 @@ void dql_reset(struct dql *dql) ...@@ -125,6 +194,10 @@ void dql_reset(struct dql *dql)
dql->prev_ovlimit = 0; dql->prev_ovlimit = 0;
dql->lowest_slack = UINT_MAX; dql->lowest_slack = UINT_MAX;
dql->slack_start_time = jiffies; dql->slack_start_time = jiffies;
dql->last_reap = jiffies;
dql->history_head = jiffies / BITS_PER_LONG;
memset(dql->history, 0, sizeof(dql->history));
} }
EXPORT_SYMBOL(dql_reset); EXPORT_SYMBOL(dql_reset);
...@@ -133,6 +206,7 @@ void dql_init(struct dql *dql, unsigned int hold_time) ...@@ -133,6 +206,7 @@ void dql_init(struct dql *dql, unsigned int hold_time)
dql->max_limit = DQL_MAX_LIMIT; dql->max_limit = DQL_MAX_LIMIT;
dql->min_limit = 0; dql->min_limit = 0;
dql->slack_hold_time = hold_time; dql->slack_hold_time = hold_time;
dql->stall_thrs = 0;
dql_reset(dql); dql_reset(dql);
} }
EXPORT_SYMBOL(dql_init); EXPORT_SYMBOL(dql_init);
...@@ -1415,6 +1415,65 @@ static struct netdev_queue_attribute bql_hold_time_attribute __ro_after_init ...@@ -1415,6 +1415,65 @@ static struct netdev_queue_attribute bql_hold_time_attribute __ro_after_init
= __ATTR(hold_time, 0644, = __ATTR(hold_time, 0644,
bql_show_hold_time, bql_set_hold_time); bql_show_hold_time, bql_set_hold_time);
static ssize_t bql_show_stall_thrs(struct netdev_queue *queue, char *buf)
{
struct dql *dql = &queue->dql;
return sprintf(buf, "%u\n", jiffies_to_msecs(dql->stall_thrs));
}
static ssize_t bql_set_stall_thrs(struct netdev_queue *queue,
const char *buf, size_t len)
{
struct dql *dql = &queue->dql;
unsigned int value;
int err;
err = kstrtouint(buf, 10, &value);
if (err < 0)
return err;
value = msecs_to_jiffies(value);
if (value && (value < 4 || value > 4 / 2 * BITS_PER_LONG))
return -ERANGE;
if (!dql->stall_thrs && value)
dql->last_reap = jiffies;
/* Force last_reap to be live */
smp_wmb();
dql->stall_thrs = value;
return len;
}
static struct netdev_queue_attribute bql_stall_thrs_attribute __ro_after_init =
__ATTR(stall_thrs, 0644, bql_show_stall_thrs, bql_set_stall_thrs);
static ssize_t bql_show_stall_max(struct netdev_queue *queue, char *buf)
{
return sprintf(buf, "%u\n", READ_ONCE(queue->dql.stall_max));
}
static ssize_t bql_set_stall_max(struct netdev_queue *queue,
const char *buf, size_t len)
{
WRITE_ONCE(queue->dql.stall_max, 0);
return len;
}
static struct netdev_queue_attribute bql_stall_max_attribute __ro_after_init =
__ATTR(stall_max, 0644, bql_show_stall_max, bql_set_stall_max);
static ssize_t bql_show_stall_cnt(struct netdev_queue *queue, char *buf)
{
struct dql *dql = &queue->dql;
return sprintf(buf, "%lu\n", dql->stall_cnt);
}
static struct netdev_queue_attribute bql_stall_cnt_attribute __ro_after_init =
__ATTR(stall_cnt, 0444, bql_show_stall_cnt, NULL);
static ssize_t bql_show_inflight(struct netdev_queue *queue, static ssize_t bql_show_inflight(struct netdev_queue *queue,
char *buf) char *buf)
{ {
...@@ -1453,6 +1512,9 @@ static struct attribute *dql_attrs[] __ro_after_init = { ...@@ -1453,6 +1512,9 @@ static struct attribute *dql_attrs[] __ro_after_init = {
&bql_limit_min_attribute.attr, &bql_limit_min_attribute.attr,
&bql_hold_time_attribute.attr, &bql_hold_time_attribute.attr,
&bql_inflight_attribute.attr, &bql_inflight_attribute.attr,
&bql_stall_thrs_attribute.attr,
&bql_stall_cnt_attribute.attr,
&bql_stall_max_attribute.attr,
NULL NULL
}; };
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment