Commit b49a9485 authored by David S. Miller

Merge branch 'sch_fq-improvements'

Eric Dumazet says:

====================
net_sched: sch_fq: round of improvements

For FQ's tenth anniversary, it was time to make it faster.

The FQ part (as in Fair Queue) is rather expensive, because
we have to classify packets and store them in a per-flow structure,
and add this per-flow structure to a hash table. The round-robin (RR)
lists also add cache line misses.

Most fq qdiscs are almost idle. Trying to share NIC bandwidth has
no benefit there, so the qdisc can behave like a FIFO.

This series brings a 5% throughput increase in an intensive
tcp_rr workload, and a 13% increase for (unpaced) UDP packets.

v2: Removed an extra label (build bot).
    Fixed an accidental increase of the stat_internal_packets counter
    in the fast path.
    Added a "constify qdisc_priv()" patch to allow fq_fastpath_check()'s
    first parameter to be const.
    Fixed a typo on 'eligible' (Willem).
====================
Reviewed-by: Jamal Hadi Salim <jhs@mojatatu.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Reviewed-by: Toke Høiland-Jørgensen <toke@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parents 66ac08a7 8f6c4ff9
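The core idea described above, condensed into a hypothetical stand-alone sketch: a packet may bypass flow hashing only when fairness and pacing would be no-ops. Field and helper names mirror the patch below, but this is not the kernel code.

#include <stdbool.h>

/* Hypothetical, condensed illustration of the fast-path rule described
 * in the cover letter; not the kernel implementation.
 */
struct fq_snapshot {
	unsigned int qlen;		/* packets queued in the whole qdisc */
	unsigned int flows;
	unsigned int inactive_flows;	/* flows with nothing to send */
	unsigned int throttled_flows;	/* flows waiting for their pacing time */
	unsigned int internal_qlen;	/* packets already in the fast-path queue */
};

static bool can_use_fastpath(const struct fq_snapshot *q, bool pkt_in_future,
			     bool socket_paced, bool qdisc_has_maxrate)
{
	/* Pacing in any form disables the FIFO-like fast path. */
	if (pkt_in_future || socket_paced || qdisc_has_maxrate)
		return false;
	if (q->qlen == 0)
		return true;
	/* Packets are queued: the fast path is still fine if none of them
	 * is eligible to be sent right now and the fast-path queue stays small.
	 */
	return q->flows == q->inactive_flows + q->throttled_flows &&
	       q->internal_qlen < 8;
}

int main(void)
{
	struct fq_snapshot q = { .qlen = 3, .flows = 3,
				 .inactive_flows = 1, .throttled_flows = 2,
				 .internal_qlen = 0 };

	/* All queued flows are throttled or empty: fast path applies. */
	return can_use_fastpath(&q, false, false, false) ? 0 : 1;
}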
@@ -20,10 +20,10 @@ struct qdisc_walker {
int (*fn)(struct Qdisc *, unsigned long cl, struct qdisc_walker *);
};
static inline void *qdisc_priv(struct Qdisc *q)
{
return &q->privdata;
}
#define qdisc_priv(q) \
_Generic(q, \
const struct Qdisc * : (const void *)&q->privdata, \
struct Qdisc * : (void *)&q->privdata)
static inline struct Qdisc *qdisc_from_priv(void *priv)
{
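The _Generic-based qdisc_priv() above preserves const-ness: a const struct Qdisc * yields a const void *, which is what lets fq_fastpath_check() take a const qdisc. A minimal stand-alone sketch of the same pattern, using a toy struct rather than struct Qdisc:

#include <stdio.h>

/* Toy type standing in for struct Qdisc; privdata plays the same role
 * as Qdisc::privdata.
 */
struct toy_qdisc {
	int handle;
	int privdata[4];
};

/* C11 _Generic selects per argument type, so const-ness propagates to
 * the returned pointer instead of being silently dropped.
 */
#define toy_priv(q)						\
	_Generic((q),						\
		 const struct toy_qdisc * : (const void *)&(q)->privdata, \
		 struct toy_qdisc * : (void *)&(q)->privdata)

int main(void)
{
	struct toy_qdisc q = { .handle = 1 };
	const struct toy_qdisc *cq = &q;

	int *rw = toy_priv(&q);		/* writable private area */
	const int *ro = toy_priv(cq);	/* read-only private area */

	rw[0] = 42;
	printf("%d\n", ro[0]);		/* prints 42 */
	return 0;
}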
@@ -962,6 +962,7 @@ struct tc_fq_qd_stats {
__u64 ce_mark; /* packets above ce_threshold */
__u64 horizon_drops;
__u64 horizon_caps;
__u64 fastpath_packets;
};
/* Heavy-Hitter Filter */
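The new fastpath_packets counter above is appended at the tail of struct tc_fq_qd_stats, the usual way to keep the UAPI backward compatible. A hypothetical consumer-side sketch of that rule (the struct here is a shortened stand-in, not the full UAPI layout): copy only what the kernel provided and let a missing tail read as zero.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Stand-in for the UAPI struct; only the tail matters for this sketch. */
struct fq_stats_v2 {
	uint64_t gc_flows;
	uint64_t highprio_packets;
	uint64_t horizon_caps;
	uint64_t fastpath_packets;	/* new field, appended at the end */
};

/* Copy whatever the kernel provided; an older kernel sends a shorter
 * blob and the new counter simply reads as 0.
 */
static void read_stats(struct fq_stats_v2 *dst, const void *blob, size_t len)
{
	memset(dst, 0, sizeof(*dst));
	memcpy(dst, blob, len < sizeof(*dst) ? len : sizeof(*dst));
}

int main(void)
{
	uint64_t old_kernel_blob[3] = { 1, 2, 3 };	/* no fastpath_packets */
	struct fq_stats_v2 st;

	read_stats(&st, old_kernel_blob, sizeof(old_kernel_blob));
	return st.fastpath_packets == 0 ? 0 : 1;	/* missing tail reads as 0 */
}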
@@ -2,7 +2,7 @@
/*
* net/sched/sch_fq.c Fair Queue Packet Scheduler (per flow pacing)
*
* Copyright (C) 2013-2015 Eric Dumazet <edumazet@google.com>
* Copyright (C) 2013-2023 Eric Dumazet <edumazet@google.com>
*
* Meant to be mostly used for locally generated traffic :
* Fast classification depends on skb->sk being set before reaching us.
@@ -73,7 +73,13 @@ struct fq_flow {
struct sk_buff *tail; /* last skb in the list */
unsigned long age; /* (jiffies | 1UL) when flow was emptied, for gc */
};
struct rb_node fq_node; /* anchor in fq_root[] trees */
union {
struct rb_node fq_node; /* anchor in fq_root[] trees */
/* Following field is only used for q->internal,
* because q->internal is not hashed in fq_root[]
*/
u64 stat_fastpath_packets;
};
struct sock *sk;
u32 socket_hash; /* sk_hash */
int qlen; /* number of packets in flow queue */
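The per-flow counter can share storage with fq_node because the internal (fast-path) flow is never inserted into the fq_root[] trees, so its rb_node is dead weight that the union reclaims. A small toy sketch of the same idea (toy types, not the kernel ones):

#include <stdint.h>
#include <stdio.h>

/* Toy stand-ins: the rb_node-like anchor is only needed by flows that
 * live in a search tree; the one special "internal" flow never does.
 */
struct toy_node {
	struct toy_node *left, *right, *parent;
};

struct toy_flow {
	union {
		struct toy_node node;		/* hashed flows */
		uint64_t fastpath_packets;	/* internal flow only */
	};
	unsigned int qlen;
};

int main(void)
{
	struct toy_flow internal = { .fastpath_packets = 0 };

	internal.fastpath_packets++;	/* safe: .node is never used here */
	printf("sizeof(struct toy_flow) = %zu (counter shares the anchor's storage)\n",
	       sizeof(struct toy_flow));
	return 0;
}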
@@ -104,6 +110,9 @@ struct fq_sched_data {
unsigned long unthrottle_latency_ns;
struct fq_flow internal; /* for non classified or high prio packets */
/* Read mostly cache line */
u32 quantum;
u32 initial_quantum;
u32 flow_refill_delay;
@@ -117,22 +126,27 @@ struct fq_sched_data {
u8 rate_enable;
u8 fq_trees_log;
u8 horizon_drop;
u32 timer_slack; /* hrtimer slack in ns */
/* Read/Write fields. */
u32 flows;
u32 inactive_flows;
u32 inactive_flows; /* Flows with no packet to send. */
u32 throttled_flows;
u64 stat_gc_flows;
u64 stat_internal_packets;
u64 stat_throttled;
struct qdisc_watchdog watchdog;
u64 stat_gc_flows;
/* Seldom used fields. */
u64 stat_internal_packets; /* aka highprio */
u64 stat_ce_mark;
u64 stat_horizon_drops;
u64 stat_horizon_caps;
u64 stat_flows_plimit;
u64 stat_pkts_too_long;
u64 stat_allocation_errors;
u32 timer_slack; /* hrtimer slack in ns */
struct qdisc_watchdog watchdog;
};
/*
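The reordering above groups fq_sched_data fields by access pattern (read-mostly configuration, hot read/write state, then seldom-used statistics) so the enqueue/dequeue hot path touches fewer cache lines. A hypothetical way to eyeball such a layout is to print where each group starts, as in this toy struct (not the real fq_sched_data):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define CACHE_LINE 64

/* Toy layout grouped like the patch: configuration first, hot counters
 * next, statistics last.
 */
struct toy_sched_data {
	/* Read mostly */
	uint32_t quantum;
	uint32_t initial_quantum;
	uint32_t flow_refill_delay;
	uint32_t flow_plimit;
	uint64_t flow_max_rate;
	uint64_t ce_threshold;
	uint64_t horizon;
	uint64_t orphan_mask;

	/* Read/Write */
	uint32_t flows;
	uint32_t inactive_flows;
	uint32_t throttled_flows;
	uint32_t pad;
	uint64_t ktime_cache;

	/* Seldom used */
	uint64_t stat_gc_flows;
	uint64_t stat_ce_mark;
};

static void where(const char *name, size_t off)
{
	printf("%-16s offset %3zu, cache line %zu\n", name, off, off / CACHE_LINE);
}

int main(void)
{
	where("quantum", offsetof(struct toy_sched_data, quantum));
	where("flows", offsetof(struct toy_sched_data, flows));
	where("stat_gc_flows", offsetof(struct toy_sched_data, stat_gc_flows));
	return 0;
}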
@@ -258,17 +272,64 @@ static void fq_gc(struct fq_sched_data *q,
kmem_cache_free_bulk(fq_flow_cachep, fcnt, tofree);
}
static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
/* Fast path can be used if:
 * 1) Packet tstamp is in the past.
 * 2) FQ qlen == 0 OR
 *    (no flow is currently eligible for transmit,
 *     AND fast path queue has less than 8 packets)
 * 3) No SO_MAX_PACING_RATE on the socket (if any).
 * 4) No @maxrate attribute on this qdisc.
 *
 * FQ cannot use the generic TCQ_F_CAN_BYPASS infrastructure.
 */
static bool fq_fastpath_check(const struct Qdisc *sch, struct sk_buff *skb)
{
const struct fq_sched_data *q = qdisc_priv(sch);
const struct sock *sk;
if (fq_skb_cb(skb)->time_to_send > q->ktime_cache)
return false;
if (sch->q.qlen != 0) {
/* Even if some packets are stored in this qdisc,
* we can still enable fast path if all of them are
* scheduled in the future (ie no flows are eligible)
* or in the fast path queue.
*/
if (q->flows != q->inactive_flows + q->throttled_flows)
return false;
/* Do not allow the fast path queue to explode; we want Fair Queue mode
 * under pressure.
 */
if (q->internal.qlen >= 8)
return false;
}
sk = skb->sk;
if (sk && sk_fullsock(sk) && !sk_is_tcp(sk) &&
sk->sk_max_pacing_rate != ~0UL)
return false;
if (q->flow_max_rate != ~0UL)
return false;
return true;
}
static struct fq_flow *fq_classify(struct Qdisc *sch, struct sk_buff *skb)
{
struct fq_sched_data *q = qdisc_priv(sch);
struct rb_node **p, *parent;
struct sock *sk = skb->sk;
struct rb_root *root;
struct fq_flow *f;
/* warning: no starvation prevention... */
if (unlikely((skb->priority & TC_PRIO_MAX) == TC_PRIO_CONTROL))
if (unlikely((skb->priority & TC_PRIO_MAX) == TC_PRIO_CONTROL)) {
q->stat_internal_packets++; /* highprio packet */
return &q->internal;
}
/* SYNACK messages are attached to a TCP_NEW_SYN_RECV request socket
* or a listener (SYNCOOKIE mode)
* 1) request sockets are not full blown,
@@ -299,11 +360,14 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
sk = (struct sock *)((hash << 1) | 1UL);
}
if (fq_fastpath_check(sch, skb)) {
q->internal.stat_fastpath_packets++;
return &q->internal;
}
root = &q->fq_root[hash_ptr(sk, q->fq_trees_log)];
if (q->flows >= (2U << q->fq_trees_log) &&
q->inactive_flows > q->flows/2)
fq_gc(q, root, sk);
fq_gc(q, root, sk);
p = &root->rb_node;
parent = NULL;
@@ -396,7 +460,6 @@ static void fq_dequeue_skb(struct Qdisc *sch, struct fq_flow *flow,
{
fq_erase_head(sch, flow, skb);
skb_mark_not_on_list(skb);
flow->qlen--;
qdisc_qstats_backlog_dec(sch, skb);
sch->q.qlen--;
}
@@ -448,49 +511,45 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
if (unlikely(sch->q.qlen >= sch->limit))
return qdisc_drop(skb, sch, to_free);
q->ktime_cache = ktime_get_ns();
if (!skb->tstamp) {
fq_skb_cb(skb)->time_to_send = q->ktime_cache = ktime_get_ns();
fq_skb_cb(skb)->time_to_send = q->ktime_cache;
} else {
/* Check if packet timestamp is too far in the future.
* Try first if our cached value, to avoid ktime_get_ns()
* cost in most cases.
*/
/* Check if packet timestamp is too far in the future. */
if (fq_packet_beyond_horizon(skb, q)) {
/* Refresh our cache and check another time */
q->ktime_cache = ktime_get_ns();
if (fq_packet_beyond_horizon(skb, q)) {
if (q->horizon_drop) {
if (q->horizon_drop) {
q->stat_horizon_drops++;
return qdisc_drop(skb, sch, to_free);
}
q->stat_horizon_caps++;
skb->tstamp = q->ktime_cache + q->horizon;
}
q->stat_horizon_caps++;
skb->tstamp = q->ktime_cache + q->horizon;
}
fq_skb_cb(skb)->time_to_send = skb->tstamp;
}
f = fq_classify(skb, q);
if (unlikely(f->qlen >= q->flow_plimit && f != &q->internal)) {
q->stat_flows_plimit++;
return qdisc_drop(skb, sch, to_free);
}
f = fq_classify(sch, skb);
f->qlen++;
qdisc_qstats_backlog_inc(sch, skb);
if (fq_flow_is_detached(f)) {
fq_flow_add_tail(&q->new_flows, f);
if (time_after(jiffies, f->age + q->flow_refill_delay))
f->credit = max_t(u32, f->credit, q->quantum);
q->inactive_flows--;
if (f != &q->internal) {
if (unlikely(f->qlen >= q->flow_plimit)) {
q->stat_flows_plimit++;
return qdisc_drop(skb, sch, to_free);
}
if (fq_flow_is_detached(f)) {
fq_flow_add_tail(&q->new_flows, f);
if (time_after(jiffies, f->age + q->flow_refill_delay))
f->credit = max_t(u32, f->credit, q->quantum);
}
if (f->qlen == 0)
q->inactive_flows--;
}
f->qlen++;
/* Note: this overwrites f->age */
flow_queue_add(f, skb);
if (unlikely(f == &q->internal)) {
q->stat_internal_packets++;
}
qdisc_qstats_backlog_inc(sch, skb);
sch->q.qlen++;
return NET_XMIT_SUCCESS;
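In the enqueue path above, a packet whose requested transmit time lies more than the configured horizon beyond the current time is either dropped (horizon_drop) or has its timestamp capped to now + horizon. A tiny stand-alone sketch of just that decision, with plain u64 nanosecond timestamps and a hypothetical helper name:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define DROP UINT64_MAX	/* sentinel meaning "drop the packet" */

/* Hypothetical helper mirroring only the cap-vs-drop choice made in
 * fq_enqueue(); timestamps are nanoseconds.
 */
static uint64_t apply_horizon(uint64_t tstamp, uint64_t now,
			      uint64_t horizon, bool horizon_drop)
{
	if (tstamp <= now + horizon)
		return tstamp;			/* within the horizon: keep it */
	if (horizon_drop)
		return DROP;			/* too far in the future: drop */
	return now + horizon;			/* cap to the horizon instead */
}

int main(void)
{
	const uint64_t nsec_per_sec = 1000000000ULL;
	uint64_t now = 100 * nsec_per_sec;
	uint64_t horizon = 10 * nsec_per_sec;	/* 10 s, for illustration */

	printf("%llu\n", (unsigned long long)
	       apply_horizon(now + 5 * nsec_per_sec, now, horizon, true));
	printf("%llu\n", (unsigned long long)
	       apply_horizon(now + 60 * nsec_per_sec, now, horizon, false));
	return 0;
}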
@@ -538,6 +597,7 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch)
skb = fq_peek(&q->internal);
if (unlikely(skb)) {
q->internal.qlen--;
fq_dequeue_skb(sch, &q->internal, skb);
goto out;
}
@@ -581,6 +641,8 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch)
INET_ECN_set_ce(skb);
q->stat_ce_mark++;
}
if (--f->qlen == 0)
q->inactive_flows++;
fq_dequeue_skb(sch, f, skb);
} else {
head->first = f->next;
@@ -589,7 +651,6 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch)
fq_flow_add_tail(&q->old_flows, f);
} else {
fq_flow_set_detached(f);
q->inactive_flows++;
}
goto begin;
}
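With the hunks above, inactive_flows counts flows that currently have no packet to send: enqueue decrements it when a flow's queue was empty, dequeue increments it when the queue drains, and fq_fastpath_check() uses flows == inactive_flows + throttled_flows to see whether anything is eligible right now. A toy model of that bookkeeping (hypothetical, outside the kernel):

#include <assert.h>

/* Toy bookkeeping: one qdisc-wide counter, one queue length per flow. */
struct toy_q { unsigned int flows, inactive_flows; };
struct toy_flow { unsigned int qlen; };

static void toy_enqueue(struct toy_q *q, struct toy_flow *f)
{
	if (f->qlen == 0)
		q->inactive_flows--;	/* flow now has something to send */
	f->qlen++;
}

static void toy_dequeue(struct toy_q *q, struct toy_flow *f)
{
	if (--f->qlen == 0)
		q->inactive_flows++;	/* flow drained: nothing left to send */
}

int main(void)
{
	struct toy_q q = { .flows = 1, .inactive_flows = 1 };
	struct toy_flow f = { .qlen = 0 };

	toy_enqueue(&q, &f);
	assert(q.inactive_flows == 0);	/* one flow, and it is active */
	toy_dequeue(&q, &f);
	assert(q.inactive_flows == 1);	/* back to "no packet to send" */
	return 0;
}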
@@ -1014,6 +1075,7 @@ static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
st.gc_flows = q->stat_gc_flows;
st.highprio_packets = q->stat_internal_packets;
st.fastpath_packets = q->internal.stat_fastpath_packets;
st.tcp_retrans = 0;
st.throttled = q->stat_throttled;
st.flows_plimit = q->stat_flows_plimit;