Commit 29dd5ec0 authored by David S. Miller

Merge branch 'vrf-perf'

David Ahern says:

====================
net: vrf: performance improvements

Device based features for VRF such as qdisc, netfilter and packet
captures are implemented by switching the dst on skbuffs to its per-VRF
dst. This has the effect of controlling the output function which points
to a function in the VRF driver. [1] The skb proceeds down the stack with
dst->dev pointing to the VRF device. Netfilter, qdisc and tc rules and
network taps are evaluated based on this device. Finally, the skb makes
it to the vrf_xmit function which resets the dst based on a FIB lookup.

The feature comes at a cost - between 5 and 10% depending on test (TCP vs
UDP, stream vs RR and IPv4 vs IPv6). The main cost is requiring a FIB
lookup in the VRF driver for each packet sent through it. The FIB lookup
is required because the real dst gets dropped so that the skb can
traverse the stack with dst->dev set to the VRF device.

All of that is really driven by the qdisc and not replicating the
processing of __dev_queue_xmit if a qdisc is set up on the device. But,
VRF devices by default do not have a qdisc and really have no need for
multiple Tx queues. This means the performance overhead is inflicted upon
all users for the potential use case of a qdisc being configured.

The overhead can be avoided by checking if the default configuration
applies to a specific VRF device before switching the dst. If a device
does not have a qdisc, the pass through netfilter hooks and packet taps
can be done inline without dropping the dst and thus avoiding the
performance penalty. With this change performance overhead of VRF drops
to negligible (difference within run-over-run variance) to 3% depending on
test type.

netperf performance comparison for 3 cases:
1. L3_MASTER_DEVICE compiled out
2. VRF with this patch set
3. current VRF code

IPv4
----
           no-l3mdev     new-vrf     old-vrf
TCP_RR       28778        28938*       27169
TCP_CRR      10706        10490         9770
UDP_RR       30750        29813        29256

* Although higher in the final run used for submitting this patch set, I
  think what this really represents is a negligible performance overhead for
  VRF with this change (i.e., within the +-1% variance of runs). Most
  notably the FIB lookups in the Tx path are avoided for TCP_RR.

IPv6
----
           no-l3mdev     new-vrf     old-vrf
TCP_RR       29495        29432       27794
TCP_CRR      10520        10338        9870
UDP_RR       26137        27019*      26511

* UDP is consistently better with VRF for two reasons:
  1. Source address selection with L3 domains is considering fewer
     addresses since only addresses on interfaces in the domain are
     considered for the selection. Specifically, perf-top shows
     ipv6_get_saddr_eval, ipv6_dev_get_saddr and __ipv6_dev_get_saddr
     running much lower with vrf than without.

  2. The VRF table contains all routes (i.e., there are no separate local
     and main tables per VRF). That means ip6_pol_route_output only has 1
     lookup for VRF where it does 2 without it (1 in the local table and 1
     in the main table).

[1] http://netdevconf.org/1.2/papers/ahern-what-is-l3mdev-paper.pdf
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents a2d133b1 a9ec54d1
...@@ -104,6 +104,23 @@ static void vrf_get_stats64(struct net_device *dev, ...@@ -104,6 +104,23 @@ static void vrf_get_stats64(struct net_device *dev,
} }
} }
/* by default VRF devices do not have a qdisc and are expected
* to be created with only a single queue.
*/
static bool qdisc_tx_is_default(const struct net_device *dev)
{
struct netdev_queue *txq;
struct Qdisc *qdisc;
if (dev->num_tx_queues > 1)
return false;
txq = netdev_get_tx_queue(dev, 0);
qdisc = rcu_access_pointer(txq->qdisc);
return !qdisc->enqueue;
}
/* Local traffic destined to local address. Reinsert the packet to rx /* Local traffic destined to local address. Reinsert the packet to rx
* path, similar to loopback handling. * path, similar to loopback handling.
*/ */
...@@ -357,6 +374,29 @@ static netdev_tx_t vrf_xmit(struct sk_buff *skb, struct net_device *dev) ...@@ -357,6 +374,29 @@ static netdev_tx_t vrf_xmit(struct sk_buff *skb, struct net_device *dev)
return ret; return ret;
} }
/* Last step of the direct output path.
 *
 * When the ptype_all list of the device in skb->dev is not empty and the
 * skb has at least ETH_HLEN bytes of headroom, an Ethernet header is
 * pushed temporarily (source = device address, destination = zero,
 * protocol = skb->protocol), the skb is passed to dev_queue_xmit_nit()
 * and the header is pulled again.
 *
 * @net and @sk are not used here. Always returns 1.
 */
static int vrf_finish_direct(struct net *net, struct sock *sk,
			     struct sk_buff *skb)
{
	struct net_device *vrf_dev = skb->dev;
	struct ethhdr *eth;

	/* nothing on ptype_all, or no room for the temporary header */
	if (list_empty(&vrf_dev->ptype_all) ||
	    unlikely(skb_headroom(skb) < ETH_HLEN))
		return 1;

	eth = (struct ethhdr *)skb_push(skb, ETH_HLEN);
	ether_addr_copy(eth->h_source, vrf_dev->dev_addr);
	eth_zero_addr(eth->h_dest);
	eth->h_proto = skb->protocol;

	rcu_read_lock_bh();
	dev_queue_xmit_nit(skb, vrf_dev);
	rcu_read_unlock_bh();

	/* undo the push so the skb again starts where it did on entry */
	skb_pull(skb, ETH_HLEN);

	return 1;
}
#if IS_ENABLED(CONFIG_IPV6) #if IS_ENABLED(CONFIG_IPV6)
/* modelled after ip6_finish_output2 */ /* modelled after ip6_finish_output2 */
static int vrf_finish_output6(struct net *net, struct sock *sk, static int vrf_finish_output6(struct net *net, struct sock *sk,
...@@ -405,18 +445,13 @@ static int vrf_output6(struct net *net, struct sock *sk, struct sk_buff *skb) ...@@ -405,18 +445,13 @@ static int vrf_output6(struct net *net, struct sock *sk, struct sk_buff *skb)
* packet to go through device based features such as qdisc, netfilter * packet to go through device based features such as qdisc, netfilter
* hooks and packet sockets with skb->dev set to vrf device. * hooks and packet sockets with skb->dev set to vrf device.
*/ */
static struct sk_buff *vrf_ip6_out(struct net_device *vrf_dev, static struct sk_buff *vrf_ip6_out_redirect(struct net_device *vrf_dev,
struct sock *sk,
struct sk_buff *skb) struct sk_buff *skb)
{ {
struct net_vrf *vrf = netdev_priv(vrf_dev); struct net_vrf *vrf = netdev_priv(vrf_dev);
struct dst_entry *dst = NULL; struct dst_entry *dst = NULL;
struct rt6_info *rt6; struct rt6_info *rt6;
/* don't divert link scope packets */
if (rt6_need_strict(&ipv6_hdr(skb)->daddr))
return skb;
rcu_read_lock(); rcu_read_lock();
rt6 = rcu_dereference(vrf->rt6); rt6 = rcu_dereference(vrf->rt6);
...@@ -438,6 +473,55 @@ static struct sk_buff *vrf_ip6_out(struct net_device *vrf_dev, ...@@ -438,6 +473,55 @@ static struct sk_buff *vrf_ip6_out(struct net_device *vrf_dev,
return skb; return skb;
} }
static int vrf_output6_direct(struct net *net, struct sock *sk,
struct sk_buff *skb)
{
skb->protocol = htons(ETH_P_IPV6);
return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
net, sk, skb, NULL, skb->dev,
vrf_finish_direct,
!(IPCB(skb)->flags & IPSKB_REROUTED));
}
/* Run the IPv6 output hooks inline with skb->dev set to the VRF device.
 *
 * skb->dev is pointed at @vrf_dev, then the NF_INET_LOCAL_OUT hook is
 * run; if it returns 1, vrf_output6_direct() is called as well.
 *
 * Returns @skb (after nf_reset()) when the final result is 1, otherwise
 * NULL.
 */
static struct sk_buff *vrf_ip6_out_direct(struct net_device *vrf_dev,
					  struct sock *sk,
					  struct sk_buff *skb)
{
	struct net *net = dev_net(vrf_dev);
	int err;

	skb->dev = vrf_dev;

	err = nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk,
		      skb, NULL, vrf_dev, vrf_output6_direct);
	if (likely(err == 1))
		err = vrf_output6_direct(net, sk, skb);

	/* any result other than 1: nothing is handed back to the caller */
	if (unlikely(err != 1))
		return NULL;

	/* clear netfilter state before returning the skb */
	nf_reset(skb);

	return skb;
}
/* IPv6 output entry point: choose between the direct and redirect paths.
 *
 * Packets whose destination address satisfies rt6_need_strict() are
 * returned untouched. Otherwise the direct path is taken when
 * qdisc_tx_is_default() holds for @vrf_dev, and the redirect path
 * otherwise.
 */
static struct sk_buff *vrf_ip6_out(struct net_device *vrf_dev,
				   struct sock *sk,
				   struct sk_buff *skb)
{
	/* link scope packets are not diverted */
	if (rt6_need_strict(&ipv6_hdr(skb)->daddr))
		return skb;

	return qdisc_tx_is_default(vrf_dev) ?
	       vrf_ip6_out_direct(vrf_dev, sk, skb) :
	       vrf_ip6_out_redirect(vrf_dev, skb);
}
/* holding rtnl */ /* holding rtnl */
static void vrf_rt6_release(struct net_device *dev, struct net_vrf *vrf) static void vrf_rt6_release(struct net_device *dev, struct net_vrf *vrf)
{ {
...@@ -607,18 +691,13 @@ static int vrf_output(struct net *net, struct sock *sk, struct sk_buff *skb) ...@@ -607,18 +691,13 @@ static int vrf_output(struct net *net, struct sock *sk, struct sk_buff *skb)
* packet to go through device based features such as qdisc, netfilter * packet to go through device based features such as qdisc, netfilter
* hooks and packet sockets with skb->dev set to vrf device. * hooks and packet sockets with skb->dev set to vrf device.
*/ */
static struct sk_buff *vrf_ip_out(struct net_device *vrf_dev, static struct sk_buff *vrf_ip_out_redirect(struct net_device *vrf_dev,
struct sock *sk,
struct sk_buff *skb) struct sk_buff *skb)
{ {
struct net_vrf *vrf = netdev_priv(vrf_dev); struct net_vrf *vrf = netdev_priv(vrf_dev);
struct dst_entry *dst = NULL; struct dst_entry *dst = NULL;
struct rtable *rth; struct rtable *rth;
/* don't divert multicast */
if (ipv4_is_multicast(ip_hdr(skb)->daddr))
return skb;
rcu_read_lock(); rcu_read_lock();
rth = rcu_dereference(vrf->rth); rth = rcu_dereference(vrf->rth);
...@@ -640,6 +719,55 @@ static struct sk_buff *vrf_ip_out(struct net_device *vrf_dev, ...@@ -640,6 +719,55 @@ static struct sk_buff *vrf_ip_out(struct net_device *vrf_dev,
return skb; return skb;
} }
static int vrf_output_direct(struct net *net, struct sock *sk,
struct sk_buff *skb)
{
skb->protocol = htons(ETH_P_IP);
return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
net, sk, skb, NULL, skb->dev,
vrf_finish_direct,
!(IPCB(skb)->flags & IPSKB_REROUTED));
}
/* Run the IPv4 output hooks inline with skb->dev set to the VRF device.
 *
 * skb->dev is pointed at @vrf_dev, then the NF_INET_LOCAL_OUT hook is
 * run; if it returns 1, vrf_output_direct() is called as well.
 *
 * Returns @skb (after nf_reset()) when the final result is 1, otherwise
 * NULL.
 */
static struct sk_buff *vrf_ip_out_direct(struct net_device *vrf_dev,
					 struct sock *sk,
					 struct sk_buff *skb)
{
	struct net *net = dev_net(vrf_dev);
	int err;

	skb->dev = vrf_dev;

	err = nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, net, sk,
		      skb, NULL, vrf_dev, vrf_output_direct);
	if (likely(err == 1))
		err = vrf_output_direct(net, sk, skb);

	/* any result other than 1: nothing is handed back to the caller */
	if (unlikely(err != 1))
		return NULL;

	/* clear netfilter state before returning the skb */
	nf_reset(skb);

	return skb;
}
/* IPv4 output entry point: choose between the direct and redirect paths.
 *
 * Packets with a multicast destination address are returned untouched.
 * Otherwise the direct path is taken when qdisc_tx_is_default() holds
 * for @vrf_dev, and the redirect path otherwise.
 */
static struct sk_buff *vrf_ip_out(struct net_device *vrf_dev,
				  struct sock *sk,
				  struct sk_buff *skb)
{
	/* multicast is not diverted */
	if (ipv4_is_multicast(ip_hdr(skb)->daddr))
		return skb;

	return qdisc_tx_is_default(vrf_dev) ?
	       vrf_ip_out_direct(vrf_dev, sk, skb) :
	       vrf_ip_out_redirect(vrf_dev, skb);
}
/* called with rcu lock held */ /* called with rcu lock held */
static struct sk_buff *vrf_l3_out(struct net_device *vrf_dev, static struct sk_buff *vrf_l3_out(struct net_device *vrf_dev,
struct sock *sk, struct sock *sk,
...@@ -980,9 +1108,11 @@ static struct sk_buff *vrf_ip6_rcv(struct net_device *vrf_dev, ...@@ -980,9 +1108,11 @@ static struct sk_buff *vrf_ip6_rcv(struct net_device *vrf_dev,
skb->dev = vrf_dev; skb->dev = vrf_dev;
skb->skb_iif = vrf_dev->ifindex; skb->skb_iif = vrf_dev->ifindex;
if (!list_empty(&vrf_dev->ptype_all)) {
skb_push(skb, skb->mac_len); skb_push(skb, skb->mac_len);
dev_queue_xmit_nit(skb, vrf_dev); dev_queue_xmit_nit(skb, vrf_dev);
skb_pull(skb, skb->mac_len); skb_pull(skb, skb->mac_len);
}
IP6CB(skb)->flags |= IP6SKB_L3SLAVE; IP6CB(skb)->flags |= IP6SKB_L3SLAVE;
} }
...@@ -1023,9 +1153,11 @@ static struct sk_buff *vrf_ip_rcv(struct net_device *vrf_dev, ...@@ -1023,9 +1153,11 @@ static struct sk_buff *vrf_ip_rcv(struct net_device *vrf_dev,
vrf_rx_stats(vrf_dev, skb->len); vrf_rx_stats(vrf_dev, skb->len);
if (!list_empty(&vrf_dev->ptype_all)) {
skb_push(skb, skb->mac_len); skb_push(skb, skb->mac_len);
dev_queue_xmit_nit(skb, vrf_dev); dev_queue_xmit_nit(skb, vrf_dev);
skb_pull(skb, skb->mac_len); skb_pull(skb, skb->mac_len);
}
skb = vrf_rcv_nfhook(NFPROTO_IPV4, NF_INET_PRE_ROUTING, skb, vrf_dev); skb = vrf_rcv_nfhook(NFPROTO_IPV4, NF_INET_PRE_ROUTING, skb, vrf_dev);
out: out:
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment