Commit 206f3985 authored by David S. Miller

Merge branch 'xen_netback'

xen-netback: IPv6 offload support

====================
This patch series adds support for checksum and large packet offloads
into xen-netback.  Testing has mainly been done using the Microsoft
network hardware certification suite running in Server 2008R2 VMs with
Citrix PV frontends.

v2:
- Fixed Wei's email address in Cc lines

v3:
- Responded to Wei's comments:
 - netif.h now updated with comments and a definition of
   XEN_NETIF_GSO_TYPE_NONE.
 - limited number of pullups
- Responded to Annie's comments:
 - New GSO_BIT macro

v4:
- Responded to more of Wei's comments
- Remove parsing of IPv6 fragment header and added warning

v5:
- Added comment concerning the value chosen for PKT_PROT_LEN
- Dropped deprecation of feature-no-csum-offload
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents c0f4ace7 82cada22
......@@ -87,9 +87,13 @@ struct pending_tx_info {
/* Per-slot bookkeeping recorded while RX ring requests are consumed and
 * read back when the matching responses are produced.
 */
struct xenvif_rx_meta {
/* Request id copied from the RX ring request (req->id). */
int id;
/* Reset to 0 when the slot is claimed.
 * NOTE(review): presumably the number of bytes placed in the slot - confirm.
 */
int size;
/* One of the XEN_NETIF_GSO_TYPE_* values; XEN_NETIF_GSO_TYPE_NONE when
 * the slot carries no GSO information.
 */
int gso_type;
/* GSO segment size taken from skb_shinfo(skb)->gso_size, or 0 when
 * no GSO information applies.
 */
int gso_size;
};
/* Bit for a GSO type in vif->gso_mask / vif->gso_prefix_mask.
 * The argument is the suffix of an XEN_NETIF_GSO_TYPE_* constant, e.g.
 * GSO_BIT(TCPV4) expands to (1 << XEN_NETIF_GSO_TYPE_TCPV4).
 */
#define GSO_BIT(type) \
(1 << XEN_NETIF_GSO_TYPE_ ## type)
/* Discriminate from any valid pending_idx value. */
#define INVALID_PENDING_IDX 0xFFFF
......@@ -150,10 +154,12 @@ struct xenvif {
u8 fe_dev_addr[6];
/* Frontend feature information. */
int gso_mask;
int gso_prefix_mask;
u8 can_sg:1;
u8 gso:1;
u8 gso_prefix:1;
u8 csum:1;
u8 ip_csum:1;
u8 ipv6_csum:1;
/* Internal feature information. */
u8 can_queue:1; /* can queue packets for receiver? */
......
......@@ -214,10 +214,14 @@ static netdev_features_t xenvif_fix_features(struct net_device *dev,
if (!vif->can_sg)
features &= ~NETIF_F_SG;
if (!vif->gso && !vif->gso_prefix)
if (~(vif->gso_mask | vif->gso_prefix_mask) & GSO_BIT(TCPV4))
features &= ~NETIF_F_TSO;
if (!vif->csum)
if (~(vif->gso_mask | vif->gso_prefix_mask) & GSO_BIT(TCPV6))
features &= ~NETIF_F_TSO6;
if (!vif->ip_csum)
features &= ~NETIF_F_IP_CSUM;
if (!vif->ipv6_csum)
features &= ~NETIF_F_IPV6_CSUM;
return features;
}
......@@ -306,7 +310,7 @@ struct xenvif *xenvif_alloc(struct device *parent, domid_t domid,
vif->domid = domid;
vif->handle = handle;
vif->can_sg = 1;
vif->csum = 1;
vif->ip_csum = 1;
vif->dev = dev;
vif->credit_bytes = vif->remaining_credit = ~0UL;
......@@ -316,8 +320,10 @@ struct xenvif *xenvif_alloc(struct device *parent, domid_t domid,
vif->credit_timeout.expires = jiffies;
dev->netdev_ops = &xenvif_netdev_ops;
dev->hw_features = NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_TSO;
dev->features = dev->hw_features;
dev->hw_features = NETIF_F_SG |
NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM |
NETIF_F_TSO | NETIF_F_TSO6;
dev->features = dev->hw_features | NETIF_F_RXCSUM;
SET_ETHTOOL_OPS(dev, &xenvif_ethtool_ops);
dev->tx_queue_len = XENVIF_QUEUE_LENGTH;
......
......@@ -109,15 +109,12 @@ static inline unsigned long idx_to_kaddr(struct xenvif *vif,
return (unsigned long)pfn_to_kaddr(idx_to_pfn(vif, idx));
}
/*
* This is the amount of packet we copy rather than map, so that the
* guest can't fiddle with the contents of the headers while we do
* packet processing on them (netfilter, routing, etc).
/* This is a minimum size for the linear area to avoid lots of
* calls to __pskb_pull_tail() as we set up checksum offsets. The
* value 128 was chosen as it covers all IPv4 and most likely
* IPv6 headers.
*/
#define PKT_PROT_LEN (ETH_HLEN + \
VLAN_HLEN + \
sizeof(struct iphdr) + MAX_IPOPTLEN + \
sizeof(struct tcphdr) + MAX_TCP_OPTION_SPACE)
#define PKT_PROT_LEN 128
static u16 frag_get_pending_idx(skb_frag_t *frag)
{
......@@ -145,7 +142,7 @@ static int max_required_rx_slots(struct xenvif *vif)
int max = DIV_ROUND_UP(vif->dev->mtu, PAGE_SIZE);
/* XXX FIXME: RX path dependent on MAX_SKB_FRAGS */
if (vif->can_sg || vif->gso || vif->gso_prefix)
if (vif->can_sg || vif->gso_mask || vif->gso_prefix_mask)
max += MAX_SKB_FRAGS + 1; /* extra_info + frags */
return max;
......@@ -317,6 +314,7 @@ static struct xenvif_rx_meta *get_next_rx_buffer(struct xenvif *vif,
req = RING_GET_REQUEST(&vif->rx, vif->rx.req_cons++);
meta = npo->meta + npo->meta_prod++;
meta->gso_type = XEN_NETIF_GSO_TYPE_NONE;
meta->gso_size = 0;
meta->size = 0;
meta->id = req->id;
......@@ -339,6 +337,7 @@ static void xenvif_gop_frag_copy(struct xenvif *vif, struct sk_buff *skb,
struct gnttab_copy *copy_gop;
struct xenvif_rx_meta *meta;
unsigned long bytes;
int gso_type;
/* Data must not cross a page boundary. */
BUG_ON(size + offset > PAGE_SIZE<<compound_order(page));
......@@ -397,7 +396,14 @@ static void xenvif_gop_frag_copy(struct xenvif *vif, struct sk_buff *skb,
}
/* Leave a gap for the GSO descriptor. */
if (*head && skb_shinfo(skb)->gso_size && !vif->gso_prefix)
if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4)
gso_type = XEN_NETIF_GSO_TYPE_TCPV4;
else if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6)
gso_type = XEN_NETIF_GSO_TYPE_TCPV6;
else
gso_type = XEN_NETIF_GSO_TYPE_NONE;
if (*head && ((1 << gso_type) & vif->gso_mask))
vif->rx.req_cons++;
*head = 0; /* There must be something in this buffer now. */
......@@ -428,14 +434,28 @@ static int xenvif_gop_skb(struct sk_buff *skb,
unsigned char *data;
int head = 1;
int old_meta_prod;
int gso_type;
int gso_size;
old_meta_prod = npo->meta_prod;
if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
gso_type = XEN_NETIF_GSO_TYPE_TCPV4;
gso_size = skb_shinfo(skb)->gso_size;
} else if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6) {
gso_type = XEN_NETIF_GSO_TYPE_TCPV6;
gso_size = skb_shinfo(skb)->gso_size;
} else {
gso_type = XEN_NETIF_GSO_TYPE_NONE;
gso_size = 0;
}
/* Set up a GSO prefix descriptor, if necessary */
if (skb_shinfo(skb)->gso_size && vif->gso_prefix) {
if ((1 << skb_shinfo(skb)->gso_type) & vif->gso_prefix_mask) {
req = RING_GET_REQUEST(&vif->rx, vif->rx.req_cons++);
meta = npo->meta + npo->meta_prod++;
meta->gso_size = skb_shinfo(skb)->gso_size;
meta->gso_type = gso_type;
meta->gso_size = gso_size;
meta->size = 0;
meta->id = req->id;
}
......@@ -443,10 +463,13 @@ static int xenvif_gop_skb(struct sk_buff *skb,
req = RING_GET_REQUEST(&vif->rx, vif->rx.req_cons++);
meta = npo->meta + npo->meta_prod++;
if (!vif->gso_prefix)
meta->gso_size = skb_shinfo(skb)->gso_size;
else
if ((1 << gso_type) & vif->gso_mask) {
meta->gso_type = gso_type;
meta->gso_size = gso_size;
} else {
meta->gso_type = XEN_NETIF_GSO_TYPE_NONE;
meta->gso_size = 0;
}
meta->size = 0;
meta->id = req->id;
......@@ -592,7 +615,8 @@ void xenvif_rx_action(struct xenvif *vif)
vif = netdev_priv(skb->dev);
if (vif->meta[npo.meta_cons].gso_size && vif->gso_prefix) {
if ((1 << vif->meta[npo.meta_cons].gso_type) &
vif->gso_prefix_mask) {
resp = RING_GET_RESPONSE(&vif->rx,
vif->rx.rsp_prod_pvt++);
......@@ -629,7 +653,8 @@ void xenvif_rx_action(struct xenvif *vif)
vif->meta[npo.meta_cons].size,
flags);
if (vif->meta[npo.meta_cons].gso_size && !vif->gso_prefix) {
if ((1 << vif->meta[npo.meta_cons].gso_type) &
vif->gso_mask) {
struct xen_netif_extra_info *gso =
(struct xen_netif_extra_info *)
RING_GET_RESPONSE(&vif->rx,
......@@ -637,8 +662,8 @@ void xenvif_rx_action(struct xenvif *vif)
resp->flags |= XEN_NETRXF_extra_info;
gso->u.gso.type = vif->meta[npo.meta_cons].gso_type;
gso->u.gso.size = vif->meta[npo.meta_cons].gso_size;
gso->u.gso.type = XEN_NETIF_GSO_TYPE_TCPV4;
gso->u.gso.pad = 0;
gso->u.gso.features = 0;
......@@ -1101,15 +1126,20 @@ static int xenvif_set_skb_gso(struct xenvif *vif,
return -EINVAL;
}
/* Currently only TCPv4 S.O. is supported. */
if (gso->u.gso.type != XEN_NETIF_GSO_TYPE_TCPV4) {
switch (gso->u.gso.type) {
case XEN_NETIF_GSO_TYPE_TCPV4:
skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
break;
case XEN_NETIF_GSO_TYPE_TCPV6:
skb_shinfo(skb)->gso_type = SKB_GSO_TCPV6;
break;
default:
netdev_err(vif->dev, "Bad GSO type %d.\n", gso->u.gso.type);
xenvif_fatal_tx_err(vif);
return -EINVAL;
}
skb_shinfo(skb)->gso_size = gso->u.gso.size;
skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
/* Header must be checked, and gso_segs computed. */
skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
......@@ -1118,61 +1148,74 @@ static int xenvif_set_skb_gso(struct xenvif *vif,
return 0;
}
static int checksum_setup(struct xenvif *vif, struct sk_buff *skb)
static inline void maybe_pull_tail(struct sk_buff *skb, unsigned int len)
{
struct iphdr *iph;
int err = -EPROTO;
int recalculate_partial_csum = 0;
/*
* A GSO SKB must be CHECKSUM_PARTIAL. However some buggy
* peers can fail to set NETRXF_csum_blank when sending a GSO
* frame. In this case force the SKB to CHECKSUM_PARTIAL and
* recalculate the partial checksum.
if (skb_is_nonlinear(skb) && skb_headlen(skb) < len) {
/* If we need to pullup then pullup to the max, so we
* won't need to do it again.
*/
if (skb->ip_summed != CHECKSUM_PARTIAL && skb_is_gso(skb)) {
vif->rx_gso_checksum_fixup++;
skb->ip_summed = CHECKSUM_PARTIAL;
recalculate_partial_csum = 1;
int target = min_t(int, skb->len, MAX_TCP_HEADER);
__pskb_pull_tail(skb, target - skb_headlen(skb));
}
}
/* A non-CHECKSUM_PARTIAL SKB does not require setup. */
if (skb->ip_summed != CHECKSUM_PARTIAL)
return 0;
static int checksum_setup_ip(struct xenvif *vif, struct sk_buff *skb,
int recalculate_partial_csum)
{
struct iphdr *iph = (void *)skb->data;
unsigned int header_size;
unsigned int off;
int err = -EPROTO;
if (skb->protocol != htons(ETH_P_IP))
goto out;
off = sizeof(struct iphdr);
header_size = skb->network_header + off + MAX_IPOPTLEN;
maybe_pull_tail(skb, header_size);
off = iph->ihl * 4;
iph = (void *)skb->data;
switch (iph->protocol) {
case IPPROTO_TCP:
if (!skb_partial_csum_set(skb, 4 * iph->ihl,
if (!skb_partial_csum_set(skb, off,
offsetof(struct tcphdr, check)))
goto out;
if (recalculate_partial_csum) {
struct tcphdr *tcph = tcp_hdr(skb);
header_size = skb->network_header +
off +
sizeof(struct tcphdr);
maybe_pull_tail(skb, header_size);
tcph->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
skb->len - iph->ihl*4,
skb->len - off,
IPPROTO_TCP, 0);
}
break;
case IPPROTO_UDP:
if (!skb_partial_csum_set(skb, 4 * iph->ihl,
if (!skb_partial_csum_set(skb, off,
offsetof(struct udphdr, check)))
goto out;
if (recalculate_partial_csum) {
struct udphdr *udph = udp_hdr(skb);
header_size = skb->network_header +
off +
sizeof(struct udphdr);
maybe_pull_tail(skb, header_size);
udph->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
skb->len - iph->ihl*4,
skb->len - off,
IPPROTO_UDP, 0);
}
break;
default:
if (net_ratelimit())
netdev_err(vif->dev,
"Attempting to checksum a non-TCP/UDP packet, dropping a protocol %d packet\n",
"Attempting to checksum a non-TCP/UDP packet, "
"dropping a protocol %d packet\n",
iph->protocol);
goto out;
}
......@@ -1183,6 +1226,158 @@ static int checksum_setup(struct xenvif *vif, struct sk_buff *skb)
return err;
}
/* Set up CHECKSUM_PARTIAL offsets for an IPv6 packet carrying TCP or UDP.
 *
 * skb->data is treated as the start of the fixed IPv6 header.  The
 * extension header chain (hop-by-hop, destination options, routing and
 * authentication headers) is walked to find the upper-layer protocol.
 * Fragmented packets and upper-layer protocols other than TCP/UDP are
 * rejected.
 *
 * @vif: interface the packet belongs to; used for error reporting.
 * @skb: packet to set up.
 * @recalculate_partial_csum: if non-zero, also seed the TCP/UDP checksum
 *	field with the pseudo-header checksum.
 *
 * Returns 0 on success, -EPROTO if the packet cannot be handled.
 */
static int checksum_setup_ipv6(struct xenvif *vif, struct sk_buff *skb,
			       int recalculate_partial_csum)
{
	int err = -EPROTO;
	struct ipv6hdr *ipv6h;
	u8 nexthdr;
	unsigned int header_size;
	unsigned int off;
	unsigned int len;
	/* Must start out false: it is only ever set for fragment headers
	 * but is tested unconditionally after the header walk.
	 */
	bool fragment = false;
	bool done = false;

	off = sizeof(struct ipv6hdr);

	header_size = skb->network_header + off;
	maybe_pull_tail(skb, header_size);
	/* The pull may not have been able to supply the bytes (short
	 * packet); never read a header that is not in the linear area.
	 */
	if (skb_headlen(skb) < off)
		goto bad_header;

	/* Header pointers are only taken after pulling, because the pull
	 * may reallocate the skb head and invalidate earlier pointers.
	 */
	ipv6h = (void *)skb->data;
	nexthdr = ipv6h->nexthdr;
	len = sizeof(struct ipv6hdr) + ntohs(ipv6h->payload_len);

	while (off <= len && !done) {
		switch (nexthdr) {
		case IPPROTO_DSTOPTS:
		case IPPROTO_HOPOPTS:
		case IPPROTO_ROUTING: {
			struct ipv6_opt_hdr *hp;

			header_size = skb->network_header +
				off +
				sizeof(struct ipv6_opt_hdr);
			maybe_pull_tail(skb, header_size);
			if (skb_headlen(skb) <
			    off + sizeof(struct ipv6_opt_hdr))
				goto bad_header;

			hp = (void *)(skb->data + off);
			nexthdr = hp->nexthdr;
			off += ipv6_optlen(hp);
			break;
		}
		case IPPROTO_AH: {
			struct ip_auth_hdr *hp;

			header_size = skb->network_header +
				off +
				sizeof(struct ip_auth_hdr);
			maybe_pull_tail(skb, header_size);
			if (skb_headlen(skb) <
			    off + sizeof(struct ip_auth_hdr))
				goto bad_header;

			hp = (void *)(skb->data + off);
			nexthdr = hp->nexthdr;
			off += (hp->hdrlen + 2) << 2;
			break;
		}
		case IPPROTO_FRAGMENT:
			fragment = true;
			/* fall through */
		default:
			done = true;
			break;
		}
	}

	if (!done)
		goto bad_header;

	if (fragment) {
		if (net_ratelimit())
			netdev_err(vif->dev, "Packet is a fragment!\n");
		goto out;
	}

	switch (nexthdr) {
	case IPPROTO_TCP:
		/* Linearise the transport header before
		 * skb_partial_csum_set() validates the offsets against
		 * the head length.
		 */
		header_size = skb->network_header +
			off +
			sizeof(struct tcphdr);
		maybe_pull_tail(skb, header_size);
		if (skb_headlen(skb) < off + sizeof(struct tcphdr))
			goto bad_header;

		if (!skb_partial_csum_set(skb, off,
					  offsetof(struct tcphdr, check)))
			goto out;

		if (recalculate_partial_csum) {
			/* Re-read after the pull above. */
			ipv6h = (void *)skb->data;
			tcp_hdr(skb)->check =
				~csum_ipv6_magic(&ipv6h->saddr,
						 &ipv6h->daddr,
						 skb->len - off,
						 IPPROTO_TCP, 0);
		}
		break;
	case IPPROTO_UDP:
		header_size = skb->network_header +
			off +
			sizeof(struct udphdr);
		maybe_pull_tail(skb, header_size);
		if (skb_headlen(skb) < off + sizeof(struct udphdr))
			goto bad_header;

		if (!skb_partial_csum_set(skb, off,
					  offsetof(struct udphdr, check)))
			goto out;

		if (recalculate_partial_csum) {
			/* Re-read after the pull above. */
			ipv6h = (void *)skb->data;
			udp_hdr(skb)->check =
				~csum_ipv6_magic(&ipv6h->saddr,
						 &ipv6h->daddr,
						 skb->len - off,
						 IPPROTO_UDP, 0);
		}
		break;
	default:
		if (net_ratelimit())
			netdev_err(vif->dev,
				   "Attempting to checksum a non-TCP/UDP packet, "
				   "dropping a protocol %d packet\n",
				   nexthdr);
		goto out;
	}

	err = 0;
	goto out;

bad_header:
	if (net_ratelimit())
		netdev_err(vif->dev, "Failed to parse packet header\n");
out:
	return err;
}
/* Prepare a received skb for checksum offload.
 *
 * Packets that are not CHECKSUM_PARTIAL need no work, except for GSO
 * packets: a GSO skb must be CHECKSUM_PARTIAL, but some buggy peers
 * fail to set NETRXF_csum_blank on GSO frames.  Those are forced to
 * CHECKSUM_PARTIAL here and the protocol handler is asked to
 * recalculate the partial checksum.
 *
 * Returns 0 on success or when no setup is needed, the handler's
 * result for IPv4/IPv6, and -EPROTO for any other protocol.
 */
static int checksum_setup(struct xenvif *vif, struct sk_buff *skb)
{
	int fixup = 0;

	if (skb->ip_summed != CHECKSUM_PARTIAL) {
		/* Neither partial nor GSO: nothing to set up. */
		if (!skb_is_gso(skb))
			return 0;

		vif->rx_gso_checksum_fixup++;
		skb->ip_summed = CHECKSUM_PARTIAL;
		fixup = 1;
	}

	if (skb->protocol == htons(ETH_P_IP))
		return checksum_setup_ip(vif, skb, fixup);

	if (skb->protocol == htons(ETH_P_IPV6))
		return checksum_setup_ipv6(vif, skb, fixup);

	return -EPROTO;
}
static bool tx_credit_exceeded(struct xenvif *vif, unsigned size)
{
unsigned long now = jiffies;
......@@ -1428,12 +1623,7 @@ static int xenvif_tx_submit(struct xenvif *vif, int budget)
xenvif_fill_frags(vif, skb);
/*
* If the initial fragment was < PKT_PROT_LEN then
* pull through some bytes from the other fragments to
* increase the linear region to PKT_PROT_LEN bytes.
*/
if (skb_headlen(skb) < PKT_PROT_LEN && skb_is_nonlinear(skb)) {
if (skb_is_nonlinear(skb) && skb_headlen(skb) < PKT_PROT_LEN) {
int target = min_t(int, skb->len, PKT_PROT_LEN);
__pskb_pull_tail(skb, target - skb_headlen(skb));
}
......
......@@ -105,6 +105,22 @@ static int netback_probe(struct xenbus_device *dev,
goto abort_transaction;
}
err = xenbus_printf(xbt, dev->nodename, "feature-gso-tcpv6",
"%d", sg);
if (err) {
message = "writing feature-gso-tcpv6";
goto abort_transaction;
}
/* We support partial checksum setup for IPv6 packets */
err = xenbus_printf(xbt, dev->nodename,
"feature-ipv6-csum-offload",
"%d", 1);
if (err) {
message = "writing feature-ipv6-csum-offload";
goto abort_transaction;
}
/* We support rx-copy path. */
err = xenbus_printf(xbt, dev->nodename,
"feature-rx-copy", "%d", 1);
......@@ -561,20 +577,50 @@ static int connect_rings(struct backend_info *be)
val = 0;
vif->can_sg = !!val;
vif->gso_mask = 0;
vif->gso_prefix_mask = 0;
if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-gso-tcpv4",
"%d", &val) < 0)
val = 0;
vif->gso = !!val;
if (val)
vif->gso_mask |= GSO_BIT(TCPV4);
if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-gso-tcpv4-prefix",
"%d", &val) < 0)
val = 0;
vif->gso_prefix = !!val;
if (val)
vif->gso_prefix_mask |= GSO_BIT(TCPV4);
if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-gso-tcpv6",
"%d", &val) < 0)
val = 0;
if (val)
vif->gso_mask |= GSO_BIT(TCPV6);
if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-gso-tcpv6-prefix",
"%d", &val) < 0)
val = 0;
if (val)
vif->gso_prefix_mask |= GSO_BIT(TCPV6);
if (vif->gso_mask & vif->gso_prefix_mask) {
xenbus_dev_fatal(dev, err,
"%s: gso and gso prefix flags are not "
"mutually exclusive",
dev->otherend);
return -EOPNOTSUPP;
}
if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-no-csum-offload",
"%d", &val) < 0)
val = 0;
vif->csum = !val;
vif->ip_csum = !val;
if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-ipv6-csum-offload",
"%d", &val) < 0)
val = 0;
vif->ipv6_csum = !!val;
/* Map the shared frame, irq etc. */
err = xenvif_connect(vif, tx_ring_ref, rx_ring_ref,
......
......@@ -50,6 +50,20 @@
* node as before.
*/
/*
* "feature-no-csum-offload" should be used to turn IPv4 TCP/UDP checksum
* offload off or on. If it is missing then the feature is assumed to be on.
* "feature-ipv6-csum-offload" should be used to turn IPv6 TCP/UDP checksum
* offload on or off. If it is missing then the feature is assumed to be off.
*/
/*
* "feature-gso-tcpv4" and "feature-gso-tcpv6" advertise the capability to
* handle large TCP packets (in IPv4 or IPv6 form respectively). Neither
* frontends nor backends are assumed to be capable unless the flags are
* present.
*/
/*
* This is the 'wire' format for packets:
* Request 1: xen_netif_tx_request -- XEN_NETTXF_* (any flags)
......@@ -95,8 +109,10 @@ struct xen_netif_tx_request {
#define _XEN_NETIF_EXTRA_FLAG_MORE (0)
#define XEN_NETIF_EXTRA_FLAG_MORE (1U<<_XEN_NETIF_EXTRA_FLAG_MORE)
/* GSO types - only TCPv4 currently supported. */
/* GSO types */
#define XEN_NETIF_GSO_TYPE_NONE (0)
#define XEN_NETIF_GSO_TYPE_TCPV4 (1)
#define XEN_NETIF_GSO_TYPE_TCPV6 (2)
/*
* This structure needs to fit within both netif_tx_request and
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment