Commit 58deb77c authored by Matteo Croce, committed by David S. Miller

bonding: balance ICMP echoes in layer3+4 mode

The bonding driver uses the L4 ports to balance flows between slaves. As the
ICMP protocol has no ports, those packets are all sent to the same device:

    # tcpdump -qltnni veth0 ip |sed 's/^/0: /' &
    # tcpdump -qltnni veth1 ip |sed 's/^/1: /' &
    # ping -qc1 192.168.0.2
    1: IP 192.168.0.1 > 192.168.0.2: ICMP echo request, id 315, seq 1, length 64
    1: IP 192.168.0.2 > 192.168.0.1: ICMP echo reply, id 315, seq 1, length 64
    # ping -qc1 192.168.0.2
    1: IP 192.168.0.1 > 192.168.0.2: ICMP echo request, id 316, seq 1, length 64
    1: IP 192.168.0.2 > 192.168.0.1: ICMP echo reply, id 316, seq 1, length 64
    # ping -qc1 192.168.0.2
    1: IP 192.168.0.1 > 192.168.0.2: ICMP echo request, id 317, seq 1, length 64
    1: IP 192.168.0.2 > 192.168.0.1: ICMP echo reply, id 317, seq 1, length 64

But some ICMP packets have an Identifier field which is used to match
packets within sessions. Let's use this value in the hash function to
balance these packets between bond slaves:

    # ping -qc1 192.168.0.2
    0: IP 192.168.0.1 > 192.168.0.2: ICMP echo request, id 303, seq 1, length 64
    0: IP 192.168.0.2 > 192.168.0.1: ICMP echo reply, id 303, seq 1, length 64
    # ping -qc1 192.168.0.2
    1: IP 192.168.0.1 > 192.168.0.2: ICMP echo request, id 304, seq 1, length 64
    1: IP 192.168.0.2 > 192.168.0.1: ICMP echo reply, id 304, seq 1, length 64

Also, let's use a flow_dissector_key which defines FLOW_DISSECTOR_KEY_ICMP,
so we can balance pings encapsulated in a tunnel when using mode encap3+4:

    # ping -q 192.168.1.2 -c1
    0: IP 192.168.0.1 > 192.168.0.2: GREv0, length 102: IP 192.168.1.1 > 192.168.1.2: ICMP echo request, id 585, seq 1, length 64
    0: IP 192.168.0.2 > 192.168.0.1: GREv0, length 102: IP 192.168.1.2 > 192.168.1.1: ICMP echo reply, id 585, seq 1, length 64
    # ping -q 192.168.1.2 -c1
    1: IP 192.168.0.1 > 192.168.0.2: GREv0, length 102: IP 192.168.1.1 > 192.168.1.2: ICMP echo request, id 586, seq 1, length 64
    1: IP 192.168.0.2 > 192.168.0.1: GREv0, length 102: IP 192.168.1.2 > 192.168.1.1: ICMP echo reply, id 586, seq 1, length 64

Signed-off-by: Matteo Croce <mcroce@redhat.com>
Reviewed-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent 5dec597e
...@@ -200,6 +200,51 @@ atomic_t netpoll_block_tx = ATOMIC_INIT(0); ...@@ -200,6 +200,51 @@ atomic_t netpoll_block_tx = ATOMIC_INIT(0);
unsigned int bond_net_id __read_mostly; unsigned int bond_net_id __read_mostly;
/*
 * Key table for the bonding driver's private flow dissector.
 *
 * Each entry pairs a dissector key id with the offset of the matching member
 * inside struct flow_keys.  The table is handed to skb_flow_dissector_init()
 * from bonding_init(), and the resulting dissector is what
 * bond_flow_dissect() passes to __skb_flow_dissect() for transmit policies
 * above BOND_XMIT_POLICY_LAYER23.
 *
 * FLOW_DISSECTOR_KEY_ICMP is listed so that the ICMP identifier is available
 * to the transmit hash, which lets echo request/reply flows be spread over
 * the slaves even though ICMP carries no L4 ports.
 */
static const struct flow_dissector_key flow_keys_bonding_keys[] = {
	/* Keys describing the packet itself. */
	{ .key_id = FLOW_DISSECTOR_KEY_CONTROL,
	  .offset = offsetof(struct flow_keys, control) },
	{ .key_id = FLOW_DISSECTOR_KEY_BASIC,
	  .offset = offsetof(struct flow_keys, basic) },

	/* Address keys; all of them live inside the addrs member. */
	{ .key_id = FLOW_DISSECTOR_KEY_IPV4_ADDRS,
	  .offset = offsetof(struct flow_keys, addrs.v4addrs) },
	{ .key_id = FLOW_DISSECTOR_KEY_IPV6_ADDRS,
	  .offset = offsetof(struct flow_keys, addrs.v6addrs) },
	{ .key_id = FLOW_DISSECTOR_KEY_TIPC,
	  .offset = offsetof(struct flow_keys, addrs.tipckey) },

	/* Per-flow discriminators: L4 ports, or the ICMP fields. */
	{ .key_id = FLOW_DISSECTOR_KEY_PORTS,
	  .offset = offsetof(struct flow_keys, ports) },
	{ .key_id = FLOW_DISSECTOR_KEY_ICMP,
	  .offset = offsetof(struct flow_keys, icmp) },

	/* Tagging and encapsulation keys. */
	{ .key_id = FLOW_DISSECTOR_KEY_VLAN,
	  .offset = offsetof(struct flow_keys, vlan) },
	{ .key_id = FLOW_DISSECTOR_KEY_FLOW_LABEL,
	  .offset = offsetof(struct flow_keys, tags) },
	{ .key_id = FLOW_DISSECTOR_KEY_GRE_KEYID,
	  .offset = offsetof(struct flow_keys, keyid) },
};

/* Dissector instance built from flow_keys_bonding_keys in bonding_init(). */
static struct flow_dissector flow_keys_bonding __read_mostly;
/*-------------------------- Forward declarations ---------------------------*/ /*-------------------------- Forward declarations ---------------------------*/
static int bond_init(struct net_device *bond_dev); static int bond_init(struct net_device *bond_dev);
...@@ -3263,10 +3308,14 @@ static bool bond_flow_dissect(struct bonding *bond, struct sk_buff *skb, ...@@ -3263,10 +3308,14 @@ static bool bond_flow_dissect(struct bonding *bond, struct sk_buff *skb,
const struct iphdr *iph; const struct iphdr *iph;
int noff, proto = -1; int noff, proto = -1;
if (bond->params.xmit_policy > BOND_XMIT_POLICY_LAYER23) if (bond->params.xmit_policy > BOND_XMIT_POLICY_LAYER23) {
return skb_flow_dissect_flow_keys(skb, fk, 0); memset(fk, 0, sizeof(*fk));
return __skb_flow_dissect(NULL, skb, &flow_keys_bonding,
fk, NULL, 0, 0, 0, 0);
}
fk->ports.ports = 0; fk->ports.ports = 0;
memset(&fk->icmp, 0, sizeof(fk->icmp));
noff = skb_network_offset(skb); noff = skb_network_offset(skb);
if (skb->protocol == htons(ETH_P_IP)) { if (skb->protocol == htons(ETH_P_IP)) {
if (unlikely(!pskb_may_pull(skb, noff + sizeof(*iph)))) if (unlikely(!pskb_may_pull(skb, noff + sizeof(*iph))))
...@@ -3286,8 +3335,14 @@ static bool bond_flow_dissect(struct bonding *bond, struct sk_buff *skb, ...@@ -3286,8 +3335,14 @@ static bool bond_flow_dissect(struct bonding *bond, struct sk_buff *skb,
} else { } else {
return false; return false;
} }
if (bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER34 && proto >= 0) if (bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER34 && proto >= 0) {
if (proto == IPPROTO_ICMP || proto == IPPROTO_ICMPV6)
skb_flow_get_icmp_tci(skb, &fk->icmp, skb->data,
skb_transport_offset(skb),
skb_headlen(skb));
else
fk->ports.ports = skb_flow_get_ports(skb, noff, proto); fk->ports.ports = skb_flow_get_ports(skb, noff, proto);
}
return true; return true;
} }
...@@ -3314,10 +3369,14 @@ u32 bond_xmit_hash(struct bonding *bond, struct sk_buff *skb) ...@@ -3314,10 +3369,14 @@ u32 bond_xmit_hash(struct bonding *bond, struct sk_buff *skb)
return bond_eth_hash(skb); return bond_eth_hash(skb);
if (bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER23 || if (bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER23 ||
bond->params.xmit_policy == BOND_XMIT_POLICY_ENCAP23) bond->params.xmit_policy == BOND_XMIT_POLICY_ENCAP23) {
hash = bond_eth_hash(skb); hash = bond_eth_hash(skb);
} else {
if (flow.icmp.id)
memcpy(&hash, &flow.icmp, sizeof(hash));
else else
hash = (__force u32)flow.ports.ports; memcpy(&hash, &flow.ports.ports, sizeof(hash));
}
hash ^= (__force u32)flow_get_u32_dst(&flow) ^ hash ^= (__force u32)flow_get_u32_dst(&flow) ^
(__force u32)flow_get_u32_src(&flow); (__force u32)flow_get_u32_src(&flow);
hash ^= (hash >> 16); hash ^= (hash >> 16);
...@@ -4901,6 +4960,10 @@ static int __init bonding_init(void) ...@@ -4901,6 +4960,10 @@ static int __init bonding_init(void)
goto err; goto err;
} }
skb_flow_dissector_init(&flow_keys_bonding,
flow_keys_bonding_keys,
ARRAY_SIZE(flow_keys_bonding_keys));
register_netdevice_notifier(&bond_netdev_notifier); register_netdevice_notifier(&bond_netdev_notifier);
out: out:
return res; return res;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment