Commit 293bfa9b authored by David S. Miller's avatar David S. Miller

Merge branch 'xdp-head-adjustment'

Martin KaFai Lau says:

====================
Allow head adjustment in XDP prog

This series adds a helper to allow head adjusting in XDP prog.  mlx4
driver has been modified to support this feature.  An example is written
to encapsulate a packet with an IPv4/v6 header and then XDP_TX it
out.

v4:
1. Remove XDP_QUERY_FEATURES command.  Instead, check
   the prog->xdp_adjust_head bit inside the driver itself
   during XDP_SETUP_PROG in patch 1of4.
   Thanks for everybody's ideas.
2. Nit changes on sample code per Jesper

v3:
1. Check if the driver supports head adjustment before
   setting the xdp_prog fd to the device in patch 1of4.
2. Remove the page alignment assumption on the data_hard_start.
   Instead, add data_hard_start to the struct xdp_buff and the
   driver has to fill it if it supports head adjustment.
3. Keep the wire MTU as before in mlx4
4. Set map0_byte_count to PAGE_SIZE in patch 3of4

v2:
1. Make a variable name change in bpf_xdp_adjust_head() in patch 1
2. Ensure no less than ETH_HLEN data in bpf_xdp_adjust_head() in patch 1
3. Some clarifications in commit log messages of patch 2 and 3
====================
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parents 8a03cf2c 12d8bb64
......@@ -766,7 +766,7 @@ static int bpf_jit_build_body(struct bpf_prog *fp, u32 *image,
func = (u8 *) __bpf_call_base + imm;
/* Save skb pointer if we need to re-cache skb data */
if (bpf_helper_changes_skb_data(func))
if (bpf_helper_changes_pkt_data(func))
PPC_BPF_STL(3, 1, bpf_jit_stack_local(ctx));
bpf_jit_emit_func_call(image, ctx, (u64)func);
......@@ -775,7 +775,7 @@ static int bpf_jit_build_body(struct bpf_prog *fp, u32 *image,
PPC_MR(b2p[BPF_REG_0], 3);
/* refresh skb cache */
if (bpf_helper_changes_skb_data(func)) {
if (bpf_helper_changes_pkt_data(func)) {
/* reload skb pointer to r3 */
PPC_BPF_LL(3, 1, bpf_jit_stack_local(ctx));
bpf_jit_emit_skb_loads(image, ctx);
......
......@@ -981,7 +981,7 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i
EMIT2(0x0d00, REG_14, REG_W1);
/* lgr %b0,%r2: load return value into %b0 */
EMIT4(0xb9040000, BPF_REG_0, REG_2);
if (bpf_helper_changes_skb_data((void *)func)) {
if (bpf_helper_changes_pkt_data((void *)func)) {
jit->seen |= SEEN_SKB_CHANGE;
/* lg %b1,ST_OFF_SKBP(%r15) */
EMIT6_DISP_LH(0xe3000000, 0x0004, BPF_REG_1, REG_0,
......
......@@ -853,7 +853,7 @@ xadd: if (is_imm8(insn->off))
func = (u8 *) __bpf_call_base + imm32;
jmp_offset = func - (image + addrs[i]);
if (seen_ld_abs) {
reload_skb_data = bpf_helper_changes_skb_data(func);
reload_skb_data = bpf_helper_changes_pkt_data(func);
if (reload_skb_data) {
EMIT1(0x57); /* push %rdi */
jmp_offset += 22; /* pop, mov, sub, mov */
......
......@@ -51,6 +51,9 @@
#include "mlx4_en.h"
#include "en_port.h"
#define MLX4_EN_MAX_XDP_MTU ((int)(PAGE_SIZE - ETH_HLEN - (2 * VLAN_HLEN) - \
XDP_PACKET_HEADROOM))
int mlx4_en_setup_tc(struct net_device *dev, u8 up)
{
struct mlx4_en_priv *priv = netdev_priv(dev);
......@@ -2249,6 +2252,19 @@ void mlx4_en_destroy_netdev(struct net_device *dev)
free_netdev(dev);
}
static bool mlx4_en_check_xdp_mtu(struct net_device *dev, int mtu)
{
struct mlx4_en_priv *priv = netdev_priv(dev);
if (mtu > MLX4_EN_MAX_XDP_MTU) {
en_err(priv, "mtu:%d > max:%d when XDP prog is attached\n",
mtu, MLX4_EN_MAX_XDP_MTU);
return false;
}
return true;
}
static int mlx4_en_change_mtu(struct net_device *dev, int new_mtu)
{
struct mlx4_en_priv *priv = netdev_priv(dev);
......@@ -2258,11 +2274,10 @@ static int mlx4_en_change_mtu(struct net_device *dev, int new_mtu)
en_dbg(DRV, priv, "Change MTU called - current:%d new:%d\n",
dev->mtu, new_mtu);
if (priv->tx_ring_num[TX_XDP] && MLX4_EN_EFF_MTU(new_mtu) > FRAG_SZ0) {
en_err(priv, "MTU size:%d requires frags but XDP running\n",
new_mtu);
return -EOPNOTSUPP;
}
if (priv->tx_ring_num[TX_XDP] &&
!mlx4_en_check_xdp_mtu(dev, new_mtu))
return -ENOTSUPP;
dev->mtu = new_mtu;
if (netif_running(dev)) {
......@@ -2710,10 +2725,8 @@ static int mlx4_xdp_set(struct net_device *dev, struct bpf_prog *prog)
return 0;
}
if (priv->num_frags > 1) {
en_err(priv, "Cannot set XDP if MTU requires multiple frags\n");
if (!mlx4_en_check_xdp_mtu(dev, dev->mtu))
return -EOPNOTSUPP;
}
tmp = kzalloc(sizeof(*tmp), GFP_KERNEL);
if (!tmp)
......
......@@ -96,7 +96,6 @@ static int mlx4_en_alloc_frags(struct mlx4_en_priv *priv,
struct mlx4_en_rx_alloc page_alloc[MLX4_EN_MAX_RX_FRAGS];
const struct mlx4_en_frag_info *frag_info;
struct page *page;
dma_addr_t dma;
int i;
for (i = 0; i < priv->num_frags; i++) {
......@@ -115,9 +114,10 @@ static int mlx4_en_alloc_frags(struct mlx4_en_priv *priv,
for (i = 0; i < priv->num_frags; i++) {
frags[i] = ring_alloc[i];
dma = ring_alloc[i].dma + ring_alloc[i].page_offset;
frags[i].page_offset += priv->frag_info[i].rx_headroom;
rx_desc->data[i].addr = cpu_to_be64(frags[i].dma +
frags[i].page_offset);
ring_alloc[i] = page_alloc[i];
rx_desc->data[i].addr = cpu_to_be64(dma);
}
return 0;
......@@ -250,7 +250,8 @@ static int mlx4_en_prepare_rx_desc(struct mlx4_en_priv *priv,
if (ring->page_cache.index > 0) {
frags[0] = ring->page_cache.buf[--ring->page_cache.index];
rx_desc->data[0].addr = cpu_to_be64(frags[0].dma);
rx_desc->data[0].addr = cpu_to_be64(frags[0].dma +
frags[0].page_offset);
return 0;
}
......@@ -889,6 +890,7 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
if (xdp_prog) {
struct xdp_buff xdp;
dma_addr_t dma;
void *orig_data;
u32 act;
dma = be64_to_cpu(rx_desc->data[0].addr);
......@@ -896,11 +898,19 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
priv->frag_info[0].frag_size,
DMA_FROM_DEVICE);
xdp.data = page_address(frags[0].page) +
frags[0].page_offset;
xdp.data_hard_start = page_address(frags[0].page);
xdp.data = xdp.data_hard_start + frags[0].page_offset;
xdp.data_end = xdp.data + length;
orig_data = xdp.data;
act = bpf_prog_run_xdp(xdp_prog, &xdp);
if (xdp.data != orig_data) {
length = xdp.data_end - xdp.data;
frags[0].page_offset = xdp.data -
xdp.data_hard_start;
}
switch (act) {
case XDP_PASS:
break;
......@@ -1164,38 +1174,42 @@ static const int frag_sizes[] = {
void mlx4_en_calc_rx_buf(struct net_device *dev)
{
enum dma_data_direction dma_dir = PCI_DMA_FROMDEVICE;
struct mlx4_en_priv *priv = netdev_priv(dev);
int eff_mtu = MLX4_EN_EFF_MTU(dev->mtu);
int order = MLX4_EN_ALLOC_PREFER_ORDER;
u32 align = SMP_CACHE_BYTES;
int buf_size = 0;
int i = 0;
/* bpf requires buffers to be set up as 1 packet per page.
* This only works when num_frags == 1.
*/
if (priv->tx_ring_num[TX_XDP]) {
dma_dir = PCI_DMA_BIDIRECTIONAL;
/* This will gain efficient xdp frame recycling at the expense
* of more costly truesize accounting
priv->frag_info[0].order = 0;
priv->frag_info[0].frag_size = eff_mtu;
priv->frag_info[0].frag_prefix_size = 0;
/* This will gain efficient xdp frame recycling at the
* expense of more costly truesize accounting
*/
align = PAGE_SIZE;
order = 0;
}
priv->frag_info[0].frag_stride = PAGE_SIZE;
priv->frag_info[0].dma_dir = PCI_DMA_BIDIRECTIONAL;
priv->frag_info[0].rx_headroom = XDP_PACKET_HEADROOM;
i = 1;
} else {
int buf_size = 0;
while (buf_size < eff_mtu) {
priv->frag_info[i].order = order;
priv->frag_info[i].order = MLX4_EN_ALLOC_PREFER_ORDER;
priv->frag_info[i].frag_size =
(eff_mtu > buf_size + frag_sizes[i]) ?
frag_sizes[i] : eff_mtu - buf_size;
priv->frag_info[i].frag_prefix_size = buf_size;
priv->frag_info[i].frag_stride =
ALIGN(priv->frag_info[i].frag_size, align);
priv->frag_info[i].dma_dir = dma_dir;
ALIGN(priv->frag_info[i].frag_size,
SMP_CACHE_BYTES);
priv->frag_info[i].dma_dir = PCI_DMA_FROMDEVICE;
priv->frag_info[i].rx_headroom = 0;
buf_size += priv->frag_info[i].frag_size;
i++;
}
}
priv->num_frags = i;
priv->rx_skb_size = eff_mtu;
......
......@@ -354,7 +354,7 @@ u32 mlx4_en_recycle_tx_desc(struct mlx4_en_priv *priv,
struct mlx4_en_rx_alloc frame = {
.page = tx_info->page,
.dma = tx_info->map0_dma,
.page_offset = 0,
.page_offset = XDP_PACKET_HEADROOM,
.page_size = PAGE_SIZE,
};
......@@ -1132,7 +1132,7 @@ netdev_tx_t mlx4_en_xmit_frame(struct mlx4_en_rx_ring *rx_ring,
tx_info->page = frame->page;
frame->page = NULL;
tx_info->map0_dma = dma;
tx_info->map0_byte_count = length;
tx_info->map0_byte_count = PAGE_SIZE;
tx_info->nr_txbb = nr_txbb;
tx_info->nr_bytes = max_t(unsigned int, length, ETH_ZLEN);
tx_info->data_offset = (void *)data - (void *)tx_desc;
......@@ -1141,9 +1141,10 @@ netdev_tx_t mlx4_en_xmit_frame(struct mlx4_en_rx_ring *rx_ring,
tx_info->linear = 1;
tx_info->inl = 0;
dma_sync_single_for_device(priv->ddev, dma, length, PCI_DMA_TODEVICE);
dma_sync_single_range_for_device(priv->ddev, dma, frame->page_offset,
length, PCI_DMA_TODEVICE);
data->addr = cpu_to_be64(dma);
data->addr = cpu_to_be64(dma + frame->page_offset);
data->lkey = ring->mr_key;
dma_wmb();
data->byte_count = cpu_to_be32(length);
......
......@@ -475,7 +475,8 @@ struct mlx4_en_frag_info {
u16 frag_prefix_size;
u32 frag_stride;
enum dma_data_direction dma_dir;
int order;
u16 order;
u16 rx_headroom;
};
#ifdef CONFIG_MLX4_EN_DCB
......
......@@ -3183,6 +3183,11 @@ static int mlx5e_xdp_set(struct net_device *netdev, struct bpf_prog *prog)
bool reset, was_opened;
int i;
if (prog && prog->xdp_adjust_head) {
netdev_err(netdev, "Does not support bpf_xdp_adjust_head()\n");
return -EOPNOTSUPP;
}
mutex_lock(&priv->state_lock);
if ((netdev->features & NETIF_F_LRO) && prog) {
......
......@@ -2946,6 +2946,10 @@ static int nfp_net_xdp_setup(struct nfp_net *nn, struct bpf_prog *prog)
};
int err;
if (prog && prog->xdp_adjust_head) {
nn_err(nn, "Does not support bpf_xdp_adjust_head()\n");
return -EOPNOTSUPP;
}
if (!prog && !nn->xdp_prog)
return 0;
if (prog && nn->xdp_prog) {
......
......@@ -2507,6 +2507,11 @@ static int qede_xdp_set(struct qede_dev *edev, struct bpf_prog *prog)
{
struct qede_reload_args args;
if (prog && prog->xdp_adjust_head) {
DP_ERR(edev, "Does not support bpf_xdp_adjust_head()\n");
return -EOPNOTSUPP;
}
/* If we're called, there was already a bpf reference increment */
args.func = &qede_xdp_reload_func;
args.u.new_prog = prog;
......
......@@ -406,7 +406,8 @@ struct bpf_prog {
u16 jited:1, /* Is our filter JIT'ed? */
gpl_compatible:1, /* Is filter GPL compatible? */
cb_access:1, /* Is control block accessed? */
dst_needed:1; /* Do we need dst entry? */
dst_needed:1, /* Do we need dst entry? */
xdp_adjust_head:1; /* Adjusting pkt head? */
kmemcheck_bitfield_end(meta);
enum bpf_prog_type type; /* Type of BPF program */
u32 len; /* Number of filter blocks */
......@@ -440,6 +441,7 @@ struct bpf_skb_data_end {
struct xdp_buff {
void *data;
void *data_end;
void *data_hard_start;
};
/* compute the linear packet data range [data, data_end) which
......@@ -595,7 +597,7 @@ void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp);
u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog);
bool bpf_helper_changes_skb_data(void *func);
bool bpf_helper_changes_pkt_data(void *func);
struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
const struct bpf_insn *patch, u32 len);
......
......@@ -424,6 +424,12 @@ union bpf_attr {
* @len: length of header to be pushed in front
* @flags: Flags (unused for now)
* Return: 0 on success or negative error
*
* int bpf_xdp_adjust_head(xdp_md, delta)
* Adjust the xdp_md.data by delta
* @xdp_md: pointer to xdp_md
* @delta: An positive/negative integer to be added to xdp_md.data
* Return: 0 on success or negative on error
*/
#define __BPF_FUNC_MAPPER(FN) \
FN(unspec), \
......@@ -469,7 +475,8 @@ union bpf_attr {
FN(csum_update), \
FN(set_hash_invalid), \
FN(get_numa_node_id), \
FN(skb_change_head),
FN(skb_change_head), \
FN(xdp_adjust_head),
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
* function eBPF program intends to call
......@@ -576,6 +583,8 @@ struct bpf_sock {
__u32 protocol;
};
#define XDP_PACKET_HEADROOM 256
/* User return codes for XDP prog type.
* A valid XDP program must return one of these defined values. All other
* return codes are reserved for future use. Unknown return codes will result
......
......@@ -1143,7 +1143,7 @@ struct bpf_prog * __weak bpf_int_jit_compile(struct bpf_prog *prog)
return prog;
}
bool __weak bpf_helper_changes_skb_data(void *func)
bool __weak bpf_helper_changes_pkt_data(void *func)
{
return false;
}
......
......@@ -579,6 +579,8 @@ static void fixup_bpf_calls(struct bpf_prog *prog)
prog->dst_needed = 1;
if (insn->imm == BPF_FUNC_get_prandom_u32)
bpf_user_rnd_init_once();
if (insn->imm == BPF_FUNC_xdp_adjust_head)
prog->xdp_adjust_head = 1;
if (insn->imm == BPF_FUNC_tail_call) {
/* mark bpf_tail_call as different opcode
* to avoid conditional branch in
......
......@@ -1216,7 +1216,7 @@ static int check_call(struct bpf_verifier_env *env, int func_id)
return -EINVAL;
}
changes_data = bpf_helper_changes_skb_data(fn->func);
changes_data = bpf_helper_changes_pkt_data(fn->func);
memset(&meta, 0, sizeof(meta));
meta.pkt_access = fn->pkt_access;
......
......@@ -2234,7 +2234,28 @@ static const struct bpf_func_proto bpf_skb_change_head_proto = {
.arg3_type = ARG_ANYTHING,
};
bool bpf_helper_changes_skb_data(void *func)
BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset)
{
void *data = xdp->data + offset;
if (unlikely(data < xdp->data_hard_start ||
data > xdp->data_end - ETH_HLEN))
return -EINVAL;
xdp->data = data;
return 0;
}
static const struct bpf_func_proto bpf_xdp_adjust_head_proto = {
.func = bpf_xdp_adjust_head,
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_ANYTHING,
};
bool bpf_helper_changes_pkt_data(void *func)
{
if (func == bpf_skb_vlan_push ||
func == bpf_skb_vlan_pop ||
......@@ -2244,7 +2265,8 @@ bool bpf_helper_changes_skb_data(void *func)
func == bpf_skb_change_tail ||
func == bpf_skb_pull_data ||
func == bpf_l3_csum_replace ||
func == bpf_l4_csum_replace)
func == bpf_l4_csum_replace ||
func == bpf_xdp_adjust_head)
return true;
return false;
......@@ -2670,6 +2692,8 @@ xdp_func_proto(enum bpf_func_id func_id)
return &bpf_xdp_event_output_proto;
case BPF_FUNC_get_smp_processor_id:
return &bpf_get_smp_processor_id_proto;
case BPF_FUNC_xdp_adjust_head:
return &bpf_xdp_adjust_head_proto;
default:
return sk_filter_func_proto(func_id);
}
......
......@@ -33,6 +33,7 @@ hostprogs-y += trace_event
hostprogs-y += sampleip
hostprogs-y += tc_l2_redirect
hostprogs-y += lwt_len_hist
hostprogs-y += xdp_tx_iptunnel
test_lru_dist-objs := test_lru_dist.o libbpf.o
sock_example-objs := sock_example.o libbpf.o
......@@ -67,6 +68,7 @@ trace_event-objs := bpf_load.o libbpf.o trace_event_user.o
sampleip-objs := bpf_load.o libbpf.o sampleip_user.o
tc_l2_redirect-objs := bpf_load.o libbpf.o tc_l2_redirect_user.o
lwt_len_hist-objs := bpf_load.o libbpf.o lwt_len_hist_user.o
xdp_tx_iptunnel-objs := bpf_load.o libbpf.o xdp_tx_iptunnel_user.o
# Tell kbuild to always build the programs
always := $(hostprogs-y)
......@@ -99,6 +101,7 @@ always += test_current_task_under_cgroup_kern.o
always += trace_event_kern.o
always += sampleip_kern.o
always += lwt_len_hist_kern.o
always += xdp_tx_iptunnel_kern.o
HOSTCFLAGS += -I$(objtree)/usr/include
HOSTCFLAGS += -I$(srctree)/tools/testing/selftests/bpf/
......@@ -129,6 +132,7 @@ HOSTLOADLIBES_trace_event += -lelf
HOSTLOADLIBES_sampleip += -lelf
HOSTLOADLIBES_tc_l2_redirect += -l elf
HOSTLOADLIBES_lwt_len_hist += -l elf
HOSTLOADLIBES_xdp_tx_iptunnel += -lelf
# Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline:
# make samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang
......
......@@ -57,6 +57,8 @@ static int (*bpf_skb_set_tunnel_opt)(void *ctx, void *md, int size) =
(void *) BPF_FUNC_skb_set_tunnel_opt;
static unsigned long long (*bpf_get_prandom_u32)(void) =
(void *) BPF_FUNC_get_prandom_u32;
static int (*bpf_xdp_adjust_head)(void *ctx, int offset) =
(void *) BPF_FUNC_xdp_adjust_head;
/* llvm builtin functions that eBPF C program may use to
* emit BPF_LD_ABS and BPF_LD_IND instructions
......
......@@ -12,6 +12,10 @@
#include <linux/bpf.h>
#include <linux/filter.h>
#include <linux/perf_event.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/syscall.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
......@@ -450,3 +454,93 @@ struct ksym *ksym_search(long key)
/* out of range. return _stext */
return &syms[0];
}
int set_link_xdp_fd(int ifindex, int fd)
{
struct sockaddr_nl sa;
int sock, seq = 0, len, ret = -1;
char buf[4096];
struct nlattr *nla, *nla_xdp;
struct {
struct nlmsghdr nh;
struct ifinfomsg ifinfo;
char attrbuf[64];
} req;
struct nlmsghdr *nh;
struct nlmsgerr *err;
memset(&sa, 0, sizeof(sa));
sa.nl_family = AF_NETLINK;
sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
if (sock < 0) {
printf("open netlink socket: %s\n", strerror(errno));
return -1;
}
if (bind(sock, (struct sockaddr *)&sa, sizeof(sa)) < 0) {
printf("bind to netlink: %s\n", strerror(errno));
goto cleanup;
}
memset(&req, 0, sizeof(req));
req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg));
req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
req.nh.nlmsg_type = RTM_SETLINK;
req.nh.nlmsg_pid = 0;
req.nh.nlmsg_seq = ++seq;
req.ifinfo.ifi_family = AF_UNSPEC;
req.ifinfo.ifi_index = ifindex;
nla = (struct nlattr *)(((char *)&req)
+ NLMSG_ALIGN(req.nh.nlmsg_len));
nla->nla_type = NLA_F_NESTED | 43/*IFLA_XDP*/;
nla_xdp = (struct nlattr *)((char *)nla + NLA_HDRLEN);
nla_xdp->nla_type = 1/*IFLA_XDP_FD*/;
nla_xdp->nla_len = NLA_HDRLEN + sizeof(int);
memcpy((char *)nla_xdp + NLA_HDRLEN, &fd, sizeof(fd));
nla->nla_len = NLA_HDRLEN + nla_xdp->nla_len;
req.nh.nlmsg_len += NLA_ALIGN(nla->nla_len);
if (send(sock, &req, req.nh.nlmsg_len, 0) < 0) {
printf("send to netlink: %s\n", strerror(errno));
goto cleanup;
}
len = recv(sock, buf, sizeof(buf), 0);
if (len < 0) {
printf("recv from netlink: %s\n", strerror(errno));
goto cleanup;
}
for (nh = (struct nlmsghdr *)buf; NLMSG_OK(nh, len);
nh = NLMSG_NEXT(nh, len)) {
if (nh->nlmsg_pid != getpid()) {
printf("Wrong pid %d, expected %d\n",
nh->nlmsg_pid, getpid());
goto cleanup;
}
if (nh->nlmsg_seq != seq) {
printf("Wrong seq %d, expected %d\n",
nh->nlmsg_seq, seq);
goto cleanup;
}
switch (nh->nlmsg_type) {
case NLMSG_ERROR:
err = (struct nlmsgerr *)NLMSG_DATA(nh);
if (!err->error)
continue;
printf("nlmsg error %s\n", strerror(-err->error));
goto cleanup;
case NLMSG_DONE:
break;
}
}
ret = 0;
cleanup:
close(sock);
return ret;
}
......@@ -31,4 +31,5 @@ struct ksym {
int load_kallsyms(void);
struct ksym *ksym_search(long key);
int set_link_xdp_fd(int ifindex, int fd);
#endif
......@@ -5,111 +5,18 @@
* License as published by the Free Software Foundation.
*/
#include <linux/bpf.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <assert.h>
#include <errno.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>
#include "bpf_load.h"
#include "bpf_util.h"
#include "libbpf.h"
static int set_link_xdp_fd(int ifindex, int fd)
{
struct sockaddr_nl sa;
int sock, seq = 0, len, ret = -1;
char buf[4096];
struct nlattr *nla, *nla_xdp;
struct {
struct nlmsghdr nh;
struct ifinfomsg ifinfo;
char attrbuf[64];
} req;
struct nlmsghdr *nh;
struct nlmsgerr *err;
memset(&sa, 0, sizeof(sa));
sa.nl_family = AF_NETLINK;
sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
if (sock < 0) {
printf("open netlink socket: %s\n", strerror(errno));
return -1;
}
if (bind(sock, (struct sockaddr *)&sa, sizeof(sa)) < 0) {
printf("bind to netlink: %s\n", strerror(errno));
goto cleanup;
}
memset(&req, 0, sizeof(req));
req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg));
req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
req.nh.nlmsg_type = RTM_SETLINK;
req.nh.nlmsg_pid = 0;
req.nh.nlmsg_seq = ++seq;
req.ifinfo.ifi_family = AF_UNSPEC;
req.ifinfo.ifi_index = ifindex;
nla = (struct nlattr *)(((char *)&req)
+ NLMSG_ALIGN(req.nh.nlmsg_len));
nla->nla_type = NLA_F_NESTED | 43/*IFLA_XDP*/;
nla_xdp = (struct nlattr *)((char *)nla + NLA_HDRLEN);
nla_xdp->nla_type = 1/*IFLA_XDP_FD*/;
nla_xdp->nla_len = NLA_HDRLEN + sizeof(int);
memcpy((char *)nla_xdp + NLA_HDRLEN, &fd, sizeof(fd));
nla->nla_len = NLA_HDRLEN + nla_xdp->nla_len;
req.nh.nlmsg_len += NLA_ALIGN(nla->nla_len);
if (send(sock, &req, req.nh.nlmsg_len, 0) < 0) {
printf("send to netlink: %s\n", strerror(errno));
goto cleanup;
}
len = recv(sock, buf, sizeof(buf), 0);
if (len < 0) {
printf("recv from netlink: %s\n", strerror(errno));
goto cleanup;
}
for (nh = (struct nlmsghdr *)buf; NLMSG_OK(nh, len);
nh = NLMSG_NEXT(nh, len)) {
if (nh->nlmsg_pid != getpid()) {
printf("Wrong pid %d, expected %d\n",
nh->nlmsg_pid, getpid());
goto cleanup;
}
if (nh->nlmsg_seq != seq) {
printf("Wrong seq %d, expected %d\n",
nh->nlmsg_seq, seq);
goto cleanup;
}
switch (nh->nlmsg_type) {
case NLMSG_ERROR:
err = (struct nlmsgerr *)NLMSG_DATA(nh);
if (!err->error)
continue;
printf("nlmsg error %s\n", strerror(-err->error));
goto cleanup;
case NLMSG_DONE:
break;
}
}
ret = 0;
cleanup:
close(sock);
return ret;
}
static int ifindex;
static void int_exit(int sig)
......
/* Copyright (c) 2016 Facebook
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
* License as published by the Free Software Foundation.
*/
#ifndef _SAMPLES_BPF_XDP_TX_IPTNL_COMMON_H
#define _SAMPLES_BPF_XDP_TX_IPTNL_COMMON_H
#include <linux/types.h>
#define MAX_IPTNL_ENTRIES 256U
struct vip {
union {
__u32 v6[4];
__u32 v4;
} daddr;
__u16 dport;
__u16 family;
__u8 protocol;
};
struct iptnl_info {
union {
__u32 v6[4];
__u32 v4;
} saddr;
union {
__u32 v6[4];
__u32 v4;
} daddr;
__u16 family;
__u8 dmac[6];
};
#endif
/* Copyright (c) 2016 Facebook
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
* License as published by the Free Software Foundation.
*
* This program shows how to use bpf_xdp_adjust_head() by
* encapsulating the incoming packet in an IPv4/v6 header
* and then XDP_TX it out.
*/
#include <uapi/linux/bpf.h>
#include <linux/in.h>
#include <linux/if_ether.h>
#include <linux/if_packet.h>
#include <linux/if_vlan.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include "bpf_helpers.h"
#include "xdp_tx_iptunnel_common.h"
struct bpf_map_def SEC("maps") rxcnt = {
.type = BPF_MAP_TYPE_PERCPU_ARRAY,
.key_size = sizeof(__u32),
.value_size = sizeof(__u64),
.max_entries = 256,
};
struct bpf_map_def SEC("maps") vip2tnl = {
.type = BPF_MAP_TYPE_HASH,
.key_size = sizeof(struct vip),
.value_size = sizeof(struct iptnl_info),
.max_entries = MAX_IPTNL_ENTRIES,
};
static __always_inline void count_tx(u32 protocol)
{
u64 *rxcnt_count;
rxcnt_count = bpf_map_lookup_elem(&rxcnt, &protocol);
if (rxcnt_count)
*rxcnt_count += 1;
}
static __always_inline int get_dport(void *trans_data, void *data_end,
u8 protocol)
{
struct tcphdr *th;
struct udphdr *uh;
switch (protocol) {
case IPPROTO_TCP:
th = (struct tcphdr *)trans_data;
if (th + 1 > data_end)
return -1;
return th->dest;
case IPPROTO_UDP:
uh = (struct udphdr *)trans_data;
if (uh + 1 > data_end)
return -1;
return uh->dest;
default:
return 0;
}
}
static __always_inline void set_ethhdr(struct ethhdr *new_eth,
const struct ethhdr *old_eth,
const struct iptnl_info *tnl,
__be16 h_proto)
{
memcpy(new_eth->h_source, old_eth->h_dest, sizeof(new_eth->h_source));
memcpy(new_eth->h_dest, tnl->dmac, sizeof(new_eth->h_dest));
new_eth->h_proto = h_proto;
}
static __always_inline int handle_ipv4(struct xdp_md *xdp)
{
void *data_end = (void *)(long)xdp->data_end;
void *data = (void *)(long)xdp->data;
struct iptnl_info *tnl;
struct ethhdr *new_eth;
struct ethhdr *old_eth;
struct iphdr *iph = data + sizeof(struct ethhdr);
u16 *next_iph_u16;
u16 payload_len;
struct vip vip = {};
int dport;
u32 csum = 0;
int i;
if (iph + 1 > data_end)
return XDP_DROP;
dport = get_dport(iph + 1, data_end, iph->protocol);
if (dport == -1)
return XDP_DROP;
vip.protocol = iph->protocol;
vip.family = AF_INET;
vip.daddr.v4 = iph->daddr;
vip.dport = dport;
payload_len = ntohs(iph->tot_len);
tnl = bpf_map_lookup_elem(&vip2tnl, &vip);
/* It only does v4-in-v4 */
if (!tnl || tnl->family != AF_INET)
return XDP_PASS;
/* The vip key is found. Add an IP header and send it out */
if (bpf_xdp_adjust_head(xdp, 0 - (int)sizeof(struct iphdr)))
return XDP_DROP;
data = (void *)(long)xdp->data;
data_end = (void *)(long)xdp->data_end;
new_eth = data;
iph = data + sizeof(*new_eth);
old_eth = data + sizeof(*iph);
if (new_eth + 1 > data_end ||
old_eth + 1 > data_end ||
iph + 1 > data_end)
return XDP_DROP;
set_ethhdr(new_eth, old_eth, tnl, htons(ETH_P_IP));
iph->version = 4;
iph->ihl = sizeof(*iph) >> 2;
iph->frag_off = 0;
iph->protocol = IPPROTO_IPIP;
iph->check = 0;
iph->tos = 0;
iph->tot_len = htons(payload_len + sizeof(*iph));
iph->daddr = tnl->daddr.v4;
iph->saddr = tnl->saddr.v4;
iph->ttl = 8;
next_iph_u16 = (u16 *)iph;
#pragma clang loop unroll(full)
for (i = 0; i < sizeof(*iph) >> 1; i++)
csum += *next_iph_u16++;
iph->check = ~((csum & 0xffff) + (csum >> 16));
count_tx(vip.protocol);
return XDP_TX;
}
static __always_inline int handle_ipv6(struct xdp_md *xdp)
{
void *data_end = (void *)(long)xdp->data_end;
void *data = (void *)(long)xdp->data;
struct iptnl_info *tnl;
struct ethhdr *new_eth;
struct ethhdr *old_eth;
struct ipv6hdr *ip6h = data + sizeof(struct ethhdr);
__u16 payload_len;
struct vip vip = {};
int dport;
if (ip6h + 1 > data_end)
return XDP_DROP;
dport = get_dport(ip6h + 1, data_end, ip6h->nexthdr);
if (dport == -1)
return XDP_DROP;
vip.protocol = ip6h->nexthdr;
vip.family = AF_INET6;
memcpy(vip.daddr.v6, ip6h->daddr.s6_addr32, sizeof(vip.daddr));
vip.dport = dport;
payload_len = ip6h->payload_len;
tnl = bpf_map_lookup_elem(&vip2tnl, &vip);
/* It only does v6-in-v6 */
if (!tnl || tnl->family != AF_INET6)
return XDP_PASS;
/* The vip key is found. Add an IP header and send it out */
if (bpf_xdp_adjust_head(xdp, 0 - (int)sizeof(struct ipv6hdr)))
return XDP_DROP;
data = (void *)(long)xdp->data;
data_end = (void *)(long)xdp->data_end;
new_eth = data;
ip6h = data + sizeof(*new_eth);
old_eth = data + sizeof(*ip6h);
if (new_eth + 1 > data_end ||
old_eth + 1 > data_end ||
ip6h + 1 > data_end)
return XDP_DROP;
set_ethhdr(new_eth, old_eth, tnl, htons(ETH_P_IPV6));
ip6h->version = 6;
ip6h->priority = 0;
memset(ip6h->flow_lbl, 0, sizeof(ip6h->flow_lbl));
ip6h->payload_len = htons(ntohs(payload_len) + sizeof(*ip6h));
ip6h->nexthdr = IPPROTO_IPV6;
ip6h->hop_limit = 8;
memcpy(ip6h->saddr.s6_addr32, tnl->saddr.v6, sizeof(tnl->saddr.v6));
memcpy(ip6h->daddr.s6_addr32, tnl->daddr.v6, sizeof(tnl->daddr.v6));
count_tx(vip.protocol);
return XDP_TX;
}
SEC("xdp_tx_iptunnel")
int _xdp_tx_iptunnel(struct xdp_md *xdp)
{
void *data_end = (void *)(long)xdp->data_end;
void *data = (void *)(long)xdp->data;
struct ethhdr *eth = data;
__u16 h_proto;
if (eth + 1 > data_end)
return XDP_DROP;
h_proto = eth->h_proto;
if (h_proto == htons(ETH_P_IP))
return handle_ipv4(xdp);
else if (h_proto == htons(ETH_P_IPV6))
return handle_ipv6(xdp);
else
return XDP_PASS;
}
char _license[] SEC("license") = "GPL";
/* Copyright (c) 2016 Facebook
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
* License as published by the Free Software Foundation.
*/
#include <linux/bpf.h>
#include <assert.h>
#include <errno.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/resource.h>
#include <arpa/inet.h>
#include <netinet/ether.h>
#include <unistd.h>
#include <time.h>
#include "bpf_load.h"
#include "libbpf.h"
#include "bpf_util.h"
#include "xdp_tx_iptunnel_common.h"
#define STATS_INTERVAL_S 2U
static int ifindex = -1;
static void int_exit(int sig)
{
if (ifindex > -1)
set_link_xdp_fd(ifindex, -1);
exit(0);
}
/* simple per-protocol drop counter
*/
static void poll_stats(unsigned int kill_after_s)
{
const unsigned int nr_protos = 256;
unsigned int nr_cpus = bpf_num_possible_cpus();
time_t started_at = time(NULL);
__u64 values[nr_cpus], prev[nr_protos][nr_cpus];
__u32 proto;
int i;
memset(prev, 0, sizeof(prev));
while (!kill_after_s || time(NULL) - started_at <= kill_after_s) {
sleep(STATS_INTERVAL_S);
for (proto = 0; proto < nr_protos; proto++) {
__u64 sum = 0;
assert(bpf_lookup_elem(map_fd[0], &proto, values) == 0);
for (i = 0; i < nr_cpus; i++)
sum += (values[i] - prev[proto][i]);
if (sum)
printf("proto %u: sum:%10llu pkts, rate:%10llu pkts/s\n",
proto, sum, sum / STATS_INTERVAL_S);
memcpy(prev[proto], values, sizeof(values));
}
}
}
static void usage(const char *cmd)
{
printf("Start a XDP prog which encapsulates incoming packets\n"
"in an IPv4/v6 header and XDP_TX it out. The dst <VIP:PORT>\n"
"is used to select packets to encapsulate\n\n");
printf("Usage: %s [...]\n", cmd);
printf(" -i <ifindex> Interface Index\n");
printf(" -a <vip-service-address> IPv4 or IPv6\n");
printf(" -p <vip-service-port> A port range (e.g. 433-444) is also allowed\n");
printf(" -s <source-ip> Used in the IPTunnel header\n");
printf(" -d <dest-ip> Used in the IPTunnel header\n");
printf(" -m <dest-MAC> Used in sending the IP Tunneled pkt\n");
printf(" -T <stop-after-X-seconds> Default: 0 (forever)\n");
printf(" -P <IP-Protocol> Default is TCP\n");
printf(" -h Display this help\n");
}
static int parse_ipstr(const char *ipstr, unsigned int *addr)
{
if (inet_pton(AF_INET6, ipstr, addr) == 1) {
return AF_INET6;
} else if (inet_pton(AF_INET, ipstr, addr) == 1) {
addr[1] = addr[2] = addr[3] = 0;
return AF_INET;
}
fprintf(stderr, "%s is an invalid IP\n", ipstr);
return AF_UNSPEC;
}
static int parse_ports(const char *port_str, int *min_port, int *max_port)
{
char *end;
long tmp_min_port;
long tmp_max_port;
tmp_min_port = strtol(optarg, &end, 10);
if (tmp_min_port < 1 || tmp_min_port > 65535) {
fprintf(stderr, "Invalid port(s):%s\n", optarg);
return 1;
}
if (*end == '-') {
end++;
tmp_max_port = strtol(end, NULL, 10);
if (tmp_max_port < 1 || tmp_max_port > 65535) {
fprintf(stderr, "Invalid port(s):%s\n", optarg);
return 1;
}
} else {
tmp_max_port = tmp_min_port;
}
if (tmp_min_port > tmp_max_port) {
fprintf(stderr, "Invalid port(s):%s\n", optarg);
return 1;
}
if (tmp_max_port - tmp_min_port + 1 > MAX_IPTNL_ENTRIES) {
fprintf(stderr, "Port range (%s) is larger than %u\n",
port_str, MAX_IPTNL_ENTRIES);
return 1;
}
*min_port = tmp_min_port;
*max_port = tmp_max_port;
return 0;
}
int main(int argc, char **argv)
{
unsigned char opt_flags[256] = {};
unsigned int kill_after_s = 0;
const char *optstr = "i:a:p:s:d:m:T:P:h";
int min_port = 0, max_port = 0;
struct iptnl_info tnl = {};
struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
struct vip vip = {};
char filename[256];
int opt;
int i;
tnl.family = AF_UNSPEC;
vip.protocol = IPPROTO_TCP;
for (i = 0; i < strlen(optstr); i++)
if (optstr[i] != 'h' && 'a' <= optstr[i] && optstr[i] <= 'z')
opt_flags[(unsigned char)optstr[i]] = 1;
while ((opt = getopt(argc, argv, optstr)) != -1) {
unsigned short family;
unsigned int *v6;
switch (opt) {
case 'i':
ifindex = atoi(optarg);
break;
case 'a':
vip.family = parse_ipstr(optarg, vip.daddr.v6);
if (vip.family == AF_UNSPEC)
return 1;
break;
case 'p':
if (parse_ports(optarg, &min_port, &max_port))
return 1;
break;
case 'P':
vip.protocol = atoi(optarg);
break;
case 's':
case 'd':
if (opt == 's')
v6 = tnl.saddr.v6;
else
v6 = tnl.daddr.v6;
family = parse_ipstr(optarg, v6);
if (family == AF_UNSPEC)
return 1;
if (tnl.family == AF_UNSPEC) {
tnl.family = family;
} else if (tnl.family != family) {
fprintf(stderr,
"The IP version of the src and dst addresses used in the IP encapsulation does not match\n");
return 1;
}
break;
case 'm':
if (!ether_aton_r(optarg,
(struct ether_addr *)tnl.dmac)) {
fprintf(stderr, "Invalid mac address:%s\n",
optarg);
return 1;
}
break;
case 'T':
kill_after_s = atoi(optarg);
break;
default:
usage(argv[0]);
return 1;
}
opt_flags[opt] = 0;
}
for (i = 0; i < strlen(optstr); i++) {
if (opt_flags[(unsigned int)optstr[i]]) {
fprintf(stderr, "Missing argument -%c\n", optstr[i]);
usage(argv[0]);
return 1;
}
}
if (setrlimit(RLIMIT_MEMLOCK, &r)) {
perror("setrlimit(RLIMIT_MEMLOCK, RLIM_INFINITY)");
return 1;
}
snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
if (load_bpf_file(filename)) {
printf("%s", bpf_log_buf);
return 1;
}
if (!prog_fd[0]) {
printf("load_bpf_file: %s\n", strerror(errno));
return 1;
}
signal(SIGINT, int_exit);
while (min_port <= max_port) {
vip.dport = htons(min_port++);
if (bpf_update_elem(map_fd[1], &vip, &tnl, BPF_NOEXIST)) {
perror("bpf_update_elem(&vip2tnl)");
return 1;
}
}
if (set_link_xdp_fd(ifindex, prog_fd[0]) < 0) {
printf("link set xdp fd failed\n");
return 1;
}
poll_stats(kill_after_s);
set_link_xdp_fd(ifindex, -1);
return 0;
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment