Commit b2d66643 authored by Jakub Kicinski

Merge tag 'for-netdev' of https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf

Daniel Borkmann says:

====================
pull-request: bpf 2023-11-21

We've added 19 non-merge commits during the last 4 day(s) which contain
a total of 18 files changed, 1043 insertions(+), 416 deletions(-).

The main changes are:

1) Fix BPF verifier to validate callbacks as if they are called an unknown
   number of times in order to fix not detecting some unsafe programs,
   from Eduard Zingerman.

2) Fix bpf_redirect_peer() handling which missed proper stats accounting
   for veth and netkit and also generally fix missing stats for the latter,
   from Peilin Ye, Daniel Borkmann et al.

* tag 'for-netdev' of https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf:
  selftests/bpf: check if max number of bpf_loop iterations is tracked
  bpf: keep track of max number of bpf_loop callback iterations
  selftests/bpf: test widening for iterating callbacks
  bpf: widening for callback iterators
  selftests/bpf: tests for iterating callbacks
  bpf: verify callbacks as if they are called unknown number of times
  bpf: extract setup_func_entry() utility function
  bpf: extract __check_reg_arg() utility function
  selftests/bpf: fix bpf_loop_bench for new callback verification scheme
  selftests/bpf: track string payload offset as scalar in strobemeta
  selftests/bpf: track tcp payload offset as scalar in xdp_synproxy
  selftests/bpf: Add netkit to tc_redirect selftest
  selftests/bpf: De-veth-ize the tc_redirect test case
  bpf, netkit: Add indirect call wrapper for fetching peer dev
  bpf: Fix dev's rx stats for bpf_redirect_peer traffic
  veth: Use tstats per-CPU traffic counters
  netkit: Add tstats per-CPU traffic counters
  net: Move {l,t,d}stats allocation to core and convert veth & vrf
  net, vrf: Move dstats structure to core
====================

Link: https://lore.kernel.org/r/20231121193113.11796-1-daniel@iogearbox.net
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parents 495ec91b acb12c85
...@@ -7,6 +7,7 @@ ...@@ -7,6 +7,7 @@
#include <linux/filter.h> #include <linux/filter.h>
#include <linux/netfilter_netdev.h> #include <linux/netfilter_netdev.h>
#include <linux/bpf_mprog.h> #include <linux/bpf_mprog.h>
#include <linux/indirect_call_wrapper.h>
#include <net/netkit.h> #include <net/netkit.h>
#include <net/dst.h> #include <net/dst.h>
...@@ -68,6 +69,7 @@ static netdev_tx_t netkit_xmit(struct sk_buff *skb, struct net_device *dev) ...@@ -68,6 +69,7 @@ static netdev_tx_t netkit_xmit(struct sk_buff *skb, struct net_device *dev)
netdev_tx_t ret_dev = NET_XMIT_SUCCESS; netdev_tx_t ret_dev = NET_XMIT_SUCCESS;
const struct bpf_mprog_entry *entry; const struct bpf_mprog_entry *entry;
struct net_device *peer; struct net_device *peer;
int len = skb->len;
rcu_read_lock(); rcu_read_lock();
peer = rcu_dereference(nk->peer); peer = rcu_dereference(nk->peer);
...@@ -85,15 +87,22 @@ static netdev_tx_t netkit_xmit(struct sk_buff *skb, struct net_device *dev) ...@@ -85,15 +87,22 @@ static netdev_tx_t netkit_xmit(struct sk_buff *skb, struct net_device *dev)
case NETKIT_PASS: case NETKIT_PASS:
skb->protocol = eth_type_trans(skb, skb->dev); skb->protocol = eth_type_trans(skb, skb->dev);
skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
__netif_rx(skb); if (likely(__netif_rx(skb) == NET_RX_SUCCESS)) {
dev_sw_netstats_tx_add(dev, 1, len);
dev_sw_netstats_rx_add(peer, len);
} else {
goto drop_stats;
}
break; break;
case NETKIT_REDIRECT: case NETKIT_REDIRECT:
dev_sw_netstats_tx_add(dev, 1, len);
skb_do_redirect(skb); skb_do_redirect(skb);
break; break;
case NETKIT_DROP: case NETKIT_DROP:
default: default:
drop: drop:
kfree_skb(skb); kfree_skb(skb);
drop_stats:
dev_core_stats_tx_dropped_inc(dev); dev_core_stats_tx_dropped_inc(dev);
ret_dev = NET_XMIT_DROP; ret_dev = NET_XMIT_DROP;
break; break;
...@@ -169,11 +178,18 @@ static void netkit_set_headroom(struct net_device *dev, int headroom) ...@@ -169,11 +178,18 @@ static void netkit_set_headroom(struct net_device *dev, int headroom)
rcu_read_unlock(); rcu_read_unlock();
} }
static struct net_device *netkit_peer_dev(struct net_device *dev) INDIRECT_CALLABLE_SCOPE struct net_device *netkit_peer_dev(struct net_device *dev)
{ {
return rcu_dereference(netkit_priv(dev)->peer); return rcu_dereference(netkit_priv(dev)->peer);
} }
/*
 * netkit_get_stats - .ndo_get_stats64 handler for netkit devices.
 * @dev:   device whose counters are being queried
 * @stats: output structure filled in for the caller
 *
 * The packet/byte counters come from the device's tstats area; the TX
 * drop count is not part of tstats (RX/TX packets and bytes only), so it
 * is read separately from the device-level tx_dropped counter.
 */
static void netkit_get_stats(struct net_device *dev,
			     struct rtnl_link_stats64 *stats)
{
	/* Populate @stats from the tstats counters. */
	dev_fetch_sw_netstats(stats, dev->tstats);

	/* Assigned after the fetch, exactly as in the original ordering. */
	stats->tx_dropped = DEV_STATS_READ(dev, tx_dropped);
}
static void netkit_uninit(struct net_device *dev); static void netkit_uninit(struct net_device *dev);
static const struct net_device_ops netkit_netdev_ops = { static const struct net_device_ops netkit_netdev_ops = {
...@@ -184,6 +200,7 @@ static const struct net_device_ops netkit_netdev_ops = { ...@@ -184,6 +200,7 @@ static const struct net_device_ops netkit_netdev_ops = {
.ndo_set_rx_headroom = netkit_set_headroom, .ndo_set_rx_headroom = netkit_set_headroom,
.ndo_get_iflink = netkit_get_iflink, .ndo_get_iflink = netkit_get_iflink,
.ndo_get_peer_dev = netkit_peer_dev, .ndo_get_peer_dev = netkit_peer_dev,
.ndo_get_stats64 = netkit_get_stats,
.ndo_uninit = netkit_uninit, .ndo_uninit = netkit_uninit,
.ndo_features_check = passthru_features_check, .ndo_features_check = passthru_features_check,
}; };
...@@ -218,6 +235,7 @@ static void netkit_setup(struct net_device *dev) ...@@ -218,6 +235,7 @@ static void netkit_setup(struct net_device *dev)
ether_setup(dev); ether_setup(dev);
dev->max_mtu = ETH_MAX_MTU; dev->max_mtu = ETH_MAX_MTU;
dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS;
dev->flags |= IFF_NOARP; dev->flags |= IFF_NOARP;
dev->priv_flags &= ~IFF_TX_SKB_SHARING; dev->priv_flags &= ~IFF_TX_SKB_SHARING;
......
...@@ -373,7 +373,7 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev) ...@@ -373,7 +373,7 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
skb_tx_timestamp(skb); skb_tx_timestamp(skb);
if (likely(veth_forward_skb(rcv, skb, rq, use_napi) == NET_RX_SUCCESS)) { if (likely(veth_forward_skb(rcv, skb, rq, use_napi) == NET_RX_SUCCESS)) {
if (!use_napi) if (!use_napi)
dev_lstats_add(dev, length); dev_sw_netstats_tx_add(dev, 1, length);
else else
__veth_xdp_flush(rq); __veth_xdp_flush(rq);
} else { } else {
...@@ -387,14 +387,6 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev) ...@@ -387,14 +387,6 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
return ret; return ret;
} }
static u64 veth_stats_tx(struct net_device *dev, u64 *packets, u64 *bytes)
{
struct veth_priv *priv = netdev_priv(dev);
dev_lstats_read(dev, packets, bytes);
return atomic64_read(&priv->dropped);
}
static void veth_stats_rx(struct veth_stats *result, struct net_device *dev) static void veth_stats_rx(struct veth_stats *result, struct net_device *dev)
{ {
struct veth_priv *priv = netdev_priv(dev); struct veth_priv *priv = netdev_priv(dev);
...@@ -432,24 +424,24 @@ static void veth_get_stats64(struct net_device *dev, ...@@ -432,24 +424,24 @@ static void veth_get_stats64(struct net_device *dev,
struct veth_priv *priv = netdev_priv(dev); struct veth_priv *priv = netdev_priv(dev);
struct net_device *peer; struct net_device *peer;
struct veth_stats rx; struct veth_stats rx;
u64 packets, bytes;
tot->tx_dropped = veth_stats_tx(dev, &packets, &bytes); tot->tx_dropped = atomic64_read(&priv->dropped);
tot->tx_bytes = bytes; dev_fetch_sw_netstats(tot, dev->tstats);
tot->tx_packets = packets;
veth_stats_rx(&rx, dev); veth_stats_rx(&rx, dev);
tot->tx_dropped += rx.xdp_tx_err; tot->tx_dropped += rx.xdp_tx_err;
tot->rx_dropped = rx.rx_drops + rx.peer_tq_xdp_xmit_err; tot->rx_dropped = rx.rx_drops + rx.peer_tq_xdp_xmit_err;
tot->rx_bytes = rx.xdp_bytes; tot->rx_bytes += rx.xdp_bytes;
tot->rx_packets = rx.xdp_packets; tot->rx_packets += rx.xdp_packets;
rcu_read_lock(); rcu_read_lock();
peer = rcu_dereference(priv->peer); peer = rcu_dereference(priv->peer);
if (peer) { if (peer) {
veth_stats_tx(peer, &packets, &bytes); struct rtnl_link_stats64 tot_peer = {};
tot->rx_bytes += bytes;
tot->rx_packets += packets; dev_fetch_sw_netstats(&tot_peer, peer->tstats);
tot->rx_bytes += tot_peer.tx_bytes;
tot->rx_packets += tot_peer.tx_packets;
veth_stats_rx(&rx, peer); veth_stats_rx(&rx, peer);
tot->tx_dropped += rx.peer_tq_xdp_xmit_err; tot->tx_dropped += rx.peer_tq_xdp_xmit_err;
...@@ -1506,25 +1498,12 @@ static void veth_free_queues(struct net_device *dev) ...@@ -1506,25 +1498,12 @@ static void veth_free_queues(struct net_device *dev)
static int veth_dev_init(struct net_device *dev) static int veth_dev_init(struct net_device *dev)
{ {
int err; return veth_alloc_queues(dev);
dev->lstats = netdev_alloc_pcpu_stats(struct pcpu_lstats);
if (!dev->lstats)
return -ENOMEM;
err = veth_alloc_queues(dev);
if (err) {
free_percpu(dev->lstats);
return err;
}
return 0;
} }
static void veth_dev_free(struct net_device *dev) static void veth_dev_free(struct net_device *dev)
{ {
veth_free_queues(dev); veth_free_queues(dev);
free_percpu(dev->lstats);
} }
#ifdef CONFIG_NET_POLL_CONTROLLER #ifdef CONFIG_NET_POLL_CONTROLLER
...@@ -1796,6 +1775,7 @@ static void veth_setup(struct net_device *dev) ...@@ -1796,6 +1775,7 @@ static void veth_setup(struct net_device *dev)
NETIF_F_HW_VLAN_STAG_RX); NETIF_F_HW_VLAN_STAG_RX);
dev->needs_free_netdev = true; dev->needs_free_netdev = true;
dev->priv_destructor = veth_dev_free; dev->priv_destructor = veth_dev_free;
dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS;
dev->max_mtu = ETH_MAX_MTU; dev->max_mtu = ETH_MAX_MTU;
dev->hw_features = VETH_FEATURES; dev->hw_features = VETH_FEATURES;
......
...@@ -121,22 +121,12 @@ struct net_vrf { ...@@ -121,22 +121,12 @@ struct net_vrf {
int ifindex; int ifindex;
}; };
struct pcpu_dstats {
u64 tx_pkts;
u64 tx_bytes;
u64 tx_drps;
u64 rx_pkts;
u64 rx_bytes;
u64 rx_drps;
struct u64_stats_sync syncp;
};
static void vrf_rx_stats(struct net_device *dev, int len) static void vrf_rx_stats(struct net_device *dev, int len)
{ {
struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats); struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats);
u64_stats_update_begin(&dstats->syncp); u64_stats_update_begin(&dstats->syncp);
dstats->rx_pkts++; dstats->rx_packets++;
dstats->rx_bytes += len; dstats->rx_bytes += len;
u64_stats_update_end(&dstats->syncp); u64_stats_update_end(&dstats->syncp);
} }
...@@ -161,10 +151,10 @@ static void vrf_get_stats64(struct net_device *dev, ...@@ -161,10 +151,10 @@ static void vrf_get_stats64(struct net_device *dev,
do { do {
start = u64_stats_fetch_begin(&dstats->syncp); start = u64_stats_fetch_begin(&dstats->syncp);
tbytes = dstats->tx_bytes; tbytes = dstats->tx_bytes;
tpkts = dstats->tx_pkts; tpkts = dstats->tx_packets;
tdrops = dstats->tx_drps; tdrops = dstats->tx_drops;
rbytes = dstats->rx_bytes; rbytes = dstats->rx_bytes;
rpkts = dstats->rx_pkts; rpkts = dstats->rx_packets;
} while (u64_stats_fetch_retry(&dstats->syncp, start)); } while (u64_stats_fetch_retry(&dstats->syncp, start));
stats->tx_bytes += tbytes; stats->tx_bytes += tbytes;
stats->tx_packets += tpkts; stats->tx_packets += tpkts;
...@@ -421,7 +411,7 @@ static int vrf_local_xmit(struct sk_buff *skb, struct net_device *dev, ...@@ -421,7 +411,7 @@ static int vrf_local_xmit(struct sk_buff *skb, struct net_device *dev,
if (likely(__netif_rx(skb) == NET_RX_SUCCESS)) if (likely(__netif_rx(skb) == NET_RX_SUCCESS))
vrf_rx_stats(dev, len); vrf_rx_stats(dev, len);
else else
this_cpu_inc(dev->dstats->rx_drps); this_cpu_inc(dev->dstats->rx_drops);
return NETDEV_TX_OK; return NETDEV_TX_OK;
} }
...@@ -616,11 +606,11 @@ static netdev_tx_t vrf_xmit(struct sk_buff *skb, struct net_device *dev) ...@@ -616,11 +606,11 @@ static netdev_tx_t vrf_xmit(struct sk_buff *skb, struct net_device *dev)
struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats); struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats);
u64_stats_update_begin(&dstats->syncp); u64_stats_update_begin(&dstats->syncp);
dstats->tx_pkts++; dstats->tx_packets++;
dstats->tx_bytes += len; dstats->tx_bytes += len;
u64_stats_update_end(&dstats->syncp); u64_stats_update_end(&dstats->syncp);
} else { } else {
this_cpu_inc(dev->dstats->tx_drps); this_cpu_inc(dev->dstats->tx_drops);
} }
return ret; return ret;
...@@ -1174,22 +1164,15 @@ static void vrf_dev_uninit(struct net_device *dev) ...@@ -1174,22 +1164,15 @@ static void vrf_dev_uninit(struct net_device *dev)
vrf_rtable_release(dev, vrf); vrf_rtable_release(dev, vrf);
vrf_rt6_release(dev, vrf); vrf_rt6_release(dev, vrf);
free_percpu(dev->dstats);
dev->dstats = NULL;
} }
static int vrf_dev_init(struct net_device *dev) static int vrf_dev_init(struct net_device *dev)
{ {
struct net_vrf *vrf = netdev_priv(dev); struct net_vrf *vrf = netdev_priv(dev);
dev->dstats = netdev_alloc_pcpu_stats(struct pcpu_dstats);
if (!dev->dstats)
goto out_nomem;
/* create the default dst which points back to us */ /* create the default dst which points back to us */
if (vrf_rtable_create(dev) != 0) if (vrf_rtable_create(dev) != 0)
goto out_stats; goto out_nomem;
if (vrf_rt6_create(dev) != 0) if (vrf_rt6_create(dev) != 0)
goto out_rth; goto out_rth;
...@@ -1203,9 +1186,6 @@ static int vrf_dev_init(struct net_device *dev) ...@@ -1203,9 +1186,6 @@ static int vrf_dev_init(struct net_device *dev)
out_rth: out_rth:
vrf_rtable_release(dev, vrf); vrf_rtable_release(dev, vrf);
out_stats:
free_percpu(dev->dstats);
dev->dstats = NULL;
out_nomem: out_nomem:
return -ENOMEM; return -ENOMEM;
} }
...@@ -1704,6 +1684,8 @@ static void vrf_setup(struct net_device *dev) ...@@ -1704,6 +1684,8 @@ static void vrf_setup(struct net_device *dev)
dev->min_mtu = IPV6_MIN_MTU; dev->min_mtu = IPV6_MIN_MTU;
dev->max_mtu = IP6_MAX_MTU; dev->max_mtu = IP6_MAX_MTU;
dev->mtu = dev->max_mtu; dev->mtu = dev->max_mtu;
dev->pcpu_stat_type = NETDEV_PCPU_STAT_DSTATS;
} }
static int vrf_validate(struct nlattr *tb[], struct nlattr *data[], static int vrf_validate(struct nlattr *tb[], struct nlattr *data[],
......
...@@ -301,6 +301,17 @@ struct bpf_func_state { ...@@ -301,6 +301,17 @@ struct bpf_func_state {
struct tnum callback_ret_range; struct tnum callback_ret_range;
bool in_async_callback_fn; bool in_async_callback_fn;
bool in_exception_callback_fn; bool in_exception_callback_fn;
/* For callback calling functions that limit number of possible
* callback executions (e.g. bpf_loop) keeps track of current
* simulated iteration number.
* Value in frame N refers to number of times callback with frame
* N+1 was simulated, e.g. for the following call:
*
* bpf_loop(..., fn, ...); | suppose current frame is N
* | fn would be simulated in frame N+1
* | number of simulations is tracked in frame N
*/
u32 callback_depth;
/* The following fields should be last. See copy_func_state() */ /* The following fields should be last. See copy_func_state() */
int acquired_refs; int acquired_refs;
...@@ -400,6 +411,7 @@ struct bpf_verifier_state { ...@@ -400,6 +411,7 @@ struct bpf_verifier_state {
struct bpf_idx_pair *jmp_history; struct bpf_idx_pair *jmp_history;
u32 jmp_history_cnt; u32 jmp_history_cnt;
u32 dfs_depth; u32 dfs_depth;
u32 callback_unroll_depth;
}; };
#define bpf_get_spilled_reg(slot, frame, mask) \ #define bpf_get_spilled_reg(slot, frame, mask) \
...@@ -511,6 +523,10 @@ struct bpf_insn_aux_data { ...@@ -511,6 +523,10 @@ struct bpf_insn_aux_data {
* this instruction, regardless of any heuristics * this instruction, regardless of any heuristics
*/ */
bool force_checkpoint; bool force_checkpoint;
/* true if instruction is a call to a helper function that
* accepts callback function as a parameter.
*/
bool calls_callback;
}; };
#define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */ #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */
......
...@@ -1797,6 +1797,13 @@ enum netdev_ml_priv_type { ...@@ -1797,6 +1797,13 @@ enum netdev_ml_priv_type {
ML_PRIV_CAN, ML_PRIV_CAN,
}; };
/*
 * Selects which per-CPU statistics structure, if any, the networking core
 * allocates and frees on behalf of a device (stored in
 * net_device::pcpu_stat_type).
 */
enum netdev_stat_type {
/* No core-managed stats: the driver allocates/frees them internally. */
NETDEV_PCPU_STAT_NONE,
NETDEV_PCPU_STAT_LSTATS, /* struct pcpu_lstats */
NETDEV_PCPU_STAT_TSTATS, /* struct pcpu_sw_netstats */
NETDEV_PCPU_STAT_DSTATS, /* struct pcpu_dstats */
};
/** /**
* struct net_device - The DEVICE structure. * struct net_device - The DEVICE structure.
* *
...@@ -1991,10 +1998,14 @@ enum netdev_ml_priv_type { ...@@ -1991,10 +1998,14 @@ enum netdev_ml_priv_type {
* *
* @ml_priv: Mid-layer private * @ml_priv: Mid-layer private
* @ml_priv_type: Mid-layer private type * @ml_priv_type: Mid-layer private type
* @lstats: Loopback statistics *
* @tstats: Tunnel statistics * @pcpu_stat_type: Type of device statistics which the core should
* @dstats: Dummy statistics * allocate/free: none, lstats, tstats, dstats. none
* @vstats: Virtual ethernet statistics * means the driver is handling statistics allocation/
* freeing internally.
* @lstats: Loopback statistics: packets, bytes
* @tstats: Tunnel statistics: RX/TX packets, RX/TX bytes
* @dstats: Dummy statistics: RX/TX/drop packets, RX/TX bytes
* *
* @garp_port: GARP * @garp_port: GARP
* @mrp_port: MRP * @mrp_port: MRP
...@@ -2354,6 +2365,7 @@ struct net_device { ...@@ -2354,6 +2365,7 @@ struct net_device {
void *ml_priv; void *ml_priv;
enum netdev_ml_priv_type ml_priv_type; enum netdev_ml_priv_type ml_priv_type;
enum netdev_stat_type pcpu_stat_type:8;
union { union {
struct pcpu_lstats __percpu *lstats; struct pcpu_lstats __percpu *lstats;
struct pcpu_sw_netstats __percpu *tstats; struct pcpu_sw_netstats __percpu *tstats;
...@@ -2755,6 +2767,16 @@ struct pcpu_sw_netstats { ...@@ -2755,6 +2767,16 @@ struct pcpu_sw_netstats {
struct u64_stats_sync syncp; struct u64_stats_sync syncp;
} __aligned(4 * sizeof(u64)); } __aligned(4 * sizeof(u64));
/*
 * Per-CPU device statistics used when a device selects
 * NETDEV_PCPU_STAT_DSTATS: RX/TX packet and byte counters plus RX/TX drop
 * counters.
 *
 * In the vrf code of this change, packet/byte updates are bracketed by
 * u64_stats_update_begin()/u64_stats_update_end() on @syncp and read back
 * inside a u64_stats_fetch_begin()/u64_stats_fetch_retry() loop.
 */
struct pcpu_dstats {
u64 rx_packets;
u64 rx_bytes;
u64 rx_drops;
u64 tx_packets;
u64 tx_bytes;
u64 tx_drops;
/* Sequence/sync object guarding the 64-bit counters above. */
struct u64_stats_sync syncp;
} __aligned(8 * sizeof(u64));
struct pcpu_lstats { struct pcpu_lstats {
u64_stats_t packets; u64_stats_t packets;
u64_stats_t bytes; u64_stats_t bytes;
......
...@@ -10,6 +10,7 @@ int netkit_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog); ...@@ -10,6 +10,7 @@ int netkit_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog);
int netkit_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); int netkit_link_attach(const union bpf_attr *attr, struct bpf_prog *prog);
int netkit_prog_detach(const union bpf_attr *attr, struct bpf_prog *prog); int netkit_prog_detach(const union bpf_attr *attr, struct bpf_prog *prog);
int netkit_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr); int netkit_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr);
INDIRECT_CALLABLE_DECLARE(struct net_device *netkit_peer_dev(struct net_device *dev));
#else #else
static inline int netkit_prog_attach(const union bpf_attr *attr, static inline int netkit_prog_attach(const union bpf_attr *attr,
struct bpf_prog *prog) struct bpf_prog *prog)
...@@ -34,5 +35,10 @@ static inline int netkit_prog_query(const union bpf_attr *attr, ...@@ -34,5 +35,10 @@ static inline int netkit_prog_query(const union bpf_attr *attr,
{ {
return -EINVAL; return -EINVAL;
} }
/*
 * Stub used when CONFIG_NETKIT is disabled: there is no netkit peer to
 * resolve, so every lookup reports "no peer device".
 */
static inline struct net_device *netkit_peer_dev(struct net_device *dev)
{
	return NULL;
}
#endif /* CONFIG_NETKIT */ #endif /* CONFIG_NETKIT */
#endif /* __NET_NETKIT_H */ #endif /* __NET_NETKIT_H */
This diff is collapsed.
...@@ -10051,6 +10051,54 @@ void netif_tx_stop_all_queues(struct net_device *dev) ...@@ -10051,6 +10051,54 @@ void netif_tx_stop_all_queues(struct net_device *dev)
} }
EXPORT_SYMBOL(netif_tx_stop_all_queues); EXPORT_SYMBOL(netif_tx_stop_all_queues);
static int netdev_do_alloc_pcpu_stats(struct net_device *dev)
{
void __percpu *v;
/* Drivers implementing ndo_get_peer_dev must support tstat
* accounting, so that skb_do_redirect() can bump the dev's
* RX stats upon network namespace switch.
*/
if (dev->netdev_ops->ndo_get_peer_dev &&
dev->pcpu_stat_type != NETDEV_PCPU_STAT_TSTATS)
return -EOPNOTSUPP;
switch (dev->pcpu_stat_type) {
case NETDEV_PCPU_STAT_NONE:
return 0;
case NETDEV_PCPU_STAT_LSTATS:
v = dev->lstats = netdev_alloc_pcpu_stats(struct pcpu_lstats);
break;
case NETDEV_PCPU_STAT_TSTATS:
v = dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
break;
case NETDEV_PCPU_STAT_DSTATS:
v = dev->dstats = netdev_alloc_pcpu_stats(struct pcpu_dstats);
break;
default:
return -EINVAL;
}
return v ? 0 : -ENOMEM;
}
/*
 * netdev_do_free_pcpu_stats - release core-allocated per-CPU stats.
 * @dev: device whose statistics area is being torn down
 *
 * Frees whichever of dev->{l,t,d}stats corresponds to dev->pcpu_stat_type.
 * Nothing is freed for NETDEV_PCPU_STAT_NONE, where the driver handles
 * statistics allocation/freeing itself.
 */
static void netdev_do_free_pcpu_stats(struct net_device *dev)
{
	switch (dev->pcpu_stat_type) {
	case NETDEV_PCPU_STAT_LSTATS:
		free_percpu(dev->lstats);
		return;
	case NETDEV_PCPU_STAT_TSTATS:
		free_percpu(dev->tstats);
		return;
	case NETDEV_PCPU_STAT_DSTATS:
		free_percpu(dev->dstats);
		return;
	case NETDEV_PCPU_STAT_NONE:
		/* Not core-managed: leave it to the driver. */
		return;
	}
}
/** /**
* register_netdevice() - register a network device * register_netdevice() - register a network device
* @dev: device to register * @dev: device to register
...@@ -10111,9 +10159,13 @@ int register_netdevice(struct net_device *dev) ...@@ -10111,9 +10159,13 @@ int register_netdevice(struct net_device *dev)
goto err_uninit; goto err_uninit;
} }
ret = netdev_do_alloc_pcpu_stats(dev);
if (ret)
goto err_uninit;
ret = dev_index_reserve(net, dev->ifindex); ret = dev_index_reserve(net, dev->ifindex);
if (ret < 0) if (ret < 0)
goto err_uninit; goto err_free_pcpu;
dev->ifindex = ret; dev->ifindex = ret;
/* Transfer changeable features to wanted_features and enable /* Transfer changeable features to wanted_features and enable
...@@ -10219,6 +10271,8 @@ int register_netdevice(struct net_device *dev) ...@@ -10219,6 +10271,8 @@ int register_netdevice(struct net_device *dev)
call_netdevice_notifiers(NETDEV_PRE_UNINIT, dev); call_netdevice_notifiers(NETDEV_PRE_UNINIT, dev);
err_ifindex_release: err_ifindex_release:
dev_index_release(net, dev->ifindex); dev_index_release(net, dev->ifindex);
err_free_pcpu:
netdev_do_free_pcpu_stats(dev);
err_uninit: err_uninit:
if (dev->netdev_ops->ndo_uninit) if (dev->netdev_ops->ndo_uninit)
dev->netdev_ops->ndo_uninit(dev); dev->netdev_ops->ndo_uninit(dev);
...@@ -10471,6 +10525,7 @@ void netdev_run_todo(void) ...@@ -10471,6 +10525,7 @@ void netdev_run_todo(void)
WARN_ON(rcu_access_pointer(dev->ip_ptr)); WARN_ON(rcu_access_pointer(dev->ip_ptr));
WARN_ON(rcu_access_pointer(dev->ip6_ptr)); WARN_ON(rcu_access_pointer(dev->ip6_ptr));
netdev_do_free_pcpu_stats(dev);
if (dev->priv_destructor) if (dev->priv_destructor)
dev->priv_destructor(dev); dev->priv_destructor(dev);
if (dev->needs_free_netdev) if (dev->needs_free_netdev)
......
...@@ -81,6 +81,7 @@ ...@@ -81,6 +81,7 @@
#include <net/xdp.h> #include <net/xdp.h>
#include <net/mptcp.h> #include <net/mptcp.h>
#include <net/netfilter/nf_conntrack_bpf.h> #include <net/netfilter/nf_conntrack_bpf.h>
#include <net/netkit.h>
#include <linux/un.h> #include <linux/un.h>
#include "dev.h" #include "dev.h"
...@@ -2468,6 +2469,16 @@ static const struct bpf_func_proto bpf_clone_redirect_proto = { ...@@ -2468,6 +2469,16 @@ static const struct bpf_func_proto bpf_clone_redirect_proto = {
DEFINE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info); DEFINE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info);
EXPORT_PER_CPU_SYMBOL_GPL(bpf_redirect_info); EXPORT_PER_CPU_SYMBOL_GPL(bpf_redirect_info);
/*
 * skb_get_peer_dev - resolve the peer of @dev through its netdev ops.
 * @dev: device whose peer is wanted
 *
 * Return: whatever the driver's ndo_get_peer_dev callback returns, or NULL
 * when the driver does not provide that callback. The call goes through
 * INDIRECT_CALL_1() with netkit_peer_dev as the expected target.
 */
static struct net_device *skb_get_peer_dev(struct net_device *dev)
{
	const struct net_device_ops *ops = dev->netdev_ops;

	/* Drivers without a peer concept have nothing to hand back. */
	if (unlikely(!ops->ndo_get_peer_dev))
		return NULL;

	return INDIRECT_CALL_1(ops->ndo_get_peer_dev, netkit_peer_dev, dev);
}
int skb_do_redirect(struct sk_buff *skb) int skb_do_redirect(struct sk_buff *skb)
{ {
struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
...@@ -2481,17 +2492,15 @@ int skb_do_redirect(struct sk_buff *skb) ...@@ -2481,17 +2492,15 @@ int skb_do_redirect(struct sk_buff *skb)
if (unlikely(!dev)) if (unlikely(!dev))
goto out_drop; goto out_drop;
if (flags & BPF_F_PEER) { if (flags & BPF_F_PEER) {
const struct net_device_ops *ops = dev->netdev_ops; if (unlikely(!skb_at_tc_ingress(skb)))
if (unlikely(!ops->ndo_get_peer_dev ||
!skb_at_tc_ingress(skb)))
goto out_drop; goto out_drop;
dev = ops->ndo_get_peer_dev(dev); dev = skb_get_peer_dev(dev);
if (unlikely(!dev || if (unlikely(!dev ||
!(dev->flags & IFF_UP) || !(dev->flags & IFF_UP) ||
net_eq(net, dev_net(dev)))) net_eq(net, dev_net(dev))))
goto out_drop; goto out_drop;
skb->dev = dev; skb->dev = dev;
dev_sw_netstats_rx_add(dev, skb->len);
return -EAGAIN; return -EAGAIN;
} }
return flags & BPF_F_NEIGH ? return flags & BPF_F_NEIGH ?
......
...@@ -31,6 +31,7 @@ ...@@ -31,6 +31,7 @@
#include "verifier_helper_restricted.skel.h" #include "verifier_helper_restricted.skel.h"
#include "verifier_helper_value_access.skel.h" #include "verifier_helper_value_access.skel.h"
#include "verifier_int_ptr.skel.h" #include "verifier_int_ptr.skel.h"
#include "verifier_iterating_callbacks.skel.h"
#include "verifier_jeq_infer_not_null.skel.h" #include "verifier_jeq_infer_not_null.skel.h"
#include "verifier_ld_ind.skel.h" #include "verifier_ld_ind.skel.h"
#include "verifier_ldsx.skel.h" #include "verifier_ldsx.skel.h"
...@@ -139,6 +140,7 @@ void test_verifier_helper_packet_access(void) { RUN(verifier_helper_packet_acces ...@@ -139,6 +140,7 @@ void test_verifier_helper_packet_access(void) { RUN(verifier_helper_packet_acces
void test_verifier_helper_restricted(void) { RUN(verifier_helper_restricted); } void test_verifier_helper_restricted(void) { RUN(verifier_helper_restricted); }
void test_verifier_helper_value_access(void) { RUN(verifier_helper_value_access); } void test_verifier_helper_value_access(void) { RUN(verifier_helper_value_access); }
void test_verifier_int_ptr(void) { RUN(verifier_int_ptr); } void test_verifier_int_ptr(void) { RUN(verifier_int_ptr); }
void test_verifier_iterating_callbacks(void) { RUN(verifier_iterating_callbacks); }
void test_verifier_jeq_infer_not_null(void) { RUN(verifier_jeq_infer_not_null); } void test_verifier_jeq_infer_not_null(void) { RUN(verifier_jeq_infer_not_null); }
void test_verifier_ld_ind(void) { RUN(verifier_ld_ind); } void test_verifier_ld_ind(void) { RUN(verifier_ld_ind); }
void test_verifier_ldsx(void) { RUN(verifier_ldsx); } void test_verifier_ldsx(void) { RUN(verifier_ldsx); }
......
...@@ -15,13 +15,16 @@ static int empty_callback(__u32 index, void *data) ...@@ -15,13 +15,16 @@ static int empty_callback(__u32 index, void *data)
return 0; return 0;
} }
/*
 * outer_loop - bpf_loop() callback forming one outer benchmark iteration.
 * @index: iteration number supplied by bpf_loop() (unused)
 * @data:  callback context (unused)
 *
 * Runs nr_loops iterations of empty_callback through bpf_loop() and then
 * atomically adds nr_loops to the global hit counter. Always returns 0.
 */
static int outer_loop(__u32 index, void *data)
{
	/* Inner loop under measurement. */
	bpf_loop(nr_loops, empty_callback, NULL, 0);

	/* Account for the iterations just requested. */
	__sync_add_and_fetch(&hits, nr_loops);

	return 0;
}
SEC("fentry/" SYS_PREFIX "sys_getpgid") SEC("fentry/" SYS_PREFIX "sys_getpgid")
int benchmark(void *ctx) int benchmark(void *ctx)
{ {
for (int i = 0; i < 1000; i++) { bpf_loop(1000, outer_loop, NULL, 0);
bpf_loop(nr_loops, empty_callback, NULL, 0);
__sync_add_and_fetch(&hits, nr_loops);
}
return 0; return 0;
} }
...@@ -33,6 +33,7 @@ int underflow_prog(void *ctx) ...@@ -33,6 +33,7 @@ int underflow_prog(void *ctx)
if (!p) if (!p)
return 0; return 0;
bpf_for_each_map_elem(&array_map, cb1, &p, 0); bpf_for_each_map_elem(&array_map, cb1, &p, 0);
bpf_kfunc_call_test_release(p);
return 0; return 0;
} }
......
...@@ -171,6 +171,7 @@ int reject_with_rbtree_add_throw(void *ctx) ...@@ -171,6 +171,7 @@ int reject_with_rbtree_add_throw(void *ctx)
return 0; return 0;
bpf_spin_lock(&lock); bpf_spin_lock(&lock);
bpf_rbtree_add(&rbtree, &f->node, rbless); bpf_rbtree_add(&rbtree, &f->node, rbless);
bpf_spin_unlock(&lock);
return 0; return 0;
} }
...@@ -214,6 +215,7 @@ int reject_with_cb_reference(void *ctx) ...@@ -214,6 +215,7 @@ int reject_with_cb_reference(void *ctx)
if (!f) if (!f)
return 0; return 0;
bpf_loop(5, subprog_cb_ref, NULL, 0); bpf_loop(5, subprog_cb_ref, NULL, 0);
bpf_obj_drop(f);
return 0; return 0;
} }
......
...@@ -24,9 +24,11 @@ struct task_struct {}; ...@@ -24,9 +24,11 @@ struct task_struct {};
#define STACK_TABLE_EPOCH_SHIFT 20 #define STACK_TABLE_EPOCH_SHIFT 20
#define STROBE_MAX_STR_LEN 1 #define STROBE_MAX_STR_LEN 1
#define STROBE_MAX_CFGS 32 #define STROBE_MAX_CFGS 32
#define READ_MAP_VAR_PAYLOAD_CAP \
((1 + STROBE_MAX_MAP_ENTRIES * 2) * STROBE_MAX_STR_LEN)
#define STROBE_MAX_PAYLOAD \ #define STROBE_MAX_PAYLOAD \
(STROBE_MAX_STRS * STROBE_MAX_STR_LEN + \ (STROBE_MAX_STRS * STROBE_MAX_STR_LEN + \
STROBE_MAX_MAPS * (1 + STROBE_MAX_MAP_ENTRIES * 2) * STROBE_MAX_STR_LEN) STROBE_MAX_MAPS * READ_MAP_VAR_PAYLOAD_CAP)
struct strobe_value_header { struct strobe_value_header {
/* /*
...@@ -355,7 +357,7 @@ static __always_inline uint64_t read_str_var(struct strobemeta_cfg *cfg, ...@@ -355,7 +357,7 @@ static __always_inline uint64_t read_str_var(struct strobemeta_cfg *cfg,
size_t idx, void *tls_base, size_t idx, void *tls_base,
struct strobe_value_generic *value, struct strobe_value_generic *value,
struct strobemeta_payload *data, struct strobemeta_payload *data,
void *payload) size_t off)
{ {
void *location; void *location;
uint64_t len; uint64_t len;
...@@ -366,7 +368,7 @@ static __always_inline uint64_t read_str_var(struct strobemeta_cfg *cfg, ...@@ -366,7 +368,7 @@ static __always_inline uint64_t read_str_var(struct strobemeta_cfg *cfg,
return 0; return 0;
bpf_probe_read_user(value, sizeof(struct strobe_value_generic), location); bpf_probe_read_user(value, sizeof(struct strobe_value_generic), location);
len = bpf_probe_read_user_str(payload, STROBE_MAX_STR_LEN, value->ptr); len = bpf_probe_read_user_str(&data->payload[off], STROBE_MAX_STR_LEN, value->ptr);
/* /*
* if bpf_probe_read_user_str returns error (<0), due to casting to * if bpf_probe_read_user_str returns error (<0), due to casting to
* unsigned int, it will become a big number, so next check is
...@@ -378,14 +380,14 @@ static __always_inline uint64_t read_str_var(struct strobemeta_cfg *cfg, ...@@ -378,14 +380,14 @@ static __always_inline uint64_t read_str_var(struct strobemeta_cfg *cfg,
return 0; return 0;
data->str_lens[idx] = len; data->str_lens[idx] = len;
return len; return off + len;
} }
static __always_inline void *read_map_var(struct strobemeta_cfg *cfg, static __always_inline uint64_t read_map_var(struct strobemeta_cfg *cfg,
size_t idx, void *tls_base, size_t idx, void *tls_base,
struct strobe_value_generic *value, struct strobe_value_generic *value,
struct strobemeta_payload *data, struct strobemeta_payload *data,
void *payload) size_t off)
{ {
struct strobe_map_descr* descr = &data->map_descrs[idx]; struct strobe_map_descr* descr = &data->map_descrs[idx];
struct strobe_map_raw map; struct strobe_map_raw map;
...@@ -397,11 +399,11 @@ static __always_inline void *read_map_var(struct strobemeta_cfg *cfg, ...@@ -397,11 +399,11 @@ static __always_inline void *read_map_var(struct strobemeta_cfg *cfg,
location = calc_location(&cfg->map_locs[idx], tls_base); location = calc_location(&cfg->map_locs[idx], tls_base);
if (!location) if (!location)
return payload; return off;
bpf_probe_read_user(value, sizeof(struct strobe_value_generic), location); bpf_probe_read_user(value, sizeof(struct strobe_value_generic), location);
if (bpf_probe_read_user(&map, sizeof(struct strobe_map_raw), value->ptr)) if (bpf_probe_read_user(&map, sizeof(struct strobe_map_raw), value->ptr))
return payload; return off;
descr->id = map.id; descr->id = map.id;
descr->cnt = map.cnt; descr->cnt = map.cnt;
...@@ -410,10 +412,10 @@ static __always_inline void *read_map_var(struct strobemeta_cfg *cfg, ...@@ -410,10 +412,10 @@ static __always_inline void *read_map_var(struct strobemeta_cfg *cfg,
data->req_meta_valid = 1; data->req_meta_valid = 1;
} }
len = bpf_probe_read_user_str(payload, STROBE_MAX_STR_LEN, map.tag); len = bpf_probe_read_user_str(&data->payload[off], STROBE_MAX_STR_LEN, map.tag);
if (len <= STROBE_MAX_STR_LEN) { if (len <= STROBE_MAX_STR_LEN) {
descr->tag_len = len; descr->tag_len = len;
payload += len; off += len;
} }
#ifdef NO_UNROLL #ifdef NO_UNROLL
...@@ -426,22 +428,22 @@ static __always_inline void *read_map_var(struct strobemeta_cfg *cfg, ...@@ -426,22 +428,22 @@ static __always_inline void *read_map_var(struct strobemeta_cfg *cfg,
break; break;
descr->key_lens[i] = 0; descr->key_lens[i] = 0;
len = bpf_probe_read_user_str(payload, STROBE_MAX_STR_LEN, len = bpf_probe_read_user_str(&data->payload[off], STROBE_MAX_STR_LEN,
map.entries[i].key); map.entries[i].key);
if (len <= STROBE_MAX_STR_LEN) { if (len <= STROBE_MAX_STR_LEN) {
descr->key_lens[i] = len; descr->key_lens[i] = len;
payload += len; off += len;
} }
descr->val_lens[i] = 0; descr->val_lens[i] = 0;
len = bpf_probe_read_user_str(payload, STROBE_MAX_STR_LEN, len = bpf_probe_read_user_str(&data->payload[off], STROBE_MAX_STR_LEN,
map.entries[i].val); map.entries[i].val);
if (len <= STROBE_MAX_STR_LEN) { if (len <= STROBE_MAX_STR_LEN) {
descr->val_lens[i] = len; descr->val_lens[i] = len;
payload += len; off += len;
} }
} }
return payload; return off;
} }
#ifdef USE_BPF_LOOP #ifdef USE_BPF_LOOP
...@@ -455,14 +457,20 @@ struct read_var_ctx { ...@@ -455,14 +457,20 @@ struct read_var_ctx {
struct strobemeta_payload *data; struct strobemeta_payload *data;
void *tls_base; void *tls_base;
struct strobemeta_cfg *cfg; struct strobemeta_cfg *cfg;
void *payload; size_t payload_off;
/* value gets mutated */ /* value gets mutated */
struct strobe_value_generic *value; struct strobe_value_generic *value;
enum read_type type; enum read_type type;
}; };
static int read_var_callback(__u32 index, struct read_var_ctx *ctx) static int read_var_callback(__u64 index, struct read_var_ctx *ctx)
{ {
/* Lose precision info for ctx->payload_off: the verifier does not track
 * a double xor, and barrier_var() is needed to force clang to keep both xors.
 */
ctx->payload_off ^= index;
barrier_var(ctx->payload_off);
ctx->payload_off ^= index;
switch (ctx->type) { switch (ctx->type) {
case READ_INT_VAR: case READ_INT_VAR:
if (index >= STROBE_MAX_INTS) if (index >= STROBE_MAX_INTS)
...@@ -472,14 +480,18 @@ static int read_var_callback(__u32 index, struct read_var_ctx *ctx) ...@@ -472,14 +480,18 @@ static int read_var_callback(__u32 index, struct read_var_ctx *ctx)
case READ_MAP_VAR: case READ_MAP_VAR:
if (index >= STROBE_MAX_MAPS) if (index >= STROBE_MAX_MAPS)
return 1; return 1;
ctx->payload = read_map_var(ctx->cfg, index, ctx->tls_base, if (ctx->payload_off > sizeof(ctx->data->payload) - READ_MAP_VAR_PAYLOAD_CAP)
ctx->value, ctx->data, ctx->payload); return 1;
ctx->payload_off = read_map_var(ctx->cfg, index, ctx->tls_base,
ctx->value, ctx->data, ctx->payload_off);
break; break;
case READ_STR_VAR: case READ_STR_VAR:
if (index >= STROBE_MAX_STRS) if (index >= STROBE_MAX_STRS)
return 1; return 1;
ctx->payload += read_str_var(ctx->cfg, index, ctx->tls_base, if (ctx->payload_off > sizeof(ctx->data->payload) - STROBE_MAX_STR_LEN)
ctx->value, ctx->data, ctx->payload); return 1;
ctx->payload_off = read_str_var(ctx->cfg, index, ctx->tls_base,
ctx->value, ctx->data, ctx->payload_off);
break; break;
} }
return 0; return 0;
...@@ -501,7 +513,8 @@ static void *read_strobe_meta(struct task_struct *task, ...@@ -501,7 +513,8 @@ static void *read_strobe_meta(struct task_struct *task,
pid_t pid = bpf_get_current_pid_tgid() >> 32; pid_t pid = bpf_get_current_pid_tgid() >> 32;
struct strobe_value_generic value = {0}; struct strobe_value_generic value = {0};
struct strobemeta_cfg *cfg; struct strobemeta_cfg *cfg;
void *tls_base, *payload; size_t payload_off;
void *tls_base;
cfg = bpf_map_lookup_elem(&strobemeta_cfgs, &pid); cfg = bpf_map_lookup_elem(&strobemeta_cfgs, &pid);
if (!cfg) if (!cfg)
...@@ -509,7 +522,7 @@ static void *read_strobe_meta(struct task_struct *task, ...@@ -509,7 +522,7 @@ static void *read_strobe_meta(struct task_struct *task,
data->int_vals_set_mask = 0; data->int_vals_set_mask = 0;
data->req_meta_valid = 0; data->req_meta_valid = 0;
payload = data->payload; payload_off = 0;
/* /*
* we don't have struct task_struct definition, it should be: * we don't have struct task_struct definition, it should be:
* tls_base = (void *)task->thread.fsbase; * tls_base = (void *)task->thread.fsbase;
...@@ -522,7 +535,7 @@ static void *read_strobe_meta(struct task_struct *task, ...@@ -522,7 +535,7 @@ static void *read_strobe_meta(struct task_struct *task,
.tls_base = tls_base, .tls_base = tls_base,
.value = &value, .value = &value,
.data = data, .data = data,
.payload = payload, .payload_off = 0,
}; };
int err; int err;
...@@ -540,6 +553,11 @@ static void *read_strobe_meta(struct task_struct *task, ...@@ -540,6 +553,11 @@ static void *read_strobe_meta(struct task_struct *task,
err = bpf_loop(STROBE_MAX_MAPS, read_var_callback, &ctx, 0); err = bpf_loop(STROBE_MAX_MAPS, read_var_callback, &ctx, 0);
if (err != STROBE_MAX_MAPS) if (err != STROBE_MAX_MAPS)
return NULL; return NULL;
payload_off = ctx.payload_off;
/* this should not really happen, here only to satisfy verifier */
if (payload_off > sizeof(data->payload))
payload_off = sizeof(data->payload);
#else #else
#ifdef NO_UNROLL #ifdef NO_UNROLL
#pragma clang loop unroll(disable) #pragma clang loop unroll(disable)
...@@ -555,7 +573,7 @@ static void *read_strobe_meta(struct task_struct *task, ...@@ -555,7 +573,7 @@ static void *read_strobe_meta(struct task_struct *task,
#pragma unroll #pragma unroll
#endif /* NO_UNROLL */ #endif /* NO_UNROLL */
for (int i = 0; i < STROBE_MAX_STRS; ++i) { for (int i = 0; i < STROBE_MAX_STRS; ++i) {
payload += read_str_var(cfg, i, tls_base, &value, data, payload); payload_off = read_str_var(cfg, i, tls_base, &value, data, payload_off);
} }
#ifdef NO_UNROLL #ifdef NO_UNROLL
#pragma clang loop unroll(disable) #pragma clang loop unroll(disable)
...@@ -563,7 +581,7 @@ static void *read_strobe_meta(struct task_struct *task, ...@@ -563,7 +581,7 @@ static void *read_strobe_meta(struct task_struct *task,
#pragma unroll #pragma unroll
#endif /* NO_UNROLL */ #endif /* NO_UNROLL */
for (int i = 0; i < STROBE_MAX_MAPS; ++i) { for (int i = 0; i < STROBE_MAX_MAPS; ++i) {
payload = read_map_var(cfg, i, tls_base, &value, data, payload); payload_off = read_map_var(cfg, i, tls_base, &value, data, payload_off);
} }
#endif /* USE_BPF_LOOP */ #endif /* USE_BPF_LOOP */
...@@ -571,7 +589,7 @@ static void *read_strobe_meta(struct task_struct *task, ...@@ -571,7 +589,7 @@ static void *read_strobe_meta(struct task_struct *task,
* return pointer right after end of payload, so it's possible to * return pointer right after end of payload, so it's possible to
* calculate exact amount of useful data that needs to be sent * calculate exact amount of useful data that needs to be sent
*/ */
return payload; return &data->payload[payload_off];
} }
SEC("raw_tracepoint/kfree_skb") SEC("raw_tracepoint/kfree_skb")
......
// SPDX-License-Identifier: GPL-2.0
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include "bpf_misc.h"
/* Array map with 8 entries, __u32 keys and __u64 values.
 * It is the map iterated by bpf_for_each_map_elem() in
 * unsafe_for_each_map_elem() below.
 */
struct {
__uint(type, BPF_MAP_TYPE_ARRAY);
__uint(max_entries, 8);
__type(key, __u32);
__type(value, __u64);
} map SEC(".maps");
/* User ring buffer map (max_entries = 8); drained by
 * bpf_user_ringbuf_drain() in unsafe_ringbuf_drain() below.
 */
struct {
__uint(type, BPF_MAP_TYPE_USER_RINGBUF);
__uint(max_entries, 8);
} ringbuf SEC(".maps");
/* Forward declarations: these types are only used through pointers in
 * the callback signatures below (find_vma_cb, for_each_map_elem_cb).
 */
struct vm_area_struct;
struct bpf_map;
/* Callback context holding one buffer pointer; used by the
 * unsafe_on_2nd_iter test, whose callback overwrites and later
 * dereferences 'buf' via a helper call.
 */
struct buf_context {
char *buf;
};
/* Callback context with two 64-bit scalars. The tests below mutate 'i'
 * inside callbacks and then use 'i' or 'j' as an index into choice_arr.
 */
struct num_context {
__u64 i;
__u64 j;
};
/* Two-byte global array used as the bounds-check target: only indices
 * 0 and 1 are valid. The "value_size=2" in the expected verifier
 * messages below matches this array's size.
 */
__u8 choice_arr[2] = { 0, 1 };
/* bpf_loop() callback for unsafe_on_2nd_iter.
 *
 * For idx == 0 it replaces ctx->buf with the constant 0xDEAD and
 * returns 0. For any other idx it passes ctx->buf as the destination
 * of an 8-byte bpf_probe_read_user() and returns 1 if the helper
 * reports a non-zero result, 0 otherwise.
 */
static int unsafe_on_2nd_iter_cb(__u32 idx, struct buf_context *ctx)
{
if (idx == 0) {
/* From now on ctx->buf is a plain number, not a stack pointer. */
ctx->buf = (char *)(0xDEAD);
return 0;
}
/* Uses whatever ctx->buf currently holds as the write destination. */
if (bpf_probe_read_user(ctx->buf, 8, (void *)(0xBADC0FFEE)))
return 1;
return 0;
}
/* Negative test (annotated __failure): the program is expected to be
 * rejected with "R1 type=scalar expected=fp".
 *
 * The callback is only problematic after its first invocation has
 * overwritten ctx->buf with a constant, so the verifier has to consider
 * more than one callback iteration to notice it.
 */
SEC("?raw_tp")
__failure __msg("R1 type=scalar expected=fp")
int unsafe_on_2nd_iter(void *unused)
{
char buf[4];
struct buf_context loop_ctx = { .buf = buf };
bpf_loop(100, unsafe_on_2nd_iter_cb, &loop_ctx, 0);
return 0;
}
/* bpf_loop() callback for unsafe_on_zero_iter: resets ctx->i to 0 on
 * every invocation and returns 0.
 */
static int unsafe_on_zero_iter_cb(__u32 idx, struct num_context *ctx)
{
ctx->i = 0;
return 0;
}
/* Negative test (annotated __failure): loop_ctx.i starts at 32 and is
 * only reset to 0 inside the callback. If the callback body never runs,
 * choice_arr[32] is read, which is the out-of-bounds access named in
 * the expected message (off=32, value_size=2).
 */
SEC("?raw_tp")
__failure __msg("invalid access to map value, value_size=2 off=32 size=1")
int unsafe_on_zero_iter(void *unused)
{
struct num_context loop_ctx = { .i = 32 };
bpf_loop(100, unsafe_on_zero_iter_cb, &loop_ctx, 0);
return choice_arr[loop_ctx.i];
}
/* bpf_loop() callback for the widening test: bumps ctx->i by one on
 * each invocation and asks the loop to continue (returns 0).
 * ctx->j is intentionally left untouched.
 */
static int widening_cb(__u32 idx, struct num_context *ctx)
{
	ctx->i += 1;
	return 0;
}
/* Positive test (annotated __success): the callback increments only
 * loop_ctx.i, while the program indexes choice_arr with loop_ctx.j,
 * which stays at its initial value 1 (a valid index).
 */
SEC("?raw_tp")
__success
int widening(void *unused)
{
struct num_context loop_ctx = { .i = 0, .j = 1 };
bpf_loop(100, widening_cb, &loop_ctx, 0);
/* loop_ctx.j is not changed during callback iteration,
 * verifier should not apply widening to it.
 */
return choice_arr[loop_ctx.j];
}
/* bpf_loop() callback for loop_detection: spins in an empty endless
 * loop, so the trailing 'return 0' is never reached.
 */
static int loop_detection_cb(__u32 idx, struct num_context *ctx)
{
for (;;) {}
return 0;
}
/* Negative test (annotated __failure): the callback body never
 * terminates; the expected verifier message is
 * "infinite loop detected".
 */
SEC("?raw_tp")
__failure __msg("infinite loop detected")
int loop_detection(void *unused)
{
struct num_context loop_ctx = { .i = 0 };
bpf_loop(100, loop_detection_cb, &loop_ctx, 0);
return 0;
}
/* Shared body of the for_each_map_elem / ringbuf_drain / find_vma
 * callbacks. Advances ctx->i one step per call along 0 -> 1 -> 32 and
 * leaves any other value unchanged. Always returns 0.
 *
 * Reaching 32 therefore takes two calls starting from 0; 32 is outside
 * the two-element choice_arr that the callers index afterwards.
 */
static __always_inline __u64 oob_state_machine(struct num_context *ctx)
{
	__u64 state = ctx->i;

	if (state == 0)
		ctx->i = 1;
	else if (state == 1)
		ctx->i = 32;

	return 0;
}
/* Callback passed to bpf_for_each_map_elem(): ignores map, key and val
 * and forwards the opaque 'data' pointer to oob_state_machine().
 */
static __u64 for_each_map_elem_cb(struct bpf_map *map, __u32 *key, __u64 *val, void *data)
{
return oob_state_machine(data);
}
/* Negative test (annotated __failure): after enough callback
 * invocations oob_state_machine() leaves loop_ctx.i == 32, making
 * choice_arr[loop_ctx.i] the out-of-bounds read named in the expected
 * message (off=32, value_size=2).
 */
SEC("?raw_tp")
__failure __msg("invalid access to map value, value_size=2 off=32 size=1")
int unsafe_for_each_map_elem(void *unused)
{
struct num_context loop_ctx = { .i = 0 };
bpf_for_each_map_elem(&map, for_each_map_elem_cb, &loop_ctx, 0);
return choice_arr[loop_ctx.i];
}
/* Callback passed to bpf_user_ringbuf_drain(): ignores the dynptr and
 * forwards the opaque 'data' pointer to oob_state_machine().
 */
static __u64 ringbuf_drain_cb(struct bpf_dynptr *dynptr, void *data)
{
return oob_state_machine(data);
}
/* Negative test (annotated __failure): same scenario as
 * unsafe_for_each_map_elem, but the callback is driven by
 * bpf_user_ringbuf_drain(). Expected message names the off=32 access
 * into the two-byte choice_arr.
 */
SEC("?raw_tp")
__failure __msg("invalid access to map value, value_size=2 off=32 size=1")
int unsafe_ringbuf_drain(void *unused)
{
struct num_context loop_ctx = { .i = 0 };
bpf_user_ringbuf_drain(&ringbuf, ringbuf_drain_cb, &loop_ctx, 0);
return choice_arr[loop_ctx.i];
}
/* Callback passed to bpf_find_vma(): ignores task and vma and forwards
 * the opaque 'data' pointer to oob_state_machine().
 */
static __u64 find_vma_cb(struct task_struct *task, struct vm_area_struct *vma, void *data)
{
return oob_state_machine(data);
}
/* Negative test (annotated __failure): same scenario as
 * unsafe_for_each_map_elem, but the callback is driven by
 * bpf_find_vma() on the current task with address 0. Expected message
 * names the off=32 access into the two-byte choice_arr.
 */
SEC("?raw_tp")
__failure __msg("invalid access to map value, value_size=2 off=32 size=1")
int unsafe_find_vma(void *unused)
{
struct task_struct *task = bpf_get_current_task_btf();
struct num_context loop_ctx = { .i = 0 };
bpf_find_vma(task, 0, find_vma_cb, &loop_ctx, 0);
return choice_arr[loop_ctx.i];
}
/* bpf_loop() callback for the iteration-limit tests: adds one to
 * ctx->i per invocation, so after the loop ctx->i equals the number of
 * times the callback ran. Returns 0 to keep iterating.
 */
static int iter_limit_cb(__u32 idx, struct num_context *ctx)
{
	ctx->i += 1;
	return 0;
}
/* Positive test (annotated __success): bpf_loop() is asked for a
 * single iteration, so ctx.i ends up as 0 or 1, both valid indices
 * into the two-element choice_arr.
 */
SEC("?raw_tp")
__success
int bpf_loop_iter_limit_ok(void *unused)
{
struct num_context ctx = { .i = 0 };
bpf_loop(1, iter_limit_cb, &ctx, 0);
return choice_arr[ctx.i];
}
/* Negative test (annotated __failure): bpf_loop() is asked for two
 * iterations, so ctx.i can reach 2, one past the end of choice_arr;
 * the expected message names exactly that access (off=2, value_size=2).
 */
SEC("?raw_tp")
__failure __msg("invalid access to map value, value_size=2 off=2 size=1")
int bpf_loop_iter_limit_overflow(void *unused)
{
struct num_context ctx = { .i = 0 };
bpf_loop(2, iter_limit_cb, &ctx, 0);
return choice_arr[ctx.i];
}
/* Inner callback "2a" for the nested test: adds 100 to ctx->i, i.e.
 * sets the hundreds digit of the visit marker. Returns 0.
 */
static int iter_limit_level2a_cb(__u32 idx, struct num_context *ctx)
{
ctx->i += 100;
return 0;
}
/* Inner callback "2b" for the nested test: adds 10 to ctx->i, i.e.
 * sets the tens digit of the visit marker. Returns 0.
 */
static int iter_limit_level2b_cb(__u32 idx, struct num_context *ctx)
{
ctx->i += 10;
return 0;
}
/* Outer callback for the nested test: adds 1 to ctx->i (units digit of
 * the visit marker), then runs each inner callback through its own
 * single-iteration bpf_loop() on the same context. Returns 0.
 */
static int iter_limit_level1_cb(__u32 idx, struct num_context *ctx)
{
ctx->i += 1;
bpf_loop(1, iter_limit_level2a_cb, ctx, 0);
bpf_loop(1, iter_limit_level2b_cb, ctx, 0);
return 0;
}
/* Check that the path visiting every callback function once has been
 * reached by the verifier. 'ctx1.i' and 'ctx2.i' below serve as flags,
 * with each decimal digit corresponding to a callback visit marker
 * (1 = level1, 10 = level2b, 100 = level2a).
 *
 * The expected return value 111111 equals 1000 * 111 + 111, i.e. both
 * contexts saw all three callbacks exactly once and 'c' is 0.
 */
SEC("socket")
__success __retval(111111)
int bpf_loop_iter_limit_nested(void *unused)
{
struct num_context ctx1 = { .i = 0 };
struct num_context ctx2 = { .i = 0 };
__u64 a, b, c;
bpf_loop(1, iter_limit_level1_cb, &ctx1, 0);
bpf_loop(1, iter_limit_level1_cb, &ctx2, 0);
a = ctx1.i;
b = ctx2.i;
/* Force 'ctx1.i' and 'ctx2.i' precise. */
c = choice_arr[(a + b) % 2];
/* This makes 'c' zero, but neither clang nor verifier know it. */
c /= 10;
/* Make sure that verifier does not visit 'impossible' states:
 * enumerate all possible callback visit masks.
 *
 * NOTE(review): as written the trap below is reached only when BOTH
 * 'a' and 'b' fall outside the listed values; if a path where just one
 * of them is impossible should also be rejected, the two groups would
 * need to be joined with '||' -- confirm the intent.
 */
if (a != 0 && a != 1 && a != 11 && a != 101 && a != 111 &&
b != 0 && b != 1 && b != 11 && b != 101 && b != 111)
asm volatile ("r0 /= 0;" ::: "r0");
return 1000 * a + b + c;
}
/* License string placed in the "license" section of the object. */
char _license[] SEC("license") = "GPL";
...@@ -119,15 +119,41 @@ __naked int global_subprog_result_precise(void) ...@@ -119,15 +119,41 @@ __naked int global_subprog_result_precise(void)
SEC("?raw_tp") SEC("?raw_tp")
__success __log_level(2) __success __log_level(2)
/* First simulated path does not include callback body,
* r1 and r4 are always precise for bpf_loop() calls.
*/
__msg("9: (85) call bpf_loop#181")
__msg("mark_precise: frame0: last_idx 9 first_idx 9 subseq_idx -1")
__msg("mark_precise: frame0: parent state regs=r4 stack=:")
__msg("mark_precise: frame0: last_idx 8 first_idx 0 subseq_idx 9")
__msg("mark_precise: frame0: regs=r4 stack= before 8: (b7) r4 = 0")
__msg("mark_precise: frame0: last_idx 9 first_idx 9 subseq_idx -1")
__msg("mark_precise: frame0: parent state regs=r1 stack=:")
__msg("mark_precise: frame0: last_idx 8 first_idx 0 subseq_idx 9")
__msg("mark_precise: frame0: regs=r1 stack= before 8: (b7) r4 = 0")
__msg("mark_precise: frame0: regs=r1 stack= before 7: (b7) r3 = 0")
__msg("mark_precise: frame0: regs=r1 stack= before 6: (bf) r2 = r8")
__msg("mark_precise: frame0: regs=r1 stack= before 5: (bf) r1 = r6")
__msg("mark_precise: frame0: regs=r6 stack= before 4: (b7) r6 = 3")
/* r6 precision propagation */
__msg("14: (0f) r1 += r6") __msg("14: (0f) r1 += r6")
__msg("mark_precise: frame0: last_idx 14 first_idx 10") __msg("mark_precise: frame0: last_idx 14 first_idx 9")
__msg("mark_precise: frame0: regs=r6 stack= before 13: (bf) r1 = r7") __msg("mark_precise: frame0: regs=r6 stack= before 13: (bf) r1 = r7")
__msg("mark_precise: frame0: regs=r6 stack= before 12: (27) r6 *= 4") __msg("mark_precise: frame0: regs=r6 stack= before 12: (27) r6 *= 4")
__msg("mark_precise: frame0: regs=r6 stack= before 11: (25) if r6 > 0x3 goto pc+4") __msg("mark_precise: frame0: regs=r6 stack= before 11: (25) if r6 > 0x3 goto pc+4")
__msg("mark_precise: frame0: regs=r6 stack= before 10: (bf) r6 = r0") __msg("mark_precise: frame0: regs=r6 stack= before 10: (bf) r6 = r0")
__msg("mark_precise: frame0: parent state regs=r0 stack=:") __msg("mark_precise: frame0: regs=r0 stack= before 9: (85) call bpf_loop")
__msg("mark_precise: frame0: last_idx 18 first_idx 0") /* State entering callback body popped from states stack */
__msg("mark_precise: frame0: regs=r0 stack= before 18: (95) exit") __msg("from 9 to 17: frame1:")
__msg("17: frame1: R1=scalar() R2=0 R10=fp0 cb")
__msg("17: (b7) r0 = 0")
__msg("18: (95) exit")
__msg("returning from callee:")
__msg("to caller at 9:")
__msg("frame 0: propagating r1,r4")
__msg("mark_precise: frame0: last_idx 9 first_idx 9 subseq_idx -1")
__msg("mark_precise: frame0: regs=r1,r4 stack= before 18: (95) exit")
__msg("from 18 to 9: safe")
__naked int callback_result_precise(void) __naked int callback_result_precise(void)
{ {
asm volatile ( asm volatile (
...@@ -233,20 +259,36 @@ __naked int parent_callee_saved_reg_precise_global(void) ...@@ -233,20 +259,36 @@ __naked int parent_callee_saved_reg_precise_global(void)
SEC("?raw_tp") SEC("?raw_tp")
__success __log_level(2) __success __log_level(2)
/* First simulated path does not include callback body */
__msg("12: (0f) r1 += r6") __msg("12: (0f) r1 += r6")
__msg("mark_precise: frame0: last_idx 12 first_idx 10") __msg("mark_precise: frame0: last_idx 12 first_idx 9")
__msg("mark_precise: frame0: regs=r6 stack= before 11: (bf) r1 = r7") __msg("mark_precise: frame0: regs=r6 stack= before 11: (bf) r1 = r7")
__msg("mark_precise: frame0: regs=r6 stack= before 10: (27) r6 *= 4") __msg("mark_precise: frame0: regs=r6 stack= before 10: (27) r6 *= 4")
__msg("mark_precise: frame0: regs=r6 stack= before 9: (85) call bpf_loop")
__msg("mark_precise: frame0: parent state regs=r6 stack=:") __msg("mark_precise: frame0: parent state regs=r6 stack=:")
__msg("mark_precise: frame0: last_idx 16 first_idx 0") __msg("mark_precise: frame0: last_idx 8 first_idx 0 subseq_idx 9")
__msg("mark_precise: frame0: regs=r6 stack= before 16: (95) exit")
__msg("mark_precise: frame1: regs= stack= before 15: (b7) r0 = 0")
__msg("mark_precise: frame1: regs= stack= before 9: (85) call bpf_loop#181")
__msg("mark_precise: frame0: regs=r6 stack= before 8: (b7) r4 = 0") __msg("mark_precise: frame0: regs=r6 stack= before 8: (b7) r4 = 0")
__msg("mark_precise: frame0: regs=r6 stack= before 7: (b7) r3 = 0") __msg("mark_precise: frame0: regs=r6 stack= before 7: (b7) r3 = 0")
__msg("mark_precise: frame0: regs=r6 stack= before 6: (bf) r2 = r8") __msg("mark_precise: frame0: regs=r6 stack= before 6: (bf) r2 = r8")
__msg("mark_precise: frame0: regs=r6 stack= before 5: (b7) r1 = 1") __msg("mark_precise: frame0: regs=r6 stack= before 5: (b7) r1 = 1")
__msg("mark_precise: frame0: regs=r6 stack= before 4: (b7) r6 = 3") __msg("mark_precise: frame0: regs=r6 stack= before 4: (b7) r6 = 3")
/* State entering callback body popped from states stack */
__msg("from 9 to 15: frame1:")
__msg("15: frame1: R1=scalar() R2=0 R10=fp0 cb")
__msg("15: (b7) r0 = 0")
__msg("16: (95) exit")
__msg("returning from callee:")
__msg("to caller at 9:")
/* r1, r4 are always precise for bpf_loop(),
* r6 was marked before backtracking to callback body.
*/
__msg("frame 0: propagating r1,r4,r6")
__msg("mark_precise: frame0: last_idx 9 first_idx 9 subseq_idx -1")
__msg("mark_precise: frame0: regs=r1,r4,r6 stack= before 16: (95) exit")
__msg("mark_precise: frame1: regs= stack= before 15: (b7) r0 = 0")
__msg("mark_precise: frame1: regs= stack= before 9: (85) call bpf_loop")
__msg("mark_precise: frame0: parent state regs= stack=:")
__msg("from 16 to 9: safe")
__naked int parent_callee_saved_reg_precise_with_callback(void) __naked int parent_callee_saved_reg_precise_with_callback(void)
{ {
asm volatile ( asm volatile (
...@@ -373,22 +415,38 @@ __naked int parent_stack_slot_precise_global(void) ...@@ -373,22 +415,38 @@ __naked int parent_stack_slot_precise_global(void)
SEC("?raw_tp") SEC("?raw_tp")
__success __log_level(2) __success __log_level(2)
/* First simulated path does not include callback body */
__msg("14: (0f) r1 += r6") __msg("14: (0f) r1 += r6")
__msg("mark_precise: frame0: last_idx 14 first_idx 11") __msg("mark_precise: frame0: last_idx 14 first_idx 10")
__msg("mark_precise: frame0: regs=r6 stack= before 13: (bf) r1 = r7") __msg("mark_precise: frame0: regs=r6 stack= before 13: (bf) r1 = r7")
__msg("mark_precise: frame0: regs=r6 stack= before 12: (27) r6 *= 4") __msg("mark_precise: frame0: regs=r6 stack= before 12: (27) r6 *= 4")
__msg("mark_precise: frame0: regs=r6 stack= before 11: (79) r6 = *(u64 *)(r10 -8)") __msg("mark_precise: frame0: regs=r6 stack= before 11: (79) r6 = *(u64 *)(r10 -8)")
__msg("mark_precise: frame0: regs= stack=-8 before 10: (85) call bpf_loop")
__msg("mark_precise: frame0: parent state regs= stack=-8:") __msg("mark_precise: frame0: parent state regs= stack=-8:")
__msg("mark_precise: frame0: last_idx 18 first_idx 0") __msg("mark_precise: frame0: last_idx 9 first_idx 0 subseq_idx 10")
__msg("mark_precise: frame0: regs= stack=-8 before 18: (95) exit")
__msg("mark_precise: frame1: regs= stack= before 17: (b7) r0 = 0")
__msg("mark_precise: frame1: regs= stack= before 10: (85) call bpf_loop#181")
__msg("mark_precise: frame0: regs= stack=-8 before 9: (b7) r4 = 0") __msg("mark_precise: frame0: regs= stack=-8 before 9: (b7) r4 = 0")
__msg("mark_precise: frame0: regs= stack=-8 before 8: (b7) r3 = 0") __msg("mark_precise: frame0: regs= stack=-8 before 8: (b7) r3 = 0")
__msg("mark_precise: frame0: regs= stack=-8 before 7: (bf) r2 = r8") __msg("mark_precise: frame0: regs= stack=-8 before 7: (bf) r2 = r8")
__msg("mark_precise: frame0: regs= stack=-8 before 6: (bf) r1 = r6") __msg("mark_precise: frame0: regs= stack=-8 before 6: (bf) r1 = r6")
__msg("mark_precise: frame0: regs= stack=-8 before 5: (7b) *(u64 *)(r10 -8) = r6") __msg("mark_precise: frame0: regs= stack=-8 before 5: (7b) *(u64 *)(r10 -8) = r6")
__msg("mark_precise: frame0: regs=r6 stack= before 4: (b7) r6 = 3") __msg("mark_precise: frame0: regs=r6 stack= before 4: (b7) r6 = 3")
/* State entering callback body popped from states stack */
__msg("from 10 to 17: frame1:")
__msg("17: frame1: R1=scalar() R2=0 R10=fp0 cb")
__msg("17: (b7) r0 = 0")
__msg("18: (95) exit")
__msg("returning from callee:")
__msg("to caller at 10:")
/* r1, r4 are always precise for bpf_loop(),
* fp-8 was marked before backtracking to callback body.
*/
__msg("frame 0: propagating r1,r4,fp-8")
__msg("mark_precise: frame0: last_idx 10 first_idx 10 subseq_idx -1")
__msg("mark_precise: frame0: regs=r1,r4 stack=-8 before 18: (95) exit")
__msg("mark_precise: frame1: regs= stack= before 17: (b7) r0 = 0")
__msg("mark_precise: frame1: regs= stack= before 10: (85) call bpf_loop#181")
__msg("mark_precise: frame0: parent state regs= stack=:")
__msg("from 18 to 10: safe")
__naked int parent_stack_slot_precise_with_callback(void) __naked int parent_stack_slot_precise_with_callback(void)
{ {
asm volatile ( asm volatile (
......
...@@ -53,6 +53,8 @@ ...@@ -53,6 +53,8 @@
#define DEFAULT_TTL 64 #define DEFAULT_TTL 64
#define MAX_ALLOWED_PORTS 8 #define MAX_ALLOWED_PORTS 8
#define MAX_PACKET_OFF 0xffff
#define swap(a, b) \ #define swap(a, b) \
do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0) do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0)
...@@ -183,63 +185,76 @@ static __always_inline __u32 tcp_clock_ms(void) ...@@ -183,63 +185,76 @@ static __always_inline __u32 tcp_clock_ms(void)
} }
struct tcpopt_context { struct tcpopt_context {
__u8 *ptr; void *data;
__u8 *end;
void *data_end; void *data_end;
__be32 *tsecr; __be32 *tsecr;
__u8 wscale; __u8 wscale;
bool option_timestamp; bool option_timestamp;
bool option_sack; bool option_sack;
__u32 off;
}; };
static int tscookie_tcpopt_parse(struct tcpopt_context *ctx) static __always_inline u8 *next(struct tcpopt_context *ctx, __u32 sz)
{ {
__u8 opcode, opsize; __u64 off = ctx->off;
__u8 *data;
if (ctx->ptr >= ctx->end) /* Verifier forbids access to packet when offset exceeds MAX_PACKET_OFF */
return 1; if (off > MAX_PACKET_OFF - sz)
if (ctx->ptr >= ctx->data_end) return NULL;
return 1;
opcode = ctx->ptr[0]; data = ctx->data + off;
barrier_var(data);
if (data + sz >= ctx->data_end)
return NULL;
if (opcode == TCPOPT_EOL) ctx->off += sz;
return 1; return data;
if (opcode == TCPOPT_NOP) { }
++ctx->ptr;
return 0;
}
if (ctx->ptr + 1 >= ctx->end) static int tscookie_tcpopt_parse(struct tcpopt_context *ctx)
return 1; {
if (ctx->ptr + 1 >= ctx->data_end) __u8 *opcode, *opsize, *wscale, *tsecr;
__u32 off = ctx->off;
opcode = next(ctx, 1);
if (!opcode)
return 1; return 1;
opsize = ctx->ptr[1];
if (opsize < 2) if (*opcode == TCPOPT_EOL)
return 1; return 1;
if (*opcode == TCPOPT_NOP)
return 0;
if (ctx->ptr + opsize > ctx->end) opsize = next(ctx, 1);
if (!opsize || *opsize < 2)
return 1; return 1;
switch (opcode) { switch (*opcode) {
case TCPOPT_WINDOW: case TCPOPT_WINDOW:
if (opsize == TCPOLEN_WINDOW && ctx->ptr + TCPOLEN_WINDOW <= ctx->data_end) wscale = next(ctx, 1);
ctx->wscale = ctx->ptr[2] < TCP_MAX_WSCALE ? ctx->ptr[2] : TCP_MAX_WSCALE; if (!wscale)
return 1;
if (*opsize == TCPOLEN_WINDOW)
ctx->wscale = *wscale < TCP_MAX_WSCALE ? *wscale : TCP_MAX_WSCALE;
break; break;
case TCPOPT_TIMESTAMP: case TCPOPT_TIMESTAMP:
if (opsize == TCPOLEN_TIMESTAMP && ctx->ptr + TCPOLEN_TIMESTAMP <= ctx->data_end) { tsecr = next(ctx, 4);
if (!tsecr)
return 1;
if (*opsize == TCPOLEN_TIMESTAMP) {
ctx->option_timestamp = true; ctx->option_timestamp = true;
/* Client's tsval becomes our tsecr. */ /* Client's tsval becomes our tsecr. */
*ctx->tsecr = get_unaligned((__be32 *)(ctx->ptr + 2)); *ctx->tsecr = get_unaligned((__be32 *)tsecr);
} }
break; break;
case TCPOPT_SACK_PERM: case TCPOPT_SACK_PERM:
if (opsize == TCPOLEN_SACK_PERM) if (*opsize == TCPOLEN_SACK_PERM)
ctx->option_sack = true; ctx->option_sack = true;
break; break;
} }
ctx->ptr += opsize; ctx->off = off + *opsize;
return 0; return 0;
} }
...@@ -256,16 +271,21 @@ static int tscookie_tcpopt_parse_batch(__u32 index, void *context) ...@@ -256,16 +271,21 @@ static int tscookie_tcpopt_parse_batch(__u32 index, void *context)
static __always_inline bool tscookie_init(struct tcphdr *tcp_header, static __always_inline bool tscookie_init(struct tcphdr *tcp_header,
__u16 tcp_len, __be32 *tsval, __u16 tcp_len, __be32 *tsval,
__be32 *tsecr, void *data_end) __be32 *tsecr, void *data, void *data_end)
{ {
struct tcpopt_context loop_ctx = { struct tcpopt_context loop_ctx = {
.ptr = (__u8 *)(tcp_header + 1), .data = data,
.end = (__u8 *)tcp_header + tcp_len,
.data_end = data_end, .data_end = data_end,
.tsecr = tsecr, .tsecr = tsecr,
.wscale = TS_OPT_WSCALE_MASK, .wscale = TS_OPT_WSCALE_MASK,
.option_timestamp = false, .option_timestamp = false,
.option_sack = false, .option_sack = false,
/* Note: currently the verifier tracks .off as an unbounded scalar.
 * If the verifier at some point gets smarter and computes a
 * bounded value for this variable, beware that it might
 * hinder bpf_loop() convergence validation.
 */
.off = (__u8 *)(tcp_header + 1) - (__u8 *)data,
}; };
u32 cookie; u32 cookie;
...@@ -635,7 +655,7 @@ static __always_inline int syncookie_handle_syn(struct header_pointers *hdr, ...@@ -635,7 +655,7 @@ static __always_inline int syncookie_handle_syn(struct header_pointers *hdr,
cookie = (__u32)value; cookie = (__u32)value;
if (tscookie_init((void *)hdr->tcp, hdr->tcp_len, if (tscookie_init((void *)hdr->tcp, hdr->tcp_len,
&tsopt_buf[0], &tsopt_buf[1], data_end)) &tsopt_buf[0], &tsopt_buf[1], data, data_end))
tsopt = tsopt_buf; tsopt = tsopt_buf;
/* Check that there is enough space for a SYNACK. It also covers /* Check that there is enough space for a SYNACK. It also covers
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment