Commit 576240cf authored by Alexei Starovoitov

Merge branch 'propagate-cn-to-tcp'

Lawrence Brakmo says:

====================
This patchset adds support for propagating congestion notifications (cn)
to TCP from cgroup inet skb egress BPF programs.

Current cgroup skb BPF programs cannot trigger TCP congestion window
reductions, even when they drop a packet. This patchset adds support
for cgroup skb BPF programs to send congestion notifications in the
return value when the packets are TCP packets. Rather than the
current 1 for keeping the packet and 0 for dropping it, they can
now return:
    NET_XMIT_SUCCESS    (0)    - continue with packet output
    NET_XMIT_DROP       (1)    - drop packet and do cn
    NET_XMIT_CN         (2)    - continue with packet output and do cn
    -EPERM                     - drop packet
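
For illustration, a minimal egress program using the new scheme might
look like the sketch below. Note that the program itself returns the
0-3 encoding documented with the BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY
macro further down (0 drop, 1 keep, 2 drop and cn, 3 keep and cn); the
kernel converts that to the NET_XMIT values listed above. The size
threshold here is a hypothetical congestion condition, not part of the
patchset:

  /* Illustrative sketch only: keep every packet, but request a
   * congestion notification for packets over an arbitrary size. */
  #include <linux/bpf.h>
  #include <bpf/bpf_helpers.h>

  SEC("cgroup_skb/egress")
  int egress_cn_example(struct __sk_buff *skb)
  {
          if (skb->len > 1200)    /* hypothetical congestion condition */
                  return 3;       /* keep packet and do cn */
          return 1;               /* keep packet */
  }

  char _license[] SEC("license") = "GPL";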

Finally, HBM programs are modified to collect and return more
statistics.

There has been some discussion regarding the best place to manage
bandwidths. Some believe this should be done in the qdisc where it can
also be managed with a BPF program. We believe there are advantages
for doing it with a BPF program in the cgroup/skb callback. For example,
it reduces overhead in the cases where there is one primary workload and
one or more secondary workloads, where each workload is running on its
own cgroupv2. In this scenario, we only need to throttle the secondary
workloads and there is no overhead for the primary workload since there
will be no BPF program attached to its cgroup.

Regardless, we agree that this mechanism should not penalize those that
are not using it. We tested this by doing 1 byte req/reply RPCs over
loopback. Each test consists of 30 sec of back-to-back 1 byte RPCs.
Each test was repeated 50 times with a 1 minute delay between each set
of 10. We then calculated the average RPCs/sec over the 50 tests. We
compared three configurations: upstream, upstream + patchset with no
BPF program attached, and upstream + patchset with a BPF program that
simply returns ALLOW_PKT.
Here are the results:

upstream                           80937 RPCs/sec
upstream + patches, no BPF program 80894 RPCs/sec
upstream + patches, BPF program    80634 RPCs/sec

These numbers indicate that the patches impose no meaningful penalty:
even with a trivial BPF program attached, throughput is within about
0.4% of upstream.

The use of congestion notifications improves the performance of HBM when
using Cubic. Without congestion notifications, Cubic will not decrease its
cwnd and HBM will need to drop a large percentage of the packets.

The following results were obtained with a 1Gbps rate limit and a
single netperf flow between two servers. We also show how
reducing the max delayed ACK timer can improve the performance when
using Cubic.

Command used was:
  ./do_hbm_test.sh -l -D --stats -N -r=<rate> [--no_cn] [dctcp] \
                   -s=<server running netserver>
  where:
     <rate>   is 1000
     --no_cn  specifies no cwr notifications
     dctcp    uses dctcp

                       Cubic                    DCTCP
Lim, DA      Mbps cwnd cred drops  Mbps cwnd cred drops
--------     ---- ---- ---- -----  ---- ---- ---- -----
  1G, 40       35  462 -320 67       995    1 -212  0.05
  1G, 40,cn   736    9  -78  0.07   995    1 -212  0.05
  1G,  5,cn   941    2 -189  0.13   995    1 -212  0.05

Notes:
  --no_cn has no effect with DCTCP
  Lim = rate limit
  DA = maximum delayed ACK timer
  cred = credit in packets
  drops = % packets dropped

v1->v2: Ensures that only BPF_CGROUP_INET_EGRESS can return values 2 and 3
        New egress values apply to all protocols, not just TCP
        Cleaned up patch 4, Update BPF_CGROUP_RUN_PROG_INET_EGRESS callers
        Removed changes to __tcp_transmit_skb (patch 5), no longer needed
        Removed sample use of EDT
v2->v3: Removed the probe timer related changes
v3->v4: Replaced preempt_enable_no_resched() by preempt_enable()
        in BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY() macro
====================
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
parents 399dc65e d58c6f72
@@ -552,6 +552,56 @@ _out: \
 		_ret;						\
 	})
 
+/* To be used by __cgroup_bpf_run_filter_skb for EGRESS BPF progs
+ * so BPF programs can request cwr for TCP packets.
+ *
+ * Current cgroup skb programs can only return 0 or 1 (0 to drop the
+ * packet. This macro changes the behavior so the low order bit
+ * indicates whether the packet should be dropped (0) or not (1)
+ * and the next bit is a congestion notification bit. This could be
+ * used by TCP to call tcp_enter_cwr()
+ *
+ * Hence, new allowed return values of CGROUP EGRESS BPF programs are:
+ *   0: drop packet
+ *   1: keep packet
+ *   2: drop packet and cn
+ *   3: keep packet and cn
+ *
+ * This macro then converts it to one of the NET_XMIT or an error
+ * code that is then interpreted as drop packet (and no cn):
+ *   0: NET_XMIT_SUCCESS  skb should be transmitted
+ *   1: NET_XMIT_DROP     skb should be dropped and cn
+ *   2: NET_XMIT_CN       skb should be transmitted and cn
+ *   3: -EPERM            skb should be dropped
+ */
+#define BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY(array, ctx, func)	\
+	({							\
+		struct bpf_prog_array_item *_item;		\
+		struct bpf_prog *_prog;				\
+		struct bpf_prog_array *_array;			\
+		u32 ret;					\
+		u32 _ret = 1;					\
+		u32 _cn = 0;					\
+		preempt_disable();				\
+		rcu_read_lock();				\
+		_array = rcu_dereference(array);		\
+		_item = &_array->items[0];			\
+		while ((_prog = READ_ONCE(_item->prog))) {	\
+			bpf_cgroup_storage_set(_item->cgroup_storage);	\
+			ret = func(_prog, ctx);			\
+			_ret &= (ret & 1);			\
+			_cn |= (ret & 2);			\
+			_item++;				\
+		}						\
+		rcu_read_unlock();				\
+		preempt_enable();				\
+		if (_ret)					\
+			_ret = (_cn ? NET_XMIT_CN : NET_XMIT_SUCCESS);	\
+		else						\
+			_ret = (_cn ? NET_XMIT_DROP : -EPERM);	\
+		_ret;						\
+	})
+
 #define BPF_PROG_RUN_ARRAY(array, ctx, func)		\
 	__BPF_PROG_RUN_ARRAY(array, ctx, func, false)
...
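As a worked example of the fold in the macro above (a minimal sketch in
plain user-space C, not kernel code): the low bit of each program's
return value is ANDed, so any one program can drop the packet, while the
cn bit is ORed, so any one program can request a congestion notification.

  #include <stdio.h>

  #define NET_XMIT_SUCCESS 0
  #define NET_XMIT_DROP    1
  #define NET_XMIT_CN      2

  /* Mirrors the macro's folding of per-program verdicts (0..3). */
  static int fold_verdicts(const unsigned int *rets, int n)
  {
          unsigned int keep = 1, cn = 0;

          for (int i = 0; i < n; i++) {
                  keep &= rets[i] & 1;    /* any "drop" wins */
                  cn |= rets[i] & 2;      /* any "cn" sticks */
          }
          if (keep)
                  return cn ? NET_XMIT_CN : NET_XMIT_SUCCESS;
          return cn ? NET_XMIT_DROP : -1; /* -1 stands in for -EPERM */
  }

  int main(void)
  {
          unsigned int rets[] = { 3 /* keep + cn */, 1 /* keep */ };

          printf("%d\n", fold_verdicts(rets, 2)); /* prints 2 (NET_XMIT_CN) */
          return 0;
  }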
@@ -526,7 +526,8 @@ struct bpf_prog {
 				blinded:1, /* Was blinded */
 				is_func:1, /* program is a bpf function */
 				kprobe_override:1, /* Do we override a kprobe? */
-				has_callchain_buf:1; /* callchain buffer allocated? */
+				has_callchain_buf:1, /* callchain buffer allocated? */
+				enforce_expected_attach_type:1; /* Enforce expected_attach_type checking at attach time */
 	enum bpf_prog_type	type;		/* Type of BPF program */
 	enum bpf_attach_type	expected_attach_type; /* For some prog types */
 	u32			len;		/* Number of filter blocks */
...
@@ -587,8 +587,16 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr,
  * The program type passed in via @type must be suitable for network
  * filtering. No further check is performed to assert that.
  *
- * This function will return %-EPERM if any if an attached program was found
- * and if it returned != 1 during execution. In all other cases, 0 is returned.
+ * For egress packets, this function can return:
+ *   NET_XMIT_SUCCESS    (0)	- continue with packet output
+ *   NET_XMIT_DROP       (1)	- drop packet and notify TCP to call cwr
+ *   NET_XMIT_CN         (2)	- continue with packet output and notify TCP
+ *				  to call cwr
+ *   -EPERM			- drop packet
+ *
+ * For ingress packets, this function will return -EPERM if any
+ * attached program was found and if it returned != 1 during execution.
+ * Otherwise 0 is returned.
  */
 int __cgroup_bpf_run_filter_skb(struct sock *sk,
 				struct sk_buff *skb,
@@ -614,12 +622,19 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,
 		/* compute pointers for the bpf prog */
 		bpf_compute_and_save_data_end(skb, &saved_data_end);
 
-		ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb,
-					 __bpf_prog_run_save_cb);
+		if (type == BPF_CGROUP_INET_EGRESS) {
+			ret = BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY(
+				cgrp->bpf.effective[type], skb,
+				__bpf_prog_run_save_cb);
+		} else {
+			ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb,
+						 __bpf_prog_run_save_cb);
+			ret = (ret == 1 ? 0 : -EPERM);
+		}
 		bpf_restore_data_end(skb, saved_data_end);
 		__skb_pull(skb, offset);
 		skb->sk = save_sk;
-		return ret == 1 ? 0 : -EPERM;
+
+		return ret;
 	}
 EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb);
...
@@ -1585,6 +1585,14 @@ bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type,
 		default:
 			return -EINVAL;
 		}
+	case BPF_PROG_TYPE_CGROUP_SKB:
+		switch (expected_attach_type) {
+		case BPF_CGROUP_INET_INGRESS:
+		case BPF_CGROUP_INET_EGRESS:
+			return 0;
+		default:
+			return -EINVAL;
+		}
 	default:
 		return 0;
 	}
@@ -1836,6 +1844,10 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
 	case BPF_PROG_TYPE_CGROUP_SOCK:
 	case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
 		return attach_type == prog->expected_attach_type ? 0 : -EINVAL;
+	case BPF_PROG_TYPE_CGROUP_SKB:
+		return prog->enforce_expected_attach_type &&
+		       prog->expected_attach_type != attach_type ?
+		       -EINVAL : 0;
 	default:
 		return 0;
 	}
...
@@ -5508,11 +5508,16 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
 
 static int check_return_code(struct bpf_verifier_env *env)
 {
+	struct tnum enforce_attach_type_range = tnum_unknown;
 	struct bpf_reg_state *reg;
 	struct tnum range = tnum_range(0, 1);
 
 	switch (env->prog->type) {
 	case BPF_PROG_TYPE_CGROUP_SKB:
+		if (env->prog->expected_attach_type == BPF_CGROUP_INET_EGRESS) {
+			range = tnum_range(0, 3);
+			enforce_attach_type_range = tnum_range(2, 3);
+		}
 	case BPF_PROG_TYPE_CGROUP_SOCK:
 	case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
 	case BPF_PROG_TYPE_SOCK_OPS:
@@ -5531,18 +5536,23 @@ static int check_return_code(struct bpf_verifier_env *env)
 	}
 
 	if (!tnum_in(range, reg->var_off)) {
+		char tn_buf[48];
+
 		verbose(env, "At program exit the register R0 ");
 		if (!tnum_is_unknown(reg->var_off)) {
-			char tn_buf[48];
-
 			tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
 			verbose(env, "has value %s", tn_buf);
 		} else {
 			verbose(env, "has unknown scalar value");
 		}
-		verbose(env, " should have been 0 or 1\n");
+		tnum_strn(tn_buf, sizeof(tn_buf), range);
+		verbose(env, " should have been %s\n", tn_buf);
 		return -EINVAL;
 	}
+
+	if (!tnum_is_unknown(enforce_attach_type_range) &&
+	    tnum_in(enforce_attach_type_range, reg->var_off))
+		env->prog->enforce_expected_attach_type = 1;
 	return 0;
 }
...
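Taken together with the syscall.c hunk above, the attach-time effect is:
a program the verifier proves may return 2 or 3 gets
enforce_expected_attach_type set, so it can only be attached where it
was declared. A minimal user-space sketch (the file descriptors here are
hypothetical; the libbpf call itself is real):

  #include <bpf/bpf.h>

  /* prog_fd: a cgroup/skb program loaded with
   * expected_attach_type = BPF_CGROUP_INET_EGRESS that can return 2 or 3.
   * cgroup_fd: an open cgroupv2 directory. */
  void attach_example(int prog_fd, int cgroup_fd)
  {
          /* Succeeds: attach type matches expected_attach_type. */
          bpf_prog_attach(prog_fd, cgroup_fd, BPF_CGROUP_INET_EGRESS, 0);

          /* Fails with -EINVAL: enforce_expected_attach_type is set
           * and the attach type differs. */
          bpf_prog_attach(prog_fd, cgroup_fd, BPF_CGROUP_INET_INGRESS, 0);
  }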
@@ -287,16 +287,9 @@ static int ip_finish_output_gso(struct net *net, struct sock *sk,
 	return ret;
 }
 
-static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+static int __ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 {
 	unsigned int mtu;
-	int ret;
-
-	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
-	if (ret) {
-		kfree_skb(skb);
-		return ret;
-	}
 
 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
 	/* Policy lookup after SNAT yielded a new policy */
@@ -315,18 +308,37 @@ static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 	return ip_finish_output2(net, sk, skb);
 }
 
+static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+	int ret;
+
+	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
+	switch (ret) {
+	case NET_XMIT_SUCCESS:
+		return __ip_finish_output(net, sk, skb);
+	case NET_XMIT_CN:
+		return __ip_finish_output(net, sk, skb) ? : ret;
+	default:
+		kfree_skb(skb);
+		return ret;
+	}
+}
+
 static int ip_mc_finish_output(struct net *net, struct sock *sk,
 			       struct sk_buff *skb)
 {
 	int ret;
 
 	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
-	if (ret) {
+	switch (ret) {
+	case NET_XMIT_SUCCESS:
+		return dev_loopback_xmit(net, sk, skb);
+	case NET_XMIT_CN:
+		return dev_loopback_xmit(net, sk, skb) ? : ret;
+	default:
 		kfree_skb(skb);
 		return ret;
 	}
-
-	return dev_loopback_xmit(net, sk, skb);
 }
 
 int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb)
...
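A note on the "? :" form above, which recurs in the multicast and IPv6
paths: it is the GNU conditional extension with the middle operand
omitted, evaluating its first operand once and returning it when
nonzero. So NET_XMIT_CN is reported to the caller only when the lower
layer itself returned success. Expanded, the CN case is equivalent to:

  /* Equivalent expansion of "__ip_finish_output(net, sk, skb) ? : ret"
   * in the NET_XMIT_CN case: */
  int rc = __ip_finish_output(net, sk, skb);

  return rc ? rc : ret;	/* propagate a real error, else report NET_XMIT_CN */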
@@ -128,16 +128,8 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
 	return -EINVAL;
 }
 
-static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 {
-	int ret;
-
-	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
-	if (ret) {
-		kfree_skb(skb);
-		return ret;
-	}
-
 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
 	/* Policy lookup after SNAT yielded a new policy */
 	if (skb_dst(skb)->xfrm) {
@@ -154,6 +146,22 @@ static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 	return ip6_finish_output2(net, sk, skb);
 }
 
+static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+	int ret;
+
+	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
+	switch (ret) {
+	case NET_XMIT_SUCCESS:
+		return __ip6_finish_output(net, sk, skb);
+	case NET_XMIT_CN:
+		return __ip6_finish_output(net, sk, skb) ? : ret;
+	default:
+		kfree_skb(skb);
+		return ret;
+	}
+}
+
 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 {
 	struct net_device *dev = skb_dst(skb)->dev;
...
@@ -13,10 +13,10 @@ Usage() {
   echo "egress or ingress bandwidht. It then uses iperf3 or netperf to create"
   echo "loads. The output is the goodput in Mbps (unless -D was used)."
   echo ""
-  echo "USAGE: $name [out] [-b=<prog>|--bpf=<prog>] [-c=<cc>|--cc=<cc>] [-D]"
-  echo "             [-d=<delay>|--delay=<delay>] [--debug] [-E]"
+  echo "USAGE: $name [out] [-b=<prog>|--bpf=<prog>] [-c=<cc>|--cc=<cc>]"
+  echo "             [-D] [-d=<delay>|--delay=<delay>] [--debug] [-E]"
   echo "             [-f=<#flows>|--flows=<#flows>] [-h] [-i=<id>|--id=<id >]"
-  echo "             [-l] [-N] [-p=<port>|--port=<port>] [-P]"
+  echo "             [-l] [-N] [--no_cn] [-p=<port>|--port=<port>] [-P]"
   echo "             [-q=<qdisc>] [-R] [-s=<server>|--server=<server]"
   echo "             [-S|--stats] -t=<time>|--time=<time>] [-w] [cubic|dctcp]"
   echo "  Where:"
@@ -33,6 +33,7 @@ Usage() {
   echo "  -f or --flows     number of concurrent flows (default=1)"
   echo "  -i or --id        cgroup id (an integer, default is 1)"
   echo "  -N                use netperf instead of iperf3"
+  echo "  --no_cn           Do not return CN notifications"
   echo "  -l                do not limit flows using loopback"
   echo "  -h                Help"
   echo "  -p or --port      iperf3 port (default is 5201)"
@@ -115,6 +116,9 @@ processArgs () {
     -c=*|--cc=*)
 	cc="${i#*=}"
 	;;
+    --no_cn)
+	flags="$flags --no_cn"
+	;;
     --debug)
 	flags="$flags -d"
 	debug_flag=1
...
@@ -16,6 +16,7 @@
  *    -l           Also limit flows doing loopback
  *    -n <#>       To create cgroup \"/hbm#\" and attach prog
  *                 Default is /hbm1
+ *    --no_cn      Do not return cn notifications
  *    -r <rate>    Rate limit in Mbps
  *    -s           Get HBM stats (marked, dropped, etc.)
  *    -t <time>    Exit after specified seconds (default is 0)
@@ -42,6 +43,7 @@
 #include <linux/bpf.h>
 #include <bpf/bpf.h>
+#include <getopt.h>
 
 #include "bpf_load.h"
 #include "bpf_rlimit.h"
@@ -59,6 +61,7 @@ bool stats_flag;
 bool loopback_flag;
 bool debugFlag;
 bool work_conserving_flag;
+bool no_cn_flag;
 
 static void Usage(void);
 static void read_trace_pipe2(void);
@@ -185,6 +188,7 @@ static int run_bpf_prog(char *prog, int cg_id)
 	qstats.rate = rate;
 	qstats.stats = stats_flag ? 1 : 0;
 	qstats.loopback = loopback_flag ? 1 : 0;
+	qstats.no_cn = no_cn_flag ? 1 : 0;
 	if (bpf_map_update_elem(map_fd, &key, &qstats, BPF_ANY)) {
 		printf("ERROR: Could not update map element\n");
 		goto err;
@@ -312,6 +316,14 @@ static int run_bpf_prog(char *prog, int cg_id)
 		double percent_pkts, percent_bytes;
 		char fname[100];
 		FILE *fout;
+		int k;
+		static const char *returnValNames[] = {
+			"DROP_PKT",
+			"ALLOW_PKT",
+			"DROP_PKT_CWR",
+			"ALLOW_PKT_CWR"
+		};
+#define RET_VAL_COUNT 4
 
 //		Future support of ingress
 //		if (!outFlag)
@@ -346,6 +358,31 @@ static int run_bpf_prog(char *prog, int cg_id)
 			(qstats.bytes_total + 1);
 		fprintf(fout, "pkts_dropped_percent:%6.2f\n", percent_pkts);
 		fprintf(fout, "bytes_dropped_percent:%6.2f\n", percent_bytes);
+
+		// ECN CE markings
+		percent_pkts = (qstats.pkts_ecn_ce * 100.0) /
+			(qstats.pkts_total + 1);
+		fprintf(fout, "pkts_ecn_ce:%6.2f (%d)\n", percent_pkts,
+			(int)qstats.pkts_ecn_ce);
+
+		// Average cwnd
+		fprintf(fout, "avg cwnd:%d\n",
+			(int)(qstats.sum_cwnd / (qstats.sum_cwnd_cnt + 1)));
+		// Average rtt
+		fprintf(fout, "avg rtt:%d\n",
+			(int)(qstats.sum_rtt / (qstats.pkts_total + 1)));
+		// Average credit
+		fprintf(fout, "avg credit:%d\n",
+			(int)(qstats.sum_credit /
+			      (1500 * ((int)qstats.pkts_total) + 1)));
+
+		// Return values stats
+		for (k = 0; k < RET_VAL_COUNT; k++) {
+			percent_pkts = (qstats.returnValCount[k] * 100.0) /
+				(qstats.pkts_total + 1);
+			fprintf(fout, "%s:%6.2f (%d)\n", returnValNames[k],
+				percent_pkts, (int)qstats.returnValCount[k]);
+		}
 		fclose(fout);
 	}
 
@@ -366,14 +403,15 @@ static void Usage(void)
 {
 	printf("This program loads a cgroup skb BPF program to enforce\n"
 	       "cgroup output (egress) bandwidth limits.\n\n"
-	       "USAGE: hbm [-o] [-d] [-l] [-n <id>] [-r <rate>] [-s]\n"
-	       "           [-t <secs>] [-w] [-h] [prog]\n"
+	       "USAGE: hbm [-o] [-d] [-l] [-n <id>] [--no_cn] [-r <rate>]\n"
+	       "           [-s] [-t <secs>] [-w] [-h] [prog]\n"
 	       "  Where:\n"
 	       "    -o         indicates egress direction (default)\n"
 	       "    -d         print BPF trace debug buffer\n"
 	       "    -l         also limit flows using loopback\n"
 	       "    -n <#>     to create cgroup \"/hbm#\" and attach prog\n"
 	       "               Default is /hbm1\n"
+	       "    --no_cn    disable CN notifcations\n"
	       "    -r <rate>  Rate in Mbps\n"
 	       "    -s         Update HBM stats\n"
 	       "    -t <time>  Exit after specified seconds (default is 0)\n"
@@ -393,9 +431,16 @@ int main(int argc, char **argv)
 	int k;
 	int cg_id = 1;
 	char *optstring = "iodln:r:st:wh";
+	struct option loptions[] = {
+		{"no_cn", 0, NULL, 1},
+		{NULL, 0, NULL, 0}
+	};
 
-	while ((k = getopt(argc, argv, optstring)) != -1) {
+	while ((k = getopt_long(argc, argv, optstring, loptions, NULL)) != -1) {
 		switch (k) {
+		case 1:
+			no_cn_flag = true;
+			break;
 		case'o':
 			break;
 		case 'd':
...
@@ -19,7 +19,8 @@ struct hbm_vqueue {
 struct hbm_queue_stats {
 	unsigned long rate;		/* in Mbps*/
 	unsigned long stats:1,		/* get HBM stats (marked, dropped,..) */
-		loopback:1;		/* also limit flows using loopback */
+		loopback:1,		/* also limit flows using loopback */
+		no_cn:1;		/* do not use cn flags */
 	unsigned long long pkts_marked;
 	unsigned long long bytes_marked;
 	unsigned long long pkts_dropped;
@@ -28,4 +29,10 @@ struct hbm_queue_stats {
 	unsigned long long bytes_total;
 	unsigned long long firstPacketTime;
 	unsigned long long lastPacketTime;
+	unsigned long long pkts_ecn_ce;
+	unsigned long long returnValCount[4];
+	unsigned long long sum_cwnd;
+	unsigned long long sum_rtt;
+	unsigned long long sum_cwnd_cnt;
+	long long sum_credit;
 };
...
@@ -65,17 +65,43 @@ struct bpf_map_def SEC("maps") queue_stats = {
 BPF_ANNOTATE_KV_PAIR(queue_stats, int, struct hbm_queue_stats);
 
 struct hbm_pkt_info {
+	int	cwnd;
+	int	rtt;
 	bool	is_ip;
 	bool	is_tcp;
 	short	ecn;
 };
 
+static int get_tcp_info(struct __sk_buff *skb, struct hbm_pkt_info *pkti)
+{
+	struct bpf_sock *sk;
+	struct bpf_tcp_sock *tp;
+
+	sk = skb->sk;
+	if (sk) {
+		sk = bpf_sk_fullsock(sk);
+		if (sk) {
+			if (sk->protocol == IPPROTO_TCP) {
+				tp = bpf_tcp_sock(sk);
+				if (tp) {
+					pkti->cwnd = tp->snd_cwnd;
+					pkti->rtt = tp->srtt_us >> 3;
+					return 0;
+				}
+			}
+		}
+	}
+	return 1;
+}
+
 static __always_inline void hbm_get_pkt_info(struct __sk_buff *skb,
 					     struct hbm_pkt_info *pkti)
 {
 	struct iphdr iph;
 	struct ipv6hdr *ip6h;
 
+	pkti->cwnd = 0;
+	pkti->rtt = 0;
 	bpf_skb_load_bytes(skb, 0, &iph, 12);
 	if (iph.version == 6) {
 		ip6h = (struct ipv6hdr *)&iph;
@@ -91,6 +117,8 @@ static __always_inline void hbm_get_pkt_info(struct __sk_buff *skb,
 		pkti->is_tcp = false;
 		pkti->ecn = 0;
 	}
+	if (pkti->is_tcp)
+		get_tcp_info(skb, pkti);
 }
 
 static __always_inline void hbm_init_vqueue(struct hbm_vqueue *qdp, int rate)
@@ -105,8 +133,14 @@ static __always_inline void hbm_update_stats(struct hbm_queue_stats *qsp,
 					     int len,
 					     unsigned long long curtime,
 					     bool congestion_flag,
-					     bool drop_flag)
+					     bool drop_flag,
+					     bool cwr_flag,
+					     bool ecn_ce_flag,
+					     struct hbm_pkt_info *pkti,
+					     int credit)
 {
+	int rv = ALLOW_PKT;
+
 	if (qsp != NULL) {
 		// Following is needed for work conserving
 		__sync_add_and_fetch(&(qsp->bytes_total), len);
@@ -116,7 +150,7 @@ static __always_inline void hbm_update_stats(struct hbm_queue_stats *qsp,
 				qsp->firstPacketTime = curtime;
 			qsp->lastPacketTime = curtime;
 			__sync_add_and_fetch(&(qsp->pkts_total), 1);
-			if (congestion_flag || drop_flag) {
+			if (congestion_flag) {
 				__sync_add_and_fetch(&(qsp->pkts_marked), 1);
 				__sync_add_and_fetch(&(qsp->bytes_marked), len);
 			}
@@ -125,6 +159,34 @@ static __always_inline void hbm_update_stats(struct hbm_queue_stats *qsp,
 				__sync_add_and_fetch(&(qsp->bytes_dropped),
 						     len);
 			}
+			if (ecn_ce_flag)
+				__sync_add_and_fetch(&(qsp->pkts_ecn_ce), 1);
+			if (pkti->cwnd) {
+				__sync_add_and_fetch(&(qsp->sum_cwnd),
+						     pkti->cwnd);
+				__sync_add_and_fetch(&(qsp->sum_cwnd_cnt), 1);
+			}
+			if (pkti->rtt)
+				__sync_add_and_fetch(&(qsp->sum_rtt),
+						     pkti->rtt);
+			__sync_add_and_fetch(&(qsp->sum_credit), credit);
+
+			if (drop_flag)
+				rv = DROP_PKT;
+			if (cwr_flag)
+				rv |= 2;
+			if (rv == DROP_PKT)
+				__sync_add_and_fetch(&(qsp->returnValCount[0]),
+						     1);
+			else if (rv == ALLOW_PKT)
+				__sync_add_and_fetch(&(qsp->returnValCount[1]),
+						     1);
+			else if (rv == 2)
+				__sync_add_and_fetch(&(qsp->returnValCount[2]),
+						     1);
+			else if (rv == 3)
+				__sync_add_and_fetch(&(qsp->returnValCount[3]),
+						     1);
 		}
 	}
 }
...
@@ -62,11 +62,12 @@ int _hbm_out_cg(struct __sk_buff *skb)
 	unsigned int queue_index = 0;
 	unsigned long long curtime;
 	int credit;
-	signed long long delta = 0, zero = 0;
+	signed long long delta = 0, new_credit;
 	int max_credit = MAX_CREDIT;
 	bool congestion_flag = false;
 	bool drop_flag = false;
 	bool cwr_flag = false;
+	bool ecn_ce_flag = false;
 	struct hbm_vqueue *qdp;
 	struct hbm_queue_stats *qsp = NULL;
 	int rv = ALLOW_PKT;
@@ -99,9 +100,11 @@ int _hbm_out_cg(struct __sk_buff *skb)
 	 */
 	if (delta > 0) {
 		qdp->lasttime = curtime;
-		credit += CREDIT_PER_NS(delta, qdp->rate);
-		if (credit > MAX_CREDIT)
+		new_credit = credit + CREDIT_PER_NS(delta, qdp->rate);
+		if (new_credit > MAX_CREDIT)
 			credit = MAX_CREDIT;
+		else
+			credit = new_credit;
 	}
 	credit -= len;
 	qdp->credit = credit;
@@ -119,13 +122,16 @@ int _hbm_out_cg(struct __sk_buff *skb)
 	// Set flags (drop, congestion, cwr)
 	// Dropping => we are congested, so ignore congestion flag
 	if (credit < -DROP_THRESH ||
-	    (len > LARGE_PKT_THRESH &&
-	     credit < -LARGE_PKT_DROP_THRESH)) {
-		// Very congested, set drop flag
+	    (len > LARGE_PKT_THRESH && credit < -LARGE_PKT_DROP_THRESH)) {
+		// Very congested, set drop packet
 		drop_flag = true;
+		if (pkti.ecn)
+			congestion_flag = true;
+		else if (pkti.is_tcp)
+			cwr_flag = true;
 	} else if (credit < 0) {
 		// Congested, set congestion flag
-		if (pkti.ecn) {
+		if (pkti.ecn || pkti.is_tcp) {
 			if (credit < -MARK_THRESH)
 				congestion_flag = true;
 			else
@@ -136,22 +142,38 @@ int _hbm_out_cg(struct __sk_buff *skb)
 	}
 
 	if (congestion_flag) {
-		if (!bpf_skb_ecn_set_ce(skb)) {
-			if (len > LARGE_PKT_THRESH) {
+		if (bpf_skb_ecn_set_ce(skb)) {
+			ecn_ce_flag = true;
+		} else {
+			if (pkti.is_tcp) {
+				unsigned int rand = bpf_get_prandom_u32();
+
+				if (-credit >= MARK_THRESH +
+				    (rand % MARK_REGION_SIZE)) {
+					// Do congestion control
+					cwr_flag = true;
+				}
+			} else if (len > LARGE_PKT_THRESH) {
 				// Problem if too many small packets?
 				drop_flag = true;
 			}
 		}
 	}
 
-	if (drop_flag)
-		rv = DROP_PKT;
+	if (qsp != NULL)
+		if (qsp->no_cn)
+			cwr_flag = false;
 
-	hbm_update_stats(qsp, len, curtime, congestion_flag, drop_flag);
+	hbm_update_stats(qsp, len, curtime, congestion_flag, drop_flag,
+			 cwr_flag, ecn_ce_flag, &pkti, credit);
 
-	if (rv == DROP_PKT)
+	if (drop_flag) {
 		__sync_add_and_fetch(&(qdp->credit), len);
+		rv = DROP_PKT;
+	}
 
+	if (cwr_flag)
+		rv |= 2;
 	return rv;
 }
 
 char _license[] SEC("license") = "GPL";