Commit bec13ba9 authored by Jakub Kicinski

Merge git://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf

Florian Westphal says:

====================
netfilter: conntrack and nf_tables bug fixes

The following patchset contains netfilter fixes for net.

Broken since 5.19:
  A few ancient connection tracking helpers assume TCP packets cannot
  exceed 64kb in size, but this isn't the case anymore with 5.19 when
  BIG TCP got merged, from myself.

Regressions since 5.19:
  1. 'conntrack -E expect' won't display anything because nfnetlink failed
     to enable events for expectations, only for normal conntrack events.

  2. partially revert change that added resched calls to a function that can
     be in atomic context.  Both broken and fixed up by myself.

Broken for several releases (up to original merge of nf_tables):
  Several fixes for nf_tables control plane, from Pablo.
  This fixes up resource leaks in error paths and adds more sanity
  checks for mutually exclusive attributes/flags.

Kconfig:
  NF_CONNTRACK_PROCFS is very old and doesn't provide all info provided
  via ctnetlink, so it should not default to y. From Geert Uytterhoeven.

Selftests:
  rework nft_flowtable.sh: it frequently indicated failure; the way it
  tried to detect an offload failure did not work reliably.

* git://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf:
  testing: selftests: nft_flowtable.sh: rework test to detect offload failure
  testing: selftests: nft_flowtable.sh: use random netns names
  netfilter: conntrack: NF_CONNTRACK_PROCFS should no longer default to y
  netfilter: nf_tables: check NFT_SET_CONCAT flag if field_count is specified
  netfilter: nf_tables: disallow NFT_SET_ELEM_CATCHALL and NFT_SET_ELEM_INTERVAL_END
  netfilter: nf_tables: NFTA_SET_ELEM_KEY_END requires concat and interval flags
  netfilter: nf_tables: validate NFTA_SET_ELEM_OBJREF based on NFT_SET_OBJECT flag
  netfilter: nf_tables: really skip inactive sets when allocating name
  netfilter: nfnetlink: re-enable conntrack expectation events
  netfilter: nf_tables: fix scheduling-while-atomic splat
  netfilter: nf_ct_irc: cap packet search space to 4k
  netfilter: nf_ct_ftp: prefer skb_linearize
  netfilter: nf_ct_h323: cap packet size at 64k
  netfilter: nf_ct_sane: remove pseudo skb linearization
  netfilter: nf_tables: possible module reference underflow in error path
  netfilter: nf_tables: disallow NFTA_SET_ELEM_KEY_END with NFT_SET_ELEM_INTERVAL_END flag
  netfilter: nf_tables: use READ_ONCE and WRITE_ONCE for shared generation id access
====================

Link: https://lore.kernel.org/r/20220817140015.25843-1-fw@strlen.de
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parents fc4aaf9f c8550b90
...@@ -95,7 +95,7 @@ struct nf_ip_net { ...@@ -95,7 +95,7 @@ struct nf_ip_net {
struct netns_ct { struct netns_ct {
#ifdef CONFIG_NF_CONNTRACK_EVENTS #ifdef CONFIG_NF_CONNTRACK_EVENTS
bool ctnetlink_has_listener; u8 ctnetlink_has_listener;
bool ecache_dwork_pending; bool ecache_dwork_pending;
#endif #endif
u8 sysctl_log_invalid; /* Log invalid packets */ u8 sysctl_log_invalid; /* Log invalid packets */
......
...@@ -144,7 +144,6 @@ config NF_CONNTRACK_ZONES ...@@ -144,7 +144,6 @@ config NF_CONNTRACK_ZONES
config NF_CONNTRACK_PROCFS config NF_CONNTRACK_PROCFS
bool "Supply CT list in procfs (OBSOLETE)" bool "Supply CT list in procfs (OBSOLETE)"
default y
depends on PROC_FS depends on PROC_FS
help help
This option enables for the list of known conntrack entries This option enables for the list of known conntrack entries
......
...@@ -34,11 +34,6 @@ MODULE_DESCRIPTION("ftp connection tracking helper"); ...@@ -34,11 +34,6 @@ MODULE_DESCRIPTION("ftp connection tracking helper");
MODULE_ALIAS("ip_conntrack_ftp"); MODULE_ALIAS("ip_conntrack_ftp");
MODULE_ALIAS_NFCT_HELPER(HELPER_NAME); MODULE_ALIAS_NFCT_HELPER(HELPER_NAME);
/* This is slow, but it's simple. --RR */
static char *ftp_buffer;
static DEFINE_SPINLOCK(nf_ftp_lock);
#define MAX_PORTS 8 #define MAX_PORTS 8
static u_int16_t ports[MAX_PORTS]; static u_int16_t ports[MAX_PORTS];
static unsigned int ports_c; static unsigned int ports_c;
...@@ -398,6 +393,9 @@ static int help(struct sk_buff *skb, ...@@ -398,6 +393,9 @@ static int help(struct sk_buff *skb,
return NF_ACCEPT; return NF_ACCEPT;
} }
if (unlikely(skb_linearize(skb)))
return NF_DROP;
th = skb_header_pointer(skb, protoff, sizeof(_tcph), &_tcph); th = skb_header_pointer(skb, protoff, sizeof(_tcph), &_tcph);
if (th == NULL) if (th == NULL)
return NF_ACCEPT; return NF_ACCEPT;
...@@ -411,12 +409,8 @@ static int help(struct sk_buff *skb, ...@@ -411,12 +409,8 @@ static int help(struct sk_buff *skb,
} }
datalen = skb->len - dataoff; datalen = skb->len - dataoff;
spin_lock_bh(&nf_ftp_lock); spin_lock_bh(&ct->lock);
fb_ptr = skb_header_pointer(skb, dataoff, datalen, ftp_buffer); fb_ptr = skb->data + dataoff;
if (!fb_ptr) {
spin_unlock_bh(&nf_ftp_lock);
return NF_ACCEPT;
}
ends_in_nl = (fb_ptr[datalen - 1] == '\n'); ends_in_nl = (fb_ptr[datalen - 1] == '\n');
seq = ntohl(th->seq) + datalen; seq = ntohl(th->seq) + datalen;
...@@ -544,7 +538,7 @@ static int help(struct sk_buff *skb, ...@@ -544,7 +538,7 @@ static int help(struct sk_buff *skb,
if (ends_in_nl) if (ends_in_nl)
update_nl_seq(ct, seq, ct_ftp_info, dir, skb); update_nl_seq(ct, seq, ct_ftp_info, dir, skb);
out: out:
spin_unlock_bh(&nf_ftp_lock); spin_unlock_bh(&ct->lock);
return ret; return ret;
} }
...@@ -571,7 +565,6 @@ static const struct nf_conntrack_expect_policy ftp_exp_policy = { ...@@ -571,7 +565,6 @@ static const struct nf_conntrack_expect_policy ftp_exp_policy = {
static void __exit nf_conntrack_ftp_fini(void) static void __exit nf_conntrack_ftp_fini(void)
{ {
nf_conntrack_helpers_unregister(ftp, ports_c * 2); nf_conntrack_helpers_unregister(ftp, ports_c * 2);
kfree(ftp_buffer);
} }
static int __init nf_conntrack_ftp_init(void) static int __init nf_conntrack_ftp_init(void)
...@@ -580,10 +573,6 @@ static int __init nf_conntrack_ftp_init(void) ...@@ -580,10 +573,6 @@ static int __init nf_conntrack_ftp_init(void)
NF_CT_HELPER_BUILD_BUG_ON(sizeof(struct nf_ct_ftp_master)); NF_CT_HELPER_BUILD_BUG_ON(sizeof(struct nf_ct_ftp_master));
ftp_buffer = kmalloc(65536, GFP_KERNEL);
if (!ftp_buffer)
return -ENOMEM;
if (ports_c == 0) if (ports_c == 0)
ports[ports_c++] = FTP_PORT; ports[ports_c++] = FTP_PORT;
...@@ -603,7 +592,6 @@ static int __init nf_conntrack_ftp_init(void) ...@@ -603,7 +592,6 @@ static int __init nf_conntrack_ftp_init(void)
ret = nf_conntrack_helpers_register(ftp, ports_c * 2); ret = nf_conntrack_helpers_register(ftp, ports_c * 2);
if (ret < 0) { if (ret < 0) {
pr_err("failed to register helpers\n"); pr_err("failed to register helpers\n");
kfree(ftp_buffer);
return ret; return ret;
} }
......
...@@ -34,6 +34,8 @@ ...@@ -34,6 +34,8 @@
#include <net/netfilter/nf_conntrack_zones.h> #include <net/netfilter/nf_conntrack_zones.h>
#include <linux/netfilter/nf_conntrack_h323.h> #include <linux/netfilter/nf_conntrack_h323.h>
#define H323_MAX_SIZE 65535
/* Parameters */ /* Parameters */
static unsigned int default_rrq_ttl __read_mostly = 300; static unsigned int default_rrq_ttl __read_mostly = 300;
module_param(default_rrq_ttl, uint, 0600); module_param(default_rrq_ttl, uint, 0600);
...@@ -86,6 +88,9 @@ static int get_tpkt_data(struct sk_buff *skb, unsigned int protoff, ...@@ -86,6 +88,9 @@ static int get_tpkt_data(struct sk_buff *skb, unsigned int protoff,
if (tcpdatalen <= 0) /* No TCP data */ if (tcpdatalen <= 0) /* No TCP data */
goto clear_out; goto clear_out;
if (tcpdatalen > H323_MAX_SIZE)
tcpdatalen = H323_MAX_SIZE;
if (*data == NULL) { /* first TPKT */ if (*data == NULL) { /* first TPKT */
/* Get first TPKT pointer */ /* Get first TPKT pointer */
tpkt = skb_header_pointer(skb, tcpdataoff, tcpdatalen, tpkt = skb_header_pointer(skb, tcpdataoff, tcpdatalen,
...@@ -1169,6 +1174,9 @@ static unsigned char *get_udp_data(struct sk_buff *skb, unsigned int protoff, ...@@ -1169,6 +1174,9 @@ static unsigned char *get_udp_data(struct sk_buff *skb, unsigned int protoff,
if (dataoff >= skb->len) if (dataoff >= skb->len)
return NULL; return NULL;
*datalen = skb->len - dataoff; *datalen = skb->len - dataoff;
if (*datalen > H323_MAX_SIZE)
*datalen = H323_MAX_SIZE;
return skb_header_pointer(skb, dataoff, *datalen, h323_buffer); return skb_header_pointer(skb, dataoff, *datalen, h323_buffer);
} }
...@@ -1770,7 +1778,7 @@ static int __init nf_conntrack_h323_init(void) ...@@ -1770,7 +1778,7 @@ static int __init nf_conntrack_h323_init(void)
NF_CT_HELPER_BUILD_BUG_ON(sizeof(struct nf_ct_h323_master)); NF_CT_HELPER_BUILD_BUG_ON(sizeof(struct nf_ct_h323_master));
h323_buffer = kmalloc(65536, GFP_KERNEL); h323_buffer = kmalloc(H323_MAX_SIZE + 1, GFP_KERNEL);
if (!h323_buffer) if (!h323_buffer)
return -ENOMEM; return -ENOMEM;
ret = h323_helper_init(); ret = h323_helper_init();
......
...@@ -39,6 +39,7 @@ unsigned int (*nf_nat_irc_hook)(struct sk_buff *skb, ...@@ -39,6 +39,7 @@ unsigned int (*nf_nat_irc_hook)(struct sk_buff *skb,
EXPORT_SYMBOL_GPL(nf_nat_irc_hook); EXPORT_SYMBOL_GPL(nf_nat_irc_hook);
#define HELPER_NAME "irc" #define HELPER_NAME "irc"
#define MAX_SEARCH_SIZE 4095
MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
MODULE_DESCRIPTION("IRC (DCC) connection tracking helper"); MODULE_DESCRIPTION("IRC (DCC) connection tracking helper");
...@@ -121,6 +122,7 @@ static int help(struct sk_buff *skb, unsigned int protoff, ...@@ -121,6 +122,7 @@ static int help(struct sk_buff *skb, unsigned int protoff,
int i, ret = NF_ACCEPT; int i, ret = NF_ACCEPT;
char *addr_beg_p, *addr_end_p; char *addr_beg_p, *addr_end_p;
typeof(nf_nat_irc_hook) nf_nat_irc; typeof(nf_nat_irc_hook) nf_nat_irc;
unsigned int datalen;
/* If packet is coming from IRC server */ /* If packet is coming from IRC server */
if (dir == IP_CT_DIR_REPLY) if (dir == IP_CT_DIR_REPLY)
...@@ -140,8 +142,12 @@ static int help(struct sk_buff *skb, unsigned int protoff, ...@@ -140,8 +142,12 @@ static int help(struct sk_buff *skb, unsigned int protoff,
if (dataoff >= skb->len) if (dataoff >= skb->len)
return NF_ACCEPT; return NF_ACCEPT;
datalen = skb->len - dataoff;
if (datalen > MAX_SEARCH_SIZE)
datalen = MAX_SEARCH_SIZE;
spin_lock_bh(&irc_buffer_lock); spin_lock_bh(&irc_buffer_lock);
ib_ptr = skb_header_pointer(skb, dataoff, skb->len - dataoff, ib_ptr = skb_header_pointer(skb, dataoff, datalen,
irc_buffer); irc_buffer);
if (!ib_ptr) { if (!ib_ptr) {
spin_unlock_bh(&irc_buffer_lock); spin_unlock_bh(&irc_buffer_lock);
...@@ -149,7 +155,7 @@ static int help(struct sk_buff *skb, unsigned int protoff, ...@@ -149,7 +155,7 @@ static int help(struct sk_buff *skb, unsigned int protoff,
} }
data = ib_ptr; data = ib_ptr;
data_limit = ib_ptr + skb->len - dataoff; data_limit = ib_ptr + datalen;
/* strlen("\1DCC SENT t AAAAAAAA P\1\n")=24 /* strlen("\1DCC SENT t AAAAAAAA P\1\n")=24
* 5+MINMATCHLEN+strlen("t AAAAAAAA P\1\n")=14 */ * 5+MINMATCHLEN+strlen("t AAAAAAAA P\1\n")=14 */
...@@ -251,7 +257,7 @@ static int __init nf_conntrack_irc_init(void) ...@@ -251,7 +257,7 @@ static int __init nf_conntrack_irc_init(void)
irc_exp_policy.max_expected = max_dcc_channels; irc_exp_policy.max_expected = max_dcc_channels;
irc_exp_policy.timeout = dcc_timeout; irc_exp_policy.timeout = dcc_timeout;
irc_buffer = kmalloc(65536, GFP_KERNEL); irc_buffer = kmalloc(MAX_SEARCH_SIZE + 1, GFP_KERNEL);
if (!irc_buffer) if (!irc_buffer)
return -ENOMEM; return -ENOMEM;
......
...@@ -34,10 +34,6 @@ MODULE_AUTHOR("Michal Schmidt <mschmidt@redhat.com>"); ...@@ -34,10 +34,6 @@ MODULE_AUTHOR("Michal Schmidt <mschmidt@redhat.com>");
MODULE_DESCRIPTION("SANE connection tracking helper"); MODULE_DESCRIPTION("SANE connection tracking helper");
MODULE_ALIAS_NFCT_HELPER(HELPER_NAME); MODULE_ALIAS_NFCT_HELPER(HELPER_NAME);
static char *sane_buffer;
static DEFINE_SPINLOCK(nf_sane_lock);
#define MAX_PORTS 8 #define MAX_PORTS 8
static u_int16_t ports[MAX_PORTS]; static u_int16_t ports[MAX_PORTS];
static unsigned int ports_c; static unsigned int ports_c;
...@@ -67,14 +63,16 @@ static int help(struct sk_buff *skb, ...@@ -67,14 +63,16 @@ static int help(struct sk_buff *skb,
unsigned int dataoff, datalen; unsigned int dataoff, datalen;
const struct tcphdr *th; const struct tcphdr *th;
struct tcphdr _tcph; struct tcphdr _tcph;
void *sb_ptr;
int ret = NF_ACCEPT; int ret = NF_ACCEPT;
int dir = CTINFO2DIR(ctinfo); int dir = CTINFO2DIR(ctinfo);
struct nf_ct_sane_master *ct_sane_info = nfct_help_data(ct); struct nf_ct_sane_master *ct_sane_info = nfct_help_data(ct);
struct nf_conntrack_expect *exp; struct nf_conntrack_expect *exp;
struct nf_conntrack_tuple *tuple; struct nf_conntrack_tuple *tuple;
struct sane_request *req;
struct sane_reply_net_start *reply; struct sane_reply_net_start *reply;
union {
struct sane_request req;
struct sane_reply_net_start repl;
} buf;
/* Until there's been traffic both ways, don't look in packets. */ /* Until there's been traffic both ways, don't look in packets. */
if (ctinfo != IP_CT_ESTABLISHED && if (ctinfo != IP_CT_ESTABLISHED &&
...@@ -92,59 +90,62 @@ static int help(struct sk_buff *skb, ...@@ -92,59 +90,62 @@ static int help(struct sk_buff *skb,
return NF_ACCEPT; return NF_ACCEPT;
datalen = skb->len - dataoff; datalen = skb->len - dataoff;
spin_lock_bh(&nf_sane_lock);
sb_ptr = skb_header_pointer(skb, dataoff, datalen, sane_buffer);
if (!sb_ptr) {
spin_unlock_bh(&nf_sane_lock);
return NF_ACCEPT;
}
if (dir == IP_CT_DIR_ORIGINAL) { if (dir == IP_CT_DIR_ORIGINAL) {
const struct sane_request *req;
if (datalen != sizeof(struct sane_request)) if (datalen != sizeof(struct sane_request))
goto out; return NF_ACCEPT;
req = skb_header_pointer(skb, dataoff, datalen, &buf.req);
if (!req)
return NF_ACCEPT;
req = sb_ptr;
if (req->RPC_code != htonl(SANE_NET_START)) { if (req->RPC_code != htonl(SANE_NET_START)) {
/* Not an interesting command */ /* Not an interesting command */
ct_sane_info->state = SANE_STATE_NORMAL; WRITE_ONCE(ct_sane_info->state, SANE_STATE_NORMAL);
goto out; return NF_ACCEPT;
} }
/* We're interested in the next reply */ /* We're interested in the next reply */
ct_sane_info->state = SANE_STATE_START_REQUESTED; WRITE_ONCE(ct_sane_info->state, SANE_STATE_START_REQUESTED);
goto out; return NF_ACCEPT;
} }
/* IP_CT_DIR_REPLY */
/* Is it a reply to an uninteresting command? */ /* Is it a reply to an uninteresting command? */
if (ct_sane_info->state != SANE_STATE_START_REQUESTED) if (READ_ONCE(ct_sane_info->state) != SANE_STATE_START_REQUESTED)
goto out; return NF_ACCEPT;
/* It's a reply to SANE_NET_START. */ /* It's a reply to SANE_NET_START. */
ct_sane_info->state = SANE_STATE_NORMAL; WRITE_ONCE(ct_sane_info->state, SANE_STATE_NORMAL);
if (datalen < sizeof(struct sane_reply_net_start)) { if (datalen < sizeof(struct sane_reply_net_start)) {
pr_debug("NET_START reply too short\n"); pr_debug("NET_START reply too short\n");
goto out; return NF_ACCEPT;
} }
reply = sb_ptr; datalen = sizeof(struct sane_reply_net_start);
reply = skb_header_pointer(skb, dataoff, datalen, &buf.repl);
if (!reply)
return NF_ACCEPT;
if (reply->status != htonl(SANE_STATUS_SUCCESS)) { if (reply->status != htonl(SANE_STATUS_SUCCESS)) {
/* saned refused the command */ /* saned refused the command */
pr_debug("unsuccessful SANE_STATUS = %u\n", pr_debug("unsuccessful SANE_STATUS = %u\n",
ntohl(reply->status)); ntohl(reply->status));
goto out; return NF_ACCEPT;
} }
/* Invalid saned reply? Ignore it. */ /* Invalid saned reply? Ignore it. */
if (reply->zero != 0) if (reply->zero != 0)
goto out; return NF_ACCEPT;
exp = nf_ct_expect_alloc(ct); exp = nf_ct_expect_alloc(ct);
if (exp == NULL) { if (exp == NULL) {
nf_ct_helper_log(skb, ct, "cannot alloc expectation"); nf_ct_helper_log(skb, ct, "cannot alloc expectation");
ret = NF_DROP; return NF_DROP;
goto out;
} }
tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
...@@ -162,9 +163,6 @@ static int help(struct sk_buff *skb, ...@@ -162,9 +163,6 @@ static int help(struct sk_buff *skb,
} }
nf_ct_expect_put(exp); nf_ct_expect_put(exp);
out:
spin_unlock_bh(&nf_sane_lock);
return ret; return ret;
} }
...@@ -178,7 +176,6 @@ static const struct nf_conntrack_expect_policy sane_exp_policy = { ...@@ -178,7 +176,6 @@ static const struct nf_conntrack_expect_policy sane_exp_policy = {
static void __exit nf_conntrack_sane_fini(void) static void __exit nf_conntrack_sane_fini(void)
{ {
nf_conntrack_helpers_unregister(sane, ports_c * 2); nf_conntrack_helpers_unregister(sane, ports_c * 2);
kfree(sane_buffer);
} }
static int __init nf_conntrack_sane_init(void) static int __init nf_conntrack_sane_init(void)
...@@ -187,10 +184,6 @@ static int __init nf_conntrack_sane_init(void) ...@@ -187,10 +184,6 @@ static int __init nf_conntrack_sane_init(void)
NF_CT_HELPER_BUILD_BUG_ON(sizeof(struct nf_ct_sane_master)); NF_CT_HELPER_BUILD_BUG_ON(sizeof(struct nf_ct_sane_master));
sane_buffer = kmalloc(65536, GFP_KERNEL);
if (!sane_buffer)
return -ENOMEM;
if (ports_c == 0) if (ports_c == 0)
ports[ports_c++] = SANE_PORT; ports[ports_c++] = SANE_PORT;
...@@ -210,7 +203,6 @@ static int __init nf_conntrack_sane_init(void) ...@@ -210,7 +203,6 @@ static int __init nf_conntrack_sane_init(void)
ret = nf_conntrack_helpers_register(sane, ports_c * 2); ret = nf_conntrack_helpers_register(sane, ports_c * 2);
if (ret < 0) { if (ret < 0) {
pr_err("failed to register helpers\n"); pr_err("failed to register helpers\n");
kfree(sane_buffer);
return ret; return ret;
} }
......
...@@ -889,7 +889,7 @@ static int nf_tables_dump_tables(struct sk_buff *skb, ...@@ -889,7 +889,7 @@ static int nf_tables_dump_tables(struct sk_buff *skb,
rcu_read_lock(); rcu_read_lock();
nft_net = nft_pernet(net); nft_net = nft_pernet(net);
cb->seq = nft_net->base_seq; cb->seq = READ_ONCE(nft_net->base_seq);
list_for_each_entry_rcu(table, &nft_net->tables, list) { list_for_each_entry_rcu(table, &nft_net->tables, list) {
if (family != NFPROTO_UNSPEC && family != table->family) if (family != NFPROTO_UNSPEC && family != table->family)
...@@ -1705,7 +1705,7 @@ static int nf_tables_dump_chains(struct sk_buff *skb, ...@@ -1705,7 +1705,7 @@ static int nf_tables_dump_chains(struct sk_buff *skb,
rcu_read_lock(); rcu_read_lock();
nft_net = nft_pernet(net); nft_net = nft_pernet(net);
cb->seq = nft_net->base_seq; cb->seq = READ_ONCE(nft_net->base_seq);
list_for_each_entry_rcu(table, &nft_net->tables, list) { list_for_each_entry_rcu(table, &nft_net->tables, list) {
if (family != NFPROTO_UNSPEC && family != table->family) if (family != NFPROTO_UNSPEC && family != table->family)
...@@ -3149,7 +3149,7 @@ static int nf_tables_dump_rules(struct sk_buff *skb, ...@@ -3149,7 +3149,7 @@ static int nf_tables_dump_rules(struct sk_buff *skb,
rcu_read_lock(); rcu_read_lock();
nft_net = nft_pernet(net); nft_net = nft_pernet(net);
cb->seq = nft_net->base_seq; cb->seq = READ_ONCE(nft_net->base_seq);
list_for_each_entry_rcu(table, &nft_net->tables, list) { list_for_each_entry_rcu(table, &nft_net->tables, list) {
if (family != NFPROTO_UNSPEC && family != table->family) if (family != NFPROTO_UNSPEC && family != table->family)
...@@ -3907,7 +3907,7 @@ static int nf_tables_set_alloc_name(struct nft_ctx *ctx, struct nft_set *set, ...@@ -3907,7 +3907,7 @@ static int nf_tables_set_alloc_name(struct nft_ctx *ctx, struct nft_set *set,
list_for_each_entry(i, &ctx->table->sets, list) { list_for_each_entry(i, &ctx->table->sets, list) {
int tmp; int tmp;
if (!nft_is_active_next(ctx->net, set)) if (!nft_is_active_next(ctx->net, i))
continue; continue;
if (!sscanf(i->name, name, &tmp)) if (!sscanf(i->name, name, &tmp))
continue; continue;
...@@ -4133,7 +4133,7 @@ static int nf_tables_dump_sets(struct sk_buff *skb, struct netlink_callback *cb) ...@@ -4133,7 +4133,7 @@ static int nf_tables_dump_sets(struct sk_buff *skb, struct netlink_callback *cb)
rcu_read_lock(); rcu_read_lock();
nft_net = nft_pernet(net); nft_net = nft_pernet(net);
cb->seq = nft_net->base_seq; cb->seq = READ_ONCE(nft_net->base_seq);
list_for_each_entry_rcu(table, &nft_net->tables, list) { list_for_each_entry_rcu(table, &nft_net->tables, list) {
if (ctx->family != NFPROTO_UNSPEC && if (ctx->family != NFPROTO_UNSPEC &&
...@@ -4451,6 +4451,11 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, ...@@ -4451,6 +4451,11 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info,
err = nf_tables_set_desc_parse(&desc, nla[NFTA_SET_DESC]); err = nf_tables_set_desc_parse(&desc, nla[NFTA_SET_DESC]);
if (err < 0) if (err < 0)
return err; return err;
if (desc.field_count > 1 && !(flags & NFT_SET_CONCAT))
return -EINVAL;
} else if (flags & NFT_SET_CONCAT) {
return -EINVAL;
} }
if (nla[NFTA_SET_EXPR] || nla[NFTA_SET_EXPRESSIONS]) if (nla[NFTA_SET_EXPR] || nla[NFTA_SET_EXPRESSIONS])
...@@ -5061,6 +5066,8 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) ...@@ -5061,6 +5066,8 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb)
rcu_read_lock(); rcu_read_lock();
nft_net = nft_pernet(net); nft_net = nft_pernet(net);
cb->seq = READ_ONCE(nft_net->base_seq);
list_for_each_entry_rcu(table, &nft_net->tables, list) { list_for_each_entry_rcu(table, &nft_net->tables, list) {
if (dump_ctx->ctx.family != NFPROTO_UNSPEC && if (dump_ctx->ctx.family != NFPROTO_UNSPEC &&
dump_ctx->ctx.family != table->family) dump_ctx->ctx.family != table->family)
...@@ -5196,6 +5203,9 @@ static int nft_setelem_parse_flags(const struct nft_set *set, ...@@ -5196,6 +5203,9 @@ static int nft_setelem_parse_flags(const struct nft_set *set,
if (!(set->flags & NFT_SET_INTERVAL) && if (!(set->flags & NFT_SET_INTERVAL) &&
*flags & NFT_SET_ELEM_INTERVAL_END) *flags & NFT_SET_ELEM_INTERVAL_END)
return -EINVAL; return -EINVAL;
if ((*flags & (NFT_SET_ELEM_INTERVAL_END | NFT_SET_ELEM_CATCHALL)) ==
(NFT_SET_ELEM_INTERVAL_END | NFT_SET_ELEM_CATCHALL))
return -EINVAL;
return 0; return 0;
} }
...@@ -5599,7 +5609,7 @@ int nft_set_elem_expr_clone(const struct nft_ctx *ctx, struct nft_set *set, ...@@ -5599,7 +5609,7 @@ int nft_set_elem_expr_clone(const struct nft_ctx *ctx, struct nft_set *set,
err = nft_expr_clone(expr, set->exprs[i]); err = nft_expr_clone(expr, set->exprs[i]);
if (err < 0) { if (err < 0) {
nft_expr_destroy(ctx, expr); kfree(expr);
goto err_expr; goto err_expr;
} }
expr_array[i] = expr; expr_array[i] = expr;
...@@ -5842,6 +5852,24 @@ static void nft_setelem_remove(const struct net *net, ...@@ -5842,6 +5852,24 @@ static void nft_setelem_remove(const struct net *net,
set->ops->remove(net, set, elem); set->ops->remove(net, set, elem);
} }
/* Check whether NFTA_SET_ELEM_KEY_END is used consistently with the set
 * flags and the element flags.
 *
 * For a set that has both NFT_SET_CONCAT and NFT_SET_INTERVAL set, the
 * element must not carry NFT_SET_ELEM_INTERVAL_END, and it must provide
 * NFTA_SET_ELEM_KEY_END unless NFT_SET_ELEM_CATCHALL is set.
 * For every other set the NFTA_SET_ELEM_KEY_END attribute must be absent.
 *
 * Returns true if the combination is acceptable, false otherwise.
 */
static bool nft_setelem_valid_key_end(const struct nft_set *set,
				      struct nlattr **nla, u32 flags)
{
	const u32 concat_interval = NFT_SET_CONCAT | NFT_SET_INTERVAL;
	const bool has_key_end = !!nla[NFTA_SET_ELEM_KEY_END];

	/* Not a concatenated interval set: a key end is never allowed. */
	if ((set->flags & concat_interval) != concat_interval)
		return !has_key_end;

	/* Concatenated interval set: the interval-end flag is rejected. */
	if (flags & NFT_SET_ELEM_INTERVAL_END)
		return false;

	/* A key end is required, except for the catch-all element. */
	return has_key_end || (flags & NFT_SET_ELEM_CATCHALL);
}
static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
const struct nlattr *attr, u32 nlmsg_flags) const struct nlattr *attr, u32 nlmsg_flags)
{ {
...@@ -5892,6 +5920,18 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, ...@@ -5892,6 +5920,18 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
return -EINVAL; return -EINVAL;
} }
if (set->flags & NFT_SET_OBJECT) {
if (!nla[NFTA_SET_ELEM_OBJREF] &&
!(flags & NFT_SET_ELEM_INTERVAL_END))
return -EINVAL;
} else {
if (nla[NFTA_SET_ELEM_OBJREF])
return -EINVAL;
}
if (!nft_setelem_valid_key_end(set, nla, flags))
return -EINVAL;
if ((flags & NFT_SET_ELEM_INTERVAL_END) && if ((flags & NFT_SET_ELEM_INTERVAL_END) &&
(nla[NFTA_SET_ELEM_DATA] || (nla[NFTA_SET_ELEM_DATA] ||
nla[NFTA_SET_ELEM_OBJREF] || nla[NFTA_SET_ELEM_OBJREF] ||
...@@ -5899,6 +5939,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, ...@@ -5899,6 +5939,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
nla[NFTA_SET_ELEM_EXPIRATION] || nla[NFTA_SET_ELEM_EXPIRATION] ||
nla[NFTA_SET_ELEM_USERDATA] || nla[NFTA_SET_ELEM_USERDATA] ||
nla[NFTA_SET_ELEM_EXPR] || nla[NFTA_SET_ELEM_EXPR] ||
nla[NFTA_SET_ELEM_KEY_END] ||
nla[NFTA_SET_ELEM_EXPRESSIONS])) nla[NFTA_SET_ELEM_EXPRESSIONS]))
return -EINVAL; return -EINVAL;
...@@ -6029,10 +6070,6 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, ...@@ -6029,10 +6070,6 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
} }
if (nla[NFTA_SET_ELEM_OBJREF] != NULL) { if (nla[NFTA_SET_ELEM_OBJREF] != NULL) {
if (!(set->flags & NFT_SET_OBJECT)) {
err = -EINVAL;
goto err_parse_key_end;
}
obj = nft_obj_lookup(ctx->net, ctx->table, obj = nft_obj_lookup(ctx->net, ctx->table,
nla[NFTA_SET_ELEM_OBJREF], nla[NFTA_SET_ELEM_OBJREF],
set->objtype, genmask); set->objtype, genmask);
...@@ -6325,6 +6362,9 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, ...@@ -6325,6 +6362,9 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set,
if (!nla[NFTA_SET_ELEM_KEY] && !(flags & NFT_SET_ELEM_CATCHALL)) if (!nla[NFTA_SET_ELEM_KEY] && !(flags & NFT_SET_ELEM_CATCHALL))
return -EINVAL; return -EINVAL;
if (!nft_setelem_valid_key_end(set, nla, flags))
return -EINVAL;
nft_set_ext_prepare(&tmpl); nft_set_ext_prepare(&tmpl);
if (flags != 0) { if (flags != 0) {
...@@ -6941,7 +6981,7 @@ static int nf_tables_dump_obj(struct sk_buff *skb, struct netlink_callback *cb) ...@@ -6941,7 +6981,7 @@ static int nf_tables_dump_obj(struct sk_buff *skb, struct netlink_callback *cb)
rcu_read_lock(); rcu_read_lock();
nft_net = nft_pernet(net); nft_net = nft_pernet(net);
cb->seq = nft_net->base_seq; cb->seq = READ_ONCE(nft_net->base_seq);
list_for_each_entry_rcu(table, &nft_net->tables, list) { list_for_each_entry_rcu(table, &nft_net->tables, list) {
if (family != NFPROTO_UNSPEC && family != table->family) if (family != NFPROTO_UNSPEC && family != table->family)
...@@ -7873,7 +7913,7 @@ static int nf_tables_dump_flowtable(struct sk_buff *skb, ...@@ -7873,7 +7913,7 @@ static int nf_tables_dump_flowtable(struct sk_buff *skb,
rcu_read_lock(); rcu_read_lock();
nft_net = nft_pernet(net); nft_net = nft_pernet(net);
cb->seq = nft_net->base_seq; cb->seq = READ_ONCE(nft_net->base_seq);
list_for_each_entry_rcu(table, &nft_net->tables, list) { list_for_each_entry_rcu(table, &nft_net->tables, list) {
if (family != NFPROTO_UNSPEC && family != table->family) if (family != NFPROTO_UNSPEC && family != table->family)
...@@ -8806,6 +8846,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) ...@@ -8806,6 +8846,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
struct nft_trans_elem *te; struct nft_trans_elem *te;
struct nft_chain *chain; struct nft_chain *chain;
struct nft_table *table; struct nft_table *table;
unsigned int base_seq;
LIST_HEAD(adl); LIST_HEAD(adl);
int err; int err;
...@@ -8855,9 +8896,12 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) ...@@ -8855,9 +8896,12 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
* Bump generation counter, invalidate any dump in progress. * Bump generation counter, invalidate any dump in progress.
* Cannot fail after this point. * Cannot fail after this point.
*/ */
while (++nft_net->base_seq == 0) base_seq = READ_ONCE(nft_net->base_seq);
while (++base_seq == 0)
; ;
WRITE_ONCE(nft_net->base_seq, base_seq);
/* step 3. Start new generation, rules_gen_X now in use. */ /* step 3. Start new generation, rules_gen_X now in use. */
net->nft.gencursor = nft_gencursor_next(net); net->nft.gencursor = nft_gencursor_next(net);
...@@ -9419,13 +9463,9 @@ static int nf_tables_check_loops(const struct nft_ctx *ctx, ...@@ -9419,13 +9463,9 @@ static int nf_tables_check_loops(const struct nft_ctx *ctx,
break; break;
} }
} }
cond_resched();
} }
list_for_each_entry(set, &ctx->table->sets, list) { list_for_each_entry(set, &ctx->table->sets, list) {
cond_resched();
if (!nft_is_active_next(ctx->net, set)) if (!nft_is_active_next(ctx->net, set))
continue; continue;
if (!(set->flags & NFT_SET_MAP) || if (!(set->flags & NFT_SET_MAP) ||
......
...@@ -44,6 +44,10 @@ MODULE_DESCRIPTION("Netfilter messages via netlink socket"); ...@@ -44,6 +44,10 @@ MODULE_DESCRIPTION("Netfilter messages via netlink socket");
static unsigned int nfnetlink_pernet_id __read_mostly; static unsigned int nfnetlink_pernet_id __read_mostly;
#ifdef CONFIG_NF_CONNTRACK_EVENTS
static DEFINE_SPINLOCK(nfnl_grp_active_lock);
#endif
struct nfnl_net { struct nfnl_net {
struct sock *nfnl; struct sock *nfnl;
}; };
...@@ -654,6 +658,44 @@ static void nfnetlink_rcv(struct sk_buff *skb) ...@@ -654,6 +658,44 @@ static void nfnetlink_rcv(struct sk_buff *skb)
netlink_rcv_skb(skb, nfnetlink_rcv_msg); netlink_rcv_skb(skb, nfnetlink_rcv_msg);
} }
/* Record in net->ct.ctnetlink_has_listener that @group has been bound.
 *
 * Only groups that map to NFNL_SUBSYS_CTNETLINK or
 * NFNL_SUBSYS_CTNETLINK_EXP are tracked; each such group owns the bit
 * (1 << group) of the u8 listener mask.  The whole body is compiled out
 * when CONFIG_NF_CONNTRACK_EVENTS is not defined.
 */
static void nfnetlink_bind_event(struct net *net, unsigned int group)
{
#ifdef CONFIG_NF_CONNTRACK_EVENTS
	int subsys, mask;
	u8 listeners;

	/* The listener mask is a u8, so only groups 0..7 have a bit in it.
	 * Groups outside that range are ignored.
	 */
	if (group >= 8)
		return;

	subsys = nfnl_group2type[group];
	if (subsys != NFNL_SUBSYS_CTNETLINK &&
	    subsys != NFNL_SUBSYS_CTNETLINK_EXP)
		return;

	mask = 1 << group;

	spin_lock(&nfnl_grp_active_lock);
	listeners = READ_ONCE(net->ct.ctnetlink_has_listener);
	if (!(listeners & mask)) {
		listeners |= mask;

		/* The mask is read concurrently without
		 * nfnl_grp_active_lock held, hence WRITE_ONCE().
		 */
		WRITE_ONCE(net->ct.ctnetlink_has_listener, listeners);
	}
	spin_unlock(&nfnl_grp_active_lock);
#endif
}
static int nfnetlink_bind(struct net *net, int group) static int nfnetlink_bind(struct net *net, int group)
{ {
const struct nfnetlink_subsystem *ss; const struct nfnetlink_subsystem *ss;
...@@ -670,28 +712,45 @@ static int nfnetlink_bind(struct net *net, int group) ...@@ -670,28 +712,45 @@ static int nfnetlink_bind(struct net *net, int group)
if (!ss) if (!ss)
request_module_nowait("nfnetlink-subsys-%d", type); request_module_nowait("nfnetlink-subsys-%d", type);
#ifdef CONFIG_NF_CONNTRACK_EVENTS nfnetlink_bind_event(net, group);
if (type == NFNL_SUBSYS_CTNETLINK) {
nfnl_lock(NFNL_SUBSYS_CTNETLINK);
WRITE_ONCE(net->ct.ctnetlink_has_listener, true);
nfnl_unlock(NFNL_SUBSYS_CTNETLINK);
}
#endif
return 0; return 0;
} }
static void nfnetlink_unbind(struct net *net, int group) static void nfnetlink_unbind(struct net *net, int group)
{ {
#ifdef CONFIG_NF_CONNTRACK_EVENTS #ifdef CONFIG_NF_CONNTRACK_EVENTS
int type, group_bit;
if (group <= NFNLGRP_NONE || group > NFNLGRP_MAX) if (group <= NFNLGRP_NONE || group > NFNLGRP_MAX)
return; return;
if (nfnl_group2type[group] == NFNL_SUBSYS_CTNETLINK) { type = nfnl_group2type[group];
nfnl_lock(NFNL_SUBSYS_CTNETLINK);
if (!nfnetlink_has_listeners(net, group)) switch (type) {
WRITE_ONCE(net->ct.ctnetlink_has_listener, false); case NFNL_SUBSYS_CTNETLINK:
nfnl_unlock(NFNL_SUBSYS_CTNETLINK); break;
case NFNL_SUBSYS_CTNETLINK_EXP:
break;
default:
return;
}
/* ctnetlink_has_listener is u8 */
if (group >= 8)
return;
group_bit = (1 << group);
spin_lock(&nfnl_grp_active_lock);
if (!nfnetlink_has_listeners(net, group)) {
u8 v = READ_ONCE(net->ct.ctnetlink_has_listener);
v &= ~group_bit;
/* read concurrently without nfnl_grp_active_lock held. */
WRITE_ONCE(net->ct.ctnetlink_has_listener, v);
} }
spin_unlock(&nfnl_grp_active_lock);
#endif #endif
} }
......
...@@ -14,13 +14,17 @@ ...@@ -14,13 +14,17 @@
# nft_flowtable.sh -o8000 -l1500 -r2000 # nft_flowtable.sh -o8000 -l1500 -r2000
# #
sfx=$(mktemp -u "XXXXXXXX")
ns1="ns1-$sfx"
ns2="ns2-$sfx"
nsr1="nsr1-$sfx"
nsr2="nsr2-$sfx"
# Kselftest framework requirement - SKIP code is 4. # Kselftest framework requirement - SKIP code is 4.
ksft_skip=4 ksft_skip=4
ret=0 ret=0
ns1in="" nsin=""
ns2in=""
ns1out="" ns1out=""
ns2out="" ns2out=""
...@@ -36,21 +40,19 @@ checktool (){ ...@@ -36,21 +40,19 @@ checktool (){
checktool "nft --version" "run test without nft tool" checktool "nft --version" "run test without nft tool"
checktool "ip -Version" "run test without ip tool" checktool "ip -Version" "run test without ip tool"
checktool "which nc" "run test without nc (netcat)" checktool "which nc" "run test without nc (netcat)"
checktool "ip netns add nsr1" "create net namespace" checktool "ip netns add $nsr1" "create net namespace $nsr1"
ip netns add ns1 ip netns add $ns1
ip netns add ns2 ip netns add $ns2
ip netns add $nsr2
ip netns add nsr2
cleanup() { cleanup() {
for i in 1 2; do ip netns del $ns1
ip netns del ns$i ip netns del $ns2
ip netns del nsr$i ip netns del $nsr1
done ip netns del $nsr2
rm -f "$ns1in" "$ns1out" rm -f "$nsin" "$ns1out" "$ns2out"
rm -f "$ns2in" "$ns2out"
[ $log_netns -eq 0 ] && sysctl -q net.netfilter.nf_log_all_netns=$log_netns [ $log_netns -eq 0 ] && sysctl -q net.netfilter.nf_log_all_netns=$log_netns
} }
...@@ -59,22 +61,21 @@ trap cleanup EXIT ...@@ -59,22 +61,21 @@ trap cleanup EXIT
sysctl -q net.netfilter.nf_log_all_netns=1 sysctl -q net.netfilter.nf_log_all_netns=1
ip link add veth0 netns nsr1 type veth peer name eth0 netns ns1 ip link add veth0 netns $nsr1 type veth peer name eth0 netns $ns1
ip link add veth1 netns nsr1 type veth peer name veth0 netns nsr2 ip link add veth1 netns $nsr1 type veth peer name veth0 netns $nsr2
ip link add veth1 netns nsr2 type veth peer name eth0 netns ns2 ip link add veth1 netns $nsr2 type veth peer name eth0 netns $ns2
for dev in lo veth0 veth1; do for dev in lo veth0 veth1; do
for i in 1 2; do ip -net $nsr1 link set $dev up
ip -net nsr$i link set $dev up ip -net $nsr2 link set $dev up
done
done done
ip -net nsr1 addr add 10.0.1.1/24 dev veth0 ip -net $nsr1 addr add 10.0.1.1/24 dev veth0
ip -net nsr1 addr add dead:1::1/64 dev veth0 ip -net $nsr1 addr add dead:1::1/64 dev veth0
ip -net nsr2 addr add 10.0.2.1/24 dev veth1 ip -net $nsr2 addr add 10.0.2.1/24 dev veth1
ip -net nsr2 addr add dead:2::1/64 dev veth1 ip -net $nsr2 addr add dead:2::1/64 dev veth1
# set different MTUs so we need to push packets coming from ns1 (large MTU) # set different MTUs so we need to push packets coming from ns1 (large MTU)
# to ns2 (smaller MTU) to stack either to perform fragmentation (ip_no_pmtu_disc=1), # to ns2 (smaller MTU) to stack either to perform fragmentation (ip_no_pmtu_disc=1),
...@@ -106,85 +107,76 @@ do ...@@ -106,85 +107,76 @@ do
esac esac
done done
if ! ip -net nsr1 link set veth0 mtu $omtu; then if ! ip -net $nsr1 link set veth0 mtu $omtu; then
exit 1 exit 1
fi fi
ip -net ns1 link set eth0 mtu $omtu ip -net $ns1 link set eth0 mtu $omtu
if ! ip -net nsr2 link set veth1 mtu $rmtu; then if ! ip -net $nsr2 link set veth1 mtu $rmtu; then
exit 1 exit 1
fi fi
ip -net ns2 link set eth0 mtu $rmtu ip -net $ns2 link set eth0 mtu $rmtu
# transfer-net between nsr1 and nsr2. # transfer-net between nsr1 and nsr2.
# these addresses are not used for connections. # these addresses are not used for connections.
ip -net nsr1 addr add 192.168.10.1/24 dev veth1 ip -net $nsr1 addr add 192.168.10.1/24 dev veth1
ip -net nsr1 addr add fee1:2::1/64 dev veth1 ip -net $nsr1 addr add fee1:2::1/64 dev veth1
ip -net nsr2 addr add 192.168.10.2/24 dev veth0 ip -net $nsr2 addr add 192.168.10.2/24 dev veth0
ip -net nsr2 addr add fee1:2::2/64 dev veth0 ip -net $nsr2 addr add fee1:2::2/64 dev veth0
for i in 1 2; do for i in 0 1; do
ip netns exec nsr$i sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null ip netns exec $nsr1 sysctl net.ipv4.conf.veth$i.forwarding=1 > /dev/null
ip netns exec nsr$i sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null ip netns exec $nsr2 sysctl net.ipv4.conf.veth$i.forwarding=1 > /dev/null
done
ip -net ns$i link set lo up
ip -net ns$i link set eth0 up for ns in $ns1 $ns2;do
ip -net ns$i addr add 10.0.$i.99/24 dev eth0 ip -net $ns link set lo up
ip -net ns$i route add default via 10.0.$i.1 ip -net $ns link set eth0 up
ip -net ns$i addr add dead:$i::99/64 dev eth0
ip -net ns$i route add default via dead:$i::1 if ! ip netns exec $ns sysctl net.ipv4.tcp_no_metrics_save=1 > /dev/null; then
if ! ip netns exec ns$i sysctl net.ipv4.tcp_no_metrics_save=1 > /dev/null; then
echo "ERROR: Check Originator/Responder values (problem during address addition)" echo "ERROR: Check Originator/Responder values (problem during address addition)"
exit 1 exit 1
fi fi
# don't set ip DF bit for first two tests # don't set ip DF bit for first two tests
ip netns exec ns$i sysctl net.ipv4.ip_no_pmtu_disc=1 > /dev/null ip netns exec $ns sysctl net.ipv4.ip_no_pmtu_disc=1 > /dev/null
done done
ip -net nsr1 route add default via 192.168.10.2 ip -net $ns1 addr add 10.0.1.99/24 dev eth0
ip -net nsr2 route add default via 192.168.10.1 ip -net $ns2 addr add 10.0.2.99/24 dev eth0
ip -net $ns1 route add default via 10.0.1.1
ip -net $ns2 route add default via 10.0.2.1
ip -net $ns1 addr add dead:1::99/64 dev eth0
ip -net $ns2 addr add dead:2::99/64 dev eth0
ip -net $ns1 route add default via dead:1::1
ip -net $ns2 route add default via dead:2::1
ip -net $nsr1 route add default via 192.168.10.2
ip -net $nsr2 route add default via 192.168.10.1
ip netns exec nsr1 nft -f - <<EOF ip netns exec $nsr1 nft -f - <<EOF
table inet filter { table inet filter {
flowtable f1 { flowtable f1 {
hook ingress priority 0 hook ingress priority 0
devices = { veth0, veth1 } devices = { veth0, veth1 }
} }
counter routed_orig { }
counter routed_repl { }
chain forward { chain forward {
type filter hook forward priority 0; policy drop; type filter hook forward priority 0; policy drop;
# flow offloaded? Tag ct with mark 1, so we can detect when it fails. # flow offloaded? Tag ct with mark 1, so we can detect when it fails.
meta oif "veth1" tcp dport 12345 flow offload @f1 counter meta oif "veth1" tcp dport 12345 ct mark set 1 flow add @f1 counter name routed_orig accept
# use packet size to trigger 'should be offloaded by now'.
# otherwise, if 'flow offload' expression never offloads, the
# test will pass.
tcp dport 12345 meta length gt 200 ct mark set 1 counter
# this turns off flow offloading internally, so expect packets again # count packets supposedly offloaded as per direction.
tcp flags fin,rst ct mark set 0 accept ct mark 1 counter name ct direction map { original : routed_orig, reply : routed_repl } accept
# this allows large packets from responder, we need this as long
# as PMTUd is off.
# This rule is deleted for the last test, when we expect PMTUd
# to kick in and ensure all packets meet mtu requirements.
meta length gt $lmtu accept comment something-to-grep-for
# next line blocks connection w.o. working offload.
# we only do this for reverse dir, because we expect packets to
# enter slow path due to MTU mismatch of veth0 and veth1.
tcp sport 12345 ct mark 1 counter log prefix "mark failure " drop
ct state established,related accept ct state established,related accept
# for packets that we can't offload yet, i.e. SYN (any ct that is not confirmed)
meta length lt 200 oif "veth1" tcp dport 12345 counter accept
meta nfproto ipv4 meta l4proto icmp accept meta nfproto ipv4 meta l4proto icmp accept
meta nfproto ipv6 meta l4proto icmpv6 accept meta nfproto ipv6 meta l4proto icmpv6 accept
} }
...@@ -197,30 +189,30 @@ if [ $? -ne 0 ]; then ...@@ -197,30 +189,30 @@ if [ $? -ne 0 ]; then
fi fi
# test basic connectivity # test basic connectivity
if ! ip netns exec ns1 ping -c 1 -q 10.0.2.99 > /dev/null; then if ! ip netns exec $ns1 ping -c 1 -q 10.0.2.99 > /dev/null; then
echo "ERROR: ns1 cannot reach ns2" 1>&2 echo "ERROR: $ns1 cannot reach ns2" 1>&2
exit 1 exit 1
fi fi
if ! ip netns exec ns2 ping -c 1 -q 10.0.1.99 > /dev/null; then if ! ip netns exec $ns2 ping -c 1 -q 10.0.1.99 > /dev/null; then
echo "ERROR: ns2 cannot reach ns1" 1>&2 echo "ERROR: $ns2 cannot reach $ns1" 1>&2
exit 1 exit 1
fi fi
if [ $ret -eq 0 ];then if [ $ret -eq 0 ];then
echo "PASS: netns routing/connectivity: ns1 can reach ns2" echo "PASS: netns routing/connectivity: $ns1 can reach $ns2"
fi fi
ns1in=$(mktemp) nsin=$(mktemp)
ns1out=$(mktemp) ns1out=$(mktemp)
ns2in=$(mktemp)
ns2out=$(mktemp) ns2out=$(mktemp)
make_file() make_file()
{ {
name=$1 name=$1
SIZE=$((RANDOM % (1024 * 8))) SIZE=$((RANDOM % (1024 * 128)))
SIZE=$((SIZE + (1024 * 8)))
TSIZE=$((SIZE * 1024)) TSIZE=$((SIZE * 1024))
dd if=/dev/urandom of="$name" bs=1024 count=$SIZE 2> /dev/null dd if=/dev/urandom of="$name" bs=1024 count=$SIZE 2> /dev/null
...@@ -231,6 +223,38 @@ make_file() ...@@ -231,6 +223,38 @@ make_file()
dd if=/dev/urandom conf=notrunc of="$name" bs=1 count=$SIZE 2> /dev/null dd if=/dev/urandom conf=notrunc of="$name" bs=1 count=$SIZE 2> /dev/null
} }
check_counters()
{
local what=$1
local ok=1
local orig=$(ip netns exec $nsr1 nft reset counter inet filter routed_orig | grep packets)
local repl=$(ip netns exec $nsr1 nft reset counter inet filter routed_repl | grep packets)
local orig_cnt=${orig#*bytes}
local repl_cnt=${repl#*bytes}
local fs=$(du -sb $nsin)
local max_orig=${fs%%/*}
local max_repl=$((max_orig/4))
if [ $orig_cnt -gt $max_orig ];then
echo "FAIL: $what: original counter $orig_cnt exceeds expected value $max_orig" 1>&2
ret=1
ok=0
fi
if [ $repl_cnt -gt $max_repl ];then
echo "FAIL: $what: reply counter $repl_cnt exceeds expected value $max_repl" 1>&2
ret=1
ok=0
fi
if [ $ok -eq 1 ]; then
echo "PASS: $what"
fi
}
check_transfer() check_transfer()
{ {
in=$1 in=$1
...@@ -255,11 +279,11 @@ test_tcp_forwarding_ip() ...@@ -255,11 +279,11 @@ test_tcp_forwarding_ip()
local dstport=$4 local dstport=$4
local lret=0 local lret=0
ip netns exec $nsb nc -w 5 -l -p 12345 < "$ns2in" > "$ns2out" & ip netns exec $nsb nc -w 5 -l -p 12345 < "$nsin" > "$ns2out" &
lpid=$! lpid=$!
sleep 1 sleep 1
ip netns exec $nsa nc -w 4 "$dstip" "$dstport" < "$ns1in" > "$ns1out" & ip netns exec $nsa nc -w 4 "$dstip" "$dstport" < "$nsin" > "$ns1out" &
cpid=$! cpid=$!
sleep 3 sleep 3
...@@ -274,11 +298,11 @@ test_tcp_forwarding_ip() ...@@ -274,11 +298,11 @@ test_tcp_forwarding_ip()
wait wait
if ! check_transfer "$ns1in" "$ns2out" "ns1 -> ns2"; then if ! check_transfer "$nsin" "$ns2out" "ns1 -> ns2"; then
lret=1 lret=1
fi fi
if ! check_transfer "$ns2in" "$ns1out" "ns1 <- ns2"; then if ! check_transfer "$nsin" "$ns1out" "ns1 <- ns2"; then
lret=1 lret=1
fi fi
...@@ -295,41 +319,59 @@ test_tcp_forwarding() ...@@ -295,41 +319,59 @@ test_tcp_forwarding()
test_tcp_forwarding_nat() test_tcp_forwarding_nat()
{ {
local lret local lret
local pmtu
test_tcp_forwarding_ip "$1" "$2" 10.0.2.99 12345 test_tcp_forwarding_ip "$1" "$2" 10.0.2.99 12345
lret=$? lret=$?
pmtu=$3
what=$4
if [ $lret -eq 0 ] ; then if [ $lret -eq 0 ] ; then
if [ $pmtu -eq 1 ] ;then
check_counters "flow offload for ns1/ns2 with masquerade and pmtu discovery $what"
else
echo "PASS: flow offload for ns1/ns2 with masquerade $what"
fi
test_tcp_forwarding_ip "$1" "$2" 10.6.6.6 1666 test_tcp_forwarding_ip "$1" "$2" 10.6.6.6 1666
lret=$? lret=$?
if [ $pmtu -eq 1 ] ;then
check_counters "flow offload for ns1/ns2 with dnat and pmtu discovery $what"
elif [ $lret -eq 0 ] ; then
echo "PASS: flow offload for ns1/ns2 with dnat $what"
fi
fi fi
return $lret return $lret
} }
make_file "$ns1in" make_file "$nsin"
make_file "$ns2in"
# First test: # First test:
# No PMTU discovery, nsr1 is expected to fragment packets from ns1 to ns2 as needed. # No PMTU discovery, nsr1 is expected to fragment packets from ns1 to ns2 as needed.
if test_tcp_forwarding ns1 ns2; then # Due to MTU mismatch in both directions, all packets (except small packets like pure
# acks) have to be handled by normal forwarding path. Therefore, packet counters
# are not checked.
if test_tcp_forwarding $ns1 $ns2; then
echo "PASS: flow offloaded for ns1/ns2" echo "PASS: flow offloaded for ns1/ns2"
else else
echo "FAIL: flow offload for ns1/ns2:" 1>&2 echo "FAIL: flow offload for ns1/ns2:" 1>&2
ip netns exec nsr1 nft list ruleset ip netns exec $nsr1 nft list ruleset
ret=1 ret=1
fi fi
# delete default route, i.e. ns2 won't be able to reach ns1 and # delete default route, i.e. ns2 won't be able to reach ns1 and
# will depend on ns1 being masqueraded in nsr1. # will depend on ns1 being masqueraded in nsr1.
# expect ns1 has nsr1 address. # expect ns1 has nsr1 address.
ip -net ns2 route del default via 10.0.2.1 ip -net $ns2 route del default via 10.0.2.1
ip -net ns2 route del default via dead:2::1 ip -net $ns2 route del default via dead:2::1
ip -net ns2 route add 192.168.10.1 via 10.0.2.1 ip -net $ns2 route add 192.168.10.1 via 10.0.2.1
# Second test: # Second test:
# Same, but with NAT enabled. # Same, but with NAT enabled. Same as in first test: we expect normal forward path
ip netns exec nsr1 nft -f - <<EOF # to handle most packets.
ip netns exec $nsr1 nft -f - <<EOF
table ip nat { table ip nat {
chain prerouting { chain prerouting {
type nat hook prerouting priority 0; policy accept; type nat hook prerouting priority 0; policy accept;
...@@ -343,47 +385,45 @@ table ip nat { ...@@ -343,47 +385,45 @@ table ip nat {
} }
EOF EOF
if test_tcp_forwarding_nat ns1 ns2; then if ! test_tcp_forwarding_nat $ns1 $ns2 0 ""; then
echo "PASS: flow offloaded for ns1/ns2 with NAT"
else
echo "FAIL: flow offload for ns1/ns2 with NAT" 1>&2 echo "FAIL: flow offload for ns1/ns2 with NAT" 1>&2
ip netns exec nsr1 nft list ruleset ip netns exec $nsr1 nft list ruleset
ret=1 ret=1
fi fi
# Third test: # Third test:
# Same as second test, but with PMTU discovery enabled. # Same as second test, but with PMTU discovery enabled. This
handle=$(ip netns exec nsr1 nft -a list table inet filter | grep something-to-grep-for | cut -d \# -f 2) # means that we expect the fastpath to handle packets as soon
# as the endpoints adjust the packet size.
if ! ip netns exec nsr1 nft delete rule inet filter forward $handle; then ip netns exec $ns1 sysctl net.ipv4.ip_no_pmtu_disc=0 > /dev/null
echo "FAIL: Could not delete large-packet accept rule" ip netns exec $ns2 sysctl net.ipv4.ip_no_pmtu_disc=0 > /dev/null
exit 1
fi # reset counters.
# With pmtu in-place we'll also check that nft counters
ip netns exec ns1 sysctl net.ipv4.ip_no_pmtu_disc=0 > /dev/null # are lower than file size and packets were forwarded via flowtable layer.
ip netns exec ns2 sysctl net.ipv4.ip_no_pmtu_disc=0 > /dev/null # For earlier tests (large mtus), packets cannot be handled via flowtable
# (except pure acks and other small packets).
if test_tcp_forwarding_nat ns1 ns2; then ip netns exec $nsr1 nft reset counters table inet filter >/dev/null
echo "PASS: flow offloaded for ns1/ns2 with NAT and pmtu discovery"
else if ! test_tcp_forwarding_nat $ns1 $ns2 1 ""; then
echo "FAIL: flow offload for ns1/ns2 with NAT and pmtu discovery" 1>&2 echo "FAIL: flow offload for ns1/ns2 with NAT and pmtu discovery" 1>&2
ip netns exec nsr1 nft list ruleset ip netns exec $nsr1 nft list ruleset
fi fi
# Another test: # Another test:
# Add bridge interface br0 to Router1, with NAT enabled. # Add bridge interface br0 to Router1, with NAT enabled.
ip -net nsr1 link add name br0 type bridge ip -net $nsr1 link add name br0 type bridge
ip -net nsr1 addr flush dev veth0 ip -net $nsr1 addr flush dev veth0
ip -net nsr1 link set up dev veth0 ip -net $nsr1 link set up dev veth0
ip -net nsr1 link set veth0 master br0 ip -net $nsr1 link set veth0 master br0
ip -net nsr1 addr add 10.0.1.1/24 dev br0 ip -net $nsr1 addr add 10.0.1.1/24 dev br0
ip -net nsr1 addr add dead:1::1/64 dev br0 ip -net $nsr1 addr add dead:1::1/64 dev br0
ip -net nsr1 link set up dev br0 ip -net $nsr1 link set up dev br0
ip netns exec nsr1 sysctl net.ipv4.conf.br0.forwarding=1 > /dev/null ip netns exec $nsr1 sysctl net.ipv4.conf.br0.forwarding=1 > /dev/null
# br0 with NAT enabled. # br0 with NAT enabled.
ip netns exec nsr1 nft -f - <<EOF ip netns exec $nsr1 nft -f - <<EOF
flush table ip nat flush table ip nat
table ip nat { table ip nat {
chain prerouting { chain prerouting {
...@@ -398,59 +438,56 @@ table ip nat { ...@@ -398,59 +438,56 @@ table ip nat {
} }
EOF EOF
if test_tcp_forwarding_nat ns1 ns2; then if ! test_tcp_forwarding_nat $ns1 $ns2 1 "on bridge"; then
echo "PASS: flow offloaded for ns1/ns2 with bridge NAT"
else
echo "FAIL: flow offload for ns1/ns2 with bridge NAT" 1>&2 echo "FAIL: flow offload for ns1/ns2 with bridge NAT" 1>&2
ip netns exec nsr1 nft list ruleset ip netns exec $nsr1 nft list ruleset
ret=1 ret=1
fi fi
# Another test: # Another test:
# Add bridge interface br0 to Router1, with NAT and VLAN. # Add bridge interface br0 to Router1, with NAT and VLAN.
ip -net nsr1 link set veth0 nomaster ip -net $nsr1 link set veth0 nomaster
ip -net nsr1 link set down dev veth0 ip -net $nsr1 link set down dev veth0
ip -net nsr1 link add link veth0 name veth0.10 type vlan id 10 ip -net $nsr1 link add link veth0 name veth0.10 type vlan id 10
ip -net nsr1 link set up dev veth0 ip -net $nsr1 link set up dev veth0
ip -net nsr1 link set up dev veth0.10 ip -net $nsr1 link set up dev veth0.10
ip -net nsr1 link set veth0.10 master br0 ip -net $nsr1 link set veth0.10 master br0
ip -net ns1 addr flush dev eth0 ip -net $ns1 addr flush dev eth0
ip -net ns1 link add link eth0 name eth0.10 type vlan id 10 ip -net $ns1 link add link eth0 name eth0.10 type vlan id 10
ip -net ns1 link set eth0 up ip -net $ns1 link set eth0 up
ip -net ns1 link set eth0.10 up ip -net $ns1 link set eth0.10 up
ip -net ns1 addr add 10.0.1.99/24 dev eth0.10 ip -net $ns1 addr add 10.0.1.99/24 dev eth0.10
ip -net ns1 route add default via 10.0.1.1 ip -net $ns1 route add default via 10.0.1.1
ip -net ns1 addr add dead:1::99/64 dev eth0.10 ip -net $ns1 addr add dead:1::99/64 dev eth0.10
if test_tcp_forwarding_nat ns1 ns2; then if ! test_tcp_forwarding_nat $ns1 $ns2 1 "bridge and VLAN"; then
echo "PASS: flow offloaded for ns1/ns2 with bridge NAT and VLAN"
else
echo "FAIL: flow offload for ns1/ns2 with bridge NAT and VLAN" 1>&2 echo "FAIL: flow offload for ns1/ns2 with bridge NAT and VLAN" 1>&2
ip netns exec nsr1 nft list ruleset ip netns exec $nsr1 nft list ruleset
ret=1 ret=1
fi fi
# restore test topology (remove bridge and VLAN) # restore test topology (remove bridge and VLAN)
ip -net nsr1 link set veth0 nomaster ip -net $nsr1 link set veth0 nomaster
ip -net nsr1 link set veth0 down ip -net $nsr1 link set veth0 down
ip -net nsr1 link set veth0.10 down ip -net $nsr1 link set veth0.10 down
ip -net nsr1 link delete veth0.10 type vlan ip -net $nsr1 link delete veth0.10 type vlan
ip -net nsr1 link delete br0 type bridge ip -net $nsr1 link delete br0 type bridge
ip -net ns1 addr flush dev eth0.10 ip -net $ns1 addr flush dev eth0.10
ip -net ns1 link set eth0.10 down ip -net $ns1 link set eth0.10 down
ip -net ns1 link set eth0 down ip -net $ns1 link set eth0 down
ip -net ns1 link delete eth0.10 type vlan ip -net $ns1 link delete eth0.10 type vlan
# restore address in ns1 and nsr1 # restore address in ns1 and nsr1
ip -net ns1 link set eth0 up ip -net $ns1 link set eth0 up
ip -net ns1 addr add 10.0.1.99/24 dev eth0 ip -net $ns1 addr add 10.0.1.99/24 dev eth0
ip -net ns1 route add default via 10.0.1.1 ip -net $ns1 route add default via 10.0.1.1
ip -net ns1 addr add dead:1::99/64 dev eth0 ip -net $ns1 addr add dead:1::99/64 dev eth0
ip -net ns1 route add default via dead:1::1 ip -net $ns1 route add default via dead:1::1
ip -net nsr1 addr add 10.0.1.1/24 dev veth0 ip -net $nsr1 addr add 10.0.1.1/24 dev veth0
ip -net nsr1 addr add dead:1::1/64 dev veth0 ip -net $nsr1 addr add dead:1::1/64 dev veth0
ip -net nsr1 link set up dev veth0 ip -net $nsr1 link set up dev veth0
KEY_SHA="0x"$(ps -xaf | sha1sum | cut -d " " -f 1) KEY_SHA="0x"$(ps -xaf | sha1sum | cut -d " " -f 1)
KEY_AES="0x"$(ps -xaf | md5sum | cut -d " " -f 1) KEY_AES="0x"$(ps -xaf | md5sum | cut -d " " -f 1)
...@@ -480,23 +517,23 @@ do_esp() { ...@@ -480,23 +517,23 @@ do_esp() {
} }
do_esp nsr1 192.168.10.1 192.168.10.2 10.0.1.0/24 10.0.2.0/24 $SPI1 $SPI2 do_esp $nsr1 192.168.10.1 192.168.10.2 10.0.1.0/24 10.0.2.0/24 $SPI1 $SPI2
do_esp nsr2 192.168.10.2 192.168.10.1 10.0.2.0/24 10.0.1.0/24 $SPI2 $SPI1 do_esp $nsr2 192.168.10.2 192.168.10.1 10.0.2.0/24 10.0.1.0/24 $SPI2 $SPI1
ip netns exec nsr1 nft delete table ip nat ip netns exec $nsr1 nft delete table ip nat
# restore default routes # restore default routes
ip -net ns2 route del 192.168.10.1 via 10.0.2.1 ip -net $ns2 route del 192.168.10.1 via 10.0.2.1
ip -net ns2 route add default via 10.0.2.1 ip -net $ns2 route add default via 10.0.2.1
ip -net ns2 route add default via dead:2::1 ip -net $ns2 route add default via dead:2::1
if test_tcp_forwarding ns1 ns2; then if test_tcp_forwarding $ns1 $ns2; then
echo "PASS: ipsec tunnel mode for ns1/ns2" check_counters "ipsec tunnel mode for ns1/ns2"
else else
echo "FAIL: ipsec tunnel mode for ns1/ns2" echo "FAIL: ipsec tunnel mode for ns1/ns2"
ip netns exec nsr1 nft list ruleset 1>&2 ip netns exec $nsr1 nft list ruleset 1>&2
ip netns exec nsr1 cat /proc/net/xfrm_stat 1>&2 ip netns exec $nsr1 cat /proc/net/xfrm_stat 1>&2
fi fi
exit $ret exit $ret
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment