netfilter: conntrack: remove central spinlock nf_conntrack_lock

nf_conntrack_lock is a monolithic lock and suffers from huge contention on current generation servers (8 or more core/threads). Perf locking congestion is clear on base kernel: - 72.56% ksoftirqd/6 [kernel.kallsyms] [k] _raw_spin_lock_bh - _raw_spin_lock_bh + 25.33% init_conntrack + 24.86% nf_ct_delete_from_lists + 24.62% __nf_conntrack_confirm + 24.38% destroy_conntrack + 0.70% tcp_packet + 2.21% ksoftirqd/6 [kernel.kallsyms] [k] fib_table_lookup + 1.15% ksoftirqd/6 [kernel.kallsyms] [k] __slab_free + 0.77% ksoftirqd/6 [kernel.kallsyms] [k] inet_getpeer + 0.70% ksoftirqd/6 [nf_conntrack] [k] nf_ct_delete + 0.55% ksoftirqd/6 [ip_tables] [k] ipt_do_table This patch change conntrack locking and provides a huge performance improvement. SYN-flood attack tested on a 24-core E5-2695v2(ES) with 10Gbit/s ixgbe (with tool trafgen): Base kernel: 810.405 new conntrack/sec After patch: 2.233.876 new conntrack/sec Notice other floods attack (SYN+ACK or ACK) can easily be deflected using: # iptables -A INPUT -m state --state INVALID -j DROP # sysctl -w net/netfilter/nf_conntrack_tcp_loose=0 Use an array of hashed spinlocks to protect insertions/deletions of conntracks into the hash table. 1024 spinlocks seem to give good results, at minimal cost (4KB memory). Due to lockdep max depth, 1024 becomes 8 if CONFIG_LOCKDEP=y The hash resize is a bit tricky, because we need to take all locks in the array. A seqcount_t is used to synchronize the hash table users with the resizing process. Signed-off-by: Eric Dumazet <edumazet@google.com> Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net> Reviewed-by: Florian Westphal <fw@strlen.de> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>

netfilter: conntrack: remove central spinlock nf_conntrack_lock
nf_conntrack_lock is a monolithic lock and suffers from huge contention on current generation servers (8 or more core/threads). Perf locking congestion is clear on base kernel: - 72.56% ksoftirqd/6 [kernel.kallsyms] [k] _raw_spin_lock_bh - _raw_spin_lock_bh + 25.33% init_conntrack + 24.86% nf_ct_delete_from_lists + 24.62% __nf_conntrack_confirm + 24.38% destroy_conntrack + 0.70% tcp_packet + 2.21% ksoftirqd/6 [kernel.kallsyms] [k] fib_table_lookup + 1.15% ksoftirqd/6 [kernel.kallsyms] [k] __slab_free + 0.77% ksoftirqd/6 [kernel.kallsyms] [k] inet_getpeer + 0.70% ksoftirqd/6 [nf_conntrack] [k] nf_ct_delete + 0.55% ksoftirqd/6 [ip_tables] [k] ipt_do_table This patch change conntrack locking and provides a huge performance improvement. SYN-flood attack tested on a 24-core E5-2695v2(ES) with 10Gbit/s ixgbe (with tool trafgen): Base kernel: 810.405 new conntrack/sec After patch: 2.233.876 new conntrack/sec Notice other floods attack (SYN+ACK or ACK) can easily be deflected using: # iptables -A INPUT -m state --state INVALID -j DROP # sysctl -w net/netfilter/nf_conntrack_tcp_loose=0 Use an array of hashed spinlocks to protect insertions/deletions of conntracks into the hash table. 1024 spinlocks seem to give good results, at minimal cost (4KB memory). Due to lockdep max depth, 1024 becomes 8 if CONFIG_LOCKDEP=y The hash resize is a bit tricky, because we need to take all locks in the array. A seqcount_t is used to synchronize the hash table users with the resizing process. Signed-off-by: Eric Dumazet <edumazet@google.com> Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net> Reviewed-by: Florian Westphal <fw@strlen.de> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
93bb0ceb · Jesper Dangaard Brouer · Pablo Neira Ayuso · ca7433df · 93bb0ceb · 93bb0ceb
Commit 93bb0ceb authored Mar 03, 2014 by Jesper Dangaard Brouer Committed by Pablo Neira Ayuso Mar 07, 2014
5 changed files
--- a/include/net/netfilter/nf_conntrack_core.h
+++ b/include/net/netfilter/nf_conntrack_core.h
@@ -77,7 +77,12 @@ print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple,
            const struct nf_conntrack_l3proto *l3proto,
            const struct nf_conntrack_l4proto *proto);

-extern spinlock_t nf_conntrack_lock ;
+#ifdef CONFIG_LOCKDEP
+# define CONNTRACK_LOCKS 8
+#else
+# define CONNTRACK_LOCKS 1024
+#endif
+extern spinlock_t nf_conntrack_locks[CONNTRACK_LOCKS];

 extern spinlock_t nf_conntrack_expect_lock;


--- a/include/net/netns/conntrack.h
+++ b/include/net/netns/conntrack.h
@@ -5,6 +5,7 @@
 #include <linux/list_nulls.h>
 #include <linux/atomic.h>
 #include <linux/netfilter/nf_conntrack_tcp.h>
+#include <linux/seqlock.h>

 struct ctl_table_header;
 struct nf_conntrack_ecache;
@@ -90,6 +91,7 @@ struct netns_ct {
 	int			sysctl_checksum;

 	unsigned int		htable_size;
+	seqcount_t		generation;
 	struct kmem_cache	*nf_conntrack_cachep;
 	struct hlist_nulls_head	*hash;
 	struct hlist_head	*expect_hash;

--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
--- a/net/netfilter/nf_conntrack_helper.c
+++ b/net/netfilter/nf_conntrack_helper.c
@@ -423,12 +423,16 @@ static void __nf_conntrack_helper_unregister(struct nf_conntrack_helper *me,
 			unhelp(h, me);
 		spin_unlock_bh(&pcpu->lock);
 	}
-	spin_lock_bh(&nf_conntrack_lock);
+	local_bh_disable();
 	for (i = 0; i < net->ct.htable_size; i++) {
-		hlist_nulls_for_each_entry(h, nn, &net->ct.hash[i], hnnode)
-			unhelp(h, me);
+		spin_lock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]);
+		if (i < net->ct.htable_size) {
+			hlist_nulls_for_each_entry(h, nn, &net->ct.hash[i], hnnode)
+				unhelp(h, me);
+		}
+		spin_unlock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]);
 	}
-	spin_unlock_bh(&nf_conntrack_lock);
+	local_bh_enable();
 }

 void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me)

--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -764,14 +764,23 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
 	struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
 	u_int8_t l3proto = nfmsg->nfgen_family;
 	int res;
+	spinlock_t *lockp;
+
 #ifdef CONFIG_NF_CONNTRACK_MARK
 	const struct ctnetlink_dump_filter *filter = cb->data;
 #endif

-	spin_lock_bh(&nf_conntrack_lock);
 	last = (struct nf_conn *)cb->args[1];
+
+	local_bh_disable();
 	for (; cb->args[0] < net->ct.htable_size; cb->args[0]++) {
 restart:
+		lockp = &nf_conntrack_locks[cb->args[0] % CONNTRACK_LOCKS];
+		spin_lock(lockp);
+		if (cb->args[0] >= net->ct.htable_size) {
+			spin_unlock(lockp);
+			goto out;
+		}
 		hlist_nulls_for_each_entry(h, n, &net->ct.hash[cb->args[0]],
 					 hnnode) {
 			if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL)
@@ -803,16 +812,18 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
 			if (res < 0) {
 				nf_conntrack_get(&ct->ct_general);
 				cb->args[1] = (unsigned long)ct;
+				spin_unlock(lockp);
 				goto out;
 			}
 		}
+		spin_unlock(lockp);
 		if (cb->args[1]) {
 			cb->args[1] = 0;
 			goto restart;
 		}
 	}
 out:
-	spin_unlock_bh(&nf_conntrack_lock);
+	local_bh_enable();
 	if (last)
 		nf_ct_put(last);