Commit 7beceebf authored by David S. Miller's avatar David S. Miller

Merge branch 'rhashtable-next'

Thomas Graf says:

====================
rhashtable: Per bucket locks & deferred table resizing

Prepares for and introduces per bucket spinlocks and deferred table
resizing. This allows for parallel table mutations in different hash
buckets from atomic context. The resizing occurs in the background
in a separate worker thread while lookups, inserts, and removals can
continue.

Also modified the chain linked list to be terminated with a special
nulls marker to allow entries to move between multiple lists.

Last but not least, reintroduces lockless netlink_lookup() with
deferred Netlink socket destruction to avoid the side effect of
increased netlink_release() runtime.
====================
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parents dd955398 21e4902a
......@@ -21,8 +21,9 @@ struct hlist_nulls_head {
struct hlist_nulls_node {
struct hlist_nulls_node *next, **pprev;
};
#define NULLS_MARKER(value) (1UL | (((long)value) << 1))
#define INIT_HLIST_NULLS_HEAD(ptr, nulls) \
((ptr)->first = (struct hlist_nulls_node *) (1UL | (((long)nulls) << 1)))
((ptr)->first = (struct hlist_nulls_node *) NULLS_MARKER(nulls))
#define hlist_nulls_entry(ptr, type, member) container_of(ptr,type,member)
/**
......
This diff is collapsed.
......@@ -190,6 +190,8 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock)
#ifdef CONFIG_DEBUG_LOCK_ALLOC
# define raw_spin_lock_nested(lock, subclass) \
_raw_spin_lock_nested(lock, subclass)
# define raw_spin_lock_bh_nested(lock, subclass) \
_raw_spin_lock_bh_nested(lock, subclass)
# define raw_spin_lock_nest_lock(lock, nest_lock) \
do { \
......@@ -205,6 +207,7 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock)
# define raw_spin_lock_nested(lock, subclass) \
_raw_spin_lock(((void)(subclass), (lock)))
# define raw_spin_lock_nest_lock(lock, nest_lock) _raw_spin_lock(lock)
# define raw_spin_lock_bh_nested(lock, subclass) _raw_spin_lock_bh(lock)
#endif
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
......@@ -324,6 +327,11 @@ do { \
raw_spin_lock_nested(spinlock_check(lock), subclass); \
} while (0)
#define spin_lock_bh_nested(lock, subclass) \
do { \
raw_spin_lock_bh_nested(spinlock_check(lock), subclass);\
} while (0)
#define spin_lock_nest_lock(lock, nest_lock) \
do { \
raw_spin_lock_nest_lock(spinlock_check(lock), nest_lock); \
......
......@@ -22,6 +22,8 @@ int in_lock_functions(unsigned long addr);
void __lockfunc _raw_spin_lock(raw_spinlock_t *lock) __acquires(lock);
void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass)
__acquires(lock);
void __lockfunc _raw_spin_lock_bh_nested(raw_spinlock_t *lock, int subclass)
__acquires(lock);
void __lockfunc
_raw_spin_lock_nest_lock(raw_spinlock_t *lock, struct lockdep_map *map)
__acquires(lock);
......
......@@ -57,6 +57,7 @@
#define _raw_spin_lock(lock) __LOCK(lock)
#define _raw_spin_lock_nested(lock, subclass) __LOCK(lock)
#define _raw_spin_lock_bh_nested(lock, subclass) __LOCK(lock)
#define _raw_read_lock(lock) __LOCK(lock)
#define _raw_write_lock(lock) __LOCK(lock)
#define _raw_spin_lock_bh(lock) __LOCK_BH(lock)
......
......@@ -363,6 +363,14 @@ void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass)
}
EXPORT_SYMBOL(_raw_spin_lock_nested);
void __lockfunc _raw_spin_lock_bh_nested(raw_spinlock_t *lock, int subclass)
{
__local_bh_disable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET);
spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock);
}
EXPORT_SYMBOL(_raw_spin_lock_bh_nested);
unsigned long __lockfunc _raw_spin_lock_irqsave_nested(raw_spinlock_t *lock,
int subclass)
{
......
This diff is collapsed.
......@@ -33,7 +33,7 @@ static bool nft_hash_lookup(const struct nft_set *set,
const struct nft_data *key,
struct nft_data *data)
{
const struct rhashtable *priv = nft_set_priv(set);
struct rhashtable *priv = nft_set_priv(set);
const struct nft_hash_elem *he;
he = rhashtable_lookup(priv, key);
......@@ -83,46 +83,53 @@ static void nft_hash_remove(const struct nft_set *set,
const struct nft_set_elem *elem)
{
struct rhashtable *priv = nft_set_priv(set);
struct rhash_head *he, __rcu **pprev;
pprev = elem->cookie;
he = rht_dereference((*pprev), priv);
rhashtable_remove(priv, elem->cookie);
synchronize_rcu();
kfree(elem->cookie);
}
rhashtable_remove_pprev(priv, he, pprev);
struct nft_compare_arg {
const struct nft_set *set;
struct nft_set_elem *elem;
};
synchronize_rcu();
kfree(he);
static bool nft_hash_compare(void *ptr, void *arg)
{
struct nft_hash_elem *he = ptr;
struct nft_compare_arg *x = arg;
if (!nft_data_cmp(&he->key, &x->elem->key, x->set->klen)) {
x->elem->cookie = he;
x->elem->flags = 0;
if (x->set->flags & NFT_SET_MAP)
nft_data_copy(&x->elem->data, he->data);
return true;
}
return false;
}
static int nft_hash_get(const struct nft_set *set, struct nft_set_elem *elem)
{
const struct rhashtable *priv = nft_set_priv(set);
const struct bucket_table *tbl = rht_dereference_rcu(priv->tbl, priv);
struct rhash_head __rcu * const *pprev;
struct nft_hash_elem *he;
u32 h;
h = rhashtable_hashfn(priv, &elem->key, set->klen);
pprev = &tbl->buckets[h];
rht_for_each_entry_rcu(he, tbl->buckets[h], node) {
if (nft_data_cmp(&he->key, &elem->key, set->klen)) {
pprev = &he->node.next;
continue;
}
struct rhashtable *priv = nft_set_priv(set);
struct nft_compare_arg arg = {
.set = set,
.elem = elem,
};
elem->cookie = (void *)pprev;
elem->flags = 0;
if (set->flags & NFT_SET_MAP)
nft_data_copy(&elem->data, he->data);
if (rhashtable_lookup_compare(priv, &elem->key,
&nft_hash_compare, &arg))
return 0;
}
return -ENOENT;
}
static void nft_hash_walk(const struct nft_ctx *ctx, const struct nft_set *set,
struct nft_set_iter *iter)
{
const struct rhashtable *priv = nft_set_priv(set);
struct rhashtable *priv = nft_set_priv(set);
const struct bucket_table *tbl;
const struct nft_hash_elem *he;
struct nft_set_elem elem;
......@@ -130,7 +137,9 @@ static void nft_hash_walk(const struct nft_ctx *ctx, const struct nft_set *set,
tbl = rht_dereference_rcu(priv->tbl, priv);
for (i = 0; i < tbl->size; i++) {
rht_for_each_entry_rcu(he, tbl->buckets[i], node) {
struct rhash_head *pos;
rht_for_each_entry_rcu(he, pos, tbl, i, node) {
if (iter->count < iter->skip)
goto cont;
......@@ -153,13 +162,6 @@ static unsigned int nft_hash_privsize(const struct nlattr * const nla[])
return sizeof(struct rhashtable);
}
#ifdef CONFIG_PROVE_LOCKING
static int lockdep_nfnl_lock_is_held(void *parent)
{
return lockdep_nfnl_is_held(NFNL_SUBSYS_NFTABLES);
}
#endif
static int nft_hash_init(const struct nft_set *set,
const struct nft_set_desc *desc,
const struct nlattr * const tb[])
......@@ -173,9 +175,6 @@ static int nft_hash_init(const struct nft_set *set,
.hashfn = jhash,
.grow_decision = rht_grow_above_75,
.shrink_decision = rht_shrink_below_30,
#ifdef CONFIG_PROVE_LOCKING
.mutex_is_held = lockdep_nfnl_lock_is_held,
#endif
};
return rhashtable_init(priv, &params);
......@@ -183,18 +182,23 @@ static int nft_hash_init(const struct nft_set *set,
static void nft_hash_destroy(const struct nft_set *set)
{
const struct rhashtable *priv = nft_set_priv(set);
const struct bucket_table *tbl = priv->tbl;
struct nft_hash_elem *he, *next;
struct rhashtable *priv = nft_set_priv(set);
const struct bucket_table *tbl;
struct nft_hash_elem *he;
struct rhash_head *pos, *next;
unsigned int i;
/* Stop an eventual async resizing */
priv->being_destroyed = true;
mutex_lock(&priv->mutex);
tbl = rht_dereference(priv->tbl, priv);
for (i = 0; i < tbl->size; i++) {
for (he = rht_entry(tbl->buckets[i], struct nft_hash_elem, node);
he != NULL; he = next) {
next = rht_entry(he->node.next, struct nft_hash_elem, node);
rht_for_each_entry_safe(he, pos, next, tbl, i, node)
nft_hash_elem_destroy(set, he);
}
}
mutex_unlock(&priv->mutex);
rhashtable_destroy(priv);
}
......
......@@ -97,12 +97,12 @@ static int netlink_dump(struct sock *sk);
static void netlink_skb_destructor(struct sk_buff *skb);
/* nl_table locking explained:
* Lookup and traversal are protected with nl_sk_hash_lock or nl_table_lock
* combined with an RCU read-side lock. Insertion and removal are protected
* with nl_sk_hash_lock while using RCU list modification primitives and may
* run in parallel to nl_table_lock protected lookups. Destruction of the
* Netlink socket may only occur *after* nl_table_lock has been acquired
* either during or after the socket has been removed from the list.
* Lookup and traversal are protected with an RCU read-side lock. Insertion
* and removal are protected with nl_sk_hash_lock while using RCU list
* modification primitives and may run in parallel to RCU protected lookups.
* Destruction of the Netlink socket may only occur *after* nl_table_lock has
* been acquired * either during or after the socket has been removed from
* the list and after an RCU grace period.
*/
DEFINE_RWLOCK(nl_table_lock);
EXPORT_SYMBOL_GPL(nl_table_lock);
......@@ -114,15 +114,6 @@ static atomic_t nl_table_users = ATOMIC_INIT(0);
DEFINE_MUTEX(nl_sk_hash_lock);
EXPORT_SYMBOL_GPL(nl_sk_hash_lock);
#ifdef CONFIG_PROVE_LOCKING
static int lockdep_nl_sk_hash_is_held(void *parent)
{
if (debug_locks)
return lockdep_is_held(&nl_sk_hash_lock) || lockdep_is_held(&nl_table_lock);
return 1;
}
#endif
static ATOMIC_NOTIFIER_HEAD(netlink_chain);
static DEFINE_SPINLOCK(netlink_tap_lock);
......@@ -1002,11 +993,8 @@ static struct sock *__netlink_lookup(struct netlink_table *table, u32 portid,
.net = net,
.portid = portid,
};
u32 hash;
hash = rhashtable_hashfn(&table->hash, &portid, sizeof(portid));
return rhashtable_lookup_compare(&table->hash, hash,
return rhashtable_lookup_compare(&table->hash, &portid,
&netlink_compare, &arg);
}
......@@ -1015,13 +1003,11 @@ static struct sock *netlink_lookup(struct net *net, int protocol, u32 portid)
struct netlink_table *table = &nl_table[protocol];
struct sock *sk;
read_lock(&nl_table_lock);
rcu_read_lock();
sk = __netlink_lookup(table, portid, net);
if (sk)
sock_hold(sk);
rcu_read_unlock();
read_unlock(&nl_table_lock);
return sk;
}
......@@ -1066,7 +1052,8 @@ static int netlink_insert(struct sock *sk, struct net *net, u32 portid)
goto err;
err = -ENOMEM;
if (BITS_PER_LONG > 32 && unlikely(table->hash.nelems >= UINT_MAX))
if (BITS_PER_LONG > 32 &&
unlikely(atomic_read(&table->hash.nelems) >= UINT_MAX))
goto err;
nlk_sk(sk)->portid = portid;
......@@ -1194,6 +1181,13 @@ static int netlink_create(struct net *net, struct socket *sock, int protocol,
goto out;
}
static void deferred_put_nlk_sk(struct rcu_head *head)
{
struct netlink_sock *nlk = container_of(head, struct netlink_sock, rcu);
sock_put(&nlk->sk);
}
static int netlink_release(struct socket *sock)
{
struct sock *sk = sock->sk;
......@@ -1259,7 +1253,7 @@ static int netlink_release(struct socket *sock)
local_bh_disable();
sock_prot_inuse_add(sock_net(sk), &netlink_proto, -1);
local_bh_enable();
sock_put(sk);
call_rcu(&nlk->rcu, deferred_put_nlk_sk);
return 0;
}
......@@ -1274,7 +1268,6 @@ static int netlink_autobind(struct socket *sock)
retry:
cond_resched();
netlink_table_grab();
rcu_read_lock();
if (__netlink_lookup(table, portid, net)) {
/* Bind collision, search negative portid values. */
......@@ -1282,11 +1275,9 @@ static int netlink_autobind(struct socket *sock)
if (rover > -4097)
rover = -4097;
rcu_read_unlock();
netlink_table_ungrab();
goto retry;
}
rcu_read_unlock();
netlink_table_ungrab();
err = netlink_insert(sk, net, portid);
if (err == -EADDRINUSE)
......@@ -2901,7 +2892,9 @@ static struct sock *netlink_seq_socket_idx(struct seq_file *seq, loff_t pos)
const struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht);
for (j = 0; j < tbl->size; j++) {
rht_for_each_entry_rcu(nlk, tbl->buckets[j], node) {
struct rhash_head *node;
rht_for_each_entry_rcu(nlk, node, tbl, j, node) {
s = (struct sock *)nlk;
if (sock_net(s) != seq_file_net(seq))
......@@ -2919,9 +2912,8 @@ static struct sock *netlink_seq_socket_idx(struct seq_file *seq, loff_t pos)
}
static void *netlink_seq_start(struct seq_file *seq, loff_t *pos)
__acquires(nl_table_lock) __acquires(RCU)
__acquires(RCU)
{
read_lock(&nl_table_lock);
rcu_read_lock();
return *pos ? netlink_seq_socket_idx(seq, *pos - 1) : SEQ_START_TOKEN;
}
......@@ -2929,6 +2921,8 @@ static void *netlink_seq_start(struct seq_file *seq, loff_t *pos)
static void *netlink_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
struct rhashtable *ht;
const struct bucket_table *tbl;
struct rhash_head *node;
struct netlink_sock *nlk;
struct nl_seq_iter *iter;
struct net *net;
......@@ -2945,17 +2939,17 @@ static void *netlink_seq_next(struct seq_file *seq, void *v, loff_t *pos)
i = iter->link;
ht = &nl_table[i].hash;
rht_for_each_entry(nlk, nlk->node.next, ht, node)
tbl = rht_dereference_rcu(ht->tbl, ht);
rht_for_each_entry_rcu_continue(nlk, node, nlk->node.next, tbl, iter->hash_idx, node)
if (net_eq(sock_net((struct sock *)nlk), net))
return nlk;
j = iter->hash_idx + 1;
do {
const struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht);
for (; j < tbl->size; j++) {
rht_for_each_entry(nlk, tbl->buckets[j], ht, node) {
rht_for_each_entry_rcu(nlk, node, tbl, j, node) {
if (net_eq(sock_net((struct sock *)nlk), net)) {
iter->link = i;
iter->hash_idx = j;
......@@ -2971,10 +2965,9 @@ static void *netlink_seq_next(struct seq_file *seq, void *v, loff_t *pos)
}
static void netlink_seq_stop(struct seq_file *seq, void *v)
__releases(RCU) __releases(nl_table_lock)
__releases(RCU)
{
rcu_read_unlock();
read_unlock(&nl_table_lock);
}
......@@ -3121,9 +3114,6 @@ static int __init netlink_proto_init(void)
.max_shift = 16, /* 64K */
.grow_decision = rht_grow_above_75,
.shrink_decision = rht_shrink_below_30,
#ifdef CONFIG_PROVE_LOCKING
.mutex_is_held = lockdep_nl_sk_hash_is_held,
#endif
};
if (err != 0)
......
......@@ -50,6 +50,7 @@ struct netlink_sock {
#endif /* CONFIG_NETLINK_MMAP */
struct rhash_head node;
struct rcu_head rcu;
};
static inline struct netlink_sock *nlk_sk(struct sock *sk)
......
......@@ -113,7 +113,9 @@ static int __netlink_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
req = nlmsg_data(cb->nlh);
for (i = 0; i < htbl->size; i++) {
rht_for_each_entry(nlsk, htbl->buckets[i], ht, node) {
struct rhash_head *pos;
rht_for_each_entry(nlsk, pos, htbl, i, node) {
sk = (struct sock *)nlsk;
if (!net_eq(sock_net(sk), net))
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment