Commit 648700f7 authored by Eric Dumazet, committed by David S. Miller

inet: frags: use rhashtables for reassembly units

Some applications still rely on IP fragmentation, and to be fair the Linux
reassembly unit does not hold up under any serious load.

It uses static hash tables of 1024 buckets, with up to 128 items per bucket (!!!)

A work queue is supposed to garbage-collect items when the host is under memory
pressure, and to perform a hash rebuild, changing the seed used in hash computations.

This work queue blocks softirqs for up to 25 ms when doing a hash rebuild,
which happens every 5 seconds if the host is under fire.

Then there is the problem of this hash table being shared by all netns.

It is time to switch to rhashtables, and to allocate one of them per netns
to speed up netns dismantle, since this is a critical metric these days.
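
For reference, the per-netns setup introduced here boils down to one
rhashtable_init()/rhashtable_free_and_destroy() pair per namespace; a
condensed sketch, lifted from the hunks below (error handling elided):

	static inline int inet_frags_init_net(struct netns_frags *nf)
	{
		atomic_set(&nf->mem, 0);
		/* one hash table per netns, seeded and sized independently */
		return rhashtable_init(&nf->rhashtable, &nf->f->rhash_params);
	}

	void inet_frags_exit_net(struct netns_frags *nf)
	{
		nf->low_thresh = 0; /* prevent creation of new frags */
		rhashtable_free_and_destroy(&nf->rhashtable,
					    inet_frags_free_cb, NULL);
	}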

Lookup now uses RCU. A followup patch will even remove
the refcount hold/release left over from the prior implementation and save
a couple of atomic operations.
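
The lookup fast path, condensed from the new inet_frag_find() further down
(the refcount_inc_not_zero() guard is precisely what that followup patch
intends to drop):

	rcu_read_lock();
	fq = rhashtable_lookup(&nf->rhashtable, key, nf->f->rhash_params);
	if (fq) {
		/* raced with teardown: this queue is already being freed */
		if (!refcount_inc_not_zero(&fq->refcnt))
			fq = NULL;
		rcu_read_unlock();
		return fq;
	}
	rcu_read_unlock();
	return inet_frag_create(nf, key); /* miss: allocate and insert */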

Before this patch, 16 CPUs (on a NIC with 16 RX queues) could not handle more
than a 1 Mpps frags DDoS.

After the patch, I reach 9 Mpps without any tuning, and can use up to 2 GB
of storage for the fragments (the exact number depends on frags being evicted
after timeout).

$ grep FRAG /proc/net/sockstat
FRAG: inuse 1966916 memory 2140004608

A followup patch will change the limits for 64-bit arches.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Kirill Tkhai <ktkhai@virtuozzo.com>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Florian Westphal <fw@strlen.de>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Cc: Alexander Aring <alex.aring@gmail.com>
Cc: Stefan Schmidt <stefan@osg.samsung.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent ae6da1f5
@@ -134,13 +134,10 @@ min_adv_mss - INTEGER
 IP Fragmentation:
 
 ipfrag_high_thresh - INTEGER
-	Maximum memory used to reassemble IP fragments. When
-	ipfrag_high_thresh bytes of memory is allocated for this purpose,
-	the fragment handler will toss packets until ipfrag_low_thresh
-	is reached. This also serves as a maximum limit to namespaces
-	different from the initial one.
+	Maximum memory used to reassemble IP fragments.
 
 ipfrag_low_thresh - INTEGER
+	(Obsolete since linux-4.17)
 	Maximum memory used to reassemble IP fragments before the kernel
 	begins to remove incomplete fragment queues to free up resources.
 	The kernel still accepts new fragments for defragmentation.
......
...@@ -2,7 +2,11 @@ ...@@ -2,7 +2,11 @@
#ifndef __NET_FRAG_H__ #ifndef __NET_FRAG_H__
#define __NET_FRAG_H__ #define __NET_FRAG_H__
#include <linux/rhashtable.h>
struct netns_frags { struct netns_frags {
struct rhashtable rhashtable ____cacheline_aligned_in_smp;
/* Keep atomic mem on separate cachelines in structs that include it */ /* Keep atomic mem on separate cachelines in structs that include it */
atomic_t mem ____cacheline_aligned_in_smp; atomic_t mem ____cacheline_aligned_in_smp;
/* sysctls */ /* sysctls */
...@@ -26,12 +30,30 @@ enum { ...@@ -26,12 +30,30 @@ enum {
INET_FRAG_COMPLETE = BIT(2), INET_FRAG_COMPLETE = BIT(2),
}; };
struct frag_v4_compare_key {
__be32 saddr;
__be32 daddr;
u32 user;
u32 vif;
__be16 id;
u16 protocol;
};
struct frag_v6_compare_key {
struct in6_addr saddr;
struct in6_addr daddr;
u32 user;
__be32 id;
u32 iif;
};
 /**
  * struct inet_frag_queue - fragment queue
  *
- * @lock: spinlock protecting the queue
+ * @node: rhash node
+ * @key: keys identifying this frag.
  * @timer: queue expiration timer
- * @list: hash bucket list
+ * @lock: spinlock protecting this frag
  * @refcnt: reference count of the queue
  * @fragments: received fragments head
  * @fragments_tail: received fragments tail
@@ -41,12 +63,16 @@ enum {
  * @flags: fragment queue flags
  * @max_size: maximum received fragment size
  * @net: namespace that this frag belongs to
- * @list_evictor: list of queues to forcefully evict (e.g. due to low memory)
+ * @rcu: rcu head for deferred freeing
  */
 struct inet_frag_queue {
-	spinlock_t		lock;
+	struct rhash_head	node;
+	union {
+		struct frag_v4_compare_key v4;
+		struct frag_v6_compare_key v6;
+	} key;
 	struct timer_list	timer;
-	struct hlist_node	list;
+	spinlock_t		lock;
 	refcount_t		refcnt;
 	struct sk_buff		*fragments;
 	struct sk_buff		*fragments_tail;
...@@ -56,50 +82,19 @@ struct inet_frag_queue { ...@@ -56,50 +82,19 @@ struct inet_frag_queue {
__u8 flags; __u8 flags;
u16 max_size; u16 max_size;
struct netns_frags *net; struct netns_frags *net;
struct hlist_node list_evictor; struct rcu_head rcu;
};
#define INETFRAGS_HASHSZ 1024
/* averaged:
* max_depth = default ipfrag_high_thresh / INETFRAGS_HASHSZ /
* rounded up (SKB_TRUELEN(0) + sizeof(struct ipq or
* struct frag_queue))
*/
#define INETFRAGS_MAXDEPTH 128
struct inet_frag_bucket {
struct hlist_head chain;
spinlock_t chain_lock;
}; };
struct inet_frags { struct inet_frags {
struct inet_frag_bucket hash[INETFRAGS_HASHSZ];
struct work_struct frags_work;
unsigned int next_bucket;
unsigned long last_rebuild_jiffies;
bool rebuild;
/* The first call to hashfn is responsible to initialize
* rnd. This is best done with net_get_random_once.
*
* rnd_seqlock is used to let hash insertion detect
* when it needs to re-lookup the hash chain to use.
*/
u32 rnd;
seqlock_t rnd_seqlock;
unsigned int qsize; unsigned int qsize;
unsigned int (*hashfn)(const struct inet_frag_queue *);
bool (*match)(const struct inet_frag_queue *q,
const void *arg);
void (*constructor)(struct inet_frag_queue *q, void (*constructor)(struct inet_frag_queue *q,
const void *arg); const void *arg);
void (*destructor)(struct inet_frag_queue *); void (*destructor)(struct inet_frag_queue *);
void (*frag_expire)(struct timer_list *t); void (*frag_expire)(struct timer_list *t);
struct kmem_cache *frags_cachep; struct kmem_cache *frags_cachep;
const char *frags_cache_name; const char *frags_cache_name;
struct rhashtable_params rhash_params;
}; };
int inet_frags_init(struct inet_frags *); int inet_frags_init(struct inet_frags *);
...@@ -108,15 +103,13 @@ void inet_frags_fini(struct inet_frags *); ...@@ -108,15 +103,13 @@ void inet_frags_fini(struct inet_frags *);
static inline int inet_frags_init_net(struct netns_frags *nf) static inline int inet_frags_init_net(struct netns_frags *nf)
{ {
atomic_set(&nf->mem, 0); atomic_set(&nf->mem, 0);
return 0; return rhashtable_init(&nf->rhashtable, &nf->f->rhash_params);
} }
void inet_frags_exit_net(struct netns_frags *nf); void inet_frags_exit_net(struct netns_frags *nf);
void inet_frag_kill(struct inet_frag_queue *q); void inet_frag_kill(struct inet_frag_queue *q);
void inet_frag_destroy(struct inet_frag_queue *q); void inet_frag_destroy(struct inet_frag_queue *q);
struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key);
struct inet_frags *f, void *key, unsigned int hash);
void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q, void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q,
const char *prefix); const char *prefix);
...@@ -128,7 +121,7 @@ static inline void inet_frag_put(struct inet_frag_queue *q) ...@@ -128,7 +121,7 @@ static inline void inet_frag_put(struct inet_frag_queue *q)
static inline bool inet_frag_evicting(struct inet_frag_queue *q) static inline bool inet_frag_evicting(struct inet_frag_queue *q)
{ {
return !hlist_unhashed(&q->list_evictor); return false;
} }
/* Memory Tracking Functions. */ /* Memory Tracking Functions. */
......
...@@ -579,17 +579,8 @@ enum ip6_defrag_users { ...@@ -579,17 +579,8 @@ enum ip6_defrag_users {
__IP6_DEFRAG_CONNTRACK_BRIDGE_IN = IP6_DEFRAG_CONNTRACK_BRIDGE_IN + USHRT_MAX, __IP6_DEFRAG_CONNTRACK_BRIDGE_IN = IP6_DEFRAG_CONNTRACK_BRIDGE_IN + USHRT_MAX,
}; };
struct ip6_create_arg {
__be32 id;
u32 user;
const struct in6_addr *src;
const struct in6_addr *dst;
int iif;
u8 ecn;
};
void ip6_frag_init(struct inet_frag_queue *q, const void *a); void ip6_frag_init(struct inet_frag_queue *q, const void *a);
bool ip6_frag_match(const struct inet_frag_queue *q, const void *a); extern const struct rhashtable_params ip6_rhash_params;
/* /*
* Equivalent of ipv4 struct ip * Equivalent of ipv4 struct ip
...@@ -597,11 +588,6 @@ bool ip6_frag_match(const struct inet_frag_queue *q, const void *a); ...@@ -597,11 +588,6 @@ bool ip6_frag_match(const struct inet_frag_queue *q, const void *a);
struct frag_queue { struct frag_queue {
struct inet_frag_queue q; struct inet_frag_queue q;
__be32 id; /* fragment id */
u32 user;
struct in6_addr saddr;
struct in6_addr daddr;
int iif; int iif;
__u16 nhoffset; __u16 nhoffset;
u8 ecn; u8 ecn;
......
...@@ -17,37 +17,19 @@ typedef unsigned __bitwise lowpan_rx_result; ...@@ -17,37 +17,19 @@ typedef unsigned __bitwise lowpan_rx_result;
#define LOWPAN_DISPATCH_FRAG1 0xc0 #define LOWPAN_DISPATCH_FRAG1 0xc0
#define LOWPAN_DISPATCH_FRAGN 0xe0 #define LOWPAN_DISPATCH_FRAGN 0xe0
-struct lowpan_create_arg {
+struct frag_lowpan_compare_key {
 	u16 tag;
 	u16 d_size;
-	const struct ieee802154_addr *src;
-	const struct ieee802154_addr *dst;
+	const struct ieee802154_addr src;
+	const struct ieee802154_addr dst;
 };
 
-/* Equivalent of ipv4 struct ip
+/* Equivalent of ipv4 struct ipq
  */
 struct lowpan_frag_queue {
 	struct inet_frag_queue	q;
-	u16			tag;
-	u16			d_size;
-	struct ieee802154_addr	saddr;
-	struct ieee802154_addr	daddr;
 };
static inline u32 ieee802154_addr_hash(const struct ieee802154_addr *a)
{
switch (a->mode) {
case IEEE802154_ADDR_LONG:
return (((__force u64)a->extended_addr) >> 32) ^
(((__force u64)a->extended_addr) & 0xffffffff);
case IEEE802154_ADDR_SHORT:
return (__force u32)(a->short_addr + (a->pan_id << 16));
default:
return 0;
}
}
int lowpan_frag_rcv(struct sk_buff *skb, const u8 frag_type); int lowpan_frag_rcv(struct sk_buff *skb, const u8 frag_type);
void lowpan_net_frag_exit(void); void lowpan_net_frag_exit(void);
int lowpan_net_frag_init(void); int lowpan_net_frag_init(void);
......
...@@ -37,47 +37,15 @@ static struct inet_frags lowpan_frags; ...@@ -37,47 +37,15 @@ static struct inet_frags lowpan_frags;
static int lowpan_frag_reasm(struct lowpan_frag_queue *fq, static int lowpan_frag_reasm(struct lowpan_frag_queue *fq,
struct sk_buff *prev, struct net_device *ldev); struct sk_buff *prev, struct net_device *ldev);
static unsigned int lowpan_hash_frag(u16 tag, u16 d_size,
const struct ieee802154_addr *saddr,
const struct ieee802154_addr *daddr)
{
net_get_random_once(&lowpan_frags.rnd, sizeof(lowpan_frags.rnd));
return jhash_3words(ieee802154_addr_hash(saddr),
ieee802154_addr_hash(daddr),
(__force u32)(tag + (d_size << 16)),
lowpan_frags.rnd);
}
static unsigned int lowpan_hashfn(const struct inet_frag_queue *q)
{
const struct lowpan_frag_queue *fq;
fq = container_of(q, struct lowpan_frag_queue, q);
return lowpan_hash_frag(fq->tag, fq->d_size, &fq->saddr, &fq->daddr);
}
static bool lowpan_frag_match(const struct inet_frag_queue *q, const void *a)
{
const struct lowpan_frag_queue *fq;
const struct lowpan_create_arg *arg = a;
fq = container_of(q, struct lowpan_frag_queue, q);
return fq->tag == arg->tag && fq->d_size == arg->d_size &&
ieee802154_addr_equal(&fq->saddr, arg->src) &&
ieee802154_addr_equal(&fq->daddr, arg->dst);
}
static void lowpan_frag_init(struct inet_frag_queue *q, const void *a) static void lowpan_frag_init(struct inet_frag_queue *q, const void *a)
{ {
const struct lowpan_create_arg *arg = a; const struct frag_lowpan_compare_key *key = a;
struct lowpan_frag_queue *fq; struct lowpan_frag_queue *fq;
fq = container_of(q, struct lowpan_frag_queue, q); fq = container_of(q, struct lowpan_frag_queue, q);
fq->tag = arg->tag; BUILD_BUG_ON(sizeof(*key) > sizeof(q->key));
fq->d_size = arg->d_size; memcpy(&q->key, key, sizeof(*key));
fq->saddr = *arg->src;
fq->daddr = *arg->dst;
} }
static void lowpan_frag_expire(struct timer_list *t) static void lowpan_frag_expire(struct timer_list *t)
...@@ -105,21 +73,17 @@ fq_find(struct net *net, const struct lowpan_802154_cb *cb, ...@@ -105,21 +73,17 @@ fq_find(struct net *net, const struct lowpan_802154_cb *cb,
const struct ieee802154_addr *src, const struct ieee802154_addr *src,
const struct ieee802154_addr *dst) const struct ieee802154_addr *dst)
{ {
struct inet_frag_queue *q;
struct lowpan_create_arg arg;
unsigned int hash;
struct netns_ieee802154_lowpan *ieee802154_lowpan = struct netns_ieee802154_lowpan *ieee802154_lowpan =
net_ieee802154_lowpan(net); net_ieee802154_lowpan(net);
struct frag_lowpan_compare_key key = {
.tag = cb->d_tag,
.d_size = cb->d_size,
.src = *src,
.dst = *dst,
};
struct inet_frag_queue *q;
arg.tag = cb->d_tag; q = inet_frag_find(&ieee802154_lowpan->frags, &key);
arg.d_size = cb->d_size;
arg.src = src;
arg.dst = dst;
hash = lowpan_hash_frag(cb->d_tag, cb->d_size, src, dst);
q = inet_frag_find(&ieee802154_lowpan->frags,
&lowpan_frags, &arg, hash);
if (IS_ERR_OR_NULL(q)) { if (IS_ERR_OR_NULL(q)) {
inet_frag_maybe_warn_overflow(q, pr_fmt()); inet_frag_maybe_warn_overflow(q, pr_fmt());
return NULL; return NULL;
...@@ -611,17 +575,46 @@ static struct pernet_operations lowpan_frags_ops = { ...@@ -611,17 +575,46 @@ static struct pernet_operations lowpan_frags_ops = {
.exit = lowpan_frags_exit_net, .exit = lowpan_frags_exit_net,
}; };
static u32 lowpan_key_hashfn(const void *data, u32 len, u32 seed)
{
return jhash2(data,
sizeof(struct frag_lowpan_compare_key) / sizeof(u32), seed);
}
static u32 lowpan_obj_hashfn(const void *data, u32 len, u32 seed)
{
const struct inet_frag_queue *fq = data;
return jhash2((const u32 *)&fq->key,
sizeof(struct frag_lowpan_compare_key) / sizeof(u32), seed);
}
static int lowpan_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *ptr)
{
const struct frag_lowpan_compare_key *key = arg->key;
const struct inet_frag_queue *fq = ptr;
return !!memcmp(&fq->key, key, sizeof(*key));
}
static const struct rhashtable_params lowpan_rhash_params = {
.head_offset = offsetof(struct inet_frag_queue, node),
.hashfn = lowpan_key_hashfn,
.obj_hashfn = lowpan_obj_hashfn,
.obj_cmpfn = lowpan_obj_cmpfn,
.automatic_shrinking = true,
};
int __init lowpan_net_frag_init(void) int __init lowpan_net_frag_init(void)
{ {
int ret; int ret;
lowpan_frags.hashfn = lowpan_hashfn;
lowpan_frags.constructor = lowpan_frag_init; lowpan_frags.constructor = lowpan_frag_init;
lowpan_frags.destructor = NULL; lowpan_frags.destructor = NULL;
lowpan_frags.qsize = sizeof(struct frag_queue); lowpan_frags.qsize = sizeof(struct frag_queue);
lowpan_frags.match = lowpan_frag_match;
lowpan_frags.frag_expire = lowpan_frag_expire; lowpan_frags.frag_expire = lowpan_frag_expire;
lowpan_frags.frags_cache_name = lowpan_frags_cache_name; lowpan_frags.frags_cache_name = lowpan_frags_cache_name;
lowpan_frags.rhash_params = lowpan_rhash_params;
ret = inet_frags_init(&lowpan_frags); ret = inet_frags_init(&lowpan_frags);
if (ret) if (ret)
goto out; goto out;
......
...@@ -25,12 +25,6 @@ ...@@ -25,12 +25,6 @@
#include <net/inet_frag.h> #include <net/inet_frag.h>
#include <net/inet_ecn.h> #include <net/inet_ecn.h>
#define INETFRAGS_EVICT_BUCKETS 128
#define INETFRAGS_EVICT_MAX 512
/* don't rebuild inetfrag table with new secret more often than this */
#define INETFRAGS_MIN_REBUILD_INTERVAL (5 * HZ)
/* Given the OR values of all fragments, apply RFC 3168 5.3 requirements /* Given the OR values of all fragments, apply RFC 3168 5.3 requirements
* Value : 0xff if frame should be dropped. * Value : 0xff if frame should be dropped.
* 0 or INET_ECN_CE value, to be ORed in to final iph->tos field * 0 or INET_ECN_CE value, to be ORed in to final iph->tos field
...@@ -52,157 +46,8 @@ const u8 ip_frag_ecn_table[16] = { ...@@ -52,157 +46,8 @@ const u8 ip_frag_ecn_table[16] = {
}; };
EXPORT_SYMBOL(ip_frag_ecn_table); EXPORT_SYMBOL(ip_frag_ecn_table);
static unsigned int
inet_frag_hashfn(const struct inet_frags *f, const struct inet_frag_queue *q)
{
return f->hashfn(q) & (INETFRAGS_HASHSZ - 1);
}
static bool inet_frag_may_rebuild(struct inet_frags *f)
{
return time_after(jiffies,
f->last_rebuild_jiffies + INETFRAGS_MIN_REBUILD_INTERVAL);
}
static void inet_frag_secret_rebuild(struct inet_frags *f)
{
int i;
write_seqlock_bh(&f->rnd_seqlock);
if (!inet_frag_may_rebuild(f))
goto out;
get_random_bytes(&f->rnd, sizeof(u32));
for (i = 0; i < INETFRAGS_HASHSZ; i++) {
struct inet_frag_bucket *hb;
struct inet_frag_queue *q;
struct hlist_node *n;
hb = &f->hash[i];
spin_lock(&hb->chain_lock);
hlist_for_each_entry_safe(q, n, &hb->chain, list) {
unsigned int hval = inet_frag_hashfn(f, q);
if (hval != i) {
struct inet_frag_bucket *hb_dest;
hlist_del(&q->list);
/* Relink to new hash chain. */
hb_dest = &f->hash[hval];
/* This is the only place where we take
* another chain_lock while already holding
* one. As this will not run concurrently,
* we cannot deadlock on hb_dest lock below, if its
* already locked it will be released soon since
* other caller cannot be waiting for hb lock
* that we've taken above.
*/
spin_lock_nested(&hb_dest->chain_lock,
SINGLE_DEPTH_NESTING);
hlist_add_head(&q->list, &hb_dest->chain);
spin_unlock(&hb_dest->chain_lock);
}
}
spin_unlock(&hb->chain_lock);
}
f->rebuild = false;
f->last_rebuild_jiffies = jiffies;
out:
write_sequnlock_bh(&f->rnd_seqlock);
}
static bool inet_fragq_should_evict(const struct inet_frag_queue *q)
{
if (!hlist_unhashed(&q->list_evictor))
return false;
return q->net->low_thresh == 0 ||
frag_mem_limit(q->net) >= q->net->low_thresh;
}
static unsigned int
inet_evict_bucket(struct inet_frags *f, struct inet_frag_bucket *hb)
{
struct inet_frag_queue *fq;
struct hlist_node *n;
unsigned int evicted = 0;
HLIST_HEAD(expired);
spin_lock(&hb->chain_lock);
hlist_for_each_entry_safe(fq, n, &hb->chain, list) {
if (!inet_fragq_should_evict(fq))
continue;
if (!del_timer(&fq->timer))
continue;
hlist_add_head(&fq->list_evictor, &expired);
++evicted;
}
spin_unlock(&hb->chain_lock);
hlist_for_each_entry_safe(fq, n, &expired, list_evictor)
f->frag_expire(&fq->timer);
return evicted;
}
static void inet_frag_worker(struct work_struct *work)
{
unsigned int budget = INETFRAGS_EVICT_BUCKETS;
unsigned int i, evicted = 0;
struct inet_frags *f;
f = container_of(work, struct inet_frags, frags_work);
BUILD_BUG_ON(INETFRAGS_EVICT_BUCKETS >= INETFRAGS_HASHSZ);
local_bh_disable();
for (i = READ_ONCE(f->next_bucket); budget; --budget) {
evicted += inet_evict_bucket(f, &f->hash[i]);
i = (i + 1) & (INETFRAGS_HASHSZ - 1);
if (evicted > INETFRAGS_EVICT_MAX)
break;
}
f->next_bucket = i;
local_bh_enable();
if (f->rebuild && inet_frag_may_rebuild(f))
inet_frag_secret_rebuild(f);
}
static void inet_frag_schedule_worker(struct inet_frags *f)
{
if (unlikely(!work_pending(&f->frags_work)))
schedule_work(&f->frags_work);
}
int inet_frags_init(struct inet_frags *f) int inet_frags_init(struct inet_frags *f)
{ {
int i;
INIT_WORK(&f->frags_work, inet_frag_worker);
for (i = 0; i < INETFRAGS_HASHSZ; i++) {
struct inet_frag_bucket *hb = &f->hash[i];
spin_lock_init(&hb->chain_lock);
INIT_HLIST_HEAD(&hb->chain);
}
seqlock_init(&f->rnd_seqlock);
f->last_rebuild_jiffies = 0;
f->frags_cachep = kmem_cache_create(f->frags_cache_name, f->qsize, 0, 0, f->frags_cachep = kmem_cache_create(f->frags_cache_name, f->qsize, 0, 0,
NULL); NULL);
if (!f->frags_cachep) if (!f->frags_cachep)
...@@ -214,66 +59,42 @@ EXPORT_SYMBOL(inet_frags_init); ...@@ -214,66 +59,42 @@ EXPORT_SYMBOL(inet_frags_init);
void inet_frags_fini(struct inet_frags *f) void inet_frags_fini(struct inet_frags *f)
{ {
cancel_work_sync(&f->frags_work); /* We must wait that all inet_frag_destroy_rcu() have completed. */
rcu_barrier();
kmem_cache_destroy(f->frags_cachep); kmem_cache_destroy(f->frags_cachep);
f->frags_cachep = NULL;
} }
EXPORT_SYMBOL(inet_frags_fini); EXPORT_SYMBOL(inet_frags_fini);
void inet_frags_exit_net(struct netns_frags *nf) static void inet_frags_free_cb(void *ptr, void *arg)
{
struct inet_frags *f =nf->f;
unsigned int seq;
int i;
nf->low_thresh = 0;
evict_again:
local_bh_disable();
seq = read_seqbegin(&f->rnd_seqlock);
for (i = 0; i < INETFRAGS_HASHSZ ; i++)
inet_evict_bucket(f, &f->hash[i]);
local_bh_enable();
cond_resched();
if (read_seqretry(&f->rnd_seqlock, seq) ||
sum_frag_mem_limit(nf))
goto evict_again;
}
EXPORT_SYMBOL(inet_frags_exit_net);
static struct inet_frag_bucket *
get_frag_bucket_locked(struct inet_frag_queue *fq, struct inet_frags *f)
__acquires(hb->chain_lock)
{ {
struct inet_frag_bucket *hb; struct inet_frag_queue *fq = ptr;
unsigned int seq, hash;
restart: /* If we can not cancel the timer, it means this frag_queue
seq = read_seqbegin(&f->rnd_seqlock); * is already disappearing, we have nothing to do.
* Otherwise, we own a refcount until the end of this function.
hash = inet_frag_hashfn(f, fq); */
hb = &f->hash[hash]; if (!del_timer(&fq->timer))
return;
spin_lock(&hb->chain_lock); spin_lock_bh(&fq->lock);
if (read_seqretry(&f->rnd_seqlock, seq)) { if (!(fq->flags & INET_FRAG_COMPLETE)) {
spin_unlock(&hb->chain_lock); fq->flags |= INET_FRAG_COMPLETE;
goto restart; refcount_dec(&fq->refcnt);
} }
spin_unlock_bh(&fq->lock);
return hb; inet_frag_put(fq);
} }
static inline void fq_unlink(struct inet_frag_queue *fq) void inet_frags_exit_net(struct netns_frags *nf)
{ {
struct inet_frag_bucket *hb; nf->low_thresh = 0; /* prevent creation of new frags */
hb = get_frag_bucket_locked(fq, fq->net->f); rhashtable_free_and_destroy(&nf->rhashtable, inet_frags_free_cb, NULL);
hlist_del(&fq->list);
fq->flags |= INET_FRAG_COMPLETE;
spin_unlock(&hb->chain_lock);
} }
EXPORT_SYMBOL(inet_frags_exit_net);
void inet_frag_kill(struct inet_frag_queue *fq) void inet_frag_kill(struct inet_frag_queue *fq)
{ {
...@@ -281,12 +102,26 @@ void inet_frag_kill(struct inet_frag_queue *fq) ...@@ -281,12 +102,26 @@ void inet_frag_kill(struct inet_frag_queue *fq)
refcount_dec(&fq->refcnt); refcount_dec(&fq->refcnt);
if (!(fq->flags & INET_FRAG_COMPLETE)) { if (!(fq->flags & INET_FRAG_COMPLETE)) {
fq_unlink(fq); struct netns_frags *nf = fq->net;
fq->flags |= INET_FRAG_COMPLETE;
rhashtable_remove_fast(&nf->rhashtable, &fq->node, nf->f->rhash_params);
refcount_dec(&fq->refcnt); refcount_dec(&fq->refcnt);
} }
} }
EXPORT_SYMBOL(inet_frag_kill); EXPORT_SYMBOL(inet_frag_kill);
static void inet_frag_destroy_rcu(struct rcu_head *head)
{
struct inet_frag_queue *q = container_of(head, struct inet_frag_queue,
rcu);
struct inet_frags *f = q->net->f;
if (f->destructor)
f->destructor(q);
kmem_cache_free(f->frags_cachep, q);
}
void inet_frag_destroy(struct inet_frag_queue *q) void inet_frag_destroy(struct inet_frag_queue *q)
{ {
struct sk_buff *fp; struct sk_buff *fp;
...@@ -310,59 +145,20 @@ void inet_frag_destroy(struct inet_frag_queue *q) ...@@ -310,59 +145,20 @@ void inet_frag_destroy(struct inet_frag_queue *q)
} }
sum = sum_truesize + f->qsize; sum = sum_truesize + f->qsize;
if (f->destructor) call_rcu(&q->rcu, inet_frag_destroy_rcu);
f->destructor(q);
kmem_cache_free(f->frags_cachep, q);
sub_frag_mem_limit(nf, sum); sub_frag_mem_limit(nf, sum);
} }
EXPORT_SYMBOL(inet_frag_destroy); EXPORT_SYMBOL(inet_frag_destroy);
static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf,
struct inet_frag_queue *qp_in,
struct inet_frags *f,
void *arg)
{
struct inet_frag_bucket *hb = get_frag_bucket_locked(qp_in, f);
struct inet_frag_queue *qp;
#ifdef CONFIG_SMP
/* With SMP race we have to recheck hash table, because
* such entry could have been created on other cpu before
* we acquired hash bucket lock.
*/
hlist_for_each_entry(qp, &hb->chain, list) {
if (qp->net == nf && f->match(qp, arg)) {
refcount_inc(&qp->refcnt);
spin_unlock(&hb->chain_lock);
qp_in->flags |= INET_FRAG_COMPLETE;
inet_frag_put(qp_in);
return qp;
}
}
#endif
qp = qp_in;
if (!mod_timer(&qp->timer, jiffies + nf->timeout))
refcount_inc(&qp->refcnt);
refcount_inc(&qp->refcnt);
hlist_add_head(&qp->list, &hb->chain);
spin_unlock(&hb->chain_lock);
return qp;
}
static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf, static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
struct inet_frags *f, struct inet_frags *f,
void *arg) void *arg)
{ {
struct inet_frag_queue *q; struct inet_frag_queue *q;
if (!nf->high_thresh || frag_mem_limit(nf) > nf->high_thresh) { if (!nf->high_thresh || frag_mem_limit(nf) > nf->high_thresh)
inet_frag_schedule_worker(f);
return NULL; return NULL;
}
q = kmem_cache_zalloc(f->frags_cachep, GFP_ATOMIC); q = kmem_cache_zalloc(f->frags_cachep, GFP_ATOMIC);
if (!q) if (!q)
...@@ -374,59 +170,52 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf, ...@@ -374,59 +170,52 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
 	timer_setup(&q->timer, f->frag_expire, 0);
 	spin_lock_init(&q->lock);
-	refcount_set(&q->refcnt, 1);
+	refcount_set(&q->refcnt, 3);
 	return q;
 }
static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf, static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf,
struct inet_frags *f,
void *arg) void *arg)
{ {
struct inet_frags *f = nf->f;
struct inet_frag_queue *q; struct inet_frag_queue *q;
int err;
q = inet_frag_alloc(nf, f, arg); q = inet_frag_alloc(nf, f, arg);
if (!q) if (!q)
return NULL; return NULL;
return inet_frag_intern(nf, q, f, arg); mod_timer(&q->timer, jiffies + nf->timeout);
err = rhashtable_insert_fast(&nf->rhashtable, &q->node,
f->rhash_params);
if (err < 0) {
q->flags |= INET_FRAG_COMPLETE;
inet_frag_kill(q);
inet_frag_destroy(q);
return NULL;
}
return q;
} }
struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, /* TODO : call from rcu_read_lock() and no longer use refcount_inc_not_zero() */
struct inet_frags *f, void *key, struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key)
unsigned int hash)
{ {
struct inet_frag_bucket *hb; struct inet_frag_queue *fq;
struct inet_frag_queue *q;
int depth = 0;
if (frag_mem_limit(nf) > nf->low_thresh)
inet_frag_schedule_worker(f);
hash &= (INETFRAGS_HASHSZ - 1);
hb = &f->hash[hash];
spin_lock(&hb->chain_lock);
hlist_for_each_entry(q, &hb->chain, list) {
if (q->net == nf && f->match(q, key)) {
refcount_inc(&q->refcnt);
spin_unlock(&hb->chain_lock);
return q;
}
depth++;
}
spin_unlock(&hb->chain_lock);
if (depth <= INETFRAGS_MAXDEPTH) rcu_read_lock();
return inet_frag_create(nf, f, key);
if (inet_frag_may_rebuild(f)) { fq = rhashtable_lookup(&nf->rhashtable, key, nf->f->rhash_params);
if (!f->rebuild) if (fq) {
f->rebuild = true; if (!refcount_inc_not_zero(&fq->refcnt))
inet_frag_schedule_worker(f); fq = NULL;
rcu_read_unlock();
return fq;
} }
rcu_read_unlock();
return ERR_PTR(-ENOBUFS); return inet_frag_create(nf, key);
} }
EXPORT_SYMBOL(inet_frag_find); EXPORT_SYMBOL(inet_frag_find);
...@@ -434,8 +223,7 @@ void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q, ...@@ -434,8 +223,7 @@ void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q,
const char *prefix) const char *prefix)
{ {
 	static const char msg[] = "inet_frag_find: Fragment hash bucket"
-		" list length grew over limit " __stringify(INETFRAGS_MAXDEPTH)
-		". Dropping fragment.\n";
+		" list length grew over limit. Dropping fragment.\n";
if (PTR_ERR(q) == -ENOBUFS) if (PTR_ERR(q) == -ENOBUFS)
net_dbg_ratelimited("%s%s", prefix, msg); net_dbg_ratelimited("%s%s", prefix, msg);
......
...@@ -69,15 +69,9 @@ struct ipfrag_skb_cb ...@@ -69,15 +69,9 @@ struct ipfrag_skb_cb
struct ipq { struct ipq {
struct inet_frag_queue q; struct inet_frag_queue q;
u32 user;
__be32 saddr;
__be32 daddr;
__be16 id;
u8 protocol;
u8 ecn; /* RFC3168 support */ u8 ecn; /* RFC3168 support */
u16 max_df_size; /* largest frag with DF set seen */ u16 max_df_size; /* largest frag with DF set seen */
int iif; int iif;
int vif; /* L3 master device index */
unsigned int rid; unsigned int rid;
struct inet_peer *peer; struct inet_peer *peer;
}; };
...@@ -97,41 +91,6 @@ int ip_frag_mem(struct net *net) ...@@ -97,41 +91,6 @@ int ip_frag_mem(struct net *net)
static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
struct net_device *dev); struct net_device *dev);
struct ip4_create_arg {
struct iphdr *iph;
u32 user;
int vif;
};
static unsigned int ipqhashfn(__be16 id, __be32 saddr, __be32 daddr, u8 prot)
{
net_get_random_once(&ip4_frags.rnd, sizeof(ip4_frags.rnd));
return jhash_3words((__force u32)id << 16 | prot,
(__force u32)saddr, (__force u32)daddr,
ip4_frags.rnd);
}
static unsigned int ip4_hashfn(const struct inet_frag_queue *q)
{
const struct ipq *ipq;
ipq = container_of(q, struct ipq, q);
return ipqhashfn(ipq->id, ipq->saddr, ipq->daddr, ipq->protocol);
}
static bool ip4_frag_match(const struct inet_frag_queue *q, const void *a)
{
const struct ipq *qp;
const struct ip4_create_arg *arg = a;
qp = container_of(q, struct ipq, q);
return qp->id == arg->iph->id &&
qp->saddr == arg->iph->saddr &&
qp->daddr == arg->iph->daddr &&
qp->protocol == arg->iph->protocol &&
qp->user == arg->user &&
qp->vif == arg->vif;
}
static void ip4_frag_init(struct inet_frag_queue *q, const void *a) static void ip4_frag_init(struct inet_frag_queue *q, const void *a)
{ {
...@@ -140,17 +99,12 @@ static void ip4_frag_init(struct inet_frag_queue *q, const void *a) ...@@ -140,17 +99,12 @@ static void ip4_frag_init(struct inet_frag_queue *q, const void *a)
frags); frags);
struct net *net = container_of(ipv4, struct net, ipv4); struct net *net = container_of(ipv4, struct net, ipv4);
const struct ip4_create_arg *arg = a; const struct frag_v4_compare_key *key = a;
qp->protocol = arg->iph->protocol; q->key.v4 = *key;
qp->id = arg->iph->id; qp->ecn = 0;
qp->ecn = ip4_frag_ecn(arg->iph->tos);
qp->saddr = arg->iph->saddr;
qp->daddr = arg->iph->daddr;
qp->vif = arg->vif;
qp->user = arg->user;
qp->peer = q->net->max_dist ? qp->peer = q->net->max_dist ?
inet_getpeer_v4(net->ipv4.peers, arg->iph->saddr, arg->vif, 1) : inet_getpeer_v4(net->ipv4.peers, key->saddr, key->vif, 1) :
NULL; NULL;
} }
...@@ -234,7 +188,7 @@ static void ip_expire(struct timer_list *t) ...@@ -234,7 +188,7 @@ static void ip_expire(struct timer_list *t)
/* Only an end host needs to send an ICMP /* Only an end host needs to send an ICMP
* "Fragment Reassembly Timeout" message, per RFC792. * "Fragment Reassembly Timeout" message, per RFC792.
*/ */
if (frag_expire_skip_icmp(qp->user) && if (frag_expire_skip_icmp(qp->q.key.v4.user) &&
(skb_rtable(head)->rt_type != RTN_LOCAL)) (skb_rtable(head)->rt_type != RTN_LOCAL))
goto out; goto out;
...@@ -262,17 +216,17 @@ static void ip_expire(struct timer_list *t) ...@@ -262,17 +216,17 @@ static void ip_expire(struct timer_list *t)
static struct ipq *ip_find(struct net *net, struct iphdr *iph, static struct ipq *ip_find(struct net *net, struct iphdr *iph,
u32 user, int vif) u32 user, int vif)
{ {
struct frag_v4_compare_key key = {
.saddr = iph->saddr,
.daddr = iph->daddr,
.user = user,
.vif = vif,
.id = iph->id,
.protocol = iph->protocol,
};
struct inet_frag_queue *q; struct inet_frag_queue *q;
struct ip4_create_arg arg;
unsigned int hash;
arg.iph = iph;
arg.user = user;
arg.vif = vif;
hash = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol); q = inet_frag_find(&net->ipv4.frags, &key);
q = inet_frag_find(&net->ipv4.frags, &ip4_frags, &arg, hash);
if (IS_ERR_OR_NULL(q)) { if (IS_ERR_OR_NULL(q)) {
inet_frag_maybe_warn_overflow(q, pr_fmt()); inet_frag_maybe_warn_overflow(q, pr_fmt());
return NULL; return NULL;
...@@ -656,7 +610,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, ...@@ -656,7 +610,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
err = -ENOMEM; err = -ENOMEM;
goto out_fail; goto out_fail;
out_oversize: out_oversize:
net_info_ratelimited("Oversized IP packet from %pI4\n", &qp->saddr); net_info_ratelimited("Oversized IP packet from %pI4\n", &qp->q.key.v4.saddr);
out_fail: out_fail:
__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS); __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
return err; return err;
...@@ -894,15 +848,47 @@ static struct pernet_operations ip4_frags_ops = { ...@@ -894,15 +848,47 @@ static struct pernet_operations ip4_frags_ops = {
.exit = ipv4_frags_exit_net, .exit = ipv4_frags_exit_net,
}; };
static u32 ip4_key_hashfn(const void *data, u32 len, u32 seed)
{
return jhash2(data,
sizeof(struct frag_v4_compare_key) / sizeof(u32), seed);
}
static u32 ip4_obj_hashfn(const void *data, u32 len, u32 seed)
{
const struct inet_frag_queue *fq = data;
return jhash2((const u32 *)&fq->key.v4,
sizeof(struct frag_v4_compare_key) / sizeof(u32), seed);
}
static int ip4_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *ptr)
{
const struct frag_v4_compare_key *key = arg->key;
const struct inet_frag_queue *fq = ptr;
return !!memcmp(&fq->key, key, sizeof(*key));
}
static const struct rhashtable_params ip4_rhash_params = {
.head_offset = offsetof(struct inet_frag_queue, node),
.key_offset = offsetof(struct inet_frag_queue, key),
.key_len = sizeof(struct frag_v4_compare_key),
.hashfn = ip4_key_hashfn,
.obj_hashfn = ip4_obj_hashfn,
.obj_cmpfn = ip4_obj_cmpfn,
.automatic_shrinking = true,
};
void __init ipfrag_init(void) void __init ipfrag_init(void)
{ {
ip4_frags.hashfn = ip4_hashfn;
ip4_frags.constructor = ip4_frag_init; ip4_frags.constructor = ip4_frag_init;
ip4_frags.destructor = ip4_frag_free; ip4_frags.destructor = ip4_frag_free;
ip4_frags.qsize = sizeof(struct ipq); ip4_frags.qsize = sizeof(struct ipq);
ip4_frags.match = ip4_frag_match;
ip4_frags.frag_expire = ip_expire; ip4_frags.frag_expire = ip_expire;
ip4_frags.frags_cache_name = ip_frag_cache_name; ip4_frags.frags_cache_name = ip_frag_cache_name;
ip4_frags.rhash_params = ip4_rhash_params;
if (inet_frags_init(&ip4_frags)) if (inet_frags_init(&ip4_frags))
panic("IP: failed to allocate ip4_frags cache\n"); panic("IP: failed to allocate ip4_frags cache\n");
ip4_frags_ctl_register(); ip4_frags_ctl_register();
......
...@@ -152,23 +152,6 @@ static inline u8 ip6_frag_ecn(const struct ipv6hdr *ipv6h) ...@@ -152,23 +152,6 @@ static inline u8 ip6_frag_ecn(const struct ipv6hdr *ipv6h)
return 1 << (ipv6_get_dsfield(ipv6h) & INET_ECN_MASK); return 1 << (ipv6_get_dsfield(ipv6h) & INET_ECN_MASK);
} }
static unsigned int nf_hash_frag(__be32 id, const struct in6_addr *saddr,
const struct in6_addr *daddr)
{
net_get_random_once(&nf_frags.rnd, sizeof(nf_frags.rnd));
return jhash_3words(ipv6_addr_hash(saddr), ipv6_addr_hash(daddr),
(__force u32)id, nf_frags.rnd);
}
static unsigned int nf_hashfn(const struct inet_frag_queue *q)
{
const struct frag_queue *nq;
nq = container_of(q, struct frag_queue, q);
return nf_hash_frag(nq->id, &nq->saddr, &nq->daddr);
}
static void nf_ct_frag6_expire(struct timer_list *t) static void nf_ct_frag6_expire(struct timer_list *t)
{ {
struct inet_frag_queue *frag = from_timer(frag, t, timer); struct inet_frag_queue *frag = from_timer(frag, t, timer);
...@@ -182,26 +165,19 @@ static void nf_ct_frag6_expire(struct timer_list *t) ...@@ -182,26 +165,19 @@ static void nf_ct_frag6_expire(struct timer_list *t)
} }
/* Creation primitives. */ /* Creation primitives. */
static inline struct frag_queue *fq_find(struct net *net, __be32 id, static struct frag_queue *fq_find(struct net *net, __be32 id, u32 user,
u32 user, struct in6_addr *src, const struct ipv6hdr *hdr, int iif)
struct in6_addr *dst, int iif, u8 ecn)
{ {
struct frag_v6_compare_key key = {
.id = id,
.saddr = hdr->saddr,
.daddr = hdr->daddr,
.user = user,
.iif = iif,
};
struct inet_frag_queue *q; struct inet_frag_queue *q;
struct ip6_create_arg arg;
unsigned int hash;
arg.id = id;
arg.user = user;
arg.src = src;
arg.dst = dst;
arg.iif = iif;
arg.ecn = ecn;
local_bh_disable();
hash = nf_hash_frag(id, src, dst);
q = inet_frag_find(&net->nf_frag.frags, &nf_frags, &arg, hash); q = inet_frag_find(&net->nf_frag.frags, &key);
local_bh_enable();
if (IS_ERR_OR_NULL(q)) { if (IS_ERR_OR_NULL(q)) {
inet_frag_maybe_warn_overflow(q, pr_fmt()); inet_frag_maybe_warn_overflow(q, pr_fmt());
return NULL; return NULL;
...@@ -593,8 +569,8 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user) ...@@ -593,8 +569,8 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user)
fhdr = (struct frag_hdr *)skb_transport_header(skb); fhdr = (struct frag_hdr *)skb_transport_header(skb);
skb_orphan(skb); skb_orphan(skb);
fq = fq_find(net, fhdr->identification, user, &hdr->saddr, &hdr->daddr, fq = fq_find(net, fhdr->identification, user, hdr,
skb->dev ? skb->dev->ifindex : 0, ip6_frag_ecn(hdr)); skb->dev ? skb->dev->ifindex : 0);
if (fq == NULL) { if (fq == NULL) {
pr_debug("Can't find and can't create new queue\n"); pr_debug("Can't find and can't create new queue\n");
return -ENOMEM; return -ENOMEM;
...@@ -660,13 +636,12 @@ int nf_ct_frag6_init(void) ...@@ -660,13 +636,12 @@ int nf_ct_frag6_init(void)
{ {
int ret = 0; int ret = 0;
nf_frags.hashfn = nf_hashfn;
nf_frags.constructor = ip6_frag_init; nf_frags.constructor = ip6_frag_init;
nf_frags.destructor = NULL; nf_frags.destructor = NULL;
nf_frags.qsize = sizeof(struct frag_queue); nf_frags.qsize = sizeof(struct frag_queue);
nf_frags.match = ip6_frag_match;
nf_frags.frag_expire = nf_ct_frag6_expire; nf_frags.frag_expire = nf_ct_frag6_expire;
nf_frags.frags_cache_name = nf_frags_cache_name; nf_frags.frags_cache_name = nf_frags_cache_name;
nf_frags.rhash_params = ip6_rhash_params;
ret = inet_frags_init(&nf_frags); ret = inet_frags_init(&nf_frags);
if (ret) if (ret)
goto out; goto out;
......
...@@ -79,52 +79,13 @@ static struct inet_frags ip6_frags; ...@@ -79,52 +79,13 @@ static struct inet_frags ip6_frags;
static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
struct net_device *dev); struct net_device *dev);
/*
* callers should be careful not to use the hash value outside the ipfrag_lock
* as doing so could race with ipfrag_hash_rnd being recalculated.
*/
static unsigned int inet6_hash_frag(__be32 id, const struct in6_addr *saddr,
const struct in6_addr *daddr)
{
net_get_random_once(&ip6_frags.rnd, sizeof(ip6_frags.rnd));
return jhash_3words(ipv6_addr_hash(saddr), ipv6_addr_hash(daddr),
(__force u32)id, ip6_frags.rnd);
}
static unsigned int ip6_hashfn(const struct inet_frag_queue *q)
{
const struct frag_queue *fq;
fq = container_of(q, struct frag_queue, q);
return inet6_hash_frag(fq->id, &fq->saddr, &fq->daddr);
}
bool ip6_frag_match(const struct inet_frag_queue *q, const void *a)
{
const struct frag_queue *fq;
const struct ip6_create_arg *arg = a;
fq = container_of(q, struct frag_queue, q);
return fq->id == arg->id &&
fq->user == arg->user &&
ipv6_addr_equal(&fq->saddr, arg->src) &&
ipv6_addr_equal(&fq->daddr, arg->dst) &&
(arg->iif == fq->iif ||
!(ipv6_addr_type(arg->dst) & (IPV6_ADDR_MULTICAST |
IPV6_ADDR_LINKLOCAL)));
}
EXPORT_SYMBOL(ip6_frag_match);
void ip6_frag_init(struct inet_frag_queue *q, const void *a) void ip6_frag_init(struct inet_frag_queue *q, const void *a)
{ {
struct frag_queue *fq = container_of(q, struct frag_queue, q); struct frag_queue *fq = container_of(q, struct frag_queue, q);
const struct ip6_create_arg *arg = a; const struct frag_v6_compare_key *key = a;
fq->id = arg->id; q->key.v6 = *key;
fq->user = arg->user; fq->ecn = 0;
fq->saddr = *arg->src;
fq->daddr = *arg->dst;
fq->ecn = arg->ecn;
} }
EXPORT_SYMBOL(ip6_frag_init); EXPORT_SYMBOL(ip6_frag_init);
...@@ -182,23 +143,22 @@ static void ip6_frag_expire(struct timer_list *t) ...@@ -182,23 +143,22 @@ static void ip6_frag_expire(struct timer_list *t)
} }
static struct frag_queue * static struct frag_queue *
fq_find(struct net *net, __be32 id, const struct in6_addr *src, fq_find(struct net *net, __be32 id, const struct ipv6hdr *hdr, int iif)
const struct in6_addr *dst, int iif, u8 ecn)
{ {
struct frag_v6_compare_key key = {
.id = id,
.saddr = hdr->saddr,
.daddr = hdr->daddr,
.user = IP6_DEFRAG_LOCAL_DELIVER,
.iif = iif,
};
struct inet_frag_queue *q; struct inet_frag_queue *q;
struct ip6_create_arg arg;
unsigned int hash;
arg.id = id;
arg.user = IP6_DEFRAG_LOCAL_DELIVER;
arg.src = src;
arg.dst = dst;
arg.iif = iif;
arg.ecn = ecn;
hash = inet6_hash_frag(id, src, dst); if (!(ipv6_addr_type(&hdr->daddr) & (IPV6_ADDR_MULTICAST |
IPV6_ADDR_LINKLOCAL)))
key.iif = 0;
q = inet_frag_find(&net->ipv6.frags, &ip6_frags, &arg, hash); q = inet_frag_find(&net->ipv6.frags, &key);
if (IS_ERR_OR_NULL(q)) { if (IS_ERR_OR_NULL(q)) {
inet_frag_maybe_warn_overflow(q, pr_fmt()); inet_frag_maybe_warn_overflow(q, pr_fmt());
return NULL; return NULL;
...@@ -530,6 +490,7 @@ static int ipv6_frag_rcv(struct sk_buff *skb) ...@@ -530,6 +490,7 @@ static int ipv6_frag_rcv(struct sk_buff *skb)
struct frag_queue *fq; struct frag_queue *fq;
const struct ipv6hdr *hdr = ipv6_hdr(skb); const struct ipv6hdr *hdr = ipv6_hdr(skb);
struct net *net = dev_net(skb_dst(skb)->dev); struct net *net = dev_net(skb_dst(skb)->dev);
int iif;
if (IP6CB(skb)->flags & IP6SKB_FRAGMENTED) if (IP6CB(skb)->flags & IP6SKB_FRAGMENTED)
goto fail_hdr; goto fail_hdr;
...@@ -558,13 +519,14 @@ static int ipv6_frag_rcv(struct sk_buff *skb) ...@@ -558,13 +519,14 @@ static int ipv6_frag_rcv(struct sk_buff *skb)
return 1; return 1;
} }
fq = fq_find(net, fhdr->identification, &hdr->saddr, &hdr->daddr, iif = skb->dev ? skb->dev->ifindex : 0;
skb->dev ? skb->dev->ifindex : 0, ip6_frag_ecn(hdr)); fq = fq_find(net, fhdr->identification, hdr, iif);
if (fq) { if (fq) {
int ret; int ret;
spin_lock(&fq->q.lock); spin_lock(&fq->q.lock);
fq->iif = iif;
ret = ip6_frag_queue(fq, skb, fhdr, IP6CB(skb)->nhoff); ret = ip6_frag_queue(fq, skb, fhdr, IP6CB(skb)->nhoff);
spin_unlock(&fq->q.lock); spin_unlock(&fq->q.lock);
...@@ -738,17 +700,47 @@ static struct pernet_operations ip6_frags_ops = { ...@@ -738,17 +700,47 @@ static struct pernet_operations ip6_frags_ops = {
.exit = ipv6_frags_exit_net, .exit = ipv6_frags_exit_net,
}; };
static u32 ip6_key_hashfn(const void *data, u32 len, u32 seed)
{
return jhash2(data,
sizeof(struct frag_v6_compare_key) / sizeof(u32), seed);
}
static u32 ip6_obj_hashfn(const void *data, u32 len, u32 seed)
{
const struct inet_frag_queue *fq = data;
return jhash2((const u32 *)&fq->key.v6,
sizeof(struct frag_v6_compare_key) / sizeof(u32), seed);
}
static int ip6_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *ptr)
{
const struct frag_v6_compare_key *key = arg->key;
const struct inet_frag_queue *fq = ptr;
return !!memcmp(&fq->key, key, sizeof(*key));
}
const struct rhashtable_params ip6_rhash_params = {
.head_offset = offsetof(struct inet_frag_queue, node),
.hashfn = ip6_key_hashfn,
.obj_hashfn = ip6_obj_hashfn,
.obj_cmpfn = ip6_obj_cmpfn,
.automatic_shrinking = true,
};
EXPORT_SYMBOL(ip6_rhash_params);
int __init ipv6_frag_init(void) int __init ipv6_frag_init(void)
{ {
int ret; int ret;
ip6_frags.hashfn = ip6_hashfn;
ip6_frags.constructor = ip6_frag_init; ip6_frags.constructor = ip6_frag_init;
ip6_frags.destructor = NULL; ip6_frags.destructor = NULL;
ip6_frags.qsize = sizeof(struct frag_queue); ip6_frags.qsize = sizeof(struct frag_queue);
ip6_frags.match = ip6_frag_match;
ip6_frags.frag_expire = ip6_frag_expire; ip6_frags.frag_expire = ip6_frag_expire;
ip6_frags.frags_cache_name = ip6_frag_cache_name; ip6_frags.frags_cache_name = ip6_frag_cache_name;
ip6_frags.rhash_params = ip6_rhash_params;
ret = inet_frags_init(&ip6_frags); ret = inet_frags_init(&ip6_frags);
if (ret) if (ret)
goto out; goto out;
......