Commit 1a01a075 authored by David S. Miller

Merge git://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf-next

Pablo Neira Ayuso says:

====================
Netfilter updates for net-next

This is v2, including a fix for the deadlock in the conntrack ecache
rework that was reported by Jakub Kicinski.

The following patchset contains Netfilter updates for net-next,
mostly updates to conntrack from Florian Westphal.

1) Add a dedicated list for conntrack event redelivery.

2) Include event redelivery list in conntrack dumps of dying type.

3) Remove per-cpu dying list for event redelivery, not used anymore.

4) Add a netns .pre_exit hook to cttimeout to zap timeout objects before
   the synchronize_rcu() call.

5) Remove nf_ct_unconfirmed_destroy.

6) Add a generation id to conntrack extensions, used by the conntrack
   timeout and helper code.

7) Detach timeout policy from conntrack on cttimeout module removal.

8) Remove __nf_ct_unconfirmed_destroy.

9) Remove unconfirmed list.

10) Remove unconditional local_bh_disable in init_conntrack().

11) Consolidate conntrack iterator nf_ct_iterate_cleanup().

12) Detect if ctnetlink listeners exist to short-circuit event
    path early.

13) Un-inline nf_ct_ecache_ext_add().

14) Add an nf_conntrack_events mode that autodetects ctnetlink listeners
    and make it the default.

15) Add nf_ct_ecache_exist() to check for event cache extension.

16) Extend flowtable reverse route lookup to include source, iif,
    tos and mark, from Sven Auhagen.

17) Do not verify zero checksum UDP packets in nf_reject,
    from Kevin Mitchell.

====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents d9713088 4f9bd530
......@@ -34,10 +34,13 @@ nf_conntrack_count - INTEGER (read-only)
nf_conntrack_events - BOOLEAN
- 0 - disabled
- not 0 - enabled (default)
- 1 - enabled
- 2 - auto (default)
If this option is enabled, the connection tracking code will
provide userspace with connection tracking events via ctnetlink.
The default allocates the extension if a userspace program is
listening to ctnetlink events.
nf_conntrack_expect_max - INTEGER
Maximum size of expectation table. Default value is
......
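For context (not part of this diff): in the new autodetect mode the event cache extension is only allocated once a process has bound to the ctnetlink event groups, which is what the nfnetlink_bind()/nfnetlink_unbind() hunks further down track per netns. A minimal userspace listener that would trigger this, sketched with libnetfilter_conntrack (illustrative only, error handling trimmed):

#include <stdio.h>
#include <libnetfilter_conntrack/libnetfilter_conntrack.h>

static int event_cb(enum nf_conntrack_msg_type type,
		    struct nf_conntrack *ct, void *data)
{
	char buf[1024];

	nfct_snprintf(buf, sizeof(buf), ct, type, NFCT_O_DEFAULT, 0);
	puts(buf);
	return NFCT_CB_CONTINUE;
}

int main(void)
{
	/* Subscribing to the conntrack event groups is what flips the
	 * per-netns "ctnetlink has listeners" state in the kernel. */
	struct nfct_handle *h = nfct_open(CONNTRACK,
					  NF_NETLINK_CONNTRACK_NEW |
					  NF_NETLINK_CONNTRACK_UPDATE |
					  NF_NETLINK_CONNTRACK_DESTROY);
	if (!h)
		return 1;

	nfct_callback_register(h, NFCT_T_ALL, event_cb, NULL);
	nfct_catch(h);	/* blocks, delivering events to event_cb() */
	nfct_close(h);
	return 0;
}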
......@@ -45,7 +45,8 @@ union nf_conntrack_expect_proto {
struct nf_conntrack_net_ecache {
struct delayed_work dwork;
struct netns_ct *ct_net;
spinlock_t dying_lock;
struct hlist_nulls_head dying_list;
};
struct nf_conntrack_net {
......@@ -100,7 +101,6 @@ struct nf_conn {
/* Have we seen traffic both ways yet? (bitset) */
unsigned long status;
u16 cpu;
possible_net_t ct_net;
#if IS_ENABLED(CONFIG_NF_NAT)
......@@ -236,13 +236,16 @@ static inline bool nf_ct_kill(struct nf_conn *ct)
return nf_ct_delete(ct, 0, 0);
}
/* Set all unconfirmed conntrack as dying */
void nf_ct_unconfirmed_destroy(struct net *);
struct nf_ct_iter_data {
struct net *net;
void *data;
u32 portid;
int report;
};
/* Iterate over all conntracks: if iter returns true, it's deleted. */
void nf_ct_iterate_cleanup_net(struct net *net,
int (*iter)(struct nf_conn *i, void *data),
void *data, u32 portid, int report);
void nf_ct_iterate_cleanup_net(int (*iter)(struct nf_conn *i, void *data),
const struct nf_ct_iter_data *iter_data);
/* also set unconfirmed conntracks as dying. Only use in module exit path. */
void nf_ct_iterate_destroy(int (*iter)(struct nf_conn *i, void *data),
......
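Callers now bundle net, data, portid and report into struct nf_ct_iter_data instead of passing them as positional arguments; the ctnetlink, cttimeout and masquerade hunks later in this merge are converted exactly this way. A minimal sketch of a hypothetical in-kernel caller (the predicate and helper names are made up for illustration; conntrack marks need CONFIG_NF_CONNTRACK_MARK):

#include <net/netfilter/nf_conntrack.h>

/* Hypothetical example: delete every entry carrying a given ct mark. */
static int kill_by_mark(struct nf_conn *ct, void *data)
{
	const u32 *mark = data;

	/* a non-zero return value deletes the entry */
	return ct->mark == *mark;
}

static void flush_marked_conntracks(struct net *net, u32 mark)
{
	struct nf_ct_iter_data iter_data = {
		.net	= net,
		.data	= &mark,
		/* .portid and .report stay zero: no netlink originator */
	};

	nf_ct_iterate_cleanup_net(kill_by_mark, &iter_data);
}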
......@@ -60,7 +60,7 @@ static inline int nf_conntrack_confirm(struct sk_buff *skb)
if (ct) {
if (!nf_ct_is_confirmed(ct))
ret = __nf_conntrack_confirm(skb);
if (likely(ret == NF_ACCEPT))
if (ret == NF_ACCEPT && nf_ct_ecache_exist(ct))
nf_ct_deliver_cached_events(ct);
}
return ret;
......
......@@ -14,7 +14,6 @@
#include <net/netfilter/nf_conntrack_extend.h>
enum nf_ct_ecache_state {
NFCT_ECACHE_UNKNOWN, /* destroy event not sent */
NFCT_ECACHE_DESTROY_FAIL, /* tried but failed to send destroy event */
NFCT_ECACHE_DESTROY_SENT, /* sent destroy event after failure */
};
......@@ -23,7 +22,6 @@ struct nf_conntrack_ecache {
unsigned long cache; /* bitops want long */
u16 ctmask; /* bitmask of ct events to be delivered */
u16 expmask; /* bitmask of expect events to be delivered */
enum nf_ct_ecache_state state:8;/* ecache state */
u32 missed; /* missed events */
u32 portid; /* netlink portid of destroyer */
};
......@@ -38,28 +36,12 @@ nf_ct_ecache_find(const struct nf_conn *ct)
#endif
}
static inline struct nf_conntrack_ecache *
nf_ct_ecache_ext_add(struct nf_conn *ct, u16 ctmask, u16 expmask, gfp_t gfp)
static inline bool nf_ct_ecache_exist(const struct nf_conn *ct)
{
#ifdef CONFIG_NF_CONNTRACK_EVENTS
struct net *net = nf_ct_net(ct);
struct nf_conntrack_ecache *e;
if (!ctmask && !expmask && net->ct.sysctl_events) {
ctmask = ~0;
expmask = ~0;
}
if (!ctmask && !expmask)
return NULL;
e = nf_ct_ext_add(ct, NF_CT_EXT_ECACHE, gfp);
if (e) {
e->ctmask = ctmask;
e->expmask = expmask;
}
return e;
return nf_ct_ext_exist(ct, NF_CT_EXT_ECACHE);
#else
return NULL;
return false;
#endif
}
......@@ -91,6 +73,7 @@ void nf_ct_deliver_cached_events(struct nf_conn *ct);
int nf_conntrack_eventmask_report(unsigned int eventmask, struct nf_conn *ct,
u32 portid, int report);
bool nf_ct_ecache_ext_add(struct nf_conn *ct, u16 ctmask, u16 expmask, gfp_t gfp);
#else
static inline void nf_ct_deliver_cached_events(const struct nf_conn *ct)
......@@ -105,6 +88,10 @@ static inline int nf_conntrack_eventmask_report(unsigned int eventmask,
return 0;
}
static inline bool nf_ct_ecache_ext_add(struct nf_conn *ct, u16 ctmask, u16 expmask, gfp_t gfp)
{
return false;
}
#endif
static inline void
......@@ -130,30 +117,20 @@ nf_conntrack_event_report(enum ip_conntrack_events event, struct nf_conn *ct,
u32 portid, int report)
{
#ifdef CONFIG_NF_CONNTRACK_EVENTS
const struct net *net = nf_ct_net(ct);
if (!rcu_access_pointer(net->ct.nf_conntrack_event_cb))
return 0;
return nf_conntrack_eventmask_report(1 << event, ct, portid, report);
#else
return 0;
if (nf_ct_ecache_exist(ct))
return nf_conntrack_eventmask_report(1 << event, ct, portid, report);
#endif
return 0;
}
static inline int
nf_conntrack_event(enum ip_conntrack_events event, struct nf_conn *ct)
{
#ifdef CONFIG_NF_CONNTRACK_EVENTS
const struct net *net = nf_ct_net(ct);
if (!rcu_access_pointer(net->ct.nf_conntrack_event_cb))
return 0;
return nf_conntrack_eventmask_report(1 << event, ct, 0, 0);
#else
return 0;
if (nf_ct_ecache_exist(ct))
return nf_conntrack_eventmask_report(1 << event, ct, 0, 0);
#endif
return 0;
}
#ifdef CONFIG_NF_CONNTRACK_EVENTS
......@@ -166,6 +143,8 @@ void nf_conntrack_ecache_work(struct net *net, enum nf_ct_ecache_state state);
void nf_conntrack_ecache_pernet_init(struct net *net);
void nf_conntrack_ecache_pernet_fini(struct net *net);
struct nf_conntrack_net_ecache *nf_conn_pernet_ecache(const struct net *net);
static inline bool nf_conntrack_ecache_dwork_pending(const struct net *net)
{
return net->ct.ecache_dwork_pending;
......
......@@ -34,21 +34,11 @@ enum nf_ct_ext_id {
NF_CT_EXT_NUM,
};
#define NF_CT_EXT_HELPER_TYPE struct nf_conn_help
#define NF_CT_EXT_NAT_TYPE struct nf_conn_nat
#define NF_CT_EXT_SEQADJ_TYPE struct nf_conn_seqadj
#define NF_CT_EXT_ACCT_TYPE struct nf_conn_acct
#define NF_CT_EXT_ECACHE_TYPE struct nf_conntrack_ecache
#define NF_CT_EXT_TSTAMP_TYPE struct nf_conn_tstamp
#define NF_CT_EXT_TIMEOUT_TYPE struct nf_conn_timeout
#define NF_CT_EXT_LABELS_TYPE struct nf_conn_labels
#define NF_CT_EXT_SYNPROXY_TYPE struct nf_conn_synproxy
#define NF_CT_EXT_ACT_CT_TYPE struct nf_conn_act_ct_ext
/* Extensions: optional stuff which isn't permanently in struct. */
struct nf_ct_ext {
u8 offset[NF_CT_EXT_NUM];
u8 len;
unsigned int gen_id;
char data[] __aligned(8);
};
......@@ -62,17 +52,28 @@ static inline bool nf_ct_ext_exist(const struct nf_conn *ct, u8 id)
return (ct->ext && __nf_ct_ext_exist(ct->ext, id));
}
static inline void *__nf_ct_ext_find(const struct nf_conn *ct, u8 id)
void *__nf_ct_ext_find(const struct nf_ct_ext *ext, u8 id);
static inline void *nf_ct_ext_find(const struct nf_conn *ct, u8 id)
{
if (!nf_ct_ext_exist(ct, id))
struct nf_ct_ext *ext = ct->ext;
if (!ext || !__nf_ct_ext_exist(ext, id))
return NULL;
if (unlikely(ext->gen_id))
return __nf_ct_ext_find(ext, id);
return (void *)ct->ext + ct->ext->offset[id];
}
#define nf_ct_ext_find(ext, id) \
((id##_TYPE *)__nf_ct_ext_find((ext), (id)))
/* Add this type, returns pointer to data or NULL. */
void *nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp);
/* ext genid. if ext->id != ext_genid, extensions cannot be used
* anymore unless conntrack has CONFIRMED bit set.
*/
extern atomic_t nf_conntrack_ext_genid;
void nf_ct_ext_bump_genid(void);
#endif /* _NF_CONNTRACK_EXTEND_H */
......@@ -17,10 +17,18 @@ struct nf_conn_labels {
unsigned long bits[NF_CT_LABELS_MAX_SIZE / sizeof(long)];
};
/* Can't use nf_ct_ext_find(), flow dissector cannot use symbols
* exported by nf_conntrack module.
*/
static inline struct nf_conn_labels *nf_ct_labels_find(const struct nf_conn *ct)
{
#ifdef CONFIG_NF_CONNTRACK_LABELS
return nf_ct_ext_find(ct, NF_CT_EXT_LABELS);
struct nf_ct_ext *ext = ct->ext;
if (!ext || !__nf_ct_ext_exist(ext, NF_CT_EXT_LABELS))
return NULL;
return (void *)ct->ext + ct->ext->offset[NF_CT_EXT_LABELS];
#else
return NULL;
#endif
......
......@@ -17,14 +17,6 @@ struct nf_ct_timeout {
char data[];
};
struct ctnl_timeout {
struct list_head head;
struct rcu_head rcu_head;
refcount_t refcnt;
char name[CTNL_TIMEOUT_NAME_MAX];
struct nf_ct_timeout timeout;
};
struct nf_conn_timeout {
struct nf_ct_timeout __rcu *timeout;
};
......
......@@ -5,12 +5,28 @@
#include <linux/types.h>
#include <uapi/linux/in.h>
static inline bool nf_reject_verify_csum(__u8 proto)
static inline bool nf_reject_verify_csum(struct sk_buff *skb, int dataoff,
__u8 proto)
{
/* Skip protocols that don't use 16-bit one's complement checksum
* of the entire payload.
*/
switch (proto) {
/* Protocols with optional checksums. */
case IPPROTO_UDP: {
const struct udphdr *udp_hdr;
struct udphdr _udp_hdr;
udp_hdr = skb_header_pointer(skb, dataoff,
sizeof(_udp_hdr),
&_udp_hdr);
if (!udp_hdr || udp_hdr->check)
return true;
return false;
}
case IPPROTO_GRE:
/* Protocols with other integrity checks. */
case IPPROTO_AH:
case IPPROTO_ESP:
......@@ -19,9 +35,6 @@ static inline bool nf_reject_verify_csum(__u8 proto)
/* Protocols with partial checksums. */
case IPPROTO_UDPLITE:
case IPPROTO_DCCP:
/* Protocols with optional checksums. */
case IPPROTO_GRE:
return false;
}
return true;
......
......@@ -93,14 +93,9 @@ struct nf_ip_net {
#endif
};
struct ct_pcpu {
spinlock_t lock;
struct hlist_nulls_head unconfirmed;
struct hlist_nulls_head dying;
};
struct netns_ct {
#ifdef CONFIG_NF_CONNTRACK_EVENTS
bool ctnetlink_has_listener;
bool ecache_dwork_pending;
#endif
u8 sysctl_log_invalid; /* Log invalid packets */
......@@ -110,7 +105,6 @@ struct netns_ct {
u8 sysctl_tstamp;
u8 sysctl_checksum;
struct ct_pcpu __percpu *pcpu_lists;
struct ip_conntrack_stat __percpu *stat;
struct nf_ct_event_notifier __rcu *nf_conntrack_event_cb;
struct nf_ip_net nf_ct_proto;
......
......@@ -80,6 +80,7 @@ struct sk_buff *nf_reject_skb_v4_unreach(struct net *net,
struct iphdr *niph;
struct icmphdr *icmph;
unsigned int len;
int dataoff;
__wsum csum;
u8 proto;
......@@ -99,10 +100,11 @@ struct sk_buff *nf_reject_skb_v4_unreach(struct net *net,
if (pskb_trim_rcsum(oldskb, ntohs(ip_hdr(oldskb)->tot_len)))
return NULL;
dataoff = ip_hdrlen(oldskb);
proto = ip_hdr(oldskb)->protocol;
if (!skb_csum_unnecessary(oldskb) &&
nf_reject_verify_csum(proto) &&
nf_reject_verify_csum(oldskb, dataoff, proto) &&
nf_ip_checksum(oldskb, hook, ip_hdrlen(oldskb), proto))
return NULL;
......@@ -311,6 +313,7 @@ EXPORT_SYMBOL_GPL(nf_send_reset);
void nf_send_unreach(struct sk_buff *skb_in, int code, int hook)
{
struct iphdr *iph = ip_hdr(skb_in);
int dataoff = ip_hdrlen(skb_in);
u8 proto = iph->protocol;
if (iph->frag_off & htons(IP_OFFSET))
......@@ -320,12 +323,13 @@ void nf_send_unreach(struct sk_buff *skb_in, int code, int hook)
nf_reject_fill_skb_dst(skb_in) < 0)
return;
if (skb_csum_unnecessary(skb_in) || !nf_reject_verify_csum(proto)) {
if (skb_csum_unnecessary(skb_in) ||
!nf_reject_verify_csum(skb_in, dataoff, proto)) {
icmp_send(skb_in, ICMP_DEST_UNREACH, code, 0);
return;
}
if (nf_ip_checksum(skb_in, hook, ip_hdrlen(skb_in), proto) == 0)
if (nf_ip_checksum(skb_in, hook, dataoff, proto) == 0)
icmp_send(skb_in, ICMP_DEST_UNREACH, code, 0);
}
EXPORT_SYMBOL_GPL(nf_send_unreach);
......
......@@ -31,7 +31,7 @@ static bool nf_reject_v6_csum_ok(struct sk_buff *skb, int hook)
if (thoff < 0 || thoff >= skb->len || (fo & htons(~0x7)) != 0)
return false;
if (!nf_reject_verify_csum(proto))
if (!nf_reject_verify_csum(skb, thoff, proto))
return true;
return nf_ip6_checksum(skb, hook, thoff, proto) == 0;
......@@ -388,7 +388,7 @@ static bool reject6_csum_ok(struct sk_buff *skb, int hook)
if (thoff < 0 || thoff >= skb->len || (fo & htons(~0x7)) != 0)
return false;
if (!nf_reject_verify_csum(proto))
if (!nf_reject_verify_csum(skb, thoff, proto))
return true;
return nf_ip6_checksum(skb, hook, thoff, proto) == 0;
......
......@@ -16,7 +16,6 @@
#include <linux/vmalloc.h>
#include <linux/stddef.h>
#include <linux/err.h>
#include <linux/percpu.h>
#include <linux/kernel.h>
#include <linux/netdevice.h>
#include <linux/slab.h>
......@@ -29,8 +28,9 @@
static DEFINE_MUTEX(nf_ct_ecache_mutex);
#define ECACHE_RETRY_WAIT (HZ/10)
#define ECACHE_STACK_ALLOC (256 / sizeof(void *))
#define DYING_NULLS_VAL ((1 << 30) + 1)
#define ECACHE_MAX_JIFFIES msecs_to_jiffies(10)
#define ECACHE_RETRY_JIFFIES msecs_to_jiffies(10)
enum retry_state {
STATE_CONGESTED,
......@@ -38,58 +38,67 @@ enum retry_state {
STATE_DONE,
};
static enum retry_state ecache_work_evict_list(struct ct_pcpu *pcpu)
struct nf_conntrack_net_ecache *nf_conn_pernet_ecache(const struct net *net)
{
struct nf_conn *refs[ECACHE_STACK_ALLOC];
struct nf_conntrack_net *cnet = nf_ct_pernet(net);
return &cnet->ecache;
}
#if IS_MODULE(CONFIG_NF_CT_NETLINK)
EXPORT_SYMBOL_GPL(nf_conn_pernet_ecache);
#endif
static enum retry_state ecache_work_evict_list(struct nf_conntrack_net *cnet)
{
unsigned long stop = jiffies + ECACHE_MAX_JIFFIES;
struct hlist_nulls_head evicted_list;
enum retry_state ret = STATE_DONE;
struct nf_conntrack_tuple_hash *h;
struct hlist_nulls_node *n;
unsigned int evicted = 0;
unsigned int sent;
spin_lock(&pcpu->lock);
INIT_HLIST_NULLS_HEAD(&evicted_list, DYING_NULLS_VAL);
hlist_nulls_for_each_entry(h, n, &pcpu->dying, hnnode) {
next:
sent = 0;
spin_lock_bh(&cnet->ecache.dying_lock);
hlist_nulls_for_each_entry_safe(h, n, &cnet->ecache.dying_list, hnnode) {
struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
struct nf_conntrack_ecache *e;
if (!nf_ct_is_confirmed(ct))
continue;
/* This ecache access is safe because the ct is on the
* pcpu dying list and we hold the spinlock -- the entry
* cannot be free'd until after the lock is released.
*
* This is true even if ct has a refcount of 0: the
* cpu that is about to free the entry must remove it
* from the dying list and needs the lock to do so.
*/
e = nf_ct_ecache_find(ct);
if (!e || e->state != NFCT_ECACHE_DESTROY_FAIL)
continue;
/* ct is in NFCT_ECACHE_DESTROY_FAIL state, this means
* the worker owns this entry: the ct will remain valid
* until the worker puts its ct reference.
/* The worker owns all entries, ct remains valid until nf_ct_put
* in the loop below.
*/
if (nf_conntrack_event(IPCT_DESTROY, ct)) {
ret = STATE_CONGESTED;
break;
}
e->state = NFCT_ECACHE_DESTROY_SENT;
refs[evicted] = ct;
hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode, &evicted_list);
if (++evicted >= ARRAY_SIZE(refs)) {
if (time_after(stop, jiffies)) {
ret = STATE_RESTART;
break;
}
if (sent++ > 16) {
spin_unlock_bh(&cnet->ecache.dying_lock);
cond_resched();
goto next;
}
}
spin_unlock(&pcpu->lock);
spin_unlock_bh(&cnet->ecache.dying_lock);
/* can't _put while holding lock */
while (evicted)
nf_ct_put(refs[--evicted]);
hlist_nulls_for_each_entry_safe(h, n, &evicted_list, hnnode) {
struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode);
nf_ct_put(ct);
cond_resched();
}
return ret;
}
......@@ -97,35 +106,20 @@ static enum retry_state ecache_work_evict_list(struct ct_pcpu *pcpu)
static void ecache_work(struct work_struct *work)
{
struct nf_conntrack_net *cnet = container_of(work, struct nf_conntrack_net, ecache.dwork.work);
struct netns_ct *ctnet = cnet->ecache.ct_net;
int cpu, delay = -1;
struct ct_pcpu *pcpu;
local_bh_disable();
for_each_possible_cpu(cpu) {
enum retry_state ret;
pcpu = per_cpu_ptr(ctnet->pcpu_lists, cpu);
ret = ecache_work_evict_list(pcpu);
switch (ret) {
case STATE_CONGESTED:
delay = ECACHE_RETRY_WAIT;
goto out;
case STATE_RESTART:
delay = 0;
break;
case STATE_DONE:
break;
}
int ret, delay = -1;
ret = ecache_work_evict_list(cnet);
switch (ret) {
case STATE_CONGESTED:
delay = ECACHE_RETRY_JIFFIES;
break;
case STATE_RESTART:
delay = 0;
break;
case STATE_DONE:
break;
}
out:
local_bh_enable();
ctnet->ecache_dwork_pending = delay > 0;
if (delay >= 0)
schedule_delayed_work(&cnet->ecache.dwork, delay);
}
......@@ -199,7 +193,6 @@ int nf_conntrack_eventmask_report(unsigned int events, struct nf_conn *ct,
*/
if (e->portid == 0 && portid != 0)
e->portid = portid;
e->state = NFCT_ECACHE_DESTROY_FAIL;
}
return ret;
......@@ -297,12 +290,51 @@ void nf_conntrack_ecache_work(struct net *net, enum nf_ct_ecache_state state)
schedule_delayed_work(&cnet->ecache.dwork, HZ);
net->ct.ecache_dwork_pending = true;
} else if (state == NFCT_ECACHE_DESTROY_SENT) {
net->ct.ecache_dwork_pending = false;
mod_delayed_work(system_wq, &cnet->ecache.dwork, 0);
if (!hlist_nulls_empty(&cnet->ecache.dying_list))
mod_delayed_work(system_wq, &cnet->ecache.dwork, 0);
else
net->ct.ecache_dwork_pending = false;
}
}
#define NF_CT_EVENTS_DEFAULT 1
bool nf_ct_ecache_ext_add(struct nf_conn *ct, u16 ctmask, u16 expmask, gfp_t gfp)
{
struct net *net = nf_ct_net(ct);
struct nf_conntrack_ecache *e;
switch (net->ct.sysctl_events) {
case 0:
/* assignment via template / ruleset? ignore sysctl. */
if (ctmask || expmask)
break;
return true;
case 2: /* autodetect: no event listener, don't allocate extension. */
if (!READ_ONCE(net->ct.ctnetlink_has_listener))
return true;
fallthrough;
case 1:
/* always allocate an extension. */
if (!ctmask && !expmask) {
ctmask = ~0;
expmask = ~0;
}
break;
default:
WARN_ON_ONCE(1);
return true;
}
e = nf_ct_ext_add(ct, NF_CT_EXT_ECACHE, gfp);
if (e) {
e->ctmask = ctmask;
e->expmask = expmask;
}
return e != NULL;
}
EXPORT_SYMBOL_GPL(nf_ct_ecache_ext_add);
#define NF_CT_EVENTS_DEFAULT 2
static int nf_ct_events __read_mostly = NF_CT_EVENTS_DEFAULT;
void nf_conntrack_ecache_pernet_init(struct net *net)
......@@ -311,8 +343,9 @@ void nf_conntrack_ecache_pernet_init(struct net *net)
net->ct.sysctl_events = nf_ct_events;
cnet->ecache.ct_net = &net->ct;
INIT_DELAYED_WORK(&cnet->ecache.dwork, ecache_work);
INIT_HLIST_NULLS_HEAD(&cnet->ecache.dying_list, DYING_NULLS_VAL);
spin_lock_init(&cnet->ecache.dying_lock);
BUILD_BUG_ON(__IPCT_MAX >= 16); /* e->ctmask is u16 */
}
......
......@@ -27,6 +27,8 @@
#define NF_CT_EXT_PREALLOC 128u /* conntrack events are on by default */
atomic_t nf_conntrack_ext_genid __read_mostly = ATOMIC_INIT(1);
static const u8 nf_ct_ext_type_len[NF_CT_EXT_NUM] = {
[NF_CT_EXT_HELPER] = sizeof(struct nf_conn_help),
#if IS_ENABLED(CONFIG_NF_NAT)
......@@ -116,8 +118,10 @@ void *nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp)
if (!new)
return NULL;
if (!ct->ext)
if (!ct->ext) {
memset(new->offset, 0, sizeof(new->offset));
new->gen_id = atomic_read(&nf_conntrack_ext_genid);
}
new->offset[id] = newoff;
new->len = newlen;
......@@ -127,3 +131,29 @@ void *nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp)
return (void *)new + newoff;
}
EXPORT_SYMBOL(nf_ct_ext_add);
/* Use nf_ct_ext_find wrapper. This is only useful for unconfirmed entries. */
void *__nf_ct_ext_find(const struct nf_ct_ext *ext, u8 id)
{
unsigned int gen_id = atomic_read(&nf_conntrack_ext_genid);
unsigned int this_id = READ_ONCE(ext->gen_id);
if (!__nf_ct_ext_exist(ext, id))
return NULL;
if (this_id == 0 || ext->gen_id == gen_id)
return (void *)ext + ext->offset[id];
return NULL;
}
EXPORT_SYMBOL(__nf_ct_ext_find);
void nf_ct_ext_bump_genid(void)
{
unsigned int value = atomic_inc_return(&nf_conntrack_ext_genid);
if (value == UINT_MAX)
atomic_set(&nf_conntrack_ext_genid, 1);
msleep(HZ);
}
......@@ -468,11 +468,6 @@ void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me)
nf_ct_expect_iterate_destroy(expect_iter_me, NULL);
nf_ct_iterate_destroy(unhelp, me);
/* Maybe someone has gotten the helper already when unhelp above.
* So need to wait it.
*/
synchronize_rcu();
}
EXPORT_SYMBOL_GPL(nf_conntrack_helper_unregister);
......
......@@ -1559,6 +1559,11 @@ static int ctnetlink_flush_conntrack(struct net *net,
u32 portid, int report, u8 family)
{
struct ctnetlink_filter *filter = NULL;
struct nf_ct_iter_data iter = {
.net = net,
.portid = portid,
.report = report,
};
if (ctnetlink_needs_filter(family, cda)) {
if (cda[CTA_FILTER])
......@@ -1567,10 +1572,11 @@ static int ctnetlink_flush_conntrack(struct net *net,
filter = ctnetlink_alloc_filter(cda, family);
if (IS_ERR(filter))
return PTR_ERR(filter);
iter.data = filter;
}
nf_ct_iterate_cleanup_net(net, ctnetlink_flush_iterate, filter,
portid, report);
nf_ct_iterate_cleanup_net(ctnetlink_flush_iterate, &iter);
kfree(filter);
return 0;
......@@ -1750,61 +1756,59 @@ static int ctnetlink_dump_one_entry(struct sk_buff *skb,
}
static int
ctnetlink_dump_list(struct sk_buff *skb, struct netlink_callback *cb, bool dying)
ctnetlink_dump_unconfirmed(struct sk_buff *skb, struct netlink_callback *cb)
{
return 0;
}
static int
ctnetlink_dump_dying(struct sk_buff *skb, struct netlink_callback *cb)
{
struct ctnetlink_list_dump_ctx *ctx = (void *)cb->ctx;
struct nf_conn *ct, *last;
struct nf_conn *last = ctx->last;
#ifdef CONFIG_NF_CONNTRACK_EVENTS
const struct net *net = sock_net(skb->sk);
struct nf_conntrack_net_ecache *ecache_net;
struct nf_conntrack_tuple_hash *h;
struct hlist_nulls_node *n;
struct hlist_nulls_head *list;
struct net *net = sock_net(skb->sk);
int res, cpu;
#endif
if (ctx->done)
return 0;
last = ctx->last;
ctx->last = NULL;
for (cpu = ctx->cpu; cpu < nr_cpu_ids; cpu++) {
struct ct_pcpu *pcpu;
#ifdef CONFIG_NF_CONNTRACK_EVENTS
ecache_net = nf_conn_pernet_ecache(net);
spin_lock_bh(&ecache_net->dying_lock);
if (!cpu_possible(cpu))
continue;
hlist_nulls_for_each_entry(h, n, &ecache_net->dying_list, hnnode) {
struct nf_conn *ct;
int res;
pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu);
spin_lock_bh(&pcpu->lock);
list = dying ? &pcpu->dying : &pcpu->unconfirmed;
restart:
hlist_nulls_for_each_entry(h, n, list, hnnode) {
ct = nf_ct_tuplehash_to_ctrack(h);
ct = nf_ct_tuplehash_to_ctrack(h);
if (last && last != ct)
continue;
res = ctnetlink_dump_one_entry(skb, cb, ct, dying);
if (res < 0) {
ctx->cpu = cpu;
spin_unlock_bh(&pcpu->lock);
goto out;
}
}
if (ctx->last) {
ctx->last = NULL;
goto restart;
res = ctnetlink_dump_one_entry(skb, cb, ct, true);
if (res < 0) {
spin_unlock_bh(&ecache_net->dying_lock);
nf_ct_put(last);
return skb->len;
}
spin_unlock_bh(&pcpu->lock);
nf_ct_put(last);
last = NULL;
}
spin_unlock_bh(&ecache_net->dying_lock);
#endif
ctx->done = true;
out:
if (last)
nf_ct_put(last);
nf_ct_put(last);
return skb->len;
}
static int
ctnetlink_dump_dying(struct sk_buff *skb, struct netlink_callback *cb)
{
return ctnetlink_dump_list(skb, cb, true);
}
static int ctnetlink_get_ct_dying(struct sk_buff *skb,
const struct nfnl_info *info,
const struct nlattr * const cda[])
......@@ -1820,12 +1824,6 @@ static int ctnetlink_get_ct_dying(struct sk_buff *skb,
return -EOPNOTSUPP;
}
static int
ctnetlink_dump_unconfirmed(struct sk_buff *skb, struct netlink_callback *cb)
{
return ctnetlink_dump_list(skb, cb, false);
}
static int ctnetlink_get_ct_unconfirmed(struct sk_buff *skb,
const struct nfnl_info *info,
const struct nlattr * const cda[])
......
......@@ -538,9 +538,13 @@ static int nf_ct_netns_do_get(struct net *net, u8 nfproto)
out_unlock:
mutex_unlock(&nf_ct_proto_mutex);
if (fixup_needed)
nf_ct_iterate_cleanup_net(net, nf_ct_tcp_fixup,
(void *)(unsigned long)nfproto, 0, 0);
if (fixup_needed) {
struct nf_ct_iter_data iter_data = {
.net = net,
.data = (void *)(unsigned long)nfproto,
};
nf_ct_iterate_cleanup_net(nf_ct_tcp_fixup, &iter_data);
}
return err;
}
......
......@@ -693,7 +693,7 @@ static struct ctl_table nf_ct_sysctl_table[] = {
.mode = 0644,
.proc_handler = proc_dou8vec_minmax,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
.extra2 = SYSCTL_TWO,
},
#endif
#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
......
......@@ -38,7 +38,12 @@ static int untimeout(struct nf_conn *ct, void *timeout)
void nf_ct_untimeout(struct net *net, struct nf_ct_timeout *timeout)
{
nf_ct_iterate_cleanup_net(net, untimeout, timeout, 0, 0);
struct nf_ct_iter_data iter_data = {
.net = net,
.data = timeout,
};
nf_ct_iterate_cleanup_net(untimeout, &iter_data);
}
EXPORT_SYMBOL_GPL(nf_ct_untimeout);
......
......@@ -77,11 +77,14 @@ EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv4);
static void iterate_cleanup_work(struct work_struct *work)
{
struct nf_ct_iter_data iter_data = {};
struct masq_dev_work *w;
w = container_of(work, struct masq_dev_work, work);
nf_ct_iterate_cleanup_net(w->net, w->iter, (void *)w, 0, 0);
iter_data.net = w->net;
iter_data.data = (void *)w;
nf_ct_iterate_cleanup_net(w->iter, &iter_data);
put_net_track(w->net, &w->ns_tracker);
kfree(w);
......
......@@ -45,6 +45,7 @@ MODULE_DESCRIPTION("Netfilter messages via netlink socket");
static unsigned int nfnetlink_pernet_id __read_mostly;
struct nfnl_net {
unsigned int ctnetlink_listeners;
struct sock *nfnl;
};
......@@ -654,7 +655,6 @@ static void nfnetlink_rcv(struct sk_buff *skb)
netlink_rcv_skb(skb, nfnetlink_rcv_msg);
}
#ifdef CONFIG_MODULES
static int nfnetlink_bind(struct net *net, int group)
{
const struct nfnetlink_subsystem *ss;
......@@ -670,9 +670,44 @@ static int nfnetlink_bind(struct net *net, int group)
rcu_read_unlock();
if (!ss)
request_module_nowait("nfnetlink-subsys-%d", type);
#ifdef CONFIG_NF_CONNTRACK_EVENTS
if (type == NFNL_SUBSYS_CTNETLINK) {
struct nfnl_net *nfnlnet = nfnl_pernet(net);
nfnl_lock(NFNL_SUBSYS_CTNETLINK);
if (WARN_ON_ONCE(nfnlnet->ctnetlink_listeners == UINT_MAX)) {
nfnl_unlock(NFNL_SUBSYS_CTNETLINK);
return -EOVERFLOW;
}
nfnlnet->ctnetlink_listeners++;
if (nfnlnet->ctnetlink_listeners == 1)
WRITE_ONCE(net->ct.ctnetlink_has_listener, true);
nfnl_unlock(NFNL_SUBSYS_CTNETLINK);
}
#endif
return 0;
}
static void nfnetlink_unbind(struct net *net, int group)
{
#ifdef CONFIG_NF_CONNTRACK_EVENTS
int type = nfnl_group2type[group];
if (type == NFNL_SUBSYS_CTNETLINK) {
struct nfnl_net *nfnlnet = nfnl_pernet(net);
nfnl_lock(NFNL_SUBSYS_CTNETLINK);
WARN_ON_ONCE(nfnlnet->ctnetlink_listeners == 0);
nfnlnet->ctnetlink_listeners--;
if (nfnlnet->ctnetlink_listeners == 0)
WRITE_ONCE(net->ct.ctnetlink_has_listener, false);
nfnl_unlock(NFNL_SUBSYS_CTNETLINK);
}
#endif
}
static int __net_init nfnetlink_net_init(struct net *net)
{
......@@ -680,9 +715,8 @@ static int __net_init nfnetlink_net_init(struct net *net)
struct netlink_kernel_cfg cfg = {
.groups = NFNLGRP_MAX,
.input = nfnetlink_rcv,
#ifdef CONFIG_MODULES
.bind = nfnetlink_bind,
#endif
.unbind = nfnetlink_unbind,
};
nfnlnet->nfnl = netlink_kernel_create(net, NETLINK_NETFILTER, &cfg);
......
......@@ -33,8 +33,19 @@
static unsigned int nfct_timeout_id __read_mostly;
struct ctnl_timeout {
struct list_head head;
struct rcu_head rcu_head;
refcount_t refcnt;
char name[CTNL_TIMEOUT_NAME_MAX];
struct nf_ct_timeout timeout;
struct list_head free_head;
};
struct nfct_timeout_pernet {
struct list_head nfct_timeout_list;
struct list_head nfct_timeout_freelist;
};
MODULE_LICENSE("GPL");
......@@ -574,20 +585,36 @@ static int __net_init cttimeout_net_init(struct net *net)
struct nfct_timeout_pernet *pernet = nfct_timeout_pernet(net);
INIT_LIST_HEAD(&pernet->nfct_timeout_list);
INIT_LIST_HEAD(&pernet->nfct_timeout_freelist);
return 0;
}
static void __net_exit cttimeout_net_pre_exit(struct net *net)
{
struct nfct_timeout_pernet *pernet = nfct_timeout_pernet(net);
struct ctnl_timeout *cur, *tmp;
list_for_each_entry_safe(cur, tmp, &pernet->nfct_timeout_list, head) {
list_del_rcu(&cur->head);
list_add(&cur->free_head, &pernet->nfct_timeout_freelist);
}
/* core calls synchronize_rcu() after this */
}
static void __net_exit cttimeout_net_exit(struct net *net)
{
struct nfct_timeout_pernet *pernet = nfct_timeout_pernet(net);
struct ctnl_timeout *cur, *tmp;
nf_ct_unconfirmed_destroy(net);
if (list_empty(&pernet->nfct_timeout_freelist))
return;
nf_ct_untimeout(net, NULL);
list_for_each_entry_safe(cur, tmp, &pernet->nfct_timeout_list, head) {
list_del_rcu(&cur->head);
list_for_each_entry_safe(cur, tmp, &pernet->nfct_timeout_freelist, head) {
list_del(&cur->free_head);
if (refcount_dec_and_test(&cur->refcnt))
kfree_rcu(cur, rcu_head);
......@@ -596,6 +623,7 @@ static void __net_exit cttimeout_net_exit(struct net *net)
static struct pernet_operations cttimeout_ops = {
.init = cttimeout_net_init,
.pre_exit = cttimeout_net_pre_exit,
.exit = cttimeout_net_exit,
.id = &nfct_timeout_id,
.size = sizeof(struct nfct_timeout_pernet),
......@@ -628,13 +656,24 @@ static int __init cttimeout_init(void)
return ret;
}
static int untimeout(struct nf_conn *ct, void *timeout)
{
struct nf_conn_timeout *timeout_ext = nf_ct_timeout_find(ct);
if (timeout_ext)
RCU_INIT_POINTER(timeout_ext->timeout, NULL);
return 0;
}
static void __exit cttimeout_exit(void)
{
nfnetlink_subsys_unregister(&cttimeout_subsys);
unregister_pernet_subsys(&cttimeout_ops);
RCU_INIT_POINTER(nf_ct_timeout_hook, NULL);
synchronize_rcu();
nf_ct_iterate_destroy(untimeout, NULL);
}
module_init(cttimeout_init);
......
......@@ -227,11 +227,19 @@ static int nft_flow_route(const struct nft_pktinfo *pkt,
switch (nft_pf(pkt)) {
case NFPROTO_IPV4:
fl.u.ip4.daddr = ct->tuplehash[dir].tuple.src.u3.ip;
fl.u.ip4.saddr = ct->tuplehash[dir].tuple.dst.u3.ip;
fl.u.ip4.flowi4_oif = nft_in(pkt)->ifindex;
fl.u.ip4.flowi4_iif = this_dst->dev->ifindex;
fl.u.ip4.flowi4_tos = RT_TOS(ip_hdr(pkt->skb)->tos);
fl.u.ip4.flowi4_mark = pkt->skb->mark;
break;
case NFPROTO_IPV6:
fl.u.ip6.daddr = ct->tuplehash[dir].tuple.src.u3.in6;
fl.u.ip6.saddr = ct->tuplehash[dir].tuple.dst.u3.in6;
fl.u.ip6.flowi6_oif = nft_in(pkt)->ifindex;
fl.u.ip6.flowi6_iif = this_dst->dev->ifindex;
fl.u.ip6.flowlabel = ip6_flowinfo(ipv6_hdr(pkt->skb));
fl.u.ip6.flowi6_mark = pkt->skb->mark;
break;
}
......