Commit 2ffae99d authored by Timo Teräs, committed by David S. Miller

ipv4: use next hop exceptions also for input routes

Commit d2d68ba9 (ipv4: Cache input routes in fib_info nexthops)
assumed that "locally destined, and routed packets, never trigger
PMTU events or redirects that will be processed by us".

However, it seems that tunnel devices do trigger PMTU events in certain
cases. At least ip_gre, ip6_gre, sit, and ipip do use the inner flow's
skb_dst(skb)->ops->update_pmtu to propagate mtu information from the
outer flows. These can cause the inner flow mtu to be decreased. If
next hop exceptions are not consulted for pmtu, IP fragmentation will
not be done properly for these routes.

It also seems that we really need to have the PMTU information always
for netfilter TCPMSS clamp-to-pmtu feature to work properly.

So for the time being, cache separate copies of input routes for
each next hop exception.
Signed-off-by: Timo Teräs <timo.teras@iki.fi>
Reviewed-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent b173ee48
...@@ -56,7 +56,8 @@ struct fib_nh_exception { ...@@ -56,7 +56,8 @@ struct fib_nh_exception {
u32 fnhe_pmtu; u32 fnhe_pmtu;
__be32 fnhe_gw; __be32 fnhe_gw;
unsigned long fnhe_expires; unsigned long fnhe_expires;
struct rtable __rcu *fnhe_rth; struct rtable __rcu *fnhe_rth_input;
struct rtable __rcu *fnhe_rth_output;
unsigned long fnhe_stamp; unsigned long fnhe_stamp;
}; };
......
...@@ -169,7 +169,8 @@ static void free_nh_exceptions(struct fib_nh *nh) ...@@ -169,7 +169,8 @@ static void free_nh_exceptions(struct fib_nh *nh)
next = rcu_dereference_protected(fnhe->fnhe_next, 1); next = rcu_dereference_protected(fnhe->fnhe_next, 1);
rt_fibinfo_free(&fnhe->fnhe_rth); rt_fibinfo_free(&fnhe->fnhe_rth_input);
rt_fibinfo_free(&fnhe->fnhe_rth_output);
kfree(fnhe); kfree(fnhe);
......
...@@ -565,10 +565,25 @@ static inline void rt_free(struct rtable *rt) ...@@ -565,10 +565,25 @@ static inline void rt_free(struct rtable *rt)
static DEFINE_SPINLOCK(fnhe_lock); static DEFINE_SPINLOCK(fnhe_lock);
/*
 * Drop both routes cached on a next hop exception.
 *
 * For each of fnhe_rth_input and fnhe_rth_output: if a route is
 * currently cached there, the slot is reset to NULL and the route is
 * handed to rt_free().
 *
 * NOTE(review): the call site in rt_bind_exception() invokes this with
 * fnhe_lock held; presumably every caller is expected to hold that
 * lock -- confirm against the other call sites.
 */
static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
{
struct rtable *rt;
/* Cached input route, if any. */
rt = rcu_dereference(fnhe->fnhe_rth_input);
if (rt) {
/* Clear the slot before releasing the route it pointed to. */
RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
rt_free(rt);
}
/* Cached output route, handled the same way. */
rt = rcu_dereference(fnhe->fnhe_rth_output);
if (rt) {
RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
rt_free(rt);
}
}
static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash) static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
{ {
struct fib_nh_exception *fnhe, *oldest; struct fib_nh_exception *fnhe, *oldest;
struct rtable *orig;
oldest = rcu_dereference(hash->chain); oldest = rcu_dereference(hash->chain);
for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe; for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
...@@ -576,11 +591,7 @@ static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash) ...@@ -576,11 +591,7 @@ static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp)) if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
oldest = fnhe; oldest = fnhe;
} }
orig = rcu_dereference(oldest->fnhe_rth); fnhe_flush_routes(oldest);
if (orig) {
RCU_INIT_POINTER(oldest->fnhe_rth, NULL);
rt_free(orig);
}
return oldest; return oldest;
} }
...@@ -644,7 +655,10 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw, ...@@ -644,7 +655,10 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
fnhe->fnhe_expires = max(1UL, expires); fnhe->fnhe_expires = max(1UL, expires);
} }
/* Update all cached dsts too */ /* Update all cached dsts too */
rt = rcu_dereference(fnhe->fnhe_rth); rt = rcu_dereference(fnhe->fnhe_rth_input);
if (rt)
fill_route_from_fnhe(rt, fnhe);
rt = rcu_dereference(fnhe->fnhe_rth_output);
if (rt) if (rt)
fill_route_from_fnhe(rt, fnhe); fill_route_from_fnhe(rt, fnhe);
} else { } else {
...@@ -668,6 +682,10 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw, ...@@ -668,6 +682,10 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
* stale, so anyone caching it rechecks if this exception * stale, so anyone caching it rechecks if this exception
* applies to them. * applies to them.
*/ */
rt = rcu_dereference(nh->nh_rth_input);
if (rt)
rt->dst.obsolete = DST_OBSOLETE_KILL;
for_each_possible_cpu(i) { for_each_possible_cpu(i) {
struct rtable __rcu **prt; struct rtable __rcu **prt;
prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i); prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
...@@ -1242,25 +1260,36 @@ static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe, ...@@ -1242,25 +1260,36 @@ static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
spin_lock_bh(&fnhe_lock); spin_lock_bh(&fnhe_lock);
if (daddr == fnhe->fnhe_daddr) { if (daddr == fnhe->fnhe_daddr) {
struct rtable __rcu **porig;
struct rtable *orig;
int genid = fnhe_genid(dev_net(rt->dst.dev)); int genid = fnhe_genid(dev_net(rt->dst.dev));
struct rtable *orig = rcu_dereference(fnhe->fnhe_rth);
if (rt_is_input_route(rt))
porig = &fnhe->fnhe_rth_input;
else
porig = &fnhe->fnhe_rth_output;
orig = rcu_dereference(*porig);
if (fnhe->fnhe_genid != genid) { if (fnhe->fnhe_genid != genid) {
fnhe->fnhe_genid = genid; fnhe->fnhe_genid = genid;
fnhe->fnhe_gw = 0; fnhe->fnhe_gw = 0;
fnhe->fnhe_pmtu = 0; fnhe->fnhe_pmtu = 0;
fnhe->fnhe_expires = 0; fnhe->fnhe_expires = 0;
fnhe_flush_routes(fnhe);
orig = NULL;
} }
fill_route_from_fnhe(rt, fnhe); fill_route_from_fnhe(rt, fnhe);
if (!rt->rt_gateway) if (!rt->rt_gateway)
rt->rt_gateway = daddr; rt->rt_gateway = daddr;
rcu_assign_pointer(fnhe->fnhe_rth, rt); if (!(rt->dst.flags & DST_NOCACHE)) {
if (orig) rcu_assign_pointer(*porig, rt);
rt_free(orig); if (orig)
rt_free(orig);
ret = true;
}
fnhe->fnhe_stamp = jiffies; fnhe->fnhe_stamp = jiffies;
ret = true;
} }
spin_unlock_bh(&fnhe_lock); spin_unlock_bh(&fnhe_lock);
...@@ -1492,6 +1521,7 @@ static int __mkroute_input(struct sk_buff *skb, ...@@ -1492,6 +1521,7 @@ static int __mkroute_input(struct sk_buff *skb,
struct in_device *in_dev, struct in_device *in_dev,
__be32 daddr, __be32 saddr, u32 tos) __be32 daddr, __be32 saddr, u32 tos)
{ {
struct fib_nh_exception *fnhe;
struct rtable *rth; struct rtable *rth;
int err; int err;
struct in_device *out_dev; struct in_device *out_dev;
...@@ -1538,8 +1568,13 @@ static int __mkroute_input(struct sk_buff *skb, ...@@ -1538,8 +1568,13 @@ static int __mkroute_input(struct sk_buff *skb,
} }
} }
fnhe = find_exception(&FIB_RES_NH(*res), daddr);
if (do_cache) { if (do_cache) {
rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input); if (fnhe != NULL)
rth = rcu_dereference(fnhe->fnhe_rth_input);
else
rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
if (rt_cache_valid(rth)) { if (rt_cache_valid(rth)) {
skb_dst_set_noref(skb, &rth->dst); skb_dst_set_noref(skb, &rth->dst);
goto out; goto out;
...@@ -1567,7 +1602,7 @@ static int __mkroute_input(struct sk_buff *skb, ...@@ -1567,7 +1602,7 @@ static int __mkroute_input(struct sk_buff *skb,
rth->dst.input = ip_forward; rth->dst.input = ip_forward;
rth->dst.output = ip_output; rth->dst.output = ip_output;
rt_set_nexthop(rth, daddr, res, NULL, res->fi, res->type, itag); rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag);
skb_dst_set(skb, &rth->dst); skb_dst_set(skb, &rth->dst);
out: out:
err = 0; err = 0;
...@@ -1882,7 +1917,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res, ...@@ -1882,7 +1917,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
fnhe = find_exception(nh, fl4->daddr); fnhe = find_exception(nh, fl4->daddr);
if (fnhe) if (fnhe)
prth = &fnhe->fnhe_rth; prth = &fnhe->fnhe_rth_output;
else { else {
if (unlikely(fl4->flowi4_flags & if (unlikely(fl4->flowi4_flags &
FLOWI_FLAG_KNOWN_NH && FLOWI_FLAG_KNOWN_NH &&
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment