From 1758fd4688eb92c796e75bdb1d256dc558ef9581 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Sat, 17 Jun 2017 10:42:24 -0700 Subject: [PATCH 01/21] ipv6: remove unnecessary dst_hold() in ip6_fragment() In ipv6 tx path, rcu_read_lock() is taken so that dst won't get freed during the execution of ip6_fragment(). Hence, no need to hold dst in it. Signed-off-by: Wei Wang Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- net/ipv6/ip6_output.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 8b8efb0e55bf..5baa6fab4b97 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -698,8 +698,6 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr)); - dst_hold(&rt->dst); - for (;;) { /* Prepare header of the next frame, * before previous one went down. */ @@ -742,7 +740,6 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, if (err == 0) { IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), IPSTATS_MIB_FRAGOKS); - ip6_rt_put(rt); return 0; } @@ -750,7 +747,6 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), IPSTATS_MIB_FRAGFAILS); - ip6_rt_put(rt); return err; slow_path_clean: From d24406c85d123df773bc4df88ad5da2233896919 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Sat, 17 Jun 2017 10:42:25 -0700 Subject: [PATCH 02/21] udp: call dst_hold_safe() in udp_sk_rx_set_dst() In udp_v4/6_early_demux() code, we try to hold dst->__refcnt for dst with DST_NOCACHE flag. This is because later in udp_sk_rx_dst_set() function, we will try to cache this dst in sk for connected case. However, a better way to achieve this is to not try to hold dst in early_demux(), but in udp_sk_rx_dst_set(), call dst_hold_safe(). This approach is also more consistant with how tcp is handling it. And it will make later changes simpler. Signed-off-by: Wei Wang Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- net/ipv4/udp.c | 19 +++++++++---------- net/ipv6/udp.c | 11 +++++------ 2 files changed, 14 insertions(+), 16 deletions(-) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 2bc638c48b86..f3450f092d71 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1977,9 +1977,10 @@ static void udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst) { struct dst_entry *old; - dst_hold(dst); - old = xchg(&sk->sk_rx_dst, dst); - dst_release(old); + if (dst_hold_safe(dst)) { + old = xchg(&sk->sk_rx_dst, dst); + dst_release(old); + } } /* @@ -2303,13 +2304,11 @@ void udp_v4_early_demux(struct sk_buff *skb) if (dst) dst = dst_check(dst, 0); if (dst) { - /* DST_NOCACHE can not be used without taking a reference */ - if (dst->flags & DST_NOCACHE) { - if (likely(atomic_inc_not_zero(&dst->__refcnt))) - skb_dst_set(skb, dst); - } else { - skb_dst_set_noref(skb, dst); - } + /* set noref for now. + * any place which wants to hold dst has to call + * dst_hold_safe() + */ + skb_dst_set_noref(skb, dst); } } diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 2e9b52bded2d..2b33847bf931 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -920,12 +920,11 @@ static void udp_v6_early_demux(struct sk_buff *skb) if (dst) dst = dst_check(dst, inet6_sk(sk)->rx_dst_cookie); if (dst) { - if (dst->flags & DST_NOCACHE) { - if (likely(atomic_inc_not_zero(&dst->__refcnt))) - skb_dst_set(skb, dst); - } else { - skb_dst_set_noref(skb, dst); - } + /* set noref for now. + * any place which wants to hold dst has to call + * dst_hold_safe() + */ + skb_dst_set_noref(skb, dst); } } From 1dbe32525e26ec28d2cc17f65a90fc7b53f1f8d0 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Sat, 17 Jun 2017 10:42:26 -0700 Subject: [PATCH 03/21] net: use loopback dev when generating blackhole route Existing ipv4/6_blackhole_route() code generates a blackhole route with dst->dev pointing to the passed in dst->dev. It is not necessary to hold reference to the passed in dst->dev because the packets going through this route are dropped anyway. A loopback interface is good enough so that we don't need to worry about releasing this dst->dev when this dev is going down. Signed-off-by: Wei Wang Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- net/ipv4/route.c | 2 +- net/ipv6/route.c | 9 +++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 9b38cf18144e..0a843ef2b709 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2504,7 +2504,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or new->input = dst_discard; new->output = dst_discard_out; - new->dev = ort->dst.dev; + new->dev = net->loopback_dev; if (new->dev) dev_hold(new->dev); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 18fe6e2b88d5..bc1bc91bb969 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1245,9 +1245,12 @@ EXPORT_SYMBOL_GPL(ip6_route_output_flags); struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig) { struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig; + struct net_device *loopback_dev = net->loopback_dev; struct dst_entry *new = NULL; - rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, DST_OBSOLETE_NONE, 0); + + rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1, + DST_OBSOLETE_NONE, 0); if (rt) { rt6_info_init(rt); @@ -1257,10 +1260,8 @@ struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_ori new->output = dst_discard_out; dst_copy_metrics(new, &ort->dst); - rt->rt6i_idev = ort->rt6i_idev; - if (rt->rt6i_idev) - in6_dev_hold(rt->rt6i_idev); + rt->rt6i_idev = in6_dev_get(loopback_dev); rt->rt6i_gateway = ort->rt6i_gateway; rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU; rt->rt6i_metric = 0; From 5f56f409b5142bb4e88e1f64dedeb4a76c678177 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Sat, 17 Jun 2017 10:42:27 -0700 Subject: [PATCH 04/21] net: introduce DST_NOGC in dst_release() to destroy dst based on refcnt The current mechanism of freeing dst is a bit complicated. dst has its ref count and when user grabs the reference to the dst, the ref count is properly taken in most cases except in IPv4/IPv6/decnet/xfrm routing code due to some historic reasons. If the reference to dst is always taken properly, we should be able to simplify the logic in dst_release() to destroy dst when dst->__refcnt drops from 1 to 0. And this should be the only condition to determine if we can call dst_destroy(). And as dst is always ref counted, there is no need for a dst garbage list to hold the dst entries that already get removed by the routing code but are still held by other users. And the task to periodically check the list to free dst if ref count become 0 is also not needed anymore. This patch introduces a temporary flag DST_NOGC(no garbage collector). If it is set in the dst, dst_release() will call dst_destroy() when dst->__refcnt drops to 0. dst_hold_safe() will also check for this flag and do atomic_inc_not_zero() similar as DST_NOCACHE to avoid double free issue. This temporary flag is mainly used so that we can make the transition component by component without breaking other parts. This flag will be removed after all components are properly transitioned. This patch also introduces a new function dst_release_immediate() which destroys dst without waiting on the rcu when refcnt drops to 0. It will be used in later patches. Follow-up patches will correct all the places to properly take ref count on dst and mark DST_NOGC. dst_release() or dst_release_immediate() will be used to release the dst instead of dst_free() and its related functions. And final clean-up patch will remove the DST_NOGC flag. Signed-off-by: Wei Wang Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- include/net/dst.h | 5 ++++- net/core/dst.c | 20 ++++++++++++++++++-- 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/include/net/dst.h b/include/net/dst.h index 1969008783d8..2735d5a1e774 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -58,6 +58,7 @@ struct dst_entry { #define DST_XFRM_TUNNEL 0x0080 #define DST_XFRM_QUEUE 0x0100 #define DST_METADATA 0x0200 +#define DST_NOGC 0x0400 short error; @@ -278,6 +279,8 @@ static inline struct dst_entry *dst_clone(struct dst_entry *dst) void dst_release(struct dst_entry *dst); +void dst_release_immediate(struct dst_entry *dst); + static inline void refdst_drop(unsigned long refdst) { if (!(refdst & SKB_DST_NOREF)) @@ -334,7 +337,7 @@ static inline void skb_dst_force(struct sk_buff *skb) */ static inline bool dst_hold_safe(struct dst_entry *dst) { - if (dst->flags & DST_NOCACHE) + if (dst->flags & (DST_NOCACHE | DST_NOGC)) return atomic_inc_not_zero(&dst->__refcnt); dst_hold(dst); return true; diff --git a/net/core/dst.c b/net/core/dst.c index 13ba4a090c41..551834c3363f 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -300,18 +300,34 @@ void dst_release(struct dst_entry *dst) { if (dst) { int newrefcnt; - unsigned short nocache = dst->flags & DST_NOCACHE; + unsigned short destroy_after_rcu = dst->flags & + (DST_NOCACHE | DST_NOGC); newrefcnt = atomic_dec_return(&dst->__refcnt); if (unlikely(newrefcnt < 0)) net_warn_ratelimited("%s: dst:%p refcnt:%d\n", __func__, dst, newrefcnt); - if (!newrefcnt && unlikely(nocache)) + if (!newrefcnt && unlikely(destroy_after_rcu)) call_rcu(&dst->rcu_head, dst_destroy_rcu); } } EXPORT_SYMBOL(dst_release); +void dst_release_immediate(struct dst_entry *dst) +{ + if (dst) { + int newrefcnt; + + newrefcnt = atomic_dec_return(&dst->__refcnt); + if (unlikely(newrefcnt < 0)) + net_warn_ratelimited("%s: dst:%p refcnt:%d\n", + __func__, dst, newrefcnt); + if (!newrefcnt) + dst_destroy(dst); + } +} +EXPORT_SYMBOL(dst_release_immediate); + u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old) { struct dst_metrics *p = kmalloc(sizeof(*p), GFP_ATOMIC); From 4a6ce2b6f2ecabbddcfe47e7cf61dd0f00b10e36 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Sat, 17 Jun 2017 10:42:28 -0700 Subject: [PATCH 05/21] net: introduce a new function dst_dev_put() This function should be called when removing routes from fib tree after the dst gc is no longer in use. We first mark DST_OBSOLETE_DEAD on this dst to make sure next dst_ops->check() fails and returns NULL. Secondly, as we no longer keep the gc_list, we need to properly release dst->dev right at the moment when the dst is removed from the fib/fib6 tree. It does the following: 1. change dst->input and output pointers to dst_discard/dst_dscard_out to discard all packets 2. replace dst->dev with loopback interface Signed-off-by: Wei Wang Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- include/net/dst.h | 1 + net/core/dst.c | 24 ++++++++++++++++++++++++ 2 files changed, 25 insertions(+) diff --git a/include/net/dst.h b/include/net/dst.h index 2735d5a1e774..11d779803c0d 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -428,6 +428,7 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops, unsigned short flags); void __dst_free(struct dst_entry *dst); struct dst_entry *dst_destroy(struct dst_entry *dst); +void dst_dev_put(struct dst_entry *dst); static inline void dst_free(struct dst_entry *dst) { diff --git a/net/core/dst.c b/net/core/dst.c index 551834c3363f..56998f69b84e 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -296,6 +296,30 @@ static void dst_destroy_rcu(struct rcu_head *head) __dst_free(dst); } +/* Operations to mark dst as DEAD and clean up the net device referenced + * by dst: + * 1. put the dst under loopback interface and discard all tx/rx packets + * on this route. + * 2. release the net_device + * This function should be called when removing routes from the fib tree + * in preparation for a NETDEV_DOWN/NETDEV_UNREGISTER event and also to + * make the next dst_ops->check() fail. + */ +void dst_dev_put(struct dst_entry *dst) +{ + struct net_device *dev = dst->dev; + + dst->obsolete = DST_OBSOLETE_DEAD; + if (dst->ops->ifdown) + dst->ops->ifdown(dst, dev, true); + dst->input = dst_discard; + dst->output = dst_discard_out; + dst->dev = dev_net(dst->dev)->loopback_dev; + dev_hold(dst->dev); + dev_put(dev); +} +EXPORT_SYMBOL(dst_dev_put); + void dst_release(struct dst_entry *dst) { if (dst) { From 0830106c53900181d336350581119af09e123bf3 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Sat, 17 Jun 2017 10:42:29 -0700 Subject: [PATCH 06/21] ipv4: take dst->__refcnt when caching dst in fib In IPv4 routing code, fib_nh and fib_nh_exception can hold pointers to struct rtable but they never increment dst->__refcnt. This leads to the need of the dst garbage collector because when user is done with this dst and calls dst_release(), it can only decrement dst->__refcnt and can not free the dst even it sees dst->__refcnt drops from 1 to 0 (unless DST_NOCACHE flag is set) because the routing code might still hold reference to it. And when the routing code tries to delete a route, it has to put the dst to the gc_list if dst->__refcnt is not yet 0 and have a gc thread running periodically to check on dst->__refcnt and finally to free dst when refcnt becomes 0. This patch increments dst->__refcnt when fib_nh/fib_nh_exception holds reference to this dst and properly release the dst when fib_nh/fib_nh_exception has been updated with a new dst. This patch is a preparation in order to fully get rid of dst gc later. Signed-off-by: Wei Wang Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- net/ipv4/fib_semantics.c | 5 ++++- net/ipv4/route.c | 19 ++++++++++++++++--- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 2157dc08c407..53b3e9c2da4c 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -152,6 +152,7 @@ static void rt_fibinfo_free(struct rtable __rcu **rtp) * free_fib_info_rcu() */ + dst_release(&rt->dst); dst_free(&rt->dst); } @@ -194,8 +195,10 @@ static void rt_fibinfo_free_cpus(struct rtable __rcu * __percpu *rtp) struct rtable *rt; rt = rcu_dereference_protected(*per_cpu_ptr(rtp, cpu), 1); - if (rt) + if (rt) { + dst_release(&rt->dst); dst_free(&rt->dst); + } } free_percpu(rtp); } diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 0a843ef2b709..3dee0043117e 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -603,11 +603,13 @@ static void fnhe_flush_routes(struct fib_nh_exception *fnhe) rt = rcu_dereference(fnhe->fnhe_rth_input); if (rt) { RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL); + dst_release(&rt->dst); rt_free(rt); } rt = rcu_dereference(fnhe->fnhe_rth_output); if (rt) { RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL); + dst_release(&rt->dst); rt_free(rt); } } @@ -1332,9 +1334,12 @@ static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe, rt->rt_gateway = daddr; if (!(rt->dst.flags & DST_NOCACHE)) { + dst_hold(&rt->dst); rcu_assign_pointer(*porig, rt); - if (orig) + if (orig) { + dst_release(&orig->dst); rt_free(orig); + } ret = true; } @@ -1357,12 +1362,20 @@ static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt) } orig = *p; + /* hold dst before doing cmpxchg() to avoid race condition + * on this dst + */ + dst_hold(&rt->dst); prev = cmpxchg(p, orig, rt); if (prev == orig) { - if (orig) + if (orig) { + dst_release(&orig->dst); rt_free(orig); - } else + } + } else { + dst_release(&rt->dst); ret = false; + } return ret; } From 95c47f9cf5e028d1ae77dc6c767c1edc8a18025b Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Sat, 17 Jun 2017 10:42:30 -0700 Subject: [PATCH 07/21] ipv4: call dst_dev_put() properly As the intend of this patch series is to completely remove dst gc, we need to call dst_dev_put() to release the reference to dst->dev when removing routes from fib because we won't keep the gc list anymore and will lose the dst pointer right after removing the routes. Without the gc list, there is no way to find all the dst's that have dst->dev pointing to the going-down dev. Hence, we are doing dst_dev_put() immediately before we lose the last reference of the dst from the routing code. The next dst_check() will trigger a route re-lookup to find another route (if there is any). Signed-off-by: Wei Wang Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- net/ipv4/fib_semantics.c | 2 ++ net/ipv4/route.c | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 53b3e9c2da4c..f163fa0a1164 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -152,6 +152,7 @@ static void rt_fibinfo_free(struct rtable __rcu **rtp) * free_fib_info_rcu() */ + dst_dev_put(&rt->dst); dst_release(&rt->dst); dst_free(&rt->dst); } @@ -196,6 +197,7 @@ static void rt_fibinfo_free_cpus(struct rtable __rcu * __percpu *rtp) rt = rcu_dereference_protected(*per_cpu_ptr(rtp, cpu), 1); if (rt) { + dst_dev_put(&rt->dst); dst_release(&rt->dst); dst_free(&rt->dst); } diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 3dee0043117e..d986d80258d2 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -603,12 +603,14 @@ static void fnhe_flush_routes(struct fib_nh_exception *fnhe) rt = rcu_dereference(fnhe->fnhe_rth_input); if (rt) { RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL); + dst_dev_put(&rt->dst); dst_release(&rt->dst); rt_free(rt); } rt = rcu_dereference(fnhe->fnhe_rth_output); if (rt) { RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL); + dst_dev_put(&rt->dst); dst_release(&rt->dst); rt_free(rt); } @@ -1337,6 +1339,7 @@ static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe, dst_hold(&rt->dst); rcu_assign_pointer(*porig, rt); if (orig) { + dst_dev_put(&orig->dst); dst_release(&orig->dst); rt_free(orig); } @@ -1369,6 +1372,7 @@ static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt) prev = cmpxchg(p, orig, rt); if (prev == orig) { if (orig) { + dst_dev_put(&orig->dst); dst_release(&orig->dst); rt_free(orig); } From 9df16efadd2a8a82731dc76ff656c771e261827f Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Sat, 17 Jun 2017 10:42:31 -0700 Subject: [PATCH 08/21] ipv4: call dst_hold_safe() properly This patch checks all the calls to dst_hold()/skb_dst_force()/dst_clone()/dst_use() to see if dst_hold_safe() is needed to avoid double free issue if dst gc is removed and dst_release() directly destroys dst when dst->__refcnt drops to 0. In tx path, TCP hold sk->sk_rx_dst ref count and also hold sock_lock(). UDP and other similar protocols always hold refcount for skb->_skb_refdst. So both paths seem to be safe. In rx path, as it is lockless and skb_dst_set_noref() is likely to be used, dst_hold_safe() should always be used when trying to hold dst. In the routing code, if dst is held during an rcu protected session, it is necessary to call dst_hold_safe() as the current dst might be in its rcu grace period. Signed-off-by: Wei Wang Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- include/net/route.h | 4 +++- net/ipv4/route.c | 4 +--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/net/route.h b/include/net/route.h index 08e689f23365..cb0a76d9dde1 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -190,7 +190,9 @@ static inline int ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src, rcu_read_lock(); err = ip_route_input_noref(skb, dst, src, tos, devin); if (!err) - skb_dst_force(skb); + skb_dst_force_safe(skb); + if (!skb_dst(skb)) + err = -EINVAL; rcu_read_unlock(); return err; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index d986d80258d2..903a12c601ac 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2234,10 +2234,8 @@ static struct rtable *__mkroute_output(const struct fib_result *res, rth = rcu_dereference(*prth); rt_cache: - if (rt_cache_valid(rth)) { - dst_hold(&rth->dst); + if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst)) return rth; - } } add: From b838d5e1c5b6e57b10ec8af2268824041e3ea911 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Sat, 17 Jun 2017 10:42:32 -0700 Subject: [PATCH 09/21] ipv4: mark DST_NOGC and remove the operation of dst_free() With the previous preparation patches, we are ready to get rid of the dst gc operation in ipv4 code and release dst based on refcnt only. So this patch adds DST_NOGC flag for all IPv4 dst and remove the calls to dst_free(). At this point, all dst created in ipv4 code do not use the dst gc anymore and will be destroyed at the point when refcnt drops to 0. Signed-off-by: Wei Wang Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- net/ipv4/fib_semantics.c | 6 ++---- net/ipv4/route.c | 15 +++------------ 2 files changed, 5 insertions(+), 16 deletions(-) diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index f163fa0a1164..ff47ea1408fe 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -153,8 +153,7 @@ static void rt_fibinfo_free(struct rtable __rcu **rtp) */ dst_dev_put(&rt->dst); - dst_release(&rt->dst); - dst_free(&rt->dst); + dst_release_immediate(&rt->dst); } static void free_nh_exceptions(struct fib_nh *nh) @@ -198,8 +197,7 @@ static void rt_fibinfo_free_cpus(struct rtable __rcu * __percpu *rtp) rt = rcu_dereference_protected(*per_cpu_ptr(rtp, cpu), 1); if (rt) { dst_dev_put(&rt->dst); - dst_release(&rt->dst); - dst_free(&rt->dst); + dst_release_immediate(&rt->dst); } } free_percpu(rtp); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 903a12c601ac..80b30c2bf47d 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -589,11 +589,6 @@ static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk, build_sk_flow_key(fl4, sk); } -static inline void rt_free(struct rtable *rt) -{ - call_rcu(&rt->dst.rcu_head, dst_rcu_free); -} - static DEFINE_SPINLOCK(fnhe_lock); static void fnhe_flush_routes(struct fib_nh_exception *fnhe) @@ -605,14 +600,12 @@ static void fnhe_flush_routes(struct fib_nh_exception *fnhe) RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL); dst_dev_put(&rt->dst); dst_release(&rt->dst); - rt_free(rt); } rt = rcu_dereference(fnhe->fnhe_rth_output); if (rt) { RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL); dst_dev_put(&rt->dst); dst_release(&rt->dst); - rt_free(rt); } } @@ -1341,7 +1334,6 @@ static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe, if (orig) { dst_dev_put(&orig->dst); dst_release(&orig->dst); - rt_free(orig); } ret = true; } @@ -1374,7 +1366,6 @@ static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt) if (orig) { dst_dev_put(&orig->dst); dst_release(&orig->dst); - rt_free(orig); } } else { dst_release(&rt->dst); @@ -1505,7 +1496,8 @@ struct rtable *rt_dst_alloc(struct net_device *dev, rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK, (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) | (nopolicy ? DST_NOPOLICY : 0) | - (noxfrm ? DST_NOXFRM : 0)); + (noxfrm ? DST_NOXFRM : 0) | + DST_NOGC); if (rt) { rt->rt_genid = rt_genid_ipv4(dev_net(dev)); @@ -2511,7 +2503,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or struct rtable *ort = (struct rtable *) dst_orig; struct rtable *rt; - rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0); + rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, DST_NOGC); if (rt) { struct dst_entry *new = &rt->dst; @@ -2534,7 +2526,6 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or rt->rt_uses_gateway = ort->rt_uses_gateway; INIT_LIST_HEAD(&rt->rt_uncached); - dst_free(new); } dst_release(dst_orig); From 1cfb71eeb12047bcdbd3e6730ffed66e810a0855 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Sat, 17 Jun 2017 10:42:33 -0700 Subject: [PATCH 10/21] ipv6: take dst->__refcnt for insertion into fib6 tree In IPv6 routing code, struct rt6_info is created for each static route and RTF_CACHE route and inserted into fib6 tree. In both cases, dst ref count is not taken. As explained in the previous patch, this leads to the need of the dst garbage collector. This patch holds ref count of dst before inserting the route into fib6 tree and properly releases the dst when deleting it from the fib6 tree as a preparation in order to fully get rid of dst gc later. Also, correct fib6_age() logic to check dst->__refcnt to be 1 to indicate no user is referencing the dst. And remove dst_hold() in vrf_rt6_create() as ip6_dst_alloc() already puts dst->__refcnt to 1. Signed-off-by: Wei Wang Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- drivers/net/vrf.c | 4 ---- net/ipv6/ip6_fib.c | 12 +++++++++- net/ipv6/route.c | 55 ++++++++++++++++++++++++++++++++-------------- 3 files changed, 50 insertions(+), 21 deletions(-) diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index c6c0595d267b..d038927acfca 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -583,8 +583,6 @@ static int vrf_rt6_create(struct net_device *dev) if (!rt6) goto out; - dst_hold(&rt6->dst); - rt6->rt6i_table = rt6i_table; rt6->dst.output = vrf_output6; @@ -597,8 +595,6 @@ static int vrf_rt6_create(struct net_device *dev) goto out; } - dst_hold(&rt6_local->dst); - rt6_local->rt6i_idev = in6_dev_get(dev); rt6_local->rt6i_flags = RTF_UP | RTF_NONEXTHOP | RTF_LOCAL; rt6_local->rt6i_table = rt6i_table; diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index deea901746c8..3b728bcb1301 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -172,6 +172,7 @@ static void rt6_free_pcpu(struct rt6_info *non_pcpu_rt) ppcpu_rt = per_cpu_ptr(non_pcpu_rt->rt6i_pcpu, cpu); pcpu_rt = *ppcpu_rt; if (pcpu_rt) { + dst_release(&pcpu_rt->dst); rt6_rcu_free(pcpu_rt); *ppcpu_rt = NULL; } @@ -185,6 +186,7 @@ static void rt6_release(struct rt6_info *rt) { if (atomic_dec_and_test(&rt->rt6i_ref)) { rt6_free_pcpu(rt); + dst_release(&rt->dst); rt6_rcu_free(rt); } } @@ -1101,6 +1103,10 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, atomic_inc(&pn->leaf->rt6i_ref); } #endif + /* Always release dst as dst->__refcnt is guaranteed + * to be taken before entering this function + */ + dst_release(&rt->dst); if (!(rt->dst.flags & DST_NOCACHE)) dst_free(&rt->dst); } @@ -1113,6 +1119,10 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, st_failure: if (fn && !(fn->fn_flags & (RTN_RTINFO|RTN_ROOT))) fib6_repair_tree(info->nl_net, fn); + /* Always release dst as dst->__refcnt is guaranteed + * to be taken before entering this function + */ + dst_release(&rt->dst); if (!(rt->dst.flags & DST_NOCACHE)) dst_free(&rt->dst); return err; @@ -1783,7 +1793,7 @@ static int fib6_age(struct rt6_info *rt, void *arg) } gc_args->more++; } else if (rt->rt6i_flags & RTF_CACHE) { - if (atomic_read(&rt->dst.__refcnt) == 0 && + if (atomic_read(&rt->dst.__refcnt) == 1 && time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) { RT6_TRACE("aging clone %p\n", rt); return -1; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index bc1bc91bb969..908b71188c57 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -354,7 +354,7 @@ static struct rt6_info *__ip6_dst_alloc(struct net *net, int flags) { struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev, - 0, DST_OBSOLETE_FORCE_CHK, flags); + 1, DST_OBSOLETE_FORCE_CHK, flags); if (rt) rt6_info_init(rt); @@ -381,7 +381,9 @@ struct rt6_info *ip6_dst_alloc(struct net *net, *p = NULL; } } else { - dst_destroy((struct dst_entry *)rt); + dst_release(&rt->dst); + if (!(flags & DST_NOCACHE)) + dst_destroy((struct dst_entry *)rt); return NULL; } } @@ -932,9 +934,9 @@ struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr, EXPORT_SYMBOL(rt6_lookup); /* ip6_ins_rt is called with FREE table->tb6_lock. - It takes new route entry, the addition fails by any reason the - route is freed. In any case, if caller does not hold it, it may - be destroyed. + * It takes new route entry, the addition fails by any reason the + * route is released. + * Caller must hold dst before calling it. */ static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info, @@ -957,6 +959,8 @@ int ip6_ins_rt(struct rt6_info *rt) struct nl_info info = { .nl_net = dev_net(rt->dst.dev), }; struct mx6_config mxc = { .mx = NULL, }; + /* Hold dst to account for the reference from the fib6 tree */ + dst_hold(&rt->dst); return __ip6_ins_rt(rt, &info, &mxc, NULL); } @@ -1049,6 +1053,7 @@ static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt) prev = cmpxchg(p, NULL, pcpu_rt); if (prev) { /* If someone did it before us, return prev instead */ + dst_release(&pcpu_rt->dst); dst_destroy(&pcpu_rt->dst); pcpu_rt = prev; } @@ -1059,6 +1064,7 @@ static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt) * since rt is going away anyway. The next * dst_check() will trigger a re-lookup. */ + dst_release(&pcpu_rt->dst); dst_destroy(&pcpu_rt->dst); pcpu_rt = rt; } @@ -1129,12 +1135,15 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL); dst_release(&rt->dst); - if (uncached_rt) + if (uncached_rt) { + /* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc() + * No need for another dst_hold() + */ rt6_uncached_list_add(uncached_rt); - else + } else { uncached_rt = net->ipv6.ip6_null_entry; - - dst_hold(&uncached_rt->dst); + dst_hold(&uncached_rt->dst); + } trace_fib6_table_lookup(net, uncached_rt, table->tb6_id, fl6); return uncached_rt; @@ -1422,6 +1431,10 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, * invalidate the sk->sk_dst_cache. */ ip6_ins_rt(nrt6); + /* Release the reference taken in + * ip6_rt_cache_alloc() + */ + dst_release(&nrt6->dst); } } } @@ -1673,7 +1686,6 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev, rt->dst.flags |= DST_HOST; rt->dst.output = ip6_output; - atomic_set(&rt->dst.__refcnt, 1); rt->rt6i_gateway = fl6->daddr; rt->rt6i_dst.addr = fl6->daddr; rt->rt6i_dst.plen = 128; @@ -2130,8 +2142,10 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, dev_put(dev); if (idev) in6_dev_put(idev); - if (rt) + if (rt) { + dst_release(&rt->dst); dst_free(&rt->dst); + } return ERR_PTR(err); } @@ -2160,8 +2174,10 @@ int ip6_route_add(struct fib6_config *cfg, return err; out: - if (rt) + if (rt) { + dst_release(&rt->dst); dst_free(&rt->dst); + } return err; } @@ -2398,7 +2414,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key; if (ip6_ins_rt(nrt)) - goto out; + goto out_release; netevent.old = &rt->dst; netevent.new = &nrt->dst; @@ -2411,6 +2427,12 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu ip6_del_rt(rt); } +out_release: + /* Release the reference taken in + * ip6_rt_cache_alloc() + */ + dst_release(&nrt->dst); + out: neigh_release(neigh); } @@ -2760,8 +2782,6 @@ struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev, rt->rt6i_table = fib6_get_table(net, tb_id); rt->dst.flags |= DST_NOCACHE; - atomic_set(&rt->dst.__refcnt, 1); - return rt; } @@ -3186,6 +3206,7 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg); if (err) { + dst_release(&rt->dst); dst_free(&rt->dst); goto cleanup; } @@ -3249,8 +3270,10 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, cleanup: list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) { - if (nh->rt6_info) + if (nh->rt6_info) { + dst_release(&nh->rt6_info->dst); dst_free(&nh->rt6_info->dst); + } kfree(nh->mxc.mx); list_del(&nh->next); kfree(nh); From 9514528d92d4cbe086499322370155ed69f5d06c Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Sat, 17 Jun 2017 10:42:34 -0700 Subject: [PATCH 11/21] ipv6: call dst_dev_put() properly As the intend of this patch series is to completely remove dst gc, we need to call dst_dev_put() to release the reference to dst->dev when removing routes from fib because we won't keep the gc list anymore and will lose the dst pointer right after removing the routes. Without the gc list, there is no way to find all the dst's that have dst->dev pointing to the going-down dev. Hence, we are doing dst_dev_put() immediately before we lose the last reference of the dst from the routing code. The next dst_check() will trigger a route re-lookup to find another route (if there is any). Signed-off-by: Wei Wang Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- net/ipv6/ip6_fib.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 3b728bcb1301..265401abb98e 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -172,6 +172,7 @@ static void rt6_free_pcpu(struct rt6_info *non_pcpu_rt) ppcpu_rt = per_cpu_ptr(non_pcpu_rt->rt6i_pcpu, cpu); pcpu_rt = *ppcpu_rt; if (pcpu_rt) { + dst_dev_put(&pcpu_rt->dst); dst_release(&pcpu_rt->dst); rt6_rcu_free(pcpu_rt); *ppcpu_rt = NULL; @@ -186,6 +187,7 @@ static void rt6_release(struct rt6_info *rt) { if (atomic_dec_and_test(&rt->rt6i_ref)) { rt6_free_pcpu(rt); + dst_dev_put(&rt->dst); dst_release(&rt->dst); rt6_rcu_free(rt); } From ad65a2f05695aced349e308193c6e2a6b1d87112 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Sat, 17 Jun 2017 10:42:35 -0700 Subject: [PATCH 12/21] ipv6: call dst_hold_safe() properly Similar as ipv4, ipv6 path also needs to call dst_hold_safe() when necessary to avoid double free issue on the dst. Signed-off-by: Wei Wang Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 4 ++-- net/ipv6/route.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 0aa36b093013..2a6397714d70 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -5576,8 +5576,8 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) ip6_del_rt(rt); } if (ifp->rt) { - dst_hold(&ifp->rt->dst); - ip6_del_rt(ifp->rt); + if (dst_hold_safe(&ifp->rt->dst)) + ip6_del_rt(ifp->rt); } rt_genid_bump_ipv6(net); break; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 908b71188c57..c52c51908881 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1366,8 +1366,8 @@ static void ip6_link_failure(struct sk_buff *skb) rt = (struct rt6_info *) skb_dst(skb); if (rt) { if (rt->rt6i_flags & RTF_CACHE) { - dst_hold(&rt->dst); - ip6_del_rt(rt); + if (dst_hold_safe(&rt->dst)) + ip6_del_rt(rt); } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT)) { rt->rt6i_node->fn_sernum = -1; } From 587fea74113463b74e0d2994caf9e5f8045c28af Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Sat, 17 Jun 2017 10:42:36 -0700 Subject: [PATCH 13/21] ipv6: mark DST_NOGC and remove the operation of dst_free() With the previous preparation patches, we are ready to get rid of the dst gc operation in ipv6 code and release dst based on refcnt only. So this patch adds DST_NOGC flag for all IPv6 dst and remove the calls to dst_free() and its related functions. At this point, all dst created in ipv6 code do not use the dst gc anymore and will be destroyed at the point when refcnt drops to 0. Also, as icmp6 dst route is refcounted during creation and will be freed by user during its call of dst_release(), there is no need to add this dst to the icmp6 gc list as well. Instead, we need to add it into uncached list so that when a NETDEV_DOWN/NETDEV_UNREGISRER event comes, we can properly go through these icmp6 dst as well and release the net device properly. Signed-off-by: Wei Wang Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- net/ipv6/ip6_fib.c | 15 ++------------ net/ipv6/route.c | 49 ++++++++++++++++------------------------------ 2 files changed, 19 insertions(+), 45 deletions(-) diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 265401abb98e..e3b35e146eef 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -153,11 +153,6 @@ static void node_free(struct fib6_node *fn) kmem_cache_free(fib6_node_kmem, fn); } -static void rt6_rcu_free(struct rt6_info *rt) -{ - call_rcu(&rt->dst.rcu_head, dst_rcu_free); -} - static void rt6_free_pcpu(struct rt6_info *non_pcpu_rt) { int cpu; @@ -174,7 +169,6 @@ static void rt6_free_pcpu(struct rt6_info *non_pcpu_rt) if (pcpu_rt) { dst_dev_put(&pcpu_rt->dst); dst_release(&pcpu_rt->dst); - rt6_rcu_free(pcpu_rt); *ppcpu_rt = NULL; } } @@ -189,7 +183,6 @@ static void rt6_release(struct rt6_info *rt) rt6_free_pcpu(rt); dst_dev_put(&rt->dst); dst_release(&rt->dst); - rt6_rcu_free(rt); } } @@ -1108,9 +1101,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, /* Always release dst as dst->__refcnt is guaranteed * to be taken before entering this function */ - dst_release(&rt->dst); - if (!(rt->dst.flags & DST_NOCACHE)) - dst_free(&rt->dst); + dst_release_immediate(&rt->dst); } return err; @@ -1124,9 +1115,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, /* Always release dst as dst->__refcnt is guaranteed * to be taken before entering this function */ - dst_release(&rt->dst); - if (!(rt->dst.flags & DST_NOCACHE)) - dst_free(&rt->dst); + dst_release_immediate(&rt->dst); return err; #endif } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index c52c51908881..5f859ee67172 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -354,7 +354,8 @@ static struct rt6_info *__ip6_dst_alloc(struct net *net, int flags) { struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev, - 1, DST_OBSOLETE_FORCE_CHK, flags); + 1, DST_OBSOLETE_FORCE_CHK, + flags | DST_NOGC); if (rt) rt6_info_init(rt); @@ -381,9 +382,7 @@ struct rt6_info *ip6_dst_alloc(struct net *net, *p = NULL; } } else { - dst_release(&rt->dst); - if (!(flags & DST_NOCACHE)) - dst_destroy((struct dst_entry *)rt); + dst_release_immediate(&rt->dst); return NULL; } } @@ -1053,8 +1052,7 @@ static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt) prev = cmpxchg(p, NULL, pcpu_rt); if (prev) { /* If someone did it before us, return prev instead */ - dst_release(&pcpu_rt->dst); - dst_destroy(&pcpu_rt->dst); + dst_release_immediate(&pcpu_rt->dst); pcpu_rt = prev; } } else { @@ -1064,8 +1062,7 @@ static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt) * since rt is going away anyway. The next * dst_check() will trigger a re-lookup. */ - dst_release(&pcpu_rt->dst); - dst_destroy(&pcpu_rt->dst); + dst_release_immediate(&pcpu_rt->dst); pcpu_rt = rt; } dst_hold(&pcpu_rt->dst); @@ -1257,9 +1254,8 @@ struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_ori struct net_device *loopback_dev = net->loopback_dev; struct dst_entry *new = NULL; - rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1, - DST_OBSOLETE_NONE, 0); + DST_OBSOLETE_NONE, DST_NOGC); if (rt) { rt6_info_init(rt); @@ -1279,8 +1275,6 @@ struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_ori #ifdef CONFIG_IPV6_SUBTREES memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key)); #endif - - dst_free(new); } dst_release(dst_orig); @@ -1692,12 +1686,10 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev, rt->rt6i_idev = idev; dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0); - spin_lock_bh(&icmp6_dst_lock); - rt->dst.next = icmp6_dst_gc_list; - icmp6_dst_gc_list = &rt->dst; - spin_unlock_bh(&icmp6_dst_lock); - - fib6_force_start_gc(net); + /* Add this dst into uncached_list so that rt6_ifdown() can + * do proper release of the net_device + */ + rt6_uncached_list_add(rt); dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0); @@ -2142,10 +2134,8 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, dev_put(dev); if (idev) in6_dev_put(idev); - if (rt) { - dst_release(&rt->dst); - dst_free(&rt->dst); - } + if (rt) + dst_release_immediate(&rt->dst); return ERR_PTR(err); } @@ -2174,10 +2164,8 @@ int ip6_route_add(struct fib6_config *cfg, return err; out: - if (rt) { - dst_release(&rt->dst); - dst_free(&rt->dst); - } + if (rt) + dst_release_immediate(&rt->dst); return err; } @@ -3206,8 +3194,7 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg); if (err) { - dst_release(&rt->dst); - dst_free(&rt->dst); + dst_release_immediate(&rt->dst); goto cleanup; } @@ -3270,10 +3257,8 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, cleanup: list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) { - if (nh->rt6_info) { - dst_release(&nh->rt6_info->dst); - dst_free(&nh->rt6_info->dst); - } + if (nh->rt6_info) + dst_release_immediate(&nh->rt6_info->dst); kfree(nh->mxc.mx); list_del(&nh->next); kfree(nh); From db916649b5dd0fa2bddeb9427dab513b41e1e984 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Sat, 17 Jun 2017 10:42:37 -0700 Subject: [PATCH 14/21] ipv6: get rid of icmp6 dst garbage collector icmp6 dst route is currently ref counted during creation and will be freed by user during its call of dst_release(). So no need of a garbage collector for it. Remove all icmp6 dst garbage collector related code. Signed-off-by: Wei Wang Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- include/net/ip6_route.h | 1 - net/ipv6/ip6_fib.c | 3 +-- net/ipv6/route.c | 46 ----------------------------------------- 3 files changed, 1 insertion(+), 49 deletions(-) diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index f3da9dd2a8db..0fbf73dd531a 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -116,7 +116,6 @@ struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr, const struct in6_addr *saddr, int oif, int flags); struct dst_entry *icmp6_dst_alloc(struct net_device *dev, struct flowi6 *fl6); -int icmp6_dst_gc(void); void fib6_force_start_gc(struct net *net); diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index e3b35e146eef..c67ec79bf0da 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -1822,8 +1822,7 @@ void fib6_run_gc(unsigned long expires, struct net *net, bool force) } gc_args.timeout = expires ? (int)expires : net->ipv6.sysctl.ip6_rt_gc_interval; - - gc_args.more = icmp6_dst_gc(); + gc_args.more = 0; fib6_clean_all(net, fib6_age, &gc_args); now = jiffies; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 5f859ee67172..c88044b8fa7c 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1657,9 +1657,6 @@ static unsigned int ip6_mtu(const struct dst_entry *dst) return mtu - lwtunnel_headroom(dst->lwtstate, mtu); } -static struct dst_entry *icmp6_dst_gc_list; -static DEFINE_SPINLOCK(icmp6_dst_lock); - struct dst_entry *icmp6_dst_alloc(struct net_device *dev, struct flowi6 *fl6) { @@ -1697,48 +1694,6 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev, return dst; } -int icmp6_dst_gc(void) -{ - struct dst_entry *dst, **pprev; - int more = 0; - - spin_lock_bh(&icmp6_dst_lock); - pprev = &icmp6_dst_gc_list; - - while ((dst = *pprev) != NULL) { - if (!atomic_read(&dst->__refcnt)) { - *pprev = dst->next; - dst_free(dst); - } else { - pprev = &dst->next; - ++more; - } - } - - spin_unlock_bh(&icmp6_dst_lock); - - return more; -} - -static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg), - void *arg) -{ - struct dst_entry *dst, **pprev; - - spin_lock_bh(&icmp6_dst_lock); - pprev = &icmp6_dst_gc_list; - while ((dst = *pprev) != NULL) { - struct rt6_info *rt = (struct rt6_info *) dst; - if (func(rt, arg)) { - *pprev = dst->next; - dst_free(dst); - } else { - pprev = &dst->next; - } - } - spin_unlock_bh(&icmp6_dst_lock); -} - static int ip6_dst_gc(struct dst_ops *ops) { struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops); @@ -2856,7 +2811,6 @@ void rt6_ifdown(struct net *net, struct net_device *dev) }; fib6_clean_all(net, fib6_ifdown, &adn); - icmp6_clean_all(fib6_ifdown, &adn); if (dev) rt6_uncached_list_flush_dev(net, dev); } From 52df157f17e564ec22afc3e4a89b21828220f576 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Sat, 17 Jun 2017 10:42:38 -0700 Subject: [PATCH 15/21] xfrm: take refcnt of dst when creating struct xfrm_dst bundle During the creation of xfrm_dst bundle, always take ref count when allocating the dst. This way, xfrm_bundle_create() will form a linked list of dst with dst->child pointing to a ref counted dst child. And the returned dst pointer is also ref counted. This makes the link from the flow cache to this dst now ref counted properly. As the dst is always ref counted properly, we can safely mark DST_NOGC flag so dst_release() will release dst based on refcnt only. And dst gc is no longer needed and all dst_free() and its related function calls should be replaced with dst_release() or dst_release_immediate(). The special handling logic for dst->child in dst_destroy() can be replaced with a simple dst_release_immediate() call on the child to release the whole list linked by dst->child pointer. Previously used DST_NOHASH flag is not needed anymore as well. The reason that DST_NOHASH is used in the existing code is mainly to prevent the dst inserted in the fib tree to be wrongly destroyed during the deletion of the xfrm_dst bundle. So in the existing code, DST_NOHASH flag is marked in all the dst children except the one which is in the fib tree. However, with this patch series to remove dst gc logic and release dst only based on ref count, it is safe to release all the children from a xfrm_dst bundle as long as the dst children are all ref counted properly which is already the case in the existing code. So, this patch removes the use of DST_NOHASH flag. Signed-off-by: Wei Wang Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- include/net/dst.h | 1 - net/core/dst.c | 19 ++--------------- net/xfrm/xfrm_policy.c | 48 ++++++++++++++++++++++++++---------------- 3 files changed, 32 insertions(+), 36 deletions(-) diff --git a/include/net/dst.h b/include/net/dst.h index 11d779803c0d..88ebb87ad312 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -51,7 +51,6 @@ struct dst_entry { #define DST_HOST 0x0001 #define DST_NOXFRM 0x0002 #define DST_NOPOLICY 0x0004 -#define DST_NOHASH 0x0008 #define DST_NOCACHE 0x0010 #define DST_NOCOUNT 0x0020 #define DST_FAKE_RTABLE 0x0040 diff --git a/net/core/dst.c b/net/core/dst.c index 56998f69b84e..64056ecca5b8 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -250,7 +250,6 @@ struct dst_entry *dst_destroy(struct dst_entry * dst) smp_rmb(); -again: child = dst->child; if (!(dst->flags & DST_NOCOUNT)) @@ -269,20 +268,8 @@ struct dst_entry *dst_destroy(struct dst_entry * dst) kmem_cache_free(dst->ops->kmem_cachep, dst); dst = child; - if (dst) { - int nohash = dst->flags & DST_NOHASH; - - if (atomic_dec_and_test(&dst->__refcnt)) { - /* We were real parent of this dst, so kill child. */ - if (nohash) - goto again; - } else { - /* Child is still referenced, return it for freeing. */ - if (nohash) - return dst; - /* Child is still in his hash table */ - } - } + if (dst) + dst_release_immediate(dst); return NULL; } EXPORT_SYMBOL(dst_destroy); @@ -292,8 +279,6 @@ static void dst_destroy_rcu(struct rcu_head *head) struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head); dst = dst_destroy(dst); - if (dst) - __dst_free(dst); } /* Operations to mark dst as DEAD and clean up the net device referenced diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index ed4e52d95172..85e1e13639cc 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1590,7 +1590,9 @@ static void xfrm_bundle_flo_delete(struct flow_cache_object *flo) struct xfrm_dst *xdst = container_of(flo, struct xfrm_dst, flo); struct dst_entry *dst = &xdst->u.dst; - dst_free(dst); + /* Mark DST_OBSOLETE_DEAD to fail the next xfrm_dst_check() */ + dst->obsolete = DST_OBSOLETE_DEAD; + dst_release_immediate(dst); } static const struct flow_cache_ops xfrm_bundle_fc_ops = { @@ -1620,7 +1622,7 @@ static inline struct xfrm_dst *xfrm_alloc_dst(struct net *net, int family) default: BUG(); } - xdst = dst_alloc(dst_ops, NULL, 0, DST_OBSOLETE_NONE, 0); + xdst = dst_alloc(dst_ops, NULL, 1, DST_OBSOLETE_NONE, DST_NOGC); if (likely(xdst)) { struct dst_entry *dst = &xdst->u.dst; @@ -1723,10 +1725,11 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy, if (!dst_prev) dst0 = dst1; - else { - dst_prev->child = dst_clone(dst1); - dst1->flags |= DST_NOHASH; - } + else + /* Ref count is taken during xfrm_alloc_dst() + * No need to do dst_clone() on dst1 + */ + dst_prev->child = dst1; xdst->route = dst; dst_copy_metrics(dst1, dst); @@ -1792,7 +1795,7 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy, xfrm_state_put(xfrm[i]); free_dst: if (dst0) - dst_free(dst0); + dst_release_immediate(dst0); dst0 = ERR_PTR(err); goto out; } @@ -2073,7 +2076,11 @@ xfrm_bundle_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir, pol_dead |= pols[i]->walk.dead; } if (pol_dead) { - dst_free(&xdst->u.dst); + /* Mark DST_OBSOLETE_DEAD to fail the next + * xfrm_dst_check() + */ + xdst->u.dst.obsolete = DST_OBSOLETE_DEAD; + dst_release_immediate(&xdst->u.dst); xdst = NULL; num_pols = 0; num_xfrms = 0; @@ -2120,11 +2127,12 @@ xfrm_bundle_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir, if (xdst) { /* The policies were stolen for newly generated bundle */ xdst->num_pols = 0; - dst_free(&xdst->u.dst); + /* Mark DST_OBSOLETE_DEAD to fail the next xfrm_dst_check() */ + xdst->u.dst.obsolete = DST_OBSOLETE_DEAD; + dst_release_immediate(&xdst->u.dst); } - /* Flow cache does not have reference, it dst_free()'s, - * but we do need to return one reference for original caller */ + /* We do need to return one reference for original caller */ dst_hold(&new_xdst->u.dst); return &new_xdst->flo; @@ -2147,9 +2155,11 @@ xfrm_bundle_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir, inc_error: XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR); error: - if (xdst != NULL) - dst_free(&xdst->u.dst); - else + if (xdst != NULL) { + /* Mark DST_OBSOLETE_DEAD to fail the next xfrm_dst_check() */ + xdst->u.dst.obsolete = DST_OBSOLETE_DEAD; + dst_release_immediate(&xdst->u.dst); + } else xfrm_pols_put(pols, num_pols); return ERR_PTR(err); } @@ -2636,10 +2646,12 @@ static struct dst_entry *xfrm_dst_check(struct dst_entry *dst, u32 cookie) * notice. That's what we are validating here via the * stale_bundle() check. * - * When a policy's bundle is pruned, we dst_free() the XFRM - * dst which causes it's ->obsolete field to be set to - * DST_OBSOLETE_DEAD. If an XFRM dst has been pruned like - * this, we want to force a new route lookup. + * When an xdst is removed from flow cache, DST_OBSOLETE_DEAD will + * be marked on it. + * When a dst is removed from the fib tree, DST_OBSOLETE_DEAD will + * be marked on it. + * Both will force stable_bundle() to fail on any xdst bundle with + * this dst linked in it. */ if (dst->obsolete < 0 && !stale_bundle(dst)) return dst; From 560fd93bca66c235d04cb7fcb2229b96546e27c8 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Sat, 17 Jun 2017 10:42:39 -0700 Subject: [PATCH 16/21] decnet: take dst->__refcnt when struct dn_route is created struct dn_route is inserted into dn_rt_hash_table but no dst->__refcnt is taken. This patch makes sure the dn_rt_hash_table's reference to the dst is ref counted. As the dst is always ref counted properly, we can safely mark DST_NOGC flag so dst_release() will release dst based on refcnt only. And dst gc is no longer needed and all dst_free() or its related function calls should be replaced with dst_release() or dst_release_immediate(). And dst_dev_put() is called when removing dst from the hash table to release the reference on dst->dev before we lose pointer to it. Also, correct the logic in dn_dst_check_expire() and dn_dst_gc() to check dst->__refcnt to be > 1 to indicate it is referenced by other users. Signed-off-by: Wei Wang Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- net/decnet/dn_route.c | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index 6f95612b4d32..f467c4e3205b 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c @@ -183,11 +183,6 @@ static __inline__ unsigned int dn_hash(__le16 src, __le16 dst) return dn_rt_hash_mask & (unsigned int)tmp; } -static inline void dnrt_free(struct dn_route *rt) -{ - call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free); -} - static void dn_dst_check_expire(unsigned long dummy) { int i; @@ -202,14 +197,15 @@ static void dn_dst_check_expire(unsigned long dummy) spin_lock(&dn_rt_hash_table[i].lock); while ((rt = rcu_dereference_protected(*rtp, lockdep_is_held(&dn_rt_hash_table[i].lock))) != NULL) { - if (atomic_read(&rt->dst.__refcnt) || - (now - rt->dst.lastuse) < expire) { + if (atomic_read(&rt->dst.__refcnt) > 1 || + (now - rt->dst.lastuse) < expire) { rtp = &rt->dst.dn_next; continue; } *rtp = rt->dst.dn_next; rt->dst.dn_next = NULL; - dnrt_free(rt); + dst_dev_put(&rt->dst); + dst_release(&rt->dst); } spin_unlock(&dn_rt_hash_table[i].lock); @@ -235,14 +231,15 @@ static int dn_dst_gc(struct dst_ops *ops) while ((rt = rcu_dereference_protected(*rtp, lockdep_is_held(&dn_rt_hash_table[i].lock))) != NULL) { - if (atomic_read(&rt->dst.__refcnt) || - (now - rt->dst.lastuse) < expire) { + if (atomic_read(&rt->dst.__refcnt) > 1 || + (now - rt->dst.lastuse) < expire) { rtp = &rt->dst.dn_next; continue; } *rtp = rt->dst.dn_next; rt->dst.dn_next = NULL; - dnrt_free(rt); + dst_dev_put(&rt->dst); + dst_release(&rt->dst); break; } spin_unlock_bh(&dn_rt_hash_table[i].lock); @@ -344,7 +341,7 @@ static int dn_insert_route(struct dn_route *rt, unsigned int hash, struct dn_rou dst_use(&rth->dst, now); spin_unlock_bh(&dn_rt_hash_table[hash].lock); - dst_free(&rt->dst); + dst_release_immediate(&rt->dst); *rp = rth; return 0; } @@ -374,7 +371,8 @@ static void dn_run_flush(unsigned long dummy) for(; rt; rt = next) { next = rcu_dereference_raw(rt->dst.dn_next); RCU_INIT_POINTER(rt->dst.dn_next, NULL); - dnrt_free(rt); + dst_dev_put(&rt->dst); + dst_release(&rt->dst); } nothing_to_declare: @@ -1181,7 +1179,8 @@ static int dn_route_output_slow(struct dst_entry **pprt, const struct flowidn *o if (dev_out->flags & IFF_LOOPBACK) flags |= RTCF_LOCAL; - rt = dst_alloc(&dn_dst_ops, dev_out, 0, DST_OBSOLETE_NONE, DST_HOST); + rt = dst_alloc(&dn_dst_ops, dev_out, 1, DST_OBSOLETE_NONE, + DST_HOST | DST_NOGC); if (rt == NULL) goto e_nobufs; @@ -1215,6 +1214,7 @@ static int dn_route_output_slow(struct dst_entry **pprt, const struct flowidn *o goto e_neighbour; hash = dn_hash(rt->fld.saddr, rt->fld.daddr); + /* dn_insert_route() increments dst->__refcnt */ dn_insert_route(rt, hash, (struct dn_route **)pprt); done: @@ -1237,7 +1237,7 @@ static int dn_route_output_slow(struct dst_entry **pprt, const struct flowidn *o err = -ENOBUFS; goto done; e_neighbour: - dst_free(&rt->dst); + dst_release_immediate(&rt->dst); goto e_nobufs; } @@ -1445,7 +1445,8 @@ static int dn_route_input_slow(struct sk_buff *skb) } make_route: - rt = dst_alloc(&dn_dst_ops, out_dev, 0, DST_OBSOLETE_NONE, DST_HOST); + rt = dst_alloc(&dn_dst_ops, out_dev, 1, DST_OBSOLETE_NONE, + DST_HOST | DST_NOGC); if (rt == NULL) goto e_nobufs; @@ -1491,6 +1492,7 @@ static int dn_route_input_slow(struct sk_buff *skb) goto e_neighbour; hash = dn_hash(rt->fld.saddr, rt->fld.daddr); + /* dn_insert_route() increments dst->__refcnt */ dn_insert_route(rt, hash, &rt); skb_dst_set(skb, &rt->dst); @@ -1514,7 +1516,7 @@ static int dn_route_input_slow(struct sk_buff *skb) goto done; e_neighbour: - dst_free(&rt->dst); + dst_release_immediate(&rt->dst); goto done; } From 5b7c9a8ff828287af5aebe93e707271bf1a82cc3 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Sat, 17 Jun 2017 10:42:40 -0700 Subject: [PATCH 17/21] net: remove dst gc related code This patch removes all dst gc related code and all the dst free functions Signed-off-by: Wei Wang Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- include/net/dst.h | 21 ----- net/core/dev.c | 1 - net/core/dst.c | 213 ---------------------------------------------- 3 files changed, 235 deletions(-) diff --git a/include/net/dst.h b/include/net/dst.h index 88ebb87ad312..0c56d1fc4d7f 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -425,28 +425,9 @@ void *dst_alloc(struct dst_ops *ops, struct net_device *dev, int initial_ref, void dst_init(struct dst_entry *dst, struct dst_ops *ops, struct net_device *dev, int initial_ref, int initial_obsolete, unsigned short flags); -void __dst_free(struct dst_entry *dst); struct dst_entry *dst_destroy(struct dst_entry *dst); void dst_dev_put(struct dst_entry *dst); -static inline void dst_free(struct dst_entry *dst) -{ - if (dst->obsolete > 0) - return; - if (!atomic_read(&dst->__refcnt)) { - dst = dst_destroy(dst); - if (!dst) - return; - } - __dst_free(dst); -} - -static inline void dst_rcu_free(struct rcu_head *head) -{ - struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head); - dst_free(dst); -} - static inline void dst_confirm(struct dst_entry *dst) { } @@ -508,8 +489,6 @@ static inline struct dst_entry *dst_check(struct dst_entry *dst, u32 cookie) return dst; } -void dst_subsys_init(void); - /* Flags for xfrm_lookup flags argument. */ enum { XFRM_LOOKUP_ICMP = 1 << 0, diff --git a/net/core/dev.c b/net/core/dev.c index b8d6dd9e8b5c..5d1830b8d2cf 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -8681,7 +8681,6 @@ static int __init net_dev_init(void) rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead", NULL, dev_cpu_dead); WARN_ON(rc < 0); - dst_subsys_init(); rc = 0; out: return rc; diff --git a/net/core/dst.c b/net/core/dst.c index 64056ecca5b8..30bea01d2262 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -42,108 +42,6 @@ * to dirty as few cache lines as possible in __dst_free(). * As this is not a very strong hint, we dont force an alignment on SMP. */ -static struct { - spinlock_t lock; - struct dst_entry *list; - unsigned long timer_inc; - unsigned long timer_expires; -} dst_garbage = { - .lock = __SPIN_LOCK_UNLOCKED(dst_garbage.lock), - .timer_inc = DST_GC_MAX, -}; -static void dst_gc_task(struct work_struct *work); -static void ___dst_free(struct dst_entry *dst); - -static DECLARE_DELAYED_WORK(dst_gc_work, dst_gc_task); - -static DEFINE_MUTEX(dst_gc_mutex); -/* - * long lived entries are maintained in this list, guarded by dst_gc_mutex - */ -static struct dst_entry *dst_busy_list; - -static void dst_gc_task(struct work_struct *work) -{ - int delayed = 0; - int work_performed = 0; - unsigned long expires = ~0L; - struct dst_entry *dst, *next, head; - struct dst_entry *last = &head; - - mutex_lock(&dst_gc_mutex); - next = dst_busy_list; - -loop: - while ((dst = next) != NULL) { - next = dst->next; - prefetch(&next->next); - cond_resched(); - if (likely(atomic_read(&dst->__refcnt))) { - last->next = dst; - last = dst; - delayed++; - continue; - } - work_performed++; - - dst = dst_destroy(dst); - if (dst) { - /* NOHASH and still referenced. Unless it is already - * on gc list, invalidate it and add to gc list. - * - * Note: this is temporary. Actually, NOHASH dst's - * must be obsoleted when parent is obsoleted. - * But we do not have state "obsoleted, but - * referenced by parent", so it is right. - */ - if (dst->obsolete > 0) - continue; - - ___dst_free(dst); - dst->next = next; - next = dst; - } - } - - spin_lock_bh(&dst_garbage.lock); - next = dst_garbage.list; - if (next) { - dst_garbage.list = NULL; - spin_unlock_bh(&dst_garbage.lock); - goto loop; - } - last->next = NULL; - dst_busy_list = head.next; - if (!dst_busy_list) - dst_garbage.timer_inc = DST_GC_MAX; - else { - /* - * if we freed less than 1/10 of delayed entries, - * we can sleep longer. - */ - if (work_performed <= delayed/10) { - dst_garbage.timer_expires += dst_garbage.timer_inc; - if (dst_garbage.timer_expires > DST_GC_MAX) - dst_garbage.timer_expires = DST_GC_MAX; - dst_garbage.timer_inc += DST_GC_INC; - } else { - dst_garbage.timer_inc = DST_GC_INC; - dst_garbage.timer_expires = DST_GC_MIN; - } - expires = dst_garbage.timer_expires; - /* - * if the next desired timer is more than 4 seconds in the - * future then round the timer to whole seconds - */ - if (expires > 4*HZ) - expires = round_jiffies_relative(expires); - schedule_delayed_work(&dst_gc_work, expires); - } - - spin_unlock_bh(&dst_garbage.lock); - mutex_unlock(&dst_gc_mutex); -} - int dst_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb) { kfree_skb(skb); @@ -216,34 +114,6 @@ void *dst_alloc(struct dst_ops *ops, struct net_device *dev, } EXPORT_SYMBOL(dst_alloc); -static void ___dst_free(struct dst_entry *dst) -{ - /* The first case (dev==NULL) is required, when - protocol module is unloaded. - */ - if (dst->dev == NULL || !(dst->dev->flags&IFF_UP)) { - dst->input = dst_discard; - dst->output = dst_discard_out; - } - dst->obsolete = DST_OBSOLETE_DEAD; -} - -void __dst_free(struct dst_entry *dst) -{ - spin_lock_bh(&dst_garbage.lock); - ___dst_free(dst); - dst->next = dst_garbage.list; - dst_garbage.list = dst; - if (dst_garbage.timer_inc > DST_GC_INC) { - dst_garbage.timer_inc = DST_GC_INC; - dst_garbage.timer_expires = DST_GC_MIN; - mod_delayed_work(system_wq, &dst_gc_work, - dst_garbage.timer_expires); - } - spin_unlock_bh(&dst_garbage.lock); -} -EXPORT_SYMBOL(__dst_free); - struct dst_entry *dst_destroy(struct dst_entry * dst) { struct dst_entry *child; @@ -448,86 +318,3 @@ struct metadata_dst __percpu *metadata_dst_alloc_percpu(u8 optslen, gfp_t flags) return md_dst; } EXPORT_SYMBOL_GPL(metadata_dst_alloc_percpu); - -/* Dirty hack. We did it in 2.2 (in __dst_free), - * we have _very_ good reasons not to repeat - * this mistake in 2.3, but we have no choice - * now. _It_ _is_ _explicit_ _deliberate_ - * _race_ _condition_. - * - * Commented and originally written by Alexey. - */ -static void dst_ifdown(struct dst_entry *dst, struct net_device *dev, - int unregister) -{ - if (dst->ops->ifdown) - dst->ops->ifdown(dst, dev, unregister); - - if (dev != dst->dev) - return; - - if (!unregister) { - dst->input = dst_discard; - dst->output = dst_discard_out; - } else { - dst->dev = dev_net(dst->dev)->loopback_dev; - dev_hold(dst->dev); - dev_put(dev); - } -} - -static int dst_dev_event(struct notifier_block *this, unsigned long event, - void *ptr) -{ - struct net_device *dev = netdev_notifier_info_to_dev(ptr); - struct dst_entry *dst, *last = NULL; - - switch (event) { - case NETDEV_UNREGISTER_FINAL: - case NETDEV_DOWN: - mutex_lock(&dst_gc_mutex); - for (dst = dst_busy_list; dst; dst = dst->next) { - last = dst; - dst_ifdown(dst, dev, event != NETDEV_DOWN); - } - - spin_lock_bh(&dst_garbage.lock); - dst = dst_garbage.list; - dst_garbage.list = NULL; - /* The code in dst_ifdown places a hold on the loopback device. - * If the gc entry processing is set to expire after a lengthy - * interval, this hold can cause netdev_wait_allrefs() to hang - * out and wait for a long time -- until the the loopback - * interface is released. If we're really unlucky, it'll emit - * pr_emerg messages to console too. Reset the interval here, - * so dst cleanups occur in a more timely fashion. - */ - if (dst_garbage.timer_inc > DST_GC_INC) { - dst_garbage.timer_inc = DST_GC_INC; - dst_garbage.timer_expires = DST_GC_MIN; - mod_delayed_work(system_wq, &dst_gc_work, - dst_garbage.timer_expires); - } - spin_unlock_bh(&dst_garbage.lock); - - if (last) - last->next = dst; - else - dst_busy_list = dst; - for (; dst; dst = dst->next) - dst_ifdown(dst, dev, event != NETDEV_DOWN); - mutex_unlock(&dst_gc_mutex); - break; - } - return NOTIFY_DONE; -} - -static struct notifier_block dst_dev_notifier = { - .notifier_call = dst_dev_event, - .priority = -10, /* must be called after other network notifiers */ -}; - -void __init dst_subsys_init(void) -{ - register_netdevice_notifier(&dst_dev_notifier); -} From b2a9c0ed75a32e788d034a58a18f2fc46396e412 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Sat, 17 Jun 2017 10:42:41 -0700 Subject: [PATCH 18/21] net: remove DST_NOGC flag Now that all the components have been changed to release dst based on refcnt only and not depend on dst gc anymore, we can remove the temporary flag DST_NOGC. Note that we also need to remove the DST_NOCACHE check in dst_release() and dst_hold_safe() because now all the dst are released based on refcnt and behaves as DST_NOCACHE. Signed-off-by: Wei Wang Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- include/net/dst.h | 6 +----- net/core/dst.c | 4 +--- net/decnet/dn_route.c | 6 ++---- net/ipv4/route.c | 5 ++--- net/ipv6/route.c | 5 ++--- net/xfrm/xfrm_policy.c | 2 +- 6 files changed, 9 insertions(+), 19 deletions(-) diff --git a/include/net/dst.h b/include/net/dst.h index 0c56d1fc4d7f..1be82f672c37 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -57,7 +57,6 @@ struct dst_entry { #define DST_XFRM_TUNNEL 0x0080 #define DST_XFRM_QUEUE 0x0100 #define DST_METADATA 0x0200 -#define DST_NOGC 0x0400 short error; @@ -336,10 +335,7 @@ static inline void skb_dst_force(struct sk_buff *skb) */ static inline bool dst_hold_safe(struct dst_entry *dst) { - if (dst->flags & (DST_NOCACHE | DST_NOGC)) - return atomic_inc_not_zero(&dst->__refcnt); - dst_hold(dst); - return true; + return atomic_inc_not_zero(&dst->__refcnt); } /** diff --git a/net/core/dst.c b/net/core/dst.c index 30bea01d2262..70543dabb797 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -179,14 +179,12 @@ void dst_release(struct dst_entry *dst) { if (dst) { int newrefcnt; - unsigned short destroy_after_rcu = dst->flags & - (DST_NOCACHE | DST_NOGC); newrefcnt = atomic_dec_return(&dst->__refcnt); if (unlikely(newrefcnt < 0)) net_warn_ratelimited("%s: dst:%p refcnt:%d\n", __func__, dst, newrefcnt); - if (!newrefcnt && unlikely(destroy_after_rcu)) + if (!newrefcnt) call_rcu(&dst->rcu_head, dst_destroy_rcu); } } diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index f467c4e3205b..5d17d843ac86 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c @@ -1179,8 +1179,7 @@ static int dn_route_output_slow(struct dst_entry **pprt, const struct flowidn *o if (dev_out->flags & IFF_LOOPBACK) flags |= RTCF_LOCAL; - rt = dst_alloc(&dn_dst_ops, dev_out, 1, DST_OBSOLETE_NONE, - DST_HOST | DST_NOGC); + rt = dst_alloc(&dn_dst_ops, dev_out, 1, DST_OBSOLETE_NONE, DST_HOST); if (rt == NULL) goto e_nobufs; @@ -1445,8 +1444,7 @@ static int dn_route_input_slow(struct sk_buff *skb) } make_route: - rt = dst_alloc(&dn_dst_ops, out_dev, 1, DST_OBSOLETE_NONE, - DST_HOST | DST_NOGC); + rt = dst_alloc(&dn_dst_ops, out_dev, 1, DST_OBSOLETE_NONE, DST_HOST); if (rt == NULL) goto e_nobufs; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 80b30c2bf47d..9a0f496f8bf4 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1496,8 +1496,7 @@ struct rtable *rt_dst_alloc(struct net_device *dev, rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK, (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) | (nopolicy ? DST_NOPOLICY : 0) | - (noxfrm ? DST_NOXFRM : 0) | - DST_NOGC); + (noxfrm ? DST_NOXFRM : 0)); if (rt) { rt->rt_genid = rt_genid_ipv4(dev_net(dev)); @@ -2503,7 +2502,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or struct rtable *ort = (struct rtable *) dst_orig; struct rtable *rt; - rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, DST_NOGC); + rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0); if (rt) { struct dst_entry *new = &rt->dst; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index c88044b8fa7c..6b6528fa3292 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -354,8 +354,7 @@ static struct rt6_info *__ip6_dst_alloc(struct net *net, int flags) { struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev, - 1, DST_OBSOLETE_FORCE_CHK, - flags | DST_NOGC); + 1, DST_OBSOLETE_FORCE_CHK, flags); if (rt) rt6_info_init(rt); @@ -1255,7 +1254,7 @@ struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_ori struct dst_entry *new = NULL; rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1, - DST_OBSOLETE_NONE, DST_NOGC); + DST_OBSOLETE_NONE, 0); if (rt) { rt6_info_init(rt); diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 85e1e13639cc..3f7e77f11112 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1622,7 +1622,7 @@ static inline struct xfrm_dst *xfrm_alloc_dst(struct net *net, int family) default: BUG(); } - xdst = dst_alloc(dst_ops, NULL, 1, DST_OBSOLETE_NONE, DST_NOGC); + xdst = dst_alloc(dst_ops, NULL, 1, DST_OBSOLETE_NONE, 0); if (likely(xdst)) { struct dst_entry *dst = &xdst->u.dst; From a4c2fd7f78915a0d7c5275e7612e7793157a01f2 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Sat, 17 Jun 2017 10:42:42 -0700 Subject: [PATCH 19/21] net: remove DST_NOCACHE flag DST_NOCACHE flag check has been removed from dst_release() and dst_hold_safe() in a previous patch because all the dst are now ref counted properly and can be released based on refcnt only. Looking at the rest of the DST_NOCACHE use, all of them can now be removed or replaced with other checks. So this patch gets rid of all the DST_NOCACHE usage and remove this flag completely. Signed-off-by: Wei Wang Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- drivers/net/vrf.c | 2 +- include/net/dst.h | 1 - include/net/ip6_fib.h | 2 +- net/core/dst.c | 2 +- net/ipv4/route.c | 23 +++++++++++------------ net/ipv6/ip6_fib.c | 4 +--- net/ipv6/route.c | 7 ++----- net/xfrm/xfrm_policy.c | 1 - 8 files changed, 17 insertions(+), 25 deletions(-) diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index d038927acfca..997ef25189fd 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -563,7 +563,7 @@ static void vrf_rt6_release(struct net_device *dev, struct net_vrf *vrf) static int vrf_rt6_create(struct net_device *dev) { - int flags = DST_HOST | DST_NOPOLICY | DST_NOXFRM | DST_NOCACHE; + int flags = DST_HOST | DST_NOPOLICY | DST_NOXFRM; struct net_vrf *vrf = netdev_priv(dev); struct net *net = dev_net(dev); struct fib6_table *rt6i_table; diff --git a/include/net/dst.h b/include/net/dst.h index 1be82f672c37..642483ed4edf 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -51,7 +51,6 @@ struct dst_entry { #define DST_HOST 0x0001 #define DST_NOXFRM 0x0002 #define DST_NOPOLICY 0x0004 -#define DST_NOCACHE 0x0010 #define DST_NOCOUNT 0x0020 #define DST_FAKE_RTABLE 0x0040 #define DST_XFRM_TUNNEL 0x0080 diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index aa50e2e6fa2a..1a88008cc6f5 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -170,7 +170,7 @@ static inline void rt6_update_expires(struct rt6_info *rt0, int timeout) static inline u32 rt6_get_cookie(const struct rt6_info *rt) { if (rt->rt6i_flags & RTF_PCPU || - (unlikely(rt->dst.flags & DST_NOCACHE) && rt->dst.from)) + (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->dst.from)) rt = (struct rt6_info *)(rt->dst.from); return rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0; diff --git a/net/core/dst.c b/net/core/dst.c index 70543dabb797..f851adb9ec9b 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -270,7 +270,7 @@ static void __metadata_dst_init(struct metadata_dst *md_dst, u8 optslen) dst = &md_dst->dst; dst_init(dst, &md_dst_ops, NULL, 1, DST_OBSOLETE_NONE, - DST_METADATA | DST_NOCACHE | DST_NOCOUNT); + DST_METADATA | DST_NOCOUNT); dst->input = dst_md_discard; dst->output = dst_md_discard_out; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 9a0f496f8bf4..c816cd53f7fc 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1299,7 +1299,7 @@ static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr) } static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe, - __be32 daddr) + __be32 daddr, const bool do_cache) { bool ret = false; @@ -1328,7 +1328,7 @@ static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe, if (!rt->rt_gateway) rt->rt_gateway = daddr; - if (!(rt->dst.flags & DST_NOCACHE)) { + if (do_cache) { dst_hold(&rt->dst); rcu_assign_pointer(*porig, rt); if (orig) { @@ -1441,7 +1441,8 @@ static bool rt_cache_valid(const struct rtable *rt) static void rt_set_nexthop(struct rtable *rt, __be32 daddr, const struct fib_result *res, struct fib_nh_exception *fnhe, - struct fib_info *fi, u16 type, u32 itag) + struct fib_info *fi, u16 type, u32 itag, + const bool do_cache) { bool cached = false; @@ -1462,8 +1463,8 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr, #endif rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate); if (unlikely(fnhe)) - cached = rt_bind_exception(rt, fnhe, daddr); - else if (!(rt->dst.flags & DST_NOCACHE)) + cached = rt_bind_exception(rt, fnhe, daddr, do_cache); + else if (do_cache) cached = rt_cache_route(nh, rt); if (unlikely(!cached)) { /* Routes we intend to cache in nexthop exception or @@ -1471,7 +1472,6 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr, * However, if we are unsuccessful at storing this * route into the cache we really need to set it. */ - rt->dst.flags |= DST_NOCACHE; if (!rt->rt_gateway) rt->rt_gateway = daddr; rt_add_uncached_list(rt); @@ -1494,7 +1494,7 @@ struct rtable *rt_dst_alloc(struct net_device *dev, struct rtable *rt; rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK, - (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) | + (will_cache ? 0 : DST_HOST) | (nopolicy ? DST_NOPOLICY : 0) | (noxfrm ? DST_NOXFRM : 0)); @@ -1738,7 +1738,8 @@ static int __mkroute_input(struct sk_buff *skb, rth->dst.input = ip_forward; - rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag); + rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag, + do_cache); set_lwt_redirect(rth); skb_dst_set(skb, &rth->dst); out: @@ -2026,10 +2027,8 @@ out: return err; rth->dst.input = lwtunnel_input; } - if (unlikely(!rt_cache_route(nh, rth))) { - rth->dst.flags |= DST_NOCACHE; + if (unlikely(!rt_cache_route(nh, rth))) rt_add_uncached_list(rth); - } } skb_dst_set(skb, &rth->dst); err = 0; @@ -2260,7 +2259,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res, #endif } - rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0); + rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache); set_lwt_redirect(rth); return rth; diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index c67ec79bf0da..4f3e4657f2d6 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -975,8 +975,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, int replace_required = 0; int sernum = fib6_new_sernum(info->nl_net); - if (WARN_ON_ONCE((rt->dst.flags & DST_NOCACHE) && - !atomic_read(&rt->dst.__refcnt))) + if (WARN_ON_ONCE(!atomic_read(&rt->dst.__refcnt))) return -EINVAL; if (info->nlh) { @@ -1073,7 +1072,6 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, fib6_start_gc(info->nl_net, rt); if (!(rt->rt6i_flags & RTF_CACHE)) fib6_prune_clones(info->nl_net, pn); - rt->dst.flags &= ~DST_NOCACHE; } out: diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 6b6528fa3292..2e4490076061 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -128,7 +128,6 @@ static void rt6_uncached_list_add(struct rt6_info *rt) { struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list); - rt->dst.flags |= DST_NOCACHE; rt->rt6i_uncached_list = ul; spin_lock_bh(&ul->lock); @@ -1326,7 +1325,7 @@ static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) rt6_dst_from_metrics_check(rt); if (rt->rt6i_flags & RTF_PCPU || - (unlikely(dst->flags & DST_NOCACHE) && rt->dst.from)) + (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->dst.from)) return rt6_dst_from_check(rt, cookie); else return rt6_check(rt, cookie); @@ -2130,8 +2129,7 @@ static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info) struct fib6_table *table; struct net *net = dev_net(rt->dst.dev); - if (rt == net->ipv6.ip6_null_entry || - rt->dst.flags & DST_NOCACHE) { + if (rt == net->ipv6.ip6_null_entry) { err = -ENOENT; goto out; } @@ -2722,7 +2720,6 @@ struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev, rt->rt6i_dst.plen = 128; tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL; rt->rt6i_table = fib6_get_table(net, tb_id); - rt->dst.flags |= DST_NOCACHE; return rt; } diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 3f7e77f11112..af8e38f47b5b 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -2231,7 +2231,6 @@ struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig, } dst_hold(&xdst->u.dst); - xdst->u.dst.flags |= DST_NOCACHE; route = xdst->route; } } From 1eb04e7c9e63e0480cc52b84187283de4fe5f982 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Sat, 17 Jun 2017 10:42:43 -0700 Subject: [PATCH 20/21] net: reorder all the dst flags As some dst flags are removed, reorder the dst flags to fill in the blanks. Note: these flags are not exposed into user space. So it is safe to reorder. Signed-off-by: Wei Wang Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- include/net/dst.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/include/net/dst.h b/include/net/dst.h index 642483ed4edf..d912b44d2dcb 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -51,11 +51,11 @@ struct dst_entry { #define DST_HOST 0x0001 #define DST_NOXFRM 0x0002 #define DST_NOPOLICY 0x0004 -#define DST_NOCOUNT 0x0020 -#define DST_FAKE_RTABLE 0x0040 -#define DST_XFRM_TUNNEL 0x0080 -#define DST_XFRM_QUEUE 0x0100 -#define DST_METADATA 0x0200 +#define DST_NOCOUNT 0x0008 +#define DST_FAKE_RTABLE 0x0010 +#define DST_XFRM_TUNNEL 0x0020 +#define DST_XFRM_QUEUE 0x0040 +#define DST_METADATA 0x0080 short error; From 44ebe79149ff415e810529bf192faa5c38d4a0de Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Sat, 17 Jun 2017 10:42:44 -0700 Subject: [PATCH 21/21] net: add debug atomic_inc_not_zero() in dst_hold() This patch is meant to add a debug warning on the situation where dst is being held during its destroy phase. This could potentially cause double free issue on the dst. Signed-off-by: Wei Wang Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- include/net/dst.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/net/dst.h b/include/net/dst.h index d912b44d2dcb..f73611ec4017 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -251,7 +251,7 @@ static inline void dst_hold(struct dst_entry *dst) * __pad_to_align_refcnt declaration in struct dst_entry */ BUILD_BUG_ON(offsetof(struct dst_entry, __refcnt) & 63); - atomic_inc(&dst->__refcnt); + WARN_ON(atomic_inc_not_zero(&dst->__refcnt) == 0); } static inline void dst_use(struct dst_entry *dst, unsigned long time)