• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  *	Linux INET6 implementation
3  *	FIB front-end.
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13 
14 /*	Changes:
15  *
16  *	YOSHIFUJI Hideaki @USAGI
17  *		reworked default router selection.
18  *		- respect outgoing interface
19  *		- select from (probably) reachable routers (i.e.
20  *		routers in REACHABLE, STALE, DELAY or PROBE states).
21  *		- always select the same router if it is (probably)
22  *		reachable.  otherwise, round-robin the list.
23  *	Ville Nuorvala
24  *		Fixed routing subtrees.
25  */
26 
27 #define pr_fmt(fmt) "IPv6: " fmt
28 
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <net/net_namespace.h>
48 #include <net/snmp.h>
49 #include <net/ipv6.h>
50 #include <net/ip6_fib.h>
51 #include <net/ip6_route.h>
52 #include <net/ndisc.h>
53 #include <net/addrconf.h>
54 #include <net/tcp.h>
55 #include <linux/rtnetlink.h>
56 #include <net/dst.h>
57 #include <net/dst_metadata.h>
58 #include <net/xfrm.h>
59 #include <net/netevent.h>
60 #include <net/netlink.h>
61 #include <net/nexthop.h>
62 #include <net/lwtunnel.h>
63 #include <net/ip_tunnels.h>
64 #include <net/l3mdev.h>
65 #include <trace/events/fib6.h>
66 
67 #include <linux/uaccess.h>
68 
69 #ifdef CONFIG_SYSCTL
70 #include <linux/sysctl.h>
71 #endif
72 
/* Result of a Neighbour Unreachability Detection check on a route's next
 * hop (see rt6_check_neigh()); negative values are failures, used as
 * special scores by rt6_score_route()/find_match().
 */
enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,		/* route unusable, skip it */
	RT6_NUD_FAIL_PROBE = -2,	/* usable as lowest score; probe neigh */
	RT6_NUD_FAIL_DO_RR = -1,	/* fall back to round-robin selection */
	RT6_NUD_SUCCEED = 1		/* next hop considered reachable */
};
79 
80 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
81 static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
82 static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
83 static unsigned int	 ip6_mtu(const struct dst_entry *dst);
84 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
85 static void		ip6_dst_destroy(struct dst_entry *);
86 static void		ip6_dst_ifdown(struct dst_entry *,
87 				       struct net_device *dev, int how);
88 static int		 ip6_dst_gc(struct dst_ops *ops);
89 
90 static int		ip6_pkt_discard(struct sk_buff *skb);
91 static int		ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
92 static int		ip6_pkt_prohibit(struct sk_buff *skb);
93 static int		ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
94 static void		ip6_link_failure(struct sk_buff *skb);
95 static void		ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
96 					   struct sk_buff *skb, u32 mtu,
97 					   bool confirm_neigh);
98 static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
99 					struct sk_buff *skb);
100 static void		rt6_dst_from_metrics_check(struct rt6_info *rt);
101 static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
102 static size_t rt6_nlmsg_size(struct rt6_info *rt);
103 static int rt6_fill_node(struct net *net,
104 			 struct sk_buff *skb, struct rt6_info *rt,
105 			 struct in6_addr *dst, struct in6_addr *src,
106 			 int iif, int type, u32 portid, u32 seq,
107 			 unsigned int flags);
108 
109 #ifdef CONFIG_IPV6_ROUTE_INFO
110 static struct rt6_info *rt6_add_route_info(struct net *net,
111 					   const struct in6_addr *prefix, int prefixlen,
112 					   const struct in6_addr *gwaddr,
113 					   struct net_device *dev,
114 					   unsigned int pref);
115 static struct rt6_info *rt6_get_route_info(struct net *net,
116 					   const struct in6_addr *prefix, int prefixlen,
117 					   const struct in6_addr *gwaddr,
118 					   struct net_device *dev);
119 #endif
120 
/* Per-cpu list of rt6_info entries that live outside the fib6 tree;
 * protected by @lock and walked at device teardown by
 * rt6_uncached_list_flush_dev().
 */
struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
127 
/* Link @rt onto this CPU's uncached list and remember which list it went
 * on, so rt6_uncached_list_del() can find it again from any CPU.
 */
static void rt6_uncached_list_add(struct rt6_info *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);

	rt->rt6i_uncached_list = ul;

	/* _bh: the lists are also manipulated from softirq context */
	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt6i_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}
138 
/* Unlink @rt from the per-cpu uncached list it was added to, if any.
 * An empty rt6i_uncached node means the route was never added.
 */
static void rt6_uncached_list_del(struct rt6_info *rt)
{
	if (!list_empty(&rt->rt6i_uncached)) {
		/* use the list recorded at add time, not this CPU's list */
		struct uncached_list *ul = rt->rt6i_uncached_list;

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt6i_uncached);
		spin_unlock_bh(&ul->lock);
	}
}
149 
/* @dev is going away: re-home every uncached route that still references
 * it (via rt6i_idev and/or dst.dev) onto the namespace's loopback device,
 * transferring the reference counts, so the routes can outlive the device.
 */
static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	/* loopback itself is never re-homed */
	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			/* move the inet6_dev reference over to loopback */
			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			/* move the net_device reference over to loopback */
			if (rt_dev == dev) {
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}
181 
/* A percpu clone shares metrics with its parent route (dst.from);
 * copy-on-write goes through to the parent's metrics.
 */
static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
{
	return dst_metrics_write_ptr(rt->dst.from);
}
186 
ipv6_cow_metrics(struct dst_entry * dst,unsigned long old)187 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
188 {
189 	struct rt6_info *rt = (struct rt6_info *)dst;
190 
191 	if (rt->rt6i_flags & RTF_PCPU)
192 		return rt6_pcpu_cow_metrics(rt);
193 	else if (rt->rt6i_flags & RTF_CACHE)
194 		return NULL;
195 	else
196 		return dst_cow_metrics_generic(dst, old);
197 }
198 
choose_neigh_daddr(struct rt6_info * rt,struct sk_buff * skb,const void * daddr)199 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
200 					     struct sk_buff *skb,
201 					     const void *daddr)
202 {
203 	struct in6_addr *p = &rt->rt6i_gateway;
204 
205 	if (!ipv6_addr_any(p))
206 		return (const void *) p;
207 	else if (skb)
208 		return &ipv6_hdr(skb)->daddr;
209 	return daddr;
210 }
211 
/* dst_ops neigh_lookup hook: resolve (and create on demand) the
 * neighbour entry for this route, keyed by the gateway / destination
 * chosen in choose_neigh_daddr().
 */
static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
					  struct sk_buff *skb,
					  const void *daddr)
{
	struct rt6_info *rt = (struct rt6_info *) dst;
	struct neighbour *n;

	daddr = choose_neigh_daddr(rt, skb, daddr);
	n = __ipv6_neigh_lookup(dst->dev, daddr);
	if (n)
		return n;
	/* no entry yet: create one in the IPv6 neighbour table */
	return neigh_create(&nd_tbl, daddr, dst->dev);
}
225 
/* dst_ops confirm_neigh hook: mark the route's next hop as recently
 * confirmed reachable.  Skipped when no usable address can be derived,
 * on devices that do no neighbour resolution, and for multicast.
 */
static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	struct rt6_info *rt = (struct rt6_info *)dst;

	daddr = choose_neigh_daddr(rt, NULL, daddr);
	if (!daddr)
		return;
	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
		return;
	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
		return;
	__ipv6_confirm_neigh(dev, daddr);
}
240 
/* dst_ops for regular IPv6 routes; used as the template for each
 * netns' ip6_dst_ops.
 */
static struct dst_ops ip6_dst_ops_template = {
	.family			=	AF_INET6,
	.gc			=	ip6_dst_gc,
	.gc_thresh		=	1024,
	.check			=	ip6_dst_check,
	.default_advmss		=	ip6_default_advmss,
	.mtu			=	ip6_mtu,
	.cow_metrics		=	ipv6_cow_metrics,
	.destroy		=	ip6_dst_destroy,
	.ifdown			=	ip6_dst_ifdown,
	.negative_advice	=	ip6_negative_advice,
	.link_failure		=	ip6_link_failure,
	.update_pmtu		=	ip6_rt_update_pmtu,
	.redirect		=	rt6_do_redirect,
	.local_out		=	__ip6_local_out,
	.neigh_lookup		=	ip6_neigh_lookup,
	.confirm_neigh		=	ip6_confirm_neigh,
};
259 
ip6_blackhole_mtu(const struct dst_entry * dst)260 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
261 {
262 	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
263 
264 	return mtu ? : dst->dev->mtu;
265 }
266 
/* Blackhole routes deliberately ignore PMTU updates. */
static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu,
					 bool confirm_neigh)
{
}

/* Blackhole routes deliberately ignore redirects. */
static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}
277 
/* dst_ops for blackhole dsts: PMTU updates and redirects are no-ops
 * (see the stubs above); no gc and no ifdown handling.
 */
static struct dst_ops ip6_dst_blackhole_ops = {
	.family			=	AF_INET6,
	.destroy		=	ip6_dst_destroy,
	.check			=	ip6_dst_check,
	.mtu			=	ip6_blackhole_mtu,
	.default_advmss		=	ip6_default_advmss,
	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
	.redirect		=	ip6_rt_blackhole_redirect,
	.cow_metrics		=	dst_cow_metrics_generic,
	.neigh_lookup		=	ip6_neigh_lookup,
};

/* Metrics template shared by the static route templates below; the
 * hop limit is explicitly left at 0.
 */
static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};
293 
/* Template for the per-netns "null" route: rejects traffic with
 * -ENETUNREACH.  Returned by lookups when nothing matches.
 */
static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol  = RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};

#ifdef CONFIG_IPV6_MULTIPLE_TABLES

/* Like the null entry, but fails with -EACCES and sends an
 * administratively-prohibited error via ip6_pkt_prohibit*().
 */
static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol  = RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};

/* Blackhole template: drops packets silently (dst_discard) with
 * -EINVAL as the dst error.
 */
static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol  = RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};

#endif
342 
/* Initialize the rt6_info-specific part of a freshly allocated route. */
static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	/* zero everything after the embedded dst_entry, which dst_alloc()
	 * already set up
	 */
	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_siblings);
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}
351 
352 /* allocate dst with ip6_dst_ops */
/* Allocate a bare rt6_info from this netns' ip6_dst_ops with an initial
 * reference; returns NULL on allocation failure.
 */
static struct rt6_info *__ip6_dst_alloc(struct net *net,
					struct net_device *dev,
					int flags)
{
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
					1, DST_OBSOLETE_FORCE_CHK, flags);

	if (rt)
		rt6_info_init(rt);

	return rt;
}
365 
/* Allocate a rt6_info together with its percpu route-clone pointer
 * array.  On percpu allocation failure the dst is released and NULL is
 * returned.
 */
struct rt6_info *ip6_dst_alloc(struct net *net,
			       struct net_device *dev,
			       int flags)
{
	struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);

	if (rt) {
		rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
		if (rt->rt6i_pcpu) {
			int cpu;

			for_each_possible_cpu(cpu) {
				struct rt6_info **p;

				p = per_cpu_ptr(rt->rt6i_pcpu, cpu);
				/* no one shares rt */
				*p =  NULL;
			}
		} else {
			/* percpu array failed: undo the dst allocation */
			dst_release_immediate(&rt->dst);
			return NULL;
		}
	}

	return rt;
}
EXPORT_SYMBOL(ip6_dst_alloc);
393 
/* dst_ops destroy hook: release everything a rt6_info holds — generic
 * metrics, the percpu clone array, uncached-list membership, the
 * inet6_dev reference, and the reference on the parent dst (dst->from).
 */
static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct dst_entry *from = dst->from;
	struct inet6_dev *idev;

	dst_destroy_metrics_generic(dst);
	free_percpu(rt->rt6i_pcpu);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}

	/* clear before release so no path sees a dangling from pointer */
	dst->from = NULL;
	dst_release(from);
}
413 
/* dst_ops ifdown hook: @dev is going down, so move this route's
 * inet6_dev reference over to the namespace loopback device.
 */
static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
		if (loopback_idev) {
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
		}
	}
}
430 
__rt6_check_expired(const struct rt6_info * rt)431 static bool __rt6_check_expired(const struct rt6_info *rt)
432 {
433 	if (rt->rt6i_flags & RTF_EXPIRES)
434 		return time_after(jiffies, rt->dst.expires);
435 	else
436 		return false;
437 }
438 
/* Full expiry check: a route with RTF_EXPIRES expires by timestamp; a
 * clone (dst.from set) also counts as expired when it is obsoleted or
 * its parent has expired.
 */
static bool rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (rt->dst.from) {
		/* clones inherit expiry from the route they were copied from */
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
		       rt6_check_expired((struct rt6_info *)rt->dst.from);
	}
	return false;
}
450 
rt6_multipath_select(struct rt6_info * match,struct flowi6 * fl6,int oif,int strict)451 static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
452 					     struct flowi6 *fl6, int oif,
453 					     int strict)
454 {
455 	struct rt6_info *sibling, *next_sibling;
456 	int route_choosen;
457 
458 	/* We might have already computed the hash for ICMPv6 errors. In such
459 	 * case it will always be non-zero. Otherwise now is the time to do it.
460 	 */
461 	if (!fl6->mp_hash)
462 		fl6->mp_hash = rt6_multipath_hash(fl6, NULL);
463 
464 	route_choosen = fl6->mp_hash % (match->rt6i_nsiblings + 1);
465 	/* Don't change the route, if route_choosen == 0
466 	 * (siblings does not include ourself)
467 	 */
468 	if (route_choosen)
469 		list_for_each_entry_safe(sibling, next_sibling,
470 				&match->rt6i_siblings, rt6i_siblings) {
471 			route_choosen--;
472 			if (route_choosen == 0) {
473 				if (rt6_score_route(sibling, oif, strict) < 0)
474 					break;
475 				match = sibling;
476 				break;
477 			}
478 		}
479 	return match;
480 }
481 
482 /*
483  *	Route lookup. Any table->tb6_lock is implied.
484  */
485 
/* Scan the routes sharing a fib6 node for one matching the requested
 * output interface (@oif) and/or source address (@saddr).  A loopback
 * route standing in for @oif is remembered as a fallback.  Returns the
 * best match, the null entry when an interface match is mandatory but
 * impossible, or the list head @rt when there is nothing to filter on.
 */
static inline struct rt6_info *rt6_device_match(struct net *net,
						    struct rt6_info *rt,
						    const struct in6_addr *saddr,
						    int oif,
						    int flags)
{
	struct rt6_info *local = NULL;
	struct rt6_info *sprt;

	/* nothing to match against: keep the head route */
	if (!oif && ipv6_addr_any(saddr))
		goto out;

	for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
		struct net_device *dev = sprt->dst.dev;

		if (oif) {
			if (dev->ifindex == oif)
				return sprt;
			if (dev->flags & IFF_LOOPBACK) {
				if (!sprt->rt6i_idev ||
				    sprt->rt6i_idev->dev->ifindex != oif) {
					if (flags & RT6_LOOKUP_F_IFACE)
						continue;
					/* prefer a loopback route whose idev
					 * actually matches @oif
					 */
					if (local &&
					    local->rt6i_idev->dev->ifindex == oif)
						continue;
				}
				local = sprt;
			}
		} else {
			/* no oif: match on the source address instead */
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
				return sprt;
		}
	}

	if (oif) {
		if (local)
			return local;

		/* strict interface match required but none found */
		if (flags & RT6_LOOKUP_F_IFACE)
			return net->ipv6.ip6_null_entry;
	}
out:
	return rt;
}
532 
533 #ifdef CONFIG_IPV6_ROUTER_PREF
/* Work item carrying the target/device for a deferred router probe. */
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;	/* held by the scheduler of the work */
};

/* Workqueue handler for rt6_probe(): send a neighbour solicitation to
 * the target's solicited-node multicast address, then drop the device
 * reference taken when the work was scheduled and free the work item.
 */
static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
	dev_put(work->dev);
	kfree(work);
}
551 
/* Kick off a reachability probe of @rt's gateway unless its neighbour
 * entry is already valid or was probed too recently.  The actual NS
 * transmission is deferred to a workqueue (rt6_probe_deferred).
 */
static void rt6_probe(struct rt6_info *rt)
{
	struct __rt6_probe_work *work;
	struct neighbour *neigh;
	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
		return;
	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
	if (neigh) {
		if (neigh->nud_state & NUD_VALID)
			goto out;

		work = NULL;
		write_lock(&neigh->lock);
		/* recheck under the lock, and rate-limit by the interval
		 * since the neighbour entry was last updated
		 */
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated +
			       rt->rt6i_idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else {
		/* no neighbour entry yet: probe unconditionally */
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	if (work) {
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = rt->rt6i_gateway;
		/* reference dropped in rt6_probe_deferred() */
		dev_hold(rt->dst.dev);
		work->dev = rt->dst.dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
#else
/* Without router-preference support there is no probing. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif
603 
604 /*
605  * Default Router Selection (RFC 2461 6.3.6)
606  */
rt6_check_dev(struct rt6_info * rt,int oif)607 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
608 {
609 	struct net_device *dev = rt->dst.dev;
610 	if (!oif || dev->ifindex == oif)
611 		return 2;
612 	if ((dev->flags & IFF_LOOPBACK) &&
613 	    rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
614 		return 1;
615 	return 0;
616 }
617 
/* Classify the reachability of @rt's next hop from its neighbour entry.
 * Routes without a gateway trivially succeed.  With router-preference
 * support, an unresolved or not-yet-failed neighbour still succeeds and
 * a failed one requests a probe; without it, a missing entry requests
 * round-robin fallback.
 */
static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
{
	struct neighbour *neigh;
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;

	if (rt->rt6i_flags & RTF_NONEXTHOP ||
	    !(rt->rt6i_flags & RTF_GATEWAY))
		return RT6_NUD_SUCCEED;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}
648 
/* Score @rt for route selection: device match (rt6_check_dev) in the
 * low bits, router preference above them, with neighbour reachability
 * vetoing via a negative rt6_nud_state when RT6_LOOKUP_F_REACHABLE is
 * requested.  Higher is better; negative means unusable.
 */
static int rt6_score_route(struct rt6_info *rt, int oif,
			   int strict)
{
	int m;

	m = rt6_check_dev(rt, oif);
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	/* router preference outranks the device-match score */
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
#endif
	if (strict & RT6_LOOKUP_F_REACHABLE) {
		int n = rt6_check_neigh(rt);
		if (n < 0)
			return n;
	}
	return m;
}
667 
/* Compare @rt against the current best @match, updating *@mpri (best
 * score so far) and *@do_rr (whether round-robin fallback was
 * requested).  Link-down and expired routes are skipped.  Returns the
 * new best match.
 */
static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
				   int *mpri, struct rt6_info *match,
				   bool *do_rr)
{
	int m;
	bool match_do_rr = false;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *dev = rt->dst.dev;

	/* skip routes whose carrier is down, if so configured */
	if (dev && !netif_carrier_ok(dev) &&
	    idev->cnf.ignore_routes_with_linkdown &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	if (rt6_check_expired(rt))
		goto out;

	m = rt6_score_route(rt, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(rt);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		match = rt;
	}
out:
	return match;
}
705 
/* Round-robin scan of a fib6 node's route list for the best match at
 * @metric: first from @rr_head to the end of the same-metric run, then
 * from the leaf back up to @rr_head.  If nothing matched and the list
 * continues at a different metric (@cont), scan that tail too.
 */
static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
				     struct rt6_info *rr_head,
				     u32 metric, int oif, int strict,
				     bool *do_rr)
{
	struct rt6_info *rt, *match, *cont;
	int mpri = -1;

	match = NULL;
	cont = NULL;
	for (rt = rr_head; rt; rt = rt->dst.rt6_next) {
		if (rt->rt6i_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	/* wrap around: the part of the run before rr_head */
	for (rt = fn->leaf; rt && rt != rr_head; rt = rt->dst.rt6_next) {
		if (rt->rt6i_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	if (match || !cont)
		return match;

	/* nothing at @metric: consider the higher-metric continuation */
	for (rt = cont; rt; rt = rt->dst.rt6_next)
		match = find_match(rt, oif, strict, &mpri, match, do_rr);

	return match;
}
742 
/* Default Router Selection (RFC 2461 6.3.6): choose a route from fib6
 * node @fn starting at its round-robin pointer, and advance the pointer
 * when a round-robin fallback was requested.  Falls back to the null
 * entry when nothing matched.
 */
static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
{
	struct rt6_info *match, *rt0;
	struct net *net;
	bool do_rr = false;

	rt0 = fn->rr_ptr;
	if (!rt0)
		fn->rr_ptr = rt0 = fn->leaf;

	match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict,
			     &do_rr);

	if (do_rr) {
		struct rt6_info *next = rt0->dst.rt6_next;

		/* no entries matched; do round-robin */
		if (!next || next->rt6i_metric != rt0->rt6i_metric)
			next = fn->leaf;

		if (next != rt0)
			fn->rr_ptr = next;
	}

	net = dev_net(rt0->dst.dev);
	return match ? match : net->ipv6.ip6_null_entry;
}
770 
rt6_is_gw_or_nonexthop(const struct rt6_info * rt)771 static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
772 {
773 	return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
774 }
775 
776 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Process a Route Information option received in a Router Advertisement
 * (RFC 4191): validate it, then add, refresh or delete the matching
 * route (a default route when prefix_len is 0).  Returns 0 on success
 * or -EINVAL on a malformed option.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct rt6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		/* prefixes longer than 64 bits need at least 2 units */
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	/* prefix_len 0 means this advertises a default route */
	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	/* zero lifetime withdraws an existing route */
	if (rt && !lifetime) {
		ip6_del_rt(rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			rt6_clean_expires(rt);
		else
			rt6_set_expires(rt, jiffies + HZ * lifetime);

		ip6_rt_put(rt);
	}
	return 0;
}
850 #endif
851 
/* Walk up the fib6 tree from @fn, descending into source-routing
 * subtrees along the way, until a node with route info is found.
 * Returns NULL once the top-level root is reached.
 */
static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn;
	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = fn->parent;
		/* re-enter the parent's subtree unless we came from it */
		if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn)
			fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}
868 
/* Policy-rule lookup callback for a single table: find the
 * longest-prefix fib6 node, filter by oif/saddr, pick among ECMP
 * siblings, and backtrack toward the root on a miss.  Returns a held
 * route — never NULL, the null entry if nothing matched.
 */
static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6, int flags)
{
	struct fib6_node *fn;
	struct rt6_info *rt;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		flags &= ~RT6_LOOKUP_F_IFACE;

	read_lock_bh(&table->tb6_lock);
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	rt = fn->leaf;
	rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
	if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
		rt = rt6_multipath_select(rt, fl6, fl6->flowi6_oif, flags);
	if (rt == net->ipv6.ip6_null_entry) {
		/* no usable route at this node: climb and retry */
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}
	/* take a reference before dropping the table lock */
	dst_use(&rt->dst, jiffies);
	read_unlock_bh(&table->tb6_lock);

	trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);

	return rt;

}
899 
/* Public wrapper: run @fl6 through the policy rules with
 * ip6_pol_route_lookup() as the per-table lookup function.
 */
struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				    int flags)
{
	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);
906 
/* Convenience lookup by destination/source address pair.  Returns a
 * held route on success, or NULL (after releasing the dst) when the
 * lookup resolved to an error route.
 */
struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
			    const struct in6_addr *saddr, int oif, int strict)
{
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.daddr = *daddr,
	};
	struct dst_entry *dst;
	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;

	if (saddr) {
		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	}

	dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
	if (dst->error == 0)
		return (struct rt6_info *) dst;

	dst_release(dst);

	return NULL;
}
EXPORT_SYMBOL(rt6_lookup);
931 
932 /* ip6_ins_rt is called with FREE table->tb6_lock.
933  * It takes new route entry, the addition fails by any reason the
934  * route is released.
935  * Caller must hold dst before calling it.
936  */
937 
/* Insert @rt into its fib6 table under the table write lock.  Returns
 * fib6_add()'s error code; see the comment above about reference
 * ownership on failure.
 */
static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
			struct mx6_config *mxc,
			struct netlink_ext_ack *extack)
{
	int err;
	struct fib6_table *table;

	table = rt->rt6i_table;
	write_lock_bh(&table->tb6_lock);
	err = fib6_add(&table->tb6_root, rt, info, mxc, extack);
	write_unlock_bh(&table->tb6_lock);

	return err;
}
952 
/* Insert @rt into its table with default netlink info and no metrics
 * override, taking the tree's reference on the dst first.
 */
int ip6_ins_rt(struct rt6_info *rt)
{
	struct nl_info info = {	.nl_net = dev_net(rt->dst.dev), };
	struct mx6_config mxc = { .mx = NULL, };

	/* Hold dst to account for the reference from the fib6 tree */
	dst_hold(&rt->dst);
	return __ip6_ins_rt(rt, &info, &mxc, NULL);
}
962 
963 /* called with rcu_lock held */
/* Pick the device a clone of @rt should be bound to.  For local and
 * anycast routes this may be the l3mdev master or the loopback device
 * instead of rt's own device.  Caller holds rcu_read_lock.
 */
static struct net_device *ip6_rt_get_dev_rcu(struct rt6_info *rt)
{
	struct net_device *dev = rt->dst.dev;

	if (rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) {
		/* for copies of local routes, dst->dev needs to be the
		 * device if it is a master device, the master device if
		 * device is enslaved, and the loopback as the default
		 */
		if (netif_is_l3_slave(dev) &&
		    !rt6_need_strict(&rt->rt6i_dst.addr))
			dev = l3mdev_master_dev_rcu(dev);
		else if (!netif_is_l3_master(dev))
			dev = dev_net(dev)->loopback_dev;
		/* last case is netif_is_l3_master(dev) is true in which
		 * case we want dev returned to be dev
		 */
	}

	return dev;
}
985 
/* Create a host-route (/128) cache clone of @ort for @daddr (and
 * @saddr when subtree routing applies), marked RTF_CACHE.  Returns NULL
 * on allocation failure.
 */
static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	/* clone from the original route, not from another clone */
	if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
		ort = (struct rt6_info *)ort->dst.from;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(ort);
	rt = __ip6_dst_alloc(dev_net(dev), dev, 0);
	rcu_read_unlock();
	if (!rt)
		return NULL;

	ip6_rt_copy_init(rt, ort);
	rt->rt6i_flags |= RTF_CACHE;
	rt->rt6i_metric = 0;
	rt->dst.flags |= DST_HOST;
	/* narrow the clone to a host route for this destination */
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(ort)) {
		if (ort->rt6i_dst.plen != 128 &&
		    ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}
1028 
/* Allocate a percpu clone of @rt, flagged RTF_PCPU; returns NULL on
 * allocation failure.
 */
static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
{
	struct net_device *dev;
	struct rt6_info *pcpu_rt;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(rt);
	pcpu_rt = __ip6_dst_alloc(dev_net(dev), dev, rt->dst.flags);
	rcu_read_unlock();
	if (!pcpu_rt)
		return NULL;
	ip6_rt_copy_init(pcpu_rt, rt);
	pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
	pcpu_rt->rt6i_flags |= RTF_PCPU;
	return pcpu_rt;
}
1045 
1046 /* It should be called with read_lock_bh(&tb6_lock) acquired */
rt6_get_pcpu_route(struct rt6_info * rt)1047 static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
1048 {
1049 	struct rt6_info *pcpu_rt, **p;
1050 
1051 	p = this_cpu_ptr(rt->rt6i_pcpu);
1052 	pcpu_rt = *p;
1053 
1054 	if (pcpu_rt) {
1055 		dst_hold(&pcpu_rt->dst);
1056 		rt6_dst_from_metrics_check(pcpu_rt);
1057 	}
1058 	return pcpu_rt;
1059 }
1060 
/* Allocate a per-cpu copy of @rt and publish it in rt->rt6i_pcpu.
 * Returns the published copy with a reference held; falls back to the
 * null entry on allocation failure, or to @rt itself if the route was
 * removed from the tree before the lock was taken.
 */
static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
{
	struct fib6_table *table = rt->rt6i_table;
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(rt);
	if (!pcpu_rt) {
		struct net *net = dev_net(rt->dst.dev);

		/* still hand the caller a valid, refcounted dst */
		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	read_lock_bh(&table->tb6_lock);
	if (rt->rt6i_pcpu) {
		p = this_cpu_ptr(rt->rt6i_pcpu);
		/* cmpxchg so exactly one installer wins for this slot */
		prev = cmpxchg(p, NULL, pcpu_rt);
		if (prev) {
			/* If someone did it before us, return prev instead */
			dst_release_immediate(&pcpu_rt->dst);
			pcpu_rt = prev;
		}
	} else {
		/* rt has been removed from the fib6 tree
		 * before we have a chance to acquire the read_lock.
		 * In this case, don't brother to create a pcpu rt
		 * since rt is going away anyway.  The next
		 * dst_check() will trigger a re-lookup.
		 */
		dst_release_immediate(&pcpu_rt->dst);
		pcpu_rt = rt;
	}
	dst_hold(&pcpu_rt->dst);
	rt6_dst_from_metrics_check(pcpu_rt);
	read_unlock_bh(&table->tb6_lock);
	return pcpu_rt;
}
1098 
/* Core policy-routing lookup in a single fib6 table.
 *
 * Returns a dst with a reference held: the null entry or a RTF_CACHE
 * clone directly, an uncached clone for the FLOWI_FLAG_KNOWN_NH case,
 * or a per-cpu copy of the matched route otherwise.
 */
struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6, int flags)
{
	struct fib6_node *fn, *saved_fn;
	struct rt6_info *rt;
	int strict = 0;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	/* hosts (forwarding disabled) prefer (probably) reachable routers */
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	read_lock_bh(&table->tb6_lock);

	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	saved_fn = fn;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

redo_rt6_select:
	rt = rt6_select(fn, oif, strict);
	if (rt->rt6i_nsiblings)
		rt = rt6_multipath_select(rt, fl6, oif, strict);
	if (rt == net->ipv6.ip6_null_entry) {
		/* no match at this node: walk back up the tree, then as a
		 * last resort retry without the reachability restriction
		 */
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}


	if (rt == net->ipv6.ip6_null_entry || (rt->rt6i_flags & RTF_CACHE)) {
		/* cached clones (and the null entry) can be returned as-is;
		 * dst_use() takes the reference and updates lastuse
		 */
		dst_use(&rt->dst, jiffies);
		read_unlock_bh(&table->tb6_lock);

		rt6_dst_from_metrics_check(rt);

		trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !(rt->rt6i_flags & RTF_GATEWAY))) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */

		struct rt6_info *uncached_rt;

		dst_use(&rt->dst, jiffies);
		read_unlock_bh(&table->tb6_lock);

		uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
		dst_release(&rt->dst);

		if (uncached_rt) {
			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
			 * No need for another dst_hold()
			 */
			rt6_uncached_list_add(uncached_rt);
		} else {
			uncached_rt = net->ipv6.ip6_null_entry;
			dst_hold(&uncached_rt->dst);
		}

		trace_fib6_table_lookup(net, uncached_rt, table->tb6_id, fl6);
		return uncached_rt;

	} else {
		/* Get a percpu copy */

		struct rt6_info *pcpu_rt;

		rt->dst.lastuse = jiffies;
		rt->dst.__use++;
		pcpu_rt = rt6_get_pcpu_route(rt);

		if (pcpu_rt) {
			read_unlock_bh(&table->tb6_lock);
		} else {
			/* We have to do the read_unlock first
			 * because rt6_make_pcpu_route() may trigger
			 * ip6_dst_gc() which will take the write_lock.
			 */
			dst_hold(&rt->dst);
			read_unlock_bh(&table->tb6_lock);
			pcpu_rt = rt6_make_pcpu_route(rt);
			dst_release(&rt->dst);
		}

		trace_fib6_table_lookup(net, pcpu_rt, table->tb6_id, fl6);
		return pcpu_rt;

	}
}
EXPORT_SYMBOL_GPL(ip6_pol_route);
1201 
/* fib6_rule_lookup() callback for input routing: use the incoming
 * interface as the oif constraint.
 */
static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
					    struct flowi6 *fl6, int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
}
1207 
ip6_route_input_lookup(struct net * net,struct net_device * dev,struct flowi6 * fl6,int flags)1208 struct dst_entry *ip6_route_input_lookup(struct net *net,
1209 					 struct net_device *dev,
1210 					 struct flowi6 *fl6, int flags)
1211 {
1212 	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1213 		flags |= RT6_LOOKUP_F_IFACE;
1214 
1215 	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
1216 }
1217 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1218 
ip6_multipath_l3_keys(const struct sk_buff * skb,struct flow_keys * keys)1219 static void ip6_multipath_l3_keys(const struct sk_buff *skb,
1220 				  struct flow_keys *keys)
1221 {
1222 	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
1223 	const struct ipv6hdr *key_iph = outer_iph;
1224 	const struct ipv6hdr *inner_iph;
1225 	const struct icmp6hdr *icmph;
1226 	struct ipv6hdr _inner_iph;
1227 	struct icmp6hdr _icmph;
1228 
1229 	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
1230 		goto out;
1231 
1232 	icmph = skb_header_pointer(skb, skb_transport_offset(skb),
1233 				   sizeof(_icmph), &_icmph);
1234 	if (!icmph)
1235 		goto out;
1236 
1237 	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
1238 	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
1239 	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
1240 	    icmph->icmp6_type != ICMPV6_PARAMPROB)
1241 		goto out;
1242 
1243 	inner_iph = skb_header_pointer(skb,
1244 				       skb_transport_offset(skb) + sizeof(*icmph),
1245 				       sizeof(_inner_iph), &_inner_iph);
1246 	if (!inner_iph)
1247 		goto out;
1248 
1249 	key_iph = inner_iph;
1250 out:
1251 	memset(keys, 0, sizeof(*keys));
1252 	keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1253 	keys->addrs.v6addrs.src = key_iph->saddr;
1254 	keys->addrs.v6addrs.dst = key_iph->daddr;
1255 	keys->tags.flow_label = ip6_flowlabel(key_iph);
1256 	keys->basic.ip_proto = key_iph->nexthdr;
1257 }
1258 
1259 /* if skb is set it will be used and fl6 can be NULL */
rt6_multipath_hash(const struct flowi6 * fl6,const struct sk_buff * skb)1260 u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb)
1261 {
1262 	struct flow_keys hash_keys;
1263 
1264 	if (skb) {
1265 		ip6_multipath_l3_keys(skb, &hash_keys);
1266 		return flow_hash_from_keys(&hash_keys);
1267 	}
1268 
1269 	return get_hash_from_flowi6(fl6);
1270 }
1271 
/* Route an incoming packet: build a flow key from its IPv6 header and
 * attach the looked-up dst to the skb.
 */
void ip6_route_input(struct sk_buff *skb)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	struct net *net = dev_net(skb->dev);
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip_tunnel_info *tun_info;
	struct flowi6 fl6 = {
		.flowi6_iif = skb->dev->ifindex,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_mark = skb->mark,
		.flowi6_proto = iph->nexthdr,
	};

	/* include the tunnel id in the key for metadata (RX) tunnels */
	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
	/* ICMPv6 hashes on the inner flow for multipath selection */
	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
		fl6.mp_hash = rt6_multipath_hash(&fl6, skb);
	skb_dst_drop(skb);
	skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
}
1295 
/* fib6_rule_lookup() callback for output routing: use the outgoing
 * interface as the oif constraint.
 */
static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
					     struct flowi6 *fl6, int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
}
1301 
ip6_route_output_flags(struct net * net,const struct sock * sk,struct flowi6 * fl6,int flags)1302 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
1303 					 struct flowi6 *fl6, int flags)
1304 {
1305 	bool any_src;
1306 
1307 	if (rt6_need_strict(&fl6->daddr)) {
1308 		struct dst_entry *dst;
1309 
1310 		dst = l3mdev_link_scope_lookup(net, fl6);
1311 		if (dst)
1312 			return dst;
1313 	}
1314 
1315 	fl6->flowi6_iif = LOOPBACK_IFINDEX;
1316 
1317 	any_src = ipv6_addr_any(&fl6->saddr);
1318 	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
1319 	    (fl6->flowi6_oif && any_src))
1320 		flags |= RT6_LOOKUP_F_IFACE;
1321 
1322 	if (!any_src)
1323 		flags |= RT6_LOOKUP_F_HAS_SADDR;
1324 	else if (sk)
1325 		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
1326 
1327 	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
1328 }
1329 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
1330 
ip6_blackhole_route(struct net * net,struct dst_entry * dst_orig)1331 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
1332 {
1333 	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
1334 	struct net_device *loopback_dev = net->loopback_dev;
1335 	struct dst_entry *new = NULL;
1336 
1337 	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
1338 		       DST_OBSOLETE_DEAD, 0);
1339 	if (rt) {
1340 		rt6_info_init(rt);
1341 
1342 		new = &rt->dst;
1343 		new->__use = 1;
1344 		new->input = dst_discard;
1345 		new->output = dst_discard_out;
1346 
1347 		dst_copy_metrics(new, &ort->dst);
1348 
1349 		rt->rt6i_idev = in6_dev_get(loopback_dev);
1350 		rt->rt6i_gateway = ort->rt6i_gateway;
1351 		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
1352 		rt->rt6i_metric = 0;
1353 
1354 		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1355 #ifdef CONFIG_IPV6_SUBTREES
1356 		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1357 #endif
1358 	}
1359 
1360 	dst_release(dst_orig);
1361 	return new ? new : ERR_PTR(-ENOMEM);
1362 }
1363 
1364 /*
1365  *	Destination cache support functions
1366  */
1367 
rt6_dst_from_metrics_check(struct rt6_info * rt)1368 static void rt6_dst_from_metrics_check(struct rt6_info *rt)
1369 {
1370 	if (rt->dst.from &&
1371 	    dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from))
1372 		dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true);
1373 }
1374 
static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
{
	u32 rt_cookie = 0;

	/* Valid only while the fib node's sernum still matches the
	 * caller's cookie and the route has not expired.
	 */
	if (rt6_get_cookie_safe(rt, &rt_cookie) && rt_cookie == cookie &&
	    !rt6_check_expired(rt))
		return &rt->dst;

	return NULL;
}
1387 
static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
{
	/* A clone is valid only while it and its parent route are. */
	if (__rt6_check_expired(rt))
		return NULL;
	if (rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK)
		return NULL;
	if (!rt6_check((struct rt6_info *)(rt->dst.from), cookie))
		return NULL;
	return &rt->dst;
}
1397 
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct rt6_info *rt = (struct rt6_info *)dst;

	/* All IPV6 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
	 * into this function always.
	 */
	rt6_dst_from_metrics_check(rt);

	/* Clones (per-cpu or uncached) are validated via their parent. */
	if ((rt->rt6i_flags & RTF_PCPU) ||
	    (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->dst.from))
		return rt6_dst_from_check(rt, cookie);

	return rt6_check(rt, cookie);
}
1417 
ip6_negative_advice(struct dst_entry * dst)1418 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
1419 {
1420 	struct rt6_info *rt = (struct rt6_info *) dst;
1421 
1422 	if (rt) {
1423 		if (rt->rt6i_flags & RTF_CACHE) {
1424 			if (rt6_check_expired(rt)) {
1425 				ip6_del_rt(rt);
1426 				dst = NULL;
1427 			}
1428 		} else {
1429 			dst_release(dst);
1430 			dst = NULL;
1431 		}
1432 	}
1433 	return dst;
1434 }
1435 
/* dst_ops->link_failure callback: report the failure to the sender and
 * invalidate the route the packet was using.
 */
static void ip6_link_failure(struct sk_buff *skb)
{
	struct rt6_info *rt;

	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);

	rt = (struct rt6_info *) skb_dst(skb);
	if (rt) {
		if (rt->rt6i_flags & RTF_CACHE) {
			/* cached clone: delete it, if we can still get a ref */
			if (dst_hold_safe(&rt->dst))
				ip6_del_rt(rt);
		} else {
			struct fib6_node *fn;

			/* invalidate the node's sernum so cached dsts via a
			 * default route fail their next dst_check()
			 */
			rcu_read_lock();
			fn = rcu_dereference(rt->rt6i_node);
			if (fn && (rt->rt6i_flags & RTF_DEFAULT))
				fn->fn_sernum = -1;
			rcu_read_unlock();
		}
	}
}
1458 
static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
{
	struct net *net = dev_net(rt->dst.dev);

	/* Record the learned PMTU and arrange for it to age out. */
	rt->rt6i_pmtu = mtu;
	rt->rt6i_flags |= RTF_MODIFIED;
	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
}
1467 
rt6_cache_allowed_for_pmtu(const struct rt6_info * rt)1468 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
1469 {
1470 	return !(rt->rt6i_flags & RTF_CACHE) &&
1471 		(rt->rt6i_flags & RTF_PCPU ||
1472 		 rcu_access_pointer(rt->rt6i_node));
1473 }
1474 
/* Apply a path MTU update to @dst.
 *
 * The destination/source pair is taken from @iph (a received Packet
 * Too Big) if set, else from @sk's flow.  Routes still owned by the
 * fib tree (or per-cpu copies) are not modified in place: a RTF_CACHE
 * clone carrying the new MTU is created and inserted instead.
 */
static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
				 const struct ipv6hdr *iph, u32 mtu,
				 bool confirm_neigh)
{
	const struct in6_addr *daddr, *saddr;
	struct rt6_info *rt6 = (struct rt6_info *)dst;

	/* administratively locked MTU metric: never update */
	if (dst_metric_locked(dst, RTAX_MTU))
		return;

	if (iph) {
		daddr = &iph->daddr;
		saddr = &iph->saddr;
	} else if (sk) {
		daddr = &sk->sk_v6_daddr;
		saddr = &inet6_sk(sk)->saddr;
	} else {
		daddr = NULL;
		saddr = NULL;
	}

	if (confirm_neigh)
		dst_confirm_neigh(dst, daddr);

	/* clamp to the IPv6 minimum; PMTU may only shrink */
	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
	if (mtu >= dst_mtu(dst))
		return;

	if (!rt6_cache_allowed_for_pmtu(rt6)) {
		rt6_do_update_pmtu(rt6, mtu);
	} else if (daddr) {
		struct rt6_info *nrt6;

		nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
		if (nrt6) {
			rt6_do_update_pmtu(nrt6, mtu);

			/* ip6_ins_rt(nrt6) will bump the
			 * rt6->rt6i_node->fn_sernum
			 * which will fail the next rt6_check() and
			 * invalidate the sk->sk_dst_cache.
			 */
			ip6_ins_rt(nrt6);
			/* Release the reference taken in
			 * ip6_rt_cache_alloc()
			 */
			dst_release(&nrt6->dst);
		}
	}
}
1525 
/* dst_ops->update_pmtu callback: pass the skb's IPv6 header, if any,
 * on to __ip6_rt_update_pmtu().
 */
static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			       struct sk_buff *skb, u32 mtu,
			       bool confirm_neigh)
{
	__ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu,
			     confirm_neigh);
}
1533 
void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
		     int oif, u32 mark, kuid_t uid)
{
	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
	struct dst_entry *dst;
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark),
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_uid = uid,
	};

	/* Look up the route for the offending packet and update its PMTU. */
	dst = ip6_route_output(net, NULL, &fl6);
	if (!dst->error)
		__ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu), true);
	dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_update_pmtu);
1555 
/* Apply a PMTU update for @sk's destination and refresh the socket's
 * cached dst if the update invalidated it.
 */
void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
{
	int oif = sk->sk_bound_dev_if;
	struct dst_entry *dst;

	/* unbound socket: scope the update to the ingress VRF master */
	if (!oif && skb->dev)
		oif = l3mdev_master_ifindex(skb->dev);

	ip6_update_pmtu(skb, sock_net(sk), mtu, oif, sk->sk_mark, sk->sk_uid);

	/* nothing more to do if the cached dst is absent or still valid */
	dst = __sk_dst_get(sk);
	if (!dst || !dst->obsolete ||
	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
		return;

	bh_lock_sock(sk);
	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
		ip6_datagram_dst_update(sk, false);
	bh_unlock_sock(sk);
}
EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
1577 
/* Handle redirects */
struct ip6rd_flowi {
	struct flowi6 fl6;		/* must stay first: __ip6_route_redirect()
					 * casts the flowi6 back to ip6rd_flowi
					 */
	struct in6_addr gateway;	/* router the redirect came from */
};
1583 
/* Find the route whose next hop matches the router that sent a
 * redirect.  Returns the matching route (or the null entry) with a
 * reference held.
 */
static struct rt6_info *__ip6_route_redirect(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     int flags)
{
	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
	struct rt6_info *rt;
	struct fib6_node *fn;

	/* Get the "current" route for this destination and
	 * check if the redirect has come from appropriate router.
	 *
	 * RFC 4861 specifies that redirects should only be
	 * accepted if they come from the nexthop to the target.
	 * Due to the way the routes are chosen, this notion
	 * is a bit fuzzy and one might need to check all possible
	 * routes.
	 */

	read_lock_bh(&table->tb6_lock);
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
		if (rt6_check_expired(rt))
			continue;
		if (rt->dst.error)
			break;
		if (!(rt->rt6i_flags & RTF_GATEWAY))
			continue;
		if (fl6->flowi6_oif != rt->dst.dev->ifindex)
			continue;
		/* only accept the redirect from the current next hop */
		if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
			continue;
		break;
	}

	if (!rt)
		rt = net->ipv6.ip6_null_entry;
	else if (rt->dst.error) {
		rt = net->ipv6.ip6_null_entry;
		goto out;
	}

	if (rt == net->ipv6.ip6_null_entry) {
		/* no match under this node: walk back up the tree */
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

out:
	dst_hold(&rt->dst);

	read_unlock_bh(&table->tb6_lock);

	trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
	return rt;
};
1641 
ip6_route_redirect(struct net * net,const struct flowi6 * fl6,const struct in6_addr * gateway)1642 static struct dst_entry *ip6_route_redirect(struct net *net,
1643 					const struct flowi6 *fl6,
1644 					const struct in6_addr *gateway)
1645 {
1646 	int flags = RT6_LOOKUP_F_HAS_SADDR;
1647 	struct ip6rd_flowi rdfl;
1648 
1649 	rdfl.fl6 = *fl6;
1650 	rdfl.gateway = *gateway;
1651 
1652 	return fib6_rule_lookup(net, &rdfl.fl6,
1653 				flags, __ip6_route_redirect);
1654 }
1655 
void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
		  kuid_t uid)
{
	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
	struct dst_entry *dst;
	struct flowi6 fl6 = {
		.flowi6_iif = LOOPBACK_IFINDEX,
		.flowi6_oif = oif,
		.flowi6_mark = mark,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_uid = uid,
	};

	/* Validate the redirect against the sending router and apply it. */
	dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
	rt6_do_redirect(dst, NULL, skb);
	dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_redirect);
1677 
void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
			    u32 mark)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
	struct dst_entry *dst;
	struct flowi6 fl6 = {
		.flowi6_iif = LOOPBACK_IFINDEX,
		.flowi6_oif = oif,
		.flowi6_mark = mark,
		.daddr = msg->dest,
		.saddr = iph->daddr,
		.flowi6_uid = sock_net_uid(net, NULL),
	};

	dst = ip6_route_redirect(net, &fl6, &iph->saddr);
	rt6_do_redirect(dst, NULL, skb);
	dst_release(dst);
}
1698 
/* Process a received redirect for @sk's flow, scoped to the socket's
 * bound device, mark and uid.
 */
void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
	ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
		     sk->sk_uid);
}
EXPORT_SYMBOL_GPL(ip6_sk_redirect);
1705 
ip6_default_advmss(const struct dst_entry * dst)1706 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1707 {
1708 	struct net_device *dev = dst->dev;
1709 	unsigned int mtu = dst_mtu(dst);
1710 	struct net *net = dev_net(dev);
1711 
1712 	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1713 
1714 	if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1715 		mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1716 
1717 	/*
1718 	 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1719 	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1720 	 * IPV6_MAXPLEN is also valid and means: "any MSS,
1721 	 * rely only on pmtu discovery"
1722 	 */
1723 	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1724 		mtu = IPV6_MAXPLEN;
1725 	return mtu;
1726 }
1727 
ip6_mtu(const struct dst_entry * dst)1728 static unsigned int ip6_mtu(const struct dst_entry *dst)
1729 {
1730 	const struct rt6_info *rt = (const struct rt6_info *)dst;
1731 	unsigned int mtu = rt->rt6i_pmtu;
1732 	struct inet6_dev *idev;
1733 
1734 	if (mtu)
1735 		goto out;
1736 
1737 	mtu = dst_metric_raw(dst, RTAX_MTU);
1738 	if (mtu)
1739 		goto out;
1740 
1741 	mtu = IPV6_MIN_MTU;
1742 
1743 	rcu_read_lock();
1744 	idev = __in6_dev_get(dst->dev);
1745 	if (idev)
1746 		mtu = idev->cnf.mtu6;
1747 	rcu_read_unlock();
1748 
1749 out:
1750 	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
1751 
1752 	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1753 }
1754 
/* Allocate a dst for transmitting an ICMPv6 message described by @fl6.
 * The route is never inserted in the fib tree; it goes on the uncached
 * list so device teardown can release it.  Returns an ERR_PTR on
 * failure.
 */
struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
				  struct flowi6 *fl6)
{
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct inet6_dev *idev = in6_dev_get(dev);
	struct net *net = dev_net(dev);

	if (unlikely(!idev))
		return ERR_PTR(-ENODEV);

	rt = ip6_dst_alloc(net, dev, 0);
	if (unlikely(!rt)) {
		/* drop the idev reference taken above */
		in6_dev_put(idev);
		dst = ERR_PTR(-ENOMEM);
		goto out;
	}

	rt->dst.flags |= DST_HOST;
	rt->dst.input = ip6_input;
	rt->dst.output  = ip6_output;
	rt->rt6i_gateway  = fl6->daddr;
	rt->rt6i_dst.addr = fl6->daddr;
	rt->rt6i_dst.plen = 128;
	rt->rt6i_idev     = idev;	/* hands over the idev reference */
	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);

	/* Add this dst into uncached_list so that rt6_ifdown() can
	 * do proper release of the net_device
	 */
	rt6_uncached_list_add(rt);

	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);

out:
	return dst;
}
1792 
/* dst_ops->gc callback.
 *
 * Runs fib6 garbage collection once the minimum interval has elapsed
 * or the dst count exceeds the configured maximum.  The "expire"
 * horizon grows on each pass and is decayed by the elasticity factor,
 * so collection becomes more aggressive under sustained pressure.
 * Returns nonzero while the table is still over its size limit.
 */
static int ip6_dst_gc(struct dst_ops *ops)
{
	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
	int entries;

	entries = dst_entries_get_fast(ops);
	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
	    entries <= rt_max_size)
		goto out;

	net->ipv6.ip6_rt_gc_expire++;
	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
	entries = dst_entries_get_slow(ops);
	if (entries < ops->gc_thresh)
		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
out:
	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
	return entries > rt_max_size;
}
1817 
ip6_convert_metrics(struct mx6_config * mxc,const struct fib6_config * cfg)1818 static int ip6_convert_metrics(struct mx6_config *mxc,
1819 			       const struct fib6_config *cfg)
1820 {
1821 	bool ecn_ca = false;
1822 	struct nlattr *nla;
1823 	int remaining;
1824 	u32 *mp;
1825 
1826 	if (!cfg->fc_mx)
1827 		return 0;
1828 
1829 	mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1830 	if (unlikely(!mp))
1831 		return -ENOMEM;
1832 
1833 	nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1834 		int type = nla_type(nla);
1835 		u32 val;
1836 
1837 		if (!type)
1838 			continue;
1839 		if (unlikely(type > RTAX_MAX))
1840 			goto err;
1841 
1842 		if (type == RTAX_CC_ALGO) {
1843 			char tmp[TCP_CA_NAME_MAX];
1844 
1845 			nla_strlcpy(tmp, nla, sizeof(tmp));
1846 			val = tcp_ca_get_key_by_name(tmp, &ecn_ca);
1847 			if (val == TCP_CA_UNSPEC)
1848 				goto err;
1849 		} else {
1850 			val = nla_get_u32(nla);
1851 		}
1852 		if (type == RTAX_HOPLIMIT && val > 255)
1853 			val = 255;
1854 		if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
1855 			goto err;
1856 
1857 		mp[type - 1] = val;
1858 		__set_bit(type - 1, mxc->mx_valid);
1859 	}
1860 
1861 	if (ecn_ca) {
1862 		__set_bit(RTAX_FEATURES - 1, mxc->mx_valid);
1863 		mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
1864 	}
1865 
1866 	mxc->mx = mp;
1867 	return 0;
1868  err:
1869 	kfree(mp);
1870 	return -EINVAL;
1871 }
1872 
ip6_nh_lookup_table(struct net * net,struct fib6_config * cfg,const struct in6_addr * gw_addr)1873 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
1874 					    struct fib6_config *cfg,
1875 					    const struct in6_addr *gw_addr)
1876 {
1877 	struct flowi6 fl6 = {
1878 		.flowi6_oif = cfg->fc_ifindex,
1879 		.daddr = *gw_addr,
1880 		.saddr = cfg->fc_prefsrc,
1881 	};
1882 	struct fib6_table *table;
1883 	struct rt6_info *rt;
1884 	int flags = RT6_LOOKUP_F_IFACE | RT6_LOOKUP_F_IGNORE_LINKSTATE;
1885 
1886 	table = fib6_get_table(net, cfg->fc_table);
1887 	if (!table)
1888 		return NULL;
1889 
1890 	if (!ipv6_addr_any(&cfg->fc_prefsrc))
1891 		flags |= RT6_LOOKUP_F_HAS_SADDR;
1892 
1893 	rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, flags);
1894 
1895 	/* if table lookup failed, fall back to full lookup */
1896 	if (rt == net->ipv6.ip6_null_entry) {
1897 		ip6_rt_put(rt);
1898 		rt = NULL;
1899 	}
1900 
1901 	return rt;
1902 }
1903 
ip6_route_info_create(struct fib6_config * cfg,struct netlink_ext_ack * extack)1904 static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
1905 					      struct netlink_ext_ack *extack)
1906 {
1907 	struct net *net = cfg->fc_nlinfo.nl_net;
1908 	struct rt6_info *rt = NULL;
1909 	struct net_device *dev = NULL;
1910 	struct inet6_dev *idev = NULL;
1911 	struct fib6_table *table;
1912 	int addr_type;
1913 	int err = -EINVAL;
1914 
1915 	/* RTF_PCPU is an internal flag; can not be set by userspace */
1916 	if (cfg->fc_flags & RTF_PCPU) {
1917 		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
1918 		goto out;
1919 	}
1920 
1921 	if (cfg->fc_dst_len > 128) {
1922 		NL_SET_ERR_MSG(extack, "Invalid prefix length");
1923 		goto out;
1924 	}
1925 	if (cfg->fc_src_len > 128) {
1926 		NL_SET_ERR_MSG(extack, "Invalid source address length");
1927 		goto out;
1928 	}
1929 #ifndef CONFIG_IPV6_SUBTREES
1930 	if (cfg->fc_src_len) {
1931 		NL_SET_ERR_MSG(extack,
1932 			       "Specifying source address requires IPV6_SUBTREES to be enabled");
1933 		goto out;
1934 	}
1935 #endif
1936 	if (cfg->fc_ifindex) {
1937 		err = -ENODEV;
1938 		dev = dev_get_by_index(net, cfg->fc_ifindex);
1939 		if (!dev)
1940 			goto out;
1941 		idev = in6_dev_get(dev);
1942 		if (!idev)
1943 			goto out;
1944 	}
1945 
1946 	if (cfg->fc_metric == 0)
1947 		cfg->fc_metric = IP6_RT_PRIO_USER;
1948 
1949 	err = -ENOBUFS;
1950 	if (cfg->fc_nlinfo.nlh &&
1951 	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
1952 		table = fib6_get_table(net, cfg->fc_table);
1953 		if (!table) {
1954 			pr_warn("NLM_F_CREATE should be specified when creating new route\n");
1955 			table = fib6_new_table(net, cfg->fc_table);
1956 		}
1957 	} else {
1958 		table = fib6_new_table(net, cfg->fc_table);
1959 	}
1960 
1961 	if (!table)
1962 		goto out;
1963 
1964 	rt = ip6_dst_alloc(net, NULL,
1965 			   (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);
1966 
1967 	if (!rt) {
1968 		err = -ENOMEM;
1969 		goto out;
1970 	}
1971 
1972 	if (cfg->fc_flags & RTF_EXPIRES)
1973 		rt6_set_expires(rt, jiffies +
1974 				clock_t_to_jiffies(cfg->fc_expires));
1975 	else
1976 		rt6_clean_expires(rt);
1977 
1978 	if (cfg->fc_protocol == RTPROT_UNSPEC)
1979 		cfg->fc_protocol = RTPROT_BOOT;
1980 	rt->rt6i_protocol = cfg->fc_protocol;
1981 
1982 	addr_type = ipv6_addr_type(&cfg->fc_dst);
1983 
1984 	if (addr_type & IPV6_ADDR_MULTICAST)
1985 		rt->dst.input = ip6_mc_input;
1986 	else if (cfg->fc_flags & RTF_LOCAL)
1987 		rt->dst.input = ip6_input;
1988 	else
1989 		rt->dst.input = ip6_forward;
1990 
1991 	rt->dst.output = ip6_output;
1992 
1993 	if (cfg->fc_encap) {
1994 		struct lwtunnel_state *lwtstate;
1995 
1996 		err = lwtunnel_build_state(cfg->fc_encap_type,
1997 					   cfg->fc_encap, AF_INET6, cfg,
1998 					   &lwtstate, extack);
1999 		if (err)
2000 			goto out;
2001 		rt->dst.lwtstate = lwtstate_get(lwtstate);
2002 		if (lwtunnel_output_redirect(rt->dst.lwtstate)) {
2003 			rt->dst.lwtstate->orig_output = rt->dst.output;
2004 			rt->dst.output = lwtunnel_output;
2005 		}
2006 		if (lwtunnel_input_redirect(rt->dst.lwtstate)) {
2007 			rt->dst.lwtstate->orig_input = rt->dst.input;
2008 			rt->dst.input = lwtunnel_input;
2009 		}
2010 	}
2011 
2012 	ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
2013 	rt->rt6i_dst.plen = cfg->fc_dst_len;
2014 	if (rt->rt6i_dst.plen == 128)
2015 		rt->dst.flags |= DST_HOST;
2016 
2017 #ifdef CONFIG_IPV6_SUBTREES
2018 	ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
2019 	rt->rt6i_src.plen = cfg->fc_src_len;
2020 #endif
2021 
2022 	rt->rt6i_metric = cfg->fc_metric;
2023 
2024 	/* We cannot add true routes via loopback here,
2025 	   they would result in kernel looping; promote them to reject routes
2026 	 */
2027 	if ((cfg->fc_flags & RTF_REJECT) ||
2028 	    (dev && (dev->flags & IFF_LOOPBACK) &&
2029 	     !(addr_type & IPV6_ADDR_LOOPBACK) &&
2030 	     !(cfg->fc_flags & RTF_LOCAL))) {
2031 		/* hold loopback dev/idev if we haven't done so. */
2032 		if (dev != net->loopback_dev) {
2033 			if (dev) {
2034 				dev_put(dev);
2035 				in6_dev_put(idev);
2036 			}
2037 			dev = net->loopback_dev;
2038 			dev_hold(dev);
2039 			idev = in6_dev_get(dev);
2040 			if (!idev) {
2041 				err = -ENODEV;
2042 				goto out;
2043 			}
2044 		}
2045 		rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
2046 		switch (cfg->fc_type) {
2047 		case RTN_BLACKHOLE:
2048 			rt->dst.error = -EINVAL;
2049 			rt->dst.output = dst_discard_out;
2050 			rt->dst.input = dst_discard;
2051 			break;
2052 		case RTN_PROHIBIT:
2053 			rt->dst.error = -EACCES;
2054 			rt->dst.output = ip6_pkt_prohibit_out;
2055 			rt->dst.input = ip6_pkt_prohibit;
2056 			break;
2057 		case RTN_THROW:
2058 		case RTN_UNREACHABLE:
2059 		default:
2060 			rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
2061 					: (cfg->fc_type == RTN_UNREACHABLE)
2062 					? -EHOSTUNREACH : -ENETUNREACH;
2063 			rt->dst.output = ip6_pkt_discard_out;
2064 			rt->dst.input = ip6_pkt_discard;
2065 			break;
2066 		}
2067 		goto install_route;
2068 	}
2069 
2070 	if (cfg->fc_flags & RTF_GATEWAY) {
2071 		const struct in6_addr *gw_addr;
2072 		int gwa_type;
2073 
2074 		gw_addr = &cfg->fc_gateway;
2075 		gwa_type = ipv6_addr_type(gw_addr);
2076 
2077 		/* if gw_addr is local we will fail to detect this in case
2078 		 * address is still TENTATIVE (DAD in progress). rt6_lookup()
2079 		 * will return already-added prefix route via interface that
2080 		 * prefix route was assigned to, which might be non-loopback.
2081 		 */
2082 		err = -EINVAL;
2083 		if (ipv6_chk_addr_and_flags(net, gw_addr,
2084 					    gwa_type & IPV6_ADDR_LINKLOCAL ?
2085 					    dev : NULL, 0, 0)) {
2086 			NL_SET_ERR_MSG(extack, "Invalid gateway address");
2087 			goto out;
2088 		}
2089 		rt->rt6i_gateway = *gw_addr;
2090 
2091 		if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
2092 			struct rt6_info *grt = NULL;
2093 
2094 			/* IPv6 strictly inhibits using not link-local
2095 			   addresses as nexthop address.
2096 			   Otherwise, router will not able to send redirects.
2097 			   It is very good, but in some (rare!) circumstances
2098 			   (SIT, PtP, NBMA NOARP links) it is handy to allow
2099 			   some exceptions. --ANK
2100 			   We allow IPv4-mapped nexthops to support RFC4798-type
2101 			   addressing
2102 			 */
2103 			if (!(gwa_type & (IPV6_ADDR_UNICAST |
2104 					  IPV6_ADDR_MAPPED))) {
2105 				NL_SET_ERR_MSG(extack,
2106 					       "Invalid gateway address");
2107 				goto out;
2108 			}
2109 
2110 			if (cfg->fc_table) {
2111 				grt = ip6_nh_lookup_table(net, cfg, gw_addr);
2112 
2113 				if (grt) {
2114 					if (grt->rt6i_flags & RTF_GATEWAY ||
2115 					    (dev && dev != grt->dst.dev)) {
2116 						ip6_rt_put(grt);
2117 						grt = NULL;
2118 					}
2119 				}
2120 			}
2121 
2122 			if (!grt)
2123 				grt = rt6_lookup(net, gw_addr, NULL,
2124 						 cfg->fc_ifindex, 1);
2125 
2126 			err = -EHOSTUNREACH;
2127 			if (!grt)
2128 				goto out;
2129 			if (dev) {
2130 				if (dev != grt->dst.dev) {
2131 					ip6_rt_put(grt);
2132 					goto out;
2133 				}
2134 			} else {
2135 				dev = grt->dst.dev;
2136 				idev = grt->rt6i_idev;
2137 				dev_hold(dev);
2138 				in6_dev_hold(grt->rt6i_idev);
2139 			}
2140 			if (!(grt->rt6i_flags & RTF_GATEWAY))
2141 				err = 0;
2142 			ip6_rt_put(grt);
2143 
2144 			if (err)
2145 				goto out;
2146 		}
2147 		err = -EINVAL;
2148 		if (!dev) {
2149 			NL_SET_ERR_MSG(extack, "Egress device not specified");
2150 			goto out;
2151 		} else if (dev->flags & IFF_LOOPBACK) {
2152 			NL_SET_ERR_MSG(extack,
2153 				       "Egress device can not be loopback device for this route");
2154 			goto out;
2155 		}
2156 	}
2157 
2158 	err = -ENODEV;
2159 	if (!dev)
2160 		goto out;
2161 
2162 	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
2163 		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
2164 			NL_SET_ERR_MSG(extack, "Invalid source address");
2165 			err = -EINVAL;
2166 			goto out;
2167 		}
2168 		rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
2169 		rt->rt6i_prefsrc.plen = 128;
2170 	} else
2171 		rt->rt6i_prefsrc.plen = 0;
2172 
2173 	rt->rt6i_flags = cfg->fc_flags;
2174 
2175 install_route:
2176 	rt->dst.dev = dev;
2177 	rt->rt6i_idev = idev;
2178 	rt->rt6i_table = table;
2179 
2180 	cfg->fc_nlinfo.nl_net = dev_net(dev);
2181 
2182 	return rt;
2183 out:
2184 	if (dev)
2185 		dev_put(dev);
2186 	if (idev)
2187 		in6_dev_put(idev);
2188 	if (rt)
2189 		dst_release_immediate(&rt->dst);
2190 
2191 	return ERR_PTR(err);
2192 }
2193 
ip6_route_add(struct fib6_config * cfg,struct netlink_ext_ack * extack)2194 int ip6_route_add(struct fib6_config *cfg,
2195 		  struct netlink_ext_ack *extack)
2196 {
2197 	struct mx6_config mxc = { .mx = NULL, };
2198 	struct rt6_info *rt;
2199 	int err;
2200 
2201 	rt = ip6_route_info_create(cfg, extack);
2202 	if (IS_ERR(rt)) {
2203 		err = PTR_ERR(rt);
2204 		rt = NULL;
2205 		goto out;
2206 	}
2207 
2208 	err = ip6_convert_metrics(&mxc, cfg);
2209 	if (err)
2210 		goto out;
2211 
2212 	err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc, extack);
2213 
2214 	kfree(mxc.mx);
2215 
2216 	return err;
2217 out:
2218 	if (rt)
2219 		dst_release_immediate(&rt->dst);
2220 
2221 	return err;
2222 }
2223 
__ip6_del_rt(struct rt6_info * rt,struct nl_info * info)2224 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
2225 {
2226 	int err;
2227 	struct fib6_table *table;
2228 	struct net *net = dev_net(rt->dst.dev);
2229 
2230 	if (rt == net->ipv6.ip6_null_entry) {
2231 		err = -ENOENT;
2232 		goto out;
2233 	}
2234 
2235 	table = rt->rt6i_table;
2236 	write_lock_bh(&table->tb6_lock);
2237 	err = fib6_del(rt, info);
2238 	write_unlock_bh(&table->tb6_lock);
2239 
2240 out:
2241 	ip6_rt_put(rt);
2242 	return err;
2243 }
2244 
ip6_del_rt(struct rt6_info * rt)2245 int ip6_del_rt(struct rt6_info *rt)
2246 {
2247 	struct nl_info info = {
2248 		.nl_net = dev_net(rt->dst.dev),
2249 	};
2250 	return __ip6_del_rt(rt, &info);
2251 }
2252 
/* Delete @rt and, when cfg->fc_delete_all_nh is set, all of its ECMP
 * siblings in a single pass, preferring one RTM_DELROUTE notification
 * that describes every hop over per-hop notifications.
 *
 * Consumes the caller's reference on @rt via ip6_rt_put().  Returns 0
 * on success or a negative errno (-ENOENT when @rt is the namespace
 * null entry or nothing was deleted).
 */
static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
{
	struct nl_info *info = &cfg->fc_nlinfo;
	struct net *net = info->nl_net;
	struct sk_buff *skb = NULL;
	struct fib6_table *table;
	int err = -ENOENT;

	if (rt == net->ipv6.ip6_null_entry)
		goto out_put;
	table = rt->rt6i_table;
	write_lock_bh(&table->tb6_lock);

	if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) {
		struct rt6_info *sibling, *next_sibling;

		/* prefer to send a single notification with all hops */
		skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
		if (skb) {
			u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;

			if (rt6_fill_node(net, skb, rt,
					  NULL, NULL, 0, RTM_DELROUTE,
					  info->portid, seq, 0) < 0) {
				/* fall back to per-route notifications */
				kfree_skb(skb);
				skb = NULL;
			} else
				/* combined message built: suppress the
				 * individual fib6_del() notifications
				 */
				info->skip_notify = 1;
		}

		/* _safe variant: fib6_del() unlinks the entry we hold */
		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->rt6i_siblings,
					 rt6i_siblings) {
			err = fib6_del(sibling, info);
			if (err)
				goto out_unlock;
		}
	}

	err = fib6_del(rt, info);
out_unlock:
	write_unlock_bh(&table->tb6_lock);
out_put:
	ip6_rt_put(rt);

	/* send the combined notification outside the table lock */
	if (skb) {
		rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
			    info->nlh, gfp_any());
	}
	return err;
}
2304 
/* Delete the route matching @cfg: table and dst/src prefix are
 * mandatory matches; ifindex, gateway, metric and protocol are
 * matched only when non-zero in @cfg.  With RTF_GATEWAY set only the
 * single matching nexthop is removed, otherwise all ECMP siblings go
 * with it.  Returns 0 on success or -ESRCH when nothing matches.
 */
static int ip6_route_del(struct fib6_config *cfg,
			 struct netlink_ext_ack *extack)
{
	struct fib6_table *table;
	struct fib6_node *fn;
	struct rt6_info *rt;
	int err = -ESRCH;

	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
	if (!table) {
		NL_SET_ERR_MSG(extack, "FIB table does not exist");
		return err;
	}

	read_lock_bh(&table->tb6_lock);

	fn = fib6_locate(&table->tb6_root,
			 &cfg->fc_dst, cfg->fc_dst_len,
			 &cfg->fc_src, cfg->fc_src_len);

	if (fn) {
		for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
			/* cached clones are only removed on explicit request */
			if ((rt->rt6i_flags & RTF_CACHE) &&
			    !(cfg->fc_flags & RTF_CACHE))
				continue;
			if (cfg->fc_ifindex &&
			    (!rt->dst.dev ||
			     rt->dst.dev->ifindex != cfg->fc_ifindex))
				continue;
			if (cfg->fc_flags & RTF_GATEWAY &&
			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
				continue;
			if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
				continue;
			if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol)
				continue;
			/* hold rt across the unlock; the deletion helpers
			 * consume this reference via ip6_rt_put()
			 */
			dst_hold(&rt->dst);
			read_unlock_bh(&table->tb6_lock);

			/* if gateway was specified only delete the one hop */
			if (cfg->fc_flags & RTF_GATEWAY)
				return __ip6_del_rt(rt, &cfg->fc_nlinfo);

			return __ip6_del_rt_siblings(rt, cfg);
		}
	}
	read_unlock_bh(&table->tb6_lock);

	return err;
}
2355 
/* Process a received ICMPv6 Redirect for @dst (RFC 4861, section 8):
 * validate the message and its ND options, update the neighbour cache
 * entry for the new first hop, and install an RTF_CACHE clone of @rt
 * pointing at the redirect target (on-link when target == destination,
 * i.e. RTF_GATEWAY cleared).  Silently drops malformed or disallowed
 * redirects after a ratelimited debug message.
 */
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
	struct netevent_redirect netevent;
	struct rt6_info *rt, *nrt = NULL;
	struct ndisc_options ndopts;
	struct inet6_dev *in6_dev;
	struct neighbour *neigh;
	struct rd_msg *msg;
	int optlen, on_link;
	u8 *lladdr;

	/* ND options follow the fixed redirect header */
	optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
	optlen -= sizeof(*msg);

	if (optlen < 0) {
		net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
		return;
	}

	msg = (struct rd_msg *)icmp6_hdr(skb);

	if (ipv6_addr_is_multicast(&msg->dest)) {
		net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
		return;
	}

	/* target == dest means the destination itself is on-link;
	 * otherwise the new first hop must be link-local unicast
	 */
	on_link = 0;
	if (ipv6_addr_equal(&msg->dest, &msg->target)) {
		on_link = 1;
	} else if (ipv6_addr_type(&msg->target) !=
		   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
		net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
		return;
	}

	in6_dev = __in6_dev_get(skb->dev);
	if (!in6_dev)
		return;
	/* routers don't accept redirects; honour the sysctl as well */
	if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
		return;

	/* RFC2461 8.1:
	 *	The IP source address of the Redirect MUST be the same as the current
	 *	first-hop router for the specified ICMP Destination Address.
	 */

	if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
		net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
		return;
	}

	lladdr = NULL;
	if (ndopts.nd_opts_tgt_lladdr) {
		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
					     skb->dev);
		if (!lladdr) {
			net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
			return;
		}
	}

	rt = (struct rt6_info *) dst;
	if (rt->rt6i_flags & RTF_REJECT) {
		net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
		return;
	}

	/* Redirect received -> path was valid.
	 * Look, redirects are sent only in response to data packets,
	 * so that this nexthop apparently is reachable. --ANK
	 */
	dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);

	neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
	if (!neigh)
		return;

	/*
	 *	We have finally decided to accept it.
	 */

	ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
		     NEIGH_UPDATE_F_OVERRIDE|
		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
				     NEIGH_UPDATE_F_ISROUTER)),
		     NDISC_REDIRECT, &ndopts);

	nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
	if (!nrt)
		goto out;

	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
	if (on_link)
		nrt->rt6i_flags &= ~RTF_GATEWAY;

	nrt->rt6i_protocol = RTPROT_REDIRECT;
	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;

	if (ip6_ins_rt(nrt))
		goto out_release;

	netevent.old = &rt->dst;
	netevent.new = &nrt->dst;
	netevent.daddr = &msg->dest;
	netevent.neigh = neigh;
	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);

	/* the new clone supersedes an old cached entry */
	if (rt->rt6i_flags & RTF_CACHE) {
		rt = (struct rt6_info *) dst_clone(&rt->dst);
		ip6_del_rt(rt);
	}

out_release:
	/* Release the reference taken in
	 * ip6_rt_cache_alloc()
	 */
	dst_release(&nrt->dst);

out:
	neigh_release(neigh);
}
2478 
2479 /*
2480  *	Misc support functions
2481  */
2482 
/* Make @rt a clone of @from: record the parent in dst.from and share
 * the parent's metrics.  @from must itself be an original route (its
 * own dst.from is unset), and the clone never expires on its own
 * (RTF_EXPIRES is cleared here).
 */
static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
{
	BUG_ON(from->dst.from);

	rt->rt6i_flags &= ~RTF_EXPIRES;
	/* take the reference before publishing the pointer */
	dst_hold(&from->dst);
	rt->dst.from = &from->dst;
	dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
}
2492 
/* Initialise clone @rt from original route @ort, taking references on
 * the idev and lwtstate the clone now shares.  Ordering matters:
 * rt6i_flags is copied before rt6_set_from(), which then clears
 * RTF_EXPIRES on the clone.
 */
static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
{
	rt->dst.input = ort->dst.input;
	rt->dst.output = ort->dst.output;
	rt->rt6i_dst = ort->rt6i_dst;
	rt->dst.error = ort->dst.error;
	rt->rt6i_idev = ort->rt6i_idev;
	if (rt->rt6i_idev)
		in6_dev_hold(rt->rt6i_idev);
	rt->dst.lastuse = jiffies;
	rt->rt6i_gateway = ort->rt6i_gateway;
	rt->rt6i_flags = ort->rt6i_flags;
	rt6_set_from(rt, ort);
	rt->rt6i_metric = ort->rt6i_metric;
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = ort->rt6i_src;
#endif
	rt->rt6i_prefsrc = ort->rt6i_prefsrc;
	rt->rt6i_table = ort->rt6i_table;
	rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
}
2514 
2515 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Look up the RTF_ROUTEINFO|RTF_GATEWAY route for @prefix/@prefixlen
 * learned via gateway @gwaddr on @dev (RFC 4191 route information).
 * Returns the matching route with a reference held, or NULL.
 */
static struct rt6_info *rt6_get_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev)
{
	u32 tb_id = l3mdev_fib_table(dev) ? : addrconf_rt_table(dev, RT6_TABLE_INFO);
	struct fib6_node *fn;
	struct rt6_info *rt = NULL;
	struct fib6_table *table;

	table = fib6_get_table(net, tb_id);
	if (!table)
		return NULL;

	read_lock_bh(&table->tb6_lock);
	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0);
	if (!fn)
		goto out;

	for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
		if (rt->dst.dev->ifindex != dev->ifindex)
			continue;
		if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
			continue;
		if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
			continue;
		/* hold the ref before dropping the table lock */
		dst_hold(&rt->dst);
		break;
	}
out:
	read_unlock_bh(&table->tb6_lock);
	return rt;
}
2549 
rt6_add_route_info(struct net * net,const struct in6_addr * prefix,int prefixlen,const struct in6_addr * gwaddr,struct net_device * dev,unsigned int pref)2550 static struct rt6_info *rt6_add_route_info(struct net *net,
2551 					   const struct in6_addr *prefix, int prefixlen,
2552 					   const struct in6_addr *gwaddr,
2553 					   struct net_device *dev,
2554 					   unsigned int pref)
2555 {
2556 	struct fib6_config cfg = {
2557 		.fc_metric	= IP6_RT_PRIO_USER,
2558 		.fc_ifindex	= dev->ifindex,
2559 		.fc_dst_len	= prefixlen,
2560 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
2561 				  RTF_UP | RTF_PREF(pref),
2562 		.fc_protocol = RTPROT_RA,
2563 		.fc_nlinfo.portid = 0,
2564 		.fc_nlinfo.nlh = NULL,
2565 		.fc_nlinfo.nl_net = net,
2566 	};
2567 
2568 	cfg.fc_table = l3mdev_fib_table(dev) ? : addrconf_rt_table(dev, RT6_TABLE_INFO),
2569 	cfg.fc_dst = *prefix;
2570 	cfg.fc_gateway = *gwaddr;
2571 
2572 	/* We should treat it as a default route if prefix length is 0. */
2573 	if (!prefixlen)
2574 		cfg.fc_flags |= RTF_DEFAULT;
2575 
2576 	ip6_route_add(&cfg, NULL);
2577 
2578 	return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
2579 }
2580 #endif
2581 
/* Find the RA-learned (RTF_ADDRCONF|RTF_DEFAULT) default route via
 * gateway @addr on @dev.  Returns the route with a reference held, or
 * NULL when no such router entry exists.
 */
struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
{
	u32 tb_id = l3mdev_fib_table(dev) ? : addrconf_rt_table(dev, RT6_TABLE_MAIN);
	struct rt6_info *rt;
	struct fib6_table *table;

	table = fib6_get_table(dev_net(dev), tb_id);
	if (!table)
		return NULL;

	read_lock_bh(&table->tb6_lock);
	/* default routes hang off the table's root node */
	for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
		if (dev == rt->dst.dev &&
		    ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
		    ipv6_addr_equal(&rt->rt6i_gateway, addr))
			break;
	}
	/* take the ref before dropping the table lock */
	if (rt)
		dst_hold(&rt->dst);
	read_unlock_bh(&table->tb6_lock);
	return rt;
}
2604 
rt6_add_dflt_router(const struct in6_addr * gwaddr,struct net_device * dev,unsigned int pref)2605 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
2606 				     struct net_device *dev,
2607 				     unsigned int pref)
2608 {
2609 	struct fib6_config cfg = {
2610 		.fc_table	= l3mdev_fib_table(dev) ? : addrconf_rt_table(dev, RT6_TABLE_DFLT),
2611 		.fc_metric	= IP6_RT_PRIO_USER,
2612 		.fc_ifindex	= dev->ifindex,
2613 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
2614 				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
2615 		.fc_protocol = RTPROT_RA,
2616 		.fc_nlinfo.portid = 0,
2617 		.fc_nlinfo.nlh = NULL,
2618 		.fc_nlinfo.nl_net = dev_net(dev),
2619 	};
2620 
2621 	cfg.fc_gateway = *gwaddr;
2622 
2623 	if (!ip6_route_add(&cfg, NULL)) {
2624 		struct fib6_table *table;
2625 
2626 		table = fib6_get_table(dev_net(dev), cfg.fc_table);
2627 		if (table)
2628 			table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
2629 	}
2630 
2631 	return rt6_get_dflt_router(gwaddr, dev);
2632 }
2633 
rt6_addrconf_purge(struct rt6_info * rt,void * arg)2634 int rt6_addrconf_purge(struct rt6_info *rt, void *arg) {
2635 	if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
2636 	    (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2))
2637 		return -1;
2638 	return 0;
2639 }
2640 
/* Walk every FIB table in @net and delete the routes flagged for
 * removal by rt6_addrconf_purge() (RA/addrconf default routes).
 */
void rt6_purge_dflt_routers(struct net *net)
{
	fib6_clean_all(net, rt6_addrconf_purge, NULL);
}
2645 
rtmsg_to_fib6_config(struct net * net,struct in6_rtmsg * rtmsg,struct fib6_config * cfg)2646 static void rtmsg_to_fib6_config(struct net *net,
2647 				 struct in6_rtmsg *rtmsg,
2648 				 struct fib6_config *cfg)
2649 {
2650 	memset(cfg, 0, sizeof(*cfg));
2651 
2652 	cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
2653 			 : RT6_TABLE_MAIN;
2654 	cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
2655 	cfg->fc_metric = rtmsg->rtmsg_metric;
2656 	cfg->fc_expires = rtmsg->rtmsg_info;
2657 	cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
2658 	cfg->fc_src_len = rtmsg->rtmsg_src_len;
2659 	cfg->fc_flags = rtmsg->rtmsg_flags;
2660 
2661 	cfg->fc_nlinfo.nl_net = net;
2662 
2663 	cfg->fc_dst = rtmsg->rtmsg_dst;
2664 	cfg->fc_src = rtmsg->rtmsg_src;
2665 	cfg->fc_gateway = rtmsg->rtmsg_gateway;
2666 }
2667 
/* Handle the legacy SIOCADDRT/SIOCDELRT route ioctls.
 *
 * Requires CAP_NET_ADMIN in the owning user namespace.  Returns 0 on
 * success, -EPERM without the capability, -EFAULT on a bad user
 * pointer, and -EINVAL for any other command.
 */
int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
{
	struct in6_rtmsg rtmsg;
	struct fib6_config cfg;
	int err;

	if (cmd != SIOCADDRT && cmd != SIOCDELRT)
		return -EINVAL;

	if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
		return -EPERM;

	if (copy_from_user(&rtmsg, arg, sizeof(struct in6_rtmsg)))
		return -EFAULT;

	rtmsg_to_fib6_config(net, &rtmsg, &cfg);

	rtnl_lock();
	if (cmd == SIOCADDRT)
		err = ip6_route_add(&cfg, NULL);
	else
		err = ip6_route_del(&cfg, NULL);
	rtnl_unlock();

	return err;
}
2704 
2705 /*
2706  *	Drop the packet on the floor
2707  */
2708 
/* Common drop path for reject routes: bump the matching SNMP counter,
 * send an ICMPv6 Destination Unreachable with @code, and free @skb.
 * An input packet addressed to :: is counted as an address error
 * rather than a no-route.
 */
static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
{
	struct dst_entry *dst = skb_dst(skb);

	if (ipstats_mib_noroutes == IPSTATS_MIB_INNOROUTES ||
	    ipstats_mib_noroutes == IPSTATS_MIB_OUTNOROUTES) {
		int mib = ipstats_mib_noroutes;

		if (mib == IPSTATS_MIB_INNOROUTES &&
		    ipv6_addr_type(&ipv6_hdr(skb)->daddr) == IPV6_ADDR_ANY)
			mib = IPSTATS_MIB_INADDRERRORS;

		IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst), mib);
	}

	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
	kfree_skb(skb);
	return 0;
}
2731 
/* dst.input handler for reject routes: drop with "no route" code. */
static int ip6_pkt_discard(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
}
2736 
ip6_pkt_discard_out(struct net * net,struct sock * sk,struct sk_buff * skb)2737 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
2738 {
2739 	skb->dev = skb_dst(skb)->dev;
2740 	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
2741 }
2742 
/* dst.input handler for prohibit routes: drop, admin prohibited. */
static int ip6_pkt_prohibit(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
}
2747 
ip6_pkt_prohibit_out(struct net * net,struct sock * sk,struct sk_buff * skb)2748 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
2749 {
2750 	skb->dev = skb_dst(skb)->dev;
2751 	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2752 }
2753 
2754 /*
2755  *	Allocate a dst for local (unicast / anycast) address.
2756  */
2757 
/* Allocate a host route (plen 128) for a local unicast or anycast
 * address on @idev.  The route is not inserted into any table here;
 * the caller owns the returned reference.  Takes a reference on
 * @idev.  Returns the route or ERR_PTR(-ENOMEM).
 */
struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
				    const struct in6_addr *addr,
				    bool anycast)
{
	struct net_device *dev = idev->dev;
	struct net *net = dev_net(dev);
	struct rt6_info *rt;
	u32 tb_id;

	rt = ip6_dst_alloc(net, dev, DST_NOCOUNT);
	if (!rt)
		return ERR_PTR(-ENOMEM);

	in6_dev_hold(idev);

	rt->dst.flags |= DST_HOST;
	rt->dst.input = ip6_input;
	rt->dst.output = ip6_output;
	rt->rt6i_idev = idev;

	rt->rt6i_protocol = RTPROT_KERNEL;
	rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP |
			 (anycast ? RTF_ANYCAST : RTF_LOCAL);

	rt->rt6i_gateway  = *addr;
	rt->rt6i_dst.addr = *addr;
	rt->rt6i_dst.plen = 128;
	/* honour an l3mdev (VRF) table binding if one exists */
	tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_LOCAL;
	rt->rt6i_table = fib6_get_table(net, tb_id);

	return rt;
}
2793 
2794 /* remove deleted ip from prefsrc entries */
/* Argument bundle for fib6_remove_prefsrc(). */
struct arg_dev_net_ip {
	struct net_device *dev;	/* restrict to this device; NULL matches any */
	struct net *net;	/* namespace whose null entry must be skipped */
	struct in6_addr *addr;	/* preferred source address being removed */
};
2800 
fib6_remove_prefsrc(struct rt6_info * rt,void * arg)2801 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2802 {
2803 	struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2804 	struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2805 	struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2806 
2807 	if (((void *)rt->dst.dev == dev || !dev) &&
2808 	    rt != net->ipv6.ip6_null_entry &&
2809 	    ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2810 		/* remove prefsrc entry */
2811 		rt->rt6i_prefsrc.plen = 0;
2812 	}
2813 	return 0;
2814 }
2815 
rt6_remove_prefsrc(struct inet6_ifaddr * ifp)2816 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2817 {
2818 	struct net *net = dev_net(ifp->idev->dev);
2819 	struct arg_dev_net_ip adni = {
2820 		.dev = ifp->idev->dev,
2821 		.net = net,
2822 		.addr = &ifp->addr,
2823 	};
2824 	fib6_clean_all(net, fib6_remove_prefsrc, &adni);
2825 }
2826 
2827 #define RTF_RA_ROUTER		(RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
2828 #define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)
2829 
2830 /* Remove routers and update dst entries when gateway turn into host. */
fib6_clean_tohost(struct rt6_info * rt,void * arg)2831 static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
2832 {
2833 	struct in6_addr *gateway = (struct in6_addr *)arg;
2834 
2835 	if ((((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) ||
2836 	     ((rt->rt6i_flags & RTF_CACHE_GATEWAY) == RTF_CACHE_GATEWAY)) &&
2837 	     ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
2838 		return -1;
2839 	}
2840 	return 0;
2841 }
2842 
/* Drop routes that still treat @gateway as a router; called when the
 * node at @gateway is no longer acting as one.
 */
void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
{
	fib6_clean_all(net, fib6_clean_tohost, gateway);
}
2847 
/* Argument bundle for fib6_ifdown(). */
struct arg_dev_net {
	struct net_device *dev;	/* device going down; NULL matches all */
	struct net *net;	/* namespace whose null entry must be skipped */
};
2852 
2853 /* called with write lock held for table with rt */
fib6_ifdown(struct rt6_info * rt,void * arg)2854 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2855 {
2856 	const struct arg_dev_net *adn = arg;
2857 	const struct net_device *dev = adn->dev;
2858 
2859 	if ((rt->dst.dev == dev || !dev) &&
2860 	    rt != adn->net->ipv6.ip6_null_entry &&
2861 	    (rt->rt6i_nsiblings == 0 ||
2862 	     (dev && netdev_unregistering(dev)) ||
2863 	     !rt->rt6i_idev->cnf.ignore_routes_with_linkdown))
2864 		return -1;
2865 
2866 	return 0;
2867 }
2868 
rt6_ifdown(struct net * net,struct net_device * dev)2869 void rt6_ifdown(struct net *net, struct net_device *dev)
2870 {
2871 	struct arg_dev_net adn = {
2872 		.dev = dev,
2873 		.net = net,
2874 	};
2875 
2876 	fib6_clean_all(net, fib6_ifdown, &adn);
2877 	if (dev)
2878 		rt6_uncached_list_flush_dev(net, dev);
2879 }
2880 
/* Argument bundle for rt6_mtu_change_route(). */
struct rt6_mtu_change_arg {
	struct net_device *dev;	/* device whose MTU changed */
	unsigned int mtu;	/* the new device MTU */
};
2885 
/* fib6_clean_all() callback: propagate a device MTU change into the
 * route metrics / cached PMTU of every route on that device.  Always
 * returns 0 (routes are updated in place, never deleted).
 */
static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
{
	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
	struct inet6_dev *idev;

	/* In IPv6 pmtu discovery is not optional,
	   so that RTAX_MTU lock cannot disable it.
	   We still use this lock to block changes
	   caused by addrconf/ndisc.
	*/

	idev = __in6_dev_get(arg->dev);
	if (!idev)
		return 0;

	/* For administrative MTU increase, there is no way to discover
	   IPv6 PMTU increase, so PMTU increase should be updated here.
	   Since RFC 1981 doesn't include administrative MTU increase
	   update PMTU increase is a MUST. (i.e. jumbo frame)
	 */
	/*
	   If new MTU is less than route PMTU, this new MTU will be the
	   lowest MTU in the path, update the route PMTU to reflect PMTU
	   decreases; if new MTU is greater than route PMTU, and the
	   old MTU is the lowest MTU in the path, update the route PMTU
	   to reflect the increase. In this case if the other nodes' MTU
	   also have the lowest MTU, TOO BIG MESSAGE will be lead to
	   PMTU discovery.
	 */
	if (rt->dst.dev == arg->dev &&
	    dst_metric_raw(&rt->dst, RTAX_MTU) &&
	    !dst_metric_locked(&rt->dst, RTAX_MTU)) {
		if (rt->rt6i_flags & RTF_CACHE) {
			/* For RTF_CACHE with rt6i_pmtu == 0
			 * (i.e. a redirected route),
			 * the metrics of its rt->dst.from has already
			 * been updated.
			 */
			if (rt->rt6i_pmtu && rt->rt6i_pmtu > arg->mtu)
				rt->rt6i_pmtu = arg->mtu;
		} else if (dst_mtu(&rt->dst) >= arg->mtu ||
			   (dst_mtu(&rt->dst) < arg->mtu &&
			    dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
			dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
		}
	}
	return 0;
}
2934 
rt6_mtu_change(struct net_device * dev,unsigned int mtu)2935 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
2936 {
2937 	struct rt6_mtu_change_arg arg = {
2938 		.dev = dev,
2939 		.mtu = mtu,
2940 	};
2941 
2942 	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
2943 }
2944 
/* Netlink attribute policy for IPv6 RTM_{NEW,DEL,GET}ROUTE requests. */
static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
	[RTA_PREFSRC]		= { .len = sizeof(struct in6_addr) },
	[RTA_OIF]               = { .type = NLA_U32 },
	[RTA_IIF]		= { .type = NLA_U32 },
	[RTA_PRIORITY]          = { .type = NLA_U32 },
	[RTA_METRICS]           = { .type = NLA_NESTED },
	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
	[RTA_PREF]              = { .type = NLA_U8 },
	[RTA_ENCAP_TYPE]	= { .type = NLA_U16 },
	[RTA_ENCAP]		= { .type = NLA_NESTED },
	[RTA_EXPIRES]		= { .type = NLA_U32 },
	[RTA_UID]		= { .type = NLA_U32 },
	[RTA_MARK]		= { .type = NLA_U32 },
	[RTA_TABLE]		= { .type = NLA_U32 },
};
2961 
/* Parse an RTM_NEWROUTE/RTM_DELROUTE netlink message into @cfg.
 *
 * Validates attributes against rtm_ipv6_policy, maps the rtm_type to
 * RTF_* flags, and copies every recognised attribute.  Returns 0 on
 * success or a negative errno (with extack detail where provided);
 * @cfg is fully zeroed before being populated.
 */
static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct fib6_config *cfg,
			      struct netlink_ext_ack *extack)
{
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	unsigned int pref;
	int err;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
			  NULL);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	rtm = nlmsg_data(nlh);
	memset(cfg, 0, sizeof(*cfg));

	cfg->fc_table = rtm->rtm_table;
	cfg->fc_dst_len = rtm->rtm_dst_len;
	cfg->fc_src_len = rtm->rtm_src_len;
	cfg->fc_flags = RTF_UP;
	cfg->fc_protocol = rtm->rtm_protocol;
	cfg->fc_type = rtm->rtm_type;

	/* all reject-style route types share the RTF_REJECT flag */
	if (rtm->rtm_type == RTN_UNREACHABLE ||
	    rtm->rtm_type == RTN_BLACKHOLE ||
	    rtm->rtm_type == RTN_PROHIBIT ||
	    rtm->rtm_type == RTN_THROW)
		cfg->fc_flags |= RTF_REJECT;

	if (rtm->rtm_type == RTN_LOCAL)
		cfg->fc_flags |= RTF_LOCAL;

	if (rtm->rtm_flags & RTM_F_CLONED)
		cfg->fc_flags |= RTF_CACHE;

	cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
	cfg->fc_nlinfo.nlh = nlh;
	cfg->fc_nlinfo.nl_net = sock_net(skb->sk);

	if (tb[RTA_GATEWAY]) {
		cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
		cfg->fc_flags |= RTF_GATEWAY;
	}
	if (tb[RTA_VIA]) {
		NL_SET_ERR_MSG(extack, "IPv6 does not support RTA_VIA attribute");
		goto errout;
	}

	if (tb[RTA_DST]) {
		/* attribute must carry at least the prefix bytes */
		int plen = (rtm->rtm_dst_len + 7) >> 3;

		if (nla_len(tb[RTA_DST]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
	}

	if (tb[RTA_SRC]) {
		int plen = (rtm->rtm_src_len + 7) >> 3;

		if (nla_len(tb[RTA_SRC]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
	}

	if (tb[RTA_PREFSRC])
		cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);

	if (tb[RTA_OIF])
		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_PRIORITY])
		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);

	if (tb[RTA_METRICS]) {
		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
	}

	if (tb[RTA_TABLE])
		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);

	if (tb[RTA_MULTIPATH]) {
		cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
		cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);

		err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
						     cfg->fc_mp_len, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_PREF]) {
		/* unknown preference values fall back to medium */
		pref = nla_get_u8(tb[RTA_PREF]);
		if (pref != ICMPV6_ROUTER_PREF_LOW &&
		    pref != ICMPV6_ROUTER_PREF_HIGH)
			pref = ICMPV6_ROUTER_PREF_MEDIUM;
		cfg->fc_flags |= RTF_PREF(pref);
	}

	if (tb[RTA_ENCAP])
		cfg->fc_encap = tb[RTA_ENCAP];

	if (tb[RTA_ENCAP_TYPE]) {
		cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);

		err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_EXPIRES]) {
		unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);

		/* infinite timeouts mean "no expiry": leave RTF_EXPIRES off */
		if (addrconf_finite_timeout(timeout)) {
			cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
			cfg->fc_flags |= RTF_EXPIRES;
		}
	}

	err = 0;
errout:
	return err;
}
3089 
/* One pending nexthop while building and inserting a multipath route */
struct rt6_nh {
	struct rt6_info *rt6_info;	/* route for this nexthop; reset to
					 * NULL once consumed by insertion */
	struct fib6_config r_cfg;	/* per-nexthop copy of the config */
	struct mx6_config mxc;		/* metrics converted from r_cfg */
	struct list_head next;		/* link on the local rt6_nh_list */
};
3096 
ip6_print_replace_route_err(struct list_head * rt6_nh_list)3097 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
3098 {
3099 	struct rt6_nh *nh;
3100 
3101 	list_for_each_entry(nh, rt6_nh_list, next) {
3102 		pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
3103 		        &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
3104 		        nh->r_cfg.fc_ifindex);
3105 	}
3106 }
3107 
/* Queue @rt on @rt6_nh_list for a later bulk insert.
 *
 * Rejects duplicates with -EEXIST so the same nexthop cannot appear
 * twice in one multipath request.  On success the list entry takes
 * over the caller's reference on @rt; on failure the caller still
 * owns it (see ip6_route_multipath_add()).
 */
static int ip6_route_info_append(struct list_head *rt6_nh_list,
				 struct rt6_info *rt, struct fib6_config *r_cfg)
{
	struct rt6_nh *nh;
	int err = -EEXIST;

	list_for_each_entry(nh, rt6_nh_list, next) {
		/* check if rt6_info already exists */
		if (rt6_duplicate_nexthop(nh->rt6_info, rt))
			return err;
	}

	nh = kzalloc(sizeof(*nh), GFP_KERNEL);
	if (!nh)
		return -ENOMEM;
	nh->rt6_info = rt;
	/* may allocate nh->mxc.mx; freed by the caller's cleanup path */
	err = ip6_convert_metrics(&nh->mxc, r_cfg);
	if (err) {
		kfree(nh);
		return err;
	}
	memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
	list_add_tail(&nh->next, rt6_nh_list);

	return 0;
}
3134 
/* Send the RTM_NEWROUTE notification for a (possibly multipath) add.
 *
 * @rt is the first route inserted and @rt_last the last one; either
 * may be NULL if nothing was inserted.
 */
static void ip6_route_mpath_notify(struct rt6_info *rt,
				   struct rt6_info *rt_last,
				   struct nl_info *info,
				   __u16 nlflags)
{
	/* if this is an APPEND route, then rt points to the first route
	 * inserted and rt_last points to last route inserted. Userspace
	 * wants a consistent dump of the route which starts at the first
	 * nexthop. Since sibling routes are always added at the end of
	 * the list, find the first sibling of the last route appended
	 */
	if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->rt6i_nsiblings) {
		rt = list_first_entry(&rt_last->rt6i_siblings,
				      struct rt6_info,
				      rt6i_siblings);
	}

	if (rt)
		inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
}
3155 
/* Insert all nexthops of an RTA_MULTIPATH request.
 *
 * Builds a list of rt6_info entries (one per rtnexthop), inserts them
 * one by one, and sends a single RTM_NEWROUTE notification covering
 * the whole route.  If an insert fails, the nexthops added so far are
 * deleted again so the operation is effectively all-or-nothing.
 */
static int ip6_route_multipath_add(struct fib6_config *cfg,
				   struct netlink_ext_ack *extack)
{
	struct rt6_info *rt_notif = NULL, *rt_last = NULL;
	struct nl_info *info = &cfg->fc_nlinfo;
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
	struct rt6_info *rt;
	struct rt6_nh *err_nh;
	struct rt6_nh *nh, *nh_safe;
	__u16 nlflags;
	int remaining;
	int attrlen;
	int err = 1;
	int nhn = 0;
	int replace = (cfg->fc_nlinfo.nlh &&
		       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
	LIST_HEAD(rt6_nh_list);

	nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
	if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
		nlflags |= NLM_F_APPEND;

	remaining = cfg->fc_mp_len;
	rtnh = (struct rtnexthop *)cfg->fc_mp;

	/* Parse a Multipath Entry and build a list (rt6_nh_list) of
	 * rt6_info structs per nexthop
	 */
	while (rtnh_ok(rtnh, remaining)) {
		memcpy(&r_cfg, cfg, sizeof(*cfg));
		if (rtnh->rtnh_ifindex)
			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

		/* per-nexthop gateway/encap override the base config */
		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla) {
				r_cfg.fc_gateway = nla_get_in6_addr(nla);
				r_cfg.fc_flags |= RTF_GATEWAY;
			}
			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
			if (nla)
				r_cfg.fc_encap_type = nla_get_u16(nla);
		}

		rt = ip6_route_info_create(&r_cfg, extack);
		if (IS_ERR(rt)) {
			err = PTR_ERR(rt);
			rt = NULL;
			goto cleanup;
		}

		err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
		if (err) {
			/* append did not take the reference; drop it */
			dst_release_immediate(&rt->dst);
			goto cleanup;
		}

		rtnh = rtnh_next(rtnh, &remaining);
	}

	/* for add and replace send one notification with all nexthops.
	 * Skip the notification in fib6_add_rt2node and send one with
	 * the full route when done
	 */
	info->skip_notify = 1;

	err_nh = NULL;
	list_for_each_entry(nh, &rt6_nh_list, next) {
		err = __ip6_ins_rt(nh->rt6_info, info, &nh->mxc, extack);

		if (!err) {
			/* save reference to last route successfully inserted */
			rt_last = nh->rt6_info;

			/* save reference to first route for notification */
			if (!rt_notif)
				rt_notif = nh->rt6_info;
		}

		/* nh->rt6_info is used or freed at this point, reset to NULL*/
		nh->rt6_info = NULL;
		if (err) {
			if (replace && nhn)
				ip6_print_replace_route_err(&rt6_nh_list);
			err_nh = nh;
			goto add_errout;
		}

		/* Because each route is added like a single route we remove
		 * these flags after the first nexthop: if there is a collision,
		 * we have already failed to add the first nexthop:
		 * fib6_add_rt2node() has rejected it; when replacing, old
		 * nexthops have been replaced by first new, the rest should
		 * be added to it.
		 */
		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
						     NLM_F_REPLACE);
		cfg->fc_nlinfo.nlh->nlmsg_flags |= NLM_F_CREATE;
		nhn++;
	}

	/* success ... tell user about new route */
	ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
	goto cleanup;

add_errout:
	/* send notification for routes that were added so that
	 * the delete notifications sent by ip6_route_del are
	 * coherent
	 */
	if (rt_notif)
		ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);

	/* Delete routes that were already added */
	list_for_each_entry(nh, &rt6_nh_list, next) {
		if (err_nh == nh)
			break;
		ip6_route_del(&nh->r_cfg, extack);
	}

cleanup:
	/* release any routes that were never inserted plus the metric
	 * buffers and list entries themselves
	 */
	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
		if (nh->rt6_info)
			dst_release_immediate(&nh->rt6_info->dst);
		kfree(nh->mxc.mx);
		list_del(&nh->next);
		kfree(nh);
	}

	return err;
}
3292 
ip6_route_multipath_del(struct fib6_config * cfg,struct netlink_ext_ack * extack)3293 static int ip6_route_multipath_del(struct fib6_config *cfg,
3294 				   struct netlink_ext_ack *extack)
3295 {
3296 	struct fib6_config r_cfg;
3297 	struct rtnexthop *rtnh;
3298 	int remaining;
3299 	int attrlen;
3300 	int err = 1, last_err = 0;
3301 
3302 	remaining = cfg->fc_mp_len;
3303 	rtnh = (struct rtnexthop *)cfg->fc_mp;
3304 
3305 	/* Parse a Multipath Entry */
3306 	while (rtnh_ok(rtnh, remaining)) {
3307 		memcpy(&r_cfg, cfg, sizeof(*cfg));
3308 		if (rtnh->rtnh_ifindex)
3309 			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
3310 
3311 		attrlen = rtnh_attrlen(rtnh);
3312 		if (attrlen > 0) {
3313 			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
3314 
3315 			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
3316 			if (nla) {
3317 				nla_memcpy(&r_cfg.fc_gateway, nla, 16);
3318 				r_cfg.fc_flags |= RTF_GATEWAY;
3319 			}
3320 		}
3321 		err = ip6_route_del(&r_cfg, extack);
3322 		if (err)
3323 			last_err = err;
3324 
3325 		rtnh = rtnh_next(rtnh, &remaining);
3326 	}
3327 
3328 	return last_err;
3329 }
3330 
inet6_rtm_delroute(struct sk_buff * skb,struct nlmsghdr * nlh,struct netlink_ext_ack * extack)3331 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
3332 			      struct netlink_ext_ack *extack)
3333 {
3334 	struct fib6_config cfg;
3335 	int err;
3336 
3337 	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
3338 	if (err < 0)
3339 		return err;
3340 
3341 	if (cfg.fc_mp)
3342 		return ip6_route_multipath_del(&cfg, extack);
3343 	else {
3344 		cfg.fc_delete_all_nh = 1;
3345 		return ip6_route_del(&cfg, extack);
3346 	}
3347 }
3348 
inet6_rtm_newroute(struct sk_buff * skb,struct nlmsghdr * nlh,struct netlink_ext_ack * extack)3349 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
3350 			      struct netlink_ext_ack *extack)
3351 {
3352 	struct fib6_config cfg;
3353 	int err;
3354 
3355 	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
3356 	if (err < 0)
3357 		return err;
3358 
3359 	if (cfg.fc_mp)
3360 		return ip6_route_multipath_add(&cfg, extack);
3361 	else
3362 		return ip6_route_add(&cfg, extack);
3363 }
3364 
rt6_nlmsg_size(struct rt6_info * rt)3365 static size_t rt6_nlmsg_size(struct rt6_info *rt)
3366 {
3367 	int nexthop_len = 0;
3368 
3369 	if (rt->rt6i_nsiblings) {
3370 		nexthop_len = nla_total_size(0)	 /* RTA_MULTIPATH */
3371 			    + NLA_ALIGN(sizeof(struct rtnexthop))
3372 			    + nla_total_size(16) /* RTA_GATEWAY */
3373 			    + lwtunnel_get_encap_size(rt->dst.lwtstate);
3374 
3375 		nexthop_len *= rt->rt6i_nsiblings;
3376 	}
3377 
3378 	return NLMSG_ALIGN(sizeof(struct rtmsg))
3379 	       + nla_total_size(16) /* RTA_SRC */
3380 	       + nla_total_size(16) /* RTA_DST */
3381 	       + nla_total_size(16) /* RTA_GATEWAY */
3382 	       + nla_total_size(16) /* RTA_PREFSRC */
3383 	       + nla_total_size(4) /* RTA_TABLE */
3384 	       + nla_total_size(4) /* RTA_IIF */
3385 	       + nla_total_size(4) /* RTA_OIF */
3386 	       + nla_total_size(4) /* RTA_PRIORITY */
3387 	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
3388 	       + nla_total_size(sizeof(struct rta_cacheinfo))
3389 	       + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
3390 	       + nla_total_size(1) /* RTA_PREF */
3391 	       + lwtunnel_get_encap_size(rt->dst.lwtstate)
3392 	       + nexthop_len;
3393 }
3394 
/* Emit the nexthop attributes of @rt into @skb and accumulate nexthop
 * flags (RTNH_F_LINKDOWN/RTNH_F_DEAD/RTNH_F_OFFLOAD) into *flags.
 *
 * @skip_oif suppresses RTA_OIF for the multipath encoding, where the
 * ifindex already lives in the rtnexthop header.  Returns 0 or
 * -EMSGSIZE if the skb ran out of room.
 */
static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt,
			    unsigned int *flags, bool skip_oif)
{
	if (!netif_running(rt->dst.dev) || !netif_carrier_ok(rt->dst.dev)) {
		*flags |= RTNH_F_LINKDOWN;
		/* with this sysctl, link-down routes are also unusable */
		if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
			*flags |= RTNH_F_DEAD;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
			goto nla_put_failure;
	}

	if (rt->rt6i_nh_flags & RTNH_F_OFFLOAD)
		*flags |= RTNH_F_OFFLOAD;

	/* not needed for multipath encoding b/c it has a rtnexthop struct */
	if (!skip_oif && rt->dst.dev &&
	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
		goto nla_put_failure;

	if (rt->dst.lwtstate &&
	    lwtunnel_fill_encap(skb, rt->dst.lwtstate) < 0)
		goto nla_put_failure;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}
3426 
/* add multipath next hop: emit one rtnexthop (header + attributes)
 * for @rt inside an already-open RTA_MULTIPATH container.
 * Returns 0 or -EMSGSIZE.
 */
static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt)
{
	struct rtnexthop *rtnh;
	unsigned int flags = 0;

	rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
	if (!rtnh)
		goto nla_put_failure;

	rtnh->rtnh_hops = 0;
	rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0;

	/* skip_oif=true: the ifindex is already in the rtnexthop header */
	if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
		goto nla_put_failure;

	rtnh->rtnh_flags = flags;

	/* length of rtnetlink header + attributes */
	rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}
3453 
/* Build one route message (type @type, e.g. RTM_NEWROUTE) for @rt.
 *
 * @dst/@src: addresses from an RTM_GETROUTE request; when NULL, the
 * route's own prefixes are reported instead.  @iif: input interface
 * for replies to input-side lookups.  Returns 0 on success or
 * -EMSGSIZE, in which case the partially built message is cancelled.
 */
static int rt6_fill_node(struct net *net,
			 struct sk_buff *skb, struct rt6_info *rt,
			 struct in6_addr *dst, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags)
{
	u32 metrics[RTAX_MAX];
	struct rtmsg *rtm;
	struct nlmsghdr *nlh;
	long expires;
	u32 table;

	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
	if (!nlh)
		return -EMSGSIZE;

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET6;
	rtm->rtm_dst_len = rt->rt6i_dst.plen;
	rtm->rtm_src_len = rt->rt6i_src.plen;
	rtm->rtm_tos = 0;
	if (rt->rt6i_table)
		table = rt->rt6i_table->tb6_id;
	else
		table = RT6_TABLE_UNSPEC;
	/* the rtmsg field is 8 bits; large ids only fit in RTA_TABLE */
	rtm->rtm_table = table < 256 ? table : RT_TABLE_COMPAT;
	if (nla_put_u32(skb, RTA_TABLE, table))
		goto nla_put_failure;
	/* map a reject route's stored error back to the route type */
	if (rt->rt6i_flags & RTF_REJECT) {
		switch (rt->dst.error) {
		case -EINVAL:
			rtm->rtm_type = RTN_BLACKHOLE;
			break;
		case -EACCES:
			rtm->rtm_type = RTN_PROHIBIT;
			break;
		case -EAGAIN:
			rtm->rtm_type = RTN_THROW;
			break;
		default:
			rtm->rtm_type = RTN_UNREACHABLE;
			break;
		}
	}
	else if (rt->rt6i_flags & RTF_LOCAL)
		rtm->rtm_type = RTN_LOCAL;
	else if (rt->rt6i_flags & RTF_ANYCAST)
		rtm->rtm_type = RTN_ANYCAST;
	else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
		rtm->rtm_type = RTN_LOCAL;
	else
		rtm->rtm_type = RTN_UNICAST;
	rtm->rtm_flags = 0;
	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
	rtm->rtm_protocol = rt->rt6i_protocol;

	if (rt->rt6i_flags & RTF_CACHE)
		rtm->rtm_flags |= RTM_F_CLONED;

	if (dst) {
		/* getroute reply: report the full requested address */
		if (nla_put_in6_addr(skb, RTA_DST, dst))
			goto nla_put_failure;
		rtm->rtm_dst_len = 128;
	} else if (rtm->rtm_dst_len)
		if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
			goto nla_put_failure;
#ifdef CONFIG_IPV6_SUBTREES
	if (src) {
		if (nla_put_in6_addr(skb, RTA_SRC, src))
			goto nla_put_failure;
		rtm->rtm_src_len = 128;
	} else if (rtm->rtm_src_len &&
		   nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
		goto nla_put_failure;
#endif
	if (iif) {
#ifdef CONFIG_IPV6_MROUTE
		/* multicast destinations are resolved via the mroute code */
		if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
			int err = ip6mr_get_route(net, skb, rtm, portid);

			if (err == 0)
				return 0;
			if (err < 0)
				goto nla_put_failure;
		} else
#endif
			if (nla_put_u32(skb, RTA_IIF, iif))
				goto nla_put_failure;
	} else if (dst) {
		struct in6_addr saddr_buf;
		if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	if (rt->rt6i_prefsrc.plen) {
		struct in6_addr saddr_buf;
		saddr_buf = rt->rt6i_prefsrc.addr;
		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
	/* a discovered path MTU overrides the metric value */
	if (rt->rt6i_pmtu)
		metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
	if (rtnetlink_put_metrics(skb, metrics) < 0)
		goto nla_put_failure;

	if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
		goto nla_put_failure;

	/* For multipath routes, walk the siblings list and add
	 * each as a nexthop within RTA_MULTIPATH.
	 */
	if (rt->rt6i_nsiblings) {
		struct rt6_info *sibling, *next_sibling;
		struct nlattr *mp;

		mp = nla_nest_start(skb, RTA_MULTIPATH);
		if (!mp)
			goto nla_put_failure;

		if (rt6_add_nexthop(skb, rt) < 0)
			goto nla_put_failure;

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->rt6i_siblings, rt6i_siblings) {
			if (rt6_add_nexthop(skb, sibling) < 0)
				goto nla_put_failure;
		}

		nla_nest_end(skb, mp);
	} else {
		if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
			goto nla_put_failure;
	}

	expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;

	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
		goto nla_put_failure;

	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
		goto nla_put_failure;


	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
3607 
/* fib6 tree walk callback used by the RTM_GETROUTE dump path.
 * Returns 1 to skip an entry that should not be dumped, otherwise the
 * rt6_fill_node() result (0 or -EMSGSIZE to stop the dump).
 */
int rt6_dump_route(struct rt6_info *rt, void *p_arg)
{
	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
	struct net *net = arg->net;

	if (rt == net->ipv6.ip6_null_entry)
		return 0;

	if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
		struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);

		/* user wants prefix routes only */
		if (rtm->rtm_flags & RTM_F_PREFIX &&
		    !(rt->rt6i_flags & RTF_PREFIX_RT)) {
			/* success since this is not a prefix route */
			return 1;
		}
	}

	return rt6_fill_node(net,
		     arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
		     NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
		     NLM_F_MULTI);
}
3632 
/* RTM_GETROUTE handler: perform a route lookup for the requested
 * selector and unicast the result back to the requester.
 *
 * RTA_IIF selects an input-side lookup, otherwise an output lookup is
 * done.  With RTM_F_FIB_MATCH the matching fib entry (dst.from) is
 * reported instead of the possibly cloned dst from the lookup.
 */
static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(in_skb->sk);
	struct nlattr *tb[RTA_MAX+1];
	int err, iif = 0, oif = 0;
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct sk_buff *skb;
	struct rtmsg *rtm;
	struct flowi6 fl6;
	bool fibmatch;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
			  extack);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	memset(&fl6, 0, sizeof(fl6));
	rtm = nlmsg_data(nlh);
	fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
	fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);

	if (tb[RTA_SRC]) {
		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
			goto errout;

		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
	}

	if (tb[RTA_DST]) {
		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
			goto errout;

		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
	}

	if (tb[RTA_IIF])
		iif = nla_get_u32(tb[RTA_IIF]);

	if (tb[RTA_OIF])
		oif = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_MARK])
		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);

	if (tb[RTA_UID])
		fl6.flowi6_uid = make_kuid(current_user_ns(),
					   nla_get_u32(tb[RTA_UID]));
	else
		fl6.flowi6_uid = iif ? INVALID_UID : current_uid();

	if (iif) {
		struct net_device *dev;
		int flags = 0;

		/* RCU protects the ifindex -> device lookup */
		rcu_read_lock();

		dev = dev_get_by_index_rcu(net, iif);
		if (!dev) {
			rcu_read_unlock();
			err = -ENODEV;
			goto errout;
		}

		fl6.flowi6_iif = iif;

		if (!ipv6_addr_any(&fl6.saddr))
			flags |= RT6_LOOKUP_F_HAS_SADDR;

		dst = ip6_route_input_lookup(net, dev, &fl6, flags);

		rcu_read_unlock();
	} else {
		fl6.flowi6_oif = oif;

		dst = ip6_route_output(net, NULL, &fl6);
	}


	rt = container_of(dst, struct rt6_info, dst);
	if (rt->dst.error) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	if (rt == net->ipv6.ip6_null_entry) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	/* report the fib entry the clone was derived from */
	if (fibmatch && rt->dst.from) {
		struct rt6_info *ort = container_of(rt->dst.from,
						    struct rt6_info, dst);

		dst_hold(&ort->dst);
		ip6_rt_put(rt);
		rt = ort;
	}

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb) {
		ip6_rt_put(rt);
		err = -ENOBUFS;
		goto errout;
	}

	/* hand our reference on rt to the skb; released with the skb */
	skb_dst_set(skb, &rt->dst);
	if (fibmatch)
		err = rt6_fill_node(net, skb, rt, NULL, NULL, iif,
				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
				    nlh->nlmsg_seq, 0);
	else
		err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
				    nlh->nlmsg_seq, 0);
	if (err < 0) {
		kfree_skb(skb);
		goto errout;
	}

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
	return err;
}
3761 
/* Broadcast an RTM_NEWROUTE/RTM_DELROUTE @event for @rt to
 * RTNLGRP_IPV6_ROUTE listeners; on failure the error is recorded on
 * the group so listeners can detect the lost notification.
 */
void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
		     unsigned int nlm_flags)
{
	struct sk_buff *skb;
	struct net *net = info->nl_net;
	u32 seq;
	int err;

	err = -ENOBUFS;
	seq = info->nlh ? info->nlh->nlmsg_seq : 0;

	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
	if (!skb)
		goto errout;

	err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
				event, info->portid, seq, nlm_flags);
	if (err < 0) {
		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}
	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
		    info->nlh, gfp_any());
	return;
errout:
	if (err < 0)
		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
}
3792 
/* netdev notifier: keep the per-netns special routes (null, and with
 * multiple tables also prohibit/blackhole) pointing at the loopback
 * device, taking/dropping idev references on register/unregister.
 */
static int ip6_route_dev_notify(struct notifier_block *this,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);

	/* only the loopback device backs the special routes */
	if (!(dev->flags & IFF_LOOPBACK))
		return NOTIFY_OK;

	if (event == NETDEV_REGISTER) {
		net->ipv6.ip6_null_entry->dst.dev = dev;
		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
#endif
	 } else if (event == NETDEV_UNREGISTER &&
		    dev->reg_state != NETREG_UNREGISTERED) {
		/* NETDEV_UNREGISTER could be fired for multiple times by
		 * netdev_wait_allrefs(). Make sure we only call this once.
		 */
		in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
		in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
#endif
	}

	return NOTIFY_OK;
}
3825 
3826 /*
3827  *	/proc
3828  */
3829 
3830 #ifdef CONFIG_PROC_FS
3831 
/* /proc/net/ipv6_route: full route table dump (ipv6_route_open is
 * defined earlier in this file, outside this excerpt).
 */
static const struct file_operations ipv6_route_proc_fops = {
	.owner		= THIS_MODULE,
	.open		= ipv6_route_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_net,
};
3839 
/* /proc/net/rt6_stats: one line of hex fib/route-cache counters for
 * this netns.
 */
static int rt6_stats_seq_show(struct seq_file *seq, void *v)
{
	struct net *net = (struct net *)seq->private;
	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
		   net->ipv6.rt6_stats->fib_nodes,
		   net->ipv6.rt6_stats->fib_route_nodes,
		   net->ipv6.rt6_stats->fib_rt_alloc,
		   net->ipv6.rt6_stats->fib_rt_entries,
		   net->ipv6.rt6_stats->fib_rt_cache,
		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
		   net->ipv6.rt6_stats->fib_discarded_routes);

	return 0;
}
3854 
/* open handler for /proc/net/rt6_stats: netns-aware single_open */
static int rt6_stats_seq_open(struct inode *inode, struct file *file)
{
	return single_open_net(inode, file, rt6_stats_seq_show);
}
3859 
/* file operations for /proc/net/rt6_stats */
static const struct file_operations rt6_stats_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt6_stats_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = single_release_net,
};
3867 #endif	/* CONFIG_PROC_FS */
3868 
3869 #ifdef CONFIG_SYSCTL
3870 
3871 static
ipv6_sysctl_rtcache_flush(struct ctl_table * ctl,int write,void __user * buffer,size_t * lenp,loff_t * ppos)3872 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
3873 			      void __user *buffer, size_t *lenp, loff_t *ppos)
3874 {
3875 	struct net *net;
3876 	int delay;
3877 	if (!write)
3878 		return -EINVAL;
3879 
3880 	net = (struct net *)ctl->extra1;
3881 	delay = net->ipv6.sysctl.flush_delay;
3882 	proc_dointvec(ctl, write, buffer, lenp, ppos);
3883 	fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
3884 	return 0;
3885 }
3886 
/* Template for the per-netns net.ipv6.route.* sysctl table.  It is
 * cloned and re-pointed at per-netns storage by
 * ipv6_route_sysctl_init(), which indexes entries by position — keep
 * the order here in sync with that function.
 */
struct ctl_table ipv6_route_table_template[] = {
	{
		.procname	=	"flush",
		.data		=	&init_net.ipv6.sysctl.flush_delay,
		.maxlen		=	sizeof(int),
		.mode		=	0200,	/* write-only trigger */
		.proc_handler	=	ipv6_sysctl_rtcache_flush
	},
	{
		.procname	=	"gc_thresh",
		.data		=	&ip6_dst_ops_template.gc_thresh,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"max_size",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"gc_min_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_timeout",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_elasticity",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"mtu_expires",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"min_adv_mss",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		/* same variable as gc_min_interval, in milliseconds */
		.procname	=	"gc_min_interval_ms",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_ms_jiffies,
	},
	{ }
};
3960 
/* Clone ipv6_route_table_template for @net and point each entry at the
 * per-netns variable.  The numeric indices must match the template's
 * entry order.  Returns the table (caller registers/frees it) or NULL
 * on allocation failure.
 */
struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
{
	struct ctl_table *table;

	table = kmemdup(ipv6_route_table_template,
			sizeof(ipv6_route_table_template),
			GFP_KERNEL);

	if (table) {
		table[0].data = &net->ipv6.sysctl.flush_delay;
		table[0].extra1 = net;	/* netns for the flush handler */
		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;

		/* Don't export sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns)
			table[0].procname = NULL;
	}

	return table;
}
3989 #endif
3990 
/* Per-netns init: set up dst ops, the null (and, with multiple
 * tables, prohibit/blackhole) template routes and default sysctl
 * values.  Unwinds everything already set up on failure.
 */
static int __net_init ip6_route_net_init(struct net *net)
{
	int ret = -ENOMEM;

	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
	       sizeof(net->ipv6.ip6_dst_ops));

	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
		goto out_ip6_dst_ops;

	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
					   sizeof(*net->ipv6.ip6_null_entry),
					   GFP_KERNEL);
	if (!net->ipv6.ip6_null_entry)
		goto out_ip6_dst_entries;
	/* the special routes are their own path endpoints */
	net->ipv6.ip6_null_entry->dst.path =
		(struct dst_entry *)net->ipv6.ip6_null_entry;
	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
			 ip6_template_metrics, true);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	net->ipv6.fib6_has_custom_rules = false;
	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
					       sizeof(*net->ipv6.ip6_prohibit_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_prohibit_entry)
		goto out_ip6_null_entry;
	net->ipv6.ip6_prohibit_entry->dst.path =
		(struct dst_entry *)net->ipv6.ip6_prohibit_entry;
	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
			 ip6_template_metrics, true);

	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
					       sizeof(*net->ipv6.ip6_blk_hole_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_blk_hole_entry)
		goto out_ip6_prohibit_entry;
	net->ipv6.ip6_blk_hole_entry->dst.path =
		(struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
			 ip6_template_metrics, true);
#endif

	/* default values; overridable via the route sysctl table */
	net->ipv6.sysctl.flush_delay = 0;
	net->ipv6.sysctl.ip6_rt_max_size = 4096;
	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;

	net->ipv6.ip6_rt_gc_expire = 30*HZ;

	ret = 0;
out:
	return ret;

	/* error unwinding, innermost failure first */
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
	kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
	kfree(net->ipv6.ip6_null_entry);
#endif
out_ip6_dst_entries:
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
	goto out;
}
4063 
ip6_route_net_exit(struct net * net)4064 static void __net_exit ip6_route_net_exit(struct net *net)
4065 {
4066 	kfree(net->ipv6.ip6_null_entry);
4067 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4068 	kfree(net->ipv6.ip6_prohibit_entry);
4069 	kfree(net->ipv6.ip6_blk_hole_entry);
4070 #endif
4071 	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
4072 }
4073 
ip6_route_net_init_late(struct net * net)4074 static int __net_init ip6_route_net_init_late(struct net *net)
4075 {
4076 #ifdef CONFIG_PROC_FS
4077 	proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
4078 	proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
4079 #endif
4080 	return 0;
4081 }
4082 
ip6_route_net_exit_late(struct net * net)4083 static void __net_exit ip6_route_net_exit_late(struct net *net)
4084 {
4085 #ifdef CONFIG_PROC_FS
4086 	remove_proc_entry("ipv6_route", net->proc_net);
4087 	remove_proc_entry("rt6_stats", net->proc_net);
4088 #endif
4089 }
4090 
/* Core routing state: registered early in ip6_route_init(). */
static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
};
4095 
ipv6_inetpeer_init(struct net * net)4096 static int __net_init ipv6_inetpeer_init(struct net *net)
4097 {
4098 	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
4099 
4100 	if (!bp)
4101 		return -ENOMEM;
4102 	inet_peer_base_init(bp);
4103 	net->ipv6.peers = bp;
4104 	return 0;
4105 }
4106 
ipv6_inetpeer_exit(struct net * net)4107 static void __net_exit ipv6_inetpeer_exit(struct net *net)
4108 {
4109 	struct inet_peer_base *bp = net->ipv6.peers;
4110 
4111 	net->ipv6.peers = NULL;
4112 	inetpeer_invalidate_tree(bp);
4113 	kfree(bp);
4114 }
4115 
/* Per-namespace inet_peer base lifetime. */
static struct pernet_operations ipv6_inetpeer_ops = {
	.init	=	ipv6_inetpeer_init,
	.exit	=	ipv6_inetpeer_exit,
};
4120 
/* /proc entries: registered last in ip6_route_init(), after fib6 rules. */
static struct pernet_operations ip6_route_net_late_ops = {
	.init = ip6_route_net_init_late,
	.exit = ip6_route_net_exit_late,
};
4125 
/* Netdevice event hook; priority is set relative to ADDRCONF_NOTIFY_PRIORITY
 * — presumably so this runs after the addrconf notifier (TODO confirm). */
static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
};
4130 
ip6_route_init_special_entries(void)4131 void __init ip6_route_init_special_entries(void)
4132 {
4133 	/* Registering of the loopback is done before this portion of code,
4134 	 * the loopback reference in rt6_info will not be taken, do it
4135 	 * manually for init_net */
4136 	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
4137 	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
4138   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4139 	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
4140 	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
4141 	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
4142 	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
4143   #endif
4144 }
4145 
/*
 * Boot-time initialisation of the IPv6 routing subsystem.
 *
 * Sets up the rt6_info slab cache, registers the pernet subsystems
 * (inetpeer, core routing, late /proc), initialises fib6, xfrm6 and
 * fib6 rules, registers the rtnetlink route handlers and the netdevice
 * notifier, and initialises the per-CPU uncached route lists.
 *
 * Returns 0 on success or a negative errno; on failure every step
 * already completed is undone in reverse order via the goto ladder.
 */
int __init ip6_route_init(void)
{
	int ret;
	int cpu;

	ret = -ENOMEM;
	ip6_dst_ops_template.kmem_cachep =
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!ip6_dst_ops_template.kmem_cachep)
		goto out;

	ret = dst_entries_init(&ip6_dst_blackhole_ops);
	if (ret)
		goto out_kmem_cache;

	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
	if (ret)
		goto out_dst_entries;

	ret = register_pernet_subsys(&ip6_route_net_ops);
	if (ret)
		goto out_register_inetpeer;

	/* Blackhole dsts share the same slab cache as regular routes. */
	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

	ret = fib6_init();
	if (ret)
		goto out_register_subsys;

	ret = xfrm6_init();
	if (ret)
		goto out_fib6_init;

	ret = fib6_rules_init();
	if (ret)
		goto xfrm6_init;

	ret = register_pernet_subsys(&ip6_route_net_late_ops);
	if (ret)
		goto fib6_rules_init;

	/* rtnetlink handlers for route add/del/get; GETROUTE may run
	 * without the rtnl lock held. */
	ret = -ENOBUFS;
	if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, 0) ||
	    __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, 0) ||
	    __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL,
			    RTNL_FLAG_DOIT_UNLOCKED))
		goto out_register_late_subsys;

	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
	if (ret)
		goto out_register_late_subsys;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}

out:
	return ret;

	/* Error unwind, strictly the reverse of the setup order above. */
out_register_late_subsys:
	unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
	fib6_rules_cleanup();
xfrm6_init:
	xfrm6_fini();
out_fib6_init:
	fib6_gc_cleanup();
out_register_subsys:
	unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
	dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
	goto out;
}
4227 
/*
 * Tear down the IPv6 routing subsystem.
 *
 * Mirrors ip6_route_init(): each step is undone in the reverse order it
 * was set up, so do not reorder these calls.
 */
void ip6_route_cleanup(void)
{
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}
4240