1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		ROUTE - implementation of the IP router.
8  *
9  * Authors:	Ross Biro
10  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
11  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
12  *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
13  *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
14  *
15  * Fixes:
16  *		Alan Cox	:	Verify area fixes.
17  *		Alan Cox	:	cli() protects routing changes
18  *		Rui Oliveira	:	ICMP routing table updates
19  *		(rco@di.uminho.pt)	Routing table insertion and update
20  *		Linus Torvalds	:	Rewrote bits to be sensible
21  *		Alan Cox	:	Added BSD route gw semantics
22  *		Alan Cox	:	Super /proc >4K
23  *		Alan Cox	:	MTU in route table
24  *		Alan Cox	: 	MSS actually. Also added the window
25  *					clamper.
26  *		Sam Lantinga	:	Fixed route matching in rt_del()
27  *		Alan Cox	:	Routing cache support.
28  *		Alan Cox	:	Removed compatibility cruft.
29  *		Alan Cox	:	RTF_REJECT support.
30  *		Alan Cox	:	TCP irtt support.
31  *		Jonathan Naylor	:	Added Metric support.
32  *	Miquel van Smoorenburg	:	BSD API fixes.
33  *	Miquel van Smoorenburg	:	Metrics.
34  *		Alan Cox	:	Use __u32 properly
35  *		Alan Cox	:	Aligned routing errors more closely with BSD
36  *					though our system is still very different.
37  *		Alan Cox	:	Faster /proc handling
38  *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
39  *					routing caches and better behaviour.
40  *
41  *		Olaf Erb	:	irtt wasn't being copied right.
42  *		Bjorn Ekwall	:	Kerneld route support.
43  *		Alan Cox	:	Multicast fixed (I hope)
44  * 		Pavel Krauz	:	Limited broadcast fixed
45  *		Mike McLagan	:	Routing by source
46  *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
47  *					route.c and rewritten from scratch.
48  *		Andi Kleen	:	Load-limit warning messages.
49  *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
50  *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
51  *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
52  *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
53  *		Marc Boucher	:	routing by fwmark
54  *	Robert Olsson		:	Added rt_cache statistics
55  *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
56  *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
57  * 	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
58  * 	Ilia Sotnikov		:	Removed TOS from hash calculations
59  */
60 
61 #define pr_fmt(fmt) "IPv4: " fmt
62 
63 #include <linux/module.h>
64 #include <linux/uaccess.h>
65 #include <linux/bitops.h>
66 #include <linux/types.h>
67 #include <linux/kernel.h>
68 #include <linux/mm.h>
69 #include <linux/memblock.h>
70 #include <linux/string.h>
71 #include <linux/socket.h>
72 #include <linux/sockios.h>
73 #include <linux/errno.h>
74 #include <linux/in.h>
75 #include <linux/inet.h>
76 #include <linux/netdevice.h>
77 #include <linux/proc_fs.h>
78 #include <linux/init.h>
79 #include <linux/skbuff.h>
80 #include <linux/inetdevice.h>
81 #include <linux/igmp.h>
82 #include <linux/pkt_sched.h>
83 #include <linux/mroute.h>
84 #include <linux/netfilter_ipv4.h>
85 #include <linux/random.h>
86 #include <linux/rcupdate.h>
87 #include <linux/times.h>
88 #include <linux/slab.h>
89 #include <linux/jhash.h>
90 #include <net/dst.h>
91 #include <net/dst_metadata.h>
92 #include <net/net_namespace.h>
93 #include <net/protocol.h>
94 #include <net/ip.h>
95 #include <net/route.h>
96 #include <net/inetpeer.h>
97 #include <net/sock.h>
98 #include <net/ip_fib.h>
99 #include <net/nexthop.h>
100 #include <net/arp.h>
101 #include <net/tcp.h>
102 #include <net/icmp.h>
103 #include <net/xfrm.h>
104 #include <net/lwtunnel.h>
105 #include <net/netevent.h>
106 #include <net/rtnetlink.h>
107 #ifdef CONFIG_SYSCTL
108 #include <linux/sysctl.h>
109 #endif
110 #include <net/secure_seq.h>
111 #include <net/ip_tunnels.h>
112 #include <net/l3mdev.h>
113 
114 #include "fib_lookup.h"
115 
116 #define RT_FL_TOS(oldflp4) \
117 	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
118 
119 #define RT_GC_TIMEOUT (300*HZ)
120 
121 static int ip_rt_max_size;
122 static int ip_rt_redirect_number __read_mostly	= 9;
123 static int ip_rt_redirect_load __read_mostly	= HZ / 50;
124 static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
125 static int ip_rt_error_cost __read_mostly	= HZ;
126 static int ip_rt_error_burst __read_mostly	= 5 * HZ;
127 static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
128 static u32 ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
129 static int ip_rt_min_advmss __read_mostly	= 256;
130 
131 static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
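/* Illustration (not part of the original file, assumes the common HZ=1000):
 * with the defaults above, ip_rt_redirect_load is ~20 ms between redirects,
 * ip_rt_redirect_silence is (HZ/50) << 10, i.e. ~20.5 s, ip_rt_error_burst
 * allows a burst of about five ICMP errors (each send costs ip_rt_error_cost
 * = HZ tokens, capped at 5*HZ), and ip_rt_mtu_expires keeps a learned PMTU
 * for 10 minutes. All of these scale with the configured HZ.
 */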
132 
133 /*
134  *	Interface to generic destination cache.
135  */
136 
137 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
138 static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
139 static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
140 static void		ipv4_negative_advice(struct sock *sk,
141 					     struct dst_entry *dst);
142 static void		 ipv4_link_failure(struct sk_buff *skb);
143 static void		 ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
144 					   struct sk_buff *skb, u32 mtu,
145 					   bool confirm_neigh);
146 static void		 ip_do_redirect(struct dst_entry *dst, struct sock *sk,
147 					struct sk_buff *skb);
148 static void		ipv4_dst_destroy(struct dst_entry *dst);
149 
150 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
151 {
152 	WARN_ON(1);
153 	return NULL;
154 }
155 
156 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
157 					   struct sk_buff *skb,
158 					   const void *daddr);
159 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
160 
161 static struct dst_ops ipv4_dst_ops = {
162 	.family =		AF_INET,
163 	.check =		ipv4_dst_check,
164 	.default_advmss =	ipv4_default_advmss,
165 	.mtu =			ipv4_mtu,
166 	.cow_metrics =		ipv4_cow_metrics,
167 	.destroy =		ipv4_dst_destroy,
168 	.negative_advice =	ipv4_negative_advice,
169 	.link_failure =		ipv4_link_failure,
170 	.update_pmtu =		ip_rt_update_pmtu,
171 	.redirect =		ip_do_redirect,
172 	.local_out =		__ip_local_out,
173 	.neigh_lookup =		ipv4_neigh_lookup,
174 	.confirm_neigh =	ipv4_confirm_neigh,
175 };
176 
177 #define ECN_OR_COST(class)	TC_PRIO_##class
178 
179 const __u8 ip_tos2prio[16] = {
180 	TC_PRIO_BESTEFFORT,
181 	ECN_OR_COST(BESTEFFORT),
182 	TC_PRIO_BESTEFFORT,
183 	ECN_OR_COST(BESTEFFORT),
184 	TC_PRIO_BULK,
185 	ECN_OR_COST(BULK),
186 	TC_PRIO_BULK,
187 	ECN_OR_COST(BULK),
188 	TC_PRIO_INTERACTIVE,
189 	ECN_OR_COST(INTERACTIVE),
190 	TC_PRIO_INTERACTIVE,
191 	ECN_OR_COST(INTERACTIVE),
192 	TC_PRIO_INTERACTIVE_BULK,
193 	ECN_OR_COST(INTERACTIVE_BULK),
194 	TC_PRIO_INTERACTIVE_BULK,
195 	ECN_OR_COST(INTERACTIVE_BULK)
196 };
197 EXPORT_SYMBOL(ip_tos2prio);
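/* Illustration (an assumption of this note, not part of this file): the
 * usual consumer of this table is the rt_tos2priority() helper in
 * include/net/route.h, which indexes it with the TOS bits of the IPv4
 * header, roughly:
 *
 *	prio = ip_tos2prio[IPTOS_TOS(tos) >> 1];
 *
 * so, for example, an IPTOS_LOWDELAY TOS maps to TC_PRIO_INTERACTIVE.
 */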
198 
199 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
200 #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
201 
202 #ifdef CONFIG_PROC_FS
203 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
204 {
205 	if (*pos)
206 		return NULL;
207 	return SEQ_START_TOKEN;
208 }
209 
210 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
211 {
212 	++*pos;
213 	return NULL;
214 }
215 
216 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
217 {
218 }
219 
220 static int rt_cache_seq_show(struct seq_file *seq, void *v)
221 {
222 	if (v == SEQ_START_TOKEN)
223 		seq_printf(seq, "%-127s\n",
224 			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
225 			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
226 			   "HHUptod\tSpecDst");
227 	return 0;
228 }
229 
230 static const struct seq_operations rt_cache_seq_ops = {
231 	.start  = rt_cache_seq_start,
232 	.next   = rt_cache_seq_next,
233 	.stop   = rt_cache_seq_stop,
234 	.show   = rt_cache_seq_show,
235 };
236 
237 static int rt_cache_seq_open(struct inode *inode, struct file *file)
238 {
239 	return seq_open(file, &rt_cache_seq_ops);
240 }
241 
242 static const struct proc_ops rt_cache_proc_ops = {
243 	.proc_open	= rt_cache_seq_open,
244 	.proc_read	= seq_read,
245 	.proc_lseek	= seq_lseek,
246 	.proc_release	= seq_release,
247 };
248 
249 
250 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
251 {
252 	int cpu;
253 
254 	if (*pos == 0)
255 		return SEQ_START_TOKEN;
256 
257 	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
258 		if (!cpu_possible(cpu))
259 			continue;
260 		*pos = cpu+1;
261 		return &per_cpu(rt_cache_stat, cpu);
262 	}
263 	return NULL;
264 }
265 
266 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
267 {
268 	int cpu;
269 
270 	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
271 		if (!cpu_possible(cpu))
272 			continue;
273 		*pos = cpu+1;
274 		return &per_cpu(rt_cache_stat, cpu);
275 	}
276 	(*pos)++;
277 	return NULL;
278 
279 }
280 
281 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
282 {
283 
284 }
285 
286 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
287 {
288 	struct rt_cache_stat *st = v;
289 
290 	if (v == SEQ_START_TOKEN) {
291 		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
292 		return 0;
293 	}
294 
295 	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
296 		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
297 		   dst_entries_get_slow(&ipv4_dst_ops),
298 		   0, /* st->in_hit */
299 		   st->in_slow_tot,
300 		   st->in_slow_mc,
301 		   st->in_no_route,
302 		   st->in_brd,
303 		   st->in_martian_dst,
304 		   st->in_martian_src,
305 
306 		   0, /* st->out_hit */
307 		   st->out_slow_tot,
308 		   st->out_slow_mc,
309 
310 		   0, /* st->gc_total */
311 		   0, /* st->gc_ignored */
312 		   0, /* st->gc_goal_miss */
313 		   0, /* st->gc_dst_overflow */
314 		   0, /* st->in_hlist_search */
315 		   0  /* st->out_hlist_search */
316 		);
317 	return 0;
318 }
319 
320 static const struct seq_operations rt_cpu_seq_ops = {
321 	.start  = rt_cpu_seq_start,
322 	.next   = rt_cpu_seq_next,
323 	.stop   = rt_cpu_seq_stop,
324 	.show   = rt_cpu_seq_show,
325 };
326 
327 
328 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
329 {
330 	return seq_open(file, &rt_cpu_seq_ops);
331 }
332 
333 static const struct proc_ops rt_cpu_proc_ops = {
334 	.proc_open	= rt_cpu_seq_open,
335 	.proc_read	= seq_read,
336 	.proc_lseek	= seq_lseek,
337 	.proc_release	= seq_release,
338 };
339 
340 #ifdef CONFIG_IP_ROUTE_CLASSID
341 static int rt_acct_proc_show(struct seq_file *m, void *v)
342 {
343 	struct ip_rt_acct *dst, *src;
344 	unsigned int i, j;
345 
346 	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
347 	if (!dst)
348 		return -ENOMEM;
349 
350 	for_each_possible_cpu(i) {
351 		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
352 		for (j = 0; j < 256; j++) {
353 			dst[j].o_bytes   += src[j].o_bytes;
354 			dst[j].o_packets += src[j].o_packets;
355 			dst[j].i_bytes   += src[j].i_bytes;
356 			dst[j].i_packets += src[j].i_packets;
357 		}
358 	}
359 
360 	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
361 	kfree(dst);
362 	return 0;
363 }
364 #endif
365 
366 static int __net_init ip_rt_do_proc_init(struct net *net)
367 {
368 	struct proc_dir_entry *pde;
369 
370 	pde = proc_create("rt_cache", 0444, net->proc_net,
371 			  &rt_cache_proc_ops);
372 	if (!pde)
373 		goto err1;
374 
375 	pde = proc_create("rt_cache", 0444,
376 			  net->proc_net_stat, &rt_cpu_proc_ops);
377 	if (!pde)
378 		goto err2;
379 
380 #ifdef CONFIG_IP_ROUTE_CLASSID
381 	pde = proc_create_single("rt_acct", 0, net->proc_net,
382 			rt_acct_proc_show);
383 	if (!pde)
384 		goto err3;
385 #endif
386 	return 0;
387 
388 #ifdef CONFIG_IP_ROUTE_CLASSID
389 err3:
390 	remove_proc_entry("rt_cache", net->proc_net_stat);
391 #endif
392 err2:
393 	remove_proc_entry("rt_cache", net->proc_net);
394 err1:
395 	return -ENOMEM;
396 }
397 
398 static void __net_exit ip_rt_do_proc_exit(struct net *net)
399 {
400 	remove_proc_entry("rt_cache", net->proc_net_stat);
401 	remove_proc_entry("rt_cache", net->proc_net);
402 #ifdef CONFIG_IP_ROUTE_CLASSID
403 	remove_proc_entry("rt_acct", net->proc_net);
404 #endif
405 }
406 
407 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
408 	.init = ip_rt_do_proc_init,
409 	.exit = ip_rt_do_proc_exit,
410 };
411 
412 static int __init ip_rt_proc_init(void)
413 {
414 	return register_pernet_subsys(&ip_rt_proc_ops);
415 }
416 
417 #else
418 static inline int ip_rt_proc_init(void)
419 {
420 	return 0;
421 }
422 #endif /* CONFIG_PROC_FS */
423 
424 static inline bool rt_is_expired(const struct rtable *rth)
425 {
426 	return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
427 }
428 
429 void rt_cache_flush(struct net *net)
430 {
431 	rt_genid_bump_ipv4(net);
432 }
433 
434 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
435 					   struct sk_buff *skb,
436 					   const void *daddr)
437 {
438 	const struct rtable *rt = container_of(dst, struct rtable, dst);
439 	struct net_device *dev = dst->dev;
440 	struct neighbour *n;
441 
442 	rcu_read_lock_bh();
443 
444 	if (likely(rt->rt_gw_family == AF_INET)) {
445 		n = ip_neigh_gw4(dev, rt->rt_gw4);
446 	} else if (rt->rt_gw_family == AF_INET6) {
447 		n = ip_neigh_gw6(dev, &rt->rt_gw6);
448         } else {
449 		__be32 pkey;
450 
451 		pkey = skb ? ip_hdr(skb)->daddr : *((__be32 *) daddr);
452 		n = ip_neigh_gw4(dev, pkey);
453 	}
454 
455 	if (!IS_ERR(n) && !refcount_inc_not_zero(&n->refcnt))
456 		n = NULL;
457 
458 	rcu_read_unlock_bh();
459 
460 	return n;
461 }
462 
463 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
464 {
465 	const struct rtable *rt = container_of(dst, struct rtable, dst);
466 	struct net_device *dev = dst->dev;
467 	const __be32 *pkey = daddr;
468 
469 	if (rt->rt_gw_family == AF_INET) {
470 		pkey = (const __be32 *)&rt->rt_gw4;
471 	} else if (rt->rt_gw_family == AF_INET6) {
472 		return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6);
473 	} else if (!daddr ||
474 		 (rt->rt_flags &
475 		  (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) {
476 		return;
477 	}
478 	__ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
479 }
480 
481 /* Hash tables of size 2048..262144 depending on RAM size.
482  * Each bucket uses 8 bytes.
483  */
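/* Worked numbers for the comment above: at 8 bytes per bucket (one atomic_t
 * in ip_idents plus one u32 in ip_tstamps), 2048 buckets cost ~16 KiB and
 * 262144 buckets cost ~2 MiB.
 */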
484 static u32 ip_idents_mask __read_mostly;
485 static atomic_t *ip_idents __read_mostly;
486 static u32 *ip_tstamps __read_mostly;
487 
488 /* In order to protect privacy, we add a perturbation to identifiers
489  * if one generator is seldom used. This makes it hard for an attacker
490  * to infer how many packets were sent between two points in time.
491  */
492 u32 ip_idents_reserve(u32 hash, int segs)
493 {
494 	u32 bucket, old, now = (u32)jiffies;
495 	atomic_t *p_id;
496 	u32 *p_tstamp;
497 	u32 delta = 0;
498 
499 	bucket = hash & ip_idents_mask;
500 	p_tstamp = ip_tstamps + bucket;
501 	p_id = ip_idents + bucket;
502 	old = READ_ONCE(*p_tstamp);
503 
504 	if (old != now && cmpxchg(p_tstamp, old, now) == old)
505 		delta = prandom_u32_max(now - old);
506 
507 	/* If UBSAN reports an error here, please make sure your compiler
508 	 * supports -fno-strict-overflow before reporting it; that was a bug
509 	 * in UBSAN, and it has been fixed in GCC 8.
510 	 */
511 	return atomic_add_return(segs + delta, p_id) - segs;
512 }
513 EXPORT_SYMBOL(ip_idents_reserve);
514 
515 void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
516 {
517 	u32 hash, id;
518 
519 	/* Note the following code is not safe, but this is okay. */
520 	if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
521 		get_random_bytes(&net->ipv4.ip_id_key,
522 				 sizeof(net->ipv4.ip_id_key));
523 
524 	hash = siphash_3u32((__force u32)iph->daddr,
525 			    (__force u32)iph->saddr,
526 			    iph->protocol,
527 			    &net->ipv4.ip_id_key);
528 	id = ip_idents_reserve(hash, segs);
529 	iph->id = htons(id);
530 }
531 EXPORT_SYMBOL(__ip_select_ident);
532 
533 static void ip_rt_fix_tos(struct flowi4 *fl4)
534 {
535 	__u8 tos = RT_FL_TOS(fl4);
536 
537 	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
538 	fl4->flowi4_scope = tos & RTO_ONLINK ?
539 			    RT_SCOPE_LINK : RT_SCOPE_UNIVERSE;
540 }
541 
542 static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
543 			     const struct sock *sk,
544 			     const struct iphdr *iph,
545 			     int oif, u8 tos,
546 			     u8 prot, u32 mark, int flow_flags)
547 {
548 	if (sk) {
549 		const struct inet_sock *inet = inet_sk(sk);
550 
551 		oif = sk->sk_bound_dev_if;
552 		mark = sk->sk_mark;
553 		tos = RT_CONN_FLAGS(sk);
554 		prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
555 	}
556 	flowi4_init_output(fl4, oif, mark, tos,
557 			   RT_SCOPE_UNIVERSE, prot,
558 			   flow_flags,
559 			   iph->daddr, iph->saddr, 0, 0,
560 			   sock_net_uid(net, sk));
561 }
562 
563 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
564 			       const struct sock *sk)
565 {
566 	const struct net *net = dev_net(skb->dev);
567 	const struct iphdr *iph = ip_hdr(skb);
568 	int oif = skb->dev->ifindex;
569 	u8 tos = RT_TOS(iph->tos);
570 	u8 prot = iph->protocol;
571 	u32 mark = skb->mark;
572 
573 	__build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
574 }
575 
576 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
577 {
578 	const struct inet_sock *inet = inet_sk(sk);
579 	const struct ip_options_rcu *inet_opt;
580 	__be32 daddr = inet->inet_daddr;
581 
582 	rcu_read_lock();
583 	inet_opt = rcu_dereference(inet->inet_opt);
584 	if (inet_opt && inet_opt->opt.srr)
585 		daddr = inet_opt->opt.faddr;
586 	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
587 			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
588 			   inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
589 			   inet_sk_flowi_flags(sk),
590 			   daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
591 	rcu_read_unlock();
592 }
593 
594 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
595 				 const struct sk_buff *skb)
596 {
597 	if (skb)
598 		build_skb_flow_key(fl4, skb, sk);
599 	else
600 		build_sk_flow_key(fl4, sk);
601 }
602 
603 static DEFINE_SPINLOCK(fnhe_lock);
604 
605 static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
606 {
607 	struct rtable *rt;
608 
609 	rt = rcu_dereference(fnhe->fnhe_rth_input);
610 	if (rt) {
611 		RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
612 		dst_dev_put(&rt->dst);
613 		dst_release(&rt->dst);
614 	}
615 	rt = rcu_dereference(fnhe->fnhe_rth_output);
616 	if (rt) {
617 		RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
618 		dst_dev_put(&rt->dst);
619 		dst_release(&rt->dst);
620 	}
621 }
622 
623 static void fnhe_remove_oldest(struct fnhe_hash_bucket *hash)
624 {
625 	struct fib_nh_exception __rcu **fnhe_p, **oldest_p;
626 	struct fib_nh_exception *fnhe, *oldest = NULL;
627 
628 	for (fnhe_p = &hash->chain; ; fnhe_p = &fnhe->fnhe_next) {
629 		fnhe = rcu_dereference_protected(*fnhe_p,
630 						 lockdep_is_held(&fnhe_lock));
631 		if (!fnhe)
632 			break;
633 		if (!oldest ||
634 		    time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp)) {
635 			oldest = fnhe;
636 			oldest_p = fnhe_p;
637 		}
638 	}
639 	fnhe_flush_routes(oldest);
640 	*oldest_p = oldest->fnhe_next;
641 	kfree_rcu(oldest, rcu);
642 }
643 
644 static u32 fnhe_hashfun(__be32 daddr)
645 {
646 	static siphash_key_t fnhe_hash_key __read_mostly;
647 	u64 hval;
648 
649 	net_get_random_once(&fnhe_hash_key, sizeof(fnhe_hash_key));
650 	hval = siphash_1u32((__force u32)daddr, &fnhe_hash_key);
651 	return hash_64(hval, FNHE_HASH_SHIFT);
652 }
653 
654 static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
655 {
656 	rt->rt_pmtu = fnhe->fnhe_pmtu;
657 	rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
658 	rt->dst.expires = fnhe->fnhe_expires;
659 
660 	if (fnhe->fnhe_gw) {
661 		rt->rt_flags |= RTCF_REDIRECTED;
662 		rt->rt_uses_gateway = 1;
663 		rt->rt_gw_family = AF_INET;
664 		rt->rt_gw4 = fnhe->fnhe_gw;
665 	}
666 }
667 
668 static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr,
669 				  __be32 gw, u32 pmtu, bool lock,
670 				  unsigned long expires)
671 {
672 	struct fnhe_hash_bucket *hash;
673 	struct fib_nh_exception *fnhe;
674 	struct rtable *rt;
675 	u32 genid, hval;
676 	unsigned int i;
677 	int depth;
678 
679 	genid = fnhe_genid(dev_net(nhc->nhc_dev));
680 	hval = fnhe_hashfun(daddr);
681 
682 	spin_lock_bh(&fnhe_lock);
683 
684 	hash = rcu_dereference(nhc->nhc_exceptions);
685 	if (!hash) {
686 		hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
687 		if (!hash)
688 			goto out_unlock;
689 		rcu_assign_pointer(nhc->nhc_exceptions, hash);
690 	}
691 
692 	hash += hval;
693 
694 	depth = 0;
695 	for (fnhe = rcu_dereference(hash->chain); fnhe;
696 	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
697 		if (fnhe->fnhe_daddr == daddr)
698 			break;
699 		depth++;
700 	}
701 
702 	if (fnhe) {
703 		if (fnhe->fnhe_genid != genid)
704 			fnhe->fnhe_genid = genid;
705 		if (gw)
706 			fnhe->fnhe_gw = gw;
707 		if (pmtu) {
708 			fnhe->fnhe_pmtu = pmtu;
709 			fnhe->fnhe_mtu_locked = lock;
710 		}
711 		fnhe->fnhe_expires = max(1UL, expires);
712 		/* Update all cached dsts too */
713 		rt = rcu_dereference(fnhe->fnhe_rth_input);
714 		if (rt)
715 			fill_route_from_fnhe(rt, fnhe);
716 		rt = rcu_dereference(fnhe->fnhe_rth_output);
717 		if (rt)
718 			fill_route_from_fnhe(rt, fnhe);
719 	} else {
720 		/* Randomize max depth to avoid some side-channel attacks. */
721 		int max_depth = FNHE_RECLAIM_DEPTH +
722 				prandom_u32_max(FNHE_RECLAIM_DEPTH);
723 
724 		while (depth > max_depth) {
725 			fnhe_remove_oldest(hash);
726 			depth--;
727 		}
728 
729 		fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
730 		if (!fnhe)
731 			goto out_unlock;
732 
733 		fnhe->fnhe_next = hash->chain;
734 
735 		fnhe->fnhe_genid = genid;
736 		fnhe->fnhe_daddr = daddr;
737 		fnhe->fnhe_gw = gw;
738 		fnhe->fnhe_pmtu = pmtu;
739 		fnhe->fnhe_mtu_locked = lock;
740 		fnhe->fnhe_expires = max(1UL, expires);
741 
742 		rcu_assign_pointer(hash->chain, fnhe);
743 
744 		/* Exception created; mark the cached routes for the nexthop
745 		 * stale, so anyone caching it rechecks if this exception
746 		 * applies to them.
747 		 */
748 		rt = rcu_dereference(nhc->nhc_rth_input);
749 		if (rt)
750 			rt->dst.obsolete = DST_OBSOLETE_KILL;
751 
752 		for_each_possible_cpu(i) {
753 			struct rtable __rcu **prt;
754 			prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i);
755 			rt = rcu_dereference(*prt);
756 			if (rt)
757 				rt->dst.obsolete = DST_OBSOLETE_KILL;
758 		}
759 	}
760 
761 	fnhe->fnhe_stamp = jiffies;
762 
763 out_unlock:
764 	spin_unlock_bh(&fnhe_lock);
765 }
766 
767 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
768 			     bool kill_route)
769 {
770 	__be32 new_gw = icmp_hdr(skb)->un.gateway;
771 	__be32 old_gw = ip_hdr(skb)->saddr;
772 	struct net_device *dev = skb->dev;
773 	struct in_device *in_dev;
774 	struct fib_result res;
775 	struct neighbour *n;
776 	struct net *net;
777 
778 	switch (icmp_hdr(skb)->code & 7) {
779 	case ICMP_REDIR_NET:
780 	case ICMP_REDIR_NETTOS:
781 	case ICMP_REDIR_HOST:
782 	case ICMP_REDIR_HOSTTOS:
783 		break;
784 
785 	default:
786 		return;
787 	}
788 
789 	if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw)
790 		return;
791 
792 	in_dev = __in_dev_get_rcu(dev);
793 	if (!in_dev)
794 		return;
795 
796 	net = dev_net(dev);
797 	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
798 	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
799 	    ipv4_is_zeronet(new_gw))
800 		goto reject_redirect;
801 
802 	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
803 		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
804 			goto reject_redirect;
805 		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
806 			goto reject_redirect;
807 	} else {
808 		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
809 			goto reject_redirect;
810 	}
811 
812 	n = __ipv4_neigh_lookup(rt->dst.dev, (__force u32)new_gw);
813 	if (!n)
814 		n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
815 	if (!IS_ERR(n)) {
816 		if (!(n->nud_state & NUD_VALID)) {
817 			neigh_event_send(n, NULL);
818 		} else {
819 			if (fib_lookup(net, fl4, &res, 0) == 0) {
820 				struct fib_nh_common *nhc;
821 
822 				fib_select_path(net, &res, fl4, skb);
823 				nhc = FIB_RES_NHC(res);
824 				update_or_create_fnhe(nhc, fl4->daddr, new_gw,
825 						0, false,
826 						jiffies + ip_rt_gc_timeout);
827 			}
828 			if (kill_route)
829 				rt->dst.obsolete = DST_OBSOLETE_KILL;
830 			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
831 		}
832 		neigh_release(n);
833 	}
834 	return;
835 
836 reject_redirect:
837 #ifdef CONFIG_IP_ROUTE_VERBOSE
838 	if (IN_DEV_LOG_MARTIANS(in_dev)) {
839 		const struct iphdr *iph = (const struct iphdr *) skb->data;
840 		__be32 daddr = iph->daddr;
841 		__be32 saddr = iph->saddr;
842 
843 		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
844 				     "  Advised path = %pI4 -> %pI4\n",
845 				     &old_gw, dev->name, &new_gw,
846 				     &saddr, &daddr);
847 	}
848 #endif
849 	;
850 }
851 
852 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
853 {
854 	struct rtable *rt;
855 	struct flowi4 fl4;
856 	const struct iphdr *iph = (const struct iphdr *) skb->data;
857 	struct net *net = dev_net(skb->dev);
858 	int oif = skb->dev->ifindex;
859 	u8 tos = RT_TOS(iph->tos);
860 	u8 prot = iph->protocol;
861 	u32 mark = skb->mark;
862 
863 	rt = (struct rtable *) dst;
864 
865 	__build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
866 	ip_rt_fix_tos(&fl4);
867 	__ip_do_redirect(rt, skb, &fl4, true);
868 }
869 
870 static void ipv4_negative_advice(struct sock *sk,
871 				 struct dst_entry *dst)
872 {
873 	struct rtable *rt = (struct rtable *)dst;
874 
875 	if ((dst->obsolete > 0) ||
876 	    (rt->rt_flags & RTCF_REDIRECTED) ||
877 	    rt->dst.expires)
878 		sk_dst_reset(sk);
879 }
880 
881 /*
882  * Algorithm:
883  *	1. The first ip_rt_redirect_number redirects are sent
884  *	   with exponential backoff, then we stop sending them at all,
885  *	   assuming that the host ignores our redirects.
886  *	2. If we did not see packets requiring redirects
887  *	   during ip_rt_redirect_silence, we assume that the host
888  *	   forgot the redirected route and start sending redirects again.
889  *
890  * This algorithm is much cheaper and more intelligent than dumb load limiting
891  * in icmp.c.
892  *
893  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
894  * and "frag. need" (breaks PMTU discovery) in icmp.c.
895  */
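/* Illustrative timeline (assumes the defaults above and HZ=1000): the first
 * redirect to a peer is sent immediately; redirect number k is then held
 * back until rate_last + (ip_rt_redirect_load << k), i.e. roughly 40 ms,
 * 80 ms, ... up to ~5.1 s between sends. Once ip_rt_redirect_number (9)
 * redirects have been ignored we stop entirely, and resume only after no
 * triggering packet has been seen for ip_rt_redirect_silence (~20 s).
 */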
896 
897 void ip_rt_send_redirect(struct sk_buff *skb)
898 {
899 	struct rtable *rt = skb_rtable(skb);
900 	struct in_device *in_dev;
901 	struct inet_peer *peer;
902 	struct net *net;
903 	int log_martians;
904 	int vif;
905 
906 	rcu_read_lock();
907 	in_dev = __in_dev_get_rcu(rt->dst.dev);
908 	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
909 		rcu_read_unlock();
910 		return;
911 	}
912 	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
913 	vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
914 	rcu_read_unlock();
915 
916 	net = dev_net(rt->dst.dev);
917 	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
918 	if (!peer) {
919 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
920 			  rt_nexthop(rt, ip_hdr(skb)->daddr));
921 		return;
922 	}
923 
924 	/* No redirected packets during ip_rt_redirect_silence;
925 	 * reset the algorithm.
926 	 */
927 	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
928 		peer->rate_tokens = 0;
929 		peer->n_redirects = 0;
930 	}
931 
932 	/* Too many ignored redirects; do not send anything and
933 	 * set dst.rate_last to the last seen redirected packet.
934 	 */
935 	if (peer->n_redirects >= ip_rt_redirect_number) {
936 		peer->rate_last = jiffies;
937 		goto out_put_peer;
938 	}
939 
940 	/* Check for load limit; set rate_last to the latest sent
941 	 * redirect.
942 	 */
943 	if (peer->n_redirects == 0 ||
944 	    time_after(jiffies,
945 		       (peer->rate_last +
946 			(ip_rt_redirect_load << peer->n_redirects)))) {
947 		__be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
948 
949 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
950 		peer->rate_last = jiffies;
951 		++peer->n_redirects;
952 #ifdef CONFIG_IP_ROUTE_VERBOSE
953 		if (log_martians &&
954 		    peer->n_redirects == ip_rt_redirect_number)
955 			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
956 					     &ip_hdr(skb)->saddr, inet_iif(skb),
957 					     &ip_hdr(skb)->daddr, &gw);
958 #endif
959 	}
960 out_put_peer:
961 	inet_putpeer(peer);
962 }
963 
964 static int ip_error(struct sk_buff *skb)
965 {
966 	struct rtable *rt = skb_rtable(skb);
967 	struct net_device *dev = skb->dev;
968 	struct in_device *in_dev;
969 	struct inet_peer *peer;
970 	unsigned long now;
971 	struct net *net;
972 	bool send;
973 	int code;
974 
975 	if (netif_is_l3_master(skb->dev)) {
976 		dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
977 		if (!dev)
978 			goto out;
979 	}
980 
981 	in_dev = __in_dev_get_rcu(dev);
982 
983 	/* IP on this device is disabled. */
984 	if (!in_dev)
985 		goto out;
986 
987 	net = dev_net(rt->dst.dev);
988 	if (!IN_DEV_FORWARD(in_dev)) {
989 		switch (rt->dst.error) {
990 		case EHOSTUNREACH:
991 			__IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
992 			break;
993 
994 		case ENETUNREACH:
995 			__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
996 			break;
997 		}
998 		goto out;
999 	}
1000 
1001 	switch (rt->dst.error) {
1002 	case EINVAL:
1003 	default:
1004 		goto out;
1005 	case EHOSTUNREACH:
1006 		code = ICMP_HOST_UNREACH;
1007 		break;
1008 	case ENETUNREACH:
1009 		code = ICMP_NET_UNREACH;
1010 		__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
1011 		break;
1012 	case EACCES:
1013 		code = ICMP_PKT_FILTERED;
1014 		break;
1015 	}
1016 
1017 	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
1018 			       l3mdev_master_ifindex(skb->dev), 1);
1019 
1020 	send = true;
1021 	if (peer) {
1022 		now = jiffies;
1023 		peer->rate_tokens += now - peer->rate_last;
1024 		if (peer->rate_tokens > ip_rt_error_burst)
1025 			peer->rate_tokens = ip_rt_error_burst;
1026 		peer->rate_last = now;
1027 		if (peer->rate_tokens >= ip_rt_error_cost)
1028 			peer->rate_tokens -= ip_rt_error_cost;
1029 		else
1030 			send = false;
1031 		inet_putpeer(peer);
1032 	}
1033 	if (send)
1034 		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1035 
1036 out:	kfree_skb(skb);
1037 	return 0;
1038 }
1039 
1040 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1041 {
1042 	struct dst_entry *dst = &rt->dst;
1043 	struct net *net = dev_net(dst->dev);
1044 	struct fib_result res;
1045 	bool lock = false;
1046 	u32 old_mtu;
1047 
1048 	if (ip_mtu_locked(dst))
1049 		return;
1050 
1051 	old_mtu = ipv4_mtu(dst);
1052 	if (old_mtu < mtu)
1053 		return;
1054 
1055 	if (mtu < ip_rt_min_pmtu) {
1056 		lock = true;
1057 		mtu = min(old_mtu, ip_rt_min_pmtu);
1058 	}
1059 
1060 	if (rt->rt_pmtu == mtu && !lock &&
1061 	    time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
1062 		return;
1063 
1064 	rcu_read_lock();
1065 	if (fib_lookup(net, fl4, &res, 0) == 0) {
1066 		struct fib_nh_common *nhc;
1067 
1068 		fib_select_path(net, &res, fl4, NULL);
1069 		nhc = FIB_RES_NHC(res);
1070 		update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
1071 				      jiffies + ip_rt_mtu_expires);
1072 	}
1073 	rcu_read_unlock();
1074 }
1075 
1076 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1077 			      struct sk_buff *skb, u32 mtu,
1078 			      bool confirm_neigh)
1079 {
1080 	struct rtable *rt = (struct rtable *) dst;
1081 	struct flowi4 fl4;
1082 
1083 	ip_rt_build_flow_key(&fl4, sk, skb);
1084 	ip_rt_fix_tos(&fl4);
1085 
1086 	/* Don't make lookup fail for bridged encapsulations */
1087 	if (skb && netif_is_any_bridge_port(skb->dev))
1088 		fl4.flowi4_oif = 0;
1089 
1090 	__ip_rt_update_pmtu(rt, &fl4, mtu);
1091 }
1092 
1093 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1094 		      int oif, u8 protocol)
1095 {
1096 	const struct iphdr *iph = (const struct iphdr *)skb->data;
1097 	struct flowi4 fl4;
1098 	struct rtable *rt;
1099 	u32 mark = IP4_REPLY_MARK(net, skb->mark);
1100 
1101 	__build_flow_key(net, &fl4, NULL, iph, oif,
1102 			 RT_TOS(iph->tos), protocol, mark, 0);
1103 	rt = __ip_route_output_key(net, &fl4);
1104 	if (!IS_ERR(rt)) {
1105 		__ip_rt_update_pmtu(rt, &fl4, mtu);
1106 		ip_rt_put(rt);
1107 	}
1108 }
1109 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1110 
1111 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1112 {
1113 	const struct iphdr *iph = (const struct iphdr *)skb->data;
1114 	struct flowi4 fl4;
1115 	struct rtable *rt;
1116 
1117 	__build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1118 
1119 	if (!fl4.flowi4_mark)
1120 		fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1121 
1122 	rt = __ip_route_output_key(sock_net(sk), &fl4);
1123 	if (!IS_ERR(rt)) {
1124 		__ip_rt_update_pmtu(rt, &fl4, mtu);
1125 		ip_rt_put(rt);
1126 	}
1127 }
1128 
1129 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1130 {
1131 	const struct iphdr *iph = (const struct iphdr *)skb->data;
1132 	struct flowi4 fl4;
1133 	struct rtable *rt;
1134 	struct dst_entry *odst = NULL;
1135 	bool new = false;
1136 	struct net *net = sock_net(sk);
1137 
1138 	bh_lock_sock(sk);
1139 
1140 	if (!ip_sk_accept_pmtu(sk))
1141 		goto out;
1142 
1143 	odst = sk_dst_get(sk);
1144 
1145 	if (sock_owned_by_user(sk) || !odst) {
1146 		__ipv4_sk_update_pmtu(skb, sk, mtu);
1147 		goto out;
1148 	}
1149 
1150 	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1151 
1152 	rt = (struct rtable *)odst;
1153 	if (odst->obsolete && !odst->ops->check(odst, 0)) {
1154 		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1155 		if (IS_ERR(rt))
1156 			goto out;
1157 
1158 		new = true;
1159 	} else {
1160 		ip_rt_fix_tos(&fl4);
1161 	}
1162 
1163 	__ip_rt_update_pmtu((struct rtable *)xfrm_dst_path(&rt->dst), &fl4, mtu);
1164 
1165 	if (!dst_check(&rt->dst, 0)) {
1166 		if (new)
1167 			dst_release(&rt->dst);
1168 
1169 		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1170 		if (IS_ERR(rt))
1171 			goto out;
1172 
1173 		new = true;
1174 	}
1175 
1176 	if (new)
1177 		sk_dst_set(sk, &rt->dst);
1178 
1179 out:
1180 	bh_unlock_sock(sk);
1181 	dst_release(odst);
1182 }
1183 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1184 
1185 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1186 		   int oif, u8 protocol)
1187 {
1188 	const struct iphdr *iph = (const struct iphdr *)skb->data;
1189 	struct flowi4 fl4;
1190 	struct rtable *rt;
1191 
1192 	__build_flow_key(net, &fl4, NULL, iph, oif,
1193 			 RT_TOS(iph->tos), protocol, 0, 0);
1194 	rt = __ip_route_output_key(net, &fl4);
1195 	if (!IS_ERR(rt)) {
1196 		__ip_do_redirect(rt, skb, &fl4, false);
1197 		ip_rt_put(rt);
1198 	}
1199 }
1200 EXPORT_SYMBOL_GPL(ipv4_redirect);
1201 
1202 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1203 {
1204 	const struct iphdr *iph = (const struct iphdr *)skb->data;
1205 	struct flowi4 fl4;
1206 	struct rtable *rt;
1207 	struct net *net = sock_net(sk);
1208 
1209 	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1210 	rt = __ip_route_output_key(net, &fl4);
1211 	if (!IS_ERR(rt)) {
1212 		__ip_do_redirect(rt, skb, &fl4, false);
1213 		ip_rt_put(rt);
1214 	}
1215 }
1216 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1217 
1218 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1219 {
1220 	struct rtable *rt = (struct rtable *) dst;
1221 
1222 	/* All IPV4 dsts are created with ->obsolete set to the value
1223 	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1224 	 * into this function always.
1225 	 *
1226 	 * When a PMTU/redirect information update invalidates a route,
1227 	 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1228 	 * DST_OBSOLETE_DEAD.
1229 	 */
1230 	if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1231 		return NULL;
1232 	return dst;
1233 }
1234 
1235 static void ipv4_send_dest_unreach(struct sk_buff *skb)
1236 {
1237 	struct net_device *dev;
1238 	struct ip_options opt;
1239 	int res;
1240 
1241 	/* Recompile ip options since IPCB may not be valid anymore.
1242 	 * Also check we have a reasonable ipv4 header.
1243 	 */
1244 	if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
1245 	    ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
1246 		return;
1247 
1248 	memset(&opt, 0, sizeof(opt));
1249 	if (ip_hdr(skb)->ihl > 5) {
1250 		if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
1251 			return;
1252 		opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);
1253 
1254 		rcu_read_lock();
1255 		dev = skb->dev ? skb->dev : skb_rtable(skb)->dst.dev;
1256 		res = __ip_options_compile(dev_net(dev), &opt, skb, NULL);
1257 		rcu_read_unlock();
1258 
1259 		if (res)
1260 			return;
1261 	}
1262 	__icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
1263 }
1264 
1265 static void ipv4_link_failure(struct sk_buff *skb)
1266 {
1267 	struct rtable *rt;
1268 
1269 	ipv4_send_dest_unreach(skb);
1270 
1271 	rt = skb_rtable(skb);
1272 	if (rt)
1273 		dst_set_expires(&rt->dst, 0);
1274 }
1275 
1276 static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1277 {
1278 	pr_debug("%s: %pI4 -> %pI4, %s\n",
1279 		 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1280 		 skb->dev ? skb->dev->name : "?");
1281 	kfree_skb(skb);
1282 	WARN_ON(1);
1283 	return 0;
1284 }
1285 
1286 /*
1287    We do not cache the source address of the outgoing interface,
1288    because it is used only by the IP RR, TS and SRR options,
1289    so it is out of the fast path.
1290 
1291    BTW remember: "addr" is allowed to be unaligned
1292    in IP options!
1293  */
1294 
1295 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1296 {
1297 	__be32 src;
1298 
1299 	if (rt_is_output_route(rt))
1300 		src = ip_hdr(skb)->saddr;
1301 	else {
1302 		struct fib_result res;
1303 		struct iphdr *iph = ip_hdr(skb);
1304 		struct flowi4 fl4 = {
1305 			.daddr = iph->daddr,
1306 			.saddr = iph->saddr,
1307 			.flowi4_tos = RT_TOS(iph->tos),
1308 			.flowi4_oif = rt->dst.dev->ifindex,
1309 			.flowi4_iif = skb->dev->ifindex,
1310 			.flowi4_mark = skb->mark,
1311 		};
1312 
1313 		rcu_read_lock();
1314 		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1315 			src = fib_result_prefsrc(dev_net(rt->dst.dev), &res);
1316 		else
1317 			src = inet_select_addr(rt->dst.dev,
1318 					       rt_nexthop(rt, iph->daddr),
1319 					       RT_SCOPE_UNIVERSE);
1320 		rcu_read_unlock();
1321 	}
1322 	memcpy(addr, &src, 4);
1323 }
1324 
1325 #ifdef CONFIG_IP_ROUTE_CLASSID
1326 static void set_class_tag(struct rtable *rt, u32 tag)
1327 {
1328 	if (!(rt->dst.tclassid & 0xFFFF))
1329 		rt->dst.tclassid |= tag & 0xFFFF;
1330 	if (!(rt->dst.tclassid & 0xFFFF0000))
1331 		rt->dst.tclassid |= tag & 0xFFFF0000;
1332 }
1333 #endif
1334 
1335 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1336 {
1337 	unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
1338 	unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
1339 				    ip_rt_min_advmss);
1340 
1341 	return min(advmss, IPV4_MAX_PMTU - header_size);
1342 }
1343 
1344 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1345 {
1346 	const struct rtable *rt = (const struct rtable *)dst;
1347 	unsigned int mtu = rt->rt_pmtu;
1348 
1349 	if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1350 		mtu = dst_metric_raw(dst, RTAX_MTU);
1351 
1352 	if (mtu)
1353 		goto out;
1354 
1355 	mtu = READ_ONCE(dst->dev->mtu);
1356 
1357 	if (unlikely(ip_mtu_locked(dst))) {
1358 		if (rt->rt_uses_gateway && mtu > 576)
1359 			mtu = 576;
1360 	}
1361 
1362 out:
1363 	mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
1364 
1365 	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1366 }
1367 
1368 static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr)
1369 {
1370 	struct fnhe_hash_bucket *hash;
1371 	struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1372 	u32 hval = fnhe_hashfun(daddr);
1373 
1374 	spin_lock_bh(&fnhe_lock);
1375 
1376 	hash = rcu_dereference_protected(nhc->nhc_exceptions,
1377 					 lockdep_is_held(&fnhe_lock));
1378 	hash += hval;
1379 
1380 	fnhe_p = &hash->chain;
1381 	fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1382 	while (fnhe) {
1383 		if (fnhe->fnhe_daddr == daddr) {
1384 			rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1385 				fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1386 			/* set fnhe_daddr to 0 to ensure it won't bind with
1387 			 * new dsts in rt_bind_exception().
1388 			 */
1389 			fnhe->fnhe_daddr = 0;
1390 			fnhe_flush_routes(fnhe);
1391 			kfree_rcu(fnhe, rcu);
1392 			break;
1393 		}
1394 		fnhe_p = &fnhe->fnhe_next;
1395 		fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1396 						 lockdep_is_held(&fnhe_lock));
1397 	}
1398 
1399 	spin_unlock_bh(&fnhe_lock);
1400 }
1401 
1402 static struct fib_nh_exception *find_exception(struct fib_nh_common *nhc,
1403 					       __be32 daddr)
1404 {
1405 	struct fnhe_hash_bucket *hash = rcu_dereference(nhc->nhc_exceptions);
1406 	struct fib_nh_exception *fnhe;
1407 	u32 hval;
1408 
1409 	if (!hash)
1410 		return NULL;
1411 
1412 	hval = fnhe_hashfun(daddr);
1413 
1414 	for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1415 	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
1416 		if (fnhe->fnhe_daddr == daddr) {
1417 			if (fnhe->fnhe_expires &&
1418 			    time_after(jiffies, fnhe->fnhe_expires)) {
1419 				ip_del_fnhe(nhc, daddr);
1420 				break;
1421 			}
1422 			return fnhe;
1423 		}
1424 	}
1425 	return NULL;
1426 }
1427 
1428 /* MTU selection:
1429  * 1. mtu on route is locked - use it
1430  * 2. mtu from nexthop exception
1431  * 3. mtu from egress device
1432  */
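/* Illustrative example (assumes net.ipv4.ip_forward_use_pmtu is off and no
 * RTAX_MTU lock on the route): with a 1500 byte egress MTU and a still-valid
 * nexthop exception recording a PMTU of 1400, ip_mtu_from_fib_result() below
 * returns 1400 (minus any lwtunnel encap headroom); with no exception it
 * falls back to the device MTU, clamped to IP_MAX_MTU.
 */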
1433 
1434 u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
1435 {
1436 	struct fib_nh_common *nhc = res->nhc;
1437 	struct net_device *dev = nhc->nhc_dev;
1438 	struct fib_info *fi = res->fi;
1439 	u32 mtu = 0;
1440 
1441 	if (READ_ONCE(dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu) ||
1442 	    fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
1443 		mtu = fi->fib_mtu;
1444 
1445 	if (likely(!mtu)) {
1446 		struct fib_nh_exception *fnhe;
1447 
1448 		fnhe = find_exception(nhc, daddr);
1449 		if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
1450 			mtu = fnhe->fnhe_pmtu;
1451 	}
1452 
1453 	if (likely(!mtu))
1454 		mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);
1455 
1456 	return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu);
1457 }
1458 
1459 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1460 			      __be32 daddr, const bool do_cache)
1461 {
1462 	bool ret = false;
1463 
1464 	spin_lock_bh(&fnhe_lock);
1465 
1466 	if (daddr == fnhe->fnhe_daddr) {
1467 		struct rtable __rcu **porig;
1468 		struct rtable *orig;
1469 		int genid = fnhe_genid(dev_net(rt->dst.dev));
1470 
1471 		if (rt_is_input_route(rt))
1472 			porig = &fnhe->fnhe_rth_input;
1473 		else
1474 			porig = &fnhe->fnhe_rth_output;
1475 		orig = rcu_dereference(*porig);
1476 
1477 		if (fnhe->fnhe_genid != genid) {
1478 			fnhe->fnhe_genid = genid;
1479 			fnhe->fnhe_gw = 0;
1480 			fnhe->fnhe_pmtu = 0;
1481 			fnhe->fnhe_expires = 0;
1482 			fnhe->fnhe_mtu_locked = false;
1483 			fnhe_flush_routes(fnhe);
1484 			orig = NULL;
1485 		}
1486 		fill_route_from_fnhe(rt, fnhe);
1487 		if (!rt->rt_gw4) {
1488 			rt->rt_gw4 = daddr;
1489 			rt->rt_gw_family = AF_INET;
1490 		}
1491 
1492 		if (do_cache) {
1493 			dst_hold(&rt->dst);
1494 			rcu_assign_pointer(*porig, rt);
1495 			if (orig) {
1496 				dst_dev_put(&orig->dst);
1497 				dst_release(&orig->dst);
1498 			}
1499 			ret = true;
1500 		}
1501 
1502 		fnhe->fnhe_stamp = jiffies;
1503 	}
1504 	spin_unlock_bh(&fnhe_lock);
1505 
1506 	return ret;
1507 }
1508 
1509 static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt)
1510 {
1511 	struct rtable *orig, *prev, **p;
1512 	bool ret = true;
1513 
1514 	if (rt_is_input_route(rt)) {
1515 		p = (struct rtable **)&nhc->nhc_rth_input;
1516 	} else {
1517 		p = (struct rtable **)raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
1518 	}
1519 	orig = *p;
1520 
1521 	/* hold dst before doing cmpxchg() to avoid race condition
1522 	 * on this dst
1523 	 */
1524 	dst_hold(&rt->dst);
1525 	prev = cmpxchg(p, orig, rt);
1526 	if (prev == orig) {
1527 		if (orig) {
1528 			rt_add_uncached_list(orig);
1529 			dst_release(&orig->dst);
1530 		}
1531 	} else {
1532 		dst_release(&rt->dst);
1533 		ret = false;
1534 	}
1535 
1536 	return ret;
1537 }
1538 
1539 struct uncached_list {
1540 	spinlock_t		lock;
1541 	struct list_head	head;
1542 };
1543 
1544 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1545 
1546 void rt_add_uncached_list(struct rtable *rt)
1547 {
1548 	struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1549 
1550 	rt->rt_uncached_list = ul;
1551 
1552 	spin_lock_bh(&ul->lock);
1553 	list_add_tail(&rt->rt_uncached, &ul->head);
1554 	spin_unlock_bh(&ul->lock);
1555 }
1556 
1557 void rt_del_uncached_list(struct rtable *rt)
1558 {
1559 	if (!list_empty(&rt->rt_uncached)) {
1560 		struct uncached_list *ul = rt->rt_uncached_list;
1561 
1562 		spin_lock_bh(&ul->lock);
1563 		list_del(&rt->rt_uncached);
1564 		spin_unlock_bh(&ul->lock);
1565 	}
1566 }
1567 
1568 static void ipv4_dst_destroy(struct dst_entry *dst)
1569 {
1570 	struct rtable *rt = (struct rtable *)dst;
1571 
1572 	ip_dst_metrics_put(dst);
1573 	rt_del_uncached_list(rt);
1574 }
1575 
1576 void rt_flush_dev(struct net_device *dev)
1577 {
1578 	struct rtable *rt;
1579 	int cpu;
1580 
1581 	for_each_possible_cpu(cpu) {
1582 		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1583 
1584 		spin_lock_bh(&ul->lock);
1585 		list_for_each_entry(rt, &ul->head, rt_uncached) {
1586 			if (rt->dst.dev != dev)
1587 				continue;
1588 			rt->dst.dev = blackhole_netdev;
1589 			dev_hold(rt->dst.dev);
1590 			dev_put(dev);
1591 		}
1592 		spin_unlock_bh(&ul->lock);
1593 	}
1594 }
1595 
1596 static bool rt_cache_valid(const struct rtable *rt)
1597 {
1598 	return	rt &&
1599 		rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1600 		!rt_is_expired(rt);
1601 }
1602 
1603 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1604 			   const struct fib_result *res,
1605 			   struct fib_nh_exception *fnhe,
1606 			   struct fib_info *fi, u16 type, u32 itag,
1607 			   const bool do_cache)
1608 {
1609 	bool cached = false;
1610 
1611 	if (fi) {
1612 		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1613 
1614 		if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) {
1615 			rt->rt_uses_gateway = 1;
1616 			rt->rt_gw_family = nhc->nhc_gw_family;
1617 			/* only INET and INET6 are supported */
1618 			if (likely(nhc->nhc_gw_family == AF_INET))
1619 				rt->rt_gw4 = nhc->nhc_gw.ipv4;
1620 			else
1621 				rt->rt_gw6 = nhc->nhc_gw.ipv6;
1622 		}
1623 
1624 		ip_dst_init_metrics(&rt->dst, fi->fib_metrics);
1625 
1626 #ifdef CONFIG_IP_ROUTE_CLASSID
1627 		if (nhc->nhc_family == AF_INET) {
1628 			struct fib_nh *nh;
1629 
1630 			nh = container_of(nhc, struct fib_nh, nh_common);
1631 			rt->dst.tclassid = nh->nh_tclassid;
1632 		}
1633 #endif
1634 		rt->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
1635 		if (unlikely(fnhe))
1636 			cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1637 		else if (do_cache)
1638 			cached = rt_cache_route(nhc, rt);
1639 		if (unlikely(!cached)) {
1640 			/* Routes we intend to cache in nexthop exception or
1641 			 * FIB nexthop have the DST_NOCACHE bit clear.
1642 			 * However, if we are unsuccessful at storing this
1643 			 * route into the cache we really need to set it.
1644 			 */
1645 			if (!rt->rt_gw4) {
1646 				rt->rt_gw_family = AF_INET;
1647 				rt->rt_gw4 = daddr;
1648 			}
1649 			rt_add_uncached_list(rt);
1650 		}
1651 	} else
1652 		rt_add_uncached_list(rt);
1653 
1654 #ifdef CONFIG_IP_ROUTE_CLASSID
1655 #ifdef CONFIG_IP_MULTIPLE_TABLES
1656 	set_class_tag(rt, res->tclassid);
1657 #endif
1658 	set_class_tag(rt, itag);
1659 #endif
1660 }
1661 
1662 struct rtable *rt_dst_alloc(struct net_device *dev,
1663 			    unsigned int flags, u16 type,
1664 			    bool nopolicy, bool noxfrm)
1665 {
1666 	struct rtable *rt;
1667 
1668 	rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1669 		       (nopolicy ? DST_NOPOLICY : 0) |
1670 		       (noxfrm ? DST_NOXFRM : 0));
1671 
1672 	if (rt) {
1673 		rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1674 		rt->rt_flags = flags;
1675 		rt->rt_type = type;
1676 		rt->rt_is_input = 0;
1677 		rt->rt_iif = 0;
1678 		rt->rt_pmtu = 0;
1679 		rt->rt_mtu_locked = 0;
1680 		rt->rt_uses_gateway = 0;
1681 		rt->rt_gw_family = 0;
1682 		rt->rt_gw4 = 0;
1683 		INIT_LIST_HEAD(&rt->rt_uncached);
1684 
1685 		rt->dst.output = ip_output;
1686 		if (flags & RTCF_LOCAL)
1687 			rt->dst.input = ip_local_deliver;
1688 	}
1689 
1690 	return rt;
1691 }
1692 EXPORT_SYMBOL(rt_dst_alloc);
1693 
1694 struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt)
1695 {
1696 	struct rtable *new_rt;
1697 
1698 	new_rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1699 			   rt->dst.flags);
1700 
1701 	if (new_rt) {
1702 		new_rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1703 		new_rt->rt_flags = rt->rt_flags;
1704 		new_rt->rt_type = rt->rt_type;
1705 		new_rt->rt_is_input = rt->rt_is_input;
1706 		new_rt->rt_iif = rt->rt_iif;
1707 		new_rt->rt_pmtu = rt->rt_pmtu;
1708 		new_rt->rt_mtu_locked = rt->rt_mtu_locked;
1709 		new_rt->rt_gw_family = rt->rt_gw_family;
1710 		if (rt->rt_gw_family == AF_INET)
1711 			new_rt->rt_gw4 = rt->rt_gw4;
1712 		else if (rt->rt_gw_family == AF_INET6)
1713 			new_rt->rt_gw6 = rt->rt_gw6;
1714 		INIT_LIST_HEAD(&new_rt->rt_uncached);
1715 
1716 		new_rt->dst.input = rt->dst.input;
1717 		new_rt->dst.output = rt->dst.output;
1718 		new_rt->dst.error = rt->dst.error;
1719 		new_rt->dst.lastuse = jiffies;
1720 		new_rt->dst.lwtstate = lwtstate_get(rt->dst.lwtstate);
1721 	}
1722 	return new_rt;
1723 }
1724 EXPORT_SYMBOL(rt_dst_clone);
1725 
1726 /* called in rcu_read_lock() section */
1727 int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1728 			  u8 tos, struct net_device *dev,
1729 			  struct in_device *in_dev, u32 *itag)
1730 {
1731 	int err;
1732 
1733 	/* Primary sanity checks. */
1734 	if (!in_dev)
1735 		return -EINVAL;
1736 
1737 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1738 	    skb->protocol != htons(ETH_P_IP))
1739 		return -EINVAL;
1740 
1741 	if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1742 		return -EINVAL;
1743 
1744 	if (ipv4_is_zeronet(saddr)) {
1745 		if (!ipv4_is_local_multicast(daddr) &&
1746 		    ip_hdr(skb)->protocol != IPPROTO_IGMP)
1747 			return -EINVAL;
1748 	} else {
1749 		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1750 					  in_dev, itag);
1751 		if (err < 0)
1752 			return err;
1753 	}
1754 	return 0;
1755 }
1756 
1757 /* called in rcu_read_lock() section */
1758 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1759 			     u8 tos, struct net_device *dev, int our)
1760 {
1761 	struct in_device *in_dev = __in_dev_get_rcu(dev);
1762 	unsigned int flags = RTCF_MULTICAST;
1763 	struct rtable *rth;
1764 	bool no_policy;
1765 	u32 itag = 0;
1766 	int err;
1767 
1768 	err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1769 	if (err)
1770 		return err;
1771 
1772 	if (our)
1773 		flags |= RTCF_LOCAL;
1774 
1775 	no_policy = IN_DEV_ORCONF(in_dev, NOPOLICY);
1776 	if (no_policy)
1777 		IPCB(skb)->flags |= IPSKB_NOPOLICY;
1778 
1779 	rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1780 			   no_policy, false);
1781 	if (!rth)
1782 		return -ENOBUFS;
1783 
1784 #ifdef CONFIG_IP_ROUTE_CLASSID
1785 	rth->dst.tclassid = itag;
1786 #endif
1787 	rth->dst.output = ip_rt_bug;
1788 	rth->rt_is_input= 1;
1789 
1790 #ifdef CONFIG_IP_MROUTE
1791 	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1792 		rth->dst.input = ip_mr_input;
1793 #endif
1794 	RT_CACHE_STAT_INC(in_slow_mc);
1795 
1796 	skb_dst_drop(skb);
1797 	skb_dst_set(skb, &rth->dst);
1798 	return 0;
1799 }
1800 
1801 
1802 static void ip_handle_martian_source(struct net_device *dev,
1803 				     struct in_device *in_dev,
1804 				     struct sk_buff *skb,
1805 				     __be32 daddr,
1806 				     __be32 saddr)
1807 {
1808 	RT_CACHE_STAT_INC(in_martian_src);
1809 #ifdef CONFIG_IP_ROUTE_VERBOSE
1810 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1811 		/*
1812 		 *	RFC1812 recommendation: if the source is martian,
1813 		 *	the only hint is the MAC header.
1814 		 */
1815 		pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1816 			&daddr, &saddr, dev->name);
1817 		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1818 			print_hex_dump(KERN_WARNING, "ll header: ",
1819 				       DUMP_PREFIX_OFFSET, 16, 1,
1820 				       skb_mac_header(skb),
1821 				       dev->hard_header_len, false);
1822 		}
1823 	}
1824 #endif
1825 }
1826 
1827 /* called in rcu_read_lock() section */
1828 static int __mkroute_input(struct sk_buff *skb,
1829 			   const struct fib_result *res,
1830 			   struct in_device *in_dev,
1831 			   __be32 daddr, __be32 saddr, u32 tos)
1832 {
1833 	struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1834 	struct net_device *dev = nhc->nhc_dev;
1835 	struct fib_nh_exception *fnhe;
1836 	struct rtable *rth;
1837 	int err;
1838 	struct in_device *out_dev;
1839 	bool do_cache, no_policy;
1840 	u32 itag = 0;
1841 
1842 	/* get a working reference to the output device */
1843 	out_dev = __in_dev_get_rcu(dev);
1844 	if (!out_dev) {
1845 		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1846 		return -EINVAL;
1847 	}
1848 
1849 	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1850 				  in_dev->dev, in_dev, &itag);
1851 	if (err < 0) {
1852 		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1853 					 saddr);
1854 
1855 		goto cleanup;
1856 	}
1857 
1858 	do_cache = res->fi && !itag;
1859 	if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1860 	    skb->protocol == htons(ETH_P_IP)) {
1861 		__be32 gw;
1862 
1863 		gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0;
1864 		if (IN_DEV_SHARED_MEDIA(out_dev) ||
1865 		    inet_addr_onlink(out_dev, saddr, gw))
1866 			IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1867 	}
1868 
1869 	if (skb->protocol != htons(ETH_P_IP)) {
1870 		/* Not IP (i.e. ARP). Do not create a route if it is
1871 		 * invalid for proxy ARP. DNAT routes are always valid.
1872 		 *
1873 		 * The proxy ARP feature has been extended to allow ARP
1874 		 * replies back on the same interface, to support
1875 		 * Private VLAN switch technologies. See arp.c.
1876 		 */
1877 		if (out_dev == in_dev &&
1878 		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1879 			err = -EINVAL;
1880 			goto cleanup;
1881 		}
1882 	}
1883 
1884 	no_policy = IN_DEV_ORCONF(in_dev, NOPOLICY);
1885 	if (no_policy)
1886 		IPCB(skb)->flags |= IPSKB_NOPOLICY;
1887 
1888 	fnhe = find_exception(nhc, daddr);
1889 	if (do_cache) {
1890 		if (fnhe)
1891 			rth = rcu_dereference(fnhe->fnhe_rth_input);
1892 		else
1893 			rth = rcu_dereference(nhc->nhc_rth_input);
1894 		if (rt_cache_valid(rth)) {
1895 			skb_dst_set_noref(skb, &rth->dst);
1896 			goto out;
1897 		}
1898 	}
1899 
1900 	rth = rt_dst_alloc(out_dev->dev, 0, res->type, no_policy,
1901 			   IN_DEV_ORCONF(out_dev, NOXFRM));
1902 	if (!rth) {
1903 		err = -ENOBUFS;
1904 		goto cleanup;
1905 	}
1906 
1907 	rth->rt_is_input = 1;
1908 	RT_CACHE_STAT_INC(in_slow_tot);
1909 
1910 	rth->dst.input = ip_forward;
1911 
1912 	rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1913 		       do_cache);
1914 	lwtunnel_set_redirect(&rth->dst);
1915 	skb_dst_set(skb, &rth->dst);
1916 out:
1917 	err = 0;
1918  cleanup:
1919 	return err;
1920 }
1921 
1922 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1923 /* To make ICMP packets follow the right flow, the multipath hash is
1924  * calculated from the inner IP addresses.
1925  */
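/* Example (illustrative only, not taken from this file): when a router on
 * the path returns an ICMP "fragmentation needed" error for a TCP flow,
 * the error packet's outer addresses are router -> sender, but the
 * embedded original header still carries the flow's own addresses.
 * Hashing on those inner addresses makes the ICMP error take the same
 * multipath next hop as the flow it refers to.
 */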
1926 static void ip_multipath_l3_keys(const struct sk_buff *skb,
1927 				 struct flow_keys *hash_keys)
1928 {
1929 	const struct iphdr *outer_iph = ip_hdr(skb);
1930 	const struct iphdr *key_iph = outer_iph;
1931 	const struct iphdr *inner_iph;
1932 	const struct icmphdr *icmph;
1933 	struct iphdr _inner_iph;
1934 	struct icmphdr _icmph;
1935 
1936 	if (likely(outer_iph->protocol != IPPROTO_ICMP))
1937 		goto out;
1938 
1939 	if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1940 		goto out;
1941 
1942 	icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1943 				   &_icmph);
1944 	if (!icmph)
1945 		goto out;
1946 
1947 	if (!icmp_is_err(icmph->type))
1948 		goto out;
1949 
1950 	inner_iph = skb_header_pointer(skb,
1951 				       outer_iph->ihl * 4 + sizeof(_icmph),
1952 				       sizeof(_inner_iph), &_inner_iph);
1953 	if (!inner_iph)
1954 		goto out;
1955 
1956 	key_iph = inner_iph;
1957 out:
1958 	hash_keys->addrs.v4addrs.src = key_iph->saddr;
1959 	hash_keys->addrs.v4addrs.dst = key_iph->daddr;
1960 }
1961 
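/* Summary of the sysctl_fib_multipath_hash_policy values handled by the
 * switch statement below:
 *   0 - L3 hash: source and destination addresses only.
 *   1 - L4 hash: the 5-tuple (addresses, ports, protocol); an existing
 *       skb L4 hash is reused when available.
 *   2 - inner L3 hash: dissect encapsulated packets and hash the inner
 *       IPv4/IPv6 addresses, falling back to the outer L3 hash.
 */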
1962 /* if skb is set it will be used and fl4 can be NULL */
1963 int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
1964 		       const struct sk_buff *skb, struct flow_keys *flkeys)
1965 {
1966 	u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0;
1967 	struct flow_keys hash_keys;
1968 	u32 mhash;
1969 
1970 	switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1971 	case 0:
1972 		memset(&hash_keys, 0, sizeof(hash_keys));
1973 		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1974 		if (skb) {
1975 			ip_multipath_l3_keys(skb, &hash_keys);
1976 		} else {
1977 			hash_keys.addrs.v4addrs.src = fl4->saddr;
1978 			hash_keys.addrs.v4addrs.dst = fl4->daddr;
1979 		}
1980 		break;
1981 	case 1:
1982 		/* skb is currently provided only when forwarding */
1983 		if (skb) {
1984 			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1985 			struct flow_keys keys;
1986 
1987 			/* short-circuit if we already have L4 hash present */
1988 			if (skb->l4_hash)
1989 				return skb_get_hash_raw(skb) >> 1;
1990 
1991 			memset(&hash_keys, 0, sizeof(hash_keys));
1992 
1993 			if (!flkeys) {
1994 				skb_flow_dissect_flow_keys(skb, &keys, flag);
1995 				flkeys = &keys;
1996 			}
1997 
1998 			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1999 			hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
2000 			hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
2001 			hash_keys.ports.src = flkeys->ports.src;
2002 			hash_keys.ports.dst = flkeys->ports.dst;
2003 			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
2004 		} else {
2005 			memset(&hash_keys, 0, sizeof(hash_keys));
2006 			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2007 			hash_keys.addrs.v4addrs.src = fl4->saddr;
2008 			hash_keys.addrs.v4addrs.dst = fl4->daddr;
2009 			hash_keys.ports.src = fl4->fl4_sport;
2010 			hash_keys.ports.dst = fl4->fl4_dport;
2011 			hash_keys.basic.ip_proto = fl4->flowi4_proto;
2012 		}
2013 		break;
2014 	case 2:
2015 		memset(&hash_keys, 0, sizeof(hash_keys));
2016 		/* skb is currently provided only when forwarding */
2017 		if (skb) {
2018 			struct flow_keys keys;
2019 
2020 			skb_flow_dissect_flow_keys(skb, &keys, 0);
2021 			/* Inner can be v4 or v6 */
2022 			if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
2023 				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2024 				hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
2025 				hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
2026 			} else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
2027 				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2028 				hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
2029 				hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
2030 				hash_keys.tags.flow_label = keys.tags.flow_label;
2031 				hash_keys.basic.ip_proto = keys.basic.ip_proto;
2032 			} else {
2033 				/* Same as case 0 */
2034 				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2035 				ip_multipath_l3_keys(skb, &hash_keys);
2036 			}
2037 		} else {
2038 			/* Same as case 0 */
2039 			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2040 			hash_keys.addrs.v4addrs.src = fl4->saddr;
2041 			hash_keys.addrs.v4addrs.dst = fl4->daddr;
2042 		}
2043 		break;
2044 	}
2045 	mhash = flow_hash_from_keys(&hash_keys);
2046 
2047 	if (multipath_hash)
2048 		mhash = jhash_2words(mhash, multipath_hash, 0);
2049 
2050 	return mhash >> 1;
2051 }
2052 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
2053 
2054 static int ip_mkroute_input(struct sk_buff *skb,
2055 			    struct fib_result *res,
2056 			    struct in_device *in_dev,
2057 			    __be32 daddr, __be32 saddr, u32 tos,
2058 			    struct flow_keys *hkeys)
2059 {
2060 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2061 	if (res->fi && fib_info_num_path(res->fi) > 1) {
2062 		int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
2063 
2064 		fib_select_multipath(res, h);
2065 		IPCB(skb)->flags |= IPSKB_MULTIPATH;
2066 	}
2067 #endif
2068 
2069 	/* create a routing cache entry */
2070 	return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
2071 }
2072 
2073 /* Implements the same saddr-related checks as ip_route_input_slow(),
2074  * assuming daddr is valid and the destination is not a local broadcast one.
2075  * Uses the provided hint instead of performing a route lookup.
2076  */
2077 int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2078 		      u8 tos, struct net_device *dev,
2079 		      const struct sk_buff *hint)
2080 {
2081 	struct in_device *in_dev = __in_dev_get_rcu(dev);
2082 	struct rtable *rt = skb_rtable(hint);
2083 	struct net *net = dev_net(dev);
2084 	int err = -EINVAL;
2085 	u32 tag = 0;
2086 
2087 	if (!in_dev)
2088 		return -EINVAL;
2089 
2090 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2091 		goto martian_source;
2092 
2093 	if (ipv4_is_zeronet(saddr))
2094 		goto martian_source;
2095 
2096 	if (ipv4_is_loopback(saddr) && !IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2097 		goto martian_source;
2098 
2099 	if (rt->rt_type != RTN_LOCAL)
2100 		goto skip_validate_source;
2101 
2102 	tos &= IPTOS_RT_MASK;
2103 	err = fib_validate_source(skb, saddr, daddr, tos, 0, dev, in_dev, &tag);
2104 	if (err < 0)
2105 		goto martian_source;
2106 
2107 skip_validate_source:
2108 	skb_dst_copy(skb, hint);
2109 	return 0;
2110 
2111 martian_source:
2112 	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2113 	return err;
2114 }
2115 
2116 /* get device for dst_alloc with local routes */
2117 static struct net_device *ip_rt_get_dev(struct net *net,
2118 					const struct fib_result *res)
2119 {
2120 	struct fib_nh_common *nhc = res->fi ? res->nhc : NULL;
2121 	struct net_device *dev = NULL;
2122 
2123 	if (nhc)
2124 		dev = l3mdev_master_dev_rcu(nhc->nhc_dev);
2125 
2126 	return dev ? : net->loopback_dev;
2127 }
2128 
2129 /*
2130  *	NOTE. We drop all packets that have local source
2131  *	addresses, because every properly looped-back packet
2132  *	must already have the correct destination attached by the output routine.
2133  *	Changes in the enforced policies must also be applied to
2134  *	ip_route_use_hint().
2135  *
2136  *	This approach solves two big problems:
2137  *	1. Non-simplex devices are handled properly.
2138  *	2. IP spoofing attempts are filtered with a 100% guarantee.
2139  *	called with rcu_read_lock()
2140  */
2141 
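/* Overview of the slow input path implemented below: reject obvious
 * martian sources, short-circuit limited broadcast, run fib_lookup(),
 * then branch on the result type: RTN_BROADCAST and RTN_LOCAL are
 * delivered locally, RTN_UNICAST is handed to ip_mkroute_input() for
 * forwarding, and any other type is logged as a martian destination.
 */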
2142 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2143 			       u8 tos, struct net_device *dev,
2144 			       struct fib_result *res)
2145 {
2146 	struct in_device *in_dev = __in_dev_get_rcu(dev);
2147 	struct flow_keys *flkeys = NULL, _flkeys;
2148 	struct net    *net = dev_net(dev);
2149 	struct ip_tunnel_info *tun_info;
2150 	int		err = -EINVAL;
2151 	unsigned int	flags = 0;
2152 	u32		itag = 0;
2153 	struct rtable	*rth;
2154 	struct flowi4	fl4;
2155 	bool do_cache = true;
2156 	bool no_policy;
2157 
2158 	/* IP on this device is disabled. */
2159 
2160 	if (!in_dev)
2161 		goto out;
2162 
2163 	/* Check for the weirdest martians, which cannot be detected
2164 	   by fib_lookup.
2165 	 */
2166 
2167 	tun_info = skb_tunnel_info(skb);
2168 	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2169 		fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
2170 	else
2171 		fl4.flowi4_tun_key.tun_id = 0;
2172 	skb_dst_drop(skb);
2173 
2174 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2175 		goto martian_source;
2176 
2177 	res->fi = NULL;
2178 	res->table = NULL;
2179 	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2180 		goto brd_input;
2181 
2182 	/* Accept zero addresses only for limited broadcast;
2183 	 * I do not even know whether to fix it or not. Waiting for complaints :-)
2184 	 */
2185 	if (ipv4_is_zeronet(saddr))
2186 		goto martian_source;
2187 
2188 	if (ipv4_is_zeronet(daddr))
2189 		goto martian_destination;
2190 
2191 	/* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
2192 	 * and calls it at most once when daddr and/or saddr is a loopback address.
2193 	 */
2194 	if (ipv4_is_loopback(daddr)) {
2195 		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2196 			goto martian_destination;
2197 	} else if (ipv4_is_loopback(saddr)) {
2198 		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2199 			goto martian_source;
2200 	}
2201 
2202 	/*
2203 	 *	Now we are ready to route packet.
2204 	 */
2205 	fl4.flowi4_oif = 0;
2206 	fl4.flowi4_iif = dev->ifindex;
2207 	fl4.flowi4_mark = skb->mark;
2208 	fl4.flowi4_tos = tos;
2209 	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2210 	fl4.flowi4_flags = 0;
2211 	fl4.daddr = daddr;
2212 	fl4.saddr = saddr;
2213 	fl4.flowi4_uid = sock_net_uid(net, NULL);
2214 	fl4.flowi4_multipath_hash = 0;
2215 
2216 	if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
2217 		flkeys = &_flkeys;
2218 	} else {
2219 		fl4.flowi4_proto = 0;
2220 		fl4.fl4_sport = 0;
2221 		fl4.fl4_dport = 0;
2222 	}
2223 
2224 	err = fib_lookup(net, &fl4, res, 0);
2225 	if (err != 0) {
2226 		if (!IN_DEV_FORWARD(in_dev))
2227 			err = -EHOSTUNREACH;
2228 		goto no_route;
2229 	}
2230 
2231 	if (res->type == RTN_BROADCAST) {
2232 		if (IN_DEV_BFORWARD(in_dev))
2233 			goto make_route;
2234 		/* do not cache if bc_forwarding is enabled */
2235 		if (IPV4_DEVCONF_ALL(net, BC_FORWARDING))
2236 			do_cache = false;
2237 		goto brd_input;
2238 	}
2239 
2240 	if (res->type == RTN_LOCAL) {
2241 		err = fib_validate_source(skb, saddr, daddr, tos,
2242 					  0, dev, in_dev, &itag);
2243 		if (err < 0)
2244 			goto martian_source;
2245 		goto local_input;
2246 	}
2247 
2248 	if (!IN_DEV_FORWARD(in_dev)) {
2249 		err = -EHOSTUNREACH;
2250 		goto no_route;
2251 	}
2252 	if (res->type != RTN_UNICAST)
2253 		goto martian_destination;
2254 
2255 make_route:
2256 	err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
2257 out:	return err;
2258 
2259 brd_input:
2260 	if (skb->protocol != htons(ETH_P_IP))
2261 		goto e_inval;
2262 
2263 	if (!ipv4_is_zeronet(saddr)) {
2264 		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2265 					  in_dev, &itag);
2266 		if (err < 0)
2267 			goto martian_source;
2268 	}
2269 	flags |= RTCF_BROADCAST;
2270 	res->type = RTN_BROADCAST;
2271 	RT_CACHE_STAT_INC(in_brd);
2272 
2273 local_input:
2274 	no_policy = IN_DEV_ORCONF(in_dev, NOPOLICY);
2275 	if (no_policy)
2276 		IPCB(skb)->flags |= IPSKB_NOPOLICY;
2277 
2278 	do_cache &= res->fi && !itag;
2279 	if (do_cache) {
2280 		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2281 
2282 		rth = rcu_dereference(nhc->nhc_rth_input);
2283 		if (rt_cache_valid(rth)) {
2284 			skb_dst_set_noref(skb, &rth->dst);
2285 			err = 0;
2286 			goto out;
2287 		}
2288 	}
2289 
2290 	rth = rt_dst_alloc(ip_rt_get_dev(net, res),
2291 			   flags | RTCF_LOCAL, res->type,
2292 			   no_policy, false);
2293 	if (!rth)
2294 		goto e_nobufs;
2295 
2296 	rth->dst.output = ip_rt_bug;
2297 #ifdef CONFIG_IP_ROUTE_CLASSID
2298 	rth->dst.tclassid = itag;
2299 #endif
2300 	rth->rt_is_input = 1;
2301 
2302 	RT_CACHE_STAT_INC(in_slow_tot);
2303 	if (res->type == RTN_UNREACHABLE) {
2304 		rth->dst.input = ip_error;
2305 		rth->dst.error = -err;
2306 		rth->rt_flags &= ~RTCF_LOCAL;
2307 	}
2308 
2309 	if (do_cache) {
2310 		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2311 
2312 		rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
2313 		if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2314 			WARN_ON(rth->dst.input == lwtunnel_input);
2315 			rth->dst.lwtstate->orig_input = rth->dst.input;
2316 			rth->dst.input = lwtunnel_input;
2317 		}
2318 
2319 		if (unlikely(!rt_cache_route(nhc, rth)))
2320 			rt_add_uncached_list(rth);
2321 	}
2322 	skb_dst_set(skb, &rth->dst);
2323 	err = 0;
2324 	goto out;
2325 
2326 no_route:
2327 	RT_CACHE_STAT_INC(in_no_route);
2328 	res->type = RTN_UNREACHABLE;
2329 	res->fi = NULL;
2330 	res->table = NULL;
2331 	goto local_input;
2332 
2333 	/*
2334 	 *	Do not cache martian addresses: they should be logged (RFC1812)
2335 	 */
2336 martian_destination:
2337 	RT_CACHE_STAT_INC(in_martian_dst);
2338 #ifdef CONFIG_IP_ROUTE_VERBOSE
2339 	if (IN_DEV_LOG_MARTIANS(in_dev))
2340 		net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2341 				     &daddr, &saddr, dev->name);
2342 #endif
2343 
2344 e_inval:
2345 	err = -EINVAL;
2346 	goto out;
2347 
2348 e_nobufs:
2349 	err = -ENOBUFS;
2350 	goto out;
2351 
2352 martian_source:
2353 	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2354 	goto out;
2355 }
2356 
2357 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2358 			 u8 tos, struct net_device *dev)
2359 {
2360 	struct fib_result res;
2361 	int err;
2362 
2363 	tos &= IPTOS_RT_MASK;
2364 	rcu_read_lock();
2365 	err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2366 	rcu_read_unlock();
2367 
2368 	return err;
2369 }
2370 EXPORT_SYMBOL(ip_route_input_noref);
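/* Sketch of a typical caller (illustrative only; assumes the IP header has
 * already been validated by the caller):
 *
 *	const struct iphdr *iph = ip_hdr(skb);
 *	int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
 *				       iph->tos, skb->dev);
 *	if (err)
 *		goto drop;
 */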
2371 
2372 /* called with rcu_read_lock held */
2373 int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2374 		       u8 tos, struct net_device *dev, struct fib_result *res)
2375 {
2376 	/* Multicast recognition logic was moved from the route cache to here.
2377 	   The problem was that too many Ethernet cards have broken/missing
2378 	   hardware multicast filters :-( As a result, a host on a multicast
2379 	   network acquires a lot of useless route cache entries, e.g. for
2380 	   SDR messages from all over the world. Now we try to get rid of them.
2381 	   Really, provided the software IP multicast filter is organized
2382 	   reasonably (at least, hashed), this does not result in a slowdown
2383 	   compared with route cache reject entries.
2384 	   Note that multicast routers are not affected, because a
2385 	   route cache entry is created eventually.
2386 	 */
2387 	if (ipv4_is_multicast(daddr)) {
2388 		struct in_device *in_dev = __in_dev_get_rcu(dev);
2389 		int our = 0;
2390 		int err = -EINVAL;
2391 
2392 		if (!in_dev)
2393 			return err;
2394 		our = ip_check_mc_rcu(in_dev, daddr, saddr,
2395 				      ip_hdr(skb)->protocol);
2396 
2397 		/* check l3 master if no match yet */
2398 		if (!our && netif_is_l3_slave(dev)) {
2399 			struct in_device *l3_in_dev;
2400 
2401 			l3_in_dev = __in_dev_get_rcu(skb->dev);
2402 			if (l3_in_dev)
2403 				our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2404 						      ip_hdr(skb)->protocol);
2405 		}
2406 
2407 		if (our
2408 #ifdef CONFIG_IP_MROUTE
2409 			||
2410 		    (!ipv4_is_local_multicast(daddr) &&
2411 		     IN_DEV_MFORWARD(in_dev))
2412 #endif
2413 		   ) {
2414 			err = ip_route_input_mc(skb, daddr, saddr,
2415 						tos, dev, our);
2416 		}
2417 		return err;
2418 	}
2419 
2420 	return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2421 }
2422 
2423 /* called with rcu_read_lock() */
2424 static struct rtable *__mkroute_output(const struct fib_result *res,
2425 				       const struct flowi4 *fl4, int orig_oif,
2426 				       struct net_device *dev_out,
2427 				       unsigned int flags)
2428 {
2429 	struct fib_info *fi = res->fi;
2430 	struct fib_nh_exception *fnhe;
2431 	struct in_device *in_dev;
2432 	u16 type = res->type;
2433 	struct rtable *rth;
2434 	bool do_cache;
2435 
2436 	in_dev = __in_dev_get_rcu(dev_out);
2437 	if (!in_dev)
2438 		return ERR_PTR(-EINVAL);
2439 
2440 	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2441 		if (ipv4_is_loopback(fl4->saddr) &&
2442 		    !(dev_out->flags & IFF_LOOPBACK) &&
2443 		    !netif_is_l3_master(dev_out))
2444 			return ERR_PTR(-EINVAL);
2445 
2446 	if (ipv4_is_lbcast(fl4->daddr))
2447 		type = RTN_BROADCAST;
2448 	else if (ipv4_is_multicast(fl4->daddr))
2449 		type = RTN_MULTICAST;
2450 	else if (ipv4_is_zeronet(fl4->daddr))
2451 		return ERR_PTR(-EINVAL);
2452 
2453 	if (dev_out->flags & IFF_LOOPBACK)
2454 		flags |= RTCF_LOCAL;
2455 
2456 	do_cache = true;
2457 	if (type == RTN_BROADCAST) {
2458 		flags |= RTCF_BROADCAST | RTCF_LOCAL;
2459 		fi = NULL;
2460 	} else if (type == RTN_MULTICAST) {
2461 		flags |= RTCF_MULTICAST | RTCF_LOCAL;
2462 		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2463 				     fl4->flowi4_proto))
2464 			flags &= ~RTCF_LOCAL;
2465 		else
2466 			do_cache = false;
2467 		/* If a multicast route does not exist, use the
2468 		 * default one, but do not use a gateway in this case.
2469 		 * Yes, it is a hack.
2470 		 */
2471 		if (fi && res->prefixlen < 4)
2472 			fi = NULL;
2473 	} else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2474 		   (orig_oif != dev_out->ifindex)) {
2475 		/* For local routes that require a particular output interface
2476 		 * we do not want to cache the result.  Caching the result
2477 		 * causes incorrect behaviour when there are multiple source
2478 		 * addresses on the interface: if the intended recipient is
2479 		 * waiting on that interface for the packet, it will not be
2480 		 * received, because the packet will be delivered on the
2481 		 * loopback interface and the IP_PKTINFO ipi_ifindex will
2482 		 * be set to the loopback interface as well.
2483 		 */
2484 		do_cache = false;
2485 	}
2486 
2487 	fnhe = NULL;
2488 	do_cache &= fi != NULL;
2489 	if (fi) {
2490 		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2491 		struct rtable __rcu **prth;
2492 
2493 		fnhe = find_exception(nhc, fl4->daddr);
2494 		if (!do_cache)
2495 			goto add;
2496 		if (fnhe) {
2497 			prth = &fnhe->fnhe_rth_output;
2498 		} else {
2499 			if (unlikely(fl4->flowi4_flags &
2500 				     FLOWI_FLAG_KNOWN_NH &&
2501 				     !(nhc->nhc_gw_family &&
2502 				       nhc->nhc_scope == RT_SCOPE_LINK))) {
2503 				do_cache = false;
2504 				goto add;
2505 			}
2506 			prth = raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
2507 		}
2508 		rth = rcu_dereference(*prth);
2509 		if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2510 			return rth;
2511 	}
2512 
2513 add:
2514 	rth = rt_dst_alloc(dev_out, flags, type,
2515 			   IN_DEV_ORCONF(in_dev, NOPOLICY),
2516 			   IN_DEV_ORCONF(in_dev, NOXFRM));
2517 	if (!rth)
2518 		return ERR_PTR(-ENOBUFS);
2519 
2520 	rth->rt_iif = orig_oif;
2521 
2522 	RT_CACHE_STAT_INC(out_slow_tot);
2523 
2524 	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2525 		if (flags & RTCF_LOCAL &&
2526 		    !(dev_out->flags & IFF_LOOPBACK)) {
2527 			rth->dst.output = ip_mc_output;
2528 			RT_CACHE_STAT_INC(out_slow_mc);
2529 		}
2530 #ifdef CONFIG_IP_MROUTE
2531 		if (type == RTN_MULTICAST) {
2532 			if (IN_DEV_MFORWARD(in_dev) &&
2533 			    !ipv4_is_local_multicast(fl4->daddr)) {
2534 				rth->dst.input = ip_mr_input;
2535 				rth->dst.output = ip_mc_output;
2536 			}
2537 		}
2538 #endif
2539 	}
2540 
2541 	rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2542 	lwtunnel_set_redirect(&rth->dst);
2543 
2544 	return rth;
2545 }
2546 
2547 /*
2548  * Major route resolver routine.
2549  */
2550 
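/* The resolver below proceeds roughly as follows: validate any caller
 * supplied saddr, honour an explicit flowi4_oif, loop back packets with an
 * empty daddr, then fib_lookup(); local destinations (RTN_LOCAL) are bound
 * to the loopback/l3mdev device, a failed lookup with a given output
 * interface falls back to an on-link assumption, and everything else goes
 * through fib_select_path() and __mkroute_output().
 */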
2551 struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2552 					const struct sk_buff *skb)
2553 {
2554 	struct fib_result res = {
2555 		.type		= RTN_UNSPEC,
2556 		.fi		= NULL,
2557 		.table		= NULL,
2558 		.tclassid	= 0,
2559 	};
2560 	struct rtable *rth;
2561 
2562 	fl4->flowi4_iif = LOOPBACK_IFINDEX;
2563 	ip_rt_fix_tos(fl4);
2564 
2565 	rcu_read_lock();
2566 	rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2567 	rcu_read_unlock();
2568 
2569 	return rth;
2570 }
2571 EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
2572 
2573 struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2574 					    struct fib_result *res,
2575 					    const struct sk_buff *skb)
2576 {
2577 	struct net_device *dev_out = NULL;
2578 	int orig_oif = fl4->flowi4_oif;
2579 	unsigned int flags = 0;
2580 	struct rtable *rth;
2581 	int err;
2582 
2583 	if (fl4->saddr) {
2584 		if (ipv4_is_multicast(fl4->saddr) ||
2585 		    ipv4_is_lbcast(fl4->saddr) ||
2586 		    ipv4_is_zeronet(fl4->saddr)) {
2587 			rth = ERR_PTR(-EINVAL);
2588 			goto out;
2589 		}
2590 
2591 		rth = ERR_PTR(-ENETUNREACH);
2592 
2593 		/* I removed check for oif == dev_out->oif here.
2594 		   It was wrong for two reasons:
2595 		   1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2596 		      is assigned to multiple interfaces.
2597 		   2. Moreover, we are allowed to send packets with the saddr
2598 		      of another iface. --ANK
2599 		 */
2600 
2601 		if (fl4->flowi4_oif == 0 &&
2602 		    (ipv4_is_multicast(fl4->daddr) ||
2603 		     ipv4_is_lbcast(fl4->daddr))) {
2604 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2605 			dev_out = __ip_dev_find(net, fl4->saddr, false);
2606 			if (!dev_out)
2607 				goto out;
2608 
2609 			/* Special hack: the user can direct multicasts
2610 			   and limited broadcast via the necessary interface
2611 			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2612 			   This hack is not just for fun, it allows
2613 			   vic, vat and friends to work.
2614 			   They bind the socket to loopback, set the ttl to zero
2615 			   and expect that it will work.
2616 			   From the viewpoint of the routing cache they are broken,
2617 			   because we are not allowed to build a multicast path
2618 			   with a loopback source addr (look, the routing cache
2619 			   cannot know that the ttl is zero, so the packet
2620 			   will not leave this host and the route is valid).
2621 			   Luckily, this hack is a good workaround.
2622 			 */
2623 
2624 			fl4->flowi4_oif = dev_out->ifindex;
2625 			goto make_route;
2626 		}
2627 
2628 		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2629 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2630 			if (!__ip_dev_find(net, fl4->saddr, false))
2631 				goto out;
2632 		}
2633 	}
2634 
2635 
2636 	if (fl4->flowi4_oif) {
2637 		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2638 		rth = ERR_PTR(-ENODEV);
2639 		if (!dev_out)
2640 			goto out;
2641 
2642 		/* RACE: Check return value of inet_select_addr instead. */
2643 		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2644 			rth = ERR_PTR(-ENETUNREACH);
2645 			goto out;
2646 		}
2647 		if (ipv4_is_local_multicast(fl4->daddr) ||
2648 		    ipv4_is_lbcast(fl4->daddr) ||
2649 		    fl4->flowi4_proto == IPPROTO_IGMP) {
2650 			if (!fl4->saddr)
2651 				fl4->saddr = inet_select_addr(dev_out, 0,
2652 							      RT_SCOPE_LINK);
2653 			goto make_route;
2654 		}
2655 		if (!fl4->saddr) {
2656 			if (ipv4_is_multicast(fl4->daddr))
2657 				fl4->saddr = inet_select_addr(dev_out, 0,
2658 							      fl4->flowi4_scope);
2659 			else if (!fl4->daddr)
2660 				fl4->saddr = inet_select_addr(dev_out, 0,
2661 							      RT_SCOPE_HOST);
2662 		}
2663 	}
2664 
2665 	if (!fl4->daddr) {
2666 		fl4->daddr = fl4->saddr;
2667 		if (!fl4->daddr)
2668 			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2669 		dev_out = net->loopback_dev;
2670 		fl4->flowi4_oif = LOOPBACK_IFINDEX;
2671 		res->type = RTN_LOCAL;
2672 		flags |= RTCF_LOCAL;
2673 		goto make_route;
2674 	}
2675 
2676 	err = fib_lookup(net, fl4, res, 0);
2677 	if (err) {
2678 		res->fi = NULL;
2679 		res->table = NULL;
2680 		if (fl4->flowi4_oif &&
2681 		    (ipv4_is_multicast(fl4->daddr) ||
2682 		    !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2683 			/* Apparently, the routing tables are wrong. Assume
2684 			   that the destination is on-link.
2685 
2686 			   WHY? DW.
2687 			   Because we are allowed to send to an iface
2688 			   even if it has NO routes and NO assigned
2689 			   addresses. When oif is specified, the routing
2690 			   tables are looked up with only one purpose:
2691 			   to check whether the destination is gatewayed,
2692 			   rather than direct. Moreover, if MSG_DONTROUTE is set,
2693 			   we send the packet, ignoring both routing tables
2694 			   and ifaddr state. --ANK
2695 
2696 
2697 			   We could do this even if oif is unknown,
2698 			   likely IPv6, but we do not.
2699 			 */
2700 
2701 			if (fl4->saddr == 0)
2702 				fl4->saddr = inet_select_addr(dev_out, 0,
2703 							      RT_SCOPE_LINK);
2704 			res->type = RTN_UNICAST;
2705 			goto make_route;
2706 		}
2707 		rth = ERR_PTR(err);
2708 		goto out;
2709 	}
2710 
2711 	if (res->type == RTN_LOCAL) {
2712 		if (!fl4->saddr) {
2713 			if (res->fi->fib_prefsrc)
2714 				fl4->saddr = res->fi->fib_prefsrc;
2715 			else
2716 				fl4->saddr = fl4->daddr;
2717 		}
2718 
2719 		/* L3 master device is the loopback for that domain */
2720 		dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2721 			net->loopback_dev;
2722 
2723 		/* make sure orig_oif points to fib result device even
2724 		 * though packet rx/tx happens over loopback or l3mdev
2725 		 */
2726 		orig_oif = FIB_RES_OIF(*res);
2727 
2728 		fl4->flowi4_oif = dev_out->ifindex;
2729 		flags |= RTCF_LOCAL;
2730 		goto make_route;
2731 	}
2732 
2733 	fib_select_path(net, res, fl4, skb);
2734 
2735 	dev_out = FIB_RES_DEV(*res);
2736 
2737 make_route:
2738 	rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2739 
2740 out:
2741 	return rth;
2742 }
2743 
2744 static struct dst_ops ipv4_dst_blackhole_ops = {
2745 	.family			= AF_INET,
2746 	.default_advmss		= ipv4_default_advmss,
2747 	.neigh_lookup		= ipv4_neigh_lookup,
2748 	.check			= dst_blackhole_check,
2749 	.cow_metrics		= dst_blackhole_cow_metrics,
2750 	.update_pmtu		= dst_blackhole_update_pmtu,
2751 	.redirect		= dst_blackhole_redirect,
2752 	.mtu			= dst_blackhole_mtu,
2753 };
2754 
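/* ipv4_blackhole_route() copies the routing metadata of dst_orig into a
 * fresh dst whose input/output handlers simply discard packets
 * (dst_discard/dst_discard_out), so callers can keep holding a route
 * reference that will never actually transmit anything.
 */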
2755 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2756 {
2757 	struct rtable *ort = (struct rtable *) dst_orig;
2758 	struct rtable *rt;
2759 
2760 	rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2761 	if (rt) {
2762 		struct dst_entry *new = &rt->dst;
2763 
2764 		new->__use = 1;
2765 		new->input = dst_discard;
2766 		new->output = dst_discard_out;
2767 
2768 		new->dev = net->loopback_dev;
2769 		if (new->dev)
2770 			dev_hold(new->dev);
2771 
2772 		rt->rt_is_input = ort->rt_is_input;
2773 		rt->rt_iif = ort->rt_iif;
2774 		rt->rt_pmtu = ort->rt_pmtu;
2775 		rt->rt_mtu_locked = ort->rt_mtu_locked;
2776 
2777 		rt->rt_genid = rt_genid_ipv4(net);
2778 		rt->rt_flags = ort->rt_flags;
2779 		rt->rt_type = ort->rt_type;
2780 		rt->rt_uses_gateway = ort->rt_uses_gateway;
2781 		rt->rt_gw_family = ort->rt_gw_family;
2782 		if (rt->rt_gw_family == AF_INET)
2783 			rt->rt_gw4 = ort->rt_gw4;
2784 		else if (rt->rt_gw_family == AF_INET6)
2785 			rt->rt_gw6 = ort->rt_gw6;
2786 
2787 		INIT_LIST_HEAD(&rt->rt_uncached);
2788 	}
2789 
2790 	dst_release(dst_orig);
2791 
2792 	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2793 }
2794 
2795 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2796 				    const struct sock *sk)
2797 {
2798 	struct rtable *rt = __ip_route_output_key(net, flp4);
2799 
2800 	if (IS_ERR(rt))
2801 		return rt;
2802 
2803 	if (flp4->flowi4_proto) {
2804 		flp4->flowi4_oif = rt->dst.dev->ifindex;
2805 		rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2806 							flowi4_to_flowi(flp4),
2807 							sk, 0);
2808 	}
2809 
2810 	return rt;
2811 }
2812 EXPORT_SYMBOL_GPL(ip_route_output_flow);
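/* Minimal usage sketch (illustrative; the field values are assumptions,
 * not taken from this file):
 *
 *	struct flowi4 fl4 = {
 *		.daddr		= daddr,
 *		.flowi4_proto	= IPPROTO_UDP,
 *	};
 *	struct rtable *rt = ip_route_output_flow(net, &fl4, sk);
 *
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 *	... use rt->dst ...
 *	ip_rt_put(rt);
 */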
2813 
2814 struct rtable *ip_route_output_tunnel(struct sk_buff *skb,
2815 				      struct net_device *dev,
2816 				      struct net *net, __be32 *saddr,
2817 				      const struct ip_tunnel_info *info,
2818 				      u8 protocol, bool use_cache)
2819 {
2820 #ifdef CONFIG_DST_CACHE
2821 	struct dst_cache *dst_cache;
2822 #endif
2823 	struct rtable *rt = NULL;
2824 	struct flowi4 fl4;
2825 	__u8 tos;
2826 
2827 #ifdef CONFIG_DST_CACHE
2828 	dst_cache = (struct dst_cache *)&info->dst_cache;
2829 	if (use_cache) {
2830 		rt = dst_cache_get_ip4(dst_cache, saddr);
2831 		if (rt)
2832 			return rt;
2833 	}
2834 #endif
2835 	memset(&fl4, 0, sizeof(fl4));
2836 	fl4.flowi4_mark = skb->mark;
2837 	fl4.flowi4_proto = protocol;
2838 	fl4.daddr = info->key.u.ipv4.dst;
2839 	fl4.saddr = info->key.u.ipv4.src;
2840 	tos = info->key.tos;
2841 	fl4.flowi4_tos = RT_TOS(tos);
2842 
2843 	rt = ip_route_output_key(net, &fl4);
2844 	if (IS_ERR(rt)) {
2845 		netdev_dbg(dev, "no route to %pI4\n", &fl4.daddr);
2846 		return ERR_PTR(-ENETUNREACH);
2847 	}
2848 	if (rt->dst.dev == dev) { /* is this necessary? */
2849 		netdev_dbg(dev, "circular route to %pI4\n", &fl4.daddr);
2850 		ip_rt_put(rt);
2851 		return ERR_PTR(-ELOOP);
2852 	}
2853 #ifdef CONFIG_DST_CACHE
2854 	if (use_cache)
2855 		dst_cache_set_ip4(dst_cache, &rt->dst, fl4.saddr);
2856 #endif
2857 	*saddr = fl4.saddr;
2858 	return rt;
2859 }
2860 EXPORT_SYMBOL_GPL(ip_route_output_tunnel);
2861 
2862 /* called with rcu_read_lock held */
2863 static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2864 			struct rtable *rt, u32 table_id, struct flowi4 *fl4,
2865 			struct sk_buff *skb, u32 portid, u32 seq,
2866 			unsigned int flags)
2867 {
2868 	struct rtmsg *r;
2869 	struct nlmsghdr *nlh;
2870 	unsigned long expires = 0;
2871 	u32 error;
2872 	u32 metrics[RTAX_MAX];
2873 
2874 	nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), flags);
2875 	if (!nlh)
2876 		return -EMSGSIZE;
2877 
2878 	r = nlmsg_data(nlh);
2879 	r->rtm_family	 = AF_INET;
2880 	r->rtm_dst_len	= 32;
2881 	r->rtm_src_len	= 0;
2882 	r->rtm_tos	= fl4 ? fl4->flowi4_tos : 0;
2883 	r->rtm_table	= table_id < 256 ? table_id : RT_TABLE_COMPAT;
2884 	if (nla_put_u32(skb, RTA_TABLE, table_id))
2885 		goto nla_put_failure;
2886 	r->rtm_type	= rt->rt_type;
2887 	r->rtm_scope	= RT_SCOPE_UNIVERSE;
2888 	r->rtm_protocol = RTPROT_UNSPEC;
2889 	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2890 	if (rt->rt_flags & RTCF_NOTIFY)
2891 		r->rtm_flags |= RTM_F_NOTIFY;
2892 	if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2893 		r->rtm_flags |= RTCF_DOREDIRECT;
2894 
2895 	if (nla_put_in_addr(skb, RTA_DST, dst))
2896 		goto nla_put_failure;
2897 	if (src) {
2898 		r->rtm_src_len = 32;
2899 		if (nla_put_in_addr(skb, RTA_SRC, src))
2900 			goto nla_put_failure;
2901 	}
2902 	if (rt->dst.dev &&
2903 	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2904 		goto nla_put_failure;
2905 #ifdef CONFIG_IP_ROUTE_CLASSID
2906 	if (rt->dst.tclassid &&
2907 	    nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2908 		goto nla_put_failure;
2909 #endif
2910 	if (fl4 && !rt_is_input_route(rt) &&
2911 	    fl4->saddr != src) {
2912 		if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2913 			goto nla_put_failure;
2914 	}
2915 	if (rt->rt_uses_gateway) {
2916 		if (rt->rt_gw_family == AF_INET &&
2917 		    nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) {
2918 			goto nla_put_failure;
2919 		} else if (rt->rt_gw_family == AF_INET6) {
2920 			int alen = sizeof(struct in6_addr);
2921 			struct nlattr *nla;
2922 			struct rtvia *via;
2923 
2924 			nla = nla_reserve(skb, RTA_VIA, alen + 2);
2925 			if (!nla)
2926 				goto nla_put_failure;
2927 
2928 			via = nla_data(nla);
2929 			via->rtvia_family = AF_INET6;
2930 			memcpy(via->rtvia_addr, &rt->rt_gw6, alen);
2931 		}
2932 	}
2933 
2934 	expires = rt->dst.expires;
2935 	if (expires) {
2936 		unsigned long now = jiffies;
2937 
2938 		if (time_before(now, expires))
2939 			expires -= now;
2940 		else
2941 			expires = 0;
2942 	}
2943 
2944 	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2945 	if (rt->rt_pmtu && expires)
2946 		metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2947 	if (rt->rt_mtu_locked && expires)
2948 		metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
2949 	if (rtnetlink_put_metrics(skb, metrics) < 0)
2950 		goto nla_put_failure;
2951 
2952 	if (fl4) {
2953 		if (fl4->flowi4_mark &&
2954 		    nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2955 			goto nla_put_failure;
2956 
2957 		if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2958 		    nla_put_u32(skb, RTA_UID,
2959 				from_kuid_munged(current_user_ns(),
2960 						 fl4->flowi4_uid)))
2961 			goto nla_put_failure;
2962 
2963 		if (rt_is_input_route(rt)) {
2964 #ifdef CONFIG_IP_MROUTE
2965 			if (ipv4_is_multicast(dst) &&
2966 			    !ipv4_is_local_multicast(dst) &&
2967 			    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2968 				int err = ipmr_get_route(net, skb,
2969 							 fl4->saddr, fl4->daddr,
2970 							 r, portid);
2971 
2972 				if (err <= 0) {
2973 					if (err == 0)
2974 						return 0;
2975 					goto nla_put_failure;
2976 				}
2977 			} else
2978 #endif
2979 				if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
2980 					goto nla_put_failure;
2981 		}
2982 	}
2983 
2984 	error = rt->dst.error;
2985 
2986 	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2987 		goto nla_put_failure;
2988 
2989 	nlmsg_end(skb, nlh);
2990 	return 0;
2991 
2992 nla_put_failure:
2993 	nlmsg_cancel(skb, nlh);
2994 	return -EMSGSIZE;
2995 }
2996 
2997 static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb,
2998 			    struct netlink_callback *cb, u32 table_id,
2999 			    struct fnhe_hash_bucket *bucket, int genid,
3000 			    int *fa_index, int fa_start, unsigned int flags)
3001 {
3002 	int i;
3003 
3004 	for (i = 0; i < FNHE_HASH_SIZE; i++) {
3005 		struct fib_nh_exception *fnhe;
3006 
3007 		for (fnhe = rcu_dereference(bucket[i].chain); fnhe;
3008 		     fnhe = rcu_dereference(fnhe->fnhe_next)) {
3009 			struct rtable *rt;
3010 			int err;
3011 
3012 			if (*fa_index < fa_start)
3013 				goto next;
3014 
3015 			if (fnhe->fnhe_genid != genid)
3016 				goto next;
3017 
3018 			if (fnhe->fnhe_expires &&
3019 			    time_after(jiffies, fnhe->fnhe_expires))
3020 				goto next;
3021 
3022 			rt = rcu_dereference(fnhe->fnhe_rth_input);
3023 			if (!rt)
3024 				rt = rcu_dereference(fnhe->fnhe_rth_output);
3025 			if (!rt)
3026 				goto next;
3027 
3028 			err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt,
3029 					   table_id, NULL, skb,
3030 					   NETLINK_CB(cb->skb).portid,
3031 					   cb->nlh->nlmsg_seq, flags);
3032 			if (err)
3033 				return err;
3034 next:
3035 			(*fa_index)++;
3036 		}
3037 	}
3038 
3039 	return 0;
3040 }
3041 
3042 int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
3043 		       u32 table_id, struct fib_info *fi,
3044 		       int *fa_index, int fa_start, unsigned int flags)
3045 {
3046 	struct net *net = sock_net(cb->skb->sk);
3047 	int nhsel, genid = fnhe_genid(net);
3048 
3049 	for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) {
3050 		struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel);
3051 		struct fnhe_hash_bucket *bucket;
3052 		int err;
3053 
3054 		if (nhc->nhc_flags & RTNH_F_DEAD)
3055 			continue;
3056 
3057 		rcu_read_lock();
3058 		bucket = rcu_dereference(nhc->nhc_exceptions);
3059 		err = 0;
3060 		if (bucket)
3061 			err = fnhe_dump_bucket(net, skb, cb, table_id, bucket,
3062 					       genid, fa_index, fa_start,
3063 					       flags);
3064 		rcu_read_unlock();
3065 		if (err)
3066 			return err;
3067 	}
3068 
3069 	return 0;
3070 }
3071 
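/* Builds a synthetic skb carrying dummy IP + L4 headers so that an
 * RTM_GETROUTE request can be pushed through the real input/output lookup
 * paths as if it were an actual packet.
 */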
3072 static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
3073 						   u8 ip_proto, __be16 sport,
3074 						   __be16 dport)
3075 {
3076 	struct sk_buff *skb;
3077 	struct iphdr *iph;
3078 
3079 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3080 	if (!skb)
3081 		return NULL;
3082 
3083 	/* Reserve room for dummy headers; this skb can pass
3084 	 * through a good chunk of the routing engine.
3085 	 */
3086 	skb_reset_mac_header(skb);
3087 	skb_reset_network_header(skb);
3088 	skb->protocol = htons(ETH_P_IP);
3089 	iph = skb_put(skb, sizeof(struct iphdr));
3090 	iph->protocol = ip_proto;
3091 	iph->saddr = src;
3092 	iph->daddr = dst;
3093 	iph->version = 0x4;
3094 	iph->frag_off = 0;
3095 	iph->ihl = 0x5;
3096 	skb_set_transport_header(skb, skb->len);
3097 
3098 	switch (iph->protocol) {
3099 	case IPPROTO_UDP: {
3100 		struct udphdr *udph;
3101 
3102 		udph = skb_put_zero(skb, sizeof(struct udphdr));
3103 		udph->source = sport;
3104 		udph->dest = dport;
3105 		udph->len = htons(sizeof(struct udphdr));
3106 		udph->check = 0;
3107 		break;
3108 	}
3109 	case IPPROTO_TCP: {
3110 		struct tcphdr *tcph;
3111 
3112 		tcph = skb_put_zero(skb, sizeof(struct tcphdr));
3113 		tcph->source	= sport;
3114 		tcph->dest	= dport;
3115 		tcph->doff	= sizeof(struct tcphdr) / 4;
3116 		tcph->rst = 1;
3117 		tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
3118 					    src, dst, 0);
3119 		break;
3120 	}
3121 	case IPPROTO_ICMP: {
3122 		struct icmphdr *icmph;
3123 
3124 		icmph = skb_put_zero(skb, sizeof(struct icmphdr));
3125 		icmph->type = ICMP_ECHO;
3126 		icmph->code = 0;
3127 	}
3128 	}
3129 
3130 	return skb;
3131 }
3132 
3133 static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
3134 				       const struct nlmsghdr *nlh,
3135 				       struct nlattr **tb,
3136 				       struct netlink_ext_ack *extack)
3137 {
3138 	struct rtmsg *rtm;
3139 	int i, err;
3140 
3141 	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
3142 		NL_SET_ERR_MSG(extack,
3143 			       "ipv4: Invalid header for route get request");
3144 		return -EINVAL;
3145 	}
3146 
3147 	if (!netlink_strict_get_check(skb))
3148 		return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
3149 					      rtm_ipv4_policy, extack);
3150 
3151 	rtm = nlmsg_data(nlh);
3152 	if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
3153 	    (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
3154 	    rtm->rtm_table || rtm->rtm_protocol ||
3155 	    rtm->rtm_scope || rtm->rtm_type) {
3156 		NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request");
3157 		return -EINVAL;
3158 	}
3159 
3160 	if (rtm->rtm_flags & ~(RTM_F_NOTIFY |
3161 			       RTM_F_LOOKUP_TABLE |
3162 			       RTM_F_FIB_MATCH)) {
3163 		NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request");
3164 		return -EINVAL;
3165 	}
3166 
3167 	err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
3168 					    rtm_ipv4_policy, extack);
3169 	if (err)
3170 		return err;
3171 
3172 	if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
3173 	    (tb[RTA_DST] && !rtm->rtm_dst_len)) {
3174 		NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4");
3175 		return -EINVAL;
3176 	}
3177 
3178 	for (i = 0; i <= RTA_MAX; i++) {
3179 		if (!tb[i])
3180 			continue;
3181 
3182 		switch (i) {
3183 		case RTA_IIF:
3184 		case RTA_OIF:
3185 		case RTA_SRC:
3186 		case RTA_DST:
3187 		case RTA_IP_PROTO:
3188 		case RTA_SPORT:
3189 		case RTA_DPORT:
3190 		case RTA_MARK:
3191 		case RTA_UID:
3192 			break;
3193 		default:
3194 			NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request");
3195 			return -EINVAL;
3196 		}
3197 	}
3198 
3199 	return 0;
3200 }
3201 
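/* Netlink RTM_GETROUTE handler, exercised from userspace by e.g.
 * "ip route get 203.0.113.1 from 198.51.100.1 iif eth0"
 * (the command and addresses are an illustrative example only).
 */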
3202 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
3203 			     struct netlink_ext_ack *extack)
3204 {
3205 	struct net *net = sock_net(in_skb->sk);
3206 	struct nlattr *tb[RTA_MAX+1];
3207 	u32 table_id = RT_TABLE_MAIN;
3208 	__be16 sport = 0, dport = 0;
3209 	struct fib_result res = {};
3210 	u8 ip_proto = IPPROTO_UDP;
3211 	struct rtable *rt = NULL;
3212 	struct sk_buff *skb;
3213 	struct rtmsg *rtm;
3214 	struct flowi4 fl4 = {};
3215 	__be32 dst = 0;
3216 	__be32 src = 0;
3217 	kuid_t uid;
3218 	u32 iif;
3219 	int err;
3220 	int mark;
3221 
3222 	err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
3223 	if (err < 0)
3224 		return err;
3225 
3226 	rtm = nlmsg_data(nlh);
3227 	src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
3228 	dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
3229 	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3230 	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3231 	if (tb[RTA_UID])
3232 		uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
3233 	else
3234 		uid = (iif ? INVALID_UID : current_uid());
3235 
3236 	if (tb[RTA_IP_PROTO]) {
3237 		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
3238 						  &ip_proto, AF_INET, extack);
3239 		if (err)
3240 			return err;
3241 	}
3242 
3243 	if (tb[RTA_SPORT])
3244 		sport = nla_get_be16(tb[RTA_SPORT]);
3245 
3246 	if (tb[RTA_DPORT])
3247 		dport = nla_get_be16(tb[RTA_DPORT]);
3248 
3249 	skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
3250 	if (!skb)
3251 		return -ENOBUFS;
3252 
3253 	fl4.daddr = dst;
3254 	fl4.saddr = src;
3255 	fl4.flowi4_tos = rtm->rtm_tos & IPTOS_RT_MASK;
3256 	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
3257 	fl4.flowi4_mark = mark;
3258 	fl4.flowi4_uid = uid;
3259 	if (sport)
3260 		fl4.fl4_sport = sport;
3261 	if (dport)
3262 		fl4.fl4_dport = dport;
3263 	fl4.flowi4_proto = ip_proto;
3264 
3265 	rcu_read_lock();
3266 
3267 	if (iif) {
3268 		struct net_device *dev;
3269 
3270 		dev = dev_get_by_index_rcu(net, iif);
3271 		if (!dev) {
3272 			err = -ENODEV;
3273 			goto errout_rcu;
3274 		}
3275 
3276 		fl4.flowi4_iif = iif; /* for rt_fill_info */
3277 		skb->dev	= dev;
3278 		skb->mark	= mark;
3279 		err = ip_route_input_rcu(skb, dst, src,
3280 					 rtm->rtm_tos & IPTOS_RT_MASK, dev,
3281 					 &res);
3282 
3283 		rt = skb_rtable(skb);
3284 		if (err == 0 && rt->dst.error)
3285 			err = -rt->dst.error;
3286 	} else {
3287 		fl4.flowi4_iif = LOOPBACK_IFINDEX;
3288 		skb->dev = net->loopback_dev;
3289 		rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
3290 		err = 0;
3291 		if (IS_ERR(rt))
3292 			err = PTR_ERR(rt);
3293 		else
3294 			skb_dst_set(skb, &rt->dst);
3295 	}
3296 
3297 	if (err)
3298 		goto errout_rcu;
3299 
3300 	if (rtm->rtm_flags & RTM_F_NOTIFY)
3301 		rt->rt_flags |= RTCF_NOTIFY;
3302 
3303 	if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
3304 		table_id = res.table ? res.table->tb_id : 0;
3305 
3306 	/* reset skb for netlink reply msg */
3307 	skb_trim(skb, 0);
3308 	skb_reset_network_header(skb);
3309 	skb_reset_transport_header(skb);
3310 	skb_reset_mac_header(skb);
3311 
3312 	if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
3313 		struct fib_rt_info fri;
3314 
3315 		if (!res.fi) {
3316 			err = fib_props[res.type].error;
3317 			if (!err)
3318 				err = -EHOSTUNREACH;
3319 			goto errout_rcu;
3320 		}
3321 		fri.fi = res.fi;
3322 		fri.tb_id = table_id;
3323 		fri.dst = res.prefix;
3324 		fri.dst_len = res.prefixlen;
3325 		fri.tos = fl4.flowi4_tos;
3326 		fri.type = rt->rt_type;
3327 		fri.offload = 0;
3328 		fri.trap = 0;
3329 		if (res.fa_head) {
3330 			struct fib_alias *fa;
3331 
3332 			hlist_for_each_entry_rcu(fa, res.fa_head, fa_list) {
3333 				u8 slen = 32 - fri.dst_len;
3334 
3335 				if (fa->fa_slen == slen &&
3336 				    fa->tb_id == fri.tb_id &&
3337 				    fa->fa_tos == fri.tos &&
3338 				    fa->fa_info == res.fi &&
3339 				    fa->fa_type == fri.type) {
3340 					fri.offload = fa->offload;
3341 					fri.trap = fa->trap;
3342 					break;
3343 				}
3344 			}
3345 		}
3346 		err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
3347 				    nlh->nlmsg_seq, RTM_NEWROUTE, &fri, 0);
3348 	} else {
3349 		err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
3350 				   NETLINK_CB(in_skb).portid,
3351 				   nlh->nlmsg_seq, 0);
3352 	}
3353 	if (err < 0)
3354 		goto errout_rcu;
3355 
3356 	rcu_read_unlock();
3357 
3358 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
3359 
3360 errout_free:
3361 	return err;
3362 errout_rcu:
3363 	rcu_read_unlock();
3364 	kfree_skb(skb);
3365 	goto errout_free;
3366 }
3367 
3368 void ip_rt_multicast_event(struct in_device *in_dev)
3369 {
3370 	rt_cache_flush(dev_net(in_dev->dev));
3371 }
3372 
3373 #ifdef CONFIG_SYSCTL
3374 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
3375 static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
3376 static int ip_rt_gc_elasticity __read_mostly	= 8;
3377 static int ip_min_valid_pmtu __read_mostly	= IPV4_MIN_MTU;
3378 
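/* Writing anything to /proc/sys/net/ipv4/route/flush (for example
 * "echo 1 > /proc/sys/net/ipv4/route/flush", an illustrative command)
 * lands in the handler below and invalidates cached routes by bumping the
 * generation counters.
 */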
3379 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
3380 		void *buffer, size_t *lenp, loff_t *ppos)
3381 {
3382 	struct net *net = (struct net *)__ctl->extra1;
3383 
3384 	if (write) {
3385 		rt_cache_flush(net);
3386 		fnhe_genid_bump(net);
3387 		return 0;
3388 	}
3389 
3390 	return -EINVAL;
3391 }
3392 
3393 static struct ctl_table ipv4_route_table[] = {
3394 	{
3395 		.procname	= "gc_thresh",
3396 		.data		= &ipv4_dst_ops.gc_thresh,
3397 		.maxlen		= sizeof(int),
3398 		.mode		= 0644,
3399 		.proc_handler	= proc_dointvec,
3400 	},
3401 	{
3402 		.procname	= "max_size",
3403 		.data		= &ip_rt_max_size,
3404 		.maxlen		= sizeof(int),
3405 		.mode		= 0644,
3406 		.proc_handler	= proc_dointvec,
3407 	},
3408 	{
3409 		/*  Deprecated. Use gc_min_interval_ms */
3410 
3411 		.procname	= "gc_min_interval",
3412 		.data		= &ip_rt_gc_min_interval,
3413 		.maxlen		= sizeof(int),
3414 		.mode		= 0644,
3415 		.proc_handler	= proc_dointvec_jiffies,
3416 	},
3417 	{
3418 		.procname	= "gc_min_interval_ms",
3419 		.data		= &ip_rt_gc_min_interval,
3420 		.maxlen		= sizeof(int),
3421 		.mode		= 0644,
3422 		.proc_handler	= proc_dointvec_ms_jiffies,
3423 	},
3424 	{
3425 		.procname	= "gc_timeout",
3426 		.data		= &ip_rt_gc_timeout,
3427 		.maxlen		= sizeof(int),
3428 		.mode		= 0644,
3429 		.proc_handler	= proc_dointvec_jiffies,
3430 	},
3431 	{
3432 		.procname	= "gc_interval",
3433 		.data		= &ip_rt_gc_interval,
3434 		.maxlen		= sizeof(int),
3435 		.mode		= 0644,
3436 		.proc_handler	= proc_dointvec_jiffies,
3437 	},
3438 	{
3439 		.procname	= "redirect_load",
3440 		.data		= &ip_rt_redirect_load,
3441 		.maxlen		= sizeof(int),
3442 		.mode		= 0644,
3443 		.proc_handler	= proc_dointvec,
3444 	},
3445 	{
3446 		.procname	= "redirect_number",
3447 		.data		= &ip_rt_redirect_number,
3448 		.maxlen		= sizeof(int),
3449 		.mode		= 0644,
3450 		.proc_handler	= proc_dointvec,
3451 	},
3452 	{
3453 		.procname	= "redirect_silence",
3454 		.data		= &ip_rt_redirect_silence,
3455 		.maxlen		= sizeof(int),
3456 		.mode		= 0644,
3457 		.proc_handler	= proc_dointvec,
3458 	},
3459 	{
3460 		.procname	= "error_cost",
3461 		.data		= &ip_rt_error_cost,
3462 		.maxlen		= sizeof(int),
3463 		.mode		= 0644,
3464 		.proc_handler	= proc_dointvec,
3465 	},
3466 	{
3467 		.procname	= "error_burst",
3468 		.data		= &ip_rt_error_burst,
3469 		.maxlen		= sizeof(int),
3470 		.mode		= 0644,
3471 		.proc_handler	= proc_dointvec,
3472 	},
3473 	{
3474 		.procname	= "gc_elasticity",
3475 		.data		= &ip_rt_gc_elasticity,
3476 		.maxlen		= sizeof(int),
3477 		.mode		= 0644,
3478 		.proc_handler	= proc_dointvec,
3479 	},
3480 	{
3481 		.procname	= "mtu_expires",
3482 		.data		= &ip_rt_mtu_expires,
3483 		.maxlen		= sizeof(int),
3484 		.mode		= 0644,
3485 		.proc_handler	= proc_dointvec_jiffies,
3486 	},
3487 	{
3488 		.procname	= "min_pmtu",
3489 		.data		= &ip_rt_min_pmtu,
3490 		.maxlen		= sizeof(int),
3491 		.mode		= 0644,
3492 		.proc_handler	= proc_dointvec_minmax,
3493 		.extra1		= &ip_min_valid_pmtu,
3494 	},
3495 	{
3496 		.procname	= "min_adv_mss",
3497 		.data		= &ip_rt_min_advmss,
3498 		.maxlen		= sizeof(int),
3499 		.mode		= 0644,
3500 		.proc_handler	= proc_dointvec,
3501 	},
3502 	{ }
3503 };
3504 
3505 static const char ipv4_route_flush_procname[] = "flush";
3506 
3507 static struct ctl_table ipv4_route_flush_table[] = {
3508 	{
3509 		.procname	= ipv4_route_flush_procname,
3510 		.maxlen		= sizeof(int),
3511 		.mode		= 0200,
3512 		.proc_handler	= ipv4_sysctl_rtcache_flush,
3513 	},
3514 	{ },
3515 };
3516 
3517 static __net_init int sysctl_route_net_init(struct net *net)
3518 {
3519 	struct ctl_table *tbl;
3520 
3521 	tbl = ipv4_route_flush_table;
3522 	if (!net_eq(net, &init_net)) {
3523 		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3524 		if (!tbl)
3525 			goto err_dup;
3526 
3527 		/* Don't export non-whitelisted sysctls to unprivileged users */
3528 		if (net->user_ns != &init_user_ns) {
3529 			if (tbl[0].procname != ipv4_route_flush_procname)
3530 				tbl[0].procname = NULL;
3531 		}
3532 	}
3533 	tbl[0].extra1 = net;
3534 
3535 	net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
3536 	if (!net->ipv4.route_hdr)
3537 		goto err_reg;
3538 	return 0;
3539 
3540 err_reg:
3541 	if (tbl != ipv4_route_flush_table)
3542 		kfree(tbl);
3543 err_dup:
3544 	return -ENOMEM;
3545 }
3546 
3547 static __net_exit void sysctl_route_net_exit(struct net *net)
3548 {
3549 	struct ctl_table *tbl;
3550 
3551 	tbl = net->ipv4.route_hdr->ctl_table_arg;
3552 	unregister_net_sysctl_table(net->ipv4.route_hdr);
3553 	BUG_ON(tbl == ipv4_route_flush_table);
3554 	kfree(tbl);
3555 }
3556 
3557 static __net_initdata struct pernet_operations sysctl_route_ops = {
3558 	.init = sysctl_route_net_init,
3559 	.exit = sysctl_route_net_exit,
3560 };
3561 #endif
3562 
3563 static __net_init int rt_genid_init(struct net *net)
3564 {
3565 	atomic_set(&net->ipv4.rt_genid, 0);
3566 	atomic_set(&net->fnhe_genid, 0);
3567 	atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
3568 	return 0;
3569 }
3570 
3571 static __net_initdata struct pernet_operations rt_genid_ops = {
3572 	.init = rt_genid_init,
3573 };
3574 
3575 static int __net_init ipv4_inetpeer_init(struct net *net)
3576 {
3577 	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3578 
3579 	if (!bp)
3580 		return -ENOMEM;
3581 	inet_peer_base_init(bp);
3582 	net->ipv4.peers = bp;
3583 	return 0;
3584 }
3585 
3586 static void __net_exit ipv4_inetpeer_exit(struct net *net)
3587 {
3588 	struct inet_peer_base *bp = net->ipv4.peers;
3589 
3590 	net->ipv4.peers = NULL;
3591 	inetpeer_invalidate_tree(bp);
3592 	kfree(bp);
3593 }
3594 
3595 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3596 	.init	=	ipv4_inetpeer_init,
3597 	.exit	=	ipv4_inetpeer_exit,
3598 };
3599 
3600 #ifdef CONFIG_IP_ROUTE_CLASSID
3601 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3602 #endif /* CONFIG_IP_ROUTE_CLASSID */
3603 
3604 int __init ip_rt_init(void)
3605 {
3606 	void *idents_hash;
3607 	int cpu;
3608 
3609 	/* For modern hosts, this will use 2 MB of memory */
3610 	idents_hash = alloc_large_system_hash("IP idents",
3611 					      sizeof(*ip_idents) + sizeof(*ip_tstamps),
3612 					      0,
3613 					      16, /* one bucket per 64 KB */
3614 					      HASH_ZERO,
3615 					      NULL,
3616 					      &ip_idents_mask,
3617 					      2048,
3618 					      256*1024);
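	/* Sanity check of the "2 MB" note above, assuming 4-byte idents and
	 * 4-byte timestamps: 256 * 1024 slots * 8 bytes per slot = 2 MB.
	 */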
3619 
3620 	ip_idents = idents_hash;
3621 
3622 	prandom_bytes(ip_idents, (ip_idents_mask + 1) * sizeof(*ip_idents));
3623 
3624 	ip_tstamps = idents_hash + (ip_idents_mask + 1) * sizeof(*ip_idents);
3625 
3626 	for_each_possible_cpu(cpu) {
3627 		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3628 
3629 		INIT_LIST_HEAD(&ul->head);
3630 		spin_lock_init(&ul->lock);
3631 	}
3632 #ifdef CONFIG_IP_ROUTE_CLASSID
3633 	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3634 	if (!ip_rt_acct)
3635 		panic("IP: failed to allocate ip_rt_acct\n");
3636 #endif
3637 
3638 	ipv4_dst_ops.kmem_cachep =
3639 		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3640 				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3641 
3642 	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3643 
3644 	if (dst_entries_init(&ipv4_dst_ops) < 0)
3645 		panic("IP: failed to allocate ipv4_dst_ops counter\n");
3646 
3647 	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3648 		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3649 
3650 	ipv4_dst_ops.gc_thresh = ~0;
3651 	ip_rt_max_size = INT_MAX;
3652 
3653 	devinet_init();
3654 	ip_fib_init();
3655 
3656 	if (ip_rt_proc_init())
3657 		pr_err("Unable to create route proc files\n");
3658 #ifdef CONFIG_XFRM
3659 	xfrm_init();
3660 	xfrm4_init();
3661 #endif
3662 	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
3663 		      RTNL_FLAG_DOIT_UNLOCKED);
3664 
3665 #ifdef CONFIG_SYSCTL
3666 	register_pernet_subsys(&sysctl_route_ops);
3667 #endif
3668 	register_pernet_subsys(&rt_genid_ops);
3669 	register_pernet_subsys(&ipv4_inetpeer_ops);
3670 	return 0;
3671 }
3672 
3673 #ifdef CONFIG_SYSCTL
3674 /*
3675  * We really need to sanitize the damn ipv4 init order, then all
3676  * this nonsense will go away.
3677  */
3678 void __init ip_static_sysctl_init(void)
3679 {
3680 	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3681 }
3682 #endif
3683