1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		ROUTE - implementation of the IP router.
7  *
8  * Authors:	Ross Biro
9  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *		Alan Cox	:	Verify area fixes.
16  *		Alan Cox	:	cli() protects routing changes
17  *		Rui Oliveira	:	ICMP routing table updates
18  *		(rco@di.uminho.pt)	Routing table insertion and update
19  *		Linus Torvalds	:	Rewrote bits to be sensible
20  *		Alan Cox	:	Added BSD route gw semantics
21  *		Alan Cox	:	Super /proc >4K
22  *		Alan Cox	:	MTU in route table
23  *		Alan Cox	: 	MSS actually. Also added the window
24  *					clamper.
25  *		Sam Lantinga	:	Fixed route matching in rt_del()
26  *		Alan Cox	:	Routing cache support.
27  *		Alan Cox	:	Removed compatibility cruft.
28  *		Alan Cox	:	RTF_REJECT support.
29  *		Alan Cox	:	TCP irtt support.
30  *		Jonathan Naylor	:	Added Metric support.
31  *	Miquel van Smoorenburg	:	BSD API fixes.
32  *	Miquel van Smoorenburg	:	Metrics.
33  *		Alan Cox	:	Use __u32 properly
34  *		Alan Cox	:	Aligned routing errors more closely with BSD
35  *					our system is still very different.
36  *		Alan Cox	:	Faster /proc handling
37  *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
38  *					routing caches and better behaviour.
39  *
40  *		Olaf Erb	:	irtt wasn't being copied right.
41  *		Bjorn Ekwall	:	Kerneld route support.
42  *		Alan Cox	:	Multicast fixed (I hope)
43  * 		Pavel Krauz	:	Limited broadcast fixed
44  *		Mike McLagan	:	Routing by source
45  *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
46  *					route.c and rewritten from scratch.
47  *		Andi Kleen	:	Load-limit warning messages.
48  *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
49  *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
50  *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
51  *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
52  *		Marc Boucher	:	routing by fwmark
53  *	Robert Olsson		:	Added rt_cache statistics
54  *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
55  *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
56  * 	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
57  * 	Ilia Sotnikov		:	Removed TOS from hash calculations
58  *
59  *		This program is free software; you can redistribute it and/or
60  *		modify it under the terms of the GNU General Public License
61  *		as published by the Free Software Foundation; either version
62  *		2 of the License, or (at your option) any later version.
63  */
64 
65 #define pr_fmt(fmt) "IPv4: " fmt
66 
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <linux/bitops.h>
70 #include <linux/types.h>
71 #include <linux/kernel.h>
72 #include <linux/mm.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/skbuff.h>
83 #include <linux/inetdevice.h>
84 #include <linux/igmp.h>
85 #include <linux/pkt_sched.h>
86 #include <linux/mroute.h>
87 #include <linux/netfilter_ipv4.h>
88 #include <linux/random.h>
89 #include <linux/rcupdate.h>
90 #include <linux/times.h>
91 #include <linux/slab.h>
92 #include <linux/jhash.h>
93 #include <net/dst.h>
94 #include <net/dst_metadata.h>
95 #include <net/net_namespace.h>
96 #include <net/protocol.h>
97 #include <net/ip.h>
98 #include <net/route.h>
99 #include <net/inetpeer.h>
100 #include <net/sock.h>
101 #include <net/ip_fib.h>
102 #include <net/arp.h>
103 #include <net/tcp.h>
104 #include <net/icmp.h>
105 #include <net/xfrm.h>
106 #include <net/lwtunnel.h>
107 #include <net/netevent.h>
108 #include <net/rtnetlink.h>
109 #ifdef CONFIG_SYSCTL
110 #include <linux/sysctl.h>
111 #include <linux/kmemleak.h>
112 #endif
113 #include <net/secure_seq.h>
114 #include <net/ip_tunnels.h>
115 #include <net/l3mdev.h>
116 
117 #define RT_FL_TOS(oldflp4) \
118 	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
119 
120 #define RT_GC_TIMEOUT (300*HZ)
121 
122 static int ip_rt_max_size;
123 static int ip_rt_redirect_number __read_mostly	= 9;
124 static int ip_rt_redirect_load __read_mostly	= HZ / 50;
125 static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
126 static int ip_rt_error_cost __read_mostly	= HZ;
127 static int ip_rt_error_burst __read_mostly	= 5 * HZ;
128 static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
129 static u32 ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
130 static int ip_rt_min_advmss __read_mostly	= 256;
131 
132 static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
133 
134 static int ip_min_valid_pmtu __read_mostly	= IPV4_MIN_MTU;
135 
136 /*
137  *	Interface to generic destination cache.
138  */
139 
140 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
141 static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
142 static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
143 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
144 static void		 ipv4_link_failure(struct sk_buff *skb);
145 static void		 ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
146 					   struct sk_buff *skb, u32 mtu);
147 static void		 ip_do_redirect(struct dst_entry *dst, struct sock *sk,
148 					struct sk_buff *skb);
149 static void		ipv4_dst_destroy(struct dst_entry *dst);
150 
151 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
152 {
153 	WARN_ON(1);
154 	return NULL;
155 }
156 
157 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
158 					   struct sk_buff *skb,
159 					   const void *daddr);
160 
161 static struct dst_ops ipv4_dst_ops = {
162 	.family =		AF_INET,
163 	.check =		ipv4_dst_check,
164 	.default_advmss =	ipv4_default_advmss,
165 	.mtu =			ipv4_mtu,
166 	.cow_metrics =		ipv4_cow_metrics,
167 	.destroy =		ipv4_dst_destroy,
168 	.negative_advice =	ipv4_negative_advice,
169 	.link_failure =		ipv4_link_failure,
170 	.update_pmtu =		ip_rt_update_pmtu,
171 	.redirect =		ip_do_redirect,
172 	.local_out =		__ip_local_out,
173 	.neigh_lookup =		ipv4_neigh_lookup,
174 };
175 
176 #define ECN_OR_COST(class)	TC_PRIO_##class
177 
178 const __u8 ip_tos2prio[16] = {
179 	TC_PRIO_BESTEFFORT,
180 	ECN_OR_COST(BESTEFFORT),
181 	TC_PRIO_BESTEFFORT,
182 	ECN_OR_COST(BESTEFFORT),
183 	TC_PRIO_BULK,
184 	ECN_OR_COST(BULK),
185 	TC_PRIO_BULK,
186 	ECN_OR_COST(BULK),
187 	TC_PRIO_INTERACTIVE,
188 	ECN_OR_COST(INTERACTIVE),
189 	TC_PRIO_INTERACTIVE,
190 	ECN_OR_COST(INTERACTIVE),
191 	TC_PRIO_INTERACTIVE_BULK,
192 	ECN_OR_COST(INTERACTIVE_BULK),
193 	TC_PRIO_INTERACTIVE_BULK,
194 	ECN_OR_COST(INTERACTIVE_BULK)
195 };
196 EXPORT_SYMBOL(ip_tos2prio);
197 
198 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
199 #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
200 
201 #ifdef CONFIG_PROC_FS
202 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
203 {
204 	if (*pos)
205 		return NULL;
206 	return SEQ_START_TOKEN;
207 }
208 
209 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
210 {
211 	++*pos;
212 	return NULL;
213 }
214 
215 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
216 {
217 }
218 
219 static int rt_cache_seq_show(struct seq_file *seq, void *v)
220 {
221 	if (v == SEQ_START_TOKEN)
222 		seq_printf(seq, "%-127s\n",
223 			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
224 			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
225 			   "HHUptod\tSpecDst");
226 	return 0;
227 }
228 
229 static const struct seq_operations rt_cache_seq_ops = {
230 	.start  = rt_cache_seq_start,
231 	.next   = rt_cache_seq_next,
232 	.stop   = rt_cache_seq_stop,
233 	.show   = rt_cache_seq_show,
234 };
235 
236 static int rt_cache_seq_open(struct inode *inode, struct file *file)
237 {
238 	return seq_open(file, &rt_cache_seq_ops);
239 }
240 
241 static const struct file_operations rt_cache_seq_fops = {
242 	.owner	 = THIS_MODULE,
243 	.open	 = rt_cache_seq_open,
244 	.read	 = seq_read,
245 	.llseek	 = seq_lseek,
246 	.release = seq_release,
247 };
248 
249 
250 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
251 {
252 	int cpu;
253 
254 	if (*pos == 0)
255 		return SEQ_START_TOKEN;
256 
257 	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
258 		if (!cpu_possible(cpu))
259 			continue;
260 		*pos = cpu+1;
261 		return &per_cpu(rt_cache_stat, cpu);
262 	}
263 	return NULL;
264 }
265 
266 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
267 {
268 	int cpu;
269 
270 	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
271 		if (!cpu_possible(cpu))
272 			continue;
273 		*pos = cpu+1;
274 		return &per_cpu(rt_cache_stat, cpu);
275 	}
276 	return NULL;
277 
278 }
279 
280 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
281 {
282 
283 }
284 
285 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
286 {
287 	struct rt_cache_stat *st = v;
288 
289 	if (v == SEQ_START_TOKEN) {
290 		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
291 		return 0;
292 	}
293 
294 	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
295 		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
296 		   dst_entries_get_slow(&ipv4_dst_ops),
297 		   0, /* st->in_hit */
298 		   st->in_slow_tot,
299 		   st->in_slow_mc,
300 		   st->in_no_route,
301 		   st->in_brd,
302 		   st->in_martian_dst,
303 		   st->in_martian_src,
304 
305 		   0, /* st->out_hit */
306 		   st->out_slow_tot,
307 		   st->out_slow_mc,
308 
309 		   0, /* st->gc_total */
310 		   0, /* st->gc_ignored */
311 		   0, /* st->gc_goal_miss */
312 		   0, /* st->gc_dst_overflow */
313 		   0, /* st->in_hlist_search */
314 		   0  /* st->out_hlist_search */
315 		);
316 	return 0;
317 }
318 
319 static const struct seq_operations rt_cpu_seq_ops = {
320 	.start  = rt_cpu_seq_start,
321 	.next   = rt_cpu_seq_next,
322 	.stop   = rt_cpu_seq_stop,
323 	.show   = rt_cpu_seq_show,
324 };
325 
326 
327 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
328 {
329 	return seq_open(file, &rt_cpu_seq_ops);
330 }
331 
332 static const struct file_operations rt_cpu_seq_fops = {
333 	.owner	 = THIS_MODULE,
334 	.open	 = rt_cpu_seq_open,
335 	.read	 = seq_read,
336 	.llseek	 = seq_lseek,
337 	.release = seq_release,
338 };
339 
340 #ifdef CONFIG_IP_ROUTE_CLASSID
341 static int rt_acct_proc_show(struct seq_file *m, void *v)
342 {
343 	struct ip_rt_acct *dst, *src;
344 	unsigned int i, j;
345 
346 	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
347 	if (!dst)
348 		return -ENOMEM;
349 
350 	for_each_possible_cpu(i) {
351 		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
352 		for (j = 0; j < 256; j++) {
353 			dst[j].o_bytes   += src[j].o_bytes;
354 			dst[j].o_packets += src[j].o_packets;
355 			dst[j].i_bytes   += src[j].i_bytes;
356 			dst[j].i_packets += src[j].i_packets;
357 		}
358 	}
359 
360 	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
361 	kfree(dst);
362 	return 0;
363 }
364 
365 static int rt_acct_proc_open(struct inode *inode, struct file *file)
366 {
367 	return single_open(file, rt_acct_proc_show, NULL);
368 }
369 
370 static const struct file_operations rt_acct_proc_fops = {
371 	.owner		= THIS_MODULE,
372 	.open		= rt_acct_proc_open,
373 	.read		= seq_read,
374 	.llseek		= seq_lseek,
375 	.release	= single_release,
376 };
377 #endif
378 
379 static int __net_init ip_rt_do_proc_init(struct net *net)
380 {
381 	struct proc_dir_entry *pde;
382 
383 	pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
384 			  &rt_cache_seq_fops);
385 	if (!pde)
386 		goto err1;
387 
388 	pde = proc_create("rt_cache", S_IRUGO,
389 			  net->proc_net_stat, &rt_cpu_seq_fops);
390 	if (!pde)
391 		goto err2;
392 
393 #ifdef CONFIG_IP_ROUTE_CLASSID
394 	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
395 	if (!pde)
396 		goto err3;
397 #endif
398 	return 0;
399 
400 #ifdef CONFIG_IP_ROUTE_CLASSID
401 err3:
402 	remove_proc_entry("rt_cache", net->proc_net_stat);
403 #endif
404 err2:
405 	remove_proc_entry("rt_cache", net->proc_net);
406 err1:
407 	return -ENOMEM;
408 }
409 
410 static void __net_exit ip_rt_do_proc_exit(struct net *net)
411 {
412 	remove_proc_entry("rt_cache", net->proc_net_stat);
413 	remove_proc_entry("rt_cache", net->proc_net);
414 #ifdef CONFIG_IP_ROUTE_CLASSID
415 	remove_proc_entry("rt_acct", net->proc_net);
416 #endif
417 }
418 
419 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
420 	.init = ip_rt_do_proc_init,
421 	.exit = ip_rt_do_proc_exit,
422 };
423 
424 static int __init ip_rt_proc_init(void)
425 {
426 	return register_pernet_subsys(&ip_rt_proc_ops);
427 }
428 
429 #else
430 static inline int ip_rt_proc_init(void)
431 {
432 	return 0;
433 }
434 #endif /* CONFIG_PROC_FS */
435 
436 static inline bool rt_is_expired(const struct rtable *rth)
437 {
438 	return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
439 }
440 
441 void rt_cache_flush(struct net *net)
442 {
443 	rt_genid_bump_ipv4(net);
444 }
445 
446 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
447 					   struct sk_buff *skb,
448 					   const void *daddr)
449 {
450 	struct net_device *dev = dst->dev;
451 	const __be32 *pkey = daddr;
452 	const struct rtable *rt;
453 	struct neighbour *n;
454 
455 	rt = (const struct rtable *) dst;
456 	if (rt->rt_gateway)
457 		pkey = (const __be32 *) &rt->rt_gateway;
458 	else if (skb)
459 		pkey = &ip_hdr(skb)->daddr;
460 
461 	n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
462 	if (n)
463 		return n;
464 	return neigh_create(&arp_tbl, pkey, dev);
465 }
466 
467 #define IP_IDENTS_SZ 2048u
468 
469 static atomic_t *ip_idents __read_mostly;
470 static u32 *ip_tstamps __read_mostly;
471 
472 /* In order to protect privacy, we add a perturbation to identifiers
473  * if one generator is seldom used. This makes hard for an attacker
474  * to infer how many packets were sent between two points in time.
475  */
476 u32 ip_idents_reserve(u32 hash, int segs)
477 {
478 	u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
479 	atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
480 	u32 old = ACCESS_ONCE(*p_tstamp);
481 	u32 now = (u32)jiffies;
482 	u32 new, delta = 0;
483 
484 	if (old != now && cmpxchg(p_tstamp, old, now) == old)
485 		delta = prandom_u32_max(now - old);
486 
487 	/* Do not use atomic_add_return() as it makes UBSAN unhappy */
488 	do {
489 		old = (u32)atomic_read(p_id);
490 		new = old + delta + segs;
491 	} while (atomic_cmpxchg(p_id, old, new) != old);
492 
493 	return new - segs;
494 }
495 EXPORT_SYMBOL(ip_idents_reserve);
496 
497 void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
498 {
499 	static u32 ip_idents_hashrnd __read_mostly;
500 	u32 hash, id;
501 
502 	net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));
503 
504 	hash = jhash_3words((__force u32)iph->daddr,
505 			    (__force u32)iph->saddr,
506 			    iph->protocol ^ net_hash_mix(net),
507 			    ip_idents_hashrnd);
508 	id = ip_idents_reserve(hash, segs);
509 	iph->id = htons(id);
510 }
511 EXPORT_SYMBOL(__ip_select_ident);
512 
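/* Derive the flowi4 lookup key from the packet's IP header; a supplied
 * socket overrides oif, mark, TOS and protocol with its own values.
 */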
513 static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
514 			     const struct sock *sk,
515 			     const struct iphdr *iph,
516 			     int oif, u8 tos,
517 			     u8 prot, u32 mark, int flow_flags)
518 {
519 	if (sk) {
520 		const struct inet_sock *inet = inet_sk(sk);
521 
522 		oif = sk->sk_bound_dev_if;
523 		mark = sk->sk_mark;
524 		tos = RT_CONN_FLAGS(sk);
525 		prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
526 	}
527 	flowi4_init_output(fl4, oif, mark, tos,
528 			   RT_SCOPE_UNIVERSE, prot,
529 			   flow_flags,
530 			   iph->daddr, iph->saddr, 0, 0,
531 			   sock_net_uid(net, sk));
532 }
533 
534 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
535 			       const struct sock *sk)
536 {
537 	const struct net *net = dev_net(skb->dev);
538 	const struct iphdr *iph = ip_hdr(skb);
539 	int oif = skb->dev->ifindex;
540 	u8 tos = RT_TOS(iph->tos);
541 	u8 prot = iph->protocol;
542 	u32 mark = skb->mark;
543 
544 	__build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
545 }
546 
547 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
548 {
549 	const struct inet_sock *inet = inet_sk(sk);
550 	const struct ip_options_rcu *inet_opt;
551 	__be32 daddr = inet->inet_daddr;
552 
553 	rcu_read_lock();
554 	inet_opt = rcu_dereference(inet->inet_opt);
555 	if (inet_opt && inet_opt->opt.srr)
556 		daddr = inet_opt->opt.faddr;
557 	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
558 			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
559 			   inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
560 			   inet_sk_flowi_flags(sk),
561 			   daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
562 	rcu_read_unlock();
563 }
564 
565 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
566 				 const struct sk_buff *skb)
567 {
568 	if (skb)
569 		build_skb_flow_key(fl4, skb, sk);
570 	else
571 		build_sk_flow_key(fl4, sk);
572 }
573 
574 static inline void rt_free(struct rtable *rt)
575 {
576 	call_rcu(&rt->dst.rcu_head, dst_rcu_free);
577 }
578 
579 static DEFINE_SPINLOCK(fnhe_lock);
580 
581 static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
582 {
583 	struct rtable *rt;
584 
585 	rt = rcu_dereference(fnhe->fnhe_rth_input);
586 	if (rt) {
587 		RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
588 		rt_free(rt);
589 	}
590 	rt = rcu_dereference(fnhe->fnhe_rth_output);
591 	if (rt) {
592 		RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
593 		rt_free(rt);
594 	}
595 }
596 
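/* Return the least recently stamped exception in this bucket, flushing the
 * routes cached on it so the entry can be reused.
 */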
597 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
598 {
599 	struct fib_nh_exception *fnhe, *oldest;
600 
601 	oldest = rcu_dereference(hash->chain);
602 	for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
603 	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
604 		if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
605 			oldest = fnhe;
606 	}
607 	fnhe_flush_routes(oldest);
608 	return oldest;
609 }
610 
611 static inline u32 fnhe_hashfun(__be32 daddr)
612 {
613 	static u32 fnhe_hashrnd __read_mostly;
614 	u32 hval;
615 
616 	net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
617 	hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
618 	return hash_32(hval, FNHE_HASH_SHIFT);
619 }
620 
621 static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
622 {
623 	rt->rt_pmtu = fnhe->fnhe_pmtu;
624 	rt->dst.expires = fnhe->fnhe_expires;
625 
626 	if (fnhe->fnhe_gw) {
627 		rt->rt_flags |= RTCF_REDIRECTED;
628 		rt->rt_gateway = fnhe->fnhe_gw;
629 		rt->rt_uses_gateway = 1;
630 	}
631 }
632 
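/* Record (or refresh) a per-nexthop exception for daddr, carrying a learned
 * redirect gateway and/or PMTU together with its expiry time.
 */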
633 static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
634 				  u32 pmtu, unsigned long expires)
635 {
636 	struct fnhe_hash_bucket *hash;
637 	struct fib_nh_exception *fnhe;
638 	struct rtable *rt;
639 	u32 genid, hval;
640 	unsigned int i;
641 	int depth;
642 
643 	genid = fnhe_genid(dev_net(nh->nh_dev));
644 	hval = fnhe_hashfun(daddr);
645 
646 	spin_lock_bh(&fnhe_lock);
647 
648 	hash = rcu_dereference(nh->nh_exceptions);
649 	if (!hash) {
650 		hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
651 		if (!hash)
652 			goto out_unlock;
653 		rcu_assign_pointer(nh->nh_exceptions, hash);
654 	}
655 
656 	hash += hval;
657 
658 	depth = 0;
659 	for (fnhe = rcu_dereference(hash->chain); fnhe;
660 	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
661 		if (fnhe->fnhe_daddr == daddr)
662 			break;
663 		depth++;
664 	}
665 
666 	if (fnhe) {
667 		if (fnhe->fnhe_genid != genid)
668 			fnhe->fnhe_genid = genid;
669 		if (gw)
670 			fnhe->fnhe_gw = gw;
671 		if (pmtu)
672 			fnhe->fnhe_pmtu = pmtu;
673 		fnhe->fnhe_expires = max(1UL, expires);
674 		/* Update all cached dsts too */
675 		rt = rcu_dereference(fnhe->fnhe_rth_input);
676 		if (rt)
677 			fill_route_from_fnhe(rt, fnhe);
678 		rt = rcu_dereference(fnhe->fnhe_rth_output);
679 		if (rt)
680 			fill_route_from_fnhe(rt, fnhe);
681 	} else {
682 		if (depth > FNHE_RECLAIM_DEPTH)
683 			fnhe = fnhe_oldest(hash);
684 		else {
685 			fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
686 			if (!fnhe)
687 				goto out_unlock;
688 
689 			fnhe->fnhe_next = hash->chain;
690 			rcu_assign_pointer(hash->chain, fnhe);
691 		}
692 		fnhe->fnhe_genid = genid;
693 		fnhe->fnhe_daddr = daddr;
694 		fnhe->fnhe_gw = gw;
695 		fnhe->fnhe_pmtu = pmtu;
696 		fnhe->fnhe_expires = expires;
697 
698 		/* Exception created; mark the cached routes for the nexthop
699 		 * stale, so anyone caching it rechecks if this exception
700 		 * applies to them.
701 		 */
702 		rt = rcu_dereference(nh->nh_rth_input);
703 		if (rt)
704 			rt->dst.obsolete = DST_OBSOLETE_KILL;
705 
706 		for_each_possible_cpu(i) {
707 			struct rtable __rcu **prt;
708 			prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
709 			rt = rcu_dereference(*prt);
710 			if (rt)
711 				rt->dst.obsolete = DST_OBSOLETE_KILL;
712 		}
713 	}
714 
715 	fnhe->fnhe_stamp = jiffies;
716 
717 out_unlock:
718 	spin_unlock_bh(&fnhe_lock);
719 }
720 
721 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
722 			     bool kill_route)
723 {
724 	__be32 new_gw = icmp_hdr(skb)->un.gateway;
725 	__be32 old_gw = ip_hdr(skb)->saddr;
726 	struct net_device *dev = skb->dev;
727 	struct in_device *in_dev;
728 	struct fib_result res;
729 	struct neighbour *n;
730 	struct net *net;
731 
732 	switch (icmp_hdr(skb)->code & 7) {
733 	case ICMP_REDIR_NET:
734 	case ICMP_REDIR_NETTOS:
735 	case ICMP_REDIR_HOST:
736 	case ICMP_REDIR_HOSTTOS:
737 		break;
738 
739 	default:
740 		return;
741 	}
742 
743 	if (rt->rt_gateway != old_gw)
744 		return;
745 
746 	in_dev = __in_dev_get_rcu(dev);
747 	if (!in_dev)
748 		return;
749 
750 	net = dev_net(dev);
751 	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
752 	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
753 	    ipv4_is_zeronet(new_gw))
754 		goto reject_redirect;
755 
756 	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
757 		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
758 			goto reject_redirect;
759 		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
760 			goto reject_redirect;
761 	} else {
762 		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
763 			goto reject_redirect;
764 	}
765 
766 	n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
767 	if (!n)
768 		n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
769 	if (!IS_ERR(n)) {
770 		if (!(n->nud_state & NUD_VALID)) {
771 			neigh_event_send(n, NULL);
772 		} else {
773 			if (fib_lookup(net, fl4, &res, 0) == 0) {
774 				struct fib_nh *nh = &FIB_RES_NH(res);
775 
776 				update_or_create_fnhe(nh, fl4->daddr, new_gw,
777 						0, jiffies + ip_rt_gc_timeout);
778 			}
779 			if (kill_route)
780 				rt->dst.obsolete = DST_OBSOLETE_KILL;
781 			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
782 		}
783 		neigh_release(n);
784 	}
785 	return;
786 
787 reject_redirect:
788 #ifdef CONFIG_IP_ROUTE_VERBOSE
789 	if (IN_DEV_LOG_MARTIANS(in_dev)) {
790 		const struct iphdr *iph = (const struct iphdr *) skb->data;
791 		__be32 daddr = iph->daddr;
792 		__be32 saddr = iph->saddr;
793 
794 		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
795 				     "  Advised path = %pI4 -> %pI4\n",
796 				     &old_gw, dev->name, &new_gw,
797 				     &saddr, &daddr);
798 	}
799 #endif
800 	;
801 }
802 
803 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
804 {
805 	struct rtable *rt;
806 	struct flowi4 fl4;
807 	const struct iphdr *iph = (const struct iphdr *) skb->data;
808 	struct net *net = dev_net(skb->dev);
809 	int oif = skb->dev->ifindex;
810 	u8 tos = RT_TOS(iph->tos);
811 	u8 prot = iph->protocol;
812 	u32 mark = skb->mark;
813 
814 	rt = (struct rtable *) dst;
815 
816 	__build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
817 	__ip_do_redirect(rt, skb, &fl4, true);
818 }
819 
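/* Release a route the caller no longer trusts: obsolete entries and routes
 * carrying a redirect or an expiry are dropped outright.
 */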
820 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
821 {
822 	struct rtable *rt = (struct rtable *)dst;
823 	struct dst_entry *ret = dst;
824 
825 	if (rt) {
826 		if (dst->obsolete > 0) {
827 			ip_rt_put(rt);
828 			ret = NULL;
829 		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
830 			   rt->dst.expires) {
831 			ip_rt_put(rt);
832 			ret = NULL;
833 		}
834 	}
835 	return ret;
836 }
837 
838 /*
839  * Algorithm:
840  *	1. The first ip_rt_redirect_number redirects are sent
841  *	   with exponential backoff, then we stop sending them at all,
842  *	   assuming that the host ignores our redirects.
843  *	2. If we did not see packets requiring redirects
844  *	   during ip_rt_redirect_silence, we assume that the host
845  *	   forgot redirected route and start to send redirects again.
846  *
847  * This algorithm is much cheaper and more intelligent than dumb load limiting
848  * in icmp.c.
849  *
850  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
851  * and "frag. need" (breaks PMTU discovery) in icmp.c.
852  */
853 
854 void ip_rt_send_redirect(struct sk_buff *skb)
855 {
856 	struct rtable *rt = skb_rtable(skb);
857 	struct in_device *in_dev;
858 	struct inet_peer *peer;
859 	struct net *net;
860 	int log_martians;
861 	int vif;
862 
863 	rcu_read_lock();
864 	in_dev = __in_dev_get_rcu(rt->dst.dev);
865 	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
866 		rcu_read_unlock();
867 		return;
868 	}
869 	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
870 	vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
871 	rcu_read_unlock();
872 
873 	net = dev_net(rt->dst.dev);
874 	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
875 	if (!peer) {
876 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
877 			  rt_nexthop(rt, ip_hdr(skb)->daddr));
878 		return;
879 	}
880 
881 	/* No redirected packets during ip_rt_redirect_silence;
882 	 * reset the algorithm.
883 	 */
884 	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
885 		peer->rate_tokens = 0;
886 
887 	/* Too many ignored redirects; do not send anything
888 	 * set dst.rate_last to the last seen redirected packet.
889 	 */
890 	if (peer->rate_tokens >= ip_rt_redirect_number) {
891 		peer->rate_last = jiffies;
892 		goto out_put_peer;
893 	}
894 
895 	/* Check for load limit; set rate_last to the latest sent
896 	 * redirect.
897 	 */
898 	if (peer->rate_tokens == 0 ||
899 	    time_after(jiffies,
900 		       (peer->rate_last +
901 			(ip_rt_redirect_load << peer->rate_tokens)))) {
902 		__be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
903 
904 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
905 		peer->rate_last = jiffies;
906 		++peer->rate_tokens;
907 #ifdef CONFIG_IP_ROUTE_VERBOSE
908 		if (log_martians &&
909 		    peer->rate_tokens == ip_rt_redirect_number)
910 			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
911 					     &ip_hdr(skb)->saddr, inet_iif(skb),
912 					     &ip_hdr(skb)->daddr, &gw);
913 #endif
914 	}
915 out_put_peer:
916 	inet_putpeer(peer);
917 }
918 
919 static int ip_error(struct sk_buff *skb)
920 {
921 	struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
922 	struct rtable *rt = skb_rtable(skb);
923 	struct inet_peer *peer;
924 	unsigned long now;
925 	struct net *net;
926 	bool send;
927 	int code;
928 
929 	/* IP on this device is disabled. */
930 	if (!in_dev)
931 		goto out;
932 
933 	net = dev_net(rt->dst.dev);
934 	if (!IN_DEV_FORWARD(in_dev)) {
935 		switch (rt->dst.error) {
936 		case EHOSTUNREACH:
937 			__IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
938 			break;
939 
940 		case ENETUNREACH:
941 			__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
942 			break;
943 		}
944 		goto out;
945 	}
946 
947 	switch (rt->dst.error) {
948 	case EINVAL:
949 	default:
950 		goto out;
951 	case EHOSTUNREACH:
952 		code = ICMP_HOST_UNREACH;
953 		break;
954 	case ENETUNREACH:
955 		code = ICMP_NET_UNREACH;
956 		__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
957 		break;
958 	case EACCES:
959 		code = ICMP_PKT_FILTERED;
960 		break;
961 	}
962 
963 	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
964 			       l3mdev_master_ifindex(skb->dev), 1);
965 
966 	send = true;
967 	if (peer) {
968 		now = jiffies;
969 		peer->rate_tokens += now - peer->rate_last;
970 		if (peer->rate_tokens > ip_rt_error_burst)
971 			peer->rate_tokens = ip_rt_error_burst;
972 		peer->rate_last = now;
973 		if (peer->rate_tokens >= ip_rt_error_cost)
974 			peer->rate_tokens -= ip_rt_error_cost;
975 		else
976 			send = false;
977 		inet_putpeer(peer);
978 	}
979 	if (send)
980 		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
981 
982 out:	kfree_skb(skb);
983 	return 0;
984 }
985 
986 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
987 {
988 	struct dst_entry *dst = &rt->dst;
989 	struct fib_result res;
990 
991 	if (dst_metric_locked(dst, RTAX_MTU))
992 		return;
993 
994 	if (ipv4_mtu(dst) < mtu)
995 		return;
996 
997 	if (mtu < ip_rt_min_pmtu)
998 		mtu = ip_rt_min_pmtu;
999 
1000 	if (rt->rt_pmtu == mtu &&
1001 	    time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
1002 		return;
1003 
1004 	rcu_read_lock();
1005 	if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
1006 		struct fib_nh *nh = &FIB_RES_NH(res);
1007 
1008 		update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
1009 				      jiffies + ip_rt_mtu_expires);
1010 	}
1011 	rcu_read_unlock();
1012 }
1013 
1014 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1015 			      struct sk_buff *skb, u32 mtu)
1016 {
1017 	struct rtable *rt = (struct rtable *) dst;
1018 	struct flowi4 fl4;
1019 
1020 	ip_rt_build_flow_key(&fl4, sk, skb);
1021 	__ip_rt_update_pmtu(rt, &fl4, mtu);
1022 }
1023 
1024 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1025 		      int oif, u32 mark, u8 protocol, int flow_flags)
1026 {
1027 	const struct iphdr *iph = (const struct iphdr *) skb->data;
1028 	struct flowi4 fl4;
1029 	struct rtable *rt;
1030 
1031 	if (!mark)
1032 		mark = IP4_REPLY_MARK(net, skb->mark);
1033 
1034 	__build_flow_key(net, &fl4, NULL, iph, oif,
1035 			 RT_TOS(iph->tos), protocol, mark, flow_flags);
1036 	rt = __ip_route_output_key(net, &fl4);
1037 	if (!IS_ERR(rt)) {
1038 		__ip_rt_update_pmtu(rt, &fl4, mtu);
1039 		ip_rt_put(rt);
1040 	}
1041 }
1042 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1043 
1044 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1045 {
1046 	const struct iphdr *iph = (const struct iphdr *) skb->data;
1047 	struct flowi4 fl4;
1048 	struct rtable *rt;
1049 
1050 	__build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1051 
1052 	if (!fl4.flowi4_mark)
1053 		fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1054 
1055 	rt = __ip_route_output_key(sock_net(sk), &fl4);
1056 	if (!IS_ERR(rt)) {
1057 		__ip_rt_update_pmtu(rt, &fl4, mtu);
1058 		ip_rt_put(rt);
1059 	}
1060 }
1061 
1062 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1063 {
1064 	const struct iphdr *iph = (const struct iphdr *) skb->data;
1065 	struct flowi4 fl4;
1066 	struct rtable *rt;
1067 	struct dst_entry *odst = NULL;
1068 	bool new = false;
1069 	struct net *net = sock_net(sk);
1070 
1071 	bh_lock_sock(sk);
1072 
1073 	if (!ip_sk_accept_pmtu(sk))
1074 		goto out;
1075 
1076 	odst = sk_dst_get(sk);
1077 
1078 	if (sock_owned_by_user(sk) || !odst) {
1079 		__ipv4_sk_update_pmtu(skb, sk, mtu);
1080 		goto out;
1081 	}
1082 
1083 	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1084 
1085 	rt = (struct rtable *)odst;
1086 	if (odst->obsolete && !odst->ops->check(odst, 0)) {
1087 		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1088 		if (IS_ERR(rt))
1089 			goto out;
1090 
1091 		new = true;
1092 	}
1093 
1094 	__ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);
1095 
1096 	if (!dst_check(&rt->dst, 0)) {
1097 		if (new)
1098 			dst_release(&rt->dst);
1099 
1100 		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1101 		if (IS_ERR(rt))
1102 			goto out;
1103 
1104 		new = true;
1105 	}
1106 
1107 	if (new)
1108 		sk_dst_set(sk, &rt->dst);
1109 
1110 out:
1111 	bh_unlock_sock(sk);
1112 	dst_release(odst);
1113 }
1114 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1115 
1116 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1117 		   int oif, u32 mark, u8 protocol, int flow_flags)
1118 {
1119 	const struct iphdr *iph = (const struct iphdr *) skb->data;
1120 	struct flowi4 fl4;
1121 	struct rtable *rt;
1122 
1123 	__build_flow_key(net, &fl4, NULL, iph, oif,
1124 			 RT_TOS(iph->tos), protocol, mark, flow_flags);
1125 	rt = __ip_route_output_key(net, &fl4);
1126 	if (!IS_ERR(rt)) {
1127 		__ip_do_redirect(rt, skb, &fl4, false);
1128 		ip_rt_put(rt);
1129 	}
1130 }
1131 EXPORT_SYMBOL_GPL(ipv4_redirect);
1132 
1133 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1134 {
1135 	const struct iphdr *iph = (const struct iphdr *) skb->data;
1136 	struct flowi4 fl4;
1137 	struct rtable *rt;
1138 	struct net *net = sock_net(sk);
1139 
1140 	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1141 	rt = __ip_route_output_key(net, &fl4);
1142 	if (!IS_ERR(rt)) {
1143 		__ip_do_redirect(rt, skb, &fl4, false);
1144 		ip_rt_put(rt);
1145 	}
1146 }
1147 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1148 
1149 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1150 {
1151 	struct rtable *rt = (struct rtable *) dst;
1152 
1153 	/* All IPV4 dsts are created with ->obsolete set to the value
1154 	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1155 	 * into this function always.
1156 	 *
1157 	 * When a PMTU/redirect information update invalidates a route,
1158 	 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1159 	 * DST_OBSOLETE_DEAD by dst_free().
1160 	 */
1161 	if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1162 		return NULL;
1163 	return dst;
1164 }
1165 
1166 static void ipv4_link_failure(struct sk_buff *skb)
1167 {
1168 	struct rtable *rt;
1169 
1170 	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1171 
1172 	rt = skb_rtable(skb);
1173 	if (rt)
1174 		dst_set_expires(&rt->dst, 0);
1175 }
1176 
1177 static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1178 {
1179 	pr_debug("%s: %pI4 -> %pI4, %s\n",
1180 		 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1181 		 skb->dev ? skb->dev->name : "?");
1182 	kfree_skb(skb);
1183 	WARN_ON(1);
1184 	return 0;
1185 }
1186 
1187 /*
1188    We do not cache source address of outgoing interface,
1189    because it is used only by IP RR, TS and SRR options,
1190    so that it out of fast path.
1191 
1192    BTW remember: "addr" is allowed to be not aligned
1193    in IP options!
1194  */
1195 
1196 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1197 {
1198 	__be32 src;
1199 
1200 	if (rt_is_output_route(rt))
1201 		src = ip_hdr(skb)->saddr;
1202 	else {
1203 		struct fib_result res;
1204 		struct flowi4 fl4;
1205 		struct iphdr *iph;
1206 
1207 		iph = ip_hdr(skb);
1208 
1209 		memset(&fl4, 0, sizeof(fl4));
1210 		fl4.daddr = iph->daddr;
1211 		fl4.saddr = iph->saddr;
1212 		fl4.flowi4_tos = RT_TOS(iph->tos);
1213 		fl4.flowi4_oif = rt->dst.dev->ifindex;
1214 		fl4.flowi4_iif = skb->dev->ifindex;
1215 		fl4.flowi4_mark = skb->mark;
1216 
1217 		rcu_read_lock();
1218 		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1219 			src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1220 		else
1221 			src = inet_select_addr(rt->dst.dev,
1222 					       rt_nexthop(rt, iph->daddr),
1223 					       RT_SCOPE_UNIVERSE);
1224 		rcu_read_unlock();
1225 	}
1226 	memcpy(addr, &src, 4);
1227 }
1228 
1229 #ifdef CONFIG_IP_ROUTE_CLASSID
1230 static void set_class_tag(struct rtable *rt, u32 tag)
1231 {
1232 	if (!(rt->dst.tclassid & 0xFFFF))
1233 		rt->dst.tclassid |= tag & 0xFFFF;
1234 	if (!(rt->dst.tclassid & 0xFFFF0000))
1235 		rt->dst.tclassid |= tag & 0xFFFF0000;
1236 }
1237 #endif
1238 
1239 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1240 {
1241 	unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1242 
1243 	if (advmss == 0) {
1244 		advmss = max_t(unsigned int, dst->dev->mtu - 40,
1245 			       ip_rt_min_advmss);
1246 		if (advmss > 65535 - 40)
1247 			advmss = 65535 - 40;
1248 	}
1249 	return advmss;
1250 }
1251 
1252 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1253 {
1254 	const struct rtable *rt = (const struct rtable *) dst;
1255 	unsigned int mtu = rt->rt_pmtu;
1256 
1257 	if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1258 		mtu = dst_metric_raw(dst, RTAX_MTU);
1259 
1260 	if (mtu)
1261 		return mtu;
1262 
1263 	mtu = READ_ONCE(dst->dev->mtu);
1264 
1265 	if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1266 		if (rt->rt_uses_gateway && mtu > 576)
1267 			mtu = 576;
1268 	}
1269 
1270 	mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
1271 
1272 	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1273 }
1274 
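/* Look up a cached redirect/PMTU exception for daddr on this nexthop,
 * or return NULL if none is hashed.
 */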
1275 static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1276 {
1277 	struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
1278 	struct fib_nh_exception *fnhe;
1279 	u32 hval;
1280 
1281 	if (!hash)
1282 		return NULL;
1283 
1284 	hval = fnhe_hashfun(daddr);
1285 
1286 	for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1287 	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
1288 		if (fnhe->fnhe_daddr == daddr)
1289 			return fnhe;
1290 	}
1291 	return NULL;
1292 }
1293 
1294 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1295 			      __be32 daddr)
1296 {
1297 	bool ret = false;
1298 
1299 	spin_lock_bh(&fnhe_lock);
1300 
1301 	if (daddr == fnhe->fnhe_daddr) {
1302 		struct rtable __rcu **porig;
1303 		struct rtable *orig;
1304 		int genid = fnhe_genid(dev_net(rt->dst.dev));
1305 
1306 		if (rt_is_input_route(rt))
1307 			porig = &fnhe->fnhe_rth_input;
1308 		else
1309 			porig = &fnhe->fnhe_rth_output;
1310 		orig = rcu_dereference(*porig);
1311 
1312 		if (fnhe->fnhe_genid != genid) {
1313 			fnhe->fnhe_genid = genid;
1314 			fnhe->fnhe_gw = 0;
1315 			fnhe->fnhe_pmtu = 0;
1316 			fnhe->fnhe_expires = 0;
1317 			fnhe_flush_routes(fnhe);
1318 			orig = NULL;
1319 		}
1320 		fill_route_from_fnhe(rt, fnhe);
1321 		if (!rt->rt_gateway)
1322 			rt->rt_gateway = daddr;
1323 
1324 		if (!(rt->dst.flags & DST_NOCACHE)) {
1325 			rcu_assign_pointer(*porig, rt);
1326 			if (orig)
1327 				rt_free(orig);
1328 			ret = true;
1329 		}
1330 
1331 		fnhe->fnhe_stamp = jiffies;
1332 	}
1333 	spin_unlock_bh(&fnhe_lock);
1334 
1335 	return ret;
1336 }
1337 
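/* Try to cache rt in the nexthop: the shared input slot for input routes,
 * otherwise this CPU's output slot. Returns false if another CPU won the
 * cmpxchg race.
 */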
1338 static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1339 {
1340 	struct rtable *orig, *prev, **p;
1341 	bool ret = true;
1342 
1343 	if (rt_is_input_route(rt)) {
1344 		p = (struct rtable **)&nh->nh_rth_input;
1345 	} else {
1346 		p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
1347 	}
1348 	orig = *p;
1349 
1350 	prev = cmpxchg(p, orig, rt);
1351 	if (prev == orig) {
1352 		if (orig)
1353 			rt_free(orig);
1354 	} else
1355 		ret = false;
1356 
1357 	return ret;
1358 }
1359 
1360 struct uncached_list {
1361 	spinlock_t		lock;
1362 	struct list_head	head;
1363 };
1364 
1365 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1366 
1367 static void rt_add_uncached_list(struct rtable *rt)
1368 {
1369 	struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1370 
1371 	rt->rt_uncached_list = ul;
1372 
1373 	spin_lock_bh(&ul->lock);
1374 	list_add_tail(&rt->rt_uncached, &ul->head);
1375 	spin_unlock_bh(&ul->lock);
1376 }
1377 
1378 static void ipv4_dst_destroy(struct dst_entry *dst)
1379 {
1380 	struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
1381 	struct rtable *rt = (struct rtable *) dst;
1382 
1383 	if (p != &dst_default_metrics && atomic_dec_and_test(&p->refcnt))
1384 		kfree(p);
1385 
1386 	if (!list_empty(&rt->rt_uncached)) {
1387 		struct uncached_list *ul = rt->rt_uncached_list;
1388 
1389 		spin_lock_bh(&ul->lock);
1390 		list_del(&rt->rt_uncached);
1391 		spin_unlock_bh(&ul->lock);
1392 	}
1393 }
1394 
1395 void rt_flush_dev(struct net_device *dev)
1396 {
1397 	struct net *net = dev_net(dev);
1398 	struct rtable *rt;
1399 	int cpu;
1400 
1401 	for_each_possible_cpu(cpu) {
1402 		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1403 
1404 		spin_lock_bh(&ul->lock);
1405 		list_for_each_entry(rt, &ul->head, rt_uncached) {
1406 			if (rt->dst.dev != dev)
1407 				continue;
1408 			rt->dst.dev = net->loopback_dev;
1409 			dev_hold(rt->dst.dev);
1410 			dev_put(dev);
1411 		}
1412 		spin_unlock_bh(&ul->lock);
1413 	}
1414 }
1415 
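/* A cached route is only reusable while it is still DST_OBSOLETE_FORCE_CHK
 * and its generation id matches the current namespace genid.
 */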
1416 static bool rt_cache_valid(const struct rtable *rt)
1417 {
1418 	return	rt &&
1419 		rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1420 		!rt_is_expired(rt);
1421 }
1422 
1423 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1424 			   const struct fib_result *res,
1425 			   struct fib_nh_exception *fnhe,
1426 			   struct fib_info *fi, u16 type, u32 itag)
1427 {
1428 	bool cached = false;
1429 
1430 	if (fi) {
1431 		struct fib_nh *nh = &FIB_RES_NH(*res);
1432 
1433 		if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
1434 			rt->rt_gateway = nh->nh_gw;
1435 			rt->rt_uses_gateway = 1;
1436 		}
1437 		dst_init_metrics(&rt->dst, fi->fib_metrics->metrics, true);
1438 		if (fi->fib_metrics != &dst_default_metrics) {
1439 			rt->dst._metrics |= DST_METRICS_REFCOUNTED;
1440 			atomic_inc(&fi->fib_metrics->refcnt);
1441 		}
1442 #ifdef CONFIG_IP_ROUTE_CLASSID
1443 		rt->dst.tclassid = nh->nh_tclassid;
1444 #endif
1445 		rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
1446 		if (unlikely(fnhe))
1447 			cached = rt_bind_exception(rt, fnhe, daddr);
1448 		else if (!(rt->dst.flags & DST_NOCACHE))
1449 			cached = rt_cache_route(nh, rt);
1450 		if (unlikely(!cached)) {
1451 			/* Routes we intend to cache in nexthop exception or
1452 			 * FIB nexthop have the DST_NOCACHE bit clear.
1453 			 * However, if we are unsuccessful at storing this
1454 			 * route into the cache we really need to set it.
1455 			 */
1456 			rt->dst.flags |= DST_NOCACHE;
1457 			if (!rt->rt_gateway)
1458 				rt->rt_gateway = daddr;
1459 			rt_add_uncached_list(rt);
1460 		}
1461 	} else
1462 		rt_add_uncached_list(rt);
1463 
1464 #ifdef CONFIG_IP_ROUTE_CLASSID
1465 #ifdef CONFIG_IP_MULTIPLE_TABLES
1466 	set_class_tag(rt, res->tclassid);
1467 #endif
1468 	set_class_tag(rt, itag);
1469 #endif
1470 }
1471 
1472 struct rtable *rt_dst_alloc(struct net_device *dev,
1473 			    unsigned int flags, u16 type,
1474 			    bool nopolicy, bool noxfrm, bool will_cache)
1475 {
1476 	struct rtable *rt;
1477 
1478 	rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1479 		       (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
1480 		       (nopolicy ? DST_NOPOLICY : 0) |
1481 		       (noxfrm ? DST_NOXFRM : 0));
1482 
1483 	if (rt) {
1484 		rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1485 		rt->rt_flags = flags;
1486 		rt->rt_type = type;
1487 		rt->rt_is_input = 0;
1488 		rt->rt_iif = 0;
1489 		rt->rt_pmtu = 0;
1490 		rt->rt_gateway = 0;
1491 		rt->rt_uses_gateway = 0;
1492 		rt->rt_table_id = 0;
1493 		INIT_LIST_HEAD(&rt->rt_uncached);
1494 
1495 		rt->dst.output = ip_output;
1496 		if (flags & RTCF_LOCAL)
1497 			rt->dst.input = ip_local_deliver;
1498 	}
1499 
1500 	return rt;
1501 }
1502 EXPORT_SYMBOL(rt_dst_alloc);
1503 
1504 /* called in rcu_read_lock() section */
1505 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1506 				u8 tos, struct net_device *dev, int our)
1507 {
1508 	struct rtable *rth;
1509 	struct in_device *in_dev = __in_dev_get_rcu(dev);
1510 	unsigned int flags = RTCF_MULTICAST;
1511 	u32 itag = 0;
1512 	int err;
1513 
1514 	/* Primary sanity checks. */
1515 
1516 	if (!in_dev)
1517 		return -EINVAL;
1518 
1519 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1520 	    skb->protocol != htons(ETH_P_IP))
1521 		goto e_inval;
1522 
1523 	if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1524 		goto e_inval;
1525 
1526 	if (ipv4_is_zeronet(saddr)) {
1527 		if (!ipv4_is_local_multicast(daddr))
1528 			goto e_inval;
1529 	} else {
1530 		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1531 					  in_dev, &itag);
1532 		if (err < 0)
1533 			goto e_err;
1534 	}
1535 	if (our)
1536 		flags |= RTCF_LOCAL;
1537 
1538 	rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1539 			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1540 	if (!rth)
1541 		goto e_nobufs;
1542 
1543 #ifdef CONFIG_IP_ROUTE_CLASSID
1544 	rth->dst.tclassid = itag;
1545 #endif
1546 	rth->dst.output = ip_rt_bug;
1547 	rth->rt_is_input= 1;
1548 
1549 #ifdef CONFIG_IP_MROUTE
1550 	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1551 		rth->dst.input = ip_mr_input;
1552 #endif
1553 	RT_CACHE_STAT_INC(in_slow_mc);
1554 
1555 	skb_dst_set(skb, &rth->dst);
1556 	return 0;
1557 
1558 e_nobufs:
1559 	return -ENOBUFS;
1560 e_inval:
1561 	return -EINVAL;
1562 e_err:
1563 	return err;
1564 }
1565 
1566 
1567 static void ip_handle_martian_source(struct net_device *dev,
1568 				     struct in_device *in_dev,
1569 				     struct sk_buff *skb,
1570 				     __be32 daddr,
1571 				     __be32 saddr)
1572 {
1573 	RT_CACHE_STAT_INC(in_martian_src);
1574 #ifdef CONFIG_IP_ROUTE_VERBOSE
1575 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1576 		/*
1577 		 *	RFC1812 recommendation, if source is martian,
1578 		 *	the only hint is MAC header.
1579 		 */
1580 		pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1581 			&daddr, &saddr, dev->name);
1582 		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1583 			print_hex_dump(KERN_WARNING, "ll header: ",
1584 				       DUMP_PREFIX_OFFSET, 16, 1,
1585 				       skb_mac_header(skb),
1586 				       dev->hard_header_len, true);
1587 		}
1588 	}
1589 #endif
1590 }
1591 
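/* Unlink the exception entry for daddr from its hash chain, flush the routes
 * cached on it and free it after an RCU grace period.
 */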
1592 static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
1593 {
1594 	struct fnhe_hash_bucket *hash;
1595 	struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1596 	u32 hval = fnhe_hashfun(daddr);
1597 
1598 	spin_lock_bh(&fnhe_lock);
1599 
1600 	hash = rcu_dereference_protected(nh->nh_exceptions,
1601 					 lockdep_is_held(&fnhe_lock));
1602 	hash += hval;
1603 
1604 	fnhe_p = &hash->chain;
1605 	fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1606 	while (fnhe) {
1607 		if (fnhe->fnhe_daddr == daddr) {
1608 			rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1609 				fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1610 			fnhe_flush_routes(fnhe);
1611 			kfree_rcu(fnhe, rcu);
1612 			break;
1613 		}
1614 		fnhe_p = &fnhe->fnhe_next;
1615 		fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1616 						 lockdep_is_held(&fnhe_lock));
1617 	}
1618 
1619 	spin_unlock_bh(&fnhe_lock);
1620 }
1621 
1622 /* called in rcu_read_lock() section */
1623 static int __mkroute_input(struct sk_buff *skb,
1624 			   const struct fib_result *res,
1625 			   struct in_device *in_dev,
1626 			   __be32 daddr, __be32 saddr, u32 tos)
1627 {
1628 	struct fib_nh_exception *fnhe;
1629 	struct rtable *rth;
1630 	int err;
1631 	struct in_device *out_dev;
1632 	bool do_cache;
1633 	u32 itag = 0;
1634 
1635 	/* get a working reference to the output device */
1636 	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1637 	if (!out_dev) {
1638 		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1639 		return -EINVAL;
1640 	}
1641 
1642 	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1643 				  in_dev->dev, in_dev, &itag);
1644 	if (err < 0) {
1645 		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1646 					 saddr);
1647 
1648 		goto cleanup;
1649 	}
1650 
1651 	do_cache = res->fi && !itag;
1652 	if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1653 	    skb->protocol == htons(ETH_P_IP) &&
1654 	    (IN_DEV_SHARED_MEDIA(out_dev) ||
1655 	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1656 		IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1657 
1658 	if (skb->protocol != htons(ETH_P_IP)) {
1659 		/* Not IP (i.e. ARP). Do not create route, if it is
1660 		 * invalid for proxy arp. DNAT routes are always valid.
1661 		 *
1662 		 * Proxy arp feature have been extended to allow, ARP
1663 		 * replies back to the same interface, to support
1664 		 * Private VLAN switch technologies. See arp.c.
1665 		 */
1666 		if (out_dev == in_dev &&
1667 		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1668 			err = -EINVAL;
1669 			goto cleanup;
1670 		}
1671 	}
1672 
1673 	fnhe = find_exception(&FIB_RES_NH(*res), daddr);
1674 	if (do_cache) {
1675 		if (fnhe) {
1676 			rth = rcu_dereference(fnhe->fnhe_rth_input);
1677 			if (rth && rth->dst.expires &&
1678 			    time_after(jiffies, rth->dst.expires)) {
1679 				ip_del_fnhe(&FIB_RES_NH(*res), daddr);
1680 				fnhe = NULL;
1681 			} else {
1682 				goto rt_cache;
1683 			}
1684 		}
1685 
1686 		rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1687 
1688 rt_cache:
1689 		if (rt_cache_valid(rth)) {
1690 			skb_dst_set_noref(skb, &rth->dst);
1691 			goto out;
1692 		}
1693 	}
1694 
1695 	rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1696 			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
1697 			   IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1698 	if (!rth) {
1699 		err = -ENOBUFS;
1700 		goto cleanup;
1701 	}
1702 
1703 	rth->rt_is_input = 1;
1704 	if (res->table)
1705 		rth->rt_table_id = res->table->tb_id;
1706 	RT_CACHE_STAT_INC(in_slow_tot);
1707 
1708 	rth->dst.input = ip_forward;
1709 
1710 	rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag);
1711 	if (lwtunnel_output_redirect(rth->dst.lwtstate)) {
1712 		rth->dst.lwtstate->orig_output = rth->dst.output;
1713 		rth->dst.output = lwtunnel_output;
1714 	}
1715 	if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
1716 		rth->dst.lwtstate->orig_input = rth->dst.input;
1717 		rth->dst.input = lwtunnel_input;
1718 	}
1719 	skb_dst_set(skb, &rth->dst);
1720 out:
1721 	err = 0;
1722  cleanup:
1723 	return err;
1724 }
1725 
1726 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1727 
1728 /* To make ICMP packets follow the right flow, the multipath hash is
1729  * calculated from the inner IP addresses in reverse order.
1730  */
1731 static int ip_multipath_icmp_hash(struct sk_buff *skb)
1732 {
1733 	const struct iphdr *outer_iph = ip_hdr(skb);
1734 	struct icmphdr _icmph;
1735 	const struct icmphdr *icmph;
1736 	struct iphdr _inner_iph;
1737 	const struct iphdr *inner_iph;
1738 
1739 	if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1740 		goto standard_hash;
1741 
1742 	icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1743 				   &_icmph);
1744 	if (!icmph)
1745 		goto standard_hash;
1746 
1747 	if (icmph->type != ICMP_DEST_UNREACH &&
1748 	    icmph->type != ICMP_REDIRECT &&
1749 	    icmph->type != ICMP_TIME_EXCEEDED &&
1750 	    icmph->type != ICMP_PARAMETERPROB) {
1751 		goto standard_hash;
1752 	}
1753 
1754 	inner_iph = skb_header_pointer(skb,
1755 				       outer_iph->ihl * 4 + sizeof(_icmph),
1756 				       sizeof(_inner_iph), &_inner_iph);
1757 	if (!inner_iph)
1758 		goto standard_hash;
1759 
1760 	return fib_multipath_hash(inner_iph->daddr, inner_iph->saddr);
1761 
1762 standard_hash:
1763 	return fib_multipath_hash(outer_iph->saddr, outer_iph->daddr);
1764 }
1765 
1766 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
1767 
1768 static int ip_mkroute_input(struct sk_buff *skb,
1769 			    struct fib_result *res,
1770 			    const struct flowi4 *fl4,
1771 			    struct in_device *in_dev,
1772 			    __be32 daddr, __be32 saddr, u32 tos)
1773 {
1774 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1775 	if (res->fi && res->fi->fib_nhs > 1) {
1776 		int h;
1777 
1778 		if (unlikely(ip_hdr(skb)->protocol == IPPROTO_ICMP))
1779 			h = ip_multipath_icmp_hash(skb);
1780 		else
1781 			h = fib_multipath_hash(saddr, daddr);
1782 		fib_select_multipath(res, h);
1783 	}
1784 #endif
1785 
1786 	/* create a routing cache entry */
1787 	return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1788 }
1789 
1790 /*
1791  *	NOTE. We drop all packets that have a local source
1792  *	address, because every properly looped-back packet
1793  *	must already have the correct destination attached by the output routine.
1794  *
1795  *	This approach solves two big problems:
1796  *	1. Non-simplex devices are handled properly.
1797  *	2. IP spoofing attempts are filtered out with a 100% guarantee.
1798  *	Called with rcu_read_lock().
1799  */
1800 
1801 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1802 			       u8 tos, struct net_device *dev)
1803 {
1804 	struct fib_result res;
1805 	struct in_device *in_dev = __in_dev_get_rcu(dev);
1806 	struct ip_tunnel_info *tun_info;
1807 	struct flowi4	fl4;
1808 	unsigned int	flags = 0;
1809 	u32		itag = 0;
1810 	struct rtable	*rth;
1811 	int		err = -EINVAL;
1812 	struct net    *net = dev_net(dev);
1813 	bool do_cache;
1814 
1815 	/* IP on this device is disabled. */
1816 
1817 	if (!in_dev)
1818 		goto out;
1819 
1820 	/* Check for the weirdest martians, which cannot be detected
1821 	   by fib_lookup.
1822 	 */
1823 
1824 	tun_info = skb_tunnel_info(skb);
1825 	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1826 		fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
1827 	else
1828 		fl4.flowi4_tun_key.tun_id = 0;
1829 	skb_dst_drop(skb);
1830 
1831 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1832 		goto martian_source;
1833 
1834 	res.fi = NULL;
1835 	res.table = NULL;
1836 	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1837 		goto brd_input;
1838 
1839 	/* Accept zero addresses only for limited broadcast;
1840 	 * I do not even know whether to fix this or not. Waiting for complaints :-)
1841 	 */
1842 	if (ipv4_is_zeronet(saddr))
1843 		goto martian_source;
1844 
1845 	if (ipv4_is_zeronet(daddr))
1846 		goto martian_destination;
1847 
1848 	/* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
1849 	 * and calls it at most once, if daddr and/or saddr is a loopback address.
1850 	 */
1851 	if (ipv4_is_loopback(daddr)) {
1852 		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1853 			goto martian_destination;
1854 	} else if (ipv4_is_loopback(saddr)) {
1855 		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1856 			goto martian_source;
1857 	}
1858 
1859 	/*
1860 	 *	Now we are ready to route the packet.
1861 	 */
1862 	fl4.flowi4_oif = 0;
1863 	fl4.flowi4_iif = dev->ifindex;
1864 	fl4.flowi4_mark = skb->mark;
1865 	fl4.flowi4_tos = tos;
1866 	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1867 	fl4.flowi4_flags = 0;
1868 	fl4.daddr = daddr;
1869 	fl4.saddr = saddr;
1870 	fl4.flowi4_uid = sock_net_uid(net, NULL);
1871 	err = fib_lookup(net, &fl4, &res, 0);
1872 	if (err != 0) {
1873 		if (!IN_DEV_FORWARD(in_dev))
1874 			err = -EHOSTUNREACH;
1875 		goto no_route;
1876 	}
1877 
1878 	if (res.type == RTN_BROADCAST)
1879 		goto brd_input;
1880 
1881 	if (res.type == RTN_LOCAL) {
1882 		err = fib_validate_source(skb, saddr, daddr, tos,
1883 					  0, dev, in_dev, &itag);
1884 		if (err < 0)
1885 			goto martian_source;
1886 		goto local_input;
1887 	}
1888 
1889 	if (!IN_DEV_FORWARD(in_dev)) {
1890 		err = -EHOSTUNREACH;
1891 		goto no_route;
1892 	}
1893 	if (res.type != RTN_UNICAST)
1894 		goto martian_destination;
1895 
1896 	err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
1897 out:	return err;
1898 
1899 brd_input:
1900 	if (skb->protocol != htons(ETH_P_IP))
1901 		goto e_inval;
1902 
1903 	if (!ipv4_is_zeronet(saddr)) {
1904 		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1905 					  in_dev, &itag);
1906 		if (err < 0)
1907 			goto martian_source;
1908 	}
1909 	flags |= RTCF_BROADCAST;
1910 	res.type = RTN_BROADCAST;
1911 	RT_CACHE_STAT_INC(in_brd);
1912 
1913 local_input:
1914 	do_cache = false;
1915 	if (res.fi) {
1916 		if (!itag) {
1917 			rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
1918 			if (rt_cache_valid(rth)) {
1919 				skb_dst_set_noref(skb, &rth->dst);
1920 				err = 0;
1921 				goto out;
1922 			}
1923 			do_cache = true;
1924 		}
1925 	}
1926 
1927 	rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
1928 			   flags | RTCF_LOCAL, res.type,
1929 			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
1930 	if (!rth)
1931 		goto e_nobufs;
1932 
1933 	rth->dst.output = ip_rt_bug;
1934 #ifdef CONFIG_IP_ROUTE_CLASSID
1935 	rth->dst.tclassid = itag;
1936 #endif
1937 	rth->rt_is_input = 1;
1938 	if (res.table)
1939 		rth->rt_table_id = res.table->tb_id;
1940 
1941 	RT_CACHE_STAT_INC(in_slow_tot);
1942 	if (res.type == RTN_UNREACHABLE) {
1943 		rth->dst.input = ip_error;
1944 		rth->dst.error = -err;
1945 		rth->rt_flags &= ~RTCF_LOCAL;
1946 	}
1947 	if (do_cache) {
1948 		if (unlikely(!rt_cache_route(&FIB_RES_NH(res), rth))) {
1949 			rth->dst.flags |= DST_NOCACHE;
1950 			rt_add_uncached_list(rth);
1951 		}
1952 	}
1953 	skb_dst_set(skb, &rth->dst);
1954 	err = 0;
1955 	goto out;
1956 
1957 no_route:
1958 	RT_CACHE_STAT_INC(in_no_route);
1959 	res.type = RTN_UNREACHABLE;
1960 	res.fi = NULL;
1961 	res.table = NULL;
1962 	goto local_input;
1963 
1964 	/*
1965 	 *	Do not cache martian addresses: they should be logged (RFC1812)
1966 	 */
1967 martian_destination:
1968 	RT_CACHE_STAT_INC(in_martian_dst);
1969 #ifdef CONFIG_IP_ROUTE_VERBOSE
1970 	if (IN_DEV_LOG_MARTIANS(in_dev))
1971 		net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
1972 				     &daddr, &saddr, dev->name);
1973 #endif
1974 
1975 e_inval:
1976 	err = -EINVAL;
1977 	goto out;
1978 
1979 e_nobufs:
1980 	err = -ENOBUFS;
1981 	goto out;
1982 
1983 martian_source:
1984 	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
1985 	goto out;
1986 }
1987 
1988 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1989 			 u8 tos, struct net_device *dev)
1990 {
1991 	int res;
1992 
1993 	tos &= IPTOS_RT_MASK;
1994 	rcu_read_lock();
1995 
1996 	/* Multicast recognition logic was moved from the route cache to here.
1997 	   The problem was that too many Ethernet cards have broken/missing
1998 	   hardware multicast filters :-( As a result, a host on a multicast
1999 	   network acquires a lot of useless route cache entries, e.g. for
2000 	   SDR messages from all over the world. Now we try to get rid of them.
2001 	   Really, provided the software IP multicast filter is organized
2002 	   reasonably (at least, hashed), this does not result in a slowdown
2003 	   compared with route cache reject entries.
2004 	   Note that multicast routers are not affected, because a
2005 	   route cache entry is created eventually.
2006 	 */
2007 	if (ipv4_is_multicast(daddr)) {
2008 		struct in_device *in_dev = __in_dev_get_rcu(dev);
2009 
2010 		if (in_dev) {
2011 			int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2012 						  ip_hdr(skb)->protocol);
2013 			if (our
2014 #ifdef CONFIG_IP_MROUTE
2015 				||
2016 			    (!ipv4_is_local_multicast(daddr) &&
2017 			     IN_DEV_MFORWARD(in_dev))
2018 #endif
2019 			   ) {
2020 				int res = ip_route_input_mc(skb, daddr, saddr,
2021 							    tos, dev, our);
2022 				rcu_read_unlock();
2023 				return res;
2024 			}
2025 		}
2026 		rcu_read_unlock();
2027 		return -EINVAL;
2028 	}
2029 	res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2030 	rcu_read_unlock();
2031 	return res;
2032 }
2033 EXPORT_SYMBOL(ip_route_input_noref);
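/* A usage sketch (the call site is illustrative of a typical caller such as
 * the IPv4 receive path): the caller owns skb and runs under RCU/softirq,
 * and the attached route may be reference-less - hence the _noref suffix -
 * because the fast path can use skb_dst_set_noref(), so the dst is only
 * guaranteed valid within the current RCU section unless forced.
 *
 *	err = ip_route_input_noref(skb, ip_hdr(skb)->daddr,
 *				   ip_hdr(skb)->saddr, ip_hdr(skb)->tos,
 *				   skb->dev);
 *	if (unlikely(err))
 *		goto drop;
 */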
2034 
2035 /* called with rcu_read_lock() */
2036 static struct rtable *__mkroute_output(const struct fib_result *res,
2037 				       const struct flowi4 *fl4, int orig_oif,
2038 				       struct net_device *dev_out,
2039 				       unsigned int flags)
2040 {
2041 	struct fib_info *fi = res->fi;
2042 	struct fib_nh_exception *fnhe;
2043 	struct in_device *in_dev;
2044 	u16 type = res->type;
2045 	struct rtable *rth;
2046 	bool do_cache;
2047 
2048 	in_dev = __in_dev_get_rcu(dev_out);
2049 	if (!in_dev)
2050 		return ERR_PTR(-EINVAL);
2051 
2052 	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2053 		if (ipv4_is_loopback(fl4->saddr) &&
2054 		    !(dev_out->flags & IFF_LOOPBACK) &&
2055 		    !netif_is_l3_master(dev_out))
2056 			return ERR_PTR(-EINVAL);
2057 
2058 	if (ipv4_is_lbcast(fl4->daddr))
2059 		type = RTN_BROADCAST;
2060 	else if (ipv4_is_multicast(fl4->daddr))
2061 		type = RTN_MULTICAST;
2062 	else if (ipv4_is_zeronet(fl4->daddr))
2063 		return ERR_PTR(-EINVAL);
2064 
2065 	if (dev_out->flags & IFF_LOOPBACK)
2066 		flags |= RTCF_LOCAL;
2067 
2068 	do_cache = true;
2069 	if (type == RTN_BROADCAST) {
2070 		flags |= RTCF_BROADCAST | RTCF_LOCAL;
2071 		fi = NULL;
2072 	} else if (type == RTN_MULTICAST) {
2073 		flags |= RTCF_MULTICAST | RTCF_LOCAL;
2074 		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2075 				     fl4->flowi4_proto))
2076 			flags &= ~RTCF_LOCAL;
2077 		else
2078 			do_cache = false;
2079 		/* If a multicast route does not exist, use
2080 		 * the default one, but do not use a gateway in this case.
2081 		 * Yes, it is a hack.
2082 		 */
2083 		if (fi && res->prefixlen < 4)
2084 			fi = NULL;
2085 	} else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2086 		   (orig_oif != dev_out->ifindex)) {
2087 		/* For local routes that require a particular output interface
2088 		 * we do not want to cache the result.  Caching the result
2089 		 * causes incorrect behaviour when there are multiple source
2090 		 * addresses on the interface, the end result being that if the
2091 		 * intended recipient is waiting on that interface for the
2092 		 * packet he won't receive it because it will be delivered on
2093 		 * the loopback interface and the IP_PKTINFO ipi_ifindex will
2094 		 * be set to the loopback interface as well.
2095 		 */
2096 		fi = NULL;
2097 	}
2098 
2099 	fnhe = NULL;
2100 	do_cache &= fi != NULL;
2101 	if (do_cache) {
2102 		struct rtable __rcu **prth;
2103 		struct fib_nh *nh = &FIB_RES_NH(*res);
2104 
2105 		fnhe = find_exception(nh, fl4->daddr);
2106 		if (fnhe) {
2107 			prth = &fnhe->fnhe_rth_output;
2108 			rth = rcu_dereference(*prth);
2109 			if (rth && rth->dst.expires &&
2110 			    time_after(jiffies, rth->dst.expires)) {
2111 				ip_del_fnhe(nh, fl4->daddr);
2112 				fnhe = NULL;
2113 			} else {
2114 				goto rt_cache;
2115 			}
2116 		}
2117 
2118 		if (unlikely(fl4->flowi4_flags &
2119 			     FLOWI_FLAG_KNOWN_NH &&
2120 			     !(nh->nh_gw &&
2121 			       nh->nh_scope == RT_SCOPE_LINK))) {
2122 			do_cache = false;
2123 			goto add;
2124 		}
2125 		prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
2126 		rth = rcu_dereference(*prth);
2127 
2128 rt_cache:
2129 		if (rt_cache_valid(rth)) {
2130 			dst_hold(&rth->dst);
2131 			return rth;
2132 		}
2133 	}
2134 
2135 add:
2136 	rth = rt_dst_alloc(dev_out, flags, type,
2137 			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
2138 			   IN_DEV_CONF_GET(in_dev, NOXFRM),
2139 			   do_cache);
2140 	if (!rth)
2141 		return ERR_PTR(-ENOBUFS);
2142 
2143 	rth->rt_iif	= orig_oif ? : 0;
2144 	if (res->table)
2145 		rth->rt_table_id = res->table->tb_id;
2146 
2147 	RT_CACHE_STAT_INC(out_slow_tot);
2148 
2149 	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2150 		if (flags & RTCF_LOCAL &&
2151 		    !(dev_out->flags & IFF_LOOPBACK)) {
2152 			rth->dst.output = ip_mc_output;
2153 			RT_CACHE_STAT_INC(out_slow_mc);
2154 		}
2155 #ifdef CONFIG_IP_MROUTE
2156 		if (type == RTN_MULTICAST) {
2157 			if (IN_DEV_MFORWARD(in_dev) &&
2158 			    !ipv4_is_local_multicast(fl4->daddr)) {
2159 				rth->dst.input = ip_mr_input;
2160 				rth->dst.output = ip_mc_output;
2161 			}
2162 		}
2163 #endif
2164 	}
2165 
2166 	rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
2167 	if (lwtunnel_output_redirect(rth->dst.lwtstate))
2168 		rth->dst.output = lwtunnel_output;
2169 
2170 	return rth;
2171 }
2172 
2173 /*
2174  * Major route resolver routine.
2175  */
2176 
2177 struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2178 					  int mp_hash)
2179 {
2180 	struct net_device *dev_out = NULL;
2181 	__u8 tos = RT_FL_TOS(fl4);
2182 	unsigned int flags = 0;
2183 	struct fib_result res;
2184 	struct rtable *rth;
2185 	int orig_oif;
2186 	int err = -ENETUNREACH;
2187 
2188 	res.tclassid	= 0;
2189 	res.fi		= NULL;
2190 	res.table	= NULL;
2191 
2192 	orig_oif = fl4->flowi4_oif;
2193 
2194 	fl4->flowi4_iif = LOOPBACK_IFINDEX;
2195 	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2196 	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2197 			 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2198 
2199 	rcu_read_lock();
2200 	if (fl4->saddr) {
2201 		rth = ERR_PTR(-EINVAL);
2202 		if (ipv4_is_multicast(fl4->saddr) ||
2203 		    ipv4_is_lbcast(fl4->saddr) ||
2204 		    ipv4_is_zeronet(fl4->saddr))
2205 			goto out;
2206 
2207 		/* I removed the check for oif == dev_out->oif here.
2208 		   It was wrong for two reasons:
2209 		   1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2210 		      is assigned to multiple interfaces.
2211 		   2. Moreover, we are allowed to send packets with the saddr
2212 		      of another iface. --ANK
2213 		 */
2214 
2215 		if (fl4->flowi4_oif == 0 &&
2216 		    (ipv4_is_multicast(fl4->daddr) ||
2217 		     ipv4_is_lbcast(fl4->daddr))) {
2218 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2219 			dev_out = __ip_dev_find(net, fl4->saddr, false);
2220 			if (!dev_out)
2221 				goto out;
2222 
2223 			/* Special hack: the user can direct multicasts
2224 			   and limited broadcast via the necessary interface
2225 			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2226 			   This hack is not just for fun, it allows
2227 			   vic, vat and friends to work.
2228 			   They bind a socket to loopback, set ttl to zero
2229 			   and expect that it will work.
2230 			   From the viewpoint of the routing cache they are broken,
2231 			   because we are not allowed to build a multicast path
2232 			   with a loopback source addr (look, the routing cache
2233 			   cannot know that ttl is zero, so that the packet
2234 			   will not leave this host and the route is valid).
2235 			   Luckily, this hack is a good workaround.
2236 			 */
2237 
2238 			fl4->flowi4_oif = dev_out->ifindex;
2239 			goto make_route;
2240 		}
2241 
2242 		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2243 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2244 			if (!__ip_dev_find(net, fl4->saddr, false))
2245 				goto out;
2246 		}
2247 	}
2248 
2249 
2250 	if (fl4->flowi4_oif) {
2251 		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2252 		rth = ERR_PTR(-ENODEV);
2253 		if (!dev_out)
2254 			goto out;
2255 
2256 		/* RACE: Check return value of inet_select_addr instead. */
2257 		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2258 			rth = ERR_PTR(-ENETUNREACH);
2259 			goto out;
2260 		}
2261 		if (ipv4_is_local_multicast(fl4->daddr) ||
2262 		    ipv4_is_lbcast(fl4->daddr) ||
2263 		    fl4->flowi4_proto == IPPROTO_IGMP) {
2264 			if (!fl4->saddr)
2265 				fl4->saddr = inet_select_addr(dev_out, 0,
2266 							      RT_SCOPE_LINK);
2267 			goto make_route;
2268 		}
2269 		if (!fl4->saddr) {
2270 			if (ipv4_is_multicast(fl4->daddr))
2271 				fl4->saddr = inet_select_addr(dev_out, 0,
2272 							      fl4->flowi4_scope);
2273 			else if (!fl4->daddr)
2274 				fl4->saddr = inet_select_addr(dev_out, 0,
2275 							      RT_SCOPE_HOST);
2276 		}
2277 	}
2278 
2279 	if (!fl4->daddr) {
2280 		fl4->daddr = fl4->saddr;
2281 		if (!fl4->daddr)
2282 			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2283 		dev_out = net->loopback_dev;
2284 		fl4->flowi4_oif = LOOPBACK_IFINDEX;
2285 		res.type = RTN_LOCAL;
2286 		flags |= RTCF_LOCAL;
2287 		goto make_route;
2288 	}
2289 
2290 	err = fib_lookup(net, fl4, &res, 0);
2291 	if (err) {
2292 		res.fi = NULL;
2293 		res.table = NULL;
2294 		if (fl4->flowi4_oif &&
2295 		    !netif_index_is_l3_master(net, fl4->flowi4_oif)) {
2296 			/* Apparently, the routing tables are wrong. Assume
2297 			   that the destination is on-link.
2298 
2299 			   WHY? DW.
2300 			   Because we are allowed to send to an iface
2301 			   even if it has NO routes and NO assigned
2302 			   addresses. When oif is specified, routing
2303 			   tables are looked up with only one purpose:
2304 			   to catch whether the destination is gatewayed, rather than
2305 			   direct. Moreover, if MSG_DONTROUTE is set,
2306 			   we send the packet, ignoring both routing tables
2307 			   and ifaddr state. --ANK
2308 
2309 
2310 			   We could do this even if oif is unknown,
2311 			   likely as IPv6 does, but we do not.
2312 			 */
2313 
2314 			if (fl4->saddr == 0)
2315 				fl4->saddr = inet_select_addr(dev_out, 0,
2316 							      RT_SCOPE_LINK);
2317 			res.type = RTN_UNICAST;
2318 			goto make_route;
2319 		}
2320 		rth = ERR_PTR(err);
2321 		goto out;
2322 	}
2323 
2324 	if (res.type == RTN_LOCAL) {
2325 		if (!fl4->saddr) {
2326 			if (res.fi->fib_prefsrc)
2327 				fl4->saddr = res.fi->fib_prefsrc;
2328 			else
2329 				fl4->saddr = fl4->daddr;
2330 		}
2331 
2332 		/* L3 master device is the loopback for that domain */
2333 		dev_out = l3mdev_master_dev_rcu(dev_out) ? : net->loopback_dev;
2334 		fl4->flowi4_oif = dev_out->ifindex;
2335 		flags |= RTCF_LOCAL;
2336 		goto make_route;
2337 	}
2338 
2339 	fib_select_path(net, &res, fl4, mp_hash);
2340 
2341 	dev_out = FIB_RES_DEV(res);
2342 	fl4->flowi4_oif = dev_out->ifindex;
2343 
2344 
2345 make_route:
2346 	rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
2347 
2348 out:
2349 	rcu_read_unlock();
2350 	return rth;
2351 }
2352 EXPORT_SYMBOL_GPL(__ip_route_output_key_hash);
2353 
2354 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2355 {
2356 	return NULL;
2357 }
2358 
2359 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2360 {
2361 	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2362 
2363 	return mtu ? : dst->dev->mtu;
2364 }
2365 
2366 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2367 					  struct sk_buff *skb, u32 mtu)
2368 {
2369 }
2370 
2371 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2372 				       struct sk_buff *skb)
2373 {
2374 }
2375 
2376 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2377 					  unsigned long old)
2378 {
2379 	return NULL;
2380 }
2381 
2382 static struct dst_ops ipv4_dst_blackhole_ops = {
2383 	.family			=	AF_INET,
2384 	.check			=	ipv4_blackhole_dst_check,
2385 	.mtu			=	ipv4_blackhole_mtu,
2386 	.default_advmss		=	ipv4_default_advmss,
2387 	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
2388 	.redirect		=	ipv4_rt_blackhole_redirect,
2389 	.cow_metrics		=	ipv4_rt_blackhole_cow_metrics,
2390 	.neigh_lookup		=	ipv4_neigh_lookup,
2391 };
2392 
2393 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2394 {
2395 	struct rtable *ort = (struct rtable *) dst_orig;
2396 	struct rtable *rt;
2397 
2398 	rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
2399 	if (rt) {
2400 		struct dst_entry *new = &rt->dst;
2401 
2402 		new->__use = 1;
2403 		new->input = dst_discard;
2404 		new->output = dst_discard_out;
2405 
2406 		new->dev = ort->dst.dev;
2407 		if (new->dev)
2408 			dev_hold(new->dev);
2409 
2410 		rt->rt_is_input = ort->rt_is_input;
2411 		rt->rt_iif = ort->rt_iif;
2412 		rt->rt_pmtu = ort->rt_pmtu;
2413 
2414 		rt->rt_genid = rt_genid_ipv4(net);
2415 		rt->rt_flags = ort->rt_flags;
2416 		rt->rt_type = ort->rt_type;
2417 		rt->rt_gateway = ort->rt_gateway;
2418 		rt->rt_uses_gateway = ort->rt_uses_gateway;
2419 
2420 		INIT_LIST_HEAD(&rt->rt_uncached);
2421 		dst_free(new);
2422 	}
2423 
2424 	dst_release(dst_orig);
2425 
2426 	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2427 }
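/* The blackhole dst built above silently discards everything: ->check always
 * invalidates it, PMTU updates and redirects are no-ops, and both input and
 * output are dst_discard.  Its typical consumer is believed to be the xfrm
 * code, which can substitute it for dst_orig while an IPsec state is still
 * being resolved (compare the xfrm_lookup_route() call in
 * ip_route_output_flow() below), so callers keep a usable, if silent, dst
 * instead of an error.
 */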
2428 
2429 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2430 				    const struct sock *sk)
2431 {
2432 	struct rtable *rt = __ip_route_output_key(net, flp4);
2433 
2434 	if (IS_ERR(rt))
2435 		return rt;
2436 
2437 	if (flp4->flowi4_proto)
2438 		rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2439 							flowi4_to_flowi(flp4),
2440 							sk, 0);
2441 
2442 	return rt;
2443 }
2444 EXPORT_SYMBOL_GPL(ip_route_output_flow);
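/* A minimal output-lookup sketch (field values are illustrative): the caller
 * fills a struct flowi4 and gets back either a dst-holding rtable or an
 * ERR_PTR.  When flowi4_proto is set, the result has additionally been run
 * through the xfrm policy lookup above.
 *
 *	struct flowi4 fl4;
 *	struct rtable *rt;
 *
 *	memset(&fl4, 0, sizeof(fl4));
 *	fl4.daddr        = daddr;
 *	fl4.saddr        = saddr;
 *	fl4.flowi4_oif   = oif;
 *	fl4.flowi4_tos   = RT_TOS(tos);
 *	fl4.flowi4_proto = IPPROTO_UDP;
 *
 *	rt = ip_route_output_flow(net, &fl4, sk);
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 *	...
 *	ip_rt_put(rt);
 */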
2445 
2446 static int rt_fill_info(struct net *net,  __be32 dst, __be32 src, u32 table_id,
2447 			struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2448 			u32 seq, int event, int nowait, unsigned int flags)
2449 {
2450 	struct rtable *rt = skb_rtable(skb);
2451 	struct rtmsg *r;
2452 	struct nlmsghdr *nlh;
2453 	unsigned long expires = 0;
2454 	u32 error;
2455 	u32 metrics[RTAX_MAX];
2456 
2457 	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
2458 	if (!nlh)
2459 		return -EMSGSIZE;
2460 
2461 	r = nlmsg_data(nlh);
2462 	r->rtm_family	 = AF_INET;
2463 	r->rtm_dst_len	= 32;
2464 	r->rtm_src_len	= 0;
2465 	r->rtm_tos	= fl4->flowi4_tos;
2466 	r->rtm_table	= table_id < 256 ? table_id : RT_TABLE_COMPAT;
2467 	if (nla_put_u32(skb, RTA_TABLE, table_id))
2468 		goto nla_put_failure;
2469 	r->rtm_type	= rt->rt_type;
2470 	r->rtm_scope	= RT_SCOPE_UNIVERSE;
2471 	r->rtm_protocol = RTPROT_UNSPEC;
2472 	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2473 	if (rt->rt_flags & RTCF_NOTIFY)
2474 		r->rtm_flags |= RTM_F_NOTIFY;
2475 	if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2476 		r->rtm_flags |= RTCF_DOREDIRECT;
2477 
2478 	if (nla_put_in_addr(skb, RTA_DST, dst))
2479 		goto nla_put_failure;
2480 	if (src) {
2481 		r->rtm_src_len = 32;
2482 		if (nla_put_in_addr(skb, RTA_SRC, src))
2483 			goto nla_put_failure;
2484 	}
2485 	if (rt->dst.dev &&
2486 	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2487 		goto nla_put_failure;
2488 #ifdef CONFIG_IP_ROUTE_CLASSID
2489 	if (rt->dst.tclassid &&
2490 	    nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2491 		goto nla_put_failure;
2492 #endif
2493 	if (!rt_is_input_route(rt) &&
2494 	    fl4->saddr != src) {
2495 		if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2496 			goto nla_put_failure;
2497 	}
2498 	if (rt->rt_uses_gateway &&
2499 	    nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
2500 		goto nla_put_failure;
2501 
2502 	expires = rt->dst.expires;
2503 	if (expires) {
2504 		unsigned long now = jiffies;
2505 
2506 		if (time_before(now, expires))
2507 			expires -= now;
2508 		else
2509 			expires = 0;
2510 	}
2511 
2512 	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2513 	if (rt->rt_pmtu && expires)
2514 		metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2515 	if (rtnetlink_put_metrics(skb, metrics) < 0)
2516 		goto nla_put_failure;
2517 
2518 	if (fl4->flowi4_mark &&
2519 	    nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2520 		goto nla_put_failure;
2521 
2522 	if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2523 	    nla_put_u32(skb, RTA_UID,
2524 			from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
2525 		goto nla_put_failure;
2526 
2527 	error = rt->dst.error;
2528 
2529 	if (rt_is_input_route(rt)) {
2530 #ifdef CONFIG_IP_MROUTE
2531 		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2532 		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2533 			int err = ipmr_get_route(net, skb,
2534 						 fl4->saddr, fl4->daddr,
2535 						 r, nowait, portid);
2536 
2537 			if (err <= 0) {
2538 				if (!nowait) {
2539 					if (err == 0)
2540 						return 0;
2541 					goto nla_put_failure;
2542 				} else {
2543 					if (err == -EMSGSIZE)
2544 						goto nla_put_failure;
2545 					error = err;
2546 				}
2547 			}
2548 		} else
2549 #endif
2550 			if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
2551 				goto nla_put_failure;
2552 	}
2553 
2554 	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2555 		goto nla_put_failure;
2556 
2557 	nlmsg_end(skb, nlh);
2558 	return 0;
2559 
2560 nla_put_failure:
2561 	nlmsg_cancel(skb, nlh);
2562 	return -EMSGSIZE;
2563 }
2564 
2565 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
2566 {
2567 	struct net *net = sock_net(in_skb->sk);
2568 	struct rtmsg *rtm;
2569 	struct nlattr *tb[RTA_MAX+1];
2570 	struct rtable *rt = NULL;
2571 	struct flowi4 fl4;
2572 	__be32 dst = 0;
2573 	__be32 src = 0;
2574 	u32 iif;
2575 	int err;
2576 	int mark;
2577 	struct sk_buff *skb;
2578 	u32 table_id = RT_TABLE_MAIN;
2579 	kuid_t uid;
2580 
2581 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2582 	if (err < 0)
2583 		goto errout;
2584 
2585 	rtm = nlmsg_data(nlh);
2586 
2587 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2588 	if (!skb) {
2589 		err = -ENOBUFS;
2590 		goto errout;
2591 	}
2592 
2593 	/* Reserve room for dummy headers; this skb can pass
2594 	   through a good chunk of the routing engine.
2595 	 */
2596 	skb_reset_mac_header(skb);
2597 	skb_reset_network_header(skb);
2598 
2599 	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2600 	ip_hdr(skb)->protocol = IPPROTO_UDP;
2601 	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2602 
2603 	src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
2604 	dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
2605 	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2606 	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2607 	if (tb[RTA_UID])
2608 		uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
2609 	else
2610 		uid = (iif ? INVALID_UID : current_uid());
2611 
2612 	memset(&fl4, 0, sizeof(fl4));
2613 	fl4.daddr = dst;
2614 	fl4.saddr = src;
2615 	fl4.flowi4_tos = rtm->rtm_tos;
2616 	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2617 	fl4.flowi4_mark = mark;
2618 	fl4.flowi4_uid = uid;
2619 
2620 	if (iif) {
2621 		struct net_device *dev;
2622 
2623 		dev = __dev_get_by_index(net, iif);
2624 		if (!dev) {
2625 			err = -ENODEV;
2626 			goto errout_free;
2627 		}
2628 
2629 		skb->protocol	= htons(ETH_P_IP);
2630 		skb->dev	= dev;
2631 		skb->mark	= mark;
2632 		local_bh_disable();
2633 		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2634 		local_bh_enable();
2635 
2636 		rt = skb_rtable(skb);
2637 		if (err == 0 && rt->dst.error)
2638 			err = -rt->dst.error;
2639 	} else {
2640 		rt = ip_route_output_key(net, &fl4);
2641 
2642 		err = 0;
2643 		if (IS_ERR(rt))
2644 			err = PTR_ERR(rt);
2645 	}
2646 
2647 	if (err)
2648 		goto errout_free;
2649 
2650 	skb_dst_set(skb, &rt->dst);
2651 	if (rtm->rtm_flags & RTM_F_NOTIFY)
2652 		rt->rt_flags |= RTCF_NOTIFY;
2653 
2654 	if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
2655 		table_id = rt->rt_table_id;
2656 
2657 	err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
2658 			   NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
2659 			   RTM_NEWROUTE, 0, 0);
2660 	if (err < 0)
2661 		goto errout_free;
2662 
2663 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2664 errout:
2665 	return err;
2666 
2667 errout_free:
2668 	kfree_skb(skb);
2669 	goto errout;
2670 }
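/* This handler is what backs "ip route get".  For example, a request such as
 * (illustrative command line, assuming iproute2):
 *
 *	ip route get 192.0.2.1 from 198.51.100.1 iif eth0 mark 7
 *
 * arrives as an RTM_GETROUTE message whose RTA_DST, RTA_SRC, RTA_IIF and
 * RTA_MARK attributes are parsed above; the iif case runs a dummy skb through
 * ip_route_input(), the plain/oif case uses ip_route_output_key(), and the
 * resulting route is reported back to the requester via rt_fill_info() as an
 * RTM_NEWROUTE reply.
 */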
2671 
2672 void ip_rt_multicast_event(struct in_device *in_dev)
2673 {
2674 	rt_cache_flush(dev_net(in_dev->dev));
2675 }
2676 
2677 #ifdef CONFIG_SYSCTL
2678 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
2679 static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
2680 static int ip_rt_gc_elasticity __read_mostly	= 8;
2681 
2682 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
2683 					void __user *buffer,
2684 					size_t *lenp, loff_t *ppos)
2685 {
2686 	struct net *net = (struct net *)__ctl->extra1;
2687 
2688 	if (write) {
2689 		rt_cache_flush(net);
2690 		fnhe_genid_bump(net);
2691 		return 0;
2692 	}
2693 
2694 	return -EINVAL;
2695 }
2696 
2697 static struct ctl_table ipv4_route_table[] = {
2698 	{
2699 		.procname	= "gc_thresh",
2700 		.data		= &ipv4_dst_ops.gc_thresh,
2701 		.maxlen		= sizeof(int),
2702 		.mode		= 0644,
2703 		.proc_handler	= proc_dointvec,
2704 	},
2705 	{
2706 		.procname	= "max_size",
2707 		.data		= &ip_rt_max_size,
2708 		.maxlen		= sizeof(int),
2709 		.mode		= 0644,
2710 		.proc_handler	= proc_dointvec,
2711 	},
2712 	{
2713 		/*  Deprecated. Use gc_min_interval_ms */
2714 
2715 		.procname	= "gc_min_interval",
2716 		.data		= &ip_rt_gc_min_interval,
2717 		.maxlen		= sizeof(int),
2718 		.mode		= 0644,
2719 		.proc_handler	= proc_dointvec_jiffies,
2720 	},
2721 	{
2722 		.procname	= "gc_min_interval_ms",
2723 		.data		= &ip_rt_gc_min_interval,
2724 		.maxlen		= sizeof(int),
2725 		.mode		= 0644,
2726 		.proc_handler	= proc_dointvec_ms_jiffies,
2727 	},
2728 	{
2729 		.procname	= "gc_timeout",
2730 		.data		= &ip_rt_gc_timeout,
2731 		.maxlen		= sizeof(int),
2732 		.mode		= 0644,
2733 		.proc_handler	= proc_dointvec_jiffies,
2734 	},
2735 	{
2736 		.procname	= "gc_interval",
2737 		.data		= &ip_rt_gc_interval,
2738 		.maxlen		= sizeof(int),
2739 		.mode		= 0644,
2740 		.proc_handler	= proc_dointvec_jiffies,
2741 	},
2742 	{
2743 		.procname	= "redirect_load",
2744 		.data		= &ip_rt_redirect_load,
2745 		.maxlen		= sizeof(int),
2746 		.mode		= 0644,
2747 		.proc_handler	= proc_dointvec,
2748 	},
2749 	{
2750 		.procname	= "redirect_number",
2751 		.data		= &ip_rt_redirect_number,
2752 		.maxlen		= sizeof(int),
2753 		.mode		= 0644,
2754 		.proc_handler	= proc_dointvec,
2755 	},
2756 	{
2757 		.procname	= "redirect_silence",
2758 		.data		= &ip_rt_redirect_silence,
2759 		.maxlen		= sizeof(int),
2760 		.mode		= 0644,
2761 		.proc_handler	= proc_dointvec,
2762 	},
2763 	{
2764 		.procname	= "error_cost",
2765 		.data		= &ip_rt_error_cost,
2766 		.maxlen		= sizeof(int),
2767 		.mode		= 0644,
2768 		.proc_handler	= proc_dointvec,
2769 	},
2770 	{
2771 		.procname	= "error_burst",
2772 		.data		= &ip_rt_error_burst,
2773 		.maxlen		= sizeof(int),
2774 		.mode		= 0644,
2775 		.proc_handler	= proc_dointvec,
2776 	},
2777 	{
2778 		.procname	= "gc_elasticity",
2779 		.data		= &ip_rt_gc_elasticity,
2780 		.maxlen		= sizeof(int),
2781 		.mode		= 0644,
2782 		.proc_handler	= proc_dointvec,
2783 	},
2784 	{
2785 		.procname	= "mtu_expires",
2786 		.data		= &ip_rt_mtu_expires,
2787 		.maxlen		= sizeof(int),
2788 		.mode		= 0644,
2789 		.proc_handler	= proc_dointvec_jiffies,
2790 	},
2791 	{
2792 		.procname	= "min_pmtu",
2793 		.data		= &ip_rt_min_pmtu,
2794 		.maxlen		= sizeof(int),
2795 		.mode		= 0644,
2796 		.proc_handler	= proc_dointvec_minmax,
2797 		.extra1		= &ip_min_valid_pmtu,
2798 	},
2799 	{
2800 		.procname	= "min_adv_mss",
2801 		.data		= &ip_rt_min_advmss,
2802 		.maxlen		= sizeof(int),
2803 		.mode		= 0644,
2804 		.proc_handler	= proc_dointvec,
2805 	},
2806 	{ }
2807 };
2808 
2809 static struct ctl_table ipv4_route_flush_table[] = {
2810 	{
2811 		.procname	= "flush",
2812 		.maxlen		= sizeof(int),
2813 		.mode		= 0200,
2814 		.proc_handler	= ipv4_sysctl_rtcache_flush,
2815 	},
2816 	{ },
2817 };
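/* These tables surface as files under /proc/sys/net/ipv4/route/ (or via
 * sysctl as net.ipv4.route.*).  A small usage sketch, assuming a standard
 * procfs/sysctl setup:
 *
 *	# flush the per-netns routing cache and bump the fnhe genid
 *	echo 1 > /proc/sys/net/ipv4/route/flush
 *
 * The "flush" entry is write-only (mode 0200); any write invokes
 * ipv4_sysctl_rtcache_flush() above, while reads are rejected with -EINVAL.
 */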
2818 
2819 static __net_init int sysctl_route_net_init(struct net *net)
2820 {
2821 	struct ctl_table *tbl;
2822 
2823 	tbl = ipv4_route_flush_table;
2824 	if (!net_eq(net, &init_net)) {
2825 		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2826 		if (!tbl)
2827 			goto err_dup;
2828 
2829 		/* Don't export sysctls to unprivileged users */
2830 		if (net->user_ns != &init_user_ns)
2831 			tbl[0].procname = NULL;
2832 	}
2833 	tbl[0].extra1 = net;
2834 
2835 	net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2836 	if (!net->ipv4.route_hdr)
2837 		goto err_reg;
2838 	return 0;
2839 
2840 err_reg:
2841 	if (tbl != ipv4_route_flush_table)
2842 		kfree(tbl);
2843 err_dup:
2844 	return -ENOMEM;
2845 }
2846 
2847 static __net_exit void sysctl_route_net_exit(struct net *net)
2848 {
2849 	struct ctl_table *tbl;
2850 
2851 	tbl = net->ipv4.route_hdr->ctl_table_arg;
2852 	unregister_net_sysctl_table(net->ipv4.route_hdr);
2853 	BUG_ON(tbl == ipv4_route_flush_table);
2854 	kfree(tbl);
2855 }
2856 
2857 static __net_initdata struct pernet_operations sysctl_route_ops = {
2858 	.init = sysctl_route_net_init,
2859 	.exit = sysctl_route_net_exit,
2860 };
2861 #endif
2862 
2863 static __net_init int rt_genid_init(struct net *net)
2864 {
2865 	atomic_set(&net->ipv4.rt_genid, 0);
2866 	atomic_set(&net->fnhe_genid, 0);
2867 	get_random_bytes(&net->ipv4.dev_addr_genid,
2868 			 sizeof(net->ipv4.dev_addr_genid));
2869 	return 0;
2870 }
2871 
2872 static __net_initdata struct pernet_operations rt_genid_ops = {
2873 	.init = rt_genid_init,
2874 };
2875 
2876 static int __net_init ipv4_inetpeer_init(struct net *net)
2877 {
2878 	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2879 
2880 	if (!bp)
2881 		return -ENOMEM;
2882 	inet_peer_base_init(bp);
2883 	net->ipv4.peers = bp;
2884 	return 0;
2885 }
2886 
2887 static void __net_exit ipv4_inetpeer_exit(struct net *net)
2888 {
2889 	struct inet_peer_base *bp = net->ipv4.peers;
2890 
2891 	net->ipv4.peers = NULL;
2892 	inetpeer_invalidate_tree(bp);
2893 	kfree(bp);
2894 }
2895 
2896 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
2897 	.init	=	ipv4_inetpeer_init,
2898 	.exit	=	ipv4_inetpeer_exit,
2899 };
2900 
2901 #ifdef CONFIG_IP_ROUTE_CLASSID
2902 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
2903 #endif /* CONFIG_IP_ROUTE_CLASSID */
2904 
2905 int __init ip_rt_init(void)
2906 {
2907 	int rc = 0;
2908 	int cpu;
2909 
2910 	ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
2911 	if (!ip_idents)
2912 		panic("IP: failed to allocate ip_idents\n");
2913 
2914 	prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
2915 
2916 	ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
2917 	if (!ip_tstamps)
2918 		panic("IP: failed to allocate ip_tstamps\n");
2919 
2920 	for_each_possible_cpu(cpu) {
2921 		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
2922 
2923 		INIT_LIST_HEAD(&ul->head);
2924 		spin_lock_init(&ul->lock);
2925 	}
2926 #ifdef CONFIG_IP_ROUTE_CLASSID
2927 	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
2928 	if (!ip_rt_acct)
2929 		panic("IP: failed to allocate ip_rt_acct\n");
2930 #endif
2931 
2932 	ipv4_dst_ops.kmem_cachep =
2933 		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2934 				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2935 
2936 	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2937 
2938 	if (dst_entries_init(&ipv4_dst_ops) < 0)
2939 		panic("IP: failed to allocate ipv4_dst_ops counter\n");
2940 
2941 	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
2942 		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
2943 
2944 	ipv4_dst_ops.gc_thresh = ~0;
2945 	ip_rt_max_size = INT_MAX;
2946 
2947 	devinet_init();
2948 	ip_fib_init();
2949 
2950 	if (ip_rt_proc_init())
2951 		pr_err("Unable to create route proc files\n");
2952 #ifdef CONFIG_XFRM
2953 	xfrm_init();
2954 	xfrm4_init();
2955 #endif
2956 	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
2957 
2958 #ifdef CONFIG_SYSCTL
2959 	register_pernet_subsys(&sysctl_route_ops);
2960 #endif
2961 	register_pernet_subsys(&rt_genid_ops);
2962 	register_pernet_subsys(&ipv4_inetpeer_ops);
2963 	return rc;
2964 }
2965 
2966 #ifdef CONFIG_SYSCTL
2967 /*
2968  * We really need to sanitize the damn ipv4 init order, then all
2969  * this nonsense will go away.
2970  */
2971 void __init ip_static_sysctl_init(void)
2972 {
2973 	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
2974 }
2975 #endif
2976