1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		ROUTE - implementation of the IP router.
8  *
9  * Authors:	Ross Biro
10  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
11  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
12  *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
13  *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
14  *
15  * Fixes:
16  *		Alan Cox	:	Verify area fixes.
17  *		Alan Cox	:	cli() protects routing changes
18  *		Rui Oliveira	:	ICMP routing table updates
19  *		(rco@di.uminho.pt)	Routing table insertion and update
20  *		Linus Torvalds	:	Rewrote bits to be sensible
21  *		Alan Cox	:	Added BSD route gw semantics
22  *		Alan Cox	:	Super /proc >4K
23  *		Alan Cox	:	MTU in route table
24  *		Alan Cox	: 	MSS actually. Also added the window
25  *					clamper.
26  *		Sam Lantinga	:	Fixed route matching in rt_del()
27  *		Alan Cox	:	Routing cache support.
28  *		Alan Cox	:	Removed compatibility cruft.
29  *		Alan Cox	:	RTF_REJECT support.
30  *		Alan Cox	:	TCP irtt support.
31  *		Jonathan Naylor	:	Added Metric support.
32  *	Miquel van Smoorenburg	:	BSD API fixes.
33  *	Miquel van Smoorenburg	:	Metrics.
34  *		Alan Cox	:	Use __u32 properly
35  *		Alan Cox	:	Aligned routing errors more closely with BSD
36  *					our system is still very different.
37  *		Alan Cox	:	Faster /proc handling
38  *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
39  *					routing caches and better behaviour.
40  *
41  *		Olaf Erb	:	irtt wasn't being copied right.
42  *		Bjorn Ekwall	:	Kerneld route support.
43  *		Alan Cox	:	Multicast fixed (I hope)
44  * 		Pavel Krauz	:	Limited broadcast fixed
45  *		Mike McLagan	:	Routing by source
46  *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
47  *					route.c and rewritten from scratch.
48  *		Andi Kleen	:	Load-limit warning messages.
49  *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
50  *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
51  *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
52  *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
53  *		Marc Boucher	:	routing by fwmark
54  *	Robert Olsson		:	Added rt_cache statistics
55  *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
56  *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
57  * 	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
58  * 	Ilia Sotnikov		:	Removed TOS from hash calculations
59  */
60 
61 #define pr_fmt(fmt) "IPv4: " fmt
62 
63 #include <linux/module.h>
64 #include <linux/uaccess.h>
65 #include <linux/bitops.h>
66 #include <linux/types.h>
67 #include <linux/kernel.h>
68 #include <linux/mm.h>
69 #include <linux/memblock.h>
70 #include <linux/string.h>
71 #include <linux/socket.h>
72 #include <linux/sockios.h>
73 #include <linux/errno.h>
74 #include <linux/in.h>
75 #include <linux/inet.h>
76 #include <linux/netdevice.h>
77 #include <linux/proc_fs.h>
78 #include <linux/init.h>
79 #include <linux/skbuff.h>
80 #include <linux/inetdevice.h>
81 #include <linux/igmp.h>
82 #include <linux/pkt_sched.h>
83 #include <linux/mroute.h>
84 #include <linux/netfilter_ipv4.h>
85 #include <linux/random.h>
86 #include <linux/rcupdate.h>
87 #include <linux/times.h>
88 #include <linux/slab.h>
89 #include <linux/jhash.h>
90 #include <net/dst.h>
91 #include <net/dst_metadata.h>
92 #include <net/net_namespace.h>
93 #include <net/protocol.h>
94 #include <net/ip.h>
95 #include <net/route.h>
96 #include <net/inetpeer.h>
97 #include <net/sock.h>
98 #include <net/ip_fib.h>
99 #include <net/nexthop.h>
100 #include <net/arp.h>
101 #include <net/tcp.h>
102 #include <net/icmp.h>
103 #include <net/xfrm.h>
104 #include <net/lwtunnel.h>
105 #include <net/netevent.h>
106 #include <net/rtnetlink.h>
107 #ifdef CONFIG_SYSCTL
108 #include <linux/sysctl.h>
109 #endif
110 #include <net/secure_seq.h>
111 #include <net/ip_tunnels.h>
112 #include <net/l3mdev.h>
113 
114 #include "fib_lookup.h"
115 
116 #define RT_FL_TOS(oldflp4) \
117 	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
118 
119 #define RT_GC_TIMEOUT (300*HZ)
120 
121 static int ip_rt_max_size;
122 static int ip_rt_redirect_number __read_mostly	= 9;
123 static int ip_rt_redirect_load __read_mostly	= HZ / 50;
124 static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
125 static int ip_rt_error_cost __read_mostly	= HZ;
126 static int ip_rt_error_burst __read_mostly	= 5 * HZ;
127 static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
128 static u32 ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
129 static int ip_rt_min_advmss __read_mostly	= 256;
130 
131 static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
132 
133 /*
134  *	Interface to generic destination cache.
135  */
136 
137 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
138 static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
139 static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
140 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
141 static void		 ipv4_link_failure(struct sk_buff *skb);
142 static void		 ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
143 					   struct sk_buff *skb, u32 mtu,
144 					   bool confirm_neigh);
145 static void		 ip_do_redirect(struct dst_entry *dst, struct sock *sk,
146 					struct sk_buff *skb);
147 static void		ipv4_dst_destroy(struct dst_entry *dst);
148 
149 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
150 {
151 	WARN_ON(1);
152 	return NULL;
153 }
154 
155 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
156 					   struct sk_buff *skb,
157 					   const void *daddr);
158 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
159 
160 static struct dst_ops ipv4_dst_ops = {
161 	.family =		AF_INET,
162 	.check =		ipv4_dst_check,
163 	.default_advmss =	ipv4_default_advmss,
164 	.mtu =			ipv4_mtu,
165 	.cow_metrics =		ipv4_cow_metrics,
166 	.destroy =		ipv4_dst_destroy,
167 	.negative_advice =	ipv4_negative_advice,
168 	.link_failure =		ipv4_link_failure,
169 	.update_pmtu =		ip_rt_update_pmtu,
170 	.redirect =		ip_do_redirect,
171 	.local_out =		__ip_local_out,
172 	.neigh_lookup =		ipv4_neigh_lookup,
173 	.confirm_neigh =	ipv4_confirm_neigh,
174 };
175 
176 #define ECN_OR_COST(class)	TC_PRIO_##class
177 
178 const __u8 ip_tos2prio[16] = {
179 	TC_PRIO_BESTEFFORT,
180 	ECN_OR_COST(BESTEFFORT),
181 	TC_PRIO_BESTEFFORT,
182 	ECN_OR_COST(BESTEFFORT),
183 	TC_PRIO_BULK,
184 	ECN_OR_COST(BULK),
185 	TC_PRIO_BULK,
186 	ECN_OR_COST(BULK),
187 	TC_PRIO_INTERACTIVE,
188 	ECN_OR_COST(INTERACTIVE),
189 	TC_PRIO_INTERACTIVE,
190 	ECN_OR_COST(INTERACTIVE),
191 	TC_PRIO_INTERACTIVE_BULK,
192 	ECN_OR_COST(INTERACTIVE_BULK),
193 	TC_PRIO_INTERACTIVE_BULK,
194 	ECN_OR_COST(INTERACTIVE_BULK)
195 };
196 EXPORT_SYMBOL(ip_tos2prio);
197 
198 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
199 #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
200 
201 #ifdef CONFIG_PROC_FS
202 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
203 {
204 	if (*pos)
205 		return NULL;
206 	return SEQ_START_TOKEN;
207 }
208 
209 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
210 {
211 	++*pos;
212 	return NULL;
213 }
214 
215 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
216 {
217 }
218 
219 static int rt_cache_seq_show(struct seq_file *seq, void *v)
220 {
221 	if (v == SEQ_START_TOKEN)
222 		seq_printf(seq, "%-127s\n",
223 			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
224 			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
225 			   "HHUptod\tSpecDst");
226 	return 0;
227 }
228 
229 static const struct seq_operations rt_cache_seq_ops = {
230 	.start  = rt_cache_seq_start,
231 	.next   = rt_cache_seq_next,
232 	.stop   = rt_cache_seq_stop,
233 	.show   = rt_cache_seq_show,
234 };
235 
236 static int rt_cache_seq_open(struct inode *inode, struct file *file)
237 {
238 	return seq_open(file, &rt_cache_seq_ops);
239 }
240 
241 static const struct file_operations rt_cache_seq_fops = {
242 	.open	 = rt_cache_seq_open,
243 	.read	 = seq_read,
244 	.llseek	 = seq_lseek,
245 	.release = seq_release,
246 };
247 
248 
249 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
250 {
251 	int cpu;
252 
253 	if (*pos == 0)
254 		return SEQ_START_TOKEN;
255 
256 	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
257 		if (!cpu_possible(cpu))
258 			continue;
259 		*pos = cpu+1;
260 		return &per_cpu(rt_cache_stat, cpu);
261 	}
262 	return NULL;
263 }
264 
265 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
266 {
267 	int cpu;
268 
269 	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
270 		if (!cpu_possible(cpu))
271 			continue;
272 		*pos = cpu+1;
273 		return &per_cpu(rt_cache_stat, cpu);
274 	}
275 	(*pos)++;
276 	return NULL;
277 
278 }
279 
280 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
281 {
282 
283 }
284 
285 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
286 {
287 	struct rt_cache_stat *st = v;
288 
289 	if (v == SEQ_START_TOKEN) {
290 		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
291 		return 0;
292 	}
293 
294 	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
295 		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
296 		   dst_entries_get_slow(&ipv4_dst_ops),
297 		   0, /* st->in_hit */
298 		   st->in_slow_tot,
299 		   st->in_slow_mc,
300 		   st->in_no_route,
301 		   st->in_brd,
302 		   st->in_martian_dst,
303 		   st->in_martian_src,
304 
305 		   0, /* st->out_hit */
306 		   st->out_slow_tot,
307 		   st->out_slow_mc,
308 
309 		   0, /* st->gc_total */
310 		   0, /* st->gc_ignored */
311 		   0, /* st->gc_goal_miss */
312 		   0, /* st->gc_dst_overflow */
313 		   0, /* st->in_hlist_search */
314 		   0  /* st->out_hlist_search */
315 		);
316 	return 0;
317 }
318 
319 static const struct seq_operations rt_cpu_seq_ops = {
320 	.start  = rt_cpu_seq_start,
321 	.next   = rt_cpu_seq_next,
322 	.stop   = rt_cpu_seq_stop,
323 	.show   = rt_cpu_seq_show,
324 };
325 
326 
327 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
328 {
329 	return seq_open(file, &rt_cpu_seq_ops);
330 }
331 
332 static const struct file_operations rt_cpu_seq_fops = {
333 	.open	 = rt_cpu_seq_open,
334 	.read	 = seq_read,
335 	.llseek	 = seq_lseek,
336 	.release = seq_release,
337 };
338 
339 #ifdef CONFIG_IP_ROUTE_CLASSID
340 static int rt_acct_proc_show(struct seq_file *m, void *v)
341 {
342 	struct ip_rt_acct *dst, *src;
343 	unsigned int i, j;
344 
345 	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
346 	if (!dst)
347 		return -ENOMEM;
348 
349 	for_each_possible_cpu(i) {
350 		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
351 		for (j = 0; j < 256; j++) {
352 			dst[j].o_bytes   += src[j].o_bytes;
353 			dst[j].o_packets += src[j].o_packets;
354 			dst[j].i_bytes   += src[j].i_bytes;
355 			dst[j].i_packets += src[j].i_packets;
356 		}
357 	}
358 
359 	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
360 	kfree(dst);
361 	return 0;
362 }
363 #endif
364 
365 static int __net_init ip_rt_do_proc_init(struct net *net)
366 {
367 	struct proc_dir_entry *pde;
368 
369 	pde = proc_create("rt_cache", 0444, net->proc_net,
370 			  &rt_cache_seq_fops);
371 	if (!pde)
372 		goto err1;
373 
374 	pde = proc_create("rt_cache", 0444,
375 			  net->proc_net_stat, &rt_cpu_seq_fops);
376 	if (!pde)
377 		goto err2;
378 
379 #ifdef CONFIG_IP_ROUTE_CLASSID
380 	pde = proc_create_single("rt_acct", 0, net->proc_net,
381 			rt_acct_proc_show);
382 	if (!pde)
383 		goto err3;
384 #endif
385 	return 0;
386 
387 #ifdef CONFIG_IP_ROUTE_CLASSID
388 err3:
389 	remove_proc_entry("rt_cache", net->proc_net_stat);
390 #endif
391 err2:
392 	remove_proc_entry("rt_cache", net->proc_net);
393 err1:
394 	return -ENOMEM;
395 }
396 
397 static void __net_exit ip_rt_do_proc_exit(struct net *net)
398 {
399 	remove_proc_entry("rt_cache", net->proc_net_stat);
400 	remove_proc_entry("rt_cache", net->proc_net);
401 #ifdef CONFIG_IP_ROUTE_CLASSID
402 	remove_proc_entry("rt_acct", net->proc_net);
403 #endif
404 }
405 
406 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
407 	.init = ip_rt_do_proc_init,
408 	.exit = ip_rt_do_proc_exit,
409 };
410 
411 static int __init ip_rt_proc_init(void)
412 {
413 	return register_pernet_subsys(&ip_rt_proc_ops);
414 }
415 
416 #else
417 static inline int ip_rt_proc_init(void)
418 {
419 	return 0;
420 }
421 #endif /* CONFIG_PROC_FS */
422 
423 static inline bool rt_is_expired(const struct rtable *rth)
424 {
425 	return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
426 }
427 
428 void rt_cache_flush(struct net *net)
429 {
430 	rt_genid_bump_ipv4(net);
431 }
432 
433 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
434 					   struct sk_buff *skb,
435 					   const void *daddr)
436 {
437 	const struct rtable *rt = container_of(dst, struct rtable, dst);
438 	struct net_device *dev = dst->dev;
439 	struct neighbour *n;
440 
441 	rcu_read_lock_bh();
442 
443 	if (likely(rt->rt_gw_family == AF_INET)) {
444 		n = ip_neigh_gw4(dev, rt->rt_gw4);
445 	} else if (rt->rt_gw_family == AF_INET6) {
446 		n = ip_neigh_gw6(dev, &rt->rt_gw6);
447         } else {
448 		__be32 pkey;
449 
450 		pkey = skb ? ip_hdr(skb)->daddr : *((__be32 *) daddr);
451 		n = ip_neigh_gw4(dev, pkey);
452 	}
453 
454 	if (!IS_ERR(n) && !refcount_inc_not_zero(&n->refcnt))
455 		n = NULL;
456 
457 	rcu_read_unlock_bh();
458 
459 	return n;
460 }
461 
462 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
463 {
464 	const struct rtable *rt = container_of(dst, struct rtable, dst);
465 	struct net_device *dev = dst->dev;
466 	const __be32 *pkey = daddr;
467 
468 	if (rt->rt_gw_family == AF_INET) {
469 		pkey = (const __be32 *)&rt->rt_gw4;
470 	} else if (rt->rt_gw_family == AF_INET6) {
471 		return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6);
472 	} else if (!daddr ||
473 		 (rt->rt_flags &
474 		  (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) {
475 		return;
476 	}
477 	__ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
478 }
479 
480 /* Hash tables of size 2048..262144 depending on RAM size.
481  * Each bucket uses 8 bytes.
482  */
483 static u32 ip_idents_mask __read_mostly;
484 static atomic_t *ip_idents __read_mostly;
485 static u32 *ip_tstamps __read_mostly;
486 
487 /* In order to protect privacy, we add a perturbation to identifiers
488  * if one generator is seldom used. This makes it hard for an attacker
489  * to infer how many packets were sent between two points in time.
490  */
491 u32 ip_idents_reserve(u32 hash, int segs)
492 {
493 	u32 bucket, old, now = (u32)jiffies;
494 	atomic_t *p_id;
495 	u32 *p_tstamp;
496 	u32 delta = 0;
497 
498 	bucket = hash & ip_idents_mask;
499 	p_tstamp = ip_tstamps + bucket;
500 	p_id = ip_idents + bucket;
501 	old = READ_ONCE(*p_tstamp);
502 
503 	if (old != now && cmpxchg(p_tstamp, old, now) == old)
504 		delta = prandom_u32_max(now - old);
505 
506 	/* If UBSAN reports an error there, please make sure your compiler
506 	 * supports -fno-strict-overflow before reporting it; that was a bug
508 	 * in UBSAN, and it has been fixed in GCC-8.
509 	 */
510 	return atomic_add_return(segs + delta, p_id) - segs;
511 }
512 EXPORT_SYMBOL(ip_idents_reserve);
513 
514 void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
515 {
516 	u32 hash, id;
517 
518 	/* Note the following code is not safe, but this is okay. */
519 	if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
520 		get_random_bytes(&net->ipv4.ip_id_key,
521 				 sizeof(net->ipv4.ip_id_key));
522 
523 	hash = siphash_3u32((__force u32)iph->daddr,
524 			    (__force u32)iph->saddr,
525 			    iph->protocol,
526 			    &net->ipv4.ip_id_key);
527 	id = ip_idents_reserve(hash, segs);
528 	iph->id = htons(id);
529 }
530 EXPORT_SYMBOL(__ip_select_ident);
531 
532 static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
533 			     const struct sock *sk,
534 			     const struct iphdr *iph,
535 			     int oif, u8 tos,
536 			     u8 prot, u32 mark, int flow_flags)
537 {
538 	if (sk) {
539 		const struct inet_sock *inet = inet_sk(sk);
540 
541 		oif = sk->sk_bound_dev_if;
542 		mark = sk->sk_mark;
543 		tos = RT_CONN_FLAGS(sk);
544 		prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
545 	}
546 	flowi4_init_output(fl4, oif, mark, tos,
547 			   RT_SCOPE_UNIVERSE, prot,
548 			   flow_flags,
549 			   iph->daddr, iph->saddr, 0, 0,
550 			   sock_net_uid(net, sk));
551 }
552 
553 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
554 			       const struct sock *sk)
555 {
556 	const struct net *net = dev_net(skb->dev);
557 	const struct iphdr *iph = ip_hdr(skb);
558 	int oif = skb->dev->ifindex;
559 	u8 tos = RT_TOS(iph->tos);
560 	u8 prot = iph->protocol;
561 	u32 mark = skb->mark;
562 
563 	__build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
564 }
565 
566 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
567 {
568 	const struct inet_sock *inet = inet_sk(sk);
569 	const struct ip_options_rcu *inet_opt;
570 	__be32 daddr = inet->inet_daddr;
571 
572 	rcu_read_lock();
573 	inet_opt = rcu_dereference(inet->inet_opt);
574 	if (inet_opt && inet_opt->opt.srr)
575 		daddr = inet_opt->opt.faddr;
576 	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
577 			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
578 			   inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
579 			   inet_sk_flowi_flags(sk),
580 			   daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
581 	rcu_read_unlock();
582 }
583 
584 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
585 				 const struct sk_buff *skb)
586 {
587 	if (skb)
588 		build_skb_flow_key(fl4, skb, sk);
589 	else
590 		build_sk_flow_key(fl4, sk);
591 }
592 
593 static DEFINE_SPINLOCK(fnhe_lock);
594 
595 static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
596 {
597 	struct rtable *rt;
598 
599 	rt = rcu_dereference(fnhe->fnhe_rth_input);
600 	if (rt) {
601 		RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
602 		dst_dev_put(&rt->dst);
603 		dst_release(&rt->dst);
604 	}
605 	rt = rcu_dereference(fnhe->fnhe_rth_output);
606 	if (rt) {
607 		RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
608 		dst_dev_put(&rt->dst);
609 		dst_release(&rt->dst);
610 	}
611 }
612 
613 static void fnhe_remove_oldest(struct fnhe_hash_bucket *hash)
614 {
615 	struct fib_nh_exception __rcu **fnhe_p, **oldest_p;
616 	struct fib_nh_exception *fnhe, *oldest = NULL;
617 
618 	for (fnhe_p = &hash->chain; ; fnhe_p = &fnhe->fnhe_next) {
619 		fnhe = rcu_dereference_protected(*fnhe_p,
620 						 lockdep_is_held(&fnhe_lock));
621 		if (!fnhe)
622 			break;
623 		if (!oldest ||
624 		    time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp)) {
625 			oldest = fnhe;
626 			oldest_p = fnhe_p;
627 		}
628 	}
629 	fnhe_flush_routes(oldest);
630 	*oldest_p = oldest->fnhe_next;
631 	kfree_rcu(oldest, rcu);
632 }
633 
634 static u32 fnhe_hashfun(__be32 daddr)
635 {
636 	static siphash_key_t fnhe_hash_key __read_mostly;
637 	u64 hval;
638 
639 	net_get_random_once(&fnhe_hash_key, sizeof(fnhe_hash_key));
640 	hval = siphash_1u32((__force u32)daddr, &fnhe_hash_key);
641 	return hash_64(hval, FNHE_HASH_SHIFT);
642 }
643 
644 static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
645 {
646 	rt->rt_pmtu = fnhe->fnhe_pmtu;
647 	rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
648 	rt->dst.expires = fnhe->fnhe_expires;
649 
650 	if (fnhe->fnhe_gw) {
651 		rt->rt_flags |= RTCF_REDIRECTED;
652 		rt->rt_uses_gateway = 1;
653 		rt->rt_gw_family = AF_INET;
654 		rt->rt_gw4 = fnhe->fnhe_gw;
655 	}
656 }
657 
658 static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr,
659 				  __be32 gw, u32 pmtu, bool lock,
660 				  unsigned long expires)
661 {
662 	struct fnhe_hash_bucket *hash;
663 	struct fib_nh_exception *fnhe;
664 	struct rtable *rt;
665 	u32 genid, hval;
666 	unsigned int i;
667 	int depth;
668 
669 	genid = fnhe_genid(dev_net(nhc->nhc_dev));
670 	hval = fnhe_hashfun(daddr);
671 
672 	spin_lock_bh(&fnhe_lock);
673 
674 	hash = rcu_dereference(nhc->nhc_exceptions);
675 	if (!hash) {
676 		hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
677 		if (!hash)
678 			goto out_unlock;
679 		rcu_assign_pointer(nhc->nhc_exceptions, hash);
680 	}
681 
682 	hash += hval;
683 
684 	depth = 0;
685 	for (fnhe = rcu_dereference(hash->chain); fnhe;
686 	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
687 		if (fnhe->fnhe_daddr == daddr)
688 			break;
689 		depth++;
690 	}
691 
692 	if (fnhe) {
693 		if (fnhe->fnhe_genid != genid)
694 			fnhe->fnhe_genid = genid;
695 		if (gw)
696 			fnhe->fnhe_gw = gw;
697 		if (pmtu) {
698 			fnhe->fnhe_pmtu = pmtu;
699 			fnhe->fnhe_mtu_locked = lock;
700 		}
701 		fnhe->fnhe_expires = max(1UL, expires);
702 		/* Update all cached dsts too */
703 		rt = rcu_dereference(fnhe->fnhe_rth_input);
704 		if (rt)
705 			fill_route_from_fnhe(rt, fnhe);
706 		rt = rcu_dereference(fnhe->fnhe_rth_output);
707 		if (rt)
708 			fill_route_from_fnhe(rt, fnhe);
709 	} else {
710 		/* Randomize max depth to avoid some side channel attacks. */
711 		int max_depth = FNHE_RECLAIM_DEPTH +
712 				prandom_u32_max(FNHE_RECLAIM_DEPTH);
713 
714 		while (depth > max_depth) {
715 			fnhe_remove_oldest(hash);
716 			depth--;
717 		}
718 
719 		fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
720 		if (!fnhe)
721 			goto out_unlock;
722 
723 		fnhe->fnhe_next = hash->chain;
724 
725 		fnhe->fnhe_genid = genid;
726 		fnhe->fnhe_daddr = daddr;
727 		fnhe->fnhe_gw = gw;
728 		fnhe->fnhe_pmtu = pmtu;
729 		fnhe->fnhe_mtu_locked = lock;
730 		fnhe->fnhe_expires = max(1UL, expires);
731 
732 		rcu_assign_pointer(hash->chain, fnhe);
733 
734 		/* Exception created; mark the cached routes for the nexthop
735 		 * stale, so anyone caching it rechecks if this exception
736 		 * applies to them.
737 		 */
738 		rt = rcu_dereference(nhc->nhc_rth_input);
739 		if (rt)
740 			rt->dst.obsolete = DST_OBSOLETE_KILL;
741 
742 		for_each_possible_cpu(i) {
743 			struct rtable __rcu **prt;
744 			prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i);
745 			rt = rcu_dereference(*prt);
746 			if (rt)
747 				rt->dst.obsolete = DST_OBSOLETE_KILL;
748 		}
749 	}
750 
751 	fnhe->fnhe_stamp = jiffies;
752 
753 out_unlock:
754 	spin_unlock_bh(&fnhe_lock);
755 }
756 
757 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
758 			     bool kill_route)
759 {
760 	__be32 new_gw = icmp_hdr(skb)->un.gateway;
761 	__be32 old_gw = ip_hdr(skb)->saddr;
762 	struct net_device *dev = skb->dev;
763 	struct in_device *in_dev;
764 	struct fib_result res;
765 	struct neighbour *n;
766 	struct net *net;
767 
768 	switch (icmp_hdr(skb)->code & 7) {
769 	case ICMP_REDIR_NET:
770 	case ICMP_REDIR_NETTOS:
771 	case ICMP_REDIR_HOST:
772 	case ICMP_REDIR_HOSTTOS:
773 		break;
774 
775 	default:
776 		return;
777 	}
778 
779 	if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw)
780 		return;
781 
782 	in_dev = __in_dev_get_rcu(dev);
783 	if (!in_dev)
784 		return;
785 
786 	net = dev_net(dev);
787 	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
788 	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
789 	    ipv4_is_zeronet(new_gw))
790 		goto reject_redirect;
791 
792 	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
793 		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
794 			goto reject_redirect;
795 		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
796 			goto reject_redirect;
797 	} else {
798 		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
799 			goto reject_redirect;
800 	}
801 
802 	n = __ipv4_neigh_lookup(rt->dst.dev, (__force u32)new_gw);
803 	if (!n)
804 		n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
805 	if (!IS_ERR(n)) {
806 		if (!(n->nud_state & NUD_VALID)) {
807 			neigh_event_send(n, NULL);
808 		} else {
809 			if (fib_lookup(net, fl4, &res, 0) == 0) {
810 				struct fib_nh_common *nhc;
811 
812 				fib_select_path(net, &res, fl4, skb);
813 				nhc = FIB_RES_NHC(res);
814 				update_or_create_fnhe(nhc, fl4->daddr, new_gw,
815 						0, false,
816 						jiffies + ip_rt_gc_timeout);
817 			}
818 			if (kill_route)
819 				rt->dst.obsolete = DST_OBSOLETE_KILL;
820 			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
821 		}
822 		neigh_release(n);
823 	}
824 	return;
825 
826 reject_redirect:
827 #ifdef CONFIG_IP_ROUTE_VERBOSE
828 	if (IN_DEV_LOG_MARTIANS(in_dev)) {
829 		const struct iphdr *iph = (const struct iphdr *) skb->data;
830 		__be32 daddr = iph->daddr;
831 		__be32 saddr = iph->saddr;
832 
833 		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
834 				     "  Advised path = %pI4 -> %pI4\n",
835 				     &old_gw, dev->name, &new_gw,
836 				     &saddr, &daddr);
837 	}
838 #endif
839 	;
840 }
841 
842 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
843 {
844 	struct rtable *rt;
845 	struct flowi4 fl4;
846 	const struct iphdr *iph = (const struct iphdr *) skb->data;
847 	struct net *net = dev_net(skb->dev);
848 	int oif = skb->dev->ifindex;
849 	u8 tos = RT_TOS(iph->tos);
850 	u8 prot = iph->protocol;
851 	u32 mark = skb->mark;
852 
853 	rt = (struct rtable *) dst;
854 
855 	__build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
856 	__ip_do_redirect(rt, skb, &fl4, true);
857 }
858 
859 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
860 {
861 	struct rtable *rt = (struct rtable *)dst;
862 	struct dst_entry *ret = dst;
863 
864 	if (rt) {
865 		if (dst->obsolete > 0) {
866 			ip_rt_put(rt);
867 			ret = NULL;
868 		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
869 			   rt->dst.expires) {
870 			ip_rt_put(rt);
871 			ret = NULL;
872 		}
873 	}
874 	return ret;
875 }
876 
877 /*
878  * Algorithm:
879  *	1. The first ip_rt_redirect_number redirects are sent
880  *	   with exponential backoff, then we stop sending them at all,
881  *	   assuming that the host ignores our redirects.
882  *	2. If we did not see packets requiring redirects
883  *	   during ip_rt_redirect_silence, we assume that the host
884  *	   forgot the redirected route and start to send redirects again.
885  *
886  * This algorithm is much cheaper and more intelligent than dumb load limiting
887  * in icmp.c.
888  *
889  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
890  * and "frag. need" (breaks PMTU discovery) in icmp.c.
891  */
892 
893 void ip_rt_send_redirect(struct sk_buff *skb)
894 {
895 	struct rtable *rt = skb_rtable(skb);
896 	struct in_device *in_dev;
897 	struct inet_peer *peer;
898 	struct net *net;
899 	int log_martians;
900 	int vif;
901 
902 	rcu_read_lock();
903 	in_dev = __in_dev_get_rcu(rt->dst.dev);
904 	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
905 		rcu_read_unlock();
906 		return;
907 	}
908 	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
909 	vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
910 	rcu_read_unlock();
911 
912 	net = dev_net(rt->dst.dev);
913 	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
914 	if (!peer) {
915 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
916 			  rt_nexthop(rt, ip_hdr(skb)->daddr));
917 		return;
918 	}
919 
920 	/* No redirected packets during ip_rt_redirect_silence;
921 	 * reset the algorithm.
922 	 */
923 	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
924 		peer->rate_tokens = 0;
925 		peer->n_redirects = 0;
926 	}
927 
928 	/* Too many ignored redirects; do not send anything.
929 	 * Set dst.rate_last to the last seen redirected packet.
930 	 */
931 	if (peer->n_redirects >= ip_rt_redirect_number) {
932 		peer->rate_last = jiffies;
933 		goto out_put_peer;
934 	}
935 
936 	/* Check for load limit; set rate_last to the latest sent
937 	 * redirect.
938 	 */
939 	if (peer->n_redirects == 0 ||
940 	    time_after(jiffies,
941 		       (peer->rate_last +
942 			(ip_rt_redirect_load << peer->n_redirects)))) {
943 		__be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
944 
945 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
946 		peer->rate_last = jiffies;
947 		++peer->n_redirects;
948 #ifdef CONFIG_IP_ROUTE_VERBOSE
949 		if (log_martians &&
950 		    peer->n_redirects == ip_rt_redirect_number)
951 			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
952 					     &ip_hdr(skb)->saddr, inet_iif(skb),
953 					     &ip_hdr(skb)->daddr, &gw);
954 #endif
955 	}
956 out_put_peer:
957 	inet_putpeer(peer);
958 }
959 
960 static int ip_error(struct sk_buff *skb)
961 {
962 	struct rtable *rt = skb_rtable(skb);
963 	struct net_device *dev = skb->dev;
964 	struct in_device *in_dev;
965 	struct inet_peer *peer;
966 	unsigned long now;
967 	struct net *net;
968 	bool send;
969 	int code;
970 
971 	if (netif_is_l3_master(skb->dev)) {
972 		dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
973 		if (!dev)
974 			goto out;
975 	}
976 
977 	in_dev = __in_dev_get_rcu(dev);
978 
979 	/* IP on this device is disabled. */
980 	if (!in_dev)
981 		goto out;
982 
983 	net = dev_net(rt->dst.dev);
984 	if (!IN_DEV_FORWARD(in_dev)) {
985 		switch (rt->dst.error) {
986 		case EHOSTUNREACH:
987 			__IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
988 			break;
989 
990 		case ENETUNREACH:
991 			__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
992 			break;
993 		}
994 		goto out;
995 	}
996 
997 	switch (rt->dst.error) {
998 	case EINVAL:
999 	default:
1000 		goto out;
1001 	case EHOSTUNREACH:
1002 		code = ICMP_HOST_UNREACH;
1003 		break;
1004 	case ENETUNREACH:
1005 		code = ICMP_NET_UNREACH;
1006 		__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
1007 		break;
1008 	case EACCES:
1009 		code = ICMP_PKT_FILTERED;
1010 		break;
1011 	}
1012 
1013 	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
1014 			       l3mdev_master_ifindex(skb->dev), 1);
1015 
1016 	send = true;
1017 	if (peer) {
1018 		now = jiffies;
1019 		peer->rate_tokens += now - peer->rate_last;
1020 		if (peer->rate_tokens > ip_rt_error_burst)
1021 			peer->rate_tokens = ip_rt_error_burst;
1022 		peer->rate_last = now;
1023 		if (peer->rate_tokens >= ip_rt_error_cost)
1024 			peer->rate_tokens -= ip_rt_error_cost;
1025 		else
1026 			send = false;
1027 		inet_putpeer(peer);
1028 	}
1029 	if (send)
1030 		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1031 
1032 out:	kfree_skb(skb);
1033 	return 0;
1034 }
1035 
1036 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1037 {
1038 	struct dst_entry *dst = &rt->dst;
1039 	struct net *net = dev_net(dst->dev);
1040 	u32 old_mtu = ipv4_mtu(dst);
1041 	struct fib_result res;
1042 	bool lock = false;
1043 
1044 	if (ip_mtu_locked(dst))
1045 		return;
1046 
1047 	if (old_mtu < mtu)
1048 		return;
1049 
1050 	if (mtu < ip_rt_min_pmtu) {
1051 		lock = true;
1052 		mtu = min(old_mtu, ip_rt_min_pmtu);
1053 	}
1054 
1055 	if (rt->rt_pmtu == mtu && !lock &&
1056 	    time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
1057 		return;
1058 
1059 	rcu_read_lock();
1060 	if (fib_lookup(net, fl4, &res, 0) == 0) {
1061 		struct fib_nh_common *nhc;
1062 
1063 		fib_select_path(net, &res, fl4, NULL);
1064 		nhc = FIB_RES_NHC(res);
1065 		update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
1066 				      jiffies + ip_rt_mtu_expires);
1067 	}
1068 	rcu_read_unlock();
1069 }
1070 
1071 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1072 			      struct sk_buff *skb, u32 mtu,
1073 			      bool confirm_neigh)
1074 {
1075 	struct rtable *rt = (struct rtable *) dst;
1076 	struct flowi4 fl4;
1077 
1078 	ip_rt_build_flow_key(&fl4, sk, skb);
1079 	__ip_rt_update_pmtu(rt, &fl4, mtu);
1080 }
1081 
1082 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1083 		      int oif, u8 protocol)
1084 {
1085 	const struct iphdr *iph = (const struct iphdr *) skb->data;
1086 	struct flowi4 fl4;
1087 	struct rtable *rt;
1088 	u32 mark = IP4_REPLY_MARK(net, skb->mark);
1089 
1090 	__build_flow_key(net, &fl4, NULL, iph, oif,
1091 			 RT_TOS(iph->tos), protocol, mark, 0);
1092 	rt = __ip_route_output_key(net, &fl4);
1093 	if (!IS_ERR(rt)) {
1094 		__ip_rt_update_pmtu(rt, &fl4, mtu);
1095 		ip_rt_put(rt);
1096 	}
1097 }
1098 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1099 
1100 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1101 {
1102 	const struct iphdr *iph = (const struct iphdr *) skb->data;
1103 	struct flowi4 fl4;
1104 	struct rtable *rt;
1105 
1106 	__build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1107 
1108 	if (!fl4.flowi4_mark)
1109 		fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1110 
1111 	rt = __ip_route_output_key(sock_net(sk), &fl4);
1112 	if (!IS_ERR(rt)) {
1113 		__ip_rt_update_pmtu(rt, &fl4, mtu);
1114 		ip_rt_put(rt);
1115 	}
1116 }
1117 
1118 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1119 {
1120 	const struct iphdr *iph = (const struct iphdr *) skb->data;
1121 	struct flowi4 fl4;
1122 	struct rtable *rt;
1123 	struct dst_entry *odst = NULL;
1124 	bool new = false;
1125 	struct net *net = sock_net(sk);
1126 
1127 	bh_lock_sock(sk);
1128 
1129 	if (!ip_sk_accept_pmtu(sk))
1130 		goto out;
1131 
1132 	odst = sk_dst_get(sk);
1133 
1134 	if (sock_owned_by_user(sk) || !odst) {
1135 		__ipv4_sk_update_pmtu(skb, sk, mtu);
1136 		goto out;
1137 	}
1138 
1139 	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1140 
1141 	rt = (struct rtable *)odst;
1142 	if (odst->obsolete && !odst->ops->check(odst, 0)) {
1143 		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1144 		if (IS_ERR(rt))
1145 			goto out;
1146 
1147 		new = true;
1148 	}
1149 
1150 	__ip_rt_update_pmtu((struct rtable *) xfrm_dst_path(&rt->dst), &fl4, mtu);
1151 
1152 	if (!dst_check(&rt->dst, 0)) {
1153 		if (new)
1154 			dst_release(&rt->dst);
1155 
1156 		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1157 		if (IS_ERR(rt))
1158 			goto out;
1159 
1160 		new = true;
1161 	}
1162 
1163 	if (new)
1164 		sk_dst_set(sk, &rt->dst);
1165 
1166 out:
1167 	bh_unlock_sock(sk);
1168 	dst_release(odst);
1169 }
1170 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1171 
1172 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1173 		   int oif, u8 protocol)
1174 {
1175 	const struct iphdr *iph = (const struct iphdr *) skb->data;
1176 	struct flowi4 fl4;
1177 	struct rtable *rt;
1178 
1179 	__build_flow_key(net, &fl4, NULL, iph, oif,
1180 			 RT_TOS(iph->tos), protocol, 0, 0);
1181 	rt = __ip_route_output_key(net, &fl4);
1182 	if (!IS_ERR(rt)) {
1183 		__ip_do_redirect(rt, skb, &fl4, false);
1184 		ip_rt_put(rt);
1185 	}
1186 }
1187 EXPORT_SYMBOL_GPL(ipv4_redirect);
1188 
1189 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1190 {
1191 	const struct iphdr *iph = (const struct iphdr *) skb->data;
1192 	struct flowi4 fl4;
1193 	struct rtable *rt;
1194 	struct net *net = sock_net(sk);
1195 
1196 	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1197 	rt = __ip_route_output_key(net, &fl4);
1198 	if (!IS_ERR(rt)) {
1199 		__ip_do_redirect(rt, skb, &fl4, false);
1200 		ip_rt_put(rt);
1201 	}
1202 }
1203 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1204 
1205 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1206 {
1207 	struct rtable *rt = (struct rtable *) dst;
1208 
1209 	/* All IPV4 dsts are created with ->obsolete set to the value
1210 	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1211 	 * into this function always.
1212 	 *
1213 	 * When a PMTU/redirect information update invalidates a route,
1214 	 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1215 	 * DST_OBSOLETE_DEAD.
1216 	 */
1217 	if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1218 		return NULL;
1219 	return dst;
1220 }
1221 
1222 static void ipv4_send_dest_unreach(struct sk_buff *skb)
1223 {
1224 	struct net_device *dev;
1225 	struct ip_options opt;
1226 	int res;
1227 
1228 	/* Recompile ip options since IPCB may not be valid anymore.
1229 	 * Also check we have a reasonable ipv4 header.
1230 	 */
1231 	if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
1232 	    ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
1233 		return;
1234 
1235 	memset(&opt, 0, sizeof(opt));
1236 	if (ip_hdr(skb)->ihl > 5) {
1237 		if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
1238 			return;
1239 		opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);
1240 
1241 		rcu_read_lock();
1242 		dev = skb->dev ? skb->dev : skb_rtable(skb)->dst.dev;
1243 		res = __ip_options_compile(dev_net(dev), &opt, skb, NULL);
1244 		rcu_read_unlock();
1245 
1246 		if (res)
1247 			return;
1248 	}
1249 	__icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
1250 }
1251 
1252 static void ipv4_link_failure(struct sk_buff *skb)
1253 {
1254 	struct rtable *rt;
1255 
1256 	ipv4_send_dest_unreach(skb);
1257 
1258 	rt = skb_rtable(skb);
1259 	if (rt)
1260 		dst_set_expires(&rt->dst, 0);
1261 }
1262 
1263 static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1264 {
1265 	pr_debug("%s: %pI4 -> %pI4, %s\n",
1266 		 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1267 		 skb->dev ? skb->dev->name : "?");
1268 	kfree_skb(skb);
1269 	WARN_ON(1);
1270 	return 0;
1271 }
1272 
1273 /*
1274    We do not cache source address of outgoing interface,
1275    because it is used only by IP RR, TS and SRR options,
1276    so that it is out of the fast path.
1277 
1278    BTW remember: "addr" is allowed to be not aligned
1279    in IP options!
1280  */
1281 
1282 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1283 {
1284 	__be32 src;
1285 
1286 	if (rt_is_output_route(rt))
1287 		src = ip_hdr(skb)->saddr;
1288 	else {
1289 		struct fib_result res;
1290 		struct iphdr *iph = ip_hdr(skb);
1291 		struct flowi4 fl4 = {
1292 			.daddr = iph->daddr,
1293 			.saddr = iph->saddr,
1294 			.flowi4_tos = RT_TOS(iph->tos),
1295 			.flowi4_oif = rt->dst.dev->ifindex,
1296 			.flowi4_iif = skb->dev->ifindex,
1297 			.flowi4_mark = skb->mark,
1298 		};
1299 
1300 		rcu_read_lock();
1301 		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1302 			src = fib_result_prefsrc(dev_net(rt->dst.dev), &res);
1303 		else
1304 			src = inet_select_addr(rt->dst.dev,
1305 					       rt_nexthop(rt, iph->daddr),
1306 					       RT_SCOPE_UNIVERSE);
1307 		rcu_read_unlock();
1308 	}
1309 	memcpy(addr, &src, 4);
1310 }
1311 
1312 #ifdef CONFIG_IP_ROUTE_CLASSID
1313 static void set_class_tag(struct rtable *rt, u32 tag)
1314 {
1315 	if (!(rt->dst.tclassid & 0xFFFF))
1316 		rt->dst.tclassid |= tag & 0xFFFF;
1317 	if (!(rt->dst.tclassid & 0xFFFF0000))
1318 		rt->dst.tclassid |= tag & 0xFFFF0000;
1319 }
1320 #endif
1321 
1322 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1323 {
1324 	unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
1325 	unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
1326 				    ip_rt_min_advmss);
1327 
1328 	return min(advmss, IPV4_MAX_PMTU - header_size);
1329 }
1330 
1331 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1332 {
1333 	const struct rtable *rt = (const struct rtable *) dst;
1334 	unsigned int mtu = rt->rt_pmtu;
1335 
1336 	if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1337 		mtu = dst_metric_raw(dst, RTAX_MTU);
1338 
1339 	if (mtu)
1340 		goto out;
1341 
1342 	mtu = READ_ONCE(dst->dev->mtu);
1343 
1344 	if (unlikely(ip_mtu_locked(dst))) {
1345 		if (rt->rt_uses_gateway && mtu > 576)
1346 			mtu = 576;
1347 	}
1348 
1349 out:
1350 	mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
1351 
1352 	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1353 }
1354 
1355 static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr)
1356 {
1357 	struct fnhe_hash_bucket *hash;
1358 	struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1359 	u32 hval = fnhe_hashfun(daddr);
1360 
1361 	spin_lock_bh(&fnhe_lock);
1362 
1363 	hash = rcu_dereference_protected(nhc->nhc_exceptions,
1364 					 lockdep_is_held(&fnhe_lock));
1365 	hash += hval;
1366 
1367 	fnhe_p = &hash->chain;
1368 	fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1369 	while (fnhe) {
1370 		if (fnhe->fnhe_daddr == daddr) {
1371 			rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1372 				fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1373 			/* set fnhe_daddr to 0 to ensure it won't bind with
1374 			 * new dsts in rt_bind_exception().
1375 			 */
1376 			fnhe->fnhe_daddr = 0;
1377 			fnhe_flush_routes(fnhe);
1378 			kfree_rcu(fnhe, rcu);
1379 			break;
1380 		}
1381 		fnhe_p = &fnhe->fnhe_next;
1382 		fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1383 						 lockdep_is_held(&fnhe_lock));
1384 	}
1385 
1386 	spin_unlock_bh(&fnhe_lock);
1387 }
1388 
1389 static struct fib_nh_exception *find_exception(struct fib_nh_common *nhc,
1390 					       __be32 daddr)
1391 {
1392 	struct fnhe_hash_bucket *hash = rcu_dereference(nhc->nhc_exceptions);
1393 	struct fib_nh_exception *fnhe;
1394 	u32 hval;
1395 
1396 	if (!hash)
1397 		return NULL;
1398 
1399 	hval = fnhe_hashfun(daddr);
1400 
1401 	for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1402 	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
1403 		if (fnhe->fnhe_daddr == daddr) {
1404 			if (fnhe->fnhe_expires &&
1405 			    time_after(jiffies, fnhe->fnhe_expires)) {
1406 				ip_del_fnhe(nhc, daddr);
1407 				break;
1408 			}
1409 			return fnhe;
1410 		}
1411 	}
1412 	return NULL;
1413 }
1414 
1415 /* MTU selection:
1416  * 1. mtu on route is locked - use it
1417  * 2. mtu from nexthop exception
1418  * 3. mtu from egress device
1419  */
1420 
1421 u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
1422 {
1423 	struct fib_nh_common *nhc = res->nhc;
1424 	struct net_device *dev = nhc->nhc_dev;
1425 	struct fib_info *fi = res->fi;
1426 	u32 mtu = 0;
1427 
1428 	if (READ_ONCE(dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu) ||
1429 	    fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
1430 		mtu = fi->fib_mtu;
1431 
1432 	if (likely(!mtu)) {
1433 		struct fib_nh_exception *fnhe;
1434 
1435 		fnhe = find_exception(nhc, daddr);
1436 		if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
1437 			mtu = fnhe->fnhe_pmtu;
1438 	}
1439 
1440 	if (likely(!mtu))
1441 		mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);
1442 
1443 	return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu);
1444 }
1445 
1446 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1447 			      __be32 daddr, const bool do_cache)
1448 {
1449 	bool ret = false;
1450 
1451 	spin_lock_bh(&fnhe_lock);
1452 
1453 	if (daddr == fnhe->fnhe_daddr) {
1454 		struct rtable __rcu **porig;
1455 		struct rtable *orig;
1456 		int genid = fnhe_genid(dev_net(rt->dst.dev));
1457 
1458 		if (rt_is_input_route(rt))
1459 			porig = &fnhe->fnhe_rth_input;
1460 		else
1461 			porig = &fnhe->fnhe_rth_output;
1462 		orig = rcu_dereference(*porig);
1463 
1464 		if (fnhe->fnhe_genid != genid) {
1465 			fnhe->fnhe_genid = genid;
1466 			fnhe->fnhe_gw = 0;
1467 			fnhe->fnhe_pmtu = 0;
1468 			fnhe->fnhe_expires = 0;
1469 			fnhe->fnhe_mtu_locked = false;
1470 			fnhe_flush_routes(fnhe);
1471 			orig = NULL;
1472 		}
1473 		fill_route_from_fnhe(rt, fnhe);
1474 		if (!rt->rt_gw4) {
1475 			rt->rt_gw4 = daddr;
1476 			rt->rt_gw_family = AF_INET;
1477 		}
1478 
1479 		if (do_cache) {
1480 			dst_hold(&rt->dst);
1481 			rcu_assign_pointer(*porig, rt);
1482 			if (orig) {
1483 				dst_dev_put(&orig->dst);
1484 				dst_release(&orig->dst);
1485 			}
1486 			ret = true;
1487 		}
1488 
1489 		fnhe->fnhe_stamp = jiffies;
1490 	}
1491 	spin_unlock_bh(&fnhe_lock);
1492 
1493 	return ret;
1494 }
1495 
1496 static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt)
1497 {
1498 	struct rtable *orig, *prev, **p;
1499 	bool ret = true;
1500 
1501 	if (rt_is_input_route(rt)) {
1502 		p = (struct rtable **)&nhc->nhc_rth_input;
1503 	} else {
1504 		p = (struct rtable **)raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
1505 	}
1506 	orig = *p;
1507 
1508 	/* hold dst before doing cmpxchg() to avoid race condition
1509 	 * on this dst
1510 	 */
1511 	dst_hold(&rt->dst);
1512 	prev = cmpxchg(p, orig, rt);
1513 	if (prev == orig) {
1514 		if (orig) {
1515 			rt_add_uncached_list(orig);
1516 			dst_release(&orig->dst);
1517 		}
1518 	} else {
1519 		dst_release(&rt->dst);
1520 		ret = false;
1521 	}
1522 
1523 	return ret;
1524 }
1525 
1526 struct uncached_list {
1527 	spinlock_t		lock;
1528 	struct list_head	head;
1529 };
1530 
1531 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1532 
1533 void rt_add_uncached_list(struct rtable *rt)
1534 {
1535 	struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1536 
1537 	rt->rt_uncached_list = ul;
1538 
1539 	spin_lock_bh(&ul->lock);
1540 	list_add_tail(&rt->rt_uncached, &ul->head);
1541 	spin_unlock_bh(&ul->lock);
1542 }
1543 
1544 void rt_del_uncached_list(struct rtable *rt)
1545 {
1546 	if (!list_empty(&rt->rt_uncached)) {
1547 		struct uncached_list *ul = rt->rt_uncached_list;
1548 
1549 		spin_lock_bh(&ul->lock);
1550 		list_del(&rt->rt_uncached);
1551 		spin_unlock_bh(&ul->lock);
1552 	}
1553 }
1554 
1555 static void ipv4_dst_destroy(struct dst_entry *dst)
1556 {
1557 	struct rtable *rt = (struct rtable *)dst;
1558 
1559 	ip_dst_metrics_put(dst);
1560 	rt_del_uncached_list(rt);
1561 }
1562 
1563 void rt_flush_dev(struct net_device *dev)
1564 {
1565 	struct rtable *rt;
1566 	int cpu;
1567 
1568 	for_each_possible_cpu(cpu) {
1569 		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1570 
1571 		spin_lock_bh(&ul->lock);
1572 		list_for_each_entry(rt, &ul->head, rt_uncached) {
1573 			if (rt->dst.dev != dev)
1574 				continue;
1575 			rt->dst.dev = blackhole_netdev;
1576 			dev_hold(rt->dst.dev);
1577 			dev_put(dev);
1578 		}
1579 		spin_unlock_bh(&ul->lock);
1580 	}
1581 }
1582 
1583 static bool rt_cache_valid(const struct rtable *rt)
1584 {
1585 	return	rt &&
1586 		rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1587 		!rt_is_expired(rt);
1588 }
1589 
1590 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1591 			   const struct fib_result *res,
1592 			   struct fib_nh_exception *fnhe,
1593 			   struct fib_info *fi, u16 type, u32 itag,
1594 			   const bool do_cache)
1595 {
1596 	bool cached = false;
1597 
1598 	if (fi) {
1599 		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1600 
1601 		if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) {
1602 			rt->rt_uses_gateway = 1;
1603 			rt->rt_gw_family = nhc->nhc_gw_family;
1604 			/* only INET and INET6 are supported */
1605 			if (likely(nhc->nhc_gw_family == AF_INET))
1606 				rt->rt_gw4 = nhc->nhc_gw.ipv4;
1607 			else
1608 				rt->rt_gw6 = nhc->nhc_gw.ipv6;
1609 		}
1610 
1611 		ip_dst_init_metrics(&rt->dst, fi->fib_metrics);
1612 
1613 #ifdef CONFIG_IP_ROUTE_CLASSID
1614 		if (nhc->nhc_family == AF_INET) {
1615 			struct fib_nh *nh;
1616 
1617 			nh = container_of(nhc, struct fib_nh, nh_common);
1618 			rt->dst.tclassid = nh->nh_tclassid;
1619 		}
1620 #endif
1621 		rt->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
1622 		if (unlikely(fnhe))
1623 			cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1624 		else if (do_cache)
1625 			cached = rt_cache_route(nhc, rt);
1626 		if (unlikely(!cached)) {
1627 			/* Routes we intend to cache in nexthop exception or
1628 			 * FIB nexthop have the DST_NOCACHE bit clear.
1629 			 * However, if we are unsuccessful at storing this
1630 			 * route into the cache we really need to set it.
1631 			 */
1632 			if (!rt->rt_gw4) {
1633 				rt->rt_gw_family = AF_INET;
1634 				rt->rt_gw4 = daddr;
1635 			}
1636 			rt_add_uncached_list(rt);
1637 		}
1638 	} else
1639 		rt_add_uncached_list(rt);
1640 
1641 #ifdef CONFIG_IP_ROUTE_CLASSID
1642 #ifdef CONFIG_IP_MULTIPLE_TABLES
1643 	set_class_tag(rt, res->tclassid);
1644 #endif
1645 	set_class_tag(rt, itag);
1646 #endif
1647 }
1648 
1649 struct rtable *rt_dst_alloc(struct net_device *dev,
1650 			    unsigned int flags, u16 type,
1651 			    bool nopolicy, bool noxfrm, bool will_cache)
1652 {
1653 	struct rtable *rt;
1654 
1655 	rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1656 		       (will_cache ? 0 : DST_HOST) |
1657 		       (nopolicy ? DST_NOPOLICY : 0) |
1658 		       (noxfrm ? DST_NOXFRM : 0));
1659 
1660 	if (rt) {
1661 		rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1662 		rt->rt_flags = flags;
1663 		rt->rt_type = type;
1664 		rt->rt_is_input = 0;
1665 		rt->rt_iif = 0;
1666 		rt->rt_pmtu = 0;
1667 		rt->rt_mtu_locked = 0;
1668 		rt->rt_uses_gateway = 0;
1669 		rt->rt_gw_family = 0;
1670 		rt->rt_gw4 = 0;
1671 		INIT_LIST_HEAD(&rt->rt_uncached);
1672 
1673 		rt->dst.output = ip_output;
1674 		if (flags & RTCF_LOCAL)
1675 			rt->dst.input = ip_local_deliver;
1676 	}
1677 
1678 	return rt;
1679 }
1680 EXPORT_SYMBOL(rt_dst_alloc);
1681 
1682 struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt)
1683 {
1684 	struct rtable *new_rt;
1685 
1686 	new_rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1687 			   rt->dst.flags);
1688 
1689 	if (new_rt) {
1690 		new_rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1691 		new_rt->rt_flags = rt->rt_flags;
1692 		new_rt->rt_type = rt->rt_type;
1693 		new_rt->rt_is_input = rt->rt_is_input;
1694 		new_rt->rt_iif = rt->rt_iif;
1695 		new_rt->rt_pmtu = rt->rt_pmtu;
1696 		new_rt->rt_mtu_locked = rt->rt_mtu_locked;
1697 		new_rt->rt_gw_family = rt->rt_gw_family;
1698 		if (rt->rt_gw_family == AF_INET)
1699 			new_rt->rt_gw4 = rt->rt_gw4;
1700 		else if (rt->rt_gw_family == AF_INET6)
1701 			new_rt->rt_gw6 = rt->rt_gw6;
1702 		INIT_LIST_HEAD(&new_rt->rt_uncached);
1703 
1704 		new_rt->dst.flags |= DST_HOST;
1705 		new_rt->dst.input = rt->dst.input;
1706 		new_rt->dst.output = rt->dst.output;
1707 		new_rt->dst.error = rt->dst.error;
1708 		new_rt->dst.lastuse = jiffies;
1709 		new_rt->dst.lwtstate = lwtstate_get(rt->dst.lwtstate);
1710 	}
1711 	return new_rt;
1712 }
1713 EXPORT_SYMBOL(rt_dst_clone);
1714 
1715 /* called in rcu_read_lock() section */
1716 int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1717 			  u8 tos, struct net_device *dev,
1718 			  struct in_device *in_dev, u32 *itag)
1719 {
1720 	int err;
1721 
1722 	/* Primary sanity checks. */
1723 	if (!in_dev)
1724 		return -EINVAL;
1725 
1726 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1727 	    skb->protocol != htons(ETH_P_IP))
1728 		return -EINVAL;
1729 
1730 	if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1731 		return -EINVAL;
1732 
1733 	if (ipv4_is_zeronet(saddr)) {
1734 		if (!ipv4_is_local_multicast(daddr) &&
1735 		    ip_hdr(skb)->protocol != IPPROTO_IGMP)
1736 			return -EINVAL;
1737 	} else {
1738 		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1739 					  in_dev, itag);
1740 		if (err < 0)
1741 			return err;
1742 	}
1743 	return 0;
1744 }
1745 
1746 /* called in rcu_read_lock() section */
1747 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1748 			     u8 tos, struct net_device *dev, int our)
1749 {
1750 	struct in_device *in_dev = __in_dev_get_rcu(dev);
1751 	unsigned int flags = RTCF_MULTICAST;
1752 	struct rtable *rth;
1753 	u32 itag = 0;
1754 	int err;
1755 
1756 	err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1757 	if (err)
1758 		return err;
1759 
1760 	if (our)
1761 		flags |= RTCF_LOCAL;
1762 
1763 	rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1764 			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1765 	if (!rth)
1766 		return -ENOBUFS;
1767 
1768 #ifdef CONFIG_IP_ROUTE_CLASSID
1769 	rth->dst.tclassid = itag;
1770 #endif
1771 	rth->dst.output = ip_rt_bug;
1772 	rth->rt_is_input= 1;
1773 
1774 #ifdef CONFIG_IP_MROUTE
1775 	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1776 		rth->dst.input = ip_mr_input;
1777 #endif
1778 	RT_CACHE_STAT_INC(in_slow_mc);
1779 
1780 	skb_dst_drop(skb);
1781 	skb_dst_set(skb, &rth->dst);
1782 	return 0;
1783 }
1784 
1785 
1786 static void ip_handle_martian_source(struct net_device *dev,
1787 				     struct in_device *in_dev,
1788 				     struct sk_buff *skb,
1789 				     __be32 daddr,
1790 				     __be32 saddr)
1791 {
1792 	RT_CACHE_STAT_INC(in_martian_src);
1793 #ifdef CONFIG_IP_ROUTE_VERBOSE
1794 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1795 		/*
1796 		 *	RFC1812 recommendation: if the source is martian,
1797 		 *	the only hint is the MAC header.
1798 		 */
1799 		pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1800 			&daddr, &saddr, dev->name);
1801 		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1802 			print_hex_dump(KERN_WARNING, "ll header: ",
1803 				       DUMP_PREFIX_OFFSET, 16, 1,
1804 				       skb_mac_header(skb),
1805 				       dev->hard_header_len, false);
1806 		}
1807 	}
1808 #endif
1809 }
1810 
1811 /* called in rcu_read_lock() section */
1812 static int __mkroute_input(struct sk_buff *skb,
1813 			   const struct fib_result *res,
1814 			   struct in_device *in_dev,
1815 			   __be32 daddr, __be32 saddr, u32 tos)
1816 {
1817 	struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1818 	struct net_device *dev = nhc->nhc_dev;
1819 	struct fib_nh_exception *fnhe;
1820 	struct rtable *rth;
1821 	int err;
1822 	struct in_device *out_dev;
1823 	bool do_cache;
1824 	u32 itag = 0;
1825 
1826 	/* get a working reference to the output device */
1827 	out_dev = __in_dev_get_rcu(dev);
1828 	if (!out_dev) {
1829 		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1830 		return -EINVAL;
1831 	}
1832 
1833 	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1834 				  in_dev->dev, in_dev, &itag);
1835 	if (err < 0) {
1836 		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1837 					 saddr);
1838 
1839 		goto cleanup;
1840 	}
1841 
1842 	do_cache = res->fi && !itag;
1843 	if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1844 	    skb->protocol == htons(ETH_P_IP)) {
1845 		__be32 gw;
1846 
1847 		gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0;
1848 		if (IN_DEV_SHARED_MEDIA(out_dev) ||
1849 		    inet_addr_onlink(out_dev, saddr, gw))
1850 			IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1851 	}
1852 
1853 	if (skb->protocol != htons(ETH_P_IP)) {
1854 		/* Not IP (i.e. ARP). Do not create a route if it is
1855 		 * invalid for proxy arp. DNAT routes are always valid.
1856 		 *
1857 		 * The proxy arp feature has been extended to allow ARP
1858 		 * replies back on the same interface, to support
1859 		 * Private VLAN switch technologies. See arp.c.
1860 		 */
1861 		if (out_dev == in_dev &&
1862 		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1863 			err = -EINVAL;
1864 			goto cleanup;
1865 		}
1866 	}
1867 
1868 	fnhe = find_exception(nhc, daddr);
1869 	if (do_cache) {
1870 		if (fnhe)
1871 			rth = rcu_dereference(fnhe->fnhe_rth_input);
1872 		else
1873 			rth = rcu_dereference(nhc->nhc_rth_input);
1874 		if (rt_cache_valid(rth)) {
1875 			skb_dst_set_noref(skb, &rth->dst);
1876 			goto out;
1877 		}
1878 	}
1879 
1880 	rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1881 			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
1882 			   IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1883 	if (!rth) {
1884 		err = -ENOBUFS;
1885 		goto cleanup;
1886 	}
1887 
1888 	rth->rt_is_input = 1;
1889 	RT_CACHE_STAT_INC(in_slow_tot);
1890 
1891 	rth->dst.input = ip_forward;
1892 
1893 	rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1894 		       do_cache);
1895 	lwtunnel_set_redirect(&rth->dst);
1896 	skb_dst_set(skb, &rth->dst);
1897 out:
1898 	err = 0;
1899  cleanup:
1900 	return err;
1901 }
1902 
1903 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1904 /* To make ICMP packets follow the right flow, the multipath hash is
1905  * calculated from the inner IP addresses.
1906  */
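/* Illustrative sketch (not part of the original source): for a forwarded
 * ICMP error such as Destination Unreachable, the embedded (inner) header
 * names the original flow, so hashing on it keeps the error on the same
 * multipath leg as the traffic that triggered it. A hypothetical caller of
 * the helper below would end up with, roughly:
 *
 *	struct flow_keys keys;
 *
 *	memset(&keys, 0, sizeof(keys));
 *	ip_multipath_l3_keys(skb, &keys);
 *
 * after which keys.addrs.v4addrs.src/dst hold the inner addresses when the
 * skb carries a relevant ICMP error, and the outer ones otherwise.
 */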
1907 static void ip_multipath_l3_keys(const struct sk_buff *skb,
1908 				 struct flow_keys *hash_keys)
1909 {
1910 	const struct iphdr *outer_iph = ip_hdr(skb);
1911 	const struct iphdr *key_iph = outer_iph;
1912 	const struct iphdr *inner_iph;
1913 	const struct icmphdr *icmph;
1914 	struct iphdr _inner_iph;
1915 	struct icmphdr _icmph;
1916 
1917 	if (likely(outer_iph->protocol != IPPROTO_ICMP))
1918 		goto out;
1919 
1920 	if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1921 		goto out;
1922 
1923 	icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1924 				   &_icmph);
1925 	if (!icmph)
1926 		goto out;
1927 
1928 	if (icmph->type != ICMP_DEST_UNREACH &&
1929 	    icmph->type != ICMP_REDIRECT &&
1930 	    icmph->type != ICMP_TIME_EXCEEDED &&
1931 	    icmph->type != ICMP_PARAMETERPROB)
1932 		goto out;
1933 
1934 	inner_iph = skb_header_pointer(skb,
1935 				       outer_iph->ihl * 4 + sizeof(_icmph),
1936 				       sizeof(_inner_iph), &_inner_iph);
1937 	if (!inner_iph)
1938 		goto out;
1939 
1940 	key_iph = inner_iph;
1941 out:
1942 	hash_keys->addrs.v4addrs.src = key_iph->saddr;
1943 	hash_keys->addrs.v4addrs.dst = key_iph->daddr;
1944 }
1945 
1946 /* if skb is set it will be used and fl4 can be NULL */
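/* Usage sketch (illustrative only, not from the original source): on the
 * output path no skb exists yet, so a caller is expected to describe the
 * flow through fl4 instead; "net", "src", "dst", "sport" and "dport" below
 * are placeholders:
 *
 *	struct flowi4 fl4 = {
 *		.saddr		= src,
 *		.daddr		= dst,
 *		.flowi4_proto	= IPPROTO_TCP,
 *		.fl4_sport	= sport,
 *		.fl4_dport	= dport,
 *	};
 *	int h = fib_multipath_hash(net, &fl4, NULL, NULL);
 *
 * With sysctl fib_multipath_hash_policy == 1 the ports and protocol feed
 * the hash; with policy 0 only the addresses do.
 */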
1947 int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
1948 		       const struct sk_buff *skb, struct flow_keys *flkeys)
1949 {
1950 	u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0;
1951 	struct flow_keys hash_keys;
1952 	u32 mhash;
1953 
1954 	switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1955 	case 0:
1956 		memset(&hash_keys, 0, sizeof(hash_keys));
1957 		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1958 		if (skb) {
1959 			ip_multipath_l3_keys(skb, &hash_keys);
1960 		} else {
1961 			hash_keys.addrs.v4addrs.src = fl4->saddr;
1962 			hash_keys.addrs.v4addrs.dst = fl4->daddr;
1963 		}
1964 		break;
1965 	case 1:
1966 		/* skb is currently provided only when forwarding */
1967 		if (skb) {
1968 			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1969 			struct flow_keys keys;
1970 
1971 			/* short-circuit if we already have L4 hash present */
1972 			if (skb->l4_hash)
1973 				return skb_get_hash_raw(skb) >> 1;
1974 
1975 			memset(&hash_keys, 0, sizeof(hash_keys));
1976 
1977 			if (!flkeys) {
1978 				skb_flow_dissect_flow_keys(skb, &keys, flag);
1979 				flkeys = &keys;
1980 			}
1981 
1982 			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1983 			hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
1984 			hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
1985 			hash_keys.ports.src = flkeys->ports.src;
1986 			hash_keys.ports.dst = flkeys->ports.dst;
1987 			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
1988 		} else {
1989 			memset(&hash_keys, 0, sizeof(hash_keys));
1990 			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1991 			hash_keys.addrs.v4addrs.src = fl4->saddr;
1992 			hash_keys.addrs.v4addrs.dst = fl4->daddr;
1993 			hash_keys.ports.src = fl4->fl4_sport;
1994 			hash_keys.ports.dst = fl4->fl4_dport;
1995 			hash_keys.basic.ip_proto = fl4->flowi4_proto;
1996 		}
1997 		break;
1998 	case 2:
1999 		memset(&hash_keys, 0, sizeof(hash_keys));
2000 		/* skb is currently provided only when forwarding */
2001 		if (skb) {
2002 			struct flow_keys keys;
2003 
2004 			skb_flow_dissect_flow_keys(skb, &keys, 0);
2005 			/* Inner can be v4 or v6 */
2006 			if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
2007 				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2008 				hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
2009 				hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
2010 			} else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
2011 				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2012 				hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
2013 				hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
2014 				hash_keys.tags.flow_label = keys.tags.flow_label;
2015 				hash_keys.basic.ip_proto = keys.basic.ip_proto;
2016 			} else {
2017 				/* Same as case 0 */
2018 				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2019 				ip_multipath_l3_keys(skb, &hash_keys);
2020 			}
2021 		} else {
2022 			/* Same as case 0 */
2023 			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2024 			hash_keys.addrs.v4addrs.src = fl4->saddr;
2025 			hash_keys.addrs.v4addrs.dst = fl4->daddr;
2026 		}
2027 		break;
2028 	}
2029 	mhash = flow_hash_from_keys(&hash_keys);
2030 
2031 	if (multipath_hash)
2032 		mhash = jhash_2words(mhash, multipath_hash, 0);
2033 
2034 	return mhash >> 1;
2035 }
2036 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
2037 
2038 static int ip_mkroute_input(struct sk_buff *skb,
2039 			    struct fib_result *res,
2040 			    struct in_device *in_dev,
2041 			    __be32 daddr, __be32 saddr, u32 tos,
2042 			    struct flow_keys *hkeys)
2043 {
2044 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2045 	if (res->fi && fib_info_num_path(res->fi) > 1) {
2046 		int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
2047 
2048 		fib_select_multipath(res, h);
2049 	}
2050 #endif
2051 
2052 	/* create a routing cache entry */
2053 	return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
2054 }
2055 
2056 /*
2057  *	NOTE. We drop all packets that have a local source
2058  *	address, because every properly looped-back packet
2059  *	must already have the correct destination attached by the output routine.
2060  *
2061  *	This approach solves two big problems:
2062  *	1. Non-simplex devices are handled properly.
2063  *	2. IP spoofing attempts are filtered with a 100% guarantee.
2064  *	called with rcu_read_lock()
2065  */
2066 
2067 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2068 			       u8 tos, struct net_device *dev,
2069 			       struct fib_result *res)
2070 {
2071 	struct in_device *in_dev = __in_dev_get_rcu(dev);
2072 	struct flow_keys *flkeys = NULL, _flkeys;
2073 	struct net    *net = dev_net(dev);
2074 	struct ip_tunnel_info *tun_info;
2075 	int		err = -EINVAL;
2076 	unsigned int	flags = 0;
2077 	u32		itag = 0;
2078 	struct rtable	*rth;
2079 	struct flowi4	fl4;
2080 	bool do_cache = true;
2081 
2082 	/* IP on this device is disabled. */
2083 
2084 	if (!in_dev)
2085 		goto out;
2086 
2087 	/* Check for the most weird martians, which cannot be detected
2088 	   by fib_lookup.
2089 	 */
2090 
2091 	tun_info = skb_tunnel_info(skb);
2092 	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2093 		fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
2094 	else
2095 		fl4.flowi4_tun_key.tun_id = 0;
2096 	skb_dst_drop(skb);
2097 
2098 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2099 		goto martian_source;
2100 
2101 	res->fi = NULL;
2102 	res->table = NULL;
2103 	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2104 		goto brd_input;
2105 
2106 	/* Accept zero addresses only for the limited broadcast;
2107 	 * I do not even know whether to fix this or not. Waiting for complaints :-)
2108 	 */
2109 	if (ipv4_is_zeronet(saddr))
2110 		goto martian_source;
2111 
2112 	if (ipv4_is_zeronet(daddr))
2113 		goto martian_destination;
2114 
2115 	/* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
2116 	 * and calls it at most once if daddr and/or saddr are loopback addresses
2117 	 */
2118 	if (ipv4_is_loopback(daddr)) {
2119 		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2120 			goto martian_destination;
2121 	} else if (ipv4_is_loopback(saddr)) {
2122 		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2123 			goto martian_source;
2124 	}
2125 
2126 	/*
2127 	 *	Now we are ready to route packet.
2128 	 */
2129 	fl4.flowi4_oif = 0;
2130 	fl4.flowi4_iif = dev->ifindex;
2131 	fl4.flowi4_mark = skb->mark;
2132 	fl4.flowi4_tos = tos;
2133 	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2134 	fl4.flowi4_flags = 0;
2135 	fl4.daddr = daddr;
2136 	fl4.saddr = saddr;
2137 	fl4.flowi4_uid = sock_net_uid(net, NULL);
2138 	fl4.flowi4_multipath_hash = 0;
2139 
2140 	if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
2141 		flkeys = &_flkeys;
2142 	} else {
2143 		fl4.flowi4_proto = 0;
2144 		fl4.fl4_sport = 0;
2145 		fl4.fl4_dport = 0;
2146 	}
2147 
2148 	err = fib_lookup(net, &fl4, res, 0);
2149 	if (err != 0) {
2150 		if (!IN_DEV_FORWARD(in_dev))
2151 			err = -EHOSTUNREACH;
2152 		goto no_route;
2153 	}
2154 
2155 	if (res->type == RTN_BROADCAST) {
2156 		if (IN_DEV_BFORWARD(in_dev))
2157 			goto make_route;
2158 		/* do not cache if bc_forwarding is enabled */
2159 		if (IPV4_DEVCONF_ALL(net, BC_FORWARDING))
2160 			do_cache = false;
2161 		goto brd_input;
2162 	}
2163 
2164 	if (res->type == RTN_LOCAL) {
2165 		err = fib_validate_source(skb, saddr, daddr, tos,
2166 					  0, dev, in_dev, &itag);
2167 		if (err < 0)
2168 			goto martian_source;
2169 		goto local_input;
2170 	}
2171 
2172 	if (!IN_DEV_FORWARD(in_dev)) {
2173 		err = -EHOSTUNREACH;
2174 		goto no_route;
2175 	}
2176 	if (res->type != RTN_UNICAST)
2177 		goto martian_destination;
2178 
2179 make_route:
2180 	err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
2181 out:	return err;
2182 
2183 brd_input:
2184 	if (skb->protocol != htons(ETH_P_IP))
2185 		goto e_inval;
2186 
2187 	if (!ipv4_is_zeronet(saddr)) {
2188 		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2189 					  in_dev, &itag);
2190 		if (err < 0)
2191 			goto martian_source;
2192 	}
2193 	flags |= RTCF_BROADCAST;
2194 	res->type = RTN_BROADCAST;
2195 	RT_CACHE_STAT_INC(in_brd);
2196 
2197 local_input:
2198 	do_cache &= res->fi && !itag;
2199 	if (do_cache) {
2200 		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2201 
2202 		rth = rcu_dereference(nhc->nhc_rth_input);
2203 		if (rt_cache_valid(rth)) {
2204 			skb_dst_set_noref(skb, &rth->dst);
2205 			err = 0;
2206 			goto out;
2207 		}
2208 	}
2209 
2210 	rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
2211 			   flags | RTCF_LOCAL, res->type,
2212 			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
2213 	if (!rth)
2214 		goto e_nobufs;
2215 
2216 	rth->dst.output = ip_rt_bug;
2217 #ifdef CONFIG_IP_ROUTE_CLASSID
2218 	rth->dst.tclassid = itag;
2219 #endif
2220 	rth->rt_is_input = 1;
2221 
2222 	RT_CACHE_STAT_INC(in_slow_tot);
2223 	if (res->type == RTN_UNREACHABLE) {
2224 		rth->dst.input = ip_error;
2225 		rth->dst.error = -err;
2226 		rth->rt_flags &= ~RTCF_LOCAL;
2227 	}
2228 
2229 	if (do_cache) {
2230 		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2231 
2232 		rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
2233 		if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2234 			WARN_ON(rth->dst.input == lwtunnel_input);
2235 			rth->dst.lwtstate->orig_input = rth->dst.input;
2236 			rth->dst.input = lwtunnel_input;
2237 		}
2238 
2239 		if (unlikely(!rt_cache_route(nhc, rth)))
2240 			rt_add_uncached_list(rth);
2241 	}
2242 	skb_dst_set(skb, &rth->dst);
2243 	err = 0;
2244 	goto out;
2245 
2246 no_route:
2247 	RT_CACHE_STAT_INC(in_no_route);
2248 	res->type = RTN_UNREACHABLE;
2249 	res->fi = NULL;
2250 	res->table = NULL;
2251 	goto local_input;
2252 
2253 	/*
2254 	 *	Do not cache martian addresses: they should be logged (RFC1812)
2255 	 */
2256 martian_destination:
2257 	RT_CACHE_STAT_INC(in_martian_dst);
2258 #ifdef CONFIG_IP_ROUTE_VERBOSE
2259 	if (IN_DEV_LOG_MARTIANS(in_dev))
2260 		net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2261 				     &daddr, &saddr, dev->name);
2262 #endif
2263 
2264 e_inval:
2265 	err = -EINVAL;
2266 	goto out;
2267 
2268 e_nobufs:
2269 	err = -ENOBUFS;
2270 	goto out;
2271 
2272 martian_source:
2273 	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2274 	goto out;
2275 }
2276 
2277 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2278 			 u8 tos, struct net_device *dev)
2279 {
2280 	struct fib_result res;
2281 	int err;
2282 
2283 	tos &= IPTOS_RT_MASK;
2284 	rcu_read_lock();
2285 	err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2286 	rcu_read_unlock();
2287 
2288 	return err;
2289 }
2290 EXPORT_SYMBOL(ip_route_input_noref);
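/* Usage sketch (illustrative only, not from the original source): the IP
 * receive path typically resolves the input route for a freshly received
 * packet along these lines ("skb" and "dev" are placeholders):
 *
 *	const struct iphdr *iph = ip_hdr(skb);
 *	int err;
 *
 *	err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
 *				   iph->tos, dev);
 *	if (err)
 *		goto drop;
 *
 * On success a dst has been attached to the skb (see skb_dst(skb)) without
 * taking a reference, hence the _noref suffix.
 */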
2291 
2292 /* called with rcu_read_lock held */
2293 int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2294 		       u8 tos, struct net_device *dev, struct fib_result *res)
2295 {
2296 	/* Multicast recognition logic is moved from the route cache to here.
2297 	   The problem was that too many Ethernet cards have broken/missing
2298 	   hardware multicast filters :-( As a result, a host on a multicast
2299 	   network acquires a lot of useless route cache entries, e.g. for
2300 	   SDR messages from all over the world. Now we try to get rid of them.
2301 	   Really, provided the software IP multicast filter is organized
2302 	   reasonably (at least, hashed), this does not result in a slowdown
2303 	   compared with route cache reject entries.
2304 	   Note that multicast routers are not affected, because
2305 	   a route cache entry is created eventually.
2306 	 */
2307 	if (ipv4_is_multicast(daddr)) {
2308 		struct in_device *in_dev = __in_dev_get_rcu(dev);
2309 		int our = 0;
2310 		int err = -EINVAL;
2311 
2312 		if (!in_dev)
2313 			return err;
2314 		our = ip_check_mc_rcu(in_dev, daddr, saddr,
2315 				      ip_hdr(skb)->protocol);
2316 
2317 		/* check l3 master if no match yet */
2318 		if (!our && netif_is_l3_slave(dev)) {
2319 			struct in_device *l3_in_dev;
2320 
2321 			l3_in_dev = __in_dev_get_rcu(skb->dev);
2322 			if (l3_in_dev)
2323 				our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2324 						      ip_hdr(skb)->protocol);
2325 		}
2326 
2327 		if (our
2328 #ifdef CONFIG_IP_MROUTE
2329 			||
2330 		    (!ipv4_is_local_multicast(daddr) &&
2331 		     IN_DEV_MFORWARD(in_dev))
2332 #endif
2333 		   ) {
2334 			err = ip_route_input_mc(skb, daddr, saddr,
2335 						tos, dev, our);
2336 		}
2337 		return err;
2338 	}
2339 
2340 	return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2341 }
2342 
2343 /* called with rcu_read_lock() */
2344 static struct rtable *__mkroute_output(const struct fib_result *res,
2345 				       const struct flowi4 *fl4, int orig_oif,
2346 				       struct net_device *dev_out,
2347 				       unsigned int flags)
2348 {
2349 	struct fib_info *fi = res->fi;
2350 	struct fib_nh_exception *fnhe;
2351 	struct in_device *in_dev;
2352 	u16 type = res->type;
2353 	struct rtable *rth;
2354 	bool do_cache;
2355 
2356 	in_dev = __in_dev_get_rcu(dev_out);
2357 	if (!in_dev)
2358 		return ERR_PTR(-EINVAL);
2359 
2360 	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2361 		if (ipv4_is_loopback(fl4->saddr) &&
2362 		    !(dev_out->flags & IFF_LOOPBACK) &&
2363 		    !netif_is_l3_master(dev_out))
2364 			return ERR_PTR(-EINVAL);
2365 
2366 	if (ipv4_is_lbcast(fl4->daddr))
2367 		type = RTN_BROADCAST;
2368 	else if (ipv4_is_multicast(fl4->daddr))
2369 		type = RTN_MULTICAST;
2370 	else if (ipv4_is_zeronet(fl4->daddr))
2371 		return ERR_PTR(-EINVAL);
2372 
2373 	if (dev_out->flags & IFF_LOOPBACK)
2374 		flags |= RTCF_LOCAL;
2375 
2376 	do_cache = true;
2377 	if (type == RTN_BROADCAST) {
2378 		flags |= RTCF_BROADCAST | RTCF_LOCAL;
2379 		fi = NULL;
2380 	} else if (type == RTN_MULTICAST) {
2381 		flags |= RTCF_MULTICAST | RTCF_LOCAL;
2382 		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2383 				     fl4->flowi4_proto))
2384 			flags &= ~RTCF_LOCAL;
2385 		else
2386 			do_cache = false;
2387 		/* If a multicast route does not exist, use
2388 		 * the default one, but do not gateway in this case.
2389 		 * Yes, it is a hack.
2390 		 */
2391 		if (fi && res->prefixlen < 4)
2392 			fi = NULL;
2393 	} else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2394 		   (orig_oif != dev_out->ifindex)) {
2395 		/* For local routes that require a particular output interface
2396 		 * we do not want to cache the result.  Caching the result
2397 		 * causes incorrect behaviour when there are multiple source
2398 		 * addresses on the interface, the end result being that if the
2399 		 * intended recipient is waiting on that interface for the
2400 		 * packet he won't receive it because it will be delivered on
2401 		 * the loopback interface and the IP_PKTINFO ipi_ifindex will
2402 		 * be set to the loopback interface as well.
2403 		 */
2404 		do_cache = false;
2405 	}
2406 
2407 	fnhe = NULL;
2408 	do_cache &= fi != NULL;
2409 	if (fi) {
2410 		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2411 		struct rtable __rcu **prth;
2412 
2413 		fnhe = find_exception(nhc, fl4->daddr);
2414 		if (!do_cache)
2415 			goto add;
2416 		if (fnhe) {
2417 			prth = &fnhe->fnhe_rth_output;
2418 		} else {
2419 			if (unlikely(fl4->flowi4_flags &
2420 				     FLOWI_FLAG_KNOWN_NH &&
2421 				     !(nhc->nhc_gw_family &&
2422 				       nhc->nhc_scope == RT_SCOPE_LINK))) {
2423 				do_cache = false;
2424 				goto add;
2425 			}
2426 			prth = raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
2427 		}
2428 		rth = rcu_dereference(*prth);
2429 		if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2430 			return rth;
2431 	}
2432 
2433 add:
2434 	rth = rt_dst_alloc(dev_out, flags, type,
2435 			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
2436 			   IN_DEV_CONF_GET(in_dev, NOXFRM),
2437 			   do_cache);
2438 	if (!rth)
2439 		return ERR_PTR(-ENOBUFS);
2440 
2441 	rth->rt_iif = orig_oif;
2442 
2443 	RT_CACHE_STAT_INC(out_slow_tot);
2444 
2445 	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2446 		if (flags & RTCF_LOCAL &&
2447 		    !(dev_out->flags & IFF_LOOPBACK)) {
2448 			rth->dst.output = ip_mc_output;
2449 			RT_CACHE_STAT_INC(out_slow_mc);
2450 		}
2451 #ifdef CONFIG_IP_MROUTE
2452 		if (type == RTN_MULTICAST) {
2453 			if (IN_DEV_MFORWARD(in_dev) &&
2454 			    !ipv4_is_local_multicast(fl4->daddr)) {
2455 				rth->dst.input = ip_mr_input;
2456 				rth->dst.output = ip_mc_output;
2457 			}
2458 		}
2459 #endif
2460 	}
2461 
2462 	rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2463 	lwtunnel_set_redirect(&rth->dst);
2464 
2465 	return rth;
2466 }
2467 
2468 /*
2469  * Major route resolver routine.
2470  */
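/* Usage sketch (illustrative only, not part of the original source): a
 * hypothetical caller resolving an output route might do roughly:
 *
 *	struct flowi4 fl4 = {
 *		.daddr		= dst,
 *		.saddr		= src,		// may be 0; the resolver picks one
 *		.flowi4_oif	= oif,		// may be 0
 *		.flowi4_tos	= tos,
 *	};
 *	struct rtable *rt = ip_route_output_key_hash(net, &fl4, NULL);
 *
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 *	...
 *	ip_rt_put(rt);
 *
 * "dst", "src", "oif", "tos" and "net" are placeholders. The function takes
 * the RCU read lock itself and returns a referenced rtable or an ERR_PTR()
 * value.
 */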
2471 
2472 struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2473 					const struct sk_buff *skb)
2474 {
2475 	__u8 tos = RT_FL_TOS(fl4);
2476 	struct fib_result res = {
2477 		.type		= RTN_UNSPEC,
2478 		.fi		= NULL,
2479 		.table		= NULL,
2480 		.tclassid	= 0,
2481 	};
2482 	struct rtable *rth;
2483 
2484 	fl4->flowi4_iif = LOOPBACK_IFINDEX;
2485 	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2486 	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2487 			 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2488 
2489 	rcu_read_lock();
2490 	rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2491 	rcu_read_unlock();
2492 
2493 	return rth;
2494 }
2495 EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
2496 
2497 struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2498 					    struct fib_result *res,
2499 					    const struct sk_buff *skb)
2500 {
2501 	struct net_device *dev_out = NULL;
2502 	int orig_oif = fl4->flowi4_oif;
2503 	unsigned int flags = 0;
2504 	struct rtable *rth;
2505 	int err;
2506 
2507 	if (fl4->saddr) {
2508 		if (ipv4_is_multicast(fl4->saddr) ||
2509 		    ipv4_is_lbcast(fl4->saddr) ||
2510 		    ipv4_is_zeronet(fl4->saddr)) {
2511 			rth = ERR_PTR(-EINVAL);
2512 			goto out;
2513 		}
2514 
2515 		rth = ERR_PTR(-ENETUNREACH);
2516 
2517 		/* I removed check for oif == dev_out->oif here.
2518 		   It was wrong for two reasons:
2519 		   1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2520 		      is assigned to multiple interfaces.
2521 		   2. Moreover, we are allowed to send packets with saddr
2522 		      of another iface. --ANK
2523 		 */
2524 
2525 		if (fl4->flowi4_oif == 0 &&
2526 		    (ipv4_is_multicast(fl4->daddr) ||
2527 		     ipv4_is_lbcast(fl4->daddr))) {
2528 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2529 			dev_out = __ip_dev_find(net, fl4->saddr, false);
2530 			if (!dev_out)
2531 				goto out;
2532 
2533 			/* Special hack: the user can direct multicasts
2534 			   and limited broadcasts via the necessary interface
2535 			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2536 			   This hack is not just for fun, it allows
2537 			   vic, vat and friends to work.
2538 			   They bind the socket to loopback, set ttl to zero
2539 			   and expect that it will work.
2540 			   From the viewpoint of the routing cache they are broken,
2541 			   because we are not allowed to build a multicast path
2542 			   with a loopback source addr (look, the routing cache
2543 			   cannot know that ttl is zero, so the packet
2544 			   will not leave this host and the route is valid).
2545 			   Luckily, this hack is a good workaround.
2546 			 */
2547 
2548 			fl4->flowi4_oif = dev_out->ifindex;
2549 			goto make_route;
2550 		}
2551 
2552 		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2553 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2554 			if (!__ip_dev_find(net, fl4->saddr, false))
2555 				goto out;
2556 		}
2557 	}
2558 
2559 
2560 	if (fl4->flowi4_oif) {
2561 		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2562 		rth = ERR_PTR(-ENODEV);
2563 		if (!dev_out)
2564 			goto out;
2565 
2566 		/* RACE: Check return value of inet_select_addr instead. */
2567 		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2568 			rth = ERR_PTR(-ENETUNREACH);
2569 			goto out;
2570 		}
2571 		if (ipv4_is_local_multicast(fl4->daddr) ||
2572 		    ipv4_is_lbcast(fl4->daddr) ||
2573 		    fl4->flowi4_proto == IPPROTO_IGMP) {
2574 			if (!fl4->saddr)
2575 				fl4->saddr = inet_select_addr(dev_out, 0,
2576 							      RT_SCOPE_LINK);
2577 			goto make_route;
2578 		}
2579 		if (!fl4->saddr) {
2580 			if (ipv4_is_multicast(fl4->daddr))
2581 				fl4->saddr = inet_select_addr(dev_out, 0,
2582 							      fl4->flowi4_scope);
2583 			else if (!fl4->daddr)
2584 				fl4->saddr = inet_select_addr(dev_out, 0,
2585 							      RT_SCOPE_HOST);
2586 		}
2587 	}
2588 
2589 	if (!fl4->daddr) {
2590 		fl4->daddr = fl4->saddr;
2591 		if (!fl4->daddr)
2592 			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2593 		dev_out = net->loopback_dev;
2594 		fl4->flowi4_oif = LOOPBACK_IFINDEX;
2595 		res->type = RTN_LOCAL;
2596 		flags |= RTCF_LOCAL;
2597 		goto make_route;
2598 	}
2599 
2600 	err = fib_lookup(net, fl4, res, 0);
2601 	if (err) {
2602 		res->fi = NULL;
2603 		res->table = NULL;
2604 		if (fl4->flowi4_oif &&
2605 		    (ipv4_is_multicast(fl4->daddr) ||
2606 		    !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2607 			/* Apparently, the routing tables are wrong. Assume
2608 			   that the destination is on link.
2609 
2610 			   WHY? DW.
2611 			   Because we are allowed to send to an iface
2612 			   even if it has NO routes and NO assigned
2613 			   addresses. When oif is specified, the routing
2614 			   tables are looked up with only one purpose:
2615 			   to catch whether the destination is gatewayed,
2616 			   rather than direct. Moreover, if MSG_DONTROUTE is set,
2617 			   we send the packet, ignoring both routing tables
2618 			   and ifaddr state. --ANK
2619 
2620 
2621 			   We could make it even if oif is unknown,
2622 			   likely IPv6, but we do not.
2623 			 */
2624 
2625 			if (fl4->saddr == 0)
2626 				fl4->saddr = inet_select_addr(dev_out, 0,
2627 							      RT_SCOPE_LINK);
2628 			res->type = RTN_UNICAST;
2629 			goto make_route;
2630 		}
2631 		rth = ERR_PTR(err);
2632 		goto out;
2633 	}
2634 
2635 	if (res->type == RTN_LOCAL) {
2636 		if (!fl4->saddr) {
2637 			if (res->fi->fib_prefsrc)
2638 				fl4->saddr = res->fi->fib_prefsrc;
2639 			else
2640 				fl4->saddr = fl4->daddr;
2641 		}
2642 
2643 		/* L3 master device is the loopback for that domain */
2644 		dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2645 			net->loopback_dev;
2646 
2647 		/* make sure orig_oif points to fib result device even
2648 		 * though packet rx/tx happens over loopback or l3mdev
2649 		 */
2650 		orig_oif = FIB_RES_OIF(*res);
2651 
2652 		fl4->flowi4_oif = dev_out->ifindex;
2653 		flags |= RTCF_LOCAL;
2654 		goto make_route;
2655 	}
2656 
2657 	fib_select_path(net, res, fl4, skb);
2658 
2659 	dev_out = FIB_RES_DEV(*res);
2660 
2661 make_route:
2662 	rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2663 
2664 out:
2665 	return rth;
2666 }
2667 
2668 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2669 {
2670 	return NULL;
2671 }
2672 
2673 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2674 {
2675 	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2676 
2677 	return mtu ? : dst->dev->mtu;
2678 }
2679 
2680 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2681 					  struct sk_buff *skb, u32 mtu,
2682 					  bool confirm_neigh)
2683 {
2684 }
2685 
2686 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2687 				       struct sk_buff *skb)
2688 {
2689 }
2690 
2691 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2692 					  unsigned long old)
2693 {
2694 	return NULL;
2695 }
2696 
2697 static struct dst_ops ipv4_dst_blackhole_ops = {
2698 	.family			=	AF_INET,
2699 	.check			=	ipv4_blackhole_dst_check,
2700 	.mtu			=	ipv4_blackhole_mtu,
2701 	.default_advmss		=	ipv4_default_advmss,
2702 	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
2703 	.redirect		=	ipv4_rt_blackhole_redirect,
2704 	.cow_metrics		=	ipv4_rt_blackhole_cow_metrics,
2705 	.neigh_lookup		=	ipv4_neigh_lookup,
2706 };
2707 
2708 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2709 {
2710 	struct rtable *ort = (struct rtable *) dst_orig;
2711 	struct rtable *rt;
2712 
2713 	rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2714 	if (rt) {
2715 		struct dst_entry *new = &rt->dst;
2716 
2717 		new->__use = 1;
2718 		new->input = dst_discard;
2719 		new->output = dst_discard_out;
2720 
2721 		new->dev = net->loopback_dev;
2722 		if (new->dev)
2723 			dev_hold(new->dev);
2724 
2725 		rt->rt_is_input = ort->rt_is_input;
2726 		rt->rt_iif = ort->rt_iif;
2727 		rt->rt_pmtu = ort->rt_pmtu;
2728 		rt->rt_mtu_locked = ort->rt_mtu_locked;
2729 
2730 		rt->rt_genid = rt_genid_ipv4(net);
2731 		rt->rt_flags = ort->rt_flags;
2732 		rt->rt_type = ort->rt_type;
2733 		rt->rt_uses_gateway = ort->rt_uses_gateway;
2734 		rt->rt_gw_family = ort->rt_gw_family;
2735 		if (rt->rt_gw_family == AF_INET)
2736 			rt->rt_gw4 = ort->rt_gw4;
2737 		else if (rt->rt_gw_family == AF_INET6)
2738 			rt->rt_gw6 = ort->rt_gw6;
2739 
2740 		INIT_LIST_HEAD(&rt->rt_uncached);
2741 	}
2742 
2743 	dst_release(dst_orig);
2744 
2745 	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2746 }
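/* Note (illustrative, not from the original source): a caller that needs a
 * dst which silently drops traffic can convert an existing route roughly
 * like this ("net" and "rt" are placeholders):
 *
 *	struct dst_entry *dst = ipv4_blackhole_route(net, &rt->dst);
 *
 *	if (IS_ERR(dst))
 *		return PTR_ERR(dst);
 *	// dst->input and dst->output now point at dst_discard{,_out}
 *
 * The blackhole route keeps the original route's parameters but never
 * forwards or delivers packets, and dst_orig's reference is consumed.
 */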
2747 
2748 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2749 				    const struct sock *sk)
2750 {
2751 	struct rtable *rt = __ip_route_output_key(net, flp4);
2752 
2753 	if (IS_ERR(rt))
2754 		return rt;
2755 
2756 	if (flp4->flowi4_proto) {
2757 		flp4->flowi4_oif = rt->dst.dev->ifindex;
2758 		rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2759 							flowi4_to_flowi(flp4),
2760 							sk, 0);
2761 	}
2762 
2763 	return rt;
2764 }
2765 EXPORT_SYMBOL_GPL(ip_route_output_flow);
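/* Usage sketch (illustrative only, not part of the original source): a
 * connected UDP-style sender might resolve its route like this; "sk",
 * "net", "daddr", "saddr", "dport" and "sport" are placeholders:
 *
 *	struct flowi4 fl4 = {
 *		.flowi4_oif	= sk->sk_bound_dev_if,
 *		.flowi4_mark	= sk->sk_mark,
 *		.flowi4_proto	= IPPROTO_UDP,
 *		.daddr		= daddr,
 *		.saddr		= saddr,
 *		.fl4_dport	= dport,
 *		.fl4_sport	= sport,
 *	};
 *	struct rtable *rt = ip_route_output_flow(net, &fl4, sk);
 *
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 *
 * Because flowi4_proto is set, the result has additionally been passed
 * through xfrm_lookup_route(), so IPsec policy is applied.
 */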
2766 
2767 /* called with rcu_read_lock held */
2768 static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2769 			struct rtable *rt, u32 table_id, struct flowi4 *fl4,
2770 			struct sk_buff *skb, u32 portid, u32 seq,
2771 			unsigned int flags)
2772 {
2773 	struct rtmsg *r;
2774 	struct nlmsghdr *nlh;
2775 	unsigned long expires = 0;
2776 	u32 error;
2777 	u32 metrics[RTAX_MAX];
2778 
2779 	nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), flags);
2780 	if (!nlh)
2781 		return -EMSGSIZE;
2782 
2783 	r = nlmsg_data(nlh);
2784 	r->rtm_family	 = AF_INET;
2785 	r->rtm_dst_len	= 32;
2786 	r->rtm_src_len	= 0;
2787 	r->rtm_tos	= fl4 ? fl4->flowi4_tos : 0;
2788 	r->rtm_table	= table_id < 256 ? table_id : RT_TABLE_COMPAT;
2789 	if (nla_put_u32(skb, RTA_TABLE, table_id))
2790 		goto nla_put_failure;
2791 	r->rtm_type	= rt->rt_type;
2792 	r->rtm_scope	= RT_SCOPE_UNIVERSE;
2793 	r->rtm_protocol = RTPROT_UNSPEC;
2794 	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2795 	if (rt->rt_flags & RTCF_NOTIFY)
2796 		r->rtm_flags |= RTM_F_NOTIFY;
2797 	if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2798 		r->rtm_flags |= RTCF_DOREDIRECT;
2799 
2800 	if (nla_put_in_addr(skb, RTA_DST, dst))
2801 		goto nla_put_failure;
2802 	if (src) {
2803 		r->rtm_src_len = 32;
2804 		if (nla_put_in_addr(skb, RTA_SRC, src))
2805 			goto nla_put_failure;
2806 	}
2807 	if (rt->dst.dev &&
2808 	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2809 		goto nla_put_failure;
2810 #ifdef CONFIG_IP_ROUTE_CLASSID
2811 	if (rt->dst.tclassid &&
2812 	    nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2813 		goto nla_put_failure;
2814 #endif
2815 	if (fl4 && !rt_is_input_route(rt) &&
2816 	    fl4->saddr != src) {
2817 		if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2818 			goto nla_put_failure;
2819 	}
2820 	if (rt->rt_uses_gateway) {
2821 		if (rt->rt_gw_family == AF_INET &&
2822 		    nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) {
2823 			goto nla_put_failure;
2824 		} else if (rt->rt_gw_family == AF_INET6) {
2825 			int alen = sizeof(struct in6_addr);
2826 			struct nlattr *nla;
2827 			struct rtvia *via;
2828 
2829 			nla = nla_reserve(skb, RTA_VIA, alen + 2);
2830 			if (!nla)
2831 				goto nla_put_failure;
2832 
2833 			via = nla_data(nla);
2834 			via->rtvia_family = AF_INET6;
2835 			memcpy(via->rtvia_addr, &rt->rt_gw6, alen);
2836 		}
2837 	}
2838 
2839 	expires = rt->dst.expires;
2840 	if (expires) {
2841 		unsigned long now = jiffies;
2842 
2843 		if (time_before(now, expires))
2844 			expires -= now;
2845 		else
2846 			expires = 0;
2847 	}
2848 
2849 	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2850 	if (rt->rt_pmtu && expires)
2851 		metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2852 	if (rt->rt_mtu_locked && expires)
2853 		metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
2854 	if (rtnetlink_put_metrics(skb, metrics) < 0)
2855 		goto nla_put_failure;
2856 
2857 	if (fl4) {
2858 		if (fl4->flowi4_mark &&
2859 		    nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2860 			goto nla_put_failure;
2861 
2862 		if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2863 		    nla_put_u32(skb, RTA_UID,
2864 				from_kuid_munged(current_user_ns(),
2865 						 fl4->flowi4_uid)))
2866 			goto nla_put_failure;
2867 
2868 		if (rt_is_input_route(rt)) {
2869 #ifdef CONFIG_IP_MROUTE
2870 			if (ipv4_is_multicast(dst) &&
2871 			    !ipv4_is_local_multicast(dst) &&
2872 			    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2873 				int err = ipmr_get_route(net, skb,
2874 							 fl4->saddr, fl4->daddr,
2875 							 r, portid);
2876 
2877 				if (err <= 0) {
2878 					if (err == 0)
2879 						return 0;
2880 					goto nla_put_failure;
2881 				}
2882 			} else
2883 #endif
2884 				if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
2885 					goto nla_put_failure;
2886 		}
2887 	}
2888 
2889 	error = rt->dst.error;
2890 
2891 	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2892 		goto nla_put_failure;
2893 
2894 	nlmsg_end(skb, nlh);
2895 	return 0;
2896 
2897 nla_put_failure:
2898 	nlmsg_cancel(skb, nlh);
2899 	return -EMSGSIZE;
2900 }
2901 
2902 static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb,
2903 			    struct netlink_callback *cb, u32 table_id,
2904 			    struct fnhe_hash_bucket *bucket, int genid,
2905 			    int *fa_index, int fa_start, unsigned int flags)
2906 {
2907 	int i;
2908 
2909 	for (i = 0; i < FNHE_HASH_SIZE; i++) {
2910 		struct fib_nh_exception *fnhe;
2911 
2912 		for (fnhe = rcu_dereference(bucket[i].chain); fnhe;
2913 		     fnhe = rcu_dereference(fnhe->fnhe_next)) {
2914 			struct rtable *rt;
2915 			int err;
2916 
2917 			if (*fa_index < fa_start)
2918 				goto next;
2919 
2920 			if (fnhe->fnhe_genid != genid)
2921 				goto next;
2922 
2923 			if (fnhe->fnhe_expires &&
2924 			    time_after(jiffies, fnhe->fnhe_expires))
2925 				goto next;
2926 
2927 			rt = rcu_dereference(fnhe->fnhe_rth_input);
2928 			if (!rt)
2929 				rt = rcu_dereference(fnhe->fnhe_rth_output);
2930 			if (!rt)
2931 				goto next;
2932 
2933 			err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt,
2934 					   table_id, NULL, skb,
2935 					   NETLINK_CB(cb->skb).portid,
2936 					   cb->nlh->nlmsg_seq, flags);
2937 			if (err)
2938 				return err;
2939 next:
2940 			(*fa_index)++;
2941 		}
2942 	}
2943 
2944 	return 0;
2945 }
2946 
2947 int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
2948 		       u32 table_id, struct fib_info *fi,
2949 		       int *fa_index, int fa_start, unsigned int flags)
2950 {
2951 	struct net *net = sock_net(cb->skb->sk);
2952 	int nhsel, genid = fnhe_genid(net);
2953 
2954 	for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) {
2955 		struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel);
2956 		struct fnhe_hash_bucket *bucket;
2957 		int err;
2958 
2959 		if (nhc->nhc_flags & RTNH_F_DEAD)
2960 			continue;
2961 
2962 		rcu_read_lock();
2963 		bucket = rcu_dereference(nhc->nhc_exceptions);
2964 		err = 0;
2965 		if (bucket)
2966 			err = fnhe_dump_bucket(net, skb, cb, table_id, bucket,
2967 					       genid, fa_index, fa_start,
2968 					       flags);
2969 		rcu_read_unlock();
2970 		if (err)
2971 			return err;
2972 	}
2973 
2974 	return 0;
2975 }
2976 
2977 static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
2978 						   u8 ip_proto, __be16 sport,
2979 						   __be16 dport)
2980 {
2981 	struct sk_buff *skb;
2982 	struct iphdr *iph;
2983 
2984 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2985 	if (!skb)
2986 		return NULL;
2987 
2988 	/* Reserve room for dummy headers; this skb can pass
2989 	 * through a good chunk of the routing engine.
2990 	 */
2991 	skb_reset_mac_header(skb);
2992 	skb_reset_network_header(skb);
2993 	skb->protocol = htons(ETH_P_IP);
2994 	iph = skb_put(skb, sizeof(struct iphdr));
2995 	iph->protocol = ip_proto;
2996 	iph->saddr = src;
2997 	iph->daddr = dst;
2998 	iph->version = 0x4;
2999 	iph->frag_off = 0;
3000 	iph->ihl = 0x5;
3001 	skb_set_transport_header(skb, skb->len);
3002 
3003 	switch (iph->protocol) {
3004 	case IPPROTO_UDP: {
3005 		struct udphdr *udph;
3006 
3007 		udph = skb_put_zero(skb, sizeof(struct udphdr));
3008 		udph->source = sport;
3009 		udph->dest = dport;
3010 		udph->len = htons(sizeof(struct udphdr));
3011 		udph->check = 0;
3012 		break;
3013 	}
3014 	case IPPROTO_TCP: {
3015 		struct tcphdr *tcph;
3016 
3017 		tcph = skb_put_zero(skb, sizeof(struct tcphdr));
3018 		tcph->source	= sport;
3019 		tcph->dest	= dport;
3020 		tcph->doff	= sizeof(struct tcphdr) / 4;
3021 		tcph->rst = 1;
3022 		tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
3023 					    src, dst, 0);
3024 		break;
3025 	}
3026 	case IPPROTO_ICMP: {
3027 		struct icmphdr *icmph;
3028 
3029 		icmph = skb_put_zero(skb, sizeof(struct icmphdr));
3030 		icmph->type = ICMP_ECHO;
3031 		icmph->code = 0;
3032 	}
3033 	}
3034 
3035 	return skb;
3036 }
3037 
3038 static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
3039 				       const struct nlmsghdr *nlh,
3040 				       struct nlattr **tb,
3041 				       struct netlink_ext_ack *extack)
3042 {
3043 	struct rtmsg *rtm;
3044 	int i, err;
3045 
3046 	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
3047 		NL_SET_ERR_MSG(extack,
3048 			       "ipv4: Invalid header for route get request");
3049 		return -EINVAL;
3050 	}
3051 
3052 	if (!netlink_strict_get_check(skb))
3053 		return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
3054 					      rtm_ipv4_policy, extack);
3055 
3056 	rtm = nlmsg_data(nlh);
3057 	if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
3058 	    (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
3059 	    rtm->rtm_table || rtm->rtm_protocol ||
3060 	    rtm->rtm_scope || rtm->rtm_type) {
3061 		NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request");
3062 		return -EINVAL;
3063 	}
3064 
3065 	if (rtm->rtm_flags & ~(RTM_F_NOTIFY |
3066 			       RTM_F_LOOKUP_TABLE |
3067 			       RTM_F_FIB_MATCH)) {
3068 		NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request");
3069 		return -EINVAL;
3070 	}
3071 
3072 	err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
3073 					    rtm_ipv4_policy, extack);
3074 	if (err)
3075 		return err;
3076 
3077 	if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
3078 	    (tb[RTA_DST] && !rtm->rtm_dst_len)) {
3079 		NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4");
3080 		return -EINVAL;
3081 	}
3082 
3083 	for (i = 0; i <= RTA_MAX; i++) {
3084 		if (!tb[i])
3085 			continue;
3086 
3087 		switch (i) {
3088 		case RTA_IIF:
3089 		case RTA_OIF:
3090 		case RTA_SRC:
3091 		case RTA_DST:
3092 		case RTA_IP_PROTO:
3093 		case RTA_SPORT:
3094 		case RTA_DPORT:
3095 		case RTA_MARK:
3096 		case RTA_UID:
3097 			break;
3098 		default:
3099 			NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request");
3100 			return -EINVAL;
3101 		}
3102 	}
3103 
3104 	return 0;
3105 }
3106 
3107 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
3108 			     struct netlink_ext_ack *extack)
3109 {
3110 	struct net *net = sock_net(in_skb->sk);
3111 	struct nlattr *tb[RTA_MAX+1];
3112 	u32 table_id = RT_TABLE_MAIN;
3113 	__be16 sport = 0, dport = 0;
3114 	struct fib_result res = {};
3115 	u8 ip_proto = IPPROTO_UDP;
3116 	struct rtable *rt = NULL;
3117 	struct sk_buff *skb;
3118 	struct rtmsg *rtm;
3119 	struct flowi4 fl4 = {};
3120 	__be32 dst = 0;
3121 	__be32 src = 0;
3122 	kuid_t uid;
3123 	u32 iif;
3124 	int err;
3125 	int mark;
3126 
3127 	err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
3128 	if (err < 0)
3129 		return err;
3130 
3131 	rtm = nlmsg_data(nlh);
3132 	src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
3133 	dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
3134 	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3135 	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3136 	if (tb[RTA_UID])
3137 		uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
3138 	else
3139 		uid = (iif ? INVALID_UID : current_uid());
3140 
3141 	if (tb[RTA_IP_PROTO]) {
3142 		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
3143 						  &ip_proto, AF_INET, extack);
3144 		if (err)
3145 			return err;
3146 	}
3147 
3148 	if (tb[RTA_SPORT])
3149 		sport = nla_get_be16(tb[RTA_SPORT]);
3150 
3151 	if (tb[RTA_DPORT])
3152 		dport = nla_get_be16(tb[RTA_DPORT]);
3153 
3154 	skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
3155 	if (!skb)
3156 		return -ENOBUFS;
3157 
3158 	fl4.daddr = dst;
3159 	fl4.saddr = src;
3160 	fl4.flowi4_tos = rtm->rtm_tos & IPTOS_RT_MASK;
3161 	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
3162 	fl4.flowi4_mark = mark;
3163 	fl4.flowi4_uid = uid;
3164 	if (sport)
3165 		fl4.fl4_sport = sport;
3166 	if (dport)
3167 		fl4.fl4_dport = dport;
3168 	fl4.flowi4_proto = ip_proto;
3169 
3170 	rcu_read_lock();
3171 
3172 	if (iif) {
3173 		struct net_device *dev;
3174 
3175 		dev = dev_get_by_index_rcu(net, iif);
3176 		if (!dev) {
3177 			err = -ENODEV;
3178 			goto errout_rcu;
3179 		}
3180 
3181 		fl4.flowi4_iif = iif; /* for rt_fill_info */
3182 		skb->dev	= dev;
3183 		skb->mark	= mark;
3184 		err = ip_route_input_rcu(skb, dst, src,
3185 					 rtm->rtm_tos & IPTOS_RT_MASK, dev,
3186 					 &res);
3187 
3188 		rt = skb_rtable(skb);
3189 		if (err == 0 && rt->dst.error)
3190 			err = -rt->dst.error;
3191 	} else {
3192 		fl4.flowi4_iif = LOOPBACK_IFINDEX;
3193 		skb->dev = net->loopback_dev;
3194 		rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
3195 		err = 0;
3196 		if (IS_ERR(rt))
3197 			err = PTR_ERR(rt);
3198 		else
3199 			skb_dst_set(skb, &rt->dst);
3200 	}
3201 
3202 	if (err)
3203 		goto errout_rcu;
3204 
3205 	if (rtm->rtm_flags & RTM_F_NOTIFY)
3206 		rt->rt_flags |= RTCF_NOTIFY;
3207 
3208 	if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
3209 		table_id = res.table ? res.table->tb_id : 0;
3210 
3211 	/* reset skb for netlink reply msg */
3212 	skb_trim(skb, 0);
3213 	skb_reset_network_header(skb);
3214 	skb_reset_transport_header(skb);
3215 	skb_reset_mac_header(skb);
3216 
3217 	if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
3218 		if (!res.fi) {
3219 			err = fib_props[res.type].error;
3220 			if (!err)
3221 				err = -EHOSTUNREACH;
3222 			goto errout_rcu;
3223 		}
3224 		err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
3225 				    nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
3226 				    rt->rt_type, res.prefix, res.prefixlen,
3227 				    fl4.flowi4_tos, res.fi, 0);
3228 	} else {
3229 		err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
3230 				   NETLINK_CB(in_skb).portid,
3231 				   nlh->nlmsg_seq, 0);
3232 	}
3233 	if (err < 0)
3234 		goto errout_rcu;
3235 
3236 	rcu_read_unlock();
3237 
3238 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
3239 
3240 errout_free:
3241 	return err;
3242 errout_rcu:
3243 	rcu_read_unlock();
3244 	kfree_skb(skb);
3245 	goto errout_free;
3246 }
3247 
3248 void ip_rt_multicast_event(struct in_device *in_dev)
3249 {
3250 	rt_cache_flush(dev_net(in_dev->dev));
3251 }
3252 
3253 #ifdef CONFIG_SYSCTL
3254 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
3255 static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
3256 static int ip_rt_gc_elasticity __read_mostly	= 8;
3257 static int ip_min_valid_pmtu __read_mostly	= IPV4_MIN_MTU;
3258 
3259 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
3260 					void __user *buffer,
3261 					size_t *lenp, loff_t *ppos)
3262 {
3263 	struct net *net = (struct net *)__ctl->extra1;
3264 
3265 	if (write) {
3266 		rt_cache_flush(net);
3267 		fnhe_genid_bump(net);
3268 		return 0;
3269 	}
3270 
3271 	return -EINVAL;
3272 }
3273 
3274 static struct ctl_table ipv4_route_table[] = {
3275 	{
3276 		.procname	= "gc_thresh",
3277 		.data		= &ipv4_dst_ops.gc_thresh,
3278 		.maxlen		= sizeof(int),
3279 		.mode		= 0644,
3280 		.proc_handler	= proc_dointvec,
3281 	},
3282 	{
3283 		.procname	= "max_size",
3284 		.data		= &ip_rt_max_size,
3285 		.maxlen		= sizeof(int),
3286 		.mode		= 0644,
3287 		.proc_handler	= proc_dointvec,
3288 	},
3289 	{
3290 		/*  Deprecated. Use gc_min_interval_ms */
3291 
3292 		.procname	= "gc_min_interval",
3293 		.data		= &ip_rt_gc_min_interval,
3294 		.maxlen		= sizeof(int),
3295 		.mode		= 0644,
3296 		.proc_handler	= proc_dointvec_jiffies,
3297 	},
3298 	{
3299 		.procname	= "gc_min_interval_ms",
3300 		.data		= &ip_rt_gc_min_interval,
3301 		.maxlen		= sizeof(int),
3302 		.mode		= 0644,
3303 		.proc_handler	= proc_dointvec_ms_jiffies,
3304 	},
3305 	{
3306 		.procname	= "gc_timeout",
3307 		.data		= &ip_rt_gc_timeout,
3308 		.maxlen		= sizeof(int),
3309 		.mode		= 0644,
3310 		.proc_handler	= proc_dointvec_jiffies,
3311 	},
3312 	{
3313 		.procname	= "gc_interval",
3314 		.data		= &ip_rt_gc_interval,
3315 		.maxlen		= sizeof(int),
3316 		.mode		= 0644,
3317 		.proc_handler	= proc_dointvec_jiffies,
3318 	},
3319 	{
3320 		.procname	= "redirect_load",
3321 		.data		= &ip_rt_redirect_load,
3322 		.maxlen		= sizeof(int),
3323 		.mode		= 0644,
3324 		.proc_handler	= proc_dointvec,
3325 	},
3326 	{
3327 		.procname	= "redirect_number",
3328 		.data		= &ip_rt_redirect_number,
3329 		.maxlen		= sizeof(int),
3330 		.mode		= 0644,
3331 		.proc_handler	= proc_dointvec,
3332 	},
3333 	{
3334 		.procname	= "redirect_silence",
3335 		.data		= &ip_rt_redirect_silence,
3336 		.maxlen		= sizeof(int),
3337 		.mode		= 0644,
3338 		.proc_handler	= proc_dointvec,
3339 	},
3340 	{
3341 		.procname	= "error_cost",
3342 		.data		= &ip_rt_error_cost,
3343 		.maxlen		= sizeof(int),
3344 		.mode		= 0644,
3345 		.proc_handler	= proc_dointvec,
3346 	},
3347 	{
3348 		.procname	= "error_burst",
3349 		.data		= &ip_rt_error_burst,
3350 		.maxlen		= sizeof(int),
3351 		.mode		= 0644,
3352 		.proc_handler	= proc_dointvec,
3353 	},
3354 	{
3355 		.procname	= "gc_elasticity",
3356 		.data		= &ip_rt_gc_elasticity,
3357 		.maxlen		= sizeof(int),
3358 		.mode		= 0644,
3359 		.proc_handler	= proc_dointvec,
3360 	},
3361 	{
3362 		.procname	= "mtu_expires",
3363 		.data		= &ip_rt_mtu_expires,
3364 		.maxlen		= sizeof(int),
3365 		.mode		= 0644,
3366 		.proc_handler	= proc_dointvec_jiffies,
3367 	},
3368 	{
3369 		.procname	= "min_pmtu",
3370 		.data		= &ip_rt_min_pmtu,
3371 		.maxlen		= sizeof(int),
3372 		.mode		= 0644,
3373 		.proc_handler	= proc_dointvec_minmax,
3374 		.extra1		= &ip_min_valid_pmtu,
3375 	},
3376 	{
3377 		.procname	= "min_adv_mss",
3378 		.data		= &ip_rt_min_advmss,
3379 		.maxlen		= sizeof(int),
3380 		.mode		= 0644,
3381 		.proc_handler	= proc_dointvec,
3382 	},
3383 	{ }
3384 };
3385 
3386 static const char ipv4_route_flush_procname[] = "flush";
3387 
3388 static struct ctl_table ipv4_route_flush_table[] = {
3389 	{
3390 		.procname	= ipv4_route_flush_procname,
3391 		.maxlen		= sizeof(int),
3392 		.mode		= 0200,
3393 		.proc_handler	= ipv4_sysctl_rtcache_flush,
3394 	},
3395 	{ },
3396 };
3397 
3398 static __net_init int sysctl_route_net_init(struct net *net)
3399 {
3400 	struct ctl_table *tbl;
3401 
3402 	tbl = ipv4_route_flush_table;
3403 	if (!net_eq(net, &init_net)) {
3404 		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3405 		if (!tbl)
3406 			goto err_dup;
3407 
3408 		/* Don't export non-whitelisted sysctls to unprivileged users */
3409 		if (net->user_ns != &init_user_ns) {
3410 			if (tbl[0].procname != ipv4_route_flush_procname)
3411 				tbl[0].procname = NULL;
3412 		}
3413 	}
3414 	tbl[0].extra1 = net;
3415 
3416 	net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
3417 	if (!net->ipv4.route_hdr)
3418 		goto err_reg;
3419 	return 0;
3420 
3421 err_reg:
3422 	if (tbl != ipv4_route_flush_table)
3423 		kfree(tbl);
3424 err_dup:
3425 	return -ENOMEM;
3426 }
3427 
3428 static __net_exit void sysctl_route_net_exit(struct net *net)
3429 {
3430 	struct ctl_table *tbl;
3431 
3432 	tbl = net->ipv4.route_hdr->ctl_table_arg;
3433 	unregister_net_sysctl_table(net->ipv4.route_hdr);
3434 	BUG_ON(tbl == ipv4_route_flush_table);
3435 	kfree(tbl);
3436 }
3437 
3438 static __net_initdata struct pernet_operations sysctl_route_ops = {
3439 	.init = sysctl_route_net_init,
3440 	.exit = sysctl_route_net_exit,
3441 };
3442 #endif
3443 
3444 static __net_init int rt_genid_init(struct net *net)
3445 {
3446 	atomic_set(&net->ipv4.rt_genid, 0);
3447 	atomic_set(&net->fnhe_genid, 0);
3448 	atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
3449 	return 0;
3450 }
3451 
3452 static __net_initdata struct pernet_operations rt_genid_ops = {
3453 	.init = rt_genid_init,
3454 };
3455 
3456 static int __net_init ipv4_inetpeer_init(struct net *net)
3457 {
3458 	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3459 
3460 	if (!bp)
3461 		return -ENOMEM;
3462 	inet_peer_base_init(bp);
3463 	net->ipv4.peers = bp;
3464 	return 0;
3465 }
3466 
3467 static void __net_exit ipv4_inetpeer_exit(struct net *net)
3468 {
3469 	struct inet_peer_base *bp = net->ipv4.peers;
3470 
3471 	net->ipv4.peers = NULL;
3472 	inetpeer_invalidate_tree(bp);
3473 	kfree(bp);
3474 }
3475 
3476 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3477 	.init	=	ipv4_inetpeer_init,
3478 	.exit	=	ipv4_inetpeer_exit,
3479 };
3480 
3481 #ifdef CONFIG_IP_ROUTE_CLASSID
3482 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3483 #endif /* CONFIG_IP_ROUTE_CLASSID */
3484 
3485 int __init ip_rt_init(void)
3486 {
3487 	void *idents_hash;
3488 	int cpu;
3489 
3490 	/* For modern hosts, this will use 2 MB of memory */
3491 	idents_hash = alloc_large_system_hash("IP idents",
3492 					      sizeof(*ip_idents) + sizeof(*ip_tstamps),
3493 					      0,
3494 					      16, /* one bucket per 64 KB */
3495 					      HASH_ZERO,
3496 					      NULL,
3497 					      &ip_idents_mask,
3498 					      2048,
3499 					      256*1024);
3500 
3501 	ip_idents = idents_hash;
3502 
3503 	prandom_bytes(ip_idents, (ip_idents_mask + 1) * sizeof(*ip_idents));
3504 
3505 	ip_tstamps = idents_hash + (ip_idents_mask + 1) * sizeof(*ip_idents);
3506 
3507 	for_each_possible_cpu(cpu) {
3508 		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3509 
3510 		INIT_LIST_HEAD(&ul->head);
3511 		spin_lock_init(&ul->lock);
3512 	}
3513 #ifdef CONFIG_IP_ROUTE_CLASSID
3514 	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3515 	if (!ip_rt_acct)
3516 		panic("IP: failed to allocate ip_rt_acct\n");
3517 #endif
3518 
3519 	ipv4_dst_ops.kmem_cachep =
3520 		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3521 				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3522 
3523 	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3524 
3525 	if (dst_entries_init(&ipv4_dst_ops) < 0)
3526 		panic("IP: failed to allocate ipv4_dst_ops counter\n");
3527 
3528 	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3529 		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3530 
3531 	ipv4_dst_ops.gc_thresh = ~0;
3532 	ip_rt_max_size = INT_MAX;
3533 
3534 	devinet_init();
3535 	ip_fib_init();
3536 
3537 	if (ip_rt_proc_init())
3538 		pr_err("Unable to create route proc files\n");
3539 #ifdef CONFIG_XFRM
3540 	xfrm_init();
3541 	xfrm4_init();
3542 #endif
3543 	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
3544 		      RTNL_FLAG_DOIT_UNLOCKED);
3545 
3546 #ifdef CONFIG_SYSCTL
3547 	register_pernet_subsys(&sysctl_route_ops);
3548 #endif
3549 	register_pernet_subsys(&rt_genid_ops);
3550 	register_pernet_subsys(&ipv4_inetpeer_ops);
3551 	return 0;
3552 }
3553 
3554 #ifdef CONFIG_SYSCTL
3555 /*
3556  * We really need to sanitize the damn ipv4 init order, then all
3557  * this nonsense will go away.
3558  */
3559 void __init ip_static_sysctl_init(void)
3560 {
3561 	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3562 }
3563 #endif
3564