1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		ROUTE - implementation of the IP router.
8  *
9  * Authors:	Ross Biro
10  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
11  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
12  *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
13  *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
14  *
15  * Fixes:
16  *		Alan Cox	:	Verify area fixes.
17  *		Alan Cox	:	cli() protects routing changes
18  *		Rui Oliveira	:	ICMP routing table updates
19  *		(rco@di.uminho.pt)	Routing table insertion and update
20  *		Linus Torvalds	:	Rewrote bits to be sensible
21  *		Alan Cox	:	Added BSD route gw semantics
22  *		Alan Cox	:	Super /proc >4K
23  *		Alan Cox	:	MTU in route table
24  *		Alan Cox	: 	MSS actually. Also added the window
25  *					clamper.
26  *		Sam Lantinga	:	Fixed route matching in rt_del()
27  *		Alan Cox	:	Routing cache support.
28  *		Alan Cox	:	Removed compatibility cruft.
29  *		Alan Cox	:	RTF_REJECT support.
30  *		Alan Cox	:	TCP irtt support.
31  *		Jonathan Naylor	:	Added Metric support.
32  *	Miquel van Smoorenburg	:	BSD API fixes.
33  *	Miquel van Smoorenburg	:	Metrics.
34  *		Alan Cox	:	Use __u32 properly
35  *		Alan Cox	:	Aligned routing errors more closely with BSD
36  *					our system is still very different.
37  *		Alan Cox	:	Faster /proc handling
38  *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
39  *					routing caches and better behaviour.
40  *
41  *		Olaf Erb	:	irtt wasn't being copied right.
42  *		Bjorn Ekwall	:	Kerneld route support.
43  *		Alan Cox	:	Multicast fixed (I hope)
44  * 		Pavel Krauz	:	Limited broadcast fixed
45  *		Mike McLagan	:	Routing by source
46  *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
47  *					route.c and rewritten from scratch.
48  *		Andi Kleen	:	Load-limit warning messages.
49  *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
50  *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
51  *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
52  *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
53  *		Marc Boucher	:	routing by fwmark
54  *	Robert Olsson		:	Added rt_cache statistics
55  *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
56  *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
57  * 	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
58  * 	Ilia Sotnikov		:	Removed TOS from hash calculations
59  */
60 
61 #define pr_fmt(fmt) "IPv4: " fmt
62 
63 #include <linux/module.h>
64 #include <linux/uaccess.h>
65 #include <linux/bitops.h>
66 #include <linux/types.h>
67 #include <linux/kernel.h>
68 #include <linux/mm.h>
69 #include <linux/memblock.h>
70 #include <linux/string.h>
71 #include <linux/socket.h>
72 #include <linux/sockios.h>
73 #include <linux/errno.h>
74 #include <linux/in.h>
75 #include <linux/inet.h>
76 #include <linux/netdevice.h>
77 #include <linux/proc_fs.h>
78 #include <linux/init.h>
79 #include <linux/skbuff.h>
80 #include <linux/inetdevice.h>
81 #include <linux/igmp.h>
82 #include <linux/pkt_sched.h>
83 #include <linux/mroute.h>
84 #include <linux/netfilter_ipv4.h>
85 #include <linux/random.h>
86 #include <linux/rcupdate.h>
87 #include <linux/times.h>
88 #include <linux/slab.h>
89 #include <linux/jhash.h>
90 #include <net/dst.h>
91 #include <net/dst_metadata.h>
92 #include <net/net_namespace.h>
93 #include <net/protocol.h>
94 #include <net/ip.h>
95 #include <net/route.h>
96 #include <net/inetpeer.h>
97 #include <net/sock.h>
98 #include <net/ip_fib.h>
99 #include <net/nexthop.h>
100 #include <net/arp.h>
101 #include <net/tcp.h>
102 #include <net/icmp.h>
103 #include <net/xfrm.h>
104 #include <net/lwtunnel.h>
105 #include <net/netevent.h>
106 #include <net/rtnetlink.h>
107 #ifdef CONFIG_SYSCTL
108 #include <linux/sysctl.h>
109 #endif
110 #include <net/secure_seq.h>
111 #include <net/ip_tunnels.h>
112 #include <net/l3mdev.h>
113 
114 #include "fib_lookup.h"
115 
116 #define RT_FL_TOS(oldflp4) \
117 	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
118 
119 #define RT_GC_TIMEOUT (300*HZ)
120 
121 static int ip_rt_max_size;
122 static int ip_rt_redirect_number __read_mostly	= 9;
123 static int ip_rt_redirect_load __read_mostly	= HZ / 50;
124 static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
125 static int ip_rt_error_cost __read_mostly	= HZ;
126 static int ip_rt_error_burst __read_mostly	= 5 * HZ;
127 static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
128 static u32 ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
129 static int ip_rt_min_advmss __read_mostly	= 256;
130 
131 static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
132 
133 /*
134  *	Interface to generic destination cache.
135  */
136 
137 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
138 static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
139 static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
140 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
141 static void		 ipv4_link_failure(struct sk_buff *skb);
142 static void		 ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
143 					   struct sk_buff *skb, u32 mtu,
144 					   bool confirm_neigh);
145 static void		 ip_do_redirect(struct dst_entry *dst, struct sock *sk,
146 					struct sk_buff *skb);
147 static void		ipv4_dst_destroy(struct dst_entry *dst);
148 
149 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
150 {
151 	WARN_ON(1);
152 	return NULL;
153 }
154 
155 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
156 					   struct sk_buff *skb,
157 					   const void *daddr);
158 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
159 
160 static struct dst_ops ipv4_dst_ops = {
161 	.family =		AF_INET,
162 	.check =		ipv4_dst_check,
163 	.default_advmss =	ipv4_default_advmss,
164 	.mtu =			ipv4_mtu,
165 	.cow_metrics =		ipv4_cow_metrics,
166 	.destroy =		ipv4_dst_destroy,
167 	.negative_advice =	ipv4_negative_advice,
168 	.link_failure =		ipv4_link_failure,
169 	.update_pmtu =		ip_rt_update_pmtu,
170 	.redirect =		ip_do_redirect,
171 	.local_out =		__ip_local_out,
172 	.neigh_lookup =		ipv4_neigh_lookup,
173 	.confirm_neigh =	ipv4_confirm_neigh,
174 };
175 
176 #define ECN_OR_COST(class)	TC_PRIO_##class
177 
178 const __u8 ip_tos2prio[16] = {
179 	TC_PRIO_BESTEFFORT,
180 	ECN_OR_COST(BESTEFFORT),
181 	TC_PRIO_BESTEFFORT,
182 	ECN_OR_COST(BESTEFFORT),
183 	TC_PRIO_BULK,
184 	ECN_OR_COST(BULK),
185 	TC_PRIO_BULK,
186 	ECN_OR_COST(BULK),
187 	TC_PRIO_INTERACTIVE,
188 	ECN_OR_COST(INTERACTIVE),
189 	TC_PRIO_INTERACTIVE,
190 	ECN_OR_COST(INTERACTIVE),
191 	TC_PRIO_INTERACTIVE_BULK,
192 	ECN_OR_COST(INTERACTIVE_BULK),
193 	TC_PRIO_INTERACTIVE_BULK,
194 	ECN_OR_COST(INTERACTIVE_BULK)
195 };
196 EXPORT_SYMBOL(ip_tos2prio);
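
/* Illustrative sketch (not part of the upstream file): the table above is
 * indexed by the four TOS bits of the IPv4 TOS byte shifted past the low
 * bit, so a lookup is essentially (tos_to_prio() is a hypothetical name):
 *
 *	static inline __u8 tos_to_prio(__u8 tos)
 *	{
 *		return ip_tos2prio[IPTOS_TOS(tos) >> 1];
 *	}
 *
 * which yields an index in 0..15.  Odd indexes correspond to the old
 * "minimise cost" TOS bit that ECN later reused, hence the ECN_OR_COST()
 * entries at every other slot.
 */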
197 
198 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
199 #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
200 
201 #ifdef CONFIG_PROC_FS
202 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
203 {
204 	if (*pos)
205 		return NULL;
206 	return SEQ_START_TOKEN;
207 }
208 
209 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
210 {
211 	++*pos;
212 	return NULL;
213 }
214 
215 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
216 {
217 }
218 
219 static int rt_cache_seq_show(struct seq_file *seq, void *v)
220 {
221 	if (v == SEQ_START_TOKEN)
222 		seq_printf(seq, "%-127s\n",
223 			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
224 			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
225 			   "HHUptod\tSpecDst");
226 	return 0;
227 }
228 
229 static const struct seq_operations rt_cache_seq_ops = {
230 	.start  = rt_cache_seq_start,
231 	.next   = rt_cache_seq_next,
232 	.stop   = rt_cache_seq_stop,
233 	.show   = rt_cache_seq_show,
234 };
235 
236 static int rt_cache_seq_open(struct inode *inode, struct file *file)
237 {
238 	return seq_open(file, &rt_cache_seq_ops);
239 }
240 
241 static const struct proc_ops rt_cache_proc_ops = {
242 	.proc_open	= rt_cache_seq_open,
243 	.proc_read	= seq_read,
244 	.proc_lseek	= seq_lseek,
245 	.proc_release	= seq_release,
246 };
247 
248 
249 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
250 {
251 	int cpu;
252 
253 	if (*pos == 0)
254 		return SEQ_START_TOKEN;
255 
256 	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
257 		if (!cpu_possible(cpu))
258 			continue;
259 		*pos = cpu+1;
260 		return &per_cpu(rt_cache_stat, cpu);
261 	}
262 	return NULL;
263 }
264 
265 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
266 {
267 	int cpu;
268 
269 	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
270 		if (!cpu_possible(cpu))
271 			continue;
272 		*pos = cpu+1;
273 		return &per_cpu(rt_cache_stat, cpu);
274 	}
275 	(*pos)++;
276 	return NULL;
277 
278 }
279 
280 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
281 {
282 
283 }
284 
285 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
286 {
287 	struct rt_cache_stat *st = v;
288 
289 	if (v == SEQ_START_TOKEN) {
290 		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
291 		return 0;
292 	}
293 
294 	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
295 		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
296 		   dst_entries_get_slow(&ipv4_dst_ops),
297 		   0, /* st->in_hit */
298 		   st->in_slow_tot,
299 		   st->in_slow_mc,
300 		   st->in_no_route,
301 		   st->in_brd,
302 		   st->in_martian_dst,
303 		   st->in_martian_src,
304 
305 		   0, /* st->out_hit */
306 		   st->out_slow_tot,
307 		   st->out_slow_mc,
308 
309 		   0, /* st->gc_total */
310 		   0, /* st->gc_ignored */
311 		   0, /* st->gc_goal_miss */
312 		   0, /* st->gc_dst_overflow */
313 		   0, /* st->in_hlist_search */
314 		   0  /* st->out_hlist_search */
315 		);
316 	return 0;
317 }
318 
319 static const struct seq_operations rt_cpu_seq_ops = {
320 	.start  = rt_cpu_seq_start,
321 	.next   = rt_cpu_seq_next,
322 	.stop   = rt_cpu_seq_stop,
323 	.show   = rt_cpu_seq_show,
324 };
325 
326 
327 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
328 {
329 	return seq_open(file, &rt_cpu_seq_ops);
330 }
331 
332 static const struct proc_ops rt_cpu_proc_ops = {
333 	.proc_open	= rt_cpu_seq_open,
334 	.proc_read	= seq_read,
335 	.proc_lseek	= seq_lseek,
336 	.proc_release	= seq_release,
337 };
338 
339 #ifdef CONFIG_IP_ROUTE_CLASSID
340 static int rt_acct_proc_show(struct seq_file *m, void *v)
341 {
342 	struct ip_rt_acct *dst, *src;
343 	unsigned int i, j;
344 
345 	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
346 	if (!dst)
347 		return -ENOMEM;
348 
349 	for_each_possible_cpu(i) {
350 		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
351 		for (j = 0; j < 256; j++) {
352 			dst[j].o_bytes   += src[j].o_bytes;
353 			dst[j].o_packets += src[j].o_packets;
354 			dst[j].i_bytes   += src[j].i_bytes;
355 			dst[j].i_packets += src[j].i_packets;
356 		}
357 	}
358 
359 	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
360 	kfree(dst);
361 	return 0;
362 }
363 #endif
364 
365 static int __net_init ip_rt_do_proc_init(struct net *net)
366 {
367 	struct proc_dir_entry *pde;
368 
369 	pde = proc_create("rt_cache", 0444, net->proc_net,
370 			  &rt_cache_proc_ops);
371 	if (!pde)
372 		goto err1;
373 
374 	pde = proc_create("rt_cache", 0444,
375 			  net->proc_net_stat, &rt_cpu_proc_ops);
376 	if (!pde)
377 		goto err2;
378 
379 #ifdef CONFIG_IP_ROUTE_CLASSID
380 	pde = proc_create_single("rt_acct", 0, net->proc_net,
381 			rt_acct_proc_show);
382 	if (!pde)
383 		goto err3;
384 #endif
385 	return 0;
386 
387 #ifdef CONFIG_IP_ROUTE_CLASSID
388 err3:
389 	remove_proc_entry("rt_cache", net->proc_net_stat);
390 #endif
391 err2:
392 	remove_proc_entry("rt_cache", net->proc_net);
393 err1:
394 	return -ENOMEM;
395 }
396 
397 static void __net_exit ip_rt_do_proc_exit(struct net *net)
398 {
399 	remove_proc_entry("rt_cache", net->proc_net_stat);
400 	remove_proc_entry("rt_cache", net->proc_net);
401 #ifdef CONFIG_IP_ROUTE_CLASSID
402 	remove_proc_entry("rt_acct", net->proc_net);
403 #endif
404 }
405 
406 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
407 	.init = ip_rt_do_proc_init,
408 	.exit = ip_rt_do_proc_exit,
409 };
410 
411 static int __init ip_rt_proc_init(void)
412 {
413 	return register_pernet_subsys(&ip_rt_proc_ops);
414 }
415 
416 #else
417 static inline int ip_rt_proc_init(void)
418 {
419 	return 0;
420 }
421 #endif /* CONFIG_PROC_FS */
422 
423 static inline bool rt_is_expired(const struct rtable *rth)
424 {
425 	return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
426 }
427 
428 void rt_cache_flush(struct net *net)
429 {
430 	rt_genid_bump_ipv4(net);
431 }
432 
433 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
434 					   struct sk_buff *skb,
435 					   const void *daddr)
436 {
437 	const struct rtable *rt = container_of(dst, struct rtable, dst);
438 	struct net_device *dev = dst->dev;
439 	struct neighbour *n;
440 
441 	rcu_read_lock_bh();
442 
443 	if (likely(rt->rt_gw_family == AF_INET)) {
444 		n = ip_neigh_gw4(dev, rt->rt_gw4);
445 	} else if (rt->rt_gw_family == AF_INET6) {
446 		n = ip_neigh_gw6(dev, &rt->rt_gw6);
447         } else {
448 		__be32 pkey;
449 
450 		pkey = skb ? ip_hdr(skb)->daddr : *((__be32 *) daddr);
451 		n = ip_neigh_gw4(dev, pkey);
452 	}
453 
454 	if (!IS_ERR(n) && !refcount_inc_not_zero(&n->refcnt))
455 		n = NULL;
456 
457 	rcu_read_unlock_bh();
458 
459 	return n;
460 }
461 
462 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
463 {
464 	const struct rtable *rt = container_of(dst, struct rtable, dst);
465 	struct net_device *dev = dst->dev;
466 	const __be32 *pkey = daddr;
467 
468 	if (rt->rt_gw_family == AF_INET) {
469 		pkey = (const __be32 *)&rt->rt_gw4;
470 	} else if (rt->rt_gw_family == AF_INET6) {
471 		return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6);
472 	} else if (!daddr ||
473 		 (rt->rt_flags &
474 		  (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) {
475 		return;
476 	}
477 	__ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
478 }
479 
480 /* Hash tables of size 2048..262144 depending on RAM size.
481  * Each bucket uses 8 bytes.
482  */
483 static u32 ip_idents_mask __read_mostly;
484 static atomic_t *ip_idents __read_mostly;
485 static u32 *ip_tstamps __read_mostly;
486 
487 /* In order to protect privacy, we add a perturbation to identifiers
488  * if one generator is seldom used. This makes it hard for an attacker
489  * to infer how many packets were sent between two points in time.
490  */
491 u32 ip_idents_reserve(u32 hash, int segs)
492 {
493 	u32 bucket, old, now = (u32)jiffies;
494 	atomic_t *p_id;
495 	u32 *p_tstamp;
496 	u32 delta = 0;
497 
498 	bucket = hash & ip_idents_mask;
499 	p_tstamp = ip_tstamps + bucket;
500 	p_id = ip_idents + bucket;
501 	old = READ_ONCE(*p_tstamp);
502 
503 	if (old != now && cmpxchg(p_tstamp, old, now) == old)
504 		delta = prandom_u32_max(now - old);
505 
506 	/* If UBSAN reports an error there, please make sure your compiler
507 	 * supports -fno-strict-overflow before reporting it; that was a bug
508 	 * in UBSAN, and it has been fixed in GCC-8.
509 	 */
510 	return atomic_add_return(segs + delta, p_id) - segs;
511 }
512 EXPORT_SYMBOL(ip_idents_reserve);
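
/* Worked example of the perturbation above (illustrative sketch, not part
 * of the upstream file): if a bucket was last touched at jiffy 1000 and is
 * used again at jiffy 1600, the cmpxchg() updates the timestamp and
 *
 *	delta = prandom_u32_max(1600 - 1000);	(uniform in [0, 600))
 *	id = atomic_add_return(segs + delta, p_id) - segs;
 *
 * so the distance between two observed IDs no longer reveals how many
 * datagrams were sent from this bucket in between.
 */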
513 
514 void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
515 {
516 	u32 hash, id;
517 
518 	/* Note the following code is not safe, but this is okay. */
519 	if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
520 		get_random_bytes(&net->ipv4.ip_id_key,
521 				 sizeof(net->ipv4.ip_id_key));
522 
523 	hash = siphash_3u32((__force u32)iph->daddr,
524 			    (__force u32)iph->saddr,
525 			    iph->protocol,
526 			    &net->ipv4.ip_id_key);
527 	id = ip_idents_reserve(hash, segs);
528 	iph->id = htons(id);
529 }
530 EXPORT_SYMBOL(__ip_select_ident);
531 
532 static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
533 			     const struct sock *sk,
534 			     const struct iphdr *iph,
535 			     int oif, u8 tos,
536 			     u8 prot, u32 mark, int flow_flags)
537 {
538 	if (sk) {
539 		const struct inet_sock *inet = inet_sk(sk);
540 
541 		oif = sk->sk_bound_dev_if;
542 		mark = sk->sk_mark;
543 		tos = RT_CONN_FLAGS(sk);
544 		prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
545 	}
546 	flowi4_init_output(fl4, oif, mark, tos,
547 			   RT_SCOPE_UNIVERSE, prot,
548 			   flow_flags,
549 			   iph->daddr, iph->saddr, 0, 0,
550 			   sock_net_uid(net, sk));
551 }
552 
553 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
554 			       const struct sock *sk)
555 {
556 	const struct net *net = dev_net(skb->dev);
557 	const struct iphdr *iph = ip_hdr(skb);
558 	int oif = skb->dev->ifindex;
559 	u8 tos = RT_TOS(iph->tos);
560 	u8 prot = iph->protocol;
561 	u32 mark = skb->mark;
562 
563 	__build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
564 }
565 
566 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
567 {
568 	const struct inet_sock *inet = inet_sk(sk);
569 	const struct ip_options_rcu *inet_opt;
570 	__be32 daddr = inet->inet_daddr;
571 
572 	rcu_read_lock();
573 	inet_opt = rcu_dereference(inet->inet_opt);
574 	if (inet_opt && inet_opt->opt.srr)
575 		daddr = inet_opt->opt.faddr;
576 	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
577 			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
578 			   inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
579 			   inet_sk_flowi_flags(sk),
580 			   daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
581 	rcu_read_unlock();
582 }
583 
584 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
585 				 const struct sk_buff *skb)
586 {
587 	if (skb)
588 		build_skb_flow_key(fl4, skb, sk);
589 	else
590 		build_sk_flow_key(fl4, sk);
591 }
592 
593 static DEFINE_SPINLOCK(fnhe_lock);
594 
595 static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
596 {
597 	struct rtable *rt;
598 
599 	rt = rcu_dereference(fnhe->fnhe_rth_input);
600 	if (rt) {
601 		RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
602 		dst_dev_put(&rt->dst);
603 		dst_release(&rt->dst);
604 	}
605 	rt = rcu_dereference(fnhe->fnhe_rth_output);
606 	if (rt) {
607 		RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
608 		dst_dev_put(&rt->dst);
609 		dst_release(&rt->dst);
610 	}
611 }
612 
613 static void fnhe_remove_oldest(struct fnhe_hash_bucket *hash)
614 {
615 	struct fib_nh_exception __rcu **fnhe_p, **oldest_p;
616 	struct fib_nh_exception *fnhe, *oldest = NULL;
617 
618 	for (fnhe_p = &hash->chain; ; fnhe_p = &fnhe->fnhe_next) {
619 		fnhe = rcu_dereference_protected(*fnhe_p,
620 						 lockdep_is_held(&fnhe_lock));
621 		if (!fnhe)
622 			break;
623 		if (!oldest ||
624 		    time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp)) {
625 			oldest = fnhe;
626 			oldest_p = fnhe_p;
627 		}
628 	}
629 	fnhe_flush_routes(oldest);
630 	*oldest_p = oldest->fnhe_next;
631 	kfree_rcu(oldest, rcu);
632 }
633 
634 static u32 fnhe_hashfun(__be32 daddr)
635 {
636 	static siphash_key_t fnhe_hash_key __read_mostly;
637 	u64 hval;
638 
639 	net_get_random_once(&fnhe_hash_key, sizeof(fnhe_hash_key));
640 	hval = siphash_1u32((__force u32)daddr, &fnhe_hash_key);
641 	return hash_64(hval, FNHE_HASH_SHIFT);
642 }
643 
644 static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
645 {
646 	rt->rt_pmtu = fnhe->fnhe_pmtu;
647 	rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
648 	rt->dst.expires = fnhe->fnhe_expires;
649 
650 	if (fnhe->fnhe_gw) {
651 		rt->rt_flags |= RTCF_REDIRECTED;
652 		rt->rt_uses_gateway = 1;
653 		rt->rt_gw_family = AF_INET;
654 		rt->rt_gw4 = fnhe->fnhe_gw;
655 	}
656 }
657 
658 static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr,
659 				  __be32 gw, u32 pmtu, bool lock,
660 				  unsigned long expires)
661 {
662 	struct fnhe_hash_bucket *hash;
663 	struct fib_nh_exception *fnhe;
664 	struct rtable *rt;
665 	u32 genid, hval;
666 	unsigned int i;
667 	int depth;
668 
669 	genid = fnhe_genid(dev_net(nhc->nhc_dev));
670 	hval = fnhe_hashfun(daddr);
671 
672 	spin_lock_bh(&fnhe_lock);
673 
674 	hash = rcu_dereference(nhc->nhc_exceptions);
675 	if (!hash) {
676 		hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
677 		if (!hash)
678 			goto out_unlock;
679 		rcu_assign_pointer(nhc->nhc_exceptions, hash);
680 	}
681 
682 	hash += hval;
683 
684 	depth = 0;
685 	for (fnhe = rcu_dereference(hash->chain); fnhe;
686 	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
687 		if (fnhe->fnhe_daddr == daddr)
688 			break;
689 		depth++;
690 	}
691 
692 	if (fnhe) {
693 		if (fnhe->fnhe_genid != genid)
694 			fnhe->fnhe_genid = genid;
695 		if (gw)
696 			fnhe->fnhe_gw = gw;
697 		if (pmtu) {
698 			fnhe->fnhe_pmtu = pmtu;
699 			fnhe->fnhe_mtu_locked = lock;
700 		}
701 		fnhe->fnhe_expires = max(1UL, expires);
702 		/* Update all cached dsts too */
703 		rt = rcu_dereference(fnhe->fnhe_rth_input);
704 		if (rt)
705 			fill_route_from_fnhe(rt, fnhe);
706 		rt = rcu_dereference(fnhe->fnhe_rth_output);
707 		if (rt)
708 			fill_route_from_fnhe(rt, fnhe);
709 	} else {
710 		/* Randomize max depth to avoid some side channel attacks. */
711 		int max_depth = FNHE_RECLAIM_DEPTH +
712 				prandom_u32_max(FNHE_RECLAIM_DEPTH);
713 
714 		while (depth > max_depth) {
715 			fnhe_remove_oldest(hash);
716 			depth--;
717 		}
718 
719 		fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
720 		if (!fnhe)
721 			goto out_unlock;
722 
723 		fnhe->fnhe_next = hash->chain;
724 
725 		fnhe->fnhe_genid = genid;
726 		fnhe->fnhe_daddr = daddr;
727 		fnhe->fnhe_gw = gw;
728 		fnhe->fnhe_pmtu = pmtu;
729 		fnhe->fnhe_mtu_locked = lock;
730 		fnhe->fnhe_expires = max(1UL, expires);
731 
732 		rcu_assign_pointer(hash->chain, fnhe);
733 
734 		/* Exception created; mark the cached routes for the nexthop
735 		 * stale, so anyone caching it rechecks if this exception
736 		 * applies to them.
737 		 */
738 		rt = rcu_dereference(nhc->nhc_rth_input);
739 		if (rt)
740 			rt->dst.obsolete = DST_OBSOLETE_KILL;
741 
742 		for_each_possible_cpu(i) {
743 			struct rtable __rcu **prt;
744 			prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i);
745 			rt = rcu_dereference(*prt);
746 			if (rt)
747 				rt->dst.obsolete = DST_OBSOLETE_KILL;
748 		}
749 	}
750 
751 	fnhe->fnhe_stamp = jiffies;
752 
753 out_unlock:
754 	spin_unlock_bh(&fnhe_lock);
755 }
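
/* Note on the reclaim above (illustrative, not part of the upstream file):
 * max_depth is FNHE_RECLAIM_DEPTH plus a random value below
 * FNHE_RECLAIM_DEPTH, so a bucket chain may grow to somewhere between
 * FNHE_RECLAIM_DEPTH and 2 * FNHE_RECLAIM_DEPTH - 1 entries before
 * fnhe_remove_oldest() evicts the stalest exception; the randomness keeps
 * the exact eviction threshold from being observable as a side channel.
 */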
756 
757 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
758 			     bool kill_route)
759 {
760 	__be32 new_gw = icmp_hdr(skb)->un.gateway;
761 	__be32 old_gw = ip_hdr(skb)->saddr;
762 	struct net_device *dev = skb->dev;
763 	struct in_device *in_dev;
764 	struct fib_result res;
765 	struct neighbour *n;
766 	struct net *net;
767 
768 	switch (icmp_hdr(skb)->code & 7) {
769 	case ICMP_REDIR_NET:
770 	case ICMP_REDIR_NETTOS:
771 	case ICMP_REDIR_HOST:
772 	case ICMP_REDIR_HOSTTOS:
773 		break;
774 
775 	default:
776 		return;
777 	}
778 
779 	if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw)
780 		return;
781 
782 	in_dev = __in_dev_get_rcu(dev);
783 	if (!in_dev)
784 		return;
785 
786 	net = dev_net(dev);
787 	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
788 	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
789 	    ipv4_is_zeronet(new_gw))
790 		goto reject_redirect;
791 
792 	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
793 		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
794 			goto reject_redirect;
795 		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
796 			goto reject_redirect;
797 	} else {
798 		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
799 			goto reject_redirect;
800 	}
801 
802 	n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
803 	if (!n)
804 		n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
805 	if (!IS_ERR(n)) {
806 		if (!(n->nud_state & NUD_VALID)) {
807 			neigh_event_send(n, NULL);
808 		} else {
809 			if (fib_lookup(net, fl4, &res, 0) == 0) {
810 				struct fib_nh_common *nhc;
811 
812 				fib_select_path(net, &res, fl4, skb);
813 				nhc = FIB_RES_NHC(res);
814 				update_or_create_fnhe(nhc, fl4->daddr, new_gw,
815 						0, false,
816 						jiffies + ip_rt_gc_timeout);
817 			}
818 			if (kill_route)
819 				rt->dst.obsolete = DST_OBSOLETE_KILL;
820 			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
821 		}
822 		neigh_release(n);
823 	}
824 	return;
825 
826 reject_redirect:
827 #ifdef CONFIG_IP_ROUTE_VERBOSE
828 	if (IN_DEV_LOG_MARTIANS(in_dev)) {
829 		const struct iphdr *iph = (const struct iphdr *) skb->data;
830 		__be32 daddr = iph->daddr;
831 		__be32 saddr = iph->saddr;
832 
833 		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
834 				     "  Advised path = %pI4 -> %pI4\n",
835 				     &old_gw, dev->name, &new_gw,
836 				     &saddr, &daddr);
837 	}
838 #endif
839 	;
840 }
841 
842 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
843 {
844 	struct rtable *rt;
845 	struct flowi4 fl4;
846 	const struct iphdr *iph = (const struct iphdr *) skb->data;
847 	struct net *net = dev_net(skb->dev);
848 	int oif = skb->dev->ifindex;
849 	u8 tos = RT_TOS(iph->tos);
850 	u8 prot = iph->protocol;
851 	u32 mark = skb->mark;
852 
853 	rt = (struct rtable *) dst;
854 
855 	__build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
856 	__ip_do_redirect(rt, skb, &fl4, true);
857 }
858 
859 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
860 {
861 	struct rtable *rt = (struct rtable *)dst;
862 	struct dst_entry *ret = dst;
863 
864 	if (rt) {
865 		if (dst->obsolete > 0) {
866 			ip_rt_put(rt);
867 			ret = NULL;
868 		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
869 			   rt->dst.expires) {
870 			ip_rt_put(rt);
871 			ret = NULL;
872 		}
873 	}
874 	return ret;
875 }
876 
877 /*
878  * Algorithm:
879  *	1. The first ip_rt_redirect_number redirects are sent
880  *	   with exponential backoff, then we stop sending them at all,
881  *	   assuming that the host ignores our redirects.
882  *	2. If we did not see packets requiring redirects
883  *	   during ip_rt_redirect_silence, we assume that the host
884  *	   forgot redirected route and start to send redirects again.
885  *
886  * This algorithm is much cheaper and more intelligent than dumb load limiting
887  * in icmp.c.
888  *
889  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
890  * and "frag. need" (breaks PMTU discovery) in icmp.c.
891  */
892 
893 void ip_rt_send_redirect(struct sk_buff *skb)
894 {
895 	struct rtable *rt = skb_rtable(skb);
896 	struct in_device *in_dev;
897 	struct inet_peer *peer;
898 	struct net *net;
899 	int log_martians;
900 	int vif;
901 
902 	rcu_read_lock();
903 	in_dev = __in_dev_get_rcu(rt->dst.dev);
904 	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
905 		rcu_read_unlock();
906 		return;
907 	}
908 	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
909 	vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
910 	rcu_read_unlock();
911 
912 	net = dev_net(rt->dst.dev);
913 	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
914 	if (!peer) {
915 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
916 			  rt_nexthop(rt, ip_hdr(skb)->daddr));
917 		return;
918 	}
919 
920 	/* No redirected packets during ip_rt_redirect_silence;
921 	 * reset the algorithm.
922 	 */
923 	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
924 		peer->rate_tokens = 0;
925 		peer->n_redirects = 0;
926 	}
927 
928 	/* Too many ignored redirects; do not send anything.
929 	 * Set dst.rate_last to the last seen redirected packet.
930 	 */
931 	if (peer->n_redirects >= ip_rt_redirect_number) {
932 		peer->rate_last = jiffies;
933 		goto out_put_peer;
934 	}
935 
936 	/* Check for load limit; set rate_last to the latest sent
937 	 * redirect.
938 	 */
939 	if (peer->n_redirects == 0 ||
940 	    time_after(jiffies,
941 		       (peer->rate_last +
942 			(ip_rt_redirect_load << peer->n_redirects)))) {
943 		__be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
944 
945 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
946 		peer->rate_last = jiffies;
947 		++peer->n_redirects;
948 #ifdef CONFIG_IP_ROUTE_VERBOSE
949 		if (log_martians &&
950 		    peer->n_redirects == ip_rt_redirect_number)
951 			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
952 					     &ip_hdr(skb)->saddr, inet_iif(skb),
953 					     &ip_hdr(skb)->daddr, &gw);
954 #endif
955 	}
956 out_put_peer:
957 	inet_putpeer(peer);
958 }
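
/* Worked example of the backoff above (illustrative sketch, assuming
 * HZ=1000): ip_rt_redirect_load = HZ/50 = 20 jiffies and
 * ip_rt_redirect_number = 9, so after the n-th redirect another one is
 * sent only once
 *
 *	time_after(jiffies, peer->rate_last + (ip_rt_redirect_load << n))
 *
 * holds, i.e. gaps of roughly 40ms, 80ms, 160ms, ... until nine redirects
 * have gone out; then nothing more is sent until ip_rt_redirect_silence
 * ((HZ/50) << 10, about 20.5s) of quiet resets rate_tokens and n_redirects.
 */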
959 
960 static int ip_error(struct sk_buff *skb)
961 {
962 	struct rtable *rt = skb_rtable(skb);
963 	struct net_device *dev = skb->dev;
964 	struct in_device *in_dev;
965 	struct inet_peer *peer;
966 	unsigned long now;
967 	struct net *net;
968 	bool send;
969 	int code;
970 
971 	if (netif_is_l3_master(skb->dev)) {
972 		dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
973 		if (!dev)
974 			goto out;
975 	}
976 
977 	in_dev = __in_dev_get_rcu(dev);
978 
979 	/* IP on this device is disabled. */
980 	if (!in_dev)
981 		goto out;
982 
983 	net = dev_net(rt->dst.dev);
984 	if (!IN_DEV_FORWARD(in_dev)) {
985 		switch (rt->dst.error) {
986 		case EHOSTUNREACH:
987 			__IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
988 			break;
989 
990 		case ENETUNREACH:
991 			__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
992 			break;
993 		}
994 		goto out;
995 	}
996 
997 	switch (rt->dst.error) {
998 	case EINVAL:
999 	default:
1000 		goto out;
1001 	case EHOSTUNREACH:
1002 		code = ICMP_HOST_UNREACH;
1003 		break;
1004 	case ENETUNREACH:
1005 		code = ICMP_NET_UNREACH;
1006 		__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
1007 		break;
1008 	case EACCES:
1009 		code = ICMP_PKT_FILTERED;
1010 		break;
1011 	}
1012 
1013 	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
1014 			       l3mdev_master_ifindex(skb->dev), 1);
1015 
1016 	send = true;
1017 	if (peer) {
1018 		now = jiffies;
1019 		peer->rate_tokens += now - peer->rate_last;
1020 		if (peer->rate_tokens > ip_rt_error_burst)
1021 			peer->rate_tokens = ip_rt_error_burst;
1022 		peer->rate_last = now;
1023 		if (peer->rate_tokens >= ip_rt_error_cost)
1024 			peer->rate_tokens -= ip_rt_error_cost;
1025 		else
1026 			send = false;
1027 		inet_putpeer(peer);
1028 	}
1029 	if (send)
1030 		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1031 
1032 out:	kfree_skb(skb);
1033 	return 0;
1034 }
1035 
1036 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1037 {
1038 	struct dst_entry *dst = &rt->dst;
1039 	struct net *net = dev_net(dst->dev);
1040 	struct fib_result res;
1041 	bool lock = false;
1042 	u32 old_mtu;
1043 
1044 	if (ip_mtu_locked(dst))
1045 		return;
1046 
1047 	old_mtu = ipv4_mtu(dst);
1048 	if (old_mtu < mtu)
1049 		return;
1050 
1051 	if (mtu < ip_rt_min_pmtu) {
1052 		lock = true;
1053 		mtu = min(old_mtu, ip_rt_min_pmtu);
1054 	}
1055 
1056 	if (rt->rt_pmtu == mtu && !lock &&
1057 	    time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
1058 		return;
1059 
1060 	rcu_read_lock();
1061 	if (fib_lookup(net, fl4, &res, 0) == 0) {
1062 		struct fib_nh_common *nhc;
1063 
1064 		fib_select_path(net, &res, fl4, NULL);
1065 		nhc = FIB_RES_NHC(res);
1066 		update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
1067 				      jiffies + ip_rt_mtu_expires);
1068 	}
1069 	rcu_read_unlock();
1070 }
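
/* Worked example for the clamp above (illustrative sketch, not part of the
 * upstream file): ip_rt_min_pmtu is 512 + 20 + 20 = 552 bytes, so if a
 * (possibly forged) ICMP FRAG_NEEDED advertises mtu = 300, the nexthop
 * exception is created with min(old_mtu, 552) and fnhe_mtu_locked = true;
 * like any learned PMTU it expires after ip_rt_mtu_expires (10 minutes)
 * unless refreshed.
 */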
1071 
1072 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1073 			      struct sk_buff *skb, u32 mtu,
1074 			      bool confirm_neigh)
1075 {
1076 	struct rtable *rt = (struct rtable *) dst;
1077 	struct flowi4 fl4;
1078 
1079 	ip_rt_build_flow_key(&fl4, sk, skb);
1080 
1081 	/* Don't make lookup fail for bridged encapsulations */
1082 	if (skb && netif_is_any_bridge_port(skb->dev))
1083 		fl4.flowi4_oif = 0;
1084 
1085 	__ip_rt_update_pmtu(rt, &fl4, mtu);
1086 }
1087 
1088 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1089 		      int oif, u8 protocol)
1090 {
1091 	const struct iphdr *iph = (const struct iphdr *)skb->data;
1092 	struct flowi4 fl4;
1093 	struct rtable *rt;
1094 	u32 mark = IP4_REPLY_MARK(net, skb->mark);
1095 
1096 	__build_flow_key(net, &fl4, NULL, iph, oif,
1097 			 RT_TOS(iph->tos), protocol, mark, 0);
1098 	rt = __ip_route_output_key(net, &fl4);
1099 	if (!IS_ERR(rt)) {
1100 		__ip_rt_update_pmtu(rt, &fl4, mtu);
1101 		ip_rt_put(rt);
1102 	}
1103 }
1104 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1105 
1106 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1107 {
1108 	const struct iphdr *iph = (const struct iphdr *)skb->data;
1109 	struct flowi4 fl4;
1110 	struct rtable *rt;
1111 
1112 	__build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1113 
1114 	if (!fl4.flowi4_mark)
1115 		fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1116 
1117 	rt = __ip_route_output_key(sock_net(sk), &fl4);
1118 	if (!IS_ERR(rt)) {
1119 		__ip_rt_update_pmtu(rt, &fl4, mtu);
1120 		ip_rt_put(rt);
1121 	}
1122 }
1123 
1124 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1125 {
1126 	const struct iphdr *iph = (const struct iphdr *)skb->data;
1127 	struct flowi4 fl4;
1128 	struct rtable *rt;
1129 	struct dst_entry *odst = NULL;
1130 	bool new = false;
1131 	struct net *net = sock_net(sk);
1132 
1133 	bh_lock_sock(sk);
1134 
1135 	if (!ip_sk_accept_pmtu(sk))
1136 		goto out;
1137 
1138 	odst = sk_dst_get(sk);
1139 
1140 	if (sock_owned_by_user(sk) || !odst) {
1141 		__ipv4_sk_update_pmtu(skb, sk, mtu);
1142 		goto out;
1143 	}
1144 
1145 	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1146 
1147 	rt = (struct rtable *)odst;
1148 	if (odst->obsolete && !odst->ops->check(odst, 0)) {
1149 		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1150 		if (IS_ERR(rt))
1151 			goto out;
1152 
1153 		new = true;
1154 	}
1155 
1156 	__ip_rt_update_pmtu((struct rtable *)xfrm_dst_path(&rt->dst), &fl4, mtu);
1157 
1158 	if (!dst_check(&rt->dst, 0)) {
1159 		if (new)
1160 			dst_release(&rt->dst);
1161 
1162 		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1163 		if (IS_ERR(rt))
1164 			goto out;
1165 
1166 		new = true;
1167 	}
1168 
1169 	if (new)
1170 		sk_dst_set(sk, &rt->dst);
1171 
1172 out:
1173 	bh_unlock_sock(sk);
1174 	dst_release(odst);
1175 }
1176 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1177 
1178 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1179 		   int oif, u8 protocol)
1180 {
1181 	const struct iphdr *iph = (const struct iphdr *)skb->data;
1182 	struct flowi4 fl4;
1183 	struct rtable *rt;
1184 
1185 	__build_flow_key(net, &fl4, NULL, iph, oif,
1186 			 RT_TOS(iph->tos), protocol, 0, 0);
1187 	rt = __ip_route_output_key(net, &fl4);
1188 	if (!IS_ERR(rt)) {
1189 		__ip_do_redirect(rt, skb, &fl4, false);
1190 		ip_rt_put(rt);
1191 	}
1192 }
1193 EXPORT_SYMBOL_GPL(ipv4_redirect);
1194 
1195 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1196 {
1197 	const struct iphdr *iph = (const struct iphdr *)skb->data;
1198 	struct flowi4 fl4;
1199 	struct rtable *rt;
1200 	struct net *net = sock_net(sk);
1201 
1202 	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1203 	rt = __ip_route_output_key(net, &fl4);
1204 	if (!IS_ERR(rt)) {
1205 		__ip_do_redirect(rt, skb, &fl4, false);
1206 		ip_rt_put(rt);
1207 	}
1208 }
1209 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1210 
1211 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1212 {
1213 	struct rtable *rt = (struct rtable *) dst;
1214 
1215 	/* All IPV4 dsts are created with ->obsolete set to the value
1216 	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1217 	 * into this function always.
1218 	 *
1219 	 * When a PMTU/redirect information update invalidates a route,
1220 	 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1221 	 * DST_OBSOLETE_DEAD.
1222 	 */
1223 	if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1224 		return NULL;
1225 	return dst;
1226 }
1227 
1228 static void ipv4_send_dest_unreach(struct sk_buff *skb)
1229 {
1230 	struct ip_options opt;
1231 	int res;
1232 
1233 	/* Recompile ip options since IPCB may not be valid anymore.
1234 	 * Also check we have a reasonable ipv4 header.
1235 	 */
1236 	if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
1237 	    ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
1238 		return;
1239 
1240 	memset(&opt, 0, sizeof(opt));
1241 	if (ip_hdr(skb)->ihl > 5) {
1242 		if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
1243 			return;
1244 		opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);
1245 
1246 		rcu_read_lock();
1247 		res = __ip_options_compile(dev_net(skb->dev), &opt, skb, NULL);
1248 		rcu_read_unlock();
1249 
1250 		if (res)
1251 			return;
1252 	}
1253 	__icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
1254 }
1255 
1256 static void ipv4_link_failure(struct sk_buff *skb)
1257 {
1258 	struct rtable *rt;
1259 
1260 	ipv4_send_dest_unreach(skb);
1261 
1262 	rt = skb_rtable(skb);
1263 	if (rt)
1264 		dst_set_expires(&rt->dst, 0);
1265 }
1266 
1267 static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1268 {
1269 	pr_debug("%s: %pI4 -> %pI4, %s\n",
1270 		 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1271 		 skb->dev ? skb->dev->name : "?");
1272 	kfree_skb(skb);
1273 	WARN_ON(1);
1274 	return 0;
1275 }
1276 
1277 /*
1278    We do not cache source address of outgoing interface,
1279    because it is used only by IP RR, TS and SRR options,
1280    so it is out of the fast path.
1281 
1282    BTW remember: "addr" is allowed to be not aligned
1283    in IP options!
1284  */
1285 
1286 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1287 {
1288 	__be32 src;
1289 
1290 	if (rt_is_output_route(rt))
1291 		src = ip_hdr(skb)->saddr;
1292 	else {
1293 		struct fib_result res;
1294 		struct iphdr *iph = ip_hdr(skb);
1295 		struct flowi4 fl4 = {
1296 			.daddr = iph->daddr,
1297 			.saddr = iph->saddr,
1298 			.flowi4_tos = RT_TOS(iph->tos),
1299 			.flowi4_oif = rt->dst.dev->ifindex,
1300 			.flowi4_iif = skb->dev->ifindex,
1301 			.flowi4_mark = skb->mark,
1302 		};
1303 
1304 		rcu_read_lock();
1305 		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1306 			src = fib_result_prefsrc(dev_net(rt->dst.dev), &res);
1307 		else
1308 			src = inet_select_addr(rt->dst.dev,
1309 					       rt_nexthop(rt, iph->daddr),
1310 					       RT_SCOPE_UNIVERSE);
1311 		rcu_read_unlock();
1312 	}
1313 	memcpy(addr, &src, 4);
1314 }
1315 
1316 #ifdef CONFIG_IP_ROUTE_CLASSID
1317 static void set_class_tag(struct rtable *rt, u32 tag)
1318 {
1319 	if (!(rt->dst.tclassid & 0xFFFF))
1320 		rt->dst.tclassid |= tag & 0xFFFF;
1321 	if (!(rt->dst.tclassid & 0xFFFF0000))
1322 		rt->dst.tclassid |= tag & 0xFFFF0000;
1323 }
1324 #endif
1325 
1326 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1327 {
1328 	unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
1329 	unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
1330 				    ip_rt_min_advmss);
1331 
1332 	return min(advmss, IPV4_MAX_PMTU - header_size);
1333 }
1334 
1335 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1336 {
1337 	const struct rtable *rt = (const struct rtable *)dst;
1338 	unsigned int mtu = rt->rt_pmtu;
1339 
1340 	if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1341 		mtu = dst_metric_raw(dst, RTAX_MTU);
1342 
1343 	if (mtu)
1344 		goto out;
1345 
1346 	mtu = READ_ONCE(dst->dev->mtu);
1347 
1348 	if (unlikely(ip_mtu_locked(dst))) {
1349 		if (rt->rt_uses_gateway && mtu > 576)
1350 			mtu = 576;
1351 	}
1352 
1353 out:
1354 	mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
1355 
1356 	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1357 }
1358 
1359 static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr)
1360 {
1361 	struct fnhe_hash_bucket *hash;
1362 	struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1363 	u32 hval = fnhe_hashfun(daddr);
1364 
1365 	spin_lock_bh(&fnhe_lock);
1366 
1367 	hash = rcu_dereference_protected(nhc->nhc_exceptions,
1368 					 lockdep_is_held(&fnhe_lock));
1369 	hash += hval;
1370 
1371 	fnhe_p = &hash->chain;
1372 	fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1373 	while (fnhe) {
1374 		if (fnhe->fnhe_daddr == daddr) {
1375 			rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1376 				fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1377 			/* set fnhe_daddr to 0 to ensure it won't bind with
1378 			 * new dsts in rt_bind_exception().
1379 			 */
1380 			fnhe->fnhe_daddr = 0;
1381 			fnhe_flush_routes(fnhe);
1382 			kfree_rcu(fnhe, rcu);
1383 			break;
1384 		}
1385 		fnhe_p = &fnhe->fnhe_next;
1386 		fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1387 						 lockdep_is_held(&fnhe_lock));
1388 	}
1389 
1390 	spin_unlock_bh(&fnhe_lock);
1391 }
1392 
1393 static struct fib_nh_exception *find_exception(struct fib_nh_common *nhc,
1394 					       __be32 daddr)
1395 {
1396 	struct fnhe_hash_bucket *hash = rcu_dereference(nhc->nhc_exceptions);
1397 	struct fib_nh_exception *fnhe;
1398 	u32 hval;
1399 
1400 	if (!hash)
1401 		return NULL;
1402 
1403 	hval = fnhe_hashfun(daddr);
1404 
1405 	for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1406 	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
1407 		if (fnhe->fnhe_daddr == daddr) {
1408 			if (fnhe->fnhe_expires &&
1409 			    time_after(jiffies, fnhe->fnhe_expires)) {
1410 				ip_del_fnhe(nhc, daddr);
1411 				break;
1412 			}
1413 			return fnhe;
1414 		}
1415 	}
1416 	return NULL;
1417 }
1418 
1419 /* MTU selection:
1420  * 1. mtu on route is locked - use it
1421  * 2. mtu from nexthop exception
1422  * 3. mtu from egress device
1423  */
1424 
1425 u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
1426 {
1427 	struct fib_nh_common *nhc = res->nhc;
1428 	struct net_device *dev = nhc->nhc_dev;
1429 	struct fib_info *fi = res->fi;
1430 	u32 mtu = 0;
1431 
1432 	if (dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu ||
1433 	    fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
1434 		mtu = fi->fib_mtu;
1435 
1436 	if (likely(!mtu)) {
1437 		struct fib_nh_exception *fnhe;
1438 
1439 		fnhe = find_exception(nhc, daddr);
1440 		if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
1441 			mtu = fnhe->fnhe_pmtu;
1442 	}
1443 
1444 	if (likely(!mtu))
1445 		mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);
1446 
1447 	return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu);
1448 }
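
/* Worked case for the selection above (illustrative sketch, not part of
 * the upstream file): with sysctl_ip_fwd_use_pmtu set to 0 and no RTAX_MTU
 * lock, fib_mtu is ignored; a non-expired exception with fnhe_pmtu = 1400
 * then wins over a 1500-byte device MTU, and otherwise
 * min(dev->mtu, IP_MAX_MTU) is used, reduced by any lwtunnel encapsulation
 * headroom.
 */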
1449 
1450 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1451 			      __be32 daddr, const bool do_cache)
1452 {
1453 	bool ret = false;
1454 
1455 	spin_lock_bh(&fnhe_lock);
1456 
1457 	if (daddr == fnhe->fnhe_daddr) {
1458 		struct rtable __rcu **porig;
1459 		struct rtable *orig;
1460 		int genid = fnhe_genid(dev_net(rt->dst.dev));
1461 
1462 		if (rt_is_input_route(rt))
1463 			porig = &fnhe->fnhe_rth_input;
1464 		else
1465 			porig = &fnhe->fnhe_rth_output;
1466 		orig = rcu_dereference(*porig);
1467 
1468 		if (fnhe->fnhe_genid != genid) {
1469 			fnhe->fnhe_genid = genid;
1470 			fnhe->fnhe_gw = 0;
1471 			fnhe->fnhe_pmtu = 0;
1472 			fnhe->fnhe_expires = 0;
1473 			fnhe->fnhe_mtu_locked = false;
1474 			fnhe_flush_routes(fnhe);
1475 			orig = NULL;
1476 		}
1477 		fill_route_from_fnhe(rt, fnhe);
1478 		if (!rt->rt_gw4) {
1479 			rt->rt_gw4 = daddr;
1480 			rt->rt_gw_family = AF_INET;
1481 		}
1482 
1483 		if (do_cache) {
1484 			dst_hold(&rt->dst);
1485 			rcu_assign_pointer(*porig, rt);
1486 			if (orig) {
1487 				dst_dev_put(&orig->dst);
1488 				dst_release(&orig->dst);
1489 			}
1490 			ret = true;
1491 		}
1492 
1493 		fnhe->fnhe_stamp = jiffies;
1494 	}
1495 	spin_unlock_bh(&fnhe_lock);
1496 
1497 	return ret;
1498 }
1499 
1500 static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt)
1501 {
1502 	struct rtable *orig, *prev, **p;
1503 	bool ret = true;
1504 
1505 	if (rt_is_input_route(rt)) {
1506 		p = (struct rtable **)&nhc->nhc_rth_input;
1507 	} else {
1508 		p = (struct rtable **)raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
1509 	}
1510 	orig = *p;
1511 
1512 	/* hold dst before doing cmpxchg() to avoid race condition
1513 	 * on this dst
1514 	 */
1515 	dst_hold(&rt->dst);
1516 	prev = cmpxchg(p, orig, rt);
1517 	if (prev == orig) {
1518 		if (orig) {
1519 			rt_add_uncached_list(orig);
1520 			dst_release(&orig->dst);
1521 		}
1522 	} else {
1523 		dst_release(&rt->dst);
1524 		ret = false;
1525 	}
1526 
1527 	return ret;
1528 }
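
/* The publish pattern above in simplified form (illustrative sketch,
 * omitting the uncached-list bookkeeping): take a reference before
 * cmpxchg() so a concurrent reader can never see the new route without
 * one, then clean up whichever pointer lost the race:
 *
 *	dst_hold(&rt->dst);
 *	prev = cmpxchg(p, orig, rt);
 *	if (prev == orig) {
 *		if (orig)
 *			dst_release(&orig->dst);	(retire the displaced route)
 *	} else {
 *		dst_release(&rt->dst);		(we lost the race: undo our hold)
 *	}
 */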
1529 
1530 struct uncached_list {
1531 	spinlock_t		lock;
1532 	struct list_head	head;
1533 };
1534 
1535 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1536 
1537 void rt_add_uncached_list(struct rtable *rt)
1538 {
1539 	struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1540 
1541 	rt->rt_uncached_list = ul;
1542 
1543 	spin_lock_bh(&ul->lock);
1544 	list_add_tail(&rt->rt_uncached, &ul->head);
1545 	spin_unlock_bh(&ul->lock);
1546 }
1547 
1548 void rt_del_uncached_list(struct rtable *rt)
1549 {
1550 	if (!list_empty(&rt->rt_uncached)) {
1551 		struct uncached_list *ul = rt->rt_uncached_list;
1552 
1553 		spin_lock_bh(&ul->lock);
1554 		list_del(&rt->rt_uncached);
1555 		spin_unlock_bh(&ul->lock);
1556 	}
1557 }
1558 
1559 static void ipv4_dst_destroy(struct dst_entry *dst)
1560 {
1561 	struct rtable *rt = (struct rtable *)dst;
1562 
1563 	ip_dst_metrics_put(dst);
1564 	rt_del_uncached_list(rt);
1565 }
1566 
1567 void rt_flush_dev(struct net_device *dev)
1568 {
1569 	struct rtable *rt;
1570 	int cpu;
1571 
1572 	for_each_possible_cpu(cpu) {
1573 		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1574 
1575 		spin_lock_bh(&ul->lock);
1576 		list_for_each_entry(rt, &ul->head, rt_uncached) {
1577 			if (rt->dst.dev != dev)
1578 				continue;
1579 			rt->dst.dev = blackhole_netdev;
1580 			dev_hold(rt->dst.dev);
1581 			dev_put(dev);
1582 		}
1583 		spin_unlock_bh(&ul->lock);
1584 	}
1585 }
1586 
1587 static bool rt_cache_valid(const struct rtable *rt)
1588 {
1589 	return	rt &&
1590 		rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1591 		!rt_is_expired(rt);
1592 }
1593 
1594 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1595 			   const struct fib_result *res,
1596 			   struct fib_nh_exception *fnhe,
1597 			   struct fib_info *fi, u16 type, u32 itag,
1598 			   const bool do_cache)
1599 {
1600 	bool cached = false;
1601 
1602 	if (fi) {
1603 		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1604 
1605 		if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) {
1606 			rt->rt_uses_gateway = 1;
1607 			rt->rt_gw_family = nhc->nhc_gw_family;
1608 			/* only INET and INET6 are supported */
1609 			if (likely(nhc->nhc_gw_family == AF_INET))
1610 				rt->rt_gw4 = nhc->nhc_gw.ipv4;
1611 			else
1612 				rt->rt_gw6 = nhc->nhc_gw.ipv6;
1613 		}
1614 
1615 		ip_dst_init_metrics(&rt->dst, fi->fib_metrics);
1616 
1617 #ifdef CONFIG_IP_ROUTE_CLASSID
1618 		if (nhc->nhc_family == AF_INET) {
1619 			struct fib_nh *nh;
1620 
1621 			nh = container_of(nhc, struct fib_nh, nh_common);
1622 			rt->dst.tclassid = nh->nh_tclassid;
1623 		}
1624 #endif
1625 		rt->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
1626 		if (unlikely(fnhe))
1627 			cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1628 		else if (do_cache)
1629 			cached = rt_cache_route(nhc, rt);
1630 		if (unlikely(!cached)) {
1631 			/* Routes we intend to cache in nexthop exception or
1632 			 * FIB nexthop have the DST_NOCACHE bit clear.
1633 			 * However, if we are unsuccessful at storing this
1634 			 * route into the cache we really need to set it.
1635 			 */
1636 			if (!rt->rt_gw4) {
1637 				rt->rt_gw_family = AF_INET;
1638 				rt->rt_gw4 = daddr;
1639 			}
1640 			rt_add_uncached_list(rt);
1641 		}
1642 	} else
1643 		rt_add_uncached_list(rt);
1644 
1645 #ifdef CONFIG_IP_ROUTE_CLASSID
1646 #ifdef CONFIG_IP_MULTIPLE_TABLES
1647 	set_class_tag(rt, res->tclassid);
1648 #endif
1649 	set_class_tag(rt, itag);
1650 #endif
1651 }
1652 
1653 struct rtable *rt_dst_alloc(struct net_device *dev,
1654 			    unsigned int flags, u16 type,
1655 			    bool nopolicy, bool noxfrm)
1656 {
1657 	struct rtable *rt;
1658 
1659 	rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1660 		       (nopolicy ? DST_NOPOLICY : 0) |
1661 		       (noxfrm ? DST_NOXFRM : 0));
1662 
1663 	if (rt) {
1664 		rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1665 		rt->rt_flags = flags;
1666 		rt->rt_type = type;
1667 		rt->rt_is_input = 0;
1668 		rt->rt_iif = 0;
1669 		rt->rt_pmtu = 0;
1670 		rt->rt_mtu_locked = 0;
1671 		rt->rt_uses_gateway = 0;
1672 		rt->rt_gw_family = 0;
1673 		rt->rt_gw4 = 0;
1674 		INIT_LIST_HEAD(&rt->rt_uncached);
1675 
1676 		rt->dst.output = ip_output;
1677 		if (flags & RTCF_LOCAL)
1678 			rt->dst.input = ip_local_deliver;
1679 	}
1680 
1681 	return rt;
1682 }
1683 EXPORT_SYMBOL(rt_dst_alloc);
1684 
1685 struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt)
1686 {
1687 	struct rtable *new_rt;
1688 
1689 	new_rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1690 			   rt->dst.flags);
1691 
1692 	if (new_rt) {
1693 		new_rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1694 		new_rt->rt_flags = rt->rt_flags;
1695 		new_rt->rt_type = rt->rt_type;
1696 		new_rt->rt_is_input = rt->rt_is_input;
1697 		new_rt->rt_iif = rt->rt_iif;
1698 		new_rt->rt_pmtu = rt->rt_pmtu;
1699 		new_rt->rt_mtu_locked = rt->rt_mtu_locked;
1700 		new_rt->rt_gw_family = rt->rt_gw_family;
1701 		if (rt->rt_gw_family == AF_INET)
1702 			new_rt->rt_gw4 = rt->rt_gw4;
1703 		else if (rt->rt_gw_family == AF_INET6)
1704 			new_rt->rt_gw6 = rt->rt_gw6;
1705 		INIT_LIST_HEAD(&new_rt->rt_uncached);
1706 
1707 		new_rt->dst.input = rt->dst.input;
1708 		new_rt->dst.output = rt->dst.output;
1709 		new_rt->dst.error = rt->dst.error;
1710 		new_rt->dst.lastuse = jiffies;
1711 		new_rt->dst.lwtstate = lwtstate_get(rt->dst.lwtstate);
1712 	}
1713 	return new_rt;
1714 }
1715 EXPORT_SYMBOL(rt_dst_clone);
1716 
1717 /* called in rcu_read_lock() section */
1718 int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1719 			  u8 tos, struct net_device *dev,
1720 			  struct in_device *in_dev, u32 *itag)
1721 {
1722 	int err;
1723 
1724 	/* Primary sanity checks. */
1725 	if (!in_dev)
1726 		return -EINVAL;
1727 
1728 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1729 	    skb->protocol != htons(ETH_P_IP))
1730 		return -EINVAL;
1731 
1732 	if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1733 		return -EINVAL;
1734 
1735 	if (ipv4_is_zeronet(saddr)) {
1736 		if (!ipv4_is_local_multicast(daddr) &&
1737 		    ip_hdr(skb)->protocol != IPPROTO_IGMP)
1738 			return -EINVAL;
1739 	} else {
1740 		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1741 					  in_dev, itag);
1742 		if (err < 0)
1743 			return err;
1744 	}
1745 	return 0;
1746 }
1747 
1748 /* called in rcu_read_lock() section */
1749 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1750 			     u8 tos, struct net_device *dev, int our)
1751 {
1752 	struct in_device *in_dev = __in_dev_get_rcu(dev);
1753 	unsigned int flags = RTCF_MULTICAST;
1754 	struct rtable *rth;
1755 	u32 itag = 0;
1756 	int err;
1757 
1758 	err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1759 	if (err)
1760 		return err;
1761 
1762 	if (our)
1763 		flags |= RTCF_LOCAL;
1764 
1765 	rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1766 			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
1767 	if (!rth)
1768 		return -ENOBUFS;
1769 
1770 #ifdef CONFIG_IP_ROUTE_CLASSID
1771 	rth->dst.tclassid = itag;
1772 #endif
1773 	rth->dst.output = ip_rt_bug;
1774 	rth->rt_is_input = 1;
1775 
1776 #ifdef CONFIG_IP_MROUTE
1777 	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1778 		rth->dst.input = ip_mr_input;
1779 #endif
1780 	RT_CACHE_STAT_INC(in_slow_mc);
1781 
1782 	skb_dst_set(skb, &rth->dst);
1783 	return 0;
1784 }
1785 
1786 
1787 static void ip_handle_martian_source(struct net_device *dev,
1788 				     struct in_device *in_dev,
1789 				     struct sk_buff *skb,
1790 				     __be32 daddr,
1791 				     __be32 saddr)
1792 {
1793 	RT_CACHE_STAT_INC(in_martian_src);
1794 #ifdef CONFIG_IP_ROUTE_VERBOSE
1795 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1796 		/*
1797 		 *	RFC 1812 recommendation: if the source is martian,
1798 		 *	the only hint we have is the MAC header.
1799 		 */
1800 		pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1801 			&daddr, &saddr, dev->name);
1802 		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1803 			print_hex_dump(KERN_WARNING, "ll header: ",
1804 				       DUMP_PREFIX_OFFSET, 16, 1,
1805 				       skb_mac_header(skb),
1806 				       dev->hard_header_len, false);
1807 		}
1808 	}
1809 #endif
1810 }
1811 
1812 /* called in rcu_read_lock() section */
1813 static int __mkroute_input(struct sk_buff *skb,
1814 			   const struct fib_result *res,
1815 			   struct in_device *in_dev,
1816 			   __be32 daddr, __be32 saddr, u32 tos)
1817 {
1818 	struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1819 	struct net_device *dev = nhc->nhc_dev;
1820 	struct fib_nh_exception *fnhe;
1821 	struct rtable *rth;
1822 	int err;
1823 	struct in_device *out_dev;
1824 	bool do_cache;
1825 	u32 itag = 0;
1826 
1827 	/* get a working reference to the output device */
1828 	out_dev = __in_dev_get_rcu(dev);
1829 	if (!out_dev) {
1830 		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1831 		return -EINVAL;
1832 	}
1833 
1834 	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1835 				  in_dev->dev, in_dev, &itag);
1836 	if (err < 0) {
1837 		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1838 					 saddr);
1839 
1840 		goto cleanup;
1841 	}
1842 
1843 	do_cache = res->fi && !itag;
1844 	if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1845 	    skb->protocol == htons(ETH_P_IP)) {
1846 		__be32 gw;
1847 
1848 		gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0;
1849 		if (IN_DEV_SHARED_MEDIA(out_dev) ||
1850 		    inet_addr_onlink(out_dev, saddr, gw))
1851 			IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1852 	}
1853 
1854 	if (skb->protocol != htons(ETH_P_IP)) {
1855 		/* Not IP (i.e. ARP). Do not create a route if it is
1856 		 * invalid for proxy arp. DNAT routes are always valid.
1857 		 *
1858 		 * The proxy arp feature has been extended to allow ARP
1859 		 * replies back on the same interface, to support
1860 		 * Private VLAN switch technologies. See arp.c.
1861 		 */
1862 		if (out_dev == in_dev &&
1863 		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1864 			err = -EINVAL;
1865 			goto cleanup;
1866 		}
1867 	}
1868 
1869 	fnhe = find_exception(nhc, daddr);
1870 	if (do_cache) {
1871 		if (fnhe)
1872 			rth = rcu_dereference(fnhe->fnhe_rth_input);
1873 		else
1874 			rth = rcu_dereference(nhc->nhc_rth_input);
1875 		if (rt_cache_valid(rth)) {
1876 			skb_dst_set_noref(skb, &rth->dst);
1877 			goto out;
1878 		}
1879 	}
1880 
1881 	rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1882 			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
1883 			   IN_DEV_CONF_GET(out_dev, NOXFRM));
1884 	if (!rth) {
1885 		err = -ENOBUFS;
1886 		goto cleanup;
1887 	}
1888 
1889 	rth->rt_is_input = 1;
1890 	RT_CACHE_STAT_INC(in_slow_tot);
1891 
1892 	rth->dst.input = ip_forward;
1893 
1894 	rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1895 		       do_cache);
1896 	lwtunnel_set_redirect(&rth->dst);
1897 	skb_dst_set(skb, &rth->dst);
1898 out:
1899 	err = 0;
1900  cleanup:
1901 	return err;
1902 }
1903 
1904 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1905 /* To make ICMP packets follow the right flow, the multipath hash is
1906  * calculated from the inner IP addresses.
1907  */
1908 static void ip_multipath_l3_keys(const struct sk_buff *skb,
1909 				 struct flow_keys *hash_keys)
1910 {
1911 	const struct iphdr *outer_iph = ip_hdr(skb);
1912 	const struct iphdr *key_iph = outer_iph;
1913 	const struct iphdr *inner_iph;
1914 	const struct icmphdr *icmph;
1915 	struct iphdr _inner_iph;
1916 	struct icmphdr _icmph;
1917 
1918 	if (likely(outer_iph->protocol != IPPROTO_ICMP))
1919 		goto out;
1920 
1921 	if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1922 		goto out;
1923 
1924 	icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1925 				   &_icmph);
1926 	if (!icmph)
1927 		goto out;
1928 
1929 	if (!icmp_is_err(icmph->type))
1930 		goto out;
1931 
1932 	inner_iph = skb_header_pointer(skb,
1933 				       outer_iph->ihl * 4 + sizeof(_icmph),
1934 				       sizeof(_inner_iph), &_inner_iph);
1935 	if (!inner_iph)
1936 		goto out;
1937 
1938 	key_iph = inner_iph;
1939 out:
1940 	hash_keys->addrs.v4addrs.src = key_iph->saddr;
1941 	hash_keys->addrs.v4addrs.dst = key_iph->daddr;
1942 }
1943 
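/* A short summary of the hash policies implemented by the switch in
 * fib_multipath_hash() below (selected via the fib_multipath_hash_policy
 * sysctl): 0 - hash on the layer-3 addresses only; 1 - hash on the layer-3
 * addresses, layer-4 ports and protocol (the classic 5-tuple); 2 - hash on
 * the layer-3 addresses of the inner packet (IPv4 or IPv6) when the flow
 * dissector finds an encapsulated header, otherwise on the outer layer-3
 * addresses as in policy 0.
 */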
1944 /* if skb is set, it will be used and fl4 can be NULL */
1945 int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
1946 		       const struct sk_buff *skb, struct flow_keys *flkeys)
1947 {
1948 	u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0;
1949 	struct flow_keys hash_keys;
1950 	u32 mhash;
1951 
1952 	switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1953 	case 0:
1954 		memset(&hash_keys, 0, sizeof(hash_keys));
1955 		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1956 		if (skb) {
1957 			ip_multipath_l3_keys(skb, &hash_keys);
1958 		} else {
1959 			hash_keys.addrs.v4addrs.src = fl4->saddr;
1960 			hash_keys.addrs.v4addrs.dst = fl4->daddr;
1961 		}
1962 		break;
1963 	case 1:
1964 		/* skb is currently provided only when forwarding */
1965 		if (skb) {
1966 			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1967 			struct flow_keys keys;
1968 
1969 			/* short-circuit if we already have L4 hash present */
1970 			if (skb->l4_hash)
1971 				return skb_get_hash_raw(skb) >> 1;
1972 
1973 			memset(&hash_keys, 0, sizeof(hash_keys));
1974 
1975 			if (!flkeys) {
1976 				skb_flow_dissect_flow_keys(skb, &keys, flag);
1977 				flkeys = &keys;
1978 			}
1979 
1980 			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1981 			hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
1982 			hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
1983 			hash_keys.ports.src = flkeys->ports.src;
1984 			hash_keys.ports.dst = flkeys->ports.dst;
1985 			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
1986 		} else {
1987 			memset(&hash_keys, 0, sizeof(hash_keys));
1988 			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1989 			hash_keys.addrs.v4addrs.src = fl4->saddr;
1990 			hash_keys.addrs.v4addrs.dst = fl4->daddr;
1991 			hash_keys.ports.src = fl4->fl4_sport;
1992 			hash_keys.ports.dst = fl4->fl4_dport;
1993 			hash_keys.basic.ip_proto = fl4->flowi4_proto;
1994 		}
1995 		break;
1996 	case 2:
1997 		memset(&hash_keys, 0, sizeof(hash_keys));
1998 		/* skb is currently provided only when forwarding */
1999 		if (skb) {
2000 			struct flow_keys keys;
2001 
2002 			skb_flow_dissect_flow_keys(skb, &keys, 0);
2003 			/* Inner can be v4 or v6 */
2004 			if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
2005 				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2006 				hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
2007 				hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
2008 			} else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
2009 				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2010 				hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
2011 				hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
2012 				hash_keys.tags.flow_label = keys.tags.flow_label;
2013 				hash_keys.basic.ip_proto = keys.basic.ip_proto;
2014 			} else {
2015 				/* Same as case 0 */
2016 				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2017 				ip_multipath_l3_keys(skb, &hash_keys);
2018 			}
2019 		} else {
2020 			/* Same as case 0 */
2021 			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2022 			hash_keys.addrs.v4addrs.src = fl4->saddr;
2023 			hash_keys.addrs.v4addrs.dst = fl4->daddr;
2024 		}
2025 		break;
2026 	}
2027 	mhash = flow_hash_from_keys(&hash_keys);
2028 
2029 	if (multipath_hash)
2030 		mhash = jhash_2words(mhash, multipath_hash, 0);
2031 
2032 	return mhash >> 1;
2033 }
2034 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
2035 
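/* When the matched route has more than one nexthop, ip_mkroute_input()
 * picks one of them using the multipath hash computed above, then builds
 * and attaches the forwarding dst via __mkroute_input().
 */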
2036 static int ip_mkroute_input(struct sk_buff *skb,
2037 			    struct fib_result *res,
2038 			    struct in_device *in_dev,
2039 			    __be32 daddr, __be32 saddr, u32 tos,
2040 			    struct flow_keys *hkeys)
2041 {
2042 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2043 	if (res->fi && fib_info_num_path(res->fi) > 1) {
2044 		int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
2045 
2046 		fib_select_multipath(res, h);
2047 	}
2048 #endif
2049 
2050 	/* create a routing cache entry */
2051 	return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
2052 }
2053 
2054 /* Implements the same saddr-related checks as ip_route_input_slow(),
2055  * assuming daddr is valid and the destination is not a local broadcast one.
2056  * Uses the provided hint instead of performing a route lookup.
2057  */
2058 int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2059 		      u8 tos, struct net_device *dev,
2060 		      const struct sk_buff *hint)
2061 {
2062 	struct in_device *in_dev = __in_dev_get_rcu(dev);
2063 	struct rtable *rt = skb_rtable(hint);
2064 	struct net *net = dev_net(dev);
2065 	int err = -EINVAL;
2066 	u32 tag = 0;
2067 
2068 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2069 		goto martian_source;
2070 
2071 	if (ipv4_is_zeronet(saddr))
2072 		goto martian_source;
2073 
2074 	if (ipv4_is_loopback(saddr) && !IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2075 		goto martian_source;
2076 
2077 	if (rt->rt_type != RTN_LOCAL)
2078 		goto skip_validate_source;
2079 
2080 	tos &= IPTOS_RT_MASK;
2081 	err = fib_validate_source(skb, saddr, daddr, tos, 0, dev, in_dev, &tag);
2082 	if (err < 0)
2083 		goto martian_source;
2084 
2085 skip_validate_source:
2086 	skb_dst_copy(skb, hint);
2087 	return 0;
2088 
2089 martian_source:
2090 	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2091 	return err;
2092 }
2093 
2094 /* get device for dst_alloc with local routes */
2095 static struct net_device *ip_rt_get_dev(struct net *net,
2096 					const struct fib_result *res)
2097 {
2098 	struct fib_nh_common *nhc = res->fi ? res->nhc : NULL;
2099 	struct net_device *dev = NULL;
2100 
2101 	if (nhc)
2102 		dev = l3mdev_master_dev_rcu(nhc->nhc_dev);
2103 
2104 	return dev ? : net->loopback_dev;
2105 }
2106 
2107 /*
2108  *	NOTE. We drop all packets that have local source
2109  *	addresses, because every properly looped-back packet
2110  *	must already have the correct destination attached by the output routine.
2111  *	Changes in the enforced policies must also be applied to
2112  *	ip_route_use_hint().
2113  *
2114  *	This approach solves two big problems:
2115  *	1. Non-simplex devices are handled properly.
2116  *	2. IP spoofing attempts are filtered with a 100% guarantee.
2117  *	Called with rcu_read_lock().
2118  */
2119 
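/* Rough flow of ip_route_input_slow(): reject obvious martian source and
 * destination addresses, do a FIB lookup on the flow, then either hand
 * broadcast/local traffic to the local delivery path (brd_input /
 * local_input), forward unicast traffic via ip_mkroute_input(), or fall
 * through to the error labels (no_route, martian_*) below.
 */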
2120 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2121 			       u8 tos, struct net_device *dev,
2122 			       struct fib_result *res)
2123 {
2124 	struct in_device *in_dev = __in_dev_get_rcu(dev);
2125 	struct flow_keys *flkeys = NULL, _flkeys;
2126 	struct net    *net = dev_net(dev);
2127 	struct ip_tunnel_info *tun_info;
2128 	int		err = -EINVAL;
2129 	unsigned int	flags = 0;
2130 	u32		itag = 0;
2131 	struct rtable	*rth;
2132 	struct flowi4	fl4;
2133 	bool do_cache = true;
2134 
2135 	/* IP on this device is disabled. */
2136 
2137 	if (!in_dev)
2138 		goto out;
2139 
2140 	/* Check for the weirdest martians, which cannot be detected
2141 	   by fib_lookup.
2142 	 */
2143 
2144 	tun_info = skb_tunnel_info(skb);
2145 	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2146 		fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
2147 	else
2148 		fl4.flowi4_tun_key.tun_id = 0;
2149 	skb_dst_drop(skb);
2150 
2151 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2152 		goto martian_source;
2153 
2154 	res->fi = NULL;
2155 	res->table = NULL;
2156 	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2157 		goto brd_input;
2158 
2159 	/* Accept zero addresses only for limited broadcast;
2160 	 * I do not even know whether to fix this or not. Waiting for complaints :-)
2161 	 */
2162 	if (ipv4_is_zeronet(saddr))
2163 		goto martian_source;
2164 
2165 	if (ipv4_is_zeronet(daddr))
2166 		goto martian_destination;
2167 
2168 	/* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
2169 	 * calling it at most once when daddr and/or saddr are loopback addresses
2170 	 */
2171 	if (ipv4_is_loopback(daddr)) {
2172 		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2173 			goto martian_destination;
2174 	} else if (ipv4_is_loopback(saddr)) {
2175 		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2176 			goto martian_source;
2177 	}
2178 
2179 	/*
2180 	 *	Now we are ready to route packet.
2181 	 */
2182 	fl4.flowi4_oif = 0;
2183 	fl4.flowi4_iif = dev->ifindex;
2184 	fl4.flowi4_mark = skb->mark;
2185 	fl4.flowi4_tos = tos;
2186 	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2187 	fl4.flowi4_flags = 0;
2188 	fl4.daddr = daddr;
2189 	fl4.saddr = saddr;
2190 	fl4.flowi4_uid = sock_net_uid(net, NULL);
2191 	fl4.flowi4_multipath_hash = 0;
2192 
2193 	if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
2194 		flkeys = &_flkeys;
2195 	} else {
2196 		fl4.flowi4_proto = 0;
2197 		fl4.fl4_sport = 0;
2198 		fl4.fl4_dport = 0;
2199 	}
2200 
2201 	err = fib_lookup(net, &fl4, res, 0);
2202 	if (err != 0) {
2203 		if (!IN_DEV_FORWARD(in_dev))
2204 			err = -EHOSTUNREACH;
2205 		goto no_route;
2206 	}
2207 
2208 	if (res->type == RTN_BROADCAST) {
2209 		if (IN_DEV_BFORWARD(in_dev))
2210 			goto make_route;
2211 		/* do not cache if bc_forwarding is enabled */
2212 		if (IPV4_DEVCONF_ALL(net, BC_FORWARDING))
2213 			do_cache = false;
2214 		goto brd_input;
2215 	}
2216 
2217 	if (res->type == RTN_LOCAL) {
2218 		err = fib_validate_source(skb, saddr, daddr, tos,
2219 					  0, dev, in_dev, &itag);
2220 		if (err < 0)
2221 			goto martian_source;
2222 		goto local_input;
2223 	}
2224 
2225 	if (!IN_DEV_FORWARD(in_dev)) {
2226 		err = -EHOSTUNREACH;
2227 		goto no_route;
2228 	}
2229 	if (res->type != RTN_UNICAST)
2230 		goto martian_destination;
2231 
2232 make_route:
2233 	err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
2234 out:	return err;
2235 
2236 brd_input:
2237 	if (skb->protocol != htons(ETH_P_IP))
2238 		goto e_inval;
2239 
2240 	if (!ipv4_is_zeronet(saddr)) {
2241 		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2242 					  in_dev, &itag);
2243 		if (err < 0)
2244 			goto martian_source;
2245 	}
2246 	flags |= RTCF_BROADCAST;
2247 	res->type = RTN_BROADCAST;
2248 	RT_CACHE_STAT_INC(in_brd);
2249 
2250 local_input:
2251 	do_cache &= res->fi && !itag;
2252 	if (do_cache) {
2253 		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2254 
2255 		rth = rcu_dereference(nhc->nhc_rth_input);
2256 		if (rt_cache_valid(rth)) {
2257 			skb_dst_set_noref(skb, &rth->dst);
2258 			err = 0;
2259 			goto out;
2260 		}
2261 	}
2262 
2263 	rth = rt_dst_alloc(ip_rt_get_dev(net, res),
2264 			   flags | RTCF_LOCAL, res->type,
2265 			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2266 	if (!rth)
2267 		goto e_nobufs;
2268 
2269 	rth->dst.output = ip_rt_bug;
2270 #ifdef CONFIG_IP_ROUTE_CLASSID
2271 	rth->dst.tclassid = itag;
2272 #endif
2273 	rth->rt_is_input = 1;
2274 
2275 	RT_CACHE_STAT_INC(in_slow_tot);
2276 	if (res->type == RTN_UNREACHABLE) {
2277 		rth->dst.input = ip_error;
2278 		rth->dst.error = -err;
2279 		rth->rt_flags &= ~RTCF_LOCAL;
2280 	}
2281 
2282 	if (do_cache) {
2283 		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2284 
2285 		rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
2286 		if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2287 			WARN_ON(rth->dst.input == lwtunnel_input);
2288 			rth->dst.lwtstate->orig_input = rth->dst.input;
2289 			rth->dst.input = lwtunnel_input;
2290 		}
2291 
2292 		if (unlikely(!rt_cache_route(nhc, rth)))
2293 			rt_add_uncached_list(rth);
2294 	}
2295 	skb_dst_set(skb, &rth->dst);
2296 	err = 0;
2297 	goto out;
2298 
2299 no_route:
2300 	RT_CACHE_STAT_INC(in_no_route);
2301 	res->type = RTN_UNREACHABLE;
2302 	res->fi = NULL;
2303 	res->table = NULL;
2304 	goto local_input;
2305 
2306 	/*
2307 	 *	Do not cache martian addresses: they should be logged (RFC1812)
2308 	 */
2309 martian_destination:
2310 	RT_CACHE_STAT_INC(in_martian_dst);
2311 #ifdef CONFIG_IP_ROUTE_VERBOSE
2312 	if (IN_DEV_LOG_MARTIANS(in_dev))
2313 		net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2314 				     &daddr, &saddr, dev->name);
2315 #endif
2316 
2317 e_inval:
2318 	err = -EINVAL;
2319 	goto out;
2320 
2321 e_nobufs:
2322 	err = -ENOBUFS;
2323 	goto out;
2324 
2325 martian_source:
2326 	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2327 	goto out;
2328 }
2329 
2330 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2331 			 u8 tos, struct net_device *dev)
2332 {
2333 	struct fib_result res;
2334 	int err;
2335 
2336 	tos &= IPTOS_RT_MASK;
2337 	rcu_read_lock();
2338 	err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2339 	rcu_read_unlock();
2340 
2341 	return err;
2342 }
2343 EXPORT_SYMBOL(ip_route_input_noref);
2344 
2345 /* called with rcu_read_lock held */
2346 int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2347 		       u8 tos, struct net_device *dev, struct fib_result *res)
2348 {
2349 	/* Multicast recognition logic was moved from the route cache to here.
2350 	   The problem was that too many Ethernet cards have broken/missing
2351 	   hardware multicast filters :-( As a result, a host on a multicast
2352 	   network acquires a lot of useless route cache entries, e.g. for
2353 	   SDR messages from all over the world. Now we try to get rid of them.
2354 	   Really, provided the software IP multicast filter is organized
2355 	   reasonably (at least, hashed), it does not result in a slowdown
2356 	   compared with route cache reject entries.
2357 	   Note that multicast routers are not affected, because a
2358 	   route cache entry is created eventually.
2359 	 */
2360 	if (ipv4_is_multicast(daddr)) {
2361 		struct in_device *in_dev = __in_dev_get_rcu(dev);
2362 		int our = 0;
2363 		int err = -EINVAL;
2364 
2365 		if (!in_dev)
2366 			return err;
2367 		our = ip_check_mc_rcu(in_dev, daddr, saddr,
2368 				      ip_hdr(skb)->protocol);
2369 
2370 		/* check l3 master if no match yet */
2371 		if (!our && netif_is_l3_slave(dev)) {
2372 			struct in_device *l3_in_dev;
2373 
2374 			l3_in_dev = __in_dev_get_rcu(skb->dev);
2375 			if (l3_in_dev)
2376 				our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2377 						      ip_hdr(skb)->protocol);
2378 		}
2379 
2380 		if (our
2381 #ifdef CONFIG_IP_MROUTE
2382 			||
2383 		    (!ipv4_is_local_multicast(daddr) &&
2384 		     IN_DEV_MFORWARD(in_dev))
2385 #endif
2386 		   ) {
2387 			err = ip_route_input_mc(skb, daddr, saddr,
2388 						tos, dev, our);
2389 		}
2390 		return err;
2391 	}
2392 
2393 	return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2394 }
2395 
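/* __mkroute_output() below classifies the destination (broadcast,
 * multicast, local or unicast), decides whether the resulting dst may be
 * cached in the nexthop (do_cache), reuses a cached rtable or a per-daddr
 * exception when one is still valid, and otherwise allocates and
 * initializes a fresh output route.
 */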
2396 /* called with rcu_read_lock() */
2397 static struct rtable *__mkroute_output(const struct fib_result *res,
2398 				       const struct flowi4 *fl4, int orig_oif,
2399 				       struct net_device *dev_out,
2400 				       unsigned int flags)
2401 {
2402 	struct fib_info *fi = res->fi;
2403 	struct fib_nh_exception *fnhe;
2404 	struct in_device *in_dev;
2405 	u16 type = res->type;
2406 	struct rtable *rth;
2407 	bool do_cache;
2408 
2409 	in_dev = __in_dev_get_rcu(dev_out);
2410 	if (!in_dev)
2411 		return ERR_PTR(-EINVAL);
2412 
2413 	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2414 		if (ipv4_is_loopback(fl4->saddr) &&
2415 		    !(dev_out->flags & IFF_LOOPBACK) &&
2416 		    !netif_is_l3_master(dev_out))
2417 			return ERR_PTR(-EINVAL);
2418 
2419 	if (ipv4_is_lbcast(fl4->daddr))
2420 		type = RTN_BROADCAST;
2421 	else if (ipv4_is_multicast(fl4->daddr))
2422 		type = RTN_MULTICAST;
2423 	else if (ipv4_is_zeronet(fl4->daddr))
2424 		return ERR_PTR(-EINVAL);
2425 
2426 	if (dev_out->flags & IFF_LOOPBACK)
2427 		flags |= RTCF_LOCAL;
2428 
2429 	do_cache = true;
2430 	if (type == RTN_BROADCAST) {
2431 		flags |= RTCF_BROADCAST | RTCF_LOCAL;
2432 		fi = NULL;
2433 	} else if (type == RTN_MULTICAST) {
2434 		flags |= RTCF_MULTICAST | RTCF_LOCAL;
2435 		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2436 				     fl4->flowi4_proto))
2437 			flags &= ~RTCF_LOCAL;
2438 		else
2439 			do_cache = false;
2440 		/* If a multicast route does not exist, use the
2441 		 * default one, but do not gateway in this case.
2442 		 * Yes, it is a hack.
2443 		 */
2444 		if (fi && res->prefixlen < 4)
2445 			fi = NULL;
2446 	} else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2447 		   (orig_oif != dev_out->ifindex)) {
2448 		/* For local routes that require a particular output interface
2449 		 * we do not want to cache the result.  Caching the result
2450 		 * causes incorrect behaviour when there are multiple source
2451 		 * addresses on the interface, the end result being that if the
2452 		 * intended recipient is waiting on that interface for the
2453 		 * packet he won't receive it because it will be delivered on
2454 		 * the loopback interface and the IP_PKTINFO ipi_ifindex will
2455 		 * be set to the loopback interface as well.
2456 		 */
2457 		do_cache = false;
2458 	}
2459 
2460 	fnhe = NULL;
2461 	do_cache &= fi != NULL;
2462 	if (fi) {
2463 		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2464 		struct rtable __rcu **prth;
2465 
2466 		fnhe = find_exception(nhc, fl4->daddr);
2467 		if (!do_cache)
2468 			goto add;
2469 		if (fnhe) {
2470 			prth = &fnhe->fnhe_rth_output;
2471 		} else {
2472 			if (unlikely(fl4->flowi4_flags &
2473 				     FLOWI_FLAG_KNOWN_NH &&
2474 				     !(nhc->nhc_gw_family &&
2475 				       nhc->nhc_scope == RT_SCOPE_LINK))) {
2476 				do_cache = false;
2477 				goto add;
2478 			}
2479 			prth = raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
2480 		}
2481 		rth = rcu_dereference(*prth);
2482 		if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2483 			return rth;
2484 	}
2485 
2486 add:
2487 	rth = rt_dst_alloc(dev_out, flags, type,
2488 			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
2489 			   IN_DEV_CONF_GET(in_dev, NOXFRM));
2490 	if (!rth)
2491 		return ERR_PTR(-ENOBUFS);
2492 
2493 	rth->rt_iif = orig_oif;
2494 
2495 	RT_CACHE_STAT_INC(out_slow_tot);
2496 
2497 	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2498 		if (flags & RTCF_LOCAL &&
2499 		    !(dev_out->flags & IFF_LOOPBACK)) {
2500 			rth->dst.output = ip_mc_output;
2501 			RT_CACHE_STAT_INC(out_slow_mc);
2502 		}
2503 #ifdef CONFIG_IP_MROUTE
2504 		if (type == RTN_MULTICAST) {
2505 			if (IN_DEV_MFORWARD(in_dev) &&
2506 			    !ipv4_is_local_multicast(fl4->daddr)) {
2507 				rth->dst.input = ip_mr_input;
2508 				rth->dst.output = ip_mc_output;
2509 			}
2510 		}
2511 #endif
2512 	}
2513 
2514 	rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2515 	lwtunnel_set_redirect(&rth->dst);
2516 
2517 	return rth;
2518 }
2519 
2520 /*
2521  * Major route resolver routine.
2522  */
2523 
2524 struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2525 					const struct sk_buff *skb)
2526 {
2527 	__u8 tos = RT_FL_TOS(fl4);
2528 	struct fib_result res = {
2529 		.type		= RTN_UNSPEC,
2530 		.fi		= NULL,
2531 		.table		= NULL,
2532 		.tclassid	= 0,
2533 	};
2534 	struct rtable *rth;
2535 
2536 	fl4->flowi4_iif = LOOPBACK_IFINDEX;
2537 	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2538 	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2539 			 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2540 
2541 	rcu_read_lock();
2542 	rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2543 	rcu_read_unlock();
2544 
2545 	return rth;
2546 }
2547 EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
2548 
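/* Output route resolution proper, roughly: validate any caller-supplied
 * source address, resolve the output device and source address when an
 * oif or a multicast/limited-broadcast destination pins them, fall back
 * to the loopback device for purely local traffic, then consult the FIB
 * (fib_lookup() and fib_select_path()) before building the dst in
 * __mkroute_output().
 */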
2549 struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2550 					    struct fib_result *res,
2551 					    const struct sk_buff *skb)
2552 {
2553 	struct net_device *dev_out = NULL;
2554 	int orig_oif = fl4->flowi4_oif;
2555 	unsigned int flags = 0;
2556 	struct rtable *rth;
2557 	int err;
2558 
2559 	if (fl4->saddr) {
2560 		if (ipv4_is_multicast(fl4->saddr) ||
2561 		    ipv4_is_lbcast(fl4->saddr) ||
2562 		    ipv4_is_zeronet(fl4->saddr)) {
2563 			rth = ERR_PTR(-EINVAL);
2564 			goto out;
2565 		}
2566 
2567 		rth = ERR_PTR(-ENETUNREACH);
2568 
2569 		/* I removed check for oif == dev_out->oif here.
2570 		   It was wrong for two reasons:
2571 		   1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2572 		      is assigned to multiple interfaces.
2573 		   2. Moreover, we are allowed to send packets with saddr
2574 		      of another iface. --ANK
2575 		 */
2576 
2577 		if (fl4->flowi4_oif == 0 &&
2578 		    (ipv4_is_multicast(fl4->daddr) ||
2579 		     ipv4_is_lbcast(fl4->daddr))) {
2580 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2581 			dev_out = __ip_dev_find(net, fl4->saddr, false);
2582 			if (!dev_out)
2583 				goto out;
2584 
2585 			/* Special hack: the user can direct multicasts
2586 			   and limited broadcast via the necessary interface
2587 			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2588 			   This hack is not just for fun, it allows
2589 			   vic, vat and friends to work.
2590 			   They bind a socket to loopback, set the ttl to zero
2591 			   and expect that it will work.
2592 			   From the viewpoint of the routing cache they are broken,
2593 			   because we are not allowed to build a multicast path
2594 			   with a loopback source addr (look, the routing cache
2595 			   cannot know that the ttl is zero, so the packet
2596 			   will not leave this host and the route is valid).
2597 			   Luckily, this hack is a good workaround.
2598 			 */
2599 
2600 			fl4->flowi4_oif = dev_out->ifindex;
2601 			goto make_route;
2602 		}
2603 
2604 		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2605 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2606 			if (!__ip_dev_find(net, fl4->saddr, false))
2607 				goto out;
2608 		}
2609 	}
2610 
2611 
2612 	if (fl4->flowi4_oif) {
2613 		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2614 		rth = ERR_PTR(-ENODEV);
2615 		if (!dev_out)
2616 			goto out;
2617 
2618 		/* RACE: Check return value of inet_select_addr instead. */
2619 		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2620 			rth = ERR_PTR(-ENETUNREACH);
2621 			goto out;
2622 		}
2623 		if (ipv4_is_local_multicast(fl4->daddr) ||
2624 		    ipv4_is_lbcast(fl4->daddr) ||
2625 		    fl4->flowi4_proto == IPPROTO_IGMP) {
2626 			if (!fl4->saddr)
2627 				fl4->saddr = inet_select_addr(dev_out, 0,
2628 							      RT_SCOPE_LINK);
2629 			goto make_route;
2630 		}
2631 		if (!fl4->saddr) {
2632 			if (ipv4_is_multicast(fl4->daddr))
2633 				fl4->saddr = inet_select_addr(dev_out, 0,
2634 							      fl4->flowi4_scope);
2635 			else if (!fl4->daddr)
2636 				fl4->saddr = inet_select_addr(dev_out, 0,
2637 							      RT_SCOPE_HOST);
2638 		}
2639 	}
2640 
2641 	if (!fl4->daddr) {
2642 		fl4->daddr = fl4->saddr;
2643 		if (!fl4->daddr)
2644 			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2645 		dev_out = net->loopback_dev;
2646 		fl4->flowi4_oif = LOOPBACK_IFINDEX;
2647 		res->type = RTN_LOCAL;
2648 		flags |= RTCF_LOCAL;
2649 		goto make_route;
2650 	}
2651 
2652 	err = fib_lookup(net, fl4, res, 0);
2653 	if (err) {
2654 		res->fi = NULL;
2655 		res->table = NULL;
2656 		if (fl4->flowi4_oif &&
2657 		    (ipv4_is_multicast(fl4->daddr) ||
2658 		    !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2659 			/* Apparently, the routing tables are wrong. Assume
2660 			   that the destination is on link.
2661 
2662 			   WHY? DW.
2663 			   Because we are allowed to send to an iface
2664 			   even if it has NO routes and NO assigned
2665 			   addresses. When oif is specified, the routing
2666 			   tables are looked up with only one purpose:
2667 			   to catch whether the destination is gatewayed rather
2668 			   than direct. Moreover, if MSG_DONTROUTE is set,
2669 			   we send the packet, ignoring both routing tables
2670 			   and ifaddr state. --ANK
2671 
2672 
2673 			   We could do this even when oif is unknown,
2674 			   likely as IPv6 does, but we do not.
2675 			 */
2676 
2677 			if (fl4->saddr == 0)
2678 				fl4->saddr = inet_select_addr(dev_out, 0,
2679 							      RT_SCOPE_LINK);
2680 			res->type = RTN_UNICAST;
2681 			goto make_route;
2682 		}
2683 		rth = ERR_PTR(err);
2684 		goto out;
2685 	}
2686 
2687 	if (res->type == RTN_LOCAL) {
2688 		if (!fl4->saddr) {
2689 			if (res->fi->fib_prefsrc)
2690 				fl4->saddr = res->fi->fib_prefsrc;
2691 			else
2692 				fl4->saddr = fl4->daddr;
2693 		}
2694 
2695 		/* L3 master device is the loopback for that domain */
2696 		dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2697 			net->loopback_dev;
2698 
2699 		/* make sure orig_oif points to fib result device even
2700 		 * though packet rx/tx happens over loopback or l3mdev
2701 		 */
2702 		orig_oif = FIB_RES_OIF(*res);
2703 
2704 		fl4->flowi4_oif = dev_out->ifindex;
2705 		flags |= RTCF_LOCAL;
2706 		goto make_route;
2707 	}
2708 
2709 	fib_select_path(net, res, fl4, skb);
2710 
2711 	dev_out = FIB_RES_DEV(*res);
2712 
2713 make_route:
2714 	rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2715 
2716 out:
2717 	return rth;
2718 }
2719 
2720 static struct dst_ops ipv4_dst_blackhole_ops = {
2721 	.family			= AF_INET,
2722 	.default_advmss		= ipv4_default_advmss,
2723 	.neigh_lookup		= ipv4_neigh_lookup,
2724 	.check			= dst_blackhole_check,
2725 	.cow_metrics		= dst_blackhole_cow_metrics,
2726 	.update_pmtu		= dst_blackhole_update_pmtu,
2727 	.redirect		= dst_blackhole_redirect,
2728 	.mtu			= dst_blackhole_mtu,
2729 };
2730 
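/* ipv4_blackhole_route() copies the identity of an existing route into a
 * dst whose input and output handlers simply discard packets (dst_discard
 * and dst_discard_out).  It is used, for instance, by the xfrm code when a
 * flow must be held back and cannot be transmitted yet.
 */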
2731 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2732 {
2733 	struct rtable *ort = (struct rtable *) dst_orig;
2734 	struct rtable *rt;
2735 
2736 	rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2737 	if (rt) {
2738 		struct dst_entry *new = &rt->dst;
2739 
2740 		new->__use = 1;
2741 		new->input = dst_discard;
2742 		new->output = dst_discard_out;
2743 
2744 		new->dev = net->loopback_dev;
2745 		if (new->dev)
2746 			dev_hold(new->dev);
2747 
2748 		rt->rt_is_input = ort->rt_is_input;
2749 		rt->rt_iif = ort->rt_iif;
2750 		rt->rt_pmtu = ort->rt_pmtu;
2751 		rt->rt_mtu_locked = ort->rt_mtu_locked;
2752 
2753 		rt->rt_genid = rt_genid_ipv4(net);
2754 		rt->rt_flags = ort->rt_flags;
2755 		rt->rt_type = ort->rt_type;
2756 		rt->rt_uses_gateway = ort->rt_uses_gateway;
2757 		rt->rt_gw_family = ort->rt_gw_family;
2758 		if (rt->rt_gw_family == AF_INET)
2759 			rt->rt_gw4 = ort->rt_gw4;
2760 		else if (rt->rt_gw_family == AF_INET6)
2761 			rt->rt_gw6 = ort->rt_gw6;
2762 
2763 		INIT_LIST_HEAD(&rt->rt_uncached);
2764 	}
2765 
2766 	dst_release(dst_orig);
2767 
2768 	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2769 }
2770 
2771 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2772 				    const struct sock *sk)
2773 {
2774 	struct rtable *rt = __ip_route_output_key(net, flp4);
2775 
2776 	if (IS_ERR(rt))
2777 		return rt;
2778 
2779 	if (flp4->flowi4_proto) {
2780 		flp4->flowi4_oif = rt->dst.dev->ifindex;
2781 		rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2782 							flowi4_to_flowi(flp4),
2783 							sk, 0);
2784 	}
2785 
2786 	return rt;
2787 }
2788 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2789 
2790 struct rtable *ip_route_output_tunnel(struct sk_buff *skb,
2791 				      struct net_device *dev,
2792 				      struct net *net, __be32 *saddr,
2793 				      const struct ip_tunnel_info *info,
2794 				      u8 protocol, bool use_cache)
2795 {
2796 #ifdef CONFIG_DST_CACHE
2797 	struct dst_cache *dst_cache;
2798 #endif
2799 	struct rtable *rt = NULL;
2800 	struct flowi4 fl4;
2801 	__u8 tos;
2802 
2803 #ifdef CONFIG_DST_CACHE
2804 	dst_cache = (struct dst_cache *)&info->dst_cache;
2805 	if (use_cache) {
2806 		rt = dst_cache_get_ip4(dst_cache, saddr);
2807 		if (rt)
2808 			return rt;
2809 	}
2810 #endif
2811 	memset(&fl4, 0, sizeof(fl4));
2812 	fl4.flowi4_mark = skb->mark;
2813 	fl4.flowi4_proto = protocol;
2814 	fl4.daddr = info->key.u.ipv4.dst;
2815 	fl4.saddr = info->key.u.ipv4.src;
2816 	tos = info->key.tos;
2817 	fl4.flowi4_tos = RT_TOS(tos);
2818 
2819 	rt = ip_route_output_key(net, &fl4);
2820 	if (IS_ERR(rt)) {
2821 		netdev_dbg(dev, "no route to %pI4\n", &fl4.daddr);
2822 		return ERR_PTR(-ENETUNREACH);
2823 	}
2824 	if (rt->dst.dev == dev) { /* is this necessary? */
2825 		netdev_dbg(dev, "circular route to %pI4\n", &fl4.daddr);
2826 		ip_rt_put(rt);
2827 		return ERR_PTR(-ELOOP);
2828 	}
2829 #ifdef CONFIG_DST_CACHE
2830 	if (use_cache)
2831 		dst_cache_set_ip4(dst_cache, &rt->dst, fl4.saddr);
2832 #endif
2833 	*saddr = fl4.saddr;
2834 	return rt;
2835 }
2836 EXPORT_SYMBOL_GPL(ip_route_output_tunnel);
2837 
2838 /* called with rcu_read_lock held */
2839 static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2840 			struct rtable *rt, u32 table_id, struct flowi4 *fl4,
2841 			struct sk_buff *skb, u32 portid, u32 seq,
2842 			unsigned int flags)
2843 {
2844 	struct rtmsg *r;
2845 	struct nlmsghdr *nlh;
2846 	unsigned long expires = 0;
2847 	u32 error;
2848 	u32 metrics[RTAX_MAX];
2849 
2850 	nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), flags);
2851 	if (!nlh)
2852 		return -EMSGSIZE;
2853 
2854 	r = nlmsg_data(nlh);
2855 	r->rtm_family	 = AF_INET;
2856 	r->rtm_dst_len	= 32;
2857 	r->rtm_src_len	= 0;
2858 	r->rtm_tos	= fl4 ? fl4->flowi4_tos : 0;
2859 	r->rtm_table	= table_id < 256 ? table_id : RT_TABLE_COMPAT;
2860 	if (nla_put_u32(skb, RTA_TABLE, table_id))
2861 		goto nla_put_failure;
2862 	r->rtm_type	= rt->rt_type;
2863 	r->rtm_scope	= RT_SCOPE_UNIVERSE;
2864 	r->rtm_protocol = RTPROT_UNSPEC;
2865 	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2866 	if (rt->rt_flags & RTCF_NOTIFY)
2867 		r->rtm_flags |= RTM_F_NOTIFY;
2868 	if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2869 		r->rtm_flags |= RTCF_DOREDIRECT;
2870 
2871 	if (nla_put_in_addr(skb, RTA_DST, dst))
2872 		goto nla_put_failure;
2873 	if (src) {
2874 		r->rtm_src_len = 32;
2875 		if (nla_put_in_addr(skb, RTA_SRC, src))
2876 			goto nla_put_failure;
2877 	}
2878 	if (rt->dst.dev &&
2879 	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2880 		goto nla_put_failure;
2881 #ifdef CONFIG_IP_ROUTE_CLASSID
2882 	if (rt->dst.tclassid &&
2883 	    nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2884 		goto nla_put_failure;
2885 #endif
2886 	if (fl4 && !rt_is_input_route(rt) &&
2887 	    fl4->saddr != src) {
2888 		if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2889 			goto nla_put_failure;
2890 	}
2891 	if (rt->rt_uses_gateway) {
2892 		if (rt->rt_gw_family == AF_INET &&
2893 		    nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) {
2894 			goto nla_put_failure;
2895 		} else if (rt->rt_gw_family == AF_INET6) {
2896 			int alen = sizeof(struct in6_addr);
2897 			struct nlattr *nla;
2898 			struct rtvia *via;
2899 
2900 			nla = nla_reserve(skb, RTA_VIA, alen + 2);
2901 			if (!nla)
2902 				goto nla_put_failure;
2903 
2904 			via = nla_data(nla);
2905 			via->rtvia_family = AF_INET6;
2906 			memcpy(via->rtvia_addr, &rt->rt_gw6, alen);
2907 		}
2908 	}
2909 
2910 	expires = rt->dst.expires;
2911 	if (expires) {
2912 		unsigned long now = jiffies;
2913 
2914 		if (time_before(now, expires))
2915 			expires -= now;
2916 		else
2917 			expires = 0;
2918 	}
2919 
2920 	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2921 	if (rt->rt_pmtu && expires)
2922 		metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2923 	if (rt->rt_mtu_locked && expires)
2924 		metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
2925 	if (rtnetlink_put_metrics(skb, metrics) < 0)
2926 		goto nla_put_failure;
2927 
2928 	if (fl4) {
2929 		if (fl4->flowi4_mark &&
2930 		    nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2931 			goto nla_put_failure;
2932 
2933 		if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2934 		    nla_put_u32(skb, RTA_UID,
2935 				from_kuid_munged(current_user_ns(),
2936 						 fl4->flowi4_uid)))
2937 			goto nla_put_failure;
2938 
2939 		if (rt_is_input_route(rt)) {
2940 #ifdef CONFIG_IP_MROUTE
2941 			if (ipv4_is_multicast(dst) &&
2942 			    !ipv4_is_local_multicast(dst) &&
2943 			    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2944 				int err = ipmr_get_route(net, skb,
2945 							 fl4->saddr, fl4->daddr,
2946 							 r, portid);
2947 
2948 				if (err <= 0) {
2949 					if (err == 0)
2950 						return 0;
2951 					goto nla_put_failure;
2952 				}
2953 			} else
2954 #endif
2955 				if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
2956 					goto nla_put_failure;
2957 		}
2958 	}
2959 
2960 	error = rt->dst.error;
2961 
2962 	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2963 		goto nla_put_failure;
2964 
2965 	nlmsg_end(skb, nlh);
2966 	return 0;
2967 
2968 nla_put_failure:
2969 	nlmsg_cancel(skb, nlh);
2970 	return -EMSGSIZE;
2971 }
2972 
2973 static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb,
2974 			    struct netlink_callback *cb, u32 table_id,
2975 			    struct fnhe_hash_bucket *bucket, int genid,
2976 			    int *fa_index, int fa_start, unsigned int flags)
2977 {
2978 	int i;
2979 
2980 	for (i = 0; i < FNHE_HASH_SIZE; i++) {
2981 		struct fib_nh_exception *fnhe;
2982 
2983 		for (fnhe = rcu_dereference(bucket[i].chain); fnhe;
2984 		     fnhe = rcu_dereference(fnhe->fnhe_next)) {
2985 			struct rtable *rt;
2986 			int err;
2987 
2988 			if (*fa_index < fa_start)
2989 				goto next;
2990 
2991 			if (fnhe->fnhe_genid != genid)
2992 				goto next;
2993 
2994 			if (fnhe->fnhe_expires &&
2995 			    time_after(jiffies, fnhe->fnhe_expires))
2996 				goto next;
2997 
2998 			rt = rcu_dereference(fnhe->fnhe_rth_input);
2999 			if (!rt)
3000 				rt = rcu_dereference(fnhe->fnhe_rth_output);
3001 			if (!rt)
3002 				goto next;
3003 
3004 			err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt,
3005 					   table_id, NULL, skb,
3006 					   NETLINK_CB(cb->skb).portid,
3007 					   cb->nlh->nlmsg_seq, flags);
3008 			if (err)
3009 				return err;
3010 next:
3011 			(*fa_index)++;
3012 		}
3013 	}
3014 
3015 	return 0;
3016 }
3017 
3018 int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
3019 		       u32 table_id, struct fib_info *fi,
3020 		       int *fa_index, int fa_start, unsigned int flags)
3021 {
3022 	struct net *net = sock_net(cb->skb->sk);
3023 	int nhsel, genid = fnhe_genid(net);
3024 
3025 	for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) {
3026 		struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel);
3027 		struct fnhe_hash_bucket *bucket;
3028 		int err;
3029 
3030 		if (nhc->nhc_flags & RTNH_F_DEAD)
3031 			continue;
3032 
3033 		rcu_read_lock();
3034 		bucket = rcu_dereference(nhc->nhc_exceptions);
3035 		err = 0;
3036 		if (bucket)
3037 			err = fnhe_dump_bucket(net, skb, cb, table_id, bucket,
3038 					       genid, fa_index, fa_start,
3039 					       flags);
3040 		rcu_read_unlock();
3041 		if (err)
3042 			return err;
3043 	}
3044 
3045 	return 0;
3046 }
3047 
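/* Build a minimal dummy skb (an IP header plus a stub UDP, TCP or ICMP
 * header) so that an RTM_GETROUTE request can be pushed through the real
 * input routing path as if it were a received packet.
 */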
3048 static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
3049 						   u8 ip_proto, __be16 sport,
3050 						   __be16 dport)
3051 {
3052 	struct sk_buff *skb;
3053 	struct iphdr *iph;
3054 
3055 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3056 	if (!skb)
3057 		return NULL;
3058 
3059 	/* Reserve room for dummy headers; this skb can pass
3060 	 * through a good chunk of the routing engine.
3061 	 */
3062 	skb_reset_mac_header(skb);
3063 	skb_reset_network_header(skb);
3064 	skb->protocol = htons(ETH_P_IP);
3065 	iph = skb_put(skb, sizeof(struct iphdr));
3066 	iph->protocol = ip_proto;
3067 	iph->saddr = src;
3068 	iph->daddr = dst;
3069 	iph->version = 0x4;
3070 	iph->frag_off = 0;
3071 	iph->ihl = 0x5;
3072 	skb_set_transport_header(skb, skb->len);
3073 
3074 	switch (iph->protocol) {
3075 	case IPPROTO_UDP: {
3076 		struct udphdr *udph;
3077 
3078 		udph = skb_put_zero(skb, sizeof(struct udphdr));
3079 		udph->source = sport;
3080 		udph->dest = dport;
3081 		udph->len = htons(sizeof(struct udphdr));
3082 		udph->check = 0;
3083 		break;
3084 	}
3085 	case IPPROTO_TCP: {
3086 		struct tcphdr *tcph;
3087 
3088 		tcph = skb_put_zero(skb, sizeof(struct tcphdr));
3089 		tcph->source	= sport;
3090 		tcph->dest	= dport;
3091 		tcph->doff	= sizeof(struct tcphdr) / 4;
3092 		tcph->rst = 1;
3093 		tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
3094 					    src, dst, 0);
3095 		break;
3096 	}
3097 	case IPPROTO_ICMP: {
3098 		struct icmphdr *icmph;
3099 
3100 		icmph = skb_put_zero(skb, sizeof(struct icmphdr));
3101 		icmph->type = ICMP_ECHO;
3102 		icmph->code = 0;
3103 	}
3104 	}
3105 
3106 	return skb;
3107 }
3108 
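/* Userspace reaches the handlers below via RTM_GETROUTE requests; for
 * example "ip route get 10.1.2.3 from 10.9.8.7 iif eth0" (illustrative
 * addresses) is turned by iproute2 into RTA_DST, RTA_SRC and RTA_IIF
 * attributes, which are validated in inet_rtm_valid_getroute_req() and
 * then resolved in inet_rtm_getroute().
 */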
3109 static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
3110 				       const struct nlmsghdr *nlh,
3111 				       struct nlattr **tb,
3112 				       struct netlink_ext_ack *extack)
3113 {
3114 	struct rtmsg *rtm;
3115 	int i, err;
3116 
3117 	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
3118 		NL_SET_ERR_MSG(extack,
3119 			       "ipv4: Invalid header for route get request");
3120 		return -EINVAL;
3121 	}
3122 
3123 	if (!netlink_strict_get_check(skb))
3124 		return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
3125 					      rtm_ipv4_policy, extack);
3126 
3127 	rtm = nlmsg_data(nlh);
3128 	if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
3129 	    (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
3130 	    rtm->rtm_table || rtm->rtm_protocol ||
3131 	    rtm->rtm_scope || rtm->rtm_type) {
3132 		NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request");
3133 		return -EINVAL;
3134 	}
3135 
3136 	if (rtm->rtm_flags & ~(RTM_F_NOTIFY |
3137 			       RTM_F_LOOKUP_TABLE |
3138 			       RTM_F_FIB_MATCH)) {
3139 		NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request");
3140 		return -EINVAL;
3141 	}
3142 
3143 	err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
3144 					    rtm_ipv4_policy, extack);
3145 	if (err)
3146 		return err;
3147 
3148 	if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
3149 	    (tb[RTA_DST] && !rtm->rtm_dst_len)) {
3150 		NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4");
3151 		return -EINVAL;
3152 	}
3153 
3154 	for (i = 0; i <= RTA_MAX; i++) {
3155 		if (!tb[i])
3156 			continue;
3157 
3158 		switch (i) {
3159 		case RTA_IIF:
3160 		case RTA_OIF:
3161 		case RTA_SRC:
3162 		case RTA_DST:
3163 		case RTA_IP_PROTO:
3164 		case RTA_SPORT:
3165 		case RTA_DPORT:
3166 		case RTA_MARK:
3167 		case RTA_UID:
3168 			break;
3169 		default:
3170 			NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request");
3171 			return -EINVAL;
3172 		}
3173 	}
3174 
3175 	return 0;
3176 }
3177 
3178 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
3179 			     struct netlink_ext_ack *extack)
3180 {
3181 	struct net *net = sock_net(in_skb->sk);
3182 	struct nlattr *tb[RTA_MAX+1];
3183 	u32 table_id = RT_TABLE_MAIN;
3184 	__be16 sport = 0, dport = 0;
3185 	struct fib_result res = {};
3186 	u8 ip_proto = IPPROTO_UDP;
3187 	struct rtable *rt = NULL;
3188 	struct sk_buff *skb;
3189 	struct rtmsg *rtm;
3190 	struct flowi4 fl4 = {};
3191 	__be32 dst = 0;
3192 	__be32 src = 0;
3193 	kuid_t uid;
3194 	u32 iif;
3195 	int err;
3196 	int mark;
3197 
3198 	err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
3199 	if (err < 0)
3200 		return err;
3201 
3202 	rtm = nlmsg_data(nlh);
3203 	src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
3204 	dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
3205 	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3206 	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3207 	if (tb[RTA_UID])
3208 		uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
3209 	else
3210 		uid = (iif ? INVALID_UID : current_uid());
3211 
3212 	if (tb[RTA_IP_PROTO]) {
3213 		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
3214 						  &ip_proto, AF_INET, extack);
3215 		if (err)
3216 			return err;
3217 	}
3218 
3219 	if (tb[RTA_SPORT])
3220 		sport = nla_get_be16(tb[RTA_SPORT]);
3221 
3222 	if (tb[RTA_DPORT])
3223 		dport = nla_get_be16(tb[RTA_DPORT]);
3224 
3225 	skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
3226 	if (!skb)
3227 		return -ENOBUFS;
3228 
3229 	fl4.daddr = dst;
3230 	fl4.saddr = src;
3231 	fl4.flowi4_tos = rtm->rtm_tos & IPTOS_RT_MASK;
3232 	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
3233 	fl4.flowi4_mark = mark;
3234 	fl4.flowi4_uid = uid;
3235 	if (sport)
3236 		fl4.fl4_sport = sport;
3237 	if (dport)
3238 		fl4.fl4_dport = dport;
3239 	fl4.flowi4_proto = ip_proto;
3240 
3241 	rcu_read_lock();
3242 
3243 	if (iif) {
3244 		struct net_device *dev;
3245 
3246 		dev = dev_get_by_index_rcu(net, iif);
3247 		if (!dev) {
3248 			err = -ENODEV;
3249 			goto errout_rcu;
3250 		}
3251 
3252 		fl4.flowi4_iif = iif; /* for rt_fill_info */
3253 		skb->dev	= dev;
3254 		skb->mark	= mark;
3255 		err = ip_route_input_rcu(skb, dst, src,
3256 					 rtm->rtm_tos & IPTOS_RT_MASK, dev,
3257 					 &res);
3258 
3259 		rt = skb_rtable(skb);
3260 		if (err == 0 && rt->dst.error)
3261 			err = -rt->dst.error;
3262 	} else {
3263 		fl4.flowi4_iif = LOOPBACK_IFINDEX;
3264 		skb->dev = net->loopback_dev;
3265 		rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
3266 		err = 0;
3267 		if (IS_ERR(rt))
3268 			err = PTR_ERR(rt);
3269 		else
3270 			skb_dst_set(skb, &rt->dst);
3271 	}
3272 
3273 	if (err)
3274 		goto errout_rcu;
3275 
3276 	if (rtm->rtm_flags & RTM_F_NOTIFY)
3277 		rt->rt_flags |= RTCF_NOTIFY;
3278 
3279 	if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
3280 		table_id = res.table ? res.table->tb_id : 0;
3281 
3282 	/* reset skb for netlink reply msg */
3283 	skb_trim(skb, 0);
3284 	skb_reset_network_header(skb);
3285 	skb_reset_transport_header(skb);
3286 	skb_reset_mac_header(skb);
3287 
3288 	if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
3289 		struct fib_rt_info fri;
3290 
3291 		if (!res.fi) {
3292 			err = fib_props[res.type].error;
3293 			if (!err)
3294 				err = -EHOSTUNREACH;
3295 			goto errout_rcu;
3296 		}
3297 		fri.fi = res.fi;
3298 		fri.tb_id = table_id;
3299 		fri.dst = res.prefix;
3300 		fri.dst_len = res.prefixlen;
3301 		fri.tos = fl4.flowi4_tos;
3302 		fri.type = rt->rt_type;
3303 		fri.offload = 0;
3304 		fri.trap = 0;
3305 		if (res.fa_head) {
3306 			struct fib_alias *fa;
3307 
3308 			hlist_for_each_entry_rcu(fa, res.fa_head, fa_list) {
3309 				u8 slen = 32 - fri.dst_len;
3310 
3311 				if (fa->fa_slen == slen &&
3312 				    fa->tb_id == fri.tb_id &&
3313 				    fa->fa_tos == fri.tos &&
3314 				    fa->fa_info == res.fi &&
3315 				    fa->fa_type == fri.type) {
3316 					fri.offload = fa->offload;
3317 					fri.trap = fa->trap;
3318 					break;
3319 				}
3320 			}
3321 		}
3322 		err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
3323 				    nlh->nlmsg_seq, RTM_NEWROUTE, &fri, 0);
3324 	} else {
3325 		err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
3326 				   NETLINK_CB(in_skb).portid,
3327 				   nlh->nlmsg_seq, 0);
3328 	}
3329 	if (err < 0)
3330 		goto errout_rcu;
3331 
3332 	rcu_read_unlock();
3333 
3334 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
3335 
3336 errout_free:
3337 	return err;
3338 errout_rcu:
3339 	rcu_read_unlock();
3340 	kfree_skb(skb);
3341 	goto errout_free;
3342 }
3343 
3344 void ip_rt_multicast_event(struct in_device *in_dev)
3345 {
3346 	rt_cache_flush(dev_net(in_dev->dev));
3347 }
3348 
3349 #ifdef CONFIG_SYSCTL
3350 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
3351 static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
3352 static int ip_rt_gc_elasticity __read_mostly	= 8;
3353 static int ip_min_valid_pmtu __read_mostly	= IPV4_MIN_MTU;
3354 
3355 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
3356 		void *buffer, size_t *lenp, loff_t *ppos)
3357 {
3358 	struct net *net = (struct net *)__ctl->extra1;
3359 
3360 	if (write) {
3361 		rt_cache_flush(net);
3362 		fnhe_genid_bump(net);
3363 		return 0;
3364 	}
3365 
3366 	return -EINVAL;
3367 }
3368 
3369 static struct ctl_table ipv4_route_table[] = {
3370 	{
3371 		.procname	= "gc_thresh",
3372 		.data		= &ipv4_dst_ops.gc_thresh,
3373 		.maxlen		= sizeof(int),
3374 		.mode		= 0644,
3375 		.proc_handler	= proc_dointvec,
3376 	},
3377 	{
3378 		.procname	= "max_size",
3379 		.data		= &ip_rt_max_size,
3380 		.maxlen		= sizeof(int),
3381 		.mode		= 0644,
3382 		.proc_handler	= proc_dointvec,
3383 	},
3384 	{
3385 		/*  Deprecated. Use gc_min_interval_ms */
3386 
3387 		.procname	= "gc_min_interval",
3388 		.data		= &ip_rt_gc_min_interval,
3389 		.maxlen		= sizeof(int),
3390 		.mode		= 0644,
3391 		.proc_handler	= proc_dointvec_jiffies,
3392 	},
3393 	{
3394 		.procname	= "gc_min_interval_ms",
3395 		.data		= &ip_rt_gc_min_interval,
3396 		.maxlen		= sizeof(int),
3397 		.mode		= 0644,
3398 		.proc_handler	= proc_dointvec_ms_jiffies,
3399 	},
3400 	{
3401 		.procname	= "gc_timeout",
3402 		.data		= &ip_rt_gc_timeout,
3403 		.maxlen		= sizeof(int),
3404 		.mode		= 0644,
3405 		.proc_handler	= proc_dointvec_jiffies,
3406 	},
3407 	{
3408 		.procname	= "gc_interval",
3409 		.data		= &ip_rt_gc_interval,
3410 		.maxlen		= sizeof(int),
3411 		.mode		= 0644,
3412 		.proc_handler	= proc_dointvec_jiffies,
3413 	},
3414 	{
3415 		.procname	= "redirect_load",
3416 		.data		= &ip_rt_redirect_load,
3417 		.maxlen		= sizeof(int),
3418 		.mode		= 0644,
3419 		.proc_handler	= proc_dointvec,
3420 	},
3421 	{
3422 		.procname	= "redirect_number",
3423 		.data		= &ip_rt_redirect_number,
3424 		.maxlen		= sizeof(int),
3425 		.mode		= 0644,
3426 		.proc_handler	= proc_dointvec,
3427 	},
3428 	{
3429 		.procname	= "redirect_silence",
3430 		.data		= &ip_rt_redirect_silence,
3431 		.maxlen		= sizeof(int),
3432 		.mode		= 0644,
3433 		.proc_handler	= proc_dointvec,
3434 	},
3435 	{
3436 		.procname	= "error_cost",
3437 		.data		= &ip_rt_error_cost,
3438 		.maxlen		= sizeof(int),
3439 		.mode		= 0644,
3440 		.proc_handler	= proc_dointvec,
3441 	},
3442 	{
3443 		.procname	= "error_burst",
3444 		.data		= &ip_rt_error_burst,
3445 		.maxlen		= sizeof(int),
3446 		.mode		= 0644,
3447 		.proc_handler	= proc_dointvec,
3448 	},
3449 	{
3450 		.procname	= "gc_elasticity",
3451 		.data		= &ip_rt_gc_elasticity,
3452 		.maxlen		= sizeof(int),
3453 		.mode		= 0644,
3454 		.proc_handler	= proc_dointvec,
3455 	},
3456 	{
3457 		.procname	= "mtu_expires",
3458 		.data		= &ip_rt_mtu_expires,
3459 		.maxlen		= sizeof(int),
3460 		.mode		= 0644,
3461 		.proc_handler	= proc_dointvec_jiffies,
3462 	},
3463 	{
3464 		.procname	= "min_pmtu",
3465 		.data		= &ip_rt_min_pmtu,
3466 		.maxlen		= sizeof(int),
3467 		.mode		= 0644,
3468 		.proc_handler	= proc_dointvec_minmax,
3469 		.extra1		= &ip_min_valid_pmtu,
3470 	},
3471 	{
3472 		.procname	= "min_adv_mss",
3473 		.data		= &ip_rt_min_advmss,
3474 		.maxlen		= sizeof(int),
3475 		.mode		= 0644,
3476 		.proc_handler	= proc_dointvec,
3477 	},
3478 	{ }
3479 };
3480 
3481 static const char ipv4_route_flush_procname[] = "flush";
3482 
3483 static struct ctl_table ipv4_route_flush_table[] = {
3484 	{
3485 		.procname	= ipv4_route_flush_procname,
3486 		.maxlen		= sizeof(int),
3487 		.mode		= 0200,
3488 		.proc_handler	= ipv4_sysctl_rtcache_flush,
3489 	},
3490 	{ },
3491 };
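/* Writing anything to /proc/sys/net/ipv4/route/flush (e.g.
 * "echo 1 > /proc/sys/net/ipv4/route/flush") ends up in
 * ipv4_sysctl_rtcache_flush() above, which flushes cached routes and bumps
 * the fnhe genid; the entry is write-only (mode 0200).
 */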
3492 
3493 static __net_init int sysctl_route_net_init(struct net *net)
3494 {
3495 	struct ctl_table *tbl;
3496 
3497 	tbl = ipv4_route_flush_table;
3498 	if (!net_eq(net, &init_net)) {
3499 		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3500 		if (!tbl)
3501 			goto err_dup;
3502 
3503 		/* Don't export non-whitelisted sysctls to unprivileged users */
3504 		if (net->user_ns != &init_user_ns) {
3505 			if (tbl[0].procname != ipv4_route_flush_procname)
3506 				tbl[0].procname = NULL;
3507 		}
3508 	}
3509 	tbl[0].extra1 = net;
3510 
3511 	net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
3512 	if (!net->ipv4.route_hdr)
3513 		goto err_reg;
3514 	return 0;
3515 
3516 err_reg:
3517 	if (tbl != ipv4_route_flush_table)
3518 		kfree(tbl);
3519 err_dup:
3520 	return -ENOMEM;
3521 }
3522 
3523 static __net_exit void sysctl_route_net_exit(struct net *net)
3524 {
3525 	struct ctl_table *tbl;
3526 
3527 	tbl = net->ipv4.route_hdr->ctl_table_arg;
3528 	unregister_net_sysctl_table(net->ipv4.route_hdr);
3529 	BUG_ON(tbl == ipv4_route_flush_table);
3530 	kfree(tbl);
3531 }
3532 
3533 static __net_initdata struct pernet_operations sysctl_route_ops = {
3534 	.init = sysctl_route_net_init,
3535 	.exit = sysctl_route_net_exit,
3536 };
3537 #endif
3538 
3539 static __net_init int rt_genid_init(struct net *net)
3540 {
3541 	atomic_set(&net->ipv4.rt_genid, 0);
3542 	atomic_set(&net->fnhe_genid, 0);
3543 	atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
3544 	return 0;
3545 }
3546 
3547 static __net_initdata struct pernet_operations rt_genid_ops = {
3548 	.init = rt_genid_init,
3549 };
3550 
3551 static int __net_init ipv4_inetpeer_init(struct net *net)
3552 {
3553 	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3554 
3555 	if (!bp)
3556 		return -ENOMEM;
3557 	inet_peer_base_init(bp);
3558 	net->ipv4.peers = bp;
3559 	return 0;
3560 }
3561 
3562 static void __net_exit ipv4_inetpeer_exit(struct net *net)
3563 {
3564 	struct inet_peer_base *bp = net->ipv4.peers;
3565 
3566 	net->ipv4.peers = NULL;
3567 	inetpeer_invalidate_tree(bp);
3568 	kfree(bp);
3569 }
3570 
3571 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3572 	.init	=	ipv4_inetpeer_init,
3573 	.exit	=	ipv4_inetpeer_exit,
3574 };
3575 
3576 #ifdef CONFIG_IP_ROUTE_CLASSID
3577 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3578 #endif /* CONFIG_IP_ROUTE_CLASSID */
3579 
3580 int __init ip_rt_init(void)
3581 {
3582 	void *idents_hash;
3583 	int cpu;
3584 
3585 	/* For modern hosts, this will use 2 MB of memory */
3586 	idents_hash = alloc_large_system_hash("IP idents",
3587 					      sizeof(*ip_idents) + sizeof(*ip_tstamps),
3588 					      0,
3589 					      16, /* one bucket per 64 KB */
3590 					      HASH_ZERO,
3591 					      NULL,
3592 					      &ip_idents_mask,
3593 					      2048,
3594 					      256*1024);
3595 
3596 	ip_idents = idents_hash;
3597 
3598 	prandom_bytes(ip_idents, (ip_idents_mask + 1) * sizeof(*ip_idents));
3599 
3600 	ip_tstamps = idents_hash + (ip_idents_mask + 1) * sizeof(*ip_idents);
3601 
3602 	for_each_possible_cpu(cpu) {
3603 		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3604 
3605 		INIT_LIST_HEAD(&ul->head);
3606 		spin_lock_init(&ul->lock);
3607 	}
3608 #ifdef CONFIG_IP_ROUTE_CLASSID
3609 	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3610 	if (!ip_rt_acct)
3611 		panic("IP: failed to allocate ip_rt_acct\n");
3612 #endif
3613 
3614 	ipv4_dst_ops.kmem_cachep =
3615 		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3616 				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3617 
3618 	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3619 
3620 	if (dst_entries_init(&ipv4_dst_ops) < 0)
3621 		panic("IP: failed to allocate ipv4_dst_ops counter\n");
3622 
3623 	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3624 		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3625 
3626 	ipv4_dst_ops.gc_thresh = ~0;
3627 	ip_rt_max_size = INT_MAX;
3628 
3629 	devinet_init();
3630 	ip_fib_init();
3631 
3632 	if (ip_rt_proc_init())
3633 		pr_err("Unable to create route proc files\n");
3634 #ifdef CONFIG_XFRM
3635 	xfrm_init();
3636 	xfrm4_init();
3637 #endif
3638 	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
3639 		      RTNL_FLAG_DOIT_UNLOCKED);
3640 
3641 #ifdef CONFIG_SYSCTL
3642 	register_pernet_subsys(&sysctl_route_ops);
3643 #endif
3644 	register_pernet_subsys(&rt_genid_ops);
3645 	register_pernet_subsys(&ipv4_inetpeer_ops);
3646 	return 0;
3647 }
3648 
3649 #ifdef CONFIG_SYSCTL
3650 /*
3651  * We really need to sanitize the damn ipv4 init order, then all
3652  * this nonsense will go away.
3653  */
3654 void __init ip_static_sysctl_init(void)
3655 {
3656 	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3657 }
3658 #endif
3659