1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		ROUTE - implementation of the IP router.
8  *
9  * Authors:	Ross Biro
10  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
11  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
12  *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
13  *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
14  *
15  * Fixes:
16  *		Alan Cox	:	Verify area fixes.
17  *		Alan Cox	:	cli() protects routing changes
18  *		Rui Oliveira	:	ICMP routing table updates
19  *		(rco@di.uminho.pt)	Routing table insertion and update
20  *		Linus Torvalds	:	Rewrote bits to be sensible
21  *		Alan Cox	:	Added BSD route gw semantics
22  *		Alan Cox	:	Super /proc >4K
23  *		Alan Cox	:	MTU in route table
24  *		Alan Cox	: 	MSS actually. Also added the window
25  *					clamper.
26  *		Sam Lantinga	:	Fixed route matching in rt_del()
27  *		Alan Cox	:	Routing cache support.
28  *		Alan Cox	:	Removed compatibility cruft.
29  *		Alan Cox	:	RTF_REJECT support.
30  *		Alan Cox	:	TCP irtt support.
31  *		Jonathan Naylor	:	Added Metric support.
32  *	Miquel van Smoorenburg	:	BSD API fixes.
33  *	Miquel van Smoorenburg	:	Metrics.
34  *		Alan Cox	:	Use __u32 properly
35  *		Alan Cox	:	Aligned routing errors more closely with BSD
36  *					our system is still very different.
37  *		Alan Cox	:	Faster /proc handling
38  *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
39  *					routing caches and better behaviour.
40  *
41  *		Olaf Erb	:	irtt wasn't being copied right.
42  *		Bjorn Ekwall	:	Kerneld route support.
43  *		Alan Cox	:	Multicast fixed (I hope)
44  * 		Pavel Krauz	:	Limited broadcast fixed
45  *		Mike McLagan	:	Routing by source
46  *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
47  *					route.c and rewritten from scratch.
48  *		Andi Kleen	:	Load-limit warning messages.
49  *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
50  *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
51  *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
52  *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
53  *		Marc Boucher	:	routing by fwmark
54  *	Robert Olsson		:	Added rt_cache statistics
55  *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
56  *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
57  * 	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
58  * 	Ilia Sotnikov		:	Removed TOS from hash calculations
59  */
60 
61 #define pr_fmt(fmt) "IPv4: " fmt
62 
63 #include <linux/module.h>
64 #include <linux/uaccess.h>
65 #include <linux/bitops.h>
66 #include <linux/types.h>
67 #include <linux/kernel.h>
68 #include <linux/mm.h>
69 #include <linux/memblock.h>
70 #include <linux/string.h>
71 #include <linux/socket.h>
72 #include <linux/sockios.h>
73 #include <linux/errno.h>
74 #include <linux/in.h>
75 #include <linux/inet.h>
76 #include <linux/netdevice.h>
77 #include <linux/proc_fs.h>
78 #include <linux/init.h>
79 #include <linux/skbuff.h>
80 #include <linux/inetdevice.h>
81 #include <linux/igmp.h>
82 #include <linux/pkt_sched.h>
83 #include <linux/mroute.h>
84 #include <linux/netfilter_ipv4.h>
85 #include <linux/random.h>
86 #include <linux/rcupdate.h>
87 #include <linux/times.h>
88 #include <linux/slab.h>
89 #include <linux/jhash.h>
90 #include <net/dst.h>
91 #include <net/dst_metadata.h>
92 #include <net/net_namespace.h>
93 #include <net/protocol.h>
94 #include <net/ip.h>
95 #include <net/route.h>
96 #include <net/inetpeer.h>
97 #include <net/sock.h>
98 #include <net/ip_fib.h>
99 #include <net/nexthop.h>
100 #include <net/arp.h>
101 #include <net/tcp.h>
102 #include <net/icmp.h>
103 #include <net/xfrm.h>
104 #include <net/lwtunnel.h>
105 #include <net/netevent.h>
106 #include <net/rtnetlink.h>
107 #ifdef CONFIG_SYSCTL
108 #include <linux/sysctl.h>
109 #endif
110 #include <net/secure_seq.h>
111 #include <net/ip_tunnels.h>
112 #include <net/l3mdev.h>
113 
114 #include "fib_lookup.h"
115 
116 #define RT_FL_TOS(oldflp4) \
117 	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
118 
119 #define RT_GC_TIMEOUT (300*HZ)
120 
121 static int ip_rt_max_size;
122 static int ip_rt_redirect_number __read_mostly	= 9;
123 static int ip_rt_redirect_load __read_mostly	= HZ / 50;
124 static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
125 static int ip_rt_error_cost __read_mostly	= HZ;
126 static int ip_rt_error_burst __read_mostly	= 5 * HZ;
127 static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
128 static u32 ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
129 static int ip_rt_min_advmss __read_mostly	= 256;
130 
131 static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
132 
133 /*
134  *	Interface to generic destination cache.
135  */
136 
137 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
138 static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
139 static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
140 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
141 static void		 ipv4_link_failure(struct sk_buff *skb);
142 static void		 ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
143 					   struct sk_buff *skb, u32 mtu,
144 					   bool confirm_neigh);
145 static void		 ip_do_redirect(struct dst_entry *dst, struct sock *sk,
146 					struct sk_buff *skb);
147 static void		ipv4_dst_destroy(struct dst_entry *dst);
148 
149 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
150 {
151 	WARN_ON(1);
152 	return NULL;
153 }
154 
155 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
156 					   struct sk_buff *skb,
157 					   const void *daddr);
158 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
159 
160 static struct dst_ops ipv4_dst_ops = {
161 	.family =		AF_INET,
162 	.check =		ipv4_dst_check,
163 	.default_advmss =	ipv4_default_advmss,
164 	.mtu =			ipv4_mtu,
165 	.cow_metrics =		ipv4_cow_metrics,
166 	.destroy =		ipv4_dst_destroy,
167 	.negative_advice =	ipv4_negative_advice,
168 	.link_failure =		ipv4_link_failure,
169 	.update_pmtu =		ip_rt_update_pmtu,
170 	.redirect =		ip_do_redirect,
171 	.local_out =		__ip_local_out,
172 	.neigh_lookup =		ipv4_neigh_lookup,
173 	.confirm_neigh =	ipv4_confirm_neigh,
174 };
175 
176 #define ECN_OR_COST(class)	TC_PRIO_##class
177 
178 const __u8 ip_tos2prio[16] = {
179 	TC_PRIO_BESTEFFORT,
180 	ECN_OR_COST(BESTEFFORT),
181 	TC_PRIO_BESTEFFORT,
182 	ECN_OR_COST(BESTEFFORT),
183 	TC_PRIO_BULK,
184 	ECN_OR_COST(BULK),
185 	TC_PRIO_BULK,
186 	ECN_OR_COST(BULK),
187 	TC_PRIO_INTERACTIVE,
188 	ECN_OR_COST(INTERACTIVE),
189 	TC_PRIO_INTERACTIVE,
190 	ECN_OR_COST(INTERACTIVE),
191 	TC_PRIO_INTERACTIVE_BULK,
192 	ECN_OR_COST(INTERACTIVE_BULK),
193 	TC_PRIO_INTERACTIVE_BULK,
194 	ECN_OR_COST(INTERACTIVE_BULK)
195 };
196 EXPORT_SYMBOL(ip_tos2prio);
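/* Callers typically index this table with the upper four TOS bits,
 * e.g. via rt_tos2priority() in include/net/route.h: IPTOS_LOWDELAY
 * (0x10) selects entry 8 and therefore maps to TC_PRIO_INTERACTIVE.
 */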
197 
198 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
199 #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
200 
201 #ifdef CONFIG_PROC_FS
202 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
203 {
204 	if (*pos)
205 		return NULL;
206 	return SEQ_START_TOKEN;
207 }
208 
209 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
210 {
211 	++*pos;
212 	return NULL;
213 }
214 
215 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
216 {
217 }
218 
219 static int rt_cache_seq_show(struct seq_file *seq, void *v)
220 {
221 	if (v == SEQ_START_TOKEN)
222 		seq_printf(seq, "%-127s\n",
223 			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
224 			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
225 			   "HHUptod\tSpecDst");
226 	return 0;
227 }
228 
229 static const struct seq_operations rt_cache_seq_ops = {
230 	.start  = rt_cache_seq_start,
231 	.next   = rt_cache_seq_next,
232 	.stop   = rt_cache_seq_stop,
233 	.show   = rt_cache_seq_show,
234 };
235 
236 static int rt_cache_seq_open(struct inode *inode, struct file *file)
237 {
238 	return seq_open(file, &rt_cache_seq_ops);
239 }
240 
241 static const struct proc_ops rt_cache_proc_ops = {
242 	.proc_open	= rt_cache_seq_open,
243 	.proc_read	= seq_read,
244 	.proc_lseek	= seq_lseek,
245 	.proc_release	= seq_release,
246 };
247 
248 
249 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
250 {
251 	int cpu;
252 
253 	if (*pos == 0)
254 		return SEQ_START_TOKEN;
255 
256 	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
257 		if (!cpu_possible(cpu))
258 			continue;
259 		*pos = cpu+1;
260 		return &per_cpu(rt_cache_stat, cpu);
261 	}
262 	return NULL;
263 }
264 
265 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
266 {
267 	int cpu;
268 
269 	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
270 		if (!cpu_possible(cpu))
271 			continue;
272 		*pos = cpu+1;
273 		return &per_cpu(rt_cache_stat, cpu);
274 	}
275 	(*pos)++;
276 	return NULL;
277 
278 }
279 
280 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
281 {
282 
283 }
284 
285 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
286 {
287 	struct rt_cache_stat *st = v;
288 
289 	if (v == SEQ_START_TOKEN) {
290 		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
291 		return 0;
292 	}
293 
294 	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
295 		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
296 		   dst_entries_get_slow(&ipv4_dst_ops),
297 		   0, /* st->in_hit */
298 		   st->in_slow_tot,
299 		   st->in_slow_mc,
300 		   st->in_no_route,
301 		   st->in_brd,
302 		   st->in_martian_dst,
303 		   st->in_martian_src,
304 
305 		   0, /* st->out_hit */
306 		   st->out_slow_tot,
307 		   st->out_slow_mc,
308 
309 		   0, /* st->gc_total */
310 		   0, /* st->gc_ignored */
311 		   0, /* st->gc_goal_miss */
312 		   0, /* st->gc_dst_overflow */
313 		   0, /* st->in_hlist_search */
314 		   0  /* st->out_hlist_search */
315 		);
316 	return 0;
317 }
318 
319 static const struct seq_operations rt_cpu_seq_ops = {
320 	.start  = rt_cpu_seq_start,
321 	.next   = rt_cpu_seq_next,
322 	.stop   = rt_cpu_seq_stop,
323 	.show   = rt_cpu_seq_show,
324 };
325 
326 
327 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
328 {
329 	return seq_open(file, &rt_cpu_seq_ops);
330 }
331 
332 static const struct proc_ops rt_cpu_proc_ops = {
333 	.proc_open	= rt_cpu_seq_open,
334 	.proc_read	= seq_read,
335 	.proc_lseek	= seq_lseek,
336 	.proc_release	= seq_release,
337 };
338 
339 #ifdef CONFIG_IP_ROUTE_CLASSID
340 static int rt_acct_proc_show(struct seq_file *m, void *v)
341 {
342 	struct ip_rt_acct *dst, *src;
343 	unsigned int i, j;
344 
345 	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
346 	if (!dst)
347 		return -ENOMEM;
348 
349 	for_each_possible_cpu(i) {
350 		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
351 		for (j = 0; j < 256; j++) {
352 			dst[j].o_bytes   += src[j].o_bytes;
353 			dst[j].o_packets += src[j].o_packets;
354 			dst[j].i_bytes   += src[j].i_bytes;
355 			dst[j].i_packets += src[j].i_packets;
356 		}
357 	}
358 
359 	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
360 	kfree(dst);
361 	return 0;
362 }
363 #endif
364 
365 static int __net_init ip_rt_do_proc_init(struct net *net)
366 {
367 	struct proc_dir_entry *pde;
368 
369 	pde = proc_create("rt_cache", 0444, net->proc_net,
370 			  &rt_cache_proc_ops);
371 	if (!pde)
372 		goto err1;
373 
374 	pde = proc_create("rt_cache", 0444,
375 			  net->proc_net_stat, &rt_cpu_proc_ops);
376 	if (!pde)
377 		goto err2;
378 
379 #ifdef CONFIG_IP_ROUTE_CLASSID
380 	pde = proc_create_single("rt_acct", 0, net->proc_net,
381 			rt_acct_proc_show);
382 	if (!pde)
383 		goto err3;
384 #endif
385 	return 0;
386 
387 #ifdef CONFIG_IP_ROUTE_CLASSID
388 err3:
389 	remove_proc_entry("rt_cache", net->proc_net_stat);
390 #endif
391 err2:
392 	remove_proc_entry("rt_cache", net->proc_net);
393 err1:
394 	return -ENOMEM;
395 }
396 
397 static void __net_exit ip_rt_do_proc_exit(struct net *net)
398 {
399 	remove_proc_entry("rt_cache", net->proc_net_stat);
400 	remove_proc_entry("rt_cache", net->proc_net);
401 #ifdef CONFIG_IP_ROUTE_CLASSID
402 	remove_proc_entry("rt_acct", net->proc_net);
403 #endif
404 }
405 
406 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
407 	.init = ip_rt_do_proc_init,
408 	.exit = ip_rt_do_proc_exit,
409 };
410 
411 static int __init ip_rt_proc_init(void)
412 {
413 	return register_pernet_subsys(&ip_rt_proc_ops);
414 }
415 
416 #else
417 static inline int ip_rt_proc_init(void)
418 {
419 	return 0;
420 }
421 #endif /* CONFIG_PROC_FS */
422 
423 static inline bool rt_is_expired(const struct rtable *rth)
424 {
425 	return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
426 }
427 
428 void rt_cache_flush(struct net *net)
429 {
430 	rt_genid_bump_ipv4(net);
431 }
432 
433 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
434 					   struct sk_buff *skb,
435 					   const void *daddr)
436 {
437 	const struct rtable *rt = container_of(dst, struct rtable, dst);
438 	struct net_device *dev = dst->dev;
439 	struct neighbour *n;
440 
441 	rcu_read_lock_bh();
442 
443 	if (likely(rt->rt_gw_family == AF_INET)) {
444 		n = ip_neigh_gw4(dev, rt->rt_gw4);
445 	} else if (rt->rt_gw_family == AF_INET6) {
446 		n = ip_neigh_gw6(dev, &rt->rt_gw6);
447         } else {
448 		__be32 pkey;
449 
450 		pkey = skb ? ip_hdr(skb)->daddr : *((__be32 *) daddr);
451 		n = ip_neigh_gw4(dev, pkey);
452 	}
453 
454 	if (!IS_ERR(n) && !refcount_inc_not_zero(&n->refcnt))
455 		n = NULL;
456 
457 	rcu_read_unlock_bh();
458 
459 	return n;
460 }
461 
462 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
463 {
464 	const struct rtable *rt = container_of(dst, struct rtable, dst);
465 	struct net_device *dev = dst->dev;
466 	const __be32 *pkey = daddr;
467 
468 	if (rt->rt_gw_family == AF_INET) {
469 		pkey = (const __be32 *)&rt->rt_gw4;
470 	} else if (rt->rt_gw_family == AF_INET6) {
471 		return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6);
472 	} else if (!daddr ||
473 		 (rt->rt_flags &
474 		  (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) {
475 		return;
476 	}
477 	__ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
478 }
479 
480 /* Hash tables of size 2048..262144 depending on RAM size.
481  * Each bucket uses 8 bytes.
482  */
483 static u32 ip_idents_mask __read_mostly;
484 static atomic_t *ip_idents __read_mostly;
485 static u32 *ip_tstamps __read_mostly;
486 
487 /* In order to protect privacy, we add a perturbation to identifiers
488  * if one generator is seldom used. This makes it hard for an attacker
489  * to infer how many packets were sent between two points in time.
490  */
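/* For example: if a bucket's timestamp is 100 jiffies old when it is
 * next used, cmpxchg() updates the stamp and a random delta in
 * [0, 100) is added to the counter, so two sampled IDs no longer
 * reveal exactly how many identifiers were reserved in between.
 */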
491 u32 ip_idents_reserve(u32 hash, int segs)
492 {
493 	u32 bucket, old, now = (u32)jiffies;
494 	atomic_t *p_id;
495 	u32 *p_tstamp;
496 	u32 delta = 0;
497 
498 	bucket = hash & ip_idents_mask;
499 	p_tstamp = ip_tstamps + bucket;
500 	p_id = ip_idents + bucket;
501 	old = READ_ONCE(*p_tstamp);
502 
503 	if (old != now && cmpxchg(p_tstamp, old, now) == old)
504 		delta = prandom_u32_max(now - old);
505 
506 	/* If UBSAN reports an error here, please make sure your compiler
507 	 * supports -fno-strict-overflow before reporting it; that was a bug
508 	 * in UBSAN, and it has been fixed in GCC 8.
509 	 */
510 	return atomic_add_return(segs + delta, p_id) - segs;
511 }
512 EXPORT_SYMBOL(ip_idents_reserve);
513 
514 void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
515 {
516 	u32 hash, id;
517 
518 	/* Note the following code is not safe, but this is okay. */
519 	if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
520 		get_random_bytes(&net->ipv4.ip_id_key,
521 				 sizeof(net->ipv4.ip_id_key));
522 
523 	hash = siphash_3u32((__force u32)iph->daddr,
524 			    (__force u32)iph->saddr,
525 			    iph->protocol,
526 			    &net->ipv4.ip_id_key);
527 	id = ip_idents_reserve(hash, segs);
528 	iph->id = htons(id);
529 }
530 EXPORT_SYMBOL(__ip_select_ident);
531 
532 static void ip_rt_fix_tos(struct flowi4 *fl4)
533 {
534 	__u8 tos = RT_FL_TOS(fl4);
535 
536 	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
537 	fl4->flowi4_scope = tos & RTO_ONLINK ?
538 			    RT_SCOPE_LINK : RT_SCOPE_UNIVERSE;
539 }
540 
541 static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
542 			     const struct sock *sk,
543 			     const struct iphdr *iph,
544 			     int oif, u8 tos,
545 			     u8 prot, u32 mark, int flow_flags)
546 {
547 	if (sk) {
548 		const struct inet_sock *inet = inet_sk(sk);
549 
550 		oif = sk->sk_bound_dev_if;
551 		mark = sk->sk_mark;
552 		tos = RT_CONN_FLAGS(sk);
553 		prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
554 	}
555 	flowi4_init_output(fl4, oif, mark, tos,
556 			   RT_SCOPE_UNIVERSE, prot,
557 			   flow_flags,
558 			   iph->daddr, iph->saddr, 0, 0,
559 			   sock_net_uid(net, sk));
560 }
561 
562 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
563 			       const struct sock *sk)
564 {
565 	const struct net *net = dev_net(skb->dev);
566 	const struct iphdr *iph = ip_hdr(skb);
567 	int oif = skb->dev->ifindex;
568 	u8 tos = RT_TOS(iph->tos);
569 	u8 prot = iph->protocol;
570 	u32 mark = skb->mark;
571 
572 	__build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
573 }
574 
575 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
576 {
577 	const struct inet_sock *inet = inet_sk(sk);
578 	const struct ip_options_rcu *inet_opt;
579 	__be32 daddr = inet->inet_daddr;
580 
581 	rcu_read_lock();
582 	inet_opt = rcu_dereference(inet->inet_opt);
583 	if (inet_opt && inet_opt->opt.srr)
584 		daddr = inet_opt->opt.faddr;
585 	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
586 			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
587 			   inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
588 			   inet_sk_flowi_flags(sk),
589 			   daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
590 	rcu_read_unlock();
591 }
592 
593 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
594 				 const struct sk_buff *skb)
595 {
596 	if (skb)
597 		build_skb_flow_key(fl4, skb, sk);
598 	else
599 		build_sk_flow_key(fl4, sk);
600 }
601 
602 static DEFINE_SPINLOCK(fnhe_lock);
603 
604 static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
605 {
606 	struct rtable *rt;
607 
608 	rt = rcu_dereference(fnhe->fnhe_rth_input);
609 	if (rt) {
610 		RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
611 		dst_dev_put(&rt->dst);
612 		dst_release(&rt->dst);
613 	}
614 	rt = rcu_dereference(fnhe->fnhe_rth_output);
615 	if (rt) {
616 		RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
617 		dst_dev_put(&rt->dst);
618 		dst_release(&rt->dst);
619 	}
620 }
621 
622 static void fnhe_remove_oldest(struct fnhe_hash_bucket *hash)
623 {
624 	struct fib_nh_exception __rcu **fnhe_p, **oldest_p;
625 	struct fib_nh_exception *fnhe, *oldest = NULL;
626 
627 	for (fnhe_p = &hash->chain; ; fnhe_p = &fnhe->fnhe_next) {
628 		fnhe = rcu_dereference_protected(*fnhe_p,
629 						 lockdep_is_held(&fnhe_lock));
630 		if (!fnhe)
631 			break;
632 		if (!oldest ||
633 		    time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp)) {
634 			oldest = fnhe;
635 			oldest_p = fnhe_p;
636 		}
637 	}
638 	fnhe_flush_routes(oldest);
639 	*oldest_p = oldest->fnhe_next;
640 	kfree_rcu(oldest, rcu);
641 }
642 
643 static u32 fnhe_hashfun(__be32 daddr)
644 {
645 	static siphash_key_t fnhe_hash_key __read_mostly;
646 	u64 hval;
647 
648 	net_get_random_once(&fnhe_hash_key, sizeof(fnhe_hash_key));
649 	hval = siphash_1u32((__force u32)daddr, &fnhe_hash_key);
650 	return hash_64(hval, FNHE_HASH_SHIFT);
651 }
652 
653 static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
654 {
655 	rt->rt_pmtu = fnhe->fnhe_pmtu;
656 	rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
657 	rt->dst.expires = fnhe->fnhe_expires;
658 
659 	if (fnhe->fnhe_gw) {
660 		rt->rt_flags |= RTCF_REDIRECTED;
661 		rt->rt_uses_gateway = 1;
662 		rt->rt_gw_family = AF_INET;
663 		rt->rt_gw4 = fnhe->fnhe_gw;
664 	}
665 }
666 
667 static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr,
668 				  __be32 gw, u32 pmtu, bool lock,
669 				  unsigned long expires)
670 {
671 	struct fnhe_hash_bucket *hash;
672 	struct fib_nh_exception *fnhe;
673 	struct rtable *rt;
674 	u32 genid, hval;
675 	unsigned int i;
676 	int depth;
677 
678 	genid = fnhe_genid(dev_net(nhc->nhc_dev));
679 	hval = fnhe_hashfun(daddr);
680 
681 	spin_lock_bh(&fnhe_lock);
682 
683 	hash = rcu_dereference(nhc->nhc_exceptions);
684 	if (!hash) {
685 		hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
686 		if (!hash)
687 			goto out_unlock;
688 		rcu_assign_pointer(nhc->nhc_exceptions, hash);
689 	}
690 
691 	hash += hval;
692 
693 	depth = 0;
694 	for (fnhe = rcu_dereference(hash->chain); fnhe;
695 	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
696 		if (fnhe->fnhe_daddr == daddr)
697 			break;
698 		depth++;
699 	}
700 
701 	if (fnhe) {
702 		if (fnhe->fnhe_genid != genid)
703 			fnhe->fnhe_genid = genid;
704 		if (gw)
705 			fnhe->fnhe_gw = gw;
706 		if (pmtu) {
707 			fnhe->fnhe_pmtu = pmtu;
708 			fnhe->fnhe_mtu_locked = lock;
709 		}
710 		fnhe->fnhe_expires = max(1UL, expires);
711 		/* Update all cached dsts too */
712 		rt = rcu_dereference(fnhe->fnhe_rth_input);
713 		if (rt)
714 			fill_route_from_fnhe(rt, fnhe);
715 		rt = rcu_dereference(fnhe->fnhe_rth_output);
716 		if (rt)
717 			fill_route_from_fnhe(rt, fnhe);
718 	} else {
719 		/* Randomize max depth to avoid some side channels attacks. */
720 		int max_depth = FNHE_RECLAIM_DEPTH +
721 				prandom_u32_max(FNHE_RECLAIM_DEPTH);
722 
723 		while (depth > max_depth) {
724 			fnhe_remove_oldest(hash);
725 			depth--;
726 		}
727 
728 		fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
729 		if (!fnhe)
730 			goto out_unlock;
731 
732 		fnhe->fnhe_next = hash->chain;
733 
734 		fnhe->fnhe_genid = genid;
735 		fnhe->fnhe_daddr = daddr;
736 		fnhe->fnhe_gw = gw;
737 		fnhe->fnhe_pmtu = pmtu;
738 		fnhe->fnhe_mtu_locked = lock;
739 		fnhe->fnhe_expires = max(1UL, expires);
740 
741 		rcu_assign_pointer(hash->chain, fnhe);
742 
743 		/* Exception created; mark the cached routes for the nexthop
744 		 * stale, so anyone caching it rechecks if this exception
745 		 * applies to them.
746 		 */
747 		rt = rcu_dereference(nhc->nhc_rth_input);
748 		if (rt)
749 			rt->dst.obsolete = DST_OBSOLETE_KILL;
750 
751 		for_each_possible_cpu(i) {
752 			struct rtable __rcu **prt;
753 			prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i);
754 			rt = rcu_dereference(*prt);
755 			if (rt)
756 				rt->dst.obsolete = DST_OBSOLETE_KILL;
757 		}
758 	}
759 
760 	fnhe->fnhe_stamp = jiffies;
761 
762 out_unlock:
763 	spin_unlock_bh(&fnhe_lock);
764 }
765 
766 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
767 			     bool kill_route)
768 {
769 	__be32 new_gw = icmp_hdr(skb)->un.gateway;
770 	__be32 old_gw = ip_hdr(skb)->saddr;
771 	struct net_device *dev = skb->dev;
772 	struct in_device *in_dev;
773 	struct fib_result res;
774 	struct neighbour *n;
775 	struct net *net;
776 
777 	switch (icmp_hdr(skb)->code & 7) {
778 	case ICMP_REDIR_NET:
779 	case ICMP_REDIR_NETTOS:
780 	case ICMP_REDIR_HOST:
781 	case ICMP_REDIR_HOSTTOS:
782 		break;
783 
784 	default:
785 		return;
786 	}
787 
788 	if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw)
789 		return;
790 
791 	in_dev = __in_dev_get_rcu(dev);
792 	if (!in_dev)
793 		return;
794 
795 	net = dev_net(dev);
796 	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
797 	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
798 	    ipv4_is_zeronet(new_gw))
799 		goto reject_redirect;
800 
801 	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
802 		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
803 			goto reject_redirect;
804 		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
805 			goto reject_redirect;
806 	} else {
807 		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
808 			goto reject_redirect;
809 	}
810 
811 	n = __ipv4_neigh_lookup(rt->dst.dev, (__force u32)new_gw);
812 	if (!n)
813 		n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
814 	if (!IS_ERR(n)) {
815 		if (!(n->nud_state & NUD_VALID)) {
816 			neigh_event_send(n, NULL);
817 		} else {
818 			if (fib_lookup(net, fl4, &res, 0) == 0) {
819 				struct fib_nh_common *nhc;
820 
821 				fib_select_path(net, &res, fl4, skb);
822 				nhc = FIB_RES_NHC(res);
823 				update_or_create_fnhe(nhc, fl4->daddr, new_gw,
824 						0, false,
825 						jiffies + ip_rt_gc_timeout);
826 			}
827 			if (kill_route)
828 				rt->dst.obsolete = DST_OBSOLETE_KILL;
829 			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
830 		}
831 		neigh_release(n);
832 	}
833 	return;
834 
835 reject_redirect:
836 #ifdef CONFIG_IP_ROUTE_VERBOSE
837 	if (IN_DEV_LOG_MARTIANS(in_dev)) {
838 		const struct iphdr *iph = (const struct iphdr *) skb->data;
839 		__be32 daddr = iph->daddr;
840 		__be32 saddr = iph->saddr;
841 
842 		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
843 				     "  Advised path = %pI4 -> %pI4\n",
844 				     &old_gw, dev->name, &new_gw,
845 				     &saddr, &daddr);
846 	}
847 #endif
848 	;
849 }
850 
851 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
852 {
853 	struct rtable *rt;
854 	struct flowi4 fl4;
855 	const struct iphdr *iph = (const struct iphdr *) skb->data;
856 	struct net *net = dev_net(skb->dev);
857 	int oif = skb->dev->ifindex;
858 	u8 tos = RT_TOS(iph->tos);
859 	u8 prot = iph->protocol;
860 	u32 mark = skb->mark;
861 
862 	rt = (struct rtable *) dst;
863 
864 	__build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
865 	ip_rt_fix_tos(&fl4);
866 	__ip_do_redirect(rt, skb, &fl4, true);
867 }
868 
869 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
870 {
871 	struct rtable *rt = (struct rtable *)dst;
872 	struct dst_entry *ret = dst;
873 
874 	if (rt) {
875 		if (dst->obsolete > 0) {
876 			ip_rt_put(rt);
877 			ret = NULL;
878 		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
879 			   rt->dst.expires) {
880 			ip_rt_put(rt);
881 			ret = NULL;
882 		}
883 	}
884 	return ret;
885 }
886 
887 /*
888  * Algorithm:
889  *	1. The first ip_rt_redirect_number redirects are sent
890  *	   with exponential backoff, then we stop sending them at all,
891  *	   assuming that the host ignores our redirects.
892  *	2. If we did not see packets requiring redirects
893  *	   during ip_rt_redirect_silence, we assume that the host
894  *	   forgot redirected route and start to send redirects again.
895  *
896  * This algorithm is much cheaper and more intelligent than dumb load limiting
897  * in icmp.c.
898  *
899  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
900  * and "frag. need" (breaks PMTU discovery) in icmp.c.
901  */
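/* With the defaults above and HZ=1000, this means: successive redirects
 * are spaced exponentially (ip_rt_redirect_load shifted left by the
 * redirect count, i.e. ~40ms, 80ms, 160ms, ... between them), at most
 * ip_rt_redirect_number (9) are sent, and a quiet period of roughly 20s
 * (ip_rt_redirect_silence) resets the counters.
 */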
902 
903 void ip_rt_send_redirect(struct sk_buff *skb)
904 {
905 	struct rtable *rt = skb_rtable(skb);
906 	struct in_device *in_dev;
907 	struct inet_peer *peer;
908 	struct net *net;
909 	int log_martians;
910 	int vif;
911 
912 	rcu_read_lock();
913 	in_dev = __in_dev_get_rcu(rt->dst.dev);
914 	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
915 		rcu_read_unlock();
916 		return;
917 	}
918 	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
919 	vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
920 	rcu_read_unlock();
921 
922 	net = dev_net(rt->dst.dev);
923 	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
924 	if (!peer) {
925 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
926 			  rt_nexthop(rt, ip_hdr(skb)->daddr));
927 		return;
928 	}
929 
930 	/* No redirected packets during ip_rt_redirect_silence;
931 	 * reset the algorithm.
932 	 */
933 	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
934 		peer->rate_tokens = 0;
935 		peer->n_redirects = 0;
936 	}
937 
938 	/* Too many ignored redirects; do not send anything and
939 	 * set dst.rate_last to the last seen redirected packet.
940 	 */
941 	if (peer->n_redirects >= ip_rt_redirect_number) {
942 		peer->rate_last = jiffies;
943 		goto out_put_peer;
944 	}
945 
946 	/* Check for load limit; set rate_last to the latest sent
947 	 * redirect.
948 	 */
949 	if (peer->n_redirects == 0 ||
950 	    time_after(jiffies,
951 		       (peer->rate_last +
952 			(ip_rt_redirect_load << peer->n_redirects)))) {
953 		__be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
954 
955 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
956 		peer->rate_last = jiffies;
957 		++peer->n_redirects;
958 #ifdef CONFIG_IP_ROUTE_VERBOSE
959 		if (log_martians &&
960 		    peer->n_redirects == ip_rt_redirect_number)
961 			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
962 					     &ip_hdr(skb)->saddr, inet_iif(skb),
963 					     &ip_hdr(skb)->daddr, &gw);
964 #endif
965 	}
966 out_put_peer:
967 	inet_putpeer(peer);
968 }
969 
970 static int ip_error(struct sk_buff *skb)
971 {
972 	struct rtable *rt = skb_rtable(skb);
973 	struct net_device *dev = skb->dev;
974 	struct in_device *in_dev;
975 	struct inet_peer *peer;
976 	unsigned long now;
977 	struct net *net;
978 	bool send;
979 	int code;
980 
981 	if (netif_is_l3_master(skb->dev)) {
982 		dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
983 		if (!dev)
984 			goto out;
985 	}
986 
987 	in_dev = __in_dev_get_rcu(dev);
988 
989 	/* IP on this device is disabled. */
990 	if (!in_dev)
991 		goto out;
992 
993 	net = dev_net(rt->dst.dev);
994 	if (!IN_DEV_FORWARD(in_dev)) {
995 		switch (rt->dst.error) {
996 		case EHOSTUNREACH:
997 			__IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
998 			break;
999 
1000 		case ENETUNREACH:
1001 			__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
1002 			break;
1003 		}
1004 		goto out;
1005 	}
1006 
1007 	switch (rt->dst.error) {
1008 	case EINVAL:
1009 	default:
1010 		goto out;
1011 	case EHOSTUNREACH:
1012 		code = ICMP_HOST_UNREACH;
1013 		break;
1014 	case ENETUNREACH:
1015 		code = ICMP_NET_UNREACH;
1016 		__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
1017 		break;
1018 	case EACCES:
1019 		code = ICMP_PKT_FILTERED;
1020 		break;
1021 	}
1022 
1023 	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
1024 			       l3mdev_master_ifindex(skb->dev), 1);
1025 
1026 	send = true;
1027 	if (peer) {
1028 		now = jiffies;
1029 		peer->rate_tokens += now - peer->rate_last;
1030 		if (peer->rate_tokens > ip_rt_error_burst)
1031 			peer->rate_tokens = ip_rt_error_burst;
1032 		peer->rate_last = now;
1033 		if (peer->rate_tokens >= ip_rt_error_cost)
1034 			peer->rate_tokens -= ip_rt_error_cost;
1035 		else
1036 			send = false;
1037 		inet_putpeer(peer);
1038 	}
1039 	if (send)
1040 		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1041 
1042 out:	kfree_skb(skb);
1043 	return 0;
1044 }
1045 
1046 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1047 {
1048 	struct dst_entry *dst = &rt->dst;
1049 	struct net *net = dev_net(dst->dev);
1050 	struct fib_result res;
1051 	bool lock = false;
1052 	u32 old_mtu;
1053 
1054 	if (ip_mtu_locked(dst))
1055 		return;
1056 
1057 	old_mtu = ipv4_mtu(dst);
1058 	if (old_mtu < mtu)
1059 		return;
1060 
1061 	if (mtu < ip_rt_min_pmtu) {
1062 		lock = true;
1063 		mtu = min(old_mtu, ip_rt_min_pmtu);
1064 	}
1065 
1066 	if (rt->rt_pmtu == mtu && !lock &&
1067 	    time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
1068 		return;
1069 
1070 	rcu_read_lock();
1071 	if (fib_lookup(net, fl4, &res, 0) == 0) {
1072 		struct fib_nh_common *nhc;
1073 
1074 		fib_select_path(net, &res, fl4, NULL);
1075 		nhc = FIB_RES_NHC(res);
1076 		update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
1077 				      jiffies + ip_rt_mtu_expires);
1078 	}
1079 	rcu_read_unlock();
1080 }
1081 
1082 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1083 			      struct sk_buff *skb, u32 mtu,
1084 			      bool confirm_neigh)
1085 {
1086 	struct rtable *rt = (struct rtable *) dst;
1087 	struct flowi4 fl4;
1088 
1089 	ip_rt_build_flow_key(&fl4, sk, skb);
1090 	ip_rt_fix_tos(&fl4);
1091 
1092 	/* Don't make lookup fail for bridged encapsulations */
1093 	if (skb && netif_is_any_bridge_port(skb->dev))
1094 		fl4.flowi4_oif = 0;
1095 
1096 	__ip_rt_update_pmtu(rt, &fl4, mtu);
1097 }
1098 
1099 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1100 		      int oif, u8 protocol)
1101 {
1102 	const struct iphdr *iph = (const struct iphdr *)skb->data;
1103 	struct flowi4 fl4;
1104 	struct rtable *rt;
1105 	u32 mark = IP4_REPLY_MARK(net, skb->mark);
1106 
1107 	__build_flow_key(net, &fl4, NULL, iph, oif,
1108 			 RT_TOS(iph->tos), protocol, mark, 0);
1109 	rt = __ip_route_output_key(net, &fl4);
1110 	if (!IS_ERR(rt)) {
1111 		__ip_rt_update_pmtu(rt, &fl4, mtu);
1112 		ip_rt_put(rt);
1113 	}
1114 }
1115 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1116 
1117 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1118 {
1119 	const struct iphdr *iph = (const struct iphdr *)skb->data;
1120 	struct flowi4 fl4;
1121 	struct rtable *rt;
1122 
1123 	__build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1124 
1125 	if (!fl4.flowi4_mark)
1126 		fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1127 
1128 	rt = __ip_route_output_key(sock_net(sk), &fl4);
1129 	if (!IS_ERR(rt)) {
1130 		__ip_rt_update_pmtu(rt, &fl4, mtu);
1131 		ip_rt_put(rt);
1132 	}
1133 }
1134 
1135 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1136 {
1137 	const struct iphdr *iph = (const struct iphdr *)skb->data;
1138 	struct flowi4 fl4;
1139 	struct rtable *rt;
1140 	struct dst_entry *odst = NULL;
1141 	bool new = false;
1142 	struct net *net = sock_net(sk);
1143 
1144 	bh_lock_sock(sk);
1145 
1146 	if (!ip_sk_accept_pmtu(sk))
1147 		goto out;
1148 
1149 	odst = sk_dst_get(sk);
1150 
1151 	if (sock_owned_by_user(sk) || !odst) {
1152 		__ipv4_sk_update_pmtu(skb, sk, mtu);
1153 		goto out;
1154 	}
1155 
1156 	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1157 
1158 	rt = (struct rtable *)odst;
1159 	if (odst->obsolete && !odst->ops->check(odst, 0)) {
1160 		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1161 		if (IS_ERR(rt))
1162 			goto out;
1163 
1164 		new = true;
1165 	} else {
1166 		ip_rt_fix_tos(&fl4);
1167 	}
1168 
1169 	__ip_rt_update_pmtu((struct rtable *)xfrm_dst_path(&rt->dst), &fl4, mtu);
1170 
1171 	if (!dst_check(&rt->dst, 0)) {
1172 		if (new)
1173 			dst_release(&rt->dst);
1174 
1175 		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1176 		if (IS_ERR(rt))
1177 			goto out;
1178 
1179 		new = true;
1180 	}
1181 
1182 	if (new)
1183 		sk_dst_set(sk, &rt->dst);
1184 
1185 out:
1186 	bh_unlock_sock(sk);
1187 	dst_release(odst);
1188 }
1189 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1190 
1191 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1192 		   int oif, u8 protocol)
1193 {
1194 	const struct iphdr *iph = (const struct iphdr *)skb->data;
1195 	struct flowi4 fl4;
1196 	struct rtable *rt;
1197 
1198 	__build_flow_key(net, &fl4, NULL, iph, oif,
1199 			 RT_TOS(iph->tos), protocol, 0, 0);
1200 	rt = __ip_route_output_key(net, &fl4);
1201 	if (!IS_ERR(rt)) {
1202 		__ip_do_redirect(rt, skb, &fl4, false);
1203 		ip_rt_put(rt);
1204 	}
1205 }
1206 EXPORT_SYMBOL_GPL(ipv4_redirect);
1207 
1208 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1209 {
1210 	const struct iphdr *iph = (const struct iphdr *)skb->data;
1211 	struct flowi4 fl4;
1212 	struct rtable *rt;
1213 	struct net *net = sock_net(sk);
1214 
1215 	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1216 	rt = __ip_route_output_key(net, &fl4);
1217 	if (!IS_ERR(rt)) {
1218 		__ip_do_redirect(rt, skb, &fl4, false);
1219 		ip_rt_put(rt);
1220 	}
1221 }
1222 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1223 
1224 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1225 {
1226 	struct rtable *rt = (struct rtable *) dst;
1227 
1228 	/* All IPV4 dsts are created with ->obsolete set to the value
1229 	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1230 	 * into this function always.
1231 	 *
1232 	 * When a PMTU/redirect information update invalidates a route,
1233 	 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1234 	 * DST_OBSOLETE_DEAD.
1235 	 */
1236 	if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1237 		return NULL;
1238 	return dst;
1239 }
1240 
1241 static void ipv4_send_dest_unreach(struct sk_buff *skb)
1242 {
1243 	struct net_device *dev;
1244 	struct ip_options opt;
1245 	int res;
1246 
1247 	/* Recompile ip options since IPCB may not be valid anymore.
1248 	 * Also check we have a reasonable ipv4 header.
1249 	 */
1250 	if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
1251 	    ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
1252 		return;
1253 
1254 	memset(&opt, 0, sizeof(opt));
1255 	if (ip_hdr(skb)->ihl > 5) {
1256 		if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
1257 			return;
1258 		opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);
1259 
1260 		rcu_read_lock();
1261 		dev = skb->dev ? skb->dev : skb_rtable(skb)->dst.dev;
1262 		res = __ip_options_compile(dev_net(dev), &opt, skb, NULL);
1263 		rcu_read_unlock();
1264 
1265 		if (res)
1266 			return;
1267 	}
1268 	__icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
1269 }
1270 
1271 static void ipv4_link_failure(struct sk_buff *skb)
1272 {
1273 	struct rtable *rt;
1274 
1275 	ipv4_send_dest_unreach(skb);
1276 
1277 	rt = skb_rtable(skb);
1278 	if (rt)
1279 		dst_set_expires(&rt->dst, 0);
1280 }
1281 
1282 static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1283 {
1284 	pr_debug("%s: %pI4 -> %pI4, %s\n",
1285 		 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1286 		 skb->dev ? skb->dev->name : "?");
1287 	kfree_skb(skb);
1288 	WARN_ON(1);
1289 	return 0;
1290 }
1291 
1292 /*
1293    We do not cache source address of outgoing interface,
1294    because it is used only by IP RR, TS and SRR options,
1295    so it is out of the fast path.
1296 
1297    BTW remember: "addr" is allowed to be not aligned
1298    in IP options!
1299  */
1300 
1301 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1302 {
1303 	__be32 src;
1304 
1305 	if (rt_is_output_route(rt))
1306 		src = ip_hdr(skb)->saddr;
1307 	else {
1308 		struct fib_result res;
1309 		struct iphdr *iph = ip_hdr(skb);
1310 		struct flowi4 fl4 = {
1311 			.daddr = iph->daddr,
1312 			.saddr = iph->saddr,
1313 			.flowi4_tos = RT_TOS(iph->tos),
1314 			.flowi4_oif = rt->dst.dev->ifindex,
1315 			.flowi4_iif = skb->dev->ifindex,
1316 			.flowi4_mark = skb->mark,
1317 		};
1318 
1319 		rcu_read_lock();
1320 		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1321 			src = fib_result_prefsrc(dev_net(rt->dst.dev), &res);
1322 		else
1323 			src = inet_select_addr(rt->dst.dev,
1324 					       rt_nexthop(rt, iph->daddr),
1325 					       RT_SCOPE_UNIVERSE);
1326 		rcu_read_unlock();
1327 	}
1328 	memcpy(addr, &src, 4);
1329 }
1330 
1331 #ifdef CONFIG_IP_ROUTE_CLASSID
1332 static void set_class_tag(struct rtable *rt, u32 tag)
1333 {
1334 	if (!(rt->dst.tclassid & 0xFFFF))
1335 		rt->dst.tclassid |= tag & 0xFFFF;
1336 	if (!(rt->dst.tclassid & 0xFFFF0000))
1337 		rt->dst.tclassid |= tag & 0xFFFF0000;
1338 }
1339 #endif
1340 
1341 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1342 {
1343 	unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
1344 	unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
1345 				    ip_rt_min_advmss);
1346 
1347 	return min(advmss, IPV4_MAX_PMTU - header_size);
1348 }
1349 
1350 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1351 {
1352 	const struct rtable *rt = (const struct rtable *)dst;
1353 	unsigned int mtu = rt->rt_pmtu;
1354 
1355 	if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1356 		mtu = dst_metric_raw(dst, RTAX_MTU);
1357 
1358 	if (mtu)
1359 		goto out;
1360 
1361 	mtu = READ_ONCE(dst->dev->mtu);
1362 
1363 	if (unlikely(ip_mtu_locked(dst))) {
1364 		if (rt->rt_uses_gateway && mtu > 576)
1365 			mtu = 576;
1366 	}
1367 
1368 out:
1369 	mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
1370 
1371 	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1372 }
1373 
1374 static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr)
1375 {
1376 	struct fnhe_hash_bucket *hash;
1377 	struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1378 	u32 hval = fnhe_hashfun(daddr);
1379 
1380 	spin_lock_bh(&fnhe_lock);
1381 
1382 	hash = rcu_dereference_protected(nhc->nhc_exceptions,
1383 					 lockdep_is_held(&fnhe_lock));
1384 	hash += hval;
1385 
1386 	fnhe_p = &hash->chain;
1387 	fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1388 	while (fnhe) {
1389 		if (fnhe->fnhe_daddr == daddr) {
1390 			rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1391 				fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1392 			/* set fnhe_daddr to 0 to ensure it won't bind with
1393 			 * new dsts in rt_bind_exception().
1394 			 */
1395 			fnhe->fnhe_daddr = 0;
1396 			fnhe_flush_routes(fnhe);
1397 			kfree_rcu(fnhe, rcu);
1398 			break;
1399 		}
1400 		fnhe_p = &fnhe->fnhe_next;
1401 		fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1402 						 lockdep_is_held(&fnhe_lock));
1403 	}
1404 
1405 	spin_unlock_bh(&fnhe_lock);
1406 }
1407 
1408 static struct fib_nh_exception *find_exception(struct fib_nh_common *nhc,
1409 					       __be32 daddr)
1410 {
1411 	struct fnhe_hash_bucket *hash = rcu_dereference(nhc->nhc_exceptions);
1412 	struct fib_nh_exception *fnhe;
1413 	u32 hval;
1414 
1415 	if (!hash)
1416 		return NULL;
1417 
1418 	hval = fnhe_hashfun(daddr);
1419 
1420 	for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1421 	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
1422 		if (fnhe->fnhe_daddr == daddr) {
1423 			if (fnhe->fnhe_expires &&
1424 			    time_after(jiffies, fnhe->fnhe_expires)) {
1425 				ip_del_fnhe(nhc, daddr);
1426 				break;
1427 			}
1428 			return fnhe;
1429 		}
1430 	}
1431 	return NULL;
1432 }
1433 
1434 /* MTU selection:
1435  * 1. mtu on route is locked - use it
1436  * 2. mtu from nexthop exception
1437  * 3. mtu from egress device
1438  */
1439 
1440 u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
1441 {
1442 	struct fib_nh_common *nhc = res->nhc;
1443 	struct net_device *dev = nhc->nhc_dev;
1444 	struct fib_info *fi = res->fi;
1445 	u32 mtu = 0;
1446 
1447 	if (READ_ONCE(dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu) ||
1448 	    fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
1449 		mtu = fi->fib_mtu;
1450 
1451 	if (likely(!mtu)) {
1452 		struct fib_nh_exception *fnhe;
1453 
1454 		fnhe = find_exception(nhc, daddr);
1455 		if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
1456 			mtu = fnhe->fnhe_pmtu;
1457 	}
1458 
1459 	if (likely(!mtu))
1460 		mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);
1461 
1462 	return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu);
1463 }
1464 
1465 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1466 			      __be32 daddr, const bool do_cache)
1467 {
1468 	bool ret = false;
1469 
1470 	spin_lock_bh(&fnhe_lock);
1471 
1472 	if (daddr == fnhe->fnhe_daddr) {
1473 		struct rtable __rcu **porig;
1474 		struct rtable *orig;
1475 		int genid = fnhe_genid(dev_net(rt->dst.dev));
1476 
1477 		if (rt_is_input_route(rt))
1478 			porig = &fnhe->fnhe_rth_input;
1479 		else
1480 			porig = &fnhe->fnhe_rth_output;
1481 		orig = rcu_dereference(*porig);
1482 
1483 		if (fnhe->fnhe_genid != genid) {
1484 			fnhe->fnhe_genid = genid;
1485 			fnhe->fnhe_gw = 0;
1486 			fnhe->fnhe_pmtu = 0;
1487 			fnhe->fnhe_expires = 0;
1488 			fnhe->fnhe_mtu_locked = false;
1489 			fnhe_flush_routes(fnhe);
1490 			orig = NULL;
1491 		}
1492 		fill_route_from_fnhe(rt, fnhe);
1493 		if (!rt->rt_gw4) {
1494 			rt->rt_gw4 = daddr;
1495 			rt->rt_gw_family = AF_INET;
1496 		}
1497 
1498 		if (do_cache) {
1499 			dst_hold(&rt->dst);
1500 			rcu_assign_pointer(*porig, rt);
1501 			if (orig) {
1502 				dst_dev_put(&orig->dst);
1503 				dst_release(&orig->dst);
1504 			}
1505 			ret = true;
1506 		}
1507 
1508 		fnhe->fnhe_stamp = jiffies;
1509 	}
1510 	spin_unlock_bh(&fnhe_lock);
1511 
1512 	return ret;
1513 }
1514 
1515 static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt)
1516 {
1517 	struct rtable *orig, *prev, **p;
1518 	bool ret = true;
1519 
1520 	if (rt_is_input_route(rt)) {
1521 		p = (struct rtable **)&nhc->nhc_rth_input;
1522 	} else {
1523 		p = (struct rtable **)raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
1524 	}
1525 	orig = *p;
1526 
1527 	/* hold dst before doing cmpxchg() to avoid race condition
1528 	 * on this dst
1529 	 */
1530 	dst_hold(&rt->dst);
1531 	prev = cmpxchg(p, orig, rt);
1532 	if (prev == orig) {
1533 		if (orig) {
1534 			rt_add_uncached_list(orig);
1535 			dst_release(&orig->dst);
1536 		}
1537 	} else {
1538 		dst_release(&rt->dst);
1539 		ret = false;
1540 	}
1541 
1542 	return ret;
1543 }
1544 
1545 struct uncached_list {
1546 	spinlock_t		lock;
1547 	struct list_head	head;
1548 };
1549 
1550 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1551 
1552 void rt_add_uncached_list(struct rtable *rt)
1553 {
1554 	struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1555 
1556 	rt->rt_uncached_list = ul;
1557 
1558 	spin_lock_bh(&ul->lock);
1559 	list_add_tail(&rt->rt_uncached, &ul->head);
1560 	spin_unlock_bh(&ul->lock);
1561 }
1562 
1563 void rt_del_uncached_list(struct rtable *rt)
1564 {
1565 	if (!list_empty(&rt->rt_uncached)) {
1566 		struct uncached_list *ul = rt->rt_uncached_list;
1567 
1568 		spin_lock_bh(&ul->lock);
1569 		list_del(&rt->rt_uncached);
1570 		spin_unlock_bh(&ul->lock);
1571 	}
1572 }
1573 
1574 static void ipv4_dst_destroy(struct dst_entry *dst)
1575 {
1576 	struct rtable *rt = (struct rtable *)dst;
1577 
1578 	ip_dst_metrics_put(dst);
1579 	rt_del_uncached_list(rt);
1580 }
1581 
1582 void rt_flush_dev(struct net_device *dev)
1583 {
1584 	struct rtable *rt;
1585 	int cpu;
1586 
1587 	for_each_possible_cpu(cpu) {
1588 		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1589 
1590 		spin_lock_bh(&ul->lock);
1591 		list_for_each_entry(rt, &ul->head, rt_uncached) {
1592 			if (rt->dst.dev != dev)
1593 				continue;
1594 			rt->dst.dev = blackhole_netdev;
1595 			dev_hold(rt->dst.dev);
1596 			dev_put(dev);
1597 		}
1598 		spin_unlock_bh(&ul->lock);
1599 	}
1600 }
1601 
1602 static bool rt_cache_valid(const struct rtable *rt)
1603 {
1604 	return	rt &&
1605 		rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1606 		!rt_is_expired(rt);
1607 }
1608 
1609 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1610 			   const struct fib_result *res,
1611 			   struct fib_nh_exception *fnhe,
1612 			   struct fib_info *fi, u16 type, u32 itag,
1613 			   const bool do_cache)
1614 {
1615 	bool cached = false;
1616 
1617 	if (fi) {
1618 		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1619 
1620 		if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) {
1621 			rt->rt_uses_gateway = 1;
1622 			rt->rt_gw_family = nhc->nhc_gw_family;
1623 			/* only INET and INET6 are supported */
1624 			if (likely(nhc->nhc_gw_family == AF_INET))
1625 				rt->rt_gw4 = nhc->nhc_gw.ipv4;
1626 			else
1627 				rt->rt_gw6 = nhc->nhc_gw.ipv6;
1628 		}
1629 
1630 		ip_dst_init_metrics(&rt->dst, fi->fib_metrics);
1631 
1632 #ifdef CONFIG_IP_ROUTE_CLASSID
1633 		if (nhc->nhc_family == AF_INET) {
1634 			struct fib_nh *nh;
1635 
1636 			nh = container_of(nhc, struct fib_nh, nh_common);
1637 			rt->dst.tclassid = nh->nh_tclassid;
1638 		}
1639 #endif
1640 		rt->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
1641 		if (unlikely(fnhe))
1642 			cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1643 		else if (do_cache)
1644 			cached = rt_cache_route(nhc, rt);
1645 		if (unlikely(!cached)) {
1646 			/* Routes we intend to cache in nexthop exception or
1647 			 * FIB nexthop have the DST_NOCACHE bit clear.
1648 			 * However, if we are unsuccessful at storing this
1649 			 * route into the cache we really need to set it.
1650 			 */
1651 			if (!rt->rt_gw4) {
1652 				rt->rt_gw_family = AF_INET;
1653 				rt->rt_gw4 = daddr;
1654 			}
1655 			rt_add_uncached_list(rt);
1656 		}
1657 	} else
1658 		rt_add_uncached_list(rt);
1659 
1660 #ifdef CONFIG_IP_ROUTE_CLASSID
1661 #ifdef CONFIG_IP_MULTIPLE_TABLES
1662 	set_class_tag(rt, res->tclassid);
1663 #endif
1664 	set_class_tag(rt, itag);
1665 #endif
1666 }
1667 
1668 struct rtable *rt_dst_alloc(struct net_device *dev,
1669 			    unsigned int flags, u16 type,
1670 			    bool nopolicy, bool noxfrm)
1671 {
1672 	struct rtable *rt;
1673 
1674 	rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1675 		       (nopolicy ? DST_NOPOLICY : 0) |
1676 		       (noxfrm ? DST_NOXFRM : 0));
1677 
1678 	if (rt) {
1679 		rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1680 		rt->rt_flags = flags;
1681 		rt->rt_type = type;
1682 		rt->rt_is_input = 0;
1683 		rt->rt_iif = 0;
1684 		rt->rt_pmtu = 0;
1685 		rt->rt_mtu_locked = 0;
1686 		rt->rt_uses_gateway = 0;
1687 		rt->rt_gw_family = 0;
1688 		rt->rt_gw4 = 0;
1689 		INIT_LIST_HEAD(&rt->rt_uncached);
1690 
1691 		rt->dst.output = ip_output;
1692 		if (flags & RTCF_LOCAL)
1693 			rt->dst.input = ip_local_deliver;
1694 	}
1695 
1696 	return rt;
1697 }
1698 EXPORT_SYMBOL(rt_dst_alloc);
1699 
1700 struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt)
1701 {
1702 	struct rtable *new_rt;
1703 
1704 	new_rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1705 			   rt->dst.flags);
1706 
1707 	if (new_rt) {
1708 		new_rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1709 		new_rt->rt_flags = rt->rt_flags;
1710 		new_rt->rt_type = rt->rt_type;
1711 		new_rt->rt_is_input = rt->rt_is_input;
1712 		new_rt->rt_iif = rt->rt_iif;
1713 		new_rt->rt_pmtu = rt->rt_pmtu;
1714 		new_rt->rt_mtu_locked = rt->rt_mtu_locked;
1715 		new_rt->rt_gw_family = rt->rt_gw_family;
1716 		if (rt->rt_gw_family == AF_INET)
1717 			new_rt->rt_gw4 = rt->rt_gw4;
1718 		else if (rt->rt_gw_family == AF_INET6)
1719 			new_rt->rt_gw6 = rt->rt_gw6;
1720 		INIT_LIST_HEAD(&new_rt->rt_uncached);
1721 
1722 		new_rt->dst.input = rt->dst.input;
1723 		new_rt->dst.output = rt->dst.output;
1724 		new_rt->dst.error = rt->dst.error;
1725 		new_rt->dst.lastuse = jiffies;
1726 		new_rt->dst.lwtstate = lwtstate_get(rt->dst.lwtstate);
1727 	}
1728 	return new_rt;
1729 }
1730 EXPORT_SYMBOL(rt_dst_clone);
1731 
1732 /* called in rcu_read_lock() section */
1733 int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1734 			  u8 tos, struct net_device *dev,
1735 			  struct in_device *in_dev, u32 *itag)
1736 {
1737 	int err;
1738 
1739 	/* Primary sanity checks. */
1740 	if (!in_dev)
1741 		return -EINVAL;
1742 
1743 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1744 	    skb->protocol != htons(ETH_P_IP))
1745 		return -EINVAL;
1746 
1747 	if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1748 		return -EINVAL;
1749 
1750 	if (ipv4_is_zeronet(saddr)) {
1751 		if (!ipv4_is_local_multicast(daddr) &&
1752 		    ip_hdr(skb)->protocol != IPPROTO_IGMP)
1753 			return -EINVAL;
1754 	} else {
1755 		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1756 					  in_dev, itag);
1757 		if (err < 0)
1758 			return err;
1759 	}
1760 	return 0;
1761 }
1762 
1763 /* called in rcu_read_lock() section */
1764 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1765 			     u8 tos, struct net_device *dev, int our)
1766 {
1767 	struct in_device *in_dev = __in_dev_get_rcu(dev);
1768 	unsigned int flags = RTCF_MULTICAST;
1769 	struct rtable *rth;
1770 	u32 itag = 0;
1771 	int err;
1772 
1773 	err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1774 	if (err)
1775 		return err;
1776 
1777 	if (our)
1778 		flags |= RTCF_LOCAL;
1779 
1780 	rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1781 			   IN_DEV_ORCONF(in_dev, NOPOLICY), false);
1782 	if (!rth)
1783 		return -ENOBUFS;
1784 
1785 #ifdef CONFIG_IP_ROUTE_CLASSID
1786 	rth->dst.tclassid = itag;
1787 #endif
1788 	rth->dst.output = ip_rt_bug;
1789 	rth->rt_is_input= 1;
1790 
1791 #ifdef CONFIG_IP_MROUTE
1792 	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1793 		rth->dst.input = ip_mr_input;
1794 #endif
1795 	RT_CACHE_STAT_INC(in_slow_mc);
1796 
1797 	skb_dst_drop(skb);
1798 	skb_dst_set(skb, &rth->dst);
1799 	return 0;
1800 }
1801 
1802 
1803 static void ip_handle_martian_source(struct net_device *dev,
1804 				     struct in_device *in_dev,
1805 				     struct sk_buff *skb,
1806 				     __be32 daddr,
1807 				     __be32 saddr)
1808 {
1809 	RT_CACHE_STAT_INC(in_martian_src);
1810 #ifdef CONFIG_IP_ROUTE_VERBOSE
1811 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1812 		/*
1813 		 *	RFC1812 recommendation: if the source is martian,
1814 		 *	the only hint is the MAC header.
1815 		 */
1816 		pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1817 			&daddr, &saddr, dev->name);
1818 		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1819 			print_hex_dump(KERN_WARNING, "ll header: ",
1820 				       DUMP_PREFIX_OFFSET, 16, 1,
1821 				       skb_mac_header(skb),
1822 				       dev->hard_header_len, false);
1823 		}
1824 	}
1825 #endif
1826 }
1827 
1828 /* called in rcu_read_lock() section */
1829 static int __mkroute_input(struct sk_buff *skb,
1830 			   const struct fib_result *res,
1831 			   struct in_device *in_dev,
1832 			   __be32 daddr, __be32 saddr, u32 tos)
1833 {
1834 	struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1835 	struct net_device *dev = nhc->nhc_dev;
1836 	struct fib_nh_exception *fnhe;
1837 	struct rtable *rth;
1838 	int err;
1839 	struct in_device *out_dev;
1840 	bool do_cache;
1841 	u32 itag = 0;
1842 
1843 	/* get a working reference to the output device */
1844 	out_dev = __in_dev_get_rcu(dev);
1845 	if (!out_dev) {
1846 		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1847 		return -EINVAL;
1848 	}
1849 
1850 	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1851 				  in_dev->dev, in_dev, &itag);
1852 	if (err < 0) {
1853 		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1854 					 saddr);
1855 
1856 		goto cleanup;
1857 	}
1858 
1859 	do_cache = res->fi && !itag;
1860 	if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1861 	    skb->protocol == htons(ETH_P_IP)) {
1862 		__be32 gw;
1863 
1864 		gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0;
1865 		if (IN_DEV_SHARED_MEDIA(out_dev) ||
1866 		    inet_addr_onlink(out_dev, saddr, gw))
1867 			IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1868 	}
1869 
1870 	if (skb->protocol != htons(ETH_P_IP)) {
1871 		/* Not IP (i.e. ARP). Do not create a route if it is
1872 		 * invalid for proxy arp. DNAT routes are always valid.
1873 		 *
1874 		 * The proxy arp feature has been extended to allow ARP
1875 		 * replies back onto the same interface, to support
1876 		 * Private VLAN switch technologies. See arp.c.
1877 		 */
1878 		if (out_dev == in_dev &&
1879 		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1880 			err = -EINVAL;
1881 			goto cleanup;
1882 		}
1883 	}
1884 
1885 	fnhe = find_exception(nhc, daddr);
1886 	if (do_cache) {
1887 		if (fnhe)
1888 			rth = rcu_dereference(fnhe->fnhe_rth_input);
1889 		else
1890 			rth = rcu_dereference(nhc->nhc_rth_input);
1891 		if (rt_cache_valid(rth)) {
1892 			skb_dst_set_noref(skb, &rth->dst);
1893 			goto out;
1894 		}
1895 	}
1896 
1897 	rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1898 			   IN_DEV_ORCONF(in_dev, NOPOLICY),
1899 			   IN_DEV_ORCONF(out_dev, NOXFRM));
1900 	if (!rth) {
1901 		err = -ENOBUFS;
1902 		goto cleanup;
1903 	}
1904 
1905 	rth->rt_is_input = 1;
1906 	RT_CACHE_STAT_INC(in_slow_tot);
1907 
1908 	rth->dst.input = ip_forward;
1909 
1910 	rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1911 		       do_cache);
1912 	lwtunnel_set_redirect(&rth->dst);
1913 	skb_dst_set(skb, &rth->dst);
1914 out:
1915 	err = 0;
1916  cleanup:
1917 	return err;
1918 }
1919 
1920 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1921 /* To make ICMP packets follow the right flow, the multipath hash is
1922  * calculated from the inner IP addresses.
1923  */
1924 static void ip_multipath_l3_keys(const struct sk_buff *skb,
1925 				 struct flow_keys *hash_keys)
1926 {
1927 	const struct iphdr *outer_iph = ip_hdr(skb);
1928 	const struct iphdr *key_iph = outer_iph;
1929 	const struct iphdr *inner_iph;
1930 	const struct icmphdr *icmph;
1931 	struct iphdr _inner_iph;
1932 	struct icmphdr _icmph;
1933 
1934 	if (likely(outer_iph->protocol != IPPROTO_ICMP))
1935 		goto out;
1936 
1937 	if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1938 		goto out;
1939 
1940 	icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1941 				   &_icmph);
1942 	if (!icmph)
1943 		goto out;
1944 
1945 	if (!icmp_is_err(icmph->type))
1946 		goto out;
1947 
1948 	inner_iph = skb_header_pointer(skb,
1949 				       outer_iph->ihl * 4 + sizeof(_icmph),
1950 				       sizeof(_inner_iph), &_inner_iph);
1951 	if (!inner_iph)
1952 		goto out;
1953 
1954 	key_iph = inner_iph;
1955 out:
1956 	hash_keys->addrs.v4addrs.src = key_iph->saddr;
1957 	hash_keys->addrs.v4addrs.dst = key_iph->daddr;
1958 }
1959 
1960 /* if skb is set it will be used and fl4 can be NULL */
1961 int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
1962 		       const struct sk_buff *skb, struct flow_keys *flkeys)
1963 {
1964 	u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0;
1965 	struct flow_keys hash_keys;
1966 	u32 mhash;
1967 
1968 	switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1969 	case 0:
1970 		memset(&hash_keys, 0, sizeof(hash_keys));
1971 		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1972 		if (skb) {
1973 			ip_multipath_l3_keys(skb, &hash_keys);
1974 		} else {
1975 			hash_keys.addrs.v4addrs.src = fl4->saddr;
1976 			hash_keys.addrs.v4addrs.dst = fl4->daddr;
1977 		}
1978 		break;
1979 	case 1:
1980 		/* skb is currently provided only when forwarding */
1981 		if (skb) {
1982 			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1983 			struct flow_keys keys;
1984 
1985 			/* short-circuit if we already have L4 hash present */
1986 			if (skb->l4_hash)
1987 				return skb_get_hash_raw(skb) >> 1;
1988 
1989 			memset(&hash_keys, 0, sizeof(hash_keys));
1990 
1991 			if (!flkeys) {
1992 				skb_flow_dissect_flow_keys(skb, &keys, flag);
1993 				flkeys = &keys;
1994 			}
1995 
1996 			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1997 			hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
1998 			hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
1999 			hash_keys.ports.src = flkeys->ports.src;
2000 			hash_keys.ports.dst = flkeys->ports.dst;
2001 			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
2002 		} else {
2003 			memset(&hash_keys, 0, sizeof(hash_keys));
2004 			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2005 			hash_keys.addrs.v4addrs.src = fl4->saddr;
2006 			hash_keys.addrs.v4addrs.dst = fl4->daddr;
2007 			hash_keys.ports.src = fl4->fl4_sport;
2008 			hash_keys.ports.dst = fl4->fl4_dport;
2009 			hash_keys.basic.ip_proto = fl4->flowi4_proto;
2010 		}
2011 		break;
2012 	case 2:
2013 		memset(&hash_keys, 0, sizeof(hash_keys));
2014 		/* skb is currently provided only when forwarding */
2015 		if (skb) {
2016 			struct flow_keys keys;
2017 
2018 			skb_flow_dissect_flow_keys(skb, &keys, 0);
2019 			/* Inner can be v4 or v6 */
2020 			if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
2021 				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2022 				hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
2023 				hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
2024 			} else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
2025 				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2026 				hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
2027 				hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
2028 				hash_keys.tags.flow_label = keys.tags.flow_label;
2029 				hash_keys.basic.ip_proto = keys.basic.ip_proto;
2030 			} else {
2031 				/* Same as case 0 */
2032 				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2033 				ip_multipath_l3_keys(skb, &hash_keys);
2034 			}
2035 		} else {
2036 			/* Same as case 0 */
2037 			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2038 			hash_keys.addrs.v4addrs.src = fl4->saddr;
2039 			hash_keys.addrs.v4addrs.dst = fl4->daddr;
2040 		}
2041 		break;
2042 	}
2043 	mhash = flow_hash_from_keys(&hash_keys);
2044 
2045 	if (multipath_hash)
2046 		mhash = jhash_2words(mhash, multipath_hash, 0);
2047 
2048 	return mhash >> 1;
2049 }
2050 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
2051 
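/* Editor's note -- illustrative sketch, not part of route.c: per the comment
 * above fib_multipath_hash(), the function can be fed only a flowi4 key
 * (skb == NULL) for locally generated traffic.  The helper below is
 * hypothetical; it merely shows how a caller might fill the key and obtain
 * the hash that fib_select_multipath() consumes.
 */
#ifdef CONFIG_IP_ROUTE_MULTIPATH
static u32 example_output_multipath_hash(struct net *net,
					 __be32 saddr, __be32 daddr,
					 __be16 sport, __be16 dport, u8 proto)
{
	struct flowi4 fl4;

	memset(&fl4, 0, sizeof(fl4));
	fl4.saddr = saddr;
	fl4.daddr = daddr;
	fl4.fl4_sport = sport;
	fl4.fl4_dport = dport;
	fl4.flowi4_proto = proto;

	/* skb == NULL and flkeys == NULL: only the fl4 key is hashed */
	return fib_multipath_hash(net, &fl4, NULL, NULL);
}
#endif
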
2052 static int ip_mkroute_input(struct sk_buff *skb,
2053 			    struct fib_result *res,
2054 			    struct in_device *in_dev,
2055 			    __be32 daddr, __be32 saddr, u32 tos,
2056 			    struct flow_keys *hkeys)
2057 {
2058 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2059 	if (res->fi && fib_info_num_path(res->fi) > 1) {
2060 		int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
2061 
2062 		fib_select_multipath(res, h);
2063 		IPCB(skb)->flags |= IPSKB_MULTIPATH;
2064 	}
2065 #endif
2066 
2067 	/* create a routing cache entry */
2068 	return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
2069 }
2070 
2071 /* Implements the same saddr-related checks as ip_route_input_slow(),
2072  * assuming daddr is valid and the destination is not a local broadcast one.
2073  * Uses the provided hint instead of performing a route lookup.
2074  */
2075 int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2076 		      u8 tos, struct net_device *dev,
2077 		      const struct sk_buff *hint)
2078 {
2079 	struct in_device *in_dev = __in_dev_get_rcu(dev);
2080 	struct rtable *rt = skb_rtable(hint);
2081 	struct net *net = dev_net(dev);
2082 	int err = -EINVAL;
2083 	u32 tag = 0;
2084 
2085 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2086 		goto martian_source;
2087 
2088 	if (ipv4_is_zeronet(saddr))
2089 		goto martian_source;
2090 
2091 	if (ipv4_is_loopback(saddr) && !IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2092 		goto martian_source;
2093 
2094 	if (rt->rt_type != RTN_LOCAL)
2095 		goto skip_validate_source;
2096 
2097 	tos &= IPTOS_RT_MASK;
2098 	err = fib_validate_source(skb, saddr, daddr, tos, 0, dev, in_dev, &tag);
2099 	if (err < 0)
2100 		goto martian_source;
2101 
2102 skip_validate_source:
2103 	skb_dst_copy(skb, hint);
2104 	return 0;
2105 
2106 martian_source:
2107 	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2108 	return err;
2109 }
2110 
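/* Editor's note -- illustrative sketch, not part of route.c: a batched
 * receive path can reuse the previous packet's dst as a hint when the next
 * packet has the same destination, falling back to a full lookup otherwise.
 * The helper name and batching context are hypothetical; the real users live
 * in the ip_list_rcv() path and run under rcu_read_lock().
 */
static int example_route_with_hint(struct sk_buff *skb, struct net_device *dev,
				   const struct sk_buff *hint)
{
	const struct iphdr *iph = ip_hdr(skb);

	if (hint && ip_hdr(hint)->daddr == iph->daddr)
		return ip_route_use_hint(skb, iph->daddr, iph->saddr,
					 iph->tos, dev, hint);

	return ip_route_input_noref(skb, iph->daddr, iph->saddr,
				    iph->tos, dev);
}
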
2111 /* get device for dst_alloc with local routes */
2112 static struct net_device *ip_rt_get_dev(struct net *net,
2113 					const struct fib_result *res)
2114 {
2115 	struct fib_nh_common *nhc = res->fi ? res->nhc : NULL;
2116 	struct net_device *dev = NULL;
2117 
2118 	if (nhc)
2119 		dev = l3mdev_master_dev_rcu(nhc->nhc_dev);
2120 
2121 	return dev ? : net->loopback_dev;
2122 }
2123 
2124 /*
2125  *	NOTE. We drop all packets that have local source
2126  *	addresses, because every properly looped-back packet
2127  *	must already have the correct destination attached by the output routine.
2128  *	Changes in the enforced policies must also be applied to
2129  *	ip_route_use_hint().
2130  *
2131  *	This approach solves two big problems:
2132  *	1. Non-simplex devices are handled properly.
2133  *	2. IP spoofing attempts are filtered with a 100% guarantee.
2134  *	called with rcu_read_lock()
2135  */
2136 
2137 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2138 			       u8 tos, struct net_device *dev,
2139 			       struct fib_result *res)
2140 {
2141 	struct in_device *in_dev = __in_dev_get_rcu(dev);
2142 	struct flow_keys *flkeys = NULL, _flkeys;
2143 	struct net    *net = dev_net(dev);
2144 	struct ip_tunnel_info *tun_info;
2145 	int		err = -EINVAL;
2146 	unsigned int	flags = 0;
2147 	u32		itag = 0;
2148 	struct rtable	*rth;
2149 	struct flowi4	fl4;
2150 	bool do_cache = true;
2151 
2152 	/* IP on this device is disabled. */
2153 
2154 	if (!in_dev)
2155 		goto out;
2156 
2157 	/* Check for the weirdest martians, which cannot be detected
2158 	   by fib_lookup.
2159 	 */
2160 
2161 	tun_info = skb_tunnel_info(skb);
2162 	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2163 		fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
2164 	else
2165 		fl4.flowi4_tun_key.tun_id = 0;
2166 	skb_dst_drop(skb);
2167 
2168 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2169 		goto martian_source;
2170 
2171 	res->fi = NULL;
2172 	res->table = NULL;
2173 	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2174 		goto brd_input;
2175 
2176 	/* Accept zero addresses only for limited broadcast;
2177 	 * I do not even know whether to fix it or not. Waiting for complaints :-)
2178 	 */
2179 	if (ipv4_is_zeronet(saddr))
2180 		goto martian_source;
2181 
2182 	if (ipv4_is_zeronet(daddr))
2183 		goto martian_destination;
2184 
2185 	/* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
2186 	 * and calls it at most once if daddr and/or saddr are loopback addresses.
2187 	 */
2188 	if (ipv4_is_loopback(daddr)) {
2189 		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2190 			goto martian_destination;
2191 	} else if (ipv4_is_loopback(saddr)) {
2192 		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2193 			goto martian_source;
2194 	}
2195 
2196 	/*
2197 	 *	Now we are ready to route packet.
2198 	 */
2199 	fl4.flowi4_oif = 0;
2200 	fl4.flowi4_iif = dev->ifindex;
2201 	fl4.flowi4_mark = skb->mark;
2202 	fl4.flowi4_tos = tos;
2203 	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2204 	fl4.flowi4_flags = 0;
2205 	fl4.daddr = daddr;
2206 	fl4.saddr = saddr;
2207 	fl4.flowi4_uid = sock_net_uid(net, NULL);
2208 	fl4.flowi4_multipath_hash = 0;
2209 
2210 	if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
2211 		flkeys = &_flkeys;
2212 	} else {
2213 		fl4.flowi4_proto = 0;
2214 		fl4.fl4_sport = 0;
2215 		fl4.fl4_dport = 0;
2216 	}
2217 
2218 	err = fib_lookup(net, &fl4, res, 0);
2219 	if (err != 0) {
2220 		if (!IN_DEV_FORWARD(in_dev))
2221 			err = -EHOSTUNREACH;
2222 		goto no_route;
2223 	}
2224 
2225 	if (res->type == RTN_BROADCAST) {
2226 		if (IN_DEV_BFORWARD(in_dev))
2227 			goto make_route;
2228 		/* do not cache if bc_forwarding is enabled */
2229 		if (IPV4_DEVCONF_ALL(net, BC_FORWARDING))
2230 			do_cache = false;
2231 		goto brd_input;
2232 	}
2233 
2234 	if (res->type == RTN_LOCAL) {
2235 		err = fib_validate_source(skb, saddr, daddr, tos,
2236 					  0, dev, in_dev, &itag);
2237 		if (err < 0)
2238 			goto martian_source;
2239 		goto local_input;
2240 	}
2241 
2242 	if (!IN_DEV_FORWARD(in_dev)) {
2243 		err = -EHOSTUNREACH;
2244 		goto no_route;
2245 	}
2246 	if (res->type != RTN_UNICAST)
2247 		goto martian_destination;
2248 
2249 make_route:
2250 	err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
2251 out:	return err;
2252 
2253 brd_input:
2254 	if (skb->protocol != htons(ETH_P_IP))
2255 		goto e_inval;
2256 
2257 	if (!ipv4_is_zeronet(saddr)) {
2258 		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2259 					  in_dev, &itag);
2260 		if (err < 0)
2261 			goto martian_source;
2262 	}
2263 	flags |= RTCF_BROADCAST;
2264 	res->type = RTN_BROADCAST;
2265 	RT_CACHE_STAT_INC(in_brd);
2266 
2267 local_input:
2268 	do_cache &= res->fi && !itag;
2269 	if (do_cache) {
2270 		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2271 
2272 		rth = rcu_dereference(nhc->nhc_rth_input);
2273 		if (rt_cache_valid(rth)) {
2274 			skb_dst_set_noref(skb, &rth->dst);
2275 			err = 0;
2276 			goto out;
2277 		}
2278 	}
2279 
2280 	rth = rt_dst_alloc(ip_rt_get_dev(net, res),
2281 			   flags | RTCF_LOCAL, res->type,
2282 			   IN_DEV_ORCONF(in_dev, NOPOLICY), false);
2283 	if (!rth)
2284 		goto e_nobufs;
2285 
2286 	rth->dst.output= ip_rt_bug;
2287 #ifdef CONFIG_IP_ROUTE_CLASSID
2288 	rth->dst.tclassid = itag;
2289 #endif
2290 	rth->rt_is_input = 1;
2291 
2292 	RT_CACHE_STAT_INC(in_slow_tot);
2293 	if (res->type == RTN_UNREACHABLE) {
2294 		rth->dst.input= ip_error;
2295 		rth->dst.error= -err;
2296 		rth->rt_flags 	&= ~RTCF_LOCAL;
2297 	}
2298 
2299 	if (do_cache) {
2300 		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2301 
2302 		rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
2303 		if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2304 			WARN_ON(rth->dst.input == lwtunnel_input);
2305 			rth->dst.lwtstate->orig_input = rth->dst.input;
2306 			rth->dst.input = lwtunnel_input;
2307 		}
2308 
2309 		if (unlikely(!rt_cache_route(nhc, rth)))
2310 			rt_add_uncached_list(rth);
2311 	}
2312 	skb_dst_set(skb, &rth->dst);
2313 	err = 0;
2314 	goto out;
2315 
2316 no_route:
2317 	RT_CACHE_STAT_INC(in_no_route);
2318 	res->type = RTN_UNREACHABLE;
2319 	res->fi = NULL;
2320 	res->table = NULL;
2321 	goto local_input;
2322 
2323 	/*
2324 	 *	Do not cache martian addresses: they should be logged (RFC1812)
2325 	 */
2326 martian_destination:
2327 	RT_CACHE_STAT_INC(in_martian_dst);
2328 #ifdef CONFIG_IP_ROUTE_VERBOSE
2329 	if (IN_DEV_LOG_MARTIANS(in_dev))
2330 		net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2331 				     &daddr, &saddr, dev->name);
2332 #endif
2333 
2334 e_inval:
2335 	err = -EINVAL;
2336 	goto out;
2337 
2338 e_nobufs:
2339 	err = -ENOBUFS;
2340 	goto out;
2341 
2342 martian_source:
2343 	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2344 	goto out;
2345 }
2346 
2347 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2348 			 u8 tos, struct net_device *dev)
2349 {
2350 	struct fib_result res;
2351 	int err;
2352 
2353 	tos &= IPTOS_RT_MASK;
2354 	rcu_read_lock();
2355 	err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2356 	rcu_read_unlock();
2357 
2358 	return err;
2359 }
2360 EXPORT_SYMBOL(ip_route_input_noref);
2361 
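/* Editor's note -- illustrative sketch, not part of route.c: the classic
 * receive-path pattern around ip_route_input_noref(), in the spirit of
 * ip_rcv_finish().  The function is hypothetical and error handling is
 * reduced to a plain drop.
 */
static int example_rcv_route(struct sk_buff *skb, struct net_device *dev)
{
	const struct iphdr *iph = ip_hdr(skb);
	int err;

	err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
				   iph->tos, dev);
	if (err) {
		kfree_skb(skb);
		return NET_RX_DROP;
	}

	/* skb_dst(skb) now points at the rtable; dst_input() dispatches to
	 * ip_local_deliver(), ip_forward(), ip_error(), ...
	 */
	return dst_input(skb);
}
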
2362 /* called with rcu_read_lock held */
2363 int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2364 		       u8 tos, struct net_device *dev, struct fib_result *res)
2365 {
2366 	/* Multicast recognition logic is moved from the route cache to here.
2367 	   The problem was that too many Ethernet cards have broken/missing
2368 	   hardware multicast filters :-( As a result, a host on a multicast
2369 	   network acquires a lot of useless route cache entries, e.g. for
2370 	   SDR messages from all over the world. Now we try to get rid of them.
2371 	   Really, provided the software IP multicast filter is organized
2372 	   reasonably (at least, hashed), this does not result in a slowdown
2373 	   compared with route cache reject entries.
2374 	   Note that multicast routers are not affected, because a
2375 	   route cache entry is created eventually.
2376 	 */
2377 	if (ipv4_is_multicast(daddr)) {
2378 		struct in_device *in_dev = __in_dev_get_rcu(dev);
2379 		int our = 0;
2380 		int err = -EINVAL;
2381 
2382 		if (!in_dev)
2383 			return err;
2384 		our = ip_check_mc_rcu(in_dev, daddr, saddr,
2385 				      ip_hdr(skb)->protocol);
2386 
2387 		/* check l3 master if no match yet */
2388 		if (!our && netif_is_l3_slave(dev)) {
2389 			struct in_device *l3_in_dev;
2390 
2391 			l3_in_dev = __in_dev_get_rcu(skb->dev);
2392 			if (l3_in_dev)
2393 				our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2394 						      ip_hdr(skb)->protocol);
2395 		}
2396 
2397 		if (our
2398 #ifdef CONFIG_IP_MROUTE
2399 			||
2400 		    (!ipv4_is_local_multicast(daddr) &&
2401 		     IN_DEV_MFORWARD(in_dev))
2402 #endif
2403 		   ) {
2404 			err = ip_route_input_mc(skb, daddr, saddr,
2405 						tos, dev, our);
2406 		}
2407 		return err;
2408 	}
2409 
2410 	return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2411 }
2412 
2413 /* called with rcu_read_lock() */
2414 static struct rtable *__mkroute_output(const struct fib_result *res,
2415 				       const struct flowi4 *fl4, int orig_oif,
2416 				       struct net_device *dev_out,
2417 				       unsigned int flags)
2418 {
2419 	struct fib_info *fi = res->fi;
2420 	struct fib_nh_exception *fnhe;
2421 	struct in_device *in_dev;
2422 	u16 type = res->type;
2423 	struct rtable *rth;
2424 	bool do_cache;
2425 
2426 	in_dev = __in_dev_get_rcu(dev_out);
2427 	if (!in_dev)
2428 		return ERR_PTR(-EINVAL);
2429 
2430 	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2431 		if (ipv4_is_loopback(fl4->saddr) &&
2432 		    !(dev_out->flags & IFF_LOOPBACK) &&
2433 		    !netif_is_l3_master(dev_out))
2434 			return ERR_PTR(-EINVAL);
2435 
2436 	if (ipv4_is_lbcast(fl4->daddr))
2437 		type = RTN_BROADCAST;
2438 	else if (ipv4_is_multicast(fl4->daddr))
2439 		type = RTN_MULTICAST;
2440 	else if (ipv4_is_zeronet(fl4->daddr))
2441 		return ERR_PTR(-EINVAL);
2442 
2443 	if (dev_out->flags & IFF_LOOPBACK)
2444 		flags |= RTCF_LOCAL;
2445 
2446 	do_cache = true;
2447 	if (type == RTN_BROADCAST) {
2448 		flags |= RTCF_BROADCAST | RTCF_LOCAL;
2449 		fi = NULL;
2450 	} else if (type == RTN_MULTICAST) {
2451 		flags |= RTCF_MULTICAST | RTCF_LOCAL;
2452 		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2453 				     fl4->flowi4_proto))
2454 			flags &= ~RTCF_LOCAL;
2455 		else
2456 			do_cache = false;
2457 		/* If a multicast route does not exist, use the
2458 		 * default one, but do not use a gateway in this case.
2459 		 * Yes, it is a hack.
2460 		 */
2461 		if (fi && res->prefixlen < 4)
2462 			fi = NULL;
2463 	} else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2464 		   (orig_oif != dev_out->ifindex)) {
2465 		/* For local routes that require a particular output interface
2466 		 * we do not want to cache the result.  Caching the result
2467 		 * causes incorrect behaviour when there are multiple source
2468 		 * addresses on the interface, the end result being that if the
2469 		 * intended recipient is waiting on that interface for the
2470 		 * packet he won't receive it because it will be delivered on
2471 		 * the loopback interface and the IP_PKTINFO ipi_ifindex will
2472 		 * be set to the loopback interface as well.
2473 		 */
2474 		do_cache = false;
2475 	}
2476 
2477 	fnhe = NULL;
2478 	do_cache &= fi != NULL;
2479 	if (fi) {
2480 		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2481 		struct rtable __rcu **prth;
2482 
2483 		fnhe = find_exception(nhc, fl4->daddr);
2484 		if (!do_cache)
2485 			goto add;
2486 		if (fnhe) {
2487 			prth = &fnhe->fnhe_rth_output;
2488 		} else {
2489 			if (unlikely(fl4->flowi4_flags &
2490 				     FLOWI_FLAG_KNOWN_NH &&
2491 				     !(nhc->nhc_gw_family &&
2492 				       nhc->nhc_scope == RT_SCOPE_LINK))) {
2493 				do_cache = false;
2494 				goto add;
2495 			}
2496 			prth = raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
2497 		}
2498 		rth = rcu_dereference(*prth);
2499 		if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2500 			return rth;
2501 	}
2502 
2503 add:
2504 	rth = rt_dst_alloc(dev_out, flags, type,
2505 			   IN_DEV_ORCONF(in_dev, NOPOLICY),
2506 			   IN_DEV_ORCONF(in_dev, NOXFRM));
2507 	if (!rth)
2508 		return ERR_PTR(-ENOBUFS);
2509 
2510 	rth->rt_iif = orig_oif;
2511 
2512 	RT_CACHE_STAT_INC(out_slow_tot);
2513 
2514 	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2515 		if (flags & RTCF_LOCAL &&
2516 		    !(dev_out->flags & IFF_LOOPBACK)) {
2517 			rth->dst.output = ip_mc_output;
2518 			RT_CACHE_STAT_INC(out_slow_mc);
2519 		}
2520 #ifdef CONFIG_IP_MROUTE
2521 		if (type == RTN_MULTICAST) {
2522 			if (IN_DEV_MFORWARD(in_dev) &&
2523 			    !ipv4_is_local_multicast(fl4->daddr)) {
2524 				rth->dst.input = ip_mr_input;
2525 				rth->dst.output = ip_mc_output;
2526 			}
2527 		}
2528 #endif
2529 	}
2530 
2531 	rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2532 	lwtunnel_set_redirect(&rth->dst);
2533 
2534 	return rth;
2535 }
2536 
2537 /*
2538  * Major route resolver routine.
2539  */
2540 
2541 struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2542 					const struct sk_buff *skb)
2543 {
2544 	struct fib_result res = {
2545 		.type		= RTN_UNSPEC,
2546 		.fi		= NULL,
2547 		.table		= NULL,
2548 		.tclassid	= 0,
2549 	};
2550 	struct rtable *rth;
2551 
2552 	fl4->flowi4_iif = LOOPBACK_IFINDEX;
2553 	ip_rt_fix_tos(fl4);
2554 
2555 	rcu_read_lock();
2556 	rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2557 	rcu_read_unlock();
2558 
2559 	return rth;
2560 }
2561 EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
2562 
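/* Editor's note -- illustrative sketch, not part of route.c: a typical output
 * lookup through the ip_route_output_key() wrapper from include/net/route.h,
 * which lands in ip_route_output_key_hash() above.  The function and address
 * arguments are hypothetical; the reference taken by the lookup must be
 * dropped with ip_rt_put().
 */
static int example_output_lookup(struct net *net, __be32 daddr, __be32 saddr)
{
	struct flowi4 fl4;
	struct rtable *rt;

	memset(&fl4, 0, sizeof(fl4));
	fl4.daddr = daddr;
	fl4.saddr = saddr;
	fl4.flowi4_proto = IPPROTO_UDP;

	rt = ip_route_output_key(net, &fl4);
	if (IS_ERR(rt))
		return PTR_ERR(rt);

	pr_debug("route to %pI4 via dev %s\n", &fl4.daddr, rt->dst.dev->name);
	ip_rt_put(rt);
	return 0;
}
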
2563 struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2564 					    struct fib_result *res,
2565 					    const struct sk_buff *skb)
2566 {
2567 	struct net_device *dev_out = NULL;
2568 	int orig_oif = fl4->flowi4_oif;
2569 	unsigned int flags = 0;
2570 	struct rtable *rth;
2571 	int err;
2572 
2573 	if (fl4->saddr) {
2574 		if (ipv4_is_multicast(fl4->saddr) ||
2575 		    ipv4_is_lbcast(fl4->saddr) ||
2576 		    ipv4_is_zeronet(fl4->saddr)) {
2577 			rth = ERR_PTR(-EINVAL);
2578 			goto out;
2579 		}
2580 
2581 		rth = ERR_PTR(-ENETUNREACH);
2582 
2583 		/* I removed check for oif == dev_out->oif here.
2584 		   It was wrong for two reasons:
2585 		   1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2586 		      is assigned to multiple interfaces.
2587 		   2. Moreover, we are allowed to send packets with saddr
2588 		      of another iface. --ANK
2589 		 */
2590 
2591 		if (fl4->flowi4_oif == 0 &&
2592 		    (ipv4_is_multicast(fl4->daddr) ||
2593 		     ipv4_is_lbcast(fl4->daddr))) {
2594 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2595 			dev_out = __ip_dev_find(net, fl4->saddr, false);
2596 			if (!dev_out)
2597 				goto out;
2598 
2599 			/* Special hack: the user can direct multicasts
2600 			   and limited broadcast via the necessary interface
2601 			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2602 			   This hack is not just for fun, it allows
2603 			   vic, vat and friends to work.
2604 			   They bind a socket to loopback, set the ttl to zero
2605 			   and expect that it will work.
2606 			   From the viewpoint of the routing cache they are broken,
2607 			   because we are not allowed to build a multicast path
2608 			   with a loopback source addr (look, the routing cache
2609 			   cannot know that the ttl is zero, so the packet
2610 			   will not leave this host and the route is valid).
2611 			   Luckily, this hack is a good workaround.
2612 			 */
2613 
2614 			fl4->flowi4_oif = dev_out->ifindex;
2615 			goto make_route;
2616 		}
2617 
2618 		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2619 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2620 			if (!__ip_dev_find(net, fl4->saddr, false))
2621 				goto out;
2622 		}
2623 	}
2624 
2625 
2626 	if (fl4->flowi4_oif) {
2627 		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2628 		rth = ERR_PTR(-ENODEV);
2629 		if (!dev_out)
2630 			goto out;
2631 
2632 		/* RACE: Check return value of inet_select_addr instead. */
2633 		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2634 			rth = ERR_PTR(-ENETUNREACH);
2635 			goto out;
2636 		}
2637 		if (ipv4_is_local_multicast(fl4->daddr) ||
2638 		    ipv4_is_lbcast(fl4->daddr) ||
2639 		    fl4->flowi4_proto == IPPROTO_IGMP) {
2640 			if (!fl4->saddr)
2641 				fl4->saddr = inet_select_addr(dev_out, 0,
2642 							      RT_SCOPE_LINK);
2643 			goto make_route;
2644 		}
2645 		if (!fl4->saddr) {
2646 			if (ipv4_is_multicast(fl4->daddr))
2647 				fl4->saddr = inet_select_addr(dev_out, 0,
2648 							      fl4->flowi4_scope);
2649 			else if (!fl4->daddr)
2650 				fl4->saddr = inet_select_addr(dev_out, 0,
2651 							      RT_SCOPE_HOST);
2652 		}
2653 	}
2654 
2655 	if (!fl4->daddr) {
2656 		fl4->daddr = fl4->saddr;
2657 		if (!fl4->daddr)
2658 			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2659 		dev_out = net->loopback_dev;
2660 		fl4->flowi4_oif = LOOPBACK_IFINDEX;
2661 		res->type = RTN_LOCAL;
2662 		flags |= RTCF_LOCAL;
2663 		goto make_route;
2664 	}
2665 
2666 	err = fib_lookup(net, fl4, res, 0);
2667 	if (err) {
2668 		res->fi = NULL;
2669 		res->table = NULL;
2670 		if (fl4->flowi4_oif &&
2671 		    (ipv4_is_multicast(fl4->daddr) ||
2672 		    !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2673 			/* Apparently, the routing tables are wrong. Assume
2674 			   that the destination is on-link.
2675 
2676 			   WHY? DW.
2677 			   Because we are allowed to send to iface
2678 			   even if it has NO routes and NO assigned
2679 			   addresses. When oif is specified, routing
2680 			   tables are looked up with only one purpose:
2681 			   to catch if destination is gatewayed, rather than
2682 			   direct. Moreover, if MSG_DONTROUTE is set,
2683 			   we send packet, ignoring both routing tables
2684 			   and ifaddr state. --ANK
2685 
2686 
2687 			   We could make it even if oif is unknown,
2688 			   likely IPv6, but we do not.
2689 			 */
2690 
2691 			if (fl4->saddr == 0)
2692 				fl4->saddr = inet_select_addr(dev_out, 0,
2693 							      RT_SCOPE_LINK);
2694 			res->type = RTN_UNICAST;
2695 			goto make_route;
2696 		}
2697 		rth = ERR_PTR(err);
2698 		goto out;
2699 	}
2700 
2701 	if (res->type == RTN_LOCAL) {
2702 		if (!fl4->saddr) {
2703 			if (res->fi->fib_prefsrc)
2704 				fl4->saddr = res->fi->fib_prefsrc;
2705 			else
2706 				fl4->saddr = fl4->daddr;
2707 		}
2708 
2709 		/* L3 master device is the loopback for that domain */
2710 		dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2711 			net->loopback_dev;
2712 
2713 		/* make sure orig_oif points to fib result device even
2714 		 * though packet rx/tx happens over loopback or l3mdev
2715 		 */
2716 		orig_oif = FIB_RES_OIF(*res);
2717 
2718 		fl4->flowi4_oif = dev_out->ifindex;
2719 		flags |= RTCF_LOCAL;
2720 		goto make_route;
2721 	}
2722 
2723 	fib_select_path(net, res, fl4, skb);
2724 
2725 	dev_out = FIB_RES_DEV(*res);
2726 
2727 make_route:
2728 	rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2729 
2730 out:
2731 	return rth;
2732 }
2733 
2734 static struct dst_ops ipv4_dst_blackhole_ops = {
2735 	.family			= AF_INET,
2736 	.default_advmss		= ipv4_default_advmss,
2737 	.neigh_lookup		= ipv4_neigh_lookup,
2738 	.check			= dst_blackhole_check,
2739 	.cow_metrics		= dst_blackhole_cow_metrics,
2740 	.update_pmtu		= dst_blackhole_update_pmtu,
2741 	.redirect		= dst_blackhole_redirect,
2742 	.mtu			= dst_blackhole_mtu,
2743 };
2744 
2745 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2746 {
2747 	struct rtable *ort = (struct rtable *) dst_orig;
2748 	struct rtable *rt;
2749 
2750 	rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2751 	if (rt) {
2752 		struct dst_entry *new = &rt->dst;
2753 
2754 		new->__use = 1;
2755 		new->input = dst_discard;
2756 		new->output = dst_discard_out;
2757 
2758 		new->dev = net->loopback_dev;
2759 		if (new->dev)
2760 			dev_hold(new->dev);
2761 
2762 		rt->rt_is_input = ort->rt_is_input;
2763 		rt->rt_iif = ort->rt_iif;
2764 		rt->rt_pmtu = ort->rt_pmtu;
2765 		rt->rt_mtu_locked = ort->rt_mtu_locked;
2766 
2767 		rt->rt_genid = rt_genid_ipv4(net);
2768 		rt->rt_flags = ort->rt_flags;
2769 		rt->rt_type = ort->rt_type;
2770 		rt->rt_uses_gateway = ort->rt_uses_gateway;
2771 		rt->rt_gw_family = ort->rt_gw_family;
2772 		if (rt->rt_gw_family == AF_INET)
2773 			rt->rt_gw4 = ort->rt_gw4;
2774 		else if (rt->rt_gw_family == AF_INET6)
2775 			rt->rt_gw6 = ort->rt_gw6;
2776 
2777 		INIT_LIST_HEAD(&rt->rt_uncached);
2778 	}
2779 
2780 	dst_release(dst_orig);
2781 
2782 	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2783 }
2784 
2785 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2786 				    const struct sock *sk)
2787 {
2788 	struct rtable *rt = __ip_route_output_key(net, flp4);
2789 
2790 	if (IS_ERR(rt))
2791 		return rt;
2792 
2793 	if (flp4->flowi4_proto) {
2794 		flp4->flowi4_oif = rt->dst.dev->ifindex;
2795 		rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2796 							flowi4_to_flowi(flp4),
2797 							sk, 0);
2798 	}
2799 
2800 	return rt;
2801 }
2802 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2803 
2804 struct rtable *ip_route_output_tunnel(struct sk_buff *skb,
2805 				      struct net_device *dev,
2806 				      struct net *net, __be32 *saddr,
2807 				      const struct ip_tunnel_info *info,
2808 				      u8 protocol, bool use_cache)
2809 {
2810 #ifdef CONFIG_DST_CACHE
2811 	struct dst_cache *dst_cache;
2812 #endif
2813 	struct rtable *rt = NULL;
2814 	struct flowi4 fl4;
2815 	__u8 tos;
2816 
2817 #ifdef CONFIG_DST_CACHE
2818 	dst_cache = (struct dst_cache *)&info->dst_cache;
2819 	if (use_cache) {
2820 		rt = dst_cache_get_ip4(dst_cache, saddr);
2821 		if (rt)
2822 			return rt;
2823 	}
2824 #endif
2825 	memset(&fl4, 0, sizeof(fl4));
2826 	fl4.flowi4_mark = skb->mark;
2827 	fl4.flowi4_proto = protocol;
2828 	fl4.daddr = info->key.u.ipv4.dst;
2829 	fl4.saddr = info->key.u.ipv4.src;
2830 	tos = info->key.tos;
2831 	fl4.flowi4_tos = RT_TOS(tos);
2832 
2833 	rt = ip_route_output_key(net, &fl4);
2834 	if (IS_ERR(rt)) {
2835 		netdev_dbg(dev, "no route to %pI4\n", &fl4.daddr);
2836 		return ERR_PTR(-ENETUNREACH);
2837 	}
2838 	if (rt->dst.dev == dev) { /* is this necessary? */
2839 		netdev_dbg(dev, "circular route to %pI4\n", &fl4.daddr);
2840 		ip_rt_put(rt);
2841 		return ERR_PTR(-ELOOP);
2842 	}
2843 #ifdef CONFIG_DST_CACHE
2844 	if (use_cache)
2845 		dst_cache_set_ip4(dst_cache, &rt->dst, fl4.saddr);
2846 #endif
2847 	*saddr = fl4.saddr;
2848 	return rt;
2849 }
2850 EXPORT_SYMBOL_GPL(ip_route_output_tunnel);
2851 
2852 /* called with rcu_read_lock held */
2853 static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2854 			struct rtable *rt, u32 table_id, struct flowi4 *fl4,
2855 			struct sk_buff *skb, u32 portid, u32 seq,
2856 			unsigned int flags)
2857 {
2858 	struct rtmsg *r;
2859 	struct nlmsghdr *nlh;
2860 	unsigned long expires = 0;
2861 	u32 error;
2862 	u32 metrics[RTAX_MAX];
2863 
2864 	nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), flags);
2865 	if (!nlh)
2866 		return -EMSGSIZE;
2867 
2868 	r = nlmsg_data(nlh);
2869 	r->rtm_family	 = AF_INET;
2870 	r->rtm_dst_len	= 32;
2871 	r->rtm_src_len	= 0;
2872 	r->rtm_tos	= fl4 ? fl4->flowi4_tos : 0;
2873 	r->rtm_table	= table_id < 256 ? table_id : RT_TABLE_COMPAT;
2874 	if (nla_put_u32(skb, RTA_TABLE, table_id))
2875 		goto nla_put_failure;
2876 	r->rtm_type	= rt->rt_type;
2877 	r->rtm_scope	= RT_SCOPE_UNIVERSE;
2878 	r->rtm_protocol = RTPROT_UNSPEC;
2879 	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2880 	if (rt->rt_flags & RTCF_NOTIFY)
2881 		r->rtm_flags |= RTM_F_NOTIFY;
2882 	if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2883 		r->rtm_flags |= RTCF_DOREDIRECT;
2884 
2885 	if (nla_put_in_addr(skb, RTA_DST, dst))
2886 		goto nla_put_failure;
2887 	if (src) {
2888 		r->rtm_src_len = 32;
2889 		if (nla_put_in_addr(skb, RTA_SRC, src))
2890 			goto nla_put_failure;
2891 	}
2892 	if (rt->dst.dev &&
2893 	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2894 		goto nla_put_failure;
2895 #ifdef CONFIG_IP_ROUTE_CLASSID
2896 	if (rt->dst.tclassid &&
2897 	    nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2898 		goto nla_put_failure;
2899 #endif
2900 	if (fl4 && !rt_is_input_route(rt) &&
2901 	    fl4->saddr != src) {
2902 		if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2903 			goto nla_put_failure;
2904 	}
2905 	if (rt->rt_uses_gateway) {
2906 		if (rt->rt_gw_family == AF_INET &&
2907 		    nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) {
2908 			goto nla_put_failure;
2909 		} else if (rt->rt_gw_family == AF_INET6) {
2910 			int alen = sizeof(struct in6_addr);
2911 			struct nlattr *nla;
2912 			struct rtvia *via;
2913 
2914 			nla = nla_reserve(skb, RTA_VIA, alen + 2);
2915 			if (!nla)
2916 				goto nla_put_failure;
2917 
2918 			via = nla_data(nla);
2919 			via->rtvia_family = AF_INET6;
2920 			memcpy(via->rtvia_addr, &rt->rt_gw6, alen);
2921 		}
2922 	}
2923 
2924 	expires = rt->dst.expires;
2925 	if (expires) {
2926 		unsigned long now = jiffies;
2927 
2928 		if (time_before(now, expires))
2929 			expires -= now;
2930 		else
2931 			expires = 0;
2932 	}
2933 
2934 	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2935 	if (rt->rt_pmtu && expires)
2936 		metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2937 	if (rt->rt_mtu_locked && expires)
2938 		metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
2939 	if (rtnetlink_put_metrics(skb, metrics) < 0)
2940 		goto nla_put_failure;
2941 
2942 	if (fl4) {
2943 		if (fl4->flowi4_mark &&
2944 		    nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2945 			goto nla_put_failure;
2946 
2947 		if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2948 		    nla_put_u32(skb, RTA_UID,
2949 				from_kuid_munged(current_user_ns(),
2950 						 fl4->flowi4_uid)))
2951 			goto nla_put_failure;
2952 
2953 		if (rt_is_input_route(rt)) {
2954 #ifdef CONFIG_IP_MROUTE
2955 			if (ipv4_is_multicast(dst) &&
2956 			    !ipv4_is_local_multicast(dst) &&
2957 			    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2958 				int err = ipmr_get_route(net, skb,
2959 							 fl4->saddr, fl4->daddr,
2960 							 r, portid);
2961 
2962 				if (err <= 0) {
2963 					if (err == 0)
2964 						return 0;
2965 					goto nla_put_failure;
2966 				}
2967 			} else
2968 #endif
2969 				if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
2970 					goto nla_put_failure;
2971 		}
2972 	}
2973 
2974 	error = rt->dst.error;
2975 
2976 	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2977 		goto nla_put_failure;
2978 
2979 	nlmsg_end(skb, nlh);
2980 	return 0;
2981 
2982 nla_put_failure:
2983 	nlmsg_cancel(skb, nlh);
2984 	return -EMSGSIZE;
2985 }
2986 
2987 static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb,
2988 			    struct netlink_callback *cb, u32 table_id,
2989 			    struct fnhe_hash_bucket *bucket, int genid,
2990 			    int *fa_index, int fa_start, unsigned int flags)
2991 {
2992 	int i;
2993 
2994 	for (i = 0; i < FNHE_HASH_SIZE; i++) {
2995 		struct fib_nh_exception *fnhe;
2996 
2997 		for (fnhe = rcu_dereference(bucket[i].chain); fnhe;
2998 		     fnhe = rcu_dereference(fnhe->fnhe_next)) {
2999 			struct rtable *rt;
3000 			int err;
3001 
3002 			if (*fa_index < fa_start)
3003 				goto next;
3004 
3005 			if (fnhe->fnhe_genid != genid)
3006 				goto next;
3007 
3008 			if (fnhe->fnhe_expires &&
3009 			    time_after(jiffies, fnhe->fnhe_expires))
3010 				goto next;
3011 
3012 			rt = rcu_dereference(fnhe->fnhe_rth_input);
3013 			if (!rt)
3014 				rt = rcu_dereference(fnhe->fnhe_rth_output);
3015 			if (!rt)
3016 				goto next;
3017 
3018 			err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt,
3019 					   table_id, NULL, skb,
3020 					   NETLINK_CB(cb->skb).portid,
3021 					   cb->nlh->nlmsg_seq, flags);
3022 			if (err)
3023 				return err;
3024 next:
3025 			(*fa_index)++;
3026 		}
3027 	}
3028 
3029 	return 0;
3030 }
3031 
3032 int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
3033 		       u32 table_id, struct fib_info *fi,
3034 		       int *fa_index, int fa_start, unsigned int flags)
3035 {
3036 	struct net *net = sock_net(cb->skb->sk);
3037 	int nhsel, genid = fnhe_genid(net);
3038 
3039 	for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) {
3040 		struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel);
3041 		struct fnhe_hash_bucket *bucket;
3042 		int err;
3043 
3044 		if (nhc->nhc_flags & RTNH_F_DEAD)
3045 			continue;
3046 
3047 		rcu_read_lock();
3048 		bucket = rcu_dereference(nhc->nhc_exceptions);
3049 		err = 0;
3050 		if (bucket)
3051 			err = fnhe_dump_bucket(net, skb, cb, table_id, bucket,
3052 					       genid, fa_index, fa_start,
3053 					       flags);
3054 		rcu_read_unlock();
3055 		if (err)
3056 			return err;
3057 	}
3058 
3059 	return 0;
3060 }
3061 
3062 static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
3063 						   u8 ip_proto, __be16 sport,
3064 						   __be16 dport)
3065 {
3066 	struct sk_buff *skb;
3067 	struct iphdr *iph;
3068 
3069 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3070 	if (!skb)
3071 		return NULL;
3072 
3073 	/* Reserve room for dummy headers; this skb can pass
3074 	 * through a good chunk of the routing engine.
3075 	 */
3076 	skb_reset_mac_header(skb);
3077 	skb_reset_network_header(skb);
3078 	skb->protocol = htons(ETH_P_IP);
3079 	iph = skb_put(skb, sizeof(struct iphdr));
3080 	iph->protocol = ip_proto;
3081 	iph->saddr = src;
3082 	iph->daddr = dst;
3083 	iph->version = 0x4;
3084 	iph->frag_off = 0;
3085 	iph->ihl = 0x5;
3086 	skb_set_transport_header(skb, skb->len);
3087 
3088 	switch (iph->protocol) {
3089 	case IPPROTO_UDP: {
3090 		struct udphdr *udph;
3091 
3092 		udph = skb_put_zero(skb, sizeof(struct udphdr));
3093 		udph->source = sport;
3094 		udph->dest = dport;
3095 		udph->len = htons(sizeof(struct udphdr));
3096 		udph->check = 0;
3097 		break;
3098 	}
3099 	case IPPROTO_TCP: {
3100 		struct tcphdr *tcph;
3101 
3102 		tcph = skb_put_zero(skb, sizeof(struct tcphdr));
3103 		tcph->source	= sport;
3104 		tcph->dest	= dport;
3105 		tcph->doff	= sizeof(struct tcphdr) / 4;
3106 		tcph->rst = 1;
3107 		tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
3108 					    src, dst, 0);
3109 		break;
3110 	}
3111 	case IPPROTO_ICMP: {
3112 		struct icmphdr *icmph;
3113 
3114 		icmph = skb_put_zero(skb, sizeof(struct icmphdr));
3115 		icmph->type = ICMP_ECHO;
3116 		icmph->code = 0;
3117 	}
3118 	}
3119 
3120 	return skb;
3121 }
3122 
3123 static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
3124 				       const struct nlmsghdr *nlh,
3125 				       struct nlattr **tb,
3126 				       struct netlink_ext_ack *extack)
3127 {
3128 	struct rtmsg *rtm;
3129 	int i, err;
3130 
3131 	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
3132 		NL_SET_ERR_MSG(extack,
3133 			       "ipv4: Invalid header for route get request");
3134 		return -EINVAL;
3135 	}
3136 
3137 	if (!netlink_strict_get_check(skb))
3138 		return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
3139 					      rtm_ipv4_policy, extack);
3140 
3141 	rtm = nlmsg_data(nlh);
3142 	if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
3143 	    (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
3144 	    rtm->rtm_table || rtm->rtm_protocol ||
3145 	    rtm->rtm_scope || rtm->rtm_type) {
3146 		NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request");
3147 		return -EINVAL;
3148 	}
3149 
3150 	if (rtm->rtm_flags & ~(RTM_F_NOTIFY |
3151 			       RTM_F_LOOKUP_TABLE |
3152 			       RTM_F_FIB_MATCH)) {
3153 		NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request");
3154 		return -EINVAL;
3155 	}
3156 
3157 	err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
3158 					    rtm_ipv4_policy, extack);
3159 	if (err)
3160 		return err;
3161 
3162 	if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
3163 	    (tb[RTA_DST] && !rtm->rtm_dst_len)) {
3164 		NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4");
3165 		return -EINVAL;
3166 	}
3167 
3168 	for (i = 0; i <= RTA_MAX; i++) {
3169 		if (!tb[i])
3170 			continue;
3171 
3172 		switch (i) {
3173 		case RTA_IIF:
3174 		case RTA_OIF:
3175 		case RTA_SRC:
3176 		case RTA_DST:
3177 		case RTA_IP_PROTO:
3178 		case RTA_SPORT:
3179 		case RTA_DPORT:
3180 		case RTA_MARK:
3181 		case RTA_UID:
3182 			break;
3183 		default:
3184 			NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request");
3185 			return -EINVAL;
3186 		}
3187 	}
3188 
3189 	return 0;
3190 }
3191 
3192 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
3193 			     struct netlink_ext_ack *extack)
3194 {
3195 	struct net *net = sock_net(in_skb->sk);
3196 	struct nlattr *tb[RTA_MAX+1];
3197 	u32 table_id = RT_TABLE_MAIN;
3198 	__be16 sport = 0, dport = 0;
3199 	struct fib_result res = {};
3200 	u8 ip_proto = IPPROTO_UDP;
3201 	struct rtable *rt = NULL;
3202 	struct sk_buff *skb;
3203 	struct rtmsg *rtm;
3204 	struct flowi4 fl4 = {};
3205 	__be32 dst = 0;
3206 	__be32 src = 0;
3207 	kuid_t uid;
3208 	u32 iif;
3209 	int err;
3210 	int mark;
3211 
3212 	err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
3213 	if (err < 0)
3214 		return err;
3215 
3216 	rtm = nlmsg_data(nlh);
3217 	src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
3218 	dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
3219 	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3220 	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3221 	if (tb[RTA_UID])
3222 		uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
3223 	else
3224 		uid = (iif ? INVALID_UID : current_uid());
3225 
3226 	if (tb[RTA_IP_PROTO]) {
3227 		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
3228 						  &ip_proto, AF_INET, extack);
3229 		if (err)
3230 			return err;
3231 	}
3232 
3233 	if (tb[RTA_SPORT])
3234 		sport = nla_get_be16(tb[RTA_SPORT]);
3235 
3236 	if (tb[RTA_DPORT])
3237 		dport = nla_get_be16(tb[RTA_DPORT]);
3238 
3239 	skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
3240 	if (!skb)
3241 		return -ENOBUFS;
3242 
3243 	fl4.daddr = dst;
3244 	fl4.saddr = src;
3245 	fl4.flowi4_tos = rtm->rtm_tos & IPTOS_RT_MASK;
3246 	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
3247 	fl4.flowi4_mark = mark;
3248 	fl4.flowi4_uid = uid;
3249 	if (sport)
3250 		fl4.fl4_sport = sport;
3251 	if (dport)
3252 		fl4.fl4_dport = dport;
3253 	fl4.flowi4_proto = ip_proto;
3254 
3255 	rcu_read_lock();
3256 
3257 	if (iif) {
3258 		struct net_device *dev;
3259 
3260 		dev = dev_get_by_index_rcu(net, iif);
3261 		if (!dev) {
3262 			err = -ENODEV;
3263 			goto errout_rcu;
3264 		}
3265 
3266 		fl4.flowi4_iif = iif; /* for rt_fill_info */
3267 		skb->dev	= dev;
3268 		skb->mark	= mark;
3269 		err = ip_route_input_rcu(skb, dst, src,
3270 					 rtm->rtm_tos & IPTOS_RT_MASK, dev,
3271 					 &res);
3272 
3273 		rt = skb_rtable(skb);
3274 		if (err == 0 && rt->dst.error)
3275 			err = -rt->dst.error;
3276 	} else {
3277 		fl4.flowi4_iif = LOOPBACK_IFINDEX;
3278 		skb->dev = net->loopback_dev;
3279 		rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
3280 		err = 0;
3281 		if (IS_ERR(rt))
3282 			err = PTR_ERR(rt);
3283 		else
3284 			skb_dst_set(skb, &rt->dst);
3285 	}
3286 
3287 	if (err)
3288 		goto errout_rcu;
3289 
3290 	if (rtm->rtm_flags & RTM_F_NOTIFY)
3291 		rt->rt_flags |= RTCF_NOTIFY;
3292 
3293 	if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
3294 		table_id = res.table ? res.table->tb_id : 0;
3295 
3296 	/* reset skb for netlink reply msg */
3297 	skb_trim(skb, 0);
3298 	skb_reset_network_header(skb);
3299 	skb_reset_transport_header(skb);
3300 	skb_reset_mac_header(skb);
3301 
3302 	if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
3303 		struct fib_rt_info fri;
3304 
3305 		if (!res.fi) {
3306 			err = fib_props[res.type].error;
3307 			if (!err)
3308 				err = -EHOSTUNREACH;
3309 			goto errout_rcu;
3310 		}
3311 		fri.fi = res.fi;
3312 		fri.tb_id = table_id;
3313 		fri.dst = res.prefix;
3314 		fri.dst_len = res.prefixlen;
3315 		fri.tos = fl4.flowi4_tos;
3316 		fri.type = rt->rt_type;
3317 		fri.offload = 0;
3318 		fri.trap = 0;
3319 		if (res.fa_head) {
3320 			struct fib_alias *fa;
3321 
3322 			hlist_for_each_entry_rcu(fa, res.fa_head, fa_list) {
3323 				u8 slen = 32 - fri.dst_len;
3324 
3325 				if (fa->fa_slen == slen &&
3326 				    fa->tb_id == fri.tb_id &&
3327 				    fa->fa_tos == fri.tos &&
3328 				    fa->fa_info == res.fi &&
3329 				    fa->fa_type == fri.type) {
3330 					fri.offload = fa->offload;
3331 					fri.trap = fa->trap;
3332 					break;
3333 				}
3334 			}
3335 		}
3336 		err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
3337 				    nlh->nlmsg_seq, RTM_NEWROUTE, &fri, 0);
3338 	} else {
3339 		err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
3340 				   NETLINK_CB(in_skb).portid,
3341 				   nlh->nlmsg_seq, 0);
3342 	}
3343 	if (err < 0)
3344 		goto errout_rcu;
3345 
3346 	rcu_read_unlock();
3347 
3348 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
3349 
3350 errout_free:
3351 	return err;
3352 errout_rcu:
3353 	rcu_read_unlock();
3354 	kfree_skb(skb);
3355 	goto errout_free;
3356 }
3357 
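/* Editor's note -- illustrative sketch, not part of route.c: a minimal
 * userspace RTM_GETROUTE request for a destination address, which
 * inet_rtm_getroute() above answers (roughly what "ip route get <addr>"
 * does).  Error handling and reply parsing are left to the caller; the
 * function name is hypothetical.
 */
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>

static int example_route_get(const char *dst)
{
	struct {
		struct nlmsghdr nlh;
		struct rtmsg rtm;
		char attrs[64];
	} req;
	struct rtattr *rta;
	int fd;

	memset(&req, 0, sizeof(req));
	req.nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg));
	req.nlh.nlmsg_type = RTM_GETROUTE;
	req.nlh.nlmsg_flags = NLM_F_REQUEST;
	req.rtm.rtm_family = AF_INET;
	req.rtm.rtm_dst_len = 32;	/* required when RTA_DST is present */

	rta = (struct rtattr *)((char *)&req + NLMSG_ALIGN(req.nlh.nlmsg_len));
	rta->rta_type = RTA_DST;
	rta->rta_len = RTA_LENGTH(4);
	inet_pton(AF_INET, dst, RTA_DATA(rta));
	req.nlh.nlmsg_len = NLMSG_ALIGN(req.nlh.nlmsg_len) + rta->rta_len;

	fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
	if (fd < 0)
		return -1;
	if (send(fd, &req, req.nlh.nlmsg_len, 0) < 0) {
		close(fd);
		return -1;
	}
	/* An RTM_NEWROUTE reply built by rt_fill_info() can now be recv()ed. */
	return fd;
}
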
3358 void ip_rt_multicast_event(struct in_device *in_dev)
3359 {
3360 	rt_cache_flush(dev_net(in_dev->dev));
3361 }
3362 
3363 #ifdef CONFIG_SYSCTL
3364 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
3365 static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
3366 static int ip_rt_gc_elasticity __read_mostly	= 8;
3367 static int ip_min_valid_pmtu __read_mostly	= IPV4_MIN_MTU;
3368 
3369 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
3370 		void *buffer, size_t *lenp, loff_t *ppos)
3371 {
3372 	struct net *net = (struct net *)__ctl->extra1;
3373 
3374 	if (write) {
3375 		rt_cache_flush(net);
3376 		fnhe_genid_bump(net);
3377 		return 0;
3378 	}
3379 
3380 	return -EINVAL;
3381 }
3382 
3383 static struct ctl_table ipv4_route_table[] = {
3384 	{
3385 		.procname	= "gc_thresh",
3386 		.data		= &ipv4_dst_ops.gc_thresh,
3387 		.maxlen		= sizeof(int),
3388 		.mode		= 0644,
3389 		.proc_handler	= proc_dointvec,
3390 	},
3391 	{
3392 		.procname	= "max_size",
3393 		.data		= &ip_rt_max_size,
3394 		.maxlen		= sizeof(int),
3395 		.mode		= 0644,
3396 		.proc_handler	= proc_dointvec,
3397 	},
3398 	{
3399 		/*  Deprecated. Use gc_min_interval_ms */
3400 
3401 		.procname	= "gc_min_interval",
3402 		.data		= &ip_rt_gc_min_interval,
3403 		.maxlen		= sizeof(int),
3404 		.mode		= 0644,
3405 		.proc_handler	= proc_dointvec_jiffies,
3406 	},
3407 	{
3408 		.procname	= "gc_min_interval_ms",
3409 		.data		= &ip_rt_gc_min_interval,
3410 		.maxlen		= sizeof(int),
3411 		.mode		= 0644,
3412 		.proc_handler	= proc_dointvec_ms_jiffies,
3413 	},
3414 	{
3415 		.procname	= "gc_timeout",
3416 		.data		= &ip_rt_gc_timeout,
3417 		.maxlen		= sizeof(int),
3418 		.mode		= 0644,
3419 		.proc_handler	= proc_dointvec_jiffies,
3420 	},
3421 	{
3422 		.procname	= "gc_interval",
3423 		.data		= &ip_rt_gc_interval,
3424 		.maxlen		= sizeof(int),
3425 		.mode		= 0644,
3426 		.proc_handler	= proc_dointvec_jiffies,
3427 	},
3428 	{
3429 		.procname	= "redirect_load",
3430 		.data		= &ip_rt_redirect_load,
3431 		.maxlen		= sizeof(int),
3432 		.mode		= 0644,
3433 		.proc_handler	= proc_dointvec,
3434 	},
3435 	{
3436 		.procname	= "redirect_number",
3437 		.data		= &ip_rt_redirect_number,
3438 		.maxlen		= sizeof(int),
3439 		.mode		= 0644,
3440 		.proc_handler	= proc_dointvec,
3441 	},
3442 	{
3443 		.procname	= "redirect_silence",
3444 		.data		= &ip_rt_redirect_silence,
3445 		.maxlen		= sizeof(int),
3446 		.mode		= 0644,
3447 		.proc_handler	= proc_dointvec,
3448 	},
3449 	{
3450 		.procname	= "error_cost",
3451 		.data		= &ip_rt_error_cost,
3452 		.maxlen		= sizeof(int),
3453 		.mode		= 0644,
3454 		.proc_handler	= proc_dointvec,
3455 	},
3456 	{
3457 		.procname	= "error_burst",
3458 		.data		= &ip_rt_error_burst,
3459 		.maxlen		= sizeof(int),
3460 		.mode		= 0644,
3461 		.proc_handler	= proc_dointvec,
3462 	},
3463 	{
3464 		.procname	= "gc_elasticity",
3465 		.data		= &ip_rt_gc_elasticity,
3466 		.maxlen		= sizeof(int),
3467 		.mode		= 0644,
3468 		.proc_handler	= proc_dointvec,
3469 	},
3470 	{
3471 		.procname	= "mtu_expires",
3472 		.data		= &ip_rt_mtu_expires,
3473 		.maxlen		= sizeof(int),
3474 		.mode		= 0644,
3475 		.proc_handler	= proc_dointvec_jiffies,
3476 	},
3477 	{
3478 		.procname	= "min_pmtu",
3479 		.data		= &ip_rt_min_pmtu,
3480 		.maxlen		= sizeof(int),
3481 		.mode		= 0644,
3482 		.proc_handler	= proc_dointvec_minmax,
3483 		.extra1		= &ip_min_valid_pmtu,
3484 	},
3485 	{
3486 		.procname	= "min_adv_mss",
3487 		.data		= &ip_rt_min_advmss,
3488 		.maxlen		= sizeof(int),
3489 		.mode		= 0644,
3490 		.proc_handler	= proc_dointvec,
3491 	},
3492 	{ }
3493 };
3494 
3495 static const char ipv4_route_flush_procname[] = "flush";
3496 
3497 static struct ctl_table ipv4_route_flush_table[] = {
3498 	{
3499 		.procname	= ipv4_route_flush_procname,
3500 		.maxlen		= sizeof(int),
3501 		.mode		= 0200,
3502 		.proc_handler	= ipv4_sysctl_rtcache_flush,
3503 	},
3504 	{ },
3505 };
3506 
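/* Editor's note -- illustrative sketch, not part of route.c: the "flush"
 * entry above is write-only (mode 0200); writing any value invokes
 * ipv4_sysctl_rtcache_flush(), which flushes the routing cache and bumps the
 * fnhe genid.  A minimal userspace trigger, equivalent to
 * "sysctl -w net.ipv4.route.flush=1":
 */
#include <fcntl.h>
#include <unistd.h>

static int example_flush_route_cache(void)
{
	int fd = open("/proc/sys/net/ipv4/route/flush", O_WRONLY);

	if (fd < 0)
		return -1;
	if (write(fd, "1\n", 2) != 2) {
		close(fd);
		return -1;
	}
	return close(fd);
}
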
3507 static __net_init int sysctl_route_net_init(struct net *net)
3508 {
3509 	struct ctl_table *tbl;
3510 
3511 	tbl = ipv4_route_flush_table;
3512 	if (!net_eq(net, &init_net)) {
3513 		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3514 		if (!tbl)
3515 			goto err_dup;
3516 
3517 		/* Don't export non-whitelisted sysctls to unprivileged users */
3518 		if (net->user_ns != &init_user_ns) {
3519 			if (tbl[0].procname != ipv4_route_flush_procname)
3520 				tbl[0].procname = NULL;
3521 		}
3522 	}
3523 	tbl[0].extra1 = net;
3524 
3525 	net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
3526 	if (!net->ipv4.route_hdr)
3527 		goto err_reg;
3528 	return 0;
3529 
3530 err_reg:
3531 	if (tbl != ipv4_route_flush_table)
3532 		kfree(tbl);
3533 err_dup:
3534 	return -ENOMEM;
3535 }
3536 
3537 static __net_exit void sysctl_route_net_exit(struct net *net)
3538 {
3539 	struct ctl_table *tbl;
3540 
3541 	tbl = net->ipv4.route_hdr->ctl_table_arg;
3542 	unregister_net_sysctl_table(net->ipv4.route_hdr);
3543 	BUG_ON(tbl == ipv4_route_flush_table);
3544 	kfree(tbl);
3545 }
3546 
3547 static __net_initdata struct pernet_operations sysctl_route_ops = {
3548 	.init = sysctl_route_net_init,
3549 	.exit = sysctl_route_net_exit,
3550 };
3551 #endif
3552 
3553 static __net_init int rt_genid_init(struct net *net)
3554 {
3555 	atomic_set(&net->ipv4.rt_genid, 0);
3556 	atomic_set(&net->fnhe_genid, 0);
3557 	atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
3558 	return 0;
3559 }
3560 
3561 static __net_initdata struct pernet_operations rt_genid_ops = {
3562 	.init = rt_genid_init,
3563 };
3564 
3565 static int __net_init ipv4_inetpeer_init(struct net *net)
3566 {
3567 	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3568 
3569 	if (!bp)
3570 		return -ENOMEM;
3571 	inet_peer_base_init(bp);
3572 	net->ipv4.peers = bp;
3573 	return 0;
3574 }
3575 
3576 static void __net_exit ipv4_inetpeer_exit(struct net *net)
3577 {
3578 	struct inet_peer_base *bp = net->ipv4.peers;
3579 
3580 	net->ipv4.peers = NULL;
3581 	inetpeer_invalidate_tree(bp);
3582 	kfree(bp);
3583 }
3584 
3585 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3586 	.init	=	ipv4_inetpeer_init,
3587 	.exit	=	ipv4_inetpeer_exit,
3588 };
3589 
3590 #ifdef CONFIG_IP_ROUTE_CLASSID
3591 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3592 #endif /* CONFIG_IP_ROUTE_CLASSID */
3593 
3594 int __init ip_rt_init(void)
3595 {
3596 	void *idents_hash;
3597 	int cpu;
3598 
3599 	/* For modern hosts, this will use 2 MB of memory */
3600 	idents_hash = alloc_large_system_hash("IP idents",
3601 					      sizeof(*ip_idents) + sizeof(*ip_tstamps),
3602 					      0,
3603 					      16, /* one bucket per 64 KB */
3604 					      HASH_ZERO,
3605 					      NULL,
3606 					      &ip_idents_mask,
3607 					      2048,
3608 					      256*1024);
3609 
3610 	ip_idents = idents_hash;
3611 
3612 	prandom_bytes(ip_idents, (ip_idents_mask + 1) * sizeof(*ip_idents));
3613 
3614 	ip_tstamps = idents_hash + (ip_idents_mask + 1) * sizeof(*ip_idents);
3615 
3616 	for_each_possible_cpu(cpu) {
3617 		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3618 
3619 		INIT_LIST_HEAD(&ul->head);
3620 		spin_lock_init(&ul->lock);
3621 	}
3622 #ifdef CONFIG_IP_ROUTE_CLASSID
3623 	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3624 	if (!ip_rt_acct)
3625 		panic("IP: failed to allocate ip_rt_acct\n");
3626 #endif
3627 
3628 	ipv4_dst_ops.kmem_cachep =
3629 		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3630 				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3631 
3632 	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3633 
3634 	if (dst_entries_init(&ipv4_dst_ops) < 0)
3635 		panic("IP: failed to allocate ipv4_dst_ops counter\n");
3636 
3637 	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3638 		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3639 
3640 	ipv4_dst_ops.gc_thresh = ~0;
3641 	ip_rt_max_size = INT_MAX;
3642 
3643 	devinet_init();
3644 	ip_fib_init();
3645 
3646 	if (ip_rt_proc_init())
3647 		pr_err("Unable to create route proc files\n");
3648 #ifdef CONFIG_XFRM
3649 	xfrm_init();
3650 	xfrm4_init();
3651 #endif
3652 	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
3653 		      RTNL_FLAG_DOIT_UNLOCKED);
3654 
3655 #ifdef CONFIG_SYSCTL
3656 	register_pernet_subsys(&sysctl_route_ops);
3657 #endif
3658 	register_pernet_subsys(&rt_genid_ops);
3659 	register_pernet_subsys(&ipv4_inetpeer_ops);
3660 	return 0;
3661 }
3662 
3663 #ifdef CONFIG_SYSCTL
3664 /*
3665  * We really need to sanitize the damn ipv4 init order, then all
3666  * this nonsense will go away.
3667  */
3668 void __init ip_static_sysctl_init(void)
3669 {
3670 	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3671 }
3672 #endif
3673