1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		ROUTE - implementation of the IP router.
8  *
9  * Authors:	Ross Biro
10  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
11  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
12  *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
13  *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
14  *
15  * Fixes:
16  *		Alan Cox	:	Verify area fixes.
17  *		Alan Cox	:	cli() protects routing changes
18  *		Rui Oliveira	:	ICMP routing table updates
19  *		(rco@di.uminho.pt)	Routing table insertion and update
20  *		Linus Torvalds	:	Rewrote bits to be sensible
21  *		Alan Cox	:	Added BSD route gw semantics
22  *		Alan Cox	:	Super /proc >4K
23  *		Alan Cox	:	MTU in route table
24  *		Alan Cox	: 	MSS actually. Also added the window
25  *					clamper.
26  *		Sam Lantinga	:	Fixed route matching in rt_del()
27  *		Alan Cox	:	Routing cache support.
28  *		Alan Cox	:	Removed compatibility cruft.
29  *		Alan Cox	:	RTF_REJECT support.
30  *		Alan Cox	:	TCP irtt support.
31  *		Jonathan Naylor	:	Added Metric support.
32  *	Miquel van Smoorenburg	:	BSD API fixes.
33  *	Miquel van Smoorenburg	:	Metrics.
34  *		Alan Cox	:	Use __u32 properly
35  *		Alan Cox	:	Aligned routing errors more closely with BSD
36  *					our system is still very different.
37  *		Alan Cox	:	Faster /proc handling
38  *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
39  *					routing caches and better behaviour.
40  *
41  *		Olaf Erb	:	irtt wasn't being copied right.
42  *		Bjorn Ekwall	:	Kerneld route support.
43  *		Alan Cox	:	Multicast fixed (I hope)
44  * 		Pavel Krauz	:	Limited broadcast fixed
45  *		Mike McLagan	:	Routing by source
46  *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
47  *					route.c and rewritten from scratch.
48  *		Andi Kleen	:	Load-limit warning messages.
49  *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
50  *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
51  *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
52  *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
53  *		Marc Boucher	:	routing by fwmark
54  *	Robert Olsson		:	Added rt_cache statistics
55  *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
56  *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
57  * 	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
58  * 	Ilia Sotnikov		:	Removed TOS from hash calculations
59  */
60 
61 #define pr_fmt(fmt) "IPv4: " fmt
62 
63 #include <linux/module.h>
64 #include <linux/uaccess.h>
65 #include <linux/bitops.h>
66 #include <linux/types.h>
67 #include <linux/kernel.h>
68 #include <linux/mm.h>
69 #include <linux/memblock.h>
70 #include <linux/string.h>
71 #include <linux/socket.h>
72 #include <linux/sockios.h>
73 #include <linux/errno.h>
74 #include <linux/in.h>
75 #include <linux/inet.h>
76 #include <linux/netdevice.h>
77 #include <linux/proc_fs.h>
78 #include <linux/init.h>
79 #include <linux/skbuff.h>
80 #include <linux/inetdevice.h>
81 #include <linux/igmp.h>
82 #include <linux/pkt_sched.h>
83 #include <linux/mroute.h>
84 #include <linux/netfilter_ipv4.h>
85 #include <linux/random.h>
86 #include <linux/rcupdate.h>
87 #include <linux/times.h>
88 #include <linux/slab.h>
89 #include <linux/jhash.h>
90 #include <net/dst.h>
91 #include <net/dst_metadata.h>
92 #include <net/net_namespace.h>
93 #include <net/protocol.h>
94 #include <net/ip.h>
95 #include <net/route.h>
96 #include <net/inetpeer.h>
97 #include <net/sock.h>
98 #include <net/ip_fib.h>
99 #include <net/nexthop.h>
100 #include <net/arp.h>
101 #include <net/tcp.h>
102 #include <net/icmp.h>
103 #include <net/xfrm.h>
104 #include <net/lwtunnel.h>
105 #include <net/netevent.h>
106 #include <net/rtnetlink.h>
107 #ifdef CONFIG_SYSCTL
108 #include <linux/sysctl.h>
109 #endif
110 #include <net/secure_seq.h>
111 #include <net/ip_tunnels.h>
112 #include <net/l3mdev.h>
113 
114 #include "fib_lookup.h"
115 
116 #define RT_FL_TOS(oldflp4) \
117 	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
118 
119 #define RT_GC_TIMEOUT (300*HZ)
120 
121 static int ip_rt_max_size;
122 static int ip_rt_redirect_number __read_mostly	= 9;
123 static int ip_rt_redirect_load __read_mostly	= HZ / 50;
124 static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
125 static int ip_rt_error_cost __read_mostly	= HZ;
126 static int ip_rt_error_burst __read_mostly	= 5 * HZ;
127 static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
128 static u32 ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
129 static int ip_rt_min_advmss __read_mostly	= 256;
130 
131 static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
132 
133 /*
134  *	Interface to generic destination cache.
135  */
136 
137 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
138 static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
139 static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
140 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
141 static void		 ipv4_link_failure(struct sk_buff *skb);
142 static void		 ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
143 					   struct sk_buff *skb, u32 mtu,
144 					   bool confirm_neigh);
145 static void		 ip_do_redirect(struct dst_entry *dst, struct sock *sk,
146 					struct sk_buff *skb);
147 static void		ipv4_dst_destroy(struct dst_entry *dst);
148 
149 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
150 {
151 	WARN_ON(1);
152 	return NULL;
153 }
154 
155 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
156 					   struct sk_buff *skb,
157 					   const void *daddr);
158 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
159 
160 static struct dst_ops ipv4_dst_ops = {
161 	.family =		AF_INET,
162 	.check =		ipv4_dst_check,
163 	.default_advmss =	ipv4_default_advmss,
164 	.mtu =			ipv4_mtu,
165 	.cow_metrics =		ipv4_cow_metrics,
166 	.destroy =		ipv4_dst_destroy,
167 	.negative_advice =	ipv4_negative_advice,
168 	.link_failure =		ipv4_link_failure,
169 	.update_pmtu =		ip_rt_update_pmtu,
170 	.redirect =		ip_do_redirect,
171 	.local_out =		__ip_local_out,
172 	.neigh_lookup =		ipv4_neigh_lookup,
173 	.confirm_neigh =	ipv4_confirm_neigh,
174 };
175 
176 #define ECN_OR_COST(class)	TC_PRIO_##class
177 
178 const __u8 ip_tos2prio[16] = {
179 	TC_PRIO_BESTEFFORT,
180 	ECN_OR_COST(BESTEFFORT),
181 	TC_PRIO_BESTEFFORT,
182 	ECN_OR_COST(BESTEFFORT),
183 	TC_PRIO_BULK,
184 	ECN_OR_COST(BULK),
185 	TC_PRIO_BULK,
186 	ECN_OR_COST(BULK),
187 	TC_PRIO_INTERACTIVE,
188 	ECN_OR_COST(INTERACTIVE),
189 	TC_PRIO_INTERACTIVE,
190 	ECN_OR_COST(INTERACTIVE),
191 	TC_PRIO_INTERACTIVE_BULK,
192 	ECN_OR_COST(INTERACTIVE_BULK),
193 	TC_PRIO_INTERACTIVE_BULK,
194 	ECN_OR_COST(INTERACTIVE_BULK)
195 };
196 EXPORT_SYMBOL(ip_tos2prio);
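/* For reference: this table is typically indexed with the four TOS bits,
 * IPTOS_TOS(tos) >> 1 (see rt_tos2priority()).  Since ECN_OR_COST(class)
 * currently expands to TC_PRIO_##class, the odd entries alias the even
 * ones; the macro merely keeps a hook for treating those values differently.
 */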
197 
198 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
199 #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
200 
201 #ifdef CONFIG_PROC_FS
202 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
203 {
204 	if (*pos)
205 		return NULL;
206 	return SEQ_START_TOKEN;
207 }
208 
209 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
210 {
211 	++*pos;
212 	return NULL;
213 }
214 
215 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
216 {
217 }
218 
219 static int rt_cache_seq_show(struct seq_file *seq, void *v)
220 {
221 	if (v == SEQ_START_TOKEN)
222 		seq_printf(seq, "%-127s\n",
223 			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
224 			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
225 			   "HHUptod\tSpecDst");
226 	return 0;
227 }
228 
229 static const struct seq_operations rt_cache_seq_ops = {
230 	.start  = rt_cache_seq_start,
231 	.next   = rt_cache_seq_next,
232 	.stop   = rt_cache_seq_stop,
233 	.show   = rt_cache_seq_show,
234 };
235 
236 static int rt_cache_seq_open(struct inode *inode, struct file *file)
237 {
238 	return seq_open(file, &rt_cache_seq_ops);
239 }
240 
241 static const struct proc_ops rt_cache_proc_ops = {
242 	.proc_open	= rt_cache_seq_open,
243 	.proc_read	= seq_read,
244 	.proc_lseek	= seq_lseek,
245 	.proc_release	= seq_release,
246 };
247 
248 
249 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
250 {
251 	int cpu;
252 
253 	if (*pos == 0)
254 		return SEQ_START_TOKEN;
255 
256 	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
257 		if (!cpu_possible(cpu))
258 			continue;
259 		*pos = cpu+1;
260 		return &per_cpu(rt_cache_stat, cpu);
261 	}
262 	return NULL;
263 }
264 
265 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
266 {
267 	int cpu;
268 
269 	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
270 		if (!cpu_possible(cpu))
271 			continue;
272 		*pos = cpu+1;
273 		return &per_cpu(rt_cache_stat, cpu);
274 	}
275 	(*pos)++;
276 	return NULL;
277 
278 }
279 
280 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
281 {
282 
283 }
284 
285 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
286 {
287 	struct rt_cache_stat *st = v;
288 
289 	if (v == SEQ_START_TOKEN) {
290 		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
291 		return 0;
292 	}
293 
294 	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
295 		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
296 		   dst_entries_get_slow(&ipv4_dst_ops),
297 		   0, /* st->in_hit */
298 		   st->in_slow_tot,
299 		   st->in_slow_mc,
300 		   st->in_no_route,
301 		   st->in_brd,
302 		   st->in_martian_dst,
303 		   st->in_martian_src,
304 
305 		   0, /* st->out_hit */
306 		   st->out_slow_tot,
307 		   st->out_slow_mc,
308 
309 		   0, /* st->gc_total */
310 		   0, /* st->gc_ignored */
311 		   0, /* st->gc_goal_miss */
312 		   0, /* st->gc_dst_overflow */
313 		   0, /* st->in_hlist_search */
314 		   0  /* st->out_hlist_search */
315 		);
316 	return 0;
317 }
318 
319 static const struct seq_operations rt_cpu_seq_ops = {
320 	.start  = rt_cpu_seq_start,
321 	.next   = rt_cpu_seq_next,
322 	.stop   = rt_cpu_seq_stop,
323 	.show   = rt_cpu_seq_show,
324 };
325 
326 
327 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
328 {
329 	return seq_open(file, &rt_cpu_seq_ops);
330 }
331 
332 static const struct proc_ops rt_cpu_proc_ops = {
333 	.proc_open	= rt_cpu_seq_open,
334 	.proc_read	= seq_read,
335 	.proc_lseek	= seq_lseek,
336 	.proc_release	= seq_release,
337 };
338 
339 #ifdef CONFIG_IP_ROUTE_CLASSID
340 static int rt_acct_proc_show(struct seq_file *m, void *v)
341 {
342 	struct ip_rt_acct *dst, *src;
343 	unsigned int i, j;
344 
345 	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
346 	if (!dst)
347 		return -ENOMEM;
348 
349 	for_each_possible_cpu(i) {
350 		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
351 		for (j = 0; j < 256; j++) {
352 			dst[j].o_bytes   += src[j].o_bytes;
353 			dst[j].o_packets += src[j].o_packets;
354 			dst[j].i_bytes   += src[j].i_bytes;
355 			dst[j].i_packets += src[j].i_packets;
356 		}
357 	}
358 
359 	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
360 	kfree(dst);
361 	return 0;
362 }
363 #endif
364 
365 static int __net_init ip_rt_do_proc_init(struct net *net)
366 {
367 	struct proc_dir_entry *pde;
368 
369 	pde = proc_create("rt_cache", 0444, net->proc_net,
370 			  &rt_cache_proc_ops);
371 	if (!pde)
372 		goto err1;
373 
374 	pde = proc_create("rt_cache", 0444,
375 			  net->proc_net_stat, &rt_cpu_proc_ops);
376 	if (!pde)
377 		goto err2;
378 
379 #ifdef CONFIG_IP_ROUTE_CLASSID
380 	pde = proc_create_single("rt_acct", 0, net->proc_net,
381 			rt_acct_proc_show);
382 	if (!pde)
383 		goto err3;
384 #endif
385 	return 0;
386 
387 #ifdef CONFIG_IP_ROUTE_CLASSID
388 err3:
389 	remove_proc_entry("rt_cache", net->proc_net_stat);
390 #endif
391 err2:
392 	remove_proc_entry("rt_cache", net->proc_net);
393 err1:
394 	return -ENOMEM;
395 }
396 
397 static void __net_exit ip_rt_do_proc_exit(struct net *net)
398 {
399 	remove_proc_entry("rt_cache", net->proc_net_stat);
400 	remove_proc_entry("rt_cache", net->proc_net);
401 #ifdef CONFIG_IP_ROUTE_CLASSID
402 	remove_proc_entry("rt_acct", net->proc_net);
403 #endif
404 }
405 
406 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
407 	.init = ip_rt_do_proc_init,
408 	.exit = ip_rt_do_proc_exit,
409 };
410 
411 static int __init ip_rt_proc_init(void)
412 {
413 	return register_pernet_subsys(&ip_rt_proc_ops);
414 }
415 
416 #else
417 static inline int ip_rt_proc_init(void)
418 {
419 	return 0;
420 }
421 #endif /* CONFIG_PROC_FS */
422 
423 static inline bool rt_is_expired(const struct rtable *rth)
424 {
425 	return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
426 }
427 
428 void rt_cache_flush(struct net *net)
429 {
430 	rt_genid_bump_ipv4(net);
431 }
432 
433 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
434 					   struct sk_buff *skb,
435 					   const void *daddr)
436 {
437 	const struct rtable *rt = container_of(dst, struct rtable, dst);
438 	struct net_device *dev = dst->dev;
439 	struct neighbour *n;
440 
441 	rcu_read_lock_bh();
442 
443 	if (likely(rt->rt_gw_family == AF_INET)) {
444 		n = ip_neigh_gw4(dev, rt->rt_gw4);
445 	} else if (rt->rt_gw_family == AF_INET6) {
446 		n = ip_neigh_gw6(dev, &rt->rt_gw6);
447 	} else {
448 		__be32 pkey;
449 
450 		pkey = skb ? ip_hdr(skb)->daddr : *((__be32 *) daddr);
451 		n = ip_neigh_gw4(dev, pkey);
452 	}
453 
454 	if (!IS_ERR(n) && !refcount_inc_not_zero(&n->refcnt))
455 		n = NULL;
456 
457 	rcu_read_unlock_bh();
458 
459 	return n;
460 }
461 
462 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
463 {
464 	const struct rtable *rt = container_of(dst, struct rtable, dst);
465 	struct net_device *dev = dst->dev;
466 	const __be32 *pkey = daddr;
467 
468 	if (rt->rt_gw_family == AF_INET) {
469 		pkey = (const __be32 *)&rt->rt_gw4;
470 	} else if (rt->rt_gw_family == AF_INET6) {
471 		return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6);
472 	} else if (!daddr ||
473 		 (rt->rt_flags &
474 		  (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) {
475 		return;
476 	}
477 	__ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
478 }
479 
480 /* Hash tables of size 2048..262144 depending on RAM size.
481  * Each bucket uses 8 bytes.
482  */
483 static u32 ip_idents_mask __read_mostly;
484 static atomic_t *ip_idents __read_mostly;
485 static u32 *ip_tstamps __read_mostly;
486 
487 /* In order to protect privacy, we add a perturbation to identifiers
488  * if one generator is seldom used. This makes it hard for an attacker
489  * to infer how many packets were sent between two points in time.
490  */
491 u32 ip_idents_reserve(u32 hash, int segs)
492 {
493 	u32 bucket, old, now = (u32)jiffies;
494 	atomic_t *p_id;
495 	u32 *p_tstamp;
496 	u32 delta = 0;
497 
498 	bucket = hash & ip_idents_mask;
499 	p_tstamp = ip_tstamps + bucket;
500 	p_id = ip_idents + bucket;
501 	old = READ_ONCE(*p_tstamp);
502 
503 	if (old != now && cmpxchg(p_tstamp, old, now) == old)
504 		delta = prandom_u32_max(now - old);
505 
506 	/* If UBSAN reports an error here, please make sure your compiler
507 	 * supports -fno-strict-overflow before reporting it; that was a bug
508 	 * in UBSAN, and it has been fixed in GCC-8.
509 	 */
510 	return atomic_add_return(segs + delta, p_id) - segs;
511 }
512 EXPORT_SYMBOL(ip_idents_reserve);
513 
514 void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
515 {
516 	u32 hash, id;
517 
518 	/* Note the following code is not safe, but this is okay. */
519 	if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
520 		get_random_bytes(&net->ipv4.ip_id_key,
521 				 sizeof(net->ipv4.ip_id_key));
522 
523 	hash = siphash_3u32((__force u32)iph->daddr,
524 			    (__force u32)iph->saddr,
525 			    iph->protocol,
526 			    &net->ipv4.ip_id_key);
527 	id = ip_idents_reserve(hash, segs);
528 	iph->id = htons(id);
529 }
530 EXPORT_SYMBOL(__ip_select_ident);
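/* Sketch of the scheme above: (daddr, saddr, protocol) is siphashed with a
 * lazily initialized per-netns key to pick a bucket in ip_idents[]; the
 * bucket counter is advanced by the number of segments, plus a random
 * delta when the bucket has been idle, so observed IDs reveal little about
 * how many packets were sent in between.
 */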
531 
532 static void ip_rt_fix_tos(struct flowi4 *fl4)
533 {
534 	__u8 tos = RT_FL_TOS(fl4);
535 
536 	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
537 	fl4->flowi4_scope = tos & RTO_ONLINK ?
538 			    RT_SCOPE_LINK : RT_SCOPE_UNIVERSE;
539 }
540 
541 static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
542 			     const struct sock *sk,
543 			     const struct iphdr *iph,
544 			     int oif, u8 tos,
545 			     u8 prot, u32 mark, int flow_flags)
546 {
547 	if (sk) {
548 		const struct inet_sock *inet = inet_sk(sk);
549 
550 		oif = sk->sk_bound_dev_if;
551 		mark = sk->sk_mark;
552 		tos = RT_CONN_FLAGS(sk);
553 		prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
554 	}
555 	flowi4_init_output(fl4, oif, mark, tos,
556 			   RT_SCOPE_UNIVERSE, prot,
557 			   flow_flags,
558 			   iph->daddr, iph->saddr, 0, 0,
559 			   sock_net_uid(net, sk));
560 }
561 
562 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
563 			       const struct sock *sk)
564 {
565 	const struct net *net = dev_net(skb->dev);
566 	const struct iphdr *iph = ip_hdr(skb);
567 	int oif = skb->dev->ifindex;
568 	u8 tos = RT_TOS(iph->tos);
569 	u8 prot = iph->protocol;
570 	u32 mark = skb->mark;
571 
572 	__build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
573 }
574 
575 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
576 {
577 	const struct inet_sock *inet = inet_sk(sk);
578 	const struct ip_options_rcu *inet_opt;
579 	__be32 daddr = inet->inet_daddr;
580 
581 	rcu_read_lock();
582 	inet_opt = rcu_dereference(inet->inet_opt);
583 	if (inet_opt && inet_opt->opt.srr)
584 		daddr = inet_opt->opt.faddr;
585 	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
586 			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
587 			   inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
588 			   inet_sk_flowi_flags(sk),
589 			   daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
590 	rcu_read_unlock();
591 }
592 
593 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
594 				 const struct sk_buff *skb)
595 {
596 	if (skb)
597 		build_skb_flow_key(fl4, skb, sk);
598 	else
599 		build_sk_flow_key(fl4, sk);
600 }
601 
602 static DEFINE_SPINLOCK(fnhe_lock);
603 
604 static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
605 {
606 	struct rtable *rt;
607 
608 	rt = rcu_dereference(fnhe->fnhe_rth_input);
609 	if (rt) {
610 		RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
611 		dst_dev_put(&rt->dst);
612 		dst_release(&rt->dst);
613 	}
614 	rt = rcu_dereference(fnhe->fnhe_rth_output);
615 	if (rt) {
616 		RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
617 		dst_dev_put(&rt->dst);
618 		dst_release(&rt->dst);
619 	}
620 }
621 
622 static void fnhe_remove_oldest(struct fnhe_hash_bucket *hash)
623 {
624 	struct fib_nh_exception __rcu **fnhe_p, **oldest_p;
625 	struct fib_nh_exception *fnhe, *oldest = NULL;
626 
627 	for (fnhe_p = &hash->chain; ; fnhe_p = &fnhe->fnhe_next) {
628 		fnhe = rcu_dereference_protected(*fnhe_p,
629 						 lockdep_is_held(&fnhe_lock));
630 		if (!fnhe)
631 			break;
632 		if (!oldest ||
633 		    time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp)) {
634 			oldest = fnhe;
635 			oldest_p = fnhe_p;
636 		}
637 	}
638 	fnhe_flush_routes(oldest);
639 	*oldest_p = oldest->fnhe_next;
640 	kfree_rcu(oldest, rcu);
641 }
642 
643 static u32 fnhe_hashfun(__be32 daddr)
644 {
645 	static siphash_key_t fnhe_hash_key __read_mostly;
646 	u64 hval;
647 
648 	net_get_random_once(&fnhe_hash_key, sizeof(fnhe_hash_key));
649 	hval = siphash_1u32((__force u32)daddr, &fnhe_hash_key);
650 	return hash_64(hval, FNHE_HASH_SHIFT);
651 }
652 
653 static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
654 {
655 	rt->rt_pmtu = fnhe->fnhe_pmtu;
656 	rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
657 	rt->dst.expires = fnhe->fnhe_expires;
658 
659 	if (fnhe->fnhe_gw) {
660 		rt->rt_flags |= RTCF_REDIRECTED;
661 		rt->rt_uses_gateway = 1;
662 		rt->rt_gw_family = AF_INET;
663 		rt->rt_gw4 = fnhe->fnhe_gw;
664 	}
665 }
666 
667 static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr,
668 				  __be32 gw, u32 pmtu, bool lock,
669 				  unsigned long expires)
670 {
671 	struct fnhe_hash_bucket *hash;
672 	struct fib_nh_exception *fnhe;
673 	struct rtable *rt;
674 	u32 genid, hval;
675 	unsigned int i;
676 	int depth;
677 
678 	genid = fnhe_genid(dev_net(nhc->nhc_dev));
679 	hval = fnhe_hashfun(daddr);
680 
681 	spin_lock_bh(&fnhe_lock);
682 
683 	hash = rcu_dereference(nhc->nhc_exceptions);
684 	if (!hash) {
685 		hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
686 		if (!hash)
687 			goto out_unlock;
688 		rcu_assign_pointer(nhc->nhc_exceptions, hash);
689 	}
690 
691 	hash += hval;
692 
693 	depth = 0;
694 	for (fnhe = rcu_dereference(hash->chain); fnhe;
695 	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
696 		if (fnhe->fnhe_daddr == daddr)
697 			break;
698 		depth++;
699 	}
700 
701 	if (fnhe) {
702 		if (fnhe->fnhe_genid != genid)
703 			fnhe->fnhe_genid = genid;
704 		if (gw)
705 			fnhe->fnhe_gw = gw;
706 		if (pmtu) {
707 			fnhe->fnhe_pmtu = pmtu;
708 			fnhe->fnhe_mtu_locked = lock;
709 		}
710 		fnhe->fnhe_expires = max(1UL, expires);
711 		/* Update all cached dsts too */
712 		rt = rcu_dereference(fnhe->fnhe_rth_input);
713 		if (rt)
714 			fill_route_from_fnhe(rt, fnhe);
715 		rt = rcu_dereference(fnhe->fnhe_rth_output);
716 		if (rt)
717 			fill_route_from_fnhe(rt, fnhe);
718 	} else {
719 		/* Randomize max depth to avoid some side channels attacks. */
720 		int max_depth = FNHE_RECLAIM_DEPTH +
721 				prandom_u32_max(FNHE_RECLAIM_DEPTH);
722 
723 		while (depth > max_depth) {
724 			fnhe_remove_oldest(hash);
725 			depth--;
726 		}
727 
728 		fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
729 		if (!fnhe)
730 			goto out_unlock;
731 
732 		fnhe->fnhe_next = hash->chain;
733 
734 		fnhe->fnhe_genid = genid;
735 		fnhe->fnhe_daddr = daddr;
736 		fnhe->fnhe_gw = gw;
737 		fnhe->fnhe_pmtu = pmtu;
738 		fnhe->fnhe_mtu_locked = lock;
739 		fnhe->fnhe_expires = max(1UL, expires);
740 
741 		rcu_assign_pointer(hash->chain, fnhe);
742 
743 		/* Exception created; mark the cached routes for the nexthop
744 		 * stale, so anyone caching it rechecks if this exception
745 		 * applies to them.
746 		 */
747 		rt = rcu_dereference(nhc->nhc_rth_input);
748 		if (rt)
749 			rt->dst.obsolete = DST_OBSOLETE_KILL;
750 
751 		for_each_possible_cpu(i) {
752 			struct rtable __rcu **prt;
753 			prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i);
754 			rt = rcu_dereference(*prt);
755 			if (rt)
756 				rt->dst.obsolete = DST_OBSOLETE_KILL;
757 		}
758 	}
759 
760 	fnhe->fnhe_stamp = jiffies;
761 
762 out_unlock:
763 	spin_unlock_bh(&fnhe_lock);
764 }
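/* Summary of the exception handling above: entries live in a small
 * per-nexthop hash protected by fnhe_lock.  An existing entry for daddr is
 * updated in place; otherwise the chain is trimmed to a randomized maximum
 * depth, a new entry is inserted at the head, and the nexthop's cached
 * routes are marked DST_OBSOLETE_KILL so they are re-validated against the
 * new exception.
 */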
765 
766 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
767 			     bool kill_route)
768 {
769 	__be32 new_gw = icmp_hdr(skb)->un.gateway;
770 	__be32 old_gw = ip_hdr(skb)->saddr;
771 	struct net_device *dev = skb->dev;
772 	struct in_device *in_dev;
773 	struct fib_result res;
774 	struct neighbour *n;
775 	struct net *net;
776 
777 	switch (icmp_hdr(skb)->code & 7) {
778 	case ICMP_REDIR_NET:
779 	case ICMP_REDIR_NETTOS:
780 	case ICMP_REDIR_HOST:
781 	case ICMP_REDIR_HOSTTOS:
782 		break;
783 
784 	default:
785 		return;
786 	}
787 
788 	if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw)
789 		return;
790 
791 	in_dev = __in_dev_get_rcu(dev);
792 	if (!in_dev)
793 		return;
794 
795 	net = dev_net(dev);
796 	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
797 	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
798 	    ipv4_is_zeronet(new_gw))
799 		goto reject_redirect;
800 
801 	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
802 		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
803 			goto reject_redirect;
804 		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
805 			goto reject_redirect;
806 	} else {
807 		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
808 			goto reject_redirect;
809 	}
810 
811 	n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
812 	if (!n)
813 		n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
814 	if (!IS_ERR(n)) {
815 		if (!(n->nud_state & NUD_VALID)) {
816 			neigh_event_send(n, NULL);
817 		} else {
818 			if (fib_lookup(net, fl4, &res, 0) == 0) {
819 				struct fib_nh_common *nhc;
820 
821 				fib_select_path(net, &res, fl4, skb);
822 				nhc = FIB_RES_NHC(res);
823 				update_or_create_fnhe(nhc, fl4->daddr, new_gw,
824 						0, false,
825 						jiffies + ip_rt_gc_timeout);
826 			}
827 			if (kill_route)
828 				rt->dst.obsolete = DST_OBSOLETE_KILL;
829 			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
830 		}
831 		neigh_release(n);
832 	}
833 	return;
834 
835 reject_redirect:
836 #ifdef CONFIG_IP_ROUTE_VERBOSE
837 	if (IN_DEV_LOG_MARTIANS(in_dev)) {
838 		const struct iphdr *iph = (const struct iphdr *) skb->data;
839 		__be32 daddr = iph->daddr;
840 		__be32 saddr = iph->saddr;
841 
842 		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
843 				     "  Advised path = %pI4 -> %pI4\n",
844 				     &old_gw, dev->name, &new_gw,
845 				     &saddr, &daddr);
846 	}
847 #endif
848 	;
849 }
850 
851 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
852 {
853 	struct rtable *rt;
854 	struct flowi4 fl4;
855 	const struct iphdr *iph = (const struct iphdr *) skb->data;
856 	struct net *net = dev_net(skb->dev);
857 	int oif = skb->dev->ifindex;
858 	u8 tos = RT_TOS(iph->tos);
859 	u8 prot = iph->protocol;
860 	u32 mark = skb->mark;
861 
862 	rt = (struct rtable *) dst;
863 
864 	__build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
865 	ip_rt_fix_tos(&fl4);
866 	__ip_do_redirect(rt, skb, &fl4, true);
867 }
868 
869 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
870 {
871 	struct rtable *rt = (struct rtable *)dst;
872 	struct dst_entry *ret = dst;
873 
874 	if (rt) {
875 		if (dst->obsolete > 0) {
876 			ip_rt_put(rt);
877 			ret = NULL;
878 		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
879 			   rt->dst.expires) {
880 			ip_rt_put(rt);
881 			ret = NULL;
882 		}
883 	}
884 	return ret;
885 }
886 
887 /*
888  * Algorithm:
889  *	1. The first ip_rt_redirect_number redirects are sent
890  *	   with exponential backoff, then we stop sending them at all,
891  *	   assuming that the host ignores our redirects.
892  *	2. If we did not see packets requiring redirects
893  *	   during ip_rt_redirect_silence, we assume that the host
894  *	   forgot redirected route and start to send redirects again.
895  *
896  * This algorithm is much cheaper and more intelligent than dumb load limiting
897  * in icmp.c.
898  *
899  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
900  * and "frag. need" (breaks PMTU discovery) in icmp.c.
901  */
902 
903 void ip_rt_send_redirect(struct sk_buff *skb)
904 {
905 	struct rtable *rt = skb_rtable(skb);
906 	struct in_device *in_dev;
907 	struct inet_peer *peer;
908 	struct net *net;
909 	int log_martians;
910 	int vif;
911 
912 	rcu_read_lock();
913 	in_dev = __in_dev_get_rcu(rt->dst.dev);
914 	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
915 		rcu_read_unlock();
916 		return;
917 	}
918 	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
919 	vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
920 	rcu_read_unlock();
921 
922 	net = dev_net(rt->dst.dev);
923 	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
924 	if (!peer) {
925 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
926 			  rt_nexthop(rt, ip_hdr(skb)->daddr));
927 		return;
928 	}
929 
930 	/* No redirected packets during ip_rt_redirect_silence;
931 	 * reset the algorithm.
932 	 */
933 	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
934 		peer->rate_tokens = 0;
935 		peer->n_redirects = 0;
936 	}
937 
938 	/* Too many ignored redirects; do not send anything.
939 	 * Set dst.rate_last to the last seen redirected packet.
940 	 */
941 	if (peer->n_redirects >= ip_rt_redirect_number) {
942 		peer->rate_last = jiffies;
943 		goto out_put_peer;
944 	}
945 
946 	/* Check for load limit; set rate_last to the latest sent
947 	 * redirect.
948 	 */
949 	if (peer->n_redirects == 0 ||
950 	    time_after(jiffies,
951 		       (peer->rate_last +
952 			(ip_rt_redirect_load << peer->n_redirects)))) {
953 		__be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
954 
955 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
956 		peer->rate_last = jiffies;
957 		++peer->n_redirects;
958 #ifdef CONFIG_IP_ROUTE_VERBOSE
959 		if (log_martians &&
960 		    peer->n_redirects == ip_rt_redirect_number)
961 			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
962 					     &ip_hdr(skb)->saddr, inet_iif(skb),
963 					     &ip_hdr(skb)->daddr, &gw);
964 #endif
965 	}
966 out_put_peer:
967 	inet_putpeer(peer);
968 }
969 
970 static int ip_error(struct sk_buff *skb)
971 {
972 	struct rtable *rt = skb_rtable(skb);
973 	struct net_device *dev = skb->dev;
974 	struct in_device *in_dev;
975 	struct inet_peer *peer;
976 	unsigned long now;
977 	struct net *net;
978 	bool send;
979 	int code;
980 
981 	if (netif_is_l3_master(skb->dev)) {
982 		dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
983 		if (!dev)
984 			goto out;
985 	}
986 
987 	in_dev = __in_dev_get_rcu(dev);
988 
989 	/* IP on this device is disabled. */
990 	if (!in_dev)
991 		goto out;
992 
993 	net = dev_net(rt->dst.dev);
994 	if (!IN_DEV_FORWARD(in_dev)) {
995 		switch (rt->dst.error) {
996 		case EHOSTUNREACH:
997 			__IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
998 			break;
999 
1000 		case ENETUNREACH:
1001 			__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
1002 			break;
1003 		}
1004 		goto out;
1005 	}
1006 
1007 	switch (rt->dst.error) {
1008 	case EINVAL:
1009 	default:
1010 		goto out;
1011 	case EHOSTUNREACH:
1012 		code = ICMP_HOST_UNREACH;
1013 		break;
1014 	case ENETUNREACH:
1015 		code = ICMP_NET_UNREACH;
1016 		__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
1017 		break;
1018 	case EACCES:
1019 		code = ICMP_PKT_FILTERED;
1020 		break;
1021 	}
1022 
1023 	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
1024 			       l3mdev_master_ifindex(skb->dev), 1);
1025 
1026 	send = true;
1027 	if (peer) {
1028 		now = jiffies;
1029 		peer->rate_tokens += now - peer->rate_last;
1030 		if (peer->rate_tokens > ip_rt_error_burst)
1031 			peer->rate_tokens = ip_rt_error_burst;
1032 		peer->rate_last = now;
1033 		if (peer->rate_tokens >= ip_rt_error_cost)
1034 			peer->rate_tokens -= ip_rt_error_cost;
1035 		else
1036 			send = false;
1037 		inet_putpeer(peer);
1038 	}
1039 	if (send)
1040 		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1041 
1042 out:	kfree_skb(skb);
1043 	return 0;
1044 }
1045 
1046 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1047 {
1048 	struct dst_entry *dst = &rt->dst;
1049 	struct net *net = dev_net(dst->dev);
1050 	struct fib_result res;
1051 	bool lock = false;
1052 	u32 old_mtu;
1053 
1054 	if (ip_mtu_locked(dst))
1055 		return;
1056 
1057 	old_mtu = ipv4_mtu(dst);
1058 	if (old_mtu < mtu)
1059 		return;
1060 
1061 	if (mtu < ip_rt_min_pmtu) {
1062 		lock = true;
1063 		mtu = min(old_mtu, ip_rt_min_pmtu);
1064 	}
1065 
1066 	if (rt->rt_pmtu == mtu && !lock &&
1067 	    time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
1068 		return;
1069 
1070 	rcu_read_lock();
1071 	if (fib_lookup(net, fl4, &res, 0) == 0) {
1072 		struct fib_nh_common *nhc;
1073 
1074 		fib_select_path(net, &res, fl4, NULL);
1075 		nhc = FIB_RES_NHC(res);
1076 		update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
1077 				      jiffies + ip_rt_mtu_expires);
1078 	}
1079 	rcu_read_unlock();
1080 }
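/* Note on the helper above: learned PMTU values are not stored on the dst
 * itself but recorded as a nexthop exception via update_or_create_fnhe();
 * values below ip_rt_min_pmtu are raised to it and the MTU is locked, and
 * the exception expires after ip_rt_mtu_expires.
 */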
1081 
1082 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1083 			      struct sk_buff *skb, u32 mtu,
1084 			      bool confirm_neigh)
1085 {
1086 	struct rtable *rt = (struct rtable *) dst;
1087 	struct flowi4 fl4;
1088 
1089 	ip_rt_build_flow_key(&fl4, sk, skb);
1090 	ip_rt_fix_tos(&fl4);
1091 
1092 	/* Don't make lookup fail for bridged encapsulations */
1093 	if (skb && netif_is_any_bridge_port(skb->dev))
1094 		fl4.flowi4_oif = 0;
1095 
1096 	__ip_rt_update_pmtu(rt, &fl4, mtu);
1097 }
1098 
1099 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1100 		      int oif, u8 protocol)
1101 {
1102 	const struct iphdr *iph = (const struct iphdr *)skb->data;
1103 	struct flowi4 fl4;
1104 	struct rtable *rt;
1105 	u32 mark = IP4_REPLY_MARK(net, skb->mark);
1106 
1107 	__build_flow_key(net, &fl4, NULL, iph, oif,
1108 			 RT_TOS(iph->tos), protocol, mark, 0);
1109 	rt = __ip_route_output_key(net, &fl4);
1110 	if (!IS_ERR(rt)) {
1111 		__ip_rt_update_pmtu(rt, &fl4, mtu);
1112 		ip_rt_put(rt);
1113 	}
1114 }
1115 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1116 
1117 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1118 {
1119 	const struct iphdr *iph = (const struct iphdr *)skb->data;
1120 	struct flowi4 fl4;
1121 	struct rtable *rt;
1122 
1123 	__build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1124 
1125 	if (!fl4.flowi4_mark)
1126 		fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1127 
1128 	rt = __ip_route_output_key(sock_net(sk), &fl4);
1129 	if (!IS_ERR(rt)) {
1130 		__ip_rt_update_pmtu(rt, &fl4, mtu);
1131 		ip_rt_put(rt);
1132 	}
1133 }
1134 
1135 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1136 {
1137 	const struct iphdr *iph = (const struct iphdr *)skb->data;
1138 	struct flowi4 fl4;
1139 	struct rtable *rt;
1140 	struct dst_entry *odst = NULL;
1141 	bool new = false;
1142 	struct net *net = sock_net(sk);
1143 
1144 	bh_lock_sock(sk);
1145 
1146 	if (!ip_sk_accept_pmtu(sk))
1147 		goto out;
1148 
1149 	odst = sk_dst_get(sk);
1150 
1151 	if (sock_owned_by_user(sk) || !odst) {
1152 		__ipv4_sk_update_pmtu(skb, sk, mtu);
1153 		goto out;
1154 	}
1155 
1156 	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1157 
1158 	rt = (struct rtable *)odst;
1159 	if (odst->obsolete && !odst->ops->check(odst, 0)) {
1160 		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1161 		if (IS_ERR(rt))
1162 			goto out;
1163 
1164 		new = true;
1165 	} else {
1166 		ip_rt_fix_tos(&fl4);
1167 	}
1168 
1169 	__ip_rt_update_pmtu((struct rtable *)xfrm_dst_path(&rt->dst), &fl4, mtu);
1170 
1171 	if (!dst_check(&rt->dst, 0)) {
1172 		if (new)
1173 			dst_release(&rt->dst);
1174 
1175 		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1176 		if (IS_ERR(rt))
1177 			goto out;
1178 
1179 		new = true;
1180 	}
1181 
1182 	if (new)
1183 		sk_dst_set(sk, &rt->dst);
1184 
1185 out:
1186 	bh_unlock_sock(sk);
1187 	dst_release(odst);
1188 }
1189 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1190 
1191 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1192 		   int oif, u8 protocol)
1193 {
1194 	const struct iphdr *iph = (const struct iphdr *)skb->data;
1195 	struct flowi4 fl4;
1196 	struct rtable *rt;
1197 
1198 	__build_flow_key(net, &fl4, NULL, iph, oif,
1199 			 RT_TOS(iph->tos), protocol, 0, 0);
1200 	rt = __ip_route_output_key(net, &fl4);
1201 	if (!IS_ERR(rt)) {
1202 		__ip_do_redirect(rt, skb, &fl4, false);
1203 		ip_rt_put(rt);
1204 	}
1205 }
1206 EXPORT_SYMBOL_GPL(ipv4_redirect);
1207 
1208 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1209 {
1210 	const struct iphdr *iph = (const struct iphdr *)skb->data;
1211 	struct flowi4 fl4;
1212 	struct rtable *rt;
1213 	struct net *net = sock_net(sk);
1214 
1215 	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1216 	rt = __ip_route_output_key(net, &fl4);
1217 	if (!IS_ERR(rt)) {
1218 		__ip_do_redirect(rt, skb, &fl4, false);
1219 		ip_rt_put(rt);
1220 	}
1221 }
1222 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1223 
1224 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1225 {
1226 	struct rtable *rt = (struct rtable *) dst;
1227 
1228 	/* All IPV4 dsts are created with ->obsolete set to the value
1229 	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1230 	 * into this function always.
1231 	 *
1232 	 * When a PMTU/redirect information update invalidates a route,
1233 	 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1234 	 * DST_OBSOLETE_DEAD.
1235 	 */
1236 	if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1237 		return NULL;
1238 	return dst;
1239 }
1240 
1241 static void ipv4_send_dest_unreach(struct sk_buff *skb)
1242 {
1243 	struct net_device *dev;
1244 	struct ip_options opt;
1245 	int res;
1246 
1247 	/* Recompile ip options since IPCB may not be valid anymore.
1248 	 * Also check we have a reasonable ipv4 header.
1249 	 */
1250 	if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
1251 	    ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
1252 		return;
1253 
1254 	memset(&opt, 0, sizeof(opt));
1255 	if (ip_hdr(skb)->ihl > 5) {
1256 		if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
1257 			return;
1258 		opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);
1259 
1260 		rcu_read_lock();
1261 		dev = skb->dev ? skb->dev : skb_rtable(skb)->dst.dev;
1262 		res = __ip_options_compile(dev_net(dev), &opt, skb, NULL);
1263 		rcu_read_unlock();
1264 
1265 		if (res)
1266 			return;
1267 	}
1268 	__icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
1269 }
1270 
1271 static void ipv4_link_failure(struct sk_buff *skb)
1272 {
1273 	struct rtable *rt;
1274 
1275 	ipv4_send_dest_unreach(skb);
1276 
1277 	rt = skb_rtable(skb);
1278 	if (rt)
1279 		dst_set_expires(&rt->dst, 0);
1280 }
1281 
1282 static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1283 {
1284 	pr_debug("%s: %pI4 -> %pI4, %s\n",
1285 		 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1286 		 skb->dev ? skb->dev->name : "?");
1287 	kfree_skb(skb);
1288 	WARN_ON(1);
1289 	return 0;
1290 }
1291 
1292 /*
1293    We do not cache the source address of the outgoing interface,
1294    because it is used only by the IP RR, TS and SRR options,
1295    so it is out of the fast path.
1296 
1297    BTW remember: "addr" is allowed to be not aligned
1298    in IP options!
1299  */
1300 
1301 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1302 {
1303 	__be32 src;
1304 
1305 	if (rt_is_output_route(rt))
1306 		src = ip_hdr(skb)->saddr;
1307 	else {
1308 		struct fib_result res;
1309 		struct iphdr *iph = ip_hdr(skb);
1310 		struct flowi4 fl4 = {
1311 			.daddr = iph->daddr,
1312 			.saddr = iph->saddr,
1313 			.flowi4_tos = RT_TOS(iph->tos),
1314 			.flowi4_oif = rt->dst.dev->ifindex,
1315 			.flowi4_iif = skb->dev->ifindex,
1316 			.flowi4_mark = skb->mark,
1317 		};
1318 
1319 		rcu_read_lock();
1320 		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1321 			src = fib_result_prefsrc(dev_net(rt->dst.dev), &res);
1322 		else
1323 			src = inet_select_addr(rt->dst.dev,
1324 					       rt_nexthop(rt, iph->daddr),
1325 					       RT_SCOPE_UNIVERSE);
1326 		rcu_read_unlock();
1327 	}
1328 	memcpy(addr, &src, 4);
1329 }
1330 
1331 #ifdef CONFIG_IP_ROUTE_CLASSID
1332 static void set_class_tag(struct rtable *rt, u32 tag)
1333 {
1334 	if (!(rt->dst.tclassid & 0xFFFF))
1335 		rt->dst.tclassid |= tag & 0xFFFF;
1336 	if (!(rt->dst.tclassid & 0xFFFF0000))
1337 		rt->dst.tclassid |= tag & 0xFFFF0000;
1338 }
1339 #endif
1340 
1341 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1342 {
1343 	unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
1344 	unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
1345 				    ip_rt_min_advmss);
1346 
1347 	return min(advmss, IPV4_MAX_PMTU - header_size);
1348 }
1349 
1350 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1351 {
1352 	const struct rtable *rt = (const struct rtable *)dst;
1353 	unsigned int mtu = rt->rt_pmtu;
1354 
1355 	if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1356 		mtu = dst_metric_raw(dst, RTAX_MTU);
1357 
1358 	if (mtu)
1359 		goto out;
1360 
1361 	mtu = READ_ONCE(dst->dev->mtu);
1362 
1363 	if (unlikely(ip_mtu_locked(dst))) {
1364 		if (rt->rt_uses_gateway && mtu > 576)
1365 			mtu = 576;
1366 	}
1367 
1368 out:
1369 	mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
1370 
1371 	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1372 }
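/* MTU selection order used above: the cached rt_pmtu while it has not
 * expired, then the RTAX_MTU metric, then the device MTU (capped at 576
 * for MTU-locked routes that use a gateway), clamped to IP_MAX_MTU and
 * reduced by any lwtunnel encapsulation headroom.
 */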
1373 
1374 static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr)
1375 {
1376 	struct fnhe_hash_bucket *hash;
1377 	struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1378 	u32 hval = fnhe_hashfun(daddr);
1379 
1380 	spin_lock_bh(&fnhe_lock);
1381 
1382 	hash = rcu_dereference_protected(nhc->nhc_exceptions,
1383 					 lockdep_is_held(&fnhe_lock));
1384 	hash += hval;
1385 
1386 	fnhe_p = &hash->chain;
1387 	fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1388 	while (fnhe) {
1389 		if (fnhe->fnhe_daddr == daddr) {
1390 			rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1391 				fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1392 			/* set fnhe_daddr to 0 to ensure it won't bind with
1393 			 * new dsts in rt_bind_exception().
1394 			 */
1395 			fnhe->fnhe_daddr = 0;
1396 			fnhe_flush_routes(fnhe);
1397 			kfree_rcu(fnhe, rcu);
1398 			break;
1399 		}
1400 		fnhe_p = &fnhe->fnhe_next;
1401 		fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1402 						 lockdep_is_held(&fnhe_lock));
1403 	}
1404 
1405 	spin_unlock_bh(&fnhe_lock);
1406 }
1407 
1408 static struct fib_nh_exception *find_exception(struct fib_nh_common *nhc,
1409 					       __be32 daddr)
1410 {
1411 	struct fnhe_hash_bucket *hash = rcu_dereference(nhc->nhc_exceptions);
1412 	struct fib_nh_exception *fnhe;
1413 	u32 hval;
1414 
1415 	if (!hash)
1416 		return NULL;
1417 
1418 	hval = fnhe_hashfun(daddr);
1419 
1420 	for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1421 	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
1422 		if (fnhe->fnhe_daddr == daddr) {
1423 			if (fnhe->fnhe_expires &&
1424 			    time_after(jiffies, fnhe->fnhe_expires)) {
1425 				ip_del_fnhe(nhc, daddr);
1426 				break;
1427 			}
1428 			return fnhe;
1429 		}
1430 	}
1431 	return NULL;
1432 }
1433 
1434 /* MTU selection:
1435  * 1. mtu on route is locked - use it
1436  * 2. mtu from nexthop exception
1437  * 3. mtu from egress device
1438  */
1439 
1440 u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
1441 {
1442 	struct fib_nh_common *nhc = res->nhc;
1443 	struct net_device *dev = nhc->nhc_dev;
1444 	struct fib_info *fi = res->fi;
1445 	u32 mtu = 0;
1446 
1447 	if (READ_ONCE(dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu) ||
1448 	    fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
1449 		mtu = fi->fib_mtu;
1450 
1451 	if (likely(!mtu)) {
1452 		struct fib_nh_exception *fnhe;
1453 
1454 		fnhe = find_exception(nhc, daddr);
1455 		if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
1456 			mtu = fnhe->fnhe_pmtu;
1457 	}
1458 
1459 	if (likely(!mtu))
1460 		mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);
1461 
1462 	return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu);
1463 }
1464 
1465 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1466 			      __be32 daddr, const bool do_cache)
1467 {
1468 	bool ret = false;
1469 
1470 	spin_lock_bh(&fnhe_lock);
1471 
1472 	if (daddr == fnhe->fnhe_daddr) {
1473 		struct rtable __rcu **porig;
1474 		struct rtable *orig;
1475 		int genid = fnhe_genid(dev_net(rt->dst.dev));
1476 
1477 		if (rt_is_input_route(rt))
1478 			porig = &fnhe->fnhe_rth_input;
1479 		else
1480 			porig = &fnhe->fnhe_rth_output;
1481 		orig = rcu_dereference(*porig);
1482 
1483 		if (fnhe->fnhe_genid != genid) {
1484 			fnhe->fnhe_genid = genid;
1485 			fnhe->fnhe_gw = 0;
1486 			fnhe->fnhe_pmtu = 0;
1487 			fnhe->fnhe_expires = 0;
1488 			fnhe->fnhe_mtu_locked = false;
1489 			fnhe_flush_routes(fnhe);
1490 			orig = NULL;
1491 		}
1492 		fill_route_from_fnhe(rt, fnhe);
1493 		if (!rt->rt_gw4) {
1494 			rt->rt_gw4 = daddr;
1495 			rt->rt_gw_family = AF_INET;
1496 		}
1497 
1498 		if (do_cache) {
1499 			dst_hold(&rt->dst);
1500 			rcu_assign_pointer(*porig, rt);
1501 			if (orig) {
1502 				dst_dev_put(&orig->dst);
1503 				dst_release(&orig->dst);
1504 			}
1505 			ret = true;
1506 		}
1507 
1508 		fnhe->fnhe_stamp = jiffies;
1509 	}
1510 	spin_unlock_bh(&fnhe_lock);
1511 
1512 	return ret;
1513 }
1514 
1515 static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt)
1516 {
1517 	struct rtable *orig, *prev, **p;
1518 	bool ret = true;
1519 
1520 	if (rt_is_input_route(rt)) {
1521 		p = (struct rtable **)&nhc->nhc_rth_input;
1522 	} else {
1523 		p = (struct rtable **)raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
1524 	}
1525 	orig = *p;
1526 
1527 	/* hold dst before doing cmpxchg() to avoid race condition
1528 	 * on this dst
1529 	 */
1530 	dst_hold(&rt->dst);
1531 	prev = cmpxchg(p, orig, rt);
1532 	if (prev == orig) {
1533 		if (orig) {
1534 			rt_add_uncached_list(orig);
1535 			dst_release(&orig->dst);
1536 		}
1537 	} else {
1538 		dst_release(&rt->dst);
1539 		ret = false;
1540 	}
1541 
1542 	return ret;
1543 }
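/* rt_cache_route() publishes rt in either the nexthop's input slot or its
 * per-CPU output slot.  The cmpxchg() keeps this lockless: on success the
 * previous route is moved to the uncached list and released; if another
 * CPU won the race, the reference just taken on rt is dropped and false is
 * returned so the caller treats the route as uncached.
 */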
1544 
1545 struct uncached_list {
1546 	spinlock_t		lock;
1547 	struct list_head	head;
1548 };
1549 
1550 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1551 
1552 void rt_add_uncached_list(struct rtable *rt)
1553 {
1554 	struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1555 
1556 	rt->rt_uncached_list = ul;
1557 
1558 	spin_lock_bh(&ul->lock);
1559 	list_add_tail(&rt->rt_uncached, &ul->head);
1560 	spin_unlock_bh(&ul->lock);
1561 }
1562 
1563 void rt_del_uncached_list(struct rtable *rt)
1564 {
1565 	if (!list_empty(&rt->rt_uncached)) {
1566 		struct uncached_list *ul = rt->rt_uncached_list;
1567 
1568 		spin_lock_bh(&ul->lock);
1569 		list_del(&rt->rt_uncached);
1570 		spin_unlock_bh(&ul->lock);
1571 	}
1572 }
1573 
1574 static void ipv4_dst_destroy(struct dst_entry *dst)
1575 {
1576 	struct rtable *rt = (struct rtable *)dst;
1577 
1578 	ip_dst_metrics_put(dst);
1579 	rt_del_uncached_list(rt);
1580 }
1581 
1582 void rt_flush_dev(struct net_device *dev)
1583 {
1584 	struct rtable *rt;
1585 	int cpu;
1586 
1587 	for_each_possible_cpu(cpu) {
1588 		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1589 
1590 		spin_lock_bh(&ul->lock);
1591 		list_for_each_entry(rt, &ul->head, rt_uncached) {
1592 			if (rt->dst.dev != dev)
1593 				continue;
1594 			rt->dst.dev = blackhole_netdev;
1595 			dev_hold(rt->dst.dev);
1596 			dev_put(dev);
1597 		}
1598 		spin_unlock_bh(&ul->lock);
1599 	}
1600 }
1601 
1602 static bool rt_cache_valid(const struct rtable *rt)
1603 {
1604 	return	rt &&
1605 		rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1606 		!rt_is_expired(rt);
1607 }
1608 
1609 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1610 			   const struct fib_result *res,
1611 			   struct fib_nh_exception *fnhe,
1612 			   struct fib_info *fi, u16 type, u32 itag,
1613 			   const bool do_cache)
1614 {
1615 	bool cached = false;
1616 
1617 	if (fi) {
1618 		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1619 
1620 		if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) {
1621 			rt->rt_uses_gateway = 1;
1622 			rt->rt_gw_family = nhc->nhc_gw_family;
1623 			/* only INET and INET6 are supported */
1624 			if (likely(nhc->nhc_gw_family == AF_INET))
1625 				rt->rt_gw4 = nhc->nhc_gw.ipv4;
1626 			else
1627 				rt->rt_gw6 = nhc->nhc_gw.ipv6;
1628 		}
1629 
1630 		ip_dst_init_metrics(&rt->dst, fi->fib_metrics);
1631 
1632 #ifdef CONFIG_IP_ROUTE_CLASSID
1633 		if (nhc->nhc_family == AF_INET) {
1634 			struct fib_nh *nh;
1635 
1636 			nh = container_of(nhc, struct fib_nh, nh_common);
1637 			rt->dst.tclassid = nh->nh_tclassid;
1638 		}
1639 #endif
1640 		rt->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
1641 		if (unlikely(fnhe))
1642 			cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1643 		else if (do_cache)
1644 			cached = rt_cache_route(nhc, rt);
1645 		if (unlikely(!cached)) {
1646 			/* Routes we intend to cache in nexthop exception or
1647 			 * FIB nexthop have the DST_NOCACHE bit clear.
1648 			 * However, if we are unsuccessful at storing this
1649 			 * route into the cache we really need to set it.
1650 			 */
1651 			if (!rt->rt_gw4) {
1652 				rt->rt_gw_family = AF_INET;
1653 				rt->rt_gw4 = daddr;
1654 			}
1655 			rt_add_uncached_list(rt);
1656 		}
1657 	} else
1658 		rt_add_uncached_list(rt);
1659 
1660 #ifdef CONFIG_IP_ROUTE_CLASSID
1661 #ifdef CONFIG_IP_MULTIPLE_TABLES
1662 	set_class_tag(rt, res->tclassid);
1663 #endif
1664 	set_class_tag(rt, itag);
1665 #endif
1666 }
1667 
1668 struct rtable *rt_dst_alloc(struct net_device *dev,
1669 			    unsigned int flags, u16 type,
1670 			    bool nopolicy, bool noxfrm)
1671 {
1672 	struct rtable *rt;
1673 
1674 	rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1675 		       (nopolicy ? DST_NOPOLICY : 0) |
1676 		       (noxfrm ? DST_NOXFRM : 0));
1677 
1678 	if (rt) {
1679 		rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1680 		rt->rt_flags = flags;
1681 		rt->rt_type = type;
1682 		rt->rt_is_input = 0;
1683 		rt->rt_iif = 0;
1684 		rt->rt_pmtu = 0;
1685 		rt->rt_mtu_locked = 0;
1686 		rt->rt_uses_gateway = 0;
1687 		rt->rt_gw_family = 0;
1688 		rt->rt_gw4 = 0;
1689 		INIT_LIST_HEAD(&rt->rt_uncached);
1690 
1691 		rt->dst.output = ip_output;
1692 		if (flags & RTCF_LOCAL)
1693 			rt->dst.input = ip_local_deliver;
1694 	}
1695 
1696 	return rt;
1697 }
1698 EXPORT_SYMBOL(rt_dst_alloc);
1699 
1700 struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt)
1701 {
1702 	struct rtable *new_rt;
1703 
1704 	new_rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1705 			   rt->dst.flags);
1706 
1707 	if (new_rt) {
1708 		new_rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1709 		new_rt->rt_flags = rt->rt_flags;
1710 		new_rt->rt_type = rt->rt_type;
1711 		new_rt->rt_is_input = rt->rt_is_input;
1712 		new_rt->rt_iif = rt->rt_iif;
1713 		new_rt->rt_pmtu = rt->rt_pmtu;
1714 		new_rt->rt_mtu_locked = rt->rt_mtu_locked;
1715 		new_rt->rt_gw_family = rt->rt_gw_family;
1716 		if (rt->rt_gw_family == AF_INET)
1717 			new_rt->rt_gw4 = rt->rt_gw4;
1718 		else if (rt->rt_gw_family == AF_INET6)
1719 			new_rt->rt_gw6 = rt->rt_gw6;
1720 		INIT_LIST_HEAD(&new_rt->rt_uncached);
1721 
1722 		new_rt->dst.input = rt->dst.input;
1723 		new_rt->dst.output = rt->dst.output;
1724 		new_rt->dst.error = rt->dst.error;
1725 		new_rt->dst.lastuse = jiffies;
1726 		new_rt->dst.lwtstate = lwtstate_get(rt->dst.lwtstate);
1727 	}
1728 	return new_rt;
1729 }
1730 EXPORT_SYMBOL(rt_dst_clone);
1731 
1732 /* called in rcu_read_lock() section */
1733 int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1734 			  u8 tos, struct net_device *dev,
1735 			  struct in_device *in_dev, u32 *itag)
1736 {
1737 	int err;
1738 
1739 	/* Primary sanity checks. */
1740 	if (!in_dev)
1741 		return -EINVAL;
1742 
1743 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1744 	    skb->protocol != htons(ETH_P_IP))
1745 		return -EINVAL;
1746 
1747 	if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1748 		return -EINVAL;
1749 
1750 	if (ipv4_is_zeronet(saddr)) {
1751 		if (!ipv4_is_local_multicast(daddr) &&
1752 		    ip_hdr(skb)->protocol != IPPROTO_IGMP)
1753 			return -EINVAL;
1754 	} else {
1755 		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1756 					  in_dev, itag);
1757 		if (err < 0)
1758 			return err;
1759 	}
1760 	return 0;
1761 }
1762 
1763 /* called in rcu_read_lock() section */
1764 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1765 			     u8 tos, struct net_device *dev, int our)
1766 {
1767 	struct in_device *in_dev = __in_dev_get_rcu(dev);
1768 	unsigned int flags = RTCF_MULTICAST;
1769 	struct rtable *rth;
1770 	bool no_policy;
1771 	u32 itag = 0;
1772 	int err;
1773 
1774 	err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1775 	if (err)
1776 		return err;
1777 
1778 	if (our)
1779 		flags |= RTCF_LOCAL;
1780 
1781 	no_policy = IN_DEV_ORCONF(in_dev, NOPOLICY);
1782 	if (no_policy)
1783 		IPCB(skb)->flags |= IPSKB_NOPOLICY;
1784 
1785 	rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1786 			   no_policy, false);
1787 	if (!rth)
1788 		return -ENOBUFS;
1789 
1790 #ifdef CONFIG_IP_ROUTE_CLASSID
1791 	rth->dst.tclassid = itag;
1792 #endif
1793 	rth->dst.output = ip_rt_bug;
1794 	rth->rt_is_input = 1;
1795 
1796 #ifdef CONFIG_IP_MROUTE
1797 	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1798 		rth->dst.input = ip_mr_input;
1799 #endif
1800 	RT_CACHE_STAT_INC(in_slow_mc);
1801 
1802 	skb_dst_drop(skb);
1803 	skb_dst_set(skb, &rth->dst);
1804 	return 0;
1805 }
1806 
1807 
1808 static void ip_handle_martian_source(struct net_device *dev,
1809 				     struct in_device *in_dev,
1810 				     struct sk_buff *skb,
1811 				     __be32 daddr,
1812 				     __be32 saddr)
1813 {
1814 	RT_CACHE_STAT_INC(in_martian_src);
1815 #ifdef CONFIG_IP_ROUTE_VERBOSE
1816 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1817 		/*
1818 		 *	RFC1812 recommendation: if the source is martian,
1819 		 *	the only hint is the MAC header.
1820 		 */
1821 		pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1822 			&daddr, &saddr, dev->name);
1823 		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1824 			print_hex_dump(KERN_WARNING, "ll header: ",
1825 				       DUMP_PREFIX_OFFSET, 16, 1,
1826 				       skb_mac_header(skb),
1827 				       dev->hard_header_len, false);
1828 		}
1829 	}
1830 #endif
1831 }
1832 
1833 /* called in rcu_read_lock() section */
1834 static int __mkroute_input(struct sk_buff *skb,
1835 			   const struct fib_result *res,
1836 			   struct in_device *in_dev,
1837 			   __be32 daddr, __be32 saddr, u32 tos)
1838 {
1839 	struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1840 	struct net_device *dev = nhc->nhc_dev;
1841 	struct fib_nh_exception *fnhe;
1842 	struct rtable *rth;
1843 	int err;
1844 	struct in_device *out_dev;
1845 	bool do_cache, no_policy;
1846 	u32 itag = 0;
1847 
1848 	/* get a working reference to the output device */
1849 	out_dev = __in_dev_get_rcu(dev);
1850 	if (!out_dev) {
1851 		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1852 		return -EINVAL;
1853 	}
1854 
1855 	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1856 				  in_dev->dev, in_dev, &itag);
1857 	if (err < 0) {
1858 		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1859 					 saddr);
1860 
1861 		goto cleanup;
1862 	}
1863 
1864 	do_cache = res->fi && !itag;
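	/* If the packet would be forwarded back out the interface it arrived
	 * on and the sender could have reached the next hop directly, flag
	 * the skb so ip_forward() can emit an ICMP redirect.
	 */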
1865 	if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1866 	    skb->protocol == htons(ETH_P_IP)) {
1867 		__be32 gw;
1868 
1869 		gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0;
1870 		if (IN_DEV_SHARED_MEDIA(out_dev) ||
1871 		    inet_addr_onlink(out_dev, saddr, gw))
1872 			IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1873 	}
1874 
1875 	if (skb->protocol != htons(ETH_P_IP)) {
1876 		/* Not IP (e.g. ARP). Do not create a route if it is
1877 		 * invalid for proxy ARP. DNAT routes are always valid.
1878 		 *
1879 		 * The proxy ARP feature has been extended to allow ARP
1880 		 * replies back out the same interface, to support
1881 		 * private VLAN switch technologies. See arp.c.
1882 		 */
1883 		if (out_dev == in_dev &&
1884 		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1885 			err = -EINVAL;
1886 			goto cleanup;
1887 		}
1888 	}
1889 
1890 	no_policy = IN_DEV_ORCONF(in_dev, NOPOLICY);
1891 	if (no_policy)
1892 		IPCB(skb)->flags |= IPSKB_NOPOLICY;
1893 
1894 	fnhe = find_exception(nhc, daddr);
1895 	if (do_cache) {
1896 		if (fnhe)
1897 			rth = rcu_dereference(fnhe->fnhe_rth_input);
1898 		else
1899 			rth = rcu_dereference(nhc->nhc_rth_input);
1900 		if (rt_cache_valid(rth)) {
1901 			skb_dst_set_noref(skb, &rth->dst);
1902 			goto out;
1903 		}
1904 	}
1905 
1906 	rth = rt_dst_alloc(out_dev->dev, 0, res->type, no_policy,
1907 			   IN_DEV_ORCONF(out_dev, NOXFRM));
1908 	if (!rth) {
1909 		err = -ENOBUFS;
1910 		goto cleanup;
1911 	}
1912 
1913 	rth->rt_is_input = 1;
1914 	RT_CACHE_STAT_INC(in_slow_tot);
1915 
1916 	rth->dst.input = ip_forward;
1917 
1918 	rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1919 		       do_cache);
1920 	lwtunnel_set_redirect(&rth->dst);
1921 	skb_dst_set(skb, &rth->dst);
1922 out:
1923 	err = 0;
1924  cleanup:
1925 	return err;
1926 }
1927 
1928 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1929 /* To make ICMP packets follow the right flow, the multipath hash is
1930  * calculated from the inner IP addresses.
1931  */
1932 static void ip_multipath_l3_keys(const struct sk_buff *skb,
1933 				 struct flow_keys *hash_keys)
1934 {
1935 	const struct iphdr *outer_iph = ip_hdr(skb);
1936 	const struct iphdr *key_iph = outer_iph;
1937 	const struct iphdr *inner_iph;
1938 	const struct icmphdr *icmph;
1939 	struct iphdr _inner_iph;
1940 	struct icmphdr _icmph;
1941 
1942 	if (likely(outer_iph->protocol != IPPROTO_ICMP))
1943 		goto out;
1944 
1945 	if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1946 		goto out;
1947 
1948 	icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1949 				   &_icmph);
1950 	if (!icmph)
1951 		goto out;
1952 
1953 	if (!icmp_is_err(icmph->type))
1954 		goto out;
1955 
1956 	inner_iph = skb_header_pointer(skb,
1957 				       outer_iph->ihl * 4 + sizeof(_icmph),
1958 				       sizeof(_inner_iph), &_inner_iph);
1959 	if (!inner_iph)
1960 		goto out;
1961 
1962 	key_iph = inner_iph;
1963 out:
1964 	hash_keys->addrs.v4addrs.src = key_iph->saddr;
1965 	hash_keys->addrs.v4addrs.dst = key_iph->daddr;
1966 }
1967 
1968 /* if skb is set it will be used and fl4 can be NULL */
1969 int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
1970 		       const struct sk_buff *skb, struct flow_keys *flkeys)
1971 {
1972 	u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0;
1973 	struct flow_keys hash_keys;
1974 	u32 mhash;
1975 
1976 	switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
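	/* Hash policy selected by the fib_multipath_hash_policy sysctl:
	 * 0 - layer 3 (source/destination addresses only)
	 * 1 - layer 4 (5-tuple)
	 * 2 - layer 3, or the inner layer 3 for encapsulated packets
	 */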
1977 	case 0:
1978 		memset(&hash_keys, 0, sizeof(hash_keys));
1979 		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1980 		if (skb) {
1981 			ip_multipath_l3_keys(skb, &hash_keys);
1982 		} else {
1983 			hash_keys.addrs.v4addrs.src = fl4->saddr;
1984 			hash_keys.addrs.v4addrs.dst = fl4->daddr;
1985 		}
1986 		break;
1987 	case 1:
1988 		/* skb is currently provided only when forwarding */
1989 		if (skb) {
1990 			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1991 			struct flow_keys keys;
1992 
1993 			/* short-circuit if we already have L4 hash present */
1994 			if (skb->l4_hash)
1995 				return skb_get_hash_raw(skb) >> 1;
1996 
1997 			memset(&hash_keys, 0, sizeof(hash_keys));
1998 
1999 			if (!flkeys) {
2000 				skb_flow_dissect_flow_keys(skb, &keys, flag);
2001 				flkeys = &keys;
2002 			}
2003 
2004 			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2005 			hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
2006 			hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
2007 			hash_keys.ports.src = flkeys->ports.src;
2008 			hash_keys.ports.dst = flkeys->ports.dst;
2009 			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
2010 		} else {
2011 			memset(&hash_keys, 0, sizeof(hash_keys));
2012 			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2013 			hash_keys.addrs.v4addrs.src = fl4->saddr;
2014 			hash_keys.addrs.v4addrs.dst = fl4->daddr;
2015 			hash_keys.ports.src = fl4->fl4_sport;
2016 			hash_keys.ports.dst = fl4->fl4_dport;
2017 			hash_keys.basic.ip_proto = fl4->flowi4_proto;
2018 		}
2019 		break;
2020 	case 2:
2021 		memset(&hash_keys, 0, sizeof(hash_keys));
2022 		/* skb is currently provided only when forwarding */
2023 		if (skb) {
2024 			struct flow_keys keys;
2025 
2026 			skb_flow_dissect_flow_keys(skb, &keys, 0);
2027 			/* Inner can be v4 or v6 */
2028 			if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
2029 				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2030 				hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
2031 				hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
2032 			} else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
2033 				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2034 				hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
2035 				hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
2036 				hash_keys.tags.flow_label = keys.tags.flow_label;
2037 				hash_keys.basic.ip_proto = keys.basic.ip_proto;
2038 			} else {
2039 				/* Same as case 0 */
2040 				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2041 				ip_multipath_l3_keys(skb, &hash_keys);
2042 			}
2043 		} else {
2044 			/* Same as case 0 */
2045 			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2046 			hash_keys.addrs.v4addrs.src = fl4->saddr;
2047 			hash_keys.addrs.v4addrs.dst = fl4->daddr;
2048 		}
2049 		break;
2050 	}
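	/* Fold in any hash value seeded by the caller via flowi4_multipath_hash
	 * and halve the result so it stays within the positive signed range
	 * expected by the nexthop upper-bound comparison.
	 */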
2051 	mhash = flow_hash_from_keys(&hash_keys);
2052 
2053 	if (multipath_hash)
2054 		mhash = jhash_2words(mhash, multipath_hash, 0);
2055 
2056 	return mhash >> 1;
2057 }
2058 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
2059 
2060 static int ip_mkroute_input(struct sk_buff *skb,
2061 			    struct fib_result *res,
2062 			    struct in_device *in_dev,
2063 			    __be32 daddr, __be32 saddr, u32 tos,
2064 			    struct flow_keys *hkeys)
2065 {
2066 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2067 	if (res->fi && fib_info_num_path(res->fi) > 1) {
2068 		int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
2069 
2070 		fib_select_multipath(res, h);
2071 	}
2072 #endif
2073 
2074 	/* create a routing cache entry */
2075 	return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
2076 }
2077 
2078 /* Implements the same saddr-related checks as ip_route_input_slow(),
2079  * assuming daddr is valid and the destination is not a local broadcast one.
2080  * Uses the provided hint instead of performing a route lookup.
2081  */
2082 int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2083 		      u8 tos, struct net_device *dev,
2084 		      const struct sk_buff *hint)
2085 {
2086 	struct in_device *in_dev = __in_dev_get_rcu(dev);
2087 	struct rtable *rt = skb_rtable(hint);
2088 	struct net *net = dev_net(dev);
2089 	int err = -EINVAL;
2090 	u32 tag = 0;
2091 
2092 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2093 		goto martian_source;
2094 
2095 	if (ipv4_is_zeronet(saddr))
2096 		goto martian_source;
2097 
2098 	if (ipv4_is_loopback(saddr) && !IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2099 		goto martian_source;
2100 
2101 	if (rt->rt_type != RTN_LOCAL)
2102 		goto skip_validate_source;
2103 
2104 	tos &= IPTOS_RT_MASK;
2105 	err = fib_validate_source(skb, saddr, daddr, tos, 0, dev, in_dev, &tag);
2106 	if (err < 0)
2107 		goto martian_source;
2108 
2109 skip_validate_source:
2110 	skb_dst_copy(skb, hint);
2111 	return 0;
2112 
2113 martian_source:
2114 	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2115 	return err;
2116 }
2117 
2118 /* get device for dst_alloc with local routes */
2119 static struct net_device *ip_rt_get_dev(struct net *net,
2120 					const struct fib_result *res)
2121 {
2122 	struct fib_nh_common *nhc = res->fi ? res->nhc : NULL;
2123 	struct net_device *dev = NULL;
2124 
2125 	if (nhc)
2126 		dev = l3mdev_master_dev_rcu(nhc->nhc_dev);
2127 
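	/* Deliver local traffic via the nexthop's L3 master device when it is
	 * enslaved to one; otherwise fall back to the loopback device.
	 */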
2128 	return dev ? : net->loopback_dev;
2129 }
2130 
2131 /*
2132  *	NOTE. We drop all packets that have a local source
2133  *	address, because every properly looped-back packet
2134  *	must already have the correct destination attached by the output routine.
2135  *	Changes in the enforced policies must also be applied to
2136  *	ip_route_use_hint().
2137  *
2138  *	This approach solves two big problems:
2139  *	1. Non-simplex devices are handled properly.
2140  *	2. IP spoofing attempts are filtered with a 100% guarantee.
2141  *	Called with rcu_read_lock().
2142  */
2143 
2144 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2145 			       u8 tos, struct net_device *dev,
2146 			       struct fib_result *res)
2147 {
2148 	struct in_device *in_dev = __in_dev_get_rcu(dev);
2149 	struct flow_keys *flkeys = NULL, _flkeys;
2150 	struct net    *net = dev_net(dev);
2151 	struct ip_tunnel_info *tun_info;
2152 	int		err = -EINVAL;
2153 	unsigned int	flags = 0;
2154 	u32		itag = 0;
2155 	struct rtable	*rth;
2156 	struct flowi4	fl4;
2157 	bool do_cache = true;
2158 	bool no_policy;
2159 
2160 	/* IP on this device is disabled. */
2161 
2162 	if (!in_dev)
2163 		goto out;
2164 
2165 	/* Check for the weirdest martians, which may not be detected
2166 	   by fib_lookup.
2167 	 */
2168 
2169 	tun_info = skb_tunnel_info(skb);
2170 	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2171 		fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
2172 	else
2173 		fl4.flowi4_tun_key.tun_id = 0;
2174 	skb_dst_drop(skb);
2175 
2176 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2177 		goto martian_source;
2178 
2179 	res->fi = NULL;
2180 	res->table = NULL;
2181 	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2182 		goto brd_input;
2183 
2184 	/* Accept zero addresses only for limited broadcast;
2185 	 * I do not even know whether to fix it or not. Waiting for complaints :-)
2186 	 */
2187 	if (ipv4_is_zeronet(saddr))
2188 		goto martian_source;
2189 
2190 	if (ipv4_is_zeronet(daddr))
2191 		goto martian_destination;
2192 
2193 	/* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET()
2194 	 * and calls it at most once when daddr and/or saddr are loopback addresses
2195 	 */
2196 	if (ipv4_is_loopback(daddr)) {
2197 		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2198 			goto martian_destination;
2199 	} else if (ipv4_is_loopback(saddr)) {
2200 		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2201 			goto martian_source;
2202 	}
2203 
2204 	/*
2205 	 *	Now we are ready to route the packet.
2206 	 */
2207 	fl4.flowi4_oif = 0;
2208 	fl4.flowi4_iif = dev->ifindex;
2209 	fl4.flowi4_mark = skb->mark;
2210 	fl4.flowi4_tos = tos;
2211 	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2212 	fl4.flowi4_flags = 0;
2213 	fl4.daddr = daddr;
2214 	fl4.saddr = saddr;
2215 	fl4.flowi4_uid = sock_net_uid(net, NULL);
2216 	fl4.flowi4_multipath_hash = 0;
2217 
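	/* Dissect L4 keys early when FIB rules match on the protocol or ports;
	 * the extracted keys are also reused for the multipath hash below.
	 */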
2218 	if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
2219 		flkeys = &_flkeys;
2220 	} else {
2221 		fl4.flowi4_proto = 0;
2222 		fl4.fl4_sport = 0;
2223 		fl4.fl4_dport = 0;
2224 	}
2225 
2226 	err = fib_lookup(net, &fl4, res, 0);
2227 	if (err != 0) {
2228 		if (!IN_DEV_FORWARD(in_dev))
2229 			err = -EHOSTUNREACH;
2230 		goto no_route;
2231 	}
2232 
2233 	if (res->type == RTN_BROADCAST) {
2234 		if (IN_DEV_BFORWARD(in_dev))
2235 			goto make_route;
2236 		/* do not cache if bc_forwarding is enabled */
2237 		if (IPV4_DEVCONF_ALL(net, BC_FORWARDING))
2238 			do_cache = false;
2239 		goto brd_input;
2240 	}
2241 
2242 	if (res->type == RTN_LOCAL) {
2243 		err = fib_validate_source(skb, saddr, daddr, tos,
2244 					  0, dev, in_dev, &itag);
2245 		if (err < 0)
2246 			goto martian_source;
2247 		goto local_input;
2248 	}
2249 
2250 	if (!IN_DEV_FORWARD(in_dev)) {
2251 		err = -EHOSTUNREACH;
2252 		goto no_route;
2253 	}
2254 	if (res->type != RTN_UNICAST)
2255 		goto martian_destination;
2256 
2257 make_route:
2258 	err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
2259 out:	return err;
2260 
2261 brd_input:
2262 	if (skb->protocol != htons(ETH_P_IP))
2263 		goto e_inval;
2264 
2265 	if (!ipv4_is_zeronet(saddr)) {
2266 		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2267 					  in_dev, &itag);
2268 		if (err < 0)
2269 			goto martian_source;
2270 	}
2271 	flags |= RTCF_BROADCAST;
2272 	res->type = RTN_BROADCAST;
2273 	RT_CACHE_STAT_INC(in_brd);
2274 
2275 local_input:
2276 	no_policy = IN_DEV_ORCONF(in_dev, NOPOLICY);
2277 	if (no_policy)
2278 		IPCB(skb)->flags |= IPSKB_NOPOLICY;
2279 
2280 	do_cache &= res->fi && !itag;
2281 	if (do_cache) {
2282 		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2283 
2284 		rth = rcu_dereference(nhc->nhc_rth_input);
2285 		if (rt_cache_valid(rth)) {
2286 			skb_dst_set_noref(skb, &rth->dst);
2287 			err = 0;
2288 			goto out;
2289 		}
2290 	}
2291 
2292 	rth = rt_dst_alloc(ip_rt_get_dev(net, res),
2293 			   flags | RTCF_LOCAL, res->type,
2294 			   no_policy, false);
2295 	if (!rth)
2296 		goto e_nobufs;
2297 
2298 	rth->dst.output = ip_rt_bug;
2299 #ifdef CONFIG_IP_ROUTE_CLASSID
2300 	rth->dst.tclassid = itag;
2301 #endif
2302 	rth->rt_is_input = 1;
2303 
2304 	RT_CACHE_STAT_INC(in_slow_tot);
2305 	if (res->type == RTN_UNREACHABLE) {
2306 		rth->dst.input = ip_error;
2307 		rth->dst.error = -err;
2308 		rth->rt_flags &= ~RTCF_LOCAL;
2309 	}
2310 
2311 	if (do_cache) {
2312 		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2313 
2314 		rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
2315 		if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2316 			WARN_ON(rth->dst.input == lwtunnel_input);
2317 			rth->dst.lwtstate->orig_input = rth->dst.input;
2318 			rth->dst.input = lwtunnel_input;
2319 		}
2320 
2321 		if (unlikely(!rt_cache_route(nhc, rth)))
2322 			rt_add_uncached_list(rth);
2323 	}
2324 	skb_dst_set(skb, &rth->dst);
2325 	err = 0;
2326 	goto out;
2327 
2328 no_route:
2329 	RT_CACHE_STAT_INC(in_no_route);
2330 	res->type = RTN_UNREACHABLE;
2331 	res->fi = NULL;
2332 	res->table = NULL;
2333 	goto local_input;
2334 
2335 	/*
2336 	 *	Do not cache martian addresses: they should be logged (RFC1812)
2337 	 */
2338 martian_destination:
2339 	RT_CACHE_STAT_INC(in_martian_dst);
2340 #ifdef CONFIG_IP_ROUTE_VERBOSE
2341 	if (IN_DEV_LOG_MARTIANS(in_dev))
2342 		net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2343 				     &daddr, &saddr, dev->name);
2344 #endif
2345 
2346 e_inval:
2347 	err = -EINVAL;
2348 	goto out;
2349 
2350 e_nobufs:
2351 	err = -ENOBUFS;
2352 	goto out;
2353 
2354 martian_source:
2355 	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2356 	goto out;
2357 }
2358 
2359 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2360 			 u8 tos, struct net_device *dev)
2361 {
2362 	struct fib_result res;
2363 	int err;
2364 
2365 	tos &= IPTOS_RT_MASK;
2366 	rcu_read_lock();
2367 	err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2368 	rcu_read_unlock();
2369 
2370 	return err;
2371 }
2372 EXPORT_SYMBOL(ip_route_input_noref);
2373 
2374 /* called with rcu_read_lock held */
2375 int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2376 		       u8 tos, struct net_device *dev, struct fib_result *res)
2377 {
2378 	/* Multicast recognition logic was moved from the route cache to here.
2379 	   The problem was that too many Ethernet cards have broken/missing
2380 	   hardware multicast filters :-( As a result, a host on a multicast
2381 	   network acquires a lot of useless route cache entries, e.g. from
2382 	   SDR messages from all over the world. Now we try to get rid of them.
2383 	   Really, provided the software IP multicast filter is organized
2384 	   reasonably (at least, hashed), it does not result in a slowdown
2385 	   compared with route cache reject entries.
2386 	   Note that multicast routers are not affected, because a
2387 	   route cache entry is created eventually.
2388 	 */
2389 	if (ipv4_is_multicast(daddr)) {
2390 		struct in_device *in_dev = __in_dev_get_rcu(dev);
2391 		int our = 0;
2392 		int err = -EINVAL;
2393 
2394 		if (!in_dev)
2395 			return err;
2396 		our = ip_check_mc_rcu(in_dev, daddr, saddr,
2397 				      ip_hdr(skb)->protocol);
2398 
2399 		/* check l3 master if no match yet */
2400 		if (!our && netif_is_l3_slave(dev)) {
2401 			struct in_device *l3_in_dev;
2402 
2403 			l3_in_dev = __in_dev_get_rcu(skb->dev);
2404 			if (l3_in_dev)
2405 				our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2406 						      ip_hdr(skb)->protocol);
2407 		}
2408 
2409 		if (our
2410 #ifdef CONFIG_IP_MROUTE
2411 			||
2412 		    (!ipv4_is_local_multicast(daddr) &&
2413 		     IN_DEV_MFORWARD(in_dev))
2414 #endif
2415 		   ) {
2416 			err = ip_route_input_mc(skb, daddr, saddr,
2417 						tos, dev, our);
2418 		}
2419 		return err;
2420 	}
2421 
2422 	return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2423 }
2424 
2425 /* called with rcu_read_lock() */
2426 static struct rtable *__mkroute_output(const struct fib_result *res,
2427 				       const struct flowi4 *fl4, int orig_oif,
2428 				       struct net_device *dev_out,
2429 				       unsigned int flags)
2430 {
2431 	struct fib_info *fi = res->fi;
2432 	struct fib_nh_exception *fnhe;
2433 	struct in_device *in_dev;
2434 	u16 type = res->type;
2435 	struct rtable *rth;
2436 	bool do_cache;
2437 
2438 	in_dev = __in_dev_get_rcu(dev_out);
2439 	if (!in_dev)
2440 		return ERR_PTR(-EINVAL);
2441 
2442 	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2443 		if (ipv4_is_loopback(fl4->saddr) &&
2444 		    !(dev_out->flags & IFF_LOOPBACK) &&
2445 		    !netif_is_l3_master(dev_out))
2446 			return ERR_PTR(-EINVAL);
2447 
2448 	if (ipv4_is_lbcast(fl4->daddr))
2449 		type = RTN_BROADCAST;
2450 	else if (ipv4_is_multicast(fl4->daddr))
2451 		type = RTN_MULTICAST;
2452 	else if (ipv4_is_zeronet(fl4->daddr))
2453 		return ERR_PTR(-EINVAL);
2454 
2455 	if (dev_out->flags & IFF_LOOPBACK)
2456 		flags |= RTCF_LOCAL;
2457 
2458 	do_cache = true;
2459 	if (type == RTN_BROADCAST) {
2460 		flags |= RTCF_BROADCAST | RTCF_LOCAL;
2461 		fi = NULL;
2462 	} else if (type == RTN_MULTICAST) {
2463 		flags |= RTCF_MULTICAST | RTCF_LOCAL;
2464 		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2465 				     fl4->flowi4_proto))
2466 			flags &= ~RTCF_LOCAL;
2467 		else
2468 			do_cache = false;
2469 		/* If a multicast route does not exist, use the
2470 		 * default one, but do not use a gateway in this case.
2471 		 * Yes, it is a hack.
2472 		 */
2473 		if (fi && res->prefixlen < 4)
2474 			fi = NULL;
2475 	} else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2476 		   (orig_oif != dev_out->ifindex)) {
2477 		/* For local routes that require a particular output interface
2478 		 * we do not want to cache the result.  Caching the result
2479 		 * causes incorrect behaviour when there are multiple source
2480 		 * addresses on the interface, the end result being that if the
2481 		 * intended recipient is waiting on that interface for the
2482 		 * packet he won't receive it because it will be delivered on
2483 		 * the loopback interface and the IP_PKTINFO ipi_ifindex will
2484 		 * be set to the loopback interface as well.
2485 		 */
2486 		do_cache = false;
2487 	}
2488 
2489 	fnhe = NULL;
2490 	do_cache &= fi != NULL;
2491 	if (fi) {
2492 		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2493 		struct rtable __rcu **prth;
2494 
2495 		fnhe = find_exception(nhc, fl4->daddr);
2496 		if (!do_cache)
2497 			goto add;
2498 		if (fnhe) {
2499 			prth = &fnhe->fnhe_rth_output;
2500 		} else {
2501 			if (unlikely(fl4->flowi4_flags &
2502 				     FLOWI_FLAG_KNOWN_NH &&
2503 				     !(nhc->nhc_gw_family &&
2504 				       nhc->nhc_scope == RT_SCOPE_LINK))) {
2505 				do_cache = false;
2506 				goto add;
2507 			}
2508 			prth = raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
2509 		}
2510 		rth = rcu_dereference(*prth);
2511 		if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2512 			return rth;
2513 	}
2514 
2515 add:
2516 	rth = rt_dst_alloc(dev_out, flags, type,
2517 			   IN_DEV_ORCONF(in_dev, NOPOLICY),
2518 			   IN_DEV_ORCONF(in_dev, NOXFRM));
2519 	if (!rth)
2520 		return ERR_PTR(-ENOBUFS);
2521 
2522 	rth->rt_iif = orig_oif;
2523 
2524 	RT_CACHE_STAT_INC(out_slow_tot);
2525 
2526 	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2527 		if (flags & RTCF_LOCAL &&
2528 		    !(dev_out->flags & IFF_LOOPBACK)) {
2529 			rth->dst.output = ip_mc_output;
2530 			RT_CACHE_STAT_INC(out_slow_mc);
2531 		}
2532 #ifdef CONFIG_IP_MROUTE
2533 		if (type == RTN_MULTICAST) {
2534 			if (IN_DEV_MFORWARD(in_dev) &&
2535 			    !ipv4_is_local_multicast(fl4->daddr)) {
2536 				rth->dst.input = ip_mr_input;
2537 				rth->dst.output = ip_mc_output;
2538 			}
2539 		}
2540 #endif
2541 	}
2542 
2543 	rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2544 	lwtunnel_set_redirect(&rth->dst);
2545 
2546 	return rth;
2547 }
2548 
2549 /*
2550  * Major route resolver routine.
2551  */
2552 
2553 struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2554 					const struct sk_buff *skb)
2555 {
2556 	struct fib_result res = {
2557 		.type		= RTN_UNSPEC,
2558 		.fi		= NULL,
2559 		.table		= NULL,
2560 		.tclassid	= 0,
2561 	};
2562 	struct rtable *rth;
2563 
2564 	fl4->flowi4_iif = LOOPBACK_IFINDEX;
2565 	ip_rt_fix_tos(fl4);
2566 
2567 	rcu_read_lock();
2568 	rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2569 	rcu_read_unlock();
2570 
2571 	return rth;
2572 }
2573 EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
2574 
2575 struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2576 					    struct fib_result *res,
2577 					    const struct sk_buff *skb)
2578 {
2579 	struct net_device *dev_out = NULL;
2580 	int orig_oif = fl4->flowi4_oif;
2581 	unsigned int flags = 0;
2582 	struct rtable *rth;
2583 	int err;
2584 
2585 	if (fl4->saddr) {
2586 		if (ipv4_is_multicast(fl4->saddr) ||
2587 		    ipv4_is_lbcast(fl4->saddr) ||
2588 		    ipv4_is_zeronet(fl4->saddr)) {
2589 			rth = ERR_PTR(-EINVAL);
2590 			goto out;
2591 		}
2592 
2593 		rth = ERR_PTR(-ENETUNREACH);
2594 
2595 		/* I removed check for oif == dev_out->oif here.
2596 		   It was wrong for two reasons:
2597 		   1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2598 		      is assigned to multiple interfaces.
2599 		   2. Moreover, we are allowed to send packets with saddr
2600 		      of another iface. --ANK
2601 		 */
2602 
2603 		if (fl4->flowi4_oif == 0 &&
2604 		    (ipv4_is_multicast(fl4->daddr) ||
2605 		     ipv4_is_lbcast(fl4->daddr))) {
2606 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2607 			dev_out = __ip_dev_find(net, fl4->saddr, false);
2608 			if (!dev_out)
2609 				goto out;
2610 
2611 			/* Special hack: the user can direct multicasts
2612 			   and limited broadcasts via the necessary interface
2613 			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2614 			   This hack is not just for fun, it allows
2615 			   vic, vat and friends to work.
2616 			   They bind a socket to loopback, set the ttl to zero
2617 			   and expect that it will work.
2618 			   From the viewpoint of the routing cache they are broken,
2619 			   because we are not allowed to build a multicast path
2620 			   with a loopback source address (look, the routing cache
2621 			   cannot know that the ttl is zero, so the packet
2622 			   will not leave this host and the route is valid).
2623 			   Luckily, this hack is a good workaround.
2624 			 */
2625 
2626 			fl4->flowi4_oif = dev_out->ifindex;
2627 			goto make_route;
2628 		}
2629 
2630 		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2631 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2632 			if (!__ip_dev_find(net, fl4->saddr, false))
2633 				goto out;
2634 		}
2635 	}
2636 
2637 
2638 	if (fl4->flowi4_oif) {
2639 		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2640 		rth = ERR_PTR(-ENODEV);
2641 		if (!dev_out)
2642 			goto out;
2643 
2644 		/* RACE: Check return value of inet_select_addr instead. */
2645 		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2646 			rth = ERR_PTR(-ENETUNREACH);
2647 			goto out;
2648 		}
2649 		if (ipv4_is_local_multicast(fl4->daddr) ||
2650 		    ipv4_is_lbcast(fl4->daddr) ||
2651 		    fl4->flowi4_proto == IPPROTO_IGMP) {
2652 			if (!fl4->saddr)
2653 				fl4->saddr = inet_select_addr(dev_out, 0,
2654 							      RT_SCOPE_LINK);
2655 			goto make_route;
2656 		}
2657 		if (!fl4->saddr) {
2658 			if (ipv4_is_multicast(fl4->daddr))
2659 				fl4->saddr = inet_select_addr(dev_out, 0,
2660 							      fl4->flowi4_scope);
2661 			else if (!fl4->daddr)
2662 				fl4->saddr = inet_select_addr(dev_out, 0,
2663 							      RT_SCOPE_HOST);
2664 		}
2665 	}
2666 
2667 	if (!fl4->daddr) {
2668 		fl4->daddr = fl4->saddr;
2669 		if (!fl4->daddr)
2670 			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2671 		dev_out = net->loopback_dev;
2672 		fl4->flowi4_oif = LOOPBACK_IFINDEX;
2673 		res->type = RTN_LOCAL;
2674 		flags |= RTCF_LOCAL;
2675 		goto make_route;
2676 	}
2677 
2678 	err = fib_lookup(net, fl4, res, 0);
2679 	if (err) {
2680 		res->fi = NULL;
2681 		res->table = NULL;
2682 		if (fl4->flowi4_oif &&
2683 		    (ipv4_is_multicast(fl4->daddr) ||
2684 		    !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2685 			/* Apparently, routing tables are wrong. Assume,
2686 			   that the destination is on link.
2687 
2688 			   WHY? DW.
2689 			   Because we are allowed to send to iface
2690 			   even if it has NO routes and NO assigned
2691 			   addresses. When oif is specified, routing
2692 			   tables are looked up with only one purpose:
2693 			   to catch if destination is gatewayed, rather than
2694 			   direct. Moreover, if MSG_DONTROUTE is set,
2695 			   we send packet, ignoring both routing tables
2696 			   and ifaddr state. --ANK
2697 
2698 
2699 			   We could make it even if oif is unknown,
2700 			   likely IPv6, but we do not.
2701 			 */
2702 
2703 			if (fl4->saddr == 0)
2704 				fl4->saddr = inet_select_addr(dev_out, 0,
2705 							      RT_SCOPE_LINK);
2706 			res->type = RTN_UNICAST;
2707 			goto make_route;
2708 		}
2709 		rth = ERR_PTR(err);
2710 		goto out;
2711 	}
2712 
2713 	if (res->type == RTN_LOCAL) {
2714 		if (!fl4->saddr) {
2715 			if (res->fi->fib_prefsrc)
2716 				fl4->saddr = res->fi->fib_prefsrc;
2717 			else
2718 				fl4->saddr = fl4->daddr;
2719 		}
2720 
2721 		/* L3 master device is the loopback for that domain */
2722 		dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2723 			net->loopback_dev;
2724 
2725 		/* make sure orig_oif points to fib result device even
2726 		 * though packet rx/tx happens over loopback or l3mdev
2727 		 */
2728 		orig_oif = FIB_RES_OIF(*res);
2729 
2730 		fl4->flowi4_oif = dev_out->ifindex;
2731 		flags |= RTCF_LOCAL;
2732 		goto make_route;
2733 	}
2734 
2735 	fib_select_path(net, res, fl4, skb);
2736 
2737 	dev_out = FIB_RES_DEV(*res);
2738 
2739 make_route:
2740 	rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2741 
2742 out:
2743 	return rth;
2744 }
2745 
2746 static struct dst_ops ipv4_dst_blackhole_ops = {
2747 	.family			= AF_INET,
2748 	.default_advmss		= ipv4_default_advmss,
2749 	.neigh_lookup		= ipv4_neigh_lookup,
2750 	.check			= dst_blackhole_check,
2751 	.cow_metrics		= dst_blackhole_cow_metrics,
2752 	.update_pmtu		= dst_blackhole_update_pmtu,
2753 	.redirect		= dst_blackhole_redirect,
2754 	.mtu			= dst_blackhole_mtu,
2755 };
2756 
2757 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2758 {
2759 	struct rtable *ort = (struct rtable *) dst_orig;
2760 	struct rtable *rt;
2761 
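	/* Build a dst that silently discards traffic while preserving the
	 * identity of the original route; used e.g. by the xfrm code when a
	 * usable route cannot be provided yet.
	 */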
2762 	rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2763 	if (rt) {
2764 		struct dst_entry *new = &rt->dst;
2765 
2766 		new->__use = 1;
2767 		new->input = dst_discard;
2768 		new->output = dst_discard_out;
2769 
2770 		new->dev = net->loopback_dev;
2771 		if (new->dev)
2772 			dev_hold(new->dev);
2773 
2774 		rt->rt_is_input = ort->rt_is_input;
2775 		rt->rt_iif = ort->rt_iif;
2776 		rt->rt_pmtu = ort->rt_pmtu;
2777 		rt->rt_mtu_locked = ort->rt_mtu_locked;
2778 
2779 		rt->rt_genid = rt_genid_ipv4(net);
2780 		rt->rt_flags = ort->rt_flags;
2781 		rt->rt_type = ort->rt_type;
2782 		rt->rt_uses_gateway = ort->rt_uses_gateway;
2783 		rt->rt_gw_family = ort->rt_gw_family;
2784 		if (rt->rt_gw_family == AF_INET)
2785 			rt->rt_gw4 = ort->rt_gw4;
2786 		else if (rt->rt_gw_family == AF_INET6)
2787 			rt->rt_gw6 = ort->rt_gw6;
2788 
2789 		INIT_LIST_HEAD(&rt->rt_uncached);
2790 	}
2791 
2792 	dst_release(dst_orig);
2793 
2794 	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2795 }
2796 
2797 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2798 				    const struct sock *sk)
2799 {
2800 	struct rtable *rt = __ip_route_output_key(net, flp4);
2801 
2802 	if (IS_ERR(rt))
2803 		return rt;
2804 
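	/* When a transport protocol is given, run the result through the
	 * xfrm lookup so IPsec policy can transform or blackhole the route.
	 */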
2805 	if (flp4->flowi4_proto) {
2806 		flp4->flowi4_oif = rt->dst.dev->ifindex;
2807 		rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2808 							flowi4_to_flowi(flp4),
2809 							sk, 0);
2810 	}
2811 
2812 	return rt;
2813 }
2814 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2815 
2816 struct rtable *ip_route_output_tunnel(struct sk_buff *skb,
2817 				      struct net_device *dev,
2818 				      struct net *net, __be32 *saddr,
2819 				      const struct ip_tunnel_info *info,
2820 				      u8 protocol, bool use_cache)
2821 {
2822 #ifdef CONFIG_DST_CACHE
2823 	struct dst_cache *dst_cache;
2824 #endif
2825 	struct rtable *rt = NULL;
2826 	struct flowi4 fl4;
2827 	__u8 tos;
2828 
2829 #ifdef CONFIG_DST_CACHE
2830 	dst_cache = (struct dst_cache *)&info->dst_cache;
2831 	if (use_cache) {
2832 		rt = dst_cache_get_ip4(dst_cache, saddr);
2833 		if (rt)
2834 			return rt;
2835 	}
2836 #endif
2837 	memset(&fl4, 0, sizeof(fl4));
2838 	fl4.flowi4_mark = skb->mark;
2839 	fl4.flowi4_proto = protocol;
2840 	fl4.daddr = info->key.u.ipv4.dst;
2841 	fl4.saddr = info->key.u.ipv4.src;
2842 	tos = info->key.tos;
2843 	fl4.flowi4_tos = RT_TOS(tos);
2844 
2845 	rt = ip_route_output_key(net, &fl4);
2846 	if (IS_ERR(rt)) {
2847 		netdev_dbg(dev, "no route to %pI4\n", &fl4.daddr);
2848 		return ERR_PTR(-ENETUNREACH);
2849 	}
2850 	if (rt->dst.dev == dev) { /* is this necessary? */
2851 		netdev_dbg(dev, "circular route to %pI4\n", &fl4.daddr);
2852 		ip_rt_put(rt);
2853 		return ERR_PTR(-ELOOP);
2854 	}
2855 #ifdef CONFIG_DST_CACHE
2856 	if (use_cache)
2857 		dst_cache_set_ip4(dst_cache, &rt->dst, fl4.saddr);
2858 #endif
2859 	*saddr = fl4.saddr;
2860 	return rt;
2861 }
2862 EXPORT_SYMBOL_GPL(ip_route_output_tunnel);
2863 
2864 /* called with rcu_read_lock held */
2865 static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2866 			struct rtable *rt, u32 table_id, struct flowi4 *fl4,
2867 			struct sk_buff *skb, u32 portid, u32 seq,
2868 			unsigned int flags)
2869 {
2870 	struct rtmsg *r;
2871 	struct nlmsghdr *nlh;
2872 	unsigned long expires = 0;
2873 	u32 error;
2874 	u32 metrics[RTAX_MAX];
2875 
2876 	nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), flags);
2877 	if (!nlh)
2878 		return -EMSGSIZE;
2879 
2880 	r = nlmsg_data(nlh);
2881 	r->rtm_family	 = AF_INET;
2882 	r->rtm_dst_len	= 32;
2883 	r->rtm_src_len	= 0;
2884 	r->rtm_tos	= fl4 ? fl4->flowi4_tos : 0;
2885 	r->rtm_table	= table_id < 256 ? table_id : RT_TABLE_COMPAT;
2886 	if (nla_put_u32(skb, RTA_TABLE, table_id))
2887 		goto nla_put_failure;
2888 	r->rtm_type	= rt->rt_type;
2889 	r->rtm_scope	= RT_SCOPE_UNIVERSE;
2890 	r->rtm_protocol = RTPROT_UNSPEC;
2891 	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2892 	if (rt->rt_flags & RTCF_NOTIFY)
2893 		r->rtm_flags |= RTM_F_NOTIFY;
2894 	if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2895 		r->rtm_flags |= RTCF_DOREDIRECT;
2896 
2897 	if (nla_put_in_addr(skb, RTA_DST, dst))
2898 		goto nla_put_failure;
2899 	if (src) {
2900 		r->rtm_src_len = 32;
2901 		if (nla_put_in_addr(skb, RTA_SRC, src))
2902 			goto nla_put_failure;
2903 	}
2904 	if (rt->dst.dev &&
2905 	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2906 		goto nla_put_failure;
2907 #ifdef CONFIG_IP_ROUTE_CLASSID
2908 	if (rt->dst.tclassid &&
2909 	    nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2910 		goto nla_put_failure;
2911 #endif
2912 	if (fl4 && !rt_is_input_route(rt) &&
2913 	    fl4->saddr != src) {
2914 		if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2915 			goto nla_put_failure;
2916 	}
2917 	if (rt->rt_uses_gateway) {
2918 		if (rt->rt_gw_family == AF_INET &&
2919 		    nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) {
2920 			goto nla_put_failure;
2921 		} else if (rt->rt_gw_family == AF_INET6) {
2922 			int alen = sizeof(struct in6_addr);
2923 			struct nlattr *nla;
2924 			struct rtvia *via;
2925 
2926 			nla = nla_reserve(skb, RTA_VIA, alen + 2);
2927 			if (!nla)
2928 				goto nla_put_failure;
2929 
2930 			via = nla_data(nla);
2931 			via->rtvia_family = AF_INET6;
2932 			memcpy(via->rtvia_addr, &rt->rt_gw6, alen);
2933 		}
2934 	}
2935 
2936 	expires = rt->dst.expires;
2937 	if (expires) {
2938 		unsigned long now = jiffies;
2939 
2940 		if (time_before(now, expires))
2941 			expires -= now;
2942 		else
2943 			expires = 0;
2944 	}
2945 
2946 	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
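	/* Report a learned path MTU (and its lock bit) only while it has not
	 * yet expired.
	 */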
2947 	if (rt->rt_pmtu && expires)
2948 		metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2949 	if (rt->rt_mtu_locked && expires)
2950 		metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
2951 	if (rtnetlink_put_metrics(skb, metrics) < 0)
2952 		goto nla_put_failure;
2953 
2954 	if (fl4) {
2955 		if (fl4->flowi4_mark &&
2956 		    nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2957 			goto nla_put_failure;
2958 
2959 		if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2960 		    nla_put_u32(skb, RTA_UID,
2961 				from_kuid_munged(current_user_ns(),
2962 						 fl4->flowi4_uid)))
2963 			goto nla_put_failure;
2964 
2965 		if (rt_is_input_route(rt)) {
2966 #ifdef CONFIG_IP_MROUTE
2967 			if (ipv4_is_multicast(dst) &&
2968 			    !ipv4_is_local_multicast(dst) &&
2969 			    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2970 				int err = ipmr_get_route(net, skb,
2971 							 fl4->saddr, fl4->daddr,
2972 							 r, portid);
2973 
2974 				if (err <= 0) {
2975 					if (err == 0)
2976 						return 0;
2977 					goto nla_put_failure;
2978 				}
2979 			} else
2980 #endif
2981 				if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
2982 					goto nla_put_failure;
2983 		}
2984 	}
2985 
2986 	error = rt->dst.error;
2987 
2988 	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2989 		goto nla_put_failure;
2990 
2991 	nlmsg_end(skb, nlh);
2992 	return 0;
2993 
2994 nla_put_failure:
2995 	nlmsg_cancel(skb, nlh);
2996 	return -EMSGSIZE;
2997 }
2998 
2999 static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb,
3000 			    struct netlink_callback *cb, u32 table_id,
3001 			    struct fnhe_hash_bucket *bucket, int genid,
3002 			    int *fa_index, int fa_start, unsigned int flags)
3003 {
3004 	int i;
3005 
3006 	for (i = 0; i < FNHE_HASH_SIZE; i++) {
3007 		struct fib_nh_exception *fnhe;
3008 
3009 		for (fnhe = rcu_dereference(bucket[i].chain); fnhe;
3010 		     fnhe = rcu_dereference(fnhe->fnhe_next)) {
3011 			struct rtable *rt;
3012 			int err;
3013 
3014 			if (*fa_index < fa_start)
3015 				goto next;
3016 
3017 			if (fnhe->fnhe_genid != genid)
3018 				goto next;
3019 
3020 			if (fnhe->fnhe_expires &&
3021 			    time_after(jiffies, fnhe->fnhe_expires))
3022 				goto next;
3023 
3024 			rt = rcu_dereference(fnhe->fnhe_rth_input);
3025 			if (!rt)
3026 				rt = rcu_dereference(fnhe->fnhe_rth_output);
3027 			if (!rt)
3028 				goto next;
3029 
3030 			err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt,
3031 					   table_id, NULL, skb,
3032 					   NETLINK_CB(cb->skb).portid,
3033 					   cb->nlh->nlmsg_seq, flags);
3034 			if (err)
3035 				return err;
3036 next:
3037 			(*fa_index)++;
3038 		}
3039 	}
3040 
3041 	return 0;
3042 }
3043 
3044 int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
3045 		       u32 table_id, struct fib_info *fi,
3046 		       int *fa_index, int fa_start, unsigned int flags)
3047 {
3048 	struct net *net = sock_net(cb->skb->sk);
3049 	int nhsel, genid = fnhe_genid(net);
3050 
3051 	for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) {
3052 		struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel);
3053 		struct fnhe_hash_bucket *bucket;
3054 		int err;
3055 
3056 		if (nhc->nhc_flags & RTNH_F_DEAD)
3057 			continue;
3058 
3059 		rcu_read_lock();
3060 		bucket = rcu_dereference(nhc->nhc_exceptions);
3061 		err = 0;
3062 		if (bucket)
3063 			err = fnhe_dump_bucket(net, skb, cb, table_id, bucket,
3064 					       genid, fa_index, fa_start,
3065 					       flags);
3066 		rcu_read_unlock();
3067 		if (err)
3068 			return err;
3069 	}
3070 
3071 	return 0;
3072 }
3073 
3074 static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
3075 						   u8 ip_proto, __be16 sport,
3076 						   __be16 dport)
3077 {
3078 	struct sk_buff *skb;
3079 	struct iphdr *iph;
3080 
3081 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3082 	if (!skb)
3083 		return NULL;
3084 
3085 	/* Reserve room for dummy headers; this skb can pass
3086 	 * through a good chunk of the routing engine.
3087 	 */
3088 	skb_reset_mac_header(skb);
3089 	skb_reset_network_header(skb);
3090 	skb->protocol = htons(ETH_P_IP);
3091 	iph = skb_put(skb, sizeof(struct iphdr));
3092 	iph->protocol = ip_proto;
3093 	iph->saddr = src;
3094 	iph->daddr = dst;
3095 	iph->version = 0x4;
3096 	iph->frag_off = 0;
3097 	iph->ihl = 0x5;
3098 	skb_set_transport_header(skb, skb->len);
3099 
3100 	switch (iph->protocol) {
3101 	case IPPROTO_UDP: {
3102 		struct udphdr *udph;
3103 
3104 		udph = skb_put_zero(skb, sizeof(struct udphdr));
3105 		udph->source = sport;
3106 		udph->dest = dport;
3107 		udph->len = htons(sizeof(struct udphdr));
3108 		udph->check = 0;
3109 		break;
3110 	}
3111 	case IPPROTO_TCP: {
3112 		struct tcphdr *tcph;
3113 
3114 		tcph = skb_put_zero(skb, sizeof(struct tcphdr));
3115 		tcph->source	= sport;
3116 		tcph->dest	= dport;
3117 		tcph->doff	= sizeof(struct tcphdr) / 4;
3118 		tcph->rst = 1;
3119 		tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
3120 					    src, dst, 0);
3121 		break;
3122 	}
3123 	case IPPROTO_ICMP: {
3124 		struct icmphdr *icmph;
3125 
3126 		icmph = skb_put_zero(skb, sizeof(struct icmphdr));
3127 		icmph->type = ICMP_ECHO;
3128 		icmph->code = 0;
3129 	}
3130 	}
3131 
3132 	return skb;
3133 }
3134 
3135 static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
3136 				       const struct nlmsghdr *nlh,
3137 				       struct nlattr **tb,
3138 				       struct netlink_ext_ack *extack)
3139 {
3140 	struct rtmsg *rtm;
3141 	int i, err;
3142 
3143 	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
3144 		NL_SET_ERR_MSG(extack,
3145 			       "ipv4: Invalid header for route get request");
3146 		return -EINVAL;
3147 	}
3148 
3149 	if (!netlink_strict_get_check(skb))
3150 		return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
3151 					      rtm_ipv4_policy, extack);
3152 
3153 	rtm = nlmsg_data(nlh);
3154 	if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
3155 	    (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
3156 	    rtm->rtm_table || rtm->rtm_protocol ||
3157 	    rtm->rtm_scope || rtm->rtm_type) {
3158 		NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request");
3159 		return -EINVAL;
3160 	}
3161 
3162 	if (rtm->rtm_flags & ~(RTM_F_NOTIFY |
3163 			       RTM_F_LOOKUP_TABLE |
3164 			       RTM_F_FIB_MATCH)) {
3165 		NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request");
3166 		return -EINVAL;
3167 	}
3168 
3169 	err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
3170 					    rtm_ipv4_policy, extack);
3171 	if (err)
3172 		return err;
3173 
3174 	if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
3175 	    (tb[RTA_DST] && !rtm->rtm_dst_len)) {
3176 		NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4");
3177 		return -EINVAL;
3178 	}
3179 
3180 	for (i = 0; i <= RTA_MAX; i++) {
3181 		if (!tb[i])
3182 			continue;
3183 
3184 		switch (i) {
3185 		case RTA_IIF:
3186 		case RTA_OIF:
3187 		case RTA_SRC:
3188 		case RTA_DST:
3189 		case RTA_IP_PROTO:
3190 		case RTA_SPORT:
3191 		case RTA_DPORT:
3192 		case RTA_MARK:
3193 		case RTA_UID:
3194 			break;
3195 		default:
3196 			NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request");
3197 			return -EINVAL;
3198 		}
3199 	}
3200 
3201 	return 0;
3202 }
3203 
3204 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
3205 			     struct netlink_ext_ack *extack)
3206 {
3207 	struct net *net = sock_net(in_skb->sk);
3208 	struct nlattr *tb[RTA_MAX+1];
3209 	u32 table_id = RT_TABLE_MAIN;
3210 	__be16 sport = 0, dport = 0;
3211 	struct fib_result res = {};
3212 	u8 ip_proto = IPPROTO_UDP;
3213 	struct rtable *rt = NULL;
3214 	struct sk_buff *skb;
3215 	struct rtmsg *rtm;
3216 	struct flowi4 fl4 = {};
3217 	__be32 dst = 0;
3218 	__be32 src = 0;
3219 	kuid_t uid;
3220 	u32 iif;
3221 	int err;
3222 	int mark;
3223 
3224 	err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
3225 	if (err < 0)
3226 		return err;
3227 
3228 	rtm = nlmsg_data(nlh);
3229 	src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
3230 	dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
3231 	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3232 	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3233 	if (tb[RTA_UID])
3234 		uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
3235 	else
3236 		uid = (iif ? INVALID_UID : current_uid());
3237 
3238 	if (tb[RTA_IP_PROTO]) {
3239 		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
3240 						  &ip_proto, AF_INET, extack);
3241 		if (err)
3242 			return err;
3243 	}
3244 
3245 	if (tb[RTA_SPORT])
3246 		sport = nla_get_be16(tb[RTA_SPORT]);
3247 
3248 	if (tb[RTA_DPORT])
3249 		dport = nla_get_be16(tb[RTA_DPORT]);
3250 
3251 	skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
3252 	if (!skb)
3253 		return -ENOBUFS;
3254 
3255 	fl4.daddr = dst;
3256 	fl4.saddr = src;
3257 	fl4.flowi4_tos = rtm->rtm_tos & IPTOS_RT_MASK;
3258 	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
3259 	fl4.flowi4_mark = mark;
3260 	fl4.flowi4_uid = uid;
3261 	if (sport)
3262 		fl4.fl4_sport = sport;
3263 	if (dport)
3264 		fl4.fl4_dport = dport;
3265 	fl4.flowi4_proto = ip_proto;
3266 
3267 	rcu_read_lock();
3268 
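	/* With RTA_IIF the request emulates packet reception on that device
	 * (input route); otherwise an output route is resolved as if sent
	 * from the loopback device.
	 */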
3269 	if (iif) {
3270 		struct net_device *dev;
3271 
3272 		dev = dev_get_by_index_rcu(net, iif);
3273 		if (!dev) {
3274 			err = -ENODEV;
3275 			goto errout_rcu;
3276 		}
3277 
3278 		fl4.flowi4_iif = iif; /* for rt_fill_info */
3279 		skb->dev	= dev;
3280 		skb->mark	= mark;
3281 		err = ip_route_input_rcu(skb, dst, src,
3282 					 rtm->rtm_tos & IPTOS_RT_MASK, dev,
3283 					 &res);
3284 
3285 		rt = skb_rtable(skb);
3286 		if (err == 0 && rt->dst.error)
3287 			err = -rt->dst.error;
3288 	} else {
3289 		fl4.flowi4_iif = LOOPBACK_IFINDEX;
3290 		skb->dev = net->loopback_dev;
3291 		rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
3292 		err = 0;
3293 		if (IS_ERR(rt))
3294 			err = PTR_ERR(rt);
3295 		else
3296 			skb_dst_set(skb, &rt->dst);
3297 	}
3298 
3299 	if (err)
3300 		goto errout_rcu;
3301 
3302 	if (rtm->rtm_flags & RTM_F_NOTIFY)
3303 		rt->rt_flags |= RTCF_NOTIFY;
3304 
3305 	if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
3306 		table_id = res.table ? res.table->tb_id : 0;
3307 
3308 	/* reset skb for netlink reply msg */
3309 	skb_trim(skb, 0);
3310 	skb_reset_network_header(skb);
3311 	skb_reset_transport_header(skb);
3312 	skb_reset_mac_header(skb);
3313 
3314 	if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
3315 		struct fib_rt_info fri;
3316 
3317 		if (!res.fi) {
3318 			err = fib_props[res.type].error;
3319 			if (!err)
3320 				err = -EHOSTUNREACH;
3321 			goto errout_rcu;
3322 		}
3323 		fri.fi = res.fi;
3324 		fri.tb_id = table_id;
3325 		fri.dst = res.prefix;
3326 		fri.dst_len = res.prefixlen;
3327 		fri.tos = fl4.flowi4_tos;
3328 		fri.type = rt->rt_type;
3329 		fri.offload = 0;
3330 		fri.trap = 0;
3331 		if (res.fa_head) {
3332 			struct fib_alias *fa;
3333 
3334 			hlist_for_each_entry_rcu(fa, res.fa_head, fa_list) {
3335 				u8 slen = 32 - fri.dst_len;
3336 
3337 				if (fa->fa_slen == slen &&
3338 				    fa->tb_id == fri.tb_id &&
3339 				    fa->fa_tos == fri.tos &&
3340 				    fa->fa_info == res.fi &&
3341 				    fa->fa_type == fri.type) {
3342 					fri.offload = fa->offload;
3343 					fri.trap = fa->trap;
3344 					break;
3345 				}
3346 			}
3347 		}
3348 		err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
3349 				    nlh->nlmsg_seq, RTM_NEWROUTE, &fri, 0);
3350 	} else {
3351 		err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
3352 				   NETLINK_CB(in_skb).portid,
3353 				   nlh->nlmsg_seq, 0);
3354 	}
3355 	if (err < 0)
3356 		goto errout_rcu;
3357 
3358 	rcu_read_unlock();
3359 
3360 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
3361 
3362 errout_free:
3363 	return err;
3364 errout_rcu:
3365 	rcu_read_unlock();
3366 	kfree_skb(skb);
3367 	goto errout_free;
3368 }
3369 
3370 void ip_rt_multicast_event(struct in_device *in_dev)
3371 {
3372 	rt_cache_flush(dev_net(in_dev->dev));
3373 }
3374 
3375 #ifdef CONFIG_SYSCTL
3376 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
3377 static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
3378 static int ip_rt_gc_elasticity __read_mostly	= 8;
3379 static int ip_min_valid_pmtu __read_mostly	= IPV4_MIN_MTU;
3380 
3381 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
3382 		void *buffer, size_t *lenp, loff_t *ppos)
3383 {
3384 	struct net *net = (struct net *)__ctl->extra1;
3385 
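	/* The "flush" sysctl is write-only: any write invalidates all cached
	 * routes and nexthop exceptions in this network namespace.
	 */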
3386 	if (write) {
3387 		rt_cache_flush(net);
3388 		fnhe_genid_bump(net);
3389 		return 0;
3390 	}
3391 
3392 	return -EINVAL;
3393 }
3394 
3395 static struct ctl_table ipv4_route_table[] = {
3396 	{
3397 		.procname	= "gc_thresh",
3398 		.data		= &ipv4_dst_ops.gc_thresh,
3399 		.maxlen		= sizeof(int),
3400 		.mode		= 0644,
3401 		.proc_handler	= proc_dointvec,
3402 	},
3403 	{
3404 		.procname	= "max_size",
3405 		.data		= &ip_rt_max_size,
3406 		.maxlen		= sizeof(int),
3407 		.mode		= 0644,
3408 		.proc_handler	= proc_dointvec,
3409 	},
3410 	{
3411 		/*  Deprecated. Use gc_min_interval_ms */
3412 
3413 		.procname	= "gc_min_interval",
3414 		.data		= &ip_rt_gc_min_interval,
3415 		.maxlen		= sizeof(int),
3416 		.mode		= 0644,
3417 		.proc_handler	= proc_dointvec_jiffies,
3418 	},
3419 	{
3420 		.procname	= "gc_min_interval_ms",
3421 		.data		= &ip_rt_gc_min_interval,
3422 		.maxlen		= sizeof(int),
3423 		.mode		= 0644,
3424 		.proc_handler	= proc_dointvec_ms_jiffies,
3425 	},
3426 	{
3427 		.procname	= "gc_timeout",
3428 		.data		= &ip_rt_gc_timeout,
3429 		.maxlen		= sizeof(int),
3430 		.mode		= 0644,
3431 		.proc_handler	= proc_dointvec_jiffies,
3432 	},
3433 	{
3434 		.procname	= "gc_interval",
3435 		.data		= &ip_rt_gc_interval,
3436 		.maxlen		= sizeof(int),
3437 		.mode		= 0644,
3438 		.proc_handler	= proc_dointvec_jiffies,
3439 	},
3440 	{
3441 		.procname	= "redirect_load",
3442 		.data		= &ip_rt_redirect_load,
3443 		.maxlen		= sizeof(int),
3444 		.mode		= 0644,
3445 		.proc_handler	= proc_dointvec,
3446 	},
3447 	{
3448 		.procname	= "redirect_number",
3449 		.data		= &ip_rt_redirect_number,
3450 		.maxlen		= sizeof(int),
3451 		.mode		= 0644,
3452 		.proc_handler	= proc_dointvec,
3453 	},
3454 	{
3455 		.procname	= "redirect_silence",
3456 		.data		= &ip_rt_redirect_silence,
3457 		.maxlen		= sizeof(int),
3458 		.mode		= 0644,
3459 		.proc_handler	= proc_dointvec,
3460 	},
3461 	{
3462 		.procname	= "error_cost",
3463 		.data		= &ip_rt_error_cost,
3464 		.maxlen		= sizeof(int),
3465 		.mode		= 0644,
3466 		.proc_handler	= proc_dointvec,
3467 	},
3468 	{
3469 		.procname	= "error_burst",
3470 		.data		= &ip_rt_error_burst,
3471 		.maxlen		= sizeof(int),
3472 		.mode		= 0644,
3473 		.proc_handler	= proc_dointvec,
3474 	},
3475 	{
3476 		.procname	= "gc_elasticity",
3477 		.data		= &ip_rt_gc_elasticity,
3478 		.maxlen		= sizeof(int),
3479 		.mode		= 0644,
3480 		.proc_handler	= proc_dointvec,
3481 	},
3482 	{
3483 		.procname	= "mtu_expires",
3484 		.data		= &ip_rt_mtu_expires,
3485 		.maxlen		= sizeof(int),
3486 		.mode		= 0644,
3487 		.proc_handler	= proc_dointvec_jiffies,
3488 	},
3489 	{
3490 		.procname	= "min_pmtu",
3491 		.data		= &ip_rt_min_pmtu,
3492 		.maxlen		= sizeof(int),
3493 		.mode		= 0644,
3494 		.proc_handler	= proc_dointvec_minmax,
3495 		.extra1		= &ip_min_valid_pmtu,
3496 	},
3497 	{
3498 		.procname	= "min_adv_mss",
3499 		.data		= &ip_rt_min_advmss,
3500 		.maxlen		= sizeof(int),
3501 		.mode		= 0644,
3502 		.proc_handler	= proc_dointvec,
3503 	},
3504 	{ }
3505 };
3506 
3507 static const char ipv4_route_flush_procname[] = "flush";
3508 
3509 static struct ctl_table ipv4_route_flush_table[] = {
3510 	{
3511 		.procname	= ipv4_route_flush_procname,
3512 		.maxlen		= sizeof(int),
3513 		.mode		= 0200,
3514 		.proc_handler	= ipv4_sysctl_rtcache_flush,
3515 	},
3516 	{ },
3517 };
3518 
3519 static __net_init int sysctl_route_net_init(struct net *net)
3520 {
3521 	struct ctl_table *tbl;
3522 
3523 	tbl = ipv4_route_flush_table;
3524 	if (!net_eq(net, &init_net)) {
3525 		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3526 		if (!tbl)
3527 			goto err_dup;
3528 
3529 		/* Don't export non-whitelisted sysctls to unprivileged users */
3530 		if (net->user_ns != &init_user_ns) {
3531 			if (tbl[0].procname != ipv4_route_flush_procname)
3532 				tbl[0].procname = NULL;
3533 		}
3534 	}
3535 	tbl[0].extra1 = net;
3536 
3537 	net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
3538 	if (!net->ipv4.route_hdr)
3539 		goto err_reg;
3540 	return 0;
3541 
3542 err_reg:
3543 	if (tbl != ipv4_route_flush_table)
3544 		kfree(tbl);
3545 err_dup:
3546 	return -ENOMEM;
3547 }
3548 
3549 static __net_exit void sysctl_route_net_exit(struct net *net)
3550 {
3551 	struct ctl_table *tbl;
3552 
3553 	tbl = net->ipv4.route_hdr->ctl_table_arg;
3554 	unregister_net_sysctl_table(net->ipv4.route_hdr);
3555 	BUG_ON(tbl == ipv4_route_flush_table);
3556 	kfree(tbl);
3557 }
3558 
3559 static __net_initdata struct pernet_operations sysctl_route_ops = {
3560 	.init = sysctl_route_net_init,
3561 	.exit = sysctl_route_net_exit,
3562 };
3563 #endif
3564 
3565 static __net_init int rt_genid_init(struct net *net)
3566 {
3567 	atomic_set(&net->ipv4.rt_genid, 0);
3568 	atomic_set(&net->fnhe_genid, 0);
3569 	atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
3570 	return 0;
3571 }
3572 
3573 static __net_initdata struct pernet_operations rt_genid_ops = {
3574 	.init = rt_genid_init,
3575 };
3576 
3577 static int __net_init ipv4_inetpeer_init(struct net *net)
3578 {
3579 	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3580 
3581 	if (!bp)
3582 		return -ENOMEM;
3583 	inet_peer_base_init(bp);
3584 	net->ipv4.peers = bp;
3585 	return 0;
3586 }
3587 
3588 static void __net_exit ipv4_inetpeer_exit(struct net *net)
3589 {
3590 	struct inet_peer_base *bp = net->ipv4.peers;
3591 
3592 	net->ipv4.peers = NULL;
3593 	inetpeer_invalidate_tree(bp);
3594 	kfree(bp);
3595 }
3596 
3597 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3598 	.init	=	ipv4_inetpeer_init,
3599 	.exit	=	ipv4_inetpeer_exit,
3600 };
3601 
3602 #ifdef CONFIG_IP_ROUTE_CLASSID
3603 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3604 #endif /* CONFIG_IP_ROUTE_CLASSID */
3605 
3606 int __init ip_rt_init(void)
3607 {
3608 	void *idents_hash;
3609 	int cpu;
3610 
3611 	/* For modern hosts, this will use 2 MB of memory */
3612 	idents_hash = alloc_large_system_hash("IP idents",
3613 					      sizeof(*ip_idents) + sizeof(*ip_tstamps),
3614 					      0,
3615 					      16, /* one bucket per 64 KB */
3616 					      HASH_ZERO,
3617 					      NULL,
3618 					      &ip_idents_mask,
3619 					      2048,
3620 					      256*1024);
3621 
3622 	ip_idents = idents_hash;
3623 
3624 	prandom_bytes(ip_idents, (ip_idents_mask + 1) * sizeof(*ip_idents));
3625 
3626 	ip_tstamps = idents_hash + (ip_idents_mask + 1) * sizeof(*ip_idents);
3627 
3628 	for_each_possible_cpu(cpu) {
3629 		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3630 
3631 		INIT_LIST_HEAD(&ul->head);
3632 		spin_lock_init(&ul->lock);
3633 	}
3634 #ifdef CONFIG_IP_ROUTE_CLASSID
3635 	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3636 	if (!ip_rt_acct)
3637 		panic("IP: failed to allocate ip_rt_acct\n");
3638 #endif
3639 
3640 	ipv4_dst_ops.kmem_cachep =
3641 		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3642 				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3643 
3644 	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3645 
3646 	if (dst_entries_init(&ipv4_dst_ops) < 0)
3647 		panic("IP: failed to allocate ipv4_dst_ops counter\n");
3648 
3649 	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3650 		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3651 
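	/* Routes are cached per nexthop rather than in a global cache, so the
	 * dst garbage-collection thresholds are effectively disabled.
	 */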
3652 	ipv4_dst_ops.gc_thresh = ~0;
3653 	ip_rt_max_size = INT_MAX;
3654 
3655 	devinet_init();
3656 	ip_fib_init();
3657 
3658 	if (ip_rt_proc_init())
3659 		pr_err("Unable to create route proc files\n");
3660 #ifdef CONFIG_XFRM
3661 	xfrm_init();
3662 	xfrm4_init();
3663 #endif
3664 	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
3665 		      RTNL_FLAG_DOIT_UNLOCKED);
3666 
3667 #ifdef CONFIG_SYSCTL
3668 	register_pernet_subsys(&sysctl_route_ops);
3669 #endif
3670 	register_pernet_subsys(&rt_genid_ops);
3671 	register_pernet_subsys(&ipv4_inetpeer_ops);
3672 	return 0;
3673 }
3674 
3675 #ifdef CONFIG_SYSCTL
3676 /*
3677  * We really need to sanitize the damn ipv4 init order, then all
3678  * this nonsense will go away.
3679  */
3680 void __init ip_static_sysctl_init(void)
3681 {
3682 	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3683 }
3684 #endif
3685