1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		ROUTE - implementation of the IP router.
8  *
9  * Authors:	Ross Biro
10  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
11  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
12  *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
13  *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
14  *
15  * Fixes:
16  *		Alan Cox	:	Verify area fixes.
17  *		Alan Cox	:	cli() protects routing changes
18  *		Rui Oliveira	:	ICMP routing table updates
19  *		(rco@di.uminho.pt)	Routing table insertion and update
20  *		Linus Torvalds	:	Rewrote bits to be sensible
21  *		Alan Cox	:	Added BSD route gw semantics
22  *		Alan Cox	:	Super /proc >4K
23  *		Alan Cox	:	MTU in route table
24  *		Alan Cox	: 	MSS actually. Also added the window
25  *					clamper.
26  *		Sam Lantinga	:	Fixed route matching in rt_del()
27  *		Alan Cox	:	Routing cache support.
28  *		Alan Cox	:	Removed compatibility cruft.
29  *		Alan Cox	:	RTF_REJECT support.
30  *		Alan Cox	:	TCP irtt support.
31  *		Jonathan Naylor	:	Added Metric support.
32  *	Miquel van Smoorenburg	:	BSD API fixes.
33  *	Miquel van Smoorenburg	:	Metrics.
34  *		Alan Cox	:	Use __u32 properly
35  *		Alan Cox	:	Aligned routing errors more closely with BSD
36  *					our system is still very different.
37  *		Alan Cox	:	Faster /proc handling
38  *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
39  *					routing caches and better behaviour.
40  *
41  *		Olaf Erb	:	irtt wasn't being copied right.
42  *		Bjorn Ekwall	:	Kerneld route support.
43  *		Alan Cox	:	Multicast fixed (I hope)
44  * 		Pavel Krauz	:	Limited broadcast fixed
45  *		Mike McLagan	:	Routing by source
46  *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
47  *					route.c and rewritten from scratch.
48  *		Andi Kleen	:	Load-limit warning messages.
49  *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
50  *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
51  *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
52  *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
53  *		Marc Boucher	:	routing by fwmark
54  *	Robert Olsson		:	Added rt_cache statistics
55  *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
56  *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
57  * 	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
58  * 	Ilia Sotnikov		:	Removed TOS from hash calculations
59  */
60 
61 #define pr_fmt(fmt) "IPv4: " fmt
62 
63 #include <linux/module.h>
64 #include <linux/uaccess.h>
65 #include <linux/bitops.h>
66 #include <linux/types.h>
67 #include <linux/kernel.h>
68 #include <linux/mm.h>
69 #include <linux/memblock.h>
70 #include <linux/string.h>
71 #include <linux/socket.h>
72 #include <linux/sockios.h>
73 #include <linux/errno.h>
74 #include <linux/in.h>
75 #include <linux/inet.h>
76 #include <linux/netdevice.h>
77 #include <linux/proc_fs.h>
78 #include <linux/init.h>
79 #include <linux/skbuff.h>
80 #include <linux/inetdevice.h>
81 #include <linux/igmp.h>
82 #include <linux/pkt_sched.h>
83 #include <linux/mroute.h>
84 #include <linux/netfilter_ipv4.h>
85 #include <linux/random.h>
86 #include <linux/rcupdate.h>
87 #include <linux/times.h>
88 #include <linux/slab.h>
89 #include <linux/jhash.h>
90 #include <net/dst.h>
91 #include <net/dst_metadata.h>
92 #include <net/net_namespace.h>
93 #include <net/protocol.h>
94 #include <net/ip.h>
95 #include <net/route.h>
96 #include <net/inetpeer.h>
97 #include <net/sock.h>
98 #include <net/ip_fib.h>
99 #include <net/nexthop.h>
100 #include <net/arp.h>
101 #include <net/tcp.h>
102 #include <net/icmp.h>
103 #include <net/xfrm.h>
104 #include <net/lwtunnel.h>
105 #include <net/netevent.h>
106 #include <net/rtnetlink.h>
107 #ifdef CONFIG_SYSCTL
108 #include <linux/sysctl.h>
109 #endif
110 #include <net/secure_seq.h>
111 #include <net/ip_tunnels.h>
112 #include <net/l3mdev.h>
113 
114 #include "fib_lookup.h"
115 
116 #define RT_FL_TOS(oldflp4) \
117 	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
118 
119 #define RT_GC_TIMEOUT (300*HZ)
120 
121 static int ip_rt_max_size;
122 static int ip_rt_redirect_number __read_mostly	= 9;
123 static int ip_rt_redirect_load __read_mostly	= HZ / 50;
124 static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
125 static int ip_rt_error_cost __read_mostly	= HZ;
126 static int ip_rt_error_burst __read_mostly	= 5 * HZ;
127 static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
128 static u32 ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
129 static int ip_rt_min_advmss __read_mostly	= 256;
130 
131 static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
132 
133 /*
134  *	Interface to generic destination cache.
135  */
136 
137 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
138 static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
139 static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
140 static void		ipv4_negative_advice(struct sock *sk,
141 					     struct dst_entry *dst);
142 static void		 ipv4_link_failure(struct sk_buff *skb);
143 static void		 ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
144 					   struct sk_buff *skb, u32 mtu,
145 					   bool confirm_neigh);
146 static void		 ip_do_redirect(struct dst_entry *dst, struct sock *sk,
147 					struct sk_buff *skb);
148 static void		ipv4_dst_destroy(struct dst_entry *dst);
149 
150 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
151 {
152 	WARN_ON(1);
153 	return NULL;
154 }
155 
156 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
157 					   struct sk_buff *skb,
158 					   const void *daddr);
159 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
160 
161 static struct dst_ops ipv4_dst_ops = {
162 	.family =		AF_INET,
163 	.check =		ipv4_dst_check,
164 	.default_advmss =	ipv4_default_advmss,
165 	.mtu =			ipv4_mtu,
166 	.cow_metrics =		ipv4_cow_metrics,
167 	.destroy =		ipv4_dst_destroy,
168 	.negative_advice =	ipv4_negative_advice,
169 	.link_failure =		ipv4_link_failure,
170 	.update_pmtu =		ip_rt_update_pmtu,
171 	.redirect =		ip_do_redirect,
172 	.local_out =		__ip_local_out,
173 	.neigh_lookup =		ipv4_neigh_lookup,
174 	.confirm_neigh =	ipv4_confirm_neigh,
175 };
176 
177 #define ECN_OR_COST(class)	TC_PRIO_##class
178 
179 const __u8 ip_tos2prio[16] = {
180 	TC_PRIO_BESTEFFORT,
181 	ECN_OR_COST(BESTEFFORT),
182 	TC_PRIO_BESTEFFORT,
183 	ECN_OR_COST(BESTEFFORT),
184 	TC_PRIO_BULK,
185 	ECN_OR_COST(BULK),
186 	TC_PRIO_BULK,
187 	ECN_OR_COST(BULK),
188 	TC_PRIO_INTERACTIVE,
189 	ECN_OR_COST(INTERACTIVE),
190 	TC_PRIO_INTERACTIVE,
191 	ECN_OR_COST(INTERACTIVE),
192 	TC_PRIO_INTERACTIVE_BULK,
193 	ECN_OR_COST(INTERACTIVE_BULK),
194 	TC_PRIO_INTERACTIVE_BULK,
195 	ECN_OR_COST(INTERACTIVE_BULK)
196 };
197 EXPORT_SYMBOL(ip_tos2prio);
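/* Added note (not in the original file): this table is meant to be indexed
 * with the four TOS bits, i.e. ip_tos2prio[IPTOS_TOS(tos) >> 1], which is
 * what the rt_tos2priority() helper in <net/route.h> does when mapping a
 * socket's IP_TOS value to a TC_PRIO_* queueing band, roughly:
 *
 *	sk->sk_priority = rt_tos2priority(tos);
 */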
198 
199 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
200 #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
201 
202 #ifdef CONFIG_PROC_FS
203 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
204 {
205 	if (*pos)
206 		return NULL;
207 	return SEQ_START_TOKEN;
208 }
209 
210 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
211 {
212 	++*pos;
213 	return NULL;
214 }
215 
216 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
217 {
218 }
219 
220 static int rt_cache_seq_show(struct seq_file *seq, void *v)
221 {
222 	if (v == SEQ_START_TOKEN)
223 		seq_printf(seq, "%-127s\n",
224 			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
225 			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
226 			   "HHUptod\tSpecDst");
227 	return 0;
228 }
229 
230 static const struct seq_operations rt_cache_seq_ops = {
231 	.start  = rt_cache_seq_start,
232 	.next   = rt_cache_seq_next,
233 	.stop   = rt_cache_seq_stop,
234 	.show   = rt_cache_seq_show,
235 };
236 
237 static int rt_cache_seq_open(struct inode *inode, struct file *file)
238 {
239 	return seq_open(file, &rt_cache_seq_ops);
240 }
241 
242 static const struct proc_ops rt_cache_proc_ops = {
243 	.proc_open	= rt_cache_seq_open,
244 	.proc_read	= seq_read,
245 	.proc_lseek	= seq_lseek,
246 	.proc_release	= seq_release,
247 };
248 
249 
250 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
251 {
252 	int cpu;
253 
254 	if (*pos == 0)
255 		return SEQ_START_TOKEN;
256 
257 	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
258 		if (!cpu_possible(cpu))
259 			continue;
260 		*pos = cpu+1;
261 		return &per_cpu(rt_cache_stat, cpu);
262 	}
263 	return NULL;
264 }
265 
266 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
267 {
268 	int cpu;
269 
270 	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
271 		if (!cpu_possible(cpu))
272 			continue;
273 		*pos = cpu+1;
274 		return &per_cpu(rt_cache_stat, cpu);
275 	}
276 	(*pos)++;
277 	return NULL;
278 
279 }
280 
281 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
282 {
283 
284 }
285 
286 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
287 {
288 	struct rt_cache_stat *st = v;
289 
290 	if (v == SEQ_START_TOKEN) {
291 		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
292 		return 0;
293 	}
294 
295 	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
296 		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
297 		   dst_entries_get_slow(&ipv4_dst_ops),
298 		   0, /* st->in_hit */
299 		   st->in_slow_tot,
300 		   st->in_slow_mc,
301 		   st->in_no_route,
302 		   st->in_brd,
303 		   st->in_martian_dst,
304 		   st->in_martian_src,
305 
306 		   0, /* st->out_hit */
307 		   st->out_slow_tot,
308 		   st->out_slow_mc,
309 
310 		   0, /* st->gc_total */
311 		   0, /* st->gc_ignored */
312 		   0, /* st->gc_goal_miss */
313 		   0, /* st->gc_dst_overflow */
314 		   0, /* st->in_hlist_search */
315 		   0  /* st->out_hlist_search */
316 		);
317 	return 0;
318 }
319 
320 static const struct seq_operations rt_cpu_seq_ops = {
321 	.start  = rt_cpu_seq_start,
322 	.next   = rt_cpu_seq_next,
323 	.stop   = rt_cpu_seq_stop,
324 	.show   = rt_cpu_seq_show,
325 };
326 
327 
328 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
329 {
330 	return seq_open(file, &rt_cpu_seq_ops);
331 }
332 
333 static const struct proc_ops rt_cpu_proc_ops = {
334 	.proc_open	= rt_cpu_seq_open,
335 	.proc_read	= seq_read,
336 	.proc_lseek	= seq_lseek,
337 	.proc_release	= seq_release,
338 };
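/* Usage note (added for clarity): these ops back the per-netns statistics
 * file registered below in ip_rt_do_proc_init() as /proc/net/stat/rt_cache.
 * Reading it, e.g.
 *
 *	cat /proc/net/stat/rt_cache
 *
 * prints the header line followed by one row of hexadecimal counters per
 * possible CPU, in the column order emitted by rt_cpu_seq_show() above.
 */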
339 
340 #ifdef CONFIG_IP_ROUTE_CLASSID
341 static int rt_acct_proc_show(struct seq_file *m, void *v)
342 {
343 	struct ip_rt_acct *dst, *src;
344 	unsigned int i, j;
345 
346 	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
347 	if (!dst)
348 		return -ENOMEM;
349 
350 	for_each_possible_cpu(i) {
351 		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
352 		for (j = 0; j < 256; j++) {
353 			dst[j].o_bytes   += src[j].o_bytes;
354 			dst[j].o_packets += src[j].o_packets;
355 			dst[j].i_bytes   += src[j].i_bytes;
356 			dst[j].i_packets += src[j].i_packets;
357 		}
358 	}
359 
360 	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
361 	kfree(dst);
362 	return 0;
363 }
364 #endif
365 
366 static int __net_init ip_rt_do_proc_init(struct net *net)
367 {
368 	struct proc_dir_entry *pde;
369 
370 	pde = proc_create("rt_cache", 0444, net->proc_net,
371 			  &rt_cache_proc_ops);
372 	if (!pde)
373 		goto err1;
374 
375 	pde = proc_create("rt_cache", 0444,
376 			  net->proc_net_stat, &rt_cpu_proc_ops);
377 	if (!pde)
378 		goto err2;
379 
380 #ifdef CONFIG_IP_ROUTE_CLASSID
381 	pde = proc_create_single("rt_acct", 0, net->proc_net,
382 			rt_acct_proc_show);
383 	if (!pde)
384 		goto err3;
385 #endif
386 	return 0;
387 
388 #ifdef CONFIG_IP_ROUTE_CLASSID
389 err3:
390 	remove_proc_entry("rt_cache", net->proc_net_stat);
391 #endif
392 err2:
393 	remove_proc_entry("rt_cache", net->proc_net);
394 err1:
395 	return -ENOMEM;
396 }
397 
398 static void __net_exit ip_rt_do_proc_exit(struct net *net)
399 {
400 	remove_proc_entry("rt_cache", net->proc_net_stat);
401 	remove_proc_entry("rt_cache", net->proc_net);
402 #ifdef CONFIG_IP_ROUTE_CLASSID
403 	remove_proc_entry("rt_acct", net->proc_net);
404 #endif
405 }
406 
407 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
408 	.init = ip_rt_do_proc_init,
409 	.exit = ip_rt_do_proc_exit,
410 };
411 
412 static int __init ip_rt_proc_init(void)
413 {
414 	return register_pernet_subsys(&ip_rt_proc_ops);
415 }
416 
417 #else
418 static inline int ip_rt_proc_init(void)
419 {
420 	return 0;
421 }
422 #endif /* CONFIG_PROC_FS */
423 
424 static inline bool rt_is_expired(const struct rtable *rth)
425 {
426 	return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
427 }
428 
429 void rt_cache_flush(struct net *net)
430 {
431 	rt_genid_bump_ipv4(net);
432 }
433 
434 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
435 					   struct sk_buff *skb,
436 					   const void *daddr)
437 {
438 	const struct rtable *rt = container_of(dst, struct rtable, dst);
439 	struct net_device *dev = dst->dev;
440 	struct neighbour *n;
441 
442 	rcu_read_lock_bh();
443 
444 	if (likely(rt->rt_gw_family == AF_INET)) {
445 		n = ip_neigh_gw4(dev, rt->rt_gw4);
446 	} else if (rt->rt_gw_family == AF_INET6) {
447 		n = ip_neigh_gw6(dev, &rt->rt_gw6);
448 	} else {
449 		__be32 pkey;
450 
451 		pkey = skb ? ip_hdr(skb)->daddr : *((__be32 *) daddr);
452 		n = ip_neigh_gw4(dev, pkey);
453 	}
454 
455 	if (!IS_ERR(n) && !refcount_inc_not_zero(&n->refcnt))
456 		n = NULL;
457 
458 	rcu_read_unlock_bh();
459 
460 	return n;
461 }
462 
463 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
464 {
465 	const struct rtable *rt = container_of(dst, struct rtable, dst);
466 	struct net_device *dev = dst->dev;
467 	const __be32 *pkey = daddr;
468 
469 	if (rt->rt_gw_family == AF_INET) {
470 		pkey = (const __be32 *)&rt->rt_gw4;
471 	} else if (rt->rt_gw_family == AF_INET6) {
472 		return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6);
473 	} else if (!daddr ||
474 		 (rt->rt_flags &
475 		  (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) {
476 		return;
477 	}
478 	__ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
479 }
480 
481 /* Hash tables of size 2048..262144 depending on RAM size.
482  * Each bucket uses 8 bytes.
483  */
484 static u32 ip_idents_mask __read_mostly;
485 static atomic_t *ip_idents __read_mostly;
486 static u32 *ip_tstamps __read_mostly;
487 
488 /* In order to protect privacy, we add a perturbation to identifiers
489  * if one generator is seldom used. This makes it hard for an attacker
490  * to infer how many packets were sent between two points in time.
491  */
492 u32 ip_idents_reserve(u32 hash, int segs)
493 {
494 	u32 bucket, old, now = (u32)jiffies;
495 	atomic_t *p_id;
496 	u32 *p_tstamp;
497 	u32 delta = 0;
498 
499 	bucket = hash & ip_idents_mask;
500 	p_tstamp = ip_tstamps + bucket;
501 	p_id = ip_idents + bucket;
502 	old = READ_ONCE(*p_tstamp);
503 
504 	if (old != now && cmpxchg(p_tstamp, old, now) == old)
505 		delta = prandom_u32_max(now - old);
506 
507 	/* If UBSAN reports an error here, please make sure your compiler
508 	 * supports -fno-strict-overflow before reporting it; that was a bug
509 	 * in UBSAN, and it has been fixed in GCC 8.
510 	 */
511 	return atomic_add_return(segs + delta, p_id) - segs;
512 }
513 EXPORT_SYMBOL(ip_idents_reserve);
514 
515 void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
516 {
517 	u32 hash, id;
518 
519 	/* Note the following code is not safe, but this is okay. */
520 	if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
521 		get_random_bytes(&net->ipv4.ip_id_key,
522 				 sizeof(net->ipv4.ip_id_key));
523 
524 	hash = siphash_3u32((__force u32)iph->daddr,
525 			    (__force u32)iph->saddr,
526 			    iph->protocol,
527 			    &net->ipv4.ip_id_key);
528 	id = ip_idents_reserve(hash, segs);
529 	iph->id = htons(id);
530 }
531 EXPORT_SYMBOL(__ip_select_ident);
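/* Example (illustrative, not part of the original source): callers that may
 * emit several segments reserve a contiguous block of IDs, much as the
 * tunnel transmit path does:
 *
 *	__ip_select_ident(net, ip_hdr(skb), skb_shinfo(skb)->gso_segs ?: 1);
 *
 * Datagrams sent with DF set normally bypass this and get an ID of zero via
 * ip_select_ident_segs().
 */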
532 
533 static void ip_rt_fix_tos(struct flowi4 *fl4)
534 {
535 	__u8 tos = RT_FL_TOS(fl4);
536 
537 	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
538 	fl4->flowi4_scope = tos & RTO_ONLINK ?
539 			    RT_SCOPE_LINK : RT_SCOPE_UNIVERSE;
540 }
541 
542 static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
543 			     const struct sock *sk,
544 			     const struct iphdr *iph,
545 			     int oif, u8 tos,
546 			     u8 prot, u32 mark, int flow_flags)
547 {
548 	if (sk) {
549 		const struct inet_sock *inet = inet_sk(sk);
550 
551 		oif = sk->sk_bound_dev_if;
552 		mark = sk->sk_mark;
553 		tos = RT_CONN_FLAGS(sk);
554 		prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
555 	}
556 	flowi4_init_output(fl4, oif, mark, tos,
557 			   RT_SCOPE_UNIVERSE, prot,
558 			   flow_flags,
559 			   iph->daddr, iph->saddr, 0, 0,
560 			   sock_net_uid(net, sk));
561 }
562 
563 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
564 			       const struct sock *sk)
565 {
566 	const struct net *net = dev_net(skb->dev);
567 	const struct iphdr *iph = ip_hdr(skb);
568 	int oif = skb->dev->ifindex;
569 	u8 tos = RT_TOS(iph->tos);
570 	u8 prot = iph->protocol;
571 	u32 mark = skb->mark;
572 
573 	__build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
574 }
575 
576 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
577 {
578 	const struct inet_sock *inet = inet_sk(sk);
579 	const struct ip_options_rcu *inet_opt;
580 	__be32 daddr = inet->inet_daddr;
581 
582 	rcu_read_lock();
583 	inet_opt = rcu_dereference(inet->inet_opt);
584 	if (inet_opt && inet_opt->opt.srr)
585 		daddr = inet_opt->opt.faddr;
586 	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
587 			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
588 			   inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
589 			   inet_sk_flowi_flags(sk),
590 			   daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
591 	rcu_read_unlock();
592 }
593 
594 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
595 				 const struct sk_buff *skb)
596 {
597 	if (skb)
598 		build_skb_flow_key(fl4, skb, sk);
599 	else
600 		build_sk_flow_key(fl4, sk);
601 }
602 
603 static DEFINE_SPINLOCK(fnhe_lock);
604 
605 static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
606 {
607 	struct rtable *rt;
608 
609 	rt = rcu_dereference(fnhe->fnhe_rth_input);
610 	if (rt) {
611 		RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
612 		dst_dev_put(&rt->dst);
613 		dst_release(&rt->dst);
614 	}
615 	rt = rcu_dereference(fnhe->fnhe_rth_output);
616 	if (rt) {
617 		RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
618 		dst_dev_put(&rt->dst);
619 		dst_release(&rt->dst);
620 	}
621 }
622 
623 static void fnhe_remove_oldest(struct fnhe_hash_bucket *hash)
624 {
625 	struct fib_nh_exception __rcu **fnhe_p, **oldest_p;
626 	struct fib_nh_exception *fnhe, *oldest = NULL;
627 
628 	for (fnhe_p = &hash->chain; ; fnhe_p = &fnhe->fnhe_next) {
629 		fnhe = rcu_dereference_protected(*fnhe_p,
630 						 lockdep_is_held(&fnhe_lock));
631 		if (!fnhe)
632 			break;
633 		if (!oldest ||
634 		    time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp)) {
635 			oldest = fnhe;
636 			oldest_p = fnhe_p;
637 		}
638 	}
639 	fnhe_flush_routes(oldest);
640 	*oldest_p = oldest->fnhe_next;
641 	kfree_rcu(oldest, rcu);
642 }
643 
644 static u32 fnhe_hashfun(__be32 daddr)
645 {
646 	static siphash_key_t fnhe_hash_key __read_mostly;
647 	u64 hval;
648 
649 	net_get_random_once(&fnhe_hash_key, sizeof(fnhe_hash_key));
650 	hval = siphash_1u32((__force u32)daddr, &fnhe_hash_key);
651 	return hash_64(hval, FNHE_HASH_SHIFT);
652 }
653 
654 static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
655 {
656 	rt->rt_pmtu = fnhe->fnhe_pmtu;
657 	rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
658 	rt->dst.expires = fnhe->fnhe_expires;
659 
660 	if (fnhe->fnhe_gw) {
661 		rt->rt_flags |= RTCF_REDIRECTED;
662 		rt->rt_uses_gateway = 1;
663 		rt->rt_gw_family = AF_INET;
664 		rt->rt_gw4 = fnhe->fnhe_gw;
665 	}
666 }
667 
668 static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr,
669 				  __be32 gw, u32 pmtu, bool lock,
670 				  unsigned long expires)
671 {
672 	struct fnhe_hash_bucket *hash;
673 	struct fib_nh_exception *fnhe;
674 	struct rtable *rt;
675 	u32 genid, hval;
676 	unsigned int i;
677 	int depth;
678 
679 	genid = fnhe_genid(dev_net(nhc->nhc_dev));
680 	hval = fnhe_hashfun(daddr);
681 
682 	spin_lock_bh(&fnhe_lock);
683 
684 	hash = rcu_dereference(nhc->nhc_exceptions);
685 	if (!hash) {
686 		hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
687 		if (!hash)
688 			goto out_unlock;
689 		rcu_assign_pointer(nhc->nhc_exceptions, hash);
690 	}
691 
692 	hash += hval;
693 
694 	depth = 0;
695 	for (fnhe = rcu_dereference(hash->chain); fnhe;
696 	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
697 		if (fnhe->fnhe_daddr == daddr)
698 			break;
699 		depth++;
700 	}
701 
702 	if (fnhe) {
703 		if (fnhe->fnhe_genid != genid)
704 			fnhe->fnhe_genid = genid;
705 		if (gw)
706 			fnhe->fnhe_gw = gw;
707 		if (pmtu) {
708 			fnhe->fnhe_pmtu = pmtu;
709 			fnhe->fnhe_mtu_locked = lock;
710 		}
711 		fnhe->fnhe_expires = max(1UL, expires);
712 		/* Update all cached dsts too */
713 		rt = rcu_dereference(fnhe->fnhe_rth_input);
714 		if (rt)
715 			fill_route_from_fnhe(rt, fnhe);
716 		rt = rcu_dereference(fnhe->fnhe_rth_output);
717 		if (rt)
718 			fill_route_from_fnhe(rt, fnhe);
719 	} else {
720 		/* Randomize max depth to avoid some side-channel attacks. */
721 		int max_depth = FNHE_RECLAIM_DEPTH +
722 				prandom_u32_max(FNHE_RECLAIM_DEPTH);
723 
724 		while (depth > max_depth) {
725 			fnhe_remove_oldest(hash);
726 			depth--;
727 		}
728 
729 		fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
730 		if (!fnhe)
731 			goto out_unlock;
732 
733 		fnhe->fnhe_next = hash->chain;
734 
735 		fnhe->fnhe_genid = genid;
736 		fnhe->fnhe_daddr = daddr;
737 		fnhe->fnhe_gw = gw;
738 		fnhe->fnhe_pmtu = pmtu;
739 		fnhe->fnhe_mtu_locked = lock;
740 		fnhe->fnhe_expires = max(1UL, expires);
741 
742 		rcu_assign_pointer(hash->chain, fnhe);
743 
744 		/* Exception created; mark the cached routes for the nexthop
745 		 * stale, so anyone caching it rechecks if this exception
746 		 * applies to them.
747 		 */
748 		rt = rcu_dereference(nhc->nhc_rth_input);
749 		if (rt)
750 			rt->dst.obsolete = DST_OBSOLETE_KILL;
751 
752 		for_each_possible_cpu(i) {
753 			struct rtable __rcu **prt;
754 			prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i);
755 			rt = rcu_dereference(*prt);
756 			if (rt)
757 				rt->dst.obsolete = DST_OBSOLETE_KILL;
758 		}
759 	}
760 
761 	fnhe->fnhe_stamp = jiffies;
762 
763 out_unlock:
764 	spin_unlock_bh(&fnhe_lock);
765 }
766 
767 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
768 			     bool kill_route)
769 {
770 	__be32 new_gw = icmp_hdr(skb)->un.gateway;
771 	__be32 old_gw = ip_hdr(skb)->saddr;
772 	struct net_device *dev = skb->dev;
773 	struct in_device *in_dev;
774 	struct fib_result res;
775 	struct neighbour *n;
776 	struct net *net;
777 
778 	switch (icmp_hdr(skb)->code & 7) {
779 	case ICMP_REDIR_NET:
780 	case ICMP_REDIR_NETTOS:
781 	case ICMP_REDIR_HOST:
782 	case ICMP_REDIR_HOSTTOS:
783 		break;
784 
785 	default:
786 		return;
787 	}
788 
789 	if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw)
790 		return;
791 
792 	in_dev = __in_dev_get_rcu(dev);
793 	if (!in_dev)
794 		return;
795 
796 	net = dev_net(dev);
797 	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
798 	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
799 	    ipv4_is_zeronet(new_gw))
800 		goto reject_redirect;
801 
802 	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
803 		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
804 			goto reject_redirect;
805 		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
806 			goto reject_redirect;
807 	} else {
808 		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
809 			goto reject_redirect;
810 	}
811 
812 	n = __ipv4_neigh_lookup(rt->dst.dev, (__force u32)new_gw);
813 	if (!n)
814 		n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
815 	if (!IS_ERR(n)) {
816 		if (!(n->nud_state & NUD_VALID)) {
817 			neigh_event_send(n, NULL);
818 		} else {
819 			if (fib_lookup(net, fl4, &res, 0) == 0) {
820 				struct fib_nh_common *nhc;
821 
822 				fib_select_path(net, &res, fl4, skb);
823 				nhc = FIB_RES_NHC(res);
824 				update_or_create_fnhe(nhc, fl4->daddr, new_gw,
825 						0, false,
826 						jiffies + ip_rt_gc_timeout);
827 			}
828 			if (kill_route)
829 				rt->dst.obsolete = DST_OBSOLETE_KILL;
830 			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
831 		}
832 		neigh_release(n);
833 	}
834 	return;
835 
836 reject_redirect:
837 #ifdef CONFIG_IP_ROUTE_VERBOSE
838 	if (IN_DEV_LOG_MARTIANS(in_dev)) {
839 		const struct iphdr *iph = (const struct iphdr *) skb->data;
840 		__be32 daddr = iph->daddr;
841 		__be32 saddr = iph->saddr;
842 
843 		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
844 				     "  Advised path = %pI4 -> %pI4\n",
845 				     &old_gw, dev->name, &new_gw,
846 				     &saddr, &daddr);
847 	}
848 #endif
849 	;
850 }
851 
852 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
853 {
854 	struct rtable *rt;
855 	struct flowi4 fl4;
856 	const struct iphdr *iph = (const struct iphdr *) skb->data;
857 	struct net *net = dev_net(skb->dev);
858 	int oif = skb->dev->ifindex;
859 	u8 tos = RT_TOS(iph->tos);
860 	u8 prot = iph->protocol;
861 	u32 mark = skb->mark;
862 
863 	rt = (struct rtable *) dst;
864 
865 	__build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
866 	ip_rt_fix_tos(&fl4);
867 	__ip_do_redirect(rt, skb, &fl4, true);
868 }
869 
870 static void ipv4_negative_advice(struct sock *sk,
871 				 struct dst_entry *dst)
872 {
873 	struct rtable *rt = (struct rtable *)dst;
874 
875 	if ((dst->obsolete > 0) ||
876 	    (rt->rt_flags & RTCF_REDIRECTED) ||
877 	    rt->dst.expires)
878 		sk_dst_reset(sk);
879 }
880 
881 /*
882  * Algorithm:
883  *	1. The first ip_rt_redirect_number redirects are sent
884  *	   with exponential backoff, then we stop sending them at all,
885  *	   assuming that the host ignores our redirects.
886  *	2. If we did not see packets requiring redirects
887  *	   during ip_rt_redirect_silence, we assume that the host
888  *	   forgot the redirected route and start sending redirects again.
889  *
890  * This algorithm is much cheaper and more intelligent than dumb load limiting
891  * in icmp.c.
892  *
893  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
894  * and "frag. need" (breaks PMTU discovery) in icmp.c.
895  */
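/* Worked example (added; assumes HZ = 1000 and the defaults above):
 * ip_rt_redirect_load is HZ/50 = 20 jiffies, so after n redirects have been
 * sent, the next one is emitted only when
 *
 *	time_after(jiffies, peer->rate_last + (20 << n))
 *
 * holds, i.e. roughly 40 ms, 80 ms, ... between redirects, doubling each
 * time.  Once ip_rt_redirect_number (9) redirects have been ignored we stop
 * entirely until ip_rt_redirect_silence (20 << 10 jiffies, about 20 s) of
 * quiet resets the counters in ip_rt_send_redirect() below.
 */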
896 
897 void ip_rt_send_redirect(struct sk_buff *skb)
898 {
899 	struct rtable *rt = skb_rtable(skb);
900 	struct in_device *in_dev;
901 	struct inet_peer *peer;
902 	struct net *net;
903 	int log_martians;
904 	int vif;
905 
906 	rcu_read_lock();
907 	in_dev = __in_dev_get_rcu(rt->dst.dev);
908 	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
909 		rcu_read_unlock();
910 		return;
911 	}
912 	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
913 	vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
914 	rcu_read_unlock();
915 
916 	net = dev_net(rt->dst.dev);
917 	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
918 	if (!peer) {
919 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
920 			  rt_nexthop(rt, ip_hdr(skb)->daddr));
921 		return;
922 	}
923 
924 	/* No redirected packets during ip_rt_redirect_silence;
925 	 * reset the algorithm.
926 	 */
927 	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
928 		peer->rate_tokens = 0;
929 		peer->n_redirects = 0;
930 	}
931 
932 	/* Too many ignored redirects; do not send anything, just
933 	 * record the time of the last seen redirected packet in peer->rate_last.
934 	 */
935 	if (peer->n_redirects >= ip_rt_redirect_number) {
936 		peer->rate_last = jiffies;
937 		goto out_put_peer;
938 	}
939 
940 	/* Check for load limit; set rate_last to the latest sent
941 	 * redirect.
942 	 */
943 	if (peer->n_redirects == 0 ||
944 	    time_after(jiffies,
945 		       (peer->rate_last +
946 			(ip_rt_redirect_load << peer->n_redirects)))) {
947 		__be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
948 
949 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
950 		peer->rate_last = jiffies;
951 		++peer->n_redirects;
952 #ifdef CONFIG_IP_ROUTE_VERBOSE
953 		if (log_martians &&
954 		    peer->n_redirects == ip_rt_redirect_number)
955 			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
956 					     &ip_hdr(skb)->saddr, inet_iif(skb),
957 					     &ip_hdr(skb)->daddr, &gw);
958 #endif
959 	}
960 out_put_peer:
961 	inet_putpeer(peer);
962 }
963 
964 static int ip_error(struct sk_buff *skb)
965 {
966 	struct rtable *rt = skb_rtable(skb);
967 	struct net_device *dev = skb->dev;
968 	struct in_device *in_dev;
969 	struct inet_peer *peer;
970 	unsigned long now;
971 	struct net *net;
972 	bool send;
973 	int code;
974 
975 	if (netif_is_l3_master(skb->dev)) {
976 		dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
977 		if (!dev)
978 			goto out;
979 	}
980 
981 	in_dev = __in_dev_get_rcu(dev);
982 
983 	/* IP on this device is disabled. */
984 	if (!in_dev)
985 		goto out;
986 
987 	net = dev_net(rt->dst.dev);
988 	if (!IN_DEV_FORWARD(in_dev)) {
989 		switch (rt->dst.error) {
990 		case EHOSTUNREACH:
991 			__IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
992 			break;
993 
994 		case ENETUNREACH:
995 			__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
996 			break;
997 		}
998 		goto out;
999 	}
1000 
1001 	switch (rt->dst.error) {
1002 	case EINVAL:
1003 	default:
1004 		goto out;
1005 	case EHOSTUNREACH:
1006 		code = ICMP_HOST_UNREACH;
1007 		break;
1008 	case ENETUNREACH:
1009 		code = ICMP_NET_UNREACH;
1010 		__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
1011 		break;
1012 	case EACCES:
1013 		code = ICMP_PKT_FILTERED;
1014 		break;
1015 	}
1016 
1017 	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
1018 			       l3mdev_master_ifindex(skb->dev), 1);
1019 
1020 	send = true;
1021 	if (peer) {
1022 		now = jiffies;
1023 		peer->rate_tokens += now - peer->rate_last;
1024 		if (peer->rate_tokens > ip_rt_error_burst)
1025 			peer->rate_tokens = ip_rt_error_burst;
1026 		peer->rate_last = now;
1027 		if (peer->rate_tokens >= ip_rt_error_cost)
1028 			peer->rate_tokens -= ip_rt_error_cost;
1029 		else
1030 			send = false;
1031 		inet_putpeer(peer);
1032 	}
1033 	if (send)
1034 		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1035 
1036 out:	kfree_skb(skb);
1037 	return 0;
1038 }
1039 
1040 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1041 {
1042 	struct dst_entry *dst = &rt->dst;
1043 	struct fib_result res;
1044 	bool lock = false;
1045 	struct net *net;
1046 	u32 old_mtu;
1047 
1048 	if (ip_mtu_locked(dst))
1049 		return;
1050 
1051 	old_mtu = ipv4_mtu(dst);
1052 	if (old_mtu < mtu)
1053 		return;
1054 
1055 	rcu_read_lock();
1056 	net = dev_net_rcu(dst->dev);
1057 	if (mtu < ip_rt_min_pmtu) {
1058 		lock = true;
1059 		mtu = min(old_mtu, ip_rt_min_pmtu);
1060 	}
1061 
1062 	if (rt->rt_pmtu == mtu && !lock &&
1063 	    time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
1064 		goto out;
1065 
1066 	if (fib_lookup(net, fl4, &res, 0) == 0) {
1067 		struct fib_nh_common *nhc;
1068 
1069 		fib_select_path(net, &res, fl4, NULL);
1070 		nhc = FIB_RES_NHC(res);
1071 		update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
1072 				      jiffies + ip_rt_mtu_expires);
1073 	}
1074 out:
1075 	rcu_read_unlock();
1076 }
1077 
1078 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1079 			      struct sk_buff *skb, u32 mtu,
1080 			      bool confirm_neigh)
1081 {
1082 	struct rtable *rt = (struct rtable *) dst;
1083 	struct flowi4 fl4;
1084 
1085 	ip_rt_build_flow_key(&fl4, sk, skb);
1086 	ip_rt_fix_tos(&fl4);
1087 
1088 	/* Don't make lookup fail for bridged encapsulations */
1089 	if (skb && netif_is_any_bridge_port(skb->dev))
1090 		fl4.flowi4_oif = 0;
1091 
1092 	__ip_rt_update_pmtu(rt, &fl4, mtu);
1093 }
1094 
1095 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1096 		      int oif, u8 protocol)
1097 {
1098 	const struct iphdr *iph = (const struct iphdr *)skb->data;
1099 	struct flowi4 fl4;
1100 	struct rtable *rt;
1101 	u32 mark = IP4_REPLY_MARK(net, skb->mark);
1102 
1103 	__build_flow_key(net, &fl4, NULL, iph, oif,
1104 			 RT_TOS(iph->tos), protocol, mark, 0);
1105 	rt = __ip_route_output_key(net, &fl4);
1106 	if (!IS_ERR(rt)) {
1107 		__ip_rt_update_pmtu(rt, &fl4, mtu);
1108 		ip_rt_put(rt);
1109 	}
1110 }
1111 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
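/* Usage sketch (illustrative; "t" and "info" come from the caller's
 * context): tunnel error handlers typically invoke this when an ICMP
 * fragmentation-needed message arrives for the outer header, roughly:
 *
 *	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
 *		ipv4_update_pmtu(skb, dev_net(skb->dev), info,
 *				 t->parms.link, iph->protocol);
 *
 * where "info" carries the next-hop MTU reported by the ICMP message.
 */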
1112 
1113 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1114 {
1115 	const struct iphdr *iph = (const struct iphdr *)skb->data;
1116 	struct flowi4 fl4;
1117 	struct rtable *rt;
1118 
1119 	__build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1120 
1121 	if (!fl4.flowi4_mark)
1122 		fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1123 
1124 	rt = __ip_route_output_key(sock_net(sk), &fl4);
1125 	if (!IS_ERR(rt)) {
1126 		__ip_rt_update_pmtu(rt, &fl4, mtu);
1127 		ip_rt_put(rt);
1128 	}
1129 }
1130 
1131 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1132 {
1133 	const struct iphdr *iph = (const struct iphdr *)skb->data;
1134 	struct flowi4 fl4;
1135 	struct rtable *rt;
1136 	struct dst_entry *odst = NULL;
1137 	bool new = false;
1138 	struct net *net = sock_net(sk);
1139 
1140 	bh_lock_sock(sk);
1141 
1142 	if (!ip_sk_accept_pmtu(sk))
1143 		goto out;
1144 
1145 	odst = sk_dst_get(sk);
1146 
1147 	if (sock_owned_by_user(sk) || !odst) {
1148 		__ipv4_sk_update_pmtu(skb, sk, mtu);
1149 		goto out;
1150 	}
1151 
1152 	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1153 
1154 	rt = (struct rtable *)odst;
1155 	if (odst->obsolete && !odst->ops->check(odst, 0)) {
1156 		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1157 		if (IS_ERR(rt))
1158 			goto out;
1159 
1160 		new = true;
1161 	} else {
1162 		ip_rt_fix_tos(&fl4);
1163 	}
1164 
1165 	__ip_rt_update_pmtu((struct rtable *)xfrm_dst_path(&rt->dst), &fl4, mtu);
1166 
1167 	if (!dst_check(&rt->dst, 0)) {
1168 		if (new)
1169 			dst_release(&rt->dst);
1170 
1171 		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1172 		if (IS_ERR(rt))
1173 			goto out;
1174 
1175 		new = true;
1176 	}
1177 
1178 	if (new)
1179 		sk_dst_set(sk, &rt->dst);
1180 
1181 out:
1182 	bh_unlock_sock(sk);
1183 	dst_release(odst);
1184 }
1185 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1186 
1187 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1188 		   int oif, u8 protocol)
1189 {
1190 	const struct iphdr *iph = (const struct iphdr *)skb->data;
1191 	struct flowi4 fl4;
1192 	struct rtable *rt;
1193 
1194 	__build_flow_key(net, &fl4, NULL, iph, oif,
1195 			 RT_TOS(iph->tos), protocol, 0, 0);
1196 	rt = __ip_route_output_key(net, &fl4);
1197 	if (!IS_ERR(rt)) {
1198 		__ip_do_redirect(rt, skb, &fl4, false);
1199 		ip_rt_put(rt);
1200 	}
1201 }
1202 EXPORT_SYMBOL_GPL(ipv4_redirect);
1203 
1204 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1205 {
1206 	const struct iphdr *iph = (const struct iphdr *)skb->data;
1207 	struct flowi4 fl4;
1208 	struct rtable *rt;
1209 	struct net *net = sock_net(sk);
1210 
1211 	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1212 	rt = __ip_route_output_key(net, &fl4);
1213 	if (!IS_ERR(rt)) {
1214 		__ip_do_redirect(rt, skb, &fl4, false);
1215 		ip_rt_put(rt);
1216 	}
1217 }
1218 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1219 
1220 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1221 {
1222 	struct rtable *rt = (struct rtable *) dst;
1223 
1224 	/* All IPV4 dsts are created with ->obsolete set to the value
1225 	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1226 	 * into this function always.
1227 	 *
1228 	 * When a PMTU/redirect information update invalidates a route,
1229 	 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1230 	 * DST_OBSOLETE_DEAD.
1231 	 */
1232 	if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1233 		return NULL;
1234 	return dst;
1235 }
1236 
1237 static void ipv4_send_dest_unreach(struct sk_buff *skb)
1238 {
1239 	struct net_device *dev;
1240 	struct ip_options opt;
1241 	int res;
1242 
1243 	/* Recompile ip options since IPCB may not be valid anymore.
1244 	 * Also check we have a reasonable ipv4 header.
1245 	 */
1246 	if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
1247 	    ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
1248 		return;
1249 
1250 	memset(&opt, 0, sizeof(opt));
1251 	if (ip_hdr(skb)->ihl > 5) {
1252 		if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
1253 			return;
1254 		opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);
1255 
1256 		rcu_read_lock();
1257 		dev = skb->dev ? skb->dev : skb_rtable(skb)->dst.dev;
1258 		res = __ip_options_compile(dev_net(dev), &opt, skb, NULL);
1259 		rcu_read_unlock();
1260 
1261 		if (res)
1262 			return;
1263 	}
1264 	__icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
1265 }
1266 
1267 static void ipv4_link_failure(struct sk_buff *skb)
1268 {
1269 	struct rtable *rt;
1270 
1271 	ipv4_send_dest_unreach(skb);
1272 
1273 	rt = skb_rtable(skb);
1274 	if (rt)
1275 		dst_set_expires(&rt->dst, 0);
1276 }
1277 
1278 static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1279 {
1280 	pr_debug("%s: %pI4 -> %pI4, %s\n",
1281 		 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1282 		 skb->dev ? skb->dev->name : "?");
1283 	kfree_skb(skb);
1284 	WARN_ON(1);
1285 	return 0;
1286 }
1287 
1288 /*
1289    We do not cache the source address of the outgoing interface,
1290    because it is used only by IP RR, TS and SRR options,
1291    so it stays out of the fast path.
1292 
1293    BTW remember: "addr" is allowed to be unaligned
1294    in IP options!
1295  */
1296 
1297 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1298 {
1299 	__be32 src;
1300 
1301 	if (rt_is_output_route(rt))
1302 		src = ip_hdr(skb)->saddr;
1303 	else {
1304 		struct fib_result res;
1305 		struct iphdr *iph = ip_hdr(skb);
1306 		struct flowi4 fl4 = {
1307 			.daddr = iph->daddr,
1308 			.saddr = iph->saddr,
1309 			.flowi4_tos = RT_TOS(iph->tos),
1310 			.flowi4_oif = rt->dst.dev->ifindex,
1311 			.flowi4_iif = skb->dev->ifindex,
1312 			.flowi4_mark = skb->mark,
1313 		};
1314 
1315 		rcu_read_lock();
1316 		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1317 			src = fib_result_prefsrc(dev_net(rt->dst.dev), &res);
1318 		else
1319 			src = inet_select_addr(rt->dst.dev,
1320 					       rt_nexthop(rt, iph->daddr),
1321 					       RT_SCOPE_UNIVERSE);
1322 		rcu_read_unlock();
1323 	}
1324 	memcpy(addr, &src, 4);
1325 }
1326 
1327 #ifdef CONFIG_IP_ROUTE_CLASSID
1328 static void set_class_tag(struct rtable *rt, u32 tag)
1329 {
1330 	if (!(rt->dst.tclassid & 0xFFFF))
1331 		rt->dst.tclassid |= tag & 0xFFFF;
1332 	if (!(rt->dst.tclassid & 0xFFFF0000))
1333 		rt->dst.tclassid |= tag & 0xFFFF0000;
1334 }
1335 #endif
1336 
1337 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1338 {
1339 	unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
1340 	unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
1341 				    ip_rt_min_advmss);
1342 
1343 	return min(advmss, IPV4_MAX_PMTU - header_size);
1344 }
1345 
1346 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1347 {
1348 	const struct rtable *rt = (const struct rtable *)dst;
1349 	unsigned int mtu = rt->rt_pmtu;
1350 
1351 	if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1352 		mtu = dst_metric_raw(dst, RTAX_MTU);
1353 
1354 	if (mtu)
1355 		goto out;
1356 
1357 	mtu = READ_ONCE(dst->dev->mtu);
1358 
1359 	if (unlikely(ip_mtu_locked(dst))) {
1360 		if (rt->rt_uses_gateway && mtu > 576)
1361 			mtu = 576;
1362 	}
1363 
1364 out:
1365 	mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
1366 
1367 	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1368 }
1369 
1370 static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr)
1371 {
1372 	struct fnhe_hash_bucket *hash;
1373 	struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1374 	u32 hval = fnhe_hashfun(daddr);
1375 
1376 	spin_lock_bh(&fnhe_lock);
1377 
1378 	hash = rcu_dereference_protected(nhc->nhc_exceptions,
1379 					 lockdep_is_held(&fnhe_lock));
1380 	hash += hval;
1381 
1382 	fnhe_p = &hash->chain;
1383 	fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1384 	while (fnhe) {
1385 		if (fnhe->fnhe_daddr == daddr) {
1386 			rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1387 				fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1388 			/* set fnhe_daddr to 0 to ensure it won't bind with
1389 			 * new dsts in rt_bind_exception().
1390 			 */
1391 			fnhe->fnhe_daddr = 0;
1392 			fnhe_flush_routes(fnhe);
1393 			kfree_rcu(fnhe, rcu);
1394 			break;
1395 		}
1396 		fnhe_p = &fnhe->fnhe_next;
1397 		fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1398 						 lockdep_is_held(&fnhe_lock));
1399 	}
1400 
1401 	spin_unlock_bh(&fnhe_lock);
1402 }
1403 
1404 static struct fib_nh_exception *find_exception(struct fib_nh_common *nhc,
1405 					       __be32 daddr)
1406 {
1407 	struct fnhe_hash_bucket *hash = rcu_dereference(nhc->nhc_exceptions);
1408 	struct fib_nh_exception *fnhe;
1409 	u32 hval;
1410 
1411 	if (!hash)
1412 		return NULL;
1413 
1414 	hval = fnhe_hashfun(daddr);
1415 
1416 	for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1417 	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
1418 		if (fnhe->fnhe_daddr == daddr) {
1419 			if (fnhe->fnhe_expires &&
1420 			    time_after(jiffies, fnhe->fnhe_expires)) {
1421 				ip_del_fnhe(nhc, daddr);
1422 				break;
1423 			}
1424 			return fnhe;
1425 		}
1426 	}
1427 	return NULL;
1428 }
1429 
1430 /* MTU selection:
1431  * 1. mtu on route is locked - use it
1432  * 2. mtu from nexthop exception
1433  * 3. mtu from egress device
1434  */
1435 
1436 u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
1437 {
1438 	struct fib_nh_common *nhc = res->nhc;
1439 	struct net_device *dev = nhc->nhc_dev;
1440 	struct fib_info *fi = res->fi;
1441 	u32 mtu = 0;
1442 
1443 	if (READ_ONCE(dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu) ||
1444 	    fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
1445 		mtu = fi->fib_mtu;
1446 
1447 	if (likely(!mtu)) {
1448 		struct fib_nh_exception *fnhe;
1449 
1450 		fnhe = find_exception(nhc, daddr);
1451 		if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
1452 			mtu = fnhe->fnhe_pmtu;
1453 	}
1454 
1455 	if (likely(!mtu))
1456 		mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);
1457 
1458 	return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu);
1459 }
1460 
1461 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1462 			      __be32 daddr, const bool do_cache)
1463 {
1464 	bool ret = false;
1465 
1466 	spin_lock_bh(&fnhe_lock);
1467 
1468 	if (daddr == fnhe->fnhe_daddr) {
1469 		struct rtable __rcu **porig;
1470 		struct rtable *orig;
1471 		int genid = fnhe_genid(dev_net(rt->dst.dev));
1472 
1473 		if (rt_is_input_route(rt))
1474 			porig = &fnhe->fnhe_rth_input;
1475 		else
1476 			porig = &fnhe->fnhe_rth_output;
1477 		orig = rcu_dereference(*porig);
1478 
1479 		if (fnhe->fnhe_genid != genid) {
1480 			fnhe->fnhe_genid = genid;
1481 			fnhe->fnhe_gw = 0;
1482 			fnhe->fnhe_pmtu = 0;
1483 			fnhe->fnhe_expires = 0;
1484 			fnhe->fnhe_mtu_locked = false;
1485 			fnhe_flush_routes(fnhe);
1486 			orig = NULL;
1487 		}
1488 		fill_route_from_fnhe(rt, fnhe);
1489 		if (!rt->rt_gw4) {
1490 			rt->rt_gw4 = daddr;
1491 			rt->rt_gw_family = AF_INET;
1492 		}
1493 
1494 		if (do_cache) {
1495 			dst_hold(&rt->dst);
1496 			rcu_assign_pointer(*porig, rt);
1497 			if (orig) {
1498 				dst_dev_put(&orig->dst);
1499 				dst_release(&orig->dst);
1500 			}
1501 			ret = true;
1502 		}
1503 
1504 		fnhe->fnhe_stamp = jiffies;
1505 	}
1506 	spin_unlock_bh(&fnhe_lock);
1507 
1508 	return ret;
1509 }
1510 
1511 static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt)
1512 {
1513 	struct rtable *orig, *prev, **p;
1514 	bool ret = true;
1515 
1516 	if (rt_is_input_route(rt)) {
1517 		p = (struct rtable **)&nhc->nhc_rth_input;
1518 	} else {
1519 		p = (struct rtable **)raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
1520 	}
1521 	orig = *p;
1522 
1523 	/* hold dst before doing cmpxchg() to avoid race condition
1524 	 * on this dst
1525 	 */
1526 	dst_hold(&rt->dst);
1527 	prev = cmpxchg(p, orig, rt);
1528 	if (prev == orig) {
1529 		if (orig) {
1530 			rt_add_uncached_list(orig);
1531 			dst_release(&orig->dst);
1532 		}
1533 	} else {
1534 		dst_release(&rt->dst);
1535 		ret = false;
1536 	}
1537 
1538 	return ret;
1539 }
1540 
1541 struct uncached_list {
1542 	spinlock_t		lock;
1543 	struct list_head	head;
1544 };
1545 
1546 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1547 
1548 void rt_add_uncached_list(struct rtable *rt)
1549 {
1550 	struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1551 
1552 	rt->rt_uncached_list = ul;
1553 
1554 	spin_lock_bh(&ul->lock);
1555 	list_add_tail(&rt->rt_uncached, &ul->head);
1556 	spin_unlock_bh(&ul->lock);
1557 }
1558 
1559 void rt_del_uncached_list(struct rtable *rt)
1560 {
1561 	if (!list_empty(&rt->rt_uncached)) {
1562 		struct uncached_list *ul = rt->rt_uncached_list;
1563 
1564 		spin_lock_bh(&ul->lock);
1565 		list_del(&rt->rt_uncached);
1566 		spin_unlock_bh(&ul->lock);
1567 	}
1568 }
1569 
1570 static void ipv4_dst_destroy(struct dst_entry *dst)
1571 {
1572 	struct rtable *rt = (struct rtable *)dst;
1573 
1574 	ip_dst_metrics_put(dst);
1575 	rt_del_uncached_list(rt);
1576 }
1577 
1578 void rt_flush_dev(struct net_device *dev)
1579 {
1580 	struct rtable *rt;
1581 	int cpu;
1582 
1583 	for_each_possible_cpu(cpu) {
1584 		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1585 
1586 		spin_lock_bh(&ul->lock);
1587 		list_for_each_entry(rt, &ul->head, rt_uncached) {
1588 			if (rt->dst.dev != dev)
1589 				continue;
1590 			rt->dst.dev = blackhole_netdev;
1591 			dev_hold(rt->dst.dev);
1592 			dev_put(dev);
1593 		}
1594 		spin_unlock_bh(&ul->lock);
1595 	}
1596 }
1597 
1598 static bool rt_cache_valid(const struct rtable *rt)
1599 {
1600 	return	rt &&
1601 		rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1602 		!rt_is_expired(rt);
1603 }
1604 
1605 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1606 			   const struct fib_result *res,
1607 			   struct fib_nh_exception *fnhe,
1608 			   struct fib_info *fi, u16 type, u32 itag,
1609 			   const bool do_cache)
1610 {
1611 	bool cached = false;
1612 
1613 	if (fi) {
1614 		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1615 
1616 		if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) {
1617 			rt->rt_uses_gateway = 1;
1618 			rt->rt_gw_family = nhc->nhc_gw_family;
1619 			/* only INET and INET6 are supported */
1620 			if (likely(nhc->nhc_gw_family == AF_INET))
1621 				rt->rt_gw4 = nhc->nhc_gw.ipv4;
1622 			else
1623 				rt->rt_gw6 = nhc->nhc_gw.ipv6;
1624 		}
1625 
1626 		ip_dst_init_metrics(&rt->dst, fi->fib_metrics);
1627 
1628 #ifdef CONFIG_IP_ROUTE_CLASSID
1629 		if (nhc->nhc_family == AF_INET) {
1630 			struct fib_nh *nh;
1631 
1632 			nh = container_of(nhc, struct fib_nh, nh_common);
1633 			rt->dst.tclassid = nh->nh_tclassid;
1634 		}
1635 #endif
1636 		rt->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
1637 		if (unlikely(fnhe))
1638 			cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1639 		else if (do_cache)
1640 			cached = rt_cache_route(nhc, rt);
1641 		if (unlikely(!cached)) {
1642 			/* Routes we intend to cache in nexthop exception or
1643 			 * FIB nexthop have the DST_NOCACHE bit clear.
1644 			 * However, if we are unsuccessful at storing this
1645 			 * route into the cache we really need to set it.
1646 			 */
1647 			if (!rt->rt_gw4) {
1648 				rt->rt_gw_family = AF_INET;
1649 				rt->rt_gw4 = daddr;
1650 			}
1651 			rt_add_uncached_list(rt);
1652 		}
1653 	} else
1654 		rt_add_uncached_list(rt);
1655 
1656 #ifdef CONFIG_IP_ROUTE_CLASSID
1657 #ifdef CONFIG_IP_MULTIPLE_TABLES
1658 	set_class_tag(rt, res->tclassid);
1659 #endif
1660 	set_class_tag(rt, itag);
1661 #endif
1662 }
1663 
1664 struct rtable *rt_dst_alloc(struct net_device *dev,
1665 			    unsigned int flags, u16 type,
1666 			    bool nopolicy, bool noxfrm)
1667 {
1668 	struct rtable *rt;
1669 
1670 	rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1671 		       (nopolicy ? DST_NOPOLICY : 0) |
1672 		       (noxfrm ? DST_NOXFRM : 0));
1673 
1674 	if (rt) {
1675 		rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1676 		rt->rt_flags = flags;
1677 		rt->rt_type = type;
1678 		rt->rt_is_input = 0;
1679 		rt->rt_iif = 0;
1680 		rt->rt_pmtu = 0;
1681 		rt->rt_mtu_locked = 0;
1682 		rt->rt_uses_gateway = 0;
1683 		rt->rt_gw_family = 0;
1684 		rt->rt_gw4 = 0;
1685 		INIT_LIST_HEAD(&rt->rt_uncached);
1686 
1687 		rt->dst.output = ip_output;
1688 		if (flags & RTCF_LOCAL)
1689 			rt->dst.input = ip_local_deliver;
1690 	}
1691 
1692 	return rt;
1693 }
1694 EXPORT_SYMBOL(rt_dst_alloc);
1695 
1696 struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt)
1697 {
1698 	struct rtable *new_rt;
1699 
1700 	new_rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1701 			   rt->dst.flags);
1702 
1703 	if (new_rt) {
1704 		new_rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1705 		new_rt->rt_flags = rt->rt_flags;
1706 		new_rt->rt_type = rt->rt_type;
1707 		new_rt->rt_is_input = rt->rt_is_input;
1708 		new_rt->rt_iif = rt->rt_iif;
1709 		new_rt->rt_pmtu = rt->rt_pmtu;
1710 		new_rt->rt_mtu_locked = rt->rt_mtu_locked;
1711 		new_rt->rt_gw_family = rt->rt_gw_family;
1712 		if (rt->rt_gw_family == AF_INET)
1713 			new_rt->rt_gw4 = rt->rt_gw4;
1714 		else if (rt->rt_gw_family == AF_INET6)
1715 			new_rt->rt_gw6 = rt->rt_gw6;
1716 		INIT_LIST_HEAD(&new_rt->rt_uncached);
1717 
1718 		new_rt->dst.input = rt->dst.input;
1719 		new_rt->dst.output = rt->dst.output;
1720 		new_rt->dst.error = rt->dst.error;
1721 		new_rt->dst.lastuse = jiffies;
1722 		new_rt->dst.lwtstate = lwtstate_get(rt->dst.lwtstate);
1723 	}
1724 	return new_rt;
1725 }
1726 EXPORT_SYMBOL(rt_dst_clone);
1727 
1728 /* called in rcu_read_lock() section */
1729 int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1730 			  u8 tos, struct net_device *dev,
1731 			  struct in_device *in_dev, u32 *itag)
1732 {
1733 	int err;
1734 
1735 	/* Primary sanity checks. */
1736 	if (!in_dev)
1737 		return -EINVAL;
1738 
1739 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1740 	    skb->protocol != htons(ETH_P_IP))
1741 		return -EINVAL;
1742 
1743 	if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1744 		return -EINVAL;
1745 
1746 	if (ipv4_is_zeronet(saddr)) {
1747 		if (!ipv4_is_local_multicast(daddr) &&
1748 		    ip_hdr(skb)->protocol != IPPROTO_IGMP)
1749 			return -EINVAL;
1750 	} else {
1751 		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1752 					  in_dev, itag);
1753 		if (err < 0)
1754 			return err;
1755 	}
1756 	return 0;
1757 }
1758 
1759 /* called in rcu_read_lock() section */
1760 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1761 			     u8 tos, struct net_device *dev, int our)
1762 {
1763 	struct in_device *in_dev = __in_dev_get_rcu(dev);
1764 	unsigned int flags = RTCF_MULTICAST;
1765 	struct rtable *rth;
1766 	bool no_policy;
1767 	u32 itag = 0;
1768 	int err;
1769 
1770 	err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1771 	if (err)
1772 		return err;
1773 
1774 	if (our)
1775 		flags |= RTCF_LOCAL;
1776 
1777 	no_policy = IN_DEV_ORCONF(in_dev, NOPOLICY);
1778 	if (no_policy)
1779 		IPCB(skb)->flags |= IPSKB_NOPOLICY;
1780 
1781 	rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1782 			   no_policy, false);
1783 	if (!rth)
1784 		return -ENOBUFS;
1785 
1786 #ifdef CONFIG_IP_ROUTE_CLASSID
1787 	rth->dst.tclassid = itag;
1788 #endif
1789 	rth->dst.output = ip_rt_bug;
1790 	rth->rt_is_input = 1;
1791 
1792 #ifdef CONFIG_IP_MROUTE
1793 	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1794 		rth->dst.input = ip_mr_input;
1795 #endif
1796 	RT_CACHE_STAT_INC(in_slow_mc);
1797 
1798 	skb_dst_drop(skb);
1799 	skb_dst_set(skb, &rth->dst);
1800 	return 0;
1801 }
1802 
1803 
1804 static void ip_handle_martian_source(struct net_device *dev,
1805 				     struct in_device *in_dev,
1806 				     struct sk_buff *skb,
1807 				     __be32 daddr,
1808 				     __be32 saddr)
1809 {
1810 	RT_CACHE_STAT_INC(in_martian_src);
1811 #ifdef CONFIG_IP_ROUTE_VERBOSE
1812 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1813 		/*
1814 		 *	RFC1812 recommendation: if the source is martian,
1815 		 *	the only hint is the MAC header.
1816 		 */
1817 		pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1818 			&daddr, &saddr, dev->name);
1819 		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1820 			print_hex_dump(KERN_WARNING, "ll header: ",
1821 				       DUMP_PREFIX_OFFSET, 16, 1,
1822 				       skb_mac_header(skb),
1823 				       dev->hard_header_len, false);
1824 		}
1825 	}
1826 #endif
1827 }
1828 
1829 /* called in rcu_read_lock() section */
1830 static int __mkroute_input(struct sk_buff *skb,
1831 			   const struct fib_result *res,
1832 			   struct in_device *in_dev,
1833 			   __be32 daddr, __be32 saddr, u32 tos)
1834 {
1835 	struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1836 	struct net_device *dev = nhc->nhc_dev;
1837 	struct fib_nh_exception *fnhe;
1838 	struct rtable *rth;
1839 	int err;
1840 	struct in_device *out_dev;
1841 	bool do_cache, no_policy;
1842 	u32 itag = 0;
1843 
1844 	/* get a working reference to the output device */
1845 	out_dev = __in_dev_get_rcu(dev);
1846 	if (!out_dev) {
1847 		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1848 		return -EINVAL;
1849 	}
1850 
1851 	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1852 				  in_dev->dev, in_dev, &itag);
1853 	if (err < 0) {
1854 		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1855 					 saddr);
1856 
1857 		goto cleanup;
1858 	}
1859 
1860 	do_cache = res->fi && !itag;
1861 	if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1862 	    skb->protocol == htons(ETH_P_IP)) {
1863 		__be32 gw;
1864 
1865 		gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0;
1866 		if (IN_DEV_SHARED_MEDIA(out_dev) ||
1867 		    inet_addr_onlink(out_dev, saddr, gw))
1868 			IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1869 	}
1870 
1871 	if (skb->protocol != htons(ETH_P_IP)) {
1872 		/* Not IP (i.e. ARP). Do not create a route if it is
1873 		 * invalid for proxy ARP. DNAT routes are always valid.
1874 		 *
1875 		 * The proxy ARP feature has been extended to allow ARP
1876 		 * replies back to the same interface, to support
1877 		 * private VLAN switch technologies. See arp.c.
1878 		 */
1879 		if (out_dev == in_dev &&
1880 		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1881 			err = -EINVAL;
1882 			goto cleanup;
1883 		}
1884 	}
1885 
1886 	no_policy = IN_DEV_ORCONF(in_dev, NOPOLICY);
1887 	if (no_policy)
1888 		IPCB(skb)->flags |= IPSKB_NOPOLICY;
1889 
1890 	fnhe = find_exception(nhc, daddr);
1891 	if (do_cache) {
1892 		if (fnhe)
1893 			rth = rcu_dereference(fnhe->fnhe_rth_input);
1894 		else
1895 			rth = rcu_dereference(nhc->nhc_rth_input);
1896 		if (rt_cache_valid(rth)) {
1897 			skb_dst_set_noref(skb, &rth->dst);
1898 			goto out;
1899 		}
1900 	}
1901 
1902 	rth = rt_dst_alloc(out_dev->dev, 0, res->type, no_policy,
1903 			   IN_DEV_ORCONF(out_dev, NOXFRM));
1904 	if (!rth) {
1905 		err = -ENOBUFS;
1906 		goto cleanup;
1907 	}
1908 
1909 	rth->rt_is_input = 1;
1910 	RT_CACHE_STAT_INC(in_slow_tot);
1911 
1912 	rth->dst.input = ip_forward;
1913 
1914 	rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1915 		       do_cache);
1916 	lwtunnel_set_redirect(&rth->dst);
1917 	skb_dst_set(skb, &rth->dst);
1918 out:
1919 	err = 0;
1920  cleanup:
1921 	return err;
1922 }
1923 
1924 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1925 /* To make ICMP packets follow the right flow, the multipath hash is
1926  * calculated from the inner IP addresses.
1927  */
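/* For example, an ICMP error elicited by a forwarded TCP segment is hashed on
 * the addresses of the embedded (inner) IP header, so it is steered to the
 * same nexthop as the flow that triggered it.
 */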
1928 static void ip_multipath_l3_keys(const struct sk_buff *skb,
1929 				 struct flow_keys *hash_keys)
1930 {
1931 	const struct iphdr *outer_iph = ip_hdr(skb);
1932 	const struct iphdr *key_iph = outer_iph;
1933 	const struct iphdr *inner_iph;
1934 	const struct icmphdr *icmph;
1935 	struct iphdr _inner_iph;
1936 	struct icmphdr _icmph;
1937 
1938 	if (likely(outer_iph->protocol != IPPROTO_ICMP))
1939 		goto out;
1940 
1941 	if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1942 		goto out;
1943 
1944 	icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1945 				   &_icmph);
1946 	if (!icmph)
1947 		goto out;
1948 
1949 	if (!icmp_is_err(icmph->type))
1950 		goto out;
1951 
1952 	inner_iph = skb_header_pointer(skb,
1953 				       outer_iph->ihl * 4 + sizeof(_icmph),
1954 				       sizeof(_inner_iph), &_inner_iph);
1955 	if (!inner_iph)
1956 		goto out;
1957 
1958 	key_iph = inner_iph;
1959 out:
1960 	hash_keys->addrs.v4addrs.src = key_iph->saddr;
1961 	hash_keys->addrs.v4addrs.dst = key_iph->daddr;
1962 }
1963 
1964 /* if skb is set it will be used and fl4 can be NULL */
1965 int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
1966 		       const struct sk_buff *skb, struct flow_keys *flkeys)
1967 {
1968 	u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0;
1969 	struct flow_keys hash_keys;
1970 	u32 mhash;
1971 
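	/* fib_multipath_hash_policy selects which fields are hashed below:
	 * 0 - layer 3 (source/destination addresses, or the inner addresses
	 *     of an ICMP error via ip_multipath_l3_keys()),
	 * 1 - layer 4 (the 5-tuple),
	 * 2 - layer 3, or the inner layer 3 of an encapsulated packet.
	 */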
1972 	switch (READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_policy)) {
1973 	case 0:
1974 		memset(&hash_keys, 0, sizeof(hash_keys));
1975 		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1976 		if (skb) {
1977 			ip_multipath_l3_keys(skb, &hash_keys);
1978 		} else {
1979 			hash_keys.addrs.v4addrs.src = fl4->saddr;
1980 			hash_keys.addrs.v4addrs.dst = fl4->daddr;
1981 		}
1982 		break;
1983 	case 1:
1984 		/* skb is currently provided only when forwarding */
1985 		if (skb) {
1986 			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1987 			struct flow_keys keys;
1988 
1989 			/* short-circuit if we already have L4 hash present */
1990 			if (skb->l4_hash)
1991 				return skb_get_hash_raw(skb) >> 1;
1992 
1993 			memset(&hash_keys, 0, sizeof(hash_keys));
1994 
1995 			if (!flkeys) {
1996 				skb_flow_dissect_flow_keys(skb, &keys, flag);
1997 				flkeys = &keys;
1998 			}
1999 
2000 			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2001 			hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
2002 			hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
2003 			hash_keys.ports.src = flkeys->ports.src;
2004 			hash_keys.ports.dst = flkeys->ports.dst;
2005 			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
2006 		} else {
2007 			memset(&hash_keys, 0, sizeof(hash_keys));
2008 			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2009 			hash_keys.addrs.v4addrs.src = fl4->saddr;
2010 			hash_keys.addrs.v4addrs.dst = fl4->daddr;
2011 			hash_keys.ports.src = fl4->fl4_sport;
2012 			hash_keys.ports.dst = fl4->fl4_dport;
2013 			hash_keys.basic.ip_proto = fl4->flowi4_proto;
2014 		}
2015 		break;
2016 	case 2:
2017 		memset(&hash_keys, 0, sizeof(hash_keys));
2018 		/* skb is currently provided only when forwarding */
2019 		if (skb) {
2020 			struct flow_keys keys;
2021 
2022 			skb_flow_dissect_flow_keys(skb, &keys, 0);
2023 			/* Inner can be v4 or v6 */
2024 			if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
2025 				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2026 				hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
2027 				hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
2028 			} else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
2029 				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2030 				hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
2031 				hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
2032 				hash_keys.tags.flow_label = keys.tags.flow_label;
2033 				hash_keys.basic.ip_proto = keys.basic.ip_proto;
2034 			} else {
2035 				/* Same as case 0 */
2036 				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2037 				ip_multipath_l3_keys(skb, &hash_keys);
2038 			}
2039 		} else {
2040 			/* Same as case 0 */
2041 			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2042 			hash_keys.addrs.v4addrs.src = fl4->saddr;
2043 			hash_keys.addrs.v4addrs.dst = fl4->daddr;
2044 		}
2045 		break;
2046 	}
2047 	mhash = flow_hash_from_keys(&hash_keys);
2048 
2049 	if (multipath_hash)
2050 		mhash = jhash_2words(mhash, multipath_hash, 0);
2051 
2052 	return mhash >> 1;
2053 }
2054 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
2055 
2056 static int ip_mkroute_input(struct sk_buff *skb,
2057 			    struct fib_result *res,
2058 			    struct in_device *in_dev,
2059 			    __be32 daddr, __be32 saddr, u32 tos,
2060 			    struct flow_keys *hkeys)
2061 {
2062 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2063 	if (res->fi && fib_info_num_path(res->fi) > 1) {
2064 		int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
2065 
2066 		fib_select_multipath(res, h);
2067 		IPCB(skb)->flags |= IPSKB_MULTIPATH;
2068 	}
2069 #endif
2070 
2071 	/* create a routing cache entry */
2072 	return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
2073 }
2074 
2075 /* Implements the same saddr-related checks as ip_route_input_slow(),
2076  * assuming daddr is valid and the destination is not a local broadcast one.
2077  * Uses the provided hint instead of performing a route lookup.
2078  */
2079 int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2080 		      u8 tos, struct net_device *dev,
2081 		      const struct sk_buff *hint)
2082 {
2083 	struct in_device *in_dev = __in_dev_get_rcu(dev);
2084 	struct rtable *rt = skb_rtable(hint);
2085 	struct net *net = dev_net(dev);
2086 	int err = -EINVAL;
2087 	u32 tag = 0;
2088 
2089 	if (!in_dev)
2090 		return -EINVAL;
2091 
2092 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2093 		goto martian_source;
2094 
2095 	if (ipv4_is_zeronet(saddr))
2096 		goto martian_source;
2097 
2098 	if (ipv4_is_loopback(saddr) && !IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2099 		goto martian_source;
2100 
2101 	if (rt->rt_type != RTN_LOCAL)
2102 		goto skip_validate_source;
2103 
2104 	tos &= IPTOS_RT_MASK;
2105 	err = fib_validate_source(skb, saddr, daddr, tos, 0, dev, in_dev, &tag);
2106 	if (err < 0)
2107 		goto martian_source;
2108 
2109 skip_validate_source:
2110 	skb_dst_copy(skb, hint);
2111 	return 0;
2112 
2113 martian_source:
2114 	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2115 	return err;
2116 }
2117 
2118 /* get device for dst_alloc with local routes */
2119 static struct net_device *ip_rt_get_dev(struct net *net,
2120 					const struct fib_result *res)
2121 {
2122 	struct fib_nh_common *nhc = res->fi ? res->nhc : NULL;
2123 	struct net_device *dev = NULL;
2124 
2125 	if (nhc)
2126 		dev = l3mdev_master_dev_rcu(nhc->nhc_dev);
2127 
2128 	return dev ? : net->loopback_dev;
2129 }
2130 
2131 /*
2132  *	NOTE. We drop all packets that have local source
2133  *	addresses, because every properly looped back packet
2134  *	must have the correct destination already attached by the output routine.
2135  *	Changes in the enforced policies must also be applied to
2136  *	ip_route_use_hint().
2137  *
2138  *	Such an approach solves two big problems:
2139  *	1. Non-simplex devices are handled properly.
2140  *	2. IP spoofing attempts are filtered with a 100% guarantee.
2141  *	called with rcu_read_lock()
2142  */
2143 
2144 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2145 			       u8 tos, struct net_device *dev,
2146 			       struct fib_result *res)
2147 {
2148 	struct in_device *in_dev = __in_dev_get_rcu(dev);
2149 	struct flow_keys *flkeys = NULL, _flkeys;
2150 	struct net    *net = dev_net(dev);
2151 	struct ip_tunnel_info *tun_info;
2152 	int		err = -EINVAL;
2153 	unsigned int	flags = 0;
2154 	u32		itag = 0;
2155 	struct rtable	*rth;
2156 	struct flowi4	fl4;
2157 	bool do_cache = true;
2158 	bool no_policy;
2159 
2160 	/* IP on this device is disabled. */
2161 
2162 	if (!in_dev)
2163 		goto out;
2164 
2165 	/* Check for the most weird martians, which can be not detected
2166 	   by fib_lookup.
2167 	 */
2168 
2169 	tun_info = skb_tunnel_info(skb);
2170 	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2171 		fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
2172 	else
2173 		fl4.flowi4_tun_key.tun_id = 0;
2174 	skb_dst_drop(skb);
2175 
2176 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2177 		goto martian_source;
2178 
2179 	res->fi = NULL;
2180 	res->table = NULL;
2181 	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2182 		goto brd_input;
2183 
2184 	/* Accept zero addresses only for limited broadcast;
2185 	 * I do not even know whether to fix this or not. Waiting for complaints :-)
2186 	 */
2187 	if (ipv4_is_zeronet(saddr))
2188 		goto martian_source;
2189 
2190 	if (ipv4_is_zeronet(daddr))
2191 		goto martian_destination;
2192 
2193 	/* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET()
2194 	 * more than once, calling it only if daddr and/or saddr are loopback addresses.
2195 	 */
2196 	if (ipv4_is_loopback(daddr)) {
2197 		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2198 			goto martian_destination;
2199 	} else if (ipv4_is_loopback(saddr)) {
2200 		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2201 			goto martian_source;
2202 	}
2203 
2204 	/*
2205 	 *	Now we are ready to route packet.
2206 	 */
2207 	fl4.flowi4_oif = 0;
2208 	fl4.flowi4_iif = dev->ifindex;
2209 	fl4.flowi4_mark = skb->mark;
2210 	fl4.flowi4_tos = tos;
2211 	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2212 	fl4.flowi4_flags = 0;
2213 	fl4.daddr = daddr;
2214 	fl4.saddr = saddr;
2215 	fl4.flowi4_uid = sock_net_uid(net, NULL);
2216 	fl4.flowi4_multipath_hash = 0;
2217 
2218 	if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
2219 		flkeys = &_flkeys;
2220 	} else {
2221 		fl4.flowi4_proto = 0;
2222 		fl4.fl4_sport = 0;
2223 		fl4.fl4_dport = 0;
2224 	}
2225 
2226 	err = fib_lookup(net, &fl4, res, 0);
2227 	if (err != 0) {
2228 		if (!IN_DEV_FORWARD(in_dev))
2229 			err = -EHOSTUNREACH;
2230 		goto no_route;
2231 	}
2232 
2233 	if (res->type == RTN_BROADCAST) {
2234 		if (IN_DEV_BFORWARD(in_dev))
2235 			goto make_route;
2236 		/* do not cache if bc_forwarding is enabled */
2237 		if (IPV4_DEVCONF_ALL(net, BC_FORWARDING))
2238 			do_cache = false;
2239 		goto brd_input;
2240 	}
2241 
2242 	if (res->type == RTN_LOCAL) {
2243 		err = fib_validate_source(skb, saddr, daddr, tos,
2244 					  0, dev, in_dev, &itag);
2245 		if (err < 0)
2246 			goto martian_source;
2247 		goto local_input;
2248 	}
2249 
2250 	if (!IN_DEV_FORWARD(in_dev)) {
2251 		err = -EHOSTUNREACH;
2252 		goto no_route;
2253 	}
2254 	if (res->type != RTN_UNICAST)
2255 		goto martian_destination;
2256 
2257 make_route:
2258 	err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
2259 out:	return err;
2260 
2261 brd_input:
2262 	if (skb->protocol != htons(ETH_P_IP))
2263 		goto e_inval;
2264 
2265 	if (!ipv4_is_zeronet(saddr)) {
2266 		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2267 					  in_dev, &itag);
2268 		if (err < 0)
2269 			goto martian_source;
2270 	}
2271 	flags |= RTCF_BROADCAST;
2272 	res->type = RTN_BROADCAST;
2273 	RT_CACHE_STAT_INC(in_brd);
2274 
2275 local_input:
2276 	no_policy = IN_DEV_ORCONF(in_dev, NOPOLICY);
2277 	if (no_policy)
2278 		IPCB(skb)->flags |= IPSKB_NOPOLICY;
2279 
2280 	do_cache &= res->fi && !itag;
2281 	if (do_cache) {
2282 		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2283 
2284 		rth = rcu_dereference(nhc->nhc_rth_input);
2285 		if (rt_cache_valid(rth)) {
2286 			skb_dst_set_noref(skb, &rth->dst);
2287 			err = 0;
2288 			goto out;
2289 		}
2290 	}
2291 
2292 	rth = rt_dst_alloc(ip_rt_get_dev(net, res),
2293 			   flags | RTCF_LOCAL, res->type,
2294 			   no_policy, false);
2295 	if (!rth)
2296 		goto e_nobufs;
2297 
2298 	rth->dst.output = ip_rt_bug;
2299 #ifdef CONFIG_IP_ROUTE_CLASSID
2300 	rth->dst.tclassid = itag;
2301 #endif
2302 	rth->rt_is_input = 1;
2303 
2304 	RT_CACHE_STAT_INC(in_slow_tot);
2305 	if (res->type == RTN_UNREACHABLE) {
2306 		rth->dst.input = ip_error;
2307 		rth->dst.error = -err;
2308 		rth->rt_flags &= ~RTCF_LOCAL;
2309 	}
2310 
2311 	if (do_cache) {
2312 		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2313 
2314 		rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
2315 		if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2316 			WARN_ON(rth->dst.input == lwtunnel_input);
2317 			rth->dst.lwtstate->orig_input = rth->dst.input;
2318 			rth->dst.input = lwtunnel_input;
2319 		}
2320 
2321 		if (unlikely(!rt_cache_route(nhc, rth)))
2322 			rt_add_uncached_list(rth);
2323 	}
2324 	skb_dst_set(skb, &rth->dst);
2325 	err = 0;
2326 	goto out;
2327 
2328 no_route:
2329 	RT_CACHE_STAT_INC(in_no_route);
2330 	res->type = RTN_UNREACHABLE;
2331 	res->fi = NULL;
2332 	res->table = NULL;
2333 	goto local_input;
2334 
2335 	/*
2336 	 *	Do not cache martian addresses: they should be logged (RFC1812)
2337 	 */
2338 martian_destination:
2339 	RT_CACHE_STAT_INC(in_martian_dst);
2340 #ifdef CONFIG_IP_ROUTE_VERBOSE
2341 	if (IN_DEV_LOG_MARTIANS(in_dev))
2342 		net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2343 				     &daddr, &saddr, dev->name);
2344 #endif
2345 
2346 e_inval:
2347 	err = -EINVAL;
2348 	goto out;
2349 
2350 e_nobufs:
2351 	err = -ENOBUFS;
2352 	goto out;
2353 
2354 martian_source:
2355 	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2356 	goto out;
2357 }
2358 
2359 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2360 			 u8 tos, struct net_device *dev)
2361 {
2362 	struct fib_result res;
2363 	int err;
2364 
2365 	tos &= IPTOS_RT_MASK;
2366 	rcu_read_lock();
2367 	err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2368 	rcu_read_unlock();
2369 
2370 	return err;
2371 }
2372 EXPORT_SYMBOL(ip_route_input_noref);
2373 
2374 /* called with rcu_read_lock held */
2375 int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2376 		       u8 tos, struct net_device *dev, struct fib_result *res)
2377 {
2378 	/* Multicast recognition logic was moved from the route cache to here.
2379 	   The problem was that too many Ethernet cards have broken/missing
2380 	   hardware multicast filters :-( As a result, a host on a multicast
2381 	   network acquires a lot of useless route cache entries, a sort of
2382 	   SDR messages from all over the world. Now we try to get rid of them.
2383 	   Really, provided the software IP multicast filter is organized
2384 	   reasonably (at least, hashed), it does not result in a slowdown
2385 	   compared with route cache reject entries.
2386 	   Note that multicast routers are not affected, because a
2387 	   route cache entry is created eventually.
2388 	 */
2389 	if (ipv4_is_multicast(daddr)) {
2390 		struct in_device *in_dev = __in_dev_get_rcu(dev);
2391 		int our = 0;
2392 		int err = -EINVAL;
2393 
2394 		if (!in_dev)
2395 			return err;
2396 		our = ip_check_mc_rcu(in_dev, daddr, saddr,
2397 				      ip_hdr(skb)->protocol);
2398 
2399 		/* check l3 master if no match yet */
2400 		if (!our && netif_is_l3_slave(dev)) {
2401 			struct in_device *l3_in_dev;
2402 
2403 			l3_in_dev = __in_dev_get_rcu(skb->dev);
2404 			if (l3_in_dev)
2405 				our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2406 						      ip_hdr(skb)->protocol);
2407 		}
2408 
2409 		if (our
2410 #ifdef CONFIG_IP_MROUTE
2411 			||
2412 		    (!ipv4_is_local_multicast(daddr) &&
2413 		     IN_DEV_MFORWARD(in_dev))
2414 #endif
2415 		   ) {
2416 			err = ip_route_input_mc(skb, daddr, saddr,
2417 						tos, dev, our);
2418 		}
2419 		return err;
2420 	}
2421 
2422 	return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2423 }
2424 
2425 /* called with rcu_read_lock() */
2426 static struct rtable *__mkroute_output(const struct fib_result *res,
2427 				       const struct flowi4 *fl4, int orig_oif,
2428 				       struct net_device *dev_out,
2429 				       unsigned int flags)
2430 {
2431 	struct fib_info *fi = res->fi;
2432 	struct fib_nh_exception *fnhe;
2433 	struct in_device *in_dev;
2434 	u16 type = res->type;
2435 	struct rtable *rth;
2436 	bool do_cache;
2437 
2438 	in_dev = __in_dev_get_rcu(dev_out);
2439 	if (!in_dev)
2440 		return ERR_PTR(-EINVAL);
2441 
2442 	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2443 		if (ipv4_is_loopback(fl4->saddr) &&
2444 		    !(dev_out->flags & IFF_LOOPBACK) &&
2445 		    !netif_is_l3_master(dev_out))
2446 			return ERR_PTR(-EINVAL);
2447 
2448 	if (ipv4_is_lbcast(fl4->daddr))
2449 		type = RTN_BROADCAST;
2450 	else if (ipv4_is_multicast(fl4->daddr))
2451 		type = RTN_MULTICAST;
2452 	else if (ipv4_is_zeronet(fl4->daddr))
2453 		return ERR_PTR(-EINVAL);
2454 
2455 	if (dev_out->flags & IFF_LOOPBACK)
2456 		flags |= RTCF_LOCAL;
2457 
2458 	do_cache = true;
2459 	if (type == RTN_BROADCAST) {
2460 		flags |= RTCF_BROADCAST | RTCF_LOCAL;
2461 		fi = NULL;
2462 	} else if (type == RTN_MULTICAST) {
2463 		flags |= RTCF_MULTICAST | RTCF_LOCAL;
2464 		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2465 				     fl4->flowi4_proto))
2466 			flags &= ~RTCF_LOCAL;
2467 		else
2468 			do_cache = false;
2469 		/* If a multicast route does not exist, use
2470 		 * the default one, but do not use a gateway in this case.
2471 		 * Yes, it is a hack.
2472 		 */
2473 		if (fi && res->prefixlen < 4)
2474 			fi = NULL;
2475 	} else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2476 		   (orig_oif != dev_out->ifindex)) {
2477 		/* For local routes that require a particular output interface
2478 		 * we do not want to cache the result.  Caching the result
2479 		 * causes incorrect behaviour when there are multiple source
2480 		 * addresses on the interface: the end result is that if the
2481 		 * intended recipient is waiting on that interface for the
2482 		 * packet, it won't be received because it will be delivered on
2483 		 * the loopback interface and the IP_PKTINFO ipi_ifindex will
2484 		 * be set to the loopback interface as well.
2485 		 */
2486 		do_cache = false;
2487 	}
2488 
2489 	fnhe = NULL;
2490 	do_cache &= fi != NULL;
2491 	if (fi) {
2492 		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2493 		struct rtable __rcu **prth;
2494 
2495 		fnhe = find_exception(nhc, fl4->daddr);
2496 		if (!do_cache)
2497 			goto add;
2498 		if (fnhe) {
2499 			prth = &fnhe->fnhe_rth_output;
2500 		} else {
2501 			if (unlikely(fl4->flowi4_flags &
2502 				     FLOWI_FLAG_KNOWN_NH &&
2503 				     !(nhc->nhc_gw_family &&
2504 				       nhc->nhc_scope == RT_SCOPE_LINK))) {
2505 				do_cache = false;
2506 				goto add;
2507 			}
2508 			prth = raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
2509 		}
2510 		rth = rcu_dereference(*prth);
2511 		if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2512 			return rth;
2513 	}
2514 
2515 add:
2516 	rth = rt_dst_alloc(dev_out, flags, type,
2517 			   IN_DEV_ORCONF(in_dev, NOPOLICY),
2518 			   IN_DEV_ORCONF(in_dev, NOXFRM));
2519 	if (!rth)
2520 		return ERR_PTR(-ENOBUFS);
2521 
2522 	rth->rt_iif = orig_oif;
2523 
2524 	RT_CACHE_STAT_INC(out_slow_tot);
2525 
2526 	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2527 		if (flags & RTCF_LOCAL &&
2528 		    !(dev_out->flags & IFF_LOOPBACK)) {
2529 			rth->dst.output = ip_mc_output;
2530 			RT_CACHE_STAT_INC(out_slow_mc);
2531 		}
2532 #ifdef CONFIG_IP_MROUTE
2533 		if (type == RTN_MULTICAST) {
2534 			if (IN_DEV_MFORWARD(in_dev) &&
2535 			    !ipv4_is_local_multicast(fl4->daddr)) {
2536 				rth->dst.input = ip_mr_input;
2537 				rth->dst.output = ip_mc_output;
2538 			}
2539 		}
2540 #endif
2541 	}
2542 
2543 	rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2544 	lwtunnel_set_redirect(&rth->dst);
2545 
2546 	return rth;
2547 }
2548 
2549 /*
2550  * Major route resolver routine.
2551  */
2552 
2553 struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2554 					const struct sk_buff *skb)
2555 {
2556 	struct fib_result res = {
2557 		.type		= RTN_UNSPEC,
2558 		.fi		= NULL,
2559 		.table		= NULL,
2560 		.tclassid	= 0,
2561 	};
2562 	struct rtable *rth;
2563 
2564 	fl4->flowi4_iif = LOOPBACK_IFINDEX;
2565 	ip_rt_fix_tos(fl4);
2566 
2567 	rcu_read_lock();
2568 	rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2569 	rcu_read_unlock();
2570 
2571 	return rth;
2572 }
2573 EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
2574 
2575 struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2576 					    struct fib_result *res,
2577 					    const struct sk_buff *skb)
2578 {
2579 	struct net_device *dev_out = NULL;
2580 	int orig_oif = fl4->flowi4_oif;
2581 	unsigned int flags = 0;
2582 	struct rtable *rth;
2583 	int err;
2584 
2585 	if (fl4->saddr) {
2586 		if (ipv4_is_multicast(fl4->saddr) ||
2587 		    ipv4_is_lbcast(fl4->saddr) ||
2588 		    ipv4_is_zeronet(fl4->saddr)) {
2589 			rth = ERR_PTR(-EINVAL);
2590 			goto out;
2591 		}
2592 
2593 		rth = ERR_PTR(-ENETUNREACH);
2594 
2595 		/* I removed the check for oif == dev_out->oif here.
2596 		   It was wrong for two reasons:
2597 		   1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2598 		      is assigned to multiple interfaces.
2599 		   2. Moreover, we are allowed to send packets with the saddr
2600 		      of another iface. --ANK
2601 		 */
2602 
2603 		if (fl4->flowi4_oif == 0 &&
2604 		    (ipv4_is_multicast(fl4->daddr) ||
2605 		     ipv4_is_lbcast(fl4->daddr))) {
2606 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2607 			dev_out = __ip_dev_find(net, fl4->saddr, false);
2608 			if (!dev_out)
2609 				goto out;
2610 
2611 			/* Special hack: the user can direct multicasts
2612 			   and limited broadcast via the necessary interface
2613 			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2614 			   This hack is not just for fun, it allows
2615 			   vic, vat and friends to work.
2616 			   They bind a socket to loopback, set ttl to zero
2617 			   and expect that it will work.
2618 			   From the viewpoint of the routing cache they are broken,
2619 			   because we are not allowed to build a multicast path
2620 			   with a loopback source addr (look, the routing cache
2621 			   cannot know that ttl is zero, so that the packet
2622 			   will not leave this host and the route is valid).
2623 			   Luckily, this hack is a good workaround.
2624 			 */
2625 
2626 			fl4->flowi4_oif = dev_out->ifindex;
2627 			goto make_route;
2628 		}
2629 
2630 		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2631 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2632 			if (!__ip_dev_find(net, fl4->saddr, false))
2633 				goto out;
2634 		}
2635 	}
2636 
2637 
2638 	if (fl4->flowi4_oif) {
2639 		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2640 		rth = ERR_PTR(-ENODEV);
2641 		if (!dev_out)
2642 			goto out;
2643 
2644 		/* RACE: Check return value of inet_select_addr instead. */
2645 		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2646 			rth = ERR_PTR(-ENETUNREACH);
2647 			goto out;
2648 		}
2649 		if (ipv4_is_local_multicast(fl4->daddr) ||
2650 		    ipv4_is_lbcast(fl4->daddr) ||
2651 		    fl4->flowi4_proto == IPPROTO_IGMP) {
2652 			if (!fl4->saddr)
2653 				fl4->saddr = inet_select_addr(dev_out, 0,
2654 							      RT_SCOPE_LINK);
2655 			goto make_route;
2656 		}
2657 		if (!fl4->saddr) {
2658 			if (ipv4_is_multicast(fl4->daddr))
2659 				fl4->saddr = inet_select_addr(dev_out, 0,
2660 							      fl4->flowi4_scope);
2661 			else if (!fl4->daddr)
2662 				fl4->saddr = inet_select_addr(dev_out, 0,
2663 							      RT_SCOPE_HOST);
2664 		}
2665 	}
2666 
2667 	if (!fl4->daddr) {
2668 		fl4->daddr = fl4->saddr;
2669 		if (!fl4->daddr)
2670 			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2671 		dev_out = net->loopback_dev;
2672 		fl4->flowi4_oif = LOOPBACK_IFINDEX;
2673 		res->type = RTN_LOCAL;
2674 		flags |= RTCF_LOCAL;
2675 		goto make_route;
2676 	}
2677 
2678 	err = fib_lookup(net, fl4, res, 0);
2679 	if (err) {
2680 		res->fi = NULL;
2681 		res->table = NULL;
2682 		if (fl4->flowi4_oif &&
2683 		    (ipv4_is_multicast(fl4->daddr) ||
2684 		    !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2685 			/* Apparently, the routing tables are wrong. Assume
2686 			   that the destination is on-link.
2687 
2688 			   WHY? DW.
2689 			   Because we are allowed to send to an iface
2690 			   even if it has NO routes and NO assigned
2691 			   addresses. When oif is specified, routing
2692 			   tables are looked up with only one purpose:
2693 			   to catch whether the destination is gatewayed rather than
2694 			   direct. Moreover, if MSG_DONTROUTE is set,
2695 			   we send the packet, ignoring both routing tables
2696 			   and ifaddr state. --ANK
2697 
2698 
2699 			   We could do this even if oif is unknown,
2700 			   likely IPv6, but we do not.
2701 			 */
2702 
2703 			if (fl4->saddr == 0)
2704 				fl4->saddr = inet_select_addr(dev_out, 0,
2705 							      RT_SCOPE_LINK);
2706 			res->type = RTN_UNICAST;
2707 			goto make_route;
2708 		}
2709 		rth = ERR_PTR(err);
2710 		goto out;
2711 	}
2712 
2713 	if (res->type == RTN_LOCAL) {
2714 		if (!fl4->saddr) {
2715 			if (res->fi->fib_prefsrc)
2716 				fl4->saddr = res->fi->fib_prefsrc;
2717 			else
2718 				fl4->saddr = fl4->daddr;
2719 		}
2720 
2721 		/* L3 master device is the loopback for that domain */
2722 		dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2723 			net->loopback_dev;
2724 
2725 		/* make sure orig_oif points to fib result device even
2726 		 * though packet rx/tx happens over loopback or l3mdev
2727 		 */
2728 		orig_oif = FIB_RES_OIF(*res);
2729 
2730 		fl4->flowi4_oif = dev_out->ifindex;
2731 		flags |= RTCF_LOCAL;
2732 		goto make_route;
2733 	}
2734 
2735 	fib_select_path(net, res, fl4, skb);
2736 
2737 	dev_out = FIB_RES_DEV(*res);
2738 
2739 make_route:
2740 	rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2741 
2742 out:
2743 	return rth;
2744 }
2745 
2746 static struct dst_ops ipv4_dst_blackhole_ops = {
2747 	.family			= AF_INET,
2748 	.default_advmss		= ipv4_default_advmss,
2749 	.neigh_lookup		= ipv4_neigh_lookup,
2750 	.check			= dst_blackhole_check,
2751 	.cow_metrics		= dst_blackhole_cow_metrics,
2752 	.update_pmtu		= dst_blackhole_update_pmtu,
2753 	.redirect		= dst_blackhole_redirect,
2754 	.mtu			= dst_blackhole_mtu,
2755 };
2756 
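/* Convert @dst_orig into a "blackhole" route: the returned dst silently
 * discards anything sent through it (dst_discard input/output) while keeping
 * the original route's identity fields (type, flags, gateway, PMTU state) so
 * the caller can continue to hold a valid dst.  The original reference is
 * released.
 */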
2757 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2758 {
2759 	struct rtable *ort = (struct rtable *) dst_orig;
2760 	struct rtable *rt;
2761 
2762 	rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2763 	if (rt) {
2764 		struct dst_entry *new = &rt->dst;
2765 
2766 		new->__use = 1;
2767 		new->input = dst_discard;
2768 		new->output = dst_discard_out;
2769 
2770 		new->dev = net->loopback_dev;
2771 		if (new->dev)
2772 			dev_hold(new->dev);
2773 
2774 		rt->rt_is_input = ort->rt_is_input;
2775 		rt->rt_iif = ort->rt_iif;
2776 		rt->rt_pmtu = ort->rt_pmtu;
2777 		rt->rt_mtu_locked = ort->rt_mtu_locked;
2778 
2779 		rt->rt_genid = rt_genid_ipv4(net);
2780 		rt->rt_flags = ort->rt_flags;
2781 		rt->rt_type = ort->rt_type;
2782 		rt->rt_uses_gateway = ort->rt_uses_gateway;
2783 		rt->rt_gw_family = ort->rt_gw_family;
2784 		if (rt->rt_gw_family == AF_INET)
2785 			rt->rt_gw4 = ort->rt_gw4;
2786 		else if (rt->rt_gw_family == AF_INET6)
2787 			rt->rt_gw6 = ort->rt_gw6;
2788 
2789 		INIT_LIST_HEAD(&rt->rt_uncached);
2790 	}
2791 
2792 	dst_release(dst_orig);
2793 
2794 	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2795 }
2796 
2797 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2798 				    const struct sock *sk)
2799 {
2800 	struct rtable *rt = __ip_route_output_key(net, flp4);
2801 
2802 	if (IS_ERR(rt))
2803 		return rt;
2804 
2805 	if (flp4->flowi4_proto) {
2806 		flp4->flowi4_oif = rt->dst.dev->ifindex;
2807 		rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2808 							flowi4_to_flowi(flp4),
2809 							sk, 0);
2810 	}
2811 
2812 	return rt;
2813 }
2814 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2815 
2816 struct rtable *ip_route_output_tunnel(struct sk_buff *skb,
2817 				      struct net_device *dev,
2818 				      struct net *net, __be32 *saddr,
2819 				      const struct ip_tunnel_info *info,
2820 				      u8 protocol, bool use_cache)
2821 {
2822 #ifdef CONFIG_DST_CACHE
2823 	struct dst_cache *dst_cache;
2824 #endif
2825 	struct rtable *rt = NULL;
2826 	struct flowi4 fl4;
2827 	__u8 tos;
2828 
2829 #ifdef CONFIG_DST_CACHE
2830 	dst_cache = (struct dst_cache *)&info->dst_cache;
2831 	if (use_cache) {
2832 		rt = dst_cache_get_ip4(dst_cache, saddr);
2833 		if (rt)
2834 			return rt;
2835 	}
2836 #endif
2837 	memset(&fl4, 0, sizeof(fl4));
2838 	fl4.flowi4_mark = skb->mark;
2839 	fl4.flowi4_proto = protocol;
2840 	fl4.daddr = info->key.u.ipv4.dst;
2841 	fl4.saddr = info->key.u.ipv4.src;
2842 	tos = info->key.tos;
2843 	fl4.flowi4_tos = RT_TOS(tos);
2844 
2845 	rt = ip_route_output_key(net, &fl4);
2846 	if (IS_ERR(rt)) {
2847 		netdev_dbg(dev, "no route to %pI4\n", &fl4.daddr);
2848 		return ERR_PTR(-ENETUNREACH);
2849 	}
2850 	if (rt->dst.dev == dev) { /* is this necessary? */
2851 		netdev_dbg(dev, "circular route to %pI4\n", &fl4.daddr);
2852 		ip_rt_put(rt);
2853 		return ERR_PTR(-ELOOP);
2854 	}
2855 #ifdef CONFIG_DST_CACHE
2856 	if (use_cache)
2857 		dst_cache_set_ip4(dst_cache, &rt->dst, fl4.saddr);
2858 #endif
2859 	*saddr = fl4.saddr;
2860 	return rt;
2861 }
2862 EXPORT_SYMBOL_GPL(ip_route_output_tunnel);
2863 
2864 /* called with rcu_read_lock held */
2865 static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2866 			struct rtable *rt, u32 table_id, struct flowi4 *fl4,
2867 			struct sk_buff *skb, u32 portid, u32 seq,
2868 			unsigned int flags)
2869 {
2870 	struct rtmsg *r;
2871 	struct nlmsghdr *nlh;
2872 	unsigned long expires = 0;
2873 	u32 error;
2874 	u32 metrics[RTAX_MAX];
2875 
2876 	nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), flags);
2877 	if (!nlh)
2878 		return -EMSGSIZE;
2879 
2880 	r = nlmsg_data(nlh);
2881 	r->rtm_family	 = AF_INET;
2882 	r->rtm_dst_len	= 32;
2883 	r->rtm_src_len	= 0;
2884 	r->rtm_tos	= fl4 ? fl4->flowi4_tos : 0;
2885 	r->rtm_table	= table_id < 256 ? table_id : RT_TABLE_COMPAT;
2886 	if (nla_put_u32(skb, RTA_TABLE, table_id))
2887 		goto nla_put_failure;
2888 	r->rtm_type	= rt->rt_type;
2889 	r->rtm_scope	= RT_SCOPE_UNIVERSE;
2890 	r->rtm_protocol = RTPROT_UNSPEC;
2891 	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2892 	if (rt->rt_flags & RTCF_NOTIFY)
2893 		r->rtm_flags |= RTM_F_NOTIFY;
2894 	if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2895 		r->rtm_flags |= RTCF_DOREDIRECT;
2896 
2897 	if (nla_put_in_addr(skb, RTA_DST, dst))
2898 		goto nla_put_failure;
2899 	if (src) {
2900 		r->rtm_src_len = 32;
2901 		if (nla_put_in_addr(skb, RTA_SRC, src))
2902 			goto nla_put_failure;
2903 	}
2904 	if (rt->dst.dev &&
2905 	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2906 		goto nla_put_failure;
2907 #ifdef CONFIG_IP_ROUTE_CLASSID
2908 	if (rt->dst.tclassid &&
2909 	    nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2910 		goto nla_put_failure;
2911 #endif
2912 	if (fl4 && !rt_is_input_route(rt) &&
2913 	    fl4->saddr != src) {
2914 		if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2915 			goto nla_put_failure;
2916 	}
2917 	if (rt->rt_uses_gateway) {
2918 		if (rt->rt_gw_family == AF_INET &&
2919 		    nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) {
2920 			goto nla_put_failure;
2921 		} else if (rt->rt_gw_family == AF_INET6) {
2922 			int alen = sizeof(struct in6_addr);
2923 			struct nlattr *nla;
2924 			struct rtvia *via;
2925 
2926 			nla = nla_reserve(skb, RTA_VIA, alen + 2);
2927 			if (!nla)
2928 				goto nla_put_failure;
2929 
2930 			via = nla_data(nla);
2931 			via->rtvia_family = AF_INET6;
2932 			memcpy(via->rtvia_addr, &rt->rt_gw6, alen);
2933 		}
2934 	}
2935 
2936 	expires = rt->dst.expires;
2937 	if (expires) {
2938 		unsigned long now = jiffies;
2939 
2940 		if (time_before(now, expires))
2941 			expires -= now;
2942 		else
2943 			expires = 0;
2944 	}
2945 
2946 	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2947 	if (rt->rt_pmtu && expires)
2948 		metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2949 	if (rt->rt_mtu_locked && expires)
2950 		metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
2951 	if (rtnetlink_put_metrics(skb, metrics) < 0)
2952 		goto nla_put_failure;
2953 
2954 	if (fl4) {
2955 		if (fl4->flowi4_mark &&
2956 		    nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2957 			goto nla_put_failure;
2958 
2959 		if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2960 		    nla_put_u32(skb, RTA_UID,
2961 				from_kuid_munged(current_user_ns(),
2962 						 fl4->flowi4_uid)))
2963 			goto nla_put_failure;
2964 
2965 		if (rt_is_input_route(rt)) {
2966 #ifdef CONFIG_IP_MROUTE
2967 			if (ipv4_is_multicast(dst) &&
2968 			    !ipv4_is_local_multicast(dst) &&
2969 			    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2970 				int err = ipmr_get_route(net, skb,
2971 							 fl4->saddr, fl4->daddr,
2972 							 r, portid);
2973 
2974 				if (err <= 0) {
2975 					if (err == 0)
2976 						return 0;
2977 					goto nla_put_failure;
2978 				}
2979 			} else
2980 #endif
2981 				if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
2982 					goto nla_put_failure;
2983 		}
2984 	}
2985 
2986 	error = rt->dst.error;
2987 
2988 	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2989 		goto nla_put_failure;
2990 
2991 	nlmsg_end(skb, nlh);
2992 	return 0;
2993 
2994 nla_put_failure:
2995 	nlmsg_cancel(skb, nlh);
2996 	return -EMSGSIZE;
2997 }
2998 
2999 static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb,
3000 			    struct netlink_callback *cb, u32 table_id,
3001 			    struct fnhe_hash_bucket *bucket, int genid,
3002 			    int *fa_index, int fa_start, unsigned int flags)
3003 {
3004 	int i;
3005 
3006 	for (i = 0; i < FNHE_HASH_SIZE; i++) {
3007 		struct fib_nh_exception *fnhe;
3008 
3009 		for (fnhe = rcu_dereference(bucket[i].chain); fnhe;
3010 		     fnhe = rcu_dereference(fnhe->fnhe_next)) {
3011 			struct rtable *rt;
3012 			int err;
3013 
3014 			if (*fa_index < fa_start)
3015 				goto next;
3016 
3017 			if (fnhe->fnhe_genid != genid)
3018 				goto next;
3019 
3020 			if (fnhe->fnhe_expires &&
3021 			    time_after(jiffies, fnhe->fnhe_expires))
3022 				goto next;
3023 
3024 			rt = rcu_dereference(fnhe->fnhe_rth_input);
3025 			if (!rt)
3026 				rt = rcu_dereference(fnhe->fnhe_rth_output);
3027 			if (!rt)
3028 				goto next;
3029 
3030 			err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt,
3031 					   table_id, NULL, skb,
3032 					   NETLINK_CB(cb->skb).portid,
3033 					   cb->nlh->nlmsg_seq, flags);
3034 			if (err)
3035 				return err;
3036 next:
3037 			(*fa_index)++;
3038 		}
3039 	}
3040 
3041 	return 0;
3042 }
3043 
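/* Dump the cached exception routes (fnhe entries, e.g. learnt PMTU or
 * redirect state) attached to every live nexthop of @fi, skipping dead
 * nexthops and resuming at *fa_index relative to @fa_start so a partial
 * netlink dump can continue where it left off.
 */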
3044 int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
3045 		       u32 table_id, struct fib_info *fi,
3046 		       int *fa_index, int fa_start, unsigned int flags)
3047 {
3048 	struct net *net = sock_net(cb->skb->sk);
3049 	int nhsel, genid = fnhe_genid(net);
3050 
3051 	for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) {
3052 		struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel);
3053 		struct fnhe_hash_bucket *bucket;
3054 		int err;
3055 
3056 		if (nhc->nhc_flags & RTNH_F_DEAD)
3057 			continue;
3058 
3059 		rcu_read_lock();
3060 		bucket = rcu_dereference(nhc->nhc_exceptions);
3061 		err = 0;
3062 		if (bucket)
3063 			err = fnhe_dump_bucket(net, skb, cb, table_id, bucket,
3064 					       genid, fa_index, fa_start,
3065 					       flags);
3066 		rcu_read_unlock();
3067 		if (err)
3068 			return err;
3069 	}
3070 
3071 	return 0;
3072 }
3073 
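/* Build a minimal dummy packet (an IPv4 header plus an optional UDP, TCP or
 * ICMP header filled from the request) so that an RTM_GETROUTE lookup can
 * exercise the real input/output routing path with a realistic skb.
 */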
3074 static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
3075 						   u8 ip_proto, __be16 sport,
3076 						   __be16 dport)
3077 {
3078 	struct sk_buff *skb;
3079 	struct iphdr *iph;
3080 
3081 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3082 	if (!skb)
3083 		return NULL;
3084 
3085 	/* Reserve room for dummy headers; this skb can pass
3086 	 * through a good chunk of the routing engine.
3087 	 */
3088 	skb_reset_mac_header(skb);
3089 	skb_reset_network_header(skb);
3090 	skb->protocol = htons(ETH_P_IP);
3091 	iph = skb_put(skb, sizeof(struct iphdr));
3092 	iph->protocol = ip_proto;
3093 	iph->saddr = src;
3094 	iph->daddr = dst;
3095 	iph->version = 0x4;
3096 	iph->frag_off = 0;
3097 	iph->ihl = 0x5;
3098 	skb_set_transport_header(skb, skb->len);
3099 
3100 	switch (iph->protocol) {
3101 	case IPPROTO_UDP: {
3102 		struct udphdr *udph;
3103 
3104 		udph = skb_put_zero(skb, sizeof(struct udphdr));
3105 		udph->source = sport;
3106 		udph->dest = dport;
3107 		udph->len = htons(sizeof(struct udphdr));
3108 		udph->check = 0;
3109 		break;
3110 	}
3111 	case IPPROTO_TCP: {
3112 		struct tcphdr *tcph;
3113 
3114 		tcph = skb_put_zero(skb, sizeof(struct tcphdr));
3115 		tcph->source	= sport;
3116 		tcph->dest	= dport;
3117 		tcph->doff	= sizeof(struct tcphdr) / 4;
3118 		tcph->rst = 1;
3119 		tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
3120 					    src, dst, 0);
3121 		break;
3122 	}
3123 	case IPPROTO_ICMP: {
3124 		struct icmphdr *icmph;
3125 
3126 		icmph = skb_put_zero(skb, sizeof(struct icmphdr));
3127 		icmph->type = ICMP_ECHO;
3128 		icmph->code = 0;
3129 	}
3130 	}
3131 
3132 	return skb;
3133 }
3134 
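/* Validate an RTM_GETROUTE request.  Without strict checking the attributes
 * are parsed leniently; with strict checking, unexpected header fields,
 * flags and attribute types are rejected with an extack message.
 */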
3135 static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
3136 				       const struct nlmsghdr *nlh,
3137 				       struct nlattr **tb,
3138 				       struct netlink_ext_ack *extack)
3139 {
3140 	struct rtmsg *rtm;
3141 	int i, err;
3142 
3143 	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
3144 		NL_SET_ERR_MSG(extack,
3145 			       "ipv4: Invalid header for route get request");
3146 		return -EINVAL;
3147 	}
3148 
3149 	if (!netlink_strict_get_check(skb))
3150 		return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
3151 					      rtm_ipv4_policy, extack);
3152 
3153 	rtm = nlmsg_data(nlh);
3154 	if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
3155 	    (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
3156 	    rtm->rtm_table || rtm->rtm_protocol ||
3157 	    rtm->rtm_scope || rtm->rtm_type) {
3158 		NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request");
3159 		return -EINVAL;
3160 	}
3161 
3162 	if (rtm->rtm_flags & ~(RTM_F_NOTIFY |
3163 			       RTM_F_LOOKUP_TABLE |
3164 			       RTM_F_FIB_MATCH)) {
3165 		NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request");
3166 		return -EINVAL;
3167 	}
3168 
3169 	err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
3170 					    rtm_ipv4_policy, extack);
3171 	if (err)
3172 		return err;
3173 
3174 	if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
3175 	    (tb[RTA_DST] && !rtm->rtm_dst_len)) {
3176 		NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4");
3177 		return -EINVAL;
3178 	}
3179 
3180 	for (i = 0; i <= RTA_MAX; i++) {
3181 		if (!tb[i])
3182 			continue;
3183 
3184 		switch (i) {
3185 		case RTA_IIF:
3186 		case RTA_OIF:
3187 		case RTA_SRC:
3188 		case RTA_DST:
3189 		case RTA_IP_PROTO:
3190 		case RTA_SPORT:
3191 		case RTA_DPORT:
3192 		case RTA_MARK:
3193 		case RTA_UID:
3194 			break;
3195 		default:
3196 			NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request");
3197 			return -EINVAL;
3198 		}
3199 	}
3200 
3201 	return 0;
3202 }
3203 
3204 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
3205 			     struct netlink_ext_ack *extack)
3206 {
3207 	struct net *net = sock_net(in_skb->sk);
3208 	struct nlattr *tb[RTA_MAX+1];
3209 	u32 table_id = RT_TABLE_MAIN;
3210 	__be16 sport = 0, dport = 0;
3211 	struct fib_result res = {};
3212 	u8 ip_proto = IPPROTO_UDP;
3213 	struct rtable *rt = NULL;
3214 	struct sk_buff *skb;
3215 	struct rtmsg *rtm;
3216 	struct flowi4 fl4 = {};
3217 	__be32 dst = 0;
3218 	__be32 src = 0;
3219 	kuid_t uid;
3220 	u32 iif;
3221 	int err;
3222 	int mark;
3223 
3224 	err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
3225 	if (err < 0)
3226 		return err;
3227 
3228 	rtm = nlmsg_data(nlh);
3229 	src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
3230 	dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
3231 	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3232 	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3233 	if (tb[RTA_UID])
3234 		uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
3235 	else
3236 		uid = (iif ? INVALID_UID : current_uid());
3237 
3238 	if (tb[RTA_IP_PROTO]) {
3239 		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
3240 						  &ip_proto, AF_INET, extack);
3241 		if (err)
3242 			return err;
3243 	}
3244 
3245 	if (tb[RTA_SPORT])
3246 		sport = nla_get_be16(tb[RTA_SPORT]);
3247 
3248 	if (tb[RTA_DPORT])
3249 		dport = nla_get_be16(tb[RTA_DPORT]);
3250 
3251 	skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
3252 	if (!skb)
3253 		return -ENOBUFS;
3254 
3255 	fl4.daddr = dst;
3256 	fl4.saddr = src;
3257 	fl4.flowi4_tos = rtm->rtm_tos & IPTOS_RT_MASK;
3258 	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
3259 	fl4.flowi4_mark = mark;
3260 	fl4.flowi4_uid = uid;
3261 	if (sport)
3262 		fl4.fl4_sport = sport;
3263 	if (dport)
3264 		fl4.fl4_dport = dport;
3265 	fl4.flowi4_proto = ip_proto;
3266 
3267 	rcu_read_lock();
3268 
3269 	if (iif) {
3270 		struct net_device *dev;
3271 
3272 		dev = dev_get_by_index_rcu(net, iif);
3273 		if (!dev) {
3274 			err = -ENODEV;
3275 			goto errout_rcu;
3276 		}
3277 
3278 		fl4.flowi4_iif = iif; /* for rt_fill_info */
3279 		skb->dev	= dev;
3280 		skb->mark	= mark;
3281 		err = ip_route_input_rcu(skb, dst, src,
3282 					 rtm->rtm_tos & IPTOS_RT_MASK, dev,
3283 					 &res);
3284 
3285 		rt = skb_rtable(skb);
3286 		if (err == 0 && rt->dst.error)
3287 			err = -rt->dst.error;
3288 	} else {
3289 		fl4.flowi4_iif = LOOPBACK_IFINDEX;
3290 		skb->dev = net->loopback_dev;
3291 		rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
3292 		err = 0;
3293 		if (IS_ERR(rt))
3294 			err = PTR_ERR(rt);
3295 		else
3296 			skb_dst_set(skb, &rt->dst);
3297 	}
3298 
3299 	if (err)
3300 		goto errout_rcu;
3301 
3302 	if (rtm->rtm_flags & RTM_F_NOTIFY)
3303 		rt->rt_flags |= RTCF_NOTIFY;
3304 
3305 	if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
3306 		table_id = res.table ? res.table->tb_id : 0;
3307 
3308 	/* reset skb for netlink reply msg */
3309 	skb_trim(skb, 0);
3310 	skb_reset_network_header(skb);
3311 	skb_reset_transport_header(skb);
3312 	skb_reset_mac_header(skb);
3313 
3314 	if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
3315 		struct fib_rt_info fri;
3316 
3317 		if (!res.fi) {
3318 			err = fib_props[res.type].error;
3319 			if (!err)
3320 				err = -EHOSTUNREACH;
3321 			goto errout_rcu;
3322 		}
3323 		fri.fi = res.fi;
3324 		fri.tb_id = table_id;
3325 		fri.dst = res.prefix;
3326 		fri.dst_len = res.prefixlen;
3327 		fri.tos = fl4.flowi4_tos;
3328 		fri.type = rt->rt_type;
3329 		fri.offload = 0;
3330 		fri.trap = 0;
3331 		if (res.fa_head) {
3332 			struct fib_alias *fa;
3333 
3334 			hlist_for_each_entry_rcu(fa, res.fa_head, fa_list) {
3335 				u8 slen = 32 - fri.dst_len;
3336 
3337 				if (fa->fa_slen == slen &&
3338 				    fa->tb_id == fri.tb_id &&
3339 				    fa->fa_tos == fri.tos &&
3340 				    fa->fa_info == res.fi &&
3341 				    fa->fa_type == fri.type) {
3342 					fri.offload = fa->offload;
3343 					fri.trap = fa->trap;
3344 					break;
3345 				}
3346 			}
3347 		}
3348 		err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
3349 				    nlh->nlmsg_seq, RTM_NEWROUTE, &fri, 0);
3350 	} else {
3351 		err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
3352 				   NETLINK_CB(in_skb).portid,
3353 				   nlh->nlmsg_seq, 0);
3354 	}
3355 	if (err < 0)
3356 		goto errout_rcu;
3357 
3358 	rcu_read_unlock();
3359 
3360 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
3361 
3362 errout_free:
3363 	return err;
3364 errout_rcu:
3365 	rcu_read_unlock();
3366 	kfree_skb(skb);
3367 	goto errout_free;
3368 }
3369 
3370 void ip_rt_multicast_event(struct in_device *in_dev)
3371 {
3372 	rt_cache_flush(dev_net(in_dev->dev));
3373 }
3374 
3375 #ifdef CONFIG_SYSCTL
3376 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
3377 static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
3378 static int ip_rt_gc_elasticity __read_mostly	= 8;
3379 static int ip_min_valid_pmtu __read_mostly	= IPV4_MIN_MTU;
3380 
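/* Writing to the per-netns "flush" sysctl invalidates all cached routes by
 * bumping the route and fnhe generation ids; reading it is not supported.
 */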
3381 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
3382 		void *buffer, size_t *lenp, loff_t *ppos)
3383 {
3384 	struct net *net = (struct net *)__ctl->extra1;
3385 
3386 	if (write) {
3387 		rt_cache_flush(net);
3388 		fnhe_genid_bump(net);
3389 		return 0;
3390 	}
3391 
3392 	return -EINVAL;
3393 }
3394 
3395 static struct ctl_table ipv4_route_table[] = {
3396 	{
3397 		.procname	= "gc_thresh",
3398 		.data		= &ipv4_dst_ops.gc_thresh,
3399 		.maxlen		= sizeof(int),
3400 		.mode		= 0644,
3401 		.proc_handler	= proc_dointvec,
3402 	},
3403 	{
3404 		.procname	= "max_size",
3405 		.data		= &ip_rt_max_size,
3406 		.maxlen		= sizeof(int),
3407 		.mode		= 0644,
3408 		.proc_handler	= proc_dointvec,
3409 	},
3410 	{
3411 		/*  Deprecated. Use gc_min_interval_ms */
3412 
3413 		.procname	= "gc_min_interval",
3414 		.data		= &ip_rt_gc_min_interval,
3415 		.maxlen		= sizeof(int),
3416 		.mode		= 0644,
3417 		.proc_handler	= proc_dointvec_jiffies,
3418 	},
3419 	{
3420 		.procname	= "gc_min_interval_ms",
3421 		.data		= &ip_rt_gc_min_interval,
3422 		.maxlen		= sizeof(int),
3423 		.mode		= 0644,
3424 		.proc_handler	= proc_dointvec_ms_jiffies,
3425 	},
3426 	{
3427 		.procname	= "gc_timeout",
3428 		.data		= &ip_rt_gc_timeout,
3429 		.maxlen		= sizeof(int),
3430 		.mode		= 0644,
3431 		.proc_handler	= proc_dointvec_jiffies,
3432 	},
3433 	{
3434 		.procname	= "gc_interval",
3435 		.data		= &ip_rt_gc_interval,
3436 		.maxlen		= sizeof(int),
3437 		.mode		= 0644,
3438 		.proc_handler	= proc_dointvec_jiffies,
3439 	},
3440 	{
3441 		.procname	= "redirect_load",
3442 		.data		= &ip_rt_redirect_load,
3443 		.maxlen		= sizeof(int),
3444 		.mode		= 0644,
3445 		.proc_handler	= proc_dointvec,
3446 	},
3447 	{
3448 		.procname	= "redirect_number",
3449 		.data		= &ip_rt_redirect_number,
3450 		.maxlen		= sizeof(int),
3451 		.mode		= 0644,
3452 		.proc_handler	= proc_dointvec,
3453 	},
3454 	{
3455 		.procname	= "redirect_silence",
3456 		.data		= &ip_rt_redirect_silence,
3457 		.maxlen		= sizeof(int),
3458 		.mode		= 0644,
3459 		.proc_handler	= proc_dointvec,
3460 	},
3461 	{
3462 		.procname	= "error_cost",
3463 		.data		= &ip_rt_error_cost,
3464 		.maxlen		= sizeof(int),
3465 		.mode		= 0644,
3466 		.proc_handler	= proc_dointvec,
3467 	},
3468 	{
3469 		.procname	= "error_burst",
3470 		.data		= &ip_rt_error_burst,
3471 		.maxlen		= sizeof(int),
3472 		.mode		= 0644,
3473 		.proc_handler	= proc_dointvec,
3474 	},
3475 	{
3476 		.procname	= "gc_elasticity",
3477 		.data		= &ip_rt_gc_elasticity,
3478 		.maxlen		= sizeof(int),
3479 		.mode		= 0644,
3480 		.proc_handler	= proc_dointvec,
3481 	},
3482 	{
3483 		.procname	= "mtu_expires",
3484 		.data		= &ip_rt_mtu_expires,
3485 		.maxlen		= sizeof(int),
3486 		.mode		= 0644,
3487 		.proc_handler	= proc_dointvec_jiffies,
3488 	},
3489 	{
3490 		.procname	= "min_pmtu",
3491 		.data		= &ip_rt_min_pmtu,
3492 		.maxlen		= sizeof(int),
3493 		.mode		= 0644,
3494 		.proc_handler	= proc_dointvec_minmax,
3495 		.extra1		= &ip_min_valid_pmtu,
3496 	},
3497 	{
3498 		.procname	= "min_adv_mss",
3499 		.data		= &ip_rt_min_advmss,
3500 		.maxlen		= sizeof(int),
3501 		.mode		= 0644,
3502 		.proc_handler	= proc_dointvec,
3503 	},
3504 	{ }
3505 };
3506 
3507 static const char ipv4_route_flush_procname[] = "flush";
3508 
3509 static struct ctl_table ipv4_route_flush_table[] = {
3510 	{
3511 		.procname	= ipv4_route_flush_procname,
3512 		.maxlen		= sizeof(int),
3513 		.mode		= 0200,
3514 		.proc_handler	= ipv4_sysctl_rtcache_flush,
3515 	},
3516 	{ },
3517 };
3518 
3519 static __net_init int sysctl_route_net_init(struct net *net)
3520 {
3521 	struct ctl_table *tbl;
3522 
3523 	tbl = ipv4_route_flush_table;
3524 	if (!net_eq(net, &init_net)) {
3525 		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3526 		if (!tbl)
3527 			goto err_dup;
3528 
3529 		/* Don't export non-whitelisted sysctls to unprivileged users */
3530 		if (net->user_ns != &init_user_ns) {
3531 			if (tbl[0].procname != ipv4_route_flush_procname)
3532 				tbl[0].procname = NULL;
3533 		}
3534 	}
3535 	tbl[0].extra1 = net;
3536 
3537 	net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
3538 	if (!net->ipv4.route_hdr)
3539 		goto err_reg;
3540 	return 0;
3541 
3542 err_reg:
3543 	if (tbl != ipv4_route_flush_table)
3544 		kfree(tbl);
3545 err_dup:
3546 	return -ENOMEM;
3547 }
3548 
3549 static __net_exit void sysctl_route_net_exit(struct net *net)
3550 {
3551 	struct ctl_table *tbl;
3552 
3553 	tbl = net->ipv4.route_hdr->ctl_table_arg;
3554 	unregister_net_sysctl_table(net->ipv4.route_hdr);
3555 	BUG_ON(tbl == ipv4_route_flush_table);
3556 	kfree(tbl);
3557 }
3558 
3559 static __net_initdata struct pernet_operations sysctl_route_ops = {
3560 	.init = sysctl_route_net_init,
3561 	.exit = sysctl_route_net_exit,
3562 };
3563 #endif
3564 
3565 static __net_init int rt_genid_init(struct net *net)
3566 {
3567 	atomic_set(&net->ipv4.rt_genid, 0);
3568 	atomic_set(&net->fnhe_genid, 0);
3569 	atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
3570 	return 0;
3571 }
3572 
3573 static __net_initdata struct pernet_operations rt_genid_ops = {
3574 	.init = rt_genid_init,
3575 };
3576 
3577 static int __net_init ipv4_inetpeer_init(struct net *net)
3578 {
3579 	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3580 
3581 	if (!bp)
3582 		return -ENOMEM;
3583 	inet_peer_base_init(bp);
3584 	net->ipv4.peers = bp;
3585 	return 0;
3586 }
3587 
3588 static void __net_exit ipv4_inetpeer_exit(struct net *net)
3589 {
3590 	struct inet_peer_base *bp = net->ipv4.peers;
3591 
3592 	net->ipv4.peers = NULL;
3593 	inetpeer_invalidate_tree(bp);
3594 	kfree(bp);
3595 }
3596 
3597 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3598 	.init	=	ipv4_inetpeer_init,
3599 	.exit	=	ipv4_inetpeer_exit,
3600 };
3601 
3602 #ifdef CONFIG_IP_ROUTE_CLASSID
3603 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3604 #endif /* CONFIG_IP_ROUTE_CLASSID */
3605 
3606 int __init ip_rt_init(void)
3607 {
3608 	void *idents_hash;
3609 	int cpu;
3610 
3611 	/* For modern hosts, this will use 2 MB of memory */
3612 	idents_hash = alloc_large_system_hash("IP idents",
3613 					      sizeof(*ip_idents) + sizeof(*ip_tstamps),
3614 					      0,
3615 					      16, /* one bucket per 64 KB */
3616 					      HASH_ZERO,
3617 					      NULL,
3618 					      &ip_idents_mask,
3619 					      2048,
3620 					      256*1024);
3621 
3622 	ip_idents = idents_hash;
3623 
3624 	prandom_bytes(ip_idents, (ip_idents_mask + 1) * sizeof(*ip_idents));
3625 
3626 	ip_tstamps = idents_hash + (ip_idents_mask + 1) * sizeof(*ip_idents);
3627 
3628 	for_each_possible_cpu(cpu) {
3629 		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3630 
3631 		INIT_LIST_HEAD(&ul->head);
3632 		spin_lock_init(&ul->lock);
3633 	}
3634 #ifdef CONFIG_IP_ROUTE_CLASSID
3635 	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3636 	if (!ip_rt_acct)
3637 		panic("IP: failed to allocate ip_rt_acct\n");
3638 #endif
3639 
3640 	ipv4_dst_ops.kmem_cachep =
3641 		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3642 				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3643 
3644 	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3645 
3646 	if (dst_entries_init(&ipv4_dst_ops) < 0)
3647 		panic("IP: failed to allocate ipv4_dst_ops counter\n");
3648 
3649 	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3650 		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3651 
3652 	ipv4_dst_ops.gc_thresh = ~0;
3653 	ip_rt_max_size = INT_MAX;
3654 
3655 	devinet_init();
3656 	ip_fib_init();
3657 
3658 	if (ip_rt_proc_init())
3659 		pr_err("Unable to create route proc files\n");
3660 #ifdef CONFIG_XFRM
3661 	xfrm_init();
3662 	xfrm4_init();
3663 #endif
3664 	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
3665 		      RTNL_FLAG_DOIT_UNLOCKED);
3666 
3667 #ifdef CONFIG_SYSCTL
3668 	register_pernet_subsys(&sysctl_route_ops);
3669 #endif
3670 	register_pernet_subsys(&rt_genid_ops);
3671 	register_pernet_subsys(&ipv4_inetpeer_ops);
3672 	return 0;
3673 }
3674 
3675 #ifdef CONFIG_SYSCTL
3676 /*
3677  * We really need to sanitize the damn ipv4 init order, then all
3678  * this nonsense will go away.
3679  */
3680 void __init ip_static_sysctl_init(void)
3681 {
3682 	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3683 }
3684 #endif
3685