1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * ROUTE - implementation of the IP router.
7 *
8 * Authors: Ross Biro
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13 *
14 * Fixes:
15 * Alan Cox : Verify area fixes.
16 * Alan Cox : cli() protects routing changes
17 * Rui Oliveira : ICMP routing table updates
18 * (rco@di.uminho.pt) Routing table insertion and update
19 * Linus Torvalds : Rewrote bits to be sensible
20 * Alan Cox : Added BSD route gw semantics
21 * Alan Cox : Super /proc >4K
22 * Alan Cox : MTU in route table
23 * Alan Cox : MSS actually. Also added the window
24 * clamper.
25 * Sam Lantinga : Fixed route matching in rt_del()
26 * Alan Cox : Routing cache support.
27 * Alan Cox : Removed compatibility cruft.
28 * Alan Cox : RTF_REJECT support.
29 * Alan Cox : TCP irtt support.
30 * Jonathan Naylor : Added Metric support.
31 * Miquel van Smoorenburg : BSD API fixes.
32 * Miquel van Smoorenburg : Metrics.
33 * Alan Cox : Use __u32 properly
34 * Alan Cox : Aligned routing errors more closely with BSD;
35 * our system is still very different.
36 * Alan Cox : Faster /proc handling
37 * Alexey Kuznetsov : Massive rework to support tree based routing,
38 * routing caches and better behaviour.
39 *
40 * Olaf Erb : irtt wasn't being copied right.
41 * Bjorn Ekwall : Kerneld route support.
42 * Alan Cox : Multicast fixed (I hope)
43 * Pavel Krauz : Limited broadcast fixed
44 * Mike McLagan : Routing by source
45 * Alexey Kuznetsov : End of old history. Split to fib.c and
46 * route.c and rewritten from scratch.
47 * Andi Kleen : Load-limit warning messages.
48 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
49 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
50 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
51 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
52 * Marc Boucher : routing by fwmark
53 * Robert Olsson : Added rt_cache statistics
54 * Arnaldo C. Melo : Convert proc stuff to seq_file
55 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
56 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
57 * Ilia Sotnikov : Removed TOS from hash calculations
58 *
59 * This program is free software; you can redistribute it and/or
60 * modify it under the terms of the GNU General Public License
61 * as published by the Free Software Foundation; either version
62 * 2 of the License, or (at your option) any later version.
63 */
64
65 #define pr_fmt(fmt) "IPv4: " fmt
66
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <linux/bitops.h>
70 #include <linux/types.h>
71 #include <linux/kernel.h>
72 #include <linux/mm.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/skbuff.h>
83 #include <linux/inetdevice.h>
84 #include <linux/igmp.h>
85 #include <linux/pkt_sched.h>
86 #include <linux/mroute.h>
87 #include <linux/netfilter_ipv4.h>
88 #include <linux/random.h>
89 #include <linux/rcupdate.h>
90 #include <linux/times.h>
91 #include <linux/slab.h>
92 #include <linux/jhash.h>
93 #include <net/dst.h>
94 #include <net/dst_metadata.h>
95 #include <net/net_namespace.h>
96 #include <net/protocol.h>
97 #include <net/ip.h>
98 #include <net/route.h>
99 #include <net/inetpeer.h>
100 #include <net/sock.h>
101 #include <net/ip_fib.h>
102 #include <net/arp.h>
103 #include <net/tcp.h>
104 #include <net/icmp.h>
105 #include <net/xfrm.h>
106 #include <net/lwtunnel.h>
107 #include <net/netevent.h>
108 #include <net/rtnetlink.h>
109 #ifdef CONFIG_SYSCTL
110 #include <linux/sysctl.h>
111 #include <linux/kmemleak.h>
112 #endif
113 #include <net/secure_seq.h>
114 #include <net/ip_tunnels.h>
115 #include <net/l3mdev.h>
116
117 #define RT_FL_TOS(oldflp4) \
118 ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
119
120 #define RT_GC_TIMEOUT (300*HZ)
121
122 static int ip_rt_max_size;
123 static int ip_rt_redirect_number __read_mostly = 9;
124 static int ip_rt_redirect_load __read_mostly = HZ / 50;
125 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
126 static int ip_rt_error_cost __read_mostly = HZ;
127 static int ip_rt_error_burst __read_mostly = 5 * HZ;
128 static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
129 static u32 ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
130 static int ip_rt_min_advmss __read_mostly = 256;
131
132 static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
133
134 static int ip_min_valid_pmtu __read_mostly = IPV4_MIN_MTU;
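/* Note (assumption, for orientation only): most of the defaults above are
 * expected to be runtime-tunable via the net.ipv4.route.* sysctl table
 * registered later in this file, e.g. net.ipv4.route.min_pmtu,
 * net.ipv4.route.mtu_expires and net.ipv4.route.gc_timeout.
 */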
135
136 /*
137 * Interface to generic destination cache.
138 */
139
140 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
141 static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
142 static unsigned int ipv4_mtu(const struct dst_entry *dst);
143 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
144 static void ipv4_link_failure(struct sk_buff *skb);
145 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
146 struct sk_buff *skb, u32 mtu);
147 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk,
148 struct sk_buff *skb);
149 static void ipv4_dst_destroy(struct dst_entry *dst);
150
151 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
152 {
153 WARN_ON(1);
154 return NULL;
155 }
156
157 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
158 struct sk_buff *skb,
159 const void *daddr);
160
161 static struct dst_ops ipv4_dst_ops = {
162 .family = AF_INET,
163 .check = ipv4_dst_check,
164 .default_advmss = ipv4_default_advmss,
165 .mtu = ipv4_mtu,
166 .cow_metrics = ipv4_cow_metrics,
167 .destroy = ipv4_dst_destroy,
168 .negative_advice = ipv4_negative_advice,
169 .link_failure = ipv4_link_failure,
170 .update_pmtu = ip_rt_update_pmtu,
171 .redirect = ip_do_redirect,
172 .local_out = __ip_local_out,
173 .neigh_lookup = ipv4_neigh_lookup,
174 };
175
176 #define ECN_OR_COST(class) TC_PRIO_##class
177
178 const __u8 ip_tos2prio[16] = {
179 TC_PRIO_BESTEFFORT,
180 ECN_OR_COST(BESTEFFORT),
181 TC_PRIO_BESTEFFORT,
182 ECN_OR_COST(BESTEFFORT),
183 TC_PRIO_BULK,
184 ECN_OR_COST(BULK),
185 TC_PRIO_BULK,
186 ECN_OR_COST(BULK),
187 TC_PRIO_INTERACTIVE,
188 ECN_OR_COST(INTERACTIVE),
189 TC_PRIO_INTERACTIVE,
190 ECN_OR_COST(INTERACTIVE),
191 TC_PRIO_INTERACTIVE_BULK,
192 ECN_OR_COST(INTERACTIVE_BULK),
193 TC_PRIO_INTERACTIVE_BULK,
194 ECN_OR_COST(INTERACTIVE_BULK)
195 };
196 EXPORT_SYMBOL(ip_tos2prio);
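/* Usage sketch (assumption, not part of this file): rt_tos2priority() in
 * <net/route.h> is believed to index this table with the TOS bits shifted
 * down by one, roughly:
 *
 *	prio = ip_tos2prio[IPTOS_TOS(tos) >> 1];
 *
 * so, for example, IPTOS_LOWDELAY (0x10) would map to TC_PRIO_INTERACTIVE.
 */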
197
198 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
199 #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
200
201 #ifdef CONFIG_PROC_FS
202 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
203 {
204 if (*pos)
205 return NULL;
206 return SEQ_START_TOKEN;
207 }
208
209 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
210 {
211 ++*pos;
212 return NULL;
213 }
214
215 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
216 {
217 }
218
219 static int rt_cache_seq_show(struct seq_file *seq, void *v)
220 {
221 if (v == SEQ_START_TOKEN)
222 seq_printf(seq, "%-127s\n",
223 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
224 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
225 "HHUptod\tSpecDst");
226 return 0;
227 }
228
229 static const struct seq_operations rt_cache_seq_ops = {
230 .start = rt_cache_seq_start,
231 .next = rt_cache_seq_next,
232 .stop = rt_cache_seq_stop,
233 .show = rt_cache_seq_show,
234 };
235
236 static int rt_cache_seq_open(struct inode *inode, struct file *file)
237 {
238 return seq_open(file, &rt_cache_seq_ops);
239 }
240
241 static const struct file_operations rt_cache_seq_fops = {
242 .owner = THIS_MODULE,
243 .open = rt_cache_seq_open,
244 .read = seq_read,
245 .llseek = seq_lseek,
246 .release = seq_release,
247 };
248
249
250 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
251 {
252 int cpu;
253
254 if (*pos == 0)
255 return SEQ_START_TOKEN;
256
257 for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
258 if (!cpu_possible(cpu))
259 continue;
260 *pos = cpu+1;
261 return &per_cpu(rt_cache_stat, cpu);
262 }
263 return NULL;
264 }
265
266 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
267 {
268 int cpu;
269
270 for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
271 if (!cpu_possible(cpu))
272 continue;
273 *pos = cpu+1;
274 return &per_cpu(rt_cache_stat, cpu);
275 }
276 return NULL;
277
278 }
279
280 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
281 {
282
283 }
284
285 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
286 {
287 struct rt_cache_stat *st = v;
288
289 if (v == SEQ_START_TOKEN) {
290 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
291 return 0;
292 }
293
294 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
295 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
296 dst_entries_get_slow(&ipv4_dst_ops),
297 0, /* st->in_hit */
298 st->in_slow_tot,
299 st->in_slow_mc,
300 st->in_no_route,
301 st->in_brd,
302 st->in_martian_dst,
303 st->in_martian_src,
304
305 0, /* st->out_hit */
306 st->out_slow_tot,
307 st->out_slow_mc,
308
309 0, /* st->gc_total */
310 0, /* st->gc_ignored */
311 0, /* st->gc_goal_miss */
312 0, /* st->gc_dst_overflow */
313 0, /* st->in_hlist_search */
314 0 /* st->out_hlist_search */
315 );
316 return 0;
317 }
318
319 static const struct seq_operations rt_cpu_seq_ops = {
320 .start = rt_cpu_seq_start,
321 .next = rt_cpu_seq_next,
322 .stop = rt_cpu_seq_stop,
323 .show = rt_cpu_seq_show,
324 };
325
326
327 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
328 {
329 return seq_open(file, &rt_cpu_seq_ops);
330 }
331
332 static const struct file_operations rt_cpu_seq_fops = {
333 .owner = THIS_MODULE,
334 .open = rt_cpu_seq_open,
335 .read = seq_read,
336 .llseek = seq_lseek,
337 .release = seq_release,
338 };
339
340 #ifdef CONFIG_IP_ROUTE_CLASSID
341 static int rt_acct_proc_show(struct seq_file *m, void *v)
342 {
343 struct ip_rt_acct *dst, *src;
344 unsigned int i, j;
345
346 dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
347 if (!dst)
348 return -ENOMEM;
349
350 for_each_possible_cpu(i) {
351 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
352 for (j = 0; j < 256; j++) {
353 dst[j].o_bytes += src[j].o_bytes;
354 dst[j].o_packets += src[j].o_packets;
355 dst[j].i_bytes += src[j].i_bytes;
356 dst[j].i_packets += src[j].i_packets;
357 }
358 }
359
360 seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
361 kfree(dst);
362 return 0;
363 }
364
365 static int rt_acct_proc_open(struct inode *inode, struct file *file)
366 {
367 return single_open(file, rt_acct_proc_show, NULL);
368 }
369
370 static const struct file_operations rt_acct_proc_fops = {
371 .owner = THIS_MODULE,
372 .open = rt_acct_proc_open,
373 .read = seq_read,
374 .llseek = seq_lseek,
375 .release = single_release,
376 };
377 #endif
378
379 static int __net_init ip_rt_do_proc_init(struct net *net)
380 {
381 struct proc_dir_entry *pde;
382
383 pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
384 &rt_cache_seq_fops);
385 if (!pde)
386 goto err1;
387
388 pde = proc_create("rt_cache", S_IRUGO,
389 net->proc_net_stat, &rt_cpu_seq_fops);
390 if (!pde)
391 goto err2;
392
393 #ifdef CONFIG_IP_ROUTE_CLASSID
394 pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
395 if (!pde)
396 goto err3;
397 #endif
398 return 0;
399
400 #ifdef CONFIG_IP_ROUTE_CLASSID
401 err3:
402 remove_proc_entry("rt_cache", net->proc_net_stat);
403 #endif
404 err2:
405 remove_proc_entry("rt_cache", net->proc_net);
406 err1:
407 return -ENOMEM;
408 }
409
410 static void __net_exit ip_rt_do_proc_exit(struct net *net)
411 {
412 remove_proc_entry("rt_cache", net->proc_net_stat);
413 remove_proc_entry("rt_cache", net->proc_net);
414 #ifdef CONFIG_IP_ROUTE_CLASSID
415 remove_proc_entry("rt_acct", net->proc_net);
416 #endif
417 }
418
419 static struct pernet_operations ip_rt_proc_ops __net_initdata = {
420 .init = ip_rt_do_proc_init,
421 .exit = ip_rt_do_proc_exit,
422 };
423
424 static int __init ip_rt_proc_init(void)
425 {
426 return register_pernet_subsys(&ip_rt_proc_ops);
427 }
428
429 #else
430 static inline int ip_rt_proc_init(void)
431 {
432 return 0;
433 }
434 #endif /* CONFIG_PROC_FS */
435
436 static inline bool rt_is_expired(const struct rtable *rth)
437 {
438 return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
439 }
440
441 void rt_cache_flush(struct net *net)
442 {
443 rt_genid_bump_ipv4(net);
444 }
445
446 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
447 struct sk_buff *skb,
448 const void *daddr)
449 {
450 struct net_device *dev = dst->dev;
451 const __be32 *pkey = daddr;
452 const struct rtable *rt;
453 struct neighbour *n;
454
455 rt = (const struct rtable *) dst;
456 if (rt->rt_gateway)
457 pkey = (const __be32 *) &rt->rt_gateway;
458 else if (skb)
459 pkey = &ip_hdr(skb)->daddr;
460
461 n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
462 if (n)
463 return n;
464 return neigh_create(&arp_tbl, pkey, dev);
465 }
466
467 #define IP_IDENTS_SZ 2048u
468
469 static atomic_t *ip_idents __read_mostly;
470 static u32 *ip_tstamps __read_mostly;
471
472 /* In order to protect privacy, we add a perturbation to identifiers
473 * if one generator is seldom used. This makes it hard for an attacker
474 * to infer how many packets were sent between two points in time.
475 */
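/* Illustrative example: if a slot was last used d jiffies ago, the next
 * id reserved from it jumps ahead by an extra prandom_u32_max(d), so the
 * ids observed at two points in time do not reveal how many packets were
 * sent in between.
 */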
476 u32 ip_idents_reserve(u32 hash, int segs)
477 {
478 u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
479 atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
480 u32 old = ACCESS_ONCE(*p_tstamp);
481 u32 now = (u32)jiffies;
482 u32 new, delta = 0;
483
484 if (old != now && cmpxchg(p_tstamp, old, now) == old)
485 delta = prandom_u32_max(now - old);
486
487 /* Do not use atomic_add_return() as it makes UBSAN unhappy */
488 do {
489 old = (u32)atomic_read(p_id);
490 new = old + delta + segs;
491 } while (atomic_cmpxchg(p_id, old, new) != old);
492
493 return new - segs;
494 }
495 EXPORT_SYMBOL(ip_idents_reserve);
496
497 void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
498 {
499 static u32 ip_idents_hashrnd __read_mostly;
500 u32 hash, id;
501
502 net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));
503
504 hash = jhash_3words((__force u32)iph->daddr,
505 (__force u32)iph->saddr,
506 iph->protocol ^ net_hash_mix(net),
507 ip_idents_hashrnd);
508 id = ip_idents_reserve(hash, segs);
509 iph->id = htons(id);
510 }
511 EXPORT_SYMBOL(__ip_select_ident);
512
513 static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
514 const struct sock *sk,
515 const struct iphdr *iph,
516 int oif, u8 tos,
517 u8 prot, u32 mark, int flow_flags)
518 {
519 if (sk) {
520 const struct inet_sock *inet = inet_sk(sk);
521
522 oif = sk->sk_bound_dev_if;
523 mark = sk->sk_mark;
524 tos = RT_CONN_FLAGS(sk);
525 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
526 }
527 flowi4_init_output(fl4, oif, mark, tos,
528 RT_SCOPE_UNIVERSE, prot,
529 flow_flags,
530 iph->daddr, iph->saddr, 0, 0,
531 sock_net_uid(net, sk));
532 }
533
534 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
535 const struct sock *sk)
536 {
537 const struct net *net = dev_net(skb->dev);
538 const struct iphdr *iph = ip_hdr(skb);
539 int oif = skb->dev->ifindex;
540 u8 tos = RT_TOS(iph->tos);
541 u8 prot = iph->protocol;
542 u32 mark = skb->mark;
543
544 __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
545 }
546
547 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
548 {
549 const struct inet_sock *inet = inet_sk(sk);
550 const struct ip_options_rcu *inet_opt;
551 __be32 daddr = inet->inet_daddr;
552
553 rcu_read_lock();
554 inet_opt = rcu_dereference(inet->inet_opt);
555 if (inet_opt && inet_opt->opt.srr)
556 daddr = inet_opt->opt.faddr;
557 flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
558 RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
559 inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
560 inet_sk_flowi_flags(sk),
561 daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
562 rcu_read_unlock();
563 }
564
565 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
566 const struct sk_buff *skb)
567 {
568 if (skb)
569 build_skb_flow_key(fl4, skb, sk);
570 else
571 build_sk_flow_key(fl4, sk);
572 }
573
574 static inline void rt_free(struct rtable *rt)
575 {
576 call_rcu(&rt->dst.rcu_head, dst_rcu_free);
577 }
578
579 static DEFINE_SPINLOCK(fnhe_lock);
580
581 static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
582 {
583 struct rtable *rt;
584
585 rt = rcu_dereference(fnhe->fnhe_rth_input);
586 if (rt) {
587 RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
588 rt_free(rt);
589 }
590 rt = rcu_dereference(fnhe->fnhe_rth_output);
591 if (rt) {
592 RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
593 rt_free(rt);
594 }
595 }
596
597 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
598 {
599 struct fib_nh_exception *fnhe, *oldest;
600
601 oldest = rcu_dereference(hash->chain);
602 for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
603 fnhe = rcu_dereference(fnhe->fnhe_next)) {
604 if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
605 oldest = fnhe;
606 }
607 fnhe_flush_routes(oldest);
608 return oldest;
609 }
610
611 static inline u32 fnhe_hashfun(__be32 daddr)
612 {
613 static u32 fnhe_hashrnd __read_mostly;
614 u32 hval;
615
616 net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
617 hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
618 return hash_32(hval, FNHE_HASH_SHIFT);
619 }
620
621 static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
622 {
623 rt->rt_pmtu = fnhe->fnhe_pmtu;
624 rt->dst.expires = fnhe->fnhe_expires;
625
626 if (fnhe->fnhe_gw) {
627 rt->rt_flags |= RTCF_REDIRECTED;
628 rt->rt_gateway = fnhe->fnhe_gw;
629 rt->rt_uses_gateway = 1;
630 }
631 }
632
633 static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
634 u32 pmtu, unsigned long expires)
635 {
636 struct fnhe_hash_bucket *hash;
637 struct fib_nh_exception *fnhe;
638 struct rtable *rt;
639 u32 genid, hval;
640 unsigned int i;
641 int depth;
642
643 genid = fnhe_genid(dev_net(nh->nh_dev));
644 hval = fnhe_hashfun(daddr);
645
646 spin_lock_bh(&fnhe_lock);
647
648 hash = rcu_dereference(nh->nh_exceptions);
649 if (!hash) {
650 hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
651 if (!hash)
652 goto out_unlock;
653 rcu_assign_pointer(nh->nh_exceptions, hash);
654 }
655
656 hash += hval;
657
658 depth = 0;
659 for (fnhe = rcu_dereference(hash->chain); fnhe;
660 fnhe = rcu_dereference(fnhe->fnhe_next)) {
661 if (fnhe->fnhe_daddr == daddr)
662 break;
663 depth++;
664 }
665
666 if (fnhe) {
667 if (fnhe->fnhe_genid != genid)
668 fnhe->fnhe_genid = genid;
669 if (gw)
670 fnhe->fnhe_gw = gw;
671 if (pmtu)
672 fnhe->fnhe_pmtu = pmtu;
673 fnhe->fnhe_expires = max(1UL, expires);
674 /* Update all cached dsts too */
675 rt = rcu_dereference(fnhe->fnhe_rth_input);
676 if (rt)
677 fill_route_from_fnhe(rt, fnhe);
678 rt = rcu_dereference(fnhe->fnhe_rth_output);
679 if (rt)
680 fill_route_from_fnhe(rt, fnhe);
681 } else {
682 if (depth > FNHE_RECLAIM_DEPTH)
683 fnhe = fnhe_oldest(hash);
684 else {
685 fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
686 if (!fnhe)
687 goto out_unlock;
688
689 fnhe->fnhe_next = hash->chain;
690 rcu_assign_pointer(hash->chain, fnhe);
691 }
692 fnhe->fnhe_genid = genid;
693 fnhe->fnhe_daddr = daddr;
694 fnhe->fnhe_gw = gw;
695 fnhe->fnhe_pmtu = pmtu;
696 fnhe->fnhe_expires = expires;
697
698 /* Exception created; mark the cached routes for the nexthop
699 * stale, so that anyone caching them rechecks whether this
700 * exception applies.
701 */
702 rt = rcu_dereference(nh->nh_rth_input);
703 if (rt)
704 rt->dst.obsolete = DST_OBSOLETE_KILL;
705
706 for_each_possible_cpu(i) {
707 struct rtable __rcu **prt;
708 prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
709 rt = rcu_dereference(*prt);
710 if (rt)
711 rt->dst.obsolete = DST_OBSOLETE_KILL;
712 }
713 }
714
715 fnhe->fnhe_stamp = jiffies;
716
717 out_unlock:
718 spin_unlock_bh(&fnhe_lock);
719 }
720
721 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
722 bool kill_route)
723 {
724 __be32 new_gw = icmp_hdr(skb)->un.gateway;
725 __be32 old_gw = ip_hdr(skb)->saddr;
726 struct net_device *dev = skb->dev;
727 struct in_device *in_dev;
728 struct fib_result res;
729 struct neighbour *n;
730 struct net *net;
731
732 switch (icmp_hdr(skb)->code & 7) {
733 case ICMP_REDIR_NET:
734 case ICMP_REDIR_NETTOS:
735 case ICMP_REDIR_HOST:
736 case ICMP_REDIR_HOSTTOS:
737 break;
738
739 default:
740 return;
741 }
742
743 if (rt->rt_gateway != old_gw)
744 return;
745
746 in_dev = __in_dev_get_rcu(dev);
747 if (!in_dev)
748 return;
749
750 net = dev_net(dev);
751 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
752 ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
753 ipv4_is_zeronet(new_gw))
754 goto reject_redirect;
755
756 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
757 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
758 goto reject_redirect;
759 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
760 goto reject_redirect;
761 } else {
762 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
763 goto reject_redirect;
764 }
765
766 n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
767 if (!n)
768 n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
769 if (!IS_ERR(n)) {
770 if (!(n->nud_state & NUD_VALID)) {
771 neigh_event_send(n, NULL);
772 } else {
773 if (fib_lookup(net, fl4, &res, 0) == 0) {
774 struct fib_nh *nh = &FIB_RES_NH(res);
775
776 update_or_create_fnhe(nh, fl4->daddr, new_gw,
777 0, jiffies + ip_rt_gc_timeout);
778 }
779 if (kill_route)
780 rt->dst.obsolete = DST_OBSOLETE_KILL;
781 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
782 }
783 neigh_release(n);
784 }
785 return;
786
787 reject_redirect:
788 #ifdef CONFIG_IP_ROUTE_VERBOSE
789 if (IN_DEV_LOG_MARTIANS(in_dev)) {
790 const struct iphdr *iph = (const struct iphdr *) skb->data;
791 __be32 daddr = iph->daddr;
792 __be32 saddr = iph->saddr;
793
794 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
795 " Advised path = %pI4 -> %pI4\n",
796 &old_gw, dev->name, &new_gw,
797 &saddr, &daddr);
798 }
799 #endif
800 ;
801 }
802
803 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
804 {
805 struct rtable *rt;
806 struct flowi4 fl4;
807 const struct iphdr *iph = (const struct iphdr *) skb->data;
808 struct net *net = dev_net(skb->dev);
809 int oif = skb->dev->ifindex;
810 u8 tos = RT_TOS(iph->tos);
811 u8 prot = iph->protocol;
812 u32 mark = skb->mark;
813
814 rt = (struct rtable *) dst;
815
816 __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
817 __ip_do_redirect(rt, skb, &fl4, true);
818 }
819
820 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
821 {
822 struct rtable *rt = (struct rtable *)dst;
823 struct dst_entry *ret = dst;
824
825 if (rt) {
826 if (dst->obsolete > 0) {
827 ip_rt_put(rt);
828 ret = NULL;
829 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
830 rt->dst.expires) {
831 ip_rt_put(rt);
832 ret = NULL;
833 }
834 }
835 return ret;
836 }
837
838 /*
839 * Algorithm:
840 * 1. The first ip_rt_redirect_number redirects are sent
841 * with exponential backoff, then we stop sending them at all,
842 * assuming that the host ignores our redirects.
843 * 2. If we did not see packets requiring redirects
844 * during ip_rt_redirect_silence, we assume that the host
845 * forgot the redirected route, and we start sending redirects again.
846 *
847 * This algorithm is much cheaper and more intelligent than dumb load limiting
848 * in icmp.c.
849 *
850 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
851 * and "frag. need" (breaks PMTU discovery) in icmp.c.
852 */
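/* Worked example with the defaults above (illustrative only): with
 * ip_rt_redirect_load = HZ/50 (20ms at HZ=1000) and
 * ip_rt_redirect_number = 9, redirect k (k = rate_tokens) is sent no
 * earlier than rate_last + (HZ/50 << k), so the gap doubles from 20ms
 * up to about 5 seconds before redirects are suppressed; a quiet period
 * of ip_rt_redirect_silence (roughly 20s) resets the counter.
 */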
853
854 void ip_rt_send_redirect(struct sk_buff *skb)
855 {
856 struct rtable *rt = skb_rtable(skb);
857 struct in_device *in_dev;
858 struct inet_peer *peer;
859 struct net *net;
860 int log_martians;
861 int vif;
862
863 rcu_read_lock();
864 in_dev = __in_dev_get_rcu(rt->dst.dev);
865 if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
866 rcu_read_unlock();
867 return;
868 }
869 log_martians = IN_DEV_LOG_MARTIANS(in_dev);
870 vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
871 rcu_read_unlock();
872
873 net = dev_net(rt->dst.dev);
874 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
875 if (!peer) {
876 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
877 rt_nexthop(rt, ip_hdr(skb)->daddr));
878 return;
879 }
880
881 /* No redirected packets during ip_rt_redirect_silence;
882 * reset the algorithm.
883 */
884 if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
885 peer->rate_tokens = 0;
886
887 /* Too many ignored redirects; do not send anything.
888 * Set peer->rate_last to the time of the last seen redirected packet.
889 */
890 if (peer->rate_tokens >= ip_rt_redirect_number) {
891 peer->rate_last = jiffies;
892 goto out_put_peer;
893 }
894
895 /* Check for load limit; set rate_last to the latest sent
896 * redirect.
897 */
898 if (peer->rate_tokens == 0 ||
899 time_after(jiffies,
900 (peer->rate_last +
901 (ip_rt_redirect_load << peer->rate_tokens)))) {
902 __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
903
904 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
905 peer->rate_last = jiffies;
906 ++peer->rate_tokens;
907 #ifdef CONFIG_IP_ROUTE_VERBOSE
908 if (log_martians &&
909 peer->rate_tokens == ip_rt_redirect_number)
910 net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
911 &ip_hdr(skb)->saddr, inet_iif(skb),
912 &ip_hdr(skb)->daddr, &gw);
913 #endif
914 }
915 out_put_peer:
916 inet_putpeer(peer);
917 }
918
919 static int ip_error(struct sk_buff *skb)
920 {
921 struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
922 struct rtable *rt = skb_rtable(skb);
923 struct inet_peer *peer;
924 unsigned long now;
925 struct net *net;
926 bool send;
927 int code;
928
929 /* IP on this device is disabled. */
930 if (!in_dev)
931 goto out;
932
933 net = dev_net(rt->dst.dev);
934 if (!IN_DEV_FORWARD(in_dev)) {
935 switch (rt->dst.error) {
936 case EHOSTUNREACH:
937 __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
938 break;
939
940 case ENETUNREACH:
941 __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
942 break;
943 }
944 goto out;
945 }
946
947 switch (rt->dst.error) {
948 case EINVAL:
949 default:
950 goto out;
951 case EHOSTUNREACH:
952 code = ICMP_HOST_UNREACH;
953 break;
954 case ENETUNREACH:
955 code = ICMP_NET_UNREACH;
956 __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
957 break;
958 case EACCES:
959 code = ICMP_PKT_FILTERED;
960 break;
961 }
962
963 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
964 l3mdev_master_ifindex(skb->dev), 1);
965
966 send = true;
967 if (peer) {
968 now = jiffies;
969 peer->rate_tokens += now - peer->rate_last;
970 if (peer->rate_tokens > ip_rt_error_burst)
971 peer->rate_tokens = ip_rt_error_burst;
972 peer->rate_last = now;
973 if (peer->rate_tokens >= ip_rt_error_cost)
974 peer->rate_tokens -= ip_rt_error_cost;
975 else
976 send = false;
977 inet_putpeer(peer);
978 }
979 if (send)
980 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
981
982 out: kfree_skb(skb);
983 return 0;
984 }
985
986 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
987 {
988 struct dst_entry *dst = &rt->dst;
989 struct fib_result res;
990
991 if (dst_metric_locked(dst, RTAX_MTU))
992 return;
993
994 if (ipv4_mtu(dst) < mtu)
995 return;
996
997 if (mtu < ip_rt_min_pmtu)
998 mtu = ip_rt_min_pmtu;
999
1000 if (rt->rt_pmtu == mtu &&
1001 time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
1002 return;
1003
1004 rcu_read_lock();
1005 if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
1006 struct fib_nh *nh = &FIB_RES_NH(res);
1007
1008 update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
1009 jiffies + ip_rt_mtu_expires);
1010 }
1011 rcu_read_unlock();
1012 }
1013
1014 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1015 struct sk_buff *skb, u32 mtu)
1016 {
1017 struct rtable *rt = (struct rtable *) dst;
1018 struct flowi4 fl4;
1019
1020 ip_rt_build_flow_key(&fl4, sk, skb);
1021 __ip_rt_update_pmtu(rt, &fl4, mtu);
1022 }
1023
1024 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1025 int oif, u32 mark, u8 protocol, int flow_flags)
1026 {
1027 const struct iphdr *iph = (const struct iphdr *) skb->data;
1028 struct flowi4 fl4;
1029 struct rtable *rt;
1030
1031 if (!mark)
1032 mark = IP4_REPLY_MARK(net, skb->mark);
1033
1034 __build_flow_key(net, &fl4, NULL, iph, oif,
1035 RT_TOS(iph->tos), protocol, mark, flow_flags);
1036 rt = __ip_route_output_key(net, &fl4);
1037 if (!IS_ERR(rt)) {
1038 __ip_rt_update_pmtu(rt, &fl4, mtu);
1039 ip_rt_put(rt);
1040 }
1041 }
1042 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1043
1044 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1045 {
1046 const struct iphdr *iph = (const struct iphdr *) skb->data;
1047 struct flowi4 fl4;
1048 struct rtable *rt;
1049
1050 __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1051
1052 if (!fl4.flowi4_mark)
1053 fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1054
1055 rt = __ip_route_output_key(sock_net(sk), &fl4);
1056 if (!IS_ERR(rt)) {
1057 __ip_rt_update_pmtu(rt, &fl4, mtu);
1058 ip_rt_put(rt);
1059 }
1060 }
1061
1062 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1063 {
1064 const struct iphdr *iph = (const struct iphdr *) skb->data;
1065 struct flowi4 fl4;
1066 struct rtable *rt;
1067 struct dst_entry *odst = NULL;
1068 bool new = false;
1069 struct net *net = sock_net(sk);
1070
1071 bh_lock_sock(sk);
1072
1073 if (!ip_sk_accept_pmtu(sk))
1074 goto out;
1075
1076 odst = sk_dst_get(sk);
1077
1078 if (sock_owned_by_user(sk) || !odst) {
1079 __ipv4_sk_update_pmtu(skb, sk, mtu);
1080 goto out;
1081 }
1082
1083 __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1084
1085 rt = (struct rtable *)odst;
1086 if (odst->obsolete && !odst->ops->check(odst, 0)) {
1087 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1088 if (IS_ERR(rt))
1089 goto out;
1090
1091 new = true;
1092 }
1093
1094 __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);
1095
1096 if (!dst_check(&rt->dst, 0)) {
1097 if (new)
1098 dst_release(&rt->dst);
1099
1100 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1101 if (IS_ERR(rt))
1102 goto out;
1103
1104 new = true;
1105 }
1106
1107 if (new)
1108 sk_dst_set(sk, &rt->dst);
1109
1110 out:
1111 bh_unlock_sock(sk);
1112 dst_release(odst);
1113 }
1114 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1115
1116 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1117 int oif, u32 mark, u8 protocol, int flow_flags)
1118 {
1119 const struct iphdr *iph = (const struct iphdr *) skb->data;
1120 struct flowi4 fl4;
1121 struct rtable *rt;
1122
1123 __build_flow_key(net, &fl4, NULL, iph, oif,
1124 RT_TOS(iph->tos), protocol, mark, flow_flags);
1125 rt = __ip_route_output_key(net, &fl4);
1126 if (!IS_ERR(rt)) {
1127 __ip_do_redirect(rt, skb, &fl4, false);
1128 ip_rt_put(rt);
1129 }
1130 }
1131 EXPORT_SYMBOL_GPL(ipv4_redirect);
1132
1133 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1134 {
1135 const struct iphdr *iph = (const struct iphdr *) skb->data;
1136 struct flowi4 fl4;
1137 struct rtable *rt;
1138 struct net *net = sock_net(sk);
1139
1140 __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1141 rt = __ip_route_output_key(net, &fl4);
1142 if (!IS_ERR(rt)) {
1143 __ip_do_redirect(rt, skb, &fl4, false);
1144 ip_rt_put(rt);
1145 }
1146 }
1147 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1148
1149 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1150 {
1151 struct rtable *rt = (struct rtable *) dst;
1152
1153 /* All IPV4 dsts are created with ->obsolete set to the value
1154 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1155 * into this function always.
1156 *
1157 * When a PMTU/redirect information update invalidates a route,
1158 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1159 * DST_OBSOLETE_DEAD by dst_free().
1160 */
1161 if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1162 return NULL;
1163 return dst;
1164 }
1165
1166 static void ipv4_link_failure(struct sk_buff *skb)
1167 {
1168 struct rtable *rt;
1169
1170 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1171
1172 rt = skb_rtable(skb);
1173 if (rt)
1174 dst_set_expires(&rt->dst, 0);
1175 }
1176
1177 static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1178 {
1179 pr_debug("%s: %pI4 -> %pI4, %s\n",
1180 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1181 skb->dev ? skb->dev->name : "?");
1182 kfree_skb(skb);
1183 WARN_ON(1);
1184 return 0;
1185 }
1186
1187 /*
1188 We do not cache the source address of the outgoing interface,
1189 because it is used only by IP RR, TS and SRR options,
1190 so it is out of the fast path.
1191
1192 BTW remember: "addr" is allowed to be unaligned
1193 in IP options!
1194 */
1195
1196 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1197 {
1198 __be32 src;
1199
1200 if (rt_is_output_route(rt))
1201 src = ip_hdr(skb)->saddr;
1202 else {
1203 struct fib_result res;
1204 struct flowi4 fl4;
1205 struct iphdr *iph;
1206
1207 iph = ip_hdr(skb);
1208
1209 memset(&fl4, 0, sizeof(fl4));
1210 fl4.daddr = iph->daddr;
1211 fl4.saddr = iph->saddr;
1212 fl4.flowi4_tos = RT_TOS(iph->tos);
1213 fl4.flowi4_oif = rt->dst.dev->ifindex;
1214 fl4.flowi4_iif = skb->dev->ifindex;
1215 fl4.flowi4_mark = skb->mark;
1216
1217 rcu_read_lock();
1218 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1219 src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1220 else
1221 src = inet_select_addr(rt->dst.dev,
1222 rt_nexthop(rt, iph->daddr),
1223 RT_SCOPE_UNIVERSE);
1224 rcu_read_unlock();
1225 }
1226 memcpy(addr, &src, 4);
1227 }
1228
1229 #ifdef CONFIG_IP_ROUTE_CLASSID
1230 static void set_class_tag(struct rtable *rt, u32 tag)
1231 {
1232 if (!(rt->dst.tclassid & 0xFFFF))
1233 rt->dst.tclassid |= tag & 0xFFFF;
1234 if (!(rt->dst.tclassid & 0xFFFF0000))
1235 rt->dst.tclassid |= tag & 0xFFFF0000;
1236 }
1237 #endif
1238
1239 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1240 {
1241 unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1242
1243 if (advmss == 0) {
1244 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1245 ip_rt_min_advmss);
1246 if (advmss > 65535 - 40)
1247 advmss = 65535 - 40;
1248 }
1249 return advmss;
1250 }
1251
1252 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1253 {
1254 const struct rtable *rt = (const struct rtable *) dst;
1255 unsigned int mtu = rt->rt_pmtu;
1256
1257 if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1258 mtu = dst_metric_raw(dst, RTAX_MTU);
1259
1260 if (mtu)
1261 return mtu;
1262
1263 mtu = READ_ONCE(dst->dev->mtu);
1264
1265 if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1266 if (rt->rt_uses_gateway && mtu > 576)
1267 mtu = 576;
1268 }
1269
1270 mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
1271
1272 return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1273 }
1274
1275 static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1276 {
1277 struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
1278 struct fib_nh_exception *fnhe;
1279 u32 hval;
1280
1281 if (!hash)
1282 return NULL;
1283
1284 hval = fnhe_hashfun(daddr);
1285
1286 for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1287 fnhe = rcu_dereference(fnhe->fnhe_next)) {
1288 if (fnhe->fnhe_daddr == daddr)
1289 return fnhe;
1290 }
1291 return NULL;
1292 }
1293
1294 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1295 __be32 daddr)
1296 {
1297 bool ret = false;
1298
1299 spin_lock_bh(&fnhe_lock);
1300
1301 if (daddr == fnhe->fnhe_daddr) {
1302 struct rtable __rcu **porig;
1303 struct rtable *orig;
1304 int genid = fnhe_genid(dev_net(rt->dst.dev));
1305
1306 if (rt_is_input_route(rt))
1307 porig = &fnhe->fnhe_rth_input;
1308 else
1309 porig = &fnhe->fnhe_rth_output;
1310 orig = rcu_dereference(*porig);
1311
1312 if (fnhe->fnhe_genid != genid) {
1313 fnhe->fnhe_genid = genid;
1314 fnhe->fnhe_gw = 0;
1315 fnhe->fnhe_pmtu = 0;
1316 fnhe->fnhe_expires = 0;
1317 fnhe_flush_routes(fnhe);
1318 orig = NULL;
1319 }
1320 fill_route_from_fnhe(rt, fnhe);
1321 if (!rt->rt_gateway)
1322 rt->rt_gateway = daddr;
1323
1324 if (!(rt->dst.flags & DST_NOCACHE)) {
1325 rcu_assign_pointer(*porig, rt);
1326 if (orig)
1327 rt_free(orig);
1328 ret = true;
1329 }
1330
1331 fnhe->fnhe_stamp = jiffies;
1332 }
1333 spin_unlock_bh(&fnhe_lock);
1334
1335 return ret;
1336 }
1337
1338 static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1339 {
1340 struct rtable *orig, *prev, **p;
1341 bool ret = true;
1342
1343 if (rt_is_input_route(rt)) {
1344 p = (struct rtable **)&nh->nh_rth_input;
1345 } else {
1346 p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
1347 }
1348 orig = *p;
1349
1350 prev = cmpxchg(p, orig, rt);
1351 if (prev == orig) {
1352 if (orig)
1353 rt_free(orig);
1354 } else
1355 ret = false;
1356
1357 return ret;
1358 }
1359
1360 struct uncached_list {
1361 spinlock_t lock;
1362 struct list_head head;
1363 };
1364
1365 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1366
1367 static void rt_add_uncached_list(struct rtable *rt)
1368 {
1369 struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1370
1371 rt->rt_uncached_list = ul;
1372
1373 spin_lock_bh(&ul->lock);
1374 list_add_tail(&rt->rt_uncached, &ul->head);
1375 spin_unlock_bh(&ul->lock);
1376 }
1377
1378 static void ipv4_dst_destroy(struct dst_entry *dst)
1379 {
1380 struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
1381 struct rtable *rt = (struct rtable *) dst;
1382
1383 if (p != &dst_default_metrics && atomic_dec_and_test(&p->refcnt))
1384 kfree(p);
1385
1386 if (!list_empty(&rt->rt_uncached)) {
1387 struct uncached_list *ul = rt->rt_uncached_list;
1388
1389 spin_lock_bh(&ul->lock);
1390 list_del(&rt->rt_uncached);
1391 spin_unlock_bh(&ul->lock);
1392 }
1393 }
1394
1395 void rt_flush_dev(struct net_device *dev)
1396 {
1397 struct net *net = dev_net(dev);
1398 struct rtable *rt;
1399 int cpu;
1400
1401 for_each_possible_cpu(cpu) {
1402 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1403
1404 spin_lock_bh(&ul->lock);
1405 list_for_each_entry(rt, &ul->head, rt_uncached) {
1406 if (rt->dst.dev != dev)
1407 continue;
1408 rt->dst.dev = net->loopback_dev;
1409 dev_hold(rt->dst.dev);
1410 dev_put(dev);
1411 }
1412 spin_unlock_bh(&ul->lock);
1413 }
1414 }
1415
1416 static bool rt_cache_valid(const struct rtable *rt)
1417 {
1418 return rt &&
1419 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1420 !rt_is_expired(rt);
1421 }
1422
1423 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1424 const struct fib_result *res,
1425 struct fib_nh_exception *fnhe,
1426 struct fib_info *fi, u16 type, u32 itag)
1427 {
1428 bool cached = false;
1429
1430 if (fi) {
1431 struct fib_nh *nh = &FIB_RES_NH(*res);
1432
1433 if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
1434 rt->rt_gateway = nh->nh_gw;
1435 rt->rt_uses_gateway = 1;
1436 }
1437 dst_init_metrics(&rt->dst, fi->fib_metrics->metrics, true);
1438 if (fi->fib_metrics != &dst_default_metrics) {
1439 rt->dst._metrics |= DST_METRICS_REFCOUNTED;
1440 atomic_inc(&fi->fib_metrics->refcnt);
1441 }
1442 #ifdef CONFIG_IP_ROUTE_CLASSID
1443 rt->dst.tclassid = nh->nh_tclassid;
1444 #endif
1445 rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
1446 if (unlikely(fnhe))
1447 cached = rt_bind_exception(rt, fnhe, daddr);
1448 else if (!(rt->dst.flags & DST_NOCACHE))
1449 cached = rt_cache_route(nh, rt);
1450 if (unlikely(!cached)) {
1451 /* Routes we intend to cache in nexthop exception or
1452 * FIB nexthop have the DST_NOCACHE bit clear.
1453 * However, if we are unsuccessful at storing this
1454 * route into the cache we really need to set it.
1455 */
1456 rt->dst.flags |= DST_NOCACHE;
1457 if (!rt->rt_gateway)
1458 rt->rt_gateway = daddr;
1459 rt_add_uncached_list(rt);
1460 }
1461 } else
1462 rt_add_uncached_list(rt);
1463
1464 #ifdef CONFIG_IP_ROUTE_CLASSID
1465 #ifdef CONFIG_IP_MULTIPLE_TABLES
1466 set_class_tag(rt, res->tclassid);
1467 #endif
1468 set_class_tag(rt, itag);
1469 #endif
1470 }
1471
1472 struct rtable *rt_dst_alloc(struct net_device *dev,
1473 unsigned int flags, u16 type,
1474 bool nopolicy, bool noxfrm, bool will_cache)
1475 {
1476 struct rtable *rt;
1477
1478 rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1479 (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
1480 (nopolicy ? DST_NOPOLICY : 0) |
1481 (noxfrm ? DST_NOXFRM : 0));
1482
1483 if (rt) {
1484 rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1485 rt->rt_flags = flags;
1486 rt->rt_type = type;
1487 rt->rt_is_input = 0;
1488 rt->rt_iif = 0;
1489 rt->rt_pmtu = 0;
1490 rt->rt_gateway = 0;
1491 rt->rt_uses_gateway = 0;
1492 rt->rt_table_id = 0;
1493 INIT_LIST_HEAD(&rt->rt_uncached);
1494
1495 rt->dst.output = ip_output;
1496 if (flags & RTCF_LOCAL)
1497 rt->dst.input = ip_local_deliver;
1498 }
1499
1500 return rt;
1501 }
1502 EXPORT_SYMBOL(rt_dst_alloc);
1503
1504 /* called in rcu_read_lock() section */
1505 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1506 u8 tos, struct net_device *dev, int our)
1507 {
1508 struct rtable *rth;
1509 struct in_device *in_dev = __in_dev_get_rcu(dev);
1510 unsigned int flags = RTCF_MULTICAST;
1511 u32 itag = 0;
1512 int err;
1513
1514 /* Primary sanity checks. */
1515
1516 if (!in_dev)
1517 return -EINVAL;
1518
1519 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1520 skb->protocol != htons(ETH_P_IP))
1521 goto e_inval;
1522
1523 if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1524 goto e_inval;
1525
1526 if (ipv4_is_zeronet(saddr)) {
1527 if (!ipv4_is_local_multicast(daddr))
1528 goto e_inval;
1529 } else {
1530 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1531 in_dev, &itag);
1532 if (err < 0)
1533 goto e_err;
1534 }
1535 if (our)
1536 flags |= RTCF_LOCAL;
1537
1538 rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1539 IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1540 if (!rth)
1541 goto e_nobufs;
1542
1543 #ifdef CONFIG_IP_ROUTE_CLASSID
1544 rth->dst.tclassid = itag;
1545 #endif
1546 rth->dst.output = ip_rt_bug;
1547 rth->rt_is_input= 1;
1548
1549 #ifdef CONFIG_IP_MROUTE
1550 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1551 rth->dst.input = ip_mr_input;
1552 #endif
1553 RT_CACHE_STAT_INC(in_slow_mc);
1554
1555 skb_dst_set(skb, &rth->dst);
1556 return 0;
1557
1558 e_nobufs:
1559 return -ENOBUFS;
1560 e_inval:
1561 return -EINVAL;
1562 e_err:
1563 return err;
1564 }
1565
1566
1567 static void ip_handle_martian_source(struct net_device *dev,
1568 struct in_device *in_dev,
1569 struct sk_buff *skb,
1570 __be32 daddr,
1571 __be32 saddr)
1572 {
1573 RT_CACHE_STAT_INC(in_martian_src);
1574 #ifdef CONFIG_IP_ROUTE_VERBOSE
1575 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1576 /*
1577 * RFC 1812 recommendation: if the source is martian,
1578 * the only hint is the MAC header.
1579 */
1580 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1581 &daddr, &saddr, dev->name);
1582 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1583 print_hex_dump(KERN_WARNING, "ll header: ",
1584 DUMP_PREFIX_OFFSET, 16, 1,
1585 skb_mac_header(skb),
1586 dev->hard_header_len, true);
1587 }
1588 }
1589 #endif
1590 }
1591
1592 static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
1593 {
1594 struct fnhe_hash_bucket *hash;
1595 struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1596 u32 hval = fnhe_hashfun(daddr);
1597
1598 spin_lock_bh(&fnhe_lock);
1599
1600 hash = rcu_dereference_protected(nh->nh_exceptions,
1601 lockdep_is_held(&fnhe_lock));
1602 hash += hval;
1603
1604 fnhe_p = &hash->chain;
1605 fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1606 while (fnhe) {
1607 if (fnhe->fnhe_daddr == daddr) {
1608 rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1609 fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1610 fnhe_flush_routes(fnhe);
1611 kfree_rcu(fnhe, rcu);
1612 break;
1613 }
1614 fnhe_p = &fnhe->fnhe_next;
1615 fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1616 lockdep_is_held(&fnhe_lock));
1617 }
1618
1619 spin_unlock_bh(&fnhe_lock);
1620 }
1621
1622 /* called in rcu_read_lock() section */
1623 static int __mkroute_input(struct sk_buff *skb,
1624 const struct fib_result *res,
1625 struct in_device *in_dev,
1626 __be32 daddr, __be32 saddr, u32 tos)
1627 {
1628 struct fib_nh_exception *fnhe;
1629 struct rtable *rth;
1630 int err;
1631 struct in_device *out_dev;
1632 bool do_cache;
1633 u32 itag = 0;
1634
1635 /* get a working reference to the output device */
1636 out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1637 if (!out_dev) {
1638 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1639 return -EINVAL;
1640 }
1641
1642 err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1643 in_dev->dev, in_dev, &itag);
1644 if (err < 0) {
1645 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1646 saddr);
1647
1648 goto cleanup;
1649 }
1650
1651 do_cache = res->fi && !itag;
1652 if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1653 skb->protocol == htons(ETH_P_IP) &&
1654 (IN_DEV_SHARED_MEDIA(out_dev) ||
1655 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1656 IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1657
1658 if (skb->protocol != htons(ETH_P_IP)) {
1659 /* Not IP (i.e. ARP). Do not create a route if it is
1660 * invalid for proxy arp. DNAT routes are always valid.
1661 *
1662 * The proxy arp feature has been extended to allow ARP
1663 * replies back out the same interface, to support
1664 * Private VLAN switch technologies. See arp.c.
1665 */
1666 if (out_dev == in_dev &&
1667 IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1668 err = -EINVAL;
1669 goto cleanup;
1670 }
1671 }
1672
1673 fnhe = find_exception(&FIB_RES_NH(*res), daddr);
1674 if (do_cache) {
1675 if (fnhe) {
1676 rth = rcu_dereference(fnhe->fnhe_rth_input);
1677 if (rth && rth->dst.expires &&
1678 time_after(jiffies, rth->dst.expires)) {
1679 ip_del_fnhe(&FIB_RES_NH(*res), daddr);
1680 fnhe = NULL;
1681 } else {
1682 goto rt_cache;
1683 }
1684 }
1685
1686 rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1687
1688 rt_cache:
1689 if (rt_cache_valid(rth)) {
1690 skb_dst_set_noref(skb, &rth->dst);
1691 goto out;
1692 }
1693 }
1694
1695 rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1696 IN_DEV_CONF_GET(in_dev, NOPOLICY),
1697 IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1698 if (!rth) {
1699 err = -ENOBUFS;
1700 goto cleanup;
1701 }
1702
1703 rth->rt_is_input = 1;
1704 if (res->table)
1705 rth->rt_table_id = res->table->tb_id;
1706 RT_CACHE_STAT_INC(in_slow_tot);
1707
1708 rth->dst.input = ip_forward;
1709
1710 rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag);
1711 if (lwtunnel_output_redirect(rth->dst.lwtstate)) {
1712 rth->dst.lwtstate->orig_output = rth->dst.output;
1713 rth->dst.output = lwtunnel_output;
1714 }
1715 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
1716 rth->dst.lwtstate->orig_input = rth->dst.input;
1717 rth->dst.input = lwtunnel_input;
1718 }
1719 skb_dst_set(skb, &rth->dst);
1720 out:
1721 err = 0;
1722 cleanup:
1723 return err;
1724 }
1725
1726 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1727
1728 /* To make ICMP packets follow the right flow, the multipath hash is
1729 * calculated from the inner IP addresses in reverse order.
1730 */
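/* For instance, an ICMP error quoting an inner header A -> B is hashed
 * below as fib_multipath_hash(B, A), i.e. the same hash that ordinary
 * B -> A traffic would get, so the error follows the nexthop chosen for
 * the reverse flow.
 */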
1731 static int ip_multipath_icmp_hash(struct sk_buff *skb)
1732 {
1733 const struct iphdr *outer_iph = ip_hdr(skb);
1734 struct icmphdr _icmph;
1735 const struct icmphdr *icmph;
1736 struct iphdr _inner_iph;
1737 const struct iphdr *inner_iph;
1738
1739 if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1740 goto standard_hash;
1741
1742 icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1743 &_icmph);
1744 if (!icmph)
1745 goto standard_hash;
1746
1747 if (icmph->type != ICMP_DEST_UNREACH &&
1748 icmph->type != ICMP_REDIRECT &&
1749 icmph->type != ICMP_TIME_EXCEEDED &&
1750 icmph->type != ICMP_PARAMETERPROB) {
1751 goto standard_hash;
1752 }
1753
1754 inner_iph = skb_header_pointer(skb,
1755 outer_iph->ihl * 4 + sizeof(_icmph),
1756 sizeof(_inner_iph), &_inner_iph);
1757 if (!inner_iph)
1758 goto standard_hash;
1759
1760 return fib_multipath_hash(inner_iph->daddr, inner_iph->saddr);
1761
1762 standard_hash:
1763 return fib_multipath_hash(outer_iph->saddr, outer_iph->daddr);
1764 }
1765
1766 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
1767
1768 static int ip_mkroute_input(struct sk_buff *skb,
1769 struct fib_result *res,
1770 const struct flowi4 *fl4,
1771 struct in_device *in_dev,
1772 __be32 daddr, __be32 saddr, u32 tos)
1773 {
1774 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1775 if (res->fi && res->fi->fib_nhs > 1) {
1776 int h;
1777
1778 if (unlikely(ip_hdr(skb)->protocol == IPPROTO_ICMP))
1779 h = ip_multipath_icmp_hash(skb);
1780 else
1781 h = fib_multipath_hash(saddr, daddr);
1782 fib_select_multipath(res, h);
1783 }
1784 #endif
1785
1786 /* create a routing cache entry */
1787 return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1788 }
1789
1790 /*
1791 * NOTE. We drop all the packets that have local source
1792 * addresses, because every properly looped-back packet
1793 * must already have the correct destination attached by the output routine.
1794 *
1795 * Such an approach solves two big problems:
1796 * 1. Non-simplex devices are handled properly.
1797 * 2. IP spoofing attempts are filtered with a 100% guarantee.
1798 * Called with rcu_read_lock().
1799 */
1800
1801 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1802 u8 tos, struct net_device *dev)
1803 {
1804 struct fib_result res;
1805 struct in_device *in_dev = __in_dev_get_rcu(dev);
1806 struct ip_tunnel_info *tun_info;
1807 struct flowi4 fl4;
1808 unsigned int flags = 0;
1809 u32 itag = 0;
1810 struct rtable *rth;
1811 int err = -EINVAL;
1812 struct net *net = dev_net(dev);
1813 bool do_cache;
1814
1815 /* IP on this device is disabled. */
1816
1817 if (!in_dev)
1818 goto out;
1819
1820 /* Check for the most weird martians, which cannot be detected
1821 by fib_lookup.
1822 */
1823
1824 tun_info = skb_tunnel_info(skb);
1825 if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1826 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
1827 else
1828 fl4.flowi4_tun_key.tun_id = 0;
1829 skb_dst_drop(skb);
1830
1831 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1832 goto martian_source;
1833
1834 res.fi = NULL;
1835 res.table = NULL;
1836 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1837 goto brd_input;
1838
1839 /* Accept zero addresses only to limited broadcast;
1840 * I do not even know whether to fix it or not. Waiting for complaints :-)
1841 */
1842 if (ipv4_is_zeronet(saddr))
1843 goto martian_source;
1844
1845 if (ipv4_is_zeronet(daddr))
1846 goto martian_destination;
1847
1848 /* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
1849 * and calls it at most once, only if daddr and/or saddr are loopback addresses
1850 */
1851 if (ipv4_is_loopback(daddr)) {
1852 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1853 goto martian_destination;
1854 } else if (ipv4_is_loopback(saddr)) {
1855 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1856 goto martian_source;
1857 }
1858
1859 /*
1860 * Now we are ready to route the packet.
1861 */
1862 fl4.flowi4_oif = 0;
1863 fl4.flowi4_iif = dev->ifindex;
1864 fl4.flowi4_mark = skb->mark;
1865 fl4.flowi4_tos = tos;
1866 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1867 fl4.flowi4_flags = 0;
1868 fl4.daddr = daddr;
1869 fl4.saddr = saddr;
1870 fl4.flowi4_uid = sock_net_uid(net, NULL);
1871 err = fib_lookup(net, &fl4, &res, 0);
1872 if (err != 0) {
1873 if (!IN_DEV_FORWARD(in_dev))
1874 err = -EHOSTUNREACH;
1875 goto no_route;
1876 }
1877
1878 if (res.type == RTN_BROADCAST)
1879 goto brd_input;
1880
1881 if (res.type == RTN_LOCAL) {
1882 err = fib_validate_source(skb, saddr, daddr, tos,
1883 0, dev, in_dev, &itag);
1884 if (err < 0)
1885 goto martian_source;
1886 goto local_input;
1887 }
1888
1889 if (!IN_DEV_FORWARD(in_dev)) {
1890 err = -EHOSTUNREACH;
1891 goto no_route;
1892 }
1893 if (res.type != RTN_UNICAST)
1894 goto martian_destination;
1895
1896 err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
1897 out: return err;
1898
1899 brd_input:
1900 if (skb->protocol != htons(ETH_P_IP))
1901 goto e_inval;
1902
1903 if (!ipv4_is_zeronet(saddr)) {
1904 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1905 in_dev, &itag);
1906 if (err < 0)
1907 goto martian_source;
1908 }
1909 flags |= RTCF_BROADCAST;
1910 res.type = RTN_BROADCAST;
1911 RT_CACHE_STAT_INC(in_brd);
1912
1913 local_input:
1914 do_cache = false;
1915 if (res.fi) {
1916 if (!itag) {
1917 rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
1918 if (rt_cache_valid(rth)) {
1919 skb_dst_set_noref(skb, &rth->dst);
1920 err = 0;
1921 goto out;
1922 }
1923 do_cache = true;
1924 }
1925 }
1926
1927 rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
1928 flags | RTCF_LOCAL, res.type,
1929 IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
1930 if (!rth)
1931 goto e_nobufs;
1932
1933 	rth->dst.output = ip_rt_bug;
1934 #ifdef CONFIG_IP_ROUTE_CLASSID
1935 rth->dst.tclassid = itag;
1936 #endif
1937 rth->rt_is_input = 1;
1938 if (res.table)
1939 rth->rt_table_id = res.table->tb_id;
1940
1941 RT_CACHE_STAT_INC(in_slow_tot);
1942 if (res.type == RTN_UNREACHABLE) {
1943 		rth->dst.input = ip_error;
1944 		rth->dst.error = -err;
1945 rth->rt_flags &= ~RTCF_LOCAL;
1946 }
1947 if (do_cache) {
1948 if (unlikely(!rt_cache_route(&FIB_RES_NH(res), rth))) {
1949 rth->dst.flags |= DST_NOCACHE;
1950 rt_add_uncached_list(rth);
1951 }
1952 }
1953 skb_dst_set(skb, &rth->dst);
1954 err = 0;
1955 goto out;
1956
1957 no_route:
1958 RT_CACHE_STAT_INC(in_no_route);
1959 res.type = RTN_UNREACHABLE;
1960 res.fi = NULL;
1961 res.table = NULL;
1962 goto local_input;
1963
1964 /*
1965 * Do not cache martian addresses: they should be logged (RFC1812)
1966 */
1967 martian_destination:
1968 RT_CACHE_STAT_INC(in_martian_dst);
1969 #ifdef CONFIG_IP_ROUTE_VERBOSE
1970 if (IN_DEV_LOG_MARTIANS(in_dev))
1971 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
1972 &daddr, &saddr, dev->name);
1973 #endif
1974
1975 e_inval:
1976 err = -EINVAL;
1977 goto out;
1978
1979 e_nobufs:
1980 err = -ENOBUFS;
1981 goto out;
1982
1983 martian_source:
1984 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
1985 goto out;
1986 }
1987
1988 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1989 u8 tos, struct net_device *dev)
1990 {
1991 int res;
1992
1993 tos &= IPTOS_RT_MASK;
1994 rcu_read_lock();
1995
1996 	/* Multicast recognition logic was moved from the route cache to here.
1997 	   The problem was that too many Ethernet cards have broken/missing
1998 	   hardware multicast filters :-( As a result, a host on a multicast
1999 	   network acquires a lot of useless route cache entries, e.g. for
2000 	   SDR messages from all over the world. Now we try to get rid of them.
2001 	   Really, provided the software IP multicast filter is organized
2002 	   reasonably (at least, hashed), it does not result in a slowdown
2003 	   compared with route cache reject entries.
2004 	   Note that multicast routers are not affected, because a
2005 	   route cache entry is created eventually.
2006 */
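	/* E.g. a host that is not subscribed to 239.1.1.1 but whose NIC still
	 * passes such frames up (imperfect hardware filter) has them rejected
	 * right here with -EINVAL, without any per-destination routing state
	 * being created.
	 */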
2007 if (ipv4_is_multicast(daddr)) {
2008 struct in_device *in_dev = __in_dev_get_rcu(dev);
2009
2010 if (in_dev) {
2011 int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2012 ip_hdr(skb)->protocol);
2013 if (our
2014 #ifdef CONFIG_IP_MROUTE
2015 ||
2016 (!ipv4_is_local_multicast(daddr) &&
2017 IN_DEV_MFORWARD(in_dev))
2018 #endif
2019 ) {
2020 int res = ip_route_input_mc(skb, daddr, saddr,
2021 tos, dev, our);
2022 rcu_read_unlock();
2023 return res;
2024 }
2025 }
2026 rcu_read_unlock();
2027 return -EINVAL;
2028 }
2029 res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2030 rcu_read_unlock();
2031 return res;
2032 }
2033 EXPORT_SYMBOL(ip_route_input_noref);
2034
2035 /* called with rcu_read_lock() */
2036 static struct rtable *__mkroute_output(const struct fib_result *res,
2037 const struct flowi4 *fl4, int orig_oif,
2038 struct net_device *dev_out,
2039 unsigned int flags)
2040 {
2041 struct fib_info *fi = res->fi;
2042 struct fib_nh_exception *fnhe;
2043 struct in_device *in_dev;
2044 u16 type = res->type;
2045 struct rtable *rth;
2046 bool do_cache;
2047
2048 in_dev = __in_dev_get_rcu(dev_out);
2049 if (!in_dev)
2050 return ERR_PTR(-EINVAL);
2051
2052 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2053 if (ipv4_is_loopback(fl4->saddr) &&
2054 !(dev_out->flags & IFF_LOOPBACK) &&
2055 !netif_is_l3_master(dev_out))
2056 return ERR_PTR(-EINVAL);
2057
2058 if (ipv4_is_lbcast(fl4->daddr))
2059 type = RTN_BROADCAST;
2060 else if (ipv4_is_multicast(fl4->daddr))
2061 type = RTN_MULTICAST;
2062 else if (ipv4_is_zeronet(fl4->daddr))
2063 return ERR_PTR(-EINVAL);
2064
2065 if (dev_out->flags & IFF_LOOPBACK)
2066 flags |= RTCF_LOCAL;
2067
2068 do_cache = true;
2069 if (type == RTN_BROADCAST) {
2070 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2071 fi = NULL;
2072 } else if (type == RTN_MULTICAST) {
2073 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2074 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2075 fl4->flowi4_proto))
2076 flags &= ~RTCF_LOCAL;
2077 else
2078 do_cache = false;
2079 		/* If a multicast route does not exist, use
2080 		 * the default one, but do not use a gateway in this case.
2081 		 * Yes, it is a hack.
2082 */
2083 if (fi && res->prefixlen < 4)
2084 fi = NULL;
2085 } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2086 (orig_oif != dev_out->ifindex)) {
2087 /* For local routes that require a particular output interface
2088 * we do not want to cache the result. Caching the result
2089 * causes incorrect behaviour when there are multiple source
2090 * addresses on the interface, the end result being that if the
2091 * intended recipient is waiting on that interface for the
2092 * packet he won't receive it because it will be delivered on
2093 * the loopback interface and the IP_PKTINFO ipi_ifindex will
2094 * be set to the loopback interface as well.
2095 */
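		/* Rough illustration (hypothetical setup): with two addresses
		 * on the same NIC, a locally generated packet sent to the
		 * second address with an explicit oif would, if this result
		 * were cached, later be delivered over loopback, and a
		 * receiver reading ipi_ifindex via IP_PKTINFO would see the
		 * loopback ifindex rather than the NIC's.
		 */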
2096 fi = NULL;
2097 }
2098
2099 fnhe = NULL;
2100 do_cache &= fi != NULL;
2101 if (do_cache) {
2102 struct rtable __rcu **prth;
2103 struct fib_nh *nh = &FIB_RES_NH(*res);
2104
2105 fnhe = find_exception(nh, fl4->daddr);
2106 if (fnhe) {
2107 prth = &fnhe->fnhe_rth_output;
2108 rth = rcu_dereference(*prth);
2109 if (rth && rth->dst.expires &&
2110 time_after(jiffies, rth->dst.expires)) {
2111 ip_del_fnhe(nh, fl4->daddr);
2112 fnhe = NULL;
2113 } else {
2114 goto rt_cache;
2115 }
2116 }
2117
2118 if (unlikely(fl4->flowi4_flags &
2119 FLOWI_FLAG_KNOWN_NH &&
2120 !(nh->nh_gw &&
2121 nh->nh_scope == RT_SCOPE_LINK))) {
2122 do_cache = false;
2123 goto add;
2124 }
2125 prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
2126 rth = rcu_dereference(*prth);
2127
2128 rt_cache:
2129 if (rt_cache_valid(rth)) {
2130 dst_hold(&rth->dst);
2131 return rth;
2132 }
2133 }
2134
2135 add:
2136 rth = rt_dst_alloc(dev_out, flags, type,
2137 IN_DEV_CONF_GET(in_dev, NOPOLICY),
2138 IN_DEV_CONF_GET(in_dev, NOXFRM),
2139 do_cache);
2140 if (!rth)
2141 return ERR_PTR(-ENOBUFS);
2142
2143 rth->rt_iif = orig_oif ? : 0;
2144 if (res->table)
2145 rth->rt_table_id = res->table->tb_id;
2146
2147 RT_CACHE_STAT_INC(out_slow_tot);
2148
2149 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2150 if (flags & RTCF_LOCAL &&
2151 !(dev_out->flags & IFF_LOOPBACK)) {
2152 rth->dst.output = ip_mc_output;
2153 RT_CACHE_STAT_INC(out_slow_mc);
2154 }
2155 #ifdef CONFIG_IP_MROUTE
2156 if (type == RTN_MULTICAST) {
2157 if (IN_DEV_MFORWARD(in_dev) &&
2158 !ipv4_is_local_multicast(fl4->daddr)) {
2159 rth->dst.input = ip_mr_input;
2160 rth->dst.output = ip_mc_output;
2161 }
2162 }
2163 #endif
2164 }
2165
2166 rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
2167 if (lwtunnel_output_redirect(rth->dst.lwtstate))
2168 rth->dst.output = lwtunnel_output;
2169
2170 return rth;
2171 }
2172
2173 /*
2174 * Major route resolver routine.
2175 */
2176
2177 struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2178 int mp_hash)
2179 {
2180 struct net_device *dev_out = NULL;
2181 __u8 tos = RT_FL_TOS(fl4);
2182 unsigned int flags = 0;
2183 struct fib_result res;
2184 struct rtable *rth;
2185 int orig_oif;
2186 int err = -ENETUNREACH;
2187
2188 res.tclassid = 0;
2189 res.fi = NULL;
2190 res.table = NULL;
2191
2192 orig_oif = fl4->flowi4_oif;
2193
2194 fl4->flowi4_iif = LOOPBACK_IFINDEX;
2195 fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2196 fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2197 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2198
2199 rcu_read_lock();
2200 if (fl4->saddr) {
2201 rth = ERR_PTR(-EINVAL);
2202 if (ipv4_is_multicast(fl4->saddr) ||
2203 ipv4_is_lbcast(fl4->saddr) ||
2204 ipv4_is_zeronet(fl4->saddr))
2205 goto out;
2206
2207 /* I removed check for oif == dev_out->oif here.
2208 It was wrong for two reasons:
2209 		   1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2210 is assigned to multiple interfaces.
2211 2. Moreover, we are allowed to send packets with saddr
2212 of another iface. --ANK
2213 */
2214
2215 if (fl4->flowi4_oif == 0 &&
2216 (ipv4_is_multicast(fl4->daddr) ||
2217 ipv4_is_lbcast(fl4->daddr))) {
2218 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2219 dev_out = __ip_dev_find(net, fl4->saddr, false);
2220 if (!dev_out)
2221 goto out;
2222
2223 			/* Special hack: the user can direct multicasts
2224 			   and limited broadcasts via the necessary interface
2225 			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2226 			   This hack is not just for fun, it allows
2227 			   vic, vat and friends to work.
2228 			   They bind the socket to loopback, set the ttl to zero
2229 			   and expect that it will work.
2230 			   From the viewpoint of the routing cache they are broken,
2231 			   because we are not allowed to build a multicast path
2232 			   with a loopback source addr (look, the routing cache
2233 			   cannot know that the ttl is zero, so the packet
2234 			   will not leave this host and the route is valid).
2235 			   Luckily, this hack is a good workaround.
2236 */
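			/* A minimal user-space sketch of that pattern (assumed
			 * usage, not taken from vic/vat themselves):
			 *
			 *	int fd = socket(AF_INET, SOCK_DGRAM, 0);
			 *	struct sockaddr_in lo = {
			 *		.sin_family = AF_INET,
			 *		.sin_addr.s_addr = htonl(INADDR_LOOPBACK),
			 *	};
			 *	unsigned char ttl = 0;
			 *
			 *	bind(fd, (struct sockaddr *)&lo, sizeof(lo));
			 *	setsockopt(fd, IPPROTO_IP, IP_MULTICAST_TTL,
			 *		   &ttl, sizeof(ttl));
			 *
			 * A subsequent sendto() towards a 224.0.0.0/4 address
			 * with no oif set reaches the branch below, which picks
			 * the device owning the bound (loopback) source address
			 * as the output interface.
			 */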
2237
2238 fl4->flowi4_oif = dev_out->ifindex;
2239 goto make_route;
2240 }
2241
2242 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2243 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2244 if (!__ip_dev_find(net, fl4->saddr, false))
2245 goto out;
2246 }
2247 }
2248
2249
2250 if (fl4->flowi4_oif) {
2251 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2252 rth = ERR_PTR(-ENODEV);
2253 if (!dev_out)
2254 goto out;
2255
2256 /* RACE: Check return value of inet_select_addr instead. */
2257 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2258 rth = ERR_PTR(-ENETUNREACH);
2259 goto out;
2260 }
2261 if (ipv4_is_local_multicast(fl4->daddr) ||
2262 ipv4_is_lbcast(fl4->daddr) ||
2263 fl4->flowi4_proto == IPPROTO_IGMP) {
2264 if (!fl4->saddr)
2265 fl4->saddr = inet_select_addr(dev_out, 0,
2266 RT_SCOPE_LINK);
2267 goto make_route;
2268 }
2269 if (!fl4->saddr) {
2270 if (ipv4_is_multicast(fl4->daddr))
2271 fl4->saddr = inet_select_addr(dev_out, 0,
2272 fl4->flowi4_scope);
2273 else if (!fl4->daddr)
2274 fl4->saddr = inet_select_addr(dev_out, 0,
2275 RT_SCOPE_HOST);
2276 }
2277 }
2278
2279 if (!fl4->daddr) {
2280 fl4->daddr = fl4->saddr;
2281 if (!fl4->daddr)
2282 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2283 dev_out = net->loopback_dev;
2284 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2285 res.type = RTN_LOCAL;
2286 flags |= RTCF_LOCAL;
2287 goto make_route;
2288 }
2289
2290 err = fib_lookup(net, fl4, &res, 0);
2291 if (err) {
2292 res.fi = NULL;
2293 res.table = NULL;
2294 if (fl4->flowi4_oif &&
2295 !netif_index_is_l3_master(net, fl4->flowi4_oif)) {
2296 			/* Apparently, the routing tables are wrong. Assume
2297 			   that the destination is on-link.
2298 
2299 			   WHY? DW.
2300 			   Because we are allowed to send to an iface
2301 			   even if it has NO routes and NO assigned
2302 			   addresses. When oif is specified, the routing
2303 			   tables are looked up with only one purpose:
2304 			   to check whether the destination is gatewayed rather
2305 			   than directly connected. Moreover, if MSG_DONTROUTE is set,
2306 			   we send the packet, ignoring both routing tables
2307 			   and ifaddr state. --ANK
2308 
2309 
2310 			   We could do this even when oif is unknown,
2311 			   as IPv6 likely does, but we do not.
2312 */
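			/* E.g. (hypothetical addresses) a send with oif bound
			 * to eth0 towards 192.0.2.9, while no table entry
			 * covers 192.0.2.0/24, still yields an on-link unicast
			 * route out of eth0 here.
			 */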
2313
2314 if (fl4->saddr == 0)
2315 fl4->saddr = inet_select_addr(dev_out, 0,
2316 RT_SCOPE_LINK);
2317 res.type = RTN_UNICAST;
2318 goto make_route;
2319 }
2320 rth = ERR_PTR(err);
2321 goto out;
2322 }
2323
2324 if (res.type == RTN_LOCAL) {
2325 if (!fl4->saddr) {
2326 if (res.fi->fib_prefsrc)
2327 fl4->saddr = res.fi->fib_prefsrc;
2328 else
2329 fl4->saddr = fl4->daddr;
2330 }
2331
2332 /* L3 master device is the loopback for that domain */
2333 dev_out = l3mdev_master_dev_rcu(dev_out) ? : net->loopback_dev;
2334 fl4->flowi4_oif = dev_out->ifindex;
2335 flags |= RTCF_LOCAL;
2336 goto make_route;
2337 }
2338
2339 fib_select_path(net, &res, fl4, mp_hash);
2340
2341 dev_out = FIB_RES_DEV(res);
2342 fl4->flowi4_oif = dev_out->ifindex;
2343
2344
2345 make_route:
2346 rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
2347
2348 out:
2349 rcu_read_unlock();
2350 return rth;
2351 }
2352 EXPORT_SYMBOL_GPL(__ip_route_output_key_hash);
2353
2354 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2355 {
2356 return NULL;
2357 }
2358
2359 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2360 {
2361 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2362
2363 return mtu ? : dst->dev->mtu;
2364 }
2365
2366 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2367 struct sk_buff *skb, u32 mtu)
2368 {
2369 }
2370
2371 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2372 struct sk_buff *skb)
2373 {
2374 }
2375
2376 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2377 unsigned long old)
2378 {
2379 return NULL;
2380 }
2381
2382 static struct dst_ops ipv4_dst_blackhole_ops = {
2383 .family = AF_INET,
2384 .check = ipv4_blackhole_dst_check,
2385 .mtu = ipv4_blackhole_mtu,
2386 .default_advmss = ipv4_default_advmss,
2387 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2388 .redirect = ipv4_rt_blackhole_redirect,
2389 .cow_metrics = ipv4_rt_blackhole_cow_metrics,
2390 .neigh_lookup = ipv4_neigh_lookup,
2391 };
2392
2393 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2394 {
2395 struct rtable *ort = (struct rtable *) dst_orig;
2396 struct rtable *rt;
2397
2398 rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
2399 if (rt) {
2400 struct dst_entry *new = &rt->dst;
2401
2402 new->__use = 1;
2403 new->input = dst_discard;
2404 new->output = dst_discard_out;
2405
2406 new->dev = ort->dst.dev;
2407 if (new->dev)
2408 dev_hold(new->dev);
2409
2410 rt->rt_is_input = ort->rt_is_input;
2411 rt->rt_iif = ort->rt_iif;
2412 rt->rt_pmtu = ort->rt_pmtu;
2413
2414 rt->rt_genid = rt_genid_ipv4(net);
2415 rt->rt_flags = ort->rt_flags;
2416 rt->rt_type = ort->rt_type;
2417 rt->rt_gateway = ort->rt_gateway;
2418 rt->rt_uses_gateway = ort->rt_uses_gateway;
2419
2420 INIT_LIST_HEAD(&rt->rt_uncached);
2421 dst_free(new);
2422 }
2423
2424 dst_release(dst_orig);
2425
2426 return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2427 }
2428
2429 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2430 const struct sock *sk)
2431 {
2432 struct rtable *rt = __ip_route_output_key(net, flp4);
2433
2434 if (IS_ERR(rt))
2435 return rt;
2436
2437 if (flp4->flowi4_proto)
2438 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2439 flowi4_to_flowi(flp4),
2440 sk, 0);
2441
2442 return rt;
2443 }
2444 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2445
2446 static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id,
2447 struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2448 u32 seq, int event, int nowait, unsigned int flags)
2449 {
2450 struct rtable *rt = skb_rtable(skb);
2451 struct rtmsg *r;
2452 struct nlmsghdr *nlh;
2453 unsigned long expires = 0;
2454 u32 error;
2455 u32 metrics[RTAX_MAX];
2456
2457 nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
2458 if (!nlh)
2459 return -EMSGSIZE;
2460
2461 r = nlmsg_data(nlh);
2462 r->rtm_family = AF_INET;
2463 r->rtm_dst_len = 32;
2464 r->rtm_src_len = 0;
2465 r->rtm_tos = fl4->flowi4_tos;
2466 r->rtm_table = table_id < 256 ? table_id : RT_TABLE_COMPAT;
2467 if (nla_put_u32(skb, RTA_TABLE, table_id))
2468 goto nla_put_failure;
2469 r->rtm_type = rt->rt_type;
2470 r->rtm_scope = RT_SCOPE_UNIVERSE;
2471 r->rtm_protocol = RTPROT_UNSPEC;
2472 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2473 if (rt->rt_flags & RTCF_NOTIFY)
2474 r->rtm_flags |= RTM_F_NOTIFY;
2475 if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2476 r->rtm_flags |= RTCF_DOREDIRECT;
2477
2478 if (nla_put_in_addr(skb, RTA_DST, dst))
2479 goto nla_put_failure;
2480 if (src) {
2481 r->rtm_src_len = 32;
2482 if (nla_put_in_addr(skb, RTA_SRC, src))
2483 goto nla_put_failure;
2484 }
2485 if (rt->dst.dev &&
2486 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2487 goto nla_put_failure;
2488 #ifdef CONFIG_IP_ROUTE_CLASSID
2489 if (rt->dst.tclassid &&
2490 nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2491 goto nla_put_failure;
2492 #endif
2493 if (!rt_is_input_route(rt) &&
2494 fl4->saddr != src) {
2495 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2496 goto nla_put_failure;
2497 }
2498 if (rt->rt_uses_gateway &&
2499 nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
2500 goto nla_put_failure;
2501
2502 expires = rt->dst.expires;
2503 if (expires) {
2504 unsigned long now = jiffies;
2505
2506 if (time_before(now, expires))
2507 expires -= now;
2508 else
2509 expires = 0;
2510 }
2511
2512 memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2513 if (rt->rt_pmtu && expires)
2514 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2515 if (rtnetlink_put_metrics(skb, metrics) < 0)
2516 goto nla_put_failure;
2517
2518 if (fl4->flowi4_mark &&
2519 nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2520 goto nla_put_failure;
2521
2522 if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2523 nla_put_u32(skb, RTA_UID,
2524 from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
2525 goto nla_put_failure;
2526
2527 error = rt->dst.error;
2528
2529 if (rt_is_input_route(rt)) {
2530 #ifdef CONFIG_IP_MROUTE
2531 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2532 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2533 int err = ipmr_get_route(net, skb,
2534 fl4->saddr, fl4->daddr,
2535 r, nowait, portid);
2536
2537 if (err <= 0) {
2538 if (!nowait) {
2539 if (err == 0)
2540 return 0;
2541 goto nla_put_failure;
2542 } else {
2543 if (err == -EMSGSIZE)
2544 goto nla_put_failure;
2545 error = err;
2546 }
2547 }
2548 } else
2549 #endif
2550 if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
2551 goto nla_put_failure;
2552 }
2553
2554 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2555 goto nla_put_failure;
2556
2557 nlmsg_end(skb, nlh);
2558 return 0;
2559
2560 nla_put_failure:
2561 nlmsg_cancel(skb, nlh);
2562 return -EMSGSIZE;
2563 }
2564
2565 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
2566 {
2567 struct net *net = sock_net(in_skb->sk);
2568 struct rtmsg *rtm;
2569 struct nlattr *tb[RTA_MAX+1];
2570 struct rtable *rt = NULL;
2571 struct flowi4 fl4;
2572 __be32 dst = 0;
2573 __be32 src = 0;
2574 u32 iif;
2575 int err;
2576 int mark;
2577 struct sk_buff *skb;
2578 u32 table_id = RT_TABLE_MAIN;
2579 kuid_t uid;
2580
2581 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2582 if (err < 0)
2583 goto errout;
2584
2585 rtm = nlmsg_data(nlh);
2586
2587 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2588 if (!skb) {
2589 err = -ENOBUFS;
2590 goto errout;
2591 }
2592
2593 	/* Reserve room for dummy headers; this skb can pass
2594 	   through a good chunk of the routing engine.
2595 */
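	/* The dummy skb built here is what a request such as
	 * "ip route get 203.0.113.1 from 198.51.100.1 iif eth0" (example
	 * addresses) ends up feeding into ip_route_input() below when an
	 * input interface is given; it has to look enough like a received
	 * packet to survive that trip.
	 */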
2596 skb_reset_mac_header(skb);
2597 skb_reset_network_header(skb);
2598
2599 /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2600 ip_hdr(skb)->protocol = IPPROTO_UDP;
2601 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2602
2603 src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
2604 dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
2605 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2606 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2607 if (tb[RTA_UID])
2608 uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
2609 else
2610 uid = (iif ? INVALID_UID : current_uid());
2611
2612 memset(&fl4, 0, sizeof(fl4));
2613 fl4.daddr = dst;
2614 fl4.saddr = src;
2615 fl4.flowi4_tos = rtm->rtm_tos;
2616 fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2617 fl4.flowi4_mark = mark;
2618 fl4.flowi4_uid = uid;
2619
2620 if (iif) {
2621 struct net_device *dev;
2622
2623 dev = __dev_get_by_index(net, iif);
2624 if (!dev) {
2625 err = -ENODEV;
2626 goto errout_free;
2627 }
2628
2629 skb->protocol = htons(ETH_P_IP);
2630 skb->dev = dev;
2631 skb->mark = mark;
2632 local_bh_disable();
2633 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2634 local_bh_enable();
2635
2636 rt = skb_rtable(skb);
2637 if (err == 0 && rt->dst.error)
2638 err = -rt->dst.error;
2639 } else {
2640 rt = ip_route_output_key(net, &fl4);
2641
2642 err = 0;
2643 if (IS_ERR(rt))
2644 err = PTR_ERR(rt);
2645 }
2646
2647 if (err)
2648 goto errout_free;
2649
2650 skb_dst_set(skb, &rt->dst);
2651 if (rtm->rtm_flags & RTM_F_NOTIFY)
2652 rt->rt_flags |= RTCF_NOTIFY;
2653
2654 if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
2655 table_id = rt->rt_table_id;
2656
2657 err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
2658 NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
2659 RTM_NEWROUTE, 0, 0);
2660 if (err < 0)
2661 goto errout_free;
2662
2663 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2664 errout:
2665 return err;
2666
2667 errout_free:
2668 kfree_skb(skb);
2669 goto errout;
2670 }
2671
2672 void ip_rt_multicast_event(struct in_device *in_dev)
2673 {
2674 rt_cache_flush(dev_net(in_dev->dev));
2675 }
2676
2677 #ifdef CONFIG_SYSCTL
2678 static int ip_rt_gc_interval __read_mostly = 60 * HZ;
2679 static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
2680 static int ip_rt_gc_elasticity __read_mostly = 8;
2681
2682 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
2683 void __user *buffer,
2684 size_t *lenp, loff_t *ppos)
2685 {
2686 struct net *net = (struct net *)__ctl->extra1;
2687
2688 if (write) {
2689 rt_cache_flush(net);
2690 fnhe_genid_bump(net);
2691 return 0;
2692 }
2693
2694 return -EINVAL;
2695 }
2696
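/* These knobs appear under /proc/sys/net/ipv4/route/ (for example
 * /proc/sys/net/ipv4/route/gc_elasticity); the per-namespace write-only
 * "flush" entry registered further below invalidates cached routes when
 * written to.
 */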
2697 static struct ctl_table ipv4_route_table[] = {
2698 {
2699 .procname = "gc_thresh",
2700 .data = &ipv4_dst_ops.gc_thresh,
2701 .maxlen = sizeof(int),
2702 .mode = 0644,
2703 .proc_handler = proc_dointvec,
2704 },
2705 {
2706 .procname = "max_size",
2707 .data = &ip_rt_max_size,
2708 .maxlen = sizeof(int),
2709 .mode = 0644,
2710 .proc_handler = proc_dointvec,
2711 },
2712 {
2713 /* Deprecated. Use gc_min_interval_ms */
2714
2715 .procname = "gc_min_interval",
2716 .data = &ip_rt_gc_min_interval,
2717 .maxlen = sizeof(int),
2718 .mode = 0644,
2719 .proc_handler = proc_dointvec_jiffies,
2720 },
2721 {
2722 .procname = "gc_min_interval_ms",
2723 .data = &ip_rt_gc_min_interval,
2724 .maxlen = sizeof(int),
2725 .mode = 0644,
2726 .proc_handler = proc_dointvec_ms_jiffies,
2727 },
2728 {
2729 .procname = "gc_timeout",
2730 .data = &ip_rt_gc_timeout,
2731 .maxlen = sizeof(int),
2732 .mode = 0644,
2733 .proc_handler = proc_dointvec_jiffies,
2734 },
2735 {
2736 .procname = "gc_interval",
2737 .data = &ip_rt_gc_interval,
2738 .maxlen = sizeof(int),
2739 .mode = 0644,
2740 .proc_handler = proc_dointvec_jiffies,
2741 },
2742 {
2743 .procname = "redirect_load",
2744 .data = &ip_rt_redirect_load,
2745 .maxlen = sizeof(int),
2746 .mode = 0644,
2747 .proc_handler = proc_dointvec,
2748 },
2749 {
2750 .procname = "redirect_number",
2751 .data = &ip_rt_redirect_number,
2752 .maxlen = sizeof(int),
2753 .mode = 0644,
2754 .proc_handler = proc_dointvec,
2755 },
2756 {
2757 .procname = "redirect_silence",
2758 .data = &ip_rt_redirect_silence,
2759 .maxlen = sizeof(int),
2760 .mode = 0644,
2761 .proc_handler = proc_dointvec,
2762 },
2763 {
2764 .procname = "error_cost",
2765 .data = &ip_rt_error_cost,
2766 .maxlen = sizeof(int),
2767 .mode = 0644,
2768 .proc_handler = proc_dointvec,
2769 },
2770 {
2771 .procname = "error_burst",
2772 .data = &ip_rt_error_burst,
2773 .maxlen = sizeof(int),
2774 .mode = 0644,
2775 .proc_handler = proc_dointvec,
2776 },
2777 {
2778 .procname = "gc_elasticity",
2779 .data = &ip_rt_gc_elasticity,
2780 .maxlen = sizeof(int),
2781 .mode = 0644,
2782 .proc_handler = proc_dointvec,
2783 },
2784 {
2785 .procname = "mtu_expires",
2786 .data = &ip_rt_mtu_expires,
2787 .maxlen = sizeof(int),
2788 .mode = 0644,
2789 .proc_handler = proc_dointvec_jiffies,
2790 },
2791 {
2792 .procname = "min_pmtu",
2793 .data = &ip_rt_min_pmtu,
2794 .maxlen = sizeof(int),
2795 .mode = 0644,
2796 .proc_handler = proc_dointvec_minmax,
2797 .extra1 = &ip_min_valid_pmtu,
2798 },
2799 {
2800 .procname = "min_adv_mss",
2801 .data = &ip_rt_min_advmss,
2802 .maxlen = sizeof(int),
2803 .mode = 0644,
2804 .proc_handler = proc_dointvec,
2805 },
2806 { }
2807 };
2808
2809 static struct ctl_table ipv4_route_flush_table[] = {
2810 {
2811 .procname = "flush",
2812 .maxlen = sizeof(int),
2813 .mode = 0200,
2814 .proc_handler = ipv4_sysctl_rtcache_flush,
2815 },
2816 { },
2817 };
2818
2819 static __net_init int sysctl_route_net_init(struct net *net)
2820 {
2821 struct ctl_table *tbl;
2822
2823 tbl = ipv4_route_flush_table;
2824 if (!net_eq(net, &init_net)) {
2825 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2826 if (!tbl)
2827 goto err_dup;
2828
2829 /* Don't export sysctls to unprivileged users */
2830 if (net->user_ns != &init_user_ns)
2831 tbl[0].procname = NULL;
2832 }
2833 tbl[0].extra1 = net;
2834
2835 net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2836 if (!net->ipv4.route_hdr)
2837 goto err_reg;
2838 return 0;
2839
2840 err_reg:
2841 if (tbl != ipv4_route_flush_table)
2842 kfree(tbl);
2843 err_dup:
2844 return -ENOMEM;
2845 }
2846
2847 static __net_exit void sysctl_route_net_exit(struct net *net)
2848 {
2849 struct ctl_table *tbl;
2850
2851 tbl = net->ipv4.route_hdr->ctl_table_arg;
2852 unregister_net_sysctl_table(net->ipv4.route_hdr);
2853 BUG_ON(tbl == ipv4_route_flush_table);
2854 kfree(tbl);
2855 }
2856
2857 static __net_initdata struct pernet_operations sysctl_route_ops = {
2858 .init = sysctl_route_net_init,
2859 .exit = sysctl_route_net_exit,
2860 };
2861 #endif
2862
2863 static __net_init int rt_genid_init(struct net *net)
2864 {
2865 atomic_set(&net->ipv4.rt_genid, 0);
2866 atomic_set(&net->fnhe_genid, 0);
2867 get_random_bytes(&net->ipv4.dev_addr_genid,
2868 sizeof(net->ipv4.dev_addr_genid));
2869 return 0;
2870 }
2871
2872 static __net_initdata struct pernet_operations rt_genid_ops = {
2873 .init = rt_genid_init,
2874 };
2875
2876 static int __net_init ipv4_inetpeer_init(struct net *net)
2877 {
2878 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2879
2880 if (!bp)
2881 return -ENOMEM;
2882 inet_peer_base_init(bp);
2883 net->ipv4.peers = bp;
2884 return 0;
2885 }
2886
2887 static void __net_exit ipv4_inetpeer_exit(struct net *net)
2888 {
2889 struct inet_peer_base *bp = net->ipv4.peers;
2890
2891 net->ipv4.peers = NULL;
2892 inetpeer_invalidate_tree(bp);
2893 kfree(bp);
2894 }
2895
2896 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
2897 .init = ipv4_inetpeer_init,
2898 .exit = ipv4_inetpeer_exit,
2899 };
2900
2901 #ifdef CONFIG_IP_ROUTE_CLASSID
2902 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
2903 #endif /* CONFIG_IP_ROUTE_CLASSID */
2904
2905 int __init ip_rt_init(void)
2906 {
2907 int rc = 0;
2908 int cpu;
2909
2910 ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
2911 if (!ip_idents)
2912 panic("IP: failed to allocate ip_idents\n");
2913
2914 prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
2915
2916 ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
2917 if (!ip_tstamps)
2918 panic("IP: failed to allocate ip_tstamps\n");
2919
2920 for_each_possible_cpu(cpu) {
2921 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
2922
2923 INIT_LIST_HEAD(&ul->head);
2924 spin_lock_init(&ul->lock);
2925 }
2926 #ifdef CONFIG_IP_ROUTE_CLASSID
2927 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
2928 if (!ip_rt_acct)
2929 panic("IP: failed to allocate ip_rt_acct\n");
2930 #endif
2931
2932 ipv4_dst_ops.kmem_cachep =
2933 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2934 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2935
2936 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2937
2938 if (dst_entries_init(&ipv4_dst_ops) < 0)
2939 panic("IP: failed to allocate ipv4_dst_ops counter\n");
2940
2941 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
2942 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
2943
2944 ipv4_dst_ops.gc_thresh = ~0;
2945 ip_rt_max_size = INT_MAX;
2946
2947 devinet_init();
2948 ip_fib_init();
2949
2950 if (ip_rt_proc_init())
2951 pr_err("Unable to create route proc files\n");
2952 #ifdef CONFIG_XFRM
2953 xfrm_init();
2954 xfrm4_init();
2955 #endif
2956 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
2957
2958 #ifdef CONFIG_SYSCTL
2959 register_pernet_subsys(&sysctl_route_ops);
2960 #endif
2961 register_pernet_subsys(&rt_genid_ops);
2962 register_pernet_subsys(&ipv4_inetpeer_ops);
2963 return rc;
2964 }
2965
2966 #ifdef CONFIG_SYSCTL
2967 /*
2968 * We really need to sanitize the damn ipv4 init order, then all
2969 * this nonsense will go away.
2970 */
2971 void __init ip_static_sysctl_init(void)
2972 {
2973 register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
2974 }
2975 #endif
2976