1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * ROUTE - implementation of the IP router.
7 *
8 * Authors: Ross Biro
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13 *
14 * Fixes:
15 * Alan Cox : Verify area fixes.
16 * Alan Cox : cli() protects routing changes
17 * Rui Oliveira : ICMP routing table updates
18 * (rco@di.uminho.pt) Routing table insertion and update
19 * Linus Torvalds : Rewrote bits to be sensible
20 * Alan Cox : Added BSD route gw semantics
21 * Alan Cox : Super /proc >4K
22 * Alan Cox : MTU in route table
23 * Alan Cox : MSS actually. Also added the window
24 * clamper.
25 * Sam Lantinga : Fixed route matching in rt_del()
26 * Alan Cox : Routing cache support.
27 * Alan Cox : Removed compatibility cruft.
28 * Alan Cox : RTF_REJECT support.
29 * Alan Cox : TCP irtt support.
30 * Jonathan Naylor : Added Metric support.
31 * Miquel van Smoorenburg : BSD API fixes.
32 * Miquel van Smoorenburg : Metrics.
33 * Alan Cox : Use __u32 properly
34 * Alan Cox : Aligned routing errors more closely with BSD
35 * our system is still very different.
36 * Alan Cox : Faster /proc handling
37 * Alexey Kuznetsov : Massive rework to support tree based routing,
38 * routing caches and better behaviour.
39 *
40 * Olaf Erb : irtt wasn't being copied right.
41 * Bjorn Ekwall : Kerneld route support.
42 * Alan Cox : Multicast fixed (I hope)
43 * Pavel Krauz : Limited broadcast fixed
44 * Mike McLagan : Routing by source
45 * Alexey Kuznetsov : End of old history. Split to fib.c and
46 * route.c and rewritten from scratch.
47 * Andi Kleen : Load-limit warning messages.
48 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
49 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
50 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
51 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
52 * Marc Boucher : routing by fwmark
53 * Robert Olsson : Added rt_cache statistics
54 * Arnaldo C. Melo : Convert proc stuff to seq_file
55 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
56 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
57 * Ilia Sotnikov : Removed TOS from hash calculations
58 *
59 * This program is free software; you can redistribute it and/or
60 * modify it under the terms of the GNU General Public License
61 * as published by the Free Software Foundation; either version
62 * 2 of the License, or (at your option) any later version.
63 */
64
65 #define pr_fmt(fmt) "IPv4: " fmt
66
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <linux/bitops.h>
70 #include <linux/types.h>
71 #include <linux/kernel.h>
72 #include <linux/mm.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/skbuff.h>
83 #include <linux/inetdevice.h>
84 #include <linux/igmp.h>
85 #include <linux/pkt_sched.h>
86 #include <linux/mroute.h>
87 #include <linux/netfilter_ipv4.h>
88 #include <linux/random.h>
89 #include <linux/rcupdate.h>
90 #include <linux/times.h>
91 #include <linux/slab.h>
92 #include <net/dst.h>
93 #include <net/net_namespace.h>
94 #include <net/protocol.h>
95 #include <net/ip.h>
96 #include <net/route.h>
97 #include <net/inetpeer.h>
98 #include <net/sock.h>
99 #include <net/ip_fib.h>
100 #include <net/arp.h>
101 #include <net/tcp.h>
102 #include <net/icmp.h>
103 #include <net/xfrm.h>
104 #include <net/netevent.h>
105 #include <net/rtnetlink.h>
106 #ifdef CONFIG_SYSCTL
107 #include <linux/sysctl.h>
108 #include <linux/kmemleak.h>
109 #endif
110 #include <net/secure_seq.h>
111
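/* RT_FL_TOS() below keeps just the TOS bits that matter for routing
 * (IPTOS_RT_MASK) plus the legacy RTO_ONLINK flag, which
 * __ip_route_output_key() later folds into the flow scope.
 */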
112 #define RT_FL_TOS(oldflp4) \
113 ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
114
115 #define IP_MAX_MTU 0xFFF0
116
117 #define RT_GC_TIMEOUT (300*HZ)
118
119 static int ip_rt_max_size;
120 static int ip_rt_redirect_number __read_mostly = 9;
121 static int ip_rt_redirect_load __read_mostly = HZ / 50;
122 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
123 static int ip_rt_error_cost __read_mostly = HZ;
124 static int ip_rt_error_burst __read_mostly = 5 * HZ;
125 static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
126 static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
127 static int ip_rt_min_advmss __read_mostly = 256;
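/* Rough reading of the defaults above, assuming HZ-based jiffies: at most
 * 9 redirects are sent per peer, spaced by an exponentially growing gap
 * derived from ip_rt_redirect_load (~20 ms), and ~20 s of silence
 * (ip_rt_redirect_silence) resets the counter.  ICMP errors are paced at
 * about one per second (ip_rt_error_cost) with a burst allowance of ~5
 * (ip_rt_error_burst).  Learned PMTU values expire after 10 minutes and
 * are never recorded below 552 bytes (512 + 20 IP + 20 TCP).
 */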
128
129 /*
130 * Interface to generic destination cache.
131 */
132
133 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
134 static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
135 static unsigned int ipv4_mtu(const struct dst_entry *dst);
136 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
137 static void ipv4_link_failure(struct sk_buff *skb);
138 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
139 struct sk_buff *skb, u32 mtu);
140 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk,
141 struct sk_buff *skb);
142 static void ipv4_dst_destroy(struct dst_entry *dst);
143
144 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
145 int how)
146 {
147 }
148
149 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
150 {
151 WARN_ON(1);
152 return NULL;
153 }
154
155 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
156 struct sk_buff *skb,
157 const void *daddr);
158
159 static struct dst_ops ipv4_dst_ops = {
160 .family = AF_INET,
161 .protocol = cpu_to_be16(ETH_P_IP),
162 .check = ipv4_dst_check,
163 .default_advmss = ipv4_default_advmss,
164 .mtu = ipv4_mtu,
165 .cow_metrics = ipv4_cow_metrics,
166 .destroy = ipv4_dst_destroy,
167 .ifdown = ipv4_dst_ifdown,
168 .negative_advice = ipv4_negative_advice,
169 .link_failure = ipv4_link_failure,
170 .update_pmtu = ip_rt_update_pmtu,
171 .redirect = ip_do_redirect,
172 .local_out = __ip_local_out,
173 .neigh_lookup = ipv4_neigh_lookup,
174 };
175
176 #define ECN_OR_COST(class) TC_PRIO_##class
177
178 const __u8 ip_tos2prio[16] = {
179 TC_PRIO_BESTEFFORT,
180 ECN_OR_COST(BESTEFFORT),
181 TC_PRIO_BESTEFFORT,
182 ECN_OR_COST(BESTEFFORT),
183 TC_PRIO_BULK,
184 ECN_OR_COST(BULK),
185 TC_PRIO_BULK,
186 ECN_OR_COST(BULK),
187 TC_PRIO_INTERACTIVE,
188 ECN_OR_COST(INTERACTIVE),
189 TC_PRIO_INTERACTIVE,
190 ECN_OR_COST(INTERACTIVE),
191 TC_PRIO_INTERACTIVE_BULK,
192 ECN_OR_COST(INTERACTIVE_BULK),
193 TC_PRIO_INTERACTIVE_BULK,
194 ECN_OR_COST(INTERACTIVE_BULK)
195 };
196 EXPORT_SYMBOL(ip_tos2prio);
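/* The table is indexed with the IPTOS_TOS() bits of the TOS byte shifted
 * right by one (see rt_tos2priority() in include/net/route.h); the
 * ECN_OR_COST() entries sit at the odd indexes, where the lowest of those
 * bits is set, and with the definition above they currently map to the
 * same priority as their even neighbours.
 */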
197
198 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
199 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
200
201 #ifdef CONFIG_PROC_FS
202 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
203 {
204 if (*pos)
205 return NULL;
206 return SEQ_START_TOKEN;
207 }
208
209 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
210 {
211 ++*pos;
212 return NULL;
213 }
214
215 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
216 {
217 }
218
219 static int rt_cache_seq_show(struct seq_file *seq, void *v)
220 {
221 if (v == SEQ_START_TOKEN)
222 seq_printf(seq, "%-127s\n",
223 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
224 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
225 "HHUptod\tSpecDst");
226 return 0;
227 }
228
229 static const struct seq_operations rt_cache_seq_ops = {
230 .start = rt_cache_seq_start,
231 .next = rt_cache_seq_next,
232 .stop = rt_cache_seq_stop,
233 .show = rt_cache_seq_show,
234 };
235
236 static int rt_cache_seq_open(struct inode *inode, struct file *file)
237 {
238 return seq_open(file, &rt_cache_seq_ops);
239 }
240
241 static const struct file_operations rt_cache_seq_fops = {
242 .owner = THIS_MODULE,
243 .open = rt_cache_seq_open,
244 .read = seq_read,
245 .llseek = seq_lseek,
246 .release = seq_release,
247 };
248
249
250 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
251 {
252 int cpu;
253
254 if (*pos == 0)
255 return SEQ_START_TOKEN;
256
257 for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
258 if (!cpu_possible(cpu))
259 continue;
260 *pos = cpu+1;
261 return &per_cpu(rt_cache_stat, cpu);
262 }
263 return NULL;
264 }
265
266 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
267 {
268 int cpu;
269
270 for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
271 if (!cpu_possible(cpu))
272 continue;
273 *pos = cpu+1;
274 return &per_cpu(rt_cache_stat, cpu);
275 }
276 return NULL;
277
278 }
279
280 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
281 {
282
283 }
284
285 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
286 {
287 struct rt_cache_stat *st = v;
288
289 if (v == SEQ_START_TOKEN) {
290 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
291 return 0;
292 }
293
294 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
295 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
296 dst_entries_get_slow(&ipv4_dst_ops),
297 st->in_hit,
298 st->in_slow_tot,
299 st->in_slow_mc,
300 st->in_no_route,
301 st->in_brd,
302 st->in_martian_dst,
303 st->in_martian_src,
304
305 st->out_hit,
306 st->out_slow_tot,
307 st->out_slow_mc,
308
309 st->gc_total,
310 st->gc_ignored,
311 st->gc_goal_miss,
312 st->gc_dst_overflow,
313 st->in_hlist_search,
314 st->out_hlist_search
315 );
316 return 0;
317 }
318
319 static const struct seq_operations rt_cpu_seq_ops = {
320 .start = rt_cpu_seq_start,
321 .next = rt_cpu_seq_next,
322 .stop = rt_cpu_seq_stop,
323 .show = rt_cpu_seq_show,
324 };
325
326
327 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
328 {
329 return seq_open(file, &rt_cpu_seq_ops);
330 }
331
332 static const struct file_operations rt_cpu_seq_fops = {
333 .owner = THIS_MODULE,
334 .open = rt_cpu_seq_open,
335 .read = seq_read,
336 .llseek = seq_lseek,
337 .release = seq_release,
338 };
339
340 #ifdef CONFIG_IP_ROUTE_CLASSID
341 static int rt_acct_proc_show(struct seq_file *m, void *v)
342 {
343 struct ip_rt_acct *dst, *src;
344 unsigned int i, j;
345
346 dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
347 if (!dst)
348 return -ENOMEM;
349
350 for_each_possible_cpu(i) {
351 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
352 for (j = 0; j < 256; j++) {
353 dst[j].o_bytes += src[j].o_bytes;
354 dst[j].o_packets += src[j].o_packets;
355 dst[j].i_bytes += src[j].i_bytes;
356 dst[j].i_packets += src[j].i_packets;
357 }
358 }
359
360 seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
361 kfree(dst);
362 return 0;
363 }
364
365 static int rt_acct_proc_open(struct inode *inode, struct file *file)
366 {
367 return single_open(file, rt_acct_proc_show, NULL);
368 }
369
370 static const struct file_operations rt_acct_proc_fops = {
371 .owner = THIS_MODULE,
372 .open = rt_acct_proc_open,
373 .read = seq_read,
374 .llseek = seq_lseek,
375 .release = single_release,
376 };
377 #endif
378
379 static int __net_init ip_rt_do_proc_init(struct net *net)
380 {
381 struct proc_dir_entry *pde;
382
383 pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
384 &rt_cache_seq_fops);
385 if (!pde)
386 goto err1;
387
388 pde = proc_create("rt_cache", S_IRUGO,
389 net->proc_net_stat, &rt_cpu_seq_fops);
390 if (!pde)
391 goto err2;
392
393 #ifdef CONFIG_IP_ROUTE_CLASSID
394 pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
395 if (!pde)
396 goto err3;
397 #endif
398 return 0;
399
400 #ifdef CONFIG_IP_ROUTE_CLASSID
401 err3:
402 remove_proc_entry("rt_cache", net->proc_net_stat);
403 #endif
404 err2:
405 remove_proc_entry("rt_cache", net->proc_net);
406 err1:
407 return -ENOMEM;
408 }
409
410 static void __net_exit ip_rt_do_proc_exit(struct net *net)
411 {
412 remove_proc_entry("rt_cache", net->proc_net_stat);
413 remove_proc_entry("rt_cache", net->proc_net);
414 #ifdef CONFIG_IP_ROUTE_CLASSID
415 remove_proc_entry("rt_acct", net->proc_net);
416 #endif
417 }
418
419 static struct pernet_operations ip_rt_proc_ops __net_initdata = {
420 .init = ip_rt_do_proc_init,
421 .exit = ip_rt_do_proc_exit,
422 };
423
424 static int __init ip_rt_proc_init(void)
425 {
426 return register_pernet_subsys(&ip_rt_proc_ops);
427 }
428
429 #else
430 static inline int ip_rt_proc_init(void)
431 {
432 return 0;
433 }
434 #endif /* CONFIG_PROC_FS */
435
436 static inline bool rt_is_expired(const struct rtable *rth)
437 {
438 return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
439 }
440
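/* "Flushing" the cache is just a generation bump: every cached rtable
 * carries the genid it was created under, so rt_is_expired() above makes
 * all older entries fail validation lazily instead of having to walk and
 * free them eagerly.
 */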
441 void rt_cache_flush(struct net *net)
442 {
443 rt_genid_bump(net);
444 }
445
446 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
447 struct sk_buff *skb,
448 const void *daddr)
449 {
450 struct net_device *dev = dst->dev;
451 const __be32 *pkey = daddr;
452 const struct rtable *rt;
453 struct neighbour *n;
454
455 rt = (const struct rtable *) dst;
456 if (rt->rt_gateway)
457 pkey = (const __be32 *) &rt->rt_gateway;
458 else if (skb)
459 pkey = &ip_hdr(skb)->daddr;
460
461 n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
462 if (n)
463 return n;
464 return neigh_create(&arp_tbl, pkey, dev);
465 }
466
467 /*
468  * Peer allocation may fail only in serious out-of-memory conditions.  However,
469  * we can still generate some output.
470  * Random ID selection looks a bit dangerous because we have no chance of
471  * selecting an ID that stays unique for a reasonable period of time.
472  * But a broken packet identifier may be better than no packet at all.
473 */
474 static void ip_select_fb_ident(struct iphdr *iph)
475 {
476 static DEFINE_SPINLOCK(ip_fb_id_lock);
477 static u32 ip_fallback_id;
478 u32 salt;
479
480 spin_lock_bh(&ip_fb_id_lock);
481 salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
482 iph->id = htons(salt & 0xFFFF);
483 ip_fallback_id = salt;
484 spin_unlock_bh(&ip_fb_id_lock);
485 }
486
487 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
488 {
489 struct net *net = dev_net(dst->dev);
490 struct inet_peer *peer;
491
492 peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1);
493 if (peer) {
494 iph->id = htons(inet_getid(peer, more));
495 inet_putpeer(peer);
496 return;
497 }
498
499 ip_select_fb_ident(iph);
500 }
501 EXPORT_SYMBOL(__ip_select_ident);
502
503 static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
504 const struct sock *sk,
505 const struct iphdr *iph,
506 int oif, u8 tos,
507 u8 prot, u32 mark, int flow_flags)
508 {
509 if (sk) {
510 const struct inet_sock *inet = inet_sk(sk);
511
512 oif = sk->sk_bound_dev_if;
513 mark = sk->sk_mark;
514 tos = RT_CONN_FLAGS(sk);
515 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
516 }
517 flowi4_init_output(fl4, oif, mark, tos,
518 RT_SCOPE_UNIVERSE, prot,
519 flow_flags,
520 iph->daddr, iph->saddr, 0, 0,
521 sock_net_uid(net, sk));
522 }
523
524 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
525 const struct sock *sk)
526 {
527 const struct net *net = dev_net(skb->dev);
528 const struct iphdr *iph = ip_hdr(skb);
529 int oif = skb->dev->ifindex;
530 u8 tos = RT_TOS(iph->tos);
531 u8 prot = iph->protocol;
532 u32 mark = skb->mark;
533
534 __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
535 }
536
537 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
538 {
539 const struct inet_sock *inet = inet_sk(sk);
540 const struct ip_options_rcu *inet_opt;
541 __be32 daddr = inet->inet_daddr;
542
543 rcu_read_lock();
544 inet_opt = rcu_dereference(inet->inet_opt);
545 if (inet_opt && inet_opt->opt.srr)
546 daddr = inet_opt->opt.faddr;
547 flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
548 RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
549 inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
550 inet_sk_flowi_flags(sk),
551 daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
552 rcu_read_unlock();
553 }
554
555 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
556 const struct sk_buff *skb)
557 {
558 if (skb)
559 build_skb_flow_key(fl4, skb, sk);
560 else
561 build_sk_flow_key(fl4, sk);
562 }
563
564 static inline void rt_free(struct rtable *rt)
565 {
566 call_rcu(&rt->dst.rcu_head, dst_rcu_free);
567 }
568
569 static DEFINE_SPINLOCK(fnhe_lock);
570
571 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
572 {
573 struct fib_nh_exception *fnhe, *oldest;
574 struct rtable *orig;
575
576 oldest = rcu_dereference(hash->chain);
577 for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
578 fnhe = rcu_dereference(fnhe->fnhe_next)) {
579 if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
580 oldest = fnhe;
581 }
582 orig = rcu_dereference(oldest->fnhe_rth);
583 if (orig) {
584 RCU_INIT_POINTER(oldest->fnhe_rth, NULL);
585 rt_free(orig);
586 }
587 return oldest;
588 }
589
590 static inline u32 fnhe_hashfun(__be32 daddr)
591 {
592 u32 hval;
593
594 hval = (__force u32) daddr;
595 hval ^= (hval >> 11) ^ (hval >> 22);
596
597 return hval & (FNHE_HASH_SIZE - 1);
598 }
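/* The hash simply xor-folds the 32-bit destination with itself shifted
 * right by 11 and 22 bits and masks the result to FNHE_HASH_SIZE - 1, so
 * every exception lookup for a given destination lands in the same small
 * per-nexthop bucket.
 */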
599
600 static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
601 u32 pmtu, unsigned long expires)
602 {
603 struct fnhe_hash_bucket *hash;
604 struct fib_nh_exception *fnhe;
605 int depth;
606 u32 hval = fnhe_hashfun(daddr);
607
608 spin_lock_bh(&fnhe_lock);
609
610 hash = nh->nh_exceptions;
611 if (!hash) {
612 hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
613 if (!hash)
614 goto out_unlock;
615 nh->nh_exceptions = hash;
616 }
617
618 hash += hval;
619
620 depth = 0;
621 for (fnhe = rcu_dereference(hash->chain); fnhe;
622 fnhe = rcu_dereference(fnhe->fnhe_next)) {
623 if (fnhe->fnhe_daddr == daddr)
624 break;
625 depth++;
626 }
627
628 if (fnhe) {
629 if (gw)
630 fnhe->fnhe_gw = gw;
631 if (pmtu) {
632 fnhe->fnhe_pmtu = pmtu;
633 fnhe->fnhe_expires = expires;
634 }
635 } else {
636 if (depth > FNHE_RECLAIM_DEPTH)
637 fnhe = fnhe_oldest(hash);
638 else {
639 fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
640 if (!fnhe)
641 goto out_unlock;
642
643 fnhe->fnhe_next = hash->chain;
644 rcu_assign_pointer(hash->chain, fnhe);
645 }
646 fnhe->fnhe_daddr = daddr;
647 fnhe->fnhe_gw = gw;
648 fnhe->fnhe_pmtu = pmtu;
649 fnhe->fnhe_expires = expires;
650 }
651
652 fnhe->fnhe_stamp = jiffies;
653
654 out_unlock:
655 spin_unlock_bh(&fnhe_lock);
656 return;
657 }
658
659 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
660 bool kill_route)
661 {
662 __be32 new_gw = icmp_hdr(skb)->un.gateway;
663 __be32 old_gw = ip_hdr(skb)->saddr;
664 struct net_device *dev = skb->dev;
665 struct in_device *in_dev;
666 struct fib_result res;
667 struct neighbour *n;
668 struct net *net;
669
670 switch (icmp_hdr(skb)->code & 7) {
671 case ICMP_REDIR_NET:
672 case ICMP_REDIR_NETTOS:
673 case ICMP_REDIR_HOST:
674 case ICMP_REDIR_HOSTTOS:
675 break;
676
677 default:
678 return;
679 }
680
681 if (rt->rt_gateway != old_gw)
682 return;
683
684 in_dev = __in_dev_get_rcu(dev);
685 if (!in_dev)
686 return;
687
688 net = dev_net(dev);
689 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
690 ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
691 ipv4_is_zeronet(new_gw))
692 goto reject_redirect;
693
694 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
695 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
696 goto reject_redirect;
697 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
698 goto reject_redirect;
699 } else {
700 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
701 goto reject_redirect;
702 }
703
704 n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
705 if (n) {
706 if (!(n->nud_state & NUD_VALID)) {
707 neigh_event_send(n, NULL);
708 } else {
709 if (fib_lookup(net, fl4, &res) == 0) {
710 struct fib_nh *nh = &FIB_RES_NH(res);
711
712 update_or_create_fnhe(nh, fl4->daddr, new_gw,
713 0, 0);
714 }
715 if (kill_route)
716 rt->dst.obsolete = DST_OBSOLETE_KILL;
717 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
718 }
719 neigh_release(n);
720 }
721 return;
722
723 reject_redirect:
724 #ifdef CONFIG_IP_ROUTE_VERBOSE
725 if (IN_DEV_LOG_MARTIANS(in_dev)) {
726 const struct iphdr *iph = (const struct iphdr *) skb->data;
727 __be32 daddr = iph->daddr;
728 __be32 saddr = iph->saddr;
729
730 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
731 " Advised path = %pI4 -> %pI4\n",
732 &old_gw, dev->name, &new_gw,
733 &saddr, &daddr);
734 }
735 #endif
736 ;
737 }
738
739 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
740 {
741 struct rtable *rt;
742 struct flowi4 fl4;
743 const struct iphdr *iph = (const struct iphdr *) skb->data;
744 struct net *net = dev_net(skb->dev);
745 int oif = skb->dev->ifindex;
746 u8 tos = RT_TOS(iph->tos);
747 u8 prot = iph->protocol;
748 u32 mark = skb->mark;
749
750 rt = (struct rtable *) dst;
751
752 __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
753 __ip_do_redirect(rt, skb, &fl4, true);
754 }
755
756 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
757 {
758 struct rtable *rt = (struct rtable *)dst;
759 struct dst_entry *ret = dst;
760
761 if (rt) {
762 if (dst->obsolete > 0) {
763 ip_rt_put(rt);
764 ret = NULL;
765 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
766 rt->dst.expires) {
767 ip_rt_put(rt);
768 ret = NULL;
769 }
770 }
771 return ret;
772 }
773
774 /*
775 * Algorithm:
776  *	1. The first ip_rt_redirect_number redirects are sent
777  *	   with exponential backoff, then we stop sending them altogether,
778  *	   assuming that the host ignores our redirects.
779  *	2. If we did not see packets requiring redirects
780  *	   during ip_rt_redirect_silence, we assume that the host
781  *	   has forgotten the redirected route and start sending redirects again.
782 *
783 * This algorithm is much cheaper and more intelligent than dumb load limiting
784 * in icmp.c.
785 *
786 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
787 * and "frag. need" (breaks PMTU discovery) in icmp.c.
788 */
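/* With the default sysctls this works out to roughly: the first redirect
 * is sent immediately, each following one only after a gap that doubles
 * (~40 ms, 80 ms, 160 ms, ...), and after 9 ignored redirects we give up
 * until ~20 s pass without redirect-triggering traffic.
 */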
789
790 void ip_rt_send_redirect(struct sk_buff *skb)
791 {
792 struct rtable *rt = skb_rtable(skb);
793 struct in_device *in_dev;
794 struct inet_peer *peer;
795 struct net *net;
796 int log_martians;
797
798 rcu_read_lock();
799 in_dev = __in_dev_get_rcu(rt->dst.dev);
800 if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
801 rcu_read_unlock();
802 return;
803 }
804 log_martians = IN_DEV_LOG_MARTIANS(in_dev);
805 rcu_read_unlock();
806
807 net = dev_net(rt->dst.dev);
808 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
809 if (!peer) {
810 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
811 rt_nexthop(rt, ip_hdr(skb)->daddr));
812 return;
813 }
814
815 /* No redirected packets during ip_rt_redirect_silence;
816 * reset the algorithm.
817 */
818 if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
819 peer->rate_tokens = 0;
820
821 	/* Too many ignored redirects; do not send anything and
822 	 * set peer->rate_last to the time the last redirect-worthy packet was seen.
823 */
824 if (peer->rate_tokens >= ip_rt_redirect_number) {
825 peer->rate_last = jiffies;
826 goto out_put_peer;
827 }
828
829 /* Check for load limit; set rate_last to the latest sent
830 * redirect.
831 */
832 if (peer->rate_tokens == 0 ||
833 time_after(jiffies,
834 (peer->rate_last +
835 (ip_rt_redirect_load << peer->rate_tokens)))) {
836 __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
837
838 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
839 peer->rate_last = jiffies;
840 ++peer->rate_tokens;
841 #ifdef CONFIG_IP_ROUTE_VERBOSE
842 if (log_martians &&
843 peer->rate_tokens == ip_rt_redirect_number)
844 net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
845 &ip_hdr(skb)->saddr, inet_iif(skb),
846 &ip_hdr(skb)->daddr, &gw);
847 #endif
848 }
849 out_put_peer:
850 inet_putpeer(peer);
851 }
852
853 static int ip_error(struct sk_buff *skb)
854 {
855 struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
856 struct rtable *rt = skb_rtable(skb);
857 struct inet_peer *peer;
858 unsigned long now;
859 struct net *net;
860 bool send;
861 int code;
862
863 net = dev_net(rt->dst.dev);
864 if (!IN_DEV_FORWARD(in_dev)) {
865 switch (rt->dst.error) {
866 case EHOSTUNREACH:
867 IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
868 break;
869
870 case ENETUNREACH:
871 IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
872 break;
873 }
874 goto out;
875 }
876
877 switch (rt->dst.error) {
878 case EINVAL:
879 default:
880 goto out;
881 case EHOSTUNREACH:
882 code = ICMP_HOST_UNREACH;
883 break;
884 case ENETUNREACH:
885 code = ICMP_NET_UNREACH;
886 IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
887 break;
888 case EACCES:
889 code = ICMP_PKT_FILTERED;
890 break;
891 }
892
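/* Classic token bucket, kept per source host in the inet_peer cache:
 * tokens accrue with wall-clock jiffies, are capped at ip_rt_error_burst
 * (~5 s worth), and each ICMP error sent below costs ip_rt_error_cost
 * (~1 s worth), limiting sustained error traffic to about one ICMP per
 * second per peer.
 */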
893 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
894
895 send = true;
896 if (peer) {
897 now = jiffies;
898 peer->rate_tokens += now - peer->rate_last;
899 if (peer->rate_tokens > ip_rt_error_burst)
900 peer->rate_tokens = ip_rt_error_burst;
901 peer->rate_last = now;
902 if (peer->rate_tokens >= ip_rt_error_cost)
903 peer->rate_tokens -= ip_rt_error_cost;
904 else
905 send = false;
906 inet_putpeer(peer);
907 }
908 if (send)
909 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
910
911 out: kfree_skb(skb);
912 return 0;
913 }
914
915 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
916 {
917 struct dst_entry *dst = &rt->dst;
918 struct fib_result res;
919
920 if (dst_metric_locked(dst, RTAX_MTU))
921 return;
922
923 if (dst->dev->mtu < mtu)
924 return;
925
926 if (mtu < ip_rt_min_pmtu)
927 mtu = ip_rt_min_pmtu;
928
929 if (!rt->rt_pmtu) {
930 dst->obsolete = DST_OBSOLETE_KILL;
931 } else {
932 rt->rt_pmtu = mtu;
933 dst->expires = max(1UL, jiffies + ip_rt_mtu_expires);
934 }
935
936 rcu_read_lock();
937 if (fib_lookup(dev_net(dst->dev), fl4, &res) == 0) {
938 struct fib_nh *nh = &FIB_RES_NH(res);
939
940 update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
941 jiffies + ip_rt_mtu_expires);
942 }
943 rcu_read_unlock();
944 }
945
946 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
947 struct sk_buff *skb, u32 mtu)
948 {
949 struct rtable *rt = (struct rtable *) dst;
950 struct flowi4 fl4;
951
952 ip_rt_build_flow_key(&fl4, sk, skb);
953 __ip_rt_update_pmtu(rt, &fl4, mtu);
954 }
955
956 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
957 int oif, u32 mark, u8 protocol, int flow_flags)
958 {
959 const struct iphdr *iph = (const struct iphdr *) skb->data;
960 struct flowi4 fl4;
961 struct rtable *rt;
962
963 if (!mark)
964 mark = IP4_REPLY_MARK(net, skb->mark);
965
966 __build_flow_key(net, &fl4, NULL, iph, oif,
967 RT_TOS(iph->tos), protocol, mark, flow_flags);
968 rt = __ip_route_output_key(net, &fl4);
969 if (!IS_ERR(rt)) {
970 __ip_rt_update_pmtu(rt, &fl4, mtu);
971 ip_rt_put(rt);
972 }
973 }
974 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
975
976 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
977 {
978 const struct iphdr *iph = (const struct iphdr *) skb->data;
979 struct flowi4 fl4;
980 struct rtable *rt;
981
982 __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
983
984 if (!fl4.flowi4_mark)
985 fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
986
987 rt = __ip_route_output_key(sock_net(sk), &fl4);
988 if (!IS_ERR(rt)) {
989 __ip_rt_update_pmtu(rt, &fl4, mtu);
990 ip_rt_put(rt);
991 }
992 }
993
994 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
995 {
996 const struct iphdr *iph = (const struct iphdr *) skb->data;
997 struct flowi4 fl4;
998 struct rtable *rt;
999 struct dst_entry *dst;
1000 bool new = false;
1001 struct net *net = sock_net(sk);
1002
1003 bh_lock_sock(sk);
1004 rt = (struct rtable *) __sk_dst_get(sk);
1005
1006 if (sock_owned_by_user(sk) || !rt) {
1007 __ipv4_sk_update_pmtu(skb, sk, mtu);
1008 goto out;
1009 }
1010
1011 __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1012
1013 if (!__sk_dst_check(sk, 0)) {
1014 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1015 if (IS_ERR(rt))
1016 goto out;
1017
1018 new = true;
1019 }
1020
1021 __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);
1022
1023 dst = dst_check(&rt->dst, 0);
1024 if (!dst) {
1025 if (new)
1026 dst_release(&rt->dst);
1027
1028 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1029 if (IS_ERR(rt))
1030 goto out;
1031
1032 new = true;
1033 }
1034
1035 if (new)
1036 __sk_dst_set(sk, &rt->dst);
1037
1038 out:
1039 bh_unlock_sock(sk);
1040 }
1041 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1042
1043 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1044 int oif, u32 mark, u8 protocol, int flow_flags)
1045 {
1046 const struct iphdr *iph = (const struct iphdr *) skb->data;
1047 struct flowi4 fl4;
1048 struct rtable *rt;
1049
1050 __build_flow_key(net, &fl4, NULL, iph, oif,
1051 RT_TOS(iph->tos), protocol, mark, flow_flags);
1052 rt = __ip_route_output_key(net, &fl4);
1053 if (!IS_ERR(rt)) {
1054 __ip_do_redirect(rt, skb, &fl4, false);
1055 ip_rt_put(rt);
1056 }
1057 }
1058 EXPORT_SYMBOL_GPL(ipv4_redirect);
1059
1060 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1061 {
1062 const struct iphdr *iph = (const struct iphdr *) skb->data;
1063 struct flowi4 fl4;
1064 struct rtable *rt;
1065 struct net *net = sock_net(sk);
1066
1067 __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1068 rt = __ip_route_output_key(net, &fl4);
1069 if (!IS_ERR(rt)) {
1070 __ip_do_redirect(rt, skb, &fl4, false);
1071 ip_rt_put(rt);
1072 }
1073 }
1074 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1075
1076 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1077 {
1078 struct rtable *rt = (struct rtable *) dst;
1079
1080 /* All IPV4 dsts are created with ->obsolete set to the value
1081 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1082 * into this function always.
1083 *
1084 * When a PMTU/redirect information update invalidates a
1085 * route, this is indicated by setting obsolete to
1086 * DST_OBSOLETE_KILL.
1087 */
1088 if (dst->obsolete == DST_OBSOLETE_KILL || rt_is_expired(rt))
1089 return NULL;
1090 return dst;
1091 }
1092
1093 static void ipv4_link_failure(struct sk_buff *skb)
1094 {
1095 struct rtable *rt;
1096
1097 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1098
1099 rt = skb_rtable(skb);
1100 if (rt)
1101 dst_set_expires(&rt->dst, 0);
1102 }
1103
1104 static int ip_rt_bug(struct sk_buff *skb)
1105 {
1106 pr_debug("%s: %pI4 -> %pI4, %s\n",
1107 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1108 skb->dev ? skb->dev->name : "?");
1109 kfree_skb(skb);
1110 WARN_ON(1);
1111 return 0;
1112 }
1113
1114 /*
1115    We do not cache the source address of the outgoing interface,
1116    because it is used only by the IP RR, TS and SRR options,
1117    so it is out of the fast path.
1118
1119 BTW remember: "addr" is allowed to be not aligned
1120 in IP options!
1121 */
1122
1123 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1124 {
1125 __be32 src;
1126
1127 if (rt_is_output_route(rt))
1128 src = ip_hdr(skb)->saddr;
1129 else {
1130 struct fib_result res;
1131 struct flowi4 fl4;
1132 struct iphdr *iph;
1133
1134 iph = ip_hdr(skb);
1135
1136 memset(&fl4, 0, sizeof(fl4));
1137 fl4.daddr = iph->daddr;
1138 fl4.saddr = iph->saddr;
1139 fl4.flowi4_tos = RT_TOS(iph->tos);
1140 fl4.flowi4_oif = rt->dst.dev->ifindex;
1141 fl4.flowi4_iif = skb->dev->ifindex;
1142 fl4.flowi4_mark = skb->mark;
1143
1144 rcu_read_lock();
1145 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1146 src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1147 else
1148 src = inet_select_addr(rt->dst.dev,
1149 rt_nexthop(rt, iph->daddr),
1150 RT_SCOPE_UNIVERSE);
1151 rcu_read_unlock();
1152 }
1153 memcpy(addr, &src, 4);
1154 }
1155
1156 #ifdef CONFIG_IP_ROUTE_CLASSID
1157 static void set_class_tag(struct rtable *rt, u32 tag)
1158 {
1159 if (!(rt->dst.tclassid & 0xFFFF))
1160 rt->dst.tclassid |= tag & 0xFFFF;
1161 if (!(rt->dst.tclassid & 0xFFFF0000))
1162 rt->dst.tclassid |= tag & 0xFFFF0000;
1163 }
1164 #endif
1165
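/* When no explicit RTAX_ADVMSS metric is set, advertise the device MTU
 * minus 40 bytes (20-byte IPv4 header + 20-byte TCP header), never less
 * than ip_rt_min_advmss and never more than 65535 - 40.
 */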
1166 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1167 {
1168 unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1169
1170 if (advmss == 0) {
1171 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1172 ip_rt_min_advmss);
1173 if (advmss > 65535 - 40)
1174 advmss = 65535 - 40;
1175 }
1176 return advmss;
1177 }
1178
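/* Effective MTU for a route: a live (unexpired) PMTU exception wins, then
 * the RTAX_MTU metric; only when neither is set do we fall back to the
 * device MTU, clamp gateway routes with a locked MTU metric to 576, and
 * cap the result at IP_MAX_MTU.
 */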
1179 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1180 {
1181 const struct rtable *rt = (const struct rtable *) dst;
1182 unsigned int mtu = rt->rt_pmtu;
1183
1184 if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1185 mtu = dst_metric_raw(dst, RTAX_MTU);
1186
1187 if (mtu)
1188 return mtu;
1189
1190 mtu = dst->dev->mtu;
1191
1192 if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1193 if (rt->rt_uses_gateway && mtu > 576)
1194 mtu = 576;
1195 }
1196
1197 if (mtu > IP_MAX_MTU)
1198 mtu = IP_MAX_MTU;
1199
1200 return mtu;
1201 }
1202
1203 static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1204 {
1205 struct fnhe_hash_bucket *hash = nh->nh_exceptions;
1206 struct fib_nh_exception *fnhe;
1207 u32 hval;
1208
1209 if (!hash)
1210 return NULL;
1211
1212 hval = fnhe_hashfun(daddr);
1213
1214 for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1215 fnhe = rcu_dereference(fnhe->fnhe_next)) {
1216 if (fnhe->fnhe_daddr == daddr)
1217 return fnhe;
1218 }
1219 return NULL;
1220 }
1221
1222 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1223 __be32 daddr)
1224 {
1225 bool ret = false;
1226
1227 spin_lock_bh(&fnhe_lock);
1228
1229 if (daddr == fnhe->fnhe_daddr) {
1230 struct rtable *orig = rcu_dereference(fnhe->fnhe_rth);
1231 if (orig && rt_is_expired(orig)) {
1232 fnhe->fnhe_gw = 0;
1233 fnhe->fnhe_pmtu = 0;
1234 fnhe->fnhe_expires = 0;
1235 }
1236 if (fnhe->fnhe_pmtu) {
1237 unsigned long expires = fnhe->fnhe_expires;
1238 unsigned long diff = expires - jiffies;
1239
1240 if (time_before(jiffies, expires)) {
1241 rt->rt_pmtu = fnhe->fnhe_pmtu;
1242 dst_set_expires(&rt->dst, diff);
1243 }
1244 }
1245 if (fnhe->fnhe_gw) {
1246 rt->rt_flags |= RTCF_REDIRECTED;
1247 rt->rt_gateway = fnhe->fnhe_gw;
1248 rt->rt_uses_gateway = 1;
1249 } else if (!rt->rt_gateway)
1250 rt->rt_gateway = daddr;
1251
1252 rcu_assign_pointer(fnhe->fnhe_rth, rt);
1253 if (orig)
1254 rt_free(orig);
1255
1256 fnhe->fnhe_stamp = jiffies;
1257 ret = true;
1258 }
1259 spin_unlock_bh(&fnhe_lock);
1260
1261 return ret;
1262 }
1263
1264 static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1265 {
1266 struct rtable *orig, *prev, **p;
1267 bool ret = true;
1268
1269 if (rt_is_input_route(rt)) {
1270 p = (struct rtable **)&nh->nh_rth_input;
1271 } else {
1272 p = (struct rtable **)__this_cpu_ptr(nh->nh_pcpu_rth_output);
1273 }
1274 orig = *p;
1275
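/* Publish the new route with a single cmpxchg: if another CPU already
 * swapped in a different route we lose the race and return false, and
 * the caller falls back to the uncached (DST_NOCACHE) path instead of
 * retrying.
 */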
1276 prev = cmpxchg(p, orig, rt);
1277 if (prev == orig) {
1278 if (orig)
1279 rt_free(orig);
1280 } else
1281 ret = false;
1282
1283 return ret;
1284 }
1285
1286 static DEFINE_SPINLOCK(rt_uncached_lock);
1287 static LIST_HEAD(rt_uncached_list);
1288
1289 static void rt_add_uncached_list(struct rtable *rt)
1290 {
1291 spin_lock_bh(&rt_uncached_lock);
1292 list_add_tail(&rt->rt_uncached, &rt_uncached_list);
1293 spin_unlock_bh(&rt_uncached_lock);
1294 }
1295
1296 static void ipv4_dst_destroy(struct dst_entry *dst)
1297 {
1298 struct rtable *rt = (struct rtable *) dst;
1299
1300 if (!list_empty(&rt->rt_uncached)) {
1301 spin_lock_bh(&rt_uncached_lock);
1302 list_del(&rt->rt_uncached);
1303 spin_unlock_bh(&rt_uncached_lock);
1304 }
1305 }
1306
1307 void rt_flush_dev(struct net_device *dev)
1308 {
1309 if (!list_empty(&rt_uncached_list)) {
1310 struct net *net = dev_net(dev);
1311 struct rtable *rt;
1312
1313 spin_lock_bh(&rt_uncached_lock);
1314 list_for_each_entry(rt, &rt_uncached_list, rt_uncached) {
1315 if (rt->dst.dev != dev)
1316 continue;
1317 rt->dst.dev = net->loopback_dev;
1318 dev_hold(rt->dst.dev);
1319 dev_put(dev);
1320 }
1321 spin_unlock_bh(&rt_uncached_lock);
1322 }
1323 }
1324
1325 static bool rt_cache_valid(const struct rtable *rt)
1326 {
1327 return rt &&
1328 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1329 !rt_is_expired(rt);
1330 }
1331
1332 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1333 const struct fib_result *res,
1334 struct fib_nh_exception *fnhe,
1335 struct fib_info *fi, u16 type, u32 itag)
1336 {
1337 bool cached = false;
1338
1339 if (fi) {
1340 struct fib_nh *nh = &FIB_RES_NH(*res);
1341
1342 if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
1343 rt->rt_gateway = nh->nh_gw;
1344 rt->rt_uses_gateway = 1;
1345 }
1346 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1347 #ifdef CONFIG_IP_ROUTE_CLASSID
1348 rt->dst.tclassid = nh->nh_tclassid;
1349 #endif
1350 if (unlikely(fnhe))
1351 cached = rt_bind_exception(rt, fnhe, daddr);
1352 else if (!(rt->dst.flags & DST_NOCACHE))
1353 cached = rt_cache_route(nh, rt);
1354 if (unlikely(!cached)) {
1355 /* Routes we intend to cache in nexthop exception or
1356 * FIB nexthop have the DST_NOCACHE bit clear.
1357 * However, if we are unsuccessful at storing this
1358 * route into the cache we really need to set it.
1359 */
1360 rt->dst.flags |= DST_NOCACHE;
1361 if (!rt->rt_gateway)
1362 rt->rt_gateway = daddr;
1363 rt_add_uncached_list(rt);
1364 }
1365 } else
1366 rt_add_uncached_list(rt);
1367
1368 #ifdef CONFIG_IP_ROUTE_CLASSID
1369 #ifdef CONFIG_IP_MULTIPLE_TABLES
1370 set_class_tag(rt, res->tclassid);
1371 #endif
1372 set_class_tag(rt, itag);
1373 #endif
1374 }
1375
1376 static struct rtable *rt_dst_alloc(struct net_device *dev,
1377 bool nopolicy, bool noxfrm, bool will_cache)
1378 {
1379 return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1380 (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
1381 (nopolicy ? DST_NOPOLICY : 0) |
1382 (noxfrm ? DST_NOXFRM : 0));
1383 }
1384
1385 /* called in rcu_read_lock() section */
1386 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1387 u8 tos, struct net_device *dev, int our)
1388 {
1389 struct rtable *rth;
1390 struct in_device *in_dev = __in_dev_get_rcu(dev);
1391 u32 itag = 0;
1392 int err;
1393
1394 /* Primary sanity checks. */
1395
1396 if (in_dev == NULL)
1397 return -EINVAL;
1398
1399 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1400 skb->protocol != htons(ETH_P_IP))
1401 goto e_inval;
1402
1403 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1404 if (ipv4_is_loopback(saddr))
1405 goto e_inval;
1406
1407 if (ipv4_is_zeronet(saddr)) {
1408 if (!ipv4_is_local_multicast(daddr))
1409 goto e_inval;
1410 } else {
1411 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1412 in_dev, &itag);
1413 if (err < 0)
1414 goto e_err;
1415 }
1416 rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
1417 IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1418 if (!rth)
1419 goto e_nobufs;
1420
1421 #ifdef CONFIG_IP_ROUTE_CLASSID
1422 rth->dst.tclassid = itag;
1423 #endif
1424 rth->dst.output = ip_rt_bug;
1425
1426 rth->rt_genid = rt_genid(dev_net(dev));
1427 rth->rt_flags = RTCF_MULTICAST;
1428 rth->rt_type = RTN_MULTICAST;
1429 rth->rt_is_input= 1;
1430 rth->rt_iif = 0;
1431 rth->rt_pmtu = 0;
1432 rth->rt_gateway = 0;
1433 rth->rt_uses_gateway = 0;
1434 INIT_LIST_HEAD(&rth->rt_uncached);
1435 if (our) {
1436 rth->dst.input= ip_local_deliver;
1437 rth->rt_flags |= RTCF_LOCAL;
1438 }
1439
1440 #ifdef CONFIG_IP_MROUTE
1441 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1442 rth->dst.input = ip_mr_input;
1443 #endif
1444 RT_CACHE_STAT_INC(in_slow_mc);
1445
1446 skb_dst_set(skb, &rth->dst);
1447 return 0;
1448
1449 e_nobufs:
1450 return -ENOBUFS;
1451 e_inval:
1452 return -EINVAL;
1453 e_err:
1454 return err;
1455 }
1456
1457
1458 static void ip_handle_martian_source(struct net_device *dev,
1459 struct in_device *in_dev,
1460 struct sk_buff *skb,
1461 __be32 daddr,
1462 __be32 saddr)
1463 {
1464 RT_CACHE_STAT_INC(in_martian_src);
1465 #ifdef CONFIG_IP_ROUTE_VERBOSE
1466 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1467 /*
1468 		 *	RFC1812 recommendation: if the source is martian,
1469 		 *	the only hint is the MAC header.
1470 */
1471 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1472 &daddr, &saddr, dev->name);
1473 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1474 print_hex_dump(KERN_WARNING, "ll header: ",
1475 DUMP_PREFIX_OFFSET, 16, 1,
1476 skb_mac_header(skb),
1477 dev->hard_header_len, true);
1478 }
1479 }
1480 #endif
1481 }
1482
1483 /* called in rcu_read_lock() section */
1484 static int __mkroute_input(struct sk_buff *skb,
1485 const struct fib_result *res,
1486 struct in_device *in_dev,
1487 __be32 daddr, __be32 saddr, u32 tos)
1488 {
1489 struct rtable *rth;
1490 int err;
1491 struct in_device *out_dev;
1492 unsigned int flags = 0;
1493 bool do_cache;
1494 u32 itag;
1495
1496 /* get a working reference to the output device */
1497 out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1498 if (out_dev == NULL) {
1499 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1500 return -EINVAL;
1501 }
1502
1503 err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1504 in_dev->dev, in_dev, &itag);
1505 if (err < 0) {
1506 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1507 saddr);
1508
1509 goto cleanup;
1510 }
1511
1512 do_cache = res->fi && !itag;
1513 if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1514 skb->protocol == htons(ETH_P_IP) &&
1515 (IN_DEV_SHARED_MEDIA(out_dev) ||
1516 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1517 IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1518
1519 if (skb->protocol != htons(ETH_P_IP)) {
1520 		/* Not IP (i.e. ARP). Do not create a route if it is
1521 		 * invalid for proxy arp. DNAT routes are always valid.
1522 		 *
1523 		 * The proxy arp feature has been extended to allow ARP
1524 		 * replies back to the same interface, to support
1525 		 * Private VLAN switch technologies. See arp.c.
1526 */
1527 if (out_dev == in_dev &&
1528 IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1529 err = -EINVAL;
1530 goto cleanup;
1531 }
1532 }
1533
1534 if (do_cache) {
1535 rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1536 if (rt_cache_valid(rth)) {
1537 skb_dst_set_noref(skb, &rth->dst);
1538 goto out;
1539 }
1540 }
1541
1542 rth = rt_dst_alloc(out_dev->dev,
1543 IN_DEV_CONF_GET(in_dev, NOPOLICY),
1544 IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1545 if (!rth) {
1546 err = -ENOBUFS;
1547 goto cleanup;
1548 }
1549
1550 rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
1551 rth->rt_flags = flags;
1552 rth->rt_type = res->type;
1553 rth->rt_is_input = 1;
1554 rth->rt_iif = 0;
1555 rth->rt_pmtu = 0;
1556 rth->rt_gateway = 0;
1557 rth->rt_uses_gateway = 0;
1558 INIT_LIST_HEAD(&rth->rt_uncached);
1559
1560 rth->dst.input = ip_forward;
1561 rth->dst.output = ip_output;
1562
1563 rt_set_nexthop(rth, daddr, res, NULL, res->fi, res->type, itag);
1564 skb_dst_set(skb, &rth->dst);
1565 out:
1566 err = 0;
1567 cleanup:
1568 return err;
1569 }
1570
1571 static int ip_mkroute_input(struct sk_buff *skb,
1572 struct fib_result *res,
1573 const struct flowi4 *fl4,
1574 struct in_device *in_dev,
1575 __be32 daddr, __be32 saddr, u32 tos)
1576 {
1577 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1578 if (res->fi && res->fi->fib_nhs > 1)
1579 fib_select_multipath(res);
1580 #endif
1581
1582 /* create a routing cache entry */
1583 return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1584 }
1585
1586 /*
1587  *	NOTE. We drop all packets that have local source
1588  *	addresses, because every properly looped-back packet
1589  *	must already have the correct destination attached by the output routine.
1590  *
1591  *	Such an approach solves two big problems:
1592  *	1. Non-simplex devices are handled properly.
1593  *	2. IP spoofing attempts are filtered with a 100% guarantee.
1594  *	Called with rcu_read_lock()
1595 */
1596
1597 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1598 u8 tos, struct net_device *dev)
1599 {
1600 struct fib_result res;
1601 struct in_device *in_dev = __in_dev_get_rcu(dev);
1602 struct flowi4 fl4;
1603 unsigned int flags = 0;
1604 u32 itag = 0;
1605 struct rtable *rth;
1606 int err = -EINVAL;
1607 struct net *net = dev_net(dev);
1608 bool do_cache;
1609
1610 /* IP on this device is disabled. */
1611
1612 if (!in_dev)
1613 goto out;
1614
1615 	/* Check for the most weird martians, which cannot be detected
1616 	   by fib_lookup.
1617 */
1618
1619 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1620 goto martian_source;
1621
1622 res.fi = NULL;
1623 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1624 goto brd_input;
1625
1626 	/* Accept zero addresses only to the limited broadcast address;
1627 	 * I do not even know whether to fix this or not. Waiting for complaints :-)
1628 */
1629 if (ipv4_is_zeronet(saddr))
1630 goto martian_source;
1631
1632 if (ipv4_is_zeronet(daddr))
1633 goto martian_destination;
1634
1635 	/* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET()
1636 	 * more than once, and calls it only if daddr and/or saddr are loopback addresses
1637 */
1638 if (ipv4_is_loopback(daddr)) {
1639 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1640 goto martian_destination;
1641 } else if (ipv4_is_loopback(saddr)) {
1642 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1643 goto martian_source;
1644 }
1645
1646 /*
1647 * Now we are ready to route packet.
1648 */
1649 fl4.flowi4_oif = 0;
1650 fl4.flowi4_iif = dev->ifindex;
1651 fl4.flowi4_mark = skb->mark;
1652 fl4.flowi4_tos = tos;
1653 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1654 fl4.daddr = daddr;
1655 fl4.saddr = saddr;
1656 err = fib_lookup(net, &fl4, &res);
1657 if (err != 0)
1658 goto no_route;
1659
1660 RT_CACHE_STAT_INC(in_slow_tot);
1661
1662 if (res.type == RTN_BROADCAST)
1663 goto brd_input;
1664
1665 if (res.type == RTN_LOCAL) {
1666 err = fib_validate_source(skb, saddr, daddr, tos,
1667 LOOPBACK_IFINDEX,
1668 dev, in_dev, &itag);
1669 if (err < 0)
1670 goto martian_source_keep_err;
1671 goto local_input;
1672 }
1673
1674 if (!IN_DEV_FORWARD(in_dev))
1675 goto no_route;
1676 if (res.type != RTN_UNICAST)
1677 goto martian_destination;
1678
1679 err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
1680 out: return err;
1681
1682 brd_input:
1683 if (skb->protocol != htons(ETH_P_IP))
1684 goto e_inval;
1685
1686 if (!ipv4_is_zeronet(saddr)) {
1687 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1688 in_dev, &itag);
1689 if (err < 0)
1690 goto martian_source_keep_err;
1691 }
1692 flags |= RTCF_BROADCAST;
1693 res.type = RTN_BROADCAST;
1694 RT_CACHE_STAT_INC(in_brd);
1695
1696 local_input:
1697 do_cache = false;
1698 if (res.fi) {
1699 if (!itag) {
1700 rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
1701 if (rt_cache_valid(rth)) {
1702 skb_dst_set_noref(skb, &rth->dst);
1703 err = 0;
1704 goto out;
1705 }
1706 do_cache = true;
1707 }
1708 }
1709
1710 rth = rt_dst_alloc(net->loopback_dev,
1711 IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
1712 if (!rth)
1713 goto e_nobufs;
1714
1715 rth->dst.input= ip_local_deliver;
1716 rth->dst.output= ip_rt_bug;
1717 #ifdef CONFIG_IP_ROUTE_CLASSID
1718 rth->dst.tclassid = itag;
1719 #endif
1720
1721 rth->rt_genid = rt_genid(net);
1722 rth->rt_flags = flags|RTCF_LOCAL;
1723 rth->rt_type = res.type;
1724 rth->rt_is_input = 1;
1725 rth->rt_iif = 0;
1726 rth->rt_pmtu = 0;
1727 rth->rt_gateway = 0;
1728 rth->rt_uses_gateway = 0;
1729 INIT_LIST_HEAD(&rth->rt_uncached);
1730 if (res.type == RTN_UNREACHABLE) {
1731 rth->dst.input= ip_error;
1732 rth->dst.error= -err;
1733 rth->rt_flags &= ~RTCF_LOCAL;
1734 }
1735 if (do_cache)
1736 rt_cache_route(&FIB_RES_NH(res), rth);
1737 skb_dst_set(skb, &rth->dst);
1738 err = 0;
1739 goto out;
1740
1741 no_route:
1742 RT_CACHE_STAT_INC(in_no_route);
1743 res.type = RTN_UNREACHABLE;
1744 if (err == -ESRCH)
1745 err = -ENETUNREACH;
1746 goto local_input;
1747
1748 /*
1749 * Do not cache martian addresses: they should be logged (RFC1812)
1750 */
1751 martian_destination:
1752 RT_CACHE_STAT_INC(in_martian_dst);
1753 #ifdef CONFIG_IP_ROUTE_VERBOSE
1754 if (IN_DEV_LOG_MARTIANS(in_dev))
1755 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
1756 &daddr, &saddr, dev->name);
1757 #endif
1758
1759 e_inval:
1760 err = -EINVAL;
1761 goto out;
1762
1763 e_nobufs:
1764 err = -ENOBUFS;
1765 goto out;
1766
1767 martian_source:
1768 err = -EINVAL;
1769 martian_source_keep_err:
1770 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
1771 goto out;
1772 }
1773
1774 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1775 u8 tos, struct net_device *dev)
1776 {
1777 int res;
1778
1779 rcu_read_lock();
1780
1781 	/* Multicast recognition logic is moved from the route cache to here.
1782 	   The problem was that too many Ethernet cards have broken/missing
1783 	   hardware multicast filters :-( As a result, a host on a multicast
1784 	   network acquires a lot of useless route cache entries, e.g. for
1785 	   SDR messages from all over the world. Now we try to get rid of them.
1786 	   Really, provided the software IP multicast filter is organized
1787 	   reasonably (at least, hashed), it does not result in a slowdown
1788 	   compared with route cache reject entries.
1789 	   Note that multicast routers are not affected, because
1790 	   a route cache entry is created eventually.
1791 */
1792 if (ipv4_is_multicast(daddr)) {
1793 struct in_device *in_dev = __in_dev_get_rcu(dev);
1794
1795 if (in_dev) {
1796 int our = ip_check_mc_rcu(in_dev, daddr, saddr,
1797 ip_hdr(skb)->protocol);
1798 if (our
1799 #ifdef CONFIG_IP_MROUTE
1800 ||
1801 (!ipv4_is_local_multicast(daddr) &&
1802 IN_DEV_MFORWARD(in_dev))
1803 #endif
1804 ) {
1805 int res = ip_route_input_mc(skb, daddr, saddr,
1806 tos, dev, our);
1807 rcu_read_unlock();
1808 return res;
1809 }
1810 }
1811 rcu_read_unlock();
1812 return -EINVAL;
1813 }
1814 res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
1815 rcu_read_unlock();
1816 return res;
1817 }
1818 EXPORT_SYMBOL(ip_route_input_noref);
1819
1820 /* called with rcu_read_lock() */
1821 static struct rtable *__mkroute_output(const struct fib_result *res,
1822 const struct flowi4 *fl4, int orig_oif,
1823 struct net_device *dev_out,
1824 unsigned int flags)
1825 {
1826 struct fib_info *fi = res->fi;
1827 struct fib_nh_exception *fnhe;
1828 struct in_device *in_dev;
1829 u16 type = res->type;
1830 struct rtable *rth;
1831 bool do_cache;
1832
1833 in_dev = __in_dev_get_rcu(dev_out);
1834 if (!in_dev)
1835 return ERR_PTR(-EINVAL);
1836
1837 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1838 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
1839 return ERR_PTR(-EINVAL);
1840
1841 if (ipv4_is_lbcast(fl4->daddr))
1842 type = RTN_BROADCAST;
1843 else if (ipv4_is_multicast(fl4->daddr))
1844 type = RTN_MULTICAST;
1845 else if (ipv4_is_zeronet(fl4->daddr))
1846 return ERR_PTR(-EINVAL);
1847
1848 if (dev_out->flags & IFF_LOOPBACK)
1849 flags |= RTCF_LOCAL;
1850
1851 do_cache = true;
1852 if (type == RTN_BROADCAST) {
1853 flags |= RTCF_BROADCAST | RTCF_LOCAL;
1854 fi = NULL;
1855 } else if (type == RTN_MULTICAST) {
1856 flags |= RTCF_MULTICAST | RTCF_LOCAL;
1857 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
1858 fl4->flowi4_proto))
1859 flags &= ~RTCF_LOCAL;
1860 else
1861 do_cache = false;
1862 		/* If a multicast route does not exist, use
1863 		 * the default one, but do not use a gateway in this case.
1864 		 * Yes, it is a hack.
1865 */
1866 if (fi && res->prefixlen < 4)
1867 fi = NULL;
1868 }
1869
1870 fnhe = NULL;
1871 do_cache &= fi != NULL;
1872 if (do_cache) {
1873 struct rtable __rcu **prth;
1874 struct fib_nh *nh = &FIB_RES_NH(*res);
1875
1876 fnhe = find_exception(nh, fl4->daddr);
1877 if (fnhe)
1878 prth = &fnhe->fnhe_rth;
1879 else {
1880 if (unlikely(fl4->flowi4_flags &
1881 FLOWI_FLAG_KNOWN_NH &&
1882 !(nh->nh_gw &&
1883 nh->nh_scope == RT_SCOPE_LINK))) {
1884 do_cache = false;
1885 goto add;
1886 }
1887 prth = __this_cpu_ptr(nh->nh_pcpu_rth_output);
1888 }
1889 rth = rcu_dereference(*prth);
1890 if (rt_cache_valid(rth)) {
1891 dst_hold(&rth->dst);
1892 return rth;
1893 }
1894 }
1895
1896 add:
1897 rth = rt_dst_alloc(dev_out,
1898 IN_DEV_CONF_GET(in_dev, NOPOLICY),
1899 IN_DEV_CONF_GET(in_dev, NOXFRM),
1900 do_cache);
1901 if (!rth)
1902 return ERR_PTR(-ENOBUFS);
1903
1904 rth->dst.output = ip_output;
1905
1906 rth->rt_genid = rt_genid(dev_net(dev_out));
1907 rth->rt_flags = flags;
1908 rth->rt_type = type;
1909 rth->rt_is_input = 0;
1910 rth->rt_iif = orig_oif ? : 0;
1911 rth->rt_pmtu = 0;
1912 rth->rt_gateway = 0;
1913 rth->rt_uses_gateway = 0;
1914 INIT_LIST_HEAD(&rth->rt_uncached);
1915
1916 RT_CACHE_STAT_INC(out_slow_tot);
1917
1918 if (flags & RTCF_LOCAL)
1919 rth->dst.input = ip_local_deliver;
1920 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
1921 if (flags & RTCF_LOCAL &&
1922 !(dev_out->flags & IFF_LOOPBACK)) {
1923 rth->dst.output = ip_mc_output;
1924 RT_CACHE_STAT_INC(out_slow_mc);
1925 }
1926 #ifdef CONFIG_IP_MROUTE
1927 if (type == RTN_MULTICAST) {
1928 if (IN_DEV_MFORWARD(in_dev) &&
1929 !ipv4_is_local_multicast(fl4->daddr)) {
1930 rth->dst.input = ip_mr_input;
1931 rth->dst.output = ip_mc_output;
1932 }
1933 }
1934 #endif
1935 }
1936
1937 rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
1938
1939 return rth;
1940 }
1941
1942 /*
1943 * Major route resolver routine.
1944 */
1945
1946 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
1947 {
1948 struct net_device *dev_out = NULL;
1949 __u8 tos = RT_FL_TOS(fl4);
1950 unsigned int flags = 0;
1951 struct fib_result res;
1952 struct rtable *rth;
1953 int orig_oif;
1954
1955 res.tclassid = 0;
1956 res.fi = NULL;
1957 res.table = NULL;
1958
1959 orig_oif = fl4->flowi4_oif;
1960
1961 fl4->flowi4_iif = LOOPBACK_IFINDEX;
1962 fl4->flowi4_tos = tos & IPTOS_RT_MASK;
1963 fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
1964 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
1965
1966 rcu_read_lock();
1967 if (fl4->saddr) {
1968 rth = ERR_PTR(-EINVAL);
1969 if (ipv4_is_multicast(fl4->saddr) ||
1970 ipv4_is_lbcast(fl4->saddr) ||
1971 ipv4_is_zeronet(fl4->saddr))
1972 goto out;
1973
1974 /* I removed the check for oif == dev_out->oif here.
1975 It was wrong for two reasons:
1976 1. ip_dev_find(net, saddr) can return the wrong iface if saddr
1977 is assigned to multiple interfaces.
1978 2. Moreover, we are allowed to send packets with the saddr
1979 of another iface. --ANK
1980 */
1981
1982 if (fl4->flowi4_oif == 0 &&
1983 (ipv4_is_multicast(fl4->daddr) ||
1984 ipv4_is_lbcast(fl4->daddr))) {
1985 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
1986 dev_out = __ip_dev_find(net, fl4->saddr, false);
1987 if (dev_out == NULL)
1988 goto out;
1989
1990 /* Special hack: the user can direct multicasts
1991 and limited broadcast via the necessary interface
1992 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
1993 This hack is not just for fun, it allows
1994 vic, vat and friends to work.
1995 They bind the socket to loopback, set ttl to zero
1996 and expect that it will work.
1997 From the viewpoint of the routing cache they are broken,
1998 because we are not allowed to build a multicast path
1999 with a loopback source addr (look, the routing cache
2000 cannot know that ttl is zero, so that the packet
2001 will not leave this host and the route is valid).
2002 Luckily, this hack is a good workaround.
2003 */
2004
2005 fl4->flowi4_oif = dev_out->ifindex;
2006 goto make_route;
2007 }
2008
2009 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2010 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2011 if (!__ip_dev_find(net, fl4->saddr, false))
2012 goto out;
2013 }
2014 }
2015
2016
2017 if (fl4->flowi4_oif) {
2018 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2019 rth = ERR_PTR(-ENODEV);
2020 if (dev_out == NULL)
2021 goto out;
2022
2023 /* RACE: Check return value of inet_select_addr instead. */
2024 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2025 rth = ERR_PTR(-ENETUNREACH);
2026 goto out;
2027 }
2028 if (ipv4_is_local_multicast(fl4->daddr) ||
2029 ipv4_is_lbcast(fl4->daddr)) {
2030 if (!fl4->saddr)
2031 fl4->saddr = inet_select_addr(dev_out, 0,
2032 RT_SCOPE_LINK);
2033 goto make_route;
2034 }
2035 if (fl4->saddr) {
2036 if (ipv4_is_multicast(fl4->daddr))
2037 fl4->saddr = inet_select_addr(dev_out, 0,
2038 fl4->flowi4_scope);
2039 else if (!fl4->daddr)
2040 fl4->saddr = inet_select_addr(dev_out, 0,
2041 RT_SCOPE_HOST);
2042 }
2043 }
2044
2045 if (!fl4->daddr) {
2046 fl4->daddr = fl4->saddr;
2047 if (!fl4->daddr)
2048 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2049 dev_out = net->loopback_dev;
2050 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2051 res.type = RTN_LOCAL;
2052 flags |= RTCF_LOCAL;
2053 goto make_route;
2054 }
2055
2056 if (fib_lookup(net, fl4, &res)) {
2057 res.fi = NULL;
2058 res.table = NULL;
2059 if (fl4->flowi4_oif) {
2060 /* Apparently, the routing tables are wrong. Assume
2061 that the destination is on link.
2062 
2063 WHY? DW.
2064 Because we are allowed to send to an iface
2065 even if it has NO routes and NO assigned
2066 addresses. When oif is specified, the routing
2067 tables are looked up with only one purpose:
2068 to catch whether the destination is gatewayed, rather than
2069 direct. Moreover, if MSG_DONTROUTE is set,
2070 we send the packet, ignoring both routing tables
2071 and ifaddr state. --ANK
2072
2073
2074 We could make it even if oif is unknown,
2075 likely IPv6, but we do not.
2076 */
2077
2078 if (fl4->saddr == 0)
2079 fl4->saddr = inet_select_addr(dev_out, 0,
2080 RT_SCOPE_LINK);
2081 res.type = RTN_UNICAST;
2082 goto make_route;
2083 }
2084 rth = ERR_PTR(-ENETUNREACH);
2085 goto out;
2086 }
2087
2088 if (res.type == RTN_LOCAL) {
2089 if (!fl4->saddr) {
2090 if (res.fi->fib_prefsrc)
2091 fl4->saddr = res.fi->fib_prefsrc;
2092 else
2093 fl4->saddr = fl4->daddr;
2094 }
2095 dev_out = net->loopback_dev;
2096 fl4->flowi4_oif = dev_out->ifindex;
2097 flags |= RTCF_LOCAL;
2098 goto make_route;
2099 }
2100
2101 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2102 if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2103 fib_select_multipath(&res);
2104 else
2105 #endif
2106 if (!res.prefixlen &&
2107 res.table->tb_num_default > 1 &&
2108 res.type == RTN_UNICAST && !fl4->flowi4_oif)
2109 fib_select_default(&res);
2110
2111 if (!fl4->saddr)
2112 fl4->saddr = FIB_RES_PREFSRC(net, res);
2113
2114 dev_out = FIB_RES_DEV(res);
2115 fl4->flowi4_oif = dev_out->ifindex;
2116
2117
2118 make_route:
2119 rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
2120
2121 out:
2122 rcu_read_unlock();
2123 return rth;
2124 }
2125 EXPORT_SYMBOL_GPL(__ip_route_output_key);
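/*
 * Illustrative sketch only (not part of this file): an output lookup built
 * by hand, mirroring the way inet_rtm_getroute() below fills a flowi4.
 * "net" is assumed to be the relevant struct net and the destination
 * address (192.168.0.1) is an example only.  On success the resolver has
 * filled in fl4.saddr and fl4.flowi4_oif, and the caller drops its
 * reference with ip_rt_put().
 *
 *	struct flowi4 fl4;
 *	struct rtable *rt;
 *
 *	memset(&fl4, 0, sizeof(fl4));
 *	fl4.daddr = htonl(0xc0a80001);
 *	fl4.flowi4_tos = 0;
 *
 *	rt = __ip_route_output_key(net, &fl4);
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 *	ip_rt_put(rt);
 */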
2126
2127 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2128 {
2129 return NULL;
2130 }
2131
2132 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2133 {
2134 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2135
2136 return mtu ? : dst->dev->mtu;
2137 }
2138
2139 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2140 struct sk_buff *skb, u32 mtu)
2141 {
2142 }
2143
2144 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2145 struct sk_buff *skb)
2146 {
2147 }
2148
2149 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2150 unsigned long old)
2151 {
2152 return NULL;
2153 }
2154
2155 static struct dst_ops ipv4_dst_blackhole_ops = {
2156 .family = AF_INET,
2157 .protocol = cpu_to_be16(ETH_P_IP),
2158 .check = ipv4_blackhole_dst_check,
2159 .mtu = ipv4_blackhole_mtu,
2160 .default_advmss = ipv4_default_advmss,
2161 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2162 .redirect = ipv4_rt_blackhole_redirect,
2163 .cow_metrics = ipv4_rt_blackhole_cow_metrics,
2164 .neigh_lookup = ipv4_neigh_lookup,
2165 };
2166
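/*
 * Clone an existing route into a "blackhole" dst: the copy keeps the
 * device, flags, type and gateway of the original, but its input/output
 * hooks are dst_discard and its .check/.update_pmtu/.redirect/.cow_metrics
 * ops (above) do nothing, so anything sent through it is silently dropped.
 * The reference on dst_orig is always released.
 */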
2167 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2168 {
2169 struct rtable *ort = (struct rtable *) dst_orig;
2170 struct rtable *rt;
2171
2172 rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
2173 if (rt) {
2174 struct dst_entry *new = &rt->dst;
2175
2176 new->__use = 1;
2177 new->input = dst_discard;
2178 new->output = dst_discard;
2179
2180 new->dev = ort->dst.dev;
2181 if (new->dev)
2182 dev_hold(new->dev);
2183
2184 rt->rt_is_input = ort->rt_is_input;
2185 rt->rt_iif = ort->rt_iif;
2186 rt->rt_pmtu = ort->rt_pmtu;
2187
2188 rt->rt_genid = rt_genid(net);
2189 rt->rt_flags = ort->rt_flags;
2190 rt->rt_type = ort->rt_type;
2191 rt->rt_gateway = ort->rt_gateway;
2192 rt->rt_uses_gateway = ort->rt_uses_gateway;
2193
2194 INIT_LIST_HEAD(&rt->rt_uncached);
2195
2196 dst_free(new);
2197 }
2198
2199 dst_release(dst_orig);
2200
2201 return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2202 }
2203
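/*
 * Socket-level wrapper around __ip_route_output_key(): when the flow has a
 * protocol set, the resolved route is additionally passed through
 * xfrm_lookup() so that IPsec policy can transform or reject it.
 */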
2204 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2205 struct sock *sk)
2206 {
2207 struct rtable *rt = __ip_route_output_key(net, flp4);
2208
2209 if (IS_ERR(rt))
2210 return rt;
2211
2212 if (flp4->flowi4_proto)
2213 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2214 flowi4_to_flowi(flp4),
2215 sk, 0);
2216
2217 return rt;
2218 }
2219 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2220
2221 static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2222 struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2223 u32 seq, int event, int nowait, unsigned int flags)
2224 {
2225 struct rtable *rt = skb_rtable(skb);
2226 struct rtmsg *r;
2227 struct nlmsghdr *nlh;
2228 unsigned long expires = 0;
2229 u32 error;
2230 u32 metrics[RTAX_MAX];
2231
2232 nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
2233 if (nlh == NULL)
2234 return -EMSGSIZE;
2235
2236 r = nlmsg_data(nlh);
2237 r->rtm_family = AF_INET;
2238 r->rtm_dst_len = 32;
2239 r->rtm_src_len = 0;
2240 r->rtm_tos = fl4->flowi4_tos;
2241 r->rtm_table = RT_TABLE_MAIN;
2242 if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2243 goto nla_put_failure;
2244 r->rtm_type = rt->rt_type;
2245 r->rtm_scope = RT_SCOPE_UNIVERSE;
2246 r->rtm_protocol = RTPROT_UNSPEC;
2247 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2248 if (rt->rt_flags & RTCF_NOTIFY)
2249 r->rtm_flags |= RTM_F_NOTIFY;
2250 if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2251 r->rtm_flags |= RTCF_DOREDIRECT;
2252
2253 if (nla_put_be32(skb, RTA_DST, dst))
2254 goto nla_put_failure;
2255 if (src) {
2256 r->rtm_src_len = 32;
2257 if (nla_put_be32(skb, RTA_SRC, src))
2258 goto nla_put_failure;
2259 }
2260 if (rt->dst.dev &&
2261 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2262 goto nla_put_failure;
2263 #ifdef CONFIG_IP_ROUTE_CLASSID
2264 if (rt->dst.tclassid &&
2265 nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2266 goto nla_put_failure;
2267 #endif
2268 if (!rt_is_input_route(rt) &&
2269 fl4->saddr != src) {
2270 if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr))
2271 goto nla_put_failure;
2272 }
2273 if (rt->rt_uses_gateway &&
2274 nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
2275 goto nla_put_failure;
2276
2277 expires = rt->dst.expires;
2278 if (expires) {
2279 unsigned long now = jiffies;
2280
2281 if (time_before(now, expires))
2282 expires -= now;
2283 else
2284 expires = 0;
2285 }
2286
2287 memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2288 if (rt->rt_pmtu && expires)
2289 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2290 if (rtnetlink_put_metrics(skb, metrics) < 0)
2291 goto nla_put_failure;
2292
2293 if (fl4->flowi4_mark &&
2294 nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2295 goto nla_put_failure;
2296
2297 if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2298 nla_put_u32(skb, RTA_UID,
2299 from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
2300 goto nla_put_failure;
2301
2302 error = rt->dst.error;
2303
2304 if (rt_is_input_route(rt)) {
2305 #ifdef CONFIG_IP_MROUTE
2306 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2307 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2308 int err = ipmr_get_route(net, skb,
2309 fl4->saddr, fl4->daddr,
2310 r, nowait);
2311 if (err <= 0) {
2312 if (!nowait) {
2313 if (err == 0)
2314 return 0;
2315 goto nla_put_failure;
2316 } else {
2317 if (err == -EMSGSIZE)
2318 goto nla_put_failure;
2319 error = err;
2320 }
2321 }
2322 } else
2323 #endif
2324 if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
2325 goto nla_put_failure;
2326 }
2327
2328 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2329 goto nla_put_failure;
2330
2331 return nlmsg_end(skb, nlh);
2332
2333 nla_put_failure:
2334 nlmsg_cancel(skb, nlh);
2335 return -EMSGSIZE;
2336 }
2337
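/*
 * RTM_GETROUTE handler (registered in ip_rt_init() below), the path
 * exercised by "ip route get": it resolves a single route for the
 * attributes in the request (ip_route_input() when RTA_IIF is given,
 * ip_route_output_key() otherwise) and returns it via rt_fill_info().
 */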
2338 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
2339 {
2340 struct net *net = sock_net(in_skb->sk);
2341 struct rtmsg *rtm;
2342 struct nlattr *tb[RTA_MAX+1];
2343 struct rtable *rt = NULL;
2344 struct flowi4 fl4;
2345 __be32 dst = 0;
2346 __be32 src = 0;
2347 u32 iif;
2348 int err;
2349 int mark;
2350 struct sk_buff *skb;
2351 kuid_t uid;
2352
2353 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2354 if (err < 0)
2355 goto errout;
2356
2357 rtm = nlmsg_data(nlh);
2358
2359 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2360 if (skb == NULL) {
2361 err = -ENOBUFS;
2362 goto errout;
2363 }
2364
2365 /* Reserve room for dummy headers; this skb can pass
2366 through a good chunk of the routing engine.
2367 */
2368 skb_reset_mac_header(skb);
2369 skb_reset_network_header(skb);
2370
2371 /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2372 ip_hdr(skb)->protocol = IPPROTO_ICMP;
2373 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2374
2375 src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2376 dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2377 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2378 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2379 if (tb[RTA_UID])
2380 uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
2381 else
2382 uid = (iif ? INVALID_UID : current_uid());
2383
2384 memset(&fl4, 0, sizeof(fl4));
2385 fl4.daddr = dst;
2386 fl4.saddr = src;
2387 fl4.flowi4_tos = rtm->rtm_tos;
2388 fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2389 fl4.flowi4_mark = mark;
2390 fl4.flowi4_uid = uid;
2391
2392 if (iif) {
2393 struct net_device *dev;
2394
2395 dev = __dev_get_by_index(net, iif);
2396 if (dev == NULL) {
2397 err = -ENODEV;
2398 goto errout_free;
2399 }
2400
2401 skb->protocol = htons(ETH_P_IP);
2402 skb->dev = dev;
2403 skb->mark = mark;
2404 local_bh_disable();
2405 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2406 local_bh_enable();
2407
2408 rt = skb_rtable(skb);
2409 if (err == 0 && rt->dst.error)
2410 err = -rt->dst.error;
2411 } else {
2412 rt = ip_route_output_key(net, &fl4);
2413
2414 err = 0;
2415 if (IS_ERR(rt))
2416 err = PTR_ERR(rt);
2417 }
2418
2419 if (err)
2420 goto errout_free;
2421
2422 skb_dst_set(skb, &rt->dst);
2423 if (rtm->rtm_flags & RTM_F_NOTIFY)
2424 rt->rt_flags |= RTCF_NOTIFY;
2425
2426 err = rt_fill_info(net, dst, src, &fl4, skb,
2427 NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
2428 RTM_NEWROUTE, 0, 0);
2429 if (err <= 0)
2430 goto errout_free;
2431
2432 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2433 errout:
2434 return err;
2435
2436 errout_free:
2437 kfree_skb(skb);
2438 goto errout;
2439 }
2440
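/*
 * With the routing cache gone there is nothing to walk here; this stub
 * simply reports an empty dump (route dumps are served by the FIB code).
 */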
2441 int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
2442 {
2443 return skb->len;
2444 }
2445
2446 void ip_rt_multicast_event(struct in_device *in_dev)
2447 {
2448 rt_cache_flush(dev_net(in_dev->dev));
2449 }
2450
2451 #ifdef CONFIG_SYSCTL
2452 static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
2453 static int ip_rt_gc_interval __read_mostly = 60 * HZ;
2454 static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
2455 static int ip_rt_gc_elasticity __read_mostly = 8;
2456
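/*
 * Backs the write-only /proc/sys/net/ipv4/route/flush entry (see
 * ipv4_route_flush_table below): any write flushes the cached routes of
 * the owning netns via rt_cache_flush(); reads fail with -EINVAL.
 */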
2457 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
2458 void __user *buffer,
2459 size_t *lenp, loff_t *ppos)
2460 {
2461 if (write) {
2462 rt_cache_flush((struct net *)__ctl->extra1);
2463 return 0;
2464 }
2465
2466 return -EINVAL;
2467 }
2468
2469 static ctl_table ipv4_route_table[] = {
2470 {
2471 .procname = "gc_thresh",
2472 .data = &ipv4_dst_ops.gc_thresh,
2473 .maxlen = sizeof(int),
2474 .mode = 0644,
2475 .proc_handler = proc_dointvec,
2476 },
2477 {
2478 .procname = "max_size",
2479 .data = &ip_rt_max_size,
2480 .maxlen = sizeof(int),
2481 .mode = 0644,
2482 .proc_handler = proc_dointvec,
2483 },
2484 {
2485 /* Deprecated. Use gc_min_interval_ms */
2486
2487 .procname = "gc_min_interval",
2488 .data = &ip_rt_gc_min_interval,
2489 .maxlen = sizeof(int),
2490 .mode = 0644,
2491 .proc_handler = proc_dointvec_jiffies,
2492 },
2493 {
2494 .procname = "gc_min_interval_ms",
2495 .data = &ip_rt_gc_min_interval,
2496 .maxlen = sizeof(int),
2497 .mode = 0644,
2498 .proc_handler = proc_dointvec_ms_jiffies,
2499 },
2500 {
2501 .procname = "gc_timeout",
2502 .data = &ip_rt_gc_timeout,
2503 .maxlen = sizeof(int),
2504 .mode = 0644,
2505 .proc_handler = proc_dointvec_jiffies,
2506 },
2507 {
2508 .procname = "gc_interval",
2509 .data = &ip_rt_gc_interval,
2510 .maxlen = sizeof(int),
2511 .mode = 0644,
2512 .proc_handler = proc_dointvec_jiffies,
2513 },
2514 {
2515 .procname = "redirect_load",
2516 .data = &ip_rt_redirect_load,
2517 .maxlen = sizeof(int),
2518 .mode = 0644,
2519 .proc_handler = proc_dointvec,
2520 },
2521 {
2522 .procname = "redirect_number",
2523 .data = &ip_rt_redirect_number,
2524 .maxlen = sizeof(int),
2525 .mode = 0644,
2526 .proc_handler = proc_dointvec,
2527 },
2528 {
2529 .procname = "redirect_silence",
2530 .data = &ip_rt_redirect_silence,
2531 .maxlen = sizeof(int),
2532 .mode = 0644,
2533 .proc_handler = proc_dointvec,
2534 },
2535 {
2536 .procname = "error_cost",
2537 .data = &ip_rt_error_cost,
2538 .maxlen = sizeof(int),
2539 .mode = 0644,
2540 .proc_handler = proc_dointvec,
2541 },
2542 {
2543 .procname = "error_burst",
2544 .data = &ip_rt_error_burst,
2545 .maxlen = sizeof(int),
2546 .mode = 0644,
2547 .proc_handler = proc_dointvec,
2548 },
2549 {
2550 .procname = "gc_elasticity",
2551 .data = &ip_rt_gc_elasticity,
2552 .maxlen = sizeof(int),
2553 .mode = 0644,
2554 .proc_handler = proc_dointvec,
2555 },
2556 {
2557 .procname = "mtu_expires",
2558 .data = &ip_rt_mtu_expires,
2559 .maxlen = sizeof(int),
2560 .mode = 0644,
2561 .proc_handler = proc_dointvec_jiffies,
2562 },
2563 {
2564 .procname = "min_pmtu",
2565 .data = &ip_rt_min_pmtu,
2566 .maxlen = sizeof(int),
2567 .mode = 0644,
2568 .proc_handler = proc_dointvec,
2569 },
2570 {
2571 .procname = "min_adv_mss",
2572 .data = &ip_rt_min_advmss,
2573 .maxlen = sizeof(int),
2574 .mode = 0644,
2575 .proc_handler = proc_dointvec,
2576 },
2577 { }
2578 };
2579
2580 static struct ctl_table ipv4_route_flush_table[] = {
2581 {
2582 .procname = "flush",
2583 .maxlen = sizeof(int),
2584 .mode = 0200,
2585 .proc_handler = ipv4_sysctl_rtcache_flush,
2586 },
2587 { },
2588 };
2589
2590 static __net_init int sysctl_route_net_init(struct net *net)
2591 {
2592 struct ctl_table *tbl;
2593
2594 tbl = ipv4_route_flush_table;
2595 if (!net_eq(net, &init_net)) {
2596 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2597 if (tbl == NULL)
2598 goto err_dup;
2599
2600 /* Don't export sysctls to unprivileged users */
2601 if (net->user_ns != &init_user_ns)
2602 tbl[0].procname = NULL;
2603 }
2604 tbl[0].extra1 = net;
2605
2606 net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2607 if (net->ipv4.route_hdr == NULL)
2608 goto err_reg;
2609 return 0;
2610
2611 err_reg:
2612 if (tbl != ipv4_route_flush_table)
2613 kfree(tbl);
2614 err_dup:
2615 return -ENOMEM;
2616 }
2617
2618 static __net_exit void sysctl_route_net_exit(struct net *net)
2619 {
2620 struct ctl_table *tbl;
2621
2622 tbl = net->ipv4.route_hdr->ctl_table_arg;
2623 unregister_net_sysctl_table(net->ipv4.route_hdr);
2624 BUG_ON(tbl == ipv4_route_flush_table);
2625 kfree(tbl);
2626 }
2627
2628 static __net_initdata struct pernet_operations sysctl_route_ops = {
2629 .init = sysctl_route_net_init,
2630 .exit = sysctl_route_net_exit,
2631 };
2632 #endif
2633
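/*
 * Per-netns generation state: rt_genid starts at zero and is bumped by
 * rt_cache_flush() to invalidate previously cached dsts (each rtable
 * records the generation it was created under, as seen in the rt_genid()
 * uses above); dev_addr_genid is simply seeded with random bytes.
 */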
2634 static __net_init int rt_genid_init(struct net *net)
2635 {
2636 atomic_set(&net->rt_genid, 0);
2637 get_random_bytes(&net->ipv4.dev_addr_genid,
2638 sizeof(net->ipv4.dev_addr_genid));
2639 return 0;
2640 }
2641
2642 static __net_initdata struct pernet_operations rt_genid_ops = {
2643 .init = rt_genid_init,
2644 };
2645
2646 static int __net_init ipv4_inetpeer_init(struct net *net)
2647 {
2648 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2649
2650 if (!bp)
2651 return -ENOMEM;
2652 inet_peer_base_init(bp);
2653 net->ipv4.peers = bp;
2654 return 0;
2655 }
2656
2657 static void __net_exit ipv4_inetpeer_exit(struct net *net)
2658 {
2659 struct inet_peer_base *bp = net->ipv4.peers;
2660
2661 net->ipv4.peers = NULL;
2662 inetpeer_invalidate_tree(bp);
2663 kfree(bp);
2664 }
2665
2666 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
2667 .init = ipv4_inetpeer_init,
2668 .exit = ipv4_inetpeer_exit,
2669 };
2670
2671 #ifdef CONFIG_IP_ROUTE_CLASSID
2672 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
2673 #endif /* CONFIG_IP_ROUTE_CLASSID */
2674
2675 int __init ip_rt_init(void)
2676 {
2677 int rc = 0;
2678
2679 #ifdef CONFIG_IP_ROUTE_CLASSID
2680 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
2681 if (!ip_rt_acct)
2682 panic("IP: failed to allocate ip_rt_acct\n");
2683 #endif
2684
2685 ipv4_dst_ops.kmem_cachep =
2686 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2687 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2688
2689 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2690
2691 if (dst_entries_init(&ipv4_dst_ops) < 0)
2692 panic("IP: failed to allocate ipv4_dst_ops counter\n");
2693
2694 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
2695 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
2696
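/* With the per-flow route cache gone, dst accounting limits are
 * effectively disabled here: no gc threshold and no size cap.
 */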
2697 ipv4_dst_ops.gc_thresh = ~0;
2698 ip_rt_max_size = INT_MAX;
2699
2700 devinet_init();
2701 ip_fib_init();
2702
2703 if (ip_rt_proc_init())
2704 pr_err("Unable to create route proc files\n");
2705 #ifdef CONFIG_XFRM
2706 xfrm_init();
2707 xfrm4_init();
2708 #endif
2709 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
2710
2711 #ifdef CONFIG_SYSCTL
2712 register_pernet_subsys(&sysctl_route_ops);
2713 #endif
2714 register_pernet_subsys(&rt_genid_ops);
2715 register_pernet_subsys(&ipv4_inetpeer_ops);
2716 return rc;
2717 }
2718
2719 #ifdef CONFIG_SYSCTL
2720 /*
2721 * We really need to sanitize the damn ipv4 init order, then all
2722 * this nonsense will go away.
2723 */
2724 void __init ip_static_sysctl_init(void)
2725 {
2726 register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
2727 }
2728 #endif
2729