1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
6 *
7 * ROUTE - implementation of the IP router.
8 *
9 * Authors: Ross Biro
10 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
11 * Alan Cox, <gw4pts@gw4pts.ampr.org>
12 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
13 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
14 *
15 * Fixes:
16 * Alan Cox : Verify area fixes.
17 * Alan Cox : cli() protects routing changes
18 * Rui Oliveira : ICMP routing table updates
19 * (rco@di.uminho.pt) Routing table insertion and update
20 * Linus Torvalds : Rewrote bits to be sensible
21 * Alan Cox : Added BSD route gw semantics
22 * Alan Cox : Super /proc >4K
23 * Alan Cox : MTU in route table
24 * Alan Cox : MSS actually. Also added the window
25 * clamper.
26 * Sam Lantinga : Fixed route matching in rt_del()
27 * Alan Cox : Routing cache support.
28 * Alan Cox : Removed compatibility cruft.
29 * Alan Cox : RTF_REJECT support.
30 * Alan Cox : TCP irtt support.
31 * Jonathan Naylor : Added Metric support.
32 * Miquel van Smoorenburg : BSD API fixes.
33 * Miquel van Smoorenburg : Metrics.
34 * Alan Cox : Use __u32 properly
35 * Alan Cox : Aligned routing errors more closely with BSD
36 * our system is still very different.
37 * Alan Cox : Faster /proc handling
38 * Alexey Kuznetsov : Massive rework to support tree based routing,
39 * routing caches and better behaviour.
40 *
41 * Olaf Erb : irtt wasn't being copied right.
42 * Bjorn Ekwall : Kerneld route support.
43 * Alan Cox : Multicast fixed (I hope)
44 * Pavel Krauz : Limited broadcast fixed
45 * Mike McLagan : Routing by source
46 * Alexey Kuznetsov : End of old history. Split to fib.c and
47 * route.c and rewritten from scratch.
48 * Andi Kleen : Load-limit warning messages.
49 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
50 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
51 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
52 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
53 * Marc Boucher : routing by fwmark
54 * Robert Olsson : Added rt_cache statistics
55 * Arnaldo C. Melo : Convert proc stuff to seq_file
56 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
57 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
58 * Ilia Sotnikov : Removed TOS from hash calculations
59 */
60
61 #define pr_fmt(fmt) "IPv4: " fmt
62
63 #include <linux/module.h>
64 #include <linux/uaccess.h>
65 #include <linux/bitops.h>
66 #include <linux/types.h>
67 #include <linux/kernel.h>
68 #include <linux/mm.h>
69 #include <linux/memblock.h>
70 #include <linux/string.h>
71 #include <linux/socket.h>
72 #include <linux/sockios.h>
73 #include <linux/errno.h>
74 #include <linux/in.h>
75 #include <linux/inet.h>
76 #include <linux/netdevice.h>
77 #include <linux/proc_fs.h>
78 #include <linux/init.h>
79 #include <linux/skbuff.h>
80 #include <linux/inetdevice.h>
81 #include <linux/igmp.h>
82 #include <linux/pkt_sched.h>
83 #include <linux/mroute.h>
84 #include <linux/netfilter_ipv4.h>
85 #include <linux/random.h>
86 #include <linux/rcupdate.h>
87 #include <linux/times.h>
88 #include <linux/slab.h>
89 #include <linux/jhash.h>
90 #include <net/dst.h>
91 #include <net/dst_metadata.h>
92 #include <net/net_namespace.h>
93 #include <net/protocol.h>
94 #include <net/ip.h>
95 #include <net/route.h>
96 #include <net/inetpeer.h>
97 #include <net/sock.h>
98 #include <net/ip_fib.h>
99 #include <net/nexthop.h>
100 #include <net/arp.h>
101 #include <net/tcp.h>
102 #include <net/icmp.h>
103 #include <net/xfrm.h>
104 #include <net/lwtunnel.h>
105 #include <net/netevent.h>
106 #include <net/rtnetlink.h>
107 #ifdef CONFIG_SYSCTL
108 #include <linux/sysctl.h>
109 #endif
110 #include <net/secure_seq.h>
111 #include <net/ip_tunnels.h>
112 #include <net/l3mdev.h>
113
114 #include "fib_lookup.h"
115
116 #define RT_FL_TOS(oldflp4) \
117 ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
118
119 #define RT_GC_TIMEOUT (300*HZ)
120
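/* Default values for the IPv4 routing tunables below (redirect rate
 * limiting, error-cost accounting, PMTU expiry and clamping); most of
 * them can be adjusted at run time via sysctl.
 */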
121 static int ip_rt_max_size;
122 static int ip_rt_redirect_number __read_mostly = 9;
123 static int ip_rt_redirect_load __read_mostly = HZ / 50;
124 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
125 static int ip_rt_error_cost __read_mostly = HZ;
126 static int ip_rt_error_burst __read_mostly = 5 * HZ;
127 static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
128 static u32 ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
129 static int ip_rt_min_advmss __read_mostly = 256;
130
131 static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
132
133 /*
134 * Interface to generic destination cache.
135 */
136
137 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
138 static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
139 static unsigned int ipv4_mtu(const struct dst_entry *dst);
140 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
141 static void ipv4_link_failure(struct sk_buff *skb);
142 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
143 struct sk_buff *skb, u32 mtu,
144 bool confirm_neigh);
145 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk,
146 struct sk_buff *skb);
147 static void ipv4_dst_destroy(struct dst_entry *dst);
148
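/* IPv4 routes never copy-on-write their metrics: they either share the
 * fib_info metrics or use the default metrics, so reaching this callback
 * indicates a bug.
 */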
149 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
150 {
151 WARN_ON(1);
152 return NULL;
153 }
154
155 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
156 struct sk_buff *skb,
157 const void *daddr);
158 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
159
160 static struct dst_ops ipv4_dst_ops = {
161 .family = AF_INET,
162 .check = ipv4_dst_check,
163 .default_advmss = ipv4_default_advmss,
164 .mtu = ipv4_mtu,
165 .cow_metrics = ipv4_cow_metrics,
166 .destroy = ipv4_dst_destroy,
167 .negative_advice = ipv4_negative_advice,
168 .link_failure = ipv4_link_failure,
169 .update_pmtu = ip_rt_update_pmtu,
170 .redirect = ip_do_redirect,
171 .local_out = __ip_local_out,
172 .neigh_lookup = ipv4_neigh_lookup,
173 .confirm_neigh = ipv4_confirm_neigh,
174 };
175
176 #define ECN_OR_COST(class) TC_PRIO_##class
177
178 const __u8 ip_tos2prio[16] = {
179 TC_PRIO_BESTEFFORT,
180 ECN_OR_COST(BESTEFFORT),
181 TC_PRIO_BESTEFFORT,
182 ECN_OR_COST(BESTEFFORT),
183 TC_PRIO_BULK,
184 ECN_OR_COST(BULK),
185 TC_PRIO_BULK,
186 ECN_OR_COST(BULK),
187 TC_PRIO_INTERACTIVE,
188 ECN_OR_COST(INTERACTIVE),
189 TC_PRIO_INTERACTIVE,
190 ECN_OR_COST(INTERACTIVE),
191 TC_PRIO_INTERACTIVE_BULK,
192 ECN_OR_COST(INTERACTIVE_BULK),
193 TC_PRIO_INTERACTIVE_BULK,
194 ECN_OR_COST(INTERACTIVE_BULK)
195 };
196 EXPORT_SYMBOL(ip_tos2prio);
197
198 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
199 #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
200
201 #ifdef CONFIG_PROC_FS
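/* /proc/net/rt_cache is kept for compatibility only: the per-destination
 * IPv4 routing cache is gone, so the seq_file below emits just the header
 * line and no entries.
 */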
202 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
203 {
204 if (*pos)
205 return NULL;
206 return SEQ_START_TOKEN;
207 }
208
209 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
210 {
211 ++*pos;
212 return NULL;
213 }
214
215 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
216 {
217 }
218
219 static int rt_cache_seq_show(struct seq_file *seq, void *v)
220 {
221 if (v == SEQ_START_TOKEN)
222 seq_printf(seq, "%-127s\n",
223 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
224 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
225 "HHUptod\tSpecDst");
226 return 0;
227 }
228
229 static const struct seq_operations rt_cache_seq_ops = {
230 .start = rt_cache_seq_start,
231 .next = rt_cache_seq_next,
232 .stop = rt_cache_seq_stop,
233 .show = rt_cache_seq_show,
234 };
235
236 static int rt_cache_seq_open(struct inode *inode, struct file *file)
237 {
238 return seq_open(file, &rt_cache_seq_ops);
239 }
240
241 static const struct proc_ops rt_cache_proc_ops = {
242 .proc_open = rt_cache_seq_open,
243 .proc_read = seq_read,
244 .proc_lseek = seq_lseek,
245 .proc_release = seq_release,
246 };
247
248
249 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
250 {
251 int cpu;
252
253 if (*pos == 0)
254 return SEQ_START_TOKEN;
255
256 for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
257 if (!cpu_possible(cpu))
258 continue;
259 *pos = cpu+1;
260 return &per_cpu(rt_cache_stat, cpu);
261 }
262 return NULL;
263 }
264
265 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
266 {
267 int cpu;
268
269 for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
270 if (!cpu_possible(cpu))
271 continue;
272 *pos = cpu+1;
273 return &per_cpu(rt_cache_stat, cpu);
274 }
275 (*pos)++;
276 return NULL;
277
278 }
279
280 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
281 {
282
283 }
284
285 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
286 {
287 struct rt_cache_stat *st = v;
288
289 if (v == SEQ_START_TOKEN) {
290 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
291 return 0;
292 }
293
294 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
295 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
296 dst_entries_get_slow(&ipv4_dst_ops),
297 0, /* st->in_hit */
298 st->in_slow_tot,
299 st->in_slow_mc,
300 st->in_no_route,
301 st->in_brd,
302 st->in_martian_dst,
303 st->in_martian_src,
304
305 0, /* st->out_hit */
306 st->out_slow_tot,
307 st->out_slow_mc,
308
309 0, /* st->gc_total */
310 0, /* st->gc_ignored */
311 0, /* st->gc_goal_miss */
312 0, /* st->gc_dst_overflow */
313 0, /* st->in_hlist_search */
314 0 /* st->out_hlist_search */
315 );
316 return 0;
317 }
318
319 static const struct seq_operations rt_cpu_seq_ops = {
320 .start = rt_cpu_seq_start,
321 .next = rt_cpu_seq_next,
322 .stop = rt_cpu_seq_stop,
323 .show = rt_cpu_seq_show,
324 };
325
326
327 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
328 {
329 return seq_open(file, &rt_cpu_seq_ops);
330 }
331
332 static const struct proc_ops rt_cpu_proc_ops = {
333 .proc_open = rt_cpu_seq_open,
334 .proc_read = seq_read,
335 .proc_lseek = seq_lseek,
336 .proc_release = seq_release,
337 };
338
339 #ifdef CONFIG_IP_ROUTE_CLASSID
340 static int rt_acct_proc_show(struct seq_file *m, void *v)
341 {
342 struct ip_rt_acct *dst, *src;
343 unsigned int i, j;
344
345 dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
346 if (!dst)
347 return -ENOMEM;
348
349 for_each_possible_cpu(i) {
350 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
351 for (j = 0; j < 256; j++) {
352 dst[j].o_bytes += src[j].o_bytes;
353 dst[j].o_packets += src[j].o_packets;
354 dst[j].i_bytes += src[j].i_bytes;
355 dst[j].i_packets += src[j].i_packets;
356 }
357 }
358
359 seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
360 kfree(dst);
361 return 0;
362 }
363 #endif
364
365 static int __net_init ip_rt_do_proc_init(struct net *net)
366 {
367 struct proc_dir_entry *pde;
368
369 pde = proc_create("rt_cache", 0444, net->proc_net,
370 &rt_cache_proc_ops);
371 if (!pde)
372 goto err1;
373
374 pde = proc_create("rt_cache", 0444,
375 net->proc_net_stat, &rt_cpu_proc_ops);
376 if (!pde)
377 goto err2;
378
379 #ifdef CONFIG_IP_ROUTE_CLASSID
380 pde = proc_create_single("rt_acct", 0, net->proc_net,
381 rt_acct_proc_show);
382 if (!pde)
383 goto err3;
384 #endif
385 return 0;
386
387 #ifdef CONFIG_IP_ROUTE_CLASSID
388 err3:
389 remove_proc_entry("rt_cache", net->proc_net_stat);
390 #endif
391 err2:
392 remove_proc_entry("rt_cache", net->proc_net);
393 err1:
394 return -ENOMEM;
395 }
396
397 static void __net_exit ip_rt_do_proc_exit(struct net *net)
398 {
399 remove_proc_entry("rt_cache", net->proc_net_stat);
400 remove_proc_entry("rt_cache", net->proc_net);
401 #ifdef CONFIG_IP_ROUTE_CLASSID
402 remove_proc_entry("rt_acct", net->proc_net);
403 #endif
404 }
405
406 static struct pernet_operations ip_rt_proc_ops __net_initdata = {
407 .init = ip_rt_do_proc_init,
408 .exit = ip_rt_do_proc_exit,
409 };
410
411 static int __init ip_rt_proc_init(void)
412 {
413 return register_pernet_subsys(&ip_rt_proc_ops);
414 }
415
416 #else
417 static inline int ip_rt_proc_init(void)
418 {
419 return 0;
420 }
421 #endif /* CONFIG_PROC_FS */
422
423 static inline bool rt_is_expired(const struct rtable *rth)
424 {
425 return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
426 }
427
428 void rt_cache_flush(struct net *net)
429 {
430 rt_genid_bump_ipv4(net);
431 }
432
433 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
434 struct sk_buff *skb,
435 const void *daddr)
436 {
437 const struct rtable *rt = container_of(dst, struct rtable, dst);
438 struct net_device *dev = dst->dev;
439 struct neighbour *n;
440
441 rcu_read_lock_bh();
442
443 if (likely(rt->rt_gw_family == AF_INET)) {
444 n = ip_neigh_gw4(dev, rt->rt_gw4);
445 } else if (rt->rt_gw_family == AF_INET6) {
446 n = ip_neigh_gw6(dev, &rt->rt_gw6);
447 } else {
448 __be32 pkey;
449
450 pkey = skb ? ip_hdr(skb)->daddr : *((__be32 *) daddr);
451 n = ip_neigh_gw4(dev, pkey);
452 }
453
454 if (!IS_ERR(n) && !refcount_inc_not_zero(&n->refcnt))
455 n = NULL;
456
457 rcu_read_unlock_bh();
458
459 return n;
460 }
461
462 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
463 {
464 const struct rtable *rt = container_of(dst, struct rtable, dst);
465 struct net_device *dev = dst->dev;
466 const __be32 *pkey = daddr;
467
468 if (rt->rt_gw_family == AF_INET) {
469 pkey = (const __be32 *)&rt->rt_gw4;
470 } else if (rt->rt_gw_family == AF_INET6) {
471 return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6);
472 } else if (!daddr ||
473 (rt->rt_flags &
474 (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) {
475 return;
476 }
477 __ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
478 }
479
480 /* Hash tables of size 2048..262144 depending on RAM size.
481 * Each bucket uses 8 bytes.
482 */
483 static u32 ip_idents_mask __read_mostly;
484 static atomic_t *ip_idents __read_mostly;
485 static u32 *ip_tstamps __read_mostly;
486
487 /* In order to protect privacy, we add a perturbation to identifiers
488 * if one generator is seldom used. This makes it hard for an attacker
489 * to infer how many packets were sent between two points in time.
490 */
491 u32 ip_idents_reserve(u32 hash, int segs)
492 {
493 u32 bucket, old, now = (u32)jiffies;
494 atomic_t *p_id;
495 u32 *p_tstamp;
496 u32 delta = 0;
497
498 bucket = hash & ip_idents_mask;
499 p_tstamp = ip_tstamps + bucket;
500 p_id = ip_idents + bucket;
501 old = READ_ONCE(*p_tstamp);
502
503 if (old != now && cmpxchg(p_tstamp, old, now) == old)
504 delta = prandom_u32_max(now - old);
505
506 /* If UBSAN reports an error here, please make sure your compiler
507 * supports -fno-strict-overflow before reporting it: that was a bug
508 * in UBSAN, and it has been fixed in GCC 8.
509 */
510 return atomic_add_return(segs + delta, p_id) - segs;
511 }
512 EXPORT_SYMBOL(ip_idents_reserve);
513
514 void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
515 {
516 u32 hash, id;
517
518 /* Note the following code is not safe, but this is okay. */
519 if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
520 get_random_bytes(&net->ipv4.ip_id_key,
521 sizeof(net->ipv4.ip_id_key));
522
523 hash = siphash_3u32((__force u32)iph->daddr,
524 (__force u32)iph->saddr,
525 iph->protocol,
526 &net->ipv4.ip_id_key);
527 id = ip_idents_reserve(hash, segs);
528 iph->id = htons(id);
529 }
530 EXPORT_SYMBOL(__ip_select_ident);
531
532 static void ip_rt_fix_tos(struct flowi4 *fl4)
533 {
534 __u8 tos = RT_FL_TOS(fl4);
535
536 fl4->flowi4_tos = tos & IPTOS_RT_MASK;
537 fl4->flowi4_scope = tos & RTO_ONLINK ?
538 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE;
539 }
540
541 static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
542 const struct sock *sk,
543 const struct iphdr *iph,
544 int oif, u8 tos,
545 u8 prot, u32 mark, int flow_flags)
546 {
547 if (sk) {
548 const struct inet_sock *inet = inet_sk(sk);
549
550 oif = sk->sk_bound_dev_if;
551 mark = sk->sk_mark;
552 tos = RT_CONN_FLAGS(sk);
553 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
554 }
555 flowi4_init_output(fl4, oif, mark, tos,
556 RT_SCOPE_UNIVERSE, prot,
557 flow_flags,
558 iph->daddr, iph->saddr, 0, 0,
559 sock_net_uid(net, sk));
560 }
561
562 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
563 const struct sock *sk)
564 {
565 const struct net *net = dev_net(skb->dev);
566 const struct iphdr *iph = ip_hdr(skb);
567 int oif = skb->dev->ifindex;
568 u8 tos = RT_TOS(iph->tos);
569 u8 prot = iph->protocol;
570 u32 mark = skb->mark;
571
572 __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
573 }
574
575 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
576 {
577 const struct inet_sock *inet = inet_sk(sk);
578 const struct ip_options_rcu *inet_opt;
579 __be32 daddr = inet->inet_daddr;
580
581 rcu_read_lock();
582 inet_opt = rcu_dereference(inet->inet_opt);
583 if (inet_opt && inet_opt->opt.srr)
584 daddr = inet_opt->opt.faddr;
585 flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
586 RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
587 inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
588 inet_sk_flowi_flags(sk),
589 daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
590 rcu_read_unlock();
591 }
592
593 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
594 const struct sk_buff *skb)
595 {
596 if (skb)
597 build_skb_flow_key(fl4, skb, sk);
598 else
599 build_sk_flow_key(fl4, sk);
600 }
601
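/* fnhe_lock serializes all updates to the per-nexthop exception hash
 * (creation, PMTU/redirect updates and removal); lookups are lockless
 * under RCU.
 */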
602 static DEFINE_SPINLOCK(fnhe_lock);
603
604 static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
605 {
606 struct rtable *rt;
607
608 rt = rcu_dereference(fnhe->fnhe_rth_input);
609 if (rt) {
610 RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
611 dst_dev_put(&rt->dst);
612 dst_release(&rt->dst);
613 }
614 rt = rcu_dereference(fnhe->fnhe_rth_output);
615 if (rt) {
616 RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
617 dst_dev_put(&rt->dst);
618 dst_release(&rt->dst);
619 }
620 }
621
622 static void fnhe_remove_oldest(struct fnhe_hash_bucket *hash)
623 {
624 struct fib_nh_exception __rcu **fnhe_p, **oldest_p;
625 struct fib_nh_exception *fnhe, *oldest = NULL;
626
627 for (fnhe_p = &hash->chain; ; fnhe_p = &fnhe->fnhe_next) {
628 fnhe = rcu_dereference_protected(*fnhe_p,
629 lockdep_is_held(&fnhe_lock));
630 if (!fnhe)
631 break;
632 if (!oldest ||
633 time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp)) {
634 oldest = fnhe;
635 oldest_p = fnhe_p;
636 }
637 }
638 fnhe_flush_routes(oldest);
639 *oldest_p = oldest->fnhe_next;
640 kfree_rcu(oldest, rcu);
641 }
642
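/* Hash the destination with a boot-time random siphash key so remote
 * hosts cannot predict which exception bucket their packets land in.
 */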
643 static u32 fnhe_hashfun(__be32 daddr)
644 {
645 static siphash_key_t fnhe_hash_key __read_mostly;
646 u64 hval;
647
648 net_get_random_once(&fnhe_hash_key, sizeof(fnhe_hash_key));
649 hval = siphash_1u32((__force u32)daddr, &fnhe_hash_key);
650 return hash_64(hval, FNHE_HASH_SHIFT);
651 }
652
653 static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
654 {
655 rt->rt_pmtu = fnhe->fnhe_pmtu;
656 rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
657 rt->dst.expires = fnhe->fnhe_expires;
658
659 if (fnhe->fnhe_gw) {
660 rt->rt_flags |= RTCF_REDIRECTED;
661 rt->rt_uses_gateway = 1;
662 rt->rt_gw_family = AF_INET;
663 rt->rt_gw4 = fnhe->fnhe_gw;
664 }
665 }
666
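/* Record a learned redirect gateway and/or PMTU for @daddr as a nexthop
 * exception. The exception hash is allocated on demand, chain depth is
 * bounded by evicting the oldest entry, and cached routes derived from
 * this nexthop are refreshed or marked obsolete so they pick up the new
 * information.
 */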
667 static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr,
668 __be32 gw, u32 pmtu, bool lock,
669 unsigned long expires)
670 {
671 struct fnhe_hash_bucket *hash;
672 struct fib_nh_exception *fnhe;
673 struct rtable *rt;
674 u32 genid, hval;
675 unsigned int i;
676 int depth;
677
678 genid = fnhe_genid(dev_net(nhc->nhc_dev));
679 hval = fnhe_hashfun(daddr);
680
681 spin_lock_bh(&fnhe_lock);
682
683 hash = rcu_dereference(nhc->nhc_exceptions);
684 if (!hash) {
685 hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
686 if (!hash)
687 goto out_unlock;
688 rcu_assign_pointer(nhc->nhc_exceptions, hash);
689 }
690
691 hash += hval;
692
693 depth = 0;
694 for (fnhe = rcu_dereference(hash->chain); fnhe;
695 fnhe = rcu_dereference(fnhe->fnhe_next)) {
696 if (fnhe->fnhe_daddr == daddr)
697 break;
698 depth++;
699 }
700
701 if (fnhe) {
702 if (fnhe->fnhe_genid != genid)
703 fnhe->fnhe_genid = genid;
704 if (gw)
705 fnhe->fnhe_gw = gw;
706 if (pmtu) {
707 fnhe->fnhe_pmtu = pmtu;
708 fnhe->fnhe_mtu_locked = lock;
709 }
710 fnhe->fnhe_expires = max(1UL, expires);
711 /* Update all cached dsts too */
712 rt = rcu_dereference(fnhe->fnhe_rth_input);
713 if (rt)
714 fill_route_from_fnhe(rt, fnhe);
715 rt = rcu_dereference(fnhe->fnhe_rth_output);
716 if (rt)
717 fill_route_from_fnhe(rt, fnhe);
718 } else {
719 /* Randomize max depth to avoid some side channels attacks. */
720 int max_depth = FNHE_RECLAIM_DEPTH +
721 prandom_u32_max(FNHE_RECLAIM_DEPTH);
722
723 while (depth > max_depth) {
724 fnhe_remove_oldest(hash);
725 depth--;
726 }
727
728 fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
729 if (!fnhe)
730 goto out_unlock;
731
732 fnhe->fnhe_next = hash->chain;
733
734 fnhe->fnhe_genid = genid;
735 fnhe->fnhe_daddr = daddr;
736 fnhe->fnhe_gw = gw;
737 fnhe->fnhe_pmtu = pmtu;
738 fnhe->fnhe_mtu_locked = lock;
739 fnhe->fnhe_expires = max(1UL, expires);
740
741 rcu_assign_pointer(hash->chain, fnhe);
742
743 /* Exception created; mark the cached routes for the nexthop
744 * stale, so anyone caching it rechecks if this exception
745 * applies to them.
746 */
747 rt = rcu_dereference(nhc->nhc_rth_input);
748 if (rt)
749 rt->dst.obsolete = DST_OBSOLETE_KILL;
750
751 for_each_possible_cpu(i) {
752 struct rtable __rcu **prt;
753 prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i);
754 rt = rcu_dereference(*prt);
755 if (rt)
756 rt->dst.obsolete = DST_OBSOLETE_KILL;
757 }
758 }
759
760 fnhe->fnhe_stamp = jiffies;
761
762 out_unlock:
763 spin_unlock_bh(&fnhe_lock);
764 }
765
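/* Validate an ICMP redirect and, if it passes the sanity checks (sane
 * redirect code, expected old gateway, usable and on-link new gateway),
 * remember the new gateway as a nexthop exception and optionally kill
 * the current cached route.
 */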
766 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
767 bool kill_route)
768 {
769 __be32 new_gw = icmp_hdr(skb)->un.gateway;
770 __be32 old_gw = ip_hdr(skb)->saddr;
771 struct net_device *dev = skb->dev;
772 struct in_device *in_dev;
773 struct fib_result res;
774 struct neighbour *n;
775 struct net *net;
776
777 switch (icmp_hdr(skb)->code & 7) {
778 case ICMP_REDIR_NET:
779 case ICMP_REDIR_NETTOS:
780 case ICMP_REDIR_HOST:
781 case ICMP_REDIR_HOSTTOS:
782 break;
783
784 default:
785 return;
786 }
787
788 if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw)
789 return;
790
791 in_dev = __in_dev_get_rcu(dev);
792 if (!in_dev)
793 return;
794
795 net = dev_net(dev);
796 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
797 ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
798 ipv4_is_zeronet(new_gw))
799 goto reject_redirect;
800
801 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
802 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
803 goto reject_redirect;
804 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
805 goto reject_redirect;
806 } else {
807 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
808 goto reject_redirect;
809 }
810
811 n = __ipv4_neigh_lookup(rt->dst.dev, (__force u32)new_gw);
812 if (!n)
813 n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
814 if (!IS_ERR(n)) {
815 if (!(n->nud_state & NUD_VALID)) {
816 neigh_event_send(n, NULL);
817 } else {
818 if (fib_lookup(net, fl4, &res, 0) == 0) {
819 struct fib_nh_common *nhc;
820
821 fib_select_path(net, &res, fl4, skb);
822 nhc = FIB_RES_NHC(res);
823 update_or_create_fnhe(nhc, fl4->daddr, new_gw,
824 0, false,
825 jiffies + ip_rt_gc_timeout);
826 }
827 if (kill_route)
828 rt->dst.obsolete = DST_OBSOLETE_KILL;
829 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
830 }
831 neigh_release(n);
832 }
833 return;
834
835 reject_redirect:
836 #ifdef CONFIG_IP_ROUTE_VERBOSE
837 if (IN_DEV_LOG_MARTIANS(in_dev)) {
838 const struct iphdr *iph = (const struct iphdr *) skb->data;
839 __be32 daddr = iph->daddr;
840 __be32 saddr = iph->saddr;
841
842 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
843 " Advised path = %pI4 -> %pI4\n",
844 &old_gw, dev->name, &new_gw,
845 &saddr, &daddr);
846 }
847 #endif
848 ;
849 }
850
851 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
852 {
853 struct rtable *rt;
854 struct flowi4 fl4;
855 const struct iphdr *iph = (const struct iphdr *) skb->data;
856 struct net *net = dev_net(skb->dev);
857 int oif = skb->dev->ifindex;
858 u8 tos = RT_TOS(iph->tos);
859 u8 prot = iph->protocol;
860 u32 mark = skb->mark;
861
862 rt = (struct rtable *) dst;
863
864 __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
865 ip_rt_fix_tos(&fl4);
866 __ip_do_redirect(rt, skb, &fl4, true);
867 }
868
869 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
870 {
871 struct rtable *rt = (struct rtable *)dst;
872 struct dst_entry *ret = dst;
873
874 if (rt) {
875 if (dst->obsolete > 0) {
876 ip_rt_put(rt);
877 ret = NULL;
878 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
879 rt->dst.expires) {
880 ip_rt_put(rt);
881 ret = NULL;
882 }
883 }
884 return ret;
885 }
886
887 /*
888 * Algorithm:
889 * 1. The first ip_rt_redirect_number redirects are sent
890 * with exponential backoff, then we stop sending them at all,
891 * assuming that the host ignores our redirects.
892 * 2. If we did not see packets requiring redirects
893 * during ip_rt_redirect_silence, we assume that the host
894 * forgot the redirected route and start sending redirects again.
895 *
896 * This algorithm is much cheaper and more intelligent than dumb load limiting
897 * in icmp.c.
898 *
899 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
900 * and "frag. need" (breaks PMTU discovery) in icmp.c.
901 */
902
903 void ip_rt_send_redirect(struct sk_buff *skb)
904 {
905 struct rtable *rt = skb_rtable(skb);
906 struct in_device *in_dev;
907 struct inet_peer *peer;
908 struct net *net;
909 int log_martians;
910 int vif;
911
912 rcu_read_lock();
913 in_dev = __in_dev_get_rcu(rt->dst.dev);
914 if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
915 rcu_read_unlock();
916 return;
917 }
918 log_martians = IN_DEV_LOG_MARTIANS(in_dev);
919 vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
920 rcu_read_unlock();
921
922 net = dev_net(rt->dst.dev);
923 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
924 if (!peer) {
925 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
926 rt_nexthop(rt, ip_hdr(skb)->daddr));
927 return;
928 }
929
930 /* No redirected packets during ip_rt_redirect_silence;
931 * reset the algorithm.
932 */
933 if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
934 peer->rate_tokens = 0;
935 peer->n_redirects = 0;
936 }
937
938 /* Too many ignored redirects; do not send anything,
939 * just set peer->rate_last to the time of the last seen redirected packet.
940 */
941 if (peer->n_redirects >= ip_rt_redirect_number) {
942 peer->rate_last = jiffies;
943 goto out_put_peer;
944 }
945
946 /* Check for load limit; set rate_last to the latest sent
947 * redirect.
948 */
949 if (peer->n_redirects == 0 ||
950 time_after(jiffies,
951 (peer->rate_last +
952 (ip_rt_redirect_load << peer->n_redirects)))) {
953 __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
954
955 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
956 peer->rate_last = jiffies;
957 ++peer->n_redirects;
958 #ifdef CONFIG_IP_ROUTE_VERBOSE
959 if (log_martians &&
960 peer->n_redirects == ip_rt_redirect_number)
961 net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
962 &ip_hdr(skb)->saddr, inet_iif(skb),
963 &ip_hdr(skb)->daddr, &gw);
964 #endif
965 }
966 out_put_peer:
967 inet_putpeer(peer);
968 }
969
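/* Input error path: a packet hit a route whose dst.error is set. Update
 * the relevant SNMP counters and send a rate-limited ICMP destination
 * unreachable, using a token bucket kept in the inetpeer entry for the
 * source address.
 */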
970 static int ip_error(struct sk_buff *skb)
971 {
972 struct rtable *rt = skb_rtable(skb);
973 struct net_device *dev = skb->dev;
974 struct in_device *in_dev;
975 struct inet_peer *peer;
976 unsigned long now;
977 struct net *net;
978 bool send;
979 int code;
980
981 if (netif_is_l3_master(skb->dev)) {
982 dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
983 if (!dev)
984 goto out;
985 }
986
987 in_dev = __in_dev_get_rcu(dev);
988
989 /* IP on this device is disabled. */
990 if (!in_dev)
991 goto out;
992
993 net = dev_net(rt->dst.dev);
994 if (!IN_DEV_FORWARD(in_dev)) {
995 switch (rt->dst.error) {
996 case EHOSTUNREACH:
997 __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
998 break;
999
1000 case ENETUNREACH:
1001 __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
1002 break;
1003 }
1004 goto out;
1005 }
1006
1007 switch (rt->dst.error) {
1008 case EINVAL:
1009 default:
1010 goto out;
1011 case EHOSTUNREACH:
1012 code = ICMP_HOST_UNREACH;
1013 break;
1014 case ENETUNREACH:
1015 code = ICMP_NET_UNREACH;
1016 __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
1017 break;
1018 case EACCES:
1019 code = ICMP_PKT_FILTERED;
1020 break;
1021 }
1022
1023 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
1024 l3mdev_master_ifindex(skb->dev), 1);
1025
1026 send = true;
1027 if (peer) {
1028 now = jiffies;
1029 peer->rate_tokens += now - peer->rate_last;
1030 if (peer->rate_tokens > ip_rt_error_burst)
1031 peer->rate_tokens = ip_rt_error_burst;
1032 peer->rate_last = now;
1033 if (peer->rate_tokens >= ip_rt_error_cost)
1034 peer->rate_tokens -= ip_rt_error_cost;
1035 else
1036 send = false;
1037 inet_putpeer(peer);
1038 }
1039 if (send)
1040 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1041
1042 out: kfree_skb(skb);
1043 return 0;
1044 }
1045
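/* Apply a learned path MTU to the route: ignore increases and locked
 * routes, clamp values below ip_rt_min_pmtu (locking the MTU in that
 * case), and store the result as a nexthop exception that expires after
 * ip_rt_mtu_expires.
 */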
1046 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1047 {
1048 struct dst_entry *dst = &rt->dst;
1049 struct net *net = dev_net(dst->dev);
1050 struct fib_result res;
1051 bool lock = false;
1052 u32 old_mtu;
1053
1054 if (ip_mtu_locked(dst))
1055 return;
1056
1057 old_mtu = ipv4_mtu(dst);
1058 if (old_mtu < mtu)
1059 return;
1060
1061 if (mtu < ip_rt_min_pmtu) {
1062 lock = true;
1063 mtu = min(old_mtu, ip_rt_min_pmtu);
1064 }
1065
1066 if (rt->rt_pmtu == mtu && !lock &&
1067 time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
1068 return;
1069
1070 rcu_read_lock();
1071 if (fib_lookup(net, fl4, &res, 0) == 0) {
1072 struct fib_nh_common *nhc;
1073
1074 fib_select_path(net, &res, fl4, NULL);
1075 nhc = FIB_RES_NHC(res);
1076 update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
1077 jiffies + ip_rt_mtu_expires);
1078 }
1079 rcu_read_unlock();
1080 }
1081
1082 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1083 struct sk_buff *skb, u32 mtu,
1084 bool confirm_neigh)
1085 {
1086 struct rtable *rt = (struct rtable *) dst;
1087 struct flowi4 fl4;
1088
1089 ip_rt_build_flow_key(&fl4, sk, skb);
1090 ip_rt_fix_tos(&fl4);
1091
1092 /* Don't make lookup fail for bridged encapsulations */
1093 if (skb && netif_is_any_bridge_port(skb->dev))
1094 fl4.flowi4_oif = 0;
1095
1096 __ip_rt_update_pmtu(rt, &fl4, mtu);
1097 }
1098
1099 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1100 int oif, u8 protocol)
1101 {
1102 const struct iphdr *iph = (const struct iphdr *)skb->data;
1103 struct flowi4 fl4;
1104 struct rtable *rt;
1105 u32 mark = IP4_REPLY_MARK(net, skb->mark);
1106
1107 __build_flow_key(net, &fl4, NULL, iph, oif,
1108 RT_TOS(iph->tos), protocol, mark, 0);
1109 rt = __ip_route_output_key(net, &fl4);
1110 if (!IS_ERR(rt)) {
1111 __ip_rt_update_pmtu(rt, &fl4, mtu);
1112 ip_rt_put(rt);
1113 }
1114 }
1115 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1116
1117 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1118 {
1119 const struct iphdr *iph = (const struct iphdr *)skb->data;
1120 struct flowi4 fl4;
1121 struct rtable *rt;
1122
1123 __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1124
1125 if (!fl4.flowi4_mark)
1126 fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1127
1128 rt = __ip_route_output_key(sock_net(sk), &fl4);
1129 if (!IS_ERR(rt)) {
1130 __ip_rt_update_pmtu(rt, &fl4, mtu);
1131 ip_rt_put(rt);
1132 }
1133 }
1134
1135 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1136 {
1137 const struct iphdr *iph = (const struct iphdr *)skb->data;
1138 struct flowi4 fl4;
1139 struct rtable *rt;
1140 struct dst_entry *odst = NULL;
1141 bool new = false;
1142 struct net *net = sock_net(sk);
1143
1144 bh_lock_sock(sk);
1145
1146 if (!ip_sk_accept_pmtu(sk))
1147 goto out;
1148
1149 odst = sk_dst_get(sk);
1150
1151 if (sock_owned_by_user(sk) || !odst) {
1152 __ipv4_sk_update_pmtu(skb, sk, mtu);
1153 goto out;
1154 }
1155
1156 __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1157
1158 rt = (struct rtable *)odst;
1159 if (odst->obsolete && !odst->ops->check(odst, 0)) {
1160 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1161 if (IS_ERR(rt))
1162 goto out;
1163
1164 new = true;
1165 } else {
1166 ip_rt_fix_tos(&fl4);
1167 }
1168
1169 __ip_rt_update_pmtu((struct rtable *)xfrm_dst_path(&rt->dst), &fl4, mtu);
1170
1171 if (!dst_check(&rt->dst, 0)) {
1172 if (new)
1173 dst_release(&rt->dst);
1174
1175 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1176 if (IS_ERR(rt))
1177 goto out;
1178
1179 new = true;
1180 }
1181
1182 if (new)
1183 sk_dst_set(sk, &rt->dst);
1184
1185 out:
1186 bh_unlock_sock(sk);
1187 dst_release(odst);
1188 }
1189 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1190
1191 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1192 int oif, u8 protocol)
1193 {
1194 const struct iphdr *iph = (const struct iphdr *)skb->data;
1195 struct flowi4 fl4;
1196 struct rtable *rt;
1197
1198 __build_flow_key(net, &fl4, NULL, iph, oif,
1199 RT_TOS(iph->tos), protocol, 0, 0);
1200 rt = __ip_route_output_key(net, &fl4);
1201 if (!IS_ERR(rt)) {
1202 __ip_do_redirect(rt, skb, &fl4, false);
1203 ip_rt_put(rt);
1204 }
1205 }
1206 EXPORT_SYMBOL_GPL(ipv4_redirect);
1207
1208 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1209 {
1210 const struct iphdr *iph = (const struct iphdr *)skb->data;
1211 struct flowi4 fl4;
1212 struct rtable *rt;
1213 struct net *net = sock_net(sk);
1214
1215 __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1216 rt = __ip_route_output_key(net, &fl4);
1217 if (!IS_ERR(rt)) {
1218 __ip_do_redirect(rt, skb, &fl4, false);
1219 ip_rt_put(rt);
1220 }
1221 }
1222 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1223
1224 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1225 {
1226 struct rtable *rt = (struct rtable *) dst;
1227
1228 /* All IPV4 dsts are created with ->obsolete set to the value
1229 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1230 * into this function always.
1231 *
1232 * When a PMTU/redirect information update invalidates a route,
1233 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1234 * DST_OBSOLETE_DEAD.
1235 */
1236 if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1237 return NULL;
1238 return dst;
1239 }
1240
1241 static void ipv4_send_dest_unreach(struct sk_buff *skb)
1242 {
1243 struct net_device *dev;
1244 struct ip_options opt;
1245 int res;
1246
1247 /* Recompile ip options since IPCB may not be valid anymore.
1248 * Also check we have a reasonable ipv4 header.
1249 */
1250 if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
1251 ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
1252 return;
1253
1254 memset(&opt, 0, sizeof(opt));
1255 if (ip_hdr(skb)->ihl > 5) {
1256 if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
1257 return;
1258 opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);
1259
1260 rcu_read_lock();
1261 dev = skb->dev ? skb->dev : skb_rtable(skb)->dst.dev;
1262 res = __ip_options_compile(dev_net(dev), &opt, skb, NULL);
1263 rcu_read_unlock();
1264
1265 if (res)
1266 return;
1267 }
1268 __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
1269 }
1270
1271 static void ipv4_link_failure(struct sk_buff *skb)
1272 {
1273 struct rtable *rt;
1274
1275 ipv4_send_dest_unreach(skb);
1276
1277 rt = skb_rtable(skb);
1278 if (rt)
1279 dst_set_expires(&rt->dst, 0);
1280 }
1281
1282 static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1283 {
1284 pr_debug("%s: %pI4 -> %pI4, %s\n",
1285 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1286 skb->dev ? skb->dev->name : "?");
1287 kfree_skb(skb);
1288 WARN_ON(1);
1289 return 0;
1290 }
1291
1292 /*
1293 We do not cache the source address of the outgoing interface,
1294 because it is used only by the IP RR, TS and SRR options,
1295 so it stays out of the fast path.
1296
1297 BTW remember: "addr" is allowed to be unaligned
1298 in IP options!
1299 */
1300
1301 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1302 {
1303 __be32 src;
1304
1305 if (rt_is_output_route(rt))
1306 src = ip_hdr(skb)->saddr;
1307 else {
1308 struct fib_result res;
1309 struct iphdr *iph = ip_hdr(skb);
1310 struct flowi4 fl4 = {
1311 .daddr = iph->daddr,
1312 .saddr = iph->saddr,
1313 .flowi4_tos = RT_TOS(iph->tos),
1314 .flowi4_oif = rt->dst.dev->ifindex,
1315 .flowi4_iif = skb->dev->ifindex,
1316 .flowi4_mark = skb->mark,
1317 };
1318
1319 rcu_read_lock();
1320 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1321 src = fib_result_prefsrc(dev_net(rt->dst.dev), &res);
1322 else
1323 src = inet_select_addr(rt->dst.dev,
1324 rt_nexthop(rt, iph->daddr),
1325 RT_SCOPE_UNIVERSE);
1326 rcu_read_unlock();
1327 }
1328 memcpy(addr, &src, 4);
1329 }
1330
1331 #ifdef CONFIG_IP_ROUTE_CLASSID
1332 static void set_class_tag(struct rtable *rt, u32 tag)
1333 {
1334 if (!(rt->dst.tclassid & 0xFFFF))
1335 rt->dst.tclassid |= tag & 0xFFFF;
1336 if (!(rt->dst.tclassid & 0xFFFF0000))
1337 rt->dst.tclassid |= tag & 0xFFFF0000;
1338 }
1339 #endif
1340
1341 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1342 {
1343 unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
1344 unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
1345 ip_rt_min_advmss);
1346
1347 return min(advmss, IPV4_MAX_PMTU - header_size);
1348 }
1349
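/* MTU for an installed route: prefer a non-expired PMTU learned for this
 * route, then an explicit RTAX_MTU metric, then the device MTU (clamped
 * to 576 for locked routes via a gateway), minus any lwtunnel headroom.
 */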
1350 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1351 {
1352 const struct rtable *rt = (const struct rtable *)dst;
1353 unsigned int mtu = rt->rt_pmtu;
1354
1355 if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1356 mtu = dst_metric_raw(dst, RTAX_MTU);
1357
1358 if (mtu)
1359 goto out;
1360
1361 mtu = READ_ONCE(dst->dev->mtu);
1362
1363 if (unlikely(ip_mtu_locked(dst))) {
1364 if (rt->rt_uses_gateway && mtu > 576)
1365 mtu = 576;
1366 }
1367
1368 out:
1369 mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
1370
1371 return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1372 }
1373
1374 static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr)
1375 {
1376 struct fnhe_hash_bucket *hash;
1377 struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1378 u32 hval = fnhe_hashfun(daddr);
1379
1380 spin_lock_bh(&fnhe_lock);
1381
1382 hash = rcu_dereference_protected(nhc->nhc_exceptions,
1383 lockdep_is_held(&fnhe_lock));
1384 hash += hval;
1385
1386 fnhe_p = &hash->chain;
1387 fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1388 while (fnhe) {
1389 if (fnhe->fnhe_daddr == daddr) {
1390 rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1391 fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1392 /* set fnhe_daddr to 0 to ensure it won't bind with
1393 * new dsts in rt_bind_exception().
1394 */
1395 fnhe->fnhe_daddr = 0;
1396 fnhe_flush_routes(fnhe);
1397 kfree_rcu(fnhe, rcu);
1398 break;
1399 }
1400 fnhe_p = &fnhe->fnhe_next;
1401 fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1402 lockdep_is_held(&fnhe_lock));
1403 }
1404
1405 spin_unlock_bh(&fnhe_lock);
1406 }
1407
1408 static struct fib_nh_exception *find_exception(struct fib_nh_common *nhc,
1409 __be32 daddr)
1410 {
1411 struct fnhe_hash_bucket *hash = rcu_dereference(nhc->nhc_exceptions);
1412 struct fib_nh_exception *fnhe;
1413 u32 hval;
1414
1415 if (!hash)
1416 return NULL;
1417
1418 hval = fnhe_hashfun(daddr);
1419
1420 for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1421 fnhe = rcu_dereference(fnhe->fnhe_next)) {
1422 if (fnhe->fnhe_daddr == daddr) {
1423 if (fnhe->fnhe_expires &&
1424 time_after(jiffies, fnhe->fnhe_expires)) {
1425 ip_del_fnhe(nhc, daddr);
1426 break;
1427 }
1428 return fnhe;
1429 }
1430 }
1431 return NULL;
1432 }
1433
1434 /* MTU selection:
1435 * 1. mtu on route is locked - use it
1436 * 2. mtu from nexthop exception
1437 * 3. mtu from egress device
1438 */
1439
1440 u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
1441 {
1442 struct fib_nh_common *nhc = res->nhc;
1443 struct net_device *dev = nhc->nhc_dev;
1444 struct fib_info *fi = res->fi;
1445 u32 mtu = 0;
1446
1447 if (READ_ONCE(dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu) ||
1448 fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
1449 mtu = fi->fib_mtu;
1450
1451 if (likely(!mtu)) {
1452 struct fib_nh_exception *fnhe;
1453
1454 fnhe = find_exception(nhc, daddr);
1455 if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
1456 mtu = fnhe->fnhe_pmtu;
1457 }
1458
1459 if (likely(!mtu))
1460 mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);
1461
1462 return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu);
1463 }
1464
1465 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1466 __be32 daddr, const bool do_cache)
1467 {
1468 bool ret = false;
1469
1470 spin_lock_bh(&fnhe_lock);
1471
1472 if (daddr == fnhe->fnhe_daddr) {
1473 struct rtable __rcu **porig;
1474 struct rtable *orig;
1475 int genid = fnhe_genid(dev_net(rt->dst.dev));
1476
1477 if (rt_is_input_route(rt))
1478 porig = &fnhe->fnhe_rth_input;
1479 else
1480 porig = &fnhe->fnhe_rth_output;
1481 orig = rcu_dereference(*porig);
1482
1483 if (fnhe->fnhe_genid != genid) {
1484 fnhe->fnhe_genid = genid;
1485 fnhe->fnhe_gw = 0;
1486 fnhe->fnhe_pmtu = 0;
1487 fnhe->fnhe_expires = 0;
1488 fnhe->fnhe_mtu_locked = false;
1489 fnhe_flush_routes(fnhe);
1490 orig = NULL;
1491 }
1492 fill_route_from_fnhe(rt, fnhe);
1493 if (!rt->rt_gw4) {
1494 rt->rt_gw4 = daddr;
1495 rt->rt_gw_family = AF_INET;
1496 }
1497
1498 if (do_cache) {
1499 dst_hold(&rt->dst);
1500 rcu_assign_pointer(*porig, rt);
1501 if (orig) {
1502 dst_dev_put(&orig->dst);
1503 dst_release(&orig->dst);
1504 }
1505 ret = true;
1506 }
1507
1508 fnhe->fnhe_stamp = jiffies;
1509 }
1510 spin_unlock_bh(&fnhe_lock);
1511
1512 return ret;
1513 }
1514
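/* Try to cache @rt in the nexthop: the shared input slot for input
 * routes, or this CPU's output slot otherwise. cmpxchg() guards against
 * a concurrent writer; on success the old entry is moved to the uncached
 * list so it can be released safely.
 */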
1515 static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt)
1516 {
1517 struct rtable *orig, *prev, **p;
1518 bool ret = true;
1519
1520 if (rt_is_input_route(rt)) {
1521 p = (struct rtable **)&nhc->nhc_rth_input;
1522 } else {
1523 p = (struct rtable **)raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
1524 }
1525 orig = *p;
1526
1527 /* hold dst before doing cmpxchg() to avoid race condition
1528 * on this dst
1529 */
1530 dst_hold(&rt->dst);
1531 prev = cmpxchg(p, orig, rt);
1532 if (prev == orig) {
1533 if (orig) {
1534 rt_add_uncached_list(orig);
1535 dst_release(&orig->dst);
1536 }
1537 } else {
1538 dst_release(&rt->dst);
1539 ret = false;
1540 }
1541
1542 return ret;
1543 }
1544
1545 struct uncached_list {
1546 spinlock_t lock;
1547 struct list_head head;
1548 };
1549
1550 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1551
1552 void rt_add_uncached_list(struct rtable *rt)
1553 {
1554 struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1555
1556 rt->rt_uncached_list = ul;
1557
1558 spin_lock_bh(&ul->lock);
1559 list_add_tail(&rt->rt_uncached, &ul->head);
1560 spin_unlock_bh(&ul->lock);
1561 }
1562
1563 void rt_del_uncached_list(struct rtable *rt)
1564 {
1565 if (!list_empty(&rt->rt_uncached)) {
1566 struct uncached_list *ul = rt->rt_uncached_list;
1567
1568 spin_lock_bh(&ul->lock);
1569 list_del(&rt->rt_uncached);
1570 spin_unlock_bh(&ul->lock);
1571 }
1572 }
1573
1574 static void ipv4_dst_destroy(struct dst_entry *dst)
1575 {
1576 struct rtable *rt = (struct rtable *)dst;
1577
1578 ip_dst_metrics_put(dst);
1579 rt_del_uncached_list(rt);
1580 }
1581
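/* Called when a device goes away: repoint uncached routes that still
 * reference it at blackhole_netdev and move the device reference, so the
 * device can be unregistered without waiting for those dsts.
 */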
1582 void rt_flush_dev(struct net_device *dev)
1583 {
1584 struct rtable *rt;
1585 int cpu;
1586
1587 for_each_possible_cpu(cpu) {
1588 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1589
1590 spin_lock_bh(&ul->lock);
1591 list_for_each_entry(rt, &ul->head, rt_uncached) {
1592 if (rt->dst.dev != dev)
1593 continue;
1594 rt->dst.dev = blackhole_netdev;
1595 dev_hold(rt->dst.dev);
1596 dev_put(dev);
1597 }
1598 spin_unlock_bh(&ul->lock);
1599 }
1600 }
1601
1602 static bool rt_cache_valid(const struct rtable *rt)
1603 {
1604 return rt &&
1605 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1606 !rt_is_expired(rt);
1607 }
1608
1609 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1610 const struct fib_result *res,
1611 struct fib_nh_exception *fnhe,
1612 struct fib_info *fi, u16 type, u32 itag,
1613 const bool do_cache)
1614 {
1615 bool cached = false;
1616
1617 if (fi) {
1618 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1619
1620 if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) {
1621 rt->rt_uses_gateway = 1;
1622 rt->rt_gw_family = nhc->nhc_gw_family;
1623 /* only INET and INET6 are supported */
1624 if (likely(nhc->nhc_gw_family == AF_INET))
1625 rt->rt_gw4 = nhc->nhc_gw.ipv4;
1626 else
1627 rt->rt_gw6 = nhc->nhc_gw.ipv6;
1628 }
1629
1630 ip_dst_init_metrics(&rt->dst, fi->fib_metrics);
1631
1632 #ifdef CONFIG_IP_ROUTE_CLASSID
1633 if (nhc->nhc_family == AF_INET) {
1634 struct fib_nh *nh;
1635
1636 nh = container_of(nhc, struct fib_nh, nh_common);
1637 rt->dst.tclassid = nh->nh_tclassid;
1638 }
1639 #endif
1640 rt->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
1641 if (unlikely(fnhe))
1642 cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1643 else if (do_cache)
1644 cached = rt_cache_route(nhc, rt);
1645 if (unlikely(!cached)) {
1646 /* Routes we intend to cache in nexthop exception or
1647 * FIB nexthop have the DST_NOCACHE bit clear.
1648 * However, if we are unsuccessful at storing this
1649 * route into the cache we really need to set it.
1650 */
1651 if (!rt->rt_gw4) {
1652 rt->rt_gw_family = AF_INET;
1653 rt->rt_gw4 = daddr;
1654 }
1655 rt_add_uncached_list(rt);
1656 }
1657 } else
1658 rt_add_uncached_list(rt);
1659
1660 #ifdef CONFIG_IP_ROUTE_CLASSID
1661 #ifdef CONFIG_IP_MULTIPLE_TABLES
1662 set_class_tag(rt, res->tclassid);
1663 #endif
1664 set_class_tag(rt, itag);
1665 #endif
1666 }
1667
1668 struct rtable *rt_dst_alloc(struct net_device *dev,
1669 unsigned int flags, u16 type,
1670 bool nopolicy, bool noxfrm)
1671 {
1672 struct rtable *rt;
1673
1674 rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1675 (nopolicy ? DST_NOPOLICY : 0) |
1676 (noxfrm ? DST_NOXFRM : 0));
1677
1678 if (rt) {
1679 rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1680 rt->rt_flags = flags;
1681 rt->rt_type = type;
1682 rt->rt_is_input = 0;
1683 rt->rt_iif = 0;
1684 rt->rt_pmtu = 0;
1685 rt->rt_mtu_locked = 0;
1686 rt->rt_uses_gateway = 0;
1687 rt->rt_gw_family = 0;
1688 rt->rt_gw4 = 0;
1689 INIT_LIST_HEAD(&rt->rt_uncached);
1690
1691 rt->dst.output = ip_output;
1692 if (flags & RTCF_LOCAL)
1693 rt->dst.input = ip_local_deliver;
1694 }
1695
1696 return rt;
1697 }
1698 EXPORT_SYMBOL(rt_dst_alloc);
1699
1700 struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt)
1701 {
1702 struct rtable *new_rt;
1703
1704 new_rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1705 rt->dst.flags);
1706
1707 if (new_rt) {
1708 new_rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1709 new_rt->rt_flags = rt->rt_flags;
1710 new_rt->rt_type = rt->rt_type;
1711 new_rt->rt_is_input = rt->rt_is_input;
1712 new_rt->rt_iif = rt->rt_iif;
1713 new_rt->rt_pmtu = rt->rt_pmtu;
1714 new_rt->rt_mtu_locked = rt->rt_mtu_locked;
1715 new_rt->rt_gw_family = rt->rt_gw_family;
1716 if (rt->rt_gw_family == AF_INET)
1717 new_rt->rt_gw4 = rt->rt_gw4;
1718 else if (rt->rt_gw_family == AF_INET6)
1719 new_rt->rt_gw6 = rt->rt_gw6;
1720 INIT_LIST_HEAD(&new_rt->rt_uncached);
1721
1722 new_rt->dst.input = rt->dst.input;
1723 new_rt->dst.output = rt->dst.output;
1724 new_rt->dst.error = rt->dst.error;
1725 new_rt->dst.lastuse = jiffies;
1726 new_rt->dst.lwtstate = lwtstate_get(rt->dst.lwtstate);
1727 }
1728 return new_rt;
1729 }
1730 EXPORT_SYMBOL(rt_dst_clone);
1731
1732 /* called in rcu_read_lock() section */
1733 int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1734 u8 tos, struct net_device *dev,
1735 struct in_device *in_dev, u32 *itag)
1736 {
1737 int err;
1738
1739 /* Primary sanity checks. */
1740 if (!in_dev)
1741 return -EINVAL;
1742
1743 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1744 skb->protocol != htons(ETH_P_IP))
1745 return -EINVAL;
1746
1747 if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1748 return -EINVAL;
1749
1750 if (ipv4_is_zeronet(saddr)) {
1751 if (!ipv4_is_local_multicast(daddr) &&
1752 ip_hdr(skb)->protocol != IPPROTO_IGMP)
1753 return -EINVAL;
1754 } else {
1755 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1756 in_dev, itag);
1757 if (err < 0)
1758 return err;
1759 }
1760 return 0;
1761 }
1762
1763 /* called in rcu_read_lock() section */
1764 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1765 u8 tos, struct net_device *dev, int our)
1766 {
1767 struct in_device *in_dev = __in_dev_get_rcu(dev);
1768 unsigned int flags = RTCF_MULTICAST;
1769 struct rtable *rth;
1770 u32 itag = 0;
1771 int err;
1772
1773 err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1774 if (err)
1775 return err;
1776
1777 if (our)
1778 flags |= RTCF_LOCAL;
1779
1780 rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1781 IN_DEV_ORCONF(in_dev, NOPOLICY), false);
1782 if (!rth)
1783 return -ENOBUFS;
1784
1785 #ifdef CONFIG_IP_ROUTE_CLASSID
1786 rth->dst.tclassid = itag;
1787 #endif
1788 rth->dst.output = ip_rt_bug;
1789 rth->rt_is_input = 1;
1790
1791 #ifdef CONFIG_IP_MROUTE
1792 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1793 rth->dst.input = ip_mr_input;
1794 #endif
1795 RT_CACHE_STAT_INC(in_slow_mc);
1796
1797 skb_dst_drop(skb);
1798 skb_dst_set(skb, &rth->dst);
1799 return 0;
1800 }
1801
1802
1803 static void ip_handle_martian_source(struct net_device *dev,
1804 struct in_device *in_dev,
1805 struct sk_buff *skb,
1806 __be32 daddr,
1807 __be32 saddr)
1808 {
1809 RT_CACHE_STAT_INC(in_martian_src);
1810 #ifdef CONFIG_IP_ROUTE_VERBOSE
1811 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1812 /*
1813 * RFC 1812 recommendation: if the source is martian,
1814 * the only hint we can give is the MAC header.
1815 */
1816 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1817 &daddr, &saddr, dev->name);
1818 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1819 print_hex_dump(KERN_WARNING, "ll header: ",
1820 DUMP_PREFIX_OFFSET, 16, 1,
1821 skb_mac_header(skb),
1822 dev->hard_header_len, false);
1823 }
1824 }
1825 #endif
1826 }
1827
1828 /* called in rcu_read_lock() section */
1829 static int __mkroute_input(struct sk_buff *skb,
1830 const struct fib_result *res,
1831 struct in_device *in_dev,
1832 __be32 daddr, __be32 saddr, u32 tos)
1833 {
1834 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1835 struct net_device *dev = nhc->nhc_dev;
1836 struct fib_nh_exception *fnhe;
1837 struct rtable *rth;
1838 int err;
1839 struct in_device *out_dev;
1840 bool do_cache;
1841 u32 itag = 0;
1842
1843 /* get a working reference to the output device */
1844 out_dev = __in_dev_get_rcu(dev);
1845 if (!out_dev) {
1846 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1847 return -EINVAL;
1848 }
1849
1850 err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1851 in_dev->dev, in_dev, &itag);
1852 if (err < 0) {
1853 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1854 saddr);
1855
1856 goto cleanup;
1857 }
1858
1859 do_cache = res->fi && !itag;
1860 if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1861 skb->protocol == htons(ETH_P_IP)) {
1862 __be32 gw;
1863
1864 gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0;
1865 if (IN_DEV_SHARED_MEDIA(out_dev) ||
1866 inet_addr_onlink(out_dev, saddr, gw))
1867 IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1868 }
1869
1870 if (skb->protocol != htons(ETH_P_IP)) {
1871 /* Not IP (i.e. ARP). Do not create a route if it is
1872 * invalid for proxy ARP. DNAT routes are always valid.
1873 *
1874 * The proxy ARP feature has been extended to allow ARP
1875 * replies back on the same interface, to support
1876 * Private VLAN switch technologies. See arp.c.
1877 */
1878 if (out_dev == in_dev &&
1879 IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1880 err = -EINVAL;
1881 goto cleanup;
1882 }
1883 }
1884
1885 fnhe = find_exception(nhc, daddr);
1886 if (do_cache) {
1887 if (fnhe)
1888 rth = rcu_dereference(fnhe->fnhe_rth_input);
1889 else
1890 rth = rcu_dereference(nhc->nhc_rth_input);
1891 if (rt_cache_valid(rth)) {
1892 skb_dst_set_noref(skb, &rth->dst);
1893 goto out;
1894 }
1895 }
1896
1897 rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1898 IN_DEV_ORCONF(in_dev, NOPOLICY),
1899 IN_DEV_ORCONF(out_dev, NOXFRM));
1900 if (!rth) {
1901 err = -ENOBUFS;
1902 goto cleanup;
1903 }
1904
1905 rth->rt_is_input = 1;
1906 RT_CACHE_STAT_INC(in_slow_tot);
1907
1908 rth->dst.input = ip_forward;
1909
1910 rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1911 do_cache);
1912 lwtunnel_set_redirect(&rth->dst);
1913 skb_dst_set(skb, &rth->dst);
1914 out:
1915 err = 0;
1916 cleanup:
1917 return err;
1918 }
1919
1920 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1921 /* To make ICMP packets follow the right flow, the multipath hash is
1922 * calculated from the inner IP addresses.
1923 */
1924 static void ip_multipath_l3_keys(const struct sk_buff *skb,
1925 struct flow_keys *hash_keys)
1926 {
1927 const struct iphdr *outer_iph = ip_hdr(skb);
1928 const struct iphdr *key_iph = outer_iph;
1929 const struct iphdr *inner_iph;
1930 const struct icmphdr *icmph;
1931 struct iphdr _inner_iph;
1932 struct icmphdr _icmph;
1933
1934 if (likely(outer_iph->protocol != IPPROTO_ICMP))
1935 goto out;
1936
1937 if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1938 goto out;
1939
1940 icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1941 &_icmph);
1942 if (!icmph)
1943 goto out;
1944
1945 if (!icmp_is_err(icmph->type))
1946 goto out;
1947
1948 inner_iph = skb_header_pointer(skb,
1949 outer_iph->ihl * 4 + sizeof(_icmph),
1950 sizeof(_inner_iph), &_inner_iph);
1951 if (!inner_iph)
1952 goto out;
1953
1954 key_iph = inner_iph;
1955 out:
1956 hash_keys->addrs.v4addrs.src = key_iph->saddr;
1957 hash_keys->addrs.v4addrs.dst = key_iph->daddr;
1958 }
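/* Illustrative example of the above (addresses are made up): an ICMP error
 * quoting an original packet 10.0.0.1 -> 10.0.0.2 carries that original
 * header as its payload, so the hash is computed over (10.0.0.1, 10.0.0.2)
 * rather than over the error packet's own addresses, and the error is
 * steered onto the same multipath nexthop as the flow it refers to.
 */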
1959
1960 /* if skb is set it will be used and fl4 can be NULL */
1961 int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
1962 const struct sk_buff *skb, struct flow_keys *flkeys)
1963 {
1964 u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0;
1965 struct flow_keys hash_keys;
1966 u32 mhash;
1967
1968 switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1969 case 0:
1970 memset(&hash_keys, 0, sizeof(hash_keys));
1971 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1972 if (skb) {
1973 ip_multipath_l3_keys(skb, &hash_keys);
1974 } else {
1975 hash_keys.addrs.v4addrs.src = fl4->saddr;
1976 hash_keys.addrs.v4addrs.dst = fl4->daddr;
1977 }
1978 break;
1979 case 1:
1980 /* skb is currently provided only when forwarding */
1981 if (skb) {
1982 unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1983 struct flow_keys keys;
1984
1985 /* short-circuit if we already have L4 hash present */
1986 if (skb->l4_hash)
1987 return skb_get_hash_raw(skb) >> 1;
1988
1989 memset(&hash_keys, 0, sizeof(hash_keys));
1990
1991 if (!flkeys) {
1992 skb_flow_dissect_flow_keys(skb, &keys, flag);
1993 flkeys = &keys;
1994 }
1995
1996 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1997 hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
1998 hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
1999 hash_keys.ports.src = flkeys->ports.src;
2000 hash_keys.ports.dst = flkeys->ports.dst;
2001 hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
2002 } else {
2003 memset(&hash_keys, 0, sizeof(hash_keys));
2004 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2005 hash_keys.addrs.v4addrs.src = fl4->saddr;
2006 hash_keys.addrs.v4addrs.dst = fl4->daddr;
2007 hash_keys.ports.src = fl4->fl4_sport;
2008 hash_keys.ports.dst = fl4->fl4_dport;
2009 hash_keys.basic.ip_proto = fl4->flowi4_proto;
2010 }
2011 break;
2012 case 2:
2013 memset(&hash_keys, 0, sizeof(hash_keys));
2014 /* skb is currently provided only when forwarding */
2015 if (skb) {
2016 struct flow_keys keys;
2017
2018 skb_flow_dissect_flow_keys(skb, &keys, 0);
2019 /* Inner can be v4 or v6 */
2020 if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
2021 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2022 hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
2023 hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
2024 } else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
2025 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2026 hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
2027 hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
2028 hash_keys.tags.flow_label = keys.tags.flow_label;
2029 hash_keys.basic.ip_proto = keys.basic.ip_proto;
2030 } else {
2031 /* Same as case 0 */
2032 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2033 ip_multipath_l3_keys(skb, &hash_keys);
2034 }
2035 } else {
2036 /* Same as case 0 */
2037 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2038 hash_keys.addrs.v4addrs.src = fl4->saddr;
2039 hash_keys.addrs.v4addrs.dst = fl4->daddr;
2040 }
2041 break;
2042 }
2043 mhash = flow_hash_from_keys(&hash_keys);
2044
2045 if (multipath_hash)
2046 mhash = jhash_2words(mhash, multipath_hash, 0);
2047
2048 return mhash >> 1;
2049 }
2050 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
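/* For reference, the cases above correspond to the values of the
 * net.ipv4.fib_multipath_hash_policy sysctl:
 *   0 - layer 3: source/destination addresses (inner header for ICMP errors)
 *   1 - layer 4: addresses, ports and protocol, from flkeys, the dissector
 *       or the flowi4
 *   2 - layer 3 taken from the innermost header of encapsulated packets,
 *       falling back to the case-0 behaviour otherwise
 * e.g. "sysctl -w net.ipv4.fib_multipath_hash_policy=1" selects L4 hashing.
 * The final right shift by one keeps the result within the 31-bit
 * per-nexthop upper bounds that fib_select_multipath() compares against.
 */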
2051
2052 static int ip_mkroute_input(struct sk_buff *skb,
2053 struct fib_result *res,
2054 struct in_device *in_dev,
2055 __be32 daddr, __be32 saddr, u32 tos,
2056 struct flow_keys *hkeys)
2057 {
2058 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2059 if (res->fi && fib_info_num_path(res->fi) > 1) {
2060 int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
2061
2062 fib_select_multipath(res, h);
2063 IPCB(skb)->flags |= IPSKB_MULTIPATH;
2064 }
2065 #endif
2066
2067 /* create a routing cache entry */
2068 return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
2069 }
2070
2071 /* Implements the same saddr-related checks as ip_route_input_slow(),
2072 * assuming daddr is valid and the destination is not a local broadcast one.
2073 * Uses the provided hint instead of performing a route lookup.
2074 */
2075 int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2076 u8 tos, struct net_device *dev,
2077 const struct sk_buff *hint)
2078 {
2079 struct in_device *in_dev = __in_dev_get_rcu(dev);
2080 struct rtable *rt = skb_rtable(hint);
2081 struct net *net = dev_net(dev);
2082 int err = -EINVAL;
2083 u32 tag = 0;
2084
2085 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2086 goto martian_source;
2087
2088 if (ipv4_is_zeronet(saddr))
2089 goto martian_source;
2090
2091 if (ipv4_is_loopback(saddr) && !IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2092 goto martian_source;
2093
2094 if (rt->rt_type != RTN_LOCAL)
2095 goto skip_validate_source;
2096
2097 tos &= IPTOS_RT_MASK;
2098 err = fib_validate_source(skb, saddr, daddr, tos, 0, dev, in_dev, &tag);
2099 if (err < 0)
2100 goto martian_source;
2101
2102 skip_validate_source:
2103 skb_dst_copy(skb, hint);
2104 return 0;
2105
2106 martian_source:
2107 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2108 return err;
2109 }
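/* ip_route_use_hint() is intended for batched receive: when a packet shares
 * its destination with an already-routed skb, the caller may pass that skb
 * as "hint" and reuse its dst after revalidating the source address, instead
 * of repeating the full lookup done by ip_route_input_slow().
 */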
2110
2111 /* get device for dst_alloc with local routes */
2112 static struct net_device *ip_rt_get_dev(struct net *net,
2113 const struct fib_result *res)
2114 {
2115 struct fib_nh_common *nhc = res->fi ? res->nhc : NULL;
2116 struct net_device *dev = NULL;
2117
2118 if (nhc)
2119 dev = l3mdev_master_dev_rcu(nhc->nhc_dev);
2120
2121 return dev ? : net->loopback_dev;
2122 }
2123
2124 /*
2125 * NOTE. We drop all packets that have a local source
2126 * address, because every properly looped-back packet
2127 * must already have the correct destination attached by the output routine.
2128 * Changes in the enforced policies must also be applied to
2129 * ip_route_use_hint().
2130 *
2131 * This approach solves two big problems:
2132 * 1. Non-simplex devices are handled properly.
2133 * 2. IP spoofing attempts are filtered with a 100% guarantee.
2134 * called with rcu_read_lock()
2135 */
2136
2137 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2138 u8 tos, struct net_device *dev,
2139 struct fib_result *res)
2140 {
2141 struct in_device *in_dev = __in_dev_get_rcu(dev);
2142 struct flow_keys *flkeys = NULL, _flkeys;
2143 struct net *net = dev_net(dev);
2144 struct ip_tunnel_info *tun_info;
2145 int err = -EINVAL;
2146 unsigned int flags = 0;
2147 u32 itag = 0;
2148 struct rtable *rth;
2149 struct flowi4 fl4;
2150 bool do_cache = true;
2151
2152 /* IP on this device is disabled. */
2153
2154 if (!in_dev)
2155 goto out;
2156
2157 	/* Check for the weirdest martians, which cannot be detected
2158 by fib_lookup.
2159 */
2160
2161 tun_info = skb_tunnel_info(skb);
2162 if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2163 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
2164 else
2165 fl4.flowi4_tun_key.tun_id = 0;
2166 skb_dst_drop(skb);
2167
2168 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2169 goto martian_source;
2170
2171 res->fi = NULL;
2172 res->table = NULL;
2173 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2174 goto brd_input;
2175
2176 /* Accept zero addresses only to limited broadcast;
2177 	 * I do not even know whether to fix it or not. Waiting for complaints :-)
2178 */
2179 if (ipv4_is_zeronet(saddr))
2180 goto martian_source;
2181
2182 if (ipv4_is_zeronet(daddr))
2183 goto martian_destination;
2184
2185 	/* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
2186 	 * calling it at most once when daddr and/or saddr is a loopback address
2187 */
2188 if (ipv4_is_loopback(daddr)) {
2189 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2190 goto martian_destination;
2191 } else if (ipv4_is_loopback(saddr)) {
2192 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2193 goto martian_source;
2194 }
2195
2196 /*
2197 * Now we are ready to route packet.
2198 */
2199 fl4.flowi4_oif = 0;
2200 fl4.flowi4_iif = dev->ifindex;
2201 fl4.flowi4_mark = skb->mark;
2202 fl4.flowi4_tos = tos;
2203 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2204 fl4.flowi4_flags = 0;
2205 fl4.daddr = daddr;
2206 fl4.saddr = saddr;
2207 fl4.flowi4_uid = sock_net_uid(net, NULL);
2208 fl4.flowi4_multipath_hash = 0;
2209
2210 if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
2211 flkeys = &_flkeys;
2212 } else {
2213 fl4.flowi4_proto = 0;
2214 fl4.fl4_sport = 0;
2215 fl4.fl4_dport = 0;
2216 }
2217
2218 err = fib_lookup(net, &fl4, res, 0);
2219 if (err != 0) {
2220 if (!IN_DEV_FORWARD(in_dev))
2221 err = -EHOSTUNREACH;
2222 goto no_route;
2223 }
2224
2225 if (res->type == RTN_BROADCAST) {
2226 if (IN_DEV_BFORWARD(in_dev))
2227 goto make_route;
2228 		/* do not cache if bc_forwarding is enabled */
2229 if (IPV4_DEVCONF_ALL(net, BC_FORWARDING))
2230 do_cache = false;
2231 goto brd_input;
2232 }
2233
2234 if (res->type == RTN_LOCAL) {
2235 err = fib_validate_source(skb, saddr, daddr, tos,
2236 0, dev, in_dev, &itag);
2237 if (err < 0)
2238 goto martian_source;
2239 goto local_input;
2240 }
2241
2242 if (!IN_DEV_FORWARD(in_dev)) {
2243 err = -EHOSTUNREACH;
2244 goto no_route;
2245 }
2246 if (res->type != RTN_UNICAST)
2247 goto martian_destination;
2248
2249 make_route:
2250 err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
2251 out: return err;
2252
2253 brd_input:
2254 if (skb->protocol != htons(ETH_P_IP))
2255 goto e_inval;
2256
2257 if (!ipv4_is_zeronet(saddr)) {
2258 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2259 in_dev, &itag);
2260 if (err < 0)
2261 goto martian_source;
2262 }
2263 flags |= RTCF_BROADCAST;
2264 res->type = RTN_BROADCAST;
2265 RT_CACHE_STAT_INC(in_brd);
2266
2267 local_input:
2268 do_cache &= res->fi && !itag;
2269 if (do_cache) {
2270 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2271
2272 rth = rcu_dereference(nhc->nhc_rth_input);
2273 if (rt_cache_valid(rth)) {
2274 skb_dst_set_noref(skb, &rth->dst);
2275 err = 0;
2276 goto out;
2277 }
2278 }
2279
2280 rth = rt_dst_alloc(ip_rt_get_dev(net, res),
2281 flags | RTCF_LOCAL, res->type,
2282 IN_DEV_ORCONF(in_dev, NOPOLICY), false);
2283 if (!rth)
2284 goto e_nobufs;
2285
2286 	rth->dst.output = ip_rt_bug;
2287 #ifdef CONFIG_IP_ROUTE_CLASSID
2288 rth->dst.tclassid = itag;
2289 #endif
2290 rth->rt_is_input = 1;
2291
2292 RT_CACHE_STAT_INC(in_slow_tot);
2293 if (res->type == RTN_UNREACHABLE) {
2294 		rth->dst.input = ip_error;
2295 		rth->dst.error = -err;
2296 rth->rt_flags &= ~RTCF_LOCAL;
2297 }
2298
2299 if (do_cache) {
2300 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2301
2302 rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
2303 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2304 WARN_ON(rth->dst.input == lwtunnel_input);
2305 rth->dst.lwtstate->orig_input = rth->dst.input;
2306 rth->dst.input = lwtunnel_input;
2307 }
2308
2309 if (unlikely(!rt_cache_route(nhc, rth)))
2310 rt_add_uncached_list(rth);
2311 }
2312 skb_dst_set(skb, &rth->dst);
2313 err = 0;
2314 goto out;
2315
2316 no_route:
2317 RT_CACHE_STAT_INC(in_no_route);
2318 res->type = RTN_UNREACHABLE;
2319 res->fi = NULL;
2320 res->table = NULL;
2321 goto local_input;
2322
2323 /*
2324 * Do not cache martian addresses: they should be logged (RFC1812)
2325 */
2326 martian_destination:
2327 RT_CACHE_STAT_INC(in_martian_dst);
2328 #ifdef CONFIG_IP_ROUTE_VERBOSE
2329 if (IN_DEV_LOG_MARTIANS(in_dev))
2330 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2331 &daddr, &saddr, dev->name);
2332 #endif
2333
2334 e_inval:
2335 err = -EINVAL;
2336 goto out;
2337
2338 e_nobufs:
2339 err = -ENOBUFS;
2340 goto out;
2341
2342 martian_source:
2343 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2344 goto out;
2345 }
2346
2347 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2348 u8 tos, struct net_device *dev)
2349 {
2350 struct fib_result res;
2351 int err;
2352
2353 tos &= IPTOS_RT_MASK;
2354 rcu_read_lock();
2355 err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2356 rcu_read_unlock();
2357
2358 return err;
2359 }
2360 EXPORT_SYMBOL(ip_route_input_noref);
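/* A minimal sketch of the usual receive-path call (fields taken from the IP
 * header of the skb being routed):
 *
 *	err = ip_route_input_noref(skb, ip_hdr(skb)->daddr,
 *				   ip_hdr(skb)->saddr, ip_hdr(skb)->tos,
 *				   skb->dev);
 *	if (err)
 *		goto drop;
 *
 * The "_noref" suffix reflects that a cached dst may be attached to the skb
 * without taking a reference (see skb_dst_set_noref() above), so the dst must
 * not be assumed to persist once the packet leaves the current receive
 * context.
 */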
2361
2362 /* called with rcu_read_lock held */
2363 int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2364 u8 tos, struct net_device *dev, struct fib_result *res)
2365 {
2366 	/* Multicast recognition logic was moved from the route cache to here.
2367 	   The problem was that too many Ethernet cards have broken/missing
2368 	   hardware multicast filters :-( As a result, a host on a multicast
2369 	   network acquires a lot of useless route cache entries, e.g. for
2370 	   SDR messages from all over the world. Now we try to get rid of them.
2371 	   Really, provided the software IP multicast filter is organized
2372 	   reasonably (at least, hashed), it does not result in a slowdown
2373 	   compared with route cache reject entries.
2374 	   Note that multicast routers are not affected, because a
2375 	   route cache entry is created eventually.
2376 */
2377 if (ipv4_is_multicast(daddr)) {
2378 struct in_device *in_dev = __in_dev_get_rcu(dev);
2379 int our = 0;
2380 int err = -EINVAL;
2381
2382 if (!in_dev)
2383 return err;
2384 our = ip_check_mc_rcu(in_dev, daddr, saddr,
2385 ip_hdr(skb)->protocol);
2386
2387 /* check l3 master if no match yet */
2388 if (!our && netif_is_l3_slave(dev)) {
2389 struct in_device *l3_in_dev;
2390
2391 l3_in_dev = __in_dev_get_rcu(skb->dev);
2392 if (l3_in_dev)
2393 our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2394 ip_hdr(skb)->protocol);
2395 }
2396
2397 if (our
2398 #ifdef CONFIG_IP_MROUTE
2399 ||
2400 (!ipv4_is_local_multicast(daddr) &&
2401 IN_DEV_MFORWARD(in_dev))
2402 #endif
2403 ) {
2404 err = ip_route_input_mc(skb, daddr, saddr,
2405 tos, dev, our);
2406 }
2407 return err;
2408 }
2409
2410 return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2411 }
2412
2413 /* called with rcu_read_lock() */
2414 static struct rtable *__mkroute_output(const struct fib_result *res,
2415 const struct flowi4 *fl4, int orig_oif,
2416 struct net_device *dev_out,
2417 unsigned int flags)
2418 {
2419 struct fib_info *fi = res->fi;
2420 struct fib_nh_exception *fnhe;
2421 struct in_device *in_dev;
2422 u16 type = res->type;
2423 struct rtable *rth;
2424 bool do_cache;
2425
2426 in_dev = __in_dev_get_rcu(dev_out);
2427 if (!in_dev)
2428 return ERR_PTR(-EINVAL);
2429
2430 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2431 if (ipv4_is_loopback(fl4->saddr) &&
2432 !(dev_out->flags & IFF_LOOPBACK) &&
2433 !netif_is_l3_master(dev_out))
2434 return ERR_PTR(-EINVAL);
2435
2436 if (ipv4_is_lbcast(fl4->daddr))
2437 type = RTN_BROADCAST;
2438 else if (ipv4_is_multicast(fl4->daddr))
2439 type = RTN_MULTICAST;
2440 else if (ipv4_is_zeronet(fl4->daddr))
2441 return ERR_PTR(-EINVAL);
2442
2443 if (dev_out->flags & IFF_LOOPBACK)
2444 flags |= RTCF_LOCAL;
2445
2446 do_cache = true;
2447 if (type == RTN_BROADCAST) {
2448 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2449 fi = NULL;
2450 } else if (type == RTN_MULTICAST) {
2451 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2452 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2453 fl4->flowi4_proto))
2454 flags &= ~RTCF_LOCAL;
2455 else
2456 do_cache = false;
2457 		/* If a multicast route does not exist, use the
2458 		 * default one, but do not use a gateway in this case.
2459 		 * Yes, it is a hack.
2460 */
2461 if (fi && res->prefixlen < 4)
2462 fi = NULL;
2463 } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2464 (orig_oif != dev_out->ifindex)) {
2465 /* For local routes that require a particular output interface
2466 * we do not want to cache the result. Caching the result
2467 * causes incorrect behaviour when there are multiple source
2468 * addresses on the interface, the end result being that if the
2469 * intended recipient is waiting on that interface for the
2470 * packet he won't receive it because it will be delivered on
2471 * the loopback interface and the IP_PKTINFO ipi_ifindex will
2472 * be set to the loopback interface as well.
2473 */
2474 do_cache = false;
2475 }
2476
2477 fnhe = NULL;
2478 do_cache &= fi != NULL;
2479 if (fi) {
2480 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2481 struct rtable __rcu **prth;
2482
2483 fnhe = find_exception(nhc, fl4->daddr);
2484 if (!do_cache)
2485 goto add;
2486 if (fnhe) {
2487 prth = &fnhe->fnhe_rth_output;
2488 } else {
2489 if (unlikely(fl4->flowi4_flags &
2490 FLOWI_FLAG_KNOWN_NH &&
2491 !(nhc->nhc_gw_family &&
2492 nhc->nhc_scope == RT_SCOPE_LINK))) {
2493 do_cache = false;
2494 goto add;
2495 }
2496 prth = raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
2497 }
2498 rth = rcu_dereference(*prth);
2499 if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2500 return rth;
2501 }
2502
2503 add:
2504 rth = rt_dst_alloc(dev_out, flags, type,
2505 IN_DEV_ORCONF(in_dev, NOPOLICY),
2506 IN_DEV_ORCONF(in_dev, NOXFRM));
2507 if (!rth)
2508 return ERR_PTR(-ENOBUFS);
2509
2510 rth->rt_iif = orig_oif;
2511
2512 RT_CACHE_STAT_INC(out_slow_tot);
2513
2514 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2515 if (flags & RTCF_LOCAL &&
2516 !(dev_out->flags & IFF_LOOPBACK)) {
2517 rth->dst.output = ip_mc_output;
2518 RT_CACHE_STAT_INC(out_slow_mc);
2519 }
2520 #ifdef CONFIG_IP_MROUTE
2521 if (type == RTN_MULTICAST) {
2522 if (IN_DEV_MFORWARD(in_dev) &&
2523 !ipv4_is_local_multicast(fl4->daddr)) {
2524 rth->dst.input = ip_mr_input;
2525 rth->dst.output = ip_mc_output;
2526 }
2527 }
2528 #endif
2529 }
2530
2531 rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2532 lwtunnel_set_redirect(&rth->dst);
2533
2534 return rth;
2535 }
2536
2537 /*
2538 * Major route resolver routine.
2539 */
2540
2541 struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2542 const struct sk_buff *skb)
2543 {
2544 struct fib_result res = {
2545 .type = RTN_UNSPEC,
2546 .fi = NULL,
2547 .table = NULL,
2548 .tclassid = 0,
2549 };
2550 struct rtable *rth;
2551
2552 fl4->flowi4_iif = LOOPBACK_IFINDEX;
2553 ip_rt_fix_tos(fl4);
2554
2555 rcu_read_lock();
2556 rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2557 rcu_read_unlock();
2558
2559 return rth;
2560 }
2561 EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
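/* Callers normally reach this through the ip_route_output_key() /
 * ip_route_output_flow() wrappers rather than directly; the usual pattern,
 * as in ip_route_output_tunnel() below, is to fill a flowi4 (daddr, saddr,
 * tos, protocol, mark, ...) and test the returned pointer with IS_ERR().
 */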
2562
2563 struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2564 struct fib_result *res,
2565 const struct sk_buff *skb)
2566 {
2567 struct net_device *dev_out = NULL;
2568 int orig_oif = fl4->flowi4_oif;
2569 unsigned int flags = 0;
2570 struct rtable *rth;
2571 int err;
2572
2573 if (fl4->saddr) {
2574 if (ipv4_is_multicast(fl4->saddr) ||
2575 ipv4_is_lbcast(fl4->saddr) ||
2576 ipv4_is_zeronet(fl4->saddr)) {
2577 rth = ERR_PTR(-EINVAL);
2578 goto out;
2579 }
2580
2581 rth = ERR_PTR(-ENETUNREACH);
2582
2583 /* I removed check for oif == dev_out->oif here.
2584 It was wrong for two reasons:
2585 		   1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2586 		      is assigned to multiple interfaces.
2587 		   2. Moreover, we are allowed to send packets with the saddr
2588 		      of another iface. --ANK
2589 */
2590
2591 if (fl4->flowi4_oif == 0 &&
2592 (ipv4_is_multicast(fl4->daddr) ||
2593 ipv4_is_lbcast(fl4->daddr))) {
2594 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2595 dev_out = __ip_dev_find(net, fl4->saddr, false);
2596 if (!dev_out)
2597 goto out;
2598
2599 			/* Special hack: the user can direct multicasts
2600 			   and limited broadcast via the necessary interface
2601 			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2602 			   This hack is not just for fun, it allows
2603 			   vic, vat and friends to work.
2604 			   They bind the socket to loopback, set ttl to zero
2605 			   and expect that it will work.
2606 			   From the viewpoint of the routing cache they are broken,
2607 			   because we are not allowed to build a multicast path
2608 			   with a loopback source addr (look, the routing cache
2609 			   cannot know that ttl is zero, so the packet
2610 			   will not leave this host and the route is valid).
2611 			   Luckily, this hack is a good workaround.
2612 */
2613
2614 fl4->flowi4_oif = dev_out->ifindex;
2615 goto make_route;
2616 }
2617
2618 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2619 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2620 if (!__ip_dev_find(net, fl4->saddr, false))
2621 goto out;
2622 }
2623 }
2624
2625
2626 if (fl4->flowi4_oif) {
2627 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2628 rth = ERR_PTR(-ENODEV);
2629 if (!dev_out)
2630 goto out;
2631
2632 /* RACE: Check return value of inet_select_addr instead. */
2633 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2634 rth = ERR_PTR(-ENETUNREACH);
2635 goto out;
2636 }
2637 if (ipv4_is_local_multicast(fl4->daddr) ||
2638 ipv4_is_lbcast(fl4->daddr) ||
2639 fl4->flowi4_proto == IPPROTO_IGMP) {
2640 if (!fl4->saddr)
2641 fl4->saddr = inet_select_addr(dev_out, 0,
2642 RT_SCOPE_LINK);
2643 goto make_route;
2644 }
2645 if (!fl4->saddr) {
2646 if (ipv4_is_multicast(fl4->daddr))
2647 fl4->saddr = inet_select_addr(dev_out, 0,
2648 fl4->flowi4_scope);
2649 else if (!fl4->daddr)
2650 fl4->saddr = inet_select_addr(dev_out, 0,
2651 RT_SCOPE_HOST);
2652 }
2653 }
2654
2655 if (!fl4->daddr) {
2656 fl4->daddr = fl4->saddr;
2657 if (!fl4->daddr)
2658 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2659 dev_out = net->loopback_dev;
2660 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2661 res->type = RTN_LOCAL;
2662 flags |= RTCF_LOCAL;
2663 goto make_route;
2664 }
2665
2666 err = fib_lookup(net, fl4, res, 0);
2667 if (err) {
2668 res->fi = NULL;
2669 res->table = NULL;
2670 if (fl4->flowi4_oif &&
2671 (ipv4_is_multicast(fl4->daddr) ||
2672 !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2673 			/* Apparently, the routing tables are wrong. Assume
2674 			   that the destination is on-link.
2675
2676 WHY? DW.
2677 Because we are allowed to send to iface
2678 even if it has NO routes and NO assigned
2679 addresses. When oif is specified, routing
2680 tables are looked up with only one purpose:
2681 to catch if destination is gatewayed, rather than
2682 direct. Moreover, if MSG_DONTROUTE is set,
2683 we send packet, ignoring both routing tables
2684 and ifaddr state. --ANK
2685
2686
2687 We could make it even if oif is unknown,
2688 likely IPv6, but we do not.
2689 */
2690
2691 if (fl4->saddr == 0)
2692 fl4->saddr = inet_select_addr(dev_out, 0,
2693 RT_SCOPE_LINK);
2694 res->type = RTN_UNICAST;
2695 goto make_route;
2696 }
2697 rth = ERR_PTR(err);
2698 goto out;
2699 }
2700
2701 if (res->type == RTN_LOCAL) {
2702 if (!fl4->saddr) {
2703 if (res->fi->fib_prefsrc)
2704 fl4->saddr = res->fi->fib_prefsrc;
2705 else
2706 fl4->saddr = fl4->daddr;
2707 }
2708
2709 /* L3 master device is the loopback for that domain */
2710 dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2711 net->loopback_dev;
2712
2713 /* make sure orig_oif points to fib result device even
2714 * though packet rx/tx happens over loopback or l3mdev
2715 */
2716 orig_oif = FIB_RES_OIF(*res);
2717
2718 fl4->flowi4_oif = dev_out->ifindex;
2719 flags |= RTCF_LOCAL;
2720 goto make_route;
2721 }
2722
2723 fib_select_path(net, res, fl4, skb);
2724
2725 dev_out = FIB_RES_DEV(*res);
2726
2727 make_route:
2728 rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2729
2730 out:
2731 return rth;
2732 }
2733
2734 static struct dst_ops ipv4_dst_blackhole_ops = {
2735 .family = AF_INET,
2736 .default_advmss = ipv4_default_advmss,
2737 .neigh_lookup = ipv4_neigh_lookup,
2738 .check = dst_blackhole_check,
2739 .cow_metrics = dst_blackhole_cow_metrics,
2740 .update_pmtu = dst_blackhole_update_pmtu,
2741 .redirect = dst_blackhole_redirect,
2742 .mtu = dst_blackhole_mtu,
2743 };
2744
2745 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2746 {
2747 struct rtable *ort = (struct rtable *) dst_orig;
2748 struct rtable *rt;
2749
2750 rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2751 if (rt) {
2752 struct dst_entry *new = &rt->dst;
2753
2754 new->__use = 1;
2755 new->input = dst_discard;
2756 new->output = dst_discard_out;
2757
2758 new->dev = net->loopback_dev;
2759 if (new->dev)
2760 dev_hold(new->dev);
2761
2762 rt->rt_is_input = ort->rt_is_input;
2763 rt->rt_iif = ort->rt_iif;
2764 rt->rt_pmtu = ort->rt_pmtu;
2765 rt->rt_mtu_locked = ort->rt_mtu_locked;
2766
2767 rt->rt_genid = rt_genid_ipv4(net);
2768 rt->rt_flags = ort->rt_flags;
2769 rt->rt_type = ort->rt_type;
2770 rt->rt_uses_gateway = ort->rt_uses_gateway;
2771 rt->rt_gw_family = ort->rt_gw_family;
2772 if (rt->rt_gw_family == AF_INET)
2773 rt->rt_gw4 = ort->rt_gw4;
2774 else if (rt->rt_gw_family == AF_INET6)
2775 rt->rt_gw6 = ort->rt_gw6;
2776
2777 INIT_LIST_HEAD(&rt->rt_uncached);
2778 }
2779
2780 dst_release(dst_orig);
2781
2782 return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2783 }
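/* The blackhole dst built above keeps dst-based bookkeeping working while
 * silently discarding traffic: input and output are wired to dst_discard and
 * dst_discard_out, while the routing attributes (flags, type, gateway, PMTU)
 * are copied from the original dst so later inspection still sees sensible
 * values.
 */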
2784
2785 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2786 const struct sock *sk)
2787 {
2788 struct rtable *rt = __ip_route_output_key(net, flp4);
2789
2790 if (IS_ERR(rt))
2791 return rt;
2792
2793 if (flp4->flowi4_proto) {
2794 flp4->flowi4_oif = rt->dst.dev->ifindex;
2795 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2796 flowi4_to_flowi(flp4),
2797 sk, 0);
2798 }
2799
2800 return rt;
2801 }
2802 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2803
2804 struct rtable *ip_route_output_tunnel(struct sk_buff *skb,
2805 struct net_device *dev,
2806 struct net *net, __be32 *saddr,
2807 const struct ip_tunnel_info *info,
2808 u8 protocol, bool use_cache)
2809 {
2810 #ifdef CONFIG_DST_CACHE
2811 struct dst_cache *dst_cache;
2812 #endif
2813 struct rtable *rt = NULL;
2814 struct flowi4 fl4;
2815 __u8 tos;
2816
2817 #ifdef CONFIG_DST_CACHE
2818 dst_cache = (struct dst_cache *)&info->dst_cache;
2819 if (use_cache) {
2820 rt = dst_cache_get_ip4(dst_cache, saddr);
2821 if (rt)
2822 return rt;
2823 }
2824 #endif
2825 memset(&fl4, 0, sizeof(fl4));
2826 fl4.flowi4_mark = skb->mark;
2827 fl4.flowi4_proto = protocol;
2828 fl4.daddr = info->key.u.ipv4.dst;
2829 fl4.saddr = info->key.u.ipv4.src;
2830 tos = info->key.tos;
2831 fl4.flowi4_tos = RT_TOS(tos);
2832
2833 rt = ip_route_output_key(net, &fl4);
2834 if (IS_ERR(rt)) {
2835 netdev_dbg(dev, "no route to %pI4\n", &fl4.daddr);
2836 return ERR_PTR(-ENETUNREACH);
2837 }
2838 if (rt->dst.dev == dev) { /* is this necessary? */
2839 netdev_dbg(dev, "circular route to %pI4\n", &fl4.daddr);
2840 ip_rt_put(rt);
2841 return ERR_PTR(-ELOOP);
2842 }
2843 #ifdef CONFIG_DST_CACHE
2844 if (use_cache)
2845 dst_cache_set_ip4(dst_cache, &rt->dst, fl4.saddr);
2846 #endif
2847 *saddr = fl4.saddr;
2848 return rt;
2849 }
2850 EXPORT_SYMBOL_GPL(ip_route_output_tunnel);
2851
2852 /* called with rcu_read_lock held */
2853 static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2854 struct rtable *rt, u32 table_id, struct flowi4 *fl4,
2855 struct sk_buff *skb, u32 portid, u32 seq,
2856 unsigned int flags)
2857 {
2858 struct rtmsg *r;
2859 struct nlmsghdr *nlh;
2860 unsigned long expires = 0;
2861 u32 error;
2862 u32 metrics[RTAX_MAX];
2863
2864 nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), flags);
2865 if (!nlh)
2866 return -EMSGSIZE;
2867
2868 r = nlmsg_data(nlh);
2869 r->rtm_family = AF_INET;
2870 r->rtm_dst_len = 32;
2871 r->rtm_src_len = 0;
2872 r->rtm_tos = fl4 ? fl4->flowi4_tos : 0;
2873 r->rtm_table = table_id < 256 ? table_id : RT_TABLE_COMPAT;
2874 if (nla_put_u32(skb, RTA_TABLE, table_id))
2875 goto nla_put_failure;
2876 r->rtm_type = rt->rt_type;
2877 r->rtm_scope = RT_SCOPE_UNIVERSE;
2878 r->rtm_protocol = RTPROT_UNSPEC;
2879 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2880 if (rt->rt_flags & RTCF_NOTIFY)
2881 r->rtm_flags |= RTM_F_NOTIFY;
2882 if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2883 r->rtm_flags |= RTCF_DOREDIRECT;
2884
2885 if (nla_put_in_addr(skb, RTA_DST, dst))
2886 goto nla_put_failure;
2887 if (src) {
2888 r->rtm_src_len = 32;
2889 if (nla_put_in_addr(skb, RTA_SRC, src))
2890 goto nla_put_failure;
2891 }
2892 if (rt->dst.dev &&
2893 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2894 goto nla_put_failure;
2895 #ifdef CONFIG_IP_ROUTE_CLASSID
2896 if (rt->dst.tclassid &&
2897 nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2898 goto nla_put_failure;
2899 #endif
2900 if (fl4 && !rt_is_input_route(rt) &&
2901 fl4->saddr != src) {
2902 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2903 goto nla_put_failure;
2904 }
2905 if (rt->rt_uses_gateway) {
2906 if (rt->rt_gw_family == AF_INET &&
2907 nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) {
2908 goto nla_put_failure;
2909 } else if (rt->rt_gw_family == AF_INET6) {
2910 int alen = sizeof(struct in6_addr);
2911 struct nlattr *nla;
2912 struct rtvia *via;
2913
2914 nla = nla_reserve(skb, RTA_VIA, alen + 2);
2915 if (!nla)
2916 goto nla_put_failure;
2917
2918 via = nla_data(nla);
2919 via->rtvia_family = AF_INET6;
2920 memcpy(via->rtvia_addr, &rt->rt_gw6, alen);
2921 }
2922 }
2923
2924 expires = rt->dst.expires;
2925 if (expires) {
2926 unsigned long now = jiffies;
2927
2928 if (time_before(now, expires))
2929 expires -= now;
2930 else
2931 expires = 0;
2932 }
2933
2934 memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2935 if (rt->rt_pmtu && expires)
2936 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2937 if (rt->rt_mtu_locked && expires)
2938 metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
2939 if (rtnetlink_put_metrics(skb, metrics) < 0)
2940 goto nla_put_failure;
2941
2942 if (fl4) {
2943 if (fl4->flowi4_mark &&
2944 nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2945 goto nla_put_failure;
2946
2947 if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2948 nla_put_u32(skb, RTA_UID,
2949 from_kuid_munged(current_user_ns(),
2950 fl4->flowi4_uid)))
2951 goto nla_put_failure;
2952
2953 if (rt_is_input_route(rt)) {
2954 #ifdef CONFIG_IP_MROUTE
2955 if (ipv4_is_multicast(dst) &&
2956 !ipv4_is_local_multicast(dst) &&
2957 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2958 int err = ipmr_get_route(net, skb,
2959 fl4->saddr, fl4->daddr,
2960 r, portid);
2961
2962 if (err <= 0) {
2963 if (err == 0)
2964 return 0;
2965 goto nla_put_failure;
2966 }
2967 } else
2968 #endif
2969 if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
2970 goto nla_put_failure;
2971 }
2972 }
2973
2974 error = rt->dst.error;
2975
2976 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2977 goto nla_put_failure;
2978
2979 nlmsg_end(skb, nlh);
2980 return 0;
2981
2982 nla_put_failure:
2983 nlmsg_cancel(skb, nlh);
2984 return -EMSGSIZE;
2985 }
2986
2987 static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb,
2988 struct netlink_callback *cb, u32 table_id,
2989 struct fnhe_hash_bucket *bucket, int genid,
2990 int *fa_index, int fa_start, unsigned int flags)
2991 {
2992 int i;
2993
2994 for (i = 0; i < FNHE_HASH_SIZE; i++) {
2995 struct fib_nh_exception *fnhe;
2996
2997 for (fnhe = rcu_dereference(bucket[i].chain); fnhe;
2998 fnhe = rcu_dereference(fnhe->fnhe_next)) {
2999 struct rtable *rt;
3000 int err;
3001
3002 if (*fa_index < fa_start)
3003 goto next;
3004
3005 if (fnhe->fnhe_genid != genid)
3006 goto next;
3007
3008 if (fnhe->fnhe_expires &&
3009 time_after(jiffies, fnhe->fnhe_expires))
3010 goto next;
3011
3012 rt = rcu_dereference(fnhe->fnhe_rth_input);
3013 if (!rt)
3014 rt = rcu_dereference(fnhe->fnhe_rth_output);
3015 if (!rt)
3016 goto next;
3017
3018 err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt,
3019 table_id, NULL, skb,
3020 NETLINK_CB(cb->skb).portid,
3021 cb->nlh->nlmsg_seq, flags);
3022 if (err)
3023 return err;
3024 next:
3025 (*fa_index)++;
3026 }
3027 }
3028
3029 return 0;
3030 }
3031
3032 int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
3033 u32 table_id, struct fib_info *fi,
3034 int *fa_index, int fa_start, unsigned int flags)
3035 {
3036 struct net *net = sock_net(cb->skb->sk);
3037 int nhsel, genid = fnhe_genid(net);
3038
3039 for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) {
3040 struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel);
3041 struct fnhe_hash_bucket *bucket;
3042 int err;
3043
3044 if (nhc->nhc_flags & RTNH_F_DEAD)
3045 continue;
3046
3047 rcu_read_lock();
3048 bucket = rcu_dereference(nhc->nhc_exceptions);
3049 err = 0;
3050 if (bucket)
3051 err = fnhe_dump_bucket(net, skb, cb, table_id, bucket,
3052 genid, fa_index, fa_start,
3053 flags);
3054 rcu_read_unlock();
3055 if (err)
3056 return err;
3057 }
3058
3059 return 0;
3060 }
3061
3062 static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
3063 u8 ip_proto, __be16 sport,
3064 __be16 dport)
3065 {
3066 struct sk_buff *skb;
3067 struct iphdr *iph;
3068
3069 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3070 if (!skb)
3071 return NULL;
3072
3073 	/* Reserve room for dummy headers; this skb can pass
3074 	 * through a good chunk of the routing engine.
3075 */
3076 skb_reset_mac_header(skb);
3077 skb_reset_network_header(skb);
3078 skb->protocol = htons(ETH_P_IP);
3079 iph = skb_put(skb, sizeof(struct iphdr));
3080 iph->protocol = ip_proto;
3081 iph->saddr = src;
3082 iph->daddr = dst;
3083 iph->version = 0x4;
3084 iph->frag_off = 0;
3085 iph->ihl = 0x5;
3086 skb_set_transport_header(skb, skb->len);
3087
3088 switch (iph->protocol) {
3089 case IPPROTO_UDP: {
3090 struct udphdr *udph;
3091
3092 udph = skb_put_zero(skb, sizeof(struct udphdr));
3093 udph->source = sport;
3094 udph->dest = dport;
3095 udph->len = htons(sizeof(struct udphdr));
3096 udph->check = 0;
3097 break;
3098 }
3099 case IPPROTO_TCP: {
3100 struct tcphdr *tcph;
3101
3102 tcph = skb_put_zero(skb, sizeof(struct tcphdr));
3103 tcph->source = sport;
3104 tcph->dest = dport;
3105 tcph->doff = sizeof(struct tcphdr) / 4;
3106 tcph->rst = 1;
3107 tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
3108 src, dst, 0);
3109 break;
3110 }
3111 case IPPROTO_ICMP: {
3112 struct icmphdr *icmph;
3113
3114 icmph = skb_put_zero(skb, sizeof(struct icmphdr));
3115 icmph->type = ICMP_ECHO;
3116 icmph->code = 0;
3117 }
3118 }
3119
3120 return skb;
3121 }
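/* The dummy skb built here lets an RTM_GETROUTE request exercise the real
 * input path in ip_route_input_rcu(), including rules that match on protocol
 * and ports.  For example (addresses illustrative), a request such as
 * "ip route get 198.51.100.1 from 192.0.2.1 iif eth0" ends up in
 * inet_rtm_getroute() below with this skb standing in for a received packet.
 */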
3122
3123 static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
3124 const struct nlmsghdr *nlh,
3125 struct nlattr **tb,
3126 struct netlink_ext_ack *extack)
3127 {
3128 struct rtmsg *rtm;
3129 int i, err;
3130
3131 if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
3132 NL_SET_ERR_MSG(extack,
3133 "ipv4: Invalid header for route get request");
3134 return -EINVAL;
3135 }
3136
3137 if (!netlink_strict_get_check(skb))
3138 return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
3139 rtm_ipv4_policy, extack);
3140
3141 rtm = nlmsg_data(nlh);
3142 if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
3143 (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
3144 rtm->rtm_table || rtm->rtm_protocol ||
3145 rtm->rtm_scope || rtm->rtm_type) {
3146 NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request");
3147 return -EINVAL;
3148 }
3149
3150 if (rtm->rtm_flags & ~(RTM_F_NOTIFY |
3151 RTM_F_LOOKUP_TABLE |
3152 RTM_F_FIB_MATCH)) {
3153 NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request");
3154 return -EINVAL;
3155 }
3156
3157 err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
3158 rtm_ipv4_policy, extack);
3159 if (err)
3160 return err;
3161
3162 if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
3163 (tb[RTA_DST] && !rtm->rtm_dst_len)) {
3164 NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4");
3165 return -EINVAL;
3166 }
3167
3168 for (i = 0; i <= RTA_MAX; i++) {
3169 if (!tb[i])
3170 continue;
3171
3172 switch (i) {
3173 case RTA_IIF:
3174 case RTA_OIF:
3175 case RTA_SRC:
3176 case RTA_DST:
3177 case RTA_IP_PROTO:
3178 case RTA_SPORT:
3179 case RTA_DPORT:
3180 case RTA_MARK:
3181 case RTA_UID:
3182 break;
3183 default:
3184 NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request");
3185 return -EINVAL;
3186 }
3187 }
3188
3189 return 0;
3190 }
3191
3192 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
3193 struct netlink_ext_ack *extack)
3194 {
3195 struct net *net = sock_net(in_skb->sk);
3196 struct nlattr *tb[RTA_MAX+1];
3197 u32 table_id = RT_TABLE_MAIN;
3198 __be16 sport = 0, dport = 0;
3199 struct fib_result res = {};
3200 u8 ip_proto = IPPROTO_UDP;
3201 struct rtable *rt = NULL;
3202 struct sk_buff *skb;
3203 struct rtmsg *rtm;
3204 struct flowi4 fl4 = {};
3205 __be32 dst = 0;
3206 __be32 src = 0;
3207 kuid_t uid;
3208 u32 iif;
3209 int err;
3210 int mark;
3211
3212 err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
3213 if (err < 0)
3214 return err;
3215
3216 rtm = nlmsg_data(nlh);
3217 src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
3218 dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
3219 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3220 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3221 if (tb[RTA_UID])
3222 uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
3223 else
3224 uid = (iif ? INVALID_UID : current_uid());
3225
3226 if (tb[RTA_IP_PROTO]) {
3227 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
3228 &ip_proto, AF_INET, extack);
3229 if (err)
3230 return err;
3231 }
3232
3233 if (tb[RTA_SPORT])
3234 sport = nla_get_be16(tb[RTA_SPORT]);
3235
3236 if (tb[RTA_DPORT])
3237 dport = nla_get_be16(tb[RTA_DPORT]);
3238
3239 skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
3240 if (!skb)
3241 return -ENOBUFS;
3242
3243 fl4.daddr = dst;
3244 fl4.saddr = src;
3245 fl4.flowi4_tos = rtm->rtm_tos & IPTOS_RT_MASK;
3246 fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
3247 fl4.flowi4_mark = mark;
3248 fl4.flowi4_uid = uid;
3249 if (sport)
3250 fl4.fl4_sport = sport;
3251 if (dport)
3252 fl4.fl4_dport = dport;
3253 fl4.flowi4_proto = ip_proto;
3254
3255 rcu_read_lock();
3256
3257 if (iif) {
3258 struct net_device *dev;
3259
3260 dev = dev_get_by_index_rcu(net, iif);
3261 if (!dev) {
3262 err = -ENODEV;
3263 goto errout_rcu;
3264 }
3265
3266 fl4.flowi4_iif = iif; /* for rt_fill_info */
3267 skb->dev = dev;
3268 skb->mark = mark;
3269 err = ip_route_input_rcu(skb, dst, src,
3270 rtm->rtm_tos & IPTOS_RT_MASK, dev,
3271 &res);
3272
3273 rt = skb_rtable(skb);
3274 if (err == 0 && rt->dst.error)
3275 err = -rt->dst.error;
3276 } else {
3277 fl4.flowi4_iif = LOOPBACK_IFINDEX;
3278 skb->dev = net->loopback_dev;
3279 rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
3280 err = 0;
3281 if (IS_ERR(rt))
3282 err = PTR_ERR(rt);
3283 else
3284 skb_dst_set(skb, &rt->dst);
3285 }
3286
3287 if (err)
3288 goto errout_rcu;
3289
3290 if (rtm->rtm_flags & RTM_F_NOTIFY)
3291 rt->rt_flags |= RTCF_NOTIFY;
3292
3293 if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
3294 table_id = res.table ? res.table->tb_id : 0;
3295
3296 /* reset skb for netlink reply msg */
3297 skb_trim(skb, 0);
3298 skb_reset_network_header(skb);
3299 skb_reset_transport_header(skb);
3300 skb_reset_mac_header(skb);
3301
3302 if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
3303 struct fib_rt_info fri;
3304
3305 if (!res.fi) {
3306 err = fib_props[res.type].error;
3307 if (!err)
3308 err = -EHOSTUNREACH;
3309 goto errout_rcu;
3310 }
3311 fri.fi = res.fi;
3312 fri.tb_id = table_id;
3313 fri.dst = res.prefix;
3314 fri.dst_len = res.prefixlen;
3315 fri.tos = fl4.flowi4_tos;
3316 fri.type = rt->rt_type;
3317 fri.offload = 0;
3318 fri.trap = 0;
3319 if (res.fa_head) {
3320 struct fib_alias *fa;
3321
3322 hlist_for_each_entry_rcu(fa, res.fa_head, fa_list) {
3323 u8 slen = 32 - fri.dst_len;
3324
3325 if (fa->fa_slen == slen &&
3326 fa->tb_id == fri.tb_id &&
3327 fa->fa_tos == fri.tos &&
3328 fa->fa_info == res.fi &&
3329 fa->fa_type == fri.type) {
3330 fri.offload = fa->offload;
3331 fri.trap = fa->trap;
3332 break;
3333 }
3334 }
3335 }
3336 err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
3337 nlh->nlmsg_seq, RTM_NEWROUTE, &fri, 0);
3338 } else {
3339 err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
3340 NETLINK_CB(in_skb).portid,
3341 nlh->nlmsg_seq, 0);
3342 }
3343 if (err < 0)
3344 goto errout_rcu;
3345
3346 rcu_read_unlock();
3347
3348 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
3349
3350 errout_free:
3351 return err;
3352 errout_rcu:
3353 rcu_read_unlock();
3354 kfree_skb(skb);
3355 goto errout_free;
3356 }
3357
3358 void ip_rt_multicast_event(struct in_device *in_dev)
3359 {
3360 rt_cache_flush(dev_net(in_dev->dev));
3361 }
3362
3363 #ifdef CONFIG_SYSCTL
3364 static int ip_rt_gc_interval __read_mostly = 60 * HZ;
3365 static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
3366 static int ip_rt_gc_elasticity __read_mostly = 8;
3367 static int ip_min_valid_pmtu __read_mostly = IPV4_MIN_MTU;
3368
3369 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
3370 void *buffer, size_t *lenp, loff_t *ppos)
3371 {
3372 struct net *net = (struct net *)__ctl->extra1;
3373
3374 if (write) {
3375 rt_cache_flush(net);
3376 fnhe_genid_bump(net);
3377 return 0;
3378 }
3379
3380 return -EINVAL;
3381 }
3382
3383 static struct ctl_table ipv4_route_table[] = {
3384 {
3385 .procname = "gc_thresh",
3386 .data = &ipv4_dst_ops.gc_thresh,
3387 .maxlen = sizeof(int),
3388 .mode = 0644,
3389 .proc_handler = proc_dointvec,
3390 },
3391 {
3392 .procname = "max_size",
3393 .data = &ip_rt_max_size,
3394 .maxlen = sizeof(int),
3395 .mode = 0644,
3396 .proc_handler = proc_dointvec,
3397 },
3398 {
3399 /* Deprecated. Use gc_min_interval_ms */
3400
3401 .procname = "gc_min_interval",
3402 .data = &ip_rt_gc_min_interval,
3403 .maxlen = sizeof(int),
3404 .mode = 0644,
3405 .proc_handler = proc_dointvec_jiffies,
3406 },
3407 {
3408 .procname = "gc_min_interval_ms",
3409 .data = &ip_rt_gc_min_interval,
3410 .maxlen = sizeof(int),
3411 .mode = 0644,
3412 .proc_handler = proc_dointvec_ms_jiffies,
3413 },
3414 {
3415 .procname = "gc_timeout",
3416 .data = &ip_rt_gc_timeout,
3417 .maxlen = sizeof(int),
3418 .mode = 0644,
3419 .proc_handler = proc_dointvec_jiffies,
3420 },
3421 {
3422 .procname = "gc_interval",
3423 .data = &ip_rt_gc_interval,
3424 .maxlen = sizeof(int),
3425 .mode = 0644,
3426 .proc_handler = proc_dointvec_jiffies,
3427 },
3428 {
3429 .procname = "redirect_load",
3430 .data = &ip_rt_redirect_load,
3431 .maxlen = sizeof(int),
3432 .mode = 0644,
3433 .proc_handler = proc_dointvec,
3434 },
3435 {
3436 .procname = "redirect_number",
3437 .data = &ip_rt_redirect_number,
3438 .maxlen = sizeof(int),
3439 .mode = 0644,
3440 .proc_handler = proc_dointvec,
3441 },
3442 {
3443 .procname = "redirect_silence",
3444 .data = &ip_rt_redirect_silence,
3445 .maxlen = sizeof(int),
3446 .mode = 0644,
3447 .proc_handler = proc_dointvec,
3448 },
3449 {
3450 .procname = "error_cost",
3451 .data = &ip_rt_error_cost,
3452 .maxlen = sizeof(int),
3453 .mode = 0644,
3454 .proc_handler = proc_dointvec,
3455 },
3456 {
3457 .procname = "error_burst",
3458 .data = &ip_rt_error_burst,
3459 .maxlen = sizeof(int),
3460 .mode = 0644,
3461 .proc_handler = proc_dointvec,
3462 },
3463 {
3464 .procname = "gc_elasticity",
3465 .data = &ip_rt_gc_elasticity,
3466 .maxlen = sizeof(int),
3467 .mode = 0644,
3468 .proc_handler = proc_dointvec,
3469 },
3470 {
3471 .procname = "mtu_expires",
3472 .data = &ip_rt_mtu_expires,
3473 .maxlen = sizeof(int),
3474 .mode = 0644,
3475 .proc_handler = proc_dointvec_jiffies,
3476 },
3477 {
3478 .procname = "min_pmtu",
3479 .data = &ip_rt_min_pmtu,
3480 .maxlen = sizeof(int),
3481 .mode = 0644,
3482 .proc_handler = proc_dointvec_minmax,
3483 .extra1 = &ip_min_valid_pmtu,
3484 },
3485 {
3486 .procname = "min_adv_mss",
3487 .data = &ip_rt_min_advmss,
3488 .maxlen = sizeof(int),
3489 .mode = 0644,
3490 .proc_handler = proc_dointvec,
3491 },
3492 { }
3493 };
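/* These entries are registered under "net/ipv4/route" (see
 * ip_static_sysctl_init() and sysctl_route_net_init() below), so they show
 * up as /proc/sys/net/ipv4/route/<procname>, e.g.
 * "sysctl net.ipv4.route.min_pmtu".
 */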
3494
3495 static const char ipv4_route_flush_procname[] = "flush";
3496
3497 static struct ctl_table ipv4_route_flush_table[] = {
3498 {
3499 .procname = ipv4_route_flush_procname,
3500 .maxlen = sizeof(int),
3501 .mode = 0200,
3502 .proc_handler = ipv4_sysctl_rtcache_flush,
3503 },
3504 { },
3505 };
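/* The flush entry is write-only (mode 0200): writing any value, e.g.
 * "echo 1 > /proc/sys/net/ipv4/route/flush", invalidates cached routes via
 * rt_cache_flush() and bumps the fnhe genid in ipv4_sysctl_rtcache_flush()
 * above.
 */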
3506
3507 static __net_init int sysctl_route_net_init(struct net *net)
3508 {
3509 struct ctl_table *tbl;
3510
3511 tbl = ipv4_route_flush_table;
3512 if (!net_eq(net, &init_net)) {
3513 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3514 if (!tbl)
3515 goto err_dup;
3516
3517 /* Don't export non-whitelisted sysctls to unprivileged users */
3518 if (net->user_ns != &init_user_ns) {
3519 if (tbl[0].procname != ipv4_route_flush_procname)
3520 tbl[0].procname = NULL;
3521 }
3522 }
3523 tbl[0].extra1 = net;
3524
3525 net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
3526 if (!net->ipv4.route_hdr)
3527 goto err_reg;
3528 return 0;
3529
3530 err_reg:
3531 if (tbl != ipv4_route_flush_table)
3532 kfree(tbl);
3533 err_dup:
3534 return -ENOMEM;
3535 }
3536
3537 static __net_exit void sysctl_route_net_exit(struct net *net)
3538 {
3539 struct ctl_table *tbl;
3540
3541 tbl = net->ipv4.route_hdr->ctl_table_arg;
3542 unregister_net_sysctl_table(net->ipv4.route_hdr);
3543 BUG_ON(tbl == ipv4_route_flush_table);
3544 kfree(tbl);
3545 }
3546
3547 static __net_initdata struct pernet_operations sysctl_route_ops = {
3548 .init = sysctl_route_net_init,
3549 .exit = sysctl_route_net_exit,
3550 };
3551 #endif
3552
3553 static __net_init int rt_genid_init(struct net *net)
3554 {
3555 atomic_set(&net->ipv4.rt_genid, 0);
3556 atomic_set(&net->fnhe_genid, 0);
3557 atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
3558 return 0;
3559 }
3560
3561 static __net_initdata struct pernet_operations rt_genid_ops = {
3562 .init = rt_genid_init,
3563 };
3564
3565 static int __net_init ipv4_inetpeer_init(struct net *net)
3566 {
3567 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3568
3569 if (!bp)
3570 return -ENOMEM;
3571 inet_peer_base_init(bp);
3572 net->ipv4.peers = bp;
3573 return 0;
3574 }
3575
3576 static void __net_exit ipv4_inetpeer_exit(struct net *net)
3577 {
3578 struct inet_peer_base *bp = net->ipv4.peers;
3579
3580 net->ipv4.peers = NULL;
3581 inetpeer_invalidate_tree(bp);
3582 kfree(bp);
3583 }
3584
3585 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3586 .init = ipv4_inetpeer_init,
3587 .exit = ipv4_inetpeer_exit,
3588 };
3589
3590 #ifdef CONFIG_IP_ROUTE_CLASSID
3591 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3592 #endif /* CONFIG_IP_ROUTE_CLASSID */
3593
3594 int __init ip_rt_init(void)
3595 {
3596 void *idents_hash;
3597 int cpu;
3598
3599 /* For modern hosts, this will use 2 MB of memory */
3600 idents_hash = alloc_large_system_hash("IP idents",
3601 sizeof(*ip_idents) + sizeof(*ip_tstamps),
3602 0,
3603 16, /* one bucket per 64 KB */
3604 HASH_ZERO,
3605 NULL,
3606 &ip_idents_mask,
3607 2048,
3608 256*1024);
3609
3610 ip_idents = idents_hash;
3611
3612 prandom_bytes(ip_idents, (ip_idents_mask + 1) * sizeof(*ip_idents));
3613
3614 ip_tstamps = idents_hash + (ip_idents_mask + 1) * sizeof(*ip_idents);
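/* ip_idents and ip_tstamps share the single allocation made above: the first
 * (ip_idents_mask + 1) entries are the atomic IP-ID counters, and the
 * timestamps that follow record when each slot was last used so the ID
 * generator can scale its random increment with the idle time.
 */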
3615
3616 for_each_possible_cpu(cpu) {
3617 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3618
3619 INIT_LIST_HEAD(&ul->head);
3620 spin_lock_init(&ul->lock);
3621 }
3622 #ifdef CONFIG_IP_ROUTE_CLASSID
3623 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3624 if (!ip_rt_acct)
3625 panic("IP: failed to allocate ip_rt_acct\n");
3626 #endif
3627
3628 ipv4_dst_ops.kmem_cachep =
3629 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3630 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3631
3632 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3633
3634 if (dst_entries_init(&ipv4_dst_ops) < 0)
3635 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3636
3637 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3638 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3639
3640 ipv4_dst_ops.gc_thresh = ~0;
3641 ip_rt_max_size = INT_MAX;
3642
3643 devinet_init();
3644 ip_fib_init();
3645
3646 if (ip_rt_proc_init())
3647 pr_err("Unable to create route proc files\n");
3648 #ifdef CONFIG_XFRM
3649 xfrm_init();
3650 xfrm4_init();
3651 #endif
3652 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
3653 RTNL_FLAG_DOIT_UNLOCKED);
3654
3655 #ifdef CONFIG_SYSCTL
3656 register_pernet_subsys(&sysctl_route_ops);
3657 #endif
3658 register_pernet_subsys(&rt_genid_ops);
3659 register_pernet_subsys(&ipv4_inetpeer_ops);
3660 return 0;
3661 }
3662
3663 #ifdef CONFIG_SYSCTL
3664 /*
3665 * We really need to sanitize the damn ipv4 init order, then all
3666 * this nonsense will go away.
3667 */
3668 void __init ip_static_sysctl_init(void)
3669 {
3670 register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3671 }
3672 #endif
3673