1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
6 *
7 * ROUTE - implementation of the IP router.
8 *
9 * Authors: Ross Biro
10 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
11 * Alan Cox, <gw4pts@gw4pts.ampr.org>
12 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
13 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
14 *
15 * Fixes:
16 * Alan Cox : Verify area fixes.
17 * Alan Cox : cli() protects routing changes
18 * Rui Oliveira : ICMP routing table updates
19 * (rco@di.uminho.pt) Routing table insertion and update
20 * Linus Torvalds : Rewrote bits to be sensible
21 * Alan Cox : Added BSD route gw semantics
22 * Alan Cox : Super /proc >4K
23 * Alan Cox : MTU in route table
24 * Alan Cox : MSS actually. Also added the window
25 * clamper.
26 * Sam Lantinga : Fixed route matching in rt_del()
27 * Alan Cox : Routing cache support.
28 * Alan Cox : Removed compatibility cruft.
29 * Alan Cox : RTF_REJECT support.
30 * Alan Cox : TCP irtt support.
31 * Jonathan Naylor : Added Metric support.
32 * Miquel van Smoorenburg : BSD API fixes.
33 * Miquel van Smoorenburg : Metrics.
34 * Alan Cox : Use __u32 properly
35 * Alan Cox : Aligned routing errors more closely with BSD
36 * our system is still very different.
37 * Alan Cox : Faster /proc handling
38 * Alexey Kuznetsov : Massive rework to support tree based routing,
39 * routing caches and better behaviour.
40 *
41 * Olaf Erb : irtt wasn't being copied right.
42 * Bjorn Ekwall : Kerneld route support.
43 * Alan Cox : Multicast fixed (I hope)
44 * Pavel Krauz : Limited broadcast fixed
45 * Mike McLagan : Routing by source
46 * Alexey Kuznetsov : End of old history. Split to fib.c and
47 * route.c and rewritten from scratch.
48 * Andi Kleen : Load-limit warning messages.
49 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
50 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
51 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
52 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
53 * Marc Boucher : routing by fwmark
54 * Robert Olsson : Added rt_cache statistics
55 * Arnaldo C. Melo : Convert proc stuff to seq_file
56 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
57 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
58 * Ilia Sotnikov : Removed TOS from hash calculations
59 */
60
61 #define pr_fmt(fmt) "IPv4: " fmt
62
63 #include <linux/module.h>
64 #include <linux/uaccess.h>
65 #include <linux/bitops.h>
66 #include <linux/types.h>
67 #include <linux/kernel.h>
68 #include <linux/mm.h>
69 #include <linux/memblock.h>
70 #include <linux/string.h>
71 #include <linux/socket.h>
72 #include <linux/sockios.h>
73 #include <linux/errno.h>
74 #include <linux/in.h>
75 #include <linux/inet.h>
76 #include <linux/netdevice.h>
77 #include <linux/proc_fs.h>
78 #include <linux/init.h>
79 #include <linux/skbuff.h>
80 #include <linux/inetdevice.h>
81 #include <linux/igmp.h>
82 #include <linux/pkt_sched.h>
83 #include <linux/mroute.h>
84 #include <linux/netfilter_ipv4.h>
85 #include <linux/random.h>
86 #include <linux/rcupdate.h>
87 #include <linux/times.h>
88 #include <linux/slab.h>
89 #include <linux/jhash.h>
90 #include <net/dst.h>
91 #include <net/dst_metadata.h>
92 #include <net/net_namespace.h>
93 #include <net/protocol.h>
94 #include <net/ip.h>
95 #include <net/route.h>
96 #include <net/inetpeer.h>
97 #include <net/sock.h>
98 #include <net/ip_fib.h>
99 #include <net/nexthop.h>
100 #include <net/arp.h>
101 #include <net/tcp.h>
102 #include <net/icmp.h>
103 #include <net/xfrm.h>
104 #include <net/lwtunnel.h>
105 #include <net/netevent.h>
106 #include <net/rtnetlink.h>
107 #ifdef CONFIG_SYSCTL
108 #include <linux/sysctl.h>
109 #endif
110 #include <net/secure_seq.h>
111 #include <net/ip_tunnels.h>
112 #include <net/l3mdev.h>
113
114 #include "fib_lookup.h"
115
116 #define RT_FL_TOS(oldflp4) \
117 ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
118
119 #define RT_GC_TIMEOUT (300*HZ)
120
121 static int ip_rt_max_size;
122 static int ip_rt_redirect_number __read_mostly = 9;
123 static int ip_rt_redirect_load __read_mostly = HZ / 50;
124 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
125 static int ip_rt_error_cost __read_mostly = HZ;
126 static int ip_rt_error_burst __read_mostly = 5 * HZ;
127 static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
128 static u32 ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
129 static int ip_rt_min_advmss __read_mostly = 256;
130
131 static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
132
133 /*
134 * Interface to generic destination cache.
135 */
136
137 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
138 static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
139 static unsigned int ipv4_mtu(const struct dst_entry *dst);
140 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
141 static void ipv4_link_failure(struct sk_buff *skb);
142 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
143 struct sk_buff *skb, u32 mtu,
144 bool confirm_neigh);
145 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk,
146 struct sk_buff *skb);
147 static void ipv4_dst_destroy(struct dst_entry *dst);
148
149 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
150 {
151 WARN_ON(1);
152 return NULL;
153 }
154
155 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
156 struct sk_buff *skb,
157 const void *daddr);
158 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
159
160 static struct dst_ops ipv4_dst_ops = {
161 .family = AF_INET,
162 .check = ipv4_dst_check,
163 .default_advmss = ipv4_default_advmss,
164 .mtu = ipv4_mtu,
165 .cow_metrics = ipv4_cow_metrics,
166 .destroy = ipv4_dst_destroy,
167 .negative_advice = ipv4_negative_advice,
168 .link_failure = ipv4_link_failure,
169 .update_pmtu = ip_rt_update_pmtu,
170 .redirect = ip_do_redirect,
171 .local_out = __ip_local_out,
172 .neigh_lookup = ipv4_neigh_lookup,
173 .confirm_neigh = ipv4_confirm_neigh,
174 };
175
176 #define ECN_OR_COST(class) TC_PRIO_##class
177
178 const __u8 ip_tos2prio[16] = {
179 TC_PRIO_BESTEFFORT,
180 ECN_OR_COST(BESTEFFORT),
181 TC_PRIO_BESTEFFORT,
182 ECN_OR_COST(BESTEFFORT),
183 TC_PRIO_BULK,
184 ECN_OR_COST(BULK),
185 TC_PRIO_BULK,
186 ECN_OR_COST(BULK),
187 TC_PRIO_INTERACTIVE,
188 ECN_OR_COST(INTERACTIVE),
189 TC_PRIO_INTERACTIVE,
190 ECN_OR_COST(INTERACTIVE),
191 TC_PRIO_INTERACTIVE_BULK,
192 ECN_OR_COST(INTERACTIVE_BULK),
193 TC_PRIO_INTERACTIVE_BULK,
194 ECN_OR_COST(INTERACTIVE_BULK)
195 };
196 EXPORT_SYMBOL(ip_tos2prio);
197
198 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
199 #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
200
201 #ifdef CONFIG_PROC_FS
202 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
203 {
204 if (*pos)
205 return NULL;
206 return SEQ_START_TOKEN;
207 }
208
209 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
210 {
211 ++*pos;
212 return NULL;
213 }
214
215 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
216 {
217 }
218
219 static int rt_cache_seq_show(struct seq_file *seq, void *v)
220 {
221 if (v == SEQ_START_TOKEN)
222 seq_printf(seq, "%-127s\n",
223 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
224 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
225 "HHUptod\tSpecDst");
226 return 0;
227 }
228
229 static const struct seq_operations rt_cache_seq_ops = {
230 .start = rt_cache_seq_start,
231 .next = rt_cache_seq_next,
232 .stop = rt_cache_seq_stop,
233 .show = rt_cache_seq_show,
234 };
235
236 static int rt_cache_seq_open(struct inode *inode, struct file *file)
237 {
238 return seq_open(file, &rt_cache_seq_ops);
239 }
240
241 static const struct proc_ops rt_cache_proc_ops = {
242 .proc_open = rt_cache_seq_open,
243 .proc_read = seq_read,
244 .proc_lseek = seq_lseek,
245 .proc_release = seq_release,
246 };
247
248
249 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
250 {
251 int cpu;
252
253 if (*pos == 0)
254 return SEQ_START_TOKEN;
255
256 for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
257 if (!cpu_possible(cpu))
258 continue;
259 *pos = cpu+1;
260 return &per_cpu(rt_cache_stat, cpu);
261 }
262 return NULL;
263 }
264
265 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
266 {
267 int cpu;
268
269 for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
270 if (!cpu_possible(cpu))
271 continue;
272 *pos = cpu+1;
273 return &per_cpu(rt_cache_stat, cpu);
274 }
275 (*pos)++;
276 return NULL;
277
278 }
279
280 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
281 {
282
283 }
284
285 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
286 {
287 struct rt_cache_stat *st = v;
288
289 if (v == SEQ_START_TOKEN) {
290 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
291 return 0;
292 }
293
294 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
295 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
296 dst_entries_get_slow(&ipv4_dst_ops),
297 0, /* st->in_hit */
298 st->in_slow_tot,
299 st->in_slow_mc,
300 st->in_no_route,
301 st->in_brd,
302 st->in_martian_dst,
303 st->in_martian_src,
304
305 0, /* st->out_hit */
306 st->out_slow_tot,
307 st->out_slow_mc,
308
309 0, /* st->gc_total */
310 0, /* st->gc_ignored */
311 0, /* st->gc_goal_miss */
312 0, /* st->gc_dst_overflow */
313 0, /* st->in_hlist_search */
314 0 /* st->out_hlist_search */
315 );
316 return 0;
317 }
318
319 static const struct seq_operations rt_cpu_seq_ops = {
320 .start = rt_cpu_seq_start,
321 .next = rt_cpu_seq_next,
322 .stop = rt_cpu_seq_stop,
323 .show = rt_cpu_seq_show,
324 };
325
326
327 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
328 {
329 return seq_open(file, &rt_cpu_seq_ops);
330 }
331
332 static const struct proc_ops rt_cpu_proc_ops = {
333 .proc_open = rt_cpu_seq_open,
334 .proc_read = seq_read,
335 .proc_lseek = seq_lseek,
336 .proc_release = seq_release,
337 };
338
339 #ifdef CONFIG_IP_ROUTE_CLASSID
340 static int rt_acct_proc_show(struct seq_file *m, void *v)
341 {
342 struct ip_rt_acct *dst, *src;
343 unsigned int i, j;
344
345 dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
346 if (!dst)
347 return -ENOMEM;
348
349 for_each_possible_cpu(i) {
350 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
351 for (j = 0; j < 256; j++) {
352 dst[j].o_bytes += src[j].o_bytes;
353 dst[j].o_packets += src[j].o_packets;
354 dst[j].i_bytes += src[j].i_bytes;
355 dst[j].i_packets += src[j].i_packets;
356 }
357 }
358
359 seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
360 kfree(dst);
361 return 0;
362 }
363 #endif
364
365 static int __net_init ip_rt_do_proc_init(struct net *net)
366 {
367 struct proc_dir_entry *pde;
368
369 pde = proc_create("rt_cache", 0444, net->proc_net,
370 &rt_cache_proc_ops);
371 if (!pde)
372 goto err1;
373
374 pde = proc_create("rt_cache", 0444,
375 net->proc_net_stat, &rt_cpu_proc_ops);
376 if (!pde)
377 goto err2;
378
379 #ifdef CONFIG_IP_ROUTE_CLASSID
380 pde = proc_create_single("rt_acct", 0, net->proc_net,
381 rt_acct_proc_show);
382 if (!pde)
383 goto err3;
384 #endif
385 return 0;
386
387 #ifdef CONFIG_IP_ROUTE_CLASSID
388 err3:
389 remove_proc_entry("rt_cache", net->proc_net_stat);
390 #endif
391 err2:
392 remove_proc_entry("rt_cache", net->proc_net);
393 err1:
394 return -ENOMEM;
395 }
396
397 static void __net_exit ip_rt_do_proc_exit(struct net *net)
398 {
399 remove_proc_entry("rt_cache", net->proc_net_stat);
400 remove_proc_entry("rt_cache", net->proc_net);
401 #ifdef CONFIG_IP_ROUTE_CLASSID
402 remove_proc_entry("rt_acct", net->proc_net);
403 #endif
404 }
405
406 static struct pernet_operations ip_rt_proc_ops __net_initdata = {
407 .init = ip_rt_do_proc_init,
408 .exit = ip_rt_do_proc_exit,
409 };
410
411 static int __init ip_rt_proc_init(void)
412 {
413 return register_pernet_subsys(&ip_rt_proc_ops);
414 }
415
416 #else
417 static inline int ip_rt_proc_init(void)
418 {
419 return 0;
420 }
421 #endif /* CONFIG_PROC_FS */
422
423 static inline bool rt_is_expired(const struct rtable *rth)
424 {
425 return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
426 }
427
428 void rt_cache_flush(struct net *net)
429 {
430 rt_genid_bump_ipv4(net);
431 }
432
433 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
434 struct sk_buff *skb,
435 const void *daddr)
436 {
437 const struct rtable *rt = container_of(dst, struct rtable, dst);
438 struct net_device *dev = dst->dev;
439 struct neighbour *n;
440
441 rcu_read_lock_bh();
442
443 if (likely(rt->rt_gw_family == AF_INET)) {
444 n = ip_neigh_gw4(dev, rt->rt_gw4);
445 } else if (rt->rt_gw_family == AF_INET6) {
446 n = ip_neigh_gw6(dev, &rt->rt_gw6);
447 } else {
448 __be32 pkey;
449
450 pkey = skb ? ip_hdr(skb)->daddr : *((__be32 *) daddr);
451 n = ip_neigh_gw4(dev, pkey);
452 }
453
454 if (!IS_ERR(n) && !refcount_inc_not_zero(&n->refcnt))
455 n = NULL;
456
457 rcu_read_unlock_bh();
458
459 return n;
460 }
461
462 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
463 {
464 const struct rtable *rt = container_of(dst, struct rtable, dst);
465 struct net_device *dev = dst->dev;
466 const __be32 *pkey = daddr;
467
468 if (rt->rt_gw_family == AF_INET) {
469 pkey = (const __be32 *)&rt->rt_gw4;
470 } else if (rt->rt_gw_family == AF_INET6) {
471 return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6);
472 } else if (!daddr ||
473 (rt->rt_flags &
474 (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) {
475 return;
476 }
477 __ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
478 }
479
480 /* Hash tables of size 2048..262144 depending on RAM size.
481 * Each bucket uses 8 bytes.
482 */
483 static u32 ip_idents_mask __read_mostly;
484 static atomic_t *ip_idents __read_mostly;
485 static u32 *ip_tstamps __read_mostly;
486
487 /* In order to protect privacy, we add a perturbation to identifiers
488 * if one generator is seldom used. This makes it hard for an attacker
489 * to infer how many packets were sent between two points in time.
490 */
491 u32 ip_idents_reserve(u32 hash, int segs)
492 {
493 u32 bucket, old, now = (u32)jiffies;
494 atomic_t *p_id;
495 u32 *p_tstamp;
496 u32 delta = 0;
497
498 bucket = hash & ip_idents_mask;
499 p_tstamp = ip_tstamps + bucket;
500 p_id = ip_idents + bucket;
501 old = READ_ONCE(*p_tstamp);
502
503 if (old != now && cmpxchg(p_tstamp, old, now) == old)
504 delta = prandom_u32_max(now - old);
505
506 /* If UBSAN reports an error there, please make sure your compiler
507 * supports -fno-strict-overflow before reporting it: that was a bug
508 * in UBSAN, and it has been fixed in GCC-8.
509 */
510 return atomic_add_return(segs + delta, p_id) - segs;
511 }
512 EXPORT_SYMBOL(ip_idents_reserve);
513
514 void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
515 {
516 u32 hash, id;
517
518 /* Note the following code is not safe, but this is okay. */
519 if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
520 get_random_bytes(&net->ipv4.ip_id_key,
521 sizeof(net->ipv4.ip_id_key));
522
523 hash = siphash_3u32((__force u32)iph->daddr,
524 (__force u32)iph->saddr,
525 iph->protocol,
526 &net->ipv4.ip_id_key);
527 id = ip_idents_reserve(hash, segs);
528 iph->id = htons(id);
529 }
530 EXPORT_SYMBOL(__ip_select_ident);
531
532 static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
533 const struct sock *sk,
534 const struct iphdr *iph,
535 int oif, u8 tos,
536 u8 prot, u32 mark, int flow_flags)
537 {
538 if (sk) {
539 const struct inet_sock *inet = inet_sk(sk);
540
541 oif = sk->sk_bound_dev_if;
542 mark = sk->sk_mark;
543 tos = RT_CONN_FLAGS(sk);
544 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
545 }
546 flowi4_init_output(fl4, oif, mark, tos,
547 RT_SCOPE_UNIVERSE, prot,
548 flow_flags,
549 iph->daddr, iph->saddr, 0, 0,
550 sock_net_uid(net, sk));
551 }
552
553 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
554 const struct sock *sk)
555 {
556 const struct net *net = dev_net(skb->dev);
557 const struct iphdr *iph = ip_hdr(skb);
558 int oif = skb->dev->ifindex;
559 u8 tos = RT_TOS(iph->tos);
560 u8 prot = iph->protocol;
561 u32 mark = skb->mark;
562
563 __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
564 }
565
566 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
567 {
568 const struct inet_sock *inet = inet_sk(sk);
569 const struct ip_options_rcu *inet_opt;
570 __be32 daddr = inet->inet_daddr;
571
572 rcu_read_lock();
573 inet_opt = rcu_dereference(inet->inet_opt);
574 if (inet_opt && inet_opt->opt.srr)
575 daddr = inet_opt->opt.faddr;
576 flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
577 RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
578 inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
579 inet_sk_flowi_flags(sk),
580 daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
581 rcu_read_unlock();
582 }
583
584 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
585 const struct sk_buff *skb)
586 {
587 if (skb)
588 build_skb_flow_key(fl4, skb, sk);
589 else
590 build_sk_flow_key(fl4, sk);
591 }
592
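/* Serializes all additions and removals on the per-nexthop exception
 * (fnhe) hash chains below.
 */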
593 static DEFINE_SPINLOCK(fnhe_lock);
594
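/* Drop the input and output routes cached on an exception so they are
 * re-created (and re-bound) on the next lookup.
 */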
595 static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
596 {
597 struct rtable *rt;
598
599 rt = rcu_dereference(fnhe->fnhe_rth_input);
600 if (rt) {
601 RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
602 dst_dev_put(&rt->dst);
603 dst_release(&rt->dst);
604 }
605 rt = rcu_dereference(fnhe->fnhe_rth_output);
606 if (rt) {
607 RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
608 dst_dev_put(&rt->dst);
609 dst_release(&rt->dst);
610 }
611 }
612
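/* Evict the exception with the oldest stamp from a hash bucket when its
 * chain has grown too deep. Caller holds fnhe_lock.
 */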
613 static void fnhe_remove_oldest(struct fnhe_hash_bucket *hash)
614 {
615 struct fib_nh_exception __rcu **fnhe_p, **oldest_p;
616 struct fib_nh_exception *fnhe, *oldest = NULL;
617
618 for (fnhe_p = &hash->chain; ; fnhe_p = &fnhe->fnhe_next) {
619 fnhe = rcu_dereference_protected(*fnhe_p,
620 lockdep_is_held(&fnhe_lock));
621 if (!fnhe)
622 break;
623 if (!oldest ||
624 time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp)) {
625 oldest = fnhe;
626 oldest_p = fnhe_p;
627 }
628 }
629 fnhe_flush_routes(oldest);
630 *oldest_p = oldest->fnhe_next;
631 kfree_rcu(oldest, rcu);
632 }
633
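/* Hash a destination address into the exception table using a boot-time
 * random siphash key, so bucket placement is not guessable by remote hosts.
 */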
634 static u32 fnhe_hashfun(__be32 daddr)
635 {
636 static siphash_key_t fnhe_hash_key __read_mostly;
637 u64 hval;
638
639 net_get_random_once(&fnhe_hash_key, sizeof(fnhe_hash_key));
640 hval = siphash_1u32((__force u32)daddr, &fnhe_hash_key);
641 return hash_64(hval, FNHE_HASH_SHIFT);
642 }
643
644 static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
645 {
646 rt->rt_pmtu = fnhe->fnhe_pmtu;
647 rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
648 rt->dst.expires = fnhe->fnhe_expires;
649
650 if (fnhe->fnhe_gw) {
651 rt->rt_flags |= RTCF_REDIRECTED;
652 rt->rt_uses_gateway = 1;
653 rt->rt_gw_family = AF_INET;
654 rt->rt_gw4 = fnhe->fnhe_gw;
655 }
656 }
657
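/* Record or refresh a per-destination exception on a nexthop: a gateway
 * learned from an ICMP redirect and/or a path MTU, with an expiry time.
 * Cached routes for the nexthop are marked DST_OBSOLETE_KILL so they get
 * re-validated against the new exception.
 */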
658 static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr,
659 __be32 gw, u32 pmtu, bool lock,
660 unsigned long expires)
661 {
662 struct fnhe_hash_bucket *hash;
663 struct fib_nh_exception *fnhe;
664 struct rtable *rt;
665 u32 genid, hval;
666 unsigned int i;
667 int depth;
668
669 genid = fnhe_genid(dev_net(nhc->nhc_dev));
670 hval = fnhe_hashfun(daddr);
671
672 spin_lock_bh(&fnhe_lock);
673
674 hash = rcu_dereference(nhc->nhc_exceptions);
675 if (!hash) {
676 hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
677 if (!hash)
678 goto out_unlock;
679 rcu_assign_pointer(nhc->nhc_exceptions, hash);
680 }
681
682 hash += hval;
683
684 depth = 0;
685 for (fnhe = rcu_dereference(hash->chain); fnhe;
686 fnhe = rcu_dereference(fnhe->fnhe_next)) {
687 if (fnhe->fnhe_daddr == daddr)
688 break;
689 depth++;
690 }
691
692 if (fnhe) {
693 if (fnhe->fnhe_genid != genid)
694 fnhe->fnhe_genid = genid;
695 if (gw)
696 fnhe->fnhe_gw = gw;
697 if (pmtu) {
698 fnhe->fnhe_pmtu = pmtu;
699 fnhe->fnhe_mtu_locked = lock;
700 }
701 fnhe->fnhe_expires = max(1UL, expires);
702 /* Update all cached dsts too */
703 rt = rcu_dereference(fnhe->fnhe_rth_input);
704 if (rt)
705 fill_route_from_fnhe(rt, fnhe);
706 rt = rcu_dereference(fnhe->fnhe_rth_output);
707 if (rt)
708 fill_route_from_fnhe(rt, fnhe);
709 } else {
710 /* Randomize max depth to avoid some side channels attacks. */
711 int max_depth = FNHE_RECLAIM_DEPTH +
712 prandom_u32_max(FNHE_RECLAIM_DEPTH);
713
714 while (depth > max_depth) {
715 fnhe_remove_oldest(hash);
716 depth--;
717 }
718
719 fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
720 if (!fnhe)
721 goto out_unlock;
722
723 fnhe->fnhe_next = hash->chain;
724
725 fnhe->fnhe_genid = genid;
726 fnhe->fnhe_daddr = daddr;
727 fnhe->fnhe_gw = gw;
728 fnhe->fnhe_pmtu = pmtu;
729 fnhe->fnhe_mtu_locked = lock;
730 fnhe->fnhe_expires = max(1UL, expires);
731
732 rcu_assign_pointer(hash->chain, fnhe);
733
734 /* Exception created; mark the cached routes for the nexthop
735 * stale, so anyone caching it rechecks if this exception
736 * applies to them.
737 */
738 rt = rcu_dereference(nhc->nhc_rth_input);
739 if (rt)
740 rt->dst.obsolete = DST_OBSOLETE_KILL;
741
742 for_each_possible_cpu(i) {
743 struct rtable __rcu **prt;
744 prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i);
745 rt = rcu_dereference(*prt);
746 if (rt)
747 rt->dst.obsolete = DST_OBSOLETE_KILL;
748 }
749 }
750
751 fnhe->fnhe_stamp = jiffies;
752
753 out_unlock:
754 spin_unlock_bh(&fnhe_lock);
755 }
756
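/* Handle an ICMP redirect: sanity-check the advertised gateway, make sure
 * a neighbour entry exists (or create one) for it, and record it as a
 * nexthop exception so subsequent traffic uses the new gateway.
 */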
757 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
758 bool kill_route)
759 {
760 __be32 new_gw = icmp_hdr(skb)->un.gateway;
761 __be32 old_gw = ip_hdr(skb)->saddr;
762 struct net_device *dev = skb->dev;
763 struct in_device *in_dev;
764 struct fib_result res;
765 struct neighbour *n;
766 struct net *net;
767
768 switch (icmp_hdr(skb)->code & 7) {
769 case ICMP_REDIR_NET:
770 case ICMP_REDIR_NETTOS:
771 case ICMP_REDIR_HOST:
772 case ICMP_REDIR_HOSTTOS:
773 break;
774
775 default:
776 return;
777 }
778
779 if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw)
780 return;
781
782 in_dev = __in_dev_get_rcu(dev);
783 if (!in_dev)
784 return;
785
786 net = dev_net(dev);
787 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
788 ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
789 ipv4_is_zeronet(new_gw))
790 goto reject_redirect;
791
792 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
793 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
794 goto reject_redirect;
795 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
796 goto reject_redirect;
797 } else {
798 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
799 goto reject_redirect;
800 }
801
802 n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
803 if (!n)
804 n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
805 if (!IS_ERR(n)) {
806 if (!(n->nud_state & NUD_VALID)) {
807 neigh_event_send(n, NULL);
808 } else {
809 if (fib_lookup(net, fl4, &res, 0) == 0) {
810 struct fib_nh_common *nhc;
811
812 fib_select_path(net, &res, fl4, skb);
813 nhc = FIB_RES_NHC(res);
814 update_or_create_fnhe(nhc, fl4->daddr, new_gw,
815 0, false,
816 jiffies + ip_rt_gc_timeout);
817 }
818 if (kill_route)
819 rt->dst.obsolete = DST_OBSOLETE_KILL;
820 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
821 }
822 neigh_release(n);
823 }
824 return;
825
826 reject_redirect:
827 #ifdef CONFIG_IP_ROUTE_VERBOSE
828 if (IN_DEV_LOG_MARTIANS(in_dev)) {
829 const struct iphdr *iph = (const struct iphdr *) skb->data;
830 __be32 daddr = iph->daddr;
831 __be32 saddr = iph->saddr;
832
833 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
834 " Advised path = %pI4 -> %pI4\n",
835 &old_gw, dev->name, &new_gw,
836 &saddr, &daddr);
837 }
838 #endif
839 ;
840 }
841
842 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
843 {
844 struct rtable *rt;
845 struct flowi4 fl4;
846 const struct iphdr *iph = (const struct iphdr *) skb->data;
847 struct net *net = dev_net(skb->dev);
848 int oif = skb->dev->ifindex;
849 u8 tos = RT_TOS(iph->tos);
850 u8 prot = iph->protocol;
851 u32 mark = skb->mark;
852
853 rt = (struct rtable *) dst;
854
855 __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
856 __ip_do_redirect(rt, skb, &fl4, true);
857 }
858
859 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
860 {
861 struct rtable *rt = (struct rtable *)dst;
862 struct dst_entry *ret = dst;
863
864 if (rt) {
865 if (dst->obsolete > 0) {
866 ip_rt_put(rt);
867 ret = NULL;
868 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
869 rt->dst.expires) {
870 ip_rt_put(rt);
871 ret = NULL;
872 }
873 }
874 return ret;
875 }
876
877 /*
878 * Algorithm:
879 * 1. The first ip_rt_redirect_number redirects are sent
880 * with exponential backoff, then we stop sending them at all,
881 * assuming that the host ignores our redirects.
882 * 2. If we did not see packets requiring redirects
883 * during ip_rt_redirect_silence, we assume that the host
884 * forgot redirected route and start to send redirects again.
885 *
886 * This algorithm is much cheaper and more intelligent than dumb load limiting
887 * in icmp.c.
888 *
889 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
890 * and "frag. need" (breaks PMTU discovery) in icmp.c.
891 */
892
893 void ip_rt_send_redirect(struct sk_buff *skb)
894 {
895 struct rtable *rt = skb_rtable(skb);
896 struct in_device *in_dev;
897 struct inet_peer *peer;
898 struct net *net;
899 int log_martians;
900 int vif;
901
902 rcu_read_lock();
903 in_dev = __in_dev_get_rcu(rt->dst.dev);
904 if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
905 rcu_read_unlock();
906 return;
907 }
908 log_martians = IN_DEV_LOG_MARTIANS(in_dev);
909 vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
910 rcu_read_unlock();
911
912 net = dev_net(rt->dst.dev);
913 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
914 if (!peer) {
915 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
916 rt_nexthop(rt, ip_hdr(skb)->daddr));
917 return;
918 }
919
920 /* No redirected packets during ip_rt_redirect_silence;
921 * reset the algorithm.
922 */
923 if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
924 peer->rate_tokens = 0;
925 peer->n_redirects = 0;
926 }
927
928 /* Too many ignored redirects; do not send anything
929 * and set dst.rate_last to the last seen redirected packet.
930 */
931 if (peer->n_redirects >= ip_rt_redirect_number) {
932 peer->rate_last = jiffies;
933 goto out_put_peer;
934 }
935
936 /* Check for load limit; set rate_last to the latest sent
937 * redirect.
938 */
939 if (peer->n_redirects == 0 ||
940 time_after(jiffies,
941 (peer->rate_last +
942 (ip_rt_redirect_load << peer->n_redirects)))) {
943 __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
944
945 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
946 peer->rate_last = jiffies;
947 ++peer->n_redirects;
948 #ifdef CONFIG_IP_ROUTE_VERBOSE
949 if (log_martians &&
950 peer->n_redirects == ip_rt_redirect_number)
951 net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
952 &ip_hdr(skb)->saddr, inet_iif(skb),
953 &ip_hdr(skb)->daddr, &gw);
954 #endif
955 }
956 out_put_peer:
957 inet_putpeer(peer);
958 }
959
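/* Input path error handler: send a rate-limited ICMP destination
 * unreachable matching rt->dst.error, using the inet_peer cache for
 * per-source token buckets, then drop the packet.
 */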
960 static int ip_error(struct sk_buff *skb)
961 {
962 struct rtable *rt = skb_rtable(skb);
963 struct net_device *dev = skb->dev;
964 struct in_device *in_dev;
965 struct inet_peer *peer;
966 unsigned long now;
967 struct net *net;
968 bool send;
969 int code;
970
971 if (netif_is_l3_master(skb->dev)) {
972 dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
973 if (!dev)
974 goto out;
975 }
976
977 in_dev = __in_dev_get_rcu(dev);
978
979 /* IP on this device is disabled. */
980 if (!in_dev)
981 goto out;
982
983 net = dev_net(rt->dst.dev);
984 if (!IN_DEV_FORWARD(in_dev)) {
985 switch (rt->dst.error) {
986 case EHOSTUNREACH:
987 __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
988 break;
989
990 case ENETUNREACH:
991 __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
992 break;
993 }
994 goto out;
995 }
996
997 switch (rt->dst.error) {
998 case EINVAL:
999 default:
1000 goto out;
1001 case EHOSTUNREACH:
1002 code = ICMP_HOST_UNREACH;
1003 break;
1004 case ENETUNREACH:
1005 code = ICMP_NET_UNREACH;
1006 __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
1007 break;
1008 case EACCES:
1009 code = ICMP_PKT_FILTERED;
1010 break;
1011 }
1012
1013 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
1014 l3mdev_master_ifindex(skb->dev), 1);
1015
1016 send = true;
1017 if (peer) {
1018 now = jiffies;
1019 peer->rate_tokens += now - peer->rate_last;
1020 if (peer->rate_tokens > ip_rt_error_burst)
1021 peer->rate_tokens = ip_rt_error_burst;
1022 peer->rate_last = now;
1023 if (peer->rate_tokens >= ip_rt_error_cost)
1024 peer->rate_tokens -= ip_rt_error_cost;
1025 else
1026 send = false;
1027 inet_putpeer(peer);
1028 }
1029 if (send)
1030 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1031
1032 out: kfree_skb(skb);
1033 return 0;
1034 }
1035
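/* Learn a new path MTU for the flow: values below ip_rt_min_pmtu are
 * clamped and lock the MTU, and the result is stored as a nexthop
 * exception that expires after ip_rt_mtu_expires.
 */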
1036 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1037 {
1038 struct dst_entry *dst = &rt->dst;
1039 struct net *net = dev_net(dst->dev);
1040 struct fib_result res;
1041 bool lock = false;
1042 u32 old_mtu;
1043
1044 if (ip_mtu_locked(dst))
1045 return;
1046
1047 old_mtu = ipv4_mtu(dst);
1048 if (old_mtu < mtu)
1049 return;
1050
1051 if (mtu < ip_rt_min_pmtu) {
1052 lock = true;
1053 mtu = min(old_mtu, ip_rt_min_pmtu);
1054 }
1055
1056 if (rt->rt_pmtu == mtu && !lock &&
1057 time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
1058 return;
1059
1060 rcu_read_lock();
1061 if (fib_lookup(net, fl4, &res, 0) == 0) {
1062 struct fib_nh_common *nhc;
1063
1064 fib_select_path(net, &res, fl4, NULL);
1065 nhc = FIB_RES_NHC(res);
1066 update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
1067 jiffies + ip_rt_mtu_expires);
1068 }
1069 rcu_read_unlock();
1070 }
1071
1072 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1073 struct sk_buff *skb, u32 mtu,
1074 bool confirm_neigh)
1075 {
1076 struct rtable *rt = (struct rtable *) dst;
1077 struct flowi4 fl4;
1078
1079 ip_rt_build_flow_key(&fl4, sk, skb);
1080
1081 /* Don't make lookup fail for bridged encapsulations */
1082 if (skb && netif_is_any_bridge_port(skb->dev))
1083 fl4.flowi4_oif = 0;
1084
1085 __ip_rt_update_pmtu(rt, &fl4, mtu);
1086 }
1087
1088 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1089 int oif, u8 protocol)
1090 {
1091 const struct iphdr *iph = (const struct iphdr *)skb->data;
1092 struct flowi4 fl4;
1093 struct rtable *rt;
1094 u32 mark = IP4_REPLY_MARK(net, skb->mark);
1095
1096 __build_flow_key(net, &fl4, NULL, iph, oif,
1097 RT_TOS(iph->tos), protocol, mark, 0);
1098 rt = __ip_route_output_key(net, &fl4);
1099 if (!IS_ERR(rt)) {
1100 __ip_rt_update_pmtu(rt, &fl4, mtu);
1101 ip_rt_put(rt);
1102 }
1103 }
1104 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1105
1106 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1107 {
1108 const struct iphdr *iph = (const struct iphdr *)skb->data;
1109 struct flowi4 fl4;
1110 struct rtable *rt;
1111
1112 __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1113
1114 if (!fl4.flowi4_mark)
1115 fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1116
1117 rt = __ip_route_output_key(sock_net(sk), &fl4);
1118 if (!IS_ERR(rt)) {
1119 __ip_rt_update_pmtu(rt, &fl4, mtu);
1120 ip_rt_put(rt);
1121 }
1122 }
1123
1124 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1125 {
1126 const struct iphdr *iph = (const struct iphdr *)skb->data;
1127 struct flowi4 fl4;
1128 struct rtable *rt;
1129 struct dst_entry *odst = NULL;
1130 bool new = false;
1131 struct net *net = sock_net(sk);
1132
1133 bh_lock_sock(sk);
1134
1135 if (!ip_sk_accept_pmtu(sk))
1136 goto out;
1137
1138 odst = sk_dst_get(sk);
1139
1140 if (sock_owned_by_user(sk) || !odst) {
1141 __ipv4_sk_update_pmtu(skb, sk, mtu);
1142 goto out;
1143 }
1144
1145 __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1146
1147 rt = (struct rtable *)odst;
1148 if (odst->obsolete && !odst->ops->check(odst, 0)) {
1149 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1150 if (IS_ERR(rt))
1151 goto out;
1152
1153 new = true;
1154 }
1155
1156 __ip_rt_update_pmtu((struct rtable *)xfrm_dst_path(&rt->dst), &fl4, mtu);
1157
1158 if (!dst_check(&rt->dst, 0)) {
1159 if (new)
1160 dst_release(&rt->dst);
1161
1162 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1163 if (IS_ERR(rt))
1164 goto out;
1165
1166 new = true;
1167 }
1168
1169 if (new)
1170 sk_dst_set(sk, &rt->dst);
1171
1172 out:
1173 bh_unlock_sock(sk);
1174 dst_release(odst);
1175 }
1176 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1177
1178 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1179 int oif, u8 protocol)
1180 {
1181 const struct iphdr *iph = (const struct iphdr *)skb->data;
1182 struct flowi4 fl4;
1183 struct rtable *rt;
1184
1185 __build_flow_key(net, &fl4, NULL, iph, oif,
1186 RT_TOS(iph->tos), protocol, 0, 0);
1187 rt = __ip_route_output_key(net, &fl4);
1188 if (!IS_ERR(rt)) {
1189 __ip_do_redirect(rt, skb, &fl4, false);
1190 ip_rt_put(rt);
1191 }
1192 }
1193 EXPORT_SYMBOL_GPL(ipv4_redirect);
1194
1195 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1196 {
1197 const struct iphdr *iph = (const struct iphdr *)skb->data;
1198 struct flowi4 fl4;
1199 struct rtable *rt;
1200 struct net *net = sock_net(sk);
1201
1202 __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1203 rt = __ip_route_output_key(net, &fl4);
1204 if (!IS_ERR(rt)) {
1205 __ip_do_redirect(rt, skb, &fl4, false);
1206 ip_rt_put(rt);
1207 }
1208 }
1209 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1210
1211 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1212 {
1213 struct rtable *rt = (struct rtable *) dst;
1214
1215 /* All IPV4 dsts are created with ->obsolete set to the value
1216 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1217 * into this function always.
1218 *
1219 * When a PMTU/redirect information update invalidates a route,
1220 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1221 * DST_OBSOLETE_DEAD.
1222 */
1223 if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1224 return NULL;
1225 return dst;
1226 }
1227
1228 static void ipv4_send_dest_unreach(struct sk_buff *skb)
1229 {
1230 struct ip_options opt;
1231 int res;
1232
1233 /* Recompile ip options since IPCB may not be valid anymore.
1234 * Also check we have a reasonable ipv4 header.
1235 */
1236 if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
1237 ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
1238 return;
1239
1240 memset(&opt, 0, sizeof(opt));
1241 if (ip_hdr(skb)->ihl > 5) {
1242 if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
1243 return;
1244 opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);
1245
1246 rcu_read_lock();
1247 res = __ip_options_compile(dev_net(skb->dev), &opt, skb, NULL);
1248 rcu_read_unlock();
1249
1250 if (res)
1251 return;
1252 }
1253 __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
1254 }
1255
1256 static void ipv4_link_failure(struct sk_buff *skb)
1257 {
1258 struct rtable *rt;
1259
1260 ipv4_send_dest_unreach(skb);
1261
1262 rt = skb_rtable(skb);
1263 if (rt)
1264 dst_set_expires(&rt->dst, 0);
1265 }
1266
1267 static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1268 {
1269 pr_debug("%s: %pI4 -> %pI4, %s\n",
1270 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1271 skb->dev ? skb->dev->name : "?");
1272 kfree_skb(skb);
1273 WARN_ON(1);
1274 return 0;
1275 }
1276
1277 /*
1278 We do not cache source address of outgoing interface,
1279 because it is used only by IP RR, TS and SRR options,
1280 so that it is out of the fast path.
1281
1282 BTW remember: "addr" is allowed to be not aligned
1283 in IP options!
1284 */
1285
1286 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1287 {
1288 __be32 src;
1289
1290 if (rt_is_output_route(rt))
1291 src = ip_hdr(skb)->saddr;
1292 else {
1293 struct fib_result res;
1294 struct iphdr *iph = ip_hdr(skb);
1295 struct flowi4 fl4 = {
1296 .daddr = iph->daddr,
1297 .saddr = iph->saddr,
1298 .flowi4_tos = RT_TOS(iph->tos),
1299 .flowi4_oif = rt->dst.dev->ifindex,
1300 .flowi4_iif = skb->dev->ifindex,
1301 .flowi4_mark = skb->mark,
1302 };
1303
1304 rcu_read_lock();
1305 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1306 src = fib_result_prefsrc(dev_net(rt->dst.dev), &res);
1307 else
1308 src = inet_select_addr(rt->dst.dev,
1309 rt_nexthop(rt, iph->daddr),
1310 RT_SCOPE_UNIVERSE);
1311 rcu_read_unlock();
1312 }
1313 memcpy(addr, &src, 4);
1314 }
1315
1316 #ifdef CONFIG_IP_ROUTE_CLASSID
1317 static void set_class_tag(struct rtable *rt, u32 tag)
1318 {
1319 if (!(rt->dst.tclassid & 0xFFFF))
1320 rt->dst.tclassid |= tag & 0xFFFF;
1321 if (!(rt->dst.tclassid & 0xFFFF0000))
1322 rt->dst.tclassid |= tag & 0xFFFF0000;
1323 }
1324 #endif
1325
1326 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1327 {
1328 unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
1329 unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
1330 ip_rt_min_advmss);
1331
1332 return min(advmss, IPV4_MAX_PMTU - header_size);
1333 }
1334
1335 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1336 {
1337 const struct rtable *rt = (const struct rtable *)dst;
1338 unsigned int mtu = rt->rt_pmtu;
1339
1340 if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1341 mtu = dst_metric_raw(dst, RTAX_MTU);
1342
1343 if (mtu)
1344 goto out;
1345
1346 mtu = READ_ONCE(dst->dev->mtu);
1347
1348 if (unlikely(ip_mtu_locked(dst))) {
1349 if (rt->rt_uses_gateway && mtu > 576)
1350 mtu = 576;
1351 }
1352
1353 out:
1354 mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
1355
1356 return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1357 }
1358
1359 static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr)
1360 {
1361 struct fnhe_hash_bucket *hash;
1362 struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1363 u32 hval = fnhe_hashfun(daddr);
1364
1365 spin_lock_bh(&fnhe_lock);
1366
1367 hash = rcu_dereference_protected(nhc->nhc_exceptions,
1368 lockdep_is_held(&fnhe_lock));
1369 hash += hval;
1370
1371 fnhe_p = &hash->chain;
1372 fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1373 while (fnhe) {
1374 if (fnhe->fnhe_daddr == daddr) {
1375 rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1376 fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1377 /* set fnhe_daddr to 0 to ensure it won't bind with
1378 * new dsts in rt_bind_exception().
1379 */
1380 fnhe->fnhe_daddr = 0;
1381 fnhe_flush_routes(fnhe);
1382 kfree_rcu(fnhe, rcu);
1383 break;
1384 }
1385 fnhe_p = &fnhe->fnhe_next;
1386 fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1387 lockdep_is_held(&fnhe_lock));
1388 }
1389
1390 spin_unlock_bh(&fnhe_lock);
1391 }
1392
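/* Look up the exception for a destination on a nexthop; expired entries
 * are deleted on the way. Called under rcu_read_lock().
 */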
1393 static struct fib_nh_exception *find_exception(struct fib_nh_common *nhc,
1394 __be32 daddr)
1395 {
1396 struct fnhe_hash_bucket *hash = rcu_dereference(nhc->nhc_exceptions);
1397 struct fib_nh_exception *fnhe;
1398 u32 hval;
1399
1400 if (!hash)
1401 return NULL;
1402
1403 hval = fnhe_hashfun(daddr);
1404
1405 for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1406 fnhe = rcu_dereference(fnhe->fnhe_next)) {
1407 if (fnhe->fnhe_daddr == daddr) {
1408 if (fnhe->fnhe_expires &&
1409 time_after(jiffies, fnhe->fnhe_expires)) {
1410 ip_del_fnhe(nhc, daddr);
1411 break;
1412 }
1413 return fnhe;
1414 }
1415 }
1416 return NULL;
1417 }
1418
1419 /* MTU selection:
1420 * 1. mtu on route is locked - use it
1421 * 2. mtu from nexthop exception
1422 * 3. mtu from egress device
1423 */
1424
1425 u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
1426 {
1427 struct fib_nh_common *nhc = res->nhc;
1428 struct net_device *dev = nhc->nhc_dev;
1429 struct fib_info *fi = res->fi;
1430 u32 mtu = 0;
1431
1432 if (dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu ||
1433 fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
1434 mtu = fi->fib_mtu;
1435
1436 if (likely(!mtu)) {
1437 struct fib_nh_exception *fnhe;
1438
1439 fnhe = find_exception(nhc, daddr);
1440 if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
1441 mtu = fnhe->fnhe_pmtu;
1442 }
1443
1444 if (likely(!mtu))
1445 mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);
1446
1447 return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu);
1448 }
1449
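/* Copy an exception's state (gateway, PMTU, expiry) into a route and,
 * when do_cache is true, store the route in the exception so later
 * lookups can reuse it. Returns true if the route was cached.
 */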
1450 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1451 __be32 daddr, const bool do_cache)
1452 {
1453 bool ret = false;
1454
1455 spin_lock_bh(&fnhe_lock);
1456
1457 if (daddr == fnhe->fnhe_daddr) {
1458 struct rtable __rcu **porig;
1459 struct rtable *orig;
1460 int genid = fnhe_genid(dev_net(rt->dst.dev));
1461
1462 if (rt_is_input_route(rt))
1463 porig = &fnhe->fnhe_rth_input;
1464 else
1465 porig = &fnhe->fnhe_rth_output;
1466 orig = rcu_dereference(*porig);
1467
1468 if (fnhe->fnhe_genid != genid) {
1469 fnhe->fnhe_genid = genid;
1470 fnhe->fnhe_gw = 0;
1471 fnhe->fnhe_pmtu = 0;
1472 fnhe->fnhe_expires = 0;
1473 fnhe->fnhe_mtu_locked = false;
1474 fnhe_flush_routes(fnhe);
1475 orig = NULL;
1476 }
1477 fill_route_from_fnhe(rt, fnhe);
1478 if (!rt->rt_gw4) {
1479 rt->rt_gw4 = daddr;
1480 rt->rt_gw_family = AF_INET;
1481 }
1482
1483 if (do_cache) {
1484 dst_hold(&rt->dst);
1485 rcu_assign_pointer(*porig, rt);
1486 if (orig) {
1487 dst_dev_put(&orig->dst);
1488 dst_release(&orig->dst);
1489 }
1490 ret = true;
1491 }
1492
1493 fnhe->fnhe_stamp = jiffies;
1494 }
1495 spin_unlock_bh(&fnhe_lock);
1496
1497 return ret;
1498 }
1499
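/* Cache a route on the nexthop: input routes in nhc_rth_input, output
 * routes in the per-CPU nhc_pcpu_rth_output slot. Returns false if we
 * lost the cmpxchg race and the route was not cached.
 */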
1500 static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt)
1501 {
1502 struct rtable *orig, *prev, **p;
1503 bool ret = true;
1504
1505 if (rt_is_input_route(rt)) {
1506 p = (struct rtable **)&nhc->nhc_rth_input;
1507 } else {
1508 p = (struct rtable **)raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
1509 }
1510 orig = *p;
1511
1512 /* hold dst before doing cmpxchg() to avoid race condition
1513 * on this dst
1514 */
1515 dst_hold(&rt->dst);
1516 prev = cmpxchg(p, orig, rt);
1517 if (prev == orig) {
1518 if (orig) {
1519 rt_add_uncached_list(orig);
1520 dst_release(&orig->dst);
1521 }
1522 } else {
1523 dst_release(&rt->dst);
1524 ret = false;
1525 }
1526
1527 return ret;
1528 }
1529
1530 struct uncached_list {
1531 spinlock_t lock;
1532 struct list_head head;
1533 };
1534
1535 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1536
1537 void rt_add_uncached_list(struct rtable *rt)
1538 {
1539 struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1540
1541 rt->rt_uncached_list = ul;
1542
1543 spin_lock_bh(&ul->lock);
1544 list_add_tail(&rt->rt_uncached, &ul->head);
1545 spin_unlock_bh(&ul->lock);
1546 }
1547
1548 void rt_del_uncached_list(struct rtable *rt)
1549 {
1550 if (!list_empty(&rt->rt_uncached)) {
1551 struct uncached_list *ul = rt->rt_uncached_list;
1552
1553 spin_lock_bh(&ul->lock);
1554 list_del(&rt->rt_uncached);
1555 spin_unlock_bh(&ul->lock);
1556 }
1557 }
1558
1559 static void ipv4_dst_destroy(struct dst_entry *dst)
1560 {
1561 struct rtable *rt = (struct rtable *)dst;
1562
1563 ip_dst_metrics_put(dst);
1564 rt_del_uncached_list(rt);
1565 }
1566
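/* A device is going away: repoint uncached routes that still reference
 * it at blackhole_netdev so the real device can be released.
 */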
1567 void rt_flush_dev(struct net_device *dev)
1568 {
1569 struct rtable *rt;
1570 int cpu;
1571
1572 for_each_possible_cpu(cpu) {
1573 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1574
1575 spin_lock_bh(&ul->lock);
1576 list_for_each_entry(rt, &ul->head, rt_uncached) {
1577 if (rt->dst.dev != dev)
1578 continue;
1579 rt->dst.dev = blackhole_netdev;
1580 dev_hold(rt->dst.dev);
1581 dev_put(dev);
1582 }
1583 spin_unlock_bh(&ul->lock);
1584 }
1585 }
1586
1587 static bool rt_cache_valid(const struct rtable *rt)
1588 {
1589 return rt &&
1590 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1591 !rt_is_expired(rt);
1592 }
1593
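/* Fill a new route from the FIB result: gateway, metrics, classid and
 * lwtunnel state, then cache it on the nexthop (or nexthop exception).
 * Routes that cannot be cached go on the per-CPU uncached list.
 */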
1594 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1595 const struct fib_result *res,
1596 struct fib_nh_exception *fnhe,
1597 struct fib_info *fi, u16 type, u32 itag,
1598 const bool do_cache)
1599 {
1600 bool cached = false;
1601
1602 if (fi) {
1603 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1604
1605 if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) {
1606 rt->rt_uses_gateway = 1;
1607 rt->rt_gw_family = nhc->nhc_gw_family;
1608 /* only INET and INET6 are supported */
1609 if (likely(nhc->nhc_gw_family == AF_INET))
1610 rt->rt_gw4 = nhc->nhc_gw.ipv4;
1611 else
1612 rt->rt_gw6 = nhc->nhc_gw.ipv6;
1613 }
1614
1615 ip_dst_init_metrics(&rt->dst, fi->fib_metrics);
1616
1617 #ifdef CONFIG_IP_ROUTE_CLASSID
1618 if (nhc->nhc_family == AF_INET) {
1619 struct fib_nh *nh;
1620
1621 nh = container_of(nhc, struct fib_nh, nh_common);
1622 rt->dst.tclassid = nh->nh_tclassid;
1623 }
1624 #endif
1625 rt->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
1626 if (unlikely(fnhe))
1627 cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1628 else if (do_cache)
1629 cached = rt_cache_route(nhc, rt);
1630 if (unlikely(!cached)) {
1631 /* Routes we intend to cache in nexthop exception or
1632 * FIB nexthop have the DST_NOCACHE bit clear.
1633 * However, if we are unsuccessful at storing this
1634 * route into the cache we really need to set it.
1635 */
1636 if (!rt->rt_gw4) {
1637 rt->rt_gw_family = AF_INET;
1638 rt->rt_gw4 = daddr;
1639 }
1640 rt_add_uncached_list(rt);
1641 }
1642 } else
1643 rt_add_uncached_list(rt);
1644
1645 #ifdef CONFIG_IP_ROUTE_CLASSID
1646 #ifdef CONFIG_IP_MULTIPLE_TABLES
1647 set_class_tag(rt, res->tclassid);
1648 #endif
1649 set_class_tag(rt, itag);
1650 #endif
1651 }
1652
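/* Allocate a dst for an IPv4 route and initialise the generic fields;
 * callers fill in the routing-specific parts (gateway, input hook, ...)
 * afterwards.
 */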
1653 struct rtable *rt_dst_alloc(struct net_device *dev,
1654 unsigned int flags, u16 type,
1655 bool nopolicy, bool noxfrm)
1656 {
1657 struct rtable *rt;
1658
1659 rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1660 (nopolicy ? DST_NOPOLICY : 0) |
1661 (noxfrm ? DST_NOXFRM : 0));
1662
1663 if (rt) {
1664 rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1665 rt->rt_flags = flags;
1666 rt->rt_type = type;
1667 rt->rt_is_input = 0;
1668 rt->rt_iif = 0;
1669 rt->rt_pmtu = 0;
1670 rt->rt_mtu_locked = 0;
1671 rt->rt_uses_gateway = 0;
1672 rt->rt_gw_family = 0;
1673 rt->rt_gw4 = 0;
1674 INIT_LIST_HEAD(&rt->rt_uncached);
1675
1676 rt->dst.output = ip_output;
1677 if (flags & RTCF_LOCAL)
1678 rt->dst.input = ip_local_deliver;
1679 }
1680
1681 return rt;
1682 }
1683 EXPORT_SYMBOL(rt_dst_alloc);
1684
1685 struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt)
1686 {
1687 struct rtable *new_rt;
1688
1689 new_rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1690 rt->dst.flags);
1691
1692 if (new_rt) {
1693 new_rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1694 new_rt->rt_flags = rt->rt_flags;
1695 new_rt->rt_type = rt->rt_type;
1696 new_rt->rt_is_input = rt->rt_is_input;
1697 new_rt->rt_iif = rt->rt_iif;
1698 new_rt->rt_pmtu = rt->rt_pmtu;
1699 new_rt->rt_mtu_locked = rt->rt_mtu_locked;
1700 new_rt->rt_gw_family = rt->rt_gw_family;
1701 if (rt->rt_gw_family == AF_INET)
1702 new_rt->rt_gw4 = rt->rt_gw4;
1703 else if (rt->rt_gw_family == AF_INET6)
1704 new_rt->rt_gw6 = rt->rt_gw6;
1705 INIT_LIST_HEAD(&new_rt->rt_uncached);
1706
1707 new_rt->dst.input = rt->dst.input;
1708 new_rt->dst.output = rt->dst.output;
1709 new_rt->dst.error = rt->dst.error;
1710 new_rt->dst.lastuse = jiffies;
1711 new_rt->dst.lwtstate = lwtstate_get(rt->dst.lwtstate);
1712 }
1713 return new_rt;
1714 }
1715 EXPORT_SYMBOL(rt_dst_clone);
1716
1717 /* called in rcu_read_lock() section */
1718 int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1719 u8 tos, struct net_device *dev,
1720 struct in_device *in_dev, u32 *itag)
1721 {
1722 int err;
1723
1724 /* Primary sanity checks. */
1725 if (!in_dev)
1726 return -EINVAL;
1727
1728 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1729 skb->protocol != htons(ETH_P_IP))
1730 return -EINVAL;
1731
1732 if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1733 return -EINVAL;
1734
1735 if (ipv4_is_zeronet(saddr)) {
1736 if (!ipv4_is_local_multicast(daddr) &&
1737 ip_hdr(skb)->protocol != IPPROTO_IGMP)
1738 return -EINVAL;
1739 } else {
1740 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1741 in_dev, itag);
1742 if (err < 0)
1743 return err;
1744 }
1745 return 0;
1746 }
1747
1748 /* called in rcu_read_lock() section */
1749 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1750 u8 tos, struct net_device *dev, int our)
1751 {
1752 struct in_device *in_dev = __in_dev_get_rcu(dev);
1753 unsigned int flags = RTCF_MULTICAST;
1754 struct rtable *rth;
1755 u32 itag = 0;
1756 int err;
1757
1758 err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1759 if (err)
1760 return err;
1761
1762 if (our)
1763 flags |= RTCF_LOCAL;
1764
1765 rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1766 IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
1767 if (!rth)
1768 return -ENOBUFS;
1769
1770 #ifdef CONFIG_IP_ROUTE_CLASSID
1771 rth->dst.tclassid = itag;
1772 #endif
1773 rth->dst.output = ip_rt_bug;
1774 rth->rt_is_input = 1;
1775
1776 #ifdef CONFIG_IP_MROUTE
1777 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1778 rth->dst.input = ip_mr_input;
1779 #endif
1780 RT_CACHE_STAT_INC(in_slow_mc);
1781
1782 skb_dst_set(skb, &rth->dst);
1783 return 0;
1784 }
1785
1786
1787 static void ip_handle_martian_source(struct net_device *dev,
1788 struct in_device *in_dev,
1789 struct sk_buff *skb,
1790 __be32 daddr,
1791 __be32 saddr)
1792 {
1793 RT_CACHE_STAT_INC(in_martian_src);
1794 #ifdef CONFIG_IP_ROUTE_VERBOSE
1795 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1796 /*
1797 * RFC1812 recommendation: if the source is martian,
1798 * the only hint is the MAC header.
1799 */
1800 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1801 &daddr, &saddr, dev->name);
1802 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1803 print_hex_dump(KERN_WARNING, "ll header: ",
1804 DUMP_PREFIX_OFFSET, 16, 1,
1805 skb_mac_header(skb),
1806 dev->hard_header_len, false);
1807 }
1808 }
1809 #endif
1810 }
1811
1812 /* called in rcu_read_lock() section */
1813 static int __mkroute_input(struct sk_buff *skb,
1814 const struct fib_result *res,
1815 struct in_device *in_dev,
1816 __be32 daddr, __be32 saddr, u32 tos)
1817 {
1818 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1819 struct net_device *dev = nhc->nhc_dev;
1820 struct fib_nh_exception *fnhe;
1821 struct rtable *rth;
1822 int err;
1823 struct in_device *out_dev;
1824 bool do_cache;
1825 u32 itag = 0;
1826
1827 /* get a working reference to the output device */
1828 out_dev = __in_dev_get_rcu(dev);
1829 if (!out_dev) {
1830 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1831 return -EINVAL;
1832 }
1833
1834 err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1835 in_dev->dev, in_dev, &itag);
1836 if (err < 0) {
1837 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1838 saddr);
1839
1840 goto cleanup;
1841 }
1842
1843 do_cache = res->fi && !itag;
1844 if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1845 skb->protocol == htons(ETH_P_IP)) {
1846 __be32 gw;
1847
1848 gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0;
1849 if (IN_DEV_SHARED_MEDIA(out_dev) ||
1850 inet_addr_onlink(out_dev, saddr, gw))
1851 IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1852 }
1853
1854 if (skb->protocol != htons(ETH_P_IP)) {
1855 /* Not IP (i.e. ARP). Do not create route, if it is
1856 * invalid for proxy arp. DNAT routes are always valid.
1857 *
1858 * The proxy arp feature has been extended to allow ARP
1859 * replies back to the same interface, to support
1860 * Private VLAN switch technologies. See arp.c.
1861 */
1862 if (out_dev == in_dev &&
1863 IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1864 err = -EINVAL;
1865 goto cleanup;
1866 }
1867 }
1868
1869 fnhe = find_exception(nhc, daddr);
1870 if (do_cache) {
1871 if (fnhe)
1872 rth = rcu_dereference(fnhe->fnhe_rth_input);
1873 else
1874 rth = rcu_dereference(nhc->nhc_rth_input);
1875 if (rt_cache_valid(rth)) {
1876 skb_dst_set_noref(skb, &rth->dst);
1877 goto out;
1878 }
1879 }
1880
1881 rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1882 IN_DEV_CONF_GET(in_dev, NOPOLICY),
1883 IN_DEV_CONF_GET(out_dev, NOXFRM));
1884 if (!rth) {
1885 err = -ENOBUFS;
1886 goto cleanup;
1887 }
1888
1889 rth->rt_is_input = 1;
1890 RT_CACHE_STAT_INC(in_slow_tot);
1891
1892 rth->dst.input = ip_forward;
1893
1894 rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1895 do_cache);
1896 lwtunnel_set_redirect(&rth->dst);
1897 skb_dst_set(skb, &rth->dst);
1898 out:
1899 err = 0;
1900 cleanup:
1901 return err;
1902 }
1903
1904 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1905 /* To make ICMP packets follow the right flow, the multipath hash is
1906 * calculated from the inner IP addresses.
1907 */
1908 static void ip_multipath_l3_keys(const struct sk_buff *skb,
1909 struct flow_keys *hash_keys)
1910 {
1911 const struct iphdr *outer_iph = ip_hdr(skb);
1912 const struct iphdr *key_iph = outer_iph;
1913 const struct iphdr *inner_iph;
1914 const struct icmphdr *icmph;
1915 struct iphdr _inner_iph;
1916 struct icmphdr _icmph;
1917
1918 if (likely(outer_iph->protocol != IPPROTO_ICMP))
1919 goto out;
1920
1921 if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1922 goto out;
1923
1924 icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1925 &_icmph);
1926 if (!icmph)
1927 goto out;
1928
1929 if (!icmp_is_err(icmph->type))
1930 goto out;
1931
1932 inner_iph = skb_header_pointer(skb,
1933 outer_iph->ihl * 4 + sizeof(_icmph),
1934 sizeof(_inner_iph), &_inner_iph);
1935 if (!inner_iph)
1936 goto out;
1937
1938 key_iph = inner_iph;
1939 out:
1940 hash_keys->addrs.v4addrs.src = key_iph->saddr;
1941 hash_keys->addrs.v4addrs.dst = key_iph->daddr;
1942 }
1943
1944 /* if skb is set it will be used and fl4 can be NULL */
1945 int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
1946 const struct sk_buff *skb, struct flow_keys *flkeys)
1947 {
1948 u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0;
1949 struct flow_keys hash_keys;
1950 u32 mhash;
1951
1952 switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1953 case 0:
1954 memset(&hash_keys, 0, sizeof(hash_keys));
1955 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1956 if (skb) {
1957 ip_multipath_l3_keys(skb, &hash_keys);
1958 } else {
1959 hash_keys.addrs.v4addrs.src = fl4->saddr;
1960 hash_keys.addrs.v4addrs.dst = fl4->daddr;
1961 }
1962 break;
1963 case 1:
1964 /* skb is currently provided only when forwarding */
1965 if (skb) {
1966 unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1967 struct flow_keys keys;
1968
1969 /* short-circuit if we already have L4 hash present */
1970 if (skb->l4_hash)
1971 return skb_get_hash_raw(skb) >> 1;
1972
1973 memset(&hash_keys, 0, sizeof(hash_keys));
1974
1975 if (!flkeys) {
1976 skb_flow_dissect_flow_keys(skb, &keys, flag);
1977 flkeys = &keys;
1978 }
1979
1980 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1981 hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
1982 hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
1983 hash_keys.ports.src = flkeys->ports.src;
1984 hash_keys.ports.dst = flkeys->ports.dst;
1985 hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
1986 } else {
1987 memset(&hash_keys, 0, sizeof(hash_keys));
1988 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1989 hash_keys.addrs.v4addrs.src = fl4->saddr;
1990 hash_keys.addrs.v4addrs.dst = fl4->daddr;
1991 hash_keys.ports.src = fl4->fl4_sport;
1992 hash_keys.ports.dst = fl4->fl4_dport;
1993 hash_keys.basic.ip_proto = fl4->flowi4_proto;
1994 }
1995 break;
1996 case 2:
1997 memset(&hash_keys, 0, sizeof(hash_keys));
1998 /* skb is currently provided only when forwarding */
1999 if (skb) {
2000 struct flow_keys keys;
2001
2002 skb_flow_dissect_flow_keys(skb, &keys, 0);
2003 /* Inner can be v4 or v6 */
2004 if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
2005 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2006 hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
2007 hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
2008 } else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
2009 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2010 hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
2011 hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
2012 hash_keys.tags.flow_label = keys.tags.flow_label;
2013 hash_keys.basic.ip_proto = keys.basic.ip_proto;
2014 } else {
2015 /* Same as case 0 */
2016 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2017 ip_multipath_l3_keys(skb, &hash_keys);
2018 }
2019 } else {
2020 /* Same as case 0 */
2021 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2022 hash_keys.addrs.v4addrs.src = fl4->saddr;
2023 hash_keys.addrs.v4addrs.dst = fl4->daddr;
2024 }
2025 break;
2026 }
2027 mhash = flow_hash_from_keys(&hash_keys);
2028
2029 if (multipath_hash)
2030 mhash = jhash_2words(mhash, multipath_hash, 0);
2031
2032 return mhash >> 1;
2033 }
2034 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
2035
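/* Pick the nexthop for an input route (hashing across the available
 * paths when CONFIG_IP_ROUTE_MULTIPATH is enabled) and then build the
 * route cache entry via __mkroute_input().
 */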
2036 static int ip_mkroute_input(struct sk_buff *skb,
2037 struct fib_result *res,
2038 struct in_device *in_dev,
2039 __be32 daddr, __be32 saddr, u32 tos,
2040 struct flow_keys *hkeys)
2041 {
2042 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2043 if (res->fi && fib_info_num_path(res->fi) > 1) {
2044 int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
2045
2046 fib_select_multipath(res, h);
2047 }
2048 #endif
2049
2050 /* create a routing cache entry */
2051 return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
2052 }
2053
2054 /* Implements the same saddr-related checks as ip_route_input_slow(),
2055 * assuming daddr is valid and the destination is not a local broadcast one.
2056 * Uses the provided hint instead of performing a route lookup.
2057 */
2058 int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2059 u8 tos, struct net_device *dev,
2060 const struct sk_buff *hint)
2061 {
2062 struct in_device *in_dev = __in_dev_get_rcu(dev);
2063 struct rtable *rt = skb_rtable(hint);
2064 struct net *net = dev_net(dev);
2065 int err = -EINVAL;
2066 u32 tag = 0;
2067
2068 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2069 goto martian_source;
2070
2071 if (ipv4_is_zeronet(saddr))
2072 goto martian_source;
2073
2074 if (ipv4_is_loopback(saddr) && !IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2075 goto martian_source;
2076
2077 if (rt->rt_type != RTN_LOCAL)
2078 goto skip_validate_source;
2079
2080 tos &= IPTOS_RT_MASK;
2081 err = fib_validate_source(skb, saddr, daddr, tos, 0, dev, in_dev, &tag);
2082 if (err < 0)
2083 goto martian_source;
2084
2085 skip_validate_source:
2086 skb_dst_copy(skb, hint);
2087 return 0;
2088
2089 martian_source:
2090 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2091 return err;
2092 }
2093
2094 /* get device for dst_alloc with local routes */
2095 static struct net_device *ip_rt_get_dev(struct net *net,
2096 const struct fib_result *res)
2097 {
2098 struct fib_nh_common *nhc = res->fi ? res->nhc : NULL;
2099 struct net_device *dev = NULL;
2100
2101 if (nhc)
2102 dev = l3mdev_master_dev_rcu(nhc->nhc_dev);
2103
2104 return dev ? : net->loopback_dev;
2105 }
2106
2107 /*
2108 * NOTE. We drop all packets that have local source
2109 * addresses, because every properly looped-back packet
2110 * must already have the correct destination attached by the output routine.
2111 * Changes in the enforced policies must also be applied to
2112 * ip_route_use_hint().
2113 *
2114 * This approach solves two big problems:
2115 * 1. Non-simplex devices are handled properly.
2116 * 2. IP spoofing attempts are filtered with a 100% guarantee.
2117 * called with rcu_read_lock()
2118 */
2119
2120 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2121 u8 tos, struct net_device *dev,
2122 struct fib_result *res)
2123 {
2124 struct in_device *in_dev = __in_dev_get_rcu(dev);
2125 struct flow_keys *flkeys = NULL, _flkeys;
2126 struct net *net = dev_net(dev);
2127 struct ip_tunnel_info *tun_info;
2128 int err = -EINVAL;
2129 unsigned int flags = 0;
2130 u32 itag = 0;
2131 struct rtable *rth;
2132 struct flowi4 fl4;
2133 bool do_cache = true;
2134
2135 /* IP on this device is disabled. */
2136
2137 if (!in_dev)
2138 goto out;
2139
2140 /* Check for the weirdest martians, which cannot be detected
2141 by fib_lookup.
2142 */
2143
2144 tun_info = skb_tunnel_info(skb);
2145 if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2146 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
2147 else
2148 fl4.flowi4_tun_key.tun_id = 0;
2149 skb_dst_drop(skb);
2150
2151 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2152 goto martian_source;
2153
2154 res->fi = NULL;
2155 res->table = NULL;
2156 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2157 goto brd_input;
2158
2159 /* Accept zero addresses only for limited broadcast;
2160 * I do not even know whether to fix this or not. Waiting for complaints :-)
2161 */
2162 if (ipv4_is_zeronet(saddr))
2163 goto martian_source;
2164
2165 if (ipv4_is_zeronet(daddr))
2166 goto martian_destination;
2167
2168 /* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
2169 * calling it at most once when daddr and/or saddr is a loopback address.
2170 */
2171 if (ipv4_is_loopback(daddr)) {
2172 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2173 goto martian_destination;
2174 } else if (ipv4_is_loopback(saddr)) {
2175 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2176 goto martian_source;
2177 }
2178
2179 /*
2180 * Now we are ready to route packet.
2181 */
2182 fl4.flowi4_oif = 0;
2183 fl4.flowi4_iif = dev->ifindex;
2184 fl4.flowi4_mark = skb->mark;
2185 fl4.flowi4_tos = tos;
2186 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2187 fl4.flowi4_flags = 0;
2188 fl4.daddr = daddr;
2189 fl4.saddr = saddr;
2190 fl4.flowi4_uid = sock_net_uid(net, NULL);
2191 fl4.flowi4_multipath_hash = 0;
2192
2193 if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
2194 flkeys = &_flkeys;
2195 } else {
2196 fl4.flowi4_proto = 0;
2197 fl4.fl4_sport = 0;
2198 fl4.fl4_dport = 0;
2199 }
2200
2201 err = fib_lookup(net, &fl4, res, 0);
2202 if (err != 0) {
2203 if (!IN_DEV_FORWARD(in_dev))
2204 err = -EHOSTUNREACH;
2205 goto no_route;
2206 }
2207
2208 if (res->type == RTN_BROADCAST) {
2209 if (IN_DEV_BFORWARD(in_dev))
2210 goto make_route;
2211 /* do not cache if bc_forwarding is enabled */
2212 if (IPV4_DEVCONF_ALL(net, BC_FORWARDING))
2213 do_cache = false;
2214 goto brd_input;
2215 }
2216
2217 if (res->type == RTN_LOCAL) {
2218 err = fib_validate_source(skb, saddr, daddr, tos,
2219 0, dev, in_dev, &itag);
2220 if (err < 0)
2221 goto martian_source;
2222 goto local_input;
2223 }
2224
2225 if (!IN_DEV_FORWARD(in_dev)) {
2226 err = -EHOSTUNREACH;
2227 goto no_route;
2228 }
2229 if (res->type != RTN_UNICAST)
2230 goto martian_destination;
2231
2232 make_route:
2233 err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
2234 out: return err;
2235
2236 brd_input:
2237 if (skb->protocol != htons(ETH_P_IP))
2238 goto e_inval;
2239
2240 if (!ipv4_is_zeronet(saddr)) {
2241 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2242 in_dev, &itag);
2243 if (err < 0)
2244 goto martian_source;
2245 }
2246 flags |= RTCF_BROADCAST;
2247 res->type = RTN_BROADCAST;
2248 RT_CACHE_STAT_INC(in_brd);
2249
2250 local_input:
2251 do_cache &= res->fi && !itag;
2252 if (do_cache) {
2253 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2254
2255 rth = rcu_dereference(nhc->nhc_rth_input);
2256 if (rt_cache_valid(rth)) {
2257 skb_dst_set_noref(skb, &rth->dst);
2258 err = 0;
2259 goto out;
2260 }
2261 }
2262
2263 rth = rt_dst_alloc(ip_rt_get_dev(net, res),
2264 flags | RTCF_LOCAL, res->type,
2265 IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2266 if (!rth)
2267 goto e_nobufs;
2268
2269 rth->dst.output = ip_rt_bug;
2270 #ifdef CONFIG_IP_ROUTE_CLASSID
2271 rth->dst.tclassid = itag;
2272 #endif
2273 rth->rt_is_input = 1;
2274
2275 RT_CACHE_STAT_INC(in_slow_tot);
2276 if (res->type == RTN_UNREACHABLE) {
2277 rth->dst.input = ip_error;
2278 rth->dst.error = -err;
2279 rth->rt_flags &= ~RTCF_LOCAL;
2280 }
2281
2282 if (do_cache) {
2283 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2284
2285 rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
2286 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2287 WARN_ON(rth->dst.input == lwtunnel_input);
2288 rth->dst.lwtstate->orig_input = rth->dst.input;
2289 rth->dst.input = lwtunnel_input;
2290 }
2291
2292 if (unlikely(!rt_cache_route(nhc, rth)))
2293 rt_add_uncached_list(rth);
2294 }
2295 skb_dst_set(skb, &rth->dst);
2296 err = 0;
2297 goto out;
2298
2299 no_route:
2300 RT_CACHE_STAT_INC(in_no_route);
2301 res->type = RTN_UNREACHABLE;
2302 res->fi = NULL;
2303 res->table = NULL;
2304 goto local_input;
2305
2306 /*
2307 * Do not cache martian addresses: they should be logged (RFC1812)
2308 */
2309 martian_destination:
2310 RT_CACHE_STAT_INC(in_martian_dst);
2311 #ifdef CONFIG_IP_ROUTE_VERBOSE
2312 if (IN_DEV_LOG_MARTIANS(in_dev))
2313 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2314 &daddr, &saddr, dev->name);
2315 #endif
2316
2317 e_inval:
2318 err = -EINVAL;
2319 goto out;
2320
2321 e_nobufs:
2322 err = -ENOBUFS;
2323 goto out;
2324
2325 martian_source:
2326 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2327 goto out;
2328 }
2329
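/* Public entry point for input route lookups: wraps ip_route_input_rcu()
 * in an rcu_read_lock()/rcu_read_unlock() pair.
 */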
2330 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2331 u8 tos, struct net_device *dev)
2332 {
2333 struct fib_result res;
2334 int err;
2335
2336 tos &= IPTOS_RT_MASK;
2337 rcu_read_lock();
2338 err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2339 rcu_read_unlock();
2340
2341 return err;
2342 }
2343 EXPORT_SYMBOL(ip_route_input_noref);
2344
2345 /* called with rcu_read_lock held */
2346 int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2347 u8 tos, struct net_device *dev, struct fib_result *res)
2348 {
2349 /* Multicast recognition logic was moved from the route cache to here.
2350 The problem was that too many Ethernet cards have broken/missing
2351 hardware multicast filters :-( As a result, a host on a multicast
2352 network acquires a lot of useless route cache entries, e.g. for
2353 SDR messages from all over the world. Now we try to get rid of them.
2354 Really, provided the software IP multicast filter is organized
2355 reasonably (at least, hashed), it does not result in a slowdown
2356 compared with route cache reject entries.
2357 Note that multicast routers are not affected, because a
2358 route cache entry is created eventually.
2359 */
2360 if (ipv4_is_multicast(daddr)) {
2361 struct in_device *in_dev = __in_dev_get_rcu(dev);
2362 int our = 0;
2363 int err = -EINVAL;
2364
2365 if (!in_dev)
2366 return err;
2367 our = ip_check_mc_rcu(in_dev, daddr, saddr,
2368 ip_hdr(skb)->protocol);
2369
2370 /* check l3 master if no match yet */
2371 if (!our && netif_is_l3_slave(dev)) {
2372 struct in_device *l3_in_dev;
2373
2374 l3_in_dev = __in_dev_get_rcu(skb->dev);
2375 if (l3_in_dev)
2376 our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2377 ip_hdr(skb)->protocol);
2378 }
2379
2380 if (our
2381 #ifdef CONFIG_IP_MROUTE
2382 ||
2383 (!ipv4_is_local_multicast(daddr) &&
2384 IN_DEV_MFORWARD(in_dev))
2385 #endif
2386 ) {
2387 err = ip_route_input_mc(skb, daddr, saddr,
2388 tos, dev, our);
2389 }
2390 return err;
2391 }
2392
2393 return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2394 }
2395
2396 /* called with rcu_read_lock() */
2397 static struct rtable *__mkroute_output(const struct fib_result *res,
2398 const struct flowi4 *fl4, int orig_oif,
2399 struct net_device *dev_out,
2400 unsigned int flags)
2401 {
2402 struct fib_info *fi = res->fi;
2403 struct fib_nh_exception *fnhe;
2404 struct in_device *in_dev;
2405 u16 type = res->type;
2406 struct rtable *rth;
2407 bool do_cache;
2408
2409 in_dev = __in_dev_get_rcu(dev_out);
2410 if (!in_dev)
2411 return ERR_PTR(-EINVAL);
2412
2413 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2414 if (ipv4_is_loopback(fl4->saddr) &&
2415 !(dev_out->flags & IFF_LOOPBACK) &&
2416 !netif_is_l3_master(dev_out))
2417 return ERR_PTR(-EINVAL);
2418
2419 if (ipv4_is_lbcast(fl4->daddr))
2420 type = RTN_BROADCAST;
2421 else if (ipv4_is_multicast(fl4->daddr))
2422 type = RTN_MULTICAST;
2423 else if (ipv4_is_zeronet(fl4->daddr))
2424 return ERR_PTR(-EINVAL);
2425
2426 if (dev_out->flags & IFF_LOOPBACK)
2427 flags |= RTCF_LOCAL;
2428
2429 do_cache = true;
2430 if (type == RTN_BROADCAST) {
2431 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2432 fi = NULL;
2433 } else if (type == RTN_MULTICAST) {
2434 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2435 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2436 fl4->flowi4_proto))
2437 flags &= ~RTCF_LOCAL;
2438 else
2439 do_cache = false;
2440 /* If a multicast route does not exist, use
2441 * the default one, but do not use a gateway in this case.
2442 * Yes, it is a hack.
2443 */
2444 if (fi && res->prefixlen < 4)
2445 fi = NULL;
2446 } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2447 (orig_oif != dev_out->ifindex)) {
2448 /* For local routes that require a particular output interface
2449 * we do not want to cache the result. Caching the result
2450 * causes incorrect behaviour when there are multiple source
2451 * addresses on the interface, the end result being that if the
2452 * intended recipient is waiting on that interface for the
2453 * packet he won't receive it because it will be delivered on
2454 * the loopback interface and the IP_PKTINFO ipi_ifindex will
2455 * be set to the loopback interface as well.
2456 */
2457 do_cache = false;
2458 }
2459
2460 fnhe = NULL;
2461 do_cache &= fi != NULL;
2462 if (fi) {
2463 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2464 struct rtable __rcu **prth;
2465
2466 fnhe = find_exception(nhc, fl4->daddr);
2467 if (!do_cache)
2468 goto add;
2469 if (fnhe) {
2470 prth = &fnhe->fnhe_rth_output;
2471 } else {
2472 if (unlikely(fl4->flowi4_flags &
2473 FLOWI_FLAG_KNOWN_NH &&
2474 !(nhc->nhc_gw_family &&
2475 nhc->nhc_scope == RT_SCOPE_LINK))) {
2476 do_cache = false;
2477 goto add;
2478 }
2479 prth = raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
2480 }
2481 rth = rcu_dereference(*prth);
2482 if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2483 return rth;
2484 }
2485
2486 add:
2487 rth = rt_dst_alloc(dev_out, flags, type,
2488 IN_DEV_CONF_GET(in_dev, NOPOLICY),
2489 IN_DEV_CONF_GET(in_dev, NOXFRM));
2490 if (!rth)
2491 return ERR_PTR(-ENOBUFS);
2492
2493 rth->rt_iif = orig_oif;
2494
2495 RT_CACHE_STAT_INC(out_slow_tot);
2496
2497 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2498 if (flags & RTCF_LOCAL &&
2499 !(dev_out->flags & IFF_LOOPBACK)) {
2500 rth->dst.output = ip_mc_output;
2501 RT_CACHE_STAT_INC(out_slow_mc);
2502 }
2503 #ifdef CONFIG_IP_MROUTE
2504 if (type == RTN_MULTICAST) {
2505 if (IN_DEV_MFORWARD(in_dev) &&
2506 !ipv4_is_local_multicast(fl4->daddr)) {
2507 rth->dst.input = ip_mr_input;
2508 rth->dst.output = ip_mc_output;
2509 }
2510 }
2511 #endif
2512 }
2513
2514 rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2515 lwtunnel_set_redirect(&rth->dst);
2516
2517 return rth;
2518 }
2519
2520 /*
2521 * Major route resolver routine.
2522 */
2523
2524 struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2525 const struct sk_buff *skb)
2526 {
2527 __u8 tos = RT_FL_TOS(fl4);
2528 struct fib_result res = {
2529 .type = RTN_UNSPEC,
2530 .fi = NULL,
2531 .table = NULL,
2532 .tclassid = 0,
2533 };
2534 struct rtable *rth;
2535
2536 fl4->flowi4_iif = LOOPBACK_IFINDEX;
2537 fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2538 fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2539 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2540
2541 rcu_read_lock();
2542 rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2543 rcu_read_unlock();
2544
2545 return rth;
2546 }
2547 EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
2548
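/* RCU variant of the output route resolver: must be called with
 * rcu_read_lock() held; fills in *res and returns the rtable or an
 * ERR_PTR() on failure.
 */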
2549 struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2550 struct fib_result *res,
2551 const struct sk_buff *skb)
2552 {
2553 struct net_device *dev_out = NULL;
2554 int orig_oif = fl4->flowi4_oif;
2555 unsigned int flags = 0;
2556 struct rtable *rth;
2557 int err;
2558
2559 if (fl4->saddr) {
2560 if (ipv4_is_multicast(fl4->saddr) ||
2561 ipv4_is_lbcast(fl4->saddr) ||
2562 ipv4_is_zeronet(fl4->saddr)) {
2563 rth = ERR_PTR(-EINVAL);
2564 goto out;
2565 }
2566
2567 rth = ERR_PTR(-ENETUNREACH);
2568
2569 /* I removed the check for oif == dev_out->oif here.
2570 It was wrong for two reasons:
2571 1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2572 is assigned to multiple interfaces.
2573 2. Moreover, we are allowed to send packets with a saddr
2574 of another iface. --ANK
2575 */
2576
2577 if (fl4->flowi4_oif == 0 &&
2578 (ipv4_is_multicast(fl4->daddr) ||
2579 ipv4_is_lbcast(fl4->daddr))) {
2580 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2581 dev_out = __ip_dev_find(net, fl4->saddr, false);
2582 if (!dev_out)
2583 goto out;
2584
2585 /* Special hack: the user can direct multicasts
2586 and limited broadcasts via the necessary interface
2587 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2588 This hack is not just for fun, it allows
2589 vic, vat and friends to work.
2590 They bind a socket to loopback, set the ttl to zero
2591 and expect that it will work.
2592 From the viewpoint of the routing cache they are broken,
2593 because we are not allowed to build a multicast path
2594 with a loopback source addr (look, the routing cache
2595 cannot know that the ttl is zero, so the packet
2596 will not leave this host and the route is valid).
2597 Luckily, this hack is a good workaround.
2598 */
2599
2600 fl4->flowi4_oif = dev_out->ifindex;
2601 goto make_route;
2602 }
2603
2604 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2605 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2606 if (!__ip_dev_find(net, fl4->saddr, false))
2607 goto out;
2608 }
2609 }
2610
2611
2612 if (fl4->flowi4_oif) {
2613 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2614 rth = ERR_PTR(-ENODEV);
2615 if (!dev_out)
2616 goto out;
2617
2618 /* RACE: Check return value of inet_select_addr instead. */
2619 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2620 rth = ERR_PTR(-ENETUNREACH);
2621 goto out;
2622 }
2623 if (ipv4_is_local_multicast(fl4->daddr) ||
2624 ipv4_is_lbcast(fl4->daddr) ||
2625 fl4->flowi4_proto == IPPROTO_IGMP) {
2626 if (!fl4->saddr)
2627 fl4->saddr = inet_select_addr(dev_out, 0,
2628 RT_SCOPE_LINK);
2629 goto make_route;
2630 }
2631 if (!fl4->saddr) {
2632 if (ipv4_is_multicast(fl4->daddr))
2633 fl4->saddr = inet_select_addr(dev_out, 0,
2634 fl4->flowi4_scope);
2635 else if (!fl4->daddr)
2636 fl4->saddr = inet_select_addr(dev_out, 0,
2637 RT_SCOPE_HOST);
2638 }
2639 }
2640
2641 if (!fl4->daddr) {
2642 fl4->daddr = fl4->saddr;
2643 if (!fl4->daddr)
2644 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2645 dev_out = net->loopback_dev;
2646 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2647 res->type = RTN_LOCAL;
2648 flags |= RTCF_LOCAL;
2649 goto make_route;
2650 }
2651
2652 err = fib_lookup(net, fl4, res, 0);
2653 if (err) {
2654 res->fi = NULL;
2655 res->table = NULL;
2656 if (fl4->flowi4_oif &&
2657 (ipv4_is_multicast(fl4->daddr) ||
2658 !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2659 /* Apparently, the routing tables are wrong. Assume
2660 that the destination is on-link.
2661 
2662 WHY? DW.
2663 Because we are allowed to send to an iface
2664 even if it has NO routes and NO assigned
2665 addresses. When oif is specified, the routing
2666 tables are looked up with only one purpose:
2667 to catch whether the destination is gatewayed rather
2668 than direct. Moreover, if MSG_DONTROUTE is set,
2669 we send the packet, ignoring both routing tables
2670 and ifaddr state. --ANK
2671 
2672 
2673 We could do this even if oif is unknown
2674 (as IPv6 likely does), but we do not.
2675 */
2676
2677 if (fl4->saddr == 0)
2678 fl4->saddr = inet_select_addr(dev_out, 0,
2679 RT_SCOPE_LINK);
2680 res->type = RTN_UNICAST;
2681 goto make_route;
2682 }
2683 rth = ERR_PTR(err);
2684 goto out;
2685 }
2686
2687 if (res->type == RTN_LOCAL) {
2688 if (!fl4->saddr) {
2689 if (res->fi->fib_prefsrc)
2690 fl4->saddr = res->fi->fib_prefsrc;
2691 else
2692 fl4->saddr = fl4->daddr;
2693 }
2694
2695 /* L3 master device is the loopback for that domain */
2696 dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2697 net->loopback_dev;
2698
2699 /* make sure orig_oif points to fib result device even
2700 * though packet rx/tx happens over loopback or l3mdev
2701 */
2702 orig_oif = FIB_RES_OIF(*res);
2703
2704 fl4->flowi4_oif = dev_out->ifindex;
2705 flags |= RTCF_LOCAL;
2706 goto make_route;
2707 }
2708
2709 fib_select_path(net, res, fl4, skb);
2710
2711 dev_out = FIB_RES_DEV(*res);
2712
2713 make_route:
2714 rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2715
2716 out:
2717 return rth;
2718 }
2719
2720 static struct dst_ops ipv4_dst_blackhole_ops = {
2721 .family = AF_INET,
2722 .default_advmss = ipv4_default_advmss,
2723 .neigh_lookup = ipv4_neigh_lookup,
2724 .check = dst_blackhole_check,
2725 .cow_metrics = dst_blackhole_cow_metrics,
2726 .update_pmtu = dst_blackhole_update_pmtu,
2727 .redirect = dst_blackhole_redirect,
2728 .mtu = dst_blackhole_mtu,
2729 };
2730
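/* Copy the routing state of dst_orig into a new blackhole dst whose
 * input and output handlers simply discard packets, then release the
 * reference on dst_orig.
 */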
2731 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2732 {
2733 struct rtable *ort = (struct rtable *) dst_orig;
2734 struct rtable *rt;
2735
2736 rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2737 if (rt) {
2738 struct dst_entry *new = &rt->dst;
2739
2740 new->__use = 1;
2741 new->input = dst_discard;
2742 new->output = dst_discard_out;
2743
2744 new->dev = net->loopback_dev;
2745 if (new->dev)
2746 dev_hold(new->dev);
2747
2748 rt->rt_is_input = ort->rt_is_input;
2749 rt->rt_iif = ort->rt_iif;
2750 rt->rt_pmtu = ort->rt_pmtu;
2751 rt->rt_mtu_locked = ort->rt_mtu_locked;
2752
2753 rt->rt_genid = rt_genid_ipv4(net);
2754 rt->rt_flags = ort->rt_flags;
2755 rt->rt_type = ort->rt_type;
2756 rt->rt_uses_gateway = ort->rt_uses_gateway;
2757 rt->rt_gw_family = ort->rt_gw_family;
2758 if (rt->rt_gw_family == AF_INET)
2759 rt->rt_gw4 = ort->rt_gw4;
2760 else if (rt->rt_gw_family == AF_INET6)
2761 rt->rt_gw6 = ort->rt_gw6;
2762
2763 INIT_LIST_HEAD(&rt->rt_uncached);
2764 }
2765
2766 dst_release(dst_orig);
2767
2768 return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2769 }
2770
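/* Resolve an output route for flp4 and, when a transport protocol is
 * specified, pass the result through xfrm_lookup_route() so any
 * matching transformation policy is applied.
 */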
2771 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2772 const struct sock *sk)
2773 {
2774 struct rtable *rt = __ip_route_output_key(net, flp4);
2775
2776 if (IS_ERR(rt))
2777 return rt;
2778
2779 if (flp4->flowi4_proto) {
2780 flp4->flowi4_oif = rt->dst.dev->ifindex;
2781 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2782 flowi4_to_flowi(flp4),
2783 sk, 0);
2784 }
2785
2786 return rt;
2787 }
2788 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2789
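/* Resolve the route for a tunnel transmit, consulting the tunnel's
 * dst_cache first when use_cache is set (and CONFIG_DST_CACHE is
 * enabled), and reject routes that would loop back out of dev.
 */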
2790 struct rtable *ip_route_output_tunnel(struct sk_buff *skb,
2791 struct net_device *dev,
2792 struct net *net, __be32 *saddr,
2793 const struct ip_tunnel_info *info,
2794 u8 protocol, bool use_cache)
2795 {
2796 #ifdef CONFIG_DST_CACHE
2797 struct dst_cache *dst_cache;
2798 #endif
2799 struct rtable *rt = NULL;
2800 struct flowi4 fl4;
2801 __u8 tos;
2802
2803 #ifdef CONFIG_DST_CACHE
2804 dst_cache = (struct dst_cache *)&info->dst_cache;
2805 if (use_cache) {
2806 rt = dst_cache_get_ip4(dst_cache, saddr);
2807 if (rt)
2808 return rt;
2809 }
2810 #endif
2811 memset(&fl4, 0, sizeof(fl4));
2812 fl4.flowi4_mark = skb->mark;
2813 fl4.flowi4_proto = protocol;
2814 fl4.daddr = info->key.u.ipv4.dst;
2815 fl4.saddr = info->key.u.ipv4.src;
2816 tos = info->key.tos;
2817 fl4.flowi4_tos = RT_TOS(tos);
2818
2819 rt = ip_route_output_key(net, &fl4);
2820 if (IS_ERR(rt)) {
2821 netdev_dbg(dev, "no route to %pI4\n", &fl4.daddr);
2822 return ERR_PTR(-ENETUNREACH);
2823 }
2824 if (rt->dst.dev == dev) { /* is this necessary? */
2825 netdev_dbg(dev, "circular route to %pI4\n", &fl4.daddr);
2826 ip_rt_put(rt);
2827 return ERR_PTR(-ELOOP);
2828 }
2829 #ifdef CONFIG_DST_CACHE
2830 if (use_cache)
2831 dst_cache_set_ip4(dst_cache, &rt->dst, fl4.saddr);
2832 #endif
2833 *saddr = fl4.saddr;
2834 return rt;
2835 }
2836 EXPORT_SYMBOL_GPL(ip_route_output_tunnel);
2837
2838 /* called with rcu_read_lock held */
2839 static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2840 struct rtable *rt, u32 table_id, struct flowi4 *fl4,
2841 struct sk_buff *skb, u32 portid, u32 seq,
2842 unsigned int flags)
2843 {
2844 struct rtmsg *r;
2845 struct nlmsghdr *nlh;
2846 unsigned long expires = 0;
2847 u32 error;
2848 u32 metrics[RTAX_MAX];
2849
2850 nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), flags);
2851 if (!nlh)
2852 return -EMSGSIZE;
2853
2854 r = nlmsg_data(nlh);
2855 r->rtm_family = AF_INET;
2856 r->rtm_dst_len = 32;
2857 r->rtm_src_len = 0;
2858 r->rtm_tos = fl4 ? fl4->flowi4_tos : 0;
2859 r->rtm_table = table_id < 256 ? table_id : RT_TABLE_COMPAT;
2860 if (nla_put_u32(skb, RTA_TABLE, table_id))
2861 goto nla_put_failure;
2862 r->rtm_type = rt->rt_type;
2863 r->rtm_scope = RT_SCOPE_UNIVERSE;
2864 r->rtm_protocol = RTPROT_UNSPEC;
2865 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2866 if (rt->rt_flags & RTCF_NOTIFY)
2867 r->rtm_flags |= RTM_F_NOTIFY;
2868 if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2869 r->rtm_flags |= RTCF_DOREDIRECT;
2870
2871 if (nla_put_in_addr(skb, RTA_DST, dst))
2872 goto nla_put_failure;
2873 if (src) {
2874 r->rtm_src_len = 32;
2875 if (nla_put_in_addr(skb, RTA_SRC, src))
2876 goto nla_put_failure;
2877 }
2878 if (rt->dst.dev &&
2879 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2880 goto nla_put_failure;
2881 #ifdef CONFIG_IP_ROUTE_CLASSID
2882 if (rt->dst.tclassid &&
2883 nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2884 goto nla_put_failure;
2885 #endif
2886 if (fl4 && !rt_is_input_route(rt) &&
2887 fl4->saddr != src) {
2888 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2889 goto nla_put_failure;
2890 }
2891 if (rt->rt_uses_gateway) {
2892 if (rt->rt_gw_family == AF_INET &&
2893 nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) {
2894 goto nla_put_failure;
2895 } else if (rt->rt_gw_family == AF_INET6) {
2896 int alen = sizeof(struct in6_addr);
2897 struct nlattr *nla;
2898 struct rtvia *via;
2899
2900 nla = nla_reserve(skb, RTA_VIA, alen + 2);
2901 if (!nla)
2902 goto nla_put_failure;
2903
2904 via = nla_data(nla);
2905 via->rtvia_family = AF_INET6;
2906 memcpy(via->rtvia_addr, &rt->rt_gw6, alen);
2907 }
2908 }
2909
2910 expires = rt->dst.expires;
2911 if (expires) {
2912 unsigned long now = jiffies;
2913
2914 if (time_before(now, expires))
2915 expires -= now;
2916 else
2917 expires = 0;
2918 }
2919
2920 memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2921 if (rt->rt_pmtu && expires)
2922 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2923 if (rt->rt_mtu_locked && expires)
2924 metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
2925 if (rtnetlink_put_metrics(skb, metrics) < 0)
2926 goto nla_put_failure;
2927
2928 if (fl4) {
2929 if (fl4->flowi4_mark &&
2930 nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2931 goto nla_put_failure;
2932
2933 if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2934 nla_put_u32(skb, RTA_UID,
2935 from_kuid_munged(current_user_ns(),
2936 fl4->flowi4_uid)))
2937 goto nla_put_failure;
2938
2939 if (rt_is_input_route(rt)) {
2940 #ifdef CONFIG_IP_MROUTE
2941 if (ipv4_is_multicast(dst) &&
2942 !ipv4_is_local_multicast(dst) &&
2943 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2944 int err = ipmr_get_route(net, skb,
2945 fl4->saddr, fl4->daddr,
2946 r, portid);
2947
2948 if (err <= 0) {
2949 if (err == 0)
2950 return 0;
2951 goto nla_put_failure;
2952 }
2953 } else
2954 #endif
2955 if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
2956 goto nla_put_failure;
2957 }
2958 }
2959
2960 error = rt->dst.error;
2961
2962 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2963 goto nla_put_failure;
2964
2965 nlmsg_end(skb, nlh);
2966 return 0;
2967
2968 nla_put_failure:
2969 nlmsg_cancel(skb, nlh);
2970 return -EMSGSIZE;
2971 }
2972
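/* Walk one nexthop-exception hash bucket and emit an RTM_NEWROUTE
 * message for every cached, unexpired exception of the current genid,
 * skipping entries below the fa_start dump offset.
 */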
2973 static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb,
2974 struct netlink_callback *cb, u32 table_id,
2975 struct fnhe_hash_bucket *bucket, int genid,
2976 int *fa_index, int fa_start, unsigned int flags)
2977 {
2978 int i;
2979
2980 for (i = 0; i < FNHE_HASH_SIZE; i++) {
2981 struct fib_nh_exception *fnhe;
2982
2983 for (fnhe = rcu_dereference(bucket[i].chain); fnhe;
2984 fnhe = rcu_dereference(fnhe->fnhe_next)) {
2985 struct rtable *rt;
2986 int err;
2987
2988 if (*fa_index < fa_start)
2989 goto next;
2990
2991 if (fnhe->fnhe_genid != genid)
2992 goto next;
2993
2994 if (fnhe->fnhe_expires &&
2995 time_after(jiffies, fnhe->fnhe_expires))
2996 goto next;
2997
2998 rt = rcu_dereference(fnhe->fnhe_rth_input);
2999 if (!rt)
3000 rt = rcu_dereference(fnhe->fnhe_rth_output);
3001 if (!rt)
3002 goto next;
3003
3004 err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt,
3005 table_id, NULL, skb,
3006 NETLINK_CB(cb->skb).portid,
3007 cb->nlh->nlmsg_seq, flags);
3008 if (err)
3009 return err;
3010 next:
3011 (*fa_index)++;
3012 }
3013 }
3014
3015 return 0;
3016 }
3017
3018 int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
3019 u32 table_id, struct fib_info *fi,
3020 int *fa_index, int fa_start, unsigned int flags)
3021 {
3022 struct net *net = sock_net(cb->skb->sk);
3023 int nhsel, genid = fnhe_genid(net);
3024
3025 for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) {
3026 struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel);
3027 struct fnhe_hash_bucket *bucket;
3028 int err;
3029
3030 if (nhc->nhc_flags & RTNH_F_DEAD)
3031 continue;
3032
3033 rcu_read_lock();
3034 bucket = rcu_dereference(nhc->nhc_exceptions);
3035 err = 0;
3036 if (bucket)
3037 err = fnhe_dump_bucket(net, skb, cb, table_id, bucket,
3038 genid, fa_index, fa_start,
3039 flags);
3040 rcu_read_unlock();
3041 if (err)
3042 return err;
3043 }
3044
3045 return 0;
3046 }
3047
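/* Build a minimal skb carrying a dummy IPv4 header plus a UDP, TCP or
 * ICMP transport header, so an RTM_GETROUTE lookup can be run through
 * the same code paths as a real packet.
 */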
3048 static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
3049 u8 ip_proto, __be16 sport,
3050 __be16 dport)
3051 {
3052 struct sk_buff *skb;
3053 struct iphdr *iph;
3054
3055 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3056 if (!skb)
3057 return NULL;
3058
3059 /* Reserve room for dummy headers; this skb can pass
3060 * through a good chunk of the routing engine.
3061 */
3062 skb_reset_mac_header(skb);
3063 skb_reset_network_header(skb);
3064 skb->protocol = htons(ETH_P_IP);
3065 iph = skb_put(skb, sizeof(struct iphdr));
3066 iph->protocol = ip_proto;
3067 iph->saddr = src;
3068 iph->daddr = dst;
3069 iph->version = 0x4;
3070 iph->frag_off = 0;
3071 iph->ihl = 0x5;
3072 skb_set_transport_header(skb, skb->len);
3073
3074 switch (iph->protocol) {
3075 case IPPROTO_UDP: {
3076 struct udphdr *udph;
3077
3078 udph = skb_put_zero(skb, sizeof(struct udphdr));
3079 udph->source = sport;
3080 udph->dest = dport;
3081 udph->len = htons(sizeof(struct udphdr));
3082 udph->check = 0;
3083 break;
3084 }
3085 case IPPROTO_TCP: {
3086 struct tcphdr *tcph;
3087
3088 tcph = skb_put_zero(skb, sizeof(struct tcphdr));
3089 tcph->source = sport;
3090 tcph->dest = dport;
3091 tcph->doff = sizeof(struct tcphdr) / 4;
3092 tcph->rst = 1;
3093 tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
3094 src, dst, 0);
3095 break;
3096 }
3097 case IPPROTO_ICMP: {
3098 struct icmphdr *icmph;
3099
3100 icmph = skb_put_zero(skb, sizeof(struct icmphdr));
3101 icmph->type = ICMP_ECHO;
3102 icmph->code = 0;
3103 }
3104 }
3105
3106 return skb;
3107 }
3108
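/* Validate an RTM_GETROUTE request; under strict checking only a
 * well-formed rtmsg header and the attributes handled below are
 * accepted.
 */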
3109 static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
3110 const struct nlmsghdr *nlh,
3111 struct nlattr **tb,
3112 struct netlink_ext_ack *extack)
3113 {
3114 struct rtmsg *rtm;
3115 int i, err;
3116
3117 if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
3118 NL_SET_ERR_MSG(extack,
3119 "ipv4: Invalid header for route get request");
3120 return -EINVAL;
3121 }
3122
3123 if (!netlink_strict_get_check(skb))
3124 return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
3125 rtm_ipv4_policy, extack);
3126
3127 rtm = nlmsg_data(nlh);
3128 if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
3129 (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
3130 rtm->rtm_table || rtm->rtm_protocol ||
3131 rtm->rtm_scope || rtm->rtm_type) {
3132 NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request");
3133 return -EINVAL;
3134 }
3135
3136 if (rtm->rtm_flags & ~(RTM_F_NOTIFY |
3137 RTM_F_LOOKUP_TABLE |
3138 RTM_F_FIB_MATCH)) {
3139 NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request");
3140 return -EINVAL;
3141 }
3142
3143 err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
3144 rtm_ipv4_policy, extack);
3145 if (err)
3146 return err;
3147
3148 if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
3149 (tb[RTA_DST] && !rtm->rtm_dst_len)) {
3150 NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4");
3151 return -EINVAL;
3152 }
3153
3154 for (i = 0; i <= RTA_MAX; i++) {
3155 if (!tb[i])
3156 continue;
3157
3158 switch (i) {
3159 case RTA_IIF:
3160 case RTA_OIF:
3161 case RTA_SRC:
3162 case RTA_DST:
3163 case RTA_IP_PROTO:
3164 case RTA_SPORT:
3165 case RTA_DPORT:
3166 case RTA_MARK:
3167 case RTA_UID:
3168 break;
3169 default:
3170 NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request");
3171 return -EINVAL;
3172 }
3173 }
3174
3175 return 0;
3176 }
3177
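/* RTM_GETROUTE handler: performs an input route lookup when RTA_IIF is
 * given, otherwise an output lookup, and replies with the resulting
 * route (or the matching FIB entry when RTM_F_FIB_MATCH is set).
 */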
3178 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
3179 struct netlink_ext_ack *extack)
3180 {
3181 struct net *net = sock_net(in_skb->sk);
3182 struct nlattr *tb[RTA_MAX+1];
3183 u32 table_id = RT_TABLE_MAIN;
3184 __be16 sport = 0, dport = 0;
3185 struct fib_result res = {};
3186 u8 ip_proto = IPPROTO_UDP;
3187 struct rtable *rt = NULL;
3188 struct sk_buff *skb;
3189 struct rtmsg *rtm;
3190 struct flowi4 fl4 = {};
3191 __be32 dst = 0;
3192 __be32 src = 0;
3193 kuid_t uid;
3194 u32 iif;
3195 int err;
3196 int mark;
3197
3198 err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
3199 if (err < 0)
3200 return err;
3201
3202 rtm = nlmsg_data(nlh);
3203 src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
3204 dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
3205 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3206 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3207 if (tb[RTA_UID])
3208 uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
3209 else
3210 uid = (iif ? INVALID_UID : current_uid());
3211
3212 if (tb[RTA_IP_PROTO]) {
3213 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
3214 &ip_proto, AF_INET, extack);
3215 if (err)
3216 return err;
3217 }
3218
3219 if (tb[RTA_SPORT])
3220 sport = nla_get_be16(tb[RTA_SPORT]);
3221
3222 if (tb[RTA_DPORT])
3223 dport = nla_get_be16(tb[RTA_DPORT]);
3224
3225 skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
3226 if (!skb)
3227 return -ENOBUFS;
3228
3229 fl4.daddr = dst;
3230 fl4.saddr = src;
3231 fl4.flowi4_tos = rtm->rtm_tos & IPTOS_RT_MASK;
3232 fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
3233 fl4.flowi4_mark = mark;
3234 fl4.flowi4_uid = uid;
3235 if (sport)
3236 fl4.fl4_sport = sport;
3237 if (dport)
3238 fl4.fl4_dport = dport;
3239 fl4.flowi4_proto = ip_proto;
3240
3241 rcu_read_lock();
3242
3243 if (iif) {
3244 struct net_device *dev;
3245
3246 dev = dev_get_by_index_rcu(net, iif);
3247 if (!dev) {
3248 err = -ENODEV;
3249 goto errout_rcu;
3250 }
3251
3252 fl4.flowi4_iif = iif; /* for rt_fill_info */
3253 skb->dev = dev;
3254 skb->mark = mark;
3255 err = ip_route_input_rcu(skb, dst, src,
3256 rtm->rtm_tos & IPTOS_RT_MASK, dev,
3257 &res);
3258
3259 rt = skb_rtable(skb);
3260 if (err == 0 && rt->dst.error)
3261 err = -rt->dst.error;
3262 } else {
3263 fl4.flowi4_iif = LOOPBACK_IFINDEX;
3264 skb->dev = net->loopback_dev;
3265 rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
3266 err = 0;
3267 if (IS_ERR(rt))
3268 err = PTR_ERR(rt);
3269 else
3270 skb_dst_set(skb, &rt->dst);
3271 }
3272
3273 if (err)
3274 goto errout_rcu;
3275
3276 if (rtm->rtm_flags & RTM_F_NOTIFY)
3277 rt->rt_flags |= RTCF_NOTIFY;
3278
3279 if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
3280 table_id = res.table ? res.table->tb_id : 0;
3281
3282 /* reset skb for netlink reply msg */
3283 skb_trim(skb, 0);
3284 skb_reset_network_header(skb);
3285 skb_reset_transport_header(skb);
3286 skb_reset_mac_header(skb);
3287
3288 if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
3289 struct fib_rt_info fri;
3290
3291 if (!res.fi) {
3292 err = fib_props[res.type].error;
3293 if (!err)
3294 err = -EHOSTUNREACH;
3295 goto errout_rcu;
3296 }
3297 fri.fi = res.fi;
3298 fri.tb_id = table_id;
3299 fri.dst = res.prefix;
3300 fri.dst_len = res.prefixlen;
3301 fri.tos = fl4.flowi4_tos;
3302 fri.type = rt->rt_type;
3303 fri.offload = 0;
3304 fri.trap = 0;
3305 if (res.fa_head) {
3306 struct fib_alias *fa;
3307
3308 hlist_for_each_entry_rcu(fa, res.fa_head, fa_list) {
3309 u8 slen = 32 - fri.dst_len;
3310
3311 if (fa->fa_slen == slen &&
3312 fa->tb_id == fri.tb_id &&
3313 fa->fa_tos == fri.tos &&
3314 fa->fa_info == res.fi &&
3315 fa->fa_type == fri.type) {
3316 fri.offload = fa->offload;
3317 fri.trap = fa->trap;
3318 break;
3319 }
3320 }
3321 }
3322 err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
3323 nlh->nlmsg_seq, RTM_NEWROUTE, &fri, 0);
3324 } else {
3325 err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
3326 NETLINK_CB(in_skb).portid,
3327 nlh->nlmsg_seq, 0);
3328 }
3329 if (err < 0)
3330 goto errout_rcu;
3331
3332 rcu_read_unlock();
3333
3334 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
3335
3336 errout_free:
3337 return err;
3338 errout_rcu:
3339 rcu_read_unlock();
3340 kfree_skb(skb);
3341 goto errout_free;
3342 }
3343
3344 void ip_rt_multicast_event(struct in_device *in_dev)
3345 {
3346 rt_cache_flush(dev_net(in_dev->dev));
3347 }
3348
3349 #ifdef CONFIG_SYSCTL
3350 static int ip_rt_gc_interval __read_mostly = 60 * HZ;
3351 static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
3352 static int ip_rt_gc_elasticity __read_mostly = 8;
3353 static int ip_min_valid_pmtu __read_mostly = IPV4_MIN_MTU;
3354
3355 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
3356 void *buffer, size_t *lenp, loff_t *ppos)
3357 {
3358 struct net *net = (struct net *)__ctl->extra1;
3359
3360 if (write) {
3361 rt_cache_flush(net);
3362 fnhe_genid_bump(net);
3363 return 0;
3364 }
3365
3366 return -EINVAL;
3367 }
3368
3369 static struct ctl_table ipv4_route_table[] = {
3370 {
3371 .procname = "gc_thresh",
3372 .data = &ipv4_dst_ops.gc_thresh,
3373 .maxlen = sizeof(int),
3374 .mode = 0644,
3375 .proc_handler = proc_dointvec,
3376 },
3377 {
3378 .procname = "max_size",
3379 .data = &ip_rt_max_size,
3380 .maxlen = sizeof(int),
3381 .mode = 0644,
3382 .proc_handler = proc_dointvec,
3383 },
3384 {
3385 /* Deprecated. Use gc_min_interval_ms */
3386
3387 .procname = "gc_min_interval",
3388 .data = &ip_rt_gc_min_interval,
3389 .maxlen = sizeof(int),
3390 .mode = 0644,
3391 .proc_handler = proc_dointvec_jiffies,
3392 },
3393 {
3394 .procname = "gc_min_interval_ms",
3395 .data = &ip_rt_gc_min_interval,
3396 .maxlen = sizeof(int),
3397 .mode = 0644,
3398 .proc_handler = proc_dointvec_ms_jiffies,
3399 },
3400 {
3401 .procname = "gc_timeout",
3402 .data = &ip_rt_gc_timeout,
3403 .maxlen = sizeof(int),
3404 .mode = 0644,
3405 .proc_handler = proc_dointvec_jiffies,
3406 },
3407 {
3408 .procname = "gc_interval",
3409 .data = &ip_rt_gc_interval,
3410 .maxlen = sizeof(int),
3411 .mode = 0644,
3412 .proc_handler = proc_dointvec_jiffies,
3413 },
3414 {
3415 .procname = "redirect_load",
3416 .data = &ip_rt_redirect_load,
3417 .maxlen = sizeof(int),
3418 .mode = 0644,
3419 .proc_handler = proc_dointvec,
3420 },
3421 {
3422 .procname = "redirect_number",
3423 .data = &ip_rt_redirect_number,
3424 .maxlen = sizeof(int),
3425 .mode = 0644,
3426 .proc_handler = proc_dointvec,
3427 },
3428 {
3429 .procname = "redirect_silence",
3430 .data = &ip_rt_redirect_silence,
3431 .maxlen = sizeof(int),
3432 .mode = 0644,
3433 .proc_handler = proc_dointvec,
3434 },
3435 {
3436 .procname = "error_cost",
3437 .data = &ip_rt_error_cost,
3438 .maxlen = sizeof(int),
3439 .mode = 0644,
3440 .proc_handler = proc_dointvec,
3441 },
3442 {
3443 .procname = "error_burst",
3444 .data = &ip_rt_error_burst,
3445 .maxlen = sizeof(int),
3446 .mode = 0644,
3447 .proc_handler = proc_dointvec,
3448 },
3449 {
3450 .procname = "gc_elasticity",
3451 .data = &ip_rt_gc_elasticity,
3452 .maxlen = sizeof(int),
3453 .mode = 0644,
3454 .proc_handler = proc_dointvec,
3455 },
3456 {
3457 .procname = "mtu_expires",
3458 .data = &ip_rt_mtu_expires,
3459 .maxlen = sizeof(int),
3460 .mode = 0644,
3461 .proc_handler = proc_dointvec_jiffies,
3462 },
3463 {
3464 .procname = "min_pmtu",
3465 .data = &ip_rt_min_pmtu,
3466 .maxlen = sizeof(int),
3467 .mode = 0644,
3468 .proc_handler = proc_dointvec_minmax,
3469 .extra1 = &ip_min_valid_pmtu,
3470 },
3471 {
3472 .procname = "min_adv_mss",
3473 .data = &ip_rt_min_advmss,
3474 .maxlen = sizeof(int),
3475 .mode = 0644,
3476 .proc_handler = proc_dointvec,
3477 },
3478 { }
3479 };
3480
3481 static const char ipv4_route_flush_procname[] = "flush";
3482
3483 static struct ctl_table ipv4_route_flush_table[] = {
3484 {
3485 .procname = ipv4_route_flush_procname,
3486 .maxlen = sizeof(int),
3487 .mode = 0200,
3488 .proc_handler = ipv4_sysctl_rtcache_flush,
3489 },
3490 { },
3491 };
3492
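/* Per-netns sysctl setup: non-initial network namespaces get their own
 * copy of the flush table before it is registered under net/ipv4/route.
 */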
3493 static __net_init int sysctl_route_net_init(struct net *net)
3494 {
3495 struct ctl_table *tbl;
3496
3497 tbl = ipv4_route_flush_table;
3498 if (!net_eq(net, &init_net)) {
3499 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3500 if (!tbl)
3501 goto err_dup;
3502
3503 /* Don't export non-whitelisted sysctls to unprivileged users */
3504 if (net->user_ns != &init_user_ns) {
3505 if (tbl[0].procname != ipv4_route_flush_procname)
3506 tbl[0].procname = NULL;
3507 }
3508 }
3509 tbl[0].extra1 = net;
3510
3511 net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
3512 if (!net->ipv4.route_hdr)
3513 goto err_reg;
3514 return 0;
3515
3516 err_reg:
3517 if (tbl != ipv4_route_flush_table)
3518 kfree(tbl);
3519 err_dup:
3520 return -ENOMEM;
3521 }
3522
3523 static __net_exit void sysctl_route_net_exit(struct net *net)
3524 {
3525 struct ctl_table *tbl;
3526
3527 tbl = net->ipv4.route_hdr->ctl_table_arg;
3528 unregister_net_sysctl_table(net->ipv4.route_hdr);
3529 BUG_ON(tbl == ipv4_route_flush_table);
3530 kfree(tbl);
3531 }
3532
3533 static __net_initdata struct pernet_operations sysctl_route_ops = {
3534 .init = sysctl_route_net_init,
3535 .exit = sysctl_route_net_exit,
3536 };
3537 #endif
3538
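/* Per-netns init: reset the route and nexthop-exception generation
 * counters and randomise the per-device address genid.
 */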
3539 static __net_init int rt_genid_init(struct net *net)
3540 {
3541 atomic_set(&net->ipv4.rt_genid, 0);
3542 atomic_set(&net->fnhe_genid, 0);
3543 atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
3544 return 0;
3545 }
3546
3547 static __net_initdata struct pernet_operations rt_genid_ops = {
3548 .init = rt_genid_init,
3549 };
3550
3551 static int __net_init ipv4_inetpeer_init(struct net *net)
3552 {
3553 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3554
3555 if (!bp)
3556 return -ENOMEM;
3557 inet_peer_base_init(bp);
3558 net->ipv4.peers = bp;
3559 return 0;
3560 }
3561
3562 static void __net_exit ipv4_inetpeer_exit(struct net *net)
3563 {
3564 struct inet_peer_base *bp = net->ipv4.peers;
3565
3566 net->ipv4.peers = NULL;
3567 inetpeer_invalidate_tree(bp);
3568 kfree(bp);
3569 }
3570
3571 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3572 .init = ipv4_inetpeer_init,
3573 .exit = ipv4_inetpeer_exit,
3574 };
3575
3576 #ifdef CONFIG_IP_ROUTE_CLASSID
3577 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3578 #endif /* CONFIG_IP_ROUTE_CLASSID */
3579
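/* Boot-time initialisation of the IPv4 routing layer: the IP-ID hash,
 * per-cpu uncached route lists, dst slab caches, /proc entries and the
 * RTM_GETROUTE netlink handler.
 */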
3580 int __init ip_rt_init(void)
3581 {
3582 void *idents_hash;
3583 int cpu;
3584
3585 /* For modern hosts, this will use 2 MB of memory */
3586 idents_hash = alloc_large_system_hash("IP idents",
3587 sizeof(*ip_idents) + sizeof(*ip_tstamps),
3588 0,
3589 16, /* one bucket per 64 KB */
3590 HASH_ZERO,
3591 NULL,
3592 &ip_idents_mask,
3593 2048,
3594 256*1024);
3595
3596 ip_idents = idents_hash;
3597
3598 prandom_bytes(ip_idents, (ip_idents_mask + 1) * sizeof(*ip_idents));
3599
3600 ip_tstamps = idents_hash + (ip_idents_mask + 1) * sizeof(*ip_idents);
3601
3602 for_each_possible_cpu(cpu) {
3603 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3604
3605 INIT_LIST_HEAD(&ul->head);
3606 spin_lock_init(&ul->lock);
3607 }
3608 #ifdef CONFIG_IP_ROUTE_CLASSID
3609 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3610 if (!ip_rt_acct)
3611 panic("IP: failed to allocate ip_rt_acct\n");
3612 #endif
3613
3614 ipv4_dst_ops.kmem_cachep =
3615 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3616 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3617
3618 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3619
3620 if (dst_entries_init(&ipv4_dst_ops) < 0)
3621 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3622
3623 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3624 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3625
3626 ipv4_dst_ops.gc_thresh = ~0;
3627 ip_rt_max_size = INT_MAX;
3628
3629 devinet_init();
3630 ip_fib_init();
3631
3632 if (ip_rt_proc_init())
3633 pr_err("Unable to create route proc files\n");
3634 #ifdef CONFIG_XFRM
3635 xfrm_init();
3636 xfrm4_init();
3637 #endif
3638 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
3639 RTNL_FLAG_DOIT_UNLOCKED);
3640
3641 #ifdef CONFIG_SYSCTL
3642 register_pernet_subsys(&sysctl_route_ops);
3643 #endif
3644 register_pernet_subsys(&rt_genid_ops);
3645 register_pernet_subsys(&ipv4_inetpeer_ops);
3646 return 0;
3647 }
3648
3649 #ifdef CONFIG_SYSCTL
3650 /*
3651 * We really need to sanitize the damn ipv4 init order, then all
3652 * this nonsense will go away.
3653 */
3654 void __init ip_static_sysctl_init(void)
3655 {
3656 register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3657 }
3658 #endif
3659