1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
6 *
7 * ROUTE - implementation of the IP router.
8 *
9 * Authors: Ross Biro
10 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
11 * Alan Cox, <gw4pts@gw4pts.ampr.org>
12 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
13 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
14 *
15 * Fixes:
16 * Alan Cox : Verify area fixes.
17 * Alan Cox : cli() protects routing changes
18 * Rui Oliveira : ICMP routing table updates
19 * (rco@di.uminho.pt) Routing table insertion and update
20 * Linus Torvalds : Rewrote bits to be sensible
21 * Alan Cox : Added BSD route gw semantics
22 * Alan Cox : Super /proc >4K
23 * Alan Cox : MTU in route table
24 * Alan Cox : MSS actually. Also added the window
25 * clamper.
26 * Sam Lantinga : Fixed route matching in rt_del()
27 * Alan Cox : Routing cache support.
28 * Alan Cox : Removed compatibility cruft.
29 * Alan Cox : RTF_REJECT support.
30 * Alan Cox : TCP irtt support.
31 * Jonathan Naylor : Added Metric support.
32 * Miquel van Smoorenburg : BSD API fixes.
33 * Miquel van Smoorenburg : Metrics.
34 * Alan Cox : Use __u32 properly
35 * Alan Cox : Aligned routing errors more closely with BSD
36 * our system is still very different.
37 * Alan Cox : Faster /proc handling
38 * Alexey Kuznetsov : Massive rework to support tree based routing,
39 * routing caches and better behaviour.
40 *
41 * Olaf Erb : irtt wasn't being copied right.
42 * Bjorn Ekwall : Kerneld route support.
43 * Alan Cox : Multicast fixed (I hope)
44 * Pavel Krauz : Limited broadcast fixed
45 * Mike McLagan : Routing by source
46 * Alexey Kuznetsov : End of old history. Split to fib.c and
47 * route.c and rewritten from scratch.
48 * Andi Kleen : Load-limit warning messages.
49 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
50 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
51 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
52 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
53 * Marc Boucher : routing by fwmark
54 * Robert Olsson : Added rt_cache statistics
55 * Arnaldo C. Melo : Convert proc stuff to seq_file
56 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
57 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
58 * Ilia Sotnikov : Removed TOS from hash calculations
59 */
60
61 #define pr_fmt(fmt) "IPv4: " fmt
62
63 #include <linux/module.h>
64 #include <linux/uaccess.h>
65 #include <linux/bitops.h>
66 #include <linux/types.h>
67 #include <linux/kernel.h>
68 #include <linux/mm.h>
69 #include <linux/memblock.h>
70 #include <linux/string.h>
71 #include <linux/socket.h>
72 #include <linux/sockios.h>
73 #include <linux/errno.h>
74 #include <linux/in.h>
75 #include <linux/inet.h>
76 #include <linux/netdevice.h>
77 #include <linux/proc_fs.h>
78 #include <linux/init.h>
79 #include <linux/skbuff.h>
80 #include <linux/inetdevice.h>
81 #include <linux/igmp.h>
82 #include <linux/pkt_sched.h>
83 #include <linux/mroute.h>
84 #include <linux/netfilter_ipv4.h>
85 #include <linux/random.h>
86 #include <linux/rcupdate.h>
87 #include <linux/times.h>
88 #include <linux/slab.h>
89 #include <linux/jhash.h>
90 #include <net/dst.h>
91 #include <net/dst_metadata.h>
92 #include <net/net_namespace.h>
93 #include <net/protocol.h>
94 #include <net/ip.h>
95 #include <net/route.h>
96 #include <net/inetpeer.h>
97 #include <net/sock.h>
98 #include <net/ip_fib.h>
99 #include <net/nexthop.h>
100 #include <net/arp.h>
101 #include <net/tcp.h>
102 #include <net/icmp.h>
103 #include <net/xfrm.h>
104 #include <net/lwtunnel.h>
105 #include <net/netevent.h>
106 #include <net/rtnetlink.h>
107 #ifdef CONFIG_SYSCTL
108 #include <linux/sysctl.h>
109 #endif
110 #include <net/secure_seq.h>
111 #include <net/ip_tunnels.h>
112 #include <net/l3mdev.h>
113
114 #include "fib_lookup.h"
115
116 #define RT_FL_TOS(oldflp4) \
117 ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
118
119 #define RT_GC_TIMEOUT (300*HZ)
120
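/* Default values for the routing tunables below; most of them back the
 * net.ipv4.route.* sysctls registered elsewhere in this file.
 */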
121 static int ip_rt_max_size;
122 static int ip_rt_redirect_number __read_mostly = 9;
123 static int ip_rt_redirect_load __read_mostly = HZ / 50;
124 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
125 static int ip_rt_error_cost __read_mostly = HZ;
126 static int ip_rt_error_burst __read_mostly = 5 * HZ;
127 static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
128 static u32 ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
129 static int ip_rt_min_advmss __read_mostly = 256;
130
131 static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
132
133 /*
134 * Interface to generic destination cache.
135 */
136
137 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
138 static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
139 static unsigned int ipv4_mtu(const struct dst_entry *dst);
140 static void ipv4_negative_advice(struct sock *sk,
141 struct dst_entry *dst);
142 static void ipv4_link_failure(struct sk_buff *skb);
143 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
144 struct sk_buff *skb, u32 mtu,
145 bool confirm_neigh);
146 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk,
147 struct sk_buff *skb);
148 static void ipv4_dst_destroy(struct dst_entry *dst);
149
150 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
151 {
152 WARN_ON(1);
153 return NULL;
154 }
155
156 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
157 struct sk_buff *skb,
158 const void *daddr);
159 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
160
161 static struct dst_ops ipv4_dst_ops = {
162 .family = AF_INET,
163 .check = ipv4_dst_check,
164 .default_advmss = ipv4_default_advmss,
165 .mtu = ipv4_mtu,
166 .cow_metrics = ipv4_cow_metrics,
167 .destroy = ipv4_dst_destroy,
168 .negative_advice = ipv4_negative_advice,
169 .link_failure = ipv4_link_failure,
170 .update_pmtu = ip_rt_update_pmtu,
171 .redirect = ip_do_redirect,
172 .local_out = __ip_local_out,
173 .neigh_lookup = ipv4_neigh_lookup,
174 .confirm_neigh = ipv4_confirm_neigh,
175 };
176
177 #define ECN_OR_COST(class) TC_PRIO_##class
178
179 const __u8 ip_tos2prio[16] = {
180 TC_PRIO_BESTEFFORT,
181 ECN_OR_COST(BESTEFFORT),
182 TC_PRIO_BESTEFFORT,
183 ECN_OR_COST(BESTEFFORT),
184 TC_PRIO_BULK,
185 ECN_OR_COST(BULK),
186 TC_PRIO_BULK,
187 ECN_OR_COST(BULK),
188 TC_PRIO_INTERACTIVE,
189 ECN_OR_COST(INTERACTIVE),
190 TC_PRIO_INTERACTIVE,
191 ECN_OR_COST(INTERACTIVE),
192 TC_PRIO_INTERACTIVE_BULK,
193 ECN_OR_COST(INTERACTIVE_BULK),
194 TC_PRIO_INTERACTIVE_BULK,
195 ECN_OR_COST(INTERACTIVE_BULK)
196 };
197 EXPORT_SYMBOL(ip_tos2prio);
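/* Indexed by the 4-bit TOS value shifted right by one (as done by
 * rt_tos2priority()), mapping each TOS to a packet scheduler priority
 * band. With ECN_OR_COST() defined as plain TC_PRIO_##class above, the
 * low bit of the index does not select a different band.
 */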
198
199 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
200 #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
201
202 #ifdef CONFIG_PROC_FS
203 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
204 {
205 if (*pos)
206 return NULL;
207 return SEQ_START_TOKEN;
208 }
209
210 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
211 {
212 ++*pos;
213 return NULL;
214 }
215
216 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
217 {
218 }
219
220 static int rt_cache_seq_show(struct seq_file *seq, void *v)
221 {
222 if (v == SEQ_START_TOKEN)
223 seq_printf(seq, "%-127s\n",
224 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
225 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
226 "HHUptod\tSpecDst");
227 return 0;
228 }
229
230 static const struct seq_operations rt_cache_seq_ops = {
231 .start = rt_cache_seq_start,
232 .next = rt_cache_seq_next,
233 .stop = rt_cache_seq_stop,
234 .show = rt_cache_seq_show,
235 };
236
237 static int rt_cache_seq_open(struct inode *inode, struct file *file)
238 {
239 return seq_open(file, &rt_cache_seq_ops);
240 }
241
242 static const struct proc_ops rt_cache_proc_ops = {
243 .proc_open = rt_cache_seq_open,
244 .proc_read = seq_read,
245 .proc_lseek = seq_lseek,
246 .proc_release = seq_release,
247 };
248
249
250 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
251 {
252 int cpu;
253
254 if (*pos == 0)
255 return SEQ_START_TOKEN;
256
257 for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
258 if (!cpu_possible(cpu))
259 continue;
260 *pos = cpu+1;
261 return &per_cpu(rt_cache_stat, cpu);
262 }
263 return NULL;
264 }
265
266 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
267 {
268 int cpu;
269
270 for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
271 if (!cpu_possible(cpu))
272 continue;
273 *pos = cpu+1;
274 return &per_cpu(rt_cache_stat, cpu);
275 }
276 (*pos)++;
277 return NULL;
278
279 }
280
281 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
282 {
283
284 }
285
286 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
287 {
288 struct rt_cache_stat *st = v;
289
290 if (v == SEQ_START_TOKEN) {
291 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
292 return 0;
293 }
294
295 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
296 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
297 dst_entries_get_slow(&ipv4_dst_ops),
298 0, /* st->in_hit */
299 st->in_slow_tot,
300 st->in_slow_mc,
301 st->in_no_route,
302 st->in_brd,
303 st->in_martian_dst,
304 st->in_martian_src,
305
306 0, /* st->out_hit */
307 st->out_slow_tot,
308 st->out_slow_mc,
309
310 0, /* st->gc_total */
311 0, /* st->gc_ignored */
312 0, /* st->gc_goal_miss */
313 0, /* st->gc_dst_overflow */
314 0, /* st->in_hlist_search */
315 0 /* st->out_hlist_search */
316 );
317 return 0;
318 }
319
320 static const struct seq_operations rt_cpu_seq_ops = {
321 .start = rt_cpu_seq_start,
322 .next = rt_cpu_seq_next,
323 .stop = rt_cpu_seq_stop,
324 .show = rt_cpu_seq_show,
325 };
326
327
328 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
329 {
330 return seq_open(file, &rt_cpu_seq_ops);
331 }
332
333 static const struct proc_ops rt_cpu_proc_ops = {
334 .proc_open = rt_cpu_seq_open,
335 .proc_read = seq_read,
336 .proc_lseek = seq_lseek,
337 .proc_release = seq_release,
338 };
339
340 #ifdef CONFIG_IP_ROUTE_CLASSID
341 static int rt_acct_proc_show(struct seq_file *m, void *v)
342 {
343 struct ip_rt_acct *dst, *src;
344 unsigned int i, j;
345
346 dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
347 if (!dst)
348 return -ENOMEM;
349
350 for_each_possible_cpu(i) {
351 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
352 for (j = 0; j < 256; j++) {
353 dst[j].o_bytes += src[j].o_bytes;
354 dst[j].o_packets += src[j].o_packets;
355 dst[j].i_bytes += src[j].i_bytes;
356 dst[j].i_packets += src[j].i_packets;
357 }
358 }
359
360 seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
361 kfree(dst);
362 return 0;
363 }
364 #endif
365
366 static int __net_init ip_rt_do_proc_init(struct net *net)
367 {
368 struct proc_dir_entry *pde;
369
370 pde = proc_create("rt_cache", 0444, net->proc_net,
371 &rt_cache_proc_ops);
372 if (!pde)
373 goto err1;
374
375 pde = proc_create("rt_cache", 0444,
376 net->proc_net_stat, &rt_cpu_proc_ops);
377 if (!pde)
378 goto err2;
379
380 #ifdef CONFIG_IP_ROUTE_CLASSID
381 pde = proc_create_single("rt_acct", 0, net->proc_net,
382 rt_acct_proc_show);
383 if (!pde)
384 goto err3;
385 #endif
386 return 0;
387
388 #ifdef CONFIG_IP_ROUTE_CLASSID
389 err3:
390 remove_proc_entry("rt_cache", net->proc_net_stat);
391 #endif
392 err2:
393 remove_proc_entry("rt_cache", net->proc_net);
394 err1:
395 return -ENOMEM;
396 }
397
398 static void __net_exit ip_rt_do_proc_exit(struct net *net)
399 {
400 remove_proc_entry("rt_cache", net->proc_net_stat);
401 remove_proc_entry("rt_cache", net->proc_net);
402 #ifdef CONFIG_IP_ROUTE_CLASSID
403 remove_proc_entry("rt_acct", net->proc_net);
404 #endif
405 }
406
407 static struct pernet_operations ip_rt_proc_ops __net_initdata = {
408 .init = ip_rt_do_proc_init,
409 .exit = ip_rt_do_proc_exit,
410 };
411
412 static int __init ip_rt_proc_init(void)
413 {
414 return register_pernet_subsys(&ip_rt_proc_ops);
415 }
416
417 #else
418 static inline int ip_rt_proc_init(void)
419 {
420 return 0;
421 }
422 #endif /* CONFIG_PROC_FS */
423
424 static inline bool rt_is_expired(const struct rtable *rth)
425 {
426 return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
427 }
428
429 void rt_cache_flush(struct net *net)
430 {
431 rt_genid_bump_ipv4(net);
432 }
433
434 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
435 struct sk_buff *skb,
436 const void *daddr)
437 {
438 const struct rtable *rt = container_of(dst, struct rtable, dst);
439 struct net_device *dev = dst->dev;
440 struct neighbour *n;
441
442 rcu_read_lock_bh();
443
444 if (likely(rt->rt_gw_family == AF_INET)) {
445 n = ip_neigh_gw4(dev, rt->rt_gw4);
446 } else if (rt->rt_gw_family == AF_INET6) {
447 n = ip_neigh_gw6(dev, &rt->rt_gw6);
448 } else {
449 __be32 pkey;
450
451 pkey = skb ? ip_hdr(skb)->daddr : *((__be32 *) daddr);
452 n = ip_neigh_gw4(dev, pkey);
453 }
454
455 if (!IS_ERR(n) && !refcount_inc_not_zero(&n->refcnt))
456 n = NULL;
457
458 rcu_read_unlock_bh();
459
460 return n;
461 }
462
463 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
464 {
465 const struct rtable *rt = container_of(dst, struct rtable, dst);
466 struct net_device *dev = dst->dev;
467 const __be32 *pkey = daddr;
468
469 if (rt->rt_gw_family == AF_INET) {
470 pkey = (const __be32 *)&rt->rt_gw4;
471 } else if (rt->rt_gw_family == AF_INET6) {
472 return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6);
473 } else if (!daddr ||
474 (rt->rt_flags &
475 (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) {
476 return;
477 }
478 __ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
479 }
480
481 /* Hash tables of size 2048..262144 depending on RAM size.
482 * Each bucket uses 8 bytes.
483 */
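/* A bucket is picked by masking a flow hash with ip_idents_mask (see
 * ip_idents_reserve() below); the 8 bytes per bucket are one atomic_t
 * identifier counter in ip_idents[] plus one u32 timestamp in
 * ip_tstamps[].
 */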
484 static u32 ip_idents_mask __read_mostly;
485 static atomic_t *ip_idents __read_mostly;
486 static u32 *ip_tstamps __read_mostly;
487
488 /* In order to protect privacy, we add a perturbation to identifiers
489 * if one generator is seldom used. This makes it hard for an attacker
490 * to infer how many packets were sent between two points in time.
491 */
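/* Rough sketch of the effect, with made-up numbers: if a bucket was last
 * touched 1000 jiffies ago, the reserved ID range starts at the old
 * counter value plus a random delta in [0, 1000), so sampling the IP ID
 * at two points in time does not reveal how many packets were emitted
 * from this bucket in between.
 */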
492 u32 ip_idents_reserve(u32 hash, int segs)
493 {
494 u32 bucket, old, now = (u32)jiffies;
495 atomic_t *p_id;
496 u32 *p_tstamp;
497 u32 delta = 0;
498
499 bucket = hash & ip_idents_mask;
500 p_tstamp = ip_tstamps + bucket;
501 p_id = ip_idents + bucket;
502 old = READ_ONCE(*p_tstamp);
503
504 if (old != now && cmpxchg(p_tstamp, old, now) == old)
505 delta = prandom_u32_max(now - old);
506
507 /* If UBSAN reports an error here, please make sure your compiler
508 * supports -fno-strict-overflow before reporting it; that was a bug
509 * in UBSAN, and it has been fixed in GCC-8.
510 */
511 return atomic_add_return(segs + delta, p_id) - segs;
512 }
513 EXPORT_SYMBOL(ip_idents_reserve);
514
515 void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
516 {
517 u32 hash, id;
518
519 /* Note the following code is not safe, but this is okay. */
520 if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
521 get_random_bytes(&net->ipv4.ip_id_key,
522 sizeof(net->ipv4.ip_id_key));
523
524 hash = siphash_3u32((__force u32)iph->daddr,
525 (__force u32)iph->saddr,
526 iph->protocol,
527 &net->ipv4.ip_id_key);
528 id = ip_idents_reserve(hash, segs);
529 iph->id = htons(id);
530 }
531 EXPORT_SYMBOL(__ip_select_ident);
532
533 static void ip_rt_fix_tos(struct flowi4 *fl4)
534 {
535 __u8 tos = RT_FL_TOS(fl4);
536
537 fl4->flowi4_tos = tos & IPTOS_RT_MASK;
538 fl4->flowi4_scope = tos & RTO_ONLINK ?
539 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE;
540 }
541
542 static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
543 const struct sock *sk,
544 const struct iphdr *iph,
545 int oif, u8 tos,
546 u8 prot, u32 mark, int flow_flags)
547 {
548 if (sk) {
549 const struct inet_sock *inet = inet_sk(sk);
550
551 oif = sk->sk_bound_dev_if;
552 mark = sk->sk_mark;
553 tos = RT_CONN_FLAGS(sk);
554 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
555 }
556 flowi4_init_output(fl4, oif, mark, tos,
557 RT_SCOPE_UNIVERSE, prot,
558 flow_flags,
559 iph->daddr, iph->saddr, 0, 0,
560 sock_net_uid(net, sk));
561 }
562
563 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
564 const struct sock *sk)
565 {
566 const struct net *net = dev_net(skb->dev);
567 const struct iphdr *iph = ip_hdr(skb);
568 int oif = skb->dev->ifindex;
569 u8 tos = RT_TOS(iph->tos);
570 u8 prot = iph->protocol;
571 u32 mark = skb->mark;
572
573 __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
574 }
575
576 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
577 {
578 const struct inet_sock *inet = inet_sk(sk);
579 const struct ip_options_rcu *inet_opt;
580 __be32 daddr = inet->inet_daddr;
581
582 rcu_read_lock();
583 inet_opt = rcu_dereference(inet->inet_opt);
584 if (inet_opt && inet_opt->opt.srr)
585 daddr = inet_opt->opt.faddr;
586 flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
587 RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
588 inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
589 inet_sk_flowi_flags(sk),
590 daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
591 rcu_read_unlock();
592 }
593
594 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
595 const struct sk_buff *skb)
596 {
597 if (skb)
598 build_skb_flow_key(fl4, skb, sk);
599 else
600 build_sk_flow_key(fl4, sk);
601 }
602
603 static DEFINE_SPINLOCK(fnhe_lock);
604
605 static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
606 {
607 struct rtable *rt;
608
609 rt = rcu_dereference(fnhe->fnhe_rth_input);
610 if (rt) {
611 RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
612 dst_dev_put(&rt->dst);
613 dst_release(&rt->dst);
614 }
615 rt = rcu_dereference(fnhe->fnhe_rth_output);
616 if (rt) {
617 RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
618 dst_dev_put(&rt->dst);
619 dst_release(&rt->dst);
620 }
621 }
622
623 static void fnhe_remove_oldest(struct fnhe_hash_bucket *hash)
624 {
625 struct fib_nh_exception __rcu **fnhe_p, **oldest_p;
626 struct fib_nh_exception *fnhe, *oldest = NULL;
627
628 for (fnhe_p = &hash->chain; ; fnhe_p = &fnhe->fnhe_next) {
629 fnhe = rcu_dereference_protected(*fnhe_p,
630 lockdep_is_held(&fnhe_lock));
631 if (!fnhe)
632 break;
633 if (!oldest ||
634 time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp)) {
635 oldest = fnhe;
636 oldest_p = fnhe_p;
637 }
638 }
639 fnhe_flush_routes(oldest);
640 *oldest_p = oldest->fnhe_next;
641 kfree_rcu(oldest, rcu);
642 }
643
644 static u32 fnhe_hashfun(__be32 daddr)
645 {
646 static siphash_key_t fnhe_hash_key __read_mostly;
647 u64 hval;
648
649 net_get_random_once(&fnhe_hash_key, sizeof(fnhe_hash_key));
650 hval = siphash_1u32((__force u32)daddr, &fnhe_hash_key);
651 return hash_64(hval, FNHE_HASH_SHIFT);
652 }
653
654 static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
655 {
656 rt->rt_pmtu = fnhe->fnhe_pmtu;
657 rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
658 rt->dst.expires = fnhe->fnhe_expires;
659
660 if (fnhe->fnhe_gw) {
661 rt->rt_flags |= RTCF_REDIRECTED;
662 rt->rt_uses_gateway = 1;
663 rt->rt_gw_family = AF_INET;
664 rt->rt_gw4 = fnhe->fnhe_gw;
665 }
666 }
667
668 static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr,
669 __be32 gw, u32 pmtu, bool lock,
670 unsigned long expires)
671 {
672 struct fnhe_hash_bucket *hash;
673 struct fib_nh_exception *fnhe;
674 struct rtable *rt;
675 u32 genid, hval;
676 unsigned int i;
677 int depth;
678
679 genid = fnhe_genid(dev_net(nhc->nhc_dev));
680 hval = fnhe_hashfun(daddr);
681
682 spin_lock_bh(&fnhe_lock);
683
684 hash = rcu_dereference(nhc->nhc_exceptions);
685 if (!hash) {
686 hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
687 if (!hash)
688 goto out_unlock;
689 rcu_assign_pointer(nhc->nhc_exceptions, hash);
690 }
691
692 hash += hval;
693
694 depth = 0;
695 for (fnhe = rcu_dereference(hash->chain); fnhe;
696 fnhe = rcu_dereference(fnhe->fnhe_next)) {
697 if (fnhe->fnhe_daddr == daddr)
698 break;
699 depth++;
700 }
701
702 if (fnhe) {
703 if (fnhe->fnhe_genid != genid)
704 fnhe->fnhe_genid = genid;
705 if (gw)
706 fnhe->fnhe_gw = gw;
707 if (pmtu) {
708 fnhe->fnhe_pmtu = pmtu;
709 fnhe->fnhe_mtu_locked = lock;
710 }
711 fnhe->fnhe_expires = max(1UL, expires);
712 /* Update all cached dsts too */
713 rt = rcu_dereference(fnhe->fnhe_rth_input);
714 if (rt)
715 fill_route_from_fnhe(rt, fnhe);
716 rt = rcu_dereference(fnhe->fnhe_rth_output);
717 if (rt)
718 fill_route_from_fnhe(rt, fnhe);
719 } else {
720 /* Randomize max depth to avoid some side channels attacks. */
721 int max_depth = FNHE_RECLAIM_DEPTH +
722 prandom_u32_max(FNHE_RECLAIM_DEPTH);
723
724 while (depth > max_depth) {
725 fnhe_remove_oldest(hash);
726 depth--;
727 }
728
729 fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
730 if (!fnhe)
731 goto out_unlock;
732
733 fnhe->fnhe_next = hash->chain;
734
735 fnhe->fnhe_genid = genid;
736 fnhe->fnhe_daddr = daddr;
737 fnhe->fnhe_gw = gw;
738 fnhe->fnhe_pmtu = pmtu;
739 fnhe->fnhe_mtu_locked = lock;
740 fnhe->fnhe_expires = max(1UL, expires);
741
742 rcu_assign_pointer(hash->chain, fnhe);
743
744 /* Exception created; mark the cached routes for the nexthop
745 * stale, so anyone caching it rechecks if this exception
746 * applies to them.
747 */
748 rt = rcu_dereference(nhc->nhc_rth_input);
749 if (rt)
750 rt->dst.obsolete = DST_OBSOLETE_KILL;
751
752 for_each_possible_cpu(i) {
753 struct rtable __rcu **prt;
754 prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i);
755 rt = rcu_dereference(*prt);
756 if (rt)
757 rt->dst.obsolete = DST_OBSOLETE_KILL;
758 }
759 }
760
761 fnhe->fnhe_stamp = jiffies;
762
763 out_unlock:
764 spin_unlock_bh(&fnhe_lock);
765 }
766
767 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
768 bool kill_route)
769 {
770 __be32 new_gw = icmp_hdr(skb)->un.gateway;
771 __be32 old_gw = ip_hdr(skb)->saddr;
772 struct net_device *dev = skb->dev;
773 struct in_device *in_dev;
774 struct fib_result res;
775 struct neighbour *n;
776 struct net *net;
777
778 switch (icmp_hdr(skb)->code & 7) {
779 case ICMP_REDIR_NET:
780 case ICMP_REDIR_NETTOS:
781 case ICMP_REDIR_HOST:
782 case ICMP_REDIR_HOSTTOS:
783 break;
784
785 default:
786 return;
787 }
788
789 if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw)
790 return;
791
792 in_dev = __in_dev_get_rcu(dev);
793 if (!in_dev)
794 return;
795
796 net = dev_net(dev);
797 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
798 ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
799 ipv4_is_zeronet(new_gw))
800 goto reject_redirect;
801
802 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
803 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
804 goto reject_redirect;
805 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
806 goto reject_redirect;
807 } else {
808 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
809 goto reject_redirect;
810 }
811
812 n = __ipv4_neigh_lookup(rt->dst.dev, (__force u32)new_gw);
813 if (!n)
814 n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
815 if (!IS_ERR(n)) {
816 if (!(n->nud_state & NUD_VALID)) {
817 neigh_event_send(n, NULL);
818 } else {
819 if (fib_lookup(net, fl4, &res, 0) == 0) {
820 struct fib_nh_common *nhc;
821
822 fib_select_path(net, &res, fl4, skb);
823 nhc = FIB_RES_NHC(res);
824 update_or_create_fnhe(nhc, fl4->daddr, new_gw,
825 0, false,
826 jiffies + ip_rt_gc_timeout);
827 }
828 if (kill_route)
829 rt->dst.obsolete = DST_OBSOLETE_KILL;
830 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
831 }
832 neigh_release(n);
833 }
834 return;
835
836 reject_redirect:
837 #ifdef CONFIG_IP_ROUTE_VERBOSE
838 if (IN_DEV_LOG_MARTIANS(in_dev)) {
839 const struct iphdr *iph = (const struct iphdr *) skb->data;
840 __be32 daddr = iph->daddr;
841 __be32 saddr = iph->saddr;
842
843 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
844 " Advised path = %pI4 -> %pI4\n",
845 &old_gw, dev->name, &new_gw,
846 &saddr, &daddr);
847 }
848 #endif
849 ;
850 }
851
852 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
853 {
854 struct rtable *rt;
855 struct flowi4 fl4;
856 const struct iphdr *iph = (const struct iphdr *) skb->data;
857 struct net *net = dev_net(skb->dev);
858 int oif = skb->dev->ifindex;
859 u8 tos = RT_TOS(iph->tos);
860 u8 prot = iph->protocol;
861 u32 mark = skb->mark;
862
863 rt = (struct rtable *) dst;
864
865 __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
866 ip_rt_fix_tos(&fl4);
867 __ip_do_redirect(rt, skb, &fl4, true);
868 }
869
870 static void ipv4_negative_advice(struct sock *sk,
871 struct dst_entry *dst)
872 {
873 struct rtable *rt = (struct rtable *)dst;
874
875 if ((dst->obsolete > 0) ||
876 (rt->rt_flags & RTCF_REDIRECTED) ||
877 rt->dst.expires)
878 sk_dst_reset(sk);
879 }
880
881 /*
882 * Algorithm:
883 * 1. The first ip_rt_redirect_number redirects are sent
884 * with exponential backoff, then we stop sending them at all,
885 * assuming that the host ignores our redirects.
886 * 2. If we did not see packets requiring redirects
887 * during ip_rt_redirect_silence, we assume that the host
888 * forgot the redirected route and start sending redirects again.
889 *
890 * This algorithm is much cheaper and more intelligent than dumb load limiting
891 * in icmp.c.
892 *
893 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
894 * and "frag. need" (breaks PMTU discovery) in icmp.c.
895 */
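/* Worked example with the defaults above, assuming HZ=1000:
 * ip_rt_redirect_load = HZ/50 = 20 jiffies, so after the first redirect
 * the gap required before sending the next one doubles each time (40ms,
 * 80ms, ..., ~5.1s before the 9th). Once ip_rt_redirect_number = 9
 * redirects have been ignored we go quiet until ip_rt_redirect_silence =
 * (HZ/50) << 10 jiffies (~20s) pass without seeing such packets.
 */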
896
897 void ip_rt_send_redirect(struct sk_buff *skb)
898 {
899 struct rtable *rt = skb_rtable(skb);
900 struct in_device *in_dev;
901 struct inet_peer *peer;
902 struct net *net;
903 int log_martians;
904 int vif;
905
906 rcu_read_lock();
907 in_dev = __in_dev_get_rcu(rt->dst.dev);
908 if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
909 rcu_read_unlock();
910 return;
911 }
912 log_martians = IN_DEV_LOG_MARTIANS(in_dev);
913 vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
914 rcu_read_unlock();
915
916 net = dev_net(rt->dst.dev);
917 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
918 if (!peer) {
919 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
920 rt_nexthop(rt, ip_hdr(skb)->daddr));
921 return;
922 }
923
924 /* No redirected packets during ip_rt_redirect_silence;
925 * reset the algorithm.
926 */
927 if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
928 peer->rate_tokens = 0;
929 peer->n_redirects = 0;
930 }
931
932 /* Too many ignored redirects; do not send anything.
933 * Set peer->rate_last to the time of the last seen redirected packet.
934 */
935 if (peer->n_redirects >= ip_rt_redirect_number) {
936 peer->rate_last = jiffies;
937 goto out_put_peer;
938 }
939
940 /* Check for load limit; set rate_last to the latest sent
941 * redirect.
942 */
943 if (peer->n_redirects == 0 ||
944 time_after(jiffies,
945 (peer->rate_last +
946 (ip_rt_redirect_load << peer->n_redirects)))) {
947 __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
948
949 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
950 peer->rate_last = jiffies;
951 ++peer->n_redirects;
952 #ifdef CONFIG_IP_ROUTE_VERBOSE
953 if (log_martians &&
954 peer->n_redirects == ip_rt_redirect_number)
955 net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
956 &ip_hdr(skb)->saddr, inet_iif(skb),
957 &ip_hdr(skb)->daddr, &gw);
958 #endif
959 }
960 out_put_peer:
961 inet_putpeer(peer);
962 }
963
964 static int ip_error(struct sk_buff *skb)
965 {
966 struct rtable *rt = skb_rtable(skb);
967 struct net_device *dev = skb->dev;
968 struct in_device *in_dev;
969 struct inet_peer *peer;
970 unsigned long now;
971 struct net *net;
972 bool send;
973 int code;
974
975 if (netif_is_l3_master(skb->dev)) {
976 dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
977 if (!dev)
978 goto out;
979 }
980
981 in_dev = __in_dev_get_rcu(dev);
982
983 /* IP on this device is disabled. */
984 if (!in_dev)
985 goto out;
986
987 net = dev_net(rt->dst.dev);
988 if (!IN_DEV_FORWARD(in_dev)) {
989 switch (rt->dst.error) {
990 case EHOSTUNREACH:
991 __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
992 break;
993
994 case ENETUNREACH:
995 __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
996 break;
997 }
998 goto out;
999 }
1000
1001 switch (rt->dst.error) {
1002 case EINVAL:
1003 default:
1004 goto out;
1005 case EHOSTUNREACH:
1006 code = ICMP_HOST_UNREACH;
1007 break;
1008 case ENETUNREACH:
1009 code = ICMP_NET_UNREACH;
1010 __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
1011 break;
1012 case EACCES:
1013 code = ICMP_PKT_FILTERED;
1014 break;
1015 }
1016
1017 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
1018 l3mdev_master_ifindex(skb->dev), 1);
1019
1020 send = true;
1021 if (peer) {
1022 now = jiffies;
1023 peer->rate_tokens += now - peer->rate_last;
1024 if (peer->rate_tokens > ip_rt_error_burst)
1025 peer->rate_tokens = ip_rt_error_burst;
1026 peer->rate_last = now;
1027 if (peer->rate_tokens >= ip_rt_error_cost)
1028 peer->rate_tokens -= ip_rt_error_cost;
1029 else
1030 send = false;
1031 inet_putpeer(peer);
1032 }
1033 if (send)
1034 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1035
1036 out: kfree_skb(skb);
1037 return 0;
1038 }
1039
1040 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1041 {
1042 struct dst_entry *dst = &rt->dst;
1043 struct fib_result res;
1044 bool lock = false;
1045 struct net *net;
1046 u32 old_mtu;
1047
1048 if (ip_mtu_locked(dst))
1049 return;
1050
1051 old_mtu = ipv4_mtu(dst);
1052 if (old_mtu < mtu)
1053 return;
1054
1055 rcu_read_lock();
1056 net = dev_net_rcu(dst->dev);
1057 if (mtu < ip_rt_min_pmtu) {
1058 lock = true;
1059 mtu = min(old_mtu, ip_rt_min_pmtu);
1060 }
1061
1062 if (rt->rt_pmtu == mtu && !lock &&
1063 time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
1064 goto out;
1065
1066 if (fib_lookup(net, fl4, &res, 0) == 0) {
1067 struct fib_nh_common *nhc;
1068
1069 fib_select_path(net, &res, fl4, NULL);
1070 nhc = FIB_RES_NHC(res);
1071 update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
1072 jiffies + ip_rt_mtu_expires);
1073 }
1074 out:
1075 rcu_read_unlock();
1076 }
1077
1078 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1079 struct sk_buff *skb, u32 mtu,
1080 bool confirm_neigh)
1081 {
1082 struct rtable *rt = (struct rtable *) dst;
1083 struct flowi4 fl4;
1084
1085 ip_rt_build_flow_key(&fl4, sk, skb);
1086 ip_rt_fix_tos(&fl4);
1087
1088 /* Don't make lookup fail for bridged encapsulations */
1089 if (skb && netif_is_any_bridge_port(skb->dev))
1090 fl4.flowi4_oif = 0;
1091
1092 __ip_rt_update_pmtu(rt, &fl4, mtu);
1093 }
1094
1095 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1096 int oif, u8 protocol)
1097 {
1098 const struct iphdr *iph = (const struct iphdr *)skb->data;
1099 struct flowi4 fl4;
1100 struct rtable *rt;
1101 u32 mark = IP4_REPLY_MARK(net, skb->mark);
1102
1103 __build_flow_key(net, &fl4, NULL, iph, oif,
1104 RT_TOS(iph->tos), protocol, mark, 0);
1105 rt = __ip_route_output_key(net, &fl4);
1106 if (!IS_ERR(rt)) {
1107 __ip_rt_update_pmtu(rt, &fl4, mtu);
1108 ip_rt_put(rt);
1109 }
1110 }
1111 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1112
1113 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1114 {
1115 const struct iphdr *iph = (const struct iphdr *)skb->data;
1116 struct flowi4 fl4;
1117 struct rtable *rt;
1118
1119 __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1120
1121 if (!fl4.flowi4_mark)
1122 fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1123
1124 rt = __ip_route_output_key(sock_net(sk), &fl4);
1125 if (!IS_ERR(rt)) {
1126 __ip_rt_update_pmtu(rt, &fl4, mtu);
1127 ip_rt_put(rt);
1128 }
1129 }
1130
1131 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1132 {
1133 const struct iphdr *iph = (const struct iphdr *)skb->data;
1134 struct flowi4 fl4;
1135 struct rtable *rt;
1136 struct dst_entry *odst = NULL;
1137 bool new = false;
1138 struct net *net = sock_net(sk);
1139
1140 bh_lock_sock(sk);
1141
1142 if (!ip_sk_accept_pmtu(sk))
1143 goto out;
1144
1145 odst = sk_dst_get(sk);
1146
1147 if (sock_owned_by_user(sk) || !odst) {
1148 __ipv4_sk_update_pmtu(skb, sk, mtu);
1149 goto out;
1150 }
1151
1152 __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1153
1154 rt = (struct rtable *)odst;
1155 if (odst->obsolete && !odst->ops->check(odst, 0)) {
1156 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1157 if (IS_ERR(rt))
1158 goto out;
1159
1160 new = true;
1161 } else {
1162 ip_rt_fix_tos(&fl4);
1163 }
1164
1165 __ip_rt_update_pmtu((struct rtable *)xfrm_dst_path(&rt->dst), &fl4, mtu);
1166
1167 if (!dst_check(&rt->dst, 0)) {
1168 if (new)
1169 dst_release(&rt->dst);
1170
1171 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1172 if (IS_ERR(rt))
1173 goto out;
1174
1175 new = true;
1176 }
1177
1178 if (new)
1179 sk_dst_set(sk, &rt->dst);
1180
1181 out:
1182 bh_unlock_sock(sk);
1183 dst_release(odst);
1184 }
1185 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1186
1187 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1188 int oif, u8 protocol)
1189 {
1190 const struct iphdr *iph = (const struct iphdr *)skb->data;
1191 struct flowi4 fl4;
1192 struct rtable *rt;
1193
1194 __build_flow_key(net, &fl4, NULL, iph, oif,
1195 RT_TOS(iph->tos), protocol, 0, 0);
1196 rt = __ip_route_output_key(net, &fl4);
1197 if (!IS_ERR(rt)) {
1198 __ip_do_redirect(rt, skb, &fl4, false);
1199 ip_rt_put(rt);
1200 }
1201 }
1202 EXPORT_SYMBOL_GPL(ipv4_redirect);
1203
1204 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1205 {
1206 const struct iphdr *iph = (const struct iphdr *)skb->data;
1207 struct flowi4 fl4;
1208 struct rtable *rt;
1209 struct net *net = sock_net(sk);
1210
1211 __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1212 rt = __ip_route_output_key(net, &fl4);
1213 if (!IS_ERR(rt)) {
1214 __ip_do_redirect(rt, skb, &fl4, false);
1215 ip_rt_put(rt);
1216 }
1217 }
1218 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1219
1220 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1221 {
1222 struct rtable *rt = (struct rtable *) dst;
1223
1224 /* All IPV4 dsts are created with ->obsolete set to the value
1225 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1226 * into this function always.
1227 *
1228 * When a PMTU/redirect information update invalidates a route,
1229 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1230 * DST_OBSOLETE_DEAD.
1231 */
1232 if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1233 return NULL;
1234 return dst;
1235 }
1236
1237 static void ipv4_send_dest_unreach(struct sk_buff *skb)
1238 {
1239 struct net_device *dev;
1240 struct ip_options opt;
1241 int res;
1242
1243 /* Recompile ip options since IPCB may not be valid anymore.
1244 * Also check we have a reasonable ipv4 header.
1245 */
1246 if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
1247 ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
1248 return;
1249
1250 memset(&opt, 0, sizeof(opt));
1251 if (ip_hdr(skb)->ihl > 5) {
1252 if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
1253 return;
1254 opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);
1255
1256 rcu_read_lock();
1257 dev = skb->dev ? skb->dev : skb_rtable(skb)->dst.dev;
1258 res = __ip_options_compile(dev_net(dev), &opt, skb, NULL);
1259 rcu_read_unlock();
1260
1261 if (res)
1262 return;
1263 }
1264 __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
1265 }
1266
1267 static void ipv4_link_failure(struct sk_buff *skb)
1268 {
1269 struct rtable *rt;
1270
1271 ipv4_send_dest_unreach(skb);
1272
1273 rt = skb_rtable(skb);
1274 if (rt)
1275 dst_set_expires(&rt->dst, 0);
1276 }
1277
1278 static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1279 {
1280 pr_debug("%s: %pI4 -> %pI4, %s\n",
1281 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1282 skb->dev ? skb->dev->name : "?");
1283 kfree_skb(skb);
1284 WARN_ON(1);
1285 return 0;
1286 }
1287
1288 /*
1289 We do not cache the source address of the outgoing interface,
1290 because it is used only by the IP RR, TS and SRR options,
1291 so it is out of the fast path.
1292
1293 BTW remember: "addr" is allowed to be unaligned
1294 in IP options!
1295 */
1296
1297 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1298 {
1299 __be32 src;
1300
1301 if (rt_is_output_route(rt))
1302 src = ip_hdr(skb)->saddr;
1303 else {
1304 struct fib_result res;
1305 struct iphdr *iph = ip_hdr(skb);
1306 struct flowi4 fl4 = {
1307 .daddr = iph->daddr,
1308 .saddr = iph->saddr,
1309 .flowi4_tos = RT_TOS(iph->tos),
1310 .flowi4_oif = rt->dst.dev->ifindex,
1311 .flowi4_iif = skb->dev->ifindex,
1312 .flowi4_mark = skb->mark,
1313 };
1314
1315 rcu_read_lock();
1316 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1317 src = fib_result_prefsrc(dev_net(rt->dst.dev), &res);
1318 else
1319 src = inet_select_addr(rt->dst.dev,
1320 rt_nexthop(rt, iph->daddr),
1321 RT_SCOPE_UNIVERSE);
1322 rcu_read_unlock();
1323 }
1324 memcpy(addr, &src, 4);
1325 }
1326
1327 #ifdef CONFIG_IP_ROUTE_CLASSID
1328 static void set_class_tag(struct rtable *rt, u32 tag)
1329 {
1330 if (!(rt->dst.tclassid & 0xFFFF))
1331 rt->dst.tclassid |= tag & 0xFFFF;
1332 if (!(rt->dst.tclassid & 0xFFFF0000))
1333 rt->dst.tclassid |= tag & 0xFFFF0000;
1334 }
1335 #endif
1336
1337 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1338 {
1339 unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
1340 unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
1341 ip_rt_min_advmss);
1342
1343 return min(advmss, IPV4_MAX_PMTU - header_size);
1344 }
1345
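/* Effective MTU for a cached route, roughly in order of preference:
 * a still-valid learned PMTU (rt_pmtu before dst.expires), then an
 * RTAX_MTU route metric, then the egress device MTU. A locked MTU
 * through a gateway is clamped to 576, the result is capped at
 * IP_MAX_MTU and reduced by any lwtunnel encapsulation headroom.
 */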
1346 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1347 {
1348 const struct rtable *rt = (const struct rtable *)dst;
1349 unsigned int mtu = rt->rt_pmtu;
1350
1351 if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1352 mtu = dst_metric_raw(dst, RTAX_MTU);
1353
1354 if (mtu)
1355 goto out;
1356
1357 mtu = READ_ONCE(dst->dev->mtu);
1358
1359 if (unlikely(ip_mtu_locked(dst))) {
1360 if (rt->rt_uses_gateway && mtu > 576)
1361 mtu = 576;
1362 }
1363
1364 out:
1365 mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
1366
1367 return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1368 }
1369
1370 static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr)
1371 {
1372 struct fnhe_hash_bucket *hash;
1373 struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1374 u32 hval = fnhe_hashfun(daddr);
1375
1376 spin_lock_bh(&fnhe_lock);
1377
1378 hash = rcu_dereference_protected(nhc->nhc_exceptions,
1379 lockdep_is_held(&fnhe_lock));
1380 hash += hval;
1381
1382 fnhe_p = &hash->chain;
1383 fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1384 while (fnhe) {
1385 if (fnhe->fnhe_daddr == daddr) {
1386 rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1387 fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1388 /* set fnhe_daddr to 0 to ensure it won't bind with
1389 * new dsts in rt_bind_exception().
1390 */
1391 fnhe->fnhe_daddr = 0;
1392 fnhe_flush_routes(fnhe);
1393 kfree_rcu(fnhe, rcu);
1394 break;
1395 }
1396 fnhe_p = &fnhe->fnhe_next;
1397 fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1398 lockdep_is_held(&fnhe_lock));
1399 }
1400
1401 spin_unlock_bh(&fnhe_lock);
1402 }
1403
1404 static struct fib_nh_exception *find_exception(struct fib_nh_common *nhc,
1405 __be32 daddr)
1406 {
1407 struct fnhe_hash_bucket *hash = rcu_dereference(nhc->nhc_exceptions);
1408 struct fib_nh_exception *fnhe;
1409 u32 hval;
1410
1411 if (!hash)
1412 return NULL;
1413
1414 hval = fnhe_hashfun(daddr);
1415
1416 for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1417 fnhe = rcu_dereference(fnhe->fnhe_next)) {
1418 if (fnhe->fnhe_daddr == daddr) {
1419 if (fnhe->fnhe_expires &&
1420 time_after(jiffies, fnhe->fnhe_expires)) {
1421 ip_del_fnhe(nhc, daddr);
1422 break;
1423 }
1424 return fnhe;
1425 }
1426 }
1427 return NULL;
1428 }
1429
1430 /* MTU selection:
1431 * 1. mtu on route is locked - use it
1432 * 2. mtu from nexthop exception
1433 * 3. mtu from egress device
1434 */
1435
1436 u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
1437 {
1438 struct fib_nh_common *nhc = res->nhc;
1439 struct net_device *dev = nhc->nhc_dev;
1440 struct fib_info *fi = res->fi;
1441 u32 mtu = 0;
1442
1443 if (READ_ONCE(dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu) ||
1444 fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
1445 mtu = fi->fib_mtu;
1446
1447 if (likely(!mtu)) {
1448 struct fib_nh_exception *fnhe;
1449
1450 fnhe = find_exception(nhc, daddr);
1451 if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
1452 mtu = fnhe->fnhe_pmtu;
1453 }
1454
1455 if (likely(!mtu))
1456 mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);
1457
1458 return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu);
1459 }
1460
1461 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1462 __be32 daddr, const bool do_cache)
1463 {
1464 bool ret = false;
1465
1466 spin_lock_bh(&fnhe_lock);
1467
1468 if (daddr == fnhe->fnhe_daddr) {
1469 struct rtable __rcu **porig;
1470 struct rtable *orig;
1471 int genid = fnhe_genid(dev_net(rt->dst.dev));
1472
1473 if (rt_is_input_route(rt))
1474 porig = &fnhe->fnhe_rth_input;
1475 else
1476 porig = &fnhe->fnhe_rth_output;
1477 orig = rcu_dereference(*porig);
1478
1479 if (fnhe->fnhe_genid != genid) {
1480 fnhe->fnhe_genid = genid;
1481 fnhe->fnhe_gw = 0;
1482 fnhe->fnhe_pmtu = 0;
1483 fnhe->fnhe_expires = 0;
1484 fnhe->fnhe_mtu_locked = false;
1485 fnhe_flush_routes(fnhe);
1486 orig = NULL;
1487 }
1488 fill_route_from_fnhe(rt, fnhe);
1489 if (!rt->rt_gw4) {
1490 rt->rt_gw4 = daddr;
1491 rt->rt_gw_family = AF_INET;
1492 }
1493
1494 if (do_cache) {
1495 dst_hold(&rt->dst);
1496 rcu_assign_pointer(*porig, rt);
1497 if (orig) {
1498 dst_dev_put(&orig->dst);
1499 dst_release(&orig->dst);
1500 }
1501 ret = true;
1502 }
1503
1504 fnhe->fnhe_stamp = jiffies;
1505 }
1506 spin_unlock_bh(&fnhe_lock);
1507
1508 return ret;
1509 }
1510
1511 static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt)
1512 {
1513 struct rtable *orig, *prev, **p;
1514 bool ret = true;
1515
1516 if (rt_is_input_route(rt)) {
1517 p = (struct rtable **)&nhc->nhc_rth_input;
1518 } else {
1519 p = (struct rtable **)raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
1520 }
1521 orig = *p;
1522
1523 /* hold dst before doing cmpxchg() to avoid race condition
1524 * on this dst
1525 */
1526 dst_hold(&rt->dst);
1527 prev = cmpxchg(p, orig, rt);
1528 if (prev == orig) {
1529 if (orig) {
1530 rt_add_uncached_list(orig);
1531 dst_release(&orig->dst);
1532 }
1533 } else {
1534 dst_release(&rt->dst);
1535 ret = false;
1536 }
1537
1538 return ret;
1539 }
1540
1541 struct uncached_list {
1542 spinlock_t lock;
1543 struct list_head head;
1544 };
1545
1546 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
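/* Routes that could not be cached in a FIB nexthop (see the
 * rt_add_uncached_list() callers) are tracked on these per-CPU lists so
 * that rt_flush_dev() can still find them and repoint their dst.dev at
 * blackhole_netdev when the original device goes away.
 */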
1547
1548 void rt_add_uncached_list(struct rtable *rt)
1549 {
1550 struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1551
1552 rt->rt_uncached_list = ul;
1553
1554 spin_lock_bh(&ul->lock);
1555 list_add_tail(&rt->rt_uncached, &ul->head);
1556 spin_unlock_bh(&ul->lock);
1557 }
1558
1559 void rt_del_uncached_list(struct rtable *rt)
1560 {
1561 if (!list_empty(&rt->rt_uncached)) {
1562 struct uncached_list *ul = rt->rt_uncached_list;
1563
1564 spin_lock_bh(&ul->lock);
1565 list_del(&rt->rt_uncached);
1566 spin_unlock_bh(&ul->lock);
1567 }
1568 }
1569
1570 static void ipv4_dst_destroy(struct dst_entry *dst)
1571 {
1572 struct rtable *rt = (struct rtable *)dst;
1573
1574 ip_dst_metrics_put(dst);
1575 rt_del_uncached_list(rt);
1576 }
1577
1578 void rt_flush_dev(struct net_device *dev)
1579 {
1580 struct rtable *rt;
1581 int cpu;
1582
1583 for_each_possible_cpu(cpu) {
1584 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1585
1586 spin_lock_bh(&ul->lock);
1587 list_for_each_entry(rt, &ul->head, rt_uncached) {
1588 if (rt->dst.dev != dev)
1589 continue;
1590 rt->dst.dev = blackhole_netdev;
1591 dev_hold(rt->dst.dev);
1592 dev_put(dev);
1593 }
1594 spin_unlock_bh(&ul->lock);
1595 }
1596 }
1597
1598 static bool rt_cache_valid(const struct rtable *rt)
1599 {
1600 return rt &&
1601 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1602 !rt_is_expired(rt);
1603 }
1604
1605 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1606 const struct fib_result *res,
1607 struct fib_nh_exception *fnhe,
1608 struct fib_info *fi, u16 type, u32 itag,
1609 const bool do_cache)
1610 {
1611 bool cached = false;
1612
1613 if (fi) {
1614 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1615
1616 if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) {
1617 rt->rt_uses_gateway = 1;
1618 rt->rt_gw_family = nhc->nhc_gw_family;
1619 /* only INET and INET6 are supported */
1620 if (likely(nhc->nhc_gw_family == AF_INET))
1621 rt->rt_gw4 = nhc->nhc_gw.ipv4;
1622 else
1623 rt->rt_gw6 = nhc->nhc_gw.ipv6;
1624 }
1625
1626 ip_dst_init_metrics(&rt->dst, fi->fib_metrics);
1627
1628 #ifdef CONFIG_IP_ROUTE_CLASSID
1629 if (nhc->nhc_family == AF_INET) {
1630 struct fib_nh *nh;
1631
1632 nh = container_of(nhc, struct fib_nh, nh_common);
1633 rt->dst.tclassid = nh->nh_tclassid;
1634 }
1635 #endif
1636 rt->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
1637 if (unlikely(fnhe))
1638 cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1639 else if (do_cache)
1640 cached = rt_cache_route(nhc, rt);
1641 if (unlikely(!cached)) {
1642 /* Routes we intend to cache in nexthop exception or
1643 * FIB nexthop have the DST_NOCACHE bit clear.
1644 * However, if we are unsuccessful at storing this
1645 * route into the cache we really need to set it.
1646 */
1647 if (!rt->rt_gw4) {
1648 rt->rt_gw_family = AF_INET;
1649 rt->rt_gw4 = daddr;
1650 }
1651 rt_add_uncached_list(rt);
1652 }
1653 } else
1654 rt_add_uncached_list(rt);
1655
1656 #ifdef CONFIG_IP_ROUTE_CLASSID
1657 #ifdef CONFIG_IP_MULTIPLE_TABLES
1658 set_class_tag(rt, res->tclassid);
1659 #endif
1660 set_class_tag(rt, itag);
1661 #endif
1662 }
1663
1664 struct rtable *rt_dst_alloc(struct net_device *dev,
1665 unsigned int flags, u16 type,
1666 bool nopolicy, bool noxfrm)
1667 {
1668 struct rtable *rt;
1669
1670 rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1671 (nopolicy ? DST_NOPOLICY : 0) |
1672 (noxfrm ? DST_NOXFRM : 0));
1673
1674 if (rt) {
1675 rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1676 rt->rt_flags = flags;
1677 rt->rt_type = type;
1678 rt->rt_is_input = 0;
1679 rt->rt_iif = 0;
1680 rt->rt_pmtu = 0;
1681 rt->rt_mtu_locked = 0;
1682 rt->rt_uses_gateway = 0;
1683 rt->rt_gw_family = 0;
1684 rt->rt_gw4 = 0;
1685 INIT_LIST_HEAD(&rt->rt_uncached);
1686
1687 rt->dst.output = ip_output;
1688 if (flags & RTCF_LOCAL)
1689 rt->dst.input = ip_local_deliver;
1690 }
1691
1692 return rt;
1693 }
1694 EXPORT_SYMBOL(rt_dst_alloc);
1695
1696 struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt)
1697 {
1698 struct rtable *new_rt;
1699
1700 new_rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1701 rt->dst.flags);
1702
1703 if (new_rt) {
1704 new_rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1705 new_rt->rt_flags = rt->rt_flags;
1706 new_rt->rt_type = rt->rt_type;
1707 new_rt->rt_is_input = rt->rt_is_input;
1708 new_rt->rt_iif = rt->rt_iif;
1709 new_rt->rt_pmtu = rt->rt_pmtu;
1710 new_rt->rt_mtu_locked = rt->rt_mtu_locked;
1711 new_rt->rt_gw_family = rt->rt_gw_family;
1712 if (rt->rt_gw_family == AF_INET)
1713 new_rt->rt_gw4 = rt->rt_gw4;
1714 else if (rt->rt_gw_family == AF_INET6)
1715 new_rt->rt_gw6 = rt->rt_gw6;
1716 INIT_LIST_HEAD(&new_rt->rt_uncached);
1717
1718 new_rt->dst.input = rt->dst.input;
1719 new_rt->dst.output = rt->dst.output;
1720 new_rt->dst.error = rt->dst.error;
1721 new_rt->dst.lastuse = jiffies;
1722 new_rt->dst.lwtstate = lwtstate_get(rt->dst.lwtstate);
1723 }
1724 return new_rt;
1725 }
1726 EXPORT_SYMBOL(rt_dst_clone);
1727
1728 /* called in rcu_read_lock() section */
1729 int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1730 u8 tos, struct net_device *dev,
1731 struct in_device *in_dev, u32 *itag)
1732 {
1733 int err;
1734
1735 /* Primary sanity checks. */
1736 if (!in_dev)
1737 return -EINVAL;
1738
1739 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1740 skb->protocol != htons(ETH_P_IP))
1741 return -EINVAL;
1742
1743 if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1744 return -EINVAL;
1745
1746 if (ipv4_is_zeronet(saddr)) {
1747 if (!ipv4_is_local_multicast(daddr) &&
1748 ip_hdr(skb)->protocol != IPPROTO_IGMP)
1749 return -EINVAL;
1750 } else {
1751 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1752 in_dev, itag);
1753 if (err < 0)
1754 return err;
1755 }
1756 return 0;
1757 }
1758
1759 /* called in rcu_read_lock() section */
1760 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1761 u8 tos, struct net_device *dev, int our)
1762 {
1763 struct in_device *in_dev = __in_dev_get_rcu(dev);
1764 unsigned int flags = RTCF_MULTICAST;
1765 struct rtable *rth;
1766 bool no_policy;
1767 u32 itag = 0;
1768 int err;
1769
1770 err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1771 if (err)
1772 return err;
1773
1774 if (our)
1775 flags |= RTCF_LOCAL;
1776
1777 no_policy = IN_DEV_ORCONF(in_dev, NOPOLICY);
1778 if (no_policy)
1779 IPCB(skb)->flags |= IPSKB_NOPOLICY;
1780
1781 rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1782 no_policy, false);
1783 if (!rth)
1784 return -ENOBUFS;
1785
1786 #ifdef CONFIG_IP_ROUTE_CLASSID
1787 rth->dst.tclassid = itag;
1788 #endif
1789 rth->dst.output = ip_rt_bug;
1790 rth->rt_is_input = 1;
1791
1792 #ifdef CONFIG_IP_MROUTE
1793 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1794 rth->dst.input = ip_mr_input;
1795 #endif
1796 RT_CACHE_STAT_INC(in_slow_mc);
1797
1798 skb_dst_drop(skb);
1799 skb_dst_set(skb, &rth->dst);
1800 return 0;
1801 }
1802
1803
1804 static void ip_handle_martian_source(struct net_device *dev,
1805 struct in_device *in_dev,
1806 struct sk_buff *skb,
1807 __be32 daddr,
1808 __be32 saddr)
1809 {
1810 RT_CACHE_STAT_INC(in_martian_src);
1811 #ifdef CONFIG_IP_ROUTE_VERBOSE
1812 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1813 /*
1814 * RFC 1812 recommendation: if the source is martian,
1815 * the only hint we can give is the MAC header.
1816 */
1817 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1818 &daddr, &saddr, dev->name);
1819 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1820 print_hex_dump(KERN_WARNING, "ll header: ",
1821 DUMP_PREFIX_OFFSET, 16, 1,
1822 skb_mac_header(skb),
1823 dev->hard_header_len, false);
1824 }
1825 }
1826 #endif
1827 }
1828
1829 /* called in rcu_read_lock() section */
1830 static int __mkroute_input(struct sk_buff *skb,
1831 const struct fib_result *res,
1832 struct in_device *in_dev,
1833 __be32 daddr, __be32 saddr, u32 tos)
1834 {
1835 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1836 struct net_device *dev = nhc->nhc_dev;
1837 struct fib_nh_exception *fnhe;
1838 struct rtable *rth;
1839 int err;
1840 struct in_device *out_dev;
1841 bool do_cache, no_policy;
1842 u32 itag = 0;
1843
1844 /* get a working reference to the output device */
1845 out_dev = __in_dev_get_rcu(dev);
1846 if (!out_dev) {
1847 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1848 return -EINVAL;
1849 }
1850
1851 err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1852 in_dev->dev, in_dev, &itag);
1853 if (err < 0) {
1854 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1855 saddr);
1856
1857 goto cleanup;
1858 }
1859
1860 do_cache = res->fi && !itag;
1861 if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1862 skb->protocol == htons(ETH_P_IP)) {
1863 __be32 gw;
1864
1865 gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0;
1866 if (IN_DEV_SHARED_MEDIA(out_dev) ||
1867 inet_addr_onlink(out_dev, saddr, gw))
1868 IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1869 }
1870
1871 if (skb->protocol != htons(ETH_P_IP)) {
1872 /* Not IP (i.e. ARP). Do not create a route if it is
1873 * invalid for proxy ARP. DNAT routes are always valid.
1874 *
1875 * The proxy ARP feature has been extended to allow ARP
1876 * replies back out the same interface, to support
1877 * Private VLAN switch technologies. See arp.c.
1878 */
1879 if (out_dev == in_dev &&
1880 IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1881 err = -EINVAL;
1882 goto cleanup;
1883 }
1884 }
1885
1886 no_policy = IN_DEV_ORCONF(in_dev, NOPOLICY);
1887 if (no_policy)
1888 IPCB(skb)->flags |= IPSKB_NOPOLICY;
1889
1890 fnhe = find_exception(nhc, daddr);
1891 if (do_cache) {
1892 if (fnhe)
1893 rth = rcu_dereference(fnhe->fnhe_rth_input);
1894 else
1895 rth = rcu_dereference(nhc->nhc_rth_input);
1896 if (rt_cache_valid(rth)) {
1897 skb_dst_set_noref(skb, &rth->dst);
1898 goto out;
1899 }
1900 }
1901
1902 rth = rt_dst_alloc(out_dev->dev, 0, res->type, no_policy,
1903 IN_DEV_ORCONF(out_dev, NOXFRM));
1904 if (!rth) {
1905 err = -ENOBUFS;
1906 goto cleanup;
1907 }
1908
1909 rth->rt_is_input = 1;
1910 RT_CACHE_STAT_INC(in_slow_tot);
1911
1912 rth->dst.input = ip_forward;
1913
1914 rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1915 do_cache);
1916 lwtunnel_set_redirect(&rth->dst);
1917 skb_dst_set(skb, &rth->dst);
1918 out:
1919 err = 0;
1920 cleanup:
1921 return err;
1922 }
1923
1924 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1925 /* To make ICMP packets follow the right flow, the multipath hash is
1926 * calculated from the inner IP addresses.
1927 */
1928 static void ip_multipath_l3_keys(const struct sk_buff *skb,
1929 struct flow_keys *hash_keys)
1930 {
1931 const struct iphdr *outer_iph = ip_hdr(skb);
1932 const struct iphdr *key_iph = outer_iph;
1933 const struct iphdr *inner_iph;
1934 const struct icmphdr *icmph;
1935 struct iphdr _inner_iph;
1936 struct icmphdr _icmph;
1937
1938 if (likely(outer_iph->protocol != IPPROTO_ICMP))
1939 goto out;
1940
1941 if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1942 goto out;
1943
1944 icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1945 &_icmph);
1946 if (!icmph)
1947 goto out;
1948
1949 if (!icmp_is_err(icmph->type))
1950 goto out;
1951
1952 inner_iph = skb_header_pointer(skb,
1953 outer_iph->ihl * 4 + sizeof(_icmph),
1954 sizeof(_inner_iph), &_inner_iph);
1955 if (!inner_iph)
1956 goto out;
1957
1958 key_iph = inner_iph;
1959 out:
1960 hash_keys->addrs.v4addrs.src = key_iph->saddr;
1961 hash_keys->addrs.v4addrs.dst = key_iph->daddr;
1962 }
1963
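/* The fib_multipath_hash_policy sysctl selects which fields feed the
 * hash below: 0 hashes the layer 3 addresses, 1 hashes the layer 4
 * tuple (addresses, ports, protocol), and 2 hashes the layer 3 fields
 * of the innermost packet found by the flow dissector.
 */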
1964 /* if skb is set it will be used and fl4 can be NULL */
1965 int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
1966 const struct sk_buff *skb, struct flow_keys *flkeys)
1967 {
1968 u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0;
1969 struct flow_keys hash_keys;
1970 u32 mhash;
1971
1972 switch (READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_policy)) {
1973 case 0:
1974 memset(&hash_keys, 0, sizeof(hash_keys));
1975 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1976 if (skb) {
1977 ip_multipath_l3_keys(skb, &hash_keys);
1978 } else {
1979 hash_keys.addrs.v4addrs.src = fl4->saddr;
1980 hash_keys.addrs.v4addrs.dst = fl4->daddr;
1981 }
1982 break;
1983 case 1:
1984 /* skb is currently provided only when forwarding */
1985 if (skb) {
1986 unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1987 struct flow_keys keys;
1988
1989 /* short-circuit if we already have L4 hash present */
1990 if (skb->l4_hash)
1991 return skb_get_hash_raw(skb) >> 1;
1992
1993 memset(&hash_keys, 0, sizeof(hash_keys));
1994
1995 if (!flkeys) {
1996 skb_flow_dissect_flow_keys(skb, &keys, flag);
1997 flkeys = &keys;
1998 }
1999
2000 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2001 hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
2002 hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
2003 hash_keys.ports.src = flkeys->ports.src;
2004 hash_keys.ports.dst = flkeys->ports.dst;
2005 hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
2006 } else {
2007 memset(&hash_keys, 0, sizeof(hash_keys));
2008 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2009 hash_keys.addrs.v4addrs.src = fl4->saddr;
2010 hash_keys.addrs.v4addrs.dst = fl4->daddr;
2011 hash_keys.ports.src = fl4->fl4_sport;
2012 hash_keys.ports.dst = fl4->fl4_dport;
2013 hash_keys.basic.ip_proto = fl4->flowi4_proto;
2014 }
2015 break;
2016 case 2:
2017 memset(&hash_keys, 0, sizeof(hash_keys));
2018 /* skb is currently provided only when forwarding */
2019 if (skb) {
2020 struct flow_keys keys;
2021
2022 skb_flow_dissect_flow_keys(skb, &keys, 0);
2023 /* Inner can be v4 or v6 */
2024 if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
2025 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2026 hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
2027 hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
2028 } else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
2029 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2030 hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
2031 hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
2032 hash_keys.tags.flow_label = keys.tags.flow_label;
2033 hash_keys.basic.ip_proto = keys.basic.ip_proto;
2034 } else {
2035 /* Same as case 0 */
2036 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2037 ip_multipath_l3_keys(skb, &hash_keys);
2038 }
2039 } else {
2040 /* Same as case 0 */
2041 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2042 hash_keys.addrs.v4addrs.src = fl4->saddr;
2043 hash_keys.addrs.v4addrs.dst = fl4->daddr;
2044 }
2045 break;
2046 }
2047 mhash = flow_hash_from_keys(&hash_keys);
2048
2049 if (multipath_hash)
2050 mhash = jhash_2words(mhash, multipath_hash, 0);
2051
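/* Keep the result within 31 bits; callers such as fib_select_multipath()
 * compare it against per-nexthop upper bounds.
 */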
2052 return mhash >> 1;
2053 }
2054 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
2055
2056 static int ip_mkroute_input(struct sk_buff *skb,
2057 struct fib_result *res,
2058 struct in_device *in_dev,
2059 __be32 daddr, __be32 saddr, u32 tos,
2060 struct flow_keys *hkeys)
2061 {
2062 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2063 if (res->fi && fib_info_num_path(res->fi) > 1) {
2064 int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
2065
2066 fib_select_multipath(res, h);
2067 IPCB(skb)->flags |= IPSKB_MULTIPATH;
2068 }
2069 #endif
2070
2071 /* create a routing cache entry */
2072 return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
2073 }
2074
2075 /* Implements the same saddr-related checks as ip_route_input_slow(),
2076 * assuming daddr is valid and the destination is not a local broadcast one.
2077 * Uses the provided hint instead of performing a route lookup.
2078 */
2079 int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2080 u8 tos, struct net_device *dev,
2081 const struct sk_buff *hint)
2082 {
2083 struct in_device *in_dev = __in_dev_get_rcu(dev);
2084 struct rtable *rt = skb_rtable(hint);
2085 struct net *net = dev_net(dev);
2086 int err = -EINVAL;
2087 u32 tag = 0;
2088
2089 if (!in_dev)
2090 return -EINVAL;
2091
2092 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2093 goto martian_source;
2094
2095 if (ipv4_is_zeronet(saddr))
2096 goto martian_source;
2097
2098 if (ipv4_is_loopback(saddr) && !IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2099 goto martian_source;
2100
2101 if (rt->rt_type != RTN_LOCAL)
2102 goto skip_validate_source;
2103
2104 tos &= IPTOS_RT_MASK;
2105 err = fib_validate_source(skb, saddr, daddr, tos, 0, dev, in_dev, &tag);
2106 if (err < 0)
2107 goto martian_source;
2108
2109 skip_validate_source:
2110 skb_dst_copy(skb, hint);
2111 return 0;
2112
2113 martian_source:
2114 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2115 return err;
2116 }
2117
2118 /* get device for dst_alloc with local routes */
2119 static struct net_device *ip_rt_get_dev(struct net *net,
2120 const struct fib_result *res)
2121 {
2122 struct fib_nh_common *nhc = res->fi ? res->nhc : NULL;
2123 struct net_device *dev = NULL;
2124
2125 if (nhc)
2126 dev = l3mdev_master_dev_rcu(nhc->nhc_dev);
2127
2128 return dev ? : net->loopback_dev;
2129 }
2130
2131 /*
2132 * NOTE. We drop all packets that have local source
2133 * addresses, because every properly looped-back packet
2134 * must already have the correct destination attached by the output routine.
2135 * Changes in the enforced policies must also be applied to
2136 * ip_route_use_hint().
2137 *
2138 * Such an approach solves two big problems:
2139 * 1. Non-simplex devices are handled properly.
2140 * 2. IP spoofing attempts are filtered with a 100% guarantee.
2141 * called with rcu_read_lock()
2142 */
2143
2144 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2145 u8 tos, struct net_device *dev,
2146 struct fib_result *res)
2147 {
2148 struct in_device *in_dev = __in_dev_get_rcu(dev);
2149 struct flow_keys *flkeys = NULL, _flkeys;
2150 struct net *net = dev_net(dev);
2151 struct ip_tunnel_info *tun_info;
2152 int err = -EINVAL;
2153 unsigned int flags = 0;
2154 u32 itag = 0;
2155 struct rtable *rth;
2156 struct flowi4 fl4;
2157 bool do_cache = true;
2158 bool no_policy;
2159
2160 /* IP on this device is disabled. */
2161
2162 if (!in_dev)
2163 goto out;
2164
2165 /* Check for the most weird martians, which cannot be detected
2166 by fib_lookup.
2167 */
2168
2169 tun_info = skb_tunnel_info(skb);
2170 if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2171 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
2172 else
2173 fl4.flowi4_tun_key.tun_id = 0;
2174 skb_dst_drop(skb);
2175
2176 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2177 goto martian_source;
2178
2179 res->fi = NULL;
2180 res->table = NULL;
2181 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2182 goto brd_input;
2183
2184 /* Accept zero addresses only to limited broadcast;
2185 * I do not even know whether to fix it or not. Waiting for complaints :-)
2186 */
2187 if (ipv4_is_zeronet(saddr))
2188 goto martian_source;
2189
2190 if (ipv4_is_zeronet(daddr))
2191 goto martian_destination;
2192
2193 /* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET()
2194 * more than once, calling it only if daddr and/or saddr is a loopback address
2195 */
2196 if (ipv4_is_loopback(daddr)) {
2197 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2198 goto martian_destination;
2199 } else if (ipv4_is_loopback(saddr)) {
2200 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2201 goto martian_source;
2202 }
2203
2204 /*
2205 * Now we are ready to route packet.
2206 */
2207 fl4.flowi4_oif = 0;
2208 fl4.flowi4_iif = dev->ifindex;
2209 fl4.flowi4_mark = skb->mark;
2210 fl4.flowi4_tos = tos;
2211 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2212 fl4.flowi4_flags = 0;
2213 fl4.daddr = daddr;
2214 fl4.saddr = saddr;
2215 fl4.flowi4_uid = sock_net_uid(net, NULL);
2216 fl4.flowi4_multipath_hash = 0;
2217
2218 if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
2219 flkeys = &_flkeys;
2220 } else {
2221 fl4.flowi4_proto = 0;
2222 fl4.fl4_sport = 0;
2223 fl4.fl4_dport = 0;
2224 }
2225
2226 err = fib_lookup(net, &fl4, res, 0);
2227 if (err != 0) {
2228 if (!IN_DEV_FORWARD(in_dev))
2229 err = -EHOSTUNREACH;
2230 goto no_route;
2231 }
2232
2233 if (res->type == RTN_BROADCAST) {
2234 if (IN_DEV_BFORWARD(in_dev))
2235 goto make_route;
2236 /* do not cache if bc_forwarding is enabled */
2237 if (IPV4_DEVCONF_ALL(net, BC_FORWARDING))
2238 do_cache = false;
2239 goto brd_input;
2240 }
2241
2242 if (res->type == RTN_LOCAL) {
2243 err = fib_validate_source(skb, saddr, daddr, tos,
2244 0, dev, in_dev, &itag);
2245 if (err < 0)
2246 goto martian_source;
2247 goto local_input;
2248 }
2249
2250 if (!IN_DEV_FORWARD(in_dev)) {
2251 err = -EHOSTUNREACH;
2252 goto no_route;
2253 }
2254 if (res->type != RTN_UNICAST)
2255 goto martian_destination;
2256
2257 make_route:
2258 err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
2259 out: return err;
2260
2261 brd_input:
2262 if (skb->protocol != htons(ETH_P_IP))
2263 goto e_inval;
2264
2265 if (!ipv4_is_zeronet(saddr)) {
2266 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2267 in_dev, &itag);
2268 if (err < 0)
2269 goto martian_source;
2270 }
2271 flags |= RTCF_BROADCAST;
2272 res->type = RTN_BROADCAST;
2273 RT_CACHE_STAT_INC(in_brd);
2274
2275 local_input:
2276 no_policy = IN_DEV_ORCONF(in_dev, NOPOLICY);
2277 if (no_policy)
2278 IPCB(skb)->flags |= IPSKB_NOPOLICY;
2279
2280 do_cache &= res->fi && !itag;
2281 if (do_cache) {
2282 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2283
2284 rth = rcu_dereference(nhc->nhc_rth_input);
2285 if (rt_cache_valid(rth)) {
2286 skb_dst_set_noref(skb, &rth->dst);
2287 err = 0;
2288 goto out;
2289 }
2290 }
2291
2292 rth = rt_dst_alloc(ip_rt_get_dev(net, res),
2293 flags | RTCF_LOCAL, res->type,
2294 no_policy, false);
2295 if (!rth)
2296 goto e_nobufs;
2297
2298 rth->dst.output = ip_rt_bug;
2299 #ifdef CONFIG_IP_ROUTE_CLASSID
2300 rth->dst.tclassid = itag;
2301 #endif
2302 rth->rt_is_input = 1;
2303
2304 RT_CACHE_STAT_INC(in_slow_tot);
2305 if (res->type == RTN_UNREACHABLE) {
2306 rth->dst.input = ip_error;
2307 rth->dst.error = -err;
2308 rth->rt_flags &= ~RTCF_LOCAL;
2309 }
2310
2311 if (do_cache) {
2312 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2313
2314 rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
2315 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2316 WARN_ON(rth->dst.input == lwtunnel_input);
2317 rth->dst.lwtstate->orig_input = rth->dst.input;
2318 rth->dst.input = lwtunnel_input;
2319 }
2320
2321 if (unlikely(!rt_cache_route(nhc, rth)))
2322 rt_add_uncached_list(rth);
2323 }
2324 skb_dst_set(skb, &rth->dst);
2325 err = 0;
2326 goto out;
2327
2328 no_route:
2329 RT_CACHE_STAT_INC(in_no_route);
2330 res->type = RTN_UNREACHABLE;
2331 res->fi = NULL;
2332 res->table = NULL;
2333 goto local_input;
2334
2335 /*
2336 * Do not cache martian addresses: they should be logged (RFC1812)
2337 */
2338 martian_destination:
2339 RT_CACHE_STAT_INC(in_martian_dst);
2340 #ifdef CONFIG_IP_ROUTE_VERBOSE
2341 if (IN_DEV_LOG_MARTIANS(in_dev))
2342 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2343 &daddr, &saddr, dev->name);
2344 #endif
2345
2346 e_inval:
2347 err = -EINVAL;
2348 goto out;
2349
2350 e_nobufs:
2351 err = -ENOBUFS;
2352 goto out;
2353
2354 martian_source:
2355 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2356 goto out;
2357 }
2358
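/* Resolve the input route for an skb without holding a reference on the
 * resulting dst; the whole lookup runs under rcu_read_lock().
 */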
2359 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2360 u8 tos, struct net_device *dev)
2361 {
2362 struct fib_result res;
2363 int err;
2364
2365 tos &= IPTOS_RT_MASK;
2366 rcu_read_lock();
2367 err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2368 rcu_read_unlock();
2369
2370 return err;
2371 }
2372 EXPORT_SYMBOL(ip_route_input_noref);
2373
2374 /* called with rcu_read_lock held */
2375 int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2376 u8 tos, struct net_device *dev, struct fib_result *res)
2377 {
2378 /* Multicast recognition logic is moved from the route cache to here.
2379 The problem was that too many Ethernet cards have broken/missing
2380 hardware multicast filters :-( As a result a host on a multicast
2381 network acquires a lot of useless route cache entries, sort of
2382 SDR messages from all over the world. Now we try to get rid of them.
2383 Really, provided the software IP multicast filter is organized
2384 reasonably (at least, hashed), it does not result in a slowdown
2385 compared with route cache reject entries.
2386 Note that multicast routers are not affected, because
2387 a route cache entry is created eventually.
2388 */
2389 if (ipv4_is_multicast(daddr)) {
2390 struct in_device *in_dev = __in_dev_get_rcu(dev);
2391 int our = 0;
2392 int err = -EINVAL;
2393
2394 if (!in_dev)
2395 return err;
2396 our = ip_check_mc_rcu(in_dev, daddr, saddr,
2397 ip_hdr(skb)->protocol);
2398
2399 /* check l3 master if no match yet */
2400 if (!our && netif_is_l3_slave(dev)) {
2401 struct in_device *l3_in_dev;
2402
2403 l3_in_dev = __in_dev_get_rcu(skb->dev);
2404 if (l3_in_dev)
2405 our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2406 ip_hdr(skb)->protocol);
2407 }
2408
2409 if (our
2410 #ifdef CONFIG_IP_MROUTE
2411 ||
2412 (!ipv4_is_local_multicast(daddr) &&
2413 IN_DEV_MFORWARD(in_dev))
2414 #endif
2415 ) {
2416 err = ip_route_input_mc(skb, daddr, saddr,
2417 tos, dev, our);
2418 }
2419 return err;
2420 }
2421
2422 return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2423 }
2424
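/* Build the output rtable for a FIB lookup result. A per-nexthop or
 * per-exception cached entry is reused while it is still valid; otherwise
 * a new dst is allocated and, when allowed, cached for later lookups.
 */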
2425 /* called with rcu_read_lock() */
2426 static struct rtable *__mkroute_output(const struct fib_result *res,
2427 const struct flowi4 *fl4, int orig_oif,
2428 struct net_device *dev_out,
2429 unsigned int flags)
2430 {
2431 struct fib_info *fi = res->fi;
2432 struct fib_nh_exception *fnhe;
2433 struct in_device *in_dev;
2434 u16 type = res->type;
2435 struct rtable *rth;
2436 bool do_cache;
2437
2438 in_dev = __in_dev_get_rcu(dev_out);
2439 if (!in_dev)
2440 return ERR_PTR(-EINVAL);
2441
2442 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2443 if (ipv4_is_loopback(fl4->saddr) &&
2444 !(dev_out->flags & IFF_LOOPBACK) &&
2445 !netif_is_l3_master(dev_out))
2446 return ERR_PTR(-EINVAL);
2447
2448 if (ipv4_is_lbcast(fl4->daddr))
2449 type = RTN_BROADCAST;
2450 else if (ipv4_is_multicast(fl4->daddr))
2451 type = RTN_MULTICAST;
2452 else if (ipv4_is_zeronet(fl4->daddr))
2453 return ERR_PTR(-EINVAL);
2454
2455 if (dev_out->flags & IFF_LOOPBACK)
2456 flags |= RTCF_LOCAL;
2457
2458 do_cache = true;
2459 if (type == RTN_BROADCAST) {
2460 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2461 fi = NULL;
2462 } else if (type == RTN_MULTICAST) {
2463 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2464 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2465 fl4->flowi4_proto))
2466 flags &= ~RTCF_LOCAL;
2467 else
2468 do_cache = false;
2469 /* If the multicast route does not exist, use the
2470 * default one, but do not use a gateway in this case.
2471 * Yes, it is a hack.
2472 */
2473 if (fi && res->prefixlen < 4)
2474 fi = NULL;
2475 } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2476 (orig_oif != dev_out->ifindex)) {
2477 /* For local routes that require a particular output interface
2478 * we do not want to cache the result. Caching the result
2479 * causes incorrect behaviour when there are multiple source
2480 * addresses on the interface, the end result being that if the
2481 * intended recipient is waiting on that interface for the
2482 * packet, he won't receive it because it will be delivered on
2483 * the loopback interface and the IP_PKTINFO ipi_ifindex will
2484 * be set to the loopback interface as well.
2485 */
2486 do_cache = false;
2487 }
2488
2489 fnhe = NULL;
2490 do_cache &= fi != NULL;
2491 if (fi) {
2492 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2493 struct rtable __rcu **prth;
2494
2495 fnhe = find_exception(nhc, fl4->daddr);
2496 if (!do_cache)
2497 goto add;
2498 if (fnhe) {
2499 prth = &fnhe->fnhe_rth_output;
2500 } else {
2501 if (unlikely(fl4->flowi4_flags &
2502 FLOWI_FLAG_KNOWN_NH &&
2503 !(nhc->nhc_gw_family &&
2504 nhc->nhc_scope == RT_SCOPE_LINK))) {
2505 do_cache = false;
2506 goto add;
2507 }
2508 prth = raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
2509 }
2510 rth = rcu_dereference(*prth);
2511 if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2512 return rth;
2513 }
2514
2515 add:
2516 rth = rt_dst_alloc(dev_out, flags, type,
2517 IN_DEV_ORCONF(in_dev, NOPOLICY),
2518 IN_DEV_ORCONF(in_dev, NOXFRM));
2519 if (!rth)
2520 return ERR_PTR(-ENOBUFS);
2521
2522 rth->rt_iif = orig_oif;
2523
2524 RT_CACHE_STAT_INC(out_slow_tot);
2525
2526 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2527 if (flags & RTCF_LOCAL &&
2528 !(dev_out->flags & IFF_LOOPBACK)) {
2529 rth->dst.output = ip_mc_output;
2530 RT_CACHE_STAT_INC(out_slow_mc);
2531 }
2532 #ifdef CONFIG_IP_MROUTE
2533 if (type == RTN_MULTICAST) {
2534 if (IN_DEV_MFORWARD(in_dev) &&
2535 !ipv4_is_local_multicast(fl4->daddr)) {
2536 rth->dst.input = ip_mr_input;
2537 rth->dst.output = ip_mc_output;
2538 }
2539 }
2540 #endif
2541 }
2542
2543 rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2544 lwtunnel_set_redirect(&rth->dst);
2545
2546 return rth;
2547 }
2548
2549 /*
2550 * Major route resolver routine.
2551 */
2552
2553 struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2554 const struct sk_buff *skb)
2555 {
2556 struct fib_result res = {
2557 .type = RTN_UNSPEC,
2558 .fi = NULL,
2559 .table = NULL,
2560 .tclassid = 0,
2561 };
2562 struct rtable *rth;
2563
2564 fl4->flowi4_iif = LOOPBACK_IFINDEX;
2565 ip_rt_fix_tos(fl4);
2566
2567 rcu_read_lock();
2568 rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2569 rcu_read_unlock();
2570
2571 return rth;
2572 }
2573 EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
2574
2575 struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2576 struct fib_result *res,
2577 const struct sk_buff *skb)
2578 {
2579 struct net_device *dev_out = NULL;
2580 int orig_oif = fl4->flowi4_oif;
2581 unsigned int flags = 0;
2582 struct rtable *rth;
2583 int err;
2584
2585 if (fl4->saddr) {
2586 if (ipv4_is_multicast(fl4->saddr) ||
2587 ipv4_is_lbcast(fl4->saddr) ||
2588 ipv4_is_zeronet(fl4->saddr)) {
2589 rth = ERR_PTR(-EINVAL);
2590 goto out;
2591 }
2592
2593 rth = ERR_PTR(-ENETUNREACH);
2594
2595 /* I removed the check for oif == dev_out->oif here.
2596 It was wrong for two reasons:
2597 1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2598 is assigned to multiple interfaces.
2599 2. Moreover, we are allowed to send packets with the saddr
2600 of another iface. --ANK
2601 */
2602
2603 if (fl4->flowi4_oif == 0 &&
2604 (ipv4_is_multicast(fl4->daddr) ||
2605 ipv4_is_lbcast(fl4->daddr))) {
2606 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2607 dev_out = __ip_dev_find(net, fl4->saddr, false);
2608 if (!dev_out)
2609 goto out;
2610
2611 /* Special hack: the user can direct multicasts
2612 and limited broadcast via the necessary interface
2613 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2614 This hack is not just for fun, it allows
2615 vic, vat and friends to work.
2616 They bind the socket to loopback, set ttl to zero
2617 and expect that it will work.
2618 From the viewpoint of the routing cache they are broken,
2619 because we are not allowed to build a multicast path
2620 with a loopback source addr (look, the routing cache
2621 cannot know that ttl is zero, so the packet
2622 will not leave this host and the route is valid).
2623 Luckily, this hack is a good workaround.
2624 */
2625
2626 fl4->flowi4_oif = dev_out->ifindex;
2627 goto make_route;
2628 }
2629
2630 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2631 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2632 if (!__ip_dev_find(net, fl4->saddr, false))
2633 goto out;
2634 }
2635 }
2636
2637
2638 if (fl4->flowi4_oif) {
2639 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2640 rth = ERR_PTR(-ENODEV);
2641 if (!dev_out)
2642 goto out;
2643
2644 /* RACE: Check return value of inet_select_addr instead. */
2645 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2646 rth = ERR_PTR(-ENETUNREACH);
2647 goto out;
2648 }
2649 if (ipv4_is_local_multicast(fl4->daddr) ||
2650 ipv4_is_lbcast(fl4->daddr) ||
2651 fl4->flowi4_proto == IPPROTO_IGMP) {
2652 if (!fl4->saddr)
2653 fl4->saddr = inet_select_addr(dev_out, 0,
2654 RT_SCOPE_LINK);
2655 goto make_route;
2656 }
2657 if (!fl4->saddr) {
2658 if (ipv4_is_multicast(fl4->daddr))
2659 fl4->saddr = inet_select_addr(dev_out, 0,
2660 fl4->flowi4_scope);
2661 else if (!fl4->daddr)
2662 fl4->saddr = inet_select_addr(dev_out, 0,
2663 RT_SCOPE_HOST);
2664 }
2665 }
2666
2667 if (!fl4->daddr) {
2668 fl4->daddr = fl4->saddr;
2669 if (!fl4->daddr)
2670 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2671 dev_out = net->loopback_dev;
2672 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2673 res->type = RTN_LOCAL;
2674 flags |= RTCF_LOCAL;
2675 goto make_route;
2676 }
2677
2678 err = fib_lookup(net, fl4, res, 0);
2679 if (err) {
2680 res->fi = NULL;
2681 res->table = NULL;
2682 if (fl4->flowi4_oif &&
2683 (ipv4_is_multicast(fl4->daddr) ||
2684 !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2685 /* Apparently, the routing tables are wrong. Assume
2686 that the destination is on-link.
2687
2688 WHY? DW.
2689 Because we are allowed to send to an iface
2690 even if it has NO routes and NO assigned
2691 addresses. When oif is specified, the routing
2692 tables are looked up with only one purpose:
2693 to catch whether the destination is gatewayed, rather than
2694 direct. Moreover, if MSG_DONTROUTE is set,
2695 we send the packet, ignoring both routing tables
2696 and ifaddr state. --ANK
2697
2698
2699 We could make it even if oif is unknown,
2700 likely IPv6, but we do not.
2701 */
2702
2703 if (fl4->saddr == 0)
2704 fl4->saddr = inet_select_addr(dev_out, 0,
2705 RT_SCOPE_LINK);
2706 res->type = RTN_UNICAST;
2707 goto make_route;
2708 }
2709 rth = ERR_PTR(err);
2710 goto out;
2711 }
2712
2713 if (res->type == RTN_LOCAL) {
2714 if (!fl4->saddr) {
2715 if (res->fi->fib_prefsrc)
2716 fl4->saddr = res->fi->fib_prefsrc;
2717 else
2718 fl4->saddr = fl4->daddr;
2719 }
2720
2721 /* L3 master device is the loopback for that domain */
2722 dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2723 net->loopback_dev;
2724
2725 /* make sure orig_oif points to fib result device even
2726 * though packet rx/tx happens over loopback or l3mdev
2727 */
2728 orig_oif = FIB_RES_OIF(*res);
2729
2730 fl4->flowi4_oif = dev_out->ifindex;
2731 flags |= RTCF_LOCAL;
2732 goto make_route;
2733 }
2734
2735 fib_select_path(net, res, fl4, skb);
2736
2737 dev_out = FIB_RES_DEV(*res);
2738
2739 make_route:
2740 rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2741
2742 out:
2743 return rth;
2744 }
2745
2746 static struct dst_ops ipv4_dst_blackhole_ops = {
2747 .family = AF_INET,
2748 .default_advmss = ipv4_default_advmss,
2749 .neigh_lookup = ipv4_neigh_lookup,
2750 .check = dst_blackhole_check,
2751 .cow_metrics = dst_blackhole_cow_metrics,
2752 .update_pmtu = dst_blackhole_update_pmtu,
2753 .redirect = dst_blackhole_redirect,
2754 .mtu = dst_blackhole_mtu,
2755 };
2756
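/* Clone an existing route into a blackhole entry: the copy keeps the
 * original routing metadata (genid, gateway, MTU) but discards every
 * packet via dst_discard()/dst_discard_out().
 */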
2757 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2758 {
2759 struct rtable *ort = (struct rtable *) dst_orig;
2760 struct rtable *rt;
2761
2762 rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2763 if (rt) {
2764 struct dst_entry *new = &rt->dst;
2765
2766 new->__use = 1;
2767 new->input = dst_discard;
2768 new->output = dst_discard_out;
2769
2770 new->dev = net->loopback_dev;
2771 if (new->dev)
2772 dev_hold(new->dev);
2773
2774 rt->rt_is_input = ort->rt_is_input;
2775 rt->rt_iif = ort->rt_iif;
2776 rt->rt_pmtu = ort->rt_pmtu;
2777 rt->rt_mtu_locked = ort->rt_mtu_locked;
2778
2779 rt->rt_genid = rt_genid_ipv4(net);
2780 rt->rt_flags = ort->rt_flags;
2781 rt->rt_type = ort->rt_type;
2782 rt->rt_uses_gateway = ort->rt_uses_gateway;
2783 rt->rt_gw_family = ort->rt_gw_family;
2784 if (rt->rt_gw_family == AF_INET)
2785 rt->rt_gw4 = ort->rt_gw4;
2786 else if (rt->rt_gw_family == AF_INET6)
2787 rt->rt_gw6 = ort->rt_gw6;
2788
2789 INIT_LIST_HEAD(&rt->rt_uncached);
2790 }
2791
2792 dst_release(dst_orig);
2793
2794 return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2795 }
2796
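/* Resolve an output route and, when a transport protocol is given, run
 * the result through xfrm_lookup_route() so that transformation policy
 * (e.g. IPsec) can be applied on top of the plain route.
 */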
2797 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2798 const struct sock *sk)
2799 {
2800 struct rtable *rt = __ip_route_output_key(net, flp4);
2801
2802 if (IS_ERR(rt))
2803 return rt;
2804
2805 if (flp4->flowi4_proto) {
2806 flp4->flowi4_oif = rt->dst.dev->ifindex;
2807 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2808 flowi4_to_flowi(flp4),
2809 sk, 0);
2810 }
2811
2812 return rt;
2813 }
2814 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2815
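/* Output route lookup for tunnel transmit paths. The per-tunnel dst
 * cache is consulted first when use_cache is set, and routes that would
 * loop straight back onto the tunnel device are rejected with -ELOOP.
 */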
2816 struct rtable *ip_route_output_tunnel(struct sk_buff *skb,
2817 struct net_device *dev,
2818 struct net *net, __be32 *saddr,
2819 const struct ip_tunnel_info *info,
2820 u8 protocol, bool use_cache)
2821 {
2822 #ifdef CONFIG_DST_CACHE
2823 struct dst_cache *dst_cache;
2824 #endif
2825 struct rtable *rt = NULL;
2826 struct flowi4 fl4;
2827 __u8 tos;
2828
2829 #ifdef CONFIG_DST_CACHE
2830 dst_cache = (struct dst_cache *)&info->dst_cache;
2831 if (use_cache) {
2832 rt = dst_cache_get_ip4(dst_cache, saddr);
2833 if (rt)
2834 return rt;
2835 }
2836 #endif
2837 memset(&fl4, 0, sizeof(fl4));
2838 fl4.flowi4_mark = skb->mark;
2839 fl4.flowi4_proto = protocol;
2840 fl4.daddr = info->key.u.ipv4.dst;
2841 fl4.saddr = info->key.u.ipv4.src;
2842 tos = info->key.tos;
2843 fl4.flowi4_tos = RT_TOS(tos);
2844
2845 rt = ip_route_output_key(net, &fl4);
2846 if (IS_ERR(rt)) {
2847 netdev_dbg(dev, "no route to %pI4\n", &fl4.daddr);
2848 return ERR_PTR(-ENETUNREACH);
2849 }
2850 if (rt->dst.dev == dev) { /* is this necessary? */
2851 netdev_dbg(dev, "circular route to %pI4\n", &fl4.daddr);
2852 ip_rt_put(rt);
2853 return ERR_PTR(-ELOOP);
2854 }
2855 #ifdef CONFIG_DST_CACHE
2856 if (use_cache)
2857 dst_cache_set_ip4(dst_cache, &rt->dst, fl4.saddr);
2858 #endif
2859 *saddr = fl4.saddr;
2860 return rt;
2861 }
2862 EXPORT_SYMBOL_GPL(ip_route_output_tunnel);
2863
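/* Fill an RTM_NEWROUTE netlink message describing the given rtable entry
 * for route get and dump requests.
 */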
2864 /* called with rcu_read_lock held */
2865 static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2866 struct rtable *rt, u32 table_id, struct flowi4 *fl4,
2867 struct sk_buff *skb, u32 portid, u32 seq,
2868 unsigned int flags)
2869 {
2870 struct rtmsg *r;
2871 struct nlmsghdr *nlh;
2872 unsigned long expires = 0;
2873 u32 error;
2874 u32 metrics[RTAX_MAX];
2875
2876 nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), flags);
2877 if (!nlh)
2878 return -EMSGSIZE;
2879
2880 r = nlmsg_data(nlh);
2881 r->rtm_family = AF_INET;
2882 r->rtm_dst_len = 32;
2883 r->rtm_src_len = 0;
2884 r->rtm_tos = fl4 ? fl4->flowi4_tos : 0;
2885 r->rtm_table = table_id < 256 ? table_id : RT_TABLE_COMPAT;
2886 if (nla_put_u32(skb, RTA_TABLE, table_id))
2887 goto nla_put_failure;
2888 r->rtm_type = rt->rt_type;
2889 r->rtm_scope = RT_SCOPE_UNIVERSE;
2890 r->rtm_protocol = RTPROT_UNSPEC;
2891 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2892 if (rt->rt_flags & RTCF_NOTIFY)
2893 r->rtm_flags |= RTM_F_NOTIFY;
2894 if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2895 r->rtm_flags |= RTCF_DOREDIRECT;
2896
2897 if (nla_put_in_addr(skb, RTA_DST, dst))
2898 goto nla_put_failure;
2899 if (src) {
2900 r->rtm_src_len = 32;
2901 if (nla_put_in_addr(skb, RTA_SRC, src))
2902 goto nla_put_failure;
2903 }
2904 if (rt->dst.dev &&
2905 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2906 goto nla_put_failure;
2907 #ifdef CONFIG_IP_ROUTE_CLASSID
2908 if (rt->dst.tclassid &&
2909 nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2910 goto nla_put_failure;
2911 #endif
2912 if (fl4 && !rt_is_input_route(rt) &&
2913 fl4->saddr != src) {
2914 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2915 goto nla_put_failure;
2916 }
2917 if (rt->rt_uses_gateway) {
2918 if (rt->rt_gw_family == AF_INET &&
2919 nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) {
2920 goto nla_put_failure;
2921 } else if (rt->rt_gw_family == AF_INET6) {
2922 int alen = sizeof(struct in6_addr);
2923 struct nlattr *nla;
2924 struct rtvia *via;
2925
2926 nla = nla_reserve(skb, RTA_VIA, alen + 2);
2927 if (!nla)
2928 goto nla_put_failure;
2929
2930 via = nla_data(nla);
2931 via->rtvia_family = AF_INET6;
2932 memcpy(via->rtvia_addr, &rt->rt_gw6, alen);
2933 }
2934 }
2935
2936 expires = rt->dst.expires;
2937 if (expires) {
2938 unsigned long now = jiffies;
2939
2940 if (time_before(now, expires))
2941 expires -= now;
2942 else
2943 expires = 0;
2944 }
2945
2946 memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2947 if (rt->rt_pmtu && expires)
2948 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2949 if (rt->rt_mtu_locked && expires)
2950 metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
2951 if (rtnetlink_put_metrics(skb, metrics) < 0)
2952 goto nla_put_failure;
2953
2954 if (fl4) {
2955 if (fl4->flowi4_mark &&
2956 nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2957 goto nla_put_failure;
2958
2959 if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2960 nla_put_u32(skb, RTA_UID,
2961 from_kuid_munged(current_user_ns(),
2962 fl4->flowi4_uid)))
2963 goto nla_put_failure;
2964
2965 if (rt_is_input_route(rt)) {
2966 #ifdef CONFIG_IP_MROUTE
2967 if (ipv4_is_multicast(dst) &&
2968 !ipv4_is_local_multicast(dst) &&
2969 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2970 int err = ipmr_get_route(net, skb,
2971 fl4->saddr, fl4->daddr,
2972 r, portid);
2973
2974 if (err <= 0) {
2975 if (err == 0)
2976 return 0;
2977 goto nla_put_failure;
2978 }
2979 } else
2980 #endif
2981 if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
2982 goto nla_put_failure;
2983 }
2984 }
2985
2986 error = rt->dst.error;
2987
2988 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2989 goto nla_put_failure;
2990
2991 nlmsg_end(skb, nlh);
2992 return 0;
2993
2994 nla_put_failure:
2995 nlmsg_cancel(skb, nlh);
2996 return -EMSGSIZE;
2997 }
2998
2999 static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb,
3000 struct netlink_callback *cb, u32 table_id,
3001 struct fnhe_hash_bucket *bucket, int genid,
3002 int *fa_index, int fa_start, unsigned int flags)
3003 {
3004 int i;
3005
3006 for (i = 0; i < FNHE_HASH_SIZE; i++) {
3007 struct fib_nh_exception *fnhe;
3008
3009 for (fnhe = rcu_dereference(bucket[i].chain); fnhe;
3010 fnhe = rcu_dereference(fnhe->fnhe_next)) {
3011 struct rtable *rt;
3012 int err;
3013
3014 if (*fa_index < fa_start)
3015 goto next;
3016
3017 if (fnhe->fnhe_genid != genid)
3018 goto next;
3019
3020 if (fnhe->fnhe_expires &&
3021 time_after(jiffies, fnhe->fnhe_expires))
3022 goto next;
3023
3024 rt = rcu_dereference(fnhe->fnhe_rth_input);
3025 if (!rt)
3026 rt = rcu_dereference(fnhe->fnhe_rth_output);
3027 if (!rt)
3028 goto next;
3029
3030 err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt,
3031 table_id, NULL, skb,
3032 NETLINK_CB(cb->skb).portid,
3033 cb->nlh->nlmsg_seq, flags);
3034 if (err)
3035 return err;
3036 next:
3037 (*fa_index)++;
3038 }
3039 }
3040
3041 return 0;
3042 }
3043
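/* Dump the cached next-hop exceptions (e.g. learned PMTU or redirect
 * state) attached to a fib_info as cloned route entries.
 */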
3044 int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
3045 u32 table_id, struct fib_info *fi,
3046 int *fa_index, int fa_start, unsigned int flags)
3047 {
3048 struct net *net = sock_net(cb->skb->sk);
3049 int nhsel, genid = fnhe_genid(net);
3050
3051 for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) {
3052 struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel);
3053 struct fnhe_hash_bucket *bucket;
3054 int err;
3055
3056 if (nhc->nhc_flags & RTNH_F_DEAD)
3057 continue;
3058
3059 rcu_read_lock();
3060 bucket = rcu_dereference(nhc->nhc_exceptions);
3061 err = 0;
3062 if (bucket)
3063 err = fnhe_dump_bucket(net, skb, cb, table_id, bucket,
3064 genid, fa_index, fa_start,
3065 flags);
3066 rcu_read_unlock();
3067 if (err)
3068 return err;
3069 }
3070
3071 return 0;
3072 }
3073
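/* Build a minimal skb with dummy IP and transport headers so that an
 * RTM_GETROUTE request can be pushed through the real input routing path.
 */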
3074 static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
3075 u8 ip_proto, __be16 sport,
3076 __be16 dport)
3077 {
3078 struct sk_buff *skb;
3079 struct iphdr *iph;
3080
3081 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3082 if (!skb)
3083 return NULL;
3084
3085 /* Reserve room for dummy headers; this skb can pass
3086 * through a good chunk of the routing engine.
3087 */
3088 skb_reset_mac_header(skb);
3089 skb_reset_network_header(skb);
3090 skb->protocol = htons(ETH_P_IP);
3091 iph = skb_put(skb, sizeof(struct iphdr));
3092 iph->protocol = ip_proto;
3093 iph->saddr = src;
3094 iph->daddr = dst;
3095 iph->version = 0x4;
3096 iph->frag_off = 0;
3097 iph->ihl = 0x5;
3098 skb_set_transport_header(skb, skb->len);
3099
3100 switch (iph->protocol) {
3101 case IPPROTO_UDP: {
3102 struct udphdr *udph;
3103
3104 udph = skb_put_zero(skb, sizeof(struct udphdr));
3105 udph->source = sport;
3106 udph->dest = dport;
3107 udph->len = htons(sizeof(struct udphdr));
3108 udph->check = 0;
3109 break;
3110 }
3111 case IPPROTO_TCP: {
3112 struct tcphdr *tcph;
3113
3114 tcph = skb_put_zero(skb, sizeof(struct tcphdr));
3115 tcph->source = sport;
3116 tcph->dest = dport;
3117 tcph->doff = sizeof(struct tcphdr) / 4;
3118 tcph->rst = 1;
3119 tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
3120 src, dst, 0);
3121 break;
3122 }
3123 case IPPROTO_ICMP: {
3124 struct icmphdr *icmph;
3125
3126 icmph = skb_put_zero(skb, sizeof(struct icmphdr));
3127 icmph->type = ICMP_ECHO;
3128 icmph->code = 0;
3129 }
3130 }
3131
3132 return skb;
3133 }
3134
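/* Validate an RTM_GETROUTE request; strict checking of header fields and
 * attributes is only enforced for sockets that opted into strict mode.
 */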
3135 static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
3136 const struct nlmsghdr *nlh,
3137 struct nlattr **tb,
3138 struct netlink_ext_ack *extack)
3139 {
3140 struct rtmsg *rtm;
3141 int i, err;
3142
3143 if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
3144 NL_SET_ERR_MSG(extack,
3145 "ipv4: Invalid header for route get request");
3146 return -EINVAL;
3147 }
3148
3149 if (!netlink_strict_get_check(skb))
3150 return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
3151 rtm_ipv4_policy, extack);
3152
3153 rtm = nlmsg_data(nlh);
3154 if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
3155 (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
3156 rtm->rtm_table || rtm->rtm_protocol ||
3157 rtm->rtm_scope || rtm->rtm_type) {
3158 NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request");
3159 return -EINVAL;
3160 }
3161
3162 if (rtm->rtm_flags & ~(RTM_F_NOTIFY |
3163 RTM_F_LOOKUP_TABLE |
3164 RTM_F_FIB_MATCH)) {
3165 NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request");
3166 return -EINVAL;
3167 }
3168
3169 err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
3170 rtm_ipv4_policy, extack);
3171 if (err)
3172 return err;
3173
3174 if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
3175 (tb[RTA_DST] && !rtm->rtm_dst_len)) {
3176 NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4");
3177 return -EINVAL;
3178 }
3179
3180 for (i = 0; i <= RTA_MAX; i++) {
3181 if (!tb[i])
3182 continue;
3183
3184 switch (i) {
3185 case RTA_IIF:
3186 case RTA_OIF:
3187 case RTA_SRC:
3188 case RTA_DST:
3189 case RTA_IP_PROTO:
3190 case RTA_SPORT:
3191 case RTA_DPORT:
3192 case RTA_MARK:
3193 case RTA_UID:
3194 break;
3195 default:
3196 NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request");
3197 return -EINVAL;
3198 }
3199 }
3200
3201 return 0;
3202 }
3203
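/* RTM_GETROUTE handler: resolve the route as an input lookup when an
 * incoming interface is supplied, or as an output lookup otherwise, and
 * unicast an RTM_NEWROUTE reply back to the requester.
 */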
3204 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
3205 struct netlink_ext_ack *extack)
3206 {
3207 struct net *net = sock_net(in_skb->sk);
3208 struct nlattr *tb[RTA_MAX+1];
3209 u32 table_id = RT_TABLE_MAIN;
3210 __be16 sport = 0, dport = 0;
3211 struct fib_result res = {};
3212 u8 ip_proto = IPPROTO_UDP;
3213 struct rtable *rt = NULL;
3214 struct sk_buff *skb;
3215 struct rtmsg *rtm;
3216 struct flowi4 fl4 = {};
3217 __be32 dst = 0;
3218 __be32 src = 0;
3219 kuid_t uid;
3220 u32 iif;
3221 int err;
3222 int mark;
3223
3224 err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
3225 if (err < 0)
3226 return err;
3227
3228 rtm = nlmsg_data(nlh);
3229 src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
3230 dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
3231 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3232 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3233 if (tb[RTA_UID])
3234 uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
3235 else
3236 uid = (iif ? INVALID_UID : current_uid());
3237
3238 if (tb[RTA_IP_PROTO]) {
3239 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
3240 &ip_proto, AF_INET, extack);
3241 if (err)
3242 return err;
3243 }
3244
3245 if (tb[RTA_SPORT])
3246 sport = nla_get_be16(tb[RTA_SPORT]);
3247
3248 if (tb[RTA_DPORT])
3249 dport = nla_get_be16(tb[RTA_DPORT]);
3250
3251 skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
3252 if (!skb)
3253 return -ENOBUFS;
3254
3255 fl4.daddr = dst;
3256 fl4.saddr = src;
3257 fl4.flowi4_tos = rtm->rtm_tos & IPTOS_RT_MASK;
3258 fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
3259 fl4.flowi4_mark = mark;
3260 fl4.flowi4_uid = uid;
3261 if (sport)
3262 fl4.fl4_sport = sport;
3263 if (dport)
3264 fl4.fl4_dport = dport;
3265 fl4.flowi4_proto = ip_proto;
3266
3267 rcu_read_lock();
3268
3269 if (iif) {
3270 struct net_device *dev;
3271
3272 dev = dev_get_by_index_rcu(net, iif);
3273 if (!dev) {
3274 err = -ENODEV;
3275 goto errout_rcu;
3276 }
3277
3278 fl4.flowi4_iif = iif; /* for rt_fill_info */
3279 skb->dev = dev;
3280 skb->mark = mark;
3281 err = ip_route_input_rcu(skb, dst, src,
3282 rtm->rtm_tos & IPTOS_RT_MASK, dev,
3283 &res);
3284
3285 rt = skb_rtable(skb);
3286 if (err == 0 && rt->dst.error)
3287 err = -rt->dst.error;
3288 } else {
3289 fl4.flowi4_iif = LOOPBACK_IFINDEX;
3290 skb->dev = net->loopback_dev;
3291 rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
3292 err = 0;
3293 if (IS_ERR(rt))
3294 err = PTR_ERR(rt);
3295 else
3296 skb_dst_set(skb, &rt->dst);
3297 }
3298
3299 if (err)
3300 goto errout_rcu;
3301
3302 if (rtm->rtm_flags & RTM_F_NOTIFY)
3303 rt->rt_flags |= RTCF_NOTIFY;
3304
3305 if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
3306 table_id = res.table ? res.table->tb_id : 0;
3307
3308 /* reset skb for netlink reply msg */
3309 skb_trim(skb, 0);
3310 skb_reset_network_header(skb);
3311 skb_reset_transport_header(skb);
3312 skb_reset_mac_header(skb);
3313
3314 if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
3315 struct fib_rt_info fri;
3316
3317 if (!res.fi) {
3318 err = fib_props[res.type].error;
3319 if (!err)
3320 err = -EHOSTUNREACH;
3321 goto errout_rcu;
3322 }
3323 fri.fi = res.fi;
3324 fri.tb_id = table_id;
3325 fri.dst = res.prefix;
3326 fri.dst_len = res.prefixlen;
3327 fri.tos = fl4.flowi4_tos;
3328 fri.type = rt->rt_type;
3329 fri.offload = 0;
3330 fri.trap = 0;
3331 if (res.fa_head) {
3332 struct fib_alias *fa;
3333
3334 hlist_for_each_entry_rcu(fa, res.fa_head, fa_list) {
3335 u8 slen = 32 - fri.dst_len;
3336
3337 if (fa->fa_slen == slen &&
3338 fa->tb_id == fri.tb_id &&
3339 fa->fa_tos == fri.tos &&
3340 fa->fa_info == res.fi &&
3341 fa->fa_type == fri.type) {
3342 fri.offload = fa->offload;
3343 fri.trap = fa->trap;
3344 break;
3345 }
3346 }
3347 }
3348 err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
3349 nlh->nlmsg_seq, RTM_NEWROUTE, &fri, 0);
3350 } else {
3351 err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
3352 NETLINK_CB(in_skb).portid,
3353 nlh->nlmsg_seq, 0);
3354 }
3355 if (err < 0)
3356 goto errout_rcu;
3357
3358 rcu_read_unlock();
3359
3360 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
3361
3362 errout_free:
3363 return err;
3364 errout_rcu:
3365 rcu_read_unlock();
3366 kfree_skb(skb);
3367 goto errout_free;
3368 }
3369
3370 void ip_rt_multicast_event(struct in_device *in_dev)
3371 {
3372 rt_cache_flush(dev_net(in_dev->dev));
3373 }
3374
3375 #ifdef CONFIG_SYSCTL
3376 static int ip_rt_gc_interval __read_mostly = 60 * HZ;
3377 static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
3378 static int ip_rt_gc_elasticity __read_mostly = 8;
3379 static int ip_min_valid_pmtu __read_mostly = IPV4_MIN_MTU;
3380
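/* Write-only handler behind the "flush" sysctl: any write flushes the
 * per-netns routing cache and invalidates cached next-hop exceptions by
 * bumping the fnhe generation id.
 */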
3381 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
3382 void *buffer, size_t *lenp, loff_t *ppos)
3383 {
3384 struct net *net = (struct net *)__ctl->extra1;
3385
3386 if (write) {
3387 rt_cache_flush(net);
3388 fnhe_genid_bump(net);
3389 return 0;
3390 }
3391
3392 return -EINVAL;
3393 }
3394
3395 static struct ctl_table ipv4_route_table[] = {
3396 {
3397 .procname = "gc_thresh",
3398 .data = &ipv4_dst_ops.gc_thresh,
3399 .maxlen = sizeof(int),
3400 .mode = 0644,
3401 .proc_handler = proc_dointvec,
3402 },
3403 {
3404 .procname = "max_size",
3405 .data = &ip_rt_max_size,
3406 .maxlen = sizeof(int),
3407 .mode = 0644,
3408 .proc_handler = proc_dointvec,
3409 },
3410 {
3411 /* Deprecated. Use gc_min_interval_ms */
3412
3413 .procname = "gc_min_interval",
3414 .data = &ip_rt_gc_min_interval,
3415 .maxlen = sizeof(int),
3416 .mode = 0644,
3417 .proc_handler = proc_dointvec_jiffies,
3418 },
3419 {
3420 .procname = "gc_min_interval_ms",
3421 .data = &ip_rt_gc_min_interval,
3422 .maxlen = sizeof(int),
3423 .mode = 0644,
3424 .proc_handler = proc_dointvec_ms_jiffies,
3425 },
3426 {
3427 .procname = "gc_timeout",
3428 .data = &ip_rt_gc_timeout,
3429 .maxlen = sizeof(int),
3430 .mode = 0644,
3431 .proc_handler = proc_dointvec_jiffies,
3432 },
3433 {
3434 .procname = "gc_interval",
3435 .data = &ip_rt_gc_interval,
3436 .maxlen = sizeof(int),
3437 .mode = 0644,
3438 .proc_handler = proc_dointvec_jiffies,
3439 },
3440 {
3441 .procname = "redirect_load",
3442 .data = &ip_rt_redirect_load,
3443 .maxlen = sizeof(int),
3444 .mode = 0644,
3445 .proc_handler = proc_dointvec,
3446 },
3447 {
3448 .procname = "redirect_number",
3449 .data = &ip_rt_redirect_number,
3450 .maxlen = sizeof(int),
3451 .mode = 0644,
3452 .proc_handler = proc_dointvec,
3453 },
3454 {
3455 .procname = "redirect_silence",
3456 .data = &ip_rt_redirect_silence,
3457 .maxlen = sizeof(int),
3458 .mode = 0644,
3459 .proc_handler = proc_dointvec,
3460 },
3461 {
3462 .procname = "error_cost",
3463 .data = &ip_rt_error_cost,
3464 .maxlen = sizeof(int),
3465 .mode = 0644,
3466 .proc_handler = proc_dointvec,
3467 },
3468 {
3469 .procname = "error_burst",
3470 .data = &ip_rt_error_burst,
3471 .maxlen = sizeof(int),
3472 .mode = 0644,
3473 .proc_handler = proc_dointvec,
3474 },
3475 {
3476 .procname = "gc_elasticity",
3477 .data = &ip_rt_gc_elasticity,
3478 .maxlen = sizeof(int),
3479 .mode = 0644,
3480 .proc_handler = proc_dointvec,
3481 },
3482 {
3483 .procname = "mtu_expires",
3484 .data = &ip_rt_mtu_expires,
3485 .maxlen = sizeof(int),
3486 .mode = 0644,
3487 .proc_handler = proc_dointvec_jiffies,
3488 },
3489 {
3490 .procname = "min_pmtu",
3491 .data = &ip_rt_min_pmtu,
3492 .maxlen = sizeof(int),
3493 .mode = 0644,
3494 .proc_handler = proc_dointvec_minmax,
3495 .extra1 = &ip_min_valid_pmtu,
3496 },
3497 {
3498 .procname = "min_adv_mss",
3499 .data = &ip_rt_min_advmss,
3500 .maxlen = sizeof(int),
3501 .mode = 0644,
3502 .proc_handler = proc_dointvec,
3503 },
3504 { }
3505 };
3506
3507 static const char ipv4_route_flush_procname[] = "flush";
3508
3509 static struct ctl_table ipv4_route_flush_table[] = {
3510 {
3511 .procname = ipv4_route_flush_procname,
3512 .maxlen = sizeof(int),
3513 .mode = 0200,
3514 .proc_handler = ipv4_sysctl_rtcache_flush,
3515 },
3516 { },
3517 };
3518
3519 static __net_init int sysctl_route_net_init(struct net *net)
3520 {
3521 struct ctl_table *tbl;
3522
3523 tbl = ipv4_route_flush_table;
3524 if (!net_eq(net, &init_net)) {
3525 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3526 if (!tbl)
3527 goto err_dup;
3528
3529 /* Don't export non-whitelisted sysctls to unprivileged users */
3530 if (net->user_ns != &init_user_ns) {
3531 if (tbl[0].procname != ipv4_route_flush_procname)
3532 tbl[0].procname = NULL;
3533 }
3534 }
3535 tbl[0].extra1 = net;
3536
3537 net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
3538 if (!net->ipv4.route_hdr)
3539 goto err_reg;
3540 return 0;
3541
3542 err_reg:
3543 if (tbl != ipv4_route_flush_table)
3544 kfree(tbl);
3545 err_dup:
3546 return -ENOMEM;
3547 }
3548
3549 static __net_exit void sysctl_route_net_exit(struct net *net)
3550 {
3551 struct ctl_table *tbl;
3552
3553 tbl = net->ipv4.route_hdr->ctl_table_arg;
3554 unregister_net_sysctl_table(net->ipv4.route_hdr);
3555 BUG_ON(tbl == ipv4_route_flush_table);
3556 kfree(tbl);
3557 }
3558
3559 static __net_initdata struct pernet_operations sysctl_route_ops = {
3560 .init = sysctl_route_net_init,
3561 .exit = sysctl_route_net_exit,
3562 };
3563 #endif
3564
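/* Per-netns initialisation of the route and next-hop-exception generation
 * counters used to invalidate cached dst entries.
 */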
3565 static __net_init int rt_genid_init(struct net *net)
3566 {
3567 atomic_set(&net->ipv4.rt_genid, 0);
3568 atomic_set(&net->fnhe_genid, 0);
3569 atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
3570 return 0;
3571 }
3572
3573 static __net_initdata struct pernet_operations rt_genid_ops = {
3574 .init = rt_genid_init,
3575 };
3576
3577 static int __net_init ipv4_inetpeer_init(struct net *net)
3578 {
3579 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3580
3581 if (!bp)
3582 return -ENOMEM;
3583 inet_peer_base_init(bp);
3584 net->ipv4.peers = bp;
3585 return 0;
3586 }
3587
3588 static void __net_exit ipv4_inetpeer_exit(struct net *net)
3589 {
3590 struct inet_peer_base *bp = net->ipv4.peers;
3591
3592 net->ipv4.peers = NULL;
3593 inetpeer_invalidate_tree(bp);
3594 kfree(bp);
3595 }
3596
3597 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3598 .init = ipv4_inetpeer_init,
3599 .exit = ipv4_inetpeer_exit,
3600 };
3601
3602 #ifdef CONFIG_IP_ROUTE_CLASSID
3603 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3604 #endif /* CONFIG_IP_ROUTE_CLASSID */
3605
3606 int __init ip_rt_init(void)
3607 {
3608 void *idents_hash;
3609 int cpu;
3610
3611 /* For modern hosts, this will use 2 MB of memory */
3612 idents_hash = alloc_large_system_hash("IP idents",
3613 sizeof(*ip_idents) + sizeof(*ip_tstamps),
3614 0,
3615 16, /* one bucket per 64 KB */
3616 HASH_ZERO,
3617 NULL,
3618 &ip_idents_mask,
3619 2048,
3620 256*1024);
3621
3622 ip_idents = idents_hash;
3623
3624 prandom_bytes(ip_idents, (ip_idents_mask + 1) * sizeof(*ip_idents));
3625
3626 ip_tstamps = idents_hash + (ip_idents_mask + 1) * sizeof(*ip_idents);
3627
3628 for_each_possible_cpu(cpu) {
3629 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3630
3631 INIT_LIST_HEAD(&ul->head);
3632 spin_lock_init(&ul->lock);
3633 }
3634 #ifdef CONFIG_IP_ROUTE_CLASSID
3635 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3636 if (!ip_rt_acct)
3637 panic("IP: failed to allocate ip_rt_acct\n");
3638 #endif
3639
3640 ipv4_dst_ops.kmem_cachep =
3641 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3642 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3643
3644 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3645
3646 if (dst_entries_init(&ipv4_dst_ops) < 0)
3647 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3648
3649 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3650 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3651
3652 ipv4_dst_ops.gc_thresh = ~0;
3653 ip_rt_max_size = INT_MAX;
3654
3655 devinet_init();
3656 ip_fib_init();
3657
3658 if (ip_rt_proc_init())
3659 pr_err("Unable to create route proc files\n");
3660 #ifdef CONFIG_XFRM
3661 xfrm_init();
3662 xfrm4_init();
3663 #endif
3664 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
3665 RTNL_FLAG_DOIT_UNLOCKED);
3666
3667 #ifdef CONFIG_SYSCTL
3668 register_pernet_subsys(&sysctl_route_ops);
3669 #endif
3670 register_pernet_subsys(&rt_genid_ops);
3671 register_pernet_subsys(&ipv4_inetpeer_ops);
3672 return 0;
3673 }
3674
3675 #ifdef CONFIG_SYSCTL
3676 /*
3677 * We really need to sanitize the damn ipv4 init order, then all
3678 * this nonsense will go away.
3679 */
3680 void __init ip_static_sysctl_init(void)
3681 {
3682 register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3683 }
3684 #endif
3685