1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
6 *
7 * ROUTE - implementation of the IP router.
8 *
9 * Authors: Ross Biro
10 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
11 * Alan Cox, <gw4pts@gw4pts.ampr.org>
12 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
13 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
14 *
15 * Fixes:
16 * Alan Cox : Verify area fixes.
17 * Alan Cox : cli() protects routing changes
18 * Rui Oliveira : ICMP routing table updates
19 * (rco@di.uminho.pt) Routing table insertion and update
20 * Linus Torvalds : Rewrote bits to be sensible
21 * Alan Cox : Added BSD route gw semantics
22 * Alan Cox : Super /proc >4K
23 * Alan Cox : MTU in route table
24 * Alan Cox : MSS actually. Also added the window
25 * clamper.
26 * Sam Lantinga : Fixed route matching in rt_del()
27 * Alan Cox : Routing cache support.
28 * Alan Cox : Removed compatibility cruft.
29 * Alan Cox : RTF_REJECT support.
30 * Alan Cox : TCP irtt support.
31 * Jonathan Naylor : Added Metric support.
32 * Miquel van Smoorenburg : BSD API fixes.
33 * Miquel van Smoorenburg : Metrics.
34 * Alan Cox : Use __u32 properly
35 * Alan Cox : Aligned routing errors more closely with BSD;
36 * our system is still very different.
37 * Alan Cox : Faster /proc handling
38 * Alexey Kuznetsov : Massive rework to support tree based routing,
39 * routing caches and better behaviour.
40 *
41 * Olaf Erb : irtt wasn't being copied right.
42 * Bjorn Ekwall : Kerneld route support.
43 * Alan Cox : Multicast fixed (I hope)
44 * Pavel Krauz : Limited broadcast fixed
45 * Mike McLagan : Routing by source
46 * Alexey Kuznetsov : End of old history. Split to fib.c and
47 * route.c and rewritten from scratch.
48 * Andi Kleen : Load-limit warning messages.
49 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
50 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
51 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
52 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
53 * Marc Boucher : routing by fwmark
54 * Robert Olsson : Added rt_cache statistics
55 * Arnaldo C. Melo : Convert proc stuff to seq_file
56 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
57 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
58 * Ilia Sotnikov : Removed TOS from hash calculations
59 */
60
61 #define pr_fmt(fmt) "IPv4: " fmt
62
63 #include <linux/module.h>
64 #include <linux/uaccess.h>
65 #include <linux/bitops.h>
66 #include <linux/types.h>
67 #include <linux/kernel.h>
68 #include <linux/mm.h>
69 #include <linux/memblock.h>
70 #include <linux/string.h>
71 #include <linux/socket.h>
72 #include <linux/sockios.h>
73 #include <linux/errno.h>
74 #include <linux/in.h>
75 #include <linux/inet.h>
76 #include <linux/netdevice.h>
77 #include <linux/proc_fs.h>
78 #include <linux/init.h>
79 #include <linux/skbuff.h>
80 #include <linux/inetdevice.h>
81 #include <linux/igmp.h>
82 #include <linux/pkt_sched.h>
83 #include <linux/mroute.h>
84 #include <linux/netfilter_ipv4.h>
85 #include <linux/random.h>
86 #include <linux/rcupdate.h>
87 #include <linux/times.h>
88 #include <linux/slab.h>
89 #include <linux/jhash.h>
90 #include <net/dst.h>
91 #include <net/dst_metadata.h>
92 #include <net/net_namespace.h>
93 #include <net/protocol.h>
94 #include <net/ip.h>
95 #include <net/route.h>
96 #include <net/inetpeer.h>
97 #include <net/sock.h>
98 #include <net/ip_fib.h>
99 #include <net/nexthop.h>
100 #include <net/arp.h>
101 #include <net/tcp.h>
102 #include <net/icmp.h>
103 #include <net/xfrm.h>
104 #include <net/lwtunnel.h>
105 #include <net/netevent.h>
106 #include <net/rtnetlink.h>
107 #ifdef CONFIG_SYSCTL
108 #include <linux/sysctl.h>
109 #endif
110 #include <net/secure_seq.h>
111 #include <net/ip_tunnels.h>
112 #include <net/l3mdev.h>
113
114 #include "fib_lookup.h"
115
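/* RT_FL_TOS() keeps only the bits of flowi4_tos that matter for route
 * lookup: the IPTOS_RT_MASK TOS bits plus the legacy RTO_ONLINK flag,
 * which ip_rt_fix_tos() below folds into the flow's scope.
 */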
116 #define RT_FL_TOS(oldflp4) \
117 ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
118
119 #define RT_GC_TIMEOUT (300*HZ)
120
121 static int ip_rt_max_size;
122 static int ip_rt_redirect_number __read_mostly = 9;
123 static int ip_rt_redirect_load __read_mostly = HZ / 50;
124 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
125 static int ip_rt_error_cost __read_mostly = HZ;
126 static int ip_rt_error_burst __read_mostly = 5 * HZ;
127 static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
128 static u32 ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
129 static int ip_rt_min_advmss __read_mostly = 256;
130
131 static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
132
133 /*
134 * Interface to generic destination cache.
135 */
136
137 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
138 static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
139 static unsigned int ipv4_mtu(const struct dst_entry *dst);
140 static void ipv4_negative_advice(struct sock *sk,
141 struct dst_entry *dst);
142 static void ipv4_link_failure(struct sk_buff *skb);
143 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
144 struct sk_buff *skb, u32 mtu,
145 bool confirm_neigh);
146 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk,
147 struct sk_buff *skb);
148 static void ipv4_dst_destroy(struct dst_entry *dst);
149
150 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
151 {
152 WARN_ON(1);
153 return NULL;
154 }
155
156 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
157 struct sk_buff *skb,
158 const void *daddr);
159 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
160
161 static struct dst_ops ipv4_dst_ops = {
162 .family = AF_INET,
163 .check = ipv4_dst_check,
164 .default_advmss = ipv4_default_advmss,
165 .mtu = ipv4_mtu,
166 .cow_metrics = ipv4_cow_metrics,
167 .destroy = ipv4_dst_destroy,
168 .negative_advice = ipv4_negative_advice,
169 .link_failure = ipv4_link_failure,
170 .update_pmtu = ip_rt_update_pmtu,
171 .redirect = ip_do_redirect,
172 .local_out = __ip_local_out,
173 .neigh_lookup = ipv4_neigh_lookup,
174 .confirm_neigh = ipv4_confirm_neigh,
175 };
176
177 #define ECN_OR_COST(class) TC_PRIO_##class
178
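/* ip_tos2prio[] maps the four RFC 1349 TOS bits to a pfifo_fast priority
 * band; odd entries (named via ECN_OR_COST()) cover values whose lowest
 * TOS bit, historically "minimize monetary cost", is set. A sketch of how
 * the table is indexed (see rt_tos2priority() in include/net/route.h):
 *
 *	prio = ip_tos2prio[IPTOS_TOS(iph->tos) >> 1];
 */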
179 const __u8 ip_tos2prio[16] = {
180 TC_PRIO_BESTEFFORT,
181 ECN_OR_COST(BESTEFFORT),
182 TC_PRIO_BESTEFFORT,
183 ECN_OR_COST(BESTEFFORT),
184 TC_PRIO_BULK,
185 ECN_OR_COST(BULK),
186 TC_PRIO_BULK,
187 ECN_OR_COST(BULK),
188 TC_PRIO_INTERACTIVE,
189 ECN_OR_COST(INTERACTIVE),
190 TC_PRIO_INTERACTIVE,
191 ECN_OR_COST(INTERACTIVE),
192 TC_PRIO_INTERACTIVE_BULK,
193 ECN_OR_COST(INTERACTIVE_BULK),
194 TC_PRIO_INTERACTIVE_BULK,
195 ECN_OR_COST(INTERACTIVE_BULK)
196 };
197 EXPORT_SYMBOL(ip_tos2prio);
198
199 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
200 #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
201
202 #ifdef CONFIG_PROC_FS
203 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
204 {
205 if (*pos)
206 return NULL;
207 return SEQ_START_TOKEN;
208 }
209
210 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
211 {
212 ++*pos;
213 return NULL;
214 }
215
216 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
217 {
218 }
219
220 static int rt_cache_seq_show(struct seq_file *seq, void *v)
221 {
222 if (v == SEQ_START_TOKEN)
223 seq_printf(seq, "%-127s\n",
224 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
225 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
226 "HHUptod\tSpecDst");
227 return 0;
228 }
229
230 static const struct seq_operations rt_cache_seq_ops = {
231 .start = rt_cache_seq_start,
232 .next = rt_cache_seq_next,
233 .stop = rt_cache_seq_stop,
234 .show = rt_cache_seq_show,
235 };
236
237 static int rt_cache_seq_open(struct inode *inode, struct file *file)
238 {
239 return seq_open(file, &rt_cache_seq_ops);
240 }
241
242 static const struct proc_ops rt_cache_proc_ops = {
243 .proc_open = rt_cache_seq_open,
244 .proc_read = seq_read,
245 .proc_lseek = seq_lseek,
246 .proc_release = seq_release,
247 };
248
249
250 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
251 {
252 int cpu;
253
254 if (*pos == 0)
255 return SEQ_START_TOKEN;
256
257 for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
258 if (!cpu_possible(cpu))
259 continue;
260 *pos = cpu+1;
261 return &per_cpu(rt_cache_stat, cpu);
262 }
263 return NULL;
264 }
265
266 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
267 {
268 int cpu;
269
270 for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
271 if (!cpu_possible(cpu))
272 continue;
273 *pos = cpu+1;
274 return &per_cpu(rt_cache_stat, cpu);
275 }
276 (*pos)++;
277 return NULL;
278
279 }
280
281 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
282 {
283
284 }
285
286 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
287 {
288 struct rt_cache_stat *st = v;
289
290 if (v == SEQ_START_TOKEN) {
291 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
292 return 0;
293 }
294
295 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
296 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
297 dst_entries_get_slow(&ipv4_dst_ops),
298 0, /* st->in_hit */
299 st->in_slow_tot,
300 st->in_slow_mc,
301 st->in_no_route,
302 st->in_brd,
303 st->in_martian_dst,
304 st->in_martian_src,
305
306 0, /* st->out_hit */
307 st->out_slow_tot,
308 st->out_slow_mc,
309
310 0, /* st->gc_total */
311 0, /* st->gc_ignored */
312 0, /* st->gc_goal_miss */
313 0, /* st->gc_dst_overflow */
314 0, /* st->in_hlist_search */
315 0 /* st->out_hlist_search */
316 );
317 return 0;
318 }
319
320 static const struct seq_operations rt_cpu_seq_ops = {
321 .start = rt_cpu_seq_start,
322 .next = rt_cpu_seq_next,
323 .stop = rt_cpu_seq_stop,
324 .show = rt_cpu_seq_show,
325 };
326
327
328 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
329 {
330 return seq_open(file, &rt_cpu_seq_ops);
331 }
332
333 static const struct proc_ops rt_cpu_proc_ops = {
334 .proc_open = rt_cpu_seq_open,
335 .proc_read = seq_read,
336 .proc_lseek = seq_lseek,
337 .proc_release = seq_release,
338 };
339
340 #ifdef CONFIG_IP_ROUTE_CLASSID
341 static int rt_acct_proc_show(struct seq_file *m, void *v)
342 {
343 struct ip_rt_acct *dst, *src;
344 unsigned int i, j;
345
346 dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
347 if (!dst)
348 return -ENOMEM;
349
350 for_each_possible_cpu(i) {
351 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
352 for (j = 0; j < 256; j++) {
353 dst[j].o_bytes += src[j].o_bytes;
354 dst[j].o_packets += src[j].o_packets;
355 dst[j].i_bytes += src[j].i_bytes;
356 dst[j].i_packets += src[j].i_packets;
357 }
358 }
359
360 seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
361 kfree(dst);
362 return 0;
363 }
364 #endif
365
366 static int __net_init ip_rt_do_proc_init(struct net *net)
367 {
368 struct proc_dir_entry *pde;
369
370 pde = proc_create("rt_cache", 0444, net->proc_net,
371 &rt_cache_proc_ops);
372 if (!pde)
373 goto err1;
374
375 pde = proc_create("rt_cache", 0444,
376 net->proc_net_stat, &rt_cpu_proc_ops);
377 if (!pde)
378 goto err2;
379
380 #ifdef CONFIG_IP_ROUTE_CLASSID
381 pde = proc_create_single("rt_acct", 0, net->proc_net,
382 rt_acct_proc_show);
383 if (!pde)
384 goto err3;
385 #endif
386 return 0;
387
388 #ifdef CONFIG_IP_ROUTE_CLASSID
389 err3:
390 remove_proc_entry("rt_cache", net->proc_net_stat);
391 #endif
392 err2:
393 remove_proc_entry("rt_cache", net->proc_net);
394 err1:
395 return -ENOMEM;
396 }
397
398 static void __net_exit ip_rt_do_proc_exit(struct net *net)
399 {
400 remove_proc_entry("rt_cache", net->proc_net_stat);
401 remove_proc_entry("rt_cache", net->proc_net);
402 #ifdef CONFIG_IP_ROUTE_CLASSID
403 remove_proc_entry("rt_acct", net->proc_net);
404 #endif
405 }
406
407 static struct pernet_operations ip_rt_proc_ops __net_initdata = {
408 .init = ip_rt_do_proc_init,
409 .exit = ip_rt_do_proc_exit,
410 };
411
412 static int __init ip_rt_proc_init(void)
413 {
414 return register_pernet_subsys(&ip_rt_proc_ops);
415 }
416
417 #else
418 static inline int ip_rt_proc_init(void)
419 {
420 return 0;
421 }
422 #endif /* CONFIG_PROC_FS */
423
424 static inline bool rt_is_expired(const struct rtable *rth)
425 {
426 return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
427 }
428
429 void rt_cache_flush(struct net *net)
430 {
431 rt_genid_bump_ipv4(net);
432 }
433
434 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
435 struct sk_buff *skb,
436 const void *daddr)
437 {
438 const struct rtable *rt = container_of(dst, struct rtable, dst);
439 struct net_device *dev = dst->dev;
440 struct neighbour *n;
441
442 rcu_read_lock_bh();
443
444 if (likely(rt->rt_gw_family == AF_INET)) {
445 n = ip_neigh_gw4(dev, rt->rt_gw4);
446 } else if (rt->rt_gw_family == AF_INET6) {
447 n = ip_neigh_gw6(dev, &rt->rt_gw6);
448 } else {
449 __be32 pkey;
450
451 pkey = skb ? ip_hdr(skb)->daddr : *((__be32 *) daddr);
452 n = ip_neigh_gw4(dev, pkey);
453 }
454
455 if (!IS_ERR(n) && !refcount_inc_not_zero(&n->refcnt))
456 n = NULL;
457
458 rcu_read_unlock_bh();
459
460 return n;
461 }
462
463 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
464 {
465 const struct rtable *rt = container_of(dst, struct rtable, dst);
466 struct net_device *dev = dst->dev;
467 const __be32 *pkey = daddr;
468
469 if (rt->rt_gw_family == AF_INET) {
470 pkey = (const __be32 *)&rt->rt_gw4;
471 } else if (rt->rt_gw_family == AF_INET6) {
472 return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6);
473 } else if (!daddr ||
474 (rt->rt_flags &
475 (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) {
476 return;
477 }
478 __ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
479 }
480
481 /* Hash tables of size 2048..262144 depending on RAM size.
482 * Each bucket uses 8 bytes.
483 */
484 static u32 ip_idents_mask __read_mostly;
485 static atomic_t *ip_idents __read_mostly;
486 static u32 *ip_tstamps __read_mostly;
487
488 /* In order to protect privacy, we add a perturbation to identifiers
489 * if one generator is seldom used. This makes it hard for an attacker
490 * to infer how many packets were sent between two points in time.
491 */
492 u32 ip_idents_reserve(u32 hash, int segs)
493 {
494 u32 bucket, old, now = (u32)jiffies;
495 atomic_t *p_id;
496 u32 *p_tstamp;
497 u32 delta = 0;
498
499 bucket = hash & ip_idents_mask;
500 p_tstamp = ip_tstamps + bucket;
501 p_id = ip_idents + bucket;
502 old = READ_ONCE(*p_tstamp);
503
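/* If this bucket has sat idle for one or more jiffies, add a random
 * offset bounded by the idle time, so an observer cannot count how many
 * IDs (and therefore packets) were consumed in between.
 */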
504 if (old != now && cmpxchg(p_tstamp, old, now) == old)
505 delta = prandom_u32_max(now - old);
506
507 /* If UBSAN reports an error here, please make sure your compiler
508 * supports -fno-strict-overflow before reporting it: that was a bug
509 * in UBSAN, and it has been fixed in GCC 8.
510 */
511 return atomic_add_return(segs + delta, p_id) - segs;
512 }
513 EXPORT_SYMBOL(ip_idents_reserve);
514
515 void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
516 {
517 u32 hash, id;
518
519 /* Note the following code is not safe, but this is okay. */
520 if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
521 get_random_bytes(&net->ipv4.ip_id_key,
522 sizeof(net->ipv4.ip_id_key));
523
524 hash = siphash_3u32((__force u32)iph->daddr,
525 (__force u32)iph->saddr,
526 iph->protocol,
527 &net->ipv4.ip_id_key);
528 id = ip_idents_reserve(hash, segs);
529 iph->id = htons(id);
530 }
531 EXPORT_SYMBOL(__ip_select_ident);
532
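/* The legacy RTO_ONLINK bit in flowi4_tos means "destination is on-link";
 * fold it into the flow's scope and leave only real TOS bits behind.
 */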
533 static void ip_rt_fix_tos(struct flowi4 *fl4)
534 {
535 __u8 tos = RT_FL_TOS(fl4);
536
537 fl4->flowi4_tos = tos & IPTOS_RT_MASK;
538 fl4->flowi4_scope = tos & RTO_ONLINK ?
539 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE;
540 }
541
542 static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
543 const struct sock *sk,
544 const struct iphdr *iph,
545 int oif, u8 tos,
546 u8 prot, u32 mark, int flow_flags)
547 {
548 if (sk) {
549 const struct inet_sock *inet = inet_sk(sk);
550
551 oif = sk->sk_bound_dev_if;
552 mark = sk->sk_mark;
553 tos = RT_CONN_FLAGS(sk);
554 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
555 }
556 flowi4_init_output(fl4, oif, mark, tos,
557 RT_SCOPE_UNIVERSE, prot,
558 flow_flags,
559 iph->daddr, iph->saddr, 0, 0,
560 sock_net_uid(net, sk));
561 }
562
563 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
564 const struct sock *sk)
565 {
566 const struct net *net = dev_net(skb->dev);
567 const struct iphdr *iph = ip_hdr(skb);
568 int oif = skb->dev->ifindex;
569 u8 tos = RT_TOS(iph->tos);
570 u8 prot = iph->protocol;
571 u32 mark = skb->mark;
572
573 __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
574 }
575
576 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
577 {
578 const struct inet_sock *inet = inet_sk(sk);
579 const struct ip_options_rcu *inet_opt;
580 __be32 daddr = inet->inet_daddr;
581
582 rcu_read_lock();
583 inet_opt = rcu_dereference(inet->inet_opt);
584 if (inet_opt && inet_opt->opt.srr)
585 daddr = inet_opt->opt.faddr;
586 flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
587 RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
588 inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
589 inet_sk_flowi_flags(sk),
590 daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
591 rcu_read_unlock();
592 }
593
594 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
595 const struct sk_buff *skb)
596 {
597 if (skb)
598 build_skb_flow_key(fl4, skb, sk);
599 else
600 build_sk_flow_key(fl4, sk);
601 }
602
603 static DEFINE_SPINLOCK(fnhe_lock);
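/* Nexthop exceptions (fib_nh_exception) record per-destination state
 * learned from ICMP (redirected gateways and path MTU values) in a small
 * per-nexthop hash keyed by daddr. Readers walk the chains under RCU;
 * all updates are serialized by fnhe_lock.
 */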
604
605 static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
606 {
607 struct rtable *rt;
608
609 rt = rcu_dereference(fnhe->fnhe_rth_input);
610 if (rt) {
611 RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
612 dst_dev_put(&rt->dst);
613 dst_release(&rt->dst);
614 }
615 rt = rcu_dereference(fnhe->fnhe_rth_output);
616 if (rt) {
617 RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
618 dst_dev_put(&rt->dst);
619 dst_release(&rt->dst);
620 }
621 }
622
623 static void fnhe_remove_oldest(struct fnhe_hash_bucket *hash)
624 {
625 struct fib_nh_exception __rcu **fnhe_p, **oldest_p;
626 struct fib_nh_exception *fnhe, *oldest = NULL;
627
628 for (fnhe_p = &hash->chain; ; fnhe_p = &fnhe->fnhe_next) {
629 fnhe = rcu_dereference_protected(*fnhe_p,
630 lockdep_is_held(&fnhe_lock));
631 if (!fnhe)
632 break;
633 if (!oldest ||
634 time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp)) {
635 oldest = fnhe;
636 oldest_p = fnhe_p;
637 }
638 }
639 fnhe_flush_routes(oldest);
640 *oldest_p = oldest->fnhe_next;
641 kfree_rcu(oldest, rcu);
642 }
643
644 static u32 fnhe_hashfun(__be32 daddr)
645 {
646 static siphash_key_t fnhe_hash_key __read_mostly;
647 u64 hval;
648
649 net_get_random_once(&fnhe_hash_key, sizeof(fnhe_hash_key));
650 hval = siphash_1u32((__force u32)daddr, &fnhe_hash_key);
651 return hash_64(hval, FNHE_HASH_SHIFT);
652 }
653
654 static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
655 {
656 rt->rt_pmtu = fnhe->fnhe_pmtu;
657 rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
658 rt->dst.expires = fnhe->fnhe_expires;
659
660 if (fnhe->fnhe_gw) {
661 rt->rt_flags |= RTCF_REDIRECTED;
662 rt->rt_uses_gateway = 1;
663 rt->rt_gw_family = AF_INET;
664 rt->rt_gw4 = fnhe->fnhe_gw;
665 }
666 }
667
668 static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr,
669 __be32 gw, u32 pmtu, bool lock,
670 unsigned long expires)
671 {
672 struct fnhe_hash_bucket *hash;
673 struct fib_nh_exception *fnhe;
674 struct rtable *rt;
675 u32 genid, hval;
676 unsigned int i;
677 int depth;
678
679 genid = fnhe_genid(dev_net(nhc->nhc_dev));
680 hval = fnhe_hashfun(daddr);
681
682 spin_lock_bh(&fnhe_lock);
683
684 hash = rcu_dereference(nhc->nhc_exceptions);
685 if (!hash) {
686 hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
687 if (!hash)
688 goto out_unlock;
689 rcu_assign_pointer(nhc->nhc_exceptions, hash);
690 }
691
692 hash += hval;
693
694 depth = 0;
695 for (fnhe = rcu_dereference(hash->chain); fnhe;
696 fnhe = rcu_dereference(fnhe->fnhe_next)) {
697 if (fnhe->fnhe_daddr == daddr)
698 break;
699 depth++;
700 }
701
702 if (fnhe) {
703 if (fnhe->fnhe_genid != genid)
704 fnhe->fnhe_genid = genid;
705 if (gw)
706 fnhe->fnhe_gw = gw;
707 if (pmtu) {
708 fnhe->fnhe_pmtu = pmtu;
709 fnhe->fnhe_mtu_locked = lock;
710 }
711 fnhe->fnhe_expires = max(1UL, expires);
712 /* Update all cached dsts too */
713 rt = rcu_dereference(fnhe->fnhe_rth_input);
714 if (rt)
715 fill_route_from_fnhe(rt, fnhe);
716 rt = rcu_dereference(fnhe->fnhe_rth_output);
717 if (rt)
718 fill_route_from_fnhe(rt, fnhe);
719 } else {
720 /* Randomize max depth to avoid some side channels attacks. */
721 int max_depth = FNHE_RECLAIM_DEPTH +
722 prandom_u32_max(FNHE_RECLAIM_DEPTH);
723
724 while (depth > max_depth) {
725 fnhe_remove_oldest(hash);
726 depth--;
727 }
728
729 fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
730 if (!fnhe)
731 goto out_unlock;
732
733 fnhe->fnhe_next = hash->chain;
734
735 fnhe->fnhe_genid = genid;
736 fnhe->fnhe_daddr = daddr;
737 fnhe->fnhe_gw = gw;
738 fnhe->fnhe_pmtu = pmtu;
739 fnhe->fnhe_mtu_locked = lock;
740 fnhe->fnhe_expires = max(1UL, expires);
741
742 rcu_assign_pointer(hash->chain, fnhe);
743
744 /* Exception created; mark the cached routes for the nexthop
745 * stale, so anyone caching it rechecks if this exception
746 * applies to them.
747 */
748 rt = rcu_dereference(nhc->nhc_rth_input);
749 if (rt)
750 rt->dst.obsolete = DST_OBSOLETE_KILL;
751
752 for_each_possible_cpu(i) {
753 struct rtable __rcu **prt;
754 prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i);
755 rt = rcu_dereference(*prt);
756 if (rt)
757 rt->dst.obsolete = DST_OBSOLETE_KILL;
758 }
759 }
760
761 fnhe->fnhe_stamp = jiffies;
762
763 out_unlock:
764 spin_unlock_bh(&fnhe_lock);
765 }
766
767 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
768 bool kill_route)
769 {
770 __be32 new_gw = icmp_hdr(skb)->un.gateway;
771 __be32 old_gw = ip_hdr(skb)->saddr;
772 struct net_device *dev = skb->dev;
773 struct in_device *in_dev;
774 struct fib_result res;
775 struct neighbour *n;
776 struct net *net;
777
778 switch (icmp_hdr(skb)->code & 7) {
779 case ICMP_REDIR_NET:
780 case ICMP_REDIR_NETTOS:
781 case ICMP_REDIR_HOST:
782 case ICMP_REDIR_HOSTTOS:
783 break;
784
785 default:
786 return;
787 }
788
789 if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw)
790 return;
791
792 in_dev = __in_dev_get_rcu(dev);
793 if (!in_dev)
794 return;
795
796 net = dev_net(dev);
797 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
798 ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
799 ipv4_is_zeronet(new_gw))
800 goto reject_redirect;
801
802 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
803 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
804 goto reject_redirect;
805 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
806 goto reject_redirect;
807 } else {
808 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
809 goto reject_redirect;
810 }
811
812 n = __ipv4_neigh_lookup(rt->dst.dev, (__force u32)new_gw);
813 if (!n)
814 n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
815 if (!IS_ERR(n)) {
816 if (!(n->nud_state & NUD_VALID)) {
817 neigh_event_send(n, NULL);
818 } else {
819 if (fib_lookup(net, fl4, &res, 0) == 0) {
820 struct fib_nh_common *nhc;
821
822 fib_select_path(net, &res, fl4, skb);
823 nhc = FIB_RES_NHC(res);
824 update_or_create_fnhe(nhc, fl4->daddr, new_gw,
825 0, false,
826 jiffies + ip_rt_gc_timeout);
827 }
828 if (kill_route)
829 rt->dst.obsolete = DST_OBSOLETE_KILL;
830 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
831 }
832 neigh_release(n);
833 }
834 return;
835
836 reject_redirect:
837 #ifdef CONFIG_IP_ROUTE_VERBOSE
838 if (IN_DEV_LOG_MARTIANS(in_dev)) {
839 const struct iphdr *iph = (const struct iphdr *) skb->data;
840 __be32 daddr = iph->daddr;
841 __be32 saddr = iph->saddr;
842
843 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
844 " Advised path = %pI4 -> %pI4\n",
845 &old_gw, dev->name, &new_gw,
846 &saddr, &daddr);
847 }
848 #endif
849 ;
850 }
851
852 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
853 {
854 struct rtable *rt;
855 struct flowi4 fl4;
856 const struct iphdr *iph = (const struct iphdr *) skb->data;
857 struct net *net = dev_net(skb->dev);
858 int oif = skb->dev->ifindex;
859 u8 tos = RT_TOS(iph->tos);
860 u8 prot = iph->protocol;
861 u32 mark = skb->mark;
862
863 rt = (struct rtable *) dst;
864
865 __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
866 ip_rt_fix_tos(&fl4);
867 __ip_do_redirect(rt, skb, &fl4, true);
868 }
869
870 static void ipv4_negative_advice(struct sock *sk,
871 struct dst_entry *dst)
872 {
873 struct rtable *rt = (struct rtable *)dst;
874
875 if ((dst->obsolete > 0) ||
876 (rt->rt_flags & RTCF_REDIRECTED) ||
877 rt->dst.expires)
878 sk_dst_reset(sk);
879 }
880
881 /*
882 * Algorithm:
883 * 1. The first ip_rt_redirect_number redirects are sent
884 * with exponential backoff, then we stop sending them at all,
885 * assuming that the host ignores our redirects.
886 * 2. If we did not see packets requiring redirects
887 * during ip_rt_redirect_silence, we assume that the host
888 * forgot the redirected route and start sending redirects again.
889 *
890 * This algorithm is much cheaper and more intelligent than dumb load limiting
891 * in icmp.c.
892 *
893 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
894 * and "frag. need" (breaks PMTU discovery) in icmp.c.
895 */
896
897 void ip_rt_send_redirect(struct sk_buff *skb)
898 {
899 struct rtable *rt = skb_rtable(skb);
900 struct in_device *in_dev;
901 struct inet_peer *peer;
902 struct net *net;
903 int log_martians;
904 int vif;
905
906 rcu_read_lock();
907 in_dev = __in_dev_get_rcu(rt->dst.dev);
908 if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
909 rcu_read_unlock();
910 return;
911 }
912 log_martians = IN_DEV_LOG_MARTIANS(in_dev);
913 vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
914 rcu_read_unlock();
915
916 net = dev_net(rt->dst.dev);
917 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
918 if (!peer) {
919 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
920 rt_nexthop(rt, ip_hdr(skb)->daddr));
921 return;
922 }
923
924 /* No redirected packets during ip_rt_redirect_silence;
925 * reset the algorithm.
926 */
927 if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
928 peer->rate_tokens = 0;
929 peer->n_redirects = 0;
930 }
931
932 /* Too many ignored redirects; do not send anything.
933 * Set rate_last to the last seen redirected packet.
934 */
935 if (peer->n_redirects >= ip_rt_redirect_number) {
936 peer->rate_last = jiffies;
937 goto out_put_peer;
938 }
939
940 /* Check for load limit; set rate_last to the latest sent
941 * redirect.
942 */
943 if (peer->n_redirects == 0 ||
944 time_after(jiffies,
945 (peer->rate_last +
946 (ip_rt_redirect_load << peer->n_redirects)))) {
947 __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
948
949 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
950 peer->rate_last = jiffies;
951 ++peer->n_redirects;
952 #ifdef CONFIG_IP_ROUTE_VERBOSE
953 if (log_martians &&
954 peer->n_redirects == ip_rt_redirect_number)
955 net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
956 &ip_hdr(skb)->saddr, inet_iif(skb),
957 &ip_hdr(skb)->daddr, &gw);
958 #endif
959 }
960 out_put_peer:
961 inet_putpeer(peer);
962 }
963
964 static int ip_error(struct sk_buff *skb)
965 {
966 struct rtable *rt = skb_rtable(skb);
967 struct net_device *dev = skb->dev;
968 struct in_device *in_dev;
969 struct inet_peer *peer;
970 unsigned long now;
971 struct net *net;
972 bool send;
973 int code;
974
975 if (netif_is_l3_master(skb->dev)) {
976 dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
977 if (!dev)
978 goto out;
979 }
980
981 in_dev = __in_dev_get_rcu(dev);
982
983 /* IP on this device is disabled. */
984 if (!in_dev)
985 goto out;
986
987 net = dev_net(rt->dst.dev);
988 if (!IN_DEV_FORWARD(in_dev)) {
989 switch (rt->dst.error) {
990 case EHOSTUNREACH:
991 __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
992 break;
993
994 case ENETUNREACH:
995 __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
996 break;
997 }
998 goto out;
999 }
1000
1001 switch (rt->dst.error) {
1002 case EINVAL:
1003 default:
1004 goto out;
1005 case EHOSTUNREACH:
1006 code = ICMP_HOST_UNREACH;
1007 break;
1008 case ENETUNREACH:
1009 code = ICMP_NET_UNREACH;
1010 __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
1011 break;
1012 case EACCES:
1013 code = ICMP_PKT_FILTERED;
1014 break;
1015 }
1016
1017 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
1018 l3mdev_master_ifindex(skb->dev), 1);
1019
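/* Rate-limit ICMP errors per source with a simple token bucket: one
 * token accrues per jiffy since rate_last, capped at ip_rt_error_burst,
 * and each error sent costs ip_rt_error_cost tokens.
 */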
1020 send = true;
1021 if (peer) {
1022 now = jiffies;
1023 peer->rate_tokens += now - peer->rate_last;
1024 if (peer->rate_tokens > ip_rt_error_burst)
1025 peer->rate_tokens = ip_rt_error_burst;
1026 peer->rate_last = now;
1027 if (peer->rate_tokens >= ip_rt_error_cost)
1028 peer->rate_tokens -= ip_rt_error_cost;
1029 else
1030 send = false;
1031 inet_putpeer(peer);
1032 }
1033 if (send)
1034 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1035
1036 out: kfree_skb(skb);
1037 return 0;
1038 }
1039
1040 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1041 {
1042 struct dst_entry *dst = &rt->dst;
1043 struct net *net = dev_net(dst->dev);
1044 struct fib_result res;
1045 bool lock = false;
1046 u32 old_mtu;
1047
1048 if (ip_mtu_locked(dst))
1049 return;
1050
1051 old_mtu = ipv4_mtu(dst);
1052 if (old_mtu < mtu)
1053 return;
1054
1055 if (mtu < ip_rt_min_pmtu) {
1056 lock = true;
1057 mtu = min(old_mtu, ip_rt_min_pmtu);
1058 }
1059
1060 if (rt->rt_pmtu == mtu && !lock &&
1061 time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
1062 return;
1063
1064 rcu_read_lock();
1065 if (fib_lookup(net, fl4, &res, 0) == 0) {
1066 struct fib_nh_common *nhc;
1067
1068 fib_select_path(net, &res, fl4, NULL);
1069 nhc = FIB_RES_NHC(res);
1070 update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
1071 jiffies + ip_rt_mtu_expires);
1072 }
1073 rcu_read_unlock();
1074 }
1075
1076 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1077 struct sk_buff *skb, u32 mtu,
1078 bool confirm_neigh)
1079 {
1080 struct rtable *rt = (struct rtable *) dst;
1081 struct flowi4 fl4;
1082
1083 ip_rt_build_flow_key(&fl4, sk, skb);
1084 ip_rt_fix_tos(&fl4);
1085
1086 /* Don't make lookup fail for bridged encapsulations */
1087 if (skb && netif_is_any_bridge_port(skb->dev))
1088 fl4.flowi4_oif = 0;
1089
1090 __ip_rt_update_pmtu(rt, &fl4, mtu);
1091 }
1092
1093 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1094 int oif, u8 protocol)
1095 {
1096 const struct iphdr *iph = (const struct iphdr *)skb->data;
1097 struct flowi4 fl4;
1098 struct rtable *rt;
1099 u32 mark = IP4_REPLY_MARK(net, skb->mark);
1100
1101 __build_flow_key(net, &fl4, NULL, iph, oif,
1102 RT_TOS(iph->tos), protocol, mark, 0);
1103 rt = __ip_route_output_key(net, &fl4);
1104 if (!IS_ERR(rt)) {
1105 __ip_rt_update_pmtu(rt, &fl4, mtu);
1106 ip_rt_put(rt);
1107 }
1108 }
1109 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1110
1111 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1112 {
1113 const struct iphdr *iph = (const struct iphdr *)skb->data;
1114 struct flowi4 fl4;
1115 struct rtable *rt;
1116
1117 __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1118
1119 if (!fl4.flowi4_mark)
1120 fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1121
1122 rt = __ip_route_output_key(sock_net(sk), &fl4);
1123 if (!IS_ERR(rt)) {
1124 __ip_rt_update_pmtu(rt, &fl4, mtu);
1125 ip_rt_put(rt);
1126 }
1127 }
1128
1129 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1130 {
1131 const struct iphdr *iph = (const struct iphdr *)skb->data;
1132 struct flowi4 fl4;
1133 struct rtable *rt;
1134 struct dst_entry *odst = NULL;
1135 bool new = false;
1136 struct net *net = sock_net(sk);
1137
1138 bh_lock_sock(sk);
1139
1140 if (!ip_sk_accept_pmtu(sk))
1141 goto out;
1142
1143 odst = sk_dst_get(sk);
1144
1145 if (sock_owned_by_user(sk) || !odst) {
1146 __ipv4_sk_update_pmtu(skb, sk, mtu);
1147 goto out;
1148 }
1149
1150 __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1151
1152 rt = (struct rtable *)odst;
1153 if (odst->obsolete && !odst->ops->check(odst, 0)) {
1154 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1155 if (IS_ERR(rt))
1156 goto out;
1157
1158 new = true;
1159 } else {
1160 ip_rt_fix_tos(&fl4);
1161 }
1162
1163 __ip_rt_update_pmtu((struct rtable *)xfrm_dst_path(&rt->dst), &fl4, mtu);
1164
1165 if (!dst_check(&rt->dst, 0)) {
1166 if (new)
1167 dst_release(&rt->dst);
1168
1169 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1170 if (IS_ERR(rt))
1171 goto out;
1172
1173 new = true;
1174 }
1175
1176 if (new)
1177 sk_dst_set(sk, &rt->dst);
1178
1179 out:
1180 bh_unlock_sock(sk);
1181 dst_release(odst);
1182 }
1183 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1184
1185 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1186 int oif, u8 protocol)
1187 {
1188 const struct iphdr *iph = (const struct iphdr *)skb->data;
1189 struct flowi4 fl4;
1190 struct rtable *rt;
1191
1192 __build_flow_key(net, &fl4, NULL, iph, oif,
1193 RT_TOS(iph->tos), protocol, 0, 0);
1194 rt = __ip_route_output_key(net, &fl4);
1195 if (!IS_ERR(rt)) {
1196 __ip_do_redirect(rt, skb, &fl4, false);
1197 ip_rt_put(rt);
1198 }
1199 }
1200 EXPORT_SYMBOL_GPL(ipv4_redirect);
1201
1202 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1203 {
1204 const struct iphdr *iph = (const struct iphdr *)skb->data;
1205 struct flowi4 fl4;
1206 struct rtable *rt;
1207 struct net *net = sock_net(sk);
1208
1209 __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1210 rt = __ip_route_output_key(net, &fl4);
1211 if (!IS_ERR(rt)) {
1212 __ip_do_redirect(rt, skb, &fl4, false);
1213 ip_rt_put(rt);
1214 }
1215 }
1216 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1217
1218 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1219 {
1220 struct rtable *rt = (struct rtable *) dst;
1221
1222 /* All IPV4 dsts are created with ->obsolete set to the value
1223 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1224 * into this function always.
1225 *
1226 * When a PMTU/redirect information update invalidates a route,
1227 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1228 * DST_OBSOLETE_DEAD.
1229 */
1230 if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1231 return NULL;
1232 return dst;
1233 }
1234
1235 static void ipv4_send_dest_unreach(struct sk_buff *skb)
1236 {
1237 struct net_device *dev;
1238 struct ip_options opt;
1239 int res;
1240
1241 /* Recompile ip options since IPCB may not be valid anymore.
1242 * Also check we have a reasonable ipv4 header.
1243 */
1244 if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
1245 ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
1246 return;
1247
1248 memset(&opt, 0, sizeof(opt));
1249 if (ip_hdr(skb)->ihl > 5) {
1250 if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
1251 return;
1252 opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);
1253
1254 rcu_read_lock();
1255 dev = skb->dev ? skb->dev : skb_rtable(skb)->dst.dev;
1256 res = __ip_options_compile(dev_net(dev), &opt, skb, NULL);
1257 rcu_read_unlock();
1258
1259 if (res)
1260 return;
1261 }
1262 __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
1263 }
1264
1265 static void ipv4_link_failure(struct sk_buff *skb)
1266 {
1267 struct rtable *rt;
1268
1269 ipv4_send_dest_unreach(skb);
1270
1271 rt = skb_rtable(skb);
1272 if (rt)
1273 dst_set_expires(&rt->dst, 0);
1274 }
1275
1276 static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1277 {
1278 pr_debug("%s: %pI4 -> %pI4, %s\n",
1279 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1280 skb->dev ? skb->dev->name : "?");
1281 kfree_skb(skb);
1282 WARN_ON(1);
1283 return 0;
1284 }
1285
1286 /*
1287 We do not cache the source address of the outgoing interface,
1288 because it is used only by the IP RR, TS and SRR options,
1289 so it is out of the fast path.
1290
1291 BTW remember: "addr" is allowed to be unaligned
1292 in IP options!
1293 */
1294
1295 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1296 {
1297 __be32 src;
1298
1299 if (rt_is_output_route(rt))
1300 src = ip_hdr(skb)->saddr;
1301 else {
1302 struct fib_result res;
1303 struct iphdr *iph = ip_hdr(skb);
1304 struct flowi4 fl4 = {
1305 .daddr = iph->daddr,
1306 .saddr = iph->saddr,
1307 .flowi4_tos = RT_TOS(iph->tos),
1308 .flowi4_oif = rt->dst.dev->ifindex,
1309 .flowi4_iif = skb->dev->ifindex,
1310 .flowi4_mark = skb->mark,
1311 };
1312
1313 rcu_read_lock();
1314 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1315 src = fib_result_prefsrc(dev_net(rt->dst.dev), &res);
1316 else
1317 src = inet_select_addr(rt->dst.dev,
1318 rt_nexthop(rt, iph->daddr),
1319 RT_SCOPE_UNIVERSE);
1320 rcu_read_unlock();
1321 }
1322 memcpy(addr, &src, 4);
1323 }
1324
1325 #ifdef CONFIG_IP_ROUTE_CLASSID
1326 static void set_class_tag(struct rtable *rt, u32 tag)
1327 {
1328 if (!(rt->dst.tclassid & 0xFFFF))
1329 rt->dst.tclassid |= tag & 0xFFFF;
1330 if (!(rt->dst.tclassid & 0xFFFF0000))
1331 rt->dst.tclassid |= tag & 0xFFFF0000;
1332 }
1333 #endif
1334
1335 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1336 {
1337 unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
1338 unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
1339 ip_rt_min_advmss);
1340
1341 return min(advmss, IPV4_MAX_PMTU - header_size);
1342 }
1343
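/* Effective MTU for this dst: an unexpired learned PMTU wins, then an
 * explicit RTAX_MTU metric, then the device MTU (clamped to 576 when the
 * MTU is locked and the route uses a gateway), capped at IP_MAX_MTU and
 * reduced by any lwtunnel encapsulation headroom.
 */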
1344 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1345 {
1346 const struct rtable *rt = (const struct rtable *)dst;
1347 unsigned int mtu = rt->rt_pmtu;
1348
1349 if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1350 mtu = dst_metric_raw(dst, RTAX_MTU);
1351
1352 if (mtu)
1353 goto out;
1354
1355 mtu = READ_ONCE(dst->dev->mtu);
1356
1357 if (unlikely(ip_mtu_locked(dst))) {
1358 if (rt->rt_uses_gateway && mtu > 576)
1359 mtu = 576;
1360 }
1361
1362 out:
1363 mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
1364
1365 return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1366 }
1367
1368 static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr)
1369 {
1370 struct fnhe_hash_bucket *hash;
1371 struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1372 u32 hval = fnhe_hashfun(daddr);
1373
1374 spin_lock_bh(&fnhe_lock);
1375
1376 hash = rcu_dereference_protected(nhc->nhc_exceptions,
1377 lockdep_is_held(&fnhe_lock));
1378 hash += hval;
1379
1380 fnhe_p = &hash->chain;
1381 fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1382 while (fnhe) {
1383 if (fnhe->fnhe_daddr == daddr) {
1384 rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1385 fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1386 /* set fnhe_daddr to 0 to ensure it won't bind with
1387 * new dsts in rt_bind_exception().
1388 */
1389 fnhe->fnhe_daddr = 0;
1390 fnhe_flush_routes(fnhe);
1391 kfree_rcu(fnhe, rcu);
1392 break;
1393 }
1394 fnhe_p = &fnhe->fnhe_next;
1395 fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1396 lockdep_is_held(&fnhe_lock));
1397 }
1398
1399 spin_unlock_bh(&fnhe_lock);
1400 }
1401
1402 static struct fib_nh_exception *find_exception(struct fib_nh_common *nhc,
1403 __be32 daddr)
1404 {
1405 struct fnhe_hash_bucket *hash = rcu_dereference(nhc->nhc_exceptions);
1406 struct fib_nh_exception *fnhe;
1407 u32 hval;
1408
1409 if (!hash)
1410 return NULL;
1411
1412 hval = fnhe_hashfun(daddr);
1413
1414 for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1415 fnhe = rcu_dereference(fnhe->fnhe_next)) {
1416 if (fnhe->fnhe_daddr == daddr) {
1417 if (fnhe->fnhe_expires &&
1418 time_after(jiffies, fnhe->fnhe_expires)) {
1419 ip_del_fnhe(nhc, daddr);
1420 break;
1421 }
1422 return fnhe;
1423 }
1424 }
1425 return NULL;
1426 }
1427
1428 /* MTU selection:
1429 * 1. mtu on route is locked - use it
1430 * 2. mtu from nexthop exception
1431 * 3. mtu from egress device
1432 */
1433
1434 u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
1435 {
1436 struct fib_nh_common *nhc = res->nhc;
1437 struct net_device *dev = nhc->nhc_dev;
1438 struct fib_info *fi = res->fi;
1439 u32 mtu = 0;
1440
1441 if (READ_ONCE(dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu) ||
1442 fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
1443 mtu = fi->fib_mtu;
1444
1445 if (likely(!mtu)) {
1446 struct fib_nh_exception *fnhe;
1447
1448 fnhe = find_exception(nhc, daddr);
1449 if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
1450 mtu = fnhe->fnhe_pmtu;
1451 }
1452
1453 if (likely(!mtu))
1454 mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);
1455
1456 return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu);
1457 }
1458
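/* Copy the state recorded in a nexthop exception (gateway, PMTU, expiry)
 * into @rt and, when @do_cache is set, also store @rt in the exception so
 * later lookups for this destination can reuse it directly.
 */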
1459 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1460 __be32 daddr, const bool do_cache)
1461 {
1462 bool ret = false;
1463
1464 spin_lock_bh(&fnhe_lock);
1465
1466 if (daddr == fnhe->fnhe_daddr) {
1467 struct rtable __rcu **porig;
1468 struct rtable *orig;
1469 int genid = fnhe_genid(dev_net(rt->dst.dev));
1470
1471 if (rt_is_input_route(rt))
1472 porig = &fnhe->fnhe_rth_input;
1473 else
1474 porig = &fnhe->fnhe_rth_output;
1475 orig = rcu_dereference(*porig);
1476
1477 if (fnhe->fnhe_genid != genid) {
1478 fnhe->fnhe_genid = genid;
1479 fnhe->fnhe_gw = 0;
1480 fnhe->fnhe_pmtu = 0;
1481 fnhe->fnhe_expires = 0;
1482 fnhe->fnhe_mtu_locked = false;
1483 fnhe_flush_routes(fnhe);
1484 orig = NULL;
1485 }
1486 fill_route_from_fnhe(rt, fnhe);
1487 if (!rt->rt_gw4) {
1488 rt->rt_gw4 = daddr;
1489 rt->rt_gw_family = AF_INET;
1490 }
1491
1492 if (do_cache) {
1493 dst_hold(&rt->dst);
1494 rcu_assign_pointer(*porig, rt);
1495 if (orig) {
1496 dst_dev_put(&orig->dst);
1497 dst_release(&orig->dst);
1498 }
1499 ret = true;
1500 }
1501
1502 fnhe->fnhe_stamp = jiffies;
1503 }
1504 spin_unlock_bh(&fnhe_lock);
1505
1506 return ret;
1507 }
1508
1509 static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt)
1510 {
1511 struct rtable *orig, *prev, **p;
1512 bool ret = true;
1513
1514 if (rt_is_input_route(rt)) {
1515 p = (struct rtable **)&nhc->nhc_rth_input;
1516 } else {
1517 p = (struct rtable **)raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
1518 }
1519 orig = *p;
1520
1521 /* hold dst before doing cmpxchg() to avoid race condition
1522 * on this dst
1523 */
1524 dst_hold(&rt->dst);
1525 prev = cmpxchg(p, orig, rt);
1526 if (prev == orig) {
1527 if (orig) {
1528 rt_add_uncached_list(orig);
1529 dst_release(&orig->dst);
1530 }
1531 } else {
1532 dst_release(&rt->dst);
1533 ret = false;
1534 }
1535
1536 return ret;
1537 }
1538
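/* Routes that could not be cached in the FIB nexthop are kept on a
 * per-CPU "uncached" list so rt_flush_dev() can still find them and
 * detach them from a device that is going away.
 */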
1539 struct uncached_list {
1540 spinlock_t lock;
1541 struct list_head head;
1542 };
1543
1544 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1545
1546 void rt_add_uncached_list(struct rtable *rt)
1547 {
1548 struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1549
1550 rt->rt_uncached_list = ul;
1551
1552 spin_lock_bh(&ul->lock);
1553 list_add_tail(&rt->rt_uncached, &ul->head);
1554 spin_unlock_bh(&ul->lock);
1555 }
1556
1557 void rt_del_uncached_list(struct rtable *rt)
1558 {
1559 if (!list_empty(&rt->rt_uncached)) {
1560 struct uncached_list *ul = rt->rt_uncached_list;
1561
1562 spin_lock_bh(&ul->lock);
1563 list_del(&rt->rt_uncached);
1564 spin_unlock_bh(&ul->lock);
1565 }
1566 }
1567
1568 static void ipv4_dst_destroy(struct dst_entry *dst)
1569 {
1570 struct rtable *rt = (struct rtable *)dst;
1571
1572 ip_dst_metrics_put(dst);
1573 rt_del_uncached_list(rt);
1574 }
1575
1576 void rt_flush_dev(struct net_device *dev)
1577 {
1578 struct rtable *rt;
1579 int cpu;
1580
1581 for_each_possible_cpu(cpu) {
1582 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1583
1584 spin_lock_bh(&ul->lock);
1585 list_for_each_entry(rt, &ul->head, rt_uncached) {
1586 if (rt->dst.dev != dev)
1587 continue;
1588 rt->dst.dev = blackhole_netdev;
1589 dev_hold(rt->dst.dev);
1590 dev_put(dev);
1591 }
1592 spin_unlock_bh(&ul->lock);
1593 }
1594 }
1595
1596 static bool rt_cache_valid(const struct rtable *rt)
1597 {
1598 return rt &&
1599 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1600 !rt_is_expired(rt);
1601 }
1602
1603 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1604 const struct fib_result *res,
1605 struct fib_nh_exception *fnhe,
1606 struct fib_info *fi, u16 type, u32 itag,
1607 const bool do_cache)
1608 {
1609 bool cached = false;
1610
1611 if (fi) {
1612 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1613
1614 if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) {
1615 rt->rt_uses_gateway = 1;
1616 rt->rt_gw_family = nhc->nhc_gw_family;
1617 /* only INET and INET6 are supported */
1618 if (likely(nhc->nhc_gw_family == AF_INET))
1619 rt->rt_gw4 = nhc->nhc_gw.ipv4;
1620 else
1621 rt->rt_gw6 = nhc->nhc_gw.ipv6;
1622 }
1623
1624 ip_dst_init_metrics(&rt->dst, fi->fib_metrics);
1625
1626 #ifdef CONFIG_IP_ROUTE_CLASSID
1627 if (nhc->nhc_family == AF_INET) {
1628 struct fib_nh *nh;
1629
1630 nh = container_of(nhc, struct fib_nh, nh_common);
1631 rt->dst.tclassid = nh->nh_tclassid;
1632 }
1633 #endif
1634 rt->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
1635 if (unlikely(fnhe))
1636 cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1637 else if (do_cache)
1638 cached = rt_cache_route(nhc, rt);
1639 if (unlikely(!cached)) {
1640 /* Routes we intend to cache in nexthop exception or
1641 * FIB nexthop have the DST_NOCACHE bit clear.
1642 * However, if we are unsuccessful at storing this
1643 * route into the cache we really need to set it.
1644 */
1645 if (!rt->rt_gw4) {
1646 rt->rt_gw_family = AF_INET;
1647 rt->rt_gw4 = daddr;
1648 }
1649 rt_add_uncached_list(rt);
1650 }
1651 } else
1652 rt_add_uncached_list(rt);
1653
1654 #ifdef CONFIG_IP_ROUTE_CLASSID
1655 #ifdef CONFIG_IP_MULTIPLE_TABLES
1656 set_class_tag(rt, res->tclassid);
1657 #endif
1658 set_class_tag(rt, itag);
1659 #endif
1660 }
1661
1662 struct rtable *rt_dst_alloc(struct net_device *dev,
1663 unsigned int flags, u16 type,
1664 bool nopolicy, bool noxfrm)
1665 {
1666 struct rtable *rt;
1667
1668 rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1669 (nopolicy ? DST_NOPOLICY : 0) |
1670 (noxfrm ? DST_NOXFRM : 0));
1671
1672 if (rt) {
1673 rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1674 rt->rt_flags = flags;
1675 rt->rt_type = type;
1676 rt->rt_is_input = 0;
1677 rt->rt_iif = 0;
1678 rt->rt_pmtu = 0;
1679 rt->rt_mtu_locked = 0;
1680 rt->rt_uses_gateway = 0;
1681 rt->rt_gw_family = 0;
1682 rt->rt_gw4 = 0;
1683 INIT_LIST_HEAD(&rt->rt_uncached);
1684
1685 rt->dst.output = ip_output;
1686 if (flags & RTCF_LOCAL)
1687 rt->dst.input = ip_local_deliver;
1688 }
1689
1690 return rt;
1691 }
1692 EXPORT_SYMBOL(rt_dst_alloc);
1693
1694 struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt)
1695 {
1696 struct rtable *new_rt;
1697
1698 new_rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1699 rt->dst.flags);
1700
1701 if (new_rt) {
1702 new_rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1703 new_rt->rt_flags = rt->rt_flags;
1704 new_rt->rt_type = rt->rt_type;
1705 new_rt->rt_is_input = rt->rt_is_input;
1706 new_rt->rt_iif = rt->rt_iif;
1707 new_rt->rt_pmtu = rt->rt_pmtu;
1708 new_rt->rt_mtu_locked = rt->rt_mtu_locked;
1709 new_rt->rt_gw_family = rt->rt_gw_family;
1710 if (rt->rt_gw_family == AF_INET)
1711 new_rt->rt_gw4 = rt->rt_gw4;
1712 else if (rt->rt_gw_family == AF_INET6)
1713 new_rt->rt_gw6 = rt->rt_gw6;
1714 INIT_LIST_HEAD(&new_rt->rt_uncached);
1715
1716 new_rt->dst.input = rt->dst.input;
1717 new_rt->dst.output = rt->dst.output;
1718 new_rt->dst.error = rt->dst.error;
1719 new_rt->dst.lastuse = jiffies;
1720 new_rt->dst.lwtstate = lwtstate_get(rt->dst.lwtstate);
1721 }
1722 return new_rt;
1723 }
1724 EXPORT_SYMBOL(rt_dst_clone);
1725
1726 /* called in rcu_read_lock() section */
1727 int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1728 u8 tos, struct net_device *dev,
1729 struct in_device *in_dev, u32 *itag)
1730 {
1731 int err;
1732
1733 /* Primary sanity checks. */
1734 if (!in_dev)
1735 return -EINVAL;
1736
1737 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1738 skb->protocol != htons(ETH_P_IP))
1739 return -EINVAL;
1740
1741 if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1742 return -EINVAL;
1743
1744 if (ipv4_is_zeronet(saddr)) {
1745 if (!ipv4_is_local_multicast(daddr) &&
1746 ip_hdr(skb)->protocol != IPPROTO_IGMP)
1747 return -EINVAL;
1748 } else {
1749 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1750 in_dev, itag);
1751 if (err < 0)
1752 return err;
1753 }
1754 return 0;
1755 }
1756
1757 /* called in rcu_read_lock() section */
1758 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1759 u8 tos, struct net_device *dev, int our)
1760 {
1761 struct in_device *in_dev = __in_dev_get_rcu(dev);
1762 unsigned int flags = RTCF_MULTICAST;
1763 struct rtable *rth;
1764 bool no_policy;
1765 u32 itag = 0;
1766 int err;
1767
1768 err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1769 if (err)
1770 return err;
1771
1772 if (our)
1773 flags |= RTCF_LOCAL;
1774
1775 no_policy = IN_DEV_ORCONF(in_dev, NOPOLICY);
1776 if (no_policy)
1777 IPCB(skb)->flags |= IPSKB_NOPOLICY;
1778
1779 rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1780 no_policy, false);
1781 if (!rth)
1782 return -ENOBUFS;
1783
1784 #ifdef CONFIG_IP_ROUTE_CLASSID
1785 rth->dst.tclassid = itag;
1786 #endif
1787 rth->dst.output = ip_rt_bug;
1788 rth->rt_is_input = 1;
1789
1790 #ifdef CONFIG_IP_MROUTE
1791 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1792 rth->dst.input = ip_mr_input;
1793 #endif
1794 RT_CACHE_STAT_INC(in_slow_mc);
1795
1796 skb_dst_drop(skb);
1797 skb_dst_set(skb, &rth->dst);
1798 return 0;
1799 }
1800
1801
1802 static void ip_handle_martian_source(struct net_device *dev,
1803 struct in_device *in_dev,
1804 struct sk_buff *skb,
1805 __be32 daddr,
1806 __be32 saddr)
1807 {
1808 RT_CACHE_STAT_INC(in_martian_src);
1809 #ifdef CONFIG_IP_ROUTE_VERBOSE
1810 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1811 /*
1812 * RFC1812 recommendation, if source is martian,
1813 * the only hint is MAC header.
1814 */
1815 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1816 &daddr, &saddr, dev->name);
1817 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1818 print_hex_dump(KERN_WARNING, "ll header: ",
1819 DUMP_PREFIX_OFFSET, 16, 1,
1820 skb_mac_header(skb),
1821 dev->hard_header_len, false);
1822 }
1823 }
1824 #endif
1825 }
1826
1827 /* called in rcu_read_lock() section */
1828 static int __mkroute_input(struct sk_buff *skb,
1829 const struct fib_result *res,
1830 struct in_device *in_dev,
1831 __be32 daddr, __be32 saddr, u32 tos)
1832 {
1833 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1834 struct net_device *dev = nhc->nhc_dev;
1835 struct fib_nh_exception *fnhe;
1836 struct rtable *rth;
1837 int err;
1838 struct in_device *out_dev;
1839 bool do_cache, no_policy;
1840 u32 itag = 0;
1841
1842 /* get a working reference to the output device */
1843 out_dev = __in_dev_get_rcu(dev);
1844 if (!out_dev) {
1845 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1846 return -EINVAL;
1847 }
1848
1849 err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1850 in_dev->dev, in_dev, &itag);
1851 if (err < 0) {
1852 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1853 saddr);
1854
1855 goto cleanup;
1856 }
1857
1858 do_cache = res->fi && !itag;
1859 if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1860 skb->protocol == htons(ETH_P_IP)) {
1861 __be32 gw;
1862
1863 gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0;
1864 if (IN_DEV_SHARED_MEDIA(out_dev) ||
1865 inet_addr_onlink(out_dev, saddr, gw))
1866 IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1867 }
1868
1869 if (skb->protocol != htons(ETH_P_IP)) {
1870 /* Not IP (i.e. ARP). Do not create a route if it is
1871 * invalid for proxy ARP. DNAT routes are always valid.
1872 *
1873 * The proxy ARP feature has been extended to allow ARP
1874 * replies back on the same interface, to support
1875 * Private VLAN switch technologies. See arp.c.
1876 */
1877 if (out_dev == in_dev &&
1878 IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1879 err = -EINVAL;
1880 goto cleanup;
1881 }
1882 }
1883
1884 no_policy = IN_DEV_ORCONF(in_dev, NOPOLICY);
1885 if (no_policy)
1886 IPCB(skb)->flags |= IPSKB_NOPOLICY;
1887
1888 fnhe = find_exception(nhc, daddr);
1889 if (do_cache) {
1890 if (fnhe)
1891 rth = rcu_dereference(fnhe->fnhe_rth_input);
1892 else
1893 rth = rcu_dereference(nhc->nhc_rth_input);
1894 if (rt_cache_valid(rth)) {
1895 skb_dst_set_noref(skb, &rth->dst);
1896 goto out;
1897 }
1898 }
1899
1900 rth = rt_dst_alloc(out_dev->dev, 0, res->type, no_policy,
1901 IN_DEV_ORCONF(out_dev, NOXFRM));
1902 if (!rth) {
1903 err = -ENOBUFS;
1904 goto cleanup;
1905 }
1906
1907 rth->rt_is_input = 1;
1908 RT_CACHE_STAT_INC(in_slow_tot);
1909
1910 rth->dst.input = ip_forward;
1911
1912 rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1913 do_cache);
1914 lwtunnel_set_redirect(&rth->dst);
1915 skb_dst_set(skb, &rth->dst);
1916 out:
1917 err = 0;
1918 cleanup:
1919 return err;
1920 }
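
/* Note on the IPSKB_DOREDIRECT path above: setting the flag only marks the
 * skb as a redirect candidate; whether an ICMP redirect is actually emitted
 * is decided later on the forwarding path (ip_forward()). IN_DEV_TX_REDIRECTS
 * is controlled by the send_redirects conf sysctls, e.g. (a sketch, assuming
 * the usual sysctl names):
 *
 *	sysctl -w net.ipv4.conf.all.send_redirects=0
 */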
1921
1922 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1923 /* To make ICMP packets follow the right flow, the multipath hash is
1924 * calculated from the inner IP addresses.
1925 */
1926 static void ip_multipath_l3_keys(const struct sk_buff *skb,
1927 struct flow_keys *hash_keys)
1928 {
1929 const struct iphdr *outer_iph = ip_hdr(skb);
1930 const struct iphdr *key_iph = outer_iph;
1931 const struct iphdr *inner_iph;
1932 const struct icmphdr *icmph;
1933 struct iphdr _inner_iph;
1934 struct icmphdr _icmph;
1935
1936 if (likely(outer_iph->protocol != IPPROTO_ICMP))
1937 goto out;
1938
1939 if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1940 goto out;
1941
1942 icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1943 &_icmph);
1944 if (!icmph)
1945 goto out;
1946
1947 if (!icmp_is_err(icmph->type))
1948 goto out;
1949
1950 inner_iph = skb_header_pointer(skb,
1951 outer_iph->ihl * 4 + sizeof(_icmph),
1952 sizeof(_inner_iph), &_inner_iph);
1953 if (!inner_iph)
1954 goto out;
1955
1956 key_iph = inner_iph;
1957 out:
1958 hash_keys->addrs.v4addrs.src = key_iph->saddr;
1959 hash_keys->addrs.v4addrs.dst = key_iph->daddr;
1960 }
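
/* Worked example of the above (addresses are illustrative only): an ICMP
 * error elicited by a flow 10.0.0.1 -> 10.0.1.1 carries that original header
 * as its payload, so hashing on the inner (10.0.0.1, 10.0.1.1) pair yields
 * the same multipath key as the flow's own packets. The error is therefore
 * steered to the same nexthop (e.g. the same stateful middlebox) as the flow
 * it refers to, instead of being spread by the outer ICMP addresses.
 */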
1961
1962 /* if skb is set it will be used and fl4 can be NULL */
1963 int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
1964 const struct sk_buff *skb, struct flow_keys *flkeys)
1965 {
1966 u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0;
1967 struct flow_keys hash_keys;
1968 u32 mhash;
1969
1970 switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1971 case 0:
1972 memset(&hash_keys, 0, sizeof(hash_keys));
1973 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1974 if (skb) {
1975 ip_multipath_l3_keys(skb, &hash_keys);
1976 } else {
1977 hash_keys.addrs.v4addrs.src = fl4->saddr;
1978 hash_keys.addrs.v4addrs.dst = fl4->daddr;
1979 }
1980 break;
1981 case 1:
1982 /* skb is currently provided only when forwarding */
1983 if (skb) {
1984 unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1985 struct flow_keys keys;
1986
1987 /* short-circuit if we already have L4 hash present */
1988 if (skb->l4_hash)
1989 return skb_get_hash_raw(skb) >> 1;
1990
1991 memset(&hash_keys, 0, sizeof(hash_keys));
1992
1993 if (!flkeys) {
1994 skb_flow_dissect_flow_keys(skb, &keys, flag);
1995 flkeys = &keys;
1996 }
1997
1998 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1999 hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
2000 hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
2001 hash_keys.ports.src = flkeys->ports.src;
2002 hash_keys.ports.dst = flkeys->ports.dst;
2003 hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
2004 } else {
2005 memset(&hash_keys, 0, sizeof(hash_keys));
2006 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2007 hash_keys.addrs.v4addrs.src = fl4->saddr;
2008 hash_keys.addrs.v4addrs.dst = fl4->daddr;
2009 hash_keys.ports.src = fl4->fl4_sport;
2010 hash_keys.ports.dst = fl4->fl4_dport;
2011 hash_keys.basic.ip_proto = fl4->flowi4_proto;
2012 }
2013 break;
2014 case 2:
2015 memset(&hash_keys, 0, sizeof(hash_keys));
2016 /* skb is currently provided only when forwarding */
2017 if (skb) {
2018 struct flow_keys keys;
2019
2020 skb_flow_dissect_flow_keys(skb, &keys, 0);
2021 /* Inner can be v4 or v6 */
2022 if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
2023 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2024 hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
2025 hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
2026 } else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
2027 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2028 hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
2029 hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
2030 hash_keys.tags.flow_label = keys.tags.flow_label;
2031 hash_keys.basic.ip_proto = keys.basic.ip_proto;
2032 } else {
2033 /* Same as case 0 */
2034 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2035 ip_multipath_l3_keys(skb, &hash_keys);
2036 }
2037 } else {
2038 /* Same as case 0 */
2039 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2040 hash_keys.addrs.v4addrs.src = fl4->saddr;
2041 hash_keys.addrs.v4addrs.dst = fl4->daddr;
2042 }
2043 break;
2044 }
2045 mhash = flow_hash_from_keys(&hash_keys);
2046
2047 if (multipath_hash)
2048 mhash = jhash_2words(mhash, multipath_hash, 0);
2049
2050 return mhash >> 1;
2051 }
2052 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
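
/* The policy switch in fib_multipath_hash() above corresponds to the
 * net.ipv4.fib_multipath_hash_policy sysctl. A rough summary of the cases
 * as implemented above:
 *
 *	0 - L3: source/destination addresses (inner header for ICMP errors)
 *	1 - L4: 5-tuple, using pre-dissected keys when available
 *	2 - L3 of the inner packet for encapsulated traffic, falling back
 *	    to the policy-0 behaviour otherwise
 *
 * e.g.	sysctl -w net.ipv4.fib_multipath_hash_policy=1
 */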
2053
2054 static int ip_mkroute_input(struct sk_buff *skb,
2055 struct fib_result *res,
2056 struct in_device *in_dev,
2057 __be32 daddr, __be32 saddr, u32 tos,
2058 struct flow_keys *hkeys)
2059 {
2060 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2061 if (res->fi && fib_info_num_path(res->fi) > 1) {
2062 int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
2063
2064 fib_select_multipath(res, h);
2065 IPCB(skb)->flags |= IPSKB_MULTIPATH;
2066 }
2067 #endif
2068
2069 /* create a routing cache entry */
2070 return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
2071 }
2072
2073 /* Implements the same saddr-related checks as ip_route_input_slow(),
2074 * assuming daddr is valid and the destination is not a local broadcast one.
2075 * Uses the provided hint instead of performing a route lookup.
2076 */
2077 int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2078 u8 tos, struct net_device *dev,
2079 const struct sk_buff *hint)
2080 {
2081 struct in_device *in_dev = __in_dev_get_rcu(dev);
2082 struct rtable *rt = skb_rtable(hint);
2083 struct net *net = dev_net(dev);
2084 int err = -EINVAL;
2085 u32 tag = 0;
2086
2087 if (!in_dev)
2088 return -EINVAL;
2089
2090 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2091 goto martian_source;
2092
2093 if (ipv4_is_zeronet(saddr))
2094 goto martian_source;
2095
2096 if (ipv4_is_loopback(saddr) && !IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2097 goto martian_source;
2098
2099 if (rt->rt_type != RTN_LOCAL)
2100 goto skip_validate_source;
2101
2102 tos &= IPTOS_RT_MASK;
2103 err = fib_validate_source(skb, saddr, daddr, tos, 0, dev, in_dev, &tag);
2104 if (err < 0)
2105 goto martian_source;
2106
2107 skip_validate_source:
2108 skb_dst_copy(skb, hint);
2109 return 0;
2110
2111 martian_source:
2112 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2113 return err;
2114 }
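
/* Usage sketch (caller names as assumed from the current in-tree users): the
 * hint is an earlier skb from the same receive batch whose dst has already
 * been resolved by the list-receive path. When the new packet shares its
 * destination with the hint, ip_route_use_hint() redoes only the source
 * checks above and copies the cached dst instead of repeating the full
 * fib_lookup() done by ip_route_input_slow().
 */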
2115
2116 /* get device for dst_alloc with local routes */
2117 static struct net_device *ip_rt_get_dev(struct net *net,
2118 const struct fib_result *res)
2119 {
2120 struct fib_nh_common *nhc = res->fi ? res->nhc : NULL;
2121 struct net_device *dev = NULL;
2122
2123 if (nhc)
2124 dev = l3mdev_master_dev_rcu(nhc->nhc_dev);
2125
2126 return dev ? : net->loopback_dev;
2127 }
2128
2129 /*
2130  * NOTE. We drop all packets that have a local source
2131  * address, because every properly looped-back packet
2132  * must already have the correct destination attached by the output routine.
2133  * Changes in the enforced policies must also be applied to
2134  * ip_route_use_hint().
2135  *
2136  * This approach solves two big problems:
2137  * 1. Non-simplex devices are handled properly.
2138  * 2. IP spoofing attempts are filtered with a 100% guarantee.
2139  * Called with rcu_read_lock().
2140  */
2141
2142 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2143 u8 tos, struct net_device *dev,
2144 struct fib_result *res)
2145 {
2146 struct in_device *in_dev = __in_dev_get_rcu(dev);
2147 struct flow_keys *flkeys = NULL, _flkeys;
2148 struct net *net = dev_net(dev);
2149 struct ip_tunnel_info *tun_info;
2150 int err = -EINVAL;
2151 unsigned int flags = 0;
2152 u32 itag = 0;
2153 struct rtable *rth;
2154 struct flowi4 fl4;
2155 bool do_cache = true;
2156 bool no_policy;
2157
2158 /* IP on this device is disabled. */
2159
2160 if (!in_dev)
2161 goto out;
2162
2163 /* Check for the weirdest martians, which cannot be detected
2164 by fib_lookup.
2165 */
2166
2167 tun_info = skb_tunnel_info(skb);
2168 if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2169 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
2170 else
2171 fl4.flowi4_tun_key.tun_id = 0;
2172 skb_dst_drop(skb);
2173
2174 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2175 goto martian_source;
2176
2177 res->fi = NULL;
2178 res->table = NULL;
2179 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2180 goto brd_input;
2181
2182 /* Accept zero addresses only for limited broadcast;
2183  * I do not even know whether to fix this or not. Waiting for complaints :-)
2184  */
2185 if (ipv4_is_zeronet(saddr))
2186 goto martian_source;
2187
2188 if (ipv4_is_zeronet(daddr))
2189 goto martian_destination;
2190
2191 /* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
2192  * and calls it at most once when daddr and/or saddr is a loopback address.
2193  */
2194 if (ipv4_is_loopback(daddr)) {
2195 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2196 goto martian_destination;
2197 } else if (ipv4_is_loopback(saddr)) {
2198 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2199 goto martian_source;
2200 }
2201
2202 /*
2203 * Now we are ready to route packet.
2204 */
2205 fl4.flowi4_oif = 0;
2206 fl4.flowi4_iif = dev->ifindex;
2207 fl4.flowi4_mark = skb->mark;
2208 fl4.flowi4_tos = tos;
2209 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2210 fl4.flowi4_flags = 0;
2211 fl4.daddr = daddr;
2212 fl4.saddr = saddr;
2213 fl4.flowi4_uid = sock_net_uid(net, NULL);
2214 fl4.flowi4_multipath_hash = 0;
2215
2216 if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
2217 flkeys = &_flkeys;
2218 } else {
2219 fl4.flowi4_proto = 0;
2220 fl4.fl4_sport = 0;
2221 fl4.fl4_dport = 0;
2222 }
2223
2224 err = fib_lookup(net, &fl4, res, 0);
2225 if (err != 0) {
2226 if (!IN_DEV_FORWARD(in_dev))
2227 err = -EHOSTUNREACH;
2228 goto no_route;
2229 }
2230
2231 if (res->type == RTN_BROADCAST) {
2232 if (IN_DEV_BFORWARD(in_dev))
2233 goto make_route;
2234 /* do not cache if bc_forwarding is enabled */
2235 if (IPV4_DEVCONF_ALL(net, BC_FORWARDING))
2236 do_cache = false;
2237 goto brd_input;
2238 }
2239
2240 if (res->type == RTN_LOCAL) {
2241 err = fib_validate_source(skb, saddr, daddr, tos,
2242 0, dev, in_dev, &itag);
2243 if (err < 0)
2244 goto martian_source;
2245 goto local_input;
2246 }
2247
2248 if (!IN_DEV_FORWARD(in_dev)) {
2249 err = -EHOSTUNREACH;
2250 goto no_route;
2251 }
2252 if (res->type != RTN_UNICAST)
2253 goto martian_destination;
2254
2255 make_route:
2256 err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
2257 out: return err;
2258
2259 brd_input:
2260 if (skb->protocol != htons(ETH_P_IP))
2261 goto e_inval;
2262
2263 if (!ipv4_is_zeronet(saddr)) {
2264 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2265 in_dev, &itag);
2266 if (err < 0)
2267 goto martian_source;
2268 }
2269 flags |= RTCF_BROADCAST;
2270 res->type = RTN_BROADCAST;
2271 RT_CACHE_STAT_INC(in_brd);
2272
2273 local_input:
2274 no_policy = IN_DEV_ORCONF(in_dev, NOPOLICY);
2275 if (no_policy)
2276 IPCB(skb)->flags |= IPSKB_NOPOLICY;
2277
2278 do_cache &= res->fi && !itag;
2279 if (do_cache) {
2280 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2281
2282 rth = rcu_dereference(nhc->nhc_rth_input);
2283 if (rt_cache_valid(rth)) {
2284 skb_dst_set_noref(skb, &rth->dst);
2285 err = 0;
2286 goto out;
2287 }
2288 }
2289
2290 rth = rt_dst_alloc(ip_rt_get_dev(net, res),
2291 flags | RTCF_LOCAL, res->type,
2292 no_policy, false);
2293 if (!rth)
2294 goto e_nobufs;
2295
2296 rth->dst.output = ip_rt_bug;
2297 #ifdef CONFIG_IP_ROUTE_CLASSID
2298 rth->dst.tclassid = itag;
2299 #endif
2300 rth->rt_is_input = 1;
2301
2302 RT_CACHE_STAT_INC(in_slow_tot);
2303 if (res->type == RTN_UNREACHABLE) {
2304 rth->dst.input = ip_error;
2305 rth->dst.error = -err;
2306 rth->rt_flags &= ~RTCF_LOCAL;
2307 }
2308
2309 if (do_cache) {
2310 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2311
2312 rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
2313 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2314 WARN_ON(rth->dst.input == lwtunnel_input);
2315 rth->dst.lwtstate->orig_input = rth->dst.input;
2316 rth->dst.input = lwtunnel_input;
2317 }
2318
2319 if (unlikely(!rt_cache_route(nhc, rth)))
2320 rt_add_uncached_list(rth);
2321 }
2322 skb_dst_set(skb, &rth->dst);
2323 err = 0;
2324 goto out;
2325
2326 no_route:
2327 RT_CACHE_STAT_INC(in_no_route);
2328 res->type = RTN_UNREACHABLE;
2329 res->fi = NULL;
2330 res->table = NULL;
2331 goto local_input;
2332
2333 /*
2334 * Do not cache martian addresses: they should be logged (RFC1812)
2335 */
2336 martian_destination:
2337 RT_CACHE_STAT_INC(in_martian_dst);
2338 #ifdef CONFIG_IP_ROUTE_VERBOSE
2339 if (IN_DEV_LOG_MARTIANS(in_dev))
2340 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2341 &daddr, &saddr, dev->name);
2342 #endif
2343
2344 e_inval:
2345 err = -EINVAL;
2346 goto out;
2347
2348 e_nobufs:
2349 err = -ENOBUFS;
2350 goto out;
2351
2352 martian_source:
2353 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2354 goto out;
2355 }
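
/* Summary of the classification order implemented above (no extra logic,
 * just a reading aid):
 *
 *	martian saddr/daddr checks -> fib_lookup()
 *	RTN_BROADCAST -> brd_input (forwarded instead if bc_forwarding is set)
 *	RTN_LOCAL     -> local_input, using the per-nexthop cache when possible
 *	RTN_UNICAST   -> ip_mkroute_input() and dst.input = ip_forward
 *	anything else -> martian destination
 */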
2356
2357 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2358 u8 tos, struct net_device *dev)
2359 {
2360 struct fib_result res;
2361 int err;
2362
2363 tos &= IPTOS_RT_MASK;
2364 rcu_read_lock();
2365 err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2366 rcu_read_unlock();
2367
2368 return err;
2369 }
2370 EXPORT_SYMBOL(ip_route_input_noref);
2371
2372 /* called with rcu_read_lock held */
2373 int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2374 u8 tos, struct net_device *dev, struct fib_result *res)
2375 {
2376 /* Multicast recognition logic was moved from the route cache to here.
2377    The problem was that too many Ethernet cards have broken/missing
2378    hardware multicast filters :-( As a result, a host on a multicast
2379    network acquires a lot of useless route cache entries, e.g. for
2380    SDR messages from all over the world. Now we try to get rid of them.
2381    Really, provided the software IP multicast filter is organized
2382    reasonably (at least, hashed), it does not result in a slowdown
2383    compared with route cache reject entries.
2384    Note that multicast routers are not affected, because
2385    a route cache entry is created eventually.
2386  */
2387 if (ipv4_is_multicast(daddr)) {
2388 struct in_device *in_dev = __in_dev_get_rcu(dev);
2389 int our = 0;
2390 int err = -EINVAL;
2391
2392 if (!in_dev)
2393 return err;
2394 our = ip_check_mc_rcu(in_dev, daddr, saddr,
2395 ip_hdr(skb)->protocol);
2396
2397 /* check l3 master if no match yet */
2398 if (!our && netif_is_l3_slave(dev)) {
2399 struct in_device *l3_in_dev;
2400
2401 l3_in_dev = __in_dev_get_rcu(skb->dev);
2402 if (l3_in_dev)
2403 our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2404 ip_hdr(skb)->protocol);
2405 }
2406
2407 if (our
2408 #ifdef CONFIG_IP_MROUTE
2409 ||
2410 (!ipv4_is_local_multicast(daddr) &&
2411 IN_DEV_MFORWARD(in_dev))
2412 #endif
2413 ) {
2414 err = ip_route_input_mc(skb, daddr, saddr,
2415 tos, dev, our);
2416 }
2417 return err;
2418 }
2419
2420 return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2421 }
2422
2423 /* called with rcu_read_lock() */
2424 static struct rtable *__mkroute_output(const struct fib_result *res,
2425 const struct flowi4 *fl4, int orig_oif,
2426 struct net_device *dev_out,
2427 unsigned int flags)
2428 {
2429 struct fib_info *fi = res->fi;
2430 struct fib_nh_exception *fnhe;
2431 struct in_device *in_dev;
2432 u16 type = res->type;
2433 struct rtable *rth;
2434 bool do_cache;
2435
2436 in_dev = __in_dev_get_rcu(dev_out);
2437 if (!in_dev)
2438 return ERR_PTR(-EINVAL);
2439
2440 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2441 if (ipv4_is_loopback(fl4->saddr) &&
2442 !(dev_out->flags & IFF_LOOPBACK) &&
2443 !netif_is_l3_master(dev_out))
2444 return ERR_PTR(-EINVAL);
2445
2446 if (ipv4_is_lbcast(fl4->daddr))
2447 type = RTN_BROADCAST;
2448 else if (ipv4_is_multicast(fl4->daddr))
2449 type = RTN_MULTICAST;
2450 else if (ipv4_is_zeronet(fl4->daddr))
2451 return ERR_PTR(-EINVAL);
2452
2453 if (dev_out->flags & IFF_LOOPBACK)
2454 flags |= RTCF_LOCAL;
2455
2456 do_cache = true;
2457 if (type == RTN_BROADCAST) {
2458 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2459 fi = NULL;
2460 } else if (type == RTN_MULTICAST) {
2461 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2462 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2463 fl4->flowi4_proto))
2464 flags &= ~RTCF_LOCAL;
2465 else
2466 do_cache = false;
2467 /* If a multicast route does not exist, use the
2468  * default one, but do not use a gateway in this case.
2469  * Yes, it is a hack.
2470  */
2471 if (fi && res->prefixlen < 4)
2472 fi = NULL;
2473 } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2474 (orig_oif != dev_out->ifindex)) {
2475 /* For local routes that require a particular output interface
2476 * we do not want to cache the result. Caching the result
2477 * causes incorrect behaviour when there are multiple source
2478 * addresses on the interface, the end result being that if the
2479 * intended recipient is waiting on that interface for the
2480 * packet he won't receive it because it will be delivered on
2481 * the loopback interface and the IP_PKTINFO ipi_ifindex will
2482 * be set to the loopback interface as well.
2483 */
2484 do_cache = false;
2485 }
2486
2487 fnhe = NULL;
2488 do_cache &= fi != NULL;
2489 if (fi) {
2490 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2491 struct rtable __rcu **prth;
2492
2493 fnhe = find_exception(nhc, fl4->daddr);
2494 if (!do_cache)
2495 goto add;
2496 if (fnhe) {
2497 prth = &fnhe->fnhe_rth_output;
2498 } else {
2499 if (unlikely(fl4->flowi4_flags &
2500 FLOWI_FLAG_KNOWN_NH &&
2501 !(nhc->nhc_gw_family &&
2502 nhc->nhc_scope == RT_SCOPE_LINK))) {
2503 do_cache = false;
2504 goto add;
2505 }
2506 prth = raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
2507 }
2508 rth = rcu_dereference(*prth);
2509 if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2510 return rth;
2511 }
2512
2513 add:
2514 rth = rt_dst_alloc(dev_out, flags, type,
2515 IN_DEV_ORCONF(in_dev, NOPOLICY),
2516 IN_DEV_ORCONF(in_dev, NOXFRM));
2517 if (!rth)
2518 return ERR_PTR(-ENOBUFS);
2519
2520 rth->rt_iif = orig_oif;
2521
2522 RT_CACHE_STAT_INC(out_slow_tot);
2523
2524 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2525 if (flags & RTCF_LOCAL &&
2526 !(dev_out->flags & IFF_LOOPBACK)) {
2527 rth->dst.output = ip_mc_output;
2528 RT_CACHE_STAT_INC(out_slow_mc);
2529 }
2530 #ifdef CONFIG_IP_MROUTE
2531 if (type == RTN_MULTICAST) {
2532 if (IN_DEV_MFORWARD(in_dev) &&
2533 !ipv4_is_local_multicast(fl4->daddr)) {
2534 rth->dst.input = ip_mr_input;
2535 rth->dst.output = ip_mc_output;
2536 }
2537 }
2538 #endif
2539 }
2540
2541 rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2542 lwtunnel_set_redirect(&rth->dst);
2543
2544 return rth;
2545 }
2546
2547 /*
2548 * Major route resolver routine.
2549 */
2550
2551 struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2552 const struct sk_buff *skb)
2553 {
2554 struct fib_result res = {
2555 .type = RTN_UNSPEC,
2556 .fi = NULL,
2557 .table = NULL,
2558 .tclassid = 0,
2559 };
2560 struct rtable *rth;
2561
2562 fl4->flowi4_iif = LOOPBACK_IFINDEX;
2563 ip_rt_fix_tos(fl4);
2564
2565 rcu_read_lock();
2566 rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2567 rcu_read_unlock();
2568
2569 return rth;
2570 }
2571 EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
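
/* Minimal output-lookup sketch for a hypothetical in-kernel caller, using
 * only the helpers declared in include/net/route.h (error handling trimmed):
 *
 *	struct flowi4 fl4 = {
 *		.daddr        = daddr,
 *		.flowi4_proto = IPPROTO_UDP,
 *	};
 *	struct rtable *rt = ip_route_output_key(net, &fl4);
 *
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 *	... transmit via &rt->dst; fl4.saddr now holds the chosen source ...
 *	ip_rt_put(rt);
 */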
2572
2573 struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2574 struct fib_result *res,
2575 const struct sk_buff *skb)
2576 {
2577 struct net_device *dev_out = NULL;
2578 int orig_oif = fl4->flowi4_oif;
2579 unsigned int flags = 0;
2580 struct rtable *rth;
2581 int err;
2582
2583 if (fl4->saddr) {
2584 if (ipv4_is_multicast(fl4->saddr) ||
2585 ipv4_is_lbcast(fl4->saddr) ||
2586 ipv4_is_zeronet(fl4->saddr)) {
2587 rth = ERR_PTR(-EINVAL);
2588 goto out;
2589 }
2590
2591 rth = ERR_PTR(-ENETUNREACH);
2592
2593 /* I removed the check for oif == dev_out->oif here.
2594    It was wrong for two reasons:
2595    1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2596       is assigned to multiple interfaces.
2597    2. Moreover, we are allowed to send packets with the saddr
2598       of another iface. --ANK
2599  */
2600
2601 if (fl4->flowi4_oif == 0 &&
2602 (ipv4_is_multicast(fl4->daddr) ||
2603 ipv4_is_lbcast(fl4->daddr))) {
2604 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2605 dev_out = __ip_dev_find(net, fl4->saddr, false);
2606 if (!dev_out)
2607 goto out;
2608
2609 /* Special hack: the user can direct multicasts
2610    and limited broadcast via the necessary interface
2611    without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2612    This hack is not just for fun, it allows
2613    vic, vat and friends to work.
2614    They bind the socket to loopback, set ttl to zero
2615    and expect that it will work.
2616    From the viewpoint of the routing cache they are broken,
2617    because we are not allowed to build a multicast path
2618    with a loopback source addr (look, the routing cache
2619    cannot know that ttl is zero, so that the packet
2620    will not leave this host and the route is valid).
2621    Luckily, this hack is a good workaround.
2622  */
2623
2624 fl4->flowi4_oif = dev_out->ifindex;
2625 goto make_route;
2626 }
2627
2628 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2629 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2630 if (!__ip_dev_find(net, fl4->saddr, false))
2631 goto out;
2632 }
2633 }
2634
2635
2636 if (fl4->flowi4_oif) {
2637 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2638 rth = ERR_PTR(-ENODEV);
2639 if (!dev_out)
2640 goto out;
2641
2642 /* RACE: Check return value of inet_select_addr instead. */
2643 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2644 rth = ERR_PTR(-ENETUNREACH);
2645 goto out;
2646 }
2647 if (ipv4_is_local_multicast(fl4->daddr) ||
2648 ipv4_is_lbcast(fl4->daddr) ||
2649 fl4->flowi4_proto == IPPROTO_IGMP) {
2650 if (!fl4->saddr)
2651 fl4->saddr = inet_select_addr(dev_out, 0,
2652 RT_SCOPE_LINK);
2653 goto make_route;
2654 }
2655 if (!fl4->saddr) {
2656 if (ipv4_is_multicast(fl4->daddr))
2657 fl4->saddr = inet_select_addr(dev_out, 0,
2658 fl4->flowi4_scope);
2659 else if (!fl4->daddr)
2660 fl4->saddr = inet_select_addr(dev_out, 0,
2661 RT_SCOPE_HOST);
2662 }
2663 }
2664
2665 if (!fl4->daddr) {
2666 fl4->daddr = fl4->saddr;
2667 if (!fl4->daddr)
2668 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2669 dev_out = net->loopback_dev;
2670 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2671 res->type = RTN_LOCAL;
2672 flags |= RTCF_LOCAL;
2673 goto make_route;
2674 }
2675
2676 err = fib_lookup(net, fl4, res, 0);
2677 if (err) {
2678 res->fi = NULL;
2679 res->table = NULL;
2680 if (fl4->flowi4_oif &&
2681 (ipv4_is_multicast(fl4->daddr) ||
2682 !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2683 /* Apparently, the routing tables are wrong. Assume
2684 that the destination is on-link.
2685
2686 WHY? DW.
2687 Because we are allowed to send to iface
2688 even if it has NO routes and NO assigned
2689 addresses. When oif is specified, routing
2690 tables are looked up with only one purpose:
2691 to catch if destination is gatewayed, rather than
2692 direct. Moreover, if MSG_DONTROUTE is set,
2693 we send packet, ignoring both routing tables
2694 and ifaddr state. --ANK
2695
2696
2697 We could make it even if oif is unknown,
2698 likely IPv6, but we do not.
2699 */
2700
2701 if (fl4->saddr == 0)
2702 fl4->saddr = inet_select_addr(dev_out, 0,
2703 RT_SCOPE_LINK);
2704 res->type = RTN_UNICAST;
2705 goto make_route;
2706 }
2707 rth = ERR_PTR(err);
2708 goto out;
2709 }
2710
2711 if (res->type == RTN_LOCAL) {
2712 if (!fl4->saddr) {
2713 if (res->fi->fib_prefsrc)
2714 fl4->saddr = res->fi->fib_prefsrc;
2715 else
2716 fl4->saddr = fl4->daddr;
2717 }
2718
2719 /* L3 master device is the loopback for that domain */
2720 dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2721 net->loopback_dev;
2722
2723 /* make sure orig_oif points to fib result device even
2724 * though packet rx/tx happens over loopback or l3mdev
2725 */
2726 orig_oif = FIB_RES_OIF(*res);
2727
2728 fl4->flowi4_oif = dev_out->ifindex;
2729 flags |= RTCF_LOCAL;
2730 goto make_route;
2731 }
2732
2733 fib_select_path(net, res, fl4, skb);
2734
2735 dev_out = FIB_RES_DEV(*res);
2736
2737 make_route:
2738 rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2739
2740 out:
2741 return rth;
2742 }
2743
2744 static struct dst_ops ipv4_dst_blackhole_ops = {
2745 .family = AF_INET,
2746 .default_advmss = ipv4_default_advmss,
2747 .neigh_lookup = ipv4_neigh_lookup,
2748 .check = dst_blackhole_check,
2749 .cow_metrics = dst_blackhole_cow_metrics,
2750 .update_pmtu = dst_blackhole_update_pmtu,
2751 .redirect = dst_blackhole_redirect,
2752 .mtu = dst_blackhole_mtu,
2753 };
2754
2755 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2756 {
2757 struct rtable *ort = (struct rtable *) dst_orig;
2758 struct rtable *rt;
2759
2760 rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2761 if (rt) {
2762 struct dst_entry *new = &rt->dst;
2763
2764 new->__use = 1;
2765 new->input = dst_discard;
2766 new->output = dst_discard_out;
2767
2768 new->dev = net->loopback_dev;
2769 if (new->dev)
2770 dev_hold(new->dev);
2771
2772 rt->rt_is_input = ort->rt_is_input;
2773 rt->rt_iif = ort->rt_iif;
2774 rt->rt_pmtu = ort->rt_pmtu;
2775 rt->rt_mtu_locked = ort->rt_mtu_locked;
2776
2777 rt->rt_genid = rt_genid_ipv4(net);
2778 rt->rt_flags = ort->rt_flags;
2779 rt->rt_type = ort->rt_type;
2780 rt->rt_uses_gateway = ort->rt_uses_gateway;
2781 rt->rt_gw_family = ort->rt_gw_family;
2782 if (rt->rt_gw_family == AF_INET)
2783 rt->rt_gw4 = ort->rt_gw4;
2784 else if (rt->rt_gw_family == AF_INET6)
2785 rt->rt_gw6 = ort->rt_gw6;
2786
2787 INIT_LIST_HEAD(&rt->rt_uncached);
2788 }
2789
2790 dst_release(dst_orig);
2791
2792 return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2793 }
2794
2795 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2796 const struct sock *sk)
2797 {
2798 struct rtable *rt = __ip_route_output_key(net, flp4);
2799
2800 if (IS_ERR(rt))
2801 return rt;
2802
2803 if (flp4->flowi4_proto) {
2804 flp4->flowi4_oif = rt->dst.dev->ifindex;
2805 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2806 flowi4_to_flowi(flp4),
2807 sk, 0);
2808 }
2809
2810 return rt;
2811 }
2812 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2813
2814 struct rtable *ip_route_output_tunnel(struct sk_buff *skb,
2815 struct net_device *dev,
2816 struct net *net, __be32 *saddr,
2817 const struct ip_tunnel_info *info,
2818 u8 protocol, bool use_cache)
2819 {
2820 #ifdef CONFIG_DST_CACHE
2821 struct dst_cache *dst_cache;
2822 #endif
2823 struct rtable *rt = NULL;
2824 struct flowi4 fl4;
2825 __u8 tos;
2826
2827 #ifdef CONFIG_DST_CACHE
2828 dst_cache = (struct dst_cache *)&info->dst_cache;
2829 if (use_cache) {
2830 rt = dst_cache_get_ip4(dst_cache, saddr);
2831 if (rt)
2832 return rt;
2833 }
2834 #endif
2835 memset(&fl4, 0, sizeof(fl4));
2836 fl4.flowi4_mark = skb->mark;
2837 fl4.flowi4_proto = protocol;
2838 fl4.daddr = info->key.u.ipv4.dst;
2839 fl4.saddr = info->key.u.ipv4.src;
2840 tos = info->key.tos;
2841 fl4.flowi4_tos = RT_TOS(tos);
2842
2843 rt = ip_route_output_key(net, &fl4);
2844 if (IS_ERR(rt)) {
2845 netdev_dbg(dev, "no route to %pI4\n", &fl4.daddr);
2846 return ERR_PTR(-ENETUNREACH);
2847 }
2848 if (rt->dst.dev == dev) { /* is this necessary? */
2849 netdev_dbg(dev, "circular route to %pI4\n", &fl4.daddr);
2850 ip_rt_put(rt);
2851 return ERR_PTR(-ELOOP);
2852 }
2853 #ifdef CONFIG_DST_CACHE
2854 if (use_cache)
2855 dst_cache_set_ip4(dst_cache, &rt->dst, fl4.saddr);
2856 #endif
2857 *saddr = fl4.saddr;
2858 return rt;
2859 }
2860 EXPORT_SYMBOL_GPL(ip_route_output_tunnel);
2861
2862 /* called with rcu_read_lock held */
2863 static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2864 struct rtable *rt, u32 table_id, struct flowi4 *fl4,
2865 struct sk_buff *skb, u32 portid, u32 seq,
2866 unsigned int flags)
2867 {
2868 struct rtmsg *r;
2869 struct nlmsghdr *nlh;
2870 unsigned long expires = 0;
2871 u32 error;
2872 u32 metrics[RTAX_MAX];
2873
2874 nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), flags);
2875 if (!nlh)
2876 return -EMSGSIZE;
2877
2878 r = nlmsg_data(nlh);
2879 r->rtm_family = AF_INET;
2880 r->rtm_dst_len = 32;
2881 r->rtm_src_len = 0;
2882 r->rtm_tos = fl4 ? fl4->flowi4_tos : 0;
2883 r->rtm_table = table_id < 256 ? table_id : RT_TABLE_COMPAT;
2884 if (nla_put_u32(skb, RTA_TABLE, table_id))
2885 goto nla_put_failure;
2886 r->rtm_type = rt->rt_type;
2887 r->rtm_scope = RT_SCOPE_UNIVERSE;
2888 r->rtm_protocol = RTPROT_UNSPEC;
2889 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2890 if (rt->rt_flags & RTCF_NOTIFY)
2891 r->rtm_flags |= RTM_F_NOTIFY;
2892 if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2893 r->rtm_flags |= RTCF_DOREDIRECT;
2894
2895 if (nla_put_in_addr(skb, RTA_DST, dst))
2896 goto nla_put_failure;
2897 if (src) {
2898 r->rtm_src_len = 32;
2899 if (nla_put_in_addr(skb, RTA_SRC, src))
2900 goto nla_put_failure;
2901 }
2902 if (rt->dst.dev &&
2903 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2904 goto nla_put_failure;
2905 #ifdef CONFIG_IP_ROUTE_CLASSID
2906 if (rt->dst.tclassid &&
2907 nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2908 goto nla_put_failure;
2909 #endif
2910 if (fl4 && !rt_is_input_route(rt) &&
2911 fl4->saddr != src) {
2912 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2913 goto nla_put_failure;
2914 }
2915 if (rt->rt_uses_gateway) {
2916 if (rt->rt_gw_family == AF_INET &&
2917 nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) {
2918 goto nla_put_failure;
2919 } else if (rt->rt_gw_family == AF_INET6) {
2920 int alen = sizeof(struct in6_addr);
2921 struct nlattr *nla;
2922 struct rtvia *via;
2923
2924 nla = nla_reserve(skb, RTA_VIA, alen + 2);
2925 if (!nla)
2926 goto nla_put_failure;
2927
2928 via = nla_data(nla);
2929 via->rtvia_family = AF_INET6;
2930 memcpy(via->rtvia_addr, &rt->rt_gw6, alen);
2931 }
2932 }
2933
2934 expires = rt->dst.expires;
2935 if (expires) {
2936 unsigned long now = jiffies;
2937
2938 if (time_before(now, expires))
2939 expires -= now;
2940 else
2941 expires = 0;
2942 }
2943
2944 memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2945 if (rt->rt_pmtu && expires)
2946 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2947 if (rt->rt_mtu_locked && expires)
2948 metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
2949 if (rtnetlink_put_metrics(skb, metrics) < 0)
2950 goto nla_put_failure;
2951
2952 if (fl4) {
2953 if (fl4->flowi4_mark &&
2954 nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2955 goto nla_put_failure;
2956
2957 if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2958 nla_put_u32(skb, RTA_UID,
2959 from_kuid_munged(current_user_ns(),
2960 fl4->flowi4_uid)))
2961 goto nla_put_failure;
2962
2963 if (rt_is_input_route(rt)) {
2964 #ifdef CONFIG_IP_MROUTE
2965 if (ipv4_is_multicast(dst) &&
2966 !ipv4_is_local_multicast(dst) &&
2967 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2968 int err = ipmr_get_route(net, skb,
2969 fl4->saddr, fl4->daddr,
2970 r, portid);
2971
2972 if (err <= 0) {
2973 if (err == 0)
2974 return 0;
2975 goto nla_put_failure;
2976 }
2977 } else
2978 #endif
2979 if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
2980 goto nla_put_failure;
2981 }
2982 }
2983
2984 error = rt->dst.error;
2985
2986 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2987 goto nla_put_failure;
2988
2989 nlmsg_end(skb, nlh);
2990 return 0;
2991
2992 nla_put_failure:
2993 nlmsg_cancel(skb, nlh);
2994 return -EMSGSIZE;
2995 }
2996
2997 static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb,
2998 struct netlink_callback *cb, u32 table_id,
2999 struct fnhe_hash_bucket *bucket, int genid,
3000 int *fa_index, int fa_start, unsigned int flags)
3001 {
3002 int i;
3003
3004 for (i = 0; i < FNHE_HASH_SIZE; i++) {
3005 struct fib_nh_exception *fnhe;
3006
3007 for (fnhe = rcu_dereference(bucket[i].chain); fnhe;
3008 fnhe = rcu_dereference(fnhe->fnhe_next)) {
3009 struct rtable *rt;
3010 int err;
3011
3012 if (*fa_index < fa_start)
3013 goto next;
3014
3015 if (fnhe->fnhe_genid != genid)
3016 goto next;
3017
3018 if (fnhe->fnhe_expires &&
3019 time_after(jiffies, fnhe->fnhe_expires))
3020 goto next;
3021
3022 rt = rcu_dereference(fnhe->fnhe_rth_input);
3023 if (!rt)
3024 rt = rcu_dereference(fnhe->fnhe_rth_output);
3025 if (!rt)
3026 goto next;
3027
3028 err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt,
3029 table_id, NULL, skb,
3030 NETLINK_CB(cb->skb).portid,
3031 cb->nlh->nlmsg_seq, flags);
3032 if (err)
3033 return err;
3034 next:
3035 (*fa_index)++;
3036 }
3037 }
3038
3039 return 0;
3040 }
3041
3042 int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
3043 u32 table_id, struct fib_info *fi,
3044 int *fa_index, int fa_start, unsigned int flags)
3045 {
3046 struct net *net = sock_net(cb->skb->sk);
3047 int nhsel, genid = fnhe_genid(net);
3048
3049 for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) {
3050 struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel);
3051 struct fnhe_hash_bucket *bucket;
3052 int err;
3053
3054 if (nhc->nhc_flags & RTNH_F_DEAD)
3055 continue;
3056
3057 rcu_read_lock();
3058 bucket = rcu_dereference(nhc->nhc_exceptions);
3059 err = 0;
3060 if (bucket)
3061 err = fnhe_dump_bucket(net, skb, cb, table_id, bucket,
3062 genid, fa_index, fa_start,
3063 flags);
3064 rcu_read_unlock();
3065 if (err)
3066 return err;
3067 }
3068
3069 return 0;
3070 }
3071
3072 static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
3073 u8 ip_proto, __be16 sport,
3074 __be16 dport)
3075 {
3076 struct sk_buff *skb;
3077 struct iphdr *iph;
3078
3079 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3080 if (!skb)
3081 return NULL;
3082
3083 /* Reserve room for dummy headers; this skb can pass
3084  * through a good chunk of the routing engine.
3085  */
3086 skb_reset_mac_header(skb);
3087 skb_reset_network_header(skb);
3088 skb->protocol = htons(ETH_P_IP);
3089 iph = skb_put(skb, sizeof(struct iphdr));
3090 iph->protocol = ip_proto;
3091 iph->saddr = src;
3092 iph->daddr = dst;
3093 iph->version = 0x4;
3094 iph->frag_off = 0;
3095 iph->ihl = 0x5;
3096 skb_set_transport_header(skb, skb->len);
3097
3098 switch (iph->protocol) {
3099 case IPPROTO_UDP: {
3100 struct udphdr *udph;
3101
3102 udph = skb_put_zero(skb, sizeof(struct udphdr));
3103 udph->source = sport;
3104 udph->dest = dport;
3105 udph->len = htons(sizeof(struct udphdr));
3106 udph->check = 0;
3107 break;
3108 }
3109 case IPPROTO_TCP: {
3110 struct tcphdr *tcph;
3111
3112 tcph = skb_put_zero(skb, sizeof(struct tcphdr));
3113 tcph->source = sport;
3114 tcph->dest = dport;
3115 tcph->doff = sizeof(struct tcphdr) / 4;
3116 tcph->rst = 1;
3117 tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
3118 src, dst, 0);
3119 break;
3120 }
3121 case IPPROTO_ICMP: {
3122 struct icmphdr *icmph;
3123
3124 icmph = skb_put_zero(skb, sizeof(struct icmphdr));
3125 icmph->type = ICMP_ECHO;
3126 icmph->code = 0;
3127 }
3128 }
3129
3130 return skb;
3131 }
3132
3133 static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
3134 const struct nlmsghdr *nlh,
3135 struct nlattr **tb,
3136 struct netlink_ext_ack *extack)
3137 {
3138 struct rtmsg *rtm;
3139 int i, err;
3140
3141 if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
3142 NL_SET_ERR_MSG(extack,
3143 "ipv4: Invalid header for route get request");
3144 return -EINVAL;
3145 }
3146
3147 if (!netlink_strict_get_check(skb))
3148 return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
3149 rtm_ipv4_policy, extack);
3150
3151 rtm = nlmsg_data(nlh);
3152 if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
3153 (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
3154 rtm->rtm_table || rtm->rtm_protocol ||
3155 rtm->rtm_scope || rtm->rtm_type) {
3156 NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request");
3157 return -EINVAL;
3158 }
3159
3160 if (rtm->rtm_flags & ~(RTM_F_NOTIFY |
3161 RTM_F_LOOKUP_TABLE |
3162 RTM_F_FIB_MATCH)) {
3163 NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request");
3164 return -EINVAL;
3165 }
3166
3167 err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
3168 rtm_ipv4_policy, extack);
3169 if (err)
3170 return err;
3171
3172 if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
3173 (tb[RTA_DST] && !rtm->rtm_dst_len)) {
3174 NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4");
3175 return -EINVAL;
3176 }
3177
3178 for (i = 0; i <= RTA_MAX; i++) {
3179 if (!tb[i])
3180 continue;
3181
3182 switch (i) {
3183 case RTA_IIF:
3184 case RTA_OIF:
3185 case RTA_SRC:
3186 case RTA_DST:
3187 case RTA_IP_PROTO:
3188 case RTA_SPORT:
3189 case RTA_DPORT:
3190 case RTA_MARK:
3191 case RTA_UID:
3192 break;
3193 default:
3194 NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request");
3195 return -EINVAL;
3196 }
3197 }
3198
3199 return 0;
3200 }
3201
3202 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
3203 struct netlink_ext_ack *extack)
3204 {
3205 struct net *net = sock_net(in_skb->sk);
3206 struct nlattr *tb[RTA_MAX+1];
3207 u32 table_id = RT_TABLE_MAIN;
3208 __be16 sport = 0, dport = 0;
3209 struct fib_result res = {};
3210 u8 ip_proto = IPPROTO_UDP;
3211 struct rtable *rt = NULL;
3212 struct sk_buff *skb;
3213 struct rtmsg *rtm;
3214 struct flowi4 fl4 = {};
3215 __be32 dst = 0;
3216 __be32 src = 0;
3217 kuid_t uid;
3218 u32 iif;
3219 int err;
3220 int mark;
3221
3222 err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
3223 if (err < 0)
3224 return err;
3225
3226 rtm = nlmsg_data(nlh);
3227 src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
3228 dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
3229 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3230 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3231 if (tb[RTA_UID])
3232 uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
3233 else
3234 uid = (iif ? INVALID_UID : current_uid());
3235
3236 if (tb[RTA_IP_PROTO]) {
3237 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
3238 &ip_proto, AF_INET, extack);
3239 if (err)
3240 return err;
3241 }
3242
3243 if (tb[RTA_SPORT])
3244 sport = nla_get_be16(tb[RTA_SPORT]);
3245
3246 if (tb[RTA_DPORT])
3247 dport = nla_get_be16(tb[RTA_DPORT]);
3248
3249 skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
3250 if (!skb)
3251 return -ENOBUFS;
3252
3253 fl4.daddr = dst;
3254 fl4.saddr = src;
3255 fl4.flowi4_tos = rtm->rtm_tos & IPTOS_RT_MASK;
3256 fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
3257 fl4.flowi4_mark = mark;
3258 fl4.flowi4_uid = uid;
3259 if (sport)
3260 fl4.fl4_sport = sport;
3261 if (dport)
3262 fl4.fl4_dport = dport;
3263 fl4.flowi4_proto = ip_proto;
3264
3265 rcu_read_lock();
3266
3267 if (iif) {
3268 struct net_device *dev;
3269
3270 dev = dev_get_by_index_rcu(net, iif);
3271 if (!dev) {
3272 err = -ENODEV;
3273 goto errout_rcu;
3274 }
3275
3276 fl4.flowi4_iif = iif; /* for rt_fill_info */
3277 skb->dev = dev;
3278 skb->mark = mark;
3279 err = ip_route_input_rcu(skb, dst, src,
3280 rtm->rtm_tos & IPTOS_RT_MASK, dev,
3281 &res);
3282
3283 rt = skb_rtable(skb);
3284 if (err == 0 && rt->dst.error)
3285 err = -rt->dst.error;
3286 } else {
3287 fl4.flowi4_iif = LOOPBACK_IFINDEX;
3288 skb->dev = net->loopback_dev;
3289 rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
3290 err = 0;
3291 if (IS_ERR(rt))
3292 err = PTR_ERR(rt);
3293 else
3294 skb_dst_set(skb, &rt->dst);
3295 }
3296
3297 if (err)
3298 goto errout_rcu;
3299
3300 if (rtm->rtm_flags & RTM_F_NOTIFY)
3301 rt->rt_flags |= RTCF_NOTIFY;
3302
3303 if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
3304 table_id = res.table ? res.table->tb_id : 0;
3305
3306 /* reset skb for netlink reply msg */
3307 skb_trim(skb, 0);
3308 skb_reset_network_header(skb);
3309 skb_reset_transport_header(skb);
3310 skb_reset_mac_header(skb);
3311
3312 if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
3313 struct fib_rt_info fri;
3314
3315 if (!res.fi) {
3316 err = fib_props[res.type].error;
3317 if (!err)
3318 err = -EHOSTUNREACH;
3319 goto errout_rcu;
3320 }
3321 fri.fi = res.fi;
3322 fri.tb_id = table_id;
3323 fri.dst = res.prefix;
3324 fri.dst_len = res.prefixlen;
3325 fri.tos = fl4.flowi4_tos;
3326 fri.type = rt->rt_type;
3327 fri.offload = 0;
3328 fri.trap = 0;
3329 if (res.fa_head) {
3330 struct fib_alias *fa;
3331
3332 hlist_for_each_entry_rcu(fa, res.fa_head, fa_list) {
3333 u8 slen = 32 - fri.dst_len;
3334
3335 if (fa->fa_slen == slen &&
3336 fa->tb_id == fri.tb_id &&
3337 fa->fa_tos == fri.tos &&
3338 fa->fa_info == res.fi &&
3339 fa->fa_type == fri.type) {
3340 fri.offload = fa->offload;
3341 fri.trap = fa->trap;
3342 break;
3343 }
3344 }
3345 }
3346 err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
3347 nlh->nlmsg_seq, RTM_NEWROUTE, &fri, 0);
3348 } else {
3349 err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
3350 NETLINK_CB(in_skb).portid,
3351 nlh->nlmsg_seq, 0);
3352 }
3353 if (err < 0)
3354 goto errout_rcu;
3355
3356 rcu_read_unlock();
3357
3358 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
3359
3360 errout_free:
3361 return err;
3362 errout_rcu:
3363 rcu_read_unlock();
3364 kfree_skb(skb);
3365 goto errout_free;
3366 }
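
/* This handler is what "ip route get" talks to from user space. Example
 * invocations (iproute2 option names; addresses are illustrative):
 *
 *	ip route get 192.0.2.1
 *	ip route get 192.0.2.1 from 198.51.100.1 iif eth0
 *	ip route get 192.0.2.1 fibmatch		(RTM_F_FIB_MATCH path above)
 */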
3367
3368 void ip_rt_multicast_event(struct in_device *in_dev)
3369 {
3370 rt_cache_flush(dev_net(in_dev->dev));
3371 }
3372
3373 #ifdef CONFIG_SYSCTL
3374 static int ip_rt_gc_interval __read_mostly = 60 * HZ;
3375 static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
3376 static int ip_rt_gc_elasticity __read_mostly = 8;
3377 static int ip_min_valid_pmtu __read_mostly = IPV4_MIN_MTU;
3378
3379 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
3380 void *buffer, size_t *lenp, loff_t *ppos)
3381 {
3382 struct net *net = (struct net *)__ctl->extra1;
3383
3384 if (write) {
3385 rt_cache_flush(net);
3386 fnhe_genid_bump(net);
3387 return 0;
3388 }
3389
3390 return -EINVAL;
3391 }
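
/* Write-only knob: writing any value flushes the routing cache and bumps the
 * fnhe generation id for this netns; reads fail with -EINVAL by design (the
 * entry is registered with mode 0200 below), e.g.:
 *
 *	echo 1 > /proc/sys/net/ipv4/route/flush
 */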
3392
3393 static struct ctl_table ipv4_route_table[] = {
3394 {
3395 .procname = "gc_thresh",
3396 .data = &ipv4_dst_ops.gc_thresh,
3397 .maxlen = sizeof(int),
3398 .mode = 0644,
3399 .proc_handler = proc_dointvec,
3400 },
3401 {
3402 .procname = "max_size",
3403 .data = &ip_rt_max_size,
3404 .maxlen = sizeof(int),
3405 .mode = 0644,
3406 .proc_handler = proc_dointvec,
3407 },
3408 {
3409 /* Deprecated. Use gc_min_interval_ms */
3410
3411 .procname = "gc_min_interval",
3412 .data = &ip_rt_gc_min_interval,
3413 .maxlen = sizeof(int),
3414 .mode = 0644,
3415 .proc_handler = proc_dointvec_jiffies,
3416 },
3417 {
3418 .procname = "gc_min_interval_ms",
3419 .data = &ip_rt_gc_min_interval,
3420 .maxlen = sizeof(int),
3421 .mode = 0644,
3422 .proc_handler = proc_dointvec_ms_jiffies,
3423 },
3424 {
3425 .procname = "gc_timeout",
3426 .data = &ip_rt_gc_timeout,
3427 .maxlen = sizeof(int),
3428 .mode = 0644,
3429 .proc_handler = proc_dointvec_jiffies,
3430 },
3431 {
3432 .procname = "gc_interval",
3433 .data = &ip_rt_gc_interval,
3434 .maxlen = sizeof(int),
3435 .mode = 0644,
3436 .proc_handler = proc_dointvec_jiffies,
3437 },
3438 {
3439 .procname = "redirect_load",
3440 .data = &ip_rt_redirect_load,
3441 .maxlen = sizeof(int),
3442 .mode = 0644,
3443 .proc_handler = proc_dointvec,
3444 },
3445 {
3446 .procname = "redirect_number",
3447 .data = &ip_rt_redirect_number,
3448 .maxlen = sizeof(int),
3449 .mode = 0644,
3450 .proc_handler = proc_dointvec,
3451 },
3452 {
3453 .procname = "redirect_silence",
3454 .data = &ip_rt_redirect_silence,
3455 .maxlen = sizeof(int),
3456 .mode = 0644,
3457 .proc_handler = proc_dointvec,
3458 },
3459 {
3460 .procname = "error_cost",
3461 .data = &ip_rt_error_cost,
3462 .maxlen = sizeof(int),
3463 .mode = 0644,
3464 .proc_handler = proc_dointvec,
3465 },
3466 {
3467 .procname = "error_burst",
3468 .data = &ip_rt_error_burst,
3469 .maxlen = sizeof(int),
3470 .mode = 0644,
3471 .proc_handler = proc_dointvec,
3472 },
3473 {
3474 .procname = "gc_elasticity",
3475 .data = &ip_rt_gc_elasticity,
3476 .maxlen = sizeof(int),
3477 .mode = 0644,
3478 .proc_handler = proc_dointvec,
3479 },
3480 {
3481 .procname = "mtu_expires",
3482 .data = &ip_rt_mtu_expires,
3483 .maxlen = sizeof(int),
3484 .mode = 0644,
3485 .proc_handler = proc_dointvec_jiffies,
3486 },
3487 {
3488 .procname = "min_pmtu",
3489 .data = &ip_rt_min_pmtu,
3490 .maxlen = sizeof(int),
3491 .mode = 0644,
3492 .proc_handler = proc_dointvec_minmax,
3493 .extra1 = &ip_min_valid_pmtu,
3494 },
3495 {
3496 .procname = "min_adv_mss",
3497 .data = &ip_rt_min_advmss,
3498 .maxlen = sizeof(int),
3499 .mode = 0644,
3500 .proc_handler = proc_dointvec,
3501 },
3502 { }
3503 };
3504
3505 static const char ipv4_route_flush_procname[] = "flush";
3506
3507 static struct ctl_table ipv4_route_flush_table[] = {
3508 {
3509 .procname = ipv4_route_flush_procname,
3510 .maxlen = sizeof(int),
3511 .mode = 0200,
3512 .proc_handler = ipv4_sysctl_rtcache_flush,
3513 },
3514 { },
3515 };
3516
3517 static __net_init int sysctl_route_net_init(struct net *net)
3518 {
3519 struct ctl_table *tbl;
3520
3521 tbl = ipv4_route_flush_table;
3522 if (!net_eq(net, &init_net)) {
3523 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3524 if (!tbl)
3525 goto err_dup;
3526
3527 /* Don't export non-whitelisted sysctls to unprivileged users */
3528 if (net->user_ns != &init_user_ns) {
3529 if (tbl[0].procname != ipv4_route_flush_procname)
3530 tbl[0].procname = NULL;
3531 }
3532 }
3533 tbl[0].extra1 = net;
3534
3535 net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
3536 if (!net->ipv4.route_hdr)
3537 goto err_reg;
3538 return 0;
3539
3540 err_reg:
3541 if (tbl != ipv4_route_flush_table)
3542 kfree(tbl);
3543 err_dup:
3544 return -ENOMEM;
3545 }
3546
3547 static __net_exit void sysctl_route_net_exit(struct net *net)
3548 {
3549 struct ctl_table *tbl;
3550
3551 tbl = net->ipv4.route_hdr->ctl_table_arg;
3552 unregister_net_sysctl_table(net->ipv4.route_hdr);
3553 BUG_ON(tbl == ipv4_route_flush_table);
3554 kfree(tbl);
3555 }
3556
3557 static __net_initdata struct pernet_operations sysctl_route_ops = {
3558 .init = sysctl_route_net_init,
3559 .exit = sysctl_route_net_exit,
3560 };
3561 #endif
3562
3563 static __net_init int rt_genid_init(struct net *net)
3564 {
3565 atomic_set(&net->ipv4.rt_genid, 0);
3566 atomic_set(&net->fnhe_genid, 0);
3567 atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
3568 return 0;
3569 }
3570
3571 static __net_initdata struct pernet_operations rt_genid_ops = {
3572 .init = rt_genid_init,
3573 };
3574
3575 static int __net_init ipv4_inetpeer_init(struct net *net)
3576 {
3577 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3578
3579 if (!bp)
3580 return -ENOMEM;
3581 inet_peer_base_init(bp);
3582 net->ipv4.peers = bp;
3583 return 0;
3584 }
3585
3586 static void __net_exit ipv4_inetpeer_exit(struct net *net)
3587 {
3588 struct inet_peer_base *bp = net->ipv4.peers;
3589
3590 net->ipv4.peers = NULL;
3591 inetpeer_invalidate_tree(bp);
3592 kfree(bp);
3593 }
3594
3595 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3596 .init = ipv4_inetpeer_init,
3597 .exit = ipv4_inetpeer_exit,
3598 };
3599
3600 #ifdef CONFIG_IP_ROUTE_CLASSID
3601 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3602 #endif /* CONFIG_IP_ROUTE_CLASSID */
3603
3604 int __init ip_rt_init(void)
3605 {
3606 void *idents_hash;
3607 int cpu;
3608
3609 /* For modern hosts, this will use 2 MB of memory */
3610 idents_hash = alloc_large_system_hash("IP idents",
3611 sizeof(*ip_idents) + sizeof(*ip_tstamps),
3612 0,
3613 16, /* one bucket per 64 KB */
3614 HASH_ZERO,
3615 NULL,
3616 &ip_idents_mask,
3617 2048,
3618 256*1024);
3619
3620 ip_idents = idents_hash;
3621
3622 prandom_bytes(ip_idents, (ip_idents_mask + 1) * sizeof(*ip_idents));
3623
3624 ip_tstamps = idents_hash + (ip_idents_mask + 1) * sizeof(*ip_idents);
3625
3626 for_each_possible_cpu(cpu) {
3627 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3628
3629 INIT_LIST_HEAD(&ul->head);
3630 spin_lock_init(&ul->lock);
3631 }
3632 #ifdef CONFIG_IP_ROUTE_CLASSID
3633 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3634 if (!ip_rt_acct)
3635 panic("IP: failed to allocate ip_rt_acct\n");
3636 #endif
3637
3638 ipv4_dst_ops.kmem_cachep =
3639 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3640 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3641
3642 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3643
3644 if (dst_entries_init(&ipv4_dst_ops) < 0)
3645 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3646
3647 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3648 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3649
3650 ipv4_dst_ops.gc_thresh = ~0;
3651 ip_rt_max_size = INT_MAX;
3652
3653 devinet_init();
3654 ip_fib_init();
3655
3656 if (ip_rt_proc_init())
3657 pr_err("Unable to create route proc files\n");
3658 #ifdef CONFIG_XFRM
3659 xfrm_init();
3660 xfrm4_init();
3661 #endif
3662 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
3663 RTNL_FLAG_DOIT_UNLOCKED);
3664
3665 #ifdef CONFIG_SYSCTL
3666 register_pernet_subsys(&sysctl_route_ops);
3667 #endif
3668 register_pernet_subsys(&rt_genid_ops);
3669 register_pernet_subsys(&ipv4_inetpeer_ops);
3670 return 0;
3671 }
3672
3673 #ifdef CONFIG_SYSCTL
3674 /*
3675 * We really need to sanitize the damn ipv4 init order, then all
3676 * this nonsense will go away.
3677 */
3678 void __init ip_static_sysctl_init(void)
3679 {
3680 register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3681 }
3682 #endif
3683