1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
6 *
7 * ROUTE - implementation of the IP router.
8 *
9 * Authors: Ross Biro
10 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
11 * Alan Cox, <gw4pts@gw4pts.ampr.org>
12 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
13 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
14 *
15 * Fixes:
16 * Alan Cox : Verify area fixes.
17 * Alan Cox : cli() protects routing changes
18 * Rui Oliveira : ICMP routing table updates
19 * (rco@di.uminho.pt) Routing table insertion and update
20 * Linus Torvalds : Rewrote bits to be sensible
21 * Alan Cox : Added BSD route gw semantics
22 * Alan Cox : Super /proc >4K
23 * Alan Cox : MTU in route table
24 * Alan Cox : MSS actually. Also added the window
25 * clamper.
26 * Sam Lantinga : Fixed route matching in rt_del()
27 * Alan Cox : Routing cache support.
28 * Alan Cox : Removed compatibility cruft.
29 * Alan Cox : RTF_REJECT support.
30 * Alan Cox : TCP irtt support.
31 * Jonathan Naylor : Added Metric support.
32 * Miquel van Smoorenburg : BSD API fixes.
33 * Miquel van Smoorenburg : Metrics.
34 * Alan Cox : Use __u32 properly
35 * Alan Cox : Aligned routing errors more closely with BSD
36 * though our system is still very different.
37 * Alan Cox : Faster /proc handling
38 * Alexey Kuznetsov : Massive rework to support tree based routing,
39 * routing caches and better behaviour.
40 *
41 * Olaf Erb : irtt wasn't being copied right.
42 * Bjorn Ekwall : Kerneld route support.
43 * Alan Cox : Multicast fixed (I hope)
44 * Pavel Krauz : Limited broadcast fixed
45 * Mike McLagan : Routing by source
46 * Alexey Kuznetsov : End of old history. Split to fib.c and
47 * route.c and rewritten from scratch.
48 * Andi Kleen : Load-limit warning messages.
49 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
50 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
51 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
52 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
53 * Marc Boucher : routing by fwmark
54 * Robert Olsson : Added rt_cache statistics
55 * Arnaldo C. Melo : Convert proc stuff to seq_file
56 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
57 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
58 * Ilia Sotnikov : Removed TOS from hash calculations
59 */
60
61 #define pr_fmt(fmt) "IPv4: " fmt
62
63 #include <linux/module.h>
64 #include <linux/uaccess.h>
65 #include <linux/bitops.h>
66 #include <linux/types.h>
67 #include <linux/kernel.h>
68 #include <linux/mm.h>
69 #include <linux/memblock.h>
70 #include <linux/string.h>
71 #include <linux/socket.h>
72 #include <linux/sockios.h>
73 #include <linux/errno.h>
74 #include <linux/in.h>
75 #include <linux/inet.h>
76 #include <linux/netdevice.h>
77 #include <linux/proc_fs.h>
78 #include <linux/init.h>
79 #include <linux/skbuff.h>
80 #include <linux/inetdevice.h>
81 #include <linux/igmp.h>
82 #include <linux/pkt_sched.h>
83 #include <linux/mroute.h>
84 #include <linux/netfilter_ipv4.h>
85 #include <linux/random.h>
86 #include <linux/rcupdate.h>
87 #include <linux/times.h>
88 #include <linux/slab.h>
89 #include <linux/jhash.h>
90 #include <net/dst.h>
91 #include <net/dst_metadata.h>
92 #include <net/net_namespace.h>
93 #include <net/protocol.h>
94 #include <net/ip.h>
95 #include <net/route.h>
96 #include <net/inetpeer.h>
97 #include <net/sock.h>
98 #include <net/ip_fib.h>
99 #include <net/nexthop.h>
100 #include <net/arp.h>
101 #include <net/tcp.h>
102 #include <net/icmp.h>
103 #include <net/xfrm.h>
104 #include <net/lwtunnel.h>
105 #include <net/netevent.h>
106 #include <net/rtnetlink.h>
107 #ifdef CONFIG_SYSCTL
108 #include <linux/sysctl.h>
109 #endif
110 #include <net/secure_seq.h>
111 #include <net/ip_tunnels.h>
112 #include <net/l3mdev.h>
113
114 #include "fib_lookup.h"
115
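/* Mask a flow's TOS down to the bits that matter for routing (IPTOS_RT_MASK) plus the legacy RTO_ONLINK flag. */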
116 #define RT_FL_TOS(oldflp4) \
117 ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
118
119 #define RT_GC_TIMEOUT (300*HZ)
120
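/* Default values for the ip_rt_* sysctls below: redirect rate limiting, ICMP error rate limiting, PMTU expiry and garbage-collection timing. */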
121 static int ip_rt_max_size;
122 static int ip_rt_redirect_number __read_mostly = 9;
123 static int ip_rt_redirect_load __read_mostly = HZ / 50;
124 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
125 static int ip_rt_error_cost __read_mostly = HZ;
126 static int ip_rt_error_burst __read_mostly = 5 * HZ;
127 static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
128 static u32 ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
129 static int ip_rt_min_advmss __read_mostly = 256;
130
131 static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
132
133 /*
134 * Interface to generic destination cache.
135 */
136
137 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
138 static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
139 static unsigned int ipv4_mtu(const struct dst_entry *dst);
140 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
141 static void ipv4_link_failure(struct sk_buff *skb);
142 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
143 struct sk_buff *skb, u32 mtu,
144 bool confirm_neigh);
145 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk,
146 struct sk_buff *skb);
147 static void ipv4_dst_destroy(struct dst_entry *dst);
148
149 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
150 {
151 WARN_ON(1);
152 return NULL;
153 }
154
155 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
156 struct sk_buff *skb,
157 const void *daddr);
158 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
159
160 static struct dst_ops ipv4_dst_ops = {
161 .family = AF_INET,
162 .check = ipv4_dst_check,
163 .default_advmss = ipv4_default_advmss,
164 .mtu = ipv4_mtu,
165 .cow_metrics = ipv4_cow_metrics,
166 .destroy = ipv4_dst_destroy,
167 .negative_advice = ipv4_negative_advice,
168 .link_failure = ipv4_link_failure,
169 .update_pmtu = ip_rt_update_pmtu,
170 .redirect = ip_do_redirect,
171 .local_out = __ip_local_out,
172 .neigh_lookup = ipv4_neigh_lookup,
173 .confirm_neigh = ipv4_confirm_neigh,
174 };
175
176 #define ECN_OR_COST(class) TC_PRIO_##class
177
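/* Map the IPv4 TOS field (as indexed by rt_tos2priority()) onto packet scheduler priority bands; the ECN_OR_COST() entries share the band of their base class. */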
178 const __u8 ip_tos2prio[16] = {
179 TC_PRIO_BESTEFFORT,
180 ECN_OR_COST(BESTEFFORT),
181 TC_PRIO_BESTEFFORT,
182 ECN_OR_COST(BESTEFFORT),
183 TC_PRIO_BULK,
184 ECN_OR_COST(BULK),
185 TC_PRIO_BULK,
186 ECN_OR_COST(BULK),
187 TC_PRIO_INTERACTIVE,
188 ECN_OR_COST(INTERACTIVE),
189 TC_PRIO_INTERACTIVE,
190 ECN_OR_COST(INTERACTIVE),
191 TC_PRIO_INTERACTIVE_BULK,
192 ECN_OR_COST(INTERACTIVE_BULK),
193 TC_PRIO_INTERACTIVE_BULK,
194 ECN_OR_COST(INTERACTIVE_BULK)
195 };
196 EXPORT_SYMBOL(ip_tos2prio);
197
198 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
199 #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
200
201 #ifdef CONFIG_PROC_FS
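/* The legacy /proc/net/rt_cache file: the per-flow routing cache is long gone, so only the header line is ever emitted. */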
202 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
203 {
204 if (*pos)
205 return NULL;
206 return SEQ_START_TOKEN;
207 }
208
209 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
210 {
211 ++*pos;
212 return NULL;
213 }
214
215 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
216 {
217 }
218
219 static int rt_cache_seq_show(struct seq_file *seq, void *v)
220 {
221 if (v == SEQ_START_TOKEN)
222 seq_printf(seq, "%-127s\n",
223 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
224 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
225 "HHUptod\tSpecDst");
226 return 0;
227 }
228
229 static const struct seq_operations rt_cache_seq_ops = {
230 .start = rt_cache_seq_start,
231 .next = rt_cache_seq_next,
232 .stop = rt_cache_seq_stop,
233 .show = rt_cache_seq_show,
234 };
235
236 static int rt_cache_seq_open(struct inode *inode, struct file *file)
237 {
238 return seq_open(file, &rt_cache_seq_ops);
239 }
240
241 static const struct file_operations rt_cache_seq_fops = {
242 .open = rt_cache_seq_open,
243 .read = seq_read,
244 .llseek = seq_lseek,
245 .release = seq_release,
246 };
247
248
249 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
250 {
251 int cpu;
252
253 if (*pos == 0)
254 return SEQ_START_TOKEN;
255
256 for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
257 if (!cpu_possible(cpu))
258 continue;
259 *pos = cpu+1;
260 return &per_cpu(rt_cache_stat, cpu);
261 }
262 return NULL;
263 }
264
265 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
266 {
267 int cpu;
268
269 for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
270 if (!cpu_possible(cpu))
271 continue;
272 *pos = cpu+1;
273 return &per_cpu(rt_cache_stat, cpu);
274 }
275 (*pos)++;
276 return NULL;
277
278 }
279
280 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
281 {
282
283 }
284
285 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
286 {
287 struct rt_cache_stat *st = v;
288
289 if (v == SEQ_START_TOKEN) {
290 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
291 return 0;
292 }
293
294 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
295 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
296 dst_entries_get_slow(&ipv4_dst_ops),
297 0, /* st->in_hit */
298 st->in_slow_tot,
299 st->in_slow_mc,
300 st->in_no_route,
301 st->in_brd,
302 st->in_martian_dst,
303 st->in_martian_src,
304
305 0, /* st->out_hit */
306 st->out_slow_tot,
307 st->out_slow_mc,
308
309 0, /* st->gc_total */
310 0, /* st->gc_ignored */
311 0, /* st->gc_goal_miss */
312 0, /* st->gc_dst_overflow */
313 0, /* st->in_hlist_search */
314 0 /* st->out_hlist_search */
315 );
316 return 0;
317 }
318
319 static const struct seq_operations rt_cpu_seq_ops = {
320 .start = rt_cpu_seq_start,
321 .next = rt_cpu_seq_next,
322 .stop = rt_cpu_seq_stop,
323 .show = rt_cpu_seq_show,
324 };
325
326
327 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
328 {
329 return seq_open(file, &rt_cpu_seq_ops);
330 }
331
332 static const struct file_operations rt_cpu_seq_fops = {
333 .open = rt_cpu_seq_open,
334 .read = seq_read,
335 .llseek = seq_lseek,
336 .release = seq_release,
337 };
338
339 #ifdef CONFIG_IP_ROUTE_CLASSID
340 static int rt_acct_proc_show(struct seq_file *m, void *v)
341 {
342 struct ip_rt_acct *dst, *src;
343 unsigned int i, j;
344
345 dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
346 if (!dst)
347 return -ENOMEM;
348
349 for_each_possible_cpu(i) {
350 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
351 for (j = 0; j < 256; j++) {
352 dst[j].o_bytes += src[j].o_bytes;
353 dst[j].o_packets += src[j].o_packets;
354 dst[j].i_bytes += src[j].i_bytes;
355 dst[j].i_packets += src[j].i_packets;
356 }
357 }
358
359 seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
360 kfree(dst);
361 return 0;
362 }
363 #endif
364
365 static int __net_init ip_rt_do_proc_init(struct net *net)
366 {
367 struct proc_dir_entry *pde;
368
369 pde = proc_create("rt_cache", 0444, net->proc_net,
370 &rt_cache_seq_fops);
371 if (!pde)
372 goto err1;
373
374 pde = proc_create("rt_cache", 0444,
375 net->proc_net_stat, &rt_cpu_seq_fops);
376 if (!pde)
377 goto err2;
378
379 #ifdef CONFIG_IP_ROUTE_CLASSID
380 pde = proc_create_single("rt_acct", 0, net->proc_net,
381 rt_acct_proc_show);
382 if (!pde)
383 goto err3;
384 #endif
385 return 0;
386
387 #ifdef CONFIG_IP_ROUTE_CLASSID
388 err3:
389 remove_proc_entry("rt_cache", net->proc_net_stat);
390 #endif
391 err2:
392 remove_proc_entry("rt_cache", net->proc_net);
393 err1:
394 return -ENOMEM;
395 }
396
397 static void __net_exit ip_rt_do_proc_exit(struct net *net)
398 {
399 remove_proc_entry("rt_cache", net->proc_net_stat);
400 remove_proc_entry("rt_cache", net->proc_net);
401 #ifdef CONFIG_IP_ROUTE_CLASSID
402 remove_proc_entry("rt_acct", net->proc_net);
403 #endif
404 }
405
406 static struct pernet_operations ip_rt_proc_ops __net_initdata = {
407 .init = ip_rt_do_proc_init,
408 .exit = ip_rt_do_proc_exit,
409 };
410
411 static int __init ip_rt_proc_init(void)
412 {
413 return register_pernet_subsys(&ip_rt_proc_ops);
414 }
415
416 #else
417 static inline int ip_rt_proc_init(void)
418 {
419 return 0;
420 }
421 #endif /* CONFIG_PROC_FS */
422
423 static inline bool rt_is_expired(const struct rtable *rth)
424 {
425 return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
426 }
427
428 void rt_cache_flush(struct net *net)
429 {
430 rt_genid_bump_ipv4(net);
431 }
432
433 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
434 struct sk_buff *skb,
435 const void *daddr)
436 {
437 const struct rtable *rt = container_of(dst, struct rtable, dst);
438 struct net_device *dev = dst->dev;
439 struct neighbour *n;
440
441 rcu_read_lock_bh();
442
443 if (likely(rt->rt_gw_family == AF_INET)) {
444 n = ip_neigh_gw4(dev, rt->rt_gw4);
445 } else if (rt->rt_gw_family == AF_INET6) {
446 n = ip_neigh_gw6(dev, &rt->rt_gw6);
447 } else {
448 __be32 pkey;
449
450 pkey = skb ? ip_hdr(skb)->daddr : *((__be32 *) daddr);
451 n = ip_neigh_gw4(dev, pkey);
452 }
453
454 if (!IS_ERR(n) && !refcount_inc_not_zero(&n->refcnt))
455 n = NULL;
456
457 rcu_read_unlock_bh();
458
459 return n;
460 }
461
462 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
463 {
464 const struct rtable *rt = container_of(dst, struct rtable, dst);
465 struct net_device *dev = dst->dev;
466 const __be32 *pkey = daddr;
467
468 if (rt->rt_gw_family == AF_INET) {
469 pkey = (const __be32 *)&rt->rt_gw4;
470 } else if (rt->rt_gw_family == AF_INET6) {
471 return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6);
472 } else if (!daddr ||
473 (rt->rt_flags &
474 (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) {
475 return;
476 }
477 __ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
478 }
479
480 /* Hash tables of size 2048..262144 depending on RAM size.
481 * Each bucket uses 8 bytes.
482 */
483 static u32 ip_idents_mask __read_mostly;
484 static atomic_t *ip_idents __read_mostly;
485 static u32 *ip_tstamps __read_mostly;
486
487 /* In order to protect privacy, we add a perturbation to identifiers
488 * if one generator is seldom used. This makes it hard for an attacker
489 * to infer how many packets were sent between two points in time.
490 */
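/* For example (hypothetical numbers): if a bucket was last touched 100 jiffies ago, up to 99 extra IDs may be skipped, so an observer cannot count packets between two probes by differencing IP IDs. */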
491 u32 ip_idents_reserve(u32 hash, int segs)
492 {
493 u32 bucket, old, now = (u32)jiffies;
494 atomic_t *p_id;
495 u32 *p_tstamp;
496 u32 delta = 0;
497
498 bucket = hash & ip_idents_mask;
499 p_tstamp = ip_tstamps + bucket;
500 p_id = ip_idents + bucket;
501 old = READ_ONCE(*p_tstamp);
502
503 if (old != now && cmpxchg(p_tstamp, old, now) == old)
504 delta = prandom_u32_max(now - old);
505
506 /* If UBSAN reports an error here, please make sure your compiler
507 * supports -fno-strict-overflow before reporting it; that was a bug
508 * in UBSAN which has been fixed in GCC-8.
509 */
510 return atomic_add_return(segs + delta, p_id) - segs;
511 }
512 EXPORT_SYMBOL(ip_idents_reserve);
513
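/* Pick the IPv4 ID for a packet: hash (daddr, saddr, protocol) with a per-netns siphash key and reserve 'segs' IDs from the matching bucket. */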
514 void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
515 {
516 u32 hash, id;
517
518 /* Note the following code is not safe, but this is okay: a racy double initialisation of the key is harmless. */
519 if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
520 get_random_bytes(&net->ipv4.ip_id_key,
521 sizeof(net->ipv4.ip_id_key));
522
523 hash = siphash_3u32((__force u32)iph->daddr,
524 (__force u32)iph->saddr,
525 iph->protocol,
526 &net->ipv4.ip_id_key);
527 id = ip_idents_reserve(hash, segs);
528 iph->id = htons(id);
529 }
530 EXPORT_SYMBOL(__ip_select_ident);
531
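/* Build the flowi4 key used for route lookups; when a socket is given, its bound device, mark, TOS and protocol override the values taken from the IP header. */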
532 static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
533 const struct sock *sk,
534 const struct iphdr *iph,
535 int oif, u8 tos,
536 u8 prot, u32 mark, int flow_flags)
537 {
538 if (sk) {
539 const struct inet_sock *inet = inet_sk(sk);
540
541 oif = sk->sk_bound_dev_if;
542 mark = sk->sk_mark;
543 tos = RT_CONN_FLAGS(sk);
544 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
545 }
546 flowi4_init_output(fl4, oif, mark, tos,
547 RT_SCOPE_UNIVERSE, prot,
548 flow_flags,
549 iph->daddr, iph->saddr, 0, 0,
550 sock_net_uid(net, sk));
551 }
552
553 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
554 const struct sock *sk)
555 {
556 const struct net *net = dev_net(skb->dev);
557 const struct iphdr *iph = ip_hdr(skb);
558 int oif = skb->dev->ifindex;
559 u8 tos = RT_TOS(iph->tos);
560 u8 prot = iph->protocol;
561 u32 mark = skb->mark;
562
563 __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
564 }
565
566 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
567 {
568 const struct inet_sock *inet = inet_sk(sk);
569 const struct ip_options_rcu *inet_opt;
570 __be32 daddr = inet->inet_daddr;
571
572 rcu_read_lock();
573 inet_opt = rcu_dereference(inet->inet_opt);
574 if (inet_opt && inet_opt->opt.srr)
575 daddr = inet_opt->opt.faddr;
576 flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
577 RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
578 inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
579 inet_sk_flowi_flags(sk),
580 daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
581 rcu_read_unlock();
582 }
583
584 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
585 const struct sk_buff *skb)
586 {
587 if (skb)
588 build_skb_flow_key(fl4, skb, sk);
589 else
590 build_sk_flow_key(fl4, sk);
591 }
592
593 static DEFINE_SPINLOCK(fnhe_lock);
594
595 static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
596 {
597 struct rtable *rt;
598
599 rt = rcu_dereference(fnhe->fnhe_rth_input);
600 if (rt) {
601 RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
602 dst_dev_put(&rt->dst);
603 dst_release(&rt->dst);
604 }
605 rt = rcu_dereference(fnhe->fnhe_rth_output);
606 if (rt) {
607 RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
608 dst_dev_put(&rt->dst);
609 dst_release(&rt->dst);
610 }
611 }
612
613 static void fnhe_remove_oldest(struct fnhe_hash_bucket *hash)
614 {
615 struct fib_nh_exception __rcu **fnhe_p, **oldest_p;
616 struct fib_nh_exception *fnhe, *oldest = NULL;
617
618 for (fnhe_p = &hash->chain; ; fnhe_p = &fnhe->fnhe_next) {
619 fnhe = rcu_dereference_protected(*fnhe_p,
620 lockdep_is_held(&fnhe_lock));
621 if (!fnhe)
622 break;
623 if (!oldest ||
624 time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp)) {
625 oldest = fnhe;
626 oldest_p = fnhe_p;
627 }
628 }
629 fnhe_flush_routes(oldest);
630 *oldest_p = oldest->fnhe_next;
631 kfree_rcu(oldest, rcu);
632 }
633
634 static u32 fnhe_hashfun(__be32 daddr)
635 {
636 static siphash_key_t fnhe_hash_key __read_mostly;
637 u64 hval;
638
639 net_get_random_once(&fnhe_hash_key, sizeof(fnhe_hash_key));
640 hval = siphash_1u32((__force u32)daddr, &fnhe_hash_key);
641 return hash_64(hval, FNHE_HASH_SHIFT);
642 }
643
644 static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
645 {
646 rt->rt_pmtu = fnhe->fnhe_pmtu;
647 rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
648 rt->dst.expires = fnhe->fnhe_expires;
649
650 if (fnhe->fnhe_gw) {
651 rt->rt_flags |= RTCF_REDIRECTED;
652 rt->rt_uses_gateway = 1;
653 rt->rt_gw_family = AF_INET;
654 rt->rt_gw4 = fnhe->fnhe_gw;
655 }
656 }
657
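/* Record or refresh a per-nexthop exception (learned from an ICMP redirect or a PMTU update) and mark the nexthop's cached routes obsolete so later lookups re-evaluate the exception. */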
658 static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr,
659 __be32 gw, u32 pmtu, bool lock,
660 unsigned long expires)
661 {
662 struct fnhe_hash_bucket *hash;
663 struct fib_nh_exception *fnhe;
664 struct rtable *rt;
665 u32 genid, hval;
666 unsigned int i;
667 int depth;
668
669 genid = fnhe_genid(dev_net(nhc->nhc_dev));
670 hval = fnhe_hashfun(daddr);
671
672 spin_lock_bh(&fnhe_lock);
673
674 hash = rcu_dereference(nhc->nhc_exceptions);
675 if (!hash) {
676 hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
677 if (!hash)
678 goto out_unlock;
679 rcu_assign_pointer(nhc->nhc_exceptions, hash);
680 }
681
682 hash += hval;
683
684 depth = 0;
685 for (fnhe = rcu_dereference(hash->chain); fnhe;
686 fnhe = rcu_dereference(fnhe->fnhe_next)) {
687 if (fnhe->fnhe_daddr == daddr)
688 break;
689 depth++;
690 }
691
692 if (fnhe) {
693 if (fnhe->fnhe_genid != genid)
694 fnhe->fnhe_genid = genid;
695 if (gw)
696 fnhe->fnhe_gw = gw;
697 if (pmtu) {
698 fnhe->fnhe_pmtu = pmtu;
699 fnhe->fnhe_mtu_locked = lock;
700 }
701 fnhe->fnhe_expires = max(1UL, expires);
702 /* Update all cached dsts too */
703 rt = rcu_dereference(fnhe->fnhe_rth_input);
704 if (rt)
705 fill_route_from_fnhe(rt, fnhe);
706 rt = rcu_dereference(fnhe->fnhe_rth_output);
707 if (rt)
708 fill_route_from_fnhe(rt, fnhe);
709 } else {
710 /* Randomize max depth to avoid some side channels attacks. */
711 int max_depth = FNHE_RECLAIM_DEPTH +
712 prandom_u32_max(FNHE_RECLAIM_DEPTH);
713
714 while (depth > max_depth) {
715 fnhe_remove_oldest(hash);
716 depth--;
717 }
718
719 fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
720 if (!fnhe)
721 goto out_unlock;
722
723 fnhe->fnhe_next = hash->chain;
724
725 fnhe->fnhe_genid = genid;
726 fnhe->fnhe_daddr = daddr;
727 fnhe->fnhe_gw = gw;
728 fnhe->fnhe_pmtu = pmtu;
729 fnhe->fnhe_mtu_locked = lock;
730 fnhe->fnhe_expires = max(1UL, expires);
731
732 rcu_assign_pointer(hash->chain, fnhe);
733
734 /* Exception created; mark the cached routes for the nexthop
735 * stale, so anyone caching it rechecks if this exception
736 * applies to them.
737 */
738 rt = rcu_dereference(nhc->nhc_rth_input);
739 if (rt)
740 rt->dst.obsolete = DST_OBSOLETE_KILL;
741
742 for_each_possible_cpu(i) {
743 struct rtable __rcu **prt;
744 prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i);
745 rt = rcu_dereference(*prt);
746 if (rt)
747 rt->dst.obsolete = DST_OBSOLETE_KILL;
748 }
749 }
750
751 fnhe->fnhe_stamp = jiffies;
752
753 out_unlock:
754 spin_unlock_bh(&fnhe_lock);
755 }
756
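/* Validate an ICMP redirect (sane code, gateway on-link and neither multicast, limited-broadcast nor zeronet) and, if it passes, store the new gateway as a nexthop exception. */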
757 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
758 bool kill_route)
759 {
760 __be32 new_gw = icmp_hdr(skb)->un.gateway;
761 __be32 old_gw = ip_hdr(skb)->saddr;
762 struct net_device *dev = skb->dev;
763 struct in_device *in_dev;
764 struct fib_result res;
765 struct neighbour *n;
766 struct net *net;
767
768 switch (icmp_hdr(skb)->code & 7) {
769 case ICMP_REDIR_NET:
770 case ICMP_REDIR_NETTOS:
771 case ICMP_REDIR_HOST:
772 case ICMP_REDIR_HOSTTOS:
773 break;
774
775 default:
776 return;
777 }
778
779 if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw)
780 return;
781
782 in_dev = __in_dev_get_rcu(dev);
783 if (!in_dev)
784 return;
785
786 net = dev_net(dev);
787 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
788 ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
789 ipv4_is_zeronet(new_gw))
790 goto reject_redirect;
791
792 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
793 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
794 goto reject_redirect;
795 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
796 goto reject_redirect;
797 } else {
798 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
799 goto reject_redirect;
800 }
801
802 n = __ipv4_neigh_lookup(rt->dst.dev, (__force u32)new_gw);
803 if (!n)
804 n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
805 if (!IS_ERR(n)) {
806 if (!(n->nud_state & NUD_VALID)) {
807 neigh_event_send(n, NULL);
808 } else {
809 if (fib_lookup(net, fl4, &res, 0) == 0) {
810 struct fib_nh_common *nhc;
811
812 fib_select_path(net, &res, fl4, skb);
813 nhc = FIB_RES_NHC(res);
814 update_or_create_fnhe(nhc, fl4->daddr, new_gw,
815 0, false,
816 jiffies + ip_rt_gc_timeout);
817 }
818 if (kill_route)
819 rt->dst.obsolete = DST_OBSOLETE_KILL;
820 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
821 }
822 neigh_release(n);
823 }
824 return;
825
826 reject_redirect:
827 #ifdef CONFIG_IP_ROUTE_VERBOSE
828 if (IN_DEV_LOG_MARTIANS(in_dev)) {
829 const struct iphdr *iph = (const struct iphdr *) skb->data;
830 __be32 daddr = iph->daddr;
831 __be32 saddr = iph->saddr;
832
833 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
834 " Advised path = %pI4 -> %pI4\n",
835 &old_gw, dev->name, &new_gw,
836 &saddr, &daddr);
837 }
838 #endif
839 ;
840 }
841
842 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
843 {
844 struct rtable *rt;
845 struct flowi4 fl4;
846 const struct iphdr *iph = (const struct iphdr *) skb->data;
847 struct net *net = dev_net(skb->dev);
848 int oif = skb->dev->ifindex;
849 u8 tos = RT_TOS(iph->tos);
850 u8 prot = iph->protocol;
851 u32 mark = skb->mark;
852
853 rt = (struct rtable *) dst;
854
855 __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
856 __ip_do_redirect(rt, skb, &fl4, true);
857 }
858
859 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
860 {
861 struct rtable *rt = (struct rtable *)dst;
862 struct dst_entry *ret = dst;
863
864 if (rt) {
865 if (dst->obsolete > 0) {
866 ip_rt_put(rt);
867 ret = NULL;
868 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
869 rt->dst.expires) {
870 ip_rt_put(rt);
871 ret = NULL;
872 }
873 }
874 return ret;
875 }
876
877 /*
878 * Algorithm:
879 * 1. The first ip_rt_redirect_number redirects are sent
880 * with exponential backoff, then we stop sending them at all,
881 * assuming that the host ignores our redirects.
882 * 2. If we did not see packets requiring redirects
883 * during ip_rt_redirect_silence, we assume that the host
884 * forgot redirected route and start to send redirects again.
885 *
886 * This algorithm is much cheaper and more intelligent than dumb load limiting
887 * in icmp.c.
888 *
889 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
890 * and "frag. need" (breaks PMTU discovery) in icmp.c.
891 */
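/* With the defaults above (ip_rt_redirect_number = 9, ip_rt_redirect_load = HZ/50, ip_rt_redirect_silence = (HZ/50) << 10), redirects to a peer back off exponentially from roughly 40 ms to a few seconds between sends, stop after nine go unheeded, and resume after about 20 seconds of redirect silence. */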
892
893 void ip_rt_send_redirect(struct sk_buff *skb)
894 {
895 struct rtable *rt = skb_rtable(skb);
896 struct in_device *in_dev;
897 struct inet_peer *peer;
898 struct net *net;
899 int log_martians;
900 int vif;
901
902 rcu_read_lock();
903 in_dev = __in_dev_get_rcu(rt->dst.dev);
904 if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
905 rcu_read_unlock();
906 return;
907 }
908 log_martians = IN_DEV_LOG_MARTIANS(in_dev);
909 vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
910 rcu_read_unlock();
911
912 net = dev_net(rt->dst.dev);
913 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
914 if (!peer) {
915 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
916 rt_nexthop(rt, ip_hdr(skb)->daddr));
917 return;
918 }
919
920 /* No redirected packets during ip_rt_redirect_silence;
921 * reset the algorithm.
922 */
923 if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
924 peer->rate_tokens = 0;
925 peer->n_redirects = 0;
926 }
927
928 /* Too many ignored redirects; do not send anything, just set
929 * peer->rate_last to the time of the last seen redirected packet.
930 */
931 if (peer->n_redirects >= ip_rt_redirect_number) {
932 peer->rate_last = jiffies;
933 goto out_put_peer;
934 }
935
936 /* Check for load limit; set rate_last to the latest sent
937 * redirect.
938 */
939 if (peer->n_redirects == 0 ||
940 time_after(jiffies,
941 (peer->rate_last +
942 (ip_rt_redirect_load << peer->n_redirects)))) {
943 __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
944
945 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
946 peer->rate_last = jiffies;
947 ++peer->n_redirects;
948 #ifdef CONFIG_IP_ROUTE_VERBOSE
949 if (log_martians &&
950 peer->n_redirects == ip_rt_redirect_number)
951 net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
952 &ip_hdr(skb)->saddr, inet_iif(skb),
953 &ip_hdr(skb)->daddr, &gw);
954 #endif
955 }
956 out_put_peer:
957 inet_putpeer(peer);
958 }
959
960 static int ip_error(struct sk_buff *skb)
961 {
962 struct rtable *rt = skb_rtable(skb);
963 struct net_device *dev = skb->dev;
964 struct in_device *in_dev;
965 struct inet_peer *peer;
966 unsigned long now;
967 struct net *net;
968 bool send;
969 int code;
970
971 if (netif_is_l3_master(skb->dev)) {
972 dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
973 if (!dev)
974 goto out;
975 }
976
977 in_dev = __in_dev_get_rcu(dev);
978
979 /* IP on this device is disabled. */
980 if (!in_dev)
981 goto out;
982
983 net = dev_net(rt->dst.dev);
984 if (!IN_DEV_FORWARD(in_dev)) {
985 switch (rt->dst.error) {
986 case EHOSTUNREACH:
987 __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
988 break;
989
990 case ENETUNREACH:
991 __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
992 break;
993 }
994 goto out;
995 }
996
997 switch (rt->dst.error) {
998 case EINVAL:
999 default:
1000 goto out;
1001 case EHOSTUNREACH:
1002 code = ICMP_HOST_UNREACH;
1003 break;
1004 case ENETUNREACH:
1005 code = ICMP_NET_UNREACH;
1006 __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
1007 break;
1008 case EACCES:
1009 code = ICMP_PKT_FILTERED;
1010 break;
1011 }
1012
1013 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
1014 l3mdev_master_ifindex(skb->dev), 1);
1015
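/* Token bucket: tokens accrue one per jiffy up to ip_rt_error_burst (5 * HZ) and each ICMP error costs ip_rt_error_cost (HZ), i.e. roughly one error per second per peer with a burst of five. */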
1016 send = true;
1017 if (peer) {
1018 now = jiffies;
1019 peer->rate_tokens += now - peer->rate_last;
1020 if (peer->rate_tokens > ip_rt_error_burst)
1021 peer->rate_tokens = ip_rt_error_burst;
1022 peer->rate_last = now;
1023 if (peer->rate_tokens >= ip_rt_error_cost)
1024 peer->rate_tokens -= ip_rt_error_cost;
1025 else
1026 send = false;
1027 inet_putpeer(peer);
1028 }
1029 if (send)
1030 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1031
1032 out: kfree_skb(skb);
1033 return 0;
1034 }
1035
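/* Apply a learned path MTU: values below ip_rt_min_pmtu are clamped and locked, and the result is stored as a nexthop exception that expires after ip_rt_mtu_expires (10 minutes by default). */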
1036 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1037 {
1038 struct dst_entry *dst = &rt->dst;
1039 struct net *net = dev_net(dst->dev);
1040 u32 old_mtu = ipv4_mtu(dst);
1041 struct fib_result res;
1042 bool lock = false;
1043
1044 if (ip_mtu_locked(dst))
1045 return;
1046
1047 if (old_mtu < mtu)
1048 return;
1049
1050 if (mtu < ip_rt_min_pmtu) {
1051 lock = true;
1052 mtu = min(old_mtu, ip_rt_min_pmtu);
1053 }
1054
1055 if (rt->rt_pmtu == mtu && !lock &&
1056 time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
1057 return;
1058
1059 rcu_read_lock();
1060 if (fib_lookup(net, fl4, &res, 0) == 0) {
1061 struct fib_nh_common *nhc;
1062
1063 fib_select_path(net, &res, fl4, NULL);
1064 nhc = FIB_RES_NHC(res);
1065 update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
1066 jiffies + ip_rt_mtu_expires);
1067 }
1068 rcu_read_unlock();
1069 }
1070
1071 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1072 struct sk_buff *skb, u32 mtu,
1073 bool confirm_neigh)
1074 {
1075 struct rtable *rt = (struct rtable *) dst;
1076 struct flowi4 fl4;
1077
1078 ip_rt_build_flow_key(&fl4, sk, skb);
1079 __ip_rt_update_pmtu(rt, &fl4, mtu);
1080 }
1081
1082 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1083 int oif, u8 protocol)
1084 {
1085 const struct iphdr *iph = (const struct iphdr *) skb->data;
1086 struct flowi4 fl4;
1087 struct rtable *rt;
1088 u32 mark = IP4_REPLY_MARK(net, skb->mark);
1089
1090 __build_flow_key(net, &fl4, NULL, iph, oif,
1091 RT_TOS(iph->tos), protocol, mark, 0);
1092 rt = __ip_route_output_key(net, &fl4);
1093 if (!IS_ERR(rt)) {
1094 __ip_rt_update_pmtu(rt, &fl4, mtu);
1095 ip_rt_put(rt);
1096 }
1097 }
1098 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1099
1100 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1101 {
1102 const struct iphdr *iph = (const struct iphdr *) skb->data;
1103 struct flowi4 fl4;
1104 struct rtable *rt;
1105
1106 __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1107
1108 if (!fl4.flowi4_mark)
1109 fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1110
1111 rt = __ip_route_output_key(sock_net(sk), &fl4);
1112 if (!IS_ERR(rt)) {
1113 __ip_rt_update_pmtu(rt, &fl4, mtu);
1114 ip_rt_put(rt);
1115 }
1116 }
1117
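/* Socket-aware PMTU update: prefer the socket's cached dst (when the socket is not owned by user context), revalidating and re-routing it if the update made the cached entry obsolete. */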
1118 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1119 {
1120 const struct iphdr *iph = (const struct iphdr *) skb->data;
1121 struct flowi4 fl4;
1122 struct rtable *rt;
1123 struct dst_entry *odst = NULL;
1124 bool new = false;
1125 struct net *net = sock_net(sk);
1126
1127 bh_lock_sock(sk);
1128
1129 if (!ip_sk_accept_pmtu(sk))
1130 goto out;
1131
1132 odst = sk_dst_get(sk);
1133
1134 if (sock_owned_by_user(sk) || !odst) {
1135 __ipv4_sk_update_pmtu(skb, sk, mtu);
1136 goto out;
1137 }
1138
1139 __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1140
1141 rt = (struct rtable *)odst;
1142 if (odst->obsolete && !odst->ops->check(odst, 0)) {
1143 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1144 if (IS_ERR(rt))
1145 goto out;
1146
1147 new = true;
1148 }
1149
1150 __ip_rt_update_pmtu((struct rtable *) xfrm_dst_path(&rt->dst), &fl4, mtu);
1151
1152 if (!dst_check(&rt->dst, 0)) {
1153 if (new)
1154 dst_release(&rt->dst);
1155
1156 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1157 if (IS_ERR(rt))
1158 goto out;
1159
1160 new = true;
1161 }
1162
1163 if (new)
1164 sk_dst_set(sk, &rt->dst);
1165
1166 out:
1167 bh_unlock_sock(sk);
1168 dst_release(odst);
1169 }
1170 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1171
1172 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1173 int oif, u8 protocol)
1174 {
1175 const struct iphdr *iph = (const struct iphdr *) skb->data;
1176 struct flowi4 fl4;
1177 struct rtable *rt;
1178
1179 __build_flow_key(net, &fl4, NULL, iph, oif,
1180 RT_TOS(iph->tos), protocol, 0, 0);
1181 rt = __ip_route_output_key(net, &fl4);
1182 if (!IS_ERR(rt)) {
1183 __ip_do_redirect(rt, skb, &fl4, false);
1184 ip_rt_put(rt);
1185 }
1186 }
1187 EXPORT_SYMBOL_GPL(ipv4_redirect);
1188
1189 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1190 {
1191 const struct iphdr *iph = (const struct iphdr *) skb->data;
1192 struct flowi4 fl4;
1193 struct rtable *rt;
1194 struct net *net = sock_net(sk);
1195
1196 __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1197 rt = __ip_route_output_key(net, &fl4);
1198 if (!IS_ERR(rt)) {
1199 __ip_do_redirect(rt, skb, &fl4, false);
1200 ip_rt_put(rt);
1201 }
1202 }
1203 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1204
1205 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1206 {
1207 struct rtable *rt = (struct rtable *) dst;
1208
1209 /* All IPV4 dsts are created with ->obsolete set to the value
1210 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1211 * into this function always.
1212 *
1213 * When a PMTU/redirect information update invalidates a route,
1214 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1215 * DST_OBSOLETE_DEAD.
1216 */
1217 if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1218 return NULL;
1219 return dst;
1220 }
1221
1222 static void ipv4_send_dest_unreach(struct sk_buff *skb)
1223 {
1224 struct net_device *dev;
1225 struct ip_options opt;
1226 int res;
1227
1228 /* Recompile ip options since IPCB may not be valid anymore.
1229 * Also check we have a reasonable ipv4 header.
1230 */
1231 if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
1232 ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
1233 return;
1234
1235 memset(&opt, 0, sizeof(opt));
1236 if (ip_hdr(skb)->ihl > 5) {
1237 if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
1238 return;
1239 opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);
1240
1241 rcu_read_lock();
1242 dev = skb->dev ? skb->dev : skb_rtable(skb)->dst.dev;
1243 res = __ip_options_compile(dev_net(dev), &opt, skb, NULL);
1244 rcu_read_unlock();
1245
1246 if (res)
1247 return;
1248 }
1249 __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
1250 }
1251
1252 static void ipv4_link_failure(struct sk_buff *skb)
1253 {
1254 struct rtable *rt;
1255
1256 ipv4_send_dest_unreach(skb);
1257
1258 rt = skb_rtable(skb);
1259 if (rt)
1260 dst_set_expires(&rt->dst, 0);
1261 }
1262
1263 static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1264 {
1265 pr_debug("%s: %pI4 -> %pI4, %s\n",
1266 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1267 skb->dev ? skb->dev->name : "?");
1268 kfree_skb(skb);
1269 WARN_ON(1);
1270 return 0;
1271 }
1272
1273 /*
1274 We do not cache the source address of the outgoing interface,
1275 because it is used only by the IP RR, TS and SRR options,
1276 so it stays out of the fast path.
1277
1278 BTW remember: "addr" is allowed to be unaligned
1279 in IP options!
1280 */
1281
1282 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1283 {
1284 __be32 src;
1285
1286 if (rt_is_output_route(rt))
1287 src = ip_hdr(skb)->saddr;
1288 else {
1289 struct fib_result res;
1290 struct iphdr *iph = ip_hdr(skb);
1291 struct flowi4 fl4 = {
1292 .daddr = iph->daddr,
1293 .saddr = iph->saddr,
1294 .flowi4_tos = RT_TOS(iph->tos),
1295 .flowi4_oif = rt->dst.dev->ifindex,
1296 .flowi4_iif = skb->dev->ifindex,
1297 .flowi4_mark = skb->mark,
1298 };
1299
1300 rcu_read_lock();
1301 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1302 src = fib_result_prefsrc(dev_net(rt->dst.dev), &res);
1303 else
1304 src = inet_select_addr(rt->dst.dev,
1305 rt_nexthop(rt, iph->daddr),
1306 RT_SCOPE_UNIVERSE);
1307 rcu_read_unlock();
1308 }
1309 memcpy(addr, &src, 4);
1310 }
1311
1312 #ifdef CONFIG_IP_ROUTE_CLASSID
1313 static void set_class_tag(struct rtable *rt, u32 tag)
1314 {
1315 if (!(rt->dst.tclassid & 0xFFFF))
1316 rt->dst.tclassid |= tag & 0xFFFF;
1317 if (!(rt->dst.tclassid & 0xFFFF0000))
1318 rt->dst.tclassid |= tag & 0xFFFF0000;
1319 }
1320 #endif
1321
1322 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1323 {
1324 unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
1325 unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
1326 ip_rt_min_advmss);
1327
1328 return min(advmss, IPV4_MAX_PMTU - header_size);
1329 }
1330
1331 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1332 {
1333 const struct rtable *rt = (const struct rtable *) dst;
1334 unsigned int mtu = rt->rt_pmtu;
1335
1336 if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1337 mtu = dst_metric_raw(dst, RTAX_MTU);
1338
1339 if (mtu)
1340 goto out;
1341
1342 mtu = READ_ONCE(dst->dev->mtu);
1343
1344 if (unlikely(ip_mtu_locked(dst))) {
1345 if (rt->rt_uses_gateway && mtu > 576)
1346 mtu = 576;
1347 }
1348
1349 out:
1350 mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
1351
1352 return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1353 }
1354
1355 static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr)
1356 {
1357 struct fnhe_hash_bucket *hash;
1358 struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1359 u32 hval = fnhe_hashfun(daddr);
1360
1361 spin_lock_bh(&fnhe_lock);
1362
1363 hash = rcu_dereference_protected(nhc->nhc_exceptions,
1364 lockdep_is_held(&fnhe_lock));
1365 hash += hval;
1366
1367 fnhe_p = &hash->chain;
1368 fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1369 while (fnhe) {
1370 if (fnhe->fnhe_daddr == daddr) {
1371 rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1372 fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1373 /* set fnhe_daddr to 0 to ensure it won't bind with
1374 * new dsts in rt_bind_exception().
1375 */
1376 fnhe->fnhe_daddr = 0;
1377 fnhe_flush_routes(fnhe);
1378 kfree_rcu(fnhe, rcu);
1379 break;
1380 }
1381 fnhe_p = &fnhe->fnhe_next;
1382 fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1383 lockdep_is_held(&fnhe_lock));
1384 }
1385
1386 spin_unlock_bh(&fnhe_lock);
1387 }
1388
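/* Look up the nexthop exception for daddr; expired entries are deleted on the spot so stale redirect/PMTU state does not linger. */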
1389 static struct fib_nh_exception *find_exception(struct fib_nh_common *nhc,
1390 __be32 daddr)
1391 {
1392 struct fnhe_hash_bucket *hash = rcu_dereference(nhc->nhc_exceptions);
1393 struct fib_nh_exception *fnhe;
1394 u32 hval;
1395
1396 if (!hash)
1397 return NULL;
1398
1399 hval = fnhe_hashfun(daddr);
1400
1401 for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1402 fnhe = rcu_dereference(fnhe->fnhe_next)) {
1403 if (fnhe->fnhe_daddr == daddr) {
1404 if (fnhe->fnhe_expires &&
1405 time_after(jiffies, fnhe->fnhe_expires)) {
1406 ip_del_fnhe(nhc, daddr);
1407 break;
1408 }
1409 return fnhe;
1410 }
1411 }
1412 return NULL;
1413 }
1414
1415 /* MTU selection:
1416 * 1. mtu on route is locked - use it
1417 * 2. mtu from nexthop exception
1418 * 3. mtu from egress device
1419 */
1420
1421 u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
1422 {
1423 struct fib_nh_common *nhc = res->nhc;
1424 struct net_device *dev = nhc->nhc_dev;
1425 struct fib_info *fi = res->fi;
1426 u32 mtu = 0;
1427
1428 if (READ_ONCE(dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu) ||
1429 fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
1430 mtu = fi->fib_mtu;
1431
1432 if (likely(!mtu)) {
1433 struct fib_nh_exception *fnhe;
1434
1435 fnhe = find_exception(nhc, daddr);
1436 if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
1437 mtu = fnhe->fnhe_pmtu;
1438 }
1439
1440 if (likely(!mtu))
1441 mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);
1442
1443 return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu);
1444 }
1445
1446 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1447 __be32 daddr, const bool do_cache)
1448 {
1449 bool ret = false;
1450
1451 spin_lock_bh(&fnhe_lock);
1452
1453 if (daddr == fnhe->fnhe_daddr) {
1454 struct rtable __rcu **porig;
1455 struct rtable *orig;
1456 int genid = fnhe_genid(dev_net(rt->dst.dev));
1457
1458 if (rt_is_input_route(rt))
1459 porig = &fnhe->fnhe_rth_input;
1460 else
1461 porig = &fnhe->fnhe_rth_output;
1462 orig = rcu_dereference(*porig);
1463
1464 if (fnhe->fnhe_genid != genid) {
1465 fnhe->fnhe_genid = genid;
1466 fnhe->fnhe_gw = 0;
1467 fnhe->fnhe_pmtu = 0;
1468 fnhe->fnhe_expires = 0;
1469 fnhe->fnhe_mtu_locked = false;
1470 fnhe_flush_routes(fnhe);
1471 orig = NULL;
1472 }
1473 fill_route_from_fnhe(rt, fnhe);
1474 if (!rt->rt_gw4) {
1475 rt->rt_gw4 = daddr;
1476 rt->rt_gw_family = AF_INET;
1477 }
1478
1479 if (do_cache) {
1480 dst_hold(&rt->dst);
1481 rcu_assign_pointer(*porig, rt);
1482 if (orig) {
1483 dst_dev_put(&orig->dst);
1484 dst_release(&orig->dst);
1485 }
1486 ret = true;
1487 }
1488
1489 fnhe->fnhe_stamp = jiffies;
1490 }
1491 spin_unlock_bh(&fnhe_lock);
1492
1493 return ret;
1494 }
1495
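/* Cache rt as the nexthop's input route or as this CPU's output route; cmpxchg() keeps the update lockless against concurrent callers. */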
1496 static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt)
1497 {
1498 struct rtable *orig, *prev, **p;
1499 bool ret = true;
1500
1501 if (rt_is_input_route(rt)) {
1502 p = (struct rtable **)&nhc->nhc_rth_input;
1503 } else {
1504 p = (struct rtable **)raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
1505 }
1506 orig = *p;
1507
1508 /* hold dst before doing cmpxchg() to avoid race condition
1509 * on this dst
1510 */
1511 dst_hold(&rt->dst);
1512 prev = cmpxchg(p, orig, rt);
1513 if (prev == orig) {
1514 if (orig) {
1515 rt_add_uncached_list(orig);
1516 dst_release(&orig->dst);
1517 }
1518 } else {
1519 dst_release(&rt->dst);
1520 ret = false;
1521 }
1522
1523 return ret;
1524 }
1525
1526 struct uncached_list {
1527 spinlock_t lock;
1528 struct list_head head;
1529 };
1530
1531 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1532
1533 void rt_add_uncached_list(struct rtable *rt)
1534 {
1535 struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1536
1537 rt->rt_uncached_list = ul;
1538
1539 spin_lock_bh(&ul->lock);
1540 list_add_tail(&rt->rt_uncached, &ul->head);
1541 spin_unlock_bh(&ul->lock);
1542 }
1543
1544 void rt_del_uncached_list(struct rtable *rt)
1545 {
1546 if (!list_empty(&rt->rt_uncached)) {
1547 struct uncached_list *ul = rt->rt_uncached_list;
1548
1549 spin_lock_bh(&ul->lock);
1550 list_del(&rt->rt_uncached);
1551 spin_unlock_bh(&ul->lock);
1552 }
1553 }
1554
1555 static void ipv4_dst_destroy(struct dst_entry *dst)
1556 {
1557 struct rtable *rt = (struct rtable *)dst;
1558
1559 ip_dst_metrics_put(dst);
1560 rt_del_uncached_list(rt);
1561 }
1562
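/* A device is going away: re-point its uncached routes at blackhole_netdev so outstanding dst references stay valid without pinning the device. */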
1563 void rt_flush_dev(struct net_device *dev)
1564 {
1565 struct rtable *rt;
1566 int cpu;
1567
1568 for_each_possible_cpu(cpu) {
1569 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1570
1571 spin_lock_bh(&ul->lock);
1572 list_for_each_entry(rt, &ul->head, rt_uncached) {
1573 if (rt->dst.dev != dev)
1574 continue;
1575 rt->dst.dev = blackhole_netdev;
1576 dev_hold(rt->dst.dev);
1577 dev_put(dev);
1578 }
1579 spin_unlock_bh(&ul->lock);
1580 }
1581 }
1582
1583 static bool rt_cache_valid(const struct rtable *rt)
1584 {
1585 return rt &&
1586 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1587 !rt_is_expired(rt);
1588 }
1589
1590 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1591 const struct fib_result *res,
1592 struct fib_nh_exception *fnhe,
1593 struct fib_info *fi, u16 type, u32 itag,
1594 const bool do_cache)
1595 {
1596 bool cached = false;
1597
1598 if (fi) {
1599 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1600
1601 if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) {
1602 rt->rt_uses_gateway = 1;
1603 rt->rt_gw_family = nhc->nhc_gw_family;
1604 /* only INET and INET6 are supported */
1605 if (likely(nhc->nhc_gw_family == AF_INET))
1606 rt->rt_gw4 = nhc->nhc_gw.ipv4;
1607 else
1608 rt->rt_gw6 = nhc->nhc_gw.ipv6;
1609 }
1610
1611 ip_dst_init_metrics(&rt->dst, fi->fib_metrics);
1612
1613 #ifdef CONFIG_IP_ROUTE_CLASSID
1614 if (nhc->nhc_family == AF_INET) {
1615 struct fib_nh *nh;
1616
1617 nh = container_of(nhc, struct fib_nh, nh_common);
1618 rt->dst.tclassid = nh->nh_tclassid;
1619 }
1620 #endif
1621 rt->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
1622 if (unlikely(fnhe))
1623 cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1624 else if (do_cache)
1625 cached = rt_cache_route(nhc, rt);
1626 if (unlikely(!cached)) {
1627 /* Routes we intend to cache in nexthop exception or
1628 * FIB nexthop have the DST_NOCACHE bit clear.
1629 * However, if we are unsuccessful at storing this
1630 * route into the cache we really need to set it.
1631 */
1632 if (!rt->rt_gw4) {
1633 rt->rt_gw_family = AF_INET;
1634 rt->rt_gw4 = daddr;
1635 }
1636 rt_add_uncached_list(rt);
1637 }
1638 } else
1639 rt_add_uncached_list(rt);
1640
1641 #ifdef CONFIG_IP_ROUTE_CLASSID
1642 #ifdef CONFIG_IP_MULTIPLE_TABLES
1643 set_class_tag(rt, res->tclassid);
1644 #endif
1645 set_class_tag(rt, itag);
1646 #endif
1647 }
1648
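/* Allocate a minimally initialised IPv4 rtable; callers fill in the gateway, input/output handlers and any caching themselves. */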
1649 struct rtable *rt_dst_alloc(struct net_device *dev,
1650 unsigned int flags, u16 type,
1651 bool nopolicy, bool noxfrm, bool will_cache)
1652 {
1653 struct rtable *rt;
1654
1655 rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1656 (will_cache ? 0 : DST_HOST) |
1657 (nopolicy ? DST_NOPOLICY : 0) |
1658 (noxfrm ? DST_NOXFRM : 0));
1659
1660 if (rt) {
1661 rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1662 rt->rt_flags = flags;
1663 rt->rt_type = type;
1664 rt->rt_is_input = 0;
1665 rt->rt_iif = 0;
1666 rt->rt_pmtu = 0;
1667 rt->rt_mtu_locked = 0;
1668 rt->rt_uses_gateway = 0;
1669 rt->rt_gw_family = 0;
1670 rt->rt_gw4 = 0;
1671 INIT_LIST_HEAD(&rt->rt_uncached);
1672
1673 rt->dst.output = ip_output;
1674 if (flags & RTCF_LOCAL)
1675 rt->dst.input = ip_local_deliver;
1676 }
1677
1678 return rt;
1679 }
1680 EXPORT_SYMBOL(rt_dst_alloc);
1681
1682 struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt)
1683 {
1684 struct rtable *new_rt;
1685
1686 new_rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1687 rt->dst.flags);
1688
1689 if (new_rt) {
1690 new_rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1691 new_rt->rt_flags = rt->rt_flags;
1692 new_rt->rt_type = rt->rt_type;
1693 new_rt->rt_is_input = rt->rt_is_input;
1694 new_rt->rt_iif = rt->rt_iif;
1695 new_rt->rt_pmtu = rt->rt_pmtu;
1696 new_rt->rt_mtu_locked = rt->rt_mtu_locked;
1697 new_rt->rt_gw_family = rt->rt_gw_family;
1698 if (rt->rt_gw_family == AF_INET)
1699 new_rt->rt_gw4 = rt->rt_gw4;
1700 else if (rt->rt_gw_family == AF_INET6)
1701 new_rt->rt_gw6 = rt->rt_gw6;
1702 INIT_LIST_HEAD(&new_rt->rt_uncached);
1703
1704 new_rt->dst.flags |= DST_HOST;
1705 new_rt->dst.input = rt->dst.input;
1706 new_rt->dst.output = rt->dst.output;
1707 new_rt->dst.error = rt->dst.error;
1708 new_rt->dst.lastuse = jiffies;
1709 new_rt->dst.lwtstate = lwtstate_get(rt->dst.lwtstate);
1710 }
1711 return new_rt;
1712 }
1713 EXPORT_SYMBOL(rt_dst_clone);
1714
1715 /* called in rcu_read_lock() section */
1716 int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1717 u8 tos, struct net_device *dev,
1718 struct in_device *in_dev, u32 *itag)
1719 {
1720 int err;
1721
1722 /* Primary sanity checks. */
1723 if (!in_dev)
1724 return -EINVAL;
1725
1726 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1727 skb->protocol != htons(ETH_P_IP))
1728 return -EINVAL;
1729
1730 if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1731 return -EINVAL;
1732
1733 if (ipv4_is_zeronet(saddr)) {
1734 if (!ipv4_is_local_multicast(daddr) &&
1735 ip_hdr(skb)->protocol != IPPROTO_IGMP)
1736 return -EINVAL;
1737 } else {
1738 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1739 in_dev, itag);
1740 if (err < 0)
1741 return err;
1742 }
1743 return 0;
1744 }
1745
1746 /* called in rcu_read_lock() section */
1747 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1748 u8 tos, struct net_device *dev, int our)
1749 {
1750 struct in_device *in_dev = __in_dev_get_rcu(dev);
1751 unsigned int flags = RTCF_MULTICAST;
1752 struct rtable *rth;
1753 u32 itag = 0;
1754 int err;
1755
1756 err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1757 if (err)
1758 return err;
1759
1760 if (our)
1761 flags |= RTCF_LOCAL;
1762
1763 rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1764 IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1765 if (!rth)
1766 return -ENOBUFS;
1767
1768 #ifdef CONFIG_IP_ROUTE_CLASSID
1769 rth->dst.tclassid = itag;
1770 #endif
1771 rth->dst.output = ip_rt_bug;
1772 rth->rt_is_input= 1;
1773
1774 #ifdef CONFIG_IP_MROUTE
1775 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1776 rth->dst.input = ip_mr_input;
1777 #endif
1778 RT_CACHE_STAT_INC(in_slow_mc);
1779
1780 skb_dst_drop(skb);
1781 skb_dst_set(skb, &rth->dst);
1782 return 0;
1783 }
1784
1785
1786 static void ip_handle_martian_source(struct net_device *dev,
1787 struct in_device *in_dev,
1788 struct sk_buff *skb,
1789 __be32 daddr,
1790 __be32 saddr)
1791 {
1792 RT_CACHE_STAT_INC(in_martian_src);
1793 #ifdef CONFIG_IP_ROUTE_VERBOSE
1794 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1795 /*
1796 * RFC 1812 recommendation: if the source is martian,
1797 * the only hint we can give is the MAC header.
1798 */
1799 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1800 &daddr, &saddr, dev->name);
1801 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1802 print_hex_dump(KERN_WARNING, "ll header: ",
1803 DUMP_PREFIX_OFFSET, 16, 1,
1804 skb_mac_header(skb),
1805 dev->hard_header_len, false);
1806 }
1807 }
1808 #endif
1809 }
1810
1811 /* called in rcu_read_lock() section */
1812 static int __mkroute_input(struct sk_buff *skb,
1813 const struct fib_result *res,
1814 struct in_device *in_dev,
1815 __be32 daddr, __be32 saddr, u32 tos)
1816 {
1817 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1818 struct net_device *dev = nhc->nhc_dev;
1819 struct fib_nh_exception *fnhe;
1820 struct rtable *rth;
1821 int err;
1822 struct in_device *out_dev;
1823 bool do_cache;
1824 u32 itag = 0;
1825
1826 /* get a working reference to the output device */
1827 out_dev = __in_dev_get_rcu(dev);
1828 if (!out_dev) {
1829 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1830 return -EINVAL;
1831 }
1832
1833 err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1834 in_dev->dev, in_dev, &itag);
1835 if (err < 0) {
1836 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1837 saddr);
1838
1839 goto cleanup;
1840 }
1841
1842 do_cache = res->fi && !itag;
1843 if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1844 skb->protocol == htons(ETH_P_IP)) {
1845 __be32 gw;
1846
1847 gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0;
1848 if (IN_DEV_SHARED_MEDIA(out_dev) ||
1849 inet_addr_onlink(out_dev, saddr, gw))
1850 IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1851 }
1852
1853 if (skb->protocol != htons(ETH_P_IP)) {
1854 /* Not IP (i.e. ARP). Do not create a route if it is
1855 * invalid for proxy ARP. DNAT routes are always valid.
1856 *
1857 * The proxy ARP feature has been extended to allow ARP
1858 * replies back out the same interface, to support
1859 * Private VLAN switch technologies. See arp.c.
1860 */
1861 if (out_dev == in_dev &&
1862 IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1863 err = -EINVAL;
1864 goto cleanup;
1865 }
1866 }
1867
1868 fnhe = find_exception(nhc, daddr);
1869 if (do_cache) {
1870 if (fnhe)
1871 rth = rcu_dereference(fnhe->fnhe_rth_input);
1872 else
1873 rth = rcu_dereference(nhc->nhc_rth_input);
1874 if (rt_cache_valid(rth)) {
1875 skb_dst_set_noref(skb, &rth->dst);
1876 goto out;
1877 }
1878 }
1879
1880 rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1881 IN_DEV_CONF_GET(in_dev, NOPOLICY),
1882 IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1883 if (!rth) {
1884 err = -ENOBUFS;
1885 goto cleanup;
1886 }
1887
1888 rth->rt_is_input = 1;
1889 RT_CACHE_STAT_INC(in_slow_tot);
1890
1891 rth->dst.input = ip_forward;
1892
1893 rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1894 do_cache);
1895 lwtunnel_set_redirect(&rth->dst);
1896 skb_dst_set(skb, &rth->dst);
1897 out:
1898 err = 0;
1899 cleanup:
1900 return err;
1901 }
1902
1903 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1904 /* To make ICMP packets follow the right flow, the multipath hash is
1905 * calculated from the inner IP addresses.
1906 */
1907 static void ip_multipath_l3_keys(const struct sk_buff *skb,
1908 struct flow_keys *hash_keys)
1909 {
1910 const struct iphdr *outer_iph = ip_hdr(skb);
1911 const struct iphdr *key_iph = outer_iph;
1912 const struct iphdr *inner_iph;
1913 const struct icmphdr *icmph;
1914 struct iphdr _inner_iph;
1915 struct icmphdr _icmph;
1916
1917 if (likely(outer_iph->protocol != IPPROTO_ICMP))
1918 goto out;
1919
1920 if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1921 goto out;
1922
1923 icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1924 &_icmph);
1925 if (!icmph)
1926 goto out;
1927
1928 if (icmph->type != ICMP_DEST_UNREACH &&
1929 icmph->type != ICMP_REDIRECT &&
1930 icmph->type != ICMP_TIME_EXCEEDED &&
1931 icmph->type != ICMP_PARAMETERPROB)
1932 goto out;
1933
1934 inner_iph = skb_header_pointer(skb,
1935 outer_iph->ihl * 4 + sizeof(_icmph),
1936 sizeof(_inner_iph), &_inner_iph);
1937 if (!inner_iph)
1938 goto out;
1939
1940 key_iph = inner_iph;
1941 out:
1942 hash_keys->addrs.v4addrs.src = key_iph->saddr;
1943 hash_keys->addrs.v4addrs.dst = key_iph->daddr;
1944 }
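
/*
 * Worked example (editorial note, not in the original source; addresses are
 * documentation prefixes): a flow 192.0.2.1 -> 203.0.113.9 hashes over
 * (192.0.2.1, 203.0.113.9).  If a router at 198.51.100.1 emits an ICMP
 * Time Exceeded for that flow, the error's outer header reads
 * 198.51.100.1 -> 192.0.2.1, but the embedded (inner) header still reads
 * 192.0.2.1 -> 203.0.113.9.  Hashing the inner addresses above therefore
 * gives the error the same multipath hash as the flow it refers to, so
 * both are steered to the same nexthop by fib_select_multipath().
 */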
1945
1946 /* if skb is set it will be used and fl4 can be NULL */
1947 int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
1948 const struct sk_buff *skb, struct flow_keys *flkeys)
1949 {
1950 u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0;
1951 struct flow_keys hash_keys;
1952 u32 mhash;
1953
1954 switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1955 case 0:
1956 memset(&hash_keys, 0, sizeof(hash_keys));
1957 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1958 if (skb) {
1959 ip_multipath_l3_keys(skb, &hash_keys);
1960 } else {
1961 hash_keys.addrs.v4addrs.src = fl4->saddr;
1962 hash_keys.addrs.v4addrs.dst = fl4->daddr;
1963 }
1964 break;
1965 case 1:
1966 /* skb is currently provided only when forwarding */
1967 if (skb) {
1968 unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1969 struct flow_keys keys;
1970
1971 /* short-circuit if we already have L4 hash present */
1972 if (skb->l4_hash)
1973 return skb_get_hash_raw(skb) >> 1;
1974
1975 memset(&hash_keys, 0, sizeof(hash_keys));
1976
1977 if (!flkeys) {
1978 skb_flow_dissect_flow_keys(skb, &keys, flag);
1979 flkeys = &keys;
1980 }
1981
1982 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1983 hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
1984 hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
1985 hash_keys.ports.src = flkeys->ports.src;
1986 hash_keys.ports.dst = flkeys->ports.dst;
1987 hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
1988 } else {
1989 memset(&hash_keys, 0, sizeof(hash_keys));
1990 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1991 hash_keys.addrs.v4addrs.src = fl4->saddr;
1992 hash_keys.addrs.v4addrs.dst = fl4->daddr;
1993 hash_keys.ports.src = fl4->fl4_sport;
1994 hash_keys.ports.dst = fl4->fl4_dport;
1995 hash_keys.basic.ip_proto = fl4->flowi4_proto;
1996 }
1997 break;
1998 case 2:
1999 memset(&hash_keys, 0, sizeof(hash_keys));
2000 /* skb is currently provided only when forwarding */
2001 if (skb) {
2002 struct flow_keys keys;
2003
2004 skb_flow_dissect_flow_keys(skb, &keys, 0);
2005 /* Inner can be v4 or v6 */
2006 if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
2007 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2008 hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
2009 hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
2010 } else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
2011 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2012 hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
2013 hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
2014 hash_keys.tags.flow_label = keys.tags.flow_label;
2015 hash_keys.basic.ip_proto = keys.basic.ip_proto;
2016 } else {
2017 /* Same as case 0 */
2018 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2019 ip_multipath_l3_keys(skb, &hash_keys);
2020 }
2021 } else {
2022 /* Same as case 0 */
2023 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2024 hash_keys.addrs.v4addrs.src = fl4->saddr;
2025 hash_keys.addrs.v4addrs.dst = fl4->daddr;
2026 }
2027 break;
2028 }
2029 mhash = flow_hash_from_keys(&hash_keys);
2030
2031 if (multipath_hash)
2032 mhash = jhash_2words(mhash, multipath_hash, 0);
2033
2034 return mhash >> 1;
2035 }
2036 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
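
/*
 * Illustrative sketch (editorial addition, not part of the original file):
 * how a caller that only has a flow key (no skb) could obtain the hash used
 * for nexthop selection.  All field values are placeholders; the real
 * consumer is fib_select_multipath(), as in ip_mkroute_input() below.
 */
#ifdef CONFIG_IP_ROUTE_MULTIPATH
static inline int example_multipath_hash(struct net *net,
					 __be32 saddr, __be32 daddr)
{
	struct flowi4 fl4 = {
		.saddr		= saddr,
		.daddr		= daddr,
		.flowi4_proto	= IPPROTO_TCP,
		.fl4_sport	= htons(12345),
		.fl4_dport	= htons(80),
	};

	/* skb == NULL: keys are taken from fl4 per the policy cases above */
	return fib_multipath_hash(net, &fl4, NULL, NULL);
}
#endif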
2037
2038 static int ip_mkroute_input(struct sk_buff *skb,
2039 struct fib_result *res,
2040 struct in_device *in_dev,
2041 __be32 daddr, __be32 saddr, u32 tos,
2042 struct flow_keys *hkeys)
2043 {
2044 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2045 if (res->fi && fib_info_num_path(res->fi) > 1) {
2046 int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
2047
2048 fib_select_multipath(res, h);
2049 }
2050 #endif
2051
2052 /* create a routing cache entry */
2053 return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
2054 }
2055
2056 /*
2057  *	NOTE. We drop all packets that have a local source
2058  *	address, because every properly looped-back packet
2059  *	must already have the correct destination attached by the output routine.
2060  *
2061  *	This approach solves two big problems:
2062  *	1. Non-simplex devices are handled properly.
2063  *	2. IP spoofing attempts are filtered with a 100% guarantee.
2064  *	Called with rcu_read_lock().
2065  */
2066
2067 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2068 u8 tos, struct net_device *dev,
2069 struct fib_result *res)
2070 {
2071 struct in_device *in_dev = __in_dev_get_rcu(dev);
2072 struct flow_keys *flkeys = NULL, _flkeys;
2073 struct net *net = dev_net(dev);
2074 struct ip_tunnel_info *tun_info;
2075 int err = -EINVAL;
2076 unsigned int flags = 0;
2077 u32 itag = 0;
2078 struct rtable *rth;
2079 struct flowi4 fl4;
2080 bool do_cache = true;
2081
2082 /* IP on this device is disabled. */
2083
2084 if (!in_dev)
2085 goto out;
2086
2087 	/* Check for the weirdest martians, which cannot be detected
2088 	   by fib_lookup.
2089 	 */
2090
2091 tun_info = skb_tunnel_info(skb);
2092 if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2093 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
2094 else
2095 fl4.flowi4_tun_key.tun_id = 0;
2096 skb_dst_drop(skb);
2097
2098 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2099 goto martian_source;
2100
2101 res->fi = NULL;
2102 res->table = NULL;
2103 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2104 goto brd_input;
2105
2106 	/* Accept zero addresses only for limited broadcast;
2107 	 * I do not even know whether to fix this or not. Waiting for complaints :-)
2108 	 */
2109 if (ipv4_is_zeronet(saddr))
2110 goto martian_source;
2111
2112 if (ipv4_is_zeronet(daddr))
2113 goto martian_destination;
2114
2115 	/* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
2116 	 * and calls it only once if daddr and/or saddr is a loopback address.
2117 	 */
2118 if (ipv4_is_loopback(daddr)) {
2119 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2120 goto martian_destination;
2121 } else if (ipv4_is_loopback(saddr)) {
2122 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2123 goto martian_source;
2124 }
2125
2126 /*
2127 * Now we are ready to route packet.
2128 */
2129 fl4.flowi4_oif = 0;
2130 fl4.flowi4_iif = dev->ifindex;
2131 fl4.flowi4_mark = skb->mark;
2132 fl4.flowi4_tos = tos;
2133 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2134 fl4.flowi4_flags = 0;
2135 fl4.daddr = daddr;
2136 fl4.saddr = saddr;
2137 fl4.flowi4_uid = sock_net_uid(net, NULL);
2138 fl4.flowi4_multipath_hash = 0;
2139
2140 if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
2141 flkeys = &_flkeys;
2142 } else {
2143 fl4.flowi4_proto = 0;
2144 fl4.fl4_sport = 0;
2145 fl4.fl4_dport = 0;
2146 }
2147
2148 err = fib_lookup(net, &fl4, res, 0);
2149 if (err != 0) {
2150 if (!IN_DEV_FORWARD(in_dev))
2151 err = -EHOSTUNREACH;
2152 goto no_route;
2153 }
2154
2155 if (res->type == RTN_BROADCAST) {
2156 if (IN_DEV_BFORWARD(in_dev))
2157 goto make_route;
2158 		/* do not cache if bc_forwarding is enabled */
2159 if (IPV4_DEVCONF_ALL(net, BC_FORWARDING))
2160 do_cache = false;
2161 goto brd_input;
2162 }
2163
2164 if (res->type == RTN_LOCAL) {
2165 err = fib_validate_source(skb, saddr, daddr, tos,
2166 0, dev, in_dev, &itag);
2167 if (err < 0)
2168 goto martian_source;
2169 goto local_input;
2170 }
2171
2172 if (!IN_DEV_FORWARD(in_dev)) {
2173 err = -EHOSTUNREACH;
2174 goto no_route;
2175 }
2176 if (res->type != RTN_UNICAST)
2177 goto martian_destination;
2178
2179 make_route:
2180 err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
2181 out: return err;
2182
2183 brd_input:
2184 if (skb->protocol != htons(ETH_P_IP))
2185 goto e_inval;
2186
2187 if (!ipv4_is_zeronet(saddr)) {
2188 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2189 in_dev, &itag);
2190 if (err < 0)
2191 goto martian_source;
2192 }
2193 flags |= RTCF_BROADCAST;
2194 res->type = RTN_BROADCAST;
2195 RT_CACHE_STAT_INC(in_brd);
2196
2197 local_input:
2198 do_cache &= res->fi && !itag;
2199 if (do_cache) {
2200 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2201
2202 rth = rcu_dereference(nhc->nhc_rth_input);
2203 if (rt_cache_valid(rth)) {
2204 skb_dst_set_noref(skb, &rth->dst);
2205 err = 0;
2206 goto out;
2207 }
2208 }
2209
2210 rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
2211 flags | RTCF_LOCAL, res->type,
2212 IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
2213 if (!rth)
2214 goto e_nobufs;
2215
2216 	rth->dst.output = ip_rt_bug;
2217 #ifdef CONFIG_IP_ROUTE_CLASSID
2218 rth->dst.tclassid = itag;
2219 #endif
2220 rth->rt_is_input = 1;
2221
2222 RT_CACHE_STAT_INC(in_slow_tot);
2223 if (res->type == RTN_UNREACHABLE) {
2224 		rth->dst.input = ip_error;
2225 		rth->dst.error = -err;
2226 rth->rt_flags &= ~RTCF_LOCAL;
2227 }
2228
2229 if (do_cache) {
2230 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2231
2232 rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
2233 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2234 WARN_ON(rth->dst.input == lwtunnel_input);
2235 rth->dst.lwtstate->orig_input = rth->dst.input;
2236 rth->dst.input = lwtunnel_input;
2237 }
2238
2239 if (unlikely(!rt_cache_route(nhc, rth)))
2240 rt_add_uncached_list(rth);
2241 }
2242 skb_dst_set(skb, &rth->dst);
2243 err = 0;
2244 goto out;
2245
2246 no_route:
2247 RT_CACHE_STAT_INC(in_no_route);
2248 res->type = RTN_UNREACHABLE;
2249 res->fi = NULL;
2250 res->table = NULL;
2251 goto local_input;
2252
2253 /*
2254 * Do not cache martian addresses: they should be logged (RFC1812)
2255 */
2256 martian_destination:
2257 RT_CACHE_STAT_INC(in_martian_dst);
2258 #ifdef CONFIG_IP_ROUTE_VERBOSE
2259 if (IN_DEV_LOG_MARTIANS(in_dev))
2260 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2261 &daddr, &saddr, dev->name);
2262 #endif
2263
2264 e_inval:
2265 err = -EINVAL;
2266 goto out;
2267
2268 e_nobufs:
2269 err = -ENOBUFS;
2270 goto out;
2271
2272 martian_source:
2273 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2274 goto out;
2275 }
2276
2277 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2278 u8 tos, struct net_device *dev)
2279 {
2280 struct fib_result res;
2281 int err;
2282
2283 tos &= IPTOS_RT_MASK;
2284 rcu_read_lock();
2285 err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2286 rcu_read_unlock();
2287
2288 return err;
2289 }
2290 EXPORT_SYMBOL(ip_route_input_noref);
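
/*
 * Illustrative sketch (editorial addition, not part of the original file):
 * how a receive-path caller typically resolves the input route for an skb,
 * mirroring what the IP receive path does before dst_input().
 */
static inline int example_input_route(struct sk_buff *skb)
{
	const struct iphdr *iph = ip_hdr(skb);

	/* on success the resulting dst has been attached to the skb */
	return ip_route_input_noref(skb, iph->daddr, iph->saddr,
				    iph->tos, skb->dev);
}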
2291
2292 /* called with rcu_read_lock held */
2293 int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2294 u8 tos, struct net_device *dev, struct fib_result *res)
2295 {
2296 	/* Multicast recognition logic was moved from the route cache to here.
2297 	   The problem was that too many Ethernet cards have broken/missing
2298 	   hardware multicast filters :-( As a result, a host on a multicast
2299 	   network acquires a lot of useless route cache entries, e.g. for
2300 	   SDR messages from all over the world. Now we try to get rid of them.
2301 	   Really, provided the software IP multicast filter is organized
2302 	   reasonably (at least, hashed), this does not cause a slowdown
2303 	   compared with route cache reject entries.
2304 	   Note that multicast routers are not affected, because
2305 	   a route cache entry is created eventually.
2306 	 */
2307 if (ipv4_is_multicast(daddr)) {
2308 struct in_device *in_dev = __in_dev_get_rcu(dev);
2309 int our = 0;
2310 int err = -EINVAL;
2311
2312 if (!in_dev)
2313 return err;
2314 our = ip_check_mc_rcu(in_dev, daddr, saddr,
2315 ip_hdr(skb)->protocol);
2316
2317 /* check l3 master if no match yet */
2318 if (!our && netif_is_l3_slave(dev)) {
2319 struct in_device *l3_in_dev;
2320
2321 l3_in_dev = __in_dev_get_rcu(skb->dev);
2322 if (l3_in_dev)
2323 our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2324 ip_hdr(skb)->protocol);
2325 }
2326
2327 if (our
2328 #ifdef CONFIG_IP_MROUTE
2329 ||
2330 (!ipv4_is_local_multicast(daddr) &&
2331 IN_DEV_MFORWARD(in_dev))
2332 #endif
2333 ) {
2334 err = ip_route_input_mc(skb, daddr, saddr,
2335 tos, dev, our);
2336 }
2337 return err;
2338 }
2339
2340 return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2341 }
2342
2343 /* called with rcu_read_lock() */
2344 static struct rtable *__mkroute_output(const struct fib_result *res,
2345 const struct flowi4 *fl4, int orig_oif,
2346 struct net_device *dev_out,
2347 unsigned int flags)
2348 {
2349 struct fib_info *fi = res->fi;
2350 struct fib_nh_exception *fnhe;
2351 struct in_device *in_dev;
2352 u16 type = res->type;
2353 struct rtable *rth;
2354 bool do_cache;
2355
2356 in_dev = __in_dev_get_rcu(dev_out);
2357 if (!in_dev)
2358 return ERR_PTR(-EINVAL);
2359
2360 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2361 if (ipv4_is_loopback(fl4->saddr) &&
2362 !(dev_out->flags & IFF_LOOPBACK) &&
2363 !netif_is_l3_master(dev_out))
2364 return ERR_PTR(-EINVAL);
2365
2366 if (ipv4_is_lbcast(fl4->daddr))
2367 type = RTN_BROADCAST;
2368 else if (ipv4_is_multicast(fl4->daddr))
2369 type = RTN_MULTICAST;
2370 else if (ipv4_is_zeronet(fl4->daddr))
2371 return ERR_PTR(-EINVAL);
2372
2373 if (dev_out->flags & IFF_LOOPBACK)
2374 flags |= RTCF_LOCAL;
2375
2376 do_cache = true;
2377 if (type == RTN_BROADCAST) {
2378 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2379 fi = NULL;
2380 } else if (type == RTN_MULTICAST) {
2381 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2382 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2383 fl4->flowi4_proto))
2384 flags &= ~RTCF_LOCAL;
2385 else
2386 do_cache = false;
2387 		/* If the multicast route does not exist, use
2388 		 * the default one, but do not gateway in this case.
2389 		 * Yes, it is a hack.
2390 		 */
2391 if (fi && res->prefixlen < 4)
2392 fi = NULL;
2393 } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2394 (orig_oif != dev_out->ifindex)) {
2395 /* For local routes that require a particular output interface
2396 * we do not want to cache the result. Caching the result
2397 * causes incorrect behaviour when there are multiple source
2398 * addresses on the interface, the end result being that if the
2399 * intended recipient is waiting on that interface for the
2400 * packet he won't receive it because it will be delivered on
2401 * the loopback interface and the IP_PKTINFO ipi_ifindex will
2402 * be set to the loopback interface as well.
2403 */
2404 do_cache = false;
2405 }
2406
2407 fnhe = NULL;
2408 do_cache &= fi != NULL;
2409 if (fi) {
2410 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2411 struct rtable __rcu **prth;
2412
2413 fnhe = find_exception(nhc, fl4->daddr);
2414 if (!do_cache)
2415 goto add;
2416 if (fnhe) {
2417 prth = &fnhe->fnhe_rth_output;
2418 } else {
2419 if (unlikely(fl4->flowi4_flags &
2420 FLOWI_FLAG_KNOWN_NH &&
2421 !(nhc->nhc_gw_family &&
2422 nhc->nhc_scope == RT_SCOPE_LINK))) {
2423 do_cache = false;
2424 goto add;
2425 }
2426 prth = raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
2427 }
2428 rth = rcu_dereference(*prth);
2429 if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2430 return rth;
2431 }
2432
2433 add:
2434 rth = rt_dst_alloc(dev_out, flags, type,
2435 IN_DEV_CONF_GET(in_dev, NOPOLICY),
2436 IN_DEV_CONF_GET(in_dev, NOXFRM),
2437 do_cache);
2438 if (!rth)
2439 return ERR_PTR(-ENOBUFS);
2440
2441 rth->rt_iif = orig_oif;
2442
2443 RT_CACHE_STAT_INC(out_slow_tot);
2444
2445 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2446 if (flags & RTCF_LOCAL &&
2447 !(dev_out->flags & IFF_LOOPBACK)) {
2448 rth->dst.output = ip_mc_output;
2449 RT_CACHE_STAT_INC(out_slow_mc);
2450 }
2451 #ifdef CONFIG_IP_MROUTE
2452 if (type == RTN_MULTICAST) {
2453 if (IN_DEV_MFORWARD(in_dev) &&
2454 !ipv4_is_local_multicast(fl4->daddr)) {
2455 rth->dst.input = ip_mr_input;
2456 rth->dst.output = ip_mc_output;
2457 }
2458 }
2459 #endif
2460 }
2461
2462 rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2463 lwtunnel_set_redirect(&rth->dst);
2464
2465 return rth;
2466 }
2467
2468 /*
2469 * Major route resolver routine.
2470 */
2471
2472 struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2473 const struct sk_buff *skb)
2474 {
2475 __u8 tos = RT_FL_TOS(fl4);
2476 struct fib_result res = {
2477 .type = RTN_UNSPEC,
2478 .fi = NULL,
2479 .table = NULL,
2480 .tclassid = 0,
2481 };
2482 struct rtable *rth;
2483
2484 fl4->flowi4_iif = LOOPBACK_IFINDEX;
2485 fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2486 fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2487 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2488
2489 rcu_read_lock();
2490 rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2491 rcu_read_unlock();
2492
2493 return rth;
2494 }
2495 EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
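
/*
 * Illustrative sketch (editorial addition, not part of the original file):
 * resolving an output route directly with the resolver above.  The
 * destination is a placeholder; note that the lookup fills in fl4.saddr
 * and that the caller owns a reference on success.
 */
static inline void example_output_lookup(struct net *net, __be32 daddr)
{
	struct flowi4 fl4 = {
		.daddr		= daddr,
		.flowi4_proto	= IPPROTO_UDP,
	};
	struct rtable *rt;

	rt = ip_route_output_key_hash(net, &fl4, NULL);
	if (IS_ERR(rt))
		return;			/* e.g. -ENETUNREACH, -EINVAL */

	/* ... use rt->dst.dev, fl4.saddr, dst_mtu(&rt->dst), ... */

	ip_rt_put(rt);			/* drop the reference when done */
}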
2496
2497 struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2498 struct fib_result *res,
2499 const struct sk_buff *skb)
2500 {
2501 struct net_device *dev_out = NULL;
2502 int orig_oif = fl4->flowi4_oif;
2503 unsigned int flags = 0;
2504 struct rtable *rth;
2505 int err;
2506
2507 if (fl4->saddr) {
2508 if (ipv4_is_multicast(fl4->saddr) ||
2509 ipv4_is_lbcast(fl4->saddr) ||
2510 ipv4_is_zeronet(fl4->saddr)) {
2511 rth = ERR_PTR(-EINVAL);
2512 goto out;
2513 }
2514
2515 rth = ERR_PTR(-ENETUNREACH);
2516
2517 		/* I removed the check for oif == dev_out->oif here.
2518 		   It was wrong for two reasons:
2519 		   1. ip_dev_find(net, saddr) can return the wrong iface, if saddr
2520 		      is assigned to multiple interfaces.
2521 		   2. Moreover, we are allowed to send packets with a saddr
2522 		      of another iface. --ANK
2523 		 */
2524
2525 if (fl4->flowi4_oif == 0 &&
2526 (ipv4_is_multicast(fl4->daddr) ||
2527 ipv4_is_lbcast(fl4->daddr))) {
2528 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2529 dev_out = __ip_dev_find(net, fl4->saddr, false);
2530 if (!dev_out)
2531 goto out;
2532
2533 			/* Special hack: user can direct multicasts
2534 			   and limited broadcast via the necessary interface
2535 			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2536 			   This hack is not just for fun, it allows
2537 			   vic, vat and friends to work.
2538 			   They bind the socket to loopback, set ttl to zero
2539 			   and expect that it will work.
2540 			   From the viewpoint of the routing cache they are broken,
2541 			   because we are not allowed to build a multicast path
2542 			   with a loopback source addr (look, the routing cache
2543 			   cannot know that ttl is zero, so the packet
2544 			   will not leave this host and the route is valid).
2545 			   Luckily, this hack is a good workaround.
2546 			 */
2547
2548 fl4->flowi4_oif = dev_out->ifindex;
2549 goto make_route;
2550 }
2551
2552 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2553 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2554 if (!__ip_dev_find(net, fl4->saddr, false))
2555 goto out;
2556 }
2557 }
2558
2559
2560 if (fl4->flowi4_oif) {
2561 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2562 rth = ERR_PTR(-ENODEV);
2563 if (!dev_out)
2564 goto out;
2565
2566 /* RACE: Check return value of inet_select_addr instead. */
2567 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2568 rth = ERR_PTR(-ENETUNREACH);
2569 goto out;
2570 }
2571 if (ipv4_is_local_multicast(fl4->daddr) ||
2572 ipv4_is_lbcast(fl4->daddr) ||
2573 fl4->flowi4_proto == IPPROTO_IGMP) {
2574 if (!fl4->saddr)
2575 fl4->saddr = inet_select_addr(dev_out, 0,
2576 RT_SCOPE_LINK);
2577 goto make_route;
2578 }
2579 if (!fl4->saddr) {
2580 if (ipv4_is_multicast(fl4->daddr))
2581 fl4->saddr = inet_select_addr(dev_out, 0,
2582 fl4->flowi4_scope);
2583 else if (!fl4->daddr)
2584 fl4->saddr = inet_select_addr(dev_out, 0,
2585 RT_SCOPE_HOST);
2586 }
2587 }
2588
2589 if (!fl4->daddr) {
2590 fl4->daddr = fl4->saddr;
2591 if (!fl4->daddr)
2592 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2593 dev_out = net->loopback_dev;
2594 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2595 res->type = RTN_LOCAL;
2596 flags |= RTCF_LOCAL;
2597 goto make_route;
2598 }
2599
2600 err = fib_lookup(net, fl4, res, 0);
2601 if (err) {
2602 res->fi = NULL;
2603 res->table = NULL;
2604 if (fl4->flowi4_oif &&
2605 (ipv4_is_multicast(fl4->daddr) ||
2606 !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2607 			/* Apparently, the routing tables are wrong. Assume
2608 			   that the destination is on-link.
2609 
2610 			   WHY? DW.
2611 			   Because we are allowed to send to an iface
2612 			   even if it has NO routes and NO assigned
2613 			   addresses. When oif is specified, routing
2614 			   tables are looked up with only one purpose:
2615 			   to catch whether the destination is gatewayed, rather than
2616 			   direct. Moreover, if MSG_DONTROUTE is set,
2617 			   we send the packet, ignoring both routing tables
2618 			   and ifaddr state. --ANK
2619 
2620 
2621 			   We could do this even if oif is unknown
2622 			   (likely as IPv6 does), but we do not.
2623 			 */
2624
2625 if (fl4->saddr == 0)
2626 fl4->saddr = inet_select_addr(dev_out, 0,
2627 RT_SCOPE_LINK);
2628 res->type = RTN_UNICAST;
2629 goto make_route;
2630 }
2631 rth = ERR_PTR(err);
2632 goto out;
2633 }
2634
2635 if (res->type == RTN_LOCAL) {
2636 if (!fl4->saddr) {
2637 if (res->fi->fib_prefsrc)
2638 fl4->saddr = res->fi->fib_prefsrc;
2639 else
2640 fl4->saddr = fl4->daddr;
2641 }
2642
2643 /* L3 master device is the loopback for that domain */
2644 dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2645 net->loopback_dev;
2646
2647 /* make sure orig_oif points to fib result device even
2648 * though packet rx/tx happens over loopback or l3mdev
2649 */
2650 orig_oif = FIB_RES_OIF(*res);
2651
2652 fl4->flowi4_oif = dev_out->ifindex;
2653 flags |= RTCF_LOCAL;
2654 goto make_route;
2655 }
2656
2657 fib_select_path(net, res, fl4, skb);
2658
2659 dev_out = FIB_RES_DEV(*res);
2660
2661 make_route:
2662 rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2663
2664 out:
2665 return rth;
2666 }
2667
2668 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2669 {
2670 return NULL;
2671 }
2672
2673 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2674 {
2675 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2676
2677 return mtu ? : dst->dev->mtu;
2678 }
2679
2680 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2681 struct sk_buff *skb, u32 mtu,
2682 bool confirm_neigh)
2683 {
2684 }
2685
2686 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2687 struct sk_buff *skb)
2688 {
2689 }
2690
2691 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2692 unsigned long old)
2693 {
2694 return NULL;
2695 }
2696
2697 static struct dst_ops ipv4_dst_blackhole_ops = {
2698 .family = AF_INET,
2699 .check = ipv4_blackhole_dst_check,
2700 .mtu = ipv4_blackhole_mtu,
2701 .default_advmss = ipv4_default_advmss,
2702 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2703 .redirect = ipv4_rt_blackhole_redirect,
2704 .cow_metrics = ipv4_rt_blackhole_cow_metrics,
2705 .neigh_lookup = ipv4_neigh_lookup,
2706 };
2707
2708 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2709 {
2710 struct rtable *ort = (struct rtable *) dst_orig;
2711 struct rtable *rt;
2712
2713 rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2714 if (rt) {
2715 struct dst_entry *new = &rt->dst;
2716
2717 new->__use = 1;
2718 new->input = dst_discard;
2719 new->output = dst_discard_out;
2720
2721 new->dev = net->loopback_dev;
2722 if (new->dev)
2723 dev_hold(new->dev);
2724
2725 rt->rt_is_input = ort->rt_is_input;
2726 rt->rt_iif = ort->rt_iif;
2727 rt->rt_pmtu = ort->rt_pmtu;
2728 rt->rt_mtu_locked = ort->rt_mtu_locked;
2729
2730 rt->rt_genid = rt_genid_ipv4(net);
2731 rt->rt_flags = ort->rt_flags;
2732 rt->rt_type = ort->rt_type;
2733 rt->rt_uses_gateway = ort->rt_uses_gateway;
2734 rt->rt_gw_family = ort->rt_gw_family;
2735 if (rt->rt_gw_family == AF_INET)
2736 rt->rt_gw4 = ort->rt_gw4;
2737 else if (rt->rt_gw_family == AF_INET6)
2738 rt->rt_gw6 = ort->rt_gw6;
2739
2740 INIT_LIST_HEAD(&rt->rt_uncached);
2741 }
2742
2743 dst_release(dst_orig);
2744
2745 return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2746 }
2747
2748 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2749 const struct sock *sk)
2750 {
2751 struct rtable *rt = __ip_route_output_key(net, flp4);
2752
2753 if (IS_ERR(rt))
2754 return rt;
2755
2756 if (flp4->flowi4_proto) {
2757 flp4->flowi4_oif = rt->dst.dev->ifindex;
2758 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2759 flowi4_to_flowi(flp4),
2760 sk, 0);
2761 }
2762
2763 return rt;
2764 }
2765 EXPORT_SYMBOL_GPL(ip_route_output_flow);
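
/*
 * Illustrative sketch (editorial addition, not part of the original file):
 * a socket-based caller resolving an output route through the xfrm-aware
 * wrapper above.  Only a few flow fields are filled in; real callers
 * usually build the key with flowi4_init_output().
 */
static inline struct rtable *example_output_flow(struct net *net,
						 struct sock *sk,
						 __be32 daddr, __be16 dport)
{
	struct flowi4 fl4 = {
		.daddr		= daddr,
		.flowi4_proto	= sk->sk_protocol,
		.fl4_dport	= dport,
		.flowi4_mark	= sk->sk_mark,
	};

	/* may return an xfrm bundle or an ERR_PTR(); check with IS_ERR() */
	return ip_route_output_flow(net, &fl4, sk);
}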
2766
2767 /* called with rcu_read_lock held */
2768 static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2769 struct rtable *rt, u32 table_id, struct flowi4 *fl4,
2770 struct sk_buff *skb, u32 portid, u32 seq,
2771 unsigned int flags)
2772 {
2773 struct rtmsg *r;
2774 struct nlmsghdr *nlh;
2775 unsigned long expires = 0;
2776 u32 error;
2777 u32 metrics[RTAX_MAX];
2778
2779 nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), flags);
2780 if (!nlh)
2781 return -EMSGSIZE;
2782
2783 r = nlmsg_data(nlh);
2784 r->rtm_family = AF_INET;
2785 r->rtm_dst_len = 32;
2786 r->rtm_src_len = 0;
2787 r->rtm_tos = fl4 ? fl4->flowi4_tos : 0;
2788 r->rtm_table = table_id < 256 ? table_id : RT_TABLE_COMPAT;
2789 if (nla_put_u32(skb, RTA_TABLE, table_id))
2790 goto nla_put_failure;
2791 r->rtm_type = rt->rt_type;
2792 r->rtm_scope = RT_SCOPE_UNIVERSE;
2793 r->rtm_protocol = RTPROT_UNSPEC;
2794 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2795 if (rt->rt_flags & RTCF_NOTIFY)
2796 r->rtm_flags |= RTM_F_NOTIFY;
2797 if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2798 r->rtm_flags |= RTCF_DOREDIRECT;
2799
2800 if (nla_put_in_addr(skb, RTA_DST, dst))
2801 goto nla_put_failure;
2802 if (src) {
2803 r->rtm_src_len = 32;
2804 if (nla_put_in_addr(skb, RTA_SRC, src))
2805 goto nla_put_failure;
2806 }
2807 if (rt->dst.dev &&
2808 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2809 goto nla_put_failure;
2810 #ifdef CONFIG_IP_ROUTE_CLASSID
2811 if (rt->dst.tclassid &&
2812 nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2813 goto nla_put_failure;
2814 #endif
2815 if (fl4 && !rt_is_input_route(rt) &&
2816 fl4->saddr != src) {
2817 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2818 goto nla_put_failure;
2819 }
2820 if (rt->rt_uses_gateway) {
2821 if (rt->rt_gw_family == AF_INET &&
2822 nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) {
2823 goto nla_put_failure;
2824 } else if (rt->rt_gw_family == AF_INET6) {
2825 int alen = sizeof(struct in6_addr);
2826 struct nlattr *nla;
2827 struct rtvia *via;
2828
2829 nla = nla_reserve(skb, RTA_VIA, alen + 2);
2830 if (!nla)
2831 goto nla_put_failure;
2832
2833 via = nla_data(nla);
2834 via->rtvia_family = AF_INET6;
2835 memcpy(via->rtvia_addr, &rt->rt_gw6, alen);
2836 }
2837 }
2838
2839 expires = rt->dst.expires;
2840 if (expires) {
2841 unsigned long now = jiffies;
2842
2843 if (time_before(now, expires))
2844 expires -= now;
2845 else
2846 expires = 0;
2847 }
2848
2849 memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2850 if (rt->rt_pmtu && expires)
2851 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2852 if (rt->rt_mtu_locked && expires)
2853 metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
2854 if (rtnetlink_put_metrics(skb, metrics) < 0)
2855 goto nla_put_failure;
2856
2857 if (fl4) {
2858 if (fl4->flowi4_mark &&
2859 nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2860 goto nla_put_failure;
2861
2862 if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2863 nla_put_u32(skb, RTA_UID,
2864 from_kuid_munged(current_user_ns(),
2865 fl4->flowi4_uid)))
2866 goto nla_put_failure;
2867
2868 if (rt_is_input_route(rt)) {
2869 #ifdef CONFIG_IP_MROUTE
2870 if (ipv4_is_multicast(dst) &&
2871 !ipv4_is_local_multicast(dst) &&
2872 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2873 int err = ipmr_get_route(net, skb,
2874 fl4->saddr, fl4->daddr,
2875 r, portid);
2876
2877 if (err <= 0) {
2878 if (err == 0)
2879 return 0;
2880 goto nla_put_failure;
2881 }
2882 } else
2883 #endif
2884 if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
2885 goto nla_put_failure;
2886 }
2887 }
2888
2889 error = rt->dst.error;
2890
2891 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2892 goto nla_put_failure;
2893
2894 nlmsg_end(skb, nlh);
2895 return 0;
2896
2897 nla_put_failure:
2898 nlmsg_cancel(skb, nlh);
2899 return -EMSGSIZE;
2900 }
2901
2902 static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb,
2903 struct netlink_callback *cb, u32 table_id,
2904 struct fnhe_hash_bucket *bucket, int genid,
2905 int *fa_index, int fa_start, unsigned int flags)
2906 {
2907 int i;
2908
2909 for (i = 0; i < FNHE_HASH_SIZE; i++) {
2910 struct fib_nh_exception *fnhe;
2911
2912 for (fnhe = rcu_dereference(bucket[i].chain); fnhe;
2913 fnhe = rcu_dereference(fnhe->fnhe_next)) {
2914 struct rtable *rt;
2915 int err;
2916
2917 if (*fa_index < fa_start)
2918 goto next;
2919
2920 if (fnhe->fnhe_genid != genid)
2921 goto next;
2922
2923 if (fnhe->fnhe_expires &&
2924 time_after(jiffies, fnhe->fnhe_expires))
2925 goto next;
2926
2927 rt = rcu_dereference(fnhe->fnhe_rth_input);
2928 if (!rt)
2929 rt = rcu_dereference(fnhe->fnhe_rth_output);
2930 if (!rt)
2931 goto next;
2932
2933 err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt,
2934 table_id, NULL, skb,
2935 NETLINK_CB(cb->skb).portid,
2936 cb->nlh->nlmsg_seq, flags);
2937 if (err)
2938 return err;
2939 next:
2940 (*fa_index)++;
2941 }
2942 }
2943
2944 return 0;
2945 }
2946
2947 int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
2948 u32 table_id, struct fib_info *fi,
2949 int *fa_index, int fa_start, unsigned int flags)
2950 {
2951 struct net *net = sock_net(cb->skb->sk);
2952 int nhsel, genid = fnhe_genid(net);
2953
2954 for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) {
2955 struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel);
2956 struct fnhe_hash_bucket *bucket;
2957 int err;
2958
2959 if (nhc->nhc_flags & RTNH_F_DEAD)
2960 continue;
2961
2962 rcu_read_lock();
2963 bucket = rcu_dereference(nhc->nhc_exceptions);
2964 err = 0;
2965 if (bucket)
2966 err = fnhe_dump_bucket(net, skb, cb, table_id, bucket,
2967 genid, fa_index, fa_start,
2968 flags);
2969 rcu_read_unlock();
2970 if (err)
2971 return err;
2972 }
2973
2974 return 0;
2975 }
2976
2977 static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
2978 u8 ip_proto, __be16 sport,
2979 __be16 dport)
2980 {
2981 struct sk_buff *skb;
2982 struct iphdr *iph;
2983
2984 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2985 if (!skb)
2986 return NULL;
2987
2988 	/* Reserve room for dummy headers; this skb can pass
2989 	 * through a good chunk of the routing engine.
2990 	 */
2991 skb_reset_mac_header(skb);
2992 skb_reset_network_header(skb);
2993 skb->protocol = htons(ETH_P_IP);
2994 iph = skb_put(skb, sizeof(struct iphdr));
2995 iph->protocol = ip_proto;
2996 iph->saddr = src;
2997 iph->daddr = dst;
2998 iph->version = 0x4;
2999 iph->frag_off = 0;
3000 iph->ihl = 0x5;
3001 skb_set_transport_header(skb, skb->len);
3002
3003 switch (iph->protocol) {
3004 case IPPROTO_UDP: {
3005 struct udphdr *udph;
3006
3007 udph = skb_put_zero(skb, sizeof(struct udphdr));
3008 udph->source = sport;
3009 udph->dest = dport;
3010 udph->len = htons(sizeof(struct udphdr));
3011 udph->check = 0;
3012 break;
3013 }
3014 case IPPROTO_TCP: {
3015 struct tcphdr *tcph;
3016
3017 tcph = skb_put_zero(skb, sizeof(struct tcphdr));
3018 tcph->source = sport;
3019 tcph->dest = dport;
3020 tcph->doff = sizeof(struct tcphdr) / 4;
3021 tcph->rst = 1;
3022 tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
3023 src, dst, 0);
3024 break;
3025 }
3026 case IPPROTO_ICMP: {
3027 struct icmphdr *icmph;
3028
3029 icmph = skb_put_zero(skb, sizeof(struct icmphdr));
3030 icmph->type = ICMP_ECHO;
3031 icmph->code = 0;
3032 }
3033 }
3034
3035 return skb;
3036 }
3037
3038 static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
3039 const struct nlmsghdr *nlh,
3040 struct nlattr **tb,
3041 struct netlink_ext_ack *extack)
3042 {
3043 struct rtmsg *rtm;
3044 int i, err;
3045
3046 if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
3047 NL_SET_ERR_MSG(extack,
3048 "ipv4: Invalid header for route get request");
3049 return -EINVAL;
3050 }
3051
3052 if (!netlink_strict_get_check(skb))
3053 return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
3054 rtm_ipv4_policy, extack);
3055
3056 rtm = nlmsg_data(nlh);
3057 if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
3058 (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
3059 rtm->rtm_table || rtm->rtm_protocol ||
3060 rtm->rtm_scope || rtm->rtm_type) {
3061 NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request");
3062 return -EINVAL;
3063 }
3064
3065 if (rtm->rtm_flags & ~(RTM_F_NOTIFY |
3066 RTM_F_LOOKUP_TABLE |
3067 RTM_F_FIB_MATCH)) {
3068 NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request");
3069 return -EINVAL;
3070 }
3071
3072 err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
3073 rtm_ipv4_policy, extack);
3074 if (err)
3075 return err;
3076
3077 if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
3078 (tb[RTA_DST] && !rtm->rtm_dst_len)) {
3079 NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4");
3080 return -EINVAL;
3081 }
3082
3083 for (i = 0; i <= RTA_MAX; i++) {
3084 if (!tb[i])
3085 continue;
3086
3087 switch (i) {
3088 case RTA_IIF:
3089 case RTA_OIF:
3090 case RTA_SRC:
3091 case RTA_DST:
3092 case RTA_IP_PROTO:
3093 case RTA_SPORT:
3094 case RTA_DPORT:
3095 case RTA_MARK:
3096 case RTA_UID:
3097 break;
3098 default:
3099 NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request");
3100 return -EINVAL;
3101 }
3102 }
3103
3104 return 0;
3105 }
3106
3107 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
3108 struct netlink_ext_ack *extack)
3109 {
3110 struct net *net = sock_net(in_skb->sk);
3111 struct nlattr *tb[RTA_MAX+1];
3112 u32 table_id = RT_TABLE_MAIN;
3113 __be16 sport = 0, dport = 0;
3114 struct fib_result res = {};
3115 u8 ip_proto = IPPROTO_UDP;
3116 struct rtable *rt = NULL;
3117 struct sk_buff *skb;
3118 struct rtmsg *rtm;
3119 struct flowi4 fl4 = {};
3120 __be32 dst = 0;
3121 __be32 src = 0;
3122 kuid_t uid;
3123 u32 iif;
3124 int err;
3125 int mark;
3126
3127 err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
3128 if (err < 0)
3129 return err;
3130
3131 rtm = nlmsg_data(nlh);
3132 src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
3133 dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
3134 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3135 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3136 if (tb[RTA_UID])
3137 uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
3138 else
3139 uid = (iif ? INVALID_UID : current_uid());
3140
3141 if (tb[RTA_IP_PROTO]) {
3142 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
3143 &ip_proto, AF_INET, extack);
3144 if (err)
3145 return err;
3146 }
3147
3148 if (tb[RTA_SPORT])
3149 sport = nla_get_be16(tb[RTA_SPORT]);
3150
3151 if (tb[RTA_DPORT])
3152 dport = nla_get_be16(tb[RTA_DPORT]);
3153
3154 skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
3155 if (!skb)
3156 return -ENOBUFS;
3157
3158 fl4.daddr = dst;
3159 fl4.saddr = src;
3160 fl4.flowi4_tos = rtm->rtm_tos & IPTOS_RT_MASK;
3161 fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
3162 fl4.flowi4_mark = mark;
3163 fl4.flowi4_uid = uid;
3164 if (sport)
3165 fl4.fl4_sport = sport;
3166 if (dport)
3167 fl4.fl4_dport = dport;
3168 fl4.flowi4_proto = ip_proto;
3169
3170 rcu_read_lock();
3171
3172 if (iif) {
3173 struct net_device *dev;
3174
3175 dev = dev_get_by_index_rcu(net, iif);
3176 if (!dev) {
3177 err = -ENODEV;
3178 goto errout_rcu;
3179 }
3180
3181 fl4.flowi4_iif = iif; /* for rt_fill_info */
3182 skb->dev = dev;
3183 skb->mark = mark;
3184 err = ip_route_input_rcu(skb, dst, src,
3185 rtm->rtm_tos & IPTOS_RT_MASK, dev,
3186 &res);
3187
3188 rt = skb_rtable(skb);
3189 if (err == 0 && rt->dst.error)
3190 err = -rt->dst.error;
3191 } else {
3192 fl4.flowi4_iif = LOOPBACK_IFINDEX;
3193 skb->dev = net->loopback_dev;
3194 rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
3195 err = 0;
3196 if (IS_ERR(rt))
3197 err = PTR_ERR(rt);
3198 else
3199 skb_dst_set(skb, &rt->dst);
3200 }
3201
3202 if (err)
3203 goto errout_rcu;
3204
3205 if (rtm->rtm_flags & RTM_F_NOTIFY)
3206 rt->rt_flags |= RTCF_NOTIFY;
3207
3208 if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
3209 table_id = res.table ? res.table->tb_id : 0;
3210
3211 /* reset skb for netlink reply msg */
3212 skb_trim(skb, 0);
3213 skb_reset_network_header(skb);
3214 skb_reset_transport_header(skb);
3215 skb_reset_mac_header(skb);
3216
3217 if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
3218 if (!res.fi) {
3219 err = fib_props[res.type].error;
3220 if (!err)
3221 err = -EHOSTUNREACH;
3222 goto errout_rcu;
3223 }
3224 err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
3225 nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
3226 rt->rt_type, res.prefix, res.prefixlen,
3227 fl4.flowi4_tos, res.fi, 0);
3228 } else {
3229 err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
3230 NETLINK_CB(in_skb).portid,
3231 nlh->nlmsg_seq, 0);
3232 }
3233 if (err < 0)
3234 goto errout_rcu;
3235
3236 rcu_read_unlock();
3237
3238 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
3239
3240 errout_free:
3241 return err;
3242 errout_rcu:
3243 rcu_read_unlock();
3244 kfree_skb(skb);
3245 goto errout_free;
3246 }
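
/*
 * Editorial note (not in the original source): this handler is what
 * "ip route get <addr>" talks to.  Under the strict validation rules above,
 * a minimal userspace request is an RTM_GETROUTE message whose payload is a
 * struct rtmsg with rtm_family = AF_INET and rtm_dst_len = 32, followed by
 * an RTA_DST attribute carrying the 4-byte destination (optionally
 * RTA_IIF/RTA_OIF/RTA_SRC/RTA_SPORT/RTA_DPORT/RTA_IP_PROTO/RTA_MARK/RTA_UID).
 * The kernel answers with a single RTM_NEWROUTE message built by
 * rt_fill_info() or, with RTM_F_FIB_MATCH, by fib_dump_info().
 */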
3247
3248 void ip_rt_multicast_event(struct in_device *in_dev)
3249 {
3250 rt_cache_flush(dev_net(in_dev->dev));
3251 }
3252
3253 #ifdef CONFIG_SYSCTL
3254 static int ip_rt_gc_interval __read_mostly = 60 * HZ;
3255 static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
3256 static int ip_rt_gc_elasticity __read_mostly = 8;
3257 static int ip_min_valid_pmtu __read_mostly = IPV4_MIN_MTU;
3258
3259 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
3260 void __user *buffer,
3261 size_t *lenp, loff_t *ppos)
3262 {
3263 struct net *net = (struct net *)__ctl->extra1;
3264
3265 if (write) {
3266 rt_cache_flush(net);
3267 fnhe_genid_bump(net);
3268 return 0;
3269 }
3270
3271 return -EINVAL;
3272 }
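
/*
 * Editorial note (not in the original source): the handler above is wired to
 * the write-only "flush" entry registered below, so flushing the cached
 * routes and bumping the fnhe genid from userspace is simply a write to
 * /proc/sys/net/ipv4/route/flush, e.g.:
 *
 *	echo 1 > /proc/sys/net/ipv4/route/flush
 *
 * Reads return -EINVAL, matching the 0200 mode of the table entry.
 */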
3273
3274 static struct ctl_table ipv4_route_table[] = {
3275 {
3276 .procname = "gc_thresh",
3277 .data = &ipv4_dst_ops.gc_thresh,
3278 .maxlen = sizeof(int),
3279 .mode = 0644,
3280 .proc_handler = proc_dointvec,
3281 },
3282 {
3283 .procname = "max_size",
3284 .data = &ip_rt_max_size,
3285 .maxlen = sizeof(int),
3286 .mode = 0644,
3287 .proc_handler = proc_dointvec,
3288 },
3289 {
3290 /* Deprecated. Use gc_min_interval_ms */
3291
3292 .procname = "gc_min_interval",
3293 .data = &ip_rt_gc_min_interval,
3294 .maxlen = sizeof(int),
3295 .mode = 0644,
3296 .proc_handler = proc_dointvec_jiffies,
3297 },
3298 {
3299 .procname = "gc_min_interval_ms",
3300 .data = &ip_rt_gc_min_interval,
3301 .maxlen = sizeof(int),
3302 .mode = 0644,
3303 .proc_handler = proc_dointvec_ms_jiffies,
3304 },
3305 {
3306 .procname = "gc_timeout",
3307 .data = &ip_rt_gc_timeout,
3308 .maxlen = sizeof(int),
3309 .mode = 0644,
3310 .proc_handler = proc_dointvec_jiffies,
3311 },
3312 {
3313 .procname = "gc_interval",
3314 .data = &ip_rt_gc_interval,
3315 .maxlen = sizeof(int),
3316 .mode = 0644,
3317 .proc_handler = proc_dointvec_jiffies,
3318 },
3319 {
3320 .procname = "redirect_load",
3321 .data = &ip_rt_redirect_load,
3322 .maxlen = sizeof(int),
3323 .mode = 0644,
3324 .proc_handler = proc_dointvec,
3325 },
3326 {
3327 .procname = "redirect_number",
3328 .data = &ip_rt_redirect_number,
3329 .maxlen = sizeof(int),
3330 .mode = 0644,
3331 .proc_handler = proc_dointvec,
3332 },
3333 {
3334 .procname = "redirect_silence",
3335 .data = &ip_rt_redirect_silence,
3336 .maxlen = sizeof(int),
3337 .mode = 0644,
3338 .proc_handler = proc_dointvec,
3339 },
3340 {
3341 .procname = "error_cost",
3342 .data = &ip_rt_error_cost,
3343 .maxlen = sizeof(int),
3344 .mode = 0644,
3345 .proc_handler = proc_dointvec,
3346 },
3347 {
3348 .procname = "error_burst",
3349 .data = &ip_rt_error_burst,
3350 .maxlen = sizeof(int),
3351 .mode = 0644,
3352 .proc_handler = proc_dointvec,
3353 },
3354 {
3355 .procname = "gc_elasticity",
3356 .data = &ip_rt_gc_elasticity,
3357 .maxlen = sizeof(int),
3358 .mode = 0644,
3359 .proc_handler = proc_dointvec,
3360 },
3361 {
3362 .procname = "mtu_expires",
3363 .data = &ip_rt_mtu_expires,
3364 .maxlen = sizeof(int),
3365 .mode = 0644,
3366 .proc_handler = proc_dointvec_jiffies,
3367 },
3368 {
3369 .procname = "min_pmtu",
3370 .data = &ip_rt_min_pmtu,
3371 .maxlen = sizeof(int),
3372 .mode = 0644,
3373 .proc_handler = proc_dointvec_minmax,
3374 .extra1 = &ip_min_valid_pmtu,
3375 },
3376 {
3377 .procname = "min_adv_mss",
3378 .data = &ip_rt_min_advmss,
3379 .maxlen = sizeof(int),
3380 .mode = 0644,
3381 .proc_handler = proc_dointvec,
3382 },
3383 { }
3384 };
3385
3386 static const char ipv4_route_flush_procname[] = "flush";
3387
3388 static struct ctl_table ipv4_route_flush_table[] = {
3389 {
3390 .procname = ipv4_route_flush_procname,
3391 .maxlen = sizeof(int),
3392 .mode = 0200,
3393 .proc_handler = ipv4_sysctl_rtcache_flush,
3394 },
3395 { },
3396 };
3397
3398 static __net_init int sysctl_route_net_init(struct net *net)
3399 {
3400 struct ctl_table *tbl;
3401
3402 tbl = ipv4_route_flush_table;
3403 if (!net_eq(net, &init_net)) {
3404 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3405 if (!tbl)
3406 goto err_dup;
3407
3408 /* Don't export non-whitelisted sysctls to unprivileged users */
3409 if (net->user_ns != &init_user_ns) {
3410 if (tbl[0].procname != ipv4_route_flush_procname)
3411 tbl[0].procname = NULL;
3412 }
3413 }
3414 tbl[0].extra1 = net;
3415
3416 net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
3417 if (!net->ipv4.route_hdr)
3418 goto err_reg;
3419 return 0;
3420
3421 err_reg:
3422 if (tbl != ipv4_route_flush_table)
3423 kfree(tbl);
3424 err_dup:
3425 return -ENOMEM;
3426 }
3427
3428 static __net_exit void sysctl_route_net_exit(struct net *net)
3429 {
3430 struct ctl_table *tbl;
3431
3432 tbl = net->ipv4.route_hdr->ctl_table_arg;
3433 unregister_net_sysctl_table(net->ipv4.route_hdr);
3434 BUG_ON(tbl == ipv4_route_flush_table);
3435 kfree(tbl);
3436 }
3437
3438 static __net_initdata struct pernet_operations sysctl_route_ops = {
3439 .init = sysctl_route_net_init,
3440 .exit = sysctl_route_net_exit,
3441 };
3442 #endif
3443
3444 static __net_init int rt_genid_init(struct net *net)
3445 {
3446 atomic_set(&net->ipv4.rt_genid, 0);
3447 atomic_set(&net->fnhe_genid, 0);
3448 atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
3449 return 0;
3450 }
3451
3452 static __net_initdata struct pernet_operations rt_genid_ops = {
3453 .init = rt_genid_init,
3454 };
3455
3456 static int __net_init ipv4_inetpeer_init(struct net *net)
3457 {
3458 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3459
3460 if (!bp)
3461 return -ENOMEM;
3462 inet_peer_base_init(bp);
3463 net->ipv4.peers = bp;
3464 return 0;
3465 }
3466
3467 static void __net_exit ipv4_inetpeer_exit(struct net *net)
3468 {
3469 struct inet_peer_base *bp = net->ipv4.peers;
3470
3471 net->ipv4.peers = NULL;
3472 inetpeer_invalidate_tree(bp);
3473 kfree(bp);
3474 }
3475
3476 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3477 .init = ipv4_inetpeer_init,
3478 .exit = ipv4_inetpeer_exit,
3479 };
3480
3481 #ifdef CONFIG_IP_ROUTE_CLASSID
3482 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3483 #endif /* CONFIG_IP_ROUTE_CLASSID */
3484
3485 int __init ip_rt_init(void)
3486 {
3487 void *idents_hash;
3488 int cpu;
3489
3490 /* For modern hosts, this will use 2 MB of memory */
3491 idents_hash = alloc_large_system_hash("IP idents",
3492 sizeof(*ip_idents) + sizeof(*ip_tstamps),
3493 0,
3494 16, /* one bucket per 64 KB */
3495 HASH_ZERO,
3496 NULL,
3497 &ip_idents_mask,
3498 2048,
3499 256*1024);
3500
3501 ip_idents = idents_hash;
3502
3503 prandom_bytes(ip_idents, (ip_idents_mask + 1) * sizeof(*ip_idents));
3504
3505 ip_tstamps = idents_hash + (ip_idents_mask + 1) * sizeof(*ip_idents);
3506
3507 for_each_possible_cpu(cpu) {
3508 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3509
3510 INIT_LIST_HEAD(&ul->head);
3511 spin_lock_init(&ul->lock);
3512 }
3513 #ifdef CONFIG_IP_ROUTE_CLASSID
3514 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3515 if (!ip_rt_acct)
3516 panic("IP: failed to allocate ip_rt_acct\n");
3517 #endif
3518
3519 ipv4_dst_ops.kmem_cachep =
3520 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3521 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3522
3523 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3524
3525 if (dst_entries_init(&ipv4_dst_ops) < 0)
3526 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3527
3528 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3529 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3530
3531 ipv4_dst_ops.gc_thresh = ~0;
3532 ip_rt_max_size = INT_MAX;
3533
3534 devinet_init();
3535 ip_fib_init();
3536
3537 if (ip_rt_proc_init())
3538 pr_err("Unable to create route proc files\n");
3539 #ifdef CONFIG_XFRM
3540 xfrm_init();
3541 xfrm4_init();
3542 #endif
3543 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
3544 RTNL_FLAG_DOIT_UNLOCKED);
3545
3546 #ifdef CONFIG_SYSCTL
3547 register_pernet_subsys(&sysctl_route_ops);
3548 #endif
3549 register_pernet_subsys(&rt_genid_ops);
3550 register_pernet_subsys(&ipv4_inetpeer_ops);
3551 return 0;
3552 }
3553
3554 #ifdef CONFIG_SYSCTL
3555 /*
3556 * We really need to sanitize the damn ipv4 init order, then all
3557 * this nonsense will go away.
3558 */
3559 void __init ip_static_sysctl_init(void)
3560 {
3561 register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3562 }
3563 #endif
3564