1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
6 *
7 * ROUTE - implementation of the IP router.
8 *
9 * Authors: Ross Biro
10 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
11 * Alan Cox, <gw4pts@gw4pts.ampr.org>
12 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
13 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
14 *
15 * Fixes:
16 * Alan Cox : Verify area fixes.
17 * Alan Cox : cli() protects routing changes
18 * Rui Oliveira : ICMP routing table updates
19 * (rco@di.uminho.pt) Routing table insertion and update
20 * Linus Torvalds : Rewrote bits to be sensible
21 * Alan Cox : Added BSD route gw semantics
22 * Alan Cox : Super /proc >4K
23 * Alan Cox : MTU in route table
24 * Alan Cox : MSS actually. Also added the window
25 * clamper.
26 * Sam Lantinga : Fixed route matching in rt_del()
27 * Alan Cox : Routing cache support.
28 * Alan Cox : Removed compatibility cruft.
29 * Alan Cox : RTF_REJECT support.
30 * Alan Cox : TCP irtt support.
31 * Jonathan Naylor : Added Metric support.
32 * Miquel van Smoorenburg : BSD API fixes.
33 * Miquel van Smoorenburg : Metrics.
34 * Alan Cox : Use __u32 properly
35 * Alan Cox : Aligned routing errors more closely with BSD
36 * our system is still very different.
37 * Alan Cox : Faster /proc handling
38 * Alexey Kuznetsov : Massive rework to support tree based routing,
39 * routing caches and better behaviour.
40 *
41 * Olaf Erb : irtt wasn't being copied right.
42 * Bjorn Ekwall : Kerneld route support.
43 * Alan Cox : Multicast fixed (I hope)
44 * Pavel Krauz : Limited broadcast fixed
45 * Mike McLagan : Routing by source
46 * Alexey Kuznetsov : End of old history. Split to fib.c and
47 * route.c and rewritten from scratch.
48 * Andi Kleen : Load-limit warning messages.
49 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
50 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
51 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
52 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
53 * Marc Boucher : routing by fwmark
54 * Robert Olsson : Added rt_cache statistics
55 * Arnaldo C. Melo : Convert proc stuff to seq_file
56 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
57 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
58 * Ilia Sotnikov : Removed TOS from hash calculations
59 */
60
61 #define pr_fmt(fmt) "IPv4: " fmt
62
63 #include <linux/module.h>
64 #include <linux/uaccess.h>
65 #include <linux/bitops.h>
66 #include <linux/types.h>
67 #include <linux/kernel.h>
68 #include <linux/mm.h>
69 #include <linux/string.h>
70 #include <linux/socket.h>
71 #include <linux/sockios.h>
72 #include <linux/errno.h>
73 #include <linux/in.h>
74 #include <linux/inet.h>
75 #include <linux/netdevice.h>
76 #include <linux/proc_fs.h>
77 #include <linux/init.h>
78 #include <linux/skbuff.h>
79 #include <linux/inetdevice.h>
80 #include <linux/igmp.h>
81 #include <linux/pkt_sched.h>
82 #include <linux/mroute.h>
83 #include <linux/netfilter_ipv4.h>
84 #include <linux/random.h>
85 #include <linux/rcupdate.h>
86 #include <linux/times.h>
87 #include <linux/slab.h>
88 #include <linux/jhash.h>
89 #include <net/dst.h>
90 #include <net/dst_metadata.h>
91 #include <net/net_namespace.h>
92 #include <net/protocol.h>
93 #include <net/ip.h>
94 #include <net/route.h>
95 #include <net/inetpeer.h>
96 #include <net/sock.h>
97 #include <net/ip_fib.h>
98 #include <net/nexthop.h>
99 #include <net/arp.h>
100 #include <net/tcp.h>
101 #include <net/icmp.h>
102 #include <net/xfrm.h>
103 #include <net/lwtunnel.h>
104 #include <net/netevent.h>
105 #include <net/rtnetlink.h>
106 #ifdef CONFIG_SYSCTL
107 #include <linux/sysctl.h>
108 #endif
109 #include <net/secure_seq.h>
110 #include <net/ip_tunnels.h>
111 #include <net/l3mdev.h>
112
113 #include "fib_lookup.h"
114
115 #define RT_FL_TOS(oldflp4) \
116 ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
117
118 #define RT_GC_TIMEOUT (300*HZ)
119
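/* Default values for the routing tunables below: redirect rate limiting
 * (number / load / silence), ICMP error rate limiting (cost / burst),
 * learned-PMTU lifetime and floor, and the minimum advertised MSS.
 * These are presumably registered as sysctls elsewhere in this file.
 */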
120 static int ip_rt_max_size;
121 static int ip_rt_redirect_number __read_mostly = 9;
122 static int ip_rt_redirect_load __read_mostly = HZ / 50;
123 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
124 static int ip_rt_error_cost __read_mostly = HZ;
125 static int ip_rt_error_burst __read_mostly = 5 * HZ;
126 static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
127 static u32 ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
128 static int ip_rt_min_advmss __read_mostly = 256;
129
130 static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
131
132 /*
133 * Interface to generic destination cache.
134 */
135
136 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
137 static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
138 static unsigned int ipv4_mtu(const struct dst_entry *dst);
139 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
140 static void ipv4_link_failure(struct sk_buff *skb);
141 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
142 struct sk_buff *skb, u32 mtu,
143 bool confirm_neigh);
144 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk,
145 struct sk_buff *skb);
146 static void ipv4_dst_destroy(struct dst_entry *dst);
147
148 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
149 {
150 WARN_ON(1);
151 return NULL;
152 }
153
154 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
155 struct sk_buff *skb,
156 const void *daddr);
157 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
158
159 static struct dst_ops ipv4_dst_ops = {
160 .family = AF_INET,
161 .check = ipv4_dst_check,
162 .default_advmss = ipv4_default_advmss,
163 .mtu = ipv4_mtu,
164 .cow_metrics = ipv4_cow_metrics,
165 .destroy = ipv4_dst_destroy,
166 .negative_advice = ipv4_negative_advice,
167 .link_failure = ipv4_link_failure,
168 .update_pmtu = ip_rt_update_pmtu,
169 .redirect = ip_do_redirect,
170 .local_out = __ip_local_out,
171 .neigh_lookup = ipv4_neigh_lookup,
172 .confirm_neigh = ipv4_confirm_neigh,
173 };
174
175 #define ECN_OR_COST(class) TC_PRIO_##class
176
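/* Map the 4-bit IPv4 TOS field (RFC 1349) to a queueing priority.  The
 * table is indexed by the TOS bits shifted right by one (typically
 * consumed via rt_tos2priority()); with ECN_OR_COST() defined as above,
 * the "minimize monetary cost" variants share the priority of their
 * base class.
 */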
177 const __u8 ip_tos2prio[16] = {
178 TC_PRIO_BESTEFFORT,
179 ECN_OR_COST(BESTEFFORT),
180 TC_PRIO_BESTEFFORT,
181 ECN_OR_COST(BESTEFFORT),
182 TC_PRIO_BULK,
183 ECN_OR_COST(BULK),
184 TC_PRIO_BULK,
185 ECN_OR_COST(BULK),
186 TC_PRIO_INTERACTIVE,
187 ECN_OR_COST(INTERACTIVE),
188 TC_PRIO_INTERACTIVE,
189 ECN_OR_COST(INTERACTIVE),
190 TC_PRIO_INTERACTIVE_BULK,
191 ECN_OR_COST(INTERACTIVE_BULK),
192 TC_PRIO_INTERACTIVE_BULK,
193 ECN_OR_COST(INTERACTIVE_BULK)
194 };
195 EXPORT_SYMBOL(ip_tos2prio);
196
197 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
198 #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
199
200 #ifdef CONFIG_PROC_FS
201 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
202 {
203 if (*pos)
204 return NULL;
205 return SEQ_START_TOKEN;
206 }
207
208 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
209 {
210 ++*pos;
211 return NULL;
212 }
213
214 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
215 {
216 }
217
218 static int rt_cache_seq_show(struct seq_file *seq, void *v)
219 {
220 if (v == SEQ_START_TOKEN)
221 seq_printf(seq, "%-127s\n",
222 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
223 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
224 "HHUptod\tSpecDst");
225 return 0;
226 }
227
228 static const struct seq_operations rt_cache_seq_ops = {
229 .start = rt_cache_seq_start,
230 .next = rt_cache_seq_next,
231 .stop = rt_cache_seq_stop,
232 .show = rt_cache_seq_show,
233 };
234
235 static int rt_cache_seq_open(struct inode *inode, struct file *file)
236 {
237 return seq_open(file, &rt_cache_seq_ops);
238 }
239
240 static const struct file_operations rt_cache_seq_fops = {
241 .open = rt_cache_seq_open,
242 .read = seq_read,
243 .llseek = seq_lseek,
244 .release = seq_release,
245 };
246
247
248 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
249 {
250 int cpu;
251
252 if (*pos == 0)
253 return SEQ_START_TOKEN;
254
255 for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
256 if (!cpu_possible(cpu))
257 continue;
258 *pos = cpu+1;
259 return &per_cpu(rt_cache_stat, cpu);
260 }
261 return NULL;
262 }
263
264 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
265 {
266 int cpu;
267
268 for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
269 if (!cpu_possible(cpu))
270 continue;
271 *pos = cpu+1;
272 return &per_cpu(rt_cache_stat, cpu);
273 }
274 return NULL;
275
276 }
277
278 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
279 {
280
281 }
282
283 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
284 {
285 struct rt_cache_stat *st = v;
286
287 if (v == SEQ_START_TOKEN) {
288 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
289 return 0;
290 }
291
292 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
293 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
294 dst_entries_get_slow(&ipv4_dst_ops),
295 0, /* st->in_hit */
296 st->in_slow_tot,
297 st->in_slow_mc,
298 st->in_no_route,
299 st->in_brd,
300 st->in_martian_dst,
301 st->in_martian_src,
302
303 0, /* st->out_hit */
304 st->out_slow_tot,
305 st->out_slow_mc,
306
307 0, /* st->gc_total */
308 0, /* st->gc_ignored */
309 0, /* st->gc_goal_miss */
310 0, /* st->gc_dst_overflow */
311 0, /* st->in_hlist_search */
312 0 /* st->out_hlist_search */
313 );
314 return 0;
315 }
316
317 static const struct seq_operations rt_cpu_seq_ops = {
318 .start = rt_cpu_seq_start,
319 .next = rt_cpu_seq_next,
320 .stop = rt_cpu_seq_stop,
321 .show = rt_cpu_seq_show,
322 };
323
324
325 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
326 {
327 return seq_open(file, &rt_cpu_seq_ops);
328 }
329
330 static const struct file_operations rt_cpu_seq_fops = {
331 .open = rt_cpu_seq_open,
332 .read = seq_read,
333 .llseek = seq_lseek,
334 .release = seq_release,
335 };
336
337 #ifdef CONFIG_IP_ROUTE_CLASSID
338 static int rt_acct_proc_show(struct seq_file *m, void *v)
339 {
340 struct ip_rt_acct *dst, *src;
341 unsigned int i, j;
342
343 dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
344 if (!dst)
345 return -ENOMEM;
346
347 for_each_possible_cpu(i) {
348 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
349 for (j = 0; j < 256; j++) {
350 dst[j].o_bytes += src[j].o_bytes;
351 dst[j].o_packets += src[j].o_packets;
352 dst[j].i_bytes += src[j].i_bytes;
353 dst[j].i_packets += src[j].i_packets;
354 }
355 }
356
357 seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
358 kfree(dst);
359 return 0;
360 }
361 #endif
362
363 static int __net_init ip_rt_do_proc_init(struct net *net)
364 {
365 struct proc_dir_entry *pde;
366
367 pde = proc_create("rt_cache", 0444, net->proc_net,
368 &rt_cache_seq_fops);
369 if (!pde)
370 goto err1;
371
372 pde = proc_create("rt_cache", 0444,
373 net->proc_net_stat, &rt_cpu_seq_fops);
374 if (!pde)
375 goto err2;
376
377 #ifdef CONFIG_IP_ROUTE_CLASSID
378 pde = proc_create_single("rt_acct", 0, net->proc_net,
379 rt_acct_proc_show);
380 if (!pde)
381 goto err3;
382 #endif
383 return 0;
384
385 #ifdef CONFIG_IP_ROUTE_CLASSID
386 err3:
387 remove_proc_entry("rt_cache", net->proc_net_stat);
388 #endif
389 err2:
390 remove_proc_entry("rt_cache", net->proc_net);
391 err1:
392 return -ENOMEM;
393 }
394
395 static void __net_exit ip_rt_do_proc_exit(struct net *net)
396 {
397 remove_proc_entry("rt_cache", net->proc_net_stat);
398 remove_proc_entry("rt_cache", net->proc_net);
399 #ifdef CONFIG_IP_ROUTE_CLASSID
400 remove_proc_entry("rt_acct", net->proc_net);
401 #endif
402 }
403
404 static struct pernet_operations ip_rt_proc_ops __net_initdata = {
405 .init = ip_rt_do_proc_init,
406 .exit = ip_rt_do_proc_exit,
407 };
408
409 static int __init ip_rt_proc_init(void)
410 {
411 return register_pernet_subsys(&ip_rt_proc_ops);
412 }
413
414 #else
415 static inline int ip_rt_proc_init(void)
416 {
417 return 0;
418 }
419 #endif /* CONFIG_PROC_FS */
420
421 static inline bool rt_is_expired(const struct rtable *rth)
422 {
423 return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
424 }
425
426 void rt_cache_flush(struct net *net)
427 {
428 rt_genid_bump_ipv4(net);
429 }
430
431 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
432 struct sk_buff *skb,
433 const void *daddr)
434 {
435 const struct rtable *rt = container_of(dst, struct rtable, dst);
436 struct net_device *dev = dst->dev;
437 struct neighbour *n;
438
439 rcu_read_lock_bh();
440
441 if (likely(rt->rt_gw_family == AF_INET)) {
442 n = ip_neigh_gw4(dev, rt->rt_gw4);
443 } else if (rt->rt_gw_family == AF_INET6) {
444 n = ip_neigh_gw6(dev, &rt->rt_gw6);
445 } else {
446 __be32 pkey;
447
448 pkey = skb ? ip_hdr(skb)->daddr : *((__be32 *) daddr);
449 n = ip_neigh_gw4(dev, pkey);
450 }
451
452 if (!IS_ERR(n) && !refcount_inc_not_zero(&n->refcnt))
453 n = NULL;
454
455 rcu_read_unlock_bh();
456
457 return n;
458 }
459
460 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
461 {
462 const struct rtable *rt = container_of(dst, struct rtable, dst);
463 struct net_device *dev = dst->dev;
464 const __be32 *pkey = daddr;
465
466 if (rt->rt_gw_family == AF_INET) {
467 pkey = (const __be32 *)&rt->rt_gw4;
468 } else if (rt->rt_gw_family == AF_INET6) {
469 return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6);
470 } else if (!daddr ||
471 (rt->rt_flags &
472 (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) {
473 return;
474 }
475 __ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
476 }
477
478 #define IP_IDENTS_SZ 2048u
479
480 static atomic_t *ip_idents __read_mostly;
481 static u32 *ip_tstamps __read_mostly;
482
483 /* In order to protect privacy, we add a perturbation to identifiers
484 * if one generator is seldom used. This makes it hard for an attacker
485 * to infer how many packets were sent between two points in time.
486 */
487 u32 ip_idents_reserve(u32 hash, int segs)
488 {
489 u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
490 atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
491 u32 old = READ_ONCE(*p_tstamp);
492 u32 now = (u32)jiffies;
493 u32 new, delta = 0;
494
495 if (old != now && cmpxchg(p_tstamp, old, now) == old)
496 delta = prandom_u32_max(now - old);
497
498 /* Do not use atomic_add_return() as it makes UBSAN unhappy */
499 do {
500 old = (u32)atomic_read(p_id);
501 new = old + delta + segs;
502 } while (atomic_cmpxchg(p_id, old, new) != old);
503
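	/* Return the first identifier of the block of 'segs' IDs just reserved. */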
504 return new - segs;
505 }
506 EXPORT_SYMBOL(ip_idents_reserve);
507
508 void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
509 {
510 u32 hash, id;
511
512 /* Note: the following initialization is racy (the key may be written
513a * concurrently or read while half-initialized), but this is okay: the IDs
513b * only need an unpredictable key, not a perfectly consistent one. */
513 if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
514 get_random_bytes(&net->ipv4.ip_id_key,
515 sizeof(net->ipv4.ip_id_key));
516
517 hash = siphash_3u32((__force u32)iph->daddr,
518 (__force u32)iph->saddr,
519 iph->protocol,
520 &net->ipv4.ip_id_key);
521 id = ip_idents_reserve(hash, segs);
522 iph->id = htons(id);
523 }
524 EXPORT_SYMBOL(__ip_select_ident);
525
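/* Build an output flow key from the given IP header; when a socket is
 * supplied, its bound device, mark, TOS-derived flags and protocol take
 * precedence over the caller-provided values.
 */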
526 static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
527 const struct sock *sk,
528 const struct iphdr *iph,
529 int oif, u8 tos,
530 u8 prot, u32 mark, int flow_flags)
531 {
532 if (sk) {
533 const struct inet_sock *inet = inet_sk(sk);
534
535 oif = sk->sk_bound_dev_if;
536 mark = sk->sk_mark;
537 tos = RT_CONN_FLAGS(sk);
538 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
539 }
540 flowi4_init_output(fl4, oif, mark, tos,
541 RT_SCOPE_UNIVERSE, prot,
542 flow_flags,
543 iph->daddr, iph->saddr, 0, 0,
544 sock_net_uid(net, sk));
545 }
546
547 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
548 const struct sock *sk)
549 {
550 const struct net *net = dev_net(skb->dev);
551 const struct iphdr *iph = ip_hdr(skb);
552 int oif = skb->dev->ifindex;
553 u8 tos = RT_TOS(iph->tos);
554 u8 prot = iph->protocol;
555 u32 mark = skb->mark;
556
557 __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
558 }
559
560 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
561 {
562 const struct inet_sock *inet = inet_sk(sk);
563 const struct ip_options_rcu *inet_opt;
564 __be32 daddr = inet->inet_daddr;
565
566 rcu_read_lock();
567 inet_opt = rcu_dereference(inet->inet_opt);
568 if (inet_opt && inet_opt->opt.srr)
569 daddr = inet_opt->opt.faddr;
570 flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
571 RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
572 inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
573 inet_sk_flowi_flags(sk),
574 daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
575 rcu_read_unlock();
576 }
577
578 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
579 const struct sk_buff *skb)
580 {
581 if (skb)
582 build_skb_flow_key(fl4, skb, sk);
583 else
584 build_sk_flow_key(fl4, sk);
585 }
586
587 static DEFINE_SPINLOCK(fnhe_lock);
588
589 static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
590 {
591 struct rtable *rt;
592
593 rt = rcu_dereference(fnhe->fnhe_rth_input);
594 if (rt) {
595 RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
596 dst_dev_put(&rt->dst);
597 dst_release(&rt->dst);
598 }
599 rt = rcu_dereference(fnhe->fnhe_rth_output);
600 if (rt) {
601 RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
602 dst_dev_put(&rt->dst);
603 dst_release(&rt->dst);
604 }
605 }
606
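/* Pick the least recently stamped exception in this bucket and flush its
 * cached routes so the entry can be recycled by the caller.
 */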
607 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
608 {
609 struct fib_nh_exception *fnhe, *oldest;
610
611 oldest = rcu_dereference(hash->chain);
612 for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
613 fnhe = rcu_dereference(fnhe->fnhe_next)) {
614 if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
615 oldest = fnhe;
616 }
617 fnhe_flush_routes(oldest);
618 return oldest;
619 }
620
621 static inline u32 fnhe_hashfun(__be32 daddr)
622 {
623 static u32 fnhe_hashrnd __read_mostly;
624 u32 hval;
625
626 net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
627 hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
628 return hash_32(hval, FNHE_HASH_SHIFT);
629 }
630
631 static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
632 {
633 rt->rt_pmtu = fnhe->fnhe_pmtu;
634 rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
635 rt->dst.expires = fnhe->fnhe_expires;
636
637 if (fnhe->fnhe_gw) {
638 rt->rt_flags |= RTCF_REDIRECTED;
639 rt->rt_uses_gateway = 1;
640 rt->rt_gw_family = AF_INET;
641 rt->rt_gw4 = fnhe->fnhe_gw;
642 }
643 }
644
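/* Record or refresh a next-hop exception for daddr (gateway learned from a
 * redirect and/or a path MTU), allocating the per-nexthop exception hash on
 * first use and recycling the oldest entry when a chain exceeds
 * FNHE_RECLAIM_DEPTH.  For newly created exceptions the nexthop's cached
 * routes are marked obsolete so users re-check whether the exception
 * applies to them.
 */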
645 static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr,
646 __be32 gw, u32 pmtu, bool lock,
647 unsigned long expires)
648 {
649 struct fnhe_hash_bucket *hash;
650 struct fib_nh_exception *fnhe;
651 struct rtable *rt;
652 u32 genid, hval;
653 unsigned int i;
654 int depth;
655
656 genid = fnhe_genid(dev_net(nhc->nhc_dev));
657 hval = fnhe_hashfun(daddr);
658
659 spin_lock_bh(&fnhe_lock);
660
661 hash = rcu_dereference(nhc->nhc_exceptions);
662 if (!hash) {
663 hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
664 if (!hash)
665 goto out_unlock;
666 rcu_assign_pointer(nhc->nhc_exceptions, hash);
667 }
668
669 hash += hval;
670
671 depth = 0;
672 for (fnhe = rcu_dereference(hash->chain); fnhe;
673 fnhe = rcu_dereference(fnhe->fnhe_next)) {
674 if (fnhe->fnhe_daddr == daddr)
675 break;
676 depth++;
677 }
678
679 if (fnhe) {
680 if (fnhe->fnhe_genid != genid)
681 fnhe->fnhe_genid = genid;
682 if (gw)
683 fnhe->fnhe_gw = gw;
684 if (pmtu) {
685 fnhe->fnhe_pmtu = pmtu;
686 fnhe->fnhe_mtu_locked = lock;
687 }
688 fnhe->fnhe_expires = max(1UL, expires);
689 /* Update all cached dsts too */
690 rt = rcu_dereference(fnhe->fnhe_rth_input);
691 if (rt)
692 fill_route_from_fnhe(rt, fnhe);
693 rt = rcu_dereference(fnhe->fnhe_rth_output);
694 if (rt)
695 fill_route_from_fnhe(rt, fnhe);
696 } else {
697 if (depth > FNHE_RECLAIM_DEPTH)
698 fnhe = fnhe_oldest(hash);
699 else {
700 fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
701 if (!fnhe)
702 goto out_unlock;
703
704 fnhe->fnhe_next = hash->chain;
705 rcu_assign_pointer(hash->chain, fnhe);
706 }
707 fnhe->fnhe_genid = genid;
708 fnhe->fnhe_daddr = daddr;
709 fnhe->fnhe_gw = gw;
710 fnhe->fnhe_pmtu = pmtu;
711 fnhe->fnhe_mtu_locked = lock;
712 fnhe->fnhe_expires = max(1UL, expires);
713
714 /* Exception created; mark the cached routes for the nexthop
715 * stale, so anyone caching it rechecks if this exception
716 * applies to them.
717 */
718 rt = rcu_dereference(nhc->nhc_rth_input);
719 if (rt)
720 rt->dst.obsolete = DST_OBSOLETE_KILL;
721
722 for_each_possible_cpu(i) {
723 struct rtable __rcu **prt;
724 prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i);
725 rt = rcu_dereference(*prt);
726 if (rt)
727 rt->dst.obsolete = DST_OBSOLETE_KILL;
728 }
729 }
730
731 fnhe->fnhe_stamp = jiffies;
732
733 out_unlock:
734 spin_unlock_bh(&fnhe_lock);
735 }
736
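/* Handle an ICMP redirect: sanity-check the advertised gateway, resolve it
 * as a neighbour, and if everything checks out record it as a next-hop
 * exception (and optionally kill the current cached route).
 */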
737 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
738 bool kill_route)
739 {
740 __be32 new_gw = icmp_hdr(skb)->un.gateway;
741 __be32 old_gw = ip_hdr(skb)->saddr;
742 struct net_device *dev = skb->dev;
743 struct in_device *in_dev;
744 struct fib_result res;
745 struct neighbour *n;
746 struct net *net;
747
748 switch (icmp_hdr(skb)->code & 7) {
749 case ICMP_REDIR_NET:
750 case ICMP_REDIR_NETTOS:
751 case ICMP_REDIR_HOST:
752 case ICMP_REDIR_HOSTTOS:
753 break;
754
755 default:
756 return;
757 }
758
759 if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw)
760 return;
761
762 in_dev = __in_dev_get_rcu(dev);
763 if (!in_dev)
764 return;
765
766 net = dev_net(dev);
767 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
768 ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
769 ipv4_is_zeronet(new_gw))
770 goto reject_redirect;
771
772 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
773 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
774 goto reject_redirect;
775 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
776 goto reject_redirect;
777 } else {
778 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
779 goto reject_redirect;
780 }
781
782 n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
783 if (!n)
784 n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
785 if (!IS_ERR(n)) {
786 if (!(n->nud_state & NUD_VALID)) {
787 neigh_event_send(n, NULL);
788 } else {
789 if (fib_lookup(net, fl4, &res, 0) == 0) {
790 struct fib_nh_common *nhc = FIB_RES_NHC(res);
791
792 update_or_create_fnhe(nhc, fl4->daddr, new_gw,
793 0, false,
794 jiffies + ip_rt_gc_timeout);
795 }
796 if (kill_route)
797 rt->dst.obsolete = DST_OBSOLETE_KILL;
798 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
799 }
800 neigh_release(n);
801 }
802 return;
803
804 reject_redirect:
805 #ifdef CONFIG_IP_ROUTE_VERBOSE
806 if (IN_DEV_LOG_MARTIANS(in_dev)) {
807 const struct iphdr *iph = (const struct iphdr *) skb->data;
808 __be32 daddr = iph->daddr;
809 __be32 saddr = iph->saddr;
810
811 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
812 " Advised path = %pI4 -> %pI4\n",
813 &old_gw, dev->name, &new_gw,
814 &saddr, &daddr);
815 }
816 #endif
817 ;
818 }
819
820 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
821 {
822 struct rtable *rt;
823 struct flowi4 fl4;
824 const struct iphdr *iph = (const struct iphdr *) skb->data;
825 struct net *net = dev_net(skb->dev);
826 int oif = skb->dev->ifindex;
827 u8 tos = RT_TOS(iph->tos);
828 u8 prot = iph->protocol;
829 u32 mark = skb->mark;
830
831 rt = (struct rtable *) dst;
832
833 __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
834 __ip_do_redirect(rt, skb, &fl4, true);
835 }
836
837 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
838 {
839 struct rtable *rt = (struct rtable *)dst;
840 struct dst_entry *ret = dst;
841
842 if (rt) {
843 if (dst->obsolete > 0) {
844 ip_rt_put(rt);
845 ret = NULL;
846 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
847 rt->dst.expires) {
848 ip_rt_put(rt);
849 ret = NULL;
850 }
851 }
852 return ret;
853 }
854
855 /*
856 * Algorithm:
857 * 1. The first ip_rt_redirect_number redirects are sent
858 * with exponential backoff, then we stop sending them at all,
859 * assuming that the host ignores our redirects.
860 * 2. If we did not see packets requiring redirects
861 * during ip_rt_redirect_silence, we assume that the host
862 * forgot the redirected route and start sending redirects again.
863 *
864 * This algorithm is much cheaper and more intelligent than dumb load limiting
865 * in icmp.c.
866 *
867 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
868 * and "frag. need" (breaks PMTU discovery) in icmp.c.
869 */
870
871 void ip_rt_send_redirect(struct sk_buff *skb)
872 {
873 struct rtable *rt = skb_rtable(skb);
874 struct in_device *in_dev;
875 struct inet_peer *peer;
876 struct net *net;
877 int log_martians;
878 int vif;
879
880 rcu_read_lock();
881 in_dev = __in_dev_get_rcu(rt->dst.dev);
882 if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
883 rcu_read_unlock();
884 return;
885 }
886 log_martians = IN_DEV_LOG_MARTIANS(in_dev);
887 vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
888 rcu_read_unlock();
889
890 net = dev_net(rt->dst.dev);
891 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
892 if (!peer) {
893 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
894 rt_nexthop(rt, ip_hdr(skb)->daddr));
895 return;
896 }
897
898 /* No redirected packets during ip_rt_redirect_silence;
899 * reset the algorithm.
900 */
901 if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
902 peer->rate_tokens = 0;
903 peer->n_redirects = 0;
904 }
905
906 /* Too many ignored redirects; do not send anything,
907 * just set peer->rate_last to the time of the last seen redirect-worthy packet.
908 */
909 if (peer->n_redirects >= ip_rt_redirect_number) {
910 peer->rate_last = jiffies;
911 goto out_put_peer;
912 }
913
914 /* Check for load limit; set rate_last to the latest sent
915 * redirect.
916 */
917 if (peer->rate_tokens == 0 ||
918 time_after(jiffies,
919 (peer->rate_last +
920 (ip_rt_redirect_load << peer->n_redirects)))) {
921 __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
922
923 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
924 peer->rate_last = jiffies;
925 ++peer->n_redirects;
926 #ifdef CONFIG_IP_ROUTE_VERBOSE
927 if (log_martians &&
928 peer->n_redirects == ip_rt_redirect_number)
929 net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
930 &ip_hdr(skb)->saddr, inet_iif(skb),
931 &ip_hdr(skb)->daddr, &gw);
932 #endif
933 }
934 out_put_peer:
935 inet_putpeer(peer);
936 }
937
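/* Input path error handler: map dst->error onto the corresponding ICMP
 * destination-unreachable code and send it, rate limited through the
 * inet_peer token bucket, then drop the packet.
 */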
938 static int ip_error(struct sk_buff *skb)
939 {
940 struct rtable *rt = skb_rtable(skb);
941 struct net_device *dev = skb->dev;
942 struct in_device *in_dev;
943 struct inet_peer *peer;
944 unsigned long now;
945 struct net *net;
946 bool send;
947 int code;
948
949 if (netif_is_l3_master(skb->dev)) {
950 dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
951 if (!dev)
952 goto out;
953 }
954
955 in_dev = __in_dev_get_rcu(dev);
956
957 /* IP on this device is disabled. */
958 if (!in_dev)
959 goto out;
960
961 net = dev_net(rt->dst.dev);
962 if (!IN_DEV_FORWARD(in_dev)) {
963 switch (rt->dst.error) {
964 case EHOSTUNREACH:
965 __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
966 break;
967
968 case ENETUNREACH:
969 __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
970 break;
971 }
972 goto out;
973 }
974
975 switch (rt->dst.error) {
976 case EINVAL:
977 default:
978 goto out;
979 case EHOSTUNREACH:
980 code = ICMP_HOST_UNREACH;
981 break;
982 case ENETUNREACH:
983 code = ICMP_NET_UNREACH;
984 __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
985 break;
986 case EACCES:
987 code = ICMP_PKT_FILTERED;
988 break;
989 }
990
991 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
992 l3mdev_master_ifindex(skb->dev), 1);
993
994 send = true;
995 if (peer) {
996 now = jiffies;
997 peer->rate_tokens += now - peer->rate_last;
998 if (peer->rate_tokens > ip_rt_error_burst)
999 peer->rate_tokens = ip_rt_error_burst;
1000 peer->rate_last = now;
1001 if (peer->rate_tokens >= ip_rt_error_cost)
1002 peer->rate_tokens -= ip_rt_error_cost;
1003 else
1004 send = false;
1005 inet_putpeer(peer);
1006 }
1007 if (send)
1008 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1009
1010 out: kfree_skb(skb);
1011 return 0;
1012 }
1013
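/* Lower the path MTU for this flow.  Values below ip_rt_min_pmtu are clamped
 * and lock the MTU instead; the result is stored as a next-hop exception with
 * an ip_rt_mtu_expires lifetime.
 */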
1014 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1015 {
1016 struct dst_entry *dst = &rt->dst;
1017 u32 old_mtu = ipv4_mtu(dst);
1018 struct fib_result res;
1019 bool lock = false;
1020
1021 if (ip_mtu_locked(dst))
1022 return;
1023
1024 if (old_mtu < mtu)
1025 return;
1026
1027 if (mtu < ip_rt_min_pmtu) {
1028 lock = true;
1029 mtu = min(old_mtu, ip_rt_min_pmtu);
1030 }
1031
1032 if (rt->rt_pmtu == mtu && !lock &&
1033 time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
1034 return;
1035
1036 rcu_read_lock();
1037 if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
1038 struct fib_nh_common *nhc = FIB_RES_NHC(res);
1039
1040 update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
1041 jiffies + ip_rt_mtu_expires);
1042 }
1043 rcu_read_unlock();
1044 }
1045
1046 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1047 struct sk_buff *skb, u32 mtu,
1048 bool confirm_neigh)
1049 {
1050 struct rtable *rt = (struct rtable *) dst;
1051 struct flowi4 fl4;
1052
1053 ip_rt_build_flow_key(&fl4, sk, skb);
1054 __ip_rt_update_pmtu(rt, &fl4, mtu);
1055 }
1056
1057 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1058 int oif, u8 protocol)
1059 {
1060 const struct iphdr *iph = (const struct iphdr *) skb->data;
1061 struct flowi4 fl4;
1062 struct rtable *rt;
1063 u32 mark = IP4_REPLY_MARK(net, skb->mark);
1064
1065 __build_flow_key(net, &fl4, NULL, iph, oif,
1066 RT_TOS(iph->tos), protocol, mark, 0);
1067 rt = __ip_route_output_key(net, &fl4);
1068 if (!IS_ERR(rt)) {
1069 __ip_rt_update_pmtu(rt, &fl4, mtu);
1070 ip_rt_put(rt);
1071 }
1072 }
1073 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1074
1075 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1076 {
1077 const struct iphdr *iph = (const struct iphdr *) skb->data;
1078 struct flowi4 fl4;
1079 struct rtable *rt;
1080
1081 __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1082
1083 if (!fl4.flowi4_mark)
1084 fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1085
1086 rt = __ip_route_output_key(sock_net(sk), &fl4);
1087 if (!IS_ERR(rt)) {
1088 __ip_rt_update_pmtu(rt, &fl4, mtu);
1089 ip_rt_put(rt);
1090 }
1091 }
1092
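/* Socket variant of the PMTU update: if the socket is owned by the user or
 * has no cached route, fall back to a plain route lookup; otherwise update
 * the cached dst in place, re-resolving the route (and resetting the socket's
 * dst) if the update invalidated it.
 */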
1093 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1094 {
1095 const struct iphdr *iph = (const struct iphdr *) skb->data;
1096 struct flowi4 fl4;
1097 struct rtable *rt;
1098 struct dst_entry *odst = NULL;
1099 bool new = false;
1100 struct net *net = sock_net(sk);
1101
1102 bh_lock_sock(sk);
1103
1104 if (!ip_sk_accept_pmtu(sk))
1105 goto out;
1106
1107 odst = sk_dst_get(sk);
1108
1109 if (sock_owned_by_user(sk) || !odst) {
1110 __ipv4_sk_update_pmtu(skb, sk, mtu);
1111 goto out;
1112 }
1113
1114 __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1115
1116 rt = (struct rtable *)odst;
1117 if (odst->obsolete && !odst->ops->check(odst, 0)) {
1118 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1119 if (IS_ERR(rt))
1120 goto out;
1121
1122 new = true;
1123 }
1124
1125 __ip_rt_update_pmtu((struct rtable *) xfrm_dst_path(&rt->dst), &fl4, mtu);
1126
1127 if (!dst_check(&rt->dst, 0)) {
1128 if (new)
1129 dst_release(&rt->dst);
1130
1131 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1132 if (IS_ERR(rt))
1133 goto out;
1134
1135 new = true;
1136 }
1137
1138 if (new)
1139 sk_dst_set(sk, &rt->dst);
1140
1141 out:
1142 bh_unlock_sock(sk);
1143 dst_release(odst);
1144 }
1145 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1146
1147 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1148 int oif, u8 protocol)
1149 {
1150 const struct iphdr *iph = (const struct iphdr *) skb->data;
1151 struct flowi4 fl4;
1152 struct rtable *rt;
1153
1154 __build_flow_key(net, &fl4, NULL, iph, oif,
1155 RT_TOS(iph->tos), protocol, 0, 0);
1156 rt = __ip_route_output_key(net, &fl4);
1157 if (!IS_ERR(rt)) {
1158 __ip_do_redirect(rt, skb, &fl4, false);
1159 ip_rt_put(rt);
1160 }
1161 }
1162 EXPORT_SYMBOL_GPL(ipv4_redirect);
1163
1164 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1165 {
1166 const struct iphdr *iph = (const struct iphdr *) skb->data;
1167 struct flowi4 fl4;
1168 struct rtable *rt;
1169 struct net *net = sock_net(sk);
1170
1171 __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1172 rt = __ip_route_output_key(net, &fl4);
1173 if (!IS_ERR(rt)) {
1174 __ip_do_redirect(rt, skb, &fl4, false);
1175 ip_rt_put(rt);
1176 }
1177 }
1178 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1179
1180 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1181 {
1182 struct rtable *rt = (struct rtable *) dst;
1183
1184 /* All IPV4 dsts are created with ->obsolete set to the value
1185 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1186 * into this function always.
1187 *
1188 * When a PMTU/redirect information update invalidates a route,
1189 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1190 * DST_OBSOLETE_DEAD.
1191 */
1192 if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1193 return NULL;
1194 return dst;
1195 }
1196
1197 static void ipv4_send_dest_unreach(struct sk_buff *skb)
1198 {
1199 struct ip_options opt;
1200 int res;
1201
1202 /* Recompile ip options since IPCB may not be valid anymore.
1203 * Also check we have a reasonable ipv4 header.
1204 */
1205 if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
1206 ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
1207 return;
1208
1209 memset(&opt, 0, sizeof(opt));
1210 if (ip_hdr(skb)->ihl > 5) {
1211 if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
1212 return;
1213 opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);
1214
1215 rcu_read_lock();
1216 res = __ip_options_compile(dev_net(skb->dev), &opt, skb, NULL);
1217 rcu_read_unlock();
1218
1219 if (res)
1220 return;
1221 }
1222 __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
1223 }
1224
1225 static void ipv4_link_failure(struct sk_buff *skb)
1226 {
1227 struct rtable *rt;
1228
1229 ipv4_send_dest_unreach(skb);
1230
1231 rt = skb_rtable(skb);
1232 if (rt)
1233 dst_set_expires(&rt->dst, 0);
1234 }
1235
1236 static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1237 {
1238 pr_debug("%s: %pI4 -> %pI4, %s\n",
1239 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1240 skb->dev ? skb->dev->name : "?");
1241 kfree_skb(skb);
1242 WARN_ON(1);
1243 return 0;
1244 }
1245
1246 /*
1247 We do not cache source address of outgoing interface,
1248 because it is used only by IP RR, TS and SRR options,
1249 so it is out of the fast path.
1250
1251 BTW remember: "addr" is allowed to be unaligned
1252 in IP options!
1253 */
1254
1255 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1256 {
1257 __be32 src;
1258
1259 if (rt_is_output_route(rt))
1260 src = ip_hdr(skb)->saddr;
1261 else {
1262 struct fib_result res;
1263 struct iphdr *iph = ip_hdr(skb);
1264 struct flowi4 fl4 = {
1265 .daddr = iph->daddr,
1266 .saddr = iph->saddr,
1267 .flowi4_tos = RT_TOS(iph->tos),
1268 .flowi4_oif = rt->dst.dev->ifindex,
1269 .flowi4_iif = skb->dev->ifindex,
1270 .flowi4_mark = skb->mark,
1271 };
1272
1273 rcu_read_lock();
1274 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1275 src = fib_result_prefsrc(dev_net(rt->dst.dev), &res);
1276 else
1277 src = inet_select_addr(rt->dst.dev,
1278 rt_nexthop(rt, iph->daddr),
1279 RT_SCOPE_UNIVERSE);
1280 rcu_read_unlock();
1281 }
1282 memcpy(addr, &src, 4);
1283 }
1284
1285 #ifdef CONFIG_IP_ROUTE_CLASSID
1286 static void set_class_tag(struct rtable *rt, u32 tag)
1287 {
1288 if (!(rt->dst.tclassid & 0xFFFF))
1289 rt->dst.tclassid |= tag & 0xFFFF;
1290 if (!(rt->dst.tclassid & 0xFFFF0000))
1291 rt->dst.tclassid |= tag & 0xFFFF0000;
1292 }
1293 #endif
1294
1295 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1296 {
1297 unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
1298 unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
1299 ip_rt_min_advmss);
1300
1301 return min(advmss, IPV4_MAX_PMTU - header_size);
1302 }
1303
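/* Effective MTU for this dst: a non-expired learned PMTU wins, then an
 * explicit RTAX_MTU metric, then the device MTU (clamped to 576 for
 * MTU-locked routes via a gateway), capped at IP_MAX_MTU and reduced by any
 * lwtunnel encapsulation headroom.
 */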
1304 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1305 {
1306 const struct rtable *rt = (const struct rtable *) dst;
1307 unsigned int mtu = rt->rt_pmtu;
1308
1309 if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1310 mtu = dst_metric_raw(dst, RTAX_MTU);
1311
1312 if (mtu)
1313 return mtu;
1314
1315 mtu = READ_ONCE(dst->dev->mtu);
1316
1317 if (unlikely(ip_mtu_locked(dst))) {
1318 if (rt->rt_uses_gateway && mtu > 576)
1319 mtu = 576;
1320 }
1321
1322 mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
1323
1324 return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1325 }
1326
1327 static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr)
1328 {
1329 struct fnhe_hash_bucket *hash;
1330 struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1331 u32 hval = fnhe_hashfun(daddr);
1332
1333 spin_lock_bh(&fnhe_lock);
1334
1335 hash = rcu_dereference_protected(nhc->nhc_exceptions,
1336 lockdep_is_held(&fnhe_lock));
1337 hash += hval;
1338
1339 fnhe_p = &hash->chain;
1340 fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1341 while (fnhe) {
1342 if (fnhe->fnhe_daddr == daddr) {
1343 rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1344 fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1345 /* set fnhe_daddr to 0 to ensure it won't bind with
1346 * new dsts in rt_bind_exception().
1347 */
1348 fnhe->fnhe_daddr = 0;
1349 fnhe_flush_routes(fnhe);
1350 kfree_rcu(fnhe, rcu);
1351 break;
1352 }
1353 fnhe_p = &fnhe->fnhe_next;
1354 fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1355 lockdep_is_held(&fnhe_lock));
1356 }
1357
1358 spin_unlock_bh(&fnhe_lock);
1359 }
1360
1361 static struct fib_nh_exception *find_exception(struct fib_nh_common *nhc,
1362 __be32 daddr)
1363 {
1364 struct fnhe_hash_bucket *hash = rcu_dereference(nhc->nhc_exceptions);
1365 struct fib_nh_exception *fnhe;
1366 u32 hval;
1367
1368 if (!hash)
1369 return NULL;
1370
1371 hval = fnhe_hashfun(daddr);
1372
1373 for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1374 fnhe = rcu_dereference(fnhe->fnhe_next)) {
1375 if (fnhe->fnhe_daddr == daddr) {
1376 if (fnhe->fnhe_expires &&
1377 time_after(jiffies, fnhe->fnhe_expires)) {
1378 ip_del_fnhe(nhc, daddr);
1379 break;
1380 }
1381 return fnhe;
1382 }
1383 }
1384 return NULL;
1385 }
1386
1387 /* MTU selection:
1388 * 1. mtu on route is locked - use it
1389 * 2. mtu from nexthop exception
1390 * 3. mtu from egress device
1391 */
1392
1393 u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
1394 {
1395 struct fib_nh_common *nhc = res->nhc;
1396 struct net_device *dev = nhc->nhc_dev;
1397 struct fib_info *fi = res->fi;
1398 u32 mtu = 0;
1399
1400 if (dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu ||
1401 fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
1402 mtu = fi->fib_mtu;
1403
1404 if (likely(!mtu)) {
1405 struct fib_nh_exception *fnhe;
1406
1407 fnhe = find_exception(nhc, daddr);
1408 if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
1409 mtu = fnhe->fnhe_pmtu;
1410 }
1411
1412 if (likely(!mtu))
1413 mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);
1414
1415 return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu);
1416 }
1417
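/* Bind rt to a matching next-hop exception: copy the exception's PMTU and
 * gateway into the route and, when caching is requested, install the route in
 * the exception's input or output slot, discarding entries from an older
 * generation.
 */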
1418 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1419 __be32 daddr, const bool do_cache)
1420 {
1421 bool ret = false;
1422
1423 spin_lock_bh(&fnhe_lock);
1424
1425 if (daddr == fnhe->fnhe_daddr) {
1426 struct rtable __rcu **porig;
1427 struct rtable *orig;
1428 int genid = fnhe_genid(dev_net(rt->dst.dev));
1429
1430 if (rt_is_input_route(rt))
1431 porig = &fnhe->fnhe_rth_input;
1432 else
1433 porig = &fnhe->fnhe_rth_output;
1434 orig = rcu_dereference(*porig);
1435
1436 if (fnhe->fnhe_genid != genid) {
1437 fnhe->fnhe_genid = genid;
1438 fnhe->fnhe_gw = 0;
1439 fnhe->fnhe_pmtu = 0;
1440 fnhe->fnhe_expires = 0;
1441 fnhe->fnhe_mtu_locked = false;
1442 fnhe_flush_routes(fnhe);
1443 orig = NULL;
1444 }
1445 fill_route_from_fnhe(rt, fnhe);
1446 if (!rt->rt_gw4) {
1447 rt->rt_gw4 = daddr;
1448 rt->rt_gw_family = AF_INET;
1449 }
1450
1451 if (do_cache) {
1452 dst_hold(&rt->dst);
1453 rcu_assign_pointer(*porig, rt);
1454 if (orig) {
1455 dst_dev_put(&orig->dst);
1456 dst_release(&orig->dst);
1457 }
1458 ret = true;
1459 }
1460
1461 fnhe->fnhe_stamp = jiffies;
1462 }
1463 spin_unlock_bh(&fnhe_lock);
1464
1465 return ret;
1466 }
1467
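/* Try to install rt as the nexthop's cached route (single slot for input,
 * per-CPU slot for output) using cmpxchg; returns false if another CPU won
 * the race, in which case the caller puts the route on the uncached list.
 */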
1468 static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt)
1469 {
1470 struct rtable *orig, *prev, **p;
1471 bool ret = true;
1472
1473 if (rt_is_input_route(rt)) {
1474 p = (struct rtable **)&nhc->nhc_rth_input;
1475 } else {
1476 p = (struct rtable **)raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
1477 }
1478 orig = *p;
1479
1480 /* hold dst before doing cmpxchg() to avoid race condition
1481 * on this dst
1482 */
1483 dst_hold(&rt->dst);
1484 prev = cmpxchg(p, orig, rt);
1485 if (prev == orig) {
1486 if (orig) {
1487 rt_add_uncached_list(orig);
1488 dst_release(&orig->dst);
1489 }
1490 } else {
1491 dst_release(&rt->dst);
1492 ret = false;
1493 }
1494
1495 return ret;
1496 }
1497
1498 struct uncached_list {
1499 spinlock_t lock;
1500 struct list_head head;
1501 };
1502
1503 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1504
1505 void rt_add_uncached_list(struct rtable *rt)
1506 {
1507 struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1508
1509 rt->rt_uncached_list = ul;
1510
1511 spin_lock_bh(&ul->lock);
1512 list_add_tail(&rt->rt_uncached, &ul->head);
1513 spin_unlock_bh(&ul->lock);
1514 }
1515
1516 void rt_del_uncached_list(struct rtable *rt)
1517 {
1518 if (!list_empty(&rt->rt_uncached)) {
1519 struct uncached_list *ul = rt->rt_uncached_list;
1520
1521 spin_lock_bh(&ul->lock);
1522 list_del(&rt->rt_uncached);
1523 spin_unlock_bh(&ul->lock);
1524 }
1525 }
1526
1527 static void ipv4_dst_destroy(struct dst_entry *dst)
1528 {
1529 struct rtable *rt = (struct rtable *)dst;
1530
1531 ip_dst_metrics_put(dst);
1532 rt_del_uncached_list(rt);
1533 }
1534
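/* Re-point any uncached routes still using dev at blackhole_netdev,
 * transferring the device reference, so that dev can be released.
 */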
1535 void rt_flush_dev(struct net_device *dev)
1536 {
1537 struct rtable *rt;
1538 int cpu;
1539
1540 for_each_possible_cpu(cpu) {
1541 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1542
1543 spin_lock_bh(&ul->lock);
1544 list_for_each_entry(rt, &ul->head, rt_uncached) {
1545 if (rt->dst.dev != dev)
1546 continue;
1547 rt->dst.dev = blackhole_netdev;
1548 dev_hold(rt->dst.dev);
1549 dev_put(dev);
1550 }
1551 spin_unlock_bh(&ul->lock);
1552 }
1553 }
1554
1555 static bool rt_cache_valid(const struct rtable *rt)
1556 {
1557 return rt &&
1558 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1559 !rt_is_expired(rt);
1560 }
1561
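/* Fill in the nexthop-derived fields of a new route: gateway, metrics,
 * classid and lwtunnel state from the FIB result, then either cache the route
 * on the nexthop (or its exception) or put it on the uncached list.
 */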
1562 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1563 const struct fib_result *res,
1564 struct fib_nh_exception *fnhe,
1565 struct fib_info *fi, u16 type, u32 itag,
1566 const bool do_cache)
1567 {
1568 bool cached = false;
1569
1570 if (fi) {
1571 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1572
1573 if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) {
1574 rt->rt_uses_gateway = 1;
1575 rt->rt_gw_family = nhc->nhc_gw_family;
1576 /* only INET and INET6 are supported */
1577 if (likely(nhc->nhc_gw_family == AF_INET))
1578 rt->rt_gw4 = nhc->nhc_gw.ipv4;
1579 else
1580 rt->rt_gw6 = nhc->nhc_gw.ipv6;
1581 }
1582
1583 ip_dst_init_metrics(&rt->dst, fi->fib_metrics);
1584
1585 #ifdef CONFIG_IP_ROUTE_CLASSID
1586 if (nhc->nhc_family == AF_INET) {
1587 struct fib_nh *nh;
1588
1589 nh = container_of(nhc, struct fib_nh, nh_common);
1590 rt->dst.tclassid = nh->nh_tclassid;
1591 }
1592 #endif
1593 rt->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
1594 if (unlikely(fnhe))
1595 cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1596 else if (do_cache)
1597 cached = rt_cache_route(nhc, rt);
1598 if (unlikely(!cached)) {
1599 /* Routes we intend to cache in nexthop exception or
1600 * FIB nexthop have the DST_NOCACHE bit clear.
1601 * However, if we are unsuccessful at storing this
1602 * route into the cache we really need to set it.
1603 */
1604 if (!rt->rt_gw4) {
1605 rt->rt_gw_family = AF_INET;
1606 rt->rt_gw4 = daddr;
1607 }
1608 rt_add_uncached_list(rt);
1609 }
1610 } else
1611 rt_add_uncached_list(rt);
1612
1613 #ifdef CONFIG_IP_ROUTE_CLASSID
1614 #ifdef CONFIG_IP_MULTIPLE_TABLES
1615 set_class_tag(rt, res->tclassid);
1616 #endif
1617 set_class_tag(rt, itag);
1618 #endif
1619 }
1620
1621 struct rtable *rt_dst_alloc(struct net_device *dev,
1622 unsigned int flags, u16 type,
1623 bool nopolicy, bool noxfrm, bool will_cache)
1624 {
1625 struct rtable *rt;
1626
1627 rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1628 (will_cache ? 0 : DST_HOST) |
1629 (nopolicy ? DST_NOPOLICY : 0) |
1630 (noxfrm ? DST_NOXFRM : 0));
1631
1632 if (rt) {
1633 rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1634 rt->rt_flags = flags;
1635 rt->rt_type = type;
1636 rt->rt_is_input = 0;
1637 rt->rt_iif = 0;
1638 rt->rt_pmtu = 0;
1639 rt->rt_mtu_locked = 0;
1640 rt->rt_uses_gateway = 0;
1641 rt->rt_gw_family = 0;
1642 rt->rt_gw4 = 0;
1643 INIT_LIST_HEAD(&rt->rt_uncached);
1644
1645 rt->dst.output = ip_output;
1646 if (flags & RTCF_LOCAL)
1647 rt->dst.input = ip_local_deliver;
1648 }
1649
1650 return rt;
1651 }
1652 EXPORT_SYMBOL(rt_dst_alloc);
1653
1654 struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt)
1655 {
1656 struct rtable *new_rt;
1657
1658 new_rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1659 rt->dst.flags);
1660
1661 if (new_rt) {
1662 new_rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1663 new_rt->rt_flags = rt->rt_flags;
1664 new_rt->rt_type = rt->rt_type;
1665 new_rt->rt_is_input = rt->rt_is_input;
1666 new_rt->rt_iif = rt->rt_iif;
1667 new_rt->rt_pmtu = rt->rt_pmtu;
1668 new_rt->rt_mtu_locked = rt->rt_mtu_locked;
1669 new_rt->rt_gw_family = rt->rt_gw_family;
1670 if (rt->rt_gw_family == AF_INET)
1671 new_rt->rt_gw4 = rt->rt_gw4;
1672 else if (rt->rt_gw_family == AF_INET6)
1673 new_rt->rt_gw6 = rt->rt_gw6;
1674 INIT_LIST_HEAD(&new_rt->rt_uncached);
1675
1676 new_rt->dst.flags |= DST_HOST;
1677 new_rt->dst.input = rt->dst.input;
1678 new_rt->dst.output = rt->dst.output;
1679 new_rt->dst.error = rt->dst.error;
1680 new_rt->dst.lastuse = jiffies;
1681 new_rt->dst.lwtstate = lwtstate_get(rt->dst.lwtstate);
1682 }
1683 return new_rt;
1684 }
1685 EXPORT_SYMBOL(rt_dst_clone);
1686
1687 /* called in rcu_read_lock() section */
1688 int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1689 u8 tos, struct net_device *dev,
1690 struct in_device *in_dev, u32 *itag)
1691 {
1692 int err;
1693
1694 /* Primary sanity checks. */
1695 if (!in_dev)
1696 return -EINVAL;
1697
1698 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1699 skb->protocol != htons(ETH_P_IP))
1700 return -EINVAL;
1701
1702 if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1703 return -EINVAL;
1704
1705 if (ipv4_is_zeronet(saddr)) {
1706 if (!ipv4_is_local_multicast(daddr) &&
1707 ip_hdr(skb)->protocol != IPPROTO_IGMP)
1708 return -EINVAL;
1709 } else {
1710 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1711 in_dev, itag);
1712 if (err < 0)
1713 return err;
1714 }
1715 return 0;
1716 }
1717
1718 /* called in rcu_read_lock() section */
1719 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1720 u8 tos, struct net_device *dev, int our)
1721 {
1722 struct in_device *in_dev = __in_dev_get_rcu(dev);
1723 unsigned int flags = RTCF_MULTICAST;
1724 struct rtable *rth;
1725 u32 itag = 0;
1726 int err;
1727
1728 err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1729 if (err)
1730 return err;
1731
1732 if (our)
1733 flags |= RTCF_LOCAL;
1734
1735 rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1736 IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1737 if (!rth)
1738 return -ENOBUFS;
1739
1740 #ifdef CONFIG_IP_ROUTE_CLASSID
1741 rth->dst.tclassid = itag;
1742 #endif
1743 rth->dst.output = ip_rt_bug;
1744 rth->rt_is_input = 1;
1745
1746 #ifdef CONFIG_IP_MROUTE
1747 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1748 rth->dst.input = ip_mr_input;
1749 #endif
1750 RT_CACHE_STAT_INC(in_slow_mc);
1751
1752 skb_dst_set(skb, &rth->dst);
1753 return 0;
1754 }
1755
1756
1757 static void ip_handle_martian_source(struct net_device *dev,
1758 struct in_device *in_dev,
1759 struct sk_buff *skb,
1760 __be32 daddr,
1761 __be32 saddr)
1762 {
1763 RT_CACHE_STAT_INC(in_martian_src);
1764 #ifdef CONFIG_IP_ROUTE_VERBOSE
1765 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1766 /*
1767 * RFC 1812 recommendation: if the source is martian,
1768 * the only hint we can give is the MAC header.
1769 */
1770 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1771 &daddr, &saddr, dev->name);
1772 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1773 print_hex_dump(KERN_WARNING, "ll header: ",
1774 DUMP_PREFIX_OFFSET, 16, 1,
1775 skb_mac_header(skb),
1776 dev->hard_header_len, false);
1777 }
1778 }
1779 #endif
1780 }
1781
1782 /* called in rcu_read_lock() section */
1783 static int __mkroute_input(struct sk_buff *skb,
1784 const struct fib_result *res,
1785 struct in_device *in_dev,
1786 __be32 daddr, __be32 saddr, u32 tos)
1787 {
1788 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1789 struct net_device *dev = nhc->nhc_dev;
1790 struct fib_nh_exception *fnhe;
1791 struct rtable *rth;
1792 int err;
1793 struct in_device *out_dev;
1794 bool do_cache;
1795 u32 itag = 0;
1796
1797 /* get a working reference to the output device */
1798 out_dev = __in_dev_get_rcu(dev);
1799 if (!out_dev) {
1800 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1801 return -EINVAL;
1802 }
1803
1804 err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1805 in_dev->dev, in_dev, &itag);
1806 if (err < 0) {
1807 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1808 saddr);
1809
1810 goto cleanup;
1811 }
1812
1813 do_cache = res->fi && !itag;
1814 if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1815 skb->protocol == htons(ETH_P_IP)) {
1816 __be32 gw;
1817
1818 gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0;
1819 if (IN_DEV_SHARED_MEDIA(out_dev) ||
1820 inet_addr_onlink(out_dev, saddr, gw))
1821 IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1822 }
1823
1824 if (skb->protocol != htons(ETH_P_IP)) {
1825 /* Not IP (i.e. ARP). Do not create a route if it would be
1826 * invalid for proxy ARP. DNAT routes are always valid.
1827 *
1828 * The proxy ARP feature has been extended to allow ARP
1829 * replies back out of the same interface, to support
1830 * Private VLAN switch technologies. See arp.c.
1831 */
1832 if (out_dev == in_dev &&
1833 IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1834 err = -EINVAL;
1835 goto cleanup;
1836 }
1837 }
1838
1839 fnhe = find_exception(nhc, daddr);
1840 if (do_cache) {
1841 if (fnhe)
1842 rth = rcu_dereference(fnhe->fnhe_rth_input);
1843 else
1844 rth = rcu_dereference(nhc->nhc_rth_input);
1845 if (rt_cache_valid(rth)) {
1846 skb_dst_set_noref(skb, &rth->dst);
1847 goto out;
1848 }
1849 }
1850
1851 rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1852 IN_DEV_CONF_GET(in_dev, NOPOLICY),
1853 IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1854 if (!rth) {
1855 err = -ENOBUFS;
1856 goto cleanup;
1857 }
1858
1859 rth->rt_is_input = 1;
1860 RT_CACHE_STAT_INC(in_slow_tot);
1861
1862 rth->dst.input = ip_forward;
1863
1864 rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1865 do_cache);
1866 lwtunnel_set_redirect(&rth->dst);
1867 skb_dst_set(skb, &rth->dst);
1868 out:
1869 err = 0;
1870 cleanup:
1871 return err;
1872 }
1873
1874 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1875 /* To make ICMP packets follow the right flow, the multipath hash is
1876 * calculated from the inner IP addresses.
1877 */
1878 static void ip_multipath_l3_keys(const struct sk_buff *skb,
1879 struct flow_keys *hash_keys)
1880 {
1881 const struct iphdr *outer_iph = ip_hdr(skb);
1882 const struct iphdr *key_iph = outer_iph;
1883 const struct iphdr *inner_iph;
1884 const struct icmphdr *icmph;
1885 struct iphdr _inner_iph;
1886 struct icmphdr _icmph;
1887
1888 if (likely(outer_iph->protocol != IPPROTO_ICMP))
1889 goto out;
1890
1891 if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1892 goto out;
1893
1894 icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1895 &_icmph);
1896 if (!icmph)
1897 goto out;
1898
1899 if (icmph->type != ICMP_DEST_UNREACH &&
1900 icmph->type != ICMP_REDIRECT &&
1901 icmph->type != ICMP_TIME_EXCEEDED &&
1902 icmph->type != ICMP_PARAMETERPROB)
1903 goto out;
1904
1905 inner_iph = skb_header_pointer(skb,
1906 outer_iph->ihl * 4 + sizeof(_icmph),
1907 sizeof(_inner_iph), &_inner_iph);
1908 if (!inner_iph)
1909 goto out;
1910
1911 key_iph = inner_iph;
1912 out:
1913 hash_keys->addrs.v4addrs.src = key_iph->saddr;
1914 hash_keys->addrs.v4addrs.dst = key_iph->daddr;
1915 }
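/* Editorial note (illustrative, not part of the original source): consider a
 * TCP flow traversing an ECMP router. If a downstream hop emits an
 * ICMP_DEST_UNREACH for that flow, the error packet's outer addresses are the
 * router's and the sender's, which would normally hash onto a different
 * nexthop than the flow itself. Because the helper above keys on the embedded
 * (inner) header of the offending packet, the ICMP error is hashed exactly
 * like the original flow and follows the same path, e.g. for PMTU discovery.
 */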
1916
1917 /* if skb is set it will be used and fl4 can be NULL */
1918 int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
1919 const struct sk_buff *skb, struct flow_keys *flkeys)
1920 {
1921 u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0;
1922 struct flow_keys hash_keys;
1923 u32 mhash;
1924
1925 switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1926 case 0:
1927 memset(&hash_keys, 0, sizeof(hash_keys));
1928 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1929 if (skb) {
1930 ip_multipath_l3_keys(skb, &hash_keys);
1931 } else {
1932 hash_keys.addrs.v4addrs.src = fl4->saddr;
1933 hash_keys.addrs.v4addrs.dst = fl4->daddr;
1934 }
1935 break;
1936 case 1:
1937 /* skb is currently provided only when forwarding */
1938 if (skb) {
1939 unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1940 struct flow_keys keys;
1941
1942 /* short-circuit if we already have L4 hash present */
1943 if (skb->l4_hash)
1944 return skb_get_hash_raw(skb) >> 1;
1945
1946 memset(&hash_keys, 0, sizeof(hash_keys));
1947
1948 if (!flkeys) {
1949 skb_flow_dissect_flow_keys(skb, &keys, flag);
1950 flkeys = &keys;
1951 }
1952
1953 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1954 hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
1955 hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
1956 hash_keys.ports.src = flkeys->ports.src;
1957 hash_keys.ports.dst = flkeys->ports.dst;
1958 hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
1959 } else {
1960 memset(&hash_keys, 0, sizeof(hash_keys));
1961 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1962 hash_keys.addrs.v4addrs.src = fl4->saddr;
1963 hash_keys.addrs.v4addrs.dst = fl4->daddr;
1964 hash_keys.ports.src = fl4->fl4_sport;
1965 hash_keys.ports.dst = fl4->fl4_dport;
1966 hash_keys.basic.ip_proto = fl4->flowi4_proto;
1967 }
1968 break;
1969 case 2:
1970 memset(&hash_keys, 0, sizeof(hash_keys));
1971 /* skb is currently provided only when forwarding */
1972 if (skb) {
1973 struct flow_keys keys;
1974
1975 skb_flow_dissect_flow_keys(skb, &keys, 0);
1976 /* Inner can be v4 or v6 */
1977 if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
1978 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1979 hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
1980 hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
1981 } else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
1982 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1983 hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
1984 hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
1985 hash_keys.tags.flow_label = keys.tags.flow_label;
1986 hash_keys.basic.ip_proto = keys.basic.ip_proto;
1987 } else {
1988 /* Same as case 0 */
1989 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1990 ip_multipath_l3_keys(skb, &hash_keys);
1991 }
1992 } else {
1993 /* Same as case 0 */
1994 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1995 hash_keys.addrs.v4addrs.src = fl4->saddr;
1996 hash_keys.addrs.v4addrs.dst = fl4->daddr;
1997 }
1998 break;
1999 }
2000 mhash = flow_hash_from_keys(&hash_keys);
2001
2002 if (multipath_hash)
2003 mhash = jhash_2words(mhash, multipath_hash, 0);
2004
2005 return mhash >> 1;
2006 }
2007 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
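/* Editorial note (illustrative): the cases above correspond to the
 * net.ipv4.fib_multipath_hash_policy sysctl: 0 hashes on the L3 source and
 * destination addresses (with the ICMP special case shown earlier), 1 hashes
 * on the L4 five-tuple, and 2 prefers the innermost L3 addresses of
 * encapsulated traffic when an skb is available. A sketch of switching the
 * policy from user space:
 *
 *   # sysctl -w net.ipv4.fib_multipath_hash_policy=1
 *
 * When fl4->flowi4_multipath_hash is set, it is folded into the result with
 * jhash_2words() so callers can perturb nexthop selection per flow.
 */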
2008
2009 static int ip_mkroute_input(struct sk_buff *skb,
2010 struct fib_result *res,
2011 struct in_device *in_dev,
2012 __be32 daddr, __be32 saddr, u32 tos,
2013 struct flow_keys *hkeys)
2014 {
2015 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2016 if (res->fi && fib_info_num_path(res->fi) > 1) {
2017 int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
2018
2019 fib_select_multipath(res, h);
2020 }
2021 #endif
2022
2023 /* create a routing cache entry */
2024 return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
2025 }
2026
2027 /*
2028 * NOTE. We drop all packets that have a local source
2029 * address, because every properly looped-back packet
2030 * must already have the correct destination attached by the output routine.
2031 *
2032 * This approach solves two big problems:
2033 * 1. Non-simplex devices are handled properly.
2034 * 2. IP spoofing attempts are filtered with a 100% guarantee.
2035 * called with rcu_read_lock()
2036 */
2037
2038 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2039 u8 tos, struct net_device *dev,
2040 struct fib_result *res)
2041 {
2042 struct in_device *in_dev = __in_dev_get_rcu(dev);
2043 struct flow_keys *flkeys = NULL, _flkeys;
2044 struct net *net = dev_net(dev);
2045 struct ip_tunnel_info *tun_info;
2046 int err = -EINVAL;
2047 unsigned int flags = 0;
2048 u32 itag = 0;
2049 struct rtable *rth;
2050 struct flowi4 fl4;
2051 bool do_cache = true;
2052
2053 /* IP on this device is disabled. */
2054
2055 if (!in_dev)
2056 goto out;
2057
2058 /* Check for the weirdest martians, which may not be detected
2059 by fib_lookup.
2060 */
2061
2062 tun_info = skb_tunnel_info(skb);
2063 if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2064 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
2065 else
2066 fl4.flowi4_tun_key.tun_id = 0;
2067 skb_dst_drop(skb);
2068
2069 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2070 goto martian_source;
2071
2072 res->fi = NULL;
2073 res->table = NULL;
2074 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2075 goto brd_input;
2076
2077 /* Accept zero addresses only for limited broadcast;
2078 * I do not even know whether to fix this or not. Waiting for complaints :-)
2079 */
2080 if (ipv4_is_zeronet(saddr))
2081 goto martian_source;
2082
2083 if (ipv4_is_zeronet(daddr))
2084 goto martian_destination;
2085
2086 /* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
2087 * and calls it at most once, when daddr and/or saddr are loopback addresses
2088 */
2089 if (ipv4_is_loopback(daddr)) {
2090 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2091 goto martian_destination;
2092 } else if (ipv4_is_loopback(saddr)) {
2093 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2094 goto martian_source;
2095 }
2096
2097 /*
2098 * Now we are ready to route the packet.
2099 */
2100 fl4.flowi4_oif = 0;
2101 fl4.flowi4_iif = dev->ifindex;
2102 fl4.flowi4_mark = skb->mark;
2103 fl4.flowi4_tos = tos;
2104 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2105 fl4.flowi4_flags = 0;
2106 fl4.daddr = daddr;
2107 fl4.saddr = saddr;
2108 fl4.flowi4_uid = sock_net_uid(net, NULL);
2109
2110 if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
2111 flkeys = &_flkeys;
2112 } else {
2113 fl4.flowi4_proto = 0;
2114 fl4.fl4_sport = 0;
2115 fl4.fl4_dport = 0;
2116 }
2117
2118 err = fib_lookup(net, &fl4, res, 0);
2119 if (err != 0) {
2120 if (!IN_DEV_FORWARD(in_dev))
2121 err = -EHOSTUNREACH;
2122 goto no_route;
2123 }
2124
2125 if (res->type == RTN_BROADCAST) {
2126 if (IN_DEV_BFORWARD(in_dev))
2127 goto make_route;
2128 /* do not cache if bc_forwarding is enabled */
2129 if (IPV4_DEVCONF_ALL(net, BC_FORWARDING))
2130 do_cache = false;
2131 goto brd_input;
2132 }
2133
2134 if (res->type == RTN_LOCAL) {
2135 err = fib_validate_source(skb, saddr, daddr, tos,
2136 0, dev, in_dev, &itag);
2137 if (err < 0)
2138 goto martian_source;
2139 goto local_input;
2140 }
2141
2142 if (!IN_DEV_FORWARD(in_dev)) {
2143 err = -EHOSTUNREACH;
2144 goto no_route;
2145 }
2146 if (res->type != RTN_UNICAST)
2147 goto martian_destination;
2148
2149 make_route:
2150 err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
2151 out: return err;
2152
2153 brd_input:
2154 if (skb->protocol != htons(ETH_P_IP))
2155 goto e_inval;
2156
2157 if (!ipv4_is_zeronet(saddr)) {
2158 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2159 in_dev, &itag);
2160 if (err < 0)
2161 goto martian_source;
2162 }
2163 flags |= RTCF_BROADCAST;
2164 res->type = RTN_BROADCAST;
2165 RT_CACHE_STAT_INC(in_brd);
2166
2167 local_input:
2168 do_cache &= res->fi && !itag;
2169 if (do_cache) {
2170 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2171
2172 rth = rcu_dereference(nhc->nhc_rth_input);
2173 if (rt_cache_valid(rth)) {
2174 skb_dst_set_noref(skb, &rth->dst);
2175 err = 0;
2176 goto out;
2177 }
2178 }
2179
2180 rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
2181 flags | RTCF_LOCAL, res->type,
2182 IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
2183 if (!rth)
2184 goto e_nobufs;
2185
2186 rth->dst.output = ip_rt_bug;
2187 #ifdef CONFIG_IP_ROUTE_CLASSID
2188 rth->dst.tclassid = itag;
2189 #endif
2190 rth->rt_is_input = 1;
2191
2192 RT_CACHE_STAT_INC(in_slow_tot);
2193 if (res->type == RTN_UNREACHABLE) {
2194 rth->dst.input = ip_error;
2195 rth->dst.error = -err;
2196 rth->rt_flags &= ~RTCF_LOCAL;
2197 }
2198
2199 if (do_cache) {
2200 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2201
2202 rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
2203 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2204 WARN_ON(rth->dst.input == lwtunnel_input);
2205 rth->dst.lwtstate->orig_input = rth->dst.input;
2206 rth->dst.input = lwtunnel_input;
2207 }
2208
2209 if (unlikely(!rt_cache_route(nhc, rth)))
2210 rt_add_uncached_list(rth);
2211 }
2212 skb_dst_set(skb, &rth->dst);
2213 err = 0;
2214 goto out;
2215
2216 no_route:
2217 RT_CACHE_STAT_INC(in_no_route);
2218 res->type = RTN_UNREACHABLE;
2219 res->fi = NULL;
2220 res->table = NULL;
2221 goto local_input;
2222
2223 /*
2224 * Do not cache martian addresses: they should be logged (RFC1812)
2225 */
2226 martian_destination:
2227 RT_CACHE_STAT_INC(in_martian_dst);
2228 #ifdef CONFIG_IP_ROUTE_VERBOSE
2229 if (IN_DEV_LOG_MARTIANS(in_dev))
2230 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2231 &daddr, &saddr, dev->name);
2232 #endif
2233
2234 e_inval:
2235 err = -EINVAL;
2236 goto out;
2237
2238 e_nobufs:
2239 err = -ENOBUFS;
2240 goto out;
2241
2242 martian_source:
2243 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2244 goto out;
2245 }
2246
2247 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2248 u8 tos, struct net_device *dev)
2249 {
2250 struct fib_result res;
2251 int err;
2252
2253 tos &= IPTOS_RT_MASK;
2254 rcu_read_lock();
2255 err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2256 rcu_read_unlock();
2257
2258 return err;
2259 }
2260 EXPORT_SYMBOL(ip_route_input_noref);
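/* Editorial sketch (assumed caller, not part of this file): a receive-path
 * user that holds no dst yet resolves one much like the core IPv4 input path
 * does; the variable names below are only illustrative:
 *
 *   const struct iphdr *iph = ip_hdr(skb);
 *   int err;
 *
 *   err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
 *                              iph->tos, skb->dev);
 *   if (err)
 *           goto drop;
 *
 * On success skb_dst(skb) carries the input route, and dst->input() either
 * delivers the packet locally or forwards it.
 */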
2261
2262 /* called with rcu_read_lock held */
2263 int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2264 u8 tos, struct net_device *dev, struct fib_result *res)
2265 {
2266 /* Multicast recognition logic has moved from the route cache to here.
2267 The problem was that too many Ethernet cards have broken/missing
2268 hardware multicast filters :-( As a result, a host on a multicast
2269 network acquires a lot of useless route cache entries, e.g. for
2270 SDR messages from all over the world. Now we try to get rid of them.
2271 Really, provided the software IP multicast filter is organized
2272 reasonably (at least, hashed), it does not result in a slowdown
2273 compared with route cache reject entries.
2274 Note that multicast routers are not affected, because
2275 a route cache entry is created eventually.
2276 */
2277 if (ipv4_is_multicast(daddr)) {
2278 struct in_device *in_dev = __in_dev_get_rcu(dev);
2279 int our = 0;
2280 int err = -EINVAL;
2281
2282 if (!in_dev)
2283 return err;
2284 our = ip_check_mc_rcu(in_dev, daddr, saddr,
2285 ip_hdr(skb)->protocol);
2286
2287 /* check l3 master if no match yet */
2288 if (!our && netif_is_l3_slave(dev)) {
2289 struct in_device *l3_in_dev;
2290
2291 l3_in_dev = __in_dev_get_rcu(skb->dev);
2292 if (l3_in_dev)
2293 our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2294 ip_hdr(skb)->protocol);
2295 }
2296
2297 if (our
2298 #ifdef CONFIG_IP_MROUTE
2299 ||
2300 (!ipv4_is_local_multicast(daddr) &&
2301 IN_DEV_MFORWARD(in_dev))
2302 #endif
2303 ) {
2304 err = ip_route_input_mc(skb, daddr, saddr,
2305 tos, dev, our);
2306 }
2307 return err;
2308 }
2309
2310 return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2311 }
2312
2313 /* called with rcu_read_lock() */
2314 static struct rtable *__mkroute_output(const struct fib_result *res,
2315 const struct flowi4 *fl4, int orig_oif,
2316 struct net_device *dev_out,
2317 unsigned int flags)
2318 {
2319 struct fib_info *fi = res->fi;
2320 struct fib_nh_exception *fnhe;
2321 struct in_device *in_dev;
2322 u16 type = res->type;
2323 struct rtable *rth;
2324 bool do_cache;
2325
2326 in_dev = __in_dev_get_rcu(dev_out);
2327 if (!in_dev)
2328 return ERR_PTR(-EINVAL);
2329
2330 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2331 if (ipv4_is_loopback(fl4->saddr) &&
2332 !(dev_out->flags & IFF_LOOPBACK) &&
2333 !netif_is_l3_master(dev_out))
2334 return ERR_PTR(-EINVAL);
2335
2336 if (ipv4_is_lbcast(fl4->daddr))
2337 type = RTN_BROADCAST;
2338 else if (ipv4_is_multicast(fl4->daddr))
2339 type = RTN_MULTICAST;
2340 else if (ipv4_is_zeronet(fl4->daddr))
2341 return ERR_PTR(-EINVAL);
2342
2343 if (dev_out->flags & IFF_LOOPBACK)
2344 flags |= RTCF_LOCAL;
2345
2346 do_cache = true;
2347 if (type == RTN_BROADCAST) {
2348 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2349 fi = NULL;
2350 } else if (type == RTN_MULTICAST) {
2351 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2352 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2353 fl4->flowi4_proto))
2354 flags &= ~RTCF_LOCAL;
2355 else
2356 do_cache = false;
2357 /* If a multicast route does not exist, use
2358 * the default one, but do not use a gateway in this case.
2359 * Yes, it is a hack.
2360 */
2361 if (fi && res->prefixlen < 4)
2362 fi = NULL;
2363 } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2364 (orig_oif != dev_out->ifindex)) {
2365 /* For local routes that require a particular output interface
2366 * we do not want to cache the result. Caching the result
2367 * causes incorrect behaviour when there are multiple source
2368 * addresses on the interface, the end result being that if the
2369 * intended recipient is waiting on that interface for the
2370 * packet he won't receive it because it will be delivered on
2371 * the loopback interface and the IP_PKTINFO ipi_ifindex will
2372 * be set to the loopback interface as well.
2373 */
2374 do_cache = false;
2375 }
2376
2377 fnhe = NULL;
2378 do_cache &= fi != NULL;
2379 if (fi) {
2380 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2381 struct rtable __rcu **prth;
2382
2383 fnhe = find_exception(nhc, fl4->daddr);
2384 if (!do_cache)
2385 goto add;
2386 if (fnhe) {
2387 prth = &fnhe->fnhe_rth_output;
2388 } else {
2389 if (unlikely(fl4->flowi4_flags &
2390 FLOWI_FLAG_KNOWN_NH &&
2391 !(nhc->nhc_gw_family &&
2392 nhc->nhc_scope == RT_SCOPE_LINK))) {
2393 do_cache = false;
2394 goto add;
2395 }
2396 prth = raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
2397 }
2398 rth = rcu_dereference(*prth);
2399 if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2400 return rth;
2401 }
2402
2403 add:
2404 rth = rt_dst_alloc(dev_out, flags, type,
2405 IN_DEV_CONF_GET(in_dev, NOPOLICY),
2406 IN_DEV_CONF_GET(in_dev, NOXFRM),
2407 do_cache);
2408 if (!rth)
2409 return ERR_PTR(-ENOBUFS);
2410
2411 rth->rt_iif = orig_oif;
2412
2413 RT_CACHE_STAT_INC(out_slow_tot);
2414
2415 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2416 if (flags & RTCF_LOCAL &&
2417 !(dev_out->flags & IFF_LOOPBACK)) {
2418 rth->dst.output = ip_mc_output;
2419 RT_CACHE_STAT_INC(out_slow_mc);
2420 }
2421 #ifdef CONFIG_IP_MROUTE
2422 if (type == RTN_MULTICAST) {
2423 if (IN_DEV_MFORWARD(in_dev) &&
2424 !ipv4_is_local_multicast(fl4->daddr)) {
2425 rth->dst.input = ip_mr_input;
2426 rth->dst.output = ip_mc_output;
2427 }
2428 }
2429 #endif
2430 }
2431
2432 rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2433 lwtunnel_set_redirect(&rth->dst);
2434
2435 return rth;
2436 }
2437
2438 /*
2439 * Major route resolver routine.
2440 */
2441
2442 struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2443 const struct sk_buff *skb)
2444 {
2445 __u8 tos = RT_FL_TOS(fl4);
2446 struct fib_result res = {
2447 .type = RTN_UNSPEC,
2448 .fi = NULL,
2449 .table = NULL,
2450 .tclassid = 0,
2451 };
2452 struct rtable *rth;
2453
2454 fl4->flowi4_iif = LOOPBACK_IFINDEX;
2455 fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2456 fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2457 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2458
2459 rcu_read_lock();
2460 rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2461 rcu_read_unlock();
2462
2463 return rth;
2464 }
2465 EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
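/* Editorial sketch (assumed caller, not part of this file): the usual way to
 * reach the resolver above is through the ip_route_output_key() wrapper with
 * a caller-filled flowi4 key; daddr, saddr, oif and tos stand in for the
 * caller's own values:
 *
 *   struct flowi4 fl4 = {
 *           .daddr      = daddr,
 *           .saddr      = saddr,
 *           .flowi4_oif = oif,
 *           .flowi4_tos = RT_TOS(tos),
 *   };
 *   struct rtable *rt = ip_route_output_key(net, &fl4);
 *
 *   if (IS_ERR(rt))
 *           return PTR_ERR(rt);
 *   ...use rt->dst, then release it with ip_rt_put(rt);
 */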
2466
2467 struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2468 struct fib_result *res,
2469 const struct sk_buff *skb)
2470 {
2471 struct net_device *dev_out = NULL;
2472 int orig_oif = fl4->flowi4_oif;
2473 unsigned int flags = 0;
2474 struct rtable *rth;
2475 int err;
2476
2477 if (fl4->saddr) {
2478 if (ipv4_is_multicast(fl4->saddr) ||
2479 ipv4_is_lbcast(fl4->saddr) ||
2480 ipv4_is_zeronet(fl4->saddr)) {
2481 rth = ERR_PTR(-EINVAL);
2482 goto out;
2483 }
2484
2485 rth = ERR_PTR(-ENETUNREACH);
2486
2487 /* I removed the check for oif == dev_out->oif here.
2488 It was wrong for two reasons:
2489 1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2490 is assigned to multiple interfaces.
2491 2. Moreover, we are allowed to send packets with the saddr
2492 of another iface. --ANK
2493 */
2494
2495 if (fl4->flowi4_oif == 0 &&
2496 (ipv4_is_multicast(fl4->daddr) ||
2497 ipv4_is_lbcast(fl4->daddr))) {
2498 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2499 dev_out = __ip_dev_find(net, fl4->saddr, false);
2500 if (!dev_out)
2501 goto out;
2502
2503 /* Special hack: the user can direct multicasts
2504 and limited broadcasts via the necessary interface
2505 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2506 This hack is not just for fun, it allows
2507 vic, vat and friends to work.
2508 They bind the socket to loopback, set ttl to zero
2509 and expect that it will work.
2510 From the viewpoint of the routing cache they are broken,
2511 because we are not allowed to build a multicast path
2512 with a loopback source addr (look, the routing cache
2513 cannot know that ttl is zero, so that the packet
2514 will not leave this host and the route is valid).
2515 Luckily, this hack is a good workaround.
2516 */
2517
2518 fl4->flowi4_oif = dev_out->ifindex;
2519 goto make_route;
2520 }
2521
2522 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2523 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2524 if (!__ip_dev_find(net, fl4->saddr, false))
2525 goto out;
2526 }
2527 }
2528
2529
2530 if (fl4->flowi4_oif) {
2531 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2532 rth = ERR_PTR(-ENODEV);
2533 if (!dev_out)
2534 goto out;
2535
2536 /* RACE: Check return value of inet_select_addr instead. */
2537 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2538 rth = ERR_PTR(-ENETUNREACH);
2539 goto out;
2540 }
2541 if (ipv4_is_local_multicast(fl4->daddr) ||
2542 ipv4_is_lbcast(fl4->daddr) ||
2543 fl4->flowi4_proto == IPPROTO_IGMP) {
2544 if (!fl4->saddr)
2545 fl4->saddr = inet_select_addr(dev_out, 0,
2546 RT_SCOPE_LINK);
2547 goto make_route;
2548 }
2549 if (!fl4->saddr) {
2550 if (ipv4_is_multicast(fl4->daddr))
2551 fl4->saddr = inet_select_addr(dev_out, 0,
2552 fl4->flowi4_scope);
2553 else if (!fl4->daddr)
2554 fl4->saddr = inet_select_addr(dev_out, 0,
2555 RT_SCOPE_HOST);
2556 }
2557 }
2558
2559 if (!fl4->daddr) {
2560 fl4->daddr = fl4->saddr;
2561 if (!fl4->daddr)
2562 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2563 dev_out = net->loopback_dev;
2564 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2565 res->type = RTN_LOCAL;
2566 flags |= RTCF_LOCAL;
2567 goto make_route;
2568 }
2569
2570 err = fib_lookup(net, fl4, res, 0);
2571 if (err) {
2572 res->fi = NULL;
2573 res->table = NULL;
2574 if (fl4->flowi4_oif &&
2575 (ipv4_is_multicast(fl4->daddr) ||
2576 !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2577 /* Apparently, the routing tables are wrong. Assume
2578 that the destination is on-link.
2579 
2580 WHY? DW.
2581 Because we are allowed to send to an iface
2582 even if it has NO routes and NO assigned
2583 addresses. When oif is specified, the routing
2584 tables are looked up with only one purpose:
2585 to check whether the destination is gatewayed rather than
2586 directly reachable. Moreover, if MSG_DONTROUTE is set,
2587 we send the packet, ignoring both routing tables
2588 and ifaddr state. --ANK
2589 
2590 
2591 We could do this even if oif is unknown,
2592 as IPv6 likely does, but we do not.
2593 */
2594
2595 if (fl4->saddr == 0)
2596 fl4->saddr = inet_select_addr(dev_out, 0,
2597 RT_SCOPE_LINK);
2598 res->type = RTN_UNICAST;
2599 goto make_route;
2600 }
2601 rth = ERR_PTR(err);
2602 goto out;
2603 }
2604
2605 if (res->type == RTN_LOCAL) {
2606 if (!fl4->saddr) {
2607 if (res->fi->fib_prefsrc)
2608 fl4->saddr = res->fi->fib_prefsrc;
2609 else
2610 fl4->saddr = fl4->daddr;
2611 }
2612
2613 /* L3 master device is the loopback for that domain */
2614 dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2615 net->loopback_dev;
2616
2617 /* make sure orig_oif points to fib result device even
2618 * though packet rx/tx happens over loopback or l3mdev
2619 */
2620 orig_oif = FIB_RES_OIF(*res);
2621
2622 fl4->flowi4_oif = dev_out->ifindex;
2623 flags |= RTCF_LOCAL;
2624 goto make_route;
2625 }
2626
2627 fib_select_path(net, res, fl4, skb);
2628
2629 dev_out = FIB_RES_DEV(*res);
2630 fl4->flowi4_oif = dev_out->ifindex;
2631
2632
2633 make_route:
2634 rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2635
2636 out:
2637 return rth;
2638 }
2639
2640 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2641 {
2642 return NULL;
2643 }
2644
2645 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2646 {
2647 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2648
2649 return mtu ? : dst->dev->mtu;
2650 }
2651
2652 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2653 struct sk_buff *skb, u32 mtu,
2654 bool confirm_neigh)
2655 {
2656 }
2657
2658 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2659 struct sk_buff *skb)
2660 {
2661 }
2662
2663 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2664 unsigned long old)
2665 {
2666 return NULL;
2667 }
2668
2669 static struct dst_ops ipv4_dst_blackhole_ops = {
2670 .family = AF_INET,
2671 .check = ipv4_blackhole_dst_check,
2672 .mtu = ipv4_blackhole_mtu,
2673 .default_advmss = ipv4_default_advmss,
2674 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2675 .redirect = ipv4_rt_blackhole_redirect,
2676 .cow_metrics = ipv4_rt_blackhole_cow_metrics,
2677 .neigh_lookup = ipv4_neigh_lookup,
2678 };
2679
2680 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2681 {
2682 struct rtable *ort = (struct rtable *) dst_orig;
2683 struct rtable *rt;
2684
2685 rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2686 if (rt) {
2687 struct dst_entry *new = &rt->dst;
2688
2689 new->__use = 1;
2690 new->input = dst_discard;
2691 new->output = dst_discard_out;
2692
2693 new->dev = net->loopback_dev;
2694 if (new->dev)
2695 dev_hold(new->dev);
2696
2697 rt->rt_is_input = ort->rt_is_input;
2698 rt->rt_iif = ort->rt_iif;
2699 rt->rt_pmtu = ort->rt_pmtu;
2700 rt->rt_mtu_locked = ort->rt_mtu_locked;
2701
2702 rt->rt_genid = rt_genid_ipv4(net);
2703 rt->rt_flags = ort->rt_flags;
2704 rt->rt_type = ort->rt_type;
2705 rt->rt_uses_gateway = ort->rt_uses_gateway;
2706 rt->rt_gw_family = ort->rt_gw_family;
2707 if (rt->rt_gw_family == AF_INET)
2708 rt->rt_gw4 = ort->rt_gw4;
2709 else if (rt->rt_gw_family == AF_INET6)
2710 rt->rt_gw6 = ort->rt_gw6;
2711
2712 INIT_LIST_HEAD(&rt->rt_uncached);
2713 }
2714
2715 dst_release(dst_orig);
2716
2717 return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2718 }
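/* Editorial note (interpretation, not from this file): the blackhole dst
 * built above is handed out, e.g. by the xfrm code, when a cached route must
 * be neutralized without leaving the caller dst-less. Every method is a
 * no-op, ->check() always returns NULL so the entry is never revalidated,
 * and input/output simply discard packets. The original route's identity
 * fields are copied so callers still see consistent metadata.
 */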
2719
2720 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2721 const struct sock *sk)
2722 {
2723 struct rtable *rt = __ip_route_output_key(net, flp4);
2724
2725 if (IS_ERR(rt))
2726 return rt;
2727
2728 if (flp4->flowi4_proto)
2729 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2730 flowi4_to_flowi(flp4),
2731 sk, 0);
2732
2733 return rt;
2734 }
2735 EXPORT_SYMBOL_GPL(ip_route_output_flow);
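/* Editorial sketch (assumed caller): socket-based senders typically go
 * through this wrapper so that, when flowi4_proto is set, the resolved route
 * is passed through xfrm_lookup_route() and IPsec policy may substitute its
 * own dst:
 *
 *   rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
 *   if (IS_ERR(rt))
 *           return PTR_ERR(rt);
 */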
2736
2737 /* called with rcu_read_lock held */
2738 static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2739 struct rtable *rt, u32 table_id, struct flowi4 *fl4,
2740 struct sk_buff *skb, u32 portid, u32 seq,
2741 unsigned int flags)
2742 {
2743 struct rtmsg *r;
2744 struct nlmsghdr *nlh;
2745 unsigned long expires = 0;
2746 u32 error;
2747 u32 metrics[RTAX_MAX];
2748
2749 nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), flags);
2750 if (!nlh)
2751 return -EMSGSIZE;
2752
2753 r = nlmsg_data(nlh);
2754 r->rtm_family = AF_INET;
2755 r->rtm_dst_len = 32;
2756 r->rtm_src_len = 0;
2757 r->rtm_tos = fl4 ? fl4->flowi4_tos : 0;
2758 r->rtm_table = table_id < 256 ? table_id : RT_TABLE_COMPAT;
2759 if (nla_put_u32(skb, RTA_TABLE, table_id))
2760 goto nla_put_failure;
2761 r->rtm_type = rt->rt_type;
2762 r->rtm_scope = RT_SCOPE_UNIVERSE;
2763 r->rtm_protocol = RTPROT_UNSPEC;
2764 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2765 if (rt->rt_flags & RTCF_NOTIFY)
2766 r->rtm_flags |= RTM_F_NOTIFY;
2767 if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2768 r->rtm_flags |= RTCF_DOREDIRECT;
2769
2770 if (nla_put_in_addr(skb, RTA_DST, dst))
2771 goto nla_put_failure;
2772 if (src) {
2773 r->rtm_src_len = 32;
2774 if (nla_put_in_addr(skb, RTA_SRC, src))
2775 goto nla_put_failure;
2776 }
2777 if (rt->dst.dev &&
2778 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2779 goto nla_put_failure;
2780 #ifdef CONFIG_IP_ROUTE_CLASSID
2781 if (rt->dst.tclassid &&
2782 nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2783 goto nla_put_failure;
2784 #endif
2785 if (fl4 && !rt_is_input_route(rt) &&
2786 fl4->saddr != src) {
2787 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2788 goto nla_put_failure;
2789 }
2790 if (rt->rt_uses_gateway) {
2791 if (rt->rt_gw_family == AF_INET &&
2792 nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) {
2793 goto nla_put_failure;
2794 } else if (rt->rt_gw_family == AF_INET6) {
2795 int alen = sizeof(struct in6_addr);
2796 struct nlattr *nla;
2797 struct rtvia *via;
2798
2799 nla = nla_reserve(skb, RTA_VIA, alen + 2);
2800 if (!nla)
2801 goto nla_put_failure;
2802
2803 via = nla_data(nla);
2804 via->rtvia_family = AF_INET6;
2805 memcpy(via->rtvia_addr, &rt->rt_gw6, alen);
2806 }
2807 }
2808
2809 expires = rt->dst.expires;
2810 if (expires) {
2811 unsigned long now = jiffies;
2812
2813 if (time_before(now, expires))
2814 expires -= now;
2815 else
2816 expires = 0;
2817 }
2818
2819 memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2820 if (rt->rt_pmtu && expires)
2821 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2822 if (rt->rt_mtu_locked && expires)
2823 metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
2824 if (rtnetlink_put_metrics(skb, metrics) < 0)
2825 goto nla_put_failure;
2826
2827 if (fl4) {
2828 if (fl4->flowi4_mark &&
2829 nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2830 goto nla_put_failure;
2831
2832 if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2833 nla_put_u32(skb, RTA_UID,
2834 from_kuid_munged(current_user_ns(),
2835 fl4->flowi4_uid)))
2836 goto nla_put_failure;
2837
2838 if (rt_is_input_route(rt)) {
2839 #ifdef CONFIG_IP_MROUTE
2840 if (ipv4_is_multicast(dst) &&
2841 !ipv4_is_local_multicast(dst) &&
2842 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2843 int err = ipmr_get_route(net, skb,
2844 fl4->saddr, fl4->daddr,
2845 r, portid);
2846
2847 if (err <= 0) {
2848 if (err == 0)
2849 return 0;
2850 goto nla_put_failure;
2851 }
2852 } else
2853 #endif
2854 if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
2855 goto nla_put_failure;
2856 }
2857 }
2858
2859 error = rt->dst.error;
2860
2861 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2862 goto nla_put_failure;
2863
2864 nlmsg_end(skb, nlh);
2865 return 0;
2866
2867 nla_put_failure:
2868 nlmsg_cancel(skb, nlh);
2869 return -EMSGSIZE;
2870 }
2871
2872 static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb,
2873 struct netlink_callback *cb, u32 table_id,
2874 struct fnhe_hash_bucket *bucket, int genid,
2875 int *fa_index, int fa_start, unsigned int flags)
2876 {
2877 int i;
2878
2879 for (i = 0; i < FNHE_HASH_SIZE; i++) {
2880 struct fib_nh_exception *fnhe;
2881
2882 for (fnhe = rcu_dereference(bucket[i].chain); fnhe;
2883 fnhe = rcu_dereference(fnhe->fnhe_next)) {
2884 struct rtable *rt;
2885 int err;
2886
2887 if (*fa_index < fa_start)
2888 goto next;
2889
2890 if (fnhe->fnhe_genid != genid)
2891 goto next;
2892
2893 if (fnhe->fnhe_expires &&
2894 time_after(jiffies, fnhe->fnhe_expires))
2895 goto next;
2896
2897 rt = rcu_dereference(fnhe->fnhe_rth_input);
2898 if (!rt)
2899 rt = rcu_dereference(fnhe->fnhe_rth_output);
2900 if (!rt)
2901 goto next;
2902
2903 err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt,
2904 table_id, NULL, skb,
2905 NETLINK_CB(cb->skb).portid,
2906 cb->nlh->nlmsg_seq, flags);
2907 if (err)
2908 return err;
2909 next:
2910 (*fa_index)++;
2911 }
2912 }
2913
2914 return 0;
2915 }
2916
2917 int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
2918 u32 table_id, struct fib_info *fi,
2919 int *fa_index, int fa_start, unsigned int flags)
2920 {
2921 struct net *net = sock_net(cb->skb->sk);
2922 int nhsel, genid = fnhe_genid(net);
2923
2924 for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) {
2925 struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel);
2926 struct fnhe_hash_bucket *bucket;
2927 int err;
2928
2929 if (nhc->nhc_flags & RTNH_F_DEAD)
2930 continue;
2931
2932 rcu_read_lock();
2933 bucket = rcu_dereference(nhc->nhc_exceptions);
2934 err = 0;
2935 if (bucket)
2936 err = fnhe_dump_bucket(net, skb, cb, table_id, bucket,
2937 genid, fa_index, fa_start,
2938 flags);
2939 rcu_read_unlock();
2940 if (err)
2941 return err;
2942 }
2943
2944 return 0;
2945 }
2946
2947 static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
2948 u8 ip_proto, __be16 sport,
2949 __be16 dport)
2950 {
2951 struct sk_buff *skb;
2952 struct iphdr *iph;
2953
2954 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2955 if (!skb)
2956 return NULL;
2957
2958 /* Reserve room for dummy headers; this skb can pass
2959 * through a good chunk of the routing engine.
2960 */
2961 skb_reset_mac_header(skb);
2962 skb_reset_network_header(skb);
2963 skb->protocol = htons(ETH_P_IP);
2964 iph = skb_put(skb, sizeof(struct iphdr));
2965 iph->protocol = ip_proto;
2966 iph->saddr = src;
2967 iph->daddr = dst;
2968 iph->version = 0x4;
2969 iph->frag_off = 0;
2970 iph->ihl = 0x5;
2971 skb_set_transport_header(skb, skb->len);
2972
2973 switch (iph->protocol) {
2974 case IPPROTO_UDP: {
2975 struct udphdr *udph;
2976
2977 udph = skb_put_zero(skb, sizeof(struct udphdr));
2978 udph->source = sport;
2979 udph->dest = dport;
2980 udph->len = sizeof(struct udphdr);
2981 udph->check = 0;
2982 break;
2983 }
2984 case IPPROTO_TCP: {
2985 struct tcphdr *tcph;
2986
2987 tcph = skb_put_zero(skb, sizeof(struct tcphdr));
2988 tcph->source = sport;
2989 tcph->dest = dport;
2990 tcph->doff = sizeof(struct tcphdr) / 4;
2991 tcph->rst = 1;
2992 tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
2993 src, dst, 0);
2994 break;
2995 }
2996 case IPPROTO_ICMP: {
2997 struct icmphdr *icmph;
2998
2999 icmph = skb_put_zero(skb, sizeof(struct icmphdr));
3000 icmph->type = ICMP_ECHO;
3001 icmph->code = 0;
3002 }
3003 }
3004
3005 return skb;
3006 }
3007
3008 static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
3009 const struct nlmsghdr *nlh,
3010 struct nlattr **tb,
3011 struct netlink_ext_ack *extack)
3012 {
3013 struct rtmsg *rtm;
3014 int i, err;
3015
3016 if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
3017 NL_SET_ERR_MSG(extack,
3018 "ipv4: Invalid header for route get request");
3019 return -EINVAL;
3020 }
3021
3022 if (!netlink_strict_get_check(skb))
3023 return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
3024 rtm_ipv4_policy, extack);
3025
3026 rtm = nlmsg_data(nlh);
3027 if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
3028 (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
3029 rtm->rtm_table || rtm->rtm_protocol ||
3030 rtm->rtm_scope || rtm->rtm_type) {
3031 NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request");
3032 return -EINVAL;
3033 }
3034
3035 if (rtm->rtm_flags & ~(RTM_F_NOTIFY |
3036 RTM_F_LOOKUP_TABLE |
3037 RTM_F_FIB_MATCH)) {
3038 NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request");
3039 return -EINVAL;
3040 }
3041
3042 err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
3043 rtm_ipv4_policy, extack);
3044 if (err)
3045 return err;
3046
3047 if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
3048 (tb[RTA_DST] && !rtm->rtm_dst_len)) {
3049 NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4");
3050 return -EINVAL;
3051 }
3052
3053 for (i = 0; i <= RTA_MAX; i++) {
3054 if (!tb[i])
3055 continue;
3056
3057 switch (i) {
3058 case RTA_IIF:
3059 case RTA_OIF:
3060 case RTA_SRC:
3061 case RTA_DST:
3062 case RTA_IP_PROTO:
3063 case RTA_SPORT:
3064 case RTA_DPORT:
3065 case RTA_MARK:
3066 case RTA_UID:
3067 break;
3068 default:
3069 NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request");
3070 return -EINVAL;
3071 }
3072 }
3073
3074 return 0;
3075 }
3076
3077 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
3078 struct netlink_ext_ack *extack)
3079 {
3080 struct net *net = sock_net(in_skb->sk);
3081 struct nlattr *tb[RTA_MAX+1];
3082 u32 table_id = RT_TABLE_MAIN;
3083 __be16 sport = 0, dport = 0;
3084 struct fib_result res = {};
3085 u8 ip_proto = IPPROTO_UDP;
3086 struct rtable *rt = NULL;
3087 struct sk_buff *skb;
3088 struct rtmsg *rtm;
3089 struct flowi4 fl4 = {};
3090 __be32 dst = 0;
3091 __be32 src = 0;
3092 kuid_t uid;
3093 u32 iif;
3094 int err;
3095 int mark;
3096
3097 err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
3098 if (err < 0)
3099 return err;
3100
3101 rtm = nlmsg_data(nlh);
3102 src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
3103 dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
3104 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3105 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3106 if (tb[RTA_UID])
3107 uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
3108 else
3109 uid = (iif ? INVALID_UID : current_uid());
3110
3111 if (tb[RTA_IP_PROTO]) {
3112 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
3113 &ip_proto, AF_INET, extack);
3114 if (err)
3115 return err;
3116 }
3117
3118 if (tb[RTA_SPORT])
3119 sport = nla_get_be16(tb[RTA_SPORT]);
3120
3121 if (tb[RTA_DPORT])
3122 dport = nla_get_be16(tb[RTA_DPORT]);
3123
3124 skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
3125 if (!skb)
3126 return -ENOBUFS;
3127
3128 fl4.daddr = dst;
3129 fl4.saddr = src;
3130 fl4.flowi4_tos = rtm->rtm_tos;
3131 fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
3132 fl4.flowi4_mark = mark;
3133 fl4.flowi4_uid = uid;
3134 if (sport)
3135 fl4.fl4_sport = sport;
3136 if (dport)
3137 fl4.fl4_dport = dport;
3138 fl4.flowi4_proto = ip_proto;
3139
3140 rcu_read_lock();
3141
3142 if (iif) {
3143 struct net_device *dev;
3144
3145 dev = dev_get_by_index_rcu(net, iif);
3146 if (!dev) {
3147 err = -ENODEV;
3148 goto errout_rcu;
3149 }
3150
3151 fl4.flowi4_iif = iif; /* for rt_fill_info */
3152 skb->dev = dev;
3153 skb->mark = mark;
3154 err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
3155 dev, &res);
3156
3157 rt = skb_rtable(skb);
3158 if (err == 0 && rt->dst.error)
3159 err = -rt->dst.error;
3160 } else {
3161 fl4.flowi4_iif = LOOPBACK_IFINDEX;
3162 skb->dev = net->loopback_dev;
3163 rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
3164 err = 0;
3165 if (IS_ERR(rt))
3166 err = PTR_ERR(rt);
3167 else
3168 skb_dst_set(skb, &rt->dst);
3169 }
3170
3171 if (err)
3172 goto errout_rcu;
3173
3174 if (rtm->rtm_flags & RTM_F_NOTIFY)
3175 rt->rt_flags |= RTCF_NOTIFY;
3176
3177 if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
3178 table_id = res.table ? res.table->tb_id : 0;
3179
3180 /* reset skb for netlink reply msg */
3181 skb_trim(skb, 0);
3182 skb_reset_network_header(skb);
3183 skb_reset_transport_header(skb);
3184 skb_reset_mac_header(skb);
3185
3186 if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
3187 if (!res.fi) {
3188 err = fib_props[res.type].error;
3189 if (!err)
3190 err = -EHOSTUNREACH;
3191 goto errout_rcu;
3192 }
3193 err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
3194 nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
3195 rt->rt_type, res.prefix, res.prefixlen,
3196 fl4.flowi4_tos, res.fi, 0);
3197 } else {
3198 err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
3199 NETLINK_CB(in_skb).portid,
3200 nlh->nlmsg_seq, 0);
3201 }
3202 if (err < 0)
3203 goto errout_rcu;
3204
3205 rcu_read_unlock();
3206
3207 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
3208
3209 errout_free:
3210 return err;
3211 errout_rcu:
3212 rcu_read_unlock();
3213 kfree_skb(skb);
3214 goto errout_free;
3215 }
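/* Editorial note: this doit handler backs RTM_GETROUTE requests, i.e. what
 * "ip route get" issues from user space. An illustrative session (output
 * format depends on the iproute2 version):
 *
 *   $ ip route get 192.0.2.1 from 198.51.100.7 iif eth0
 *
 * With iif set, the kernel exercises ip_route_input_rcu() above; without it,
 * the output resolver ip_route_output_key_hash_rcu() is used, exactly as the
 * branch on "iif" in the function shows.
 */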
3216
3217 void ip_rt_multicast_event(struct in_device *in_dev)
3218 {
3219 rt_cache_flush(dev_net(in_dev->dev));
3220 }
3221
3222 #ifdef CONFIG_SYSCTL
3223 static int ip_rt_gc_interval __read_mostly = 60 * HZ;
3224 static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
3225 static int ip_rt_gc_elasticity __read_mostly = 8;
3226 static int ip_min_valid_pmtu __read_mostly = IPV4_MIN_MTU;
3227
3228 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
3229 void __user *buffer,
3230 size_t *lenp, loff_t *ppos)
3231 {
3232 struct net *net = (struct net *)__ctl->extra1;
3233
3234 if (write) {
3235 rt_cache_flush(net);
3236 fnhe_genid_bump(net);
3237 return 0;
3238 }
3239
3240 return -EINVAL;
3241 }
3242
3243 static struct ctl_table ipv4_route_table[] = {
3244 {
3245 .procname = "gc_thresh",
3246 .data = &ipv4_dst_ops.gc_thresh,
3247 .maxlen = sizeof(int),
3248 .mode = 0644,
3249 .proc_handler = proc_dointvec,
3250 },
3251 {
3252 .procname = "max_size",
3253 .data = &ip_rt_max_size,
3254 .maxlen = sizeof(int),
3255 .mode = 0644,
3256 .proc_handler = proc_dointvec,
3257 },
3258 {
3259 /* Deprecated. Use gc_min_interval_ms */
3260
3261 .procname = "gc_min_interval",
3262 .data = &ip_rt_gc_min_interval,
3263 .maxlen = sizeof(int),
3264 .mode = 0644,
3265 .proc_handler = proc_dointvec_jiffies,
3266 },
3267 {
3268 .procname = "gc_min_interval_ms",
3269 .data = &ip_rt_gc_min_interval,
3270 .maxlen = sizeof(int),
3271 .mode = 0644,
3272 .proc_handler = proc_dointvec_ms_jiffies,
3273 },
3274 {
3275 .procname = "gc_timeout",
3276 .data = &ip_rt_gc_timeout,
3277 .maxlen = sizeof(int),
3278 .mode = 0644,
3279 .proc_handler = proc_dointvec_jiffies,
3280 },
3281 {
3282 .procname = "gc_interval",
3283 .data = &ip_rt_gc_interval,
3284 .maxlen = sizeof(int),
3285 .mode = 0644,
3286 .proc_handler = proc_dointvec_jiffies,
3287 },
3288 {
3289 .procname = "redirect_load",
3290 .data = &ip_rt_redirect_load,
3291 .maxlen = sizeof(int),
3292 .mode = 0644,
3293 .proc_handler = proc_dointvec,
3294 },
3295 {
3296 .procname = "redirect_number",
3297 .data = &ip_rt_redirect_number,
3298 .maxlen = sizeof(int),
3299 .mode = 0644,
3300 .proc_handler = proc_dointvec,
3301 },
3302 {
3303 .procname = "redirect_silence",
3304 .data = &ip_rt_redirect_silence,
3305 .maxlen = sizeof(int),
3306 .mode = 0644,
3307 .proc_handler = proc_dointvec,
3308 },
3309 {
3310 .procname = "error_cost",
3311 .data = &ip_rt_error_cost,
3312 .maxlen = sizeof(int),
3313 .mode = 0644,
3314 .proc_handler = proc_dointvec,
3315 },
3316 {
3317 .procname = "error_burst",
3318 .data = &ip_rt_error_burst,
3319 .maxlen = sizeof(int),
3320 .mode = 0644,
3321 .proc_handler = proc_dointvec,
3322 },
3323 {
3324 .procname = "gc_elasticity",
3325 .data = &ip_rt_gc_elasticity,
3326 .maxlen = sizeof(int),
3327 .mode = 0644,
3328 .proc_handler = proc_dointvec,
3329 },
3330 {
3331 .procname = "mtu_expires",
3332 .data = &ip_rt_mtu_expires,
3333 .maxlen = sizeof(int),
3334 .mode = 0644,
3335 .proc_handler = proc_dointvec_jiffies,
3336 },
3337 {
3338 .procname = "min_pmtu",
3339 .data = &ip_rt_min_pmtu,
3340 .maxlen = sizeof(int),
3341 .mode = 0644,
3342 .proc_handler = proc_dointvec_minmax,
3343 .extra1 = &ip_min_valid_pmtu,
3344 },
3345 {
3346 .procname = "min_adv_mss",
3347 .data = &ip_rt_min_advmss,
3348 .maxlen = sizeof(int),
3349 .mode = 0644,
3350 .proc_handler = proc_dointvec,
3351 },
3352 { }
3353 };
3354
3355 static const char ipv4_route_flush_procname[] = "flush";
3356
3357 static struct ctl_table ipv4_route_flush_table[] = {
3358 {
3359 .procname = ipv4_route_flush_procname,
3360 .maxlen = sizeof(int),
3361 .mode = 0200,
3362 .proc_handler = ipv4_sysctl_rtcache_flush,
3363 },
3364 { },
3365 };
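/* Editorial note (illustrative): the write-only "flush" entry defined above
 * is registered per netns below and appears as /proc/sys/net/ipv4/route/flush.
 * Writing any value invalidates cached routes and bumps the fnhe genid, per
 * ipv4_sysctl_rtcache_flush() above, e.g.:
 *
 *   # echo 1 > /proc/sys/net/ipv4/route/flush
 */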
3366
3367 static __net_init int sysctl_route_net_init(struct net *net)
3368 {
3369 struct ctl_table *tbl;
3370
3371 tbl = ipv4_route_flush_table;
3372 if (!net_eq(net, &init_net)) {
3373 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3374 if (!tbl)
3375 goto err_dup;
3376
3377 /* Don't export non-whitelisted sysctls to unprivileged users */
3378 if (net->user_ns != &init_user_ns) {
3379 if (tbl[0].procname != ipv4_route_flush_procname)
3380 tbl[0].procname = NULL;
3381 }
3382 }
3383 tbl[0].extra1 = net;
3384
3385 net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
3386 if (!net->ipv4.route_hdr)
3387 goto err_reg;
3388 return 0;
3389
3390 err_reg:
3391 if (tbl != ipv4_route_flush_table)
3392 kfree(tbl);
3393 err_dup:
3394 return -ENOMEM;
3395 }
3396
3397 static __net_exit void sysctl_route_net_exit(struct net *net)
3398 {
3399 struct ctl_table *tbl;
3400
3401 tbl = net->ipv4.route_hdr->ctl_table_arg;
3402 unregister_net_sysctl_table(net->ipv4.route_hdr);
3403 BUG_ON(tbl == ipv4_route_flush_table);
3404 kfree(tbl);
3405 }
3406
3407 static __net_initdata struct pernet_operations sysctl_route_ops = {
3408 .init = sysctl_route_net_init,
3409 .exit = sysctl_route_net_exit,
3410 };
3411 #endif
3412
3413 static __net_init int rt_genid_init(struct net *net)
3414 {
3415 atomic_set(&net->ipv4.rt_genid, 0);
3416 atomic_set(&net->fnhe_genid, 0);
3417 atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
3418 return 0;
3419 }
3420
3421 static __net_initdata struct pernet_operations rt_genid_ops = {
3422 .init = rt_genid_init,
3423 };
3424
3425 static int __net_init ipv4_inetpeer_init(struct net *net)
3426 {
3427 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3428
3429 if (!bp)
3430 return -ENOMEM;
3431 inet_peer_base_init(bp);
3432 net->ipv4.peers = bp;
3433 return 0;
3434 }
3435
3436 static void __net_exit ipv4_inetpeer_exit(struct net *net)
3437 {
3438 struct inet_peer_base *bp = net->ipv4.peers;
3439
3440 net->ipv4.peers = NULL;
3441 inetpeer_invalidate_tree(bp);
3442 kfree(bp);
3443 }
3444
3445 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3446 .init = ipv4_inetpeer_init,
3447 .exit = ipv4_inetpeer_exit,
3448 };
3449
3450 #ifdef CONFIG_IP_ROUTE_CLASSID
3451 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3452 #endif /* CONFIG_IP_ROUTE_CLASSID */
3453
3454 int __init ip_rt_init(void)
3455 {
3456 int cpu;
3457
3458 ip_idents = kmalloc_array(IP_IDENTS_SZ, sizeof(*ip_idents),
3459 GFP_KERNEL);
3460 if (!ip_idents)
3461 panic("IP: failed to allocate ip_idents\n");
3462
3463 prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
3464
3465 ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
3466 if (!ip_tstamps)
3467 panic("IP: failed to allocate ip_tstamps\n");
3468
3469 for_each_possible_cpu(cpu) {
3470 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3471
3472 INIT_LIST_HEAD(&ul->head);
3473 spin_lock_init(&ul->lock);
3474 }
3475 #ifdef CONFIG_IP_ROUTE_CLASSID
3476 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3477 if (!ip_rt_acct)
3478 panic("IP: failed to allocate ip_rt_acct\n");
3479 #endif
3480
3481 ipv4_dst_ops.kmem_cachep =
3482 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3483 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3484
3485 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3486
3487 if (dst_entries_init(&ipv4_dst_ops) < 0)
3488 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3489
3490 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3491 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3492
3493 ipv4_dst_ops.gc_thresh = ~0;
3494 ip_rt_max_size = INT_MAX;
3495
3496 devinet_init();
3497 ip_fib_init();
3498
3499 if (ip_rt_proc_init())
3500 pr_err("Unable to create route proc files\n");
3501 #ifdef CONFIG_XFRM
3502 xfrm_init();
3503 xfrm4_init();
3504 #endif
3505 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
3506 RTNL_FLAG_DOIT_UNLOCKED);
3507
3508 #ifdef CONFIG_SYSCTL
3509 register_pernet_subsys(&sysctl_route_ops);
3510 #endif
3511 register_pernet_subsys(&rt_genid_ops);
3512 register_pernet_subsys(&ipv4_inetpeer_ops);
3513 return 0;
3514 }
3515
3516 #ifdef CONFIG_SYSCTL
3517 /*
3518 * We really need to sanitize the damn ipv4 init order, then all
3519 * this nonsense will go away.
3520 */
3521 void __init ip_static_sysctl_init(void)
3522 {
3523 register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3524 }
3525 #endif
3526