1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  *	IPv6 output functions
4  *	Linux INET6 implementation
5  *
6  *	Authors:
7  *	Pedro Roque		<roque@di.fc.ul.pt>
8  *
9  *	Based on linux/net/ipv4/ip_output.c
10  *
11  *	Changes:
12  *	A.N.Kuznetsov	:	arithmetic in fragmentation.
13  *				extension headers are implemented.
14  *				route changes now work.
15  *				ip6_forward does not confuse sniffers.
16  *				etc.
17  *
18  *      H. von Brand    :       Added missing #include <linux/string.h>
19  *	Imran Patel	:	frag id should be in NBO
20  *      Kazunori MIYAZAWA @USAGI
21  *			:       add ip6_append_data and related functions
22  *				for datagram xmit
23  */
24 
25 #include <linux/errno.h>
26 #include <linux/kernel.h>
27 #include <linux/string.h>
28 #include <linux/socket.h>
29 #include <linux/net.h>
30 #include <linux/netdevice.h>
31 #include <linux/if_arp.h>
32 #include <linux/in6.h>
33 #include <linux/tcp.h>
34 #include <linux/route.h>
35 #include <linux/module.h>
36 #include <linux/slab.h>
37 
38 #include <linux/bpf-cgroup.h>
39 #include <linux/netfilter.h>
40 #include <linux/netfilter_ipv6.h>
41 
42 #include <net/sock.h>
43 #include <net/snmp.h>
44 
45 #include <net/ipv6.h>
46 #include <net/ndisc.h>
47 #include <net/protocol.h>
48 #include <net/ip6_route.h>
49 #include <net/addrconf.h>
50 #include <net/rawv6.h>
51 #include <net/icmp.h>
52 #include <net/xfrm.h>
53 #include <net/checksum.h>
54 #include <linux/mroute6.h>
55 #include <net/l3mdev.h>
56 #include <net/lwtunnel.h>
57 #include <net/ip_tunnels.h>
58 
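/*
 * Last step of the output path: make sure the skb has enough headroom
 * for the link-layer header, loop multicast back locally where needed,
 * honour lightweight-tunnel redirects, then resolve the nexthop
 * neighbour and hand the skb to neigh_output().
 */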
59 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
60 {
61 	struct dst_entry *dst = skb_dst(skb);
62 	struct net_device *dev = dst->dev;
63 	unsigned int hh_len = LL_RESERVED_SPACE(dev);
64 	int delta = hh_len - skb_headroom(skb);
65 	const struct in6_addr *nexthop;
66 	struct neighbour *neigh;
67 	int ret;
68 
69 	/* Be paranoid, rather than too clever. */
70 	if (unlikely(delta > 0) && dev->header_ops) {
71 		/* pskb_expand_head() might crash if skb is shared */
72 		if (skb_shared(skb)) {
73 			struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
74 
75 			if (likely(nskb)) {
76 				if (skb->sk)
77 					skb_set_owner_w(nskb, skb->sk);
78 				consume_skb(skb);
79 			} else {
80 				kfree_skb(skb);
81 			}
82 			skb = nskb;
83 		}
84 		if (skb &&
85 		    pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {
86 			kfree_skb(skb);
87 			skb = NULL;
88 		}
89 		if (!skb) {
90 			IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
91 			return -ENOMEM;
92 		}
93 	}
94 
95 	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
96 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
97 
98 		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
99 		    ((mroute6_is_socket(net, skb) &&
100 		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
101 		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
102 					 &ipv6_hdr(skb)->saddr))) {
103 			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
104 
105 			/* Do not check for IFF_ALLMULTI; multicast routing
106 			   is not supported in any case.
107 			 */
108 			if (newskb)
109 				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
110 					net, sk, newskb, NULL, newskb->dev,
111 					dev_loopback_xmit);
112 
113 			if (ipv6_hdr(skb)->hop_limit == 0) {
114 				IP6_INC_STATS(net, idev,
115 					      IPSTATS_MIB_OUTDISCARDS);
116 				kfree_skb(skb);
117 				return 0;
118 			}
119 		}
120 
121 		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
122 
123 		if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
124 		    IPV6_ADDR_SCOPE_NODELOCAL &&
125 		    !(dev->flags & IFF_LOOPBACK)) {
126 			kfree_skb(skb);
127 			return 0;
128 		}
129 	}
130 
131 	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
132 		int res = lwtunnel_xmit(skb);
133 
134 		if (res != LWTUNNEL_XMIT_CONTINUE)
135 			return res;
136 	}
137 
138 	rcu_read_lock_bh();
139 	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
140 	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
141 	if (unlikely(!neigh))
142 		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
143 	if (!IS_ERR(neigh)) {
144 		sock_confirm_neigh(skb, neigh);
145 		ret = neigh_output(neigh, skb, false);
146 		rcu_read_unlock_bh();
147 		return ret;
148 	}
149 	rcu_read_unlock_bh();
150 
151 	IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
152 	kfree_skb(skb);
153 	return -EINVAL;
154 }
155 
156 static int
157 ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
158 				    struct sk_buff *skb, unsigned int mtu)
159 {
160 	struct sk_buff *segs, *nskb;
161 	netdev_features_t features;
162 	int ret = 0;
163 
164 	/* Please see corresponding comment in ip_finish_output_gso
165 	 * describing the cases where GSO segment length exceeds the
166 	 * egress MTU.
167 	 */
168 	features = netif_skb_features(skb);
169 	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
170 	if (IS_ERR_OR_NULL(segs)) {
171 		kfree_skb(skb);
172 		return -ENOMEM;
173 	}
174 
175 	consume_skb(skb);
176 
177 	skb_list_walk_safe(segs, segs, nskb) {
178 		int err;
179 
180 		skb_mark_not_on_list(segs);
181 		/* Last GSO segment can be smaller than gso_size (and MTU).
182 		 * Adding a fragment header would produce an "atomic fragment",
183 		 * which is considered harmful (RFC-8021). Avoid that.
184 		 */
185 		err = segs->len > mtu ?
186 			ip6_fragment(net, sk, segs, ip6_finish_output2) :
187 			ip6_finish_output2(net, sk, segs);
188 		if (err && ret == 0)
189 			ret = err;
190 	}
191 
192 	return ret;
193 }
194 
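/*
 * Choose between plain transmit and fragmentation: non-GSO packets that
 * exceed the path MTU (or a smaller conntrack frag_max_size), and
 * packets on an allfrag destination, go through ip6_fragment();
 * everything else goes straight to ip6_finish_output2().
 */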
195 static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
196 {
197 	unsigned int mtu;
198 
199 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
200 	/* Policy lookup after SNAT yielded a new policy */
201 	if (skb_dst(skb)->xfrm) {
202 		IP6CB(skb)->flags |= IP6SKB_REROUTED;
203 		return dst_output(net, sk, skb);
204 	}
205 #endif
206 
207 	mtu = ip6_skb_dst_mtu(skb);
208 	if (skb_is_gso(skb) && !skb_gso_validate_network_len(skb, mtu))
209 		return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);
210 
211 	if ((skb->len > mtu && !skb_is_gso(skb)) ||
212 	    dst_allfrag(skb_dst(skb)) ||
213 	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
214 		return ip6_fragment(net, sk, skb, ip6_finish_output2);
215 	else
216 		return ip6_finish_output2(net, sk, skb);
217 }
218 
219 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
220 {
221 	int ret;
222 
223 	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
224 	switch (ret) {
225 	case NET_XMIT_SUCCESS:
226 		return __ip6_finish_output(net, sk, skb);
227 	case NET_XMIT_CN:
228 		return __ip6_finish_output(net, sk, skb) ? : ret;
229 	default:
230 		kfree_skb(skb);
231 		return ret;
232 	}
233 }
234 
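/*
 * dst_output() entry point for locally generated packets. Runs the
 * NF_INET_POST_ROUTING hook unless the packet was already rerouted by
 * the XFRM path above (IP6SKB_REROUTED).
 */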
235 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
236 {
237 	struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev;
238 	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
239 
240 	skb->protocol = htons(ETH_P_IPV6);
241 	skb->dev = dev;
242 
243 	if (unlikely(idev->cnf.disable_ipv6)) {
244 		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
245 		kfree_skb(skb);
246 		return 0;
247 	}
248 
249 	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
250 			    net, sk, skb, indev, dev,
251 			    ip6_finish_output,
252 			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
253 }
254 
255 bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
256 {
257 	if (!np->autoflowlabel_set)
258 		return ip6_default_np_autolabel(net);
259 	else
260 		return np->autoflowlabel;
261 }
262 
263 /*
264  * xmit an sk_buff (used by TCP, SCTP and DCCP)
265  *	Note: the socket lock is not held for SYNACK packets, but the skb
266  *	might be modified by calls to skb_set_owner_w() and ipv6_local_error(),
267  *	which use proper atomic operations or spinlocks.
268  */
269 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
270 	     __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
271 {
272 	struct net *net = sock_net(sk);
273 	const struct ipv6_pinfo *np = inet6_sk(sk);
274 	struct in6_addr *first_hop = &fl6->daddr;
275 	struct dst_entry *dst = skb_dst(skb);
276 	unsigned int head_room;
277 	struct ipv6hdr *hdr;
278 	u8  proto = fl6->flowi6_proto;
279 	int seg_len = skb->len;
280 	int hlimit = -1;
281 	u32 mtu;
282 
283 	head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
284 	if (opt)
285 		head_room += opt->opt_nflen + opt->opt_flen;
286 
287 	if (unlikely(skb_headroom(skb) < head_room)) {
288 		struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
289 		if (!skb2) {
290 			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
291 				      IPSTATS_MIB_OUTDISCARDS);
292 			kfree_skb(skb);
293 			return -ENOBUFS;
294 		}
295 		if (skb->sk)
296 			skb_set_owner_w(skb2, skb->sk);
297 		consume_skb(skb);
298 		skb = skb2;
299 	}
300 
301 	if (opt) {
302 		seg_len += opt->opt_nflen + opt->opt_flen;
303 
304 		if (opt->opt_flen)
305 			ipv6_push_frag_opts(skb, opt, &proto);
306 
307 		if (opt->opt_nflen)
308 			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
309 					     &fl6->saddr);
310 	}
311 
312 	skb_push(skb, sizeof(struct ipv6hdr));
313 	skb_reset_network_header(skb);
314 	hdr = ipv6_hdr(skb);
315 
316 	/*
317 	 *	Fill in the IPv6 header
318 	 */
319 	if (np)
320 		hlimit = np->hop_limit;
321 	if (hlimit < 0)
322 		hlimit = ip6_dst_hoplimit(dst);
323 
324 	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
325 				ip6_autoflowlabel(net, np), fl6));
326 
327 	hdr->payload_len = htons(seg_len);
328 	hdr->nexthdr = proto;
329 	hdr->hop_limit = hlimit;
330 
331 	hdr->saddr = fl6->saddr;
332 	hdr->daddr = *first_hop;
333 
334 	skb->protocol = htons(ETH_P_IPV6);
335 	skb->priority = priority;
336 	skb->mark = mark;
337 
338 	mtu = dst_mtu(dst);
339 	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
340 		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
341 			      IPSTATS_MIB_OUT, skb->len);
342 
343 		/* if the egress device is enslaved to an L3 master device, pass the
344 		 * skb to its handler for processing
345 		 */
346 		skb = l3mdev_ip6_out((struct sock *)sk, skb);
347 		if (unlikely(!skb))
348 			return 0;
349 
350 		/* hooks should never assume socket lock is held.
351 		 * we promote our socket to non const
352 		 */
353 		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
354 			       net, (struct sock *)sk, skb, NULL, dst->dev,
355 			       dst_output);
356 	}
357 
358 	skb->dev = dst->dev;
359 	/* ipv6_local_error() does not require socket lock,
360 	 * we promote our socket to non const
361 	 */
362 	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
363 
364 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
365 	kfree_skb(skb);
366 	return -EMSGSIZE;
367 }
368 EXPORT_SYMBOL(ip6_xmit);
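/*
 * A minimal usage sketch (hypothetical caller, not part of this file):
 * a connection-oriented protocol fills in a flowi6 for an already
 * routed skb and transmits the finished segment in one call, e.g.
 *
 *	err = ip6_xmit(sk, skb, &fl6, sk->sk_mark,
 *		       rcu_dereference(np->opt), np->tclass,
 *		       sk->sk_priority);
 *
 * TCP's inet6_csk_xmit() follows roughly this pattern.
 */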
369 
370 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
371 {
372 	struct ip6_ra_chain *ra;
373 	struct sock *last = NULL;
374 
375 	read_lock(&ip6_ra_lock);
376 	for (ra = ip6_ra_chain; ra; ra = ra->next) {
377 		struct sock *sk = ra->sk;
378 		if (sk && ra->sel == sel &&
379 		    (!sk->sk_bound_dev_if ||
380 		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
381 			struct ipv6_pinfo *np = inet6_sk(sk);
382 
383 			if (np && np->rtalert_isolate &&
384 			    !net_eq(sock_net(sk), dev_net(skb->dev))) {
385 				continue;
386 			}
387 			if (last) {
388 				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
389 				if (skb2)
390 					rawv6_rcv(last, skb2);
391 			}
392 			last = sk;
393 		}
394 	}
395 
396 	if (last) {
397 		rawv6_rcv(last, skb);
398 		read_unlock(&ip6_ra_lock);
399 		return 1;
400 	}
401 	read_unlock(&ip6_ra_lock);
402 	return 0;
403 }
404 
405 static int ip6_forward_proxy_check(struct sk_buff *skb)
406 {
407 	struct ipv6hdr *hdr = ipv6_hdr(skb);
408 	u8 nexthdr = hdr->nexthdr;
409 	__be16 frag_off;
410 	int offset;
411 
412 	if (ipv6_ext_hdr(nexthdr)) {
413 		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
414 		if (offset < 0)
415 			return 0;
416 	} else
417 		offset = sizeof(struct ipv6hdr);
418 
419 	if (nexthdr == IPPROTO_ICMPV6) {
420 		struct icmp6hdr *icmp6;
421 
422 		if (!pskb_may_pull(skb, (skb_network_header(skb) +
423 					 offset + 1 - skb->data)))
424 			return 0;
425 
426 		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
427 
428 		switch (icmp6->icmp6_type) {
429 		case NDISC_ROUTER_SOLICITATION:
430 		case NDISC_ROUTER_ADVERTISEMENT:
431 		case NDISC_NEIGHBOUR_SOLICITATION:
432 		case NDISC_NEIGHBOUR_ADVERTISEMENT:
433 		case NDISC_REDIRECT:
434 			/* A unicast neighbor discovery message destined to
435 			 * the proxied address must be passed to the input
436 			 * function.
437 			 */
438 			return 1;
439 		default:
440 			break;
441 		}
442 	}
443 
444 	/*
445 	 * The proxying router can't forward traffic sent to a link-local
446 	 * address, so signal the sender and discard the packet. This
447 	 * behavior is clarified by the MIPv6 specification.
448 	 */
449 	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
450 		dst_link_failure(skb);
451 		return -1;
452 	}
453 
454 	return 0;
455 }
456 
457 static inline int ip6_forward_finish(struct net *net, struct sock *sk,
458 				     struct sk_buff *skb)
459 {
460 	struct dst_entry *dst = skb_dst(skb);
461 
462 	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
463 	__IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
464 
465 #ifdef CONFIG_NET_SWITCHDEV
466 	if (skb->offload_l3_fwd_mark) {
467 		consume_skb(skb);
468 		return 0;
469 	}
470 #endif
471 
472 	skb->tstamp = 0;
473 	return dst_output(net, sk, skb);
474 }
475 
476 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
477 {
478 	if (skb->len <= mtu)
479 		return false;
480 
481 	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
482 	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
483 		return true;
484 
485 	if (skb->ignore_df)
486 		return false;
487 
488 	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
489 		return false;
490 
491 	return true;
492 }
493 
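/*
 * Forwarding path for packets not addressed to this host: checks that
 * forwarding is enabled, handles router-alert and NDP-proxy packets,
 * sends a redirect when the packet leaves through the interface it
 * arrived on, verifies the outgoing MTU, and finally decrements
 * hop_limit and runs the NF_INET_FORWARD hook.
 */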
494 int ip6_forward(struct sk_buff *skb)
495 {
496 	struct dst_entry *dst = skb_dst(skb);
497 	struct ipv6hdr *hdr = ipv6_hdr(skb);
498 	struct inet6_skb_parm *opt = IP6CB(skb);
499 	struct net *net = dev_net(dst->dev);
500 	struct inet6_dev *idev;
501 	u32 mtu;
502 
503 	idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
504 	if (net->ipv6.devconf_all->forwarding == 0)
505 		goto error;
506 
507 	if (skb->pkt_type != PACKET_HOST)
508 		goto drop;
509 
510 	if (unlikely(skb->sk))
511 		goto drop;
512 
513 	if (skb_warn_if_lro(skb))
514 		goto drop;
515 
516 	if (!net->ipv6.devconf_all->disable_policy &&
517 	    (!idev || !idev->cnf.disable_policy) &&
518 	    !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
519 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
520 		goto drop;
521 	}
522 
523 	skb_forward_csum(skb);
524 
525 	/*
526 	 *	We DO NOT make any processing on
527 	 *	RA packets, pushing them to user level AS IS
528  *	without any WARRANTY that application will be able
529 	 *	to interpret them. The reason is that we
530 	 *	cannot make anything clever here.
531 	 *
532 	 *	We are not end-node, so that if packet contains
533 	 *	AH/ESP, we cannot make anything.
534 	 *	Defragmentation also would be mistake, RA packets
535 	 *	cannot be fragmented, because there is no warranty
536 	 *	that different fragments will go along one path. --ANK
537 	 */
538 	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
539 		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
540 			return 0;
541 	}
542 
543 	/*
544 	 *	check and decrement ttl
545 	 */
546 	if (hdr->hop_limit <= 1) {
547 		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
548 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
549 
550 		kfree_skb(skb);
551 		return -ETIMEDOUT;
552 	}
553 
554 	/* XXX: idev->cnf.proxy_ndp? */
555 	if (net->ipv6.devconf_all->proxy_ndp &&
556 	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
557 		int proxied = ip6_forward_proxy_check(skb);
558 		if (proxied > 0)
559 			return ip6_input(skb);
560 		else if (proxied < 0) {
561 			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
562 			goto drop;
563 		}
564 	}
565 
566 	if (!xfrm6_route_forward(skb)) {
567 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
568 		goto drop;
569 	}
570 	dst = skb_dst(skb);
571 
572 	/* IPv6 specs say nothing about it, but it is clear that we cannot
573 	   send redirects to source routed frames.
574 	   We don't send redirects to frames decapsulated from IPsec.
575 	 */
576 	if (IP6CB(skb)->iif == dst->dev->ifindex &&
577 	    opt->srcrt == 0 && !skb_sec_path(skb)) {
578 		struct in6_addr *target = NULL;
579 		struct inet_peer *peer;
580 		struct rt6_info *rt;
581 
582 		/*
583 		 *	incoming and outgoing devices are the same
584 		 *	send a redirect.
585 		 */
586 
587 		rt = (struct rt6_info *) dst;
588 		if (rt->rt6i_flags & RTF_GATEWAY)
589 			target = &rt->rt6i_gateway;
590 		else
591 			target = &hdr->daddr;
592 
593 		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
594 
595 		/* Limit redirects both by destination (here)
596 		   and by source (inside ndisc_send_redirect)
597 		 */
598 		if (inet_peer_xrlim_allow(peer, 1*HZ))
599 			ndisc_send_redirect(skb, target);
600 		if (peer)
601 			inet_putpeer(peer);
602 	} else {
603 		int addrtype = ipv6_addr_type(&hdr->saddr);
604 
605 		/* This check is security critical. */
606 		if (addrtype == IPV6_ADDR_ANY ||
607 		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
608 			goto error;
609 		if (addrtype & IPV6_ADDR_LINKLOCAL) {
610 			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
611 				    ICMPV6_NOT_NEIGHBOUR, 0);
612 			goto error;
613 		}
614 	}
615 
616 	mtu = ip6_dst_mtu_forward(dst);
617 	if (mtu < IPV6_MIN_MTU)
618 		mtu = IPV6_MIN_MTU;
619 
620 	if (ip6_pkt_too_big(skb, mtu)) {
621 		/* Again, force OUTPUT device used as source address */
622 		skb->dev = dst->dev;
623 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
624 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
625 		__IP6_INC_STATS(net, ip6_dst_idev(dst),
626 				IPSTATS_MIB_FRAGFAILS);
627 		kfree_skb(skb);
628 		return -EMSGSIZE;
629 	}
630 
631 	if (skb_cow(skb, dst->dev->hard_header_len)) {
632 		__IP6_INC_STATS(net, ip6_dst_idev(dst),
633 				IPSTATS_MIB_OUTDISCARDS);
634 		goto drop;
635 	}
636 
637 	hdr = ipv6_hdr(skb);
638 
639 	/* Mangling hops number delayed to point after skb COW */
640 
641 	hdr->hop_limit--;
642 
643 	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
644 		       net, NULL, skb, skb->dev, dst->dev,
645 		       ip6_forward_finish);
646 
647 error:
648 	__IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
649 drop:
650 	kfree_skb(skb);
651 	return -EINVAL;
652 }
653 
654 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
655 {
656 	to->pkt_type = from->pkt_type;
657 	to->priority = from->priority;
658 	to->protocol = from->protocol;
659 	skb_dst_drop(to);
660 	skb_dst_set(to, dst_clone(skb_dst(from)));
661 	to->dev = from->dev;
662 	to->mark = from->mark;
663 
664 	skb_copy_hash(to, from);
665 
666 #ifdef CONFIG_NET_SCHED
667 	to->tc_index = from->tc_index;
668 #endif
669 	nf_copy(to, from);
670 	skb_ext_copy(to, from);
671 	skb_copy_secmark(to, from);
672 }
673 
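/*
 * Fast-path fragmentation helpers: when the skb already carries a
 * frag_list of suitably sized chunks, ip6_fraglist_init() turns the head
 * skb into the first fragment (inserting a fragment header) and
 * ip6_fraglist_prepare() fixes up each following chunk in turn.
 */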
674 int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
675 		      u8 nexthdr, __be32 frag_id,
676 		      struct ip6_fraglist_iter *iter)
677 {
678 	unsigned int first_len;
679 	struct frag_hdr *fh;
680 
681 	/* BUILD HEADER */
682 	*prevhdr = NEXTHDR_FRAGMENT;
683 	iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
684 	if (!iter->tmp_hdr)
685 		return -ENOMEM;
686 
687 	iter->frag = skb_shinfo(skb)->frag_list;
688 	skb_frag_list_init(skb);
689 
690 	iter->offset = 0;
691 	iter->hlen = hlen;
692 	iter->frag_id = frag_id;
693 	iter->nexthdr = nexthdr;
694 
695 	__skb_pull(skb, hlen);
696 	fh = __skb_push(skb, sizeof(struct frag_hdr));
697 	__skb_push(skb, hlen);
698 	skb_reset_network_header(skb);
699 	memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);
700 
701 	fh->nexthdr = nexthdr;
702 	fh->reserved = 0;
703 	fh->frag_off = htons(IP6_MF);
704 	fh->identification = frag_id;
705 
706 	first_len = skb_pagelen(skb);
707 	skb->data_len = first_len - skb_headlen(skb);
708 	skb->len = first_len;
709 	ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));
710 
711 	return 0;
712 }
713 EXPORT_SYMBOL(ip6_fraglist_init);
714 
715 void ip6_fraglist_prepare(struct sk_buff *skb,
716 			  struct ip6_fraglist_iter *iter)
717 {
718 	struct sk_buff *frag = iter->frag;
719 	unsigned int hlen = iter->hlen;
720 	struct frag_hdr *fh;
721 
722 	frag->ip_summed = CHECKSUM_NONE;
723 	skb_reset_transport_header(frag);
724 	fh = __skb_push(frag, sizeof(struct frag_hdr));
725 	__skb_push(frag, hlen);
726 	skb_reset_network_header(frag);
727 	memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
728 	iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
729 	fh->nexthdr = iter->nexthdr;
730 	fh->reserved = 0;
731 	fh->frag_off = htons(iter->offset);
732 	if (frag->next)
733 		fh->frag_off |= htons(IP6_MF);
734 	fh->identification = iter->frag_id;
735 	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
736 	ip6_copy_metadata(frag, skb);
737 }
738 EXPORT_SYMBOL(ip6_fraglist_prepare);
739 
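/*
 * Slow-path fragmentation state: ip6_frag_init() records the geometry
 * (payload left, per-fragment MTU, head/tail room) and ip6_frag_next()
 * then allocates and fills one fragment per call until state->left
 * reaches zero.
 */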
740 void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
741 		   unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
742 		   u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
743 {
744 	state->prevhdr = prevhdr;
745 	state->nexthdr = nexthdr;
746 	state->frag_id = frag_id;
747 
748 	state->hlen = hlen;
749 	state->mtu = mtu;
750 
751 	state->left = skb->len - hlen;	/* Space per frame */
752 	state->ptr = hlen;		/* Where to start from */
753 
754 	state->hroom = hdr_room;
755 	state->troom = needed_tailroom;
756 
757 	state->offset = 0;
758 }
759 EXPORT_SYMBOL(ip6_frag_init);
760 
761 struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
762 {
763 	u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
764 	struct sk_buff *frag;
765 	struct frag_hdr *fh;
766 	unsigned int len;
767 
768 	len = state->left;
769 	/* IF: it doesn't fit, use 'mtu' - the data space left */
770 	if (len > state->mtu)
771 		len = state->mtu;
772 	/* IF: we are not sending up to and including the packet end
773 	   then align the next start on an eight byte boundary */
774 	if (len < state->left)
775 		len &= ~7;
776 
777 	/* Allocate buffer */
778 	frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
779 			 state->hroom + state->troom, GFP_ATOMIC);
780 	if (!frag)
781 		return ERR_PTR(-ENOMEM);
782 
783 	/*
784 	 *	Set up data on packet
785 	 */
786 
787 	ip6_copy_metadata(frag, skb);
788 	skb_reserve(frag, state->hroom);
789 	skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
790 	skb_reset_network_header(frag);
791 	fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
792 	frag->transport_header = (frag->network_header + state->hlen +
793 				  sizeof(struct frag_hdr));
794 
795 	/*
796 	 *	Charge the memory for the fragment to any owner
797 	 *	it might possess
798 	 */
799 	if (skb->sk)
800 		skb_set_owner_w(frag, skb->sk);
801 
802 	/*
803 	 *	Copy the packet header into the new buffer.
804 	 */
805 	skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);
806 
807 	fragnexthdr_offset = skb_network_header(frag);
808 	fragnexthdr_offset += prevhdr - skb_network_header(skb);
809 	*fragnexthdr_offset = NEXTHDR_FRAGMENT;
810 
811 	/*
812 	 *	Build fragment header.
813 	 */
814 	fh->nexthdr = state->nexthdr;
815 	fh->reserved = 0;
816 	fh->identification = state->frag_id;
817 
818 	/*
819 	 *	Copy a block of the IP datagram.
820 	 */
821 	BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
822 			     len));
823 	state->left -= len;
824 
825 	fh->frag_off = htons(state->offset);
826 	if (state->left > 0)
827 		fh->frag_off |= htons(IP6_MF);
828 	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
829 
830 	state->ptr += len;
831 	state->offset += len;
832 
833 	return frag;
834 }
835 EXPORT_SYMBOL(ip6_frag_next);
836 
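/*
 * Fragment an oversized skb and feed each fragment to @output. The
 * fast path reuses an existing frag_list when its geometry fits;
 * otherwise (shared or cloned skbs, bad sizes) the slow path copies the
 * payload into freshly allocated fragments.
 */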
837 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
838 		 int (*output)(struct net *, struct sock *, struct sk_buff *))
839 {
840 	struct sk_buff *frag;
841 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
842 	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
843 				inet6_sk(skb->sk) : NULL;
844 	struct ip6_frag_state state;
845 	unsigned int mtu, hlen, nexthdr_offset;
846 	ktime_t tstamp = skb->tstamp;
847 	int hroom, err = 0;
848 	__be32 frag_id;
849 	u8 *prevhdr, nexthdr = 0;
850 
851 	err = ip6_find_1stfragopt(skb, &prevhdr);
852 	if (err < 0)
853 		goto fail;
854 	hlen = err;
855 	nexthdr = *prevhdr;
856 	nexthdr_offset = prevhdr - skb_network_header(skb);
857 
858 	mtu = ip6_skb_dst_mtu(skb);
859 
860 	/* We must not fragment if the socket is set to force MTU discovery
861 	 * or if the skb is not generated by a local socket.
862 	 */
863 	if (unlikely(!skb->ignore_df && skb->len > mtu))
864 		goto fail_toobig;
865 
866 	if (IP6CB(skb)->frag_max_size) {
867 		if (IP6CB(skb)->frag_max_size > mtu)
868 			goto fail_toobig;
869 
870 		/* don't send fragments larger than what we received */
871 		mtu = IP6CB(skb)->frag_max_size;
872 		if (mtu < IPV6_MIN_MTU)
873 			mtu = IPV6_MIN_MTU;
874 	}
875 
876 	if (np && np->frag_size < mtu) {
877 		if (np->frag_size)
878 			mtu = np->frag_size;
879 	}
880 	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
881 		goto fail_toobig;
882 	mtu -= hlen + sizeof(struct frag_hdr);
883 
884 	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
885 				    &ipv6_hdr(skb)->saddr);
886 
887 	if (skb->ip_summed == CHECKSUM_PARTIAL &&
888 	    (err = skb_checksum_help(skb)))
889 		goto fail;
890 
891 	prevhdr = skb_network_header(skb) + nexthdr_offset;
892 	hroom = LL_RESERVED_SPACE(rt->dst.dev);
893 	if (skb_has_frag_list(skb)) {
894 		unsigned int first_len = skb_pagelen(skb);
895 		struct ip6_fraglist_iter iter;
896 		struct sk_buff *frag2;
897 
898 		if (first_len - hlen > mtu ||
899 		    ((first_len - hlen) & 7) ||
900 		    skb_cloned(skb) ||
901 		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
902 			goto slow_path;
903 
904 		skb_walk_frags(skb, frag) {
905 			/* Correct geometry. */
906 			if (frag->len > mtu ||
907 			    ((frag->len & 7) && frag->next) ||
908 			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
909 				goto slow_path_clean;
910 
911 			/* Partially cloned skb? */
912 			if (skb_shared(frag))
913 				goto slow_path_clean;
914 
915 			BUG_ON(frag->sk);
916 			if (skb->sk) {
917 				frag->sk = skb->sk;
918 				frag->destructor = sock_wfree;
919 			}
920 			skb->truesize -= frag->truesize;
921 		}
922 
923 		err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
924 					&iter);
925 		if (err < 0)
926 			goto fail;
927 
928 		/* We prevent @rt from being freed. */
929 		rcu_read_lock();
930 
931 		for (;;) {
932 			/* Prepare the header of the next frame,
933 			 * before the previous one is sent. */
934 			if (iter.frag)
935 				ip6_fraglist_prepare(skb, &iter);
936 
937 			skb->tstamp = tstamp;
938 			err = output(net, sk, skb);
939 			if (!err)
940 				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
941 					      IPSTATS_MIB_FRAGCREATES);
942 
943 			if (err || !iter.frag)
944 				break;
945 
946 			skb = ip6_fraglist_next(&iter);
947 		}
948 
949 		kfree(iter.tmp_hdr);
950 
951 		if (err == 0) {
952 			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
953 				      IPSTATS_MIB_FRAGOKS);
954 			rcu_read_unlock();
955 			return 0;
956 		}
957 
958 		kfree_skb_list(iter.frag);
959 
960 		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
961 			      IPSTATS_MIB_FRAGFAILS);
962 		rcu_read_unlock();
963 		return err;
964 
965 slow_path_clean:
966 		skb_walk_frags(skb, frag2) {
967 			if (frag2 == frag)
968 				break;
969 			frag2->sk = NULL;
970 			frag2->destructor = NULL;
971 			skb->truesize += frag2->truesize;
972 		}
973 	}
974 
975 slow_path:
976 	/*
977 	 *	Fragment the datagram.
978 	 */
979 
980 	ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
981 		      LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
982 		      &state);
983 
984 	/*
985 	 *	Keep copying data until we run out.
986 	 */
987 
988 	while (state.left > 0) {
989 		frag = ip6_frag_next(skb, &state);
990 		if (IS_ERR(frag)) {
991 			err = PTR_ERR(frag);
992 			goto fail;
993 		}
994 
995 		/*
996 		 *	Put this fragment into the sending queue.
997 		 */
998 		frag->tstamp = tstamp;
999 		err = output(net, sk, frag);
1000 		if (err)
1001 			goto fail;
1002 
1003 		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1004 			      IPSTATS_MIB_FRAGCREATES);
1005 	}
1006 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1007 		      IPSTATS_MIB_FRAGOKS);
1008 	consume_skb(skb);
1009 	return err;
1010 
1011 fail_toobig:
1012 	if (skb->sk && dst_allfrag(skb_dst(skb)))
1013 		sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
1014 
1015 	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
1016 	err = -EMSGSIZE;
1017 
1018 fail:
1019 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1020 		      IPSTATS_MIB_FRAGFAILS);
1021 	kfree_skb(skb);
1022 	return err;
1023 }
1024 
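/*
 * Socket dst-cache validation: ip6_rt_check() reports a mismatch
 * between a cached route key and the flow's address, and
 * ip6_sk_dst_check() releases the cached dst when the destination,
 * source subtree or outgoing interface no longer matches.
 */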
1025 static inline int ip6_rt_check(const struct rt6key *rt_key,
1026 			       const struct in6_addr *fl_addr,
1027 			       const struct in6_addr *addr_cache)
1028 {
1029 	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
1030 		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
1031 }
1032 
1033 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
1034 					  struct dst_entry *dst,
1035 					  const struct flowi6 *fl6)
1036 {
1037 	struct ipv6_pinfo *np = inet6_sk(sk);
1038 	struct rt6_info *rt;
1039 
1040 	if (!dst)
1041 		goto out;
1042 
1043 	if (dst->ops->family != AF_INET6) {
1044 		dst_release(dst);
1045 		return NULL;
1046 	}
1047 
1048 	rt = (struct rt6_info *)dst;
1049 	/* Yes, checking route validity in the unconnected
1050 	 * case is not very simple. Take into account
1051 	 * that we do not support routing by source, TOS,
1052 	 * or MSG_DONTROUTE		--ANK (980726)
1053 	 *
1054 	 * 1. ip6_rt_check(): If the route was a host route,
1055 	 *    check that the cached destination is current.
1056 	 *    If it is a network route, we may still
1057 	 *    check its validity using a saved pointer
1058 	 *    to the last used address: daddr_cache.
1059 	 *    We do not want to save the whole address now
1060 	 *    (because the main consumer of this service
1061 	 *    is TCP, which does not have this problem),
1062 	 *    so the last trick works only on connected
1063 	 *    sockets.
1064 	 * 2. oif also should be the same.
1065 	 */
1066 	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
1067 #ifdef CONFIG_IPV6_SUBTREES
1068 	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
1069 #endif
1070 	   (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
1071 	      (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
1072 		dst_release(dst);
1073 		dst = NULL;
1074 	}
1075 
1076 out:
1077 	return dst;
1078 }
1079 
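/*
 * Common tail of the dst lookup helpers below: selects a source address
 * when the flow has none, performs the route lookup, and (with
 * optimistic DAD) may redo the lookup towards the default router when
 * the resolved nexthop neighbour is not yet valid.
 */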
1080 static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
1081 			       struct dst_entry **dst, struct flowi6 *fl6)
1082 {
1083 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1084 	struct neighbour *n;
1085 	struct rt6_info *rt;
1086 #endif
1087 	int err;
1088 	int flags = 0;
1089 
1090 	/* The correct way to handle this would be to do
1091 	 * ip6_route_get_saddr, and then ip6_route_output; however,
1092 	 * the route-specific preferred source forces the
1093 	 * ip6_route_output call _before_ ip6_route_get_saddr.
1094 	 *
1095 	 * In source specific routing (no src=any default route),
1096 	 * ip6_route_output will fail given src=any saddr, though, so
1097 	 * that's why we try it again later.
1098 	 */
1099 	if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
1100 		struct fib6_info *from;
1101 		struct rt6_info *rt;
1102 		bool had_dst = *dst != NULL;
1103 
1104 		if (!had_dst)
1105 			*dst = ip6_route_output(net, sk, fl6);
1106 		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
1107 
1108 		rcu_read_lock();
1109 		from = rt ? rcu_dereference(rt->from) : NULL;
1110 		err = ip6_route_get_saddr(net, from, &fl6->daddr,
1111 					  sk ? inet6_sk(sk)->srcprefs : 0,
1112 					  &fl6->saddr);
1113 		rcu_read_unlock();
1114 
1115 		if (err)
1116 			goto out_err_release;
1117 
1118 		/* If we had an erroneous initial result, pretend it
1119 		 * never existed and let the SA-enabled version take
1120 		 * over.
1121 		 */
1122 		if (!had_dst && (*dst)->error) {
1123 			dst_release(*dst);
1124 			*dst = NULL;
1125 		}
1126 
1127 		if (fl6->flowi6_oif)
1128 			flags |= RT6_LOOKUP_F_IFACE;
1129 	}
1130 
1131 	if (!*dst)
1132 		*dst = ip6_route_output_flags(net, sk, fl6, flags);
1133 
1134 	err = (*dst)->error;
1135 	if (err)
1136 		goto out_err_release;
1137 
1138 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1139 	/*
1140 	 * Here if the dst entry we've looked up
1141 	 * has a neighbour entry that is in the INCOMPLETE
1142 	 * state and the src address from the flow is
1143 	 * marked as OPTIMISTIC, we release the found
1144 	 * dst entry and replace it instead with the
1145 	 * dst entry of the nexthop router
1146 	 */
1147 	rt = (struct rt6_info *) *dst;
1148 	rcu_read_lock_bh();
1149 	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
1150 				      rt6_nexthop(rt, &fl6->daddr));
1151 	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
1152 	rcu_read_unlock_bh();
1153 
1154 	if (err) {
1155 		struct inet6_ifaddr *ifp;
1156 		struct flowi6 fl_gw6;
1157 		int redirect;
1158 
1159 		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
1160 				      (*dst)->dev, 1);
1161 
1162 		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
1163 		if (ifp)
1164 			in6_ifa_put(ifp);
1165 
1166 		if (redirect) {
1167 			/*
1168 			 * We need to get the dst entry for the
1169 			 * default router instead
1170 			 */
1171 			dst_release(*dst);
1172 			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1173 			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1174 			*dst = ip6_route_output(net, sk, &fl_gw6);
1175 			err = (*dst)->error;
1176 			if (err)
1177 				goto out_err_release;
1178 		}
1179 	}
1180 #endif
1181 	if (ipv6_addr_v4mapped(&fl6->saddr) &&
1182 	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
1183 		err = -EAFNOSUPPORT;
1184 		goto out_err_release;
1185 	}
1186 
1187 	return 0;
1188 
1189 out_err_release:
1190 	dst_release(*dst);
1191 	*dst = NULL;
1192 
1193 	if (err == -ENETUNREACH)
1194 		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1195 	return err;
1196 }
1197 
1198 /**
1199  *	ip6_dst_lookup - perform route lookup on flow
1200  *	@net: Network namespace to perform lookup in
1201  *	@sk: socket which provides route info
1202  *	@dst: pointer to dst_entry * for result
1203  *	@fl6: flow to lookup
1204  *
1205  *	This function performs a route lookup on the given flow.
1206  *
1207  *	It returns zero on success, or a standard errno code on error.
1208  */
1209 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1210 		   struct flowi6 *fl6)
1211 {
1212 	*dst = NULL;
1213 	return ip6_dst_lookup_tail(net, sk, dst, fl6);
1214 }
1215 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1216 
1217 /**
1218  *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1219  *	@net: Network namespace to perform lookup in
1220  *	@sk: socket which provides route info
1221  *	@fl6: flow to lookup
1222  *	@final_dst: final destination address for ipsec lookup
1223  *
1224  *	This function performs a route lookup on the given flow.
1225  *
1226  *	It returns a valid dst pointer on success, or a pointer encoded
1227  *	error code.
1228  */
1229 struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
1230 				      const struct in6_addr *final_dst)
1231 {
1232 	struct dst_entry *dst = NULL;
1233 	int err;
1234 
1235 	err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
1236 	if (err)
1237 		return ERR_PTR(err);
1238 	if (final_dst)
1239 		fl6->daddr = *final_dst;
1240 
1241 	return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
1242 }
1243 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
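/*
 * A minimal usage sketch (hypothetical caller): resolve the route for a
 * flow before building packets, e.g.
 *
 *	dst = ip6_dst_lookup_flow(net, sk, &fl6, final_p);
 *	if (IS_ERR(dst))
 *		return PTR_ERR(dst);
 *
 * The returned dst has already been through xfrm_lookup_route(), so it
 * reflects any matching IPsec policy.
 */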
1244 
1245 /**
1246  *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1247  *	@sk: socket which provides the dst cache and route info
1248  *	@fl6: flow to lookup
1249  *	@final_dst: final destination address for ipsec lookup
1250  *	@connected: whether @sk is connected or not
1251  *
1252  *	This function performs a route lookup on the given flow with the
1253  *	possibility of using the cached route in the socket if it is valid.
1254  *	It will take the socket dst lock when operating on the dst cache.
1255  *	As a result, this function can only be used in process context.
1256  *
1257  *	In addition, for a connected socket, cache the dst in the socket
1258  *	if the current cache is not valid.
1259  *
1260  *	It returns a valid dst pointer on success, or a pointer encoded
1261  *	error code.
1262  */
1263 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1264 					 const struct in6_addr *final_dst,
1265 					 bool connected)
1266 {
1267 	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1268 
1269 	dst = ip6_sk_dst_check(sk, dst, fl6);
1270 	if (dst)
1271 		return dst;
1272 
1273 	dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
1274 	if (connected && !IS_ERR(dst))
1275 		ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);
1276 
1277 	return dst;
1278 }
1279 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1280 
1281 /**
1282  *      ip6_dst_lookup_tunnel - perform route lookup on tunnel
1283  *      @skb: Packet for which lookup is done
1284  *      @dev: Tunnel device
1285  *      @net: Network namespace of tunnel device
1286  *      @sock: Socket which provides route info
1287  *      @saddr: Memory to store the src ip address
1288  *      @info: Tunnel information
1289  *      @protocol: IP protocol
1290  *      @use_cache: Flag to enable cache usage
1291  *      This function performs a route lookup on a tunnel
1292  *
1293  *      It returns a valid dst pointer and stores src address to be used in
1294  *      tunnel in param saddr on success, else a pointer encoded error code.
1295  */
1296 
1297 struct dst_entry *ip6_dst_lookup_tunnel(struct sk_buff *skb,
1298 					struct net_device *dev,
1299 					struct net *net,
1300 					struct socket *sock,
1301 					struct in6_addr *saddr,
1302 					const struct ip_tunnel_info *info,
1303 					u8 protocol,
1304 					bool use_cache)
1305 {
1306 	struct dst_entry *dst = NULL;
1307 #ifdef CONFIG_DST_CACHE
1308 	struct dst_cache *dst_cache;
1309 #endif
1310 	struct flowi6 fl6;
1311 	__u8 prio;
1312 
1313 #ifdef CONFIG_DST_CACHE
1314 	dst_cache = (struct dst_cache *)&info->dst_cache;
1315 	if (use_cache) {
1316 		dst = dst_cache_get_ip6(dst_cache, saddr);
1317 		if (dst)
1318 			return dst;
1319 	}
1320 #endif
1321 	memset(&fl6, 0, sizeof(fl6));
1322 	fl6.flowi6_mark = skb->mark;
1323 	fl6.flowi6_proto = protocol;
1324 	fl6.daddr = info->key.u.ipv6.dst;
1325 	fl6.saddr = info->key.u.ipv6.src;
1326 	prio = info->key.tos;
1327 	fl6.flowlabel = ip6_make_flowinfo(prio, info->key.label);
1328 
1329 	dst = ipv6_stub->ipv6_dst_lookup_flow(net, sock->sk, &fl6,
1330 					      NULL);
1331 	if (IS_ERR(dst)) {
1332 		netdev_dbg(dev, "no route to %pI6\n", &fl6.daddr);
1333 		return ERR_PTR(-ENETUNREACH);
1334 	}
1335 	if (dst->dev == dev) { /* is this necessary? */
1336 		netdev_dbg(dev, "circular route to %pI6\n", &fl6.daddr);
1337 		dst_release(dst);
1338 		return ERR_PTR(-ELOOP);
1339 	}
1340 #ifdef CONFIG_DST_CACHE
1341 	if (use_cache)
1342 		dst_cache_set_ip6(dst_cache, dst, &fl6.saddr);
1343 #endif
1344 	*saddr = fl6.saddr;
1345 	return dst;
1346 }
1347 EXPORT_SYMBOL_GPL(ip6_dst_lookup_tunnel);
1348 
1349 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1350 					       gfp_t gfp)
1351 {
1352 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1353 }
1354 
1355 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1356 						gfp_t gfp)
1357 {
1358 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1359 }
1360 
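/*
 * Recompute the effective MTU while appending data: the first fragment
 * must reserve dst->header_len, while later fragments may use that
 * header space for payload. Skipped entirely for DST_XFRM_TUNNEL
 * routes.
 */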
1361 static void ip6_append_data_mtu(unsigned int *mtu,
1362 				int *maxfraglen,
1363 				unsigned int fragheaderlen,
1364 				struct sk_buff *skb,
1365 				struct rt6_info *rt,
1366 				unsigned int orig_mtu)
1367 {
1368 	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1369 		if (!skb) {
1370 			/* first fragment, reserve header_len */
1371 			*mtu = orig_mtu - rt->dst.header_len;
1372 
1373 		} else {
1374 			/*
1375 			 * this fragment is not the first, so the header
1376 			 * space is regarded as data space.
1377 			 */
1378 			*mtu = orig_mtu;
1379 		}
1380 		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
1381 			      + fragheaderlen - sizeof(struct frag_hdr);
1382 	}
1383 }
1384 
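/*
 * Corking setup: duplicate the tx options into the per-socket cork so
 * they outlive the sendmsg call, take a reference on the route, and
 * capture hop limit, traffic class, fragment size and timestamping
 * flags for the packets that will be queued.
 */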
1385 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1386 			  struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1387 			  struct rt6_info *rt, struct flowi6 *fl6)
1388 {
1389 	struct ipv6_pinfo *np = inet6_sk(sk);
1390 	unsigned int mtu;
1391 	struct ipv6_txoptions *opt = ipc6->opt;
1392 
1393 	/*
1394 	 * setup for corking
1395 	 */
1396 	if (opt) {
1397 		if (WARN_ON(v6_cork->opt))
1398 			return -EINVAL;
1399 
1400 		v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
1401 		if (unlikely(!v6_cork->opt))
1402 			return -ENOBUFS;
1403 
1404 		v6_cork->opt->tot_len = sizeof(*opt);
1405 		v6_cork->opt->opt_flen = opt->opt_flen;
1406 		v6_cork->opt->opt_nflen = opt->opt_nflen;
1407 
1408 		v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1409 						    sk->sk_allocation);
1410 		if (opt->dst0opt && !v6_cork->opt->dst0opt)
1411 			return -ENOBUFS;
1412 
1413 		v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1414 						    sk->sk_allocation);
1415 		if (opt->dst1opt && !v6_cork->opt->dst1opt)
1416 			return -ENOBUFS;
1417 
1418 		v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
1419 						   sk->sk_allocation);
1420 		if (opt->hopopt && !v6_cork->opt->hopopt)
1421 			return -ENOBUFS;
1422 
1423 		v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1424 						    sk->sk_allocation);
1425 		if (opt->srcrt && !v6_cork->opt->srcrt)
1426 			return -ENOBUFS;
1427 
1428 		/* need source address above --miyazawa */
1429 	}
1430 	dst_hold(&rt->dst);
1431 	cork->base.dst = &rt->dst;
1432 	cork->fl.u.ip6 = *fl6;
1433 	v6_cork->hop_limit = ipc6->hlimit;
1434 	v6_cork->tclass = ipc6->tclass;
1435 	if (rt->dst.flags & DST_XFRM_TUNNEL)
1436 		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1437 		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
1438 	else
1439 		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1440 			READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
1441 	if (np->frag_size < mtu) {
1442 		if (np->frag_size)
1443 			mtu = np->frag_size;
1444 	}
1445 	cork->base.fragsize = mtu;
1446 	cork->base.gso_size = ipc6->gso_size;
1447 	cork->base.tx_flags = 0;
1448 	cork->base.mark = ipc6->sockc.mark;
1449 	sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);
1450 
1451 	if (dst_allfrag(xfrm_dst_path(&rt->dst)))
1452 		cork->base.flags |= IPCORK_ALLFRAG;
1453 	cork->base.length = 0;
1454 
1455 	cork->base.transmit_time = ipc6->sockc.transmit_time;
1456 
1457 	return 0;
1458 }
1459 
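/*
 * Core of ip6_append_data() and ip6_make_skb(): append @length bytes
 * from @from to the queue, filling the tail skb when possible and
 * allocating new MTU-sized skbs otherwise, with support for MSG_MORE,
 * paged (GSO) output and MSG_ZEROCOPY.
 */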
1460 static int __ip6_append_data(struct sock *sk,
1461 			     struct flowi6 *fl6,
1462 			     struct sk_buff_head *queue,
1463 			     struct inet_cork *cork,
1464 			     struct inet6_cork *v6_cork,
1465 			     struct page_frag *pfrag,
1466 			     int getfrag(void *from, char *to, int offset,
1467 					 int len, int odd, struct sk_buff *skb),
1468 			     void *from, int length, int transhdrlen,
1469 			     unsigned int flags, struct ipcm6_cookie *ipc6)
1470 {
1471 	struct sk_buff *skb, *skb_prev = NULL;
1472 	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
1473 	struct ubuf_info *uarg = NULL;
1474 	int exthdrlen = 0;
1475 	int dst_exthdrlen = 0;
1476 	int hh_len;
1477 	int copy;
1478 	int err;
1479 	int offset = 0;
1480 	u32 tskey = 0;
1481 	struct rt6_info *rt = (struct rt6_info *)cork->dst;
1482 	struct ipv6_txoptions *opt = v6_cork->opt;
1483 	int csummode = CHECKSUM_NONE;
1484 	unsigned int maxnonfragsize, headersize;
1485 	unsigned int wmem_alloc_delta = 0;
1486 	bool paged, extra_uref = false;
1487 
1488 	skb = skb_peek_tail(queue);
1489 	if (!skb) {
1490 		exthdrlen = opt ? opt->opt_flen : 0;
1491 		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1492 	}
1493 
1494 	paged = !!cork->gso_size;
1495 	mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
1496 	orig_mtu = mtu;
1497 
1498 	if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
1499 	    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1500 		tskey = sk->sk_tskey++;
1501 
1502 	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1503 
1504 	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1505 			(opt ? opt->opt_nflen : 0);
1506 
1507 	headersize = sizeof(struct ipv6hdr) +
1508 		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1509 		     (dst_allfrag(&rt->dst) ?
1510 		      sizeof(struct frag_hdr) : 0) +
1511 		     rt->rt6i_nfheader_len;
1512 
1513 	if (mtu <= fragheaderlen ||
1514 	    ((mtu - fragheaderlen) & ~7) + fragheaderlen <= sizeof(struct frag_hdr))
1515 		goto emsgsize;
1516 
1517 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1518 		     sizeof(struct frag_hdr);
1519 
1520 	/* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit in
1521 	 * the first fragment
1522 	 */
1523 	if (headersize + transhdrlen > mtu)
1524 		goto emsgsize;
1525 
1526 	if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1527 	    (sk->sk_protocol == IPPROTO_UDP ||
1528 	     sk->sk_protocol == IPPROTO_RAW)) {
1529 		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1530 				sizeof(struct ipv6hdr));
1531 		goto emsgsize;
1532 	}
1533 
1534 	if (ip6_sk_ignore_df(sk))
1535 		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1536 	else
1537 		maxnonfragsize = mtu;
1538 
1539 	if (cork->length + length > maxnonfragsize - headersize) {
1540 emsgsize:
1541 		pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
1542 		ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
1543 		return -EMSGSIZE;
1544 	}
1545 
1546 	/* CHECKSUM_PARTIAL only with no extension headers and when
1547 	 * we are not going to fragment
1548 	 */
1549 	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1550 	    headersize == sizeof(struct ipv6hdr) &&
1551 	    length <= mtu - headersize &&
1552 	    (!(flags & MSG_MORE) || cork->gso_size) &&
1553 	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1554 		csummode = CHECKSUM_PARTIAL;
1555 
1556 	if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) {
1557 		uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb));
1558 		if (!uarg)
1559 			return -ENOBUFS;
1560 		extra_uref = !skb_zcopy(skb);	/* only ref on new uarg */
1561 		if (rt->dst.dev->features & NETIF_F_SG &&
1562 		    csummode == CHECKSUM_PARTIAL) {
1563 			paged = true;
1564 		} else {
1565 			uarg->zerocopy = 0;
1566 			skb_zcopy_set(skb, uarg, &extra_uref);
1567 		}
1568 	}
1569 
1570 	/*
1571 	 * Let's try using as much space as possible.
1572 	 * Use MTU if total length of the message fits into the MTU.
1573 	 * Otherwise, we need to reserve fragment header and
1574 	 * fragment alignment (= 8-15 octets, in total).
1575 	 *
1576 	 * Note that we may need to "move" the data from the tail
1577 	 * of the buffer to the new fragment when we split
1578 	 * the message.
1579 	 *
1580 	 * FIXME: It may be fragmented into multiple chunks
1581 	 *        at once if non-fragmentable extension headers
1582 	 *        are too large.
1583 	 * --yoshfuji
1584 	 */
1585 
1586 	cork->length += length;
1587 	if (!skb)
1588 		goto alloc_new_skb;
1589 
1590 	while (length > 0) {
1591 		/* Check if the remaining data fits into current packet. */
1592 		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1593 		if (copy < length)
1594 			copy = maxfraglen - skb->len;
1595 
1596 		if (copy <= 0) {
1597 			char *data;
1598 			unsigned int datalen;
1599 			unsigned int fraglen;
1600 			unsigned int fraggap;
1601 			unsigned int alloclen, alloc_extra;
1602 			unsigned int pagedlen;
1603 alloc_new_skb:
1604 			/* There's no room in the current skb */
1605 			if (skb)
1606 				fraggap = skb->len - maxfraglen;
1607 			else
1608 				fraggap = 0;
1609 			/* update mtu and maxfraglen if necessary */
1610 			if (!skb || !skb_prev)
1611 				ip6_append_data_mtu(&mtu, &maxfraglen,
1612 						    fragheaderlen, skb, rt,
1613 						    orig_mtu);
1614 
1615 			skb_prev = skb;
1616 
1617 			/*
1618 			 * If remaining data exceeds the mtu,
1619 			 * we know we need more fragment(s).
1620 			 */
1621 			datalen = length + fraggap;
1622 
1623 			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1624 				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1625 			fraglen = datalen + fragheaderlen;
1626 			pagedlen = 0;
1627 
1628 			alloc_extra = hh_len;
1629 			alloc_extra += dst_exthdrlen;
1630 			alloc_extra += rt->dst.trailer_len;
1631 
1632 			/* We just reserve space for fragment header.
1633 			 * Note: this may be overallocation if the message
1634 			 * (without MSG_MORE) fits into the MTU.
1635 			 */
1636 			alloc_extra += sizeof(struct frag_hdr);
1637 
1638 			if ((flags & MSG_MORE) &&
1639 			    !(rt->dst.dev->features&NETIF_F_SG))
1640 				alloclen = mtu;
1641 			else if (!paged &&
1642 				 (fraglen + alloc_extra < SKB_MAX_ALLOC ||
1643 				  !(rt->dst.dev->features & NETIF_F_SG)))
1644 				alloclen = fraglen;
1645 			else {
1646 				alloclen = min_t(int, fraglen, MAX_HEADER);
1647 				pagedlen = fraglen - alloclen;
1648 			}
1649 			alloclen += alloc_extra;
1650 
1651 			if (datalen != length + fraggap) {
1652 				/*
1653 				 * this is not the last fragment, so the trailer
1654 				 * space is regarded as data space.
1655 				 */
1656 				datalen += rt->dst.trailer_len;
1657 			}
1658 
1659 			fraglen = datalen + fragheaderlen;
1660 
1661 			copy = datalen - transhdrlen - fraggap - pagedlen;
1662 			if (copy < 0) {
1663 				err = -EINVAL;
1664 				goto error;
1665 			}
1666 			if (transhdrlen) {
1667 				skb = sock_alloc_send_skb(sk, alloclen,
1668 						(flags & MSG_DONTWAIT), &err);
1669 			} else {
1670 				skb = NULL;
1671 				if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
1672 				    2 * sk->sk_sndbuf)
1673 					skb = alloc_skb(alloclen,
1674 							sk->sk_allocation);
1675 				if (unlikely(!skb))
1676 					err = -ENOBUFS;
1677 			}
1678 			if (!skb)
1679 				goto error;
1680 			/*
1681 			 *	Fill in the control structures
1682 			 */
1683 			skb->protocol = htons(ETH_P_IPV6);
1684 			skb->ip_summed = csummode;
1685 			skb->csum = 0;
1686 			/* reserve for fragmentation and ipsec header */
1687 			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1688 				    dst_exthdrlen);
1689 
1690 			/*
1691 			 *	Find where to start putting bytes
1692 			 */
1693 			data = skb_put(skb, fraglen - pagedlen);
1694 			skb_set_network_header(skb, exthdrlen);
1695 			data += fragheaderlen;
1696 			skb->transport_header = (skb->network_header +
1697 						 fragheaderlen);
1698 			if (fraggap) {
1699 				skb->csum = skb_copy_and_csum_bits(
1700 					skb_prev, maxfraglen,
1701 					data + transhdrlen, fraggap);
1702 				skb_prev->csum = csum_sub(skb_prev->csum,
1703 							  skb->csum);
1704 				data += fraggap;
1705 				pskb_trim_unique(skb_prev, maxfraglen);
1706 			}
1707 			if (copy > 0 &&
1708 			    getfrag(from, data + transhdrlen, offset,
1709 				    copy, fraggap, skb) < 0) {
1710 				err = -EFAULT;
1711 				kfree_skb(skb);
1712 				goto error;
1713 			}
1714 
1715 			offset += copy;
1716 			length -= copy + transhdrlen;
1717 			transhdrlen = 0;
1718 			exthdrlen = 0;
1719 			dst_exthdrlen = 0;
1720 
1721 			/* Only the initial fragment is time stamped */
1722 			skb_shinfo(skb)->tx_flags = cork->tx_flags;
1723 			cork->tx_flags = 0;
1724 			skb_shinfo(skb)->tskey = tskey;
1725 			tskey = 0;
1726 			skb_zcopy_set(skb, uarg, &extra_uref);
1727 
1728 			if ((flags & MSG_CONFIRM) && !skb_prev)
1729 				skb_set_dst_pending_confirm(skb, 1);
1730 
1731 			/*
1732 			 * Put the packet on the pending queue
1733 			 */
1734 			if (!skb->destructor) {
1735 				skb->destructor = sock_wfree;
1736 				skb->sk = sk;
1737 				wmem_alloc_delta += skb->truesize;
1738 			}
1739 			__skb_queue_tail(queue, skb);
1740 			continue;
1741 		}
1742 
1743 		if (copy > length)
1744 			copy = length;
1745 
1746 		if (!(rt->dst.dev->features&NETIF_F_SG) &&
1747 		    skb_tailroom(skb) >= copy) {
1748 			unsigned int off;
1749 
1750 			off = skb->len;
1751 			if (getfrag(from, skb_put(skb, copy),
1752 						offset, copy, off, skb) < 0) {
1753 				__skb_trim(skb, off);
1754 				err = -EFAULT;
1755 				goto error;
1756 			}
1757 		} else if (!uarg || !uarg->zerocopy) {
1758 			int i = skb_shinfo(skb)->nr_frags;
1759 
			err = -ENOMEM;
			if (!sk_page_frag_refill(sk, pfrag))
				goto error;

			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				err = -EMSGSIZE;
				if (i == MAX_SKB_FRAGS)
					goto error;

				__skb_fill_page_desc(skb, i, pfrag->page,
						     pfrag->offset, 0);
				skb_shinfo(skb)->nr_frags = ++i;
				get_page(pfrag->page);
			}
			copy = min_t(int, copy, pfrag->size - pfrag->offset);
			if (getfrag(from,
				    page_address(pfrag->page) + pfrag->offset,
				    offset, copy, skb->len, skb) < 0)
				goto error_efault;

			pfrag->offset += copy;
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			wmem_alloc_delta += copy;
		} else {
			err = skb_zerocopy_iter_dgram(skb, from, copy);
			if (err < 0)
				goto error;
		}
		offset += copy;
		length -= copy;
	}

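	/* Charge the batched write-memory delta to the socket only once */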
	if (wmem_alloc_delta)
		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	return 0;

error_efault:
	err = -EFAULT;
error:
	if (uarg)
		sock_zerocopy_put_abort(uarg, extra_uref);
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	return err;
}
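
/*
 * The getfrag() callback used throughout __ip6_append_data() copies
 * 'len' bytes of payload from the caller's source ('from') into 'to'
 * and, for skbs that are not CHECKSUM_PARTIAL, folds the copied bytes
 * into skb->csum at block offset 'odd'.  A minimal sketch of such a
 * callback, loosely modeled on ip_generic_getfrag(); the name
 * example_getfrag is illustrative only and not part of this file:
 */
#if 0
static int example_getfrag(void *from, char *to, int offset, int len,
			   int odd, struct sk_buff *skb)
{
	struct msghdr *msg = from;

	/* 'offset' is unused here: the msg iterator advances itself */
	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		/* Hardware will checksum later: a plain copy suffices */
		if (!copy_from_iter_full(to, len, &msg->msg_iter))
			return -EFAULT;
	} else {
		/* Checksum while copying, then fold into skb->csum */
		__wsum csum = 0;

		if (!csum_and_copy_from_iter_full(to, len, &csum,
						  &msg->msg_iter))
			return -EFAULT;
		skb->csum = csum_block_add(skb->csum, csum, odd);
	}
	return 0;
}
#endif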

int ip6_append_data(struct sock *sk,
		    int getfrag(void *from, char *to, int offset, int len,
				int odd, struct sk_buff *skb),
		    void *from, int length, int transhdrlen,
		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
		    struct rt6_info *rt, unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	int exthdrlen;
	int err;

	if (flags & MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * Set up for corking.
		 */
		err = ip6_setup_cork(sk, &inet->cork, &np->cork,
				     ipc6, rt, fl6);
		if (err)
			return err;

		exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		fl6 = &inet->cork.fl.u.ip6;
		transhdrlen = 0;
	}

	return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
				 &np->cork, sk_page_frag(sk), getfrag,
				 from, length, transhdrlen, flags, ipc6);
}
EXPORT_SYMBOL_GPL(ip6_append_data);
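
/*
 * Typical corked-path usage (a sketch only; error handling is
 * abbreviated, and a real caller such as a datagram sendmsg() handler
 * is assumed to have already resolved 'fl6'/'dst' and filled 'ipc6'):
 */
#if 0
	lock_sock(sk);
	err = ip6_append_data(sk, ip_generic_getfrag, msg, len,
			      sizeof(struct udphdr), &ipc6, &fl6,
			      (struct rt6_info *)dst, msg->msg_flags);
	if (err)
		ip6_flush_pending_frames(sk);
	else if (!(msg->msg_flags & MSG_MORE))
		err = ip6_push_pending_frames(sk);
	release_sock(sk);
#endif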

static void ip6_cork_release(struct inet_cork_full *cork,
			     struct inet6_cork *v6_cork)
{
	if (v6_cork->opt) {
		kfree(v6_cork->opt->dst0opt);
		kfree(v6_cork->opt->dst1opt);
		kfree(v6_cork->opt->hopopt);
		kfree(v6_cork->opt->srcrt);
		kfree(v6_cork->opt);
		v6_cork->opt = NULL;
	}

	if (cork->base.dst) {
		dst_release(cork->base.dst);
		cork->base.dst = NULL;
		cork->base.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&cork->fl, 0, sizeof(cork->fl));
}

struct sk_buff *__ip6_make_skb(struct sock *sk,
			       struct sk_buff_head *queue,
			       struct inet_cork_full *cork,
			       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = v6_cork->opt;
	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
	struct flowi6 *fl6 = &cork->fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;

	skb = __skb_dequeue(queue);
	if (!skb)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* Move skb->data back from the extension header to the IP header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
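	/*
	 * Chain the remaining queued skbs onto the head skb's frag_list;
	 * their lengths are absorbed into the head so that later
	 * fragmentation can split the packet along these boundaries.
	 */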
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	skb->ignore_df = ip6_sk_ignore_df(sk);

	*final_dst = fl6->daddr;
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	ip6_flow_hdr(hdr, v6_cork->tclass,
		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
					ip6_autoflowlabel(net, np), fl6));
	hdr->hop_limit = v6_cork->hop_limit;
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = sk->sk_priority;
	skb->mark = cork->base.mark;

	skb->tstamp = cork->base.transmit_time;

	skb_dst_set(skb, dst_clone(&rt->dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
		u8 icmp6_type;

		if (sk->sk_socket->type == SOCK_RAW && !inet_sk(sk)->hdrincl)
			icmp6_type = fl6->fl6_icmp_type;
		else
			icmp6_type = icmp6_hdr(skb)->icmp6_type;
		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_type);
		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
	}

	ip6_cork_release(cork, v6_cork);
out:
	return skb;
}

int ip6_send_skb(struct sk_buff *skb)
{
	struct net *net = sock_net(skb->sk);
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	int err;

	err = ip6_local_out(net, skb->sk, skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			IP6_INC_STATS(net, rt->rt6i_idev,
				      IPSTATS_MIB_OUTDISCARDS);
	}

	return err;
}

int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	skb = ip6_finish_skb(sk);
	if (!skb)
		return 0;

	return ip6_send_skb(skb);
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);

static void __ip6_flush_pending_frames(struct sock *sk,
				       struct sk_buff_head *queue,
				       struct inet_cork_full *cork,
				       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
		if (skb_dst(skb))
			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(cork, v6_cork);
}

void ip6_flush_pending_frames(struct sock *sk)
{
	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
}
EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);

struct sk_buff *ip6_make_skb(struct sock *sk,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, int length, int transhdrlen,
			     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
			     struct rt6_info *rt, unsigned int flags,
			     struct inet_cork_full *cork)
{
	struct inet6_cork v6_cork;
	struct sk_buff_head queue;
	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
	int err;

	if (flags & MSG_PROBE)
		return NULL;

	__skb_queue_head_init(&queue);

	cork->base.flags = 0;
	cork->base.addr = 0;
	cork->base.opt = NULL;
	cork->base.dst = NULL;
	v6_cork.opt = NULL;
	err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6);
	if (err) {
		ip6_cork_release(cork, &v6_cork);
		return ERR_PTR(err);
	}
	if (ipc6->dontfrag < 0)
		ipc6->dontfrag = inet6_sk(sk)->dontfrag;

	err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork,
				&current->task_frag, getfrag, from,
				length + exthdrlen, transhdrlen + exthdrlen,
				flags, ipc6);
	if (err) {
		__ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
		return ERR_PTR(err);
	}

	return __ip6_make_skb(sk, &queue, cork, &v6_cork);
}
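
/*
 * One-shot (un-corked) usage sketch: build the complete datagram on a
 * private queue and hand it straight to the output path, much as the
 * un-corked UDPv6 fast path does with its protocol-specific send
 * helper.  'msg', 'len', 'dst', 'ipc6', 'fl6' and 'cork' stand in for
 * caller state and are assumptions of this sketch:
 */
#if 0
	skb = ip6_make_skb(sk, ip_generic_getfrag, msg, len,
			   sizeof(struct udphdr), &ipc6, &fl6,
			   (struct rt6_info *)dst, msg->msg_flags, &cork);
	err = PTR_ERR_OR_ZERO(skb);
	if (!err && skb)
		err = ip6_send_skb(skb);
#endif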