// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetic in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *	H. von Brand	:	Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *	Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 *				for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/bpf-cgroup.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>
#include <net/l3mdev.h>
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>

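/* Final L2 transmission step: make sure there is enough headroom for the
 * link-layer header, loop multicast packets back to local listeners when
 * required, honour lightweight-tunnel redirects, then resolve the neighbour
 * for the route's nexthop and hand the skb to neigh_output().
 */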
static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	unsigned int hh_len = LL_RESERVED_SPACE(dev);
	int delta = hh_len - skb_headroom(skb);
	const struct in6_addr *nexthop;
	struct neighbour *neigh;
	int ret;

	/* Be paranoid, rather than too clever. */
	if (unlikely(delta > 0) && dev->header_ops) {
		/* pskb_expand_head() might crash, if skb is shared */
		if (skb_shared(skb)) {
			struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);

			if (likely(nskb)) {
				if (skb->sk)
					skb_set_owner_w(nskb, skb->sk);
				consume_skb(skb);
			} else {
				kfree_skb(skb);
			}
			skb = nskb;
		}
		/* Make sure idev stays alive */
		rcu_read_lock();
		if (skb &&
		    pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {
			kfree_skb(skb);
			skb = NULL;
		}
		if (!skb) {
			IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
			rcu_read_unlock();
			return -ENOMEM;
		}
		rcu_read_unlock();
	}

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
		    ((mroute6_is_socket(net, skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			 * is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					net, sk, newskb, NULL, newskb->dev,
					dev_loopback_xmit);

			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(net, idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);

		if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
		    IPV6_ADDR_SCOPE_NODELOCAL &&
		    !(dev->flags & IFF_LOOPBACK)) {
			kfree_skb(skb);
			return 0;
		}
	}

	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
		int res = lwtunnel_xmit(skb);

		if (res != LWTUNNEL_XMIT_CONTINUE)
			return res;
	}

	rcu_read_lock_bh();
	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
	if (unlikely(!neigh))
		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
	if (!IS_ERR(neigh)) {
		sock_confirm_neigh(skb, neigh);
		ret = neigh_output(neigh, skb, false);
		rcu_read_unlock_bh();
		return ret;
	}
	rcu_read_unlock_bh();

	IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}

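/* GSO packet whose segments would exceed the egress MTU: segment it in
 * software here, then fragment any resulting frame that is still too big
 * rather than emit it oversized.
 */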
static int
ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
				    struct sk_buff *skb, unsigned int mtu)
{
	struct sk_buff *segs, *nskb;
	netdev_features_t features;
	int ret = 0;

	/* Please see corresponding comment in ip_finish_output_gso
	 * describing the cases where GSO segment length exceeds the
	 * egress MTU.
	 */
	features = netif_skb_features(skb);
	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
	if (IS_ERR_OR_NULL(segs)) {
		kfree_skb(skb);
		return -ENOMEM;
	}

	consume_skb(skb);

	skb_list_walk_safe(segs, segs, nskb) {
		int err;

		skb_mark_not_on_list(segs);
		/* Last GSO segment can be smaller than gso_size (and MTU).
		 * Adding a fragment header would produce an "atomic fragment",
		 * which is considered harmful (RFC-8021). Avoid that.
		 */
		err = segs->len > mtu ?
			ip6_fragment(net, sk, segs, ip6_finish_output2) :
			ip6_finish_output2(net, sk, segs);
		if (err && ret == 0)
			ret = err;
	}

	return ret;
}

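/* Post-routing output step: reroute through IPsec when SNAT yielded a new
 * policy, then either fragment or transmit depending on packet size, GSO
 * state and the path MTU.
 */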
static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	unsigned int mtu;

#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
	/* Policy lookup after SNAT yielded a new policy */
	if (skb_dst(skb)->xfrm) {
		IP6CB(skb)->flags |= IP6SKB_REROUTED;
		return dst_output(net, sk, skb);
	}
#endif

	mtu = ip6_skb_dst_mtu(skb);
	if (skb_is_gso(skb) && !skb_gso_validate_network_len(skb, mtu))
		return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);

	if ((skb->len > mtu && !skb_is_gso(skb)) ||
	    dst_allfrag(skb_dst(skb)) ||
	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
		return ip6_fragment(net, sk, skb, ip6_finish_output2);
	else
		return ip6_finish_output2(net, sk, skb);
}

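/* Run the cgroup BPF egress program before the real output work. A
 * NET_XMIT_CN verdict still transmits the packet and reports NET_XMIT_CN
 * to the caller unless the output path itself fails.
 */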
static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	int ret;

	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
	switch (ret) {
	case NET_XMIT_SUCCESS:
		return __ip6_finish_output(net, sk, skb);
	case NET_XMIT_CN:
		return __ip6_finish_output(net, sk, skb) ? : ret;
	default:
		kfree_skb(skb);
		return ret;
	}
}

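/* Output entry point after routing: drop if IPv6 is disabled on the egress
 * device, otherwise pass through the POST_ROUTING netfilter hook unless the
 * packet was already rerouted by IPsec.
 */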
int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev;
	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (unlikely(!idev || (idev->cnf.disable_ipv6))) {
		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
		return 0;
	}

	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
			    net, sk, skb, indev, dev,
			    ip6_finish_output,
			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}

bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
{
	if (!np->autoflowlabel_set)
		return ip6_default_np_autolabel(net);
	else
		return np->autoflowlabel;
}

/*
 * xmit an sk_buff (used by TCP, SCTP and DCCP)
 * Note : socket lock is not held for SYNACK packets, but might be modified
 * by calls to skb_set_owner_w() and ipv6_local_error(),
 * which are using proper atomic operations or spinlocks.
 */
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
{
	struct net *net = sock_net(sk);
	const struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	unsigned int head_room;
	struct ipv6hdr *hdr;
	u8  proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int hlimit = -1;
	u32 mtu;

	head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
	if (opt)
		head_room += opt->opt_nflen + opt->opt_flen;

	if (unlikely(skb_headroom(skb) < head_room)) {
		struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);

		if (!skb2) {
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
			kfree_skb(skb);
			return -ENOBUFS;
		}
		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);
		consume_skb(skb);
		skb = skb2;
	}

	if (opt) {
		seg_len += opt->opt_nflen + opt->opt_flen;

		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);

		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
					     &fl6->saddr);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
				ip6_autoflowlabel(net, np), fl6));

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	hdr->saddr = fl6->saddr;
	hdr->daddr = *first_hop;

	skb->protocol = htons(ETH_P_IPV6);
	skb->priority = priority;
	skb->mark = mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_OUT, skb->len);

		/* if egress device is enslaved to an L3 master device pass the
		 * skb to its handler for processing
		 */
		skb = l3mdev_ip6_out((struct sock *)sk, skb);
		if (unlikely(!skb))
			return 0;

		/* hooks should never assume socket lock is held.
		 * we promote our socket to non const
		 */
		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
			       net, (struct sock *)sk, skb, NULL, dst->dev,
			       dst_output);
	}

	skb->dev = dst->dev;
	/* ipv6_local_error() does not require socket lock,
	 * we promote our socket to non const
	 */
	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);

	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);

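/* Deliver a Router Alert packet to every raw socket registered for this
 * alert value, cloning the skb for all but the last matching socket.
 * Returns 1 if the skb was consumed by at least one socket, 0 otherwise.
 */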
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;

		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			struct ipv6_pinfo *np = inet6_sk(sk);

			if (np && np->rtalert_isolate &&
			    !net_eq(sock_net(sk), dev_net(skb->dev))) {
				continue;
			}
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);

				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}

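/* Decide what to do with a packet arriving for an NDP-proxied address:
 * returns 1 for unicast neighbour discovery messages that should go to
 * local input, -1 (after signalling link failure) for link-local
 * destinations that cannot be proxied, and 0 to continue forwarding.
 */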
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	__be16 frag_off;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
		if (offset < 0)
			return 0;
	} else {
		offset = sizeof(struct ipv6hdr);
	}

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* For reaction involving unicast neighbor discovery
			 * message destined to the proxied address, pass it to
			 * input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}

static inline int ip6_forward_finish(struct net *net, struct sock *sk,
				     struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);

	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	__IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);

#ifdef CONFIG_NET_SWITCHDEV
	if (skb->offload_l3_fwd_mark) {
		consume_skb(skb);
		return 0;
	}
#endif

	skb->tstamp = 0;
	return dst_output(net, sk, skb);
}

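/* Tell whether a forwarded packet exceeds the path MTU, taking conntrack
 * defrag state, the ignore_df flag and GSO segment sizes into account.
 */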
static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
{
	if (skb->len <= mtu)
		return false;

	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
		return true;

	if (skb->ignore_df)
		return false;

	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
		return false;

	return true;
}

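/* Forward one packet: validate it, handle Router Alert and NDP proxy
 * cases, send redirects when appropriate, enforce the path MTU, decrement
 * the hop limit and finally pass the packet to the FORWARD netfilter hook.
 */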
int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	struct inet6_dev *idev;
	u32 mtu;

	idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	if (unlikely(skb->sk))
		goto drop;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!net->ipv6.devconf_all->disable_policy &&
	    (!idev || !idev->cnf.disable_policy) &&
	    !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 *	We DO NOT do any processing on RA packets,
	 *	pushing them to user level AS IS, without any
	 *	WARRANTY that the application will be able
	 *	to interpret them. The reason is that we
	 *	cannot do anything clever here.
	 *
	 *	We are not an end node, so if the packet contains
	 *	AH/ESP we cannot do anything with it.
	 *	Defragmentation also would be a mistake; RA packets
	 *	cannot be fragmented, because there is no guarantee
	 *	that different fragments will go along one path. --ANK
	 */
	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
			return 0;
	}

	/*
	 *	check and decrement hop limit
	 */
	if (hdr->hop_limit <= 1) {
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);

		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	dst = skb_dst(skb);

	/* The IPv6 specs say nothing about it, but it is clear that we
	 * cannot send redirects to source-routed frames.
	 * We don't send redirects to frames decapsulated from IPsec.
	 */
	if (IP6CB(skb)->iif == dst->dev->ifindex &&
	    opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct inet_peer *peer;
		struct rt6_info *rt;

		/*
		 *	incoming and outgoing devices are the same:
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if (rt->rt6i_flags & RTF_GATEWAY)
			target = &rt->rt6i_gateway;
		else
			target = &hdr->daddr;

		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);

		/* Limit redirects both by destination (here)
		 * and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(peer, 1*HZ))
			ndisc_send_redirect(skb, target);
		if (peer)
			inet_putpeer(peer);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	mtu = ip6_dst_mtu_forward(dst);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (ip6_pkt_too_big(skb, mtu)) {
		/* Again, force the OUTPUT device to be used for
		 * source address selection of the ICMP error.
		 */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Mangling the hop limit is delayed to this point, after skb COW */

	hdr->hop_limit--;

	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
		       net, NULL, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	__IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}

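/* Copy the fields a freshly built fragment must inherit from the original
 * packet: dst, device, priority, mark, hash, netfilter and security state.
 */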
static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->dev = from->dev;
	to->mark = from->mark;

	skb_copy_hash(to, from);

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
	skb_ext_copy(to, from);
	skb_copy_secmark(to, from);
}

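/* Fast-path fragmentation setup: detach the skb's frag_list, save a copy
 * of the header chain in the iterator and turn the head skb into the first
 * fragment by inserting a fragment header behind the copied headers.
 */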
int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
		      u8 nexthdr, __be32 frag_id,
		      struct ip6_fraglist_iter *iter)
{
	unsigned int first_len;
	struct frag_hdr *fh;

	/* BUILD HEADER */
	*prevhdr = NEXTHDR_FRAGMENT;
	iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
	if (!iter->tmp_hdr)
		return -ENOMEM;

	iter->frag = skb_shinfo(skb)->frag_list;
	skb_frag_list_init(skb);

	iter->offset = 0;
	iter->hlen = hlen;
	iter->frag_id = frag_id;
	iter->nexthdr = nexthdr;

	__skb_pull(skb, hlen);
	fh = __skb_push(skb, sizeof(struct frag_hdr));
	__skb_push(skb, hlen);
	skb_reset_network_header(skb);
	memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);

	fh->nexthdr = nexthdr;
	fh->reserved = 0;
	fh->frag_off = htons(IP6_MF);
	fh->identification = frag_id;

	first_len = skb_pagelen(skb);
	skb->data_len = first_len - skb_headlen(skb);
	skb->len = first_len;
	ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));

	return 0;
}
EXPORT_SYMBOL(ip6_fraglist_init);

void ip6_fraglist_prepare(struct sk_buff *skb,
			  struct ip6_fraglist_iter *iter)
{
	struct sk_buff *frag = iter->frag;
	unsigned int hlen = iter->hlen;
	struct frag_hdr *fh;

	frag->ip_summed = CHECKSUM_NONE;
	skb_reset_transport_header(frag);
	fh = __skb_push(frag, sizeof(struct frag_hdr));
	__skb_push(frag, hlen);
	skb_reset_network_header(frag);
	memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
	iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
	fh->nexthdr = iter->nexthdr;
	fh->reserved = 0;
	fh->frag_off = htons(iter->offset);
	if (frag->next)
		fh->frag_off |= htons(IP6_MF);
	fh->identification = iter->frag_id;
	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
	ip6_copy_metadata(frag, skb);
}
EXPORT_SYMBOL(ip6_fraglist_prepare);

void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
		   unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
		   u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
{
	state->prevhdr = prevhdr;
	state->nexthdr = nexthdr;
	state->frag_id = frag_id;

	state->hlen = hlen;
	state->mtu = mtu;

	state->left = skb->len - hlen;	/* Space per frame */
	state->ptr = hlen;		/* Where to start from */

	state->hroom = hdr_room;
	state->troom = needed_tailroom;

	state->offset = 0;
}
EXPORT_SYMBOL(ip6_frag_init);

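/* Slow-path fragmentation: allocate the next fragment, copy the header
 * chain and the next chunk of payload into it, and advance the iteration
 * state. Returns the new fragment, or an ERR_PTR() on allocation failure.
 */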
struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
{
	u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
	struct sk_buff *frag;
	struct frag_hdr *fh;
	unsigned int len;

	len = state->left;
	/* IF: it doesn't fit, use 'mtu' - the data space left */
	if (len > state->mtu)
		len = state->mtu;
	/* IF: we are not sending up to and including the packet end
	 * then align the next start on an eight byte boundary
	 */
	if (len < state->left)
		len &= ~7;

	/* Allocate buffer */
	frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
			 state->hroom + state->troom, GFP_ATOMIC);
	if (!frag)
		return ERR_PTR(-ENOMEM);

	/*
	 *	Set up data on packet
	 */

	ip6_copy_metadata(frag, skb);
	skb_reserve(frag, state->hroom);
	skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
	skb_reset_network_header(frag);
	fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
	frag->transport_header = (frag->network_header + state->hlen +
				  sizeof(struct frag_hdr));

	/*
	 *	Charge the memory for the fragment to any owner
	 *	it might possess
	 */
	if (skb->sk)
		skb_set_owner_w(frag, skb->sk);

	/*
	 *	Copy the packet header into the new buffer.
	 */
	skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);

	fragnexthdr_offset = skb_network_header(frag);
	fragnexthdr_offset += prevhdr - skb_network_header(skb);
	*fragnexthdr_offset = NEXTHDR_FRAGMENT;

	/*
	 *	Build fragment header.
	 */
	fh->nexthdr = state->nexthdr;
	fh->reserved = 0;
	fh->identification = state->frag_id;

	/*
	 *	Copy a block of the IP datagram.
	 */
	BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
			     len));
	state->left -= len;

	fh->frag_off = htons(state->offset);
	if (state->left > 0)
		fh->frag_off |= htons(IP6_MF);
	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));

	state->ptr += len;
	state->offset += len;

	return frag;
}
EXPORT_SYMBOL(ip6_frag_next);

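/* Fragment an IPv6 packet and pass each fragment to @output. Uses the fast
 * path when the skb already carries a well-formed frag_list, otherwise
 * falls back to copying the payload into freshly allocated fragments.
 */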
int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
		 int (*output)(struct net *, struct sock *, struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
				inet6_sk(skb->sk) : NULL;
	struct ip6_frag_state state;
	unsigned int mtu, hlen, nexthdr_offset;
	ktime_t tstamp = skb->tstamp;
	int hroom, err = 0;
	__be32 frag_id;
	u8 *prevhdr, nexthdr = 0;

	err = ip6_find_1stfragopt(skb, &prevhdr);
	if (err < 0)
		goto fail;
	hlen = err;
	nexthdr = *prevhdr;
	nexthdr_offset = prevhdr - skb_network_header(skb);

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb is not generated by a local socket.
	 */
	if (unlikely(!skb->ignore_df && skb->len > mtu))
		goto fail_toobig;

	if (IP6CB(skb)->frag_max_size) {
		if (IP6CB(skb)->frag_max_size > mtu)
			goto fail_toobig;

		/* don't send fragments larger than what we received */
		mtu = IP6CB(skb)->frag_max_size;
		if (mtu < IPV6_MIN_MTU)
			mtu = IPV6_MIN_MTU;
	}

	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
		goto fail_toobig;
	mtu -= hlen + sizeof(struct frag_hdr);

	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
				    &ipv6_hdr(skb)->saddr);

	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	    (err = skb_checksum_help(skb)))
		goto fail;

	prevhdr = skb_network_header(skb) + nexthdr_offset;
	hroom = LL_RESERVED_SPACE(rt->dst.dev);
	if (skb_has_frag_list(skb)) {
		unsigned int first_len = skb_pagelen(skb);
		struct ip6_fraglist_iter iter;
		struct sk_buff *frag2;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb) ||
		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
					&iter);
		if (err < 0)
			goto fail;

		/* We prevent @rt from being freed. */
		rcu_read_lock();

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down.
			 */
			if (iter.frag)
				ip6_fraglist_prepare(skb, &iter);

			skb->tstamp = tstamp;
			err = output(net, sk, skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !iter.frag)
				break;

			skb = ip6_fraglist_next(&iter);
		}

		kfree(iter.tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
				      IPSTATS_MIB_FRAGOKS);
			rcu_read_unlock();
			return 0;
		}

		kfree_skb_list(iter.frag);

		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
			      IPSTATS_MIB_FRAGFAILS);
		rcu_read_unlock();
		return err;

slow_path_clean:
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	/*
	 *	Fragment the datagram.
	 */

	ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
		      LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
		      &state);

	/*
	 *	Keep copying data until we run out.
	 */

	while (state.left > 0) {
		frag = ip6_frag_next(skb, &state);
		if (IS_ERR(frag)) {
			err = PTR_ERR(frag);
			goto fail;
		}

		/*
		 *	Put this fragment into the sending queue.
		 */
		frag->tstamp = tstamp;
		err = output(net, sk, frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	consume_skb(skb);
	return err;

fail_toobig:
	if (skb->sk && dst_allfrag(skb_dst(skb)))
		sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);

	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	err = -EMSGSIZE;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}

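/* Returns nonzero when the cached route cannot be trusted for this flow:
 * it is neither a host route to the flow's destination nor validated by
 * the socket's cached destination address.
 */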
static inline int ip6_rt_check(const struct rt6key *rt_key,
			       const struct in6_addr *fl_addr,
			       const struct in6_addr *addr_cache)
{
	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
}

static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  const struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt;

	if (!dst)
		goto out;

	if (dst->ops->family != AF_INET6) {
		dst_release(dst);
		return NULL;
	}

	rt = (struct rt6_info *)dst;
	/* Yes, checking route validity in the not-connected
	 * case is not very simple. Take into account that
	 * we do not support routing by source, TOS,
	 * and MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If the route was a host route,
	 *    check that the cached destination is current.
	 *    If it is a network route, we still may
	 *    check its validity using the saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save the whole address now
	 *    (because the main consumer of this service
	 *    is tcp, which does not have this problem),
	 *    so the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
	   (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
	      (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}

static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
	struct rt6_info *rt;
#endif
	int err;
	int flags = 0;

	/* The correct way to handle this would be to do
	 * ip6_route_get_saddr, and then ip6_route_output; however,
	 * the route-specific preferred source forces the
	 * ip6_route_output call _before_ ip6_route_get_saddr.
	 *
	 * In source specific routing (no src=any default route),
	 * ip6_route_output will fail given src=any saddr, though, so
	 * that's why we try it again later.
	 */
	if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
		struct fib6_info *from;
		struct rt6_info *rt;
		bool had_dst = *dst != NULL;

		if (!had_dst)
			*dst = ip6_route_output(net, sk, fl6);
		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;

		rcu_read_lock();
		from = rt ? rcu_dereference(rt->from) : NULL;
		err = ip6_route_get_saddr(net, from, &fl6->daddr,
					  sk ? inet6_sk(sk)->srcprefs : 0,
					  &fl6->saddr);
		rcu_read_unlock();

		if (err)
			goto out_err_release;

		/* If we had an erroneous initial result, pretend it
		 * never existed and let the SA-enabled version take
		 * over.
		 */
		if (!had_dst && (*dst)->error) {
			dst_release(*dst);
			*dst = NULL;
		}

		if (fl6->flowi6_oif)
			flags |= RT6_LOOKUP_F_IFACE;
	}

	if (!*dst)
		*dst = ip6_route_output_flags(net, sk, fl6, flags);

	err = (*dst)->error;
	if (err)
		goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	rt = (struct rt6_info *) *dst;
	rcu_read_lock_bh();
	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
				      rt6_nexthop(rt, &fl6->daddr));
	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
	rcu_read_unlock_bh();

	if (err) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			err = (*dst)->error;
			if (err)
				goto out_err_release;
		}
	}
#endif
	if (ipv6_addr_v4mapped(&fl6->saddr) &&
	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
		err = -EAFNOSUPPORT;
		goto out_err_release;
	}

	return 0;

out_err_release:
	dst_release(*dst);
	*dst = NULL;

	if (err == -ENETUNREACH)
		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	return err;
}

/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@net: Network namespace to perform lookup in
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl6: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
		   struct flowi6 *fl6)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(net, sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 *	@net: Network namespace to perform lookup in
 *	@sk: socket which provides route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
				      const struct in6_addr *final_dst)
{
	struct dst_entry *dst = NULL;
	int err;

	err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
	if (err)
		return ERR_PTR(err);
	if (final_dst)
		fl6->daddr = *final_dst;

	return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);

/**
 *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 *	@sk: socket which provides the dst cache and route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *	@connected: whether @sk is connected or not
 *
 *	This function performs a route lookup on the given flow with the
 *	possibility of using the cached route in the socket if it is valid.
 *	It will take the socket dst lock when operating on the dst cache.
 *	As a result, this function can only be used in process context.
 *
 *	In addition, for a connected socket, cache the dst in the socket
 *	if the current cache is not valid.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
					 const struct in6_addr *final_dst,
					 bool connected)
{
	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);

	dst = ip6_sk_dst_check(sk, dst, fl6);
	if (dst)
		return dst;

	dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
	if (connected && !IS_ERR(dst))
		ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);

	return dst;
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);

/**
 *	ip6_dst_lookup_tunnel - perform route lookup on tunnel
 *	@skb: Packet for which lookup is done
 *	@dev: Tunnel device
 *	@net: Network namespace of tunnel device
 *	@sock: Socket which provides route info
 *	@saddr: Memory to store the src ip address
 *	@info: Tunnel information
 *	@protocol: IP protocol
 *	@use_cache: Flag to enable cache usage
 *
 *	This function performs a route lookup on a tunnel.
 *
 *	It returns a valid dst pointer and stores the src address to be
 *	used in the tunnel in @saddr on success, else a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_dst_lookup_tunnel(struct sk_buff *skb,
					struct net_device *dev,
					struct net *net,
					struct socket *sock,
					struct in6_addr *saddr,
					const struct ip_tunnel_info *info,
					u8 protocol,
					bool use_cache)
{
	struct dst_entry *dst = NULL;
#ifdef CONFIG_DST_CACHE
	struct dst_cache *dst_cache;
#endif
	struct flowi6 fl6;
	__u8 prio;

#ifdef CONFIG_DST_CACHE
	dst_cache = (struct dst_cache *)&info->dst_cache;
	if (use_cache) {
		dst = dst_cache_get_ip6(dst_cache, saddr);
		if (dst)
			return dst;
	}
#endif
	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_mark = skb->mark;
	fl6.flowi6_proto = protocol;
	fl6.daddr = info->key.u.ipv6.dst;
	fl6.saddr = info->key.u.ipv6.src;
	prio = info->key.tos;
	fl6.flowlabel = ip6_make_flowinfo(prio, info->key.label);

	dst = ipv6_stub->ipv6_dst_lookup_flow(net, sock->sk, &fl6,
					      NULL);
	if (IS_ERR(dst)) {
		netdev_dbg(dev, "no route to %pI6\n", &fl6.daddr);
		return ERR_PTR(-ENETUNREACH);
	}
	if (dst->dev == dev) { /* is this necessary? */
		netdev_dbg(dev, "circular route to %pI6\n", &fl6.daddr);
		dst_release(dst);
		return ERR_PTR(-ELOOP);
	}
#ifdef CONFIG_DST_CACHE
	if (use_cache)
		dst_cache_set_ip6(dst_cache, dst, &fl6.saddr);
#endif
	*saddr = fl6.saddr;
	return dst;
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_tunnel);

static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
					       gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
						gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static void ip6_append_data_mtu(unsigned int *mtu,
				int *maxfraglen,
				unsigned int fragheaderlen,
				struct sk_buff *skb,
				struct rt6_info *rt,
				unsigned int orig_mtu)
{
	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
		if (!skb) {
			/* first fragment, reserve header_len */
			*mtu = orig_mtu - rt->dst.header_len;

		} else {
			/*
			 * this fragment is not first, the headers
			 * space is regarded as data space.
			 */
			*mtu = orig_mtu;
		}
		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
			      + fragheaderlen - sizeof(struct frag_hdr);
	}
}

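/* Prepare the per-socket cork state for ip6_append_data(): duplicate the
 * tx options, pin the route, and derive the MTU honouring the PMTU
 * discovery mode, XFRM tunnelling and the socket's frag_size override.
 */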
static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
			  struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
			  struct rt6_info *rt, struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	unsigned int mtu;
	struct ipv6_txoptions *opt = ipc6->opt;

	/*
	 * setup for corking
	 */
	if (opt) {
		if (WARN_ON(v6_cork->opt))
			return -EINVAL;

		v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
		if (unlikely(!v6_cork->opt))
			return -ENOBUFS;

		v6_cork->opt->tot_len = sizeof(*opt);
		v6_cork->opt->opt_flen = opt->opt_flen;
		v6_cork->opt->opt_nflen = opt->opt_nflen;

		v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
						    sk->sk_allocation);
		if (opt->dst0opt && !v6_cork->opt->dst0opt)
			return -ENOBUFS;

		v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
						    sk->sk_allocation);
		if (opt->dst1opt && !v6_cork->opt->dst1opt)
			return -ENOBUFS;

		v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
						   sk->sk_allocation);
		if (opt->hopopt && !v6_cork->opt->hopopt)
			return -ENOBUFS;

		v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
						    sk->sk_allocation);
		if (opt->srcrt && !v6_cork->opt->srcrt)
			return -ENOBUFS;

		/* need source address above miyazawa */
	}
	dst_hold(&rt->dst);
	cork->base.dst = &rt->dst;
	cork->fl.u.ip6 = *fl6;
	v6_cork->hop_limit = ipc6->hlimit;
	v6_cork->tclass = ipc6->tclass;
	if (rt->dst.flags & DST_XFRM_TUNNEL)
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
	else
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
			READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
	if (np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	cork->base.fragsize = mtu;
	cork->base.gso_size = ipc6->gso_size;
	cork->base.tx_flags = 0;
	cork->base.mark = ipc6->sockc.mark;
	sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);

	if (dst_allfrag(xfrm_dst_path(&rt->dst)))
		cork->base.flags |= IPCORK_ALLFRAG;
	cork->base.length = 0;

	cork->base.transmit_time = ipc6->sockc.transmit_time;

	return 0;
}

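/* Core of ip6_append_data(): append user data to the queue of pending
 * skbs, either by growing the tail skb, coalescing into page frags, or
 * allocating new skbs sized so each pending skb becomes one fragment
 * (or one GSO segment) on the wire.
 */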
__ip6_append_data(struct sock * sk,struct flowi6 * fl6,struct sk_buff_head * queue,struct inet_cork * cork,struct inet6_cork * v6_cork,struct page_frag * pfrag,int getfrag (void * from,char * to,int offset,int len,int odd,struct sk_buff * skb),void * from,int length,int transhdrlen,unsigned int flags,struct ipcm6_cookie * ipc6)1464 static int __ip6_append_data(struct sock *sk,
1465 			     struct flowi6 *fl6,
1466 			     struct sk_buff_head *queue,
1467 			     struct inet_cork *cork,
1468 			     struct inet6_cork *v6_cork,
1469 			     struct page_frag *pfrag,
1470 			     int getfrag(void *from, char *to, int offset,
1471 					 int len, int odd, struct sk_buff *skb),
1472 			     void *from, int length, int transhdrlen,
1473 			     unsigned int flags, struct ipcm6_cookie *ipc6)
1474 {
1475 	struct sk_buff *skb, *skb_prev = NULL;
1476 	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
1477 	struct ubuf_info *uarg = NULL;
1478 	int exthdrlen = 0;
1479 	int dst_exthdrlen = 0;
1480 	int hh_len;
1481 	int copy;
1482 	int err;
1483 	int offset = 0;
1484 	u32 tskey = 0;
1485 	struct rt6_info *rt = (struct rt6_info *)cork->dst;
1486 	struct ipv6_txoptions *opt = v6_cork->opt;
1487 	int csummode = CHECKSUM_NONE;
1488 	unsigned int maxnonfragsize, headersize;
1489 	unsigned int wmem_alloc_delta = 0;
1490 	bool paged, extra_uref = false;
1491 
1492 	skb = skb_peek_tail(queue);
1493 	if (!skb) {
1494 		exthdrlen = opt ? opt->opt_flen : 0;
1495 		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1496 	}
1497 
1498 	paged = !!cork->gso_size;
1499 	mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
1500 	orig_mtu = mtu;
1501 
1502 	if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
1503 	    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1504 		tskey = sk->sk_tskey++;
1505 
1506 	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1507 
1508 	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1509 			(opt ? opt->opt_nflen : 0);
1510 
1511 	headersize = sizeof(struct ipv6hdr) +
1512 		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1513 		     (dst_allfrag(&rt->dst) ?
1514 		      sizeof(struct frag_hdr) : 0) +
1515 		     rt->rt6i_nfheader_len;
1516 
1517 	if (mtu <= fragheaderlen ||
1518 	    ((mtu - fragheaderlen) & ~7) + fragheaderlen <= sizeof(struct frag_hdr))
1519 		goto emsgsize;
1520 
1521 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1522 		     sizeof(struct frag_hdr);
1523 
1524 	/* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
1525 	 * the first fragment
1526 	 */
1527 	if (headersize + transhdrlen > mtu)
1528 		goto emsgsize;
1529 
1530 	if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1531 	    (sk->sk_protocol == IPPROTO_UDP ||
1532 	     sk->sk_protocol == IPPROTO_RAW)) {
1533 		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1534 				sizeof(struct ipv6hdr));
1535 		goto emsgsize;
1536 	}
1537 
1538 	if (ip6_sk_ignore_df(sk))
1539 		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1540 	else
1541 		maxnonfragsize = mtu;
1542 
1543 	if (cork->length + length > maxnonfragsize - headersize) {
1544 emsgsize:
1545 		pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
1546 		ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
1547 		return -EMSGSIZE;
1548 	}
1549 
1550 	/* CHECKSUM_PARTIAL only with no extension headers and when
1551 	 * we are not going to fragment
1552 	 */
1553 	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1554 	    headersize == sizeof(struct ipv6hdr) &&
1555 	    length <= mtu - headersize &&
1556 	    (!(flags & MSG_MORE) || cork->gso_size) &&
1557 	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1558 		csummode = CHECKSUM_PARTIAL;
1559 
1560 	if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) {
1561 		uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb));
1562 		if (!uarg)
1563 			return -ENOBUFS;
1564 		extra_uref = !skb_zcopy(skb);	/* only ref on new uarg */
1565 		if (rt->dst.dev->features & NETIF_F_SG &&
1566 		    csummode == CHECKSUM_PARTIAL) {
1567 			paged = true;
1568 		} else {
1569 			uarg->zerocopy = 0;
1570 			skb_zcopy_set(skb, uarg, &extra_uref);
1571 		}
1572 	}
1573 
1574 	/*
1575 	 * Let's try using as much space as possible.
1576 	 * Use MTU if total length of the message fits into the MTU.
1577 	 * Otherwise, we need to reserve fragment header and
1578 	 * fragment alignment (= 8-15 octects, in total).
1579 	 *
1580 	 * Note that we may need to "move" the data from the tail
1581 	 * of the buffer to the new fragment when we split
1582 	 * the message.
1583 	 *
1584 	 * FIXME: It may be fragmented into multiple chunks
1585 	 *        at once if non-fragmentable extension headers
1586 	 *        are too large.
1587 	 * --yoshfuji
1588 	 */
1589 
1590 	cork->length += length;
1591 	if (!skb)
1592 		goto alloc_new_skb;
1593 
1594 	while (length > 0) {
1595 		/* Check if the remaining data fits into current packet. */
1596 		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1597 		if (copy < length)
1598 			copy = maxfraglen - skb->len;
1599 
1600 		if (copy <= 0) {
1601 			char *data;
1602 			unsigned int datalen;
1603 			unsigned int fraglen;
1604 			unsigned int fraggap;
1605 			unsigned int alloclen, alloc_extra;
1606 			unsigned int pagedlen;
1607 alloc_new_skb:
1608 			/* There's no room in the current skb */
1609 			if (skb)
1610 				fraggap = skb->len - maxfraglen;
1611 			else
1612 				fraggap = 0;
1613 			/* update mtu and maxfraglen if necessary */
1614 			if (!skb || !skb_prev)
1615 				ip6_append_data_mtu(&mtu, &maxfraglen,
1616 						    fragheaderlen, skb, rt,
1617 						    orig_mtu);
1618 
1619 			skb_prev = skb;
1620 
1621 			/*
1622 			 * If remaining data exceeds the mtu,
1623 			 * we know we need more fragment(s).
1624 			 */
1625 			datalen = length + fraggap;
1626 
1627 			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1628 				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1629 			fraglen = datalen + fragheaderlen;
1630 			pagedlen = 0;
1631 
1632 			alloc_extra = hh_len;
1633 			alloc_extra += dst_exthdrlen;
1634 			alloc_extra += rt->dst.trailer_len;
1635 
1636 			/* We just reserve space for fragment header.
1637 			 * Note: this may be overallocation if the message
1638 			 * (without MSG_MORE) fits into the MTU.
1639 			 */
1640 			alloc_extra += sizeof(struct frag_hdr);
1641 
1642 			if ((flags & MSG_MORE) &&
1643 			    !(rt->dst.dev->features&NETIF_F_SG))
1644 				alloclen = mtu;
1645 			else if (!paged &&
1646 				 (fraglen + alloc_extra < SKB_MAX_ALLOC ||
1647 				  !(rt->dst.dev->features & NETIF_F_SG)))
1648 				alloclen = fraglen;
1649 			else {
1650 				alloclen = min_t(int, fraglen, MAX_HEADER);
1651 				pagedlen = fraglen - alloclen;
1652 			}
1653 			alloclen += alloc_extra;
1654 
1655 			if (datalen != length + fraggap) {
1656 				/*
1657 				 * this is not the last fragment, the trailer
1658 				 * space is regarded as data space.
1659 				 */
1660 				datalen += rt->dst.trailer_len;
1661 			}
1662 
1663 			fraglen = datalen + fragheaderlen;
1664 
1665 			copy = datalen - transhdrlen - fraggap - pagedlen;
1666 			if (copy < 0) {
1667 				err = -EINVAL;
1668 				goto error;
1669 			}
1670 			if (transhdrlen) {
1671 				skb = sock_alloc_send_skb(sk, alloclen,
1672 						(flags & MSG_DONTWAIT), &err);
1673 			} else {
1674 				skb = NULL;
1675 				if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
1676 				    2 * sk->sk_sndbuf)
1677 					skb = alloc_skb(alloclen,
1678 							sk->sk_allocation);
1679 				if (unlikely(!skb))
1680 					err = -ENOBUFS;
1681 			}
1682 			if (!skb)
1683 				goto error;
1684 			/*
1685 			 *	Fill in the control structures
1686 			 */
1687 			skb->protocol = htons(ETH_P_IPV6);
1688 			skb->ip_summed = csummode;
1689 			skb->csum = 0;
1690 			/* reserve for fragmentation and ipsec header */
1691 			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1692 				    dst_exthdrlen);
1693 
1694 			/*
1695 			 *	Find where to start putting bytes
1696 			 */
1697 			data = skb_put(skb, fraglen - pagedlen);
1698 			skb_set_network_header(skb, exthdrlen);
1699 			data += fragheaderlen;
1700 			skb->transport_header = (skb->network_header +
1701 						 fragheaderlen);
1702 			if (fraggap) {
1703 				skb->csum = skb_copy_and_csum_bits(
1704 					skb_prev, maxfraglen,
1705 					data + transhdrlen, fraggap);
1706 				skb_prev->csum = csum_sub(skb_prev->csum,
1707 							  skb->csum);
1708 				data += fraggap;
1709 				pskb_trim_unique(skb_prev, maxfraglen);
1710 			}
1711 			if (copy > 0 &&
1712 			    getfrag(from, data + transhdrlen, offset,
1713 				    copy, fraggap, skb) < 0) {
1714 				err = -EFAULT;
1715 				kfree_skb(skb);
1716 				goto error;
1717 			}
1718 
1719 			offset += copy;
1720 			length -= copy + transhdrlen;
1721 			transhdrlen = 0;
1722 			exthdrlen = 0;
1723 			dst_exthdrlen = 0;
1724 
1725 			/* Only the initial fragment is time stamped */
1726 			skb_shinfo(skb)->tx_flags = cork->tx_flags;
1727 			cork->tx_flags = 0;
1728 			skb_shinfo(skb)->tskey = tskey;
1729 			tskey = 0;
1730 			skb_zcopy_set(skb, uarg, &extra_uref);
1731 
1732 			if ((flags & MSG_CONFIRM) && !skb_prev)
1733 				skb_set_dst_pending_confirm(skb, 1);
1734 
1735 			/*
1736 			 * Put the packet on the pending queue
1737 			 */
1738 			if (!skb->destructor) {
1739 				skb->destructor = sock_wfree;
1740 				skb->sk = sk;
1741 				wmem_alloc_delta += skb->truesize;
1742 			}
1743 			__skb_queue_tail(queue, skb);
1744 			continue;
1745 		}
1746 
1747 		if (copy > length)
1748 			copy = length;
1749 
		if (!(rt->dst.dev->features & NETIF_F_SG) &&
		    skb_tailroom(skb) >= copy) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
						offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else if (!uarg || !uarg->zerocopy) {
			int i = skb_shinfo(skb)->nr_frags;

			err = -ENOMEM;
			if (!sk_page_frag_refill(sk, pfrag))
				goto error;

			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				err = -EMSGSIZE;
				if (i == MAX_SKB_FRAGS)
					goto error;

				__skb_fill_page_desc(skb, i, pfrag->page,
						     pfrag->offset, 0);
				skb_shinfo(skb)->nr_frags = ++i;
				get_page(pfrag->page);
			}
			copy = min_t(int, copy, pfrag->size - pfrag->offset);
			if (getfrag(from,
				    page_address(pfrag->page) + pfrag->offset,
				    offset, copy, skb->len, skb) < 0)
				goto error_efault;

			pfrag->offset += copy;
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			wmem_alloc_delta += copy;
		} else {
			err = skb_zerocopy_iter_dgram(skb, from, copy);
			if (err < 0)
				goto error;
		}
		offset += copy;
		length -= copy;
	}

	if (wmem_alloc_delta)
		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	return 0;

error_efault:
	err = -EFAULT;
error:
	if (uarg)
		sock_zerocopy_put_abort(uarg, extra_uref);
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	return err;
}

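/*
 *	ip6_append_data - append data to the corked socket write queue
 *
 *	On the first call (empty write queue) the cork is set up from
 *	@ipc6, @rt and @fl6, and the extension-header length is folded
 *	into @length and @transhdrlen; later calls reuse the flow saved
 *	in the cork.  The queued data is turned into a packet by
 *	ip6_push_pending_frames() or discarded by
 *	ip6_flush_pending_frames().
 */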
int ip6_append_data(struct sock *sk,
		    int getfrag(void *from, char *to, int offset, int len,
				int odd, struct sk_buff *skb),
		    void *from, int length, int transhdrlen,
		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
		    struct rt6_info *rt, unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	int exthdrlen;
	int err;

	if (flags & MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * Set up for corking.
		 */
		err = ip6_setup_cork(sk, &inet->cork, &np->cork,
				     ipc6, rt, fl6);
		if (err)
			return err;

		exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
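		/*
		 * Appending to an existing cork: reuse the flow stored at
		 * cork setup; the transport header was already accounted
		 * for by the first call.
		 */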
		fl6 = &inet->cork.fl.u.ip6;
		transhdrlen = 0;
	}

	return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
				 &np->cork, sk_page_frag(sk), getfrag,
				 from, length, transhdrlen, flags, ipc6);
}
EXPORT_SYMBOL_GPL(ip6_append_data);

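/*
 *	ip6_cork_release - free the cork's IPv6 tx options and drop the
 *	cached dst reference, leaving the cork clean for reuse.
 */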
static void ip6_cork_release(struct inet_cork_full *cork,
			     struct inet6_cork *v6_cork)
{
	if (v6_cork->opt) {
		kfree(v6_cork->opt->dst0opt);
		kfree(v6_cork->opt->dst1opt);
		kfree(v6_cork->opt->hopopt);
		kfree(v6_cork->opt->srcrt);
		kfree(v6_cork->opt);
		v6_cork->opt = NULL;
	}

	if (cork->base.dst) {
		dst_release(cork->base.dst);
		cork->base.dst = NULL;
		cork->base.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&cork->fl, 0, sizeof(cork->fl));
}

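/*
 *	__ip6_make_skb - collapse the queue of fragments into a single
 *	skb, push the extension headers and the IPv6 header, and fill in
 *	flow label, hop limit, addresses and priority.  The cork is
 *	released before returning.
 */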
struct sk_buff *__ip6_make_skb(struct sock *sk,
			       struct sk_buff_head *queue,
			       struct inet_cork_full *cork,
			       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = v6_cork->opt;
	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
	struct flowi6 *fl6 = &cork->fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;

	skb = __skb_dequeue(queue);
	if (!skb)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* Move skb->data from the extension header back to the IP header. */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
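	/*
	 * Chain the remaining queued skbs onto the head skb's frag_list,
	 * transferring their length and truesize; the head skb now owns
	 * the whole packet.
	 */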
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	skb->ignore_df = ip6_sk_ignore_df(sk);

	*final_dst = fl6->daddr;
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	ip6_flow_hdr(hdr, v6_cork->tclass,
		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
					ip6_autoflowlabel(net, np), fl6));
	hdr->hop_limit = v6_cork->hop_limit;
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = sk->sk_priority;
	skb->mark = cork->base.mark;

	skb->tstamp = cork->base.transmit_time;

	skb_dst_set(skb, dst_clone(&rt->dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
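	/*
	 * ICMPv6 output is also counted per message type.  A raw socket
	 * without a known next hop supplies the type in the flow;
	 * otherwise it is read from the ICMPv6 header in the payload.
	 */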
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
		u8 icmp6_type;

		if (sk->sk_socket->type == SOCK_RAW &&
		    !(fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH))
			icmp6_type = fl6->fl6_icmp_type;
		else
			icmp6_type = icmp6_hdr(skb)->icmp6_type;
		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_type);
		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
	}

	ip6_cork_release(cork, v6_cork);
out:
	return skb;
}

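/*
 *	ip6_send_skb - hand a finished packet to ip6_local_out(),
 *	converting NET_XMIT congestion codes via net_xmit_errno() and
 *	counting real failures as output discards.
 */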
int ip6_send_skb(struct sk_buff *skb)
{
	struct net *net = sock_net(skb->sk);
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	int err;

	rcu_read_lock();
	err = ip6_local_out(net, skb->sk, skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			IP6_INC_STATS(net, rt->rt6i_idev,
				      IPSTATS_MIB_OUTDISCARDS);
	}

	rcu_read_unlock();
	return err;
}

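/*
 *	ip6_push_pending_frames - finalize the data corked on the
 *	socket's write queue into a single packet and transmit it.
 */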
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	skb = ip6_finish_skb(sk);
	if (!skb)
		return 0;

	return ip6_send_skb(skb);
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);

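/*
 *	__ip6_flush_pending_frames - drop everything on @queue, counting
 *	skbs that already carry a dst as output discards, then release
 *	the cork.
 */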
static void __ip6_flush_pending_frames(struct sock *sk,
				       struct sk_buff_head *queue,
				       struct inet_cork_full *cork,
				       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
		if (skb_dst(skb))
			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(cork, v6_cork);
}

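/* Discard any data corked on the socket's write queue. */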
void ip6_flush_pending_frames(struct sock *sk)
{
	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
}
EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);

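/*
 *	ip6_make_skb - single-shot counterpart of ip6_append_data() plus
 *	ip6_finish_skb(): the packet is built on a private queue with a
 *	caller-provided cork, so nothing is left on the socket if any
 *	step fails.
 */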
struct sk_buff *ip6_make_skb(struct sock *sk,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, int length, int transhdrlen,
			     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
			     struct rt6_info *rt, unsigned int flags,
			     struct inet_cork_full *cork)
{
	struct inet6_cork v6_cork;
	struct sk_buff_head queue;
	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
	int err;

	if (flags & MSG_PROBE)
		return NULL;

	__skb_queue_head_init(&queue);

	cork->base.flags = 0;
	cork->base.addr = 0;
	cork->base.opt = NULL;
	cork->base.dst = NULL;
	v6_cork.opt = NULL;
	err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6);
	if (err) {
		ip6_cork_release(cork, &v6_cork);
		return ERR_PTR(err);
	}
	if (ipc6->dontfrag < 0)
		ipc6->dontfrag = inet6_sk(sk)->dontfrag;

	err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork,
				&current->task_frag, getfrag, from,
				length + exthdrlen, transhdrlen + exthdrlen,
				flags, ipc6);
	if (err) {
		__ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
		return ERR_PTR(err);
	}

	return __ip6_make_skb(sk, &queue, cork, &v6_cork);
}