1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  *	IPv6 output functions
4  *	Linux INET6 implementation
5  *
6  *	Authors:
7  *	Pedro Roque		<roque@di.fc.ul.pt>
8  *
9  *	Based on linux/net/ipv4/ip_output.c
10  *
11  *	Changes:
12  *	A.N.Kuznetsov	:	arithmetic in fragmentation.
13  *				extension headers are implemented.
14  *				route changes now work.
15  *				ip6_forward does not confuse sniffers.
16  *				etc.
17  *
18  *      H. von Brand    :       Added missing #include <linux/string.h>
19  *	Imran Patel	:	frag id should be in NBO
20  *      Kazunori MIYAZAWA @USAGI
21  *			:       add ip6_append_data and related functions
22  *				for datagram xmit
23  */
24 
25 #include <linux/errno.h>
26 #include <linux/kernel.h>
27 #include <linux/string.h>
28 #include <linux/socket.h>
29 #include <linux/net.h>
30 #include <linux/netdevice.h>
31 #include <linux/if_arp.h>
32 #include <linux/in6.h>
33 #include <linux/tcp.h>
34 #include <linux/route.h>
35 #include <linux/module.h>
36 #include <linux/slab.h>
37 
38 #include <linux/bpf-cgroup.h>
39 #include <linux/netfilter.h>
40 #include <linux/netfilter_ipv6.h>
41 
42 #include <net/sock.h>
43 #include <net/snmp.h>
44 
45 #include <net/ipv6.h>
46 #include <net/ndisc.h>
47 #include <net/protocol.h>
48 #include <net/ip6_route.h>
49 #include <net/addrconf.h>
50 #include <net/rawv6.h>
51 #include <net/icmp.h>
52 #include <net/xfrm.h>
53 #include <net/checksum.h>
54 #include <linux/mroute6.h>
55 #include <net/l3mdev.h>
56 #include <net/lwtunnel.h>
57 #include <net/ip_tunnels.h>
58 
59 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
60 {
61 	struct dst_entry *dst = skb_dst(skb);
62 	struct net_device *dev = dst->dev;
63 	unsigned int hh_len = LL_RESERVED_SPACE(dev);
64 	int delta = hh_len - skb_headroom(skb);
65 	const struct in6_addr *nexthop;
66 	struct neighbour *neigh;
67 	int ret;
68 
69 	/* Be paranoid, rather than too clever. */
70 	if (unlikely(delta > 0) && dev->header_ops) {
71 		/* pskb_expand_head() might crash, if skb is shared */
72 		if (skb_shared(skb)) {
73 			struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
74 
75 			if (likely(nskb)) {
76 				if (skb->sk)
77 					skb_set_owner_w(nskb, skb->sk);
78 				consume_skb(skb);
79 			} else {
80 				kfree_skb(skb);
81 			}
82 			skb = nskb;
83 		}
84 		if (skb &&
85 		    pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {
86 			kfree_skb(skb);
87 			skb = NULL;
88 		}
89 		if (!skb) {
90 			IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
91 			return -ENOMEM;
92 		}
93 	}
94 
95 	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
96 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
97 
98 		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
99 		    ((mroute6_is_socket(net, skb) &&
100 		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
101 		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
102 					 &ipv6_hdr(skb)->saddr))) {
103 			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
104 
105 			/* Do not check for IFF_ALLMULTI; multicast routing
106 			   is not supported in any case.
107 			 */
108 			if (newskb)
109 				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
110 					net, sk, newskb, NULL, newskb->dev,
111 					dev_loopback_xmit);
112 
113 			if (ipv6_hdr(skb)->hop_limit == 0) {
114 				IP6_INC_STATS(net, idev,
115 					      IPSTATS_MIB_OUTDISCARDS);
116 				kfree_skb(skb);
117 				return 0;
118 			}
119 		}
120 
121 		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
122 
123 		if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
124 		    IPV6_ADDR_SCOPE_NODELOCAL &&
125 		    !(dev->flags & IFF_LOOPBACK)) {
126 			kfree_skb(skb);
127 			return 0;
128 		}
129 	}
130 
131 	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
132 		int res = lwtunnel_xmit(skb);
133 
134 		if (res < 0 || res == LWTUNNEL_XMIT_DONE)
135 			return res;
136 	}
137 
138 	rcu_read_lock_bh();
139 	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
140 	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
141 	if (unlikely(!neigh))
142 		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
143 	if (!IS_ERR(neigh)) {
144 		sock_confirm_neigh(skb, neigh);
145 		ret = neigh_output(neigh, skb, false);
146 		rcu_read_unlock_bh();
147 		return ret;
148 	}
149 	rcu_read_unlock_bh();
150 
151 	IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
152 	kfree_skb(skb);
153 	return -EINVAL;
154 }
155 
156 static int
157 ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
158 				    struct sk_buff *skb, unsigned int mtu)
159 {
160 	struct sk_buff *segs, *nskb;
161 	netdev_features_t features;
162 	int ret = 0;
163 
164 	/* Please see corresponding comment in ip_finish_output_gso
165 	 * describing the cases where GSO segment length exceeds the
166 	 * egress MTU.
167 	 */
168 	features = netif_skb_features(skb);
169 	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
170 	if (IS_ERR_OR_NULL(segs)) {
171 		kfree_skb(skb);
172 		return -ENOMEM;
173 	}
174 
175 	consume_skb(skb);
176 
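	/* The skb was software-segmented at its original gso_size above; each
	 * resulting segment is now passed through ip6_fragment() to fit the
	 * egress MTU.
	 */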
177 	skb_list_walk_safe(segs, segs, nskb) {
178 		int err;
179 
180 		skb_mark_not_on_list(segs);
181 		err = ip6_fragment(net, sk, segs, ip6_finish_output2);
182 		if (err && ret == 0)
183 			ret = err;
184 	}
185 
186 	return ret;
187 }
188 
189 static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
190 {
191 	unsigned int mtu;
192 
193 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
194 	/* Policy lookup after SNAT yielded a new policy */
195 	if (skb_dst(skb)->xfrm) {
196 		IP6CB(skb)->flags |= IP6SKB_REROUTED;
197 		return dst_output(net, sk, skb);
198 	}
199 #endif
200 
201 	mtu = ip6_skb_dst_mtu(skb);
202 	if (skb_is_gso(skb) && !skb_gso_validate_network_len(skb, mtu))
203 		return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);
204 
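	/* Fragment when the packet exceeds the path MTU, when the route
	 * requires a fragment header on every packet (dst_allfrag), or when
	 * conntrack defrag recorded a maximum fragment size that must not be
	 * exceeded on output.
	 */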
205 	if ((skb->len > mtu && !skb_is_gso(skb)) ||
206 	    dst_allfrag(skb_dst(skb)) ||
207 	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
208 		return ip6_fragment(net, sk, skb, ip6_finish_output2);
209 	else
210 		return ip6_finish_output2(net, sk, skb);
211 }
212 
213 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
214 {
215 	int ret;
216 
217 	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
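	/* The cgroup egress BPF program returns a NET_XMIT verdict: transmit
	 * on NET_XMIT_SUCCESS, transmit but propagate the congestion verdict
	 * on NET_XMIT_CN (unless the output path itself fails), and drop the
	 * packet otherwise.
	 */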
218 	switch (ret) {
219 	case NET_XMIT_SUCCESS:
220 		return __ip6_finish_output(net, sk, skb);
221 	case NET_XMIT_CN:
222 		return __ip6_finish_output(net, sk, skb) ? : ret;
223 	default:
224 		kfree_skb(skb);
225 		return ret;
226 	}
227 }
228 
229 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
230 {
231 	struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev;
232 	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
233 
234 	skb->protocol = htons(ETH_P_IPV6);
235 	skb->dev = dev;
236 
237 	if (unlikely(idev->cnf.disable_ipv6)) {
238 		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
239 		kfree_skb(skb);
240 		return 0;
241 	}
242 
243 	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
244 			    net, sk, skb, indev, dev,
245 			    ip6_finish_output,
246 			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
247 }
248 
249 bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
250 {
251 	if (!np->autoflowlabel_set)
252 		return ip6_default_np_autolabel(net);
253 	else
254 		return np->autoflowlabel;
255 }
256 
257 /*
258  * xmit an sk_buff (used by TCP, SCTP and DCCP)
259  * Note : socket lock is not held for SYNACK packets, but might be modified
260  * by calls to skb_set_owner_w() and ipv6_local_error(),
261  * which are using proper atomic operations or spinlocks.
262  */
263 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
264 	     __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
265 {
266 	struct net *net = sock_net(sk);
267 	const struct ipv6_pinfo *np = inet6_sk(sk);
268 	struct in6_addr *first_hop = &fl6->daddr;
269 	struct dst_entry *dst = skb_dst(skb);
270 	unsigned int head_room;
271 	struct ipv6hdr *hdr;
272 	u8  proto = fl6->flowi6_proto;
273 	int seg_len = skb->len;
274 	int hlimit = -1;
275 	u32 mtu;
276 
277 	head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
278 	if (opt)
279 		head_room += opt->opt_nflen + opt->opt_flen;
280 
281 	if (unlikely(skb_headroom(skb) < head_room)) {
282 		struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
283 		if (!skb2) {
284 			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
285 				      IPSTATS_MIB_OUTDISCARDS);
286 			kfree_skb(skb);
287 			return -ENOBUFS;
288 		}
289 		if (skb->sk)
290 			skb_set_owner_w(skb2, skb->sk);
291 		consume_skb(skb);
292 		skb = skb2;
293 	}
294 
295 	if (opt) {
296 		seg_len += opt->opt_nflen + opt->opt_flen;
297 
298 		if (opt->opt_flen)
299 			ipv6_push_frag_opts(skb, opt, &proto);
300 
301 		if (opt->opt_nflen)
302 			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
303 					     &fl6->saddr);
304 	}
305 
306 	skb_push(skb, sizeof(struct ipv6hdr));
307 	skb_reset_network_header(skb);
308 	hdr = ipv6_hdr(skb);
309 
310 	/*
311 	 *	Fill in the IPv6 header
312 	 */
313 	if (np)
314 		hlimit = np->hop_limit;
315 	if (hlimit < 0)
316 		hlimit = ip6_dst_hoplimit(dst);
317 
318 	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
319 				ip6_autoflowlabel(net, np), fl6));
320 
321 	hdr->payload_len = htons(seg_len);
322 	hdr->nexthdr = proto;
323 	hdr->hop_limit = hlimit;
324 
325 	hdr->saddr = fl6->saddr;
326 	hdr->daddr = *first_hop;
327 
328 	skb->protocol = htons(ETH_P_IPV6);
329 	skb->priority = priority;
330 	skb->mark = mark;
331 
332 	mtu = dst_mtu(dst);
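	/* ip6_xmit() never fragments: packets larger than the MTU (unless
	 * GSO or ignore_df is set) are bounced back to the caller with
	 * EMSGSIZE below.
	 */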
333 	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
334 		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
335 			      IPSTATS_MIB_OUT, skb->len);
336 
337 		/* if egress device is enslaved to an L3 master device pass the
338 		 * skb to its handler for processing
339 		 */
340 		skb = l3mdev_ip6_out((struct sock *)sk, skb);
341 		if (unlikely(!skb))
342 			return 0;
343 
344 		/* hooks should never assume socket lock is held.
345 		 * we promote our socket to non const
346 		 */
347 		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
348 			       net, (struct sock *)sk, skb, NULL, dst->dev,
349 			       dst_output);
350 	}
351 
352 	skb->dev = dst->dev;
353 	/* ipv6_local_error() does not require socket lock,
354 	 * we promote our socket to non const
355 	 */
356 	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
357 
358 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
359 	kfree_skb(skb);
360 	return -EMSGSIZE;
361 }
362 EXPORT_SYMBOL(ip6_xmit);
363 
364 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
365 {
366 	struct ip6_ra_chain *ra;
367 	struct sock *last = NULL;
368 
369 	read_lock(&ip6_ra_lock);
370 	for (ra = ip6_ra_chain; ra; ra = ra->next) {
371 		struct sock *sk = ra->sk;
372 		if (sk && ra->sel == sel &&
373 		    (!sk->sk_bound_dev_if ||
374 		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
375 			struct ipv6_pinfo *np = inet6_sk(sk);
376 
377 			if (np && np->rtalert_isolate &&
378 			    !net_eq(sock_net(sk), dev_net(skb->dev))) {
379 				continue;
380 			}
381 			if (last) {
382 				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
383 				if (skb2)
384 					rawv6_rcv(last, skb2);
385 			}
386 			last = sk;
387 		}
388 	}
389 
390 	if (last) {
391 		rawv6_rcv(last, skb);
392 		read_unlock(&ip6_ra_lock);
393 		return 1;
394 	}
395 	read_unlock(&ip6_ra_lock);
396 	return 0;
397 }
398 
399 static int ip6_forward_proxy_check(struct sk_buff *skb)
400 {
401 	struct ipv6hdr *hdr = ipv6_hdr(skb);
402 	u8 nexthdr = hdr->nexthdr;
403 	__be16 frag_off;
404 	int offset;
405 
406 	if (ipv6_ext_hdr(nexthdr)) {
407 		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
408 		if (offset < 0)
409 			return 0;
410 	} else
411 		offset = sizeof(struct ipv6hdr);
412 
413 	if (nexthdr == IPPROTO_ICMPV6) {
414 		struct icmp6hdr *icmp6;
415 
416 		if (!pskb_may_pull(skb, (skb_network_header(skb) +
417 					 offset + 1 - skb->data)))
418 			return 0;
419 
420 		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
421 
422 		switch (icmp6->icmp6_type) {
423 		case NDISC_ROUTER_SOLICITATION:
424 		case NDISC_ROUTER_ADVERTISEMENT:
425 		case NDISC_NEIGHBOUR_SOLICITATION:
426 		case NDISC_NEIGHBOUR_ADVERTISEMENT:
427 		case NDISC_REDIRECT:
428 			/* Unicast neighbor discovery messages destined to
429 			 * the proxied address are passed up to the input
430 			 * function.
431 			 */
432 			return 1;
433 		default:
434 			break;
435 		}
436 	}
437 
438 	/*
439 	 * The proxying router can't forward traffic sent to a link-local
440 	 * address, so signal the sender and discard the packet. This
441 	 * behavior is clarified by the MIPv6 specification.
442 	 */
443 	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
444 		dst_link_failure(skb);
445 		return -1;
446 	}
447 
448 	return 0;
449 }
450 
451 static inline int ip6_forward_finish(struct net *net, struct sock *sk,
452 				     struct sk_buff *skb)
453 {
454 	struct dst_entry *dst = skb_dst(skb);
455 
456 	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
457 	__IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
458 
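	/* Packets that a switchdev device already forwarded in hardware are
	 * only accounted above and then consumed here.
	 */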
459 #ifdef CONFIG_NET_SWITCHDEV
460 	if (skb->offload_l3_fwd_mark) {
461 		consume_skb(skb);
462 		return 0;
463 	}
464 #endif
465 
466 	skb->tstamp = 0;
467 	return dst_output(net, sk, skb);
468 }
469 
470 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
471 {
472 	if (skb->len <= mtu)
473 		return false;
474 
475 	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
476 	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
477 		return true;
478 
479 	if (skb->ignore_df)
480 		return false;
481 
482 	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
483 		return false;
484 
485 	return true;
486 }
487 
488 int ip6_forward(struct sk_buff *skb)
489 {
490 	struct dst_entry *dst = skb_dst(skb);
491 	struct ipv6hdr *hdr = ipv6_hdr(skb);
492 	struct inet6_skb_parm *opt = IP6CB(skb);
493 	struct net *net = dev_net(dst->dev);
494 	struct inet6_dev *idev;
495 	u32 mtu;
496 
497 	idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
498 	if (net->ipv6.devconf_all->forwarding == 0)
499 		goto error;
500 
501 	if (skb->pkt_type != PACKET_HOST)
502 		goto drop;
503 
504 	if (unlikely(skb->sk))
505 		goto drop;
506 
507 	if (skb_warn_if_lro(skb))
508 		goto drop;
509 
510 	if (!net->ipv6.devconf_all->disable_policy &&
511 	    !idev->cnf.disable_policy &&
512 	    !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
513 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
514 		goto drop;
515 	}
516 
517 	skb_forward_csum(skb);
518 
519 	/*
520 	 *	We DO NOT do any processing on
521 	 *	RA packets, pushing them to user level AS IS
522 	 *	without any WARRANTY that the application will be
523 	 *	able to interpret them. The reason is that we
524 	 *	cannot make anything clever here.
525 	 *
526 	 *	We are not an end-node, so if the packet contains
527 	 *	AH/ESP we cannot do anything.
528 	 *	Defragmentation would also be a mistake; RA packets
529 	 *	cannot be fragmented, because there is no warranty
530 	 *	that different fragments will go along one path. --ANK
531 	 */
532 	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
533 		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
534 			return 0;
535 	}
536 
537 	/*
538 	 *	check and decrement ttl
539 	 */
540 	if (hdr->hop_limit <= 1) {
541 		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
542 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
543 
544 		kfree_skb(skb);
545 		return -ETIMEDOUT;
546 	}
547 
548 	/* XXX: idev->cnf.proxy_ndp? */
549 	if (net->ipv6.devconf_all->proxy_ndp &&
550 	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
551 		int proxied = ip6_forward_proxy_check(skb);
552 		if (proxied > 0)
553 			return ip6_input(skb);
554 		else if (proxied < 0) {
555 			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
556 			goto drop;
557 		}
558 	}
559 
560 	if (!xfrm6_route_forward(skb)) {
561 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
562 		goto drop;
563 	}
564 	dst = skb_dst(skb);
565 
566 	/* IPv6 specs say nothing about it, but it is clear that we cannot
567 	   send redirects to source routed frames.
568 	   We don't send redirects to frames decapsulated from IPsec.
569 	 */
570 	if (IP6CB(skb)->iif == dst->dev->ifindex &&
571 	    opt->srcrt == 0 && !skb_sec_path(skb)) {
572 		struct in6_addr *target = NULL;
573 		struct inet_peer *peer;
574 		struct rt6_info *rt;
575 
576 		/*
577 		 *	incoming and outgoing devices are the same
578 		 *	send a redirect.
579 		 */
580 
581 		rt = (struct rt6_info *) dst;
582 		if (rt->rt6i_flags & RTF_GATEWAY)
583 			target = &rt->rt6i_gateway;
584 		else
585 			target = &hdr->daddr;
586 
587 		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
588 
589 		/* Limit redirects both by destination (here)
590 		   and by source (inside ndisc_send_redirect)
591 		 */
592 		if (inet_peer_xrlim_allow(peer, 1*HZ))
593 			ndisc_send_redirect(skb, target);
594 		if (peer)
595 			inet_putpeer(peer);
596 	} else {
597 		int addrtype = ipv6_addr_type(&hdr->saddr);
598 
599 		/* This check is security critical. */
600 		if (addrtype == IPV6_ADDR_ANY ||
601 		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
602 			goto error;
603 		if (addrtype & IPV6_ADDR_LINKLOCAL) {
604 			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
605 				    ICMPV6_NOT_NEIGHBOUR, 0);
606 			goto error;
607 		}
608 	}
609 
610 	mtu = ip6_dst_mtu_forward(dst);
611 	if (mtu < IPV6_MIN_MTU)
612 		mtu = IPV6_MIN_MTU;
613 
614 	if (ip6_pkt_too_big(skb, mtu)) {
615 		/* Again, force OUTPUT device used as source address */
616 		skb->dev = dst->dev;
617 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
618 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
619 		__IP6_INC_STATS(net, ip6_dst_idev(dst),
620 				IPSTATS_MIB_FRAGFAILS);
621 		kfree_skb(skb);
622 		return -EMSGSIZE;
623 	}
624 
625 	if (skb_cow(skb, dst->dev->hard_header_len)) {
626 		__IP6_INC_STATS(net, ip6_dst_idev(dst),
627 				IPSTATS_MIB_OUTDISCARDS);
628 		goto drop;
629 	}
630 
631 	hdr = ipv6_hdr(skb);
632 
633 	/* Mangling hops number delayed to point after skb COW */
634 
635 	hdr->hop_limit--;
636 
637 	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
638 		       net, NULL, skb, skb->dev, dst->dev,
639 		       ip6_forward_finish);
640 
641 error:
642 	__IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
643 drop:
644 	kfree_skb(skb);
645 	return -EINVAL;
646 }
647 
648 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
649 {
650 	to->pkt_type = from->pkt_type;
651 	to->priority = from->priority;
652 	to->protocol = from->protocol;
653 	skb_dst_drop(to);
654 	skb_dst_set(to, dst_clone(skb_dst(from)));
655 	to->dev = from->dev;
656 	to->mark = from->mark;
657 
658 	skb_copy_hash(to, from);
659 
660 #ifdef CONFIG_NET_SCHED
661 	to->tc_index = from->tc_index;
662 #endif
663 	nf_copy(to, from);
664 	skb_ext_copy(to, from);
665 	skb_copy_secmark(to, from);
666 }
667 
668 int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
669 		      u8 nexthdr, __be32 frag_id,
670 		      struct ip6_fraglist_iter *iter)
671 {
672 	unsigned int first_len;
673 	struct frag_hdr *fh;
674 
675 	/* BUILD HEADER */
676 	*prevhdr = NEXTHDR_FRAGMENT;
677 	iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
678 	if (!iter->tmp_hdr)
679 		return -ENOMEM;
680 
681 	iter->frag = skb_shinfo(skb)->frag_list;
682 	skb_frag_list_init(skb);
683 
684 	iter->offset = 0;
685 	iter->hlen = hlen;
686 	iter->frag_id = frag_id;
687 	iter->nexthdr = nexthdr;
688 
689 	__skb_pull(skb, hlen);
690 	fh = __skb_push(skb, sizeof(struct frag_hdr));
691 	__skb_push(skb, hlen);
692 	skb_reset_network_header(skb);
693 	memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);
694 
695 	fh->nexthdr = nexthdr;
696 	fh->reserved = 0;
697 	fh->frag_off = htons(IP6_MF);
698 	fh->identification = frag_id;
699 
700 	first_len = skb_pagelen(skb);
701 	skb->data_len = first_len - skb_headlen(skb);
702 	skb->len = first_len;
703 	ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));
704 
705 	return 0;
706 }
707 EXPORT_SYMBOL(ip6_fraglist_init);
708 
709 void ip6_fraglist_prepare(struct sk_buff *skb,
710 			  struct ip6_fraglist_iter *iter)
711 {
712 	struct sk_buff *frag = iter->frag;
713 	unsigned int hlen = iter->hlen;
714 	struct frag_hdr *fh;
715 
716 	frag->ip_summed = CHECKSUM_NONE;
717 	skb_reset_transport_header(frag);
718 	fh = __skb_push(frag, sizeof(struct frag_hdr));
719 	__skb_push(frag, hlen);
720 	skb_reset_network_header(frag);
721 	memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
722 	iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
723 	fh->nexthdr = iter->nexthdr;
724 	fh->reserved = 0;
725 	fh->frag_off = htons(iter->offset);
726 	if (frag->next)
727 		fh->frag_off |= htons(IP6_MF);
728 	fh->identification = iter->frag_id;
729 	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
730 	ip6_copy_metadata(frag, skb);
731 }
732 EXPORT_SYMBOL(ip6_fraglist_prepare);
733 
734 void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
735 		   unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
736 		   u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
737 {
738 	state->prevhdr = prevhdr;
739 	state->nexthdr = nexthdr;
740 	state->frag_id = frag_id;
741 
742 	state->hlen = hlen;
743 	state->mtu = mtu;
744 
745 	state->left = skb->len - hlen;	/* Space per frame */
746 	state->ptr = hlen;		/* Where to start from */
747 
748 	state->hroom = hdr_room;
749 	state->troom = needed_tailroom;
750 
751 	state->offset = 0;
752 }
753 EXPORT_SYMBOL(ip6_frag_init);
754 
755 struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
756 {
757 	u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
758 	struct sk_buff *frag;
759 	struct frag_hdr *fh;
760 	unsigned int len;
761 
762 	len = state->left;
763 	/* IF: it doesn't fit, use 'mtu' - the data space left */
764 	if (len > state->mtu)
765 		len = state->mtu;
766 	/* IF: we are not sending up to and including the packet end
767 	   then align the next start on an eight byte boundary */
768 	if (len < state->left)
769 		len &= ~7;
770 
771 	/* Allocate buffer */
772 	frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
773 			 state->hroom + state->troom, GFP_ATOMIC);
774 	if (!frag)
775 		return ERR_PTR(-ENOMEM);
776 
777 	/*
778 	 *	Set up data on packet
779 	 */
780 
781 	ip6_copy_metadata(frag, skb);
782 	skb_reserve(frag, state->hroom);
783 	skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
784 	skb_reset_network_header(frag);
785 	fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
786 	frag->transport_header = (frag->network_header + state->hlen +
787 				  sizeof(struct frag_hdr));
788 
789 	/*
790 	 *	Charge the memory for the fragment to any owner
791 	 *	it might possess
792 	 */
793 	if (skb->sk)
794 		skb_set_owner_w(frag, skb->sk);
795 
796 	/*
797 	 *	Copy the packet header into the new buffer.
798 	 */
799 	skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);
800 
801 	fragnexthdr_offset = skb_network_header(frag);
802 	fragnexthdr_offset += prevhdr - skb_network_header(skb);
803 	*fragnexthdr_offset = NEXTHDR_FRAGMENT;
804 
805 	/*
806 	 *	Build fragment header.
807 	 */
808 	fh->nexthdr = state->nexthdr;
809 	fh->reserved = 0;
810 	fh->identification = state->frag_id;
811 
812 	/*
813 	 *	Copy a block of the IP datagram.
814 	 */
815 	BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
816 			     len));
817 	state->left -= len;
818 
819 	fh->frag_off = htons(state->offset);
820 	if (state->left > 0)
821 		fh->frag_off |= htons(IP6_MF);
822 	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
823 
824 	state->ptr += len;
825 	state->offset += len;
826 
827 	return frag;
828 }
829 EXPORT_SYMBOL(ip6_frag_next);
830 
831 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
832 		 int (*output)(struct net *, struct sock *, struct sk_buff *))
833 {
834 	struct sk_buff *frag;
835 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
836 	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
837 				inet6_sk(skb->sk) : NULL;
838 	struct ip6_frag_state state;
839 	unsigned int mtu, hlen, nexthdr_offset;
840 	ktime_t tstamp = skb->tstamp;
841 	int hroom, err = 0;
842 	__be32 frag_id;
843 	u8 *prevhdr, nexthdr = 0;
844 
845 	err = ip6_find_1stfragopt(skb, &prevhdr);
846 	if (err < 0)
847 		goto fail;
848 	hlen = err;
849 	nexthdr = *prevhdr;
850 	nexthdr_offset = prevhdr - skb_network_header(skb);
851 
852 	mtu = ip6_skb_dst_mtu(skb);
853 
854 	/* We must not fragment if the socket is set to force MTU discovery
855 	 * or if the skb was not generated by a local socket.
856 	 */
857 	if (unlikely(!skb->ignore_df && skb->len > mtu))
858 		goto fail_toobig;
859 
860 	if (IP6CB(skb)->frag_max_size) {
861 		if (IP6CB(skb)->frag_max_size > mtu)
862 			goto fail_toobig;
863 
864 		/* don't send fragments larger than what we received */
865 		mtu = IP6CB(skb)->frag_max_size;
866 		if (mtu < IPV6_MIN_MTU)
867 			mtu = IPV6_MIN_MTU;
868 	}
869 
870 	if (np && np->frag_size < mtu) {
871 		if (np->frag_size)
872 			mtu = np->frag_size;
873 	}
874 	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
875 		goto fail_toobig;
876 	mtu -= hlen + sizeof(struct frag_hdr);
877 
878 	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
879 				    &ipv6_hdr(skb)->saddr);
880 
881 	if (skb->ip_summed == CHECKSUM_PARTIAL &&
882 	    (err = skb_checksum_help(skb)))
883 		goto fail;
884 
885 	prevhdr = skb_network_header(skb) + nexthdr_offset;
886 	hroom = LL_RESERVED_SPACE(rt->dst.dev);
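	/* Fast path: when the skb already carries a frag list whose geometry
	 * matches the MTU, reuse those buffers as the fragments; otherwise
	 * fall through to the slow path, which copies the data into newly
	 * allocated skbs.
	 */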
887 	if (skb_has_frag_list(skb)) {
888 		unsigned int first_len = skb_pagelen(skb);
889 		struct ip6_fraglist_iter iter;
890 		struct sk_buff *frag2;
891 
892 		if (first_len - hlen > mtu ||
893 		    ((first_len - hlen) & 7) ||
894 		    skb_cloned(skb) ||
895 		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
896 			goto slow_path;
897 
898 		skb_walk_frags(skb, frag) {
899 			/* Correct geometry. */
900 			if (frag->len > mtu ||
901 			    ((frag->len & 7) && frag->next) ||
902 			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
903 				goto slow_path_clean;
904 
905 			/* Partially cloned skb? */
906 			if (skb_shared(frag))
907 				goto slow_path_clean;
908 
909 			BUG_ON(frag->sk);
910 			if (skb->sk) {
911 				frag->sk = skb->sk;
912 				frag->destructor = sock_wfree;
913 			}
914 			skb->truesize -= frag->truesize;
915 		}
916 
917 		err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
918 					&iter);
919 		if (err < 0)
920 			goto fail;
921 
922 		for (;;) {
923 			/* Prepare header of the next frame,
924 			 * before previous one went down. */
925 			if (iter.frag)
926 				ip6_fraglist_prepare(skb, &iter);
927 
928 			skb->tstamp = tstamp;
929 			err = output(net, sk, skb);
930 			if (!err)
931 				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
932 					      IPSTATS_MIB_FRAGCREATES);
933 
934 			if (err || !iter.frag)
935 				break;
936 
937 			skb = ip6_fraglist_next(&iter);
938 		}
939 
940 		kfree(iter.tmp_hdr);
941 
942 		if (err == 0) {
943 			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
944 				      IPSTATS_MIB_FRAGOKS);
945 			return 0;
946 		}
947 
948 		kfree_skb_list(iter.frag);
949 
950 		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
951 			      IPSTATS_MIB_FRAGFAILS);
952 		return err;
953 
954 slow_path_clean:
955 		skb_walk_frags(skb, frag2) {
956 			if (frag2 == frag)
957 				break;
958 			frag2->sk = NULL;
959 			frag2->destructor = NULL;
960 			skb->truesize += frag2->truesize;
961 		}
962 	}
963 
964 slow_path:
965 	/*
966 	 *	Fragment the datagram.
967 	 */
968 
969 	ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
970 		      LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
971 		      &state);
972 
973 	/*
974 	 *	Keep copying data until we run out.
975 	 */
976 
977 	while (state.left > 0) {
978 		frag = ip6_frag_next(skb, &state);
979 		if (IS_ERR(frag)) {
980 			err = PTR_ERR(frag);
981 			goto fail;
982 		}
983 
984 		/*
985 		 *	Put this fragment into the sending queue.
986 		 */
987 		frag->tstamp = tstamp;
988 		err = output(net, sk, frag);
989 		if (err)
990 			goto fail;
991 
992 		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
993 			      IPSTATS_MIB_FRAGCREATES);
994 	}
995 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
996 		      IPSTATS_MIB_FRAGOKS);
997 	consume_skb(skb);
998 	return err;
999 
1000 fail_toobig:
1001 	if (skb->sk && dst_allfrag(skb_dst(skb)))
1002 		sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
1003 
1004 	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
1005 	err = -EMSGSIZE;
1006 
1007 fail:
1008 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1009 		      IPSTATS_MIB_FRAGFAILS);
1010 	kfree_skb(skb);
1011 	return err;
1012 }
1013 
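/* Returns non-zero when the cached route no longer matches @fl_addr:
 * neither an exact host-route (/128) match nor a hit on the cached
 * destination address.
 */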
1014 static inline int ip6_rt_check(const struct rt6key *rt_key,
1015 			       const struct in6_addr *fl_addr,
1016 			       const struct in6_addr *addr_cache)
1017 {
1018 	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
1019 		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
1020 }
1021 
1022 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
1023 					  struct dst_entry *dst,
1024 					  const struct flowi6 *fl6)
1025 {
1026 	struct ipv6_pinfo *np = inet6_sk(sk);
1027 	struct rt6_info *rt;
1028 
1029 	if (!dst)
1030 		goto out;
1031 
1032 	if (dst->ops->family != AF_INET6) {
1033 		dst_release(dst);
1034 		return NULL;
1035 	}
1036 
1037 	rt = (struct rt6_info *)dst;
1038 	/* Yes, checking route validity in the not-connected
1039 	 * case is not very simple. Take into account
1040 	 * that we do not support routing by source, TOS,
1041 	 * and MSG_DONTROUTE		--ANK (980726)
1042 	 *
1043 	 * 1. ip6_rt_check(): If the route is a host route,
1044 	 *    check that the cached destination is current.
1045 	 *    If it is a network route, we may still
1046 	 *    check its validity using a saved pointer
1047 	 *    to the last used address: daddr_cache.
1048 	 *    We do not want to save the whole address now
1049 	 *    (because the main consumer of this service
1050 	 *    is TCP, which does not have this problem),
1051 	 *    so this last trick works only on connected
1052 	 *    sockets.
1053 	 * 2. oif should also be the same.
1054 	 */
1055 	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
1056 #ifdef CONFIG_IPV6_SUBTREES
1057 	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
1058 #endif
1059 	   (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
1060 	      (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
1061 		dst_release(dst);
1062 		dst = NULL;
1063 	}
1064 
1065 out:
1066 	return dst;
1067 }
1068 
1069 static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
1070 			       struct dst_entry **dst, struct flowi6 *fl6)
1071 {
1072 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1073 	struct neighbour *n;
1074 	struct rt6_info *rt;
1075 #endif
1076 	int err;
1077 	int flags = 0;
1078 
1079 	/* The correct way to handle this would be to do
1080 	 * ip6_route_get_saddr, and then ip6_route_output; however,
1081 	 * the route-specific preferred source forces the
1082 	 * ip6_route_output call _before_ ip6_route_get_saddr.
1083 	 *
1084 	 * In source specific routing (no src=any default route),
1085 	 * ip6_route_output will fail given src=any saddr, though, so
1086 	 * that's why we try it again later.
1087 	 */
1088 	if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
1089 		struct fib6_info *from;
1090 		struct rt6_info *rt;
1091 		bool had_dst = *dst != NULL;
1092 
1093 		if (!had_dst)
1094 			*dst = ip6_route_output(net, sk, fl6);
1095 		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
1096 
1097 		rcu_read_lock();
1098 		from = rt ? rcu_dereference(rt->from) : NULL;
1099 		err = ip6_route_get_saddr(net, from, &fl6->daddr,
1100 					  sk ? inet6_sk(sk)->srcprefs : 0,
1101 					  &fl6->saddr);
1102 		rcu_read_unlock();
1103 
1104 		if (err)
1105 			goto out_err_release;
1106 
1107 		/* If we had an erroneous initial result, pretend it
1108 		 * never existed and let the SA-enabled version take
1109 		 * over.
1110 		 */
1111 		if (!had_dst && (*dst)->error) {
1112 			dst_release(*dst);
1113 			*dst = NULL;
1114 		}
1115 
1116 		if (fl6->flowi6_oif)
1117 			flags |= RT6_LOOKUP_F_IFACE;
1118 	}
1119 
1120 	if (!*dst)
1121 		*dst = ip6_route_output_flags(net, sk, fl6, flags);
1122 
1123 	err = (*dst)->error;
1124 	if (err)
1125 		goto out_err_release;
1126 
1127 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1128 	/*
1129 	 * Here if the dst entry we've looked up
1130 	 * has a neighbour entry that is in the INCOMPLETE
1131 	 * state and the src address from the flow is
1132 	 * marked as OPTIMISTIC, we release the found
1133 	 * dst entry and replace it instead with the
1134 	 * dst entry of the nexthop router
1135 	 */
1136 	rt = (struct rt6_info *) *dst;
1137 	rcu_read_lock_bh();
1138 	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
1139 				      rt6_nexthop(rt, &fl6->daddr));
1140 	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
1141 	rcu_read_unlock_bh();
1142 
1143 	if (err) {
1144 		struct inet6_ifaddr *ifp;
1145 		struct flowi6 fl_gw6;
1146 		int redirect;
1147 
1148 		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
1149 				      (*dst)->dev, 1);
1150 
1151 		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
1152 		if (ifp)
1153 			in6_ifa_put(ifp);
1154 
1155 		if (redirect) {
1156 			/*
1157 			 * We need to get the dst entry for the
1158 			 * default router instead
1159 			 */
1160 			dst_release(*dst);
1161 			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1162 			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1163 			*dst = ip6_route_output(net, sk, &fl_gw6);
1164 			err = (*dst)->error;
1165 			if (err)
1166 				goto out_err_release;
1167 		}
1168 	}
1169 #endif
1170 	if (ipv6_addr_v4mapped(&fl6->saddr) &&
1171 	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
1172 		err = -EAFNOSUPPORT;
1173 		goto out_err_release;
1174 	}
1175 
1176 	return 0;
1177 
1178 out_err_release:
1179 	dst_release(*dst);
1180 	*dst = NULL;
1181 
1182 	if (err == -ENETUNREACH)
1183 		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1184 	return err;
1185 }
1186 
1187 /**
1188  *	ip6_dst_lookup - perform route lookup on flow
1189  *	@net: Network namespace to perform lookup in
1190  *	@sk: socket which provides route info
1191  *	@dst: pointer to dst_entry * for result
1192  *	@fl6: flow to lookup
1193  *
1194  *	This function performs a route lookup on the given flow.
1195  *
1196  *	It returns zero on success, or a standard errno code on error.
1197  */
1198 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1199 		   struct flowi6 *fl6)
1200 {
1201 	*dst = NULL;
1202 	return ip6_dst_lookup_tail(net, sk, dst, fl6);
1203 }
1204 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1205 
1206 /**
1207  *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1208  *	@net: Network namespace to perform lookup in
1209  *	@sk: socket which provides route info
1210  *	@fl6: flow to lookup
1211  *	@final_dst: final destination address for ipsec lookup
1212  *
1213  *	This function performs a route lookup on the given flow.
1214  *
1215  *	It returns a valid dst pointer on success, or a pointer encoded
1216  *	error code.
1217  */
1218 struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
1219 				      const struct in6_addr *final_dst)
1220 {
1221 	struct dst_entry *dst = NULL;
1222 	int err;
1223 
1224 	err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
1225 	if (err)
1226 		return ERR_PTR(err);
1227 	if (final_dst)
1228 		fl6->daddr = *final_dst;
1229 
1230 	return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
1231 }
1232 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1233 
1234 /**
1235  *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1236  *	@sk: socket which provides the dst cache and route info
1237  *	@fl6: flow to lookup
1238  *	@final_dst: final destination address for ipsec lookup
1239  *	@connected: whether @sk is connected or not
1240  *
1241  *	This function performs a route lookup on the given flow with the
1242  *	possibility of using the cached route in the socket if it is valid.
1243  *	It will take the socket dst lock when operating on the dst cache.
1244  *	As a result, this function can only be used in process context.
1245  *
1246  *	In addition, for a connected socket, cache the dst in the socket
1247  *	if the current cache is not valid.
1248  *
1249  *	It returns a valid dst pointer on success, or a pointer encoded
1250  *	error code.
1251  */
1252 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1253 					 const struct in6_addr *final_dst,
1254 					 bool connected)
1255 {
1256 	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1257 
1258 	dst = ip6_sk_dst_check(sk, dst, fl6);
1259 	if (dst)
1260 		return dst;
1261 
1262 	dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
1263 	if (connected && !IS_ERR(dst))
1264 		ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);
1265 
1266 	return dst;
1267 }
1268 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1269 
1270 /**
1271  *      ip6_dst_lookup_tunnel - perform route lookup on tunnel
1272  *      @skb: Packet for which lookup is done
1273  *      @dev: Tunnel device
1274  *      @net: Network namespace of tunnel device
1275  *      @sock: Socket which provides route info
1276  *      @saddr: Memory to store the src ip address
1277  *      @info: Tunnel information
1278  *      @protocol: IP protocol
1279  *      @use_cache: Flag to enable cache usage
1280  *      This function performs a route lookup on a tunnel
1281  *
1282  *      It returns a valid dst pointer and stores src address to be used in
1283  *      tunnel in param saddr on success, else a pointer encoded error code.
1284  */
1285 
1286 struct dst_entry *ip6_dst_lookup_tunnel(struct sk_buff *skb,
1287 					struct net_device *dev,
1288 					struct net *net,
1289 					struct socket *sock,
1290 					struct in6_addr *saddr,
1291 					const struct ip_tunnel_info *info,
1292 					u8 protocol,
1293 					bool use_cache)
1294 {
1295 	struct dst_entry *dst = NULL;
1296 #ifdef CONFIG_DST_CACHE
1297 	struct dst_cache *dst_cache;
1298 #endif
1299 	struct flowi6 fl6;
1300 	__u8 prio;
1301 
1302 #ifdef CONFIG_DST_CACHE
1303 	dst_cache = (struct dst_cache *)&info->dst_cache;
1304 	if (use_cache) {
1305 		dst = dst_cache_get_ip6(dst_cache, saddr);
1306 		if (dst)
1307 			return dst;
1308 	}
1309 #endif
1310 	memset(&fl6, 0, sizeof(fl6));
1311 	fl6.flowi6_mark = skb->mark;
1312 	fl6.flowi6_proto = protocol;
1313 	fl6.daddr = info->key.u.ipv6.dst;
1314 	fl6.saddr = info->key.u.ipv6.src;
1315 	prio = info->key.tos;
1316 	fl6.flowlabel = ip6_make_flowinfo(RT_TOS(prio),
1317 					  info->key.label);
1318 
1319 	dst = ipv6_stub->ipv6_dst_lookup_flow(net, sock->sk, &fl6,
1320 					      NULL);
1321 	if (IS_ERR(dst)) {
1322 		netdev_dbg(dev, "no route to %pI6\n", &fl6.daddr);
1323 		return ERR_PTR(-ENETUNREACH);
1324 	}
1325 	if (dst->dev == dev) { /* is this necessary? */
1326 		netdev_dbg(dev, "circular route to %pI6\n", &fl6.daddr);
1327 		dst_release(dst);
1328 		return ERR_PTR(-ELOOP);
1329 	}
1330 #ifdef CONFIG_DST_CACHE
1331 	if (use_cache)
1332 		dst_cache_set_ip6(dst_cache, dst, &fl6.saddr);
1333 #endif
1334 	*saddr = fl6.saddr;
1335 	return dst;
1336 }
1337 EXPORT_SYMBOL_GPL(ip6_dst_lookup_tunnel);
1338 
1339 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1340 					       gfp_t gfp)
1341 {
1342 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1343 }
1344 
1345 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1346 						gfp_t gfp)
1347 {
1348 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1349 }
1350 
1351 static void ip6_append_data_mtu(unsigned int *mtu,
1352 				int *maxfraglen,
1353 				unsigned int fragheaderlen,
1354 				struct sk_buff *skb,
1355 				struct rt6_info *rt,
1356 				unsigned int orig_mtu)
1357 {
1358 	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1359 		if (!skb) {
1360 			/* first fragment, reserve header_len */
1361 			*mtu = orig_mtu - rt->dst.header_len;
1362 
1363 		} else {
1364 			/*
1365 			 * this fragment is not the first; the header
1366 			 * space is regarded as data space.
1367 			 */
1368 			*mtu = orig_mtu;
1369 		}
1370 		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
1371 			      + fragheaderlen - sizeof(struct frag_hdr);
1372 	}
1373 }
1374 
1375 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1376 			  struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1377 			  struct rt6_info *rt, struct flowi6 *fl6)
1378 {
1379 	struct ipv6_pinfo *np = inet6_sk(sk);
1380 	unsigned int mtu;
1381 	struct ipv6_txoptions *opt = ipc6->opt;
1382 
1383 	/*
1384 	 * setup for corking
1385 	 */
1386 	if (opt) {
1387 		if (WARN_ON(v6_cork->opt))
1388 			return -EINVAL;
1389 
1390 		v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
1391 		if (unlikely(!v6_cork->opt))
1392 			return -ENOBUFS;
1393 
1394 		v6_cork->opt->tot_len = sizeof(*opt);
1395 		v6_cork->opt->opt_flen = opt->opt_flen;
1396 		v6_cork->opt->opt_nflen = opt->opt_nflen;
1397 
1398 		v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1399 						    sk->sk_allocation);
1400 		if (opt->dst0opt && !v6_cork->opt->dst0opt)
1401 			return -ENOBUFS;
1402 
1403 		v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1404 						    sk->sk_allocation);
1405 		if (opt->dst1opt && !v6_cork->opt->dst1opt)
1406 			return -ENOBUFS;
1407 
1408 		v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
1409 						   sk->sk_allocation);
1410 		if (opt->hopopt && !v6_cork->opt->hopopt)
1411 			return -ENOBUFS;
1412 
1413 		v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1414 						    sk->sk_allocation);
1415 		if (opt->srcrt && !v6_cork->opt->srcrt)
1416 			return -ENOBUFS;
1417 
1418 		/* need source address above miyazawa*/
1419 	}
1420 	dst_hold(&rt->dst);
1421 	cork->base.dst = &rt->dst;
1422 	cork->fl.u.ip6 = *fl6;
1423 	v6_cork->hop_limit = ipc6->hlimit;
1424 	v6_cork->tclass = ipc6->tclass;
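	/* With IPV6_PMTUDISC_PROBE (or stricter) the socket does its own
	 * path MTU probing, so use the device MTU instead of the cached
	 * route MTU.
	 */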
1425 	if (rt->dst.flags & DST_XFRM_TUNNEL)
1426 		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1427 		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
1428 	else
1429 		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1430 			READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
1431 	if (np->frag_size < mtu) {
1432 		if (np->frag_size)
1433 			mtu = np->frag_size;
1434 	}
1435 	if (mtu < IPV6_MIN_MTU)
1436 		return -EINVAL;
1437 	cork->base.fragsize = mtu;
1438 	cork->base.gso_size = ipc6->gso_size;
1439 	cork->base.tx_flags = 0;
1440 	cork->base.mark = ipc6->sockc.mark;
1441 	sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);
1442 
1443 	if (dst_allfrag(xfrm_dst_path(&rt->dst)))
1444 		cork->base.flags |= IPCORK_ALLFRAG;
1445 	cork->base.length = 0;
1446 
1447 	cork->base.transmit_time = ipc6->sockc.transmit_time;
1448 
1449 	return 0;
1450 }
1451 
1452 static int __ip6_append_data(struct sock *sk,
1453 			     struct flowi6 *fl6,
1454 			     struct sk_buff_head *queue,
1455 			     struct inet_cork *cork,
1456 			     struct inet6_cork *v6_cork,
1457 			     struct page_frag *pfrag,
1458 			     int getfrag(void *from, char *to, int offset,
1459 					 int len, int odd, struct sk_buff *skb),
1460 			     void *from, int length, int transhdrlen,
1461 			     unsigned int flags, struct ipcm6_cookie *ipc6)
1462 {
1463 	struct sk_buff *skb, *skb_prev = NULL;
1464 	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
1465 	struct ubuf_info *uarg = NULL;
1466 	int exthdrlen = 0;
1467 	int dst_exthdrlen = 0;
1468 	int hh_len;
1469 	int copy;
1470 	int err;
1471 	int offset = 0;
1472 	u32 tskey = 0;
1473 	struct rt6_info *rt = (struct rt6_info *)cork->dst;
1474 	struct ipv6_txoptions *opt = v6_cork->opt;
1475 	int csummode = CHECKSUM_NONE;
1476 	unsigned int maxnonfragsize, headersize;
1477 	unsigned int wmem_alloc_delta = 0;
1478 	bool paged, extra_uref = false;
1479 
1480 	skb = skb_peek_tail(queue);
1481 	if (!skb) {
1482 		exthdrlen = opt ? opt->opt_flen : 0;
1483 		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1484 	}
1485 
1486 	paged = !!cork->gso_size;
1487 	mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
1488 	orig_mtu = mtu;
1489 
1490 	if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
1491 	    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1492 		tskey = sk->sk_tskey++;
1493 
1494 	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1495 
1496 	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1497 			(opt ? opt->opt_nflen : 0);
1498 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1499 		     sizeof(struct frag_hdr);
1500 
1501 	headersize = sizeof(struct ipv6hdr) +
1502 		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1503 		     (dst_allfrag(&rt->dst) ?
1504 		      sizeof(struct frag_hdr) : 0) +
1505 		     rt->rt6i_nfheader_len;
1506 
1507 	/* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
1508 	 * in the first fragment
1509 	 */
1510 	if (headersize + transhdrlen > mtu)
1511 		goto emsgsize;
1512 
1513 	if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1514 	    (sk->sk_protocol == IPPROTO_UDP ||
1515 	     sk->sk_protocol == IPPROTO_RAW)) {
1516 		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1517 				sizeof(struct ipv6hdr));
1518 		goto emsgsize;
1519 	}
1520 
1521 	if (ip6_sk_ignore_df(sk))
1522 		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1523 	else
1524 		maxnonfragsize = mtu;
1525 
1526 	if (cork->length + length > maxnonfragsize - headersize) {
1527 emsgsize:
1528 		pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
1529 		ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
1530 		return -EMSGSIZE;
1531 	}
1532 
1533 	/* CHECKSUM_PARTIAL only with no extension headers and when
1534 	 * we are not going to fragment
1535 	 */
1536 	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1537 	    headersize == sizeof(struct ipv6hdr) &&
1538 	    length <= mtu - headersize &&
1539 	    (!(flags & MSG_MORE) || cork->gso_size) &&
1540 	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1541 		csummode = CHECKSUM_PARTIAL;
1542 
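	/* MSG_ZEROCOPY: reference the caller's pages through a ubuf_info
	 * instead of copying, but only if the device supports SG together
	 * with checksum offload; otherwise degrade to copying while still
	 * delivering the zerocopy completion notification.
	 */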
1543 	if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) {
1544 		uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb));
1545 		if (!uarg)
1546 			return -ENOBUFS;
1547 		extra_uref = !skb_zcopy(skb);	/* only ref on new uarg */
1548 		if (rt->dst.dev->features & NETIF_F_SG &&
1549 		    csummode == CHECKSUM_PARTIAL) {
1550 			paged = true;
1551 		} else {
1552 			uarg->zerocopy = 0;
1553 			skb_zcopy_set(skb, uarg, &extra_uref);
1554 		}
1555 	}
1556 
1557 	/*
1558 	 * Let's try using as much space as possible.
1559 	 * Use MTU if total length of the message fits into the MTU.
1560 	 * Otherwise, we need to reserve fragment header and
1561 	 * fragment alignment (= 8-15 octets, in total).
1562 	 *
1563 	 * Note that we may need to "move" the data from the tail
1564 	 * of the buffer to the new fragment when we split
1565 	 * the message.
1566 	 *
1567 	 * FIXME: It may be fragmented into multiple chunks
1568 	 *        at once if non-fragmentable extension headers
1569 	 *        are too large.
1570 	 * --yoshfuji
1571 	 */
1572 
1573 	cork->length += length;
1574 	if (!skb)
1575 		goto alloc_new_skb;
1576 
1577 	while (length > 0) {
1578 		/* Check if the remaining data fits into current packet. */
1579 		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1580 		if (copy < length)
1581 			copy = maxfraglen - skb->len;
1582 
1583 		if (copy <= 0) {
1584 			char *data;
1585 			unsigned int datalen;
1586 			unsigned int fraglen;
1587 			unsigned int fraggap;
1588 			unsigned int alloclen, alloc_extra;
1589 			unsigned int pagedlen;
1590 alloc_new_skb:
1591 			/* There's no room in the current skb */
1592 			if (skb)
1593 				fraggap = skb->len - maxfraglen;
1594 			else
1595 				fraggap = 0;
1596 			/* update mtu and maxfraglen if necessary */
1597 			if (!skb || !skb_prev)
1598 				ip6_append_data_mtu(&mtu, &maxfraglen,
1599 						    fragheaderlen, skb, rt,
1600 						    orig_mtu);
1601 
1602 			skb_prev = skb;
1603 
1604 			/*
1605 			 * If remaining data exceeds the mtu,
1606 			 * we know we need more fragment(s).
1607 			 */
1608 			datalen = length + fraggap;
1609 
1610 			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1611 				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1612 			fraglen = datalen + fragheaderlen;
1613 			pagedlen = 0;
1614 
1615 			alloc_extra = hh_len;
1616 			alloc_extra += dst_exthdrlen;
1617 			alloc_extra += rt->dst.trailer_len;
1618 
1619 			/* We just reserve space for fragment header.
1620 			 * Note: this may be overallocation if the message
1621 			 * (without MSG_MORE) fits into the MTU.
1622 			 */
1623 			alloc_extra += sizeof(struct frag_hdr);
1624 
1625 			if ((flags & MSG_MORE) &&
1626 			    !(rt->dst.dev->features&NETIF_F_SG))
1627 				alloclen = mtu;
1628 			else if (!paged &&
1629 				 (fraglen + alloc_extra < SKB_MAX_ALLOC ||
1630 				  !(rt->dst.dev->features & NETIF_F_SG)))
1631 				alloclen = fraglen;
1632 			else {
1633 				alloclen = min_t(int, fraglen, MAX_HEADER);
1634 				pagedlen = fraglen - alloclen;
1635 			}
1636 			alloclen += alloc_extra;
1637 
1638 			if (datalen != length + fraggap) {
1639 				/*
1640 				 * this is not the last fragment; the trailer
1641 				 * space is regarded as data space.
1642 				 */
1643 				datalen += rt->dst.trailer_len;
1644 			}
1645 
1646 			fraglen = datalen + fragheaderlen;
1647 
1648 			copy = datalen - transhdrlen - fraggap - pagedlen;
1649 			if (copy < 0) {
1650 				err = -EINVAL;
1651 				goto error;
1652 			}
1653 			if (transhdrlen) {
1654 				skb = sock_alloc_send_skb(sk, alloclen,
1655 						(flags & MSG_DONTWAIT), &err);
1656 			} else {
1657 				skb = NULL;
1658 				if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
1659 				    2 * sk->sk_sndbuf)
1660 					skb = alloc_skb(alloclen,
1661 							sk->sk_allocation);
1662 				if (unlikely(!skb))
1663 					err = -ENOBUFS;
1664 			}
1665 			if (!skb)
1666 				goto error;
1667 			/*
1668 			 *	Fill in the control structures
1669 			 */
1670 			skb->protocol = htons(ETH_P_IPV6);
1671 			skb->ip_summed = csummode;
1672 			skb->csum = 0;
1673 			/* reserve for fragmentation and ipsec header */
1674 			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1675 				    dst_exthdrlen);
1676 
1677 			/*
1678 			 *	Find where to start putting bytes
1679 			 */
1680 			data = skb_put(skb, fraglen - pagedlen);
1681 			skb_set_network_header(skb, exthdrlen);
1682 			data += fragheaderlen;
1683 			skb->transport_header = (skb->network_header +
1684 						 fragheaderlen);
1685 			if (fraggap) {
1686 				skb->csum = skb_copy_and_csum_bits(
1687 					skb_prev, maxfraglen,
1688 					data + transhdrlen, fraggap);
1689 				skb_prev->csum = csum_sub(skb_prev->csum,
1690 							  skb->csum);
1691 				data += fraggap;
1692 				pskb_trim_unique(skb_prev, maxfraglen);
1693 			}
1694 			if (copy > 0 &&
1695 			    getfrag(from, data + transhdrlen, offset,
1696 				    copy, fraggap, skb) < 0) {
1697 				err = -EFAULT;
1698 				kfree_skb(skb);
1699 				goto error;
1700 			}
1701 
1702 			offset += copy;
1703 			length -= copy + transhdrlen;
1704 			transhdrlen = 0;
1705 			exthdrlen = 0;
1706 			dst_exthdrlen = 0;
1707 
1708 			/* Only the initial fragment is time stamped */
1709 			skb_shinfo(skb)->tx_flags = cork->tx_flags;
1710 			cork->tx_flags = 0;
1711 			skb_shinfo(skb)->tskey = tskey;
1712 			tskey = 0;
1713 			skb_zcopy_set(skb, uarg, &extra_uref);
1714 
1715 			if ((flags & MSG_CONFIRM) && !skb_prev)
1716 				skb_set_dst_pending_confirm(skb, 1);
1717 
1718 			/*
1719 			 * Put the packet on the pending queue
1720 			 */
1721 			if (!skb->destructor) {
1722 				skb->destructor = sock_wfree;
1723 				skb->sk = sk;
1724 				wmem_alloc_delta += skb->truesize;
1725 			}
1726 			__skb_queue_tail(queue, skb);
1727 			continue;
1728 		}
1729 
1730 		if (copy > length)
1731 			copy = length;
1732 
1733 		if (!(rt->dst.dev->features&NETIF_F_SG) &&
1734 		    skb_tailroom(skb) >= copy) {
1735 			unsigned int off;
1736 
1737 			off = skb->len;
1738 			if (getfrag(from, skb_put(skb, copy),
1739 						offset, copy, off, skb) < 0) {
1740 				__skb_trim(skb, off);
1741 				err = -EFAULT;
1742 				goto error;
1743 			}
1744 		} else if (!uarg || !uarg->zerocopy) {
1745 			int i = skb_shinfo(skb)->nr_frags;
1746 
1747 			err = -ENOMEM;
1748 			if (!sk_page_frag_refill(sk, pfrag))
1749 				goto error;
1750 
1751 			if (!skb_can_coalesce(skb, i, pfrag->page,
1752 					      pfrag->offset)) {
1753 				err = -EMSGSIZE;
1754 				if (i == MAX_SKB_FRAGS)
1755 					goto error;
1756 
1757 				__skb_fill_page_desc(skb, i, pfrag->page,
1758 						     pfrag->offset, 0);
1759 				skb_shinfo(skb)->nr_frags = ++i;
1760 				get_page(pfrag->page);
1761 			}
1762 			copy = min_t(int, copy, pfrag->size - pfrag->offset);
1763 			if (getfrag(from,
1764 				    page_address(pfrag->page) + pfrag->offset,
1765 				    offset, copy, skb->len, skb) < 0)
1766 				goto error_efault;
1767 
1768 			pfrag->offset += copy;
1769 			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1770 			skb->len += copy;
1771 			skb->data_len += copy;
1772 			skb->truesize += copy;
1773 			wmem_alloc_delta += copy;
1774 		} else {
1775 			err = skb_zerocopy_iter_dgram(skb, from, copy);
1776 			if (err < 0)
1777 				goto error;
1778 		}
1779 		offset += copy;
1780 		length -= copy;
1781 	}
1782 
1783 	if (wmem_alloc_delta)
1784 		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1785 	return 0;
1786 
1787 error_efault:
1788 	err = -EFAULT;
1789 error:
1790 	if (uarg)
1791 		sock_zerocopy_put_abort(uarg, extra_uref);
1792 	cork->length -= length;
1793 	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1794 	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1795 	return err;
1796 }
1797 
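/*
 *	ip6_append_data() queues payload on sk->sk_write_queue ("corking").
 *	The first call sets up the cork from @ipc6/@rt/@fl6; later calls
 *	reuse the flow stored in the cork and run with transhdrlen == 0, so
 *	only the first skb carries room for the transport header.  The queue
 *	is turned into a packet by ip6_push_pending_frames() or discarded by
 *	ip6_flush_pending_frames().
 */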
1798 int ip6_append_data(struct sock *sk,
1799 		    int getfrag(void *from, char *to, int offset, int len,
1800 				int odd, struct sk_buff *skb),
1801 		    void *from, int length, int transhdrlen,
1802 		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1803 		    struct rt6_info *rt, unsigned int flags)
1804 {
1805 	struct inet_sock *inet = inet_sk(sk);
1806 	struct ipv6_pinfo *np = inet6_sk(sk);
1807 	int exthdrlen;
1808 	int err;
1809 
1810 	if (flags&MSG_PROBE)
1811 		return 0;
1812 	if (skb_queue_empty(&sk->sk_write_queue)) {
1813 		/*
1814 		 * setup for corking
1815 		 */
1816 		err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1817 				     ipc6, rt, fl6);
1818 		if (err)
1819 			return err;
1820 
1821 		exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1822 		length += exthdrlen;
1823 		transhdrlen += exthdrlen;
1824 	} else {
1825 		fl6 = &inet->cork.fl.u.ip6;
1826 		transhdrlen = 0;
1827 	}
1828 
1829 	return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1830 				 &np->cork, sk_page_frag(sk), getfrag,
1831 				 from, length, transhdrlen, flags, ipc6);
1832 }
1833 EXPORT_SYMBOL_GPL(ip6_append_data);
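
/*
 * Illustrative caller sequence (a simplified sketch, loosely modelled on
 * udp_v6_sendmsg(); routing setup and most error handling are elided):
 *
 *	lock_sock(sk);
 *	err = ip6_append_data(sk, ip_generic_getfrag, msg, ulen,
 *			      sizeof(struct udphdr), &ipc6, &fl6,
 *			      (struct rt6_info *)dst, msg->msg_flags);
 *	if (err)
 *		ip6_flush_pending_frames(sk);
 *	else if (!(msg->msg_flags & MSG_MORE))
 *		err = ip6_push_pending_frames(sk);
 *	release_sock(sk);
 */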
1834 
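/*
 *	ip6_cork_release() undoes ip6_setup_cork(): it frees the duplicated
 *	tx options, drops the reference on the cached route and clears the
 *	stored flow.
 */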
1835 static void ip6_cork_release(struct inet_cork_full *cork,
1836 			     struct inet6_cork *v6_cork)
1837 {
1838 	if (v6_cork->opt) {
1839 		kfree(v6_cork->opt->dst0opt);
1840 		kfree(v6_cork->opt->dst1opt);
1841 		kfree(v6_cork->opt->hopopt);
1842 		kfree(v6_cork->opt->srcrt);
1843 		kfree(v6_cork->opt);
1844 		v6_cork->opt = NULL;
1845 	}
1846 
1847 	if (cork->base.dst) {
1848 		dst_release(cork->base.dst);
1849 		cork->base.dst = NULL;
1850 		cork->base.flags &= ~IPCORK_ALLFRAG;
1851 	}
1852 	memset(&cork->fl, 0, sizeof(cork->fl));
1853 }
1854 
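/*
 *	__ip6_make_skb() collapses the queued fragments into one skb: the
 *	tail skbs are chained onto the first skb's frag_list, the extension
 *	headers and the IPv6 header are pushed, and the cork is released.
 *	The result is ready to be handed to ip6_send_skb().
 */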
1855 struct sk_buff *__ip6_make_skb(struct sock *sk,
1856 			       struct sk_buff_head *queue,
1857 			       struct inet_cork_full *cork,
1858 			       struct inet6_cork *v6_cork)
1859 {
1860 	struct sk_buff *skb, *tmp_skb;
1861 	struct sk_buff **tail_skb;
1862 	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1863 	struct ipv6_pinfo *np = inet6_sk(sk);
1864 	struct net *net = sock_net(sk);
1865 	struct ipv6hdr *hdr;
1866 	struct ipv6_txoptions *opt = v6_cork->opt;
1867 	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1868 	struct flowi6 *fl6 = &cork->fl.u.ip6;
1869 	unsigned char proto = fl6->flowi6_proto;
1870 
1871 	skb = __skb_dequeue(queue);
1872 	if (!skb)
1873 		goto out;
1874 	tail_skb = &(skb_shinfo(skb)->frag_list);
1875 
1876 	/* move skb->data to ip header from ext header */
1877 	if (skb->data < skb_network_header(skb))
1878 		__skb_pull(skb, skb_network_offset(skb));
1879 	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1880 		__skb_pull(tmp_skb, skb_network_header_len(skb));
1881 		*tail_skb = tmp_skb;
1882 		tail_skb = &(tmp_skb->next);
1883 		skb->len += tmp_skb->len;
1884 		skb->data_len += tmp_skb->len;
1885 		skb->truesize += tmp_skb->truesize;
1886 		tmp_skb->destructor = NULL;
1887 		tmp_skb->sk = NULL;
1888 	}
1889 
1890 	/* Allow local fragmentation. */
1891 	skb->ignore_df = ip6_sk_ignore_df(sk);
1892 
1893 	*final_dst = fl6->daddr;
1894 	__skb_pull(skb, skb_network_header_len(skb));
1895 	if (opt && opt->opt_flen)
1896 		ipv6_push_frag_opts(skb, opt, &proto);
1897 	if (opt && opt->opt_nflen)
1898 		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
1899 
1900 	skb_push(skb, sizeof(struct ipv6hdr));
1901 	skb_reset_network_header(skb);
1902 	hdr = ipv6_hdr(skb);
1903 
1904 	ip6_flow_hdr(hdr, v6_cork->tclass,
1905 		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
1906 					ip6_autoflowlabel(net, np), fl6));
1907 	hdr->hop_limit = v6_cork->hop_limit;
1908 	hdr->nexthdr = proto;
1909 	hdr->saddr = fl6->saddr;
1910 	hdr->daddr = *final_dst;
1911 
1912 	skb->priority = sk->sk_priority;
1913 	skb->mark = cork->base.mark;
1914 
1915 	skb->tstamp = cork->base.transmit_time;
1916 
1917 	skb_dst_set(skb, dst_clone(&rt->dst));
1918 	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1919 	if (proto == IPPROTO_ICMPV6) {
1920 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1921 
1922 		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
1923 		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1924 	}
1925 
1926 	ip6_cork_release(cork, v6_cork);
1927 out:
1928 	return skb;
1929 }
1930 
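/*
 *	ip6_send_skb() hands a finished skb to ip6_local_out(); a positive
 *	NET_XMIT_* return is mapped through net_xmit_errno(), and the packet
 *	is counted as an OUTDISCARD if it was not sent.
 */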
1931 int ip6_send_skb(struct sk_buff *skb)
1932 {
1933 	struct net *net = sock_net(skb->sk);
1934 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1935 	int err;
1936 
1937 	err = ip6_local_out(net, skb->sk, skb);
1938 	if (err) {
1939 		if (err > 0)
1940 			err = net_xmit_errno(err);
1941 		if (err)
1942 			IP6_INC_STATS(net, rt->rt6i_idev,
1943 				      IPSTATS_MIB_OUTDISCARDS);
1944 	}
1945 
1946 	return err;
1947 }
1948 
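/*
 *	ip6_push_pending_frames() finalises a corked socket: the skbs queued
 *	by ip6_append_data() are merged by ip6_finish_skb() and transmitted.
 */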
1949 int ip6_push_pending_frames(struct sock *sk)
1950 {
1951 	struct sk_buff *skb;
1952 
1953 	skb = ip6_finish_skb(sk);
1954 	if (!skb)
1955 		return 0;
1956 
1957 	return ip6_send_skb(skb);
1958 }
1959 EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1960 
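/*
 *	__ip6_flush_pending_frames() throws away everything queued so far,
 *	counting each dropped skb that carries a dst as an OUTDISCARD, and
 *	releases the cork.  ip6_flush_pending_frames() below is the wrapper
 *	operating on the socket's own write queue and cork.
 */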
1961 static void __ip6_flush_pending_frames(struct sock *sk,
1962 				       struct sk_buff_head *queue,
1963 				       struct inet_cork_full *cork,
1964 				       struct inet6_cork *v6_cork)
1965 {
1966 	struct sk_buff *skb;
1967 
1968 	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1969 		if (skb_dst(skb))
1970 			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1971 				      IPSTATS_MIB_OUTDISCARDS);
1972 		kfree_skb(skb);
1973 	}
1974 
1975 	ip6_cork_release(cork, v6_cork);
1976 }
1977 
1978 void ip6_flush_pending_frames(struct sock *sk)
1979 {
1980 	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1981 				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1982 }
1983 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1984 
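/*
 *	ip6_make_skb() is the single-shot variant used when nothing is corked
 *	on the socket: it builds the whole datagram on a private queue with a
 *	caller-supplied cork and returns the finished skb (or an ERR_PTR),
 *	leaving sk->sk_write_queue untouched.
 */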
1985 struct sk_buff *ip6_make_skb(struct sock *sk,
1986 			     int getfrag(void *from, char *to, int offset,
1987 					 int len, int odd, struct sk_buff *skb),
1988 			     void *from, int length, int transhdrlen,
1989 			     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1990 			     struct rt6_info *rt, unsigned int flags,
1991 			     struct inet_cork_full *cork)
1992 {
1993 	struct inet6_cork v6_cork;
1994 	struct sk_buff_head queue;
1995 	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1996 	int err;
1997 
1998 	if (flags & MSG_PROBE)
1999 		return NULL;
2000 
2001 	__skb_queue_head_init(&queue);
2002 
2003 	cork->base.flags = 0;
2004 	cork->base.addr = 0;
2005 	cork->base.opt = NULL;
2006 	cork->base.dst = NULL;
2007 	v6_cork.opt = NULL;
2008 	err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6);
2009 	if (err) {
2010 		ip6_cork_release(cork, &v6_cork);
2011 		return ERR_PTR(err);
2012 	}
2013 	if (ipc6->dontfrag < 0)
2014 		ipc6->dontfrag = inet6_sk(sk)->dontfrag;
2015 
2016 	err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork,
2017 				&current->task_frag, getfrag, from,
2018 				length + exthdrlen, transhdrlen + exthdrlen,
2019 				flags, ipc6);
2020 	if (err) {
2021 		__ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
2022 		return ERR_PTR(err);
2023 	}
2024 
2025 	return __ip6_make_skb(sk, &queue, cork, &v6_cork);
2026 }
2027