• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  *	IPv6 output functions
4  *	Linux INET6 implementation
5  *
6  *	Authors:
7  *	Pedro Roque		<roque@di.fc.ul.pt>
8  *
9  *	Based on linux/net/ipv4/ip_output.c
10  *
11  *	Changes:
12  *	A.N.Kuznetsov	:	arithmetic in fragmentation.
13  *				extension headers are implemented.
14  *				route changes now work.
15  *				ip6_forward does not confuse sniffers.
16  *				etc.
17  *
18  *      H. von Brand    :       Added missing #include <linux/string.h>
19  *	Imran Patel	:	frag id should be in NBO
20  *      Kazunori MIYAZAWA @USAGI
21  *			:       add ip6_append_data and related functions
22  *				for datagram xmit
23  */
24 
25 #include <linux/errno.h>
26 #include <linux/kernel.h>
27 #include <linux/string.h>
28 #include <linux/socket.h>
29 #include <linux/net.h>
30 #include <linux/netdevice.h>
31 #include <linux/if_arp.h>
32 #include <linux/in6.h>
33 #include <linux/tcp.h>
34 #include <linux/route.h>
35 #include <linux/module.h>
36 #include <linux/slab.h>
37 
38 #include <linux/bpf-cgroup.h>
39 #include <linux/netfilter.h>
40 #include <linux/netfilter_ipv6.h>
41 
42 #include <net/sock.h>
43 #include <net/snmp.h>
44 
45 #include <net/ipv6.h>
46 #include <net/ndisc.h>
47 #include <net/protocol.h>
48 #include <net/ip6_route.h>
49 #include <net/addrconf.h>
50 #include <net/rawv6.h>
51 #include <net/icmp.h>
52 #include <net/xfrm.h>
53 #include <net/checksum.h>
54 #include <linux/mroute6.h>
55 #include <net/l3mdev.h>
56 #include <net/lwtunnel.h>
57 #include <net/ip_tunnels.h>
58 
ip6_finish_output2(struct net * net,struct sock * sk,struct sk_buff * skb)59 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
60 {
61 	struct dst_entry *dst = skb_dst(skb);
62 	struct net_device *dev = dst->dev;
63 	unsigned int hh_len = LL_RESERVED_SPACE(dev);
64 	int delta = hh_len - skb_headroom(skb);
65 	const struct in6_addr *nexthop;
66 	struct neighbour *neigh;
67 	int ret;
68 
69 	/* Be paranoid, rather than too clever. */
70 	if (unlikely(delta > 0) && dev->header_ops) {
71 		/* pskb_expand_head() might crash, if skb is shared */
72 		if (skb_shared(skb)) {
73 			struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
74 
75 			if (likely(nskb)) {
76 				if (skb->sk)
77 					skb_set_owner_w(nskb, skb->sk);
78 				consume_skb(skb);
79 			} else {
80 				kfree_skb(skb);
81 			}
82 			skb = nskb;
83 		}
84 		if (skb &&
85 		    pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {
86 			kfree_skb(skb);
87 			skb = NULL;
88 		}
89 		if (!skb) {
90 			IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
91 			return -ENOMEM;
92 		}
93 	}
94 
95 	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
96 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
97 
98 		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
99 		    ((mroute6_is_socket(net, skb) &&
100 		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
101 		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
102 					 &ipv6_hdr(skb)->saddr))) {
103 			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
104 
105 			/* Do not check for IFF_ALLMULTI; multicast routing
106 			   is not supported in any case.
107 			 */
108 			if (newskb)
109 				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
110 					net, sk, newskb, NULL, newskb->dev,
111 					dev_loopback_xmit);
112 
113 			if (ipv6_hdr(skb)->hop_limit == 0) {
114 				IP6_INC_STATS(net, idev,
115 					      IPSTATS_MIB_OUTDISCARDS);
116 				kfree_skb(skb);
117 				return 0;
118 			}
119 		}
120 
121 		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
122 
123 		if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
124 		    IPV6_ADDR_SCOPE_NODELOCAL &&
125 		    !(dev->flags & IFF_LOOPBACK)) {
126 			kfree_skb(skb);
127 			return 0;
128 		}
129 	}
130 
131 	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
132 		int res = lwtunnel_xmit(skb);
133 
134 		if (res < 0 || res == LWTUNNEL_XMIT_DONE)
135 			return res;
136 	}
137 
138 	rcu_read_lock_bh();
139 	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
140 	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
141 	if (unlikely(!neigh))
142 		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
143 	if (!IS_ERR(neigh)) {
144 		sock_confirm_neigh(skb, neigh);
145 		ret = neigh_output(neigh, skb, false);
146 		rcu_read_unlock_bh();
147 		return ret;
148 	}
149 	rcu_read_unlock_bh();
150 
151 	IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
152 	kfree_skb(skb);
153 	return -EINVAL;
154 }
155 
156 static int
ip6_finish_output_gso_slowpath_drop(struct net * net,struct sock * sk,struct sk_buff * skb,unsigned int mtu)157 ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
158 				    struct sk_buff *skb, unsigned int mtu)
159 {
160 	struct sk_buff *segs, *nskb;
161 	netdev_features_t features;
162 	int ret = 0;
163 
164 	/* Please see corresponding comment in ip_finish_output_gso
165 	 * describing the cases where GSO segment length exceeds the
166 	 * egress MTU.
167 	 */
168 	features = netif_skb_features(skb);
169 	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
170 	if (IS_ERR_OR_NULL(segs)) {
171 		kfree_skb(skb);
172 		return -ENOMEM;
173 	}
174 
175 	consume_skb(skb);
176 
177 	skb_list_walk_safe(segs, segs, nskb) {
178 		int err;
179 
180 		skb_mark_not_on_list(segs);
181 		err = ip6_fragment(net, sk, segs, ip6_finish_output2);
182 		if (err && ret == 0)
183 			ret = err;
184 	}
185 
186 	return ret;
187 }
188 
__ip6_finish_output(struct net * net,struct sock * sk,struct sk_buff * skb)189 static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
190 {
191 	unsigned int mtu;
192 
193 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
194 	/* Policy lookup after SNAT yielded a new policy */
195 	if (skb_dst(skb)->xfrm) {
196 		IP6CB(skb)->flags |= IP6SKB_REROUTED;
197 		return dst_output(net, sk, skb);
198 	}
199 #endif
200 
201 	mtu = ip6_skb_dst_mtu(skb);
202 	if (skb_is_gso(skb) && !skb_gso_validate_network_len(skb, mtu))
203 		return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);
204 
205 	if ((skb->len > mtu && !skb_is_gso(skb)) ||
206 	    dst_allfrag(skb_dst(skb)) ||
207 	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
208 		return ip6_fragment(net, sk, skb, ip6_finish_output2);
209 	else
210 		return ip6_finish_output2(net, sk, skb);
211 }
212 
ip6_finish_output(struct net * net,struct sock * sk,struct sk_buff * skb)213 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
214 {
215 	int ret;
216 
217 	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
218 	switch (ret) {
219 	case NET_XMIT_SUCCESS:
220 		return __ip6_finish_output(net, sk, skb);
221 	case NET_XMIT_CN:
222 		return __ip6_finish_output(net, sk, skb) ? : ret;
223 	default:
224 		kfree_skb(skb);
225 		return ret;
226 	}
227 }
228 
ip6_output(struct net * net,struct sock * sk,struct sk_buff * skb)229 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
230 {
231 	struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev;
232 	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
233 
234 	skb->protocol = htons(ETH_P_IPV6);
235 	skb->dev = dev;
236 
237 	if (unlikely(idev->cnf.disable_ipv6)) {
238 		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
239 		kfree_skb(skb);
240 		return 0;
241 	}
242 
243 	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
244 			    net, sk, skb, indev, dev,
245 			    ip6_finish_output,
246 			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
247 }
248 
ip6_autoflowlabel(struct net * net,const struct ipv6_pinfo * np)249 bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
250 {
251 	if (!np->autoflowlabel_set)
252 		return ip6_default_np_autolabel(net);
253 	else
254 		return np->autoflowlabel;
255 }
256 
257 /*
258  * xmit an sk_buff (used by TCP, SCTP and DCCP)
259  * Note : socket lock is not held for SYNACK packets, but might be modified
260  * by calls to skb_set_owner_w() and ipv6_local_error(),
261  * which are using proper atomic operations or spinlocks.
262  */
ip6_xmit(const struct sock * sk,struct sk_buff * skb,struct flowi6 * fl6,__u32 mark,struct ipv6_txoptions * opt,int tclass,u32 priority)263 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
264 	     __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
265 {
266 	struct net *net = sock_net(sk);
267 	const struct ipv6_pinfo *np = inet6_sk(sk);
268 	struct in6_addr *first_hop = &fl6->daddr;
269 	struct dst_entry *dst = skb_dst(skb);
270 	unsigned int head_room;
271 	struct ipv6hdr *hdr;
272 	u8  proto = fl6->flowi6_proto;
273 	int seg_len = skb->len;
274 	int hlimit = -1;
275 	u32 mtu;
276 
277 	head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
278 	if (opt)
279 		head_room += opt->opt_nflen + opt->opt_flen;
280 
281 	if (unlikely(skb_headroom(skb) < head_room)) {
282 		struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
283 		if (!skb2) {
284 			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
285 				      IPSTATS_MIB_OUTDISCARDS);
286 			kfree_skb(skb);
287 			return -ENOBUFS;
288 		}
289 		if (skb->sk)
290 			skb_set_owner_w(skb2, skb->sk);
291 		consume_skb(skb);
292 		skb = skb2;
293 	}
294 
295 	if (opt) {
296 		seg_len += opt->opt_nflen + opt->opt_flen;
297 
298 		if (opt->opt_flen)
299 			ipv6_push_frag_opts(skb, opt, &proto);
300 
301 		if (opt->opt_nflen)
302 			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
303 					     &fl6->saddr);
304 	}
305 
306 	skb_push(skb, sizeof(struct ipv6hdr));
307 	skb_reset_network_header(skb);
308 	hdr = ipv6_hdr(skb);
309 
310 	/*
311 	 *	Fill in the IPv6 header
312 	 */
313 	if (np)
314 		hlimit = np->hop_limit;
315 	if (hlimit < 0)
316 		hlimit = ip6_dst_hoplimit(dst);
317 
318 	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
319 				ip6_autoflowlabel(net, np), fl6));
320 
321 	hdr->payload_len = htons(seg_len);
322 	hdr->nexthdr = proto;
323 	hdr->hop_limit = hlimit;
324 
325 	hdr->saddr = fl6->saddr;
326 	hdr->daddr = *first_hop;
327 
328 	skb->protocol = htons(ETH_P_IPV6);
329 	skb->priority = priority;
330 	skb->mark = mark;
331 
332 	mtu = dst_mtu(dst);
333 	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
334 		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
335 			      IPSTATS_MIB_OUT, skb->len);
336 
337 		/* if egress device is enslaved to an L3 master device pass the
338 		 * skb to its handler for processing
339 		 */
340 		skb = l3mdev_ip6_out((struct sock *)sk, skb);
341 		if (unlikely(!skb))
342 			return 0;
343 
344 		/* hooks should never assume socket lock is held.
345 		 * we promote our socket to non const
346 		 */
347 		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
348 			       net, (struct sock *)sk, skb, NULL, dst->dev,
349 			       dst_output);
350 	}
351 
352 	skb->dev = dst->dev;
353 	/* ipv6_local_error() does not require socket lock,
354 	 * we promote our socket to non const
355 	 */
356 	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
357 
358 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
359 	kfree_skb(skb);
360 	return -EMSGSIZE;
361 }
362 EXPORT_SYMBOL(ip6_xmit);
363 
ip6_call_ra_chain(struct sk_buff * skb,int sel)364 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
365 {
366 	struct ip6_ra_chain *ra;
367 	struct sock *last = NULL;
368 
369 	read_lock(&ip6_ra_lock);
370 	for (ra = ip6_ra_chain; ra; ra = ra->next) {
371 		struct sock *sk = ra->sk;
372 		if (sk && ra->sel == sel &&
373 		    (!sk->sk_bound_dev_if ||
374 		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
375 			struct ipv6_pinfo *np = inet6_sk(sk);
376 
377 			if (np && np->rtalert_isolate &&
378 			    !net_eq(sock_net(sk), dev_net(skb->dev))) {
379 				continue;
380 			}
381 			if (last) {
382 				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
383 				if (skb2)
384 					rawv6_rcv(last, skb2);
385 			}
386 			last = sk;
387 		}
388 	}
389 
390 	if (last) {
391 		rawv6_rcv(last, skb);
392 		read_unlock(&ip6_ra_lock);
393 		return 1;
394 	}
395 	read_unlock(&ip6_ra_lock);
396 	return 0;
397 }
398 
ip6_forward_proxy_check(struct sk_buff * skb)399 static int ip6_forward_proxy_check(struct sk_buff *skb)
400 {
401 	struct ipv6hdr *hdr = ipv6_hdr(skb);
402 	u8 nexthdr = hdr->nexthdr;
403 	__be16 frag_off;
404 	int offset;
405 
406 	if (ipv6_ext_hdr(nexthdr)) {
407 		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
408 		if (offset < 0)
409 			return 0;
410 	} else
411 		offset = sizeof(struct ipv6hdr);
412 
413 	if (nexthdr == IPPROTO_ICMPV6) {
414 		struct icmp6hdr *icmp6;
415 
416 		if (!pskb_may_pull(skb, (skb_network_header(skb) +
417 					 offset + 1 - skb->data)))
418 			return 0;
419 
420 		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
421 
422 		switch (icmp6->icmp6_type) {
423 		case NDISC_ROUTER_SOLICITATION:
424 		case NDISC_ROUTER_ADVERTISEMENT:
425 		case NDISC_NEIGHBOUR_SOLICITATION:
426 		case NDISC_NEIGHBOUR_ADVERTISEMENT:
427 		case NDISC_REDIRECT:
428 			/* For reaction involving unicast neighbor discovery
429 			 * message destined to the proxied address, pass it to
430 			 * input function.
431 			 */
432 			return 1;
433 		default:
434 			break;
435 		}
436 	}
437 
438 	/*
439 	 * The proxying router can't forward traffic sent to a link-local
440 	 * address, so signal the sender and discard the packet. This
441 	 * behavior is clarified by the MIPv6 specification.
442 	 */
443 	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
444 		dst_link_failure(skb);
445 		return -1;
446 	}
447 
448 	return 0;
449 }
450 
ip6_forward_finish(struct net * net,struct sock * sk,struct sk_buff * skb)451 static inline int ip6_forward_finish(struct net *net, struct sock *sk,
452 				     struct sk_buff *skb)
453 {
454 	struct dst_entry *dst = skb_dst(skb);
455 
456 	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
457 	__IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
458 
459 #ifdef CONFIG_NET_SWITCHDEV
460 	if (skb->offload_l3_fwd_mark) {
461 		consume_skb(skb);
462 		return 0;
463 	}
464 #endif
465 
466 	skb->tstamp = 0;
467 	return dst_output(net, sk, skb);
468 }
469 
ip6_pkt_too_big(const struct sk_buff * skb,unsigned int mtu)470 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
471 {
472 	if (skb->len <= mtu)
473 		return false;
474 
475 	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
476 	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
477 		return true;
478 
479 	if (skb->ignore_df)
480 		return false;
481 
482 	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
483 		return false;
484 
485 	return true;
486 }
487 
ip6_forward(struct sk_buff * skb)488 int ip6_forward(struct sk_buff *skb)
489 {
490 	struct dst_entry *dst = skb_dst(skb);
491 	struct ipv6hdr *hdr = ipv6_hdr(skb);
492 	struct inet6_skb_parm *opt = IP6CB(skb);
493 	struct net *net = dev_net(dst->dev);
494 	struct inet6_dev *idev;
495 	u32 mtu;
496 
497 	idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
498 	if (net->ipv6.devconf_all->forwarding == 0)
499 		goto error;
500 
501 	if (skb->pkt_type != PACKET_HOST)
502 		goto drop;
503 
504 	if (unlikely(skb->sk))
505 		goto drop;
506 
507 	if (skb_warn_if_lro(skb))
508 		goto drop;
509 
510 	if (!net->ipv6.devconf_all->disable_policy &&
511 	    (!idev || !idev->cnf.disable_policy) &&
512 	    !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
513 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
514 		goto drop;
515 	}
516 
517 	skb_forward_csum(skb);
518 
519 	/*
520 	 *	We DO NOT make any processing on
521 	 *	RA packets, pushing them to user level AS IS
522 	 *	without ane WARRANTY that application will be able
523 	 *	to interpret them. The reason is that we
524 	 *	cannot make anything clever here.
525 	 *
526 	 *	We are not end-node, so that if packet contains
527 	 *	AH/ESP, we cannot make anything.
528 	 *	Defragmentation also would be mistake, RA packets
529 	 *	cannot be fragmented, because there is no warranty
530 	 *	that different fragments will go along one path. --ANK
531 	 */
532 	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
533 		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
534 			return 0;
535 	}
536 
537 	/*
538 	 *	check and decrement ttl
539 	 */
540 	if (hdr->hop_limit <= 1) {
541 		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
542 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
543 
544 		kfree_skb(skb);
545 		return -ETIMEDOUT;
546 	}
547 
548 	/* XXX: idev->cnf.proxy_ndp? */
549 	if (net->ipv6.devconf_all->proxy_ndp &&
550 	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
551 		int proxied = ip6_forward_proxy_check(skb);
552 		if (proxied > 0)
553 			return ip6_input(skb);
554 		else if (proxied < 0) {
555 			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
556 			goto drop;
557 		}
558 	}
559 
560 	if (!xfrm6_route_forward(skb)) {
561 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
562 		goto drop;
563 	}
564 	dst = skb_dst(skb);
565 
566 	/* IPv6 specs say nothing about it, but it is clear that we cannot
567 	   send redirects to source routed frames.
568 	   We don't send redirects to frames decapsulated from IPsec.
569 	 */
570 	if (IP6CB(skb)->iif == dst->dev->ifindex &&
571 	    opt->srcrt == 0 && !skb_sec_path(skb)) {
572 		struct in6_addr *target = NULL;
573 		struct inet_peer *peer;
574 		struct rt6_info *rt;
575 
576 		/*
577 		 *	incoming and outgoing devices are the same
578 		 *	send a redirect.
579 		 */
580 
581 		rt = (struct rt6_info *) dst;
582 		if (rt->rt6i_flags & RTF_GATEWAY)
583 			target = &rt->rt6i_gateway;
584 		else
585 			target = &hdr->daddr;
586 
587 		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
588 
589 		/* Limit redirects both by destination (here)
590 		   and by source (inside ndisc_send_redirect)
591 		 */
592 		if (inet_peer_xrlim_allow(peer, 1*HZ))
593 			ndisc_send_redirect(skb, target);
594 		if (peer)
595 			inet_putpeer(peer);
596 	} else {
597 		int addrtype = ipv6_addr_type(&hdr->saddr);
598 
599 		/* This check is security critical. */
600 		if (addrtype == IPV6_ADDR_ANY ||
601 		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
602 			goto error;
603 		if (addrtype & IPV6_ADDR_LINKLOCAL) {
604 			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
605 				    ICMPV6_NOT_NEIGHBOUR, 0);
606 			goto error;
607 		}
608 	}
609 
610 	mtu = ip6_dst_mtu_forward(dst);
611 	if (mtu < IPV6_MIN_MTU)
612 		mtu = IPV6_MIN_MTU;
613 
614 	if (ip6_pkt_too_big(skb, mtu)) {
615 		/* Again, force OUTPUT device used as source address */
616 		skb->dev = dst->dev;
617 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
618 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
619 		__IP6_INC_STATS(net, ip6_dst_idev(dst),
620 				IPSTATS_MIB_FRAGFAILS);
621 		kfree_skb(skb);
622 		return -EMSGSIZE;
623 	}
624 
625 	if (skb_cow(skb, dst->dev->hard_header_len)) {
626 		__IP6_INC_STATS(net, ip6_dst_idev(dst),
627 				IPSTATS_MIB_OUTDISCARDS);
628 		goto drop;
629 	}
630 
631 	hdr = ipv6_hdr(skb);
632 
633 	/* Mangling hops number delayed to point after skb COW */
634 
635 	hdr->hop_limit--;
636 
637 	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
638 		       net, NULL, skb, skb->dev, dst->dev,
639 		       ip6_forward_finish);
640 
641 error:
642 	__IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
643 drop:
644 	kfree_skb(skb);
645 	return -EINVAL;
646 }
647 
ip6_copy_metadata(struct sk_buff * to,struct sk_buff * from)648 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
649 {
650 	to->pkt_type = from->pkt_type;
651 	to->priority = from->priority;
652 	to->protocol = from->protocol;
653 	skb_dst_drop(to);
654 	skb_dst_set(to, dst_clone(skb_dst(from)));
655 	to->dev = from->dev;
656 	to->mark = from->mark;
657 
658 	skb_copy_hash(to, from);
659 
660 #ifdef CONFIG_NET_SCHED
661 	to->tc_index = from->tc_index;
662 #endif
663 	nf_copy(to, from);
664 	skb_ext_copy(to, from);
665 	skb_copy_secmark(to, from);
666 }
667 
/*
 * Start frag-list ("fast path") fragmentation: turn the head skb into the
 * first fragment and set up @iter for the skbs on its frag list.
 *
 * @skb:     head skb; its frag list is detached and handed to @iter.
 * @hlen:    length of the header bytes copied in front of every fragment.
 * @prevhdr: points at the next-header byte that must announce the
 *           fragment header.
 * @nexthdr: original value of that byte, stored in each fragment header.
 * @frag_id: identification shared by all fragments.
 * @iter:    iterator state; iter->tmp_hdr is allocated here.
 *
 * Returns 0 on success or -ENOMEM if the header copy cannot be
 * allocated.
 */
int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
		      u8 nexthdr, __be32 frag_id,
		      struct ip6_fraglist_iter *iter)
{
	unsigned int head_len;
	struct frag_hdr *fh;

	/* Patch the next-header byte before duplicating the header so the
	 * saved copy already announces the fragment header.
	 */
	*prevhdr = NEXTHDR_FRAGMENT;
	iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
	if (!iter->tmp_hdr)
		return -ENOMEM;

	iter->frag = skb_shinfo(skb)->frag_list;
	skb_frag_list_init(skb);

	iter->hlen = hlen;
	iter->nexthdr = nexthdr;
	iter->frag_id = frag_id;
	iter->offset = 0;

	/* Open a gap for the fragment header between headers and payload. */
	__skb_pull(skb, hlen);
	fh = __skb_push(skb, sizeof(struct frag_hdr));
	__skb_push(skb, hlen);
	skb_reset_network_header(skb);
	memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);

	/* First fragment: offset zero, more fragments follow. */
	fh->nexthdr = nexthdr;
	fh->reserved = 0;
	fh->frag_off = htons(IP6_MF);
	fh->identification = frag_id;

	/* The head no longer covers the detached frag list. */
	head_len = skb_pagelen(skb);
	skb->data_len = head_len - skb_headlen(skb);
	skb->len = head_len;
	ipv6_hdr(skb)->payload_len = htons(head_len - sizeof(struct ipv6hdr));

	return 0;
}
EXPORT_SYMBOL(ip6_fraglist_init);
708 
ip6_fraglist_prepare(struct sk_buff * skb,struct ip6_fraglist_iter * iter)709 void ip6_fraglist_prepare(struct sk_buff *skb,
710 			  struct ip6_fraglist_iter *iter)
711 {
712 	struct sk_buff *frag = iter->frag;
713 	unsigned int hlen = iter->hlen;
714 	struct frag_hdr *fh;
715 
716 	frag->ip_summed = CHECKSUM_NONE;
717 	skb_reset_transport_header(frag);
718 	fh = __skb_push(frag, sizeof(struct frag_hdr));
719 	__skb_push(frag, hlen);
720 	skb_reset_network_header(frag);
721 	memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
722 	iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
723 	fh->nexthdr = iter->nexthdr;
724 	fh->reserved = 0;
725 	fh->frag_off = htons(iter->offset);
726 	if (frag->next)
727 		fh->frag_off |= htons(IP6_MF);
728 	fh->identification = iter->frag_id;
729 	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
730 	ip6_copy_metadata(frag, skb);
731 }
732 EXPORT_SYMBOL(ip6_fraglist_prepare);
733 
/*
 * Initialise @state for copy-based ("slow path") fragmentation of @skb.
 *
 * @hlen:            bytes of header replicated in every fragment.
 * @mtu:             payload budget per fragment.
 * @needed_tailroom: extra tail space to allocate with each fragment.
 * @hdr_room:        headroom to reserve in each fragment.
 * @prevhdr:         location of the next-header byte to rewrite.
 * @nexthdr:         value stored in each fragment header.
 * @frag_id:         identification shared by all fragments.
 */
void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
		   unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
		   u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
{
	/* Fragment header contents. */
	state->prevhdr = prevhdr;
	state->nexthdr = nexthdr;
	state->frag_id = frag_id;

	/* Geometry of each fragment. */
	state->hlen = hlen;
	state->mtu = mtu;
	state->hroom = hdr_room;
	state->troom = needed_tailroom;

	/* Copy cursor: everything after the headers is still to be sent. */
	state->ptr = hlen;
	state->left = skb->len - hlen;
	state->offset = 0;
}
EXPORT_SYMBOL(ip6_frag_init);
754 
ip6_frag_next(struct sk_buff * skb,struct ip6_frag_state * state)755 struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
756 {
757 	u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
758 	struct sk_buff *frag;
759 	struct frag_hdr *fh;
760 	unsigned int len;
761 
762 	len = state->left;
763 	/* IF: it doesn't fit, use 'mtu' - the data space left */
764 	if (len > state->mtu)
765 		len = state->mtu;
766 	/* IF: we are not sending up to and including the packet end
767 	   then align the next start on an eight byte boundary */
768 	if (len < state->left)
769 		len &= ~7;
770 
771 	/* Allocate buffer */
772 	frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
773 			 state->hroom + state->troom, GFP_ATOMIC);
774 	if (!frag)
775 		return ERR_PTR(-ENOMEM);
776 
777 	/*
778 	 *	Set up data on packet
779 	 */
780 
781 	ip6_copy_metadata(frag, skb);
782 	skb_reserve(frag, state->hroom);
783 	skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
784 	skb_reset_network_header(frag);
785 	fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
786 	frag->transport_header = (frag->network_header + state->hlen +
787 				  sizeof(struct frag_hdr));
788 
789 	/*
790 	 *	Charge the memory for the fragment to any owner
791 	 *	it might possess
792 	 */
793 	if (skb->sk)
794 		skb_set_owner_w(frag, skb->sk);
795 
796 	/*
797 	 *	Copy the packet header into the new buffer.
798 	 */
799 	skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);
800 
801 	fragnexthdr_offset = skb_network_header(frag);
802 	fragnexthdr_offset += prevhdr - skb_network_header(skb);
803 	*fragnexthdr_offset = NEXTHDR_FRAGMENT;
804 
805 	/*
806 	 *	Build fragment header.
807 	 */
808 	fh->nexthdr = state->nexthdr;
809 	fh->reserved = 0;
810 	fh->identification = state->frag_id;
811 
812 	/*
813 	 *	Copy a block of the IP datagram.
814 	 */
815 	BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
816 			     len));
817 	state->left -= len;
818 
819 	fh->frag_off = htons(state->offset);
820 	if (state->left > 0)
821 		fh->frag_off |= htons(IP6_MF);
822 	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
823 
824 	state->ptr += len;
825 	state->offset += len;
826 
827 	return frag;
828 }
829 EXPORT_SYMBOL(ip6_frag_next);
830 
/*
 * ip6_fragment - split an IPv6 packet into fragments and pass each one
 * to @output.
 *
 * @net, @sk: handed through unchanged to @output.
 * @skb:      packet to fragment.
 * @output:   callback invoked once per fragment.
 *
 * Two strategies are used.  If @skb carries a frag list whose geometry
 * already fits, the head and the frag-list members are turned into the
 * fragments in place (ip6_fraglist_init/_prepare/_next).  Otherwise new
 * skbs are allocated and the data is copied into them (ip6_frag_init/
 * ip6_frag_next), and the original @skb is consumed afterwards.
 *
 * Returns 0 on success.  On failure returns the error from
 * ip6_find_1stfragopt(), skb_checksum_help(), ip6_fraglist_init(),
 * ip6_frag_next() or @output, or -EMSGSIZE (after sending an ICMPv6
 * Packet Too Big) when the packet may not or cannot be fragmented.
 */
ip6_fragment(struct net * net,struct sock * sk,struct sk_buff * skb,int (* output)(struct net *,struct sock *,struct sk_buff *))831 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
832 		 int (*output)(struct net *, struct sock *, struct sk_buff *))
833 {
834 	struct sk_buff *frag;
835 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
/* Socket settings are only consulted when the skb has a socket and
 * dev_recursion_level() is zero.
 */
836 	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
837 				inet6_sk(skb->sk) : NULL;
838 	struct ip6_frag_state state;
839 	unsigned int mtu, hlen, nexthdr_offset;
/* Saved so every fragment can be stamped with the original timestamp. */
840 	ktime_t tstamp = skb->tstamp;
841 	int hroom, err = 0;
842 	__be32 frag_id;
843 	u8 *prevhdr, nexthdr = 0;
844 
/* On success the return value is the length of the headers that precede
 * the fragment header; prevhdr points at the next-header byte to patch.
 */
845 	err = ip6_find_1stfragopt(skb, &prevhdr);
846 	if (err < 0)
847 		goto fail;
848 	hlen = err;
849 	nexthdr = *prevhdr;
/* Keep prevhdr as an offset as well: the pointer is recomputed from it
 * after skb_checksum_help() below.
 */
850 	nexthdr_offset = prevhdr - skb_network_header(skb);
851 
852 	mtu = ip6_skb_dst_mtu(skb);
853 
854 	/* We must not fragment if the socket is set to force MTU discovery
855 	 * or if the skb it not generated by a local socket.
856 	 */
857 	if (unlikely(!skb->ignore_df && skb->len > mtu))
858 		goto fail_toobig;
859 
860 	if (IP6CB(skb)->frag_max_size) {
861 		if (IP6CB(skb)->frag_max_size > mtu)
862 			goto fail_toobig;
863 
864 		/* don't send fragments larger than what we received */
865 		mtu = IP6CB(skb)->frag_max_size;
866 		if (mtu < IPV6_MIN_MTU)
867 			mtu = IPV6_MIN_MTU;
868 	}
869 
/* A non-zero socket frag_size smaller than the MTU lowers the limit. */
870 	if (np && np->frag_size < mtu) {
871 		if (np->frag_size)
872 			mtu = np->frag_size;
873 	}
/* Each fragment needs the headers, a fragment header and at least
 * eight bytes of payload.
 */
874 	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
875 		goto fail_toobig;
/* From here on mtu is the payload budget of a single fragment. */
876 	mtu -= hlen + sizeof(struct frag_hdr);
877 
878 	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
879 				    &ipv6_hdr(skb)->saddr);
880 
/* A pending partial checksum is resolved before the packet is split. */
881 	if (skb->ip_summed == CHECKSUM_PARTIAL &&
882 	    (err = skb_checksum_help(skb)))
883 		goto fail;
884 
885 	prevhdr = skb_network_header(skb) + nexthdr_offset;
886 	hroom = LL_RESERVED_SPACE(rt->dst.dev);
/* Fast path: reuse the existing frag list as the fragments. */
887 	if (skb_has_frag_list(skb)) {
888 		unsigned int first_len = skb_pagelen(skb);
889 		struct ip6_fraglist_iter iter;
890 		struct sk_buff *frag2;
891 
/* The head must fit the budget, end on an 8-byte boundary, not be
 * cloned and have headroom for link-layer space plus a fragment header.
 */
892 		if (first_len - hlen > mtu ||
893 		    ((first_len - hlen) & 7) ||
894 		    skb_cloned(skb) ||
895 		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
896 			goto slow_path;
897 
898 		skb_walk_frags(skb, frag) {
899 			/* Correct geometry. */
900 			if (frag->len > mtu ||
901 			    ((frag->len & 7) && frag->next) ||
902 			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
903 				goto slow_path_clean;
904 
905 			/* Partially cloned skb? */
906 			if (skb_shared(frag))
907 				goto slow_path_clean;
908 
909 			BUG_ON(frag->sk);
/* Move the accounting for this member from the head to the member
 * itself; slow_path_clean undoes this for members already visited.
 */
910 			if (skb->sk) {
911 				frag->sk = skb->sk;
912 				frag->destructor = sock_wfree;
913 			}
914 			skb->truesize -= frag->truesize;
915 		}
916 
917 		err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
918 					&iter);
919 		if (err < 0)
920 			goto fail;
921 
922 		/* We prevent @rt from being freed. */
923 		rcu_read_lock();
924 
925 		for (;;) {
926 			/* Prepare header of the next frame,
927 			 * before previous one went down. */
928 			if (iter.frag)
929 				ip6_fraglist_prepare(skb, &iter);
930 
931 			skb->tstamp = tstamp;
932 			err = output(net, sk, skb);
933 			if (!err)
934 				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
935 					      IPSTATS_MIB_FRAGCREATES);
936 
937 			if (err || !iter.frag)
938 				break;
939 
940 			skb = ip6_fraglist_next(&iter);
941 		}
942 
943 		kfree(iter.tmp_hdr);
944 
945 		if (err == 0) {
946 			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
947 				      IPSTATS_MIB_FRAGOKS);
948 			rcu_read_unlock();
949 			return 0;
950 		}
951 
/* Output failed: free the fragments that were never handed to it. */
952 		kfree_skb_list(iter.frag);
953 
954 		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
955 			      IPSTATS_MIB_FRAGFAILS);
956 		rcu_read_unlock();
957 		return err;
958 
/* Reached when frag-list member `frag` failed a check: restore sk,
 * destructor and truesize for every member visited before it, then fall
 * through to the copying path.
 */
959 slow_path_clean:
960 		skb_walk_frags(skb, frag2) {
961 			if (frag2 == frag)
962 				break;
963 			frag2->sk = NULL;
964 			frag2->destructor = NULL;
965 			skb->truesize += frag2->truesize;
966 		}
967 	}
968 
969 slow_path:
970 	/*
971 	 *	Fragment the datagram.
972 	 */
973 
974 	ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
975 		      LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
976 		      &state);
977 
978 	/*
979 	 *	Keep copying data until we run out.
980 	 */
981 
982 	while (state.left > 0) {
983 		frag = ip6_frag_next(skb, &state);
984 		if (IS_ERR(frag)) {
985 			err = PTR_ERR(frag);
986 			goto fail;
987 		}
988 
989 		/*
990 		 *	Put this fragment into the sending queue.
991 		 */
992 		frag->tstamp = tstamp;
993 		err = output(net, sk, frag);
994 		if (err)
995 			goto fail;
996 
997 		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
998 			      IPSTATS_MIB_FRAGCREATES);
999 	}
1000 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1001 		      IPSTATS_MIB_FRAGOKS);
/* All data was copied into the fragments; release the original. */
1002 	consume_skb(skb);
1003 	return err;
1004 
/* Too big and not fragmentable: report Packet Too Big to the sender. */
1005 fail_toobig:
1006 	if (skb->sk && dst_allfrag(skb_dst(skb)))
1007 		sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
1008 
1009 	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
1010 	err = -EMSGSIZE;
1011 
/* Common failure exit: count the failure and free the packet. */
1012 fail:
1013 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1014 		      IPSTATS_MIB_FRAGFAILS);
1015 	kfree_skb(skb);
1016 	return err;
1017 }
1018 
ip6_rt_check(const struct rt6key * rt_key,const struct in6_addr * fl_addr,const struct in6_addr * addr_cache)1019 static inline int ip6_rt_check(const struct rt6key *rt_key,
1020 			       const struct in6_addr *fl_addr,
1021 			       const struct in6_addr *addr_cache)
1022 {
1023 	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
1024 		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
1025 }
1026 
ip6_sk_dst_check(struct sock * sk,struct dst_entry * dst,const struct flowi6 * fl6)1027 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
1028 					  struct dst_entry *dst,
1029 					  const struct flowi6 *fl6)
1030 {
1031 	struct ipv6_pinfo *np = inet6_sk(sk);
1032 	struct rt6_info *rt;
1033 
1034 	if (!dst)
1035 		goto out;
1036 
1037 	if (dst->ops->family != AF_INET6) {
1038 		dst_release(dst);
1039 		return NULL;
1040 	}
1041 
1042 	rt = (struct rt6_info *)dst;
1043 	/* Yes, checking route validity in not connected
1044 	 * case is not very simple. Take into account,
1045 	 * that we do not support routing by source, TOS,
1046 	 * and MSG_DONTROUTE		--ANK (980726)
1047 	 *
1048 	 * 1. ip6_rt_check(): If route was host route,
1049 	 *    check that cached destination is current.
1050 	 *    If it is network route, we still may
1051 	 *    check its validity using saved pointer
1052 	 *    to the last used address: daddr_cache.
1053 	 *    We do not want to save whole address now,
1054 	 *    (because main consumer of this service
1055 	 *    is tcp, which has not this problem),
1056 	 *    so that the last trick works only on connected
1057 	 *    sockets.
1058 	 * 2. oif also should be the same.
1059 	 */
1060 	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
1061 #ifdef CONFIG_IPV6_SUBTREES
1062 	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
1063 #endif
1064 	   (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
1065 	      (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
1066 		dst_release(dst);
1067 		dst = NULL;
1068 	}
1069 
1070 out:
1071 	return dst;
1072 }
1073 
ip6_dst_lookup_tail(struct net * net,const struct sock * sk,struct dst_entry ** dst,struct flowi6 * fl6)1074 static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
1075 			       struct dst_entry **dst, struct flowi6 *fl6)
1076 {
1077 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1078 	struct neighbour *n;
1079 	struct rt6_info *rt;
1080 #endif
1081 	int err;
1082 	int flags = 0;
1083 
1084 	/* The correct way to handle this would be to do
1085 	 * ip6_route_get_saddr, and then ip6_route_output; however,
1086 	 * the route-specific preferred source forces the
1087 	 * ip6_route_output call _before_ ip6_route_get_saddr.
1088 	 *
1089 	 * In source specific routing (no src=any default route),
1090 	 * ip6_route_output will fail given src=any saddr, though, so
1091 	 * that's why we try it again later.
1092 	 */
1093 	if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
1094 		struct fib6_info *from;
1095 		struct rt6_info *rt;
1096 		bool had_dst = *dst != NULL;
1097 
1098 		if (!had_dst)
1099 			*dst = ip6_route_output(net, sk, fl6);
1100 		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
1101 
1102 		rcu_read_lock();
1103 		from = rt ? rcu_dereference(rt->from) : NULL;
1104 		err = ip6_route_get_saddr(net, from, &fl6->daddr,
1105 					  sk ? inet6_sk(sk)->srcprefs : 0,
1106 					  &fl6->saddr);
1107 		rcu_read_unlock();
1108 
1109 		if (err)
1110 			goto out_err_release;
1111 
1112 		/* If we had an erroneous initial result, pretend it
1113 		 * never existed and let the SA-enabled version take
1114 		 * over.
1115 		 */
1116 		if (!had_dst && (*dst)->error) {
1117 			dst_release(*dst);
1118 			*dst = NULL;
1119 		}
1120 
1121 		if (fl6->flowi6_oif)
1122 			flags |= RT6_LOOKUP_F_IFACE;
1123 	}
1124 
1125 	if (!*dst)
1126 		*dst = ip6_route_output_flags(net, sk, fl6, flags);
1127 
1128 	err = (*dst)->error;
1129 	if (err)
1130 		goto out_err_release;
1131 
1132 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1133 	/*
1134 	 * Here if the dst entry we've looked up
1135 	 * has a neighbour entry that is in the INCOMPLETE
1136 	 * state and the src address from the flow is
1137 	 * marked as OPTIMISTIC, we release the found
1138 	 * dst entry and replace it instead with the
1139 	 * dst entry of the nexthop router
1140 	 */
1141 	rt = (struct rt6_info *) *dst;
1142 	rcu_read_lock_bh();
1143 	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
1144 				      rt6_nexthop(rt, &fl6->daddr));
1145 	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
1146 	rcu_read_unlock_bh();
1147 
1148 	if (err) {
1149 		struct inet6_ifaddr *ifp;
1150 		struct flowi6 fl_gw6;
1151 		int redirect;
1152 
1153 		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
1154 				      (*dst)->dev, 1);
1155 
1156 		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
1157 		if (ifp)
1158 			in6_ifa_put(ifp);
1159 
1160 		if (redirect) {
1161 			/*
1162 			 * We need to get the dst entry for the
1163 			 * default router instead
1164 			 */
1165 			dst_release(*dst);
1166 			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1167 			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1168 			*dst = ip6_route_output(net, sk, &fl_gw6);
1169 			err = (*dst)->error;
1170 			if (err)
1171 				goto out_err_release;
1172 		}
1173 	}
1174 #endif
1175 	if (ipv6_addr_v4mapped(&fl6->saddr) &&
1176 	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
1177 		err = -EAFNOSUPPORT;
1178 		goto out_err_release;
1179 	}
1180 
1181 	return 0;
1182 
1183 out_err_release:
1184 	dst_release(*dst);
1185 	*dst = NULL;
1186 
1187 	if (err == -ENETUNREACH)
1188 		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1189 	return err;
1190 }
1191 
1192 /**
1193  *	ip6_dst_lookup - perform route lookup on flow
1194  *	@net: Network namespace to perform lookup in
1195  *	@sk: socket which provides route info
1196  *	@dst: pointer to dst_entry * for result
1197  *	@fl6: flow to lookup
1198  *
1199  *	This function performs a route lookup on the given flow.
1200  *
1201  *	It returns zero on success, or a standard errno code on error.
1202  */
ip6_dst_lookup(struct net * net,struct sock * sk,struct dst_entry ** dst,struct flowi6 * fl6)1203 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1204 		   struct flowi6 *fl6)
1205 {
1206 	*dst = NULL;
1207 	return ip6_dst_lookup_tail(net, sk, dst, fl6);
1208 }
1209 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1210 
1211 /**
1212  *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1213  *	@net: Network namespace to perform lookup in
1214  *	@sk: socket which provides route info
1215  *	@fl6: flow to lookup
1216  *	@final_dst: final destination address for ipsec lookup
1217  *
1218  *	This function performs a route lookup on the given flow.
1219  *
1220  *	It returns a valid dst pointer on success, or a pointer encoded
1221  *	error code.
1222  */
ip6_dst_lookup_flow(struct net * net,const struct sock * sk,struct flowi6 * fl6,const struct in6_addr * final_dst)1223 struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
1224 				      const struct in6_addr *final_dst)
1225 {
1226 	struct dst_entry *dst = NULL;
1227 	int err;
1228 
1229 	err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
1230 	if (err)
1231 		return ERR_PTR(err);
1232 	if (final_dst)
1233 		fl6->daddr = *final_dst;
1234 
1235 	return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
1236 }
1237 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1238 
1239 /**
1240  *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1241  *	@sk: socket which provides the dst cache and route info
1242  *	@fl6: flow to lookup
1243  *	@final_dst: final destination address for ipsec lookup
1244  *	@connected: whether @sk is connected or not
1245  *
1246  *	This function performs a route lookup on the given flow with the
1247  *	possibility of using the cached route in the socket if it is valid.
1248  *	It will take the socket dst lock when operating on the dst cache.
1249  *	As a result, this function can only be used in process context.
1250  *
1251  *	In addition, for a connected socket, cache the dst in the socket
1252  *	if the current cache is not valid.
1253  *
1254  *	It returns a valid dst pointer on success, or a pointer encoded
1255  *	error code.
1256  */
ip6_sk_dst_lookup_flow(struct sock * sk,struct flowi6 * fl6,const struct in6_addr * final_dst,bool connected)1257 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1258 					 const struct in6_addr *final_dst,
1259 					 bool connected)
1260 {
1261 	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1262 
1263 	dst = ip6_sk_dst_check(sk, dst, fl6);
1264 	if (dst)
1265 		return dst;
1266 
1267 	dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
1268 	if (connected && !IS_ERR(dst))
1269 		ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);
1270 
1271 	return dst;
1272 }
1273 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1274 
1275 /**
1276  *      ip6_dst_lookup_tunnel - perform route lookup on tunnel
1277  *      @skb: Packet for which lookup is done
1278  *      @dev: Tunnel device
1279  *      @net: Network namespace of tunnel device
1280  *      @sock: Socket which provides route info
1281  *      @saddr: Memory to store the src ip address
1282  *      @info: Tunnel information
1283  *      @protocol: IP protocol
1284  *      @use_cache: Flag to enable cache usage
1285  *      This function performs a route lookup on a tunnel
1286  *
1287  *      It returns a valid dst pointer and stores src address to be used in
1288  *      tunnel in param saddr on success, else a pointer encoded error code.
1289  */
1290 
ip6_dst_lookup_tunnel(struct sk_buff * skb,struct net_device * dev,struct net * net,struct socket * sock,struct in6_addr * saddr,const struct ip_tunnel_info * info,u8 protocol,bool use_cache)1291 struct dst_entry *ip6_dst_lookup_tunnel(struct sk_buff *skb,
1292 					struct net_device *dev,
1293 					struct net *net,
1294 					struct socket *sock,
1295 					struct in6_addr *saddr,
1296 					const struct ip_tunnel_info *info,
1297 					u8 protocol,
1298 					bool use_cache)
1299 {
1300 	struct dst_entry *dst = NULL;
1301 #ifdef CONFIG_DST_CACHE
1302 	struct dst_cache *dst_cache;
1303 #endif
1304 	struct flowi6 fl6;
1305 	__u8 prio;
1306 
1307 #ifdef CONFIG_DST_CACHE
1308 	dst_cache = (struct dst_cache *)&info->dst_cache;
1309 	if (use_cache) {
1310 		dst = dst_cache_get_ip6(dst_cache, saddr);
1311 		if (dst)
1312 			return dst;
1313 	}
1314 #endif
1315 	memset(&fl6, 0, sizeof(fl6));
1316 	fl6.flowi6_mark = skb->mark;
1317 	fl6.flowi6_proto = protocol;
1318 	fl6.daddr = info->key.u.ipv6.dst;
1319 	fl6.saddr = info->key.u.ipv6.src;
1320 	prio = info->key.tos;
1321 	fl6.flowlabel = ip6_make_flowinfo(prio, info->key.label);
1322 
1323 	dst = ipv6_stub->ipv6_dst_lookup_flow(net, sock->sk, &fl6,
1324 					      NULL);
1325 	if (IS_ERR(dst)) {
1326 		netdev_dbg(dev, "no route to %pI6\n", &fl6.daddr);
1327 		return ERR_PTR(-ENETUNREACH);
1328 	}
1329 	if (dst->dev == dev) { /* is this necessary? */
1330 		netdev_dbg(dev, "circular route to %pI6\n", &fl6.daddr);
1331 		dst_release(dst);
1332 		return ERR_PTR(-ELOOP);
1333 	}
1334 #ifdef CONFIG_DST_CACHE
1335 	if (use_cache)
1336 		dst_cache_set_ip6(dst_cache, dst, &fl6.saddr);
1337 #endif
1338 	*saddr = fl6.saddr;
1339 	return dst;
1340 }
1341 EXPORT_SYMBOL_GPL(ip6_dst_lookup_tunnel);
1342 
/*
 * Duplicate an options header with kmemdup(); the copy is
 * (hdrlen + 1) * 8 bytes long.  Returns NULL if @src is NULL or the
 * allocation fails.
 */
static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
					       gfp_t gfp)
{
	if (!src)
		return NULL;

	return kmemdup(src, (src->hdrlen + 1) * 8, gfp);
}
1348 
/*
 * Duplicate a routing header with kmemdup(); the copy is
 * (hdrlen + 1) * 8 bytes long.  Returns NULL if @src is NULL or the
 * allocation fails.
 */
static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
						gfp_t gfp)
{
	if (!src)
		return NULL;

	return kmemdup(src, (src->hdrlen + 1) * 8, gfp);
}
1354 
ip6_append_data_mtu(unsigned int * mtu,int * maxfraglen,unsigned int fragheaderlen,struct sk_buff * skb,struct rt6_info * rt,unsigned int orig_mtu)1355 static void ip6_append_data_mtu(unsigned int *mtu,
1356 				int *maxfraglen,
1357 				unsigned int fragheaderlen,
1358 				struct sk_buff *skb,
1359 				struct rt6_info *rt,
1360 				unsigned int orig_mtu)
1361 {
1362 	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1363 		if (!skb) {
1364 			/* first fragment, reserve header_len */
1365 			*mtu = orig_mtu - rt->dst.header_len;
1366 
1367 		} else {
1368 			/*
1369 			 * this fragment is not first, the headers
1370 			 * space is regarded as data space.
1371 			 */
1372 			*mtu = orig_mtu;
1373 		}
1374 		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
1375 			      + fragheaderlen - sizeof(struct frag_hdr);
1376 	}
1377 }
1378 
ip6_setup_cork(struct sock * sk,struct inet_cork_full * cork,struct inet6_cork * v6_cork,struct ipcm6_cookie * ipc6,struct rt6_info * rt,struct flowi6 * fl6)1379 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1380 			  struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1381 			  struct rt6_info *rt, struct flowi6 *fl6)
1382 {
1383 	struct ipv6_pinfo *np = inet6_sk(sk);
1384 	unsigned int mtu;
1385 	struct ipv6_txoptions *opt = ipc6->opt;
1386 
1387 	/*
1388 	 * setup for corking
1389 	 */
1390 	if (opt) {
1391 		if (WARN_ON(v6_cork->opt))
1392 			return -EINVAL;
1393 
1394 		v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
1395 		if (unlikely(!v6_cork->opt))
1396 			return -ENOBUFS;
1397 
1398 		v6_cork->opt->tot_len = sizeof(*opt);
1399 		v6_cork->opt->opt_flen = opt->opt_flen;
1400 		v6_cork->opt->opt_nflen = opt->opt_nflen;
1401 
1402 		v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1403 						    sk->sk_allocation);
1404 		if (opt->dst0opt && !v6_cork->opt->dst0opt)
1405 			return -ENOBUFS;
1406 
1407 		v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1408 						    sk->sk_allocation);
1409 		if (opt->dst1opt && !v6_cork->opt->dst1opt)
1410 			return -ENOBUFS;
1411 
1412 		v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
1413 						   sk->sk_allocation);
1414 		if (opt->hopopt && !v6_cork->opt->hopopt)
1415 			return -ENOBUFS;
1416 
1417 		v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1418 						    sk->sk_allocation);
1419 		if (opt->srcrt && !v6_cork->opt->srcrt)
1420 			return -ENOBUFS;
1421 
1422 		/* need source address above miyazawa*/
1423 	}
1424 	dst_hold(&rt->dst);
1425 	cork->base.dst = &rt->dst;
1426 	cork->fl.u.ip6 = *fl6;
1427 	v6_cork->hop_limit = ipc6->hlimit;
1428 	v6_cork->tclass = ipc6->tclass;
1429 	if (rt->dst.flags & DST_XFRM_TUNNEL)
1430 		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1431 		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
1432 	else
1433 		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1434 			READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
1435 	if (np->frag_size < mtu) {
1436 		if (np->frag_size)
1437 			mtu = np->frag_size;
1438 	}
1439 	cork->base.fragsize = mtu;
1440 	cork->base.gso_size = ipc6->gso_size;
1441 	cork->base.tx_flags = 0;
1442 	cork->base.mark = ipc6->sockc.mark;
1443 	sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);
1444 
1445 	if (dst_allfrag(xfrm_dst_path(&rt->dst)))
1446 		cork->base.flags |= IPCORK_ALLFRAG;
1447 	cork->base.length = 0;
1448 
1449 	cork->base.transmit_time = ipc6->sockc.transmit_time;
1450 
1451 	return 0;
1452 }
1453 
/*
 * __ip6_append_data - append data to a queue of pending skbs
 * @sk: socket the queued memory is charged to
 * @fl6: flow handed to the local error / PMTU notifications
 * @queue: pending queue; data goes into its tail skb or into new skbs
 *	   that are added at the tail
 * @cork: generic cork state (dst, fragsize, gso_size, length, flags,
 *	  tx_flags)
 * @v6_cork: IPv6 cork state; only its options are read here
 * @pfrag: page fragment used when data is stored in page frags
 * @getfrag: callback copying @len bytes found at @offset of @from to @to;
 *	     a negative return value is treated as a fault
 * @from: opaque source handed to @getfrag (and to the zerocopy helper)
 * @length: amount to append; the first skb deducts @transhdrlen from it
 * @transhdrlen: header length reserved in front of the data of the first
 *		 skb; reset to 0 once that skb has been built
 * @flags: MSG_* flags; MSG_MORE, MSG_DONTWAIT, MSG_CONFIRM and
 *	   MSG_ZEROCOPY are examined
 * @ipc6: per-call parameters; only ->dontfrag is read here
 *
 * Returns 0 on success.  On failure a negative errno is returned
 * (-EMSGSIZE, -ENOBUFS, -EINVAL, -EFAULT, -ENOMEM or the error reported by
 * sock_alloc_send_skb() / skb_zerocopy_iter_dgram()).  Except for the early
 * -EMSGSIZE / -ENOBUFS returns, the part of @length that was not queued is
 * taken back out of cork->length and IPSTATS_MIB_OUTDISCARDS is bumped.
 */
static int __ip6_append_data(struct sock *sk,
			     struct flowi6 *fl6,
			     struct sk_buff_head *queue,
			     struct inet_cork *cork,
			     struct inet6_cork *v6_cork,
			     struct page_frag *pfrag,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, int length, int transhdrlen,
			     unsigned int flags, struct ipcm6_cookie *ipc6)
{
	struct sk_buff *skb, *skb_prev = NULL;
	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
	struct ubuf_info *uarg = NULL;
	int exthdrlen = 0;
	int dst_exthdrlen = 0;
	int hh_len;
	int copy;
	int err;
	int offset = 0;
	u32 tskey = 0;
	struct rt6_info *rt = (struct rt6_info *)cork->dst;
	struct ipv6_txoptions *opt = v6_cork->opt;
	int csummode = CHECKSUM_NONE;
	unsigned int maxnonfragsize, headersize;
	unsigned int wmem_alloc_delta = 0;
	bool paged, extra_uref = false;

	/*
	 * An empty queue means the first skb is built by this call; only
	 * that skb accounts for opt_flen and the dst header space (both
	 * are reset to 0 after the first allocation below).
	 */
	skb = skb_peek_tail(queue);
	if (!skb) {
		exthdrlen = opt ? opt->opt_flen : 0;
		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
	}

	/* With a GSO size set, build one paged skb bounded by IP6_MAX_MTU. */
	paged = !!cork->gso_size;
	mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
	orig_mtu = mtu;

	if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
	    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
		tskey = sk->sk_tskey++;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	/* Headers present at the front of every skb that is queued. */
	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);

	/* All headers, used for the size checks that follow. */
	headersize = sizeof(struct ipv6hdr) +
		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
		     (dst_allfrag(&rt->dst) ?
		      sizeof(struct frag_hdr) : 0) +
		     rt->rt6i_nfheader_len;

	/*
	 * Both tests guard the unsigned computation of maxfraglen below
	 * against wrapping and against leaving no room for payload.
	 */
	if (mtu <= fragheaderlen ||
	    ((mtu - fragheaderlen) & ~7) + fragheaderlen <= sizeof(struct frag_hdr))
		goto emsgsize;

	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
		     sizeof(struct frag_hdr);

	/* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
	 * the first fragment
	 */
	if (headersize + transhdrlen > mtu)
		goto emsgsize;

	/* dontfrag on UDP/RAW: report the usable size instead of queueing. */
	if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
	    (sk->sk_protocol == IPPROTO_UDP ||
	     sk->sk_protocol == IPPROTO_RAW)) {
		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
				sizeof(struct ipv6hdr));
		goto emsgsize;
	}

	if (ip6_sk_ignore_df(sk))
		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
	else
		maxnonfragsize = mtu;

	if (cork->length + length > maxnonfragsize - headersize) {
emsgsize:
		/* Also reached by goto from the checks above. */
		pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
		ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
		return -EMSGSIZE;
	}

	/* CHECKSUM_PARTIAL only with no extension headers and when
	 * we are not going to fragment
	 */
	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
	    headersize == sizeof(struct ipv6hdr) &&
	    length <= mtu - headersize &&
	    (!(flags & MSG_MORE) || cork->gso_size) &&
	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
		csummode = CHECKSUM_PARTIAL;

	if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) {
		uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb));
		if (!uarg)
			return -ENOBUFS;
		extra_uref = !skb_zcopy(skb);	/* only ref on new uarg */
		if (rt->dst.dev->features & NETIF_F_SG &&
		    csummode == CHECKSUM_PARTIAL) {
			paged = true;
		} else {
			/* Data will be copied; keep uarg for notification. */
			uarg->zerocopy = 0;
			skb_zcopy_set(skb, uarg, &extra_uref);
		}
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	cork->length += length;
	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen, alloc_extra;
			unsigned int pagedlen;
alloc_new_skb:
			/* There's no room in the current skb */
			/*
			 * fraggap is what the current skb holds beyond
			 * maxfraglen; it is moved into the new skb below.
			 */
			if (skb)
				fraggap = skb->len - maxfraglen;
			else
				fraggap = 0;
			/* update mtu and maxfraglen if necessary */
			if (!skb || !skb_prev)
				ip6_append_data_mtu(&mtu, &maxfraglen,
						    fragheaderlen, skb, rt,
						    orig_mtu);

			skb_prev = skb;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;

			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;
			pagedlen = 0;

			/* Room allocated in addition to the fragment itself. */
			alloc_extra = hh_len;
			alloc_extra += dst_exthdrlen;
			alloc_extra += rt->dst.trailer_len;

			/* We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloc_extra += sizeof(struct frag_hdr);

			/*
			 * Choose the linear size: a full mtu when more data
			 * is announced and the device lacks SG, the whole
			 * fragment when it is not paged, otherwise at most
			 * MAX_HEADER with the rest (pagedlen) in page frags.
			 */
			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else if (!paged &&
				 (fraglen + alloc_extra < SKB_MAX_ALLOC ||
				  !(rt->dst.dev->features & NETIF_F_SG)))
				alloclen = fraglen;
			else {
				alloclen = min_t(int, fraglen, MAX_HEADER);
				pagedlen = fraglen - alloclen;
			}
			alloclen += alloc_extra;

			if (datalen != length + fraggap) {
				/*
				 * this is not the last fragment, the trailer
				 * space is regarded as data space.
				 */
				datalen += rt->dst.trailer_len;
			}

			fraglen = datalen + fragheaderlen;

			/* Bytes to fetch from the caller into the linear area. */
			copy = datalen - transhdrlen - fraggap - pagedlen;
			if (copy < 0) {
				err = -EINVAL;
				goto error;
			}
			/*
			 * The skb carrying the transport header goes through
			 * sock_alloc_send_skb(); later ones are allocated
			 * directly, as long as the memory already charged
			 * plus what this call adds is within twice sk_sndbuf.
			 */
			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk, alloclen,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
				    2 * sk->sk_sndbuf)
					skb = alloc_skb(alloclen,
							sk->sk_allocation);
				if (unlikely(!skb))
					err = -ENOBUFS;
			}
			if (!skb)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->protocol = htons(ETH_P_IPV6);
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation and ipsec header */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
				    dst_exthdrlen);

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen - pagedlen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				/*
				 * Move the excess of the previous skb over,
				 * fix up both checksums and trim the
				 * previous skb back to maxfraglen.
				 */
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			if (copy > 0 &&
			    getfrag(from, data + transhdrlen, offset,
				    copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= copy + transhdrlen;
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;

			/* Only the initial fragment is time stamped */
			skb_shinfo(skb)->tx_flags = cork->tx_flags;
			cork->tx_flags = 0;
			skb_shinfo(skb)->tskey = tskey;
			tskey = 0;
			skb_zcopy_set(skb, uarg, &extra_uref);

			if ((flags & MSG_CONFIRM) && !skb_prev)
				skb_set_dst_pending_confirm(skb, 1);

			/*
			 * Put the packet on the pending queue
			 */
			/*
			 * An skb without a destructor is not yet owned by
			 * the socket: attach it and remember its truesize;
			 * sk_wmem_alloc is updated once, on exit.
			 */
			if (!skb->destructor) {
				skb->destructor = sock_wfree;
				skb->sk = sk;
				wmem_alloc_delta += skb->truesize;
			}
			__skb_queue_tail(queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features&NETIF_F_SG) &&
		    skb_tailroom(skb) >= copy) {
			/* No SG: copy straight into the linear tailroom. */
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
						offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else if (!uarg || !uarg->zerocopy) {
			/* Copy into the page fragment and attach it as a frag. */
			int i = skb_shinfo(skb)->nr_frags;

			err = -ENOMEM;
			if (!sk_page_frag_refill(sk, pfrag))
				goto error;

			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				err = -EMSGSIZE;
				if (i == MAX_SKB_FRAGS)
					goto error;

				/* Start a new, empty frag; it grows below. */
				__skb_fill_page_desc(skb, i, pfrag->page,
						     pfrag->offset, 0);
				skb_shinfo(skb)->nr_frags = ++i;
				get_page(pfrag->page);
			}
			copy = min_t(int, copy, pfrag->size - pfrag->offset);
			if (getfrag(from,
				    page_address(pfrag->page) + pfrag->offset,
				    offset, copy, skb->len, skb) < 0)
				goto error_efault;

			pfrag->offset += copy;
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			wmem_alloc_delta += copy;
		} else {
			/* Zerocopy: let the helper attach the data to the skb. */
			err = skb_zerocopy_iter_dgram(skb, from, copy);
			if (err < 0)
				goto error;
		}
		offset += copy;
		length -= copy;
	}

	/* Charge everything that was queued by this call in one go. */
	if (wmem_alloc_delta)
		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	return 0;

error_efault:
	err = -EFAULT;
error:
	if (uarg)
		sock_zerocopy_put_abort(uarg, extra_uref);
	/* Whatever is left in length never made it into the queue. */
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	return err;
}
1804 
ip6_append_data(struct sock * sk,int getfrag (void * from,char * to,int offset,int len,int odd,struct sk_buff * skb),void * from,int length,int transhdrlen,struct ipcm6_cookie * ipc6,struct flowi6 * fl6,struct rt6_info * rt,unsigned int flags)1805 int ip6_append_data(struct sock *sk,
1806 		    int getfrag(void *from, char *to, int offset, int len,
1807 				int odd, struct sk_buff *skb),
1808 		    void *from, int length, int transhdrlen,
1809 		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1810 		    struct rt6_info *rt, unsigned int flags)
1811 {
1812 	struct inet_sock *inet = inet_sk(sk);
1813 	struct ipv6_pinfo *np = inet6_sk(sk);
1814 	int exthdrlen;
1815 	int err;
1816 
1817 	if (flags&MSG_PROBE)
1818 		return 0;
1819 	if (skb_queue_empty(&sk->sk_write_queue)) {
1820 		/*
1821 		 * setup for corking
1822 		 */
1823 		err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1824 				     ipc6, rt, fl6);
1825 		if (err)
1826 			return err;
1827 
1828 		exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1829 		length += exthdrlen;
1830 		transhdrlen += exthdrlen;
1831 	} else {
1832 		fl6 = &inet->cork.fl.u.ip6;
1833 		transhdrlen = 0;
1834 	}
1835 
1836 	return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1837 				 &np->cork, sk_page_frag(sk), getfrag,
1838 				 from, length, transhdrlen, flags, ipc6);
1839 }
1840 EXPORT_SYMBOL_GPL(ip6_append_data);
1841 
ip6_cork_release(struct inet_cork_full * cork,struct inet6_cork * v6_cork)1842 static void ip6_cork_release(struct inet_cork_full *cork,
1843 			     struct inet6_cork *v6_cork)
1844 {
1845 	if (v6_cork->opt) {
1846 		kfree(v6_cork->opt->dst0opt);
1847 		kfree(v6_cork->opt->dst1opt);
1848 		kfree(v6_cork->opt->hopopt);
1849 		kfree(v6_cork->opt->srcrt);
1850 		kfree(v6_cork->opt);
1851 		v6_cork->opt = NULL;
1852 	}
1853 
1854 	if (cork->base.dst) {
1855 		dst_release(cork->base.dst);
1856 		cork->base.dst = NULL;
1857 		cork->base.flags &= ~IPCORK_ALLFRAG;
1858 	}
1859 	memset(&cork->fl, 0, sizeof(cork->fl));
1860 }
1861 
__ip6_make_skb(struct sock * sk,struct sk_buff_head * queue,struct inet_cork_full * cork,struct inet6_cork * v6_cork)1862 struct sk_buff *__ip6_make_skb(struct sock *sk,
1863 			       struct sk_buff_head *queue,
1864 			       struct inet_cork_full *cork,
1865 			       struct inet6_cork *v6_cork)
1866 {
1867 	struct sk_buff *skb, *tmp_skb;
1868 	struct sk_buff **tail_skb;
1869 	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1870 	struct ipv6_pinfo *np = inet6_sk(sk);
1871 	struct net *net = sock_net(sk);
1872 	struct ipv6hdr *hdr;
1873 	struct ipv6_txoptions *opt = v6_cork->opt;
1874 	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1875 	struct flowi6 *fl6 = &cork->fl.u.ip6;
1876 	unsigned char proto = fl6->flowi6_proto;
1877 
1878 	skb = __skb_dequeue(queue);
1879 	if (!skb)
1880 		goto out;
1881 	tail_skb = &(skb_shinfo(skb)->frag_list);
1882 
1883 	/* move skb->data to ip header from ext header */
1884 	if (skb->data < skb_network_header(skb))
1885 		__skb_pull(skb, skb_network_offset(skb));
1886 	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1887 		__skb_pull(tmp_skb, skb_network_header_len(skb));
1888 		*tail_skb = tmp_skb;
1889 		tail_skb = &(tmp_skb->next);
1890 		skb->len += tmp_skb->len;
1891 		skb->data_len += tmp_skb->len;
1892 		skb->truesize += tmp_skb->truesize;
1893 		tmp_skb->destructor = NULL;
1894 		tmp_skb->sk = NULL;
1895 	}
1896 
1897 	/* Allow local fragmentation. */
1898 	skb->ignore_df = ip6_sk_ignore_df(sk);
1899 
1900 	*final_dst = fl6->daddr;
1901 	__skb_pull(skb, skb_network_header_len(skb));
1902 	if (opt && opt->opt_flen)
1903 		ipv6_push_frag_opts(skb, opt, &proto);
1904 	if (opt && opt->opt_nflen)
1905 		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
1906 
1907 	skb_push(skb, sizeof(struct ipv6hdr));
1908 	skb_reset_network_header(skb);
1909 	hdr = ipv6_hdr(skb);
1910 
1911 	ip6_flow_hdr(hdr, v6_cork->tclass,
1912 		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
1913 					ip6_autoflowlabel(net, np), fl6));
1914 	hdr->hop_limit = v6_cork->hop_limit;
1915 	hdr->nexthdr = proto;
1916 	hdr->saddr = fl6->saddr;
1917 	hdr->daddr = *final_dst;
1918 
1919 	skb->priority = sk->sk_priority;
1920 	skb->mark = cork->base.mark;
1921 
1922 	skb->tstamp = cork->base.transmit_time;
1923 
1924 	skb_dst_set(skb, dst_clone(&rt->dst));
1925 	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1926 	if (proto == IPPROTO_ICMPV6) {
1927 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1928 		u8 icmp6_type;
1929 
1930 		if (sk->sk_socket->type == SOCK_RAW && !inet_sk(sk)->hdrincl)
1931 			icmp6_type = fl6->fl6_icmp_type;
1932 		else
1933 			icmp6_type = icmp6_hdr(skb)->icmp6_type;
1934 		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_type);
1935 		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1936 	}
1937 
1938 	ip6_cork_release(cork, v6_cork);
1939 out:
1940 	return skb;
1941 }
1942 
ip6_send_skb(struct sk_buff * skb)1943 int ip6_send_skb(struct sk_buff *skb)
1944 {
1945 	struct net *net = sock_net(skb->sk);
1946 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1947 	int err;
1948 
1949 	err = ip6_local_out(net, skb->sk, skb);
1950 	if (err) {
1951 		if (err > 0)
1952 			err = net_xmit_errno(err);
1953 		if (err)
1954 			IP6_INC_STATS(net, rt->rt6i_idev,
1955 				      IPSTATS_MIB_OUTDISCARDS);
1956 	}
1957 
1958 	return err;
1959 }
1960 
/*
 * Build the packet from the socket's pending frames and send it.
 * Returns 0 if there was nothing pending, otherwise the result of
 * ip6_send_skb().
 */
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb = ip6_finish_skb(sk);

	return skb ? ip6_send_skb(skb) : 0;
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1972 
__ip6_flush_pending_frames(struct sock * sk,struct sk_buff_head * queue,struct inet_cork_full * cork,struct inet6_cork * v6_cork)1973 static void __ip6_flush_pending_frames(struct sock *sk,
1974 				       struct sk_buff_head *queue,
1975 				       struct inet_cork_full *cork,
1976 				       struct inet6_cork *v6_cork)
1977 {
1978 	struct sk_buff *skb;
1979 
1980 	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1981 		if (skb_dst(skb))
1982 			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1983 				      IPSTATS_MIB_OUTDISCARDS);
1984 		kfree_skb(skb);
1985 	}
1986 
1987 	ip6_cork_release(cork, v6_cork);
1988 }
1989 
ip6_flush_pending_frames(struct sock * sk)1990 void ip6_flush_pending_frames(struct sock *sk)
1991 {
1992 	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1993 				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1994 }
1995 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1996 
ip6_make_skb(struct sock * sk,int getfrag (void * from,char * to,int offset,int len,int odd,struct sk_buff * skb),void * from,int length,int transhdrlen,struct ipcm6_cookie * ipc6,struct flowi6 * fl6,struct rt6_info * rt,unsigned int flags,struct inet_cork_full * cork)1997 struct sk_buff *ip6_make_skb(struct sock *sk,
1998 			     int getfrag(void *from, char *to, int offset,
1999 					 int len, int odd, struct sk_buff *skb),
2000 			     void *from, int length, int transhdrlen,
2001 			     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
2002 			     struct rt6_info *rt, unsigned int flags,
2003 			     struct inet_cork_full *cork)
2004 {
2005 	struct inet6_cork v6_cork;
2006 	struct sk_buff_head queue;
2007 	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
2008 	int err;
2009 
2010 	if (flags & MSG_PROBE)
2011 		return NULL;
2012 
2013 	__skb_queue_head_init(&queue);
2014 
2015 	cork->base.flags = 0;
2016 	cork->base.addr = 0;
2017 	cork->base.opt = NULL;
2018 	cork->base.dst = NULL;
2019 	v6_cork.opt = NULL;
2020 	err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6);
2021 	if (err) {
2022 		ip6_cork_release(cork, &v6_cork);
2023 		return ERR_PTR(err);
2024 	}
2025 	if (ipc6->dontfrag < 0)
2026 		ipc6->dontfrag = inet6_sk(sk)->dontfrag;
2027 
2028 	err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork,
2029 				&current->task_frag, getfrag, from,
2030 				length + exthdrlen, transhdrlen + exthdrlen,
2031 				flags, ipc6);
2032 	if (err) {
2033 		__ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
2034 		return ERR_PTR(err);
2035 	}
2036 
2037 	return __ip6_make_skb(sk, &queue, cork, &v6_cork);
2038 }
2039