1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  *	IPv6 output functions
4  *	Linux INET6 implementation
5  *
6  *	Authors:
7  *	Pedro Roque		<roque@di.fc.ul.pt>
8  *
9  *	Based on linux/net/ipv4/ip_output.c
10  *
11  *	Changes:
12  *	A.N.Kuznetsov	:	arithmetic in fragmentation.
13  *				extension headers are implemented.
14  *				route changes now work.
15  *				ip6_forward does not confuse sniffers.
16  *				etc.
17  *
18  *      H. von Brand    :       Added missing #include <linux/string.h>
19  *	Imran Patel	:	frag id should be in NBO
20  *      Kazunori MIYAZAWA @USAGI
21  *			:       add ip6_append_data and related functions
22  *				for datagram xmit
23  */
24 
25 #include <linux/errno.h>
26 #include <linux/kernel.h>
27 #include <linux/string.h>
28 #include <linux/socket.h>
29 #include <linux/net.h>
30 #include <linux/netdevice.h>
31 #include <linux/if_arp.h>
32 #include <linux/in6.h>
33 #include <linux/tcp.h>
34 #include <linux/route.h>
35 #include <linux/module.h>
36 #include <linux/slab.h>
37 
38 #include <linux/bpf-cgroup.h>
39 #include <linux/netfilter.h>
40 #include <linux/netfilter_ipv6.h>
41 
42 #include <net/sock.h>
43 #include <net/snmp.h>
44 
45 #include <net/ipv6.h>
46 #include <net/ndisc.h>
47 #include <net/protocol.h>
48 #include <net/ip6_route.h>
49 #include <net/addrconf.h>
50 #include <net/rawv6.h>
51 #include <net/icmp.h>
52 #include <net/xfrm.h>
53 #include <net/checksum.h>
54 #include <linux/mroute6.h>
55 #include <net/l3mdev.h>
56 #include <net/lwtunnel.h>
57 #include <net/ip_tunnels.h>
58 
59 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
60 {
61 	struct dst_entry *dst = skb_dst(skb);
62 	struct net_device *dev = dst->dev;
63 	struct inet6_dev *idev = ip6_dst_idev(dst);
64 	unsigned int hh_len = LL_RESERVED_SPACE(dev);
65 	const struct in6_addr *daddr, *nexthop;
66 	struct ipv6hdr *hdr;
67 	struct neighbour *neigh;
68 	int ret;
69 
70 	/* Be paranoid, rather than too clever. */
71 	if (unlikely(hh_len > skb_headroom(skb)) && dev->header_ops) {
72 		skb = skb_expand_head(skb, hh_len);
73 		if (!skb) {
74 			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
75 			return -ENOMEM;
76 		}
77 	}
78 
79 	hdr = ipv6_hdr(skb);
80 	daddr = &hdr->daddr;
81 	if (ipv6_addr_is_multicast(daddr)) {
82 		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
83 		    ((mroute6_is_socket(net, skb) &&
84 		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
85 		     ipv6_chk_mcast_addr(dev, daddr, &hdr->saddr))) {
86 			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
87 
88 			/* Do not check for IFF_ALLMULTI; multicast routing
89 			   is not supported in any case.
90 			 */
91 			if (newskb)
92 				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
93 					net, sk, newskb, NULL, newskb->dev,
94 					dev_loopback_xmit);
95 
96 			if (hdr->hop_limit == 0) {
97 				IP6_INC_STATS(net, idev,
98 					      IPSTATS_MIB_OUTDISCARDS);
99 				kfree_skb(skb);
100 				return 0;
101 			}
102 		}
103 
104 		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
105 		if (IPV6_ADDR_MC_SCOPE(daddr) <= IPV6_ADDR_SCOPE_NODELOCAL &&
106 		    !(dev->flags & IFF_LOOPBACK)) {
107 			kfree_skb(skb);
108 			return 0;
109 		}
110 	}
111 
112 	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
113 		int res = lwtunnel_xmit(skb);
114 
115 		if (res != LWTUNNEL_XMIT_CONTINUE)
116 			return res;
117 	}
118 
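	/* Resolve the next-hop neighbour under RCU-BH: look it up without
	 * taking a reference, creating an entry on demand if none exists,
	 * then let neigh_output() handle L2 resolution and transmission.
	 */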
119 	rcu_read_lock_bh();
120 	nexthop = rt6_nexthop((struct rt6_info *)dst, daddr);
121 	neigh = __ipv6_neigh_lookup_noref(dev, nexthop);
122 	if (unlikely(!neigh))
123 		neigh = __neigh_create(&nd_tbl, nexthop, dev, false);
124 	if (!IS_ERR(neigh)) {
125 		sock_confirm_neigh(skb, neigh);
126 		ret = neigh_output(neigh, skb, false);
127 		rcu_read_unlock_bh();
128 		return ret;
129 	}
130 	rcu_read_unlock_bh();
131 
132 	IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTNOROUTES);
133 	kfree_skb(skb);
134 	return -EINVAL;
135 }
136 
137 static int
138 ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
139 				    struct sk_buff *skb, unsigned int mtu)
140 {
141 	struct sk_buff *segs, *nskb;
142 	netdev_features_t features;
143 	int ret = 0;
144 
145 	/* Please see corresponding comment in ip_finish_output_gso
146 	 * describing the cases where GSO segment length exceeds the
147 	 * egress MTU.
148 	 */
149 	features = netif_skb_features(skb);
150 	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
151 	if (IS_ERR_OR_NULL(segs)) {
152 		kfree_skb(skb);
153 		return -ENOMEM;
154 	}
155 
156 	consume_skb(skb);
157 
158 	skb_list_walk_safe(segs, segs, nskb) {
159 		int err;
160 
161 		skb_mark_not_on_list(segs);
162 		/* Last GSO segment can be smaller than gso_size (and MTU).
163 		 * Adding a fragment header would produce an "atomic fragment",
164 		 * which is considered harmful (RFC-8021). Avoid that.
165 		 */
166 		err = segs->len > mtu ?
167 			ip6_fragment(net, sk, segs, ip6_finish_output2) :
168 			ip6_finish_output2(net, sk, segs);
169 		if (err && ret == 0)
170 			ret = err;
171 	}
172 
173 	return ret;
174 }
175 
176 static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
177 {
178 	unsigned int mtu;
179 
180 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
181 	/* Policy lookup after SNAT yielded a new policy */
182 	if (skb_dst(skb)->xfrm) {
183 		IP6CB(skb)->flags |= IP6SKB_REROUTED;
184 		return dst_output(net, sk, skb);
185 	}
186 #endif
187 
188 	mtu = ip6_skb_dst_mtu(skb);
189 	if (skb_is_gso(skb) && !skb_gso_validate_network_len(skb, mtu))
190 		return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);
191 
192 	if ((skb->len > mtu && !skb_is_gso(skb)) ||
193 	    dst_allfrag(skb_dst(skb)) ||
194 	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
195 		return ip6_fragment(net, sk, skb, ip6_finish_output2);
196 	else
197 		return ip6_finish_output2(net, sk, skb);
198 }
199 
200 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
201 {
202 	int ret;
203 
204 	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
205 	switch (ret) {
206 	case NET_XMIT_SUCCESS:
207 		return __ip6_finish_output(net, sk, skb);
208 	case NET_XMIT_CN:
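		/* The BPF program signalled congestion: still transmit,
		 * propagating any error from the output path, but report
		 * NET_XMIT_CN when transmission itself succeeds.
		 */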
209 		return __ip6_finish_output(net, sk, skb) ? : ret;
210 	default:
211 		kfree_skb(skb);
212 		return ret;
213 	}
214 }
215 
216 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
217 {
218 	struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev;
219 	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
220 
221 	skb->protocol = htons(ETH_P_IPV6);
222 	skb->dev = dev;
223 
224 	if (unlikely(idev->cnf.disable_ipv6)) {
225 		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
226 		kfree_skb(skb);
227 		return 0;
228 	}
229 
230 	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
231 			    net, sk, skb, indev, dev,
232 			    ip6_finish_output,
233 			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
234 }
235 EXPORT_SYMBOL(ip6_output);
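
/* ip6_output() is normally reached via dst_output(), which invokes the
 * dst's ->output callback. A minimal sketch of the local-send call chain
 * within this file:
 *
 *	ip6_xmit()
 *	  -> NF_INET_LOCAL_OUT -> dst_output()
 *	    -> ip6_output()
 *	      -> NF_INET_POST_ROUTING -> ip6_finish_output()
 *	        -> __ip6_finish_output() -> ip6_finish_output2()
 */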
236 
237 bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
238 {
239 	if (!np->autoflowlabel_set)
240 		return ip6_default_np_autolabel(net);
241 	else
242 		return np->autoflowlabel;
243 }
244 
245 /*
246  * xmit an sk_buff (used by TCP, SCTP and DCCP)
247  * Note : socket lock is not held for SYNACK packets, but might be modified
248  * by calls to skb_set_owner_w() and ipv6_local_error(),
249  * which are using proper atomic operations or spinlocks.
250  */
251 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
252 	     __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
253 {
254 	struct net *net = sock_net(sk);
255 	const struct ipv6_pinfo *np = inet6_sk(sk);
256 	struct in6_addr *first_hop = &fl6->daddr;
257 	struct dst_entry *dst = skb_dst(skb);
258 	struct net_device *dev = dst->dev;
259 	struct inet6_dev *idev = ip6_dst_idev(dst);
260 	unsigned int head_room;
261 	struct ipv6hdr *hdr;
262 	u8  proto = fl6->flowi6_proto;
263 	int seg_len = skb->len;
264 	int hlimit = -1;
265 	u32 mtu;
266 
267 	head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dev);
268 	if (opt)
269 		head_room += opt->opt_nflen + opt->opt_flen;
270 
271 	if (unlikely(head_room > skb_headroom(skb))) {
272 		skb = skb_expand_head(skb, head_room);
273 		if (!skb) {
274 			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
275 			return -ENOBUFS;
276 		}
277 	}
278 
279 	if (opt) {
280 		seg_len += opt->opt_nflen + opt->opt_flen;
281 
282 		if (opt->opt_flen)
283 			ipv6_push_frag_opts(skb, opt, &proto);
284 
285 		if (opt->opt_nflen)
286 			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
287 					     &fl6->saddr);
288 	}
289 
290 	skb_push(skb, sizeof(struct ipv6hdr));
291 	skb_reset_network_header(skb);
292 	hdr = ipv6_hdr(skb);
293 
294 	/*
295 	 *	Fill in the IPv6 header
296 	 */
297 	if (np)
298 		hlimit = np->hop_limit;
299 	if (hlimit < 0)
300 		hlimit = ip6_dst_hoplimit(dst);
301 
302 	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
303 				ip6_autoflowlabel(net, np), fl6));
304 
305 	hdr->payload_len = htons(seg_len);
306 	hdr->nexthdr = proto;
307 	hdr->hop_limit = hlimit;
308 
309 	hdr->saddr = fl6->saddr;
310 	hdr->daddr = *first_hop;
311 
312 	skb->protocol = htons(ETH_P_IPV6);
313 	skb->priority = priority;
314 	skb->mark = mark;
315 
316 	mtu = dst_mtu(dst);
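	/* GSO skbs may legitimately exceed the path MTU at this point;
	 * they are segmented into MTU-sized packets further down the
	 * stack, so only oversized non-GSO packets without ignore_df
	 * set are rejected here.
	 */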
317 	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
318 		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len);
319 
320 		/* if egress device is enslaved to an L3 master device pass the
321 		 * skb to its handler for processing
322 		 */
323 		skb = l3mdev_ip6_out((struct sock *)sk, skb);
324 		if (unlikely(!skb))
325 			return 0;
326 
327 		/* hooks should never assume socket lock is held.
328 		 * we promote our socket to non const
329 		 */
330 		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
331 			       net, (struct sock *)sk, skb, NULL, dev,
332 			       dst_output);
333 	}
334 
335 	skb->dev = dev;
336 	/* ipv6_local_error() does not require socket lock,
337 	 * we promote our socket to non const
338 	 */
339 	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
340 
341 	IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS);
342 	kfree_skb(skb);
343 	return -EMSGSIZE;
344 }
345 EXPORT_SYMBOL(ip6_xmit);
346 
347 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
348 {
349 	struct ip6_ra_chain *ra;
350 	struct sock *last = NULL;
351 
352 	read_lock(&ip6_ra_lock);
353 	for (ra = ip6_ra_chain; ra; ra = ra->next) {
354 		struct sock *sk = ra->sk;
355 		if (sk && ra->sel == sel &&
356 		    (!sk->sk_bound_dev_if ||
357 		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
358 			struct ipv6_pinfo *np = inet6_sk(sk);
359 
360 			if (np && np->rtalert_isolate &&
361 			    !net_eq(sock_net(sk), dev_net(skb->dev))) {
362 				continue;
363 			}
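			/* Deliver a clone to every matching socket except
			 * the last one; the final match receives the
			 * original skb below, saving one clone.
			 */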
364 			if (last) {
365 				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
366 				if (skb2)
367 					rawv6_rcv(last, skb2);
368 			}
369 			last = sk;
370 		}
371 	}
372 
373 	if (last) {
374 		rawv6_rcv(last, skb);
375 		read_unlock(&ip6_ra_lock);
376 		return 1;
377 	}
378 	read_unlock(&ip6_ra_lock);
379 	return 0;
380 }
381 
382 static int ip6_forward_proxy_check(struct sk_buff *skb)
383 {
384 	struct ipv6hdr *hdr = ipv6_hdr(skb);
385 	u8 nexthdr = hdr->nexthdr;
386 	__be16 frag_off;
387 	int offset;
388 
389 	if (ipv6_ext_hdr(nexthdr)) {
390 		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
391 		if (offset < 0)
392 			return 0;
393 	} else
394 		offset = sizeof(struct ipv6hdr);
395 
396 	if (nexthdr == IPPROTO_ICMPV6) {
397 		struct icmp6hdr *icmp6;
398 
399 		if (!pskb_may_pull(skb, (skb_network_header(skb) +
400 					 offset + 1 - skb->data)))
401 			return 0;
402 
403 		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
404 
405 		switch (icmp6->icmp6_type) {
406 		case NDISC_ROUTER_SOLICITATION:
407 		case NDISC_ROUTER_ADVERTISEMENT:
408 		case NDISC_NEIGHBOUR_SOLICITATION:
409 		case NDISC_NEIGHBOUR_ADVERTISEMENT:
410 		case NDISC_REDIRECT:
411 			/* Unicast neighbor discovery messages destined
412 			 * to the proxied address are passed to the input
413 			 * function.
414 			 */
415 			return 1;
416 		default:
417 			break;
418 		}
419 	}
420 
421 	/*
422 	 * The proxying router can't forward traffic sent to a link-local
423 	 * address, so signal the sender and discard the packet. This
424 	 * behavior is clarified by the MIPv6 specification.
425 	 */
426 	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
427 		dst_link_failure(skb);
428 		return -1;
429 	}
430 
431 	return 0;
432 }
433 
434 static inline int ip6_forward_finish(struct net *net, struct sock *sk,
435 				     struct sk_buff *skb)
436 {
437 	struct dst_entry *dst = skb_dst(skb);
438 
439 	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
440 	__IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
441 
442 #ifdef CONFIG_NET_SWITCHDEV
443 	if (skb->offload_l3_fwd_mark) {
444 		consume_skb(skb);
445 		return 0;
446 	}
447 #endif
448 
449 	skb->tstamp = 0;
450 	return dst_output(net, sk, skb);
451 }
452 
453 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
454 {
455 	if (skb->len <= mtu)
456 		return false;
457 
458 	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
459 	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
460 		return true;
461 
462 	if (skb->ignore_df)
463 		return false;
464 
465 	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
466 		return false;
467 
468 	return true;
469 }
470 
471 int ip6_forward(struct sk_buff *skb)
472 {
473 	struct dst_entry *dst = skb_dst(skb);
474 	struct ipv6hdr *hdr = ipv6_hdr(skb);
475 	struct inet6_skb_parm *opt = IP6CB(skb);
476 	struct net *net = dev_net(dst->dev);
477 	struct inet6_dev *idev;
478 	u32 mtu;
479 
480 	idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
481 	if (net->ipv6.devconf_all->forwarding == 0)
482 		goto error;
483 
484 	if (skb->pkt_type != PACKET_HOST)
485 		goto drop;
486 
487 	if (unlikely(skb->sk))
488 		goto drop;
489 
490 	if (skb_warn_if_lro(skb))
491 		goto drop;
492 
493 	if (!net->ipv6.devconf_all->disable_policy &&
494 	    (!idev || !idev->cnf.disable_policy) &&
495 	    !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
496 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
497 		goto drop;
498 	}
499 
500 	skb_forward_csum(skb);
501 
502 	/*
503 	 *	We DO NOT make any processing on
504 	 *	RA packets, pushing them to user level AS IS
505  *	without any WARRANTY that the application will be able
506 	 *	to interpret them. The reason is that we
507 	 *	cannot make anything clever here.
508 	 *
509 	 *	We are not end-node, so that if packet contains
510 	 *	AH/ESP, we cannot make anything.
511 	 *	Defragmentation also would be mistake, RA packets
512 	 *	cannot be fragmented, because there is no warranty
513 	 *	that different fragments will go along one path. --ANK
514 	 */
515 	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
516 		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
517 			return 0;
518 	}
519 
520 	/*
521 	 *	check and decrement ttl
522 	 */
523 	if (hdr->hop_limit <= 1) {
524 		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
525 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
526 
527 		kfree_skb(skb);
528 		return -ETIMEDOUT;
529 	}
530 
531 	/* XXX: idev->cnf.proxy_ndp? */
532 	if (net->ipv6.devconf_all->proxy_ndp &&
533 	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
534 		int proxied = ip6_forward_proxy_check(skb);
535 		if (proxied > 0) {
536 			/* It's tempting to decrease the hop limit
537 			 * here by 1, as we do at the end of the
538 			 * function too.
539 			 *
540 			 * But that would be incorrect, as proxying is
541 			 * not forwarding.  The ip6_input function
542 			 * will handle this packet locally, and it
543 			 * depends on the hop limit being unchanged.
544 			 *
545 			 * One example is the NDP hop limit, which
546 			 * always has to stay 255, but others would be
547 			 * similar checks around RA packets, where the
548 			 * user can even change the desired limit.
549 			 */
550 			return ip6_input(skb);
551 		} else if (proxied < 0) {
552 			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
553 			goto drop;
554 		}
555 	}
556 
557 	if (!xfrm6_route_forward(skb)) {
558 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
559 		goto drop;
560 	}
561 	dst = skb_dst(skb);
562 
563 	/* IPv6 specs say nothing about it, but it is clear that we cannot
564 	   send redirects to source routed frames.
565 	   We don't send redirects to frames decapsulated from IPsec.
566 	 */
567 	if (IP6CB(skb)->iif == dst->dev->ifindex &&
568 	    opt->srcrt == 0 && !skb_sec_path(skb)) {
569 		struct in6_addr *target = NULL;
570 		struct inet_peer *peer;
571 		struct rt6_info *rt;
572 
573 		/*
574 		 *	incoming and outgoing devices are the same
575 		 *	send a redirect.
576 		 */
577 
578 		rt = (struct rt6_info *) dst;
579 		if (rt->rt6i_flags & RTF_GATEWAY)
580 			target = &rt->rt6i_gateway;
581 		else
582 			target = &hdr->daddr;
583 
584 		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
585 
586 		/* Limit redirects both by destination (here)
587 		   and by source (inside ndisc_send_redirect)
588 		 */
589 		if (inet_peer_xrlim_allow(peer, 1*HZ))
590 			ndisc_send_redirect(skb, target);
591 		if (peer)
592 			inet_putpeer(peer);
593 	} else {
594 		int addrtype = ipv6_addr_type(&hdr->saddr);
595 
596 		/* This check is security critical. */
597 		if (addrtype == IPV6_ADDR_ANY ||
598 		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
599 			goto error;
600 		if (addrtype & IPV6_ADDR_LINKLOCAL) {
601 			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
602 				    ICMPV6_NOT_NEIGHBOUR, 0);
603 			goto error;
604 		}
605 	}
606 
607 	mtu = ip6_dst_mtu_maybe_forward(dst, true);
608 	if (mtu < IPV6_MIN_MTU)
609 		mtu = IPV6_MIN_MTU;
610 
611 	if (ip6_pkt_too_big(skb, mtu)) {
612 		/* Again, force OUTPUT device used as source address */
613 		skb->dev = dst->dev;
614 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
615 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
616 		__IP6_INC_STATS(net, ip6_dst_idev(dst),
617 				IPSTATS_MIB_FRAGFAILS);
618 		kfree_skb(skb);
619 		return -EMSGSIZE;
620 	}
621 
622 	if (skb_cow(skb, dst->dev->hard_header_len)) {
623 		__IP6_INC_STATS(net, ip6_dst_idev(dst),
624 				IPSTATS_MIB_OUTDISCARDS);
625 		goto drop;
626 	}
627 
628 	hdr = ipv6_hdr(skb);
629 
630 	/* Mangling hops number delayed to point after skb COW */
631 
632 	hdr->hop_limit--;
633 
634 	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
635 		       net, NULL, skb, skb->dev, dst->dev,
636 		       ip6_forward_finish);
637 
638 error:
639 	__IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
640 drop:
641 	kfree_skb(skb);
642 	return -EINVAL;
643 }
644 
645 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
646 {
647 	to->pkt_type = from->pkt_type;
648 	to->priority = from->priority;
649 	to->protocol = from->protocol;
650 	skb_dst_drop(to);
651 	skb_dst_set(to, dst_clone(skb_dst(from)));
652 	to->dev = from->dev;
653 	to->mark = from->mark;
654 
655 	skb_copy_hash(to, from);
656 
657 #ifdef CONFIG_NET_SCHED
658 	to->tc_index = from->tc_index;
659 #endif
660 	nf_copy(to, from);
661 	skb_ext_copy(to, from);
662 	skb_copy_secmark(to, from);
663 }
664 
665 int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
666 		      u8 nexthdr, __be32 frag_id,
667 		      struct ip6_fraglist_iter *iter)
668 {
669 	unsigned int first_len;
670 	struct frag_hdr *fh;
671 
672 	/* BUILD HEADER */
673 	*prevhdr = NEXTHDR_FRAGMENT;
674 	iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
675 	if (!iter->tmp_hdr)
676 		return -ENOMEM;
677 
678 	iter->frag = skb_shinfo(skb)->frag_list;
679 	skb_frag_list_init(skb);
680 
681 	iter->offset = 0;
682 	iter->hlen = hlen;
683 	iter->frag_id = frag_id;
684 	iter->nexthdr = nexthdr;
685 
686 	__skb_pull(skb, hlen);
687 	fh = __skb_push(skb, sizeof(struct frag_hdr));
688 	__skb_push(skb, hlen);
689 	skb_reset_network_header(skb);
690 	memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);
691 
692 	fh->nexthdr = nexthdr;
693 	fh->reserved = 0;
694 	fh->frag_off = htons(IP6_MF);
695 	fh->identification = frag_id;
696 
697 	first_len = skb_pagelen(skb);
698 	skb->data_len = first_len - skb_headlen(skb);
699 	skb->len = first_len;
700 	ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));
701 
702 	return 0;
703 }
704 EXPORT_SYMBOL(ip6_fraglist_init);
705 
706 void ip6_fraglist_prepare(struct sk_buff *skb,
707 			  struct ip6_fraglist_iter *iter)
708 {
709 	struct sk_buff *frag = iter->frag;
710 	unsigned int hlen = iter->hlen;
711 	struct frag_hdr *fh;
712 
713 	frag->ip_summed = CHECKSUM_NONE;
714 	skb_reset_transport_header(frag);
715 	fh = __skb_push(frag, sizeof(struct frag_hdr));
716 	__skb_push(frag, hlen);
717 	skb_reset_network_header(frag);
718 	memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
719 	iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
720 	fh->nexthdr = iter->nexthdr;
721 	fh->reserved = 0;
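	/* iter->offset is maintained in bytes; since every non-final
	 * fragment is a multiple of 8 bytes long, the byte offset equals
	 * the wire encoding (a 13-bit offset in 8-octet units occupying
	 * the bits above the M flag).
	 */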
722 	fh->frag_off = htons(iter->offset);
723 	if (frag->next)
724 		fh->frag_off |= htons(IP6_MF);
725 	fh->identification = iter->frag_id;
726 	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
727 	ip6_copy_metadata(frag, skb);
728 }
729 EXPORT_SYMBOL(ip6_fraglist_prepare);
730 
731 void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
732 		   unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
733 		   u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
734 {
735 	state->prevhdr = prevhdr;
736 	state->nexthdr = nexthdr;
737 	state->frag_id = frag_id;
738 
739 	state->hlen = hlen;
740 	state->mtu = mtu;
741 
742 	state->left = skb->len - hlen;	/* Space per frame */
743 	state->ptr = hlen;		/* Where to start from */
744 
745 	state->hroom = hdr_room;
746 	state->troom = needed_tailroom;
747 
748 	state->offset = 0;
749 }
750 EXPORT_SYMBOL(ip6_frag_init);
751 
752 struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
753 {
754 	u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
755 	struct sk_buff *frag;
756 	struct frag_hdr *fh;
757 	unsigned int len;
758 
759 	len = state->left;
760 	/* IF: it doesn't fit, use 'mtu' - the data space left */
761 	if (len > state->mtu)
762 		len = state->mtu;
763 	/* IF: we are not sending up to and including the packet end
764 	   then align the next start on an eight byte boundary */
765 	if (len < state->left)
766 		len &= ~7;
767 
768 	/* Allocate buffer */
769 	frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
770 			 state->hroom + state->troom, GFP_ATOMIC);
771 	if (!frag)
772 		return ERR_PTR(-ENOMEM);
773 
774 	/*
775 	 *	Set up data on packet
776 	 */
777 
778 	ip6_copy_metadata(frag, skb);
779 	skb_reserve(frag, state->hroom);
780 	skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
781 	skb_reset_network_header(frag);
782 	fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
783 	frag->transport_header = (frag->network_header + state->hlen +
784 				  sizeof(struct frag_hdr));
785 
786 	/*
787 	 *	Charge the memory for the fragment to any owner
788 	 *	it might possess
789 	 */
790 	if (skb->sk)
791 		skb_set_owner_w(frag, skb->sk);
792 
793 	/*
794 	 *	Copy the packet header into the new buffer.
795 	 */
796 	skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);
797 
798 	fragnexthdr_offset = skb_network_header(frag);
799 	fragnexthdr_offset += prevhdr - skb_network_header(skb);
800 	*fragnexthdr_offset = NEXTHDR_FRAGMENT;
801 
802 	/*
803 	 *	Build fragment header.
804 	 */
805 	fh->nexthdr = state->nexthdr;
806 	fh->reserved = 0;
807 	fh->identification = state->frag_id;
808 
809 	/*
810 	 *	Copy a block of the IP datagram.
811 	 */
812 	BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
813 			     len));
814 	state->left -= len;
815 
816 	fh->frag_off = htons(state->offset);
817 	if (state->left > 0)
818 		fh->frag_off |= htons(IP6_MF);
819 	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
820 
821 	state->ptr += len;
822 	state->offset += len;
823 
824 	return frag;
825 }
826 EXPORT_SYMBOL(ip6_frag_next);
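
/* A minimal usage sketch of the ip6_frag_init()/ip6_frag_next() helpers
 * above, mirroring the slow_path loop in ip6_fragment() below (error
 * handling elided):
 *
 *	struct ip6_frag_state state;
 *
 *	ip6_frag_init(skb, hlen, mtu, tailroom, hroom, prevhdr, nexthdr,
 *		      frag_id, &state);
 *	while (state.left > 0) {
 *		struct sk_buff *frag = ip6_frag_next(skb, &state);
 *
 *		if (IS_ERR(frag))
 *			break;
 *		err = output(net, sk, frag);
 *	}
 */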
827 
828 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
829 		 int (*output)(struct net *, struct sock *, struct sk_buff *))
830 {
831 	struct sk_buff *frag;
832 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
833 	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
834 				inet6_sk(skb->sk) : NULL;
835 	struct ip6_frag_state state;
836 	unsigned int mtu, hlen, nexthdr_offset;
837 	ktime_t tstamp = skb->tstamp;
838 	int hroom, err = 0;
839 	__be32 frag_id;
840 	u8 *prevhdr, nexthdr = 0;
841 
842 	err = ip6_find_1stfragopt(skb, &prevhdr);
843 	if (err < 0)
844 		goto fail;
845 	hlen = err;
846 	nexthdr = *prevhdr;
847 	nexthdr_offset = prevhdr - skb_network_header(skb);
848 
849 	mtu = ip6_skb_dst_mtu(skb);
850 
851 	/* We must not fragment if the socket is set to force MTU discovery
852 	 * or if the skb is not generated by a local socket.
853 	 */
854 	if (unlikely(!skb->ignore_df && skb->len > mtu))
855 		goto fail_toobig;
856 
857 	if (IP6CB(skb)->frag_max_size) {
858 		if (IP6CB(skb)->frag_max_size > mtu)
859 			goto fail_toobig;
860 
861 		/* don't send fragments larger than what we received */
862 		mtu = IP6CB(skb)->frag_max_size;
863 		if (mtu < IPV6_MIN_MTU)
864 			mtu = IPV6_MIN_MTU;
865 	}
866 
867 	if (np && np->frag_size < mtu) {
868 		if (np->frag_size)
869 			mtu = np->frag_size;
870 	}
871 	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
872 		goto fail_toobig;
873 	mtu -= hlen + sizeof(struct frag_hdr);
874 
875 	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
876 				    &ipv6_hdr(skb)->saddr);
877 
878 	if (skb->ip_summed == CHECKSUM_PARTIAL &&
879 	    (err = skb_checksum_help(skb)))
880 		goto fail;
881 
882 	prevhdr = skb_network_header(skb) + nexthdr_offset;
883 	hroom = LL_RESERVED_SPACE(rt->dst.dev);
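	/* Fast path: when the skb already carries a frag_list whose
	 * geometry fits the target MTU, reuse those buffers and only
	 * prepend fragment headers; otherwise fall back to the copying
	 * slow path below.
	 */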
884 	if (skb_has_frag_list(skb)) {
885 		unsigned int first_len = skb_pagelen(skb);
886 		struct ip6_fraglist_iter iter;
887 		struct sk_buff *frag2;
888 
889 		if (first_len - hlen > mtu ||
890 		    ((first_len - hlen) & 7) ||
891 		    skb_cloned(skb) ||
892 		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
893 			goto slow_path;
894 
895 		skb_walk_frags(skb, frag) {
896 			/* Correct geometry. */
897 			if (frag->len > mtu ||
898 			    ((frag->len & 7) && frag->next) ||
899 			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
900 				goto slow_path_clean;
901 
902 			/* Partially cloned skb? */
903 			if (skb_shared(frag))
904 				goto slow_path_clean;
905 
906 			BUG_ON(frag->sk);
907 			if (skb->sk) {
908 				frag->sk = skb->sk;
909 				frag->destructor = sock_wfree;
910 			}
911 			skb->truesize -= frag->truesize;
912 		}
913 
914 		err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
915 					&iter);
916 		if (err < 0)
917 			goto fail;
918 
919 		/* We prevent @rt from being freed. */
920 		rcu_read_lock();
921 
922 		for (;;) {
923 			/* Prepare the header of the next frame
924 			 * before the previous one goes down. */
925 			if (iter.frag)
926 				ip6_fraglist_prepare(skb, &iter);
927 
928 			skb->tstamp = tstamp;
929 			err = output(net, sk, skb);
930 			if (!err)
931 				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
932 					      IPSTATS_MIB_FRAGCREATES);
933 
934 			if (err || !iter.frag)
935 				break;
936 
937 			skb = ip6_fraglist_next(&iter);
938 		}
939 
940 		kfree(iter.tmp_hdr);
941 
942 		if (err == 0) {
943 			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
944 				      IPSTATS_MIB_FRAGOKS);
945 			rcu_read_unlock();
946 			return 0;
947 		}
948 
949 		kfree_skb_list(iter.frag);
950 
951 		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
952 			      IPSTATS_MIB_FRAGFAILS);
953 		rcu_read_unlock();
954 		return err;
955 
956 slow_path_clean:
957 		skb_walk_frags(skb, frag2) {
958 			if (frag2 == frag)
959 				break;
960 			frag2->sk = NULL;
961 			frag2->destructor = NULL;
962 			skb->truesize += frag2->truesize;
963 		}
964 	}
965 
966 slow_path:
967 	/*
968 	 *	Fragment the datagram.
969 	 */
970 
971 	ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
972 		      LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
973 		      &state);
974 
975 	/*
976 	 *	Keep copying data until we run out.
977 	 */
978 
979 	while (state.left > 0) {
980 		frag = ip6_frag_next(skb, &state);
981 		if (IS_ERR(frag)) {
982 			err = PTR_ERR(frag);
983 			goto fail;
984 		}
985 
986 		/*
987 		 *	Put this fragment into the sending queue.
988 		 */
989 		frag->tstamp = tstamp;
990 		err = output(net, sk, frag);
991 		if (err)
992 			goto fail;
993 
994 		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
995 			      IPSTATS_MIB_FRAGCREATES);
996 	}
997 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
998 		      IPSTATS_MIB_FRAGOKS);
999 	consume_skb(skb);
1000 	return err;
1001 
1002 fail_toobig:
1003 	if (skb->sk && dst_allfrag(skb_dst(skb)))
1004 		sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
1005 
1006 	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
1007 	err = -EMSGSIZE;
1008 
1009 fail:
1010 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1011 		      IPSTATS_MIB_FRAGFAILS);
1012 	kfree_skb(skb);
1013 	return err;
1014 }
1015 
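/* Returns nonzero ("stale") when the flow address matches neither a host
 * route key (plen == 128) nor the socket's cached peer address; zero means
 * the cached route may still serve this flow.
 */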
1016 static inline int ip6_rt_check(const struct rt6key *rt_key,
1017 			       const struct in6_addr *fl_addr,
1018 			       const struct in6_addr *addr_cache)
1019 {
1020 	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
1021 		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
1022 }
1023 
1024 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
1025 					  struct dst_entry *dst,
1026 					  const struct flowi6 *fl6)
1027 {
1028 	struct ipv6_pinfo *np = inet6_sk(sk);
1029 	struct rt6_info *rt;
1030 
1031 	if (!dst)
1032 		goto out;
1033 
1034 	if (dst->ops->family != AF_INET6) {
1035 		dst_release(dst);
1036 		return NULL;
1037 	}
1038 
1039 	rt = (struct rt6_info *)dst;
1040 	/* Yes, checking route validity in the not-connected
1041 	 * case is not very simple. Take into account
1042 	 * that we do not support routing by source, TOS,
1043 	 * and MSG_DONTROUTE		--ANK (980726)
1044 	 *
1045 	 * 1. ip6_rt_check(): If route was host route,
1046 	 *    check that cached destination is current.
1047 	 *    If it is network route, we still may
1048 	 *    check its validity using saved pointer
1049 	 *    to the last used address: daddr_cache.
1050 	 *    We do not want to save whole address now,
1051 	 *    (because the main consumer of this service
1052 	 *    is TCP, which does not have this problem),
1053 	 *    so that the last trick works only on connected
1054 	 *    sockets.
1055 	 * 2. oif also should be the same.
1056 	 */
1057 	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
1058 #ifdef CONFIG_IPV6_SUBTREES
1059 	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
1060 #endif
1061 	   (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
1062 	      (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
1063 		dst_release(dst);
1064 		dst = NULL;
1065 	}
1066 
1067 out:
1068 	return dst;
1069 }
1070 
1071 static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
1072 			       struct dst_entry **dst, struct flowi6 *fl6)
1073 {
1074 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1075 	struct neighbour *n;
1076 	struct rt6_info *rt;
1077 #endif
1078 	int err;
1079 	int flags = 0;
1080 
1081 	/* The correct way to handle this would be to do
1082 	 * ip6_route_get_saddr, and then ip6_route_output; however,
1083 	 * the route-specific preferred source forces the
1084 	 * ip6_route_output call _before_ ip6_route_get_saddr.
1085 	 *
1086 	 * In source specific routing (no src=any default route),
1087 	 * ip6_route_output will fail given src=any saddr, though, so
1088 	 * that's why we try it again later.
1089 	 */
1090 	if (ipv6_addr_any(&fl6->saddr)) {
1091 		struct fib6_info *from;
1092 		struct rt6_info *rt;
1093 
1094 		*dst = ip6_route_output(net, sk, fl6);
1095 		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
1096 
1097 		rcu_read_lock();
1098 		from = rt ? rcu_dereference(rt->from) : NULL;
1099 		err = ip6_route_get_saddr(net, from, &fl6->daddr,
1100 					  sk ? inet6_sk(sk)->srcprefs : 0,
1101 					  &fl6->saddr);
1102 		rcu_read_unlock();
1103 
1104 		if (err)
1105 			goto out_err_release;
1106 
1107 		/* If we had an erroneous initial result, pretend it
1108 		 * never existed and let the SA-enabled version take
1109 		 * over.
1110 		 */
1111 		if ((*dst)->error) {
1112 			dst_release(*dst);
1113 			*dst = NULL;
1114 		}
1115 
1116 		if (fl6->flowi6_oif)
1117 			flags |= RT6_LOOKUP_F_IFACE;
1118 	}
1119 
1120 	if (!*dst)
1121 		*dst = ip6_route_output_flags(net, sk, fl6, flags);
1122 
1123 	err = (*dst)->error;
1124 	if (err)
1125 		goto out_err_release;
1126 
1127 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1128 	/*
1129 	 * Here if the dst entry we've looked up
1130 	 * has a neighbour entry that is in the INCOMPLETE
1131 	 * state and the src address from the flow is
1132 	 * marked as OPTIMISTIC, we release the found
1133 	 * dst entry and replace it instead with the
1134 	 * dst entry of the nexthop router
1135 	 */
1136 	rt = (struct rt6_info *) *dst;
1137 	rcu_read_lock_bh();
1138 	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
1139 				      rt6_nexthop(rt, &fl6->daddr));
1140 	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
1141 	rcu_read_unlock_bh();
1142 
1143 	if (err) {
1144 		struct inet6_ifaddr *ifp;
1145 		struct flowi6 fl_gw6;
1146 		int redirect;
1147 
1148 		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
1149 				      (*dst)->dev, 1);
1150 
1151 		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
1152 		if (ifp)
1153 			in6_ifa_put(ifp);
1154 
1155 		if (redirect) {
1156 			/*
1157 			 * We need to get the dst entry for the
1158 			 * default router instead
1159 			 */
1160 			dst_release(*dst);
1161 			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1162 			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1163 			*dst = ip6_route_output(net, sk, &fl_gw6);
1164 			err = (*dst)->error;
1165 			if (err)
1166 				goto out_err_release;
1167 		}
1168 	}
1169 #endif
1170 	if (ipv6_addr_v4mapped(&fl6->saddr) &&
1171 	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
1172 		err = -EAFNOSUPPORT;
1173 		goto out_err_release;
1174 	}
1175 
1176 	return 0;
1177 
1178 out_err_release:
1179 	dst_release(*dst);
1180 	*dst = NULL;
1181 
1182 	if (err == -ENETUNREACH)
1183 		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1184 	return err;
1185 }
1186 
1187 /**
1188  *	ip6_dst_lookup - perform route lookup on flow
1189  *	@net: Network namespace to perform lookup in
1190  *	@sk: socket which provides route info
1191  *	@dst: pointer to dst_entry * for result
1192  *	@fl6: flow to lookup
1193  *
1194  *	This function performs a route lookup on the given flow.
1195  *
1196  *	It returns zero on success, or a standard errno code on error.
1197  */
1198 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1199 		   struct flowi6 *fl6)
1200 {
1201 	*dst = NULL;
1202 	return ip6_dst_lookup_tail(net, sk, dst, fl6);
1203 }
1204 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
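
/* Typical usage (a minimal sketch; daddr and oif stand in for
 * caller-provided state, error handling elided):
 *
 *	struct flowi6 fl6 = { .daddr = *daddr, .flowi6_oif = oif };
 *	struct dst_entry *dst;
 *
 *	if (!ip6_dst_lookup(net, sk, &dst, &fl6))
 *		skb_dst_set(skb, dst);
 */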
1205 
1206 /**
1207  *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1208  *	@net: Network namespace to perform lookup in
1209  *	@sk: socket which provides route info
1210  *	@fl6: flow to lookup
1211  *	@final_dst: final destination address for ipsec lookup
1212  *
1213  *	This function performs a route lookup on the given flow.
1214  *
1215  *	It returns a valid dst pointer on success, or a pointer encoded
1216  *	error code.
1217  */
1218 struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
1219 				      const struct in6_addr *final_dst)
1220 {
1221 	struct dst_entry *dst = NULL;
1222 	int err;
1223 
1224 	err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
1225 	if (err)
1226 		return ERR_PTR(err);
1227 	if (final_dst)
1228 		fl6->daddr = *final_dst;
1229 
1230 	return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
1231 }
1232 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1233 
1234 /**
1235  *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1236  *	@sk: socket which provides the dst cache and route info
1237  *	@fl6: flow to lookup
1238  *	@final_dst: final destination address for ipsec lookup
1239  *	@connected: whether @sk is connected or not
1240  *
1241  *	This function performs a route lookup on the given flow with the
1242  *	possibility of using the cached route in the socket if it is valid.
1243  *	It will take the socket dst lock when operating on the dst cache.
1244  *	As a result, this function can only be used in process context.
1245  *
1246  *	In addition, for a connected socket, cache the dst in the socket
1247  *	if the current cache is not valid.
1248  *
1249  *	It returns a valid dst pointer on success, or a pointer encoded
1250  *	error code.
1251  */
1252 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1253 					 const struct in6_addr *final_dst,
1254 					 bool connected)
1255 {
1256 	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1257 
1258 	dst = ip6_sk_dst_check(sk, dst, fl6);
1259 	if (dst)
1260 		return dst;
1261 
1262 	dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
1263 	if (connected && !IS_ERR(dst))
1264 		ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);
1265 
1266 	return dst;
1267 }
1268 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1269 
1270 /**
1271  *      ip6_dst_lookup_tunnel - perform route lookup on tunnel
1272  *      @skb: Packet for which lookup is done
1273  *      @dev: Tunnel device
1274  *      @net: Network namespace of tunnel device
1275  *      @sock: Socket which provides route info
1276  *      @saddr: Memory to store the src ip address
1277  *      @info: Tunnel information
1278  *      @protocol: IP protocol
1279  *      @use_cache: Flag to enable cache usage
1280  *      This function performs a route lookup on a tunnel
1281  *
1282  *      It returns a valid dst pointer and stores src address to be used in
1283  *      tunnel in param saddr on success, else a pointer encoded error code.
1284  */
1285 
1286 struct dst_entry *ip6_dst_lookup_tunnel(struct sk_buff *skb,
1287 					struct net_device *dev,
1288 					struct net *net,
1289 					struct socket *sock,
1290 					struct in6_addr *saddr,
1291 					const struct ip_tunnel_info *info,
1292 					u8 protocol,
1293 					bool use_cache)
1294 {
1295 	struct dst_entry *dst = NULL;
1296 #ifdef CONFIG_DST_CACHE
1297 	struct dst_cache *dst_cache;
1298 #endif
1299 	struct flowi6 fl6;
1300 	__u8 prio;
1301 
1302 #ifdef CONFIG_DST_CACHE
1303 	dst_cache = (struct dst_cache *)&info->dst_cache;
1304 	if (use_cache) {
1305 		dst = dst_cache_get_ip6(dst_cache, saddr);
1306 		if (dst)
1307 			return dst;
1308 	}
1309 #endif
1310 	memset(&fl6, 0, sizeof(fl6));
1311 	fl6.flowi6_mark = skb->mark;
1312 	fl6.flowi6_proto = protocol;
1313 	fl6.daddr = info->key.u.ipv6.dst;
1314 	fl6.saddr = info->key.u.ipv6.src;
1315 	prio = info->key.tos;
1316 	fl6.flowlabel = ip6_make_flowinfo(prio, info->key.label);
1317 
1318 	dst = ipv6_stub->ipv6_dst_lookup_flow(net, sock->sk, &fl6,
1319 					      NULL);
1320 	if (IS_ERR(dst)) {
1321 		netdev_dbg(dev, "no route to %pI6\n", &fl6.daddr);
1322 		return ERR_PTR(-ENETUNREACH);
1323 	}
1324 	if (dst->dev == dev) { /* is this necessary? */
1325 		netdev_dbg(dev, "circular route to %pI6\n", &fl6.daddr);
1326 		dst_release(dst);
1327 		return ERR_PTR(-ELOOP);
1328 	}
1329 #ifdef CONFIG_DST_CACHE
1330 	if (use_cache)
1331 		dst_cache_set_ip6(dst_cache, dst, &fl6.saddr);
1332 #endif
1333 	*saddr = fl6.saddr;
1334 	return dst;
1335 }
1336 EXPORT_SYMBOL_GPL(ip6_dst_lookup_tunnel);
1337 
1338 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1339 					       gfp_t gfp)
1340 {
1341 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1342 }
1343 
1344 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1345 						gfp_t gfp)
1346 {
1347 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1348 }
1349 
1350 static void ip6_append_data_mtu(unsigned int *mtu,
1351 				int *maxfraglen,
1352 				unsigned int fragheaderlen,
1353 				struct sk_buff *skb,
1354 				struct rt6_info *rt,
1355 				unsigned int orig_mtu)
1356 {
1357 	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1358 		if (!skb) {
1359 			/* first fragment, reserve header_len */
1360 			*mtu = orig_mtu - rt->dst.header_len;
1361 
1362 		} else {
1363 			/*
1364 			 * this fragment is not first, the headers
1365 			 * space is regarded as data space.
1366 			 */
1367 			*mtu = orig_mtu;
1368 		}
1369 		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
1370 			      + fragheaderlen - sizeof(struct frag_hdr);
1371 	}
1372 }
1373 
1374 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1375 			  struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1376 			  struct rt6_info *rt, struct flowi6 *fl6)
1377 {
1378 	struct ipv6_pinfo *np = inet6_sk(sk);
1379 	unsigned int mtu;
1380 	struct ipv6_txoptions *opt = ipc6->opt;
1381 
1382 	/*
1383 	 * setup for corking
1384 	 */
1385 	if (opt) {
1386 		if (WARN_ON(v6_cork->opt))
1387 			return -EINVAL;
1388 
1389 		v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
1390 		if (unlikely(!v6_cork->opt))
1391 			return -ENOBUFS;
1392 
1393 		v6_cork->opt->tot_len = sizeof(*opt);
1394 		v6_cork->opt->opt_flen = opt->opt_flen;
1395 		v6_cork->opt->opt_nflen = opt->opt_nflen;
1396 
1397 		v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1398 						    sk->sk_allocation);
1399 		if (opt->dst0opt && !v6_cork->opt->dst0opt)
1400 			return -ENOBUFS;
1401 
1402 		v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1403 						    sk->sk_allocation);
1404 		if (opt->dst1opt && !v6_cork->opt->dst1opt)
1405 			return -ENOBUFS;
1406 
1407 		v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
1408 						   sk->sk_allocation);
1409 		if (opt->hopopt && !v6_cork->opt->hopopt)
1410 			return -ENOBUFS;
1411 
1412 		v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1413 						    sk->sk_allocation);
1414 		if (opt->srcrt && !v6_cork->opt->srcrt)
1415 			return -ENOBUFS;
1416 
1417 		/* need source address above miyazawa */
1418 	}
1419 	dst_hold(&rt->dst);
1420 	cork->base.dst = &rt->dst;
1421 	cork->fl.u.ip6 = *fl6;
1422 	v6_cork->hop_limit = ipc6->hlimit;
1423 	v6_cork->tclass = ipc6->tclass;
1424 	if (rt->dst.flags & DST_XFRM_TUNNEL)
1425 		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1426 		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
1427 	else
1428 		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1429 			READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
1430 	if (np->frag_size < mtu) {
1431 		if (np->frag_size)
1432 			mtu = np->frag_size;
1433 	}
1434 	cork->base.fragsize = mtu;
1435 	cork->base.gso_size = ipc6->gso_size;
1436 	cork->base.tx_flags = 0;
1437 	cork->base.mark = ipc6->sockc.mark;
1438 	sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);
1439 
1440 	if (dst_allfrag(xfrm_dst_path(&rt->dst)))
1441 		cork->base.flags |= IPCORK_ALLFRAG;
1442 	cork->base.length = 0;
1443 
1444 	cork->base.transmit_time = ipc6->sockc.transmit_time;
1445 
1446 	return 0;
1447 }
1448 
1449 static int __ip6_append_data(struct sock *sk,
1450 			     struct flowi6 *fl6,
1451 			     struct sk_buff_head *queue,
1452 			     struct inet_cork *cork,
1453 			     struct inet6_cork *v6_cork,
1454 			     struct page_frag *pfrag,
1455 			     int getfrag(void *from, char *to, int offset,
1456 					 int len, int odd, struct sk_buff *skb),
1457 			     void *from, int length, int transhdrlen,
1458 			     unsigned int flags, struct ipcm6_cookie *ipc6)
1459 {
1460 	struct sk_buff *skb, *skb_prev = NULL;
1461 	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
1462 	struct ubuf_info *uarg = NULL;
1463 	int exthdrlen = 0;
1464 	int dst_exthdrlen = 0;
1465 	int hh_len;
1466 	int copy;
1467 	int err;
1468 	int offset = 0;
1469 	u32 tskey = 0;
1470 	struct rt6_info *rt = (struct rt6_info *)cork->dst;
1471 	struct ipv6_txoptions *opt = v6_cork->opt;
1472 	int csummode = CHECKSUM_NONE;
1473 	unsigned int maxnonfragsize, headersize;
1474 	unsigned int wmem_alloc_delta = 0;
1475 	bool paged, extra_uref = false;
1476 
1477 	skb = skb_peek_tail(queue);
1478 	if (!skb) {
1479 		exthdrlen = opt ? opt->opt_flen : 0;
1480 		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1481 	}
1482 
1483 	paged = !!cork->gso_size;
1484 	mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
1485 	orig_mtu = mtu;
1486 
1487 	if (cork->tx_flags & SKBTX_ANY_TSTAMP &&
1488 	    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1489 		tskey = atomic_inc_return(&sk->sk_tskey) - 1;
1490 
1491 	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1492 
1493 	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1494 			(opt ? opt->opt_nflen : 0);
1495 
1496 	headersize = sizeof(struct ipv6hdr) +
1497 		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1498 		     (dst_allfrag(&rt->dst) ?
1499 		      sizeof(struct frag_hdr) : 0) +
1500 		     rt->rt6i_nfheader_len;
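	/* fragheaderlen counts what every fragment must replicate (IPv6
	 * header, per-fragment extension headers, tunnel encapsulation),
	 * while headersize is the full first-fragment header chain,
	 * including destination options and, for dst_allfrag() routes,
	 * the fragment header itself.
	 */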
1501 
1502 	if (mtu <= fragheaderlen ||
1503 	    ((mtu - fragheaderlen) & ~7) + fragheaderlen <= sizeof(struct frag_hdr))
1504 		goto emsgsize;
1505 
1506 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1507 		     sizeof(struct frag_hdr);
1508 
1509 	/* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
1510 	 * the first fragment
1511 	 */
1512 	if (headersize + transhdrlen > mtu)
1513 		goto emsgsize;
1514 
1515 	if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1516 	    (sk->sk_protocol == IPPROTO_UDP ||
1517 	     sk->sk_protocol == IPPROTO_RAW)) {
1518 		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1519 				sizeof(struct ipv6hdr));
1520 		goto emsgsize;
1521 	}
1522 
1523 	if (ip6_sk_ignore_df(sk))
1524 		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1525 	else
1526 		maxnonfragsize = mtu;
1527 
1528 	if (cork->length + length > maxnonfragsize - headersize) {
1529 emsgsize:
1530 		pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
1531 		ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
1532 		return -EMSGSIZE;
1533 	}
1534 
1535 	/* CHECKSUM_PARTIAL only with no extension headers and when
1536 	 * we are not going to fragment
1537 	 */
1538 	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1539 	    headersize == sizeof(struct ipv6hdr) &&
1540 	    length <= mtu - headersize &&
1541 	    (!(flags & MSG_MORE) || cork->gso_size) &&
1542 	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1543 		csummode = CHECKSUM_PARTIAL;
1544 
1545 	if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) {
1546 		uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb));
1547 		if (!uarg)
1548 			return -ENOBUFS;
1549 		extra_uref = !skb_zcopy(skb);	/* only ref on new uarg */
1550 		if (rt->dst.dev->features & NETIF_F_SG &&
1551 		    csummode == CHECKSUM_PARTIAL) {
1552 			paged = true;
1553 		} else {
1554 			uarg->zerocopy = 0;
1555 			skb_zcopy_set(skb, uarg, &extra_uref);
1556 		}
1557 	}
1558 
1559 	/*
1560 	 * Let's try using as much space as possible.
1561 	 * Use MTU if total length of the message fits into the MTU.
1562 	 * Otherwise, we need to reserve fragment header and
1563 	 * fragment alignment (= 8-15 octets, in total).
1564 	 *
1565 	 * Note that we may need to "move" the data from the tail
1566 	 * of the buffer to the new fragment when we split
1567 	 * the message.
1568 	 *
1569 	 * FIXME: It may be fragmented into multiple chunks
1570 	 *        at once if non-fragmentable extension headers
1571 	 *        are too large.
1572 	 * --yoshfuji
1573 	 */
1574 
1575 	cork->length += length;
1576 	if (!skb)
1577 		goto alloc_new_skb;
1578 
1579 	while (length > 0) {
1580 		/* Check if the remaining data fits into current packet. */
1581 		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1582 		if (copy < length)
1583 			copy = maxfraglen - skb->len;
1584 
1585 		if (copy <= 0) {
1586 			char *data;
1587 			unsigned int datalen;
1588 			unsigned int fraglen;
1589 			unsigned int fraggap;
1590 			unsigned int alloclen, alloc_extra;
1591 			unsigned int pagedlen;
1592 alloc_new_skb:
1593 			/* There's no room in the current skb */
1594 			if (skb)
1595 				fraggap = skb->len - maxfraglen;
1596 			else
1597 				fraggap = 0;
1598 			/* update mtu and maxfraglen if necessary */
1599 			if (!skb || !skb_prev)
1600 				ip6_append_data_mtu(&mtu, &maxfraglen,
1601 						    fragheaderlen, skb, rt,
1602 						    orig_mtu);
1603 
1604 			skb_prev = skb;
1605 
1606 			/*
1607 			 * If remaining data exceeds the mtu,
1608 			 * we know we need more fragment(s).
1609 			 */
1610 			datalen = length + fraggap;
1611 
1612 			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1613 				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1614 			fraglen = datalen + fragheaderlen;
1615 			pagedlen = 0;
1616 
1617 			alloc_extra = hh_len;
1618 			alloc_extra += dst_exthdrlen;
1619 			alloc_extra += rt->dst.trailer_len;
1620 
1621 			/* We just reserve space for fragment header.
1622 			 * Note: this may be overallocation if the message
1623 			 * (without MSG_MORE) fits into the MTU.
1624 			 */
1625 			alloc_extra += sizeof(struct frag_hdr);
1626 
1627 			if ((flags & MSG_MORE) &&
1628 			    !(rt->dst.dev->features&NETIF_F_SG))
1629 				alloclen = mtu;
1630 			else if (!paged &&
1631 				 (fraglen + alloc_extra < SKB_MAX_ALLOC ||
1632 				  !(rt->dst.dev->features & NETIF_F_SG)))
1633 				alloclen = fraglen;
1634 			else {
1635 				alloclen = min_t(int, fraglen, MAX_HEADER);
1636 				pagedlen = fraglen - alloclen;
1637 			}
1638 			alloclen += alloc_extra;
1639 
1640 			if (datalen != length + fraggap) {
1641 				/*
1642 				 * this is not the last fragment, the trailer
1643 				 * space is regarded as data space.
1644 				 */
1645 				datalen += rt->dst.trailer_len;
1646 			}
1647 
1648 			fraglen = datalen + fragheaderlen;
1649 
1650 			copy = datalen - transhdrlen - fraggap - pagedlen;
1651 			if (copy < 0) {
1652 				err = -EINVAL;
1653 				goto error;
1654 			}
1655 			if (transhdrlen) {
1656 				skb = sock_alloc_send_skb(sk, alloclen,
1657 						(flags & MSG_DONTWAIT), &err);
1658 			} else {
1659 				skb = NULL;
1660 				if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
1661 				    2 * sk->sk_sndbuf)
1662 					skb = alloc_skb(alloclen,
1663 							sk->sk_allocation);
1664 				if (unlikely(!skb))
1665 					err = -ENOBUFS;
1666 			}
1667 			if (!skb)
1668 				goto error;
1669 			/*
1670 			 *	Fill in the control structures
1671 			 */
1672 			skb->protocol = htons(ETH_P_IPV6);
1673 			skb->ip_summed = csummode;
1674 			skb->csum = 0;
1675 			/* reserve for fragmentation and ipsec header */
1676 			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1677 				    dst_exthdrlen);
1678 
1679 			/*
1680 			 *	Find where to start putting bytes
1681 			 */
1682 			data = skb_put(skb, fraglen - pagedlen);
1683 			skb_set_network_header(skb, exthdrlen);
1684 			data += fragheaderlen;
1685 			skb->transport_header = (skb->network_header +
1686 						 fragheaderlen);
1687 			if (fraggap) {
1688 				skb->csum = skb_copy_and_csum_bits(
1689 					skb_prev, maxfraglen,
1690 					data + transhdrlen, fraggap);
1691 				skb_prev->csum = csum_sub(skb_prev->csum,
1692 							  skb->csum);
1693 				data += fraggap;
1694 				pskb_trim_unique(skb_prev, maxfraglen);
1695 			}
1696 			if (copy > 0 &&
1697 			    getfrag(from, data + transhdrlen, offset,
1698 				    copy, fraggap, skb) < 0) {
1699 				err = -EFAULT;
1700 				kfree_skb(skb);
1701 				goto error;
1702 			}
1703 
1704 			offset += copy;
1705 			length -= copy + transhdrlen;
1706 			transhdrlen = 0;
1707 			exthdrlen = 0;
1708 			dst_exthdrlen = 0;
1709 
1710 			/* Only the initial fragment is time stamped */
1711 			skb_shinfo(skb)->tx_flags = cork->tx_flags;
1712 			cork->tx_flags = 0;
1713 			skb_shinfo(skb)->tskey = tskey;
1714 			tskey = 0;
1715 			skb_zcopy_set(skb, uarg, &extra_uref);
1716 
1717 			if ((flags & MSG_CONFIRM) && !skb_prev)
1718 				skb_set_dst_pending_confirm(skb, 1);
1719 
1720 			/*
1721 			 * Put the packet on the pending queue
1722 			 */
1723 			if (!skb->destructor) {
1724 				skb->destructor = sock_wfree;
1725 				skb->sk = sk;
1726 				wmem_alloc_delta += skb->truesize;
1727 			}
1728 			__skb_queue_tail(queue, skb);
1729 			continue;
1730 		}
1731 
1732 		if (copy > length)
1733 			copy = length;
1734 
1735 		if (!(rt->dst.dev->features&NETIF_F_SG) &&
1736 		    skb_tailroom(skb) >= copy) {
1737 			unsigned int off;
1738 
1739 			off = skb->len;
1740 			if (getfrag(from, skb_put(skb, copy),
1741 						offset, copy, off, skb) < 0) {
1742 				__skb_trim(skb, off);
1743 				err = -EFAULT;
1744 				goto error;
1745 			}
1746 		} else if (!uarg || !uarg->zerocopy) {
1747 			int i = skb_shinfo(skb)->nr_frags;
1748 
1749 			err = -ENOMEM;
1750 			if (!sk_page_frag_refill(sk, pfrag))
1751 				goto error;
1752 
1753 			if (!skb_can_coalesce(skb, i, pfrag->page,
1754 					      pfrag->offset)) {
1755 				err = -EMSGSIZE;
1756 				if (i == MAX_SKB_FRAGS)
1757 					goto error;
1758 
1759 				__skb_fill_page_desc(skb, i, pfrag->page,
1760 						     pfrag->offset, 0);
1761 				skb_shinfo(skb)->nr_frags = ++i;
1762 				get_page(pfrag->page);
1763 			}
1764 			copy = min_t(int, copy, pfrag->size - pfrag->offset);
1765 			if (getfrag(from,
1766 				    page_address(pfrag->page) + pfrag->offset,
1767 				    offset, copy, skb->len, skb) < 0)
1768 				goto error_efault;
1769 
1770 			pfrag->offset += copy;
1771 			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1772 			skb->len += copy;
1773 			skb->data_len += copy;
1774 			skb->truesize += copy;
1775 			wmem_alloc_delta += copy;
1776 		} else {
1777 			err = skb_zerocopy_iter_dgram(skb, from, copy);
1778 			if (err < 0)
1779 				goto error;
1780 		}
1781 		offset += copy;
1782 		length -= copy;
1783 	}
1784 
1785 	if (wmem_alloc_delta)
1786 		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1787 	return 0;
1788 
1789 error_efault:
1790 	err = -EFAULT;
1791 error:
1792 	net_zcopy_put_abort(uarg, extra_uref);
1793 	cork->length -= length;
1794 	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1795 	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1796 	return err;
1797 }
1798 
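/*
 * ip6_append_data - queue data for transmission on a corked socket
 *
 * Appends up to @length bytes, pulled in via @getfrag, onto
 * sk->sk_write_queue, growing packets up to the path MTU.  The first
 * call on an empty queue sets up the cork from @ipc6/@rt/@fl6 and
 * reserves @transhdrlen bytes for the transport header; later calls
 * reuse the corked flow.  The queue is finally turned into packets by
 * ip6_push_pending_frames() or discarded by ip6_flush_pending_frames().
 */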
1799 int ip6_append_data(struct sock *sk,
1800 		    int getfrag(void *from, char *to, int offset, int len,
1801 				int odd, struct sk_buff *skb),
1802 		    void *from, int length, int transhdrlen,
1803 		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1804 		    struct rt6_info *rt, unsigned int flags)
1805 {
1806 	struct inet_sock *inet = inet_sk(sk);
1807 	struct ipv6_pinfo *np = inet6_sk(sk);
1808 	int exthdrlen;
1809 	int err;
1810 
1811 	if (flags & MSG_PROBE)
1812 		return 0;
1813 	if (skb_queue_empty(&sk->sk_write_queue)) {
1814 		/*
1815 		 * Set up for corking.
1816 		 */
1817 		err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1818 				     ipc6, rt, fl6);
1819 		if (err)
1820 			return err;
1821 
1822 		exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1823 		length += exthdrlen;
1824 		transhdrlen += exthdrlen;
1825 	} else {
1826 		fl6 = &inet->cork.fl.u.ip6;
1827 		transhdrlen = 0;
1828 	}
1829 
1830 	return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1831 				 &np->cork, sk_page_frag(sk), getfrag,
1832 				 from, length, transhdrlen, flags, ipc6);
1833 }
1834 EXPORT_SYMBOL_GPL(ip6_append_data);
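/*
 * A minimal usage sketch of the corked-send API, modelled on datagram
 * senders such as udpv6_sendmsg().  example_send6() and its parameters
 * are hypothetical and not part of this file; they only illustrate the
 * append/push/flush contract under the socket lock.
 */
static int example_send6(struct sock *sk,
			 int getfrag(void *from, char *to, int offset,
				     int len, int odd, struct sk_buff *skb),
			 void *from, int length, struct ipcm6_cookie *ipc6,
			 struct flowi6 *fl6, struct rt6_info *rt)
{
	int err;

	lock_sock(sk);
	err = ip6_append_data(sk, getfrag, from, length, 0 /* transhdrlen */,
			      ipc6, fl6, rt, 0 /* flags */);
	if (err)
		ip6_flush_pending_frames(sk);	/* drop what was queued */
	else
		err = ip6_push_pending_frames(sk);	/* build and send */
	release_sock(sk);
	return err;
}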
1835 
1836 static void ip6_cork_release(struct inet_cork_full *cork,
1837 			     struct inet6_cork *v6_cork)
1838 {
1839 	if (v6_cork->opt) {
1840 		kfree(v6_cork->opt->dst0opt);
1841 		kfree(v6_cork->opt->dst1opt);
1842 		kfree(v6_cork->opt->hopopt);
1843 		kfree(v6_cork->opt->srcrt);
1844 		kfree(v6_cork->opt);
1845 		v6_cork->opt = NULL;
1846 	}
1847 
1848 	if (cork->base.dst) {
1849 		dst_release(cork->base.dst);
1850 		cork->base.dst = NULL;
1851 		cork->base.flags &= ~IPCORK_ALLFRAG;
1852 	}
1853 	memset(&cork->fl, 0, sizeof(cork->fl));
1854 }
1855 
1856 struct sk_buff *__ip6_make_skb(struct sock *sk,
1857 			       struct sk_buff_head *queue,
1858 			       struct inet_cork_full *cork,
1859 			       struct inet6_cork *v6_cork)
1860 {
1861 	struct sk_buff *skb, *tmp_skb;
1862 	struct sk_buff **tail_skb;
1863 	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1864 	struct ipv6_pinfo *np = inet6_sk(sk);
1865 	struct net *net = sock_net(sk);
1866 	struct ipv6hdr *hdr;
1867 	struct ipv6_txoptions *opt = v6_cork->opt;
1868 	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1869 	struct flowi6 *fl6 = &cork->fl.u.ip6;
1870 	unsigned char proto = fl6->flowi6_proto;
1871 
1872 	skb = __skb_dequeue(queue);
1873 	if (!skb)
1874 		goto out;
1875 	tail_skb = &(skb_shinfo(skb)->frag_list);
1876 
1877 	/* Move skb->data back from the extension header to the IP header. */
1878 	if (skb->data < skb_network_header(skb))
1879 		__skb_pull(skb, skb_network_offset(skb));
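	/* Chain the remaining queued skbs onto the first one's frag_list;
	 * ip6_fragment() can later split the chain back into wire
	 * fragments along these boundaries.
	 */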
1880 	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1881 		__skb_pull(tmp_skb, skb_network_header_len(skb));
1882 		*tail_skb = tmp_skb;
1883 		tail_skb = &(tmp_skb->next);
1884 		skb->len += tmp_skb->len;
1885 		skb->data_len += tmp_skb->len;
1886 		skb->truesize += tmp_skb->truesize;
1887 		tmp_skb->destructor = NULL;
1888 		tmp_skb->sk = NULL;
1889 	}
1890 
1891 	/* Allow local fragmentation. */
1892 	skb->ignore_df = ip6_sk_ignore_df(sk);
1893 
1894 	*final_dst = fl6->daddr;
1895 	__skb_pull(skb, skb_network_header_len(skb));
1896 	if (opt && opt->opt_flen)
1897 		ipv6_push_frag_opts(skb, opt, &proto);
1898 	if (opt && opt->opt_nflen)
1899 		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
1900 
1901 	skb_push(skb, sizeof(struct ipv6hdr));
1902 	skb_reset_network_header(skb);
1903 	hdr = ipv6_hdr(skb);
1904 
1905 	ip6_flow_hdr(hdr, v6_cork->tclass,
1906 		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
1907 					ip6_autoflowlabel(net, np), fl6));
1908 	hdr->hop_limit = v6_cork->hop_limit;
1909 	hdr->nexthdr = proto;
1910 	hdr->saddr = fl6->saddr;
1911 	hdr->daddr = *final_dst;
1912 
1913 	skb->priority = sk->sk_priority;
1914 	skb->mark = cork->base.mark;
1915 
1916 	skb->tstamp = cork->base.transmit_time;
1917 
1918 	skb_dst_set(skb, dst_clone(&rt->dst));
1919 	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1920 	if (proto == IPPROTO_ICMPV6) {
1921 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1922 		u8 icmp6_type;
1923 
1924 		if (sk->sk_socket->type == SOCK_RAW && !inet_sk(sk)->hdrincl)
1925 			icmp6_type = fl6->fl6_icmp_type;
1926 		else
1927 			icmp6_type = icmp6_hdr(skb)->icmp6_type;
1928 		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_type);
1929 		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1930 	}
1931 
1932 	ip6_cork_release(cork, v6_cork);
1933 out:
1934 	return skb;
1935 }
1936 
1937 int ip6_send_skb(struct sk_buff *skb)
1938 {
1939 	struct net *net = sock_net(skb->sk);
1940 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1941 	int err;
1942 
1943 	err = ip6_local_out(net, skb->sk, skb);
1944 	if (err) {
1945 		if (err > 0)
1946 			err = net_xmit_errno(err);
1947 		if (err)
1948 			IP6_INC_STATS(net, rt->rt6i_idev,
1949 				      IPSTATS_MIB_OUTDISCARDS);
1950 	}
1951 
1952 	return err;
1953 }
1954 
1955 int ip6_push_pending_frames(struct sock *sk)
1956 {
1957 	struct sk_buff *skb;
1958 
1959 	skb = ip6_finish_skb(sk);
1960 	if (!skb)
1961 		return 0;
1962 
1963 	return ip6_send_skb(skb);
1964 }
1965 EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1966 
1967 static void __ip6_flush_pending_frames(struct sock *sk,
1968 				       struct sk_buff_head *queue,
1969 				       struct inet_cork_full *cork,
1970 				       struct inet6_cork *v6_cork)
1971 {
1972 	struct sk_buff *skb;
1973 
1974 	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1975 		if (skb_dst(skb))
1976 			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1977 				      IPSTATS_MIB_OUTDISCARDS);
1978 		kfree_skb(skb);
1979 	}
1980 
1981 	ip6_cork_release(cork, v6_cork);
1982 }
1983 
1984 void ip6_flush_pending_frames(struct sock *sk)
1985 {
1986 	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1987 				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1988 }
1989 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1990 
1991 struct sk_buff *ip6_make_skb(struct sock *sk,
1992 			     int getfrag(void *from, char *to, int offset,
1993 					 int len, int odd, struct sk_buff *skb),
1994 			     void *from, int length, int transhdrlen,
1995 			     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1996 			     struct rt6_info *rt, unsigned int flags,
1997 			     struct inet_cork_full *cork)
1998 {
1999 	struct inet6_cork v6_cork;
2000 	struct sk_buff_head queue;
2001 	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
2002 	int err;
2003 
2004 	if (flags & MSG_PROBE)
2005 		return NULL;
2006 
2007 	__skb_queue_head_init(&queue);
2008 
2009 	cork->base.flags = 0;
2010 	cork->base.addr = 0;
2011 	cork->base.opt = NULL;
2012 	cork->base.dst = NULL;
2013 	v6_cork.opt = NULL;
2014 	err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6);
2015 	if (err) {
2016 		ip6_cork_release(cork, &v6_cork);
2017 		return ERR_PTR(err);
2018 	}
2019 	if (ipc6->dontfrag < 0)
2020 		ipc6->dontfrag = inet6_sk(sk)->dontfrag;
2021 
2022 	err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork,
2023 				&current->task_frag, getfrag, from,
2024 				length + exthdrlen, transhdrlen + exthdrlen,
2025 				flags, ipc6);
2026 	if (err) {
2027 		__ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
2028 		return ERR_PTR(err);
2029 	}
2030 
2031 	return __ip6_make_skb(sk, &queue, cork, &v6_cork);
2032 }
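/*
 * Unlike ip6_append_data(), which corks packets on sk->sk_write_queue
 * across calls, ip6_make_skb() runs the same append path once over a
 * private queue and cork, and hands back a single finished skb for the
 * caller to transmit (e.g. via ip6_send_skb()).
 */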
2033