1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * IPv6 output functions
4 * Linux INET6 implementation
5 *
6 * Authors:
7 * Pedro Roque <roque@di.fc.ul.pt>
8 *
9 * Based on linux/net/ipv4/ip_output.c
10 *
11 * Changes:
12 * A.N.Kuznetsov : arithmetic in fragmentation.
13 * extension headers are implemented.
14 * route changes now work.
15 * ip6_forward does not confuse sniffers.
16 * etc.
17 *
18 * H. von Brand : Added missing #include <linux/string.h>
19 * Imran Patel : frag id should be in NBO
20 * Kazunori MIYAZAWA @USAGI
21 * : add ip6_append_data and related functions
22 * for datagram xmit
23 */
24
25 #include <linux/errno.h>
26 #include <linux/kernel.h>
27 #include <linux/string.h>
28 #include <linux/socket.h>
29 #include <linux/net.h>
30 #include <linux/netdevice.h>
31 #include <linux/if_arp.h>
32 #include <linux/in6.h>
33 #include <linux/tcp.h>
34 #include <linux/route.h>
35 #include <linux/module.h>
36 #include <linux/slab.h>
37
38 #include <linux/bpf-cgroup.h>
39 #include <linux/netfilter.h>
40 #include <linux/netfilter_ipv6.h>
41
42 #include <net/sock.h>
43 #include <net/snmp.h>
44
45 #include <net/ipv6.h>
46 #include <net/ndisc.h>
47 #include <net/protocol.h>
48 #include <net/ip6_route.h>
49 #include <net/addrconf.h>
50 #include <net/rawv6.h>
51 #include <net/icmp.h>
52 #include <net/xfrm.h>
53 #include <net/checksum.h>
54 #include <linux/mroute6.h>
55 #include <net/l3mdev.h>
56 #include <net/lwtunnel.h>
57 #include <net/ip_tunnels.h>
58
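/* Final transmit step: make sure there is enough headroom for the
 * link-layer header, loop back multicast copies that local listeners
 * need, then resolve (or create) the neighbour entry for the route's
 * nexthop and hand the packet to neigh_output().
 */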
59 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
60 {
61 struct dst_entry *dst = skb_dst(skb);
62 struct net_device *dev = dst->dev;
63 unsigned int hh_len = LL_RESERVED_SPACE(dev);
64 int delta = hh_len - skb_headroom(skb);
65 const struct in6_addr *nexthop;
66 struct neighbour *neigh;
67 int ret;
68
69 /* Be paranoid, rather than too clever. */
70 if (unlikely(delta > 0) && dev->header_ops) {
71 /* pskb_expand_head() might crash, if skb is shared */
72 if (skb_shared(skb)) {
73 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
74
75 if (likely(nskb)) {
76 if (skb->sk)
77 skb_set_owner_w(nskb, skb->sk);
78 consume_skb(skb);
79 } else {
80 kfree_skb(skb);
81 }
82 skb = nskb;
83 }
84 /* Make sure idev stays alive */
85 rcu_read_lock();
86 if (skb &&
87 pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {
88 kfree_skb(skb);
89 skb = NULL;
90 }
91 if (!skb) {
92 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
93 rcu_read_unlock();
94 return -ENOMEM;
95 }
96 rcu_read_unlock();
97 }
98
99 if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
100 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
101
102 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
103 ((mroute6_is_socket(net, skb) &&
104 !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
105 ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
106 &ipv6_hdr(skb)->saddr))) {
107 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
108
109 /* Do not check for IFF_ALLMULTI; multicast routing
110 is not supported in any case.
111 */
112 if (newskb)
113 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
114 net, sk, newskb, NULL, newskb->dev,
115 dev_loopback_xmit);
116
117 if (ipv6_hdr(skb)->hop_limit == 0) {
118 IP6_INC_STATS(net, idev,
119 IPSTATS_MIB_OUTDISCARDS);
120 kfree_skb(skb);
121 return 0;
122 }
123 }
124
125 IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
126
127 if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
128 IPV6_ADDR_SCOPE_NODELOCAL &&
129 !(dev->flags & IFF_LOOPBACK)) {
130 kfree_skb(skb);
131 return 0;
132 }
133 }
134
135 if (lwtunnel_xmit_redirect(dst->lwtstate)) {
136 int res = lwtunnel_xmit(skb);
137
138 if (res != LWTUNNEL_XMIT_CONTINUE)
139 return res;
140 }
141
142 rcu_read_lock_bh();
143 nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
144 neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
145 if (unlikely(!neigh))
146 neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
147 if (!IS_ERR(neigh)) {
148 sock_confirm_neigh(skb, neigh);
149 ret = neigh_output(neigh, skb, false);
150 rcu_read_unlock_bh();
151 return ret;
152 }
153 rcu_read_unlock_bh();
154
155 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
156 kfree_skb(skb);
157 return -EINVAL;
158 }
159
160 static int
161 ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
162 struct sk_buff *skb, unsigned int mtu)
163 {
164 struct sk_buff *segs, *nskb;
165 netdev_features_t features;
166 int ret = 0;
167
168 /* Please see corresponding comment in ip_finish_output_gso
169 * describing the cases where GSO segment length exceeds the
170 * egress MTU.
171 */
172 features = netif_skb_features(skb);
173 segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
174 if (IS_ERR_OR_NULL(segs)) {
175 kfree_skb(skb);
176 return -ENOMEM;
177 }
178
179 consume_skb(skb);
180
181 skb_list_walk_safe(segs, segs, nskb) {
182 int err;
183
184 skb_mark_not_on_list(segs);
185 /* Last GSO segment can be smaller than gso_size (and MTU).
186 * Adding a fragment header would produce an "atomic fragment",
187 * which is considered harmful (RFC-8021). Avoid that.
188 */
189 err = segs->len > mtu ?
190 ip6_fragment(net, sk, segs, ip6_finish_output2) :
191 ip6_finish_output2(net, sk, segs);
192 if (err && ret == 0)
193 ret = err;
194 }
195
196 return ret;
197 }
198
199 static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
200 {
201 unsigned int mtu;
202
203 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
204 /* Policy lookup after SNAT yielded a new policy */
205 if (skb_dst(skb)->xfrm) {
206 IP6CB(skb)->flags |= IP6SKB_REROUTED;
207 return dst_output(net, sk, skb);
208 }
209 #endif
210
211 mtu = ip6_skb_dst_mtu(skb);
212 if (skb_is_gso(skb) && !skb_gso_validate_network_len(skb, mtu))
213 return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);
214
215 if ((skb->len > mtu && !skb_is_gso(skb)) ||
216 dst_allfrag(skb_dst(skb)) ||
217 (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
218 return ip6_fragment(net, sk, skb, ip6_finish_output2);
219 else
220 return ip6_finish_output2(net, sk, skb);
221 }
222
223 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
224 {
225 int ret;
226
227 ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
228 switch (ret) {
229 case NET_XMIT_SUCCESS:
230 return __ip6_finish_output(net, sk, skb);
231 case NET_XMIT_CN:
232 return __ip6_finish_output(net, sk, skb) ? : ret;
233 default:
234 kfree_skb(skb);
235 return ret;
236 }
237 }
238
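/* IPv6 dst output routine: attach the outgoing device, drop the packet
 * if IPv6 is disabled on it, and run the NF_INET_POST_ROUTING hook
 * (skipped for packets already rerouted by xfrm) before
 * ip6_finish_output() hands the packet on for fragmentation and
 * transmission.
 */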
239 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
240 {
241 struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev;
242 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
243
244 skb->protocol = htons(ETH_P_IPV6);
245 skb->dev = dev;
246
247 if (unlikely(!idev || (idev->cnf.disable_ipv6))) {
248 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
249 kfree_skb(skb);
250 return 0;
251 }
252
253 return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
254 net, sk, skb, indev, dev,
255 ip6_finish_output,
256 !(IP6CB(skb)->flags & IP6SKB_REROUTED));
257 }
258
259 bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
260 {
261 if (!np->autoflowlabel_set)
262 return ip6_default_np_autolabel(net);
263 else
264 return np->autoflowlabel;
265 }
266
267 /*
268 * xmit an sk_buff (used by TCP, SCTP and DCCP)
269 * Note : socket lock is not held for SYNACK packets, but might be modified
270 * by calls to skb_set_owner_w() and ipv6_local_error(),
271 * which are using proper atomic operations or spinlocks.
272 */
273 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
274 __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
275 {
276 struct net *net = sock_net(sk);
277 const struct ipv6_pinfo *np = inet6_sk(sk);
278 struct in6_addr *first_hop = &fl6->daddr;
279 struct dst_entry *dst = skb_dst(skb);
280 unsigned int head_room;
281 struct ipv6hdr *hdr;
282 u8 proto = fl6->flowi6_proto;
283 int seg_len = skb->len;
284 int hlimit = -1;
285 u32 mtu;
286
287 head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
288 if (opt)
289 head_room += opt->opt_nflen + opt->opt_flen;
290
291 if (unlikely(skb_headroom(skb) < head_room)) {
292 struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
293 if (!skb2) {
294 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
295 IPSTATS_MIB_OUTDISCARDS);
296 kfree_skb(skb);
297 return -ENOBUFS;
298 }
299 if (skb->sk)
300 skb_set_owner_w(skb2, skb->sk);
301 consume_skb(skb);
302 skb = skb2;
303 }
304
305 if (opt) {
306 seg_len += opt->opt_nflen + opt->opt_flen;
307
308 if (opt->opt_flen)
309 ipv6_push_frag_opts(skb, opt, &proto);
310
311 if (opt->opt_nflen)
312 ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
313 &fl6->saddr);
314 }
315
316 skb_push(skb, sizeof(struct ipv6hdr));
317 skb_reset_network_header(skb);
318 hdr = ipv6_hdr(skb);
319
320 /*
321 * Fill in the IPv6 header
322 */
323 if (np)
324 hlimit = np->hop_limit;
325 if (hlimit < 0)
326 hlimit = ip6_dst_hoplimit(dst);
327
328 ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
329 ip6_autoflowlabel(net, np), fl6));
330
331 hdr->payload_len = htons(seg_len);
332 hdr->nexthdr = proto;
333 hdr->hop_limit = hlimit;
334
335 hdr->saddr = fl6->saddr;
336 hdr->daddr = *first_hop;
337
338 skb->protocol = htons(ETH_P_IPV6);
339 skb->priority = priority;
340 skb->mark = mark;
341
342 mtu = dst_mtu(dst);
343 if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
344 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
345 IPSTATS_MIB_OUT, skb->len);
346
347 /* if egress device is enslaved to an L3 master device pass the
348 * skb to its handler for processing
349 */
350 skb = l3mdev_ip6_out((struct sock *)sk, skb);
351 if (unlikely(!skb))
352 return 0;
353
354 /* hooks should never assume socket lock is held.
355 * we promote our socket to non const
356 */
357 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
358 net, (struct sock *)sk, skb, NULL, dst->dev,
359 dst_output);
360 }
361
362 skb->dev = dst->dev;
363 /* ipv6_local_error() does not require socket lock,
364 * we promote our socket to non const
365 */
366 ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
367
368 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
369 kfree_skb(skb);
370 return -EMSGSIZE;
371 }
372 EXPORT_SYMBOL(ip6_xmit);
373
374 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
375 {
376 struct ip6_ra_chain *ra;
377 struct sock *last = NULL;
378
379 read_lock(&ip6_ra_lock);
380 for (ra = ip6_ra_chain; ra; ra = ra->next) {
381 struct sock *sk = ra->sk;
382 if (sk && ra->sel == sel &&
383 (!sk->sk_bound_dev_if ||
384 sk->sk_bound_dev_if == skb->dev->ifindex)) {
385 struct ipv6_pinfo *np = inet6_sk(sk);
386
387 if (np && np->rtalert_isolate &&
388 !net_eq(sock_net(sk), dev_net(skb->dev))) {
389 continue;
390 }
391 if (last) {
392 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
393 if (skb2)
394 rawv6_rcv(last, skb2);
395 }
396 last = sk;
397 }
398 }
399
400 if (last) {
401 rawv6_rcv(last, skb);
402 read_unlock(&ip6_ra_lock);
403 return 1;
404 }
405 read_unlock(&ip6_ra_lock);
406 return 0;
407 }
408
409 static int ip6_forward_proxy_check(struct sk_buff *skb)
410 {
411 struct ipv6hdr *hdr = ipv6_hdr(skb);
412 u8 nexthdr = hdr->nexthdr;
413 __be16 frag_off;
414 int offset;
415
416 if (ipv6_ext_hdr(nexthdr)) {
417 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
418 if (offset < 0)
419 return 0;
420 } else
421 offset = sizeof(struct ipv6hdr);
422
423 if (nexthdr == IPPROTO_ICMPV6) {
424 struct icmp6hdr *icmp6;
425
426 if (!pskb_may_pull(skb, (skb_network_header(skb) +
427 offset + 1 - skb->data)))
428 return 0;
429
430 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
431
432 switch (icmp6->icmp6_type) {
433 case NDISC_ROUTER_SOLICITATION:
434 case NDISC_ROUTER_ADVERTISEMENT:
435 case NDISC_NEIGHBOUR_SOLICITATION:
436 case NDISC_NEIGHBOUR_ADVERTISEMENT:
437 case NDISC_REDIRECT:
438 /* Unicast neighbour discovery messages destined to
439 * the proxied address are passed to the input
440 * function.
441 */
442 return 1;
443 default:
444 break;
445 }
446 }
447
448 /*
449 * The proxying router can't forward traffic sent to a link-local
450 * address, so signal the sender and discard the packet. This
451 * behavior is clarified by the MIPv6 specification.
452 */
453 if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
454 dst_link_failure(skb);
455 return -1;
456 }
457
458 return 0;
459 }
460
461 static inline int ip6_forward_finish(struct net *net, struct sock *sk,
462 struct sk_buff *skb)
463 {
464 struct dst_entry *dst = skb_dst(skb);
465
466 __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
467 __IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
468
469 #ifdef CONFIG_NET_SWITCHDEV
470 if (skb->offload_l3_fwd_mark) {
471 consume_skb(skb);
472 return 0;
473 }
474 #endif
475
476 skb->tstamp = 0;
477 return dst_output(net, sk, skb);
478 }
479
480 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
481 {
482 if (skb->len <= mtu)
483 return false;
484
485 /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
486 if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
487 return true;
488
489 if (skb->ignore_df)
490 return false;
491
492 if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
493 return false;
494
495 return true;
496 }
497
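/* Forwarding path: validate that forwarding is enabled and the packet
 * is eligible (hop limit, router alert, NDISC proxy, xfrm policy),
 * emit redirects or PKT_TOOBIG errors where required, decrement the
 * hop limit and pass the packet through the NF_INET_FORWARD hook to
 * ip6_forward_finish().
 */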
498 int ip6_forward(struct sk_buff *skb)
499 {
500 struct dst_entry *dst = skb_dst(skb);
501 struct ipv6hdr *hdr = ipv6_hdr(skb);
502 struct inet6_skb_parm *opt = IP6CB(skb);
503 struct net *net = dev_net(dst->dev);
504 struct inet6_dev *idev;
505 u32 mtu;
506
507 idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
508 if (net->ipv6.devconf_all->forwarding == 0)
509 goto error;
510
511 if (skb->pkt_type != PACKET_HOST)
512 goto drop;
513
514 if (unlikely(skb->sk))
515 goto drop;
516
517 if (skb_warn_if_lro(skb))
518 goto drop;
519
520 if (!net->ipv6.devconf_all->disable_policy &&
521 (!idev || !idev->cnf.disable_policy) &&
522 !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
523 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
524 goto drop;
525 }
526
527 skb_forward_csum(skb);
528
529 /*
530 * We DO NOT make any processing on
531 * RA packets, pushing them to user level AS IS
532 * without any WARRANTY that the application will be able
533 * to interpret them. The reason is that we
534 * cannot make anything clever here.
535 *
536 * We are not end-node, so that if packet contains
537 * AH/ESP, we cannot make anything.
538 * Defragmentation would also be a mistake, RA packets
539 * cannot be fragmented, because there is no warranty
540 * that different fragments will go along one path. --ANK
541 */
542 if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
543 if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
544 return 0;
545 }
546
547 /*
548 * check and decrement ttl
549 */
550 if (hdr->hop_limit <= 1) {
551 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
552 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
553
554 kfree_skb(skb);
555 return -ETIMEDOUT;
556 }
557
558 /* XXX: idev->cnf.proxy_ndp? */
559 if (net->ipv6.devconf_all->proxy_ndp &&
560 pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
561 int proxied = ip6_forward_proxy_check(skb);
562 if (proxied > 0)
563 return ip6_input(skb);
564 else if (proxied < 0) {
565 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
566 goto drop;
567 }
568 }
569
570 if (!xfrm6_route_forward(skb)) {
571 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
572 goto drop;
573 }
574 dst = skb_dst(skb);
575
576 /* IPv6 specs say nothing about it, but it is clear that we cannot
577 send redirects to source routed frames.
578 We don't send redirects to frames decapsulated from IPsec.
579 */
580 if (IP6CB(skb)->iif == dst->dev->ifindex &&
581 opt->srcrt == 0 && !skb_sec_path(skb)) {
582 struct in6_addr *target = NULL;
583 struct inet_peer *peer;
584 struct rt6_info *rt;
585
586 /*
587 * incoming and outgoing devices are the same
588 * send a redirect.
589 */
590
591 rt = (struct rt6_info *) dst;
592 if (rt->rt6i_flags & RTF_GATEWAY)
593 target = &rt->rt6i_gateway;
594 else
595 target = &hdr->daddr;
596
597 peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
598
599 /* Limit redirects both by destination (here)
600 and by source (inside ndisc_send_redirect)
601 */
602 if (inet_peer_xrlim_allow(peer, 1*HZ))
603 ndisc_send_redirect(skb, target);
604 if (peer)
605 inet_putpeer(peer);
606 } else {
607 int addrtype = ipv6_addr_type(&hdr->saddr);
608
609 /* This check is security critical. */
610 if (addrtype == IPV6_ADDR_ANY ||
611 addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
612 goto error;
613 if (addrtype & IPV6_ADDR_LINKLOCAL) {
614 icmpv6_send(skb, ICMPV6_DEST_UNREACH,
615 ICMPV6_NOT_NEIGHBOUR, 0);
616 goto error;
617 }
618 }
619
620 mtu = ip6_dst_mtu_forward(dst);
621 if (mtu < IPV6_MIN_MTU)
622 mtu = IPV6_MIN_MTU;
623
624 if (ip6_pkt_too_big(skb, mtu)) {
625 /* Again, force OUTPUT device used as source address */
626 skb->dev = dst->dev;
627 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
628 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
629 __IP6_INC_STATS(net, ip6_dst_idev(dst),
630 IPSTATS_MIB_FRAGFAILS);
631 kfree_skb(skb);
632 return -EMSGSIZE;
633 }
634
635 if (skb_cow(skb, dst->dev->hard_header_len)) {
636 __IP6_INC_STATS(net, ip6_dst_idev(dst),
637 IPSTATS_MIB_OUTDISCARDS);
638 goto drop;
639 }
640
641 hdr = ipv6_hdr(skb);
642
643 /* Mangling hops number delayed to point after skb COW */
644
645 hdr->hop_limit--;
646
647 return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
648 net, NULL, skb, skb->dev, dst->dev,
649 ip6_forward_finish);
650
651 error:
652 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
653 drop:
654 kfree_skb(skb);
655 return -EINVAL;
656 }
657
658 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
659 {
660 to->pkt_type = from->pkt_type;
661 to->priority = from->priority;
662 to->protocol = from->protocol;
663 skb_dst_drop(to);
664 skb_dst_set(to, dst_clone(skb_dst(from)));
665 to->dev = from->dev;
666 to->mark = from->mark;
667
668 skb_copy_hash(to, from);
669
670 #ifdef CONFIG_NET_SCHED
671 to->tc_index = from->tc_index;
672 #endif
673 nf_copy(to, from);
674 skb_ext_copy(to, from);
675 skb_copy_secmark(to, from);
676 }
677
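/* Fast-path fragmentation setup: keep a copy of the unfragmentable
 * header chain in iter->tmp_hdr, detach the skb's frag_list, and insert
 * a fragment header into the first fragment. ip6_fraglist_prepare()
 * then reuses the saved headers for each queued fragment.
 */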
678 int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
679 u8 nexthdr, __be32 frag_id,
680 struct ip6_fraglist_iter *iter)
681 {
682 unsigned int first_len;
683 struct frag_hdr *fh;
684
685 /* BUILD HEADER */
686 *prevhdr = NEXTHDR_FRAGMENT;
687 iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
688 if (!iter->tmp_hdr)
689 return -ENOMEM;
690
691 iter->frag = skb_shinfo(skb)->frag_list;
692 skb_frag_list_init(skb);
693
694 iter->offset = 0;
695 iter->hlen = hlen;
696 iter->frag_id = frag_id;
697 iter->nexthdr = nexthdr;
698
699 __skb_pull(skb, hlen);
700 fh = __skb_push(skb, sizeof(struct frag_hdr));
701 __skb_push(skb, hlen);
702 skb_reset_network_header(skb);
703 memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);
704
705 fh->nexthdr = nexthdr;
706 fh->reserved = 0;
707 fh->frag_off = htons(IP6_MF);
708 fh->identification = frag_id;
709
710 first_len = skb_pagelen(skb);
711 skb->data_len = first_len - skb_headlen(skb);
712 skb->len = first_len;
713 ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));
714
715 return 0;
716 }
717 EXPORT_SYMBOL(ip6_fraglist_init);
718
719 void ip6_fraglist_prepare(struct sk_buff *skb,
720 struct ip6_fraglist_iter *iter)
721 {
722 struct sk_buff *frag = iter->frag;
723 unsigned int hlen = iter->hlen;
724 struct frag_hdr *fh;
725
726 frag->ip_summed = CHECKSUM_NONE;
727 skb_reset_transport_header(frag);
728 fh = __skb_push(frag, sizeof(struct frag_hdr));
729 __skb_push(frag, hlen);
730 skb_reset_network_header(frag);
731 memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
732 iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
733 fh->nexthdr = iter->nexthdr;
734 fh->reserved = 0;
735 fh->frag_off = htons(iter->offset);
736 if (frag->next)
737 fh->frag_off |= htons(IP6_MF);
738 fh->identification = iter->frag_id;
739 ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
740 ip6_copy_metadata(frag, skb);
741 }
742 EXPORT_SYMBOL(ip6_fraglist_prepare);
743
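/* Slow-path fragmentation setup: record the header length, per-fragment
 * MTU, head/tail room and identification so that ip6_frag_next() can
 * allocate and fill one fragment at a time.
 */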
744 void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
745 unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
746 u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
747 {
748 state->prevhdr = prevhdr;
749 state->nexthdr = nexthdr;
750 state->frag_id = frag_id;
751
752 state->hlen = hlen;
753 state->mtu = mtu;
754
755 state->left = skb->len - hlen; /* Space per frame */
756 state->ptr = hlen; /* Where to start from */
757
758 state->hroom = hdr_room;
759 state->troom = needed_tailroom;
760
761 state->offset = 0;
762 }
763 EXPORT_SYMBOL(ip6_frag_init);
764
765 struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
766 {
767 u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
768 struct sk_buff *frag;
769 struct frag_hdr *fh;
770 unsigned int len;
771
772 len = state->left;
773 /* IF: it doesn't fit, use 'mtu' - the data space left */
774 if (len > state->mtu)
775 len = state->mtu;
776 /* IF: we are not sending up to and including the packet end
777 then align the next start on an eight byte boundary */
778 if (len < state->left)
779 len &= ~7;
780
781 /* Allocate buffer */
782 frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
783 state->hroom + state->troom, GFP_ATOMIC);
784 if (!frag)
785 return ERR_PTR(-ENOMEM);
786
787 /*
788 * Set up data on packet
789 */
790
791 ip6_copy_metadata(frag, skb);
792 skb_reserve(frag, state->hroom);
793 skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
794 skb_reset_network_header(frag);
795 fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
796 frag->transport_header = (frag->network_header + state->hlen +
797 sizeof(struct frag_hdr));
798
799 /*
800 * Charge the memory for the fragment to any owner
801 * it might possess
802 */
803 if (skb->sk)
804 skb_set_owner_w(frag, skb->sk);
805
806 /*
807 * Copy the packet header into the new buffer.
808 */
809 skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);
810
811 fragnexthdr_offset = skb_network_header(frag);
812 fragnexthdr_offset += prevhdr - skb_network_header(skb);
813 *fragnexthdr_offset = NEXTHDR_FRAGMENT;
814
815 /*
816 * Build fragment header.
817 */
818 fh->nexthdr = state->nexthdr;
819 fh->reserved = 0;
820 fh->identification = state->frag_id;
821
822 /*
823 * Copy a block of the IP datagram.
824 */
825 BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
826 len));
827 state->left -= len;
828
829 fh->frag_off = htons(state->offset);
830 if (state->left > 0)
831 fh->frag_off |= htons(IP6_MF);
832 ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
833
834 state->ptr += len;
835 state->offset += len;
836
837 return frag;
838 }
839 EXPORT_SYMBOL(ip6_frag_next);
840
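/* Fragment @skb so every piece fits the path MTU and pass each fragment
 * to @output. If the skb already carries a suitable frag_list the
 * existing fragments are reused (fast path); otherwise the payload is
 * copied into freshly allocated skbs via ip6_frag_next() (slow path).
 */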
841 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
842 int (*output)(struct net *, struct sock *, struct sk_buff *))
843 {
844 struct sk_buff *frag;
845 struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
846 struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
847 inet6_sk(skb->sk) : NULL;
848 struct ip6_frag_state state;
849 unsigned int mtu, hlen, nexthdr_offset;
850 ktime_t tstamp = skb->tstamp;
851 int hroom, err = 0;
852 __be32 frag_id;
853 u8 *prevhdr, nexthdr = 0;
854
855 err = ip6_find_1stfragopt(skb, &prevhdr);
856 if (err < 0)
857 goto fail;
858 hlen = err;
859 nexthdr = *prevhdr;
860 nexthdr_offset = prevhdr - skb_network_header(skb);
861
862 mtu = ip6_skb_dst_mtu(skb);
863
864 /* We must not fragment if the socket is set to force MTU discovery
865 * or if the skb is not generated by a local socket.
866 */
867 if (unlikely(!skb->ignore_df && skb->len > mtu))
868 goto fail_toobig;
869
870 if (IP6CB(skb)->frag_max_size) {
871 if (IP6CB(skb)->frag_max_size > mtu)
872 goto fail_toobig;
873
874 /* don't send fragments larger than what we received */
875 mtu = IP6CB(skb)->frag_max_size;
876 if (mtu < IPV6_MIN_MTU)
877 mtu = IPV6_MIN_MTU;
878 }
879
880 if (np && np->frag_size < mtu) {
881 if (np->frag_size)
882 mtu = np->frag_size;
883 }
884 if (mtu < hlen + sizeof(struct frag_hdr) + 8)
885 goto fail_toobig;
886 mtu -= hlen + sizeof(struct frag_hdr);
887
888 frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
889 &ipv6_hdr(skb)->saddr);
890
891 if (skb->ip_summed == CHECKSUM_PARTIAL &&
892 (err = skb_checksum_help(skb)))
893 goto fail;
894
895 prevhdr = skb_network_header(skb) + nexthdr_offset;
896 hroom = LL_RESERVED_SPACE(rt->dst.dev);
897 if (skb_has_frag_list(skb)) {
898 unsigned int first_len = skb_pagelen(skb);
899 struct ip6_fraglist_iter iter;
900 struct sk_buff *frag2;
901
902 if (first_len - hlen > mtu ||
903 ((first_len - hlen) & 7) ||
904 skb_cloned(skb) ||
905 skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
906 goto slow_path;
907
908 skb_walk_frags(skb, frag) {
909 /* Correct geometry. */
910 if (frag->len > mtu ||
911 ((frag->len & 7) && frag->next) ||
912 skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
913 goto slow_path_clean;
914
915 /* Partially cloned skb? */
916 if (skb_shared(frag))
917 goto slow_path_clean;
918
919 BUG_ON(frag->sk);
920 if (skb->sk) {
921 frag->sk = skb->sk;
922 frag->destructor = sock_wfree;
923 }
924 skb->truesize -= frag->truesize;
925 }
926
927 err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
928 &iter);
929 if (err < 0)
930 goto fail;
931
932 /* We prevent @rt from being freed. */
933 rcu_read_lock();
934
935 for (;;) {
936 /* Prepare header of the next frame,
937 * before previous one went down. */
938 if (iter.frag)
939 ip6_fraglist_prepare(skb, &iter);
940
941 skb->tstamp = tstamp;
942 err = output(net, sk, skb);
943 if (!err)
944 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
945 IPSTATS_MIB_FRAGCREATES);
946
947 if (err || !iter.frag)
948 break;
949
950 skb = ip6_fraglist_next(&iter);
951 }
952
953 kfree(iter.tmp_hdr);
954
955 if (err == 0) {
956 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
957 IPSTATS_MIB_FRAGOKS);
958 rcu_read_unlock();
959 return 0;
960 }
961
962 kfree_skb_list(iter.frag);
963
964 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
965 IPSTATS_MIB_FRAGFAILS);
966 rcu_read_unlock();
967 return err;
968
969 slow_path_clean:
970 skb_walk_frags(skb, frag2) {
971 if (frag2 == frag)
972 break;
973 frag2->sk = NULL;
974 frag2->destructor = NULL;
975 skb->truesize += frag2->truesize;
976 }
977 }
978
979 slow_path:
980 /*
981 * Fragment the datagram.
982 */
983
984 ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
985 LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
986 &state);
987
988 /*
989 * Keep copying data until we run out.
990 */
991
992 while (state.left > 0) {
993 frag = ip6_frag_next(skb, &state);
994 if (IS_ERR(frag)) {
995 err = PTR_ERR(frag);
996 goto fail;
997 }
998
999 /*
1000 * Put this fragment into the sending queue.
1001 */
1002 frag->tstamp = tstamp;
1003 err = output(net, sk, frag);
1004 if (err)
1005 goto fail;
1006
1007 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1008 IPSTATS_MIB_FRAGCREATES);
1009 }
1010 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1011 IPSTATS_MIB_FRAGOKS);
1012 consume_skb(skb);
1013 return err;
1014
1015 fail_toobig:
1016 if (skb->sk && dst_allfrag(skb_dst(skb)))
1017 sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
1018
1019 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
1020 err = -EMSGSIZE;
1021
1022 fail:
1023 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1024 IPSTATS_MIB_FRAGFAILS);
1025 kfree_skb(skb);
1026 return err;
1027 }
1028
1029 static inline int ip6_rt_check(const struct rt6key *rt_key,
1030 const struct in6_addr *fl_addr,
1031 const struct in6_addr *addr_cache)
1032 {
1033 return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
1034 (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
1035 }
1036
1037 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
1038 struct dst_entry *dst,
1039 const struct flowi6 *fl6)
1040 {
1041 struct ipv6_pinfo *np = inet6_sk(sk);
1042 struct rt6_info *rt;
1043
1044 if (!dst)
1045 goto out;
1046
1047 if (dst->ops->family != AF_INET6) {
1048 dst_release(dst);
1049 return NULL;
1050 }
1051
1052 rt = (struct rt6_info *)dst;
1053 /* Yes, checking route validity in the not-connected
1054 * case is not very simple. Take into account
1055 * that we do not support routing by source, TOS,
1056 * and MSG_DONTROUTE --ANK (980726)
1057 *
1058 * 1. ip6_rt_check(): If route was host route,
1059 * check that cached destination is current.
1060 * If it is network route, we still may
1061 * check its validity using saved pointer
1062 * to the last used address: daddr_cache.
1063 * We do not want to save whole address now,
1064 * (because the main consumer of this service
1065 * is tcp, which does not have this problem),
1066 * so the last trick works only on connected
1067 * sockets.
1068 * 2. oif also should be the same.
1069 */
1070 if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
1071 #ifdef CONFIG_IPV6_SUBTREES
1072 ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
1073 #endif
1074 (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
1075 (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
1076 dst_release(dst);
1077 dst = NULL;
1078 }
1079
1080 out:
1081 return dst;
1082 }
1083
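/* Common helper behind the ip6_dst_lookup*() functions: resolve a route
 * for @fl6, pick a source address when the caller left it unspecified,
 * and with optimistic DAD fall back to the default router's dst if the
 * nexthop neighbour is not yet valid.
 */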
1084 static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
1085 struct dst_entry **dst, struct flowi6 *fl6)
1086 {
1087 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1088 struct neighbour *n;
1089 struct rt6_info *rt;
1090 #endif
1091 int err;
1092 int flags = 0;
1093
1094 /* The correct way to handle this would be to do
1095 * ip6_route_get_saddr, and then ip6_route_output; however,
1096 * the route-specific preferred source forces the
1097 * ip6_route_output call _before_ ip6_route_get_saddr.
1098 *
1099 * In source specific routing (no src=any default route),
1100 * ip6_route_output will fail given src=any saddr, though, so
1101 * that's why we try it again later.
1102 */
1103 if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
1104 struct fib6_info *from;
1105 struct rt6_info *rt;
1106 bool had_dst = *dst != NULL;
1107
1108 if (!had_dst)
1109 *dst = ip6_route_output(net, sk, fl6);
1110 rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
1111
1112 rcu_read_lock();
1113 from = rt ? rcu_dereference(rt->from) : NULL;
1114 err = ip6_route_get_saddr(net, from, &fl6->daddr,
1115 sk ? inet6_sk(sk)->srcprefs : 0,
1116 &fl6->saddr);
1117 rcu_read_unlock();
1118
1119 if (err)
1120 goto out_err_release;
1121
1122 /* If we had an erroneous initial result, pretend it
1123 * never existed and let the SA-enabled version take
1124 * over.
1125 */
1126 if (!had_dst && (*dst)->error) {
1127 dst_release(*dst);
1128 *dst = NULL;
1129 }
1130
1131 if (fl6->flowi6_oif)
1132 flags |= RT6_LOOKUP_F_IFACE;
1133 }
1134
1135 if (!*dst)
1136 *dst = ip6_route_output_flags(net, sk, fl6, flags);
1137
1138 err = (*dst)->error;
1139 if (err)
1140 goto out_err_release;
1141
1142 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1143 /*
1144 * Here if the dst entry we've looked up
1145 * has a neighbour entry that is in the INCOMPLETE
1146 * state and the src address from the flow is
1147 * marked as OPTIMISTIC, we release the found
1148 * dst entry and replace it instead with the
1149 * dst entry of the nexthop router
1150 */
1151 rt = (struct rt6_info *) *dst;
1152 rcu_read_lock_bh();
1153 n = __ipv6_neigh_lookup_noref(rt->dst.dev,
1154 rt6_nexthop(rt, &fl6->daddr));
1155 err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
1156 rcu_read_unlock_bh();
1157
1158 if (err) {
1159 struct inet6_ifaddr *ifp;
1160 struct flowi6 fl_gw6;
1161 int redirect;
1162
1163 ifp = ipv6_get_ifaddr(net, &fl6->saddr,
1164 (*dst)->dev, 1);
1165
1166 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
1167 if (ifp)
1168 in6_ifa_put(ifp);
1169
1170 if (redirect) {
1171 /*
1172 * We need to get the dst entry for the
1173 * default router instead
1174 */
1175 dst_release(*dst);
1176 memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1177 memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1178 *dst = ip6_route_output(net, sk, &fl_gw6);
1179 err = (*dst)->error;
1180 if (err)
1181 goto out_err_release;
1182 }
1183 }
1184 #endif
1185 if (ipv6_addr_v4mapped(&fl6->saddr) &&
1186 !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
1187 err = -EAFNOSUPPORT;
1188 goto out_err_release;
1189 }
1190
1191 return 0;
1192
1193 out_err_release:
1194 dst_release(*dst);
1195 *dst = NULL;
1196
1197 if (err == -ENETUNREACH)
1198 IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1199 return err;
1200 }
1201
1202 /**
1203 * ip6_dst_lookup - perform route lookup on flow
1204 * @net: Network namespace to perform lookup in
1205 * @sk: socket which provides route info
1206 * @dst: pointer to dst_entry * for result
1207 * @fl6: flow to lookup
1208 *
1209 * This function performs a route lookup on the given flow.
1210 *
1211 * It returns zero on success, or a standard errno code on error.
1212 */
1213 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1214 struct flowi6 *fl6)
1215 {
1216 *dst = NULL;
1217 return ip6_dst_lookup_tail(net, sk, dst, fl6);
1218 }
1219 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1220
1221 /**
1222 * ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1223 * @net: Network namespace to perform lookup in
1224 * @sk: socket which provides route info
1225 * @fl6: flow to lookup
1226 * @final_dst: final destination address for ipsec lookup
1227 *
1228 * This function performs a route lookup on the given flow.
1229 *
1230 * It returns a valid dst pointer on success, or a pointer encoded
1231 * error code.
1232 */
1233 struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
1234 const struct in6_addr *final_dst)
1235 {
1236 struct dst_entry *dst = NULL;
1237 int err;
1238
1239 err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
1240 if (err)
1241 return ERR_PTR(err);
1242 if (final_dst)
1243 fl6->daddr = *final_dst;
1244
1245 return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
1246 }
1247 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1248
1249 /**
1250 * ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1251 * @sk: socket which provides the dst cache and route info
1252 * @fl6: flow to lookup
1253 * @final_dst: final destination address for ipsec lookup
1254 * @connected: whether @sk is connected or not
1255 *
1256 * This function performs a route lookup on the given flow with the
1257 * possibility of using the cached route in the socket if it is valid.
1258 * It will take the socket dst lock when operating on the dst cache.
1259 * As a result, this function can only be used in process context.
1260 *
1261 * In addition, for a connected socket, cache the dst in the socket
1262 * if the current cache is not valid.
1263 *
1264 * It returns a valid dst pointer on success, or a pointer encoded
1265 * error code.
1266 */
1267 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1268 const struct in6_addr *final_dst,
1269 bool connected)
1270 {
1271 struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1272
1273 dst = ip6_sk_dst_check(sk, dst, fl6);
1274 if (dst)
1275 return dst;
1276
1277 dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
1278 if (connected && !IS_ERR(dst))
1279 ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);
1280
1281 return dst;
1282 }
1283 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1284
1285 /**
1286 * ip6_dst_lookup_tunnel - perform route lookup on tunnel
1287 * @skb: Packet for which lookup is done
1288 * @dev: Tunnel device
1289 * @net: Network namespace of tunnel device
1290 * @sock: Socket which provides route info
1291 * @saddr: Memory to store the src ip address
1292 * @info: Tunnel information
1293 * @protocol: IP protocol
1294 * @use_cache: Flag to enable cache usage
1295 * This function performs a route lookup on a tunnel
1296 *
1297 * It returns a valid dst pointer and stores src address to be used in
1298 * tunnel in param saddr on success, else a pointer encoded error code.
1299 */
1300
1301 struct dst_entry *ip6_dst_lookup_tunnel(struct sk_buff *skb,
1302 struct net_device *dev,
1303 struct net *net,
1304 struct socket *sock,
1305 struct in6_addr *saddr,
1306 const struct ip_tunnel_info *info,
1307 u8 protocol,
1308 bool use_cache)
1309 {
1310 struct dst_entry *dst = NULL;
1311 #ifdef CONFIG_DST_CACHE
1312 struct dst_cache *dst_cache;
1313 #endif
1314 struct flowi6 fl6;
1315 __u8 prio;
1316
1317 #ifdef CONFIG_DST_CACHE
1318 dst_cache = (struct dst_cache *)&info->dst_cache;
1319 if (use_cache) {
1320 dst = dst_cache_get_ip6(dst_cache, saddr);
1321 if (dst)
1322 return dst;
1323 }
1324 #endif
1325 memset(&fl6, 0, sizeof(fl6));
1326 fl6.flowi6_mark = skb->mark;
1327 fl6.flowi6_proto = protocol;
1328 fl6.daddr = info->key.u.ipv6.dst;
1329 fl6.saddr = info->key.u.ipv6.src;
1330 prio = info->key.tos;
1331 fl6.flowlabel = ip6_make_flowinfo(prio, info->key.label);
1332
1333 dst = ipv6_stub->ipv6_dst_lookup_flow(net, sock->sk, &fl6,
1334 NULL);
1335 if (IS_ERR(dst)) {
1336 netdev_dbg(dev, "no route to %pI6\n", &fl6.daddr);
1337 return ERR_PTR(-ENETUNREACH);
1338 }
1339 if (dst->dev == dev) { /* is this necessary? */
1340 netdev_dbg(dev, "circular route to %pI6\n", &fl6.daddr);
1341 dst_release(dst);
1342 return ERR_PTR(-ELOOP);
1343 }
1344 #ifdef CONFIG_DST_CACHE
1345 if (use_cache)
1346 dst_cache_set_ip6(dst_cache, dst, &fl6.saddr);
1347 #endif
1348 *saddr = fl6.saddr;
1349 return dst;
1350 }
1351 EXPORT_SYMBOL_GPL(ip6_dst_lookup_tunnel);
1352
1353 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1354 gfp_t gfp)
1355 {
1356 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1357 }
1358
1359 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1360 gfp_t gfp)
1361 {
1362 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1363 }
1364
1365 static void ip6_append_data_mtu(unsigned int *mtu,
1366 int *maxfraglen,
1367 unsigned int fragheaderlen,
1368 struct sk_buff *skb,
1369 struct rt6_info *rt,
1370 unsigned int orig_mtu)
1371 {
1372 if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1373 if (!skb) {
1374 /* first fragment, reserve header_len */
1375 *mtu = orig_mtu - rt->dst.header_len;
1376
1377 } else {
1378 /*
1379 * this fragment is not first, the headers
1380 * space is regarded as data space.
1381 */
1382 *mtu = orig_mtu;
1383 }
1384 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1385 + fragheaderlen - sizeof(struct frag_hdr);
1386 }
1387 }
1388
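/* Corking setup: duplicate the tx options into the cork, take a
 * reference on the route, and record flow, hop limit, traffic class,
 * mark and the MTU that __ip6_append_data() will fragment against.
 */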
1389 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1390 struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1391 struct rt6_info *rt, struct flowi6 *fl6)
1392 {
1393 struct ipv6_pinfo *np = inet6_sk(sk);
1394 unsigned int mtu;
1395 struct ipv6_txoptions *opt = ipc6->opt;
1396
1397 /*
1398 * setup for corking
1399 */
1400 if (opt) {
1401 if (WARN_ON(v6_cork->opt))
1402 return -EINVAL;
1403
1404 v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
1405 if (unlikely(!v6_cork->opt))
1406 return -ENOBUFS;
1407
1408 v6_cork->opt->tot_len = sizeof(*opt);
1409 v6_cork->opt->opt_flen = opt->opt_flen;
1410 v6_cork->opt->opt_nflen = opt->opt_nflen;
1411
1412 v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1413 sk->sk_allocation);
1414 if (opt->dst0opt && !v6_cork->opt->dst0opt)
1415 return -ENOBUFS;
1416
1417 v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1418 sk->sk_allocation);
1419 if (opt->dst1opt && !v6_cork->opt->dst1opt)
1420 return -ENOBUFS;
1421
1422 v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
1423 sk->sk_allocation);
1424 if (opt->hopopt && !v6_cork->opt->hopopt)
1425 return -ENOBUFS;
1426
1427 v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1428 sk->sk_allocation);
1429 if (opt->srcrt && !v6_cork->opt->srcrt)
1430 return -ENOBUFS;
1431
1432 /* need source address above miyazawa*/
1433 }
1434 dst_hold(&rt->dst);
1435 cork->base.dst = &rt->dst;
1436 cork->fl.u.ip6 = *fl6;
1437 v6_cork->hop_limit = ipc6->hlimit;
1438 v6_cork->tclass = ipc6->tclass;
1439 if (rt->dst.flags & DST_XFRM_TUNNEL)
1440 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1441 READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
1442 else
1443 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1444 READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
1445 if (np->frag_size < mtu) {
1446 if (np->frag_size)
1447 mtu = np->frag_size;
1448 }
1449 cork->base.fragsize = mtu;
1450 cork->base.gso_size = ipc6->gso_size;
1451 cork->base.tx_flags = 0;
1452 cork->base.mark = ipc6->sockc.mark;
1453 sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);
1454
1455 if (dst_allfrag(xfrm_dst_path(&rt->dst)))
1456 cork->base.flags |= IPCORK_ALLFRAG;
1457 cork->base.length = 0;
1458
1459 cork->base.transmit_time = ipc6->sockc.transmit_time;
1460
1461 return 0;
1462 }
1463
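/* Append @length bytes (copied in via @getfrag) to the pending queue:
 * extend the last queued skb while it still has room, otherwise
 * allocate new skbs sized to the fragment MTU. Handles zerocopy pages
 * and keeps CHECKSUM_PARTIAL only when no fragmentation will be needed.
 */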
1464 static int __ip6_append_data(struct sock *sk,
1465 struct flowi6 *fl6,
1466 struct sk_buff_head *queue,
1467 struct inet_cork *cork,
1468 struct inet6_cork *v6_cork,
1469 struct page_frag *pfrag,
1470 int getfrag(void *from, char *to, int offset,
1471 int len, int odd, struct sk_buff *skb),
1472 void *from, int length, int transhdrlen,
1473 unsigned int flags, struct ipcm6_cookie *ipc6)
1474 {
1475 struct sk_buff *skb, *skb_prev = NULL;
1476 unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
1477 struct ubuf_info *uarg = NULL;
1478 int exthdrlen = 0;
1479 int dst_exthdrlen = 0;
1480 int hh_len;
1481 int copy;
1482 int err;
1483 int offset = 0;
1484 u32 tskey = 0;
1485 struct rt6_info *rt = (struct rt6_info *)cork->dst;
1486 struct ipv6_txoptions *opt = v6_cork->opt;
1487 int csummode = CHECKSUM_NONE;
1488 unsigned int maxnonfragsize, headersize;
1489 unsigned int wmem_alloc_delta = 0;
1490 bool paged, extra_uref = false;
1491
1492 skb = skb_peek_tail(queue);
1493 if (!skb) {
1494 exthdrlen = opt ? opt->opt_flen : 0;
1495 dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1496 }
1497
1498 paged = !!cork->gso_size;
1499 mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
1500 orig_mtu = mtu;
1501
1502 if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
1503 sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1504 tskey = sk->sk_tskey++;
1505
1506 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1507
1508 fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1509 (opt ? opt->opt_nflen : 0);
1510
1511 headersize = sizeof(struct ipv6hdr) +
1512 (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1513 (dst_allfrag(&rt->dst) ?
1514 sizeof(struct frag_hdr) : 0) +
1515 rt->rt6i_nfheader_len;
1516
1517 if (mtu <= fragheaderlen ||
1518 ((mtu - fragheaderlen) & ~7) + fragheaderlen <= sizeof(struct frag_hdr))
1519 goto emsgsize;
1520
1521 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1522 sizeof(struct frag_hdr);
1523
1524 /* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
1525 * the first fragment
1526 */
1527 if (headersize + transhdrlen > mtu)
1528 goto emsgsize;
1529
1530 if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1531 (sk->sk_protocol == IPPROTO_UDP ||
1532 sk->sk_protocol == IPPROTO_RAW)) {
1533 ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1534 sizeof(struct ipv6hdr));
1535 goto emsgsize;
1536 }
1537
1538 if (ip6_sk_ignore_df(sk))
1539 maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1540 else
1541 maxnonfragsize = mtu;
1542
1543 if (cork->length + length > maxnonfragsize - headersize) {
1544 emsgsize:
1545 pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
1546 ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
1547 return -EMSGSIZE;
1548 }
1549
1550 /* CHECKSUM_PARTIAL only with no extension headers and when
1551 * we are not going to fragment
1552 */
1553 if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1554 headersize == sizeof(struct ipv6hdr) &&
1555 length <= mtu - headersize &&
1556 (!(flags & MSG_MORE) || cork->gso_size) &&
1557 rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1558 csummode = CHECKSUM_PARTIAL;
1559
1560 if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) {
1561 uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb));
1562 if (!uarg)
1563 return -ENOBUFS;
1564 extra_uref = !skb_zcopy(skb); /* only ref on new uarg */
1565 if (rt->dst.dev->features & NETIF_F_SG &&
1566 csummode == CHECKSUM_PARTIAL) {
1567 paged = true;
1568 } else {
1569 uarg->zerocopy = 0;
1570 skb_zcopy_set(skb, uarg, &extra_uref);
1571 }
1572 }
1573
1574 /*
1575 * Let's try using as much space as possible.
1576 * Use MTU if total length of the message fits into the MTU.
1577 * Otherwise, we need to reserve fragment header and
1578 * fragment alignment (= 8-15 octets, in total).
1579 *
1580 * Note that we may need to "move" the data from the tail
1581 * of the buffer to the new fragment when we split
1582 * the message.
1583 *
1584 * FIXME: It may be fragmented into multiple chunks
1585 * at once if non-fragmentable extension headers
1586 * are too large.
1587 * --yoshfuji
1588 */
1589
1590 cork->length += length;
1591 if (!skb)
1592 goto alloc_new_skb;
1593
1594 while (length > 0) {
1595 /* Check if the remaining data fits into current packet. */
1596 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1597 if (copy < length)
1598 copy = maxfraglen - skb->len;
1599
1600 if (copy <= 0) {
1601 char *data;
1602 unsigned int datalen;
1603 unsigned int fraglen;
1604 unsigned int fraggap;
1605 unsigned int alloclen, alloc_extra;
1606 unsigned int pagedlen;
1607 alloc_new_skb:
1608 /* There's no room in the current skb */
1609 if (skb)
1610 fraggap = skb->len - maxfraglen;
1611 else
1612 fraggap = 0;
1613 /* update mtu and maxfraglen if necessary */
1614 if (!skb || !skb_prev)
1615 ip6_append_data_mtu(&mtu, &maxfraglen,
1616 fragheaderlen, skb, rt,
1617 orig_mtu);
1618
1619 skb_prev = skb;
1620
1621 /*
1622 * If remaining data exceeds the mtu,
1623 * we know we need more fragment(s).
1624 */
1625 datalen = length + fraggap;
1626
1627 if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1628 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1629 fraglen = datalen + fragheaderlen;
1630 pagedlen = 0;
1631
1632 alloc_extra = hh_len;
1633 alloc_extra += dst_exthdrlen;
1634 alloc_extra += rt->dst.trailer_len;
1635
1636 /* We just reserve space for fragment header.
1637 * Note: this may be overallocation if the message
1638 * (without MSG_MORE) fits into the MTU.
1639 */
1640 alloc_extra += sizeof(struct frag_hdr);
1641
1642 if ((flags & MSG_MORE) &&
1643 !(rt->dst.dev->features&NETIF_F_SG))
1644 alloclen = mtu;
1645 else if (!paged &&
1646 (fraglen + alloc_extra < SKB_MAX_ALLOC ||
1647 !(rt->dst.dev->features & NETIF_F_SG)))
1648 alloclen = fraglen;
1649 else {
1650 alloclen = min_t(int, fraglen, MAX_HEADER);
1651 pagedlen = fraglen - alloclen;
1652 }
1653 alloclen += alloc_extra;
1654
1655 if (datalen != length + fraggap) {
1656 /*
1657 * this is not the last fragment, the trailer
1658 * space is regarded as data space.
1659 */
1660 datalen += rt->dst.trailer_len;
1661 }
1662
1663 fraglen = datalen + fragheaderlen;
1664
1665 copy = datalen - transhdrlen - fraggap - pagedlen;
1666 if (copy < 0) {
1667 err = -EINVAL;
1668 goto error;
1669 }
1670 if (transhdrlen) {
1671 skb = sock_alloc_send_skb(sk, alloclen,
1672 (flags & MSG_DONTWAIT), &err);
1673 } else {
1674 skb = NULL;
1675 if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
1676 2 * sk->sk_sndbuf)
1677 skb = alloc_skb(alloclen,
1678 sk->sk_allocation);
1679 if (unlikely(!skb))
1680 err = -ENOBUFS;
1681 }
1682 if (!skb)
1683 goto error;
1684 /*
1685 * Fill in the control structures
1686 */
1687 skb->protocol = htons(ETH_P_IPV6);
1688 skb->ip_summed = csummode;
1689 skb->csum = 0;
1690 /* reserve for fragmentation and ipsec header */
1691 skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1692 dst_exthdrlen);
1693
1694 /*
1695 * Find where to start putting bytes
1696 */
1697 data = skb_put(skb, fraglen - pagedlen);
1698 skb_set_network_header(skb, exthdrlen);
1699 data += fragheaderlen;
1700 skb->transport_header = (skb->network_header +
1701 fragheaderlen);
1702 if (fraggap) {
1703 skb->csum = skb_copy_and_csum_bits(
1704 skb_prev, maxfraglen,
1705 data + transhdrlen, fraggap);
1706 skb_prev->csum = csum_sub(skb_prev->csum,
1707 skb->csum);
1708 data += fraggap;
1709 pskb_trim_unique(skb_prev, maxfraglen);
1710 }
1711 if (copy > 0 &&
1712 getfrag(from, data + transhdrlen, offset,
1713 copy, fraggap, skb) < 0) {
1714 err = -EFAULT;
1715 kfree_skb(skb);
1716 goto error;
1717 }
1718
1719 offset += copy;
1720 length -= copy + transhdrlen;
1721 transhdrlen = 0;
1722 exthdrlen = 0;
1723 dst_exthdrlen = 0;
1724
1725 /* Only the initial fragment is time stamped */
1726 skb_shinfo(skb)->tx_flags = cork->tx_flags;
1727 cork->tx_flags = 0;
1728 skb_shinfo(skb)->tskey = tskey;
1729 tskey = 0;
1730 skb_zcopy_set(skb, uarg, &extra_uref);
1731
1732 if ((flags & MSG_CONFIRM) && !skb_prev)
1733 skb_set_dst_pending_confirm(skb, 1);
1734
1735 /*
1736 * Put the packet on the pending queue
1737 */
1738 if (!skb->destructor) {
1739 skb->destructor = sock_wfree;
1740 skb->sk = sk;
1741 wmem_alloc_delta += skb->truesize;
1742 }
1743 __skb_queue_tail(queue, skb);
1744 continue;
1745 }
1746
1747 if (copy > length)
1748 copy = length;
1749
1750 if (!(rt->dst.dev->features&NETIF_F_SG) &&
1751 skb_tailroom(skb) >= copy) {
1752 unsigned int off;
1753
1754 off = skb->len;
1755 if (getfrag(from, skb_put(skb, copy),
1756 offset, copy, off, skb) < 0) {
1757 __skb_trim(skb, off);
1758 err = -EFAULT;
1759 goto error;
1760 }
1761 } else if (!uarg || !uarg->zerocopy) {
1762 int i = skb_shinfo(skb)->nr_frags;
1763
1764 err = -ENOMEM;
1765 if (!sk_page_frag_refill(sk, pfrag))
1766 goto error;
1767
1768 if (!skb_can_coalesce(skb, i, pfrag->page,
1769 pfrag->offset)) {
1770 err = -EMSGSIZE;
1771 if (i == MAX_SKB_FRAGS)
1772 goto error;
1773
1774 __skb_fill_page_desc(skb, i, pfrag->page,
1775 pfrag->offset, 0);
1776 skb_shinfo(skb)->nr_frags = ++i;
1777 get_page(pfrag->page);
1778 }
1779 copy = min_t(int, copy, pfrag->size - pfrag->offset);
1780 if (getfrag(from,
1781 page_address(pfrag->page) + pfrag->offset,
1782 offset, copy, skb->len, skb) < 0)
1783 goto error_efault;
1784
1785 pfrag->offset += copy;
1786 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1787 skb->len += copy;
1788 skb->data_len += copy;
1789 skb->truesize += copy;
1790 wmem_alloc_delta += copy;
1791 } else {
1792 err = skb_zerocopy_iter_dgram(skb, from, copy);
1793 if (err < 0)
1794 goto error;
1795 }
1796 offset += copy;
1797 length -= copy;
1798 }
1799
1800 if (wmem_alloc_delta)
1801 refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1802 return 0;
1803
1804 error_efault:
1805 err = -EFAULT;
1806 error:
1807 if (uarg)
1808 sock_zerocopy_put_abort(uarg, extra_uref);
1809 cork->length -= length;
1810 IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1811 refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1812 return err;
1813 }
1814
1815 int ip6_append_data(struct sock *sk,
1816 int getfrag(void *from, char *to, int offset, int len,
1817 int odd, struct sk_buff *skb),
1818 void *from, int length, int transhdrlen,
1819 struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1820 struct rt6_info *rt, unsigned int flags)
1821 {
1822 struct inet_sock *inet = inet_sk(sk);
1823 struct ipv6_pinfo *np = inet6_sk(sk);
1824 int exthdrlen;
1825 int err;
1826
1827 if (flags&MSG_PROBE)
1828 return 0;
1829 if (skb_queue_empty(&sk->sk_write_queue)) {
1830 /*
1831 * setup for corking
1832 */
1833 err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1834 ipc6, rt, fl6);
1835 if (err)
1836 return err;
1837
1838 exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1839 length += exthdrlen;
1840 transhdrlen += exthdrlen;
1841 } else {
1842 fl6 = &inet->cork.fl.u.ip6;
1843 transhdrlen = 0;
1844 }
1845
1846 return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1847 &np->cork, sk_page_frag(sk), getfrag,
1848 from, length, transhdrlen, flags, ipc6);
1849 }
1850 EXPORT_SYMBOL_GPL(ip6_append_data);
1851
1852 static void ip6_cork_release(struct inet_cork_full *cork,
1853 struct inet6_cork *v6_cork)
1854 {
1855 if (v6_cork->opt) {
1856 kfree(v6_cork->opt->dst0opt);
1857 kfree(v6_cork->opt->dst1opt);
1858 kfree(v6_cork->opt->hopopt);
1859 kfree(v6_cork->opt->srcrt);
1860 kfree(v6_cork->opt);
1861 v6_cork->opt = NULL;
1862 }
1863
1864 if (cork->base.dst) {
1865 dst_release(cork->base.dst);
1866 cork->base.dst = NULL;
1867 cork->base.flags &= ~IPCORK_ALLFRAG;
1868 }
1869 memset(&cork->fl, 0, sizeof(cork->fl));
1870 }
1871
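/* Collapse the queued skbs into one packet (chained via frag_list),
 * push the extension headers and the IPv6 header, and attach the route,
 * priority and transmit time. The caller sends the result with
 * ip6_send_skb().
 */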
1872 struct sk_buff *__ip6_make_skb(struct sock *sk,
1873 struct sk_buff_head *queue,
1874 struct inet_cork_full *cork,
1875 struct inet6_cork *v6_cork)
1876 {
1877 struct sk_buff *skb, *tmp_skb;
1878 struct sk_buff **tail_skb;
1879 struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1880 struct ipv6_pinfo *np = inet6_sk(sk);
1881 struct net *net = sock_net(sk);
1882 struct ipv6hdr *hdr;
1883 struct ipv6_txoptions *opt = v6_cork->opt;
1884 struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1885 struct flowi6 *fl6 = &cork->fl.u.ip6;
1886 unsigned char proto = fl6->flowi6_proto;
1887
1888 skb = __skb_dequeue(queue);
1889 if (!skb)
1890 goto out;
1891 tail_skb = &(skb_shinfo(skb)->frag_list);
1892
1893 /* move skb->data to ip header from ext header */
1894 if (skb->data < skb_network_header(skb))
1895 __skb_pull(skb, skb_network_offset(skb));
1896 while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1897 __skb_pull(tmp_skb, skb_network_header_len(skb));
1898 *tail_skb = tmp_skb;
1899 tail_skb = &(tmp_skb->next);
1900 skb->len += tmp_skb->len;
1901 skb->data_len += tmp_skb->len;
1902 skb->truesize += tmp_skb->truesize;
1903 tmp_skb->destructor = NULL;
1904 tmp_skb->sk = NULL;
1905 }
1906
1907 /* Allow local fragmentation. */
1908 skb->ignore_df = ip6_sk_ignore_df(sk);
1909
1910 *final_dst = fl6->daddr;
1911 __skb_pull(skb, skb_network_header_len(skb));
1912 if (opt && opt->opt_flen)
1913 ipv6_push_frag_opts(skb, opt, &proto);
1914 if (opt && opt->opt_nflen)
1915 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
1916
1917 skb_push(skb, sizeof(struct ipv6hdr));
1918 skb_reset_network_header(skb);
1919 hdr = ipv6_hdr(skb);
1920
1921 ip6_flow_hdr(hdr, v6_cork->tclass,
1922 ip6_make_flowlabel(net, skb, fl6->flowlabel,
1923 ip6_autoflowlabel(net, np), fl6));
1924 hdr->hop_limit = v6_cork->hop_limit;
1925 hdr->nexthdr = proto;
1926 hdr->saddr = fl6->saddr;
1927 hdr->daddr = *final_dst;
1928
1929 skb->priority = sk->sk_priority;
1930 skb->mark = cork->base.mark;
1931
1932 skb->tstamp = cork->base.transmit_time;
1933
1934 skb_dst_set(skb, dst_clone(&rt->dst));
1935 IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1936 if (proto == IPPROTO_ICMPV6) {
1937 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1938 u8 icmp6_type;
1939
1940 if (sk->sk_socket->type == SOCK_RAW &&
1941 !(fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH))
1942 icmp6_type = fl6->fl6_icmp_type;
1943 else
1944 icmp6_type = icmp6_hdr(skb)->icmp6_type;
1945 ICMP6MSGOUT_INC_STATS(net, idev, icmp6_type);
1946 ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1947 }
1948
1949 ip6_cork_release(cork, v6_cork);
1950 out:
1951 return skb;
1952 }
1953
1954 int ip6_send_skb(struct sk_buff *skb)
1955 {
1956 struct net *net = sock_net(skb->sk);
1957 struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1958 int err;
1959
1960 rcu_read_lock();
1961 err = ip6_local_out(net, skb->sk, skb);
1962 if (err) {
1963 if (err > 0)
1964 err = net_xmit_errno(err);
1965 if (err)
1966 IP6_INC_STATS(net, rt->rt6i_idev,
1967 IPSTATS_MIB_OUTDISCARDS);
1968 }
1969
1970 rcu_read_unlock();
1971 return err;
1972 }
1973
1974 int ip6_push_pending_frames(struct sock *sk)
1975 {
1976 struct sk_buff *skb;
1977
1978 skb = ip6_finish_skb(sk);
1979 if (!skb)
1980 return 0;
1981
1982 return ip6_send_skb(skb);
1983 }
1984 EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1985
1986 static void __ip6_flush_pending_frames(struct sock *sk,
1987 struct sk_buff_head *queue,
1988 struct inet_cork_full *cork,
1989 struct inet6_cork *v6_cork)
1990 {
1991 struct sk_buff *skb;
1992
1993 while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1994 if (skb_dst(skb))
1995 IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1996 IPSTATS_MIB_OUTDISCARDS);
1997 kfree_skb(skb);
1998 }
1999
2000 ip6_cork_release(cork, v6_cork);
2001 }
2002
2003 void ip6_flush_pending_frames(struct sock *sk)
2004 {
2005 __ip6_flush_pending_frames(sk, &sk->sk_write_queue,
2006 &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
2007 }
2008 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
2009
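/* Build a complete datagram in one call on a private queue and cork,
 * without touching sk->sk_write_queue, so the socket does not need to
 * stay corked between append and transmit.
 */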
2010 struct sk_buff *ip6_make_skb(struct sock *sk,
2011 int getfrag(void *from, char *to, int offset,
2012 int len, int odd, struct sk_buff *skb),
2013 void *from, int length, int transhdrlen,
2014 struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
2015 struct rt6_info *rt, unsigned int flags,
2016 struct inet_cork_full *cork)
2017 {
2018 struct inet6_cork v6_cork;
2019 struct sk_buff_head queue;
2020 int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
2021 int err;
2022
2023 if (flags & MSG_PROBE)
2024 return NULL;
2025
2026 __skb_queue_head_init(&queue);
2027
2028 cork->base.flags = 0;
2029 cork->base.addr = 0;
2030 cork->base.opt = NULL;
2031 cork->base.dst = NULL;
2032 v6_cork.opt = NULL;
2033 err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6);
2034 if (err) {
2035 ip6_cork_release(cork, &v6_cork);
2036 return ERR_PTR(err);
2037 }
2038 if (ipc6->dontfrag < 0)
2039 ipc6->dontfrag = inet6_sk(sk)->dontfrag;
2040
2041 err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork,
2042 &current->task_frag, getfrag, from,
2043 length + exthdrlen, transhdrlen + exthdrlen,
2044 flags, ipc6);
2045 if (err) {
2046 __ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
2047 return ERR_PTR(err);
2048 }
2049
2050 return __ip6_make_skb(sk, &queue, cork, &v6_cork);
2051 }
2052