/*
 * IPv6 output functions
 * Linux INET6 implementation
 *
 * Authors:
 * Pedro Roque <roque@di.fc.ul.pt>
 *
 * Based on linux/net/ipv4/ip_output.c
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 * Changes:
 * A.N.Kuznetsov : arithmetic in fragmentation.
 *                 extension headers are implemented.
 *                 route changes now work.
 *                 ip6_forward does not confuse sniffers.
 *                 etc.
 *
 * H. von Brand  : Added missing #include <linux/string.h>
 * Imran Patel   : frag id should be in NBO
 * Kazunori MIYAZAWA @USAGI
 *               : add ip6_append_data and related functions
 *                 for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>

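/* Final transmit step: handle multicast loopback and scope checks, then
 * resolve (or create) the neighbour entry for the next hop and hand the
 * skb to the neighbour output path.
 */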
static int ip6_finish_output2(struct sk_buff *skb)
{
        struct dst_entry *dst = skb_dst(skb);
        struct net_device *dev = dst->dev;
        struct neighbour *neigh;
        struct in6_addr *nexthop;
        int ret;

        skb->protocol = htons(ETH_P_IPV6);
        skb->dev = dev;

        if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
                struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

                if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
                    ((mroute6_socket(dev_net(dev), skb) &&
                      !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
                     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
                                         &ipv6_hdr(skb)->saddr))) {
                        struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

                        /* Do not check for IFF_ALLMULTI; multicast routing
                         * is not supported in any case.
                         */
                        if (newskb)
                                NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
                                        newskb, NULL, newskb->dev,
                                        dev_loopback_xmit);

                        if (ipv6_hdr(skb)->hop_limit == 0) {
                                IP6_INC_STATS(dev_net(dev), idev,
                                              IPSTATS_MIB_OUTDISCARDS);
                                kfree_skb(skb);
                                return 0;
                        }
                }

                IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
                                 skb->len);

                if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
                    IPV6_ADDR_SCOPE_NODELOCAL &&
                    !(dev->flags & IFF_LOOPBACK)) {
                        kfree_skb(skb);
                        return 0;
                }
        }

        rcu_read_lock_bh();
        nexthop = rt6_nexthop((struct rt6_info *)dst);
        neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
        if (unlikely(!neigh))
                neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
        if (!IS_ERR(neigh)) {
                ret = dst_neigh_output(dst, neigh, skb);
                rcu_read_unlock_bh();
                return ret;
        }
        rcu_read_unlock_bh();

        IP6_INC_STATS(dev_net(dst->dev),
                      ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
        kfree_skb(skb);
        return -EINVAL;
}

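/* Fragment if the packet exceeds the path MTU (and is not GSO), or if
 * the route or conntrack defrag state demands fragmentation; otherwise
 * transmit directly.
 */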
static int ip6_finish_output(struct sk_buff *skb)
{
        if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
            dst_allfrag(skb_dst(skb)) ||
            (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
                return ip6_fragment(skb, ip6_finish_output2);
        else
                return ip6_finish_output2(skb);
}

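/* Output entry point for locally generated packets after routing: drop
 * if IPv6 is administratively disabled on the egress device, otherwise
 * run the POST_ROUTING netfilter hook (skipped for rerouted skbs).
 */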
int ip6_output(struct sock *sk, struct sk_buff *skb)
{
        struct net_device *dev = skb_dst(skb)->dev;
        struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

        if (unlikely(idev->cnf.disable_ipv6)) {
                IP6_INC_STATS(dev_net(dev), idev,
                              IPSTATS_MIB_OUTDISCARDS);
                kfree_skb(skb);
                return 0;
        }

        return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
                            ip6_finish_output,
                            !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}

/*
 * xmit an sk_buff (used by TCP, SCTP and DCCP)
 */

int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
             struct ipv6_txoptions *opt, int tclass)
{
        struct net *net = sock_net(sk);
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct in6_addr *first_hop = &fl6->daddr;
        struct dst_entry *dst = skb_dst(skb);
        struct ipv6hdr *hdr;
        u8 proto = fl6->flowi6_proto;
        int seg_len = skb->len;
        int hlimit = -1;
        u32 mtu;

        if (opt) {
                unsigned int head_room;

                /* First: exthdrs may take lots of space (~8K for now).
                 * MAX_HEADER is not enough.
                 */
                head_room = opt->opt_nflen + opt->opt_flen;
                seg_len += head_room;
                head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

                if (skb_headroom(skb) < head_room) {
                        struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
                        if (skb2 == NULL) {
                                IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                                              IPSTATS_MIB_OUTDISCARDS);
                                kfree_skb(skb);
                                return -ENOBUFS;
                        }
                        consume_skb(skb);
                        skb = skb2;
                        skb_set_owner_w(skb, sk);
                }
                if (opt->opt_flen)
                        ipv6_push_frag_opts(skb, opt, &proto);
                if (opt->opt_nflen)
                        ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
        }

        skb_push(skb, sizeof(struct ipv6hdr));
        skb_reset_network_header(skb);
        hdr = ipv6_hdr(skb);

        /*
         * Fill in the IPv6 header
         */
        if (np)
                hlimit = np->hop_limit;
        if (hlimit < 0)
                hlimit = ip6_dst_hoplimit(dst);

        ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
                                                     np->autoflowlabel));

        hdr->payload_len = htons(seg_len);
        hdr->nexthdr = proto;
        hdr->hop_limit = hlimit;

        hdr->saddr = fl6->saddr;
        hdr->daddr = *first_hop;

        skb->protocol = htons(ETH_P_IPV6);
        skb->priority = sk->sk_priority;
        skb->mark = sk->sk_mark;

        mtu = dst_mtu(dst);
        if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
                IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
                                 IPSTATS_MIB_OUT, skb->len);
                return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
                               dst->dev, dst_output);
        }

        skb->dev = dst->dev;
        ipv6_local_error(sk, EMSGSIZE, fl6, mtu);
        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
        kfree_skb(skb);
        return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);

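/* Deliver a Router Alert packet to every raw socket registered for this
 * alert value via IPV6_ROUTER_ALERT. Returns 1 if the skb was consumed
 * by at least one socket, 0 otherwise.
 */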
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
        struct ip6_ra_chain *ra;
        struct sock *last = NULL;

        read_lock(&ip6_ra_lock);
        for (ra = ip6_ra_chain; ra; ra = ra->next) {
                struct sock *sk = ra->sk;
                if (sk && ra->sel == sel &&
                    (!sk->sk_bound_dev_if ||
                     sk->sk_bound_dev_if == skb->dev->ifindex)) {
                        if (last) {
                                struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
                                if (skb2)
                                        rawv6_rcv(last, skb2);
                        }
                        last = sk;
                }
        }

        if (last) {
                rawv6_rcv(last, skb);
                read_unlock(&ip6_ra_lock);
                return 1;
        }
        read_unlock(&ip6_ra_lock);
        return 0;
}

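/* Decide what to do with a packet addressed to an NDP-proxied address:
 * 1 means hand it to the input path (unicast neighbour discovery),
 * -1 means the sender was signalled and the packet must be dropped,
 * 0 means keep forwarding it.
 */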
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
        struct ipv6hdr *hdr = ipv6_hdr(skb);
        u8 nexthdr = hdr->nexthdr;
        __be16 frag_off;
        int offset;

        if (ipv6_ext_hdr(nexthdr)) {
                offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
                if (offset < 0)
                        return 0;
        } else
                offset = sizeof(struct ipv6hdr);

        if (nexthdr == IPPROTO_ICMPV6) {
                struct icmp6hdr *icmp6;

                if (!pskb_may_pull(skb, (skb_network_header(skb) +
                                         offset + 1 - skb->data)))
                        return 0;

                icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

                switch (icmp6->icmp6_type) {
                case NDISC_ROUTER_SOLICITATION:
                case NDISC_ROUTER_ADVERTISEMENT:
                case NDISC_NEIGHBOUR_SOLICITATION:
                case NDISC_NEIGHBOUR_ADVERTISEMENT:
                case NDISC_REDIRECT:
                        /* A unicast neighbour discovery message destined
                         * to the proxied address must be passed to the
                         * input function.
                         */
                        return 1;
                default:
                        break;
                }
        }

        /*
         * The proxying router can't forward traffic sent to a link-local
         * address, so signal the sender and discard the packet. This
         * behavior is clarified by the MIPv6 specification.
         */
        if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
                dst_link_failure(skb);
                return -1;
        }

        return 0;
}

static inline int ip6_forward_finish(struct sk_buff *skb)
{
        return dst_output(skb);
}

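/* MTU to use when forwarding over @dst: a locked RTAX_MTU route metric
 * takes precedence, otherwise fall back to the egress device's IPv6 MTU.
 */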
static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
{
        unsigned int mtu;
        struct inet6_dev *idev;

        if (dst_metric_locked(dst, RTAX_MTU)) {
                mtu = dst_metric_raw(dst, RTAX_MTU);
                if (mtu)
                        return mtu;
        }

        mtu = IPV6_MIN_MTU;
        rcu_read_lock();
        idev = __in6_dev_get(dst->dev);
        if (idev)
                mtu = idev->cnf.mtu6;
        rcu_read_unlock();

        return mtu;
}

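/* True if @skb cannot be sent over a path with the given @mtu without
 * fragmentation, taking conntrack defrag hints, ignore_df and GSO
 * segment geometry into account.
 */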
static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
{
        if (skb->len <= mtu)
                return false;

        /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
        if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
                return true;

        if (skb->ignore_df)
                return false;

        if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu)
                return false;

        return true;
}

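/*
 * Forward a packet that is not addressed to this host: check the hop
 * limit, NDP proxying and IPsec policy, emit redirects where
 * appropriate, enforce the path MTU and finally pass the skb to the
 * FORWARD netfilter hook.
 */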
int ip6_forward(struct sk_buff *skb)
{
        struct dst_entry *dst = skb_dst(skb);
        struct ipv6hdr *hdr = ipv6_hdr(skb);
        struct inet6_skb_parm *opt = IP6CB(skb);
        struct net *net = dev_net(dst->dev);
        u32 mtu;

        if (net->ipv6.devconf_all->forwarding == 0)
                goto error;

        if (skb->pkt_type != PACKET_HOST)
                goto drop;

        if (unlikely(skb->sk))
                goto drop;

        if (skb_warn_if_lro(skb))
                goto drop;

        if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
                IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
                                 IPSTATS_MIB_INDISCARDS);
                goto drop;
        }

        skb_forward_csum(skb);

        /*
         * We do NOT do any processing on RA packets; they are pushed
         * to user level AS IS, without any guarantee that an
         * application will be able to interpret them. The reason is
         * that we cannot do anything clever here.
         *
         * We are not an end node, so if the packet contains AH/ESP we
         * cannot do anything. Defragmentation would also be a mistake;
         * RA packets cannot be fragmented, because there is no
         * guarantee that different fragments will travel along one
         * path. --ANK
         */
        if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
                if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
                        return 0;
        }

        /*
         * check and decrement ttl
         */
        if (hdr->hop_limit <= 1) {
                /* Force OUTPUT device used as source address */
                skb->dev = dst->dev;
                icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
                IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
                                 IPSTATS_MIB_INHDRERRORS);

                kfree_skb(skb);
                return -ETIMEDOUT;
        }

        /* XXX: idev->cnf.proxy_ndp? */
        if (net->ipv6.devconf_all->proxy_ndp &&
            pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
                int proxied = ip6_forward_proxy_check(skb);
                if (proxied > 0)
                        return ip6_input(skb);
                else if (proxied < 0) {
                        IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
                                         IPSTATS_MIB_INDISCARDS);
                        goto drop;
                }
        }

        if (!xfrm6_route_forward(skb)) {
                IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
                                 IPSTATS_MIB_INDISCARDS);
                goto drop;
        }
        dst = skb_dst(skb);

        /* The IPv6 specs say nothing about it, but it is clear that we
         * cannot send redirects for source routed frames.
         * We don't send redirects for frames decapsulated from IPsec.
         */
        if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
                struct in6_addr *target = NULL;
                struct inet_peer *peer;
                struct rt6_info *rt;

                /*
                 * incoming and outgoing devices are the same:
                 * send a redirect.
                 */

                rt = (struct rt6_info *) dst;
                if (rt->rt6i_flags & RTF_GATEWAY)
                        target = &rt->rt6i_gateway;
                else
                        target = &hdr->daddr;

                peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1);

                /* Limit redirects both by destination (here)
                 * and by source (inside ndisc_send_redirect)
                 */
                if (inet_peer_xrlim_allow(peer, 1*HZ))
                        ndisc_send_redirect(skb, target);
                if (peer)
                        inet_putpeer(peer);
        } else {
                int addrtype = ipv6_addr_type(&hdr->saddr);

                /* This check is security critical. */
                if (addrtype == IPV6_ADDR_ANY ||
                    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
                        goto error;
                if (addrtype & IPV6_ADDR_LINKLOCAL) {
                        icmpv6_send(skb, ICMPV6_DEST_UNREACH,
                                    ICMPV6_NOT_NEIGHBOUR, 0);
                        goto error;
                }
        }

        mtu = ip6_dst_mtu_forward(dst);
        if (mtu < IPV6_MIN_MTU)
                mtu = IPV6_MIN_MTU;

        if (ip6_pkt_too_big(skb, mtu)) {
                /* Again, force OUTPUT device used as source address */
                skb->dev = dst->dev;
                icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
                IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
                                 IPSTATS_MIB_INTOOBIGERRORS);
                IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
                                 IPSTATS_MIB_FRAGFAILS);
                kfree_skb(skb);
                return -EMSGSIZE;
        }

        if (skb_cow(skb, dst->dev->hard_header_len)) {
                IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
                                 IPSTATS_MIB_OUTDISCARDS);
                goto drop;
        }

        hdr = ipv6_hdr(skb);

        /* Mangling the hop limit is delayed to this point, after skb COW */

        hdr->hop_limit--;

        IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
        IP6_ADD_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
        return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
                       ip6_forward_finish);

error:
        IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
        kfree_skb(skb);
        return -EINVAL;
}

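/* Propagate per-packet metadata (packet type, priority, dst, marks,
 * conntrack and security state) from the original skb to a fragment.
 */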
static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
        to->pkt_type = from->pkt_type;
        to->priority = from->priority;
        to->protocol = from->protocol;
        skb_dst_drop(to);
        skb_dst_set(to, dst_clone(skb_dst(from)));
        to->dev = from->dev;
        to->mark = from->mark;

#ifdef CONFIG_NET_SCHED
        to->tc_index = from->tc_index;
#endif
        nf_copy(to, from);
        skb_copy_secmark(to, from);
}

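/* Choose a fragment identification value. The ID is drawn from a
 * counter bucket selected by hashing the route's source and destination
 * addresses with a boot-time random key, making IDs hard to predict.
 */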
static void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt)
{
        static u32 ip6_idents_hashrnd __read_mostly;
        u32 hash, id;

        net_get_random_once(&ip6_idents_hashrnd, sizeof(ip6_idents_hashrnd));

        hash = __ipv6_addr_jhash(&rt->rt6i_dst.addr, ip6_idents_hashrnd);
        hash = __ipv6_addr_jhash(&rt->rt6i_src.addr, hash);

        id = ip_idents_reserve(hash, 1);
        fhdr->identification = htonl(id);
}

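/*
 * Split an oversized skb into fragments and transmit them via @output.
 * The fast path reuses an existing frag_list whose members already have
 * suitable geometry; the slow path copies the payload into newly
 * allocated fragment skbs.
 */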
int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
        struct sk_buff *frag;
        struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
        struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
                                inet6_sk(skb->sk) : NULL;
        struct ipv6hdr *tmp_hdr;
        struct frag_hdr *fh;
        unsigned int mtu, hlen, left, len;
        int hroom, troom;
        __be32 frag_id = 0;
        int ptr, offset = 0, err = 0;
        u8 *prevhdr, nexthdr = 0;
        struct net *net = dev_net(skb_dst(skb)->dev);

        err = ip6_find_1stfragopt(skb, &prevhdr);
        if (err < 0)
                goto fail;
        hlen = err;
        nexthdr = *prevhdr;

        mtu = ip6_skb_dst_mtu(skb);

        /* We must not fragment if the socket is set to force MTU discovery
         * or if the skb is not generated by a local socket.
         */
        if (unlikely(!skb->ignore_df && skb->len > mtu) ||
                     (IP6CB(skb)->frag_max_size &&
                      IP6CB(skb)->frag_max_size > mtu)) {
                if (skb->sk && dst_allfrag(skb_dst(skb)))
                        sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);

                skb->dev = skb_dst(skb)->dev;
                icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
                IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                              IPSTATS_MIB_FRAGFAILS);
                kfree_skb(skb);
                return -EMSGSIZE;
        }

        if (np && np->frag_size < mtu) {
                if (np->frag_size)
                        mtu = np->frag_size;
        }
        mtu -= hlen + sizeof(struct frag_hdr);

        if (skb_has_frag_list(skb)) {
                int first_len = skb_pagelen(skb);
                struct sk_buff *frag2;

                if (first_len - hlen > mtu ||
                    ((first_len - hlen) & 7) ||
                    skb_cloned(skb))
                        goto slow_path;

                skb_walk_frags(skb, frag) {
                        /* Correct geometry. */
                        if (frag->len > mtu ||
                            ((frag->len & 7) && frag->next) ||
                            skb_headroom(frag) < hlen)
                                goto slow_path_clean;

                        /* Partially cloned skb? */
                        if (skb_shared(frag))
                                goto slow_path_clean;

                        BUG_ON(frag->sk);
                        if (skb->sk) {
                                frag->sk = skb->sk;
                                frag->destructor = sock_wfree;
                        }
                        skb->truesize -= frag->truesize;
                }

                err = 0;
                offset = 0;
                frag = skb_shinfo(skb)->frag_list;
                skb_frag_list_init(skb);
                /* BUILD HEADER */

                *prevhdr = NEXTHDR_FRAGMENT;
                tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
                if (!tmp_hdr) {
                        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                                      IPSTATS_MIB_FRAGFAILS);
                        return -ENOMEM;
                }

                __skb_pull(skb, hlen);
                fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr));
                __skb_push(skb, hlen);
                skb_reset_network_header(skb);
                memcpy(skb_network_header(skb), tmp_hdr, hlen);

                ipv6_select_ident(fh, rt);
                fh->nexthdr = nexthdr;
                fh->reserved = 0;
                fh->frag_off = htons(IP6_MF);
                frag_id = fh->identification;

                first_len = skb_pagelen(skb);
                skb->data_len = first_len - skb_headlen(skb);
                skb->len = first_len;
                ipv6_hdr(skb)->payload_len = htons(first_len -
                                                   sizeof(struct ipv6hdr));

                dst_hold(&rt->dst);

                for (;;) {
                        /* Prepare header of the next frame,
                         * before previous one went down. */
                        if (frag) {
                                frag->ip_summed = CHECKSUM_NONE;
                                skb_reset_transport_header(frag);
                                fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr));
                                __skb_push(frag, hlen);
                                skb_reset_network_header(frag);
                                memcpy(skb_network_header(frag), tmp_hdr,
                                       hlen);
                                offset += skb->len - hlen - sizeof(struct frag_hdr);
                                fh->nexthdr = nexthdr;
                                fh->reserved = 0;
                                fh->frag_off = htons(offset);
                                if (frag->next != NULL)
                                        fh->frag_off |= htons(IP6_MF);
                                fh->identification = frag_id;
                                ipv6_hdr(frag)->payload_len =
                                                htons(frag->len -
                                                      sizeof(struct ipv6hdr));
                                ip6_copy_metadata(frag, skb);
                        }

                        err = output(skb);
                        if (!err)
                                IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
                                              IPSTATS_MIB_FRAGCREATES);

                        if (err || !frag)
                                break;

                        skb = frag;
                        frag = skb->next;
                        skb->next = NULL;
                }

                kfree(tmp_hdr);

                if (err == 0) {
                        IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
                                      IPSTATS_MIB_FRAGOKS);
                        ip6_rt_put(rt);
                        return 0;
                }

                kfree_skb_list(frag);

                IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
                              IPSTATS_MIB_FRAGFAILS);
                ip6_rt_put(rt);
                return err;

slow_path_clean:
                skb_walk_frags(skb, frag2) {
                        if (frag2 == frag)
                                break;
                        frag2->sk = NULL;
                        frag2->destructor = NULL;
                        skb->truesize += frag2->truesize;
                }
        }

slow_path:
        if ((skb->ip_summed == CHECKSUM_PARTIAL) &&
            skb_checksum_help(skb))
                goto fail;

        left = skb->len - hlen;         /* Space per frame */
        ptr = hlen;                     /* Where to start from */

        /*
         * Fragment the datagram.
         */

        *prevhdr = NEXTHDR_FRAGMENT;
        hroom = LL_RESERVED_SPACE(rt->dst.dev);
        troom = rt->dst.dev->needed_tailroom;

        /*
         * Keep copying data until we run out.
         */
        while (left > 0) {
                len = left;
                /* IF: it doesn't fit, use 'mtu' - the data space left */
                if (len > mtu)
                        len = mtu;
                /* IF: we are not sending up to and including the packet end
                   then align the next start on an eight byte boundary */
                if (len < left) {
                        len &= ~7;
                }
                /*
                 * Allocate buffer.
                 */

                if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
                                      hroom + troom, GFP_ATOMIC)) == NULL) {
                        NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
                        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                                      IPSTATS_MIB_FRAGFAILS);
                        err = -ENOMEM;
                        goto fail;
                }

                /*
                 * Set up data on packet
                 */

                ip6_copy_metadata(frag, skb);
                skb_reserve(frag, hroom);
                skb_put(frag, len + hlen + sizeof(struct frag_hdr));
                skb_reset_network_header(frag);
                fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
                frag->transport_header = (frag->network_header + hlen +
                                          sizeof(struct frag_hdr));

                /*
                 * Charge the memory for the fragment to any owner
                 * it might possess
                 */
                if (skb->sk)
                        skb_set_owner_w(frag, skb->sk);

                /*
                 * Copy the packet header into the new buffer.
                 */
                skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

                /*
                 * Build fragment header.
                 */
                fh->nexthdr = nexthdr;
                fh->reserved = 0;
                if (!frag_id) {
                        ipv6_select_ident(fh, rt);
                        frag_id = fh->identification;
                } else
                        fh->identification = frag_id;

                /*
                 * Copy a block of the IP datagram.
                 */
                BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
                                     len));
                left -= len;

                fh->frag_off = htons(offset);
                if (left > 0)
                        fh->frag_off |= htons(IP6_MF);
                ipv6_hdr(frag)->payload_len = htons(frag->len -
                                                    sizeof(struct ipv6hdr));

                ptr += len;
                offset += len;

                /*
                 * Put this fragment into the sending queue.
                 */
                err = output(frag);
                if (err)
                        goto fail;

                IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                              IPSTATS_MIB_FRAGCREATES);
        }
        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                      IPSTATS_MIB_FRAGOKS);
        consume_skb(skb);
        return err;

fail:
        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                      IPSTATS_MIB_FRAGFAILS);
        kfree_skb(skb);
        return err;
}

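/* Nonzero when the cached route cannot be relied on for @fl_addr: it is
 * not a matching host route and the saved destination differs.
 */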
static inline int ip6_rt_check(const struct rt6key *rt_key,
                               const struct in6_addr *fl_addr,
                               const struct in6_addr *addr_cache)
{
        return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
               (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
}

static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
                                          struct dst_entry *dst,
                                          const struct flowi6 *fl6)
{
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct rt6_info *rt;

        if (!dst)
                goto out;

        if (dst->ops->family != AF_INET6) {
                dst_release(dst);
                return NULL;
        }

        rt = (struct rt6_info *)dst;
        /* Checking route validity in the unconnected case is not very
         * simple. Take into account that we do not support routing by
         * source, TOS, and MSG_DONTROUTE --ANK (980726)
         *
         * 1. ip6_rt_check(): If the route was a host route,
         *    check that the cached destination is current.
         *    If it is a network route, we still may
         *    check its validity using the saved pointer
         *    to the last used address: daddr_cache.
         *    We do not want to save the whole address now
         *    (because the main consumer of this service
         *    is tcp, which does not have this problem),
         *    so the last trick works only on connected
         *    sockets.
         * 2. oif also should be the same.
         */
        if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
            ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
            (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
                dst_release(dst);
                dst = NULL;
        }

out:
        return dst;
}

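/* Core of the dst lookup: resolve a route for @fl6, pick a source
 * address if none was given, and (with optimistic DAD configured) fall
 * back to the default router when the chosen source is still optimistic.
 */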
static int ip6_dst_lookup_tail(struct sock *sk,
                               struct dst_entry **dst, struct flowi6 *fl6)
{
        struct net *net = sock_net(sk);
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
        struct neighbour *n;
        struct rt6_info *rt;
#endif
        int err;

        if (*dst == NULL)
                *dst = ip6_route_output(net, sk, fl6);

        if ((err = (*dst)->error))
                goto out_err_release;

        if (ipv6_addr_any(&fl6->saddr)) {
                struct rt6_info *rt = (struct rt6_info *) *dst;
                err = ip6_route_get_saddr(net, rt, &fl6->daddr,
                                          sk ? inet6_sk(sk)->srcprefs : 0,
                                          &fl6->saddr);
                if (err)
                        goto out_err_release;
        }

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
        /*
         * If the dst entry we've looked up has a neighbour entry that
         * is in the INCOMPLETE state and the source address from the
         * flow is marked as OPTIMISTIC, we release the found dst entry
         * and replace it with the dst entry of the nexthop router.
         */
        rt = (struct rt6_info *) *dst;
        rcu_read_lock_bh();
        n = __ipv6_neigh_lookup_noref(rt->dst.dev, rt6_nexthop(rt));
        err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
        rcu_read_unlock_bh();

        if (err) {
                struct inet6_ifaddr *ifp;
                struct flowi6 fl_gw6;
                int redirect;

                ifp = ipv6_get_ifaddr(net, &fl6->saddr,
                                      (*dst)->dev, 1);

                redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
                if (ifp)
                        in6_ifa_put(ifp);

                if (redirect) {
                        /*
                         * We need to get the dst entry for the
                         * default router instead
                         */
                        dst_release(*dst);
                        memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
                        memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
                        *dst = ip6_route_output(net, sk, &fl_gw6);
                        if ((err = (*dst)->error))
                                goto out_err_release;
                }
        }
#endif
        if (ipv6_addr_v4mapped(&fl6->saddr) &&
            !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
                err = -EAFNOSUPPORT;
                goto out_err_release;
        }

        return 0;

out_err_release:
        if (err == -ENETUNREACH)
                IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
        dst_release(*dst);
        *dst = NULL;
        return err;
}

/**
 * ip6_dst_lookup - perform route lookup on flow
 * @sk: socket which provides route info
 * @dst: pointer to dst_entry * for result
 * @fl6: flow to lookup
 *
 * This function performs a route lookup on the given flow.
 *
 * It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
{
        *dst = NULL;
        return ip6_dst_lookup_tail(sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 * ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 * @sk: socket which provides route info
 * @fl6: flow to lookup
 * @final_dst: final destination address for ipsec lookup
 *
 * This function performs a route lookup on the given flow.
 *
 * It returns a valid dst pointer on success, or a pointer encoded
 * error code.
 */
struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
                                      const struct in6_addr *final_dst)
{
        struct dst_entry *dst = NULL;
        int err;

        err = ip6_dst_lookup_tail(sk, &dst, fl6);
        if (err)
                return ERR_PTR(err);
        if (final_dst)
                fl6->daddr = *final_dst;

        return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);

/**
 * ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 * @sk: socket which provides the dst cache and route info
 * @fl6: flow to lookup
 * @final_dst: final destination address for ipsec lookup
 *
 * This function performs a route lookup on the given flow with the
 * possibility of using the cached route in the socket if it is valid.
 * It will take the socket dst lock when operating on the dst cache.
 * As a result, this function can only be used in process context.
 *
 * It returns a valid dst pointer on success, or a pointer encoded
 * error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
                                         const struct in6_addr *final_dst)
{
        struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);

        dst = ip6_sk_dst_check(sk, dst, fl6);
        if (!dst)
                dst = ip6_dst_lookup_flow(sk, fl6, final_dst);

        return dst;
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);

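/* Append data for UDP fragmentation offload (UFO): queue one large GSO
 * skb and let the device (or the software GSO layer) segment it into
 * properly sized fragments on transmit.
 */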
static inline int ip6_ufo_append_data(struct sock *sk,
                        int getfrag(void *from, char *to, int offset, int len,
                                    int odd, struct sk_buff *skb),
                        void *from, int length, int hh_len, int fragheaderlen,
                        int transhdrlen, int mtu, unsigned int flags,
                        struct rt6_info *rt)
{
        struct sk_buff *skb;
        struct frag_hdr fhdr;
        int err;

        /* The network device supports UDP large send offload, so build
         * one single skb containing the complete UDP datagram.
         */
        if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
                skb = sock_alloc_send_skb(sk,
                        hh_len + fragheaderlen + transhdrlen + 20,
                        (flags & MSG_DONTWAIT), &err);
                if (skb == NULL)
                        return err;

                /* reserve space for Hardware header */
                skb_reserve(skb, hh_len);

                /* create space for UDP/IP header */
                skb_put(skb, fragheaderlen + transhdrlen);

                /* initialize network header pointer */
                skb_reset_network_header(skb);

                /* initialize protocol header pointer */
                skb->transport_header = skb->network_header + fragheaderlen;

                skb->protocol = htons(ETH_P_IPV6);
                skb->csum = 0;

                __skb_queue_tail(&sk->sk_write_queue, skb);
        } else if (skb_is_gso(skb)) {
                goto append;
        }

        skb->ip_summed = CHECKSUM_PARTIAL;
        /* Specify the length of each IPv6 datagram fragment.
         * It has to be a multiple of 8.
         */
        skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
                                     sizeof(struct frag_hdr)) & ~7;
        skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
        ipv6_select_ident(&fhdr, rt);
        skb_shinfo(skb)->ip6_frag_id = fhdr.identification;

append:
        return skb_append_datato_frags(sk, skb, getfrag, from,
                                       (length - transhdrlen));
}

static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
                                               gfp_t gfp)
{
        return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
                                                gfp_t gfp)
{
        return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

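/* Recompute mtu/maxfraglen for the next fragment-to-be: only the first
 * fragment has to reserve the dst's header_len.
 */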
static void ip6_append_data_mtu(unsigned int *mtu,
                                int *maxfraglen,
                                unsigned int fragheaderlen,
                                struct sk_buff *skb,
                                struct rt6_info *rt,
                                unsigned int orig_mtu)
{
        if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
                if (skb == NULL) {
                        /* first fragment, reserve header_len */
                        *mtu = orig_mtu - rt->dst.header_len;

                } else {
                        /*
                         * this fragment is not first, the headers
                         * space is regarded as data space.
                         */
                        *mtu = orig_mtu;
                }
                *maxfraglen = ((*mtu - fragheaderlen) & ~7)
                              + fragheaderlen - sizeof(struct frag_hdr);
        }
}

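/*
 * Queue data on the socket's write queue for later transmission by
 * ip6_push_pending_frames(). The first call sets up cork state (copied
 * extension headers, dst, flow, fragment size); subsequent calls fill
 * the tail skb or allocate new ones, keeping each fragment-to-be
 * aligned to the path MTU.
 */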
int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
                    int offset, int len, int odd, struct sk_buff *skb),
                    void *from, int length, int transhdrlen,
                    int hlimit, int tclass, struct ipv6_txoptions *opt,
                    struct flowi6 *fl6, struct rt6_info *rt,
                    unsigned int flags, int dontfrag)
{
        struct inet_sock *inet = inet_sk(sk);
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct inet_cork *cork;
        struct sk_buff *skb, *skb_prev = NULL;
        unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu;
        int exthdrlen;
        int dst_exthdrlen;
        int hh_len;
        int copy;
        int err;
        int offset = 0;
        __u8 tx_flags = 0;
        u32 tskey = 0;

        if (flags&MSG_PROBE)
                return 0;
        cork = &inet->cork.base;
        if (skb_queue_empty(&sk->sk_write_queue)) {
                /*
                 * setup for corking
                 */
                if (opt) {
                        if (WARN_ON(np->cork.opt))
                                return -EINVAL;

                        np->cork.opt = kzalloc(sizeof(*opt), sk->sk_allocation);
                        if (unlikely(np->cork.opt == NULL))
                                return -ENOBUFS;

                        np->cork.opt->tot_len = sizeof(*opt);
                        np->cork.opt->opt_flen = opt->opt_flen;
                        np->cork.opt->opt_nflen = opt->opt_nflen;

                        np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
                                                            sk->sk_allocation);
                        if (opt->dst0opt && !np->cork.opt->dst0opt)
                                return -ENOBUFS;

                        np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
                                                            sk->sk_allocation);
                        if (opt->dst1opt && !np->cork.opt->dst1opt)
                                return -ENOBUFS;

                        np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
                                                           sk->sk_allocation);
                        if (opt->hopopt && !np->cork.opt->hopopt)
                                return -ENOBUFS;

                        np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
                                                            sk->sk_allocation);
                        if (opt->srcrt && !np->cork.opt->srcrt)
                                return -ENOBUFS;

                        /* need source address above --miyazawa */
                }
                dst_hold(&rt->dst);
                cork->dst = &rt->dst;
                inet->cork.fl.u.ip6 = *fl6;
                np->cork.hop_limit = hlimit;
                np->cork.tclass = tclass;
                if (rt->dst.flags & DST_XFRM_TUNNEL)
                        mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
                              READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
                else
                        mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
                              READ_ONCE(rt->dst.dev->mtu) : dst_mtu(rt->dst.path);
                if (np->frag_size < mtu) {
                        if (np->frag_size)
                                mtu = np->frag_size;
                }
                if (mtu < IPV6_MIN_MTU)
                        return -EINVAL;
                cork->fragsize = mtu;
                if (dst_allfrag(rt->dst.path))
                        cork->flags |= IPCORK_ALLFRAG;
                cork->length = 0;
                exthdrlen = (opt ? opt->opt_flen : 0);
                length += exthdrlen;
                transhdrlen += exthdrlen;
                dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
        } else {
                rt = (struct rt6_info *)cork->dst;
                fl6 = &inet->cork.fl.u.ip6;
                opt = np->cork.opt;
                transhdrlen = 0;
                exthdrlen = 0;
                dst_exthdrlen = 0;
                mtu = cork->fragsize;
        }
        orig_mtu = mtu;

        hh_len = LL_RESERVED_SPACE(rt->dst.dev);

        fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
                        (opt ? opt->opt_nflen : 0);
        maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
                     sizeof(struct frag_hdr);

        if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
                unsigned int maxnonfragsize, headersize;

                headersize = sizeof(struct ipv6hdr) +
                             (opt ? opt->opt_flen + opt->opt_nflen : 0) +
                             (dst_allfrag(&rt->dst) ?
                              sizeof(struct frag_hdr) : 0) +
                             rt->rt6i_nfheader_len;

                if (ip6_sk_ignore_df(sk))
                        maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
                else
                        maxnonfragsize = mtu;

                /* dontfrag active */
                if ((cork->length + length > mtu - headersize) && dontfrag &&
                    (sk->sk_protocol == IPPROTO_UDP ||
                     sk->sk_protocol == IPPROTO_RAW)) {
                        ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
                                          sizeof(struct ipv6hdr));
                        goto emsgsize;
                }

                if (cork->length + length > maxnonfragsize - headersize) {
emsgsize:
                        ipv6_local_error(sk, EMSGSIZE, fl6,
                                         mtu - headersize +
                                         sizeof(struct ipv6hdr));
                        return -EMSGSIZE;
                }
        }

        if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) {
                sock_tx_timestamp(sk, &tx_flags);
                if (tx_flags & SKBTX_ANY_SW_TSTAMP &&
                    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
                        tskey = sk->sk_tskey++;
        }

        /*
         * Let's try using as much space as possible.
         * Use MTU if total length of the message fits into the MTU.
         * Otherwise, we need to reserve fragment header and
         * fragment alignment (= 8-15 octets, in total).
         *
         * Note that we may need to "move" the data from the tail
         * of the buffer to the new fragment when we split
         * the message.
         *
         * FIXME: It may be fragmented into multiple chunks
         *        at once if non-fragmentable extension headers
         *        are too large.
         * --yoshfuji
         */

        skb = skb_peek_tail(&sk->sk_write_queue);
        cork->length += length;
        if ((skb && skb_is_gso(skb)) ||
            (((length + fragheaderlen) > mtu) &&
             (skb_queue_len(&sk->sk_write_queue) <= 1) &&
             (sk->sk_protocol == IPPROTO_UDP) &&
             (rt->dst.dev->features & NETIF_F_UFO) &&
             (sk->sk_type == SOCK_DGRAM))) {
                err = ip6_ufo_append_data(sk, getfrag, from, length,
                                          hh_len, fragheaderlen,
                                          transhdrlen, mtu, flags, rt);
                if (err)
                        goto error;
                return 0;
        }

        if (!skb)
                goto alloc_new_skb;

        while (length > 0) {
                /* Check if the remaining data fits into current packet. */
                copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
                if (copy < length)
                        copy = maxfraglen - skb->len;

                if (copy <= 0) {
                        char *data;
                        unsigned int datalen;
                        unsigned int fraglen;
                        unsigned int fraggap;
                        unsigned int alloclen;
alloc_new_skb:
                        /* There's no room in the current skb */
                        if (skb)
                                fraggap = skb->len - maxfraglen;
                        else
                                fraggap = 0;
                        /* update mtu and maxfraglen if necessary */
                        if (skb == NULL || skb_prev == NULL)
                                ip6_append_data_mtu(&mtu, &maxfraglen,
                                                    fragheaderlen, skb, rt,
                                                    orig_mtu);

                        skb_prev = skb;

                        /*
                         * If remaining data exceeds the mtu,
                         * we know we need more fragment(s).
                         */
                        datalen = length + fraggap;

                        if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
                                datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
                        if ((flags & MSG_MORE) &&
                            !(rt->dst.dev->features&NETIF_F_SG))
                                alloclen = mtu;
                        else
                                alloclen = datalen + fragheaderlen;

                        alloclen += dst_exthdrlen;

                        if (datalen != length + fraggap) {
                                /*
                                 * this is not the last fragment, the trailer
                                 * space is regarded as data space.
                                 */
                                datalen += rt->dst.trailer_len;
                        }

                        alloclen += rt->dst.trailer_len;
                        fraglen = datalen + fragheaderlen;

                        /*
                         * We just reserve space for fragment header.
                         * Note: this may be overallocation if the message
                         * (without MSG_MORE) fits into the MTU.
                         */
                        alloclen += sizeof(struct frag_hdr);

                        copy = datalen - transhdrlen - fraggap;
                        if (copy < 0) {
                                err = -EINVAL;
                                goto error;
                        }
                        if (transhdrlen) {
                                skb = sock_alloc_send_skb(sk,
                                                alloclen + hh_len,
                                                (flags & MSG_DONTWAIT), &err);
                        } else {
                                skb = NULL;
                                if (atomic_read(&sk->sk_wmem_alloc) <=
                                    2 * sk->sk_sndbuf)
                                        skb = sock_wmalloc(sk,
                                                           alloclen + hh_len, 1,
                                                           sk->sk_allocation);
                                if (unlikely(skb == NULL))
                                        err = -ENOBUFS;
                        }
                        if (skb == NULL)
                                goto error;
                        /*
                         * Fill in the control structures
                         */
                        skb->protocol = htons(ETH_P_IPV6);
                        skb->ip_summed = CHECKSUM_NONE;
                        skb->csum = 0;
                        /* reserve for fragmentation and ipsec header */
                        skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
                                    dst_exthdrlen);

                        /* Only the initial fragment is time stamped */
                        skb_shinfo(skb)->tx_flags = tx_flags;
                        tx_flags = 0;
                        skb_shinfo(skb)->tskey = tskey;
                        tskey = 0;

                        /*
                         * Find where to start putting bytes
                         */
                        data = skb_put(skb, fraglen);
                        skb_set_network_header(skb, exthdrlen);
                        data += fragheaderlen;
                        skb->transport_header = (skb->network_header +
                                                 fragheaderlen);
                        if (fraggap) {
                                skb->csum = skb_copy_and_csum_bits(
                                        skb_prev, maxfraglen,
                                        data + transhdrlen, fraggap, 0);
                                skb_prev->csum = csum_sub(skb_prev->csum,
                                                          skb->csum);
                                data += fraggap;
                                pskb_trim_unique(skb_prev, maxfraglen);
                        }
                        if (copy > 0 &&
                            getfrag(from, data + transhdrlen, offset,
                                    copy, fraggap, skb) < 0) {
                                err = -EFAULT;
                                kfree_skb(skb);
                                goto error;
                        }

                        offset += copy;
                        length -= datalen - fraggap;
                        transhdrlen = 0;
                        exthdrlen = 0;
                        dst_exthdrlen = 0;

                        /*
                         * Put the packet on the pending queue
                         */
                        __skb_queue_tail(&sk->sk_write_queue, skb);
                        continue;
                }

                if (copy > length)
                        copy = length;

                if (!(rt->dst.dev->features&NETIF_F_SG)) {
                        unsigned int off;

                        off = skb->len;
                        if (getfrag(from, skb_put(skb, copy),
                                    offset, copy, off, skb) < 0) {
                                __skb_trim(skb, off);
                                err = -EFAULT;
                                goto error;
                        }
                } else {
                        int i = skb_shinfo(skb)->nr_frags;
                        struct page_frag *pfrag = sk_page_frag(sk);

                        err = -ENOMEM;
                        if (!sk_page_frag_refill(sk, pfrag))
                                goto error;

                        if (!skb_can_coalesce(skb, i, pfrag->page,
                                              pfrag->offset)) {
                                err = -EMSGSIZE;
                                if (i == MAX_SKB_FRAGS)
                                        goto error;

                                __skb_fill_page_desc(skb, i, pfrag->page,
                                                     pfrag->offset, 0);
                                skb_shinfo(skb)->nr_frags = ++i;
                                get_page(pfrag->page);
                        }
                        copy = min_t(int, copy, pfrag->size - pfrag->offset);
                        if (getfrag(from,
                                    page_address(pfrag->page) + pfrag->offset,
                                    offset, copy, skb->len, skb) < 0)
                                goto error_efault;

                        pfrag->offset += copy;
                        skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
                        skb->len += copy;
                        skb->data_len += copy;
                        skb->truesize += copy;
                        atomic_add(copy, &sk->sk_wmem_alloc);
                }
                offset += copy;
                length -= copy;
        }

        return 0;

error_efault:
        err = -EFAULT;
error:
        cork->length -= length;
        IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
        return err;
}
EXPORT_SYMBOL_GPL(ip6_append_data);

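/* Release cork state: the duplicated extension headers, the cached dst
 * and the saved flow.
 */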
static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
{
        if (np->cork.opt) {
                kfree(np->cork.opt->dst0opt);
                kfree(np->cork.opt->dst1opt);
                kfree(np->cork.opt->hopopt);
                kfree(np->cork.opt->srcrt);
                kfree(np->cork.opt);
                np->cork.opt = NULL;
        }

        if (inet->cork.base.dst) {
                dst_release(inet->cork.base.dst);
                inet->cork.base.dst = NULL;
                inet->cork.base.flags &= ~IPCORK_ALLFRAG;
        }
        memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
}

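/* Collapse the pending write queue into one skb with a frag_list,
 * prepend the extension headers and the IPv6 header, and send it via
 * ip6_local_out().
 */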
int ip6_push_pending_frames(struct sock *sk)
{
        struct sk_buff *skb, *tmp_skb;
        struct sk_buff **tail_skb;
        struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
        struct inet_sock *inet = inet_sk(sk);
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct net *net = sock_net(sk);
        struct ipv6hdr *hdr;
        struct ipv6_txoptions *opt = np->cork.opt;
        struct rt6_info *rt = (struct rt6_info *)inet->cork.base.dst;
        struct flowi6 *fl6 = &inet->cork.fl.u.ip6;
        unsigned char proto = fl6->flowi6_proto;
        int err = 0;

        if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
                goto out;
        tail_skb = &(skb_shinfo(skb)->frag_list);

        /* move skb->data to ip header from ext header */
        if (skb->data < skb_network_header(skb))
                __skb_pull(skb, skb_network_offset(skb));
        while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
                __skb_pull(tmp_skb, skb_network_header_len(skb));
                *tail_skb = tmp_skb;
                tail_skb = &(tmp_skb->next);
                skb->len += tmp_skb->len;
                skb->data_len += tmp_skb->len;
                skb->truesize += tmp_skb->truesize;
                tmp_skb->destructor = NULL;
                tmp_skb->sk = NULL;
        }

        /* Allow local fragmentation. */
        skb->ignore_df = ip6_sk_ignore_df(sk);

        *final_dst = fl6->daddr;
        __skb_pull(skb, skb_network_header_len(skb));
        if (opt && opt->opt_flen)
                ipv6_push_frag_opts(skb, opt, &proto);
        if (opt && opt->opt_nflen)
                ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

        skb_push(skb, sizeof(struct ipv6hdr));
        skb_reset_network_header(skb);
        hdr = ipv6_hdr(skb);

        ip6_flow_hdr(hdr, np->cork.tclass,
                     ip6_make_flowlabel(net, skb, fl6->flowlabel,
                                        np->autoflowlabel));
        hdr->hop_limit = np->cork.hop_limit;
        hdr->nexthdr = proto;
        hdr->saddr = fl6->saddr;
        hdr->daddr = *final_dst;

        skb->priority = sk->sk_priority;
        skb->mark = sk->sk_mark;

        skb_dst_set(skb, dst_clone(&rt->dst));
        IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
        if (proto == IPPROTO_ICMPV6) {
                struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

                ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
                ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
        }

        err = ip6_local_out(skb);
        if (err) {
                if (err > 0)
                        err = net_xmit_errno(err);
                if (err)
                        goto error;
        }

out:
        ip6_cork_release(inet, np);
        return err;
error:
        IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
        goto out;
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);

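/* Error-path counterpart of ip6_push_pending_frames(): discard all
 * queued data and release cork state.
 */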
void ip6_flush_pending_frames(struct sock *sk)
{
        struct sk_buff *skb;

        while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
                if (skb_dst(skb))
                        IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
                                      IPSTATS_MIB_OUTDISCARDS);
                kfree_skb(skb);
        }

        ip6_cork_release(inet_sk(sk), inet6_sk(sk));
}
EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);