1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * IPv6 output functions
4 * Linux INET6 implementation
5 *
6 * Authors:
7 * Pedro Roque <roque@di.fc.ul.pt>
8 *
9 * Based on linux/net/ipv4/ip_output.c
10 *
11 * Changes:
12 * A.N.Kuznetsov : arithmetic in fragmentation.
13 * extension headers are implemented.
14 * route changes now work.
15 * ip6_forward does not confuse sniffers.
16 * etc.
17 *
18 * H. von Brand : Added missing #include <linux/string.h>
19 * Imran Patel : frag id should be in NBO
20 * Kazunori MIYAZAWA @USAGI
21 * : add ip6_append_data and related functions
22 * for datagram xmit
23 */
24
25 #include <linux/errno.h>
26 #include <linux/kernel.h>
27 #include <linux/string.h>
28 #include <linux/socket.h>
29 #include <linux/net.h>
30 #include <linux/netdevice.h>
31 #include <linux/if_arp.h>
32 #include <linux/in6.h>
33 #include <linux/tcp.h>
34 #include <linux/route.h>
35 #include <linux/module.h>
36 #include <linux/slab.h>
37
38 #include <linux/bpf-cgroup.h>
39 #include <linux/netfilter.h>
40 #include <linux/netfilter_ipv6.h>
41
42 #include <net/sock.h>
43 #include <net/snmp.h>
44
45 #include <net/ipv6.h>
46 #include <net/ndisc.h>
47 #include <net/protocol.h>
48 #include <net/ip6_route.h>
49 #include <net/addrconf.h>
50 #include <net/rawv6.h>
51 #include <net/icmp.h>
52 #include <net/xfrm.h>
53 #include <net/checksum.h>
54 #include <linux/mroute6.h>
55 #include <net/l3mdev.h>
56 #include <net/lwtunnel.h>
57 #include <net/ip_tunnels.h>
58
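/* Final transmit step: make sure the skb has enough headroom for the
 * link-layer header, handle multicast loopback and scope checks, honour
 * lwtunnel redirects, then resolve the next-hop neighbour and hand the
 * packet to neigh_output().
 */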
59 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
60 {
61 struct dst_entry *dst = skb_dst(skb);
62 struct net_device *dev = dst->dev;
63 unsigned int hh_len = LL_RESERVED_SPACE(dev);
64 int delta = hh_len - skb_headroom(skb);
65 const struct in6_addr *nexthop;
66 struct neighbour *neigh;
67 int ret;
68
69 /* Be paranoid, rather than too clever. */
70 if (unlikely(delta > 0) && dev->header_ops) {
71 /* pskb_expand_head() might crash, if skb is shared */
72 if (skb_shared(skb)) {
73 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
74
75 if (likely(nskb)) {
76 if (skb->sk)
77 skb_set_owner_w(nskb, skb->sk);
78 consume_skb(skb);
79 } else {
80 kfree_skb(skb);
81 }
82 skb = nskb;
83 }
84 if (skb &&
85 pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {
86 kfree_skb(skb);
87 skb = NULL;
88 }
89 if (!skb) {
90 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
91 return -ENOMEM;
92 }
93 }
94
95 if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
96 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
97
98 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
99 ((mroute6_is_socket(net, skb) &&
100 !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
101 ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
102 &ipv6_hdr(skb)->saddr))) {
103 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
104
105 /* Do not check for IFF_ALLMULTI; multicast routing
106 is not supported in any case.
107 */
108 if (newskb)
109 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
110 net, sk, newskb, NULL, newskb->dev,
111 dev_loopback_xmit);
112
113 if (ipv6_hdr(skb)->hop_limit == 0) {
114 IP6_INC_STATS(net, idev,
115 IPSTATS_MIB_OUTDISCARDS);
116 kfree_skb(skb);
117 return 0;
118 }
119 }
120
121 IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
122
123 if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
124 IPV6_ADDR_SCOPE_NODELOCAL &&
125 !(dev->flags & IFF_LOOPBACK)) {
126 kfree_skb(skb);
127 return 0;
128 }
129 }
130
131 if (lwtunnel_xmit_redirect(dst->lwtstate)) {
132 int res = lwtunnel_xmit(skb);
133
134 if (res != LWTUNNEL_XMIT_CONTINUE)
135 return res;
136 }
137
138 rcu_read_lock_bh();
139 nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
140 neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
141 if (unlikely(!neigh))
142 neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
143 if (!IS_ERR(neigh)) {
144 sock_confirm_neigh(skb, neigh);
145 ret = neigh_output(neigh, skb, false);
146 rcu_read_unlock_bh();
147 return ret;
148 }
149 rcu_read_unlock_bh();
150
151 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
152 kfree_skb(skb);
153 return -EINVAL;
154 }
155
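/* Software-segment a GSO packet whose segments do not fit the egress MTU
 * and transmit each segment, fragmenting any segment that is still too big.
 */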
156 static int
157 ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
158 struct sk_buff *skb, unsigned int mtu)
159 {
160 struct sk_buff *segs, *nskb;
161 netdev_features_t features;
162 int ret = 0;
163
164 /* Please see corresponding comment in ip_finish_output_gso
165 * describing the cases where GSO segment length exceeds the
166 * egress MTU.
167 */
168 features = netif_skb_features(skb);
169 segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
170 if (IS_ERR_OR_NULL(segs)) {
171 kfree_skb(skb);
172 return -ENOMEM;
173 }
174
175 consume_skb(skb);
176
177 skb_list_walk_safe(segs, segs, nskb) {
178 int err;
179
180 skb_mark_not_on_list(segs);
181 /* Last GSO segment can be smaller than gso_size (and MTU).
182 * Adding a fragment header would produce an "atomic fragment",
183 * which is considered harmful (RFC-8021). Avoid that.
184 */
185 err = segs->len > mtu ?
186 ip6_fragment(net, sk, segs, ip6_finish_output2) :
187 ip6_finish_output2(net, sk, segs);
188 if (err && ret == 0)
189 ret = err;
190 }
191
192 return ret;
193 }
194
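/* Choose between direct transmit, the GSO slow path and IPv6 fragmentation
 * based on the packet size and the destination MTU.
 */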
195 static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
196 {
197 unsigned int mtu;
198
199 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
200 /* Policy lookup after SNAT yielded a new policy */
201 if (skb_dst(skb)->xfrm) {
202 IP6CB(skb)->flags |= IP6SKB_REROUTED;
203 return dst_output(net, sk, skb);
204 }
205 #endif
206
207 mtu = ip6_skb_dst_mtu(skb);
208 if (skb_is_gso(skb) && !skb_gso_validate_network_len(skb, mtu))
209 return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);
210
211 if ((skb->len > mtu && !skb_is_gso(skb)) ||
212 dst_allfrag(skb_dst(skb)) ||
213 (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
214 return ip6_fragment(net, sk, skb, ip6_finish_output2);
215 else
216 return ip6_finish_output2(net, sk, skb);
217 }
218
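/* Run the cgroup BPF egress hook before completing output; the packet is
 * dropped if the program rejects it.
 */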
219 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
220 {
221 int ret;
222
223 ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
224 switch (ret) {
225 case NET_XMIT_SUCCESS:
226 return __ip6_finish_output(net, sk, skb);
227 case NET_XMIT_CN:
228 return __ip6_finish_output(net, sk, skb) ? : ret;
229 default:
230 kfree_skb(skb);
231 return ret;
232 }
233 }
234
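/* NF_INET_POST_ROUTING entry point for locally generated and forwarded
 * packets; drops the packet when IPv6 is disabled on the egress device.
 */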
235 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
236 {
237 struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev;
238 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
239
240 skb->protocol = htons(ETH_P_IPV6);
241 skb->dev = dev;
242
243 if (unlikely(idev->cnf.disable_ipv6)) {
244 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
245 kfree_skb(skb);
246 return 0;
247 }
248
249 return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
250 net, sk, skb, indev, dev,
251 ip6_finish_output,
252 !(IP6CB(skb)->flags & IP6SKB_REROUTED));
253 }
254
255 bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
256 {
257 if (!np->autoflowlabel_set)
258 return ip6_default_np_autolabel(net);
259 else
260 return np->autoflowlabel;
261 }
262
263 /*
264 * xmit an sk_buff (used by TCP, SCTP and DCCP)
265 * Note : socket lock is not held for SYNACK packets, but might be modified
266 * by calls to skb_set_owner_w() and ipv6_local_error(),
267 * which are using proper atomic operations or spinlocks.
268 */
269 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
270 __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
271 {
272 struct net *net = sock_net(sk);
273 const struct ipv6_pinfo *np = inet6_sk(sk);
274 struct in6_addr *first_hop = &fl6->daddr;
275 struct dst_entry *dst = skb_dst(skb);
276 unsigned int head_room;
277 struct ipv6hdr *hdr;
278 u8 proto = fl6->flowi6_proto;
279 int seg_len = skb->len;
280 int hlimit = -1;
281 u32 mtu;
282
283 head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
284 if (opt)
285 head_room += opt->opt_nflen + opt->opt_flen;
286
287 if (unlikely(skb_headroom(skb) < head_room)) {
288 struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
289 if (!skb2) {
290 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
291 IPSTATS_MIB_OUTDISCARDS);
292 kfree_skb(skb);
293 return -ENOBUFS;
294 }
295 if (skb->sk)
296 skb_set_owner_w(skb2, skb->sk);
297 consume_skb(skb);
298 skb = skb2;
299 }
300
301 if (opt) {
302 seg_len += opt->opt_nflen + opt->opt_flen;
303
304 if (opt->opt_flen)
305 ipv6_push_frag_opts(skb, opt, &proto);
306
307 if (opt->opt_nflen)
308 ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
309 &fl6->saddr);
310 }
311
312 skb_push(skb, sizeof(struct ipv6hdr));
313 skb_reset_network_header(skb);
314 hdr = ipv6_hdr(skb);
315
316 /*
317 * Fill in the IPv6 header
318 */
319 if (np)
320 hlimit = np->hop_limit;
321 if (hlimit < 0)
322 hlimit = ip6_dst_hoplimit(dst);
323
324 ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
325 ip6_autoflowlabel(net, np), fl6));
326
327 hdr->payload_len = htons(seg_len);
328 hdr->nexthdr = proto;
329 hdr->hop_limit = hlimit;
330
331 hdr->saddr = fl6->saddr;
332 hdr->daddr = *first_hop;
333
334 skb->protocol = htons(ETH_P_IPV6);
335 skb->priority = priority;
336 skb->mark = mark;
337
338 mtu = dst_mtu(dst);
339 if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
340 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
341 IPSTATS_MIB_OUT, skb->len);
342
343 /* if egress device is enslaved to an L3 master device pass the
344 * skb to its handler for processing
345 */
346 skb = l3mdev_ip6_out((struct sock *)sk, skb);
347 if (unlikely(!skb))
348 return 0;
349
350 /* hooks should never assume socket lock is held.
351 * we promote our socket to non const
352 */
353 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
354 net, (struct sock *)sk, skb, NULL, dst->dev,
355 dst_output);
356 }
357
358 skb->dev = dst->dev;
359 /* ipv6_local_error() does not require socket lock,
360 * we promote our socket to non const
361 */
362 ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
363
364 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
365 kfree_skb(skb);
366 return -EMSGSIZE;
367 }
368 EXPORT_SYMBOL(ip6_xmit);
369
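/* Deliver a Router Alert packet to every raw socket registered in
 * ip6_ra_chain with a matching selector. Returns 1 if the skb was
 * consumed by at least one socket, 0 otherwise.
 */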
370 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
371 {
372 struct ip6_ra_chain *ra;
373 struct sock *last = NULL;
374
375 read_lock(&ip6_ra_lock);
376 for (ra = ip6_ra_chain; ra; ra = ra->next) {
377 struct sock *sk = ra->sk;
378 if (sk && ra->sel == sel &&
379 (!sk->sk_bound_dev_if ||
380 sk->sk_bound_dev_if == skb->dev->ifindex)) {
381 struct ipv6_pinfo *np = inet6_sk(sk);
382
383 if (np && np->rtalert_isolate &&
384 !net_eq(sock_net(sk), dev_net(skb->dev))) {
385 continue;
386 }
387 if (last) {
388 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
389 if (skb2)
390 rawv6_rcv(last, skb2);
391 }
392 last = sk;
393 }
394 }
395
396 if (last) {
397 rawv6_rcv(last, skb);
398 read_unlock(&ip6_ra_lock);
399 return 1;
400 }
401 read_unlock(&ip6_ra_lock);
402 return 0;
403 }
404
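/* Decide what to do with a packet destined to a proxied address:
 * return 1 to pass it to local input (NDISC messages), -1 to drop it
 * (link-local destination), 0 to keep forwarding it.
 */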
405 static int ip6_forward_proxy_check(struct sk_buff *skb)
406 {
407 struct ipv6hdr *hdr = ipv6_hdr(skb);
408 u8 nexthdr = hdr->nexthdr;
409 __be16 frag_off;
410 int offset;
411
412 if (ipv6_ext_hdr(nexthdr)) {
413 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
414 if (offset < 0)
415 return 0;
416 } else
417 offset = sizeof(struct ipv6hdr);
418
419 if (nexthdr == IPPROTO_ICMPV6) {
420 struct icmp6hdr *icmp6;
421
422 if (!pskb_may_pull(skb, (skb_network_header(skb) +
423 offset + 1 - skb->data)))
424 return 0;
425
426 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
427
428 switch (icmp6->icmp6_type) {
429 case NDISC_ROUTER_SOLICITATION:
430 case NDISC_ROUTER_ADVERTISEMENT:
431 case NDISC_NEIGHBOUR_SOLICITATION:
432 case NDISC_NEIGHBOUR_ADVERTISEMENT:
433 case NDISC_REDIRECT:
434 /* For reaction involving unicast neighbor discovery
435 * message destined to the proxied address, pass it to
436 * input function.
437 */
438 return 1;
439 default:
440 break;
441 }
442 }
443
444 /*
445 * The proxying router can't forward traffic sent to a link-local
446 * address, so signal the sender and discard the packet. This
447 * behavior is clarified by the MIPv6 specification.
448 */
449 if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
450 dst_link_failure(skb);
451 return -1;
452 }
453
454 return 0;
455 }
456
457 static inline int ip6_forward_finish(struct net *net, struct sock *sk,
458 struct sk_buff *skb)
459 {
460 struct dst_entry *dst = skb_dst(skb);
461
462 __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
463 __IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
464
465 #ifdef CONFIG_NET_SWITCHDEV
466 if (skb->offload_l3_fwd_mark) {
467 consume_skb(skb);
468 return 0;
469 }
470 #endif
471
472 skb->tstamp = 0;
473 return dst_output(net, sk, skb);
474 }
475
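/* Return true if the packet (or the original datagram size recorded by
 * conntrack defrag, or its GSO segments) exceeds the forwarding MTU.
 */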
476 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
477 {
478 if (skb->len <= mtu)
479 return false;
480
481 /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
482 if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
483 return true;
484
485 if (skb->ignore_df)
486 return false;
487
488 if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
489 return false;
490
491 return true;
492 }
493
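/* Main IPv6 forwarding path: validates the packet, decrements the hop
 * limit, sends redirects and Packet Too Big errors where appropriate and
 * passes the packet through the NF_INET_FORWARD hook.
 */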
494 int ip6_forward(struct sk_buff *skb)
495 {
496 struct dst_entry *dst = skb_dst(skb);
497 struct ipv6hdr *hdr = ipv6_hdr(skb);
498 struct inet6_skb_parm *opt = IP6CB(skb);
499 struct net *net = dev_net(dst->dev);
500 struct inet6_dev *idev;
501 u32 mtu;
502
503 idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
504 if (net->ipv6.devconf_all->forwarding == 0)
505 goto error;
506
507 if (skb->pkt_type != PACKET_HOST)
508 goto drop;
509
510 if (unlikely(skb->sk))
511 goto drop;
512
513 if (skb_warn_if_lro(skb))
514 goto drop;
515
516 if (!net->ipv6.devconf_all->disable_policy &&
517 (!idev || !idev->cnf.disable_policy) &&
518 !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
519 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
520 goto drop;
521 }
522
523 skb_forward_csum(skb);
524
525 /*
526 * We DO NOT make any processing on
527 * RA packets, pushing them to user level AS IS
528 * without any WARRANTY that application will be able
529 * to interpret them. The reason is that we
530 * cannot make anything clever here.
531 *
532 * We are not end-node, so that if packet contains
533 * AH/ESP, we cannot make anything.
534 * Defragmentation also would be mistake, RA packets
535 * cannot be fragmented, because there is no warranty
536 * that different fragments will go along one path. --ANK
537 */
538 if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
539 if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
540 return 0;
541 }
542
543 /*
544 * check and decrement ttl
545 */
546 if (hdr->hop_limit <= 1) {
547 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
548 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
549
550 kfree_skb(skb);
551 return -ETIMEDOUT;
552 }
553
554 /* XXX: idev->cnf.proxy_ndp? */
555 if (net->ipv6.devconf_all->proxy_ndp &&
556 pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
557 int proxied = ip6_forward_proxy_check(skb);
558 if (proxied > 0)
559 return ip6_input(skb);
560 else if (proxied < 0) {
561 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
562 goto drop;
563 }
564 }
565
566 if (!xfrm6_route_forward(skb)) {
567 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
568 goto drop;
569 }
570 dst = skb_dst(skb);
571
572 /* IPv6 specs say nothing about it, but it is clear that we cannot
573 send redirects to source routed frames.
574 We don't send redirects to frames decapsulated from IPsec.
575 */
576 if (IP6CB(skb)->iif == dst->dev->ifindex &&
577 opt->srcrt == 0 && !skb_sec_path(skb)) {
578 struct in6_addr *target = NULL;
579 struct inet_peer *peer;
580 struct rt6_info *rt;
581
582 /*
583 * incoming and outgoing devices are the same
584 * send a redirect.
585 */
586
587 rt = (struct rt6_info *) dst;
588 if (rt->rt6i_flags & RTF_GATEWAY)
589 target = &rt->rt6i_gateway;
590 else
591 target = &hdr->daddr;
592
593 peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
594
595 /* Limit redirects both by destination (here)
596 and by source (inside ndisc_send_redirect)
597 */
598 if (inet_peer_xrlim_allow(peer, 1*HZ))
599 ndisc_send_redirect(skb, target);
600 if (peer)
601 inet_putpeer(peer);
602 } else {
603 int addrtype = ipv6_addr_type(&hdr->saddr);
604
605 /* This check is security critical. */
606 if (addrtype == IPV6_ADDR_ANY ||
607 addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
608 goto error;
609 if (addrtype & IPV6_ADDR_LINKLOCAL) {
610 icmpv6_send(skb, ICMPV6_DEST_UNREACH,
611 ICMPV6_NOT_NEIGHBOUR, 0);
612 goto error;
613 }
614 }
615
616 mtu = ip6_dst_mtu_forward(dst);
617 if (mtu < IPV6_MIN_MTU)
618 mtu = IPV6_MIN_MTU;
619
620 if (ip6_pkt_too_big(skb, mtu)) {
621 /* Again, force OUTPUT device used as source address */
622 skb->dev = dst->dev;
623 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
624 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
625 __IP6_INC_STATS(net, ip6_dst_idev(dst),
626 IPSTATS_MIB_FRAGFAILS);
627 kfree_skb(skb);
628 return -EMSGSIZE;
629 }
630
631 if (skb_cow(skb, dst->dev->hard_header_len)) {
632 __IP6_INC_STATS(net, ip6_dst_idev(dst),
633 IPSTATS_MIB_OUTDISCARDS);
634 goto drop;
635 }
636
637 hdr = ipv6_hdr(skb);
638
639 /* Mangling hops number delayed to point after skb COW */
640
641 hdr->hop_limit--;
642
643 return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
644 net, NULL, skb, skb->dev, dst->dev,
645 ip6_forward_finish);
646
647 error:
648 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
649 drop:
650 kfree_skb(skb);
651 return -EINVAL;
652 }
653
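/* Copy per-packet metadata (dst, device, mark, hash, tc index, netfilter
 * and security state) from the original skb to a fragment.
 */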
654 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
655 {
656 to->pkt_type = from->pkt_type;
657 to->priority = from->priority;
658 to->protocol = from->protocol;
659 skb_dst_drop(to);
660 skb_dst_set(to, dst_clone(skb_dst(from)));
661 to->dev = from->dev;
662 to->mark = from->mark;
663
664 skb_copy_hash(to, from);
665
666 #ifdef CONFIG_NET_SCHED
667 to->tc_index = from->tc_index;
668 #endif
669 nf_copy(to, from);
670 skb_ext_copy(to, from);
671 skb_copy_secmark(to, from);
672 }
673
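/* Set up fast-path fragmentation over an existing frag_list: save the
 * original network headers, insert a fragment header into the first skb
 * and initialise the iterator used by ip6_fraglist_prepare().
 */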
674 int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
675 u8 nexthdr, __be32 frag_id,
676 struct ip6_fraglist_iter *iter)
677 {
678 unsigned int first_len;
679 struct frag_hdr *fh;
680
681 /* BUILD HEADER */
682 *prevhdr = NEXTHDR_FRAGMENT;
683 iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
684 if (!iter->tmp_hdr)
685 return -ENOMEM;
686
687 iter->frag = skb_shinfo(skb)->frag_list;
688 skb_frag_list_init(skb);
689
690 iter->offset = 0;
691 iter->hlen = hlen;
692 iter->frag_id = frag_id;
693 iter->nexthdr = nexthdr;
694
695 __skb_pull(skb, hlen);
696 fh = __skb_push(skb, sizeof(struct frag_hdr));
697 __skb_push(skb, hlen);
698 skb_reset_network_header(skb);
699 memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);
700
701 fh->nexthdr = nexthdr;
702 fh->reserved = 0;
703 fh->frag_off = htons(IP6_MF);
704 fh->identification = frag_id;
705
706 first_len = skb_pagelen(skb);
707 skb->data_len = first_len - skb_headlen(skb);
708 skb->len = first_len;
709 ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));
710
711 return 0;
712 }
713 EXPORT_SYMBOL(ip6_fraglist_init);
714
715 void ip6_fraglist_prepare(struct sk_buff *skb,
716 struct ip6_fraglist_iter *iter)
717 {
718 struct sk_buff *frag = iter->frag;
719 unsigned int hlen = iter->hlen;
720 struct frag_hdr *fh;
721
722 frag->ip_summed = CHECKSUM_NONE;
723 skb_reset_transport_header(frag);
724 fh = __skb_push(frag, sizeof(struct frag_hdr));
725 __skb_push(frag, hlen);
726 skb_reset_network_header(frag);
727 memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
728 iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
729 fh->nexthdr = iter->nexthdr;
730 fh->reserved = 0;
731 fh->frag_off = htons(iter->offset);
732 if (frag->next)
733 fh->frag_off |= htons(IP6_MF);
734 fh->identification = iter->frag_id;
735 ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
736 ip6_copy_metadata(frag, skb);
737 }
738 EXPORT_SYMBOL(ip6_fraglist_prepare);
739
740 void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
741 unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
742 u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
743 {
744 state->prevhdr = prevhdr;
745 state->nexthdr = nexthdr;
746 state->frag_id = frag_id;
747
748 state->hlen = hlen;
749 state->mtu = mtu;
750
751 state->left = skb->len - hlen; /* Space per frame */
752 state->ptr = hlen; /* Where to start from */
753
754 state->hroom = hdr_room;
755 state->troom = needed_tailroom;
756
757 state->offset = 0;
758 }
759 EXPORT_SYMBOL(ip6_frag_init);
760
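/* Allocate and fill the next slow-path fragment: copy the headers and the
 * next block of payload from the original skb and build its fragment
 * header.
 */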
761 struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
762 {
763 u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
764 struct sk_buff *frag;
765 struct frag_hdr *fh;
766 unsigned int len;
767
768 len = state->left;
769 /* IF: it doesn't fit, use 'mtu' - the data space left */
770 if (len > state->mtu)
771 len = state->mtu;
772 /* IF: we are not sending up to and including the packet end
773 then align the next start on an eight byte boundary */
774 if (len < state->left)
775 len &= ~7;
776
777 /* Allocate buffer */
778 frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
779 state->hroom + state->troom, GFP_ATOMIC);
780 if (!frag)
781 return ERR_PTR(-ENOMEM);
782
783 /*
784 * Set up data on packet
785 */
786
787 ip6_copy_metadata(frag, skb);
788 skb_reserve(frag, state->hroom);
789 skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
790 skb_reset_network_header(frag);
791 fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
792 frag->transport_header = (frag->network_header + state->hlen +
793 sizeof(struct frag_hdr));
794
795 /*
796 * Charge the memory for the fragment to any owner
797 * it might possess
798 */
799 if (skb->sk)
800 skb_set_owner_w(frag, skb->sk);
801
802 /*
803 * Copy the packet header into the new buffer.
804 */
805 skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);
806
807 fragnexthdr_offset = skb_network_header(frag);
808 fragnexthdr_offset += prevhdr - skb_network_header(skb);
809 *fragnexthdr_offset = NEXTHDR_FRAGMENT;
810
811 /*
812 * Build fragment header.
813 */
814 fh->nexthdr = state->nexthdr;
815 fh->reserved = 0;
816 fh->identification = state->frag_id;
817
818 /*
819 * Copy a block of the IP datagram.
820 */
821 BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
822 len));
823 state->left -= len;
824
825 fh->frag_off = htons(state->offset);
826 if (state->left > 0)
827 fh->frag_off |= htons(IP6_MF);
828 ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
829
830 state->ptr += len;
831 state->offset += len;
832
833 return frag;
834 }
835 EXPORT_SYMBOL(ip6_frag_next);
836
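/* Fragment an IPv6 datagram that exceeds the path MTU, using the frag_list
 * fast path when the skb geometry allows it and the slow copy path
 * otherwise. Each fragment is handed to @output.
 */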
837 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
838 int (*output)(struct net *, struct sock *, struct sk_buff *))
839 {
840 struct sk_buff *frag;
841 struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
842 struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
843 inet6_sk(skb->sk) : NULL;
844 struct ip6_frag_state state;
845 unsigned int mtu, hlen, nexthdr_offset;
846 ktime_t tstamp = skb->tstamp;
847 int hroom, err = 0;
848 __be32 frag_id;
849 u8 *prevhdr, nexthdr = 0;
850
851 err = ip6_find_1stfragopt(skb, &prevhdr);
852 if (err < 0)
853 goto fail;
854 hlen = err;
855 nexthdr = *prevhdr;
856 nexthdr_offset = prevhdr - skb_network_header(skb);
857
858 mtu = ip6_skb_dst_mtu(skb);
859
860 /* We must not fragment if the socket is set to force MTU discovery
861 * or if the skb is not generated by a local socket.
862 */
863 if (unlikely(!skb->ignore_df && skb->len > mtu))
864 goto fail_toobig;
865
866 if (IP6CB(skb)->frag_max_size) {
867 if (IP6CB(skb)->frag_max_size > mtu)
868 goto fail_toobig;
869
870 /* don't send fragments larger than what we received */
871 mtu = IP6CB(skb)->frag_max_size;
872 if (mtu < IPV6_MIN_MTU)
873 mtu = IPV6_MIN_MTU;
874 }
875
876 if (np && np->frag_size < mtu) {
877 if (np->frag_size)
878 mtu = np->frag_size;
879 }
880 if (mtu < hlen + sizeof(struct frag_hdr) + 8)
881 goto fail_toobig;
882 mtu -= hlen + sizeof(struct frag_hdr);
883
884 frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
885 &ipv6_hdr(skb)->saddr);
886
887 if (skb->ip_summed == CHECKSUM_PARTIAL &&
888 (err = skb_checksum_help(skb)))
889 goto fail;
890
891 prevhdr = skb_network_header(skb) + nexthdr_offset;
892 hroom = LL_RESERVED_SPACE(rt->dst.dev);
893 if (skb_has_frag_list(skb)) {
894 unsigned int first_len = skb_pagelen(skb);
895 struct ip6_fraglist_iter iter;
896 struct sk_buff *frag2;
897
898 if (first_len - hlen > mtu ||
899 ((first_len - hlen) & 7) ||
900 skb_cloned(skb) ||
901 skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
902 goto slow_path;
903
904 skb_walk_frags(skb, frag) {
905 /* Correct geometry. */
906 if (frag->len > mtu ||
907 ((frag->len & 7) && frag->next) ||
908 skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
909 goto slow_path_clean;
910
911 /* Partially cloned skb? */
912 if (skb_shared(frag))
913 goto slow_path_clean;
914
915 BUG_ON(frag->sk);
916 if (skb->sk) {
917 frag->sk = skb->sk;
918 frag->destructor = sock_wfree;
919 }
920 skb->truesize -= frag->truesize;
921 }
922
923 err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
924 &iter);
925 if (err < 0)
926 goto fail;
927
928 /* We prevent @rt from being freed. */
929 rcu_read_lock();
930
931 for (;;) {
932 /* Prepare header of the next frame,
933 * before previous one went down. */
934 if (iter.frag)
935 ip6_fraglist_prepare(skb, &iter);
936
937 skb->tstamp = tstamp;
938 err = output(net, sk, skb);
939 if (!err)
940 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
941 IPSTATS_MIB_FRAGCREATES);
942
943 if (err || !iter.frag)
944 break;
945
946 skb = ip6_fraglist_next(&iter);
947 }
948
949 kfree(iter.tmp_hdr);
950
951 if (err == 0) {
952 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
953 IPSTATS_MIB_FRAGOKS);
954 rcu_read_unlock();
955 return 0;
956 }
957
958 kfree_skb_list(iter.frag);
959
960 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
961 IPSTATS_MIB_FRAGFAILS);
962 rcu_read_unlock();
963 return err;
964
965 slow_path_clean:
966 skb_walk_frags(skb, frag2) {
967 if (frag2 == frag)
968 break;
969 frag2->sk = NULL;
970 frag2->destructor = NULL;
971 skb->truesize += frag2->truesize;
972 }
973 }
974
975 slow_path:
976 /*
977 * Fragment the datagram.
978 */
979
980 ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
981 LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
982 &state);
983
984 /*
985 * Keep copying data until we run out.
986 */
987
988 while (state.left > 0) {
989 frag = ip6_frag_next(skb, &state);
990 if (IS_ERR(frag)) {
991 err = PTR_ERR(frag);
992 goto fail;
993 }
994
995 /*
996 * Put this fragment into the sending queue.
997 */
998 frag->tstamp = tstamp;
999 err = output(net, sk, frag);
1000 if (err)
1001 goto fail;
1002
1003 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1004 IPSTATS_MIB_FRAGCREATES);
1005 }
1006 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1007 IPSTATS_MIB_FRAGOKS);
1008 consume_skb(skb);
1009 return err;
1010
1011 fail_toobig:
1012 if (skb->sk && dst_allfrag(skb_dst(skb)))
1013 sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
1014
1015 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
1016 err = -EMSGSIZE;
1017
1018 fail:
1019 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1020 IPSTATS_MIB_FRAGFAILS);
1021 kfree_skb(skb);
1022 return err;
1023 }
1024
1025 static inline int ip6_rt_check(const struct rt6key *rt_key,
1026 const struct in6_addr *fl_addr,
1027 const struct in6_addr *addr_cache)
1028 {
1029 return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
1030 (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
1031 }
1032
1033 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
1034 struct dst_entry *dst,
1035 const struct flowi6 *fl6)
1036 {
1037 struct ipv6_pinfo *np = inet6_sk(sk);
1038 struct rt6_info *rt;
1039
1040 if (!dst)
1041 goto out;
1042
1043 if (dst->ops->family != AF_INET6) {
1044 dst_release(dst);
1045 return NULL;
1046 }
1047
1048 rt = (struct rt6_info *)dst;
1049 /* Yes, checking route validity in not connected
1050 * case is not very simple. Take into account,
1051 * that we do not support routing by source, TOS,
1052 * and MSG_DONTROUTE --ANK (980726)
1053 *
1054 * 1. ip6_rt_check(): If route was host route,
1055 * check that cached destination is current.
1056 * If it is network route, we still may
1057 * check its validity using saved pointer
1058 * to the last used address: daddr_cache.
1059 * We do not want to save whole address now,
1060 * (because main consumer of this service
1061 * is tcp, which has not this problem),
1062 * so that the last trick works only on connected
1063 * sockets.
1064 * 2. oif also should be the same.
1065 */
1066 if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
1067 #ifdef CONFIG_IPV6_SUBTREES
1068 ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
1069 #endif
1070 (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
1071 (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
1072 dst_release(dst);
1073 dst = NULL;
1074 }
1075
1076 out:
1077 return dst;
1078 }
1079
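/* Core of the dst lookup helpers: pick a source address when none was
 * given, optionally redirect to the default router for optimistic-DAD
 * source addresses, and reject v4-mapped source/destination mismatches.
 */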
1080 static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
1081 struct dst_entry **dst, struct flowi6 *fl6)
1082 {
1083 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1084 struct neighbour *n;
1085 struct rt6_info *rt;
1086 #endif
1087 int err;
1088 int flags = 0;
1089
1090 /* The correct way to handle this would be to do
1091 * ip6_route_get_saddr, and then ip6_route_output; however,
1092 * the route-specific preferred source forces the
1093 * ip6_route_output call _before_ ip6_route_get_saddr.
1094 *
1095 * In source specific routing (no src=any default route),
1096 * ip6_route_output will fail given src=any saddr, though, so
1097 * that's why we try it again later.
1098 */
1099 if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
1100 struct fib6_info *from;
1101 struct rt6_info *rt;
1102 bool had_dst = *dst != NULL;
1103
1104 if (!had_dst)
1105 *dst = ip6_route_output(net, sk, fl6);
1106 rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
1107
1108 rcu_read_lock();
1109 from = rt ? rcu_dereference(rt->from) : NULL;
1110 err = ip6_route_get_saddr(net, from, &fl6->daddr,
1111 sk ? inet6_sk(sk)->srcprefs : 0,
1112 &fl6->saddr);
1113 rcu_read_unlock();
1114
1115 if (err)
1116 goto out_err_release;
1117
1118 /* If we had an erroneous initial result, pretend it
1119 * never existed and let the SA-enabled version take
1120 * over.
1121 */
1122 if (!had_dst && (*dst)->error) {
1123 dst_release(*dst);
1124 *dst = NULL;
1125 }
1126
1127 if (fl6->flowi6_oif)
1128 flags |= RT6_LOOKUP_F_IFACE;
1129 }
1130
1131 if (!*dst)
1132 *dst = ip6_route_output_flags(net, sk, fl6, flags);
1133
1134 err = (*dst)->error;
1135 if (err)
1136 goto out_err_release;
1137
1138 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1139 /*
1140 * Here if the dst entry we've looked up
1141 * has a neighbour entry that is in the INCOMPLETE
1142 * state and the src address from the flow is
1143 * marked as OPTIMISTIC, we release the found
1144 * dst entry and replace it instead with the
1145 * dst entry of the nexthop router
1146 */
1147 rt = (struct rt6_info *) *dst;
1148 rcu_read_lock_bh();
1149 n = __ipv6_neigh_lookup_noref(rt->dst.dev,
1150 rt6_nexthop(rt, &fl6->daddr));
1151 err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
1152 rcu_read_unlock_bh();
1153
1154 if (err) {
1155 struct inet6_ifaddr *ifp;
1156 struct flowi6 fl_gw6;
1157 int redirect;
1158
1159 ifp = ipv6_get_ifaddr(net, &fl6->saddr,
1160 (*dst)->dev, 1);
1161
1162 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
1163 if (ifp)
1164 in6_ifa_put(ifp);
1165
1166 if (redirect) {
1167 /*
1168 * We need to get the dst entry for the
1169 * default router instead
1170 */
1171 dst_release(*dst);
1172 memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1173 memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1174 *dst = ip6_route_output(net, sk, &fl_gw6);
1175 err = (*dst)->error;
1176 if (err)
1177 goto out_err_release;
1178 }
1179 }
1180 #endif
1181 if (ipv6_addr_v4mapped(&fl6->saddr) &&
1182 !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
1183 err = -EAFNOSUPPORT;
1184 goto out_err_release;
1185 }
1186
1187 return 0;
1188
1189 out_err_release:
1190 dst_release(*dst);
1191 *dst = NULL;
1192
1193 if (err == -ENETUNREACH)
1194 IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1195 return err;
1196 }
1197
1198 /**
1199 * ip6_dst_lookup - perform route lookup on flow
1200 * @net: Network namespace to perform lookup in
1201 * @sk: socket which provides route info
1202 * @dst: pointer to dst_entry * for result
1203 * @fl6: flow to lookup
1204 *
1205 * This function performs a route lookup on the given flow.
1206 *
1207 * It returns zero on success, or a standard errno code on error.
1208 */
1209 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1210 struct flowi6 *fl6)
1211 {
1212 *dst = NULL;
1213 return ip6_dst_lookup_tail(net, sk, dst, fl6);
1214 }
1215 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1216
1217 /**
1218 * ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1219 * @net: Network namespace to perform lookup in
1220 * @sk: socket which provides route info
1221 * @fl6: flow to lookup
1222 * @final_dst: final destination address for ipsec lookup
1223 *
1224 * This function performs a route lookup on the given flow.
1225 *
1226 * It returns a valid dst pointer on success, or a pointer encoded
1227 * error code.
1228 */
1229 struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
1230 const struct in6_addr *final_dst)
1231 {
1232 struct dst_entry *dst = NULL;
1233 int err;
1234
1235 err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
1236 if (err)
1237 return ERR_PTR(err);
1238 if (final_dst)
1239 fl6->daddr = *final_dst;
1240
1241 return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
1242 }
1243 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1244
1245 /**
1246 * ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1247 * @sk: socket which provides the dst cache and route info
1248 * @fl6: flow to lookup
1249 * @final_dst: final destination address for ipsec lookup
1250 * @connected: whether @sk is connected or not
1251 *
1252 * This function performs a route lookup on the given flow with the
1253 * possibility of using the cached route in the socket if it is valid.
1254 * It will take the socket dst lock when operating on the dst cache.
1255 * As a result, this function can only be used in process context.
1256 *
1257 * In addition, for a connected socket, cache the dst in the socket
1258 * if the current cache is not valid.
1259 *
1260 * It returns a valid dst pointer on success, or a pointer encoded
1261 * error code.
1262 */
1263 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1264 const struct in6_addr *final_dst,
1265 bool connected)
1266 {
1267 struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1268
1269 dst = ip6_sk_dst_check(sk, dst, fl6);
1270 if (dst)
1271 return dst;
1272
1273 dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
1274 if (connected && !IS_ERR(dst))
1275 ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);
1276
1277 return dst;
1278 }
1279 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1280
1281 /**
1282 * ip6_dst_lookup_tunnel - perform route lookup on tunnel
1283 * @skb: Packet for which lookup is done
1284 * @dev: Tunnel device
1285 * @net: Network namespace of tunnel device
1286 * @sock: Socket which provides route info
1287 * @saddr: Memory to store the src ip address
1288 * @info: Tunnel information
1289 * @protocol: IP protocol
1290 * @use_cache: Flag to enable cache usage
1291 * This function performs a route lookup on a tunnel
1292 *
1293 * It returns a valid dst pointer and stores src address to be used in
1294 * tunnel in param saddr on success, else a pointer encoded error code.
1295 */
1296
1297 struct dst_entry *ip6_dst_lookup_tunnel(struct sk_buff *skb,
1298 struct net_device *dev,
1299 struct net *net,
1300 struct socket *sock,
1301 struct in6_addr *saddr,
1302 const struct ip_tunnel_info *info,
1303 u8 protocol,
1304 bool use_cache)
1305 {
1306 struct dst_entry *dst = NULL;
1307 #ifdef CONFIG_DST_CACHE
1308 struct dst_cache *dst_cache;
1309 #endif
1310 struct flowi6 fl6;
1311 __u8 prio;
1312
1313 #ifdef CONFIG_DST_CACHE
1314 dst_cache = (struct dst_cache *)&info->dst_cache;
1315 if (use_cache) {
1316 dst = dst_cache_get_ip6(dst_cache, saddr);
1317 if (dst)
1318 return dst;
1319 }
1320 #endif
1321 memset(&fl6, 0, sizeof(fl6));
1322 fl6.flowi6_mark = skb->mark;
1323 fl6.flowi6_proto = protocol;
1324 fl6.daddr = info->key.u.ipv6.dst;
1325 fl6.saddr = info->key.u.ipv6.src;
1326 prio = info->key.tos;
1327 fl6.flowlabel = ip6_make_flowinfo(prio, info->key.label);
1328
1329 dst = ipv6_stub->ipv6_dst_lookup_flow(net, sock->sk, &fl6,
1330 NULL);
1331 if (IS_ERR(dst)) {
1332 netdev_dbg(dev, "no route to %pI6\n", &fl6.daddr);
1333 return ERR_PTR(-ENETUNREACH);
1334 }
1335 if (dst->dev == dev) { /* is this necessary? */
1336 netdev_dbg(dev, "circular route to %pI6\n", &fl6.daddr);
1337 dst_release(dst);
1338 return ERR_PTR(-ELOOP);
1339 }
1340 #ifdef CONFIG_DST_CACHE
1341 if (use_cache)
1342 dst_cache_set_ip6(dst_cache, dst, &fl6.saddr);
1343 #endif
1344 *saddr = fl6.saddr;
1345 return dst;
1346 }
1347 EXPORT_SYMBOL_GPL(ip6_dst_lookup_tunnel);
1348
1349 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1350 gfp_t gfp)
1351 {
1352 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1353 }
1354
1355 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1356 gfp_t gfp)
1357 {
1358 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1359 }
1360
1361 static void ip6_append_data_mtu(unsigned int *mtu,
1362 int *maxfraglen,
1363 unsigned int fragheaderlen,
1364 struct sk_buff *skb,
1365 struct rt6_info *rt,
1366 unsigned int orig_mtu)
1367 {
1368 if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1369 if (!skb) {
1370 /* first fragment, reserve header_len */
1371 *mtu = orig_mtu - rt->dst.header_len;
1372
1373 } else {
1374 /*
1375 * this fragment is not first, the headers
1376 * space is regarded as data space.
1377 */
1378 *mtu = orig_mtu;
1379 }
1380 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1381 + fragheaderlen - sizeof(struct frag_hdr);
1382 }
1383 }
1384
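/* Initialise the cork state used by ip6_append_data(): duplicate the tx
 * options, take a reference on the route and compute the fragment size.
 */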
1385 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1386 struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1387 struct rt6_info *rt, struct flowi6 *fl6)
1388 {
1389 struct ipv6_pinfo *np = inet6_sk(sk);
1390 unsigned int mtu;
1391 struct ipv6_txoptions *opt = ipc6->opt;
1392
1393 /*
1394 * setup for corking
1395 */
1396 if (opt) {
1397 if (WARN_ON(v6_cork->opt))
1398 return -EINVAL;
1399
1400 v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
1401 if (unlikely(!v6_cork->opt))
1402 return -ENOBUFS;
1403
1404 v6_cork->opt->tot_len = sizeof(*opt);
1405 v6_cork->opt->opt_flen = opt->opt_flen;
1406 v6_cork->opt->opt_nflen = opt->opt_nflen;
1407
1408 v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1409 sk->sk_allocation);
1410 if (opt->dst0opt && !v6_cork->opt->dst0opt)
1411 return -ENOBUFS;
1412
1413 v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1414 sk->sk_allocation);
1415 if (opt->dst1opt && !v6_cork->opt->dst1opt)
1416 return -ENOBUFS;
1417
1418 v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
1419 sk->sk_allocation);
1420 if (opt->hopopt && !v6_cork->opt->hopopt)
1421 return -ENOBUFS;
1422
1423 v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1424 sk->sk_allocation);
1425 if (opt->srcrt && !v6_cork->opt->srcrt)
1426 return -ENOBUFS;
1427
1428 /* need source address above miyazawa*/
1429 }
1430 dst_hold(&rt->dst);
1431 cork->base.dst = &rt->dst;
1432 cork->fl.u.ip6 = *fl6;
1433 v6_cork->hop_limit = ipc6->hlimit;
1434 v6_cork->tclass = ipc6->tclass;
1435 if (rt->dst.flags & DST_XFRM_TUNNEL)
1436 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1437 READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
1438 else
1439 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1440 READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
1441 if (np->frag_size < mtu) {
1442 if (np->frag_size)
1443 mtu = np->frag_size;
1444 }
1445 cork->base.fragsize = mtu;
1446 cork->base.gso_size = ipc6->gso_size;
1447 cork->base.tx_flags = 0;
1448 cork->base.mark = ipc6->sockc.mark;
1449 sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);
1450
1451 if (dst_allfrag(xfrm_dst_path(&rt->dst)))
1452 cork->base.flags |= IPCORK_ALLFRAG;
1453 cork->base.length = 0;
1454
1455 cork->base.transmit_time = ipc6->sockc.transmit_time;
1456
1457 return 0;
1458 }
1459
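/* Workhorse of ip6_append_data()/ip6_make_skb(): append user data to the
 * given queue, allocating new skbs and page fragments as needed while
 * honouring the MTU, checksum-offload and zerocopy constraints.
 */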
1460 static int __ip6_append_data(struct sock *sk,
1461 struct flowi6 *fl6,
1462 struct sk_buff_head *queue,
1463 struct inet_cork *cork,
1464 struct inet6_cork *v6_cork,
1465 struct page_frag *pfrag,
1466 int getfrag(void *from, char *to, int offset,
1467 int len, int odd, struct sk_buff *skb),
1468 void *from, int length, int transhdrlen,
1469 unsigned int flags, struct ipcm6_cookie *ipc6)
1470 {
1471 struct sk_buff *skb, *skb_prev = NULL;
1472 unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
1473 struct ubuf_info *uarg = NULL;
1474 int exthdrlen = 0;
1475 int dst_exthdrlen = 0;
1476 int hh_len;
1477 int copy;
1478 int err;
1479 int offset = 0;
1480 u32 tskey = 0;
1481 struct rt6_info *rt = (struct rt6_info *)cork->dst;
1482 struct ipv6_txoptions *opt = v6_cork->opt;
1483 int csummode = CHECKSUM_NONE;
1484 unsigned int maxnonfragsize, headersize;
1485 unsigned int wmem_alloc_delta = 0;
1486 bool paged, extra_uref = false;
1487
1488 skb = skb_peek_tail(queue);
1489 if (!skb) {
1490 exthdrlen = opt ? opt->opt_flen : 0;
1491 dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1492 }
1493
1494 paged = !!cork->gso_size;
1495 mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
1496 orig_mtu = mtu;
1497
1498 if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
1499 sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1500 tskey = sk->sk_tskey++;
1501
1502 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1503
1504 fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1505 (opt ? opt->opt_nflen : 0);
1506
1507 headersize = sizeof(struct ipv6hdr) +
1508 (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1509 (dst_allfrag(&rt->dst) ?
1510 sizeof(struct frag_hdr) : 0) +
1511 rt->rt6i_nfheader_len;
1512
1513 if (mtu <= fragheaderlen ||
1514 ((mtu - fragheaderlen) & ~7) + fragheaderlen <= sizeof(struct frag_hdr))
1515 goto emsgsize;
1516
1517 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1518 sizeof(struct frag_hdr);
1519
1520 /* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
1521 * the first fragment
1522 */
1523 if (headersize + transhdrlen > mtu)
1524 goto emsgsize;
1525
1526 if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1527 (sk->sk_protocol == IPPROTO_UDP ||
1528 sk->sk_protocol == IPPROTO_RAW)) {
1529 ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1530 sizeof(struct ipv6hdr));
1531 goto emsgsize;
1532 }
1533
1534 if (ip6_sk_ignore_df(sk))
1535 maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1536 else
1537 maxnonfragsize = mtu;
1538
1539 if (cork->length + length > maxnonfragsize - headersize) {
1540 emsgsize:
1541 pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
1542 ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
1543 return -EMSGSIZE;
1544 }
1545
1546 /* CHECKSUM_PARTIAL only with no extension headers and when
1547 * we are not going to fragment
1548 */
1549 if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1550 headersize == sizeof(struct ipv6hdr) &&
1551 length <= mtu - headersize &&
1552 (!(flags & MSG_MORE) || cork->gso_size) &&
1553 rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1554 csummode = CHECKSUM_PARTIAL;
1555
1556 if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) {
1557 uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb));
1558 if (!uarg)
1559 return -ENOBUFS;
1560 extra_uref = !skb_zcopy(skb); /* only ref on new uarg */
1561 if (rt->dst.dev->features & NETIF_F_SG &&
1562 csummode == CHECKSUM_PARTIAL) {
1563 paged = true;
1564 } else {
1565 uarg->zerocopy = 0;
1566 skb_zcopy_set(skb, uarg, &extra_uref);
1567 }
1568 }
1569
1570 /*
1571 * Let's try using as much space as possible.
1572 * Use MTU if total length of the message fits into the MTU.
1573 * Otherwise, we need to reserve fragment header and
1574 * fragment alignment (= 8-15 octets, in total).
1575 *
1576 * Note that we may need to "move" the data from the tail
1577 * of the buffer to the new fragment when we split
1578 * the message.
1579 *
1580 * FIXME: It may be fragmented into multiple chunks
1581 * at once if non-fragmentable extension headers
1582 * are too large.
1583 * --yoshfuji
1584 */
1585
1586 cork->length += length;
1587 if (!skb)
1588 goto alloc_new_skb;
1589
1590 while (length > 0) {
1591 /* Check if the remaining data fits into current packet. */
1592 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1593 if (copy < length)
1594 copy = maxfraglen - skb->len;
1595
1596 if (copy <= 0) {
1597 char *data;
1598 unsigned int datalen;
1599 unsigned int fraglen;
1600 unsigned int fraggap;
1601 unsigned int alloclen, alloc_extra;
1602 unsigned int pagedlen;
1603 alloc_new_skb:
1604 /* There's no room in the current skb */
1605 if (skb)
1606 fraggap = skb->len - maxfraglen;
1607 else
1608 fraggap = 0;
1609 /* update mtu and maxfraglen if necessary */
1610 if (!skb || !skb_prev)
1611 ip6_append_data_mtu(&mtu, &maxfraglen,
1612 fragheaderlen, skb, rt,
1613 orig_mtu);
1614
1615 skb_prev = skb;
1616
1617 /*
1618 * If remaining data exceeds the mtu,
1619 * we know we need more fragment(s).
1620 */
1621 datalen = length + fraggap;
1622
1623 if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1624 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1625 fraglen = datalen + fragheaderlen;
1626 pagedlen = 0;
1627
1628 alloc_extra = hh_len;
1629 alloc_extra += dst_exthdrlen;
1630 alloc_extra += rt->dst.trailer_len;
1631
1632 /* We just reserve space for fragment header.
1633 * Note: this may be overallocation if the message
1634 * (without MSG_MORE) fits into the MTU.
1635 */
1636 alloc_extra += sizeof(struct frag_hdr);
1637
1638 if ((flags & MSG_MORE) &&
1639 !(rt->dst.dev->features&NETIF_F_SG))
1640 alloclen = mtu;
1641 else if (!paged &&
1642 (fraglen + alloc_extra < SKB_MAX_ALLOC ||
1643 !(rt->dst.dev->features & NETIF_F_SG)))
1644 alloclen = fraglen;
1645 else {
1646 alloclen = min_t(int, fraglen, MAX_HEADER);
1647 pagedlen = fraglen - alloclen;
1648 }
1649 alloclen += alloc_extra;
1650
1651 if (datalen != length + fraggap) {
1652 /*
1653 * this is not the last fragment, the trailer
1654 * space is regarded as data space.
1655 */
1656 datalen += rt->dst.trailer_len;
1657 }
1658
1659 fraglen = datalen + fragheaderlen;
1660
1661 copy = datalen - transhdrlen - fraggap - pagedlen;
1662 if (copy < 0) {
1663 err = -EINVAL;
1664 goto error;
1665 }
1666 if (transhdrlen) {
1667 skb = sock_alloc_send_skb(sk, alloclen,
1668 (flags & MSG_DONTWAIT), &err);
1669 } else {
1670 skb = NULL;
1671 if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
1672 2 * sk->sk_sndbuf)
1673 skb = alloc_skb(alloclen,
1674 sk->sk_allocation);
1675 if (unlikely(!skb))
1676 err = -ENOBUFS;
1677 }
1678 if (!skb)
1679 goto error;
1680 /*
1681 * Fill in the control structures
1682 */
1683 skb->protocol = htons(ETH_P_IPV6);
1684 skb->ip_summed = csummode;
1685 skb->csum = 0;
1686 /* reserve for fragmentation and ipsec header */
1687 skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1688 dst_exthdrlen);
1689
1690 /*
1691 * Find where to start putting bytes
1692 */
1693 data = skb_put(skb, fraglen - pagedlen);
1694 skb_set_network_header(skb, exthdrlen);
1695 data += fragheaderlen;
1696 skb->transport_header = (skb->network_header +
1697 fragheaderlen);
1698 if (fraggap) {
1699 skb->csum = skb_copy_and_csum_bits(
1700 skb_prev, maxfraglen,
1701 data + transhdrlen, fraggap);
1702 skb_prev->csum = csum_sub(skb_prev->csum,
1703 skb->csum);
1704 data += fraggap;
1705 pskb_trim_unique(skb_prev, maxfraglen);
1706 }
1707 if (copy > 0 &&
1708 getfrag(from, data + transhdrlen, offset,
1709 copy, fraggap, skb) < 0) {
1710 err = -EFAULT;
1711 kfree_skb(skb);
1712 goto error;
1713 }
1714
1715 offset += copy;
1716 length -= copy + transhdrlen;
1717 transhdrlen = 0;
1718 exthdrlen = 0;
1719 dst_exthdrlen = 0;
1720
1721 /* Only the initial fragment is time stamped */
1722 skb_shinfo(skb)->tx_flags = cork->tx_flags;
1723 cork->tx_flags = 0;
1724 skb_shinfo(skb)->tskey = tskey;
1725 tskey = 0;
1726 skb_zcopy_set(skb, uarg, &extra_uref);
1727
1728 if ((flags & MSG_CONFIRM) && !skb_prev)
1729 skb_set_dst_pending_confirm(skb, 1);
1730
1731 /*
1732 * Put the packet on the pending queue
1733 */
1734 if (!skb->destructor) {
1735 skb->destructor = sock_wfree;
1736 skb->sk = sk;
1737 wmem_alloc_delta += skb->truesize;
1738 }
1739 __skb_queue_tail(queue, skb);
1740 continue;
1741 }
1742
1743 if (copy > length)
1744 copy = length;
1745
1746 if (!(rt->dst.dev->features&NETIF_F_SG) &&
1747 skb_tailroom(skb) >= copy) {
1748 unsigned int off;
1749
1750 off = skb->len;
1751 if (getfrag(from, skb_put(skb, copy),
1752 offset, copy, off, skb) < 0) {
1753 __skb_trim(skb, off);
1754 err = -EFAULT;
1755 goto error;
1756 }
1757 } else if (!uarg || !uarg->zerocopy) {
1758 int i = skb_shinfo(skb)->nr_frags;
1759
1760 err = -ENOMEM;
1761 if (!sk_page_frag_refill(sk, pfrag))
1762 goto error;
1763
1764 if (!skb_can_coalesce(skb, i, pfrag->page,
1765 pfrag->offset)) {
1766 err = -EMSGSIZE;
1767 if (i == MAX_SKB_FRAGS)
1768 goto error;
1769
1770 __skb_fill_page_desc(skb, i, pfrag->page,
1771 pfrag->offset, 0);
1772 skb_shinfo(skb)->nr_frags = ++i;
1773 get_page(pfrag->page);
1774 }
1775 copy = min_t(int, copy, pfrag->size - pfrag->offset);
1776 if (getfrag(from,
1777 page_address(pfrag->page) + pfrag->offset,
1778 offset, copy, skb->len, skb) < 0)
1779 goto error_efault;
1780
1781 pfrag->offset += copy;
1782 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1783 skb->len += copy;
1784 skb->data_len += copy;
1785 skb->truesize += copy;
1786 wmem_alloc_delta += copy;
1787 } else {
1788 err = skb_zerocopy_iter_dgram(skb, from, copy);
1789 if (err < 0)
1790 goto error;
1791 }
1792 offset += copy;
1793 length -= copy;
1794 }
1795
1796 if (wmem_alloc_delta)
1797 refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1798 return 0;
1799
1800 error_efault:
1801 err = -EFAULT;
1802 error:
1803 if (uarg)
1804 sock_zerocopy_put_abort(uarg, extra_uref);
1805 cork->length -= length;
1806 IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1807 refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1808 return err;
1809 }
1810
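/* Append data to the pending queue of a corked socket, setting up the cork
 * state on the first call.
 */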
1811 int ip6_append_data(struct sock *sk,
1812 int getfrag(void *from, char *to, int offset, int len,
1813 int odd, struct sk_buff *skb),
1814 void *from, int length, int transhdrlen,
1815 struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1816 struct rt6_info *rt, unsigned int flags)
1817 {
1818 struct inet_sock *inet = inet_sk(sk);
1819 struct ipv6_pinfo *np = inet6_sk(sk);
1820 int exthdrlen;
1821 int err;
1822
1823 if (flags&MSG_PROBE)
1824 return 0;
1825 if (skb_queue_empty(&sk->sk_write_queue)) {
1826 /*
1827 * setup for corking
1828 */
1829 err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1830 ipc6, rt, fl6);
1831 if (err)
1832 return err;
1833
1834 exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1835 length += exthdrlen;
1836 transhdrlen += exthdrlen;
1837 } else {
1838 fl6 = &inet->cork.fl.u.ip6;
1839 transhdrlen = 0;
1840 }
1841
1842 return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1843 &np->cork, sk_page_frag(sk), getfrag,
1844 from, length, transhdrlen, flags, ipc6);
1845 }
1846 EXPORT_SYMBOL_GPL(ip6_append_data);
1847
1848 static void ip6_cork_release(struct inet_cork_full *cork,
1849 struct inet6_cork *v6_cork)
1850 {
1851 if (v6_cork->opt) {
1852 kfree(v6_cork->opt->dst0opt);
1853 kfree(v6_cork->opt->dst1opt);
1854 kfree(v6_cork->opt->hopopt);
1855 kfree(v6_cork->opt->srcrt);
1856 kfree(v6_cork->opt);
1857 v6_cork->opt = NULL;
1858 }
1859
1860 if (cork->base.dst) {
1861 dst_release(cork->base.dst);
1862 cork->base.dst = NULL;
1863 cork->base.flags &= ~IPCORK_ALLFRAG;
1864 }
1865 memset(&cork->fl, 0, sizeof(cork->fl));
1866 }
1867
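/* Collapse the queued skbs into a single datagram, push the extension
 * headers and the IPv6 header, update statistics and release the cork.
 */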
1868 struct sk_buff *__ip6_make_skb(struct sock *sk,
1869 struct sk_buff_head *queue,
1870 struct inet_cork_full *cork,
1871 struct inet6_cork *v6_cork)
1872 {
1873 struct sk_buff *skb, *tmp_skb;
1874 struct sk_buff **tail_skb;
1875 struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1876 struct ipv6_pinfo *np = inet6_sk(sk);
1877 struct net *net = sock_net(sk);
1878 struct ipv6hdr *hdr;
1879 struct ipv6_txoptions *opt = v6_cork->opt;
1880 struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1881 struct flowi6 *fl6 = &cork->fl.u.ip6;
1882 unsigned char proto = fl6->flowi6_proto;
1883
1884 skb = __skb_dequeue(queue);
1885 if (!skb)
1886 goto out;
1887 tail_skb = &(skb_shinfo(skb)->frag_list);
1888
1889 /* move skb->data to ip header from ext header */
1890 if (skb->data < skb_network_header(skb))
1891 __skb_pull(skb, skb_network_offset(skb));
1892 while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1893 __skb_pull(tmp_skb, skb_network_header_len(skb));
1894 *tail_skb = tmp_skb;
1895 tail_skb = &(tmp_skb->next);
1896 skb->len += tmp_skb->len;
1897 skb->data_len += tmp_skb->len;
1898 skb->truesize += tmp_skb->truesize;
1899 tmp_skb->destructor = NULL;
1900 tmp_skb->sk = NULL;
1901 }
1902
1903 /* Allow local fragmentation. */
1904 skb->ignore_df = ip6_sk_ignore_df(sk);
1905
1906 *final_dst = fl6->daddr;
1907 __skb_pull(skb, skb_network_header_len(skb));
1908 if (opt && opt->opt_flen)
1909 ipv6_push_frag_opts(skb, opt, &proto);
1910 if (opt && opt->opt_nflen)
1911 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
1912
1913 skb_push(skb, sizeof(struct ipv6hdr));
1914 skb_reset_network_header(skb);
1915 hdr = ipv6_hdr(skb);
1916
1917 ip6_flow_hdr(hdr, v6_cork->tclass,
1918 ip6_make_flowlabel(net, skb, fl6->flowlabel,
1919 ip6_autoflowlabel(net, np), fl6));
1920 hdr->hop_limit = v6_cork->hop_limit;
1921 hdr->nexthdr = proto;
1922 hdr->saddr = fl6->saddr;
1923 hdr->daddr = *final_dst;
1924
1925 skb->priority = sk->sk_priority;
1926 skb->mark = cork->base.mark;
1927
1928 skb->tstamp = cork->base.transmit_time;
1929
1930 skb_dst_set(skb, dst_clone(&rt->dst));
1931 IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1932 if (proto == IPPROTO_ICMPV6) {
1933 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1934 u8 icmp6_type;
1935
1936 if (sk->sk_socket->type == SOCK_RAW && !inet_sk(sk)->hdrincl)
1937 icmp6_type = fl6->fl6_icmp_type;
1938 else
1939 icmp6_type = icmp6_hdr(skb)->icmp6_type;
1940 ICMP6MSGOUT_INC_STATS(net, idev, icmp6_type);
1941 ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1942 }
1943
1944 ip6_cork_release(cork, v6_cork);
1945 out:
1946 return skb;
1947 }
1948
1949 int ip6_send_skb(struct sk_buff *skb)
1950 {
1951 struct net *net = sock_net(skb->sk);
1952 struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1953 int err;
1954
1955 err = ip6_local_out(net, skb->sk, skb);
1956 if (err) {
1957 if (err > 0)
1958 err = net_xmit_errno(err);
1959 if (err)
1960 IP6_INC_STATS(net, rt->rt6i_idev,
1961 IPSTATS_MIB_OUTDISCARDS);
1962 }
1963
1964 return err;
1965 }
1966
1967 int ip6_push_pending_frames(struct sock *sk)
1968 {
1969 struct sk_buff *skb;
1970
1971 skb = ip6_finish_skb(sk);
1972 if (!skb)
1973 return 0;
1974
1975 return ip6_send_skb(skb);
1976 }
1977 EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1978
1979 static void __ip6_flush_pending_frames(struct sock *sk,
1980 struct sk_buff_head *queue,
1981 struct inet_cork_full *cork,
1982 struct inet6_cork *v6_cork)
1983 {
1984 struct sk_buff *skb;
1985
1986 while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1987 if (skb_dst(skb))
1988 IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1989 IPSTATS_MIB_OUTDISCARDS);
1990 kfree_skb(skb);
1991 }
1992
1993 ip6_cork_release(cork, v6_cork);
1994 }
1995
1996 void ip6_flush_pending_frames(struct sock *sk)
1997 {
1998 __ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1999 &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
2000 }
2001 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
2002
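/* One-shot variant of the append/push interface: cork, append the data and
 * build the resulting skb on a private queue without touching
 * sk_write_queue.
 */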
2003 struct sk_buff *ip6_make_skb(struct sock *sk,
2004 int getfrag(void *from, char *to, int offset,
2005 int len, int odd, struct sk_buff *skb),
2006 void *from, int length, int transhdrlen,
2007 struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
2008 struct rt6_info *rt, unsigned int flags,
2009 struct inet_cork_full *cork)
2010 {
2011 struct inet6_cork v6_cork;
2012 struct sk_buff_head queue;
2013 int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
2014 int err;
2015
2016 if (flags & MSG_PROBE)
2017 return NULL;
2018
2019 __skb_queue_head_init(&queue);
2020
2021 cork->base.flags = 0;
2022 cork->base.addr = 0;
2023 cork->base.opt = NULL;
2024 cork->base.dst = NULL;
2025 v6_cork.opt = NULL;
2026 err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6);
2027 if (err) {
2028 ip6_cork_release(cork, &v6_cork);
2029 return ERR_PTR(err);
2030 }
2031 if (ipc6->dontfrag < 0)
2032 ipc6->dontfrag = inet6_sk(sk)->dontfrag;
2033
2034 err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork,
2035 &current->task_frag, getfrag, from,
2036 length + exthdrlen, transhdrlen + exthdrlen,
2037 flags, ipc6);
2038 if (err) {
2039 __ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
2040 return ERR_PTR(err);
2041 }
2042
2043 return __ip6_make_skb(sk, &queue, cork, &v6_cork);
2044 }
2045