// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2016 Thomas Graf <tgraf@tgraf.ch>
 */
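
/* BPF-based lightweight tunnel (LWT) encapsulation: a route can attach BPF
 * programs that run at input, output and/or xmit time for packets using that
 * route. Such a route is typically configured with iproute2, roughly like the
 * following (illustrative only; object file, section name, headroom and exact
 * syntax depend on the iproute2 version and the loaded program):
 *
 *   ip route add 10.0.0.0/24 encap bpf xmit obj prog.o section xmit \
 *       headroom 20 dev eth0
 */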

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/types.h>
#include <linux/bpf.h>
#include <net/lwtunnel.h>
#include <net/gre.h>
#include <net/ip6_route.h>
#include <net/ipv6_stubs.h>

struct bpf_lwt_prog {
	struct bpf_prog *prog;
	char *name;
};

struct bpf_lwt {
	struct bpf_lwt_prog in;
	struct bpf_lwt_prog out;
	struct bpf_lwt_prog xmit;
	int family;
};

#define MAX_PROG_NAME 256

static inline struct bpf_lwt *bpf_lwt_lwtunnel(struct lwtunnel_state *lwt)
{
	return (struct bpf_lwt *)lwt->data;
}

#define NO_REDIRECT false
#define CAN_REDIRECT true

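/* Run one LWT BPF program on @skb and translate its return code: BPF_OK and
 * BPF_LWT_REROUTE are passed through, BPF_REDIRECT is only honoured when
 * @can_redirect is set, and BPF_DROP or any unknown code frees the skb and
 * returns an error.
 */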
static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,
		       struct dst_entry *dst, bool can_redirect)
{
	int ret;

	/* Preempt disable and BH disable are needed to protect per-cpu
	 * redirect_info between BPF prog and skb_do_redirect().
	 */
	preempt_disable();
	local_bh_disable();
	bpf_compute_data_pointers(skb);
	ret = bpf_prog_run_save_cb(lwt->prog, skb);

	switch (ret) {
	case BPF_OK:
	case BPF_LWT_REROUTE:
		break;

	case BPF_REDIRECT:
		if (unlikely(!can_redirect)) {
			pr_warn_once("Illegal redirect return code in prog %s\n",
				     lwt->name ? : "<unknown>");
			ret = BPF_OK;
		} else {
			skb_reset_mac_header(skb);
			skb_do_redirect(skb);
			ret = BPF_REDIRECT;
		}
		break;

	case BPF_DROP:
		kfree_skb(skb);
		ret = -EPERM;
		break;

	default:
		pr_warn_once("bpf-lwt: Illegal return value %u, expect packet loss\n", ret);
		kfree_skb(skb);
		ret = -EINVAL;
		break;
	}

	local_bh_enable();
	preempt_enable();

	return ret;
}

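/* A BPF_LWT_REROUTE verdict on input: drop the current dst, redo the
 * IPv4/IPv6 input route lookup for the (possibly rewritten) headers, then
 * continue with dst_input().
 */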
static int bpf_lwt_input_reroute(struct sk_buff *skb)
{
	int err = -EINVAL;

	if (skb->protocol == htons(ETH_P_IP)) {
		struct net_device *dev = skb_dst(skb)->dev;
		struct iphdr *iph = ip_hdr(skb);

		dev_hold(dev);
		skb_dst_drop(skb);
		err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
					   iph->tos, dev);
		dev_put(dev);
	} else if (skb->protocol == htons(ETH_P_IPV6)) {
		skb_dst_drop(skb);
		err = ipv6_stub->ipv6_route_input(skb);
	} else {
		err = -EAFNOSUPPORT;
	}

	if (err)
		goto err;
	return dst_input(skb);

err:
	kfree_skb(skb);
	return err;
}

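/* lwtunnel input hook: run the "in" program (redirects are not allowed here)
 * and, unless it requested a reroute, hand the skb back to the original input
 * handler of the dst.
 */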
static int bpf_input(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct bpf_lwt *bpf;
	int ret;

	bpf = bpf_lwt_lwtunnel(dst->lwtstate);
	if (bpf->in.prog) {
		ret = run_lwt_bpf(skb, &bpf->in, dst, NO_REDIRECT);
		if (ret < 0)
			return ret;
		if (ret == BPF_LWT_REROUTE)
			return bpf_lwt_input_reroute(skb);
	}

	if (unlikely(!dst->lwtstate->orig_input)) {
		kfree_skb(skb);
		return -EINVAL;
	}

	return dst->lwtstate->orig_input(skb);
}

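/* lwtunnel output hook: run the "out" program (again without redirect) and
 * then continue with the original output handler of the dst.
 */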
static int bpf_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct bpf_lwt *bpf;
	int ret;

	bpf = bpf_lwt_lwtunnel(dst->lwtstate);
	if (bpf->out.prog) {
		ret = run_lwt_bpf(skb, &bpf->out, dst, NO_REDIRECT);
		if (ret < 0)
			return ret;
	}

	if (unlikely(!dst->lwtstate->orig_output)) {
		pr_warn_once("orig_output not set on dst for prog %s\n",
			     bpf->out.name);
		kfree_skb(skb);
		return -EINVAL;
	}

	return dst->lwtstate->orig_output(net, sk, skb);
}

static int xmit_check_hhlen(struct sk_buff *skb, int hh_len)
{
	if (skb_headroom(skb) < hh_len) {
		int nhead = HH_DATA_ALIGN(hh_len - skb_headroom(skb));

		if (pskb_expand_head(skb, nhead, 0, GFP_ATOMIC))
			return -ENOMEM;
	}

	return 0;
}

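/* A BPF_LWT_REROUTE verdict at xmit time: the program changed the outer
 * headers (e.g. via bpf_lwt_push_encap()), so redo a full output route lookup
 * based on the new IPv4/IPv6 header, attach the resulting dst and transmit
 * through dst_output().
 */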
static int bpf_lwt_xmit_reroute(struct sk_buff *skb)
{
	struct net_device *l3mdev = l3mdev_master_dev_rcu(skb_dst(skb)->dev);
	int oif = l3mdev ? l3mdev->ifindex : 0;
	struct dst_entry *dst = NULL;
	int err = -EAFNOSUPPORT;
	struct sock *sk;
	struct net *net;
	bool ipv4;

	if (skb->protocol == htons(ETH_P_IP))
		ipv4 = true;
	else if (skb->protocol == htons(ETH_P_IPV6))
		ipv4 = false;
	else
		goto err;

	sk = sk_to_full_sk(skb->sk);
	if (sk) {
		if (sk->sk_bound_dev_if)
			oif = sk->sk_bound_dev_if;
		net = sock_net(sk);
	} else {
		net = dev_net(skb_dst(skb)->dev);
	}

	if (ipv4) {
		struct iphdr *iph = ip_hdr(skb);
		struct flowi4 fl4 = {};
		struct rtable *rt;

		fl4.flowi4_oif = oif;
		fl4.flowi4_mark = skb->mark;
		fl4.flowi4_uid = sock_net_uid(net, sk);
		fl4.flowi4_tos = RT_TOS(iph->tos);
		fl4.flowi4_flags = FLOWI_FLAG_ANYSRC;
		fl4.flowi4_proto = iph->protocol;
		fl4.daddr = iph->daddr;
		fl4.saddr = iph->saddr;

		rt = ip_route_output_key(net, &fl4);
		if (IS_ERR(rt)) {
			err = PTR_ERR(rt);
			goto err;
		}
		dst = &rt->dst;
	} else {
		struct ipv6hdr *iph6 = ipv6_hdr(skb);
		struct flowi6 fl6 = {};

		fl6.flowi6_oif = oif;
		fl6.flowi6_mark = skb->mark;
		fl6.flowi6_uid = sock_net_uid(net, sk);
		fl6.flowlabel = ip6_flowinfo(iph6);
		fl6.flowi6_proto = iph6->nexthdr;
		fl6.daddr = iph6->daddr;
		fl6.saddr = iph6->saddr;

		dst = ipv6_stub->ipv6_dst_lookup_flow(net, skb->sk, &fl6, NULL);
		if (IS_ERR(dst)) {
			err = PTR_ERR(dst);
			goto err;
		}
	}
	if (unlikely(dst->error)) {
		err = dst->error;
		dst_release(dst);
		goto err;
	}

	/* Although skb header was reserved in bpf_lwt_push_ip_encap(), it
	 * was done for the previous dst, so we are doing it here again, in
	 * case the new dst needs much more space. The call below is a noop
	 * if there is enough header space in skb.
	 */
	err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev));
	if (unlikely(err))
		goto err;

	skb_dst_drop(skb);
	skb_dst_set(skb, dst);

	err = dst_output(dev_net(skb_dst(skb)->dev), skb->sk, skb);
	if (unlikely(err))
		return net_xmit_errno(err);

	/* ip[6]_finish_output2 understand LWTUNNEL_XMIT_DONE */
	return LWTUNNEL_XMIT_DONE;

err:
	kfree_skb(skb);
	return err;
}

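/* lwtunnel xmit hook: run the "xmit" program with redirects allowed. BPF_OK
 * continues regular transmission (the program must not have changed
 * skb->protocol, and headroom is grown if the pushed headers consumed it),
 * BPF_REDIRECT means the program already sent the packet, and
 * BPF_LWT_REROUTE triggers the fresh route lookup above.
 */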
static int bpf_xmit(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct bpf_lwt *bpf;

	bpf = bpf_lwt_lwtunnel(dst->lwtstate);
	if (bpf->xmit.prog) {
		int hh_len = dst->dev->hard_header_len;
		__be16 proto = skb->protocol;
		int ret;

		ret = run_lwt_bpf(skb, &bpf->xmit, dst, CAN_REDIRECT);
		switch (ret) {
		case BPF_OK:
			/* If the header changed, e.g. via bpf_lwt_push_encap,
			 * BPF_LWT_REROUTE below should have been used if the
			 * protocol was also changed.
			 */
			if (skb->protocol != proto) {
				kfree_skb(skb);
				return -EINVAL;
			}
			/* If the header was expanded, headroom might be too
			 * small for L2 header to come, expand as needed.
			 */
			ret = xmit_check_hhlen(skb, hh_len);
			if (unlikely(ret))
				return ret;

			return LWTUNNEL_XMIT_CONTINUE;
		case BPF_REDIRECT:
			return LWTUNNEL_XMIT_DONE;
		case BPF_LWT_REROUTE:
			return bpf_lwt_xmit_reroute(skb);
		default:
			return ret;
		}
	}

	return LWTUNNEL_XMIT_CONTINUE;
}

static void bpf_lwt_prog_destroy(struct bpf_lwt_prog *prog)
{
	if (prog->prog)
		bpf_prog_put(prog->prog);

	kfree(prog->name);
}

static void bpf_destroy_state(struct lwtunnel_state *lwt)
{
	struct bpf_lwt *bpf = bpf_lwt_lwtunnel(lwt);

	bpf_lwt_prog_destroy(&bpf->in);
	bpf_lwt_prog_destroy(&bpf->out);
	bpf_lwt_prog_destroy(&bpf->xmit);
}

static const struct nla_policy bpf_prog_policy[LWT_BPF_PROG_MAX + 1] = {
	[LWT_BPF_PROG_FD]   = { .type = NLA_U32, },
	[LWT_BPF_PROG_NAME] = { .type = NLA_NUL_STRING,
				.len = MAX_PROG_NAME },
};

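/* Parse one nested LWT_BPF_{IN,OUT,XMIT} attribute: duplicate the program
 * name and take a reference on the BPF program behind the given fd. On
 * failure the caller cleans up any partial state via bpf_destroy_state().
 */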
static int bpf_parse_prog(struct nlattr *attr, struct bpf_lwt_prog *prog,
			  enum bpf_prog_type type)
{
	struct nlattr *tb[LWT_BPF_PROG_MAX + 1];
	struct bpf_prog *p;
	int ret;
	u32 fd;

	ret = nla_parse_nested_deprecated(tb, LWT_BPF_PROG_MAX, attr,
					  bpf_prog_policy, NULL);
	if (ret < 0)
		return ret;

	if (!tb[LWT_BPF_PROG_FD] || !tb[LWT_BPF_PROG_NAME])
		return -EINVAL;

	prog->name = nla_memdup(tb[LWT_BPF_PROG_NAME], GFP_ATOMIC);
	if (!prog->name)
		return -ENOMEM;

	fd = nla_get_u32(tb[LWT_BPF_PROG_FD]);
	p = bpf_prog_get_type(fd, type);
	if (IS_ERR(p))
		return PTR_ERR(p);

	prog->prog = p;

	return 0;
}

static const struct nla_policy bpf_nl_policy[LWT_BPF_MAX + 1] = {
	[LWT_BPF_IN]		= { .type = NLA_NESTED, },
	[LWT_BPF_OUT]		= { .type = NLA_NESTED, },
	[LWT_BPF_XMIT]		= { .type = NLA_NESTED, },
	[LWT_BPF_XMIT_HEADROOM]	= { .type = NLA_U32 },
};

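/* Build the lwtunnel state for a route using BPF encapsulation: parse the
 * optional in/out/xmit programs and the xmit headroom from the netlink
 * attributes and record them in a struct bpf_lwt. At least one program must
 * be supplied.
 */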
static int bpf_build_state(struct nlattr *nla,
			   unsigned int family, const void *cfg,
			   struct lwtunnel_state **ts,
			   struct netlink_ext_ack *extack)
{
	struct nlattr *tb[LWT_BPF_MAX + 1];
	struct lwtunnel_state *newts;
	struct bpf_lwt *bpf;
	int ret;

	if (family != AF_INET && family != AF_INET6)
		return -EAFNOSUPPORT;

	ret = nla_parse_nested_deprecated(tb, LWT_BPF_MAX, nla, bpf_nl_policy,
					  extack);
	if (ret < 0)
		return ret;

	if (!tb[LWT_BPF_IN] && !tb[LWT_BPF_OUT] && !tb[LWT_BPF_XMIT])
		return -EINVAL;

	newts = lwtunnel_state_alloc(sizeof(*bpf));
	if (!newts)
		return -ENOMEM;

	newts->type = LWTUNNEL_ENCAP_BPF;
	bpf = bpf_lwt_lwtunnel(newts);

	if (tb[LWT_BPF_IN]) {
		newts->flags |= LWTUNNEL_STATE_INPUT_REDIRECT;
		ret = bpf_parse_prog(tb[LWT_BPF_IN], &bpf->in,
				     BPF_PROG_TYPE_LWT_IN);
		if (ret < 0)
			goto errout;
	}

	if (tb[LWT_BPF_OUT]) {
		newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT;
		ret = bpf_parse_prog(tb[LWT_BPF_OUT], &bpf->out,
				     BPF_PROG_TYPE_LWT_OUT);
		if (ret < 0)
			goto errout;
	}

	if (tb[LWT_BPF_XMIT]) {
		newts->flags |= LWTUNNEL_STATE_XMIT_REDIRECT;
		ret = bpf_parse_prog(tb[LWT_BPF_XMIT], &bpf->xmit,
				     BPF_PROG_TYPE_LWT_XMIT);
		if (ret < 0)
			goto errout;
	}

	if (tb[LWT_BPF_XMIT_HEADROOM]) {
		u32 headroom = nla_get_u32(tb[LWT_BPF_XMIT_HEADROOM]);

		if (headroom > LWT_BPF_MAX_HEADROOM) {
			ret = -ERANGE;
			goto errout;
		}

		newts->headroom = headroom;
	}

	bpf->family = family;
	*ts = newts;

	return 0;

errout:
	bpf_destroy_state(newts);
	kfree(newts);
	return ret;
}

static int bpf_fill_lwt_prog(struct sk_buff *skb, int attr,
			     struct bpf_lwt_prog *prog)
{
	struct nlattr *nest;

	if (!prog->prog)
		return 0;

	nest = nla_nest_start_noflag(skb, attr);
	if (!nest)
		return -EMSGSIZE;

	if (prog->name &&
	    nla_put_string(skb, LWT_BPF_PROG_NAME, prog->name))
		return -EMSGSIZE;

	return nla_nest_end(skb, nest);
}

static int bpf_fill_encap_info(struct sk_buff *skb, struct lwtunnel_state *lwt)
{
	struct bpf_lwt *bpf = bpf_lwt_lwtunnel(lwt);

	if (bpf_fill_lwt_prog(skb, LWT_BPF_IN, &bpf->in) < 0 ||
	    bpf_fill_lwt_prog(skb, LWT_BPF_OUT, &bpf->out) < 0 ||
	    bpf_fill_lwt_prog(skb, LWT_BPF_XMIT, &bpf->xmit) < 0)
		return -EMSGSIZE;

	return 0;
}

static int bpf_encap_nlsize(struct lwtunnel_state *lwtstate)
{
	int nest_len = nla_total_size(sizeof(struct nlattr)) +
		       nla_total_size(MAX_PROG_NAME) + /* LWT_BPF_PROG_NAME */
		       0;

	return nest_len + /* LWT_BPF_IN */
	       nest_len + /* LWT_BPF_OUT */
	       nest_len + /* LWT_BPF_XMIT */
	       0;
}

static int bpf_lwt_prog_cmp(struct bpf_lwt_prog *a, struct bpf_lwt_prog *b)
{
	/* FIXME:
	 * The LWT state is currently rebuilt for delete requests which
	 * results in a new bpf_prog instance. Comparing names for now.
	 */
	if (!a->name && !b->name)
		return 0;

	if (!a->name || !b->name)
		return 1;

	return strcmp(a->name, b->name);
}

static int bpf_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b)
{
	struct bpf_lwt *a_bpf = bpf_lwt_lwtunnel(a);
	struct bpf_lwt *b_bpf = bpf_lwt_lwtunnel(b);

	return bpf_lwt_prog_cmp(&a_bpf->in, &b_bpf->in) ||
	       bpf_lwt_prog_cmp(&a_bpf->out, &b_bpf->out) ||
	       bpf_lwt_prog_cmp(&a_bpf->xmit, &b_bpf->xmit);
}

static const struct lwtunnel_encap_ops bpf_encap_ops = {
	.build_state	= bpf_build_state,
	.destroy_state	= bpf_destroy_state,
	.input		= bpf_input,
	.output		= bpf_output,
	.xmit		= bpf_xmit,
	.fill_encap	= bpf_fill_encap_info,
	.get_encap_size = bpf_encap_nlsize,
	.cmp_encap	= bpf_encap_cmp,
	.owner		= THIS_MODULE,
};

static int handle_gso_type(struct sk_buff *skb, unsigned int gso_type,
			   int encap_len)
{
	struct skb_shared_info *shinfo = skb_shinfo(skb);

	gso_type |= SKB_GSO_DODGY;
	shinfo->gso_type |= gso_type;
	skb_decrease_gso_size(shinfo, encap_len);
	shinfo->gso_segs = 0;
	return 0;
}

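/* Fix up GSO metadata after an outer IPv4/IPv6 header was pushed in front of
 * a GSO skb: only plain TCP GSO is supported, and the tunnel GSO type (GRE,
 * UDP or IP-in-IP, with or without checksum) is chosen by looking at the
 * protocol that follows the pushed IP header.
 */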
static int handle_gso_encap(struct sk_buff *skb, bool ipv4, int encap_len)
{
	int next_hdr_offset;
	void *next_hdr;
	__u8 protocol;

	/* SCTP and UDP_L4 gso need more nuanced handling than what
	 * handle_gso_type() does above: skb_decrease_gso_size() is not enough.
	 * So at the moment only TCP GSO packets are let through.
	 */
	if (!(skb_shinfo(skb)->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
		return -ENOTSUPP;

	if (ipv4) {
		protocol = ip_hdr(skb)->protocol;
		next_hdr_offset = sizeof(struct iphdr);
		next_hdr = skb_network_header(skb) + next_hdr_offset;
	} else {
		protocol = ipv6_hdr(skb)->nexthdr;
		next_hdr_offset = sizeof(struct ipv6hdr);
		next_hdr = skb_network_header(skb) + next_hdr_offset;
	}

	switch (protocol) {
	case IPPROTO_GRE:
		next_hdr_offset += sizeof(struct gre_base_hdr);
		if (next_hdr_offset > encap_len)
			return -EINVAL;

		if (((struct gre_base_hdr *)next_hdr)->flags & GRE_CSUM)
			return handle_gso_type(skb, SKB_GSO_GRE_CSUM,
					       encap_len);
		return handle_gso_type(skb, SKB_GSO_GRE, encap_len);

	case IPPROTO_UDP:
		next_hdr_offset += sizeof(struct udphdr);
		if (next_hdr_offset > encap_len)
			return -EINVAL;

		if (((struct udphdr *)next_hdr)->check)
			return handle_gso_type(skb, SKB_GSO_UDP_TUNNEL_CSUM,
					       encap_len);
		return handle_gso_type(skb, SKB_GSO_UDP_TUNNEL, encap_len);

	case IPPROTO_IP:
	case IPPROTO_IPV6:
		if (ipv4)
			return handle_gso_type(skb, SKB_GSO_IPXIP4, encap_len);
		else
			return handle_gso_type(skb, SKB_GSO_IPXIP6, encap_len);

	default:
		return -EPROTONOSUPPORT;
	}
}

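/* Back-end for the bpf_lwt_push_encap() helper with BPF_LWT_ENCAP_IP:
 * validate the outer IPv4/IPv6 header supplied by the program, make room for
 * it (plus L2 headroom), push it in front of the packet, update the skb
 * metadata and checksums, and adjust GSO state if needed. This is typically
 * followed by a BPF_LWT_REROUTE verdict from the program so that the stack
 * performs a route lookup for the new outer header.
 */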
int bpf_lwt_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len, bool ingress)
{
	struct iphdr *iph;
	bool ipv4;
	int err;

	if (unlikely(len < sizeof(struct iphdr) || len > LWT_BPF_MAX_HEADROOM))
		return -EINVAL;

	/* validate protocol and length */
	iph = (struct iphdr *)hdr;
	if (iph->version == 4) {
		ipv4 = true;
		if (unlikely(len < iph->ihl * 4))
			return -EINVAL;
	} else if (iph->version == 6) {
		ipv4 = false;
		if (unlikely(len < sizeof(struct ipv6hdr)))
			return -EINVAL;
	} else {
		return -EINVAL;
	}

	if (ingress)
		err = skb_cow_head(skb, len + skb->mac_len);
	else
		err = skb_cow_head(skb,
				   len + LL_RESERVED_SPACE(skb_dst(skb)->dev));
	if (unlikely(err))
		return err;

	/* push the encap headers and fix pointers */
	skb_reset_inner_headers(skb);
	skb_reset_inner_mac_header(skb);  /* mac header is not yet set */
	skb_set_inner_protocol(skb, skb->protocol);
	skb->encapsulation = 1;
	skb_push(skb, len);
	if (ingress)
		skb_postpush_rcsum(skb, iph, len);
	skb_reset_network_header(skb);
	memcpy(skb_network_header(skb), hdr, len);
	bpf_compute_data_pointers(skb);
	skb_clear_hash(skb);

	if (ipv4) {
		skb->protocol = htons(ETH_P_IP);
		iph = ip_hdr(skb);

		if (!iph->check)
			iph->check = ip_fast_csum((unsigned char *)iph,
						  iph->ihl);
	} else {
		skb->protocol = htons(ETH_P_IPV6);
	}

	if (skb_is_gso(skb))
		return handle_gso_encap(skb, ipv4, len);

	return 0;
}

static int __init bpf_lwt_init(void)
{
	return lwtunnel_encap_add_ops(&bpf_encap_ops, LWTUNNEL_ENCAP_BPF);
}

subsys_initcall(bpf_lwt_init)