1  /*
2   * Linux Socket Filter - Kernel level socket filtering
3   *
4   * Based on the design of the Berkeley Packet Filter. The new
5   * internal format has been designed by PLUMgrid:
6   *
7   *	Copyright (c) 2011 - 2014 PLUMgrid, http://plumgrid.com
8   *
9   * Authors:
10   *
11   *	Jay Schulist <jschlst@samba.org>
12   *	Alexei Starovoitov <ast@plumgrid.com>
13   *	Daniel Borkmann <dborkman@redhat.com>
14   *
15   * This program is free software; you can redistribute it and/or
16   * modify it under the terms of the GNU General Public License
17   * as published by the Free Software Foundation; either version
18   * 2 of the License, or (at your option) any later version.
19   *
20   * Andi Kleen - Fix a few bad bugs and races.
21   * Kris Katterjohn - Added many additional checks in bpf_check_classic()
22   */
23  
24  #include <linux/module.h>
25  #include <linux/types.h>
26  #include <linux/mm.h>
27  #include <linux/fcntl.h>
28  #include <linux/socket.h>
29  #include <linux/sock_diag.h>
30  #include <linux/in.h>
31  #include <linux/inet.h>
32  #include <linux/netdevice.h>
33  #include <linux/if_packet.h>
34  #include <linux/gfp.h>
35  #include <net/ip.h>
36  #include <net/protocol.h>
37  #include <net/netlink.h>
38  #include <linux/skbuff.h>
39  #include <net/sock.h>
40  #include <net/flow_dissector.h>
41  #include <linux/errno.h>
42  #include <linux/timer.h>
43  #include <asm/uaccess.h>
44  #include <asm/unaligned.h>
45  #include <linux/filter.h>
46  #include <linux/ratelimit.h>
47  #include <linux/seccomp.h>
48  #include <linux/if_vlan.h>
49  #include <linux/bpf.h>
50  #include <net/sch_generic.h>
51  #include <net/cls_cgroup.h>
52  #include <net/dst_metadata.h>
53  #include <net/dst.h>
54  #include <net/sock_reuseport.h>
55  
56  /**
57   *	sk_filter_trim_cap - run a packet through a socket filter
58   *	@sk: sock associated with &sk_buff
59   *	@skb: buffer to filter
60   *	@cap: limit on how short the eBPF program may trim the packet
61   *
62  * Run the eBPF program and then cut skb->data to the correct size returned
63  * by the program. If pkt_len is 0 we toss the packet. If skb->len is smaller
64  * than pkt_len we keep the whole skb->data. This is the socket level
65  * wrapper to BPF_PROG_RUN. It returns 0 if the packet should
66  * be accepted or -EPERM if the packet should be tossed.
67   *
68   */
69  int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap)
70  {
71  	int err;
72  	struct sk_filter *filter;
73  
74  	/*
75  	 * If the skb was allocated from pfmemalloc reserves, only
76  	 * allow SOCK_MEMALLOC sockets to use it as this socket is
77  	 * helping free memory
78  	 */
79  	if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC))
80  		return -ENOMEM;
81  
82  	err = BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb);
83  	if (err)
84  		return err;
85  
86  	err = security_sock_rcv_skb(sk, skb);
87  	if (err)
88  		return err;
89  
90  	rcu_read_lock();
91  	filter = rcu_dereference(sk->sk_filter);
92  	if (filter) {
93  		struct sock *save_sk = skb->sk;
94  		unsigned int pkt_len;
95  
96  		skb->sk = sk;
97  		pkt_len = bpf_prog_run_save_cb(filter->prog, skb);
98  		skb->sk = save_sk;
99  		err = pkt_len ? pskb_trim(skb, max(cap, pkt_len)) : -EPERM;
100  	}
101  	rcu_read_unlock();
102  
103  	return err;
104  }
105  EXPORT_SYMBOL(sk_filter_trim_cap);
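
/* Usage sketch (illustrative, not part of this file): receive paths normally
 * go through the plain sk_filter() wrapper, which passes cap == 1, or cap the
 * trimming at their own header size:
 *
 *	if (sk_filter_trim_cap(sk, skb, 1))
 *		goto drop;	// -EPERM: the program asked us to toss the skb
 *
 *	// a datagram protocol may refuse to trim below its header, e.g.:
 *	if (sk_filter_trim_cap(sk, skb, sizeof(struct udphdr)))
 *		goto drop;
 */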
106  
107  BPF_CALL_1(__skb_get_pay_offset, struct sk_buff *, skb)
108  {
109  	return skb_get_poff(skb);
110  }
111  
112  BPF_CALL_3(__skb_get_nlattr, struct sk_buff *, skb, u32, a, u32, x)
113  {
114  	struct nlattr *nla;
115  
116  	if (skb_is_nonlinear(skb))
117  		return 0;
118  
119  	if (skb->len < sizeof(struct nlattr))
120  		return 0;
121  
122  	if (a > skb->len - sizeof(struct nlattr))
123  		return 0;
124  
125  	nla = nla_find((struct nlattr *) &skb->data[a], skb->len - a, x);
126  	if (nla)
127  		return (void *) nla - (void *) skb->data;
128  
129  	return 0;
130  }
131  
132  BPF_CALL_3(__skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x)
133  {
134  	struct nlattr *nla;
135  
136  	if (skb_is_nonlinear(skb))
137  		return 0;
138  
139  	if (skb->len < sizeof(struct nlattr))
140  		return 0;
141  
142  	if (a > skb->len - sizeof(struct nlattr))
143  		return 0;
144  
145  	nla = (struct nlattr *) &skb->data[a];
146  	if (nla->nla_len > skb->len - a)
147  		return 0;
148  
149  	nla = nla_find_nested(nla, x);
150  	if (nla)
151  		return (void *) nla - (void *) skb->data;
152  
153  	return 0;
154  }
155  
156  BPF_CALL_0(__get_raw_cpu_id)
157  {
158  	return raw_smp_processor_id();
159  }
160  
161  static const struct bpf_func_proto bpf_get_raw_smp_processor_id_proto = {
162  	.func		= __get_raw_cpu_id,
163  	.gpl_only	= false,
164  	.ret_type	= RET_INTEGER,
165  };
166  
167  static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg,
168  			      struct bpf_insn *insn_buf)
169  {
170  	struct bpf_insn *insn = insn_buf;
171  
172  	switch (skb_field) {
173  	case SKF_AD_MARK:
174  		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4);
175  
176  		*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
177  				      offsetof(struct sk_buff, mark));
178  		break;
179  
180  	case SKF_AD_PKTTYPE:
181  		*insn++ = BPF_LDX_MEM(BPF_B, dst_reg, src_reg, PKT_TYPE_OFFSET());
182  		*insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, PKT_TYPE_MAX);
183  #ifdef __BIG_ENDIAN_BITFIELD
184  		*insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, 5);
185  #endif
186  		break;
187  
188  	case SKF_AD_QUEUE:
189  		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, queue_mapping) != 2);
190  
191  		*insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
192  				      offsetof(struct sk_buff, queue_mapping));
193  		break;
194  
195  	case SKF_AD_VLAN_TAG:
196  	case SKF_AD_VLAN_TAG_PRESENT:
197  		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_tci) != 2);
198  		BUILD_BUG_ON(VLAN_TAG_PRESENT != 0x1000);
199  
200  		/* dst_reg = *(u16 *) (src_reg + offsetof(vlan_tci)) */
201  		*insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
202  				      offsetof(struct sk_buff, vlan_tci));
203  		if (skb_field == SKF_AD_VLAN_TAG) {
204  			*insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg,
205  						~VLAN_TAG_PRESENT);
206  		} else {
207  			/* dst_reg >>= 12 */
208  			*insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, 12);
209  			/* dst_reg &= 1 */
210  			*insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, 1);
211  		}
212  		break;
213  	}
214  
215  	return insn - insn_buf;
216  }
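
/* Example (illustrative): with the mapping above, the classic ancillary load
 *
 *	BPF_STMT(BPF_LD | BPF_W | BPF_ABS, SKF_AD_OFF + SKF_AD_MARK)
 *
 * becomes a single eBPF load relative to the skb context:
 *
 *	BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX,
 *		    offsetof(struct sk_buff, mark))
 */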
217  
218  static bool convert_bpf_extensions(struct sock_filter *fp,
219  				   struct bpf_insn **insnp)
220  {
221  	struct bpf_insn *insn = *insnp;
222  	u32 cnt;
223  
224  	switch (fp->k) {
225  	case SKF_AD_OFF + SKF_AD_PROTOCOL:
226  		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2);
227  
228  		/* A = *(u16 *) (CTX + offsetof(protocol)) */
229  		*insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX,
230  				      offsetof(struct sk_buff, protocol));
231  		/* A = ntohs(A) [emitting a nop or swap16] */
232  		*insn = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, 16);
233  		break;
234  
235  	case SKF_AD_OFF + SKF_AD_PKTTYPE:
236  		cnt = convert_skb_access(SKF_AD_PKTTYPE, BPF_REG_A, BPF_REG_CTX, insn);
237  		insn += cnt - 1;
238  		break;
239  
240  	case SKF_AD_OFF + SKF_AD_IFINDEX:
241  	case SKF_AD_OFF + SKF_AD_HATYPE:
242  		BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4);
243  		BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, type) != 2);
244  
245  		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev),
246  				      BPF_REG_TMP, BPF_REG_CTX,
247  				      offsetof(struct sk_buff, dev));
248  		/* if (tmp != 0) goto pc + 1 */
249  		*insn++ = BPF_JMP_IMM(BPF_JNE, BPF_REG_TMP, 0, 1);
250  		*insn++ = BPF_EXIT_INSN();
251  		if (fp->k == SKF_AD_OFF + SKF_AD_IFINDEX)
252  			*insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_TMP,
253  					    offsetof(struct net_device, ifindex));
254  		else
255  			*insn = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_TMP,
256  					    offsetof(struct net_device, type));
257  		break;
258  
259  	case SKF_AD_OFF + SKF_AD_MARK:
260  		cnt = convert_skb_access(SKF_AD_MARK, BPF_REG_A, BPF_REG_CTX, insn);
261  		insn += cnt - 1;
262  		break;
263  
264  	case SKF_AD_OFF + SKF_AD_RXHASH:
265  		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4);
266  
267  		*insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX,
268  				    offsetof(struct sk_buff, hash));
269  		break;
270  
271  	case SKF_AD_OFF + SKF_AD_QUEUE:
272  		cnt = convert_skb_access(SKF_AD_QUEUE, BPF_REG_A, BPF_REG_CTX, insn);
273  		insn += cnt - 1;
274  		break;
275  
276  	case SKF_AD_OFF + SKF_AD_VLAN_TAG:
277  		cnt = convert_skb_access(SKF_AD_VLAN_TAG,
278  					 BPF_REG_A, BPF_REG_CTX, insn);
279  		insn += cnt - 1;
280  		break;
281  
282  	case SKF_AD_OFF + SKF_AD_VLAN_TAG_PRESENT:
283  		cnt = convert_skb_access(SKF_AD_VLAN_TAG_PRESENT,
284  					 BPF_REG_A, BPF_REG_CTX, insn);
285  		insn += cnt - 1;
286  		break;
287  
288  	case SKF_AD_OFF + SKF_AD_VLAN_TPID:
289  		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_proto) != 2);
290  
291  		/* A = *(u16 *) (CTX + offsetof(vlan_proto)) */
292  		*insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX,
293  				      offsetof(struct sk_buff, vlan_proto));
294  		/* A = ntohs(A) [emitting a nop or swap16] */
295  		*insn = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, 16);
296  		break;
297  
298  	case SKF_AD_OFF + SKF_AD_PAY_OFFSET:
299  	case SKF_AD_OFF + SKF_AD_NLATTR:
300  	case SKF_AD_OFF + SKF_AD_NLATTR_NEST:
301  	case SKF_AD_OFF + SKF_AD_CPU:
302  	case SKF_AD_OFF + SKF_AD_RANDOM:
303  		/* arg1 = CTX */
304  		*insn++ = BPF_MOV64_REG(BPF_REG_ARG1, BPF_REG_CTX);
305  		/* arg2 = A */
306  		*insn++ = BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_A);
307  		/* arg3 = X */
308  		*insn++ = BPF_MOV64_REG(BPF_REG_ARG3, BPF_REG_X);
309  		/* Emit call(arg1=CTX, arg2=A, arg3=X) */
310  		switch (fp->k) {
311  		case SKF_AD_OFF + SKF_AD_PAY_OFFSET:
312  			*insn = BPF_EMIT_CALL(__skb_get_pay_offset);
313  			break;
314  		case SKF_AD_OFF + SKF_AD_NLATTR:
315  			*insn = BPF_EMIT_CALL(__skb_get_nlattr);
316  			break;
317  		case SKF_AD_OFF + SKF_AD_NLATTR_NEST:
318  			*insn = BPF_EMIT_CALL(__skb_get_nlattr_nest);
319  			break;
320  		case SKF_AD_OFF + SKF_AD_CPU:
321  			*insn = BPF_EMIT_CALL(__get_raw_cpu_id);
322  			break;
323  		case SKF_AD_OFF + SKF_AD_RANDOM:
324  			*insn = BPF_EMIT_CALL(bpf_user_rnd_u32);
325  			bpf_user_rnd_init_once();
326  			break;
327  		}
328  		break;
329  
330  	case SKF_AD_OFF + SKF_AD_ALU_XOR_X:
331  		/* A ^= X */
332  		*insn = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_X);
333  		break;
334  
335  	default:
336  		/* This is just a dummy call to avoid letting the compiler
337  		 * evict __bpf_call_base() as an optimization. Placed here
338  		 * where no-one bothers.
339  		 */
340  		BUG_ON(__bpf_call_base(0, 0, 0, 0, 0) != 0);
341  		return false;
342  	}
343  
344  	*insnp = insn;
345  	return true;
346  }
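
/* Example (illustrative): extensions without a direct field mapping, such as
 * SKF_AD_OFF + SKF_AD_CPU, are rewritten above into a helper call sequence:
 *
 *	arg1 = CTX, arg2 = A, arg3 = X;
 *	call __get_raw_cpu_id;		// result returned in R0 (mapped to A)
 */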
347  
348  /**
349   *	bpf_convert_filter - convert filter program
350   *	@prog: the user passed filter program
351   *	@len: the length of the user passed filter program
352   *	@new_prog: buffer where converted program will be stored
353   *	@new_len: pointer to store length of converted program
354   *
355   * Remap 'sock_filter' style BPF instruction set to 'sock_filter_ext' style.
356   * Conversion workflow:
357   *
358   * 1) First pass for calculating the new program length:
359   *   bpf_convert_filter(old_prog, old_len, NULL, &new_len)
360   *
361   * 2) Second call to do the remapping, which internally runs in two
362   *    passes: the first finds the new jump offsets, the second remaps:
363   *   new_prog = kmalloc(sizeof(struct bpf_insn) * new_len);
364   *   bpf_convert_filter(old_prog, old_len, new_prog, &new_len);
365   */
366  static int bpf_convert_filter(struct sock_filter *prog, int len,
367  			      struct bpf_insn *new_prog, int *new_len)
368  {
369  	int new_flen = 0, pass = 0, target, i;
370  	struct bpf_insn *new_insn;
371  	struct sock_filter *fp;
372  	int *addrs = NULL;
373  	u8 bpf_src;
374  
375  	BUILD_BUG_ON(BPF_MEMWORDS * sizeof(u32) > MAX_BPF_STACK);
376  	BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG);
377  
378  	if (len <= 0 || len > BPF_MAXINSNS)
379  		return -EINVAL;
380  
381  	if (new_prog) {
382  		addrs = kcalloc(len, sizeof(*addrs),
383  				GFP_KERNEL | __GFP_NOWARN);
384  		if (!addrs)
385  			return -ENOMEM;
386  	}
387  
388  do_pass:
389  	new_insn = new_prog;
390  	fp = prog;
391  
392  	/* Classic BPF related prologue emission. */
393  	if (new_insn) {
394  		/* Classic BPF expects A and X to be reset first. These need
395  		 * to be guaranteed to be the first two instructions.
396  		 */
397  		*new_insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_A, BPF_REG_A);
398  		*new_insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_X, BPF_REG_X);
399  
400  		/* All programs must keep CTX in callee saved BPF_REG_CTX.
401  		 * In the eBPF case it's done by the compiler; here we need to
402  		 * do this ourselves. Initial CTX is present in BPF_REG_ARG1.
403  		 */
404  		*new_insn++ = BPF_MOV64_REG(BPF_REG_CTX, BPF_REG_ARG1);
405  	} else {
406  		new_insn += 3;
407  	}
408  
409  	for (i = 0; i < len; fp++, i++) {
410  		struct bpf_insn tmp_insns[6] = { };
411  		struct bpf_insn *insn = tmp_insns;
412  
413  		if (addrs)
414  			addrs[i] = new_insn - new_prog;
415  
416  		switch (fp->code) {
417  		/* All arithmetic insns and skb loads map as-is. */
418  		case BPF_ALU | BPF_ADD | BPF_X:
419  		case BPF_ALU | BPF_ADD | BPF_K:
420  		case BPF_ALU | BPF_SUB | BPF_X:
421  		case BPF_ALU | BPF_SUB | BPF_K:
422  		case BPF_ALU | BPF_AND | BPF_X:
423  		case BPF_ALU | BPF_AND | BPF_K:
424  		case BPF_ALU | BPF_OR | BPF_X:
425  		case BPF_ALU | BPF_OR | BPF_K:
426  		case BPF_ALU | BPF_LSH | BPF_X:
427  		case BPF_ALU | BPF_LSH | BPF_K:
428  		case BPF_ALU | BPF_RSH | BPF_X:
429  		case BPF_ALU | BPF_RSH | BPF_K:
430  		case BPF_ALU | BPF_XOR | BPF_X:
431  		case BPF_ALU | BPF_XOR | BPF_K:
432  		case BPF_ALU | BPF_MUL | BPF_X:
433  		case BPF_ALU | BPF_MUL | BPF_K:
434  		case BPF_ALU | BPF_DIV | BPF_X:
435  		case BPF_ALU | BPF_DIV | BPF_K:
436  		case BPF_ALU | BPF_MOD | BPF_X:
437  		case BPF_ALU | BPF_MOD | BPF_K:
438  		case BPF_ALU | BPF_NEG:
439  		case BPF_LD | BPF_ABS | BPF_W:
440  		case BPF_LD | BPF_ABS | BPF_H:
441  		case BPF_LD | BPF_ABS | BPF_B:
442  		case BPF_LD | BPF_IND | BPF_W:
443  		case BPF_LD | BPF_IND | BPF_H:
444  		case BPF_LD | BPF_IND | BPF_B:
445  			/* Check for overloaded BPF extension and
446  			 * directly convert it if found, otherwise
447  			 * just move on with mapping.
448  			 */
449  			if (BPF_CLASS(fp->code) == BPF_LD &&
450  			    BPF_MODE(fp->code) == BPF_ABS &&
451  			    convert_bpf_extensions(fp, &insn))
452  				break;
453  
454  			if (fp->code == (BPF_ALU | BPF_DIV | BPF_X) ||
455  			    fp->code == (BPF_ALU | BPF_MOD | BPF_X))
456  				*insn++ = BPF_MOV32_REG(BPF_REG_X, BPF_REG_X);
457  
458  			*insn = BPF_RAW_INSN(fp->code, BPF_REG_A, BPF_REG_X, 0, fp->k);
459  			break;
460  
461  		/* Jump transformation cannot use BPF block macros
462  		 * everywhere as offset calculation and target updates
463  		 * require a bit more work than the rest, i.e. jump
464  		 * opcodes map as-is, but offsets need adjustment.
465  		 */
466  
467  #define BPF_EMIT_JMP							\
468  	do {								\
469  		if (target >= len || target < 0)			\
470  			goto err;					\
471  		insn->off = addrs ? addrs[target] - addrs[i] - 1 : 0;	\
472  		/* Adjust pc relative offset for 2nd or 3rd insn. */	\
473  		insn->off -= insn - tmp_insns;				\
474  	} while (0)
475  
476  		case BPF_JMP | BPF_JA:
477  			target = i + fp->k + 1;
478  			insn->code = fp->code;
479  			BPF_EMIT_JMP;
480  			break;
481  
482  		case BPF_JMP | BPF_JEQ | BPF_K:
483  		case BPF_JMP | BPF_JEQ | BPF_X:
484  		case BPF_JMP | BPF_JSET | BPF_K:
485  		case BPF_JMP | BPF_JSET | BPF_X:
486  		case BPF_JMP | BPF_JGT | BPF_K:
487  		case BPF_JMP | BPF_JGT | BPF_X:
488  		case BPF_JMP | BPF_JGE | BPF_K:
489  		case BPF_JMP | BPF_JGE | BPF_X:
490  			if (BPF_SRC(fp->code) == BPF_K && (int) fp->k < 0) {
491  				/* BPF immediates are signed, zero extend
492  				 * immediate into tmp register and use it
493  				 * in compare insn.
494  				 */
495  				*insn++ = BPF_MOV32_IMM(BPF_REG_TMP, fp->k);
496  
497  				insn->dst_reg = BPF_REG_A;
498  				insn->src_reg = BPF_REG_TMP;
499  				bpf_src = BPF_X;
500  			} else {
501  				insn->dst_reg = BPF_REG_A;
502  				insn->imm = fp->k;
503  				bpf_src = BPF_SRC(fp->code);
504  				insn->src_reg = bpf_src == BPF_X ? BPF_REG_X : 0;
505  			}
506  
507  			/* Common case where 'jump_false' is next insn. */
508  			if (fp->jf == 0) {
509  				insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src;
510  				target = i + fp->jt + 1;
511  				BPF_EMIT_JMP;
512  				break;
513  			}
514  
515  			/* Convert JEQ into JNE when 'jump_true' is next insn. */
516  			if (fp->jt == 0 && BPF_OP(fp->code) == BPF_JEQ) {
517  				insn->code = BPF_JMP | BPF_JNE | bpf_src;
518  				target = i + fp->jf + 1;
519  				BPF_EMIT_JMP;
520  				break;
521  			}
522  
523  			/* Other jumps are mapped into two insns: Jxx and JA. */
524  			target = i + fp->jt + 1;
525  			insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src;
526  			BPF_EMIT_JMP;
527  			insn++;
528  
529  			insn->code = BPF_JMP | BPF_JA;
530  			target = i + fp->jf + 1;
531  			BPF_EMIT_JMP;
532  			break;
533  
534  		/* ldxb 4 * ([14] & 0xf) is remapped into 6 insns. */
535  		case BPF_LDX | BPF_MSH | BPF_B:
536  			/* tmp = A */
537  			*insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_A);
538  			/* A = BPF_R0 = *(u8 *) (skb->data + K) */
539  			*insn++ = BPF_LD_ABS(BPF_B, fp->k);
540  			/* A &= 0xf */
541  			*insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, 0xf);
542  			/* A <<= 2 */
543  			*insn++ = BPF_ALU32_IMM(BPF_LSH, BPF_REG_A, 2);
544  			/* X = A */
545  			*insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A);
546  			/* A = tmp */
547  			*insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_TMP);
548  			break;
549  
550  		/* RET_K is remapped into 2 insns. RET_A case doesn't need an
551  		 * extra mov as BPF_REG_0 is already mapped into BPF_REG_A.
552  		 */
553  		case BPF_RET | BPF_A:
554  		case BPF_RET | BPF_K:
555  			if (BPF_RVAL(fp->code) == BPF_K)
556  				*insn++ = BPF_MOV32_RAW(BPF_K, BPF_REG_0,
557  							0, fp->k);
558  			*insn = BPF_EXIT_INSN();
559  			break;
560  
561  		/* Store to stack. */
562  		case BPF_ST:
563  		case BPF_STX:
564  			*insn = BPF_STX_MEM(BPF_W, BPF_REG_FP, BPF_CLASS(fp->code) ==
565  					    BPF_ST ? BPF_REG_A : BPF_REG_X,
566  					    -(BPF_MEMWORDS - fp->k) * 4);
567  			break;
568  
569  		/* Load from stack. */
570  		case BPF_LD | BPF_MEM:
571  		case BPF_LDX | BPF_MEM:
572  			*insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD  ?
573  					    BPF_REG_A : BPF_REG_X, BPF_REG_FP,
574  					    -(BPF_MEMWORDS - fp->k) * 4);
575  			break;
576  
577  		/* A = K or X = K */
578  		case BPF_LD | BPF_IMM:
579  		case BPF_LDX | BPF_IMM:
580  			*insn = BPF_MOV32_IMM(BPF_CLASS(fp->code) == BPF_LD ?
581  					      BPF_REG_A : BPF_REG_X, fp->k);
582  			break;
583  
584  		/* X = A */
585  		case BPF_MISC | BPF_TAX:
586  			*insn = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A);
587  			break;
588  
589  		/* A = X */
590  		case BPF_MISC | BPF_TXA:
591  			*insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_X);
592  			break;
593  
594  		/* A = skb->len or X = skb->len */
595  		case BPF_LD | BPF_W | BPF_LEN:
596  		case BPF_LDX | BPF_W | BPF_LEN:
597  			*insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD ?
598  					    BPF_REG_A : BPF_REG_X, BPF_REG_CTX,
599  					    offsetof(struct sk_buff, len));
600  			break;
601  
602  		/* Access seccomp_data fields. */
603  		case BPF_LDX | BPF_ABS | BPF_W:
604  			/* A = *(u32 *) (ctx + K) */
605  			*insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX, fp->k);
606  			break;
607  
608  		/* Unknown instruction. */
609  		default:
610  			goto err;
611  		}
612  
613  		insn++;
614  		if (new_prog)
615  			memcpy(new_insn, tmp_insns,
616  			       sizeof(*insn) * (insn - tmp_insns));
617  		new_insn += insn - tmp_insns;
618  	}
619  
620  	if (!new_prog) {
621  		/* Only calculating new length. */
622  		*new_len = new_insn - new_prog;
623  		return 0;
624  	}
625  
626  	pass++;
627  	if (new_flen != new_insn - new_prog) {
628  		new_flen = new_insn - new_prog;
629  		if (pass > 2)
630  			goto err;
631  		goto do_pass;
632  	}
633  
634  	kfree(addrs);
635  	BUG_ON(*new_len != new_flen);
636  	return 0;
637  err:
638  	kfree(addrs);
639  	return -EINVAL;
640  }
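
/* Worked example (illustrative) for the BPF_EMIT_JMP offset math above: if
 * classic insn i targets insn t, and the converted image has addrs[i] == 10
 * and addrs[t] == 14, then the first eBPF insn emitted for i gets
 * off = 14 - 10 - 1 = 3, since eBPF jump offsets are relative to the
 * instruction following the jump.
 */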
641  
642  /* Security:
643   *
644   * As we don't want to clear the mem[] array for each packet going through
645   * __bpf_prog_run(), we check that a filter loaded by the user never tries
646   * to read a cell that was not previously written, and we check all branches
647   * to be sure a malicious user doesn't try to abuse us.
648   */
649  static int check_load_and_stores(const struct sock_filter *filter, int flen)
650  {
651  	u16 *masks, memvalid = 0; /* One bit per cell, 16 cells */
652  	int pc, ret = 0;
653  
654  	BUILD_BUG_ON(BPF_MEMWORDS > 16);
655  
656  	masks = kmalloc_array(flen, sizeof(*masks), GFP_KERNEL);
657  	if (!masks)
658  		return -ENOMEM;
659  
660  	memset(masks, 0xff, flen * sizeof(*masks));
661  
662  	for (pc = 0; pc < flen; pc++) {
663  		memvalid &= masks[pc];
664  
665  		switch (filter[pc].code) {
666  		case BPF_ST:
667  		case BPF_STX:
668  			memvalid |= (1 << filter[pc].k);
669  			break;
670  		case BPF_LD | BPF_MEM:
671  		case BPF_LDX | BPF_MEM:
672  			if (!(memvalid & (1 << filter[pc].k))) {
673  				ret = -EINVAL;
674  				goto error;
675  			}
676  			break;
677  		case BPF_JMP | BPF_JA:
678  			/* A jump must set masks on target */
679  			masks[pc + 1 + filter[pc].k] &= memvalid;
680  			memvalid = ~0;
681  			break;
682  		case BPF_JMP | BPF_JEQ | BPF_K:
683  		case BPF_JMP | BPF_JEQ | BPF_X:
684  		case BPF_JMP | BPF_JGE | BPF_K:
685  		case BPF_JMP | BPF_JGE | BPF_X:
686  		case BPF_JMP | BPF_JGT | BPF_K:
687  		case BPF_JMP | BPF_JGT | BPF_X:
688  		case BPF_JMP | BPF_JSET | BPF_K:
689  		case BPF_JMP | BPF_JSET | BPF_X:
690  			/* A jump must set masks on targets */
691  			masks[pc + 1 + filter[pc].jt] &= memvalid;
692  			masks[pc + 1 + filter[pc].jf] &= memvalid;
693  			memvalid = ~0;
694  			break;
695  		}
696  	}
697  error:
698  	kfree(masks);
699  	return ret;
700  }
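
/* Example (illustrative): this classic snippet is rejected by the check
 * above, because M[3] is read before any BPF_ST/BPF_STX has written it:
 *
 *	BPF_STMT(BPF_LD | BPF_MEM, 3),
 *	BPF_STMT(BPF_RET | BPF_A, 0),
 */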
701  
702  static bool chk_code_allowed(u16 code_to_probe)
703  {
704  	static const bool codes[] = {
705  		/* 32 bit ALU operations */
706  		[BPF_ALU | BPF_ADD | BPF_K] = true,
707  		[BPF_ALU | BPF_ADD | BPF_X] = true,
708  		[BPF_ALU | BPF_SUB | BPF_K] = true,
709  		[BPF_ALU | BPF_SUB | BPF_X] = true,
710  		[BPF_ALU | BPF_MUL | BPF_K] = true,
711  		[BPF_ALU | BPF_MUL | BPF_X] = true,
712  		[BPF_ALU | BPF_DIV | BPF_K] = true,
713  		[BPF_ALU | BPF_DIV | BPF_X] = true,
714  		[BPF_ALU | BPF_MOD | BPF_K] = true,
715  		[BPF_ALU | BPF_MOD | BPF_X] = true,
716  		[BPF_ALU | BPF_AND | BPF_K] = true,
717  		[BPF_ALU | BPF_AND | BPF_X] = true,
718  		[BPF_ALU | BPF_OR | BPF_K] = true,
719  		[BPF_ALU | BPF_OR | BPF_X] = true,
720  		[BPF_ALU | BPF_XOR | BPF_K] = true,
721  		[BPF_ALU | BPF_XOR | BPF_X] = true,
722  		[BPF_ALU | BPF_LSH | BPF_K] = true,
723  		[BPF_ALU | BPF_LSH | BPF_X] = true,
724  		[BPF_ALU | BPF_RSH | BPF_K] = true,
725  		[BPF_ALU | BPF_RSH | BPF_X] = true,
726  		[BPF_ALU | BPF_NEG] = true,
727  		/* Load instructions */
728  		[BPF_LD | BPF_W | BPF_ABS] = true,
729  		[BPF_LD | BPF_H | BPF_ABS] = true,
730  		[BPF_LD | BPF_B | BPF_ABS] = true,
731  		[BPF_LD | BPF_W | BPF_LEN] = true,
732  		[BPF_LD | BPF_W | BPF_IND] = true,
733  		[BPF_LD | BPF_H | BPF_IND] = true,
734  		[BPF_LD | BPF_B | BPF_IND] = true,
735  		[BPF_LD | BPF_IMM] = true,
736  		[BPF_LD | BPF_MEM] = true,
737  		[BPF_LDX | BPF_W | BPF_LEN] = true,
738  		[BPF_LDX | BPF_B | BPF_MSH] = true,
739  		[BPF_LDX | BPF_IMM] = true,
740  		[BPF_LDX | BPF_MEM] = true,
741  		/* Store instructions */
742  		[BPF_ST] = true,
743  		[BPF_STX] = true,
744  		/* Misc instructions */
745  		[BPF_MISC | BPF_TAX] = true,
746  		[BPF_MISC | BPF_TXA] = true,
747  		/* Return instructions */
748  		[BPF_RET | BPF_K] = true,
749  		[BPF_RET | BPF_A] = true,
750  		/* Jump instructions */
751  		[BPF_JMP | BPF_JA] = true,
752  		[BPF_JMP | BPF_JEQ | BPF_K] = true,
753  		[BPF_JMP | BPF_JEQ | BPF_X] = true,
754  		[BPF_JMP | BPF_JGE | BPF_K] = true,
755  		[BPF_JMP | BPF_JGE | BPF_X] = true,
756  		[BPF_JMP | BPF_JGT | BPF_K] = true,
757  		[BPF_JMP | BPF_JGT | BPF_X] = true,
758  		[BPF_JMP | BPF_JSET | BPF_K] = true,
759  		[BPF_JMP | BPF_JSET | BPF_X] = true,
760  	};
761  
762  	if (code_to_probe >= ARRAY_SIZE(codes))
763  		return false;
764  
765  	return codes[code_to_probe];
766  }
767  
768  static bool bpf_check_basics_ok(const struct sock_filter *filter,
769  				unsigned int flen)
770  {
771  	if (filter == NULL)
772  		return false;
773  	if (flen == 0 || flen > BPF_MAXINSNS)
774  		return false;
775  
776  	return true;
777  }
778  
779  /**
780   *	bpf_check_classic - verify socket filter code
781   *	@filter: filter to verify
782   *	@flen: length of filter
783   *
784   * Check the user's filter code. If we let some ugly
785   * filter code slip through kaboom! The filter must contain
786   * no references or jumps that are out of range, no illegal
787   * instructions, and must end with a RET instruction.
788   *
789   * All jumps are forward as they are not signed.
790   *
791   * Returns 0 if the rule set is legal or -EINVAL if not.
792   */
793  static int bpf_check_classic(const struct sock_filter *filter,
794  			     unsigned int flen)
795  {
796  	bool anc_found;
797  	int pc;
798  
799  	/* Check the filter code now */
800  	for (pc = 0; pc < flen; pc++) {
801  		const struct sock_filter *ftest = &filter[pc];
802  
803  		/* May we actually operate on this code? */
804  		if (!chk_code_allowed(ftest->code))
805  			return -EINVAL;
806  
807  		/* Some instructions need special checks */
808  		switch (ftest->code) {
809  		case BPF_ALU | BPF_DIV | BPF_K:
810  		case BPF_ALU | BPF_MOD | BPF_K:
811  			/* Check for division by zero */
812  			if (ftest->k == 0)
813  				return -EINVAL;
814  			break;
815  		case BPF_ALU | BPF_LSH | BPF_K:
816  		case BPF_ALU | BPF_RSH | BPF_K:
817  			if (ftest->k >= 32)
818  				return -EINVAL;
819  			break;
820  		case BPF_LD | BPF_MEM:
821  		case BPF_LDX | BPF_MEM:
822  		case BPF_ST:
823  		case BPF_STX:
824  			/* Check for invalid memory addresses */
825  			if (ftest->k >= BPF_MEMWORDS)
826  				return -EINVAL;
827  			break;
828  		case BPF_JMP | BPF_JA:
829  			/* Note, the large ftest->k might cause loops.
830  			 * Compare this with conditional jumps below,
831  			 * where offsets are limited. --ANK (981016)
832  			 */
833  			if (ftest->k >= (unsigned int)(flen - pc - 1))
834  				return -EINVAL;
835  			break;
836  		case BPF_JMP | BPF_JEQ | BPF_K:
837  		case BPF_JMP | BPF_JEQ | BPF_X:
838  		case BPF_JMP | BPF_JGE | BPF_K:
839  		case BPF_JMP | BPF_JGE | BPF_X:
840  		case BPF_JMP | BPF_JGT | BPF_K:
841  		case BPF_JMP | BPF_JGT | BPF_X:
842  		case BPF_JMP | BPF_JSET | BPF_K:
843  		case BPF_JMP | BPF_JSET | BPF_X:
844  			/* Both conditionals must be safe */
845  			if (pc + ftest->jt + 1 >= flen ||
846  			    pc + ftest->jf + 1 >= flen)
847  				return -EINVAL;
848  			break;
849  		case BPF_LD | BPF_W | BPF_ABS:
850  		case BPF_LD | BPF_H | BPF_ABS:
851  		case BPF_LD | BPF_B | BPF_ABS:
852  			anc_found = false;
853  			if (bpf_anc_helper(ftest) & BPF_ANC)
854  				anc_found = true;
855  			/* Ancillary operation unknown or unsupported */
856  			if (anc_found == false && ftest->k >= SKF_AD_OFF)
857  				return -EINVAL;
858  		}
859  	}
860  
861  	/* Last instruction must be a RET code */
862  	switch (filter[flen - 1].code) {
863  	case BPF_RET | BPF_K:
864  	case BPF_RET | BPF_A:
865  		return check_load_and_stores(filter, flen);
866  	}
867  
868  	return -EINVAL;
869  }
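
/* Example (illustrative): a lone conditional jump such as
 *
 *	BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0x0800, 4, 5),
 *
 * fails the check above, both because its jump targets land past the end of
 * the program and because the filter does not end with a RET instruction.
 */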
870  
871  static int bpf_prog_store_orig_filter(struct bpf_prog *fp,
872  				      const struct sock_fprog *fprog)
873  {
874  	unsigned int fsize = bpf_classic_proglen(fprog);
875  	struct sock_fprog_kern *fkprog;
876  
877  	fp->orig_prog = kmalloc(sizeof(*fkprog), GFP_KERNEL);
878  	if (!fp->orig_prog)
879  		return -ENOMEM;
880  
881  	fkprog = fp->orig_prog;
882  	fkprog->len = fprog->len;
883  
884  	fkprog->filter = kmemdup(fp->insns, fsize,
885  				 GFP_KERNEL | __GFP_NOWARN);
886  	if (!fkprog->filter) {
887  		kfree(fp->orig_prog);
888  		return -ENOMEM;
889  	}
890  
891  	return 0;
892  }
893  
894  static void bpf_release_orig_filter(struct bpf_prog *fp)
895  {
896  	struct sock_fprog_kern *fprog = fp->orig_prog;
897  
898  	if (fprog) {
899  		kfree(fprog->filter);
900  		kfree(fprog);
901  	}
902  }
903  
904  static void __bpf_prog_release(struct bpf_prog *prog)
905  {
906  	if (prog->type == BPF_PROG_TYPE_SOCKET_FILTER) {
907  		bpf_prog_put(prog);
908  	} else {
909  		bpf_release_orig_filter(prog);
910  		bpf_prog_free(prog);
911  	}
912  }
913  
914  static void __sk_filter_release(struct sk_filter *fp)
915  {
916  	__bpf_prog_release(fp->prog);
917  	kfree(fp);
918  }
919  
920  /**
921   * 	sk_filter_release_rcu - Release a socket filter by rcu_head
922   *	@rcu: rcu_head that contains the sk_filter to free
923   */
924  static void sk_filter_release_rcu(struct rcu_head *rcu)
925  {
926  	struct sk_filter *fp = container_of(rcu, struct sk_filter, rcu);
927  
928  	__sk_filter_release(fp);
929  }
930  
931  /**
932   *	sk_filter_release - release a socket filter
933   *	@fp: filter to remove
934   *
935   *	Remove a filter from a socket and release its resources.
936   */
937  static void sk_filter_release(struct sk_filter *fp)
938  {
939  	if (atomic_dec_and_test(&fp->refcnt))
940  		call_rcu(&fp->rcu, sk_filter_release_rcu);
941  }
942  
943  void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp)
944  {
945  	u32 filter_size = bpf_prog_size(fp->prog->len);
946  
947  	atomic_sub(filter_size, &sk->sk_omem_alloc);
948  	sk_filter_release(fp);
949  }
950  
951  /* try to charge the socket memory if there is space available
952   * return true on success
953   */
954  bool sk_filter_charge(struct sock *sk, struct sk_filter *fp)
955  {
956  	u32 filter_size = bpf_prog_size(fp->prog->len);
957  
958  	/* same check as in sock_kmalloc() */
959  	if (filter_size <= sysctl_optmem_max &&
960  	    atomic_read(&sk->sk_omem_alloc) + filter_size < sysctl_optmem_max) {
961  		atomic_inc(&fp->refcnt);
962  		atomic_add(filter_size, &sk->sk_omem_alloc);
963  		return true;
964  	}
965  	return false;
966  }
967  
968  static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp)
969  {
970  	struct sock_filter *old_prog;
971  	struct bpf_prog *old_fp;
972  	int err, new_len, old_len = fp->len;
973  
974  	/* We are free to overwrite insns et al right here as they
975  	 * won't be used internally anymore at this point in time,
976  	 * after the migration to the internal BPF instruction
977  	 * representation.
978  	 */
979  	BUILD_BUG_ON(sizeof(struct sock_filter) !=
980  		     sizeof(struct bpf_insn));
981  
982  	/* Conversion cannot happen on overlapping memory areas,
983  	 * so we need to keep the user BPF around until the 2nd
984  	 * pass. At this time, the user BPF is stored in fp->insns.
985  	 */
986  	old_prog = kmemdup(fp->insns, old_len * sizeof(struct sock_filter),
987  			   GFP_KERNEL | __GFP_NOWARN);
988  	if (!old_prog) {
989  		err = -ENOMEM;
990  		goto out_err;
991  	}
992  
993  	/* 1st pass: calculate the new program length. */
994  	err = bpf_convert_filter(old_prog, old_len, NULL, &new_len);
995  	if (err)
996  		goto out_err_free;
997  
998  	/* Expand fp for appending the new filter representation. */
999  	old_fp = fp;
1000  	fp = bpf_prog_realloc(old_fp, bpf_prog_size(new_len), 0);
1001  	if (!fp) {
1002  		/* The old_fp is still around in case we couldn't
1003  		 * allocate new memory, so uncharge on that one.
1004  		 */
1005  		fp = old_fp;
1006  		err = -ENOMEM;
1007  		goto out_err_free;
1008  	}
1009  
1010  	fp->len = new_len;
1011  
1012  	/* 2nd pass: remap sock_filter insns into bpf_insn insns. */
1013  	err = bpf_convert_filter(old_prog, old_len, fp->insnsi, &new_len);
1014  	if (err)
1015  		/* 2nd bpf_convert_filter() can fail only if it fails
1016  		 * to allocate memory, remapping must succeed. Note,
1017  		 * that at this time old_fp has already been released
1018  		 * by krealloc().
1019  		 */
1020  		goto out_err_free;
1021  
1022  	fp = bpf_prog_select_runtime(fp, &err);
1023  	if (err)
1024  		goto out_err_free;
1025  
1026  	kfree(old_prog);
1027  	return fp;
1028  
1029  out_err_free:
1030  	kfree(old_prog);
1031  out_err:
1032  	__bpf_prog_release(fp);
1033  	return ERR_PTR(err);
1034  }
1035  
1036  static struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp,
1037  					   bpf_aux_classic_check_t trans)
1038  {
1039  	int err;
1040  
1041  	fp->bpf_func = NULL;
1042  	fp->jited = 0;
1043  
1044  	err = bpf_check_classic(fp->insns, fp->len);
1045  	if (err) {
1046  		__bpf_prog_release(fp);
1047  		return ERR_PTR(err);
1048  	}
1049  
1050  	/* There might be additional checks and transformations
1051  	 * needed on classic filters, f.e. in case of seccomp.
1052  	 */
1053  	if (trans) {
1054  		err = trans(fp->insns, fp->len);
1055  		if (err) {
1056  			__bpf_prog_release(fp);
1057  			return ERR_PTR(err);
1058  		}
1059  	}
1060  
1061  	/* Probe if we can JIT compile the filter and if so, do
1062  	 * the compilation of the filter.
1063  	 */
1064  	bpf_jit_compile(fp);
1065  
1066  	/* JIT compiler couldn't process this filter, so do the
1067  	 * internal BPF translation for the optimized interpreter.
1068  	 */
1069  	if (!fp->jited)
1070  		fp = bpf_migrate_filter(fp);
1071  
1072  	return fp;
1073  }
1074  
1075  /**
1076   *	bpf_prog_create - create an unattached filter
1077   *	@pfp: the unattached filter that is created
1078   *	@fprog: the filter program
1079   *
1080   * Create a filter independent of any socket. We first run some
1081   * sanity checks on it to make sure it does not explode on us later.
1082   * If an error occurs or there is insufficient memory for the filter
1083   * a negative errno code is returned. On success the return is zero.
1084   */
1085  int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog)
1086  {
1087  	unsigned int fsize = bpf_classic_proglen(fprog);
1088  	struct bpf_prog *fp;
1089  
1090  	/* Make sure new filter is there and in the right amounts. */
1091  	if (!bpf_check_basics_ok(fprog->filter, fprog->len))
1092  		return -EINVAL;
1093  
1094  	fp = bpf_prog_alloc(bpf_prog_size(fprog->len), 0);
1095  	if (!fp)
1096  		return -ENOMEM;
1097  
1098  	memcpy(fp->insns, fprog->filter, fsize);
1099  
1100  	fp->len = fprog->len;
1101  	/* Since unattached filters are not copied back to user
1102  	 * space through sk_get_filter(), we do not need to hold
1103  	 * a copy here, and can spare ourselves the work.
1104  	 */
1105  	fp->orig_prog = NULL;
1106  
1107  	/* bpf_prepare_filter() already takes care of freeing
1108  	 * memory in case something goes wrong.
1109  	 */
1110  	fp = bpf_prepare_filter(fp, NULL);
1111  	if (IS_ERR(fp))
1112  		return PTR_ERR(fp);
1113  
1114  	*pfp = fp;
1115  	return 0;
1116  }
1117  EXPORT_SYMBOL_GPL(bpf_prog_create);
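
/* Usage sketch (illustrative; 'insns' is a hypothetical array of struct
 * sock_filter): in-kernel users wrap a classic filter in a sock_fprog_kern
 * and run the resulting program themselves:
 *
 *	struct sock_fprog_kern fprog = {
 *		.len	= ARRAY_SIZE(insns),
 *		.filter	= insns,
 *	};
 *	struct bpf_prog *prog;
 *	u32 res;
 *
 *	if (bpf_prog_create(&prog, &fprog))
 *		return -EINVAL;
 *	res = BPF_PROG_RUN(prog, skb);
 *	bpf_prog_destroy(prog);
 */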
1118  
1119  /**
1120   *	bpf_prog_create_from_user - create an unattached filter from user buffer
1121   *	@pfp: the unattached filter that is created
1122   *	@fprog: the filter program
1123   *	@trans: post-classic verifier transformation handler
1124   *	@save_orig: save classic BPF program
1125   *
1126   * This function effectively does the same as bpf_prog_create(), only
1127   * that it builds up its insns buffer from user space provided buffer.
1128   * It also allows for passing a bpf_aux_classic_check_t handler.
1129   */
1130  int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog,
1131  			      bpf_aux_classic_check_t trans, bool save_orig)
1132  {
1133  	unsigned int fsize = bpf_classic_proglen(fprog);
1134  	struct bpf_prog *fp;
1135  	int err;
1136  
1137  	/* Make sure new filter is there and in the right amounts. */
1138  	if (!bpf_check_basics_ok(fprog->filter, fprog->len))
1139  		return -EINVAL;
1140  
1141  	fp = bpf_prog_alloc(bpf_prog_size(fprog->len), 0);
1142  	if (!fp)
1143  		return -ENOMEM;
1144  
1145  	if (copy_from_user(fp->insns, fprog->filter, fsize)) {
1146  		__bpf_prog_free(fp);
1147  		return -EFAULT;
1148  	}
1149  
1150  	fp->len = fprog->len;
1151  	fp->orig_prog = NULL;
1152  
1153  	if (save_orig) {
1154  		err = bpf_prog_store_orig_filter(fp, fprog);
1155  		if (err) {
1156  			__bpf_prog_free(fp);
1157  			return -ENOMEM;
1158  		}
1159  	}
1160  
1161  	/* bpf_prepare_filter() already takes care of freeing
1162  	 * memory in case something goes wrong.
1163  	 */
1164  	fp = bpf_prepare_filter(fp, trans);
1165  	if (IS_ERR(fp))
1166  		return PTR_ERR(fp);
1167  
1168  	*pfp = fp;
1169  	return 0;
1170  }
1171  EXPORT_SYMBOL_GPL(bpf_prog_create_from_user);
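
/* Usage sketch (illustrative; the handler name is hypothetical): a caller
 * such as seccomp passes a user-provided fprog plus an extra classic-BPF
 * checker that runs after bpf_check_classic() but before conversion:
 *
 *	err = bpf_prog_create_from_user(&prog, &fprog, my_extra_check, true);
 */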
1172  
1173  void bpf_prog_destroy(struct bpf_prog *fp)
1174  {
1175  	__bpf_prog_release(fp);
1176  }
1177  EXPORT_SYMBOL_GPL(bpf_prog_destroy);
1178  
1179  static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk)
1180  {
1181  	struct sk_filter *fp, *old_fp;
1182  
1183  	fp = kmalloc(sizeof(*fp), GFP_KERNEL);
1184  	if (!fp)
1185  		return -ENOMEM;
1186  
1187  	fp->prog = prog;
1188  	atomic_set(&fp->refcnt, 0);
1189  
1190  	if (!sk_filter_charge(sk, fp)) {
1191  		kfree(fp);
1192  		return -ENOMEM;
1193  	}
1194  
1195  	old_fp = rcu_dereference_protected(sk->sk_filter,
1196  					   lockdep_sock_is_held(sk));
1197  	rcu_assign_pointer(sk->sk_filter, fp);
1198  
1199  	if (old_fp)
1200  		sk_filter_uncharge(sk, old_fp);
1201  
1202  	return 0;
1203  }
1204  
1205  static int __reuseport_attach_prog(struct bpf_prog *prog, struct sock *sk)
1206  {
1207  	struct bpf_prog *old_prog;
1208  	int err;
1209  
1210  	if (bpf_prog_size(prog->len) > sysctl_optmem_max)
1211  		return -ENOMEM;
1212  
1213  	if (sk_unhashed(sk) && sk->sk_reuseport) {
1214  		err = reuseport_alloc(sk);
1215  		if (err)
1216  			return err;
1217  	} else if (!rcu_access_pointer(sk->sk_reuseport_cb)) {
1218  		/* The socket wasn't bound with SO_REUSEPORT */
1219  		return -EINVAL;
1220  	}
1221  
1222  	old_prog = reuseport_attach_prog(sk, prog);
1223  	if (old_prog)
1224  		bpf_prog_destroy(old_prog);
1225  
1226  	return 0;
1227  }
1228  
1229  static
1230  struct bpf_prog *__get_filter(struct sock_fprog *fprog, struct sock *sk)
1231  {
1232  	unsigned int fsize = bpf_classic_proglen(fprog);
1233  	struct bpf_prog *prog;
1234  	int err;
1235  
1236  	if (sock_flag(sk, SOCK_FILTER_LOCKED))
1237  		return ERR_PTR(-EPERM);
1238  
1239  	/* Make sure new filter is there and in the right amounts. */
1240  	if (!bpf_check_basics_ok(fprog->filter, fprog->len))
1241  		return ERR_PTR(-EINVAL);
1242  
1243  	prog = bpf_prog_alloc(bpf_prog_size(fprog->len), 0);
1244  	if (!prog)
1245  		return ERR_PTR(-ENOMEM);
1246  
1247  	if (copy_from_user(prog->insns, fprog->filter, fsize)) {
1248  		__bpf_prog_free(prog);
1249  		return ERR_PTR(-EFAULT);
1250  	}
1251  
1252  	prog->len = fprog->len;
1253  
1254  	err = bpf_prog_store_orig_filter(prog, fprog);
1255  	if (err) {
1256  		__bpf_prog_free(prog);
1257  		return ERR_PTR(-ENOMEM);
1258  	}
1259  
1260  	/* bpf_prepare_filter() already takes care of freeing
1261  	 * memory in case something goes wrong.
1262  	 */
1263  	return bpf_prepare_filter(prog, NULL);
1264  }
1265  
1266  /**
1267   *	sk_attach_filter - attach a socket filter
1268   *	@fprog: the filter program
1269   *	@sk: the socket to use
1270   *
1271   * Attach the user's filter code. We first run some sanity checks on
1272   * it to make sure it does not explode on us later. If an error
1273   * occurs or there is insufficient memory for the filter a negative
1274   * errno code is returned. On success the return is zero.
1275   */
1276  int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
1277  {
1278  	struct bpf_prog *prog = __get_filter(fprog, sk);
1279  	int err;
1280  
1281  	if (IS_ERR(prog))
1282  		return PTR_ERR(prog);
1283  
1284  	err = __sk_attach_prog(prog, sk);
1285  	if (err < 0) {
1286  		__bpf_prog_release(prog);
1287  		return err;
1288  	}
1289  
1290  	return 0;
1291  }
1292  EXPORT_SYMBOL_GPL(sk_attach_filter);
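
/* Usage sketch (illustrative): this is roughly what the SO_ATTACH_FILTER
 * setsockopt path boils down to:
 *
 *	struct sock_fprog fprog;
 *
 *	if (copy_from_user(&fprog, optval, sizeof(fprog)))
 *		return -EFAULT;
 *	return sk_attach_filter(&fprog, sk);
 */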
1293  
1294  int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk)
1295  {
1296  	struct bpf_prog *prog = __get_filter(fprog, sk);
1297  	int err;
1298  
1299  	if (IS_ERR(prog))
1300  		return PTR_ERR(prog);
1301  
1302  	err = __reuseport_attach_prog(prog, sk);
1303  	if (err < 0) {
1304  		__bpf_prog_release(prog);
1305  		return err;
1306  	}
1307  
1308  	return 0;
1309  }
1310  
1311  static struct bpf_prog *__get_bpf(u32 ufd, struct sock *sk)
1312  {
1313  	if (sock_flag(sk, SOCK_FILTER_LOCKED))
1314  		return ERR_PTR(-EPERM);
1315  
1316  	return bpf_prog_get_type(ufd, BPF_PROG_TYPE_SOCKET_FILTER);
1317  }
1318  
1319  int sk_attach_bpf(u32 ufd, struct sock *sk)
1320  {
1321  	struct bpf_prog *prog = __get_bpf(ufd, sk);
1322  	int err;
1323  
1324  	if (IS_ERR(prog))
1325  		return PTR_ERR(prog);
1326  
1327  	err = __sk_attach_prog(prog, sk);
1328  	if (err < 0) {
1329  		bpf_prog_put(prog);
1330  		return err;
1331  	}
1332  
1333  	return 0;
1334  }
1335  
1336  int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk)
1337  {
1338  	struct bpf_prog *prog = __get_bpf(ufd, sk);
1339  	int err;
1340  
1341  	if (IS_ERR(prog))
1342  		return PTR_ERR(prog);
1343  
1344  	err = __reuseport_attach_prog(prog, sk);
1345  	if (err < 0) {
1346  		bpf_prog_put(prog);
1347  		return err;
1348  	}
1349  
1350  	return 0;
1351  }
1352  
1353  struct bpf_scratchpad {
1354  	union {
1355  		__be32 diff[MAX_BPF_STACK / sizeof(__be32)];
1356  		u8     buff[MAX_BPF_STACK];
1357  	};
1358  };
1359  
1360  static DEFINE_PER_CPU(struct bpf_scratchpad, bpf_sp);
1361  
1362  static inline int __bpf_try_make_writable(struct sk_buff *skb,
1363  					  unsigned int write_len)
1364  {
1365  	return skb_ensure_writable(skb, write_len);
1366  }
1367  
1368  static inline int bpf_try_make_writable(struct sk_buff *skb,
1369  					unsigned int write_len)
1370  {
1371  	int err = __bpf_try_make_writable(skb, write_len);
1372  
1373  	bpf_compute_data_end(skb);
1374  	return err;
1375  }
1376  
1377  static int bpf_try_make_head_writable(struct sk_buff *skb)
1378  {
1379  	return bpf_try_make_writable(skb, skb_headlen(skb));
1380  }
1381  
1382  static inline void bpf_push_mac_rcsum(struct sk_buff *skb)
1383  {
1384  	if (skb_at_tc_ingress(skb))
1385  		skb_postpush_rcsum(skb, skb_mac_header(skb), skb->mac_len);
1386  }
1387  
1388  static inline void bpf_pull_mac_rcsum(struct sk_buff *skb)
1389  {
1390  	if (skb_at_tc_ingress(skb))
1391  		skb_postpull_rcsum(skb, skb_mac_header(skb), skb->mac_len);
1392  }
1393  
1394  BPF_CALL_5(bpf_skb_store_bytes, struct sk_buff *, skb, u32, offset,
1395  	   const void *, from, u32, len, u64, flags)
1396  {
1397  	void *ptr;
1398  
1399  	if (unlikely(flags & ~(BPF_F_RECOMPUTE_CSUM | BPF_F_INVALIDATE_HASH)))
1400  		return -EINVAL;
1401  	if (unlikely(offset > 0xffff))
1402  		return -EFAULT;
1403  	if (unlikely(bpf_try_make_writable(skb, offset + len)))
1404  		return -EFAULT;
1405  
1406  	ptr = skb->data + offset;
1407  	if (flags & BPF_F_RECOMPUTE_CSUM)
1408  		__skb_postpull_rcsum(skb, ptr, len, offset);
1409  
1410  	memcpy(ptr, from, len);
1411  
1412  	if (flags & BPF_F_RECOMPUTE_CSUM)
1413  		__skb_postpush_rcsum(skb, ptr, len, offset);
1414  	if (flags & BPF_F_INVALIDATE_HASH)
1415  		skb_clear_hash(skb);
1416  
1417  	return 0;
1418  }
1419  
1420  static const struct bpf_func_proto bpf_skb_store_bytes_proto = {
1421  	.func		= bpf_skb_store_bytes,
1422  	.gpl_only	= false,
1423  	.ret_type	= RET_INTEGER,
1424  	.arg1_type	= ARG_PTR_TO_CTX,
1425  	.arg2_type	= ARG_ANYTHING,
1426  	.arg3_type	= ARG_PTR_TO_STACK,
1427  	.arg4_type	= ARG_CONST_STACK_SIZE,
1428  	.arg5_type	= ARG_ANYTHING,
1429  };
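
/* Helper usage sketch from the eBPF program side (illustrative; the offset
 * and value are hypothetical):
 *
 *	__u8 new_dst[ETH_ALEN] = { 0x02, 0, 0, 0, 0, 0x01 };
 *
 *	bpf_skb_store_bytes(skb, offsetof(struct ethhdr, h_dest),
 *			    new_dst, ETH_ALEN, 0);
 */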
1430  
1431  BPF_CALL_4(bpf_skb_load_bytes, const struct sk_buff *, skb, u32, offset,
1432  	   void *, to, u32, len)
1433  {
1434  	void *ptr;
1435  
1436  	if (unlikely(offset > 0xffff))
1437  		goto err_clear;
1438  
1439  	ptr = skb_header_pointer(skb, offset, len, to);
1440  	if (unlikely(!ptr))
1441  		goto err_clear;
1442  	if (ptr != to)
1443  		memcpy(to, ptr, len);
1444  
1445  	return 0;
1446  err_clear:
1447  	memset(to, 0, len);
1448  	return -EFAULT;
1449  }
1450  
1451  static const struct bpf_func_proto bpf_skb_load_bytes_proto = {
1452  	.func		= bpf_skb_load_bytes,
1453  	.gpl_only	= false,
1454  	.ret_type	= RET_INTEGER,
1455  	.arg1_type	= ARG_PTR_TO_CTX,
1456  	.arg2_type	= ARG_ANYTHING,
1457  	.arg3_type	= ARG_PTR_TO_RAW_STACK,
1458  	.arg4_type	= ARG_CONST_STACK_SIZE,
1459  };
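
/* Helper usage sketch from the eBPF program side (illustrative): copy a
 * header into stack memory when direct packet access is not convenient:
 *
 *	struct ethhdr eth;
 *
 *	if (bpf_skb_load_bytes(skb, 0, &eth, sizeof(eth)) < 0)
 *		return TC_ACT_SHOT;
 */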
1460  
1461  BPF_CALL_2(bpf_skb_pull_data, struct sk_buff *, skb, u32, len)
1462  {
1463  	/* The idea is the following: should the needed direct read/write
1464  	 * test fail at runtime, we can pull in more data and redo the
1465  	 * test, since implicitly, we invalidate previous checks here.
1466  	 *
1467  	 * Or, since we know how much we need to make read/writeable,
1468  	 * this can be done once at the program beginning for direct
1469  	 * access case. By this we overcome limitations of only current
1470  	 * headroom being accessible.
1471  	 */
1472  	return bpf_try_make_writable(skb, len ? : skb_headlen(skb));
1473  }
1474  
1475  static const struct bpf_func_proto bpf_skb_pull_data_proto = {
1476  	.func		= bpf_skb_pull_data,
1477  	.gpl_only	= false,
1478  	.ret_type	= RET_INTEGER,
1479  	.arg1_type	= ARG_PTR_TO_CTX,
1480  	.arg2_type	= ARG_ANYTHING,
1481  };
1482  
1483  BPF_CALL_5(bpf_l3_csum_replace, struct sk_buff *, skb, u32, offset,
1484  	   u64, from, u64, to, u64, flags)
1485  {
1486  	__sum16 *ptr;
1487  
1488  	if (unlikely(flags & ~(BPF_F_HDR_FIELD_MASK)))
1489  		return -EINVAL;
1490  	if (unlikely(offset > 0xffff || offset & 1))
1491  		return -EFAULT;
1492  	if (unlikely(bpf_try_make_writable(skb, offset + sizeof(*ptr))))
1493  		return -EFAULT;
1494  
1495  	ptr = (__sum16 *)(skb->data + offset);
1496  	switch (flags & BPF_F_HDR_FIELD_MASK) {
1497  	case 0:
1498  		if (unlikely(from != 0))
1499  			return -EINVAL;
1500  
1501  		csum_replace_by_diff(ptr, to);
1502  		break;
1503  	case 2:
1504  		csum_replace2(ptr, from, to);
1505  		break;
1506  	case 4:
1507  		csum_replace4(ptr, from, to);
1508  		break;
1509  	default:
1510  		return -EINVAL;
1511  	}
1512  
1513  	return 0;
1514  }
1515  
1516  static const struct bpf_func_proto bpf_l3_csum_replace_proto = {
1517  	.func		= bpf_l3_csum_replace,
1518  	.gpl_only	= false,
1519  	.ret_type	= RET_INTEGER,
1520  	.arg1_type	= ARG_PTR_TO_CTX,
1521  	.arg2_type	= ARG_ANYTHING,
1522  	.arg3_type	= ARG_ANYTHING,
1523  	.arg4_type	= ARG_ANYTHING,
1524  	.arg5_type	= ARG_ANYTHING,
1525  };
1526  
1527  BPF_CALL_5(bpf_l4_csum_replace, struct sk_buff *, skb, u32, offset,
1528  	   u64, from, u64, to, u64, flags)
1529  {
1530  	bool is_pseudo = flags & BPF_F_PSEUDO_HDR;
1531  	bool is_mmzero = flags & BPF_F_MARK_MANGLED_0;
1532  	__sum16 *ptr;
1533  
1534  	if (unlikely(flags & ~(BPF_F_MARK_MANGLED_0 | BPF_F_PSEUDO_HDR |
1535  			       BPF_F_HDR_FIELD_MASK)))
1536  		return -EINVAL;
1537  	if (unlikely(offset > 0xffff || offset & 1))
1538  		return -EFAULT;
1539  	if (unlikely(bpf_try_make_writable(skb, offset + sizeof(*ptr))))
1540  		return -EFAULT;
1541  
1542  	ptr = (__sum16 *)(skb->data + offset);
1543  	if (is_mmzero && !*ptr)
1544  		return 0;
1545  
1546  	switch (flags & BPF_F_HDR_FIELD_MASK) {
1547  	case 0:
1548  		if (unlikely(from != 0))
1549  			return -EINVAL;
1550  
1551  		inet_proto_csum_replace_by_diff(ptr, skb, to, is_pseudo);
1552  		break;
1553  	case 2:
1554  		inet_proto_csum_replace2(ptr, skb, from, to, is_pseudo);
1555  		break;
1556  	case 4:
1557  		inet_proto_csum_replace4(ptr, skb, from, to, is_pseudo);
1558  		break;
1559  	default:
1560  		return -EINVAL;
1561  	}
1562  
1563  	if (is_mmzero && !*ptr)
1564  		*ptr = CSUM_MANGLED_0;
1565  	return 0;
1566  }
1567  
1568  static const struct bpf_func_proto bpf_l4_csum_replace_proto = {
1569  	.func		= bpf_l4_csum_replace,
1570  	.gpl_only	= false,
1571  	.ret_type	= RET_INTEGER,
1572  	.arg1_type	= ARG_PTR_TO_CTX,
1573  	.arg2_type	= ARG_ANYTHING,
1574  	.arg3_type	= ARG_ANYTHING,
1575  	.arg4_type	= ARG_ANYTHING,
1576  	.arg5_type	= ARG_ANYTHING,
1577  };
1578  
1579  BPF_CALL_5(bpf_csum_diff, __be32 *, from, u32, from_size,
1580  	   __be32 *, to, u32, to_size, __wsum, seed)
1581  {
1582  	struct bpf_scratchpad *sp = this_cpu_ptr(&bpf_sp);
1583  	u32 diff_size = from_size + to_size;
1584  	int i, j = 0;
1585  
1586  	/* This is quite flexible, some examples:
1587  	 *
1588  	 * from_size == 0, to_size > 0,  seed := csum --> pushing data
1589  	 * from_size > 0,  to_size == 0, seed := csum --> pulling data
1590  	 * from_size > 0,  to_size > 0,  seed := 0    --> diffing data
1591  	 *
1592  	 * Even for diffing, from_size and to_size don't need to be equal.
1593  	 */
1594  	if (unlikely(((from_size | to_size) & (sizeof(__be32) - 1)) ||
1595  		     diff_size > sizeof(sp->diff)))
1596  		return -EINVAL;
1597  
1598  	for (i = 0; i < from_size / sizeof(__be32); i++, j++)
1599  		sp->diff[j] = ~from[i];
1600  	for (i = 0; i <   to_size / sizeof(__be32); i++, j++)
1601  		sp->diff[j] = to[i];
1602  
1603  	return csum_partial(sp->diff, diff_size, seed);
1604  }
1605  
1606  static const struct bpf_func_proto bpf_csum_diff_proto = {
1607  	.func		= bpf_csum_diff,
1608  	.gpl_only	= false,
1609  	.pkt_access	= true,
1610  	.ret_type	= RET_INTEGER,
1611  	.arg1_type	= ARG_PTR_TO_STACK,
1612  	.arg2_type	= ARG_CONST_STACK_SIZE_OR_ZERO,
1613  	.arg3_type	= ARG_PTR_TO_STACK,
1614  	.arg4_type	= ARG_CONST_STACK_SIZE_OR_ZERO,
1615  	.arg5_type	= ARG_ANYTHING,
1616  };
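
/* Example (illustrative; csum_off is a hypothetical offset of an L4 checksum
 * field): the push/pull/diff modes above combine with bpf_l4_csum_replace()
 * when rewriting a 4-byte field such as an IPv4 address:
 *
 *	__be32 old_ip, new_ip;		// hypothetical old/new addresses
 *	__wsum diff;
 *
 *	diff = bpf_csum_diff(&old_ip, sizeof(old_ip),
 *			     &new_ip, sizeof(new_ip), 0);
 *	bpf_l4_csum_replace(skb, csum_off, 0, diff, BPF_F_PSEUDO_HDR);
 */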
1617  
1618  BPF_CALL_2(bpf_csum_update, struct sk_buff *, skb, __wsum, csum)
1619  {
1620  	/* The interface is to be used in combination with bpf_csum_diff()
1621  	 * for direct packet writes. csum rotation for alignment as well
1622  	 * as emulating csum_sub() can be done from the eBPF program.
1623  	 */
1624  	if (skb->ip_summed == CHECKSUM_COMPLETE)
1625  		return (skb->csum = csum_add(skb->csum, csum));
1626  
1627  	return -ENOTSUPP;
1628  }
1629  
1630  static const struct bpf_func_proto bpf_csum_update_proto = {
1631  	.func		= bpf_csum_update,
1632  	.gpl_only	= false,
1633  	.ret_type	= RET_INTEGER,
1634  	.arg1_type	= ARG_PTR_TO_CTX,
1635  	.arg2_type	= ARG_ANYTHING,
1636  };
1637  
1638  static inline int __bpf_rx_skb(struct net_device *dev, struct sk_buff *skb)
1639  {
1640  	return dev_forward_skb(dev, skb);
1641  }
1642  
1643  static inline int __bpf_rx_skb_no_mac(struct net_device *dev,
1644  				      struct sk_buff *skb)
1645  {
1646  	int ret = ____dev_forward_skb(dev, skb);
1647  
1648  	if (likely(!ret)) {
1649  		skb->dev = dev;
1650  		ret = netif_rx(skb);
1651  	}
1652  
1653  	return ret;
1654  }
1655  
1656  static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb)
1657  {
1658  	int ret;
1659  
1660  	if (unlikely(__this_cpu_read(xmit_recursion) > XMIT_RECURSION_LIMIT)) {
1661  		net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n");
1662  		kfree_skb(skb);
1663  		return -ENETDOWN;
1664  	}
1665  
1666  	skb->dev = dev;
1667  
1668  	__this_cpu_inc(xmit_recursion);
1669  	ret = dev_queue_xmit(skb);
1670  	__this_cpu_dec(xmit_recursion);
1671  
1672  	return ret;
1673  }
1674  
1675  static int __bpf_redirect_no_mac(struct sk_buff *skb, struct net_device *dev,
1676  				 u32 flags)
1677  {
1678  	/* skb->mac_len is not set on normal egress */
1679  	unsigned int mlen = skb->network_header - skb->mac_header;
1680  
1681  	__skb_pull(skb, mlen);
1682  
1683  	/* At ingress, the mac header has already been pulled once.
1684  	 * At egress, skb_postpull_rcsum has to be done in case
1685  	 * the skb originated from ingress (i.e. a forwarded skb)
1686  	 * to ensure that rcsum starts at net header.
1687  	 */
1688  	if (!skb_at_tc_ingress(skb))
1689  		skb_postpull_rcsum(skb, skb_mac_header(skb), mlen);
1690  	skb_pop_mac_header(skb);
1691  	skb_reset_mac_len(skb);
1692  	return flags & BPF_F_INGRESS ?
1693  	       __bpf_rx_skb_no_mac(dev, skb) : __bpf_tx_skb(dev, skb);
1694  }
1695  
1696  static int __bpf_redirect_common(struct sk_buff *skb, struct net_device *dev,
1697  				 u32 flags)
1698  {
1699  	bpf_push_mac_rcsum(skb);
1700  	return flags & BPF_F_INGRESS ?
1701  	       __bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb);
1702  }
1703  
1704  static int __bpf_redirect(struct sk_buff *skb, struct net_device *dev,
1705  			  u32 flags)
1706  {
1707  	switch (dev->type) {
1708  	case ARPHRD_TUNNEL:
1709  	case ARPHRD_TUNNEL6:
1710  	case ARPHRD_SIT:
1711  	case ARPHRD_IPGRE:
1712  	case ARPHRD_VOID:
1713  	case ARPHRD_NONE:
1714  		return __bpf_redirect_no_mac(skb, dev, flags);
1715  	default:
1716  		return __bpf_redirect_common(skb, dev, flags);
1717  	}
1718  }
1719  
1720  BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags)
1721  {
1722  	struct net_device *dev;
1723  	struct sk_buff *clone;
1724  	int ret;
1725  
1726  	if (unlikely(flags & ~(BPF_F_INGRESS)))
1727  		return -EINVAL;
1728  
1729  	dev = dev_get_by_index_rcu(dev_net(skb->dev), ifindex);
1730  	if (unlikely(!dev))
1731  		return -EINVAL;
1732  
1733  	clone = skb_clone(skb, GFP_ATOMIC);
1734  	if (unlikely(!clone))
1735  		return -ENOMEM;
1736  
1737  	/* For direct write, we need to keep the invariant that the skbs
1738  	 * we're dealing with are uncloned. Should uncloning fail here,
1739  	 * we free the just generated clone, so that the original skb
1740  	 * ends up uncloned again.
1741  	 */
1742  	ret = bpf_try_make_head_writable(skb);
1743  	if (unlikely(ret)) {
1744  		kfree_skb(clone);
1745  		return -ENOMEM;
1746  	}
1747  
1748  	return __bpf_redirect(clone, dev, flags);
1749  }
1750  
1751  static const struct bpf_func_proto bpf_clone_redirect_proto = {
1752  	.func           = bpf_clone_redirect,
1753  	.gpl_only       = false,
1754  	.ret_type       = RET_INTEGER,
1755  	.arg1_type      = ARG_PTR_TO_CTX,
1756  	.arg2_type      = ARG_ANYTHING,
1757  	.arg3_type      = ARG_ANYTHING,
1758  };
1759  
1760  struct redirect_info {
1761  	u32 ifindex;
1762  	u32 flags;
1763  };
1764  
1765  static DEFINE_PER_CPU(struct redirect_info, redirect_info);
1766  
1767  BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags)
1768  {
1769  	struct redirect_info *ri = this_cpu_ptr(&redirect_info);
1770  
1771  	if (unlikely(flags & ~(BPF_F_INGRESS)))
1772  		return TC_ACT_SHOT;
1773  
1774  	ri->ifindex = ifindex;
1775  	ri->flags = flags;
1776  
1777  	return TC_ACT_REDIRECT;
1778  }
1779  
1780  int skb_do_redirect(struct sk_buff *skb)
1781  {
1782  	struct redirect_info *ri = this_cpu_ptr(&redirect_info);
1783  	struct net_device *dev;
1784  
1785  	dev = dev_get_by_index_rcu(dev_net(skb->dev), ri->ifindex);
1786  	ri->ifindex = 0;
1787  	if (unlikely(!dev)) {
1788  		kfree_skb(skb);
1789  		return -EINVAL;
1790  	}
1791  
1792  	return __bpf_redirect(skb, dev, ri->flags);
1793  }
1794  
1795  static const struct bpf_func_proto bpf_redirect_proto = {
1796  	.func           = bpf_redirect,
1797  	.gpl_only       = false,
1798  	.ret_type       = RET_INTEGER,
1799  	.arg1_type      = ARG_ANYTHING,
1800  	.arg2_type      = ARG_ANYTHING,
1801  };
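
/* Illustrative sketch (editor's addition, not part of the original source):
 * the difference between the two redirect flavours as seen from a tc/eBPF
 * classifier. bpf_clone_redirect() acts immediately on a clone inside the
 * helper, while bpf_redirect() only records ifindex/flags in the per-CPU
 * redirect_info above; the real work then happens in skb_do_redirect()
 * once the program returns TC_ACT_REDIRECT. The ifindex values and the
 * section name are hypothetical.
 *
 *	SEC("classifier")
 *	int fwd(struct __sk_buff *skb)
 *	{
 *		// mirror a copy out of ifindex 3, keep processing skb
 *		bpf_clone_redirect(skb, 3, 0);
 *
 *		// steer the original into the ingress path of ifindex 2
 *		return bpf_redirect(2, BPF_F_INGRESS);
 *	}
 */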
1802  
1803  BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb)
1804  {
1805  	return task_get_classid(skb);
1806  }
1807  
1808  static const struct bpf_func_proto bpf_get_cgroup_classid_proto = {
1809  	.func           = bpf_get_cgroup_classid,
1810  	.gpl_only       = false,
1811  	.ret_type       = RET_INTEGER,
1812  	.arg1_type      = ARG_PTR_TO_CTX,
1813  };
1814  
1815  BPF_CALL_1(bpf_get_route_realm, const struct sk_buff *, skb)
1816  {
1817  	return dst_tclassid(skb);
1818  }
1819  
1820  static const struct bpf_func_proto bpf_get_route_realm_proto = {
1821  	.func           = bpf_get_route_realm,
1822  	.gpl_only       = false,
1823  	.ret_type       = RET_INTEGER,
1824  	.arg1_type      = ARG_PTR_TO_CTX,
1825  };
1826  
1827  BPF_CALL_1(bpf_get_hash_recalc, struct sk_buff *, skb)
1828  {
1829  	/* If skb_clear_hash() was called due to mangling, we can
1830  	 * trigger SW recalculation here. Later access to hash
1831  	 * can then use the inline skb->hash via context directly
1832  	 * instead of calling this helper again.
1833  	 */
1834  	return skb_get_hash(skb);
1835  }
1836  
1837  static const struct bpf_func_proto bpf_get_hash_recalc_proto = {
1838  	.func		= bpf_get_hash_recalc,
1839  	.gpl_only	= false,
1840  	.ret_type	= RET_INTEGER,
1841  	.arg1_type	= ARG_PTR_TO_CTX,
1842  };
1843  
1844  BPF_CALL_1(bpf_set_hash_invalid, struct sk_buff *, skb)
1845  {
1846  	/* After all direct packet writes, this can be used once to
1847  	 * trigger a lazy recalc on the next skb_get_hash() invocation.
1848  	 */
1849  	skb_clear_hash(skb);
1850  	return 0;
1851  }
1852  
1853  static const struct bpf_func_proto bpf_set_hash_invalid_proto = {
1854  	.func		= bpf_set_hash_invalid,
1855  	.gpl_only	= false,
1856  	.ret_type	= RET_INTEGER,
1857  	.arg1_type	= ARG_PTR_TO_CTX,
1858  };
1859  
1860  BPF_CALL_3(bpf_skb_vlan_push, struct sk_buff *, skb, __be16, vlan_proto,
1861  	   u16, vlan_tci)
1862  {
1863  	int ret;
1864  
1865  	if (unlikely(vlan_proto != htons(ETH_P_8021Q) &&
1866  		     vlan_proto != htons(ETH_P_8021AD)))
1867  		vlan_proto = htons(ETH_P_8021Q);
1868  
1869  	bpf_push_mac_rcsum(skb);
1870  	ret = skb_vlan_push(skb, vlan_proto, vlan_tci);
1871  	bpf_pull_mac_rcsum(skb);
1872  
1873  	bpf_compute_data_end(skb);
1874  	return ret;
1875  }
1876  
1877  const struct bpf_func_proto bpf_skb_vlan_push_proto = {
1878  	.func           = bpf_skb_vlan_push,
1879  	.gpl_only       = false,
1880  	.ret_type       = RET_INTEGER,
1881  	.arg1_type      = ARG_PTR_TO_CTX,
1882  	.arg2_type      = ARG_ANYTHING,
1883  	.arg3_type      = ARG_ANYTHING,
1884  };
1885  EXPORT_SYMBOL_GPL(bpf_skb_vlan_push_proto);
1886  
1887  BPF_CALL_1(bpf_skb_vlan_pop, struct sk_buff *, skb)
1888  {
1889  	int ret;
1890  
1891  	bpf_push_mac_rcsum(skb);
1892  	ret = skb_vlan_pop(skb);
1893  	bpf_pull_mac_rcsum(skb);
1894  
1895  	bpf_compute_data_end(skb);
1896  	return ret;
1897  }
1898  
1899  const struct bpf_func_proto bpf_skb_vlan_pop_proto = {
1900  	.func           = bpf_skb_vlan_pop,
1901  	.gpl_only       = false,
1902  	.ret_type       = RET_INTEGER,
1903  	.arg1_type      = ARG_PTR_TO_CTX,
1904  };
1905  EXPORT_SYMBOL_GPL(bpf_skb_vlan_pop_proto);
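
/* Illustrative sketch (editor's addition, not part of the original source):
 * typical use of the two VLAN helpers from a tc/eBPF program. The VLAN id
 * and the 'need_tag' condition are hypothetical. Note that both helpers may
 * move packet data (hence the bpf_compute_data_end() calls above), so any
 * cached packet pointers must be reloaded afterwards.
 *
 *	if (need_tag)
 *		bpf_skb_vlan_push(skb, bpf_htons(ETH_P_8021Q), 100);
 *	else
 *		bpf_skb_vlan_pop(skb);
 */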
1906  
1907  static int bpf_skb_generic_push(struct sk_buff *skb, u32 off, u32 len)
1908  {
1909  	/* Caller already did skb_cow() with len as headroom,
1910  	 * so no need to do it here.
1911  	 */
1912  	skb_push(skb, len);
1913  	memmove(skb->data, skb->data + len, off);
1914  	memset(skb->data + off, 0, len);
1915  
1916  	/* No skb_postpush_rcsum(skb, skb->data + off, len) is
1917  	 * needed here: summing over zeroed blocks does not
1918  	 * change the skb->csum result for checksum complete,
1919  	 * so the zeroed area is already accounted for.
1920  	 */
1921  	return 0;
1922  }
1923  
1924  static int bpf_skb_generic_pop(struct sk_buff *skb, u32 off, u32 len)
1925  {
1926  	/* skb_ensure_writable() is not needed here, as we're
1927  	 * already working on an uncloned skb.
1928  	 */
1929  	if (unlikely(!pskb_may_pull(skb, off + len)))
1930  		return -ENOMEM;
1931  
1932  	skb_postpull_rcsum(skb, skb->data + off, len);
1933  	memmove(skb->data + len, skb->data, off);
1934  	__skb_pull(skb, len);
1935  
1936  	return 0;
1937  }
1938  
1939  static int bpf_skb_net_hdr_push(struct sk_buff *skb, u32 off, u32 len)
1940  {
1941  	bool trans_same = skb->transport_header == skb->network_header;
1942  	int ret;
1943  
1944  	/* There's no need for a __skb_push()/__skb_pull() pair to
1945  	 * get to the start of the mac header, as we're guaranteed
1946  	 * to always start from there under eBPF.
1947  	 */
1948  	ret = bpf_skb_generic_push(skb, off, len);
1949  	if (likely(!ret)) {
1950  		skb->mac_header -= len;
1951  		skb->network_header -= len;
1952  		if (trans_same)
1953  			skb->transport_header = skb->network_header;
1954  	}
1955  
1956  	return ret;
1957  }
1958  
1959  static int bpf_skb_net_hdr_pop(struct sk_buff *skb, u32 off, u32 len)
1960  {
1961  	bool trans_same = skb->transport_header == skb->network_header;
1962  	int ret;
1963  
1964  	/* Same here, __skb_push()/__skb_pull() pair not needed. */
1965  	ret = bpf_skb_generic_pop(skb, off, len);
1966  	if (likely(!ret)) {
1967  		skb->mac_header += len;
1968  		skb->network_header += len;
1969  		if (trans_same)
1970  			skb->transport_header = skb->network_header;
1971  	}
1972  
1973  	return ret;
1974  }
1975  
1976  static int bpf_skb_proto_4_to_6(struct sk_buff *skb)
1977  {
1978  	const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr);
1979  	u32 off = skb->network_header - skb->mac_header;
1980  	int ret;
1981  
1982  	ret = skb_cow(skb, len_diff);
1983  	if (unlikely(ret < 0))
1984  		return ret;
1985  
1986  	ret = bpf_skb_net_hdr_push(skb, off, len_diff);
1987  	if (unlikely(ret < 0))
1988  		return ret;
1989  
1990  	if (skb_is_gso(skb)) {
1991  		/* SKB_GSO_UDP stays as is. SKB_GSO_TCPV4 needs to
1992  		 * be changed into SKB_GSO_TCPV6.
1993  		 */
1994  		if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
1995  			skb_shinfo(skb)->gso_type &= ~SKB_GSO_TCPV4;
1996  			skb_shinfo(skb)->gso_type |=  SKB_GSO_TCPV6;
1997  		}
1998  
1999  		/* Due to the larger IPv6 header, the MSS needs to be lowered. */
2000  		skb_shinfo(skb)->gso_size -= len_diff;
2001  		/* Header must be checked, and gso_segs recomputed. */
2002  		skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
2003  		skb_shinfo(skb)->gso_segs = 0;
2004  	}
2005  
2006  	skb->protocol = htons(ETH_P_IPV6);
2007  	skb_clear_hash(skb);
2008  
2009  	return 0;
2010  }
2011  
2012  static int bpf_skb_proto_6_to_4(struct sk_buff *skb)
2013  {
2014  	const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr);
2015  	u32 off = skb->network_header - skb->mac_header;
2016  	int ret;
2017  
2018  	ret = skb_unclone(skb, GFP_ATOMIC);
2019  	if (unlikely(ret < 0))
2020  		return ret;
2021  
2022  	ret = bpf_skb_net_hdr_pop(skb, off, len_diff);
2023  	if (unlikely(ret < 0))
2024  		return ret;
2025  
2026  	if (skb_is_gso(skb)) {
2027  		/* SKB_GSO_UDP stays as is. SKB_GSO_TCPV6 needs to
2028  		 * be changed into SKB_GSO_TCPV4.
2029  		 */
2030  		if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6) {
2031  			skb_shinfo(skb)->gso_type &= ~SKB_GSO_TCPV6;
2032  			skb_shinfo(skb)->gso_type |=  SKB_GSO_TCPV4;
2033  		}
2034  
2035  		/* Due to the smaller IPv4 header, the MSS can be raised. */
2036  		skb_shinfo(skb)->gso_size += len_diff;
2037  		/* Header must be checked, and gso_segs recomputed. */
2038  		skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
2039  		skb_shinfo(skb)->gso_segs = 0;
2040  	}
2041  
2042  	skb->protocol = htons(ETH_P_IP);
2043  	skb_clear_hash(skb);
2044  
2045  	return 0;
2046  }
2047  
2048  static int bpf_skb_proto_xlat(struct sk_buff *skb, __be16 to_proto)
2049  {
2050  	__be16 from_proto = skb->protocol;
2051  
2052  	if (from_proto == htons(ETH_P_IP) &&
2053  	      to_proto == htons(ETH_P_IPV6))
2054  		return bpf_skb_proto_4_to_6(skb);
2055  
2056  	if (from_proto == htons(ETH_P_IPV6) &&
2057  	      to_proto == htons(ETH_P_IP))
2058  		return bpf_skb_proto_6_to_4(skb);
2059  
2060  	return -ENOTSUPP;
2061  }
2062  
2063  BPF_CALL_3(bpf_skb_change_proto, struct sk_buff *, skb, __be16, proto,
2064  	   u64, flags)
2065  {
2066  	int ret;
2067  
2068  	if (unlikely(flags))
2069  		return -EINVAL;
2070  
2071  	/* The general idea is that this helper does the basic groundwork
2072  	 * needed for changing the protocol, and the eBPF program fills in
2073  	 * the rest through bpf_skb_store_bytes(), bpf_lX_csum_replace()
2074  	 * and other helpers, rather than passing a raw buffer here.
2075  	 *
2076  	 * The rationale is to keep this minimal and avoid the need to
2077  	 * deal with raw packet data. E.g. even if we passed buffers
2078  	 * here, the program would still need to call the
2079  	 * bpf_lX_csum_replace() helpers anyway. Plus, this way we keep
2080  	 * separation of concerns, since e.g. bpf_skb_store_bytes()
2081  	 * should only take care of stores.
2082  	 *
2083  	 * Currently, additional options and extension header space are
2084  	 * not supported, but the flags argument is reserved so we can
2085  	 * adapt to that later. For offloads, we mark the packet as
2086  	 * dodgy, so that the headers get verified first.
2087  	 */
2088  	ret = bpf_skb_proto_xlat(skb, proto);
2089  	bpf_compute_data_end(skb);
2090  	return ret;
2091  }
2092  
2093  static const struct bpf_func_proto bpf_skb_change_proto_proto = {
2094  	.func		= bpf_skb_change_proto,
2095  	.gpl_only	= false,
2096  	.ret_type	= RET_INTEGER,
2097  	.arg1_type	= ARG_PTR_TO_CTX,
2098  	.arg2_type	= ARG_ANYTHING,
2099  	.arg3_type	= ARG_ANYTHING,
2100  };
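
/* Illustrative sketch (editor's addition, not part of the original source):
 * the division of labour described in the comment above, for a v4-to-v6
 * translation done by a tc/eBPF program. build_ipv6_hdr() and the names
 * are hypothetical: the helper only makes room and adjusts GSO metadata,
 * while the program writes the new header and fixes checksums itself.
 *
 *	struct ipv6hdr ip6h;
 *
 *	build_ipv6_hdr(&ip6h, skb);
 *	bpf_skb_change_proto(skb, bpf_htons(ETH_P_IPV6), 0);
 *	bpf_skb_store_bytes(skb, ETH_HLEN, &ip6h, sizeof(ip6h), 0);
 *	// ...then fix the L4 checksum for the new pseudo header via
 *	// bpf_l4_csum_replace(..., BPF_F_PSEUDO_HDR | ...) as usual
 */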
2101  
2102  BPF_CALL_2(bpf_skb_change_type, struct sk_buff *, skb, u32, pkt_type)
2103  {
2104  	/* We only allow a restricted subset to be changed for now. */
2105  	if (unlikely(!skb_pkt_type_ok(skb->pkt_type) ||
2106  		     !skb_pkt_type_ok(pkt_type)))
2107  		return -EINVAL;
2108  
2109  	skb->pkt_type = pkt_type;
2110  	return 0;
2111  }
2112  
2113  static const struct bpf_func_proto bpf_skb_change_type_proto = {
2114  	.func		= bpf_skb_change_type,
2115  	.gpl_only	= false,
2116  	.ret_type	= RET_INTEGER,
2117  	.arg1_type	= ARG_PTR_TO_CTX,
2118  	.arg2_type	= ARG_ANYTHING,
2119  };
2120  
2121  static u32 __bpf_skb_min_len(const struct sk_buff *skb)
2122  {
2123  	u32 min_len = skb_network_offset(skb);
2124  
2125  	if (skb_transport_header_was_set(skb))
2126  		min_len = skb_transport_offset(skb);
2127  	if (skb->ip_summed == CHECKSUM_PARTIAL)
2128  		min_len = skb_checksum_start_offset(skb) +
2129  			  skb->csum_offset + sizeof(__sum16);
2130  	return min_len;
2131  }
2132  
2133  static u32 __bpf_skb_max_len(const struct sk_buff *skb)
2134  {
2135  	return skb->dev->mtu + skb->dev->hard_header_len;
2136  }
2137  
2138  static int bpf_skb_grow_rcsum(struct sk_buff *skb, unsigned int new_len)
2139  {
2140  	unsigned int old_len = skb->len;
2141  	int ret;
2142  
2143  	ret = __skb_grow_rcsum(skb, new_len);
2144  	if (!ret)
2145  		memset(skb->data + old_len, 0, new_len - old_len);
2146  	return ret;
2147  }
2148  
2149  static int bpf_skb_trim_rcsum(struct sk_buff *skb, unsigned int new_len)
2150  {
2151  	return __skb_trim_rcsum(skb, new_len);
2152  }
2153  
2154  BPF_CALL_3(bpf_skb_change_tail, struct sk_buff *, skb, u32, new_len,
2155  	   u64, flags)
2156  {
2157  	u32 max_len = __bpf_skb_max_len(skb);
2158  	u32 min_len = __bpf_skb_min_len(skb);
2159  	int ret;
2160  
2161  	if (unlikely(flags || new_len > max_len || new_len < min_len))
2162  		return -EINVAL;
2163  	if (skb->encapsulation)
2164  		return -ENOTSUPP;
2165  
2166  	/* The basic idea of this helper is that it performs the work
2167  	 * needed to either grow or trim an skb, and the eBPF program
2168  	 * rewrites the rest via helpers like bpf_skb_store_bytes(),
2169  	 * bpf_lX_csum_replace() and others, rather than passing a raw
2170  	 * buffer here. This is a slow path helper and intended for
2171  	 * replies with control messages.
2172  	 *
2173  	 * Like bpf_skb_change_proto(), we want to keep this rather
2174  	 * minimal and without protocol specifics so that concerns stay
2175  	 * separated; e.g. bpf_skb_store_bytes() should be the only
2176  	 * helper responsible for writing buffers.
2177  	 *
2178  	 * As this is expected to be a slow path operation for control
2179  	 * message replies, we implicitly linearize, unclone and drop
2180  	 * offloads from the skb here.
2181  	 */
2182  	ret = __bpf_try_make_writable(skb, skb->len);
2183  	if (!ret) {
2184  		if (new_len > skb->len)
2185  			ret = bpf_skb_grow_rcsum(skb, new_len);
2186  		else if (new_len < skb->len)
2187  			ret = bpf_skb_trim_rcsum(skb, new_len);
2188  		if (!ret && skb_is_gso(skb))
2189  			skb_gso_reset(skb);
2190  	}
2191  
2192  	bpf_compute_data_end(skb);
2193  	return ret;
2194  }
2195  
2196  static const struct bpf_func_proto bpf_skb_change_tail_proto = {
2197  	.func		= bpf_skb_change_tail,
2198  	.gpl_only	= false,
2199  	.ret_type	= RET_INTEGER,
2200  	.arg1_type	= ARG_PTR_TO_CTX,
2201  	.arg2_type	= ARG_ANYTHING,
2202  	.arg3_type	= ARG_ANYTHING,
2203  };
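
/* Illustrative sketch (editor's addition, not part of the original source):
 * using the helper above to shrink a packet down to a small control reply,
 * as hinted at in the comment (slow path, replies with control messages).
 * 'reply' and REPLY_OFF are hypothetical; new_len must stay within the
 * __bpf_skb_min_len()/__bpf_skb_max_len() bounds checked above.
 *
 *	u32 new_len = REPLY_OFF + sizeof(reply);
 *
 *	if (bpf_skb_change_tail(skb, new_len, 0) == 0)
 *		bpf_skb_store_bytes(skb, REPLY_OFF, &reply, sizeof(reply), 0);
 */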
2204  
2205  bool bpf_helper_changes_skb_data(void *func)
2206  {
2207  	if (func == bpf_skb_vlan_push ||
2208  	    func == bpf_skb_vlan_pop ||
2209  	    func == bpf_skb_store_bytes ||
2210  	    func == bpf_skb_change_proto ||
2211  	    func == bpf_skb_change_tail ||
2212  	    func == bpf_skb_pull_data ||
2213  	    func == bpf_clone_redirect ||
2214  	    func == bpf_l3_csum_replace ||
2215  	    func == bpf_l4_csum_replace)
2216  		return true;
2217  
2218  	return false;
2219  }
2220  
2221  static unsigned long bpf_skb_copy(void *dst_buff, const void *skb,
2222  				  unsigned long off, unsigned long len)
2223  {
2224  	void *ptr = skb_header_pointer(skb, off, len, dst_buff);
2225  
2226  	if (unlikely(!ptr))
2227  		return len;
2228  	if (ptr != dst_buff)
2229  		memcpy(dst_buff, ptr, len);
2230  
2231  	return 0;
2232  }
2233  
2234  BPF_CALL_5(bpf_skb_event_output, struct sk_buff *, skb, struct bpf_map *, map,
2235  	   u64, flags, void *, meta, u64, meta_size)
2236  {
2237  	u64 skb_size = (flags & BPF_F_CTXLEN_MASK) >> 32;
2238  
2239  	if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK)))
2240  		return -EINVAL;
2241  	if (unlikely(skb_size > skb->len))
2242  		return -EFAULT;
2243  
2244  	return bpf_event_output(map, flags, meta, meta_size, skb, skb_size,
2245  				bpf_skb_copy);
2246  }
2247  
2248  static const struct bpf_func_proto bpf_skb_event_output_proto = {
2249  	.func		= bpf_skb_event_output,
2250  	.gpl_only	= true,
2251  	.ret_type	= RET_INTEGER,
2252  	.arg1_type	= ARG_PTR_TO_CTX,
2253  	.arg2_type	= ARG_CONST_MAP_PTR,
2254  	.arg3_type	= ARG_ANYTHING,
2255  	.arg4_type	= ARG_PTR_TO_STACK,
2256  	.arg5_type	= ARG_CONST_STACK_SIZE,
2257  };
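
/* Illustrative sketch (editor's addition, not part of the original source):
 * how a program hands both its own metadata and a slice of the packet to
 * user space through the helper above. The upper 32 bits of the flags
 * (BPF_F_CTXLEN_MASK) request that 'dump_len' bytes of the skb be appended
 * via bpf_skb_copy(). 'events' is a hypothetical BPF_MAP_TYPE_PERF_EVENT_ARRAY
 * map and 'meta' a hypothetical struct.
 *
 *	u64 flags = BPF_F_CURRENT_CPU | ((u64)dump_len << 32);
 *
 *	bpf_perf_event_output(skb, &events, flags, &meta, sizeof(meta));
 */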
2258  
2259  static unsigned short bpf_tunnel_key_af(u64 flags)
2260  {
2261  	return flags & BPF_F_TUNINFO_IPV6 ? AF_INET6 : AF_INET;
2262  }
2263  
2264  BPF_CALL_4(bpf_skb_get_tunnel_key, struct sk_buff *, skb, struct bpf_tunnel_key *, to,
2265  	   u32, size, u64, flags)
2266  {
2267  	const struct ip_tunnel_info *info = skb_tunnel_info(skb);
2268  	u8 compat[sizeof(struct bpf_tunnel_key)];
2269  	void *to_orig = to;
2270  	int err;
2271  
2272  	if (unlikely(!info || (flags & ~(BPF_F_TUNINFO_IPV6)))) {
2273  		err = -EINVAL;
2274  		goto err_clear;
2275  	}
2276  	if (ip_tunnel_info_af(info) != bpf_tunnel_key_af(flags)) {
2277  		err = -EPROTO;
2278  		goto err_clear;
2279  	}
2280  	if (unlikely(size != sizeof(struct bpf_tunnel_key))) {
2281  		err = -EINVAL;
2282  		switch (size) {
2283  		case offsetof(struct bpf_tunnel_key, tunnel_label):
2284  		case offsetof(struct bpf_tunnel_key, tunnel_ext):
2285  			goto set_compat;
2286  		case offsetof(struct bpf_tunnel_key, remote_ipv6[1]):
2287  			/* Fixup deprecated structure layouts here, so we have
2288  			 * a common path later on.
2289  			 */
2290  			if (ip_tunnel_info_af(info) != AF_INET)
2291  				goto err_clear;
2292  set_compat:
2293  			to = (struct bpf_tunnel_key *)compat;
2294  			break;
2295  		default:
2296  			goto err_clear;
2297  		}
2298  	}
2299  
2300  	to->tunnel_id = be64_to_cpu(info->key.tun_id);
2301  	to->tunnel_tos = info->key.tos;
2302  	to->tunnel_ttl = info->key.ttl;
2303  
2304  	if (flags & BPF_F_TUNINFO_IPV6) {
2305  		memcpy(to->remote_ipv6, &info->key.u.ipv6.src,
2306  		       sizeof(to->remote_ipv6));
2307  		to->tunnel_label = be32_to_cpu(info->key.label);
2308  	} else {
2309  		to->remote_ipv4 = be32_to_cpu(info->key.u.ipv4.src);
2310  	}
2311  
2312  	if (unlikely(size != sizeof(struct bpf_tunnel_key)))
2313  		memcpy(to_orig, to, size);
2314  
2315  	return 0;
2316  err_clear:
2317  	memset(to_orig, 0, size);
2318  	return err;
2319  }
2320  
2321  static const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = {
2322  	.func		= bpf_skb_get_tunnel_key,
2323  	.gpl_only	= false,
2324  	.ret_type	= RET_INTEGER,
2325  	.arg1_type	= ARG_PTR_TO_CTX,
2326  	.arg2_type	= ARG_PTR_TO_RAW_STACK,
2327  	.arg3_type	= ARG_CONST_STACK_SIZE,
2328  	.arg4_type	= ARG_ANYTHING,
2329  };
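
/* Illustrative sketch (editor's addition, not part of the original source):
 * reading the outer tunnel metadata on ingress of a collect_md tunnel
 * device (e.g. VXLAN/Geneve in external mode). For an IPv6 underlay the
 * program would pass BPF_F_TUNINFO_IPV6 and read remote_ipv6 instead.
 *
 *	struct bpf_tunnel_key key = {};
 *
 *	if (bpf_skb_get_tunnel_key(skb, &key, sizeof(key), 0) == 0) {
 *		// key.tunnel_id and key.remote_ipv4 (host byte order)
 *		// now describe the decapsulated tunnel header
 *	}
 */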
2330  
2331  BPF_CALL_3(bpf_skb_get_tunnel_opt, struct sk_buff *, skb, u8 *, to, u32, size)
2332  {
2333  	const struct ip_tunnel_info *info = skb_tunnel_info(skb);
2334  	int err;
2335  
2336  	if (unlikely(!info ||
2337  		     !(info->key.tun_flags & TUNNEL_OPTIONS_PRESENT))) {
2338  		err = -ENOENT;
2339  		goto err_clear;
2340  	}
2341  	if (unlikely(size < info->options_len)) {
2342  		err = -ENOMEM;
2343  		goto err_clear;
2344  	}
2345  
2346  	ip_tunnel_info_opts_get(to, info);
2347  	if (size > info->options_len)
2348  		memset(to + info->options_len, 0, size - info->options_len);
2349  
2350  	return info->options_len;
2351  err_clear:
2352  	memset(to, 0, size);
2353  	return err;
2354  }
2355  
2356  static const struct bpf_func_proto bpf_skb_get_tunnel_opt_proto = {
2357  	.func		= bpf_skb_get_tunnel_opt,
2358  	.gpl_only	= false,
2359  	.ret_type	= RET_INTEGER,
2360  	.arg1_type	= ARG_PTR_TO_CTX,
2361  	.arg2_type	= ARG_PTR_TO_RAW_STACK,
2362  	.arg3_type	= ARG_CONST_STACK_SIZE,
2363  };
2364  
2365  static struct metadata_dst __percpu *md_dst;
2366  
2367  BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb,
2368  	   const struct bpf_tunnel_key *, from, u32, size, u64, flags)
2369  {
2370  	struct metadata_dst *md = this_cpu_ptr(md_dst);
2371  	u8 compat[sizeof(struct bpf_tunnel_key)];
2372  	struct ip_tunnel_info *info;
2373  
2374  	if (unlikely(flags & ~(BPF_F_TUNINFO_IPV6 | BPF_F_ZERO_CSUM_TX |
2375  			       BPF_F_DONT_FRAGMENT)))
2376  		return -EINVAL;
2377  	if (unlikely(size != sizeof(struct bpf_tunnel_key))) {
2378  		switch (size) {
2379  		case offsetof(struct bpf_tunnel_key, tunnel_label):
2380  		case offsetof(struct bpf_tunnel_key, tunnel_ext):
2381  		case offsetof(struct bpf_tunnel_key, remote_ipv6[1]):
2382  			/* Fixup deprecated structure layouts here, so we have
2383  			 * a common path later on.
2384  			 */
2385  			memcpy(compat, from, size);
2386  			memset(compat + size, 0, sizeof(compat) - size);
2387  			from = (const struct bpf_tunnel_key *) compat;
2388  			break;
2389  		default:
2390  			return -EINVAL;
2391  		}
2392  	}
2393  	if (unlikely((!(flags & BPF_F_TUNINFO_IPV6) && from->tunnel_label) ||
2394  		     from->tunnel_ext))
2395  		return -EINVAL;
2396  
2397  	skb_dst_drop(skb);
2398  	dst_hold((struct dst_entry *) md);
2399  	skb_dst_set(skb, (struct dst_entry *) md);
2400  
2401  	info = &md->u.tun_info;
2402  	info->mode = IP_TUNNEL_INFO_TX;
2403  
2404  	info->key.tun_flags = TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_NOCACHE;
2405  	if (flags & BPF_F_DONT_FRAGMENT)
2406  		info->key.tun_flags |= TUNNEL_DONT_FRAGMENT;
2407  
2408  	info->key.tun_id = cpu_to_be64(from->tunnel_id);
2409  	info->key.tos = from->tunnel_tos;
2410  	info->key.ttl = from->tunnel_ttl;
2411  
2412  	if (flags & BPF_F_TUNINFO_IPV6) {
2413  		info->mode |= IP_TUNNEL_INFO_IPV6;
2414  		memcpy(&info->key.u.ipv6.dst, from->remote_ipv6,
2415  		       sizeof(from->remote_ipv6));
2416  		info->key.label = cpu_to_be32(from->tunnel_label) &
2417  				  IPV6_FLOWLABEL_MASK;
2418  	} else {
2419  		info->key.u.ipv4.dst = cpu_to_be32(from->remote_ipv4);
2420  		if (flags & BPF_F_ZERO_CSUM_TX)
2421  			info->key.tun_flags &= ~TUNNEL_CSUM;
2422  	}
2423  
2424  	return 0;
2425  }
2426  
2427  static const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = {
2428  	.func		= bpf_skb_set_tunnel_key,
2429  	.gpl_only	= false,
2430  	.ret_type	= RET_INTEGER,
2431  	.arg1_type	= ARG_PTR_TO_CTX,
2432  	.arg2_type	= ARG_PTR_TO_STACK,
2433  	.arg3_type	= ARG_CONST_STACK_SIZE,
2434  	.arg4_type	= ARG_ANYTHING,
2435  };
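
/* Illustrative sketch (editor's addition, not part of the original source):
 * setting tunnel metadata before redirecting to a collect_md tunnel device,
 * which then encapsulates accordingly. The VNI, remote address and
 * 'tun_ifindex' are hypothetical; remote_ipv4 is given in host byte order,
 * matching the cpu_to_be32() conversion above.
 *
 *	struct bpf_tunnel_key key = {
 *		.tunnel_id	= 42,
 *		.remote_ipv4	= 0xac100164,	// 172.16.1.100
 *		.tunnel_ttl	= 64,
 *	};
 *
 *	bpf_skb_set_tunnel_key(skb, &key, sizeof(key), BPF_F_ZERO_CSUM_TX);
 *	return bpf_redirect(tun_ifindex, 0);
 */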
2436  
2437  BPF_CALL_3(bpf_skb_set_tunnel_opt, struct sk_buff *, skb,
2438  	   const u8 *, from, u32, size)
2439  {
2440  	struct ip_tunnel_info *info = skb_tunnel_info(skb);
2441  	const struct metadata_dst *md = this_cpu_ptr(md_dst);
2442  
2443  	if (unlikely(info != &md->u.tun_info || (size & (sizeof(u32) - 1))))
2444  		return -EINVAL;
2445  	if (unlikely(size > IP_TUNNEL_OPTS_MAX))
2446  		return -ENOMEM;
2447  
2448  	ip_tunnel_info_opts_set(info, from, size);
2449  
2450  	return 0;
2451  }
2452  
2453  static const struct bpf_func_proto bpf_skb_set_tunnel_opt_proto = {
2454  	.func		= bpf_skb_set_tunnel_opt,
2455  	.gpl_only	= false,
2456  	.ret_type	= RET_INTEGER,
2457  	.arg1_type	= ARG_PTR_TO_CTX,
2458  	.arg2_type	= ARG_PTR_TO_STACK,
2459  	.arg3_type	= ARG_CONST_STACK_SIZE,
2460  };
2461  
2462  static const struct bpf_func_proto *
2463  bpf_get_skb_set_tunnel_proto(enum bpf_func_id which)
2464  {
2465  	if (!md_dst) {
2466  		/* A race is not possible, since this is called from the
2467  		 * verifier, which holds the verifier mutex.
2468  		 */
2469  		md_dst = metadata_dst_alloc_percpu(IP_TUNNEL_OPTS_MAX,
2470  						   GFP_KERNEL);
2471  		if (!md_dst)
2472  			return NULL;
2473  	}
2474  
2475  	switch (which) {
2476  	case BPF_FUNC_skb_set_tunnel_key:
2477  		return &bpf_skb_set_tunnel_key_proto;
2478  	case BPF_FUNC_skb_set_tunnel_opt:
2479  		return &bpf_skb_set_tunnel_opt_proto;
2480  	default:
2481  		return NULL;
2482  	}
2483  }
2484  
2485  BPF_CALL_3(bpf_skb_under_cgroup, struct sk_buff *, skb, struct bpf_map *, map,
2486  	   u32, idx)
2487  {
2488  	struct bpf_array *array = container_of(map, struct bpf_array, map);
2489  	struct cgroup *cgrp;
2490  	struct sock *sk;
2491  
2492  	sk = skb_to_full_sk(skb);
2493  	if (!sk || !sk_fullsock(sk))
2494  		return -ENOENT;
2495  	if (unlikely(idx >= array->map.max_entries))
2496  		return -E2BIG;
2497  
2498  	cgrp = READ_ONCE(array->ptrs[idx]);
2499  	if (unlikely(!cgrp))
2500  		return -EAGAIN;
2501  
2502  	return sk_under_cgroup_hierarchy(sk, cgrp);
2503  }
2504  
2505  static const struct bpf_func_proto bpf_skb_under_cgroup_proto = {
2506  	.func		= bpf_skb_under_cgroup,
2507  	.gpl_only	= false,
2508  	.ret_type	= RET_INTEGER,
2509  	.arg1_type	= ARG_PTR_TO_CTX,
2510  	.arg2_type	= ARG_CONST_MAP_PTR,
2511  	.arg3_type	= ARG_ANYTHING,
2512  };
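
/* Illustrative sketch (editor's addition, not part of the original source):
 * matching traffic from sockets under a given cgroup v2 hierarchy. The
 * 'cgrp_map' is a hypothetical BPF_MAP_TYPE_CGROUP_ARRAY that user space
 * populated with a cgroup fd at index 0.
 *
 *	int ret = bpf_skb_under_cgroup(skb, &cgrp_map, 0);
 *
 *	if (ret == 1)
 *		return TC_ACT_OK;	// socket is under the cgroup
 *	// ret == 0: not under it; ret < 0: no full socket or bad index
 */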
2513  
2514  static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff,
2515  				  unsigned long off, unsigned long len)
2516  {
2517  	memcpy(dst_buff, src_buff + off, len);
2518  	return 0;
2519  }
2520  
2521  BPF_CALL_5(bpf_xdp_event_output, struct xdp_buff *, xdp, struct bpf_map *, map,
2522  	   u64, flags, void *, meta, u64, meta_size)
2523  {
2524  	u64 xdp_size = (flags & BPF_F_CTXLEN_MASK) >> 32;
2525  
2526  	if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK)))
2527  		return -EINVAL;
2528  	if (unlikely(xdp_size > (unsigned long)(xdp->data_end - xdp->data)))
2529  		return -EFAULT;
2530  
2531  	return bpf_event_output(map, flags, meta, meta_size, xdp, xdp_size,
2532  				bpf_xdp_copy);
2533  }
2534  
2535  static const struct bpf_func_proto bpf_xdp_event_output_proto = {
2536  	.func		= bpf_xdp_event_output,
2537  	.gpl_only	= true,
2538  	.ret_type	= RET_INTEGER,
2539  	.arg1_type	= ARG_PTR_TO_CTX,
2540  	.arg2_type	= ARG_CONST_MAP_PTR,
2541  	.arg3_type	= ARG_ANYTHING,
2542  	.arg4_type	= ARG_PTR_TO_STACK,
2543  	.arg5_type	= ARG_CONST_STACK_SIZE,
2544  };
2545  
2546  BPF_CALL_1(bpf_get_socket_cookie, struct sk_buff *, skb)
2547  {
2548  	return skb->sk ? sock_gen_cookie(skb->sk) : 0;
2549  }
2550  
2551  static const struct bpf_func_proto bpf_get_socket_cookie_proto = {
2552  	.func           = bpf_get_socket_cookie,
2553  	.gpl_only       = false,
2554  	.ret_type       = RET_INTEGER,
2555  	.arg1_type      = ARG_PTR_TO_CTX,
2556  };
2557  
2558  BPF_CALL_1(bpf_get_socket_uid, struct sk_buff *, skb)
2559  {
2560  	struct sock *sk = sk_to_full_sk(skb->sk);
2561  	kuid_t kuid;
2562  
2563  	if (!sk || !sk_fullsock(sk))
2564  		return overflowuid;
2565  	kuid = sock_net_uid(sock_net(sk), sk);
2566  	return from_kuid_munged(sock_net(sk)->user_ns, kuid);
2567  }
2568  
2569  static const struct bpf_func_proto bpf_get_socket_uid_proto = {
2570  	.func           = bpf_get_socket_uid,
2571  	.gpl_only       = false,
2572  	.ret_type       = RET_INTEGER,
2573  	.arg1_type      = ARG_PTR_TO_CTX,
2574  };
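
/* Illustrative sketch (editor's addition, not part of the original source):
 * per-UID filtering from a socket filter program using the helper above.
 * WATCHED_UID is hypothetical; when no full socket is attached, the helper
 * returns overflowuid (65534 by default), as implemented above.
 *
 *	u32 uid = bpf_get_socket_uid(skb);
 *
 *	if (uid != WATCHED_UID)
 *		return 0;	// socket filter: 0 tosses the packet
 *	return skb->len;	// accept the full packet
 */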
2575  
2576  static const struct bpf_func_proto *
2577  sk_filter_func_proto(enum bpf_func_id func_id)
2578  {
2579  	switch (func_id) {
2580  	case BPF_FUNC_map_lookup_elem:
2581  		return &bpf_map_lookup_elem_proto;
2582  	case BPF_FUNC_map_update_elem:
2583  		return &bpf_map_update_elem_proto;
2584  	case BPF_FUNC_map_delete_elem:
2585  		return &bpf_map_delete_elem_proto;
2586  	case BPF_FUNC_get_prandom_u32:
2587  		return &bpf_get_prandom_u32_proto;
2588  	case BPF_FUNC_get_smp_processor_id:
2589  		return &bpf_get_raw_smp_processor_id_proto;
2590  	case BPF_FUNC_tail_call:
2591  		return &bpf_tail_call_proto;
2592  	case BPF_FUNC_ktime_get_ns:
2593  		return &bpf_ktime_get_ns_proto;
2594  	case BPF_FUNC_trace_printk:
2595  		if (capable(CAP_SYS_ADMIN))
2596  			return bpf_get_trace_printk_proto();
		/* Do not fall through to the socket cookie case below
		 * for unprivileged callers.
		 */
		return NULL;
2597  	case BPF_FUNC_get_socket_cookie:
2598  		return &bpf_get_socket_cookie_proto;
2599  	case BPF_FUNC_get_socket_uid:
2600  		return &bpf_get_socket_uid_proto;
2601  	default:
2602  		return NULL;
2603  	}
2604  }
2605  
2606  static const struct bpf_func_proto *
2607  tc_cls_act_func_proto(enum bpf_func_id func_id)
2608  {
2609  	switch (func_id) {
2610  	case BPF_FUNC_skb_store_bytes:
2611  		return &bpf_skb_store_bytes_proto;
2612  	case BPF_FUNC_skb_load_bytes:
2613  		return &bpf_skb_load_bytes_proto;
2614  	case BPF_FUNC_skb_pull_data:
2615  		return &bpf_skb_pull_data_proto;
2616  	case BPF_FUNC_csum_diff:
2617  		return &bpf_csum_diff_proto;
2618  	case BPF_FUNC_csum_update:
2619  		return &bpf_csum_update_proto;
2620  	case BPF_FUNC_l3_csum_replace:
2621  		return &bpf_l3_csum_replace_proto;
2622  	case BPF_FUNC_l4_csum_replace:
2623  		return &bpf_l4_csum_replace_proto;
2624  	case BPF_FUNC_clone_redirect:
2625  		return &bpf_clone_redirect_proto;
2626  	case BPF_FUNC_get_cgroup_classid:
2627  		return &bpf_get_cgroup_classid_proto;
2628  	case BPF_FUNC_skb_vlan_push:
2629  		return &bpf_skb_vlan_push_proto;
2630  	case BPF_FUNC_skb_vlan_pop:
2631  		return &bpf_skb_vlan_pop_proto;
2632  	case BPF_FUNC_skb_change_proto:
2633  		return &bpf_skb_change_proto_proto;
2634  	case BPF_FUNC_skb_change_type:
2635  		return &bpf_skb_change_type_proto;
2636  	case BPF_FUNC_skb_change_tail:
2637  		return &bpf_skb_change_tail_proto;
2638  	case BPF_FUNC_skb_get_tunnel_key:
2639  		return &bpf_skb_get_tunnel_key_proto;
2640  	case BPF_FUNC_skb_set_tunnel_key:
2641  		return bpf_get_skb_set_tunnel_proto(func_id);
2642  	case BPF_FUNC_skb_get_tunnel_opt:
2643  		return &bpf_skb_get_tunnel_opt_proto;
2644  	case BPF_FUNC_skb_set_tunnel_opt:
2645  		return bpf_get_skb_set_tunnel_proto(func_id);
2646  	case BPF_FUNC_redirect:
2647  		return &bpf_redirect_proto;
2648  	case BPF_FUNC_get_route_realm:
2649  		return &bpf_get_route_realm_proto;
2650  	case BPF_FUNC_get_hash_recalc:
2651  		return &bpf_get_hash_recalc_proto;
2652  	case BPF_FUNC_set_hash_invalid:
2653  		return &bpf_set_hash_invalid_proto;
2654  	case BPF_FUNC_perf_event_output:
2655  		return &bpf_skb_event_output_proto;
2656  	case BPF_FUNC_get_smp_processor_id:
2657  		return &bpf_get_smp_processor_id_proto;
2658  	case BPF_FUNC_skb_under_cgroup:
2659  		return &bpf_skb_under_cgroup_proto;
2660  	default:
2661  		return sk_filter_func_proto(func_id);
2662  	}
2663  }
2664  
2665  static const struct bpf_func_proto *
2666  xdp_func_proto(enum bpf_func_id func_id)
2667  {
2668  	switch (func_id) {
2669  	case BPF_FUNC_perf_event_output:
2670  		return &bpf_xdp_event_output_proto;
2671  	case BPF_FUNC_get_smp_processor_id:
2672  		return &bpf_get_smp_processor_id_proto;
2673  	default:
2674  		return sk_filter_func_proto(func_id);
2675  	}
2676  }
2677  
2678  static const struct bpf_func_proto *
2679  cg_skb_func_proto(enum bpf_func_id func_id)
2680  {
2681  	switch (func_id) {
2682  	case BPF_FUNC_skb_load_bytes:
2683  		return &bpf_skb_load_bytes_proto;
2684  	default:
2685  		return sk_filter_func_proto(func_id);
2686  	}
2687  }
2688  
2689  static bool __is_valid_access(int off, int size, enum bpf_access_type type)
2690  {
2691  	if (off < 0 || off >= sizeof(struct __sk_buff))
2692  		return false;
2693  	/* The verifier guarantees that size > 0. */
2694  	if (off % size != 0)
2695  		return false;
2696  	if (size != sizeof(__u32))
2697  		return false;
2698  
2699  	return true;
2700  }
2701  
2702  static bool sk_filter_is_valid_access(int off, int size,
2703  				      enum bpf_access_type type,
2704  				      enum bpf_reg_type *reg_type)
2705  {
2706  	switch (off) {
2707  	case offsetof(struct __sk_buff, tc_classid):
2708  	case offsetof(struct __sk_buff, data):
2709  	case offsetof(struct __sk_buff, data_end):
2710  		return false;
2711  	}
2712  
2713  	if (type == BPF_WRITE) {
2714  		switch (off) {
2715  		case offsetof(struct __sk_buff, cb[0]) ...
2716  		     offsetof(struct __sk_buff, cb[4]):
2717  			break;
2718  		default:
2719  			return false;
2720  		}
2721  	}
2722  
2723  	return __is_valid_access(off, size, type);
2724  }
2725  
2726  static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write,
2727  			       const struct bpf_prog *prog)
2728  {
2729  	struct bpf_insn *insn = insn_buf;
2730  
2731  	if (!direct_write)
2732  		return 0;
2733  
2734  	/* if (!skb->cloned)
2735  	 *       goto start;
2736  	 *
2737  	 * (Fast path; otherwise we conservatively assume that we
2738  	 *  might be a clone and do the rest in the helper.)
2739  	 */
2740  	*insn++ = BPF_LDX_MEM(BPF_B, BPF_REG_6, BPF_REG_1, CLONED_OFFSET());
2741  	*insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_6, CLONED_MASK);
2742  	*insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 7);
2743  
2744  	/* ret = bpf_skb_pull_data(skb, 0); */
2745  	*insn++ = BPF_MOV64_REG(BPF_REG_6, BPF_REG_1);
2746  	*insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_2, BPF_REG_2);
2747  	*insn++ = BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
2748  			       BPF_FUNC_skb_pull_data);
2749  	/* if (!ret)
2750  	 *      goto restore;
2751  	 * return TC_ACT_SHOT;
2752  	 */
2753  	*insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2);
2754  	*insn++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, TC_ACT_SHOT);
2755  	*insn++ = BPF_EXIT_INSN();
2756  
2757  	/* restore: */
2758  	*insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_6);
2759  	/* start: */
2760  	*insn++ = prog->insnsi[0];
2761  
2762  	return insn - insn_buf;
2763  }
2764  
2765  static bool tc_cls_act_is_valid_access(int off, int size,
2766  				       enum bpf_access_type type,
2767  				       enum bpf_reg_type *reg_type)
2768  {
2769  	if (type == BPF_WRITE) {
2770  		switch (off) {
2771  		case offsetof(struct __sk_buff, mark):
2772  		case offsetof(struct __sk_buff, tc_index):
2773  		case offsetof(struct __sk_buff, priority):
2774  		case offsetof(struct __sk_buff, cb[0]) ...
2775  		     offsetof(struct __sk_buff, cb[4]):
2776  		case offsetof(struct __sk_buff, tc_classid):
2777  			break;
2778  		default:
2779  			return false;
2780  		}
2781  	}
2782  
2783  	switch (off) {
2784  	case offsetof(struct __sk_buff, data):
2785  		*reg_type = PTR_TO_PACKET;
2786  		break;
2787  	case offsetof(struct __sk_buff, data_end):
2788  		*reg_type = PTR_TO_PACKET_END;
2789  		break;
2790  	}
2791  
2792  	return __is_valid_access(off, size, type);
2793  }
2794  
2795  static bool __is_valid_xdp_access(int off, int size,
2796  				  enum bpf_access_type type)
2797  {
2798  	if (off < 0 || off >= sizeof(struct xdp_md))
2799  		return false;
2800  	if (off % size != 0)
2801  		return false;
2802  	if (size != sizeof(__u32))
2803  		return false;
2804  
2805  	return true;
2806  }
2807  
2808  static bool xdp_is_valid_access(int off, int size,
2809  				enum bpf_access_type type,
2810  				enum bpf_reg_type *reg_type)
2811  {
2812  	if (type == BPF_WRITE)
2813  		return false;
2814  
2815  	switch (off) {
2816  	case offsetof(struct xdp_md, data):
2817  		*reg_type = PTR_TO_PACKET;
2818  		break;
2819  	case offsetof(struct xdp_md, data_end):
2820  		*reg_type = PTR_TO_PACKET_END;
2821  		break;
2822  	}
2823  
2824  	return __is_valid_xdp_access(off, size, type);
2825  }
2826  
2827  void bpf_warn_invalid_xdp_action(u32 act)
2828  {
2829  	WARN_ONCE(1, "Illegal XDP return value %u, expect packet loss\n", act);
2830  }
2831  EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action);
2832  
2833  static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg,
2834  					int src_reg, int ctx_off,
2835  					struct bpf_insn *insn_buf,
2836  					struct bpf_prog *prog)
2837  {
2838  	struct bpf_insn *insn = insn_buf;
2839  
2840  	switch (ctx_off) {
2841  	case offsetof(struct __sk_buff, len):
2842  		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, len) != 4);
2843  
2844  		*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
2845  				      offsetof(struct sk_buff, len));
2846  		break;
2847  
2848  	case offsetof(struct __sk_buff, protocol):
2849  		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2);
2850  
2851  		*insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
2852  				      offsetof(struct sk_buff, protocol));
2853  		break;
2854  
2855  	case offsetof(struct __sk_buff, vlan_proto):
2856  		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_proto) != 2);
2857  
2858  		*insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
2859  				      offsetof(struct sk_buff, vlan_proto));
2860  		break;
2861  
2862  	case offsetof(struct __sk_buff, priority):
2863  		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, priority) != 4);
2864  
2865  		if (type == BPF_WRITE)
2866  			*insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg,
2867  					      offsetof(struct sk_buff, priority));
2868  		else
2869  			*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
2870  					      offsetof(struct sk_buff, priority));
2871  		break;
2872  
2873  	case offsetof(struct __sk_buff, ingress_ifindex):
2874  		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, skb_iif) != 4);
2875  
2876  		*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
2877  				      offsetof(struct sk_buff, skb_iif));
2878  		break;
2879  
2880  	case offsetof(struct __sk_buff, ifindex):
2881  		BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4);
2882  
2883  		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev),
2884  				      dst_reg, src_reg,
2885  				      offsetof(struct sk_buff, dev));
2886  		*insn++ = BPF_JMP_IMM(BPF_JEQ, dst_reg, 0, 1);
2887  		*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, dst_reg,
2888  				      offsetof(struct net_device, ifindex));
2889  		break;
2890  
2891  	case offsetof(struct __sk_buff, hash):
2892  		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4);
2893  
2894  		*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
2895  				      offsetof(struct sk_buff, hash));
2896  		break;
2897  
2898  	case offsetof(struct __sk_buff, mark):
2899  		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4);
2900  
2901  		if (type == BPF_WRITE)
2902  			*insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg,
2903  					      offsetof(struct sk_buff, mark));
2904  		else
2905  			*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
2906  					      offsetof(struct sk_buff, mark));
2907  		break;
2908  
2909  	case offsetof(struct __sk_buff, pkt_type):
2910  		return convert_skb_access(SKF_AD_PKTTYPE, dst_reg, src_reg, insn);
2911  
2912  	case offsetof(struct __sk_buff, queue_mapping):
2913  		return convert_skb_access(SKF_AD_QUEUE, dst_reg, src_reg, insn);
2914  
2915  	case offsetof(struct __sk_buff, vlan_present):
2916  		return convert_skb_access(SKF_AD_VLAN_TAG_PRESENT,
2917  					  dst_reg, src_reg, insn);
2918  
2919  	case offsetof(struct __sk_buff, vlan_tci):
2920  		return convert_skb_access(SKF_AD_VLAN_TAG,
2921  					  dst_reg, src_reg, insn);
2922  
2923  	case offsetof(struct __sk_buff, cb[0]) ...
2924  	     offsetof(struct __sk_buff, cb[4]):
2925  		BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, data) < 20);
2926  
2927  		prog->cb_access = 1;
2928  		ctx_off -= offsetof(struct __sk_buff, cb[0]);
2929  		ctx_off += offsetof(struct sk_buff, cb);
2930  		ctx_off += offsetof(struct qdisc_skb_cb, data);
2931  		if (type == BPF_WRITE)
2932  			*insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg, ctx_off);
2933  		else
2934  			*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, ctx_off);
2935  		break;
2936  
2937  	case offsetof(struct __sk_buff, tc_classid):
2938  		ctx_off -= offsetof(struct __sk_buff, tc_classid);
2939  		ctx_off += offsetof(struct sk_buff, cb);
2940  		ctx_off += offsetof(struct qdisc_skb_cb, tc_classid);
2941  		if (type == BPF_WRITE)
2942  			*insn++ = BPF_STX_MEM(BPF_H, dst_reg, src_reg, ctx_off);
2943  		else
2944  			*insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, ctx_off);
2945  		break;
2946  
2947  	case offsetof(struct __sk_buff, data):
2948  		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data),
2949  				      dst_reg, src_reg,
2950  				      offsetof(struct sk_buff, data));
2951  		break;
2952  
2953  	case offsetof(struct __sk_buff, data_end):
2954  		ctx_off -= offsetof(struct __sk_buff, data_end);
2955  		ctx_off += offsetof(struct sk_buff, cb);
2956  		ctx_off += offsetof(struct bpf_skb_data_end, data_end);
2957  		*insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), dst_reg, src_reg,
2958  				      ctx_off);
2959  		break;
2960  
2961  	case offsetof(struct __sk_buff, tc_index):
2962  #ifdef CONFIG_NET_SCHED
2963  		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, tc_index) != 2);
2964  
2965  		if (type == BPF_WRITE)
2966  			*insn++ = BPF_STX_MEM(BPF_H, dst_reg, src_reg,
2967  					      offsetof(struct sk_buff, tc_index));
2968  		else
2969  			*insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
2970  					      offsetof(struct sk_buff, tc_index));
2971  		break;
2972  #else
2973  		if (type == BPF_WRITE)
2974  			*insn++ = BPF_MOV64_REG(dst_reg, dst_reg);
2975  		else
2976  			*insn++ = BPF_MOV64_IMM(dst_reg, 0);
2977  		break;
2978  #endif
2979  	}
2980  
2981  	return insn - insn_buf;
2982  }
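
/* Illustrative sketch (editor's addition, not part of the original source):
 * what the conversion above does for a simple field. A program access to
 * the mock __sk_buff context, e.g.
 *
 *	r0 = *(u32 *)(r1 + offsetof(struct __sk_buff, len))
 *
 * is rewritten into a load from the real skb, i.e.
 *
 *	r0 = *(u32 *)(r1 + offsetof(struct sk_buff, len))
 *
 * while fields such as cb[] or data_end are redirected into qdisc_skb_cb /
 * bpf_skb_data_end, as handled case by case above.
 */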
2983  
2984  static u32 tc_cls_act_convert_ctx_access(enum bpf_access_type type, int dst_reg,
2985  					 int src_reg, int ctx_off,
2986  					 struct bpf_insn *insn_buf,
2987  					 struct bpf_prog *prog)
2988  {
2989  	struct bpf_insn *insn = insn_buf;
2990  
2991  	switch (ctx_off) {
2992  	case offsetof(struct __sk_buff, ifindex):
2993  		BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4);
2994  
2995  		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev),
2996  				      dst_reg, src_reg,
2997  				      offsetof(struct sk_buff, dev));
2998  		*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, dst_reg,
2999  				      offsetof(struct net_device, ifindex));
3000  		break;
3001  	default:
3002  		return sk_filter_convert_ctx_access(type, dst_reg, src_reg,
3003  						    ctx_off, insn_buf, prog);
3004  	}
3005  
3006  	return insn - insn_buf;
3007  }
3008  
3009  static u32 xdp_convert_ctx_access(enum bpf_access_type type, int dst_reg,
3010  				  int src_reg, int ctx_off,
3011  				  struct bpf_insn *insn_buf,
3012  				  struct bpf_prog *prog)
3013  {
3014  	struct bpf_insn *insn = insn_buf;
3015  
3016  	switch (ctx_off) {
3017  	case offsetof(struct xdp_md, data):
3018  		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data),
3019  				      dst_reg, src_reg,
3020  				      offsetof(struct xdp_buff, data));
3021  		break;
3022  	case offsetof(struct xdp_md, data_end):
3023  		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_end),
3024  				      dst_reg, src_reg,
3025  				      offsetof(struct xdp_buff, data_end));
3026  		break;
3027  	}
3028  
3029  	return insn - insn_buf;
3030  }
3031  
3032  static const struct bpf_verifier_ops sk_filter_ops = {
3033  	.get_func_proto		= sk_filter_func_proto,
3034  	.is_valid_access	= sk_filter_is_valid_access,
3035  	.convert_ctx_access	= sk_filter_convert_ctx_access,
3036  };
3037  
3038  static const struct bpf_verifier_ops tc_cls_act_ops = {
3039  	.get_func_proto		= tc_cls_act_func_proto,
3040  	.is_valid_access	= tc_cls_act_is_valid_access,
3041  	.convert_ctx_access	= tc_cls_act_convert_ctx_access,
3042  	.gen_prologue		= tc_cls_act_prologue,
3043  };
3044  
3045  static const struct bpf_verifier_ops xdp_ops = {
3046  	.get_func_proto		= xdp_func_proto,
3047  	.is_valid_access	= xdp_is_valid_access,
3048  	.convert_ctx_access	= xdp_convert_ctx_access,
3049  };
3050  
3051  static const struct bpf_verifier_ops cg_skb_ops = {
3052  	.get_func_proto		= cg_skb_func_proto,
3053  	.is_valid_access	= sk_filter_is_valid_access,
3054  	.convert_ctx_access	= sk_filter_convert_ctx_access,
3055  };
3056  
3057  static struct bpf_prog_type_list sk_filter_type __read_mostly = {
3058  	.ops	= &sk_filter_ops,
3059  	.type	= BPF_PROG_TYPE_SOCKET_FILTER,
3060  };
3061  
3062  static struct bpf_prog_type_list sched_cls_type __read_mostly = {
3063  	.ops	= &tc_cls_act_ops,
3064  	.type	= BPF_PROG_TYPE_SCHED_CLS,
3065  };
3066  
3067  static struct bpf_prog_type_list sched_act_type __read_mostly = {
3068  	.ops	= &tc_cls_act_ops,
3069  	.type	= BPF_PROG_TYPE_SCHED_ACT,
3070  };
3071  
3072  static struct bpf_prog_type_list xdp_type __read_mostly = {
3073  	.ops	= &xdp_ops,
3074  	.type	= BPF_PROG_TYPE_XDP,
3075  };
3076  
3077  static struct bpf_prog_type_list cg_skb_type __read_mostly = {
3078  	.ops	= &cg_skb_ops,
3079  	.type	= BPF_PROG_TYPE_CGROUP_SKB,
3080  };
3081  
3082  static int __init register_sk_filter_ops(void)
3083  {
3084  	bpf_register_prog_type(&sk_filter_type);
3085  	bpf_register_prog_type(&sched_cls_type);
3086  	bpf_register_prog_type(&sched_act_type);
3087  	bpf_register_prog_type(&xdp_type);
3088  	bpf_register_prog_type(&cg_skb_type);
3089  
3090  	return 0;
3091  }
3092  late_initcall(register_sk_filter_ops);
3093  
3094  int sk_detach_filter(struct sock *sk)
3095  {
3096  	int ret = -ENOENT;
3097  	struct sk_filter *filter;
3098  
3099  	if (sock_flag(sk, SOCK_FILTER_LOCKED))
3100  		return -EPERM;
3101  
3102  	filter = rcu_dereference_protected(sk->sk_filter,
3103  					   lockdep_sock_is_held(sk));
3104  	if (filter) {
3105  		RCU_INIT_POINTER(sk->sk_filter, NULL);
3106  		sk_filter_uncharge(sk, filter);
3107  		ret = 0;
3108  	}
3109  
3110  	return ret;
3111  }
3112  EXPORT_SYMBOL_GPL(sk_detach_filter);
3113  
3114  int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf,
3115  		  unsigned int len)
3116  {
3117  	struct sock_fprog_kern *fprog;
3118  	struct sk_filter *filter;
3119  	int ret = 0;
3120  
3121  	lock_sock(sk);
3122  	filter = rcu_dereference_protected(sk->sk_filter,
3123  					   lockdep_sock_is_held(sk));
3124  	if (!filter)
3125  		goto out;
3126  
3127  	/* We're copying the filter that was originally attached, so no
3128  	 * conversion/decoding is needed anymore. eBPF programs that have
3129  	 * no original program cannot be dumped through this.
3130  	 */
3131  	ret = -EACCES;
3132  	fprog = filter->prog->orig_prog;
3133  	if (!fprog)
3134  		goto out;
3135  
3136  	ret = fprog->len;
3137  	if (!len)
3138  		/* User space is only asking for the number of filter blocks. */
3139  		goto out;
3140  
3141  	ret = -EINVAL;
3142  	if (len < fprog->len)
3143  		goto out;
3144  
3145  	ret = -EFAULT;
3146  	if (copy_to_user(ubuf, fprog->filter, bpf_classic_proglen(fprog)))
3147  		goto out;
3148  
3149  	/* Instead of bytes, the API is expected to return the number
3150  	 * of filter blocks.
3151  	 */
3152  	ret = fprog->len;
3153  out:
3154  	release_sock(sk);
3155  	return ret;
3156  }
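
/* Illustrative sketch (editor's addition, not part of the original source):
 * the user space side of the dump above, via getsockopt(SO_GET_FILTER).
 * As the comments above note, the length is exchanged in units of filter
 * blocks, not bytes: a first call with len == 0 yields the block count,
 * a second call with a large enough buffer copies the classic program.
 *
 *	socklen_t len = 0;
 *	struct sock_filter *insns;
 *
 *	getsockopt(fd, SOL_SOCKET, SO_GET_FILTER, NULL, &len);
 *	insns = calloc(len, sizeof(*insns));
 *	getsockopt(fd, SOL_SOCKET, SO_GET_FILTER, insns, &len);
 */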
3157