1 /* 2 * Linux Socket Filter - Kernel level socket filtering 3 * 4 * Based on the design of the Berkeley Packet Filter. The new 5 * internal format has been designed by PLUMgrid: 6 * 7 * Copyright (c) 2011 - 2014 PLUMgrid, http://plumgrid.com 8 * 9 * Authors: 10 * 11 * Jay Schulist <jschlst@samba.org> 12 * Alexei Starovoitov <ast@plumgrid.com> 13 * Daniel Borkmann <dborkman@redhat.com> 14 * 15 * This program is free software; you can redistribute it and/or 16 * modify it under the terms of the GNU General Public License 17 * as published by the Free Software Foundation; either version 18 * 2 of the License, or (at your option) any later version. 19 * 20 * Andi Kleen - Fix a few bad bugs and races. 21 * Kris Katterjohn - Added many additional checks in bpf_check_classic() 22 */ 23 24 #include <linux/module.h> 25 #include <linux/types.h> 26 #include <linux/mm.h> 27 #include <linux/fcntl.h> 28 #include <linux/socket.h> 29 #include <linux/sock_diag.h> 30 #include <linux/in.h> 31 #include <linux/inet.h> 32 #include <linux/netdevice.h> 33 #include <linux/if_packet.h> 34 #include <linux/gfp.h> 35 #include <net/ip.h> 36 #include <net/protocol.h> 37 #include <net/netlink.h> 38 #include <linux/skbuff.h> 39 #include <net/sock.h> 40 #include <net/flow_dissector.h> 41 #include <linux/errno.h> 42 #include <linux/timer.h> 43 #include <asm/uaccess.h> 44 #include <asm/unaligned.h> 45 #include <linux/filter.h> 46 #include <linux/ratelimit.h> 47 #include <linux/seccomp.h> 48 #include <linux/if_vlan.h> 49 #include <linux/bpf.h> 50 #include <net/sch_generic.h> 51 #include <net/cls_cgroup.h> 52 #include <net/dst_metadata.h> 53 #include <net/dst.h> 54 #include <net/sock_reuseport.h> 55 56 /** 57 * sk_filter_trim_cap - run a packet through a socket filter 58 * @sk: sock associated with &sk_buff 59 * @skb: buffer to filter 60 * @cap: limit on how short the eBPF program may trim the packet 61 * 62 * Run the eBPF program and then cut skb->data to correct size returned by 63 * the program. If pkt_len is 0 we toss packet. If skb->len is smaller 64 * than pkt_len we keep whole skb->data. This is the socket level 65 * wrapper to BPF_PROG_RUN. It returns 0 if the packet should 66 * be accepted or -EPERM if the packet should be tossed. 67 * 68 */ sk_filter_trim_cap(struct sock * sk,struct sk_buff * skb,unsigned int cap)69 int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap) 70 { 71 int err; 72 struct sk_filter *filter; 73 74 /* 75 * If the skb was allocated from pfmemalloc reserves, only 76 * allow SOCK_MEMALLOC sockets to use it as this socket is 77 * helping free memory 78 */ 79 if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC)) 80 return -ENOMEM; 81 82 err = BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb); 83 if (err) 84 return err; 85 86 err = security_sock_rcv_skb(sk, skb); 87 if (err) 88 return err; 89 90 rcu_read_lock(); 91 filter = rcu_dereference(sk->sk_filter); 92 if (filter) { 93 struct sock *save_sk = skb->sk; 94 unsigned int pkt_len; 95 96 skb->sk = sk; 97 pkt_len = bpf_prog_run_save_cb(filter->prog, skb); 98 skb->sk = save_sk; 99 err = pkt_len ? 
pskb_trim(skb, max(cap, pkt_len)) : -EPERM; 100 } 101 rcu_read_unlock(); 102 103 return err; 104 } 105 EXPORT_SYMBOL(sk_filter_trim_cap); 106 BPF_CALL_1(__skb_get_pay_offset,struct sk_buff *,skb)107 BPF_CALL_1(__skb_get_pay_offset, struct sk_buff *, skb) 108 { 109 return skb_get_poff(skb); 110 } 111 BPF_CALL_3(__skb_get_nlattr,struct sk_buff *,skb,u32,a,u32,x)112 BPF_CALL_3(__skb_get_nlattr, struct sk_buff *, skb, u32, a, u32, x) 113 { 114 struct nlattr *nla; 115 116 if (skb_is_nonlinear(skb)) 117 return 0; 118 119 if (skb->len < sizeof(struct nlattr)) 120 return 0; 121 122 if (a > skb->len - sizeof(struct nlattr)) 123 return 0; 124 125 nla = nla_find((struct nlattr *) &skb->data[a], skb->len - a, x); 126 if (nla) 127 return (void *) nla - (void *) skb->data; 128 129 return 0; 130 } 131 BPF_CALL_3(__skb_get_nlattr_nest,struct sk_buff *,skb,u32,a,u32,x)132 BPF_CALL_3(__skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x) 133 { 134 struct nlattr *nla; 135 136 if (skb_is_nonlinear(skb)) 137 return 0; 138 139 if (skb->len < sizeof(struct nlattr)) 140 return 0; 141 142 if (a > skb->len - sizeof(struct nlattr)) 143 return 0; 144 145 nla = (struct nlattr *) &skb->data[a]; 146 if (nla->nla_len > skb->len - a) 147 return 0; 148 149 nla = nla_find_nested(nla, x); 150 if (nla) 151 return (void *) nla - (void *) skb->data; 152 153 return 0; 154 } 155 BPF_CALL_0(__get_raw_cpu_id)156 BPF_CALL_0(__get_raw_cpu_id) 157 { 158 return raw_smp_processor_id(); 159 } 160 161 static const struct bpf_func_proto bpf_get_raw_smp_processor_id_proto = { 162 .func = __get_raw_cpu_id, 163 .gpl_only = false, 164 .ret_type = RET_INTEGER, 165 }; 166 convert_skb_access(int skb_field,int dst_reg,int src_reg,struct bpf_insn * insn_buf)167 static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg, 168 struct bpf_insn *insn_buf) 169 { 170 struct bpf_insn *insn = insn_buf; 171 172 switch (skb_field) { 173 case SKF_AD_MARK: 174 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4); 175 176 *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, 177 offsetof(struct sk_buff, mark)); 178 break; 179 180 case SKF_AD_PKTTYPE: 181 *insn++ = BPF_LDX_MEM(BPF_B, dst_reg, src_reg, PKT_TYPE_OFFSET()); 182 *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, PKT_TYPE_MAX); 183 #ifdef __BIG_ENDIAN_BITFIELD 184 *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, 5); 185 #endif 186 break; 187 188 case SKF_AD_QUEUE: 189 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, queue_mapping) != 2); 190 191 *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, 192 offsetof(struct sk_buff, queue_mapping)); 193 break; 194 195 case SKF_AD_VLAN_TAG: 196 case SKF_AD_VLAN_TAG_PRESENT: 197 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_tci) != 2); 198 BUILD_BUG_ON(VLAN_TAG_PRESENT != 0x1000); 199 200 /* dst_reg = *(u16 *) (src_reg + offsetof(vlan_tci)) */ 201 *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, 202 offsetof(struct sk_buff, vlan_tci)); 203 if (skb_field == SKF_AD_VLAN_TAG) { 204 *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, 205 ~VLAN_TAG_PRESENT); 206 } else { 207 /* dst_reg >>= 12 */ 208 *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, 12); 209 /* dst_reg &= 1 */ 210 *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, 1); 211 } 212 break; 213 } 214 215 return insn - insn_buf; 216 } 217 convert_bpf_extensions(struct sock_filter * fp,struct bpf_insn ** insnp)218 static bool convert_bpf_extensions(struct sock_filter *fp, 219 struct bpf_insn **insnp) 220 { 221 struct bpf_insn *insn = *insnp; 222 u32 cnt; 223 224 switch (fp->k) { 225 case SKF_AD_OFF + SKF_AD_PROTOCOL: 226 
BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2); 227 228 /* A = *(u16 *) (CTX + offsetof(protocol)) */ 229 *insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX, 230 offsetof(struct sk_buff, protocol)); 231 /* A = ntohs(A) [emitting a nop or swap16] */ 232 *insn = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, 16); 233 break; 234 235 case SKF_AD_OFF + SKF_AD_PKTTYPE: 236 cnt = convert_skb_access(SKF_AD_PKTTYPE, BPF_REG_A, BPF_REG_CTX, insn); 237 insn += cnt - 1; 238 break; 239 240 case SKF_AD_OFF + SKF_AD_IFINDEX: 241 case SKF_AD_OFF + SKF_AD_HATYPE: 242 BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4); 243 BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, type) != 2); 244 245 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev), 246 BPF_REG_TMP, BPF_REG_CTX, 247 offsetof(struct sk_buff, dev)); 248 /* if (tmp != 0) goto pc + 1 */ 249 *insn++ = BPF_JMP_IMM(BPF_JNE, BPF_REG_TMP, 0, 1); 250 *insn++ = BPF_EXIT_INSN(); 251 if (fp->k == SKF_AD_OFF + SKF_AD_IFINDEX) 252 *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_TMP, 253 offsetof(struct net_device, ifindex)); 254 else 255 *insn = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_TMP, 256 offsetof(struct net_device, type)); 257 break; 258 259 case SKF_AD_OFF + SKF_AD_MARK: 260 cnt = convert_skb_access(SKF_AD_MARK, BPF_REG_A, BPF_REG_CTX, insn); 261 insn += cnt - 1; 262 break; 263 264 case SKF_AD_OFF + SKF_AD_RXHASH: 265 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4); 266 267 *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX, 268 offsetof(struct sk_buff, hash)); 269 break; 270 271 case SKF_AD_OFF + SKF_AD_QUEUE: 272 cnt = convert_skb_access(SKF_AD_QUEUE, BPF_REG_A, BPF_REG_CTX, insn); 273 insn += cnt - 1; 274 break; 275 276 case SKF_AD_OFF + SKF_AD_VLAN_TAG: 277 cnt = convert_skb_access(SKF_AD_VLAN_TAG, 278 BPF_REG_A, BPF_REG_CTX, insn); 279 insn += cnt - 1; 280 break; 281 282 case SKF_AD_OFF + SKF_AD_VLAN_TAG_PRESENT: 283 cnt = convert_skb_access(SKF_AD_VLAN_TAG_PRESENT, 284 BPF_REG_A, BPF_REG_CTX, insn); 285 insn += cnt - 1; 286 break; 287 288 case SKF_AD_OFF + SKF_AD_VLAN_TPID: 289 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_proto) != 2); 290 291 /* A = *(u16 *) (CTX + offsetof(vlan_proto)) */ 292 *insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX, 293 offsetof(struct sk_buff, vlan_proto)); 294 /* A = ntohs(A) [emitting a nop or swap16] */ 295 *insn = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, 16); 296 break; 297 298 case SKF_AD_OFF + SKF_AD_PAY_OFFSET: 299 case SKF_AD_OFF + SKF_AD_NLATTR: 300 case SKF_AD_OFF + SKF_AD_NLATTR_NEST: 301 case SKF_AD_OFF + SKF_AD_CPU: 302 case SKF_AD_OFF + SKF_AD_RANDOM: 303 /* arg1 = CTX */ 304 *insn++ = BPF_MOV64_REG(BPF_REG_ARG1, BPF_REG_CTX); 305 /* arg2 = A */ 306 *insn++ = BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_A); 307 /* arg3 = X */ 308 *insn++ = BPF_MOV64_REG(BPF_REG_ARG3, BPF_REG_X); 309 /* Emit call(arg1=CTX, arg2=A, arg3=X) */ 310 switch (fp->k) { 311 case SKF_AD_OFF + SKF_AD_PAY_OFFSET: 312 *insn = BPF_EMIT_CALL(__skb_get_pay_offset); 313 break; 314 case SKF_AD_OFF + SKF_AD_NLATTR: 315 *insn = BPF_EMIT_CALL(__skb_get_nlattr); 316 break; 317 case SKF_AD_OFF + SKF_AD_NLATTR_NEST: 318 *insn = BPF_EMIT_CALL(__skb_get_nlattr_nest); 319 break; 320 case SKF_AD_OFF + SKF_AD_CPU: 321 *insn = BPF_EMIT_CALL(__get_raw_cpu_id); 322 break; 323 case SKF_AD_OFF + SKF_AD_RANDOM: 324 *insn = BPF_EMIT_CALL(bpf_user_rnd_u32); 325 bpf_user_rnd_init_once(); 326 break; 327 } 328 break; 329 330 case SKF_AD_OFF + SKF_AD_ALU_XOR_X: 331 /* A ^= X */ 332 *insn = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_X); 333 break; 334 335 
default: 336 /* This is just a dummy call to avoid letting the compiler 337 * evict __bpf_call_base() as an optimization. Placed here 338 * where no-one bothers. 339 */ 340 BUG_ON(__bpf_call_base(0, 0, 0, 0, 0) != 0); 341 return false; 342 } 343 344 *insnp = insn; 345 return true; 346 } 347 348 /** 349 * bpf_convert_filter - convert filter program 350 * @prog: the user passed filter program 351 * @len: the length of the user passed filter program 352 * @new_prog: buffer where converted program will be stored 353 * @new_len: pointer to store length of converted program 354 * 355 * Remap 'sock_filter' style BPF instruction set to 'sock_filter_ext' style. 356 * Conversion workflow: 357 * 358 * 1) First pass for calculating the new program length: 359 * bpf_convert_filter(old_prog, old_len, NULL, &new_len) 360 * 361 * 2) 2nd pass to remap in two passes: 1st pass finds new 362 * jump offsets, 2nd pass remapping: 363 * new_prog = kmalloc(sizeof(struct bpf_insn) * new_len); 364 * bpf_convert_filter(old_prog, old_len, new_prog, &new_len); 365 */ bpf_convert_filter(struct sock_filter * prog,int len,struct bpf_insn * new_prog,int * new_len)366 static int bpf_convert_filter(struct sock_filter *prog, int len, 367 struct bpf_insn *new_prog, int *new_len) 368 { 369 int new_flen = 0, pass = 0, target, i; 370 struct bpf_insn *new_insn; 371 struct sock_filter *fp; 372 int *addrs = NULL; 373 u8 bpf_src; 374 375 BUILD_BUG_ON(BPF_MEMWORDS * sizeof(u32) > MAX_BPF_STACK); 376 BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG); 377 378 if (len <= 0 || len > BPF_MAXINSNS) 379 return -EINVAL; 380 381 if (new_prog) { 382 addrs = kcalloc(len, sizeof(*addrs), 383 GFP_KERNEL | __GFP_NOWARN); 384 if (!addrs) 385 return -ENOMEM; 386 } 387 388 do_pass: 389 new_insn = new_prog; 390 fp = prog; 391 392 /* Classic BPF related prologue emission. */ 393 if (new_insn) { 394 /* Classic BPF expects A and X to be reset first. These need 395 * to be guaranteed to be the first two instructions. 396 */ 397 *new_insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_A, BPF_REG_A); 398 *new_insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_X, BPF_REG_X); 399 400 /* All programs must keep CTX in callee saved BPF_REG_CTX. 401 * In eBPF case it's done by the compiler, here we need to 402 * do this ourself. Initial CTX is present in BPF_REG_ARG1. 403 */ 404 *new_insn++ = BPF_MOV64_REG(BPF_REG_CTX, BPF_REG_ARG1); 405 } else { 406 new_insn += 3; 407 } 408 409 for (i = 0; i < len; fp++, i++) { 410 struct bpf_insn tmp_insns[6] = { }; 411 struct bpf_insn *insn = tmp_insns; 412 413 if (addrs) 414 addrs[i] = new_insn - new_prog; 415 416 switch (fp->code) { 417 /* All arithmetic insns and skb loads map as-is. 
*/ 418 case BPF_ALU | BPF_ADD | BPF_X: 419 case BPF_ALU | BPF_ADD | BPF_K: 420 case BPF_ALU | BPF_SUB | BPF_X: 421 case BPF_ALU | BPF_SUB | BPF_K: 422 case BPF_ALU | BPF_AND | BPF_X: 423 case BPF_ALU | BPF_AND | BPF_K: 424 case BPF_ALU | BPF_OR | BPF_X: 425 case BPF_ALU | BPF_OR | BPF_K: 426 case BPF_ALU | BPF_LSH | BPF_X: 427 case BPF_ALU | BPF_LSH | BPF_K: 428 case BPF_ALU | BPF_RSH | BPF_X: 429 case BPF_ALU | BPF_RSH | BPF_K: 430 case BPF_ALU | BPF_XOR | BPF_X: 431 case BPF_ALU | BPF_XOR | BPF_K: 432 case BPF_ALU | BPF_MUL | BPF_X: 433 case BPF_ALU | BPF_MUL | BPF_K: 434 case BPF_ALU | BPF_DIV | BPF_X: 435 case BPF_ALU | BPF_DIV | BPF_K: 436 case BPF_ALU | BPF_MOD | BPF_X: 437 case BPF_ALU | BPF_MOD | BPF_K: 438 case BPF_ALU | BPF_NEG: 439 case BPF_LD | BPF_ABS | BPF_W: 440 case BPF_LD | BPF_ABS | BPF_H: 441 case BPF_LD | BPF_ABS | BPF_B: 442 case BPF_LD | BPF_IND | BPF_W: 443 case BPF_LD | BPF_IND | BPF_H: 444 case BPF_LD | BPF_IND | BPF_B: 445 /* Check for overloaded BPF extension and 446 * directly convert it if found, otherwise 447 * just move on with mapping. 448 */ 449 if (BPF_CLASS(fp->code) == BPF_LD && 450 BPF_MODE(fp->code) == BPF_ABS && 451 convert_bpf_extensions(fp, &insn)) 452 break; 453 454 if (fp->code == (BPF_ALU | BPF_DIV | BPF_X) || 455 fp->code == (BPF_ALU | BPF_MOD | BPF_X)) 456 *insn++ = BPF_MOV32_REG(BPF_REG_X, BPF_REG_X); 457 458 *insn = BPF_RAW_INSN(fp->code, BPF_REG_A, BPF_REG_X, 0, fp->k); 459 break; 460 461 /* Jump transformation cannot use BPF block macros 462 * everywhere as offset calculation and target updates 463 * require a bit more work than the rest, i.e. jump 464 * opcodes map as-is, but offsets need adjustment. 465 */ 466 467 #define BPF_EMIT_JMP \ 468 do { \ 469 if (target >= len || target < 0) \ 470 goto err; \ 471 insn->off = addrs ? addrs[target] - addrs[i] - 1 : 0; \ 472 /* Adjust pc relative offset for 2nd or 3rd insn. */ \ 473 insn->off -= insn - tmp_insns; \ 474 } while (0) 475 476 case BPF_JMP | BPF_JA: 477 target = i + fp->k + 1; 478 insn->code = fp->code; 479 BPF_EMIT_JMP; 480 break; 481 482 case BPF_JMP | BPF_JEQ | BPF_K: 483 case BPF_JMP | BPF_JEQ | BPF_X: 484 case BPF_JMP | BPF_JSET | BPF_K: 485 case BPF_JMP | BPF_JSET | BPF_X: 486 case BPF_JMP | BPF_JGT | BPF_K: 487 case BPF_JMP | BPF_JGT | BPF_X: 488 case BPF_JMP | BPF_JGE | BPF_K: 489 case BPF_JMP | BPF_JGE | BPF_X: 490 if (BPF_SRC(fp->code) == BPF_K && (int) fp->k < 0) { 491 /* BPF immediates are signed, zero extend 492 * immediate into tmp register and use it 493 * in compare insn. 494 */ 495 *insn++ = BPF_MOV32_IMM(BPF_REG_TMP, fp->k); 496 497 insn->dst_reg = BPF_REG_A; 498 insn->src_reg = BPF_REG_TMP; 499 bpf_src = BPF_X; 500 } else { 501 insn->dst_reg = BPF_REG_A; 502 insn->imm = fp->k; 503 bpf_src = BPF_SRC(fp->code); 504 insn->src_reg = bpf_src == BPF_X ? BPF_REG_X : 0; 505 } 506 507 /* Common case where 'jump_false' is next insn. */ 508 if (fp->jf == 0) { 509 insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src; 510 target = i + fp->jt + 1; 511 BPF_EMIT_JMP; 512 break; 513 } 514 515 /* Convert JEQ into JNE when 'jump_true' is next insn. */ 516 if (fp->jt == 0 && BPF_OP(fp->code) == BPF_JEQ) { 517 insn->code = BPF_JMP | BPF_JNE | bpf_src; 518 target = i + fp->jf + 1; 519 BPF_EMIT_JMP; 520 break; 521 } 522 523 /* Other jumps are mapped into two insns: Jxx and JA. 
*/ 524 target = i + fp->jt + 1; 525 insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src; 526 BPF_EMIT_JMP; 527 insn++; 528 529 insn->code = BPF_JMP | BPF_JA; 530 target = i + fp->jf + 1; 531 BPF_EMIT_JMP; 532 break; 533 534 /* ldxb 4 * ([14] & 0xf) is remaped into 6 insns. */ 535 case BPF_LDX | BPF_MSH | BPF_B: 536 /* tmp = A */ 537 *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_A); 538 /* A = BPF_R0 = *(u8 *) (skb->data + K) */ 539 *insn++ = BPF_LD_ABS(BPF_B, fp->k); 540 /* A &= 0xf */ 541 *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, 0xf); 542 /* A <<= 2 */ 543 *insn++ = BPF_ALU32_IMM(BPF_LSH, BPF_REG_A, 2); 544 /* X = A */ 545 *insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A); 546 /* A = tmp */ 547 *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_TMP); 548 break; 549 550 /* RET_K is remaped into 2 insns. RET_A case doesn't need an 551 * extra mov as BPF_REG_0 is already mapped into BPF_REG_A. 552 */ 553 case BPF_RET | BPF_A: 554 case BPF_RET | BPF_K: 555 if (BPF_RVAL(fp->code) == BPF_K) 556 *insn++ = BPF_MOV32_RAW(BPF_K, BPF_REG_0, 557 0, fp->k); 558 *insn = BPF_EXIT_INSN(); 559 break; 560 561 /* Store to stack. */ 562 case BPF_ST: 563 case BPF_STX: 564 *insn = BPF_STX_MEM(BPF_W, BPF_REG_FP, BPF_CLASS(fp->code) == 565 BPF_ST ? BPF_REG_A : BPF_REG_X, 566 -(BPF_MEMWORDS - fp->k) * 4); 567 break; 568 569 /* Load from stack. */ 570 case BPF_LD | BPF_MEM: 571 case BPF_LDX | BPF_MEM: 572 *insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD ? 573 BPF_REG_A : BPF_REG_X, BPF_REG_FP, 574 -(BPF_MEMWORDS - fp->k) * 4); 575 break; 576 577 /* A = K or X = K */ 578 case BPF_LD | BPF_IMM: 579 case BPF_LDX | BPF_IMM: 580 *insn = BPF_MOV32_IMM(BPF_CLASS(fp->code) == BPF_LD ? 581 BPF_REG_A : BPF_REG_X, fp->k); 582 break; 583 584 /* X = A */ 585 case BPF_MISC | BPF_TAX: 586 *insn = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A); 587 break; 588 589 /* A = X */ 590 case BPF_MISC | BPF_TXA: 591 *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_X); 592 break; 593 594 /* A = skb->len or X = skb->len */ 595 case BPF_LD | BPF_W | BPF_LEN: 596 case BPF_LDX | BPF_W | BPF_LEN: 597 *insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD ? 598 BPF_REG_A : BPF_REG_X, BPF_REG_CTX, 599 offsetof(struct sk_buff, len)); 600 break; 601 602 /* Access seccomp_data fields. */ 603 case BPF_LDX | BPF_ABS | BPF_W: 604 /* A = *(u32 *) (ctx + K) */ 605 *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX, fp->k); 606 break; 607 608 /* Unknown instruction. */ 609 default: 610 goto err; 611 } 612 613 insn++; 614 if (new_prog) 615 memcpy(new_insn, tmp_insns, 616 sizeof(*insn) * (insn - tmp_insns)); 617 new_insn += insn - tmp_insns; 618 } 619 620 if (!new_prog) { 621 /* Only calculating new length. */ 622 *new_len = new_insn - new_prog; 623 return 0; 624 } 625 626 pass++; 627 if (new_flen != new_insn - new_prog) { 628 new_flen = new_insn - new_prog; 629 if (pass > 2) 630 goto err; 631 goto do_pass; 632 } 633 634 kfree(addrs); 635 BUG_ON(*new_len != new_flen); 636 return 0; 637 err: 638 kfree(addrs); 639 return -EINVAL; 640 } 641 642 /* Security: 643 * 644 * As we dont want to clear mem[] array for each packet going through 645 * __bpf_prog_run(), we check that filter loaded by user never try to read 646 * a cell if not previously written, and we check all branches to be sure 647 * a malicious user doesn't try to abuse us. 
648 */ check_load_and_stores(const struct sock_filter * filter,int flen)649 static int check_load_and_stores(const struct sock_filter *filter, int flen) 650 { 651 u16 *masks, memvalid = 0; /* One bit per cell, 16 cells */ 652 int pc, ret = 0; 653 654 BUILD_BUG_ON(BPF_MEMWORDS > 16); 655 656 masks = kmalloc_array(flen, sizeof(*masks), GFP_KERNEL); 657 if (!masks) 658 return -ENOMEM; 659 660 memset(masks, 0xff, flen * sizeof(*masks)); 661 662 for (pc = 0; pc < flen; pc++) { 663 memvalid &= masks[pc]; 664 665 switch (filter[pc].code) { 666 case BPF_ST: 667 case BPF_STX: 668 memvalid |= (1 << filter[pc].k); 669 break; 670 case BPF_LD | BPF_MEM: 671 case BPF_LDX | BPF_MEM: 672 if (!(memvalid & (1 << filter[pc].k))) { 673 ret = -EINVAL; 674 goto error; 675 } 676 break; 677 case BPF_JMP | BPF_JA: 678 /* A jump must set masks on target */ 679 masks[pc + 1 + filter[pc].k] &= memvalid; 680 memvalid = ~0; 681 break; 682 case BPF_JMP | BPF_JEQ | BPF_K: 683 case BPF_JMP | BPF_JEQ | BPF_X: 684 case BPF_JMP | BPF_JGE | BPF_K: 685 case BPF_JMP | BPF_JGE | BPF_X: 686 case BPF_JMP | BPF_JGT | BPF_K: 687 case BPF_JMP | BPF_JGT | BPF_X: 688 case BPF_JMP | BPF_JSET | BPF_K: 689 case BPF_JMP | BPF_JSET | BPF_X: 690 /* A jump must set masks on targets */ 691 masks[pc + 1 + filter[pc].jt] &= memvalid; 692 masks[pc + 1 + filter[pc].jf] &= memvalid; 693 memvalid = ~0; 694 break; 695 } 696 } 697 error: 698 kfree(masks); 699 return ret; 700 } 701 chk_code_allowed(u16 code_to_probe)702 static bool chk_code_allowed(u16 code_to_probe) 703 { 704 static const bool codes[] = { 705 /* 32 bit ALU operations */ 706 [BPF_ALU | BPF_ADD | BPF_K] = true, 707 [BPF_ALU | BPF_ADD | BPF_X] = true, 708 [BPF_ALU | BPF_SUB | BPF_K] = true, 709 [BPF_ALU | BPF_SUB | BPF_X] = true, 710 [BPF_ALU | BPF_MUL | BPF_K] = true, 711 [BPF_ALU | BPF_MUL | BPF_X] = true, 712 [BPF_ALU | BPF_DIV | BPF_K] = true, 713 [BPF_ALU | BPF_DIV | BPF_X] = true, 714 [BPF_ALU | BPF_MOD | BPF_K] = true, 715 [BPF_ALU | BPF_MOD | BPF_X] = true, 716 [BPF_ALU | BPF_AND | BPF_K] = true, 717 [BPF_ALU | BPF_AND | BPF_X] = true, 718 [BPF_ALU | BPF_OR | BPF_K] = true, 719 [BPF_ALU | BPF_OR | BPF_X] = true, 720 [BPF_ALU | BPF_XOR | BPF_K] = true, 721 [BPF_ALU | BPF_XOR | BPF_X] = true, 722 [BPF_ALU | BPF_LSH | BPF_K] = true, 723 [BPF_ALU | BPF_LSH | BPF_X] = true, 724 [BPF_ALU | BPF_RSH | BPF_K] = true, 725 [BPF_ALU | BPF_RSH | BPF_X] = true, 726 [BPF_ALU | BPF_NEG] = true, 727 /* Load instructions */ 728 [BPF_LD | BPF_W | BPF_ABS] = true, 729 [BPF_LD | BPF_H | BPF_ABS] = true, 730 [BPF_LD | BPF_B | BPF_ABS] = true, 731 [BPF_LD | BPF_W | BPF_LEN] = true, 732 [BPF_LD | BPF_W | BPF_IND] = true, 733 [BPF_LD | BPF_H | BPF_IND] = true, 734 [BPF_LD | BPF_B | BPF_IND] = true, 735 [BPF_LD | BPF_IMM] = true, 736 [BPF_LD | BPF_MEM] = true, 737 [BPF_LDX | BPF_W | BPF_LEN] = true, 738 [BPF_LDX | BPF_B | BPF_MSH] = true, 739 [BPF_LDX | BPF_IMM] = true, 740 [BPF_LDX | BPF_MEM] = true, 741 /* Store instructions */ 742 [BPF_ST] = true, 743 [BPF_STX] = true, 744 /* Misc instructions */ 745 [BPF_MISC | BPF_TAX] = true, 746 [BPF_MISC | BPF_TXA] = true, 747 /* Return instructions */ 748 [BPF_RET | BPF_K] = true, 749 [BPF_RET | BPF_A] = true, 750 /* Jump instructions */ 751 [BPF_JMP | BPF_JA] = true, 752 [BPF_JMP | BPF_JEQ | BPF_K] = true, 753 [BPF_JMP | BPF_JEQ | BPF_X] = true, 754 [BPF_JMP | BPF_JGE | BPF_K] = true, 755 [BPF_JMP | BPF_JGE | BPF_X] = true, 756 [BPF_JMP | BPF_JGT | BPF_K] = true, 757 [BPF_JMP | BPF_JGT | BPF_X] = true, 758 [BPF_JMP | BPF_JSET | BPF_K] = true, 759 [BPF_JMP | 
BPF_JSET | BPF_X] = true, 760 }; 761 762 if (code_to_probe >= ARRAY_SIZE(codes)) 763 return false; 764 765 return codes[code_to_probe]; 766 } 767 bpf_check_basics_ok(const struct sock_filter * filter,unsigned int flen)768 static bool bpf_check_basics_ok(const struct sock_filter *filter, 769 unsigned int flen) 770 { 771 if (filter == NULL) 772 return false; 773 if (flen == 0 || flen > BPF_MAXINSNS) 774 return false; 775 776 return true; 777 } 778 779 /** 780 * bpf_check_classic - verify socket filter code 781 * @filter: filter to verify 782 * @flen: length of filter 783 * 784 * Check the user's filter code. If we let some ugly 785 * filter code slip through kaboom! The filter must contain 786 * no references or jumps that are out of range, no illegal 787 * instructions, and must end with a RET instruction. 788 * 789 * All jumps are forward as they are not signed. 790 * 791 * Returns 0 if the rule set is legal or -EINVAL if not. 792 */ bpf_check_classic(const struct sock_filter * filter,unsigned int flen)793 static int bpf_check_classic(const struct sock_filter *filter, 794 unsigned int flen) 795 { 796 bool anc_found; 797 int pc; 798 799 /* Check the filter code now */ 800 for (pc = 0; pc < flen; pc++) { 801 const struct sock_filter *ftest = &filter[pc]; 802 803 /* May we actually operate on this code? */ 804 if (!chk_code_allowed(ftest->code)) 805 return -EINVAL; 806 807 /* Some instructions need special checks */ 808 switch (ftest->code) { 809 case BPF_ALU | BPF_DIV | BPF_K: 810 case BPF_ALU | BPF_MOD | BPF_K: 811 /* Check for division by zero */ 812 if (ftest->k == 0) 813 return -EINVAL; 814 break; 815 case BPF_ALU | BPF_LSH | BPF_K: 816 case BPF_ALU | BPF_RSH | BPF_K: 817 if (ftest->k >= 32) 818 return -EINVAL; 819 break; 820 case BPF_LD | BPF_MEM: 821 case BPF_LDX | BPF_MEM: 822 case BPF_ST: 823 case BPF_STX: 824 /* Check for invalid memory addresses */ 825 if (ftest->k >= BPF_MEMWORDS) 826 return -EINVAL; 827 break; 828 case BPF_JMP | BPF_JA: 829 /* Note, the large ftest->k might cause loops. 830 * Compare this with conditional jumps below, 831 * where offsets are limited. 
--ANK (981016) 832 */ 833 if (ftest->k >= (unsigned int)(flen - pc - 1)) 834 return -EINVAL; 835 break; 836 case BPF_JMP | BPF_JEQ | BPF_K: 837 case BPF_JMP | BPF_JEQ | BPF_X: 838 case BPF_JMP | BPF_JGE | BPF_K: 839 case BPF_JMP | BPF_JGE | BPF_X: 840 case BPF_JMP | BPF_JGT | BPF_K: 841 case BPF_JMP | BPF_JGT | BPF_X: 842 case BPF_JMP | BPF_JSET | BPF_K: 843 case BPF_JMP | BPF_JSET | BPF_X: 844 /* Both conditionals must be safe */ 845 if (pc + ftest->jt + 1 >= flen || 846 pc + ftest->jf + 1 >= flen) 847 return -EINVAL; 848 break; 849 case BPF_LD | BPF_W | BPF_ABS: 850 case BPF_LD | BPF_H | BPF_ABS: 851 case BPF_LD | BPF_B | BPF_ABS: 852 anc_found = false; 853 if (bpf_anc_helper(ftest) & BPF_ANC) 854 anc_found = true; 855 /* Ancillary operation unknown or unsupported */ 856 if (anc_found == false && ftest->k >= SKF_AD_OFF) 857 return -EINVAL; 858 } 859 } 860 861 /* Last instruction must be a RET code */ 862 switch (filter[flen - 1].code) { 863 case BPF_RET | BPF_K: 864 case BPF_RET | BPF_A: 865 return check_load_and_stores(filter, flen); 866 } 867 868 return -EINVAL; 869 } 870 bpf_prog_store_orig_filter(struct bpf_prog * fp,const struct sock_fprog * fprog)871 static int bpf_prog_store_orig_filter(struct bpf_prog *fp, 872 const struct sock_fprog *fprog) 873 { 874 unsigned int fsize = bpf_classic_proglen(fprog); 875 struct sock_fprog_kern *fkprog; 876 877 fp->orig_prog = kmalloc(sizeof(*fkprog), GFP_KERNEL); 878 if (!fp->orig_prog) 879 return -ENOMEM; 880 881 fkprog = fp->orig_prog; 882 fkprog->len = fprog->len; 883 884 fkprog->filter = kmemdup(fp->insns, fsize, 885 GFP_KERNEL | __GFP_NOWARN); 886 if (!fkprog->filter) { 887 kfree(fp->orig_prog); 888 return -ENOMEM; 889 } 890 891 return 0; 892 } 893 bpf_release_orig_filter(struct bpf_prog * fp)894 static void bpf_release_orig_filter(struct bpf_prog *fp) 895 { 896 struct sock_fprog_kern *fprog = fp->orig_prog; 897 898 if (fprog) { 899 kfree(fprog->filter); 900 kfree(fprog); 901 } 902 } 903 __bpf_prog_release(struct bpf_prog * prog)904 static void __bpf_prog_release(struct bpf_prog *prog) 905 { 906 if (prog->type == BPF_PROG_TYPE_SOCKET_FILTER) { 907 bpf_prog_put(prog); 908 } else { 909 bpf_release_orig_filter(prog); 910 bpf_prog_free(prog); 911 } 912 } 913 __sk_filter_release(struct sk_filter * fp)914 static void __sk_filter_release(struct sk_filter *fp) 915 { 916 __bpf_prog_release(fp->prog); 917 kfree(fp); 918 } 919 920 /** 921 * sk_filter_release_rcu - Release a socket filter by rcu_head 922 * @rcu: rcu_head that contains the sk_filter to free 923 */ sk_filter_release_rcu(struct rcu_head * rcu)924 static void sk_filter_release_rcu(struct rcu_head *rcu) 925 { 926 struct sk_filter *fp = container_of(rcu, struct sk_filter, rcu); 927 928 __sk_filter_release(fp); 929 } 930 931 /** 932 * sk_filter_release - release a socket filter 933 * @fp: filter to remove 934 * 935 * Remove a filter from a socket and release its resources. 
936 */ sk_filter_release(struct sk_filter * fp)937 static void sk_filter_release(struct sk_filter *fp) 938 { 939 if (atomic_dec_and_test(&fp->refcnt)) 940 call_rcu(&fp->rcu, sk_filter_release_rcu); 941 } 942 sk_filter_uncharge(struct sock * sk,struct sk_filter * fp)943 void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp) 944 { 945 u32 filter_size = bpf_prog_size(fp->prog->len); 946 947 atomic_sub(filter_size, &sk->sk_omem_alloc); 948 sk_filter_release(fp); 949 } 950 951 /* try to charge the socket memory if there is space available 952 * return true on success 953 */ sk_filter_charge(struct sock * sk,struct sk_filter * fp)954 bool sk_filter_charge(struct sock *sk, struct sk_filter *fp) 955 { 956 u32 filter_size = bpf_prog_size(fp->prog->len); 957 958 /* same check as in sock_kmalloc() */ 959 if (filter_size <= sysctl_optmem_max && 960 atomic_read(&sk->sk_omem_alloc) + filter_size < sysctl_optmem_max) { 961 atomic_inc(&fp->refcnt); 962 atomic_add(filter_size, &sk->sk_omem_alloc); 963 return true; 964 } 965 return false; 966 } 967 bpf_migrate_filter(struct bpf_prog * fp)968 static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp) 969 { 970 struct sock_filter *old_prog; 971 struct bpf_prog *old_fp; 972 int err, new_len, old_len = fp->len; 973 974 /* We are free to overwrite insns et al right here as it 975 * won't be used at this point in time anymore internally 976 * after the migration to the internal BPF instruction 977 * representation. 978 */ 979 BUILD_BUG_ON(sizeof(struct sock_filter) != 980 sizeof(struct bpf_insn)); 981 982 /* Conversion cannot happen on overlapping memory areas, 983 * so we need to keep the user BPF around until the 2nd 984 * pass. At this time, the user BPF is stored in fp->insns. 985 */ 986 old_prog = kmemdup(fp->insns, old_len * sizeof(struct sock_filter), 987 GFP_KERNEL | __GFP_NOWARN); 988 if (!old_prog) { 989 err = -ENOMEM; 990 goto out_err; 991 } 992 993 /* 1st pass: calculate the new program length. */ 994 err = bpf_convert_filter(old_prog, old_len, NULL, &new_len); 995 if (err) 996 goto out_err_free; 997 998 /* Expand fp for appending the new filter representation. */ 999 old_fp = fp; 1000 fp = bpf_prog_realloc(old_fp, bpf_prog_size(new_len), 0); 1001 if (!fp) { 1002 /* The old_fp is still around in case we couldn't 1003 * allocate new memory, so uncharge on that one. 1004 */ 1005 fp = old_fp; 1006 err = -ENOMEM; 1007 goto out_err_free; 1008 } 1009 1010 fp->len = new_len; 1011 1012 /* 2nd pass: remap sock_filter insns into bpf_insn insns. */ 1013 err = bpf_convert_filter(old_prog, old_len, fp->insnsi, &new_len); 1014 if (err) 1015 /* 2nd bpf_convert_filter() can fail only if it fails 1016 * to allocate memory, remapping must succeed. Note, 1017 * that at this time old_fp has already been released 1018 * by krealloc(). 
1019 */ 1020 goto out_err_free; 1021 1022 fp = bpf_prog_select_runtime(fp, &err); 1023 if (err) 1024 goto out_err_free; 1025 1026 kfree(old_prog); 1027 return fp; 1028 1029 out_err_free: 1030 kfree(old_prog); 1031 out_err: 1032 __bpf_prog_release(fp); 1033 return ERR_PTR(err); 1034 } 1035 bpf_prepare_filter(struct bpf_prog * fp,bpf_aux_classic_check_t trans)1036 static struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp, 1037 bpf_aux_classic_check_t trans) 1038 { 1039 int err; 1040 1041 fp->bpf_func = NULL; 1042 fp->jited = 0; 1043 1044 err = bpf_check_classic(fp->insns, fp->len); 1045 if (err) { 1046 __bpf_prog_release(fp); 1047 return ERR_PTR(err); 1048 } 1049 1050 /* There might be additional checks and transformations 1051 * needed on classic filters, f.e. in case of seccomp. 1052 */ 1053 if (trans) { 1054 err = trans(fp->insns, fp->len); 1055 if (err) { 1056 __bpf_prog_release(fp); 1057 return ERR_PTR(err); 1058 } 1059 } 1060 1061 /* Probe if we can JIT compile the filter and if so, do 1062 * the compilation of the filter. 1063 */ 1064 bpf_jit_compile(fp); 1065 1066 /* JIT compiler couldn't process this filter, so do the 1067 * internal BPF translation for the optimized interpreter. 1068 */ 1069 if (!fp->jited) 1070 fp = bpf_migrate_filter(fp); 1071 1072 return fp; 1073 } 1074 1075 /** 1076 * bpf_prog_create - create an unattached filter 1077 * @pfp: the unattached filter that is created 1078 * @fprog: the filter program 1079 * 1080 * Create a filter independent of any socket. We first run some 1081 * sanity checks on it to make sure it does not explode on us later. 1082 * If an error occurs or there is insufficient memory for the filter 1083 * a negative errno code is returned. On success the return is zero. 1084 */ bpf_prog_create(struct bpf_prog ** pfp,struct sock_fprog_kern * fprog)1085 int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog) 1086 { 1087 unsigned int fsize = bpf_classic_proglen(fprog); 1088 struct bpf_prog *fp; 1089 1090 /* Make sure new filter is there and in the right amounts. */ 1091 if (!bpf_check_basics_ok(fprog->filter, fprog->len)) 1092 return -EINVAL; 1093 1094 fp = bpf_prog_alloc(bpf_prog_size(fprog->len), 0); 1095 if (!fp) 1096 return -ENOMEM; 1097 1098 memcpy(fp->insns, fprog->filter, fsize); 1099 1100 fp->len = fprog->len; 1101 /* Since unattached filters are not copied back to user 1102 * space through sk_get_filter(), we do not need to hold 1103 * a copy here, and can spare us the work. 1104 */ 1105 fp->orig_prog = NULL; 1106 1107 /* bpf_prepare_filter() already takes care of freeing 1108 * memory in case something goes wrong. 1109 */ 1110 fp = bpf_prepare_filter(fp, NULL); 1111 if (IS_ERR(fp)) 1112 return PTR_ERR(fp); 1113 1114 *pfp = fp; 1115 return 0; 1116 } 1117 EXPORT_SYMBOL_GPL(bpf_prog_create); 1118 1119 /** 1120 * bpf_prog_create_from_user - create an unattached filter from user buffer 1121 * @pfp: the unattached filter that is created 1122 * @fprog: the filter program 1123 * @trans: post-classic verifier transformation handler 1124 * @save_orig: save classic BPF program 1125 * 1126 * This function effectively does the same as bpf_prog_create(), only 1127 * that it builds up its insns buffer from user space provided buffer. 1128 * It also allows for passing a bpf_aux_classic_check_t handler. 
1129 */ bpf_prog_create_from_user(struct bpf_prog ** pfp,struct sock_fprog * fprog,bpf_aux_classic_check_t trans,bool save_orig)1130 int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog, 1131 bpf_aux_classic_check_t trans, bool save_orig) 1132 { 1133 unsigned int fsize = bpf_classic_proglen(fprog); 1134 struct bpf_prog *fp; 1135 int err; 1136 1137 /* Make sure new filter is there and in the right amounts. */ 1138 if (!bpf_check_basics_ok(fprog->filter, fprog->len)) 1139 return -EINVAL; 1140 1141 fp = bpf_prog_alloc(bpf_prog_size(fprog->len), 0); 1142 if (!fp) 1143 return -ENOMEM; 1144 1145 if (copy_from_user(fp->insns, fprog->filter, fsize)) { 1146 __bpf_prog_free(fp); 1147 return -EFAULT; 1148 } 1149 1150 fp->len = fprog->len; 1151 fp->orig_prog = NULL; 1152 1153 if (save_orig) { 1154 err = bpf_prog_store_orig_filter(fp, fprog); 1155 if (err) { 1156 __bpf_prog_free(fp); 1157 return -ENOMEM; 1158 } 1159 } 1160 1161 /* bpf_prepare_filter() already takes care of freeing 1162 * memory in case something goes wrong. 1163 */ 1164 fp = bpf_prepare_filter(fp, trans); 1165 if (IS_ERR(fp)) 1166 return PTR_ERR(fp); 1167 1168 *pfp = fp; 1169 return 0; 1170 } 1171 EXPORT_SYMBOL_GPL(bpf_prog_create_from_user); 1172 bpf_prog_destroy(struct bpf_prog * fp)1173 void bpf_prog_destroy(struct bpf_prog *fp) 1174 { 1175 __bpf_prog_release(fp); 1176 } 1177 EXPORT_SYMBOL_GPL(bpf_prog_destroy); 1178 __sk_attach_prog(struct bpf_prog * prog,struct sock * sk)1179 static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk) 1180 { 1181 struct sk_filter *fp, *old_fp; 1182 1183 fp = kmalloc(sizeof(*fp), GFP_KERNEL); 1184 if (!fp) 1185 return -ENOMEM; 1186 1187 fp->prog = prog; 1188 atomic_set(&fp->refcnt, 0); 1189 1190 if (!sk_filter_charge(sk, fp)) { 1191 kfree(fp); 1192 return -ENOMEM; 1193 } 1194 1195 old_fp = rcu_dereference_protected(sk->sk_filter, 1196 lockdep_sock_is_held(sk)); 1197 rcu_assign_pointer(sk->sk_filter, fp); 1198 1199 if (old_fp) 1200 sk_filter_uncharge(sk, old_fp); 1201 1202 return 0; 1203 } 1204 __reuseport_attach_prog(struct bpf_prog * prog,struct sock * sk)1205 static int __reuseport_attach_prog(struct bpf_prog *prog, struct sock *sk) 1206 { 1207 struct bpf_prog *old_prog; 1208 int err; 1209 1210 if (bpf_prog_size(prog->len) > sysctl_optmem_max) 1211 return -ENOMEM; 1212 1213 if (sk_unhashed(sk) && sk->sk_reuseport) { 1214 err = reuseport_alloc(sk); 1215 if (err) 1216 return err; 1217 } else if (!rcu_access_pointer(sk->sk_reuseport_cb)) { 1218 /* The socket wasn't bound with SO_REUSEPORT */ 1219 return -EINVAL; 1220 } 1221 1222 old_prog = reuseport_attach_prog(sk, prog); 1223 if (old_prog) 1224 bpf_prog_destroy(old_prog); 1225 1226 return 0; 1227 } 1228 1229 static __get_filter(struct sock_fprog * fprog,struct sock * sk)1230 struct bpf_prog *__get_filter(struct sock_fprog *fprog, struct sock *sk) 1231 { 1232 unsigned int fsize = bpf_classic_proglen(fprog); 1233 struct bpf_prog *prog; 1234 int err; 1235 1236 if (sock_flag(sk, SOCK_FILTER_LOCKED)) 1237 return ERR_PTR(-EPERM); 1238 1239 /* Make sure new filter is there and in the right amounts. 
*/ 1240 if (!bpf_check_basics_ok(fprog->filter, fprog->len)) 1241 return ERR_PTR(-EINVAL); 1242 1243 prog = bpf_prog_alloc(bpf_prog_size(fprog->len), 0); 1244 if (!prog) 1245 return ERR_PTR(-ENOMEM); 1246 1247 if (copy_from_user(prog->insns, fprog->filter, fsize)) { 1248 __bpf_prog_free(prog); 1249 return ERR_PTR(-EFAULT); 1250 } 1251 1252 prog->len = fprog->len; 1253 1254 err = bpf_prog_store_orig_filter(prog, fprog); 1255 if (err) { 1256 __bpf_prog_free(prog); 1257 return ERR_PTR(-ENOMEM); 1258 } 1259 1260 /* bpf_prepare_filter() already takes care of freeing 1261 * memory in case something goes wrong. 1262 */ 1263 return bpf_prepare_filter(prog, NULL); 1264 } 1265 1266 /** 1267 * sk_attach_filter - attach a socket filter 1268 * @fprog: the filter program 1269 * @sk: the socket to use 1270 * 1271 * Attach the user's filter code. We first run some sanity checks on 1272 * it to make sure it does not explode on us later. If an error 1273 * occurs or there is insufficient memory for the filter a negative 1274 * errno code is returned. On success the return is zero. 1275 */ sk_attach_filter(struct sock_fprog * fprog,struct sock * sk)1276 int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) 1277 { 1278 struct bpf_prog *prog = __get_filter(fprog, sk); 1279 int err; 1280 1281 if (IS_ERR(prog)) 1282 return PTR_ERR(prog); 1283 1284 err = __sk_attach_prog(prog, sk); 1285 if (err < 0) { 1286 __bpf_prog_release(prog); 1287 return err; 1288 } 1289 1290 return 0; 1291 } 1292 EXPORT_SYMBOL_GPL(sk_attach_filter); 1293 sk_reuseport_attach_filter(struct sock_fprog * fprog,struct sock * sk)1294 int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk) 1295 { 1296 struct bpf_prog *prog = __get_filter(fprog, sk); 1297 int err; 1298 1299 if (IS_ERR(prog)) 1300 return PTR_ERR(prog); 1301 1302 err = __reuseport_attach_prog(prog, sk); 1303 if (err < 0) { 1304 __bpf_prog_release(prog); 1305 return err; 1306 } 1307 1308 return 0; 1309 } 1310 __get_bpf(u32 ufd,struct sock * sk)1311 static struct bpf_prog *__get_bpf(u32 ufd, struct sock *sk) 1312 { 1313 if (sock_flag(sk, SOCK_FILTER_LOCKED)) 1314 return ERR_PTR(-EPERM); 1315 1316 return bpf_prog_get_type(ufd, BPF_PROG_TYPE_SOCKET_FILTER); 1317 } 1318 sk_attach_bpf(u32 ufd,struct sock * sk)1319 int sk_attach_bpf(u32 ufd, struct sock *sk) 1320 { 1321 struct bpf_prog *prog = __get_bpf(ufd, sk); 1322 int err; 1323 1324 if (IS_ERR(prog)) 1325 return PTR_ERR(prog); 1326 1327 err = __sk_attach_prog(prog, sk); 1328 if (err < 0) { 1329 bpf_prog_put(prog); 1330 return err; 1331 } 1332 1333 return 0; 1334 } 1335 sk_reuseport_attach_bpf(u32 ufd,struct sock * sk)1336 int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk) 1337 { 1338 struct bpf_prog *prog = __get_bpf(ufd, sk); 1339 int err; 1340 1341 if (IS_ERR(prog)) 1342 return PTR_ERR(prog); 1343 1344 err = __reuseport_attach_prog(prog, sk); 1345 if (err < 0) { 1346 bpf_prog_put(prog); 1347 return err; 1348 } 1349 1350 return 0; 1351 } 1352 1353 struct bpf_scratchpad { 1354 union { 1355 __be32 diff[MAX_BPF_STACK / sizeof(__be32)]; 1356 u8 buff[MAX_BPF_STACK]; 1357 }; 1358 }; 1359 1360 static DEFINE_PER_CPU(struct bpf_scratchpad, bpf_sp); 1361 __bpf_try_make_writable(struct sk_buff * skb,unsigned int write_len)1362 static inline int __bpf_try_make_writable(struct sk_buff *skb, 1363 unsigned int write_len) 1364 { 1365 return skb_ensure_writable(skb, write_len); 1366 } 1367 bpf_try_make_writable(struct sk_buff * skb,unsigned int write_len)1368 static inline int bpf_try_make_writable(struct sk_buff *skb, 
1369 unsigned int write_len) 1370 { 1371 int err = __bpf_try_make_writable(skb, write_len); 1372 1373 bpf_compute_data_end(skb); 1374 return err; 1375 } 1376 bpf_try_make_head_writable(struct sk_buff * skb)1377 static int bpf_try_make_head_writable(struct sk_buff *skb) 1378 { 1379 return bpf_try_make_writable(skb, skb_headlen(skb)); 1380 } 1381 bpf_push_mac_rcsum(struct sk_buff * skb)1382 static inline void bpf_push_mac_rcsum(struct sk_buff *skb) 1383 { 1384 if (skb_at_tc_ingress(skb)) 1385 skb_postpush_rcsum(skb, skb_mac_header(skb), skb->mac_len); 1386 } 1387 bpf_pull_mac_rcsum(struct sk_buff * skb)1388 static inline void bpf_pull_mac_rcsum(struct sk_buff *skb) 1389 { 1390 if (skb_at_tc_ingress(skb)) 1391 skb_postpull_rcsum(skb, skb_mac_header(skb), skb->mac_len); 1392 } 1393 BPF_CALL_5(bpf_skb_store_bytes,struct sk_buff *,skb,u32,offset,const void *,from,u32,len,u64,flags)1394 BPF_CALL_5(bpf_skb_store_bytes, struct sk_buff *, skb, u32, offset, 1395 const void *, from, u32, len, u64, flags) 1396 { 1397 void *ptr; 1398 1399 if (unlikely(flags & ~(BPF_F_RECOMPUTE_CSUM | BPF_F_INVALIDATE_HASH))) 1400 return -EINVAL; 1401 if (unlikely(offset > 0xffff)) 1402 return -EFAULT; 1403 if (unlikely(bpf_try_make_writable(skb, offset + len))) 1404 return -EFAULT; 1405 1406 ptr = skb->data + offset; 1407 if (flags & BPF_F_RECOMPUTE_CSUM) 1408 __skb_postpull_rcsum(skb, ptr, len, offset); 1409 1410 memcpy(ptr, from, len); 1411 1412 if (flags & BPF_F_RECOMPUTE_CSUM) 1413 __skb_postpush_rcsum(skb, ptr, len, offset); 1414 if (flags & BPF_F_INVALIDATE_HASH) 1415 skb_clear_hash(skb); 1416 1417 return 0; 1418 } 1419 1420 static const struct bpf_func_proto bpf_skb_store_bytes_proto = { 1421 .func = bpf_skb_store_bytes, 1422 .gpl_only = false, 1423 .ret_type = RET_INTEGER, 1424 .arg1_type = ARG_PTR_TO_CTX, 1425 .arg2_type = ARG_ANYTHING, 1426 .arg3_type = ARG_PTR_TO_STACK, 1427 .arg4_type = ARG_CONST_STACK_SIZE, 1428 .arg5_type = ARG_ANYTHING, 1429 }; 1430 BPF_CALL_4(bpf_skb_load_bytes,const struct sk_buff *,skb,u32,offset,void *,to,u32,len)1431 BPF_CALL_4(bpf_skb_load_bytes, const struct sk_buff *, skb, u32, offset, 1432 void *, to, u32, len) 1433 { 1434 void *ptr; 1435 1436 if (unlikely(offset > 0xffff)) 1437 goto err_clear; 1438 1439 ptr = skb_header_pointer(skb, offset, len, to); 1440 if (unlikely(!ptr)) 1441 goto err_clear; 1442 if (ptr != to) 1443 memcpy(to, ptr, len); 1444 1445 return 0; 1446 err_clear: 1447 memset(to, 0, len); 1448 return -EFAULT; 1449 } 1450 1451 static const struct bpf_func_proto bpf_skb_load_bytes_proto = { 1452 .func = bpf_skb_load_bytes, 1453 .gpl_only = false, 1454 .ret_type = RET_INTEGER, 1455 .arg1_type = ARG_PTR_TO_CTX, 1456 .arg2_type = ARG_ANYTHING, 1457 .arg3_type = ARG_PTR_TO_RAW_STACK, 1458 .arg4_type = ARG_CONST_STACK_SIZE, 1459 }; 1460 BPF_CALL_2(bpf_skb_pull_data,struct sk_buff *,skb,u32,len)1461 BPF_CALL_2(bpf_skb_pull_data, struct sk_buff *, skb, u32, len) 1462 { 1463 /* Idea is the following: should the needed direct read/write 1464 * test fail during runtime, we can pull in more data and redo 1465 * again, since implicitly, we invalidate previous checks here. 1466 * 1467 * Or, since we know how much we need to make read/writeable, 1468 * this can be done once at the program beginning for direct 1469 * access case. By this we overcome limitations of only current 1470 * headroom being accessible. 1471 */ 1472 return bpf_try_make_writable(skb, len ? 
: skb_headlen(skb)); 1473 } 1474 1475 static const struct bpf_func_proto bpf_skb_pull_data_proto = { 1476 .func = bpf_skb_pull_data, 1477 .gpl_only = false, 1478 .ret_type = RET_INTEGER, 1479 .arg1_type = ARG_PTR_TO_CTX, 1480 .arg2_type = ARG_ANYTHING, 1481 }; 1482 BPF_CALL_5(bpf_l3_csum_replace,struct sk_buff *,skb,u32,offset,u64,from,u64,to,u64,flags)1483 BPF_CALL_5(bpf_l3_csum_replace, struct sk_buff *, skb, u32, offset, 1484 u64, from, u64, to, u64, flags) 1485 { 1486 __sum16 *ptr; 1487 1488 if (unlikely(flags & ~(BPF_F_HDR_FIELD_MASK))) 1489 return -EINVAL; 1490 if (unlikely(offset > 0xffff || offset & 1)) 1491 return -EFAULT; 1492 if (unlikely(bpf_try_make_writable(skb, offset + sizeof(*ptr)))) 1493 return -EFAULT; 1494 1495 ptr = (__sum16 *)(skb->data + offset); 1496 switch (flags & BPF_F_HDR_FIELD_MASK) { 1497 case 0: 1498 if (unlikely(from != 0)) 1499 return -EINVAL; 1500 1501 csum_replace_by_diff(ptr, to); 1502 break; 1503 case 2: 1504 csum_replace2(ptr, from, to); 1505 break; 1506 case 4: 1507 csum_replace4(ptr, from, to); 1508 break; 1509 default: 1510 return -EINVAL; 1511 } 1512 1513 return 0; 1514 } 1515 1516 static const struct bpf_func_proto bpf_l3_csum_replace_proto = { 1517 .func = bpf_l3_csum_replace, 1518 .gpl_only = false, 1519 .ret_type = RET_INTEGER, 1520 .arg1_type = ARG_PTR_TO_CTX, 1521 .arg2_type = ARG_ANYTHING, 1522 .arg3_type = ARG_ANYTHING, 1523 .arg4_type = ARG_ANYTHING, 1524 .arg5_type = ARG_ANYTHING, 1525 }; 1526 BPF_CALL_5(bpf_l4_csum_replace,struct sk_buff *,skb,u32,offset,u64,from,u64,to,u64,flags)1527 BPF_CALL_5(bpf_l4_csum_replace, struct sk_buff *, skb, u32, offset, 1528 u64, from, u64, to, u64, flags) 1529 { 1530 bool is_pseudo = flags & BPF_F_PSEUDO_HDR; 1531 bool is_mmzero = flags & BPF_F_MARK_MANGLED_0; 1532 __sum16 *ptr; 1533 1534 if (unlikely(flags & ~(BPF_F_MARK_MANGLED_0 | BPF_F_PSEUDO_HDR | 1535 BPF_F_HDR_FIELD_MASK))) 1536 return -EINVAL; 1537 if (unlikely(offset > 0xffff || offset & 1)) 1538 return -EFAULT; 1539 if (unlikely(bpf_try_make_writable(skb, offset + sizeof(*ptr)))) 1540 return -EFAULT; 1541 1542 ptr = (__sum16 *)(skb->data + offset); 1543 if (is_mmzero && !*ptr) 1544 return 0; 1545 1546 switch (flags & BPF_F_HDR_FIELD_MASK) { 1547 case 0: 1548 if (unlikely(from != 0)) 1549 return -EINVAL; 1550 1551 inet_proto_csum_replace_by_diff(ptr, skb, to, is_pseudo); 1552 break; 1553 case 2: 1554 inet_proto_csum_replace2(ptr, skb, from, to, is_pseudo); 1555 break; 1556 case 4: 1557 inet_proto_csum_replace4(ptr, skb, from, to, is_pseudo); 1558 break; 1559 default: 1560 return -EINVAL; 1561 } 1562 1563 if (is_mmzero && !*ptr) 1564 *ptr = CSUM_MANGLED_0; 1565 return 0; 1566 } 1567 1568 static const struct bpf_func_proto bpf_l4_csum_replace_proto = { 1569 .func = bpf_l4_csum_replace, 1570 .gpl_only = false, 1571 .ret_type = RET_INTEGER, 1572 .arg1_type = ARG_PTR_TO_CTX, 1573 .arg2_type = ARG_ANYTHING, 1574 .arg3_type = ARG_ANYTHING, 1575 .arg4_type = ARG_ANYTHING, 1576 .arg5_type = ARG_ANYTHING, 1577 }; 1578 BPF_CALL_5(bpf_csum_diff,__be32 *,from,u32,from_size,__be32 *,to,u32,to_size,__wsum,seed)1579 BPF_CALL_5(bpf_csum_diff, __be32 *, from, u32, from_size, 1580 __be32 *, to, u32, to_size, __wsum, seed) 1581 { 1582 struct bpf_scratchpad *sp = this_cpu_ptr(&bpf_sp); 1583 u32 diff_size = from_size + to_size; 1584 int i, j = 0; 1585 1586 /* This is quite flexible, some examples: 1587 * 1588 * from_size == 0, to_size > 0, seed := csum --> pushing data 1589 * from_size > 0, to_size == 0, seed := csum --> pulling data 1590 * from_size > 0, to_size > 0, 
seed := 0 --> diffing data 1591 * 1592 * Even for diffing, from_size and to_size don't need to be equal. 1593 */ 1594 if (unlikely(((from_size | to_size) & (sizeof(__be32) - 1)) || 1595 diff_size > sizeof(sp->diff))) 1596 return -EINVAL; 1597 1598 for (i = 0; i < from_size / sizeof(__be32); i++, j++) 1599 sp->diff[j] = ~from[i]; 1600 for (i = 0; i < to_size / sizeof(__be32); i++, j++) 1601 sp->diff[j] = to[i]; 1602 1603 return csum_partial(sp->diff, diff_size, seed); 1604 } 1605 1606 static const struct bpf_func_proto bpf_csum_diff_proto = { 1607 .func = bpf_csum_diff, 1608 .gpl_only = false, 1609 .pkt_access = true, 1610 .ret_type = RET_INTEGER, 1611 .arg1_type = ARG_PTR_TO_STACK, 1612 .arg2_type = ARG_CONST_STACK_SIZE_OR_ZERO, 1613 .arg3_type = ARG_PTR_TO_STACK, 1614 .arg4_type = ARG_CONST_STACK_SIZE_OR_ZERO, 1615 .arg5_type = ARG_ANYTHING, 1616 }; 1617 BPF_CALL_2(bpf_csum_update,struct sk_buff *,skb,__wsum,csum)1618 BPF_CALL_2(bpf_csum_update, struct sk_buff *, skb, __wsum, csum) 1619 { 1620 /* The interface is to be used in combination with bpf_csum_diff() 1621 * for direct packet writes. csum rotation for alignment as well 1622 * as emulating csum_sub() can be done from the eBPF program. 1623 */ 1624 if (skb->ip_summed == CHECKSUM_COMPLETE) 1625 return (skb->csum = csum_add(skb->csum, csum)); 1626 1627 return -ENOTSUPP; 1628 } 1629 1630 static const struct bpf_func_proto bpf_csum_update_proto = { 1631 .func = bpf_csum_update, 1632 .gpl_only = false, 1633 .ret_type = RET_INTEGER, 1634 .arg1_type = ARG_PTR_TO_CTX, 1635 .arg2_type = ARG_ANYTHING, 1636 }; 1637 __bpf_rx_skb(struct net_device * dev,struct sk_buff * skb)1638 static inline int __bpf_rx_skb(struct net_device *dev, struct sk_buff *skb) 1639 { 1640 return dev_forward_skb(dev, skb); 1641 } 1642 __bpf_rx_skb_no_mac(struct net_device * dev,struct sk_buff * skb)1643 static inline int __bpf_rx_skb_no_mac(struct net_device *dev, 1644 struct sk_buff *skb) 1645 { 1646 int ret = ____dev_forward_skb(dev, skb); 1647 1648 if (likely(!ret)) { 1649 skb->dev = dev; 1650 ret = netif_rx(skb); 1651 } 1652 1653 return ret; 1654 } 1655 __bpf_tx_skb(struct net_device * dev,struct sk_buff * skb)1656 static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb) 1657 { 1658 int ret; 1659 1660 if (unlikely(__this_cpu_read(xmit_recursion) > XMIT_RECURSION_LIMIT)) { 1661 net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n"); 1662 kfree_skb(skb); 1663 return -ENETDOWN; 1664 } 1665 1666 skb->dev = dev; 1667 1668 __this_cpu_inc(xmit_recursion); 1669 ret = dev_queue_xmit(skb); 1670 __this_cpu_dec(xmit_recursion); 1671 1672 return ret; 1673 } 1674 __bpf_redirect_no_mac(struct sk_buff * skb,struct net_device * dev,u32 flags)1675 static int __bpf_redirect_no_mac(struct sk_buff *skb, struct net_device *dev, 1676 u32 flags) 1677 { 1678 /* skb->mac_len is not set on normal egress */ 1679 unsigned int mlen = skb->network_header - skb->mac_header; 1680 1681 __skb_pull(skb, mlen); 1682 1683 /* At ingress, the mac header has already been pulled once. 1684 * At egress, skb_pospull_rcsum has to be done in case that 1685 * the skb is originated from ingress (i.e. a forwarded skb) 1686 * to ensure that rcsum starts at net header. 1687 */ 1688 if (!skb_at_tc_ingress(skb)) 1689 skb_postpull_rcsum(skb, skb_mac_header(skb), mlen); 1690 skb_pop_mac_header(skb); 1691 skb_reset_mac_len(skb); 1692 return flags & BPF_F_INGRESS ? 
1693 __bpf_rx_skb_no_mac(dev, skb) : __bpf_tx_skb(dev, skb); 1694 } 1695 __bpf_redirect_common(struct sk_buff * skb,struct net_device * dev,u32 flags)1696 static int __bpf_redirect_common(struct sk_buff *skb, struct net_device *dev, 1697 u32 flags) 1698 { 1699 bpf_push_mac_rcsum(skb); 1700 return flags & BPF_F_INGRESS ? 1701 __bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb); 1702 } 1703 __bpf_redirect(struct sk_buff * skb,struct net_device * dev,u32 flags)1704 static int __bpf_redirect(struct sk_buff *skb, struct net_device *dev, 1705 u32 flags) 1706 { 1707 switch (dev->type) { 1708 case ARPHRD_TUNNEL: 1709 case ARPHRD_TUNNEL6: 1710 case ARPHRD_SIT: 1711 case ARPHRD_IPGRE: 1712 case ARPHRD_VOID: 1713 case ARPHRD_NONE: 1714 return __bpf_redirect_no_mac(skb, dev, flags); 1715 default: 1716 return __bpf_redirect_common(skb, dev, flags); 1717 } 1718 } 1719 BPF_CALL_3(bpf_clone_redirect,struct sk_buff *,skb,u32,ifindex,u64,flags)1720 BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags) 1721 { 1722 struct net_device *dev; 1723 struct sk_buff *clone; 1724 int ret; 1725 1726 if (unlikely(flags & ~(BPF_F_INGRESS))) 1727 return -EINVAL; 1728 1729 dev = dev_get_by_index_rcu(dev_net(skb->dev), ifindex); 1730 if (unlikely(!dev)) 1731 return -EINVAL; 1732 1733 clone = skb_clone(skb, GFP_ATOMIC); 1734 if (unlikely(!clone)) 1735 return -ENOMEM; 1736 1737 /* For direct write, we need to keep the invariant that the skbs 1738 * we're dealing with need to be uncloned. Should uncloning fail 1739 * here, we need to free the just generated clone to unclone once 1740 * again. 1741 */ 1742 ret = bpf_try_make_head_writable(skb); 1743 if (unlikely(ret)) { 1744 kfree_skb(clone); 1745 return -ENOMEM; 1746 } 1747 1748 return __bpf_redirect(clone, dev, flags); 1749 } 1750 1751 static const struct bpf_func_proto bpf_clone_redirect_proto = { 1752 .func = bpf_clone_redirect, 1753 .gpl_only = false, 1754 .ret_type = RET_INTEGER, 1755 .arg1_type = ARG_PTR_TO_CTX, 1756 .arg2_type = ARG_ANYTHING, 1757 .arg3_type = ARG_ANYTHING, 1758 }; 1759 1760 struct redirect_info { 1761 u32 ifindex; 1762 u32 flags; 1763 }; 1764 1765 static DEFINE_PER_CPU(struct redirect_info, redirect_info); 1766 BPF_CALL_2(bpf_redirect,u32,ifindex,u64,flags)1767 BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags) 1768 { 1769 struct redirect_info *ri = this_cpu_ptr(&redirect_info); 1770 1771 if (unlikely(flags & ~(BPF_F_INGRESS))) 1772 return TC_ACT_SHOT; 1773 1774 ri->ifindex = ifindex; 1775 ri->flags = flags; 1776 1777 return TC_ACT_REDIRECT; 1778 } 1779 skb_do_redirect(struct sk_buff * skb)1780 int skb_do_redirect(struct sk_buff *skb) 1781 { 1782 struct redirect_info *ri = this_cpu_ptr(&redirect_info); 1783 struct net_device *dev; 1784 1785 dev = dev_get_by_index_rcu(dev_net(skb->dev), ri->ifindex); 1786 ri->ifindex = 0; 1787 if (unlikely(!dev)) { 1788 kfree_skb(skb); 1789 return -EINVAL; 1790 } 1791 1792 return __bpf_redirect(skb, dev, ri->flags); 1793 } 1794 1795 static const struct bpf_func_proto bpf_redirect_proto = { 1796 .func = bpf_redirect, 1797 .gpl_only = false, 1798 .ret_type = RET_INTEGER, 1799 .arg1_type = ARG_ANYTHING, 1800 .arg2_type = ARG_ANYTHING, 1801 }; 1802 BPF_CALL_1(bpf_get_cgroup_classid,const struct sk_buff *,skb)1803 BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb) 1804 { 1805 return task_get_classid(skb); 1806 } 1807 1808 static const struct bpf_func_proto bpf_get_cgroup_classid_proto = { 1809 .func = bpf_get_cgroup_classid, 1810 .gpl_only = false, 1811 .ret_type = RET_INTEGER, 1812 
.arg1_type = ARG_PTR_TO_CTX, 1813 }; 1814 BPF_CALL_1(bpf_get_route_realm,const struct sk_buff *,skb)1815 BPF_CALL_1(bpf_get_route_realm, const struct sk_buff *, skb) 1816 { 1817 return dst_tclassid(skb); 1818 } 1819 1820 static const struct bpf_func_proto bpf_get_route_realm_proto = { 1821 .func = bpf_get_route_realm, 1822 .gpl_only = false, 1823 .ret_type = RET_INTEGER, 1824 .arg1_type = ARG_PTR_TO_CTX, 1825 }; 1826 BPF_CALL_1(bpf_get_hash_recalc,struct sk_buff *,skb)1827 BPF_CALL_1(bpf_get_hash_recalc, struct sk_buff *, skb) 1828 { 1829 /* If skb_clear_hash() was called due to mangling, we can 1830 * trigger SW recalculation here. Later access to hash 1831 * can then use the inline skb->hash via context directly 1832 * instead of calling this helper again. 1833 */ 1834 return skb_get_hash(skb); 1835 } 1836 1837 static const struct bpf_func_proto bpf_get_hash_recalc_proto = { 1838 .func = bpf_get_hash_recalc, 1839 .gpl_only = false, 1840 .ret_type = RET_INTEGER, 1841 .arg1_type = ARG_PTR_TO_CTX, 1842 }; 1843 BPF_CALL_1(bpf_set_hash_invalid,struct sk_buff *,skb)1844 BPF_CALL_1(bpf_set_hash_invalid, struct sk_buff *, skb) 1845 { 1846 /* After all direct packet write, this can be used once for 1847 * triggering a lazy recalc on next skb_get_hash() invocation. 1848 */ 1849 skb_clear_hash(skb); 1850 return 0; 1851 } 1852 1853 static const struct bpf_func_proto bpf_set_hash_invalid_proto = { 1854 .func = bpf_set_hash_invalid, 1855 .gpl_only = false, 1856 .ret_type = RET_INTEGER, 1857 .arg1_type = ARG_PTR_TO_CTX, 1858 }; 1859 BPF_CALL_3(bpf_skb_vlan_push,struct sk_buff *,skb,__be16,vlan_proto,u16,vlan_tci)1860 BPF_CALL_3(bpf_skb_vlan_push, struct sk_buff *, skb, __be16, vlan_proto, 1861 u16, vlan_tci) 1862 { 1863 int ret; 1864 1865 if (unlikely(vlan_proto != htons(ETH_P_8021Q) && 1866 vlan_proto != htons(ETH_P_8021AD))) 1867 vlan_proto = htons(ETH_P_8021Q); 1868 1869 bpf_push_mac_rcsum(skb); 1870 ret = skb_vlan_push(skb, vlan_proto, vlan_tci); 1871 bpf_pull_mac_rcsum(skb); 1872 1873 bpf_compute_data_end(skb); 1874 return ret; 1875 } 1876 1877 const struct bpf_func_proto bpf_skb_vlan_push_proto = { 1878 .func = bpf_skb_vlan_push, 1879 .gpl_only = false, 1880 .ret_type = RET_INTEGER, 1881 .arg1_type = ARG_PTR_TO_CTX, 1882 .arg2_type = ARG_ANYTHING, 1883 .arg3_type = ARG_ANYTHING, 1884 }; 1885 EXPORT_SYMBOL_GPL(bpf_skb_vlan_push_proto); 1886 BPF_CALL_1(bpf_skb_vlan_pop,struct sk_buff *,skb)1887 BPF_CALL_1(bpf_skb_vlan_pop, struct sk_buff *, skb) 1888 { 1889 int ret; 1890 1891 bpf_push_mac_rcsum(skb); 1892 ret = skb_vlan_pop(skb); 1893 bpf_pull_mac_rcsum(skb); 1894 1895 bpf_compute_data_end(skb); 1896 return ret; 1897 } 1898 1899 const struct bpf_func_proto bpf_skb_vlan_pop_proto = { 1900 .func = bpf_skb_vlan_pop, 1901 .gpl_only = false, 1902 .ret_type = RET_INTEGER, 1903 .arg1_type = ARG_PTR_TO_CTX, 1904 }; 1905 EXPORT_SYMBOL_GPL(bpf_skb_vlan_pop_proto); 1906 bpf_skb_generic_push(struct sk_buff * skb,u32 off,u32 len)1907 static int bpf_skb_generic_push(struct sk_buff *skb, u32 off, u32 len) 1908 { 1909 /* Caller already did skb_cow() with len as headroom, 1910 * so no need to do it here. 1911 */ 1912 skb_push(skb, len); 1913 memmove(skb->data, skb->data + len, off); 1914 memset(skb->data + off, 0, len); 1915 1916 /* No skb_postpush_rcsum(skb, skb->data + off, len) 1917 * needed here as it does not change the skb->csum 1918 * result for checksum complete when summing over 1919 * zeroed blocks. 
1920 */ 1921 return 0; 1922 } 1923 bpf_skb_generic_pop(struct sk_buff * skb,u32 off,u32 len)1924 static int bpf_skb_generic_pop(struct sk_buff *skb, u32 off, u32 len) 1925 { 1926 /* skb_ensure_writable() is not needed here, as we're 1927 * already working on an uncloned skb. 1928 */ 1929 if (unlikely(!pskb_may_pull(skb, off + len))) 1930 return -ENOMEM; 1931 1932 skb_postpull_rcsum(skb, skb->data + off, len); 1933 memmove(skb->data + len, skb->data, off); 1934 __skb_pull(skb, len); 1935 1936 return 0; 1937 } 1938 bpf_skb_net_hdr_push(struct sk_buff * skb,u32 off,u32 len)1939 static int bpf_skb_net_hdr_push(struct sk_buff *skb, u32 off, u32 len) 1940 { 1941 bool trans_same = skb->transport_header == skb->network_header; 1942 int ret; 1943 1944 /* There's no need for __skb_push()/__skb_pull() pair to 1945 * get to the start of the mac header as we're guaranteed 1946 * to always start from here under eBPF. 1947 */ 1948 ret = bpf_skb_generic_push(skb, off, len); 1949 if (likely(!ret)) { 1950 skb->mac_header -= len; 1951 skb->network_header -= len; 1952 if (trans_same) 1953 skb->transport_header = skb->network_header; 1954 } 1955 1956 return ret; 1957 } 1958 bpf_skb_net_hdr_pop(struct sk_buff * skb,u32 off,u32 len)1959 static int bpf_skb_net_hdr_pop(struct sk_buff *skb, u32 off, u32 len) 1960 { 1961 bool trans_same = skb->transport_header == skb->network_header; 1962 int ret; 1963 1964 /* Same here, __skb_push()/__skb_pull() pair not needed. */ 1965 ret = bpf_skb_generic_pop(skb, off, len); 1966 if (likely(!ret)) { 1967 skb->mac_header += len; 1968 skb->network_header += len; 1969 if (trans_same) 1970 skb->transport_header = skb->network_header; 1971 } 1972 1973 return ret; 1974 } 1975 bpf_skb_proto_4_to_6(struct sk_buff * skb)1976 static int bpf_skb_proto_4_to_6(struct sk_buff *skb) 1977 { 1978 const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr); 1979 u32 off = skb->network_header - skb->mac_header; 1980 int ret; 1981 1982 ret = skb_cow(skb, len_diff); 1983 if (unlikely(ret < 0)) 1984 return ret; 1985 1986 ret = bpf_skb_net_hdr_push(skb, off, len_diff); 1987 if (unlikely(ret < 0)) 1988 return ret; 1989 1990 if (skb_is_gso(skb)) { 1991 /* SKB_GSO_UDP stays as is. SKB_GSO_TCPV4 needs to 1992 * be changed into SKB_GSO_TCPV6. 1993 */ 1994 if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) { 1995 skb_shinfo(skb)->gso_type &= ~SKB_GSO_TCPV4; 1996 skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV6; 1997 } 1998 1999 /* Due to IPv6 header, MSS needs to be downgraded. */ 2000 skb_shinfo(skb)->gso_size -= len_diff; 2001 /* Header must be checked, and gso_segs recomputed. */ 2002 skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY; 2003 skb_shinfo(skb)->gso_segs = 0; 2004 } 2005 2006 skb->protocol = htons(ETH_P_IPV6); 2007 skb_clear_hash(skb); 2008 2009 return 0; 2010 } 2011 bpf_skb_proto_6_to_4(struct sk_buff * skb)2012 static int bpf_skb_proto_6_to_4(struct sk_buff *skb) 2013 { 2014 const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr); 2015 u32 off = skb->network_header - skb->mac_header; 2016 int ret; 2017 2018 ret = skb_unclone(skb, GFP_ATOMIC); 2019 if (unlikely(ret < 0)) 2020 return ret; 2021 2022 ret = bpf_skb_net_hdr_pop(skb, off, len_diff); 2023 if (unlikely(ret < 0)) 2024 return ret; 2025 2026 if (skb_is_gso(skb)) { 2027 /* SKB_GSO_UDP stays as is. SKB_GSO_TCPV6 needs to 2028 * be changed into SKB_GSO_TCPV4. 
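		 * (The len_diff handled here is the 20 byte gap between the
		 * IPv6 and IPv4 headers; it is handed back to the payload via
		 * gso_size below, so the re-segmented packets still fit the
		 * same MTU.)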
2029 */ 2030 if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6) { 2031 skb_shinfo(skb)->gso_type &= ~SKB_GSO_TCPV6; 2032 skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4; 2033 } 2034 2035 /* Due to IPv4 header, MSS can be upgraded. */ 2036 skb_shinfo(skb)->gso_size += len_diff; 2037 /* Header must be checked, and gso_segs recomputed. */ 2038 skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY; 2039 skb_shinfo(skb)->gso_segs = 0; 2040 } 2041 2042 skb->protocol = htons(ETH_P_IP); 2043 skb_clear_hash(skb); 2044 2045 return 0; 2046 } 2047 bpf_skb_proto_xlat(struct sk_buff * skb,__be16 to_proto)2048 static int bpf_skb_proto_xlat(struct sk_buff *skb, __be16 to_proto) 2049 { 2050 __be16 from_proto = skb->protocol; 2051 2052 if (from_proto == htons(ETH_P_IP) && 2053 to_proto == htons(ETH_P_IPV6)) 2054 return bpf_skb_proto_4_to_6(skb); 2055 2056 if (from_proto == htons(ETH_P_IPV6) && 2057 to_proto == htons(ETH_P_IP)) 2058 return bpf_skb_proto_6_to_4(skb); 2059 2060 return -ENOTSUPP; 2061 } 2062 BPF_CALL_3(bpf_skb_change_proto,struct sk_buff *,skb,__be16,proto,u64,flags)2063 BPF_CALL_3(bpf_skb_change_proto, struct sk_buff *, skb, __be16, proto, 2064 u64, flags) 2065 { 2066 int ret; 2067 2068 if (unlikely(flags)) 2069 return -EINVAL; 2070 2071 /* General idea is that this helper does the basic groundwork 2072 * needed for changing the protocol, and eBPF program fills the 2073 * rest through bpf_skb_store_bytes(), bpf_lX_csum_replace() 2074 * and other helpers, rather than passing a raw buffer here. 2075 * 2076 * The rationale is to keep this minimal and without a need to 2077 * deal with raw packet data. F.e. even if we would pass buffers 2078 * here, the program still needs to call the bpf_lX_csum_replace() 2079 * helpers anyway. Plus, this way we keep also separation of 2080 * concerns, since f.e. bpf_skb_store_bytes() should only take 2081 * care of stores. 2082 * 2083 * Currently, additional options and extension header space are 2084 * not supported, but flags register is reserved so we can adapt 2085 * that. For offloads, we mark packet as dodgy, so that headers 2086 * need to be verified first. 2087 */ 2088 ret = bpf_skb_proto_xlat(skb, proto); 2089 bpf_compute_data_end(skb); 2090 return ret; 2091 } 2092 2093 static const struct bpf_func_proto bpf_skb_change_proto_proto = { 2094 .func = bpf_skb_change_proto, 2095 .gpl_only = false, 2096 .ret_type = RET_INTEGER, 2097 .arg1_type = ARG_PTR_TO_CTX, 2098 .arg2_type = ARG_ANYTHING, 2099 .arg3_type = ARG_ANYTHING, 2100 }; 2101 BPF_CALL_2(bpf_skb_change_type,struct sk_buff *,skb,u32,pkt_type)2102 BPF_CALL_2(bpf_skb_change_type, struct sk_buff *, skb, u32, pkt_type) 2103 { 2104 /* We only allow a restricted subset to be changed for now. 
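	 * skb_pkt_type_ok() restricts both the old and the new value to the
	 * PACKET_HOST..PACKET_OTHERHOST range. A typical use from a tc
	 * program is re-steering a PACKET_OTHERHOST frame for local
	 * delivery, e.g. bpf_skb_change_type(skb, PACKET_HOST).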
*/ 2105 if (unlikely(!skb_pkt_type_ok(skb->pkt_type) || 2106 !skb_pkt_type_ok(pkt_type))) 2107 return -EINVAL; 2108 2109 skb->pkt_type = pkt_type; 2110 return 0; 2111 } 2112 2113 static const struct bpf_func_proto bpf_skb_change_type_proto = { 2114 .func = bpf_skb_change_type, 2115 .gpl_only = false, 2116 .ret_type = RET_INTEGER, 2117 .arg1_type = ARG_PTR_TO_CTX, 2118 .arg2_type = ARG_ANYTHING, 2119 }; 2120 __bpf_skb_min_len(const struct sk_buff * skb)2121 static u32 __bpf_skb_min_len(const struct sk_buff *skb) 2122 { 2123 u32 min_len = skb_network_offset(skb); 2124 2125 if (skb_transport_header_was_set(skb)) 2126 min_len = skb_transport_offset(skb); 2127 if (skb->ip_summed == CHECKSUM_PARTIAL) 2128 min_len = skb_checksum_start_offset(skb) + 2129 skb->csum_offset + sizeof(__sum16); 2130 return min_len; 2131 } 2132 __bpf_skb_max_len(const struct sk_buff * skb)2133 static u32 __bpf_skb_max_len(const struct sk_buff *skb) 2134 { 2135 return skb->dev->mtu + skb->dev->hard_header_len; 2136 } 2137 bpf_skb_grow_rcsum(struct sk_buff * skb,unsigned int new_len)2138 static int bpf_skb_grow_rcsum(struct sk_buff *skb, unsigned int new_len) 2139 { 2140 unsigned int old_len = skb->len; 2141 int ret; 2142 2143 ret = __skb_grow_rcsum(skb, new_len); 2144 if (!ret) 2145 memset(skb->data + old_len, 0, new_len - old_len); 2146 return ret; 2147 } 2148 bpf_skb_trim_rcsum(struct sk_buff * skb,unsigned int new_len)2149 static int bpf_skb_trim_rcsum(struct sk_buff *skb, unsigned int new_len) 2150 { 2151 return __skb_trim_rcsum(skb, new_len); 2152 } 2153 BPF_CALL_3(bpf_skb_change_tail,struct sk_buff *,skb,u32,new_len,u64,flags)2154 BPF_CALL_3(bpf_skb_change_tail, struct sk_buff *, skb, u32, new_len, 2155 u64, flags) 2156 { 2157 u32 max_len = __bpf_skb_max_len(skb); 2158 u32 min_len = __bpf_skb_min_len(skb); 2159 int ret; 2160 2161 if (unlikely(flags || new_len > max_len || new_len < min_len)) 2162 return -EINVAL; 2163 if (skb->encapsulation) 2164 return -ENOTSUPP; 2165 2166 /* The basic idea of this helper is that it's performing the 2167 * needed work to either grow or trim an skb, and eBPF program 2168 * rewrites the rest via helpers like bpf_skb_store_bytes(), 2169 * bpf_lX_csum_replace() and others rather than passing a raw 2170 * buffer here. This one is a slow path helper and intended 2171 * for replies with control messages. 2172 * 2173 * Like in bpf_skb_change_proto(), we want to keep this rather 2174 * minimal and without protocol specifics so that we are able 2175 * to separate concerns as in bpf_skb_store_bytes() should only 2176 * be the one responsible for writing buffers. 2177 * 2178 * It's really expected to be a slow path operation here for 2179 * control message replies, so we're implicitly linearizing, 2180 * uncloning and drop offloads from the skb by this. 
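	 * (Illustration, not an in-tree user: a program may trim a received
	 * request down to its headers and then grow it again to the reply
	 * length, filling the zeroed tail via bpf_skb_store_bytes() before
	 * redirecting the reply back out.)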
2181 */ 2182 ret = __bpf_try_make_writable(skb, skb->len); 2183 if (!ret) { 2184 if (new_len > skb->len) 2185 ret = bpf_skb_grow_rcsum(skb, new_len); 2186 else if (new_len < skb->len) 2187 ret = bpf_skb_trim_rcsum(skb, new_len); 2188 if (!ret && skb_is_gso(skb)) 2189 skb_gso_reset(skb); 2190 } 2191 2192 bpf_compute_data_end(skb); 2193 return ret; 2194 } 2195 2196 static const struct bpf_func_proto bpf_skb_change_tail_proto = { 2197 .func = bpf_skb_change_tail, 2198 .gpl_only = false, 2199 .ret_type = RET_INTEGER, 2200 .arg1_type = ARG_PTR_TO_CTX, 2201 .arg2_type = ARG_ANYTHING, 2202 .arg3_type = ARG_ANYTHING, 2203 }; 2204 bpf_helper_changes_skb_data(void * func)2205 bool bpf_helper_changes_skb_data(void *func) 2206 { 2207 if (func == bpf_skb_vlan_push || 2208 func == bpf_skb_vlan_pop || 2209 func == bpf_skb_store_bytes || 2210 func == bpf_skb_change_proto || 2211 func == bpf_skb_change_tail || 2212 func == bpf_skb_pull_data || 2213 func == bpf_clone_redirect || 2214 func == bpf_l3_csum_replace || 2215 func == bpf_l4_csum_replace) 2216 return true; 2217 2218 return false; 2219 } 2220 bpf_skb_copy(void * dst_buff,const void * skb,unsigned long off,unsigned long len)2221 static unsigned long bpf_skb_copy(void *dst_buff, const void *skb, 2222 unsigned long off, unsigned long len) 2223 { 2224 void *ptr = skb_header_pointer(skb, off, len, dst_buff); 2225 2226 if (unlikely(!ptr)) 2227 return len; 2228 if (ptr != dst_buff) 2229 memcpy(dst_buff, ptr, len); 2230 2231 return 0; 2232 } 2233 BPF_CALL_5(bpf_skb_event_output,struct sk_buff *,skb,struct bpf_map *,map,u64,flags,void *,meta,u64,meta_size)2234 BPF_CALL_5(bpf_skb_event_output, struct sk_buff *, skb, struct bpf_map *, map, 2235 u64, flags, void *, meta, u64, meta_size) 2236 { 2237 u64 skb_size = (flags & BPF_F_CTXLEN_MASK) >> 32; 2238 2239 if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK))) 2240 return -EINVAL; 2241 if (unlikely(skb_size > skb->len)) 2242 return -EFAULT; 2243 2244 return bpf_event_output(map, flags, meta, meta_size, skb, skb_size, 2245 bpf_skb_copy); 2246 } 2247 2248 static const struct bpf_func_proto bpf_skb_event_output_proto = { 2249 .func = bpf_skb_event_output, 2250 .gpl_only = true, 2251 .ret_type = RET_INTEGER, 2252 .arg1_type = ARG_PTR_TO_CTX, 2253 .arg2_type = ARG_CONST_MAP_PTR, 2254 .arg3_type = ARG_ANYTHING, 2255 .arg4_type = ARG_PTR_TO_STACK, 2256 .arg5_type = ARG_CONST_STACK_SIZE, 2257 }; 2258 bpf_tunnel_key_af(u64 flags)2259 static unsigned short bpf_tunnel_key_af(u64 flags) 2260 { 2261 return flags & BPF_F_TUNINFO_IPV6 ? 
AF_INET6 : AF_INET; 2262 } 2263 BPF_CALL_4(bpf_skb_get_tunnel_key,struct sk_buff *,skb,struct bpf_tunnel_key *,to,u32,size,u64,flags)2264 BPF_CALL_4(bpf_skb_get_tunnel_key, struct sk_buff *, skb, struct bpf_tunnel_key *, to, 2265 u32, size, u64, flags) 2266 { 2267 const struct ip_tunnel_info *info = skb_tunnel_info(skb); 2268 u8 compat[sizeof(struct bpf_tunnel_key)]; 2269 void *to_orig = to; 2270 int err; 2271 2272 if (unlikely(!info || (flags & ~(BPF_F_TUNINFO_IPV6)))) { 2273 err = -EINVAL; 2274 goto err_clear; 2275 } 2276 if (ip_tunnel_info_af(info) != bpf_tunnel_key_af(flags)) { 2277 err = -EPROTO; 2278 goto err_clear; 2279 } 2280 if (unlikely(size != sizeof(struct bpf_tunnel_key))) { 2281 err = -EINVAL; 2282 switch (size) { 2283 case offsetof(struct bpf_tunnel_key, tunnel_label): 2284 case offsetof(struct bpf_tunnel_key, tunnel_ext): 2285 goto set_compat; 2286 case offsetof(struct bpf_tunnel_key, remote_ipv6[1]): 2287 /* Fixup deprecated structure layouts here, so we have 2288 * a common path later on. 2289 */ 2290 if (ip_tunnel_info_af(info) != AF_INET) 2291 goto err_clear; 2292 set_compat: 2293 to = (struct bpf_tunnel_key *)compat; 2294 break; 2295 default: 2296 goto err_clear; 2297 } 2298 } 2299 2300 to->tunnel_id = be64_to_cpu(info->key.tun_id); 2301 to->tunnel_tos = info->key.tos; 2302 to->tunnel_ttl = info->key.ttl; 2303 2304 if (flags & BPF_F_TUNINFO_IPV6) { 2305 memcpy(to->remote_ipv6, &info->key.u.ipv6.src, 2306 sizeof(to->remote_ipv6)); 2307 to->tunnel_label = be32_to_cpu(info->key.label); 2308 } else { 2309 to->remote_ipv4 = be32_to_cpu(info->key.u.ipv4.src); 2310 } 2311 2312 if (unlikely(size != sizeof(struct bpf_tunnel_key))) 2313 memcpy(to_orig, to, size); 2314 2315 return 0; 2316 err_clear: 2317 memset(to_orig, 0, size); 2318 return err; 2319 } 2320 2321 static const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = { 2322 .func = bpf_skb_get_tunnel_key, 2323 .gpl_only = false, 2324 .ret_type = RET_INTEGER, 2325 .arg1_type = ARG_PTR_TO_CTX, 2326 .arg2_type = ARG_PTR_TO_RAW_STACK, 2327 .arg3_type = ARG_CONST_STACK_SIZE, 2328 .arg4_type = ARG_ANYTHING, 2329 }; 2330 BPF_CALL_3(bpf_skb_get_tunnel_opt,struct sk_buff *,skb,u8 *,to,u32,size)2331 BPF_CALL_3(bpf_skb_get_tunnel_opt, struct sk_buff *, skb, u8 *, to, u32, size) 2332 { 2333 const struct ip_tunnel_info *info = skb_tunnel_info(skb); 2334 int err; 2335 2336 if (unlikely(!info || 2337 !(info->key.tun_flags & TUNNEL_OPTIONS_PRESENT))) { 2338 err = -ENOENT; 2339 goto err_clear; 2340 } 2341 if (unlikely(size < info->options_len)) { 2342 err = -ENOMEM; 2343 goto err_clear; 2344 } 2345 2346 ip_tunnel_info_opts_get(to, info); 2347 if (size > info->options_len) 2348 memset(to + info->options_len, 0, size - info->options_len); 2349 2350 return info->options_len; 2351 err_clear: 2352 memset(to, 0, size); 2353 return err; 2354 } 2355 2356 static const struct bpf_func_proto bpf_skb_get_tunnel_opt_proto = { 2357 .func = bpf_skb_get_tunnel_opt, 2358 .gpl_only = false, 2359 .ret_type = RET_INTEGER, 2360 .arg1_type = ARG_PTR_TO_CTX, 2361 .arg2_type = ARG_PTR_TO_RAW_STACK, 2362 .arg3_type = ARG_CONST_STACK_SIZE, 2363 }; 2364 2365 static struct metadata_dst __percpu *md_dst; 2366 BPF_CALL_4(bpf_skb_set_tunnel_key,struct sk_buff *,skb,const struct bpf_tunnel_key *,from,u32,size,u64,flags)2367 BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb, 2368 const struct bpf_tunnel_key *, from, u32, size, u64, flags) 2369 { 2370 struct metadata_dst *md = this_cpu_ptr(md_dst); 2371 u8 compat[sizeof(struct bpf_tunnel_key)]; 2372 struct 
ip_tunnel_info *info; 2373 2374 if (unlikely(flags & ~(BPF_F_TUNINFO_IPV6 | BPF_F_ZERO_CSUM_TX | 2375 BPF_F_DONT_FRAGMENT))) 2376 return -EINVAL; 2377 if (unlikely(size != sizeof(struct bpf_tunnel_key))) { 2378 switch (size) { 2379 case offsetof(struct bpf_tunnel_key, tunnel_label): 2380 case offsetof(struct bpf_tunnel_key, tunnel_ext): 2381 case offsetof(struct bpf_tunnel_key, remote_ipv6[1]): 2382 /* Fixup deprecated structure layouts here, so we have 2383 * a common path later on. 2384 */ 2385 memcpy(compat, from, size); 2386 memset(compat + size, 0, sizeof(compat) - size); 2387 from = (const struct bpf_tunnel_key *) compat; 2388 break; 2389 default: 2390 return -EINVAL; 2391 } 2392 } 2393 if (unlikely((!(flags & BPF_F_TUNINFO_IPV6) && from->tunnel_label) || 2394 from->tunnel_ext)) 2395 return -EINVAL; 2396 2397 skb_dst_drop(skb); 2398 dst_hold((struct dst_entry *) md); 2399 skb_dst_set(skb, (struct dst_entry *) md); 2400 2401 info = &md->u.tun_info; 2402 info->mode = IP_TUNNEL_INFO_TX; 2403 2404 info->key.tun_flags = TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_NOCACHE; 2405 if (flags & BPF_F_DONT_FRAGMENT) 2406 info->key.tun_flags |= TUNNEL_DONT_FRAGMENT; 2407 2408 info->key.tun_id = cpu_to_be64(from->tunnel_id); 2409 info->key.tos = from->tunnel_tos; 2410 info->key.ttl = from->tunnel_ttl; 2411 2412 if (flags & BPF_F_TUNINFO_IPV6) { 2413 info->mode |= IP_TUNNEL_INFO_IPV6; 2414 memcpy(&info->key.u.ipv6.dst, from->remote_ipv6, 2415 sizeof(from->remote_ipv6)); 2416 info->key.label = cpu_to_be32(from->tunnel_label) & 2417 IPV6_FLOWLABEL_MASK; 2418 } else { 2419 info->key.u.ipv4.dst = cpu_to_be32(from->remote_ipv4); 2420 if (flags & BPF_F_ZERO_CSUM_TX) 2421 info->key.tun_flags &= ~TUNNEL_CSUM; 2422 } 2423 2424 return 0; 2425 } 2426 2427 static const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = { 2428 .func = bpf_skb_set_tunnel_key, 2429 .gpl_only = false, 2430 .ret_type = RET_INTEGER, 2431 .arg1_type = ARG_PTR_TO_CTX, 2432 .arg2_type = ARG_PTR_TO_STACK, 2433 .arg3_type = ARG_CONST_STACK_SIZE, 2434 .arg4_type = ARG_ANYTHING, 2435 }; 2436 BPF_CALL_3(bpf_skb_set_tunnel_opt,struct sk_buff *,skb,const u8 *,from,u32,size)2437 BPF_CALL_3(bpf_skb_set_tunnel_opt, struct sk_buff *, skb, 2438 const u8 *, from, u32, size) 2439 { 2440 struct ip_tunnel_info *info = skb_tunnel_info(skb); 2441 const struct metadata_dst *md = this_cpu_ptr(md_dst); 2442 2443 if (unlikely(info != &md->u.tun_info || (size & (sizeof(u32) - 1)))) 2444 return -EINVAL; 2445 if (unlikely(size > IP_TUNNEL_OPTS_MAX)) 2446 return -ENOMEM; 2447 2448 ip_tunnel_info_opts_set(info, from, size); 2449 2450 return 0; 2451 } 2452 2453 static const struct bpf_func_proto bpf_skb_set_tunnel_opt_proto = { 2454 .func = bpf_skb_set_tunnel_opt, 2455 .gpl_only = false, 2456 .ret_type = RET_INTEGER, 2457 .arg1_type = ARG_PTR_TO_CTX, 2458 .arg2_type = ARG_PTR_TO_STACK, 2459 .arg3_type = ARG_CONST_STACK_SIZE, 2460 }; 2461 2462 static const struct bpf_func_proto * bpf_get_skb_set_tunnel_proto(enum bpf_func_id which)2463 bpf_get_skb_set_tunnel_proto(enum bpf_func_id which) 2464 { 2465 if (!md_dst) { 2466 /* Race is not possible, since it's called from verifier 2467 * that is holding verifier mutex. 
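		 * The per-cpu metadata dst is allocated lazily on first use
		 * and is shared by bpf_skb_set_tunnel_key() and
		 * bpf_skb_set_tunnel_opt(); IP_TUNNEL_OPTS_MAX reserves enough
		 * room for the largest option area a program may set.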
2468 */ 2469 md_dst = metadata_dst_alloc_percpu(IP_TUNNEL_OPTS_MAX, 2470 GFP_KERNEL); 2471 if (!md_dst) 2472 return NULL; 2473 } 2474 2475 switch (which) { 2476 case BPF_FUNC_skb_set_tunnel_key: 2477 return &bpf_skb_set_tunnel_key_proto; 2478 case BPF_FUNC_skb_set_tunnel_opt: 2479 return &bpf_skb_set_tunnel_opt_proto; 2480 default: 2481 return NULL; 2482 } 2483 } 2484 BPF_CALL_3(bpf_skb_under_cgroup,struct sk_buff *,skb,struct bpf_map *,map,u32,idx)2485 BPF_CALL_3(bpf_skb_under_cgroup, struct sk_buff *, skb, struct bpf_map *, map, 2486 u32, idx) 2487 { 2488 struct bpf_array *array = container_of(map, struct bpf_array, map); 2489 struct cgroup *cgrp; 2490 struct sock *sk; 2491 2492 sk = skb_to_full_sk(skb); 2493 if (!sk || !sk_fullsock(sk)) 2494 return -ENOENT; 2495 if (unlikely(idx >= array->map.max_entries)) 2496 return -E2BIG; 2497 2498 cgrp = READ_ONCE(array->ptrs[idx]); 2499 if (unlikely(!cgrp)) 2500 return -EAGAIN; 2501 2502 return sk_under_cgroup_hierarchy(sk, cgrp); 2503 } 2504 2505 static const struct bpf_func_proto bpf_skb_under_cgroup_proto = { 2506 .func = bpf_skb_under_cgroup, 2507 .gpl_only = false, 2508 .ret_type = RET_INTEGER, 2509 .arg1_type = ARG_PTR_TO_CTX, 2510 .arg2_type = ARG_CONST_MAP_PTR, 2511 .arg3_type = ARG_ANYTHING, 2512 }; 2513 bpf_xdp_copy(void * dst_buff,const void * src_buff,unsigned long off,unsigned long len)2514 static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff, 2515 unsigned long off, unsigned long len) 2516 { 2517 memcpy(dst_buff, src_buff + off, len); 2518 return 0; 2519 } 2520 BPF_CALL_5(bpf_xdp_event_output,struct xdp_buff *,xdp,struct bpf_map *,map,u64,flags,void *,meta,u64,meta_size)2521 BPF_CALL_5(bpf_xdp_event_output, struct xdp_buff *, xdp, struct bpf_map *, map, 2522 u64, flags, void *, meta, u64, meta_size) 2523 { 2524 u64 xdp_size = (flags & BPF_F_CTXLEN_MASK) >> 32; 2525 2526 if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK))) 2527 return -EINVAL; 2528 if (unlikely(xdp_size > (unsigned long)(xdp->data_end - xdp->data))) 2529 return -EFAULT; 2530 2531 return bpf_event_output(map, flags, meta, meta_size, xdp, xdp_size, 2532 bpf_xdp_copy); 2533 } 2534 2535 static const struct bpf_func_proto bpf_xdp_event_output_proto = { 2536 .func = bpf_xdp_event_output, 2537 .gpl_only = true, 2538 .ret_type = RET_INTEGER, 2539 .arg1_type = ARG_PTR_TO_CTX, 2540 .arg2_type = ARG_CONST_MAP_PTR, 2541 .arg3_type = ARG_ANYTHING, 2542 .arg4_type = ARG_PTR_TO_STACK, 2543 .arg5_type = ARG_CONST_STACK_SIZE, 2544 }; 2545 BPF_CALL_1(bpf_get_socket_cookie,struct sk_buff *,skb)2546 BPF_CALL_1(bpf_get_socket_cookie, struct sk_buff *, skb) 2547 { 2548 return skb->sk ? 
sock_gen_cookie(skb->sk) : 0; 2549 } 2550 2551 static const struct bpf_func_proto bpf_get_socket_cookie_proto = { 2552 .func = bpf_get_socket_cookie, 2553 .gpl_only = false, 2554 .ret_type = RET_INTEGER, 2555 .arg1_type = ARG_PTR_TO_CTX, 2556 }; 2557 BPF_CALL_1(bpf_get_socket_uid,struct sk_buff *,skb)2558 BPF_CALL_1(bpf_get_socket_uid, struct sk_buff *, skb) 2559 { 2560 struct sock *sk = sk_to_full_sk(skb->sk); 2561 kuid_t kuid; 2562 2563 if (!sk || !sk_fullsock(sk)) 2564 return overflowuid; 2565 kuid = sock_net_uid(sock_net(sk), sk); 2566 return from_kuid_munged(sock_net(sk)->user_ns, kuid); 2567 } 2568 2569 static const struct bpf_func_proto bpf_get_socket_uid_proto = { 2570 .func = bpf_get_socket_uid, 2571 .gpl_only = false, 2572 .ret_type = RET_INTEGER, 2573 .arg1_type = ARG_PTR_TO_CTX, 2574 }; 2575 2576 static const struct bpf_func_proto * sk_filter_func_proto(enum bpf_func_id func_id)2577 sk_filter_func_proto(enum bpf_func_id func_id) 2578 { 2579 switch (func_id) { 2580 case BPF_FUNC_map_lookup_elem: 2581 return &bpf_map_lookup_elem_proto; 2582 case BPF_FUNC_map_update_elem: 2583 return &bpf_map_update_elem_proto; 2584 case BPF_FUNC_map_delete_elem: 2585 return &bpf_map_delete_elem_proto; 2586 case BPF_FUNC_get_prandom_u32: 2587 return &bpf_get_prandom_u32_proto; 2588 case BPF_FUNC_get_smp_processor_id: 2589 return &bpf_get_raw_smp_processor_id_proto; 2590 case BPF_FUNC_tail_call: 2591 return &bpf_tail_call_proto; 2592 case BPF_FUNC_ktime_get_ns: 2593 return &bpf_ktime_get_ns_proto; 2594 case BPF_FUNC_trace_printk: 2595 if (capable(CAP_SYS_ADMIN)) 2596 return bpf_get_trace_printk_proto(); 2597 case BPF_FUNC_get_socket_cookie: 2598 return &bpf_get_socket_cookie_proto; 2599 case BPF_FUNC_get_socket_uid: 2600 return &bpf_get_socket_uid_proto; 2601 default: 2602 return NULL; 2603 } 2604 } 2605 2606 static const struct bpf_func_proto * tc_cls_act_func_proto(enum bpf_func_id func_id)2607 tc_cls_act_func_proto(enum bpf_func_id func_id) 2608 { 2609 switch (func_id) { 2610 case BPF_FUNC_skb_store_bytes: 2611 return &bpf_skb_store_bytes_proto; 2612 case BPF_FUNC_skb_load_bytes: 2613 return &bpf_skb_load_bytes_proto; 2614 case BPF_FUNC_skb_pull_data: 2615 return &bpf_skb_pull_data_proto; 2616 case BPF_FUNC_csum_diff: 2617 return &bpf_csum_diff_proto; 2618 case BPF_FUNC_csum_update: 2619 return &bpf_csum_update_proto; 2620 case BPF_FUNC_l3_csum_replace: 2621 return &bpf_l3_csum_replace_proto; 2622 case BPF_FUNC_l4_csum_replace: 2623 return &bpf_l4_csum_replace_proto; 2624 case BPF_FUNC_clone_redirect: 2625 return &bpf_clone_redirect_proto; 2626 case BPF_FUNC_get_cgroup_classid: 2627 return &bpf_get_cgroup_classid_proto; 2628 case BPF_FUNC_skb_vlan_push: 2629 return &bpf_skb_vlan_push_proto; 2630 case BPF_FUNC_skb_vlan_pop: 2631 return &bpf_skb_vlan_pop_proto; 2632 case BPF_FUNC_skb_change_proto: 2633 return &bpf_skb_change_proto_proto; 2634 case BPF_FUNC_skb_change_type: 2635 return &bpf_skb_change_type_proto; 2636 case BPF_FUNC_skb_change_tail: 2637 return &bpf_skb_change_tail_proto; 2638 case BPF_FUNC_skb_get_tunnel_key: 2639 return &bpf_skb_get_tunnel_key_proto; 2640 case BPF_FUNC_skb_set_tunnel_key: 2641 return bpf_get_skb_set_tunnel_proto(func_id); 2642 case BPF_FUNC_skb_get_tunnel_opt: 2643 return &bpf_skb_get_tunnel_opt_proto; 2644 case BPF_FUNC_skb_set_tunnel_opt: 2645 return bpf_get_skb_set_tunnel_proto(func_id); 2646 case BPF_FUNC_redirect: 2647 return &bpf_redirect_proto; 2648 case BPF_FUNC_get_route_realm: 2649 return &bpf_get_route_realm_proto; 2650 case BPF_FUNC_get_hash_recalc: 
2651 return &bpf_get_hash_recalc_proto; 2652 case BPF_FUNC_set_hash_invalid: 2653 return &bpf_set_hash_invalid_proto; 2654 case BPF_FUNC_perf_event_output: 2655 return &bpf_skb_event_output_proto; 2656 case BPF_FUNC_get_smp_processor_id: 2657 return &bpf_get_smp_processor_id_proto; 2658 case BPF_FUNC_skb_under_cgroup: 2659 return &bpf_skb_under_cgroup_proto; 2660 default: 2661 return sk_filter_func_proto(func_id); 2662 } 2663 } 2664 2665 static const struct bpf_func_proto * xdp_func_proto(enum bpf_func_id func_id)2666 xdp_func_proto(enum bpf_func_id func_id) 2667 { 2668 switch (func_id) { 2669 case BPF_FUNC_perf_event_output: 2670 return &bpf_xdp_event_output_proto; 2671 case BPF_FUNC_get_smp_processor_id: 2672 return &bpf_get_smp_processor_id_proto; 2673 default: 2674 return sk_filter_func_proto(func_id); 2675 } 2676 } 2677 2678 static const struct bpf_func_proto * cg_skb_func_proto(enum bpf_func_id func_id)2679 cg_skb_func_proto(enum bpf_func_id func_id) 2680 { 2681 switch (func_id) { 2682 case BPF_FUNC_skb_load_bytes: 2683 return &bpf_skb_load_bytes_proto; 2684 default: 2685 return sk_filter_func_proto(func_id); 2686 } 2687 } 2688 __is_valid_access(int off,int size,enum bpf_access_type type)2689 static bool __is_valid_access(int off, int size, enum bpf_access_type type) 2690 { 2691 if (off < 0 || off >= sizeof(struct __sk_buff)) 2692 return false; 2693 /* The verifier guarantees that size > 0. */ 2694 if (off % size != 0) 2695 return false; 2696 if (size != sizeof(__u32)) 2697 return false; 2698 2699 return true; 2700 } 2701 sk_filter_is_valid_access(int off,int size,enum bpf_access_type type,enum bpf_reg_type * reg_type)2702 static bool sk_filter_is_valid_access(int off, int size, 2703 enum bpf_access_type type, 2704 enum bpf_reg_type *reg_type) 2705 { 2706 switch (off) { 2707 case offsetof(struct __sk_buff, tc_classid): 2708 case offsetof(struct __sk_buff, data): 2709 case offsetof(struct __sk_buff, data_end): 2710 return false; 2711 } 2712 2713 if (type == BPF_WRITE) { 2714 switch (off) { 2715 case offsetof(struct __sk_buff, cb[0]) ... 2716 offsetof(struct __sk_buff, cb[4]): 2717 break; 2718 default: 2719 return false; 2720 } 2721 } 2722 2723 return __is_valid_access(off, size, type); 2724 } 2725 tc_cls_act_prologue(struct bpf_insn * insn_buf,bool direct_write,const struct bpf_prog * prog)2726 static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write, 2727 const struct bpf_prog *prog) 2728 { 2729 struct bpf_insn *insn = insn_buf; 2730 2731 if (!direct_write) 2732 return 0; 2733 2734 /* if (!skb->cloned) 2735 * goto start; 2736 * 2737 * (Fast-path, otherwise approximation that we might be 2738 * a clone, do the rest in helper.) 
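	 * The emitted prologue thus corresponds roughly to:
	 *
	 *	if (skb->cloned) {
	 *		if (bpf_skb_pull_data(skb, 0) != 0)
	 *			return TC_ACT_SHOT;
	 *	}
	 *	run the original program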
2739 */ 2740 *insn++ = BPF_LDX_MEM(BPF_B, BPF_REG_6, BPF_REG_1, CLONED_OFFSET()); 2741 *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_6, CLONED_MASK); 2742 *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 7); 2743 2744 /* ret = bpf_skb_pull_data(skb, 0); */ 2745 *insn++ = BPF_MOV64_REG(BPF_REG_6, BPF_REG_1); 2746 *insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_2, BPF_REG_2); 2747 *insn++ = BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, 2748 BPF_FUNC_skb_pull_data); 2749 /* if (!ret) 2750 * goto restore; 2751 * return TC_ACT_SHOT; 2752 */ 2753 *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2); 2754 *insn++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, TC_ACT_SHOT); 2755 *insn++ = BPF_EXIT_INSN(); 2756 2757 /* restore: */ 2758 *insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_6); 2759 /* start: */ 2760 *insn++ = prog->insnsi[0]; 2761 2762 return insn - insn_buf; 2763 } 2764 tc_cls_act_is_valid_access(int off,int size,enum bpf_access_type type,enum bpf_reg_type * reg_type)2765 static bool tc_cls_act_is_valid_access(int off, int size, 2766 enum bpf_access_type type, 2767 enum bpf_reg_type *reg_type) 2768 { 2769 if (type == BPF_WRITE) { 2770 switch (off) { 2771 case offsetof(struct __sk_buff, mark): 2772 case offsetof(struct __sk_buff, tc_index): 2773 case offsetof(struct __sk_buff, priority): 2774 case offsetof(struct __sk_buff, cb[0]) ... 2775 offsetof(struct __sk_buff, cb[4]): 2776 case offsetof(struct __sk_buff, tc_classid): 2777 break; 2778 default: 2779 return false; 2780 } 2781 } 2782 2783 switch (off) { 2784 case offsetof(struct __sk_buff, data): 2785 *reg_type = PTR_TO_PACKET; 2786 break; 2787 case offsetof(struct __sk_buff, data_end): 2788 *reg_type = PTR_TO_PACKET_END; 2789 break; 2790 } 2791 2792 return __is_valid_access(off, size, type); 2793 } 2794 __is_valid_xdp_access(int off,int size,enum bpf_access_type type)2795 static bool __is_valid_xdp_access(int off, int size, 2796 enum bpf_access_type type) 2797 { 2798 if (off < 0 || off >= sizeof(struct xdp_md)) 2799 return false; 2800 if (off % size != 0) 2801 return false; 2802 if (size != sizeof(__u32)) 2803 return false; 2804 2805 return true; 2806 } 2807 xdp_is_valid_access(int off,int size,enum bpf_access_type type,enum bpf_reg_type * reg_type)2808 static bool xdp_is_valid_access(int off, int size, 2809 enum bpf_access_type type, 2810 enum bpf_reg_type *reg_type) 2811 { 2812 if (type == BPF_WRITE) 2813 return false; 2814 2815 switch (off) { 2816 case offsetof(struct xdp_md, data): 2817 *reg_type = PTR_TO_PACKET; 2818 break; 2819 case offsetof(struct xdp_md, data_end): 2820 *reg_type = PTR_TO_PACKET_END; 2821 break; 2822 } 2823 2824 return __is_valid_xdp_access(off, size, type); 2825 } 2826 bpf_warn_invalid_xdp_action(u32 act)2827 void bpf_warn_invalid_xdp_action(u32 act) 2828 { 2829 WARN_ONCE(1, "Illegal XDP return value %u, expect packet loss\n", act); 2830 } 2831 EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action); 2832 sk_filter_convert_ctx_access(enum bpf_access_type type,int dst_reg,int src_reg,int ctx_off,struct bpf_insn * insn_buf,struct bpf_prog * prog)2833 static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg, 2834 int src_reg, int ctx_off, 2835 struct bpf_insn *insn_buf, 2836 struct bpf_prog *prog) 2837 { 2838 struct bpf_insn *insn = insn_buf; 2839 2840 switch (ctx_off) { 2841 case offsetof(struct __sk_buff, len): 2842 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, len) != 4); 2843 2844 *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, 2845 offsetof(struct sk_buff, len)); 2846 break; 2847 2848 case offsetof(struct __sk_buff, protocol): 2849 
BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2); 2850 2851 *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, 2852 offsetof(struct sk_buff, protocol)); 2853 break; 2854 2855 case offsetof(struct __sk_buff, vlan_proto): 2856 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_proto) != 2); 2857 2858 *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, 2859 offsetof(struct sk_buff, vlan_proto)); 2860 break; 2861 2862 case offsetof(struct __sk_buff, priority): 2863 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, priority) != 4); 2864 2865 if (type == BPF_WRITE) 2866 *insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg, 2867 offsetof(struct sk_buff, priority)); 2868 else 2869 *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, 2870 offsetof(struct sk_buff, priority)); 2871 break; 2872 2873 case offsetof(struct __sk_buff, ingress_ifindex): 2874 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, skb_iif) != 4); 2875 2876 *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, 2877 offsetof(struct sk_buff, skb_iif)); 2878 break; 2879 2880 case offsetof(struct __sk_buff, ifindex): 2881 BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4); 2882 2883 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev), 2884 dst_reg, src_reg, 2885 offsetof(struct sk_buff, dev)); 2886 *insn++ = BPF_JMP_IMM(BPF_JEQ, dst_reg, 0, 1); 2887 *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, dst_reg, 2888 offsetof(struct net_device, ifindex)); 2889 break; 2890 2891 case offsetof(struct __sk_buff, hash): 2892 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4); 2893 2894 *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, 2895 offsetof(struct sk_buff, hash)); 2896 break; 2897 2898 case offsetof(struct __sk_buff, mark): 2899 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4); 2900 2901 if (type == BPF_WRITE) 2902 *insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg, 2903 offsetof(struct sk_buff, mark)); 2904 else 2905 *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, 2906 offsetof(struct sk_buff, mark)); 2907 break; 2908 2909 case offsetof(struct __sk_buff, pkt_type): 2910 return convert_skb_access(SKF_AD_PKTTYPE, dst_reg, src_reg, insn); 2911 2912 case offsetof(struct __sk_buff, queue_mapping): 2913 return convert_skb_access(SKF_AD_QUEUE, dst_reg, src_reg, insn); 2914 2915 case offsetof(struct __sk_buff, vlan_present): 2916 return convert_skb_access(SKF_AD_VLAN_TAG_PRESENT, 2917 dst_reg, src_reg, insn); 2918 2919 case offsetof(struct __sk_buff, vlan_tci): 2920 return convert_skb_access(SKF_AD_VLAN_TAG, 2921 dst_reg, src_reg, insn); 2922 2923 case offsetof(struct __sk_buff, cb[0]) ... 
2924 offsetof(struct __sk_buff, cb[4]): 2925 BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, data) < 20); 2926 2927 prog->cb_access = 1; 2928 ctx_off -= offsetof(struct __sk_buff, cb[0]); 2929 ctx_off += offsetof(struct sk_buff, cb); 2930 ctx_off += offsetof(struct qdisc_skb_cb, data); 2931 if (type == BPF_WRITE) 2932 *insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg, ctx_off); 2933 else 2934 *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, ctx_off); 2935 break; 2936 2937 case offsetof(struct __sk_buff, tc_classid): 2938 ctx_off -= offsetof(struct __sk_buff, tc_classid); 2939 ctx_off += offsetof(struct sk_buff, cb); 2940 ctx_off += offsetof(struct qdisc_skb_cb, tc_classid); 2941 if (type == BPF_WRITE) 2942 *insn++ = BPF_STX_MEM(BPF_H, dst_reg, src_reg, ctx_off); 2943 else 2944 *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, ctx_off); 2945 break; 2946 2947 case offsetof(struct __sk_buff, data): 2948 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data), 2949 dst_reg, src_reg, 2950 offsetof(struct sk_buff, data)); 2951 break; 2952 2953 case offsetof(struct __sk_buff, data_end): 2954 ctx_off -= offsetof(struct __sk_buff, data_end); 2955 ctx_off += offsetof(struct sk_buff, cb); 2956 ctx_off += offsetof(struct bpf_skb_data_end, data_end); 2957 *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), dst_reg, src_reg, 2958 ctx_off); 2959 break; 2960 2961 case offsetof(struct __sk_buff, tc_index): 2962 #ifdef CONFIG_NET_SCHED 2963 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, tc_index) != 2); 2964 2965 if (type == BPF_WRITE) 2966 *insn++ = BPF_STX_MEM(BPF_H, dst_reg, src_reg, 2967 offsetof(struct sk_buff, tc_index)); 2968 else 2969 *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, 2970 offsetof(struct sk_buff, tc_index)); 2971 break; 2972 #else 2973 if (type == BPF_WRITE) 2974 *insn++ = BPF_MOV64_REG(dst_reg, dst_reg); 2975 else 2976 *insn++ = BPF_MOV64_IMM(dst_reg, 0); 2977 break; 2978 #endif 2979 } 2980 2981 return insn - insn_buf; 2982 } 2983 tc_cls_act_convert_ctx_access(enum bpf_access_type type,int dst_reg,int src_reg,int ctx_off,struct bpf_insn * insn_buf,struct bpf_prog * prog)2984 static u32 tc_cls_act_convert_ctx_access(enum bpf_access_type type, int dst_reg, 2985 int src_reg, int ctx_off, 2986 struct bpf_insn *insn_buf, 2987 struct bpf_prog *prog) 2988 { 2989 struct bpf_insn *insn = insn_buf; 2990 2991 switch (ctx_off) { 2992 case offsetof(struct __sk_buff, ifindex): 2993 BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4); 2994 2995 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev), 2996 dst_reg, src_reg, 2997 offsetof(struct sk_buff, dev)); 2998 *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, dst_reg, 2999 offsetof(struct net_device, ifindex)); 3000 break; 3001 default: 3002 return sk_filter_convert_ctx_access(type, dst_reg, src_reg, 3003 ctx_off, insn_buf, prog); 3004 } 3005 3006 return insn - insn_buf; 3007 } 3008 xdp_convert_ctx_access(enum bpf_access_type type,int dst_reg,int src_reg,int ctx_off,struct bpf_insn * insn_buf,struct bpf_prog * prog)3009 static u32 xdp_convert_ctx_access(enum bpf_access_type type, int dst_reg, 3010 int src_reg, int ctx_off, 3011 struct bpf_insn *insn_buf, 3012 struct bpf_prog *prog) 3013 { 3014 struct bpf_insn *insn = insn_buf; 3015 3016 switch (ctx_off) { 3017 case offsetof(struct xdp_md, data): 3018 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data), 3019 dst_reg, src_reg, 3020 offsetof(struct xdp_buff, data)); 3021 break; 3022 case offsetof(struct xdp_md, data_end): 3023 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_end), 3024 dst_reg, 
src_reg, 3025 offsetof(struct xdp_buff, data_end)); 3026 break; 3027 } 3028 3029 return insn - insn_buf; 3030 } 3031 3032 static const struct bpf_verifier_ops sk_filter_ops = { 3033 .get_func_proto = sk_filter_func_proto, 3034 .is_valid_access = sk_filter_is_valid_access, 3035 .convert_ctx_access = sk_filter_convert_ctx_access, 3036 }; 3037 3038 static const struct bpf_verifier_ops tc_cls_act_ops = { 3039 .get_func_proto = tc_cls_act_func_proto, 3040 .is_valid_access = tc_cls_act_is_valid_access, 3041 .convert_ctx_access = tc_cls_act_convert_ctx_access, 3042 .gen_prologue = tc_cls_act_prologue, 3043 }; 3044 3045 static const struct bpf_verifier_ops xdp_ops = { 3046 .get_func_proto = xdp_func_proto, 3047 .is_valid_access = xdp_is_valid_access, 3048 .convert_ctx_access = xdp_convert_ctx_access, 3049 }; 3050 3051 static const struct bpf_verifier_ops cg_skb_ops = { 3052 .get_func_proto = cg_skb_func_proto, 3053 .is_valid_access = sk_filter_is_valid_access, 3054 .convert_ctx_access = sk_filter_convert_ctx_access, 3055 }; 3056 3057 static struct bpf_prog_type_list sk_filter_type __read_mostly = { 3058 .ops = &sk_filter_ops, 3059 .type = BPF_PROG_TYPE_SOCKET_FILTER, 3060 }; 3061 3062 static struct bpf_prog_type_list sched_cls_type __read_mostly = { 3063 .ops = &tc_cls_act_ops, 3064 .type = BPF_PROG_TYPE_SCHED_CLS, 3065 }; 3066 3067 static struct bpf_prog_type_list sched_act_type __read_mostly = { 3068 .ops = &tc_cls_act_ops, 3069 .type = BPF_PROG_TYPE_SCHED_ACT, 3070 }; 3071 3072 static struct bpf_prog_type_list xdp_type __read_mostly = { 3073 .ops = &xdp_ops, 3074 .type = BPF_PROG_TYPE_XDP, 3075 }; 3076 3077 static struct bpf_prog_type_list cg_skb_type __read_mostly = { 3078 .ops = &cg_skb_ops, 3079 .type = BPF_PROG_TYPE_CGROUP_SKB, 3080 }; 3081 register_sk_filter_ops(void)3082 static int __init register_sk_filter_ops(void) 3083 { 3084 bpf_register_prog_type(&sk_filter_type); 3085 bpf_register_prog_type(&sched_cls_type); 3086 bpf_register_prog_type(&sched_act_type); 3087 bpf_register_prog_type(&xdp_type); 3088 bpf_register_prog_type(&cg_skb_type); 3089 3090 return 0; 3091 } 3092 late_initcall(register_sk_filter_ops); 3093 sk_detach_filter(struct sock * sk)3094 int sk_detach_filter(struct sock *sk) 3095 { 3096 int ret = -ENOENT; 3097 struct sk_filter *filter; 3098 3099 if (sock_flag(sk, SOCK_FILTER_LOCKED)) 3100 return -EPERM; 3101 3102 filter = rcu_dereference_protected(sk->sk_filter, 3103 lockdep_sock_is_held(sk)); 3104 if (filter) { 3105 RCU_INIT_POINTER(sk->sk_filter, NULL); 3106 sk_filter_uncharge(sk, filter); 3107 ret = 0; 3108 } 3109 3110 return ret; 3111 } 3112 EXPORT_SYMBOL_GPL(sk_detach_filter); 3113 sk_get_filter(struct sock * sk,struct sock_filter __user * ubuf,unsigned int len)3114 int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf, 3115 unsigned int len) 3116 { 3117 struct sock_fprog_kern *fprog; 3118 struct sk_filter *filter; 3119 int ret = 0; 3120 3121 lock_sock(sk); 3122 filter = rcu_dereference_protected(sk->sk_filter, 3123 lockdep_sock_is_held(sk)); 3124 if (!filter) 3125 goto out; 3126 3127 /* We're copying the filter that has been originally attached, 3128 * so no conversion/decode needed anymore. eBPF programs that 3129 * have no original program cannot be dumped through this. 3130 */ 3131 ret = -EACCES; 3132 fprog = filter->prog->orig_prog; 3133 if (!fprog) 3134 goto out; 3135 3136 ret = fprog->len; 3137 if (!len) 3138 /* User space only enquires number of filter blocks. 
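		 * This is the two-step getsockopt(SO_GET_FILTER) convention: a
		 * first call with *optlen == 0 only reports how many
		 * struct sock_filter blocks to allocate, and a second call
		 * with a large enough buffer copies them out.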
		 */
		goto out;

	ret = -EINVAL;
	if (len < fprog->len)
		goto out;

	ret = -EFAULT;
	if (copy_to_user(ubuf, fprog->filter, bpf_classic_proglen(fprog)))
		goto out;

	/* Instead of bytes, the API requests to return the number
	 * of filter blocks.
	 */
	ret = fprog->len;
out:
	release_sock(sk);
	return ret;
}
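
/* Illustrative sketch (user space, not part of this file): sk_detach_filter()
 * and sk_get_filter() above back the SO_DETACH_FILTER and SO_GET_FILTER
 * socket options. Assuming <sys/socket.h> and <linux/filter.h>, and fd being
 * a socket that accepts classic filters (e.g. a packet socket), a minimal
 * round trip looks roughly like:
 *
 *	struct sock_filter code[] = {
 *		{ BPF_RET | BPF_K, 0, 0, 0x0000ffff },	// ret #0xffff: accept
 *	};
 *	struct sock_fprog fprog = { .len = 1, .filter = code, };
 *	unsigned int n = 0;
 *	int dummy = 0;
 *
 *	setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog, sizeof(fprog));
 *	getsockopt(fd, SOL_SOCKET, SO_GET_FILTER, NULL, &n);
 *	setsockopt(fd, SOL_SOCKET, SO_DETACH_FILTER, &dummy, sizeof(dummy));
 *
 * The getsockopt() call with a zero length only queries the number of
 * sock_filter blocks (n), as handled in sk_get_filter(); calling it again
 * with a buffer of n blocks dumps the originally attached program.
 */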