1 /* 2 * Copyright 2019 Google LLC 3 * 4 * Use of this source code is governed by a BSD-style license that can be 5 * found in the LICENSE file. 6 */ 7 8 #ifndef SkVM_DEFINED 9 #define SkVM_DEFINED 10 11 #include "include/core/SkTypes.h" 12 #include "include/private/SkTHash.h" 13 #include <vector> 14 15 namespace skvm { 16 17 class Assembler { 18 public: 19 explicit Assembler(void* buf); 20 21 size_t size() const; 22 23 // Order matters... GP64, Xmm, Ymm values match 4-bit register encoding for each. 24 enum GP64 { 25 rax, rcx, rdx, rbx, rsp, rbp, rsi, rdi, 26 r8 , r9 , r10, r11, r12, r13, r14, r15, 27 }; 28 enum Xmm { 29 xmm0, xmm1, xmm2 , xmm3 , xmm4 , xmm5 , xmm6 , xmm7 , 30 xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 31 }; 32 enum Ymm { 33 ymm0, ymm1, ymm2 , ymm3 , ymm4 , ymm5 , ymm6 , ymm7 , 34 ymm8, ymm9, ymm10, ymm11, ymm12, ymm13, ymm14, ymm15, 35 }; 36 37 // X and V values match 5-bit encoding for each (nothing tricky). 38 enum X { 39 x0 , x1 , x2 , x3 , x4 , x5 , x6 , x7 , 40 x8 , x9 , x10, x11, x12, x13, x14, x15, 41 x16, x17, x18, x19, x20, x21, x22, x23, 42 x24, x25, x26, x27, x28, x29, x30, xzr, 43 }; 44 enum V { 45 v0 , v1 , v2 , v3 , v4 , v5 , v6 , v7 , 46 v8 , v9 , v10, v11, v12, v13, v14, v15, 47 v16, v17, v18, v19, v20, v21, v22, v23, 48 v24, v25, v26, v27, v28, v29, v30, v31, 49 }; 50 51 void bytes(const void*, int); 52 void byte(uint8_t); 53 void word(uint32_t); 54 55 // x86-64 56 57 void align(int mod); 58 59 void vzeroupper(); 60 void ret(); 61 62 void add(GP64, int imm); 63 void sub(GP64, int imm); 64 65 // All dst = x op y. 66 using DstEqXOpY = void(Ymm dst, Ymm x, Ymm y); 67 DstEqXOpY vpand, vpor, vpxor, vpandn, 68 vpaddd, vpsubd, vpmulld, 69 vpsubw, vpmullw, 70 vaddps, vsubps, vmulps, vdivps, 71 vfmadd132ps, vfmadd213ps, vfmadd231ps, 72 vpackusdw, vpackuswb, 73 vpcmpeqd, vpcmpgtd; 74 75 using DstEqXOpImm = void(Ymm dst, Ymm x, int imm); 76 DstEqXOpImm vpslld, vpsrld, vpsrad, 77 vpsrlw, 78 vpermq; 79 80 using DstEqOpX = void(Ymm dst, Ymm x); 81 DstEqOpX vmovdqa, vcvtdq2ps, vcvttps2dq; 82 83 void vpblendvb(Ymm dst, Ymm x, Ymm y, Ymm z); 84 85 struct Label { 86 int offset = 0; 87 enum { None, ARMDisp19, X86Disp32 } kind = None; 88 std::vector<int> references; 89 }; 90 91 Label here(); 92 void label(Label*); 93 94 void jmp(Label*); 95 void je (Label*); 96 void jne(Label*); 97 void jl (Label*); 98 void cmp(GP64, int imm); 99 100 void vbroadcastss(Ymm dst, Label*); 101 void vbroadcastss(Ymm dst, Xmm src); 102 void vbroadcastss(Ymm dst, GP64 ptr, int off); // dst = *(ptr+off) 103 104 void vpshufb(Ymm dst, Ymm x, Label*); 105 106 void vmovups (Ymm dst, GP64 ptr); // dst = *ptr, 256-bit 107 void vpmovzxwd(Ymm dst, GP64 ptr); // dst = *ptr, 128-bit, each uint16_t expanded to int 108 void vpmovzxbd(Ymm dst, GP64 ptr); // dst = *ptr, 64-bit, each uint8_t expanded to int 109 void vmovd (Xmm dst, GP64 ptr); // dst = *ptr, 32-bit 110 111 void vmovups(GP64 ptr, Ymm src); // *ptr = src, 256-bit 112 void vmovups(GP64 ptr, Xmm src); // *ptr = src, 128-bit 113 void vmovq (GP64 ptr, Xmm src); // *ptr = src, 64-bit 114 void vmovd (GP64 ptr, Xmm src); // *ptr = src, 32-bit 115 116 void movzbl(GP64 dst, GP64 ptr, int off); // dst = *(ptr+off), uint8_t -> int 117 void movb (GP64 ptr, GP64 src); // *ptr = src, 8-bit 118 119 void vmovd_direct(GP64 dst, Xmm src); // dst = src, 32-bit 120 void vmovd_direct(Xmm dst, GP64 src); // dst = src, 32-bit 121 122 void vpinsrw(Xmm dst, Xmm src, GP64 ptr, int imm); // dst = src; dst[imm] = *ptr, 16-bit 123 void vpinsrb(Xmm dst, Xmm src, GP64 ptr, int imm); // dst = src; dst[imm] = *ptr, 8-bit 124 125 void vpextrw(GP64 ptr, Xmm src, int imm); // *dst = src[imm] , 16-bit 126 void vpextrb(GP64 ptr, Xmm src, int imm); // *dst = src[imm] , 8-bit 127 128 // aarch64 129 130 // d = op(n,m) 131 using DOpNM = void(V d, V n, V m); 132 DOpNM and16b, orr16b, eor16b, bic16b, 133 add4s, sub4s, mul4s, 134 sub8h, mul8h, 135 fadd4s, fsub4s, fmul4s, fdiv4s, 136 tbl; 137 138 // d += n*m 139 void fmla4s(V d, V n, V m); 140 141 // d = op(n,imm) 142 using DOpNImm = void(V d, V n, int imm); 143 DOpNImm sli4s, 144 shl4s, sshr4s, ushr4s, 145 ushr8h; 146 147 // d = op(n) 148 using DOpN = void(V d, V n); 149 DOpN scvtf4s, // int -> float 150 fcvtzs4s, // truncate float -> int 151 xtns2h, // u32 -> u16 152 xtnh2b, // u16 -> u8 153 uxtlb2h, // u8 -> u16 154 uxtlh2s; // u16 -> u32 155 156 // TODO: both these platforms support rounding float->int (vcvtps2dq, fcvtns.4s)... use? 157 158 void ret (X); 159 void add (X d, X n, int imm12); 160 void sub (X d, X n, int imm12); 161 void subs(X d, X n, int imm12); // subtract setting condition flags 162 163 // There's another encoding for unconditional branches that can jump further, 164 // but this one encoded as b.al is simple to implement and should be fine. b(Label * l)165 void b (Label* l) { this->b(Condition::al, l); } bne(Label * l)166 void bne(Label* l) { this->b(Condition::ne, l); } blt(Label * l)167 void blt(Label* l) { this->b(Condition::lt, l); } 168 169 // "cmp ..." is just an assembler mnemonic for "subs xzr, ..."! cmp(X n,int imm12)170 void cmp(X n, int imm12) { this->subs(xzr, n, imm12); } 171 172 // Compare and branch if zero/non-zero, as if 173 // cmp(t,0) 174 // beq/bne(l) 175 // but without setting condition flags. 176 void cbz (X t, Label* l); 177 void cbnz(X t, Label* l); 178 179 void ldrq(V dst, Label*); // 128-bit PC-relative load 180 181 void ldrq(V dst, X src); // 128-bit dst = *src 182 void ldrs(V dst, X src); // 32-bit dst = *src 183 void ldrb(V dst, X src); // 8-bit dst = *src 184 185 void strq(V src, X dst); // 128-bit *dst = src 186 void strs(V src, X dst); // 32-bit *dst = src 187 void strb(V src, X dst); // 8-bit *dst = src 188 189 private: 190 // dst = op(dst, imm) 191 void op(int opcode, int opcode_ext, GP64 dst, int imm); 192 193 194 // dst = op(x,y) or op(x) 195 void op(int prefix, int map, int opcode, Ymm dst, Ymm x, Ymm y, bool W=false); 196 void op(int prefix, int map, int opcode, Ymm dst, Ymm x, bool W=false) { 197 // Two arguments ops seem to pass them in dst and y, forcing x to 0 so VEX.vvvv == 1111. 198 this->op(prefix, map, opcode, dst,(Ymm)0,x, W); 199 } 200 201 // dst = op(x,imm) 202 void op(int prefix, int map, int opcode, int opcode_ext, Ymm dst, Ymm x, int imm); 203 204 // dst = op(x,label) or op(label) 205 void op(int prefix, int map, int opcode, Ymm dst, Ymm x, Label* l); 206 207 // *ptr = ymm or ymm = *ptr, depending on opcode. 208 void load_store(int prefix, int map, int opcode, Ymm ymm, GP64 ptr); 209 210 // Opcode for 3-arguments ops is split between hi and lo: 211 // [11 bits hi] [5 bits m] [6 bits lo] [5 bits n] [5 bits d] 212 void op(uint32_t hi, V m, uint32_t lo, V n, V d); 213 214 // 2-argument ops, with or without an immediate. 215 void op(uint32_t op22, int imm, V n, V d); op(uint32_t op22,V n,V d)216 void op(uint32_t op22, V n, V d) { this->op(op22,0,n,d); } op(uint32_t op22,X x,V v)217 void op(uint32_t op22, X x, V v) { this->op(op22,0,(V)x,v); } 218 219 // Order matters... value is 4-bit encoding for condition code. 220 enum class Condition { eq,ne,cs,cc,mi,pl,vs,vc,hi,ls,ge,lt,gt,le,al }; 221 void b(Condition, Label*); 222 223 void jump(uint8_t condition, Label*); 224 225 int disp19(Label*); 226 int disp32(Label*); 227 228 uint8_t* fCode; 229 uint8_t* fCurr; 230 size_t fSize; 231 }; 232 233 enum class Op : uint8_t { 234 store8, store16, store32, 235 // ↑ side effects / no side effects ↓ 236 237 load8, load16, load32, 238 gather8, gather16, gather32, 239 // ↑ always varying / uniforms, constants, Just Math ↓ 240 241 uniform8, uniform16, uniform32, 242 splat, 243 244 add_f32, add_i32, add_i16x2, 245 sub_f32, sub_i32, sub_i16x2, 246 mul_f32, mul_i32, mul_i16x2, 247 div_f32, 248 mad_f32, 249 shl_i32, shl_i16x2, 250 shr_i32, shr_i16x2, 251 sra_i32, sra_i16x2, 252 253 to_i32, to_f32, 254 255 eq_f32, eq_i32, eq_i16x2, 256 neq_f32, neq_i32, neq_i16x2, 257 lt_f32, lt_i32, lt_i16x2, 258 lte_f32, lte_i32, lte_i16x2, 259 gt_f32, gt_i32, gt_i16x2, 260 gte_f32, gte_i32, gte_i16x2, 261 262 bit_and, 263 bit_or, 264 bit_xor, 265 bit_clear, 266 select, 267 268 bytes, extract, pack, 269 }; 270 271 using Val = int; 272 // We reserve the last Val ID as a sentinel meaning none, n/a, null, nil, etc. 273 static const Val NA = ~0; 274 275 struct Arg { int ix; }; 276 struct I32 { Val id; }; 277 struct F32 { Val id; }; 278 279 class Program; 280 281 class Builder { 282 public: 283 struct Instruction { 284 Op op; // v* = op(x,y,z,imm), where * == index of this Instruction. 285 Val x,y,z; // Enough arguments for mad(). 286 int imm; // Immediate bit pattern, shift count, argument index, etc. 287 288 // Not populated until done() has been called. 289 int death; // Index of last live instruction taking this input; live if != 0. 290 bool hoist; // Value independent of all loop variables? 291 }; 292 293 Program done(const char* debug_name = nullptr); 294 295 // Mostly for debugging, tests, etc. program()296 std::vector<Instruction> program() const { return fProgram; } 297 298 299 // Declare an argument with given stride (use stride=0 for uniforms). 300 // TODO: different types for varying and uniforms? 301 Arg arg(int stride); 302 303 // Convenience arg() wrappers for most common strides, sizeof(T) and 0. 304 template <typename T> varying()305 Arg varying() { return this->arg(sizeof(T)); } uniform()306 Arg uniform() { return this->arg(0); } 307 308 // TODO: allow uniform (i.e. Arg) offsets to store* and load*? 309 // TODO: sign extension (signed types) for <32-bit loads? 310 // TODO: unsigned integer operations where relevant (just comparisons?)? 311 312 // Store {8,16,32}-bit varying. 313 void store8 (Arg ptr, I32 val); 314 void store16(Arg ptr, I32 val); 315 void store32(Arg ptr, I32 val); 316 317 // Load u8,u16,i32 varying. 318 I32 load8 (Arg ptr); 319 I32 load16(Arg ptr); 320 I32 load32(Arg ptr); 321 322 // Gather u8,u16,i32 with varying element-count offset. 323 I32 gather8 (Arg ptr, I32 offset); 324 I32 gather16(Arg ptr, I32 offset); 325 I32 gather32(Arg ptr, I32 offset); 326 327 // Load u8,u16,i32 uniform with optional byte-count offset. 328 I32 uniform8 (Arg ptr, int offset=0); 329 I32 uniform16(Arg ptr, int offset=0); 330 I32 uniform32(Arg ptr, int offset=0); 331 332 // Load an immediate constant. 333 I32 splat(int n); splat(unsigned u)334 I32 splat(unsigned u) { return this->splat((int)u); } 335 F32 splat(float f); 336 337 // float math, comparisons, etc. 338 F32 add(F32 x, F32 y); 339 F32 sub(F32 x, F32 y); 340 F32 mul(F32 x, F32 y); 341 F32 div(F32 x, F32 y); 342 F32 mad(F32 x, F32 y, F32 z); // x*y+z, often an FMA 343 344 I32 eq (F32 x, F32 y); 345 I32 neq(F32 x, F32 y); 346 I32 lt (F32 x, F32 y); 347 I32 lte(F32 x, F32 y); 348 I32 gt (F32 x, F32 y); 349 I32 gte(F32 x, F32 y); 350 351 I32 to_i32(F32 x); bit_cast(F32 x)352 I32 bit_cast(F32 x) { return {x.id}; } 353 354 // int math, comparisons, etc. 355 I32 add(I32 x, I32 y); 356 I32 sub(I32 x, I32 y); 357 I32 mul(I32 x, I32 y); 358 359 I32 shl(I32 x, int bits); 360 I32 shr(I32 x, int bits); 361 I32 sra(I32 x, int bits); 362 363 I32 eq (I32 x, I32 y); 364 I32 neq(I32 x, I32 y); 365 I32 lt (I32 x, I32 y); 366 I32 lte(I32 x, I32 y); 367 I32 gt (I32 x, I32 y); 368 I32 gte(I32 x, I32 y); 369 370 F32 to_f32(I32 x); bit_cast(I32 x)371 F32 bit_cast(I32 x) { return {x.id}; } 372 373 // Treat each 32-bit lane as a pair of 16-bit ints. 374 I32 add_16x2(I32 x, I32 y); 375 I32 sub_16x2(I32 x, I32 y); 376 I32 mul_16x2(I32 x, I32 y); 377 378 I32 shl_16x2(I32 x, int bits); 379 I32 shr_16x2(I32 x, int bits); 380 I32 sra_16x2(I32 x, int bits); 381 382 I32 eq_16x2(I32 x, I32 y); 383 I32 neq_16x2(I32 x, I32 y); 384 I32 lt_16x2(I32 x, I32 y); 385 I32 lte_16x2(I32 x, I32 y); 386 I32 gt_16x2(I32 x, I32 y); 387 I32 gte_16x2(I32 x, I32 y); 388 389 // Bitwise operations. 390 I32 bit_and (I32 x, I32 y); 391 I32 bit_or (I32 x, I32 y); 392 I32 bit_xor (I32 x, I32 y); 393 I32 bit_clear(I32 x, I32 y); // x & ~y 394 395 I32 select(I32 cond, I32 t, I32 f); // cond ? t : f select(I32 cond,F32 t,F32 f)396 F32 select(I32 cond, F32 t, F32 f) { 397 return this->bit_cast(this->select(cond, this->bit_cast(t) 398 , this->bit_cast(f))); 399 } 400 401 // More complex operations... 402 403 // Shuffle the bytes in x according to each nibble of control, as if 404 // 405 // uint8_t bytes[] = { 406 // 0, 407 // ((uint32_t)x ) & 0xff, 408 // ((uint32_t)x >> 8) & 0xff, 409 // ((uint32_t)x >> 16) & 0xff, 410 // ((uint32_t)x >> 24) & 0xff, 411 // }; 412 // return (uint32_t)bytes[(control >> 0) & 0xf] << 0 413 // | (uint32_t)bytes[(control >> 4) & 0xf] << 8 414 // | (uint32_t)bytes[(control >> 8) & 0xf] << 16 415 // | (uint32_t)bytes[(control >> 12) & 0xf] << 24; 416 // 417 // So, e.g., 418 // - bytes(x, 0x1111) splats the low byte of x to all four bytes 419 // - bytes(x, 0x4321) is x, an identity 420 // - bytes(x, 0x0000) is 0 421 // - bytes(x, 0x0404) transforms an RGBA pixel into an A0A0 bit pattern. 422 I32 bytes (I32 x, int control); 423 424 I32 extract(I32 x, int bits, I32 y); // (x >> bits) & y 425 I32 pack (I32 x, I32 y, int bits); // x | (y << bits), assuming (x & (y << bits)) == 0 426 427 private: 428 struct InstructionHash { 429 template <typename T> HashInstructionHash430 static size_t Hash(T val) { 431 return std::hash<T>{}(val); 432 } operatorInstructionHash433 size_t operator()(const Instruction& inst) const { 434 return Hash((uint8_t)inst.op) 435 ^ Hash(inst.x) 436 ^ Hash(inst.y) 437 ^ Hash(inst.z) 438 ^ Hash(inst.imm) 439 ^ Hash(inst.death) 440 ^ Hash(inst.hoist); 441 } 442 }; 443 444 Val push(Op, Val x, Val y=NA, Val z=NA, int imm=0); 445 bool isZero(Val) const; 446 447 SkTHashMap<Instruction, Val, InstructionHash> fIndex; 448 std::vector<Instruction> fProgram; 449 std::vector<int> fStrides; 450 }; 451 452 using Reg = int; 453 454 class Program { 455 public: 456 struct Instruction { // d = op(x, y, z/imm) 457 Op op; 458 Reg d,x,y; 459 union { Reg z; int imm; }; 460 }; 461 462 Program(const std::vector<Builder::Instruction>& instructions, 463 const std::vector<int> & strides, 464 const char* debug_name); 465 466 Program(); 467 ~Program(); 468 Program(Program&&); 469 Program& operator=(Program&&); 470 Program(const Program&) = delete; 471 Program& operator=(const Program&) = delete; 472 473 void eval(int n, void* args[]) const; 474 475 template <typename... T> eval(int n,T * ...arg)476 void eval(int n, T*... arg) const { 477 SkASSERT(sizeof...(arg) == fStrides.size()); 478 // This nullptr isn't important except that it makes args[] non-empty if you pass none. 479 void* args[] = { (void*)arg..., nullptr }; 480 this->eval(n, args); 481 } 482 instructions()483 std::vector<Instruction> instructions() const { return fInstructions; } nregs()484 int nregs() const { return fRegs; } loop()485 int loop() const { return fLoop; } empty()486 bool empty() const { return fInstructions.empty(); } 487 488 // If this Program has been JITted, drop it, forcing interpreter fallback. 489 void dropJIT(); 490 491 private: 492 void setupInterpreter(const std::vector<Builder::Instruction>&); 493 void setupJIT (const std::vector<Builder::Instruction>&, const char* debug_name); 494 495 bool jit(const std::vector<Builder::Instruction>&, 496 bool hoist, 497 Assembler*) const; 498 499 // Dump jit-*.dump files for perf inject. 500 void dumpJIT(const char* debug_name, size_t size) const; 501 502 std::vector<Instruction> fInstructions; 503 int fRegs = 0; 504 int fLoop = 0; 505 std::vector<int> fStrides; 506 507 void* fJITBuf = nullptr; 508 size_t fJITSize = 0; 509 }; 510 511 // TODO: control flow 512 // TODO: 64-bit values? 513 // TODO: SSE2/SSE4.1, AVX-512F, ARMv8.2 JITs? 514 // TODO: lower to LLVM or WebASM for comparison? 515 } 516 517 #endif//SkVM_DEFINED 518