/*
 * Copyright 2019 Google LLC
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#include "include/private/SkSpinlock.h"
#include "include/private/SkTFitsIn.h"
#include "include/private/SkThreadID.h"
#include "include/private/SkVx.h"
#include "src/core/SkCpu.h"
#include "src/core/SkVM.h"
#include <string.h>
#if defined(SKVM_JIT)
    #include <sys/mman.h>
#endif

namespace skvm {

    Program Builder::done(const char* debug_name) {
        // Basic liveness analysis:
        // an instruction is live until all live instructions that need its input have retired.
        for (Val id = fProgram.size(); id --> 0; ) {
            Instruction& inst = fProgram[id];
            // All side-effect-only instructions (stores) are live.
            if (inst.op <= Op::store32) {
                inst.death = id;
            }
            // The arguments of a live instruction must live until at least that instruction.
            if (inst.death != 0) {
                // Notice how we're walking backward, storing the latest instruction in death.
                if (inst.x != NA && fProgram[inst.x].death == 0) { fProgram[inst.x].death = id; }
                if (inst.y != NA && fProgram[inst.y].death == 0) { fProgram[inst.y].death = id; }
                if (inst.z != NA && fProgram[inst.z].death == 0) { fProgram[inst.z].death = id; }
            }
        }

        // Mark which values don't depend on the loop and can be hoisted.
        for (Val id = 0; id < (Val)fProgram.size(); id++) {
            Builder::Instruction& inst = fProgram[id];

            // Varying loads (and gathers) and stores cannot be hoisted out of the loop.
            if (inst.op <= Op::gather32) {
                inst.hoist = false;
            }

            // If any of an instruction's inputs can't be hoisted, it can't be hoisted itself.
            if (inst.hoist) {
                if (inst.x != NA) { inst.hoist &= fProgram[inst.x].hoist; }
                if (inst.y != NA) { inst.hoist &= fProgram[inst.y].hoist; }
                if (inst.z != NA) { inst.hoist &= fProgram[inst.z].hoist; }
            }

            // Any hoisted values used inside the loop need to live forever.
            if (!inst.hoist) {
                auto make_immortal = [&](Val arg) {
                    if (fProgram[arg].death != 0) {
                        fProgram[arg].death = (Val)fProgram.size();
                    }
                };
                if (inst.x != NA && fProgram[inst.x].hoist) { make_immortal(inst.x); }
                if (inst.y != NA && fProgram[inst.y].hoist) { make_immortal(inst.y); }
                if (inst.z != NA && fProgram[inst.z].hoist) { make_immortal(inst.z); }
            }
        }

        return {fProgram, fStrides, debug_name};
    }

    static bool operator==(const Builder::Instruction& a, const Builder::Instruction& b) {
        return a.op    == b.op
            && a.x     == b.x
            && a.y     == b.y
            && a.z     == b.z
            && a.imm   == b.imm
            && a.death == b.death
            && a.hoist == b.hoist;
    }

    // Most instructions produce a value and return it by ID,
    // the value-producing instruction's own index in the program vector.
    Val Builder::push(Op op, Val x, Val y, Val z, int imm) {
        Instruction inst{op, x, y, z, imm, /*death=*/0, /*hoist=*/true};

        // Basic common subexpression elimination:
        // if we've already seen this exact Instruction, use it instead of creating a new one.
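        // (For example, a caller who builds mul(x,y) twice with the same x and y should get
        //  the same Val back both times; the second push() finds the identical Instruction
        //  in fIndex below and reuses its id rather than growing the program.)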
        if (Val* id = fIndex.find(inst)) {
            return *id;
        }
        Val id = static_cast<Val>(fProgram.size());
        fProgram.push_back(inst);
        fIndex.set(inst, id);
        return id;
    }

    bool Builder::isZero(Val id) const {
        return fProgram[id].op  == Op::splat
            && fProgram[id].imm == 0;
    }

    Arg Builder::arg(int stride) {
        int ix = (int)fStrides.size();
        fStrides.push_back(stride);
        return {ix};
    }

    void Builder::store8 (Arg ptr, I32 val) { (void)this->push(Op::store8 , val.id,NA,NA, ptr.ix); }
    void Builder::store16(Arg ptr, I32 val) { (void)this->push(Op::store16, val.id,NA,NA, ptr.ix); }
    void Builder::store32(Arg ptr, I32 val) { (void)this->push(Op::store32, val.id,NA,NA, ptr.ix); }

    I32 Builder::load8 (Arg ptr) { return {this->push(Op::load8 , NA,NA,NA, ptr.ix) }; }
    I32 Builder::load16(Arg ptr) { return {this->push(Op::load16, NA,NA,NA, ptr.ix) }; }
    I32 Builder::load32(Arg ptr) { return {this->push(Op::load32, NA,NA,NA, ptr.ix) }; }

    I32 Builder::gather8 (Arg ptr, I32 offset) {
        return {this->push(Op::gather8 , offset.id,NA,NA, ptr.ix)};
    }
    I32 Builder::gather16(Arg ptr, I32 offset) {
        return {this->push(Op::gather16, offset.id,NA,NA, ptr.ix)};
    }
    I32 Builder::gather32(Arg ptr, I32 offset) {
        return {this->push(Op::gather32, offset.id,NA,NA, ptr.ix)};
    }

    I32 Builder::uniform8(Arg ptr, int offset) {
        return {this->push(Op::uniform8, NA,NA,NA, ptr.ix | (offset<<16))};
    }
    I32 Builder::uniform16(Arg ptr, int offset) {
        return {this->push(Op::uniform16, NA,NA,NA, ptr.ix | (offset<<16))};
    }
    I32 Builder::uniform32(Arg ptr, int offset) {
        return {this->push(Op::uniform32, NA,NA,NA, ptr.ix | (offset<<16))};
    }

    // The two splat() functions are just syntax sugar over splatting a 4-byte bit pattern.
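    // E.g. splat(1.0f) builds the same instruction as splat(0x3f800000), since 0x3f800000
    // is the bit pattern of 1.0f; only the Builder-side type of the returned value differs.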
    I32 Builder::splat(int   n) { return {this->push(Op::splat, NA,NA,NA, n) }; }
    F32 Builder::splat(float f) {
        int bits;
        memcpy(&bits, &f, 4);
        return {this->push(Op::splat, NA,NA,NA, bits)};
    }

    F32 Builder::add(F32 x, F32 y       ) { return {this->push(Op::add_f32, x.id, y.id)}; }
    F32 Builder::sub(F32 x, F32 y       ) { return {this->push(Op::sub_f32, x.id, y.id)}; }
    F32 Builder::mul(F32 x, F32 y       ) { return {this->push(Op::mul_f32, x.id, y.id)}; }
    F32 Builder::div(F32 x, F32 y       ) { return {this->push(Op::div_f32, x.id, y.id)}; }
    F32 Builder::mad(F32 x, F32 y, F32 z) {
        if (this->isZero(z.id)) {
            return this->mul(x,y);
        }
        return {this->push(Op::mad_f32, x.id, y.id, z.id)};
    }

    I32 Builder::add(I32 x, I32 y) { return {this->push(Op::add_i32, x.id, y.id)}; }
    I32 Builder::sub(I32 x, I32 y) { return {this->push(Op::sub_i32, x.id, y.id)}; }
    I32 Builder::mul(I32 x, I32 y) { return {this->push(Op::mul_i32, x.id, y.id)}; }

    I32 Builder::add_16x2(I32 x, I32 y) { return {this->push(Op::add_i16x2, x.id, y.id)}; }
    I32 Builder::sub_16x2(I32 x, I32 y) { return {this->push(Op::sub_i16x2, x.id, y.id)}; }
    I32 Builder::mul_16x2(I32 x, I32 y) { return {this->push(Op::mul_i16x2, x.id, y.id)}; }

    I32 Builder::shl(I32 x, int bits) { return {this->push(Op::shl_i32, x.id,NA,NA, bits)}; }
    I32 Builder::shr(I32 x, int bits) { return {this->push(Op::shr_i32, x.id,NA,NA, bits)}; }
    I32 Builder::sra(I32 x, int bits) { return {this->push(Op::sra_i32, x.id,NA,NA, bits)}; }

    I32 Builder::shl_16x2(I32 x, int bits) { return {this->push(Op::shl_i16x2, x.id,NA,NA, bits)}; }
    I32 Builder::shr_16x2(I32 x, int bits) { return {this->push(Op::shr_i16x2, x.id,NA,NA, bits)}; }
    I32 Builder::sra_16x2(I32 x, int bits) { return {this->push(Op::sra_i16x2, x.id,NA,NA, bits)}; }

    I32 Builder:: eq(F32 x, F32 y) { return {this->push(Op:: eq_f32, x.id, y.id)}; }
    I32 Builder::neq(F32 x, F32 y) { return {this->push(Op::neq_f32, x.id, y.id)}; }
    I32 Builder:: lt(F32 x, F32 y) { return {this->push(Op:: lt_f32, x.id, y.id)}; }
    I32 Builder::lte(F32 x, F32 y) { return {this->push(Op::lte_f32, x.id, y.id)}; }
    I32 Builder:: gt(F32 x, F32 y) { return {this->push(Op:: gt_f32, x.id, y.id)}; }
    I32 Builder::gte(F32 x, F32 y) { return {this->push(Op::gte_f32, x.id, y.id)}; }

    I32 Builder:: eq(I32 x, I32 y) { return {this->push(Op:: eq_i32, x.id, y.id)}; }
    I32 Builder::neq(I32 x, I32 y) { return {this->push(Op::neq_i32, x.id, y.id)}; }
    I32 Builder:: lt(I32 x, I32 y) { return {this->push(Op:: lt_i32, x.id, y.id)}; }
    I32 Builder::lte(I32 x, I32 y) { return {this->push(Op::lte_i32, x.id, y.id)}; }
    I32 Builder:: gt(I32 x, I32 y) { return {this->push(Op:: gt_i32, x.id, y.id)}; }
    I32 Builder::gte(I32 x, I32 y) { return {this->push(Op::gte_i32, x.id, y.id)}; }

    I32 Builder:: eq_16x2(I32 x, I32 y) { return {this->push(Op:: eq_i16x2, x.id, y.id)}; }
    I32 Builder::neq_16x2(I32 x, I32 y) { return {this->push(Op::neq_i16x2, x.id, y.id)}; }
    I32 Builder:: lt_16x2(I32 x, I32 y) { return {this->push(Op:: lt_i16x2, x.id, y.id)}; }
    I32 Builder::lte_16x2(I32 x, I32 y) { return {this->push(Op::lte_i16x2, x.id, y.id)}; }
    I32 Builder:: gt_16x2(I32 x, I32 y) { return {this->push(Op:: gt_i16x2, x.id, y.id)}; }
    I32 Builder::gte_16x2(I32 x, I32 y) { return {this->push(Op::gte_i16x2, x.id, y.id)}; }

    I32 Builder::bit_and  (I32 x, I32 y) { return {this->push(Op::bit_and  , x.id, y.id)}; }
    I32 Builder::bit_or   (I32 x, I32 y) { return {this->push(Op::bit_or   , x.id, y.id)}; }
    I32 Builder::bit_xor  (I32 x, I32 y) { return {this->push(Op::bit_xor  , x.id, y.id)}; }
    I32 Builder::bit_clear(I32 x, I32 y) { return {this->push(Op::bit_clear, x.id, y.id)}; }
    I32 Builder::select(I32 x, I32 y, I32 z) { return {this->push(Op::select, x.id, y.id, z.id)}; }

    I32 Builder::extract(I32 x, int bits, I32 y) {
        return {this->push(Op::extract, x.id,y.id,NA, bits)};
    }

    I32 Builder::pack(I32 x, I32 y, int bits) {
        return {this->push(Op::pack, x.id,y.id,NA, bits)};
    }

    I32 Builder::bytes(I32 x, int control) {
        return {this->push(Op::bytes, x.id,NA,NA, control)};
    }

    F32 Builder::to_f32(I32 x) { return {this->push(Op::to_f32, x.id)}; }
    I32 Builder::to_i32(F32 x) { return {this->push(Op::to_i32, x.id)}; }

    // ~~~~ Program::eval() and co. ~~~~ //

    // Handy references for x86-64 instruction encoding:
    // https://wiki.osdev.org/X86-64_Instruction_Encoding
    // https://www-user.tu-chemnitz.de/~heha/viewchm.php/hs/x86.chm/x64.htm
    // https://www-user.tu-chemnitz.de/~heha/viewchm.php/hs/x86.chm/x86.htm
    // http://ref.x86asm.net/coder64.html

    // Used for ModRM / immediate instruction encoding.
    static uint8_t _233(int a, int b, int c) {
        return (a & 3) << 6
             | (b & 7) << 3
             | (c & 7) << 0;
    }

    // ModRM byte encodes the arguments of an opcode.
    enum class Mod { Indirect, OneByteImm, FourByteImm, Direct };
    static uint8_t mod_rm(Mod mod, int reg, int rm) {
        return _233((int)mod, reg, rm);
    }

    static Mod mod(int imm) {
        if (imm == 0)               { return Mod::Indirect; }
        if (SkTFitsIn<int8_t>(imm)) { return Mod::OneByteImm; }
        return Mod::FourByteImm;
    }

    static int imm_bytes(Mod mod) {
        switch (mod) {
            case Mod::Indirect:    return 0;
            case Mod::OneByteImm:  return 1;
            case Mod::FourByteImm: return 4;
            case Mod::Direct: SkUNREACHABLE;
        }
        SkUNREACHABLE;
    }

#if 0
    // SIB byte encodes a memory address, base + (index * scale).
    enum class Scale { One, Two, Four, Eight };
    static uint8_t sib(Scale scale, int index, int base) {
        return _233((int)scale, index, base);
    }
#endif

    // The REX prefix is used to extend most old 32-bit instructions to 64-bit.
    static uint8_t rex(bool W,   // If set, operation is 64-bit, otherwise default, usually 32-bit.
                       bool R,   // Extra top bit to select ModRM reg, registers 8-15.
                       bool X,   // Extra top bit for SIB index register.
                       bool B) { // Extra top bit for SIB base or ModRM rm register.
        return 0b01000000   // Fixed 0100 for top four bits.
             | (W << 3)
             | (R << 2)
             | (X << 1)
             | (B << 0);
    }


    // The VEX prefix extends SSE operations to AVX.  Used generally, even with XMM.
    struct VEX {
        int     len;
        uint8_t bytes[3];
    };

    static VEX vex(bool  WE,   // Like REX W for int operations, or opcode extension for float?
                   bool   R,   // Same as REX R.  Pass high bit of dst register, dst>>3.
                   bool   X,   // Same as REX X.
                   bool   B,   // Same as REX B.  Pass y>>3 for 3-arg ops, x>>3 for 2-arg.
                   int  map,   // SSE opcode map selector: 0x0f, 0x380f, 0x3a0f.
                   int vvvv,   // 4-bit second operand register.  Pass our x for 3-arg ops.
                   bool   L,   // Set for 256-bit ymm operations, off for 128-bit xmm.
                   int   pp) { // SSE mandatory prefix: 0x66, 0xf3, 0xf2, else none.

        // Pack x86 opcode map selector to 5-bit VEX encoding.
        map = [map]{
            switch (map) {
                case   0x0f: return 0b00001;
                case 0x380f: return 0b00010;
                case 0x3a0f: return 0b00011;
                // Several more cases only used by XOP / TBM.
            }
            SkUNREACHABLE;
        }();

        // Pack mandatory SSE opcode prefix byte to 2-bit VEX encoding.
        pp = [pp]{
            switch (pp) {
                case 0x66: return 0b01;
                case 0xf3: return 0b10;
                case 0xf2: return 0b11;
            }
            return 0b00;
        }();

        VEX vex = {0, {0,0,0}};
        if (X == 0 && B == 0 && WE == 0 && map == 0b00001) {
            // With these conditions met, we can optionally compress VEX to 2-byte.
            vex.len = 2;
            vex.bytes[0] = 0xc5;
            vex.bytes[1] = (pp      &  3) << 0
                         | (L       &  1) << 2
                         | (~vvvv   & 15) << 3
                         | (~(int)R &  1) << 7;
        } else {
            // We could use this 3-byte VEX prefix all the time if we like.
            vex.len = 3;
            vex.bytes[0] = 0xc4;
            vex.bytes[1] = (map     & 31) << 0
                         | (~(int)B &  1) << 5
                         | (~(int)X &  1) << 6
                         | (~(int)R &  1) << 7;
            vex.bytes[2] = (pp    &  3) << 0
                         | (L     &  1) << 2
                         | (~vvvv & 15) << 3
                         | (WE    &  1) << 7;
        }
        return vex;
    }

    Assembler::Assembler(void* buf) : fCode((uint8_t*)buf), fCurr(fCode), fSize(0) {}

    size_t Assembler::size() const { return fSize; }

    void Assembler::bytes(const void* p, int n) {
        if (fCurr) {
            memcpy(fCurr, p, n);
            fCurr += n;
        }
        fSize += n;
    }

    void Assembler::byte(uint8_t b) { this->bytes(&b, 1); }
    void Assembler::word(uint32_t w) { this->bytes(&w, 4); }

    void Assembler::align(int mod) {
        while (this->size() % mod) {
            this->byte(0x00);
        }
    }

    void Assembler::vzeroupper() {
        this->byte(0xc5);
        this->byte(0xf8);
        this->byte(0x77);
    }
    void Assembler::ret() { this->byte(0xc3); }

    // Common instruction building for 64-bit opcodes with an immediate argument.
    void Assembler::op(int opcode, int opcode_ext, GP64 dst, int imm) {
        opcode |= 0b0000'0001;   // low bit set for 64-bit operands
        opcode |= 0b1000'0000;   // top bit set for instructions with any immediate

        int imm_bytes = 4;
        if (SkTFitsIn<int8_t>(imm)) {
            imm_bytes = 1;
            opcode |= 0b0000'0010;   // second bit set for 8-bit immediate, else 32-bit.
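            // (Worked example: with this bit set, add(rax, 1) should assemble to
            //  48 83 c0 01 -- REX.W, opcode 0x83 /0, ModRM 0xc0, then one immediate byte.)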
        }

        this->byte(rex(1,0,0,dst>>3));
        this->byte(opcode);
        this->byte(mod_rm(Mod::Direct, opcode_ext, dst&7));
        this->bytes(&imm, imm_bytes);
    }

    void Assembler::add(GP64 dst, int imm) { this->op(0,0b000, dst,imm); }
    void Assembler::sub(GP64 dst, int imm) { this->op(0,0b101, dst,imm); }
    void Assembler::cmp(GP64 reg, int imm) { this->op(0,0b111, reg,imm); }

    void Assembler::op(int prefix, int map, int opcode, Ymm dst, Ymm x, Ymm y, bool W/*=false*/) {
        VEX v = vex(W, dst>>3, 0, y>>3,
                    map, x, 1/*ymm, not xmm*/, prefix);
        this->bytes(v.bytes, v.len);
        this->byte(opcode);
        this->byte(mod_rm(Mod::Direct, dst&7, y&7));
    }

    void Assembler::vpaddd (Ymm dst, Ymm x, Ymm y) { this->op(0x66,  0x0f,0xfe, dst,x,y); }
    void Assembler::vpsubd (Ymm dst, Ymm x, Ymm y) { this->op(0x66,  0x0f,0xfa, dst,x,y); }
    void Assembler::vpmulld(Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x380f,0x40, dst,x,y); }

    void Assembler::vpsubw (Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x0f,0xf9, dst,x,y); }
    void Assembler::vpmullw(Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x0f,0xd5, dst,x,y); }

    void Assembler::vpand (Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x0f,0xdb, dst,x,y); }
    void Assembler::vpor  (Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x0f,0xeb, dst,x,y); }
    void Assembler::vpxor (Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x0f,0xef, dst,x,y); }
    void Assembler::vpandn(Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x0f,0xdf, dst,x,y); }

    void Assembler::vaddps(Ymm dst, Ymm x, Ymm y) { this->op(0,0x0f,0x58, dst,x,y); }
    void Assembler::vsubps(Ymm dst, Ymm x, Ymm y) { this->op(0,0x0f,0x5c, dst,x,y); }
    void Assembler::vmulps(Ymm dst, Ymm x, Ymm y) { this->op(0,0x0f,0x59, dst,x,y); }
    void Assembler::vdivps(Ymm dst, Ymm x, Ymm y) { this->op(0,0x0f,0x5e, dst,x,y); }

    void Assembler::vfmadd132ps(Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x380f,0x98, dst,x,y); }
    void Assembler::vfmadd213ps(Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x380f,0xa8, dst,x,y); }
    void Assembler::vfmadd231ps(Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x380f,0xb8, dst,x,y); }

    void Assembler::vpackusdw(Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x380f,0x2b, dst,x,y); }
    void Assembler::vpackuswb(Ymm dst, Ymm x, Ymm y) { this->op(0x66,  0x0f,0x67, dst,x,y); }

    void Assembler::vpcmpeqd(Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x0f,0x76, dst,x,y); }
    void Assembler::vpcmpgtd(Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x0f,0x66, dst,x,y); }

    void Assembler::vpblendvb(Ymm dst, Ymm x, Ymm y, Ymm z) {
        int prefix = 0x66,
            map    = 0x3a0f,
            opcode = 0x4c;
        VEX v = vex(0, dst>>3, 0, y>>3,
                    map, x, /*ymm?*/1, prefix);
        this->bytes(v.bytes, v.len);
        this->byte(opcode);
        this->byte(mod_rm(Mod::Direct, dst&7, y&7));
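        // The fourth register operand z rides along in the top nibble of a trailing
        // immediate byte (VEX's /is4 encoding), emitted next.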
        this->byte(z << 4);
    }

    // dst = x op /opcode_ext imm
    void Assembler::op(int prefix, int map, int opcode, int opcode_ext, Ymm dst, Ymm x, int imm) {
        // This is a little weird, but if we pass the opcode_ext as if it were the dst register,
        // the dst register as if x, and the x register as if y, all the bits end up where we want.
        this->op(prefix, map, opcode, (Ymm)opcode_ext,dst,x);
        this->byte(imm);
    }

    void Assembler::vpslld(Ymm dst, Ymm x, int imm) { this->op(0x66,0x0f,0x72,6, dst,x,imm); }
    void Assembler::vpsrld(Ymm dst, Ymm x, int imm) { this->op(0x66,0x0f,0x72,2, dst,x,imm); }
    void Assembler::vpsrad(Ymm dst, Ymm x, int imm) { this->op(0x66,0x0f,0x72,4, dst,x,imm); }

    void Assembler::vpsrlw(Ymm dst, Ymm x, int imm) { this->op(0x66,0x0f,0x71,2, dst,x,imm); }


    void Assembler::vpermq(Ymm dst, Ymm x, int imm) {
        // A bit unusual among the instructions we use, this is a 64-bit operation, so we set W.
        bool W = true;
        this->op(0x66,0x3a0f,0x00, dst,x,W);
        this->byte(imm);
    }

    void Assembler::vmovdqa(Ymm dst, Ymm src) { this->op(0x66,0x0f,0x6f, dst,src); }

    void Assembler::vcvtdq2ps (Ymm dst, Ymm x) { this->op(0,   0x0f,0x5b, dst,x); }
    void Assembler::vcvttps2dq(Ymm dst, Ymm x) { this->op(0xf3,0x0f,0x5b, dst,x); }

    Assembler::Label Assembler::here() {
        return { (int)this->size(), Label::None, {} };
    }

    int Assembler::disp19(Label* l) {
        SkASSERT(l->kind == Label::None ||
                 l->kind == Label::ARMDisp19);
        l->kind = Label::ARMDisp19;
        l->references.push_back(here().offset);
        // ARM 19-bit instruction count, from the beginning of this instruction.
        return (l->offset - here().offset) / 4;
    }

    int Assembler::disp32(Label* l) {
        SkASSERT(l->kind == Label::None ||
                 l->kind == Label::X86Disp32);
        l->kind = Label::X86Disp32;
        l->references.push_back(here().offset);
        // x86 32-bit byte count, from the end of this instruction.
        return l->offset - (here().offset + 4);
    }

    void Assembler::op(int prefix, int map, int opcode, Ymm dst, Ymm x, Label* l) {
        // IP-relative addressing uses Mod::Indirect with the R/M encoded as-if rbp or r13.
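        // (In 64-bit mode, Mod::Indirect with R/M 0b101 doesn't mean [rbp]; it selects
        //  RIP-relative addressing with a 32-bit displacement, which is just what we want.)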
        const int rip = rbp;

        VEX v = vex(0, dst>>3, 0, rip>>3,
                    map, x, /*ymm?*/1, prefix);
        this->bytes(v.bytes, v.len);
        this->byte(opcode);
        this->byte(mod_rm(Mod::Indirect, dst&7, rip&7));
        this->word(this->disp32(l));
    }

    void Assembler::vpshufb(Ymm dst, Ymm x, Label* l) { this->op(0x66,0x380f,0x00, dst,x,l); }

    void Assembler::vbroadcastss(Ymm dst, Label* l) { this->op(0x66,0x380f,0x18, dst, (Ymm)0, l); }
    void Assembler::vbroadcastss(Ymm dst, Xmm src)  { this->op(0x66,0x380f,0x18, dst, (Ymm)src); }
    void Assembler::vbroadcastss(Ymm dst, GP64 ptr, int off) {
        int prefix = 0x66,
            map    = 0x380f,
            opcode = 0x18;
        VEX v = vex(0, dst>>3, 0, ptr>>3,
                    map, 0, /*ymm?*/1, prefix);
        this->bytes(v.bytes, v.len);
        this->byte(opcode);

        this->byte(mod_rm(mod(off), dst&7, ptr&7));
        this->bytes(&off, imm_bytes(mod(off)));
    }

    void Assembler::jump(uint8_t condition, Label* l) {
        // These conditional jumps can be either 2 bytes (short) or 6 bytes (near):
        //    7?     one-byte-disp
        //    0F 8?  four-byte-disp
        // We always use the near displacement to make updating labels simpler (no resizing).
        this->byte(0x0f);
        this->byte(condition);
        this->word(this->disp32(l));
    }
    void Assembler::je (Label* l) { this->jump(0x84, l); }
    void Assembler::jne(Label* l) { this->jump(0x85, l); }
    void Assembler::jl (Label* l) { this->jump(0x8c, l); }

    void Assembler::jmp(Label* l) {
        // Like above in jump(), we could use an 8-bit displacement here, but we always use 32-bit.
        this->byte(0xe9);
        this->word(this->disp32(l));
    }

    void Assembler::load_store(int prefix, int map, int opcode, Ymm ymm, GP64 ptr) {
        VEX v = vex(0, ymm>>3, 0, ptr>>3,
                    map, 0, /*ymm?*/1, prefix);
        this->bytes(v.bytes, v.len);
        this->byte(opcode);
        this->byte(mod_rm(Mod::Indirect, ymm&7, ptr&7));
    }

    void Assembler::vmovups  (Ymm dst, GP64 src) { this->load_store(0   ,  0x0f,0x10, dst,src); }
    void Assembler::vpmovzxwd(Ymm dst, GP64 src) { this->load_store(0x66,0x380f,0x33, dst,src); }
    void Assembler::vpmovzxbd(Ymm dst, GP64 src) { this->load_store(0x66,0x380f,0x31, dst,src); }

    void Assembler::vmovups  (GP64 dst, Ymm src) { this->load_store(0   ,  0x0f,0x11, src,dst); }
    void Assembler::vmovups  (GP64 dst, Xmm src) {
        // Same as vmovups(GP64,YMM) and load_store() except ymm? is 0.
        int prefix = 0,
            map    = 0x0f,
            opcode = 0x11;
        VEX v = vex(0, src>>3, 0, dst>>3,
                    map, 0, /*ymm?*/0, prefix);
        this->bytes(v.bytes, v.len);
        this->byte(opcode);
        this->byte(mod_rm(Mod::Indirect, src&7, dst&7));
    }

    void Assembler::vmovq(GP64 dst, Xmm src) {
        int prefix = 0x66,
            map    = 0x0f,
            opcode = 0xd6;
        VEX v = vex(0, src>>3, 0, dst>>3,
                    map, 0, /*ymm?*/0, prefix);
        this->bytes(v.bytes, v.len);
        this->byte(opcode);
        this->byte(mod_rm(Mod::Indirect, src&7, dst&7));
    }

    void Assembler::vmovd(GP64 dst, Xmm src) {
        int prefix = 0x66,
            map    = 0x0f,
            opcode = 0x7e;
        VEX v = vex(0, src>>3, 0, dst>>3,
                    map, 0, /*ymm?*/0, prefix);
        this->bytes(v.bytes, v.len);
        this->byte(opcode);
        this->byte(mod_rm(Mod::Indirect, src&7, dst&7));
    }

    void Assembler::vmovd_direct(GP64 dst, Xmm src) {
        int prefix = 0x66,
            map    = 0x0f,
            opcode = 0x7e;
        VEX v = vex(0, src>>3, 0, dst>>3,
                    map, 0, /*ymm?*/0, prefix);
        this->bytes(v.bytes, v.len);
        this->byte(opcode);
        this->byte(mod_rm(Mod::Direct, src&7, dst&7));
    }

    void Assembler::vmovd(Xmm dst, GP64 src) {
        int prefix = 0x66,
            map    = 0x0f,
            opcode = 0x6e;
        VEX v = vex(0, dst>>3, 0, src>>3,
                    map, 0, /*ymm?*/0, prefix);
        this->bytes(v.bytes, v.len);
        this->byte(opcode);
        this->byte(mod_rm(Mod::Indirect, dst&7, src&7));
    }

    void Assembler::vmovd_direct(Xmm dst, GP64 src) {
        int prefix = 0x66,
            map    = 0x0f,
            opcode = 0x6e;
        VEX v = vex(0, dst>>3, 0, src>>3,
                    map, 0, /*ymm?*/0, prefix);
        this->bytes(v.bytes, v.len);
        this->byte(opcode);
        this->byte(mod_rm(Mod::Direct, dst&7, src&7));
    }

    void Assembler::movzbl(GP64 dst, GP64 src, int off) {
        if ((dst>>3) || (src>>3)) {
            this->byte(rex(0,dst>>3,0,src>>3));
        }
        this->byte(0x0f);
        this->byte(0xb6);
        this->byte(mod_rm(mod(off), dst&7, src&7));
        this->bytes(&off, imm_bytes(mod(off)));
    }

    void Assembler::movb(GP64 dst, GP64 src) {
        if ((dst>>3) || (src>>3)) {
            this->byte(rex(0,src>>3,0,dst>>3));
        }
        this->byte(0x88);
        this->byte(mod_rm(Mod::Indirect, src&7, dst&7));
    }

    void Assembler::vpinsrw(Xmm dst, Xmm src, GP64 ptr, int imm) {
        int prefix = 0x66,
            map    = 0x0f,
            opcode = 0xc4;
        VEX v = vex(0, dst>>3, 0, ptr>>3,
                    map, src, /*ymm?*/0, prefix);
        this->bytes(v.bytes, v.len);
        this->byte(opcode);
        this->byte(mod_rm(Mod::Indirect, dst&7, ptr&7));
        this->byte(imm);
    }

    void Assembler::vpinsrb(Xmm dst, Xmm src, GP64 ptr, int imm) {
        int prefix = 0x66,
            map    = 0x3a0f,
            opcode = 0x20;
        VEX v = vex(0, dst>>3, 0, ptr>>3,
                    map, src, /*ymm?*/0, prefix);
        this->bytes(v.bytes, v.len);
        this->byte(opcode);
        this->byte(mod_rm(Mod::Indirect, dst&7, ptr&7));
        this->byte(imm);
    }

    void Assembler::vpextrw(GP64 ptr, Xmm src, int imm) {
        int prefix = 0x66,
            map    = 0x3a0f,
            opcode = 0x15;

        VEX v = vex(0, src>>3, 0, ptr>>3,
                    map, 0, /*ymm?*/0, prefix);
        this->bytes(v.bytes, v.len);
        this->byte(opcode);
        this->byte(mod_rm(Mod::Indirect, src&7, ptr&7));
        this->byte(imm);
    }
    void Assembler::vpextrb(GP64 ptr, Xmm src, int imm) {
        int prefix = 0x66,
            map    = 0x3a0f,
            opcode = 0x14;

        VEX v = vex(0, src>>3, 0, ptr>>3,
                    map, 0, /*ymm?*/0, prefix);
        this->bytes(v.bytes, v.len);
        this->byte(opcode);
        this->byte(mod_rm(Mod::Indirect, src&7, ptr&7));
        this->byte(imm);
    }

    // https://static.docs.arm.com/ddi0596/a/DDI_0596_ARM_a64_instruction_set_architecture.pdf

    static int operator"" _mask(unsigned long long bits) { return (1<<(int)bits)-1; }

    void Assembler::op(uint32_t hi, V m, uint32_t lo, V n, V d) {
        this->word( (hi & 11_mask) << 21
                  | (m  &  5_mask) << 16
                  | (lo &  6_mask) << 10
                  | (n  &  5_mask) <<  5
                  | (d  &  5_mask) <<  0);
    }

    void Assembler::and16b(V d, V n, V m) { this->op(0b0'1'0'01110'00'1, m, 0b00011'1, n, d); }
    void Assembler::orr16b(V d, V n, V m) { this->op(0b0'1'0'01110'10'1, m, 0b00011'1, n, d); }
    void Assembler::eor16b(V d, V n, V m) { this->op(0b0'1'1'01110'00'1, m, 0b00011'1, n, d); }
    void Assembler::bic16b(V d, V n, V m) { this->op(0b0'1'0'01110'01'1, m, 0b00011'1, n, d); }

    void Assembler::add4s(V d, V n, V m) { this->op(0b0'1'0'01110'10'1, m, 0b10000'1, n, d); }
    void Assembler::sub4s(V d, V n, V m) { this->op(0b0'1'1'01110'10'1, m, 0b10000'1, n, d); }
    void Assembler::mul4s(V d, V n, V m) { this->op(0b0'1'0'01110'10'1, m, 0b10011'1, n, d); }

    void Assembler::sub8h(V d, V n, V m) { this->op(0b0'1'1'01110'01'1, m, 0b10000'1, n, d); }
    void Assembler::mul8h(V d, V n, V m) { this->op(0b0'1'0'01110'01'1, m, 0b10011'1, n, d); }

    void Assembler::fadd4s(V d, V n, V m) { this->op(0b0'1'0'01110'0'0'1, m, 0b11010'1, n, d); }
    void Assembler::fsub4s(V d, V n, V m) { this->op(0b0'1'0'01110'1'0'1, m, 0b11010'1, n, d); }
    void Assembler::fmul4s(V d, V n, V m) { this->op(0b0'1'1'01110'0'0'1, m, 0b11011'1, n, d); }
    void Assembler::fdiv4s(V d, V n, V m) { this->op(0b0'1'1'01110'0'0'1, m, 0b11111'1, n, d); }

    void Assembler::fmla4s(V d, V n, V m) { this->op(0b0'1'0'01110'0'0'1, m, 0b11001'1, n, d); }

    void Assembler::tbl(V d, V n, V m) { this->op(0b0'1'001110'00'0, m, 0b0'00'0'00, n, d); }

    void Assembler::op(uint32_t op22, int imm, V n, V d) {
        this->word( (op22 & 22_mask) << 10
                  | imm              << 16   // imm is embedded inside op, bit size depends on op
                  | (n    &  5_mask) <<  5
                  | (d    &  5_mask) <<  0);
    }

    void Assembler::sli4s(V d, V n, int imm) {
        this->op(0b0'1'1'011110'0100'000'01010'1,    ( imm&31), n, d);
    }
    void Assembler::shl4s(V d, V n, int imm) {
        this->op(0b0'1'0'011110'0100'000'01010'1,    ( imm&31), n, d);
    }
    void Assembler::sshr4s(V d, V n, int imm) {
        this->op(0b0'1'0'011110'0100'000'00'0'0'0'1, (-imm&31), n, d);
    }
    void Assembler::ushr4s(V d, V n, int imm) {
        this->op(0b0'1'1'011110'0100'000'00'0'0'0'1, (-imm&31), n, d);
    }
    void Assembler::ushr8h(V d, V n, int imm) {
        this->op(0b0'1'1'011110'0010'000'00'0'0'0'1, (-imm&15), n, d);
    }
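    // (A note on those right-shift encodings: AArch64 packs a right shift into immh:immb as
    //  2*element_size - shift, so with the immh base bits baked into each opcode constant above,
    //  (-imm&31) supplies 64-imm for 32-bit lanes and (-imm&15) supplies 32-imm for 16-bit lanes.)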
    void Assembler::scvtf4s (V d, V n) { this->op(0b0'1'0'01110'0'0'10000'11101'10,  n,d); }
    void Assembler::fcvtzs4s(V d, V n) { this->op(0b0'1'0'01110'1'0'10000'1101'1'10, n,d); }

    void Assembler::xtns2h(V d, V n) { this->op(0b0'0'0'01110'01'10000'10010'10, n,d); }
    void Assembler::xtnh2b(V d, V n) { this->op(0b0'0'0'01110'00'10000'10010'10, n,d); }

    void Assembler::uxtlb2h(V d, V n) { this->op(0b0'0'1'011110'0001'000'10100'1, n,d); }
    void Assembler::uxtlh2s(V d, V n) { this->op(0b0'0'1'011110'0010'000'10100'1, n,d); }

    void Assembler::ret(X n) {
        this->word(0b1101011'0'0'10'11111'0000'0'0 << 10
                  | (n & 5_mask) << 5);
    }

    void Assembler::add(X d, X n, int imm12) {
        this->word(0b1'0'0'10001'00      << 22
                  | (imm12 & 12_mask) << 10
                  | (n     &  5_mask) <<  5
                  | (d     &  5_mask) <<  0);
    }
    void Assembler::sub(X d, X n, int imm12) {
        this->word( 0b1'1'0'10001'00     << 22
                  | (imm12 & 12_mask) << 10
                  | (n     &  5_mask) <<  5
                  | (d     &  5_mask) <<  0);
    }
    void Assembler::subs(X d, X n, int imm12) {
        this->word( 0b1'1'1'10001'00     << 22
                  | (imm12 & 12_mask) << 10
                  | (n     &  5_mask) <<  5
                  | (d     &  5_mask) <<  0);
    }

    void Assembler::b(Condition cond, Label* l) {
        const int imm19 = this->disp19(l);
        this->word( 0b0101010'0           << 24
                  | (imm19     & 19_mask) <<  5
                  | ((int)cond &  4_mask) <<  0);
    }
    void Assembler::cbz(X t, Label* l) {
        const int imm19 = this->disp19(l);
        this->word( 0b1'011010'0      << 24
                  | (imm19 & 19_mask) <<  5
                  | (t     &  5_mask) <<  0);
    }
    void Assembler::cbnz(X t, Label* l) {
        const int imm19 = this->disp19(l);
        this->word( 0b1'011010'1      << 24
                  | (imm19 & 19_mask) <<  5
                  | (t     &  5_mask) <<  0);
    }

    void Assembler::ldrq(V dst, X src) { this->op(0b00'111'1'01'11'000000000000, src, dst); }
    void Assembler::ldrs(V dst, X src) { this->op(0b10'111'1'01'01'000000000000, src, dst); }
    void Assembler::ldrb(V dst, X src) { this->op(0b00'111'1'01'01'000000000000, src, dst); }

    void Assembler::strq(V src, X dst) { this->op(0b00'111'1'01'10'000000000000, dst, src); }
    void Assembler::strs(V src, X dst) { this->op(0b10'111'1'01'00'000000000000, dst, src); }
    void Assembler::strb(V src, X dst) { this->op(0b00'111'1'01'00'000000000000, dst, src); }

    void Assembler::ldrq(V dst, Label* l) {
        const int imm19 = this->disp19(l);
        this->word( 0b10'011'1'00     << 24
                  | (imm19 & 19_mask) <<  5
                  | (dst   &  5_mask) <<  0);
    }

    void Assembler::label(Label* l) {
        if (fCode) {
            // The instructions all currently point to l->offset.
            // We'll want to add a delta to point them to here().
            int delta = here().offset - l->offset;
            l->offset = here().offset;

            if (l->kind == Label::ARMDisp19) {
                for (int ref : l->references) {
                    // ref points to a 32-bit instruction with 19-bit displacement in instructions.
                    uint32_t inst;
                    memcpy(&inst, fCode + ref, 4);

                    // [ 8 bits to preserve ] [ 19 bit signed displacement ] [ 5 bits to preserve ]
                    int disp = (int)(inst << 8) >> 13;

                    disp += delta/4;   // delta is in bytes, we want instructions.
                    // Put it all back together, preserving the high 8 bits and low 5.
                    inst = ((disp << 5) &  (19_mask << 5))
                         | ((inst     ) & ~(19_mask << 5));

                    memcpy(fCode + ref, &inst, 4);
                }
            }

            if (l->kind == Label::X86Disp32) {
                for (int ref : l->references) {
                    // ref points to a 32-bit displacement in bytes.
                    int disp;
                    memcpy(&disp, fCode + ref, 4);

                    disp += delta;

                    memcpy(fCode + ref, &disp, 4);
                }
            }
        }
    }

    void Program::eval(int n, void* args[]) const {
        const int nargs = (int)fStrides.size();

        if (fJITBuf) {
            void** a = args;
            const void* b = fJITBuf;
            switch (nargs) {
                case 0: return ((void(*)(int                        ))b)(n                     );
                case 1: return ((void(*)(int,void*                  ))b)(n,a[0]                );
                case 2: return ((void(*)(int,void*,void*            ))b)(n,a[0],a[1]           );
                case 3: return ((void(*)(int,void*,void*,void*      ))b)(n,a[0],a[1],a[2]      );
                case 4: return ((void(*)(int,void*,void*,void*,void*))b)(n,a[0],a[1],a[2],a[3]);
                default: SkUNREACHABLE;  // TODO
            }
        }

        // We'll operate in SIMT style, knocking off K-size chunks from n while possible.
        constexpr int K = 16;
        using I32 = skvx::Vec<K, int>;
        using F32 = skvx::Vec<K, float>;
        using U32 = skvx::Vec<K, uint32_t>;
        using U16 = skvx::Vec<K, uint16_t>;
        using U8  = skvx::Vec<K, uint8_t>;

        using I16x2 = skvx::Vec<2*K, int16_t>;
        using U16x2 = skvx::Vec<2*K, uint16_t>;

        union Slot {
            F32   f32;
            I32   i32;
            U32   u32;
            I16x2 i16x2;
            U16x2 u16x2;
        };

        Slot                    few_regs[16];
        std::unique_ptr<char[]> many_regs;

        Slot* regs = few_regs;

        if (fRegs > (int)SK_ARRAY_COUNT(few_regs)) {
            // Annoyingly we can't trust that malloc() or new will work with Slot because
            // the skvx::Vec types may have alignment greater than what they provide.
            // We'll overallocate one extra register so we can align manually.
            many_regs.reset(new char[ sizeof(Slot) * (fRegs + 1) ]);

            uintptr_t addr = (uintptr_t)many_regs.get();
            addr += alignof(Slot) -
                     (addr & (alignof(Slot) - 1));
            SkASSERT((addr & (alignof(Slot) - 1)) == 0);
            regs = (Slot*)addr;
        }


        auto r = [&](Reg id) -> Slot& {
            SkASSERT(0 <= id && id < fRegs);
            return regs[id];
        };
        auto arg = [&](int ix) {
            SkASSERT(0 <= ix && ix < nargs);
            return args[ix];
        };

        // Step each argument pointer ahead by its stride a number of times.
        auto step_args = [&](int times) {
            for (int i = 0; i < (int)fStrides.size(); i++) {
                args[i] = (void*)( (char*)args[i] + times * fStrides[i] );
            }
        };

        int start = 0,
            stride;
        for ( ; n > 0; start = fLoop, n -= stride, step_args(stride)) {
            stride = n >= K ? K : 1;

            for (int i = start; i < (int)fInstructions.size(); i++) {
                Instruction inst = fInstructions[i];

                // d = op(x,y,z/imm)
                Reg   d = inst.d,
                      x = inst.x,
                      y = inst.y,
                      z = inst.z;
                int imm = inst.imm;

                // Ops that interact with memory need to know whether we're stride=1 or K,
                // but all non-memory ops can run the same code no matter the stride.
                switch (2*(int)inst.op + (stride == K ?
                                                        1 : 0)) {
                    default: SkUNREACHABLE;

                #define STRIDE_1(op) case 2*(int)op
                #define STRIDE_K(op) case 2*(int)op + 1
                    STRIDE_1(Op::store8 ): memcpy(arg(imm), &r(x).i32, 1); break;
                    STRIDE_1(Op::store16): memcpy(arg(imm), &r(x).i32, 2); break;
                    STRIDE_1(Op::store32): memcpy(arg(imm), &r(x).i32, 4); break;

                    STRIDE_K(Op::store8 ): skvx::cast<uint8_t> (r(x).i32).store(arg(imm)); break;
                    STRIDE_K(Op::store16): skvx::cast<uint16_t>(r(x).i32).store(arg(imm)); break;
                    STRIDE_K(Op::store32):                     (r(x).i32).store(arg(imm)); break;

                    STRIDE_1(Op::load8 ): r(d).i32 = 0; memcpy(&r(d).i32, arg(imm), 1); break;
                    STRIDE_1(Op::load16): r(d).i32 = 0; memcpy(&r(d).i32, arg(imm), 2); break;
                    STRIDE_1(Op::load32): r(d).i32 = 0; memcpy(&r(d).i32, arg(imm), 4); break;

                    STRIDE_K(Op::load8 ): r(d).i32 = skvx::cast<int>(U8 ::Load(arg(imm))); break;
                    STRIDE_K(Op::load16): r(d).i32 = skvx::cast<int>(U16::Load(arg(imm))); break;
                    STRIDE_K(Op::load32): r(d).i32 =                 I32::Load(arg(imm))  ; break;

                    STRIDE_1(Op::gather8):
                        for (int i = 0; i < K; i++) {
                            r(d).i32[i] = (i == 0) ? ((const uint8_t* )arg(imm))[ r(x).i32[i] ] : 0;
                        } break;
                    STRIDE_1(Op::gather16):
                        for (int i = 0; i < K; i++) {
                            r(d).i32[i] = (i == 0) ? ((const uint16_t*)arg(imm))[ r(x).i32[i] ] : 0;
                        } break;
                    STRIDE_1(Op::gather32):
                        for (int i = 0; i < K; i++) {
                            r(d).i32[i] = (i == 0) ? ((const int*     )arg(imm))[ r(x).i32[i] ] : 0;
                        } break;

                    STRIDE_K(Op::gather8):
                        for (int i = 0; i < K; i++) {
                            r(d).i32[i] = ((const uint8_t* )arg(imm))[ r(x).i32[i] ];
                        } break;
                    STRIDE_K(Op::gather16):
                        for (int i = 0; i < K; i++) {
                            r(d).i32[i] = ((const uint16_t*)arg(imm))[ r(x).i32[i] ];
                        } break;
                    STRIDE_K(Op::gather32):
                        for (int i = 0; i < K; i++) {
                            r(d).i32[i] = ((const int*     )arg(imm))[ r(x).i32[i] ];
                        } break;

                #undef STRIDE_1
                #undef STRIDE_K

                    // Ops that don't interact with memory should never care about the stride.
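                    // (E.g. Op::add_f32 below runs the same K-wide vector code whether
                    //  we're covering K values per pass or just 1 on the scalar tail.)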
                #define CASE(op) case 2*(int)op: /*fallthrough*/ case 2*(int)op+1

                    CASE(Op::uniform8):
                        r(d).i32 = *(const uint8_t* )( (const char*)arg(imm&0xffff) + (imm>>16) );
                        break;
                    CASE(Op::uniform16):
                        r(d).i32 = *(const uint16_t*)( (const char*)arg(imm&0xffff) + (imm>>16) );
                        break;
                    CASE(Op::uniform32):
                        r(d).i32 = *(const int*     )( (const char*)arg(imm&0xffff) + (imm>>16) );
                        break;

                    CASE(Op::splat): r(d).i32 = imm; break;

                    CASE(Op::add_f32): r(d).f32 = r(x).f32 + r(y).f32; break;
                    CASE(Op::sub_f32): r(d).f32 = r(x).f32 - r(y).f32; break;
                    CASE(Op::mul_f32): r(d).f32 = r(x).f32 * r(y).f32; break;
                    CASE(Op::div_f32): r(d).f32 = r(x).f32 / r(y).f32; break;

                    CASE(Op::mad_f32): r(d).f32 = r(x).f32 * r(y).f32 + r(z).f32; break;

                    CASE(Op::add_i32): r(d).i32 = r(x).i32 + r(y).i32; break;
                    CASE(Op::sub_i32): r(d).i32 = r(x).i32 - r(y).i32; break;
                    CASE(Op::mul_i32): r(d).i32 = r(x).i32 * r(y).i32; break;

                    CASE(Op::add_i16x2): r(d).i16x2 = r(x).i16x2 + r(y).i16x2; break;
                    CASE(Op::sub_i16x2): r(d).i16x2 = r(x).i16x2 - r(y).i16x2; break;
                    CASE(Op::mul_i16x2): r(d).i16x2 = r(x).i16x2 * r(y).i16x2; break;

                    CASE(Op::shl_i32): r(d).i32 = r(x).i32 << imm; break;
                    CASE(Op::sra_i32): r(d).i32 = r(x).i32 >> imm; break;
                    CASE(Op::shr_i32): r(d).u32 = r(x).u32 >> imm; break;

                    CASE(Op::shl_i16x2): r(d).i16x2 = r(x).i16x2 << imm; break;
                    CASE(Op::sra_i16x2): r(d).i16x2 = r(x).i16x2 >> imm; break;
                    CASE(Op::shr_i16x2): r(d).u16x2 = r(x).u16x2 >> imm; break;

                    CASE(Op:: eq_f32): r(d).i32 = r(x).f32 == r(y).f32; break;
                    CASE(Op::neq_f32): r(d).i32 = r(x).f32 != r(y).f32; break;
                    CASE(Op:: lt_f32): r(d).i32 = r(x).f32 <  r(y).f32; break;
                    CASE(Op::lte_f32): r(d).i32 = r(x).f32 <= r(y).f32; break;
                    CASE(Op:: gt_f32): r(d).i32 = r(x).f32 >  r(y).f32; break;
                    CASE(Op::gte_f32): r(d).i32 = r(x).f32 >= r(y).f32; break;

                    CASE(Op:: eq_i32): r(d).i32 = r(x).i32 == r(y).i32; break;
                    CASE(Op::neq_i32): r(d).i32 = r(x).i32 != r(y).i32; break;
                    CASE(Op:: lt_i32): r(d).i32 = r(x).i32 <  r(y).i32; break;
                    CASE(Op::lte_i32): r(d).i32 = r(x).i32 <= r(y).i32; break;
                    CASE(Op:: gt_i32): r(d).i32 = r(x).i32 >  r(y).i32; break;
                    CASE(Op::gte_i32): r(d).i32 = r(x).i32 >= r(y).i32; break;

                    CASE(Op:: eq_i16x2): r(d).i16x2 = r(x).i16x2 == r(y).i16x2; break;
                    CASE(Op::neq_i16x2): r(d).i16x2 = r(x).i16x2 != r(y).i16x2; break;
                    CASE(Op:: lt_i16x2): r(d).i16x2 = r(x).i16x2 <  r(y).i16x2; break;
                    CASE(Op::lte_i16x2): r(d).i16x2 = r(x).i16x2 <= r(y).i16x2; break;
                    CASE(Op:: gt_i16x2): r(d).i16x2 = r(x).i16x2 >  r(y).i16x2; break;
                    CASE(Op::gte_i16x2): r(d).i16x2 = r(x).i16x2 >= r(y).i16x2; break;

                    CASE(Op::bit_and  ): r(d).i32 = r(x).i32 &  r(y).i32; break;
                    CASE(Op::bit_or   ): r(d).i32 = r(x).i32 |  r(y).i32; break;
                    CASE(Op::bit_xor  ): r(d).i32 = r(x).i32 ^  r(y).i32; break;
                    CASE(Op::bit_clear): r(d).i32 = r(x).i32 & ~r(y).i32; break;

                    CASE(Op::select): r(d).i32 = skvx::if_then_else(r(x).i32, r(y).i32, r(z).i32);
                                      break;


                    CASE(Op::extract): r(d).u32 = (r(x).u32 >> imm) & r(y).u32; break;
                    CASE(Op::pack):    r(d).u32 = r(x).u32 | (r(y).u32 << imm); break;

                    CASE(Op::bytes): {
                        const U32 table[] = {
                            0,
                            (r(x).u32      ) & 0xff,
                            (r(x).u32 >>  8) & 0xff,
                            (r(x).u32 >> 16) & 0xff,
                            (r(x).u32 >> 24) & 0xff,
                        };
                        r(d).u32 = table[(imm >>  0) & 0xf] <<  0
                                 | table[(imm >>
                                           4) & 0xf] <<  8
                                 | table[(imm >>  8) & 0xf] << 16
                                 | table[(imm >> 12) & 0xf] << 24;
                    } break;

                    CASE(Op::to_f32): r(d).f32 = skvx::cast<float>(r(x).i32); break;
                    CASE(Op::to_i32): r(d).i32 = skvx::cast<int>  (r(x).f32); break;
                #undef CASE
                }
            }
        }
    }

    void Program::dropJIT() {
    #if defined(SKVM_JIT)
        if (fJITBuf) {
            munmap(fJITBuf, fJITSize);
        }
    #else
        SkASSERT(fJITBuf == nullptr);
    #endif

        fJITBuf  = nullptr;
        fJITSize = 0;
    }

    Program::~Program() { this->dropJIT(); }

    Program::Program(Program&& other) {
        fInstructions = std::move(other.fInstructions);
        fRegs         = other.fRegs;
        fLoop         = other.fLoop;
        fStrides      = std::move(other.fStrides);

        std::swap(fJITBuf , other.fJITBuf);
        std::swap(fJITSize, other.fJITSize);
    }

    Program& Program::operator=(Program&& other) {
        fInstructions = std::move(other.fInstructions);
        fRegs         = other.fRegs;
        fLoop         = other.fLoop;
        fStrides      = std::move(other.fStrides);

        std::swap(fJITBuf , other.fJITBuf);
        std::swap(fJITSize, other.fJITSize);
        return *this;
    }

    Program::Program() {}

    Program::Program(const std::vector<Builder::Instruction>& instructions,
                     const std::vector<int>& strides,
                     const char* debug_name) : fStrides(strides) {
        this->setupInterpreter(instructions);
    #if defined(SKVM_JIT)
        this->setupJIT(instructions, debug_name);
    #endif
    }

    // Translate Builder::Instructions to Program::Instructions used by the interpreter.
    void Program::setupInterpreter(const std::vector<Builder::Instruction>& instructions) {
        // The register each instruction is assigned to.
        std::vector<Reg> reg(instructions.size());

        // This next bit is a bit more complicated than strictly necessary;
        // we could just assign every live instruction to its own register.
        //
        // But recycling registers is fairly cheap, and good practice for the
        // JITs where minimizing register pressure really is important.

        fRegs = 0;
        int live_instructions = 0;
        std::vector<Reg> avail;

        // Assign this value to a register, recycling them where we can.
        auto assign_register = [&](Val id) {
            live_instructions++;
            const Builder::Instruction& inst = instructions[id];

            // If this is a real input and its lifetime ends at this instruction,
            // we can recycle the register it's occupying.
            auto maybe_recycle_register = [&](Val input) {
                if (input != NA && instructions[input].death == id) {
                    avail.push_back(reg[input]);
                }
            };

            // Take care to not recycle the same register twice.
            if (true                                ) { maybe_recycle_register(inst.x); }
            if (inst.y != inst.x                    ) { maybe_recycle_register(inst.y); }
            if (inst.z != inst.x && inst.z != inst.y) { maybe_recycle_register(inst.z); }

            // Allocate a register if we have to, preferring to reuse anything available.
            if (avail.empty()) {
                reg[id] = fRegs++;
            } else {
                reg[id] = avail.back();
                avail.pop_back();
            }
        };

        // Assign a register to each live hoisted instruction.
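        // (Hoisted instructions are emitted ahead of the loop and run just once; any of their
        //  values still needed inside the loop were marked immortal in Builder::done(), so the
        //  loop instructions assigned below can never recycle those registers.)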
        for (Val id = 0; id < (Val)instructions.size(); id++) {
            const Builder::Instruction& inst = instructions[id];
            if (inst.death != 0 && inst.hoist) {
                assign_register(id);
            }
        }

        // Assign registers to each live loop instruction.
        for (Val id = 0; id < (Val)instructions.size(); id++) {
            const Builder::Instruction& inst = instructions[id];
            if (inst.death != 0 && !inst.hoist) {
                assign_register(id);
            }
        }

        // Translate Builder::Instructions to Program::Instructions by mapping values to
        // registers.  This will be two passes, first hoisted instructions, then inside the loop.

        // The loop begins at the fLoop'th Instruction.
        fLoop = 0;
        fInstructions.reserve(live_instructions);

        // Add a dummy mapping for the N/A sentinel Val to any arbitrary register
        // so lookups don't have to know which arguments are used by which Ops.
        auto lookup_register = [&](Val id) {
            return id == NA ? (Reg)0
                            : reg[id];
        };

        auto push_instruction = [&](Val id, const Builder::Instruction& inst) {
            Program::Instruction pinst{
                inst.op,
                lookup_register(id),
                lookup_register(inst.x),
                lookup_register(inst.y),
               {lookup_register(inst.z)},
            };
            if (inst.z == NA) { pinst.imm = inst.imm; }
            fInstructions.push_back(pinst);
        };

        for (Val id = 0; id < (Val)instructions.size(); id++) {
            const Builder::Instruction& inst = instructions[id];
            if (inst.death != 0 && inst.hoist) {
                push_instruction(id, inst);
                fLoop++;
            }
        }
        for (Val id = 0; id < (Val)instructions.size(); id++) {
            const Builder::Instruction& inst = instructions[id];
            if (inst.death != 0 && !inst.hoist) {
                push_instruction(id, inst);
            }
        }
    }

#if defined(SKVM_JIT)

    // Just so happens that we can translate the immediate control for our bytes() op
    // to a single 128-bit mask that can be consumed by both AVX2 vpshufb and NEON tbl!
    static void bytes_control(int imm, int mask[4]) {
        auto nibble_to_vpshufb = [](uint8_t n) -> uint8_t {
            // 0 -> 0xff,    Fill with zero.
            // 1 -> 0x00,    Select byte 0.
            // 2 -> 0x01,         "      1.
            // 3 -> 0x02,         "      2.
            // 4 -> 0x03,         "      3.
            return n - 1;
        };
        uint8_t control[] = {
            nibble_to_vpshufb( (imm >>  0) & 0xf ),
            nibble_to_vpshufb( (imm >>  4) & 0xf ),
            nibble_to_vpshufb( (imm >>  8) & 0xf ),
            nibble_to_vpshufb( (imm >> 12) & 0xf ),
        };
        for (int i = 0; i < 4; i++) {
            mask[i] = (int)control[0] <<  0
                    | (int)control[1] <<  8
                    | (int)control[2] << 16
                    | (int)control[3] << 24;

            // Update each byte that refers to a byte index by 4 to
            // point into the next 32-bit lane, but leave any 0xff
            // that fills with zero alone.
            control[0] += control[0] == 0xff ? 0 : 4;
            control[1] += control[1] == 0xff ? 0 : 4;
            control[2] += control[2] == 0xff ? 0 : 4;
            control[3] += control[3] == 0xff ?
                                               0 : 4;
        }
    }

    bool Program::jit(const std::vector<Builder::Instruction>& instructions,
                      const bool hoist,
                      Assembler* a) const {
        using A = Assembler;

    #if defined(__x86_64__)
        if (!SkCpu::Supports(SkCpu::HSW)) {
            return false;
        }
        A::GP64 N     = A::rdi,
                arg[] = { A::rsi, A::rdx, A::rcx, A::r8, A::r9 };

        // All 16 ymm registers are available to use.
        using Reg = A::Ymm;
        uint32_t avail = 0xffff;

    #elif defined(__aarch64__)
        A::X N     = A::x0,
             arg[] = { A::x1, A::x2, A::x3, A::x4, A::x5, A::x6, A::x7 };

        // We can use v0-v7 and v16-v31 freely; we'd need to preserve v8-v15.
        using Reg = A::V;
        uint32_t avail = 0xffff00ff;
    #endif

        if (SK_ARRAY_COUNT(arg) < fStrides.size()) {
            return false;
        }

        auto hoisted = [&](Val id) { return hoist && instructions[id].hoist; };

        std::vector<Reg> r(instructions.size());

        struct LabelAndReg {
            A::Label label;
            Reg      reg;
        };
        SkTHashMap<int, LabelAndReg> splats,
                                     bytes_masks;

        auto warmup = [&](Val id) {
            const Builder::Instruction& inst = instructions[id];
            if (inst.death == 0) {
                return true;
            }

            Op  op  = inst.op;
            int imm = inst.imm;

            switch (op) {
                default: break;

                case Op::splat: if (!splats.find(imm)) { splats.set(imm, {}); }
                                break;

                case Op::bytes: if (!bytes_masks.find(imm)) {
                                    bytes_masks.set(imm, {});
                                    if (hoist) {
                                        // vpshufb can always work with the mask from memory,
                                        // but it helps to hoist the mask to a register for tbl.
                                    #if defined(__aarch64__)
                                        LabelAndReg* entry = bytes_masks.find(imm);
                                        if (int found = __builtin_ffs(avail)) {
                                            entry->reg = (Reg)(found-1);
                                            avail ^= 1 << entry->reg;
                                            a->ldrq(entry->reg, &entry->label);
                                        } else {
                                            return false;
                                        }
                                    #endif
                                    }
                                }
                                break;
            }
            return true;
        };

        auto emit = [&](Val id, bool scalar) {
            const Builder::Instruction& inst = instructions[id];

            // No need to emit dead code instructions that produce values that are never used.
            if (inst.death == 0) {
                return true;
            }

            Op  op  = inst.op;
            Val x   = inst.x,
                y   = inst.y,
                z   = inst.z;
            int imm = inst.imm;

            // Most (but not all) ops create an output value and need a register to hold it, dst.
            // We track each instruction's dst in r[] so we can thread it through as an input
            // to any future instructions needing that value.
            //
            // And some ops may need a temporary scratch register, tmp.  Some need both tmp and dst.
            //
            // tmp and dst are very similar and can and will often be assigned the same register,
            // but tmp may never alias any of the instruction's inputs, while dst may when this
            // instruction consumes that input, i.e. if the input reaches its end of life here.
            //
            // We'll assign both registers lazily to keep register pressure as low as possible.
            bool tmp_is_set = false,
                 dst_is_set = false;
            Reg tmp_reg = (Reg)0;   // This initial value won't matter... anything legal is fine.

            bool ok = true;   // Set to false if we need to assign a register and none's available.
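            // (Throughout jit(), avail is a bitmask of free registers: __builtin_ffs() finds
            //  the lowest set bit, so found-1 is the lowest-numbered register still free.)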
            // First lock in how to choose tmp if we need to, based on the registers
            // available before this instruction, not including any of its input registers.
            auto tmp = [&,avail/*important, closing over avail's current value*/]{
                if (!tmp_is_set) {
                    tmp_is_set = true;
                    if (int found = __builtin_ffs(avail)) {
                        // This is a scratch register just for this op,
                        // so we leave it marked available for future ops.
                        tmp_reg = (Reg)(found - 1);
                    } else {
                        // We needed a tmp register but couldn't find one available. :'(
                        // This will cause emit() to return false, in turn causing jit() to fail.
                        ok = false;
                    }
                }
                return tmp_reg;
            };

            // Now make available any registers that are consumed by this instruction.
            // (The register pool we can pick dst from is >= the pool for tmp, adding any of these.)
            if (x != NA && instructions[x].death == id) { avail |= 1 << r[x]; }
            if (y != NA && instructions[y].death == id) { avail |= 1 << r[y]; }
            if (z != NA && instructions[z].death == id) { avail |= 1 << r[z]; }
            // set_dst() and dst() will work read/write with this perhaps-just-updated avail.

            // Some ops may decide dst on their own to best fit the instruction (see Op::mad_f32).
            auto set_dst = [&](Reg reg){
                SkASSERT(dst_is_set == false);
                dst_is_set = true;

                SkASSERT(avail & (1<<reg));
                avail ^= 1<<reg;

                r[id] = reg;
            };

            // Thanks to AVX and NEON's 3-argument instruction sets,
            // most ops can use any register as dst.
            auto dst = [&]{
                if (!dst_is_set) {
                    if (int found = __builtin_ffs(avail)) {
                        set_dst((Reg)(found-1));
                    } else {
                        // Same deal as with tmp... all the registers are occupied.  Time to fail!
                        ok = false;
                    }
                }
                return r[id];
            };

            // Because we use the same logic to pick an arbitrary dst and to pick tmp,
            // and we know that tmp will never overlap any of the inputs, `dst() == tmp()`
            // is a simple idiom to check that the destination does not overlap any of the inputs.
            // Sometimes we can use this knowledge to do better instruction selection.

            // Ok!  Keep in mind that we haven't assigned tmp or dst yet,
            // just laid out hooks for how to do so if we need them, depending on the instruction.
            //
            // Now let's actually assemble the instruction!
            switch (op) {
                default:
                #if 0
                    SkDEBUGFAILF("\n%d not yet implemented\n", op);
                #endif
                    return false;   // TODO: many new ops

            #if defined(__x86_64__)
                case Op::store8: if (scalar) { a->vpextrb  (arg[imm], (A::Xmm)r[x], 0); }
                                 else        { a->vpackusdw(tmp(), r[x], r[x]);
                                               a->vpermq   (tmp(), tmp(), 0xd8);
                                               a->vpackuswb(tmp(), tmp(), tmp());
                                               a->vmovq    (arg[imm], (A::Xmm)tmp()); }
                                 break;

                case Op::store16: if (scalar) { a->vpextrw  (arg[imm], (A::Xmm)r[x], 0); }
                                  else        { a->vpackusdw(tmp(), r[x], r[x]);
                                                a->vpermq   (tmp(), tmp(), 0xd8);
                                                a->vmovups  (arg[imm], (A::Xmm)tmp()); }
                                  break;

                case Op::store32: if (scalar) { a->vmovd  (arg[imm], (A::Xmm)r[x]); }
                                  else        { a->vmovups(arg[imm],         r[x]); }
                                  break;

                case Op::load8: if (scalar) {
                                    a->vpxor  (dst(), dst(), dst());
                                    a->vpinsrb((A::Xmm)dst(), (A::Xmm)dst(), arg[imm], 0);
                                } else {
                                    a->vpmovzxbd(dst(), arg[imm]);
                                } break;

                case Op::load16: if (scalar) {
                                     a->vpxor  (dst(), dst(), dst());
                                     a->vpinsrw((A::Xmm)dst(), (A::Xmm)dst(), arg[imm], 0);
                                 } else {
                                     a->vpmovzxwd(dst(), arg[imm]);
                                 } break;

                case Op::load32: if (scalar) { a->vmovd  ((A::Xmm)dst(), arg[imm]); }
                                 else        { a->vmovups(        dst(), arg[imm]); }
                                 break;

                case Op::uniform8: a->movzbl(A::rax, arg[imm&0xffff], imm>>16);
                                   a->vmovd_direct((A::Xmm)dst(), A::rax);
                                   a->vbroadcastss(dst(), (A::Xmm)dst());
                                   break;

                case Op::uniform32: a->vbroadcastss(dst(), arg[imm&0xffff], imm>>16);
                                    break;

                case Op::splat: a->vbroadcastss(dst(), &splats.find(imm)->label);
                                break;
                                // TODO: many of these instructions have variants that
                                // can read one of their arguments from 32-byte memory
                                // instead of a register.  Find a way to avoid needing
                                // to splat most* constants out at all?
                                // (*Might work for x - 255 but not 255 - x, so will
                                // always need to be able to splat to a register.)

                case Op::add_f32: a->vaddps(dst(), r[x], r[y]); break;
                case Op::sub_f32: a->vsubps(dst(), r[x], r[y]); break;
                case Op::mul_f32: a->vmulps(dst(), r[x], r[y]); break;
                case Op::div_f32: a->vdivps(dst(), r[x], r[y]); break;

                case Op::mad_f32:
                    if      (avail & (1<<r[x])) { set_dst(r[x]); a->vfmadd132ps(r[x], r[z], r[y]); }
                    else if (avail & (1<<r[y])) { set_dst(r[y]); a->vfmadd213ps(r[y], r[x], r[z]); }
                    else if (avail & (1<<r[z])) { set_dst(r[z]); a->vfmadd231ps(r[z], r[x], r[y]); }
                    else                        { SkASSERT(dst() == tmp());
                                                  a->vmovdqa    (dst(), r[x]);
                                                  a->vfmadd132ps(dst(), r[z], r[y]); }
                    break;

                case Op::add_i32: a->vpaddd (dst(), r[x], r[y]); break;
                case Op::sub_i32: a->vpsubd (dst(), r[x], r[y]); break;
                case Op::mul_i32: a->vpmulld(dst(), r[x], r[y]); break;

                case Op::sub_i16x2: a->vpsubw (dst(), r[x], r[y]); break;
                case Op::mul_i16x2: a->vpmullw(dst(), r[x], r[y]); break;
                case Op::shr_i16x2: a->vpsrlw (dst(), r[x],  imm); break;

                case Op::bit_and  : a->vpand (dst(), r[x], r[y]); break;
                case Op::bit_or   : a->vpor  (dst(), r[x], r[y]); break;
                case Op::bit_xor  : a->vpxor (dst(), r[x], r[y]); break;
                case Op::bit_clear: a->vpandn(dst(), r[y], r[x]); break;   // N.B. Y then X.
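                // vpblendvb dst,a,b,mask takes b wherever the mask byte's high bit is set,
                // so passing (r[z], r[y], r[x]) below reads as dst = x ? y : z.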
                case Op::select : a->vpblendvb(dst(), r[z], r[y], r[x]); break;

                case Op::shl_i32: a->vpslld(dst(), r[x], imm); break;
                case Op::shr_i32: a->vpsrld(dst(), r[x], imm); break;
                case Op::sra_i32: a->vpsrad(dst(), r[x], imm); break;

                case Op::eq_i32: a->vpcmpeqd(dst(), r[x], r[y]); break;
                case Op::lt_i32: a->vpcmpgtd(dst(), r[y], r[x]); break;
                case Op::gt_i32: a->vpcmpgtd(dst(), r[x], r[y]); break;

                case Op::extract: if (imm == 0) { a->vpand (dst(), r[x], r[y]); }
                                  else          { a->vpsrld(tmp(), r[x], imm);
                                                  a->vpand (dst(), tmp(), r[y]); }
                                  break;

                case Op::pack: a->vpslld(tmp(), r[y], imm);
                               a->vpor  (dst(), tmp(), r[x]);
                               break;

                case Op::to_f32: a->vcvtdq2ps (dst(), r[x]); break;
                case Op::to_i32: a->vcvttps2dq(dst(), r[x]); break;

                case Op::bytes: a->vpshufb(dst(), r[x], &bytes_masks.find(imm)->label);
                                break;

            #elif defined(__aarch64__)
                case Op::store8: a->xtns2h(tmp(), r[x]);
                                 a->xtnh2b(tmp(), tmp());
                                 if (scalar) { a->strb(tmp(), arg[imm]); }
                                 else        { a->strs(tmp(), arg[imm]); }
                                 break;
                                 // TODO: another case where it'd be okay to alias r[x] and tmp
                                 // if r[x] dies here.

                case Op::store32: if (scalar) { a->strs(r[x], arg[imm]); }
                                  else        { a->strq(r[x], arg[imm]); }
                                  break;

                case Op::load8: if (scalar) { a->ldrb(tmp(), arg[imm]); }
                                else        { a->ldrs(tmp(), arg[imm]); }
                                a->uxtlb2h(tmp(), tmp());
                                a->uxtlh2s(dst(), tmp());
                                break;

                case Op::load32: if (scalar) { a->ldrs(dst(), arg[imm]); }
                                 else        { a->ldrq(dst(), arg[imm]); }
                                 break;

                case Op::splat: a->ldrq(dst(), &splats.find(imm)->label);
                                break;
                                // TODO: If we hoist these, pack 4 values in each register
                                // and use vector/lane operations, cutting the register
                                // pressure cost of hoisting by 4?
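                // (Mnemonic note: these Assembler helpers fold the NEON arrangement into
                //  their names, e.g. fadd4s ~ FADD Vd.4S, four floats at a time to match
                //  K == 4 below, and orr16b ~ ORR Vd.16B.)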
                case Op::add_f32: a->fadd4s(dst(), r[x], r[y]); break;
                case Op::sub_f32: a->fsub4s(dst(), r[x], r[y]); break;
                case Op::mul_f32: a->fmul4s(dst(), r[x], r[y]); break;
                case Op::div_f32: a->fdiv4s(dst(), r[x], r[y]); break;

                case Op::mad_f32:
                    if (avail & (1<<r[z])) { set_dst(r[z]); a->fmla4s(r[z], r[x], r[y]); }
                    else                   { a->orr16b(tmp(), r[z], r[z]);
                                             a->fmla4s(tmp(), r[x], r[y]);
                                             if (dst() != tmp()) { a->orr16b(dst(), tmp(), tmp()); } }
                    break;

                case Op::add_i32: a->add4s(dst(), r[x], r[y]); break;
                case Op::sub_i32: a->sub4s(dst(), r[x], r[y]); break;
                case Op::mul_i32: a->mul4s(dst(), r[x], r[y]); break;

                case Op::sub_i16x2: a->sub8h (dst(), r[x], r[y]); break;
                case Op::mul_i16x2: a->mul8h (dst(), r[x], r[y]); break;
                case Op::shr_i16x2: a->ushr8h(dst(), r[x], imm);  break;

                case Op::bit_and  : a->and16b(dst(), r[x], r[y]); break;
                case Op::bit_or   : a->orr16b(dst(), r[x], r[y]); break;
                case Op::bit_xor  : a->eor16b(dst(), r[x], r[y]); break;
                case Op::bit_clear: a->bic16b(dst(), r[x], r[y]); break;

                case Op::shl_i32: a-> shl4s(dst(), r[x], imm); break;
                case Op::shr_i32: a->ushr4s(dst(), r[x], imm); break;
                case Op::sra_i32: a->sshr4s(dst(), r[x], imm); break;

                case Op::extract: if (imm) { a->ushr4s(tmp(), r[x], imm);
                                             a->and16b(dst(), tmp(), r[y]); }
                                  else     { a->and16b(dst(), r[x], r[y]); }
                                  break;

                case Op::pack:
                    if (avail & (1<<r[x])) { set_dst(r[x]); a->sli4s(r[x], r[y], imm); }
                    else                   { a->shl4s (tmp(), r[y], imm);
                                             a->orr16b(dst(), tmp(), r[x]); }
                    break;

                case Op::to_f32: a->scvtf4s (dst(), r[x]); break;
                case Op::to_i32: a->fcvtzs4s(dst(), r[x]); break;

                case Op::bytes: if (hoist) { a->tbl (dst(), r[x], bytes_masks.find(imm)->reg); }
                                else       { a->ldrq(tmp(), &bytes_masks.find(imm)->label);
                                             a->tbl (dst(), r[x], tmp()); }
                                break;
            #endif
            }

            // Calls to tmp() or dst() might have flipped this false from its default true state.
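            // (If that happened, jit() fails as a whole, and setupJIT() below simply
            //  retries once with hoisting disabled.)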
            return ok;
        };

    #if defined(__x86_64__)
        const int K = 8;
        auto jump_if_less = [&](A::Label* l) { a->jl (l); };
        auto jump         = [&](A::Label* l) { a->jmp(l); };

        auto add = [&](A::GP64 gp, int imm) { a->add(gp, imm); };
        auto sub = [&](A::GP64 gp, int imm) { a->sub(gp, imm); };

        auto exit = [&]{ a->vzeroupper(); a->ret(); };
    #elif defined(__aarch64__)
        const int K = 4;
        auto jump_if_less = [&](A::Label* l) { a->blt(l); };
        auto jump         = [&](A::Label* l) { a->b  (l); };

        auto add = [&](A::X gp, int imm) { a->add(gp, gp, imm); };
        auto sub = [&](A::X gp, int imm) { a->sub(gp, gp, imm); };

        auto exit = [&]{ a->ret(A::x30); };
    #endif

        A::Label body,
                 tail,
                 done;

        for (Val id = 0; id < (Val)instructions.size(); id++) {
            if (!warmup(id)) {
                return false;
            }
            if (hoisted(id) && !emit(id, /*scalar=*/false)) {
                return false;
            }
        }

        a->label(&body);
        {
            a->cmp(N, K);
            jump_if_less(&tail);
            for (Val id = 0; id < (Val)instructions.size(); id++) {
                if (!hoisted(id) && !emit(id, /*scalar=*/false)) {
                    return false;
                }
            }
            for (int i = 0; i < (int)fStrides.size(); i++) {
                if (fStrides[i]) {
                    add(arg[i], K*fStrides[i]);
                }
            }
            sub(N, K);
            jump(&body);
        }

        a->label(&tail);
        {
            a->cmp(N, 1);
            jump_if_less(&done);
            for (Val id = 0; id < (Val)instructions.size(); id++) {
                if (!hoisted(id) && !emit(id, /*scalar=*/true)) {
                    return false;
                }
            }
            for (int i = 0; i < (int)fStrides.size(); i++) {
                if (fStrides[i]) {
                    add(arg[i], 1*fStrides[i]);
                }
            }
            sub(N, 1);
            jump(&tail);
        }

        a->label(&done);
        {
            exit();
        }

        bytes_masks.foreach([&](int imm, LabelAndReg* entry) {
            // One 16-byte pattern for ARM tbl, that same pattern twice for x86-64 vpshufb.
        #if defined(__x86_64__)
            a->align(32);
        #elif defined(__aarch64__)
            a->align(4);
        #endif

            a->label(&entry->label);
            int mask[4];
            bytes_control(imm, mask);
            a->bytes(mask, sizeof(mask));
        #if defined(__x86_64__)
            a->bytes(mask, sizeof(mask));
        #endif
        });

        splats.foreach([&](int imm, LabelAndReg* entry) {
            // vbroadcastss 4 bytes on x86-64, or simply load 16 bytes on aarch64.
            a->align(4);
            a->label(&entry->label);
            a->word(imm);
        #if defined(__aarch64__)
            a->word(imm);
            a->word(imm);
            a->word(imm);
        #endif
        });

        return true;
    }

    void Program::setupJIT(const std::vector<Builder::Instruction>& instructions,
                           const char* debug_name) {
        // Assemble with no buffer to determine a.size(), the number of bytes we'll assemble.
        Assembler a{nullptr};

        // First try allowing code hoisting (faster code),
        // then again without if that fails (lower register pressure).
        bool hoist = true;
        if (!this->jit(instructions, hoist, &a)) {
            hoist = false;
            if (!this->jit(instructions, hoist, &a)) {
                return;
            }
        }

        // Allocate space that we can remap as executable.
        const size_t page = sysconf(_SC_PAGESIZE);
        fJITSize = ((a.size() + page - 1) / page) * page;  // mprotect works at page granularity.
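        // (E.g. with 4K pages, a 5000-byte program rounds up to fJITSize == 8192.)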
        fJITBuf = mmap(nullptr, fJITSize, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);

        // Assemble the program for real.
        a = Assembler{fJITBuf};
        SkAssertResult(this->jit(instructions, hoist, &a));
        SkASSERT(a.size() <= fJITSize);

        // Remap as executable, and flush caches on platforms that need that.
        mprotect(fJITBuf, fJITSize, PROT_READ|PROT_EXEC);
        __builtin___clear_cache((char*)fJITBuf,
                                (char*)fJITBuf + fJITSize);
    #if defined(SKVM_PERF_DUMPS)
        this->dumpJIT(debug_name, a.size());
    #endif
    }
#endif

#if defined(SKVM_PERF_DUMPS)
    void Program::dumpJIT(const char* debug_name, size_t size) const {
    #if 0 && defined(__aarch64__)
        if (debug_name) {
            SkDebugf("\n%s:", debug_name);
        }
        // cat | llvm-mc -arch aarch64 -disassemble
        auto cur = (const uint8_t*)fJITBuf;
        for (int i = 0; i < (int)size; i++) {
            if (i % 4 == 0) {
                SkDebugf("\n");
            }
            SkDebugf("0x%02x ", *cur++);
        }
        SkDebugf("\n");
    #endif

        // We're doing some really stateful things below, so one thread at a time please...
        static SkSpinlock dump_lock;
        SkAutoSpinlock lock(dump_lock);

        auto fnv1a = [](const void* vbuf, size_t n) {
            uint32_t hash = 2166136261;
            for (auto buf = (const uint8_t*)vbuf; n --> 0; buf++) {
                hash ^= *buf;
                hash *= 16777619;
            }
            return hash;
        };

        char name[64];
        uint32_t hash = fnv1a(fJITBuf, size);
        if (debug_name) {
            sprintf(name, "skvm-jit-%s", debug_name);
        } else {
            sprintf(name, "skvm-jit-%u", hash);
        }

        // Create a jit-<pid>.dump file that we can `perf inject -j` into a
        // perf.data captured with `perf record -k 1`, letting us see each
        // JIT'd Program as if it were a function named skvm-jit-<hash>.  E.g.
        //
        //     ninja -C out nanobench
        //     perf record -k 1 out/nanobench -m SkVM_4096_I32\$
        //     perf inject -j -i perf.data -o perf.data.jit
        //     perf report -i perf.data.jit
        //
        // Running `perf inject -j` will also dump an .so for each JIT'd
        // program, named jitted-<pid>-<hash>.so.
        //
        //     https://lwn.net/Articles/638566/
        //     https://v8.dev/docs/linux-perf
        //     https://cs.chromium.org/chromium/src/v8/src/diagnostics/perf-jit.cc
        //     https://lore.kernel.org/patchwork/patch/622240/

        auto timestamp_ns = []() -> uint64_t {
            // It's important to use CLOCK_MONOTONIC here so that perf can
            // correlate our timestamps with those captured by `perf record -k 1`.
            // (That's what `-k 1` does: it tells perf record to use CLOCK_MONOTONIC.)
            struct timespec ts;
            clock_gettime(CLOCK_MONOTONIC, &ts);
            return ts.tv_sec * (uint64_t)1e9 + ts.tv_nsec;
        };

        // We'll open the jit-<pid>.dump file and write a small header once,
        // and just leave it open forever because we're lazy.
        static FILE* jitdump = [&]{
            // Must open as w+ for the mmap() call below to work.
            char path[64];
            sprintf(path, "jit-%d.dump", getpid());
            FILE* f = fopen(path, "w+");

            // Calling mmap() on the file adds a "hey, they mmap()'d this" record to
            // the perf.data file that will point `perf inject -j` at this log file.
            // Kind of a strange way to tell `perf inject` where the file is...
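            // (By default perf record only logs mmap events for executable mappings,
            //  which is presumably why the marker below maps the file PROT_EXEC.)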
            void* marker = mmap(nullptr, sysconf(_SC_PAGESIZE),
                                PROT_READ|PROT_EXEC, MAP_PRIVATE,
                                fileno(f), /*offset=*/0);
            SkASSERT_RELEASE(marker != MAP_FAILED);
            // Like never calling fclose(f), we'll also just always leave marker mmap()'d.

        #if defined(__x86_64__)
            const uint32_t elf_mach = 62;   // EM_X86_64
        #elif defined(__aarch64__)
            const uint32_t elf_mach = 183;  // EM_AARCH64
        #endif

            struct Header {
                uint32_t magic, version, header_size, elf_mach, reserved, pid;
                uint64_t timestamp_us, flags;
            } header = {
                0x4A695444/*"JiTD"*/, 1, sizeof(Header), elf_mach, 0, (uint32_t)getpid(),
                timestamp_ns() / 1000, 0,
            };
            fwrite(&header, sizeof(header), 1, f);

            return f;
        }();

        struct CodeLoad {
            uint32_t event_type, event_size;
            uint64_t timestamp_ns;

            uint32_t pid, tid;
            uint64_t vma/*???*/, code_addr, code_size, id;
        } load = {
            0/*code load*/, (uint32_t)(sizeof(CodeLoad) + strlen(name) + 1 + size),
            timestamp_ns(),

            (uint32_t)getpid(), (uint32_t)SkGetThreadID(),
            (uint64_t)fJITBuf, (uint64_t)fJITBuf, size, hash,
        };

        // Write the header, the JIT'd function name, and the JIT'd code itself.
        fwrite(&load, sizeof(load), 1, jitdump);
        fwrite(name, 1, strlen(name), jitdump);
        fwrite("\0", 1, 1, jitdump);
        fwrite(fJITBuf, 1, size, jitdump);
    }
#endif

}  // namespace skvm