• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright 2019 Google LLC
3  *
4  * Use of this source code is governed by a BSD-style license that can be
5  * found in the LICENSE file.
6  */
7 
8 #include "include/private/SkSpinlock.h"
9 #include "include/private/SkTFitsIn.h"
10 #include "include/private/SkThreadID.h"
11 #include "include/private/SkVx.h"
12 #include "src/core/SkCpu.h"
13 #include "src/core/SkVM.h"
14 #include <string.h>
15 #if defined(SKVM_JIT)
16     #include <sys/mman.h>
17 #endif
18 
19 namespace skvm {
20 
done(const char * debug_name)21     Program Builder::done(const char* debug_name) {
22         // Basic liveness analysis:
23         // an instruction is live until all live instructions that need its input have retired.
24         for (Val id = fProgram.size(); id --> 0; ) {
25             Instruction& inst = fProgram[id];
26             // All side-effect-only instructions (stores) are live.
27             if (inst.op <= Op::store32) {
28                 inst.death = id;
29             }
30             // The arguments of a live instruction must live until at least that instruction.
31             if (inst.death != 0) {
32                 // Notice how we're walking backward, storing the latest instruction in death.
33                 if (inst.x != NA && fProgram[inst.x].death == 0) { fProgram[inst.x].death = id; }
34                 if (inst.y != NA && fProgram[inst.y].death == 0) { fProgram[inst.y].death = id; }
35                 if (inst.z != NA && fProgram[inst.z].death == 0) { fProgram[inst.z].death = id; }
36             }
37         }
38 
39         // Mark which values don't depend on the loop and can be hoisted.
40         for (Val id = 0; id < (Val)fProgram.size(); id++) {
41             Builder::Instruction& inst = fProgram[id];
42 
43             // Varying loads (and gathers) and stores cannot be hoisted out of the loop.
44             if (inst.op <= Op::gather32) {
45                 inst.hoist = false;
46             }
47 
48             // If any of an instruction's inputs can't be hoisted, it can't be hoisted itself.
49             if (inst.hoist) {
50                 if (inst.x != NA) { inst.hoist &= fProgram[inst.x].hoist; }
51                 if (inst.y != NA) { inst.hoist &= fProgram[inst.y].hoist; }
52                 if (inst.z != NA) { inst.hoist &= fProgram[inst.z].hoist; }
53             }
54 
55             // Any hoisted values used inside the loop need to live forever.
56             if (!inst.hoist) {
57                 auto make_immortal = [&](Val arg) {
58                     if (fProgram[arg].death != 0) {
59                         fProgram[arg].death = (Val)fProgram.size();
60                     }
61                 };
62                 if (inst.x != NA && fProgram[inst.x].hoist) { make_immortal(inst.x); }
63                 if (inst.y != NA && fProgram[inst.y].hoist) { make_immortal(inst.y); }
64                 if (inst.z != NA && fProgram[inst.z].hoist) { make_immortal(inst.z); }
65             }
66         }
67 
68         return {fProgram, fStrides, debug_name};
69     }
70 
operator ==(const Builder::Instruction & a,const Builder::Instruction & b)71     static bool operator==(const Builder::Instruction& a, const Builder::Instruction& b) {
72         return a.op    == b.op
73             && a.x     == b.x
74             && a.y     == b.y
75             && a.z     == b.z
76             && a.imm   == b.imm
77             && a.death == b.death
78             && a.hoist == b.hoist;
79     }
80 
81     // Most instructions produce a value and return it by ID,
82     // the value-producing instruction's own index in the program vector.
push(Op op,Val x,Val y,Val z,int imm)83     Val Builder::push(Op op, Val x, Val y, Val z, int imm) {
84         Instruction inst{op, x, y, z, imm, /*death=*/0, /*hoist=*/true};
85 
86         // Basic common subexpression elimination:
87         // if we've already seen this exact Instruction, use it instead of creating a new one.
88         if (Val* id = fIndex.find(inst)) {
89             return *id;
90         }
91         Val id = static_cast<Val>(fProgram.size());
92         fProgram.push_back(inst);
93         fIndex.set(inst, id);
94         return id;
95     }
96 
isZero(Val id) const97     bool Builder::isZero(Val id) const {
98         return fProgram[id].op  == Op::splat
99             && fProgram[id].imm == 0;
100     }
101 
arg(int stride)102     Arg Builder::arg(int stride) {
103         int ix = (int)fStrides.size();
104         fStrides.push_back(stride);
105         return {ix};
106     }
107 
store8(Arg ptr,I32 val)108     void Builder::store8 (Arg ptr, I32 val) { (void)this->push(Op::store8 , val.id,NA,NA, ptr.ix); }
store16(Arg ptr,I32 val)109     void Builder::store16(Arg ptr, I32 val) { (void)this->push(Op::store16, val.id,NA,NA, ptr.ix); }
store32(Arg ptr,I32 val)110     void Builder::store32(Arg ptr, I32 val) { (void)this->push(Op::store32, val.id,NA,NA, ptr.ix); }
111 
load8(Arg ptr)112     I32 Builder::load8 (Arg ptr) { return {this->push(Op::load8 , NA,NA,NA, ptr.ix) }; }
load16(Arg ptr)113     I32 Builder::load16(Arg ptr) { return {this->push(Op::load16, NA,NA,NA, ptr.ix) }; }
load32(Arg ptr)114     I32 Builder::load32(Arg ptr) { return {this->push(Op::load32, NA,NA,NA, ptr.ix) }; }
115 
gather8(Arg ptr,I32 offset)116     I32 Builder::gather8 (Arg ptr, I32 offset) {
117         return {this->push(Op::gather8 , offset.id,NA,NA, ptr.ix)};
118     }
gather16(Arg ptr,I32 offset)119     I32 Builder::gather16(Arg ptr, I32 offset) {
120         return {this->push(Op::gather16, offset.id,NA,NA, ptr.ix)};
121     }
gather32(Arg ptr,I32 offset)122     I32 Builder::gather32(Arg ptr, I32 offset) {
123         return {this->push(Op::gather32, offset.id,NA,NA, ptr.ix)};
124     }
125 
uniform8(Arg ptr,int offset)126     I32 Builder::uniform8(Arg ptr, int offset) {
127         return {this->push(Op::uniform8, NA,NA,NA, ptr.ix | (offset<<16))};
128     }
uniform16(Arg ptr,int offset)129     I32 Builder::uniform16(Arg ptr, int offset) {
130         return {this->push(Op::uniform16, NA,NA,NA, ptr.ix | (offset<<16))};
131     }
uniform32(Arg ptr,int offset)132     I32 Builder::uniform32(Arg ptr, int offset) {
133         return {this->push(Op::uniform32, NA,NA,NA, ptr.ix | (offset<<16))};
134     }
135 
136     // The two splat() functions are just syntax sugar over splatting a 4-byte bit pattern.
splat(int n)137     I32 Builder::splat(int   n) { return {this->push(Op::splat, NA,NA,NA, n) }; }
splat(float f)138     F32 Builder::splat(float f) {
139         int bits;
140         memcpy(&bits, &f, 4);
141         return {this->push(Op::splat, NA,NA,NA, bits)};
142     }
143 
add(F32 x,F32 y)144     F32 Builder::add(F32 x, F32 y       ) { return {this->push(Op::add_f32, x.id, y.id)}; }
sub(F32 x,F32 y)145     F32 Builder::sub(F32 x, F32 y       ) { return {this->push(Op::sub_f32, x.id, y.id)}; }
mul(F32 x,F32 y)146     F32 Builder::mul(F32 x, F32 y       ) { return {this->push(Op::mul_f32, x.id, y.id)}; }
div(F32 x,F32 y)147     F32 Builder::div(F32 x, F32 y       ) { return {this->push(Op::div_f32, x.id, y.id)}; }
mad(F32 x,F32 y,F32 z)148     F32 Builder::mad(F32 x, F32 y, F32 z) {
149         if (this->isZero(z.id)) {
150             return this->mul(x,y);
151         }
152         return {this->push(Op::mad_f32, x.id, y.id, z.id)};
153     }
154 
add(I32 x,I32 y)155     I32 Builder::add(I32 x, I32 y) { return {this->push(Op::add_i32, x.id, y.id)}; }
sub(I32 x,I32 y)156     I32 Builder::sub(I32 x, I32 y) { return {this->push(Op::sub_i32, x.id, y.id)}; }
mul(I32 x,I32 y)157     I32 Builder::mul(I32 x, I32 y) { return {this->push(Op::mul_i32, x.id, y.id)}; }
158 
add_16x2(I32 x,I32 y)159     I32 Builder::add_16x2(I32 x, I32 y) { return {this->push(Op::add_i16x2, x.id, y.id)}; }
sub_16x2(I32 x,I32 y)160     I32 Builder::sub_16x2(I32 x, I32 y) { return {this->push(Op::sub_i16x2, x.id, y.id)}; }
mul_16x2(I32 x,I32 y)161     I32 Builder::mul_16x2(I32 x, I32 y) { return {this->push(Op::mul_i16x2, x.id, y.id)}; }
162 
shl(I32 x,int bits)163     I32 Builder::shl(I32 x, int bits) { return {this->push(Op::shl_i32, x.id,NA,NA, bits)}; }
shr(I32 x,int bits)164     I32 Builder::shr(I32 x, int bits) { return {this->push(Op::shr_i32, x.id,NA,NA, bits)}; }
sra(I32 x,int bits)165     I32 Builder::sra(I32 x, int bits) { return {this->push(Op::sra_i32, x.id,NA,NA, bits)}; }
166 
shl_16x2(I32 x,int bits)167     I32 Builder::shl_16x2(I32 x, int bits) { return {this->push(Op::shl_i16x2, x.id,NA,NA, bits)}; }
shr_16x2(I32 x,int bits)168     I32 Builder::shr_16x2(I32 x, int bits) { return {this->push(Op::shr_i16x2, x.id,NA,NA, bits)}; }
sra_16x2(I32 x,int bits)169     I32 Builder::sra_16x2(I32 x, int bits) { return {this->push(Op::sra_i16x2, x.id,NA,NA, bits)}; }
170 
eq(F32 x,F32 y)171     I32 Builder:: eq(F32 x, F32 y) { return {this->push(Op:: eq_f32, x.id, y.id)}; }
neq(F32 x,F32 y)172     I32 Builder::neq(F32 x, F32 y) { return {this->push(Op::neq_f32, x.id, y.id)}; }
lt(F32 x,F32 y)173     I32 Builder:: lt(F32 x, F32 y) { return {this->push(Op:: lt_f32, x.id, y.id)}; }
lte(F32 x,F32 y)174     I32 Builder::lte(F32 x, F32 y) { return {this->push(Op::lte_f32, x.id, y.id)}; }
gt(F32 x,F32 y)175     I32 Builder:: gt(F32 x, F32 y) { return {this->push(Op:: gt_f32, x.id, y.id)}; }
gte(F32 x,F32 y)176     I32 Builder::gte(F32 x, F32 y) { return {this->push(Op::gte_f32, x.id, y.id)}; }
177 
eq(I32 x,I32 y)178     I32 Builder:: eq(I32 x, I32 y) { return {this->push(Op:: eq_i32, x.id, y.id)}; }
neq(I32 x,I32 y)179     I32 Builder::neq(I32 x, I32 y) { return {this->push(Op::neq_i32, x.id, y.id)}; }
lt(I32 x,I32 y)180     I32 Builder:: lt(I32 x, I32 y) { return {this->push(Op:: lt_i32, x.id, y.id)}; }
lte(I32 x,I32 y)181     I32 Builder::lte(I32 x, I32 y) { return {this->push(Op::lte_i32, x.id, y.id)}; }
gt(I32 x,I32 y)182     I32 Builder:: gt(I32 x, I32 y) { return {this->push(Op:: gt_i32, x.id, y.id)}; }
gte(I32 x,I32 y)183     I32 Builder::gte(I32 x, I32 y) { return {this->push(Op::gte_i32, x.id, y.id)}; }
184 
eq_16x2(I32 x,I32 y)185     I32 Builder:: eq_16x2(I32 x, I32 y) { return {this->push(Op:: eq_i16x2, x.id, y.id)}; }
neq_16x2(I32 x,I32 y)186     I32 Builder::neq_16x2(I32 x, I32 y) { return {this->push(Op::neq_i16x2, x.id, y.id)}; }
lt_16x2(I32 x,I32 y)187     I32 Builder:: lt_16x2(I32 x, I32 y) { return {this->push(Op:: lt_i16x2, x.id, y.id)}; }
lte_16x2(I32 x,I32 y)188     I32 Builder::lte_16x2(I32 x, I32 y) { return {this->push(Op::lte_i16x2, x.id, y.id)}; }
gt_16x2(I32 x,I32 y)189     I32 Builder:: gt_16x2(I32 x, I32 y) { return {this->push(Op:: gt_i16x2, x.id, y.id)}; }
gte_16x2(I32 x,I32 y)190     I32 Builder::gte_16x2(I32 x, I32 y) { return {this->push(Op::gte_i16x2, x.id, y.id)}; }
191 
bit_and(I32 x,I32 y)192     I32 Builder::bit_and  (I32 x, I32 y) { return {this->push(Op::bit_and  , x.id, y.id)}; }
bit_or(I32 x,I32 y)193     I32 Builder::bit_or   (I32 x, I32 y) { return {this->push(Op::bit_or   , x.id, y.id)}; }
bit_xor(I32 x,I32 y)194     I32 Builder::bit_xor  (I32 x, I32 y) { return {this->push(Op::bit_xor  , x.id, y.id)}; }
bit_clear(I32 x,I32 y)195     I32 Builder::bit_clear(I32 x, I32 y) { return {this->push(Op::bit_clear, x.id, y.id)}; }
select(I32 x,I32 y,I32 z)196     I32 Builder::select(I32 x, I32 y, I32 z) { return {this->push(Op::select, x.id, y.id, z.id)}; }
197 
198 
extract(I32 x,int bits,I32 y)199     I32 Builder::extract(I32 x, int bits, I32 y) {
200         return {this->push(Op::extract, x.id,y.id,NA, bits)};
201     }
202 
pack(I32 x,I32 y,int bits)203     I32 Builder::pack(I32 x, I32 y, int bits) {
204         return {this->push(Op::pack, x.id,y.id,NA, bits)};
205     }
206 
bytes(I32 x,int control)207     I32 Builder::bytes(I32 x, int control) {
208         return {this->push(Op::bytes, x.id,NA,NA, control)};
209     }
210 
to_f32(I32 x)211     F32 Builder::to_f32(I32 x) { return {this->push(Op::to_f32, x.id)}; }
to_i32(F32 x)212     I32 Builder::to_i32(F32 x) { return {this->push(Op::to_i32, x.id)}; }
213 
214     // ~~~~ Program::eval() and co. ~~~~ //
215 
216     // Handy references for x86-64 instruction encoding:
217     // https://wiki.osdev.org/X86-64_Instruction_Encoding
218     // https://www-user.tu-chemnitz.de/~heha/viewchm.php/hs/x86.chm/x64.htm
219     // https://www-user.tu-chemnitz.de/~heha/viewchm.php/hs/x86.chm/x86.htm
220     // http://ref.x86asm.net/coder64.html
221 
222     // Used for ModRM / immediate instruction encoding.
// Packs three fields into one byte as 2 bits (a), 3 bits (b), 3 bits (c), from high to low.
// Each input is masked to its field width first, so out-of-range bits are dropped.
static uint8_t _233(int a, int b, int c) {
    const int top    = (a & 0b011) << 6;
    const int middle = (b & 0b111) << 3;
    const int bottom = (c & 0b111);
    return top | middle | bottom;
}
228 
229     // ModRM byte encodes the arguments of an opcode.
230     enum class Mod { Indirect, OneByteImm, FourByteImm, Direct };
mod_rm(Mod mod,int reg,int rm)231     static uint8_t mod_rm(Mod mod, int reg, int rm) {
232         return _233((int)mod, reg, rm);
233     }
234 
mod(int imm)235     static Mod mod(int imm) {
236         if (imm == 0)               { return Mod::Indirect; }
237         if (SkTFitsIn<int8_t>(imm)) { return Mod::OneByteImm; }
238         return Mod::FourByteImm;
239     }
240 
imm_bytes(Mod mod)241     static int imm_bytes(Mod mod) {
242         switch (mod) {
243             case Mod::Indirect:    return 0;
244             case Mod::OneByteImm:  return 1;
245             case Mod::FourByteImm: return 4;
246             case Mod::Direct: SkUNREACHABLE;
247         }
248         SkUNREACHABLE;
249     }
250 
251 #if 0
252     // SIB byte encodes a memory address, base + (index * scale).
253     enum class Scale { One, Two, Four, Eight };
254     static uint8_t sib(Scale scale, int index, int base) {
255         return _233((int)scale, index, base);
256     }
257 #endif
258 
259     // The REX prefix is used to extend most old 32-bit instructions to 64-bit.
// Builds a REX prefix byte: fixed 0100 in the high nibble, then W, R, X, B from bit 3 to bit 0.
//   W: if set, operation is 64-bit, otherwise default, usually 32-bit.
//   R: extra top bit to select ModRM reg, registers 8-15.
//   X: extra top bit for SIB index register.
//   B: extra top bit for SIB base or ModRM rm register.
static uint8_t rex(bool W, bool R, bool X, bool B) {
    int prefix = 0b0100'0000;
    if (W) { prefix |= 0b1000; }
    if (R) { prefix |= 0b0100; }
    if (X) { prefix |= 0b0010; }
    if (B) { prefix |= 0b0001; }
    return prefix;
}
270 
271 
272     // The VEX prefix extends SSE operations to AVX.  Used generally, even with XMM.
273     struct VEX {
274         int     len;
275         uint8_t bytes[3];
276     };
277 
vex(bool WE,bool R,bool X,bool B,int map,int vvvv,bool L,int pp)278     static VEX vex(bool  WE,   // Like REX W for int operations, or opcode extension for float?
279                    bool   R,   // Same as REX R.  Pass high bit of dst register, dst>>3.
280                    bool   X,   // Same as REX X.
281                    bool   B,   // Same as REX B.  Pass y>>3 for 3-arg ops, x>>3 for 2-arg.
282                    int  map,   // SSE opcode map selector: 0x0f, 0x380f, 0x3a0f.
283                    int vvvv,   // 4-bit second operand register.  Pass our x for 3-arg ops.
284                    bool   L,   // Set for 256-bit ymm operations, off for 128-bit xmm.
285                    int   pp) { // SSE mandatory prefix: 0x66, 0xf3, 0xf2, else none.
286 
287         // Pack x86 opcode map selector to 5-bit VEX encoding.
288         map = [map]{
289             switch (map) {
290                 case   0x0f: return 0b00001;
291                 case 0x380f: return 0b00010;
292                 case 0x3a0f: return 0b00011;
293                 // Several more cases only used by XOP / TBM.
294             }
295             SkUNREACHABLE;
296         }();
297 
298         // Pack  mandatory SSE opcode prefix byte to 2-bit VEX encoding.
299         pp = [pp]{
300             switch (pp) {
301                 case 0x66: return 0b01;
302                 case 0xf3: return 0b10;
303                 case 0xf2: return 0b11;
304             }
305             return 0b00;
306         }();
307 
308         VEX vex = {0, {0,0,0}};
309         if (X == 0 && B == 0 && WE == 0 && map == 0b00001) {
310             // With these conditions met, we can optionally compress VEX to 2-byte.
311             vex.len = 2;
312             vex.bytes[0] = 0xc5;
313             vex.bytes[1] = (pp      &  3) << 0
314                          | (L       &  1) << 2
315                          | (~vvvv   & 15) << 3
316                          | (~(int)R &  1) << 7;
317         } else {
318             // We could use this 3-byte VEX prefix all the time if we like.
319             vex.len = 3;
320             vex.bytes[0] = 0xc4;
321             vex.bytes[1] = (map     & 31) << 0
322                          | (~(int)B &  1) << 5
323                          | (~(int)X &  1) << 6
324                          | (~(int)R &  1) << 7;
325             vex.bytes[2] = (pp    &  3) << 0
326                          | (L     &  1) << 2
327                          | (~vvvv & 15) << 3
328                          | (WE    &  1) << 7;
329         }
330         return vex;
331     }
332 
Assembler(void * buf)333     Assembler::Assembler(void* buf) : fCode((uint8_t*)buf), fCurr(fCode), fSize(0) {}
334 
size() const335     size_t Assembler::size() const { return fSize; }
336 
bytes(const void * p,int n)337     void Assembler::bytes(const void* p, int n) {
338         if (fCurr) {
339             memcpy(fCurr, p, n);
340             fCurr += n;
341         }
342         fSize += n;
343     }
344 
byte(uint8_t b)345     void Assembler::byte(uint8_t b) { this->bytes(&b, 1); }
word(uint32_t w)346     void Assembler::word(uint32_t w) { this->bytes(&w, 4); }
347 
align(int mod)348     void Assembler::align(int mod) {
349         while (this->size() % mod) {
350             this->byte(0x00);
351         }
352     }
353 
vzeroupper()354     void Assembler::vzeroupper() {
355         this->byte(0xc5);
356         this->byte(0xf8);
357         this->byte(0x77);
358     }
ret()359     void Assembler::ret() { this->byte(0xc3); }
360 
361     // Common instruction building for 64-bit opcodes with an immediate argument.
op(int opcode,int opcode_ext,GP64 dst,int imm)362     void Assembler::op(int opcode, int opcode_ext, GP64 dst, int imm) {
363         opcode |= 0b0000'0001;   // low bit set for 64-bit operands
364         opcode |= 0b1000'0000;   // top bit set for instructions with any immediate
365 
366         int imm_bytes = 4;
367         if (SkTFitsIn<int8_t>(imm)) {
368             imm_bytes = 1;
369             opcode |= 0b0000'0010;  // second bit set for 8-bit immediate, else 32-bit.
370         }
371 
372         this->byte(rex(1,0,0,dst>>3));
373         this->byte(opcode);
374         this->byte(mod_rm(Mod::Direct, opcode_ext, dst&7));
375         this->bytes(&imm, imm_bytes);
376     }
377 
add(GP64 dst,int imm)378     void Assembler::add(GP64 dst, int imm) { this->op(0,0b000, dst,imm); }
sub(GP64 dst,int imm)379     void Assembler::sub(GP64 dst, int imm) { this->op(0,0b101, dst,imm); }
cmp(GP64 reg,int imm)380     void Assembler::cmp(GP64 reg, int imm) { this->op(0,0b111, reg,imm); }
381 
op(int prefix,int map,int opcode,Ymm dst,Ymm x,Ymm y,bool W)382     void Assembler::op(int prefix, int map, int opcode, Ymm dst, Ymm x, Ymm y, bool W/*=false*/) {
383         VEX v = vex(W, dst>>3, 0, y>>3,
384                     map, x, 1/*ymm, not xmm*/, prefix);
385         this->bytes(v.bytes, v.len);
386         this->byte(opcode);
387         this->byte(mod_rm(Mod::Direct, dst&7, y&7));
388     }
389 
vpaddd(Ymm dst,Ymm x,Ymm y)390     void Assembler::vpaddd (Ymm dst, Ymm x, Ymm y) { this->op(0x66,  0x0f,0xfe, dst,x,y); }
vpsubd(Ymm dst,Ymm x,Ymm y)391     void Assembler::vpsubd (Ymm dst, Ymm x, Ymm y) { this->op(0x66,  0x0f,0xfa, dst,x,y); }
vpmulld(Ymm dst,Ymm x,Ymm y)392     void Assembler::vpmulld(Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x380f,0x40, dst,x,y); }
393 
vpsubw(Ymm dst,Ymm x,Ymm y)394     void Assembler::vpsubw (Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x0f,0xf9, dst,x,y); }
vpmullw(Ymm dst,Ymm x,Ymm y)395     void Assembler::vpmullw(Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x0f,0xd5, dst,x,y); }
396 
vpand(Ymm dst,Ymm x,Ymm y)397     void Assembler::vpand (Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x0f,0xdb, dst,x,y); }
vpor(Ymm dst,Ymm x,Ymm y)398     void Assembler::vpor  (Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x0f,0xeb, dst,x,y); }
vpxor(Ymm dst,Ymm x,Ymm y)399     void Assembler::vpxor (Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x0f,0xef, dst,x,y); }
vpandn(Ymm dst,Ymm x,Ymm y)400     void Assembler::vpandn(Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x0f,0xdf, dst,x,y); }
401 
vaddps(Ymm dst,Ymm x,Ymm y)402     void Assembler::vaddps(Ymm dst, Ymm x, Ymm y) { this->op(0,0x0f,0x58, dst,x,y); }
vsubps(Ymm dst,Ymm x,Ymm y)403     void Assembler::vsubps(Ymm dst, Ymm x, Ymm y) { this->op(0,0x0f,0x5c, dst,x,y); }
vmulps(Ymm dst,Ymm x,Ymm y)404     void Assembler::vmulps(Ymm dst, Ymm x, Ymm y) { this->op(0,0x0f,0x59, dst,x,y); }
vdivps(Ymm dst,Ymm x,Ymm y)405     void Assembler::vdivps(Ymm dst, Ymm x, Ymm y) { this->op(0,0x0f,0x5e, dst,x,y); }
406 
vfmadd132ps(Ymm dst,Ymm x,Ymm y)407     void Assembler::vfmadd132ps(Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x380f,0x98, dst,x,y); }
vfmadd213ps(Ymm dst,Ymm x,Ymm y)408     void Assembler::vfmadd213ps(Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x380f,0xa8, dst,x,y); }
vfmadd231ps(Ymm dst,Ymm x,Ymm y)409     void Assembler::vfmadd231ps(Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x380f,0xb8, dst,x,y); }
410 
vpackusdw(Ymm dst,Ymm x,Ymm y)411     void Assembler::vpackusdw(Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x380f,0x2b, dst,x,y); }
vpackuswb(Ymm dst,Ymm x,Ymm y)412     void Assembler::vpackuswb(Ymm dst, Ymm x, Ymm y) { this->op(0x66,  0x0f,0x67, dst,x,y); }
413 
vpcmpeqd(Ymm dst,Ymm x,Ymm y)414     void Assembler::vpcmpeqd(Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x0f,0x76, dst,x,y); }
vpcmpgtd(Ymm dst,Ymm x,Ymm y)415     void Assembler::vpcmpgtd(Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x0f,0x66, dst,x,y); }
416 
vpblendvb(Ymm dst,Ymm x,Ymm y,Ymm z)417     void Assembler::vpblendvb(Ymm dst, Ymm x, Ymm y, Ymm z) {
418         int prefix = 0x66,
419             map    = 0x3a0f,
420             opcode = 0x4c;
421         VEX v = vex(0, dst>>3, 0, y>>3,
422                     map, x, /*ymm?*/1, prefix);
423         this->bytes(v.bytes, v.len);
424         this->byte(opcode);
425         this->byte(mod_rm(Mod::Direct, dst&7, y&7));
426         this->byte(z << 4);
427     }
428 
429     // dst = x op /opcode_ext imm
op(int prefix,int map,int opcode,int opcode_ext,Ymm dst,Ymm x,int imm)430     void Assembler::op(int prefix, int map, int opcode, int opcode_ext, Ymm dst, Ymm x, int imm) {
431         // This is a little weird, but if we pass the opcode_ext as if it were the dst register,
432         // the dst register as if x, and the x register as if y, all the bits end up where we want.
433         this->op(prefix, map, opcode, (Ymm)opcode_ext,dst,x);
434         this->byte(imm);
435     }
436 
vpslld(Ymm dst,Ymm x,int imm)437     void Assembler::vpslld(Ymm dst, Ymm x, int imm) { this->op(0x66,0x0f,0x72,6, dst,x,imm); }
vpsrld(Ymm dst,Ymm x,int imm)438     void Assembler::vpsrld(Ymm dst, Ymm x, int imm) { this->op(0x66,0x0f,0x72,2, dst,x,imm); }
vpsrad(Ymm dst,Ymm x,int imm)439     void Assembler::vpsrad(Ymm dst, Ymm x, int imm) { this->op(0x66,0x0f,0x72,4, dst,x,imm); }
440 
vpsrlw(Ymm dst,Ymm x,int imm)441     void Assembler::vpsrlw(Ymm dst, Ymm x, int imm) { this->op(0x66,0x0f,0x71,2, dst,x,imm); }
442 
443 
vpermq(Ymm dst,Ymm x,int imm)444     void Assembler::vpermq(Ymm dst, Ymm x, int imm) {
445         // A bit unusual among the instructions we use, this is 64-bit operation, so we set W.
446         bool W = true;
447         this->op(0x66,0x3a0f,0x00, dst,x,W);
448         this->byte(imm);
449     }
450 
vmovdqa(Ymm dst,Ymm src)451     void Assembler::vmovdqa(Ymm dst, Ymm src) { this->op(0x66,0x0f,0x6f, dst,src); }
452 
vcvtdq2ps(Ymm dst,Ymm x)453     void Assembler::vcvtdq2ps (Ymm dst, Ymm x) { this->op(0,   0x0f,0x5b, dst,x); }
vcvttps2dq(Ymm dst,Ymm x)454     void Assembler::vcvttps2dq(Ymm dst, Ymm x) { this->op(0xf3,0x0f,0x5b, dst,x); }
455 
here()456     Assembler::Label Assembler::here() {
457         return { (int)this->size(), Label::None, {} };
458     }
459 
disp19(Label * l)460     int Assembler::disp19(Label* l) {
461         SkASSERT(l->kind == Label::None ||
462                  l->kind == Label::ARMDisp19);
463         l->kind = Label::ARMDisp19;
464         l->references.push_back(here().offset);
465         // ARM 19-bit instruction count, from the beginning of this instruction.
466         return (l->offset - here().offset) / 4;
467     }
468 
disp32(Label * l)469     int Assembler::disp32(Label* l) {
470         SkASSERT(l->kind == Label::None ||
471                  l->kind == Label::X86Disp32);
472         l->kind = Label::X86Disp32;
473         l->references.push_back(here().offset);
474         // x86 32-bit byte count, from the end of this instruction.
475         return l->offset - (here().offset + 4);
476     }
477 
op(int prefix,int map,int opcode,Ymm dst,Ymm x,Label * l)478     void Assembler::op(int prefix, int map, int opcode, Ymm dst, Ymm x, Label* l) {
479         // IP-relative addressing uses Mod::Indirect with the R/M encoded as-if rbp or r13.
480         const int rip = rbp;
481 
482         VEX v = vex(0, dst>>3, 0, rip>>3,
483                     map, x, /*ymm?*/1, prefix);
484         this->bytes(v.bytes, v.len);
485         this->byte(opcode);
486         this->byte(mod_rm(Mod::Indirect, dst&7, rip&7));
487         this->word(this->disp32(l));
488     }
489 
vpshufb(Ymm dst,Ymm x,Label * l)490     void Assembler::vpshufb(Ymm dst, Ymm x, Label* l) { this->op(0x66,0x380f,0x00, dst,x,l); }
491 
vbroadcastss(Ymm dst,Label * l)492     void Assembler::vbroadcastss(Ymm dst, Label* l) { this->op(0x66,0x380f,0x18, dst, (Ymm)0, l); }
vbroadcastss(Ymm dst,Xmm src)493     void Assembler::vbroadcastss(Ymm dst, Xmm src)  { this->op(0x66,0x380f,0x18, dst, (Ymm)src); }
vbroadcastss(Ymm dst,GP64 ptr,int off)494     void Assembler::vbroadcastss(Ymm dst, GP64 ptr, int off) {
495         int prefix = 0x66,
496                map = 0x380f,
497             opcode = 0x18;
498         VEX v = vex(0, dst>>3, 0, ptr>>3,
499                     map, 0, /*ymm?*/1, prefix);
500         this->bytes(v.bytes, v.len);
501         this->byte(opcode);
502 
503         this->byte(mod_rm(mod(off), dst&7, ptr&7));
504         this->bytes(&off, imm_bytes(mod(off)));
505     }
506 
jump(uint8_t condition,Label * l)507     void Assembler::jump(uint8_t condition, Label* l) {
508         // These conditional jumps can be either 2 bytes (short) or 6 bytes (near):
509         //    7?     one-byte-disp
510         //    0F 8? four-byte-disp
511         // We always use the near displacement to make updating labels simpler (no resizing).
512         this->byte(0x0f);
513         this->byte(condition);
514         this->word(this->disp32(l));
515     }
je(Label * l)516     void Assembler::je (Label* l) { this->jump(0x84, l); }
jne(Label * l)517     void Assembler::jne(Label* l) { this->jump(0x85, l); }
jl(Label * l)518     void Assembler::jl (Label* l) { this->jump(0x8c, l); }
519 
jmp(Label * l)520     void Assembler::jmp(Label* l) {
521         // Like above in jump(), we could use 8-bit displacement here, but always use 32-bit.
522         this->byte(0xe9);
523         this->word(this->disp32(l));
524     }
525 
load_store(int prefix,int map,int opcode,Ymm ymm,GP64 ptr)526     void Assembler::load_store(int prefix, int map, int opcode, Ymm ymm, GP64 ptr) {
527         VEX v = vex(0, ymm>>3, 0, ptr>>3,
528                     map, 0, /*ymm?*/1, prefix);
529         this->bytes(v.bytes, v.len);
530         this->byte(opcode);
531         this->byte(mod_rm(Mod::Indirect, ymm&7, ptr&7));
532     }
533 
vmovups(Ymm dst,GP64 src)534     void Assembler::vmovups  (Ymm dst, GP64 src) { this->load_store(0   ,  0x0f,0x10, dst,src); }
vpmovzxwd(Ymm dst,GP64 src)535     void Assembler::vpmovzxwd(Ymm dst, GP64 src) { this->load_store(0x66,0x380f,0x33, dst,src); }
vpmovzxbd(Ymm dst,GP64 src)536     void Assembler::vpmovzxbd(Ymm dst, GP64 src) { this->load_store(0x66,0x380f,0x31, dst,src); }
537 
vmovups(GP64 dst,Ymm src)538     void Assembler::vmovups  (GP64 dst, Ymm src) { this->load_store(0   ,  0x0f,0x11, src,dst); }
vmovups(GP64 dst,Xmm src)539     void Assembler::vmovups  (GP64 dst, Xmm src) {
540         // Same as vmovups(GP64,YMM) and load_store() except ymm? is 0.
541         int prefix = 0,
542             map    = 0x0f,
543             opcode = 0x11;
544         VEX v = vex(0, src>>3, 0, dst>>3,
545                     map, 0, /*ymm?*/0, prefix);
546         this->bytes(v.bytes, v.len);
547         this->byte(opcode);
548         this->byte(mod_rm(Mod::Indirect, src&7, dst&7));
549     }
550 
vmovq(GP64 dst,Xmm src)551     void Assembler::vmovq(GP64 dst, Xmm src) {
552         int prefix = 0x66,
553             map    = 0x0f,
554             opcode = 0xd6;
555         VEX v = vex(0, src>>3, 0, dst>>3,
556                     map, 0, /*ymm?*/0, prefix);
557         this->bytes(v.bytes, v.len);
558         this->byte(opcode);
559         this->byte(mod_rm(Mod::Indirect, src&7, dst&7));
560     }
561 
vmovd(GP64 dst,Xmm src)562     void Assembler::vmovd(GP64 dst, Xmm src) {
563         int prefix = 0x66,
564             map    = 0x0f,
565             opcode = 0x7e;
566         VEX v = vex(0, src>>3, 0, dst>>3,
567                     map, 0, /*ymm?*/0, prefix);
568         this->bytes(v.bytes, v.len);
569         this->byte(opcode);
570         this->byte(mod_rm(Mod::Indirect, src&7, dst&7));
571     }
572 
vmovd_direct(GP64 dst,Xmm src)573     void Assembler::vmovd_direct(GP64 dst, Xmm src) {
574         int prefix = 0x66,
575             map    = 0x0f,
576             opcode = 0x7e;
577         VEX v = vex(0, src>>3, 0, dst>>3,
578                     map, 0, /*ymm?*/0, prefix);
579         this->bytes(v.bytes, v.len);
580         this->byte(opcode);
581         this->byte(mod_rm(Mod::Direct, src&7, dst&7));
582     }
583 
vmovd(Xmm dst,GP64 src)584     void Assembler::vmovd(Xmm dst, GP64 src) {
585         int prefix = 0x66,
586             map    = 0x0f,
587             opcode = 0x6e;
588         VEX v = vex(0, dst>>3, 0, src>>3,
589                     map, 0, /*ymm?*/0, prefix);
590         this->bytes(v.bytes, v.len);
591         this->byte(opcode);
592         this->byte(mod_rm(Mod::Indirect, dst&7, src&7));
593     }
594 
vmovd_direct(Xmm dst,GP64 src)595     void Assembler::vmovd_direct(Xmm dst, GP64 src) {
596         int prefix = 0x66,
597             map    = 0x0f,
598             opcode = 0x6e;
599         VEX v = vex(0, dst>>3, 0, src>>3,
600                     map, 0, /*ymm?*/0, prefix);
601         this->bytes(v.bytes, v.len);
602         this->byte(opcode);
603         this->byte(mod_rm(Mod::Direct, dst&7, src&7));
604     }
605 
movzbl(GP64 dst,GP64 src,int off)606     void Assembler::movzbl(GP64 dst, GP64 src, int off) {
607         if ((dst>>3) || (src>>3)) {
608             this->byte(rex(0,dst>>3,0,src>>3));
609         }
610         this->byte(0x0f);
611         this->byte(0xb6);
612         this->byte(mod_rm(mod(off), dst&7, src&7));
613         this->bytes(&off, imm_bytes(mod(off)));
614     }
615 
616 
movb(GP64 dst,GP64 src)617     void Assembler::movb(GP64 dst, GP64 src) {
618         if ((dst>>3) || (src>>3)) {
619             this->byte(rex(0,src>>3,0,dst>>3));
620         }
621         this->byte(0x88);
622         this->byte(mod_rm(Mod::Indirect, src&7, dst&7));
623     }
624 
vpinsrw(Xmm dst,Xmm src,GP64 ptr,int imm)625     void Assembler::vpinsrw(Xmm dst, Xmm src, GP64 ptr, int imm) {
626         int prefix = 0x66,
627             map    = 0x0f,
628             opcode = 0xc4;
629         VEX v = vex(0, dst>>3, 0, ptr>>3,
630                     map, src, /*ymm?*/0, prefix);
631         this->bytes(v.bytes, v.len);
632         this->byte(opcode);
633         this->byte(mod_rm(Mod::Indirect, dst&7, ptr&7));
634         this->byte(imm);
635     }
636 
vpinsrb(Xmm dst,Xmm src,GP64 ptr,int imm)637     void Assembler::vpinsrb(Xmm dst, Xmm src, GP64 ptr, int imm) {
638         int prefix = 0x66,
639             map    = 0x3a0f,
640             opcode = 0x20;
641         VEX v = vex(0, dst>>3, 0, ptr>>3,
642                     map, src, /*ymm?*/0, prefix);
643         this->bytes(v.bytes, v.len);
644         this->byte(opcode);
645         this->byte(mod_rm(Mod::Indirect, dst&7, ptr&7));
646         this->byte(imm);
647     }
648 
vpextrw(GP64 ptr,Xmm src,int imm)649     void Assembler::vpextrw(GP64 ptr, Xmm src, int imm) {
650         int prefix = 0x66,
651             map    = 0x3a0f,
652             opcode = 0x15;
653 
654         VEX v = vex(0, src>>3, 0, ptr>>3,
655                     map, 0, /*ymm?*/0, prefix);
656         this->bytes(v.bytes, v.len);
657         this->byte(opcode);
658         this->byte(mod_rm(Mod::Indirect, src&7, ptr&7));
659         this->byte(imm);
660     }
vpextrb(GP64 ptr,Xmm src,int imm)661     void Assembler::vpextrb(GP64 ptr, Xmm src, int imm) {
662         int prefix = 0x66,
663             map    = 0x3a0f,
664             opcode = 0x14;
665 
666         VEX v = vex(0, src>>3, 0, ptr>>3,
667                     map, 0, /*ymm?*/0, prefix);
668         this->bytes(v.bytes, v.len);
669         this->byte(opcode);
670         this->byte(mod_rm(Mod::Indirect, src&7, ptr&7));
671         this->byte(imm);
672     }
673 
674     // https://static.docs.arm.com/ddi0596/a/DDI_0596_ARM_a64_instruction_set_architecture.pdf
675 
// N_mask is an int with the low N bits set, e.g. 5_mask == 0b11111.
static int operator"" _mask(unsigned long long bits) {
    const int width = (int)bits;
    return (1 << width) - 1;
}
677 
op(uint32_t hi,V m,uint32_t lo,V n,V d)678     void Assembler::op(uint32_t hi, V m, uint32_t lo, V n, V d) {
679         this->word( (hi & 11_mask) << 21
680                   | (m  &  5_mask) << 16
681                   | (lo &  6_mask) << 10
682                   | (n  &  5_mask) <<  5
683                   | (d  &  5_mask) <<  0);
684     }
685 
and16b(V d,V n,V m)686     void Assembler::and16b(V d, V n, V m) { this->op(0b0'1'0'01110'00'1, m, 0b00011'1, n, d); }
orr16b(V d,V n,V m)687     void Assembler::orr16b(V d, V n, V m) { this->op(0b0'1'0'01110'10'1, m, 0b00011'1, n, d); }
eor16b(V d,V n,V m)688     void Assembler::eor16b(V d, V n, V m) { this->op(0b0'1'1'01110'00'1, m, 0b00011'1, n, d); }
bic16b(V d,V n,V m)689     void Assembler::bic16b(V d, V n, V m) { this->op(0b0'1'0'01110'01'1, m, 0b00011'1, n, d); }
690 
add4s(V d,V n,V m)691     void Assembler::add4s(V d, V n, V m) { this->op(0b0'1'0'01110'10'1, m, 0b10000'1, n, d); }
sub4s(V d,V n,V m)692     void Assembler::sub4s(V d, V n, V m) { this->op(0b0'1'1'01110'10'1, m, 0b10000'1, n, d); }
mul4s(V d,V n,V m)693     void Assembler::mul4s(V d, V n, V m) { this->op(0b0'1'0'01110'10'1, m, 0b10011'1, n, d); }
694 
sub8h(V d,V n,V m)695     void Assembler::sub8h(V d, V n, V m) { this->op(0b0'1'1'01110'01'1, m, 0b10000'1, n, d); }
mul8h(V d,V n,V m)696     void Assembler::mul8h(V d, V n, V m) { this->op(0b0'1'0'01110'01'1, m, 0b10011'1, n, d); }
697 
fadd4s(V d,V n,V m)698     void Assembler::fadd4s(V d, V n, V m) { this->op(0b0'1'0'01110'0'0'1, m, 0b11010'1, n, d); }
fsub4s(V d,V n,V m)699     void Assembler::fsub4s(V d, V n, V m) { this->op(0b0'1'0'01110'1'0'1, m, 0b11010'1, n, d); }
fmul4s(V d,V n,V m)700     void Assembler::fmul4s(V d, V n, V m) { this->op(0b0'1'1'01110'0'0'1, m, 0b11011'1, n, d); }
fdiv4s(V d,V n,V m)701     void Assembler::fdiv4s(V d, V n, V m) { this->op(0b0'1'1'01110'0'0'1, m, 0b11111'1, n, d); }
702 
fmla4s(V d,V n,V m)703     void Assembler::fmla4s(V d, V n, V m) { this->op(0b0'1'0'01110'0'0'1, m, 0b11001'1, n, d); }
704 
tbl(V d,V n,V m)705     void Assembler::tbl(V d, V n, V m) { this->op(0b0'1'001110'00'0, m, 0b0'00'0'00, n, d); }
706 
op(uint32_t op22,int imm,V n,V d)707     void Assembler::op(uint32_t op22, int imm, V n, V d) {
708         this->word( (op22 & 22_mask) << 10
709                   | imm              << 16   // imm is embedded inside op, bit size depends on op
710                   | (n    &  5_mask) <<  5
711                   | (d    &  5_mask) <<  0);
712     }
713 
sli4s(V d,V n,int imm)714     void Assembler::sli4s(V d, V n, int imm) {
715         this->op(0b0'1'1'011110'0100'000'01010'1,    ( imm&31), n, d);
716     }
shl4s(V d,V n,int imm)717     void Assembler::shl4s(V d, V n, int imm) {
718         this->op(0b0'1'0'011110'0100'000'01010'1,    ( imm&31), n, d);
719     }
sshr4s(V d,V n,int imm)720     void Assembler::sshr4s(V d, V n, int imm) {
721         this->op(0b0'1'0'011110'0100'000'00'0'0'0'1, (-imm&31), n, d);
722     }
ushr4s(V d,V n,int imm)723     void Assembler::ushr4s(V d, V n, int imm) {
724         this->op(0b0'1'1'011110'0100'000'00'0'0'0'1, (-imm&31), n, d);
725     }
ushr8h(V d,V n,int imm)726     void Assembler::ushr8h(V d, V n, int imm) {
727         this->op(0b0'1'1'011110'0010'000'00'0'0'0'1, (-imm&15), n, d);
728     }
729 
scvtf4s(V d,V n)730     void Assembler::scvtf4s (V d, V n) { this->op(0b0'1'0'01110'0'0'10000'11101'10, n,d); }
fcvtzs4s(V d,V n)731     void Assembler::fcvtzs4s(V d, V n) { this->op(0b0'1'0'01110'1'0'10000'1101'1'10, n,d); }
732 
xtns2h(V d,V n)733     void Assembler::xtns2h(V d, V n) { this->op(0b0'0'0'01110'01'10000'10010'10, n,d); }
xtnh2b(V d,V n)734     void Assembler::xtnh2b(V d, V n) { this->op(0b0'0'0'01110'00'10000'10010'10, n,d); }
735 
uxtlb2h(V d,V n)736     void Assembler::uxtlb2h(V d, V n) { this->op(0b0'0'1'011110'0001'000'10100'1, n,d); }
uxtlh2s(V d,V n)737     void Assembler::uxtlh2s(V d, V n) { this->op(0b0'0'1'011110'0010'000'10100'1, n,d); }
738 
ret(X n)739     void Assembler::ret(X n) {
740         this->word(0b1101011'0'0'10'11111'0000'0'0 << 10
741                   | (n & 5_mask) << 5);
742     }
743 
add(X d,X n,int imm12)744     void Assembler::add(X d, X n, int imm12) {
745         this->word(0b1'0'0'10001'00   << 22
746                   | (imm12 & 12_mask) << 10
747                   | (n     &  5_mask) <<  5
748                   | (d     &  5_mask) <<  0);
749     }
sub(X d,X n,int imm12)750     void Assembler::sub(X d, X n, int imm12) {
751         this->word( 0b1'1'0'10001'00  << 22
752                   | (imm12 & 12_mask) << 10
753                   | (n     &  5_mask) <<  5
754                   | (d     &  5_mask) <<  0);
755     }
subs(X d,X n,int imm12)756     void Assembler::subs(X d, X n, int imm12) {
757         this->word( 0b1'1'1'10001'00  << 22
758                   | (imm12 & 12_mask) << 10
759                   | (n     &  5_mask) <<  5
760                   | (d     &  5_mask) <<  0);
761     }
762 
b(Condition cond,Label * l)763     void Assembler::b(Condition cond, Label* l) {
764         const int imm19 = this->disp19(l);
765         this->word( 0b0101010'0           << 24
766                   | (imm19     & 19_mask) <<  5
767                   | ((int)cond &  4_mask) <<  0);
768     }
cbz(X t,Label * l)769     void Assembler::cbz(X t, Label* l) {
770         const int imm19 = this->disp19(l);
771         this->word( 0b1'011010'0      << 24
772                   | (imm19 & 19_mask) <<  5
773                   | (t     &  5_mask) <<  0);
774     }
cbnz(X t,Label * l)775     void Assembler::cbnz(X t, Label* l) {
776         const int imm19 = this->disp19(l);
777         this->word( 0b1'011010'1      << 24
778                   | (imm19 & 19_mask) <<  5
779                   | (t     &  5_mask) <<  0);
780     }
781 
ldrq(V dst,X src)782     void Assembler::ldrq(V dst, X src) { this->op(0b00'111'1'01'11'000000000000, src, dst); }
ldrs(V dst,X src)783     void Assembler::ldrs(V dst, X src) { this->op(0b10'111'1'01'01'000000000000, src, dst); }
ldrb(V dst,X src)784     void Assembler::ldrb(V dst, X src) { this->op(0b00'111'1'01'01'000000000000, src, dst); }
785 
strq(V src,X dst)786     void Assembler::strq(V src, X dst) { this->op(0b00'111'1'01'10'000000000000, dst, src); }
strs(V src,X dst)787     void Assembler::strs(V src, X dst) { this->op(0b10'111'1'01'00'000000000000, dst, src); }
strb(V src,X dst)788     void Assembler::strb(V src, X dst) { this->op(0b00'111'1'01'00'000000000000, dst, src); }
789 
ldrq(V dst,Label * l)790     void Assembler::ldrq(V dst, Label* l) {
791         const int imm19 = this->disp19(l);
792         this->word( 0b10'011'1'00     << 24
793                   | (imm19 & 19_mask) << 5
794                   | (dst   &  5_mask) << 0);
795     }
796 
label(Label * l)797     void Assembler::label(Label* l) {
798         if (fCode) {
799             // The instructions all currently point to l->offset.
800             // We'll want to add a delta to point them to here().
801             int delta = here().offset - l->offset;
802             l->offset = here().offset;
803 
804             if (l->kind == Label::ARMDisp19) {
805                 for (int ref : l->references) {
806                     // ref points to a 32-bit instruction with 19-bit displacement in instructions.
807                     uint32_t inst;
808                     memcpy(&inst, fCode + ref, 4);
809 
810                     // [ 8 bits to preserve] [ 19 bit signed displacement ] [ 5 bits to preserve ]
811                     int disp = (int)(inst << 8) >> 13;
812 
813                     disp += delta/4;  // delta is in bytes, we want instructions.
814 
815                     // Put it all back together, preserving the high 8 bits and low 5.
816                     inst = ((disp << 5) &  (19_mask << 5))
817                          | ((inst     ) & ~(19_mask << 5));
818 
819                     memcpy(fCode + ref, &inst, 4);
820                 }
821             }
822 
823             if (l->kind == Label::X86Disp32) {
824                 for (int ref : l->references) {
825                     // ref points to a 32-bit displacement in bytes.
826                     int disp;
827                     memcpy(&disp, fCode + ref, 4);
828 
829                     disp += delta;
830 
831                     memcpy(fCode + ref, &disp, 4);
832                 }
833             }
834         }
835     }
836 
eval(int n,void * args[]) const837     void Program::eval(int n, void* args[]) const {
838         const int nargs = (int)fStrides.size();
839 
840         if (fJITBuf) {
841             void** a = args;
842             const void* b = fJITBuf;
843             switch (nargs) {
844                 case 0: return ((void(*)(int                        ))b)(n                    );
845                 case 1: return ((void(*)(int,void*                  ))b)(n,a[0]               );
846                 case 2: return ((void(*)(int,void*,void*            ))b)(n,a[0],a[1]          );
847                 case 3: return ((void(*)(int,void*,void*,void*      ))b)(n,a[0],a[1],a[2]     );
848                 case 4: return ((void(*)(int,void*,void*,void*,void*))b)(n,a[0],a[1],a[2],a[3]);
849                 default: SkUNREACHABLE;  // TODO
850             }
851         }
852 
853         // We'll operate in SIMT style, knocking off K-size chunks from n while possible.
854         constexpr int K = 16;
855         using I32 = skvx::Vec<K, int>;
856         using F32 = skvx::Vec<K, float>;
857         using U32 = skvx::Vec<K, uint32_t>;
858         using U16 = skvx::Vec<K, uint16_t>;
859         using  U8 = skvx::Vec<K, uint8_t>;
860 
861         using I16x2 = skvx::Vec<2*K,  int16_t>;
862         using U16x2 = skvx::Vec<2*K, uint16_t>;
863 
864         union Slot {
865             F32   f32;
866             I32   i32;
867             U32   u32;
868             I16x2 i16x2;
869             U16x2 u16x2;
870         };
871 
872         Slot                     few_regs[16];
873         std::unique_ptr<char[]> many_regs;
874 
875         Slot* regs = few_regs;
876 
877         if (fRegs > (int)SK_ARRAY_COUNT(few_regs)) {
878             // Annoyingly we can't trust that malloc() or new will work with Slot because
879             // the skvx::Vec types may have alignment greater than what they provide.
880             // We'll overallocate one extra register so we can align manually.
881             many_regs.reset(new char[ sizeof(Slot) * (fRegs + 1) ]);
882 
883             uintptr_t addr = (uintptr_t)many_regs.get();
884             addr += alignof(Slot) -
885                      (addr & (alignof(Slot) - 1));
886             SkASSERT((addr & (alignof(Slot) - 1)) == 0);
887             regs = (Slot*)addr;
888         }
889 
890 
891         auto r = [&](Reg id) -> Slot& {
892             SkASSERT(0 <= id && id < fRegs);
893             return regs[id];
894         };
895         auto arg = [&](int ix) {
896             SkASSERT(0 <= ix && ix < nargs);
897             return args[ix];
898         };
899 
900         // Step each argument pointer ahead by its stride a number of times.
901         auto step_args = [&](int times) {
902             for (int i = 0; i < (int)fStrides.size(); i++) {
903                 args[i] = (void*)( (char*)args[i] + times * fStrides[i] );
904             }
905         };
906 
907         int start = 0,
908             stride;
909         for ( ; n > 0; start = fLoop, n -= stride, step_args(stride)) {
910             stride = n >= K ? K : 1;
911 
912             for (int i = start; i < (int)fInstructions.size(); i++) {
913                 Instruction inst = fInstructions[i];
914 
915                 // d = op(x,y,z/imm)
916                 Reg   d = inst.d,
917                       x = inst.x,
918                       y = inst.y,
919                       z = inst.z;
920                 int imm = inst.imm;
921 
922                 // Ops that interact with memory need to know whether we're stride=1 or K,
923                 // but all non-memory ops can run the same code no matter the stride.
924                 switch (2*(int)inst.op + (stride == K ? 1 : 0)) {
925                     default: SkUNREACHABLE;
926 
927                 #define STRIDE_1(op) case 2*(int)op
928                 #define STRIDE_K(op) case 2*(int)op + 1
929                     STRIDE_1(Op::store8 ): memcpy(arg(imm), &r(x).i32, 1); break;
930                     STRIDE_1(Op::store16): memcpy(arg(imm), &r(x).i32, 2); break;
931                     STRIDE_1(Op::store32): memcpy(arg(imm), &r(x).i32, 4); break;
932 
933                     STRIDE_K(Op::store8 ): skvx::cast<uint8_t> (r(x).i32).store(arg(imm)); break;
934                     STRIDE_K(Op::store16): skvx::cast<uint16_t>(r(x).i32).store(arg(imm)); break;
935                     STRIDE_K(Op::store32):                     (r(x).i32).store(arg(imm)); break;
936 
937                     STRIDE_1(Op::load8 ): r(d).i32 = 0; memcpy(&r(d).i32, arg(imm), 1); break;
938                     STRIDE_1(Op::load16): r(d).i32 = 0; memcpy(&r(d).i32, arg(imm), 2); break;
939                     STRIDE_1(Op::load32): r(d).i32 = 0; memcpy(&r(d).i32, arg(imm), 4); break;
940 
941                     STRIDE_K(Op::load8 ): r(d).i32= skvx::cast<int>(U8 ::Load(arg(imm))); break;
942                     STRIDE_K(Op::load16): r(d).i32= skvx::cast<int>(U16::Load(arg(imm))); break;
943                     STRIDE_K(Op::load32): r(d).i32=                 I32::Load(arg(imm)) ; break;
944 
945                     STRIDE_1(Op::gather8):
946                         for (int i = 0; i < K; i++) {
947                             r(d).i32[i] = (i == 0) ? ((const uint8_t* )arg(imm))[ r(x).i32[i] ] : 0;
948                         } break;
949                     STRIDE_1(Op::gather16):
950                         for (int i = 0; i < K; i++) {
951                             r(d).i32[i] = (i == 0) ? ((const uint16_t*)arg(imm))[ r(x).i32[i] ] : 0;
952                         } break;
953                     STRIDE_1(Op::gather32):
954                         for (int i = 0; i < K; i++) {
955                             r(d).i32[i] = (i == 0) ? ((const int*     )arg(imm))[ r(x).i32[i] ] : 0;
956                         } break;
957 
958                     STRIDE_K(Op::gather8):
959                         for (int i = 0; i < K; i++) {
960                             r(d).i32[i] = ((const uint8_t* )arg(imm))[ r(x).i32[i] ];
961                         } break;
962                     STRIDE_K(Op::gather16):
963                         for (int i = 0; i < K; i++) {
964                             r(d).i32[i] = ((const uint16_t*)arg(imm))[ r(x).i32[i] ];
965                         } break;
966                     STRIDE_K(Op::gather32):
967                         for (int i = 0; i < K; i++) {
968                             r(d).i32[i] = ((const int*     )arg(imm))[ r(x).i32[i] ];
969                         } break;
970 
971                 #undef STRIDE_1
972                 #undef STRIDE_K
973 
974                     // Ops that don't interact with memory should never care about the stride.
975                 #define CASE(op) case 2*(int)op: /*fallthrough*/ case 2*(int)op+1
976 
977                     CASE(Op::uniform8):
978                         r(d).i32 = *(const uint8_t* )( (const char*)arg(imm&0xffff) + (imm>>16) );
979                         break;
980                     CASE(Op::uniform16):
981                         r(d).i32 = *(const uint16_t*)( (const char*)arg(imm&0xffff) + (imm>>16) );
982                         break;
983                     CASE(Op::uniform32):
984                         r(d).i32 = *(const int*     )( (const char*)arg(imm&0xffff) + (imm>>16) );
985                         break;
986 
987                     CASE(Op::splat): r(d).i32 = imm; break;
988 
989                     CASE(Op::add_f32): r(d).f32 = r(x).f32 + r(y).f32; break;
990                     CASE(Op::sub_f32): r(d).f32 = r(x).f32 - r(y).f32; break;
991                     CASE(Op::mul_f32): r(d).f32 = r(x).f32 * r(y).f32; break;
992                     CASE(Op::div_f32): r(d).f32 = r(x).f32 / r(y).f32; break;
993 
994                     CASE(Op::mad_f32): r(d).f32 = r(x).f32 * r(y).f32 + r(z).f32; break;
995 
996                     CASE(Op::add_i32): r(d).i32 = r(x).i32 + r(y).i32; break;
997                     CASE(Op::sub_i32): r(d).i32 = r(x).i32 - r(y).i32; break;
998                     CASE(Op::mul_i32): r(d).i32 = r(x).i32 * r(y).i32; break;
999 
1000                     CASE(Op::add_i16x2): r(d).i16x2 = r(x).i16x2 + r(y).i16x2; break;
1001                     CASE(Op::sub_i16x2): r(d).i16x2 = r(x).i16x2 - r(y).i16x2; break;
1002                     CASE(Op::mul_i16x2): r(d).i16x2 = r(x).i16x2 * r(y).i16x2; break;
1003 
1004                     CASE(Op::shl_i32): r(d).i32 = r(x).i32 << imm; break;
1005                     CASE(Op::sra_i32): r(d).i32 = r(x).i32 >> imm; break;
1006                     CASE(Op::shr_i32): r(d).u32 = r(x).u32 >> imm; break;
1007 
1008                     CASE(Op::shl_i16x2): r(d).i16x2 = r(x).i16x2 << imm; break;
1009                     CASE(Op::sra_i16x2): r(d).i16x2 = r(x).i16x2 >> imm; break;
1010                     CASE(Op::shr_i16x2): r(d).u16x2 = r(x).u16x2 >> imm; break;
1011 
1012                     CASE(Op:: eq_f32): r(d).i32 = r(x).f32 == r(y).f32; break;
1013                     CASE(Op::neq_f32): r(d).i32 = r(x).f32 != r(y).f32; break;
1014                     CASE(Op:: lt_f32): r(d).i32 = r(x).f32 <  r(y).f32; break;
1015                     CASE(Op::lte_f32): r(d).i32 = r(x).f32 <= r(y).f32; break;
1016                     CASE(Op:: gt_f32): r(d).i32 = r(x).f32 >  r(y).f32; break;
1017                     CASE(Op::gte_f32): r(d).i32 = r(x).f32 >= r(y).f32; break;
1018 
1019                     CASE(Op:: eq_i32): r(d).i32 = r(x).i32 == r(y).i32; break;
1020                     CASE(Op::neq_i32): r(d).i32 = r(x).i32 != r(y).i32; break;
1021                     CASE(Op:: lt_i32): r(d).i32 = r(x).i32 <  r(y).i32; break;
1022                     CASE(Op::lte_i32): r(d).i32 = r(x).i32 <= r(y).i32; break;
1023                     CASE(Op:: gt_i32): r(d).i32 = r(x).i32 >  r(y).i32; break;
1024                     CASE(Op::gte_i32): r(d).i32 = r(x).i32 >= r(y).i32; break;
1025 
1026                     CASE(Op:: eq_i16x2): r(d).i16x2 = r(x).i16x2 == r(y).i16x2; break;
1027                     CASE(Op::neq_i16x2): r(d).i16x2 = r(x).i16x2 != r(y).i16x2; break;
1028                     CASE(Op:: lt_i16x2): r(d).i16x2 = r(x).i16x2 <  r(y).i16x2; break;
1029                     CASE(Op::lte_i16x2): r(d).i16x2 = r(x).i16x2 <= r(y).i16x2; break;
1030                     CASE(Op:: gt_i16x2): r(d).i16x2 = r(x).i16x2 >  r(y).i16x2; break;
1031                     CASE(Op::gte_i16x2): r(d).i16x2 = r(x).i16x2 >= r(y).i16x2; break;
1032 
1033                     CASE(Op::bit_and  ): r(d).i32 = r(x).i32 &  r(y).i32; break;
1034                     CASE(Op::bit_or   ): r(d).i32 = r(x).i32 |  r(y).i32; break;
1035                     CASE(Op::bit_xor  ): r(d).i32 = r(x).i32 ^  r(y).i32; break;
1036                     CASE(Op::bit_clear): r(d).i32 = r(x).i32 & ~r(y).i32; break;
1037 
1038                     CASE(Op::select): r(d).i32 = skvx::if_then_else(r(x).i32, r(y).i32, r(z).i32);
1039                                       break;
1040 
1041 
1042                     CASE(Op::extract): r(d).u32 = (r(x).u32 >> imm) & r(y).u32; break;
1043                     CASE(Op::pack):    r(d).u32 = r(x).u32 | (r(y).u32 << imm); break;
1044 
1045                     CASE(Op::bytes): {
1046                         const U32 table[] = {
1047                             0,
1048                             (r(x).u32      ) & 0xff,
1049                             (r(x).u32 >>  8) & 0xff,
1050                             (r(x).u32 >> 16) & 0xff,
1051                             (r(x).u32 >> 24) & 0xff,
1052                         };
1053                         r(d).u32 = table[(imm >>  0) & 0xf] <<  0
1054                                  | table[(imm >>  4) & 0xf] <<  8
1055                                  | table[(imm >>  8) & 0xf] << 16
1056                                  | table[(imm >> 12) & 0xf] << 24;
1057                     } break;
1058 
1059                     CASE(Op::to_f32): r(d).f32 = skvx::cast<float>(r(x).i32); break;
1060                     CASE(Op::to_i32): r(d).i32 = skvx::cast<int>  (r(x).f32); break;
1061                 #undef CASE
1062                 }
1063             }
1064         }
1065     }
1066 
dropJIT()1067     void Program::dropJIT() {
1068     #if defined(SKVM_JIT)
1069         if (fJITBuf) {
1070             munmap(fJITBuf, fJITSize);
1071         }
1072     #else
1073         SkASSERT(fJITBuf == nullptr);
1074     #endif
1075 
1076         fJITBuf   = nullptr;
1077         fJITSize  = 0;
1078     }
1079 
~Program()1080     Program::~Program() { this->dropJIT(); }
1081 
Program(Program && other)1082     Program::Program(Program&& other) {
1083         fInstructions = std::move(other.fInstructions);
1084         fRegs         = other.fRegs;
1085         fLoop         = other.fLoop;
1086         fStrides      = std::move(other.fStrides);
1087 
1088         std::swap(fJITBuf  , other.fJITBuf);
1089         std::swap(fJITSize , other.fJITSize);
1090     }
1091 
operator =(Program && other)1092     Program& Program::operator=(Program&& other) {
1093         fInstructions = std::move(other.fInstructions);
1094         fRegs         = other.fRegs;
1095         fLoop         = other.fLoop;
1096         fStrides      = std::move(other.fStrides);
1097 
1098         std::swap(fJITBuf  , other.fJITBuf);
1099         std::swap(fJITSize , other.fJITSize);
1100         return *this;
1101     }
1102 
Program()1103     Program::Program() {}
1104 
Program(const std::vector<Builder::Instruction> & instructions,const std::vector<int> & strides,const char * debug_name)1105     Program::Program(const std::vector<Builder::Instruction>& instructions,
1106                      const std::vector<int>& strides,
1107                      const char* debug_name) : fStrides(strides) {
1108         this->setupInterpreter(instructions);
1109     #if defined(SKVM_JIT)
1110         this->setupJIT(instructions, debug_name);
1111     #endif
1112     }
1113 
1114     // Translate Builder::Instructions to Program::Instructions used by the interpreter.
setupInterpreter(const std::vector<Builder::Instruction> & instructions)1115     void Program::setupInterpreter(const std::vector<Builder::Instruction>& instructions) {
1116         // Register each instruction is assigned to.
1117         std::vector<Reg> reg(instructions.size());
1118 
1119         // This next bit is a bit more complicated than strictly necessary;
1120         // we could just assign every live instruction to its own register.
1121         //
1122         // But recycling registers is fairly cheap, and good practice for the
1123         // JITs where minimizing register pressure really is important.
1124 
1125         fRegs = 0;
1126         int live_instructions = 0;
1127         std::vector<Reg> avail;
1128 
1129         // Assign this value to a register, recycling them where we can.
1130         auto assign_register = [&](Val id) {
1131             live_instructions++;
1132             const Builder::Instruction& inst = instructions[id];
1133 
1134             // If this is a real input and it's lifetime ends at this instruction,
1135             // we can recycle the register it's occupying.
1136             auto maybe_recycle_register = [&](Val input) {
1137                 if (input != NA && instructions[input].death == id) {
1138                     avail.push_back(reg[input]);
1139                 }
1140             };
1141 
1142             // Take care to not recycle the same register twice.
1143             if (true                                ) { maybe_recycle_register(inst.x); }
1144             if (inst.y != inst.x                    ) { maybe_recycle_register(inst.y); }
1145             if (inst.z != inst.x && inst.z != inst.y) { maybe_recycle_register(inst.z); }
1146 
1147             // Allocate a register if we have to, preferring to reuse anything available.
1148             if (avail.empty()) {
1149                 reg[id] = fRegs++;
1150             } else {
1151                 reg[id] = avail.back();
1152                 avail.pop_back();
1153             }
1154         };
1155 
1156         // Assign a register to each live hoisted instruction.
1157         for (Val id = 0; id < (Val)instructions.size(); id++) {
1158             const Builder::Instruction& inst = instructions[id];
1159             if (inst.death != 0 && inst.hoist) {
1160                 assign_register(id);
1161             }
1162         }
1163 
1164         // Assign registers to each live loop instruction.
1165         for (Val id = 0; id < (Val)instructions.size(); id++) {
1166             const Builder::Instruction& inst = instructions[id];
1167             if (inst.death != 0 && !inst.hoist) {
1168                 assign_register(id);
1169 
1170             }
1171         }
1172 
1173         // Translate Builder::Instructions to Program::Instructions by mapping values to
1174         // registers.  This will be two passes, first hoisted instructions, then inside the loop.
1175 
1176         // The loop begins at the fLoop'th Instruction.
1177         fLoop = 0;
1178         fInstructions.reserve(live_instructions);
1179 
1180         // Add a dummy mapping for the N/A sentinel Val to any arbitrary register
1181         // so lookups don't have to know which arguments are used by which Ops.
1182         auto lookup_register = [&](Val id) {
1183             return id == NA ? (Reg)0
1184                             : reg[id];
1185         };
1186 
1187         auto push_instruction = [&](Val id, const Builder::Instruction& inst) {
1188             Program::Instruction pinst{
1189                 inst.op,
1190                 lookup_register(id),
1191                 lookup_register(inst.x),
1192                 lookup_register(inst.y),
1193                {lookup_register(inst.z)},
1194             };
1195             if (inst.z == NA) { pinst.imm = inst.imm; }
1196             fInstructions.push_back(pinst);
1197         };
1198 
1199         for (Val id = 0; id < (Val)instructions.size(); id++) {
1200             const Builder::Instruction& inst = instructions[id];
1201             if (inst.death != 0 && inst.hoist) {
1202                 push_instruction(id, inst);
1203                 fLoop++;
1204             }
1205         }
1206         for (Val id = 0; id < (Val)instructions.size(); id++) {
1207             const Builder::Instruction& inst = instructions[id];
1208             if (inst.death != 0 && !inst.hoist) {
1209                 push_instruction(id, inst);
1210             }
1211         }
1212     }
1213 
1214 #if defined(SKVM_JIT)
1215 
1216     // Just so happens that we can translate the immediate control for our bytes() op
1217     // to a single 128-bit mask that can be consumed by both AVX2 vpshufb and NEON tbl!
// Expands the four selector nibbles in the low 16 bits of imm into four 32-bit
// shuffle-control words, one per 32-bit lane, written to mask[0..3].
static void bytes_control(int imm, int mask[4]) {
    // Nibble -> control byte:
    //   0 -> 0xff   (fill with zero)
    //   1 -> 0x00   (select byte 0)
    //   2 -> 0x01   (select byte 1)
    //   3 -> 0x02   (select byte 2)
    //   4 -> 0x03   (select byte 3)
    // i.e. nibble - 1, with 0 wrapping around to 0xff.
    uint8_t control[4];
    for (int b = 0; b < 4; b++) {
        const uint8_t nibble = (imm >> (4*b)) & 0xf;
        control[b] = nibble - 1;
    }

    for (int lane = 0; lane < 4; lane++) {
        // Pack the four control bytes, control[0] lowest, into this lane's word.
        int packed = 0;
        for (int b = 0; b < 4; b++) {
            packed |= (int)control[b] << (8*b);
        }
        mask[lane] = packed;

        // Advance each byte index by 4 so it points into the next 32-bit lane,
        // leaving any zero-filling 0xff untouched.
        for (int b = 0; b < 4; b++) {
            if (control[b] != 0xff) {
                control[b] += 4;
            }
        }
    }
}
1248 
jit(const std::vector<Builder::Instruction> & instructions,const bool hoist,Assembler * a) const1249     bool Program::jit(const std::vector<Builder::Instruction>& instructions,
1250                       const bool hoist,
1251                       Assembler* a) const {
             // Assembles this program into `a`: hoisted instructions first (only when hoist is
             // true), then a loop body that handles K values per iteration, then a tail loop
             // that handles one value per iteration, and finally the bytes-mask and splat
             // constants that the code refers to by label.
             //
             // Returns false when the program can't be assembled as requested: on x86-64 when
             // the CPU lacks SkCpu::HSW support, when there are more strides than argument
             // registers, when an op has no case in emit(), or when a vector register is
             // needed and none is free.
1252         using A = Assembler;
1253 
1254     #if defined(__x86_64__)
1255         if (!SkCpu::Supports(SkCpu::HSW)) {
1256             return false;
1257         }
1258         A::GP64 N     = A::rdi,
1259                 arg[] = { A::rsi, A::rdx, A::rcx, A::r8, A::r9 };
1260 
1261         // All 16 ymm registers are available to use.
1262         using Reg = A::Ymm;
1263         uint32_t avail = 0xffff;
1264 
1265     #elif defined(__aarch64__)
1266         A::X N     = A::x0,
1267              arg[] = { A::x1, A::x2, A::x3, A::x4, A::x5, A::x6, A::x7 };
1268 
1269         // We can use v0-v7 and v16-v31 freely; we'd need to preserve v8-v15.
1270         using Reg = A::V;
1271         uint32_t avail = 0xffff00ff;
1272     #endif
1273 
             // Stride index i is paired with arg[i] in the loops below,
             // so we need at least as many argument registers as strides.
1274         if (SK_ARRAY_COUNT(arg) < fStrides.size()) {
1275             return false;
1276         }
1277 
             // An instruction only counts as hoisted when hoisting is enabled for this attempt.
1278         auto hoisted = [&](Val id) { return hoist && instructions[id].hoist; };
1279 
             // r[id] is the register assigned to hold instruction id's value (see set_dst()).
1280         std::vector<Reg> r(instructions.size());
1281 
             // Each distinct splat or bytes immediate gets a label marking where its constant
             // is written after the code.  reg is only used for bytes masks on aarch64.
1282         struct LabelAndReg {
1283             A::Label label;
1284             Reg      reg;
1285         };
1286         SkTHashMap<int, LabelAndReg> splats,
1287                                      bytes_masks;
1288 
             // warmup() makes sure a live splat or bytes instruction has an entry in the
             // matching map.  On aarch64 with hoisting it also claims a register for each
             // distinct bytes mask and loads the mask into it, returning false if none is free.
1289         auto warmup = [&](Val id) {
1290             const Builder::Instruction& inst = instructions[id];
1291             if (inst.death == 0) {
1292                 return true;
1293             }
1294 
1295             Op op = inst.op;
1296             int imm = inst.imm;
1297 
1298             switch (op) {
1299                 default: break;
1300 
1301                 case Op::splat: if (!splats.find(imm)) { splats.set(imm, {}); }
1302                                 break;
1303 
1304                 case Op::bytes: if (!bytes_masks.find(imm)) {
1305                                     bytes_masks.set(imm, {});
1306                                     if (hoist) {
1307                                         // vpshufb can always work with the mask from memory,
1308                                         // but it helps to hoist the mask to a register for tbl.
1309                                     #if defined(__aarch64__)
1310                                         LabelAndReg* entry = bytes_masks.find(imm);
1311                                         if (int found = __builtin_ffs(avail)) {
1312                                             entry->reg = (Reg)(found-1);
1313                                             avail ^= 1 << entry->reg;
1314                                             a->ldrq(entry->reg, &entry->label);
1315                                         } else {
1316                                             return false;
1317                                         }
1318                                     #endif
1319                                     }
1320                                 }
1321                                 break;
1322             }
1323             return true;
1324         };
1325 
             // emit() assembles instruction id, in its full-width form or, when scalar is true,
             // its one-value form.  Returns false if the op has no case for this platform or
             // a register was needed and none was available.
1326         auto emit = [&](Val id, bool scalar) {
1327             const Builder::Instruction& inst = instructions[id];
1328 
1329             // No need to emit dead code instructions that produce values that are never used.
1330             if (inst.death == 0) {
1331                 return true;
1332             }
1333 
1334             Op op = inst.op;
1335             Val x = inst.x,
1336                 y = inst.y,
1337                 z = inst.z;
1338             int imm = inst.imm;
1339 
1340             // Most (but not all) ops create an output value and need a register to hold it, dst.
1341             // We track each instruction's dst in r[] so we can thread it through as an input
1342             // to any future instructions needing that value.
1343             //
1344             // And some ops may need a temporary scratch register, tmp.  Some need both tmp and dst.
1345             //
1346             // tmp and dst are very similar and can and will often be assigned the same register,
1347             // but tmp may never alias any of the instruction's inputs, while dst may when this
1348             // instruction consumes that input, i.e. if the input reaches its end of life here.
1349             //
1350             // We'll assign both registers lazily to keep register pressure as low as possible.
1351             bool tmp_is_set = false,
1352                  dst_is_set = false;
1353             Reg tmp_reg = (Reg)0;  // This initial value won't matter... anything legal is fine.
1354 
1355             bool ok = true;   // Set to false if we need to assign a register and none's available.
1356 
1357             // First lock in how to choose tmp if we need to based on the registers
1358             // available before this instruction, not including any of its input registers.
1359             auto tmp = [&,avail/*important, closing over avail's current value*/]{
1360                 if (!tmp_is_set) {
1361                     tmp_is_set = true;
1362                     if (int found = __builtin_ffs(avail)) {
1363                         // This is a scratch register just for this op,
1364                         // so we leave it marked available for future ops.
1365                         tmp_reg = (Reg)(found - 1);
1366                     } else {
1367                         // We needed a tmp register but couldn't find one available. :'(
1368                         // This will cause emit() to return false, in turn causing jit() to fail.
1369                         ok = false;
1370                     }
1371                 }
1372                 return tmp_reg;
1373             };
1374 
1375             // Now make available any registers that are consumed by this instruction.
1376             // (The register pool we can pick dst from is >= the pool for tmp, adding any of these.)
1377             if (x != NA && instructions[x].death == id) { avail |= 1 << r[x]; }
1378             if (y != NA && instructions[y].death == id) { avail |= 1 << r[y]; }
1379             if (z != NA && instructions[z].death == id) { avail |= 1 << r[z]; }
1380             // set_dst() and dst() will work read/write with this perhaps-just-updated avail.
1381 
1382             // Some ops may decide dst on their own to best fit the instruction (see Op::mad_f32).
1383             auto set_dst = [&](Reg reg){
1384                 SkASSERT(dst_is_set == false);
1385                 dst_is_set = true;
1386 
1387                 SkASSERT(avail & (1<<reg));
1388                 avail ^= 1<<reg;
1389 
1390                 r[id] = reg;
1391             };
1392 
1393             // Thanks to AVX and NEON's 3-argument instruction sets,
1394             // most ops can use any register as dst.
1395             auto dst = [&]{
1396                 if (!dst_is_set) {
1397                     if (int found = __builtin_ffs(avail)) {
1398                         set_dst((Reg)(found-1));
1399                     } else {
1400                         // Same deal as with tmp... all the registers are occupied.  Time to fail!
1401                         ok = false;
1402                     }
1403                 }
1404                 return r[id];
1405             };
1406 
1407             // Because we use the same logic to pick an arbitrary dst and to pick tmp,
1408             // and we know that tmp will never overlap any of the inputs, `dst() == tmp()`
1409             // is a simple idiom to check that the destination does not overlap any of the inputs.
1410             // Sometimes we can use this knowledge to do better instruction selection.
1411 
1412             // Ok!  Keep in mind that we haven't assigned tmp or dst yet,
1413             // just laid out hooks for how to do so if we need them, depending on the instruction.
1414             //
1415             // Now let's actually assemble the instruction!
1416             switch (op) {
1417                 default:
1418                 #if 0
1419                     SkDEBUGFAILF("\n%d not yet implemented\n", op);
1420                 #endif
1421                     return false;  // TODO: many new ops
1422 
1423             #if defined(__x86_64__)
1424                 case Op::store8: if (scalar) { a->vpextrb  (arg[imm], (A::Xmm)r[x], 0); }
1425                                  else        { a->vpackusdw(tmp(), r[x], r[x]);
1426                                                a->vpermq   (tmp(), tmp(), 0xd8);
1427                                                a->vpackuswb(tmp(), tmp(), tmp());
1428                                                a->vmovq    (arg[imm], (A::Xmm)tmp()); }
1429                                                break;
1430 
1431                 case Op::store16: if (scalar) { a->vpextrw  (arg[imm], (A::Xmm)r[x], 0); }
1432                                   else        { a->vpackusdw(tmp(), r[x], r[x]);
1433                                                 a->vpermq   (tmp(), tmp(), 0xd8);
1434                                                 a->vmovups  (arg[imm], (A::Xmm)tmp()); }
1435                                                 break;
1436 
1437                 case Op::store32: if (scalar) { a->vmovd  (arg[imm], (A::Xmm)r[x]); }
1438                                   else        { a->vmovups(arg[imm],         r[x]); }
1439                                                 break;
1440 
1441                 case Op::load8:  if (scalar) {
1442                                      a->vpxor  (dst(), dst(), dst());
1443                                      a->vpinsrb((A::Xmm)dst(), (A::Xmm)dst(), arg[imm], 0);
1444                                  } else {
1445                                      a->vpmovzxbd(dst(), arg[imm]);
1446                                  } break;
1447 
1448                 case Op::load16: if (scalar) {
1449                                      a->vpxor  (dst(), dst(), dst());
1450                                      a->vpinsrw((A::Xmm)dst(), (A::Xmm)dst(), arg[imm], 0);
1451                                  } else {
1452                                      a->vpmovzxwd(dst(), arg[imm]);
1453                                  } break;
1454 
1455                 case Op::load32: if (scalar) { a->vmovd  ((A::Xmm)dst(), arg[imm]); }
1456                                  else        { a->vmovups(        dst(), arg[imm]); }
1457                                  break;
1458 
                     // For uniforms, the low 16 bits of imm pick the argument register
                     // and the remaining high bits are passed along as the offset operand.
1459                 case Op::uniform8: a->movzbl(A::rax, arg[imm&0xffff], imm>>16);
1460                                    a->vmovd_direct((A::Xmm)dst(), A::rax);
1461                                    a->vbroadcastss(dst(), (A::Xmm)dst());
1462                                    break;
1463 
1464                 case Op::uniform32: a->vbroadcastss(dst(), arg[imm&0xffff], imm>>16);
1465                                     break;
1466 
1467                 case Op::splat: a->vbroadcastss(dst(), &splats.find(imm)->label);
1468                                 break;
1469                                 // TODO: many of these instructions have variants that
1470                                 // can read one of their arguments from 32-byte memory
1471                                 // instead of a register.  Find a way to avoid needing
1472                                 // to splat most* constants out at all?
1473                                 // (*Might work for x - 255 but not 255 - x, so will
1474                                 // always need to be able to splat to a register.)
1475 
1476                 case Op::add_f32: a->vaddps(dst(), r[x], r[y]); break;
1477                 case Op::sub_f32: a->vsubps(dst(), r[x], r[y]); break;
1478                 case Op::mul_f32: a->vmulps(dst(), r[x], r[y]); break;
1479                 case Op::div_f32: a->vdivps(dst(), r[x], r[y]); break;
1480 
                     // Reuse whichever input register was just freed as dst, picking the
                     // fused multiply-add form that matches; otherwise copy x into a fresh dst.
1481                 case Op::mad_f32:
1482                     if      (avail & (1<<r[x])) { set_dst(r[x]); a->vfmadd132ps(r[x], r[z], r[y]); }
1483                     else if (avail & (1<<r[y])) { set_dst(r[y]); a->vfmadd213ps(r[y], r[x], r[z]); }
1484                     else if (avail & (1<<r[z])) { set_dst(r[z]); a->vfmadd231ps(r[z], r[x], r[y]); }
1485                     else                        {                SkASSERT(dst() == tmp());
1486                                                                  a->vmovdqa    (dst(),r[x]);
1487                                                                  a->vfmadd132ps(dst(),r[z], r[y]); }
1488                                                                  break;
1489 
1490                 case Op::add_i32: a->vpaddd (dst(), r[x], r[y]); break;
1491                 case Op::sub_i32: a->vpsubd (dst(), r[x], r[y]); break;
1492                 case Op::mul_i32: a->vpmulld(dst(), r[x], r[y]); break;
1493 
1494                 case Op::sub_i16x2: a->vpsubw (dst(), r[x], r[y]); break;
1495                 case Op::mul_i16x2: a->vpmullw(dst(), r[x], r[y]); break;
1496                 case Op::shr_i16x2: a->vpsrlw (dst(), r[x],  imm); break;
1497 
1498                 case Op::bit_and  : a->vpand (dst(), r[x], r[y]); break;
1499                 case Op::bit_or   : a->vpor  (dst(), r[x], r[y]); break;
1500                 case Op::bit_xor  : a->vpxor (dst(), r[x], r[y]); break;
1501                 case Op::bit_clear: a->vpandn(dst(), r[y], r[x]); break;  // N.B. Y then X.
1502                 case Op::select   : a->vpblendvb(dst(), r[z], r[y], r[x]); break;
1503 
1504                 case Op::shl_i32: a->vpslld(dst(), r[x], imm); break;
1505                 case Op::shr_i32: a->vpsrld(dst(), r[x], imm); break;
1506                 case Op::sra_i32: a->vpsrad(dst(), r[x], imm); break;
1507 
1508                 case Op::eq_i32: a->vpcmpeqd(dst(), r[x], r[y]); break;
1509                 case Op::lt_i32: a->vpcmpgtd(dst(), r[y], r[x]); break;
1510                 case Op::gt_i32: a->vpcmpgtd(dst(), r[x], r[y]); break;
1511 
1512                 case Op::extract: if (imm == 0) { a->vpand (dst(),  r[x], r[y]); }
1513                                   else          { a->vpsrld(tmp(),  r[x], imm);
1514                                                   a->vpand (dst(), tmp(), r[y]); }
1515                                   break;
1516 
1517                 case Op::pack: a->vpslld(tmp(),  r[y], imm);
1518                                a->vpor  (dst(), tmp(), r[x]);
1519                                break;
1520 
1521                 case Op::to_f32: a->vcvtdq2ps (dst(), r[x]); break;
1522                 case Op::to_i32: a->vcvttps2dq(dst(), r[x]); break;
1523 
1524                 case Op::bytes: a->vpshufb(dst(), r[x], &bytes_masks.find(imm)->label);
1525                                 break;
1526 
1527             #elif defined(__aarch64__)
1528                 case Op::store8: a->xtns2h(tmp(), r[x]);
1529                                  a->xtnh2b(tmp(), tmp());
1530                    if (scalar) { a->strb  (tmp(), arg[imm]); }
1531                    else        { a->strs  (tmp(), arg[imm]); }
1532                                  break;
1533                 // TODO: another case where it'd be okay to alias r[x] and tmp if r[x] dies here.
1534 
1535                 case Op::store32: if (scalar) { a->strs(r[x], arg[imm]); }
1536                                   else        { a->strq(r[x], arg[imm]); }
1537                                                 break;
1538 
1539                 case Op::load8: if (scalar) { a->ldrb(tmp(), arg[imm]); }
1540                                 else        { a->ldrs(tmp(), arg[imm]); }
1541                                               a->uxtlb2h(tmp(), tmp());
1542                                               a->uxtlh2s(dst(), tmp());
1543                                               break;
1544 
1545                 case Op::load32: if (scalar) { a->ldrs(dst(), arg[imm]); }
1546                                  else        { a->ldrq(dst(), arg[imm]); }
1547                                                break;
1548 
1549                 case Op::splat: a->ldrq(dst(), &splats.find(imm)->label);
1550                                 break;
1551                                 // TODO: If we hoist these, pack 4 values in each register
1552                                 // and use vector/lane operations, cutting the register
1553                                 // pressure cost of hoisting by 4?
1554 
1555                 case Op::add_f32: a->fadd4s(dst(), r[x], r[y]); break;
1556                 case Op::sub_f32: a->fsub4s(dst(), r[x], r[y]); break;
1557                 case Op::mul_f32: a->fmul4s(dst(), r[x], r[y]); break;
1558                 case Op::div_f32: a->fdiv4s(dst(), r[x], r[y]); break;
1559 
                     // Accumulate straight into z's register if it was just freed; otherwise
                     // build the result in tmp and copy it to dst only if they differ.
1560                 case Op::mad_f32:
1561                     if (avail & (1<<r[z])) { set_dst(r[z]); a->fmla4s( r[z],  r[x],  r[y]);   }
1562                     else                   {                a->orr16b(tmp(),  r[z],  r[z]);
1563                                                             a->fmla4s(tmp(),  r[x],  r[y]);
1564                                        if(dst() != tmp()) { a->orr16b(dst(), tmp(), tmp()); } }
1565                                                             break;
1566 
1567 
1568                 case Op::add_i32: a->add4s(dst(), r[x], r[y]); break;
1569                 case Op::sub_i32: a->sub4s(dst(), r[x], r[y]); break;
1570                 case Op::mul_i32: a->mul4s(dst(), r[x], r[y]); break;
1571 
1572                 case Op::sub_i16x2: a->sub8h (dst(), r[x], r[y]); break;
1573                 case Op::mul_i16x2: a->mul8h (dst(), r[x], r[y]); break;
1574                 case Op::shr_i16x2: a->ushr8h(dst(), r[x],  imm); break;
1575 
1576                 case Op::bit_and  : a->and16b(dst(), r[x], r[y]); break;
1577                 case Op::bit_or   : a->orr16b(dst(), r[x], r[y]); break;
1578                 case Op::bit_xor  : a->eor16b(dst(), r[x], r[y]); break;
1579                 case Op::bit_clear: a->bic16b(dst(), r[x], r[y]); break;
1580 
1581                 case Op::shl_i32: a-> shl4s(dst(), r[x], imm); break;
1582                 case Op::shr_i32: a->ushr4s(dst(), r[x], imm); break;
1583                 case Op::sra_i32: a->sshr4s(dst(), r[x], imm); break;
1584 
1585                 case Op::extract: if (imm) { a->ushr4s(tmp(), r[x], imm);
1586                                              a->and16b(dst(), tmp(), r[y]); }
1587                                   else     { a->and16b(dst(), r[x], r[y]); }
1588                                              break;
1589 
1590                 case Op::pack:
1591                     if (avail & (1<<r[x])) { set_dst(r[x]); a->sli4s ( r[x],  r[y],  imm); }
1592                     else                   {                a->shl4s (tmp(),  r[y],  imm);
1593                                                             a->orr16b(dst(), tmp(), r[x]); }
1594                                                             break;
1595 
1596                 case Op::to_f32: a->scvtf4s (dst(), r[x]); break;
1597                 case Op::to_i32: a->fcvtzs4s(dst(), r[x]); break;
1598 
                     // With hoisting, warmup() already loaded the mask into a register;
                     // without it, load the mask from its label into tmp each time.
1599                 case Op::bytes: if (hoist) { a->tbl (dst(), r[x], bytes_masks.find(imm)->reg); }
1600                                 else       { a->ldrq(tmp(), &bytes_masks.find(imm)->label);
1601                                              a->tbl (dst(), r[x], tmp()); }
1602                                 break;
1603             #endif
1604             }
1605 
1606             // Calls to tmp() or dst() might have flipped this false from its default true state.
1607             return ok;
1608         };
1609 
1610 
             // Per-platform loop plumbing.  K is how many values one pass of the body loop
             // handles: it is what N is compared against and reduced by, and what scales
             // each stride.
1611         #if defined(__x86_64__)
1612             const int K = 8;
1613             auto jump_if_less = [&](A::Label* l) { a->jl (l); };
1614             auto jump         = [&](A::Label* l) { a->jmp(l); };
1615 
1616             auto add = [&](A::GP64 gp, int imm) { a->add(gp, imm); };
1617             auto sub = [&](A::GP64 gp, int imm) { a->sub(gp, imm); };
1618 
1619             auto exit = [&]{ a->vzeroupper(); a->ret(); };
1620         #elif defined(__aarch64__)
1621             const int K = 4;
1622             auto jump_if_less = [&](A::Label* l) { a->blt(l); };
1623             auto jump         = [&](A::Label* l) { a->b  (l); };
1624 
1625             auto add = [&](A::X gp, int imm) { a->add(gp, gp, imm); };
1626             auto sub = [&](A::X gp, int imm) { a->sub(gp, gp, imm); };
1627 
1628             auto exit = [&]{ a->ret(A::x30); };
1629         #endif
1630 
1631         A::Label body,
1632                  tail,
1633                  done;
1634 
             // Before the loop: register every instruction's constants, and emit the
             // instructions that are being hoisted.
1635         for (Val id = 0; id < (Val)instructions.size(); id++) {
1636             if (!warmup(id)) {
1637                 return false;
1638             }
1639             if (hoisted(id) && !emit(id, /*scalar=*/false)) {
1640                 return false;
1641             }
1642         }
1643 
             // Body loop: while N >= K, run the non-hoisted instructions at full width,
             // advance each strided argument by K strides, and subtract K from N.
1644         a->label(&body);
1645         {
1646             a->cmp(N, K);
1647             jump_if_less(&tail);
1648             for (Val id = 0; id < (Val)instructions.size(); id++) {
1649                 if (!hoisted(id) && !emit(id, /*scalar=*/false)) {
1650                     return false;
1651                 }
1652             }
1653             for (int i = 0; i < (int)fStrides.size(); i++) {
1654                 if (fStrides[i]) {
1655                     add(arg[i], K*fStrides[i]);
1656                 }
1657             }
1658             sub(N, K);
1659             jump(&body);
1660         }
1661 
             // Tail loop: while N >= 1, run the same instructions in their scalar form,
             // advancing by one stride and subtracting one from N each time.
1662         a->label(&tail);
1663         {
1664             a->cmp(N, 1);
1665             jump_if_less(&done);
1666             for (Val id = 0; id < (Val)instructions.size(); id++) {
1667                 if (!hoisted(id) && !emit(id, /*scalar=*/true)) {
1668                     return false;
1669                 }
1670             }
1671             for (int i = 0; i < (int)fStrides.size(); i++) {
1672                 if (fStrides[i]) {
1673                     add(arg[i], 1*fStrides[i]);
1674                 }
1675             }
1676             sub(N, 1);
1677             jump(&tail);
1678         }
1679 
1680         a->label(&done);
1681         {
1682             exit();
1683         }
1684 
             // Constant data follows the code: this is where the labels referenced above land.
1685         bytes_masks.foreach([&](int imm, LabelAndReg* entry) {
1686             // One 16-byte pattern for ARM tbl, that same pattern twice for x86-64 vpshufb.
1687         #if defined(__x86_64__)
1688             a->align(32);
1689         #elif defined(__aarch64__)
1690             a->align(4);
1691         #endif
1692 
1693             a->label(&entry->label);
1694             int mask[4];
1695             bytes_control(imm, mask);
1696             a->bytes(mask, sizeof(mask));
1697         #if defined(__x86_64__)
1698             a->bytes(mask, sizeof(mask));
1699         #endif
1700         });
1701 
1702         splats.foreach([&](int imm, LabelAndReg* entry) {
1703             // vbroadcastss 4 bytes on x86-64, or simply load 16-bytes on aarch64.
1704             a->align(4);
1705             a->label(&entry->label);
1706             a->word(imm);
1707         #if defined(__aarch64__)
1708             a->word(imm);
1709             a->word(imm);
1710             a->word(imm);
1711         #endif
1712         });
1713 
1714         return true;
1715     }
1716 
setupJIT(const std::vector<Builder::Instruction> & instructions,const char * debug_name)1717     void Program::setupJIT(const std::vector<Builder::Instruction>& instructions,
1718                            const char* debug_name) {
1719         // Assemble with no buffer to determine a.size(), the number of bytes we'll assemble.
1720         Assembler a{nullptr};
1721 
1722         // First try allowing code hoisting (faster code)
1723         // then again without if that fails (lower register pressure).
1724         bool hoist = true;
1725         if (!this->jit(instructions, hoist, &a)) {
1726             hoist = false;
1727             if (!this->jit(instructions, hoist, &a)) {
1728                 return;
1729             }
1730         }
1731 
1732         // Allocate space that we can remap as executable.
1733         const size_t page = sysconf(_SC_PAGESIZE);
1734         fJITSize = ((a.size() + page - 1) / page) * page;  // mprotect works at page granularity.
1735         fJITBuf = mmap(nullptr,fJITSize, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, -1,0);
1736 
1737         // Assemble the program for real.
1738         a = Assembler{fJITBuf};
1739         SkAssertResult(this->jit(instructions, hoist, &a));
1740         SkASSERT(a.size() <= fJITSize);
1741 
1742         // Remap as executable, and flush caches on platforms that need that.
1743         mprotect(fJITBuf, fJITSize, PROT_READ|PROT_EXEC);
1744         __builtin___clear_cache((char*)fJITBuf,
1745                                 (char*)fJITBuf + fJITSize);
1746     #if defined(SKVM_PERF_DUMPS)
1747         this->dumpJIT(debug_name, a.size());
1748     #endif
1749     }
1750 #endif
1751 
1752 #if defined(SKVM_PERF_DUMPS)
dumpJIT(const char * debug_name,size_t size) const1753     void Program::dumpJIT(const char* debug_name, size_t size) const {
1754     #if 0 && defined(__aarch64__)
1755         if (debug_name) {
1756             SkDebugf("\n%s:", debug_name);
1757         }
1758         // cat | llvm-mc -arch aarch64 -disassemble
1759         auto cur = (const uint8_t*)fJITBuf;
1760         for (int i = 0; i < (int)size; i++) {
1761             if (i % 4 == 0) {
1762                 SkDebugf("\n");
1763             }
1764             SkDebugf("0x%02x ", *cur++);
1765         }
1766         SkDebugf("\n");
1767     #endif
1768 
1769         // We're doing some really stateful things below so one thread at a time please...
1770         static SkSpinlock dump_lock;
1771         SkAutoSpinlock lock(dump_lock);
1772 
1773         auto fnv1a = [](const void* vbuf, size_t n) {
1774             uint32_t hash = 2166136261;
1775             for (auto buf = (const uint8_t*)vbuf; n --> 0; buf++) {
1776                 hash ^= *buf;
1777                 hash *= 16777619;
1778             }
1779             return hash;
1780         };
1781 
1782 
1783         char name[64];
1784         uint32_t hash = fnv1a(fJITBuf, size);
1785         if (debug_name) {
1786             sprintf(name, "skvm-jit-%s", debug_name);
1787         } else {
1788             sprintf(name, "skvm-jit-%u", hash);
1789         }
1790 
1791         // Create a jit-<pid>.dump file that we can `perf inject -j` into a
1792         // perf.data captured with `perf record -k 1`, letting us see each
1793         // JIT'd Program as if a function named skvm-jit-<hash>.   E.g.
1794         //
1795         //   ninja -C out nanobench
1796         //   perf record -k 1 out/nanobench -m SkVM_4096_I32\$
1797         //   perf inject -j -i perf.data -o perf.data.jit
1798         //   perf report -i perf.data.jit
1799         //
1800         // Running `perf inject -j` will also dump an .so for each JIT'd
1801         // program, named jitted-<pid>-<hash>.so.
1802         //
1803         //    https://lwn.net/Articles/638566/
1804         //    https://v8.dev/docs/linux-perf
1805         //    https://cs.chromium.org/chromium/src/v8/src/diagnostics/perf-jit.cc
1806         //    https://lore.kernel.org/patchwork/patch/622240/
1807 
1808 
1809         auto timestamp_ns = []() -> uint64_t {
1810             // It's important to use CLOCK_MONOTONIC here so that perf can
1811             // correlate our timestamps with those captured by `perf record
1812             // -k 1`.  That's also what `-k 1` does, by the way, tell perf
1813             // record to use CLOCK_MONOTONIC.
1814             struct timespec ts;
1815             clock_gettime(CLOCK_MONOTONIC, &ts);
1816             return ts.tv_sec * (uint64_t)1e9 + ts.tv_nsec;
1817         };
1818 
1819         // We'll open the jit-<pid>.dump file and write a small header once,
1820         // and just leave it open forever because we're lazy.
1821         static FILE* jitdump = [&]{
1822             // Must map as w+ for the mmap() call below to work.
1823             char path[64];
1824             sprintf(path, "jit-%d.dump", getpid());
1825             FILE* f = fopen(path, "w+");
1826 
1827             // Calling mmap() on the file adds a "hey they mmap()'d this" record to
1828             // the perf.data file that will point `perf inject -j` at this log file.
1829             // Kind of a strange way to tell `perf inject` where the file is...
1830             void* marker = mmap(nullptr, sysconf(_SC_PAGESIZE),
1831                                 PROT_READ|PROT_EXEC, MAP_PRIVATE,
1832                                 fileno(f), /*offset=*/0);
1833             SkASSERT_RELEASE(marker != MAP_FAILED);
1834             // Like never calling fclose(f), we'll also just always leave marker mmap()'d.
1835 
1836         #if defined(__x86_64__)
1837             const uint32_t elf_mach = 62;
1838         #elif defined(__aarch64__)
1839             const uint32_t elf_mach = 183;
1840         #endif
1841 
1842             struct Header {
1843                 uint32_t magic, version, header_size, elf_mach, reserved, pid;
1844                 uint64_t timestamp_us, flags;
1845             } header = {
1846                 0x4A695444, 1, sizeof(Header), elf_mach, 0, (uint32_t)getpid(),
1847                 timestamp_ns() / 1000, 0,
1848             };
1849             fwrite(&header, sizeof(header), 1, f);
1850 
1851             return f;
1852         }();
1853 
1854         struct CodeLoad {
1855             uint32_t event_type, event_size;
1856             uint64_t timestamp_ns;
1857 
1858             uint32_t pid, tid;
1859             uint64_t vma/*???*/, code_addr, code_size, id;
1860         } load = {
1861             0/*code load*/, (uint32_t)(sizeof(CodeLoad) + strlen(name) + 1 + size),
1862             timestamp_ns(),
1863 
1864             (uint32_t)getpid(), (uint32_t)SkGetThreadID(),
1865             (uint64_t)fJITBuf, (uint64_t)fJITBuf, size, hash,
1866         };
1867 
1868         // Write the header, the JIT'd function name, and the JIT'd code itself.
1869         fwrite(&load, sizeof(load), 1, jitdump);
1870         fwrite(name, 1, strlen(name), jitdump);
1871         fwrite("\0", 1, 1, jitdump);
1872         fwrite(fJITBuf, 1, size, jitdump);
1873     }
1874 #endif
1875 
1876 }  // namespace skvm
1877