• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright 2019 Google LLC
3  *
4  * Use of this source code is governed by a BSD-style license that can be
5  * found in the LICENSE file.
6  */
7 
8 #include "include/core/SkStream.h"
9 #include "include/core/SkString.h"
10 #include "include/private/SkChecksum.h"
11 #include "include/private/SkSpinlock.h"
12 #include "include/private/SkTFitsIn.h"
13 #include "include/private/SkThreadID.h"
14 #include "include/private/SkVx.h"
15 #include "src/core/SkCpu.h"
16 #include "src/core/SkOpts.h"
17 #include "src/core/SkVM.h"
18 
// NOTE(review): presumably routes JIT compilation through a dylib (see the
// dlopen/dlsym include below) -- confirm against the JIT code, outside this view.
bool gSkVMJITViaDylib{false};

// JIT code isn't MSAN-instrumented, so we won't see when it uses
// uninitialized memory, and we'll not see the writes it makes as properly
// initializing memory.  Instead force the interpreter, which should let
// MSAN see everything our programs do properly.
//
// Similarly, we can't get ASAN's checks unless we let it instrument our interpreter.
#if defined(__has_feature)
    #if __has_feature(memory_sanitizer) || __has_feature(address_sanitizer)
        #undef SKVM_JIT
    #endif
#endif

#if defined(SKVM_JIT)
    #include <dlfcn.h>      // dlopen, dlsym
    #include <sys/mman.h>   // mmap, mprotect
#endif
37 
38 namespace skvm {
39 
40     // Debugging tools, mostly for printing various data structures out to a stream.
41 
42     namespace {
43         class SkDebugfStream final : public SkWStream {
44             size_t fBytesWritten = 0;
45 
write(const void * buffer,size_t size)46             bool write(const void* buffer, size_t size) override {
47                 SkDebugf("%.*s", size, buffer);
48                 fBytesWritten += size;
49                 return true;
50             }
51 
bytesWritten() const52             size_t bytesWritten() const override {
53                 return fBytesWritten;
54             }
55         };
56 
        // Strongly-typed wrappers so the overloaded write() helpers below can
        // format each kind of operand differently.
        struct V { Val id; };        // a value, printed as "vN"
        struct R { Reg id; };        // a register, printed as "rN"
        struct Shift { int bits; };  // a shift amount, printed in decimal
        struct Splat { int bits; };  // a 4-byte constant, printed as hex + float
        struct Hex   { int bits; };  // an immediate, printed in hex
62 
        // Base case for the variadic write(): emit a string verbatim.
        static void write(SkWStream* o, const char* s) {
            o->writeText(s);
        }
66 
        // Map an Op to its enumerator name via the SKVM_OPS X-macro.
        static const char* name(Op op) {
            switch (op) {
            #define M(x) case Op::x: return #x;
                SKVM_OPS(M)
            #undef M
            }
            // Defensive fallback for out-of-range values.
            return "unknown op";
        }
75 
write(SkWStream * o,Op op)76         static void write(SkWStream* o, Op op) {
77             const char* raw = name(op);
78             if (const char* found = strstr(raw, "_imm")) {
79                 o->write(raw, found-raw);
80             } else {
81                 o->writeText(raw);
82             }
83         }
write(SkWStream * o,Arg a)84         static void write(SkWStream* o, Arg a) {
85             write(o, "arg(");
86             o->writeDecAsText(a.ix);
87             write(o, ")");
88         }
write(SkWStream * o,V v)89         static void write(SkWStream* o, V v) {
90             write(o, "v");
91             o->writeDecAsText(v.id);
92         }
write(SkWStream * o,R r)93         static void write(SkWStream* o, R r) {
94             write(o, "r");
95             o->writeDecAsText(r.id);
96         }
        // Shift amounts print as a bare decimal number.
        static void write(SkWStream* o, Shift s) {
            o->writeDecAsText(s.bits);
        }
write(SkWStream * o,Splat s)100         static void write(SkWStream* o, Splat s) {
101             float f;
102             memcpy(&f, &s.bits, 4);
103             o->writeHexAsText(s.bits);
104             write(o, " (");
105             o->writeScalarAsText(f);
106             write(o, ")");
107         }
        // Immediates print in hexadecimal.
        static void write(SkWStream* o, Hex h) {
            o->writeHexAsText(h.bits);
        }
111 
112         template <typename T, typename... Ts>
write(SkWStream * o,T first,Ts...rest)113         static void write(SkWStream* o, T first, Ts... rest) {
114             write(o, first);
115             write(o, " ");
116             write(o, rest...);
117         }
118     }
119 
120 
    // Print a human-readable listing of this builder's program (after
    // optimize()) to `o`, one value per line; falls back to SkDebugf when
    // `o` is nullptr.
    void Builder::dump(SkWStream* o) const {
        SkDebugfStream debug;
        if (!o) { o = &debug; }

        std::vector<OptimizedInstruction> optimized = this->optimize();
        o->writeDecAsText(optimized.size());
        o->writeText(" values (originally ");
        o->writeDecAsText(fProgram.size());
        o->writeText("):\n");
        for (Val id = 0; id < (Val)optimized.size(); id++) {
            const OptimizedInstruction& inst = optimized[id];
            Op  op = inst.op;
            Val  x = inst.x,
                 y = inst.y,
                 z = inst.z;
            int immy = inst.immy,
                immz = inst.immz;
            // Prefix each value with its hoisting status:
            //   "  " not hoistable, "↑ " hoistable but used in the loop,
            //   "↟ " hoistable and not used in the loop.
            write(o, !inst.can_hoist    ? "  " :
                      inst.used_in_loop ? "↑ " :
                                          "↟ ");
            // One case per op, formatting its operands via the typed write() helpers.
            switch (op) {
                case Op::assert_true: write(o, op, V{x}, V{y}); break;

                case Op::store8:  write(o, op, Arg{immy}, V{x}); break;
                case Op::store16: write(o, op, Arg{immy}, V{x}); break;
                case Op::store32: write(o, op, Arg{immy}, V{x}); break;

                case Op::index: write(o, V{id}, "=", op); break;

                case Op::load8:  write(o, V{id}, "=", op, Arg{immy}); break;
                case Op::load16: write(o, V{id}, "=", op, Arg{immy}); break;
                case Op::load32: write(o, V{id}, "=", op, Arg{immy}); break;

                case Op::gather8:  write(o, V{id}, "=", op, Arg{immy}, Hex{immz}, V{x}); break;
                case Op::gather16: write(o, V{id}, "=", op, Arg{immy}, Hex{immz}, V{x}); break;
                case Op::gather32: write(o, V{id}, "=", op, Arg{immy}, Hex{immz}, V{x}); break;

                case Op::uniform8:  write(o, V{id}, "=", op, Arg{immy}, Hex{immz}); break;
                case Op::uniform16: write(o, V{id}, "=", op, Arg{immy}, Hex{immz}); break;
                case Op::uniform32: write(o, V{id}, "=", op, Arg{immy}, Hex{immz}); break;

                case Op::splat:  write(o, V{id}, "=", op, Splat{immy}); break;


                case Op::add_f32: write(o, V{id}, "=", op, V{x}, V{y}      ); break;
                case Op::sub_f32: write(o, V{id}, "=", op, V{x}, V{y}      ); break;
                case Op::mul_f32: write(o, V{id}, "=", op, V{x}, V{y}      ); break;
                case Op::div_f32: write(o, V{id}, "=", op, V{x}, V{y}      ); break;
                case Op::min_f32: write(o, V{id}, "=", op, V{x}, V{y}      ); break;
                case Op::max_f32: write(o, V{id}, "=", op, V{x}, V{y}      ); break;
                case Op::mad_f32: write(o, V{id}, "=", op, V{x}, V{y}, V{z}); break;

                case Op::sqrt_f32: write(o, V{id}, "=", op, V{x}); break;

                case Op::add_f32_imm: write(o, V{id}, "=", op, V{x}, Splat{immy}); break;
                case Op::sub_f32_imm: write(o, V{id}, "=", op, V{x}, Splat{immy}); break;
                case Op::mul_f32_imm: write(o, V{id}, "=", op, V{x}, Splat{immy}); break;
                case Op::min_f32_imm: write(o, V{id}, "=", op, V{x}, Splat{immy}); break;
                case Op::max_f32_imm: write(o, V{id}, "=", op, V{x}, Splat{immy}); break;

                case Op:: eq_f32: write(o, V{id}, "=", op, V{x}, V{y}); break;
                case Op::neq_f32: write(o, V{id}, "=", op, V{x}, V{y}); break;
                case Op:: gt_f32: write(o, V{id}, "=", op, V{x}, V{y}); break;
                case Op::gte_f32: write(o, V{id}, "=", op, V{x}, V{y}); break;


                case Op::add_i32: write(o, V{id}, "=", op, V{x}, V{y}); break;
                case Op::sub_i32: write(o, V{id}, "=", op, V{x}, V{y}); break;
                case Op::mul_i32: write(o, V{id}, "=", op, V{x}, V{y}); break;

                case Op::shl_i32: write(o, V{id}, "=", op, V{x}, Shift{immy}); break;
                case Op::shr_i32: write(o, V{id}, "=", op, V{x}, Shift{immy}); break;
                case Op::sra_i32: write(o, V{id}, "=", op, V{x}, Shift{immy}); break;

                case Op:: eq_i32: write(o, V{id}, "=", op, V{x}, V{y}); break;
                case Op::neq_i32: write(o, V{id}, "=", op, V{x}, V{y}); break;
                case Op:: gt_i32: write(o, V{id}, "=", op, V{x}, V{y}); break;
                case Op::gte_i32: write(o, V{id}, "=", op, V{x}, V{y}); break;

                case Op::add_i16x2: write(o, V{id}, "=", op, V{x}, V{y}); break;
                case Op::sub_i16x2: write(o, V{id}, "=", op, V{x}, V{y}); break;
                case Op::mul_i16x2: write(o, V{id}, "=", op, V{x}, V{y}); break;

                case Op::shl_i16x2: write(o, V{id}, "=", op, V{x}, Shift{immy}); break;
                case Op::shr_i16x2: write(o, V{id}, "=", op, V{x}, Shift{immy}); break;
                case Op::sra_i16x2: write(o, V{id}, "=", op, V{x}, Shift{immy}); break;

                case Op:: eq_i16x2: write(o, V{id}, "=", op, V{x}, V{y}); break;
                case Op::neq_i16x2: write(o, V{id}, "=", op, V{x}, V{y}); break;
                case Op:: gt_i16x2: write(o, V{id}, "=", op, V{x}, V{y}); break;
                case Op::gte_i16x2: write(o, V{id}, "=", op, V{x}, V{y}); break;

                case Op::bit_and  : write(o, V{id}, "=", op, V{x}, V{y}      ); break;
                case Op::bit_or   : write(o, V{id}, "=", op, V{x}, V{y}      ); break;
                case Op::bit_xor  : write(o, V{id}, "=", op, V{x}, V{y}      ); break;
                case Op::bit_clear: write(o, V{id}, "=", op, V{x}, V{y}      ); break;

                case Op::bit_and_imm: write(o, V{id}, "=", op, V{x}, Hex{immy}); break;
                case Op::bit_or_imm : write(o, V{id}, "=", op, V{x}, Hex{immy}); break;
                case Op::bit_xor_imm: write(o, V{id}, "=", op, V{x}, Hex{immy}); break;

                case Op::select:  write(o, V{id}, "=", op, V{x}, V{y}, V{z}); break;
                case Op::bytes:   write(o, V{id}, "=", op, V{x}, Hex{immy}); break;
                case Op::pack:    write(o, V{id}, "=", op, V{x}, V{y}, Shift{immz}); break;

                case Op::floor:  write(o, V{id}, "=", op, V{x}); break;
                case Op::to_f32: write(o, V{id}, "=", op, V{x}); break;
                case Op::trunc:  write(o, V{id}, "=", op, V{x}); break;
                case Op::round:  write(o, V{id}, "=", op, V{x}); break;
            }

            write(o, "\n");
        }
    }
235 
    // Print a human-readable listing of this compiled program's register count
    // and instructions to `o`; falls back to SkDebugf when `o` is nullptr.
    // Instructions at index >= fLoop are printed indented under a "loop:" marker.
    void Program::dump(SkWStream* o) const {
        SkDebugfStream debug;
        if (!o) { o = &debug; }

        o->writeDecAsText(fRegs);
        o->writeText(" registers, ");
        o->writeDecAsText(fInstructions.size());
        o->writeText(" instructions:\n");
        for (int i = 0; i < (int)fInstructions.size(); i++) {
            if (i == fLoop) { write(o, "loop:\n"); }
            o->writeDecAsText(i);
            o->writeText("\t");
            if (i >= fLoop) { write(o, "    "); }
            const Program::Instruction& inst = fInstructions[i];
            Op   op = inst.op;
            Reg   d = inst.d,
                  x = inst.x,
                  y = inst.y,
                  z = inst.z;
            int immy = inst.immy,
                immz = inst.immz;
            // One case per op, formatting its operands via the typed write() helpers.
            switch (op) {
                case Op::assert_true: write(o, op, R{x}, R{y}); break;

                case Op::store8:  write(o, op, Arg{immy}, R{x}); break;
                case Op::store16: write(o, op, Arg{immy}, R{x}); break;
                case Op::store32: write(o, op, Arg{immy}, R{x}); break;

                case Op::index: write(o, R{d}, "=", op); break;

                case Op::load8:  write(o, R{d}, "=", op, Arg{immy}); break;
                case Op::load16: write(o, R{d}, "=", op, Arg{immy}); break;
                case Op::load32: write(o, R{d}, "=", op, Arg{immy}); break;

                case Op::gather8:  write(o, R{d}, "=", op, Arg{immy}, Hex{immz}, R{x}); break;
                case Op::gather16: write(o, R{d}, "=", op, Arg{immy}, Hex{immz}, R{x}); break;
                case Op::gather32: write(o, R{d}, "=", op, Arg{immy}, Hex{immz}, R{x}); break;

                case Op::uniform8:  write(o, R{d}, "=", op, Arg{immy}, Hex{immz}); break;
                case Op::uniform16: write(o, R{d}, "=", op, Arg{immy}, Hex{immz}); break;
                case Op::uniform32: write(o, R{d}, "=", op, Arg{immy}, Hex{immz}); break;

                case Op::splat:  write(o, R{d}, "=", op, Splat{immy}); break;


                case Op::add_f32: write(o, R{d}, "=", op, R{x}, R{y}      ); break;
                case Op::sub_f32: write(o, R{d}, "=", op, R{x}, R{y}      ); break;
                case Op::mul_f32: write(o, R{d}, "=", op, R{x}, R{y}      ); break;
                case Op::div_f32: write(o, R{d}, "=", op, R{x}, R{y}      ); break;
                case Op::min_f32: write(o, R{d}, "=", op, R{x}, R{y}      ); break;
                case Op::max_f32: write(o, R{d}, "=", op, R{x}, R{y}      ); break;
                case Op::mad_f32: write(o, R{d}, "=", op, R{x}, R{y}, R{z}); break;

                case Op::sqrt_f32: write(o, R{d}, "=", op, R{x}); break;

                case Op::add_f32_imm: write(o, R{d}, "=", op, R{x}, Splat{immy}); break;
                case Op::sub_f32_imm: write(o, R{d}, "=", op, R{x}, Splat{immy}); break;
                case Op::mul_f32_imm: write(o, R{d}, "=", op, R{x}, Splat{immy}); break;
                case Op::min_f32_imm: write(o, R{d}, "=", op, R{x}, Splat{immy}); break;
                case Op::max_f32_imm: write(o, R{d}, "=", op, R{x}, Splat{immy}); break;

                case Op:: eq_f32: write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op::neq_f32: write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op:: gt_f32: write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op::gte_f32: write(o, R{d}, "=", op, R{x}, R{y}); break;


                case Op::add_i32: write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op::sub_i32: write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op::mul_i32: write(o, R{d}, "=", op, R{x}, R{y}); break;

                case Op::shl_i32: write(o, R{d}, "=", op, R{x}, Shift{immy}); break;
                case Op::shr_i32: write(o, R{d}, "=", op, R{x}, Shift{immy}); break;
                case Op::sra_i32: write(o, R{d}, "=", op, R{x}, Shift{immy}); break;

                case Op:: eq_i32: write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op::neq_i32: write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op:: gt_i32: write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op::gte_i32: write(o, R{d}, "=", op, R{x}, R{y}); break;


                case Op::add_i16x2: write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op::sub_i16x2: write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op::mul_i16x2: write(o, R{d}, "=", op, R{x}, R{y}); break;

                case Op::shl_i16x2: write(o, R{d}, "=", op, R{x}, Shift{immy}); break;
                case Op::shr_i16x2: write(o, R{d}, "=", op, R{x}, Shift{immy}); break;
                case Op::sra_i16x2: write(o, R{d}, "=", op, R{x}, Shift{immy}); break;

                case Op:: eq_i16x2: write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op::neq_i16x2: write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op:: gt_i16x2: write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op::gte_i16x2: write(o, R{d}, "=", op, R{x}, R{y}); break;


                case Op::bit_and  : write(o, R{d}, "=", op, R{x}, R{y}      ); break;
                case Op::bit_or   : write(o, R{d}, "=", op, R{x}, R{y}      ); break;
                case Op::bit_xor  : write(o, R{d}, "=", op, R{x}, R{y}      ); break;
                case Op::bit_clear: write(o, R{d}, "=", op, R{x}, R{y}      ); break;

                case Op::bit_and_imm: write(o, R{d}, "=", op, R{x}, Hex{immy}); break;
                case Op::bit_or_imm : write(o, R{d}, "=", op, R{x}, Hex{immy}); break;
                case Op::bit_xor_imm: write(o, R{d}, "=", op, R{x}, Hex{immy}); break;

                case Op::select:  write(o, R{d}, "=", op, R{x}, R{y}, R{z}); break;
                case Op::bytes:   write(o, R{d}, "=", op,  R{x}, Hex{immy}); break;
                case Op::pack:    write(o, R{d}, "=", op,   R{x}, R{y}, Shift{immz}); break;

                case Op::floor:  write(o, R{d}, "=", op,  R{x}); break;
                case Op::to_f32: write(o, R{d}, "=", op, R{x}); break;
                case Op::trunc:  write(o, R{d}, "=", op,  R{x}); break;
                case Op::round:  write(o, R{d}, "=", op,  R{x}); break;
            }
            write(o, "\n");
        }
    }
352 
    // Produce the optimized instruction stream for this program:
    //   1) if for_jit, rewrite instructions into the JIT's x86 "_imm" forms
    //      where an operand is a known constant;
    //   2) reissue instructions as late as possible, eliding dead code;
    //   3) annotate each instruction with its death point, whether it can be
    //      hoisted out of the loop, and whether its value is used in the loop.
    std::vector<OptimizedInstruction> Builder::optimize(bool for_jit) const {
        // If requested, first specialize for our JIT backend.
        auto specialize_for_jit = [&]() -> std::vector<Instruction> {
            Builder specialized;
            for (int i = 0; i < (int)fProgram.size(); i++) {
                Builder::Instruction inst = fProgram[i];

                #if defined(SK_CPU_X86)
                switch (Op imm_op; inst.op) {
                    default: break;

                    case Op::add_f32: imm_op = Op::add_f32_imm; goto try_imm_x_and_y;
                    case Op::mul_f32: imm_op = Op::mul_f32_imm; goto try_imm_x_and_y;
                    case Op::min_f32: imm_op = Op::min_f32_imm; goto try_imm_x_and_y;
                    case Op::max_f32: imm_op = Op::max_f32_imm; goto try_imm_x_and_y;
                    case Op::bit_and: imm_op = Op::bit_and_imm; goto try_imm_x_and_y;
                    case Op::bit_or:  imm_op = Op::bit_or_imm ; goto try_imm_x_and_y;
                    case Op::bit_xor: imm_op = Op::bit_xor_imm; goto try_imm_x_and_y;

                    // These ops are commutative, so a constant x can be swapped
                    // into y's slot before folding it into immy.
                    try_imm_x_and_y:
                        if (int bits; this->allImm(inst.x, &bits)) {
                            inst.op   = imm_op;
                            inst.x    = inst.y;
                            inst.y    = NA;
                            inst.immy = bits;
                        } else if (int bits; this->allImm(inst.y, &bits)) {
                            inst.op   = imm_op;
                            inst.y    = NA;
                            inst.immy = bits;
                        } break;

                    // Subtraction is not commutative: only a constant y folds.
                    case Op::sub_f32:
                        if (int bits; this->allImm(inst.y, &bits)) {
                            inst.op   = Op::sub_f32_imm;
                            inst.y    = NA;
                            inst.immy = bits;
                        } break;

                    // x & ~y with constant y becomes bit_and_imm with inverted bits.
                    case Op::bit_clear:
                        if (int bits; this->allImm(inst.y, &bits)) {
                            inst.op   = Op::bit_and_imm;
                            inst.y    = NA;
                            inst.immy = ~bits;
                        } break;
                }
                #endif
                SkDEBUGCODE(Val id =) specialized.push(inst.op,
                                                       inst.x,inst.y,inst.z,
                                                       inst.immy,inst.immz);
                // If we replace single instructions with multiple, this will start breaking,
                // and we'll need a table to remap them like we have in optimize().
                SkASSERT(id == i);
            }
            return specialized.fProgram;
        };
        // NOTE(review): this conditional expression is a prvalue, so binding it to
        // a const& materializes (and lifetime-extends) a temporary -- when
        // !for_jit that copies fProgram.  Correct, but worth confirming intent.
        const std::vector<Builder::Instruction>& program = for_jit ? specialize_for_jit()
                                                                   : fProgram;

        // Next rewrite the program order by issuing instructions as late as possible:
        //    - any side-effect-only (i.e. store) instruction in order as we see them;
        //    - any other instruction only once it's shown to be needed.
        // This elides all dead code and helps minimize value lifetime / register pressure.
        std::vector<OptimizedInstruction> optimized;
        optimized.reserve(program.size());

        // Map old Val index to rewritten index in optimized.
        std::vector<Val> new_index(program.size(), NA);

        auto rewrite = [&](Val id, auto& recurse) -> Val {
            auto rewrite_input = [&](Val input) -> Val {
                if (input == NA) {
                    return NA;
                }
                // Rewrite each input at most once, memoizing through new_index.
                if (new_index[input] == NA) {
                    new_index[input] = recurse(input, recurse);
                }
                return new_index[input];
            };

            // The order we rewrite inputs is somewhat arbitrary; we could just go x,y,z.
            // But we try to preserve the original program order as much as possible by
            // rewriting inst's inputs in the order they were themselves originally issued.
            // This makes debugging  dumps a little easier.
            Builder::Instruction inst = program[id];
            // Three-swap sorting network: order the pointers so *min <= *mid <= *max.
            Val *min = &inst.x,
                *mid = &inst.y,
                *max = &inst.z;
            if (*min > *mid) { std::swap(min, mid); }
            if (*mid > *max) { std::swap(mid, max); }
            if (*min > *mid) { std::swap(min, mid); }
            *min = rewrite_input(*min);
            *mid = rewrite_input(*mid);
            *max = rewrite_input(*max);
            optimized.push_back({inst.op,
                                 inst.x, inst.y, inst.z,
                                 inst.immy, inst.immz,
                                 /*death=*/0, /*can_hoist=*/true, /*used_in_loop=*/false});
            return (Val)optimized.size()-1;
        };

        // Here we go with the actual rewriting, starting with all the store instructions
        // and letting rewrite() work back recursively through their inputs.
        for (Val id = 0; id < (Val)program.size(); id++) {
            if (program[id].op <= Op::store32) {
                rewrite(id, rewrite);
            }
        }

        // We're done with `program` now... everything below will analyze `optimized`.

        // We'll want to know when it's safe to recycle registers holding the values
        // produced by each instruction, that is, when no future instruction needs it.
        for (Val id = 0; id < (Val)optimized.size(); id++) {
            OptimizedInstruction& inst = optimized[id];
            // Stores don't really produce values.  Just mark them as dying on issue.
            if (inst.op <= Op::store32) {
                inst.death = id;
            }
            // Extend the lifetime of this instruction's inputs to live until it issues.
            // (We're walking in order, so this is the same as max()ing.)
            if (inst.x != NA) { optimized[inst.x].death = id; }
            if (inst.y != NA) { optimized[inst.y].death = id; }
            if (inst.z != NA) { optimized[inst.z].death = id; }
        }

        // Mark which values don't depend on the loop and can be hoisted.
        for (Val id = 0; id < (Val)optimized.size(); id++) {
            OptimizedInstruction& inst = optimized[id];

            // Varying loads (and gathers) and stores cannot be hoisted out of the loop.
            if (inst.op <= Op::gather32 && inst.op != Op::assert_true) {
                inst.can_hoist = false;
            }

            // If any of an instruction's inputs can't be hoisted, it can't be hoisted itself.
            if (inst.can_hoist) {
                if (inst.x != NA) { inst.can_hoist &= optimized[inst.x].can_hoist; }
                if (inst.y != NA) { inst.can_hoist &= optimized[inst.y].can_hoist; }
                if (inst.z != NA) { inst.can_hoist &= optimized[inst.z].can_hoist; }
            }

            // We'll want to know if hoisted values are used in the loop;
            // if not, we can recycle their registers like we do loop values.
            if (!inst.can_hoist /*i.e. we're in the loop, so the arguments are used_in_loop*/) {
                if (inst.x != NA) { optimized[inst.x].used_in_loop = true; }
                if (inst.y != NA) { optimized[inst.y].used_in_loop = true; }
                if (inst.z != NA) { optimized[inst.z].used_in_loop = true; }
            }
        }

        return optimized;
    }
505 
done(const char * debug_name) const506     Program Builder::done(const char* debug_name) const {
507         char buf[64] = "skvm-jit-";
508         if (!debug_name) {
509             *SkStrAppendU32(buf+9, this->hash()) = '\0';
510             debug_name = buf;
511         }
512 
513     #if defined(SKVM_JIT)
514         return {this->optimize(false), this->optimize(true), fStrides, debug_name};
515     #else
516         return {this->optimize(false), fStrides};
517     #endif
518     }
519 
hash() const520     uint64_t Builder::hash() const {
521         uint32_t lo = SkOpts::hash(fProgram.data(), fProgram.size() * sizeof(Instruction), 0),
522                  hi = SkOpts::hash(fProgram.data(), fProgram.size() * sizeof(Instruction), 1);
523         return (uint64_t)lo | (uint64_t)hi << 32;
524     }
525 
operator ==(const Builder::Instruction & a,const Builder::Instruction & b)526     static bool operator==(const Builder::Instruction& a, const Builder::Instruction& b) {
527         return a.op   == b.op
528             && a.x    == b.x
529             && a.y    == b.y
530             && a.z    == b.z
531             && a.immy == b.immy
532             && a.immz == b.immz;
533     }
534 
    // Hash an Instruction by its raw bytes with the given seed; the companion
    // of the field-wise operator== above for the CSE hash map.
    uint32_t Builder::InstructionHash::operator()(const Instruction& inst, uint32_t seed) const {
        return SkOpts::hash(&inst, sizeof(inst), seed);
    }
538 
539 
540     // Most instructions produce a value and return it by ID,
541     // the value-producing instruction's own index in the program vector.
push(Op op,Val x,Val y,Val z,int immy,int immz)542     Val Builder::push(Op op, Val x, Val y, Val z, int immy, int immz) {
543         Instruction inst{op, x, y, z, immy, immz};
544 
545         // Basic common subexpression elimination:
546         // if we've already seen this exact Instruction, use it instead of creating a new one.
547         if (Val* id = fIndex.find(inst)) {
548             return *id;
549         }
550         Val id = static_cast<Val>(fProgram.size());
551         fProgram.push_back(inst);
552         fIndex.set(inst, id);
553         return id;
554     }
555 
allImm() const556     bool Builder::allImm() const { return true; }
557 
558     template <typename T, typename... Rest>
allImm(Val id,T * imm,Rest...rest) const559     bool Builder::allImm(Val id, T* imm, Rest... rest) const {
560         if (fProgram[id].op == Op::splat) {
561             static_assert(sizeof(T) == 4);
562             memcpy(imm, &fProgram[id].immy, 4);
563             return this->allImm(rest...);
564         }
565         return false;
566     }
567 
arg(int stride)568     Arg Builder::arg(int stride) {
569         int ix = (int)fStrides.size();
570         fStrides.push_back(stride);
571         return {ix};
572     }
573 
    // Debug-only: record an assert_true instruction checking `cond`, keeping
    // `debug` alive as extra context (runtime semantics live in the
    // interpreter/JIT, outside this view).  If `cond` is a known constant it is
    // asserted immediately at build time instead.  Compiles to nothing in release.
    void Builder::assert_true(I32 cond, I32 debug) {
    #ifdef SK_DEBUG
        int imm;
        if (this->allImm(cond.id,&imm)) { SkASSERT(imm); return; }
        (void)this->push(Op::assert_true, cond.id,debug.id,NA);
    #endif
    }
581 
store8(Arg ptr,I32 val)582     void Builder::store8 (Arg ptr, I32 val) { (void)this->push(Op::store8 , val.id,NA,NA, ptr.ix); }
store16(Arg ptr,I32 val)583     void Builder::store16(Arg ptr, I32 val) { (void)this->push(Op::store16, val.id,NA,NA, ptr.ix); }
store32(Arg ptr,I32 val)584     void Builder::store32(Arg ptr, I32 val) { (void)this->push(Op::store32, val.id,NA,NA, ptr.ix); }
585 
index()586     I32 Builder::index() { return {this->push(Op::index , NA,NA,NA,0) }; }
587 
load8(Arg ptr)588     I32 Builder::load8 (Arg ptr) { return {this->push(Op::load8 , NA,NA,NA, ptr.ix) }; }
load16(Arg ptr)589     I32 Builder::load16(Arg ptr) { return {this->push(Op::load16, NA,NA,NA, ptr.ix) }; }
load32(Arg ptr)590     I32 Builder::load32(Arg ptr) { return {this->push(Op::load32, NA,NA,NA, ptr.ix) }; }
591 
gather8(Arg ptr,int offset,I32 index)592     I32 Builder::gather8 (Arg ptr, int offset, I32 index) {
593         return {this->push(Op::gather8 , index.id,NA,NA, ptr.ix,offset)};
594     }
gather16(Arg ptr,int offset,I32 index)595     I32 Builder::gather16(Arg ptr, int offset, I32 index) {
596         return {this->push(Op::gather16, index.id,NA,NA, ptr.ix,offset)};
597     }
gather32(Arg ptr,int offset,I32 index)598     I32 Builder::gather32(Arg ptr, int offset, I32 index) {
599         return {this->push(Op::gather32, index.id,NA,NA, ptr.ix,offset)};
600     }
601 
uniform8(Arg ptr,int offset)602     I32 Builder::uniform8(Arg ptr, int offset) {
603         return {this->push(Op::uniform8, NA,NA,NA, ptr.ix, offset)};
604     }
uniform16(Arg ptr,int offset)605     I32 Builder::uniform16(Arg ptr, int offset) {
606         return {this->push(Op::uniform16, NA,NA,NA, ptr.ix, offset)};
607     }
uniform32(Arg ptr,int offset)608     I32 Builder::uniform32(Arg ptr, int offset) {
609         return {this->push(Op::uniform32, NA,NA,NA, ptr.ix, offset)};
610     }
611 
612     // The two splat() functions are just syntax sugar over splatting a 4-byte bit pattern.
splat(int n)613     I32 Builder::splat(int   n) { return {this->push(Op::splat, NA,NA,NA, n) }; }
splat(float f)614     F32 Builder::splat(float f) {
615         int bits;
616         memcpy(&bits, &f, 4);
617         return {this->push(Op::splat, NA,NA,NA, bits)};
618     }
619 
620     // Be careful peepholing float math!  Transformations you might expect to
621     // be legal can fail in the face of NaN/Inf, e.g. 0*x is not always 0.
622     // Float peepholes must pass this equivalence test for all ~4B floats:
623     //
624     //     bool equiv(float x, float y) { return (x == y) || (isnanf(x) && isnanf(y)); }
625     //
626     //     unsigned bits = 0;
627     //     do {
628     //        float f;
629     //        memcpy(&f, &bits, 4);
630     //        if (!equiv(f, ...)) {
631     //           abort();
632     //        }
633     //     } while (++bits != 0);
634 
    // Float add with constant folding and x+0/0+y identity peepholes.
    // NOTE(review): x+0 == x returns -0.0f where strict IEEE addition would give
    // +0.0f for (-0.0f)+0.0f — assumed acceptable under the equivalence rule above.
    F32 Builder::add(F32 x, F32 y) {
        float X,Y;
        if (this->allImm(x.id,&X, y.id,&Y)) { return this->splat(X+Y); }
        if (this->isImm(y.id, 0.0f)) { return x; }   // x+0 == x
        if (this->isImm(x.id, 0.0f)) { return y; }   // 0+y == y
        return {this->push(Op::add_f32, x.id, y.id)};
    }
642 
sub(F32 x,F32 y)643     F32 Builder::sub(F32 x, F32 y) {
644         float X,Y;
645         if (this->allImm(x.id,&X, y.id,&Y)) { return this->splat(X-Y); }
646         if (this->isImm(y.id, 0.0f)) { return x; }   // x-0 == x
647         return {this->push(Op::sub_f32, x.id, y.id)};
648     }
649 
mul(F32 x,F32 y)650     F32 Builder::mul(F32 x, F32 y) {
651         float X,Y;
652         if (this->allImm(x.id,&X, y.id,&Y)) { return this->splat(X*Y); }
653         if (this->isImm(y.id, 1.0f)) { return x; }  // x*1 == x
654         if (this->isImm(x.id, 1.0f)) { return y; }  // 1*y == y
655         return {this->push(Op::mul_f32, x.id, y.id)};
656     }
657 
div(F32 x,F32 y)658     F32 Builder::div(F32 x, F32 y) {
659         float X,Y;
660         if (this->allImm(x.id,&X, y.id,&Y)) { return this->splat(X/Y); }
661         if (this->isImm(y.id, 1.0f)) { return x; }  // x/1 == x
662         return {this->push(Op::div_f32, x.id, y.id)};
663     }
664 
    // Fused-ish multiply-add: x*y + z, with peepholes that degrade to add() or
    // mul() when one operand makes the mad redundant.
    F32 Builder::mad(F32 x, F32 y, F32 z) {
        float X,Y,Z;
        if (this->allImm(x.id,&X, y.id,&Y, z.id,&Z)) { return this->splat(X*Y+Z); }
        if (this->isImm(y.id, 1.0f)) { return this->add(x,z); }  // x*1+z == x+z
        if (this->isImm(x.id, 1.0f)) { return this->add(y,z); }  // 1*y+z == y+z
        if (this->isImm(z.id, 0.0f)) { return this->mul(x,y); }  // x*y+0 == x*y
        return {this->push(Op::mad_f32, x.id, y.id, z.id)};
    }
673 
sqrt(F32 x)674     F32 Builder::sqrt(F32 x) {
675         float X;
676         if (this->allImm(x.id,&X)) { return this->splat(std::sqrt(X)); }
677         return {this->push(Op::sqrt_f32, x.id,NA,NA)};
678     }
679 
    // Lane-wise float min/max with immediate folding.
    // NOTE(review): std::min/std::max NaN behavior may differ from the backend
    // min/max ops when an operand is NaN — assumed immaterial here; confirm.
    F32 Builder::min(F32 x, F32 y) {
        float X,Y;
        if (this->allImm(x.id,&X, y.id,&Y)) { return this->splat(std::min(X,Y)); }
        return {this->push(Op::min_f32, x.id, y.id)};
    }
    F32 Builder::max(F32 x, F32 y) {
        float X,Y;
        if (this->allImm(x.id,&X, y.id,&Y)) { return this->splat(std::max(X,Y)); }
        return {this->push(Op::max_f32, x.id, y.id)};
    }
690 
    // 32-bit integer arithmetic, and the same over paired 16-bit lanes (16x2).
    // Unlike the float ops, no constant folding or identity peepholes here.
    I32 Builder::add(I32 x, I32 y) { return {this->push(Op::add_i32, x.id, y.id)}; }
    I32 Builder::sub(I32 x, I32 y) { return {this->push(Op::sub_i32, x.id, y.id)}; }
    I32 Builder::mul(I32 x, I32 y) { return {this->push(Op::mul_i32, x.id, y.id)}; }

    I32 Builder::add_16x2(I32 x, I32 y) { return {this->push(Op::add_i16x2, x.id, y.id)}; }
    I32 Builder::sub_16x2(I32 x, I32 y) { return {this->push(Op::sub_i16x2, x.id, y.id)}; }
    I32 Builder::mul_16x2(I32 x, I32 y) { return {this->push(Op::mul_i16x2, x.id, y.id)}; }
698 
shl(I32 x,int bits)699     I32 Builder::shl(I32 x, int bits) {
700         if (bits == 0) { return x; }
701         int X;
702         if (this->allImm(x.id,&X)) { return this->splat(X << bits); }
703         return {this->push(Op::shl_i32, x.id,NA,NA, bits)};
704     }
shr(I32 x,int bits)705     I32 Builder::shr(I32 x, int bits) {
706         if (bits == 0) { return x; }
707         int X;
708         if (this->allImm(x.id,&X)) { return this->splat(unsigned(X) >> bits); }
709         return {this->push(Op::shr_i32, x.id,NA,NA, bits)};
710     }
    // Arithmetic (sign-extending) right shift by a compile-time count.
    I32 Builder::sra(I32 x, int bits) {
        if (bits == 0) { return x; }
        int X;
        // NOTE(review): X >> bits for negative X is implementation-defined in
        // C++ (arithmetic shift on all mainstream compilers) — assumed relied on.
        if (this->allImm(x.id,&X)) { return this->splat(X >> bits); }
        return {this->push(Op::sra_i32, x.id,NA,NA, bits)};
    }
717 
    // Same three shifts, applied independently to each 16-bit half-lane.
    I32 Builder::shl_16x2(I32 x, int bits) { return {this->push(Op::shl_i16x2, x.id,NA,NA, bits)}; }
    I32 Builder::shr_16x2(I32 x, int bits) { return {this->push(Op::shr_i16x2, x.id,NA,NA, bits)}; }
    I32 Builder::sra_16x2(I32 x, int bits) { return {this->push(Op::sra_i16x2, x.id,NA,NA, bits)}; }
721 
    // Float comparisons produce a lane mask: all bits set (~0) when true, 0 when
    // false.  There are no lt/lte ops; they swap operands and reuse gt/gte.
    I32 Builder:: eq(F32 x, F32 y) {
        float X,Y;
        if (this->allImm(x.id,&X, y.id,&Y)) { return this->splat(X==Y ? ~0 : 0); }
        return {this->push(Op::eq_f32, x.id, y.id)};
    }
    I32 Builder::neq(F32 x, F32 y) {
        float X,Y;
        if (this->allImm(x.id,&X, y.id,&Y)) { return this->splat(X!=Y ? ~0 : 0); }
        return {this->push(Op::neq_f32, x.id, y.id)};
    }
    I32 Builder::lt(F32 x, F32 y) {
        float X,Y;
        if (this->allImm(x.id,&X, y.id,&Y)) { return this->splat(Y> X ? ~0 : 0); }
        return {this->push(Op::gt_f32, y.id, x.id)};
    }
    I32 Builder::lte(F32 x, F32 y) {
        float X,Y;
        if (this->allImm(x.id,&X, y.id,&Y)) { return this->splat(Y>=X ? ~0 : 0); }
        return {this->push(Op::gte_f32, y.id, x.id)};
    }
    I32 Builder::gt(F32 x, F32 y) {
        float X,Y;
        if (this->allImm(x.id,&X, y.id,&Y)) { return this->splat(X> Y ? ~0 : 0); }
        return {this->push(Op::gt_f32, x.id, y.id)};
    }
    I32 Builder::gte(F32 x, F32 y) {
        float X,Y;
        if (this->allImm(x.id,&X, y.id,&Y)) { return this->splat(X>=Y ? ~0 : 0); }
        return {this->push(Op::gte_f32, x.id, y.id)};
    }
752 
    // Integer comparisons (32-bit and 16x2 half-lane forms); like the float
    // versions, lt/lte reuse gt/gte with swapped operands, returning lane masks.
    I32 Builder:: eq(I32 x, I32 y) { return {this->push(Op:: eq_i32, x.id, y.id)}; }
    I32 Builder::neq(I32 x, I32 y) { return {this->push(Op::neq_i32, x.id, y.id)}; }
    I32 Builder:: lt(I32 x, I32 y) { return {this->push(Op:: gt_i32, y.id, x.id)}; }
    I32 Builder::lte(I32 x, I32 y) { return {this->push(Op::gte_i32, y.id, x.id)}; }
    I32 Builder:: gt(I32 x, I32 y) { return {this->push(Op:: gt_i32, x.id, y.id)}; }
    I32 Builder::gte(I32 x, I32 y) { return {this->push(Op::gte_i32, x.id, y.id)}; }

    I32 Builder:: eq_16x2(I32 x, I32 y) { return {this->push(Op:: eq_i16x2, x.id, y.id)}; }
    I32 Builder::neq_16x2(I32 x, I32 y) { return {this->push(Op::neq_i16x2, x.id, y.id)}; }
    I32 Builder:: lt_16x2(I32 x, I32 y) { return {this->push(Op:: gt_i16x2, y.id, x.id)}; }
    I32 Builder::lte_16x2(I32 x, I32 y) { return {this->push(Op::gte_i16x2, y.id, x.id)}; }
    I32 Builder:: gt_16x2(I32 x, I32 y) { return {this->push(Op:: gt_i16x2, x.id, y.id)}; }
    I32 Builder::gte_16x2(I32 x, I32 y) { return {this->push(Op::gte_i16x2, x.id, y.id)}; }
766 
    // Bitwise ops with constant folding and boolean identity peepholes; "true"
    // is the all-bits mask ~0, "false" is 0, matching the comparison results.
    I32 Builder::bit_and(I32 x, I32 y) {
        int X,Y;
        if (this->allImm(x.id,&X, y.id,&Y)) { return this->splat(X&Y); }
        if (this->isImm(y.id, 0)) { return this->splat(0); }   // (x & false) == false
        if (this->isImm(x.id, 0)) { return this->splat(0); }   // (false & y) == false
        if (this->isImm(y.id,~0)) { return x; }                // (x & true) == x
        if (this->isImm(x.id,~0)) { return y; }                // (true & y) == y
        return {this->push(Op::bit_and, x.id, y.id)};
    }
    I32 Builder::bit_or(I32 x, I32 y) {
        int X,Y;
        if (this->allImm(x.id,&X, y.id,&Y)) { return this->splat(X|Y); }
        if (this->isImm(y.id, 0)) { return x; }                 // (x | false) == x
        if (this->isImm(x.id, 0)) { return y; }                 // (false | y) == y
        if (this->isImm(y.id,~0)) { return this->splat(~0); }   // (x | true) == true
        if (this->isImm(x.id,~0)) { return this->splat(~0); }   // (true | y) == true
        return {this->push(Op::bit_or, x.id, y.id)};
    }
    I32 Builder::bit_xor(I32 x, I32 y) {
        int X,Y;
        if (this->allImm(x.id,&X, y.id,&Y)) { return this->splat(X^Y); }
        if (this->isImm(y.id, 0)) { return x; }   // (x ^ false) == x
        if (this->isImm(x.id, 0)) { return y; }   // (false ^ y) == y
        return {this->push(Op::bit_xor, x.id, y.id)};
    }
    // bit_clear(x,y) == x & ~y.
    I32 Builder::bit_clear(I32 x, I32 y) {
        int X,Y;
        if (this->allImm(x.id,&X, y.id,&Y)) { return this->splat(X&~Y); }
        if (this->isImm(y.id, 0)) { return x; }                // (x & ~false) == x
        if (this->isImm(y.id,~0)) { return this->splat(0); }   // (x & ~true) == false
        if (this->isImm(x.id, 0)) { return this->splat(0); }   // (false & ~y) == false
        return {this->push(Op::bit_clear, x.id, y.id)};
    }
800 
select(I32 x,I32 y,I32 z)801     I32 Builder::select(I32 x, I32 y, I32 z) {
802         int X,Y,Z;
803         if (this->allImm(x.id,&X, y.id,&Y, z.id,&Z)) { return this->splat(X?Y:Z); }
804         // TODO: some cases to reduce to bit_and when y == 0 or z == 0?
805         return {this->push(Op::select, x.id, y.id, z.id)};
806     }
807 
    // extract(x, bits, z) == (x >> bits) & z.
    I32 Builder::extract(I32 x, int bits, I32 z) {
        int Z;
        // If the mask keeps every bit that survives the shift, skip the bit_and.
        if (this->allImm(z.id,&Z) && (~0u>>bits) == (unsigned)Z) { return this->shr(x, bits); }
        return this->bit_and(z, this->shr(x, bits));
    }
813 
    // pack(x,y,bits) == x | (y << bits).
    // NOTE(review): presumably x's set bits don't overlap (y << bits) — confirm
    // against Op::pack's interpreter/JIT implementation.
    I32 Builder::pack(I32 x, I32 y, int bits) {
        int X,Y;
        if (this->allImm(x.id,&X, y.id,&Y)) { return this->splat(X|(Y<<bits)); }
        return {this->push(Op::pack, x.id,y.id,NA, 0,bits)};
    }

    // Byte shuffle within each lane; `control` encodes the permutation
    // (see Op::bytes' implementation for the exact encoding).
    I32 Builder::bytes(I32 x, int control) {
        return {this->push(Op::bytes, x.id,NA,NA, control)};
    }
823 
    // Round down to the nearest integer-valued float.
    F32 Builder::floor(F32 x) {
        float X;
        if (this->allImm(x.id,&X)) { return this->splat(floorf(X)); }
        return {this->push(Op::floor, x.id)};
    }
    // int -> float conversion.
    F32 Builder::to_f32(I32 x) {
        int X;
        if (this->allImm(x.id,&X)) { return this->splat((float)X); }
        return {this->push(Op::to_f32, x.id)};
    }
    // float -> int, truncating toward zero (C cast semantics).
    I32 Builder::trunc(F32 x) {
        float X;
        if (this->allImm(x.id,&X)) { return this->splat((int)X); }
        return {this->push(Op::trunc, x.id)};
    }
    // float -> int using lrintf, i.e. the current rounding mode
    // (round-to-nearest-even in practice).
    I32 Builder::round(F32 x) {
        float X;
        if (this->allImm(x.id,&X)) { return this->splat((int)lrintf(X)); }
        return {this->push(Op::round, x.id)};
    }
844 
    // Convert a `bits`-wide unsigned normalized integer (0..2^bits-1)
    // to a float in [0,1], and back.
    F32 Builder::from_unorm(int bits, I32 x) {
        float limit = (1<<bits)-1.0f;
        return mul(to_f32(x), splat(1/limit));   // Multiply by the reciprocal rather than divide.
    }
    I32 Builder::to_unorm(int bits, F32 x) {
        float limit = (1<<bits)-1.0f;
        return round(mul(x, splat(limit)));
    }
853 
    // Unpack fixed-point pixel formats into normalized [0,1] float channels
    // {r,g,b,a}, least-significant bits first.
    Color Builder::unpack_1010102(I32 rgba) {
        return {
            from_unorm(10, extract(rgba,  0, splat(0x3ff))),
            from_unorm(10, extract(rgba, 10, splat(0x3ff))),
            from_unorm(10, extract(rgba, 20, splat(0x3ff))),
            from_unorm( 2, extract(rgba, 30, splat(0x3  ))),
        };
    }
    Color Builder::unpack_8888(I32 rgba) {
        return {
            from_unorm(8, extract(rgba,  0, splat(0xff))),
            from_unorm(8, extract(rgba,  8, splat(0xff))),
            from_unorm(8, extract(rgba, 16, splat(0xff))),
            from_unorm(8, extract(rgba, 24, splat(0xff))),
        };
    }
    // 565 has no alpha, so alpha is forced to 1.0: r in bits 11-15, g in 5-10, b in 0-4.
    Color Builder::unpack_565(I32 bgr) {
        return {
            from_unorm(5, extract(bgr, 11, splat(0b011'111))),
            from_unorm(6, extract(bgr,  5, splat(0b111'111))),
            from_unorm(5, extract(bgr,  0, splat(0b011'111))),
            splat(1.0f),
        };
    }
878 
    // Convert premultiplied r,g,b (in place) back to unpremultiplied by
    // multiplying with 1/a.
    void Builder::unpremul(F32* r, F32* g, F32* b, F32 a) {
        skvm::F32 invA = div(splat(1.0f), a),
                  inf  = bit_cast(splat(0x7f800000));   // 0x7f800000 is the bit pattern of +inf.
        // If a is 0, so are *r,*g,*b, so set invA to 0 to avoid 0*inf=NaN (instead 0*0 = 0).
        // lt(invA, inf) is an all-bits mask when invA is finite, so the bit_and
        // zeroes invA exactly when 1/a overflowed to +inf (i.e. a == 0).
        invA = bit_cast(bit_and(lt(invA, inf),
                                bit_cast(invA)));
        *r = mul(*r, invA);
        *g = mul(*g, invA);
        *b = mul(*b, invA);
    }
889 
premul(F32 * r,F32 * g,F32 * b,F32 a)890     void Builder::premul(F32* r, F32* g, F32* b, F32 a) {
891         *r = mul(*r, a);
892         *g = mul(*g, a);
893         *b = mul(*b, a);
894     }
895 
lerp(Color lo,Color hi,F32 t)896     Color Builder::lerp(Color lo, Color hi, F32 t) {
897         return {
898             lerp(lo.r, hi.r, t),
899             lerp(lo.g, hi.g, t),
900             lerp(lo.b, hi.b, t),
901             lerp(lo.a, hi.a, t),
902         };
903     }
904 
905     // ~~~~ Program::eval() and co. ~~~~ //
906 
907     // Handy references for x86-64 instruction encoding:
908     // https://wiki.osdev.org/X86-64_Instruction_Encoding
909     // https://www-user.tu-chemnitz.de/~heha/viewchm.php/hs/x86.chm/x64.htm
910     // https://www-user.tu-chemnitz.de/~heha/viewchm.php/hs/x86.chm/x86.htm
911     // http://ref.x86asm.net/coder64.html
912 
913     // Used for ModRM / immediate instruction encoding.
_233(int a,int b,int c)914     static uint8_t _233(int a, int b, int c) {
915         return (a & 3) << 6
916              | (b & 7) << 3
917              | (c & 7) << 0;
918     }
919 
    // ModRM byte encodes the arguments of an opcode.
    enum class Mod { Indirect, OneByteImm, FourByteImm, Direct };
    static uint8_t mod_rm(Mod mod, int reg, int rm) {
        return _233((int)mod, reg, rm);
    }

    // Pick the smallest addressing mode whose displacement can hold imm.
    static Mod mod(int imm) {
        if (imm == 0)               { return Mod::Indirect; }
        if (SkTFitsIn<int8_t>(imm)) { return Mod::OneByteImm; }
        return Mod::FourByteImm;
    }
931 
    // How many displacement bytes follow the ModRM byte for each addressing mode.
    static int imm_bytes(Mod mod) {
        switch (mod) {
            case Mod::Indirect:    return 0;
            case Mod::OneByteImm:  return 1;
            case Mod::FourByteImm: return 4;
            case Mod::Direct: SkUNREACHABLE;   // Direct addresses a register — no displacement.
        }
        SkUNREACHABLE;
    }
941 
    // SIB byte encodes a memory address, base + (index * scale).
    static uint8_t sib(Assembler::Scale scale, int index, int base) {
        return _233((int)scale, index, base);
    }
946 
947     // The REX prefix is used to extend most old 32-bit instructions to 64-bit.
rex(bool W,bool R,bool X,bool B)948     static uint8_t rex(bool W,   // If set, operation is 64-bit, otherwise default, usually 32-bit.
949                        bool R,   // Extra top bit to select ModRM reg, registers 8-15.
950                        bool X,   // Extra top bit for SIB index register.
951                        bool B) { // Extra top bit for SIB base or ModRM rm register.
952         return 0b01000000   // Fixed 0100 for top four bits.
953              | (W << 3)
954              | (R << 2)
955              | (X << 1)
956              | (B << 0);
957     }
958 
959 
    // The VEX prefix extends SSE operations to AVX.  Used generally, even with XMM.
    struct VEX {
        int     len;        // 2 or 3 meaningful bytes in `bytes`.
        uint8_t bytes[3];
    };

    static VEX vex(bool  WE,   // Like REX W for int operations, or opcode extension for float?
                   bool   R,   // Same as REX R.  Pass high bit of dst register, dst>>3.
                   bool   X,   // Same as REX X.
                   bool   B,   // Same as REX B.  Pass y>>3 for 3-arg ops, x>>3 for 2-arg.
                   int  map,   // SSE opcode map selector: 0x0f, 0x380f, 0x3a0f.
                   int vvvv,   // 4-bit second operand register.  Pass our x for 3-arg ops.
                   bool   L,   // Set for 256-bit ymm operations, off for 128-bit xmm.
                   int   pp) { // SSE mandatory prefix: 0x66, 0xf3, 0xf2, else none.

        // Pack x86 opcode map selector to 5-bit VEX encoding.
        map = [map]{
            switch (map) {
                case   0x0f: return 0b00001;
                case 0x380f: return 0b00010;
                case 0x3a0f: return 0b00011;
                // Several more cases only used by XOP / TBM.
            }
            SkUNREACHABLE;
        }();

        // Pack  mandatory SSE opcode prefix byte to 2-bit VEX encoding.
        pp = [pp]{
            switch (pp) {
                case 0x66: return 0b01;
                case 0xf3: return 0b10;
                case 0xf2: return 0b11;
            }
            return 0b00;
        }();

        // Note: VEX stores R, X, B, and vvvv bit-inverted (hence the ~s below).
        VEX vex = {0, {0,0,0}};
        if (X == 0 && B == 0 && WE == 0 && map == 0b00001) {
            // With these conditions met, we can optionally compress VEX to 2-byte.
            vex.len = 2;
            vex.bytes[0] = 0xc5;
            vex.bytes[1] = (pp      &  3) << 0
                         | (L       &  1) << 2
                         | (~vvvv   & 15) << 3
                         | (~(int)R &  1) << 7;
        } else {
            // We could use this 3-byte VEX prefix all the time if we like.
            vex.len = 3;
            vex.bytes[0] = 0xc4;
            vex.bytes[1] = (map     & 31) << 0
                         | (~(int)B &  1) << 5
                         | (~(int)X &  1) << 6
                         | (~(int)R &  1) << 7;
            vex.bytes[2] = (pp    &  3) << 0
                         | (L     &  1) << 2
                         | (~vvvv & 15) << 3
                         | (WE    &  1) << 7;
        }
        return vex;
    }
1020 
    // With a null buf the Assembler only counts bytes (fCurr stays null), so a
    // first dry-run pass can size the buffer for a second, writing pass.
    Assembler::Assembler(void* buf) : fCode((uint8_t*)buf), fCurr(fCode), fSize(0) {}

    size_t Assembler::size() const { return fSize; }

    void Assembler::bytes(const void* p, int n) {
        if (fCurr) {
            memcpy(fCurr, p, n);
            fCurr += n;
        }
        fSize += n;   // Size is tracked even when we're not writing.
    }

    // Little-endian emit of 1- and 4-byte values.
    void Assembler::byte(uint8_t b) { this->bytes(&b, 1); }
    void Assembler::word(uint32_t w) { this->bytes(&w, 4); }
1035 
align(int mod)1036     void Assembler::align(int mod) {
1037         while (this->size() % mod) {
1038             this->byte(0x00);
1039         }
1040     }
1041 
    // int3: software breakpoint (0xCC), handy for debugging generated code.
    void Assembler::int3() {
        this->byte(0xcc);
    }

    // vzeroupper: zero the upper halves of all ymm registers, avoiding
    // AVX<->SSE transition penalties when mixed code follows.
    void Assembler::vzeroupper() {
        this->byte(0xc5);
        this->byte(0xf8);
        this->byte(0x77);
    }
    void Assembler::ret() { this->byte(0xc3); }
1052 
    // Common instruction building for 64-bit opcodes with an immediate argument.
    void Assembler::op(int opcode, int opcode_ext, GP64 dst, int imm) {
        opcode |= 0b0000'0001;   // low bit set for 64-bit operands
        opcode |= 0b1000'0000;   // top bit set for instructions with any immediate

        int imm_bytes = 4;
        if (SkTFitsIn<int8_t>(imm)) {
            imm_bytes = 1;
            opcode |= 0b0000'0010;  // second bit set for 8-bit immediate, else 32-bit.
        }

        this->byte(rex(1,0,0,dst>>3));
        this->byte(opcode);
        this->byte(mod_rm(Mod::Direct, opcode_ext, dst&7));
        this->bytes(&imm, imm_bytes);
    }

    // The opcode extension (ModRM reg field) selects the ALU op: /0 add, /5 sub, /7 cmp.
    void Assembler::add(GP64 dst, int imm) { this->op(0,0b000, dst,imm); }
    void Assembler::sub(GP64 dst, int imm) { this->op(0,0b101, dst,imm); }
    void Assembler::cmp(GP64 reg, int imm) { this->op(0,0b111, reg,imm); }
1073 
    // movq: 64-bit load, dst = *(src + off), using the smallest displacement that fits.
    void Assembler::movq(GP64 dst, GP64 src, int off) {
        this->byte(rex(1,dst>>3,0,src>>3));
        this->byte(0x8b);
        this->byte(mod_rm(mod(off), dst&7, src&7));
        this->bytes(&off, imm_bytes(mod(off)));
    }
1080 
    // Generic VEX-encoded 3-argument op, dst = x op y: x rides in VEX.vvvv,
    // dst in ModRM reg, y in ModRM rm.
    void Assembler::op(int prefix, int map, int opcode, Ymm dst, Ymm x, Ymm y, bool W/*=false*/) {
        VEX v = vex(W, dst>>3, 0, y>>3,
                    map, x, 1/*ymm, not xmm*/, prefix);
        this->bytes(v.bytes, v.len);
        this->byte(opcode);
        this->byte(mod_rm(Mod::Direct, dst&7, y&7));
    }
1088 
    // Simple VEX-encoded AVX/AVX2 ops: integer and float arithmetic, logic,
    // FMA, pack, and compares.  YmmOrLabel variants can take a memory operand.
    void Assembler::vpaddd (Ymm dst, Ymm x, YmmOrLabel y) { this->op(0x66,  0x0f,0xfe, dst,x,y); }
    void Assembler::vpsubd (Ymm dst, Ymm x, YmmOrLabel y) { this->op(0x66,  0x0f,0xfa, dst,x,y); }
    void Assembler::vpmulld(Ymm dst, Ymm x, Ymm        y) { this->op(0x66,0x380f,0x40, dst,x,y); }

    void Assembler::vpsubw (Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x0f,0xf9, dst,x,y); }
    void Assembler::vpmullw(Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x0f,0xd5, dst,x,y); }

    void Assembler::vpand (Ymm dst, Ymm x, YmmOrLabel y) { this->op(0x66,0x0f,0xdb, dst,x,y); }
    void Assembler::vpor  (Ymm dst, Ymm x, YmmOrLabel y) { this->op(0x66,0x0f,0xeb, dst,x,y); }
    void Assembler::vpxor (Ymm dst, Ymm x, YmmOrLabel y) { this->op(0x66,0x0f,0xef, dst,x,y); }
    void Assembler::vpandn(Ymm dst, Ymm x, Ymm        y) { this->op(0x66,0x0f,0xdf, dst,x,y); }

    void Assembler::vaddps(Ymm dst, Ymm x, YmmOrLabel y) { this->op(0,0x0f,0x58, dst,x,y); }
    void Assembler::vsubps(Ymm dst, Ymm x, YmmOrLabel y) { this->op(0,0x0f,0x5c, dst,x,y); }
    void Assembler::vmulps(Ymm dst, Ymm x, YmmOrLabel y) { this->op(0,0x0f,0x59, dst,x,y); }
    void Assembler::vdivps(Ymm dst, Ymm x, Ymm        y) { this->op(0,0x0f,0x5e, dst,x,y); }
    void Assembler::vminps(Ymm dst, Ymm x, YmmOrLabel y) { this->op(0,0x0f,0x5d, dst,x,y); }
    void Assembler::vmaxps(Ymm dst, Ymm x, YmmOrLabel y) { this->op(0,0x0f,0x5f, dst,x,y); }

    void Assembler::vfmadd132ps(Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x380f,0x98, dst,x,y); }
    void Assembler::vfmadd213ps(Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x380f,0xa8, dst,x,y); }
    void Assembler::vfmadd231ps(Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x380f,0xb8, dst,x,y); }

    void Assembler::vpackusdw(Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x380f,0x2b, dst,x,y); }
    void Assembler::vpackuswb(Ymm dst, Ymm x, Ymm y) { this->op(0x66,  0x0f,0x67, dst,x,y); }

    void Assembler::vpcmpeqd(Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x0f,0x76, dst,x,y); }
    void Assembler::vpcmpgtd(Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x0f,0x66, dst,x,y); }
1117 
    // vcmpps: float compare; the trailing imm byte selects the compare predicate.
    void Assembler::vcmpps(Ymm dst, Ymm x, Ymm y, int imm) {
        this->op(0,0x0f,0xc2, dst,x,y);
        this->byte(imm);
    }

    // vpblendvb: byte-wise blend of x and y controlled by mask register z.
    // Encoded by hand because the fourth register rides in an imm byte.
    void Assembler::vpblendvb(Ymm dst, Ymm x, Ymm y, Ymm z) {
        int prefix = 0x66,
            map    = 0x3a0f,
            opcode = 0x4c;
        VEX v = vex(0, dst>>3, 0, y>>3,
                    map, x, /*ymm?*/1, prefix);
        this->bytes(v.bytes, v.len);
        this->byte(opcode);
        this->byte(mod_rm(Mod::Direct, dst&7, y&7));
        this->byte(z << 4);   // Fourth operand z lives in the top nibble of the imm byte.
    }
1134 
    // dst = x op /opcode_ext imm
    void Assembler::op(int prefix, int map, int opcode, int opcode_ext, Ymm dst, Ymm x, int imm) {
        // This is a little weird, but if we pass the opcode_ext as if it were the dst register,
        // the dst register as if x, and the x register as if y, all the bits end up where we want.
        this->op(prefix, map, opcode, (Ymm)opcode_ext,dst,x);
        this->byte(imm);
    }

    // Immediate-count 32-bit lane shifts; the opcode extension picks the shift kind.
    void Assembler::vpslld(Ymm dst, Ymm x, int imm) { this->op(0x66,0x0f,0x72,6, dst,x,imm); }
    void Assembler::vpsrld(Ymm dst, Ymm x, int imm) { this->op(0x66,0x0f,0x72,2, dst,x,imm); }
    void Assembler::vpsrad(Ymm dst, Ymm x, int imm) { this->op(0x66,0x0f,0x72,4, dst,x,imm); }

    // 16-bit lane logical right shift.
    void Assembler::vpsrlw(Ymm dst, Ymm x, int imm) { this->op(0x66,0x0f,0x71,2, dst,x,imm); }
1148 
1149 
    // vpermq: permute 64-bit quadwords of x per the 2-bit fields of imm.
    void Assembler::vpermq(Ymm dst, Ymm x, int imm) {
        // A bit unusual among the instructions we use, this is 64-bit operation, so we set W.
        bool W = true;
        this->op(0x66,0x3a0f,0x00, dst,x,W);
        this->byte(imm);
    }

    // vroundps: round packed floats; imm selects rounding mode and exception behavior.
    void Assembler::vroundps(Ymm dst, Ymm x, int imm) {
        this->op(0x66,0x3a0f,0x08, dst,x);
        this->byte(imm);
    }
1161 
    // Register-to-register move, int<->float conversions, and square root.
    void Assembler::vmovdqa(Ymm dst, Ymm src) { this->op(0x66,0x0f,0x6f, dst,src); }

    void Assembler::vcvtdq2ps (Ymm dst, Ymm x) { this->op(   0,0x0f,0x5b, dst,x); }   // int -> float
    void Assembler::vcvttps2dq(Ymm dst, Ymm x) { this->op(0xf3,0x0f,0x5b, dst,x); }   // float -> int, truncating
    void Assembler::vcvtps2dq (Ymm dst, Ymm x) { this->op(0x66,0x0f,0x5b, dst,x); }   // float -> int, rounding
    void Assembler::vsqrtps   (Ymm dst, Ymm x) { this->op(   0,0x0f,0x51, dst,x); }
1168 
    // The current position in the instruction stream, as a Label with no
    // committed displacement kind yet.
    Assembler::Label Assembler::here() {
        return { (int)this->size(), Label::NotYetSet, {} };
    }

    int Assembler::disp19(Label* l) {
        // A label must be used consistently as one displacement kind.
        SkASSERT(l->kind == Label::NotYetSet ||
                 l->kind == Label::ARMDisp19);
        l->kind = Label::ARMDisp19;
        l->references.push_back(here().offset);   // Record this site for later fixup elsewhere.
        // ARM 19-bit instruction count, from the beginning of this instruction.
        return (l->offset - here().offset) / 4;
    }

    int Assembler::disp32(Label* l) {
        SkASSERT(l->kind == Label::NotYetSet ||
                 l->kind == Label::X86Disp32);
        l->kind = Label::X86Disp32;
        l->references.push_back(here().offset);   // Record this site for later fixup elsewhere.
        // x86 32-bit byte count, from the end of this instruction.
        return l->offset - (here().offset + 4);
    }
1190 
    // Same generic VEX op, but with the second source taken from memory at a label.
    void Assembler::op(int prefix, int map, int opcode, Ymm dst, Ymm x, Label* l) {
        // IP-relative addressing uses Mod::Indirect with the R/M encoded as-if rbp or r13.
        const int rip = rbp;

        VEX v = vex(0, dst>>3, 0, rip>>3,
                    map, x, /*ymm?*/1, prefix);
        this->bytes(v.bytes, v.len);
        this->byte(opcode);
        this->byte(mod_rm(Mod::Indirect, dst&7, rip&7));
        this->word(this->disp32(l));
    }

    // Dispatch to the register or label form depending on which YmmOrLabel holds.
    void Assembler::op(int prefix, int map, int opcode, Ymm dst, Ymm x, YmmOrLabel y) {
        y.label ? this->op(prefix,map,opcode,dst,x, y.label)
                : this->op(prefix,map,opcode,dst,x, y.ymm  );
    }
1207 
    // Label-operand forms: byte shuffle, bit test, and 32-bit broadcast.
    void Assembler::vpshufb(Ymm dst, Ymm x, Label* l) { this->op(0x66,0x380f,0x00, dst,x,l); }
    void Assembler::vptest(Ymm dst, Label* l) { this->op(0x66, 0x380f, 0x17, dst, (Ymm)0, l); }

    void Assembler::vbroadcastss(Ymm dst, Label* l) { this->op(0x66,0x380f,0x18, dst, (Ymm)0, l); }
    void Assembler::vbroadcastss(Ymm dst, Xmm src)  { this->op(0x66,0x380f,0x18, dst, (Ymm)src); }
    // Broadcast a float loaded from [ptr+off] to all lanes of dst.
    void Assembler::vbroadcastss(Ymm dst, GP64 ptr, int off) {
        int prefix = 0x66,
               map = 0x380f,
            opcode = 0x18;
        VEX v = vex(0, dst>>3, 0, ptr>>3,
                    map, 0, /*ymm?*/1, prefix);
        this->bytes(v.bytes, v.len);
        this->byte(opcode);

        this->byte(mod_rm(mod(off), dst&7, ptr&7));
        this->bytes(&off, imm_bytes(mod(off)));
    }
1225 
    void Assembler::jump(uint8_t condition, Label* l) {
        // These conditional jumps can be either 2 bytes (short) or 6 bytes (near):
        //    7?     one-byte-disp
        //    0F 8? four-byte-disp
        // We always use the near displacement to make updating labels simpler (no resizing).
        this->byte(0x0f);
        this->byte(condition);
        this->word(this->disp32(l));
    }
    // Jcc condition bytes: 0x84 equal, 0x85 not-equal, 0x8c less, 0x82 carry (unsigned below).
    void Assembler::je (Label* l) { this->jump(0x84, l); }
    void Assembler::jne(Label* l) { this->jump(0x85, l); }
    void Assembler::jl (Label* l) { this->jump(0x8c, l); }
    void Assembler::jc (Label* l) { this->jump(0x82, l); }

    void Assembler::jmp(Label* l) {
        // Like above in jump(), we could use 8-bit displacement here, but always use 32-bit.
        this->byte(0xe9);
        this->word(this->disp32(l));
    }
1245 
    // Shared encoding for simple [ptr]-addressed vector loads and stores.
    void Assembler::load_store(int prefix, int map, int opcode, Ymm ymm, GP64 ptr) {
        VEX v = vex(0, ymm>>3, 0, ptr>>3,
                    map, 0, /*ymm?*/1, prefix);
        this->bytes(v.bytes, v.len);
        this->byte(opcode);
        this->byte(mod_rm(Mod::Indirect, ymm&7, ptr&7));
    }

    // Loads: full 256-bit, and zero-extending 16->32 and 8->32 widening loads.
    void Assembler::vmovups  (Ymm dst, GP64 src) { this->load_store(0   ,  0x0f,0x10, dst,src); }
    void Assembler::vpmovzxwd(Ymm dst, GP64 src) { this->load_store(0x66,0x380f,0x33, dst,src); }
    void Assembler::vpmovzxbd(Ymm dst, GP64 src) { this->load_store(0x66,0x380f,0x31, dst,src); }

    // Stores: full 256-bit from ymm, and 128-bit from xmm.
    void Assembler::vmovups  (GP64 dst, Ymm src) { this->load_store(0   ,  0x0f,0x11, src,dst); }
    void Assembler::vmovups  (GP64 dst, Xmm src) {
        // Same as vmovups(GP64,YMM) and load_store() except ymm? is 0.
        int prefix = 0,
            map    = 0x0f,
            opcode = 0x11;
        VEX v = vex(0, src>>3, 0, dst>>3,
                    map, 0, /*ymm?*/0, prefix);
        this->bytes(v.bytes, v.len);
        this->byte(opcode);
        this->byte(mod_rm(Mod::Indirect, src&7, dst&7));
    }
1270 
    // Store the low 64 bits of src to [dst].
    void Assembler::vmovq(GP64 dst, Xmm src) {
        int prefix = 0x66,
            map    = 0x0f,
            opcode = 0xd6;
        VEX v = vex(0, src>>3, 0, dst>>3,
                    map, 0, /*ymm?*/0, prefix);
        this->bytes(v.bytes, v.len);
        this->byte(opcode);
        this->byte(mod_rm(Mod::Indirect, src&7, dst&7));
    }

    // Store the low 32 bits of src to [dst].
    void Assembler::vmovd(GP64 dst, Xmm src) {
        int prefix = 0x66,
            map    = 0x0f,
            opcode = 0x7e;
        VEX v = vex(0, src>>3, 0, dst>>3,
                    map, 0, /*ymm?*/0, prefix);
        this->bytes(v.bytes, v.len);
        this->byte(opcode);
        this->byte(mod_rm(Mod::Indirect, src&7, dst&7));
    }
1292 
    // Move the low 32 bits of src into general-purpose register dst
    // (same encoding as vmovd(GP64,Xmm) but register-direct, not memory).
    void Assembler::vmovd_direct(GP64 dst, Xmm src) {
        int prefix = 0x66,
            map    = 0x0f,
            opcode = 0x7e;
        VEX v = vex(0, src>>3, 0, dst>>3,
                    map, 0, /*ymm?*/0, prefix);
        this->bytes(v.bytes, v.len);
        this->byte(opcode);
        this->byte(mod_rm(Mod::Direct, src&7, dst&7));
    }
1303 
    // Load 32 bits from [src] into the low lane of dst (upper lanes zeroed).
    void Assembler::vmovd(Xmm dst, GP64 src) {
        int prefix = 0x66,
            map    = 0x0f,
            opcode = 0x6e;
        VEX v = vex(0, dst>>3, 0, src>>3,
                    map, 0, /*ymm?*/0, prefix);
        this->bytes(v.bytes, v.len);
        this->byte(opcode);
        this->byte(mod_rm(Mod::Indirect, dst&7, src&7));
    }

    // Load 32 bits from [base + index*scale] into the low lane of dst.
    void Assembler::vmovd(Xmm dst, Scale scale, GP64 index, GP64 base) {
        int prefix = 0x66,
            map    = 0x0f,
            opcode = 0x6e;
        VEX v = vex(0, dst>>3, index>>3, base>>3,
                    map, 0, /*ymm?*/0, prefix);
        this->bytes(v.bytes, v.len);
        this->byte(opcode);
        // rm = rsp (0b100) in ModRM signals that a SIB byte follows.
        this->byte(mod_rm(Mod::Indirect, dst&7, rsp));
        this->byte(sib(scale, index&7, base&7));
    }
1326 
vmovd_direct(Xmm dst,GP64 src)1327     void Assembler::vmovd_direct(Xmm dst, GP64 src) {
1328         int prefix = 0x66,
1329             map    = 0x0f,
1330             opcode = 0x6e;
1331         VEX v = vex(0, dst>>3, 0, src>>3,
1332                     map, 0, /*ymm?*/0, prefix);
1333         this->bytes(v.bytes, v.len);
1334         this->byte(opcode);
1335         this->byte(mod_rm(Mod::Direct, dst&7, src&7));
1336     }
1337 
    // Load one byte from [src+off] into dst, zero-extended to 32 bits.
    void Assembler::movzbl(GP64 dst, GP64 src, int off) {
        // A REX prefix is only needed when either register is r8-r15
        // (no REX.W bit: the destination is a 32-bit register).
        if ((dst>>3) || (src>>3)) {
            this->byte(rex(0,dst>>3,0,src>>3));
        }
        this->byte(0x0f);
        this->byte(0xb6);
        this->byte(mod_rm(mod(off), dst&7, src&7));
        this->bytes(&off, imm_bytes(mod(off)));
    }
1347 
1348 
movb(GP64 dst,GP64 src)1349     void Assembler::movb(GP64 dst, GP64 src) {
1350         if ((dst>>3) || (src>>3)) {
1351             this->byte(rex(0,src>>3,0,dst>>3));
1352         }
1353         this->byte(0x88);
1354         this->byte(mod_rm(Mod::Indirect, src&7, dst&7));
1355     }
1356 
    // dst = src with 16-bit lane `imm` replaced by the word loaded from [ptr].
    void Assembler::vpinsrw(Xmm dst, Xmm src, GP64 ptr, int imm) {
        int prefix = 0x66,
            map    = 0x0f,
            opcode = 0xc4;
        VEX v = vex(0, dst>>3, 0, ptr>>3,
                    map, src, /*ymm?*/0, prefix);  // src rides along in VEX.vvvv
        this->bytes(v.bytes, v.len);
        this->byte(opcode);
        this->byte(mod_rm(Mod::Indirect, dst&7, ptr&7));
        this->byte(imm);   // lane index, trailing immediate byte
    }
1368 
    // dst = src with 8-bit lane `imm` replaced by the byte loaded from [ptr].
    void Assembler::vpinsrb(Xmm dst, Xmm src, GP64 ptr, int imm) {
        int prefix = 0x66,
            map    = 0x3a0f,
            opcode = 0x20;
        VEX v = vex(0, dst>>3, 0, ptr>>3,
                    map, src, /*ymm?*/0, prefix);  // src rides along in VEX.vvvv
        this->bytes(v.bytes, v.len);
        this->byte(opcode);
        this->byte(mod_rm(Mod::Indirect, dst&7, ptr&7));
        this->byte(imm);   // lane index, trailing immediate byte
    }
1380 
    // Store 16-bit lane `imm` of src to [ptr] (memory form, 0F 3A map).
    void Assembler::vpextrw(GP64 ptr, Xmm src, int imm) {
        int prefix = 0x66,
            map    = 0x3a0f,
            opcode = 0x15;

        VEX v = vex(0, src>>3, 0, ptr>>3,
                    map, 0, /*ymm?*/0, prefix);
        this->bytes(v.bytes, v.len);
        this->byte(opcode);
        this->byte(mod_rm(Mod::Indirect, src&7, ptr&7));
        this->byte(imm);   // lane index
    }
    // Store 8-bit lane `imm` of src to [ptr] (memory form, 0F 3A map).
    void Assembler::vpextrb(GP64 ptr, Xmm src, int imm) {
        int prefix = 0x66,
            map    = 0x3a0f,
            opcode = 0x14;

        VEX v = vex(0, src>>3, 0, ptr>>3,
                    map, 0, /*ymm?*/0, prefix);
        this->bytes(v.bytes, v.len);
        this->byte(opcode);
        this->byte(mod_rm(Mod::Indirect, src&7, ptr&7));
        this->byte(imm);   // lane index
    }
1405 
    // Gather 8 floats: dst[lane] = *(base + ix[lane]*scale) for lanes enabled by mask.
    void Assembler::vgatherdps(Ymm dst, Scale scale, Ymm ix, GP64 base, Ymm mask) {
        // Unlike most instructions, no aliasing is permitted here.
        SkASSERT(dst != ix);
        SkASSERT(dst != mask);
        SkASSERT(mask != ix);

        int prefix = 0x66,
            map    = 0x380f,
            opcode = 0x92;
        VEX v = vex(0, dst>>3, ix>>3, base>>3,
                    map, mask, /*ymm?*/1, prefix);  // mask register goes in VEX.vvvv
        this->bytes(v.bytes, v.len);
        this->byte(opcode);
        // rm = rsp in ModRM signals the SIB byte carrying base + scaled vector index.
        this->byte(mod_rm(Mod::Indirect, dst&7, rsp));
        this->byte(sib(scale, ix&7, base&7));
    }
1422 
1423     // https://static.docs.arm.com/ddi0596/a/DDI_0596_ARM_a64_instruction_set_architecture.pdf
1424 
operator ""_mask(unsigned long long bits)1425     static int operator"" _mask(unsigned long long bits) { return (1<<(int)bits)-1; }
1426 
    // Assemble one 32-bit A64 instruction from its bitfields:
    //   [31:21] hi | [20:16] m | [15:10] lo | [9:5] n | [4:0] d
    void Assembler::op(uint32_t hi, V m, uint32_t lo, V n, V d) {
        this->word( (hi & 11_mask) << 21
                  | (m  &  5_mask) << 16
                  | (lo &  6_mask) << 10
                  | (n  &  5_mask) <<  5
                  | (d  &  5_mask) <<  0);
    }
1434 
    // Bitwise ops on all 16 bytes of the vector.
    void Assembler::and16b(V d, V n, V m) { this->op(0b0'1'0'01110'00'1, m, 0b00011'1, n, d); }
    void Assembler::orr16b(V d, V n, V m) { this->op(0b0'1'0'01110'10'1, m, 0b00011'1, n, d); }
    void Assembler::eor16b(V d, V n, V m) { this->op(0b0'1'1'01110'00'1, m, 0b00011'1, n, d); }
    void Assembler::bic16b(V d, V n, V m) { this->op(0b0'1'0'01110'01'1, m, 0b00011'1, n, d); }
    void Assembler::bsl16b(V d, V n, V m) { this->op(0b0'1'1'01110'01'1, m, 0b00011'1, n, d); }
    void Assembler::not16b(V d, V n)      { this->op(0b0'1'1'01110'00'10000'00101'10,  n, d); }

    // 32-bit integer lane arithmetic ("4s" = four 32-bit lanes).
    void Assembler::add4s(V d, V n, V m) { this->op(0b0'1'0'01110'10'1, m, 0b10000'1, n, d); }
    void Assembler::sub4s(V d, V n, V m) { this->op(0b0'1'1'01110'10'1, m, 0b10000'1, n, d); }
    void Assembler::mul4s(V d, V n, V m) { this->op(0b0'1'0'01110'10'1, m, 0b10011'1, n, d); }

    // 32-bit integer compares, producing all-1s/all-0s lanes.
    void Assembler::cmeq4s(V d, V n, V m) { this->op(0b0'1'1'01110'10'1, m, 0b10001'1, n, d); }
    void Assembler::cmgt4s(V d, V n, V m) { this->op(0b0'1'0'01110'10'1, m, 0b0011'0'1, n, d); }

    // 16-bit integer lane arithmetic ("8h" = eight 16-bit lanes).
    void Assembler::sub8h(V d, V n, V m) { this->op(0b0'1'1'01110'01'1, m, 0b10000'1, n, d); }
    void Assembler::mul8h(V d, V n, V m) { this->op(0b0'1'0'01110'01'1, m, 0b10011'1, n, d); }

    // Single-precision float lane arithmetic.
    void Assembler::fadd4s(V d, V n, V m) { this->op(0b0'1'0'01110'0'0'1, m, 0b11010'1, n, d); }
    void Assembler::fsub4s(V d, V n, V m) { this->op(0b0'1'0'01110'1'0'1, m, 0b11010'1, n, d); }
    void Assembler::fmul4s(V d, V n, V m) { this->op(0b0'1'1'01110'0'0'1, m, 0b11011'1, n, d); }
    void Assembler::fdiv4s(V d, V n, V m) { this->op(0b0'1'1'01110'0'0'1, m, 0b11111'1, n, d); }
    void Assembler::fmin4s(V d, V n, V m) { this->op(0b0'1'0'01110'1'0'1, m, 0b11110'1, n, d); }
    void Assembler::fmax4s(V d, V n, V m) { this->op(0b0'1'0'01110'0'0'1, m, 0b11110'1, n, d); }

    // Float compares, producing all-1s/all-0s lanes.
    void Assembler::fcmeq4s(V d, V n, V m) { this->op(0b0'1'0'01110'0'0'1, m, 0b1110'0'1, n, d); }
    void Assembler::fcmgt4s(V d, V n, V m) { this->op(0b0'1'1'01110'1'0'1, m, 0b1110'0'1, n, d); }
    void Assembler::fcmge4s(V d, V n, V m) { this->op(0b0'1'1'01110'0'0'1, m, 0b1110'0'1, n, d); }

    // Fused multiply-add/subtract, accumulating into d.
    void Assembler::fmla4s(V d, V n, V m) { this->op(0b0'1'0'01110'0'0'1, m, 0b11001'1, n, d); }
    void Assembler::fmls4s(V d, V n, V m) { this->op(0b0'1'0'01110'1'0'1, m, 0b11001'1, n, d); }

    // Table lookup: d = bytes of n indexed by the bytes of m.
    void Assembler::tbl(V d, V n, V m) { this->op(0b0'1'001110'00'0, m, 0b0'00'0'00, n, d); }
1467 
    // Assemble an A64 instruction whose immediate is folded into the opcode bits;
    // the immediate's exact position and width vary with the particular op22 pattern.
    void Assembler::op(uint32_t op22, int imm, V n, V d) {
        this->word( (op22 & 22_mask) << 10
                  | imm              << 16   // imm is embedded inside op, bit size depends on op
                  | (n    &  5_mask) <<  5
                  | (d    &  5_mask) <<  0);
    }
1474 
    // Vector shifts by immediate.  Right shifts encode their amount as the
    // negated value masked to the lane-width field (the A64 immh:immb scheme).
    void Assembler::sli4s(V d, V n, int imm) {     // shift left and insert
        this->op(0b0'1'1'011110'0100'000'01010'1,    ( imm&31), n, d);
    }
    void Assembler::shl4s(V d, V n, int imm) {     // shift left, zero fill
        this->op(0b0'1'0'011110'0100'000'01010'1,    ( imm&31), n, d);
    }
    void Assembler::sshr4s(V d, V n, int imm) {    // arithmetic shift right
        this->op(0b0'1'0'011110'0100'000'00'0'0'0'1, (-imm&31), n, d);
    }
    void Assembler::ushr4s(V d, V n, int imm) {    // logical shift right, 32-bit lanes
        this->op(0b0'1'1'011110'0100'000'00'0'0'0'1, (-imm&31), n, d);
    }
    void Assembler::ushr8h(V d, V n, int imm) {    // logical shift right, 16-bit lanes
        this->op(0b0'1'1'011110'0010'000'00'0'0'0'1, (-imm&15), n, d);
    }
1490 
    // Lane-wise conversions between signed 32-bit ints and floats.
    void Assembler::scvtf4s (V d, V n) { this->op(0b0'1'0'01110'0'0'10000'11101'10, n,d); }  // int -> float
    void Assembler::fcvtzs4s(V d, V n) { this->op(0b0'1'0'01110'1'0'10000'1101'1'10, n,d); } // float -> int, toward zero
    void Assembler::fcvtns4s(V d, V n) { this->op(0b0'1'0'01110'0'0'10000'1101'0'10, n,d); } // float -> int, to nearest

    // Narrow lanes: 32->16 bit and 16->8 bit.
    void Assembler::xtns2h(V d, V n) { this->op(0b0'0'0'01110'01'10000'10010'10, n,d); }
    void Assembler::xtnh2b(V d, V n) { this->op(0b0'0'0'01110'00'10000'10010'10, n,d); }

    // Widen lanes with zero extension: 8->16 bit and 16->32 bit.
    void Assembler::uxtlb2h(V d, V n) { this->op(0b0'0'1'011110'0001'000'10100'1, n,d); }
    void Assembler::uxtlh2s(V d, V n) { this->op(0b0'0'1'011110'0010'000'10100'1, n,d); }

    // Unsigned minimum across all four 32-bit lanes, reduced into d.
    void Assembler::uminv4s(V d, V n) { this->op(0b0'1'1'01110'10'11000'1'1010'10, n,d); }
1502 
    // Software breakpoint, carrying a 16-bit immediate for the debugger.
    void Assembler::brk(int imm16) {
        this->word(0b11010100'001'0000000000000000'000'00
                  | (imm16 & 16_mask) << 5);
    }

    // Return to the address held in register n (typically x30, the link register).
    void Assembler::ret(X n) {
        this->word(0b1101011'0'0'10'11111'0000'0'0 << 10
                  | (n & 5_mask) << 5);
    }
1512 
    // 64-bit add/subtract with a 12-bit unsigned immediate.
    void Assembler::add(X d, X n, int imm12) {
        this->word(0b1'0'0'10001'00   << 22
                  | (imm12 & 12_mask) << 10
                  | (n     &  5_mask) <<  5
                  | (d     &  5_mask) <<  0);
    }
    void Assembler::sub(X d, X n, int imm12) {
        this->word( 0b1'1'0'10001'00  << 22
                  | (imm12 & 12_mask) << 10
                  | (n     &  5_mask) <<  5
                  | (d     &  5_mask) <<  0);
    }
    // Like sub, but also sets the condition flags (the "S" suffix).
    void Assembler::subs(X d, X n, int imm12) {
        this->word( 0b1'1'1'10001'00  << 22
                  | (imm12 & 12_mask) << 10
                  | (n     &  5_mask) <<  5
                  | (d     &  5_mask) <<  0);
    }
1531 
    // PC-relative branches.  disp19() yields a signed displacement counted in
    // 4-byte instructions, stored in a 19-bit field.
    void Assembler::b(Condition cond, Label* l) {        // branch if cond holds
        const int imm19 = this->disp19(l);
        this->word( 0b0101010'0           << 24
                  | (imm19     & 19_mask) <<  5
                  | ((int)cond &  4_mask) <<  0);
    }
    void Assembler::cbz(X t, Label* l) {                 // branch if t == 0
        const int imm19 = this->disp19(l);
        this->word( 0b1'011010'0      << 24
                  | (imm19 & 19_mask) <<  5
                  | (t     &  5_mask) <<  0);
    }
    void Assembler::cbnz(X t, Label* l) {                // branch if t != 0
        const int imm19 = this->disp19(l);
        this->word( 0b1'011010'1      << 24
                  | (imm19 & 19_mask) <<  5
                  | (t     &  5_mask) <<  0);
    }
1550 
    // Vector loads from [src]: 128, 32, and 8 bits respectively.
    void Assembler::ldrq(V dst, X src) { this->op(0b00'111'1'01'11'000000000000, src, dst); }
    void Assembler::ldrs(V dst, X src) { this->op(0b10'111'1'01'01'000000000000, src, dst); }
    void Assembler::ldrb(V dst, X src) { this->op(0b00'111'1'01'01'000000000000, src, dst); }

    // Vector stores to [dst]: 128, 32, and 8 bits respectively.
    void Assembler::strq(V src, X dst) { this->op(0b00'111'1'01'10'000000000000, dst, src); }
    void Assembler::strs(V src, X dst) { this->op(0b10'111'1'01'00'000000000000, dst, src); }
    void Assembler::strb(V src, X dst) { this->op(0b00'111'1'01'00'000000000000, dst, src); }
1558 
    // Move the low 32 bits of vector src into general-purpose register dst.
    void Assembler::fmovs(X dst, V src) {
        this->word(0b0'0'0'11110'00'1'00'110'000000 << 10
                  | (src & 5_mask)                  << 5
                  | (dst & 5_mask)                  << 0);
    }
1564 
    // PC-relative (literal) load of a 16-byte constant at a label into dst.
    void Assembler::ldrq(V dst, Label* l) {
        const int imm19 = this->disp19(l);
        this->word( 0b10'011'1'00     << 24
                  | (imm19 & 19_mask) << 5
                  | (dst   &  5_mask) << 0);
    }
1571 
    // Bind label l to the current position, patching every instruction that
    // referenced it so its displacement now points here instead of l->offset.
    void Assembler::label(Label* l) {
        if (fCode) {
            // The instructions all currently point to l->offset.
            // We'll want to add a delta to point them to here().
            int delta = here().offset - l->offset;
            l->offset = here().offset;

            if (l->kind == Label::ARMDisp19) {
                for (int ref : l->references) {
                    // ref points to a 32-bit instruction with 19-bit displacement in instructions.
                    uint32_t inst;
                    memcpy(&inst, fCode + ref, 4);

                    // [ 8 bits to preserve] [ 19 bit signed displacement ] [ 5 bits to preserve ]
                    // Shift up 8 then arithmetic-shift down 13 to sign-extend the 19-bit field.
                    int disp = (int)(inst << 8) >> 13;

                    disp += delta/4;  // delta is in bytes, we want instructions.

                    // Put it all back together, preserving the high 8 bits and low 5.
                    inst = ((disp << 5) &  (19_mask << 5))
                         | ((inst     ) & ~(19_mask << 5));

                    memcpy(fCode + ref, &inst, 4);
                }
            }

            if (l->kind == Label::X86Disp32) {
                for (int ref : l->references) {
                    // ref points to a 32-bit displacement in bytes.
                    int disp;
                    memcpy(&disp, fCode + ref, 4);

                    disp += delta;

                    memcpy(fCode + ref, &disp, 4);
                }
            }
        }
    }
1611 
    // Run the program over n elements: through JIT code if we have it,
    // otherwise via the interpreter.
    void Program::eval(int n, void* args[]) const {
        if (const void* b = fJITEntry) {
            // The JIT entry point's signature depends on how many argument
            // pointers the program takes, so cast accordingly and call.
            void** a = args;
            switch (fStrides.size()) {
                case 0: return ((void(*)(int                        ))b)(n                    );
                case 1: return ((void(*)(int,void*                  ))b)(n,a[0]               );
                case 2: return ((void(*)(int,void*,void*            ))b)(n,a[0],a[1]          );
                case 3: return ((void(*)(int,void*,void*,void*      ))b)(n,a[0],a[1],a[2]     );
                case 4: return ((void(*)(int,void*,void*,void*,void*))b)(n,a[0],a[1],a[2],a[3]);
                case 5: return ((void(*)(int,void*,void*,void*,void*,void*))b)
                                (n,a[0],a[1],a[2],a[3],a[4]);
                default: SkUNREACHABLE;  // TODO
            }
        }

        this->interpret(n, args);
    }
1629 
    // Interpreter fallback for eval(): executes the program's instruction list
    // over n elements, vectorized K lanes at a time with a scalar (stride=1)
    // tail for the last n%K elements.
    void Program::interpret(int n, void* args[]) const {
        // We'll operate in SIMT style, knocking off K-size chunks from n while possible.
        constexpr int K = 16;
        using I32 = skvx::Vec<K, int>;
        using F32 = skvx::Vec<K, float>;
        using U32 = skvx::Vec<K, uint32_t>;
        using U16 = skvx::Vec<K, uint16_t>;
        using  U8 = skvx::Vec<K, uint8_t>;

        using I16x2 = skvx::Vec<2*K,  int16_t>;
        using U16x2 = skvx::Vec<2*K, uint16_t>;

        // Each virtual register is one K-wide vector, reinterpretable as any lane type.
        union Slot {
            F32   f32;
            I32   i32;
            U32   u32;
            I16x2 i16x2;
            U16x2 u16x2;
        };

        Slot                     few_regs[16];
        std::unique_ptr<char[]> many_regs;

        Slot* regs = few_regs;

        if (fRegs > (int)SK_ARRAY_COUNT(few_regs)) {
            // Annoyingly we can't trust that malloc() or new will work with Slot because
            // the skvx::Vec types may have alignment greater than what they provide.
            // We'll overallocate one extra register so we can align manually.
            many_regs.reset(new char[ sizeof(Slot) * (fRegs + 1) ]);

            uintptr_t addr = (uintptr_t)many_regs.get();
            addr += alignof(Slot) -
                     (addr & (alignof(Slot) - 1));
            SkASSERT((addr & (alignof(Slot) - 1)) == 0);
            regs = (Slot*)addr;
        }


        // Bounds-checked access to a virtual register.
        auto r = [&](Reg id) -> Slot& {
            SkASSERT(0 <= id && id < fRegs);
            return regs[id];
        };
        // Bounds-checked access to a caller-supplied argument pointer.
        auto arg = [&](int ix) {
            SkASSERT(0 <= ix && ix < (int)fStrides.size());
            return args[ix];
        };

        // Step each argument pointer ahead by its stride a number of times.
        auto step_args = [&](int times) {
            for (int i = 0; i < (int)fStrides.size(); i++) {
                args[i] = (void*)( (char*)args[i] + times * fStrides[i] );
            }
        };

        // After the first pass, subsequent passes resume at fLoop; instructions
        // before fLoop presumably only need to run once (loop-invariant setup).
        int start = 0,
            stride;
        for ( ; n > 0; start = fLoop, n -= stride, step_args(stride)) {
            stride = n >= K ? K : 1;

            for (int i = start; i < (int)fInstructions.size(); i++) {
                Instruction inst = fInstructions[i];

                // d = op(x,y/imm,z/imm)
                Reg   d = inst.d,
                      x = inst.x,
                      y = inst.y,
                      z = inst.z;
                int immy = inst.immy,
                    immz = inst.immz;

                // Ops that interact with memory need to know whether we're stride=1 or K,
                // but all non-memory ops can run the same code no matter the stride.
                switch (2*(int)inst.op + (stride == K ? 1 : 0)) {
                    default: SkUNREACHABLE;

                #define STRIDE_1(op) case 2*(int)op
                #define STRIDE_K(op) case 2*(int)op + 1
                    STRIDE_1(Op::store8 ): memcpy(arg(immy), &r(x).i32, 1); break;
                    STRIDE_1(Op::store16): memcpy(arg(immy), &r(x).i32, 2); break;
                    STRIDE_1(Op::store32): memcpy(arg(immy), &r(x).i32, 4); break;

                    STRIDE_K(Op::store8 ): skvx::cast<uint8_t> (r(x).i32).store(arg(immy)); break;
                    STRIDE_K(Op::store16): skvx::cast<uint16_t>(r(x).i32).store(arg(immy)); break;
                    STRIDE_K(Op::store32):                     (r(x).i32).store(arg(immy)); break;

                    STRIDE_1(Op::load8 ): r(d).i32 = 0; memcpy(&r(d).i32, arg(immy), 1); break;
                    STRIDE_1(Op::load16): r(d).i32 = 0; memcpy(&r(d).i32, arg(immy), 2); break;
                    STRIDE_1(Op::load32): r(d).i32 = 0; memcpy(&r(d).i32, arg(immy), 4); break;

                    STRIDE_K(Op::load8 ): r(d).i32= skvx::cast<int>(U8 ::Load(arg(immy))); break;
                    STRIDE_K(Op::load16): r(d).i32= skvx::cast<int>(U16::Load(arg(immy))); break;
                    STRIDE_K(Op::load32): r(d).i32=                 I32::Load(arg(immy)) ; break;

                    // The pointer we base our gather on is loaded indirectly from a uniform:
                    //     - arg(immy) is the uniform holding our gather base pointer somewhere;
                    //     - (const uint8_t*)arg(immy) + immz points to the gather base pointer;
                    //     - memcpy() loads the gather base and into a pointer of the right type.
                    // After all that we have an ordinary (uniform) pointer `ptr` to load from,
                    // and we then gather from it using the varying indices in r(x).
                    // In the stride=1 cases only lane 0 is valid, so the others are zeroed.
                    STRIDE_1(Op::gather8):
                        for (int i = 0; i < K; i++) {
                            const uint8_t* ptr;
                            memcpy(&ptr, (const uint8_t*)arg(immy) + immz, sizeof(ptr));
                            r(d).i32[i] = (i==0) ? ptr[ r(x).i32[i] ] : 0;
                        } break;
                    STRIDE_1(Op::gather16):
                        for (int i = 0; i < K; i++) {
                            const uint16_t* ptr;
                            memcpy(&ptr, (const uint8_t*)arg(immy) + immz, sizeof(ptr));
                            r(d).i32[i] = (i==0) ? ptr[ r(x).i32[i] ] : 0;
                        } break;
                    STRIDE_1(Op::gather32):
                        for (int i = 0; i < K; i++) {
                            const int* ptr;
                            memcpy(&ptr, (const uint8_t*)arg(immy) + immz, sizeof(ptr));
                            r(d).i32[i] = (i==0) ? ptr[ r(x).i32[i] ] : 0;
                        } break;

                    STRIDE_K(Op::gather8):
                        for (int i = 0; i < K; i++) {
                            const uint8_t* ptr;
                            memcpy(&ptr, (const uint8_t*)arg(immy) + immz, sizeof(ptr));
                            r(d).i32[i] = ptr[ r(x).i32[i] ];
                        } break;
                    STRIDE_K(Op::gather16):
                        for (int i = 0; i < K; i++) {
                            const uint16_t* ptr;
                            memcpy(&ptr, (const uint8_t*)arg(immy) + immz, sizeof(ptr));
                            r(d).i32[i] = ptr[ r(x).i32[i] ];
                        } break;
                    STRIDE_K(Op::gather32):
                        for (int i = 0; i < K; i++) {
                            const int* ptr;
                            memcpy(&ptr, (const uint8_t*)arg(immy) + immz, sizeof(ptr));
                            r(d).i32[i] = ptr[ r(x).i32[i] ];
                        } break;

                #undef STRIDE_1
                #undef STRIDE_K

                    // Ops that don't interact with memory should never care about the stride.
                #define CASE(op) case 2*(int)op: /*fallthrough*/ case 2*(int)op+1

                    CASE(Op::assert_true):
                    #ifdef SK_DEBUG
                        if (!all(r(x).i32)) {
                            this->dump();
                            SkDebugf("inst %d, register %d\n", i, y);
                            for (int i = 0; i < K; i++) {
                                SkDebugf("\t%2d: %08x (%g)\n", i, r(y).i32[i], r(y).f32[i]);
                            }
                        }
                        SkASSERT(all(r(x).i32));
                    #endif
                    break;

                    CASE(Op::index): static_assert(K == 16, "");
                                     r(d).i32 = n - I32{0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
                                     break;

                    // Uniforms: one scalar loaded from arg(immy)+immz, splatted to all lanes.
                    CASE(Op::uniform8):
                        r(d).i32 = *(const uint8_t* )( (const char*)arg(immy) + immz );
                        break;
                    CASE(Op::uniform16):
                        r(d).i32 = *(const uint16_t*)( (const char*)arg(immy) + immz );
                        break;
                    CASE(Op::uniform32):
                        r(d).i32 = *(const int*     )( (const char*)arg(immy) + immz );
                        break;

                    CASE(Op::splat): r(d).i32 = immy; break;

                    CASE(Op::add_f32): r(d).f32 = r(x).f32 + r(y).f32; break;
                    CASE(Op::sub_f32): r(d).f32 = r(x).f32 - r(y).f32; break;
                    CASE(Op::mul_f32): r(d).f32 = r(x).f32 * r(y).f32; break;
                    CASE(Op::div_f32): r(d).f32 = r(x).f32 / r(y).f32; break;
                    CASE(Op::min_f32): r(d).f32 = min(r(x).f32, r(y).f32); break;
                    CASE(Op::max_f32): r(d).f32 = max(r(x).f32, r(y).f32); break;

                    // These _imm instructions are all x86/JIT only.
                    CASE(Op::add_f32_imm):
                    CASE(Op::sub_f32_imm):
                    CASE(Op::mul_f32_imm):
                    CASE(Op::min_f32_imm):
                    CASE(Op::max_f32_imm):
                    CASE(Op::bit_and_imm):
                    CASE(Op::bit_or_imm ):
                    CASE(Op::bit_xor_imm): SkUNREACHABLE; break;

                    CASE(Op::mad_f32): r(d).f32 = r(x).f32 * r(y).f32 + r(z).f32; break;

                    CASE(Op::sqrt_f32): r(d).f32 = sqrt(r(x).f32); break;

                    CASE(Op::add_i32): r(d).i32 = r(x).i32 + r(y).i32; break;
                    CASE(Op::sub_i32): r(d).i32 = r(x).i32 - r(y).i32; break;
                    CASE(Op::mul_i32): r(d).i32 = r(x).i32 * r(y).i32; break;

                    CASE(Op::add_i16x2): r(d).i16x2 = r(x).i16x2 + r(y).i16x2; break;
                    CASE(Op::sub_i16x2): r(d).i16x2 = r(x).i16x2 - r(y).i16x2; break;
                    CASE(Op::mul_i16x2): r(d).i16x2 = r(x).i16x2 * r(y).i16x2; break;

                    CASE(Op::shl_i32): r(d).i32 = r(x).i32 << immy; break;
                    CASE(Op::sra_i32): r(d).i32 = r(x).i32 >> immy; break;
                    CASE(Op::shr_i32): r(d).u32 = r(x).u32 >> immy; break;

                    CASE(Op::shl_i16x2): r(d).i16x2 = r(x).i16x2 << immy; break;
                    CASE(Op::sra_i16x2): r(d).i16x2 = r(x).i16x2 >> immy; break;
                    CASE(Op::shr_i16x2): r(d).u16x2 = r(x).u16x2 >> immy; break;

                    CASE(Op:: eq_f32): r(d).i32 = r(x).f32 == r(y).f32; break;
                    CASE(Op::neq_f32): r(d).i32 = r(x).f32 != r(y).f32; break;
                    CASE(Op:: gt_f32): r(d).i32 = r(x).f32 >  r(y).f32; break;
                    CASE(Op::gte_f32): r(d).i32 = r(x).f32 >= r(y).f32; break;

                    CASE(Op:: eq_i32): r(d).i32 = r(x).i32 == r(y).i32; break;
                    CASE(Op::neq_i32): r(d).i32 = r(x).i32 != r(y).i32; break;
                    CASE(Op:: gt_i32): r(d).i32 = r(x).i32 >  r(y).i32; break;
                    CASE(Op::gte_i32): r(d).i32 = r(x).i32 >= r(y).i32; break;

                    CASE(Op:: eq_i16x2): r(d).i16x2 = r(x).i16x2 == r(y).i16x2; break;
                    CASE(Op::neq_i16x2): r(d).i16x2 = r(x).i16x2 != r(y).i16x2; break;
                    CASE(Op:: gt_i16x2): r(d).i16x2 = r(x).i16x2 >  r(y).i16x2; break;
                    CASE(Op::gte_i16x2): r(d).i16x2 = r(x).i16x2 >= r(y).i16x2; break;

                    CASE(Op::bit_and  ): r(d).i32 = r(x).i32 &  r(y).i32; break;
                    CASE(Op::bit_or   ): r(d).i32 = r(x).i32 |  r(y).i32; break;
                    CASE(Op::bit_xor  ): r(d).i32 = r(x).i32 ^  r(y).i32; break;
                    CASE(Op::bit_clear): r(d).i32 = r(x).i32 & ~r(y).i32; break;

                    CASE(Op::select): r(d).i32 = skvx::if_then_else(r(x).i32, r(y).i32, r(z).i32);
                                      break;

                    CASE(Op::pack):    r(d).u32 = r(x).u32 | (r(y).u32 << immz); break;

                    // Shuffle bytes of x into d as directed by four 4-bit selectors in immy;
                    // selector 0 produces a zero byte, selectors 1-4 pick bytes 0-3 of x.
                    CASE(Op::bytes): {
                        const U32 table[] = {
                            0,
                            (r(x).u32      ) & 0xff,
                            (r(x).u32 >>  8) & 0xff,
                            (r(x).u32 >> 16) & 0xff,
                            (r(x).u32 >> 24) & 0xff,
                        };
                        r(d).u32 = table[(immy >>  0) & 0xf] <<  0
                                 | table[(immy >>  4) & 0xf] <<  8
                                 | table[(immy >>  8) & 0xf] << 16
                                 | table[(immy >> 12) & 0xf] << 24;
                    } break;

                    CASE(Op::floor):  r(d).f32 = skvx::floor(r(x).f32); break;
                    CASE(Op::to_f32): r(d).f32 = skvx::cast<float>(r(x).i32); break;
                    CASE(Op::trunc):  r(d).i32 = skvx::cast<int>  (r(x).f32); break;
                    // NOTE(review): +0.5f-then-truncate rounds half-up, which differs from
                    // round-to-nearest-even and misbehaves for negative inputs — confirm
                    // this approximation is acceptable vs. the JIT's rounding.
                    CASE(Op::round):  r(d).i32 = skvx::cast<int>  (r(x).f32 + 0.5f); break;
                #undef CASE
                }
            }
        }
    }
1888 
hasJIT() const1889     bool Program::hasJIT() const {
1890         return fJITEntry != nullptr;
1891     }
1892 
    // Release any JIT artifacts, returning this Program to interpreter-only mode.
    // Safe to call when nothing was ever JITted.
    void Program::dropJIT() {
    #if defined(SKVM_JIT)
        // Code loaded via dylib owns a dlopen() handle; otherwise the code lives
        // in pages that were mmap()'d directly, sized fJITSize.
        if (fDylib) {
            dlclose(fDylib);
        } else if (fJITEntry) {
            munmap(fJITEntry, fJITSize);
        }
    #else
        // Without SKVM_JIT we can never have compiled anything to drop.
        SkASSERT(!this->hasJIT());
    #endif

        // Reset to the "no JIT" state so hasJIT() reports false.
        fJITEntry = nullptr;
        fJITSize  = 0;
        fDylib    = nullptr;
    }
1908 
~Program()1909     Program::~Program() { this->dropJIT(); }
1910 
Program(Program && other)1911     Program::Program(Program&& other) {
1912         fInstructions    = std::move(other.fInstructions);
1913         fRegs            = other.fRegs;
1914         fLoop            = other.fLoop;
1915         fStrides         = std::move(other.fStrides);
1916 
1917         std::swap(fJITEntry, other.fJITEntry);
1918         std::swap(fJITSize , other.fJITSize);
1919         std::swap(fDylib   , other.fDylib);
1920     }
1921 
operator =(Program && other)1922     Program& Program::operator=(Program&& other) {
1923         fInstructions    = std::move(other.fInstructions);
1924         fRegs            = other.fRegs;
1925         fLoop            = other.fLoop;
1926         fStrides         = std::move(other.fStrides);
1927 
1928         std::swap(fJITEntry, other.fJITEntry);
1929         std::swap(fJITSize , other.fJITSize);
1930         std::swap(fDylib   , other.fDylib);
1931         return *this;
1932     }
1933 
Program()1934     Program::Program() {}
1935 
Program(const std::vector<OptimizedInstruction> & interpreter,const std::vector<int> & strides)1936     Program::Program(const std::vector<OptimizedInstruction>& interpreter,
1937                      const std::vector<int>& strides) : fStrides(strides) {
1938         this->setupInterpreter(interpreter);
1939     }
1940 
Program(const std::vector<OptimizedInstruction> & interpreter,const std::vector<OptimizedInstruction> & jit,const std::vector<int> & strides,const char * debug_name)1941     Program::Program(const std::vector<OptimizedInstruction>& interpreter,
1942                      const std::vector<OptimizedInstruction>& jit,
1943                      const std::vector<int>& strides,
1944                      const char* debug_name) : Program(interpreter, strides) {
1945     #if 1 && defined(SKVM_JIT)
1946         this->setupJIT(jit, debug_name);
1947     #endif
1948     }
1949 
1950     // Translate OptimizedInstructions to Program::Instructions used by the interpreter.
    // Translate OptimizedInstructions into the Program::Instructions the
    // interpreter executes, assigning each value a virtual register along the way.
    void Program::setupInterpreter(const std::vector<OptimizedInstruction>& instructions) {
        // Register each instruction is assigned to.
        std::vector<Reg> reg(instructions.size());

        // This next bit is a bit more complicated than strictly necessary;
        // we could just assign every instruction to its own register.
        //
        // But recycling registers is fairly cheap, and good practice for the
        // JITs where minimizing register pressure really is important.
        //
        // Since we have effectively infinite registers, we hoist any value we can.
        // (The JIT may choose a more complex policy to reduce register pressure.)
        auto hoisted = [&](Val id) { return instructions[id].can_hoist; };

        fRegs = 0;
        std::vector<Reg> avail;   // Registers freed by dead values, ready to reuse.

        // Assign this value to a register, recycling them where we can.
        auto assign_register = [&](Val id) {
            const OptimizedInstruction& inst = instructions[id];

            // If this is a real input and its lifetime ends at this instruction,
            // we can recycle the register it's occupying.  (Hoisted values that
            // are still used inside the loop must keep their register alive.)
            auto maybe_recycle_register = [&](Val input) {
                if (input != NA
                        && instructions[input].death == id
                        && !(hoisted(input) && instructions[input].used_in_loop)) {
                    avail.push_back(reg[input]);
                }
            };

            // Take care to not recycle the same register twice.
            if (true                                ) { maybe_recycle_register(inst.x); }
            if (inst.y != inst.x                    ) { maybe_recycle_register(inst.y); }
            if (inst.z != inst.x && inst.z != inst.y) { maybe_recycle_register(inst.z); }

            // Instructions that die at themselves (stores) don't need a register.
            if (inst.death != id) {
                // Allocate a register if we have to, preferring to reuse anything available.
                if (avail.empty()) {
                    reg[id] = fRegs++;
                } else {
                    reg[id] = avail.back();
                    avail.pop_back();
                }
            }
        };

        // Assign a register to each hoisted instruction, then each non-hoisted loop instruction.
        for (Val id = 0; id < (Val)instructions.size(); id++) {
            if ( hoisted(id)) { assign_register(id); }
        }
        for (Val id = 0; id < (Val)instructions.size(); id++) {
            if (!hoisted(id)) { assign_register(id); }
        }

        // Translate OptimizedInstructions to Program::Instructions by mapping values to
        // registers.  This will be two passes, first hoisted instructions, then inside the loop.

        // The loop begins at the fLoop'th Instruction.
        fLoop = 0;
        fInstructions.reserve(instructions.size());

        // Add a dummy mapping for the N/A sentinel Val to any arbitrary register
        // so lookups don't have to know which arguments are used by which Ops.
        auto lookup_register = [&](Val id) {
            return id == NA ? (Reg)0
                            : reg[id];
        };

        // Emit one interpreter Instruction for value `id`.  When the y or z
        // argument slot is unused (NA), it instead carries the immediate.
        auto push_instruction = [&](Val id, const OptimizedInstruction& inst) {
            Program::Instruction pinst{
                inst.op,
                lookup_register(id),
                lookup_register(inst.x),
               {lookup_register(inst.y)},
               {lookup_register(inst.z)},
            };
            if (inst.y == NA) { pinst.immy = inst.immy; }
            if (inst.z == NA) { pinst.immz = inst.immz; }
            fInstructions.push_back(pinst);
        };

        // Hoisted instructions come first; fLoop ends up counting them, so the
        // interpreter loop restarts at fInstructions[fLoop] each iteration.
        for (Val id = 0; id < (Val)instructions.size(); id++) {
            const OptimizedInstruction& inst = instructions[id];
            if (hoisted(id)) {
                push_instruction(id, inst);
                fLoop++;
            }
        }
        for (Val id = 0; id < (Val)instructions.size(); id++) {
            const OptimizedInstruction& inst = instructions[id];
            if (!hoisted(id)) {
                push_instruction(id, inst);
            }
        }
    }
2048 
2049 #if defined(SKVM_JIT)
2050 
2051     // Just so happens that we can translate the immediate control for our bytes() op
2052     // to a single 128-bit mask that can be consumed by both AVX2 vpshufb and NEON tbl!
bytes_control(int imm,int mask[4])2053     static void bytes_control(int imm, int mask[4]) {
2054         auto nibble_to_vpshufb = [](uint8_t n) -> uint8_t {
2055             // 0 -> 0xff,    Fill with zero
2056             // 1 -> 0x00,    Select byte 0
2057             // 2 -> 0x01,         "      1
2058             // 3 -> 0x02,         "      2
2059             // 4 -> 0x03,         "      3
2060             return n - 1;
2061         };
2062         uint8_t control[] = {
2063             nibble_to_vpshufb( (imm >>  0) & 0xf ),
2064             nibble_to_vpshufb( (imm >>  4) & 0xf ),
2065             nibble_to_vpshufb( (imm >>  8) & 0xf ),
2066             nibble_to_vpshufb( (imm >> 12) & 0xf ),
2067         };
2068         for (int i = 0; i < 4; i++) {
2069             mask[i] = (int)control[0] <<  0
2070                     | (int)control[1] <<  8
2071                     | (int)control[2] << 16
2072                     | (int)control[3] << 24;
2073 
2074             // Update each byte that refers to a byte index by 4 to
2075             // point into the next 32-bit lane, but leave any 0xff
2076             // that fills with zero alone.
2077             control[0] += control[0] == 0xff ? 0 : 4;
2078             control[1] += control[1] == 0xff ? 0 : 4;
2079             control[2] += control[2] == 0xff ? 0 : 4;
2080             control[3] += control[3] == 0xff ? 0 : 4;
2081         }
2082     }
2083 
jit(const std::vector<OptimizedInstruction> & instructions,const bool try_hoisting,Assembler * a) const2084     bool Program::jit(const std::vector<OptimizedInstruction>& instructions,
2085                       const bool try_hoisting,
2086                       Assembler* a) const {
2087         using A = Assembler;
2088 
2089         auto debug_dump = [&] {
2090         #if 0
2091             SkDebugfStream stream;
2092             this->dump(&stream);
2093             return true;
2094         #else
2095             return false;
2096         #endif
2097         };
2098 
2099     #if defined(__x86_64__)
2100         if (!SkCpu::Supports(SkCpu::HSW)) {
2101             return false;
2102         }
2103         A::GP64 N        = A::rdi,
2104                 scratch  = A::rax,
2105                 scratch2 = A::r11,
2106                 arg[]    = { A::rsi, A::rdx, A::rcx, A::r8, A::r9 };
2107 
2108         // All 16 ymm registers are available to use.
2109         using Reg = A::Ymm;
2110         uint32_t avail = 0xffff;
2111 
2112     #elif defined(__aarch64__)
2113         A::X N       = A::x0,
2114              scratch = A::x8,
2115              arg[]   = { A::x1, A::x2, A::x3, A::x4, A::x5, A::x6, A::x7 };
2116 
2117         // We can use v0-v7 and v16-v31 freely; we'd need to preserve v8-v15.
2118         using Reg = A::V;
2119         uint32_t avail = 0xffff00ff;
2120     #endif
2121 
2122         if (SK_ARRAY_COUNT(arg) < fStrides.size()) {
2123             return false;
2124         }
2125 
2126         auto hoisted = [&](Val id) { return try_hoisting && instructions[id].can_hoist; };
2127 
2128         std::vector<Reg> r(instructions.size());
2129 
2130         struct LabelAndReg {
2131             A::Label label;
2132             Reg      reg;
2133         };
2134         SkTHashMap<int, LabelAndReg> constants,    // All constants share the same pool.
2135                                      bytes_masks;  // These vary per-lane.
2136         LabelAndReg                  iota;         // Exists _only_ to vary per-lane.
2137 
2138         auto warmup = [&](Val id) {
2139             const OptimizedInstruction& inst = instructions[id];
2140 
2141             switch (inst.op) {
2142                 default: break;
2143 
2144                 case Op::bytes: if (!bytes_masks.find(inst.immy)) {
2145                                     bytes_masks.set(inst.immy, {});
2146                                     if (try_hoisting) {
2147                                         // vpshufb can always work with the mask from memory,
2148                                         // but it helps to hoist the mask to a register for tbl.
2149                                     #if defined(__aarch64__)
2150                                         LabelAndReg* entry = bytes_masks.find(inst.immy);
2151                                         if (int found = __builtin_ffs(avail)) {
2152                                             entry->reg = (Reg)(found-1);
2153                                             avail ^= 1 << entry->reg;
2154                                             a->ldrq(entry->reg, &entry->label);
2155                                         } else {
2156                                             return false;
2157                                         }
2158                                     #endif
2159                                     }
2160                                 }
2161                                 break;
2162             }
2163             return true;
2164         };
2165 
2166         auto emit = [&](Val id, bool scalar) {
2167             const OptimizedInstruction& inst = instructions[id];
2168 
2169             Op op = inst.op;
2170             Val x = inst.x,
2171                 y = inst.y,
2172                 z = inst.z;
2173             int immy = inst.immy,
2174                 immz = inst.immz;
2175 
2176             // Most (but not all) ops create an output value and need a register to hold it, dst.
2177             // We track each instruction's dst in r[] so we can thread it through as an input
2178             // to any future instructions needing that value.
2179             //
2180             // And some ops may need a temporary register, tmp.  Some need both tmp and dst.
2181             //
2182             // tmp and dst are very similar and can and will often be assigned the same register,
2183             // but tmp may never alias any of the instructions's inputs, while dst may when this
2184             // instruction consumes that input, i.e. if the input reaches its end of life here.
2185             //
2186             // We'll assign both registers lazily to keep register pressure as low as possible.
2187             bool tmp_is_set = false,
2188                  dst_is_set = false;
2189             Reg tmp_reg = (Reg)0;  // This initial value won't matter... anything legal is fine.
2190 
2191             bool ok = true;   // Set to false if we need to assign a register and none's available.
2192 
2193             // First lock in how to choose tmp if we need to based on the registers
2194             // available before this instruction, not including any of its input registers.
2195             auto tmp = [&,avail/*important, closing over avail's current value*/]{
2196                 if (!tmp_is_set) {
2197                     tmp_is_set = true;
2198                     if (int found = __builtin_ffs(avail)) {
2199                         // This is a temporary register just for this op,
2200                         // so we leave it marked available for future ops.
2201                         tmp_reg = (Reg)(found - 1);
2202                     } else {
2203                         // We needed a tmp register but couldn't find one available. :'(
2204                         // This will cause emit() to return false, in turn causing jit() to fail.
2205                         if (debug_dump()) {
2206                             SkDebugf("\nCould not find a register to hold tmp\n");
2207                         }
2208                         ok = false;
2209                     }
2210                 }
2211                 return tmp_reg;
2212             };
2213 
2214             // Now make available any registers that are consumed by this instruction.
2215             // (The register pool we can pick dst from is >= the pool for tmp, adding any of these.)
2216             auto maybe_recycle_register = [&](Val input) {
2217                 if (input != NA
2218                         && instructions[input].death == id
2219                         && !(hoisted(input) && instructions[input].used_in_loop)) {
2220                     avail |= 1 << r[input];
2221                 }
2222             };
2223             maybe_recycle_register(x);
2224             maybe_recycle_register(y);
2225             maybe_recycle_register(z);
2226             // set_dst() and dst() will work read/write with this perhaps-just-updated avail.
2227 
2228             // Some ops may decide dst on their own to best fit the instruction (see Op::mad_f32).
2229             auto set_dst = [&](Reg reg){
2230                 SkASSERT(dst_is_set == false);
2231                 dst_is_set = true;
2232 
2233                 SkASSERT(avail & (1<<reg));
2234                 avail ^= 1<<reg;
2235 
2236                 r[id] = reg;
2237             };
2238 
2239             // Thanks to AVX and NEON's 3-argument instruction sets,
2240             // most ops can use any register as dst.
2241             auto dst = [&]{
2242                 if (!dst_is_set) {
2243                     if (int found = __builtin_ffs(avail)) {
2244                         set_dst((Reg)(found-1));
2245                     } else {
2246                         // Same deal as with tmp... all the registers are occupied.  Time to fail!
2247                         if (debug_dump()) {
2248                             SkDebugf("\nCould not find a register to hold value %d\n", id);
2249                         }
2250                         ok = false;
2251                     }
2252                 }
2253                 return r[id];
2254             };
2255 
2256             // Because we use the same logic to pick an arbitrary dst and to pick tmp,
2257             // and we know that tmp will never overlap any of the inputs, `dst() == tmp()`
2258             // is a simple idiom to check that the destination does not overlap any of the inputs.
2259             // Sometimes we can use this knowledge to do better instruction selection.
2260 
2261             // Ok!  Keep in mind that we haven't assigned tmp or dst yet,
2262             // just laid out hooks for how to do so if we need them, depending on the instruction.
2263             //
2264             // Now let's actually assemble the instruction!
2265             switch (op) {
2266                 default:
2267                     if (debug_dump()) {
2268                         SkDEBUGFAILF("\nOp::%s (%d) not yet implemented\n", name(op), op);
2269                     }
2270                     return false;  // TODO: many new ops
2271 
2272             #if defined(__x86_64__)
2273                 case Op::assert_true: {
2274                     a->vptest (r[x], &constants[0xffffffff].label);
2275                     A::Label all_true;
2276                     a->jc(&all_true);
2277                     a->int3();
2278                     a->label(&all_true);
2279                 } break;
2280 
2281                 case Op::store8: if (scalar) { a->vpextrb  (arg[immy], (A::Xmm)r[x], 0); }
2282                                  else        { a->vpackusdw(tmp(), r[x], r[x]);
2283                                                a->vpermq   (tmp(), tmp(), 0xd8);
2284                                                a->vpackuswb(tmp(), tmp(), tmp());
2285                                                a->vmovq    (arg[immy], (A::Xmm)tmp()); }
2286                                                break;
2287 
2288                 case Op::store16: if (scalar) { a->vpextrw  (arg[immy], (A::Xmm)r[x], 0); }
2289                                   else        { a->vpackusdw(tmp(), r[x], r[x]);
2290                                                 a->vpermq   (tmp(), tmp(), 0xd8);
2291                                                 a->vmovups  (arg[immy], (A::Xmm)tmp()); }
2292                                                 break;
2293 
2294                 case Op::store32: if (scalar) { a->vmovd  (arg[immy], (A::Xmm)r[x]); }
2295                                   else        { a->vmovups(arg[immy],         r[x]); }
2296                                                 break;
2297 
2298                 case Op::load8:  if (scalar) {
2299                                      a->vpxor  (dst(), dst(), dst());
2300                                      a->vpinsrb((A::Xmm)dst(), (A::Xmm)dst(), arg[immy], 0);
2301                                  } else {
2302                                      a->vpmovzxbd(dst(), arg[immy]);
2303                                  } break;
2304 
2305                 case Op::load16: if (scalar) {
2306                                      a->vpxor  (dst(), dst(), dst());
2307                                      a->vpinsrw((A::Xmm)dst(), (A::Xmm)dst(), arg[immy], 0);
2308                                  } else {
2309                                      a->vpmovzxwd(dst(), arg[immy]);
2310                                  } break;
2311 
2312                 case Op::load32: if (scalar) { a->vmovd  ((A::Xmm)dst(), arg[immy]); }
2313                                  else        { a->vmovups(        dst(), arg[immy]); }
2314                                  break;
2315 
2316                 case Op::gather32:
2317                 if (scalar) {
2318                     auto base  = scratch,
2319                          index = scratch2;
2320                     // Our gather base pointer is immz bytes off of uniform immy.
2321                     a->movq(base, arg[immy], immz);
2322 
2323                     // Grab our index from lane 0 of the index argument.
2324                     a->vmovd_direct(index, (A::Xmm)r[x]);
2325 
2326                     // dst = *(base + 4*index)
2327                     a->vmovd((A::Xmm)dst(), A::FOUR, index, base);
2328                 } else {
2329                     // We may not let any of dst(), index, or mask use the same register,
2330                     // so we must allocate registers manually and very carefully.
2331 
2332                     // index is argument x and has already been maybe_recycle_register()'d,
2333                     // so we explicitly ignore its availability during this op.
2334                     A::Ymm index = r[x];
2335                     uint32_t avail_during_gather = avail & ~(1<<index);
2336 
2337                     // Choose dst() to not overlap with index.
2338                     if (int found = __builtin_ffs(avail_during_gather)) {
2339                         set_dst((A::Ymm)(found-1));
2340                         avail_during_gather ^= (1<<dst());
2341                     } else {
2342                         ok = false;
2343                         break;
2344                     }
2345 
2346                     // Choose (temporary) mask to not overlap with dst() or index.
2347                     A::Ymm mask;
2348                     if (int found = __builtin_ffs(avail_during_gather)) {
2349                         mask = (A::Ymm)(found-1);
2350                     } else {
2351                         ok = false;
2352                         break;
2353                     }
2354 
2355                     // Our gather base pointer is immz bytes off of uniform immy.
2356                     auto base = scratch;
2357                     a->movq(base, arg[immy], immz);
2358                     a->vpcmpeqd(mask, mask, mask);   // (All lanes enabled.)
2359                     a->vgatherdps(dst(), A::FOUR, index, base, mask);
2360                 }
2361                 break;
2362 
2363                 case Op::uniform8: a->movzbl(scratch, arg[immy], immz);
2364                                    a->vmovd_direct((A::Xmm)dst(), scratch);
2365                                    a->vbroadcastss(dst(), (A::Xmm)dst());
2366                                    break;
2367 
2368                 case Op::uniform32: a->vbroadcastss(dst(), arg[immy], immz);
2369                                     break;
2370 
2371                 case Op::index: a->vmovd_direct((A::Xmm)tmp(), N);
2372                                 a->vbroadcastss(tmp(), (A::Xmm)tmp());
2373                                 a->vpsubd(dst(), tmp(), &iota.label);
2374                                 break;
2375 
2376                 case Op::splat: if (immy) { a->vbroadcastss(dst(), &constants[immy].label); }
2377                                 else      { a->vpxor(dst(), dst(), dst()); }
2378                                 break;
2379 
2380                 case Op::add_f32: a->vaddps(dst(), r[x], r[y]); break;
2381                 case Op::sub_f32: a->vsubps(dst(), r[x], r[y]); break;
2382                 case Op::mul_f32: a->vmulps(dst(), r[x], r[y]); break;
2383                 case Op::div_f32: a->vdivps(dst(), r[x], r[y]); break;
2384                 case Op::min_f32: a->vminps(dst(), r[x], r[y]); break;
2385                 case Op::max_f32: a->vmaxps(dst(), r[x], r[y]); break;
2386 
2387                 case Op::mad_f32:
2388                     if      (avail & (1<<r[x])) { set_dst(r[x]); a->vfmadd132ps(r[x], r[z], r[y]); }
2389                     else if (avail & (1<<r[y])) { set_dst(r[y]); a->vfmadd213ps(r[y], r[x], r[z]); }
2390                     else if (avail & (1<<r[z])) { set_dst(r[z]); a->vfmadd231ps(r[z], r[x], r[y]); }
2391                     else                        {                SkASSERT(dst() == tmp());
2392                                                                  a->vmovdqa    (dst(),r[x]);
2393                                                                  a->vfmadd132ps(dst(),r[z], r[y]); }
2394                                                                  break;
2395                 case Op::sqrt_f32: a->vsqrtps(dst(), r[x]); break;
2396 
2397                 case Op::add_f32_imm: a->vaddps(dst(), r[x], &constants[immy].label); break;
2398                 case Op::sub_f32_imm: a->vsubps(dst(), r[x], &constants[immy].label); break;
2399                 case Op::mul_f32_imm: a->vmulps(dst(), r[x], &constants[immy].label); break;
2400                 case Op::min_f32_imm: a->vminps(dst(), r[x], &constants[immy].label); break;
2401                 case Op::max_f32_imm: a->vmaxps(dst(), r[x], &constants[immy].label); break;
2402 
2403                 case Op::add_i32: a->vpaddd (dst(), r[x], r[y]); break;
2404                 case Op::sub_i32: a->vpsubd (dst(), r[x], r[y]); break;
2405                 case Op::mul_i32: a->vpmulld(dst(), r[x], r[y]); break;
2406 
2407                 case Op::sub_i16x2: a->vpsubw (dst(), r[x], r[y]); break;
2408                 case Op::mul_i16x2: a->vpmullw(dst(), r[x], r[y]); break;
2409                 case Op::shr_i16x2: a->vpsrlw (dst(), r[x], immy); break;
2410 
2411                 case Op::bit_and  : a->vpand (dst(), r[x], r[y]); break;
2412                 case Op::bit_or   : a->vpor  (dst(), r[x], r[y]); break;
2413                 case Op::bit_xor  : a->vpxor (dst(), r[x], r[y]); break;
2414                 case Op::bit_clear: a->vpandn(dst(), r[y], r[x]); break;  // N.B. Y then X.
2415                 case Op::select   : a->vpblendvb(dst(), r[z], r[y], r[x]); break;
2416 
2417                 case Op::bit_and_imm: a->vpand (dst(), r[x], &constants[immy].label); break;
2418                 case Op::bit_or_imm : a->vpor  (dst(), r[x], &constants[immy].label); break;
2419                 case Op::bit_xor_imm: a->vpxor (dst(), r[x], &constants[immy].label); break;
2420 
2421                 case Op::shl_i32: a->vpslld(dst(), r[x], immy); break;
2422                 case Op::shr_i32: a->vpsrld(dst(), r[x], immy); break;
2423                 case Op::sra_i32: a->vpsrad(dst(), r[x], immy); break;
2424 
2425                 case Op::eq_i32: a->vpcmpeqd(dst(), r[x], r[y]); break;
2426                 case Op::gt_i32: a->vpcmpgtd(dst(), r[x], r[y]); break;
2427 
2428                 case Op:: eq_f32: a->vcmpeqps (dst(), r[x], r[y]); break;
2429                 case Op::neq_f32: a->vcmpneqps(dst(), r[x], r[y]); break;
2430                 case Op:: gt_f32: a->vcmpltps (dst(), r[y], r[x]); break;
2431                 case Op::gte_f32: a->vcmpleps (dst(), r[y], r[x]); break;
2432 
2433                 case Op::pack: a->vpslld(tmp(),  r[y], immz);
2434                                a->vpor  (dst(), tmp(), r[x]);
2435                                break;
2436 
2437                 case Op::floor : a->vroundps  (dst(), r[x], Assembler::FLOOR); break;
2438                 case Op::to_f32: a->vcvtdq2ps (dst(), r[x]); break;
2439                 case Op::trunc : a->vcvttps2dq(dst(), r[x]); break;
2440                 case Op::round : a->vcvtps2dq (dst(), r[x]); break;
2441 
2442                 case Op::bytes: a->vpshufb(dst(), r[x], &bytes_masks.find(immy)->label);
2443                                 break;
2444 
2445             #elif defined(__aarch64__)
2446                 case Op::assert_true: {
2447                     a->uminv4s(tmp(), r[x]);   // uminv acts like an all() across the vector.
2448                     a->fmovs(scratch, tmp());
2449                     A::Label all_true;
2450                     a->cbnz(scratch, &all_true);
2451                     a->brk(0);
2452                     a->label(&all_true);
2453                 } break;
2454 
2455                 case Op::store8: a->xtns2h(tmp(), r[x]);
2456                                  a->xtnh2b(tmp(), tmp());
2457                    if (scalar) { a->strb  (tmp(), arg[immy]); }
2458                    else        { a->strs  (tmp(), arg[immy]); }
2459                                  break;
2460                 // TODO: another case where it'd be okay to alias r[x] and tmp if r[x] dies here.
2461 
2462                 case Op::store32: if (scalar) { a->strs(r[x], arg[immy]); }
2463                                   else        { a->strq(r[x], arg[immy]); }
2464                                                 break;
2465 
2466                 case Op::load8: if (scalar) { a->ldrb(tmp(), arg[immy]); }
2467                                 else        { a->ldrs(tmp(), arg[immy]); }
2468                                               a->uxtlb2h(tmp(), tmp());
2469                                               a->uxtlh2s(dst(), tmp());
2470                                               break;
2471 
2472                 case Op::load32: if (scalar) { a->ldrs(dst(), arg[immy]); }
2473                                  else        { a->ldrq(dst(), arg[immy]); }
2474                                                break;
2475 
2476                 case Op::splat: if (immy) { a->ldrq(dst(), &constants[immy].label); }
2477                                 else      { a->eor16b(dst(), dst(), dst()); }
2478                                 break;
2479                                 // TODO: If we hoist these, pack 4 values in each register
2480                                 // and use vector/lane operations, cutting the register
2481                                 // pressure cost of hoisting by 4?
2482 
2483                 case Op::add_f32: a->fadd4s(dst(), r[x], r[y]); break;
2484                 case Op::sub_f32: a->fsub4s(dst(), r[x], r[y]); break;
2485                 case Op::mul_f32: a->fmul4s(dst(), r[x], r[y]); break;
2486                 case Op::div_f32: a->fdiv4s(dst(), r[x], r[y]); break;
2487                 case Op::min_f32: a->fmin4s(dst(), r[x], r[y]); break;
2488                 case Op::max_f32: a->fmax4s(dst(), r[x], r[y]); break;
2489 
2490                 case Op::mad_f32: // fmla4s is z += x*y
2491                     if (avail & (1<<r[z])) { set_dst(r[z]); a->fmla4s( r[z],  r[x],  r[y]);   }
2492                     else {                                  a->orr16b(tmp(),  r[z],  r[z]);
2493                                                             a->fmla4s(tmp(),  r[x],  r[y]);
2494                                        if(dst() != tmp()) { a->orr16b(dst(), tmp(), tmp()); } }
2495                                                             break;
2496 
2497                 // These _imm instructions are all x86/JIT only.
2498                 case  Op::add_f32_imm :
2499                 case  Op::sub_f32_imm :
2500                 case  Op::mul_f32_imm :
2501                 case  Op::min_f32_imm :
2502                 case  Op::max_f32_imm :
2503                 case  Op::bit_and_imm :
2504                 case  Op::bit_or_imm  :
2505                 case  Op::bit_xor_imm : SkUNREACHABLE; break;
2506 
2507                 case Op:: gt_f32: a->fcmgt4s (dst(), r[x], r[y]); break;
2508                 case Op::gte_f32: a->fcmge4s (dst(), r[x], r[y]); break;
2509                 case Op:: eq_f32: a->fcmeq4s (dst(), r[x], r[y]); break;
2510                 case Op::neq_f32: a->fcmeq4s (tmp(), r[x], r[y]);
2511                                   a->not16b  (dst(), tmp());      break;
2512 
2513 
2514                 case Op::add_i32: a->add4s(dst(), r[x], r[y]); break;
2515                 case Op::sub_i32: a->sub4s(dst(), r[x], r[y]); break;
2516                 case Op::mul_i32: a->mul4s(dst(), r[x], r[y]); break;
2517 
2518                 case Op::sub_i16x2: a->sub8h (dst(), r[x], r[y]); break;
2519                 case Op::mul_i16x2: a->mul8h (dst(), r[x], r[y]); break;
2520                 case Op::shr_i16x2: a->ushr8h(dst(), r[x], immy); break;
2521 
2522                 case Op::bit_and  : a->and16b(dst(), r[x], r[y]); break;
2523                 case Op::bit_or   : a->orr16b(dst(), r[x], r[y]); break;
2524                 case Op::bit_xor  : a->eor16b(dst(), r[x], r[y]); break;
2525                 case Op::bit_clear: a->bic16b(dst(), r[x], r[y]); break;
2526 
2527                 case Op::select: // bsl16b is x = x ? y : z
2528                     if (avail & (1<<r[x])) { set_dst(r[x]); a->bsl16b( r[x],  r[y],  r[z]); }
2529                     else {                                  a->orr16b(tmp(),  r[x],  r[x]);
2530                                                             a->bsl16b(tmp(),  r[y],  r[z]);
2531                                        if(dst() != tmp()) { a->orr16b(dst(), tmp(), tmp()); } }
2532                                                             break;
2533 
2534                 case Op::shl_i32: a-> shl4s(dst(), r[x], immy); break;
2535                 case Op::shr_i32: a->ushr4s(dst(), r[x], immy); break;
2536                 case Op::sra_i32: a->sshr4s(dst(), r[x], immy); break;
2537 
2538                 case Op::eq_i32: a->cmeq4s(dst(), r[x], r[y]); break;
2539                 case Op::gt_i32: a->cmgt4s(dst(), r[x], r[y]); break;
2540 
2541                 case Op::pack:
2542                     if (avail & (1<<r[x])) { set_dst(r[x]); a->sli4s ( r[x],  r[y],  immz); }
2543                     else                   {                a->shl4s (tmp(),  r[y],  immz);
2544                                                             a->orr16b(dst(), tmp(),  r[x]); }
2545                                                             break;
2546 
2547                 case Op::to_f32: a->scvtf4s (dst(), r[x]); break;
2548                 case Op::trunc:  a->fcvtzs4s(dst(), r[x]); break;
2549                 case Op::round:  a->fcvtns4s(dst(), r[x]); break;
2550 
2551                 case Op::bytes:
2552                     if (try_hoisting) { a->tbl (dst(), r[x], bytes_masks.find(immy)->reg); }
2553                     else              { a->ldrq(tmp(), &bytes_masks.find(immy)->label);
2554                                         a->tbl (dst(), r[x], tmp()); }
2555                                         break;
2556             #endif
2557             }
2558 
2559             // Calls to tmp() or dst() might have flipped this false from its default true state.
2560             return ok;
2561         };
2562 
2563 
2564         #if defined(__x86_64__)
2565             const int K = 8;
2566             auto jump_if_less = [&](A::Label* l) { a->jl (l); };
2567             auto jump         = [&](A::Label* l) { a->jmp(l); };
2568 
2569             auto add = [&](A::GP64 gp, int imm) { a->add(gp, imm); };
2570             auto sub = [&](A::GP64 gp, int imm) { a->sub(gp, imm); };
2571 
2572             auto exit = [&]{ a->vzeroupper(); a->ret(); };
2573         #elif defined(__aarch64__)
2574             const int K = 4;
2575             auto jump_if_less = [&](A::Label* l) { a->blt(l); };
2576             auto jump         = [&](A::Label* l) { a->b  (l); };
2577 
2578             auto add = [&](A::X gp, int imm) { a->add(gp, gp, imm); };
2579             auto sub = [&](A::X gp, int imm) { a->sub(gp, gp, imm); };
2580 
2581             auto exit = [&]{ a->ret(A::x30); };
2582         #endif
2583 
2584         A::Label body,
2585                  tail,
2586                  done;
2587 
2588         for (Val id = 0; id < (Val)instructions.size(); id++) {
2589             if (!warmup(id)) {
2590                 return false;
2591             }
2592             if (hoisted(id) && !emit(id, /*scalar=*/false)) {
2593                 return false;
2594             }
2595         }
2596 
2597         a->label(&body);
2598         {
2599             a->cmp(N, K);
2600             jump_if_less(&tail);
2601             for (Val id = 0; id < (Val)instructions.size(); id++) {
2602                 if (!hoisted(id) && !emit(id, /*scalar=*/false)) {
2603                     return false;
2604                 }
2605             }
2606             for (int i = 0; i < (int)fStrides.size(); i++) {
2607                 if (fStrides[i]) {
2608                     add(arg[i], K*fStrides[i]);
2609                 }
2610             }
2611             sub(N, K);
2612             jump(&body);
2613         }
2614 
2615         a->label(&tail);
2616         {
2617             a->cmp(N, 1);
2618             jump_if_less(&done);
2619             for (Val id = 0; id < (Val)instructions.size(); id++) {
2620                 if (!hoisted(id) && !emit(id, /*scalar=*/true)) {
2621                     return false;
2622                 }
2623             }
2624             for (int i = 0; i < (int)fStrides.size(); i++) {
2625                 if (fStrides[i]) {
2626                     add(arg[i], 1*fStrides[i]);
2627                 }
2628             }
2629             sub(N, 1);
2630             jump(&tail);
2631         }
2632 
2633         a->label(&done);
2634         {
2635             exit();
2636         }
2637 
2638         // Except for explicit aligned load and store instructions, AVX allows
2639         // memory operands to be unaligned.  So even though we're creating 16
2640         // byte patterns on ARM or 32-byte patterns on x86, we only need to
2641         // align to 4 bytes, the element size and alignment requirement.
2642 
2643         constants.foreach([&](int imm, LabelAndReg* entry) {
2644             a->align(4);
2645             a->label(&entry->label);
2646             for (int i = 0; i < K; i++) {
2647                 a->word(imm);
2648             }
2649         });
2650 
2651         bytes_masks.foreach([&](int imm, LabelAndReg* entry) {
2652             // One 16-byte pattern for ARM tbl, that same pattern twice for x86-64 vpshufb.
2653             a->align(4);
2654             a->label(&entry->label);
2655             int mask[4];
2656             bytes_control(imm, mask);
2657             a->bytes(mask, sizeof(mask));
2658         #if defined(__x86_64__)
2659             a->bytes(mask, sizeof(mask));
2660         #endif
2661         });
2662 
2663         if (!iota.label.references.empty()) {
2664             a->align(4);
2665             a->label(&iota.label);
2666             for (int i = 0; i < K; i++) {
2667                 a->word(i);
2668             }
2669         }
2670 
2671         return true;
2672     }
2673 
setupJIT(const std::vector<OptimizedInstruction> & instructions,const char * debug_name)2674     void Program::setupJIT(const std::vector<OptimizedInstruction>& instructions,
2675                            const char* debug_name) {
2676         // Assemble with no buffer to determine a.size(), the number of bytes we'll assemble.
2677         Assembler a{nullptr};
2678 
2679         // First try allowing code hoisting (faster code)
2680         // then again without if that fails (lower register pressure).
2681         bool try_hoisting = true;
2682         if (!this->jit(instructions, try_hoisting, &a)) {
2683             try_hoisting = false;
2684             if (!this->jit(instructions, try_hoisting, &a)) {
2685                 return;
2686             }
2687         }
2688 
2689         // Allocate space that we can remap as executable.
2690         const size_t page = sysconf(_SC_PAGESIZE);
2691         fJITSize = ((a.size() + page - 1) / page) * page;  // mprotect works at page granularity.
2692         fJITEntry = mmap(nullptr,fJITSize, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, -1,0);
2693 
2694         // Assemble the program for real.
2695         a = Assembler{fJITEntry};
2696         SkAssertResult(this->jit(instructions, try_hoisting, &a));
2697         SkASSERT(a.size() <= fJITSize);
2698 
2699         // Remap as executable, and flush caches on platforms that need that.
2700         mprotect(fJITEntry, fJITSize, PROT_READ|PROT_EXEC);
2701         __builtin___clear_cache((char*)fJITEntry,
2702                                 (char*)fJITEntry + fJITSize);
2703 
2704         // For profiling and debugging, it's helpful to have this code loaded
2705         // dynamically rather than just jumping info fJITEntry.
2706         if (gSkVMJITViaDylib) {
2707             // Dump the raw program binary.
2708             SkString path = SkStringPrintf("/tmp/%s.XXXXXX", debug_name);
2709             int fd = mkstemp(path.writable_str());
2710             ::write(fd, fJITEntry, a.size());
2711             close(fd);
2712 
2713             this->dropJIT();  // (unmap and null out fJITEntry.)
2714 
2715             // Convert it in-place to a dynamic library with a single symbol "skvm_jit":
2716             SkString cmd = SkStringPrintf(
2717                     "echo '.global _skvm_jit\n_skvm_jit: .incbin \"%s\"'"
2718                     " | clang -x assembler -shared - -o %s",
2719                     path.c_str(), path.c_str());
2720             system(cmd.c_str());
2721 
2722             // Load that dynamic library and look up skvm_jit().
2723             fDylib = dlopen(path.c_str(), RTLD_NOW|RTLD_LOCAL);
2724             fJITEntry = dlsym(fDylib, "skvm_jit");
2725         }
2726     }
2727 #endif
2728 
2729 }  // namespace skvm
2730