• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright 2019 Google LLC
3  *
4  * Use of this source code is governed by a BSD-style license that can be
5  * found in the LICENSE file.
6  */
7 
8 #include "include/core/SkStream.h"
9 #include "include/core/SkString.h"
10 #include "include/private/SkHalf.h"
11 #include "include/private/SkTFitsIn.h"
12 #include "include/private/SkThreadID.h"
13 #include "src/core/SkColorSpacePriv.h"
14 #include "src/core/SkColorSpaceXformSteps.h"
15 #include "src/core/SkCpu.h"
16 #include "src/core/SkEnumerate.h"
17 #include "src/core/SkOpts.h"
18 #include "src/core/SkStreamPriv.h"
19 #include "src/core/SkVM.h"
20 #include "src/utils/SkVMVisualizer.h"
21 #include <algorithm>
22 #include <atomic>
23 #include <queue>
24 
25 #if defined(SKVM_LLVM)
26     #include <future>
27     #include <llvm/Bitcode/BitcodeWriter.h>
28     #include <llvm/ExecutionEngine/ExecutionEngine.h>
29     #include <llvm/IR/IRBuilder.h>
30     #include <llvm/IR/Verifier.h>
31     #include <llvm/Support/TargetSelect.h>
32     #include <llvm/Support/Host.h>
33 
34     // Platform-specific intrinsics got their own files in LLVM 10.
35     #if __has_include(<llvm/IR/IntrinsicsX86.h>)
36         #include <llvm/IR/IntrinsicsX86.h>
37     #endif
38 #endif
39 
40 #if !defined(SK_BUILD_FOR_WIN)
41 #include <unistd.h>
42 #endif
43 
44 // #define SKVM_LLVM_WAIT_FOR_COMPILATION
45 
// Global runtime switches for the JIT; both default to off.
bool gSkVMAllowJIT    = false;  // Master switch: may SkVM programs be JIT-compiled at all?
bool gSkVMJITViaDylib = false;  // NOTE(review): presumably routes JIT output through a dylib
                                // (see close_dylib() below) — confirm with callers.
48 
#if defined(SKVM_JIT)
    #if defined(SK_BUILD_FOR_WIN)
        #include "src/core/SkLeanWindows.h"
        #include <memoryapi.h>

        // Reserve and commit *len bytes of read/write memory to hold JIT output.
        // VirtualAlloc handles page-granularity rounding internally, so *len is
        // left untouched here.
        static void* alloc_jit_buffer(size_t* len) {
            return VirtualAlloc(NULL, *len, MEM_RESERVE|MEM_COMMIT, PAGE_READWRITE);
        }

        // Flip a filled JIT buffer from read/write to read/execute.
        static void remap_as_executable(void* ptr, size_t len) {
            DWORD oldProtection;
            VirtualProtect(ptr, len, PAGE_EXECUTE_READ, &oldProtection);
            SkASSERT(oldProtection == PAGE_READWRITE);
        }

        #if !defined(SKVM_LLVM)
        // Release a buffer produced by alloc_jit_buffer().
        static void unmap_jit_buffer(void* ptr, size_t len) {
            VirtualFree(ptr, 0, MEM_RELEASE);
        }
        static void close_dylib(void* dylib) {
            SkASSERT(false);  // TODO?  For now just assert we never make one.
        }
        #endif
    #else
        #include <dlfcn.h>
        #include <sys/mman.h>

        // Map *len bytes of read/write memory to hold JIT output, rounding
        // *len up to page granularity.
        static void* alloc_jit_buffer(size_t* len) {
            // While mprotect and VirtualAlloc both work at page granularity,
            // mprotect doesn't round up for you, and instead requires *len is at page granularity.
            const size_t page = sysconf(_SC_PAGESIZE);
            *len = ((*len + page - 1) / page) * page;
            // NOTE(review): mmap reports failure as MAP_FAILED, not nullptr —
            // confirm callers check the return value accordingly.
            return mmap(nullptr,*len, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, -1,0);
        }

        // Flip a filled JIT buffer from read/write to read/execute, flushing
        // the instruction cache over that range.
        static void remap_as_executable(void* ptr, size_t len) {
            mprotect(ptr, len, PROT_READ|PROT_EXEC);
            __builtin___clear_cache((char*)ptr,
                                    (char*)ptr + len);
        }

        #if !defined(SKVM_LLVM)
        // Release a buffer produced by alloc_jit_buffer().
        static void unmap_jit_buffer(void* ptr, size_t len) {
            munmap(ptr, len);
        }
        static void close_dylib(void* dylib) {
            dlclose(dylib);
        }
        #endif
    #endif

    #if defined(SKVM_JIT_VTUNE)
        #include <jitprofiling.h>
        // Tell VTune about freshly JITted code so profiles can symbolize it.
        static void notify_vtune(const char* name, void* addr, size_t len) {
            if (iJIT_IsProfilingActive() == iJIT_SAMPLING_ON) {
                iJIT_Method_Load event;
                memset(&event, 0, sizeof(event));
                event.method_id           = iJIT_GetNewMethodID();
                event.method_name         = const_cast<char*>(name);
                event.method_load_address = addr;
                event.method_size         = len;
                iJIT_NotifyEvent(iJVM_EVENT_TYPE_METHOD_LOAD_FINISHED, &event);
            }
        }
    #else
        static void notify_vtune(const char* name, void* addr, size_t len) {}
    #endif
#endif
113 
114 // JIT code isn't MSAN-instrumented, so we won't see when it uses
115 // uninitialized memory, and we'll not see the writes it makes as properly
116 // initializing memory.  Instead force the interpreter, which should let
117 // MSAN see everything our programs do properly.
118 //
119 // Similarly, we can't get ASAN's checks unless we let it instrument our interpreter.
#if defined(__has_feature)
    #if __has_feature(memory_sanitizer) || __has_feature(address_sanitizer)
        #define SKVM_JIT_BUT_IGNORE_IT
    #endif
#endif

#if defined(SKSL_STANDALONE)
    // skslc needs to link against this module (for the VM code generator). This module pulls in
    // color-space code, but attempting to add those transitive dependencies to skslc gets out of
    // hand. So we terminate the chain here with stub functions. Note that skslc's usage of SkVM
    // never cares about color management.
    skvm::F32 sk_program_transfer_fn(
        skvm::F32 v, TFKind tf_kind,
        skvm::F32 G, skvm::F32 A, skvm::F32 B, skvm::F32 C, skvm::F32 D, skvm::F32 E, skvm::F32 F) {
            return v;  // Identity: no color transform in standalone builds.
    }

    const skcms_TransferFunction* skcms_sRGB_TransferFunction() { return nullptr; }
    const skcms_TransferFunction* skcms_sRGB_Inverse_TransferFunction() { return nullptr; }
#endif
140 
141 namespace skvm {
142 
detect_features()143     static Features detect_features() {
144         static const bool fma =
145         #if defined(SK_CPU_X86)
146             SkCpu::Supports(SkCpu::HSW);
147         #elif defined(SK_CPU_ARM64)
148             true;
149         #else
150             false;
151         #endif
152 
153         static const bool fp16 = false;  // TODO
154 
155         return { fma, fp16 };
156     }
157 
Builder(bool createDuplicates)158     Builder::Builder(bool createDuplicates)
159         : fFeatures(detect_features()), fCreateDuplicates(createDuplicates) {}
Builder(Features features,bool createDuplicates)160     Builder::Builder(Features features, bool createDuplicates)
161         : fFeatures(features         ), fCreateDuplicates(createDuplicates) {}
162 
163     struct Program::Impl {
164         std::vector<InterpreterInstruction> instructions;
165         int regs = 0;
166         int loop = 0;
167         std::vector<int> strides;
168         std::vector<TraceHook*> traceHooks;
169         std::unique_ptr<viz::Visualizer> visualizer;
170 
171         std::atomic<void*> jit_entry{nullptr};   // TODO: minimal std::memory_orders
172         size_t jit_size = 0;
173         void*  dylib    = nullptr;
174 
175     #if defined(SKVM_LLVM)
176         std::unique_ptr<llvm::LLVMContext>     llvm_ctx;
177         std::unique_ptr<llvm::ExecutionEngine> llvm_ee;
178         std::future<void>                      llvm_compiling;
179     #endif
180     };
181 
182     // Debugging tools, mostly for printing various data structures out to a stream.
183 
184     namespace {
185         struct V { Val id; };
186         struct R { Reg id; };
187         struct Shift       { int bits; };
188         struct Splat       { int bits; };
189         struct Hex         { int bits; };
190         struct TraceHookID { int bits; };
191         // For op `trace_line`
192         struct Line  { int bits; };
193         // For op `trace_var`
194         struct VarSlot { int bits; };
195         // For op `trace_enter`/`trace_exit`
196         struct FnIdx { int bits; };
197 
write(SkWStream * o,const char * s)198         static void write(SkWStream* o, const char* s) {
199             o->writeText(s);
200         }
201 
name(Op op)202         static const char* name(Op op) {
203             switch (op) {
204             #define M(x) case Op::x: return #x;
205                 SKVM_OPS(M)
206             #undef M
207             }
208             return "unknown op";
209         }
210 
write(SkWStream * o,Op op)211         static void write(SkWStream* o, Op op) {
212             o->writeText(name(op));
213         }
write(SkWStream * o,Ptr p)214         static void write(SkWStream* o, Ptr p) {
215             write(o, "ptr");
216             o->writeDecAsText(p.ix);
217         }
write(SkWStream * o,V v)218         static void write(SkWStream* o, V v) {
219             write(o, "v");
220             o->writeDecAsText(v.id);
221         }
write(SkWStream * o,R r)222         static void write(SkWStream* o, R r) {
223             write(o, "r");
224             o->writeDecAsText(r.id);
225         }
write(SkWStream * o,Shift s)226         static void write(SkWStream* o, Shift s) {
227             o->writeDecAsText(s.bits);
228         }
write(SkWStream * o,Splat s)229         static void write(SkWStream* o, Splat s) {
230             float f;
231             memcpy(&f, &s.bits, 4);
232             o->writeHexAsText(s.bits);
233             write(o, " (");
234             o->writeScalarAsText(f);
235             write(o, ")");
236         }
write(SkWStream * o,Hex h)237         static void write(SkWStream* o, Hex h) {
238             o->writeHexAsText(h.bits);
239         }
write(SkWStream * o,TraceHookID h)240         static void write(SkWStream* o, TraceHookID h) {
241             o->writeDecAsText(h.bits);
242         }
write(SkWStream * o,Line d)243         static void write(SkWStream* o, Line d) {
244             write(o, "L");
245             o->writeDecAsText(d.bits);
246         }
write(SkWStream * o,VarSlot s)247         static void write(SkWStream* o, VarSlot s) {
248             write(o, "$");
249             o->writeDecAsText(s.bits);
250         }
write(SkWStream * o,FnIdx s)251         static void write(SkWStream* o, FnIdx s) {
252             write(o, "F");
253             o->writeDecAsText(s.bits);
254         }
255         template <typename T, typename... Ts>
write(SkWStream * o,T first,Ts...rest)256         static void write(SkWStream* o, T first, Ts... rest) {
257             write(o, first);
258             write(o, " ");
259             write(o, rest...);
260         }
261     }  // namespace
262 
write_one_instruction(Val id,const OptimizedInstruction & inst,SkWStream * o)263     static void write_one_instruction(Val id, const OptimizedInstruction& inst, SkWStream* o) {
264         Op  op = inst.op;
265         Val  x = inst.x,
266              y = inst.y,
267              z = inst.z,
268              w = inst.w;
269         int immA = inst.immA,
270             immB = inst.immB,
271             immC = inst.immC;
272         switch (op) {
273             case Op::assert_true: write(o, op, V{x}, V{y}); break;
274 
275             case Op::trace_line:  write(o, op, TraceHookID{immA}, V{x}, V{y}, Line{immB}); break;
276             case Op::trace_var:   write(o, op, TraceHookID{immA}, V{x}, V{y},
277                                                                   VarSlot{immB}, "=", V{z}); break;
278             case Op::trace_enter: write(o, op, TraceHookID{immA}, V{x}, V{y}, FnIdx{immB}); break;
279             case Op::trace_exit:  write(o, op, TraceHookID{immA}, V{x}, V{y}, FnIdx{immB}); break;
280             case Op::trace_scope: write(o, op, TraceHookID{immA}, V{x}, V{y}, Shift{immB}); break;
281 
282             case Op::store8:   write(o, op, Ptr{immA}, V{x}               ); break;
283             case Op::store16:  write(o, op, Ptr{immA}, V{x}               ); break;
284             case Op::store32:  write(o, op, Ptr{immA}, V{x}               ); break;
285             case Op::store64:  write(o, op, Ptr{immA}, V{x},V{y}          ); break;
286             case Op::store128: write(o, op, Ptr{immA}, V{x},V{y},V{z},V{w}); break;
287 
288             case Op::index: write(o, V{id}, "=", op); break;
289 
290             case Op::load8:   write(o, V{id}, "=", op, Ptr{immA}); break;
291             case Op::load16:  write(o, V{id}, "=", op, Ptr{immA}); break;
292             case Op::load32:  write(o, V{id}, "=", op, Ptr{immA}); break;
293             case Op::load64:  write(o, V{id}, "=", op, Ptr{immA}, Hex{immB}); break;
294             case Op::load128: write(o, V{id}, "=", op, Ptr{immA}, Hex{immB}); break;
295 
296             case Op::gather8:  write(o, V{id}, "=", op, Ptr{immA}, Hex{immB}, V{x}); break;
297             case Op::gather16: write(o, V{id}, "=", op, Ptr{immA}, Hex{immB}, V{x}); break;
298             case Op::gather32: write(o, V{id}, "=", op, Ptr{immA}, Hex{immB}, V{x}); break;
299 
300             case Op::uniform32: write(o, V{id}, "=", op, Ptr{immA}, Hex{immB}); break;
301             case Op::array32:   write(o, V{id}, "=", op, Ptr{immA}, Hex{immB}, Hex{immC}); break;
302 
303             case Op::splat: write(o, V{id}, "=", op, Splat{immA}); break;
304 
305             case Op:: add_f32: write(o, V{id}, "=", op, V{x}, V{y}      ); break;
306             case Op:: sub_f32: write(o, V{id}, "=", op, V{x}, V{y}      ); break;
307             case Op:: mul_f32: write(o, V{id}, "=", op, V{x}, V{y}      ); break;
308             case Op:: div_f32: write(o, V{id}, "=", op, V{x}, V{y}      ); break;
309             case Op:: min_f32: write(o, V{id}, "=", op, V{x}, V{y}      ); break;
310             case Op:: max_f32: write(o, V{id}, "=", op, V{x}, V{y}      ); break;
311             case Op:: fma_f32: write(o, V{id}, "=", op, V{x}, V{y}, V{z}); break;
312             case Op:: fms_f32: write(o, V{id}, "=", op, V{x}, V{y}, V{z}); break;
313             case Op::fnma_f32: write(o, V{id}, "=", op, V{x}, V{y}, V{z}); break;
314 
315 
316             case Op::sqrt_f32: write(o, V{id}, "=", op, V{x}); break;
317 
318             case Op:: eq_f32: write(o, V{id}, "=", op, V{x}, V{y}); break;
319             case Op::neq_f32: write(o, V{id}, "=", op, V{x}, V{y}); break;
320             case Op:: gt_f32: write(o, V{id}, "=", op, V{x}, V{y}); break;
321             case Op::gte_f32: write(o, V{id}, "=", op, V{x}, V{y}); break;
322 
323 
324             case Op::add_i32: write(o, V{id}, "=", op, V{x}, V{y}); break;
325             case Op::sub_i32: write(o, V{id}, "=", op, V{x}, V{y}); break;
326             case Op::mul_i32: write(o, V{id}, "=", op, V{x}, V{y}); break;
327 
328             case Op::shl_i32: write(o, V{id}, "=", op, V{x}, Shift{immA}); break;
329             case Op::shr_i32: write(o, V{id}, "=", op, V{x}, Shift{immA}); break;
330             case Op::sra_i32: write(o, V{id}, "=", op, V{x}, Shift{immA}); break;
331 
332             case Op::eq_i32: write(o, V{id}, "=", op, V{x}, V{y}); break;
333             case Op::gt_i32: write(o, V{id}, "=", op, V{x}, V{y}); break;
334 
335 
336             case Op::bit_and  : write(o, V{id}, "=", op, V{x}, V{y}); break;
337             case Op::bit_or   : write(o, V{id}, "=", op, V{x}, V{y}); break;
338             case Op::bit_xor  : write(o, V{id}, "=", op, V{x}, V{y}); break;
339             case Op::bit_clear: write(o, V{id}, "=", op, V{x}, V{y}); break;
340 
341             case Op::select: write(o, V{id}, "=", op, V{x}, V{y}, V{z}); break;
342 
343             case Op::ceil:      write(o, V{id}, "=", op, V{x}); break;
344             case Op::floor:     write(o, V{id}, "=", op, V{x}); break;
345             case Op::to_f32:    write(o, V{id}, "=", op, V{x}); break;
346             case Op::to_fp16:   write(o, V{id}, "=", op, V{x}); break;
347             case Op::from_fp16: write(o, V{id}, "=", op, V{x}); break;
348             case Op::trunc:     write(o, V{id}, "=", op, V{x}); break;
349             case Op::round:     write(o, V{id}, "=", op, V{x}); break;
350 
351             case Op::duplicate: write(o, V{id}, "=", op, Hex{immA}); break;
352         }
353 
354         write(o, "\n");
355     }
356 
dump(SkWStream * o) const357     void Builder::dump(SkWStream* o) const {
358         SkDebugfStream debug;
359         if (!o) { o = &debug; }
360 
361         std::vector<OptimizedInstruction> optimized = this->optimize();
362         o->writeDecAsText(optimized.size());
363         o->writeText(" values (originally ");
364         o->writeDecAsText(fProgram.size());
365         o->writeText("):\n");
366         for (Val id = 0; id < (Val)optimized.size(); id++) {
367             const OptimizedInstruction& inst = optimized[id];
368             write(o, inst.can_hoist ? "↑ " : "  ");
369             write_one_instruction(id, inst, o);
370         }
371     }
372 
visualize(SkWStream * output,const char * code) const373     void Program::visualize(SkWStream* output, const char* code) const {
374         if (fImpl->visualizer) {
375             fImpl->visualizer->dump(output, code);
376         }
377     }
378 
visualizer()379     viz::Visualizer* Program::visualizer() { return fImpl->visualizer.get(); }
dump(SkWStream * o) const380     void Program::dump(SkWStream* o) const {
381         SkDebugfStream debug;
382         if (!o) { o = &debug; }
383 
384         o->writeDecAsText(fImpl->regs);
385         o->writeText(" registers, ");
386         o->writeDecAsText(fImpl->instructions.size());
387         o->writeText(" instructions:\n");
388         for (Val i = 0; i < (Val)fImpl->instructions.size(); i++) {
389             if (i == fImpl->loop) { write(o, "loop:\n"); }
390             o->writeDecAsText(i);
391             o->writeText("\t");
392             if (i >= fImpl->loop) { write(o, "    "); }
393             const InterpreterInstruction& inst = fImpl->instructions[i];
394             Op   op = inst.op;
395             Reg   d = inst.d,
396                   x = inst.x,
397                   y = inst.y,
398                   z = inst.z,
399                   w = inst.w;
400             int immA = inst.immA,
401                 immB = inst.immB,
402                 immC = inst.immC;
403             switch (op) {
404                 case Op::assert_true: write(o, op, R{x}, R{y}); break;
405 
406                 case Op::trace_line:  write(o, op, TraceHookID{immA},
407                                                    R{x}, R{y}, Line{immB}); break;
408                 case Op::trace_var:   write(o, op, TraceHookID{immA}, R{x}, R{y},
409                                                    VarSlot{immB}, "=", R{z}); break;
410                 case Op::trace_enter: write(o, op, TraceHookID{immA},
411                                                    R{x}, R{y}, FnIdx{immB}); break;
412                 case Op::trace_exit:  write(o, op, TraceHookID{immA},
413                                                    R{x}, R{y}, FnIdx{immB}); break;
414                 case Op::trace_scope: write(o, op, TraceHookID{immA},
415                                                    R{x}, R{y}, Shift{immB}); break;
416 
417                 case Op::store8:   write(o, op, Ptr{immA}, R{x}                  ); break;
418                 case Op::store16:  write(o, op, Ptr{immA}, R{x}                  ); break;
419                 case Op::store32:  write(o, op, Ptr{immA}, R{x}                  ); break;
420                 case Op::store64:  write(o, op, Ptr{immA}, R{x}, R{y}            ); break;
421                 case Op::store128: write(o, op, Ptr{immA}, R{x}, R{y}, R{z}, R{w}); break;
422 
423                 case Op::index: write(o, R{d}, "=", op); break;
424 
425                 case Op::load8:   write(o, R{d}, "=", op, Ptr{immA}); break;
426                 case Op::load16:  write(o, R{d}, "=", op, Ptr{immA}); break;
427                 case Op::load32:  write(o, R{d}, "=", op, Ptr{immA}); break;
428                 case Op::load64:  write(o, R{d}, "=", op, Ptr{immA}, Hex{immB}); break;
429                 case Op::load128: write(o, R{d}, "=", op, Ptr{immA}, Hex{immB}); break;
430 
431                 case Op::gather8:  write(o, R{d}, "=", op, Ptr{immA}, Hex{immB}, R{x}); break;
432                 case Op::gather16: write(o, R{d}, "=", op, Ptr{immA}, Hex{immB}, R{x}); break;
433                 case Op::gather32: write(o, R{d}, "=", op, Ptr{immA}, Hex{immB}, R{x}); break;
434 
435                 case Op::uniform32: write(o, R{d}, "=", op, Ptr{immA}, Hex{immB}); break;
436                 case Op::array32:   write(o, R{d}, "=", op, Ptr{immA}, Hex{immB}, Hex{immC}); break;
437 
438                 case Op::splat:     write(o, R{d}, "=", op, Splat{immA}); break;
439 
440                 case Op::add_f32: write(o, R{d}, "=", op, R{x}, R{y}      ); break;
441                 case Op::sub_f32: write(o, R{d}, "=", op, R{x}, R{y}      ); break;
442                 case Op::mul_f32: write(o, R{d}, "=", op, R{x}, R{y}      ); break;
443                 case Op::div_f32: write(o, R{d}, "=", op, R{x}, R{y}      ); break;
444                 case Op::min_f32: write(o, R{d}, "=", op, R{x}, R{y}      ); break;
445                 case Op::max_f32: write(o, R{d}, "=", op, R{x}, R{y}      ); break;
446                 case Op::fma_f32: write(o, R{d}, "=", op, R{x}, R{y}, R{z}); break;
447                 case Op::fms_f32: write(o, R{d}, "=", op, R{x}, R{y}, R{z}); break;
448                 case Op::fnma_f32: write(o, R{d}, "=", op, R{x}, R{y}, R{z}); break;
449 
450                 case Op::sqrt_f32: write(o, R{d}, "=", op, R{x}); break;
451 
452                 case Op:: eq_f32: write(o, R{d}, "=", op, R{x}, R{y}); break;
453                 case Op::neq_f32: write(o, R{d}, "=", op, R{x}, R{y}); break;
454                 case Op:: gt_f32: write(o, R{d}, "=", op, R{x}, R{y}); break;
455                 case Op::gte_f32: write(o, R{d}, "=", op, R{x}, R{y}); break;
456 
457 
458                 case Op::add_i32: write(o, R{d}, "=", op, R{x}, R{y}); break;
459                 case Op::sub_i32: write(o, R{d}, "=", op, R{x}, R{y}); break;
460                 case Op::mul_i32: write(o, R{d}, "=", op, R{x}, R{y}); break;
461 
462                 case Op::shl_i32: write(o, R{d}, "=", op, R{x}, Shift{immA}); break;
463                 case Op::shr_i32: write(o, R{d}, "=", op, R{x}, Shift{immA}); break;
464                 case Op::sra_i32: write(o, R{d}, "=", op, R{x}, Shift{immA}); break;
465 
466                 case Op::eq_i32: write(o, R{d}, "=", op, R{x}, R{y}); break;
467                 case Op::gt_i32: write(o, R{d}, "=", op, R{x}, R{y}); break;
468 
469                 case Op::bit_and  : write(o, R{d}, "=", op, R{x}, R{y}); break;
470                 case Op::bit_or   : write(o, R{d}, "=", op, R{x}, R{y}); break;
471                 case Op::bit_xor  : write(o, R{d}, "=", op, R{x}, R{y}); break;
472                 case Op::bit_clear: write(o, R{d}, "=", op, R{x}, R{y}); break;
473 
474                 case Op::select: write(o, R{d}, "=", op, R{x}, R{y}, R{z}); break;
475 
476                 case Op::ceil:      write(o, R{d}, "=", op, R{x}); break;
477                 case Op::floor:     write(o, R{d}, "=", op, R{x}); break;
478                 case Op::to_f32:    write(o, R{d}, "=", op, R{x}); break;
479                 case Op::to_fp16:   write(o, R{d}, "=", op, R{x}); break;
480                 case Op::from_fp16: write(o, R{d}, "=", op, R{x}); break;
481                 case Op::trunc:     write(o, R{d}, "=", op, R{x}); break;
482                 case Op::round:     write(o, R{d}, "=", op, R{x}); break;
483 
484                 case Op::duplicate: write(o, R{d}, "=", op, Hex{immA}); break;
485             }
486             write(o, "\n");
487         }
488     }
eliminate_dead_code(std::vector<Instruction> program,viz::Visualizer * visualizer)489     std::vector<Instruction> eliminate_dead_code(std::vector<Instruction> program,
490                                                  viz::Visualizer* visualizer) {
491         // Determine which Instructions are live by working back from side effects.
492         std::vector<bool> live(program.size(), false);
493         for (Val id = program.size(); id--;) {
494             if (live[id] || has_side_effect(program[id].op)) {
495                 live[id] = true;
496                 const Instruction& inst = program[id];
497                 for (Val arg : {inst.x, inst.y, inst.z, inst.w}) {
498                     if (arg != NA) { live[arg] = true; }
499                 }
500             }
501         }
502 
503         // Rewrite the program with only live Instructions:
504         //   - remap IDs in live Instructions to what they'll be once dead Instructions are removed;
505         //   - then actually remove the dead Instructions.
506         std::vector<Val> new_id(program.size(), NA);
507         for (Val id = 0, next = 0; id < (Val)program.size(); id++) {
508             if (live[id]) {
509                 Instruction& inst = program[id];
510                 for (Val* arg : {&inst.x, &inst.y, &inst.z, &inst.w}) {
511                     if (*arg != NA) {
512                         *arg = new_id[*arg];
513                         SkASSERT(*arg != NA);
514                     }
515                 }
516                 new_id[id] = next++;
517             }
518         }
519 
520         if (visualizer) {
521             visualizer->addInstructions(program);
522             visualizer->markAsDeadCode(live, new_id);
523         }
524 
525         // Eliminate any non-live ops.
526         auto it = std::remove_if(program.begin(), program.end(), [&](const Instruction& inst) {
527             Val id = (Val)(&inst - program.data());
528             return !live[id];
529         });
530         program.erase(it, program.end());
531 
532         return program;
533     }
534 
finalize(const std::vector<Instruction> program,viz::Visualizer * visualizer)535     std::vector<OptimizedInstruction> finalize(const std::vector<Instruction> program,
536                                                viz::Visualizer* visualizer) {
537         std::vector<OptimizedInstruction> optimized(program.size());
538         for (Val id = 0; id < (Val)program.size(); id++) {
539             Instruction inst = program[id];
540             optimized[id] = {inst.op, inst.x,inst.y,inst.z,inst.w,
541                              inst.immA,inst.immB,inst.immC,
542                              /*death=*/id, /*can_hoist=*/true};
543         }
544 
545         // Each Instruction's inputs need to live at least until that Instruction issues.
546         for (Val id = 0; id < (Val)optimized.size(); id++) {
547             OptimizedInstruction& inst = optimized[id];
548             for (Val arg : {inst.x, inst.y, inst.z, inst.w}) {
549                 // (We're walking in order, so this is the same as max()ing with the existing Val.)
550                 if (arg != NA) { optimized[arg].death = id; }
551             }
552         }
553 
554         // Mark which values don't depend on the loop and can be hoisted.
555         for (OptimizedInstruction& inst : optimized) {
556             // Varying loads (and gathers) and stores cannot be hoisted out of the loop.
557             if (is_always_varying(inst.op) || is_trace(inst.op)) {
558                 inst.can_hoist = false;
559             }
560 
561             // If any of an instruction's inputs can't be hoisted, it can't be hoisted itself.
562             if (inst.can_hoist) {
563                 for (Val arg : {inst.x, inst.y, inst.z, inst.w}) {
564                     if (arg != NA) { inst.can_hoist &= optimized[arg].can_hoist; }
565                 }
566             }
567         }
568 
569         // Extend the lifetime of any hoisted value that's used in the loop to infinity.
570         for (OptimizedInstruction& inst : optimized) {
571             if (!inst.can_hoist /*i.e. we're in the loop, so the arguments are used-in-loop*/) {
572                 for (Val arg : {inst.x, inst.y, inst.z, inst.w}) {
573                     if (arg != NA && optimized[arg].can_hoist) {
574                         optimized[arg].death = (Val)program.size();
575                     }
576                 }
577             }
578         }
579 
580         if (visualizer) {
581             visualizer->finalize(program, optimized);
582         }
583 
584         return optimized;
585     }
586 
optimize(viz::Visualizer * visualizer) const587     std::vector<OptimizedInstruction> Builder::optimize(viz::Visualizer* visualizer) const {
588         std::vector<Instruction> program = this->program();
589         program = eliminate_dead_code(std::move(program), visualizer);
590         return    finalize           (std::move(program), visualizer);
591     }
592 
done(const char * debug_name,bool allow_jit) const593     Program Builder::done(const char* debug_name,
594                           bool allow_jit) const {
595         return this->done(debug_name, allow_jit, /*visualizer=*/nullptr);
596     }
597 
done(const char * debug_name,bool allow_jit,std::unique_ptr<viz::Visualizer> visualizer) const598     Program Builder::done(const char* debug_name,
599                           bool allow_jit,
600                           std::unique_ptr<viz::Visualizer> visualizer) const {
601         char buf[64] = "skvm-jit-";
602         if (!debug_name) {
603             *SkStrAppendU32(buf+9, this->hash()) = '\0';
604             debug_name = buf;
605         }
606 
607         auto optimized = this->optimize(visualizer ? visualizer.get() : nullptr);
608         return {optimized,
609                 std::move(visualizer),
610                 fStrides,
611                 fTraceHooks, debug_name, allow_jit};
612     }
613 
hash() const614     uint64_t Builder::hash() const {
615         uint32_t lo = SkOpts::hash(fProgram.data(), fProgram.size() * sizeof(Instruction), 0),
616                  hi = SkOpts::hash(fProgram.data(), fProgram.size() * sizeof(Instruction), 1);
617         return (uint64_t)lo | (uint64_t)hi << 32;
618     }
619 
operator !=(Ptr a,Ptr b)620     bool operator!=(Ptr a, Ptr b) { return a.ix != b.ix; }
621 
operator ==(const Instruction & a,const Instruction & b)622     bool operator==(const Instruction& a, const Instruction& b) {
623         return a.op   == b.op
624             && a.x    == b.x
625             && a.y    == b.y
626             && a.z    == b.z
627             && a.w    == b.w
628             && a.immA == b.immA
629             && a.immB == b.immB
630             && a.immC == b.immC;
631     }
632 
operator ()(const Instruction & inst,uint32_t seed) const633     uint32_t InstructionHash::operator()(const Instruction& inst, uint32_t seed) const {
634         return SkOpts::hash(&inst, sizeof(inst), seed);
635     }
636 
637 
    // Most instructions produce a value and return it by ID,
    // the value-producing instruction's own index in the program vector.
    Val Builder::push(Instruction inst) {
        // Basic common subexpression elimination:
        // if we've already seen this exact Instruction, use it instead of creating a new one.
        //
        // But we never dedup loads or stores: an intervening store could change that memory.
        // Uniforms and gathers touch only uniform memory, so they're fine to dedup,
        // and index is varying but doesn't touch memory, so it's fine to dedup too.
        if (!touches_varying_memory(inst.op) && !is_trace(inst.op)) {
            if (Val* id = fIndex.find(inst)) {
                if (fCreateDuplicates) {
                    // Record the dedup as an explicit Op::duplicate marker instead of
                    // silently reusing *id — presumably for debugging/visualization
                    // tooling; confirm against fCreateDuplicates' setters.
                    inst.op = Op::duplicate;
                    inst.immA = *id;
                    fProgram.push_back(inst);
                }
                return *id;
            }
        }

        // New instruction: append it and index it for future CSE hits.
        Val id = static_cast<Val>(fProgram.size());
        fProgram.push_back(inst);
        fIndex.set(inst, id);
        return id;
    }
663 
arg(int stride)664     Ptr Builder::arg(int stride) {
665         int ix = (int)fStrides.size();
666         fStrides.push_back(stride);
667         return {ix};
668     }
669 
assert_true(I32 cond,I32 debug)670     void Builder::assert_true(I32 cond, I32 debug) {
671     #ifdef SK_DEBUG
672         int imm;
673         if (this->allImm(cond.id,&imm)) { SkASSERT(imm); return; }
674         (void)push(Op::assert_true, cond.id, debug.id);
675     #endif
676     }
677 
attachTraceHook(TraceHook * hook)678     int Builder::attachTraceHook(TraceHook* hook) {
679         int traceHookID = (int)fTraceHooks.size();
680         fTraceHooks.push_back(hook);
681         return traceHookID;
682     }
683 
mergeMasks(I32 & mask,I32 & traceMask)684     bool Builder::mergeMasks(I32& mask, I32& traceMask) {
685         if (this->isImm(mask.id,      0)) { return false; }
686         if (this->isImm(traceMask.id, 0)) { return false; }
687         if (this->isImm(mask.id,     ~0)) { mask = traceMask; }
688         if (this->isImm(traceMask.id,~0)) { traceMask = mask; }
689         return true;
690     }
691 
trace_line(int traceHookID,I32 mask,I32 traceMask,int line)692     void Builder::trace_line(int traceHookID, I32 mask, I32 traceMask, int line) {
693         SkASSERT(traceHookID >= 0);
694         SkASSERT(traceHookID < (int)fTraceHooks.size());
695         if (!this->mergeMasks(mask, traceMask)) { return; }
696         (void)push(Op::trace_line, mask.id,traceMask.id,NA,NA, traceHookID, line);
697     }
trace_var(int traceHookID,I32 mask,I32 traceMask,int slot,I32 val)698     void Builder::trace_var(int traceHookID, I32 mask, I32 traceMask, int slot, I32 val) {
699         SkASSERT(traceHookID >= 0);
700         SkASSERT(traceHookID < (int)fTraceHooks.size());
701         if (!this->mergeMasks(mask, traceMask)) { return; }
702         (void)push(Op::trace_var, mask.id,traceMask.id,val.id,NA, traceHookID, slot);
703     }
trace_enter(int traceHookID,I32 mask,I32 traceMask,int fnIdx)704     void Builder::trace_enter(int traceHookID, I32 mask, I32 traceMask, int fnIdx) {
705         SkASSERT(traceHookID >= 0);
706         SkASSERT(traceHookID < (int)fTraceHooks.size());
707         if (!this->mergeMasks(mask, traceMask)) { return; }
708         (void)push(Op::trace_enter, mask.id,traceMask.id,NA,NA, traceHookID, fnIdx);
709     }
trace_exit(int traceHookID,I32 mask,I32 traceMask,int fnIdx)710     void Builder::trace_exit(int traceHookID, I32 mask, I32 traceMask, int fnIdx) {
711         SkASSERT(traceHookID >= 0);
712         SkASSERT(traceHookID < (int)fTraceHooks.size());
713         if (!this->mergeMasks(mask, traceMask)) { return; }
714         (void)push(Op::trace_exit, mask.id,traceMask.id,NA,NA, traceHookID, fnIdx);
715     }
trace_scope(int traceHookID,I32 mask,I32 traceMask,int delta)716     void Builder::trace_scope(int traceHookID, I32 mask, I32 traceMask, int delta) {
717         SkASSERT(traceHookID >= 0);
718         SkASSERT(traceHookID < (int)fTraceHooks.size());
719         if (!this->mergeMasks(mask, traceMask)) { return; }
720         (void)push(Op::trace_scope, mask.id,traceMask.id,NA,NA, traceHookID, delta);
721     }
722 
store8(Ptr ptr,I32 val)723     void Builder::store8 (Ptr ptr, I32 val) { (void)push(Op::store8 , val.id,NA,NA,NA, ptr.ix); }
store16(Ptr ptr,I32 val)724     void Builder::store16(Ptr ptr, I32 val) { (void)push(Op::store16, val.id,NA,NA,NA, ptr.ix); }
store32(Ptr ptr,I32 val)725     void Builder::store32(Ptr ptr, I32 val) { (void)push(Op::store32, val.id,NA,NA,NA, ptr.ix); }
store64(Ptr ptr,I32 lo,I32 hi)726     void Builder::store64(Ptr ptr, I32 lo, I32 hi) {
727         (void)push(Op::store64, lo.id,hi.id,NA,NA, ptr.ix);
728     }
store128(Ptr ptr,I32 x,I32 y,I32 z,I32 w)729     void Builder::store128(Ptr ptr, I32 x, I32 y, I32 z, I32 w) {
730         (void)push(Op::store128, x.id,y.id,z.id,w.id, ptr.ix);
731     }
732 
index()733     I32 Builder::index() { return {this, push(Op::index)}; }
734 
load8(Ptr ptr)735     I32 Builder::load8 (Ptr ptr) { return {this, push(Op::load8 , NA,NA,NA,NA, ptr.ix) }; }
load16(Ptr ptr)736     I32 Builder::load16(Ptr ptr) { return {this, push(Op::load16, NA,NA,NA,NA, ptr.ix) }; }
load32(Ptr ptr)737     I32 Builder::load32(Ptr ptr) { return {this, push(Op::load32, NA,NA,NA,NA, ptr.ix) }; }
load64(Ptr ptr,int lane)738     I32 Builder::load64(Ptr ptr, int lane) {
739         return {this, push(Op::load64 , NA,NA,NA,NA, ptr.ix,lane) };
740     }
load128(Ptr ptr,int lane)741     I32 Builder::load128(Ptr ptr, int lane) {
742         return {this, push(Op::load128, NA,NA,NA,NA, ptr.ix,lane) };
743     }
744 
gather8(UPtr ptr,int offset,I32 index)745     I32 Builder::gather8 (UPtr ptr, int offset, I32 index) {
746         return {this, push(Op::gather8 , index.id,NA,NA,NA, ptr.ix,offset)};
747     }
gather16(UPtr ptr,int offset,I32 index)748     I32 Builder::gather16(UPtr ptr, int offset, I32 index) {
749         return {this, push(Op::gather16, index.id,NA,NA,NA, ptr.ix,offset)};
750     }
gather32(UPtr ptr,int offset,I32 index)751     I32 Builder::gather32(UPtr ptr, int offset, I32 index) {
752         return {this, push(Op::gather32, index.id,NA,NA,NA, ptr.ix,offset)};
753     }
754 
uniform32(UPtr ptr,int offset)755     I32 Builder::uniform32(UPtr ptr, int offset) {
756         return {this, push(Op::uniform32, NA,NA,NA,NA, ptr.ix, offset)};
757     }
758 
    // Note: this converts the array index into a byte offset for the op.
    // Like uniform32(), but the value comes from element `index` of a uniform
    // int array located at ptr+offset.
    I32 Builder::array32  (UPtr ptr, int offset, int index) {
        return {this, push(Op::array32, NA,NA,NA,NA, ptr.ix, offset, index * sizeof(int))};
    }
763 
splat(int n)764     I32 Builder::splat(int n) { return {this, push(Op::splat, NA,NA,NA,NA, n) }; }
765 
766     // Be careful peepholing float math!  Transformations you might expect to
767     // be legal can fail in the face of NaN/Inf, e.g. 0*x is not always 0.
768     // Float peepholes must pass this equivalence test for all ~4B floats:
769     //
770     //     bool equiv(float x, float y) { return (x == y) || (isnanf(x) && isnanf(y)); }
771     //
772     //     unsigned bits = 0;
773     //     do {
774     //        float f;
775     //        memcpy(&f, &bits, 4);
776     //        if (!equiv(f, ...)) {
777     //           abort();
778     //        }
779     //     } while (++bits != 0);
780 
add(F32 x,F32 y)781     F32 Builder::add(F32 x, F32 y) {
782         if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X+Y); }
783         if (this->isImm(y.id, 0.0f)) { return x; }   // x+0 == x
784         if (this->isImm(x.id, 0.0f)) { return y; }   // 0+y == y
785 
786         if (fFeatures.fma) {
787             if (fProgram[x.id].op == Op::mul_f32) {
788                 return {this, this->push(Op::fma_f32, fProgram[x.id].x, fProgram[x.id].y, y.id)};
789             }
790             if (fProgram[y.id].op == Op::mul_f32) {
791                 return {this, this->push(Op::fma_f32, fProgram[y.id].x, fProgram[y.id].y, x.id)};
792             }
793         }
794         return {this, this->push(Op::add_f32, std::min(x.id, y.id), std::max(x.id, y.id))};
795     }
796 
sub(F32 x,F32 y)797     F32 Builder::sub(F32 x, F32 y) {
798         if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X-Y); }
799         if (this->isImm(y.id, 0.0f)) { return x; }   // x-0 == x
800         if (fFeatures.fma) {
801             if (fProgram[x.id].op == Op::mul_f32) {
802                 return {this, this->push(Op::fms_f32, fProgram[x.id].x, fProgram[x.id].y, y.id)};
803             }
804             if (fProgram[y.id].op == Op::mul_f32) {
805                 return {this, this->push(Op::fnma_f32, fProgram[y.id].x, fProgram[y.id].y, x.id)};
806             }
807         }
808         return {this, this->push(Op::sub_f32, x.id, y.id)};
809     }
810 
mul(F32 x,F32 y)811     F32 Builder::mul(F32 x, F32 y) {
812         if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X*Y); }
813         if (this->isImm(y.id, 1.0f)) { return x; }  // x*1 == x
814         if (this->isImm(x.id, 1.0f)) { return y; }  // 1*y == y
815         return {this, this->push(Op::mul_f32, std::min(x.id, y.id), std::max(x.id, y.id))};
816     }
817 
fast_mul(F32 x,F32 y)818     F32 Builder::fast_mul(F32 x, F32 y) {
819         if (this->isImm(x.id, 0.0f) || this->isImm(y.id, 0.0f)) { return splat(0.0f); }
820         return mul(x,y);
821     }
822 
div(F32 x,F32 y)823     F32 Builder::div(F32 x, F32 y) {
824         if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(sk_ieee_float_divide(X,Y)); }
825         if (this->isImm(y.id, 1.0f)) { return x; }  // x/1 == x
826         return {this, this->push(Op::div_f32, x.id, y.id)};
827     }
828 
sqrt(F32 x)829     F32 Builder::sqrt(F32 x) {
830         if (float X; this->allImm(x.id,&X)) { return splat(std::sqrt(X)); }
831         return {this, this->push(Op::sqrt_f32, x.id)};
832     }
833 
    // See http://www.machinedlearnings.com/2011/06/fast-approximate-logarithm-exponential.html.
    F32 Builder::approx_log2(F32 x) {
        // e - 127 is a fair approximation of log2(x) in its own right...
        // (punning x's bits to int and scaling by 2^-23 puts the biased exponent
        // in the integer part.)
        F32 e = mul(to_F32(pun_to_I32(x)), splat(1.0f / (1<<23)));

        // ... but using the mantissa to refine its error is _much_ better.
        // Keep x's mantissa bits and force the exponent to that of 0.5,
        // yielding m in [0.5, 1.0).
        F32 m = pun_to_F32(bit_or(bit_and(pun_to_I32(x), 0x007fffff),
                                0x3f000000));
        F32 approx = sub(e,        124.225514990f);
            approx = sub(approx, mul(1.498030302f, m));
            approx = sub(approx, div(1.725879990f, add(0.3520887068f, m)));

        return approx;
    }
848 
approx_pow2(F32 x)849     F32 Builder::approx_pow2(F32 x) {
850         constexpr float kInfinityBits = 0x7f800000;
851 
852         F32 f = fract(x);
853         F32 approx = add(x,         121.274057500f);
854             approx = sub(approx, mul( 1.490129070f, f));
855             approx = add(approx, div(27.728023300f, sub(4.84252568f, f)));
856             approx = mul(1.0f * (1<<23), approx);
857             approx = clamp(approx, 0, kInfinityBits);  // guard against underflow/overflow
858 
859         return pun_to_F32(round(approx));
860     }
861 
approx_powf(F32 x,F32 y)862     F32 Builder::approx_powf(F32 x, F32 y) {
863         // TODO: assert this instead?  Sometimes x is very slightly negative.  See skia:10210.
864         x = max(0.0f, x);
865 
866         if (this->isImm(x.id, 1.0f)) { return x; }                    // 1^y is one
867         if (this->isImm(x.id, 2.0f)) { return this->approx_pow2(y); } // 2^y is pow2(y)
868         if (this->isImm(y.id, 0.5f)) { return this->sqrt(x); }        // x^0.5 is sqrt(x)
869         if (this->isImm(y.id, 1.0f)) { return x; }                    // x^1 is x
870         if (this->isImm(y.id, 2.0f)) { return x * x; }                // x^2 is x*x
871 
872         auto is_x = bit_or(eq(x, 0.0f),
873                            eq(x, 1.0f));
874         return select(is_x, x, approx_pow2(mul(approx_log2(x), y)));
875     }
876 
877     // Bhaskara I's sine approximation
878     // 16x(pi - x) / (5*pi^2 - 4x(pi - x)
879     // ... divide by 4
880     // 4x(pi - x) / 5*pi^2/4 - x(pi - x)
881     //
882     // This is a good approximation only for 0 <= x <= pi, so we use symmetries to get
883     // radians into that range first.
884     //
approx_sin(F32 radians)885     F32 Builder::approx_sin(F32 radians) {
886         constexpr float Pi = SK_ScalarPI;
887         // x = radians mod 2pi
888         F32 x = fract(radians * (0.5f/Pi)) * (2*Pi);
889         I32 neg = x > Pi;   // are we pi < x < 2pi --> need to negate result
890         x = select(neg, x - Pi, x);
891 
892         F32 pair = x * (Pi - x);
893         x = 4.0f * pair / ((5*Pi*Pi/4) - pair);
894         x = select(neg, -x, x);
895         return x;
896     }
897 
898     /*  "GENERATING ACCURATE VALUES FOR THE TANGENT FUNCTION"
899          https://mae.ufl.edu/~uhk/ACCURATE-TANGENT.pdf
900 
901         approx = x + (1/3)x^3 + (2/15)x^5 + (17/315)x^7 + (62/2835)x^9
902 
903         Some simplifications:
904         1. tan(x) is periodic, -PI/2 < x < PI/2
905         2. tan(x) is odd, so tan(-x) = -tan(x)
906         3. Our polynomial approximation is best near zero, so we use the following identity
907                         tan(x) + tan(y)
908            tan(x + y) = -----------------
909                        1 - tan(x)*tan(y)
910            tan(PI/4) = 1
911 
912            So for x > PI/8, we do the following refactor:
913            x' = x - PI/4
914 
915                     1 + tan(x')
916            tan(x) = ------------
917                     1 - tan(x')
918      */
approx_tan(F32 x)919     F32 Builder::approx_tan(F32 x) {
920         constexpr float Pi = SK_ScalarPI;
921         // periodic between -pi/2 ... pi/2
922         // shift to 0...Pi, scale 1/Pi to get into 0...1, then fract, scale-up, shift-back
923         x = fract((1/Pi)*x + 0.5f) * Pi - (Pi/2);
924 
925         I32 neg = (x < 0.0f);
926         x = select(neg, -x, x);
927 
928         // minimize total error by shifting if x > pi/8
929         I32 use_quotient = (x > (Pi/8));
930         x = select(use_quotient, x - (Pi/4), x);
931 
932         // 9th order poly = 4th order(x^2) * x
933         x = poly(x*x, 62/2835.0f, 17/315.0f, 2/15.0f, 1/3.0f, 1.0f) * x;
934         x = select(use_quotient, (1+x)/(1-x), x);
935         x = select(neg, -x, x);
936         return x;
937     }
938 
939      // http://mathforum.org/library/drmath/view/54137.html
940      // referencing Handbook of Mathematical Functions,
941      //             by Milton Abramowitz and Irene Stegun
approx_asin(F32 x)942      F32 Builder::approx_asin(F32 x) {
943          I32 neg = (x < 0.0f);
944          x = select(neg, -x, x);
945          x = SK_ScalarPI/2 - sqrt(1-x) * poly(x, -0.0187293f, 0.0742610f, -0.2121144f, 1.5707288f);
946          x = select(neg, -x, x);
947          return x;
948      }
949 
950     /*  Use 4th order polynomial approximation from https://arachnoid.com/polysolve/
951      *      with 129 values of x,atan(x) for x:[0...1]
952      *  This only works for 0 <= x <= 1
953      */
approx_atan_unit(F32 x)954     static F32 approx_atan_unit(F32 x) {
955         // for now we might be given NaN... let that through
956         x->assert_true((x != x) | ((x >= 0) & (x <= 1)));
957         return poly(x, 0.14130025741326729f,
958                       -0.34312835980675116f,
959                       -0.016172900528248768f,
960                        1.0037696976200385f,
961                       -0.00014758242182738969f);
962     }
963 
964     /*  Use identity atan(x) = pi/2 - atan(1/x) for x > 1
965      */
approx_atan(F32 x)966     F32 Builder::approx_atan(F32 x) {
967         I32 neg = (x < 0.0f);
968         x = select(neg, -x, x);
969         I32 flip = (x > 1.0f);
970         x = select(flip, 1/x, x);
971         x = approx_atan_unit(x);
972         x = select(flip, SK_ScalarPI/2 - x, x);
973         x = select(neg, -x, x);
974         return x;
975     }
976 
977     /*  Use identity atan(x) = pi/2 - atan(1/x) for x > 1
978      *  By swapping y,x to ensure the ratio is <= 1, we can safely call atan_unit()
979      *  which avoids a 2nd divide instruction if we had instead called atan().
980      */
approx_atan2(F32 y0,F32 x0)981     F32 Builder::approx_atan2(F32 y0, F32 x0) {
982 
983         I32 flip = (abs(y0) > abs(x0));
984         F32 y = select(flip, x0, y0);
985         F32 x = select(flip, y0, x0);
986         F32 arg = y/x;
987 
988         I32 neg = (arg < 0.0f);
989         arg = select(neg, -arg, arg);
990 
991         F32 r = approx_atan_unit(arg);
992         r = select(flip, SK_ScalarPI/2 - r, r);
993         r = select(neg, -r, r);
994 
995         // handle quadrant distinctions
996         r = select((y0 >= 0) & (x0  < 0), r + SK_ScalarPI, r);
997         r = select((y0  < 0) & (x0 <= 0), r - SK_ScalarPI, r);
998         // Note: we don't try to handle 0,0 or infinities (yet)
999         return r;
1000     }
1001 
min(F32 x,F32 y)1002     F32 Builder::min(F32 x, F32 y) {
1003         if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(std::min(X,Y)); }
1004         return {this, this->push(Op::min_f32, x.id, y.id)};
1005     }
max(F32 x,F32 y)1006     F32 Builder::max(F32 x, F32 y) {
1007         if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(std::max(X,Y)); }
1008         return {this, this->push(Op::max_f32, x.id, y.id)};
1009     }
1010 
    // Integer add with folding and x+0 elision; no_sanitize because the
    // constant fold X+Y may legitimately overflow int here.
    SK_ATTRIBUTE(no_sanitize("signed-integer-overflow"))
    I32 Builder::add(I32 x, I32 y) {
        if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X+Y); }
        if (this->isImm(x.id, 0)) { return y; }
        if (this->isImm(y.id, 0)) { return x; }
        // Canonical operand order (min/max) lets x+y and y+x CSE together.
        return {this, this->push(Op::add_i32, std::min(x.id, y.id), std::max(x.id, y.id))};
    }
    // Integer subtract with folding and x-0 elision (overflow folds allowed).
    SK_ATTRIBUTE(no_sanitize("signed-integer-overflow"))
    I32 Builder::sub(I32 x, I32 y) {
        if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X-Y); }
        if (this->isImm(y.id, 0)) { return x; }
        return {this, this->push(Op::sub_i32, x.id, y.id)};
    }
    // Integer multiply with folding, x*0 -> 0, and x*1 elision.
    // (Unlike float mul, x*0 == 0 is always safe for ints.)
    SK_ATTRIBUTE(no_sanitize("signed-integer-overflow"))
    I32 Builder::mul(I32 x, I32 y) {
        if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X*Y); }
        if (this->isImm(x.id, 0)) { return splat(0); }
        if (this->isImm(y.id, 0)) { return splat(0); }
        if (this->isImm(x.id, 1)) { return y; }
        if (this->isImm(y.id, 1)) { return x; }
        return {this, this->push(Op::mul_i32, std::min(x.id, y.id), std::max(x.id, y.id))};
    }
1033 
    // Left shift by a constant bit count; no_sanitize("shift") permits
    // folding shifts of negative constants.
    SK_ATTRIBUTE(no_sanitize("shift"))
    I32 Builder::shl(I32 x, int bits) {
        if (bits == 0) { return x; }
        if (int X; this->allImm(x.id,&X)) { return splat(X << bits); }
        return {this, this->push(Op::shl_i32, x.id,NA,NA,NA, bits)};
    }
shr(I32 x,int bits)1040     I32 Builder::shr(I32 x, int bits) {
1041         if (bits == 0) { return x; }
1042         if (int X; this->allImm(x.id,&X)) { return splat(unsigned(X) >> bits); }
1043         return {this, this->push(Op::shr_i32, x.id,NA,NA,NA, bits)};
1044     }
sra(I32 x,int bits)1045     I32 Builder::sra(I32 x, int bits) {
1046         if (bits == 0) { return x; }
1047         if (int X; this->allImm(x.id,&X)) { return splat(X >> bits); }
1048         return {this, this->push(Op::sra_i32, x.id,NA,NA,NA, bits)};
1049     }
1050 
eq(F32 x,F32 y)1051     I32 Builder:: eq(F32 x, F32 y) {
1052         if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X==Y ? ~0 : 0); }
1053         return {this, this->push(Op::eq_f32, std::min(x.id, y.id), std::max(x.id, y.id))};
1054     }
neq(F32 x,F32 y)1055     I32 Builder::neq(F32 x, F32 y) {
1056         if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X!=Y ? ~0 : 0); }
1057         return {this, this->push(Op::neq_f32, std::min(x.id, y.id), std::max(x.id, y.id))};
1058     }
lt(F32 x,F32 y)1059     I32 Builder::lt(F32 x, F32 y) {
1060         if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(Y> X ? ~0 : 0); }
1061         return {this, this->push(Op::gt_f32, y.id, x.id)};
1062     }
lte(F32 x,F32 y)1063     I32 Builder::lte(F32 x, F32 y) {
1064         if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(Y>=X ? ~0 : 0); }
1065         return {this, this->push(Op::gte_f32, y.id, x.id)};
1066     }
gt(F32 x,F32 y)1067     I32 Builder::gt(F32 x, F32 y) {
1068         if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X> Y ? ~0 : 0); }
1069         return {this, this->push(Op::gt_f32, x.id, y.id)};
1070     }
gte(F32 x,F32 y)1071     I32 Builder::gte(F32 x, F32 y) {
1072         if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X>=Y ? ~0 : 0); }
1073         return {this, this->push(Op::gte_f32, x.id, y.id)};
1074     }
1075 
eq(I32 x,I32 y)1076     I32 Builder:: eq(I32 x, I32 y) {
1077         if (x.id == y.id) { return splat(~0); }
1078         if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X==Y ? ~0 : 0); }
1079         return {this, this->push(Op:: eq_i32, std::min(x.id, y.id), std::max(x.id, y.id))};
1080     }
neq(I32 x,I32 y)1081     I32 Builder::neq(I32 x, I32 y) {
1082         if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X!=Y ? ~0 : 0); }
1083         return ~(x == y);
1084     }
gt(I32 x,I32 y)1085     I32 Builder:: gt(I32 x, I32 y) {
1086         if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X> Y ? ~0 : 0); }
1087         return {this, this->push(Op:: gt_i32, x.id, y.id)};
1088     }
gte(I32 x,I32 y)1089     I32 Builder::gte(I32 x, I32 y) {
1090         if (x.id == y.id) { return splat(~0); }
1091         if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X>=Y ? ~0 : 0); }
1092         return ~(x < y);
1093     }
lt(I32 x,I32 y)1094     I32 Builder:: lt(I32 x, I32 y) { return y>x; }
lte(I32 x,I32 y)1095     I32 Builder::lte(I32 x, I32 y) { return y>=x; }
1096 
bit_and(I32 x,I32 y)1097     I32 Builder::bit_and(I32 x, I32 y) {
1098         if (x.id == y.id) { return x; }
1099         if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X&Y); }
1100         if (this->isImm(y.id, 0)) { return splat(0); }   // (x & false) == false
1101         if (this->isImm(x.id, 0)) { return splat(0); }   // (false & y) == false
1102         if (this->isImm(y.id,~0)) { return x; }          // (x & true) == x
1103         if (this->isImm(x.id,~0)) { return y; }          // (true & y) == y
1104         return {this, this->push(Op::bit_and, std::min(x.id, y.id), std::max(x.id, y.id))};
1105     }
bit_or(I32 x,I32 y)1106     I32 Builder::bit_or(I32 x, I32 y) {
1107         if (x.id == y.id) { return x; }
1108         if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X|Y); }
1109         if (this->isImm(y.id, 0)) { return x; }           // (x | false) == x
1110         if (this->isImm(x.id, 0)) { return y; }           // (false | y) == y
1111         if (this->isImm(y.id,~0)) { return splat(~0); }   // (x | true) == true
1112         if (this->isImm(x.id,~0)) { return splat(~0); }   // (true | y) == true
1113         return {this, this->push(Op::bit_or, std::min(x.id, y.id), std::max(x.id, y.id))};
1114     }
bit_xor(I32 x,I32 y)1115     I32 Builder::bit_xor(I32 x, I32 y) {
1116         if (x.id == y.id) { return splat(0); }
1117         if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X^Y); }
1118         if (this->isImm(y.id, 0)) { return x; }   // (x ^ false) == x
1119         if (this->isImm(x.id, 0)) { return y; }   // (false ^ y) == y
1120         return {this, this->push(Op::bit_xor, std::min(x.id, y.id), std::max(x.id, y.id))};
1121     }
1122 
bit_clear(I32 x,I32 y)1123     I32 Builder::bit_clear(I32 x, I32 y) {
1124         if (x.id == y.id) { return splat(0); }
1125         if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X&~Y); }
1126         if (this->isImm(y.id, 0)) { return x; }          // (x & ~false) == x
1127         if (this->isImm(y.id,~0)) { return splat(0); }   // (x & ~true) == false
1128         if (this->isImm(x.id, 0)) { return splat(0); }   // (false & ~y) == false
1129         return {this, this->push(Op::bit_clear, x.id, y.id)};
1130     }
1131 
select(I32 x,I32 y,I32 z)1132     I32 Builder::select(I32 x, I32 y, I32 z) {
1133         if (y.id == z.id) { return y; }
1134         if (int X,Y,Z; this->allImm(x.id,&X, y.id,&Y, z.id,&Z)) { return splat(X?Y:Z); }
1135         if (this->isImm(x.id,~0)) { return y; }               // true  ? y : z == y
1136         if (this->isImm(x.id, 0)) { return z; }               // false ? y : z == z
1137         if (this->isImm(y.id, 0)) { return bit_clear(z,x); }  //     x ? 0 : z == ~x&z
1138         if (this->isImm(z.id, 0)) { return bit_and  (y,x); }  //     x ? y : 0 ==  x&y
1139         return {this, this->push(Op::select, x.id, y.id, z.id)};
1140     }
1141 
extract(I32 x,int bits,I32 z)1142     I32 Builder::extract(I32 x, int bits, I32 z) {
1143         if (unsigned Z; this->allImm(z.id,&Z) && (~0u>>bits) == Z) { return shr(x, bits); }
1144         return bit_and(z, shr(x, bits));
1145     }
1146 
pack(I32 x,I32 y,int bits)1147     I32 Builder::pack(I32 x, I32 y, int bits) {
1148         return bit_or(x, shl(y, bits));
1149     }
1150 
ceil(F32 x)1151     F32 Builder::ceil(F32 x) {
1152         if (float X; this->allImm(x.id,&X)) { return splat(ceilf(X)); }
1153         return {this, this->push(Op::ceil, x.id)};
1154     }
floor(F32 x)1155     F32 Builder::floor(F32 x) {
1156         if (float X; this->allImm(x.id,&X)) { return splat(floorf(X)); }
1157         return {this, this->push(Op::floor, x.id)};
1158     }
to_F32(I32 x)1159     F32 Builder::to_F32(I32 x) {
1160         if (int X; this->allImm(x.id,&X)) { return splat((float)X); }
1161         return {this, this->push(Op::to_f32, x.id)};
1162     }
trunc(F32 x)1163     I32 Builder::trunc(F32 x) {
1164         if (float X; this->allImm(x.id,&X)) { return splat((int)X); }
1165         return {this, this->push(Op::trunc, x.id)};
1166     }
round(F32 x)1167     I32 Builder::round(F32 x) {
1168         if (float X; this->allImm(x.id,&X)) { return splat((int)lrintf(X)); }
1169         return {this, this->push(Op::round, x.id)};
1170     }
1171 
to_fp16(F32 x)1172     I32 Builder::to_fp16(F32 x) {
1173         if (float X; this->allImm(x.id,&X)) { return splat((int)SkFloatToHalf(X)); }
1174         return {this, this->push(Op::to_fp16, x.id)};
1175     }
from_fp16(I32 x)1176     F32 Builder::from_fp16(I32 x) {
1177         if (int X; this->allImm(x.id,&X)) { return splat(SkHalfToFloat(X)); }
1178         return {this, this->push(Op::from_fp16, x.id)};
1179     }
1180 
from_unorm(int bits,I32 x)1181     F32 Builder::from_unorm(int bits, I32 x) {
1182         F32 limit = splat(1 / ((1<<bits)-1.0f));
1183         return mul(to_F32(x), limit);
1184     }
to_unorm(int bits,F32 x)1185     I32 Builder::to_unorm(int bits, F32 x) {
1186         F32 limit = splat((1<<bits)-1.0f);
1187         return round(mul(x, limit));
1188     }
1189 
SkColorType_to_PixelFormat(SkColorType ct)1190     PixelFormat SkColorType_to_PixelFormat(SkColorType ct) {
1191         auto UNORM = PixelFormat::UNORM,
1192              SRGB  = PixelFormat::SRGB,
1193              FLOAT = PixelFormat::FLOAT;
1194         switch (ct) {
1195             case kUnknown_SkColorType: break;
1196 
1197             case kRGBA_F32_SkColorType: return {FLOAT,32,32,32,32, 0,32,64,96};
1198 
1199             case kRGBA_F16Norm_SkColorType:       return {FLOAT,16,16,16,16, 0,16,32,48};
1200             case kRGBA_F16_SkColorType:           return {FLOAT,16,16,16,16, 0,16,32,48};
1201             case kR16G16B16A16_unorm_SkColorType: return {UNORM,16,16,16,16, 0,16,32,48};
1202 
1203             case kA16_float_SkColorType:    return {FLOAT,  0, 0,0,16, 0, 0,0,0};
1204             case kR16G16_float_SkColorType: return {FLOAT, 16,16,0, 0, 0,16,0,0};
1205 
1206             case kAlpha_8_SkColorType:  return {UNORM, 0,0,0,8, 0,0,0,0};
1207             case kGray_8_SkColorType:   return {UNORM, 8,8,8,0, 0,0,0,0};  // Subtle.
1208             case kR8_unorm_SkColorType: return {UNORM, 8,0,0,0, 0,0,0,0};
1209 
1210             case kRGB_565_SkColorType:   return {UNORM, 5,6,5,0, 11,5,0,0};  // (BGR)
1211             case kARGB_4444_SkColorType: return {UNORM, 4,4,4,4, 12,8,4,0};  // (ABGR)
1212 
1213             case kRGBA_8888_SkColorType:  return {UNORM, 8,8,8,8,  0,8,16,24};
1214             case kRGB_888x_SkColorType:   return {UNORM, 8,8,8,0,  0,8,16,32};  // 32-bit
1215             case kBGRA_8888_SkColorType:  return {UNORM, 8,8,8,8, 16,8, 0,24};
1216             case kSRGBA_8888_SkColorType: return { SRGB, 8,8,8,8,  0,8,16,24};
1217 
1218             case kRGBA_1010102_SkColorType: return {UNORM, 10,10,10,2,  0,10,20,30};
1219             case kBGRA_1010102_SkColorType: return {UNORM, 10,10,10,2, 20,10, 0,30};
1220             case kRGB_101010x_SkColorType:  return {UNORM, 10,10,10,0,  0,10,20, 0};
1221             case kBGR_101010x_SkColorType:  return {UNORM, 10,10,10,0, 20,10, 0, 0};
1222 
1223             case kR8G8_unorm_SkColorType:   return {UNORM,  8, 8,0, 0, 0, 8,0,0};
1224             case kR16G16_unorm_SkColorType: return {UNORM, 16,16,0, 0, 0,16,0,0};
1225             case kA16_unorm_SkColorType:    return {UNORM,  0, 0,0,16, 0, 0,0,0};
1226         }
1227         SkASSERT(false);
1228         return {UNORM, 0,0,0,0, 0,0,0,0};
1229     }
1230 
byte_size(PixelFormat f)1231     static int byte_size(PixelFormat f) {
1232         // What's the highest bit we read?
1233         int bits = std::max(f.r_bits + f.r_shift,
1234                    std::max(f.g_bits + f.g_shift,
1235                    std::max(f.b_bits + f.b_shift,
1236                             f.a_bits + f.a_shift)));
1237         // Round up to bytes.
1238         return (bits + 7) / 8;
1239     }
1240 
unpack(PixelFormat f,I32 x)1241     static Color unpack(PixelFormat f, I32 x) {
1242         SkASSERT(byte_size(f) <= 4);
1243 
1244         auto from_srgb = [](int bits, I32 channel) -> F32 {
1245             const skcms_TransferFunction* tf = skcms_sRGB_TransferFunction();
1246             F32 v = from_unorm(bits, channel);
1247             return sk_program_transfer_fn(v, sRGBish_TF,
1248                                           v->splat(tf->g),
1249                                           v->splat(tf->a),
1250                                           v->splat(tf->b),
1251                                           v->splat(tf->c),
1252                                           v->splat(tf->d),
1253                                           v->splat(tf->e),
1254                                           v->splat(tf->f));
1255         };
1256 
1257         auto unpack_rgb = [=](int bits, int shift) -> F32 {
1258             I32 channel = extract(x, shift, (1<<bits)-1);
1259             switch (f.encoding) {
1260                 case PixelFormat::UNORM: return from_unorm(bits, channel);
1261                 case PixelFormat:: SRGB: return from_srgb (bits, channel);
1262                 case PixelFormat::FLOAT: return from_fp16 (      channel);
1263             }
1264             SkUNREACHABLE;
1265         };
1266         auto unpack_alpha = [=](int bits, int shift) -> F32 {
1267             I32 channel = extract(x, shift, (1<<bits)-1);
1268             switch (f.encoding) {
1269                 case PixelFormat::UNORM:
1270                 case PixelFormat:: SRGB: return from_unorm(bits, channel);
1271                 case PixelFormat::FLOAT: return from_fp16 (      channel);
1272             }
1273             SkUNREACHABLE;
1274         };
1275         return {
1276             f.r_bits ? unpack_rgb  (f.r_bits, f.r_shift) : x->splat(0.0f),
1277             f.g_bits ? unpack_rgb  (f.g_bits, f.g_shift) : x->splat(0.0f),
1278             f.b_bits ? unpack_rgb  (f.b_bits, f.b_shift) : x->splat(0.0f),
1279             f.a_bits ? unpack_alpha(f.a_bits, f.a_shift) : x->splat(1.0f),
1280         };
1281     }
1282 
split_disjoint_8byte_format(PixelFormat f,PixelFormat * lo,PixelFormat * hi)1283     static void split_disjoint_8byte_format(PixelFormat f, PixelFormat* lo, PixelFormat* hi) {
1284         SkASSERT(byte_size(f) == 8);
1285         // We assume some of the channels are in the low 32 bits, some in the high 32 bits.
1286         // The assert on byte_size(lo) will trigger if this assumption is violated.
1287         *lo = f;
1288         if (f.r_shift >= 32) { lo->r_bits = 0; lo->r_shift = 32; }
1289         if (f.g_shift >= 32) { lo->g_bits = 0; lo->g_shift = 32; }
1290         if (f.b_shift >= 32) { lo->b_bits = 0; lo->b_shift = 32; }
1291         if (f.a_shift >= 32) { lo->a_bits = 0; lo->a_shift = 32; }
1292         SkASSERT(byte_size(*lo) == 4);
1293 
1294         *hi = f;
1295         if (f.r_shift < 32) { hi->r_bits = 0; hi->r_shift = 32; } else { hi->r_shift -= 32; }
1296         if (f.g_shift < 32) { hi->g_bits = 0; hi->g_shift = 32; } else { hi->g_shift -= 32; }
1297         if (f.b_shift < 32) { hi->b_bits = 0; hi->b_shift = 32; } else { hi->b_shift -= 32; }
1298         if (f.a_shift < 32) { hi->a_bits = 0; hi->a_shift = 32; } else { hi->a_shift -= 32; }
1299         SkASSERT(byte_size(*hi) == 4);
1300     }
1301 
1302     // The only 16-byte format we support today is RGBA F32,
1303     // though, TODO, we could generalize that to any swizzle, and to allow UNORM too.
assert_16byte_is_rgba_f32(PixelFormat f)1304     static void assert_16byte_is_rgba_f32(PixelFormat f) {
1305     #if defined(SK_DEBUG)
1306         SkASSERT(byte_size(f) == 16);
1307         PixelFormat rgba_f32 = SkColorType_to_PixelFormat(kRGBA_F32_SkColorType);
1308 
1309         SkASSERT(f.encoding == rgba_f32.encoding);
1310 
1311         SkASSERT(f.r_bits == rgba_f32.r_bits);
1312         SkASSERT(f.g_bits == rgba_f32.g_bits);
1313         SkASSERT(f.b_bits == rgba_f32.b_bits);
1314         SkASSERT(f.a_bits == rgba_f32.a_bits);
1315 
1316         SkASSERT(f.r_shift == rgba_f32.r_shift);
1317         SkASSERT(f.g_shift == rgba_f32.g_shift);
1318         SkASSERT(f.b_shift == rgba_f32.b_shift);
1319         SkASSERT(f.a_shift == rgba_f32.a_shift);
1320     #endif
1321     }
1322 
load(PixelFormat f,Ptr ptr)1323     Color Builder::load(PixelFormat f, Ptr ptr) {
1324         switch (byte_size(f)) {
1325             case 1: return unpack(f, load8 (ptr));
1326             case 2: return unpack(f, load16(ptr));
1327             case 4: return unpack(f, load32(ptr));
1328             case 8: {
1329                 PixelFormat lo,hi;
1330                 split_disjoint_8byte_format(f, &lo,&hi);
1331                 Color l = unpack(lo, load64(ptr, 0)),
1332                       h = unpack(hi, load64(ptr, 1));
1333                 return {
1334                     lo.r_bits ? l.r : h.r,
1335                     lo.g_bits ? l.g : h.g,
1336                     lo.b_bits ? l.b : h.b,
1337                     lo.a_bits ? l.a : h.a,
1338                 };
1339             }
1340             case 16: {
1341                 assert_16byte_is_rgba_f32(f);
1342                 return {
1343                     pun_to_F32(load128(ptr, 0)),
1344                     pun_to_F32(load128(ptr, 1)),
1345                     pun_to_F32(load128(ptr, 2)),
1346                     pun_to_F32(load128(ptr, 3)),
1347                 };
1348             }
1349             default: SkUNREACHABLE;
1350         }
1351         return {};
1352     }
1353 
gather(PixelFormat f,UPtr ptr,int offset,I32 index)1354     Color Builder::gather(PixelFormat f, UPtr ptr, int offset, I32 index) {
1355         switch (byte_size(f)) {
1356             case 1: return unpack(f, gather8 (ptr, offset, index));
1357             case 2: return unpack(f, gather16(ptr, offset, index));
1358             case 4: return unpack(f, gather32(ptr, offset, index));
1359             case 8: {
1360                 PixelFormat lo,hi;
1361                 split_disjoint_8byte_format(f, &lo,&hi);
1362                 Color l = unpack(lo, gather32(ptr, offset, (index<<1)+0)),
1363                       h = unpack(hi, gather32(ptr, offset, (index<<1)+1));
1364                 return {
1365                     lo.r_bits ? l.r : h.r,
1366                     lo.g_bits ? l.g : h.g,
1367                     lo.b_bits ? l.b : h.b,
1368                     lo.a_bits ? l.a : h.a,
1369                 };
1370             }
1371             case 16: {
1372                 assert_16byte_is_rgba_f32(f);
1373                 return {
1374                     gatherF(ptr, offset, (index<<2)+0),
1375                     gatherF(ptr, offset, (index<<2)+1),
1376                     gatherF(ptr, offset, (index<<2)+2),
1377                     gatherF(ptr, offset, (index<<2)+3),
1378                 };
1379             }
1380             default: SkUNREACHABLE;
1381         }
1382         return {};
1383     }
1384 
pack32(PixelFormat f,Color c)1385     static I32 pack32(PixelFormat f, Color c) {
1386         SkASSERT(byte_size(f) <= 4);
1387 
1388         auto to_srgb = [](int bits, F32 v) {
1389             const skcms_TransferFunction* tf = skcms_sRGB_Inverse_TransferFunction();
1390             return to_unorm(bits, sk_program_transfer_fn(v, sRGBish_TF,
1391                                                          v->splat(tf->g),
1392                                                          v->splat(tf->a),
1393                                                          v->splat(tf->b),
1394                                                          v->splat(tf->c),
1395                                                          v->splat(tf->d),
1396                                                          v->splat(tf->e),
1397                                                          v->splat(tf->f)));
1398         };
1399 
1400         I32 packed = c->splat(0);
1401         auto pack_rgb = [&](F32 channel, int bits, int shift) {
1402             I32 encoded;
1403             switch (f.encoding) {
1404                 case PixelFormat::UNORM: encoded = to_unorm(bits, channel); break;
1405                 case PixelFormat:: SRGB: encoded = to_srgb (bits, channel); break;
1406                 case PixelFormat::FLOAT: encoded = to_fp16 (      channel); break;
1407             }
1408             packed = pack(packed, encoded, shift);
1409         };
1410         auto pack_alpha = [&](F32 channel, int bits, int shift) {
1411             I32 encoded;
1412             switch (f.encoding) {
1413                 case PixelFormat::UNORM:
1414                 case PixelFormat:: SRGB: encoded = to_unorm(bits, channel); break;
1415                 case PixelFormat::FLOAT: encoded = to_fp16 (      channel); break;
1416             }
1417             packed = pack(packed, encoded, shift);
1418         };
1419         if (f.r_bits) { pack_rgb  (c.r, f.r_bits, f.r_shift); }
1420         if (f.g_bits) { pack_rgb  (c.g, f.g_bits, f.g_shift); }
1421         if (f.b_bits) { pack_rgb  (c.b, f.b_bits, f.b_shift); }
1422         if (f.a_bits) { pack_alpha(c.a, f.a_bits, f.a_shift); }
1423         return packed;
1424     }
1425 
store(PixelFormat f,Ptr ptr,Color c)1426     void Builder::store(PixelFormat f, Ptr ptr, Color c) {
1427         // Detect a grayscale PixelFormat: r,g,b bit counts and shifts all equal.
1428         if (f.r_bits  == f.g_bits  && f.g_bits  == f.b_bits &&
1429             f.r_shift == f.g_shift && f.g_shift == f.b_shift) {
1430 
1431             // TODO: pull these coefficients from an SkColorSpace?  This is sRGB luma/luminance.
1432             c.r = c.r * 0.2126f
1433                 + c.g * 0.7152f
1434                 + c.b * 0.0722f;
1435             f.g_bits = f.b_bits = 0;
1436         }
1437 
1438         switch (byte_size(f)) {
1439             case 1: store8 (ptr, pack32(f,c)); break;
1440             case 2: store16(ptr, pack32(f,c)); break;
1441             case 4: store32(ptr, pack32(f,c)); break;
1442             case 8: {
1443                 PixelFormat lo,hi;
1444                 split_disjoint_8byte_format(f, &lo,&hi);
1445                 store64(ptr, pack32(lo,c)
1446                            , pack32(hi,c));
1447                 break;
1448             }
1449             case 16: {
1450                 assert_16byte_is_rgba_f32(f);
1451                 store128(ptr, pun_to_I32(c.r), pun_to_I32(c.g), pun_to_I32(c.b), pun_to_I32(c.a));
1452                 break;
1453             }
1454             default: SkUNREACHABLE;
1455         }
1456     }
1457 
unpremul(F32 * r,F32 * g,F32 * b,F32 a)1458     void Builder::unpremul(F32* r, F32* g, F32* b, F32 a) {
1459         skvm::F32 invA = 1.0f / a,
1460                   inf  = pun_to_F32(splat(0x7f800000));
1461         // If a is 0, so are *r,*g,*b, so set invA to 0 to avoid 0*inf=NaN (instead 0*0 = 0).
1462         invA = select(invA < inf, invA
1463                                 , 0.0f);
1464         *r *= invA;
1465         *g *= invA;
1466         *b *= invA;
1467     }
1468 
premul(F32 * r,F32 * g,F32 * b,F32 a)1469     void Builder::premul(F32* r, F32* g, F32* b, F32 a) {
1470         *r *= a;
1471         *g *= a;
1472         *b *= a;
1473     }
1474 
uniformColor(SkColor4f color,Uniforms * uniforms)1475     Color Builder::uniformColor(SkColor4f color, Uniforms* uniforms) {
1476         auto [r,g,b,a] = color;
1477         return {
1478             uniformF(uniforms->pushF(r)),
1479             uniformF(uniforms->pushF(g)),
1480             uniformF(uniforms->pushF(b)),
1481             uniformF(uniforms->pushF(a)),
1482         };
1483     }
1484 
lerp(F32 lo,F32 hi,F32 t)1485     F32 Builder::lerp(F32 lo, F32 hi, F32 t) {
1486         if (this->isImm(t.id, 0.0f)) { return lo; }
1487         if (this->isImm(t.id, 1.0f)) { return hi; }
1488         return mad(sub(hi, lo), t, lo);
1489     }
1490 
lerp(Color lo,Color hi,F32 t)1491     Color Builder::lerp(Color lo, Color hi, F32 t) {
1492         return {
1493             lerp(lo.r, hi.r, t),
1494             lerp(lo.g, hi.g, t),
1495             lerp(lo.b, hi.b, t),
1496             lerp(lo.a, hi.a, t),
1497         };
1498     }
1499 
    // Convert an RGBA color to HSLA (hue, saturation, lightness; alpha passes through).
    HSLA Builder::to_hsla(Color c) {
        F32 mx = max(max(c.r,c.g),c.b),     // max channel
            mn = min(min(c.r,c.g),c.b),     // min channel
             d = mx - mn,                   // chroma
          invd = 1.0f / d,                  // +inf when d == 0, but that case is masked by mx == mn below
        g_lt_b = select(c.g < c.b, splat(6.0f)
                                 , splat(0.0f));

        // Hue in [0,1): 0 for gray; otherwise the usual sextant formula based on
        // which channel is the max, with g_lt_b wrapping negative red-sextant hues.
        F32 h = (1/6.0f) * select(mx == mn,  0.0f,
                           select(mx == c.r, invd * (c.g - c.b) + g_lt_b,
                           select(mx == c.g, invd * (c.b - c.r) + 2.0f
                                           , invd * (c.r - c.g) + 4.0f)));

        // Lightness is the average of the extremes; saturation is chroma scaled
        // by sum or (2 - sum) depending on which side of mid-lightness we're on.
        F32 sum = mx + mn,
              l = sum * 0.5f,
              s = select(mx == mn, 0.0f
                                 , d / select(l > 0.5f, 2.0f - sum
                                                      , sum));
        return {h, s, l, c.a};
    }
1520 
    // Convert an HSLA color back to RGBA (alpha passes through).
    Color Builder::to_rgba(HSLA c) {
        // See GrRGBToHSLFilterEffect.fp

        auto [h,s,l,a] = c;
        F32 x = s * (1.0f - abs(l + l - 1.0f));   // chroma

        // Triangle-wave mapping from a hue offset to one channel's value.
        auto hue_to_rgb = [&,l=l](auto hue) {
            auto q = abs(6.0f * fract(hue) - 3.0f) - 1.0f;
            return x * (clamp01(q) - 0.5f) + l;
        };

        // R, G, B sample the wave at hue offsets 0, 2/3, 1/3 respectively.
        return {
            hue_to_rgb(h + 0/3.0f),
            hue_to_rgb(h + 2/3.0f),
            hue_to_rgb(h + 1/3.0f),
            c.a,
        };
    }
1539 
1540     // We're basing our implementation of non-separable blend modes on
1541     //   https://www.w3.org/TR/compositing-1/#blendingnonseparable.
1542     // and
1543     //   https://www.khronos.org/registry/OpenGL/specs/es/3.2/es_spec_3.2.pdf
1544     // They're equivalent, but ES' math has been better simplified.
1545     //
1546     // Anything extra we add beyond that is to make the math work with premul inputs.
1547 
saturation(skvm::F32 r,skvm::F32 g,skvm::F32 b)1548     static skvm::F32 saturation(skvm::F32 r, skvm::F32 g, skvm::F32 b) {
1549         return max(r, max(g, b))
1550              - min(r, min(g, b));
1551     }
1552 
luminance(skvm::F32 r,skvm::F32 g,skvm::F32 b)1553     static skvm::F32 luminance(skvm::F32 r, skvm::F32 g, skvm::F32 b) {
1554         return r*0.30f + g*0.59f + b*0.11f;
1555     }
1556 
set_sat(skvm::F32 * r,skvm::F32 * g,skvm::F32 * b,skvm::F32 s)1557     static void set_sat(skvm::F32* r, skvm::F32* g, skvm::F32* b, skvm::F32 s) {
1558         F32 mn  = min(*r, min(*g, *b)),
1559             mx  = max(*r, max(*g, *b)),
1560             sat = mx - mn;
1561 
1562         // Map min channel to 0, max channel to s, and scale the middle proportionally.
1563         auto scale = [&](skvm::F32 c) {
1564             auto scaled = ((c - mn) * s) / sat;
1565             return select(is_finite(scaled), scaled, 0.0f);
1566         };
1567         *r = scale(*r);
1568         *g = scale(*g);
1569         *b = scale(*b);
1570     }
1571 
set_lum(skvm::F32 * r,skvm::F32 * g,skvm::F32 * b,skvm::F32 lu)1572     static void set_lum(skvm::F32* r, skvm::F32* g, skvm::F32* b, skvm::F32 lu) {
1573         auto diff = lu - luminance(*r, *g, *b);
1574         *r += diff;
1575         *g += diff;
1576         *b += diff;
1577     }
1578 
clip_color(skvm::F32 * r,skvm::F32 * g,skvm::F32 * b,skvm::F32 a)1579     static void clip_color(skvm::F32* r, skvm::F32* g, skvm::F32* b, skvm::F32 a) {
1580         F32 mn  = min(*r, min(*g, *b)),
1581             mx  = max(*r, max(*g, *b)),
1582             lu = luminance(*r, *g, *b);
1583 
1584         auto clip = [&](auto c) {
1585             c = select(mn >= 0, c
1586                               , lu + ((c-lu)*(  lu)) / (lu-mn));
1587             c = select(mx >  a, lu + ((c-lu)*(a-lu)) / (mx-lu)
1588                               , c);
1589             return clamp01(c);  // May be a little negative, or worse, NaN.
1590         };
1591         *r = clip(*r);
1592         *g = clip(*g);
1593         *b = clip(*b);
1594     }
1595 
    // Blend premultiplied src over/into premultiplied dst according to mode.
    Color Builder::blend(SkBlendMode mode, Color src, Color dst) {
        // mma() = x*y + z*w; handy for the recurring "s*(1-dst.a) + d*(1-src.a)" terms.
        auto mma = [](skvm::F32 x, skvm::F32 y, skvm::F32 z, skvm::F32 w) {
            return x*y + z*w;
        };

        auto two = [](skvm::F32 x) { return x+x; };   // 2*x

        // Apply fn independently to all four channels.
        auto apply_rgba = [&](auto fn) {
            return Color {
                fn(src.r, dst.r),
                fn(src.g, dst.g),
                fn(src.b, dst.b),
                fn(src.a, dst.a),
            };
        };

        // Apply fn to r,g,b; alpha always blends as srcover.
        auto apply_rgb_srcover_a = [&](auto fn) {
            return Color {
                fn(src.r, dst.r),
                fn(src.g, dst.g),
                fn(src.b, dst.b),
                mad(dst.a, 1-src.a, src.a),   // srcover for alpha
            };
        };

        // Shared wrap-up for the non-separable modes (hue/saturation/color/luminosity).
        auto non_sep = [&](auto R, auto G, auto B) {
            return Color{
                R + mma(src.r, 1-dst.a,  dst.r, 1-src.a),
                G + mma(src.g, 1-dst.a,  dst.g, 1-src.a),
                B + mma(src.b, 1-dst.a,  dst.b, 1-src.a),
                mad(dst.a, 1-src.a, src.a),   // srcover for alpha
            };
        };

        switch (mode) {
            default:
                SkASSERT(false);
                [[fallthrough]]; /*but also, for safety, fallthrough*/

            case SkBlendMode::kClear: return { splat(0.0f), splat(0.0f), splat(0.0f), splat(0.0f) };

            case SkBlendMode::kSrc: return src;
            case SkBlendMode::kDst: return dst;

            case SkBlendMode::kDstOver: std::swap(src, dst); [[fallthrough]];
            case SkBlendMode::kSrcOver:
                return apply_rgba([&](auto s, auto d) {
                    return mad(d,1-src.a, s);
                });

            case SkBlendMode::kDstIn: std::swap(src, dst); [[fallthrough]];
            case SkBlendMode::kSrcIn:
                return apply_rgba([&](auto s, auto d) {
                    return s * dst.a;
                });

            case SkBlendMode::kDstOut: std::swap(src, dst); [[fallthrough]];

            case SkBlendMode::kSrcOut:
                return apply_rgba([&](auto s, auto d) {
                    return s * (1-dst.a);
                });

            case SkBlendMode::kDstATop: std::swap(src, dst); [[fallthrough]];
            case SkBlendMode::kSrcATop:
                return apply_rgba([&](auto s, auto d) {
                    return mma(s, dst.a,  d, 1-src.a);
                });

            case SkBlendMode::kXor:
                return apply_rgba([&](auto s, auto d) {
                    return mma(s, 1-dst.a,  d, 1-src.a);
                });

            case SkBlendMode::kPlus:
                return apply_rgba([&](auto s, auto d) {
                    return min(s+d, 1.0f);
                });

            case SkBlendMode::kModulate:
                return apply_rgba([&](auto s, auto d) {
                    return s * d;
                });

            case SkBlendMode::kScreen:
                // (s+d)-(s*d) gave us trouble with our "r,g,b <= after blending" asserts.
                // It's kind of plausible that s + (d - sd) keeps more precision?
                return apply_rgba([&](auto s, auto d) {
                    return s + (d - s*d);
                });

            case SkBlendMode::kDarken:
                return apply_rgb_srcover_a([&](auto s, auto d) {
                    return s + (d - max(s * dst.a,
                                        d * src.a));
                });

            case SkBlendMode::kLighten:
                return apply_rgb_srcover_a([&](auto s, auto d) {
                    return s + (d - min(s * dst.a,
                                        d * src.a));
                });

            case SkBlendMode::kDifference:
                return apply_rgb_srcover_a([&](auto s, auto d) {
                    return s + (d - two(min(s * dst.a,
                                            d * src.a)));
                });

            case SkBlendMode::kExclusion:
                return apply_rgb_srcover_a([&](auto s, auto d) {
                    return s + (d - two(s * d));
                });

            // Burn and dodge divide, so both guard the degenerate cases with select().
            case SkBlendMode::kColorBurn:
                return apply_rgb_srcover_a([&](auto s, auto d) {
                    auto mn   = min(dst.a,
                                    src.a * (dst.a - d) / s),
                         burn = src.a * (dst.a - mn) + mma(s, 1-dst.a, d, 1-src.a);
                    return select(d == dst.a     , s * (1-dst.a) + d,
                           select(is_finite(burn), burn
                                                 , d * (1-src.a) + s));
                });

            case SkBlendMode::kColorDodge:
                return apply_rgb_srcover_a([&](auto s, auto d) {
                    auto dodge = src.a * min(dst.a,
                                             d * src.a / (src.a - s))
                                       + mma(s, 1-dst.a, d, 1-src.a);
                    return select(d == 0.0f       , s * (1-dst.a) + d,
                           select(is_finite(dodge), dodge
                                                  , d * (1-src.a) + s));
                });

            case SkBlendMode::kHardLight:
                return apply_rgb_srcover_a([&](auto s, auto d) {
                    return mma(s, 1-dst.a, d, 1-src.a) +
                           select(two(s) <= src.a,
                                  two(s * d),
                                  src.a * dst.a - two((dst.a - d) * (src.a - s)));
                });

            // Overlay is hardlight with src and dst swapped inside the select.
            case SkBlendMode::kOverlay:
                return apply_rgb_srcover_a([&](auto s, auto d) {
                    return mma(s, 1-dst.a, d, 1-src.a) +
                           select(two(d) <= dst.a,
                                  two(s * d),
                                  src.a * dst.a - two((dst.a - d) * (src.a - s)));
                });

            case SkBlendMode::kMultiply:
                return apply_rgba([&](auto s, auto d) {
                    return mma(s, 1-dst.a, d, 1-src.a) + s * d;
                });

            case SkBlendMode::kSoftLight:
                return apply_rgb_srcover_a([&](auto s, auto d) {
                    auto  m = select(dst.a > 0.0f, d / dst.a
                                                 , 0.0f),
                         s2 = two(s),
                         m4 = 4*m;

                         // The logic forks three ways:
                         //    1. dark src?
                         //    2. light src, dark dst?
                         //    3. light src, light dst?

                         // Used in case 1
                    auto darkSrc = d * ((s2-src.a) * (1-m) + src.a),
                         // Used in case 2
                         darkDst = (m4 * m4 + m4) * (m-1) + 7*m,
                         // Used in case 3.
                         liteDst = sqrt(m) - m,
                         // Used in 2 or 3?
                         liteSrc = dst.a * (s2 - src.a) * select(4*d <= dst.a, darkDst
                                                                             , liteDst)
                                   + d * src.a;
                    return s * (1-dst.a) + d * (1-src.a) + select(s2 <= src.a, darkSrc
                                                                             , liteSrc);
                });

            case SkBlendMode::kHue: {
                skvm::F32 R = src.r * src.a,
                          G = src.g * src.a,
                          B = src.b * src.a;

                set_sat   (&R, &G, &B, src.a * saturation(dst.r, dst.g, dst.b));
                set_lum   (&R, &G, &B, src.a * luminance (dst.r, dst.g, dst.b));
                clip_color(&R, &G, &B, src.a * dst.a);

                return non_sep(R, G, B);
            }

            case SkBlendMode::kSaturation: {
                skvm::F32 R = dst.r * src.a,
                          G = dst.g * src.a,
                          B = dst.b * src.a;

                set_sat   (&R, &G, &B, dst.a * saturation(src.r, src.g, src.b));
                set_lum   (&R, &G, &B, src.a * luminance (dst.r, dst.g, dst.b));
                clip_color(&R, &G, &B, src.a * dst.a);

                return non_sep(R, G, B);
            }

            case SkBlendMode::kColor: {
                skvm::F32 R = src.r * dst.a,
                          G = src.g * dst.a,
                          B = src.b * dst.a;

                set_lum   (&R, &G, &B, src.a * luminance(dst.r, dst.g, dst.b));
                clip_color(&R, &G, &B, src.a * dst.a);

                return non_sep(R, G, B);
            }

            case SkBlendMode::kLuminosity: {
                skvm::F32 R = dst.r * src.a,
                          G = dst.g * src.a,
                          B = dst.b * src.a;

                set_lum   (&R, &G, &B, dst.a * luminance(src.r, src.g, src.b));
                clip_color(&R, &G, &B, dst.a * src.a);

                return non_sep(R, G, B);
            }
        }
    }
1824 
1825     // ~~~~ Program::eval() and co. ~~~~ //
1826 
1827     // Handy references for x86-64 instruction encoding:
1828     // https://wiki.osdev.org/X86-64_Instruction_Encoding
1829     // https://www-user.tu-chemnitz.de/~heha/viewchm.php/hs/x86.chm/x64.htm
1830     // https://www-user.tu-chemnitz.de/~heha/viewchm.php/hs/x86.chm/x86.htm
1831     // http://ref.x86asm.net/coder64.html
1832 
1833     // Used for ModRM / immediate instruction encoding.
_233(int a,int b,int c)1834     static uint8_t _233(int a, int b, int c) {
1835         return (a & 3) << 6
1836              | (b & 7) << 3
1837              | (c & 7) << 0;
1838     }
1839 
    // ModRM byte encodes the arguments of an opcode.
    enum class Mod { Indirect, OneByteImm, FourByteImm, Direct };
    static uint8_t mod_rm(Mod mod, int reg, int rm) {
        // Addressing mode in the top 2 bits, then reg (3 bits), then r/m (3 bits).
        return _233((int)mod, reg, rm);
    }
1845 
mod(int imm)1846     static Mod mod(int imm) {
1847         if (imm == 0)               { return Mod::Indirect; }
1848         if (SkTFitsIn<int8_t>(imm)) { return Mod::OneByteImm; }
1849         return Mod::FourByteImm;
1850     }
1851 
imm_bytes(Mod mod)1852     static int imm_bytes(Mod mod) {
1853         switch (mod) {
1854             case Mod::Indirect:    return 0;
1855             case Mod::OneByteImm:  return 1;
1856             case Mod::FourByteImm: return 4;
1857             case Mod::Direct: SkUNREACHABLE;
1858         }
1859         SkUNREACHABLE;
1860     }
1861 
1862     // SIB byte encodes a memory address, base + (index * scale).
    static uint8_t sib(Assembler::Scale scale, int index, int base) {
        // Same 2/3/3-bit field layout as ModRM: scale, index, base (high to low).
        return _233((int)scale, index, base);
    }
1866 
1867     // The REX prefix is used to extend most old 32-bit instructions to 64-bit.
rex(bool W,bool R,bool X,bool B)1868     static uint8_t rex(bool W,   // If set, operation is 64-bit, otherwise default, usually 32-bit.
1869                        bool R,   // Extra top bit to select ModRM reg, registers 8-15.
1870                        bool X,   // Extra top bit for SIB index register.
1871                        bool B) { // Extra top bit for SIB base or ModRM rm register.
1872         return 0b01000000   // Fixed 0100 for top four bits.
1873              | (W << 3)
1874              | (R << 2)
1875              | (X << 1)
1876              | (B << 0);
1877     }
1878 
1879 
    // The VEX prefix extends SSE operations to AVX.  Used generally, even with XMM.
    struct VEX {
        int     len;        // Number of prefix bytes actually used: 2 or 3.
        uint8_t bytes[3];
    };
1885 
    // Build a 2- or 3-byte VEX prefix from its logical fields.
    static VEX vex(bool  WE,   // Like REX W for int operations, or opcode extension for float?
                   bool   R,   // Same as REX R.  Pass high bit of dst register, dst>>3.
                   bool   X,   // Same as REX X.
                   bool   B,   // Same as REX B.  Pass y>>3 for 3-arg ops, x>>3 for 2-arg.
                   int  map,   // SSE opcode map selector: 0x0f, 0x380f, 0x3a0f.
                   int vvvv,   // 4-bit second operand register.  Pass our x for 3-arg ops.
                   bool   L,   // Set for 256-bit ymm operations, off for 128-bit xmm.
                   int   pp) { // SSE mandatory prefix: 0x66, 0xf3, 0xf2, else none.

        // Pack x86 opcode map selector to 5-bit VEX encoding.
        map = [map]{
            switch (map) {
                case   0x0f: return 0b00001;
                case 0x380f: return 0b00010;
                case 0x3a0f: return 0b00011;
                // Several more cases only used by XOP / TBM.
            }
            SkUNREACHABLE;
        }();

        // Pack  mandatory SSE opcode prefix byte to 2-bit VEX encoding.
        pp = [pp]{
            switch (pp) {
                case 0x66: return 0b01;
                case 0xf3: return 0b10;
                case 0xf2: return 0b11;
            }
            return 0b00;
        }();

        VEX vex = {0, {0,0,0}};
        if (X == 0 && B == 0 && WE == 0 && map == 0b00001) {
            // With these conditions met, we can optionally compress VEX to 2-byte.
            // Note R and vvvv are stored bit-inverted (~) in the encoding.
            vex.len = 2;
            vex.bytes[0] = 0xc5;
            vex.bytes[1] = (pp      &  3) << 0
                         | (L       &  1) << 2
                         | (~vvvv   & 15) << 3
                         | (~(int)R &  1) << 7;
        } else {
            // We could use this 3-byte VEX prefix all the time if we like.
            vex.len = 3;
            vex.bytes[0] = 0xc4;
            vex.bytes[1] = (map     & 31) << 0
                         | (~(int)B &  1) << 5
                         | (~(int)X &  1) << 6
                         | (~(int)R &  1) << 7;
            vex.bytes[2] = (pp    &  3) << 0
                         | (L     &  1) << 2
                         | (~vvvv & 15) << 3
                         | (WE    &  1) << 7;
        }
        return vex;
    }
1940 
    // buf may be nullptr: bytes() then skips writing, and the Assembler only
    // measures how many bytes the program would need (a sizing pass).
    Assembler::Assembler(void* buf) : fCode((uint8_t*)buf), fSize(0) {}
1942 
    // Bytes emitted (or, in a sizing pass, counted) so far.
    size_t Assembler::size() const { return fSize; }
1944 
bytes(const void * p,int n)1945     void Assembler::bytes(const void* p, int n) {
1946         if (fCode) {
1947             memcpy(fCode+fSize, p, n);
1948         }
1949         fSize += n;
1950     }
1951 
    void Assembler::byte(uint8_t b) { this->bytes(&b, 1); }    // Emit one byte.
    void Assembler::word(uint32_t w) { this->bytes(&w, 4); }   // Emit four bytes (host byte order).
1954 
align(int mod)1955     void Assembler::align(int mod) {
1956         while (this->size() % mod) {
1957             this->byte(0x00);
1958         }
1959     }
1960 
    void Assembler::int3() {
        this->byte(0xcc);   // cc: int3, a one-byte software breakpoint.
    }
1964 
    void Assembler::vzeroupper() {
        // c5 f8 77: the 2-byte-VEX encoding of vzeroupper.
        this->byte(0xc5);
        this->byte(0xf8);
        this->byte(0x77);
    }
    void Assembler::ret() { this->byte(0xc3); }   // c3: near return.
1971 
    // Emit one 64-bit GP instruction: REX prefix, 1- or 2-byte opcode,
    // ModRM, and (for memory operands) optional SIB and displacement.
    void Assembler::op(int opcode, Operand dst, GP64 x) {
        if (dst.kind == Operand::REG) {
            // Direct register-to-register form.
            this->byte(rex(W1,x>>3,0,dst.reg>>3));
            this->bytes(&opcode, SkTFitsIn<uint8_t>(opcode) ? 1 : 2);
            this->byte(mod_rm(Mod::Direct, x, dst.reg&7));
        } else {
            SkASSERT(dst.kind == Operand::MEM);
            const Mem& m = dst.mem;
            // An rsp-class base or any real index register forces a SIB byte.
            const bool need_SIB = (m.base&7) == rsp
                               || m.index != rsp;

            this->byte(rex(W1,x>>3,m.index>>3,m.base>>3));
            this->bytes(&opcode, SkTFitsIn<uint8_t>(opcode) ? 1 : 2);
            this->byte(mod_rm(mod(m.disp), x&7, (need_SIB ? rsp : m.base)&7));
            if (need_SIB) {
                this->byte(sib(m.scale, m.index&7, m.base&7));
            }
            // 0, 1, or 4 displacement bytes depending on how big m.disp is.
            this->bytes(&m.disp, imm_bytes(mod(m.disp)));
        }
    }
1992 
    // Emit an ALU instruction taking an immediate; opcode_ext is the /digit
    // extension that rides in the ModRM reg field.
    void Assembler::op(int opcode, int opcode_ext, Operand dst, int imm) {
        opcode |= 0b1000'0000;   // top bit set for instructions with any immediate

        int imm_bytes = 4;
        if (SkTFitsIn<int8_t>(imm)) {
            imm_bytes = 1;
            opcode |= 0b0000'0010;  // second bit set for 8-bit immediate, else 32-bit.
        }

        this->op(opcode, dst, (GP64)opcode_ext);
        this->bytes(&imm, imm_bytes);
    }
2005 
    // ALU-with-immediate forms; the second argument is the /digit opcode extension.
    void Assembler::add(Operand dst, int imm) { this->op(0x01,0b000, dst,imm); }
    void Assembler::sub(Operand dst, int imm) { this->op(0x01,0b101, dst,imm); }
    void Assembler::cmp(Operand dst, int imm) { this->op(0x01,0b111, dst,imm); }
2009 
    // These don't work quite like the other instructions with immediates:
    // these immediates are always fixed size at 4 bytes or 1 byte.
    void Assembler::mov(Operand dst, int imm) {
        this->op(0xC7,dst,(GP64)0b000);
        this->word(imm);
    }
    void Assembler::movb(Operand dst, int imm) {
        this->op(0xC6,dst,(GP64)0b000);
        this->byte(imm);
    }
2020 
    // Register/memory forms: "op r/m, reg" uses the store-direction opcodes...
    void Assembler::add (Operand dst, GP64 x) { this->op(0x01, dst,x); }
    void Assembler::sub (Operand dst, GP64 x) { this->op(0x29, dst,x); }
    void Assembler::cmp (Operand dst, GP64 x) { this->op(0x39, dst,x); }
    void Assembler::mov (Operand dst, GP64 x) { this->op(0x89, dst,x); }
    void Assembler::movb(Operand dst, GP64 x) { this->op(0x88, dst,x); }

    // ...while "op reg, r/m" uses the load-direction opcodes (operands swapped).
    void Assembler::add (GP64 dst, Operand x) { this->op(0x03, x,dst); }
    void Assembler::sub (GP64 dst, Operand x) { this->op(0x2B, x,dst); }
    void Assembler::cmp (GP64 dst, Operand x) { this->op(0x3B, x,dst); }
    void Assembler::mov (GP64 dst, Operand x) { this->op(0x8B, x,dst); }
    void Assembler::movb(GP64 dst, Operand x) { this->op(0x8A, x,dst); }

    // Zero-extending byte/word loads (two-byte 0F B6 / 0F B7 opcodes).
    void Assembler::movzbq(GP64 dst, Operand x) { this->op(0xB60F, x,dst); }
    void Assembler::movzwq(GP64 dst, Operand x) { this->op(0xB70F, x,dst); }
2035 
    // VEX-encoded 32-bit integer lane arithmetic (prefix, opcode map, opcode).
    void Assembler::vpaddd (Ymm dst, Ymm x, Operand y) { this->op(0x66,  0x0f,0xfe, dst,x,y); }
    void Assembler::vpsubd (Ymm dst, Ymm x, Operand y) { this->op(0x66,  0x0f,0xfa, dst,x,y); }
    void Assembler::vpmulld(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x40, dst,x,y); }

    // 16-bit integer lane arithmetic, averaging, and min/max.
    void Assembler::vpaddw   (Ymm dst, Ymm x, Operand y) { this->op(0x66,  0x0f,0xfd, dst,x,y); }
    void Assembler::vpsubw   (Ymm dst, Ymm x, Operand y) { this->op(0x66,  0x0f,0xf9, dst,x,y); }
    void Assembler::vpmullw  (Ymm dst, Ymm x, Operand y) { this->op(0x66,  0x0f,0xd5, dst,x,y); }
    void Assembler::vpavgw   (Ymm dst, Ymm x, Operand y) { this->op(0x66,  0x0f,0xe3, dst,x,y); }
    void Assembler::vpmulhrsw(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x0b, dst,x,y); }
    void Assembler::vpminsw  (Ymm dst, Ymm x, Operand y) { this->op(0x66,  0x0f,0xea, dst,x,y); }
    void Assembler::vpmaxsw  (Ymm dst, Ymm x, Operand y) { this->op(0x66,  0x0f,0xee, dst,x,y); }
    void Assembler::vpminuw  (Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x3a, dst,x,y); }
    void Assembler::vpmaxuw  (Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x3e, dst,x,y); }

    // Absolute value of 16-bit lanes (two-operand form).
    void Assembler::vpabsw(Ymm dst, Operand x) { this->op(0x66,0x380f,0x1d, dst,x); }
2051 
2052 
    // Bitwise logic on full ymm registers.
    void Assembler::vpand (Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0xdb, dst,x,y); }
    void Assembler::vpor  (Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0xeb, dst,x,y); }
    void Assembler::vpxor (Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0xef, dst,x,y); }
    void Assembler::vpandn(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0xdf, dst,x,y); }

    // Single-precision float lane arithmetic (no mandatory prefix, map 0x0f).
    void Assembler::vaddps(Ymm dst, Ymm x, Operand y) { this->op(0,0x0f,0x58, dst,x,y); }
    void Assembler::vsubps(Ymm dst, Ymm x, Operand y) { this->op(0,0x0f,0x5c, dst,x,y); }
    void Assembler::vmulps(Ymm dst, Ymm x, Operand y) { this->op(0,0x0f,0x59, dst,x,y); }
    void Assembler::vdivps(Ymm dst, Ymm x, Operand y) { this->op(0,0x0f,0x5e, dst,x,y); }
    void Assembler::vminps(Ymm dst, Ymm x, Operand y) { this->op(0,0x0f,0x5d, dst,x,y); }
    void Assembler::vmaxps(Ymm dst, Ymm x, Operand y) { this->op(0,0x0f,0x5f, dst,x,y); }

    // Fused multiply-add family; 132/213/231 name which operands multiply vs. add.
    void Assembler::vfmadd132ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x98, dst,x,y); }
    void Assembler::vfmadd213ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0xa8, dst,x,y); }
    void Assembler::vfmadd231ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0xb8, dst,x,y); }

    void Assembler::vfmsub132ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x9a, dst,x,y); }
    void Assembler::vfmsub213ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0xaa, dst,x,y); }
    void Assembler::vfmsub231ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0xba, dst,x,y); }

    void Assembler::vfnmadd132ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x9c, dst,x,y); }
    void Assembler::vfnmadd213ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0xac, dst,x,y); }
    void Assembler::vfnmadd231ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0xbc, dst,x,y); }

    // Narrowing packs with unsigned saturation, and dword interleaves.
    void Assembler::vpackusdw(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x2b, dst,x,y); }
    void Assembler::vpackuswb(Ymm dst, Ymm x, Operand y) { this->op(0x66,  0x0f,0x67, dst,x,y); }

    void Assembler::vpunpckldq(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0x62, dst,x,y); }
    void Assembler::vpunpckhdq(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0x6a, dst,x,y); }

    // Integer lane compares producing all-ones/all-zeros masks.
    void Assembler::vpcmpeqd(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0x76, dst,x,y); }
    void Assembler::vpcmpeqw(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0x75, dst,x,y); }
    void Assembler::vpcmpgtd(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0x66, dst,x,y); }
    void Assembler::vpcmpgtw(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0x65, dst,x,y); }
2087 
2088 
    // Append an immediate byte that follows an already-emitted operand.
    void Assembler::imm_byte_after_operand(const Operand& operand, int imm) {
        // When we've embedded a label displacement in the middle of an instruction,
        // we need to tweak it a little so that the resolved displacement starts
        // from the end of the instruction and not the end of the displacement.
        if (operand.kind == Operand::LABEL && fCode) {
            // Patch the 4-byte displacement just written: the extra immediate byte
            // moves the end of the instruction one byte later, so back it off by one.
            int disp;
            memcpy(&disp, fCode+fSize-4, 4);
            disp--;
            memcpy(fCode+fSize-4, &disp, 4);
        }
        this->byte(imm);
    }
2101 
    // Float compare; imm selects the comparison predicate.
    void Assembler::vcmpps(Ymm dst, Ymm x, Operand y, int imm) {
        this->op(0,0x0f,0xc2, dst,x,y);
        this->imm_byte_after_operand(y, imm);
    }

    // Byte-wise blend; the selector register z rides in the top nibble of the immediate.
    void Assembler::vpblendvb(Ymm dst, Ymm x, Operand y, Ymm z) {
        this->op(0x66,0x3a0f,0x4c, dst,x,y);
        this->imm_byte_after_operand(y, z << 4);
    }
2111 
    // Shift instructions encode their opcode extension as "dst", dst as x, and x as y.
    // (The (Ymm)N casts below are those /N opcode extensions, not registers.)
    void Assembler::vpslld(Ymm dst, Ymm x, int imm) {
        this->op(0x66,0x0f,0x72,(Ymm)6, dst,x);
        this->byte(imm);
    }
    void Assembler::vpsrld(Ymm dst, Ymm x, int imm) {
        this->op(0x66,0x0f,0x72,(Ymm)2, dst,x);
        this->byte(imm);
    }
    void Assembler::vpsrad(Ymm dst, Ymm x, int imm) {
        this->op(0x66,0x0f,0x72,(Ymm)4, dst,x);
        this->byte(imm);
    }
    void Assembler::vpsllw(Ymm dst, Ymm x, int imm) {
        this->op(0x66,0x0f,0x71,(Ymm)6, dst,x);
        this->byte(imm);
    }
    void Assembler::vpsrlw(Ymm dst, Ymm x, int imm) {
        this->op(0x66,0x0f,0x71,(Ymm)2, dst,x);
        this->byte(imm);
    }
    void Assembler::vpsraw(Ymm dst, Ymm x, int imm) {
        this->op(0x66,0x0f,0x71,(Ymm)4, dst,x);
        this->byte(imm);
    }
2137 
    // Permute 64-bit lanes by immediate.
    void Assembler::vpermq(Ymm dst, Operand x, int imm) {
        // A bit unusual among the instructions we use, this is 64-bit operation, so we set W.
        this->op(0x66,0x3a0f,0x00, dst,x,W1);
        this->imm_byte_after_operand(x, imm);
    }

    // Shuffle 128-bit halves of two ymm sources by immediate.
    void Assembler::vperm2f128(Ymm dst, Ymm x, Operand y, int imm) {
        this->op(0x66,0x3a0f,0x06, dst,x,y);
        this->imm_byte_after_operand(y, imm);
    }

    // Permute 32-bit lanes of src using per-lane indices in ix.
    void Assembler::vpermps(Ymm dst, Ymm ix, Operand src) {
        this->op(0x66,0x380f,0x16, dst,ix,src);
    }

    // Round float lanes; the Rounding immediate selects the mode.
    void Assembler::vroundps(Ymm dst, Operand x, Rounding imm) {
        this->op(0x66,0x3a0f,0x08, dst,x);
        this->imm_byte_after_operand(x, imm);
    }
2157 
    // Vector moves: aligned integer load, and unaligned loads/stores (0x10 load, 0x11 store).
    void Assembler::vmovdqa(Ymm dst, Operand src) { this->op(0x66,0x0f,0x6f, dst,src); }
    void Assembler::vmovups(Ymm dst, Operand src) { this->op(   0,0x0f,0x10, dst,src); }
    void Assembler::vmovups(Xmm dst, Operand src) { this->op(   0,0x0f,0x10, dst,src); }
    void Assembler::vmovups(Operand dst, Ymm src) { this->op(   0,0x0f,0x11, src,dst); }
    void Assembler::vmovups(Operand dst, Xmm src) { this->op(   0,0x0f,0x11, src,dst); }

    // int<->float lane conversions (vcvttps2dq truncates, vcvtps2dq rounds) and sqrt.
    void Assembler::vcvtdq2ps (Ymm dst, Operand x) { this->op(   0,0x0f,0x5b, dst,x); }
    void Assembler::vcvttps2dq(Ymm dst, Operand x) { this->op(0xf3,0x0f,0x5b, dst,x); }
    void Assembler::vcvtps2dq (Ymm dst, Operand x) { this->op(0x66,0x0f,0x5b, dst,x); }
    void Assembler::vsqrtps   (Ymm dst, Operand x) { this->op(   0,0x0f,0x51, dst,x); }
2168 
    // f32 -> f16 conversion (dst is the memory/register destination, so operands swap).
    void Assembler::vcvtps2ph(Operand dst, Ymm x, Rounding imm) {
        this->op(0x66,0x3a0f,0x1d, x,dst);
        this->imm_byte_after_operand(dst, imm);
    }
    // f16 -> f32 conversion.
    void Assembler::vcvtph2ps(Ymm dst, Operand x) {
        this->op(0x66,0x380f,0x13, dst,x);
    }
2176 
    // Record a reference to label l at the current position and return the
    // ARM 19-bit branch displacement (in 4-byte instruction units).
    int Assembler::disp19(Label* l) {
        SkASSERT(l->kind == Label::NotYetSet ||
                 l->kind == Label::ARMDisp19);
        int here = (int)this->size();
        l->kind = Label::ARMDisp19;
        l->references.push_back(here);
        // ARM 19-bit instruction count, from the beginning of this instruction.
        return (l->offset - here) / 4;
    }
2186 
    // Record a reference to label l at the current position and return the
    // x86 32-bit displacement, measured from the end of the 4-byte field.
    int Assembler::disp32(Label* l) {
        SkASSERT(l->kind == Label::NotYetSet ||
                 l->kind == Label::X86Disp32);
        int here = (int)this->size();
        l->kind = Label::X86Disp32;
        l->references.push_back(here);
        // x86 32-bit byte count, from the end of this instruction.
        return l->offset - (here + 4);
    }
2196 
op(int prefix,int map,int opcode,int dst,int x,Operand y,W w,L l)2197     void Assembler::op(int prefix, int map, int opcode, int dst, int x, Operand y, W w, L l) {
2198         switch (y.kind) {
2199             case Operand::REG: {
2200                 VEX v = vex(w, dst>>3, 0, y.reg>>3,
2201                             map, x, l, prefix);
2202                 this->bytes(v.bytes, v.len);
2203                 this->byte(opcode);
2204                 this->byte(mod_rm(Mod::Direct, dst&7, y.reg&7));
2205             } return;
2206 
2207             case Operand::MEM: {
2208                 // Passing rsp as the rm argument to mod_rm() signals an SIB byte follows;
2209                 // without an SIB byte, that's where the base register would usually go.
2210                 // This means we have to use an SIB byte if we want to use rsp as a base register.
2211                 const Mem& m = y.mem;
2212                 const bool need_SIB = m.base  == rsp
2213                                    || m.index != rsp;
2214 
2215                 VEX v = vex(w, dst>>3, m.index>>3, m.base>>3,
2216                             map, x, l, prefix);
2217                 this->bytes(v.bytes, v.len);
2218                 this->byte(opcode);
2219                 this->byte(mod_rm(mod(m.disp), dst&7, (need_SIB ? rsp : m.base)&7));
2220                 if (need_SIB) {
2221                     this->byte(sib(m.scale, m.index&7, m.base&7));
2222                 }
2223                 this->bytes(&m.disp, imm_bytes(mod(m.disp)));
2224             } return;
2225 
2226             case Operand::LABEL: {
2227                 // IP-relative addressing uses Mod::Indirect with the R/M encoded as-if rbp or r13.
2228                 const int rip = rbp;
2229 
2230                 VEX v = vex(w, dst>>3, 0, rip>>3,
2231                             map, x, l, prefix);
2232                 this->bytes(v.bytes, v.len);
2233                 this->byte(opcode);
2234                 this->byte(mod_rm(Mod::Indirect, dst&7, rip&7));
2235                 this->word(this->disp32(y.label));
2236             } return;
2237         }
2238     }
2239 
    // Byte shuffle, vector test (sets flags), and broadcast a 32-bit scalar to all lanes.
    void Assembler::vpshufb(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x00, dst,x,y); }

    void Assembler::vptest(Ymm x, Operand y) { this->op(0x66, 0x380f, 0x17, x,y); }

    void Assembler::vbroadcastss(Ymm dst, Operand y) { this->op(0x66,0x380f,0x18, dst,y); }
2245 
    // Emit a conditional jump to label l using the near (6-byte) form.
    void Assembler::jump(uint8_t condition, Label* l) {
        // These conditional jumps can be either 2 bytes (short) or 6 bytes (near):
        //    7?     one-byte-disp
        //    0F 8? four-byte-disp
        // We always use the near displacement to make updating labels simpler (no resizing).
        this->byte(0x0f);
        this->byte(condition);
        this->word(this->disp32(l));
    }
    // Condition bytes: 0x84 equal, 0x85 not-equal, 0x8c less (signed), 0x82 carry/below.
    void Assembler::je (Label* l) { this->jump(0x84, l); }
    void Assembler::jne(Label* l) { this->jump(0x85, l); }
    void Assembler::jl (Label* l) { this->jump(0x8c, l); }
    void Assembler::jc (Label* l) { this->jump(0x82, l); }

    void Assembler::jmp(Label* l) {
        // Like above in jump(), we could use 8-bit displacement here, but always use 32-bit.
        this->byte(0xe9);
        this->word(this->disp32(l));
    }
2265 
    // Zero-extending widens: u16 -> u32 and u8 -> u32 per lane.
    void Assembler::vpmovzxwd(Ymm dst, Operand src) { this->op(0x66,0x380f,0x33, dst,src); }
    void Assembler::vpmovzxbd(Ymm dst, Operand src) { this->op(0x66,0x380f,0x31, dst,src); }

    // Store the low 64 bits of an xmm register.
    void Assembler::vmovq(Operand dst, Xmm src) { this->op(0x66,0x0f,0xd6, src,dst); }

    // Move the low 32 bits between an xmm register and memory/register.
    void Assembler::vmovd(Operand dst, Xmm src) { this->op(0x66,0x0f,0x7e, src,dst); }
    void Assembler::vmovd(Xmm dst, Operand src) { this->op(0x66,0x0f,0x6e, dst,src); }
2273 
    // Insert a dword/word/byte from y into lane `imm` of src, writing dst.
    void Assembler::vpinsrd(Xmm dst, Xmm src, Operand y, int imm) {
        this->op(0x66,0x3a0f,0x22, dst,src,y);
        this->imm_byte_after_operand(y, imm);
    }
    void Assembler::vpinsrw(Xmm dst, Xmm src, Operand y, int imm) {
        this->op(0x66,0x0f,0xc4, dst,src,y);
        this->imm_byte_after_operand(y, imm);
    }
    void Assembler::vpinsrb(Xmm dst, Xmm src, Operand y, int imm) {
        this->op(0x66,0x3a0f,0x20, dst,src,y);
        this->imm_byte_after_operand(y, imm);
    }
2286 
    // Extract a 128-bit half / dword / word / byte lane `imm` into dst.
    // These use this->byte(imm) directly (not imm_byte_after_operand), so a
    // LABEL destination would mis-resolve — hence the asserts.
    void Assembler::vextracti128(Operand dst, Ymm src, int imm) {
        this->op(0x66,0x3a0f,0x39, src,dst);
        SkASSERT(dst.kind != Operand::LABEL);
        this->byte(imm);
    }
    void Assembler::vpextrd(Operand dst, Xmm src, int imm) {
        this->op(0x66,0x3a0f,0x16, src,dst);
        SkASSERT(dst.kind != Operand::LABEL);
        this->byte(imm);
    }
    void Assembler::vpextrw(Operand dst, Xmm src, int imm) {
        this->op(0x66,0x3a0f,0x15, src,dst);
        SkASSERT(dst.kind != Operand::LABEL);
        this->byte(imm);
    }
    void Assembler::vpextrb(Operand dst, Xmm src, int imm) {
        this->op(0x66,0x3a0f,0x14, src,dst);
        SkASSERT(dst.kind != Operand::LABEL);
        this->byte(imm);
    }
2307 
    // Gather 32-bit floats from base + ix[i]*scale into dst, under per-lane mask.
    // Hand-rolled encoding because gathers always require a VSIB byte.
    void Assembler::vgatherdps(Ymm dst, Scale scale, Ymm ix, GP64 base, Ymm mask) {
        // Unlike most instructions, no aliasing is permitted here.
        SkASSERT(dst != ix);
        SkASSERT(dst != mask);
        SkASSERT(mask != ix);

        int prefix = 0x66,
            map    = 0x380f,
            opcode = 0x92;
        VEX v = vex(0, dst>>3, ix>>3, base>>3,
                    map, mask, /*ymm?*/1, prefix);
        this->bytes(v.bytes, v.len);
        this->byte(opcode);
        this->byte(mod_rm(Mod::Indirect, dst&7, rsp/*use SIB*/));
        this->byte(sib(scale, ix&7, base&7));
    }
2324 
    // https://static.docs.arm.com/ddi0596/a/DDI_0596_ARM_a64_instruction_set_architecture.pdf

    // N_mask == a mask of the low N bits, e.g. 5_mask == 0b11111.
    static int operator"" _mask(unsigned long long bits) { return (1<<(int)bits)-1; }

    // Assemble a 32-bit AArch64 instruction word from its bit fields:
    // hi[31:21], m[20:16], lo[15:10], n[9:5], d[4:0].
    void Assembler::op(uint32_t hi, V m, uint32_t lo, V n, V d) {
        this->word( (hi & 11_mask) << 21
                  | (m  &  5_mask) << 16
                  | (lo &  6_mask) << 10
                  | (n  &  5_mask) <<  5
                  | (d  &  5_mask) <<  0);
    }
    // Two-register form: op22[31:10] plus a caller-positioned immediate.
    void Assembler::op(uint32_t op22, V n, V d, int imm) {
        this->word( (op22 & 22_mask) << 10
                  | imm  // size and location depends on the instruction
                  | (n    &  5_mask) <<  5
                  | (d    &  5_mask) <<  0);
    }
2342 
    // 128-bit NEON bitwise ops on 16 byte lanes.
    void Assembler::and16b(V d, V n, V m) { this->op(0b0'1'0'01110'00'1, m, 0b00011'1, n, d); }
    void Assembler::orr16b(V d, V n, V m) { this->op(0b0'1'0'01110'10'1, m, 0b00011'1, n, d); }
    void Assembler::eor16b(V d, V n, V m) { this->op(0b0'1'1'01110'00'1, m, 0b00011'1, n, d); }
    void Assembler::bic16b(V d, V n, V m) { this->op(0b0'1'0'01110'01'1, m, 0b00011'1, n, d); }
    void Assembler::bsl16b(V d, V n, V m) { this->op(0b0'1'1'01110'01'1, m, 0b00011'1, n, d); }
    void Assembler::not16b(V d, V n)      { this->op(0b0'1'1'01110'00'10000'00101'10,  n, d); }

    // 32-bit integer lane arithmetic and compares (4s = four 32-bit lanes).
    void Assembler::add4s(V d, V n, V m) { this->op(0b0'1'0'01110'10'1, m, 0b10000'1, n, d); }
    void Assembler::sub4s(V d, V n, V m) { this->op(0b0'1'1'01110'10'1, m, 0b10000'1, n, d); }
    void Assembler::mul4s(V d, V n, V m) { this->op(0b0'1'0'01110'10'1, m, 0b10011'1, n, d); }

    void Assembler::cmeq4s(V d, V n, V m) { this->op(0b0'1'1'01110'10'1, m, 0b10001'1, n, d); }
    void Assembler::cmgt4s(V d, V n, V m) { this->op(0b0'1'0'01110'10'1, m, 0b0011'0'1, n, d); }

    // 16-bit integer lanes (8h = eight 16-bit lanes).
    void Assembler::sub8h(V d, V n, V m) { this->op(0b0'1'1'01110'01'1, m, 0b10000'1, n, d); }
    void Assembler::mul8h(V d, V n, V m) { this->op(0b0'1'0'01110'01'1, m, 0b10011'1, n, d); }
2359 
    // Single-precision float lane arithmetic.
    void Assembler::fadd4s(V d, V n, V m) { this->op(0b0'1'0'01110'0'0'1, m, 0b11010'1, n, d); }
    void Assembler::fsub4s(V d, V n, V m) { this->op(0b0'1'0'01110'1'0'1, m, 0b11010'1, n, d); }
    void Assembler::fmul4s(V d, V n, V m) { this->op(0b0'1'1'01110'0'0'1, m, 0b11011'1, n, d); }
    void Assembler::fdiv4s(V d, V n, V m) { this->op(0b0'1'1'01110'0'0'1, m, 0b11111'1, n, d); }
    void Assembler::fmin4s(V d, V n, V m) { this->op(0b0'1'0'01110'1'0'1, m, 0b11110'1, n, d); }
    void Assembler::fmax4s(V d, V n, V m) { this->op(0b0'1'0'01110'0'0'1, m, 0b11110'1, n, d); }

    void Assembler::fneg4s (V d, V n) { this->op(0b0'1'1'01110'1'0'10000'01111'10, n,d); }
    void Assembler::fsqrt4s(V d, V n) { this->op(0b0'1'1'01110'1'0'10000'11111'10, n,d); }

    // Float lane compares producing all-ones/all-zeros masks.
    void Assembler::fcmeq4s(V d, V n, V m) { this->op(0b0'1'0'01110'0'0'1, m, 0b1110'0'1, n, d); }
    void Assembler::fcmgt4s(V d, V n, V m) { this->op(0b0'1'1'01110'1'0'1, m, 0b1110'0'1, n, d); }
    void Assembler::fcmge4s(V d, V n, V m) { this->op(0b0'1'1'01110'0'0'1, m, 0b1110'0'1, n, d); }

    // Fused multiply-accumulate: d += n*m (fmla) and d -= n*m (fmls).
    void Assembler::fmla4s(V d, V n, V m) { this->op(0b0'1'0'01110'0'0'1, m, 0b11001'1, n, d); }
    void Assembler::fmls4s(V d, V n, V m) { this->op(0b0'1'0'01110'1'0'1, m, 0b11001'1, n, d); }

    // Table lookup: bytes of m index into table register n.
    void Assembler::tbl(V d, V n, V m) { this->op(0b0'1'001110'00'0, m, 0b0'00'0'00, n, d); }

    // 32-bit lane unzip/zip permutes of register pairs.
    void Assembler::uzp14s(V d, V n, V m) { this->op(0b0'1'001110'10'0, m, 0b0'0'01'10, n, d); }
    void Assembler::uzp24s(V d, V n, V m) { this->op(0b0'1'001110'10'0, m, 0b0'1'01'10, n, d); }
    void Assembler::zip14s(V d, V n, V m) { this->op(0b0'1'001110'10'0, m, 0b0'0'11'10, n, d); }
    void Assembler::zip24s(V d, V n, V m) { this->op(0b0'1'001110'10'0, m, 0b0'1'11'10, n, d); }
2383 
    // Immediate shifts. Right shifts encode the amount as a negated field (-imm),
    // per the A64 "shift right by (2*esize - immh:immb)" encoding.
    void Assembler::sli4s(V d, V n, int imm5) {
        this->op(0b0'1'1'011110'0100'000'01010'1,    n, d, ( imm5 & 5_mask)<<16);
    }
    void Assembler::shl4s(V d, V n, int imm5) {
        this->op(0b0'1'0'011110'0100'000'01010'1,    n, d, ( imm5 & 5_mask)<<16);
    }
    void Assembler::sshr4s(V d, V n, int imm5) {
        this->op(0b0'1'0'011110'0100'000'00'0'0'0'1, n, d, (-imm5 & 5_mask)<<16);
    }
    void Assembler::ushr4s(V d, V n, int imm5) {
        this->op(0b0'1'1'011110'0100'000'00'0'0'0'1, n, d, (-imm5 & 5_mask)<<16);
    }
    void Assembler::ushr8h(V d, V n, int imm4) {
        this->op(0b0'1'1'011110'0010'000'00'0'0'0'1, n, d, (-imm4 & 4_mask)<<16);
    }
2399 
    // int<->float conversions and float rounding on 32-bit lanes.
    void Assembler::scvtf4s (V d, V n) { this->op(0b0'1'0'01110'0'0'10000'11101'10, n,d); }
    void Assembler::fcvtzs4s(V d, V n) { this->op(0b0'1'0'01110'1'0'10000'1101'1'10, n,d); }
    void Assembler::fcvtns4s(V d, V n) { this->op(0b0'1'0'01110'0'0'10000'1101'0'10, n,d); }
    void Assembler::frintp4s(V d, V n) { this->op(0b0'1'0'01110'1'0'10000'1100'0'10, n,d); }
    void Assembler::frintm4s(V d, V n) { this->op(0b0'1'0'01110'0'0'10000'1100'1'10, n,d); }

    // f32 <-> f16 narrowing/widening conversions.
    void Assembler::fcvtn(V d, V n) { this->op(0b0'0'0'01110'0'0'10000'10110'10, n,d); }
    void Assembler::fcvtl(V d, V n) { this->op(0b0'0'0'01110'0'0'10000'10111'10, n,d); }

    // Integer narrowing (xtn) and zero-extending widening (uxtl).
    void Assembler::xtns2h(V d, V n) { this->op(0b0'0'0'01110'01'10000'10010'10, n,d); }
    void Assembler::xtnh2b(V d, V n) { this->op(0b0'0'0'01110'00'10000'10010'10, n,d); }

    void Assembler::uxtlb2h(V d, V n) { this->op(0b0'0'1'011110'0001'000'10100'1, n,d); }
    void Assembler::uxtlh2s(V d, V n) { this->op(0b0'0'1'011110'0010'000'10100'1, n,d); }

    // Unsigned minimum across the four 32-bit lanes.
    void Assembler::uminv4s(V d, V n) { this->op(0b0'1'1'01110'10'11000'1'1010'10, n,d); }
2416 
    // BRK #imm16: software breakpoint.
    void Assembler::brk(int imm16) {
        this->op(0b11010100'001'00000000000, (imm16 & 16_mask) << 5);
    }

    // RET via register n.
    void Assembler::ret(X n) { this->op(0b1101011'0'0'10'11111'0000'0'0, n, (X)0); }
2422 
    // 64-bit add/sub with a 12-bit unsigned immediate; subs also sets flags.
    void Assembler::add(X d, X n, int imm12) {
        this->op(0b1'0'0'10001'00'000000000000, n,d, (imm12 & 12_mask) << 10);
    }
    void Assembler::sub(X d, X n, int imm12) {
        this->op(0b1'1'0'10001'00'000000000000, n,d, (imm12 & 12_mask) << 10);
    }
    void Assembler::subs(X d, X n, int imm12) {
        this->op(0b1'1'1'10001'00'000000000000, n,d, (imm12 & 12_mask) << 10);
    }

    // d = n + (m <<|>> imm6), shifted-register form. ROR is not valid for ADD.
    void Assembler::add(X d, X n, X m, Shift shift, int imm6) {
        SkASSERT(shift != ROR);

        // Pack m, the shift type, and the shift amount into the instruction's
        // [21:10] field consumed by op(..., imm << 10).
        int imm = (imm6  & 6_mask) << 0
                | (m     & 5_mask) << 6
                | (0     & 1_mask) << 11
                | (shift & 2_mask) << 12;
        this->op(0b1'0'0'01011'00'0'00000'000000, n,d, imm << 10);
    }
2442 
    // Conditional branch, and compare-and-branch on zero/nonzero, all using a
    // 19-bit instruction-count displacement to label l.
    void Assembler::b(Condition cond, Label* l) {
        const int imm19 = this->disp19(l);
        this->op(0b0101010'0'00000000000000, (X)0, (V)cond, (imm19 & 19_mask) << 5);
    }
    void Assembler::cbz(X t, Label* l) {
        const int imm19 = this->disp19(l);
        this->op(0b1'011010'0'00000000000000, (X)0, t, (imm19 & 19_mask) << 5);
    }
    void Assembler::cbnz(X t, Label* l) {
        const int imm19 = this->disp19(l);
        this->op(0b1'011010'1'00000000000000, (X)0, t, (imm19 & 19_mask) << 5);
    }
2455 
    // Scalar loads into general registers: 8/4/2/1-byte widths.
    // imm12 is the unsigned, size-scaled offset from src.
    void Assembler::ldrd(X dst, X src, int imm12) {
        this->op(0b11'111'0'01'01'000000000000, src, dst, (imm12 & 12_mask) << 10);
    }
    void Assembler::ldrs(X dst, X src, int imm12) {
        this->op(0b10'111'0'01'01'000000000000, src, dst, (imm12 & 12_mask) << 10);
    }
    void Assembler::ldrh(X dst, X src, int imm12) {
        this->op(0b01'111'0'01'01'000000000000, src, dst, (imm12 & 12_mask) << 10);
    }
    void Assembler::ldrb(X dst, X src, int imm12) {
        this->op(0b00'111'0'01'01'000000000000, src, dst, (imm12 & 12_mask) << 10);
    }

    // SIMD/FP register loads: 16/8/4/2/1-byte widths.
    void Assembler::ldrq(V dst, X src, int imm12) {
        this->op(0b00'111'1'01'11'000000000000, src, dst, (imm12 & 12_mask) << 10);
    }
    void Assembler::ldrd(V dst, X src, int imm12) {
        this->op(0b11'111'1'01'01'000000000000, src, dst, (imm12 & 12_mask) << 10);
    }
    void Assembler::ldrs(V dst, X src, int imm12) {
        this->op(0b10'111'1'01'01'000000000000, src, dst, (imm12 & 12_mask) << 10);
    }
    void Assembler::ldrh(V dst, X src, int imm12) {
        this->op(0b01'111'1'01'01'000000000000, src, dst, (imm12 & 12_mask) << 10);
    }
    void Assembler::ldrb(V dst, X src, int imm12) {
        this->op(0b00'111'1'01'01'000000000000, src, dst, (imm12 & 12_mask) << 10);
    }
2484 
    // Stores, mirroring the loads above (note dst/src swap in the fields).
    void Assembler::strs(X src, X dst, int imm12) {
        this->op(0b10'111'0'01'00'000000000000, dst, src, (imm12 & 12_mask) << 10);
    }

    void Assembler::strq(V src, X dst, int imm12) {
        this->op(0b00'111'1'01'10'000000000000, dst, src, (imm12 & 12_mask) << 10);
    }
    void Assembler::strd(V src, X dst, int imm12) {
        this->op(0b11'111'1'01'00'000000000000, dst, src, (imm12 & 12_mask) << 10);
    }
    void Assembler::strs(V src, X dst, int imm12) {
        this->op(0b10'111'1'01'00'000000000000, dst, src, (imm12 & 12_mask) << 10);
    }
    void Assembler::strh(V src, X dst, int imm12) {
        this->op(0b01'111'1'01'00'000000000000, dst, src, (imm12 & 12_mask) << 10);
    }
    void Assembler::strb(V src, X dst, int imm12) {
        this->op(0b00'111'1'01'00'000000000000, dst, src, (imm12 & 12_mask) << 10);
    }
2504 
    // Move a 32-bit lane out of a vector register into a general register,
    // and insert a general register into a 32-bit lane. The imm5 field packs
    // the lane index above the 0b100 size marker for 32-bit elements.
    void Assembler::movs(X dst, V src, int lane) {
        int imm5 = (lane << 3) | 0b100;
        this->op(0b0'0'0'01110000'00000'0'01'1'1'1, src, dst, (imm5 & 5_mask) << 16);
    }
    void Assembler::inss(V dst, X src, int lane) {
        int imm5 = (lane << 3) | 0b100;
        this->op(0b0'1'0'01110000'00000'0'0011'1, src, dst, (imm5 & 5_mask) << 16);
    }
2513 
2514 
    // LDR (literal, SIMD&FP): PC-relative 128-bit load of the data at label l.
    // disp19() records this site against l so label() can patch the 19-bit
    // instruction-count displacement once l's final position is known.
    void Assembler::ldrq(V dst, Label* l) {
        const int imm19 = this->disp19(l);
        this->op(0b10'011'1'00'00000000000000, (V)0, dst, (imm19 & 19_mask) << 5);
    }
2519 
    // DUP Vd.4S, Wn: broadcast 32-bit general-purpose register src to all four
    // lanes of vector dst.
    void Assembler::dup4s(V dst, X src) {
        this->op(0b0'1'0'01110000'00100'0'0001'1, src, dst);
    }
2523 
    // LD1R: load a single element from [src] and replicate it to every lane of
    // dst.  The trailing size bits select the element width.
    void Assembler::ld1r4s(V dst, X src) {    // 4 x 32-bit lanes
        this->op(0b0'1'0011010'1'0'00000'110'0'10, src, dst);
    }
    void Assembler::ld1r8h(V dst, X src) {    // 8 x 16-bit lanes
        this->op(0b0'1'0011010'1'0'00000'110'0'01, src, dst);
    }
    void Assembler::ld1r16b(V dst, X src) {   // 16 x 8-bit lanes
        this->op(0b0'1'0011010'1'0'00000'110'0'00, src, dst);
    }
2533 
    // LD2/LD4 (multiple structures) loads and ST2/ST4 stores of .4S vectors:
    // de-interleave from / interleave to memory across consecutive registers
    // starting at dst/src.  Bit 22 is the load/store direction.
    void Assembler::ld24s(V dst, X src) { this->op(0b0'1'0011000'1'000000'1000'10, src, dst); }
    void Assembler::ld44s(V dst, X src) { this->op(0b0'1'0011000'1'000000'0000'10, src, dst); }
    void Assembler::st24s(V src, X dst) { this->op(0b0'1'0011000'0'000000'1000'10, dst, src); }
    void Assembler::st44s(V src, X dst) { this->op(0b0'1'0011000'0'000000'0000'10, dst, src); }
2538 
    // LD2 (single structure): load one interleaved 32-bit element per register
    // into lane `lane`.  Q picks the high/low half of the vector and S the
    // 32-bit slot within that half; together they address lanes 0-3.
    void Assembler::ld24s(V dst, X src, int lane) {
        int Q = (lane & 2)>>1,
            S = (lane & 1);
                 /*  Q                       S */
        this->op(0b0'0'0011010'1'1'00000'100'0'00, src, dst, (Q<<30)|(S<<12));
    }
    // LD4 (single structure): as above, but across four consecutive registers.
    void Assembler::ld44s(V dst, X src, int lane) {
        int Q = (lane & 2)>>1,
            S = (lane & 1);
        this->op(0b0'0'0011010'1'1'00000'101'0'00, src, dst, (Q<<30)|(S<<12));
    }
2550 
    // Bind label l to the current write position, patching every instruction
    // that already referenced l's previous position by the distance moved.
    // When fCode is null we're only measuring code size, so there's nothing
    // to patch.
    void Assembler::label(Label* l) {
        if (fCode) {
            // The instructions all currently point to l->offset.
            // We'll want to add a delta to point them to here.
            int here = (int)this->size();
            int delta = here - l->offset;
            l->offset = here;

            if (l->kind == Label::ARMDisp19) {
                for (int ref : l->references) {
                    // ref points to a 32-bit instruction with 19-bit displacement in instructions.
                    uint32_t inst;
                    memcpy(&inst, fCode + ref, 4);

                    // [ 8 bits to preserve] [ 19 bit signed displacement ] [ 5 bits to preserve ]
                    // Shift left to drop the top 8 bits, then arithmetic right
                    // shift to sign-extend the 19-bit field.  NOTE(review): the
                    // right shift of a negative int is implementation-defined;
                    // this relies on the (universal) arithmetic-shift behavior.
                    int disp = (int)(inst << 8) >> 13;

                    disp += delta/4;  // delta is in bytes, we want instructions.

                    // Put it all back together, preserving the high 8 bits and low 5.
                    inst = ((disp << 5) &  (19_mask << 5))
                         | ((inst     ) & ~(19_mask << 5));
                    memcpy(fCode + ref, &inst, 4);
                }
            }

            if (l->kind == Label::X86Disp32) {
                for (int ref : l->references) {
                    // ref points to a 32-bit displacement in bytes.
                    int disp;
                    memcpy(&disp, fCode + ref, 4);

                    disp += delta;

                    memcpy(fCode + ref, &disp, 4);
                }
            }
        }
    }
2590 
    // Run the program over n lanes with one pointer argument per buffer.
    // Dispatches to the JIT entry point when one is ready and JIT is allowed
    // (up to 7 pointer arguments), otherwise falls back to the interpreter.
    void Program::eval(int n, void* args[]) const {
    #define SKVM_JIT_STATS 0
    #if SKVM_JIT_STATS
        static std::atomic<int64_t>  calls{0}, jits{0},
                                    pixels{0}, fast{0};
        pixels += n;
        if (0 == calls++) {
            // First call: register an exit-time report of JIT hit rates.
            atexit([]{
                int64_t num = jits .load(),
                        den = calls.load();
                SkDebugf("%.3g%% of %lld eval() calls went through JIT.\n", (100.0 * num)/den, den);
                num = fast  .load();
                den = pixels.load();
                SkDebugf("%.3g%% of %lld pixels went through JIT.\n", (100.0 * num)/den, den);
            });
        }
    #endif

    #if !defined(SKVM_JIT_BUT_IGNORE_IT)
        const void* jit_entry = fImpl->jit_entry.load();
        // jit_entry may be null either simply because we can't JIT, or when using LLVM
        // if the work represented by fImpl->llvm_compiling hasn't finished yet.
        //
        // Ordinarily we'd never find ourselves with non-null jit_entry and !gSkVMAllowJIT, but it
        // can happen during interactive programs like Viewer that toggle gSkVMAllowJIT on and off,
        // due to timing or program caching.
        if (jit_entry != nullptr && gSkVMAllowJIT) {
        #if SKVM_JIT_STATS
            jits++;
            fast += n;
        #endif
            // Cast the entry point to the arity matching our argument count.
            void** a = args;
            switch (fImpl->strides.size()) {
                case 0: return ((void(*)(int                        ))jit_entry)(n               );
                case 1: return ((void(*)(int,void*                  ))jit_entry)(n,a[0]          );
                case 2: return ((void(*)(int,void*,void*            ))jit_entry)(n,a[0],a[1]     );
                case 3: return ((void(*)(int,void*,void*,void*      ))jit_entry)(n,a[0],a[1],a[2]);
                case 4: return ((void(*)(int,void*,void*,void*,void*))jit_entry)
                                (n,a[0],a[1],a[2],a[3]);
                case 5: return ((void(*)(int,void*,void*,void*,void*,void*))jit_entry)
                                (n,a[0],a[1],a[2],a[3],a[4]);
                case 6: return ((void(*)(int,void*,void*,void*,void*,void*,void*))jit_entry)
                                (n,a[0],a[1],a[2],a[3],a[4],a[5]);
                case 7: return ((void(*)(int,void*,void*,void*,void*,void*,void*,void*))jit_entry)
                                (n,a[0],a[1],a[2],a[3],a[4],a[5],a[6]);
                default: break; //SkASSERT(fImpl->strides.size() <= 7);
            }
        }
    #endif

        // So we'll sometimes use the interpreter here even if later calls will use the JIT.
        SkOpts::interpret_skvm(fImpl->instructions.data(), (int)fImpl->instructions.size(),
                               this->nregs(), this->loop(), fImpl->strides.data(),
                               fImpl->traceHooks.data(), fImpl->traceHooks.size(),
                               this->nargs(), n, args);
    }
2647 
2648     #if defined(SKVM_LLVM)
2649     // -- SKVM_LLVM --------------------------------------------------------------------------------
    // Lower `instructions` to LLVM IR and kick off JIT compilation.
    // The generated function has signature void(i32 n, i8* arg0, ..., i8* argM)
    // — one pointer per stride — and is structured as a K-wide vector loop
    // followed by a scalar tail loop.  On success, fImpl->jit_entry is
    // (eventually, asynchronously) set to the compiled entry point; on any
    // failure we simply return and the program runs in the interpreter.
    void Program::setupLLVM(const std::vector<OptimizedInstruction>& instructions,
                            const char* debug_name) {
        auto ctx = std::make_unique<llvm::LLVMContext>();

        auto mod = std::make_unique<llvm::Module>("", *ctx);
        // All the scary bare pointers from here on are owned by ctx or mod, I think.

        // Everything I've tested runs faster at K=8 (using ymm) than K=16 (zmm) on SKX machines.
        const int K = (true && SkCpu::Supports(SkCpu::HSW)) ? 8 : 4;

        llvm::Type *ptr = llvm::Type::getInt8Ty(*ctx)->getPointerTo(),
                   *i32 = llvm::Type::getInt32Ty(*ctx);

        // Signature: void fn(i32 n, i8* per stride).
        std::vector<llvm::Type*> arg_types = { i32 };
        for (size_t i = 0; i < fImpl->strides.size(); i++) {
            arg_types.push_back(ptr);
        }

        llvm::FunctionType* fn_type = llvm::FunctionType::get(llvm::Type::getVoidTy(*ctx),
                                                              arg_types, /*vararg?=*/false);
        llvm::Function* fn
            = llvm::Function::Create(fn_type, llvm::GlobalValue::ExternalLinkage, debug_name, *mod);
        // Pointer args never alias each other; this unlocks vectorization-friendly codegen.
        for (size_t i = 0; i < fImpl->strides.size(); i++) {
            fn->addParamAttr(i+1, llvm::Attribute::NoAlias);
        }

        llvm::BasicBlock *enter  = llvm::BasicBlock::Create(*ctx, "enter" , fn),
                         *hoistK = llvm::BasicBlock::Create(*ctx, "hoistK", fn),
                         *testK  = llvm::BasicBlock::Create(*ctx, "testK" , fn),
                         *loopK  = llvm::BasicBlock::Create(*ctx, "loopK" , fn),
                         *hoist1 = llvm::BasicBlock::Create(*ctx, "hoist1", fn),
                         *test1  = llvm::BasicBlock::Create(*ctx, "test1" , fn),
                         *loop1  = llvm::BasicBlock::Create(*ctx, "loop1" , fn),
                         *leave  = llvm::BasicBlock::Create(*ctx, "leave" , fn);

        using IRBuilder = llvm::IRBuilder<>;

        llvm::PHINode*                 n;
        std::vector<llvm::PHINode*> args;
        std::vector<llvm::Value*> vals(instructions.size());

        // Emit IR for instruction i into block b (vector K-wide unless scalar).
        // Returns false when the op can't be lowered; the caller then abandons
        // LLVM entirely and the program falls back to the interpreter.
        auto emit = [&](size_t i, bool scalar, IRBuilder* b) {
            auto [op, x,y,z,w, immA,immB,immC, death,can_hoist] = instructions[i];

            llvm::Type *i1    = llvm::Type::getInt1Ty (*ctx),
                       *i8    = llvm::Type::getInt8Ty (*ctx),
                       *i16   = llvm::Type::getInt16Ty(*ctx),
                       *f32   = llvm::Type::getFloatTy(*ctx),
                       *I1    = scalar ? i1    : llvm::VectorType::get(i1 , K, false  ),
                       *I8    = scalar ? i8    : llvm::VectorType::get(i8 , K, false  ),
                       *I16   = scalar ? i16   : llvm::VectorType::get(i16, K, false  ),
                       *I32   = scalar ? i32   : llvm::VectorType::get(i32, K, false  ),
                       *F32   = scalar ? f32   : llvm::VectorType::get(f32, K, false  );

            // All values are held as I32 bit patterns; I()/F() bitcast between views.
            auto I  = [&](llvm::Value* v) { return b->CreateBitCast(v, I32  ); };
            auto F  = [&](llvm::Value* v) { return b->CreateBitCast(v, F32  ); };

            // Sign-extend an i1 comparison result to a full-width mask.
            auto S = [&](llvm::Type* dst, llvm::Value* v) { return b->CreateSExt(v, dst); };

            llvm::Type* vt = nullptr;
            switch (llvm::Type* t = nullptr; op) {
                default:
                    SkDebugf("can't llvm %s (%d)\n", name(op), op);
                    return false;

                case Op::assert_true: /*TODO*/ break;

                case Op::trace_line:
                case Op::trace_var:
                case Op::trace_enter:
                case Op::trace_exit:
                case Op::trace_scope:
                    /* Force this program to run in the interpreter. */
                    return false;

                case Op::index:
                    // `n` counts down, so lane index = n - {0,1,...,K-1}.
                    if (I32->isVectorTy()) {
                        std::vector<llvm::Constant*> iota(K);
                        for (int j = 0; j < K; j++) {
                            iota[j] = b->getInt32(j);
                        }
                        vals[i] = b->CreateSub(b->CreateVectorSplat(K, n),
                                               llvm::ConstantVector::get(iota));
                    } else {
                        vals[i] = n;
                    } break;

                case Op::load8:  t = I8 ; goto load;
                case Op::load16: t = I16; goto load;
                case Op::load32: t = I32; goto load;
                load: {
                    // Unaligned load from varying pointer args[immA], zero-extended to I32.
                    llvm::Value* ptr = b->CreateBitCast(args[immA], t->getPointerTo());
                    vals[i] = b->CreateZExt(
                            b->CreateAlignedLoad(t, ptr, llvm::MaybeAlign{1}), I32);
                } break;


                case Op::splat: vals[i] = llvm::ConstantInt::get(I32, immA); break;

                case Op::uniform32: {
                    // Scalar load of a uniform at byte offset immB within args[immA],
                    // splatted across lanes when vectorized.
                    llvm::Value* ptr = b->CreateBitCast(
                            b->CreateConstInBoundsGEP1_32(i8, args[immA], immB),
                            i32->getPointerTo());
                    llvm::Value* val = b->CreateZExt(
                            b->CreateAlignedLoad(i32, ptr, llvm::MaybeAlign{1}), i32);
                    vals[i] = I32->isVectorTy() ? b->CreateVectorSplat(K, val)
                                                : val;
                } break;

                case Op::gather8:  t = i8 ; vt = I8; goto gather;
                case Op::gather16: t = i16; vt = I16; goto gather;
                case Op::gather32: t = i32; vt = I32; goto gather;
                gather: {
                    // Our gather base pointer is immB bytes off of uniform immA.
                    llvm::Value* base =
                        b->CreateLoad(b->CreateBitCast(
                                b->CreateConstInBoundsGEP1_32(i8, args[immA],immB),
                                t->getPointerTo()->getPointerTo()));

                    llvm::Value* ptr = b->CreateInBoundsGEP(t, base, vals[x]);
                    llvm::Value* gathered;
                    if (ptr->getType()->isVectorTy()) {
                        gathered = b->CreateMaskedGather(
                                vt,
                                ptr,
                                llvm::Align{1});
                    } else {
                        gathered = b->CreateAlignedLoad(vt, ptr, llvm::MaybeAlign{1});
                    }
                    vals[i] = b->CreateZExt(gathered, I32);
                } break;

                case Op::store8:  t = I8 ; goto store;
                case Op::store16: t = I16; goto store;
                case Op::store32: t = I32; goto store;
                store: {
                    // Truncate the I32 working value down to the store width.
                    llvm::Value* val = b->CreateTrunc(vals[x], t);
                    llvm::Value* ptr = b->CreateBitCast(args[immA],
                                                        val->getType()->getPointerTo());
                    vals[i] = b->CreateAlignedStore(val, ptr, llvm::MaybeAlign{1});
                } break;

                case Op::bit_and:   vals[i] = b->CreateAnd(vals[x], vals[y]); break;
                case Op::bit_or :   vals[i] = b->CreateOr (vals[x], vals[y]); break;
                case Op::bit_xor:   vals[i] = b->CreateXor(vals[x], vals[y]); break;
                case Op::bit_clear: vals[i] = b->CreateAnd(vals[x], b->CreateNot(vals[y])); break;

                case Op::select:
                    vals[i] = b->CreateSelect(b->CreateTrunc(vals[x], I1), vals[y], vals[z]);
                    break;

                case Op::add_i32: vals[i] = b->CreateAdd(vals[x], vals[y]); break;
                case Op::sub_i32: vals[i] = b->CreateSub(vals[x], vals[y]); break;
                case Op::mul_i32: vals[i] = b->CreateMul(vals[x], vals[y]); break;

                case Op::shl_i32: vals[i] = b->CreateShl (vals[x], immA); break;
                case Op::sra_i32: vals[i] = b->CreateAShr(vals[x], immA); break;
                case Op::shr_i32: vals[i] = b->CreateLShr(vals[x], immA); break;

                case Op:: eq_i32: vals[i] = S(I32, b->CreateICmpEQ (vals[x], vals[y])); break;
                case Op:: gt_i32: vals[i] = S(I32, b->CreateICmpSGT(vals[x], vals[y])); break;

                case Op::add_f32: vals[i] = I(b->CreateFAdd(F(vals[x]), F(vals[y]))); break;
                case Op::sub_f32: vals[i] = I(b->CreateFSub(F(vals[x]), F(vals[y]))); break;
                case Op::mul_f32: vals[i] = I(b->CreateFMul(F(vals[x]), F(vals[y]))); break;
                case Op::div_f32: vals[i] = I(b->CreateFDiv(F(vals[x]), F(vals[y]))); break;

                case Op:: eq_f32: vals[i] = S(I32, b->CreateFCmpOEQ(F(vals[x]), F(vals[y]))); break;
                case Op::neq_f32: vals[i] = S(I32, b->CreateFCmpUNE(F(vals[x]), F(vals[y]))); break;
                case Op:: gt_f32: vals[i] = S(I32, b->CreateFCmpOGT(F(vals[x]), F(vals[y]))); break;
                case Op::gte_f32: vals[i] = S(I32, b->CreateFCmpOGE(F(vals[x]), F(vals[y]))); break;

                case Op::fma_f32:
                    vals[i] = I(b->CreateIntrinsic(llvm::Intrinsic::fma, {F32},
                                                   {F(vals[x]), F(vals[y]), F(vals[z])}));
                    break;

                case Op::fms_f32:
                    vals[i] = I(b->CreateIntrinsic(llvm::Intrinsic::fma, {F32},
                                                   {F(vals[x]), F(vals[y]),
                                                    b->CreateFNeg(F(vals[z]))}));
                    break;

                case Op::fnma_f32:
                    vals[i] = I(b->CreateIntrinsic(llvm::Intrinsic::fma, {F32},
                                                   {b->CreateFNeg(F(vals[x])), F(vals[y]),
                                                    F(vals[z])}));
                    break;

                case Op::ceil:
                    vals[i] = I(b->CreateUnaryIntrinsic(llvm::Intrinsic::ceil, F(vals[x])));
                    break;
                case Op::floor:
                    vals[i] = I(b->CreateUnaryIntrinsic(llvm::Intrinsic::floor, F(vals[x])));
                    break;

                case Op::max_f32:
                    vals[i] = I(b->CreateSelect(b->CreateFCmpOLT(F(vals[x]), F(vals[y])),
                                                F(vals[y]), F(vals[x])));
                    break;
                case Op::min_f32:
                    vals[i] = I(b->CreateSelect(b->CreateFCmpOLT(F(vals[y]), F(vals[x])),
                                                F(vals[y]), F(vals[x])));
                    break;

                case Op::sqrt_f32:
                    vals[i] = I(b->CreateUnaryIntrinsic(llvm::Intrinsic::sqrt, F(vals[x])));
                    break;

                case Op::to_f32: vals[i] = I(b->CreateSIToFP(  vals[x] , F32)); break;
                case Op::trunc : vals[i] =   b->CreateFPToSI(F(vals[x]), I32) ; break;
                case Op::round : {
                    // Basic impl when we can't use cvtps2dq and co.
                    auto round = b->CreateUnaryIntrinsic(llvm::Intrinsic::rint, F(vals[x]));
                    vals[i] = b->CreateFPToSI(round, I32);

                #if 1 && defined(SK_CPU_X86)
                    // Using b->CreateIntrinsic(..., {}, {...}) to avoid name mangling.
                    if (scalar) {
                        // cvtss2si is float x4 -> int, ignoring input lanes 1,2,3.  ¯\_(ツ)_/¯
                        llvm::Value* v = llvm::UndefValue::get(
                                llvm::VectorType::get(f32, 4, false));
                        v = b->CreateInsertElement(v, F(vals[x]), (uint64_t)0);
                        vals[i] = b->CreateIntrinsic(llvm::Intrinsic::x86_sse_cvtss2si, {}, {v});
                    } else {
                        SkASSERT(K == 4  || K == 8);
                        auto intr = K == 4 ?   llvm::Intrinsic::x86_sse2_cvtps2dq :
                                 /* K == 8 ?*/ llvm::Intrinsic::x86_avx_cvt_ps2dq_256;
                        vals[i] = b->CreateIntrinsic(intr, {}, {F(vals[x])});
                    }
                #endif
                } break;

            }
            return true;
        };

        {
            IRBuilder b(enter);
            b.CreateBr(hoistK);
        }

        // hoistK: emit each hoistable vector instruction; goto testK;
        // LLVM can do this sort of thing itself, but we've got the information cheap,
        // and pointer aliasing makes it easier to manually hoist than teach LLVM it's safe.
        {
            IRBuilder b(hoistK);

            // Hoisted instructions will need args (think, uniforms), so set that up now.
            // These phi nodes are degenerate... they'll always be the passed-in args from enter.
            // Later on when we start looping the phi nodes will start looking useful.
            llvm::Argument* arg = fn->arg_begin();
            (void)arg++;  // Leave n as nullptr... it'd be a bug to use n in a hoisted instruction.
            for (size_t i = 0; i < fImpl->strides.size(); i++) {
                args.push_back(b.CreatePHI(arg->getType(), 1));
                args.back()->addIncoming(arg++, enter);
            }

            for (size_t i = 0; i < instructions.size(); i++) {
                if (instructions[i].can_hoist && !emit(i, false, &b)) {
                    return;
                }
            }

            b.CreateBr(testK);
        }

        // testK:  if (N >= K) goto loopK; else goto hoist1;
        {
            IRBuilder b(testK);

            // New phi nodes for `n` and each pointer argument from hoistK; later we'll add loopK.
            // These also start as the initial function arguments; hoistK can't have changed them.
            llvm::Argument* arg = fn->arg_begin();

            n = b.CreatePHI(arg->getType(), 2);
            n->addIncoming(arg++, hoistK);

            for (size_t i = 0; i < fImpl->strides.size(); i++) {
                args[i] = b.CreatePHI(arg->getType(), 2);
                args[i]->addIncoming(arg++, hoistK);
            }

            b.CreateCondBr(b.CreateICmpSGE(n, b.getInt32(K)), loopK, hoist1);
        }

        // loopK:  ... insts on K x T vectors; N -= K, args += K*stride; goto testK;
        {
            IRBuilder b(loopK);
            for (size_t i = 0; i < instructions.size(); i++) {
                if (!instructions[i].can_hoist && !emit(i, false, &b)) {
                    return;
                }
            }

            // n -= K
            llvm::Value* n_next = b.CreateSub(n, b.getInt32(K));
            n->addIncoming(n_next, loopK);

            // Each arg ptr += K
            for (size_t i = 0; i < fImpl->strides.size(); i++) {
                llvm::Value* arg_next
                    = b.CreateConstInBoundsGEP1_32(
                            llvm::Type::getInt8Ty (*ctx),
                            args[i],
                            K*fImpl->strides[i]);
                args[i]->addIncoming(arg_next, loopK);
            }
            b.CreateBr(testK);
        }

        // hoist1: emit each hoistable scalar instruction; goto test1;
        {
            IRBuilder b(hoist1);
            for (size_t i = 0; i < instructions.size(); i++) {
                if (instructions[i].can_hoist && !emit(i, true, &b)) {
                    return;
                }
            }
            b.CreateBr(test1);
        }

        // test1:  if (N >= 1) goto loop1; else goto leave;
        {
            IRBuilder b(test1);

            // Set up new phi nodes for `n` and each pointer argument, now from hoist1 and loop1.
            llvm::PHINode* n_new = b.CreatePHI(n->getType(), 2);
            n_new->addIncoming(n, hoist1);
            n = n_new;

            for (size_t i = 0; i < fImpl->strides.size(); i++) {
                llvm::PHINode* arg_new = b.CreatePHI(args[i]->getType(), 2);
                arg_new->addIncoming(args[i], hoist1);
                args[i] = arg_new;
            }

            b.CreateCondBr(b.CreateICmpSGE(n, b.getInt32(1)), loop1, leave);
        }

        // loop1:  ... insts on scalars; N -= 1, args += stride; goto test1;
        {
            IRBuilder b(loop1);
            for (size_t i = 0; i < instructions.size(); i++) {
                if (!instructions[i].can_hoist && !emit(i, true, &b)) {
                    return;
                }
            }

            // n -= 1
            llvm::Value* n_next = b.CreateSub(n, b.getInt32(1));
            n->addIncoming(n_next, loop1);

            // Each arg ptr += 1
            for (size_t i = 0; i < fImpl->strides.size(); i++) {
                llvm::Value* arg_next
                    = b.CreateConstInBoundsGEP1_32(
                            llvm::Type::getInt8Ty (*ctx), args[i], fImpl->strides[i]);
                args[i]->addIncoming(arg_next, loop1);
            }
            b.CreateBr(test1);
        }

        // leave:  ret
        {
            IRBuilder b(leave);
            b.CreateRetVoid();
        }

        SkASSERT(false == llvm::verifyModule(*mod, &llvm::outs()));

        // NOTE(review): this dump is gated on `if (true)`, so every compile writes
        // /tmp/<debug_name>.bc — and if the file can't be opened we return early,
        // silently disabling the JIT for this program.  Looks like a debugging aid
        // left enabled; confirm whether this should be `if (false)` (or gated on a
        // debug flag) before shipping.
        if (true) {
            SkString path = SkStringPrintf("/tmp/%s.bc", debug_name);
            std::error_code err;
            llvm::raw_fd_ostream os(path.c_str(), err);
            if (err) {
                return;
            }
            llvm::WriteBitcodeToFile(*mod, os);
        }

        // One-time global LLVM target initialization (these return false on success).
        static SkOnce once;
        once([]{
            SkAssertResult(false == llvm::InitializeNativeTarget());
            SkAssertResult(false == llvm::InitializeNativeTargetAsmPrinter());
        });

        if (llvm::ExecutionEngine* ee = llvm::EngineBuilder(std::move(mod))
                                            .setEngineKind(llvm::EngineKind::JIT)
                                            .setMCPU(llvm::sys::getHostCPUName())
                                            .create()) {
            fImpl->llvm_ctx = std::move(ctx);
            fImpl->llvm_ee.reset(ee);

            #if defined(SKVM_LLVM_WAIT_FOR_COMPILATION)
            // Wait for llvm to compile
            void* function = (void*)ee->getFunctionAddress(debug_name);
            fImpl->jit_entry.store(function);
            // We have to be careful here about what we close over and how, in case fImpl moves.
            // fImpl itself may change, but its pointee fields won't, so close over them by value.
            // Also, debug_name will almost certainly leave scope, so copy it.
            #else
            fImpl->llvm_compiling = std::async(std::launch::async, [dst  = &fImpl->jit_entry,
                                                                    ee   =  fImpl->llvm_ee.get(),
                                                                    name = std::string(debug_name)]{
                // std::atomic<void*>*    dst;
                // llvm::ExecutionEngine* ee;
                // std::string            name;
                dst->store( (void*)ee->getFunctionAddress(name.c_str()) );
            });
            #endif
        }
    }
3063     #endif  // SKVM_LLVM
3064 
    // Block until any in-flight asynchronous LLVM compilation has finished.
    // A no-op when LLVM is disabled or when compilation is synchronous
    // (SKVM_LLVM_WAIT_FOR_COMPILATION).
    void Program::waitForLLVM() const {
    #if defined(SKVM_LLVM) && !defined(SKVM_LLVM_WAIT_FOR_COMPILATION)
        if (fImpl->llvm_compiling.valid()) {
            fImpl->llvm_compiling.wait();
        }
    #endif
    }
3072 
hasTraceHooks() const3073     bool Program::hasTraceHooks() const {
3074         // Identifies a program which has been instrumented for debugging.
3075         return !fImpl->traceHooks.empty();
3076     }
3077 
hasJIT() const3078     bool Program::hasJIT() const {
3079         // Program::hasJIT() is really just a debugging / test aid,
3080         // so we don't mind adding a sync point here to wait for compilation.
3081         this->waitForLLVM();
3082 
3083         return fImpl->jit_entry.load() != nullptr;
3084     }
3085 
    // Release all JIT state (LLVM engine/context, dylib, or mapped code buffer,
    // depending on which backend is compiled in) and reset the Impl fields so
    // the program falls back to the interpreter.
    void Program::dropJIT() {
    #if defined(SKVM_LLVM)
        // Must not tear down the engine while an async compile still uses it.
        this->waitForLLVM();
        fImpl->llvm_ee .reset(nullptr);
        fImpl->llvm_ctx.reset(nullptr);
    #elif defined(SKVM_JIT)
        if (fImpl->dylib) {
            close_dylib(fImpl->dylib);
        } else if (auto jit_entry = fImpl->jit_entry.load()) {
            unmap_jit_buffer(jit_entry, fImpl->jit_size);
        }
    #else
        // No JIT backend compiled in; there should be nothing to drop.
        SkASSERT(!this->hasJIT());
    #endif

        fImpl->jit_entry.store(nullptr);
        fImpl->jit_size  = 0;
        fImpl->dylib     = nullptr;
    }
3105 
Program()3106     Program::Program() : fImpl(std::make_unique<Impl>()) {}
3107 
~Program()3108     Program::~Program() {
3109         // Moved-from Programs may have fImpl == nullptr.
3110         if (fImpl) {
3111             this->dropJIT();
3112         }
3113     }
3114 
    // Move constructor: steal other's Impl, leaving other with fImpl == nullptr
    // (which ~Program() checks for).
    Program::Program(Program&& other) : fImpl(std::move(other.fImpl)) {}
3116 
    // Move assignment: steal other's Impl; the previously-held Impl is destroyed.
    // NOTE(review): unlike ~Program(), this path does not call dropJIT() first —
    // confirm Impl's destructor releases any JIT/LLVM resources, or move-assign
    // over a JITted program may leak them.
    Program& Program::operator=(Program&& other) {
        fImpl = std::move(other.fImpl);
        return *this;
    }
3121 
    // Main constructor: record strides/trace hooks/visualizer, start JIT
    // compilation (LLVM preferred over the in-tree JIT when both are built)
    // if globally and locally allowed, and always build the interpreter
    // fallback program.
    Program::Program(const std::vector<OptimizedInstruction>& instructions,
                     std::unique_ptr<viz::Visualizer> visualizer,
                     const std::vector<int>& strides,
                     const std::vector<TraceHook*>& traceHooks,
                     const char* debug_name, bool allow_jit) : Program() {
        fImpl->visualizer = std::move(visualizer);
        fImpl->strides = strides;
        fImpl->traceHooks = traceHooks;
        if (gSkVMAllowJIT && allow_jit) {
        #if 1 && defined(SKVM_LLVM)
            this->setupLLVM(instructions, debug_name);
        #elif 1 && defined(SKVM_JIT)
            this->setupJIT(instructions, debug_name);
        #endif
        }

        // Might as well do this after setupLLVM() to get a little more time to compile.
        this->setupInterpreter(instructions);
    }
3141 
    // Simple accessors over fImpl.  Note instructions() returns a copy of the
    // interpreter instruction list, not a reference.
    std::vector<InterpreterInstruction> Program::instructions() const { return fImpl->instructions; }
    int  Program::nargs() const { return (int)fImpl->strides.size(); }  // one pointer arg per stride
    int  Program::nregs() const { return fImpl->regs; }  // registers the interpreter needs
    int  Program::loop () const { return fImpl->loop; }  // loop entry index — see setupInterpreter()
    bool Program::empty() const { return fImpl->instructions.empty(); }
3147 
    // Translate OptimizedInstructions to InterpreterInstructions: assign a
    // virtual register to every value (recycling registers as their values
    // die), then emit hoisted instructions followed by the per-loop body.
    void Program::setupInterpreter(const std::vector<OptimizedInstruction>& instructions) {
        // Register each instruction is assigned to.
        std::vector<Reg> reg(instructions.size());

        // This next bit is a bit more complicated than strictly necessary;
        // we could just assign every instruction to its own register.
        //
        // But recycling registers is fairly cheap, and good practice for the
        // JITs where minimizing register pressure really is important.
        //
        // We have effectively infinite registers, so we hoist any value we can.
        // (The JIT may choose a more complex policy to reduce register pressure.)

        fImpl->regs = 0;
        std::vector<Reg> avail;   // Registers whose previous value has died; free to reuse.

        // Assign this value to a register, recycling them where we can.
        auto assign_register = [&](Val id) {
            const OptimizedInstruction& inst = instructions[id];

            // If this is a real input and its lifetime ends at this instruction,
            // we can recycle the register it's occupying.
            auto maybe_recycle_register = [&](Val input) {
                if (input != NA && instructions[input].death == id) {
                    avail.push_back(reg[input]);
                }
            };

            // Take care to not recycle the same register twice.
            const Val x = inst.x, y = inst.y, z = inst.z, w = inst.w;
            if (true                      ) { maybe_recycle_register(x); }
            if (y != x                    ) { maybe_recycle_register(y); }
            if (z != x && z != y          ) { maybe_recycle_register(z); }
            if (w != x && w != y && w != z) { maybe_recycle_register(w); }

            // Instructions that die at themselves (stores) don't need a register.
            if (inst.death != id) {
                // Allocate a register if we have to, preferring to reuse anything available.
                if (avail.empty()) {
                    reg[id] = fImpl->regs++;
                } else {
                    reg[id] = avail.back();
                    avail.pop_back();
                }
            }
        };

        // Assign a register to each hoisted instruction, then each non-hoisted loop instruction.
        for (Val id = 0; id < (Val)instructions.size(); id++) {
            if ( instructions[id].can_hoist) { assign_register(id); }
        }
        for (Val id = 0; id < (Val)instructions.size(); id++) {
            if (!instructions[id].can_hoist) { assign_register(id); }
        }

        // Translate OptimizedInstructions to InterpreterInstructions by mapping values to
        // registers.  This will be two passes, first hoisted instructions, then inside the loop.

        // The loop begins at the fImpl->loop'th Instruction.
        fImpl->loop = 0;
        fImpl->instructions.reserve(instructions.size());

        // Add a mapping for the N/A sentinel Val to any arbitrary register
        // so lookups don't have to know which arguments are used by which Ops.
        auto lookup_register = [&](Val id) {
            return id == NA ? (Reg)0
                            : reg[id];
        };

        // Append the InterpreterInstruction for `inst`, rewriting its Vals to registers.
        auto push_instruction = [&](Val id, const OptimizedInstruction& inst) {
            InterpreterInstruction pinst{
                inst.op,
                lookup_register(id),
                lookup_register(inst.x),
                lookup_register(inst.y),
                lookup_register(inst.z),
                lookup_register(inst.w),
                inst.immA,
                inst.immB,
                inst.immC,
            };
            fImpl->instructions.push_back(pinst);
        };

        // Pass 1: hoisted instructions.  fImpl->loop ends up counting them,
        // i.e. the index where the per-loop body begins.
        for (Val id = 0; id < (Val)instructions.size(); id++) {
            const OptimizedInstruction& inst = instructions[id];
            if (inst.can_hoist) {
                push_instruction(id, inst);
                fImpl->loop++;
            }
        }
        // Pass 2: the per-loop body.
        for (Val id = 0; id < (Val)instructions.size(); id++) {
            const OptimizedInstruction& inst = instructions[id];
            if (!inst.can_hoist) {
                push_instruction(id, inst);
            }
        }
    }
3247 
3248 #if defined(SKVM_JIT)
3249 
    // Per-architecture vector register type used by Program::jit() below.
    namespace SkVMJitTypes {
    #if defined(__x86_64__) || defined(_M_X64)
        using Reg = Assembler::Ymm;   // 256-bit AVX register (K=8 lanes in jit()).
    #elif defined(__aarch64__)
        using Reg = Assembler::V;     // 128-bit NEON register (K=4 lanes in jit()).
    #endif
    }  // namespace SkVMJitTypes
3257 
jit(const std::vector<OptimizedInstruction> & instructions,int * stack_hint,uint32_t * registers_used,Assembler * a) const3258     bool Program::jit(const std::vector<OptimizedInstruction>& instructions,
3259                       int* stack_hint,
3260                       uint32_t* registers_used,
3261                       Assembler* a) const {
3262         using A = Assembler;
3263         using SkVMJitTypes::Reg;
3264 
3265         SkTHashMap<int, A::Label> constants;    // Constants (mostly splats) share the same pool.
3266         A::Label                  iota;         // Varies per lane, for Op::index.
3267         A::Label                  load64_index; // Used to load low or high half of 64-bit lanes.
3268 
3269         // The `regs` array tracks everything we know about each register's state:
3270         //   - NA:   empty
3271         //   - RES:  reserved by ABI
3272         //   - TMP:  holding a temporary
3273         //   - id:   holding Val id
3274         constexpr Val RES = NA-1,
3275                       TMP = RES-1;
3276 
3277         // Map val -> stack slot.
3278         std::vector<int> stack_slot(instructions.size(), NA);
3279         int next_stack_slot = 0;
3280 
3281         const int nstack_slots = *stack_hint >= 0 ? *stack_hint
3282                                                   : stack_slot.size();
3283     #if defined(__x86_64__) || defined(_M_X64)
3284         if (!SkCpu::Supports(SkCpu::HSW)) {
3285             return false;
3286         }
3287         const int K = 8;
3288         #if defined(_M_X64)  // Important to check this first; clang-cl defines both.
3289             const A::GP64 N = A::rcx,
3290                         GP0 = A::rax,
3291                         GP1 = A::r11,
3292                         arg[]    = { A::rdx, A::r8, A::r9, A::r10, A::rdi, A::rsi };
3293 
3294             // xmm6-15 need are callee-saved.
3295             std::array<Val,16> regs = {
3296                  NA, NA, NA, NA,  NA, NA,RES,RES,
3297                 RES,RES,RES,RES, RES,RES,RES,RES,
3298             };
3299             const uint32_t incoming_registers_used = *registers_used;
3300 
3301             auto enter = [&]{
3302                 // rcx,rdx,r8,r9 are all already holding their correct values.
3303                 // Load caller-saved r10 from rsp+40 if there's a fourth arg.
3304                 if (fImpl->strides.size() >= 4) {
3305                     a->mov(A::r10, A::Mem{A::rsp, 40});
3306                 }
3307                 // Load callee-saved rdi from rsp+48 if there's a fifth arg,
3308                 // first saving it to ABI reserved shadow area rsp+8.
3309                 if (fImpl->strides.size() >= 5) {
3310                     a->mov(A::Mem{A::rsp, 8}, A::rdi);
3311                     a->mov(A::rdi, A::Mem{A::rsp, 48});
3312                 }
3313                 // Load callee-saved rsi from rsp+56 if there's a sixth arg,
3314                 // first saving it to ABI reserved shadow area rsp+16.
3315                 if (fImpl->strides.size() >= 6) {
3316                     a->mov(A::Mem{A::rsp, 16}, A::rsi);
3317                     a->mov(A::rsi, A::Mem{A::rsp, 56});
3318                 }
3319 
3320                 // Allocate stack for our values and callee-saved xmm6-15.
3321                 int stack_needed = nstack_slots*K*4;
3322                 for (int r = 6; r < 16; r++) {
3323                     if (incoming_registers_used & (1<<r)) {
3324                         stack_needed += 16;
3325                     }
3326                 }
3327                 if (stack_needed) { a->sub(A::rsp, stack_needed); }
3328 
3329                 int next_saved_xmm = nstack_slots*K*4;
3330                 for (int r = 6; r < 16; r++) {
3331                     if (incoming_registers_used & (1<<r)) {
3332                         a->vmovups(A::Mem{A::rsp, next_saved_xmm}, (A::Xmm)r);
3333                         next_saved_xmm += 16;
3334                         regs[r] = NA;
3335                     }
3336                 }
3337             };
3338             auto exit  = [&]{
3339                 // The second pass of jit() shouldn't use any register it didn't in the first pass.
3340                 SkASSERT((*registers_used & incoming_registers_used) == *registers_used);
3341 
3342                 // Restore callee-saved xmm6-15 and the stack pointer.
3343                 int stack_used = nstack_slots*K*4;
3344                 for (int r = 6; r < 16; r++) {
3345                     if (incoming_registers_used & (1<<r)) {
3346                         a->vmovups((A::Xmm)r, A::Mem{A::rsp, stack_used});
3347                         stack_used += 16;
3348                     }
3349                 }
3350                 if (stack_used) { a->add(A::rsp, stack_used); }
3351 
3352                 // Restore callee-saved rdi/rsi if we used them.
3353                 if (fImpl->strides.size() >= 5) {
3354                     a->mov(A::rdi, A::Mem{A::rsp, 8});
3355                 }
3356                 if (fImpl->strides.size() >= 6) {
3357                     a->mov(A::rsi, A::Mem{A::rsp, 16});
3358                 }
3359 
3360                 a->vzeroupper();
3361                 a->ret();
3362             };
3363         #elif defined(__x86_64__)
3364             const A::GP64 N = A::rdi,
3365                         GP0 = A::rax,
3366                         GP1 = A::r11,
3367                         arg[]    = { A::rsi, A::rdx, A::rcx, A::r8, A::r9, A::r10 };
3368 
3369             // All 16 ymm registers are available to use.
3370             std::array<Val,16> regs = {
3371                 NA,NA,NA,NA, NA,NA,NA,NA,
3372                 NA,NA,NA,NA, NA,NA,NA,NA,
3373             };
3374 
3375             auto enter = [&]{
3376                 // Load caller-saved r10 from rsp+8 if there's a sixth arg.
3377                 if (fImpl->strides.size() >= 6) {
3378                     a->mov(A::r10, A::Mem{A::rsp, 8});
3379                 }
3380                 if (nstack_slots) { a->sub(A::rsp, nstack_slots*K*4); }
3381             };
3382             auto exit  = [&]{
3383                 if (nstack_slots) { a->add(A::rsp, nstack_slots*K*4); }
3384                 a->vzeroupper();
3385                 a->ret();
3386             };
3387         #endif
3388 
3389         auto load_from_memory = [&](Reg r, Val v) {
3390             if (instructions[v].op == Op::splat) {
3391                 if (instructions[v].immA == 0) {
3392                     a->vpxor(r,r,r);
3393                 } else {
3394                     a->vmovups(r, constants.find(instructions[v].immA));
3395                 }
3396             } else {
3397                 SkASSERT(stack_slot[v] != NA);
3398                 a->vmovups(r, A::Mem{A::rsp, stack_slot[v]*K*4});
3399             }
3400         };
3401         auto store_to_stack = [&](Reg r, Val v) {
3402             SkASSERT(next_stack_slot < nstack_slots);
3403             stack_slot[v] = next_stack_slot++;
3404             a->vmovups(A::Mem{A::rsp, stack_slot[v]*K*4}, r);
3405         };
3406     #elif defined(__aarch64__)
3407         const int K = 4;
3408         const A::X N     = A::x0,
3409                    GP0   = A::x8,
3410                    GP1   = A::x9,
3411                    arg[] = { A::x1, A::x2, A::x3, A::x4, A::x5, A::x6, A::x7 };
3412 
3413         // We can use v0-v7 and v16-v31 freely; we'd need to preserve v8-v15 in enter/exit.
3414         std::array<Val,32> regs = {
3415              NA, NA, NA, NA,  NA, NA, NA, NA,
3416             RES,RES,RES,RES, RES,RES,RES,RES,
3417              NA, NA, NA, NA,  NA, NA, NA, NA,
3418              NA, NA, NA, NA,  NA, NA, NA, NA,
3419         };
3420 
3421         auto enter = [&]{ if (nstack_slots) { a->sub(A::sp, A::sp, nstack_slots*K*4); } };
3422         auto exit  = [&]{ if (nstack_slots) { a->add(A::sp, A::sp, nstack_slots*K*4); }
3423                           a->ret(A::x30); };
3424 
3425         auto load_from_memory = [&](Reg r, Val v) {
3426             if (instructions[v].op == Op::splat) {
3427                 if (instructions[v].immA == 0) {
3428                     a->eor16b(r,r,r);
3429                 } else {
3430                     a->ldrq(r, constants.find(instructions[v].immA));
3431                 }
3432             } else {
3433                 SkASSERT(stack_slot[v] != NA);
3434                 a->ldrq(r, A::sp, stack_slot[v]);
3435             }
3436         };
3437         auto store_to_stack  = [&](Reg r, Val v) {
3438             SkASSERT(next_stack_slot < nstack_slots);
3439             stack_slot[v] = next_stack_slot++;
3440             a->strq(r, A::sp, stack_slot[v]);
3441         };
3442     #endif
3443 
3444         *registers_used = 0;  // We'll update this as we go.
3445 
3446         if (SK_ARRAY_COUNT(arg) < fImpl->strides.size()) {
3447             return false;
3448         }
3449 
3450         auto emit = [&](Val id, bool scalar) {
3451             const int active_lanes = scalar ? 1 : K;
3452             const OptimizedInstruction& inst = instructions[id];
3453             const Op op = inst.op;
3454             const Val x = inst.x,
3455                       y = inst.y,
3456                       z = inst.z,
3457                       w = inst.w;
3458             const int immA = inst.immA,
3459                       immB = inst.immB,
3460                       immC = inst.immC;
3461 
3462             // alloc_tmp() returns the first of N adjacent temporary registers,
3463             // each freed manually with free_tmp() or noted as our result with mark_tmp_as_dst().
3464             auto alloc_tmp = [&](int N=1) -> Reg {
3465                 auto needs_spill = [&](Val v) -> bool {
3466                     SkASSERT(v >= 0);   // {NA,TMP,RES} need to be handled before calling this.
3467                     return stack_slot[v] == NA               // We haven't spilled it already?
3468                         && instructions[v].op != Op::splat;  // No need to spill constants.
3469                 };
3470 
3471                 // We want to find a block of N adjacent registers requiring the fewest spills.
3472                 int best_block = -1,
3473                     min_spills = 0x7fff'ffff;
3474                 for (int block = 0; block+N <= (int)regs.size(); block++) {
3475                     int spills = 0;
3476                     for (int r = block; r < block+N; r++) {
3477                         Val v = regs[r];
3478                         // Registers holding NA (nothing) are ideal, nothing to spill.
3479                         if (v == NA) {
3480                             continue;
3481                         }
3482                         // We can't spill anything REServed or that we'll need this instruction.
3483                         if (v == RES ||
3484                             v == TMP || v == id || v == x || v == y || v == z || v == w) {
3485                             spills = 0x7fff'ffff;
3486                             block  = r;   // (optimization) continue outer loop at next register.
3487                             break;
3488                         }
3489                         // Usually here we've got a value v that we'd have to spill to the stack
3490                         // before reusing its register, but sometimes even now we get a freebie.
3491                         spills += needs_spill(v) ? 1 : 0;
3492                     }
3493 
3494                     // TODO: non-arbitrary tie-breaking?
3495                     if (min_spills > spills) {
3496                         min_spills = spills;
3497                         best_block = block;
3498                     }
3499                     if (min_spills == 0) {
3500                         break;  // (optimization) stop early if we find an unbeatable block.
3501                     }
3502                 }
3503 
3504                 // TODO: our search's success isn't obviously guaranteed... it depends on N
3505                 // and the number and relative position in regs of any unspillable values.
3506                 // I think we should be able to get away with N≤2 on x86-64 and N≤4 on arm64;
3507                 // we'll need to revisit this logic should this assert fire.
3508                 SkASSERT(min_spills <= N);
3509 
3510                 // Spill what needs spilling, and mark the block all as TMP.
3511                 for (int r = best_block; r < best_block+N; r++) {
3512                     Val& v = regs[r];
3513                     *registers_used |= (1<<r);
3514 
3515                     SkASSERT(v == NA || v >= 0);
3516                     if (v >= 0 && needs_spill(v)) {
3517                         store_to_stack((Reg)r, v);
3518                         SkASSERT(!needs_spill(v));
3519                         min_spills--;
3520                     }
3521 
3522                     v = TMP;
3523                 }
3524                 SkASSERT(min_spills == 0);
3525                 return (Reg)best_block;
3526             };
3527 
3528             auto free_tmp = [&](Reg r) {
3529                 SkASSERT(regs[r] == TMP);
3530                 regs[r] = NA;
3531             };
3532 
3533             // Which register holds dst,x,y,z,w for this instruction?  NA if none does yet.
3534             int rd = NA,
3535                 rx = NA,
3536                 ry = NA,
3537                 rz = NA,
3538                 rw = NA;
3539 
3540             auto update_regs = [&](Reg r, Val v) {
3541                 if (v == id) { rd = r; }
3542                 if (v ==  x) { rx = r; }
3543                 if (v ==  y) { ry = r; }
3544                 if (v ==  z) { rz = r; }
3545                 if (v ==  w) { rw = r; }
3546                 return r;
3547             };
3548 
3549             auto find_existing_reg = [&](Val v) -> int {
3550                 // Quick-check our working registers.
3551                 if (v == id && rd != NA) { return rd; }
3552                 if (v ==  x && rx != NA) { return rx; }
3553                 if (v ==  y && ry != NA) { return ry; }
3554                 if (v ==  z && rz != NA) { return rz; }
3555                 if (v ==  w && rw != NA) { return rw; }
3556 
3557                 // Search inter-instruction register map.
3558                 for (auto [r,val] : SkMakeEnumerate(regs)) {
3559                     if (val == v) {
3560                         return update_regs((Reg)r, v);
3561                     }
3562                 }
3563                 return NA;
3564             };
3565 
3566             // Return a register for Val, holding that value if it already exists.
3567             // During this instruction all calls to r(v) will return the same register.
3568             auto r = [&](Val v) -> Reg {
3569                 SkASSERT(v >= 0);
3570 
3571                 if (int found = find_existing_reg(v); found != NA) {
3572                     return (Reg)found;
3573                 }
3574 
3575                 Reg r = alloc_tmp();
3576                 SkASSERT(regs[r] == TMP);
3577 
3578                 SkASSERT(v <= id);
3579                 if (v < id) {
3580                     // If v < id, we're loading one of this instruction's inputs.
3581                     // If v == id we're just allocating its destination register.
3582                     load_from_memory(r, v);
3583                 }
3584                 regs[r] = v;
3585                 return update_regs(r, v);
3586             };
3587 
3588             auto dies_here = [&](Val v) -> bool {
3589                 SkASSERT(v >= 0);
3590                 return instructions[v].death == id;
3591             };
3592 
3593             // Alias dst() to r(v) if dies_here(v).
3594             auto try_alias = [&](Val v) -> bool {
3595                 SkASSERT(v == x || v == y || v == z || v == w);
3596                 if (dies_here(v)) {
3597                     rd = r(v);      // Vals v and id share a register for this instruction.
3598                     regs[rd] = id;  // Next instruction, Val id will be in the register, not Val v.
3599                     return true;
3600                 }
3601                 return false;
3602             };
3603 
3604             // Generally r(id),
3605             // but with a hint, try to alias dst() to r(v) if dies_here(v).
3606             auto dst = [&](Val hint1 = NA, Val hint2 = NA) -> Reg {
3607                 if (hint1 != NA && try_alias(hint1)) { return r(id); }
3608                 if (hint2 != NA && try_alias(hint2)) { return r(id); }
3609                 return r(id);
3610             };
3611 
3612         #if defined(__aarch64__)  // Nothing sneaky, just unused on x86-64.
3613             auto mark_tmp_as_dst = [&](Reg tmp) {
3614                 SkASSERT(regs[tmp] == TMP);
3615                 rd = tmp;
3616                 regs[rd] = id;
3617                 SkASSERT(dst() == tmp);
3618             };
3619         #endif
3620 
3621         #if defined(__x86_64__) || defined(_M_X64)
3622             // On x86 we can work with many values directly from the stack or program constant pool.
3623             auto any = [&](Val v) -> A::Operand {
3624                 SkASSERT(v >= 0);
3625                 SkASSERT(v < id);
3626 
3627                 if (int found = find_existing_reg(v); found != NA) {
3628                     return (Reg)found;
3629                 }
3630                 if (instructions[v].op == Op::splat) {
3631                     return constants.find(instructions[v].immA);
3632                 }
3633                 return A::Mem{A::rsp, stack_slot[v]*K*4};
3634             };
3635 
3636             // This is never really worth asking except when any() might be used;
3637             // if we need this value in ARM, might as well just call r(v) to get it into a register.
3638             auto in_reg = [&](Val v) -> bool {
3639                 return find_existing_reg(v) != NA;
3640             };
3641         #endif
3642 
3643             switch (op) {
3644                 // Make sure splat constants can be found by load_from_memory() or any().
3645                 case Op::splat:
3646                     (void)constants[immA];
3647                     break;
3648 
3649             #if defined(__x86_64__) || defined(_M_X64)
3650                 case Op::assert_true: {
3651                     a->vptest (r(x), &constants[0xffffffff]);
3652                     A::Label all_true;
3653                     a->jc(&all_true);
3654                     a->int3();
3655                     a->label(&all_true);
3656                 } break;
3657 
3658                 case Op::trace_line:
3659                 case Op::trace_var:
3660                 case Op::trace_enter:
3661                 case Op::trace_exit:
3662                 case Op::trace_scope:
3663                     /* Force this program to run in the interpreter. */
3664                     return false;
3665 
3666                 case Op::store8:
3667                     if (scalar) {
3668                         a->vpextrb(A::Mem{arg[immA]}, (A::Xmm)r(x), 0);
3669                     } else {
3670                         a->vpackusdw(dst(x), r(x), r(x));
3671                         a->vpermq   (dst(), dst(), 0xd8);
3672                         a->vpackuswb(dst(), dst(), dst());
3673                         a->vmovq    (A::Mem{arg[immA]}, (A::Xmm)dst());
3674                     } break;
3675 
3676                 case Op::store16:
3677                     if (scalar) {
3678                         a->vpextrw(A::Mem{arg[immA]}, (A::Xmm)r(x), 0);
3679                     } else {
3680                         a->vpackusdw(dst(x), r(x), r(x));
3681                         a->vpermq   (dst(), dst(), 0xd8);
3682                         a->vmovups  (A::Mem{arg[immA]}, (A::Xmm)dst());
3683                     } break;
3684 
3685                 case Op::store32: if (scalar) { a->vmovd  (A::Mem{arg[immA]}, (A::Xmm)r(x)); }
3686                                   else        { a->vmovups(A::Mem{arg[immA]},         r(x)); }
3687                                   break;
3688 
3689                 case Op::store64: if (scalar) {
3690                                       a->vmovd(A::Mem{arg[immA],0}, (A::Xmm)r(x));
3691                                       a->vmovd(A::Mem{arg[immA],4}, (A::Xmm)r(y));
3692                                   } else {
3693                                       // r(x) = {a,b,c,d|e,f,g,h}
3694                                       // r(y) = {i,j,k,l|m,n,o,p}
3695                                       // We want to write a,i,b,j,c,k,d,l,e,m...
3696                                       A::Ymm L = alloc_tmp(),
3697                                              H = alloc_tmp();
3698                                       a->vpunpckldq(L, r(x), any(y));  // L = {a,i,b,j|e,m,f,n}
3699                                       a->vpunpckhdq(H, r(x), any(y));  // H = {c,k,d,l|g,o,h,p}
3700                                       a->vperm2f128(dst(), L,H, 0x20); //   = {a,i,b,j|c,k,d,l}
3701                                       a->vmovups(A::Mem{arg[immA], 0}, dst());
3702                                       a->vperm2f128(dst(), L,H, 0x31); //   = {e,m,f,n|g,o,h,p}
3703                                       a->vmovups(A::Mem{arg[immA],32}, dst());
3704                                       free_tmp(L);
3705                                       free_tmp(H);
3706                                   } break;
3707 
3708                 case Op::store128: {
3709                     // TODO: >32-bit stores
3710                     a->vmovd  (A::Mem{arg[immA], 0*16 +  0}, (A::Xmm)r(x)   );
3711                     a->vmovd  (A::Mem{arg[immA], 0*16 +  4}, (A::Xmm)r(y)   );
3712                     a->vmovd  (A::Mem{arg[immA], 0*16 +  8}, (A::Xmm)r(z)   );
3713                     a->vmovd  (A::Mem{arg[immA], 0*16 + 12}, (A::Xmm)r(w)   );
3714                     if (scalar) { break; }
3715 
3716                     a->vpextrd(A::Mem{arg[immA], 1*16 +  0}, (A::Xmm)r(x), 1);
3717                     a->vpextrd(A::Mem{arg[immA], 1*16 +  4}, (A::Xmm)r(y), 1);
3718                     a->vpextrd(A::Mem{arg[immA], 1*16 +  8}, (A::Xmm)r(z), 1);
3719                     a->vpextrd(A::Mem{arg[immA], 1*16 + 12}, (A::Xmm)r(w), 1);
3720 
3721                     a->vpextrd(A::Mem{arg[immA], 2*16 +  0}, (A::Xmm)r(x), 2);
3722                     a->vpextrd(A::Mem{arg[immA], 2*16 +  4}, (A::Xmm)r(y), 2);
3723                     a->vpextrd(A::Mem{arg[immA], 2*16 +  8}, (A::Xmm)r(z), 2);
3724                     a->vpextrd(A::Mem{arg[immA], 2*16 + 12}, (A::Xmm)r(w), 2);
3725 
3726                     a->vpextrd(A::Mem{arg[immA], 3*16 +  0}, (A::Xmm)r(x), 3);
3727                     a->vpextrd(A::Mem{arg[immA], 3*16 +  4}, (A::Xmm)r(y), 3);
3728                     a->vpextrd(A::Mem{arg[immA], 3*16 +  8}, (A::Xmm)r(z), 3);
3729                     a->vpextrd(A::Mem{arg[immA], 3*16 + 12}, (A::Xmm)r(w), 3);
3730                     // Now we need to store the upper 128 bits of x,y,z,w.
3731                     // Storing in this order rather than interlacing minimizes temporaries.
3732                     a->vextracti128(dst(), r(x), 1);
3733                     a->vmovd  (A::Mem{arg[immA], 4*16 +  0}, (A::Xmm)dst()   );
3734                     a->vpextrd(A::Mem{arg[immA], 5*16 +  0}, (A::Xmm)dst(), 1);
3735                     a->vpextrd(A::Mem{arg[immA], 6*16 +  0}, (A::Xmm)dst(), 2);
3736                     a->vpextrd(A::Mem{arg[immA], 7*16 +  0}, (A::Xmm)dst(), 3);
3737 
3738                     a->vextracti128(dst(), r(y), 1);
3739                     a->vmovd  (A::Mem{arg[immA], 4*16 +  4}, (A::Xmm)dst()   );
3740                     a->vpextrd(A::Mem{arg[immA], 5*16 +  4}, (A::Xmm)dst(), 1);
3741                     a->vpextrd(A::Mem{arg[immA], 6*16 +  4}, (A::Xmm)dst(), 2);
3742                     a->vpextrd(A::Mem{arg[immA], 7*16 +  4}, (A::Xmm)dst(), 3);
3743 
3744                     a->vextracti128(dst(), r(z), 1);
3745                     a->vmovd  (A::Mem{arg[immA], 4*16 +  8}, (A::Xmm)dst()   );
3746                     a->vpextrd(A::Mem{arg[immA], 5*16 +  8}, (A::Xmm)dst(), 1);
3747                     a->vpextrd(A::Mem{arg[immA], 6*16 +  8}, (A::Xmm)dst(), 2);
3748                     a->vpextrd(A::Mem{arg[immA], 7*16 +  8}, (A::Xmm)dst(), 3);
3749 
3750                     a->vextracti128(dst(), r(w), 1);
3751                     a->vmovd  (A::Mem{arg[immA], 4*16 + 12}, (A::Xmm)dst()   );
3752                     a->vpextrd(A::Mem{arg[immA], 5*16 + 12}, (A::Xmm)dst(), 1);
3753                     a->vpextrd(A::Mem{arg[immA], 6*16 + 12}, (A::Xmm)dst(), 2);
3754                     a->vpextrd(A::Mem{arg[immA], 7*16 + 12}, (A::Xmm)dst(), 3);
3755                 } break;
3756 
3757                 case Op::load8:  if (scalar) {
3758                                      a->vpxor  (dst(), dst(), dst());
3759                                      a->vpinsrb((A::Xmm)dst(), (A::Xmm)dst(), A::Mem{arg[immA]}, 0);
3760                                  } else {
3761                                      a->vpmovzxbd(dst(), A::Mem{arg[immA]});
3762                                  } break;
3763 
                // Narrow loads zero-extend each element up to a 32-bit lane.
                // The scalar path loads a single element into a zeroed register.
                case Op::load16: if (scalar) {
                                     a->vpxor  (dst(), dst(), dst());
                                     a->vpinsrw((A::Xmm)dst(), (A::Xmm)dst(), A::Mem{arg[immA]}, 0);
                                 } else {
                                     a->vpmovzxwd(dst(), A::Mem{arg[immA]});
                                 } break;

                case Op::load32: if (scalar) { a->vmovd  ((A::Xmm)dst(), A::Mem{arg[immA]}); }
                                 else        { a->vmovups(        dst(), A::Mem{arg[immA]}); }
                                 break;

                // load64 deinterleaves xy-pairs: vpermps gathers the even (immB=0)
                // or odd (immB=1) 32-bit elements of each 32-byte half.
                case Op::load64: if (scalar) {
                                    a->vmovd((A::Xmm)dst(), A::Mem{arg[immA], 4*immB});
                                 } else {
                                    A::Ymm tmp = alloc_tmp();
                                    a->vmovups(tmp, &load64_index);
                                    a->vpermps(dst(), tmp, A::Mem{arg[immA],  0});
                                    a->vpermps(  tmp, tmp, A::Mem{arg[immA], 32});
                                    // Low 128 bits holds immB=0 lanes, high 128 bits holds immB=1.
                                    a->vperm2f128(dst(), dst(),tmp, immB ? 0x31 : 0x20);
                                    free_tmp(tmp);
                                 } break;

                // load128 plucks lane immB out of each 16-byte xyzw-quad, one
                // element at a time (there's no handy 4-way deinterleave on x86).
                case Op::load128: if (scalar) {
                                      a->vmovd((A::Xmm)dst(), A::Mem{arg[immA], 4*immB});
                                  } else {
                                      // Load 4 low values into xmm tmp,
                                      A::Ymm tmp = alloc_tmp();
                                      A::Xmm t = (A::Xmm)tmp;
                                      a->vmovd  (t,   A::Mem{arg[immA], 0*16 + 4*immB}   );
                                      a->vpinsrd(t,t, A::Mem{arg[immA], 1*16 + 4*immB}, 1);
                                      a->vpinsrd(t,t, A::Mem{arg[immA], 2*16 + 4*immB}, 2);
                                      a->vpinsrd(t,t, A::Mem{arg[immA], 3*16 + 4*immB}, 3);

                                      // Load 4 high values into xmm dst(),
                                      A::Xmm d = (A::Xmm)dst();
                                      a->vmovd  (d,   A::Mem{arg[immA], 4*16 + 4*immB}   );
                                      a->vpinsrd(d,d, A::Mem{arg[immA], 5*16 + 4*immB}, 1);
                                      a->vpinsrd(d,d, A::Mem{arg[immA], 6*16 + 4*immB}, 2);
                                      a->vpinsrd(d,d, A::Mem{arg[immA], 7*16 + 4*immB}, 3);

                                      // Merge the two, ymm dst() = {xmm tmp|xmm dst()}
                                      a->vperm2f128(dst(), tmp,dst(), 0x20);
                                      free_tmp(tmp);
                                  } break;
3809 
                // gather8/gather16 have no hardware instruction, so indices are
                // extracted one lane at a time and each byte/halfword loaded
                // individually, then the packed result is zero-extended to 32-bit lanes.
                case Op::gather8: {
                    // As usual, the gather base pointer is immB bytes off of uniform immA.
                    a->mov(GP0, A::Mem{arg[immA], immB});

                    A::Ymm tmp = alloc_tmp();
                    a->vmovups(tmp, any(x));

                    for (int i = 0; i < active_lanes; i++) {
                        if (i == 4) {
                            // vpextrd can only pluck indices out from an Xmm register,
                            // so we manually swap over to the top when we're halfway through.
                            a->vextracti128((A::Xmm)tmp, tmp, 1);
                        }
                        a->vpextrd(GP1, (A::Xmm)tmp, i%4);
                        a->vpinsrb((A::Xmm)dst(), (A::Xmm)dst(), A::Mem{GP0,0,GP1,A::ONE}, i);
                    }
                    a->vpmovzxbd(dst(), dst());
                    free_tmp(tmp);
                } break;

                case Op::gather16: {
                    // Just as gather8 except vpinsrb->vpinsrw, ONE->TWO, and vpmovzxbd->vpmovzxwd.
                    a->mov(GP0, A::Mem{arg[immA], immB});

                    A::Ymm tmp = alloc_tmp();
                    a->vmovups(tmp, any(x));

                    for (int i = 0; i < active_lanes; i++) {
                        if (i == 4) {
                            a->vextracti128((A::Xmm)tmp, tmp, 1);
                        }
                        a->vpextrd(GP1, (A::Xmm)tmp, i%4);
                        a->vpinsrw((A::Xmm)dst(), (A::Xmm)dst(), A::Mem{GP0,0,GP1,A::TWO}, i);
                    }
                    a->vpmovzxwd(dst(), dst());
                    free_tmp(tmp);
                } break;

                // gather32 can use AVX2's hardware gather in the vector path.
                case Op::gather32:
                if (scalar) {
                    // Our gather base pointer is immB bytes off of uniform immA.
                    a->mov(GP0, A::Mem{arg[immA], immB});

                    // Grab our index from lane 0 of the index argument.
                    a->vmovd(GP1, (A::Xmm)r(x));

                    // dst = *(base + 4*index)
                    a->vmovd((A::Xmm)dst(x), A::Mem{GP0, 0, GP1, A::FOUR});
                } else {
                    a->mov(GP0, A::Mem{arg[immA], immB});

                    // NOTE(review): vgatherdps consumes its mask as it runs, which is
                    // presumably why a fresh all-ones mask is built each time — confirm.
                    A::Ymm mask = alloc_tmp();
                    a->vpcmpeqd(mask, mask, mask);   // (All lanes enabled.)

                    a->vgatherdps(dst(), A::FOUR, r(x), GP0, mask);
                    free_tmp(mask);
                }
                break;
3868 
                // Broadcast a 32-bit uniform (immB bytes into uniform pointer immA) to all lanes.
                case Op::uniform32: a->vbroadcastss(dst(), A::Mem{arg[immA], immB});
                                    break;

                // Double indirection: load a pointer from the uniforms, then
                // broadcast the 32-bit value immC bytes past that pointer.
                case Op::array32: a->mov(GP0, A::Mem{arg[immA], immB});
                                  a->vbroadcastss(dst(), A::Mem{GP0, immC});
                                  break;

                // Per-lane index: broadcast the loop counter N and subtract iota
                // {0,1,2,...}.  (N counts down, so this yields distinct lane indices.)
                case Op::index: a->vmovd((A::Xmm)dst(), N);
                                a->vbroadcastss(dst(), dst());
                                a->vpsubd(dst(), dst(), &iota);
                                break;
3880 
                // We can swap the arguments of symmetric instructions to make better use of any().
                case Op::add_f32:
                    if (in_reg(x)) { a->vaddps(dst(x), r(x), any(y)); }
                    else           { a->vaddps(dst(y), r(y), any(x)); }
                                     break;

                case Op::mul_f32:
                    if (in_reg(x)) { a->vmulps(dst(x), r(x), any(y)); }
                    else           { a->vmulps(dst(y), r(y), any(x)); }
                                     break;

                case Op::sub_f32: a->vsubps(dst(x), r(x), any(y)); break;
                case Op::div_f32: a->vdivps(dst(x), r(x), any(y)); break;
                case Op::min_f32: a->vminps(dst(y), r(y), any(x)); break;  // Order matters,
                case Op::max_f32: a->vmaxps(dst(y), r(y), any(x)); break;  // see test SkVM_min_max.

                // For the FMA family, pick the 132/213/231 form whose destination
                // operand can alias a dying input; otherwise copy x first.
                case Op::fma_f32:
                    if (try_alias(x)) { a->vfmadd132ps(dst(x), r(z), any(y)); } else
                    if (try_alias(y)) { a->vfmadd213ps(dst(y), r(x), any(z)); } else
                    if (try_alias(z)) { a->vfmadd231ps(dst(z), r(x), any(y)); } else
                                      { a->vmovups    (dst(), any(x));
                                        a->vfmadd132ps(dst(), r(z), any(y)); }
                                        break;

                case Op::fms_f32:
                    if (try_alias(x)) { a->vfmsub132ps(dst(x), r(z), any(y)); } else
                    if (try_alias(y)) { a->vfmsub213ps(dst(y), r(x), any(z)); } else
                    if (try_alias(z)) { a->vfmsub231ps(dst(z), r(x), any(y)); } else
                                      { a->vmovups    (dst(), any(x));
                                        a->vfmsub132ps(dst(), r(z), any(y)); }
                                        break;

                case Op::fnma_f32:
                    if (try_alias(x)) { a->vfnmadd132ps(dst(x), r(z), any(y)); } else
                    if (try_alias(y)) { a->vfnmadd213ps(dst(y), r(x), any(z)); } else
                    if (try_alias(z)) { a->vfnmadd231ps(dst(z), r(x), any(y)); } else
                                      { a->vmovups     (dst(), any(x));
                                        a->vfnmadd132ps(dst(), r(z), any(y)); }
                                        break;

                // In situations like this we want to try aliasing dst(x) when x is
                // already in a register, but not if we'd have to load it from the stack
                // just to alias it.  That's done better directly into the new register.
                case Op::sqrt_f32:
                    if (in_reg(x)) { a->vsqrtps(dst(x),  r(x)); }
                    else           { a->vsqrtps(dst(), any(x)); }
                                     break;

                case Op::add_i32:
                    if (in_reg(x)) { a->vpaddd(dst(x), r(x), any(y)); }
                    else           { a->vpaddd(dst(y), r(y), any(x)); }
                                     break;

                case Op::mul_i32:
                    if (in_reg(x)) { a->vpmulld(dst(x), r(x), any(y)); }
                    else           { a->vpmulld(dst(y), r(y), any(x)); }
                                     break;

                case Op::sub_i32: a->vpsubd(dst(x), r(x), any(y)); break;

                case Op::bit_and:
                    if (in_reg(x)) { a->vpand(dst(x), r(x), any(y)); }
                    else           { a->vpand(dst(y), r(y), any(x)); }
                                     break;
                case Op::bit_or:
                    if (in_reg(x)) { a->vpor(dst(x), r(x), any(y)); }
                    else           { a->vpor(dst(y), r(y), any(x)); }
                                     break;
                case Op::bit_xor:
                    if (in_reg(x)) { a->vpxor(dst(x), r(x), any(y)); }
                    else           { a->vpxor(dst(y), r(y), any(x)); }
                                     break;

                case Op::bit_clear: a->vpandn(dst(y), r(y), any(x)); break; // Notice, y then x.

                case Op::select:
                    if (try_alias(z)) { a->vpblendvb(dst(z), r(z), any(y), r(x)); }
                    else              { a->vpblendvb(dst(x), r(z), any(y), r(x)); }
                                        break;

                // Shifts take an immediate count (immA).
                case Op::shl_i32: a->vpslld(dst(x), r(x), immA); break;
                case Op::shr_i32: a->vpsrld(dst(x), r(x), immA); break;
                case Op::sra_i32: a->vpsrad(dst(x), r(x), immA); break;

                case Op::eq_i32:
                    if (in_reg(x)) { a->vpcmpeqd(dst(x), r(x), any(y)); }
                    else           { a->vpcmpeqd(dst(y), r(y), any(x)); }
                                     break;

                case Op::gt_i32: a->vpcmpgtd(dst(), r(x), any(y)); break;

                case Op::eq_f32:
                    if (in_reg(x)) { a->vcmpeqps(dst(x), r(x), any(y)); }
                    else           { a->vcmpeqps(dst(y), r(y), any(x)); }
                                     break;
                case Op::neq_f32:
                    if (in_reg(x)) { a->vcmpneqps(dst(x), r(x), any(y)); }
                    else           { a->vcmpneqps(dst(y), r(y), any(x)); }
                                     break;

                // x > y  and  x >= y  are emitted with operands swapped as y < x / y <= x.
                case Op:: gt_f32: a->vcmpltps (dst(y), r(y), any(x)); break;
                case Op::gte_f32: a->vcmpleps (dst(y), r(y), any(x)); break;

                case Op::ceil:
                    if (in_reg(x)) { a->vroundps(dst(x),  r(x), Assembler::CEIL); }
                    else           { a->vroundps(dst(), any(x), Assembler::CEIL); }
                                     break;

                case Op::floor:
                    if (in_reg(x)) { a->vroundps(dst(x),  r(x), Assembler::FLOOR); }
                    else           { a->vroundps(dst(), any(x), Assembler::FLOOR); }
                                     break;

                case Op::to_f32:
                    if (in_reg(x)) { a->vcvtdq2ps(dst(x),  r(x)); }
                    else           { a->vcvtdq2ps(dst(), any(x)); }
                                     break;

                case Op::trunc:
                    if (in_reg(x)) { a->vcvttps2dq(dst(x),  r(x)); }
                    else           { a->vcvttps2dq(dst(), any(x)); }
                                     break;

                case Op::round:
                    if (in_reg(x)) { a->vcvtps2dq(dst(x),  r(x)); }
                    else           { a->vcvtps2dq(dst(), any(x)); }
                                     break;

                case Op::to_fp16:
                    a->vcvtps2ph(dst(x), r(x), A::CURRENT);  // f32 ymm -> f16 xmm
                    a->vpmovzxwd(dst(), dst());              // f16 xmm -> f16 ymm
                    break;

                case Op::from_fp16:
                    a->vpackusdw(dst(x), r(x), r(x));  // f16 ymm -> f16 xmm
                    a->vpermq   (dst(), dst(), 0xd8);  // swap middle two 64-bit lanes
                    a->vcvtph2ps(dst(), dst());        // f16 xmm -> f32 ymm
                    break;

                // duplicate is a scheduling no-op: the value is already in place.
                case Op::duplicate: break;
4021 
4022             #elif defined(__aarch64__)
                case Op::assert_true: {
                    a->uminv4s(dst(), r(x));   // uminv acts like an all() across the vector.
                    a->movs(GP0, dst(), 0);
                    A::Label all_true;
                    a->cbnz(GP0, &all_true);
                    a->brk(0);                 // Trap if any lane was false (min == 0).
                    a->label(&all_true);
                } break;

                // Trace ops aren't implemented in this backend.
                case Op::trace_line:
                case Op::trace_var:
                case Op::trace_enter:
                case Op::trace_exit:
                case Op::trace_scope:
                    /* Force this program to run in the interpreter. */
                    return false;

                // Per-lane index: broadcast loop counter N, subtract iota {0,1,2,3}.
                case Op::index: {
                    A::V tmp = alloc_tmp();
                    a->ldrq (tmp, &iota);
                    a->dup4s(dst(), N);
                    a->sub4s(dst(), dst(), tmp);
                    free_tmp(tmp);
                } break;
4047 
                // Narrow stores pack 32-bit lanes down (xtn) before writing.
                case Op::store8: a->xtns2h(dst(x), r(x));
                                 a->xtnh2b(dst(), dst());
                   if (scalar) { a->strb  (dst(), arg[immA]); }
                   else        { a->strs  (dst(), arg[immA]); }
                                 break;

                case Op::store16: a->xtns2h(dst(x), r(x));
                    if (scalar) { a->strh  (dst(), arg[immA]); }
                    else        { a->strd  (dst(), arg[immA]); }
                                  break;

                case Op::store32: if (scalar) { a->strs(r(x), arg[immA]); }
                                  else        { a->strq(r(x), arg[immA]); }
                                                break;

                // st2 interleaves xy-pairs but needs consecutive registers; if x,y
                // aren't adjacent, copy them into an adjacent tmp pair first
                // (orr16b reg,src,src is the NEON register-move idiom).
                case Op::store64: if (scalar) {
                                      a->strs(r(x), arg[immA], 0);
                                      a->strs(r(y), arg[immA], 1);
                                  } else if (r(y) == r(x)+1) {
                                      a->st24s(r(x), arg[immA]);
                                  } else {
                                      Reg tmp0 = alloc_tmp(2),
                                          tmp1 = (Reg)(tmp0+1);
                                      a->orr16b(tmp0, r(x), r(x));
                                      a->orr16b(tmp1, r(y), r(y));
                                      a-> st24s(tmp0, arg[immA]);
                                      free_tmp(tmp0);
                                      free_tmp(tmp1);
                                  } break;

                // Same idea as store64, with st4 and a 4-register group.
                case Op::store128:
                    if (scalar) {
                        a->strs(r(x), arg[immA], 0);
                        a->strs(r(y), arg[immA], 1);
                        a->strs(r(z), arg[immA], 2);
                        a->strs(r(w), arg[immA], 3);
                    } else if (r(y) == r(x)+1 &&
                               r(z) == r(x)+2 &&
                               r(w) == r(x)+3) {
                        a->st44s(r(x), arg[immA]);
                    } else {
                        Reg tmp0 = alloc_tmp(4),
                            tmp1 = (Reg)(tmp0+1),
                            tmp2 = (Reg)(tmp0+2),
                            tmp3 = (Reg)(tmp0+3);
                        a->orr16b(tmp0, r(x), r(x));
                        a->orr16b(tmp1, r(y), r(y));
                        a->orr16b(tmp2, r(z), r(z));
                        a->orr16b(tmp3, r(w), r(w));
                        a-> st44s(tmp0, arg[immA]);
                        free_tmp(tmp0);
                        free_tmp(tmp1);
                        free_tmp(tmp2);
                        free_tmp(tmp3);
                    } break;
4103 
4104 
                // Narrow loads zero-extend (uxtl) each element up to a 32-bit lane.
                case Op::load8: if (scalar) { a->ldrb(dst(), arg[immA]); }
                                else        { a->ldrs(dst(), arg[immA]); }
                                              a->uxtlb2h(dst(), dst());
                                              a->uxtlh2s(dst(), dst());
                                              break;

                case Op::load16: if (scalar) { a->ldrh(dst(), arg[immA]); }
                                 else        { a->ldrd(dst(), arg[immA]); }
                                               a->uxtlh2s(dst(), dst());
                                               break;

                case Op::load32: if (scalar) { a->ldrs(dst(), arg[immA]); }
                                 else        { a->ldrq(dst(), arg[immA]); }
                                               break;

                // ld2 deinterleaves xy-pairs into two consecutive registers; keep
                // the half selected by immB and free the other.
                case Op::load64: if (scalar) {
                                    a->ldrs(dst(), arg[immA], immB);
                                 } else {
                                    Reg tmp0 = alloc_tmp(2),
                                        tmp1 = (Reg)(tmp0+1);
                                    a->ld24s(tmp0, arg[immA]);
                                    // TODO: return both
                                    switch (immB) {
                                        case 0: mark_tmp_as_dst(tmp0); free_tmp(tmp1); break;
                                        case 1: mark_tmp_as_dst(tmp1); free_tmp(tmp0); break;
                                    }
                                 } break;

                // Same idea as load64, with ld4 and a 4-register group.
                case Op::load128: if (scalar) {
                                      a->ldrs(dst(), arg[immA], immB);
                                  } else {
                                      Reg tmp0 = alloc_tmp(4),
                                          tmp1 = (Reg)(tmp0+1),
                                          tmp2 = (Reg)(tmp0+2),
                                          tmp3 = (Reg)(tmp0+3);
                                      a->ld44s(tmp0, arg[immA]);
                                      // TODO: return all four
                                      switch (immB) {
                                          case 0: mark_tmp_as_dst(tmp0); break;
                                          case 1: mark_tmp_as_dst(tmp1); break;
                                          case 2: mark_tmp_as_dst(tmp2); break;
                                          case 3: mark_tmp_as_dst(tmp3); break;
                                      }
                                      if (immB != 0) { free_tmp(tmp0); }
                                      if (immB != 1) { free_tmp(tmp1); }
                                      if (immB != 2) { free_tmp(tmp2); }
                                      if (immB != 3) { free_tmp(tmp3); }
                                  } break;
4153 
                // Broadcast a 32-bit uniform (immB bytes into uniform pointer immA).
                case Op::uniform32: a->add(GP0, arg[immA], immB);
                                    a->ld1r4s(dst(), GP0);
                                    break;

                // Double indirection: load a pointer from the uniforms, then
                // broadcast the 32-bit value immC bytes past it.
                case Op::array32: a->add(GP0, arg[immA], immB);
                                  a->ldrd(GP0, GP0);
                                  a->add(GP0, GP0, immC);
                                  a->ld1r4s(dst(), GP0);
                                  break;

                // No hardware gather on NEON: extract each index, load each
                // element, and insert it back into the destination lane by lane.
                case Op::gather8: {
                    // As usual, the gather base pointer is immB bytes off of uniform immA.
                    a->add (GP0, arg[immA], immB);  // GP0 = &(gather base pointer)
                    a->ldrd(GP0, GP0);              // GP0 =   gather base pointer
                    for (int i = 0; i < active_lanes; i++) {
                        a->movs(GP1, r(x), i);    // Extract index lane i into GP1.
                        a->add (GP1, GP0, GP1);   // Add the gather base pointer.
                        a->ldrb(GP1, GP1);        // Load that byte.
                        a->inss(dst(x), GP1, i);  // Insert it into dst() lane i.
                    }
                } break;

                // See gather8 for general idea; comments here only where gather16 differs.
                case Op::gather16: {
                    a->add (GP0, arg[immA], immB);
                    a->ldrd(GP0, GP0);
                    for (int i = 0; i < active_lanes; i++) {
                        a->movs(GP1, r(x), i);
                        a->add (GP1, GP0, GP1, A::LSL, 1);  // Scale index 2x into a byte offset.
                        a->ldrh(GP1, GP1);                  // 2-byte load.
                        a->inss(dst(x), GP1, i);
                    }
                } break;

                // See gather8 for general idea; comments here only where gather32 differs.
                case Op::gather32: {
                    a->add (GP0, arg[immA], immB);
                    a->ldrd(GP0, GP0);
                    for (int i = 0; i < active_lanes; i++) {
                        a->movs(GP1, r(x), i);
                        a->add (GP1, GP0, GP1, A::LSL, 2);  // Scale index 4x into a byte offset.
                        a->ldrs(GP1, GP1);                  // 4-byte load.
                        a->inss(dst(x), GP1, i);
                    }
                } break;
4200 
                case Op::add_f32: a->fadd4s(dst(x,y), r(x), r(y)); break;
                case Op::sub_f32: a->fsub4s(dst(x,y), r(x), r(y)); break;
                case Op::mul_f32: a->fmul4s(dst(x,y), r(x), r(y)); break;
                case Op::div_f32: a->fdiv4s(dst(x,y), r(x), r(y)); break;

                case Op::sqrt_f32: a->fsqrt4s(dst(x), r(x)); break;

                // NEON FMA instructions accumulate in place, so alias z when its
                // register is dying here; otherwise copy z into dst() first.
                case Op::fma_f32: // fmla.4s is z += x*y
                    if (try_alias(z)) { a->fmla4s( r(z), r(x), r(y)); }
                    else              { a->orr16b(dst(), r(z), r(z));
                                        a->fmla4s(dst(), r(x), r(y)); }
                                        break;

                case Op::fnma_f32:  // fmls.4s is z -= x*y
                    if (try_alias(z)) { a->fmls4s( r(z), r(x), r(y)); }
                    else              { a->orr16b(dst(), r(z), r(z));
                                        a->fmls4s(dst(), r(x), r(y)); }
                                        break;

                case Op::fms_f32:   // calculate z - xy, then negate to xy - z
                    if (try_alias(z)) { a->fmls4s( r(z), r(x), r(y)); }
                    else              { a->orr16b(dst(), r(z), r(z));
                                        a->fmls4s(dst(), r(x), r(y)); }
                                        a->fneg4s(dst(), dst());
                                        break;

                case Op:: gt_f32: a->fcmgt4s (dst(x,y), r(x), r(y)); break;
                case Op::gte_f32: a->fcmge4s (dst(x,y), r(x), r(y)); break;
                case Op:: eq_f32: a->fcmeq4s (dst(x,y), r(x), r(y)); break;
                // No fcmne; compare equal then invert all bits.
                case Op::neq_f32: a->fcmeq4s (dst(x,y), r(x), r(y));
                                  a->not16b  (dst(), dst());         break;


                case Op::add_i32: a->add4s(dst(x,y), r(x), r(y)); break;
                case Op::sub_i32: a->sub4s(dst(x,y), r(x), r(y)); break;
                case Op::mul_i32: a->mul4s(dst(x,y), r(x), r(y)); break;

                case Op::bit_and  : a->and16b(dst(x,y), r(x), r(y)); break;
                case Op::bit_or   : a->orr16b(dst(x,y), r(x), r(y)); break;
                case Op::bit_xor  : a->eor16b(dst(x,y), r(x), r(y)); break;
                case Op::bit_clear: a->bic16b(dst(x,y), r(x), r(y)); break;

                case Op::select: // bsl16b is x = x ? y : z
                    if (try_alias(x)) { a->bsl16b( r(x), r(y), r(z)); }
                    else              { a->orr16b(dst(), r(x), r(x));
                                        a->bsl16b(dst(), r(y), r(z)); }
                                        break;

                // fmin4s and fmax4s don't work the way we want with NaN,
                // so we write them the long way:
                case Op::min_f32: // min(x,y) = y<x ? y : x
                                  a->fcmgt4s(dst(), r(x), r(y));
                                  a->bsl16b (dst(), r(y), r(x));
                                  break;

                case Op::max_f32: // max(x,y) = x<y ? y : x
                                  a->fcmgt4s(dst(), r(y), r(x));
                                  a->bsl16b (dst(), r(y), r(x));
                                  break;

                // Shifts take an immediate count (immA).
                case Op::shl_i32: a-> shl4s(dst(x), r(x), immA); break;
                case Op::shr_i32: a->ushr4s(dst(x), r(x), immA); break;
                case Op::sra_i32: a->sshr4s(dst(x), r(x), immA); break;

                case Op::eq_i32: a->cmeq4s(dst(x,y), r(x), r(y)); break;
                case Op::gt_i32: a->cmgt4s(dst(x,y), r(x), r(y)); break;

                case Op::to_f32: a->scvtf4s (dst(x), r(x)); break;
                case Op::trunc:  a->fcvtzs4s(dst(x), r(x)); break;
                case Op::round:  a->fcvtns4s(dst(x), r(x)); break;
                case Op::ceil:   a->frintp4s(dst(x), r(x)); break;
                case Op::floor:  a->frintm4s(dst(x), r(x)); break;

                case Op::to_fp16:
                    a->fcvtn  (dst(x), r(x));    // 4x f32 -> 4x f16 in bottom four lanes
                    a->uxtlh2s(dst(), dst());    // expand to 4x f16 in even 16-bit lanes
                    break;

                case Op::from_fp16:
                    a->xtns2h(dst(x), r(x));     // pack even 16-bit lanes into bottom four lanes
                    a->fcvtl (dst(), dst());     // 4x f16 -> 4x f32
                    break;

                // duplicate is a scheduling no-op: the value is already in place.
                case Op::duplicate: break;
4285             #endif
4286             }
4287 
4288             // Proactively free the registers holding any value that dies here.
4289             if (rd != NA &&                   dies_here(regs[rd])) { regs[rd] = NA; }
4290             if (rx != NA && regs[rx] != NA && dies_here(regs[rx])) { regs[rx] = NA; }
4291             if (ry != NA && regs[ry] != NA && dies_here(regs[ry])) { regs[ry] = NA; }
4292             if (rz != NA && regs[rz] != NA && dies_here(regs[rz])) { regs[rz] = NA; }
4293             if (rw != NA && regs[rw] != NA && dies_here(regs[rw])) { regs[rw] = NA; }
4294             return true;
4295         };
4296 
        // Small per-architecture wrappers for the scalar ops the loop
        // scaffolding below needs: conditional/unconditional jumps and
        // add/subtract-immediate on a general-purpose register.
        #if defined(__x86_64__) || defined(_M_X64)
            auto jump_if_less = [&](A::Label* l) { a->jl (l); };
            auto jump         = [&](A::Label* l) { a->jmp(l); };

            auto add = [&](A::GP64 gp, int imm) { a->add(gp, imm); };
            auto sub = [&](A::GP64 gp, int imm) { a->sub(gp, imm); };
        #elif defined(__aarch64__)
            auto jump_if_less = [&](A::Label* l) { a->blt(l); };
            auto jump         = [&](A::Label* l) { a->b  (l); };

            auto add = [&](A::X gp, int imm) { a->add(gp, gp, imm); };
            auto sub = [&](A::X gp, int imm) { a->sub(gp, gp, imm); };
        #endif
4310 
        A::Label body,
                 tail,
                 done;

        enter();
        // First pass: emit loop-invariant (can_hoist) instructions once,
        // before the loop body.  A false return from emit() means this op
        // can't be JITted and the whole program falls back to the interpreter.
        for (Val id = 0; id < (Val)instructions.size(); id++) {
            if (fImpl->visualizer && is_trace(instructions[id].op)) {
                // Make sure trace commands stay on JIT for visualizer
                continue;
            }
            auto start = a->size();
            if (instructions[id].can_hoist && !emit(id, /*scalar=*/false)) {
                return false;
            }
            if (fImpl->visualizer && instructions[id].can_hoist) {
                fImpl->visualizer->addMachineCommands(id, start, a->size());
            }
        }

        // This point marks a kind of canonical fixed point for register contents: if loop
        // code is generated as if these registers are holding these values, the next time
        // the loop comes around we'd better find those same registers holding those same values.
        auto restore_incoming_regs = [&,incoming=regs,saved_stack_slot=stack_slot,
                                      saved_next_stack_slot=next_stack_slot]{
            for (int r = 0; r < (int)regs.size(); r++) {
                if (regs[r] != incoming[r]) {
                    regs[r]  = incoming[r];
                    if (regs[r] >= 0) {
                        // Reload the value this register held at the fixed point.
                        load_from_memory((Reg)r, regs[r]);
                    }
                }
            }
            // Remember the deepest stack use, then roll spill bookkeeping back.
            *stack_hint = std::max(*stack_hint, next_stack_slot);
            stack_slot = saved_stack_slot;
            next_stack_slot = saved_next_stack_slot;
        };
4347 
        // Main loop: process K lanes per iteration while at least K remain.
        a->label(&body);
        {
            a->cmp(N, K);
            jump_if_less(&tail);
            for (Val id = 0; id < (Val)instructions.size(); id++) {
                if (fImpl->visualizer != nullptr && is_trace(instructions[id].op)) {
                    // Make sure trace commands stay on JIT for visualizer
                    continue;
                }
                auto start = a->size();
                if (!instructions[id].can_hoist && !emit(id, /*scalar=*/false)) {
                    return false;
                }
                if (fImpl->visualizer && !instructions[id].can_hoist) {
                    fImpl->visualizer->addMachineCommands(id, start, a->size());
                }
            }
            restore_incoming_regs();
            // Advance each varying pointer by K elements of its stride.
            for (int i = 0; i < (int)fImpl->strides.size(); i++) {
                if (fImpl->strides[i]) {
                    add(arg[i], K*fImpl->strides[i]);
                }
            }
            sub(N, K);
            jump(&body);
        }

        // Tail loop: same instructions re-emitted in scalar mode, one lane at a time.
        a->label(&tail);
        {
            a->cmp(N, 1);
            jump_if_less(&done);
            for (Val id = 0; id < (Val)instructions.size(); id++) {
                if (fImpl->visualizer && is_trace(instructions[id].op)) {
                    // Make sure trace commands stay on JIT for visualizer
                    continue;
                }
                if (!instructions[id].can_hoist && !emit(id, /*scalar=*/true)) {
                    return false;
                }
            }
            restore_incoming_regs();
            for (int i = 0; i < (int)fImpl->strides.size(); i++) {
                if (fImpl->strides[i]) {
                    add(arg[i], 1*fImpl->strides[i]);
                }
            }
            sub(N, 1);
            jump(&tail);
        }
4397 
4398         a->label(&done);
4399         {
4400             exit();
4401         }
4402 
4403         // Except for explicit aligned load and store instructions, AVX allows
4404         // memory operands to be unaligned.  So even though we're creating 16
4405         // byte patterns on ARM or 32-byte patterns on x86, we only need to
4406         // align to 4 bytes, the element size and alignment requirement.
4407 
4408         constants.foreach([&](int imm, A::Label* label) {
4409             a->align(4);
4410             a->label(label);
4411             for (int i = 0; i < K; i++) {
4412                 a->word(imm);
4413             }
4414         });
4415 
4416         if (!iota.references.empty()) {
4417             a->align(4);
4418             a->label(&iota);        // 0,1,2,3,4,...
4419             for (int i = 0; i < K; i++) {
4420                 a->word(i);
4421             }
4422         }
4423 
4424         if (!load64_index.references.empty()) {
4425             a->align(4);
4426             a->label(&load64_index);  // {0,2,4,6|1,3,5,7}
4427             a->word(0); a->word(2); a->word(4); a->word(6);
4428             a->word(1); a->word(3); a->word(5); a->word(7);
4429         }
4430 
4431         return true;
4432     }
4433 
setupJIT(const std::vector<OptimizedInstruction> & instructions,const char * debug_name)4434     void Program::setupJIT(const std::vector<OptimizedInstruction>& instructions,
4435                            const char* debug_name) {
4436         // Assemble with no buffer to determine a.size() (the number of bytes we'll assemble)
4437         // and stack_hint/registers_used to feed forward into the next jit() call.
4438         Assembler a{nullptr};
4439         int stack_hint = -1;
4440         uint32_t registers_used = 0xffff'ffff;  // Start conservatively with all.
4441         if (!this->jit(instructions, &stack_hint, &registers_used, &a)) {
4442             return;
4443         }
4444 
4445         fImpl->jit_size = a.size();
4446         void* jit_entry = alloc_jit_buffer(&fImpl->jit_size);
4447         fImpl->jit_entry.store(jit_entry);
4448 
4449         // Assemble the program for real with stack_hint/registers_used as feedback from first call.
4450         a = Assembler{jit_entry};
4451         SkAssertResult(this->jit(instructions, &stack_hint, &registers_used, &a));
4452         SkASSERT(a.size() <= fImpl->jit_size);
4453 
4454         // Remap as executable, and flush caches on platforms that need that.
4455         remap_as_executable(jit_entry, fImpl->jit_size);
4456 
4457         notify_vtune(debug_name, jit_entry, fImpl->jit_size);
4458 
4459     #if !defined(SK_BUILD_FOR_WIN)
4460         // For profiling and debugging, it's helpful to have this code loaded
4461         // dynamically rather than just jumping info fImpl->jit_entry.
4462         if (gSkVMJITViaDylib) {
4463             // Dump the raw program binary.
4464             SkString path = SkStringPrintf("/tmp/%s.XXXXXX", debug_name);
4465             int fd = mkstemp(path.writable_str());
4466             ::write(fd, jit_entry, a.size());
4467             close(fd);
4468 
4469             this->dropJIT();  // (unmap and null out fImpl->jit_entry.)
4470 
4471             // Convert it in-place to a dynamic library with a single symbol "skvm_jit":
4472             SkString cmd = SkStringPrintf(
4473                     "echo '.global _skvm_jit\n_skvm_jit: .incbin \"%s\"'"
4474                     " | clang -x assembler -shared - -o %s",
4475                     path.c_str(), path.c_str());
4476             system(cmd.c_str());
4477 
4478             // Load that dynamic library and look up skvm_jit().
4479             fImpl->dylib = dlopen(path.c_str(), RTLD_NOW|RTLD_LOCAL);
4480             void* sym = nullptr;
4481             for (const char* name : {"skvm_jit", "_skvm_jit"} ) {
4482                 if (!sym) { sym = dlsym(fImpl->dylib, name); }
4483             }
4484             fImpl->jit_entry.store(sym);
4485         }
4486     #endif
4487     }
4488 
disassemble(SkWStream * o) const4489     void Program::disassemble(SkWStream* o) const {
4490     #if !defined(SK_BUILD_FOR_WIN)
4491         SkDebugfStream debug;
4492         if (!o) { o = &debug; }
4493 
4494         const void* jit_entry = fImpl->jit_entry.load();
4495         size_t jit_size = fImpl->jit_size;
4496 
4497         if (!jit_entry) {
4498             o->writeText("Program not JIT'd. Did you pass --jit?\n");
4499             return;
4500         }
4501 
4502         char path[] = "/tmp/skvm-jit.XXXXXX";
4503         int fd = mkstemp(path);
4504         ::write(fd, jit_entry, jit_size);
4505         close(fd);
4506 
4507         // Convert it in-place to a dynamic library with a single symbol "skvm_jit":
4508         SkString cmd = SkStringPrintf(
4509                 "echo '.global _skvm_jit\n_skvm_jit: .incbin \"%s\"'"
4510                 " | clang -x assembler -shared - -o %s",
4511                 path, path);
4512         system(cmd.c_str());
4513 
4514         // Now objdump to disassemble our function:
4515         // TODO: We could trim this down to just our code using '--disassemble=<symbol name>`,
4516         // but the symbol name varies with OS, and that option may be missing from objdump on some
4517         // machines? There also apears to be quite a bit of junk after the end of the JIT'd code.
4518         // Trimming that would let us pass '--visualize-jumps' and get the loop annotated.
4519         // With the junk, we tend to end up with a bunch of stray jumps that pollute the ASCII art.
4520         cmd = SkStringPrintf("objdump -D %s", path);
4521     #if defined(SK_BUILD_FOR_UNIX)
4522         cmd.append(" --section=.text");
4523     #endif
4524         FILE* fp = popen(cmd.c_str(), "r");
4525         if (!fp) {
4526             o->writeText("objdump failed\n");
4527             return;
4528         }
4529 
4530         char line[1024];
4531         while (fgets(line, sizeof(line), fp)) {
4532             o->writeText(line);
4533         }
4534 
4535         pclose(fp);
4536     #endif
4537     }
4538 
4539 #endif
4540 
4541 }  // namespace skvm
4542